From 58bbdec7eb0580f03e0e50b675a2b8b72bee0e0e Mon Sep 17 00:00:00 2001 From: Xavier Hsinyuan Date: Thu, 21 Jul 2022 00:17:56 +0800 Subject: [PATCH 01/15] RVV: layernorm fp32 w/o elempack --- src/layer/riscv/layernorm_riscv.cpp | 410 ++++++++++++++++++++++++++++ src/layer/riscv/layernorm_riscv.h | 31 +++ 2 files changed, 441 insertions(+) create mode 100644 src/layer/riscv/layernorm_riscv.cpp create mode 100644 src/layer/riscv/layernorm_riscv.h diff --git a/src/layer/riscv/layernorm_riscv.cpp b/src/layer/riscv/layernorm_riscv.cpp new file mode 100644 index 000000000..8a4fc452d --- /dev/null +++ b/src/layer/riscv/layernorm_riscv.cpp @@ -0,0 +1,410 @@ +// Xavier Hsinyuan is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 Xavier Hsinyuan . All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. +#include "layernorm_riscv.h" +#include + +#if __riscv_vector +#ifdef RVV_SPEC_0_7 +#include "riscv_v_071_fix.h" +#else +#include +#endif +#endif // __riscv_vector + +namespace ncnn { +LayerNorm_riscv::LayerNorm_riscv() +{ +#if __riscv_vector +#warning "TODO: packing. Don't merge at this moment." + // support_packing = true; +#endif +} + +int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ +// x = (x - mean) / sqrt(var + eps) * gamma + beta +#if __riscv_vector + int dims = bottom_top_blob.dims; + int w = bottom_top_blob.w; + if (dims == 1) + { + float* ptr = bottom_top_blob; + + // mean and var + float sum = 0.f; + float sqsum = 0.f; + vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); + vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); + + { + int n = w; + float* ptr_sum = ptr; + while (n > 0) + { + word_type vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr_sum, vl); + _sum = vfredosum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl); + // _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); + ptr_sum += vl; + n -= vl; + } + } + sum = vfmv_f_s_f32m1_f32(_sum); + float mean = sum / w; + + { + int n = w; + float* ptr_sqsum = ptr; + while (n > 0) + { + word_type vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr_sqsum, vl); + _p = vfsub_vf_f32m8(_p, mean, vl); + _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); + n -= vl; + ptr_sqsum += vl; + } + } + sqsum = vfmv_f_s_f32m1_f32(_sqsum); + float var = sqsum / w; + // the var maybe minus due to accuracy + //float var = sqsum / w - mean * mean; + float a = static_cast(1.f / (sqrt(var + eps))); + float b = -mean * a; + + { + int n = w; + float* ptr_store = ptr; + const float* ptr_gamma = gamma_data; + const float* ptr_beta = beta_data; + if (affine) + { + while (n > 0) + { + word_type vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl); + _p = vfmul_vf_f32m8(_p, a, vl); + vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl); + _p = vfadd_vf_f32m8(_p, b, vl); + vfloat32m8_t _beta = vle32_v_f32m8(ptr_beta, vl); + _p = vfmadd_vv_f32m8(_p, _gamma, _beta, vl); + vse32_v_f32m8(ptr_store, _p, vl); + + n -= vl; + ptr_store += vl; + } + } + else + { + while (n > 0) + { + word_type vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl); + _p = vfmul_vf_f32m8(_p, a, vl); + _p = vfadd_vf_f32m8(_p, b, vl); + vse32_v_f32m8(ptr_store, _p, vl); + n -= vl; + ptr_store += vl; + } + } + } + } + + if (dims == 2) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + // assert affine_size == w + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.row(i); + + // mean and var + float sum = 0.f; + float sqsum = 0.f; + vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); + vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); + + { + int n = w; + float* ptr_sum = ptr; + while (n > 0) + { + word_type vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr_sum, vl); + _sum = vfredosum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl); + // _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); + ptr_sum += vl; + n -= vl; + } + } + sum = vfmv_f_s_f32m1_f32(_sum); + float mean = sum / w; + float tmp = 0.f; + + { + int n = w; + float* ptr_sqsum = ptr; + while (n > 0) + { + word_type vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr_sqsum, vl); + _p = vfsub_vf_f32m8(_p, mean, vl); + _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); + n -= vl; + ptr_sqsum += vl; + } + } + sqsum = vfmv_f_s_f32m1_f32(_sqsum); + float var = sqsum / w; + // the var maybe minus due to accuracy + //float var = sqsum / w - mean * mean; + + float a = static_cast(1.f / (sqrt(var + eps))); + float b = -mean * a; + + { + int n = w; + float* ptr_store = ptr; + const float* ptr_gamma = gamma_data; + const float* ptr_beta = beta_data; + if (affine) + { + while (n > 0) + { + word_type vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl); + _p = vfmul_vf_f32m8(_p, a, vl); + vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl); + _p = vfadd_vf_f32m8(_p, b, vl); + vfloat32m8_t _beta = vle32_v_f32m8(ptr_beta, vl); + _p = vfmadd_vv_f32m8(_p, _gamma, _beta, vl); + vse32_v_f32m8(ptr_store, _p, vl); + + n -= vl; + ptr_store += vl; + } + } + else + { + while (n > 0) + { + word_type vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl); + _p = vfmul_vf_f32m8(_p, a, vl); + _p = vfadd_vf_f32m8(_p, b, vl); + vse32_v_f32m8(ptr_store, _p, vl); + n -= vl; + ptr_store += vl; + } + } + } + } + } + if (dims == 3) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int channels = bottom_top_blob.c; + int size = w * h; + + if (affine_size == w) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.channel(q).row(i); + + // mean and var + float sum = 0.f; + float sqsum = 0.f; + vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); + vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); + { + int n = w; + float* ptr_sum = ptr; + while (n > 0) + { + word_type vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr_sum, vl); + _sum = vfredosum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl); + // _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); + ptr_sum += vl; + n -= vl; + } + } + sum = vfmv_f_s_f32m1_f32(_sum); + float mean = sum / w; + + { + int n = w; + float* ptr_sqsum = ptr; + while (n > 0) + { + word_type vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr_sqsum, vl); + _p = vfsub_vf_f32m8(_p, mean, vl); + _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); + n -= vl; + ptr_sqsum += vl; + } + } + sqsum = vfmv_f_s_f32m1_f32(_sqsum); + float var = sqsum / w; + // the var maybe minus due to accuracy + //float var = sqsum / w - mean * mean; + + float a = static_cast(1.f / (sqrt(var + eps))); + float b = -mean * a; + + { + int n = w; + float* ptr_store = ptr; + const float* ptr_gamma = gamma_data; + const float* ptr_beta = beta_data; + if (affine) + { + while (n > 0) + { + word_type vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl); + _p = vfmul_vf_f32m8(_p, a, vl); + vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl); + _p = vfadd_vf_f32m8(_p, b, vl); + vfloat32m8_t _beta = vle32_v_f32m8(ptr_beta, vl); + _p = vfmadd_vv_f32m8(_p, _gamma, _beta, vl); + vse32_v_f32m8(ptr_store, _p, vl); + + n -= vl; + ptr_store += vl; + } + } + else + { + while (n > 0) + { + word_type vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl); + _p = vfmul_vf_f32m8(_p, a, vl); + _p = vfadd_vf_f32m8(_p, b, vl); + vse32_v_f32m8(ptr_store, _p, vl); + n -= vl; + ptr_store += vl; + } + } + } + } + } + } + else // if (affine_size == size) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + // mean and var + float sum = 0.f; + float sqsum = 0.f; + vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); + vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); + { + int n = w; + float* ptr_sum = ptr; + while (n > 0) + { + word_type vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr_sum, vl); + _sum = vfredosum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl); + // _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); + ptr_sum += vl; + n -= vl; + } + } + sum = vfmv_f_s_f32m1_f32(_sum); + + float mean = sum / size; + { + int n = w; + float* ptr_sqsum = ptr; + while (n > 0) + { + word_type vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr_sqsum, vl); + _p = vfsub_vf_f32m8(_p, mean, vl); + _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); + n -= vl; + ptr_sqsum += vl; + } + } + sqsum = vfmv_f_s_f32m1_f32(_sqsum); + + float var = sqsum / size; + // the var maybe minus due to accuracy + //float var = sqsum / size - mean * mean; + + float a = static_cast(1.f / (sqrt(var + eps))); + float b = -mean * a; + + { + int n = w; + float* ptr_store = ptr; + const float* ptr_gamma = gamma_data; + const float* ptr_beta = beta_data; + if (affine) + { + while (n > 0) + { + word_type vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl); + _p = vfmul_vf_f32m8(_p, a, vl); + vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl); + _p = vfadd_vf_f32m8(_p, b, vl); + vfloat32m8_t _beta = vle32_v_f32m8(ptr_beta, vl); + _p = vfmadd_vv_f32m8(_p, _gamma, _beta, vl); + vse32_v_f32m8(ptr_store, _p, vl); + + n -= vl; + ptr_store += vl; + } + } + else + { + while (n > 0) + { + word_type vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl); + _p = vfmul_vf_f32m8(_p, a, vl); + _p = vfadd_vf_f32m8(_p, b, vl); + vse32_v_f32m8(ptr_store, _p, vl); + n -= vl; + ptr_store += vl; + } + } + } + } + } + } + +#else // __riscv_vector + return LayerNorm::forward_inplace(bottom_top_blob, opt); +#endif // __riscv_vector + return 0; +} + +} // namespace ncnn \ No newline at end of file diff --git a/src/layer/riscv/layernorm_riscv.h b/src/layer/riscv/layernorm_riscv.h new file mode 100644 index 000000000..e23cbe8ff --- /dev/null +++ b/src/layer/riscv/layernorm_riscv.h @@ -0,0 +1,31 @@ +// Xavier Hsinyuan is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 Xavier Hsinyuan . All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_LAYERNORM_RISCV_H +#define LAYER_LAYERNORM_RISCV_H + +#include "layernorm.h" + +namespace ncnn { +class LayerNorm_riscv : virtual public LayerNorm +{ +public: + LayerNorm_riscv(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_LAYERNORM_RISCV_H From 062efce8b76f3733445a5541adce486e370462b5 Mon Sep 17 00:00:00 2001 From: Xavier Hsinyuan Date: Thu, 21 Jul 2022 01:11:28 +0800 Subject: [PATCH 02/15] Refactor: RVV:layernorm fp32 w/o elempack --- src/layer/riscv/layernorm_riscv.cpp | 305 +++++++--------------------- 1 file changed, 78 insertions(+), 227 deletions(-) diff --git a/src/layer/riscv/layernorm_riscv.cpp b/src/layer/riscv/layernorm_riscv.cpp index 8a4fc452d..aa597c198 100644 --- a/src/layer/riscv/layernorm_riscv.cpp +++ b/src/layer/riscv/layernorm_riscv.cpp @@ -31,94 +31,102 @@ LayerNorm_riscv::LayerNorm_riscv() #endif } -int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const -{ -// x = (x - mean) / sqrt(var + eps) * gamma + beta #if __riscv_vector - int dims = bottom_top_blob.dims; - int w = bottom_top_blob.w; - if (dims == 1) +static inline int layernorm_rvv_pack1_procedure(int w, float* ptr, const float* gamma_data, const float* beta_data, float eps, int affine_size, int affine) +{ + float sum = 0.f; + float sqsum = 0.f; + vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); + vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); { - float* ptr = bottom_top_blob; + int n = w; + float* ptr_sum = ptr; + while (n > 0) + { + word_type vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr_sum, vl); + _sum = vfredusum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl); + // _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); + ptr_sum += vl; + n -= vl; + } + } + sum = vfmv_f_s_f32m1_f32(_sum); + float mean = sum / w; - // mean and var - float sum = 0.f; - float sqsum = 0.f; - vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); - vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); + { + int n = w; + float* ptr_sqsum = ptr; + while (n > 0) + { + word_type vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr_sqsum, vl); + _p = vfsub_vf_f32m8(_p, mean, vl); + _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); + n -= vl; + ptr_sqsum += vl; + } + } + sqsum = vfmv_f_s_f32m1_f32(_sqsum); + float var = sqsum / w; + // the var maybe minus due to accuracy + //float var = sqsum / w - mean * mean; + float a = static_cast(1.f / (sqrt(var + eps))); + float b = -mean * a; + { + int n = w; + float* ptr_store = ptr; + const float* ptr_gamma = gamma_data; + const float* ptr_beta = beta_data; + if (affine) { - int n = w; - float* ptr_sum = ptr; while (n > 0) { word_type vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr_sum, vl); - _sum = vfredosum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl); - // _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); - ptr_sum += vl; + vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl); + _p = vfmul_vf_f32m8(_p, a, vl); + vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl); + _p = vfadd_vf_f32m8(_p, b, vl); + vfloat32m8_t _beta = vle32_v_f32m8(ptr_beta, vl); + _p = vfmadd_vv_f32m8(_p, _gamma, _beta, vl); + vse32_v_f32m8(ptr_store, _p, vl); + n -= vl; + ptr_store += vl; + ptr_gamma += vl; + ptr_beta += vl; } } - sum = vfmv_f_s_f32m1_f32(_sum); - float mean = sum / w; - + else { - int n = w; - float* ptr_sqsum = ptr; while (n > 0) { word_type vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr_sqsum, vl); - _p = vfsub_vf_f32m8(_p, mean, vl); - _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); + vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl); + _p = vfmul_vf_f32m8(_p, a, vl); + _p = vfadd_vf_f32m8(_p, b, vl); + vse32_v_f32m8(ptr_store, _p, vl); n -= vl; - ptr_sqsum += vl; + ptr_store += vl; } } - sqsum = vfmv_f_s_f32m1_f32(_sqsum); - float var = sqsum / w; - // the var maybe minus due to accuracy - //float var = sqsum / w - mean * mean; - float a = static_cast(1.f / (sqrt(var + eps))); - float b = -mean * a; + } + return 0; +} +#endif // __riscv_vector - { - int n = w; - float* ptr_store = ptr; - const float* ptr_gamma = gamma_data; - const float* ptr_beta = beta_data; - if (affine) - { - while (n > 0) - { - word_type vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl); - _p = vfmul_vf_f32m8(_p, a, vl); - vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl); - _p = vfadd_vf_f32m8(_p, b, vl); - vfloat32m8_t _beta = vle32_v_f32m8(ptr_beta, vl); - _p = vfmadd_vv_f32m8(_p, _gamma, _beta, vl); - vse32_v_f32m8(ptr_store, _p, vl); +int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ +// x = (x - mean) / sqrt(var + eps) * gamma + beta +#if __riscv_vector + int dims = bottom_top_blob.dims; + int w = bottom_top_blob.w; + if (dims == 1) + { + float* ptr = bottom_top_blob; - n -= vl; - ptr_store += vl; - } - } - else - { - while (n > 0) - { - word_type vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl); - _p = vfmul_vf_f32m8(_p, a, vl); - _p = vfadd_vf_f32m8(_p, b, vl); - vse32_v_f32m8(ptr_store, _p, vl); - n -= vl; - ptr_store += vl; - } - } - } + return layernorm_rvv_pack1_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine); } if (dims == 2) @@ -131,87 +139,7 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co for (int i = 0; i < h; i++) { float* ptr = bottom_top_blob.row(i); - - // mean and var - float sum = 0.f; - float sqsum = 0.f; - vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); - vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); - - { - int n = w; - float* ptr_sum = ptr; - while (n > 0) - { - word_type vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr_sum, vl); - _sum = vfredosum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl); - // _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); - ptr_sum += vl; - n -= vl; - } - } - sum = vfmv_f_s_f32m1_f32(_sum); - float mean = sum / w; - float tmp = 0.f; - - { - int n = w; - float* ptr_sqsum = ptr; - while (n > 0) - { - word_type vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr_sqsum, vl); - _p = vfsub_vf_f32m8(_p, mean, vl); - _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); - n -= vl; - ptr_sqsum += vl; - } - } - sqsum = vfmv_f_s_f32m1_f32(_sqsum); - float var = sqsum / w; - // the var maybe minus due to accuracy - //float var = sqsum / w - mean * mean; - - float a = static_cast(1.f / (sqrt(var + eps))); - float b = -mean * a; - - { - int n = w; - float* ptr_store = ptr; - const float* ptr_gamma = gamma_data; - const float* ptr_beta = beta_data; - if (affine) - { - while (n > 0) - { - word_type vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl); - _p = vfmul_vf_f32m8(_p, a, vl); - vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl); - _p = vfadd_vf_f32m8(_p, b, vl); - vfloat32m8_t _beta = vle32_v_f32m8(ptr_beta, vl); - _p = vfmadd_vv_f32m8(_p, _gamma, _beta, vl); - vse32_v_f32m8(ptr_store, _p, vl); - - n -= vl; - ptr_store += vl; - } - } - else - { - while (n > 0) - { - word_type vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl); - _p = vfmul_vf_f32m8(_p, a, vl); - _p = vfadd_vf_f32m8(_p, b, vl); - vse32_v_f32m8(ptr_store, _p, vl); - n -= vl; - ptr_store += vl; - } - } - } + layernorm_rvv_pack1_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine); } } if (dims == 3) @@ -230,84 +158,7 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co { float* ptr = bottom_top_blob.channel(q).row(i); - // mean and var - float sum = 0.f; - float sqsum = 0.f; - vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); - vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); - { - int n = w; - float* ptr_sum = ptr; - while (n > 0) - { - word_type vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr_sum, vl); - _sum = vfredosum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl); - // _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); - ptr_sum += vl; - n -= vl; - } - } - sum = vfmv_f_s_f32m1_f32(_sum); - float mean = sum / w; - - { - int n = w; - float* ptr_sqsum = ptr; - while (n > 0) - { - word_type vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr_sqsum, vl); - _p = vfsub_vf_f32m8(_p, mean, vl); - _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); - n -= vl; - ptr_sqsum += vl; - } - } - sqsum = vfmv_f_s_f32m1_f32(_sqsum); - float var = sqsum / w; - // the var maybe minus due to accuracy - //float var = sqsum / w - mean * mean; - - float a = static_cast(1.f / (sqrt(var + eps))); - float b = -mean * a; - - { - int n = w; - float* ptr_store = ptr; - const float* ptr_gamma = gamma_data; - const float* ptr_beta = beta_data; - if (affine) - { - while (n > 0) - { - word_type vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl); - _p = vfmul_vf_f32m8(_p, a, vl); - vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl); - _p = vfadd_vf_f32m8(_p, b, vl); - vfloat32m8_t _beta = vle32_v_f32m8(ptr_beta, vl); - _p = vfmadd_vv_f32m8(_p, _gamma, _beta, vl); - vse32_v_f32m8(ptr_store, _p, vl); - - n -= vl; - ptr_store += vl; - } - } - else - { - while (n > 0) - { - word_type vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl); - _p = vfmul_vf_f32m8(_p, a, vl); - _p = vfadd_vf_f32m8(_p, b, vl); - vse32_v_f32m8(ptr_store, _p, vl); - n -= vl; - ptr_store += vl; - } - } - } + layernorm_rvv_pack1_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine); } } } From 569b24c14fa17f9c9e85e6ddb7d4e72a214fc497 Mon Sep 17 00:00:00 2001 From: Xavier Hsinyuan Date: Thu, 21 Jul 2022 11:22:39 +0800 Subject: [PATCH 03/15] RVV: layernorm fp32 elempack --- src/layer/riscv/layernorm_riscv.cpp | 291 +++++++++++++++++++++------- 1 file changed, 216 insertions(+), 75 deletions(-) diff --git a/src/layer/riscv/layernorm_riscv.cpp b/src/layer/riscv/layernorm_riscv.cpp index aa597c198..ddd22cb1e 100644 --- a/src/layer/riscv/layernorm_riscv.cpp +++ b/src/layer/riscv/layernorm_riscv.cpp @@ -22,12 +22,13 @@ #endif #endif // __riscv_vector +#include "riscv_usability.h" + namespace ncnn { LayerNorm_riscv::LayerNorm_riscv() { #if __riscv_vector -#warning "TODO: packing. Don't merge at this moment." - // support_packing = true; + support_packing = true; #endif } @@ -114,19 +115,72 @@ static inline int layernorm_rvv_pack1_procedure(int w, float* ptr, const float* } return 0; } + +static inline int layernorm_rvv_packn_procedure(int w, float* ptr, const float* gamma_data, const float* beta_data, float eps, int affine_size, int affine, const word_type vl) +{ + // mean and var + vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sqsum = vfmv_v_f_f32m1(0.f, vl); + for (int i = 0; i < w; i++) + { + vfloat32m1_t _p = vle32_v_f32m1(ptr + vl * i, vl); + _sum = vfadd_vv_f32m1(_p, _sum, vl); + // _sqsum = vfmadd_vv_f32m1(_p,_p,_sqsum,vl); + } + vfloat32m1_t _mean = vfdiv_vf_f32m1(_sum, w, vl); + for (int i = 0; i < w; i++) + { + vfloat32m1_t _p = vle32_v_f32m1(ptr + vl * i, vl); + _p = vfsub_vv_f32m1(_p, _mean, vl); + _sqsum = vfmadd_vv_f32m1(_p, _p, _sqsum, vl); + } + vfloat32m1_t _var = vfdiv_vf_f32m1(_sqsum, w, vl); + + // the var maybe minus due to accuracy + //float var = sqsum / w - mean * mean; + vfloat32m1_t _a = vfrdiv_vf_f32m1(vfsqrt_v_f32m1(vfadd_vf_f32m1(_var, eps, vl), vl), 1.f, vl); + vfloat32m1_t _b = vfmul_vv_f32m1(vfsgnjn_vv_f32m1(_mean, _mean, vl), _a, vl); + if (affine) + { + for (int i = 0; i < w; i++) + { + const int offset = vl * i; + vfloat32m1_t _p = vle32_v_f32m1(ptr + offset, vl); + _p = vfmadd_vv_f32m1(_p, _a, _b, vl); + _p = vfmul_vf_f32m1(_p, gamma_data[i], vl); + _p = vfadd_vf_f32m1(_p, beta_data[i], vl); + vse32_v_f32m1(ptr + offset, _p, vl); + } + } + else + { + for (int i = 0; i < w; i++) + { + const int offset = vl * i; + vfloat32m1_t _p = vle32_v_f32m1(ptr + offset, vl); + _p = vfmadd_vv_f32m1(_p, _a, _b, vl); + vse32_v_f32m1(ptr + offset, _p, vl); + } + } + + return 0; +} #endif // __riscv_vector int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { // x = (x - mean) / sqrt(var + eps) * gamma + beta #if __riscv_vector + int elempack = bottom_top_blob.elempack; + const int packn = csrr_vlenb() / 4; int dims = bottom_top_blob.dims; int w = bottom_top_blob.w; + if (dims == 1) { float* ptr = bottom_top_blob; - return layernorm_rvv_pack1_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine); + return layernorm_rvv_pack1_procedure(w * elempack, ptr, gamma_data, beta_data, eps, affine_size, affine); } if (dims == 2) @@ -134,12 +188,24 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co int w = bottom_top_blob.w; int h = bottom_top_blob.h; // assert affine_size == w - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + if (elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.row(i); + layernorm_rvv_pack1_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine); + } + } + if (elempack == packn) { - float* ptr = bottom_top_blob.row(i); - layernorm_rvv_pack1_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine); + const word_type vl = vsetvl_e32m1(packn); + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.row(i); + layernorm_rvv_packn_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine, vl); + } } } if (dims == 3) @@ -151,100 +217,175 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co if (affine_size == w) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + if (elempack == 1) { - for (int i = 0; i < h; i++) + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - float* ptr = bottom_top_blob.channel(q).row(i); + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.channel(q).row(i); - layernorm_rvv_pack1_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine); + layernorm_rvv_pack1_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine); + } + } + } + if (elempack == packn) + { + const word_type vl = vsetvl_e32m1(packn); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.channel(q).row(i); + + layernorm_rvv_packn_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine, vl); + } } } } else // if (affine_size == size) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + if (elempack == 1) { - float* ptr = bottom_top_blob.channel(q); - - // mean and var - float sum = 0.f; - float sqsum = 0.f; - vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); - vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - int n = w; - float* ptr_sum = ptr; - while (n > 0) + float* ptr = bottom_top_blob.channel(q); + + // mean and var + float sum = 0.f; + float sqsum = 0.f; + vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); + vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); { - word_type vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr_sum, vl); - _sum = vfredosum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl); - // _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); - ptr_sum += vl; - n -= vl; + int n = size; + float* ptr_sum = ptr; + while (n > 0) + { + word_type vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr_sum, vl); + _sum = vfredosum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl); + // _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); + ptr_sum += vl; + n -= vl; + } } - } - sum = vfmv_f_s_f32m1_f32(_sum); + sum = vfmv_f_s_f32m1_f32(_sum); - float mean = sum / size; - { - int n = w; - float* ptr_sqsum = ptr; - while (n > 0) + float mean = sum / size; { - word_type vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr_sqsum, vl); - _p = vfsub_vf_f32m8(_p, mean, vl); - _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); - n -= vl; - ptr_sqsum += vl; + int n = size; + float* ptr_sqsum = ptr; + while (n > 0) + { + word_type vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr_sqsum, vl); + _p = vfsub_vf_f32m8(_p, mean, vl); + _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); + n -= vl; + ptr_sqsum += vl; + } } - } - sqsum = vfmv_f_s_f32m1_f32(_sqsum); + sqsum = vfmv_f_s_f32m1_f32(_sqsum); + + float var = sqsum / size; + // the var maybe minus due to accuracy + //float var = sqsum / size - mean * mean; - float var = sqsum / size; - // the var maybe minus due to accuracy - //float var = sqsum / size - mean * mean; + float a = static_cast(1.f / (sqrt(var + eps))); + float b = -mean * a; - float a = static_cast(1.f / (sqrt(var + eps))); - float b = -mean * a; + { + int n = size; + float* ptr_store = ptr; + const float* ptr_gamma = gamma_data; + const float* ptr_beta = beta_data; + if (affine) + { + while (n > 0) + { + word_type vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl); + _p = vfmul_vf_f32m8(_p, a, vl); + vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl); + _p = vfadd_vf_f32m8(_p, b, vl); + vfloat32m8_t _beta = vle32_v_f32m8(ptr_beta, vl); + _p = vfmadd_vv_f32m8(_p, _gamma, _beta, vl); + vse32_v_f32m8(ptr_store, _p, vl); + n -= vl; + ptr_store += vl; + ptr_gamma += vl; + ptr_beta += vl; + } + } + else + { + while (n > 0) + { + word_type vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl); + _p = vfmul_vf_f32m8(_p, a, vl); + _p = vfadd_vf_f32m8(_p, b, vl); + vse32_v_f32m8(ptr_store, _p, vl); + n -= vl; + ptr_store += vl; + } + } + } + } + } + if (elempack == packn) + { + const word_type vl = vsetvl_e32m1(packn); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - int n = w; - float* ptr_store = ptr; - const float* ptr_gamma = gamma_data; - const float* ptr_beta = beta_data; + float* ptr = bottom_top_blob.channel(q); + // mean and var + vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vsetvlmax_e32m1()); + vfloat32m1_t _sqsum = vfmv_v_f_f32m1(0.f, vsetvlmax_e32m1()); + for (int i = 0; i < size; i++) + { + vfloat32m1_t _p = vle32_v_f32m1(ptr + vl * i, vl); + _sum = vfadd_vv_f32m1(_p, _sum, vl); + // _sqsum = vfmadd_vv_f32m1(_p,_p,_sqsum,vl); + } + vfloat32m1_t _mean = vfdiv_vf_f32m1(_sum, size, vl); + for (int i = 0; i < size; i++) + { + vfloat32m1_t _p = vle32_v_f32m1(ptr + vl * i, vl); + _p = vfsub_vv_f32m1(_p, _mean, vl); + _sqsum = vfmadd_vv_f32m1(_p, _p, _sqsum, vl); + } + vfloat32m1_t _var = vfdiv_vf_f32m1(_sqsum, size, vl); + + // the var maybe minus due to accuracy + //float var = sqsum / w - mean * mean; + vfloat32m1_t _a = vfrdiv_vf_f32m1(vfsqrt_v_f32m1(vfadd_vf_f32m1(_var, eps, vl), vl), 1.f, vl); + vfloat32m1_t _b = vfmul_vv_f32m1(vfsgnjn_vv_f32m1(_mean, _mean, vl), _a, vl); if (affine) { - while (n > 0) + for (int i = 0; i < size; i++) { - word_type vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl); - _p = vfmul_vf_f32m8(_p, a, vl); - vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl); - _p = vfadd_vf_f32m8(_p, b, vl); - vfloat32m8_t _beta = vle32_v_f32m8(ptr_beta, vl); - _p = vfmadd_vv_f32m8(_p, _gamma, _beta, vl); - vse32_v_f32m8(ptr_store, _p, vl); - - n -= vl; - ptr_store += vl; + const int offset = vl * i; + vfloat32m1_t _p = vle32_v_f32m1(ptr + offset, vl); + _p = vfmadd_vv_f32m1(_p, _a, _b, vl); + _p = vfmul_vf_f32m1(_p, gamma_data[i], vl); + _p = vfadd_vf_f32m1(_p, beta_data[i], vl); + vse32_v_f32m1(ptr + offset, _p, vl); } } else { - while (n > 0) + for (int i = 0; i < size; i++) { - word_type vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl); - _p = vfmul_vf_f32m8(_p, a, vl); - _p = vfadd_vf_f32m8(_p, b, vl); - vse32_v_f32m8(ptr_store, _p, vl); - n -= vl; - ptr_store += vl; + const int offset = vl * i; + vfloat32m1_t _p = vle32_v_f32m1(ptr + offset, vl); + _p = vfmadd_vv_f32m1(_p, _a, _b, vl); + vse32_v_f32m1(ptr + offset, _p, vl); } } } From 706ebd14ac22451e3c753006f4f2aba1c63699c8 Mon Sep 17 00:00:00 2001 From: Xavier Hsinyuan Date: Thu, 21 Jul 2022 18:13:11 +0800 Subject: [PATCH 04/15] Refactor: layernorm refactor condition check order when dim >= 2. --- src/layer/riscv/layernorm_riscv.cpp | 97 ++++++++++++++++------------- 1 file changed, 53 insertions(+), 44 deletions(-) diff --git a/src/layer/riscv/layernorm_riscv.cpp b/src/layer/riscv/layernorm_riscv.cpp index ddd22cb1e..a3f1f44c8 100644 --- a/src/layer/riscv/layernorm_riscv.cpp +++ b/src/layer/riscv/layernorm_riscv.cpp @@ -182,14 +182,13 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co return layernorm_rvv_pack1_procedure(w * elempack, ptr, gamma_data, beta_data, eps, affine_size, affine); } - - if (dims == 2) + if (elempack == 1) { - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - // assert affine_size == w - if (elempack == 1) + if (dims == 2) { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + // assert affine_size == w #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < h; i++) { @@ -197,27 +196,14 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co layernorm_rvv_pack1_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine); } } - if (elempack == packn) - { - const word_type vl = vsetvl_e32m1(packn); - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - float* ptr = bottom_top_blob.row(i); - layernorm_rvv_packn_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine, vl); - } - } - } - if (dims == 3) - { - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int channels = bottom_top_blob.c; - int size = w * h; - if (affine_size == w) + if (dims == 3) { - if (elempack == 1) + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int channels = bottom_top_blob.c; + int size = w * h; + if (affine_size == w) { #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) @@ -230,24 +216,7 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co } } } - if (elempack == packn) - { - const word_type vl = vsetvl_e32m1(packn); - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - for (int i = 0; i < h; i++) - { - float* ptr = bottom_top_blob.channel(q).row(i); - - layernorm_rvv_packn_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine, vl); - } - } - } - } - else // if (affine_size == size) - { - if (elempack == 1) + else // if (affine_size == size) { #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) @@ -337,7 +306,47 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co } } } - if (elempack == packn) + } + } + + if (elempack == packn) + { + if (dims == 2) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + // assert affine_size == w + + const word_type vl = vsetvl_e32m1(packn); + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.row(i); + layernorm_rvv_packn_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine, vl); + } + } + if (dims == 3) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int channels = bottom_top_blob.c; + int size = w * h; + + if (affine_size == w) + { + const word_type vl = vsetvl_e32m1(packn); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.channel(q).row(i); + + layernorm_rvv_packn_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine, vl); + } + } + } + else // if (affine_size == size) { const word_type vl = vsetvl_e32m1(packn); #pragma omp parallel for num_threads(opt.num_threads) From c42186e6e4ba43b99289b193cc56cb0cfd21db1b Mon Sep 17 00:00:00 2001 From: Xavier Hsinyuan Date: Thu, 21 Jul 2022 18:53:35 +0800 Subject: [PATCH 05/15] Refactor: layernorm * remove some unused variable; * refactor when "affine_size == size && dim == 3" --- src/layer/riscv/layernorm_riscv.cpp | 150 +++------------------------- 1 file changed, 12 insertions(+), 138 deletions(-) diff --git a/src/layer/riscv/layernorm_riscv.cpp b/src/layer/riscv/layernorm_riscv.cpp index a3f1f44c8..606502fb5 100644 --- a/src/layer/riscv/layernorm_riscv.cpp +++ b/src/layer/riscv/layernorm_riscv.cpp @@ -33,7 +33,7 @@ LayerNorm_riscv::LayerNorm_riscv() } #if __riscv_vector -static inline int layernorm_rvv_pack1_procedure(int w, float* ptr, const float* gamma_data, const float* beta_data, float eps, int affine_size, int affine) +static inline int layernorm_rvv_pack1_procedure(int w, float* ptr, const float* gamma_data, const float* beta_data, float eps, int affine) { float sum = 0.f; float sqsum = 0.f; @@ -47,7 +47,7 @@ static inline int layernorm_rvv_pack1_procedure(int w, float* ptr, const float* word_type vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr_sum, vl); _sum = vfredusum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl); - // _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); + // _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); ptr_sum += vl; n -= vl; } @@ -63,7 +63,7 @@ static inline int layernorm_rvv_pack1_procedure(int w, float* ptr, const float* word_type vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr_sqsum, vl); _p = vfsub_vf_f32m8(_p, mean, vl); - _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); + _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); n -= vl; ptr_sqsum += vl; } @@ -116,7 +116,7 @@ static inline int layernorm_rvv_pack1_procedure(int w, float* ptr, const float* return 0; } -static inline int layernorm_rvv_packn_procedure(int w, float* ptr, const float* gamma_data, const float* beta_data, float eps, int affine_size, int affine, const word_type vl) +static inline int layernorm_rvv_packn_procedure(int w, float* ptr, const float* gamma_data, const float* beta_data, float eps, int affine, const word_type vl) { // mean and var vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); @@ -180,7 +180,7 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co { float* ptr = bottom_top_blob; - return layernorm_rvv_pack1_procedure(w * elempack, ptr, gamma_data, beta_data, eps, affine_size, affine); + return layernorm_rvv_pack1_procedure(w * elempack, ptr, gamma_data, beta_data, eps, affine); } if (elempack == 1) { @@ -193,7 +193,7 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co for (int i = 0; i < h; i++) { float* ptr = bottom_top_blob.row(i); - layernorm_rvv_pack1_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine); + layernorm_rvv_pack1_procedure(w, ptr, gamma_data, beta_data, eps, affine); } } @@ -212,7 +212,7 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co { float* ptr = bottom_top_blob.channel(q).row(i); - layernorm_rvv_pack1_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine); + layernorm_rvv_pack1_procedure(w, ptr, gamma_data, beta_data, eps, affine); } } } @@ -222,88 +222,7 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co for (int q = 0; q < channels; q++) { float* ptr = bottom_top_blob.channel(q); - - // mean and var - float sum = 0.f; - float sqsum = 0.f; - vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); - vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); - { - int n = size; - float* ptr_sum = ptr; - while (n > 0) - { - word_type vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr_sum, vl); - _sum = vfredosum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl); - // _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); - ptr_sum += vl; - n -= vl; - } - } - sum = vfmv_f_s_f32m1_f32(_sum); - - float mean = sum / size; - { - int n = size; - float* ptr_sqsum = ptr; - while (n > 0) - { - word_type vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr_sqsum, vl); - _p = vfsub_vf_f32m8(_p, mean, vl); - _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); - n -= vl; - ptr_sqsum += vl; - } - } - sqsum = vfmv_f_s_f32m1_f32(_sqsum); - - float var = sqsum / size; - // the var maybe minus due to accuracy - //float var = sqsum / size - mean * mean; - - float a = static_cast(1.f / (sqrt(var + eps))); - float b = -mean * a; - - { - int n = size; - float* ptr_store = ptr; - const float* ptr_gamma = gamma_data; - const float* ptr_beta = beta_data; - if (affine) - { - while (n > 0) - { - word_type vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl); - _p = vfmul_vf_f32m8(_p, a, vl); - vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl); - _p = vfadd_vf_f32m8(_p, b, vl); - vfloat32m8_t _beta = vle32_v_f32m8(ptr_beta, vl); - _p = vfmadd_vv_f32m8(_p, _gamma, _beta, vl); - vse32_v_f32m8(ptr_store, _p, vl); - - n -= vl; - ptr_store += vl; - ptr_gamma += vl; - ptr_beta += vl; - } - } - else - { - while (n > 0) - { - word_type vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl); - _p = vfmul_vf_f32m8(_p, a, vl); - _p = vfadd_vf_f32m8(_p, b, vl); - vse32_v_f32m8(ptr_store, _p, vl); - n -= vl; - ptr_store += vl; - } - } - } + layernorm_rvv_pack1_procedure(size, ptr, gamma_data, beta_data, eps, affine); } } } @@ -311,18 +230,18 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co if (elempack == packn) { + const word_type vl = vsetvl_e32m1(packn); if (dims == 2) { int w = bottom_top_blob.w; int h = bottom_top_blob.h; // assert affine_size == w - const word_type vl = vsetvl_e32m1(packn); #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < h; i++) { float* ptr = bottom_top_blob.row(i); - layernorm_rvv_packn_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine, vl); + layernorm_rvv_packn_procedure(w, ptr, gamma_data, beta_data, eps, affine, vl); } } if (dims == 3) @@ -334,7 +253,6 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co if (affine_size == w) { - const word_type vl = vsetvl_e32m1(packn); #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { @@ -342,61 +260,17 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co { float* ptr = bottom_top_blob.channel(q).row(i); - layernorm_rvv_packn_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine, vl); + layernorm_rvv_packn_procedure(w, ptr, gamma_data, beta_data, eps, affine, vl); } } } else // if (affine_size == size) { - const word_type vl = vsetvl_e32m1(packn); #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { float* ptr = bottom_top_blob.channel(q); - // mean and var - vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vsetvlmax_e32m1()); - vfloat32m1_t _sqsum = vfmv_v_f_f32m1(0.f, vsetvlmax_e32m1()); - for (int i = 0; i < size; i++) - { - vfloat32m1_t _p = vle32_v_f32m1(ptr + vl * i, vl); - _sum = vfadd_vv_f32m1(_p, _sum, vl); - // _sqsum = vfmadd_vv_f32m1(_p,_p,_sqsum,vl); - } - vfloat32m1_t _mean = vfdiv_vf_f32m1(_sum, size, vl); - for (int i = 0; i < size; i++) - { - vfloat32m1_t _p = vle32_v_f32m1(ptr + vl * i, vl); - _p = vfsub_vv_f32m1(_p, _mean, vl); - _sqsum = vfmadd_vv_f32m1(_p, _p, _sqsum, vl); - } - vfloat32m1_t _var = vfdiv_vf_f32m1(_sqsum, size, vl); - - // the var maybe minus due to accuracy - //float var = sqsum / w - mean * mean; - vfloat32m1_t _a = vfrdiv_vf_f32m1(vfsqrt_v_f32m1(vfadd_vf_f32m1(_var, eps, vl), vl), 1.f, vl); - vfloat32m1_t _b = vfmul_vv_f32m1(vfsgnjn_vv_f32m1(_mean, _mean, vl), _a, vl); - if (affine) - { - for (int i = 0; i < size; i++) - { - const int offset = vl * i; - vfloat32m1_t _p = vle32_v_f32m1(ptr + offset, vl); - _p = vfmadd_vv_f32m1(_p, _a, _b, vl); - _p = vfmul_vf_f32m1(_p, gamma_data[i], vl); - _p = vfadd_vf_f32m1(_p, beta_data[i], vl); - vse32_v_f32m1(ptr + offset, _p, vl); - } - } - else - { - for (int i = 0; i < size; i++) - { - const int offset = vl * i; - vfloat32m1_t _p = vle32_v_f32m1(ptr + offset, vl); - _p = vfmadd_vv_f32m1(_p, _a, _b, vl); - vse32_v_f32m1(ptr + offset, _p, vl); - } - } + layernorm_rvv_packn_procedure(size, ptr, gamma_data, beta_data, eps, affine, vl); } } } From 03ca1578a256fd6fbd0567fb285d5141d0ea6341 Mon Sep 17 00:00:00 2001 From: Xavier Hsinyuan Date: Fri, 22 Jul 2022 23:55:22 +0800 Subject: [PATCH 06/15] Refactor: RVV LayerNorm fp32 w/ elempack * Use `vfmacc.vv` to get sqsum; --- src/layer/riscv/layernorm_riscv.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/layer/riscv/layernorm_riscv.cpp b/src/layer/riscv/layernorm_riscv.cpp index 606502fb5..53f3d19f1 100644 --- a/src/layer/riscv/layernorm_riscv.cpp +++ b/src/layer/riscv/layernorm_riscv.cpp @@ -132,13 +132,14 @@ static inline int layernorm_rvv_packn_procedure(int w, float* ptr, const float* { vfloat32m1_t _p = vle32_v_f32m1(ptr + vl * i, vl); _p = vfsub_vv_f32m1(_p, _mean, vl); - _sqsum = vfmadd_vv_f32m1(_p, _p, _sqsum, vl); + _sqsum = vfmacc_vv_f32m1(_sqsum, _p, _p, vl); } vfloat32m1_t _var = vfdiv_vf_f32m1(_sqsum, w, vl); // the var maybe minus due to accuracy //float var = sqsum / w - mean * mean; vfloat32m1_t _a = vfrdiv_vf_f32m1(vfsqrt_v_f32m1(vfadd_vf_f32m1(_var, eps, vl), vl), 1.f, vl); + // how about vfrsqrt7.v? vfloat32m1_t _b = vfmul_vv_f32m1(vfsgnjn_vv_f32m1(_mean, _mean, vl), _a, vl); if (affine) { From b495fa5413694083ae720a361069635383341da9 Mon Sep 17 00:00:00 2001 From: Xavier Hsinyuan Date: Sat, 23 Jul 2022 18:00:30 +0800 Subject: [PATCH 07/15] Refactor: RVV LayerNorm fp32 --- src/layer/riscv/layernorm_riscv.cpp | 30 ++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/layer/riscv/layernorm_riscv.cpp b/src/layer/riscv/layernorm_riscv.cpp index 53f3d19f1..c613f8679 100644 --- a/src/layer/riscv/layernorm_riscv.cpp +++ b/src/layer/riscv/layernorm_riscv.cpp @@ -33,14 +33,14 @@ LayerNorm_riscv::LayerNorm_riscv() } #if __riscv_vector -static inline int layernorm_rvv_pack1_procedure(int w, float* ptr, const float* gamma_data, const float* beta_data, float eps, int affine) +static inline int layernorm_rvv_pack1_procedure(int size, float* ptr, const float* gamma_data, const float* beta_data, float eps, int affine) { float sum = 0.f; float sqsum = 0.f; vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); { - int n = w; + int n = size; float* ptr_sum = ptr; while (n > 0) { @@ -53,10 +53,10 @@ static inline int layernorm_rvv_pack1_procedure(int w, float* ptr, const float* } } sum = vfmv_f_s_f32m1_f32(_sum); - float mean = sum / w; + float mean = sum / size; { - int n = w; + int n = size; float* ptr_sqsum = ptr; while (n > 0) { @@ -69,14 +69,14 @@ static inline int layernorm_rvv_pack1_procedure(int w, float* ptr, const float* } } sqsum = vfmv_f_s_f32m1_f32(_sqsum); - float var = sqsum / w; + float var = sqsum / size; // the var maybe minus due to accuracy - //float var = sqsum / w - mean * mean; + //float var = sqsum / size - mean * mean; float a = static_cast(1.f / (sqrt(var + eps))); float b = -mean * a; { - int n = w; + int n = size; float* ptr_store = ptr; const float* ptr_gamma = gamma_data; const float* ptr_beta = beta_data; @@ -116,34 +116,34 @@ static inline int layernorm_rvv_pack1_procedure(int w, float* ptr, const float* return 0; } -static inline int layernorm_rvv_packn_procedure(int w, float* ptr, const float* gamma_data, const float* beta_data, float eps, int affine, const word_type vl) +static inline int layernorm_rvv_packn_procedure(int size, float* ptr, const float* gamma_data, const float* beta_data, float eps, int affine, const word_type vl) { // mean and var vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); vfloat32m1_t _sqsum = vfmv_v_f_f32m1(0.f, vl); - for (int i = 0; i < w; i++) + for (int i = 0; i < size; i++) { vfloat32m1_t _p = vle32_v_f32m1(ptr + vl * i, vl); _sum = vfadd_vv_f32m1(_p, _sum, vl); // _sqsum = vfmadd_vv_f32m1(_p,_p,_sqsum,vl); } - vfloat32m1_t _mean = vfdiv_vf_f32m1(_sum, w, vl); - for (int i = 0; i < w; i++) + vfloat32m1_t _mean = vfdiv_vf_f32m1(_sum, size, vl); + for (int i = 0; i < size; i++) { vfloat32m1_t _p = vle32_v_f32m1(ptr + vl * i, vl); _p = vfsub_vv_f32m1(_p, _mean, vl); _sqsum = vfmacc_vv_f32m1(_sqsum, _p, _p, vl); } - vfloat32m1_t _var = vfdiv_vf_f32m1(_sqsum, w, vl); + vfloat32m1_t _var = vfdiv_vf_f32m1(_sqsum, size, vl); // the var maybe minus due to accuracy - //float var = sqsum / w - mean * mean; + //float var = sqsum / size - mean * mean; vfloat32m1_t _a = vfrdiv_vf_f32m1(vfsqrt_v_f32m1(vfadd_vf_f32m1(_var, eps, vl), vl), 1.f, vl); // how about vfrsqrt7.v? vfloat32m1_t _b = vfmul_vv_f32m1(vfsgnjn_vv_f32m1(_mean, _mean, vl), _a, vl); if (affine) { - for (int i = 0; i < w; i++) + for (int i = 0; i < size; i++) { const int offset = vl * i; vfloat32m1_t _p = vle32_v_f32m1(ptr + offset, vl); @@ -155,7 +155,7 @@ static inline int layernorm_rvv_packn_procedure(int w, float* ptr, const float* } else { - for (int i = 0; i < w; i++) + for (int i = 0; i < size; i++) { const int offset = vl * i; vfloat32m1_t _p = vle32_v_f32m1(ptr + offset, vl); From 5edb46b6db4e36ac42e5b3d88515be21a3ebb80a Mon Sep 17 00:00:00 2001 From: Xavier Hsinyuan Date: Sat, 23 Jul 2022 18:48:22 +0800 Subject: [PATCH 08/15] RVV: layernorm fp16 storage w/ elempack support --- src/layer/riscv/layernorm_riscv.cpp | 263 ++++++++++++++++++++++++++++ src/layer/riscv/layernorm_riscv.h | 5 + 2 files changed, 268 insertions(+) diff --git a/src/layer/riscv/layernorm_riscv.cpp b/src/layer/riscv/layernorm_riscv.cpp index c613f8679..aaaed4328 100644 --- a/src/layer/riscv/layernorm_riscv.cpp +++ b/src/layer/riscv/layernorm_riscv.cpp @@ -29,7 +29,10 @@ LayerNorm_riscv::LayerNorm_riscv() { #if __riscv_vector support_packing = true; +#if __riscv_zfh + support_fp16_storage = true; #endif +#endif // __riscv_vector } #if __riscv_vector @@ -172,6 +175,14 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co { // x = (x - mean) / sqrt(var + eps) * gamma + beta #if __riscv_vector + int elembits = bottom_top_blob.elembits(); +#if __riscv_zfh + if (opt.use_fp16_storage && elembits == 16) + { + return forward_inplace_fp16s(bottom_top_blob, opt); + } +#endif // __riscv_zfh + int elempack = bottom_top_blob.elempack; const int packn = csrr_vlenb() / 4; int dims = bottom_top_blob.dims; @@ -283,4 +294,256 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co return 0; } +#if __riscv_vector && __riscv_zfh + +static inline int layernorm_rvv_pack1_fp16s_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine) +{ + float sum = 0.f; + float sqsum = 0.f; + vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); + vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); + { + int n = size; + __fp16* ptr_sum = ptr; + while (n > 0) + { + word_type vl = vsetvl_e16m4(n); + vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_sum, vl), vl); + _sum = vfredusum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl); + // _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); + ptr_sum += vl; + n -= vl; + } + } + sum = vfmv_f_s_f32m1_f32(_sum); + float mean = sum / size; + + { + int n = size; + __fp16* ptr_sqsum = ptr; + while (n > 0) + { + word_type vl = vsetvl_e16m4(n); + vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_sqsum, vl), vl); + _p = vfsub_vf_f32m8(_p, mean, vl); + _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); + n -= vl; + ptr_sqsum += vl; + } + } + sqsum = vfmv_f_s_f32m1_f32(_sqsum); + float var = sqsum / size; + // the var maybe minus due to accuracy + //float var = sqsum / size - mean * mean; + float a = static_cast(1.f / (sqrt(var + eps))); + float b = -mean * a; + + { + int n = size; + __fp16* ptr_store = ptr; + const float* ptr_gamma = gamma_data; + const float* ptr_beta = beta_data; + if (affine) + { + while (n > 0) + { + word_type vl = vsetvl_e16m4(n); + vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_store, vl), vl); + _p = vfmul_vf_f32m8(_p, a, vl); + vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl); + _p = vfadd_vf_f32m8(_p, b, vl); + vfloat32m8_t _beta = vle32_v_f32m8(ptr_beta, vl); + _p = vfmadd_vv_f32m8(_p, _gamma, _beta, vl); + vse16_v_f16m4(ptr_store, vfncvt_f_f_w_f16m4(_p, vl), vl); + + n -= vl; + ptr_store += vl; + ptr_gamma += vl; + ptr_beta += vl; + } + } + else + { + while (n > 0) + { + word_type vl = vsetvl_e16m4(n); + vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_store, vl), vl); + _p = vfmul_vf_f32m8(_p, a, vl); + _p = vfadd_vf_f32m8(_p, b, vl); + vse16_v_f16m4(ptr_store, vfncvt_f_f_w_f16m4(_p, vl), vl); + n -= vl; + ptr_store += vl; + } + } + } + return 0; +} + +static inline int layernorm_rvv_packn_fp16s_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine, const word_type vl) +{ + // mean and var + // f16m1 => f32m2 + vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); + vfloat32m2_t _sqsum = vfmv_v_f_f32m2(0.f, vl); + for (int i = 0; i < size; i++) + { + vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + vl * i, vl), vl); + _sum = vfadd_vv_f32m2(_p, _sum, vl); + } + vfloat32m2_t _mean = vfdiv_vf_f32m2(_sum, (float)size, vl); + for (int i = 0; i < size; i++) + { + vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + vl * i, vl), vl); + _p = vfsub_vv_f32m2(_p, _mean, vl); + _sqsum = vfmacc_vv_f32m2(_sqsum, _p, _p, vl); + } + vfloat32m2_t _var = vfdiv_vf_f32m2(_sqsum, (float)size, vl); + + // the var maybe minus due to accuracy + //float var = sqsum / size - mean * mean; + vfloat32m2_t _a = vfrdiv_vf_f32m2(vfsqrt_v_f32m2(vfadd_vf_f32m2(_var, eps, vl), vl), 1.f, vl); + // how about vfrsqrt7.v? + vfloat32m2_t _b = vfmul_vv_f32m2(vfsgnjn_vv_f32m2(_mean, _mean, vl), _a, vl); + if (affine) + { + for (int i = 0; i < size; i++) + { + const int offset = vl * i; + vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + offset, vl), vl); + _p = vfmadd_vv_f32m2(_p, _a, _b, vl); + _p = vfmul_vf_f32m2(_p, gamma_data[i], vl); + _p = vfadd_vf_f32m2(_p, beta_data[i], vl); + vse16_v_f16m1(ptr + offset, vfncvt_f_f_w_f16m1(_p, vl), vl); + } + } + else + { + for (int i = 0; i < size; i++) + { + const int offset = vl * i; + vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + offset, vl), vl); + _p = vfmadd_vv_f32m2(_p, _a, _b, vl); + vse16_v_f16m1(ptr + offset, vfncvt_f_f_w_f16m1(_p, vl), vl); + } + } + + return 0; +} + +int LayerNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const +{ + // x = (x - mean) / sqrt(var + eps) * gamma + beta + int elempack = bottom_top_blob.elempack; + int dims = bottom_top_blob.dims; + int w = bottom_top_blob.w; + + if (dims == 1) + { + __fp16* ptr = bottom_top_blob; + + return layernorm_rvv_pack1_fp16s_procedure(w * elempack, ptr, gamma_data, beta_data, eps, affine); + } + if (elempack == 1) + { + if (dims == 2) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + // assert affine_size == w + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + __fp16* ptr = bottom_top_blob.row<__fp16>(i); + layernorm_rvv_pack1_fp16s_procedure(w, ptr, gamma_data, beta_data, eps, affine); + } + } + + if (dims == 3) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int channels = bottom_top_blob.c; + int size = w * h; + if (affine_size == w) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + for (int i = 0; i < h; i++) + { + __fp16* ptr = bottom_top_blob.channel(q).row<__fp16>(i); + + layernorm_rvv_pack1_fp16s_procedure(w, ptr, gamma_data, beta_data, eps, affine); + } + } + } + else // if (affine_size == size) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + layernorm_rvv_pack1_fp16s_procedure(size, ptr, gamma_data, beta_data, eps, affine); + } + } + } + + return 0; + } + + const int packn = csrr_vlenb() / 2; // fp16 + if (elempack == packn) + { + const word_type vl = vsetvl_e16m1(packn); + if (dims == 2) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + // assert affine_size == w + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + __fp16* ptr = bottom_top_blob.row<__fp16>(i); + layernorm_rvv_packn_fp16s_procedure(w, ptr, gamma_data, beta_data, eps, affine, vl); + } + } + if (dims == 3) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int channels = bottom_top_blob.c; + int size = w * h; + + if (affine_size == w) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + for (int i = 0; i < h; i++) + { + __fp16* ptr = bottom_top_blob.channel(q).row<__fp16>(i); + + layernorm_rvv_packn_fp16s_procedure(w, ptr, gamma_data, beta_data, eps, affine, vl); + } + } + } + else // if (affine_size == size) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + layernorm_rvv_packn_fp16s_procedure(size, ptr, gamma_data, beta_data, eps, affine, vl); + } + } + } + } + + return 0; +} + + +#endif + } // namespace ncnn \ No newline at end of file diff --git a/src/layer/riscv/layernorm_riscv.h b/src/layer/riscv/layernorm_riscv.h index e23cbe8ff..5a883683e 100644 --- a/src/layer/riscv/layernorm_riscv.h +++ b/src/layer/riscv/layernorm_riscv.h @@ -24,6 +24,11 @@ public: LayerNorm_riscv(); virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; + +protected: +#if __riscv_vector && __riscv_zfh + int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; +#endif }; } // namespace ncnn From fabe56c0550697f56c81e6ad4be5fc50931332c6 Mon Sep 17 00:00:00 2001 From: Xavier Hsinyuan Date: Sat, 23 Jul 2022 20:01:21 +0800 Subject: [PATCH 09/15] RVV: LayerNorm, fp16, s/sa, with elempack support --- src/layer/riscv/layernorm_riscv.cpp | 251 +++++++++++++++++++++++++++- src/layer/riscv/layernorm_riscv.h | 1 + 2 files changed, 251 insertions(+), 1 deletion(-) diff --git a/src/layer/riscv/layernorm_riscv.cpp b/src/layer/riscv/layernorm_riscv.cpp index aaaed4328..8e7350241 100644 --- a/src/layer/riscv/layernorm_riscv.cpp +++ b/src/layer/riscv/layernorm_riscv.cpp @@ -179,7 +179,10 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co #if __riscv_zfh if (opt.use_fp16_storage && elembits == 16) { - return forward_inplace_fp16s(bottom_top_blob, opt); + if (opt.use_fp16_arithmetic) + return forward_inplace_fp16sa(bottom_top_blob, opt); + else + return forward_inplace_fp16s(bottom_top_blob, opt); } #endif // __riscv_zfh @@ -543,6 +546,252 @@ int LayerNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& o return 0; } +static inline int layernorm_rvv_pack1_fp16sa_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine) +{ + float sum = 0.f; + float sqsum = 0.f; + vfloat16m1_t _sum = vfmv_s_f_f16m1(vundefined_f16m1(), 0.f, vsetvlmax_e32m1()); + vfloat16m1_t _sqsum = vfmv_s_f_f16m1(vundefined_f16m1(), 0.f, vsetvlmax_e32m1()); + { + int n = size; + __fp16* ptr_sum = ptr; + while (n > 0) + { + word_type vl = vsetvl_e16m8(n); + vfloat16m8_t _p = vle16_v_f16m8(ptr_sum, vl); + _sum = vfredusum_vs_f16m8_f16m1(_sum, _p, /* scalar */ _sum, vl); + // _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); + ptr_sum += vl; + n -= vl; + } + } + sum = vfmv_f_s_f16m1_f16(_sum); + float mean = sum / size; + + { + int n = size; + __fp16* ptr_sqsum = ptr; + while (n > 0) + { + word_type vl = vsetvl_e16m8(n); + vfloat16m8_t _p = vle16_v_f16m8(ptr_sqsum, vl); + _p = vfsub_vf_f16m8(_p, mean, vl); + _sqsum = vfredusum_vs_f16m8_f16m1(_sqsum, vfmul_vv_f16m8(_p, _p, vl), /* scalar */ _sqsum, vl); + n -= vl; + ptr_sqsum += vl; + } + } + sqsum = vfmv_f_s_f16m1_f16(_sqsum); + float var = sqsum / size; + // the var maybe minus due to accuracy + //float var = sqsum / size - mean * mean; + float a = static_cast(1.f / (sqrt(var + eps))); + float b = -mean * a; + + { + int n = size; + __fp16* ptr_store = ptr; + const float* ptr_gamma = gamma_data; + const float* ptr_beta = beta_data; + if (affine) + { + while (n > 0) + { + word_type vl = vsetvl_e16m4(n); + vfloat16m4_t _p = vle16_v_f16m4(ptr_store, vl); + _p = vfmul_vf_f16m4(_p, a, vl); + vfloat16m4_t _gamma = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_gamma, vl), vl); + _p = vfadd_vf_f16m4(_p, b, vl); + vfloat16m4_t _beta = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_beta, vl), vl); + _p = vfmadd_vv_f16m4(_p, _gamma, _beta, vl); + vse16_v_f16m4(ptr_store, _p, vl); + + n -= vl; + ptr_store += vl; + ptr_gamma += vl; + ptr_beta += vl; + } + } + else + { + while (n > 0) + { + word_type vl = vsetvl_e16m4(n); + vfloat16m8_t _p = vle16_v_f16m8(ptr_store, vl); + _p = vfmul_vf_f16m8(_p, a, vl); + _p = vfadd_vf_f16m8(_p, b, vl); + vse16_v_f16m8(ptr_store, _p, vl); + n -= vl; + ptr_store += vl; + } + } + } + return 0; +} + +static inline int layernorm_rvv_packn_fp16sa_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine, const word_type vl) +{ + // mean and var + vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sqsum = vfmv_v_f_f16m1(0.f, vl); + for (int i = 0; i < size; i++) + { + vfloat16m1_t _p = vle16_v_f16m1(ptr + vl * i, vl); + _sum = vfadd_vv_f16m1(_p, _sum, vl); + // _sqsum = vfmadd_vv_f16m1(_p,_p,_sqsum,vl); + } + vfloat16m1_t _mean = vfdiv_vf_f16m1(_sum, size, vl); + for (int i = 0; i < size; i++) + { + vfloat16m1_t _p = vle16_v_f16m1(ptr + vl * i, vl); + _p = vfsub_vv_f16m1(_p, _mean, vl); + _sqsum = vfmacc_vv_f16m1(_sqsum, _p, _p, vl); + } + vfloat16m1_t _var = vfdiv_vf_f16m1(_sqsum, size, vl); + + // the var maybe minus due to accuracy + //float var = sqsum / size - mean * mean; + vfloat16m1_t _a = vfrdiv_vf_f16m1(vfsqrt_v_f16m1(vfadd_vf_f16m1(_var, eps, vl), vl), 1.f, vl); + // how about vfrsqrt7.v? + vfloat16m1_t _b = vfmul_vv_f16m1(vfsgnjn_vv_f16m1(_mean, _mean, vl), _a, vl); + if (affine) + { + for (int i = 0; i < size; i++) + { + const int offset = vl * i; + vfloat16m1_t _p = vle16_v_f16m1(ptr + offset, vl); + _p = vfmadd_vv_f16m1(_p, _a, _b, vl); + _p = vfmul_vf_f16m1(_p, gamma_data[i], vl); + _p = vfadd_vf_f16m1(_p, beta_data[i], vl); + vse16_v_f16m1(ptr + offset, _p, vl); + } + } + else + { + for (int i = 0; i < size; i++) + { + const int offset = vl * i; + vfloat16m1_t _p = vle16_v_f16m1(ptr + offset, vl); + _p = vfmadd_vv_f16m1(_p, _a, _b, vl); + vse16_v_f16m1(ptr + offset, _p, vl); + } + } + + return 0; +} + +int LayerNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const +{ + // x = (x - mean) / sqrt(var + eps) * gamma + beta + int elempack = bottom_top_blob.elempack; + int dims = bottom_top_blob.dims; + int w = bottom_top_blob.w; + + if (dims == 1) + { + __fp16* ptr = bottom_top_blob; + + return layernorm_rvv_pack1_fp16sa_procedure(w * elempack, ptr, gamma_data, beta_data, eps, affine); + } + if (elempack == 1) + { + if (dims == 2) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + // assert affine_size == w + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + __fp16* ptr = bottom_top_blob.row<__fp16>(i); + layernorm_rvv_pack1_fp16sa_procedure(w, ptr, gamma_data, beta_data, eps, affine); + } + } + + if (dims == 3) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int channels = bottom_top_blob.c; + int size = w * h; + if (affine_size == w) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + for (int i = 0; i < h; i++) + { + __fp16* ptr = bottom_top_blob.channel(q).row<__fp16>(i); + + layernorm_rvv_pack1_fp16sa_procedure(w, ptr, gamma_data, beta_data, eps, affine); + } + } + } + else // if (affine_size == size) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + layernorm_rvv_pack1_fp16sa_procedure(size, ptr, gamma_data, beta_data, eps, affine); + } + } + } + + return 0; + } + + const int packn = csrr_vlenb() / 2; // fp16 + if (elempack == packn) + { + const word_type vl = vsetvl_e16m1(packn); + if (dims == 2) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + // assert affine_size == w + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + __fp16* ptr = bottom_top_blob.row<__fp16>(i); + layernorm_rvv_packn_fp16sa_procedure(w, ptr, gamma_data, beta_data, eps, affine, vl); + } + } + if (dims == 3) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int channels = bottom_top_blob.c; + int size = w * h; + + if (affine_size == w) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + for (int i = 0; i < h; i++) + { + __fp16* ptr = bottom_top_blob.channel(q).row<__fp16>(i); + + layernorm_rvv_packn_fp16sa_procedure(w, ptr, gamma_data, beta_data, eps, affine, vl); + } + } + } + else // if (affine_size == size) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + layernorm_rvv_packn_fp16sa_procedure(size, ptr, gamma_data, beta_data, eps, affine, vl); + } + } + } + } + + return 0; +} #endif diff --git a/src/layer/riscv/layernorm_riscv.h b/src/layer/riscv/layernorm_riscv.h index 5a883683e..6d2a1748f 100644 --- a/src/layer/riscv/layernorm_riscv.h +++ b/src/layer/riscv/layernorm_riscv.h @@ -28,6 +28,7 @@ public: protected: #if __riscv_vector && __riscv_zfh int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; + int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; #endif }; From 9b2da3649292a6550cce47113eb9a58baa60dc78 Mon Sep 17 00:00:00 2001 From: Xavier Hsinyuan Date: Sat, 23 Jul 2022 21:27:28 +0800 Subject: [PATCH 10/15] Refactor: RVV LayerNorm, fp16 * move fp16 procedure into 'layernorm_rvv_fp16.h' --- src/layer/riscv/layernorm_riscv.cpp | 275 +------------------------- src/layer/riscv/layernorm_rvv_fp16.h | 284 +++++++++++++++++++++++++++ 2 files changed, 289 insertions(+), 270 deletions(-) create mode 100644 src/layer/riscv/layernorm_rvv_fp16.h diff --git a/src/layer/riscv/layernorm_riscv.cpp b/src/layer/riscv/layernorm_riscv.cpp index 8e7350241..dd56e0a9f 100644 --- a/src/layer/riscv/layernorm_riscv.cpp +++ b/src/layer/riscv/layernorm_riscv.cpp @@ -25,6 +25,11 @@ #include "riscv_usability.h" namespace ncnn { + +#if __riscv_vector && __riscv_zfh +#include "layernorm_rvv_fp16.h" +#endif + LayerNorm_riscv::LayerNorm_riscv() { #if __riscv_vector @@ -298,141 +303,6 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co } #if __riscv_vector && __riscv_zfh - -static inline int layernorm_rvv_pack1_fp16s_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine) -{ - float sum = 0.f; - float sqsum = 0.f; - vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); - vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); - { - int n = size; - __fp16* ptr_sum = ptr; - while (n > 0) - { - word_type vl = vsetvl_e16m4(n); - vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_sum, vl), vl); - _sum = vfredusum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl); - // _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); - ptr_sum += vl; - n -= vl; - } - } - sum = vfmv_f_s_f32m1_f32(_sum); - float mean = sum / size; - - { - int n = size; - __fp16* ptr_sqsum = ptr; - while (n > 0) - { - word_type vl = vsetvl_e16m4(n); - vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_sqsum, vl), vl); - _p = vfsub_vf_f32m8(_p, mean, vl); - _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); - n -= vl; - ptr_sqsum += vl; - } - } - sqsum = vfmv_f_s_f32m1_f32(_sqsum); - float var = sqsum / size; - // the var maybe minus due to accuracy - //float var = sqsum / size - mean * mean; - float a = static_cast(1.f / (sqrt(var + eps))); - float b = -mean * a; - - { - int n = size; - __fp16* ptr_store = ptr; - const float* ptr_gamma = gamma_data; - const float* ptr_beta = beta_data; - if (affine) - { - while (n > 0) - { - word_type vl = vsetvl_e16m4(n); - vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_store, vl), vl); - _p = vfmul_vf_f32m8(_p, a, vl); - vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl); - _p = vfadd_vf_f32m8(_p, b, vl); - vfloat32m8_t _beta = vle32_v_f32m8(ptr_beta, vl); - _p = vfmadd_vv_f32m8(_p, _gamma, _beta, vl); - vse16_v_f16m4(ptr_store, vfncvt_f_f_w_f16m4(_p, vl), vl); - - n -= vl; - ptr_store += vl; - ptr_gamma += vl; - ptr_beta += vl; - } - } - else - { - while (n > 0) - { - word_type vl = vsetvl_e16m4(n); - vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_store, vl), vl); - _p = vfmul_vf_f32m8(_p, a, vl); - _p = vfadd_vf_f32m8(_p, b, vl); - vse16_v_f16m4(ptr_store, vfncvt_f_f_w_f16m4(_p, vl), vl); - n -= vl; - ptr_store += vl; - } - } - } - return 0; -} - -static inline int layernorm_rvv_packn_fp16s_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine, const word_type vl) -{ - // mean and var - // f16m1 => f32m2 - vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); - vfloat32m2_t _sqsum = vfmv_v_f_f32m2(0.f, vl); - for (int i = 0; i < size; i++) - { - vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + vl * i, vl), vl); - _sum = vfadd_vv_f32m2(_p, _sum, vl); - } - vfloat32m2_t _mean = vfdiv_vf_f32m2(_sum, (float)size, vl); - for (int i = 0; i < size; i++) - { - vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + vl * i, vl), vl); - _p = vfsub_vv_f32m2(_p, _mean, vl); - _sqsum = vfmacc_vv_f32m2(_sqsum, _p, _p, vl); - } - vfloat32m2_t _var = vfdiv_vf_f32m2(_sqsum, (float)size, vl); - - // the var maybe minus due to accuracy - //float var = sqsum / size - mean * mean; - vfloat32m2_t _a = vfrdiv_vf_f32m2(vfsqrt_v_f32m2(vfadd_vf_f32m2(_var, eps, vl), vl), 1.f, vl); - // how about vfrsqrt7.v? - vfloat32m2_t _b = vfmul_vv_f32m2(vfsgnjn_vv_f32m2(_mean, _mean, vl), _a, vl); - if (affine) - { - for (int i = 0; i < size; i++) - { - const int offset = vl * i; - vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + offset, vl), vl); - _p = vfmadd_vv_f32m2(_p, _a, _b, vl); - _p = vfmul_vf_f32m2(_p, gamma_data[i], vl); - _p = vfadd_vf_f32m2(_p, beta_data[i], vl); - vse16_v_f16m1(ptr + offset, vfncvt_f_f_w_f16m1(_p, vl), vl); - } - } - else - { - for (int i = 0; i < size; i++) - { - const int offset = vl * i; - vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + offset, vl), vl); - _p = vfmadd_vv_f32m2(_p, _a, _b, vl); - vse16_v_f16m1(ptr + offset, vfncvt_f_f_w_f16m1(_p, vl), vl); - } - } - - return 0; -} - int LayerNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const { // x = (x - mean) / sqrt(var + eps) * gamma + beta @@ -546,140 +416,6 @@ int LayerNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& o return 0; } -static inline int layernorm_rvv_pack1_fp16sa_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine) -{ - float sum = 0.f; - float sqsum = 0.f; - vfloat16m1_t _sum = vfmv_s_f_f16m1(vundefined_f16m1(), 0.f, vsetvlmax_e32m1()); - vfloat16m1_t _sqsum = vfmv_s_f_f16m1(vundefined_f16m1(), 0.f, vsetvlmax_e32m1()); - { - int n = size; - __fp16* ptr_sum = ptr; - while (n > 0) - { - word_type vl = vsetvl_e16m8(n); - vfloat16m8_t _p = vle16_v_f16m8(ptr_sum, vl); - _sum = vfredusum_vs_f16m8_f16m1(_sum, _p, /* scalar */ _sum, vl); - // _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); - ptr_sum += vl; - n -= vl; - } - } - sum = vfmv_f_s_f16m1_f16(_sum); - float mean = sum / size; - - { - int n = size; - __fp16* ptr_sqsum = ptr; - while (n > 0) - { - word_type vl = vsetvl_e16m8(n); - vfloat16m8_t _p = vle16_v_f16m8(ptr_sqsum, vl); - _p = vfsub_vf_f16m8(_p, mean, vl); - _sqsum = vfredusum_vs_f16m8_f16m1(_sqsum, vfmul_vv_f16m8(_p, _p, vl), /* scalar */ _sqsum, vl); - n -= vl; - ptr_sqsum += vl; - } - } - sqsum = vfmv_f_s_f16m1_f16(_sqsum); - float var = sqsum / size; - // the var maybe minus due to accuracy - //float var = sqsum / size - mean * mean; - float a = static_cast(1.f / (sqrt(var + eps))); - float b = -mean * a; - - { - int n = size; - __fp16* ptr_store = ptr; - const float* ptr_gamma = gamma_data; - const float* ptr_beta = beta_data; - if (affine) - { - while (n > 0) - { - word_type vl = vsetvl_e16m4(n); - vfloat16m4_t _p = vle16_v_f16m4(ptr_store, vl); - _p = vfmul_vf_f16m4(_p, a, vl); - vfloat16m4_t _gamma = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_gamma, vl), vl); - _p = vfadd_vf_f16m4(_p, b, vl); - vfloat16m4_t _beta = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_beta, vl), vl); - _p = vfmadd_vv_f16m4(_p, _gamma, _beta, vl); - vse16_v_f16m4(ptr_store, _p, vl); - - n -= vl; - ptr_store += vl; - ptr_gamma += vl; - ptr_beta += vl; - } - } - else - { - while (n > 0) - { - word_type vl = vsetvl_e16m4(n); - vfloat16m8_t _p = vle16_v_f16m8(ptr_store, vl); - _p = vfmul_vf_f16m8(_p, a, vl); - _p = vfadd_vf_f16m8(_p, b, vl); - vse16_v_f16m8(ptr_store, _p, vl); - n -= vl; - ptr_store += vl; - } - } - } - return 0; -} - -static inline int layernorm_rvv_packn_fp16sa_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine, const word_type vl) -{ - // mean and var - vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sqsum = vfmv_v_f_f16m1(0.f, vl); - for (int i = 0; i < size; i++) - { - vfloat16m1_t _p = vle16_v_f16m1(ptr + vl * i, vl); - _sum = vfadd_vv_f16m1(_p, _sum, vl); - // _sqsum = vfmadd_vv_f16m1(_p,_p,_sqsum,vl); - } - vfloat16m1_t _mean = vfdiv_vf_f16m1(_sum, size, vl); - for (int i = 0; i < size; i++) - { - vfloat16m1_t _p = vle16_v_f16m1(ptr + vl * i, vl); - _p = vfsub_vv_f16m1(_p, _mean, vl); - _sqsum = vfmacc_vv_f16m1(_sqsum, _p, _p, vl); - } - vfloat16m1_t _var = vfdiv_vf_f16m1(_sqsum, size, vl); - - // the var maybe minus due to accuracy - //float var = sqsum / size - mean * mean; - vfloat16m1_t _a = vfrdiv_vf_f16m1(vfsqrt_v_f16m1(vfadd_vf_f16m1(_var, eps, vl), vl), 1.f, vl); - // how about vfrsqrt7.v? - vfloat16m1_t _b = vfmul_vv_f16m1(vfsgnjn_vv_f16m1(_mean, _mean, vl), _a, vl); - if (affine) - { - for (int i = 0; i < size; i++) - { - const int offset = vl * i; - vfloat16m1_t _p = vle16_v_f16m1(ptr + offset, vl); - _p = vfmadd_vv_f16m1(_p, _a, _b, vl); - _p = vfmul_vf_f16m1(_p, gamma_data[i], vl); - _p = vfadd_vf_f16m1(_p, beta_data[i], vl); - vse16_v_f16m1(ptr + offset, _p, vl); - } - } - else - { - for (int i = 0; i < size; i++) - { - const int offset = vl * i; - vfloat16m1_t _p = vle16_v_f16m1(ptr + offset, vl); - _p = vfmadd_vv_f16m1(_p, _a, _b, vl); - vse16_v_f16m1(ptr + offset, _p, vl); - } - } - - return 0; -} - int LayerNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const { // x = (x - mean) / sqrt(var + eps) * gamma + beta @@ -792,7 +528,6 @@ int LayerNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& return 0; } - #endif } // namespace ncnn \ No newline at end of file diff --git a/src/layer/riscv/layernorm_rvv_fp16.h b/src/layer/riscv/layernorm_rvv_fp16.h new file mode 100644 index 000000000..313b51cfe --- /dev/null +++ b/src/layer/riscv/layernorm_rvv_fp16.h @@ -0,0 +1,284 @@ +// Xavier Hsinyuan is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 Xavier Hsinyuan . All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +// fp16s +static inline int layernorm_rvv_pack1_fp16s_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine) +{ + float sum = 0.f; + float sqsum = 0.f; + vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); + vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); + { + int n = size; + __fp16* ptr_sum = ptr; + while (n > 0) + { + word_type vl = vsetvl_e16m4(n); + vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_sum, vl), vl); + _sum = vfredusum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl); + // _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); + ptr_sum += vl; + n -= vl; + } + } + sum = vfmv_f_s_f32m1_f32(_sum); + float mean = sum / size; + + { + int n = size; + __fp16* ptr_sqsum = ptr; + while (n > 0) + { + word_type vl = vsetvl_e16m4(n); + vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_sqsum, vl), vl); + _p = vfsub_vf_f32m8(_p, mean, vl); + _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); + n -= vl; + ptr_sqsum += vl; + } + } + sqsum = vfmv_f_s_f32m1_f32(_sqsum); + float var = sqsum / size; + // the var maybe minus due to accuracy + //float var = sqsum / size - mean * mean; + float a = static_cast(1.f / (sqrt(var + eps))); + float b = -mean * a; + + { + int n = size; + __fp16* ptr_store = ptr; + const float* ptr_gamma = gamma_data; + const float* ptr_beta = beta_data; + if (affine) + { + while (n > 0) + { + word_type vl = vsetvl_e16m4(n); + vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_store, vl), vl); + _p = vfmul_vf_f32m8(_p, a, vl); + vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl); + _p = vfadd_vf_f32m8(_p, b, vl); + vfloat32m8_t _beta = vle32_v_f32m8(ptr_beta, vl); + _p = vfmadd_vv_f32m8(_p, _gamma, _beta, vl); + vse16_v_f16m4(ptr_store, vfncvt_f_f_w_f16m4(_p, vl), vl); + + n -= vl; + ptr_store += vl; + ptr_gamma += vl; + ptr_beta += vl; + } + } + else + { + while (n > 0) + { + word_type vl = vsetvl_e16m4(n); + vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_store, vl), vl); + _p = vfmul_vf_f32m8(_p, a, vl); + _p = vfadd_vf_f32m8(_p, b, vl); + vse16_v_f16m4(ptr_store, vfncvt_f_f_w_f16m4(_p, vl), vl); + n -= vl; + ptr_store += vl; + } + } + } + return 0; +} + +static inline int layernorm_rvv_packn_fp16s_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine, const word_type vl) +{ + // mean and var + // f16m1 => f32m2 + vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); + vfloat32m2_t _sqsum = vfmv_v_f_f32m2(0.f, vl); + for (int i = 0; i < size; i++) + { + vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + vl * i, vl), vl); + _sum = vfadd_vv_f32m2(_p, _sum, vl); + } + vfloat32m2_t _mean = vfdiv_vf_f32m2(_sum, (float)size, vl); + for (int i = 0; i < size; i++) + { + vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + vl * i, vl), vl); + _p = vfsub_vv_f32m2(_p, _mean, vl); + _sqsum = vfmacc_vv_f32m2(_sqsum, _p, _p, vl); + } + vfloat32m2_t _var = vfdiv_vf_f32m2(_sqsum, (float)size, vl); + + // the var maybe minus due to accuracy + //float var = sqsum / size - mean * mean; + vfloat32m2_t _a = vfrdiv_vf_f32m2(vfsqrt_v_f32m2(vfadd_vf_f32m2(_var, eps, vl), vl), 1.f, vl); + // how about vfrsqrt7.v? + vfloat32m2_t _b = vfmul_vv_f32m2(vfsgnjn_vv_f32m2(_mean, _mean, vl), _a, vl); + if (affine) + { + for (int i = 0; i < size; i++) + { + const int offset = vl * i; + vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + offset, vl), vl); + _p = vfmadd_vv_f32m2(_p, _a, _b, vl); + _p = vfmul_vf_f32m2(_p, gamma_data[i], vl); + _p = vfadd_vf_f32m2(_p, beta_data[i], vl); + vse16_v_f16m1(ptr + offset, vfncvt_f_f_w_f16m1(_p, vl), vl); + } + } + else + { + for (int i = 0; i < size; i++) + { + const int offset = vl * i; + vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + offset, vl), vl); + _p = vfmadd_vv_f32m2(_p, _a, _b, vl); + vse16_v_f16m1(ptr + offset, vfncvt_f_f_w_f16m1(_p, vl), vl); + } + } + + return 0; +} + +// fp16sa + +static inline int layernorm_rvv_pack1_fp16sa_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine) +{ + float sum = 0.f; + float sqsum = 0.f; + vfloat16m1_t _sum = vfmv_s_f_f16m1(vundefined_f16m1(), 0.f, vsetvlmax_e32m1()); + vfloat16m1_t _sqsum = vfmv_s_f_f16m1(vundefined_f16m1(), 0.f, vsetvlmax_e32m1()); + { + int n = size; + __fp16* ptr_sum = ptr; + while (n > 0) + { + word_type vl = vsetvl_e16m8(n); + vfloat16m8_t _p = vle16_v_f16m8(ptr_sum, vl); + _sum = vfredusum_vs_f16m8_f16m1(_sum, _p, /* scalar */ _sum, vl); + // _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); + ptr_sum += vl; + n -= vl; + } + } + sum = vfmv_f_s_f16m1_f16(_sum); + float mean = sum / size; + + { + int n = size; + __fp16* ptr_sqsum = ptr; + while (n > 0) + { + word_type vl = vsetvl_e16m8(n); + vfloat16m8_t _p = vle16_v_f16m8(ptr_sqsum, vl); + _p = vfsub_vf_f16m8(_p, mean, vl); + _sqsum = vfredusum_vs_f16m8_f16m1(_sqsum, vfmul_vv_f16m8(_p, _p, vl), /* scalar */ _sqsum, vl); + n -= vl; + ptr_sqsum += vl; + } + } + sqsum = vfmv_f_s_f16m1_f16(_sqsum); + float var = sqsum / size; + // the var maybe minus due to accuracy + //float var = sqsum / size - mean * mean; + float a = static_cast(1.f / (sqrt(var + eps))); + float b = -mean * a; + + { + int n = size; + __fp16* ptr_store = ptr; + const float* ptr_gamma = gamma_data; + const float* ptr_beta = beta_data; + if (affine) + { + while (n > 0) + { + word_type vl = vsetvl_e16m4(n); + vfloat16m4_t _p = vle16_v_f16m4(ptr_store, vl); + _p = vfmul_vf_f16m4(_p, a, vl); + vfloat16m4_t _gamma = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_gamma, vl), vl); + _p = vfadd_vf_f16m4(_p, b, vl); + vfloat16m4_t _beta = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_beta, vl), vl); + _p = vfmadd_vv_f16m4(_p, _gamma, _beta, vl); + vse16_v_f16m4(ptr_store, _p, vl); + + n -= vl; + ptr_store += vl; + ptr_gamma += vl; + ptr_beta += vl; + } + } + else + { + while (n > 0) + { + word_type vl = vsetvl_e16m4(n); + vfloat16m8_t _p = vle16_v_f16m8(ptr_store, vl); + _p = vfmul_vf_f16m8(_p, a, vl); + _p = vfadd_vf_f16m8(_p, b, vl); + vse16_v_f16m8(ptr_store, _p, vl); + n -= vl; + ptr_store += vl; + } + } + } + return 0; +} + +static inline int layernorm_rvv_packn_fp16sa_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine, const word_type vl) +{ + // mean and var + vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sqsum = vfmv_v_f_f16m1(0.f, vl); + for (int i = 0; i < size; i++) + { + vfloat16m1_t _p = vle16_v_f16m1(ptr + vl * i, vl); + _sum = vfadd_vv_f16m1(_p, _sum, vl); + // _sqsum = vfmadd_vv_f16m1(_p,_p,_sqsum,vl); + } + vfloat16m1_t _mean = vfdiv_vf_f16m1(_sum, size, vl); + for (int i = 0; i < size; i++) + { + vfloat16m1_t _p = vle16_v_f16m1(ptr + vl * i, vl); + _p = vfsub_vv_f16m1(_p, _mean, vl); + _sqsum = vfmacc_vv_f16m1(_sqsum, _p, _p, vl); + } + vfloat16m1_t _var = vfdiv_vf_f16m1(_sqsum, size, vl); + + // the var maybe minus due to accuracy + //float var = sqsum / size - mean * mean; + vfloat16m1_t _a = vfrdiv_vf_f16m1(vfsqrt_v_f16m1(vfadd_vf_f16m1(_var, eps, vl), vl), 1.f, vl); + // how about vfrsqrt7.v? + vfloat16m1_t _b = vfmul_vv_f16m1(vfsgnjn_vv_f16m1(_mean, _mean, vl), _a, vl); + if (affine) + { + for (int i = 0; i < size; i++) + { + const int offset = vl * i; + vfloat16m1_t _p = vle16_v_f16m1(ptr + offset, vl); + _p = vfmadd_vv_f16m1(_p, _a, _b, vl); + _p = vfmul_vf_f16m1(_p, gamma_data[i], vl); + _p = vfadd_vf_f16m1(_p, beta_data[i], vl); + vse16_v_f16m1(ptr + offset, _p, vl); + } + } + else + { + for (int i = 0; i < size; i++) + { + const int offset = vl * i; + vfloat16m1_t _p = vle16_v_f16m1(ptr + offset, vl); + _p = vfmadd_vv_f16m1(_p, _a, _b, vl); + vse16_v_f16m1(ptr + offset, _p, vl); + } + } + + return 0; +} From 3da3bfcb9ebe2b8c70f0c0198c0abfa9616a77bb Mon Sep 17 00:00:00 2001 From: Xavier Hsinyuan Date: Sat, 23 Jul 2022 23:16:15 +0800 Subject: [PATCH 11/15] Fix: RVV LayerNorm, fp16sa, pack1 --- src/layer/riscv/layernorm_rvv_fp16.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/layer/riscv/layernorm_rvv_fp16.h b/src/layer/riscv/layernorm_rvv_fp16.h index 313b51cfe..f17c7308a 100644 --- a/src/layer/riscv/layernorm_rvv_fp16.h +++ b/src/layer/riscv/layernorm_rvv_fp16.h @@ -151,8 +151,8 @@ static inline int layernorm_rvv_packn_fp16s_procedure(int size, __fp16* ptr, con static inline int layernorm_rvv_pack1_fp16sa_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine) { - float sum = 0.f; - float sqsum = 0.f; + __fp16 sum = 0.f; + __fp16 sqsum = 0.f; vfloat16m1_t _sum = vfmv_s_f_f16m1(vundefined_f16m1(), 0.f, vsetvlmax_e32m1()); vfloat16m1_t _sqsum = vfmv_s_f_f16m1(vundefined_f16m1(), 0.f, vsetvlmax_e32m1()); { @@ -169,7 +169,7 @@ static inline int layernorm_rvv_pack1_fp16sa_procedure(int size, __fp16* ptr, co } } sum = vfmv_f_s_f16m1_f16(_sum); - float mean = sum / size; + __fp16 mean = sum / size; { int n = size; @@ -185,11 +185,11 @@ static inline int layernorm_rvv_pack1_fp16sa_procedure(int size, __fp16* ptr, co } } sqsum = vfmv_f_s_f16m1_f16(_sqsum); - float var = sqsum / size; + __fp16 var = sqsum / size; // the var maybe minus due to accuracy //float var = sqsum / size - mean * mean; - float a = static_cast(1.f / (sqrt(var + eps))); - float b = -mean * a; + __fp16 a = static_cast<__fp16>(1.f / (sqrt(var + eps))); + __fp16 b = static_cast<__fp16>(-mean * a); { int n = size; @@ -219,7 +219,7 @@ static inline int layernorm_rvv_pack1_fp16sa_procedure(int size, __fp16* ptr, co { while (n > 0) { - word_type vl = vsetvl_e16m4(n); + word_type vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr_store, vl); _p = vfmul_vf_f16m8(_p, a, vl); _p = vfadd_vf_f16m8(_p, b, vl); From e3cb69ee8d58023bd3e114dcd8e114fa62296698 Mon Sep 17 00:00:00 2001 From: Xavier Hsinyuan Date: Mon, 25 Jul 2022 00:15:56 +0800 Subject: [PATCH 12/15] Apply requested changes * Embed the simd-free scalar code implementation --- src/layer/riscv/layernorm_riscv.cpp | 66 ++++++++++++++++++++++++++--- 1 file changed, 61 insertions(+), 5 deletions(-) diff --git a/src/layer/riscv/layernorm_riscv.cpp b/src/layer/riscv/layernorm_riscv.cpp index dd56e0a9f..38db7263c 100644 --- a/src/layer/riscv/layernorm_riscv.cpp +++ b/src/layer/riscv/layernorm_riscv.cpp @@ -174,6 +174,47 @@ static inline int layernorm_rvv_packn_procedure(int size, float* ptr, const floa return 0; } +#else +static inline int layernorm_scalar_procedure(int size, float* ptr, const float* gamma_data, const float* beta_data, float eps, int affine) +{ + // mean and var + float sum = 0.f; + float sqsum = 0.f; + for (int i = 0; i < size; i++) + { + sum += ptr[i]; + //sqsum += ptr[i] * ptr[i]; + } + float mean = sum / size; + float tmp = 0.f; + for (int i = 0; i < size; i++) + { + tmp = ptr[i] - mean; + sqsum += tmp * tmp; + } + float var = sqsum / size; + // the var maybe minus due to accuracy + //float var = sqsum / size - mean * mean; + + float a = static_cast(1.f / (sqrt(var + eps))); + float b = -mean * a; + + if (affine) + { + for (int i = 0; i < size; i++) + { + ptr[i] = (ptr[i] * a + b) * gamma_data[i] + beta_data[i]; + } + } + else + { + for (int i = 0; i < size; i++) + { + ptr[i] = ptr[i] * a + b; + } + } + return 0; +} #endif // __riscv_vector int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const @@ -193,16 +234,22 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co int elempack = bottom_top_blob.elempack; const int packn = csrr_vlenb() / 4; +#endif // __riscv_vector int dims = bottom_top_blob.dims; int w = bottom_top_blob.w; if (dims == 1) { float* ptr = bottom_top_blob; - +#if __riscv_vector return layernorm_rvv_pack1_procedure(w * elempack, ptr, gamma_data, beta_data, eps, affine); +#else + return layernorm_scalar_procedure(w, ptr, gamma_data, beta_data, eps, affine); +#endif // __riscv_vector } +#if __riscv_vector if (elempack == 1) +#endif { if (dims == 2) { @@ -213,7 +260,11 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co for (int i = 0; i < h; i++) { float* ptr = bottom_top_blob.row(i); +#if __riscv_vector layernorm_rvv_pack1_procedure(w, ptr, gamma_data, beta_data, eps, affine); +#else + layernorm_scalar_procedure(w, ptr, gamma_data, beta_data, eps, affine); +#endif // __riscv_vector } } @@ -231,8 +282,11 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co for (int i = 0; i < h; i++) { float* ptr = bottom_top_blob.channel(q).row(i); - +#if __riscv_vector layernorm_rvv_pack1_procedure(w, ptr, gamma_data, beta_data, eps, affine); +#else + layernorm_scalar_procedure(w, ptr, gamma_data, beta_data, eps, affine); +#endif // __riscv_vector } } } @@ -242,12 +296,17 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co for (int q = 0; q < channels; q++) { float* ptr = bottom_top_blob.channel(q); +#if __riscv_vector layernorm_rvv_pack1_procedure(size, ptr, gamma_data, beta_data, eps, affine); +#else + layernorm_scalar_procedure(size, ptr, gamma_data, beta_data, eps, affine); +#endif // __riscv_vector } } } } +#if __riscv_vector if (elempack == packn) { const word_type vl = vsetvl_e32m1(packn); @@ -295,9 +354,6 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co } } } - -#else // __riscv_vector - return LayerNorm::forward_inplace(bottom_top_blob, opt); #endif // __riscv_vector return 0; } From 5e50c8eff8f4007e7b2be4382e89c6e9ccef6f17 Mon Sep 17 00:00:00 2001 From: Xavier Hsinyuan Date: Fri, 29 Jul 2022 12:14:28 +0800 Subject: [PATCH 13/15] RVV: drop rvv-071 support (#4094) --- src/layer/riscv/layernorm_riscv.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/layer/riscv/layernorm_riscv.cpp b/src/layer/riscv/layernorm_riscv.cpp index 38db7263c..b1f3a2f80 100644 --- a/src/layer/riscv/layernorm_riscv.cpp +++ b/src/layer/riscv/layernorm_riscv.cpp @@ -15,11 +15,7 @@ #include #if __riscv_vector -#ifdef RVV_SPEC_0_7 -#include "riscv_v_071_fix.h" -#else #include -#endif #endif // __riscv_vector #include "riscv_usability.h" From 6948e834b6a01dbad492cd33d0a9db76ea758a18 Mon Sep 17 00:00:00 2001 From: Xavier Hsinyuan Date: Sat, 1 Oct 2022 23:11:39 +0800 Subject: [PATCH 15/15] RVV: replace `word_type` to `size_t` (#4100, #4118) --- src/layer/riscv/layernorm_riscv.cpp | 16 ++++++++-------- src/layer/riscv/layernorm_rvv_fp16.h | 20 ++++++++++---------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/layer/riscv/layernorm_riscv.cpp b/src/layer/riscv/layernorm_riscv.cpp index b1f3a2f80..de942b19e 100644 --- a/src/layer/riscv/layernorm_riscv.cpp +++ b/src/layer/riscv/layernorm_riscv.cpp @@ -48,7 +48,7 @@ static inline int layernorm_rvv_pack1_procedure(int size, float* ptr, const floa float* ptr_sum = ptr; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr_sum, vl); _sum = vfredusum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl); // _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); @@ -64,7 +64,7 @@ static inline int layernorm_rvv_pack1_procedure(int size, float* ptr, const floa float* ptr_sqsum = ptr; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr_sqsum, vl); _p = vfsub_vf_f32m8(_p, mean, vl); _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); @@ -88,7 +88,7 @@ static inline int layernorm_rvv_pack1_procedure(int size, float* ptr, const floa { while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl); _p = vfmul_vf_f32m8(_p, a, vl); vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl); @@ -107,7 +107,7 @@ static inline int layernorm_rvv_pack1_procedure(int size, float* ptr, const floa { while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl); _p = vfmul_vf_f32m8(_p, a, vl); _p = vfadd_vf_f32m8(_p, b, vl); @@ -120,7 +120,7 @@ static inline int layernorm_rvv_pack1_procedure(int size, float* ptr, const floa return 0; } -static inline int layernorm_rvv_packn_procedure(int size, float* ptr, const float* gamma_data, const float* beta_data, float eps, int affine, const word_type vl) +static inline int layernorm_rvv_packn_procedure(int size, float* ptr, const float* gamma_data, const float* beta_data, float eps, int affine, const size_t vl) { // mean and var vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); @@ -305,7 +305,7 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co #if __riscv_vector if (elempack == packn) { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); if (dims == 2) { int w = bottom_top_blob.w; @@ -419,7 +419,7 @@ int LayerNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& o const int packn = csrr_vlenb() / 2; // fp16 if (elempack == packn) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); if (dims == 2) { int w = bottom_top_blob.w; @@ -532,7 +532,7 @@ int LayerNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& const int packn = csrr_vlenb() / 2; // fp16 if (elempack == packn) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); if (dims == 2) { int w = bottom_top_blob.w; diff --git a/src/layer/riscv/layernorm_rvv_fp16.h b/src/layer/riscv/layernorm_rvv_fp16.h index f17c7308a..cf22025ac 100644 --- a/src/layer/riscv/layernorm_rvv_fp16.h +++ b/src/layer/riscv/layernorm_rvv_fp16.h @@ -24,7 +24,7 @@ static inline int layernorm_rvv_pack1_fp16s_procedure(int size, __fp16* ptr, con __fp16* ptr_sum = ptr; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_sum, vl), vl); _sum = vfredusum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl); // _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); @@ -40,7 +40,7 @@ static inline int layernorm_rvv_pack1_fp16s_procedure(int size, __fp16* ptr, con __fp16* ptr_sqsum = ptr; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_sqsum, vl), vl); _p = vfsub_vf_f32m8(_p, mean, vl); _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); @@ -64,7 +64,7 @@ static inline int layernorm_rvv_pack1_fp16s_procedure(int size, __fp16* ptr, con { while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_store, vl), vl); _p = vfmul_vf_f32m8(_p, a, vl); vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl); @@ -83,7 +83,7 @@ static inline int layernorm_rvv_pack1_fp16s_procedure(int size, __fp16* ptr, con { while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_store, vl), vl); _p = vfmul_vf_f32m8(_p, a, vl); _p = vfadd_vf_f32m8(_p, b, vl); @@ -96,7 +96,7 @@ static inline int layernorm_rvv_pack1_fp16s_procedure(int size, __fp16* ptr, con return 0; } -static inline int layernorm_rvv_packn_fp16s_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine, const word_type vl) +static inline int layernorm_rvv_packn_fp16s_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine, const size_t vl) { // mean and var // f16m1 => f32m2 @@ -160,7 +160,7 @@ static inline int layernorm_rvv_pack1_fp16sa_procedure(int size, __fp16* ptr, co __fp16* ptr_sum = ptr; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr_sum, vl); _sum = vfredusum_vs_f16m8_f16m1(_sum, _p, /* scalar */ _sum, vl); // _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); @@ -176,7 +176,7 @@ static inline int layernorm_rvv_pack1_fp16sa_procedure(int size, __fp16* ptr, co __fp16* ptr_sqsum = ptr; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr_sqsum, vl); _p = vfsub_vf_f16m8(_p, mean, vl); _sqsum = vfredusum_vs_f16m8_f16m1(_sqsum, vfmul_vv_f16m8(_p, _p, vl), /* scalar */ _sqsum, vl); @@ -200,7 +200,7 @@ static inline int layernorm_rvv_pack1_fp16sa_procedure(int size, __fp16* ptr, co { while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat16m4_t _p = vle16_v_f16m4(ptr_store, vl); _p = vfmul_vf_f16m4(_p, a, vl); vfloat16m4_t _gamma = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_gamma, vl), vl); @@ -219,7 +219,7 @@ static inline int layernorm_rvv_pack1_fp16sa_procedure(int size, __fp16* ptr, co { while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr_store, vl); _p = vfmul_vf_f16m8(_p, a, vl); _p = vfadd_vf_f16m8(_p, b, vl); @@ -232,7 +232,7 @@ static inline int layernorm_rvv_pack1_fp16sa_procedure(int size, __fp16* ptr, co return 0; } -static inline int layernorm_rvv_packn_fp16sa_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine, const word_type vl) +static inline int layernorm_rvv_packn_fp16sa_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine, const size_t vl) { // mean and var vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl);