From 58bbdec7eb0580f03e0e50b675a2b8b72bee0e0e Mon Sep 17 00:00:00 2001
From: Xavier Hsinyuan <me@lstlx.com>
Date: Thu, 21 Jul 2022 00:17:56 +0800
Subject: [PATCH 01/15] RVV: layernorm fp32 w/o elempack

---
 src/layer/riscv/layernorm_riscv.cpp | 410 ++++++++++++++++++++++++++++
 src/layer/riscv/layernorm_riscv.h   |  31 +++
 2 files changed, 441 insertions(+)
 create mode 100644 src/layer/riscv/layernorm_riscv.cpp
 create mode 100644 src/layer/riscv/layernorm_riscv.h
diff --git a/src/layer/riscv/layernorm_riscv.cpp b/src/layer/riscv/layernorm_riscv.cpp
new file mode 100644
index 000000000..8a4fc452d
--- /dev/null
+++ b/src/layer/riscv/layernorm_riscv.cpp
@@ -0,0 +1,410 @@
+// Xavier Hsinyuan is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 Xavier Hsinyuan <me@lstlx.com>. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+#include "layernorm_riscv.h"
+#include <math.h>
+
+#if __riscv_vector
+#ifdef RVV_SPEC_0_7
+#include "riscv_v_071_fix.h"
+#else
+#include <riscv_vector.h>
+#endif
+#endif // __riscv_vector
+
+namespace ncnn {
+LayerNorm_riscv::LayerNorm_riscv()
+{
+#if __riscv_vector
+#warning "TODO: packing. Don't merge at this moment."
+    // support_packing = true;
+#endif
+}
+
+int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
+{
+// x = (x - mean) / sqrt(var + eps) * gamma + beta
+#if __riscv_vector
+    int dims = bottom_top_blob.dims;
+    int w = bottom_top_blob.w;
+    if (dims == 1)
+    {
+        float* ptr = bottom_top_blob;
+
+        // mean and var
+        float sum = 0.f;
+        float sqsum = 0.f;
+        vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
+        vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
+
+        {
+            int n = w;
+            float* ptr_sum = ptr;
+            while (n > 0)
+            {
+                word_type vl = vsetvl_e32m8(n);
+                vfloat32m8_t _p = vle32_v_f32m8(ptr_sum, vl);
+                _sum = vfredosum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl);
+                // _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
+                ptr_sum += vl;
+                n -= vl;
+            }
+        }
+        sum = vfmv_f_s_f32m1_f32(_sum);
+        float mean = sum / w;
+
+        {
+            int n = w;
+            float* ptr_sqsum = ptr;
+            while (n > 0)
+            {
+                word_type vl = vsetvl_e32m8(n);
+                vfloat32m8_t _p = vle32_v_f32m8(ptr_sqsum, vl);
+                _p = vfsub_vf_f32m8(_p, mean, vl);
+                _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
+                n -= vl;
+                ptr_sqsum += vl;
+            }
+        }
+        sqsum = vfmv_f_s_f32m1_f32(_sqsum);
+        float var = sqsum / w;
+        // the var maybe minus due to accuracy
+        //float var = sqsum / w - mean * mean;
+        float a = static_cast<float>(1.f / (sqrt(var + eps)));
+        float b = -mean * a;
+
+        {
+            int n = w;
+            float* ptr_store = ptr;
+            const float* ptr_gamma = gamma_data;
+            const float* ptr_beta = beta_data;
+            if (affine)
+            {
+                while (n > 0)
+                {
+                    word_type vl = vsetvl_e32m8(n);
+                    vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl);
+                    _p = vfmul_vf_f32m8(_p, a, vl);
+                    vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl);
+                    _p = vfadd_vf_f32m8(_p, b, vl);
+                    vfloat32m8_t _beta = vle32_v_f32m8(ptr_beta, vl);
+                    _p = vfmadd_vv_f32m8(_p, _gamma, _beta, vl);
+                    vse32_v_f32m8(ptr_store, _p, vl);
+
+                    n -= vl;
+                    ptr_store += vl;
+                }
+            }
+            else
+            {
+                while (n > 0)
+                {
+                    word_type vl = vsetvl_e32m8(n);
+                    vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl);
+                    _p = vfmul_vf_f32m8(_p, a, vl);
+                    _p = vfadd_vf_f32m8(_p, b, vl);
+                    vse32_v_f32m8(ptr_store, _p, vl);
+                    n -= vl;
+                    ptr_store += vl;
+                }
+            }
+        }
+    }
+
+    if (dims == 2)
+    {
+        int w = bottom_top_blob.w;
+        int h = bottom_top_blob.h;
+        // assert affine_size == w
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int i = 0; i < h; i++)
+        {
+            float* ptr = bottom_top_blob.row(i);
+
+            // mean and var
+            float sum = 0.f;
+            float sqsum = 0.f;
+            vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
+            vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
+
+            {
+                int n = w;
+                float* ptr_sum = ptr;
+                while (n > 0)
+                {
+                    word_type vl = vsetvl_e32m8(n);
+                    vfloat32m8_t _p = vle32_v_f32m8(ptr_sum, vl);
+                    _sum = vfredosum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl);
+                    // _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
+                    ptr_sum += vl;
+                    n -= vl;
+                }
+            }
+            sum = vfmv_f_s_f32m1_f32(_sum);
+            float mean = sum / w;
+            float tmp = 0.f;
+
+            {
+                int n = w;
+                float* ptr_sqsum = ptr;
+                while (n > 0)
+                {
+                    word_type vl = vsetvl_e32m8(n);
+                    vfloat32m8_t _p = vle32_v_f32m8(ptr_sqsum, vl);
+                    _p = vfsub_vf_f32m8(_p, mean, vl);
+                    _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
+                    n -= vl;
+                    ptr_sqsum += vl;
+                }
+            }
+            sqsum = vfmv_f_s_f32m1_f32(_sqsum);
+            float var = sqsum / w;
+            // the var maybe minus due to accuracy
+            //float var = sqsum / w - mean * mean;
+
+            float a = static_cast<float>(1.f / (sqrt(var + eps)));
+            float b = -mean * a;
+
+            {
+                int n = w;
+                float* ptr_store = ptr;
+                const float* ptr_gamma = gamma_data;
+                const float* ptr_beta = beta_data;
+                if (affine)
+                {
+                    while (n > 0)
+                    {
+                        word_type vl = vsetvl_e32m8(n);
+                        vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl);
+                        _p = vfmul_vf_f32m8(_p, a, vl);
+                        vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl);
+                        _p = vfadd_vf_f32m8(_p, b, vl);
+                        vfloat32m8_t _beta = vle32_v_f32m8(ptr_beta, vl);
+                        _p = vfmadd_vv_f32m8(_p, _gamma, _beta, vl);
+                        vse32_v_f32m8(ptr_store, _p, vl);
+
+                        n -= vl;
+                        ptr_store += vl;
+                    }
+                }
+                else
+                {
+                    while (n > 0)
+                    {
+                        word_type vl = vsetvl_e32m8(n);
+                        vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl);
+                        _p = vfmul_vf_f32m8(_p, a, vl);
+                        _p = vfadd_vf_f32m8(_p, b, vl);
+                        vse32_v_f32m8(ptr_store, _p, vl);
+                        n -= vl;
+                        ptr_store += vl;
+                    }
+                }
+            }
+        }
+    }
+    if (dims == 3)
+    {
+        int w = bottom_top_blob.w;
+        int h = bottom_top_blob.h;
+        int channels = bottom_top_blob.c;
+        int size = w * h;
+
+        if (affine_size == w)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                for (int i = 0; i < h; i++)
+                {
+                    float* ptr = bottom_top_blob.channel(q).row(i);
+
+                    // mean and var
+                    float sum = 0.f;
+                    float sqsum = 0.f;
+                    vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
+                    vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
+                    {
+                        int n = w;
+                        float* ptr_sum = ptr;
+                        while (n > 0)
+                        {
+                            word_type vl = vsetvl_e32m8(n);
+                            vfloat32m8_t _p = vle32_v_f32m8(ptr_sum, vl);
+                            _sum = vfredosum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl);
+                            // _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
+                            ptr_sum += vl;
+                            n -= vl;
+                        }
+                    }
+                    sum = vfmv_f_s_f32m1_f32(_sum);
+                    float mean = sum / w;
+
+                    {
+                        int n = w;
+                        float* ptr_sqsum = ptr;
+                        while (n > 0)
+                        {
+                            word_type vl = vsetvl_e32m8(n);
+                            vfloat32m8_t _p = vle32_v_f32m8(ptr_sqsum, vl);
+                            _p = vfsub_vf_f32m8(_p, mean, vl);
+                            _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
+                            n -= vl;
+                            ptr_sqsum += vl;
+                        }
+                    }
+                    sqsum = vfmv_f_s_f32m1_f32(_sqsum);
+                    float var = sqsum / w;
+                    // the var maybe minus due to accuracy
+                    //float var = sqsum / w - mean * mean;
+
+                    float a = static_cast<float>(1.f / (sqrt(var + eps)));
+                    float b = -mean * a;
+
+                    {
+                        int n = w;
+                        float* ptr_store = ptr;
+                        const float* ptr_gamma = gamma_data;
+                        const float* ptr_beta = beta_data;
+                        if (affine)
+                        {
+                            while (n > 0)
+                            {
+                                word_type vl = vsetvl_e32m8(n);
+                                vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl);
+                                _p = vfmul_vf_f32m8(_p, a, vl);
+                                vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl);
+                                _p = vfadd_vf_f32m8(_p, b, vl);
+                                vfloat32m8_t _beta = vle32_v_f32m8(ptr_beta, vl);
+                                _p = vfmadd_vv_f32m8(_p, _gamma, _beta, vl);
+                                vse32_v_f32m8(ptr_store, _p, vl);
+
+                                n -= vl;
+                                ptr_store += vl;
+                            }
+                        }
+                        else
+                        {
+                            while (n > 0)
+                            {
+                                word_type vl = vsetvl_e32m8(n);
+                                vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl);
+                                _p = vfmul_vf_f32m8(_p, a, vl);
+                                _p = vfadd_vf_f32m8(_p, b, vl);
+                                vse32_v_f32m8(ptr_store, _p, vl);
+                                n -= vl;
+                                ptr_store += vl;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else // if (affine_size == size)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                float* ptr = bottom_top_blob.channel(q);
+
+                // mean and var
+                float sum = 0.f;
+                float sqsum = 0.f;
+                vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
+                vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
+                {
+                    int n = w;
+                    float* ptr_sum = ptr;
+                    while (n > 0)
+                    {
+                        word_type vl = vsetvl_e32m8(n);
+                        vfloat32m8_t _p = vle32_v_f32m8(ptr_sum, vl);
+                        _sum = vfredosum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl);
+                        // _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
+                        ptr_sum += vl;
+                        n -= vl;
+                    }
+                }
+                sum = vfmv_f_s_f32m1_f32(_sum);
+
+                float mean = sum / size;
+                {
+                    int n = w;
+                    float* ptr_sqsum = ptr;
+                    while (n > 0)
+                    {
+                        word_type vl = vsetvl_e32m8(n);
+                        vfloat32m8_t _p = vle32_v_f32m8(ptr_sqsum, vl);
+                        _p = vfsub_vf_f32m8(_p, mean, vl);
+                        _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
+                        n -= vl;
+                        ptr_sqsum += vl;
+                    }
+                }
+                sqsum = vfmv_f_s_f32m1_f32(_sqsum);
+
+                float var = sqsum / size;
+                // the var maybe minus due to accuracy
+                //float var = sqsum / size - mean * mean;
+
+                float a = static_cast<float>(1.f / (sqrt(var + eps)));
+                float b = -mean * a;
+
+                {
+                    int n = w;
+                    float* ptr_store = ptr;
+                    const float* ptr_gamma = gamma_data;
+                    const float* ptr_beta = beta_data;
+                    if (affine)
+                    {
+                        while (n > 0)
+                        {
+                            word_type vl = vsetvl_e32m8(n);
+                            vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl);
+                            _p = vfmul_vf_f32m8(_p, a, vl);
+                            vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl);
+                            _p = vfadd_vf_f32m8(_p, b, vl);
+                            vfloat32m8_t _beta = vle32_v_f32m8(ptr_beta, vl);
+                            _p = vfmadd_vv_f32m8(_p, _gamma, _beta, vl);
+                            vse32_v_f32m8(ptr_store, _p, vl);
+
+                            n -= vl;
+                            ptr_store += vl;
+                        }
+                    }
+                    else
+                    {
+                        while (n > 0)
+                        {
+                            word_type vl = vsetvl_e32m8(n);
+                            vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl);
+                            _p = vfmul_vf_f32m8(_p, a, vl);
+                            _p = vfadd_vf_f32m8(_p, b, vl);
+                            vse32_v_f32m8(ptr_store, _p, vl);
+                            n -= vl;
+                            ptr_store += vl;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+#else  // __riscv_vector
+    return LayerNorm::forward_inplace(bottom_top_blob, opt);
+#endif // __riscv_vector
+    return 0;
+}
+
+} // namespace ncnn
\ No newline at end of file
diff --git a/src/layer/riscv/layernorm_riscv.h b/src/layer/riscv/layernorm_riscv.h
new file mode 100644
index 000000000..e23cbe8ff
--- /dev/null
+++ b/src/layer/riscv/layernorm_riscv.h
@@ -0,0 +1,31 @@
+// Xavier Hsinyuan is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 Xavier Hsinyuan <me@lstlx.com>. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_LAYERNORM_RISCV_H
+#define LAYER_LAYERNORM_RISCV_H
+
+#include "layernorm.h"
+
+namespace ncnn {
+class LayerNorm_riscv : virtual public LayerNorm
+{
+public:
+    LayerNorm_riscv();
+
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_LAYERNORM_RISCV_H

From 062efce8b76f3733445a5541adce486e370462b5 Mon Sep 17 00:00:00 2001
From: Xavier Hsinyuan <me@lstlx.com>
Date: Thu, 21 Jul 2022 01:11:28 +0800
Subject: [PATCH 02/15] Refactor: RVV:layernorm fp32 w/o elempack

---
 src/layer/riscv/layernorm_riscv.cpp | 305 +++++++---------------------
 1 file changed, 78 insertions(+), 227 deletions(-)

diff --git a/src/layer/riscv/layernorm_riscv.cpp b/src/layer/riscv/layernorm_riscv.cpp
index 8a4fc452d..aa597c198 100644
--- a/src/layer/riscv/layernorm_riscv.cpp
+++ b/src/layer/riscv/layernorm_riscv.cpp
@@ -31,94 +31,102 @@ LayerNorm_riscv::LayerNorm_riscv()
 #endif
 }
 
-int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
-{
-// x = (x - mean) / sqrt(var + eps) * gamma + beta
 #if __riscv_vector
-    int dims = bottom_top_blob.dims;
-    int w = bottom_top_blob.w;
-    if (dims == 1)
+static inline int layernorm_rvv_pack1_procedure(int w, float* ptr, const float* gamma_data, const float* beta_data, float eps, int affine_size, int affine)
+{
+    float sum = 0.f;
+    float sqsum = 0.f;
+    vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
+    vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
     {
-        float* ptr = bottom_top_blob;
+        int n = w;
+        float* ptr_sum = ptr;
+        while (n > 0)
+        {
+            word_type vl = vsetvl_e32m8(n);
+            vfloat32m8_t _p = vle32_v_f32m8(ptr_sum, vl);
+            _sum = vfredusum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl);
+            // _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
+            ptr_sum += vl;
+            n -= vl;
+        }
+    }
+    sum = vfmv_f_s_f32m1_f32(_sum);
+    float mean = sum / w;
 
-        // mean and var
-        float sum = 0.f;
-        float sqsum = 0.f;
-        vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
-        vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
+    {
+        int n = w;
+        float* ptr_sqsum = ptr;
+        while (n > 0)
+        {
+            word_type vl = vsetvl_e32m8(n);
+            vfloat32m8_t _p = vle32_v_f32m8(ptr_sqsum, vl);
+            _p = vfsub_vf_f32m8(_p, mean, vl);
+            _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
+            n -= vl;
+            ptr_sqsum += vl;
+        }
+    }
+    sqsum = vfmv_f_s_f32m1_f32(_sqsum);
+    float var = sqsum / w;
+    // the var maybe minus due to accuracy
+    //float var = sqsum / w - mean * mean;
+    float a = static_cast<float>(1.f / (sqrt(var + eps)));
+    float b = -mean * a;
 
+    {
+        int n = w;
+        float* ptr_store = ptr;
+        const float* ptr_gamma = gamma_data;
+        const float* ptr_beta = beta_data;
+        if (affine)
         {
-            int n = w;
-            float* ptr_sum = ptr;
             while (n > 0)
             {
                 word_type vl = vsetvl_e32m8(n);
-                vfloat32m8_t _p = vle32_v_f32m8(ptr_sum, vl);
-                _sum = vfredosum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl);
-                // _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
-                ptr_sum += vl;
+                vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl);
+                _p = vfmul_vf_f32m8(_p, a, vl);
+                vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl);
+                _p = vfadd_vf_f32m8(_p, b, vl);
+                vfloat32m8_t _beta = vle32_v_f32m8(ptr_beta, vl);
+                _p = vfmadd_vv_f32m8(_p, _gamma, _beta, vl);
+                vse32_v_f32m8(ptr_store, _p, vl);
+
                 n -= vl;
+                ptr_store += vl;
+                ptr_gamma += vl;
+                ptr_beta += vl;
             }
         }
-        sum = vfmv_f_s_f32m1_f32(_sum);
-        float mean = sum / w;
-
+        else
         {
-            int n = w;
-            float* ptr_sqsum = ptr;
             while (n > 0)
             {
                 word_type vl = vsetvl_e32m8(n);
-                vfloat32m8_t _p = vle32_v_f32m8(ptr_sqsum, vl);
-                _p = vfsub_vf_f32m8(_p, mean, vl);
-                _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
+                vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl);
+                _p = vfmul_vf_f32m8(_p, a, vl);
+                _p = vfadd_vf_f32m8(_p, b, vl);
+                vse32_v_f32m8(ptr_store, _p, vl);
                 n -= vl;
-                ptr_sqsum += vl;
+                ptr_store += vl;
             }
         }
-        sqsum = vfmv_f_s_f32m1_f32(_sqsum);
-        float var = sqsum / w;
-        // the var maybe minus due to accuracy
-        //float var = sqsum / w - mean * mean;
-        float a = static_cast<float>(1.f / (sqrt(var + eps)));
-        float b = -mean * a;
+    }
+    return 0;
+}
+#endif // __riscv_vector
 
-        {
-            int n = w;
-            float* ptr_store = ptr;
-            const float* ptr_gamma = gamma_data;
-            const float* ptr_beta = beta_data;
-            if (affine)
-            {
-                while (n > 0)
-                {
-                    word_type vl = vsetvl_e32m8(n);
-                    vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl);
-                    _p = vfmul_vf_f32m8(_p, a, vl);
-                    vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl);
-                    _p = vfadd_vf_f32m8(_p, b, vl);
-                    vfloat32m8_t _beta = vle32_v_f32m8(ptr_beta, vl);
-                    _p = vfmadd_vv_f32m8(_p, _gamma, _beta, vl);
-                    vse32_v_f32m8(ptr_store, _p, vl);
+int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
+{
+// x = (x - mean) / sqrt(var + eps) * gamma + beta
+#if __riscv_vector
+    int dims = bottom_top_blob.dims;
+    int w = bottom_top_blob.w;
+    if (dims == 1)
+    {
+        float* ptr = bottom_top_blob;
 
-                    n -= vl;
-                    ptr_store += vl;
-                }
-            }
-            else
-            {
-                while (n > 0)
-                {
-                    word_type vl = vsetvl_e32m8(n);
-                    vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl);
-                    _p = vfmul_vf_f32m8(_p, a, vl);
-                    _p = vfadd_vf_f32m8(_p, b, vl);
-                    vse32_v_f32m8(ptr_store, _p, vl);
-                    n -= vl;
-                    ptr_store += vl;
-                }
-            }
-        }
+        return layernorm_rvv_pack1_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine);
     }
 
     if (dims == 2)
@@ -131,87 +139,7 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
         for (int i = 0; i < h; i++)
         {
             float* ptr = bottom_top_blob.row(i);
-
-            // mean and var
-            float sum = 0.f;
-            float sqsum = 0.f;
-            vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
-            vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
-
-            {
-                int n = w;
-                float* ptr_sum = ptr;
-                while (n > 0)
-                {
-                    word_type vl = vsetvl_e32m8(n);
-                    vfloat32m8_t _p = vle32_v_f32m8(ptr_sum, vl);
-                    _sum = vfredosum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl);
-                    // _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
-                    ptr_sum += vl;
-                    n -= vl;
-                }
-            }
-            sum = vfmv_f_s_f32m1_f32(_sum);
-            float mean = sum / w;
-            float tmp = 0.f;
-
-            {
-                int n = w;
-                float* ptr_sqsum = ptr;
-                while (n > 0)
-                {
-                    word_type vl = vsetvl_e32m8(n);
-                    vfloat32m8_t _p = vle32_v_f32m8(ptr_sqsum, vl);
-                    _p = vfsub_vf_f32m8(_p, mean, vl);
-                    _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
-                    n -= vl;
-                    ptr_sqsum += vl;
-                }
-            }
-            sqsum = vfmv_f_s_f32m1_f32(_sqsum);
-            float var = sqsum / w;
-            // the var maybe minus due to accuracy
-            //float var = sqsum / w - mean * mean;
-
-            float a = static_cast<float>(1.f / (sqrt(var + eps)));
-            float b = -mean * a;
-
-            {
-                int n = w;
-                float* ptr_store = ptr;
-                const float* ptr_gamma = gamma_data;
-                const float* ptr_beta = beta_data;
-                if (affine)
-                {
-                    while (n > 0)
-                    {
-                        word_type vl = vsetvl_e32m8(n);
-                        vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl);
-                        _p = vfmul_vf_f32m8(_p, a, vl);
-                        vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl);
-                        _p = vfadd_vf_f32m8(_p, b, vl);
-                        vfloat32m8_t _beta = vle32_v_f32m8(ptr_beta, vl);
-                        _p = vfmadd_vv_f32m8(_p, _gamma, _beta, vl);
-                        vse32_v_f32m8(ptr_store, _p, vl);
-
-                        n -= vl;
-                        ptr_store += vl;
-                    }
-                }
-                else
-                {
-                    while (n > 0)
-                    {
-                        word_type vl = vsetvl_e32m8(n);
-                        vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl);
-                        _p = vfmul_vf_f32m8(_p, a, vl);
-                        _p = vfadd_vf_f32m8(_p, b, vl);
-                        vse32_v_f32m8(ptr_store, _p, vl);
-                        n -= vl;
-                        ptr_store += vl;
-                    }
-                }
-            }
+            layernorm_rvv_pack1_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine);
         }
     }
     if (dims == 3)
@@ -230,84 +158,7 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
                 {
                     float* ptr = bottom_top_blob.channel(q).row(i);
 
-                    // mean and var
-                    float sum = 0.f;
-                    float sqsum = 0.f;
-                    vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
-                    vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
-                    {
-                        int n = w;
-                        float* ptr_sum = ptr;
-                        while (n > 0)
-                        {
-                            word_type vl = vsetvl_e32m8(n);
-                            vfloat32m8_t _p = vle32_v_f32m8(ptr_sum, vl);
-                            _sum = vfredosum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl);
-                            // _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
-                            ptr_sum += vl;
-                            n -= vl;
-                        }
-                    }
-                    sum = vfmv_f_s_f32m1_f32(_sum);
-                    float mean = sum / w;
-
-                    {
-                        int n = w;
-                        float* ptr_sqsum = ptr;
-                        while (n > 0)
-                        {
-                            word_type vl = vsetvl_e32m8(n);
-                            vfloat32m8_t _p = vle32_v_f32m8(ptr_sqsum, vl);
-                            _p = vfsub_vf_f32m8(_p, mean, vl);
-                            _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
-                            n -= vl;
-                            ptr_sqsum += vl;
-                        }
-                    }
-                    sqsum = vfmv_f_s_f32m1_f32(_sqsum);
-                    float var = sqsum / w;
-                    // the var maybe minus due to accuracy
-                    //float var = sqsum / w - mean * mean;
-
-                    float a = static_cast<float>(1.f / (sqrt(var + eps)));
-                    float b = -mean * a;
-
-                    {
-                        int n = w;
-                        float* ptr_store = ptr;
-                        const float* ptr_gamma = gamma_data;
-                        const float* ptr_beta = beta_data;
-                        if (affine)
-                        {
-                            while (n > 0)
-                            {
-                                word_type vl = vsetvl_e32m8(n);
-                                vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl);
-                                _p = vfmul_vf_f32m8(_p, a, vl);
-                                vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl);
-                                _p = vfadd_vf_f32m8(_p, b, vl);
-                                vfloat32m8_t _beta = vle32_v_f32m8(ptr_beta, vl);
-                                _p = vfmadd_vv_f32m8(_p, _gamma, _beta, vl);
-                                vse32_v_f32m8(ptr_store, _p, vl);
-
-                                n -= vl;
-                                ptr_store += vl;
-                            }
-                        }
-                        else
-                        {
-                            while (n > 0)
-                            {
-                                word_type vl = vsetvl_e32m8(n);
-                                vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl);
-                                _p = vfmul_vf_f32m8(_p, a, vl);
-                                _p = vfadd_vf_f32m8(_p, b, vl);
-                                vse32_v_f32m8(ptr_store, _p, vl);
-                                n -= vl;
-                                ptr_store += vl;
-                            }
-                        }
-                    }
+                    layernorm_rvv_pack1_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine);
                 }
             }
         }

From 569b24c14fa17f9c9e85e6ddb7d4e72a214fc497 Mon Sep 17 00:00:00 2001
From: Xavier Hsinyuan <me@lstlx.com>
Date: Thu, 21 Jul 2022 11:22:39 +0800
Subject: [PATCH 03/15] RVV: layernorm fp32 elempack

---
 src/layer/riscv/layernorm_riscv.cpp | 291 +++++++++++++++++++++-------
 1 file changed, 216 insertions(+), 75 deletions(-)

diff --git a/src/layer/riscv/layernorm_riscv.cpp b/src/layer/riscv/layernorm_riscv.cpp
index aa597c198..ddd22cb1e 100644
--- a/src/layer/riscv/layernorm_riscv.cpp
+++ b/src/layer/riscv/layernorm_riscv.cpp
@@ -22,12 +22,13 @@
 #endif
 #endif // __riscv_vector
 
+#include "riscv_usability.h"
+
 namespace ncnn {
 LayerNorm_riscv::LayerNorm_riscv()
 {
 #if __riscv_vector
-#warning "TODO: packing. Don't merge at this moment."
-    // support_packing = true;
+    support_packing = true;
 #endif
 }
 
@@ -114,19 +115,72 @@ static inline int layernorm_rvv_pack1_procedure(int w, float* ptr, const float*
     }
     return 0;
 }
+
+static inline int layernorm_rvv_packn_procedure(int w, float* ptr, const float* gamma_data, const float* beta_data, float eps, int affine_size, int affine, const word_type vl)
+{
+    // mean and var
+    vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl);
+    vfloat32m1_t _sqsum = vfmv_v_f_f32m1(0.f, vl);
+    for (int i = 0; i < w; i++)
+    {
+        vfloat32m1_t _p = vle32_v_f32m1(ptr + vl * i, vl);
+        _sum = vfadd_vv_f32m1(_p, _sum, vl);
+        // _sqsum = vfmadd_vv_f32m1(_p,_p,_sqsum,vl);
+    }
+    vfloat32m1_t _mean = vfdiv_vf_f32m1(_sum, w, vl);
+    for (int i = 0; i < w; i++)
+    {
+        vfloat32m1_t _p = vle32_v_f32m1(ptr + vl * i, vl);
+        _p = vfsub_vv_f32m1(_p, _mean, vl);
+        _sqsum = vfmadd_vv_f32m1(_p, _p, _sqsum, vl);
+    }
+    vfloat32m1_t _var = vfdiv_vf_f32m1(_sqsum, w, vl);
+
+    // the var maybe minus due to accuracy
+    //float var = sqsum / w - mean * mean;
+    vfloat32m1_t _a = vfrdiv_vf_f32m1(vfsqrt_v_f32m1(vfadd_vf_f32m1(_var, eps, vl), vl), 1.f, vl);
+    vfloat32m1_t _b = vfmul_vv_f32m1(vfsgnjn_vv_f32m1(_mean, _mean, vl), _a, vl);
+    if (affine)
+    {
+        for (int i = 0; i < w; i++)
+        {
+            const int offset = vl * i;
+            vfloat32m1_t _p = vle32_v_f32m1(ptr + offset, vl);
+            _p = vfmadd_vv_f32m1(_p, _a, _b, vl);
+            _p = vfmul_vf_f32m1(_p, gamma_data[i], vl);
+            _p = vfadd_vf_f32m1(_p, beta_data[i], vl);
+            vse32_v_f32m1(ptr + offset, _p, vl);
+        }
+    }
+    else
+    {
+        for (int i = 0; i < w; i++)
+        {
+            const int offset = vl * i;
+            vfloat32m1_t _p = vle32_v_f32m1(ptr + offset, vl);
+            _p = vfmadd_vv_f32m1(_p, _a, _b, vl);
+            vse32_v_f32m1(ptr + offset, _p, vl);
+        }
+    }
+
+    return 0;
+}
 #endif // __riscv_vector
 
 int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
 // x = (x - mean) / sqrt(var + eps) * gamma + beta
 #if __riscv_vector
+    int elempack = bottom_top_blob.elempack;
+    const int packn = csrr_vlenb() / 4;
     int dims = bottom_top_blob.dims;
     int w = bottom_top_blob.w;
+
     if (dims == 1)
     {
         float* ptr = bottom_top_blob;
 
-        return layernorm_rvv_pack1_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine);
+        return layernorm_rvv_pack1_procedure(w * elempack, ptr, gamma_data, beta_data, eps, affine_size, affine);
     }
 
     if (dims == 2)
@@ -134,12 +188,24 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
         int w = bottom_top_blob.w;
         int h = bottom_top_blob.h;
         // assert affine_size == w
-
-        #pragma omp parallel for num_threads(opt.num_threads)
-        for (int i = 0; i < h; i++)
+        if (elempack == 1)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i = 0; i < h; i++)
+            {
+                float* ptr = bottom_top_blob.row(i);
+                layernorm_rvv_pack1_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine);
+            }
+        }
+        if (elempack == packn)
         {
-            float* ptr = bottom_top_blob.row(i);
-            layernorm_rvv_pack1_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine);
+            const word_type vl = vsetvl_e32m1(packn);
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i = 0; i < h; i++)
+            {
+                float* ptr = bottom_top_blob.row(i);
+                layernorm_rvv_packn_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine, vl);
+            }
         }
     }
     if (dims == 3)
@@ -151,100 +217,175 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
 
         if (affine_size == w)
         {
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int q = 0; q < channels; q++)
+            if (elempack == 1)
             {
-                for (int i = 0; i < h; i++)
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
                 {
-                    float* ptr = bottom_top_blob.channel(q).row(i);
+                    for (int i = 0; i < h; i++)
+                    {
+                        float* ptr = bottom_top_blob.channel(q).row(i);
 
-                    layernorm_rvv_pack1_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine);
+                        layernorm_rvv_pack1_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine);
+                    }
+                }
+            }
+            if (elempack == packn)
+            {
+                const word_type vl = vsetvl_e32m1(packn);
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    for (int i = 0; i < h; i++)
+                    {
+                        float* ptr = bottom_top_blob.channel(q).row(i);
+
+                        layernorm_rvv_packn_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine, vl);
+                    }
                 }
             }
         }
         else // if (affine_size == size)
         {
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int q = 0; q < channels; q++)
+            if (elempack == 1)
             {
-                float* ptr = bottom_top_blob.channel(q);
-
-                // mean and var
-                float sum = 0.f;
-                float sqsum = 0.f;
-                vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
-                vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
                 {
-                    int n = w;
-                    float* ptr_sum = ptr;
-                    while (n > 0)
+                    float* ptr = bottom_top_blob.channel(q);
+
+                    // mean and var
+                    float sum = 0.f;
+                    float sqsum = 0.f;
+                    vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
+                    vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
                     {
-                        word_type vl = vsetvl_e32m8(n);
-                        vfloat32m8_t _p = vle32_v_f32m8(ptr_sum, vl);
-                        _sum = vfredosum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl);
-                        // _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
-                        ptr_sum += vl;
-                        n -= vl;
+                        int n = size;
+                        float* ptr_sum = ptr;
+                        while (n > 0)
+                        {
+                            word_type vl = vsetvl_e32m8(n);
+                            vfloat32m8_t _p = vle32_v_f32m8(ptr_sum, vl);
+                            _sum = vfredosum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl);
+                            // _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
+                            ptr_sum += vl;
+                            n -= vl;
+                        }
                     }
-                }
-                sum = vfmv_f_s_f32m1_f32(_sum);
+                    sum = vfmv_f_s_f32m1_f32(_sum);
 
-                float mean = sum / size;
-                {
-                    int n = w;
-                    float* ptr_sqsum = ptr;
-                    while (n > 0)
+                    float mean = sum / size;
                     {
-                        word_type vl = vsetvl_e32m8(n);
-                        vfloat32m8_t _p = vle32_v_f32m8(ptr_sqsum, vl);
-                        _p = vfsub_vf_f32m8(_p, mean, vl);
-                        _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
-                        n -= vl;
-                        ptr_sqsum += vl;
+                        int n = size;
+                        float* ptr_sqsum = ptr;
+                        while (n > 0)
+                        {
+                            word_type vl = vsetvl_e32m8(n);
+                            vfloat32m8_t _p = vle32_v_f32m8(ptr_sqsum, vl);
+                            _p = vfsub_vf_f32m8(_p, mean, vl);
+                            _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
+                            n -= vl;
+                            ptr_sqsum += vl;
+                        }
                     }
-                }
-                sqsum = vfmv_f_s_f32m1_f32(_sqsum);
+                    sqsum = vfmv_f_s_f32m1_f32(_sqsum);
+
+                    float var = sqsum / size;
+                    // the var maybe minus due to accuracy
+                    //float var = sqsum / size - mean * mean;
 
-                float var = sqsum / size;
-                // the var maybe minus due to accuracy
-                //float var = sqsum / size - mean * mean;
+                    float a = static_cast<float>(1.f / (sqrt(var + eps)));
+                    float b = -mean * a;
 
-                float a = static_cast<float>(1.f / (sqrt(var + eps)));
-                float b = -mean * a;
+                    {
+                        int n = size;
+                        float* ptr_store = ptr;
+                        const float* ptr_gamma = gamma_data;
+                        const float* ptr_beta = beta_data;
+                        if (affine)
+                        {
+                            while (n > 0)
+                            {
+                                word_type vl = vsetvl_e32m8(n);
+                                vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl);
+                                _p = vfmul_vf_f32m8(_p, a, vl);
+                                vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl);
+                                _p = vfadd_vf_f32m8(_p, b, vl);
+                                vfloat32m8_t _beta = vle32_v_f32m8(ptr_beta, vl);
+                                _p = vfmadd_vv_f32m8(_p, _gamma, _beta, vl);
+                                vse32_v_f32m8(ptr_store, _p, vl);
 
+                                n -= vl;
+                                ptr_store += vl;
+                                ptr_gamma += vl;
+                                ptr_beta += vl;
+                            }
+                        }
+                        else
+                        {
+                            while (n > 0)
+                            {
+                                word_type vl = vsetvl_e32m8(n);
+                                vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl);
+                                _p = vfmul_vf_f32m8(_p, a, vl);
+                                _p = vfadd_vf_f32m8(_p, b, vl);
+                                vse32_v_f32m8(ptr_store, _p, vl);
+                                n -= vl;
+                                ptr_store += vl;
+                            }
+                        }
+                    }
+                }
+            }
+            if (elempack == packn)
+            {
+                const word_type vl = vsetvl_e32m1(packn);
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
                 {
-                    int n = w;
-                    float* ptr_store = ptr;
-                    const float* ptr_gamma = gamma_data;
-                    const float* ptr_beta = beta_data;
+                    float* ptr = bottom_top_blob.channel(q);
+                    // mean and var
+                    vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vsetvlmax_e32m1());
+                    vfloat32m1_t _sqsum = vfmv_v_f_f32m1(0.f, vsetvlmax_e32m1());
+                    for (int i = 0; i < size; i++)
+                    {
+                        vfloat32m1_t _p = vle32_v_f32m1(ptr + vl * i, vl);
+                        _sum = vfadd_vv_f32m1(_p, _sum, vl);
+                        // _sqsum = vfmadd_vv_f32m1(_p,_p,_sqsum,vl);
+                    }
+                    vfloat32m1_t _mean = vfdiv_vf_f32m1(_sum, size, vl);
+                    for (int i = 0; i < size; i++)
+                    {
+                        vfloat32m1_t _p = vle32_v_f32m1(ptr + vl * i, vl);
+                        _p = vfsub_vv_f32m1(_p, _mean, vl);
+                        _sqsum = vfmadd_vv_f32m1(_p, _p, _sqsum, vl);
+                    }
+                    vfloat32m1_t _var = vfdiv_vf_f32m1(_sqsum, size, vl);
+
+                    // the var maybe minus due to accuracy
+                    //float var = sqsum / w - mean * mean;
+                    vfloat32m1_t _a = vfrdiv_vf_f32m1(vfsqrt_v_f32m1(vfadd_vf_f32m1(_var, eps, vl), vl), 1.f, vl);
+                    vfloat32m1_t _b = vfmul_vv_f32m1(vfsgnjn_vv_f32m1(_mean, _mean, vl), _a, vl);
                     if (affine)
                     {
-                        while (n > 0)
+                        for (int i = 0; i < size; i++)
                         {
-                            word_type vl = vsetvl_e32m8(n);
-                            vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl);
-                            _p = vfmul_vf_f32m8(_p, a, vl);
-                            vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl);
-                            _p = vfadd_vf_f32m8(_p, b, vl);
-                            vfloat32m8_t _beta = vle32_v_f32m8(ptr_beta, vl);
-                            _p = vfmadd_vv_f32m8(_p, _gamma, _beta, vl);
-                            vse32_v_f32m8(ptr_store, _p, vl);
-
-                            n -= vl;
-                            ptr_store += vl;
+                            const int offset = vl * i;
+                            vfloat32m1_t _p = vle32_v_f32m1(ptr + offset, vl);
+                            _p = vfmadd_vv_f32m1(_p, _a, _b, vl);
+                            _p = vfmul_vf_f32m1(_p, gamma_data[i], vl);
+                            _p = vfadd_vf_f32m1(_p, beta_data[i], vl);
+                            vse32_v_f32m1(ptr + offset, _p, vl);
                         }
                     }
                     else
                     {
-                        while (n > 0)
+                        for (int i = 0; i < size; i++)
                         {
-                            word_type vl = vsetvl_e32m8(n);
-                            vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl);
-                            _p = vfmul_vf_f32m8(_p, a, vl);
-                            _p = vfadd_vf_f32m8(_p, b, vl);
-                            vse32_v_f32m8(ptr_store, _p, vl);
-                            n -= vl;
-                            ptr_store += vl;
+                            const int offset = vl * i;
+                            vfloat32m1_t _p = vle32_v_f32m1(ptr + offset, vl);
+                            _p = vfmadd_vv_f32m1(_p, _a, _b, vl);
+                            vse32_v_f32m1(ptr + offset, _p, vl);
                         }
                     }
                 }

From 706ebd14ac22451e3c753006f4f2aba1c63699c8 Mon Sep 17 00:00:00 2001
From: Xavier Hsinyuan <me@lstlx.com>
Date: Thu, 21 Jul 2022 18:13:11 +0800
Subject: [PATCH 04/15] Refactor: layernorm

refactor condition check order when dim >= 2.
---
 src/layer/riscv/layernorm_riscv.cpp | 97 ++++++++++++++++-------------
 1 file changed, 53 insertions(+), 44 deletions(-)

diff --git a/src/layer/riscv/layernorm_riscv.cpp b/src/layer/riscv/layernorm_riscv.cpp
index ddd22cb1e..a3f1f44c8 100644
--- a/src/layer/riscv/layernorm_riscv.cpp
+++ b/src/layer/riscv/layernorm_riscv.cpp
@@ -182,14 +182,13 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
 
         return layernorm_rvv_pack1_procedure(w * elempack, ptr, gamma_data, beta_data, eps, affine_size, affine);
     }
-
-    if (dims == 2)
+    if (elempack == 1)
     {
-        int w = bottom_top_blob.w;
-        int h = bottom_top_blob.h;
-        // assert affine_size == w
-        if (elempack == 1)
+        if (dims == 2)
         {
+            int w = bottom_top_blob.w;
+            int h = bottom_top_blob.h;
+            // assert affine_size == w
             #pragma omp parallel for num_threads(opt.num_threads)
             for (int i = 0; i < h; i++)
             {
@@ -197,27 +196,14 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
                 layernorm_rvv_pack1_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine);
             }
         }
-        if (elempack == packn)
-        {
-            const word_type vl = vsetvl_e32m1(packn);
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int i = 0; i < h; i++)
-            {
-                float* ptr = bottom_top_blob.row(i);
-                layernorm_rvv_packn_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine, vl);
-            }
-        }
-    }
-    if (dims == 3)
-    {
-        int w = bottom_top_blob.w;
-        int h = bottom_top_blob.h;
-        int channels = bottom_top_blob.c;
-        int size = w * h;
 
-        if (affine_size == w)
+        if (dims == 3)
         {
-            if (elempack == 1)
+            int w = bottom_top_blob.w;
+            int h = bottom_top_blob.h;
+            int channels = bottom_top_blob.c;
+            int size = w * h;
+            if (affine_size == w)
             {
                 #pragma omp parallel for num_threads(opt.num_threads)
                 for (int q = 0; q < channels; q++)
@@ -230,24 +216,7 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
                     }
                 }
             }
-            if (elempack == packn)
-            {
-                const word_type vl = vsetvl_e32m1(packn);
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    for (int i = 0; i < h; i++)
-                    {
-                        float* ptr = bottom_top_blob.channel(q).row(i);
-
-                        layernorm_rvv_packn_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine, vl);
-                    }
-                }
-            }
-        }
-        else // if (affine_size == size)
-        {
-            if (elempack == 1)
+            else // if (affine_size == size)
             {
                 #pragma omp parallel for num_threads(opt.num_threads)
                 for (int q = 0; q < channels; q++)
@@ -337,7 +306,47 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
                     }
                 }
             }
-            if (elempack == packn)
+        }
+    }
+
+    if (elempack == packn)
+    {
+        if (dims == 2)
+        {
+            int w = bottom_top_blob.w;
+            int h = bottom_top_blob.h;
+            // assert affine_size == w
+
+            const word_type vl = vsetvl_e32m1(packn);
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i = 0; i < h; i++)
+            {
+                float* ptr = bottom_top_blob.row(i);
+                layernorm_rvv_packn_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine, vl);
+            }
+        }
+        if (dims == 3)
+        {
+            int w = bottom_top_blob.w;
+            int h = bottom_top_blob.h;
+            int channels = bottom_top_blob.c;
+            int size = w * h;
+
+            if (affine_size == w)
+            {
+                const word_type vl = vsetvl_e32m1(packn);
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    for (int i = 0; i < h; i++)
+                    {
+                        float* ptr = bottom_top_blob.channel(q).row(i);
+
+                        layernorm_rvv_packn_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine, vl);
+                    }
+                }
+            }
+            else // if (affine_size == size)
             {
                 const word_type vl = vsetvl_e32m1(packn);
                 #pragma omp parallel for num_threads(opt.num_threads)

From c42186e6e4ba43b99289b193cc56cb0cfd21db1b Mon Sep 17 00:00:00 2001
From: Xavier Hsinyuan <me@lstlx.com>
Date: Thu, 21 Jul 2022 18:53:35 +0800
Subject: [PATCH 05/15] Refactor: layernorm

* remove some unused variable;
* refactor when "affine_size == size && dim == 3"
---
 src/layer/riscv/layernorm_riscv.cpp | 150 +++-------------------------
 1 file changed, 12 insertions(+), 138 deletions(-)

diff --git a/src/layer/riscv/layernorm_riscv.cpp b/src/layer/riscv/layernorm_riscv.cpp
index a3f1f44c8..606502fb5 100644
--- a/src/layer/riscv/layernorm_riscv.cpp
+++ b/src/layer/riscv/layernorm_riscv.cpp
@@ -33,7 +33,7 @@ LayerNorm_riscv::LayerNorm_riscv()
 }
 
 #if __riscv_vector
-static inline int layernorm_rvv_pack1_procedure(int w, float* ptr, const float* gamma_data, const float* beta_data, float eps, int affine_size, int affine)
+static inline int layernorm_rvv_pack1_procedure(int w, float* ptr, const float* gamma_data, const float* beta_data, float eps, int affine)
 {
     float sum = 0.f;
     float sqsum = 0.f;
@@ -47,7 +47,7 @@ static inline int layernorm_rvv_pack1_procedure(int w, float* ptr, const float*
             word_type vl = vsetvl_e32m8(n);
             vfloat32m8_t _p = vle32_v_f32m8(ptr_sum, vl);
             _sum = vfredusum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl);
-            // _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
+            // _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
             ptr_sum += vl;
             n -= vl;
         }
@@ -63,7 +63,7 @@ static inline int layernorm_rvv_pack1_procedure(int w, float* ptr, const float*
             word_type vl = vsetvl_e32m8(n);
             vfloat32m8_t _p = vle32_v_f32m8(ptr_sqsum, vl);
             _p = vfsub_vf_f32m8(_p, mean, vl);
-            _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
+            _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
             n -= vl;
             ptr_sqsum += vl;
         }
@@ -116,7 +116,7 @@ static inline int layernorm_rvv_pack1_procedure(int w, float* ptr, const float*
     return 0;
 }
 
-static inline int layernorm_rvv_packn_procedure(int w, float* ptr, const float* gamma_data, const float* beta_data, float eps, int affine_size, int affine, const word_type vl)
+static inline int layernorm_rvv_packn_procedure(int w, float* ptr, const float* gamma_data, const float* beta_data, float eps, int affine, const word_type vl)
 {
     // mean and var
     vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl);
@@ -180,7 +180,7 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
     {
         float* ptr = bottom_top_blob;
 
-        return layernorm_rvv_pack1_procedure(w * elempack, ptr, gamma_data, beta_data, eps, affine_size, affine);
+        return layernorm_rvv_pack1_procedure(w * elempack, ptr, gamma_data, beta_data, eps, affine);
     }
     if (elempack == 1)
     {
@@ -193,7 +193,7 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
             for (int i = 0; i < h; i++)
             {
                 float* ptr = bottom_top_blob.row(i);
-                layernorm_rvv_pack1_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine);
+                layernorm_rvv_pack1_procedure(w, ptr, gamma_data, beta_data, eps, affine);
             }
         }
 
@@ -212,7 +212,7 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
                     {
                         float* ptr = bottom_top_blob.channel(q).row(i);
 
-                        layernorm_rvv_pack1_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine);
+                        layernorm_rvv_pack1_procedure(w, ptr, gamma_data, beta_data, eps, affine);
                     }
                 }
             }
@@ -222,88 +222,7 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
                 for (int q = 0; q < channels; q++)
                 {
                     float* ptr = bottom_top_blob.channel(q);
-
-                    // mean and var
-                    float sum = 0.f;
-                    float sqsum = 0.f;
-                    vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
-                    vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
-                    {
-                        int n = size;
-                        float* ptr_sum = ptr;
-                        while (n > 0)
-                        {
-                            word_type vl = vsetvl_e32m8(n);
-                            vfloat32m8_t _p = vle32_v_f32m8(ptr_sum, vl);
-                            _sum = vfredosum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl);
-                            // _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
-                            ptr_sum += vl;
-                            n -= vl;
-                        }
-                    }
-                    sum = vfmv_f_s_f32m1_f32(_sum);
-
-                    float mean = sum / size;
-                    {
-                        int n = size;
-                        float* ptr_sqsum = ptr;
-                        while (n > 0)
-                        {
-                            word_type vl = vsetvl_e32m8(n);
-                            vfloat32m8_t _p = vle32_v_f32m8(ptr_sqsum, vl);
-                            _p = vfsub_vf_f32m8(_p, mean, vl);
-                            _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
-                            n -= vl;
-                            ptr_sqsum += vl;
-                        }
-                    }
-                    sqsum = vfmv_f_s_f32m1_f32(_sqsum);
-
-                    float var = sqsum / size;
-                    // the var maybe minus due to accuracy
-                    //float var = sqsum / size - mean * mean;
-
-                    float a = static_cast<float>(1.f / (sqrt(var + eps)));
-                    float b = -mean * a;
-
-                    {
-                        int n = size;
-                        float* ptr_store = ptr;
-                        const float* ptr_gamma = gamma_data;
-                        const float* ptr_beta = beta_data;
-                        if (affine)
-                        {
-                            while (n > 0)
-                            {
-                                word_type vl = vsetvl_e32m8(n);
-                                vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl);
-                                _p = vfmul_vf_f32m8(_p, a, vl);
-                                vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl);
-                                _p = vfadd_vf_f32m8(_p, b, vl);
-                                vfloat32m8_t _beta = vle32_v_f32m8(ptr_beta, vl);
-                                _p = vfmadd_vv_f32m8(_p, _gamma, _beta, vl);
-                                vse32_v_f32m8(ptr_store, _p, vl);
-
-                                n -= vl;
-                                ptr_store += vl;
-                                ptr_gamma += vl;
-                                ptr_beta += vl;
-                            }
-                        }
-                        else
-                        {
-                            while (n > 0)
-                            {
-                                word_type vl = vsetvl_e32m8(n);
-                                vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl);
-                                _p = vfmul_vf_f32m8(_p, a, vl);
-                                _p = vfadd_vf_f32m8(_p, b, vl);
-                                vse32_v_f32m8(ptr_store, _p, vl);
-                                n -= vl;
-                                ptr_store += vl;
-                            }
-                        }
-                    }
+                    layernorm_rvv_pack1_procedure(size, ptr, gamma_data, beta_data, eps, affine);
                 }
             }
         }
@@ -311,18 +230,18 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
 
     if (elempack == packn)
     {
+        const word_type vl = vsetvl_e32m1(packn);
         if (dims == 2)
         {
             int w = bottom_top_blob.w;
             int h = bottom_top_blob.h;
             // assert affine_size == w
 
-            const word_type vl = vsetvl_e32m1(packn);
             #pragma omp parallel for num_threads(opt.num_threads)
             for (int i = 0; i < h; i++)
             {
                 float* ptr = bottom_top_blob.row(i);
-                layernorm_rvv_packn_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine, vl);
+                layernorm_rvv_packn_procedure(w, ptr, gamma_data, beta_data, eps, affine, vl);
             }
         }
         if (dims == 3)
@@ -334,7 +253,6 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
 
             if (affine_size == w)
             {
-                const word_type vl = vsetvl_e32m1(packn);
                 #pragma omp parallel for num_threads(opt.num_threads)
                 for (int q = 0; q < channels; q++)
                 {
@@ -342,61 +260,17 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
                     {
                         float* ptr = bottom_top_blob.channel(q).row(i);
 
-                        layernorm_rvv_packn_procedure(w, ptr, gamma_data, beta_data, eps, affine_size, affine, vl);
+                        layernorm_rvv_packn_procedure(w, ptr, gamma_data, beta_data, eps, affine, vl);
                     }
                 }
             }
             else // if (affine_size == size)
             {
-                const word_type vl = vsetvl_e32m1(packn);
                 #pragma omp parallel for num_threads(opt.num_threads)
                 for (int q = 0; q < channels; q++)
                 {
                     float* ptr = bottom_top_blob.channel(q);
-                    // mean and var
-                    vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vsetvlmax_e32m1());
-                    vfloat32m1_t _sqsum = vfmv_v_f_f32m1(0.f, vsetvlmax_e32m1());
-                    for (int i = 0; i < size; i++)
-                    {
-                        vfloat32m1_t _p = vle32_v_f32m1(ptr + vl * i, vl);
-                        _sum = vfadd_vv_f32m1(_p, _sum, vl);
-                        // _sqsum = vfmadd_vv_f32m1(_p,_p,_sqsum,vl);
-                    }
-                    vfloat32m1_t _mean = vfdiv_vf_f32m1(_sum, size, vl);
-                    for (int i = 0; i < size; i++)
-                    {
-                        vfloat32m1_t _p = vle32_v_f32m1(ptr + vl * i, vl);
-                        _p = vfsub_vv_f32m1(_p, _mean, vl);
-                        _sqsum = vfmadd_vv_f32m1(_p, _p, _sqsum, vl);
-                    }
-                    vfloat32m1_t _var = vfdiv_vf_f32m1(_sqsum, size, vl);
-
-                    // the var maybe minus due to accuracy
-                    //float var = sqsum / w - mean * mean;
-                    vfloat32m1_t _a = vfrdiv_vf_f32m1(vfsqrt_v_f32m1(vfadd_vf_f32m1(_var, eps, vl), vl), 1.f, vl);
-                    vfloat32m1_t _b = vfmul_vv_f32m1(vfsgnjn_vv_f32m1(_mean, _mean, vl), _a, vl);
-                    if (affine)
-                    {
-                        for (int i = 0; i < size; i++)
-                        {
-                            const int offset = vl * i;
-                            vfloat32m1_t _p = vle32_v_f32m1(ptr + offset, vl);
-                            _p = vfmadd_vv_f32m1(_p, _a, _b, vl);
-                            _p = vfmul_vf_f32m1(_p, gamma_data[i], vl);
-                            _p = vfadd_vf_f32m1(_p, beta_data[i], vl);
-                            vse32_v_f32m1(ptr + offset, _p, vl);
-                        }
-                    }
-                    else
-                    {
-                        for (int i = 0; i < size; i++)
-                        {
-                            const int offset = vl * i;
-                            vfloat32m1_t _p = vle32_v_f32m1(ptr + offset, vl);
-                            _p = vfmadd_vv_f32m1(_p, _a, _b, vl);
-                            vse32_v_f32m1(ptr + offset, _p, vl);
-                        }
-                    }
+                    layernorm_rvv_packn_procedure(size, ptr, gamma_data, beta_data, eps, affine, vl);
                 }
             }
         }

From 03ca1578a256fd6fbd0567fb285d5141d0ea6341 Mon Sep 17 00:00:00 2001
From: Xavier Hsinyuan <me@lstlx.com>
Date: Fri, 22 Jul 2022 23:55:22 +0800
Subject: [PATCH 06/15] Refactor: RVV LayerNorm fp32 w/ elempack

* Use `vfmacc.vv` to get sqsum;
---
 src/layer/riscv/layernorm_riscv.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/layer/riscv/layernorm_riscv.cpp b/src/layer/riscv/layernorm_riscv.cpp
index 606502fb5..53f3d19f1 100644
--- a/src/layer/riscv/layernorm_riscv.cpp
+++ b/src/layer/riscv/layernorm_riscv.cpp
@@ -132,13 +132,14 @@ static inline int layernorm_rvv_packn_procedure(int w, float* ptr, const float*
     {
         vfloat32m1_t _p = vle32_v_f32m1(ptr + vl * i, vl);
         _p = vfsub_vv_f32m1(_p, _mean, vl);
-        _sqsum = vfmadd_vv_f32m1(_p, _p, _sqsum, vl);
+        _sqsum = vfmacc_vv_f32m1(_sqsum, _p, _p, vl);
     }
     vfloat32m1_t _var = vfdiv_vf_f32m1(_sqsum, w, vl);
 
     // the var maybe minus due to accuracy
     //float var = sqsum / w - mean * mean;
     vfloat32m1_t _a = vfrdiv_vf_f32m1(vfsqrt_v_f32m1(vfadd_vf_f32m1(_var, eps, vl), vl), 1.f, vl);
+    // how about vfrsqrt7.v?
     vfloat32m1_t _b = vfmul_vv_f32m1(vfsgnjn_vv_f32m1(_mean, _mean, vl), _a, vl);
     if (affine)
     {

From b495fa5413694083ae720a361069635383341da9 Mon Sep 17 00:00:00 2001
From: Xavier Hsinyuan <me@lstlx.com>
Date: Sat, 23 Jul 2022 18:00:30 +0800
Subject: [PATCH 07/15] Refactor: RVV LayerNorm fp32

---
 src/layer/riscv/layernorm_riscv.cpp | 30 ++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/layer/riscv/layernorm_riscv.cpp b/src/layer/riscv/layernorm_riscv.cpp
index 53f3d19f1..c613f8679 100644
--- a/src/layer/riscv/layernorm_riscv.cpp
+++ b/src/layer/riscv/layernorm_riscv.cpp
@@ -33,14 +33,14 @@ LayerNorm_riscv::LayerNorm_riscv()
 }
 
 #if __riscv_vector
-static inline int layernorm_rvv_pack1_procedure(int w, float* ptr, const float* gamma_data, const float* beta_data, float eps, int affine)
+static inline int layernorm_rvv_pack1_procedure(int size, float* ptr, const float* gamma_data, const float* beta_data, float eps, int affine)
 {
     float sum = 0.f;
     float sqsum = 0.f;
     vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
     vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
     {
-        int n = w;
+        int n = size;
         float* ptr_sum = ptr;
         while (n > 0)
         {
@@ -53,10 +53,10 @@ static inline int layernorm_rvv_pack1_procedure(int w, float* ptr, const float*
         }
     }
     sum = vfmv_f_s_f32m1_f32(_sum);
-    float mean = sum / w;
+    float mean = sum / size;
 
     {
-        int n = w;
+        int n = size;
         float* ptr_sqsum = ptr;
         while (n > 0)
         {
@@ -69,14 +69,14 @@ static inline int layernorm_rvv_pack1_procedure(int w, float* ptr, const float*
         }
     }
     sqsum = vfmv_f_s_f32m1_f32(_sqsum);
-    float var = sqsum / w;
+    float var = sqsum / size;
     // the var maybe minus due to accuracy
-    //float var = sqsum / w - mean * mean;
+    //float var = sqsum / size - mean * mean;
     float a = static_cast<float>(1.f / (sqrt(var + eps)));
     float b = -mean * a;
 
     {
-        int n = w;
+        int n = size;
         float* ptr_store = ptr;
         const float* ptr_gamma = gamma_data;
         const float* ptr_beta = beta_data;
@@ -116,34 +116,34 @@ static inline int layernorm_rvv_pack1_procedure(int w, float* ptr, const float*
     return 0;
 }
 
-static inline int layernorm_rvv_packn_procedure(int w, float* ptr, const float* gamma_data, const float* beta_data, float eps, int affine, const word_type vl)
+static inline int layernorm_rvv_packn_procedure(int size, float* ptr, const float* gamma_data, const float* beta_data, float eps, int affine, const word_type vl)
 {
     // mean and var
     vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl);
     vfloat32m1_t _sqsum = vfmv_v_f_f32m1(0.f, vl);
-    for (int i = 0; i < w; i++)
+    for (int i = 0; i < size; i++)
     {
         vfloat32m1_t _p = vle32_v_f32m1(ptr + vl * i, vl);
         _sum = vfadd_vv_f32m1(_p, _sum, vl);
         // _sqsum = vfmadd_vv_f32m1(_p,_p,_sqsum,vl);
     }
-    vfloat32m1_t _mean = vfdiv_vf_f32m1(_sum, w, vl);
-    for (int i = 0; i < w; i++)
+    vfloat32m1_t _mean = vfdiv_vf_f32m1(_sum, size, vl);
+    for (int i = 0; i < size; i++)
     {
         vfloat32m1_t _p = vle32_v_f32m1(ptr + vl * i, vl);
         _p = vfsub_vv_f32m1(_p, _mean, vl);
         _sqsum = vfmacc_vv_f32m1(_sqsum, _p, _p, vl);
     }
-    vfloat32m1_t _var = vfdiv_vf_f32m1(_sqsum, w, vl);
+    vfloat32m1_t _var = vfdiv_vf_f32m1(_sqsum, size, vl);
 
     // the var maybe minus due to accuracy
-    //float var = sqsum / w - mean * mean;
+    //float var = sqsum / size - mean * mean;
     vfloat32m1_t _a = vfrdiv_vf_f32m1(vfsqrt_v_f32m1(vfadd_vf_f32m1(_var, eps, vl), vl), 1.f, vl);
     // how about vfrsqrt7.v?
     vfloat32m1_t _b = vfmul_vv_f32m1(vfsgnjn_vv_f32m1(_mean, _mean, vl), _a, vl);
     if (affine)
     {
-        for (int i = 0; i < w; i++)
+        for (int i = 0; i < size; i++)
         {
             const int offset = vl * i;
             vfloat32m1_t _p = vle32_v_f32m1(ptr + offset, vl);
@@ -155,7 +155,7 @@ static inline int layernorm_rvv_packn_procedure(int w, float* ptr, const float*
     }
     else
     {
-        for (int i = 0; i < w; i++)
+        for (int i = 0; i < size; i++)
         {
             const int offset = vl * i;
             vfloat32m1_t _p = vle32_v_f32m1(ptr + offset, vl);

From 5edb46b6db4e36ac42e5b3d88515be21a3ebb80a Mon Sep 17 00:00:00 2001
From: Xavier Hsinyuan <me@lstlx.com>
Date: Sat, 23 Jul 2022 18:48:22 +0800
Subject: [PATCH 08/15] RVV: layernorm fp16 storage w/ elempack support

---
 src/layer/riscv/layernorm_riscv.cpp | 263 ++++++++++++++++++++++++++++
 src/layer/riscv/layernorm_riscv.h   |   5 +
 2 files changed, 268 insertions(+)

diff --git a/src/layer/riscv/layernorm_riscv.cpp b/src/layer/riscv/layernorm_riscv.cpp
index c613f8679..aaaed4328 100644
--- a/src/layer/riscv/layernorm_riscv.cpp
+++ b/src/layer/riscv/layernorm_riscv.cpp
@@ -29,7 +29,10 @@ LayerNorm_riscv::LayerNorm_riscv()
 {
 #if __riscv_vector
     support_packing = true;
+#if __riscv_zfh
+    support_fp16_storage = true;
 #endif
+#endif // __riscv_vector
 }
 
 #if __riscv_vector
@@ -172,6 +175,14 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
 {
 // x = (x - mean) / sqrt(var + eps) * gamma + beta
 #if __riscv_vector
+    int elembits = bottom_top_blob.elembits();
+#if __riscv_zfh
+    if (opt.use_fp16_storage && elembits == 16)
+    {
+        return forward_inplace_fp16s(bottom_top_blob, opt);
+    }
+#endif // __riscv_zfh
+
     int elempack = bottom_top_blob.elempack;
     const int packn = csrr_vlenb() / 4;
     int dims = bottom_top_blob.dims;
@@ -283,4 +294,256 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
     return 0;
 }
 
+#if __riscv_vector && __riscv_zfh
+
+static inline int layernorm_rvv_pack1_fp16s_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine)
+{
+    float sum = 0.f;
+    float sqsum = 0.f;
+    vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
+    vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
+    {
+        int n = size;
+        __fp16* ptr_sum = ptr;
+        while (n > 0)
+        {
+            word_type vl = vsetvl_e16m4(n);
+            vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_sum, vl), vl);
+            _sum = vfredusum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl);
+            // _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
+            ptr_sum += vl;
+            n -= vl;
+        }
+    }
+    sum = vfmv_f_s_f32m1_f32(_sum);
+    float mean = sum / size;
+
+    {
+        int n = size;
+        __fp16* ptr_sqsum = ptr;
+        while (n > 0)
+        {
+            word_type vl = vsetvl_e16m4(n);
+            vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_sqsum, vl), vl);
+            _p = vfsub_vf_f32m8(_p, mean, vl);
+            _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
+            n -= vl;
+            ptr_sqsum += vl;
+        }
+    }
+    sqsum = vfmv_f_s_f32m1_f32(_sqsum);
+    float var = sqsum / size;
+    // the var maybe minus due to accuracy
+    //float var = sqsum / size - mean * mean;
+    float a = static_cast<float>(1.f / (sqrt(var + eps)));
+    float b = -mean * a;
+
+    {
+        int n = size;
+        __fp16* ptr_store = ptr;
+        const float* ptr_gamma = gamma_data;
+        const float* ptr_beta = beta_data;
+        if (affine)
+        {
+            while (n > 0)
+            {
+                word_type vl = vsetvl_e16m4(n);
+                vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_store, vl), vl);
+                _p = vfmul_vf_f32m8(_p, a, vl);
+                vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl);
+                _p = vfadd_vf_f32m8(_p, b, vl);
+                vfloat32m8_t _beta = vle32_v_f32m8(ptr_beta, vl);
+                _p = vfmadd_vv_f32m8(_p, _gamma, _beta, vl);
+                vse16_v_f16m4(ptr_store, vfncvt_f_f_w_f16m4(_p, vl), vl);
+
+                n -= vl;
+                ptr_store += vl;
+                ptr_gamma += vl;
+                ptr_beta += vl;
+            }
+        }
+        else
+        {
+            while (n > 0)
+            {
+                word_type vl = vsetvl_e16m4(n);
+                vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_store, vl), vl);
+                _p = vfmul_vf_f32m8(_p, a, vl);
+                _p = vfadd_vf_f32m8(_p, b, vl);
+                vse16_v_f16m4(ptr_store, vfncvt_f_f_w_f16m4(_p, vl), vl);
+                n -= vl;
+                ptr_store += vl;
+            }
+        }
+    }
+    return 0;
+}
+
+static inline int layernorm_rvv_packn_fp16s_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine, const word_type vl)
+{
+    // mean and var
+    // f16m1 => f32m2
+    vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl);
+    vfloat32m2_t _sqsum = vfmv_v_f_f32m2(0.f, vl);
+    for (int i = 0; i < size; i++)
+    {
+        vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + vl * i, vl), vl);
+        _sum = vfadd_vv_f32m2(_p, _sum, vl);
+    }
+    vfloat32m2_t _mean = vfdiv_vf_f32m2(_sum, (float)size, vl);
+    for (int i = 0; i < size; i++)
+    {
+        vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + vl * i, vl), vl);
+        _p = vfsub_vv_f32m2(_p, _mean, vl);
+        _sqsum = vfmacc_vv_f32m2(_sqsum, _p, _p, vl);
+    }
+    vfloat32m2_t _var = vfdiv_vf_f32m2(_sqsum, (float)size, vl);
+
+    // the var maybe minus due to accuracy
+    //float var = sqsum / size - mean * mean;
+    vfloat32m2_t _a = vfrdiv_vf_f32m2(vfsqrt_v_f32m2(vfadd_vf_f32m2(_var, eps, vl), vl), 1.f, vl);
+    // how about vfrsqrt7.v?
+    vfloat32m2_t _b = vfmul_vv_f32m2(vfsgnjn_vv_f32m2(_mean, _mean, vl), _a, vl);
+    if (affine)
+    {
+        for (int i = 0; i < size; i++)
+        {
+            const int offset = vl * i;
+            vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + offset, vl), vl);
+            _p = vfmadd_vv_f32m2(_p, _a, _b, vl);
+            _p = vfmul_vf_f32m2(_p, gamma_data[i], vl);
+            _p = vfadd_vf_f32m2(_p, beta_data[i], vl);
+            vse16_v_f16m1(ptr + offset, vfncvt_f_f_w_f16m1(_p, vl), vl);
+        }
+    }
+    else
+    {
+        for (int i = 0; i < size; i++)
+        {
+            const int offset = vl * i;
+            vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + offset, vl), vl);
+            _p = vfmadd_vv_f32m2(_p, _a, _b, vl);
+            vse16_v_f16m1(ptr + offset, vfncvt_f_f_w_f16m1(_p, vl), vl);
+        }
+    }
+
+    return 0;
+}
+
+int LayerNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
+{
+    // x = (x - mean) / sqrt(var + eps) * gamma + beta
+    int elempack = bottom_top_blob.elempack;
+    int dims = bottom_top_blob.dims;
+    int w = bottom_top_blob.w;
+
+    if (dims == 1)
+    {
+        __fp16* ptr = bottom_top_blob;
+
+        return layernorm_rvv_pack1_fp16s_procedure(w * elempack, ptr, gamma_data, beta_data, eps, affine);
+    }
+    if (elempack == 1)
+    {
+        if (dims == 2)
+        {
+            int w = bottom_top_blob.w;
+            int h = bottom_top_blob.h;
+            // assert affine_size == w
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i = 0; i < h; i++)
+            {
+                __fp16* ptr = bottom_top_blob.row<__fp16>(i);
+                layernorm_rvv_pack1_fp16s_procedure(w, ptr, gamma_data, beta_data, eps, affine);
+            }
+        }
+
+        if (dims == 3)
+        {
+            int w = bottom_top_blob.w;
+            int h = bottom_top_blob.h;
+            int channels = bottom_top_blob.c;
+            int size = w * h;
+            if (affine_size == w)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    for (int i = 0; i < h; i++)
+                    {
+                        __fp16* ptr = bottom_top_blob.channel(q).row<__fp16>(i);
+
+                        layernorm_rvv_pack1_fp16s_procedure(w, ptr, gamma_data, beta_data, eps, affine);
+                    }
+                }
+            }
+            else // if (affine_size == size)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    __fp16* ptr = bottom_top_blob.channel(q);
+                    layernorm_rvv_pack1_fp16s_procedure(size, ptr, gamma_data, beta_data, eps, affine);
+                }
+            }
+        }
+
+        return 0;
+    }
+
+    const int packn = csrr_vlenb() / 2; // fp16
+    if (elempack == packn)
+    {
+        const word_type vl = vsetvl_e16m1(packn);
+        if (dims == 2)
+        {
+            int w = bottom_top_blob.w;
+            int h = bottom_top_blob.h;
+            // assert affine_size == w
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i = 0; i < h; i++)
+            {
+                __fp16* ptr = bottom_top_blob.row<__fp16>(i);
+                layernorm_rvv_packn_fp16s_procedure(w, ptr, gamma_data, beta_data, eps, affine, vl);
+            }
+        }
+        if (dims == 3)
+        {
+            int w = bottom_top_blob.w;
+            int h = bottom_top_blob.h;
+            int channels = bottom_top_blob.c;
+            int size = w * h;
+
+            if (affine_size == w)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    for (int i = 0; i < h; i++)
+                    {
+                        __fp16* ptr = bottom_top_blob.channel(q).row<__fp16>(i);
+
+                        layernorm_rvv_packn_fp16s_procedure(w, ptr, gamma_data, beta_data, eps, affine, vl);
+                    }
+                }
+            }
+            else // if (affine_size == size)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    __fp16* ptr = bottom_top_blob.channel(q);
+                    layernorm_rvv_packn_fp16s_procedure(size, ptr, gamma_data, beta_data, eps, affine, vl);
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+
+
+#endif
+
 } // namespace ncnn
\ No newline at end of file
diff --git a/src/layer/riscv/layernorm_riscv.h b/src/layer/riscv/layernorm_riscv.h
index e23cbe8ff..5a883683e 100644
--- a/src/layer/riscv/layernorm_riscv.h
+++ b/src/layer/riscv/layernorm_riscv.h
@@ -24,6 +24,11 @@ public:
     LayerNorm_riscv();
 
     virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+
+protected:
+#if __riscv_vector && __riscv_zfh
+    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
+#endif
 };
 
 } // namespace ncnn

From fabe56c0550697f56c81e6ad4be5fc50931332c6 Mon Sep 17 00:00:00 2001
From: Xavier Hsinyuan <me@lstlx.com>
Date: Sat, 23 Jul 2022 20:01:21 +0800
Subject: [PATCH 09/15] RVV: LayerNorm, fp16, s/sa, with elempack support

---
 src/layer/riscv/layernorm_riscv.cpp | 251 +++++++++++++++++++++++++++-
 src/layer/riscv/layernorm_riscv.h   |   1 +
 2 files changed, 251 insertions(+), 1 deletion(-)

diff --git a/src/layer/riscv/layernorm_riscv.cpp b/src/layer/riscv/layernorm_riscv.cpp
index aaaed4328..8e7350241 100644
--- a/src/layer/riscv/layernorm_riscv.cpp
+++ b/src/layer/riscv/layernorm_riscv.cpp
@@ -179,7 +179,10 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
 #if __riscv_zfh
     if (opt.use_fp16_storage && elembits == 16)
     {
-        return forward_inplace_fp16s(bottom_top_blob, opt);
+        if (opt.use_fp16_arithmetic)
+            return forward_inplace_fp16sa(bottom_top_blob, opt);
+        else
+            return forward_inplace_fp16s(bottom_top_blob, opt);
     }
 #endif // __riscv_zfh
 
@@ -543,6 +546,252 @@ int LayerNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& o
     return 0;
 }
 
+static inline int layernorm_rvv_pack1_fp16sa_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine)
+{
+    float sum = 0.f;
+    float sqsum = 0.f;
+    vfloat16m1_t _sum = vfmv_s_f_f16m1(vundefined_f16m1(), 0.f, vsetvlmax_e32m1());
+    vfloat16m1_t _sqsum = vfmv_s_f_f16m1(vundefined_f16m1(), 0.f, vsetvlmax_e32m1());
+    {
+        int n = size;
+        __fp16* ptr_sum = ptr;
+        while (n > 0)
+        {
+            word_type vl = vsetvl_e16m8(n);
+            vfloat16m8_t _p = vle16_v_f16m8(ptr_sum, vl);
+            _sum = vfredusum_vs_f16m8_f16m1(_sum, _p, /* scalar */ _sum, vl);
+            // _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
+            ptr_sum += vl;
+            n -= vl;
+        }
+    }
+    sum = vfmv_f_s_f16m1_f16(_sum);
+    float mean = sum / size;
+
+    {
+        int n = size;
+        __fp16* ptr_sqsum = ptr;
+        while (n > 0)
+        {
+            word_type vl = vsetvl_e16m8(n);
+            vfloat16m8_t _p = vle16_v_f16m8(ptr_sqsum, vl);
+            _p = vfsub_vf_f16m8(_p, mean, vl);
+            _sqsum = vfredusum_vs_f16m8_f16m1(_sqsum, vfmul_vv_f16m8(_p, _p, vl), /* scalar */ _sqsum, vl);
+            n -= vl;
+            ptr_sqsum += vl;
+        }
+    }
+    sqsum = vfmv_f_s_f16m1_f16(_sqsum);
+    float var = sqsum / size;
+    // the var maybe minus due to accuracy
+    //float var = sqsum / size - mean * mean;
+    float a = static_cast<float>(1.f / (sqrt(var + eps)));
+    float b = -mean * a;
+
+    {
+        int n = size;
+        __fp16* ptr_store = ptr;
+        const float* ptr_gamma = gamma_data;
+        const float* ptr_beta = beta_data;
+        if (affine)
+        {
+            while (n > 0)
+            {
+                word_type vl = vsetvl_e16m4(n);
+                vfloat16m4_t _p = vle16_v_f16m4(ptr_store, vl);
+                _p = vfmul_vf_f16m4(_p, a, vl);
+                vfloat16m4_t _gamma = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_gamma, vl), vl);
+                _p = vfadd_vf_f16m4(_p, b, vl);
+                vfloat16m4_t _beta = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_beta, vl), vl);
+                _p = vfmadd_vv_f16m4(_p, _gamma, _beta, vl);
+                vse16_v_f16m4(ptr_store, _p, vl);
+
+                n -= vl;
+                ptr_store += vl;
+                ptr_gamma += vl;
+                ptr_beta += vl;
+            }
+        }
+        else
+        {
+            while (n > 0)
+            {
+                word_type vl = vsetvl_e16m4(n);
+                vfloat16m8_t _p = vle16_v_f16m8(ptr_store, vl);
+                _p = vfmul_vf_f16m8(_p, a, vl);
+                _p = vfadd_vf_f16m8(_p, b, vl);
+                vse16_v_f16m8(ptr_store, _p, vl);
+                n -= vl;
+                ptr_store += vl;
+            }
+        }
+    }
+    return 0;
+}
+
+static inline int layernorm_rvv_packn_fp16sa_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine, const word_type vl)
+{
+    // mean and var
+    vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl);
+    vfloat16m1_t _sqsum = vfmv_v_f_f16m1(0.f, vl);
+    for (int i = 0; i < size; i++)
+    {
+        vfloat16m1_t _p = vle16_v_f16m1(ptr + vl * i, vl);
+        _sum = vfadd_vv_f16m1(_p, _sum, vl);
+        // _sqsum = vfmadd_vv_f16m1(_p,_p,_sqsum,vl);
+    }
+    vfloat16m1_t _mean = vfdiv_vf_f16m1(_sum, size, vl);
+    for (int i = 0; i < size; i++)
+    {
+        vfloat16m1_t _p = vle16_v_f16m1(ptr + vl * i, vl);
+        _p = vfsub_vv_f16m1(_p, _mean, vl);
+        _sqsum = vfmacc_vv_f16m1(_sqsum, _p, _p, vl);
+    }
+    vfloat16m1_t _var = vfdiv_vf_f16m1(_sqsum, size, vl);
+
+    // the var maybe minus due to accuracy
+    //float var = sqsum / size - mean * mean;
+    vfloat16m1_t _a = vfrdiv_vf_f16m1(vfsqrt_v_f16m1(vfadd_vf_f16m1(_var, eps, vl), vl), 1.f, vl);
+    // how about vfrsqrt7.v?
+    vfloat16m1_t _b = vfmul_vv_f16m1(vfsgnjn_vv_f16m1(_mean, _mean, vl), _a, vl);
+    if (affine)
+    {
+        for (int i = 0; i < size; i++)
+        {
+            const int offset = vl * i;
+            vfloat16m1_t _p = vle16_v_f16m1(ptr + offset, vl);
+            _p = vfmadd_vv_f16m1(_p, _a, _b, vl);
+            _p = vfmul_vf_f16m1(_p, gamma_data[i], vl);
+            _p = vfadd_vf_f16m1(_p, beta_data[i], vl);
+            vse16_v_f16m1(ptr + offset, _p, vl);
+        }
+    }
+    else
+    {
+        for (int i = 0; i < size; i++)
+        {
+            const int offset = vl * i;
+            vfloat16m1_t _p = vle16_v_f16m1(ptr + offset, vl);
+            _p = vfmadd_vv_f16m1(_p, _a, _b, vl);
+            vse16_v_f16m1(ptr + offset, _p, vl);
+        }
+    }
+
+    return 0;
+}
+
+int LayerNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const
+{
+    // x = (x - mean) / sqrt(var + eps) * gamma + beta
+    int elempack = bottom_top_blob.elempack;
+    int dims = bottom_top_blob.dims;
+    int w = bottom_top_blob.w;
+
+    if (dims == 1)
+    {
+        __fp16* ptr = bottom_top_blob;
+
+        return layernorm_rvv_pack1_fp16sa_procedure(w * elempack, ptr, gamma_data, beta_data, eps, affine);
+    }
+    if (elempack == 1)
+    {
+        if (dims == 2)
+        {
+            int w = bottom_top_blob.w;
+            int h = bottom_top_blob.h;
+            // assert affine_size == w
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i = 0; i < h; i++)
+            {
+                __fp16* ptr = bottom_top_blob.row<__fp16>(i);
+                layernorm_rvv_pack1_fp16sa_procedure(w, ptr, gamma_data, beta_data, eps, affine);
+            }
+        }
+
+        if (dims == 3)
+        {
+            int w = bottom_top_blob.w;
+            int h = bottom_top_blob.h;
+            int channels = bottom_top_blob.c;
+            int size = w * h;
+            if (affine_size == w)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    for (int i = 0; i < h; i++)
+                    {
+                        __fp16* ptr = bottom_top_blob.channel(q).row<__fp16>(i);
+
+                        layernorm_rvv_pack1_fp16sa_procedure(w, ptr, gamma_data, beta_data, eps, affine);
+                    }
+                }
+            }
+            else // if (affine_size == size)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    __fp16* ptr = bottom_top_blob.channel(q);
+                    layernorm_rvv_pack1_fp16sa_procedure(size, ptr, gamma_data, beta_data, eps, affine);
+                }
+            }
+        }
+
+        return 0;
+    }
+
+    const int packn = csrr_vlenb() / 2; // fp16
+    if (elempack == packn)
+    {
+        const word_type vl = vsetvl_e16m1(packn);
+        if (dims == 2)
+        {
+            int w = bottom_top_blob.w;
+            int h = bottom_top_blob.h;
+            // assert affine_size == w
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i = 0; i < h; i++)
+            {
+                __fp16* ptr = bottom_top_blob.row<__fp16>(i);
+                layernorm_rvv_packn_fp16sa_procedure(w, ptr, gamma_data, beta_data, eps, affine, vl);
+            }
+        }
+        if (dims == 3)
+        {
+            int w = bottom_top_blob.w;
+            int h = bottom_top_blob.h;
+            int channels = bottom_top_blob.c;
+            int size = w * h;
+
+            if (affine_size == w)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    for (int i = 0; i < h; i++)
+                    {
+                        __fp16* ptr = bottom_top_blob.channel(q).row<__fp16>(i);
+
+                        layernorm_rvv_packn_fp16sa_procedure(w, ptr, gamma_data, beta_data, eps, affine, vl);
+                    }
+                }
+            }
+            else // if (affine_size == size)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    __fp16* ptr = bottom_top_blob.channel(q);
+                    layernorm_rvv_packn_fp16sa_procedure(size, ptr, gamma_data, beta_data, eps, affine, vl);
+                }
+            }
+        }
+    }
+
+    return 0;
+}
 
 #endif
 
diff --git a/src/layer/riscv/layernorm_riscv.h b/src/layer/riscv/layernorm_riscv.h
index 5a883683e..6d2a1748f 100644
--- a/src/layer/riscv/layernorm_riscv.h
+++ b/src/layer/riscv/layernorm_riscv.h
@@ -28,6 +28,7 @@ public:
 protected:
 #if __riscv_vector && __riscv_zfh
     int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
+    int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
 #endif
 };
 

From 9b2da3649292a6550cce47113eb9a58baa60dc78 Mon Sep 17 00:00:00 2001
From: Xavier Hsinyuan <me@lstlx.com>
Date: Sat, 23 Jul 2022 21:27:28 +0800
Subject: [PATCH 10/15] Refactor: RVV LayerNorm, fp16

* move fp16 procedure into 'layernorm_rvv_fp16.h'
---
 src/layer/riscv/layernorm_riscv.cpp  | 275 +-------------------------
 src/layer/riscv/layernorm_rvv_fp16.h | 284 +++++++++++++++++++++++++++
 2 files changed, 289 insertions(+), 270 deletions(-)
 create mode 100644 src/layer/riscv/layernorm_rvv_fp16.h

diff --git a/src/layer/riscv/layernorm_riscv.cpp b/src/layer/riscv/layernorm_riscv.cpp
index 8e7350241..dd56e0a9f 100644
--- a/src/layer/riscv/layernorm_riscv.cpp
+++ b/src/layer/riscv/layernorm_riscv.cpp
@@ -25,6 +25,11 @@
 #include "riscv_usability.h"
 
 namespace ncnn {
+
+#if __riscv_vector && __riscv_zfh
+#include "layernorm_rvv_fp16.h"
+#endif
+
 LayerNorm_riscv::LayerNorm_riscv()
 {
 #if __riscv_vector
@@ -298,141 +303,6 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
 }
 
 #if __riscv_vector && __riscv_zfh
-
-static inline int layernorm_rvv_pack1_fp16s_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine)
-{
-    float sum = 0.f;
-    float sqsum = 0.f;
-    vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
-    vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
-    {
-        int n = size;
-        __fp16* ptr_sum = ptr;
-        while (n > 0)
-        {
-            word_type vl = vsetvl_e16m4(n);
-            vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_sum, vl), vl);
-            _sum = vfredusum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl);
-            // _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
-            ptr_sum += vl;
-            n -= vl;
-        }
-    }
-    sum = vfmv_f_s_f32m1_f32(_sum);
-    float mean = sum / size;
-
-    {
-        int n = size;
-        __fp16* ptr_sqsum = ptr;
-        while (n > 0)
-        {
-            word_type vl = vsetvl_e16m4(n);
-            vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_sqsum, vl), vl);
-            _p = vfsub_vf_f32m8(_p, mean, vl);
-            _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
-            n -= vl;
-            ptr_sqsum += vl;
-        }
-    }
-    sqsum = vfmv_f_s_f32m1_f32(_sqsum);
-    float var = sqsum / size;
-    // the var maybe minus due to accuracy
-    //float var = sqsum / size - mean * mean;
-    float a = static_cast<float>(1.f / (sqrt(var + eps)));
-    float b = -mean * a;
-
-    {
-        int n = size;
-        __fp16* ptr_store = ptr;
-        const float* ptr_gamma = gamma_data;
-        const float* ptr_beta = beta_data;
-        if (affine)
-        {
-            while (n > 0)
-            {
-                word_type vl = vsetvl_e16m4(n);
-                vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_store, vl), vl);
-                _p = vfmul_vf_f32m8(_p, a, vl);
-                vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl);
-                _p = vfadd_vf_f32m8(_p, b, vl);
-                vfloat32m8_t _beta = vle32_v_f32m8(ptr_beta, vl);
-                _p = vfmadd_vv_f32m8(_p, _gamma, _beta, vl);
-                vse16_v_f16m4(ptr_store, vfncvt_f_f_w_f16m4(_p, vl), vl);
-
-                n -= vl;
-                ptr_store += vl;
-                ptr_gamma += vl;
-                ptr_beta += vl;
-            }
-        }
-        else
-        {
-            while (n > 0)
-            {
-                word_type vl = vsetvl_e16m4(n);
-                vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_store, vl), vl);
-                _p = vfmul_vf_f32m8(_p, a, vl);
-                _p = vfadd_vf_f32m8(_p, b, vl);
-                vse16_v_f16m4(ptr_store, vfncvt_f_f_w_f16m4(_p, vl), vl);
-                n -= vl;
-                ptr_store += vl;
-            }
-        }
-    }
-    return 0;
-}
-
-static inline int layernorm_rvv_packn_fp16s_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine, const word_type vl)
-{
-    // mean and var
-    // f16m1 => f32m2
-    vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl);
-    vfloat32m2_t _sqsum = vfmv_v_f_f32m2(0.f, vl);
-    for (int i = 0; i < size; i++)
-    {
-        vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + vl * i, vl), vl);
-        _sum = vfadd_vv_f32m2(_p, _sum, vl);
-    }
-    vfloat32m2_t _mean = vfdiv_vf_f32m2(_sum, (float)size, vl);
-    for (int i = 0; i < size; i++)
-    {
-        vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + vl * i, vl), vl);
-        _p = vfsub_vv_f32m2(_p, _mean, vl);
-        _sqsum = vfmacc_vv_f32m2(_sqsum, _p, _p, vl);
-    }
-    vfloat32m2_t _var = vfdiv_vf_f32m2(_sqsum, (float)size, vl);
-
-    // the var maybe minus due to accuracy
-    //float var = sqsum / size - mean * mean;
-    vfloat32m2_t _a = vfrdiv_vf_f32m2(vfsqrt_v_f32m2(vfadd_vf_f32m2(_var, eps, vl), vl), 1.f, vl);
-    // how about vfrsqrt7.v?
-    vfloat32m2_t _b = vfmul_vv_f32m2(vfsgnjn_vv_f32m2(_mean, _mean, vl), _a, vl);
-    if (affine)
-    {
-        for (int i = 0; i < size; i++)
-        {
-            const int offset = vl * i;
-            vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + offset, vl), vl);
-            _p = vfmadd_vv_f32m2(_p, _a, _b, vl);
-            _p = vfmul_vf_f32m2(_p, gamma_data[i], vl);
-            _p = vfadd_vf_f32m2(_p, beta_data[i], vl);
-            vse16_v_f16m1(ptr + offset, vfncvt_f_f_w_f16m1(_p, vl), vl);
-        }
-    }
-    else
-    {
-        for (int i = 0; i < size; i++)
-        {
-            const int offset = vl * i;
-            vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + offset, vl), vl);
-            _p = vfmadd_vv_f32m2(_p, _a, _b, vl);
-            vse16_v_f16m1(ptr + offset, vfncvt_f_f_w_f16m1(_p, vl), vl);
-        }
-    }
-
-    return 0;
-}
-
 int LayerNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
 {
     // x = (x - mean) / sqrt(var + eps) * gamma + beta
@@ -546,140 +416,6 @@ int LayerNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& o
     return 0;
 }
 
-static inline int layernorm_rvv_pack1_fp16sa_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine)
-{
-    float sum = 0.f;
-    float sqsum = 0.f;
-    vfloat16m1_t _sum = vfmv_s_f_f16m1(vundefined_f16m1(), 0.f, vsetvlmax_e32m1());
-    vfloat16m1_t _sqsum = vfmv_s_f_f16m1(vundefined_f16m1(), 0.f, vsetvlmax_e32m1());
-    {
-        int n = size;
-        __fp16* ptr_sum = ptr;
-        while (n > 0)
-        {
-            word_type vl = vsetvl_e16m8(n);
-            vfloat16m8_t _p = vle16_v_f16m8(ptr_sum, vl);
-            _sum = vfredusum_vs_f16m8_f16m1(_sum, _p, /* scalar */ _sum, vl);
-            // _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
-            ptr_sum += vl;
-            n -= vl;
-        }
-    }
-    sum = vfmv_f_s_f16m1_f16(_sum);
-    float mean = sum / size;
-
-    {
-        int n = size;
-        __fp16* ptr_sqsum = ptr;
-        while (n > 0)
-        {
-            word_type vl = vsetvl_e16m8(n);
-            vfloat16m8_t _p = vle16_v_f16m8(ptr_sqsum, vl);
-            _p = vfsub_vf_f16m8(_p, mean, vl);
-            _sqsum = vfredusum_vs_f16m8_f16m1(_sqsum, vfmul_vv_f16m8(_p, _p, vl), /* scalar */ _sqsum, vl);
-            n -= vl;
-            ptr_sqsum += vl;
-        }
-    }
-    sqsum = vfmv_f_s_f16m1_f16(_sqsum);
-    float var = sqsum / size;
-    // the var maybe minus due to accuracy
-    //float var = sqsum / size - mean * mean;
-    float a = static_cast<float>(1.f / (sqrt(var + eps)));
-    float b = -mean * a;
-
-    {
-        int n = size;
-        __fp16* ptr_store = ptr;
-        const float* ptr_gamma = gamma_data;
-        const float* ptr_beta = beta_data;
-        if (affine)
-        {
-            while (n > 0)
-            {
-                word_type vl = vsetvl_e16m4(n);
-                vfloat16m4_t _p = vle16_v_f16m4(ptr_store, vl);
-                _p = vfmul_vf_f16m4(_p, a, vl);
-                vfloat16m4_t _gamma = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_gamma, vl), vl);
-                _p = vfadd_vf_f16m4(_p, b, vl);
-                vfloat16m4_t _beta = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_beta, vl), vl);
-                _p = vfmadd_vv_f16m4(_p, _gamma, _beta, vl);
-                vse16_v_f16m4(ptr_store, _p, vl);
-
-                n -= vl;
-                ptr_store += vl;
-                ptr_gamma += vl;
-                ptr_beta += vl;
-            }
-        }
-        else
-        {
-            while (n > 0)
-            {
-                word_type vl = vsetvl_e16m4(n);
-                vfloat16m8_t _p = vle16_v_f16m8(ptr_store, vl);
-                _p = vfmul_vf_f16m8(_p, a, vl);
-                _p = vfadd_vf_f16m8(_p, b, vl);
-                vse16_v_f16m8(ptr_store, _p, vl);
-                n -= vl;
-                ptr_store += vl;
-            }
-        }
-    }
-    return 0;
-}
-
-static inline int layernorm_rvv_packn_fp16sa_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine, const word_type vl)
-{
-    // mean and var
-    vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl);
-    vfloat16m1_t _sqsum = vfmv_v_f_f16m1(0.f, vl);
-    for (int i = 0; i < size; i++)
-    {
-        vfloat16m1_t _p = vle16_v_f16m1(ptr + vl * i, vl);
-        _sum = vfadd_vv_f16m1(_p, _sum, vl);
-        // _sqsum = vfmadd_vv_f16m1(_p,_p,_sqsum,vl);
-    }
-    vfloat16m1_t _mean = vfdiv_vf_f16m1(_sum, size, vl);
-    for (int i = 0; i < size; i++)
-    {
-        vfloat16m1_t _p = vle16_v_f16m1(ptr + vl * i, vl);
-        _p = vfsub_vv_f16m1(_p, _mean, vl);
-        _sqsum = vfmacc_vv_f16m1(_sqsum, _p, _p, vl);
-    }
-    vfloat16m1_t _var = vfdiv_vf_f16m1(_sqsum, size, vl);
-
-    // the var maybe minus due to accuracy
-    //float var = sqsum / size - mean * mean;
-    vfloat16m1_t _a = vfrdiv_vf_f16m1(vfsqrt_v_f16m1(vfadd_vf_f16m1(_var, eps, vl), vl), 1.f, vl);
-    // how about vfrsqrt7.v?
-    vfloat16m1_t _b = vfmul_vv_f16m1(vfsgnjn_vv_f16m1(_mean, _mean, vl), _a, vl);
-    if (affine)
-    {
-        for (int i = 0; i < size; i++)
-        {
-            const int offset = vl * i;
-            vfloat16m1_t _p = vle16_v_f16m1(ptr + offset, vl);
-            _p = vfmadd_vv_f16m1(_p, _a, _b, vl);
-            _p = vfmul_vf_f16m1(_p, gamma_data[i], vl);
-            _p = vfadd_vf_f16m1(_p, beta_data[i], vl);
-            vse16_v_f16m1(ptr + offset, _p, vl);
-        }
-    }
-    else
-    {
-        for (int i = 0; i < size; i++)
-        {
-            const int offset = vl * i;
-            vfloat16m1_t _p = vle16_v_f16m1(ptr + offset, vl);
-            _p = vfmadd_vv_f16m1(_p, _a, _b, vl);
-            vse16_v_f16m1(ptr + offset, _p, vl);
-        }
-    }
-
-    return 0;
-}
-
 int LayerNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const
 {
     // x = (x - mean) / sqrt(var + eps) * gamma + beta
@@ -792,7 +528,6 @@ int LayerNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option&
 
     return 0;
 }
-
 #endif
 
 } // namespace ncnn
\ No newline at end of file
diff --git a/src/layer/riscv/layernorm_rvv_fp16.h b/src/layer/riscv/layernorm_rvv_fp16.h
new file mode 100644
index 000000000..313b51cfe
--- /dev/null
+++ b/src/layer/riscv/layernorm_rvv_fp16.h
@@ -0,0 +1,284 @@
+// Xavier Hsinyuan is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 Xavier Hsinyuan <me@lstlx.com>. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+// fp16s
+static inline int layernorm_rvv_pack1_fp16s_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine)
+{
+    float sum = 0.f;
+    float sqsum = 0.f;
+    vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
+    vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
+    {
+        int n = size;
+        __fp16* ptr_sum = ptr;
+        while (n > 0)
+        {
+            word_type vl = vsetvl_e16m4(n);
+            vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_sum, vl), vl);
+            _sum = vfredusum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl);
+            // _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
+            ptr_sum += vl;
+            n -= vl;
+        }
+    }
+    sum = vfmv_f_s_f32m1_f32(_sum);
+    float mean = sum / size;
+
+    {
+        int n = size;
+        __fp16* ptr_sqsum = ptr;
+        while (n > 0)
+        {
+            word_type vl = vsetvl_e16m4(n);
+            vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_sqsum, vl), vl);
+            _p = vfsub_vf_f32m8(_p, mean, vl);
+            _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
+            n -= vl;
+            ptr_sqsum += vl;
+        }
+    }
+    sqsum = vfmv_f_s_f32m1_f32(_sqsum);
+    float var = sqsum / size;
+    // the var maybe minus due to accuracy
+    //float var = sqsum / size - mean * mean;
+    float a = static_cast<float>(1.f / (sqrt(var + eps)));
+    float b = -mean * a;
+
+    {
+        int n = size;
+        __fp16* ptr_store = ptr;
+        const float* ptr_gamma = gamma_data;
+        const float* ptr_beta = beta_data;
+        if (affine)
+        {
+            while (n > 0)
+            {
+                word_type vl = vsetvl_e16m4(n);
+                vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_store, vl), vl);
+                _p = vfmul_vf_f32m8(_p, a, vl);
+                vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl);
+                _p = vfadd_vf_f32m8(_p, b, vl);
+                vfloat32m8_t _beta = vle32_v_f32m8(ptr_beta, vl);
+                _p = vfmadd_vv_f32m8(_p, _gamma, _beta, vl);
+                vse16_v_f16m4(ptr_store, vfncvt_f_f_w_f16m4(_p, vl), vl);
+
+                n -= vl;
+                ptr_store += vl;
+                ptr_gamma += vl;
+                ptr_beta += vl;
+            }
+        }
+        else
+        {
+            while (n > 0)
+            {
+                word_type vl = vsetvl_e16m4(n);
+                vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_store, vl), vl);
+                _p = vfmul_vf_f32m8(_p, a, vl);
+                _p = vfadd_vf_f32m8(_p, b, vl);
+                vse16_v_f16m4(ptr_store, vfncvt_f_f_w_f16m4(_p, vl), vl);
+                n -= vl;
+                ptr_store += vl;
+            }
+        }
+    }
+    return 0;
+}
+
+static inline int layernorm_rvv_packn_fp16s_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine, const word_type vl)
+{
+    // mean and var
+    // f16m1 => f32m2
+    vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl);
+    vfloat32m2_t _sqsum = vfmv_v_f_f32m2(0.f, vl);
+    for (int i = 0; i < size; i++)
+    {
+        vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + vl * i, vl), vl);
+        _sum = vfadd_vv_f32m2(_p, _sum, vl);
+    }
+    vfloat32m2_t _mean = vfdiv_vf_f32m2(_sum, (float)size, vl);
+    for (int i = 0; i < size; i++)
+    {
+        vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + vl * i, vl), vl);
+        _p = vfsub_vv_f32m2(_p, _mean, vl);
+        _sqsum = vfmacc_vv_f32m2(_sqsum, _p, _p, vl);
+    }
+    vfloat32m2_t _var = vfdiv_vf_f32m2(_sqsum, (float)size, vl);
+
+    // the var maybe minus due to accuracy
+    //float var = sqsum / size - mean * mean;
+    vfloat32m2_t _a = vfrdiv_vf_f32m2(vfsqrt_v_f32m2(vfadd_vf_f32m2(_var, eps, vl), vl), 1.f, vl);
+    // how about vfrsqrt7.v?
+    vfloat32m2_t _b = vfmul_vv_f32m2(vfsgnjn_vv_f32m2(_mean, _mean, vl), _a, vl);
+    if (affine)
+    {
+        for (int i = 0; i < size; i++)
+        {
+            const int offset = vl * i;
+            vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + offset, vl), vl);
+            _p = vfmadd_vv_f32m2(_p, _a, _b, vl);
+            _p = vfmul_vf_f32m2(_p, gamma_data[i], vl);
+            _p = vfadd_vf_f32m2(_p, beta_data[i], vl);
+            vse16_v_f16m1(ptr + offset, vfncvt_f_f_w_f16m1(_p, vl), vl);
+        }
+    }
+    else
+    {
+        for (int i = 0; i < size; i++)
+        {
+            const int offset = vl * i;
+            vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + offset, vl), vl);
+            _p = vfmadd_vv_f32m2(_p, _a, _b, vl);
+            vse16_v_f16m1(ptr + offset, vfncvt_f_f_w_f16m1(_p, vl), vl);
+        }
+    }
+
+    return 0;
+}
+
+// fp16sa
+
+static inline int layernorm_rvv_pack1_fp16sa_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine)
+{
+    float sum = 0.f;
+    float sqsum = 0.f;
+    vfloat16m1_t _sum = vfmv_s_f_f16m1(vundefined_f16m1(), 0.f, vsetvlmax_e32m1());
+    vfloat16m1_t _sqsum = vfmv_s_f_f16m1(vundefined_f16m1(), 0.f, vsetvlmax_e32m1());
+    {
+        int n = size;
+        __fp16* ptr_sum = ptr;
+        while (n > 0)
+        {
+            word_type vl = vsetvl_e16m8(n);
+            vfloat16m8_t _p = vle16_v_f16m8(ptr_sum, vl);
+            _sum = vfredusum_vs_f16m8_f16m1(_sum, _p, /* scalar */ _sum, vl);
+            // _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
+            ptr_sum += vl;
+            n -= vl;
+        }
+    }
+    sum = vfmv_f_s_f16m1_f16(_sum);
+    float mean = sum / size;
+
+    {
+        int n = size;
+        __fp16* ptr_sqsum = ptr;
+        while (n > 0)
+        {
+            word_type vl = vsetvl_e16m8(n);
+            vfloat16m8_t _p = vle16_v_f16m8(ptr_sqsum, vl);
+            _p = vfsub_vf_f16m8(_p, mean, vl);
+            _sqsum = vfredusum_vs_f16m8_f16m1(_sqsum, vfmul_vv_f16m8(_p, _p, vl), /* scalar */ _sqsum, vl);
+            n -= vl;
+            ptr_sqsum += vl;
+        }
+    }
+    sqsum = vfmv_f_s_f16m1_f16(_sqsum);
+    float var = sqsum / size;
+    // the var maybe minus due to accuracy
+    //float var = sqsum / size - mean * mean;
+    float a = static_cast<float>(1.f / (sqrt(var + eps)));
+    float b = -mean * a;
+
+    {
+        int n = size;
+        __fp16* ptr_store = ptr;
+        const float* ptr_gamma = gamma_data;
+        const float* ptr_beta = beta_data;
+        if (affine)
+        {
+            while (n > 0)
+            {
+                word_type vl = vsetvl_e16m4(n);
+                vfloat16m4_t _p = vle16_v_f16m4(ptr_store, vl);
+                _p = vfmul_vf_f16m4(_p, a, vl);
+                vfloat16m4_t _gamma = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_gamma, vl), vl);
+                _p = vfadd_vf_f16m4(_p, b, vl);
+                vfloat16m4_t _beta = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_beta, vl), vl);
+                _p = vfmadd_vv_f16m4(_p, _gamma, _beta, vl);
+                vse16_v_f16m4(ptr_store, _p, vl);
+
+                n -= vl;
+                ptr_store += vl;
+                ptr_gamma += vl;
+                ptr_beta += vl;
+            }
+        }
+        else
+        {
+            while (n > 0)
+            {
+                word_type vl = vsetvl_e16m4(n);
+                vfloat16m8_t _p = vle16_v_f16m8(ptr_store, vl);
+                _p = vfmul_vf_f16m8(_p, a, vl);
+                _p = vfadd_vf_f16m8(_p, b, vl);
+                vse16_v_f16m8(ptr_store, _p, vl);
+                n -= vl;
+                ptr_store += vl;
+            }
+        }
+    }
+    return 0;
+}
+
+static inline int layernorm_rvv_packn_fp16sa_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine, const word_type vl)
+{
+    // mean and var
+    vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl);
+    vfloat16m1_t _sqsum = vfmv_v_f_f16m1(0.f, vl);
+    for (int i = 0; i < size; i++)
+    {
+        vfloat16m1_t _p = vle16_v_f16m1(ptr + vl * i, vl);
+        _sum = vfadd_vv_f16m1(_p, _sum, vl);
+        // _sqsum = vfmadd_vv_f16m1(_p,_p,_sqsum,vl);
+    }
+    vfloat16m1_t _mean = vfdiv_vf_f16m1(_sum, size, vl);
+    for (int i = 0; i < size; i++)
+    {
+        vfloat16m1_t _p = vle16_v_f16m1(ptr + vl * i, vl);
+        _p = vfsub_vv_f16m1(_p, _mean, vl);
+        _sqsum = vfmacc_vv_f16m1(_sqsum, _p, _p, vl);
+    }
+    vfloat16m1_t _var = vfdiv_vf_f16m1(_sqsum, size, vl);
+
+    // the var maybe minus due to accuracy
+    //float var = sqsum / size - mean * mean;
+    vfloat16m1_t _a = vfrdiv_vf_f16m1(vfsqrt_v_f16m1(vfadd_vf_f16m1(_var, eps, vl), vl), 1.f, vl);
+    // how about vfrsqrt7.v?
+    vfloat16m1_t _b = vfmul_vv_f16m1(vfsgnjn_vv_f16m1(_mean, _mean, vl), _a, vl);
+    if (affine)
+    {
+        for (int i = 0; i < size; i++)
+        {
+            const int offset = vl * i;
+            vfloat16m1_t _p = vle16_v_f16m1(ptr + offset, vl);
+            _p = vfmadd_vv_f16m1(_p, _a, _b, vl);
+            _p = vfmul_vf_f16m1(_p, gamma_data[i], vl);
+            _p = vfadd_vf_f16m1(_p, beta_data[i], vl);
+            vse16_v_f16m1(ptr + offset, _p, vl);
+        }
+    }
+    else
+    {
+        for (int i = 0; i < size; i++)
+        {
+            const int offset = vl * i;
+            vfloat16m1_t _p = vle16_v_f16m1(ptr + offset, vl);
+            _p = vfmadd_vv_f16m1(_p, _a, _b, vl);
+            vse16_v_f16m1(ptr + offset, _p, vl);
+        }
+    }
+
+    return 0;
+}

From 3da3bfcb9ebe2b8c70f0c0198c0abfa9616a77bb Mon Sep 17 00:00:00 2001
From: Xavier Hsinyuan <me@lstlx.com>
Date: Sat, 23 Jul 2022 23:16:15 +0800
Subject: [PATCH 11/15] Fix: RVV LayerNorm, fp16sa, pack1

---
 src/layer/riscv/layernorm_rvv_fp16.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/layer/riscv/layernorm_rvv_fp16.h b/src/layer/riscv/layernorm_rvv_fp16.h
index 313b51cfe..f17c7308a 100644
--- a/src/layer/riscv/layernorm_rvv_fp16.h
+++ b/src/layer/riscv/layernorm_rvv_fp16.h
@@ -151,8 +151,8 @@ static inline int layernorm_rvv_packn_fp16s_procedure(int size, __fp16* ptr, con
 
 static inline int layernorm_rvv_pack1_fp16sa_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine)
 {
-    float sum = 0.f;
-    float sqsum = 0.f;
+    __fp16 sum = 0.f;
+    __fp16 sqsum = 0.f;
     vfloat16m1_t _sum = vfmv_s_f_f16m1(vundefined_f16m1(), 0.f, vsetvlmax_e32m1());
     vfloat16m1_t _sqsum = vfmv_s_f_f16m1(vundefined_f16m1(), 0.f, vsetvlmax_e32m1());
     {
@@ -169,7 +169,7 @@ static inline int layernorm_rvv_pack1_fp16sa_procedure(int size, __fp16* ptr, co
         }
     }
     sum = vfmv_f_s_f16m1_f16(_sum);
-    float mean = sum / size;
+    __fp16 mean = sum / size;
 
     {
         int n = size;
@@ -185,11 +185,11 @@ static inline int layernorm_rvv_pack1_fp16sa_procedure(int size, __fp16* ptr, co
         }
     }
     sqsum = vfmv_f_s_f16m1_f16(_sqsum);
-    float var = sqsum / size;
+    __fp16 var = sqsum / size;
     // the var maybe minus due to accuracy
     //float var = sqsum / size - mean * mean;
-    float a = static_cast<float>(1.f / (sqrt(var + eps)));
-    float b = -mean * a;
+    __fp16 a = static_cast<__fp16>(1.f / (sqrt(var + eps)));
+    __fp16 b = static_cast<__fp16>(-mean * a);
 
     {
         int n = size;
@@ -219,7 +219,7 @@ static inline int layernorm_rvv_pack1_fp16sa_procedure(int size, __fp16* ptr, co
         {
             while (n > 0)
             {
-                word_type vl = vsetvl_e16m4(n);
+                word_type vl = vsetvl_e16m8(n);
                 vfloat16m8_t _p = vle16_v_f16m8(ptr_store, vl);
                 _p = vfmul_vf_f16m8(_p, a, vl);
                 _p = vfadd_vf_f16m8(_p, b, vl);

From e3cb69ee8d58023bd3e114dcd8e114fa62296698 Mon Sep 17 00:00:00 2001
From: Xavier Hsinyuan <me@lstlx.com>
Date: Mon, 25 Jul 2022 00:15:56 +0800
Subject: [PATCH 12/15] Apply requested changes

 * Embed the simd-free scalar code implementation
---
 src/layer/riscv/layernorm_riscv.cpp | 66 ++++++++++++++++++++++++++---
 1 file changed, 61 insertions(+), 5 deletions(-)

diff --git a/src/layer/riscv/layernorm_riscv.cpp b/src/layer/riscv/layernorm_riscv.cpp
index dd56e0a9f..38db7263c 100644
--- a/src/layer/riscv/layernorm_riscv.cpp
+++ b/src/layer/riscv/layernorm_riscv.cpp
@@ -174,6 +174,47 @@ static inline int layernorm_rvv_packn_procedure(int size, float* ptr, const floa
 
     return 0;
 }
+#else
+static inline int layernorm_scalar_procedure(int size, float* ptr, const float* gamma_data, const float* beta_data, float eps, int affine)
+{
+    // mean and var
+    float sum = 0.f;
+    float sqsum = 0.f;
+    for (int i = 0; i < size; i++)
+    {
+        sum += ptr[i];
+        //sqsum += ptr[i] * ptr[i];
+    }
+    float mean = sum / size;
+    float tmp = 0.f;
+    for (int i = 0; i < size; i++)
+    {
+        tmp = ptr[i] - mean;
+        sqsum += tmp * tmp;
+    }
+    float var = sqsum / size;
+    // the var maybe minus due to accuracy
+    //float var = sqsum / size - mean * mean;
+
+    float a = static_cast<float>(1.f / (sqrt(var + eps)));
+    float b = -mean * a;
+
+    if (affine)
+    {
+        for (int i = 0; i < size; i++)
+        {
+            ptr[i] = (ptr[i] * a + b) * gamma_data[i] + beta_data[i];
+        }
+    }
+    else
+    {
+        for (int i = 0; i < size; i++)
+        {
+            ptr[i] = ptr[i] * a + b;
+        }
+    }
+    return 0;
+}
 #endif // __riscv_vector
 
 int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
@@ -193,16 +234,22 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
 
     int elempack = bottom_top_blob.elempack;
     const int packn = csrr_vlenb() / 4;
+#endif // __riscv_vector
     int dims = bottom_top_blob.dims;
     int w = bottom_top_blob.w;
 
     if (dims == 1)
     {
         float* ptr = bottom_top_blob;
-
+#if __riscv_vector
         return layernorm_rvv_pack1_procedure(w * elempack, ptr, gamma_data, beta_data, eps, affine);
+#else
+        return layernorm_scalar_procedure(w, ptr, gamma_data, beta_data, eps, affine);
+#endif // __riscv_vector
     }
+#if __riscv_vector
     if (elempack == 1)
+#endif
     {
         if (dims == 2)
         {
@@ -213,7 +260,11 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
             for (int i = 0; i < h; i++)
             {
                 float* ptr = bottom_top_blob.row(i);
+#if __riscv_vector
                 layernorm_rvv_pack1_procedure(w, ptr, gamma_data, beta_data, eps, affine);
+#else
+                layernorm_scalar_procedure(w, ptr, gamma_data, beta_data, eps, affine);
+#endif // __riscv_vector
             }
         }
 
@@ -231,8 +282,11 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
                     for (int i = 0; i < h; i++)
                     {
                         float* ptr = bottom_top_blob.channel(q).row(i);
-
+#if __riscv_vector
                         layernorm_rvv_pack1_procedure(w, ptr, gamma_data, beta_data, eps, affine);
+#else
+                        layernorm_scalar_procedure(w, ptr, gamma_data, beta_data, eps, affine);
+#endif // __riscv_vector
                     }
                 }
             }
@@ -242,12 +296,17 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
                 for (int q = 0; q < channels; q++)
                 {
                     float* ptr = bottom_top_blob.channel(q);
+#if __riscv_vector
                     layernorm_rvv_pack1_procedure(size, ptr, gamma_data, beta_data, eps, affine);
+#else
+                    layernorm_scalar_procedure(size, ptr, gamma_data, beta_data, eps, affine);
+#endif // __riscv_vector
                 }
             }
         }
     }
 
+#if __riscv_vector
     if (elempack == packn)
     {
         const word_type vl = vsetvl_e32m1(packn);
@@ -295,9 +354,6 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
             }
         }
     }
-
-#else  // __riscv_vector
-    return LayerNorm::forward_inplace(bottom_top_blob, opt);
 #endif // __riscv_vector
     return 0;
 }

From 5e50c8eff8f4007e7b2be4382e89c6e9ccef6f17 Mon Sep 17 00:00:00 2001
From: Xavier Hsinyuan <me@lstlx.com>
Date: Fri, 29 Jul 2022 12:14:28 +0800
Subject: [PATCH 13/15] RVV: drop rvv-071 support (#4094)

---
 src/layer/riscv/layernorm_riscv.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/layer/riscv/layernorm_riscv.cpp b/src/layer/riscv/layernorm_riscv.cpp
index 38db7263c..b1f3a2f80 100644
--- a/src/layer/riscv/layernorm_riscv.cpp
+++ b/src/layer/riscv/layernorm_riscv.cpp
@@ -15,11 +15,7 @@
 #include <math.h>
 
 #if __riscv_vector
-#ifdef RVV_SPEC_0_7
-#include "riscv_v_071_fix.h"
-#else
 #include <riscv_vector.h>
-#endif
 #endif // __riscv_vector
 
 #include "riscv_usability.h"

From 6948e834b6a01dbad492cd33d0a9db76ea758a18 Mon Sep 17 00:00:00 2001
From: Xavier Hsinyuan <me@lstlx.com>
Date: Sat, 1 Oct 2022 23:11:39 +0800
Subject: [PATCH 15/15] RVV: replace `word_type` to `size_t` (#4100, #4118)

---
 src/layer/riscv/layernorm_riscv.cpp  | 16 ++++++++--------
 src/layer/riscv/layernorm_rvv_fp16.h | 20 ++++++++++----------
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/src/layer/riscv/layernorm_riscv.cpp b/src/layer/riscv/layernorm_riscv.cpp
index b1f3a2f80..de942b19e 100644
--- a/src/layer/riscv/layernorm_riscv.cpp
+++ b/src/layer/riscv/layernorm_riscv.cpp
@@ -48,7 +48,7 @@ static inline int layernorm_rvv_pack1_procedure(int size, float* ptr, const floa
         float* ptr_sum = ptr;
         while (n > 0)
         {
-            word_type vl = vsetvl_e32m8(n);
+            size_t vl = vsetvl_e32m8(n);
             vfloat32m8_t _p = vle32_v_f32m8(ptr_sum, vl);
             _sum = vfredusum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl);
             // _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
@@ -64,7 +64,7 @@ static inline int layernorm_rvv_pack1_procedure(int size, float* ptr, const floa
         float* ptr_sqsum = ptr;
         while (n > 0)
         {
-            word_type vl = vsetvl_e32m8(n);
+            size_t vl = vsetvl_e32m8(n);
             vfloat32m8_t _p = vle32_v_f32m8(ptr_sqsum, vl);
             _p = vfsub_vf_f32m8(_p, mean, vl);
             _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
@@ -88,7 +88,7 @@ static inline int layernorm_rvv_pack1_procedure(int size, float* ptr, const floa
         {
             while (n > 0)
             {
-                word_type vl = vsetvl_e32m8(n);
+                size_t vl = vsetvl_e32m8(n);
                 vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl);
                 _p = vfmul_vf_f32m8(_p, a, vl);
                 vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl);
@@ -107,7 +107,7 @@ static inline int layernorm_rvv_pack1_procedure(int size, float* ptr, const floa
         {
             while (n > 0)
             {
-                word_type vl = vsetvl_e32m8(n);
+                size_t vl = vsetvl_e32m8(n);
                 vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl);
                 _p = vfmul_vf_f32m8(_p, a, vl);
                 _p = vfadd_vf_f32m8(_p, b, vl);
@@ -120,7 +120,7 @@ static inline int layernorm_rvv_pack1_procedure(int size, float* ptr, const floa
     return 0;
 }
 
-static inline int layernorm_rvv_packn_procedure(int size, float* ptr, const float* gamma_data, const float* beta_data, float eps, int affine, const word_type vl)
+static inline int layernorm_rvv_packn_procedure(int size, float* ptr, const float* gamma_data, const float* beta_data, float eps, int affine, const size_t vl)
 {
     // mean and var
     vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl);
@@ -305,7 +305,7 @@ int LayerNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
 #if __riscv_vector
     if (elempack == packn)
     {
-        const word_type vl = vsetvl_e32m1(packn);
+        const size_t vl = vsetvl_e32m1(packn);
         if (dims == 2)
         {
             int w = bottom_top_blob.w;
@@ -419,7 +419,7 @@ int LayerNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& o
     const int packn = csrr_vlenb() / 2; // fp16
     if (elempack == packn)
     {
-        const word_type vl = vsetvl_e16m1(packn);
+        const size_t vl = vsetvl_e16m1(packn);
         if (dims == 2)
         {
             int w = bottom_top_blob.w;
@@ -532,7 +532,7 @@ int LayerNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option&
     const int packn = csrr_vlenb() / 2; // fp16
     if (elempack == packn)
     {
-        const word_type vl = vsetvl_e16m1(packn);
+        const size_t vl = vsetvl_e16m1(packn);
         if (dims == 2)
         {
             int w = bottom_top_blob.w;
diff --git a/src/layer/riscv/layernorm_rvv_fp16.h b/src/layer/riscv/layernorm_rvv_fp16.h
index f17c7308a..cf22025ac 100644
--- a/src/layer/riscv/layernorm_rvv_fp16.h
+++ b/src/layer/riscv/layernorm_rvv_fp16.h
@@ -24,7 +24,7 @@ static inline int layernorm_rvv_pack1_fp16s_procedure(int size, __fp16* ptr, con
         __fp16* ptr_sum = ptr;
         while (n > 0)
         {
-            word_type vl = vsetvl_e16m4(n);
+            size_t vl = vsetvl_e16m4(n);
             vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_sum, vl), vl);
             _sum = vfredusum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl);
             // _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
@@ -40,7 +40,7 @@ static inline int layernorm_rvv_pack1_fp16s_procedure(int size, __fp16* ptr, con
         __fp16* ptr_sqsum = ptr;
         while (n > 0)
         {
-            word_type vl = vsetvl_e16m4(n);
+            size_t vl = vsetvl_e16m4(n);
             vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_sqsum, vl), vl);
             _p = vfsub_vf_f32m8(_p, mean, vl);
             _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
@@ -64,7 +64,7 @@ static inline int layernorm_rvv_pack1_fp16s_procedure(int size, __fp16* ptr, con
         {
             while (n > 0)
             {
-                word_type vl = vsetvl_e16m4(n);
+                size_t vl = vsetvl_e16m4(n);
                 vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_store, vl), vl);
                 _p = vfmul_vf_f32m8(_p, a, vl);
                 vfloat32m8_t _gamma = vle32_v_f32m8(ptr_gamma, vl);
@@ -83,7 +83,7 @@ static inline int layernorm_rvv_pack1_fp16s_procedure(int size, __fp16* ptr, con
         {
             while (n > 0)
             {
-                word_type vl = vsetvl_e16m4(n);
+                size_t vl = vsetvl_e16m4(n);
                 vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_store, vl), vl);
                 _p = vfmul_vf_f32m8(_p, a, vl);
                 _p = vfadd_vf_f32m8(_p, b, vl);
@@ -96,7 +96,7 @@ static inline int layernorm_rvv_pack1_fp16s_procedure(int size, __fp16* ptr, con
     return 0;
 }
 
-static inline int layernorm_rvv_packn_fp16s_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine, const word_type vl)
+static inline int layernorm_rvv_packn_fp16s_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine, const size_t vl)
 {
     // mean and var
     // f16m1 => f32m2
@@ -160,7 +160,7 @@ static inline int layernorm_rvv_pack1_fp16sa_procedure(int size, __fp16* ptr, co
         __fp16* ptr_sum = ptr;
         while (n > 0)
         {
-            word_type vl = vsetvl_e16m8(n);
+            size_t vl = vsetvl_e16m8(n);
             vfloat16m8_t _p = vle16_v_f16m8(ptr_sum, vl);
             _sum = vfredusum_vs_f16m8_f16m1(_sum, _p, /* scalar */ _sum, vl);
             // _sqsum = vfredusum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
@@ -176,7 +176,7 @@ static inline int layernorm_rvv_pack1_fp16sa_procedure(int size, __fp16* ptr, co
         __fp16* ptr_sqsum = ptr;
         while (n > 0)
         {
-            word_type vl = vsetvl_e16m8(n);
+            size_t vl = vsetvl_e16m8(n);
             vfloat16m8_t _p = vle16_v_f16m8(ptr_sqsum, vl);
             _p = vfsub_vf_f16m8(_p, mean, vl);
             _sqsum = vfredusum_vs_f16m8_f16m1(_sqsum, vfmul_vv_f16m8(_p, _p, vl), /* scalar */ _sqsum, vl);
@@ -200,7 +200,7 @@ static inline int layernorm_rvv_pack1_fp16sa_procedure(int size, __fp16* ptr, co
         {
             while (n > 0)
             {
-                word_type vl = vsetvl_e16m4(n);
+                size_t vl = vsetvl_e16m4(n);
                 vfloat16m4_t _p = vle16_v_f16m4(ptr_store, vl);
                 _p = vfmul_vf_f16m4(_p, a, vl);
                 vfloat16m4_t _gamma = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_gamma, vl), vl);
@@ -219,7 +219,7 @@ static inline int layernorm_rvv_pack1_fp16sa_procedure(int size, __fp16* ptr, co
         {
             while (n > 0)
             {
-                word_type vl = vsetvl_e16m8(n);
+                size_t vl = vsetvl_e16m8(n);
                 vfloat16m8_t _p = vle16_v_f16m8(ptr_store, vl);
                 _p = vfmul_vf_f16m8(_p, a, vl);
                 _p = vfadd_vf_f16m8(_p, b, vl);
@@ -232,7 +232,7 @@ static inline int layernorm_rvv_pack1_fp16sa_procedure(int size, __fp16* ptr, co
     return 0;
 }
 
-static inline int layernorm_rvv_packn_fp16sa_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine, const word_type vl)
+static inline int layernorm_rvv_packn_fp16sa_procedure(int size, __fp16* ptr, const float* gamma_data, const float* beta_data, float eps, int affine, const size_t vl)
 {
     // mean and var
     vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl);