diff --git a/src/layer/riscv/dropout_riscv.cpp b/src/layer/riscv/dropout_riscv.cpp
new file mode 100644
index 000000000..c74f317be
--- /dev/null
+++ b/src/layer/riscv/dropout_riscv.cpp
@@ -0,0 +1,110 @@
+// Xavier Hsinyuan is pleased to support the open source community by making
+// ncnn available.
+//
+// Copyright (C) 2021 Xavier Hsinyuan <me@lstlx.com>. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this
+// file except in compliance with the License. You may obtain a copy of the
+// License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations under
+// the License.
+
+#include "dropout_riscv.h"
+
+#if __riscv_vector
+#ifdef RVV_SPEC_0_7
+#include "riscv_v_071_fix.h"
+#else
+#include <riscv_vector.h>
+#endif
+#endif // __riscv_vector
+
+namespace ncnn {
+Dropout_riscv::Dropout_riscv()
+{
+#if __riscv_vector
+    support_packing = true;
+#endif
+}
+
+int Dropout_riscv::forward_inplace(Mat& bottom_top_blob,
+                                   const Option& opt) const
+{
+    if (scale == 1.f)
+    {
+        return 0;
+    }
+
+#if __riscv_vector
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int channels = bottom_top_blob.c;
+    int size = w * h;
+    int dims = bottom_top_blob.dims;
+    int elempack = bottom_top_blob.elempack;
+
+    if (dims == 1)
+    {
+        int n = w * elempack;
+        float* ptr = bottom_top_blob;
+        while (n > 0)
+        {
+            word_type vl = vsetvl_e32m8(n);
+            vfloat32m8_t _p = vle32_v_f32m8(ptr, vl);
+            _p = vfmul_vf_f32m8(_p, scale, vl);
+
+            vse32_v_f32m8(ptr, _p, vl);
+            ptr += vl;
+            n -= vl;
+        }
+    }
+    if (dims == 2)
+    {
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int i = 0; i < h; i++)
+        {
+            float* ptr = bottom_top_blob.row(i);
+            int n = w * elempack;
+            while (n > 0)
+            {
+                word_type vl = vsetvl_e32m8(n);
+                vfloat32m8_t _p = vle32_v_f32m8(ptr, vl);
+                _p = vfmul_vf_f32m8(_p, scale, vl);
+
+                vse32_v_f32m8(ptr, _p, vl);
+                ptr += vl;
+                n -= vl;
+            }
+        }
+    }
+    if (dims == 3)
+    {
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            float* ptr = bottom_top_blob.channel(q);
+            int n = size * elempack;
+            while (n > 0)
+            {
+                word_type vl = vsetvl_e32m8(n);
+                vfloat32m8_t _p = vle32_v_f32m8(ptr, vl);
+                _p = vfmul_vf_f32m8(_p, scale, vl);
+
+                vse32_v_f32m8(ptr, _p, vl);
+                ptr += vl;
+                n -= vl;
+            }
+        }
+    }
+    return 0;
+#endif // __riscv_vector
+
+    return Dropout::forward_inplace(bottom_top_blob, opt);
+}
+} // namespace ncnn
diff --git a/src/layer/riscv/dropout_riscv.h b/src/layer/riscv/dropout_riscv.h
new file mode 100644
index 000000000..d685c0ee3
--- /dev/null
+++ b/src/layer/riscv/dropout_riscv.h
@@ -0,0 +1,35 @@
+// Xavier Hsinyuan is pleased to support the open source community by making
+// ncnn available.
+//
+// Copyright (C) 2021 Xavier Hsinyuan <me@lstlx.com>. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this
+// file except in compliance with the License. You may obtain a copy of the
+// License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations under
+// the License.
+
+#ifndef LAYER_DROPOUT_RISCV_H
+#define LAYER_DROPOUT_RISCV_H
+
+#include "dropout.h"
+
+namespace ncnn {
+
+class Dropout_riscv : virtual public Dropout
+{
+public:
+    Dropout_riscv();
+
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_DROPOUT_RISCV_H
diff --git a/src/layer/riscv/gru_riscv.cpp b/src/layer/riscv/gru_riscv.cpp
new file mode 100644
index 000000000..0ebb2879b
--- /dev/null
+++ b/src/layer/riscv/gru_riscv.cpp
@@ -0,0 +1,880 @@
+// Xavier Hsinyuan is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2021 Xavier Hsinyuan <me@lstlx.com>. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "gru_riscv.h"
+
+#if __riscv_vector
+#ifdef RVV_SPEC_0_7
+#include "riscv_v_071_fix.h"
+#else
+#include <riscv_vector.h>
+#endif
+#endif // __riscv_vector
+
+namespace ncnn {
+
+//core rvv-optimized gru impl.
+#if __riscv_vector
+static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt)
+{
+    int size = bottom_blob.w;
+    int T = bottom_blob.h;
+
+    int num_output = top_blob.w;
+
+    // 2 x num_output
+    Mat gates(2, num_output, 4u, opt.workspace_allocator);
+    if (gates.empty())
+        return -100;
+
+    // unroll
+    for (int t = 0; t < T; t++)
+    {
+        int ti = reverse ? T - 1 - t : t;
+
+        const float* x = bottom_blob.row(ti);
+        for (int q = 0; q < num_output; q++)
+        {
+            float* gates_data = gates.row(q);
+
+            // gate reset update
+            const float* bias_c_R = bias_c.row(0);
+            const float* bias_c_U = bias_c.row(1);
+
+            const float* weight_xc_R = weight_xc.row(num_output * 0 + q);
+            const float* weight_xc_U = weight_xc.row(num_output * 1 + q);
+            const float* weight_hc_R = weight_hc.row(num_output * 0 + q);
+            const float* weight_hc_U = weight_hc.row(num_output * 1 + q);
+
+            float R = bias_c_R[q];
+            float U = bias_c_U[q];
+
+            int n = size;
+            const float* ptr_x = x;
+            const float* ptr_xcr = weight_xc_R;
+            const float* ptr_xcu = weight_xc_U;
+            while (n > 0)
+            {
+                word_type vl = vsetvl_e32m8(n);
+                vfloat32m8_t _x = vle32_v_f32m8(ptr_x, vl);
+                vfloat32m8_t _xcr = vle32_v_f32m8(ptr_xcr, vl);
+                vfloat32m8_t _xcu = vle32_v_f32m8(ptr_xcu, vl);
+                vfloat32m1_t _scalar_r = vfmv_s_f_f32m1(vundefined_f32m1(), R, vl);
+                vfloat32m1_t _scalar_u = vfmv_s_f_f32m1(vundefined_f32m1(), U, vl);
+
+                _xcr = vfmul_vv_f32m8(_xcr, _x, vl);
+                _xcu = vfmul_vv_f32m8(_xcu, _x, vl);
+                _scalar_r = vfredsum_vs_f32m8_f32m1(_scalar_r, _xcr, _scalar_r, vl);
+                _scalar_u = vfredsum_vs_f32m8_f32m1(_scalar_u, _xcu, _scalar_u, vl);
+
+                R = vfmv_f_s_f32m1_f32(_scalar_r);
+                U = vfmv_f_s_f32m1_f32(_scalar_u);
+
+                ptr_x += vl;
+                ptr_xcr += vl;
+                ptr_xcu += vl;
+                n -= vl;
+            }
+            ptr_x = NULL;
+            ptr_xcr = NULL;
+            ptr_xcu = NULL;
+
+            int n_out = num_output;
+            const float* ptr_hc = hidden_state;
+            const float* ptr_hcr = weight_hc_R;
+            const float* ptr_hcu = weight_hc_U;
+            while (n_out > 0)
+            {
+                word_type vl = vsetvl_e32m8(n_out);
+                vfloat32m8_t _h_cont = vle32_v_f32m8(ptr_hc, vl);
+                vfloat32m8_t _hcr = vle32_v_f32m8(ptr_hcr, vl);
+                vfloat32m8_t _hcu = vle32_v_f32m8(ptr_hcu, vl);
+                vfloat32m1_t _scalar_r = vfmv_s_f_f32m1(vundefined_f32m1(), R, vl);
+                vfloat32m1_t _scalar_u = vfmv_s_f_f32m1(vundefined_f32m1(), U, vl);
+
+                _hcr = vfmul_vv_f32m8(_hcr, _h_cont, vl);
+                _hcu = vfmul_vv_f32m8(_hcu, _h_cont, vl);
+                _scalar_r = vfredsum_vs_f32m8_f32m1(_scalar_r, _hcr, _scalar_r, vl);
+                _scalar_u = vfredsum_vs_f32m8_f32m1(_scalar_u, _hcu, _scalar_u, vl);
+
+                R = vfmv_f_s_f32m1_f32(_scalar_r);
+                U = vfmv_f_s_f32m1_f32(_scalar_u);
+
+                ptr_hc += vl;
+                ptr_hcr += vl;
+                ptr_hcu += vl;
+                n_out -= vl;
+            }
+            ptr_hc = NULL;
+            ptr_hcr = NULL;
+            ptr_hcu = NULL;
+
+            // sigmoid(R)
+            // sigmoid(U)
+            R = 1.f / (1.f + exp(-R));
+            U = 1.f / (1.f + exp(-U));
+
+            // gate new
+            const float* bias_c_WN = bias_c.row(2);
+            const float* bias_c_BN = bias_c.row(3);
+
+            const float* weight_xc_N = weight_xc.row(num_output * 2 + q);
+            const float* weight_hc_N = weight_hc.row(num_output * 2 + q);
+
+            float N = bias_c_BN[q];
+
+            int n_out2 = num_output;
+            const float* ptr_hc2 = hidden_state;
+            const float* ptr_whc_n = weight_hc_N;
+            while (n_out2 > 0)
+            {
+                word_type vl = vsetvl_e32m8(n_out2);
+
+                vfloat32m8_t _h_cont = vle32_v_f32m8(ptr_hc2, vl);
+                vfloat32m8_t _whc_n = vle32_v_f32m8(ptr_whc_n, vl);
+                vfloat32m1_t _scalar_n = vfmv_s_f_f32m1(vundefined_f32m1(), N, vl);
+
+                _h_cont = vfmul_vv_f32m8(_whc_n, _h_cont, vl);
+                _scalar_n = vfredsum_vs_f32m8_f32m1(_scalar_n, _h_cont, _scalar_n, vl);
+
+                N = vfmv_f_s_f32m1_f32(_scalar_n);
+                n_out2 -= vl;
+                ptr_hc2 += vl;
+                ptr_whc_n += vl;
+            }
+            ptr_hc2 = NULL;
+            ptr_whc_n = NULL;
+
+            N = bias_c_WN[q] + R * N;
+
+            int n2 = size;
+            const float* ptr_x2 = x;
+            const float* ptr_xcn = weight_xc_N;
+            while (n2 > 0)
+            {
+                word_type vl = vsetvl_e32m8(n2);
+
+                vfloat32m8_t _x = vle32_v_f32m8(ptr_x2, vl);
+                vfloat32m8_t _xcn = vle32_v_f32m8(ptr_xcn, vl);
+                vfloat32m1_t _scalar_n = vfmv_s_f_f32m1(vundefined_f32m1(), N, vl);
+
+                _xcn = vfmul_vv_f32m8(_x, _xcn, vl);
+                _scalar_n = vfredsum_vs_f32m8_f32m1(_scalar_n, _xcn, _scalar_n, vl);
+                N = vfmv_f_s_f32m1_f32(_scalar_n);
+
+                n2 -= vl;
+                ptr_x2 += vl;
+                ptr_xcn += vl;
+            }
+            ptr_x2 = NULL;
+            ptr_xcn = NULL;
+
+            // tanh(N)
+            N = tanh(N);
+
+            gates_data[0] = U;
+            gates_data[1] = N;
+        }
+
+        // h_t := (1 - update) .* new + update .* h_{t-1}
+        float* output_data = top_blob.row(ti);
+        for (int q = 0; q < num_output; q++)
+        {
+            const float* gates_data = gates.row(q);
+
+            float U = gates_data[0];
+            float N = gates_data[1];
+
+            float H = (1 - U) * N + U * hidden_state[q];
+
+            hidden_state[q] = H;
+            output_data[q] = H;
+        }
+    }
+
+    return 0;
+}
+
+#endif
+
+GRU_riscv::GRU_riscv()
+{
+#if __riscv_vector && __riscv_zfh
+    support_fp16_storage = true;
+#endif
+}
+
+int GRU_riscv::create_pipeline(const Option& opt)
+{
+#if __riscv_vector && __riscv_zfh
+    if (opt.use_fp16_storage && opt.use_fp16_arithmetic)
+        return create_pipeline_fp16sa(opt);
+#endif
+
+    return GRU::create_pipeline(opt);
+}
+
+int GRU_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    int elembits = bottom_blob.elembits();
+#if __riscv_vector
+
+#if __riscv_zfh
+    if (opt.use_fp16_storage && elembits == 16)
+    {
+        if (opt.use_fp16_arithmetic)
+            return forward_fp16sa(bottom_blob, top_blob, opt);
+        else
+            return forward_fp16s(bottom_blob, top_blob, opt);
+    }
+#endif
+
+    int T = bottom_blob.h;
+
+    int num_directions = direction == 2 ? 2 : 1;
+
+    // initial hidden state
+    Mat hidden(num_output, 4u, opt.workspace_allocator);
+    if (hidden.empty())
+        return -100;
+    hidden.fill(0.f);
+
+    top_blob.create(num_output * num_directions, T, 4u, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+    // Uni directional
+    if (direction == 0 || direction == 1)
+    {
+        int ret = gru(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, opt);
+        if (ret != 0)
+            return ret;
+    }
+
+    if (direction == 2)
+    {
+        Mat top_blob_forward(num_output, T, 4u, opt.workspace_allocator);
+        if (top_blob_forward.empty())
+            return -100;
+
+        Mat top_blob_reverse(num_output, T, 4u, opt.workspace_allocator);
+        if (top_blob_reverse.empty())
+            return -100;
+
+        int ret0 = gru(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, opt);
+        if (ret0 != 0)
+            return ret0;
+
+        hidden.fill(0.0f);
+
+        int ret1 = gru(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden, opt);
+        if (ret1 != 0)
+            return ret1;
+
+        // concat w
+        for (int i = 0; i < T; i++)
+        {
+            const float* pf = top_blob_forward.row(i);
+            const float* pr = top_blob_reverse.row(i);
+            float* ptr = top_blob.row(i);
+
+            memcpy(ptr, pf, num_output * sizeof(float));
+            memcpy(ptr + num_output, pr, num_output * sizeof(float));
+        }
+    }
+
+    return 0;
+#endif
+    return GRU::forward(bottom_blob, top_blob, opt);
+}
+
+int GRU_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    if (bottom_blobs.size() != 2 || top_blobs.size() != 2)
+    {
+        return forward(bottom_blobs[0], top_blobs[0], opt);
+    }
+
+    const Mat& bottom_blob = bottom_blobs[0];
+    int elembits = bottom_blob.elembits();
+
+#if __riscv_vector
+#if __riscv_zfh
+    if (opt.use_fp16_storage && elembits == 16)
+    {
+        if (opt.use_fp16_arithmetic)
+            return forward_fp16sa(bottom_blobs, top_blobs, opt);
+        else
+            return forward_fp16s(bottom_blobs, top_blobs, opt);
+    }
+#endif
+
+    int T = bottom_blob.h;
+    Mat& top_blob = top_blobs[0];
+    Mat& hidden_state = top_blobs[1];
+
+    //Copy previous states
+    hidden_state = bottom_blobs[1].clone(opt.blob_allocator);
+
+    top_blob.create(num_output, T, 4u, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+    // Uni directional
+    if (direction == 0 || direction == 1)
+    {
+        int ret = gru(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden_state, opt);
+        if (ret != 0)
+            return ret;
+    }
+
+    return 0;
+#endif
+    return GRU::forward(bottom_blobs, top_blobs, opt);
+}
+
+#if __riscv_vector && __riscv_zfh
+static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt)
+{
+    int size = bottom_blob.w;
+    int T = bottom_blob.h;
+
+    int num_output = top_blob.w;
+
+    // 2 x num_output
+    Mat gates(2, num_output, 4u, opt.workspace_allocator);
+    if (gates.empty())
+        return -100;
+
+    // unroll
+    for (int t = 0; t < T; t++)
+    {
+        int ti = reverse ? T - 1 - t : t;
+
+        const __fp16* x = bottom_blob.row<const __fp16>(ti);
+        for (int q = 0; q < num_output; q++)
+        {
+            float* gates_data = gates.row(q);
+
+            // gate reset update
+            const float* bias_c_R = bias_c.row(0);
+            const float* bias_c_U = bias_c.row(1);
+
+            const float* weight_xc_R = weight_xc.row(num_output * 0 + q);
+            const float* weight_xc_U = weight_xc.row(num_output * 1 + q);
+            const float* weight_hc_R = weight_hc.row(num_output * 0 + q);
+            const float* weight_hc_U = weight_hc.row(num_output * 1 + q);
+
+            float R = bias_c_R[q];
+            float U = bias_c_U[q];
+
+            int n = size;
+            const __fp16* ptr_x = x;
+            const float* ptr_xcr = weight_xc_R;
+            const float* ptr_xcu = weight_xc_U;
+            while (n > 0)
+            {
+                word_type vl = vsetvl_e16m4(n);
+                vfloat32m8_t _x = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_x, vl), vl);
+                vfloat32m8_t _xcr = vle32_v_f32m8(ptr_xcr, vl);
+                vfloat32m8_t _xcu = vle32_v_f32m8(ptr_xcu, vl);
+                vfloat32m1_t _scalar_r = vfmv_s_f_f32m1(vundefined_f32m1(), R, vl);
+                vfloat32m1_t _scalar_u = vfmv_s_f_f32m1(vundefined_f32m1(), U, vl);
+
+                _xcr = vfmul_vv_f32m8(_xcr, _x, vl);
+                _xcu = vfmul_vv_f32m8(_xcu, _x, vl);
+                _scalar_r = vfredsum_vs_f32m8_f32m1(_scalar_r, _xcr, _scalar_r, vl);
+                _scalar_u = vfredsum_vs_f32m8_f32m1(_scalar_u, _xcu, _scalar_u, vl);
+
+                R = vfmv_f_s_f32m1_f32(_scalar_r);
+                U = vfmv_f_s_f32m1_f32(_scalar_u);
+
+                ptr_x += vl;
+                ptr_xcr += vl;
+                ptr_xcu += vl;
+                n -= vl;
+            }
+            ptr_x = NULL;
+            ptr_xcr = NULL;
+            ptr_xcu = NULL;
+
+            int n_out = num_output;
+            const float* ptr_hc = hidden_state;
+            const float* ptr_hcr = weight_hc_R;
+            const float* ptr_hcu = weight_hc_U;
+            while (n_out > 0)
+            {
+                word_type vl = vsetvl_e16m4(n_out);
+                vfloat32m8_t _h_cont = vle32_v_f32m8(ptr_hc, vl);
+                vfloat32m8_t _hcr = vle32_v_f32m8(ptr_hcr, vl);
+                vfloat32m8_t _hcu = vle32_v_f32m8(ptr_hcu, vl);
+                vfloat32m1_t _scalar_r = vfmv_s_f_f32m1(vundefined_f32m1(), R, vl);
+                vfloat32m1_t _scalar_u = vfmv_s_f_f32m1(vundefined_f32m1(), U, vl);
+
+                _hcr = vfmul_vv_f32m8(_hcr, _h_cont, vl);
+                _hcu = vfmul_vv_f32m8(_hcu, _h_cont, vl);
+                _scalar_r = vfredsum_vs_f32m8_f32m1(_scalar_r, _hcr, _scalar_r, vl);
+                _scalar_u = vfredsum_vs_f32m8_f32m1(_scalar_u, _hcu, _scalar_u, vl);
+
+                R = vfmv_f_s_f32m1_f32(_scalar_r);
+                U = vfmv_f_s_f32m1_f32(_scalar_u);
+
+                ptr_hc += vl;
+                ptr_hcr += vl;
+                ptr_hcu += vl;
+                n_out -= vl;
+            }
+            ptr_hc = NULL;
+            ptr_hcr = NULL;
+            ptr_hcu = NULL;
+
+            // sigmoid(R)
+            // sigmoid(U)
+            R = 1.f / (1.f + exp(-R));
+            U = 1.f / (1.f + exp(-U));
+
+            // gate new
+            const float* bias_c_WN = bias_c.row(2);
+            const float* bias_c_BN = bias_c.row(3);
+
+            const float* weight_xc_N = weight_xc.row(num_output * 2 + q);
+            const float* weight_hc_N = weight_hc.row(num_output * 2 + q);
+
+            float N = bias_c_BN[q];
+
+            int n_out2 = num_output;
+            const float* ptr_hc2 = hidden_state;
+            const float* ptr_whc_n = weight_hc_N;
+            while (n_out2 > 0)
+            {
+                word_type vl = vsetvl_e16m4(n_out2);
+
+                vfloat32m8_t _h_cont = vle32_v_f32m8(ptr_hc2, vl);
+                vfloat32m8_t _whc_n = vle32_v_f32m8(ptr_whc_n, vl);
+                vfloat32m1_t _scalar_n = vfmv_s_f_f32m1(vundefined_f32m1(), N, vl);
+
+                _h_cont = vfmul_vv_f32m8(_whc_n, _h_cont, vl);
+                _scalar_n = vfredsum_vs_f32m8_f32m1(_scalar_n, _h_cont, _scalar_n, vl);
+
+                N = vfmv_f_s_f32m1_f32(_scalar_n);
+                n_out2 -= vl;
+                ptr_hc2 += vl;
+                ptr_whc_n += vl;
+            }
+            ptr_hc2 = NULL;
+            ptr_whc_n = NULL;
+
+            N = bias_c_WN[q] + R * N;
+
+            int n2 = size;
+            const __fp16* ptr_x2 = x;
+            const float* ptr_xcn = weight_xc_N;
+            while (n2 > 0)
+            {
+                word_type vl = vsetvl_e16m4(n2);
+
+                vfloat32m8_t _x = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_x2, vl), vl);
+                vfloat32m8_t _xcn = vle32_v_f32m8(ptr_xcn, vl);
+                vfloat32m1_t _scalar_n = vfmv_s_f_f32m1(vundefined_f32m1(), N, vl);
+
+                _xcn = vfmul_vv_f32m8(_x, _xcn, vl);
+                _scalar_n = vfredsum_vs_f32m8_f32m1(_scalar_n, _xcn, _scalar_n, vl);
+                N = vfmv_f_s_f32m1_f32(_scalar_n);
+
+                n2 -= vl;
+                ptr_x2 += vl;
+                ptr_xcn += vl;
+            }
+            ptr_x2 = NULL;
+            ptr_xcn = NULL;
+
+            // tanh(N)
+            N = tanh(N);
+
+            gates_data[0] = U;
+            gates_data[1] = N;
+        }
+
+        // h_t := (1 - update) .* new + update .* h_{t-1}
+        __fp16* output_data = top_blob.row<__fp16>(ti);
+        for (int q = 0; q < num_output; q++)
+        {
+            const float* gates_data = gates.row(q);
+
+            float U = gates_data[0];
+            float N = gates_data[1];
+
+            float H = (1 - U) * N + U * hidden_state[q];
+
+            hidden_state[q] = H;
+            output_data[q] = (__fp16)H;
+        }
+    }
+
+    return 0;
+}
+
+int GRU_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    int T = bottom_blob.h;
+
+    int num_directions = direction == 2 ? 2 : 1;
+    // initial hidden state
+    Mat hidden(num_output, 4u, opt.workspace_allocator);
+    if (hidden.empty())
+        return -100;
+    hidden.fill(0.f);
+
+    top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+    // Uni directional
+    if (direction == 0 || direction == 1)
+    {
+        int ret = gru_fp16s(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, opt);
+        if (ret != 0)
+            return ret;
+    }
+
+    if (direction == 2)
+    {
+        Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
+        if (top_blob_forward.empty())
+            return -100;
+
+        Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
+        if (top_blob_reverse.empty())
+            return -100;
+
+        int ret0 = gru_fp16s(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, opt);
+        if (ret0 != 0)
+            return ret0;
+
+        hidden.fill(0.0f);
+
+        int ret1 = gru_fp16s(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden, opt);
+        if (ret1 != 0)
+            return ret1;
+
+        // concat w
+        for (int i = 0; i < T; i++)
+        {
+            const __fp16* pf = top_blob_forward.row<const __fp16>(i);
+            const __fp16* pr = top_blob_reverse.row<const __fp16>(i);
+            __fp16* ptr = top_blob.row<__fp16>(i);
+
+            memcpy(ptr, pf, num_output * sizeof(__fp16));
+            memcpy(ptr + num_output, pr, num_output * sizeof(__fp16));
+        }
+    }
+
+    return 0;
+}
+
+int GRU_riscv::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    int T = bottom_blob.h;
+    Mat& top_blob = top_blobs[0];
+    top_blob.create(num_output, T, 2u, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+    //Copy previous states
+    Mat hidden;
+    cast_float16_to_float32(bottom_blobs[1], hidden, opt);
+
+    // Uni directional
+    if (direction == 0 || direction == 1)
+    {
+        int ret = gru_fp16s(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, opt);
+        if (ret != 0)
+            return ret;
+    }
+
+    cast_float32_to_float16(hidden, top_blobs[1], opt);
+    return 0;
+}
+
+#endif
+
+//fp16sa start at here
+#if __riscv_vector && __riscv_zfh
+
+int GRU_riscv::create_pipeline_fp16sa(const Option& opt)
+{
+    cast_float32_to_float16(weight_xc_data, weight_xc_data_fp16sa, opt);
+    cast_float32_to_float16(weight_hc_data, weight_hc_data_fp16sa, opt);
+    cast_float32_to_float16(bias_c_data, bias_c_data_fp16sa, opt);
+
+    return 0;
+}
+
+static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt)
+{
+    int size = bottom_blob.w;
+    int T = bottom_blob.h;
+
+    int num_output = top_blob.w;
+
+    // 2 x num_output
+    Mat gates(2, num_output, 4u, opt.workspace_allocator);
+    if (gates.empty())
+        return -100;
+
+    // unroll
+    for (int t = 0; t < T; t++)
+    {
+        int ti = reverse ? T - 1 - t : t;
+
+        const __fp16* x = bottom_blob.row<const __fp16>(ti);
+        for (int q = 0; q < num_output; q++)
+        {
+            float* gates_data = gates.row(q);
+
+            // gate reset update
+            const __fp16* bias_c_R = bias_c.row<const __fp16>(0);
+            const __fp16* bias_c_U = bias_c.row<const __fp16>(1);
+
+            const __fp16* weight_xc_R = weight_xc.row<const __fp16>(num_output * 0 + q);
+            const __fp16* weight_xc_U = weight_xc.row<const __fp16>(num_output * 1 + q);
+            const __fp16* weight_hc_R = weight_hc.row<const __fp16>(num_output * 0 + q);
+            const __fp16* weight_hc_U = weight_hc.row<const __fp16>(num_output * 1 + q);
+
+            __fp16 R = bias_c_R[q];
+            __fp16 U = bias_c_U[q];
+
+            int n = size;
+            const __fp16* ptr_x = x;
+            const __fp16* ptr_xcr = weight_xc_R;
+            const __fp16* ptr_xcu = weight_xc_U;
+            while (n > 0)
+            {
+                word_type vl = vsetvl_e16m8(n);
+                vfloat16m8_t _x = vle16_v_f16m8(ptr_x, vl);
+                vfloat16m8_t _xcr = vle16_v_f16m8(ptr_xcr, vl);
+                vfloat16m8_t _xcu = vle16_v_f16m8(ptr_xcu, vl);
+                vfloat16m1_t _scalar_r = vfmv_s_f_f16m1(vundefined_f16m1(), R, vl);
+                vfloat16m1_t _scalar_u = vfmv_s_f_f16m1(vundefined_f16m1(), U, vl);
+
+                _xcr = vfmul_vv_f16m8(_xcr, _x, vl);
+                _xcu = vfmul_vv_f16m8(_xcu, _x, vl);
+                _scalar_r = vfredsum_vs_f16m8_f16m1(_scalar_r, _xcr, _scalar_r, vl);
+                _scalar_u = vfredsum_vs_f16m8_f16m1(_scalar_u, _xcu, _scalar_u, vl);
+
+                R = vfmv_f_s_f16m1_f16(_scalar_r);
+                U = vfmv_f_s_f16m1_f16(_scalar_u);
+
+                ptr_x += vl;
+                ptr_xcr += vl;
+                ptr_xcu += vl;
+                n -= vl;
+            }
+
+            int n_out = num_output;
+            const float* ptr_hc = hidden_state;
+            const __fp16* ptr_hcr = weight_hc_R;
+            const __fp16* ptr_hcu = weight_hc_U;
+            while (n_out > 0)
+            {
+                word_type vl = vsetvl_e16m4(n_out);
+                vfloat16m4_t _h_cont = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_hc, vl), vl);
+                vfloat16m4_t _hcr = vle16_v_f16m4(ptr_hcr, vl);
+                vfloat16m4_t _hcu = vle16_v_f16m4(ptr_hcu, vl);
+                vfloat16m1_t _scalar_r = vfmv_s_f_f16m1(vundefined_f16m1(), R, vl);
+                vfloat16m1_t _scalar_u = vfmv_s_f_f16m1(vundefined_f16m1(), U, vl);
+
+                _hcr = vfmul_vv_f16m4(_hcr, _h_cont, vl);
+                _hcu = vfmul_vv_f16m4(_hcu, _h_cont, vl);
+                _scalar_r = vfredsum_vs_f16m4_f16m1(_scalar_r, _hcr, _scalar_r, vl);
+                _scalar_u = vfredsum_vs_f16m4_f16m1(_scalar_u, _hcu, _scalar_u, vl);
+
+                R = vfmv_f_s_f16m1_f16(_scalar_r);
+                U = vfmv_f_s_f16m1_f16(_scalar_u);
+
+                ptr_hc += vl;
+                ptr_hcr += vl;
+                ptr_hcu += vl;
+                n_out -= vl;
+            }
+
+            // sigmoid(R)
+            // sigmoid(U)
+            R = 1.f / (1.f + (__fp16)exp((float)(-R)));
+            U = 1.f / (1.f + (__fp16)exp((float)(-U)));
+
+            // gate new
+            const __fp16* bias_c_WN = bias_c.row<const __fp16>(2);
+            const __fp16* bias_c_BN = bias_c.row<const __fp16>(3);
+
+            const __fp16* weight_xc_N = weight_xc.row<const __fp16>(num_output * 2 + q);
+            const __fp16* weight_hc_N = weight_hc.row<const __fp16>(num_output * 2 + q);
+
+            __fp16 N = bias_c_BN[q];
+
+            int n_out2 = num_output;
+            const float* ptr_hc2 = hidden_state;
+            const __fp16* ptr_whc_n = weight_hc_N;
+            while (n_out2 > 0)
+            {
+                word_type vl = vsetvl_e16m4(n_out2);
+
+                vfloat16m4_t _h_cont = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_hc2, vl), vl);
+                vfloat16m4_t _whc_n = vle16_v_f16m4(ptr_whc_n, vl);
+                vfloat16m1_t _scalar_n = vfmv_s_f_f16m1(vundefined_f16m1(), N, vl);
+
+                _h_cont = vfmul_vv_f16m4(_whc_n, _h_cont, vl);
+                _scalar_n = vfredsum_vs_f16m4_f16m1(_scalar_n, _h_cont, _scalar_n, vl);
+
+                N = vfmv_f_s_f16m1_f16(_scalar_n);
+                n_out2 -= vl;
+                ptr_hc2 += vl;
+                ptr_whc_n += vl;
+            }
+            N = bias_c_WN[q] + R * N;
+
+            int n2 = size;
+            const __fp16* ptr_x2 = x;
+            const __fp16* ptr_xcn = weight_xc_N;
+            while (n2 > 0)
+            {
+                word_type vl = vsetvl_e16m8(n2);
+
+                vfloat16m8_t _x = vle16_v_f16m8(ptr_x2, vl);
+                vfloat16m8_t _xcn = vle16_v_f16m8(ptr_xcn, vl);
+                vfloat16m1_t _scalar_n = vfmv_s_f_f16m1(vundefined_f16m1(), N, vl);
+
+                _xcn = vfmul_vv_f16m8(_x, _xcn, vl);
+                _scalar_n = vfredsum_vs_f16m8_f16m1(_scalar_n, _xcn, _scalar_n, vl);
+                N = vfmv_f_s_f16m1_f16(_scalar_n);
+
+                n2 -= vl;
+                ptr_x2 += vl;
+                ptr_xcn += vl;
+            }
+
+            // tanh(N)
+            N = (__fp16)tanh((float)N);
+
+            gates_data[0] = U;
+            gates_data[1] = N;
+        }
+
+        // h_t := (1 - update) .* new + update .* h_{t-1}
+        __fp16* output_data = top_blob.row<__fp16>(ti);
+        for (int q = 0; q < num_output; q++)
+        {
+            const float* gates_data = gates.row(q);
+
+            float U = gates_data[0];
+            float N = gates_data[1];
+
+            float H = (1 - U) * N + U * hidden_state[q];
+
+            hidden_state[q] = H;
+            output_data[q] = H;
+        }
+    }
+
+    return 0;
+}
+
+int GRU_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    int T = bottom_blob.h;
+
+    int num_directions = direction == 2 ? 2 : 1;
+    // initial hidden state
+    Mat hidden(num_output, 4u, opt.workspace_allocator);
+    if (hidden.empty())
+        return -100;
+    hidden.fill(0.f);
+
+    top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+    // Uni directional
+    if (direction == 0 || direction == 1)
+    {
+        int ret = gru_fp16sa(bottom_blob, top_blob, direction, weight_xc_data_fp16sa.channel(0), bias_c_data_fp16sa.channel(0), weight_hc_data_fp16sa.channel(0), hidden, opt);
+        if (ret != 0)
+            return ret;
+    }
+
+    if (direction == 2)
+    {
+        Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
+        if (top_blob_forward.empty())
+            return -100;
+
+        Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
+        if (top_blob_reverse.empty())
+            return -100;
+
+        int ret0 = gru_fp16sa(bottom_blob, top_blob_forward, 0, weight_xc_data_fp16sa.channel(0), bias_c_data_fp16sa.channel(0), weight_hc_data_fp16sa.channel(0), hidden, opt);
+        if (ret0 != 0)
+            return ret0;
+
+        hidden.fill(0.0f);
+
+        int ret1 = gru_fp16sa(bottom_blob, top_blob_reverse, 1, weight_xc_data_fp16sa.channel(1), bias_c_data_fp16sa.channel(1), weight_hc_data_fp16sa.channel(1), hidden, opt);
+        if (ret1 != 0)
+            return ret1;
+
+        // concat w
+        for (int i = 0; i < T; i++)
+        {
+            const __fp16* pf = top_blob_forward.row<const __fp16>(i);
+            const __fp16* pr = top_blob_reverse.row<const __fp16>(i);
+            __fp16* ptr = top_blob.row<__fp16>(i);
+
+            memcpy(ptr, pf, num_output * sizeof(__fp16));
+            memcpy(ptr + num_output, pr, num_output * sizeof(__fp16));
+        }
+    }
+
+    return 0;
+}
+
+int GRU_riscv::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    int T = bottom_blob.h;
+    Mat& top_blob = top_blobs[0];
+    top_blob.create(num_output, T, 2u, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+    //Copy previous states
+    Mat hidden;
+    cast_float16_to_float32(bottom_blobs[1], hidden, opt);
+
+    // Uni directional
+    if (direction == 0 || direction == 1)
+    {
+        int ret = gru_fp16sa(bottom_blob, top_blob, direction, weight_xc_data_fp16sa.channel(0), bias_c_data_fp16sa.channel(0), weight_hc_data_fp16sa.channel(0), hidden, opt);
+        if (ret != 0)
+            return ret;
+    }
+
+    cast_float32_to_float16(hidden, top_blobs[1], opt);
+
+    return 0;
+}
+
+#endif
+
+} // namespace ncnn
\ No newline at end of file
diff --git a/src/layer/riscv/gru_riscv.h b/src/layer/riscv/gru_riscv.h
new file mode 100644
index 000000000..18c69ab59
--- /dev/null
+++ b/src/layer/riscv/gru_riscv.h
@@ -0,0 +1,48 @@
+// Xavier Hsinyuan is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2021 Xavier Hsinyuan <me@lstlx.com>. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_GRU_RISCV_H
+#define LAYER_GRU_RISCV_H
+
+#include "gru.h"
+
+namespace ncnn {
+
+class GRU_riscv : virtual public GRU
+{
+public:
+    GRU_riscv();
+
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+    virtual int create_pipeline(const Option& opt);
+
+protected:
+#if __riscv_vector && __riscv_zfh
+    int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
+    int forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+    int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
+    int forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+    int create_pipeline_fp16sa(const Option& opt);
+#endif
+
+public:
+    Mat weight_xc_data_fp16sa;
+    Mat bias_c_data_fp16sa;
+    Mat weight_hc_data_fp16sa;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_GRU_RISCV_H
diff --git a/src/layer/riscv/softmax_riscv.cpp b/src/layer/riscv/softmax_riscv.cpp
new file mode 100644
index 000000000..fe06bfbde
--- /dev/null
+++ b/src/layer/riscv/softmax_riscv.cpp
@@ -0,0 +1,520 @@
+// Xavier Hsinyuan is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2021 Xavier Hsinyuan <me@lstlx.com>. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "softmax_riscv.h"
+#include <float.h>
+
+#if __riscv_vector
+#ifdef RVV_SPEC_0_7
+#include "riscv_v_071_fix.h"
+#else
+#include <riscv_vector.h>
+#endif
+#include "rvv_mathfun.h"
+#endif // __riscv_vector
+
+namespace ncnn {
+
+Softmax_riscv::Softmax_riscv()
+{
+}
+
+int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
+{
+    int dims = bottom_top_blob.dims;
+    size_t elemsize = bottom_top_blob.elemsize;
+    int elempack = bottom_top_blob.elempack;
+
+    int positive_axis = axis < 0 ? dims + axis : axis;
+#ifdef __riscv_vector
+    if (dims == 1) // positive_axis == 0
+    {
+        int w = bottom_top_blob.w;
+        float* ptr = bottom_top_blob;
+        float max = -FLT_MAX;
+
+        int n = w * elempack;
+        float* ptr_vol = ptr;
+        while (n > 0)
+        {
+            word_type vl = vsetvl_e32m8(n);
+
+            vfloat32m8_t _p = vle32_v_f32m8(ptr_vol, vl);
+            vfloat32m1_t _max = vfmv_s_f_f32m1(vundefined_f32m1(), max, vl);
+            _max = vfredmax_vs_f32m8_f32m1(_max, _p, /* scalar*/ _max, vl);
+
+            max = vfmv_f_s_f32m1_f32(_max);
+            ptr_vol += vl;
+            n -= vl;
+        }
+        ptr_vol = NULL;
+
+        float sum = 0.f;
+        n = w * elempack;
+        ptr_vol = ptr;
+        while (n > 0)
+        {
+            word_type vl = vsetvl_e32m8(n);
+            vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), sum, vl);
+            vfloat32m8_t _p = vle32_v_f32m8(ptr_vol, vl);
+
+            _p = vfsub_vf_f32m8(_p, max, vl);
+            _p = exp_ps(_p, vl);
+            _sum = vfredsum_vs_f32m8_f32m1(_sum, _p, /*scalar*/ _sum, vl);
+
+            vse32_v_f32m8(ptr_vol, _p, vl);
+            sum = vfmv_f_s_f32m1_f32(_sum);
+            ptr_vol += vl;
+            n -= vl;
+        }
+        ptr_vol = NULL;
+
+        n = w * elempack;
+        ptr_vol = ptr;
+        while (n > 0)
+        {
+            word_type vl = vsetvl_e32m8(n);
+
+            vfloat32m8_t _p = vle32_v_f32m8(ptr_vol, vl);
+            _p = vfdiv_vf_f32m8(_p, sum, vl);
+            vse32_v_f32m8(ptr_vol, _p, vl);
+
+            n -= vl;
+            ptr_vol += vl;
+        }
+        ptr_vol = NULL;
+
+        return 0;
+    }
+
+    if (dims == 2 && positive_axis == 0)
+    {
+        int w = bottom_top_blob.w;
+        int h = bottom_top_blob.h;
+
+        Mat max;
+        max.create(w, elemsize, opt.workspace_allocator);
+        if (max.empty())
+            return -100;
+        max.fill(-FLT_MAX);
+
+        for (int i = 0; i < h; i++)
+        {
+            const float* ptr = bottom_top_blob.row(i);
+            float* ptr_max = max;
+            int n = w * elempack;
+            while (n > 0)
+            {
+                word_type vl = vsetvl_e32m8(n);
+
+                vfloat32m8_t _max = vle32_v_f32m8(ptr_max, vl);
+                vfloat32m8_t _p = vle32_v_f32m8(ptr, vl);
+
+                _max = vfmax_vv_f32m8(_max, _p, vl);
+
+                vse32_v_f32m8(ptr_max, _max, vl);
+                ptr += vl;
+                ptr_max += vl;
+                n -= vl;
+            }
+        }
+
+        Mat sum;
+        sum.create(w, elemsize, opt.workspace_allocator);
+        if (sum.empty())
+            return -100;
+        sum.fill(0.f);
+
+        for (int i = 0; i < h; i++)
+        {
+            float* ptr = bottom_top_blob.row(i);
+            float* ptr_max = max;
+            float* ptr_sum = sum;
+            int n = w * elempack;
+
+            while (n > 0)
+            {
+                word_type vl = vsetvl_e32m8(n);
+
+                vfloat32m8_t _p = vle32_v_f32m8(ptr, vl);
+                vfloat32m8_t _max = vle32_v_f32m8(ptr_max, vl);
+                vfloat32m8_t _sum = vle32_v_f32m8(ptr_sum, vl);
+
+                _p = vfsub_vv_f32m8(_p, _max, vl);
+                _p = exp_ps(_p, vl);
+                _sum = vfadd_vv_f32m8(_sum, _p, vl);
+
+                vse32_v_f32m8(ptr, _p, vl);
+                vse32_v_f32m8(ptr_sum, _sum, vl);
+                n -= vl;
+                ptr_max += vl;
+                ptr_sum += vl;
+                ptr += vl;
+            }
+        }
+
+        for (int i = 0; i < h; i++)
+        {
+            float* ptr = bottom_top_blob.row(i);
+            float* ptr_sum = sum;
+
+            int n = w * elempack;
+            while (n > 0)
+            {
+                word_type vl = vsetvl_e32m8(n);
+                vfloat32m8_t _p = vle32_v_f32m8(ptr, vl);
+                vfloat32m8_t _sum = vle32_v_f32m8(ptr_sum, vl);
+
+                _p = vfdiv_vv_f32m8(_p, _sum, vl);
+
+                vse32_v_f32m8(ptr, _p, vl);
+                n -= vl;
+                ptr += vl;
+                ptr_sum += vl;
+            }
+        }
+
+        return 0;
+    }
+
+    if (dims == 2 && positive_axis == 1)
+    {
+        int w = bottom_top_blob.w;
+        int h = bottom_top_blob.h;
+
+        for (int i = 0; i < h; i++)
+        {
+            float* ptr = bottom_top_blob.row(i);
+            float m = -FLT_MAX;
+
+            int n1 = w * elempack;
+            float* ptr1 = ptr;
+            while (n1 > 0)
+            {
+                word_type vl = vsetvl_e32m8(n1);
+                vfloat32m8_t _p = vle32_v_f32m8(ptr1, vl);
+                vfloat32m1_t _m = vfmv_s_f_f32m1(vundefined_f32m1(), m, vl);
+
+                _m = vfredmax_vs_f32m8_f32m1(_m, _p, _m, vl);
+
+                m = vfmv_f_s_f32m1_f32(_m);
+                ptr1 += vl;
+                n1 -= vl;
+            }
+            ptr1 = NULL;
+
+            float s = 0.f;
+            int n2 = w * elempack;
+            float* ptr2 = ptr;
+            while (n2 > 0)
+            {
+                word_type vl = vsetvl_e32m8(n2);
+                vfloat32m8_t _p = vle32_v_f32m8(ptr2, vl);
+                vfloat32m1_t _s = vfmv_s_f_f32m1(vundefined_f32m1(), s, vl);
+
+                _p = exp_ps(vfsub_vf_f32m8(_p, m, vl), vl);
+                _s = vfredosum_vs_f32m8_f32m1(_s, _p, _s, vl);
+
+                vse32_v_f32m8(ptr2, _p, vl);
+                s = vfmv_f_s_f32m1_f32(_s);
+                ptr2 += vl;
+                n2 -= vl;
+            }
+            ptr2 = NULL;
+
+            int n3 = w * elempack;
+            float* ptr3 = ptr;
+            while (n3 > 0)
+            {
+                word_type vl = vsetvl_e32m8(n3);
+
+                vfloat32m8_t _p = vle32_v_f32m8(ptr3, vl);
+
+                _p = vfdiv_vf_f32m8(_p, s, vl);
+
+                vse32_v_f32m8(ptr3, _p, vl);
+                n3 -= vl;
+                ptr3 += vl;
+            }
+            ptr3 = NULL;
+        }
+
+        return 0;
+    }
+
+    if (dims == 3 && positive_axis == 0)
+    {
+        int w = bottom_top_blob.w;
+        int h = bottom_top_blob.h;
+        int channels = bottom_top_blob.c;
+        int size = w * h;
+
+        Mat max;
+        max.create(w, h, elemsize, opt.workspace_allocator);
+        if (max.empty())
+            return -100;
+        max.fill(-FLT_MAX);
+        for (int q = 0; q < channels; q++)
+        {
+            const float* ptr = bottom_top_blob.channel(q);
+
+            float* ptr_max = max;
+            int n = size * elempack;
+            while (n > 0)
+            {
+                word_type vl = vsetvl_e32m8(n);
+
+                vfloat32m8_t _max = vle32_v_f32m8(max, vl);
+                vfloat32m8_t _p = vle32_v_f32m8(ptr, vl);
+                _max = vfmax_vv_f32m8(_max, _p, vl);
+                vse32_v_f32m8(ptr_max, _max, vl);
+
+                ptr += vl;
+                ptr_max += vl;
+                n -= vl;
+            }
+        }
+
+        Mat sum;
+        sum.create(w, h, elemsize, opt.workspace_allocator);
+        if (sum.empty())
+            return -100;
+        sum.fill(0.f);
+        for (int q = 0; q < channels; q++)
+        {
+            float* ptr = bottom_top_blob.channel(q);
+            float* ptr_sum = sum;
+            float* ptr_max = max;
+            int n = size * elempack;
+            while (n > 0)
+            {
+                word_type vl = vsetvl_e32m8(n);
+                vfloat32m8_t _p = vle32_v_f32m8(ptr, vl);
+                vfloat32m8_t _max = vle32_v_f32m8(ptr_max, vl);
+                vfloat32m8_t _sum = vle32_v_f32m8(ptr_sum, vl);
+                _p = exp_ps(vfsub_vv_f32m8(_p, _max, vl), vl);
+                _sum = vfadd_vv_f32m8(_sum, _p, vl);
+                vse32_v_f32m8(ptr, _p, vl);
+                vse32_v_f32m8(ptr_sum, _sum, vl);
+
+                n -= vl;
+                ptr += vl;
+                ptr_sum += vl;
+                ptr_max += vl;
+            }
+        }
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            float* ptr = bottom_top_blob.channel(q);
+            float* ptr_sum = sum;
+            int n = size * elempack;
+            while (n > 0)
+            {
+                word_type vl = vsetvl_e32m8(n);
+                vfloat32m8_t _p = vle32_v_f32m8(ptr, vl);
+                vfloat32m8_t _sum = vle32_v_f32m8(ptr_sum, vl);
+
+                _p = vfdiv_vv_f32m8(_p, _sum, vl);
+                vse32_v_f32m8(ptr, _p, vl);
+
+                ptr_sum += vl;
+                ptr += vl;
+                n -= vl;
+            }
+        }
+
+        return 0;
+    }
+
+    if (dims == 3 && positive_axis == 1)
+    {
+        int w = bottom_top_blob.w;
+        int h = bottom_top_blob.h;
+        int channels = bottom_top_blob.c;
+
+        Mat max;
+        max.create(w, channels, elemsize, opt.workspace_allocator);
+        if (max.empty())
+            return -100;
+        max.fill(-FLT_MAX);
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const float* ptr = bottom_top_blob.channel(q);
+            float* maxptr = max.row(q);
+
+            for (int i = 0; i < h; i++)
+            {
+                float* maxptr_vol = maxptr;
+                int n = w * elempack;
+                while (n > 0)
+                {
+                    word_type vl = vsetvl_e32m8(n);
+                    vfloat32m8_t _maxptr = vle32_v_f32m8(maxptr_vol, vl);
+                    vfloat32m8_t _p = vle32_v_f32m8(ptr, vl);
+
+                    _maxptr = vfmax_vv_f32m8(_maxptr, _p, vl);
+                    vse32_v_f32m8(maxptr_vol, _maxptr, vl);
+
+                    ptr += vl;
+                    maxptr_vol += vl;
+                    n -= vl;
+                }
+            }
+        }
+
+        Mat sum;
+        sum.create(w, channels, elemsize, opt.workspace_allocator);
+        if (sum.empty())
+            return -100;
+        sum.fill(0.f);
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            float* ptr = bottom_top_blob.channel(q);
+            float* maxptr = max.row(q);
+            float* sumptr = sum.row(q);
+
+            for (int i = 0; i < h; i++)
+            {
+                float* sumptr_vol = sumptr;
+                float* maxptr_vol = maxptr;
+                int n = w * elempack;
+
+                while (n)
+                {
+                    word_type vl = vsetvl_e32m8(n);
+                    vfloat32m8_t _p = vle32_v_f32m8(ptr, vl);
+                    vfloat32m8_t _maxptr = vle32_v_f32m8(maxptr_vol, vl);
+                    vfloat32m8_t _sumptr = vle32_v_f32m8(sumptr_vol, vl);
+
+                    _p = exp_ps(vfsub_vv_f32m8(_p, _maxptr, vl), vl);
+                    _sumptr = vfadd_vv_f32m8(_sumptr, _p, vl);
+
+                    vse32_v_f32m8(ptr, _p, vl);
+                    vse32_v_f32m8(sumptr_vol, _sumptr, vl);
+                    n -= vl;
+                    sumptr_vol += vl;
+                    maxptr_vol += vl;
+                    ptr += vl;
+                }
+            }
+        }
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            float* ptr = bottom_top_blob.channel(q);
+            float* sumptr = sum.row(q);
+
+            for (int i = 0; i < h; i++)
+            {
+                float* sumptr_vol = sumptr;
+                int n = w * elempack;
+                while (n > 0)
+                {
+                    word_type vl = vsetvl_e32m8(n);
+                    vfloat32m8_t _p = vle32_v_f32m8(ptr, vl);
+                    vfloat32m8_t _sumptr = vle32_v_f32m8(sumptr_vol, vl);
+
+                    _p = vfdiv_vv_f32m8(_p, _sumptr, vl);
+
+                    vse32_v_f32m8(ptr, _p, vl);
+                    n -= vl;
+                    sumptr_vol += vl;
+                    ptr += vl;
+                }
+            }
+        }
+
+        return 0;
+    }
+
+    if (dims == 3 && positive_axis == 2)
+    {
+        int w = bottom_top_blob.w;
+        int h = bottom_top_blob.h;
+        int channels = bottom_top_blob.c;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            float* ptr = bottom_top_blob.channel(q);
+
+            for (int i = 0; i < h; i++)
+            {
+                float max = -FLT_MAX;
+                int n1 = w * elempack;
+                float* ptr_1 = ptr;
+                while (n1 > 0)
+                {
+                    word_type vl = vsetvl_e32m8(n1);
+                    vfloat32m8_t _p = vle32_v_f32m8(ptr_1, vl);
+                    vfloat32m1_t _scalar_max = vfmv_s_f_f32m1(vundefined_f32m1(), max, vl);
+                    _scalar_max = vfredmax_vs_f32m8_f32m1(_scalar_max, _p, _scalar_max, vl);
+
+                    max = vfmv_f_s_f32m1_f32(_scalar_max);
+                    n1 -= vl;
+                    ptr_1 += vl;
+                }
+                ptr_1 = NULL;
+
+                float sum = 0.f;
+                int n2 = w * elempack;
+                float* ptr_2 = ptr;
+                while (n2 > 0)
+                {
+                    word_type vl = vsetvl_e32m8(n2);
+                    vfloat32m8_t _p = vle32_v_f32m8(ptr_2, vl);
+                    vfloat32m1_t _scalar_sum = vfmv_s_f_f32m1(vundefined_f32m1(), sum, vl);
+
+                    _p = exp_ps(vfsub_vf_f32m8(_p, max, vl), vl);
+                    _scalar_sum = vfredsum_vs_f32m8_f32m1(_scalar_sum, _p, _scalar_sum, vl);
+
+                    vse32_v_f32m8(ptr_2, _p, vl);
+                    sum = vfmv_f_s_f32m1_f32(_scalar_sum);
+                    n2 -= vl;
+                    ptr_2 += vl;
+                }
+                ptr_2 = NULL;
+
+                int n3 = w * elempack;
+                float* ptr_3 = ptr;
+                while (n3 > 0)
+                {
+                    word_type vl = vsetvl_e32m8(n3);
+                    vfloat32m8_t _p = vle32_v_f32m8(ptr_3, vl);
+
+                    _p = vfdiv_vf_f32m8(_p, sum, vl);
+
+                    vse32_v_f32m8(ptr_3, _p, vl);
+                    n3 -= vl;
+                    ptr_3 += vl;
+                }
+                ptr_3 = NULL;
+                ptr += w;
+            }
+        }
+
+        return 0;
+    }
+
+    return 0;
+#endif
+    return Softmax::forward_inplace(bottom_top_blob, opt);
+}
+
+} // namespace ncnn
\ No newline at end of file
diff --git a/src/layer/riscv/softmax_riscv.h b/src/layer/riscv/softmax_riscv.h
new file mode 100644
index 000000000..bb39b5e3b
--- /dev/null
+++ b/src/layer/riscv/softmax_riscv.h
@@ -0,0 +1,32 @@
+// Xavier Hsinyuan is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2021 Xavier Hsinyuan <me@lstlx.com>. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_SOFTMAX_RISCV_H
+#define LAYER_SOFTMAX_RISCV_H
+
+#include "softmax.h"
+
+namespace ncnn {
+
+class Softmax_riscv : virtual public Softmax
+{
+public:
+    Softmax_riscv();
+
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_SOFTMAX_RISCV_H