|
- // Tencent is pleased to support the open source community by making ncnn available.
- //
- // Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
- //
- // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
- // in compliance with the License. You may obtain a copy of the License at
- //
- // https://opensource.org/licenses/BSD-3-Clause
- //
- // Unless required by applicable law or agreed to in writing, software distributed
- // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
- // CONDITIONS OF ANY KIND, either express or implied. See the License for the
- // specific language governing permissions and limitations under the License.
-
- #include "deconvolutiondepthwise_riscv.h"
-
- #if __riscv_vector
- #include <riscv_vector.h>
- #endif // __riscv_vector
-
- #include "riscv_activation.h"
- #include "riscv_usability.h"
-
- namespace ncnn {
-
- #if NCNN_ZFH
- // Builds the fp16 blobs consumed by forward_fp16s() / forward_fp16sa().
- //
- // Depth-wise case (channels == group == num_output):
- //   * each group of maxk kernel taps is copied in reversed order,
- //   * with zvfh and a packn-divisible channel count the reversed weights are
- //     repacked to elempack = packn via convert_packing(),
- //   * the result is cast to fp16 into weight_data_tm and bias_data is cast to
- //     fp16 into bias_data_fp16.
- // Any other configuration is delegated to create_group_ops().
- //
- // opt: supplies use_packing_layout, lightmode and the allocators used by the
- //      packing / cast helpers.
- //
- // Returns 0 on success, -100 when a weight buffer could not be produced.
- int DeconvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt)
- {
- #if __riscv_zvfh
-     const int packn = csrr_vlenb() / 2;
- #endif // __riscv_zvfh
-
-     const int maxk = kernel_w * kernel_h;
-     int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
-
-     // depth-wise
-     if (channels == group && group == num_output)
-     {
-         int elempack = 1;
- #if __riscv_zvfh
-         if (opt.use_packing_layout)
-         {
-             elempack = channels % packn == 0 ? packn : 1;
-         }
- #endif // __riscv_zvfh
-
-         Mat weight_data_transposed(weight_data.w);
-         // the loop below writes through the raw pointer, so bail out on a failed allocation
-         if (weight_data_transposed.empty())
-             return -100;
-
-         {
-             float* pt = weight_data_transposed;
-             const float* p = weight_data;
-
-             const int kernel_count = (channels / group) * (num_output / group) * group;
-             for (int i = 0; i < kernel_count; i++)
-             {
-                 // reverse the tap order inside this kernel
-                 for (int k = 0; k < maxk; k++)
-                 {
-                     pt[maxk - 1 - k] = p[k];
-                 }
-
-                 p += maxk;
-                 pt += maxk;
-             }
-         }
-
- #if __riscv_zvfh
-         // packn
-         if (elempack == packn)
-         {
-             Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group);
-             Mat weight_data_r2_packed;
-             convert_packing(weight_data_r2, weight_data_r2_packed, packn, opt);
-             if (weight_data_r2_packed.empty())
-                 return -100;
-
-             ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_tm, opt);
-         }
- #endif // __riscv_zvfh
-
-         if (elempack == 1)
-         {
-             ncnn::cast_float32_to_float16(weight_data_transposed, weight_data_tm, opt);
-         }
-
-         // elempack is either 1 or packn here, so one of the casts above must have filled weight_data_tm
-         if (weight_data_tm.empty())
-             return -100;
-
-         ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
-
-         if (opt.lightmode)
-             weight_data.release();
-
-         return 0;
-     }
-
-     // group convolution
-     // NOTE(review): the declaration of create_group_ops() is not visible here; if it
-     // reports a status, it should be propagated to the caller.
-     create_group_ops(opt);
-
-     if (opt.lightmode)
-         weight_data.release();
-
-     return 0;
- }
-
- // Depth-wise / grouped deconvolution on fp16-stored blobs with fp32 accumulation.
- //
- // bottom_blob: input; its elements are read as __fp16 (elempack 1, or packn with zvfh).
- // top_blob:    output; filled by cut_padding() from the internally computed blob.
- // opt:         supplies allocators, num_threads and use_packing_layout.
- //
- // Depth-wise case (channels * elempack == group == num_output) is computed here;
- // every other grouping is forwarded to the per-group layers in group_ops.
- //
- // Returns 0 on success, -100 when an allocation / packing conversion yields an
- // empty Mat, or the non-zero status returned by a per-group layer.
- int DeconvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
- {
- #if __riscv_zvfh
-     const int packn = csrr_vlenb() / 2;
-     const size_t vl = __riscv_vsetvl_e16m1(packn);
- #endif // __riscv_zvfh
-
-     int w = bottom_blob.w;
-     int h = bottom_blob.h;
-     int channels = bottom_blob.c;
-     size_t elemsize = bottom_blob.elemsize;
-     int elempack = bottom_blob.elempack;
-
-     const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
-     const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
-
-     // extent of the blob computed here, before cut_padding() is applied
-     int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right;
-     int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom;
-     int out_elempack = 1;
- #if __riscv_zvfh
-     if (opt.use_packing_layout)
-     {
-         out_elempack = num_output % packn == 0 ? packn : 1;
-     }
- #endif // __riscv_zvfh
-     size_t out_elemsize = elemsize / elempack * out_elempack;
-
-     Mat top_blob_bordered;
-     if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0))
-     {
-         // intermediate result, allocated from the workspace allocator
-         top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
-     }
-     else
-     {
-         // no padding / explicit output size configured: share top_blob's handle and use the blob allocator
-         top_blob_bordered = top_blob;
-         top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
-     }
-     if (top_blob_bordered.empty())
-         return -100;
-
-     const int maxk = kernel_w * kernel_h;
-
-     // depth-wise
-     if (channels * elempack == group && group == num_output)
-     {
- #if __riscv_zvfh
-         if (elempack == packn)
-         {
-             {
-                 #pragma omp parallel for num_threads(opt.num_threads)
-                 for (int g = 0; g < channels; g++)
-                 {
-                     __fp16* outptr = top_blob_bordered.channel(g);
-                     const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * packn;
-                     const Mat m = bottom_blob.channel(g);
-
-                     for (int i = 0; i < outh; i++)
-                     {
-                         for (int j = 0; j < outw; j++)
-                         {
-                             // fp32 accumulator, seeded with the fp32 bias when present
-                             vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl);
-
-                             if (bias_term)
-                             {
-                                 _sum = __riscv_vle32_v_f32m2((const float*)bias_data + g * packn, vl);
-                             }
-
-                             for (int y = 0; y < kernel_h; y++)
-                             {
-                                 // a tap contributes only if it maps onto an input row:
-                                 // non-negative, a multiple of the stride, and below h
-                                 int sys = (i + y * dilation_h - (kernel_extent_h - 1));
-                                 if (sys < 0 || sys % stride_h != 0)
-                                     continue;
-
-                                 int sy = sys / stride_h;
-                                 if (sy >= h)
-                                     continue;
-
-                                 for (int x = 0; x < kernel_w; x++)
-                                 {
-                                     // same test along the width
-                                     int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
-                                     if (sxs < 0 || sxs % stride_w != 0)
-                                         continue;
-
-                                     int sx = sxs / stride_w;
-                                     if (sx >= w)
-                                         continue;
-
-                                     const __fp16* sptr = m.row<const __fp16>(sy) + sx * packn;
-
-                                     int k = y * kernel_w + x;
-
-                                     // widening multiply-accumulate: fp16 * fp16 -> fp32
-                                     vfloat16m1_t _val = __riscv_vle16_v_f16m1(sptr, vl);
-                                     vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr + k * packn, vl);
-                                     _sum = __riscv_vfwmacc_vv_f32m2(_sum, _val, _w, vl);
-                                 }
-                             }
-
-                             _sum = activation_ps(_sum, activation_type, activation_params, vl);
-
-                             // narrow back to fp16 for storage
-                             __riscv_vse16_v_f16m1(outptr + j * packn, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl);
-                         }
-
-                         outptr += outw * packn;
-                     }
-                 }
-             }
-         }
- #endif // __riscv_zvfh
-
-         if (elempack == 1)
-         {
-             {
-                 #pragma omp parallel for num_threads(opt.num_threads)
-                 for (int g = 0; g < channels; g++)
-                 {
-                     __fp16* outptr = top_blob_bordered.channel(g);
-                     const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g;
-                     const Mat m = bottom_blob.channel(g);
-
-                     for (int i = 0; i < outh; i++)
-                     {
-                         for (int j = 0; j < outw; j++)
-                         {
-                             float sum = 0.f;
-
-                             if (bias_term)
-                             {
-                                 sum = bias_data[g];
-                             }
-
-                             for (int y = 0; y < kernel_h; y++)
-                             {
-                                 // skip taps that do not map onto an input row
-                                 int sys = (i + y * dilation_h - (kernel_extent_h - 1));
-                                 if (sys < 0 || sys % stride_h != 0)
-                                     continue;
-
-                                 int sy = sys / stride_h;
-                                 if (sy >= h)
-                                     continue;
-
-                                 const __fp16* sptr = m.row<const __fp16>(sy);
-
-                                 for (int x = 0; x < kernel_w; x++)
-                                 {
-                                     // skip taps that do not map onto an input column
-                                     int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
-                                     if (sxs < 0 || sxs % stride_w != 0)
-                                         continue;
-
-                                     int sx = sxs / stride_w;
-                                     if (sx >= w)
-                                         continue;
-
-                                     // operands are widened to float before the multiply
-                                     float val = (float)sptr[sx];
-
-                                     int k = y * kernel_w + x;
-
-                                     float w = (float)kptr[k];
-
-                                     sum += val * w;
-                                 }
-                             }
-
-                             sum = activation_ss(sum, activation_type, activation_params);
-
-                             outptr[j] = (__fp16)sum;
-                         }
-
-                         outptr += outw;
-                     }
-                 }
-             }
-         }
-     }
-     else
-     {
-         // group deconvolution
-         const int channels_g = channels * elempack / group;
-         const int num_output_g = num_output / group;
-
-         int g_elempack = 1;
-         int out_g_elempack = 1;
- #if __riscv_zvfh
-         if (opt.use_packing_layout)
-         {
-             g_elempack = channels_g % packn == 0 ? packn : 1;
-             out_g_elempack = num_output_g % packn == 0 ? packn : 1;
-         }
- #endif // __riscv_zvfh
-
-         // unpacking
-         // convert to g_elempack (not a literal 1) so the layout always matches the
-         // channel_range() arithmetic below
-         Mat bottom_blob_unpacked = bottom_blob;
-         if (elempack > g_elempack)
-         {
-             Option opt_p = opt;
-             opt_p.blob_allocator = opt.workspace_allocator;
-             convert_packing(bottom_blob, bottom_blob_unpacked, g_elempack, opt_p);
-             if (bottom_blob_unpacked.empty())
-                 return -100;
-         }
-
-         Mat top_blob_bordered_unpacked = top_blob_bordered;
-         if (out_g_elempack < out_elempack)
-         {
-             top_blob_bordered_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator);
-             if (top_blob_bordered_unpacked.empty())
-                 return -100;
-         }
-
-         for (int g = 0; g < group; g++)
-         {
-             const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
-             Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);
-
-             const ncnn::Layer* op = group_ops[g];
-
-             Option opt_g = opt;
-             opt_g.blob_allocator = top_blob_bordered_unpacked.allocator;
-
-             // forward, propagating a failing sub-layer instead of silently continuing
-             int ret = op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
-             if (ret != 0)
-                 return ret;
-         }
-
-         // packing
-         if (out_g_elempack < out_elempack)
-         {
-             convert_packing(top_blob_bordered_unpacked, top_blob_bordered, out_elempack, opt);
-             if (top_blob_bordered.empty())
-                 return -100;
-         }
-         else
-         {
-             top_blob_bordered = top_blob_bordered_unpacked;
-         }
-     }
-
-     cut_padding(top_blob_bordered, top_blob, opt);
-     if (top_blob.empty())
-         return -100;
-
-     return 0;
- }
-
- // Depth-wise / grouped deconvolution on fp16-stored blobs. The packn path
- // accumulates in fp16 vectors; the elempack == 1 path accumulates in a float.
- //
- // bottom_blob: input; its elements are read as __fp16 (elempack 1, or packn with zvfh).
- // top_blob:    output; filled by cut_padding() from the internally computed blob.
- // opt:         supplies allocators, num_threads and use_packing_layout.
- //
- // Depth-wise case (channels * elempack == group == num_output) is computed here;
- // every other grouping is forwarded to the per-group layers in group_ops.
- //
- // Returns 0 on success, -100 when an allocation / packing conversion yields an
- // empty Mat, or the non-zero status returned by a per-group layer.
- int DeconvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
- {
- #if __riscv_zvfh
-     const int packn = csrr_vlenb() / 2;
-     const size_t vl = __riscv_vsetvl_e16m1(packn);
- #endif // __riscv_zvfh
-
-     int w = bottom_blob.w;
-     int h = bottom_blob.h;
-     int channels = bottom_blob.c;
-     size_t elemsize = bottom_blob.elemsize;
-     int elempack = bottom_blob.elempack;
-
-     const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
-     const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
-
-     // extent of the blob computed here, before cut_padding() is applied
-     int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right;
-     int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom;
-     int out_elempack = 1;
- #if __riscv_zvfh
-     if (opt.use_packing_layout)
-     {
-         out_elempack = num_output % packn == 0 ? packn : 1;
-     }
- #endif // __riscv_zvfh
-     size_t out_elemsize = elemsize / elempack * out_elempack;
-
-     Mat top_blob_bordered;
-     if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0))
-     {
-         // intermediate result, allocated from the workspace allocator
-         top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
-     }
-     else
-     {
-         // no padding / explicit output size configured: share top_blob's handle and use the blob allocator
-         top_blob_bordered = top_blob;
-         top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
-     }
-     if (top_blob_bordered.empty())
-         return -100;
-
-     const int maxk = kernel_w * kernel_h;
-
-     // depth-wise
-     if (channels * elempack == group && group == num_output)
-     {
- #if __riscv_zvfh
-         if (elempack == packn)
-         {
-             {
-                 #pragma omp parallel for num_threads(opt.num_threads)
-                 for (int g = 0; g < channels; g++)
-                 {
-                     __fp16* outptr = top_blob_bordered.channel(g);
-                     const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * packn;
-                     const Mat m = bottom_blob.channel(g);
-
-                     for (int i = 0; i < outh; i++)
-                     {
-                         for (int j = 0; j < outw; j++)
-                         {
-                             // fp16 accumulator, seeded with the fp16 bias when present
-                             vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);
-
-                             if (bias_term)
-                             {
-                                 _sum = __riscv_vle16_v_f16m1((const __fp16*)bias_data_fp16 + g * packn, vl);
-                             }
-
-                             for (int y = 0; y < kernel_h; y++)
-                             {
-                                 // a tap contributes only if it maps onto an input row:
-                                 // non-negative, a multiple of the stride, and below h
-                                 int sys = (i + y * dilation_h - (kernel_extent_h - 1));
-                                 if (sys < 0 || sys % stride_h != 0)
-                                     continue;
-
-                                 int sy = sys / stride_h;
-                                 if (sy >= h)
-                                     continue;
-
-                                 for (int x = 0; x < kernel_w; x++)
-                                 {
-                                     // same test along the width
-                                     int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
-                                     if (sxs < 0 || sxs % stride_w != 0)
-                                         continue;
-
-                                     int sx = sxs / stride_w;
-                                     if (sx >= w)
-                                         continue;
-
-                                     const __fp16* sptr = m.row<const __fp16>(sy) + sx * packn;
-
-                                     int k = y * kernel_w + x;
-
-                                     vfloat16m1_t _val = __riscv_vle16_v_f16m1(sptr, vl);
-                                     vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr + k * packn, vl);
-                                     _sum = __riscv_vfmacc_vv_f16m1(_sum, _val, _w, vl);
-                                 }
-                             }
-
-                             _sum = activation_ps(_sum, activation_type, activation_params, vl);
-
-                             __riscv_vse16_v_f16m1(outptr + j * packn, _sum, vl);
-                         }
-
-                         outptr += outw * packn;
-                     }
-                 }
-             }
-         }
- #endif // __riscv_zvfh
-
-         if (elempack == 1)
-         {
-             {
-                 #pragma omp parallel for num_threads(opt.num_threads)
-                 for (int g = 0; g < channels; g++)
-                 {
-                     __fp16* outptr = top_blob_bordered.channel(g);
-                     const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g;
-                     const Mat m = bottom_blob.channel(g);
-
-                     for (int i = 0; i < outh; i++)
-                     {
-                         for (int j = 0; j < outw; j++)
-                         {
-                             float sum = 0.f;
-
-                             if (bias_term)
-                             {
-                                 sum = bias_data[g];
-                             }
-
-                             for (int y = 0; y < kernel_h; y++)
-                             {
-                                 // skip taps that do not map onto an input row
-                                 int sys = (i + y * dilation_h - (kernel_extent_h - 1));
-                                 if (sys < 0 || sys % stride_h != 0)
-                                     continue;
-
-                                 int sy = sys / stride_h;
-                                 if (sy >= h)
-                                     continue;
-
-                                 const __fp16* sptr = m.row<const __fp16>(sy);
-
-                                 for (int x = 0; x < kernel_w; x++)
-                                 {
-                                     // skip taps that do not map onto an input column
-                                     int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
-                                     if (sxs < 0 || sxs % stride_w != 0)
-                                         continue;
-
-                                     int sx = sxs / stride_w;
-                                     if (sx >= w)
-                                         continue;
-
-                                     __fp16 val = sptr[sx];
-
-                                     int k = y * kernel_w + x;
-
-                                     __fp16 w = kptr[k];
-
-                                     sum += val * w;
-                                 }
-                             }
-
-                             sum = activation_ss(sum, activation_type, activation_params);
-
-                             outptr[j] = (__fp16)sum;
-                         }
-
-                         outptr += outw;
-                     }
-                 }
-             }
-         }
-     }
-     else
-     {
-         // group deconvolution
-         const int channels_g = channels * elempack / group;
-         const int num_output_g = num_output / group;
-
-         int g_elempack = 1;
-         int out_g_elempack = 1;
- #if __riscv_zvfh
-         if (opt.use_packing_layout)
-         {
-             g_elempack = channels_g % packn == 0 ? packn : 1;
-             out_g_elempack = num_output_g % packn == 0 ? packn : 1;
-         }
- #endif // __riscv_zvfh
-
-         // unpacking
-         Mat bottom_blob_unpacked = bottom_blob;
-         if (elempack > g_elempack)
-         {
-             Option opt_p = opt;
-             opt_p.blob_allocator = opt.workspace_allocator;
-             convert_packing(bottom_blob, bottom_blob_unpacked, g_elempack, opt_p);
-             // an empty result means the conversion could not produce its output
-             if (bottom_blob_unpacked.empty())
-                 return -100;
-         }
-
-         Mat top_blob_bordered_unpacked = top_blob_bordered;
-         if (out_g_elempack < out_elempack)
-         {
-             top_blob_bordered_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator);
-             if (top_blob_bordered_unpacked.empty())
-                 return -100;
-         }
-
-         for (int g = 0; g < group; g++)
-         {
-             const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
-             Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);
-
-             const ncnn::Layer* op = group_ops[g];
-
-             Option opt_g = opt;
-             opt_g.blob_allocator = top_blob_bordered_unpacked.allocator;
-
-             // forward, propagating a failing sub-layer instead of silently continuing
-             int ret = op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
-             if (ret != 0)
-                 return ret;
-         }
-
-         // packing
-         if (out_g_elempack < out_elempack)
-         {
-             convert_packing(top_blob_bordered_unpacked, top_blob_bordered, out_elempack, opt);
-             if (top_blob_bordered.empty())
-                 return -100;
-         }
-         else
-         {
-             top_blob_bordered = top_blob_bordered_unpacked;
-         }
-     }
-
-     cut_padding(top_blob_bordered, top_blob, opt);
-     if (top_blob.empty())
-         return -100;
-
-     return 0;
- }
- #endif // NCNN_ZFH
-
- } // namespace ncnn
|