Co-authored-by: MegEngine <megengine@megvii.com> Co-authored-by: tpoisonooo <tpoisonooo@users.noreply.github.com>tags/20220420
| @@ -255,8 +255,11 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| "w"(_k46474849) // %31 | |||
| : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v9"); | |||
| } | |||
| #else // __ARM_NEON && __aarch64__ defined, but __clang__ not defined \ | |||
| // When compiled with gcc, gcc does not accept over 30 operands | |||
| #else | |||
| /** | |||
| * __ARM_NEON && __aarch64__ defined, but __clang__ not defined | |||
| * When compiled with gcc, gcc does not accept over 30 operands | |||
| */ | |||
| for (; nn > 0; nn--) | |||
| { | |||
| float32x4_t _sum = vld1q_f32(outptr); | |||
| @@ -948,8 +951,11 @@ static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| "w"(_k46474849) // %31 | |||
| : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v9"); | |||
| } | |||
| #else // __ARM_NEON && __aarch64__ defined, but __clang__ not defined \ | |||
| // When compiled with gcc, gcc does not accept over 30 operands | |||
| #else | |||
| /** | |||
| * __ARM_NEON && __aarch64__ defined, but __clang__ not defined | |||
| * When compiled with gcc, gcc does not accept over 30 operands | |||
| */ | |||
| for (; nn > 0; nn--) | |||
| { | |||
| float32x4_t _sum = vld1q_f32(outptr); | |||
| @@ -140,8 +140,6 @@ int Crop_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| int elempack = bottom_blob.elempack; | |||
| int elembits = bottom_blob.elembits(); | |||
| #if __ARM_NEON | |||
| if (elempack == 8) | |||
| { | |||
| @@ -461,8 +459,6 @@ int Crop_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| int elempack = bottom_blob.elempack; | |||
| int elembits = bottom_blob.elembits(); | |||
| int ref_elempack = reference_blob.elempack; | |||
| Mat& top_blob = top_blobs[0]; | |||
| @@ -66,100 +66,6 @@ static void deconv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ | |||
| { | |||
| float32x4_t _v = vld1q_f32(r0); | |||
| #if 0 // bad compiler generate slow instructions :( \ | |||
| // 0 | |||
| float32x4_t _out00 = vld1q_f32(outptr0 + 0); | |||
| _out00 = vmlaq_lane_f32(_out00, _v, vget_low_f32(_k0), 0); | |||
| float32x4_t _out01 = vmulq_lane_f32(_v, vget_low_f32(_k0), 1); | |||
| // ext | |||
| float32x4_t _zero_out01 = vdupq_n_f32(0.f); | |||
| _zero_out01 = vextq_f32(_zero_out01, _out01, 3); | |||
| _out00 = vaddq_f32(_out00, _zero_out01); | |||
| // | |||
| float32x2_t _out00low = vget_low_f32(_out00); | |||
| float32x2_t _out00high = vget_high_f32(_out00); | |||
| _out00high = vmla_lane_f32(_out00high, vget_low_f32(_v), vget_high_f32(_k0), 0); | |||
| _out00 = vcombine_f32(_out00low, _out00high); | |||
| vst1q_f32(outptr0 + 0, _out00); | |||
| // | |||
| float32x2_t _out02high = vld1_f32(outptr0 + 4); | |||
| float32x2_t _out01_zero = vext_f32(vget_high_f32(_out01), vget_low_f32(_zero_out01), 1); | |||
| _out02high = vadd_f32(_out02high, _out01_zero); | |||
| _out02high = vmla_lane_f32(_out02high, vget_high_f32(_v), vget_high_f32(_k0), 0); | |||
| vst1_f32(outptr0 + 4, _out02high); | |||
| // 1 | |||
| float32x4_t _out10 = vld1q_f32(outptr1 + 0); | |||
| _out10 = vmlaq_lane_f32(_out10, _v, vget_low_f32(_k1), 0); | |||
| float32x4_t _out11 = vmulq_lane_f32(_v, vget_low_f32(_k1), 1); | |||
| // ext | |||
| float32x4_t _zero_out11 = vdupq_n_f32(0.f); | |||
| _zero_out11 = vextq_f32(_zero_out11, _out11, 3); | |||
| _out10 = vaddq_f32(_out10, _zero_out11); | |||
| // | |||
| float32x2_t _out10low = vget_low_f32(_out10); | |||
| float32x2_t _out10high = vget_high_f32(_out10); | |||
| _out10high = vmla_lane_f32(_out10high, vget_low_f32(_v), vget_high_f32(_k1), 0); | |||
| _out10 = vcombine_f32(_out10low, _out10high); | |||
| vst1q_f32(outptr1 + 0, _out10); | |||
| // | |||
| float32x2_t _out12high = vld1_f32(outptr1 + 4); | |||
| float32x2_t _out11_zero = vext_f32(vget_high_f32(_out11), vget_low_f32(_zero_out11), 1); | |||
| _out12high = vadd_f32(_out12high, _out11_zero); | |||
| _out12high = vmla_lane_f32(_out12high, vget_high_f32(_v), vget_high_f32(_k1), 0); | |||
| vst1_f32(outptr1 + 4, _out12high); | |||
| // 2 | |||
| float32x4_t _out20 = vld1q_f32(outptr2 + 0); | |||
| _out20 = vmlaq_lane_f32(_out20, _v, vget_low_f32(_k2), 0); | |||
| float32x4_t _out21 = vmulq_lane_f32(_v, vget_low_f32(_k2), 1); | |||
| // ext | |||
| float32x4_t _zero_out21 = vdupq_n_f32(0.f); | |||
| _zero_out21 = vextq_f32(_zero_out21, _out21, 3); | |||
| _out20 = vaddq_f32(_out20, _zero_out21); | |||
| // | |||
| float32x2_t _out20low = vget_low_f32(_out20); | |||
| float32x2_t _out20high = vget_high_f32(_out20); | |||
| _out20high = vmla_lane_f32(_out20high, vget_low_f32(_v), vget_high_f32(_k2), 0); | |||
| _out20 = vcombine_f32(_out20low, _out20high); | |||
| vst1q_f32(outptr2 + 0, _out20); | |||
| // | |||
| float32x2_t _out22high = vld1_f32(outptr2 + 4); | |||
| float32x2_t _out21_zero = vext_f32(vget_high_f32(_out21), vget_low_f32(_zero_out21), 1); | |||
| _out22high = vadd_f32(_out22high, _out21_zero); | |||
| _out22high = vmla_lane_f32(_out22high, vget_high_f32(_v), vget_high_f32(_k2), 0); | |||
| vst1_f32(outptr2 + 4, _out22high); | |||
| #else | |||
| // | |||
| float32x4_t _out00 = vld1q_f32(outptr0 + 0); | |||
| _out00 = vmlaq_lane_f32(_out00, _v, vget_low_f32(_k0), 0); | |||
| @@ -198,7 +104,6 @@ static void deconv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ | |||
| float32x4_t _out22 = vld1q_f32(outptr2 + 2); | |||
| _out22 = vmlaq_lane_f32(_out22, _v, vget_high_f32(_k2), 0); | |||
| vst1q_f32(outptr2 + 2, _out22); | |||
| #endif | |||
| r0 += 4; | |||
| outptr0 += 4; | |||
| @@ -67,7 +67,7 @@ int Convolution1D::load_model(const ModelBin& mb) | |||
| return 0; | |||
| } | |||
| int Convolution1D::create_pipeline(const Option& opt) | |||
| int Convolution1D::create_pipeline(const Option&) | |||
| { | |||
| if (dynamic_weight) | |||
| return 0; | |||
| @@ -73,7 +73,7 @@ int ConvolutionDepthWise1D::load_model(const ModelBin& mb) | |||
| return 0; | |||
| } | |||
| int ConvolutionDepthWise1D::create_pipeline(const Option& opt) | |||
| int ConvolutionDepthWise1D::create_pipeline(const Option&) | |||
| { | |||
| return 0; | |||
| } | |||
| @@ -67,12 +67,9 @@ int Deconvolution::load_model(const ModelBin& mb) | |||
| static int deconvolution(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int kernel_h, int stride_w, int stride_h, int dilation_w, int dilation_h, int activation_type, const Mat& activation_params, const Option& opt) | |||
| { | |||
| const int w = bottom_blob.w; | |||
| const int h = bottom_blob.h; | |||
| const int inch = bottom_blob.c; | |||
| const int outw = top_blob.w; | |||
| const int outh = top_blob.h; | |||
| const int outch = top_blob.c; | |||
| const int bias_term = bias_data.empty() ? 0 : 1; | |||
| @@ -74,14 +74,10 @@ int Deconvolution3D::load_model(const ModelBin& mb) | |||
| static int deconvolution3d(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int kernel_h, int kernel_d, int stride_w, int stride_h, int stride_d, int dilation_w, int dilation_h, int dilation_d, int activation_type, const Mat& activation_params, const Option& opt) | |||
| { | |||
| const int w = bottom_blob.w; | |||
| const int h = bottom_blob.h; | |||
| const int d = bottom_blob.d; | |||
| const int inch = bottom_blob.c; | |||
| const int outw = top_blob.w; | |||
| const int outh = top_blob.h; | |||
| const int outd = top_blob.d; | |||
| const int outch = top_blob.c; | |||
| const int bias_term = bias_data.empty() ? 0 : 1; | |||
| @@ -68,12 +68,9 @@ int DeconvolutionDepthWise::load_model(const ModelBin& mb) | |||
| static int deconvolutiondepthwise(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int kernel_h, int stride_w, int stride_h, int dilation_w, int dilation_h, int group, int activation_type, const Mat& activation_params, const Option& opt) | |||
| { | |||
| const int w = bottom_blob.w; | |||
| const int h = bottom_blob.h; | |||
| const int inch = bottom_blob.c; | |||
| const int outw = top_blob.w; | |||
| const int outh = top_blob.h; | |||
| const int outch = top_blob.c; | |||
| const int bias_term = bias_data.empty() ? 0 : 1; | |||
| @@ -75,14 +75,10 @@ int DeconvolutionDepthWise3D::load_model(const ModelBin& mb) | |||
| static int deconvolutiondepthwise3d(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int kernel_h, int kernel_d, int stride_w, int stride_h, int stride_d, int dilation_w, int dilation_h, int dilation_d, int group, int activation_type, const Mat& activation_params, const Option& opt) | |||
| { | |||
| const int w = bottom_blob.w; | |||
| const int h = bottom_blob.h; | |||
| const int d = bottom_blob.d; | |||
| const int inch = bottom_blob.c; | |||
| const int outw = top_blob.w; | |||
| const int outh = top_blob.h; | |||
| const int outd = top_blob.d; | |||
| const int outch = top_blob.c; | |||
| const int bias_term = bias_data.empty() ? 0 : 1; | |||