| @@ -12,21 +12,18 @@ | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #if !(__ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD) | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8 | |||
| void convolution_transform_kernel_packed_int8_i8mm(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h); | |||
| void convolution_packed_int8_i8mm(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt); | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8 | |||
| void convolution_transform_kernel_packed_int8_asimddp(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h); | |||
| void convolution_packed_int8_asimddp(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt); | |||
| #endif | |||
| #endif | |||
| static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) | |||
| { | |||
| #if !(__ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD) | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8 | |||
| if (ncnn::cpu_support_arm_i8mm()) | |||
| { | |||
| @@ -35,13 +32,12 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker | |||
| } | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8 | |||
| if (ncnn::cpu_support_arm_asimddp()) | |||
| { | |||
| convolution_transform_kernel_packed_int8_asimddp(kernel, kernel_tm, inch, outch, kernel_w, kernel_h); | |||
| return; | |||
| } | |||
| #endif | |||
| #endif | |||
| const int maxk = kernel_w * kernel_h; | |||
| @@ -531,7 +527,6 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker | |||
| static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) | |||
| { | |||
| #if !(__ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD) | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8 | |||
| if (ncnn::cpu_support_arm_i8mm()) | |||
| { | |||
| @@ -540,13 +535,12 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const | |||
| } | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8 | |||
| if (ncnn::cpu_support_arm_asimddp()) | |||
| { | |||
| convolution_packed_int8_asimddp(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| return; | |||
| } | |||
| #endif | |||
| #endif | |||
| const int w = bottom_blob.w; | |||
| @@ -12,23 +12,20 @@ | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #if !(__ARM_FEATURE_FP16_FML || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC) | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM82FP16FML && __aarch64__ && !__ARM_FEATURE_FP16_FML | |||
| void innerproduct_pack4_fp16s_neon_asimdfhm(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt); | |||
| void innerproduct_fp16s_neon_asimdfhm(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt); | |||
| void innerproduct_transform_kernel_fp16s_neon_asimdfhm(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, const Option& opt); | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC && !__ARM_FEATURE_FP16_FML | |||
| void innerproduct_pack4_fp16s_neon_asimdhp(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt); | |||
| void innerproduct_fp16s_neon_asimdhp(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt); | |||
| void innerproduct_transform_kernel_fp16s_neon_asimdhp(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, const Option& opt); | |||
| #endif | |||
| #endif | |||
| static void innerproduct_pack4_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt) | |||
| { | |||
| #if !(__ARM_FEATURE_FP16_FML || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC) | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM82FP16FML && __aarch64__ && !__ARM_FEATURE_FP16_FML | |||
| if (ncnn::cpu_support_arm_asimdfhm()) | |||
| { | |||
| @@ -37,13 +34,12 @@ static void innerproduct_pack4_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, | |||
| } | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC && !__ARM_FEATURE_FP16_FML | |||
| if (ncnn::cpu_support_arm_asimdhp()) | |||
| { | |||
| innerproduct_pack4_fp16s_neon_asimdhp(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt); | |||
| return; | |||
| } | |||
| #endif | |||
| #endif | |||
| const int num_input = bottom_blob.w * bottom_blob.elempack; | |||
| @@ -294,7 +290,6 @@ static void innerproduct_pack4_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, | |||
| static void innerproduct_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt) | |||
| { | |||
| #if !(__ARM_FEATURE_FP16_FML || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC) | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM82FP16FML && __aarch64__ && !__ARM_FEATURE_FP16_FML | |||
| if (ncnn::cpu_support_arm_asimdfhm()) | |||
| { | |||
| @@ -303,13 +298,12 @@ static void innerproduct_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, const | |||
| } | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC && !__ARM_FEATURE_FP16_FML | |||
| if (ncnn::cpu_support_arm_asimdhp()) | |||
| { | |||
| innerproduct_fp16s_neon_asimdhp(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt); | |||
| return; | |||
| } | |||
| #endif | |||
| #endif | |||
| const int num_input = bottom_blob.w * bottom_blob.elempack; | |||
| @@ -516,7 +510,6 @@ static void innerproduct_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, const | |||
| static void innerproduct_transform_kernel_fp16s_neon(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, const Option& opt) | |||
| { | |||
| #if !(__ARM_FEATURE_FP16_FML || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC) | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM82FP16FML && __aarch64__ && !__ARM_FEATURE_FP16_FML | |||
| if (ncnn::cpu_support_arm_asimdfhm()) | |||
| { | |||
| @@ -525,13 +518,12 @@ static void innerproduct_transform_kernel_fp16s_neon(const Mat& weight_data, Mat | |||
| } | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC && !__ARM_FEATURE_FP16_FML | |||
| if (ncnn::cpu_support_arm_asimdhp()) | |||
| { | |||
| innerproduct_transform_kernel_fp16s_neon_asimdhp(weight_data, weight_data_tm, num_input, num_output, opt); | |||
| return; | |||
| } | |||
| #endif | |||
| #endif | |||
| int out_elempack = 1; | |||
| @@ -12,19 +12,16 @@ | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #if !(__ARM_FEATURE_FP16_FML || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC) | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM82FP16FML && __aarch64__ && !__ARM_FEATURE_FP16_FML | |||
| void innerproduct_gemm_fp16s_neon_asimdfhm(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt); | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC && !__ARM_FEATURE_FP16_FML | |||
| void innerproduct_gemm_fp16s_neon_asimdhp(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt); | |||
| #endif | |||
| #endif | |||
| static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt) | |||
| { | |||
| #if !(__ARM_FEATURE_FP16_FML || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC) | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM82FP16FML && __aarch64__ && !__ARM_FEATURE_FP16_FML | |||
| if (ncnn::cpu_support_arm_asimdfhm()) | |||
| { | |||
| @@ -33,13 +30,12 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, | |||
| } | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC && !__ARM_FEATURE_FP16_FML | |||
| if (ncnn::cpu_support_arm_asimdhp()) | |||
| { | |||
| innerproduct_gemm_fp16s_neon_asimdhp(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt); | |||
| return; | |||
| } | |||
| #endif | |||
| #endif | |||
| const int num_input = bottom_blob.w; | |||
| @@ -17,7 +17,7 @@ void cast_fp32_to_bf16_sse_avx512bf16(const Mat& bottom_blob, Mat& top_blob, con | |||
| void cast_bf16_to_fp32_sse_avx512bf16(const Mat& bottom_blob, Mat& top_blob, const Option& opt); | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVX512BF16__ | |||
| void cast_fp32_to_bf16_sse_avx2(const Mat& bottom_blob, Mat& top_blob, const Option& opt); | |||
| void cast_bf16_to_fp32_sse_avx2(const Mat& bottom_blob, Mat& top_blob, const Option& opt); | |||
| #endif | |||
| @@ -32,7 +32,7 @@ static void cast_fp32_to_bf16_sse(const Mat& bottom_blob, Mat& top_blob, const O | |||
| } | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVX512BF16__ | |||
| if (ncnn::cpu_support_x86_avx2()) | |||
| { | |||
| cast_fp32_to_bf16_sse_avx2(bottom_blob, top_blob, opt); | |||
| @@ -104,7 +104,7 @@ static void cast_bf16_to_fp32_sse(const Mat& bottom_blob, Mat& top_blob, const O | |||
| } | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVX512BF16__ | |||
| if (ncnn::cpu_support_x86_avx2()) | |||
| { | |||
| cast_bf16_to_fp32_sse_avx2(bottom_blob, top_blob, opt); | |||
| @@ -12,29 +12,27 @@ | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__) | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX512VNNI && __AVX512F__ && !__AVX512VNNI__ | |||
| void conv3x3s1_winograd23_int8_avx512vnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt); | |||
| void conv3x3s1_winograd43_int8_avx512vnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt); | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__ | |||
| void conv3x3s1_winograd23_int8_avxvnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt); | |||
| void conv3x3s1_winograd43_int8_avxvnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt); | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__ | |||
| void conv3x3s1_winograd23_transform_kernel_int8_avx2(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt); | |||
| void conv3x3s1_winograd23_int8_avx2(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt); | |||
| void conv3x3s1_winograd43_transform_kernel_int8_avx2(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt); | |||
| void conv3x3s1_winograd43_int8_avx2(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt); | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ | |||
| #if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__ | |||
| void conv3x3s1_winograd23_int8_xop(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt); | |||
| void conv3x3s1_winograd43_int8_xop(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt); | |||
| #endif | |||
| #endif | |||
| static void pack_A_tile_int8(const Mat& A, Mat& AT, int batch, int max_ii, int max_kk) | |||
| { | |||
| @@ -3430,14 +3428,12 @@ static inline void conv3x3s1_winograd23_transform_kernel_tile_int8(const Mat& ke | |||
| static void conv3x3s1_winograd23_transform_kernel_int8(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt) | |||
| { | |||
| #if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__) | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__ | |||
| if (ncnn::cpu_support_x86_avx2()) | |||
| { | |||
| conv3x3s1_winograd23_transform_kernel_int8_avx2(kernel, AT, inch, outch, opt); | |||
| return; | |||
| } | |||
| #endif | |||
| #endif | |||
| const int M = outch; | |||
| @@ -4430,7 +4426,6 @@ static inline void conv3x3s1_winograd23_transform_output_tile_int8(const Mat& to | |||
| static void conv3x3s1_winograd23_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt) | |||
| { | |||
| #if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__) | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX512VNNI && __AVX512F__ && !__AVX512VNNI__ | |||
| if (ncnn::cpu_support_x86_avx512_vnni()) | |||
| { | |||
| @@ -4439,7 +4434,7 @@ static void conv3x3s1_winograd23_int8(const Mat& bottom_blob, Mat& top_blob, con | |||
| } | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__ | |||
| if (ncnn::cpu_support_x86_avx_vnni()) | |||
| { | |||
| conv3x3s1_winograd23_int8_avxvnni(bottom_blob, top_blob, AT, nT, opt); | |||
| @@ -4447,7 +4442,7 @@ static void conv3x3s1_winograd23_int8(const Mat& bottom_blob, Mat& top_blob, con | |||
| } | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__ | |||
| if (ncnn::cpu_support_x86_avx2()) | |||
| { | |||
| conv3x3s1_winograd23_int8_avx2(bottom_blob, top_blob, AT, nT, opt); | |||
| @@ -4455,13 +4450,12 @@ static void conv3x3s1_winograd23_int8(const Mat& bottom_blob, Mat& top_blob, con | |||
| } | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ | |||
| #if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__ | |||
| if (ncnn::cpu_support_x86_xop()) | |||
| { | |||
| conv3x3s1_winograd23_int8_xop(bottom_blob, top_blob, AT, nT, opt); | |||
| return; | |||
| } | |||
| #endif | |||
| #endif | |||
| int outw = top_blob.w; | |||
| @@ -4642,14 +4636,12 @@ static inline void conv3x3s1_winograd43_transform_kernel_tile_int8(const Mat& ke | |||
| static void conv3x3s1_winograd43_transform_kernel_int8(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt) | |||
| { | |||
| #if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__) | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__ | |||
| if (ncnn::cpu_support_x86_avx2()) | |||
| { | |||
| conv3x3s1_winograd43_transform_kernel_int8_avx2(kernel, AT, inch, outch, opt); | |||
| return; | |||
| } | |||
| #endif | |||
| #endif | |||
| const int M = outch; | |||
| @@ -6260,7 +6252,6 @@ static inline void conv3x3s1_winograd43_transform_output_tile_int8(const Mat& to | |||
| static void conv3x3s1_winograd43_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt) | |||
| { | |||
| #if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__) | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX512VNNI && __AVX512F__ && !__AVX512VNNI__ | |||
| if (ncnn::cpu_support_x86_avx512_vnni()) | |||
| { | |||
| @@ -6269,7 +6260,7 @@ static void conv3x3s1_winograd43_int8(const Mat& bottom_blob, Mat& top_blob, con | |||
| } | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__ | |||
| if (ncnn::cpu_support_x86_avx_vnni()) | |||
| { | |||
| conv3x3s1_winograd43_int8_avxvnni(bottom_blob, top_blob, AT, nT, opt); | |||
| @@ -6277,7 +6268,7 @@ static void conv3x3s1_winograd43_int8(const Mat& bottom_blob, Mat& top_blob, con | |||
| } | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__ | |||
| if (ncnn::cpu_support_x86_avx2()) | |||
| { | |||
| conv3x3s1_winograd43_int8_avx2(bottom_blob, top_blob, AT, nT, opt); | |||
| @@ -6285,13 +6276,12 @@ static void conv3x3s1_winograd43_int8(const Mat& bottom_blob, Mat& top_blob, con | |||
| } | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ | |||
| #if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__ | |||
| if (ncnn::cpu_support_x86_xop()) | |||
| { | |||
| conv3x3s1_winograd43_int8_xop(bottom_blob, top_blob, AT, nT, opt); | |||
| return; | |||
| } | |||
| #endif | |||
| #endif | |||
| int outw = top_blob.w; | |||
| @@ -12,24 +12,22 @@ | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__) | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX512VNNI && __AVX512F__ && !__AVX512VNNI__ | |||
| void convolution_im2col_gemm_int8_avx512vnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt); | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__ | |||
| void convolution_im2col_gemm_int8_avxvnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt); | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__ | |||
| void convolution_im2col_gemm_transform_kernel_int8_avx2(const Mat& kernel, Mat& AT, int inch, int outch, int kernel_w, int kernel_h, const Option& opt); | |||
| void convolution_im2col_gemm_int8_avx2(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt); | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ | |||
| #if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__ | |||
| void convolution_im2col_gemm_int8_xop(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt); | |||
| #endif | |||
| #endif | |||
| static void convolution_im2col_pack_A_tile_int8(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk) | |||
| { | |||
| @@ -7476,14 +7474,12 @@ static void convolution_im2col_input_tile_int8(const Mat& bottom_blob, Mat& B, i | |||
| static void convolution_im2col_gemm_transform_kernel_int8(const Mat& kernel, Mat& AT, int inch, int outch, int kernel_w, int kernel_h, const Option& opt) | |||
| { | |||
| #if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__) | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__ | |||
| if (ncnn::cpu_support_x86_avx2()) | |||
| { | |||
| convolution_im2col_gemm_transform_kernel_int8_avx2(kernel, AT, inch, outch, kernel_w, kernel_h, opt); | |||
| return; | |||
| } | |||
| #endif | |||
| #endif | |||
| // NCNN_LOGE("convolution_im2col_gemm_transform_kernel"); | |||
| @@ -7558,24 +7554,23 @@ static void convolution_im2col_gemm_transform_kernel_int8(const Mat& kernel, Mat | |||
| static void convolution_im2col_gemm_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt) | |||
| { | |||
| #if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__) | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX512VNNI && __AVX512F__ && !__AVX512VNNI__ | |||
| if (ncnn::cpu_support_x86_avx512vnni()) | |||
| if (ncnn::cpu_support_x86_avx512_vnni()) | |||
| { | |||
| convolution_im2col_gemm_int8_avx512vnni(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt); | |||
| return; | |||
| } | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ | |||
| if (ncnn::cpu_support_x86_avxvnni()) | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__ | |||
| if (ncnn::cpu_support_x86_avx_vnni()) | |||
| { | |||
| convolution_im2col_gemm_int8_avxvnni(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt); | |||
| return; | |||
| } | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__ | |||
| if (ncnn::cpu_support_x86_avx2()) | |||
| { | |||
| convolution_im2col_gemm_int8_avx2(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt); | |||
| @@ -7583,13 +7578,12 @@ static void convolution_im2col_gemm_int8(const Mat& bottom_blob, Mat& top_blob, | |||
| } | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ | |||
| #if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__ | |||
| if (ncnn::cpu_support_x86_xop()) | |||
| { | |||
| convolution_im2col_gemm_int8_xop(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt); | |||
| return; | |||
| } | |||
| #endif | |||
| #endif | |||
| const int maxk = kernel_w * kernel_h; | |||
| @@ -12,31 +12,26 @@ | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ | |||
| void convolution_transform_kernel_packed_int8_avx2(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h); | |||
| #endif | |||
| #if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__) | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX512VNNI && __AVX512F__ && !__AVX512VNNI__ | |||
| void convolution_packed_int8_avx512vnni(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt); | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__ | |||
| void convolution_packed_int8_avxvnni(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt); | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__ | |||
| void convolution_transform_kernel_packed_int8_avx2(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h); | |||
| void convolution_packed_int8_avx2(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt); | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ | |||
| #if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__ | |||
| void convolution_packed_int8_xop(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt); | |||
| #endif | |||
| #endif | |||
| static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) | |||
| { | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__ | |||
| if (ncnn::cpu_support_x86_avx2()) | |||
| { | |||
| convolution_transform_kernel_packed_int8_avx2(kernel, kernel_tm, inch, outch, kernel_w, kernel_h); | |||
| @@ -880,7 +875,6 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker | |||
| static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) | |||
| { | |||
| #if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__) | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX512VNNI && __AVX512F__ && !__AVX512VNNI__ | |||
| if (ncnn::cpu_support_x86_avx512_vnni()) | |||
| { | |||
| @@ -889,7 +883,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const | |||
| } | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__ | |||
| if (ncnn::cpu_support_x86_avx_vnni()) | |||
| { | |||
| convolution_packed_int8_avxvnni(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| @@ -897,7 +891,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const | |||
| } | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ | |||
| #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__ | |||
| if (ncnn::cpu_support_x86_avx2()) | |||
| { | |||
| convolution_packed_int8_avx2(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| @@ -905,13 +899,12 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const | |||
| } | |||
| #endif | |||
| #if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ | |||
| #if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__ | |||
| if (ncnn::cpu_support_x86_xop()) | |||
| { | |||
| convolution_packed_int8_xop(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| return; | |||
| } | |||
| #endif | |||
| #endif | |||
| const int w = bottom_blob.w; | |||