GitOrigin-RevId: 1185594301
tags/v1.11.0
@@ -97,6 +97,7 @@ struct MeanReducer<__fp16, __fp16, __fp16, false> {
     }
     MeanReducer() = default;
     void feed(const ctype* val) { res = vaddq_f16(vld1q_f16(val), res); }
+    void feed_vector(const float16x8_t& vval) { res = vaddq_f16(vval, res); }
     void feed_remain(const ctype* val) { remain += *val; }
     void post(ctype* dst) {
         res = vmulq_n_f16(res, coef);
@@ -127,8 +128,8 @@ struct MeanReducer<dt_quint8, uint8_t, int32_t, false> {
         vcoef = vdupq_n_f32(coef);
     }
     MeanReducer() = default;
-    void feed(const uint8_t* val) {
-        const uint8x16_t vval = vld1q_u8(val);
+    void feed(const uint8_t* val) { feed_vector(vld1q_u8(val)); }
+    void feed_vector(const uint8x16_t& vval) {
         const uint16x8_t vval_low = vmovl_u8(vget_low_u8(vval));
         const uint16x8_t vval_high = vmovl_u8(vget_high_u8(vval));
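The reducer hunks in this file all apply the same refactor: feed() keeps its pointer interface and now loads and delegates to a new feed_vector() overload that accepts an already-loaded register, so the small-B kernel added later can pass in the sub-vectors produced by a de-interleaving vld2q/vld3q/vld4q without reloading from memory. A minimal sketch of the pattern, with std::array standing in for a SIMD register (MySumReducer, Vec4 and load4 are illustrative names, not part of the patch):

    #include <array>

    using Vec4 = std::array<float, 4>;

    static Vec4 load4(const float* p) {
        return {p[0], p[1], p[2], p[3]};
    }

    struct MySumReducer {
        static constexpr int SIMD_WIDTH = 4;
        Vec4 res{};          // vector accumulator
        float remain = 0.f;  // scalar accumulator used by the tail loop

        // Old entry point: load from memory, then delegate.
        void feed(const float* val) { feed_vector(load4(val)); }
        // New entry point: accept a register the caller already de-interleaved.
        void feed_vector(const Vec4& v) {
            for (int i = 0; i < 4; ++i)
                res[i] += v[i];
        }
        void feed_remain(const float* val) { remain += *val; }
    };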
@@ -219,8 +220,8 @@ REDUCER_MAX_MIN_C1(min, dt_quint8, uint8_t, uint8_t, u, uint, 255);
             remain = vdupq_n_##_stype(_init);                               \
         }                                                                   \
         _mode##Reducer() = default;                                         \
-        void feed(const ctype* val) {                                       \
-            __stype##8x16_t vval = vld1q_##_stype(val);                     \
+        void feed(const ctype* val) { feed_vector(vld1q_##_stype(val)); }   \
+        void inline feed_vector(const __stype##8x16_t & vval) {             \
             res = v##_mode##q_##_stype(vval, res);                          \
         }                                                                   \
         void feed_remain(const ctype* val) {                                \
@@ -286,8 +287,8 @@ REDUCER_MAX_MIN_C1(
             remain = _init;                                                 \
         }                                                                   \
         _mode##Reducer() = default;                                         \
-        void feed(const ctype* val) {                                       \
-            __stype vval = vld1q_##_stype(val);                             \
+        void feed(const ctype* val) { feed_vector(vld1q_##_stype(val)); }   \
+        void inline feed_vector(const __stype& vval) {                      \
             res = v##_mode##q_##_stype(vval, res);                          \
         }                                                                   \
         void feed_remain(const ctype* val) {                                \
@@ -326,8 +327,8 @@ struct ProductReducer;
             remain = _init;                                                 \
         }                                                                   \
         _mode##Reducer() = default;                                         \
-        void feed(const ctype* val) {                                       \
-            __stype vval = vld1q_##_stype(val);                             \
+        void feed(const ctype* val) { feed_vector(vld1q_##_stype(val)); }   \
+        void feed_vector(const __stype& vval) {                             \
             res = v##_act##q_##_stype(vval, res);                           \
         }                                                                   \
         void feed_remain(const ctype* val) {                                \
@@ -435,8 +436,8 @@ struct SumSqrReducer<__fp16, __fp16, __fp16, false> {
     fp16_fix_t remain;
     SumSqrReducer(DType, size_t cnt) : remain(0.0f) { res = vdupq_n_f16(0.0f); }
     SumSqrReducer() = default;
-    void feed(const __fp16* val) {
-        float16x8_t vval = vld1q_f16(val);
+    void feed(const __fp16* val) { feed_vector(vld1q_f16(val)); }
+    void inline feed_vector(const float16x8_t& vval) {
         res = vaddq_f16(vmulq_f16(vval, vval), res);
     }
     void feed_remain(const __fp16* val) { remain += (*val) * (*val); }
| @@ -504,6 +505,60 @@ struct Exec<Reducer, false> { | |||
| } | |||
| }; | |||
| template <typename Reducer, typename dtype, size_t B> | |||
| struct ExecC1SmallB { | |||
| static void do_reduce( | |||
| const dtype* src, dtype* dst, DType src_dtype, size_t A, size_t, size_t C); | |||
| }; | |||
| #define ImplementC1SmallB(_ctype, _simd_prefix, _simd_suffix) \ | |||
| template <typename Reducer, size_t B> \ | |||
| struct ExecC1SmallB<Reducer, _ctype, B> { \ | |||
| static void do_reduce( \ | |||
| const _ctype* src, _ctype* dst, DType src_dtype, size_t A, size_t, \ | |||
| size_t) { \ | |||
| size_t a = 0; \ | |||
| for (; a + Reducer::SIMD_WIDTH < A; a += Reducer::SIMD_WIDTH) { \ | |||
| Reducer reducer(src_dtype, B); \ | |||
| auto src_ptr = src + a * B; \ | |||
| if (B == 4) { \ | |||
| _simd_prefix##x4_t data_v4 = vld4q_##_simd_suffix(src_ptr); \ | |||
| reducer.feed_vector(data_v4.val[0]); \ | |||
| reducer.feed_vector(data_v4.val[1]); \ | |||
| reducer.feed_vector(data_v4.val[2]); \ | |||
| reducer.feed_vector(data_v4.val[3]); \ | |||
| } \ | |||
| if (B == 3) { \ | |||
| _simd_prefix##x3_t data_v3 = vld3q_##_simd_suffix(src_ptr); \ | |||
| reducer.feed_vector(data_v3.val[0]); \ | |||
| reducer.feed_vector(data_v3.val[1]); \ | |||
| reducer.feed_vector(data_v3.val[2]); \ | |||
| } \ | |||
| if (B == 2) { \ | |||
| _simd_prefix##x2_t data_v2 = vld2q_##_simd_suffix(src_ptr); \ | |||
| reducer.feed_vector(data_v2.val[0]); \ | |||
| reducer.feed_vector(data_v2.val[1]); \ | |||
| } \ | |||
| reducer.post(dst); \ | |||
| dst += Reducer::SIMD_WIDTH; \ | |||
| } \ | |||
| for (; a < A; a++) { \ | |||
| Reducer reducer(src_dtype, B); \ | |||
| auto src_ptr = src + a * B; \ | |||
| for (size_t i = 0; i < B; i++) \ | |||
| reducer.feed_remain(src_ptr + i); \ | |||
| reducer.post_remain(dst); \ | |||
| dst++; \ | |||
| } \ | |||
| } \ | |||
| } | |||
| ImplementC1SmallB(uint8_t, uint8x16, u8); | |||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| ImplementC1SmallB(__fp16, float16x8, f16); | |||
| #endif | |||
| } // anonymous namespace | |||
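ExecC1SmallB covers the C == 1 case when the reduced axis B is 2, 3 or 4: a single vld2q/vld3q/vld4q de-interleaves B * SIMD_WIDTH contiguous elements so that lane group k holds element k of SIMD_WIDTH consecutive rows, and feeding all B lane groups then calling post() yields SIMD_WIDTH outputs per iteration (16 for uint8, 8 for fp16). A scalar reference of the result this kernel must produce, written for a plain sum (the sum mode and the function name are illustrative; the actual operation depends on the Reducer):

    #include <cstddef>
    #include <cstdint>

    // dst[a] = reduce(src[a * B + 0], ..., src[a * B + (B - 1)]) for a in [0, A).
    // The vector loop above computes SIMD_WIDTH values of `a` at once; leftover
    // rows go through feed_remain()/post_remain() in the scalar tail.
    void reduce_last_axis_sum_ref(
            const uint8_t* src, int32_t* dst, size_t A, size_t B) {
        for (size_t a = 0; a < A; ++a) {
            int32_t acc = 0;
            for (size_t b = 0; b < B; ++b)
                acc += src[a * B + b];
            dst[a] = acc;
        }
    }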
| void ReduceImpl::exec( | |||
| @@ -513,31 +568,40 @@ void ReduceImpl::exec( | |||
| reduce::get_ABC(src.layout, A, B, C, param().axis); | |||
| bool execed = false; | |||
| using Mode = param::Reduce::Mode; | |||
| #define DISPATCH_FUNC(Reducer, dtype, ctype, comp_type) \ | |||
| if (C == 1) { \ | |||
| using _Reducer = Reducer<dtype, ctype, comp_type, true>; \ | |||
| std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)> \ | |||
| do_reduce = Exec<_Reducer, true>::do_reduce; \ | |||
| MIDOUT_BEGIN( \ | |||
| megdnn_arm_common_reduce, ctype, dtype, comp_type, midout_iv(1)) { \ | |||
| MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \ | |||
| reinterpret_cast<ctype*>(src.raw_ptr()), \ | |||
| reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C)); \ | |||
| execed = true; \ | |||
| } \ | |||
| MIDOUT_END(); \ | |||
| } else { \ | |||
| using _Reducer = Reducer<dtype, ctype, comp_type, false>; \ | |||
| std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)> \ | |||
| do_reduce = Exec<_Reducer, false>::do_reduce; \ | |||
| MIDOUT_BEGIN( \ | |||
| megdnn_arm_common_reduce, ctype, dtype, comp_type, midout_iv(1)) { \ | |||
| MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \ | |||
| reinterpret_cast<ctype*>(src.raw_ptr()), \ | |||
| reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C)); \ | |||
| execed = true; \ | |||
| } \ | |||
| MIDOUT_END(); \ | |||
| #define DISPATCH_FUNC(Reducer, _dtype, ctype, comp_type) \ | |||
| if (C == 1) { \ | |||
| using _Reducer = Reducer<_dtype, ctype, comp_type, true>; \ | |||
| using _ReducerC1SmallB = Reducer<_dtype, ctype, comp_type, false>; \ | |||
| std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)> \ | |||
| do_reduce = Exec<_Reducer, true>::do_reduce; \ | |||
| if (src.layout.dtype.category() != DTypeCategory::FLOAT) { \ | |||
| if (B == 2) \ | |||
| do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 2>::do_reduce; \ | |||
| if (B == 3) \ | |||
| do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 3>::do_reduce; \ | |||
| if (B == 4) \ | |||
| do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 4>::do_reduce; \ | |||
| } \ | |||
| MIDOUT_BEGIN( \ | |||
| megdnn_arm_common_reduce, ctype, _dtype, comp_type, midout_iv(1)) { \ | |||
| MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \ | |||
| reinterpret_cast<ctype*>(src.raw_ptr()), \ | |||
| reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C)); \ | |||
| execed = true; \ | |||
| } \ | |||
| MIDOUT_END(); \ | |||
| } else { \ | |||
| using _Reducer = Reducer<_dtype, ctype, comp_type, false>; \ | |||
| std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)> \ | |||
| do_reduce = Exec<_Reducer, false>::do_reduce; \ | |||
| MIDOUT_BEGIN( \ | |||
| megdnn_arm_common_reduce, ctype, _dtype, comp_type, midout_iv(1)) { \ | |||
| MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \ | |||
| reinterpret_cast<ctype*>(src.raw_ptr()), \ | |||
| reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C)); \ | |||
| execed = true; \ | |||
| } \ | |||
| MIDOUT_END(); \ | |||
| } | |||
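In the rewritten DISPATCH_FUNC above, the C == 1 branch keeps the generic Exec<_Reducer, true> kernel as the default and swaps in ExecC1SmallB only when B is 2, 3 or 4 and the source dtype category is not FLOAT (the ARM ImplementC1SmallB is instantiated only for uint8_t and, behind the FP16 guard, __fp16). A simplified model of that decision; the enum and function names here are illustrative only, the real macro swaps a std::function between template instantiations:

    #include <cstddef>

    enum class ReduceKernel { GenericC1, SmallB };

    ReduceKernel pick_c1_kernel(bool src_is_float_category, size_t B) {
        if (!src_is_float_category && (B == 2 || B == 3 || B == 4))
            return ReduceKernel::SmallB;
        return ReduceKernel::GenericC1;
    }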
| #define DISPATCH_MODE_QUANTIZED(dtype, ctype, comp_type) \ | |||
| @@ -36,7 +36,7 @@ void conv_stride2::do_conv_2x2_stride2( | |||
| rep(i, nn) { | |||
| GI_FLOAT32_t _outp = GiLoadFloat32(outptr); | |||
| GI_FLOAT32_V2_t _r0 = GiLd2qFloat32(r0); | |||
| GI_FLOAT32_V2_t _r0 = GiLoadUzipFloat32V2(r0); | |||
| GI_FLOAT32_t _r00 = GiGetSubVectorFloat32V2(_r0, 0); // 0 2 4 6 | |||
| GI_FLOAT32_t _r01 = GiGetSubVectorFloat32V2(_r0, 1); // 1 3 5 7 | |||
| @@ -44,7 +44,7 @@ void conv_stride2::do_conv_2x2_stride2( | |||
| _outp = GiSimdFmaLane(_outp, _r00, _k0123, 0); | |||
| _outp = GiSimdFmaLane(_outp, _r01, _k0123, 1); | |||
| GI_FLOAT32_V2_t _r1 = GiLd2qFloat32(r1); | |||
| GI_FLOAT32_V2_t _r1 = GiLoadUzipFloat32V2(r1); | |||
| GI_FLOAT32_t _r10 = GiGetSubVectorFloat32V2(_r1, 0); | |||
| GI_FLOAT32_t _r11 = GiGetSubVectorFloat32V2(_r1, 1); | |||
| @@ -94,8 +94,8 @@ void conv_stride2::do_conv_3x3_stride2( | |||
| rep(i, nn) { | |||
| GI_FLOAT32_t _outp = GiLoadFloat32(outptr); | |||
| GI_FLOAT32_V2_t _r0 = GiLd2qFloat32(r0); | |||
| GI_FLOAT32_V2_t _r0n = GiLd2qFloat32(r0 + 8); | |||
| GI_FLOAT32_V2_t _r0 = GiLoadUzipFloat32V2(r0); | |||
| GI_FLOAT32_V2_t _r0n = GiLoadUzipFloat32V2(r0 + 8); | |||
| GI_FLOAT32_t _r00 = GiGetSubVectorFloat32V2(_r0, 0); // 0 2 4 6 | |||
| GI_FLOAT32_t _r01 = GiGetSubVectorFloat32V2(_r0, 1); // 1 3 5 7 | |||
| @@ -106,8 +106,8 @@ void conv_stride2::do_conv_3x3_stride2( | |||
| _outp = GiSimdFmaLane(_outp, _r01, _k0123, 1); | |||
| _outp = GiSimdFmaLane(_outp, _r02, _k0123, 2); | |||
| GI_FLOAT32_V2_t _r1 = GiLd2qFloat32(r1); | |||
| GI_FLOAT32_V2_t _r1n = GiLd2qFloat32(r1 + 8); | |||
| GI_FLOAT32_V2_t _r1 = GiLoadUzipFloat32V2(r1); | |||
| GI_FLOAT32_V2_t _r1n = GiLoadUzipFloat32V2(r1 + 8); | |||
| GI_FLOAT32_t _r10 = GiGetSubVectorFloat32V2(_r1, 0); | |||
| GI_FLOAT32_t _r11 = GiGetSubVectorFloat32V2(_r1, 1); | |||
| @@ -118,8 +118,8 @@ void conv_stride2::do_conv_3x3_stride2( | |||
| _outp = GiSimdFmaLane(_outp, _r11, _k3456, 1); | |||
| _outp = GiSimdFmaLane(_outp, _r12, _k3456, 2); | |||
| GI_FLOAT32_V2_t _r2 = GiLd2qFloat32(r2); | |||
| GI_FLOAT32_V2_t _r2n = GiLd2qFloat32(r2 + 8); | |||
| GI_FLOAT32_V2_t _r2 = GiLoadUzipFloat32V2(r2); | |||
| GI_FLOAT32_V2_t _r2n = GiLoadUzipFloat32V2(r2 + 8); | |||
| GI_FLOAT32_t _r20 = GiGetSubVectorFloat32V2(_r2, 0); | |||
| GI_FLOAT32_t _r21 = GiGetSubVectorFloat32V2(_r2, 1); | |||
| @@ -176,8 +176,8 @@ void conv_stride2::do_conv_5x5_stride2( | |||
| rep(i, nn) { | |||
| GI_FLOAT32_t _sum = GiLoadFloat32(outptr); | |||
| GI_FLOAT32_V2_t _r00_02461357 = GiLd2qFloat32(r0); | |||
| GI_FLOAT32_V2_t _r00nx2 = GiLd2qFloat32(r0 + 8); | |||
| GI_FLOAT32_V2_t _r00_02461357 = GiLoadUzipFloat32V2(r0); | |||
| GI_FLOAT32_V2_t _r00nx2 = GiLoadUzipFloat32V2(r0 + 8); | |||
| GI_FLOAT32_t _r0_8101214 = | |||
| GiGetSubVectorFloat32V2(_r00nx2, 0); // 8 10 12 14 | |||
| GI_FLOAT32_t _r0_9111315 = | |||
| @@ -190,8 +190,8 @@ void conv_stride2::do_conv_5x5_stride2( | |||
| GI_FLOAT32_t _r03 = GiExtqFloat32(_r01, _r0_9111315, 1); // 3 5 7 9 | |||
| GI_FLOAT32_t _r04 = GiExtqFloat32(_r00, _r0_8101214, 2); // 4 6 8 10 | |||
| GI_FLOAT32_V2_t _r10_02461357 = GiLd2qFloat32(r1); | |||
| GI_FLOAT32_V2_t _r10nx2 = GiLd2qFloat32(r1 + 8); | |||
| GI_FLOAT32_V2_t _r10_02461357 = GiLoadUzipFloat32V2(r1); | |||
| GI_FLOAT32_V2_t _r10nx2 = GiLoadUzipFloat32V2(r1 + 8); | |||
| GI_FLOAT32_t _r1_8101214 = GiGetSubVectorFloat32V2(_r10nx2, 0); | |||
| GI_FLOAT32_t _r1_9111315 = GiGetSubVectorFloat32V2(_r10nx2, 1); | |||
| GI_FLOAT32_t _r10 = GiGetSubVectorFloat32V2(_r10_02461357, 0); | |||
| @@ -200,8 +200,8 @@ void conv_stride2::do_conv_5x5_stride2( | |||
| GI_FLOAT32_t _r13 = GiExtqFloat32(_r11, _r1_9111315, 1); | |||
| GI_FLOAT32_t _r14 = GiExtqFloat32(_r10, _r1_8101214, 2); | |||
| GI_FLOAT32_V2_t _r20_02461357 = GiLd2qFloat32(r2); | |||
| GI_FLOAT32_V2_t _r20nx2 = GiLd2qFloat32(r2 + 8); | |||
| GI_FLOAT32_V2_t _r20_02461357 = GiLoadUzipFloat32V2(r2); | |||
| GI_FLOAT32_V2_t _r20nx2 = GiLoadUzipFloat32V2(r2 + 8); | |||
| GI_FLOAT32_t _r2_8101214 = GiGetSubVectorFloat32V2(_r20nx2, 0); | |||
| GI_FLOAT32_t _r2_9111315 = GiGetSubVectorFloat32V2(_r20nx2, 1); | |||
| GI_FLOAT32_t _r20 = GiGetSubVectorFloat32V2(_r20_02461357, 0); | |||
| @@ -210,8 +210,8 @@ void conv_stride2::do_conv_5x5_stride2( | |||
| GI_FLOAT32_t _r23 = GiExtqFloat32(_r21, _r2_9111315, 1); | |||
| GI_FLOAT32_t _r24 = GiExtqFloat32(_r20, _r2_8101214, 2); | |||
| GI_FLOAT32_V2_t _r30_02461357 = GiLd2qFloat32(r3); | |||
| GI_FLOAT32_V2_t _r30nx2 = GiLd2qFloat32(r3 + 8); | |||
| GI_FLOAT32_V2_t _r30_02461357 = GiLoadUzipFloat32V2(r3); | |||
| GI_FLOAT32_V2_t _r30nx2 = GiLoadUzipFloat32V2(r3 + 8); | |||
| GI_FLOAT32_t _r3_8101214 = GiGetSubVectorFloat32V2(_r30nx2, 0); | |||
| GI_FLOAT32_t _r3_9111315 = GiGetSubVectorFloat32V2(_r30nx2, 1); | |||
| GI_FLOAT32_t _r30 = GiGetSubVectorFloat32V2(_r30_02461357, 0); | |||
| @@ -220,8 +220,8 @@ void conv_stride2::do_conv_5x5_stride2( | |||
| GI_FLOAT32_t _r33 = GiExtqFloat32(_r31, _r3_9111315, 1); | |||
| GI_FLOAT32_t _r34 = GiExtqFloat32(_r30, _r3_8101214, 2); | |||
| GI_FLOAT32_V2_t _r40_02461357 = GiLd2qFloat32(r4); | |||
| GI_FLOAT32_V2_t _r40nx2 = GiLd2qFloat32(r4 + 8); | |||
| GI_FLOAT32_V2_t _r40_02461357 = GiLoadUzipFloat32V2(r4); | |||
| GI_FLOAT32_V2_t _r40nx2 = GiLoadUzipFloat32V2(r4 + 8); | |||
| GI_FLOAT32_t _r4_8101214 = GiGetSubVectorFloat32V2(_r40nx2, 0); | |||
| GI_FLOAT32_t _r4_9111315 = GiGetSubVectorFloat32V2(_r40nx2, 1); | |||
| GI_FLOAT32_t _r40 = GiGetSubVectorFloat32V2(_r40_02461357, 0); | |||
| @@ -315,8 +315,8 @@ void conv_stride2::do_conv_7x7_stride2( | |||
| GI_FLOAT32_t _k0123 = GiLoadFloat32(k0); | |||
| GI_FLOAT32_t _k4567 = GiLoadFloat32(k0 + 4); | |||
| GI_FLOAT32_V2_t _r00_02461357 = GiLd2qFloat32(r0); | |||
| GI_FLOAT32_V2_t _r00nx2 = GiLd2qFloat32(r0 + 8); | |||
| GI_FLOAT32_V2_t _r00_02461357 = GiLoadUzipFloat32V2(r0); | |||
| GI_FLOAT32_V2_t _r00nx2 = GiLoadUzipFloat32V2(r0 + 8); | |||
| GI_FLOAT32_t _r0_8101214 = | |||
| GiGetSubVectorFloat32V2(_r00nx2, 0); // 8 10 12 14 | |||
| GI_FLOAT32_t _r0_9111315 = | |||
| @@ -342,8 +342,8 @@ void conv_stride2::do_conv_7x7_stride2( | |||
| GI_FLOAT32_t _k78910 = GiLoadFloat32(k1); | |||
| GI_FLOAT32_t _k11121314 = GiLoadFloat32(k1 + 4); | |||
| GI_FLOAT32_V2_t _r10_02461357 = GiLd2qFloat32(r1); | |||
| GI_FLOAT32_V2_t _r10nx2 = GiLd2qFloat32(r1 + 8); | |||
| GI_FLOAT32_V2_t _r10_02461357 = GiLoadUzipFloat32V2(r1); | |||
| GI_FLOAT32_V2_t _r10nx2 = GiLoadUzipFloat32V2(r1 + 8); | |||
| GI_FLOAT32_t _r1_8101214 = GiGetSubVectorFloat32V2(_r10nx2, 0); | |||
| GI_FLOAT32_t _r1_9111315 = GiGetSubVectorFloat32V2(_r10nx2, 1); | |||
| GI_FLOAT32_t _r10 = GiGetSubVectorFloat32V2(_r10_02461357, 0); | |||
| @@ -365,8 +365,8 @@ void conv_stride2::do_conv_7x7_stride2( | |||
| GI_FLOAT32_t _k14151617 = GiLoadFloat32(k2); | |||
| GI_FLOAT32_t _k18192021 = GiLoadFloat32(k2 + 4); | |||
| GI_FLOAT32_V2_t _r20_02461357 = GiLd2qFloat32(r2); | |||
| GI_FLOAT32_V2_t _r20nx2 = GiLd2qFloat32(r2 + 8); | |||
| GI_FLOAT32_V2_t _r20_02461357 = GiLoadUzipFloat32V2(r2); | |||
| GI_FLOAT32_V2_t _r20nx2 = GiLoadUzipFloat32V2(r2 + 8); | |||
| GI_FLOAT32_t _r2_8101214 = GiGetSubVectorFloat32V2(_r20nx2, 0); | |||
| GI_FLOAT32_t _r2_9111315 = GiGetSubVectorFloat32V2(_r20nx2, 1); | |||
| GI_FLOAT32_t _r20 = GiGetSubVectorFloat32V2(_r20_02461357, 0); | |||
| @@ -388,8 +388,8 @@ void conv_stride2::do_conv_7x7_stride2( | |||
| GI_FLOAT32_t _k21222324 = GiLoadFloat32(k3); | |||
| GI_FLOAT32_t _k25262728 = GiLoadFloat32(k3 + 4); | |||
| GI_FLOAT32_V2_t _r30_02461357 = GiLd2qFloat32(r3); | |||
| GI_FLOAT32_V2_t _r30nx2 = GiLd2qFloat32(r3 + 8); | |||
| GI_FLOAT32_V2_t _r30_02461357 = GiLoadUzipFloat32V2(r3); | |||
| GI_FLOAT32_V2_t _r30nx2 = GiLoadUzipFloat32V2(r3 + 8); | |||
| GI_FLOAT32_t _r3_8101214 = GiGetSubVectorFloat32V2(_r30nx2, 0); | |||
| GI_FLOAT32_t _r3_9111315 = GiGetSubVectorFloat32V2(_r30nx2, 1); | |||
| GI_FLOAT32_t _r30 = GiGetSubVectorFloat32V2(_r30_02461357, 0); | |||
| @@ -411,8 +411,8 @@ void conv_stride2::do_conv_7x7_stride2( | |||
| GI_FLOAT32_t _k28293031 = GiLoadFloat32(k4); | |||
| GI_FLOAT32_t _k32333435 = GiLoadFloat32(k4 + 4); | |||
| GI_FLOAT32_V2_t _r40_02461357 = GiLd2qFloat32(r4); | |||
| GI_FLOAT32_V2_t _r40nx2 = GiLd2qFloat32(r4 + 8); | |||
| GI_FLOAT32_V2_t _r40_02461357 = GiLoadUzipFloat32V2(r4); | |||
| GI_FLOAT32_V2_t _r40nx2 = GiLoadUzipFloat32V2(r4 + 8); | |||
| GI_FLOAT32_t _r4_8101214 = GiGetSubVectorFloat32V2(_r40nx2, 0); | |||
| GI_FLOAT32_t _r4_9111315 = GiGetSubVectorFloat32V2(_r40nx2, 1); | |||
| GI_FLOAT32_t _r40 = GiGetSubVectorFloat32V2(_r40_02461357, 0); | |||
| @@ -434,8 +434,8 @@ void conv_stride2::do_conv_7x7_stride2( | |||
| GI_FLOAT32_t _k35363738 = GiLoadFloat32(k5); | |||
| GI_FLOAT32_t _k39404142 = GiLoadFloat32(k5 + 4); | |||
| GI_FLOAT32_V2_t _r50_02461357 = GiLd2qFloat32(r5); | |||
| GI_FLOAT32_V2_t _r50nx2 = GiLd2qFloat32(r5 + 8); | |||
| GI_FLOAT32_V2_t _r50_02461357 = GiLoadUzipFloat32V2(r5); | |||
| GI_FLOAT32_V2_t _r50nx2 = GiLoadUzipFloat32V2(r5 + 8); | |||
| GI_FLOAT32_t _r5_8101214 = GiGetSubVectorFloat32V2(_r50nx2, 0); | |||
| GI_FLOAT32_t _r5_9111315 = GiGetSubVectorFloat32V2(_r50nx2, 1); | |||
| GI_FLOAT32_t _r50 = GiGetSubVectorFloat32V2(_r50_02461357, 0); | |||
| @@ -457,8 +457,8 @@ void conv_stride2::do_conv_7x7_stride2( | |||
| GI_FLOAT32_t _k42434445 = GiLoadFloat32(k6); | |||
| GI_FLOAT32_t _k45464748 = GiLoadFloat32(k6 + 3); | |||
| GI_FLOAT32_V2_t _r60_02461357 = GiLd2qFloat32(r6); | |||
| GI_FLOAT32_V2_t _r60nx2 = GiLd2qFloat32(r6 + 8); | |||
| GI_FLOAT32_V2_t _r60_02461357 = GiLoadUzipFloat32V2(r6); | |||
| GI_FLOAT32_V2_t _r60nx2 = GiLoadUzipFloat32V2(r6 + 8); | |||
| GI_FLOAT32_t _r6_8101214 = GiGetSubVectorFloat32V2(_r60nx2, 0); | |||
| GI_FLOAT32_t _r6_9111315 = GiGetSubVectorFloat32V2(_r60nx2, 1); | |||
| GI_FLOAT32_t _r60 = GiGetSubVectorFloat32V2(_r60_02461357, 0); | |||
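The conv_stride2 hunks above are a mechanical rename of GiLd2qFloat32 to the new GiLoadUzipFloat32V2. A de-interleaving load is a natural fit for stride-2 convolution because its two lane groups are exactly the even and odd input columns that consecutive outputs consume. A scalar picture of why, using a 1x2 stride-2 filter purely for illustration:

    // For outputs o[i] += k0 * x[2 * i] + k1 * x[2 * i + 1], the lane groups
    // returned by GiLoadUzipFloat32V2(x) are {x0, x2, x4, x6} and {x1, x3, x5, x7},
    // i.e. the first and second operands of four consecutive outputs, so one
    // de-interleaving load feeds one SIMD FMA per filter tap.
    void conv1x2_stride2_ref(
            const float* x, float k0, float k1, float* o, int n_out) {
        for (int i = 0; i < n_out; ++i)
            o[i] += k0 * x[2 * i] + k1 * x[2 * i + 1];
    }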
| @@ -151,6 +151,8 @@ typedef int32x4x2_t GI_INT32_V2_t; | |||
| typedef int32x4x4_t GI_INT32_V4_t; | |||
| typedef int16x8x2_t GI_INT16_V2_t; | |||
| typedef int8x16x2_t GI_INT8_V2_t; | |||
| typedef int8x16x3_t GI_INT8_V3_t; | |||
| typedef int8x16x4_t GI_INT8_V4_t; | |||
| typedef int64x2_t GI_INT64_t; | |||
| #elif defined(GI_SSE2_INTRINSICS) || defined(GI_SSE42_INTRINSICS) | |||
| @@ -302,6 +304,8 @@ typedef vint32m1x2_t GI_INT32_V2_t; | |||
| typedef vint32m1x4_t GI_INT32_V4_t; | |||
| typedef vint16m1x2_t GI_INT16_V2_t; | |||
| typedef vint8m1x2_t GI_INT8_V2_t; | |||
| typedef vint8m1x3_t GI_INT8_V3_t; | |||
| typedef vint8m1x4_t GI_INT8_V4_t; | |||
| //! vfloat32mf2_t usable at RVV1.0, now we support 0.7, as | |||
| //! a workaround, we use vfloat32m1_t instead | |||
| typedef vfloat32m1_t float32x2_t; | |||
| @@ -390,6 +394,14 @@ typedef struct { | |||
| GI_INT8_t val[2]; | |||
| } GI_INT8_V2_t; | |||
| typedef struct { | |||
| GI_INT8_t val[3]; | |||
| } GI_INT8_V3_t; | |||
| typedef struct { | |||
| GI_INT8_t val[4]; | |||
| } GI_INT8_V4_t; | |||
| #endif | |||
| //! variable length type intrinsic can not be a member of c++ class | |||
| //! caused by can not do sizeof at build stage, for example RVV and SVE | |||
| @@ -417,6 +429,8 @@ typedef GI_UINT32_NAIVE_t GI_UINT32_FIXLEN_t; | |||
| #define GiGetSubVectorInt16V2(s, index) vget_i16m1x2_i16m1(s, index) | |||
| #define GiGetSubVectorInt8V2(s, index) vget_i8m1x2_i8m1(s, index) | |||
| #define GiGetSubVectorInt8V3(s, index) vget_i8m1x3_i8m1(s, index) | |||
| #define GiGetSubVectorInt8V4(s, index) vget_i8m1x4_i8m1(s, index) | |||
| //! insert subvector | |||
| #define GiSetSubVectorFloat32V2(d, index, s) d = vset_f32m1x2(d, index, s) | |||
| @@ -570,6 +584,8 @@ typedef GI_UINT32_t GI_UINT32_FIXLEN_t; | |||
| #define GiGetSubVectorInt16V2(s, index) s.val[index] | |||
| #define GiGetSubVectorInt8V2(s, index) s.val[index] | |||
| #define GiGetSubVectorInt8V3(s, index) s.val[index] | |||
| #define GiGetSubVectorInt8V4(s, index) s.val[index] | |||
| //! insert subvector | |||
| #define GiSetSubVectorFloat32V2(d, index, s) d.val[index] = s | |||
| @@ -381,31 +381,6 @@ GI_FORCEINLINE void GiSt1Float32(float* ptr, float32x2_t val) { | |||
| #endif | |||
| } | |||
| GI_FORCEINLINE GI_FLOAT32_V2_t GiLd2qFloat32(const float* Buffer) { | |||
| #if defined(GI_NEON_INTRINSICS) | |||
| return vld2q_f32(Buffer); | |||
| #elif defined(GI_SSE2_INTRINSICS) | |||
| GI_FLOAT32_V2_t v; | |||
| v.val[0] = GiLoadFloat32(Buffer); | |||
| v.val[1] = GiLoadFloat32((Buffer + 4)); | |||
| v = GiUzpqFloat32(v.val[0], v.val[1]); | |||
| return v; | |||
| #elif defined(GI_RVV_INTRINSICS) | |||
| return vlseg2e32_v_f32m1x2(Buffer, GI_SIMD_LEN_BYTE / sizeof(float)); | |||
| #else | |||
| GI_FLOAT32_V2_t ret; | |||
| ret.val[0][0] = Buffer[0]; | |||
| ret.val[0][1] = Buffer[2]; | |||
| ret.val[0][2] = Buffer[4]; | |||
| ret.val[0][3] = Buffer[6]; | |||
| ret.val[1][0] = Buffer[1]; | |||
| ret.val[1][1] = Buffer[3]; | |||
| ret.val[1][2] = Buffer[5]; | |||
| ret.val[1][3] = Buffer[7]; | |||
| return ret; | |||
| #endif | |||
| } | |||
| #if defined(GI_NEON_INTRINSICS) | |||
| #define GiExtqFloat32(a, b, n) vextq_f32(a, b, n) | |||
| #elif defined(GI_SSE2_INTRINSICS) | |||
| @@ -1709,6 +1684,31 @@ GI_FORCEINLINE float32x2_t GiPmaxFloat32(float32x2_t a, float32x2_t b) { | |||
| #endif | |||
| } | |||
| GI_FORCEINLINE GI_FLOAT32_V2_t GiLoadUzipFloat32V2(const float* Buffer) { | |||
| #if defined(GI_NEON_INTRINSICS) | |||
| return vld2q_f32(Buffer); | |||
| #elif defined(GI_SSE2_INTRINSICS) | |||
| GI_FLOAT32_V2_t v; | |||
| v.val[0] = GiLoadFloat32(Buffer); | |||
| v.val[1] = GiLoadFloat32((Buffer + 4)); | |||
| v = GiUzpqFloat32(v.val[0], v.val[1]); | |||
| return v; | |||
| #elif defined(GI_RVV_INTRINSICS) | |||
| return vlseg2e32_v_f32m1x2(Buffer, GI_SIMD_LEN_BYTE / sizeof(float)); | |||
| #else | |||
| GI_FLOAT32_V2_t ret; | |||
| ret.val[0][0] = Buffer[0]; | |||
| ret.val[0][1] = Buffer[2]; | |||
| ret.val[0][2] = Buffer[4]; | |||
| ret.val[0][3] = Buffer[6]; | |||
| ret.val[1][0] = Buffer[1]; | |||
| ret.val[1][1] = Buffer[3]; | |||
| ret.val[1][2] = Buffer[5]; | |||
| ret.val[1][3] = Buffer[7]; | |||
| return ret; | |||
| #endif | |||
| } | |||
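GiLoadUzipFloat32V2 is the former GiLd2qFloat32, moved next to the other de-interleaving loads and renamed to match GiLoadUzipFloat32V3; the body is unchanged. Its contract, written out as the scalar model that the naive branch above implements (shown for the 128-bit case, GI_SIMD_LEN_BYTE == 16):

    // out0[i] = Buffer[2 * i], out1[i] = Buffer[2 * i + 1] for i in [0, 4).
    void load_uzip_f32_v2_ref(const float* buffer, float out0[4], float out1[4]) {
        for (int i = 0; i < 4; ++i) {
            out0[i] = buffer[2 * i];
            out1[i] = buffer[2 * i + 1];
        }
    }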
| GI_FORCEINLINE | |||
| GI_FLOAT32_V3_t GiLoadUzipFloat32V3(const float* ptr) { | |||
| #if defined(GI_NEON_INTRINSICS) | |||
| @@ -86,6 +86,152 @@ GI_INT8_t GiLoadInt8(const void* Buffer) { | |||
| #endif | |||
| } | |||
| GI_FORCEINLINE | |||
| GI_INT8_V2_t GiLoadUzipInt8V2(const void* Buffer) { | |||
| #if defined(GI_NEON_INTRINSICS) | |||
| return vld2q_s8((int8_t*)Buffer); | |||
| #elif defined(GI_SSE42_INTRINSICS) | |||
| GI_INT8_t v0, v1; | |||
| v0 = _mm_loadu_si128((const __m128i*)Buffer); | |||
| v1 = _mm_loadu_si128((const __m128i*)((int8_t*)Buffer + 16)); | |||
| GI_INT8_V2_t ret; | |||
| v0 = _mm_shuffle_epi8( | |||
| v0, _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15)); | |||
| v1 = _mm_shuffle_epi8( | |||
| v1, _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15)); | |||
| ret.val[0] = _mm_unpacklo_epi64(v0, v1); | |||
| ret.val[1] = _mm_unpackhi_epi64(v0, v1); | |||
| return ret; | |||
| #elif defined(GI_RVV_INTRINSICS) | |||
| return vlseg2e8_v_i8m1x2((int8_t*)Buffer, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||
| #else | |||
| int8_t data[2 * GI_SIMD_LEN_BYTE]; | |||
| const int8_t* ptr = (int8_t*)Buffer; | |||
| for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { | |||
| data[i] = ptr[2 * i]; | |||
| data[GI_SIMD_LEN_BYTE + i] = ptr[2 * i + 1]; | |||
| } | |||
| GI_INT8_V2_t ret; | |||
| ret.val[0] = GiLoadInt8(data); | |||
| ret.val[1] = GiLoadInt8(data + GI_SIMD_LEN_BYTE); | |||
| return ret; | |||
| #endif | |||
| } | |||
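On the SSE4.2 path of GiLoadUzipInt8V2, each 16-byte register is first shuffled so that its even-indexed bytes land in lanes 0-7 and its odd-indexed bytes in lanes 8-15; _mm_unpacklo_epi64/_mm_unpackhi_epi64 then concatenate the matching halves of the two registers. A scalar model of that first shuffle step (the helper name is an illustrative assumption):

    #include <cstdint>

    // Models _mm_shuffle_epi8(r, _mm_setr_epi8(0, 2, ..., 14, 1, 3, ..., 15)):
    // destination byte i is taken from source byte mask[i].
    void split_even_odd_bytes_ref(const int8_t r[16], int8_t dst[16]) {
        static const int mask[16] = {0, 2, 4, 6, 8, 10, 12, 14,
                                     1, 3, 5, 7, 9, 11, 13, 15};
        for (int i = 0; i < 16; ++i)
            dst[i] = r[mask[i]];
    }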
| GI_FORCEINLINE | |||
| GI_INT8_V3_t GiLoadUzipInt8V3(const void* Buffer) { | |||
| #if defined(GI_NEON_INTRINSICS) | |||
| return vld3q_s8((int8_t*)Buffer); | |||
| #elif defined(GI_SSE42_INTRINSICS) | |||
| GI_INT8_V3_t v; | |||
| __m128i tmp0, tmp1, tmp2, tmp3; | |||
| static const int8_t mask8_0[16] = {0, 3, 6, 9, 12, 15, 1, 4, | |||
| 7, 10, 13, 2, 5, 8, 11, 14}; | |||
| static const int8_t mask8_1[16] = {2, 5, 8, 11, 14, 0, 3, 6, | |||
| 9, 12, 15, 1, 4, 7, 10, 13}; | |||
| static const int8_t mask8_2[16] = {1, 4, 7, 10, 13, 2, 5, 8, | |||
| 11, 14, 0, 3, 6, 9, 12, 15}; | |||
| v.val[0] = _mm_loadu_si128((const __m128i*)Buffer); | |||
| v.val[1] = _mm_loadu_si128((const __m128i*)((int8_t*)Buffer + 16)); | |||
| v.val[2] = _mm_loadu_si128((const __m128i*)((int8_t*)Buffer + 32)); | |||
| tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0); | |||
| tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask8_1); | |||
| tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask8_2); | |||
| tmp3 = _mm_slli_si128(tmp0, 10); | |||
| tmp3 = _mm_alignr_epi8(tmp1, tmp3, 10); // a:0,3,6,9,12,15,b:2,5,8,11,14,x,x,x,x,x | |||
| tmp3 = _mm_slli_si128(tmp3, 5); // 0,0,0,0,0,a:0,3,6,9,12,15,b:2,5,8,11,14, | |||
| tmp3 = _mm_srli_si128(tmp3, 5); // a:0,3,6,9,12,15,b:2,5,8,11,14,:0,0,0,0,0 | |||
| v.val[0] = _mm_slli_si128(tmp2, 11); // 0,0,0,0,0,0,0,0,0,0,0,0, 1,4,7,10,13, | |||
| v.val[0] = _mm_or_si128(v.val[0], tmp3); | |||
| tmp3 = _mm_slli_si128(tmp0, 5); // 0,0,0,0,0,a:0,3,6,9,12,15,1,4,7,10,13, | |||
| tmp3 = _mm_srli_si128(tmp3, 11); // a:1,4,7,10,13, 0,0,0,0,0,0,0,0,0,0,0 | |||
| v.val[1] = _mm_srli_si128(tmp1, 5); // b:0,3,6,9,12,15,C:1,4,7,10,13, 0,0,0,0,0 | |||
| v.val[1] = _mm_slli_si128(v.val[1], 5); // 0,0,0,0,0,b:0,3,6,9,12,15,C:1,4,7,10,13, | |||
| v.val[1] = _mm_or_si128(v.val[1], tmp3); | |||
| v.val[1] = _mm_slli_si128(v.val[1], 5); // 0,0,0,0,0,a:1,4,7,10,13,b:0,3,6,9,12,15, | |||
| v.val[1] = _mm_srli_si128(v.val[1], 5); // a:1,4,7,10,13,b:0,3,6,9,12,15,0,0,0,0,0 | |||
| tmp3 = _mm_srli_si128(tmp2, 5); // c:2,5,8,11,14,0,3,6,9,12,15,0,0,0,0,0 | |||
| tmp3 = _mm_slli_si128(tmp3, 11); // 0,0,0,0,0,0,0,0,0,0,0,c:2,5,8,11,14, | |||
| v.val[1] = _mm_or_si128(v.val[1], tmp3); | |||
| tmp3 = _mm_srli_si128(tmp2, 10); // c:0,3,6,9,12,15, 0,0,0,0,0,0,0,0,0,0, | |||
| tmp3 = _mm_slli_si128(tmp3, 10); // 0,0,0,0,0,0,0,0,0,0, c:0,3,6,9,12,15, | |||
| v.val[2] = _mm_srli_si128(tmp1, 11); // b:1,4,7,10,13,0,0,0,0,0,0,0,0,0,0,0 | |||
| v.val[2] = _mm_slli_si128(v.val[2], 5); // 0,0,0,0,0,b:1,4,7,10,13, 0,0,0,0,0,0 | |||
| v.val[2] = | |||
| _mm_or_si128(v.val[2], tmp3); // 0,0,0,0,0,b:1,4,7,10,13,c:0,3,6,9,12,15, | |||
| tmp0 = _mm_srli_si128(tmp0, 11); // a:2,5,8,11,14, 0,0,0,0,0,0,0,0,0,0,0, | |||
| v.val[2] = _mm_or_si128(v.val[2], tmp0); | |||
| return v; | |||
| #elif defined(GI_RVV_INTRINSICS) | |||
| return vlseg3e8_v_i8m1x3((int8_t*)Buffer, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||
| #else | |||
| int8_t data[3 * GI_SIMD_LEN_BYTE]; | |||
| const int8_t* ptr = (int8_t*)Buffer; | |||
| for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { | |||
| data[i] = ptr[3 * i]; | |||
| data[GI_SIMD_LEN_BYTE + i] = ptr[3 * i + 1]; | |||
| data[2 * GI_SIMD_LEN_BYTE + i] = ptr[3 * i + 2]; | |||
| } | |||
| GI_INT8_V3_t ret; | |||
| ret.val[0] = GiLoadInt8(data); | |||
| ret.val[1] = GiLoadInt8(data + GI_SIMD_LEN_BYTE); | |||
| ret.val[2] = GiLoadInt8(data + 2 * GI_SIMD_LEN_BYTE); | |||
| return ret; | |||
| #endif | |||
| } | |||
| GI_FORCEINLINE | |||
| GI_INT8_V4_t GiLoadUzipInt8V4(const void* Buffer) { | |||
| #if defined(GI_NEON_INTRINSICS) | |||
| return vld4q_s8((int8_t*)Buffer); | |||
| #elif defined(GI_SSE2_INTRINSICS) | |||
| GI_INT8_V4_t v; | |||
| __m128i tmp3, tmp2, tmp1, tmp0; | |||
| v.val[0] = _mm_loadu_si128((const __m128i*)Buffer); | |||
| v.val[1] = _mm_loadu_si128((const __m128i*)((int8_t*)Buffer + 16)); | |||
| v.val[2] = _mm_loadu_si128((const __m128i*)((int8_t*)Buffer + 32)); | |||
| v.val[3] = _mm_loadu_si128((const __m128i*)((int8_t*)Buffer + 48)); | |||
| tmp0 = _mm_unpacklo_epi8(v.val[0], v.val[1]); | |||
| tmp1 = _mm_unpacklo_epi8(v.val[2], v.val[3]); | |||
| tmp2 = _mm_unpackhi_epi8(v.val[0], v.val[1]); | |||
| tmp3 = _mm_unpackhi_epi8(v.val[2], v.val[3]); | |||
| v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); | |||
| v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); | |||
| v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); | |||
| v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); | |||
| tmp0 = _mm_unpacklo_epi32(v.val[0], v.val[2]); | |||
| tmp1 = _mm_unpackhi_epi32(v.val[0], v.val[2]); | |||
| tmp2 = _mm_unpacklo_epi32(v.val[1], v.val[3]); | |||
| tmp3 = _mm_unpackhi_epi32(v.val[1], v.val[3]); | |||
| v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); | |||
| v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); | |||
| v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); | |||
| v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); | |||
| return v; | |||
| #elif defined(GI_RVV_INTRINSICS) | |||
| return vlseg4e8_v_i8m1x4((int8_t*)Buffer, GI_SIMD_LEN_BYTE / sizeof(int8_t)); | |||
| #else | |||
| GI_INT8_V4_t ret; | |||
| const int8_t* ptr = (int8_t*)Buffer; | |||
| for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { | |||
| ret.val[0][i] = ptr[4 * i]; | |||
| ret.val[1][i] = ptr[4 * i + 1]; | |||
| ret.val[2][i] = ptr[4 * i + 2]; | |||
| ret.val[3][i] = ptr[4 * i + 3]; | |||
| } | |||
| return ret; | |||
| #endif | |||
| } | |||
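GiLoadUzipInt8V4's SSE2 branch gets by without SSSE3 shuffles: two rounds of byte unpacks, a round of 32-bit unpacks, and a final round of byte unpacks rearrange the four loaded registers into de-interleaved form. Whichever branch is taken, the result must match the naive branch, which is also what the GiLoadUzipInt8V4 test added below asserts:

    #include <cstdint>

    // out[k][i] == in[4 * i + k] for k in [0, 4) and i in [0, 16)
    // (128-bit case, GI_SIMD_LEN_BYTE == 16).
    void load_uzip_s8_v4_ref(const int8_t* in, int8_t out[4][16]) {
        for (int i = 0; i < 16; ++i)
            for (int k = 0; k < 4; ++k)
                out[k][i] = in[4 * i + k];
    }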
| GI_FORCEINLINE | |||
| void GiStoreInt32(void* Buffer, GI_INT32_t Vector) { | |||
| #if defined(GI_NEON_INTRINSICS) | |||
| @@ -471,7 +471,7 @@ void do_average_pooling_3x3_s2x2_gi( | |||
| int odd_offset = 0, even_offset = 0; | |||
| for (; iw + 2 * MEGDNN_SIMD_WIDTH <= IW; iw += 2 * MEGDNN_SIMD_WIDTH) { | |||
| auto s0 = GiLd2qFloat32(sptr + iw); | |||
| auto s0 = GiLoadUzipFloat32V2(sptr + iw); | |||
| GiStoreFloat32(even + even_offset, GiGetSubVectorFloat32V2(s0, 0)); | |||
| GiStoreFloat32(odd + odd_offset, GiGetSubVectorFloat32V2(s0, 1)); | |||
| even_offset += MEGDNN_SIMD_WIDTH; | |||
@@ -186,8 +186,15 @@ bool ReduceImpl::exec_optimized(
 #define DISPATCH_FUNC(Reducer, dtype, ctype, comp_type)                       \
     if (C == 1) {                                                             \
         using _Reducer = Reducer<dtype, ctype, comp_type, true>;              \
+        using _ReducerC1SmallB = Reducer<dtype, ctype, comp_type, false>;     \
         std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)> \
                 do_reduce = Exec<_Reducer, true>::do_reduce;                  \
+        if (B == 2)                                                           \
+            do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 2>::do_reduce;  \
+        if (B == 3)                                                           \
+            do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 3>::do_reduce;  \
+        if (B == 4)                                                           \
+            do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 4>::do_reduce;  \
         MIDOUT_BEGIN(                                                         \
                 megdnn_fallback_reduce_optimized, ctype, dtype, comp_type,    \
                 midout_iv(0)) {                                               \
@@ -46,8 +46,8 @@ struct MeanReducer<dt_qint8, int8_t, int32_t, false> {
         vcoef = GiFloat32Type2FixLenType(GiBroadcastFloat32(coef));
     }
     MeanReducer() = default;
-    void feed(const int8_t* val) {
-        const GI_INT8_t vval = GiLoadInt8(val);
+    void feed(const int8_t* val) { feed_vector(GiLoadInt8(val)); }
+    void feed_vector(const GI_INT8_t vval) {
         const GI_INT16_t vval_low = GiMoveLowLongInt8(vval);
         const GI_INT16_t vval_high = GiMoveHighLongInt8(vval);
@@ -121,9 +121,10 @@ struct MeanReducer<dt_float32, float, float, false> {
         res = GiFloat32Type2FixLenType(GiBroadcastFloat32(0.0f));
     }
     MeanReducer() = default;
-    void feed(const float* val) {
+    void feed(const float* val) { feed_vector(GiLoadFloat32(val)); }
+    void inline feed_vector(const GI_FLOAT32_t& val) {
         res = GiFloat32Type2FixLenType(
-                GiAddFloat32(GiLoadFloat32(val), GiFixLenType2GiFloat32Type(res)));
+                GiAddFloat32(val, GiFixLenType2GiFloat32Type(res)));
     }
     void feed_remain(const float* val) { remain += *val; }
     void post(float* dst) {
| @@ -172,31 +173,31 @@ REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max()); | |||
| #define Max_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b); | |||
| #define Min_NAN(a, b) (isnan(a) || (a) < (b)) ? (a) : (b); | |||
| #define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \ | |||
| template <> \ | |||
| struct _mode##Reducer<dt_float32, float, float, false> { \ | |||
| using ctype = float; \ | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | |||
| GI_FLOAT32_FIXLEN_t res; \ | |||
| float remain; \ | |||
| _mode##Reducer(DType, size_t) { \ | |||
| res = GiFloat32Type2FixLenType(GiBroadcastFloat32(_init)); \ | |||
| remain = _init; \ | |||
| } \ | |||
| _mode##Reducer() = default; \ | |||
| void feed(const float* val) { \ | |||
| GI_FLOAT32_t vval = GiLoadFloat32(val); \ | |||
| res = GiFloat32Type2FixLenType( \ | |||
| Gi##_Mode##NanFloat32(GiFixLenType2GiFloat32Type(res), vval)); \ | |||
| } \ | |||
| void feed_remain(const float* val) { \ | |||
| using namespace std; \ | |||
| remain = _Mode##_NAN(*val, remain); \ | |||
| } \ | |||
| void post(float* dst) { \ | |||
| GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res)); \ | |||
| } \ | |||
| void post_remain(float* dst) { *dst = remain; } \ | |||
| #define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \ | |||
| template <> \ | |||
| struct _mode##Reducer<dt_float32, float, float, false> { \ | |||
| using ctype = float; \ | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | |||
| GI_FLOAT32_FIXLEN_t res; \ | |||
| float remain; \ | |||
| _mode##Reducer(DType, size_t) { \ | |||
| res = GiFloat32Type2FixLenType(GiBroadcastFloat32(_init)); \ | |||
| remain = _init; \ | |||
| } \ | |||
| _mode##Reducer() = default; \ | |||
| void feed(const float* val) { feed_vector(GiLoadFloat32(val)); } \ | |||
| void inline feed_vector(const GI_FLOAT32_t& val) { \ | |||
| res = GiFloat32Type2FixLenType( \ | |||
| Gi##_Mode##NanFloat32(GiFixLenType2GiFloat32Type(res), val)); \ | |||
| } \ | |||
| void feed_remain(const float* val) { \ | |||
| using namespace std; \ | |||
| remain = _Mode##_NAN(*val, remain); \ | |||
| } \ | |||
| void post(float* dst) { \ | |||
| GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res)); \ | |||
| } \ | |||
| void post_remain(float* dst) { *dst = remain; } \ | |||
| } | |||
| REDUCER_MAX_MIN_C(max, Max, std::numeric_limits<dt_float32>::lowest()); | |||
| @@ -246,10 +247,10 @@ REDUCER_MAX_MIN_C1(min, Min, 127); | |||
| remain = _init; \ | |||
| } \ | |||
| _mode##Reducer() = default; \ | |||
| void feed(const int8_t* val) { \ | |||
| GI_INT8_t vval = GiLoadInt8(val); \ | |||
| void feed(const int8_t* val) { feed_vector(GiLoadInt8(val)); } \ | |||
| void inline feed_vector(GI_INT8_t val) { \ | |||
| res = GiInt8Type2FixLenType( \ | |||
| Gi##_Mode##imumInt8(GiFixLenType2GiInt8Type(res), vval)); \ | |||
| Gi##_Mode##imumInt8(GiFixLenType2GiInt8Type(res), val)); \ | |||
| } \ | |||
| void feed_remain(const int8_t* val) { \ | |||
| using namespace std; \ | |||
| @@ -304,32 +305,32 @@ REDUCER_SUM_PRODUCT_C1(Sum, Add, plus, 0.0f); | |||
| REDUCER_SUM_PRODUCT_C1(Product, Multiply, multiplies, 1.0f); | |||
| #undef REDUCER_SUM_PRODUCT_C1 | |||
| #define REDUCER_SUM_PRODUCT_C(_mode, _Mode, _op, _init) \ | |||
| template <> \ | |||
| struct _mode##Reducer<dt_float32, float, float, false> { \ | |||
| using ctype = float; \ | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | |||
| GI_FLOAT32_FIXLEN_t res; \ | |||
| float remain; \ | |||
| _mode##Reducer(DType, size_t) { \ | |||
| res = GiFloat32Type2FixLenType(GiBroadcastFloat32(_init)); \ | |||
| remain = _init; \ | |||
| } \ | |||
| _mode##Reducer() = default; \ | |||
| void feed(const float* val) { \ | |||
| GI_FLOAT32_t vval = GiLoadFloat32(val); \ | |||
| res = GiFloat32Type2FixLenType( \ | |||
| Gi##_Mode##Float32(vval, GiFixLenType2GiFloat32Type(res))); \ | |||
| } \ | |||
| void feed_remain(const float* val) { \ | |||
| using namespace std; \ | |||
| auto op = _op<float>(); \ | |||
| remain = op(remain, (*val)); \ | |||
| } \ | |||
| void post(float* dst) { \ | |||
| GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res)); \ | |||
| } \ | |||
| void post_remain(float* dst) { *dst = remain; } \ | |||
| #define REDUCER_SUM_PRODUCT_C(_mode, _Mode, _op, _init) \ | |||
| template <> \ | |||
| struct _mode##Reducer<dt_float32, float, float, false> { \ | |||
| using ctype = float; \ | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | |||
| GI_FLOAT32_FIXLEN_t res; \ | |||
| float remain; \ | |||
| _mode##Reducer(DType, size_t) { \ | |||
| res = GiFloat32Type2FixLenType(GiBroadcastFloat32(_init)); \ | |||
| remain = _init; \ | |||
| } \ | |||
| _mode##Reducer() = default; \ | |||
| void feed(const float* val) { feed_vector(GiLoadFloat32(val)); } \ | |||
| void inline feed_vector(GI_FLOAT32_t val) { \ | |||
| res = GiFloat32Type2FixLenType( \ | |||
| Gi##_Mode##Float32(val, GiFixLenType2GiFloat32Type(res))); \ | |||
| } \ | |||
| void feed_remain(const float* val) { \ | |||
| using namespace std; \ | |||
| auto op = _op<float>(); \ | |||
| remain = op(remain, (*val)); \ | |||
| } \ | |||
| void post(float* dst) { \ | |||
| GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res)); \ | |||
| } \ | |||
| void post_remain(float* dst) { *dst = remain; } \ | |||
| } | |||
| REDUCER_SUM_PRODUCT_C(Sum, Add, plus, 0.0f); | |||
@@ -378,15 +379,16 @@ struct SumSqrReducer<dt_float32, float, float, false> {
         res = GiFloat32Type2FixLenType(GiBroadcastFloat32(0.0f));
     }
     SumSqrReducer() = default;
-    void feed(const float* val) {
-        GI_FLOAT32_t vval = GiLoadFloat32(val);
+    void feed(const float* val) { feed_vector(GiLoadFloat32(val)); }
+    void inline feed_vector(GI_FLOAT32_t src) {
         res = GiFloat32Type2FixLenType(GiAddFloat32(
-                GiMultiplyFloat32(vval, vval), GiFixLenType2GiFloat32Type(res)));
+                GiMultiplyFloat32(src, src), GiFixLenType2GiFloat32Type(res)));
     }
     void feed_remain(const float* val) { remain += (*val) * (*val); }
     void post(float* dst) { GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res)); }
     void post_remain(float* dst) { *dst = remain; }
 };
| /**************************************do reduce*************************/ | |||
| template <typename Reducer, bool C1> | |||
| @@ -446,6 +448,57 @@ struct Exec<Reducer, false> { | |||
| } | |||
| }; | |||
| template <typename Reducer, typename dtype, size_t B> | |||
| struct ExecC1SmallB { | |||
| static void do_reduce( | |||
| const dtype* src, dtype* dst, DType src_dtype, size_t A, size_t, size_t C); | |||
| }; | |||
| #define ImplementC1SmallB(_ctype, _gi_type, _gi_ins) \ | |||
| template <typename Reducer, size_t B> \ | |||
| struct ExecC1SmallB<Reducer, _ctype, B> { \ | |||
| static void do_reduce( \ | |||
| const _ctype* src, _ctype* dst, DType src_dtype, size_t A, size_t, \ | |||
| size_t) { \ | |||
| size_t a = 0; \ | |||
| for (; a + Reducer::SIMD_WIDTH < A; a += Reducer::SIMD_WIDTH) { \ | |||
| Reducer reducer(src_dtype, B); \ | |||
| auto src_ptr = src + a * B; \ | |||
| if (B == 4) { \ | |||
| GI_##_gi_type##_V4_t data_v4 = GiLoadUzip##_gi_ins##V4(src_ptr); \ | |||
| reducer.feed_vector(GiGetSubVector##_gi_ins##V4(data_v4, 0)); \ | |||
| reducer.feed_vector(GiGetSubVector##_gi_ins##V4(data_v4, 1)); \ | |||
| reducer.feed_vector(GiGetSubVector##_gi_ins##V4(data_v4, 2)); \ | |||
| reducer.feed_vector(GiGetSubVector##_gi_ins##V4(data_v4, 3)); \ | |||
| } \ | |||
| if (B == 3) { \ | |||
| GI_##_gi_type##_V3_t data_v3 = GiLoadUzip##_gi_ins##V3(src_ptr); \ | |||
| reducer.feed_vector(GiGetSubVector##_gi_ins##V3(data_v3, 0)); \ | |||
| reducer.feed_vector(GiGetSubVector##_gi_ins##V3(data_v3, 1)); \ | |||
| reducer.feed_vector(GiGetSubVector##_gi_ins##V3(data_v3, 2)); \ | |||
| } \ | |||
| if (B == 2) { \ | |||
| GI_##_gi_type##_V2_t data_v2 = GiLoadUzip##_gi_ins##V2(src_ptr); \ | |||
| reducer.feed_vector(GiGetSubVector##_gi_ins##V2(data_v2, 0)); \ | |||
| reducer.feed_vector(GiGetSubVector##_gi_ins##V2(data_v2, 1)); \ | |||
| } \ | |||
| reducer.post(dst); \ | |||
| dst += Reducer::SIMD_WIDTH; \ | |||
| } \ | |||
| for (; a < A; a++) { \ | |||
| Reducer reducer(src_dtype, B); \ | |||
| auto src_ptr = src + a * B; \ | |||
| for (size_t i = 0; i < B; i++) \ | |||
| reducer.feed_remain(src_ptr + i); \ | |||
| reducer.post_remain(dst); \ | |||
| dst++; \ | |||
| } \ | |||
| } \ | |||
| } | |||
| ImplementC1SmallB(float, FLOAT32, Float32); | |||
| ImplementC1SmallB(int8_t, INT8, Int8); | |||
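This is the generic-intrinsic twin of the ARM ExecC1SmallB: the same loop structure, with the de-interleaving load and sub-vector accessors spelled through the GI macros, instantiated for float and int8_t. The control flow restated in plain C++ (W, reduce_rows and reduce_row are illustrative stand-ins for the reducer's SIMD_WIDTH, its vector path and its scalar path):

    #include <cstddef>

    template <typename T, size_t W>
    void small_b_reduce_ref(
            const T* src, T* dst, size_t A, size_t B,
            void (*reduce_rows)(const T* row_block, T* out),  // W rows at once
            T (*reduce_row)(const T* row, size_t len)) {      // one row, scalar
        size_t a = 0;
        // Vector body: one de-interleaving load per iteration covers the next
        // B * W contiguous source elements and yields W outputs.
        for (; a + W < A; a += W) {
            reduce_rows(src + a * B, dst);
            dst += W;
        }
        // Scalar tail: remaining rows go through feed_remain()/post_remain().
        for (; a < A; ++a)
            *dst++ = reduce_row(src + a * B, B);
    }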
| } // namespace | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -565,7 +565,8 @@ struct ResizeAreaFastVec_SIMD_32f { | |||
| if (cn == 1) { | |||
| for (; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) { | |||
| GI_FLOAT32_V2_t v_row0 = GiLd2qFloat32(S0), v_row1 = GiLd2qFloat32(S1); | |||
| GI_FLOAT32_V2_t v_row0 = GiLoadUzipFloat32V2(S0), | |||
| v_row1 = GiLoadUzipFloat32V2(S1); | |||
| GI_FLOAT32_t v_dst0 = GiAddFloat32( | |||
| GiGetSubVectorFloat32V2(v_row0, 0), | |||
| @@ -29,7 +29,7 @@ TEST_F(ARM_COMMON, REDUCE) { | |||
| for (int32_t axis : {0, 1, 2}) { | |||
| for (size_t A : {1, 3, 5}) { | |||
| for (size_t B : {4, 6, 9, 16, 33, 45}) { | |||
| for (size_t C : {4, 6, 9, 16, 33, 45}) { | |||
| for (size_t C : {2, 3, 4, 6, 9, 16, 33, 45}) { | |||
| TensorShape shape{A, B, C}; | |||
| Param param(mode, axis); | |||
| Config config(param, dtype, shape); | |||
| @@ -51,7 +51,7 @@ TEST_F(ARM_COMMON, REDUCE) { | |||
| for (int32_t axis : {0, 1, 2}) { | |||
| for (size_t A : {1, 3, 5}) { | |||
| for (size_t B : {4, 6, 9, 16, 33, 45}) { | |||
| for (size_t C : {4, 6, 9, 16, 33, 45}) { | |||
| for (size_t C : {2, 3, 4, 6, 9, 16, 33, 45}) { | |||
| TensorShape shape{A, B, C}; | |||
| Param param(mode, axis); | |||
| Config config(param, dtype, shape); | |||
| @@ -1356,12 +1356,12 @@ TEST_F(FALLBACK, GiSt1Float32) { | |||
| ASSERT_EQ(ret[1], s0[1]); | |||
| } | |||
| TEST_F(FALLBACK, GiLd2qFloat32) { | |||
| TEST_F(FALLBACK, GiLoadUzipFloat32) { | |||
| GI_FLOAT32_V2_t ret; | |||
| std::vector<float> s0{1.1f, 2.2f, 3.5f, 4.9f, 2312.1f, 345.244f, 3.59f, -12.8f}; | |||
| force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE * 2); | |||
| ret = GiLd2qFloat32(s0.data()); | |||
| ret = GiLoadUzipFloat32V2(s0.data()); | |||
| std::vector<float> naive0; | |||
| std::vector<float> naive1; | |||
| @@ -2590,6 +2590,61 @@ TEST_F(FALLBACK, GiLoadInt8) { | |||
| } | |||
| } | |||
| TEST_F(FALLBACK, GiLoadUzipInt8V2) { | |||
| std::vector<int8_t> s0{9, 2, -128, 127, 2, 45, 3, 0, 11, 2, -128, | |||
| 127, 2, 55, 3, -1, 7, 8, -18, 17, 12, 35, | |||
| 7, 8, 10, 22, -108, 27, 21, 45, 13, -11}; | |||
| GI_INT8_V2_t ret; | |||
| force_memset_ret((void*)&ret, 2 * GI_SIMD_LEN_BYTE); | |||
| ret = GiLoadUzipInt8V2(s0.data()); | |||
| auto p = (int8_t*)&ret; | |||
| for (size_t i = 0; i < SIMD_LEN_8; i++) { | |||
| ASSERT_EQ(p[i], s0[2 * i]); | |||
| ASSERT_EQ(p[SIMD_LEN_8 + i], s0[2 * i + 1]); | |||
| } | |||
| } | |||
| TEST_F(FALLBACK, GiLoadUzipInt8V3) { | |||
| std::vector<int8_t> s0{9, 2, -128, 127, 2, 45, 3, 0, 11, 2, -128, 127, | |||
| 2, 55, 3, -1, 7, 8, -18, 17, 12, 35, 7, 8, | |||
| 10, 22, -108, 27, 21, 45, 13, -11, 11, 14, -11, 12, | |||
| 111, 32, 6, 9, 16, 29, -118, 67, 28, 15, 19, -10}; | |||
| GI_INT8_V3_t ret; | |||
| force_memset_ret((void*)&ret, 3 * GI_SIMD_LEN_BYTE); | |||
| ret = GiLoadUzipInt8V3(s0.data()); | |||
| auto p = (int8_t*)&ret; | |||
| for (size_t i = 0; i < SIMD_LEN_8; i++) { | |||
| ASSERT_EQ(p[i], s0[3 * i]); | |||
| ASSERT_EQ(p[SIMD_LEN_8 + i], s0[3 * i + 1]); | |||
| ASSERT_EQ(p[2 * SIMD_LEN_8 + i], s0[3 * i + 2]); | |||
| } | |||
| } | |||
| TEST_F(FALLBACK, GiLoadUzipInt8V4) { | |||
| std::vector<int8_t> s0{ | |||
| 9, 2, -128, 127, 2, 45, 3, 0, 11, 2, -128, 127, 2, 55, 3, -1, | |||
| 7, 8, -18, 17, 12, 35, 7, 8, 10, 22, -108, 27, 21, 45, 13, -11, | |||
| 11, 14, -11, 12, 111, 32, 6, 9, 16, 29, -118, 67, 28, 15, 19, -10, | |||
| 9, 4, -108, 27, 22, 43, 13, 10, 31, 12, -108, 117, 22, 25, 31, -10, | |||
| }; | |||
| GI_INT8_V4_t ret; | |||
| force_memset_ret((void*)&ret, 4 * GI_SIMD_LEN_BYTE); | |||
| ret = GiLoadUzipInt8V4(s0.data()); | |||
| auto p = (int8_t*)&ret; | |||
| for (size_t i = 0; i < SIMD_LEN_8; i++) { | |||
| ASSERT_EQ(p[i], s0[4 * i]); | |||
| ASSERT_EQ(p[SIMD_LEN_8 + i], s0[4 * i + 1]); | |||
| ASSERT_EQ(p[2 * SIMD_LEN_8 + i], s0[4 * i + 2]); | |||
| ASSERT_EQ(p[3 * SIMD_LEN_8 + i], s0[4 * i + 3]); | |||
| } | |||
| } | |||
| TEST_F(FALLBACK, GiStoreInt32) { | |||
| GI_INT32_t src0; | |||
| std::vector<int32_t> s0{1, 2, -200, 999}; | |||
| @@ -1,6 +1,7 @@ | |||
| #include "test/fallback/fixture.h" | |||
| #include "megdnn/oprs.h" | |||
| #include "test/common/benchmarker.h" | |||
| #include "test/common/checker.h" | |||
| #include "test/common/task_record_check.h" | |||
| #include "test/common/tensor.h" | |||
| @@ -27,9 +28,9 @@ TEST_F(FALLBACK, REDUCE_FULL) { | |||
| dtype::Float32(), dtype::Float16(), dtype::QuantizedS8(1.3f), | |||
| dtype::Quantized8Asymm(1.3f, static_cast<uint8_t>(3))}) | |||
| for (int32_t axis : {0, 1, 2}) { | |||
| for (size_t A : {1, 3, 5}) { | |||
| for (size_t A : {1, 3, 5, 20}) { | |||
| for (size_t B : {4, 6, 9, 16, 33, 45}) { | |||
| for (size_t C : {4, 6, 9, 16, 33, 45}) { | |||
| for (size_t C : {2, 3, 4, 6, 9, 16, 33, 45}) { | |||
| TensorShape shape{A, B, C}; | |||
| Param param(mode, axis); | |||
| Config config(param, dtype, shape); | |||
| @@ -49,9 +50,9 @@ TEST_F(FALLBACK, REDUCE_FULL) { | |||
| for (auto mode : {Mode::SUM, Mode::PRODUCT, Mode::SUM_SQR}) | |||
| for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) | |||
| for (int32_t axis : {0, 1, 2}) { | |||
| for (size_t A : {1, 3, 5}) { | |||
| for (size_t A : {1, 3, 5, 20}) { | |||
| for (size_t B : {4, 6, 9, 16, 33, 45}) { | |||
| for (size_t C : {4, 6, 9, 16, 33, 45}) { | |||
| for (size_t C : {2, 3, 4, 6, 9, 16, 33, 45}) { | |||
| TensorShape shape{A, B, C}; | |||
| Param param(mode, axis); | |||
| Config config(param, dtype, shape); | |||
| @@ -301,4 +302,56 @@ TEST_F(FALLBACK, REDUCE_RECORD) { | |||
| } | |||
| } | |||
| #if MEGDNN_WITH_BENCHMARK | |||
| TEST_F(FALLBACK, BENCHMARK_REDUCE_VS_CONV) { | |||
| auto run = [&]() { | |||
| Benchmarker<Reduce> benchmarker_reduce(handle()); | |||
| Benchmarker<Convolution> benchmarker_conv(handle()); | |||
| benchmarker_reduce.set_display(false); | |||
| benchmarker_conv.set_display(false); | |||
| constexpr size_t RUNS = 50; | |||
| benchmarker_reduce.set_times(RUNS); | |||
| benchmarker_conv.set_times(RUNS); | |||
| param::Reduce param; | |||
| param.axis = 3; | |||
| param.mode = param::Reduce::Mode::SUM; | |||
| benchmarker_reduce.set_param(param); | |||
| param::Convolution param_conv; | |||
| benchmarker_conv.set_param(param_conv); | |||
| { | |||
| TensorLayout src({24, 240, 128, 2}, dtype::Float32()); | |||
| auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS; | |||
| TensorLayout conv_src({24, 2, 240, 128}, dtype::Float32()); | |||
| TensorLayout conv_weight({1, 2, 1, 1}, dtype::Float32()); | |||
| auto conv = benchmarker_conv.execs({conv_src, conv_weight, {}}) / RUNS; | |||
| printf("case 1: reduce use time %fms, convolution use time %fms\n", reduce, | |||
| conv); | |||
| } | |||
| { | |||
| TensorLayout src({24, 240, 128, 3}, dtype::Float32()); | |||
| auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS; | |||
| TensorLayout conv_src({24, 3, 240, 128}, dtype::Float32()); | |||
| TensorLayout conv_weight({1, 3, 1, 1}, dtype::Float32()); | |||
| auto conv = benchmarker_conv.execs({conv_src, conv_weight, {}}) / RUNS; | |||
| printf("case 2: reduce use time %fms, convolution use time %fms\n", reduce, | |||
| conv); | |||
| } | |||
| { | |||
| TensorLayout src({24, 240, 128, 4}, dtype::Float32()); | |||
| auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS; | |||
| TensorLayout conv_src({24, 4, 240, 128}, dtype::Float32()); | |||
| TensorLayout conv_weight({1, 4, 1, 1}, dtype::Float32()); | |||
| auto conv = benchmarker_conv.execs({conv_src, conv_weight, {}}) / RUNS; | |||
| printf("case 3: reduce use time %fms, convolution use time %fms\n", reduce, | |||
| conv); | |||
| } | |||
| }; | |||
| run(); | |||
| } | |||
| #endif | |||
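BENCHMARK_REDUCE_VS_CONV pairs the axis-3 SUM reduce on an (N, H, W, B) tensor with a single-output-channel 1x1 convolution over B input channels on the equivalent NCHW layout: both read N*H*W*B floats and write N*H*W results, so the convolution serves as a throughput yardstick for the small-B reduce path. A rough tally of the data volumes involved, with the sizes taken from the benchmark cases above:

    #include <cstdio>

    int main() {
        const long long N = 24, H = 240, W = 128;
        const int Bs[] = {2, 3, 4};
        for (int B : Bs) {
            long long in_elems = N * H * W * B;  // read by the reduce and by the conv
            long long out_elems = N * H * W;     // written by both
            std::printf("B=%d: %lld inputs -> %lld outputs\n", B, in_elems, out_elems);
        }
        return 0;
    }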
| // vim: syntax=cpp.doxygen | |||
| @@ -467,6 +467,33 @@ TEST_F(X86, BENCHMARK_CONVOLUTION_I8x8x32_MKLDNN) { | |||
| } | |||
| #endif | |||
| TEST_F(X86, BENCHMARK_REDUCE_VS_CONV) { | |||
| auto run = [&]() { | |||
| Benchmarker<Reduce> benchmarker_reduce(handle()); | |||
| Benchmarker<Convolution> benchmarker_conv(handle()); | |||
| benchmarker_reduce.set_display(false); | |||
| benchmarker_conv.set_display(false); | |||
| constexpr size_t RUNS = 50; | |||
| benchmarker_reduce.set_times(RUNS); | |||
| benchmarker_conv.set_times(RUNS); | |||
| param::Reduce param; | |||
| param.axis = 3; | |||
| param.mode = param::Reduce::Mode::SUM; | |||
| benchmarker_reduce.set_param(param); | |||
| param::Convolution param_conv; | |||
| benchmarker_conv.set_param(param_conv); | |||
| TensorLayout src({24, 240, 128, 3}, dtype::Float32()); | |||
| auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS; | |||
| TensorLayout conv_src({24, 3, 240, 128}, dtype::Float32()); | |||
| TensorLayout conv_weight({1, 3, 1, 1}, dtype::Float32()); | |||
| auto conv = benchmarker_conv.execs({conv_src, conv_weight, {}}) / RUNS; | |||
| printf("reduce use time %fms, convolution use time %fms\n", reduce, conv); | |||
| }; | |||
| run(); | |||
| } | |||
| #endif | |||
| } // namespace test | |||