GitOrigin-RevId: 148ae44ba0
tags/v1.7.0
@@ -10,6 +10,7 @@ project(MegEngine LANGUAGES C CXX VERSION ${MGB_VER_STRING})
 set(CMAKE_CXX_STANDARD 14)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
 set(CMAKE_POLICY_DEFAULT_CMP0048 NEW)
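Note: enabling `CMAKE_EXPORT_COMPILE_COMMANDS` makes CMake write a `compile_commands.json` compilation database into the build directory, which tools such as clangd and clang-tidy consume; it does not change the build itself.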
@@ -8,11 +8,11 @@
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  */
+#include "src/fallback/conv_bias/im2col/algos.h"
 #include "megdnn/opr_param_defs.h"
 #include "src/common/opr_delegate.h"
 #include "src/fallback/conv_bias/common.h"
-#include "src/fallback/conv_bias/im2col/algos.h"
 #include "src/fallback/conv_bias/im2col/factory.h"
 #include "src/fallback/conv_bias/im2col/im2col_kerns.h"
 #include "src/fallback/conv_bias/opr_impl.h"
@@ -68,16 +68,16 @@ static void choice_ohw_oc_block(
         fallback::MatrixMulImpl::AlgoBase::PackMode pack_mode) {
     //! calculate m_oc_tile_size in choice_ohw_oc_block() fucntion,
     //! when ohw_tile_size < this value ohw_tile_size = ohw
-    static constexpr size_t DEFAULT_OHW_MIN_TILE_SIZE = 32;
+    size_t DEFAULT_OHW_MIN_TILE_SIZE = round_up(static_cast<size_t>(32), block_n);
     //! when nr_threads > 1 and round(ohw,nr_threads)>nr_threads,
     //! oc_tile_size = DEFAULT_OC_TILE_SIZE
-    static constexpr size_t DEFAULT_OC_TILE_SIZE = 512;
+    size_t DEFAULT_OC_TILE_SIZE = round_up(static_cast<size_t>(512), block_m);
     //! when oc_tile_size > this value m_oc_tile_size =
     //! DEFAULT_OC_MAX_TILE_SIZE
-    static constexpr size_t DEFAULT_OC_MAX_TILE_SIZE = 1024;
+    size_t DEFAULT_OC_MAX_TILE_SIZE = round_up(static_cast<size_t>(1024), block_m);
     //! when oc_tile_size < this value oc_tile_size =
     //! DEFAULT_OC_MIN_TILE_SIZE the purpose is aligning the calculation
-    static constexpr size_t DEFAULT_OC_MIN_TILE_SIZE = 128;
+    size_t DEFAULT_OC_MIN_TILE_SIZE = round_up(static_cast<size_t>(128), block_m);
     size_t nr_threads = param.nr_threads;
     size_t OC = param.filter_meta.ocpg;
     size_t ohw = param.osz[0] * param.osz[1];
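Note on the hunk above: the tile-size defaults change from compile-time constants to values rounded up to the matmul micro-kernel's block sizes (`block_m`/`block_n`), so every tile covers a whole number of kernel blocks. A minimal sketch of the rounding, assuming `round_up` has the usual ceil-to-multiple semantics of MegDNN's helper:

```cpp
#include <cstddef>

// Assumed semantics of megdnn's round_up helper: the smallest multiple
// of `align` that is >= x (align must be non-zero).
static inline size_t round_up(size_t x, size_t align) {
    return ((x + align - 1) / align) * align;
}

// With the new 6x16 kernel (block_m = 6, block_n = 16):
//   round_up(32, 16)  == 32    // ohw min tile is already aligned
//   round_up(512, 6)  == 516   // oc tile covers 86 whole 6-row blocks
//   round_up(1024, 6) == 1026
//   round_up(128, 6)  == 132
```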
@@ -123,6 +123,7 @@ public:
         X86_INT8X8X16_SSE,
         X86_INT8X8X32_SSE_4X8X2,
         X86_F32_MK8_8X8,
+        X86_F32_6x16,
         X86_INT8X8X32_VNNI,
         X86_INT8X8X32_MKLDNN,
 #elif MEGDNN_AARCH64 || MEGDNN_ARMV7
@@ -33,6 +33,15 @@ static inline __m256 _mm256_loadu2_m128_emulate(
             _mm256_castps128_ps256(_mm_loadu_ps(loaddr)), _mm_loadu_ps(hiaddr), 1);
 }
 
+MEGDNN_ATTRIBUTE_TARGET("avx")
+static inline void _mm256_storeu2_m128_emulate(
+        float* hiaddr, float* loaddr, __m256 reg) {
+    auto xmm0 = _mm256_extractf128_ps(reg, 0);
+    auto xmm1 = _mm256_extractf128_ps(reg, 1);
+    _mm_storeu_ps(loaddr, xmm0);
+    _mm_storeu_ps(hiaddr, xmm1);
+}
+
 template <typename ctype, size_t len>
 struct Vector;
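The new helper is the store-side counterpart of the existing `_mm256_loadu2_m128_emulate`: it splits one 256-bit register into two unaligned 128-bit stores, emulating the `_mm256_storeu2_m128` intrinsic that not every compiler exposes. A self-contained round-trip sketch with plain AVX intrinsics (illustrative names; compile with AVX enabled, e.g. -mavx):

```cpp
#include <immintrin.h>

// Load two disjoint 4-float rows into one ymm, then store them back out:
// the same split that the emulated load/store pair performs.
void copy_two_rows(float* dst_hi, float* dst_lo, const float* src_hi,
                   const float* src_lo) {
    __m256 v = _mm256_insertf128_ps(
            _mm256_castps128_ps256(_mm_loadu_ps(src_lo)),
            _mm_loadu_ps(src_hi), 1);
    _mm_storeu_ps(dst_lo, _mm256_castps256_ps128(v));    // low 128 bits
    _mm_storeu_ps(dst_hi, _mm256_extractf128_ps(v, 1));  // high 128 bits
}
```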
@@ -309,6 +309,33 @@ void gemm_s8s8s32_sse_4x8x2(const MatrixMulImpl::KernParam& kern_param) {
     MIDOUT_END();
 }
 
+void gemm_f32_avx2_6x16(const MatrixMulImpl::KernParam& kern_param) {
+    MEGDNN_MARK_USED_VAR(kern_param);
+    MIDOUT_BEGIN(megdnn_x86_matmul_kern_avx2_6x16x2, midout_iv(0)) {
+        constexpr int cacheline = 64;
+        const size_t m = kern_param.M;
+        const size_t n = kern_param.N;
+        const size_t k = kern_param.K;
+        const bool trans_a = kern_param.trA;
+        const bool trans_b = kern_param.trB;
+        const size_t lda = kern_param.LDA;
+        const size_t ldb = kern_param.LDB;
+        const size_t ldc = kern_param.LDC;
+        auto a_type = kern_param.A_type;
+        auto b_type = kern_param.B_type;
+        auto c_type = kern_param.C_type;
+        const auto a_ptr = kern_param.A<float>();
+        const auto b_ptr = kern_param.B<float>();
+        auto c_ptr = kern_param.C<float>();
+        x86::matmul::sgemm_pack_6x16_avx2 strategy(m, n, k, a_type, b_type, c_type);
+        megdnn::matmul::GemmInterleaved<x86::matmul::sgemm_pack_6x16_avx2>(
+                m, n, k, trans_a, trans_b, strategy, cacheline)
+                .execute(a_ptr, lda, b_ptr, ldb, c_ptr, ldc, kern_param.workspace_ptr);
+    }
+    MIDOUT_END();
+}
+
 }  // namespace
 
 /*************************AlgoInt8x8x16AVX2********************/
@@ -625,4 +652,43 @@ size_t MatrixMulImpl::AlgoF32MK8_8x8::get_workspace(
     MIDOUT_END();
 }
 
+/*************************AlgoFloatAVX2M6N16********************/
+MatrixMulImpl::kern_t MatrixMulImpl::AlgoFloatAVX2M6N16::get_kern(
+        const KernSizeParam&) const {
+    return gemm_f32_avx2_6x16;
+}
+
+bool MatrixMulImpl::AlgoFloatAVX2M6N16::usable(
+        const KernSizeParam& kern_size_param) const {
+    bool is_param_ok =
+            kern_size_param.A_type.enumv() == kern_size_param.B_type.enumv() &&
+            ((kern_size_param.A_type.enumv() == DTypeEnum::Float32 &&
+              kern_size_param.C_type.enumv() == DTypeEnum::Float32)) &&
+            kern_size_param.compute_mode == Param::ComputeMode::DEFAULT &&
+            kern_size_param.format == Param::Format::DEFAULT &&
+            is_supported(SIMDType::AVX2);
+    return is_param_ok;
+}
+
+size_t MatrixMulImpl::AlgoFloatAVX2M6N16::get_workspace(
+        const KernSizeParam& kern_param) const {
+    constexpr int cacheline = 64;
+    const size_t m = kern_param.M;
+    const size_t n = kern_param.N;
+    const size_t k = kern_param.K;
+    const bool trans_a = kern_param.trA;
+    const bool trans_b = kern_param.trB;
+    auto a_type = kern_param.A_type;
+    auto b_type = kern_param.B_type;
+    auto c_type = kern_param.C_type;
+    x86::matmul::sgemm_pack_6x16_avx2 strategy(m, n, k, a_type, b_type, c_type);
+    return megdnn::matmul::GemmInterleaved<x86::matmul::sgemm_pack_6x16_avx2>(
+                   m, n, k, trans_a, trans_b, strategy, cacheline)
+            .get_workspace_size();
+}
+
+MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_DETAIL(
+        AlgoFloatAVX2M6N16, megdnn_x86_matmul_kern, "AlgoFloatAVX2M6N16"_hash,
+        x86::matmul::sgemm_pack_6x16_avx2, float, float, float, AlgoDataType::FLOAT32,
+        DEFAULT);
+
 // vim: syntax=cpp.doxygen
@@ -134,6 +134,17 @@ public:
     MEGDNN_DECL_ALGO_TYPE(X86_F32_MK8_8X8)
 };
 
+class MatrixMulImpl::AlgoFloatAVX2M6N16 : public AlgoBase {
+public:
+    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
+    const char* name() const override { return "X86_F32_6x16"; }
+    bool usable(const KernSizeParam&) const override;
+    size_t get_workspace(const KernSizeParam&) const override;
+    kern_t get_kern(const KernSizeParam&) const override;
+    MEGDNN_REG_GEMM_FUNC_FOR_IM2COL();
+    MEGDNN_DECL_ALGO_TYPE(X86_F32_6x16)
+};
+
 #if MEGDNN_X86_WITH_VNNI
 class MatrixMulImpl::AlgoInt8x8x32Vnni : public AlgoBase {
 public:
@@ -19,6 +19,9 @@ namespace matmul {
 MEGDNN_REG_GEMM_STRATEGY_NOPACK(
         float, float, float, 8, 8, 8, false, true, sgemm_nopack_8x8_avx2);
 
+MEGDNN_REG_GEMM_STRATEGY_WITH_PACK_A_TYPE(
+        float, float, float, float, 6, 16, 1, false, false, sgemm_pack_6x16_avx2);
+
 }  // namespace matmul
 }  // namespace x86
 }  // namespace megdnn
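The `6, 16, 1` arguments register the strategy's blocking; reading them as block_m/block_n/block_k, with each inner step producing a 6-row by 16-column tile of C, is an inference from the neighbouring strategy registrations. A plain scalar reference of that tiling, row-major and without the packing and transpose handling the real kernel performs:

```cpp
#include <algorithm>
#include <cstddef>

// Reference-only sketch (not MegDNN's GemmInterleaved): walk C in 6x16
// tiles, the tile shape the AVX2 micro-kernel holds in registers.
void gemm_6x16_reference(const float* A, size_t lda, const float* B,
                         size_t ldb, float* C, size_t ldc, size_t M,
                         size_t N, size_t K) {
    for (size_t m0 = 0; m0 < M; m0 += 6) {
        for (size_t n0 = 0; n0 < N; n0 += 16) {
            const size_t m1 = std::min(m0 + 6, M);
            const size_t n1 = std::min(n0 + 16, N);
            for (size_t m = m0; m < m1; ++m) {
                for (size_t n = n0; n < n1; ++n) {
                    float acc = 0.f;
                    for (size_t k = 0; k < K; ++k)
                        acc += A[m * lda + k] * B[k * ldb + n];
                    C[m * ldc + n] = acc;
                }
            }
        }
    }
}
```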
@@ -34,6 +34,7 @@ class MatrixMulImpl::AlgoPack : NonCopyableObj {
     AlgoInt8x8x16AVX2 algoint8x8x16avx2_m4n16k2;
     AlgoInt8x8x16SSE algoint8x8x16sse_m4n8k2;
     AlgoF32MK8_8x8 algof32mk8_8x8;
+    AlgoFloatAVX2M6N16 algof32_6x16;
 
     SmallVector<fallback::MatrixMulImpl::AlgoBase*> m_all_algos;
     fallback::MatrixMulImpl::AlgoBase::Mapper m_all_algos_map;

@@ -51,6 +52,7 @@ public:
         m_all_algos.emplace_back(&algoint8x8x32sse_m4n8k2);
         m_all_algos.emplace_back(&algoint8x8x16sse_m4n8k2);
         m_all_algos.emplace_back(&algof32mk8_8x8);
+        m_all_algos.emplace_back(&algof32_6x16);
 #if MEGDNN_X86_WITH_MKL_DNN
         m_all_algos.emplace_back(&algoint8x8x32mkldnn);
 #endif
@@ -67,6 +67,7 @@ private:
     class AlgoInt8x8x16SSE;
     class AlgoPack;
     class AlgoF32MK8_8x8;
+    class AlgoFloatAVX2M6N16;
 
 public:
     static const AlgoPack& algo_pack();
@@ -84,6 +84,15 @@ TEST_F(X86, SHAKE_MATRIX_MUL_FORWARD) {
             .exec({{20, 100}, {100, 60}, {}});
 }
 
+TEST_F(X86, SHAKE_MATRIX_MUL_6x16_FORWARD) {
+    AccuracyShakeChecker<MatrixMul> checker(handle());
+    checker.set_before_exec_callback(AlgoGenerator<MatrixMul>("X86_F32_6x16"));
+    checker.set_dtype(0, dtype::Float32())
+            .set_dtype(1, dtype::Float32())
+            .set_dtype(2, dtype::Float32())
+            .exec({{20, 100}, {100, 60}, {}});
+}
+
 }  // namespace test
 }  // namespace megdnn
@@ -1150,6 +1150,56 @@ TEST_F(X86, CONV_BIAS_IM2COLMATMUL_FP32_NOPACK_PREPROCESS) {
 
 #endif
 
+TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32_6x16) {
+    using namespace conv_bias;
+    std::vector<TestArg> args;
+
+    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
+                   NonlineMode nonline_mode) {
+        if (w + 2 * p < kernel || h + 2 * p < kernel)
+            return;
+        param::ConvBias param;
+        param.stride_h = 1;
+        param.stride_w = 1;
+        param.pad_h = p;
+        param.pad_w = p;
+        param.nonlineMode = nonline_mode;
+        //! no bias
+        args.emplace_back(
+                param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
+                TensorShape{});
+        args.emplace_back(
+                param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
+                TensorShape{1, oc, 1, 1});
+        args.emplace_back(
+                param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
+                TensorShape{
+                        1, oc, (h + 2 * p - kernel) / param.stride_h + 1,
+                        (w + 2 * p - kernel) / param.stride_w + 1});
+    };
+    for (size_t kernel : {2, 3, 4, 5, 6, 7})
+        for (size_t ic : {1, 4, 8, 16})
+            for (size_t oc : {1, 4, 8, 16, 300})
+                for (size_t p : {0, 2})
+                    for (size_t size : {8, 24})
+                        for (NonlineMode nonline_mode :
+                             {NonlineMode::IDENTITY, NonlineMode::RELU}) {
+                            run(oc, ic, size, size, kernel, p, nonline_mode);
+                        }
+    run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY);
+    Checker<ConvBias> checker(handle());
+#define cb(algo_name)                                                                \
+    checker.set_before_exec_callback(                                                \
+            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));                    \
+    for (auto&& arg : args) {                                                        \
+        checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}}); \
+    }
+    cb("IM2COLMATMUL:X86_F32_6x16:192");
+}
+
 #if MEGDNN_X86_WITH_MKL && SUPPORT_MKL_PACKED_GEMM
 TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32_PACKA) {
     using namespace conv_bias;
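The algorithm string passed to `cb` appears to follow MegEngine's `<conv algo>:<matmul algo>:<tile size>` naming convention, so `IM2COLMATMUL:X86_F32_6x16:192` pins im2col to the new AVX2 6x16 matmul with a tile of 192, a common multiple of both block sizes (192 = 32 * 6 = 12 * 16); the `:48` in the CONV1x1 test below is likewise lcm(6, 16). Reading the trailing number as the tile size is an inference from how these names are used elsewhere in the suite.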
@@ -1435,6 +1485,12 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_INT8X8X32_PREPROCESS) {
 
 #endif
 
+TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_FP32_6x16) {
+    using namespace conv_bias;
+    std::vector<conv_bias::TestArg> args = get_conv_bias_1x1_args(false, false);
+    check_conv_bias(args, handle(), "CONV1x1:X86_F32_6x16:48");
+}
+
 TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8) {
     using namespace conv_bias;
     std::vector<TestArg> args;
@@ -2651,6 +2707,148 @@ TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_F32_single_thread)
 
     shapes_and_computation.clear();
 }
 
+TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_F32_6x16) {
+    constexpr size_t RUNS = 50;
+
+    param::ConvBias param;
+    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
+    param.pad_h = 1;
+    param.pad_w = 1;
+    param.stride_h = 1;
+    param.stride_w = 1;
+
+    std::vector<DType> data_type = {
+            dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
+    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
+    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
+                          size_t group) {
+        SmallVector<TensorShape> shapes{
+                {N, IC, H, W},
+                {OC / group, IC / group, FS, FS},
+                {1, OC, 1, 1},
+                {},
+                {N, OC, H, W}};
+        TensorShape dst{N, OC, H, W};
+        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
+                              dst.total_nr_elems()) *
+                             1e-6;
+        shapes_and_computation.push_back(std::make_pair(shapes, computations));
+    };
+
+    bench_case(1, 32, 32, 200, 200, 3, 1);
+    bench_case(1, 32, 32, 200, 200, 3, 1);
+    bench_case(1, 32, 32, 128, 128, 3, 1);
+    bench_case(1, 32, 32, 128, 128, 3, 1);
+    bench_case(1, 32, 32, 100, 100, 3, 1);
+    bench_case(1, 32, 32, 100, 100, 3, 1);
+    bench_case(1, 32, 32, 80, 80, 3, 1);
+    bench_case(1, 32, 32, 80, 80, 3, 1);
+
+    bench_case(1, 64, 32, 7, 7, 3, 1);
+    bench_case(1, 64, 64, 7, 7, 3, 1);
+    bench_case(1, 64, 128, 7, 7, 3, 1);
+    bench_case(1, 64, 256, 7, 7, 3, 1);
+    bench_case(1, 64, 512, 7, 7, 3, 1);
+    bench_case(1, 64, 1024, 7, 7, 3, 1);
+
+    bench_case(1, 64, 32, 14, 14, 3, 1);
+    bench_case(1, 64, 64, 14, 14, 3, 1);
+    bench_case(1, 64, 128, 14, 14, 3, 1);
+    bench_case(1, 64, 256, 14, 14, 3, 1);
+    bench_case(1, 64, 512, 14, 14, 3, 1);
+    bench_case(1, 64, 1024, 14, 14, 3, 1);
+
+    bench_case(1, 128, 128, 14, 14, 3, 1);
+    bench_case(1, 128, 256, 14, 14, 3, 1);
+    bench_case(1, 512, 512, 14, 14, 3, 1);
+    bench_case(1, 256, 512, 14, 14, 3, 1);
+    bench_case(1, 512, 1024, 14, 14, 3, 1);
+    bench_case(1, 1024, 1024, 14, 14, 3, 1);
+
+    std::string algo_name = "IM2COLMATMUL:X86_F32_6x16:192";
+    printf("Benchmark IM2COLMATMUL:X86_F32_6x16 algo\n");
+    benchmark_impl(
+            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}},
+            data_type);
+    benchmark_impl(
+            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}},
+            data_type);
+    benchmark_impl(
+            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
+            data_type);
+    shapes_and_computation.clear();
+}
+TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_F32_6X16_single_thread) {
+    constexpr size_t RUNS = 50;
+
+    param::ConvBias param;
+    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
+    param.pad_h = 1;
+    param.pad_w = 1;
+    param.stride_h = 1;
+    param.stride_w = 1;
+
+    std::vector<DType> data_type = {
+            dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
+    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
+    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
+                          size_t group) {
+        SmallVector<TensorShape> shapes{
+                {N, IC, H, W},
+                {OC / group, IC / group, FS, FS},
+                {1, OC, 1, 1},
+                {},
+                {N, OC, H, W}};
+        TensorShape dst{N, OC, H, W};
+        float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
+                              dst.total_nr_elems()) *
+                             1e-6;
+        shapes_and_computation.push_back(std::make_pair(shapes, computations));
+    };
+
+    bench_case(1, 32, 32, 200, 200, 3, 1);
+    bench_case(1, 32, 32, 200, 200, 3, 1);
+    bench_case(1, 32, 32, 128, 128, 3, 1);
+    bench_case(1, 32, 32, 128, 128, 3, 1);
+    bench_case(1, 32, 32, 100, 100, 3, 1);
+    bench_case(1, 32, 32, 100, 100, 3, 1);
+    bench_case(1, 32, 32, 80, 80, 3, 1);
+    bench_case(1, 32, 32, 80, 80, 3, 1);
+
+    bench_case(1, 64, 32, 7, 7, 3, 1);
+    bench_case(1, 64, 64, 7, 7, 3, 1);
+    bench_case(1, 64, 128, 7, 7, 3, 1);
+    bench_case(1, 64, 256, 7, 7, 3, 1);
+    bench_case(1, 64, 512, 7, 7, 3, 1);
+    bench_case(1, 64, 1024, 7, 7, 3, 1);
+
+    bench_case(1, 64, 32, 14, 14, 3, 1);
+    bench_case(1, 64, 64, 14, 14, 3, 1);
+    bench_case(1, 64, 128, 14, 14, 3, 1);
+    bench_case(1, 64, 256, 14, 14, 3, 1);
+    bench_case(1, 64, 512, 14, 14, 3, 1);
+    bench_case(1, 64, 1024, 14, 14, 3, 1);
+
+    bench_case(1, 128, 128, 14, 14, 3, 1);
+    bench_case(1, 128, 256, 14, 14, 3, 1);
+    bench_case(1, 512, 512, 14, 14, 3, 1);
+    bench_case(1, 256, 512, 14, 14, 3, 1);
+    bench_case(1, 512, 1024, 14, 14, 3, 1);
+    bench_case(1, 1024, 1024, 14, 14, 3, 1);
+
+    std::string algo_name = "IM2COLMATMUL:X86_F32_MKL_PACKA:192";
+    std::string algo_name1 = "IM2COLMATMUL:X86_F32_6x16:192";
+    printf("Benchmark IM2COLMATMUL:X86_F32_6x16 algo\n");
+    benchmark_impl_comp(
+            param, shapes_and_computation, algo_name, algo_name1, RUNS, {1, {4}},
+            {1, {4}}, data_type);
+    benchmark_impl_comp(
+            param, shapes_and_computation, algo_name, algo_name1, RUNS, {1, {7}},
+            {1, {7}}, data_type);
+    shapes_and_computation.clear();
+}
+
 TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_INT8X8X32) {
     constexpr size_t RUNS = 50;
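For reference, the `computations` value each `bench_case` records is a FLOP model in MFLOPs: two ops (multiply and add) per filter tap per output element, plus one op per output element for the bias. A worked check of the first case, `bench_case(1, 32, 32, 200, 200, 3, 1)`:

```cpp
#include <cstdio>

int main() {
    // N=1, IC=32, OC=32, H=W=200, FS=3, group=1.
    double dst_elems = 1.0 * 32 * 200 * 200;  // N * OC * H * W = 1,280,000
    double mflops = ((32.0 / 1) * 3 * 3 * dst_elems * 2 + dst_elems) * 1e-6;
    std::printf("%.2f MFLOPs per run\n", mflops);  // prints 738.56
    return 0;
}
```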
@@ -83,6 +83,12 @@ TEST_F(X86, MATRIX_MUL_AVX2_MK8_8X8) {
             "X86_F32MK8_8X8", param::MatrixMul::Format::MK8, 1, 1e-3, false);
 }
 
+TEST_F(X86, MATRIX_MUL_AVX2_6x16) {
+    matrix_mul::check_matrix_mul(
+            dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
+            "X86_F32_6x16", param::MatrixMul::Format::DEFAULT, 1, 1e-3, false);
+}
+
 #if MEGDNN_WITH_BENCHMARK
 TEST_F(X86, BENCHMARK_MATRIX_MUL_AVX2_MK8_8X8) {

@@ -93,6 +99,14 @@ TEST_F(X86, BENCHMARK_MATRIX_MUL_AVX2_MK8_8X8) {
             dtype::Float32{}, dtype::Float32{}, "X86_F32_BLAS");
 }
 
+TEST_F(X86, BENCHMARK_MATRIX_MUL_AVX2_6x16) {
+    auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(8);
+    matrix_mul::benchmark_with_contrast(
+            handle(), args, dtype::Float32{}, dtype::Float32{}, dtype::Float32{},
+            "X86_F32_6x16", param::MatrixMul::Format::DEFAULT, dtype::Float32{},
+            dtype::Float32{}, dtype::Float32{}, "X86_F32_BLAS");
+}
+
 TEST_F(X86, BENCHMARK_MATRIX_MUL_8X8X32) {
     constexpr size_t RUNS = 50;
     auto rng = std::make_unique<UniformIntRNG>(-127, 127);