GitOrigin-RevId: 78c3cfceae
tags/v1.2.0
| @@ -435,16 +435,6 @@ public: | |||
| const TensorLayout& bias, const TensorLayout& z, | |||
| const TensorLayout& dst) = 0; | |||
| /** | |||
| * \brief deduce the original filter layout and conv_bias param after the winograd | |||
| * transform; this is used in fast-run to construct the original cache-key | |||
| */ | |||
| static void deduce_winograd_origin_layout_and_param( | |||
| const Param::Format format, const size_t output_block_size, | |||
| const TensorLayout& src_layout, | |||
| const TensorLayout& winograd_filter_layout, | |||
| TensorLayout& origin_layout, Param& origin_param); | |||
| enum class BiasMode : uint32_t { | |||
| NO_BIAS = 0, //!< no bias | |||
| BROADCAST_CHANNEL_BIAS, //!< broadcast channel bias, [1, c, 1, 1] | |||
| @@ -91,29 +91,6 @@ class MaxTensorDiff : public OperatorBase { | |||
| void check_exec(const TensorLayout& layout1, | |||
| const TensorLayout& layout2, size_t workspace_in_bytes); | |||
| }; | |||
| /*! | |||
| * \brief winograd preprocess opr. | |||
| * | |||
| * for details, \see src/fallback/conv_bias/winograd/winograd.h | |||
| * | |||
| */ | |||
| class WinogradFilterPreprocess : public OperatorBase { | |||
| DEF_OPR_PARAM(Winograd); | |||
| DEF_OPR_IMPL(WinogradFilterPreprocess, OperatorBase, 1, 1); | |||
| public: | |||
| virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, | |||
| _megdnn_workspace) = 0; | |||
| size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&); | |||
| void deduce_layout(const TensorLayout& src, TensorLayout& dst); | |||
| protected: | |||
| void check_exec(const TensorLayout& src, const TensorLayout& dst, | |||
| size_t workspace_in_bytes); | |||
| }; | |||
| } // namespace megdnn | |||
| #include "megdnn/internal/opr_header_epilogue.h" | |||
| @@ -39,7 +39,7 @@ pdef('Axis').add_fields('int32', 'axis', 0) | |||
| 'NCHW44','NCHW44_DOT', | |||
| Doc('NCHW_WINOGRAD', 'NCHW layout with weights transformed by winograd'), | |||
| Doc('NCHW88_WINOGRAD', 'NCHW88 layout with weights transformed by winograd'), | |||
| Doc('NCHW44_WINOGRAD', 'NCHW44 layout with weights transformed by winograd'), | |||
| Doc('NCHW4_NCHW32', 'NCHW4_NCHW32 means input tensors are nchw4 layout, output tensor is nchw32 layout'), | |||
| Doc('NCHW32_NCHW4', 'NCHW32_NCHW4 means input tensors are nchw32 layout, output tensor is nchw4 layout'), | |||
| Doc('NCHW4_NCHW', 'NCHW4_NCHW means input tensors are nchw4 layout, output tensor is nchw layout'), | |||
| @@ -456,15 +456,6 @@ pdef('PowC', 'power with constant exponent').add_fields('float32', 'exp', 0) | |||
| 'layout is (K/4, M/4, 4(m), 4(k)) x (K/4, N, 4(k))')) | |||
| ) | |||
| (pdef('Winograd', 'winograd param used in convbias'). | |||
| add_fields( | |||
| 'uint32', | |||
| Doc('output_block_size', 'output block size; for the detailed meaning see winograd ' | |||
| 'in convbias; equals the m in F(m, r)'), 0). | |||
| add_enum_alias('Format', 'MatrixMul'). | |||
| add_enum_alias('ComputeMode', 'Convolution', name_field='compute_mode') | |||
| ) | |||
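For reference (not from this diff): output_block_size here is the m in Winograd F(m, r), and with an r x r filter both the input and filter transforms operate on alpha x alpha tiles, where alpha = m + r - 1. A tiny C++ self-check of that relation:

// alpha = m + r - 1 for Winograd F(m, r); e.g. F(2,3) -> 4x4 tiles, F(6,3) -> 8x8.
constexpr size_t winograd_alpha(size_t m, size_t r) { return m + r - 1; }
static_assert(winograd_alpha(2, 3) == 4, "F(2,3): 4x4 transformed tiles");
static_assert(winograd_alpha(6, 3) == 8, "F(6,3): 8x8 transformed tiles");
static_assert(winograd_alpha(7, 3) == 9, "F(7,3): 9x9 transformed tiles");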
| (pdef('SVD'). | |||
| add_fields('bool', | |||
| Doc('full_matrices', | |||
| @@ -27,7 +27,7 @@ using namespace arm_common; | |||
| /* ======================= AlgoFP16WinogradF23 ======================== */ | |||
| bool ConvBiasImpl::AlgoFP16WinogradF23::usable( | |||
| const NCBKernSizeParam& param, | |||
| AlgoSelectionStrategy /*algo_selection_strategy*/) const { | |||
| MEGDNN_MARK_USED_VAR(param); | |||
| MIDOUT_BEGIN(megdnn_arm_common_winograd_fp16, 0, 0) { | |||
| @@ -37,12 +37,7 @@ bool ConvBiasImpl::AlgoFP16WinogradF23::usable( | |||
| strategy, m_tile_size, param) | |||
| .get_matmul_kern_param(param); | |||
| return m_matmul_algo->usable(matmul_param) && | |||
| (param.filter_meta.format == param::ConvBias::Format::NCHW || | |||
| (param.filter_meta.format == | |||
| param::ConvBias::Format::NCHW_WINOGRAD && | |||
| param.output_block_size == 2 && | |||
| param.winograd_matmul_format == | |||
| param::MatrixMul::Format::DEFAULT)) && | |||
| param.filter_meta.format == param::ConvBias::Format::NCHW && | |||
| !param.filter_meta.should_flip && | |||
| (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && | |||
| param.filter_meta.spatial[0] == 3) && | |||
| @@ -78,12 +73,7 @@ bool ConvBiasImpl::AlgoFP16WinogradF45::usable( | |||
| strategy, m_tile_size, param) | |||
| .get_matmul_kern_param(param); | |||
| return m_matmul_algo->usable(matmul_param) && | |||
| (param.filter_meta.format == param::ConvBias::Format::NCHW || | |||
| (param.filter_meta.format == | |||
| param::ConvBias::Format::NCHW_WINOGRAD && | |||
| param.output_block_size == 4 && | |||
| param.winograd_matmul_format == | |||
| param::MatrixMul::Format::DEFAULT)) && | |||
| param.filter_meta.format == param::ConvBias::Format::NCHW && | |||
| !param.filter_meta.should_flip && | |||
| (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && | |||
| param.filter_meta.spatial[0] == 5) && | |||
| @@ -117,12 +107,7 @@ bool ConvBiasImpl::AlgoFP16WinogradF63::usable( | |||
| strategy, m_tile_size, param) | |||
| .get_matmul_kern_param(param); | |||
| return m_matmul_algo->usable(matmul_param) && | |||
| (param.filter_meta.format == param::ConvBias::Format::NCHW || | |||
| (param.filter_meta.format == | |||
| param::ConvBias::Format::NCHW_WINOGRAD && | |||
| param.output_block_size == 6 && | |||
| param.winograd_matmul_format == | |||
| param::MatrixMul::Format::DEFAULT)) && | |||
| param.filter_meta.format == param::ConvBias::Format::NCHW && | |||
| !param.filter_meta.should_flip && | |||
| (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && | |||
| param.filter_meta.spatial[0] == 3) && | |||
| @@ -162,12 +147,7 @@ bool ConvBiasImpl::AlgoFP16WinogradF23_8x8::usable( | |||
| .get_matmul_kern_param(param); | |||
| return m_matmul_algo->usable(matmul_param) && | |||
| m_matmul_algo->packmode() == PackMode::NO_PACK && | |||
| (param.filter_meta.format == param::ConvBias::Format::NCHW || | |||
| (param.filter_meta.format == | |||
| param::ConvBias::Format::NCHW_WINOGRAD && | |||
| param.output_block_size == 2 && | |||
| param.winograd_matmul_format == | |||
| param::MatrixMul::Format::MK8)) && | |||
| param.filter_meta.format == param::ConvBias::Format::NCHW && | |||
| !param.filter_meta.should_flip && | |||
| (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && | |||
| param.filter_meta.spatial[0] == 3) && | |||
| @@ -47,12 +47,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF23_4x4::usable( | |||
| .get_matmul_kern_param(param); | |||
| return m_matmul_algo->usable(matmul_param) && | |||
| m_matmul_algo->packmode() == PackMode::NO_PACK && | |||
| (param.filter_meta.format == param::ConvBias::Format::NCHW || | |||
| (param.filter_meta.format == | |||
| param::ConvBias::Format::NCHW_WINOGRAD && | |||
| param.output_block_size == 2 && | |||
| param.winograd_matmul_format == | |||
| param::MatrixMul::Format::MK4)) && | |||
| param.filter_meta.format == param::ConvBias::Format::NCHW && | |||
| !param.filter_meta.should_flip && | |||
| (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && | |||
| param.filter_meta.spatial[0] == 3) && | |||
| @@ -86,12 +81,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF63::usable( | |||
| strategy, m_tile_size, param) | |||
| .get_matmul_kern_param(param); | |||
| return m_matmul_algo->usable(matmul_param) && | |||
| (param.filter_meta.format == param::ConvBias::Format::NCHW || | |||
| (param.filter_meta.format == | |||
| param::ConvBias::Format::NCHW_WINOGRAD && | |||
| param.output_block_size == 6 && | |||
| param.winograd_matmul_format == | |||
| param::MatrixMul::Format::DEFAULT)) && | |||
| param.filter_meta.format == param::ConvBias::Format::NCHW && | |||
| !param.filter_meta.should_flip && | |||
| (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && | |||
| param.filter_meta.spatial[0] == 3) && | |||
| @@ -125,12 +115,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF54::usable( | |||
| strategy, m_tile_size, param) | |||
| .get_matmul_kern_param(param); | |||
| return m_matmul_algo->usable(matmul_param) && | |||
| (param.filter_meta.format == param::ConvBias::Format::NCHW || | |||
| (param.filter_meta.format == | |||
| param::ConvBias::Format::NCHW_WINOGRAD && | |||
| param.output_block_size == 5 && | |||
| param.winograd_matmul_format == | |||
| param::MatrixMul::Format::DEFAULT)) && | |||
| param.filter_meta.format == param::ConvBias::Format::NCHW && | |||
| !param.filter_meta.should_flip && | |||
| (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && | |||
| param.filter_meta.spatial[0] == 4) && | |||
| @@ -164,12 +149,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF45::usable( | |||
| strategy, m_tile_size, param) | |||
| .get_matmul_kern_param(param); | |||
| return m_matmul_algo->usable(matmul_param) && | |||
| (param.filter_meta.format == param::ConvBias::Format::NCHW || | |||
| (param.filter_meta.format == | |||
| param::ConvBias::Format::NCHW_WINOGRAD && | |||
| param.output_block_size == 4 && | |||
| param.winograd_matmul_format == | |||
| param::MatrixMul::Format::DEFAULT)) && | |||
| param.filter_meta.format == param::ConvBias::Format::NCHW && | |||
| !param.filter_meta.should_flip && | |||
| (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && | |||
| param.filter_meta.spatial[0] == 5) && | |||
| @@ -209,12 +189,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF63_4x4::usable( | |||
| .get_matmul_kern_param(param); | |||
| return m_matmul_algo->usable(matmul_param) && | |||
| m_matmul_algo->packmode() == PackMode::NO_PACK && | |||
| (param.filter_meta.format == param::ConvBias::Format::NCHW || | |||
| (param.filter_meta.format == | |||
| param::ConvBias::Format::NCHW_WINOGRAD && | |||
| param.output_block_size == 6 && | |||
| param.winograd_matmul_format == | |||
| param::MatrixMul::Format::MK4)) && | |||
| param.filter_meta.format == param::ConvBias::Format::NCHW && | |||
| !param.filter_meta.should_flip && | |||
| (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && | |||
| param.filter_meta.spatial[0] == 3) && | |||
| @@ -257,12 +232,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF23_4x4_NCHW44::usable( | |||
| return m_matmul_algo->usable(matmul_param) && | |||
| m_matmul_algo->packmode() == | |||
| fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK && | |||
| (param.filter_meta.format == param::ConvBias::Format::NCHW44 || | |||
| (param.filter_meta.format == | |||
| param::ConvBias::Format::NCHW44_WINOGRAD && | |||
| param.output_block_size == 2 && | |||
| param.winograd_matmul_format == | |||
| param::MatrixMul::Format::MK4)) && | |||
| param.filter_meta.format == param::ConvBias::Format::NCHW44 && | |||
| !param.filter_meta.should_flip && | |||
| (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && | |||
| param.filter_meta.spatial[0] == 3) && | |||
| @@ -303,12 +273,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF63_4x4_NCHW44::usable( | |||
| return m_matmul_algo->usable(matmul_param) && | |||
| m_matmul_algo->packmode() == | |||
| fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK && | |||
| (param.filter_meta.format == param::ConvBias::Format::NCHW44 || | |||
| (param.filter_meta.format == | |||
| param::ConvBias::Format::NCHW44_WINOGRAD && | |||
| param.output_block_size == 6 && | |||
| param.winograd_matmul_format == | |||
| param::MatrixMul::Format::MK4)) && | |||
| param.filter_meta.format == param::ConvBias::Format::NCHW44 && | |||
| !param.filter_meta.should_flip && | |||
| (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && | |||
| param.filter_meta.spatial[0] == 3) && | |||
| @@ -350,12 +315,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF73_4x4_NCHW44::usable( | |||
| return m_matmul_algo->usable(matmul_param) && | |||
| m_matmul_algo->packmode() == | |||
| fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK && | |||
| (param.filter_meta.format == param::ConvBias::Format::NCHW44 || | |||
| (param.filter_meta.format == | |||
| param::ConvBias::Format::NCHW44_WINOGRAD && | |||
| param.output_block_size == 7 && | |||
| param.winograd_matmul_format == | |||
| param::MatrixMul::Format::MK4)) && | |||
| param.filter_meta.format == param::ConvBias::Format::NCHW44 && | |||
| !param.filter_meta.should_flip && | |||
| (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && | |||
| param.filter_meta.spatial[0] == 3) && | |||
| @@ -242,14 +242,8 @@ bool ConvBiasImpl::AlgoS8WinogradF23_8x8::usable( | |||
| .get_matmul_kern_param(param); | |||
| return m_matmul_algo->usable(matmul_param) && | |||
| m_matmul_algo->packmode() == PackMode::NO_PACK && | |||
| ((param.filter_meta.format == param::ConvBias::Format::NCHW && | |||
| param.filter_type.enumv() == DTypeEnum::QuantizedS8) || | |||
| (param.filter_meta.format == | |||
| param::ConvBias::Format::NCHW_WINOGRAD && | |||
| param.output_block_size == 2 && | |||
| param.winograd_matmul_format == | |||
| param::MatrixMul::Format::MK8 && | |||
| param.filter_type.enumv() == DTypeEnum::QuantizedS16)) && | |||
| (param.filter_meta.format == param::ConvBias::Format::NCHW && | |||
| param.filter_type.enumv() == DTypeEnum::QuantizedS8) && | |||
| !param.filter_meta.should_flip && | |||
| (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && | |||
| param.filter_meta.spatial[0] == 3) && | |||
| @@ -293,13 +287,8 @@ bool ConvBiasImpl::AlgoS8CF32WinogradF23_4x4_NCHW44::usable( | |||
| .get_matmul_kern_param(param)); | |||
| return is_matmul_usable && | |||
| m_matmul_algo->packmode() == PackMode::NO_PACK && | |||
| ((param.filter_meta.format == param::ConvBias::Format::NCHW44 && | |||
| param.filter_type.enumv() == DTypeEnum::QuantizedS8) || | |||
| ((param.filter_meta.format == | |||
| param::ConvBias::Format::NCHW44_WINOGRAD) && | |||
| param.output_block_size == 2 && | |||
| param.winograd_matmul_format == | |||
| param::MatrixMul::Format::MK4)) && | |||
| (param.filter_meta.format == param::ConvBias::Format::NCHW44 && | |||
| param.filter_type.enumv() == DTypeEnum::QuantizedS8) && | |||
| !param.filter_meta.should_flip && | |||
| (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && | |||
| param.filter_meta.spatial[0] == 3) && | |||
| @@ -341,14 +330,8 @@ bool ConvBiasImpl::AlgoS8WinogradF23_8x8_NCHW44::usable( | |||
| .get_matmul_kern_param(param); | |||
| bool is_matmul_usable = m_matmul_algo->usable(matmul_param); | |||
| return is_matmul_usable && | |||
| ((param.filter_meta.format == param::ConvBias::Format::NCHW44 && | |||
| param.filter_type.enumv() == DTypeEnum::QuantizedS8) || | |||
| (param.filter_meta.format == | |||
| param::ConvBias::Format::NCHW44_WINOGRAD && | |||
| param.output_block_size == 2 && | |||
| param.winograd_matmul_format == | |||
| param::MatrixMul::Format::MK8 && | |||
| param.filter_type.enumv() == DTypeEnum::QuantizedS16)) && | |||
| (param.filter_meta.format == param::ConvBias::Format::NCHW44 && | |||
| param.filter_type.enumv() == DTypeEnum::QuantizedS8) && | |||
| !param.filter_meta.should_flip && | |||
| (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && | |||
| param.filter_meta.spatial[0] == 3) && | |||
| @@ -240,7 +240,6 @@ void winograd_2x3_4x4_s8_f32_nchw44::filter(const int8_t* filter, | |||
| float* transform_mid_buf, size_t OC, size_t IC, | |||
| size_t oc_start, size_t oc_end) { | |||
| constexpr int alpha = 2 + 3 - 1; | |||
| /** | |||
| * origin: (4x3) * (3 x 3) * (3 x 4) | |||
| */ | |||
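The comment above records the shapes of the F(2,3) filter transform U = G * g * G^T: G is 4x3, the spatial filter g is 3x3, and G^T is 3x4, so U is the 4x4 (alpha x alpha) transformed filter. Below is a shape-only sketch of that product; the concrete G coefficients live in the strategy code and are not reproduced here.

#include <array>

// U = G * g * G^T for F(2,3): (4x3) * (3x3) * (3x4) -> 4x4.
std::array<std::array<float, 4>, 4> filter_transform_f23(
        const std::array<std::array<float, 3>, 4>& G,    // Winograd G, 4x3
        const std::array<std::array<float, 3>, 3>& g) {  // spatial filter, 3x3
    std::array<std::array<float, 3>, 4> Gg{};            // G * g : 4x3
    for (int i = 0; i < 4; ++i)
        for (int j = 0; j < 3; ++j)
            for (int k = 0; k < 3; ++k)
                Gg[i][j] += G[i][k] * g[k][j];
    std::array<std::array<float, 4>, 4> U{};             // (G * g) * G^T : 4x4
    for (int i = 0; i < 4; ++i)
        for (int j = 0; j < 4; ++j)
            for (int k = 0; k < 3; ++k)
                U[i][j] += Gg[i][k] * G[j][k];
    return U;
}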
| @@ -290,8 +290,8 @@ ConvBiasImpl::get_all_packed_algo() { | |||
| bool ConvBiasImpl::is_matmul_quantized_prefer( | |||
| const ConvBiasImpl::NCBKernSizeParam& param) const { | |||
| fallback::ConvBiasImpl::NCBKernSizeParam conv_ncb_param( | |||
| param, 0, param::MatrixMul::Format::DEFAULT, {}, 0, | |||
| BiasMode::NO_BIAS, param::ConvBias::NonlineMode::IDENTITY); | |||
| param, {}, 0, BiasMode::NO_BIAS, | |||
| param::ConvBias::NonlineMode::IDENTITY); | |||
| conv_ncb_param.dst_type = param.bias_type; | |||
| conv_ncb_param.filter_meta.group = 1; | |||
| @@ -320,11 +320,6 @@ SmallVector<AlgoCategory> ConvBiasImpl::suggest_algo_category_order( | |||
| auto FH = param.filter_meta.spatial[0]; | |||
| auto FW = param.filter_meta.spatial[1]; | |||
| //! TODO: now winograd only support fast-run | |||
| if (param.filter_meta.format == param::ConvBias::Format::NCHW_WINOGRAD || | |||
| param.filter_meta.format == param::ConvBias::Format::NCHW44_WINOGRAD || | |||
| param.filter_meta.format == param::ConvBias::Format::NCHW88_WINOGRAD) { | |||
| return {AlgoCategory::WINOGRAD}; | |||
| } | |||
| //! im2col | |||
| bool im2col_prefer = (IC >= 32 || OC >= 32); | |||
| //! quantized algo use matmul when direct algo is unusable | |||
| @@ -27,7 +27,7 @@ | |||
| #include "src/arm_common/type_cvt/opr_impl.h" | |||
| #include "src/arm_common/reduce/opr_impl.h" | |||
| #include "src/arm_common/conv_bias/opr_impl.h" | |||
| #include "src/arm_common/winograd_filter_preprocess/opr_impl.h" | |||
| namespace megdnn { | |||
| namespace arm_common { | |||
| @@ -50,7 +50,6 @@ MEGDNN_SPECIALIZE_CREATE_OPERATOR(WarpPerspective) | |||
| MEGDNN_SPECIALIZE_CREATE_OPERATOR(TypeCvt) | |||
| MEGDNN_SPECIALIZE_CREATE_OPERATOR(Reduce) | |||
| MEGDNN_SPECIALIZE_CREATE_OPERATOR(ConvBias) | |||
| MEGDNN_SPECIALIZE_CREATE_OPERATOR(WinogradFilterPreprocess) | |||
| MEGDNN_SPECIALIZE_CREATE_OPERATOR(ConvolutionBackwardData) | |||
| #pragma GCC diagnostic push | |||
| @@ -1,179 +0,0 @@ | |||
| /** | |||
| * \file dnn/src/arm_common/winograd_filter_preprocess/opr_impl.cpp | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| */ | |||
| #include "src/arm_common/winograd_filter_preprocess/opr_impl.h" | |||
| #include "src/arm_common/handle.h" | |||
| #include "src/common/utils.h" | |||
| #include "src/arm_common/conv_bias/fp32/strategy.h" | |||
| #include "src/arm_common/conv_bias/int8/strategy.h" | |||
| #include "src/arm_common/conv_bias/f16/strategy.h" | |||
| #include "midout.h" | |||
| MIDOUT_DECL(megdnn_arm_common_winograd_filter_preprocess) | |||
| using namespace megdnn; | |||
| using namespace arm_common; | |||
| void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src, | |||
| _megdnn_tensor_out dst, | |||
| _megdnn_workspace workspace) { | |||
| using namespace winograd; | |||
| check_exec(src.layout, dst.layout, workspace.size); | |||
| //! NCHW44 group conv or NCHW group conv or both dense conv | |||
| size_t flt_start = 0; | |||
| size_t pack_c_size = 1; | |||
| size_t group = 1; | |||
| if (src.layout.ndim == 5) { //! {g, OC, IC, FH, FW} | |||
| flt_start = 1; | |||
| group = src.layout[0]; | |||
| } else if (src.layout.ndim == 6) { //! {OC/4, IC/4, FH, FW, 4, 4} | |||
| pack_c_size = src.layout[5]; | |||
| } else if (src.layout.ndim == 7) { //! {g, OC/4, IC/4, FH, FW, 4, 4} | |||
| flt_start = 1; | |||
| group = src.layout[0]; | |||
| pack_c_size = src.layout[6]; | |||
| } | |||
| size_t OC = src.layout[flt_start] * pack_c_size, | |||
| IC = src.layout[flt_start + 1] * pack_c_size, | |||
| FW = src.layout[flt_start + 3]; | |||
| size_t m = param().output_block_size; | |||
| bool execed = false; | |||
| #define DISPATCH(_strategy, _format, ...) \ | |||
| MIDOUT_BEGIN(megdnn_arm_common_winograd_filter_preprocess, \ | |||
| ##__VA_ARGS__) { \ | |||
| if (param().format == _format) { \ | |||
| for (size_t g = 0; g < group; g++) { \ | |||
| auto run = [=]() { \ | |||
| _strategy strategy(src.layout.dtype, src.layout.dtype, \ | |||
| src.layout.dtype); \ | |||
| megdnn::winograd::ConvBias<_strategy, _format>(strategy, \ | |||
| 1_z) \ | |||
| .filter_process(src_ptr, dst_ptr, workspace_ptr, \ | |||
| OC, IC); \ | |||
| }; \ | |||
| MEGDNN_DISPATCH_CPU_KERN_OPR(run()); \ | |||
| src_ptr += src.layout.stride[0]; \ | |||
| dst_ptr += dst.layout.stride[0]; \ | |||
| } \ | |||
| execed = true; \ | |||
| } \ | |||
| } \ | |||
| MIDOUT_END(); | |||
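// Illustration only, not part of the original file: one DISPATCH instantiation
// above roughly expands to the following, for the fp32 F(2,3) MK4 case and with
// the midout bookkeeping dropped:
//
//     if (param().format == param::Winograd::Format::MK4) {
//         for (size_t g = 0; g < group; g++) {
//             auto run = [=]() {
//                 winograd_2x3_4x4_f strategy(src.layout.dtype, src.layout.dtype,
//                                             src.layout.dtype);
//                 megdnn::winograd::ConvBias<winograd_2x3_4x4_f,
//                                            param::Winograd::Format::MK4>(
//                         strategy, 1_z)
//                         .filter_process(src_ptr, dst_ptr, workspace_ptr, OC, IC);
//             };
//             MEGDNN_DISPATCH_CPU_KERN_OPR(run());
//             src_ptr += src.layout.stride[0];
//             dst_ptr += dst.layout.stride[0];
//         }
//         execed = true;
//     }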
| if (src.layout.dtype.enumv() == DTypeEnum::Float32) { | |||
| const float* src_ptr = src.ptr<float>(); | |||
| float* dst_ptr = dst.ptr<float>(); | |||
| float* workspace_ptr = workspace.ptr<float>(); | |||
| if (FW == 3) { | |||
| if (m == 2) { | |||
| if (pack_c_size == 1) { | |||
| DISPATCH(winograd_2x3_4x4_f, param::Winograd::Format::MK4, | |||
| 0, 0); | |||
| } else if (pack_c_size == 4) { | |||
| DISPATCH(winograd_F23_mk4_f_nchw44, | |||
| param::Winograd::Format::MK4, 0, 5); | |||
| } | |||
| } else if (m == 6) { | |||
| DISPATCH(winograd_6x3_1x1_f, param::Winograd::Format::DEFAULT, | |||
| 0, 1); | |||
| if (pack_c_size == 1) { | |||
| DISPATCH(winograd_6x3_4x4_f, param::Winograd::Format::MK4, | |||
| 0, 2); | |||
| } else if (pack_c_size == 4) { | |||
| DISPATCH(winograd_F63_mk4_f_nchw44, | |||
| param::Winograd::Format::MK4, 0, 6); | |||
| } | |||
| } else if (m == 7) { | |||
| megdnn_assert(pack_c_size == 4, "WINOGRAD F(7,3) Only Supports NCHW44"); | |||
| DISPATCH(winograd_F73_mk4_f_nchw44, | |||
| param::Winograd::Format::MK4, 0, 7); | |||
| } | |||
| } else if (FW == 4) { | |||
| if (m == 5) { | |||
| DISPATCH(winograd_5x4_1x1_f, param::Winograd::Format::DEFAULT, | |||
| 0, 3); | |||
| } | |||
| } else if (FW == 5) { | |||
| if (m == 4) { | |||
| DISPATCH(winograd_4x5_1x1_f, param::Winograd::Format::DEFAULT, | |||
| 0, 4); | |||
| } | |||
| } | |||
| } | |||
| if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS8) { | |||
| const dt_int8* src_ptr = src.compatible_ptr<dt_int8>(); | |||
| if (param().compute_mode == param::ConvBias::ComputeMode::DEFAULT) { | |||
| dt_int16* dst_ptr = dst.compatible_ptr<dt_int16>(); | |||
| dt_int16* workspace_ptr = workspace.ptr<dt_int16>(); | |||
| if (FW == 3) { | |||
| if (m == 2) { | |||
| if (pack_c_size == 1) { | |||
| DISPATCH(winograd_2x3_8x8_s8, | |||
| param::Winograd::Format::MK8, 1, 0); | |||
| } else if (pack_c_size == 4) { | |||
| DISPATCH(winograd_2x3_8x8_s8_nchw44, | |||
| param::Winograd::Format::MK8, 1, 0); | |||
| }else{ | |||
| megdnn_throw("only support pack_c_size = 1 or 4"); | |||
| } | |||
| } | |||
| } | |||
| } else { | |||
| dt_int32* dst_ptr_tmp = dst.compatible_ptr<dt_int32>(); | |||
| dt_int32* workspace_ptr_tmp = workspace.ptr<dt_int32>(); | |||
| float* dst_ptr = reinterpret_cast<float*>(dst_ptr_tmp); | |||
| float* workspace_ptr = reinterpret_cast<float*>(workspace_ptr_tmp); | |||
| if (pack_c_size == 4) { | |||
| if (FW == 3) { | |||
| if (m == 2) { | |||
| DISPATCH(winograd_2x3_4x4_s8_f32_nchw44, | |||
| param::Winograd::Format::MK4, 1, 1); | |||
| } | |||
| } | |||
| } else { | |||
| megdnn_throw("only support pack_c_size == 4"); | |||
| } | |||
| } | |||
| } | |||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| if (src.layout.dtype.enumv() == DTypeEnum::Float16) { | |||
| const dt_float16* src_ptr = src.ptr<dt_float16>(); | |||
| dt_float16* dst_ptr = dst.ptr<dt_float16>(); | |||
| dt_float16* workspace_ptr = workspace.ptr<dt_float16>(); | |||
| if (FW == 3) { | |||
| if (m == 2) { | |||
| DISPATCH(winograd_2x3_4x4_f16, param::Winograd::Format::DEFAULT, | |||
| 2, 0); | |||
| DISPATCH(winograd_2x3_8x8_f16, param::Winograd::Format::MK8, 2, | |||
| 1); | |||
| } else if (m == 6) { | |||
| DISPATCH(winograd_6x3_1x1_f16, param::Winograd::Format::DEFAULT, | |||
| 2, 2); | |||
| } | |||
| } else if (FW == 5) { | |||
| if (m == 4) { | |||
| DISPATCH(winograd_4x5_1x1_f16, param::Winograd::Format::DEFAULT, | |||
| 2, 3); | |||
| } | |||
| } | |||
| } | |||
| #endif | |||
| #undef DISPATCH | |||
| megdnn_assert(execed, | |||
| "Unsupport winograd filter preprocess. m: %zu src: %s", m, | |||
| src.layout.to_string().c_str()); | |||
| } | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -1,28 +0,0 @@ | |||
| /** | |||
| * \file dnn/src/arm_common/winograd_filter_preprocess/opr_impl.h | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| */ | |||
| #pragma once | |||
| #include "megdnn/oprs.h" | |||
| #include "src/common/utils.h" | |||
| namespace megdnn { | |||
| namespace arm_common { | |||
| class WinogradFilterPreprocessImpl : public WinogradFilterPreprocess { | |||
| public: | |||
| using WinogradFilterPreprocess::WinogradFilterPreprocess; | |||
| void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, | |||
| _megdnn_workspace workspace) override; | |||
| }; | |||
| } // namespace arm_common | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -35,37 +35,11 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec( | |||
| const TensorLayout& bias, const TensorLayout& z, | |||
| const TensorLayout& dst, size_t workspace_in_bytes, | |||
| const PreprocessedFilter* preprocessed_filter) { | |||
| if ((param().format == param::ConvBias::Format::NCHW_WINOGRAD || | |||
| param().format == param::ConvBias::Format::NCHW88_WINOGRAD || | |||
| param().format == param::ConvBias::Format::NCHW44_WINOGRAD) && | |||
| src.dtype.category() == DTypeCategory::QUANTIZED) { | |||
| megdnn_assert(filter.dtype.enumv() == DTypeEnum::QuantizedS16 || | |||
| //! int8 winogradf23_44 computes in float; QuantizedS32 carries the scale | |||
| filter.dtype.enumv() == DTypeEnum::QuantizedS32); | |||
| megdnn_assert(src.dtype.enumv() == DTypeEnum::QuantizedS8 || | |||
| src.dtype.enumv() == DTypeEnum::Quantized8Asymm); | |||
| } else { | |||
| megdnn_assert(src.dtype.enumv() == filter.dtype.enumv()); | |||
| } | |||
| megdnn_assert(src.dtype.enumv() == filter.dtype.enumv()); | |||
| if (src.dtype.enumv() == DTypeEnum::QuantizedS8) { | |||
| if (bias.dtype.enumv() == DTypeEnum::QuantizedS32) { | |||
| float scale_src = src.dtype.param<dtype::QuantizedS8>().scale; | |||
| float scale_filter = 0.f; | |||
| if (param().format == param::ConvBias::Format::NCHW_WINOGRAD || | |||
| param().format == param::ConvBias::Format::NCHW88_WINOGRAD || | |||
| param().format == param::ConvBias::Format::NCHW44_WINOGRAD) { | |||
| if (filter.dtype.enumv() == DTypeEnum::QuantizedS32) { | |||
| //! int8 winogradf23_44 computes in float; QuantizedS32 | |||
| //! carries the scale | |||
| scale_filter = | |||
| filter.dtype.param<dtype::QuantizedS32>().scale; | |||
| } else { | |||
| scale_filter = | |||
| filter.dtype.param<dtype::QuantizedS16>().scale; | |||
| } | |||
| } else { | |||
| scale_filter = filter.dtype.param<dtype::QuantizedS8>().scale; | |||
| } | |||
| float scale_filter = filter.dtype.param<dtype::QuantizedS8>().scale; | |||
| float scale_bias = bias.dtype.param<dtype::QuantizedS32>().scale; | |||
| megdnn_assert( | |||
| std::abs(scale_src * scale_filter - scale_bias) < 1e-6, | |||
| @@ -77,15 +51,8 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec( | |||
| } else if (src.dtype.enumv() == DTypeEnum::Quantized8Asymm) { | |||
| if (bias.dtype.enumv() == DTypeEnum::QuantizedS32) { | |||
| float scale_src = src.dtype.param<dtype::Quantized8Asymm>().scale; | |||
| float scale_filter = 0.f; | |||
| if (param().format == param::ConvBias::Format::NCHW_WINOGRAD || | |||
| param().format == param::ConvBias::Format::NCHW88_WINOGRAD || | |||
| param().format == param::ConvBias::Format::NCHW44_WINOGRAD) { | |||
| scale_filter = filter.dtype.param<dtype::QuantizedS16>().scale; | |||
| } else { | |||
| scale_filter = | |||
| filter.dtype.param<dtype::Quantized8Asymm>().scale; | |||
| } | |||
| float scale_filter = | |||
| filter.dtype.param<dtype::Quantized8Asymm>().scale; | |||
| float scale_bias = bias.dtype.param<dtype::QuantizedS32>().scale; | |||
| megdnn_assert( | |||
| std::abs(scale_src * scale_filter - scale_bias) < 1e-6, | |||
| @@ -115,7 +82,6 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec( | |||
| if (check_eq(bias, dst)) | |||
| return ret; | |||
| if (param().format == param::ConvBias::Format::NCHW || | |||
| param().format == param::ConvBias::Format::NCHW_WINOGRAD || | |||
| param().format == param::ConvBias::Format::NCHW4_NCHW) { | |||
| megdnn_assert(bias.shape[0] == 1); | |||
| megdnn_assert(bias.shape[1] == dst.shape[1], "bias:%s, dst:%s", | |||
| @@ -131,7 +97,6 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec( | |||
| } else if (param().format == param::ConvBias::Format::NCHW4 || | |||
| param().format == param::ConvBias::Format::NCHW44 || | |||
| param().format == param::ConvBias::Format::NCHW44_DOT || | |||
| param().format == param::ConvBias::Format::NCHW44_WINOGRAD || | |||
| param().format == param::ConvBias::Format::NCHW32_NCHW4) { | |||
| megdnn_assert(bias.shape[0] == 1); | |||
| megdnn_assert(bias.shape[1] == dst.shape[1], "bias:%s, dst:%s", | |||
| @@ -140,8 +105,7 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec( | |||
| megdnn_assert(bias.shape[3] == 1); | |||
| megdnn_assert(bias.shape[4] == 4); | |||
| } else if (param().format == param::ConvBias::Format::NCHW8 || | |||
| param().format == param::ConvBias::Format::NCHW88 || | |||
| param().format == param::ConvBias::Format::NCHW88_WINOGRAD) { | |||
| param().format == param::ConvBias::Format::NCHW88 ) { | |||
| megdnn_assert(bias.shape[0] == 1); | |||
| megdnn_assert(bias.shape[1] == dst.shape[1], "bias:%s, dst:%s", | |||
| bias.to_string().c_str(), dst.to_string().c_str()); | |||
| @@ -175,11 +139,6 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec( | |||
| } | |||
| if (z.ndim != 0) { | |||
| megdnn_assert(param().format != param::ConvBias::Format::NCHW_WINOGRAD); | |||
| megdnn_assert(param().format != | |||
| param::ConvBias::Format::NCHW88_WINOGRAD); | |||
| megdnn_assert(param().format != | |||
| param::ConvBias::Format::NCHW44_WINOGRAD); | |||
| megdnn_assert(param().format != param::ConvBias::Format::NCHW4_NCHW32); | |||
| megdnn_assert(param().format != param::ConvBias::Format::NCHW32_NCHW4); | |||
| megdnn_assert(z.dtype.enumv() == dst.dtype.enumv()); | |||
| @@ -187,105 +146,6 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec( | |||
| } | |||
| return ret; | |||
| } | |||
| /*! | |||
| * \brief deduce the original filter layout and param after the winograd transform | |||
| */ | |||
| void ConvBiasForward::deduce_winograd_origin_layout_and_param( | |||
| const Param::Format format, const size_t output_block_size, | |||
| const TensorLayout& src_layout, | |||
| const TensorLayout& winograd_filter_layout, TensorLayout& origin_layout, | |||
| Param& origin_param) { | |||
| if (format == megdnn::param::ConvBias::Format::NCHW88_WINOGRAD || | |||
| format == megdnn::param::ConvBias::Format::NCHW44_WINOGRAD || | |||
| format == megdnn::param::ConvBias::Format::NCHW_WINOGRAD) { | |||
| //! change NCHWxx_WINOGRAD to NCHWxx | |||
| size_t OC = 0; | |||
| size_t IC = 0; | |||
| size_t GROUP = 1; | |||
| size_t FH = winograd_filter_layout[1] - output_block_size + 1; | |||
| //! {alpha, alpha, IC, OC} | |||
| if (winograd_filter_layout.ndim == 4) { | |||
| OC = winograd_filter_layout[3]; | |||
| IC = winograd_filter_layout[2]; | |||
| } | |||
| //! {group, alpha, alpha, IC, OC} | |||
| else if (winograd_filter_layout.ndim == 5) { | |||
| OC = winograd_filter_layout[4]; | |||
| IC = winograd_filter_layout[3]; | |||
| GROUP = winograd_filter_layout[0]; | |||
| } | |||
| //! {alpha, alpha, OC/f, IC/f, f, f} | |||
| else if (winograd_filter_layout.ndim == 6) { | |||
| OC = winograd_filter_layout[2] * winograd_filter_layout[5]; | |||
| IC = winograd_filter_layout[3] * winograd_filter_layout[4]; | |||
| } | |||
| //! {group, alpha, alpha, OC/f, IC/f, f, f} | |||
| else if (winograd_filter_layout.ndim == 7) { | |||
| OC = winograd_filter_layout[3] * winograd_filter_layout[6]; | |||
| IC = winograd_filter_layout[4] * winograd_filter_layout[5]; | |||
| GROUP = winograd_filter_layout[0]; | |||
| } | |||
| auto origin_data_type = winograd_filter_layout.dtype; | |||
| if (src_layout.dtype.enumv() == DTypeEnum::QuantizedS8) { | |||
| if (origin_data_type.enumv() == DTypeEnum::QuantizedS16) { | |||
| float scale = | |||
| origin_data_type.param<dtype::QuantizedS16>().scale; | |||
| origin_data_type = megdnn::dtype::QuantizedS8(scale); | |||
| } else { | |||
| //! In order to bring the scale of the filter along, the transformed | |||
| //! qint8 winograd filter computed with float dtype is QuantizedS32 | |||
| megdnn_assert(origin_data_type.enumv() == | |||
| DTypeEnum::QuantizedS32); | |||
| float scale = | |||
| origin_data_type.param<dtype::QuantizedS32>().scale; | |||
| origin_data_type = megdnn::dtype::QuantizedS8(scale); | |||
| } | |||
| } | |||
| if (GROUP == 1) { | |||
| if (format == megdnn::param::ConvBias::Format::NCHW_WINOGRAD) { | |||
| origin_layout = | |||
| TensorLayout({OC, IC, FH, FH}, origin_data_type); | |||
| } else if (format == | |||
| megdnn::param::ConvBias::Format::NCHW44_WINOGRAD) { | |||
| origin_layout = TensorLayout({OC / 4, IC / 4, FH, FH, 4, 4}, | |||
| origin_data_type); | |||
| } else { | |||
| megdnn_assert(format == | |||
| megdnn::param::ConvBias::Format::NCHW88_WINOGRAD); | |||
| origin_layout = TensorLayout({OC / 8, IC / 8, FH, FH, 8, 8}, | |||
| origin_data_type); | |||
| } | |||
| } else { | |||
| if (format == megdnn::param::ConvBias::Format::NCHW_WINOGRAD) { | |||
| origin_layout = | |||
| TensorLayout({GROUP, OC, IC, FH, FH}, origin_data_type); | |||
| } else if (format == | |||
| megdnn::param::ConvBias::Format::NCHW44_WINOGRAD) { | |||
| origin_layout = | |||
| TensorLayout({GROUP, OC / 4, IC / 4, FH, FH, 4, 4}, | |||
| origin_data_type); | |||
| } else { | |||
| megdnn_assert(format == | |||
| megdnn::param::ConvBias::Format::NCHW88_WINOGRAD); | |||
| origin_layout = | |||
| TensorLayout({GROUP, OC / 8, IC / 8, FH, FH, 8, 8}, | |||
| origin_data_type); | |||
| } | |||
| } | |||
| origin_param.output_block_size = 0; | |||
| if (format == megdnn::param::ConvBias::Format::NCHW_WINOGRAD) { | |||
| origin_param.format = megdnn::param::ConvBias::Format::NCHW; | |||
| } else if (format == megdnn::param::ConvBias::Format::NCHW44_WINOGRAD) { | |||
| origin_param.format = megdnn::param::ConvBias::Format::NCHW44; | |||
| } else { | |||
| megdnn_assert(format == | |||
| megdnn::param::ConvBias::Format::NCHW88_WINOGRAD); | |||
| origin_param.format = megdnn::param::ConvBias::Format::NCHW88; | |||
| } | |||
| } | |||
| } | |||
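A worked example of the mapping implemented by the helper removed above (illustrative numbers): a dense NCHW_WINOGRAD F(2,3) filter that was originally QuantizedS8, stored after the transform as QuantizedS16 with layout {4, 4, 16, 32} ({alpha, alpha, IC, OC}) and output_block_size = 2, maps back to

    FH            = 4 - 2 + 1 = 3
    origin_layout = {32, 16, 3, 3}    ({OC, IC, FH, FH}, dtype QuantizedS8 with the same scale)
    origin_param  = {format: NCHW, output_block_size: 0}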
| template <typename T> | |||
| struct NCHWParamTrait; | |||
| @@ -41,36 +41,12 @@ uint32_t spatial_getter(uint32_t filter, const Param&) { | |||
| return filter; | |||
| } | |||
| template <> | |||
| uint32_t | |||
| spatial_getter<param::ConvBias, param::ConvBias::Format::NCHW_WINOGRAD>( | |||
| uint32_t filter, const param::ConvBias& param) { | |||
| //! f = m + r - 1 -> r = f + 1 - m | |||
| return filter - param.output_block_size + 1; | |||
| } | |||
| template <> | |||
| uint32_t | |||
| spatial_getter<param::ConvBias, param::ConvBias::Format::NCHW88_WINOGRAD>( | |||
| uint32_t filter, const param::ConvBias& param) { | |||
| //! f = m + r - 1 -> r = f + 1 - m | |||
| return filter - param.output_block_size + 1; | |||
| } | |||
| template <> | |||
| uint32_t | |||
| spatial_getter<param::ConvBias, param::ConvBias::Format::NCHW44_WINOGRAD>( | |||
| uint32_t filter, const param::ConvBias& param) { | |||
| //! f = m + r - 1 -> r = f + 1 - m | |||
| return filter - param.output_block_size + 1; | |||
| } | |||
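The three removed specializations above were identical: given the stored (transformed) filter dimension f = alpha and the param's output_block_size m, they recovered the original filter size as r = f - m + 1; for example, f = 8 with m = 6 gives r = 3.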
| template <typename Parameter, typename Param> | |||
| void make_canonized_filter_meta_nchw_nhwc( | |||
| size_t src_ndim, const TensorLayout& filter, const Param& param, | |||
| typename ConvolutionBase<Parameter>::CanonizedFilterMeta& ret) { | |||
| megdnn_assert(param.format == Param::Format::NCHW || | |||
| param.format == Param::Format::NHWC || | |||
| param.format == Param::Format::NCHW_WINOGRAD); | |||
| param.format == Param::Format::NHWC ); | |||
| auto img_ndim = src_ndim - 2; | |||
| size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos; | |||
| if (param.sparse == Param::Sparse::DENSE) { | |||
| @@ -101,20 +77,6 @@ void make_canonized_filter_meta_nchw_nhwc( | |||
| flt_spatial_start = 2; | |||
| ocpg_pos = 0; | |||
| icpg_pos = 1; | |||
| } else if (param.format == Param::Format::NCHW_WINOGRAD) { | |||
| // filter should be (alphah, alphaw, ic, oc) or (alphah, alphaw, ocb, | |||
| // icb, ic_block_size, oc_block_size) | |||
| flt_spatial_start = 0; | |||
| if (filter.ndim == flt_start + 4) { | |||
| ocpg_pos = 3; | |||
| icpg_pos = 2; | |||
| } else { | |||
| megdnn_assert(filter.ndim == flt_start + 6); | |||
| ic_block_size = filter[flt_start + 4]; | |||
| oc_block_size = filter[flt_start + 5]; | |||
| ocpg_pos = 2; | |||
| icpg_pos = 3; | |||
| } | |||
| } else { | |||
| megdnn_assert(param.format == Param::Format::NHWC, | |||
| "invalid conv tensor format"); | |||
| @@ -136,14 +98,8 @@ void make_canonized_filter_meta_nchw_nhwc( | |||
| megdnn_assert(dilation[i] > 0, | |||
| "invalid dilation on spatial dim %zu: %u", i, | |||
| dilation[i]); | |||
| if (param.format == Param::Format::NCHW_WINOGRAD) { | |||
| ret.spatial[i] = | |||
| spatial_getter<Param, Param::Format::NCHW_WINOGRAD>( | |||
| filter[i + flt_start + flt_spatial_start], param); | |||
| } else { | |||
| ret.spatial[i] = spatial_getter<Param, Param::Format::NCHW>( | |||
| filter[i + flt_start + flt_spatial_start], param); | |||
| } | |||
| ret.spatial[i] = spatial_getter<Param, Param::Format::NCHW>( | |||
| filter[i + flt_start + flt_spatial_start], param); | |||
| ret.dilated_spatial[i] = (ret.spatial[i] - 1) * dilation[i] + 1; | |||
| } | |||
| } | |||
| @@ -295,20 +251,12 @@ void make_canonized_filter_meta_nchwxx( | |||
| * FH, FW, pack_size(IC), pack_size(OC)} [group] | |||
| * {GROUP/pack_size, 1, 1, FH, FW, pack_size} [chan] | |||
| * | |||
| ** NCHW88_WINOGRAD and NCHW44_WINOGRAD mode | |||
| * filter: | |||
| * {alpha, alpha, OC/pack_size, IC/pack_size, pack_size(IC), | |||
| *pack_size(OC)} [dense] | |||
| * {GROUP, alpha, alpha, OC_PER_GROUP/pack_size, | |||
| * IC_PER_GROUP/pack_size, pack_size(IC), pack_size(OC)} [group] | |||
| * | |||
| */ | |||
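As a concrete instance of the winograd layout being dropped here (illustrative numbers): an NCHW44_WINOGRAD F(2,3) dense filter with OC = 32 and IC = 16 would have been stored as {alpha, alpha, OC/4, IC/4, 4(IC), 4(OC)} = {4, 4, 8, 4, 4, 4}, which is the shape the removed flt_start and spatial_getter branches below used to decode.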
| megdnn_assert(param.format == Param::Format::NCHW88 || | |||
| param.format == Param::Format::NCHW44 || | |||
| param.format == Param::Format::NCHW44_WINOGRAD || | |||
| param.format == Param::Format::NCHW44_DOT || | |||
| param.format == Param::Format::NCHW88_WINOGRAD); | |||
| param.format == Param::Format::NCHW44_DOT); | |||
| size_t img_ndim = 2; | |||
| size_t flt_start = 0; | |||
| size_t flt_spatial_start = 2; | |||
| @@ -325,10 +273,6 @@ void make_canonized_filter_meta_nchwxx( | |||
| filter[filter.ndim - 1]); | |||
| ret.group = 1; | |||
| flt_start = 0; | |||
| if (param.format == Param::Format::NCHW88_WINOGRAD || | |||
| param.format == Param::Format::NCHW44_WINOGRAD) { | |||
| flt_start = 2; | |||
| } | |||
| if (filter[filter.ndim - 2] == 2 * pack_size && | |||
| filter[filter.ndim - 1] == 2 * pack_size) { | |||
| pack_c_size = 2 * pack_size; | |||
| @@ -339,10 +283,6 @@ void make_canonized_filter_meta_nchwxx( | |||
| ret.icpg = filter[flt_start + 1] * pack_c_size; | |||
| } else if (filter.ndim == img_ndim + 3) { | |||
| // ohwi8o | |||
| megdnn_assert(param.format != Param::Format::NCHW88_WINOGRAD, | |||
| "Hybrid nchw88 mode in not support winograd"); | |||
| megdnn_assert(param.format != Param::Format::NCHW44_WINOGRAD, | |||
| "Hybrid nchw44 mode in not support winograd"); | |||
| flt_start = 0; | |||
| flt_spatial_start = 1; | |||
| ret.group = 1; | |||
| @@ -357,15 +297,9 @@ void make_canonized_filter_meta_nchwxx( | |||
| megdnn_assert(param.sparse == Param::Sparse::GROUP, | |||
| "invalid convolution sparse type"); | |||
| flt_start = 1; | |||
| if (param.format == Param::Format::NCHW88_WINOGRAD || | |||
| param.format == Param::Format::NCHW44_WINOGRAD) { | |||
| flt_start = 3; | |||
| } | |||
| auto filter_oc = filter[flt_start]; | |||
| auto filter_ic = filter[flt_start + 1]; | |||
| if (filter_oc == 1 && filter_ic == 1 && filter.ndim == (img_ndim + 4) && | |||
| param.format != Param::Format::NCHW88_WINOGRAD && | |||
| param.format != Param::Format::NCHW44_WINOGRAD) { | |||
| if (filter_oc == 1 && filter_ic == 1 && filter.ndim == (img_ndim + 4)) { | |||
| // Depthwise case goihw8g | |||
| megdnn_assert(filter.ndim == img_ndim + 4, | |||
| "bad filter ndim for group convolution: " | |||
| @@ -416,17 +350,7 @@ void make_canonized_filter_meta_nchwxx( | |||
| "NCHWXX has invalid dilation on spatial dim %zu: %u, " | |||
| "require to be 1", | |||
| i, dilation[i]); | |||
| if (param.format == Param::Format::NCHW88_WINOGRAD) { | |||
| ret.spatial[i] = | |||
| spatial_getter<Param, Param::Format::NCHW88_WINOGRAD>( | |||
| filter[i + flt_start - 2], param); | |||
| } else if (param.format == Param::Format::NCHW44_WINOGRAD) { | |||
| ret.spatial[i] = | |||
| spatial_getter<Param, Param::Format::NCHW44_WINOGRAD>( | |||
| filter[i + flt_start - 2], param); | |||
| } else { | |||
| ret.spatial[i] = filter[i + flt_start + flt_spatial_start]; | |||
| } | |||
| ret.spatial[i] = filter[i + flt_start + flt_spatial_start]; | |||
| ret.dilated_spatial[i] = (ret.spatial[i] - 1) * dilation[i] + 1; | |||
| } | |||
| } | |||
| @@ -579,13 +503,11 @@ ConvolutionBase<Parameter>::make_canonized_filter_meta( | |||
| } else if (param().format == Param::Format::NCHW8) { | |||
| make_canonized_filter_meta_nchwx<8, Parameter>(src_ndim, filter, | |||
| param(), ret); | |||
| } else if (param().format == Param::Format::NCHW88 || | |||
| param().format == Param::Format::NCHW88_WINOGRAD) { | |||
| } else if (param().format == Param::Format::NCHW88) { | |||
| make_canonized_filter_meta_nchwxx<8, Parameter>(src_ndim, filter, | |||
| param(), ret); | |||
| } else if (param().format == Param::Format::NCHW44 || | |||
| param().format == Param::Format::NCHW44_DOT || | |||
| param().format == Param::Format::NCHW44_WINOGRAD) { | |||
| param().format == Param::Format::NCHW44_DOT) { | |||
| make_canonized_filter_meta_nchwxx<4, Parameter>(src_ndim, filter, | |||
| param(), ret); | |||
| } else if (param().format == Param::Format::NCHW32 || | |||
| @@ -597,8 +519,7 @@ ConvolutionBase<Parameter>::make_canonized_filter_meta( | |||
| param(), ret); | |||
| } else { | |||
| megdnn_assert(param().format == Param::Format::NHWC || | |||
| param().format == Param::Format::NCHW || | |||
| param().format == Param::Format::NCHW_WINOGRAD); | |||
| param().format == Param::Format::NCHW); | |||
| make_canonized_filter_meta_nchw_nhwc<Parameter>(src_ndim, filter, | |||
| param(), ret); | |||
| } | |||
| @@ -619,17 +540,8 @@ void ConvolutionBase<Parameter>::check_or_deduce_dtype_fwd(DType src, | |||
| } else if (src.enumv() == DTypeEnum::QuantizedS8 || | |||
| src.enumv() == DTypeEnum::Quantized8Asymm || | |||
| src.enumv() == DTypeEnum::Quantized4Asymm) { | |||
| //! Qint8 winograd computes with float; in order to carry the filter | |||
| //! scale, QuantizedS32 is used as the filter type here. | |||
| if (src.enumv() == DTypeEnum::QuantizedS8 && | |||
| filter.enumv() == DTypeEnum::QuantizedS32) { | |||
| supported_dst_dtype.push_back(dtype::QuantizedS32( | |||
| src.param<dtype::QuantizedS8>().scale * | |||
| filter.param<dtype::QuantizedS32>().scale)); | |||
| } else { | |||
| supported_dst_dtype.push_back( | |||
| dtype::QuantizedS32(mul_scale(src, filter))); | |||
| } | |||
| supported_dst_dtype.push_back( | |||
| dtype::QuantizedS32(mul_scale(src, filter))); | |||
| if (dst.valid() && dst.enumv() == src.enumv()) { | |||
| supported_dst_dtype.push_back(dst); | |||
| } | |||
| @@ -681,24 +593,12 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src, | |||
| megdnn_assert_contiguous(src); | |||
| megdnn_assert_contiguous(filter); | |||
| megdnn_assert(src.ndim >= 3_z, "%s", errmsg().c_str()); | |||
| if ((param().format == Param::Format::NCHW_WINOGRAD || | |||
| param().format == Param::Format::NCHW44_WINOGRAD) && | |||
| src.dtype.category() == DTypeCategory::QUANTIZED) { | |||
| megdnn_assert((filter.dtype.enumv() == DTypeEnum::QuantizedS16 || | |||
| filter.dtype.enumv() == DTypeEnum::QuantizedS32), | |||
| "%s", errmsg().c_str()); | |||
| megdnn_assert(src.dtype.enumv() == DTypeEnum::QuantizedS8 || | |||
| src.dtype.enumv() == DTypeEnum::Quantized8Asymm, | |||
| "%s", errmsg().c_str()); | |||
| } else { | |||
| megdnn_assert(src.dtype.enumv() == filter.dtype.enumv(), "%s", | |||
| errmsg().c_str()); | |||
| } | |||
| megdnn_assert(src.dtype.enumv() == filter.dtype.enumv(), "%s", | |||
| errmsg().c_str()); | |||
| check_or_deduce_dtype_fwd(src.dtype, filter.dtype, dst.dtype); | |||
| size_t img_dim; | |||
| if (param().format == Param::Format::NCHW || | |||
| param().format == Param::Format::NHWC || | |||
| param().format == Param::Format::NCHW_WINOGRAD) { | |||
| param().format == Param::Format::NHWC) { | |||
| img_dim = src.ndim - 2; | |||
| megdnn_assert(filter.ndim >= img_dim + 2 && filter.ndim <= img_dim + 6, | |||
| "%s", errmsg().c_str()); | |||
| @@ -714,8 +614,6 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src, | |||
| param().format == Param::Format::NCHW32 || | |||
| param().format == Param::Format::NCHW32_NCHW4 || | |||
| param().format == Param::Format::NCHW88 || | |||
| param().format == Param::Format::NCHW88_WINOGRAD || | |||
| param().format == Param::Format::NCHW44_WINOGRAD || | |||
| param().format == Param::Format::CHWN4); | |||
| img_dim = src.ndim - 3; | |||
| if ((param().format == Param::Format::NCHW88 || | |||
| @@ -770,8 +668,7 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src, | |||
| "but got src %s, filter %s", | |||
| src.to_string().c_str(), filter.to_string().c_str()); | |||
| } | |||
| if (param().format == Param::Format::NCHW88 || | |||
| param().format == Param::Format::NCHW88_WINOGRAD) { | |||
| if (param().format == Param::Format::NCHW88) { | |||
| megdnn_assert((src.ndim == 4 && filter.ndim == 5 && | |||
| filter[filter.ndim - 1] == 8) || | |||
| (src.ndim == 5 && | |||
| @@ -786,8 +683,7 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src, | |||
| src.to_string().c_str(), filter.to_string().c_str()); | |||
| } | |||
| if (param().format == Param::Format::NCHW44 || | |||
| param().format == Param::Format::NCHW44_DOT || | |||
| param().format == Param::Format::NCHW44_WINOGRAD) { | |||
| param().format == Param::Format::NCHW44_DOT) { | |||
| //! support nchw44 filter changed to 88 for int8 winogradf23_88 using MK8 matmul | |||
| megdnn_assert((src.ndim == 4 && filter.ndim == 5 && | |||
| filter[filter.ndim - 1] == 4) || | |||
| @@ -820,12 +716,10 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src, | |||
| "currently only convolution on 2D image is supported"); | |||
| auto cflt = make_canonized_filter_meta(src.ndim, filter); | |||
| if (param().format == Param::Format::NCHW || | |||
| param().format == Param::Format::NHWC || | |||
| param().format == Param::Format::NCHW_WINOGRAD) { | |||
| param().format == Param::Format::NHWC ) { | |||
| size_t src_or_dst_c_pos = 0; | |||
| size_t src_or_dst_spatial_start = 0; | |||
| if (param().format == Param::Format::NCHW || | |||
| param().format == Param::Format::NCHW_WINOGRAD) { | |||
| if (param().format == Param::Format::NCHW) { | |||
| src_or_dst_c_pos = 1; | |||
| src_or_dst_spatial_start = 2; | |||
| } else { | |||
| @@ -836,10 +730,6 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src, | |||
| } | |||
| megdnn_assert(cflt.icpg * cflt.group == src[src_or_dst_c_pos], "%s", | |||
| errmsg().c_str()); | |||
| if (param().format == Param::Format::NCHW_WINOGRAD) { | |||
| megdnn_assert(cflt.spatial[0] == cflt.spatial[1], | |||
| "NCHW_WINOGRAD only support conv with fh == fw"); | |||
| } | |||
| dst.ndim = src.ndim; | |||
| dst[0] = src[0]; | |||
| dst[src_or_dst_c_pos] = cflt.ocpg * cflt.group; | |||
| @@ -900,8 +790,7 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src, | |||
| dst[3] = infer_conv_shape(src[3], cflt.dilated_spatial[1], | |||
| cflt.stride[1], cflt.padding[1]); | |||
| dst[4] = 32; | |||
| } else if (param().format == Param::Format::NCHW88 || | |||
| param().format == Param::Format::NCHW88_WINOGRAD) { | |||
| } else if (param().format == Param::Format::NCHW88 ) { | |||
| megdnn_assert(src.ndim == 5 || (src.ndim == 4 && src[1] <= 8), | |||
| "invalid src ndim for NCHW88, expected=5 or 4, got=%zu", | |||
| src.ndim); | |||
| @@ -923,8 +812,7 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src, | |||
| } | |||
| } else if (param().format == Param::Format::NCHW44 || | |||
| param().format == Param::Format::NCHW44_DOT || | |||
| param().format == Param::Format::NCHW44_WINOGRAD) { | |||
| param().format == Param::Format::NCHW44_DOT) { | |||
| megdnn_assert(src.ndim == 5 || (src.ndim == 4 && src[1] <= 4), | |||
| "invalid src ndim for NCHW44, expected=5 or 4, got=%zu", | |||
| src.ndim); | |||
| @@ -189,7 +189,6 @@ private: | |||
| cb(RelayoutFormat) \ | |||
| cb(TopK) \ | |||
| cb(PowC) \ | |||
| cb(WinogradFilterPreprocess) \ | |||
| cb(LocalShareForward) \ | |||
| cb(LocalShareBackwardData) \ | |||
| cb(LocalShareBackwardFilter) \ | |||
| @@ -1,157 +0,0 @@ | |||
| /** | |||
| * \file dnn/src/common/winograd_filter_preprocess.cpp | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| */ | |||
| #include "megdnn/oprs.h" | |||
| #include <numeric> | |||
| #include "src/common/utils.h" | |||
| using namespace megdnn; | |||
| void WinogradFilterPreprocess::deduce_layout(const TensorLayout& src, | |||
| TensorLayout& dst) { | |||
| auto errmsg = [&]() { | |||
| return "invalid filter layout:" + megdnn_layout_msg(src); | |||
| }; | |||
| MEGDNN_MARK_USED_VAR(errmsg); | |||
| //! NCHW88 weight layout include | |||
| //! dense{oc/8, ic/8, fh, fw, 8, 8}; group {g, oc/8, ic/8, fh, fw, 8, 8}; | |||
| //! channel wise{g/8, 1, 1, fh, fw, 8} | |||
| megdnn_assert( | |||
| src.ndim == 4 || src.ndim == 5 || src.ndim == 6 || src.ndim == 7, | |||
| "%s", errmsg().c_str()); | |||
| //! nchw88 channel wise conv | |||
| megdnn_assert(!(src.ndim == 6 && src[1] == 1 && src[2] == 1), | |||
| "chennel wise nchw88 can not use winograd "); | |||
| //! nchw88 group conv | |||
| size_t flt_start = 0; | |||
| size_t pack_c_size = 1; | |||
| size_t group = 1; | |||
| //! group conv | |||
| if (src.ndim == 5) { | |||
| flt_start = 1; | |||
| group = src[0]; | |||
| //! nchw88 dense conv | |||
| } else if (src.ndim == 6) { | |||
| pack_c_size = src[5]; | |||
| //! nchw88 group conv | |||
| } else if (src.ndim == 7) { | |||
| flt_start = 1; | |||
| group = src[0]; | |||
| pack_c_size = src[6]; | |||
| } | |||
| size_t OC = src[flt_start] * pack_c_size, | |||
| IC = src[flt_start + 1] * pack_c_size, FH = src[flt_start + 2], | |||
| FW = src[flt_start + 3]; | |||
| size_t m = param().output_block_size; | |||
| megdnn_assert(FH == FW, "%s", errmsg().c_str()); | |||
| size_t alpha = FH + m - 1; | |||
| DType dst_type = src.dtype; | |||
| if (src.dtype.category() == DTypeCategory::QUANTIZED) { | |||
| megdnn_assert(src.dtype.enumv() == DTypeEnum::QuantizedS8); | |||
| if (param().compute_mode == | |||
| param::ConvBias::ComputeMode::DEFAULT) { | |||
| //! input int8 compute short | |||
| dst_type = dtype::QuantizedS16( | |||
| src.dtype.param<dtype::QuantizedS8>().scale); | |||
| } else { | |||
| //! input int8 compute float32 | |||
| dst_type = dtype::QuantizedS32( | |||
| src.dtype.param<dtype::QuantizedS8>().scale); | |||
| } | |||
| } | |||
| if (src.ndim == 4 || src.ndim == 6) { | |||
| if (param().format == param::Winograd::Format::DEFAULT) { | |||
| dst = TensorLayout({alpha, alpha, IC, OC}, dst_type); | |||
| } else { | |||
| megdnn_assert(param().format == param::Winograd::Format::MK4 || | |||
| param().format == param::Winograd::Format::MK8); | |||
| size_t pack_size = MatrixMulForward::pack_size(param().format); | |||
| dst = TensorLayout({alpha, alpha, OC / pack_size, IC / pack_size, | |||
| pack_size, pack_size}, | |||
| dst_type); | |||
| } | |||
| } else { | |||
| megdnn_assert(src.ndim == 5 || src.ndim == 7); | |||
| if (param().format == param::Winograd::Format::DEFAULT) { | |||
| dst = TensorLayout({group, alpha, alpha, IC, OC}, dst_type); | |||
| } else { | |||
| megdnn_assert(param().format == param::Winograd::Format::MK4 || | |||
| param().format == param::Winograd::Format::MK8); | |||
| size_t pack_size = MatrixMulForward::pack_size(param().format); | |||
| dst = TensorLayout({group, alpha, alpha, OC / pack_size, | |||
| IC / pack_size, pack_size, pack_size}, | |||
| dst_type); | |||
| } | |||
| } | |||
| } | |||
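// Worked example of the deduction above (illustrative numbers): a dense fp32
// NCHW filter {OC=32, IC=16, FH=3, FW=3} with output_block_size m = 2 gives
// alpha = 3 + 2 - 1 = 4, so
//   Format::DEFAULT -> dst = {4, 4, 16, 32}        // {alpha, alpha, IC, OC}
//   Format::MK4     -> dst = {4, 4, 8, 4, 4, 4}    // {alpha, alpha, OC/4, IC/4, 4, 4}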
| void WinogradFilterPreprocess::check_exec(const TensorLayout& src, | |||
| const TensorLayout& dst, | |||
| size_t workspace_in_bytes) { | |||
| auto errmsg = [&]() { | |||
| return megdnn_layout_msg(src) + ", " + megdnn_layout_msg(dst); | |||
| }; | |||
| MEGDNN_MARK_USED_VAR(errmsg); | |||
| megdnn_assert_contiguous(src); | |||
| megdnn_assert_contiguous(dst); | |||
| //! nchwxx now only support Format MKx | |||
| if (param().format == param::Winograd::Format::DEFAULT) { | |||
| megdnn_assert(src.ndim == dst.ndim && (src.ndim == 4 || src.ndim == 5), | |||
| "%s", errmsg().c_str()); | |||
| } else { | |||
| megdnn_assert( | |||
| (param().format == param::Winograd::Format::MK4 || | |||
| param().format == param::Winograd::Format::MK8) && | |||
| (src.ndim == dst.ndim - 2 || src.ndim == dst.ndim) && | |||
| (src.ndim == 4 || src.ndim == 5 || src.ndim == 6 || | |||
| src.ndim == 7), | |||
| "%s", errmsg().c_str()); | |||
| } | |||
| TensorLayout dst_expected; | |||
| deduce_layout(src, dst_expected); | |||
| megdnn_assert_eq_layout(dst_expected, dst); | |||
| auto required_workspace_in_bytes = get_workspace_in_bytes(src, dst); | |||
| megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); | |||
| } | |||
| size_t WinogradFilterPreprocess::get_workspace_in_bytes( | |||
| const TensorLayout& src, const TensorLayout& dst) { | |||
| MEGDNN_MARK_USED_VAR(dst); | |||
| DType output_compute_dtype = src.dtype; | |||
| if (src.dtype.category() == DTypeCategory::QUANTIZED) { | |||
| megdnn_assert(src.dtype.enumv() == DTypeEnum::QuantizedS8 || | |||
| src.dtype.enumv() == DTypeEnum::Quantized8Asymm); | |||
| if (param().compute_mode == | |||
| param::ConvBias::ComputeMode::DEFAULT) { | |||
| //! input int8 compute short | |||
| output_compute_dtype = dtype::QuantizedS16( | |||
| src.dtype.param<dtype::QuantizedS8>().scale); | |||
| } else { | |||
| //! input int8 compute float32 | |||
| output_compute_dtype = dtype::QuantizedS32( | |||
| src.dtype.param<dtype::QuantizedS8>().scale); | |||
| } | |||
| } | |||
| size_t FW = src[3]; | |||
| if (src.ndim == 5 || src.ndim == 7) { | |||
| FW = src[4]; | |||
| } | |||
| size_t pack_size = MatrixMulForward::pack_size(param().format); | |||
| size_t alpha = param().output_block_size + FW - 1; | |||
| return 2 * alpha * alpha * output_compute_dtype.size() * pack_size * | |||
| pack_size; | |||
| } | |||
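// Worked example of the workspace bound above (illustrative numbers): fp32,
// output_block_size m = 2, FW = 3 -> alpha = 4; Format::MK4 -> pack_size = 4;
//   workspace = 2 * alpha * alpha * sizeof(float) * pack_size * pack_size
//             = 2 * 4 * 4 * 4 * 4 * 4 = 2048 bytes.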
| // vim: syntax=cpp.doxygen | |||
| @@ -72,7 +72,6 @@ | |||
| #include "src/cuda/type_cvt/opr_impl.h" | |||
| #include "src/cuda/warp_affine/opr_impl.h" | |||
| #include "src/cuda/warp_perspective/opr_impl.h" | |||
| #include "src/cuda/winograd_filter_preprocess/opr_impl.h" | |||
| #include "src/cuda/local_share/opr_impl.h" | |||
| #include "src/cuda/roi_align/opr_impl.h" | |||
| #include "src/cuda/batch_conv_bias/opr_impl.h" | |||
| @@ -1,22 +0,0 @@ | |||
| /** | |||
| * \file dnn/src/cuda/winograd_filter_preprocess/opr_impl.cpp | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| */ | |||
| #include "src/cuda/winograd_filter_preprocess/opr_impl.h" | |||
| #include "src/common/utils.h" | |||
| using namespace megdnn; | |||
| using namespace cuda; | |||
| void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in, _megdnn_tensor_in, | |||
| _megdnn_workspace) { | |||
| megdnn_throw("WinogradFilterPreprocess is not supported in CUDA"); | |||
| } | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -1,27 +0,0 @@ | |||
| /** | |||
| * \file dnn/src/cuda/winograd_filter_preprocess/opr_impl.h | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| */ | |||
| #pragma once | |||
| #include "megdnn/oprs.h" | |||
| namespace megdnn { | |||
| namespace cuda { | |||
| class WinogradFilterPreprocessImpl : public WinogradFilterPreprocess { | |||
| public: | |||
| using WinogradFilterPreprocess::WinogradFilterPreprocess; | |||
| void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, | |||
| _megdnn_workspace workspace) override; | |||
| }; | |||
| } // namespace cuda | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -259,12 +259,7 @@ bool ConvBiasImpl::AlgoWinogradF32::usable( | |||
| strategy, UNIT_TILE_SIZE, param) | |||
| .get_matmul_kern_param(param); | |||
| return m_matmul_algo->usable(matmul_param) && | |||
| (param.filter_meta.format == param::ConvBias::Format::NCHW || | |||
| (param.filter_meta.format == | |||
| param::ConvBias::Format::NCHW_WINOGRAD && | |||
| param.output_block_size == 2 && | |||
| param.winograd_matmul_format == | |||
| param::MatrixMul::Format::DEFAULT)) && | |||
| param.filter_meta.format == param::ConvBias::Format::NCHW && | |||
| param.filter_meta.should_flip && | |||
| (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && | |||
| param.filter_meta.spatial[0] == 3) && | |||
| @@ -329,12 +324,7 @@ bool ConvBiasImpl::AlgoWinogradF32_4x4::usable( | |||
| strategy, UNIT_TILE_SIZE, param) | |||
| .get_matmul_kern_param(param); | |||
| return m_matmul_algo->usable(matmul_param) && | |||
| (param.filter_meta.format == param::ConvBias::Format::NCHW || | |||
| (param.filter_meta.format == | |||
| param::ConvBias::Format::NCHW_WINOGRAD && | |||
| param.output_block_size == 2 && | |||
| param.winograd_matmul_format == | |||
| param::MatrixMul::Format::MK4)) && | |||
| param.filter_meta.format == param::ConvBias::Format::NCHW && | |||
| param.filter_meta.should_flip && | |||
| (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && | |||
| param.filter_meta.spatial[0] == 3) && | |||
| @@ -397,12 +387,7 @@ bool ConvBiasImpl::AlgoWinogradQS8::usable( | |||
| .get_matmul_kern_param(param); | |||
| return m_matmul_algo->usable(matmul_param) && | |||
| (param.filter_meta.format == param::ConvBias::Format::NCHW || | |||
| (param.filter_meta.format == | |||
| param::ConvBias::Format::NCHW_WINOGRAD && | |||
| param.output_block_size == 2 && | |||
| param.winograd_matmul_format == | |||
| param::MatrixMul::Format::DEFAULT)) && | |||
| param.filter_meta.format == param::ConvBias::Format::NCHW && | |||
| param.filter_meta.should_flip && | |||
| (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && | |||
| param.filter_meta.spatial[0] == 3) && | |||
| @@ -467,12 +452,7 @@ bool ConvBiasImpl::AlgoWinogradQS8_8x8::usable( | |||
| strategy, UNIT_TILE_SIZE, param) | |||
| .get_matmul_kern_param(param); | |||
| return m_matmul_algo->usable(matmul_param) && | |||
| (param.filter_meta.format == param::ConvBias::Format::NCHW || | |||
| (param.filter_meta.format == | |||
| param::ConvBias::Format::NCHW_WINOGRAD && | |||
| param.output_block_size == 2 && | |||
| param.winograd_matmul_format == | |||
| param::MatrixMul::Format::MK8)) && | |||
| param.filter_meta.format == param::ConvBias::Format::NCHW && | |||
| param.filter_meta.should_flip && | |||
| (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && | |||
| param.filter_meta.spatial[0] == 3) && | |||
| @@ -342,10 +342,7 @@ ConvBiasImpl::NCBKernSizeParam ConvBiasImpl::make_ncb_kern_size_param( | |||
| param().format == Param::Format::NCHW4 || | |||
| param().format == Param::Format::NCHW44 || | |||
| param().format == Param::Format::NCHW44_DOT || | |||
| param().format == Param::Format::NCHW || | |||
| param().format == Param::Format::NCHW_WINOGRAD || | |||
| param().format == Param::Format::NCHW88_WINOGRAD || | |||
| param().format == Param::Format::NCHW44_WINOGRAD) { | |||
| param().format == Param::Format::NCHW) { | |||
| spatial_pos = 2; | |||
| } else if (param().format == Param::Format::NHWC) { | |||
| spatial_pos = 1; | |||
| @@ -370,25 +367,7 @@ ConvBiasImpl::NCBKernSizeParam ConvBiasImpl::make_ncb_kern_size_param( | |||
| "should be equal"); | |||
| auto&& fm = check_layout_fwd(src, filter, dst); | |||
| auto& conv_fm = reinterpret_cast<ConvolutionImpl::CanonizedFilterMeta&>(fm); | |||
| param::MatrixMul::Format format = param::MatrixMul::Format::DEFAULT; | |||
| if (param().format == Param::Format::NCHW_WINOGRAD || | |||
| param().format == Param::Format::NCHW88_WINOGRAD || | |||
| param().format == Param::Format::NCHW44_WINOGRAD) { | |||
| size_t flt_start = 0; | |||
| if (param().sparse == Param::Sparse::GROUP) { | |||
| flt_start = 1; | |||
| } | |||
| if (filter.ndim == 6 + flt_start) { | |||
| if (filter[5] == 4) { | |||
| format = param::MatrixMul::Format::MK4; | |||
| } else { | |||
| megdnn_assert(filter[5] == 8); | |||
| format = param::MatrixMul::Format::MK8; | |||
| } | |||
| } | |||
| } | |||
| size_t nr_threads = static_cast<naive::HandleImpl*>(handle()) | |||
| ->megcore_dispatcher() | |||
| ->nr_threads(); | |||
| @@ -407,8 +386,6 @@ ConvBiasImpl::NCBKernSizeParam ConvBiasImpl::make_ncb_kern_size_param( | |||
| nr_threads, | |||
| reinterpret_cast<const ConvolutionForward::PreprocessedFilter*>( | |||
| preprocessed_filter)}, | |||
| param().output_block_size, | |||
| format, | |||
| bias.dtype, | |||
| bias.stride[0], | |||
| bias_mode, | |||
| @@ -537,11 +514,7 @@ SmallVector<AlgoCategory> ConvBiasImpl::suggest_algo_category_order( | |||
| auto FH = param.filter_meta.spatial[0]; | |||
| auto FW = param.filter_meta.spatial[1]; | |||
| //! TODO: winograd is currently only supported in fast-run | |||
| if (param.filter_meta.format == param::ConvBias::Format::NCHW_WINOGRAD || | |||
| param.filter_meta.format == param::ConvBias::Format::NCHW44_WINOGRAD || | |||
| param.filter_meta.format == param::ConvBias::Format::NCHW88_WINOGRAD) { | |||
| return {AlgoCategory::WINOGRAD}; | |||
| } | |||
| //! im2col + matmul | |||
| bool im2col_prefer = (IC >= 32 || OC >= 32); | |||
| //! quantized algos use matmul when the direct algo is unusable | |||
| @@ -632,21 +605,6 @@ const T* ConvBiasImpl::NCBKernParam::filter(size_t group_pack_id, | |||
| break; | |||
| } | |||
| case ConvBiasImpl::Param::Format::NCHW_WINOGRAD: | |||
| case ConvBiasImpl::Param::Format::NCHW44_WINOGRAD: | |||
| case ConvBiasImpl::Param::Format::NCHW88_WINOGRAD: { | |||
| //! four formats of weight layout | |||
| //! 1. {g, alpha, alpha, ocpg/8, icpg/8, 8, 8} | |||
| //! 2. {alpha, alpha, ocpg/8, icpg/8, 8, 8} | |||
| //! 3. {g, alpha, alpha, oc, ic, 8, 8} | |||
| //! 4. {alpha, alpha, oc, ic} | |||
| group_offset = pack_group_size * group_pack_id * filter_meta.icpg * | |||
| filter_meta.ocpg * | |||
| (filter_meta.spatial[0] + output_block_size - 1) * | |||
| (filter_meta.spatial[1] + output_block_size - 1) * | |||
| filter_type.size(); | |||
| break; | |||
| } | |||
| default: | |||
| megdnn_assert(0, "other filter format is not support yet"); | |||
| } | |||
| @@ -103,19 +103,13 @@ public: | |||
| struct NCBKernSizeParam : ConvolutionImpl::NCBKernSizeParam { | |||
| NCBKernSizeParam() = default; | |||
| NCBKernSizeParam(const ConvolutionImpl::NCBKernSizeParam& param, | |||
| size_t output_block_size, | |||
| param::MatrixMul::Format winograd_matmul_format, | |||
| DType bias_type, ptrdiff_t bias_bs, BiasMode bias_mode, | |||
| Param::NonlineMode nonlineMode) | |||
| : ConvolutionImpl::NCBKernSizeParam(param), | |||
| output_block_size{output_block_size}, | |||
| winograd_matmul_format{winograd_matmul_format}, | |||
| bias_type{bias_type}, | |||
| bias_bs{bias_bs}, | |||
| bias_mode{bias_mode}, | |||
| nonlineMode{nonlineMode} {} | |||
| size_t output_block_size; //!< used in winograd algo | |||
| param::MatrixMul::Format winograd_matmul_format; | |||
| DType bias_type; | |||
| //! stride for batch of bias | |||
| ptrdiff_t bias_bs; | |||
| @@ -88,13 +88,7 @@ class ConvBias { | |||
| size_t filter_transform_buf_size = 0; | |||
| //! filter : (alpha, alpha, IC, OC) or (OCB, ICB, IC_BLOCK_SIZE, | |||
| //! OC_BLOCK_SIZE) | |||
| if (param.preprocessed_filter == nullptr && | |||
| param.filter_meta.format != | |||
| param::ConvBias::Format::NCHW_WINOGRAD && | |||
| param.filter_meta.format != | |||
| param::ConvBias::Format::NCHW88_WINOGRAD && | |||
| param.filter_meta.format != | |||
| param::ConvBias::Format::NCHW44_WINOGRAD) { | |||
| if (param.preprocessed_filter == nullptr) { | |||
| filter_transform_buf_size = Strategy::ALPHA * Strategy::ALPHA * OC * | |||
| IC * sizeof(input_filter_compute_type); | |||
| } | |||
| @@ -108,12 +102,7 @@ class ConvBias { | |||
| nullptr, | |||
| {winograd_comput_size, filter_transform_buf_size * GROUP}); | |||
| } else { | |||
| megdnn_assert(param.filter_meta.format == | |||
| param::ConvBias::Format::NCHW_WINOGRAD || | |||
| param.filter_meta.format == | |||
| param::ConvBias::Format::NCHW88_WINOGRAD || | |||
| param.filter_meta.format == | |||
| param::ConvBias::Format::NCHW44_WINOGRAD); | |||
| megdnn_assert(param.preprocessed_filter != nullptr); | |||
| return WorkspaceBundle(nullptr, {winograd_comput_size}); | |||
| } | |||
| } | |||
| @@ -499,7 +488,6 @@ public: | |||
| const TensorND& preprocessed_dst = | |||
| param.preprocessed_filter->tensors[0]; | |||
| WorkspaceBundle bundle = get_preprocess_wbundle(param); | |||
| Strategy strategy = m_strategy; | |||
| SmallVector<NCBKern> kerns; | |||
| auto filter_process_kern = | |||
| @@ -558,13 +546,7 @@ public: | |||
| param.filter_meta.stride[1] == 1 && | |||
| (param.filter_meta.format == param::ConvBias::Format::NCHW || | |||
| param.filter_meta.format == param::ConvBias::Format::NCHW88 || | |||
| param.filter_meta.format == param::ConvBias::Format::NCHW44 || | |||
| param.filter_meta.format == | |||
| param::ConvBias::Format::NCHW_WINOGRAD || | |||
| param.filter_meta.format == | |||
| param::ConvBias::Format::NCHW88_WINOGRAD || | |||
| param.filter_meta.format == | |||
| param::ConvBias::Format::NCHW44_WINOGRAD)); | |||
| param.filter_meta.format == param::ConvBias::Format::NCHW44)); | |||
| SmallVector<NCBKern> kerns; | |||
| if (param.preprocessed_filter == nullptr && | |||
| @@ -316,8 +316,6 @@ ConvolutionImpl::AlgoDefault::init_conv_bias_param( | |||
| mul_scale(param.src_type, param.filter_type)); | |||
| } | |||
| return {param, | |||
| 0, | |||
| param::MatrixMul::Format::DEFAULT, | |||
| bias_type, | |||
| 0, | |||
| BiasMode::NO_BIAS, | |||
| @@ -225,8 +225,7 @@ ConvolutionImpl::NCBKernSizeParam ConvolutionImpl::make_ncb_kern_size_param( | |||
| param().format == Param::Format::NCHW44_DOT || | |||
| param().format == Param::Format::NCHW44) { | |||
| spatial_pos = 2; | |||
| } else if (param().format == Param::Format::NCHW || | |||
| param().format == Param::Format::NCHW_WINOGRAD) { | |||
| } else if (param().format == Param::Format::NCHW) { | |||
| spatial_pos = 2; | |||
| } else if (param().format == Param::Format::NHWC) { | |||
| spatial_pos = 1; | |||
| @@ -78,7 +78,6 @@ | |||
| #include "src/naive/type_cvt/opr_impl.h" | |||
| #include "src/naive/warp_affine/opr_impl.h" | |||
| #include "src/naive/warp_perspective/opr_impl.h" | |||
| #include "src/naive/winograd_filter_preprocess/opr_impl.h" | |||
| #include "src/naive/remap/opr_impl.h" | |||
| #include "src/naive/fake_quant/opr_impl.h" | |||
| @@ -1,234 +0,0 @@ | |||
| /** | |||
| * \file dnn/src/naive/winograd_filter_preprocess/opr_impl.cpp | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #include "src/naive/winograd_filter_preprocess/opr_impl.h" | |||
| #include "src/common/utils.h" | |||
| #include "src/common/winograd/winograd_helper.h" | |||
| #include "src/naive/handle.h" | |||
| #include "midout.h" | |||
| MIDOUT_DECL(megdnn_naive_winograd_filter_preprocess) | |||
| using namespace megdnn; | |||
| using namespace naive; | |||
| void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src, | |||
| _megdnn_tensor_out dst, | |||
| _megdnn_workspace workspace) { | |||
| check_exec(src.layout, dst.layout, workspace.size); | |||
| //! defaults below correspond to nchw dense conv (ndim == 4) | |||
| size_t flt_start = 0; | |||
| size_t pack_c_size = 1; | |||
| size_t group = 1; | |||
| //! group conv | |||
| if (src.layout.ndim == 5) { | |||
| flt_start = 1; | |||
| group = src.layout[0]; | |||
| //! nchw88 dense conv | |||
| } else if (src.layout.ndim == 6) { | |||
| pack_c_size = src.layout[5]; | |||
| //! nchw88 group conv | |||
| } else if (src.layout.ndim == 7) { | |||
| flt_start = 1; | |||
| group = src.layout[0]; | |||
| pack_c_size = src.layout[6]; | |||
| } | |||
| size_t OC = src.layout[flt_start] * pack_c_size, | |||
| IC = src.layout[flt_start + 1] * pack_c_size, | |||
| FW = src.layout[flt_start + 3]; | |||
| size_t m = param().output_block_size; | |||
| bool execed = false; | |||
| #define cb(_ctype, _dst_type, _input_filter_compute_type, \ | |||
| _output_compute_type, _format, rescale) \ | |||
| if (param().format == _format) { \ | |||
| return winograd::StrategyHelper< \ | |||
| _ctype, _dst_type, _input_filter_compute_type, \ | |||
| _output_compute_type, param::ConvBias::Format::NCHW, \ | |||
| _format>::filter(src_ptr, dst_ptr, workspace_ptr, OC, IC, 0, \ | |||
| OC, m, FW, interp_points, src.layout.dtype, \ | |||
| rescale); \ | |||
| } | |||
| #define DISPATCH_FORMAT_MK4(_ctype, _dst_type, _input_filter_compute_type, \ | |||
| _output_compute_type, _rescale) \ | |||
| cb(_ctype, _dst_type, _input_filter_compute_type, _output_compute_type, \ | |||
| param::Winograd::Format::DEFAULT, _rescale); \ | |||
| cb(_ctype, _dst_type, _input_filter_compute_type, _output_compute_type, \ | |||
| param::Winograd::Format::MK4, _rescale); | |||
| #define DISPATCH_FORMAT_MK8(_ctype, _dst_type, _input_filter_compute_type, \ | |||
| _output_compute_type, _rescale) \ | |||
| cb(_ctype, _dst_type, _input_filter_compute_type, _output_compute_type, \ | |||
| param::Winograd::Format::DEFAULT, _rescale); \ | |||
| cb(_ctype, _dst_type, _input_filter_compute_type, _output_compute_type, \ | |||
| param::Winograd::Format::MK8, _rescale); | |||
| #define DISPATCH_KERNEL(_ctype, _dst_type, _input_filter_compute_type, \ | |||
| _output_compute_type, _kern, _rescale, ...) \ | |||
| const _ctype* src_ptr = src.compatible_ptr<_ctype>(); \ | |||
| _input_filter_compute_type* dst_ptr = \ | |||
| dst.compatible_ptr<_input_filter_compute_type>(); \ | |||
| _input_filter_compute_type* workspace_ptr = \ | |||
| workspace.ptr<_input_filter_compute_type>(); \ | |||
| MIDOUT_BEGIN(megdnn_naive_winograd_filter_preprocess, ##__VA_ARGS__) { \ | |||
| for (size_t g = 0; g < group; g++) { \ | |||
| auto run = [=]() { \ | |||
| _kern(_ctype, _dst_type, _input_filter_compute_type, \ | |||
| _output_compute_type, _rescale); \ | |||
| }; \ | |||
| MEGDNN_DISPATCH_CPU_KERN_OPR(run()); \ | |||
| src_ptr += src.layout.stride[0]; \ | |||
| dst_ptr += dst.layout.stride[0]; \ | |||
| } \ | |||
| execed = true; \ | |||
| } \ | |||
| MIDOUT_END(); | |||
| #define DISPATCH_DTYPE(_midout_tag) \ | |||
| if (src.layout.dtype.enumv() == DTypeEnum::Float32) { \ | |||
| DISPATCH_KERNEL(dt_float32, dt_float32, dt_float32, dt_float32, \ | |||
| DISPATCH_FORMAT_MK4, 1.0f, _midout_tag, 0); \ | |||
| } \ | |||
| if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS8) { \ | |||
| DISPATCH_KERNEL(dt_int8, dt_int8, dt_int16, dt_int32, \ | |||
| DISPATCH_FORMAT_MK8, 2.0f, _midout_tag, 1); \ | |||
| } \ | |||
| MEGDNN_INC_FLOAT16(if (src.layout.dtype.enumv() == DTypeEnum::Float16) { \ | |||
| DISPATCH_KERNEL(dt_float16, dt_float16, dt_float16, dt_float16, \ | |||
| DISPATCH_FORMAT_MK8, 1.0f, _midout_tag, 2); \ | |||
| }) | |||
| if (src.layout.ndim <= 5) { | |||
| //! dispatch_dtype, taking layout and format into account. | |||
| if (FW == 3) { | |||
| if (m == 2) { | |||
| std::vector<float> interp_points = {0, 1, -1}; | |||
| DISPATCH_DTYPE(0); | |||
| } else if (m == 6) { | |||
| std::vector<float> interp_points = {0, 1, -1, 2, -2, 0.5, -0.5}; | |||
| DISPATCH_DTYPE(1); | |||
| } | |||
| } else if (FW == 4) { | |||
| if (m == 5) { | |||
| std::vector<float> interp_points = {0, 0.5, -0.5, 1, -1, 2, -2}; | |||
| DISPATCH_DTYPE(2); | |||
| } | |||
| } else if (FW == 5) { | |||
| if (m == 4) { | |||
| std::vector<float> interp_points = {0, 1, -1, 0.5, -0.5, 2, -2}; | |||
| DISPATCH_DTYPE(3); | |||
| } | |||
| } | |||
| #undef cb | |||
| #undef DISPATCH_FORMAT_MK4 | |||
| #undef DISPATCH_FORMAT_MK8 | |||
| #undef DISPATCH_DTYPE | |||
| } else { | |||
| megdnn_assert(src.layout.ndim == 6 || src.layout.ndim == 7); | |||
| #define cb(_ctype, _dst_type, _input_filter_compute_type, \ | |||
| _output_compute_type, _format, rescale) \ | |||
| if (param().format == _format) { \ | |||
| return winograd::StrategyHelper< \ | |||
| _ctype, _dst_type, _input_filter_compute_type, \ | |||
| _output_compute_type, param::ConvBias::Format::NCHW88, \ | |||
| _format>::filter(src_ptr, dst_ptr, workspace_ptr, OC, IC, 0, \ | |||
| OC, m, FW, interp_points, src.layout.dtype, \ | |||
| rescale); \ | |||
| } | |||
| #define DISPATCH_FORMAT_MK8(_ctype, _dst_type, _input_filter_compute_type, \ | |||
| _output_compute_type, _rescale) \ | |||
| cb(_ctype, _dst_type, _input_filter_compute_type, _output_compute_type, \ | |||
| param::Winograd::Format::MK8, _rescale); | |||
| #define DISPATCH_DTYPE(_midout_tag) \ | |||
| if (src.layout.dtype.enumv() == DTypeEnum::Float32) { \ | |||
| DISPATCH_KERNEL(dt_float32, dt_float32, dt_float32, dt_float32, \ | |||
| DISPATCH_FORMAT_MK8, 1.0f, _midout_tag, 0); \ | |||
| } | |||
| if (pack_c_size == 8) { //! NCHW88 | |||
| if (FW == 3) { | |||
| if (m == 2) { | |||
| std::vector<float> interp_points = {0, 1, -1}; | |||
| DISPATCH_DTYPE(4); | |||
| } else if (m == 6) { | |||
| std::vector<float> interp_points = {0, 1, -1, 2, | |||
| -2, 0.5, -0.5}; | |||
| DISPATCH_DTYPE(5); | |||
| } | |||
| } | |||
| #undef cb | |||
| #undef DISPATCH_DTYPE | |||
| } | |||
| else if (pack_c_size == 4) { //! NCHW44 | |||
| #define cb(_ctype, _dst_type, _input_filter_compute_type, \ | |||
| _output_compute_type, _format, rescale) \ | |||
| if (param().format == _format) { \ | |||
| return winograd::StrategyHelper< \ | |||
| _ctype, _dst_type, _input_filter_compute_type, \ | |||
| _output_compute_type, param::ConvBias::Format::NCHW44, \ | |||
| _format>::filter(src_ptr, dst_ptr, workspace_ptr, OC, IC, 0, \ | |||
| OC, m, FW, interp_points, src.layout.dtype, \ | |||
| rescale); \ | |||
| } | |||
| #define DISPATCH_FORMAT_MK4(_ctype, _dst_type, _input_filter_compute_type, \ | |||
| _output_compute_type, _rescale) \ | |||
| cb(_ctype, _dst_type, _input_filter_compute_type, _output_compute_type, \ | |||
| param::Winograd::Format::MK4, _rescale); | |||
| #define DISPATCH_DTYPE(_midout_tag) \ | |||
| if (src.layout.dtype.enumv() == DTypeEnum::Float32) { \ | |||
| DISPATCH_KERNEL(dt_float32, dt_float32, dt_float32, dt_float32, \ | |||
| DISPATCH_FORMAT_MK4, 1.0f, _midout_tag, 0); \ | |||
| } \ | |||
| if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS8) { \ | |||
| if (param().format == param::Winograd::Format::MK4) { \ | |||
| DISPATCH_KERNEL(dt_int8, dt_int8, dt_float32, dt_float32, \ | |||
| DISPATCH_FORMAT_MK4, 1.0f, _midout_tag, 0); \ | |||
| } else if (param().format == param::Winograd::Format::MK8) { \ | |||
| DISPATCH_KERNEL(dt_int8, dt_int8, dt_int16, dt_int32, \ | |||
| DISPATCH_FORMAT_MK8, 2.0f, _midout_tag, 0); \ | |||
| } \ | |||
| } | |||
| if (FW == 3) { | |||
| if (m == 2) { | |||
| std::vector<float> interp_points = {0, 1, -1}; | |||
| DISPATCH_DTYPE(6); | |||
| } else if (m == 6) { | |||
| std::vector<float> interp_points = {0, 1, -1, 2, | |||
| -2, 0.5, -0.5}; | |||
| DISPATCH_DTYPE(7); | |||
| } else if (m == 7) { | |||
| std::vector<float> interp_points = {0, 1, -1, 2, | |||
| -2, 0.5, -0.5, 1.5}; | |||
| DISPATCH_DTYPE(8); | |||
| } | |||
| } | |||
| #undef cb | |||
| #undef DISPATCH_FORMAT_MK8 | |||
| #undef DISPATCH_FORMAT_MK4 | |||
| #undef DISPATCH_KERNEL | |||
| #undef DISPATCH_DTYPE | |||
| } | |||
| } | |||
| megdnn_assert(execed, | |||
| "Unsupport winograd filter preprocess. m: %zu src: %s", m, | |||
| src.layout.to_string().c_str()); | |||
| } | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -1,28 +0,0 @@ | |||
| /** | |||
| * \file dnn/src/naive/winograd_filter_preprocess/opr_impl.h | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| */ | |||
| #pragma once | |||
| #include "megdnn/oprs.h" | |||
| #include "src/common/utils.h" | |||
| namespace megdnn { | |||
| namespace naive { | |||
| class WinogradFilterPreprocessImpl : public WinogradFilterPreprocess { | |||
| public: | |||
| using WinogradFilterPreprocess::WinogradFilterPreprocess; | |||
| void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, | |||
| _megdnn_workspace workspace) override; | |||
| }; | |||
| } // namespace naive | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -43,12 +43,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF63_8x8::usable( | |||
| strategy, m_tile_size, param) | |||
| .get_matmul_kern_param(param); | |||
| return m_matmul_algo->usable(matmul_param) && | |||
| (param.filter_meta.format == param::ConvBias::Format::NCHW88 || | |||
| (param.filter_meta.format == | |||
| param::ConvBias::Format::NCHW88_WINOGRAD && | |||
| param.output_block_size == 6 && | |||
| param.winograd_matmul_format == | |||
| param::MatrixMul::Format::MK8)) && | |||
| param.filter_meta.format == param::ConvBias::Format::NCHW88 && | |||
| !param.filter_meta.should_flip && | |||
| (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && | |||
| param.filter_meta.spatial[0] == 3) && | |||
| @@ -89,12 +84,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF23_8x8::usable( | |||
| strategy, m_tile_size, param) | |||
| .get_matmul_kern_param(param); | |||
| return m_matmul_algo->usable(matmul_param) && | |||
| (param.filter_meta.format == param::ConvBias::Format::NCHW88 || | |||
| (param.filter_meta.format == | |||
| param::ConvBias::Format::NCHW88_WINOGRAD && | |||
| param.output_block_size == 2 && | |||
| param.winograd_matmul_format == | |||
| param::MatrixMul::Format::MK8)) && | |||
| param.filter_meta.format == param::ConvBias::Format::NCHW88 && | |||
| !param.filter_meta.should_flip && | |||
| (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && | |||
| param.filter_meta.spatial[0] == 3) && | |||
| @@ -173,11 +173,7 @@ SmallVector<AlgoCategory> ConvBiasImpl::suggest_algo_category_order( | |||
| auto FH = param.filter_meta.spatial[0]; | |||
| auto FW = param.filter_meta.spatial[1]; | |||
| //! TODO: winograd is currently only supported in fast-run | |||
| if (param.filter_meta.format == param::ConvBias::Format::NCHW_WINOGRAD || | |||
| param.filter_meta.format == param::ConvBias::Format::NCHW44_WINOGRAD || | |||
| param.filter_meta.format == param::ConvBias::Format::NCHW88_WINOGRAD) { | |||
| return {AlgoCategory::WINOGRAD}; | |||
| } | |||
| //! nchw88 uses mkl-dnn, whose algo category is direct | |||
| if (param.filter_meta.format == param::ConvBias::Format::NCHW88) { | |||
| return {AlgoCategory::DIRECT, AlgoCategory::IM2COL}; | |||
| @@ -629,6 +629,35 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_INT8_DIRECT_DOT_NCHW44_S2_8x8x32) { | |||
| #endif | |||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD) { | |||
| using namespace conv_bias; | |||
| std::vector<TestArg> args = get_winograd_args(3); | |||
| Checker<ConvBiasForward> checker(handle()); | |||
| auto run = [&checker](const std::vector<TestArg>& args, DType A_dtype, | |||
| DType B_dtype, DType C_dtype, DType D_dtype, | |||
| const float eps) { | |||
| for (auto&& arg : args) { | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| }; | |||
| run(args, dtype::Float32(), dtype::Float32(), dtype::Float32(), | |||
| dtype::Float32(), 1e-3f); | |||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| Float16PeriodicalRNG* rng = new Float16PeriodicalRNG(0x3c00); | |||
| checker.set_rng(0, rng).set_rng(1, rng).set_rng(2, rng); | |||
| run(args, dtype::Float16(), dtype::Float16(), dtype::Float16(), | |||
| dtype::Float16(), 0.35f); | |||
| #endif | |||
| } | |||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F23_4) { | |||
| using namespace conv_bias; | |||
| std::vector<TestArg> args = get_winograd_mk_packed_args(); | |||
| @@ -717,207 +746,97 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F45) { | |||
| check_winograd("1:4:32", checker, args); | |||
| } | |||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD) { | |||
| using namespace conv_bias; | |||
| std::vector<TestArg> args = get_winograd_args(3); | |||
| Checker<ConvBiasForward> checker(handle()); | |||
| auto extra_impl = [](const TensorNDArray& tensors, uint32_t m, | |||
| param::ConvBias param, Handle* handle) { | |||
| megdnn_assert(param.format == param::ConvBias::Format::NCHW); | |||
| auto winograd_preprocess_opr = | |||
| handle->create_operator<WinogradFilterPreprocess>(); | |||
| winograd_preprocess_opr->param().output_block_size = m; | |||
| TensorLayout filter_transform_layout; | |||
| winograd_preprocess_opr->deduce_layout(tensors[1].layout, | |||
| filter_transform_layout); | |||
| size_t winograd_preprocess_workspace_in_bytes = | |||
| winograd_preprocess_opr->get_workspace_in_bytes( | |||
| tensors[1].layout, filter_transform_layout); | |||
| auto conv_bias_opr = handle->create_operator<ConvBias>(); | |||
| conv_bias_opr->param() = param; | |||
| conv_bias_opr->param().format = param::ConvBias::Format::NCHW_WINOGRAD; | |||
| conv_bias_opr->param().output_block_size = m; | |||
| size_t conv_bias_workspace_in_bytes = | |||
| conv_bias_opr->get_workspace_in_bytes( | |||
| tensors[0].layout, filter_transform_layout, | |||
| tensors[2].layout, tensors[3].layout, tensors[4].layout, | |||
| nullptr); | |||
| WorkspaceBundle wb(nullptr, {filter_transform_layout.span().dist_byte(), | |||
| conv_bias_workspace_in_bytes, | |||
| winograd_preprocess_workspace_in_bytes}); | |||
| wb.set(malloc(wb.total_size_in_bytes())); | |||
| TensorND filter_transform_tensor(wb.get(0), | |||
| std::move(filter_transform_layout)); | |||
| winograd_preprocess_opr->exec(tensors[1], filter_transform_tensor, | |||
| wb.get_workspace(2)); | |||
| conv_bias_opr->exec(tensors[0], filter_transform_tensor, tensors[2], | |||
| tensors[3], tensors[4], nullptr, | |||
| wb.get_workspace(1)); | |||
| free(wb.ptr()); | |||
| }; | |||
| auto run = [&checker, &extra_impl]( | |||
| Handle* handle, const std::vector<TestArg>& args, | |||
| const std::vector<size_t>& out_size, DType A_dtype, | |||
| DType B_dtype, DType C_dtype, DType D_dtype, | |||
| const float eps) { | |||
| for (auto&& arg : args) { | |||
| for (uint32_t m : out_size) { | |||
| checker.set_extra_opr_impl(std::bind(extra_impl, | |||
| std::placeholders::_1, m, | |||
| arg.param, handle)); | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| } | |||
| }; | |||
| run(handle(), args, {6}, dtype::Float32(), dtype::Float32(), | |||
| dtype::Float32(), dtype::Float32(), 1e-3f); | |||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| Float16PeriodicalRNG* rng = new Float16PeriodicalRNG(0x3c00); | |||
| checker.set_rng(0, rng).set_rng(1, rng).set_rng(2, rng); | |||
| run(handle(), args, {6}, dtype::Float16(), dtype::Float16(), | |||
| dtype::Float16(), dtype::Float16(), 0.35f); | |||
| #endif | |||
| } | |||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_MK_PACKED_F32_1) { | |||
| using namespace conv_bias; | |||
| Checker<ConvBiasForward> checker(handle()); | |||
| auto run = [&checker](Handle* handle, const std::vector<TestArg>& args, | |||
| const std::vector<size_t>& out_size, DType A_dtype, | |||
| auto run = [&checker](const std::vector<TestArg>& args, DType A_dtype, | |||
| DType B_dtype, DType C_dtype, DType D_dtype, | |||
| param::MatrixMul::Format format, float eps) { | |||
| float eps) { | |||
| for (auto&& arg : args) { | |||
| for (uint32_t m : out_size) { | |||
| checker.set_extra_opr_impl(std::bind( | |||
| winograd_algo_extra_impl, std::placeholders::_1, m, | |||
| arg.param, handle, format)); | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| }; | |||
| std::vector<TestArg> args = get_winograd_mk_packed_args(8); | |||
| std::vector<TestArg> args_first_half(args.begin(), | |||
| args.begin() + args.size() / 2); | |||
| run(handle(), args_first_half, {2, 6}, dtype::Float32{}, dtype::Float32{}, | |||
| dtype::Float32{}, dtype::Float32{}, param::MatrixMul::Format::MK4, | |||
| 1e-3f); | |||
| run(args_first_half, dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, | |||
| dtype::Float32{}, 1e-3f); | |||
| } | |||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_MK_PACKED_F32_2) { | |||
| using namespace conv_bias; | |||
| Checker<ConvBiasForward> checker(handle()); | |||
| auto run = [&checker](Handle* handle, const std::vector<TestArg>& args, | |||
| const std::vector<size_t>& out_size, DType A_dtype, | |||
| auto run = [&checker](const std::vector<TestArg>& args, DType A_dtype, | |||
| DType B_dtype, DType C_dtype, DType D_dtype, | |||
| param::MatrixMul::Format format, float eps) { | |||
| float eps) { | |||
| for (auto&& arg : args) { | |||
| for (uint32_t m : out_size) { | |||
| checker.set_extra_opr_impl(std::bind( | |||
| winograd_algo_extra_impl, std::placeholders::_1, m, | |||
| arg.param, handle, format)); | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| }; | |||
| std::vector<TestArg> args = get_winograd_mk_packed_args(8); | |||
| std::vector<TestArg> args_second_half(args.begin() + args.size() / 2, | |||
| args.end()); | |||
| run(handle(), args_second_half, {2, 6}, dtype::Float32{}, dtype::Float32{}, | |||
| dtype::Float32{}, dtype::Float32{}, param::MatrixMul::Format::MK4, | |||
| 1e-3f); | |||
| run(args_second_half, dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, | |||
| dtype::Float32{}, 1e-3f); | |||
| } | |||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_MK_PACKED_F16) { | |||
| using namespace conv_bias; | |||
| Checker<ConvBiasForward> checker(handle()); | |||
| auto run = [&checker](Handle* handle, const std::vector<TestArg>& args, | |||
| const std::vector<size_t>& out_size, DType A_dtype, | |||
| auto run = [&checker](const std::vector<TestArg>& args, DType A_dtype, | |||
| DType B_dtype, DType C_dtype, DType D_dtype, | |||
| param::MatrixMul::Format format, float eps) { | |||
| float eps) { | |||
| for (auto&& arg : args) { | |||
| for (uint32_t m : out_size) { | |||
| checker.set_extra_opr_impl(std::bind( | |||
| winograd_algo_extra_impl, std::placeholders::_1, m, | |||
| arg.param, handle, format)); | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| }; | |||
| std::vector<TestArg> args = get_winograd_mk_packed_args(8); | |||
| Float16PeriodicalRNG* rng = new Float16PeriodicalRNG(0x3c00); | |||
| checker.set_rng(0, rng).set_rng(1, rng).set_rng(2, rng); | |||
| run(handle(), args, {2}, dtype::Float16{}, dtype::Float16{}, | |||
| dtype::Float16{}, dtype::Float16{}, param::MatrixMul::Format::MK8, | |||
| 0.25); | |||
| run(args, dtype::Float16{}, dtype::Float16{}, dtype::Float16{}, | |||
| dtype::Float16{}, 0.25); | |||
| } | |||
| #endif | |||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_MK_PACKED_INT8) { | |||
| using namespace conv_bias; | |||
| Checker<ConvBiasForward> checker(handle()); | |||
| auto run = [&checker](Handle* handle, const std::vector<TestArg>& args, | |||
| const std::vector<size_t>& out_size, DType A_dtype, | |||
| auto run = [&checker](const std::vector<TestArg>& args, DType A_dtype, | |||
| DType B_dtype, DType C_dtype, DType D_dtype, | |||
| param::MatrixMul::Format format, float eps) { | |||
| float eps) { | |||
| for (auto&& arg : args) { | |||
| for (uint32_t m : out_size) { | |||
| checker.set_extra_opr_impl(std::bind( | |||
| winograd_algo_extra_impl, std::placeholders::_1, m, | |||
| arg.param, handle, format)); | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| }; | |||
| @@ -933,24 +852,19 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_MK_PACKED_INT8) { | |||
| get_quantized_winograd_mk_packed_args(8); | |||
| UniformIntRNG int_rng{-50, 50}; | |||
| checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng); | |||
| run(handle(), quantized_args, {2}, dtype::QuantizedS8(2.5f), | |||
| dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), | |||
| dtype::QuantizedS8(60.25f), param::MatrixMul::Format::MK8, 1e-3); | |||
| run(quantized_args, dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f), | |||
| dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f), 1e-3); | |||
| } | |||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8) { | |||
| using namespace conv_bias; | |||
| Checker<ConvBiasForward> checker(handle()); | |||
| auto run = [&checker](Handle* handle, const std::vector<TestArg>& args, | |||
| const std::vector<size_t>& out_size, DType A_dtype, | |||
| auto run = [&checker](const std::vector<TestArg>& args, | |||
| DType A_dtype, | |||
| DType B_dtype, DType C_dtype, DType D_dtype, | |||
| param::MatrixMul::Format format, float eps) { | |||
| float eps) { | |||
| for (auto&& arg : args) { | |||
| for (uint32_t m : out_size) { | |||
| checker.set_extra_opr_impl(std::bind( | |||
| winograd_algo_extra_impl, std::placeholders::_1, m, | |||
| arg.param, handle, format)); | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| @@ -958,7 +872,6 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8) { | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| } | |||
| }; | |||
| @@ -973,118 +886,99 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8) { | |||
| std::vector<TestArg> quantized_args = get_int8_nchw44_args(3, 4); | |||
| UniformIntRNG int_rng{-50, 50}; | |||
| checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng); | |||
| run(handle(), quantized_args, {2}, dtype::QuantizedS8(2.5f), | |||
| run(quantized_args, dtype::QuantizedS8(2.5f), | |||
| dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), | |||
| dtype::QuantizedS8(60.25f), param::MatrixMul::Format::MK8, 1e-3); | |||
| dtype::QuantizedS8(60.25f), 1e-3); | |||
| } | |||
| TEST_F(ARM_COMMON_MULTI_THREADS, | |||
| CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8_GROUPMODE) { | |||
| CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8_COMP_F32_GROUPMODE) { | |||
| using namespace conv_bias; | |||
| Checker<ConvBiasForward> checker(handle()); | |||
| auto run = [&checker](Handle* handle, const std::vector<TestArg>& args, | |||
| const std::vector<size_t>& out_size, DType A_dtype, | |||
| auto run = [&checker](const std::vector<TestArg>& args, DType A_dtype, | |||
| DType B_dtype, DType C_dtype, DType D_dtype, | |||
| param::MatrixMul::Format format, float eps) { | |||
| float eps) { | |||
| for (auto&& arg : args) { | |||
| for (uint32_t m : out_size) { | |||
| checker.set_extra_opr_impl(std::bind( | |||
| winograd_algo_extra_impl, std::placeholders::_1, m, | |||
| arg.param, handle, format)); | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| }; | |||
| float epsilon = 0.001; | |||
| #if MEGDNN_AARCH64 | |||
| const char* matmul_name = "AARCH64_INT16X16X32_MK8_8X8"; | |||
| const char* matmul_name = "AARCH64_F32_MK4_4x16"; | |||
| #else | |||
| const char* matmul_name = "ARMV7_INT16X16X32_MK8_4X8"; | |||
| const char* matmul_name = "ARMV7_F32_MK4_4x8"; | |||
| #endif | |||
| checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>( | |||
| ssprintf("WINOGRAD_NCHW44:%s:8:2:32", matmul_name).c_str())); | |||
| ssprintf("WINOGRAD_NCHW44:%s:4:2:32", matmul_name).c_str())); | |||
| std::vector<TestArg> quantized_args = | |||
| get_int8_nchw44_args(3, 4, false, true); | |||
| get_int8_nchw44_args(3, 4, true, true); | |||
| UniformIntRNG int_rng{-50, 50}; | |||
| checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng); | |||
| run(handle(), quantized_args, {2}, dtype::QuantizedS8(2.5f), | |||
| dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), | |||
| dtype::QuantizedS8(60.25f), param::MatrixMul::Format::MK8, 1e-3); | |||
| run(quantized_args, dtype::QuantizedS8(0.41113496f), | |||
| dtype::QuantizedS8(0.01887994f), | |||
| dtype::QuantizedS32(0.41113496f * 0.01887994f), | |||
| dtype::QuantizedS8(0.49550694f), epsilon); | |||
| } | |||
| TEST_F(ARM_COMMON_MULTI_THREADS, | |||
| CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8_COMP_F32) { | |||
| CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8_GROUPMODE) { | |||
| using namespace conv_bias; | |||
| Checker<ConvBiasForward> checker(handle()); | |||
| auto run = [&checker](Handle* handle, const std::vector<TestArg>& args, | |||
| const std::vector<size_t>& out_size, DType A_dtype, | |||
| auto run = [&checker](const std::vector<TestArg>& args, DType A_dtype, | |||
| DType B_dtype, DType C_dtype, DType D_dtype, | |||
| param::MatrixMul::Format format, float eps) { | |||
| float eps) { | |||
| for (auto&& arg : args) { | |||
| for (uint32_t m : out_size) { | |||
| checker.set_extra_opr_impl(std::bind( | |||
| winograd_algo_extra_impl, std::placeholders::_1, m, | |||
| arg.param, handle, format)); | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| }; | |||
| float epsilon = 0.001; | |||
| #if MEGDNN_AARCH64 | |||
| const char* matmul_name = "AARCH64_F32_MK4_4x16"; | |||
| const char* matmul_name = "AARCH64_INT16X16X32_MK8_8X8"; | |||
| #else | |||
| const char* matmul_name = "ARMV7_F32_MK4_4x8"; | |||
| const char* matmul_name = "ARMV7_INT16X16X32_MK8_4X8"; | |||
| #endif | |||
| checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>( | |||
| ssprintf("WINOGRAD_NCHW44:%s:4:2:32", matmul_name).c_str())); | |||
| std::vector<TestArg> quantized_args = get_int8_nchw44_args(3, 4, true); | |||
| ssprintf("WINOGRAD_NCHW44:%s:8:2:32", matmul_name).c_str())); | |||
| std::vector<TestArg> quantized_args = | |||
| get_int8_nchw44_args(3, 4, false, true); | |||
| UniformIntRNG int_rng{-50, 50}; | |||
| checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng); | |||
| run(handle(), quantized_args, {2}, dtype::QuantizedS8(0.41113496f), | |||
| dtype::QuantizedS8(0.01887994f), | |||
| dtype::QuantizedS32(0.41113496f * 0.01887994f), | |||
| dtype::QuantizedS8(0.49550694f), param::MatrixMul::Format::MK4, | |||
| epsilon); | |||
| run(quantized_args, dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f), | |||
| dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f), 1e-3); | |||
| } | |||
| TEST_F(ARM_COMMON_MULTI_THREADS, | |||
| CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8_COMP_F32_GROUPMODE) { | |||
| CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8_COMP_F32) { | |||
| using namespace conv_bias; | |||
| Checker<ConvBiasForward> checker(handle()); | |||
| auto run = [&checker](Handle* handle, const std::vector<TestArg>& args, | |||
| const std::vector<size_t>& out_size, DType A_dtype, | |||
| auto run = [&checker](const std::vector<TestArg>& args, DType A_dtype, | |||
| DType B_dtype, DType C_dtype, DType D_dtype, | |||
| param::MatrixMul::Format format, float eps) { | |||
| float eps) { | |||
| for (auto&& arg : args) { | |||
| for (uint32_t m : out_size) { | |||
| checker.set_extra_opr_impl(std::bind( | |||
| winograd_algo_extra_impl, std::placeholders::_1, m, | |||
| arg.param, handle, format)); | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| }; | |||
| @@ -1096,23 +990,15 @@ TEST_F(ARM_COMMON_MULTI_THREADS, | |||
| #endif | |||
| checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>( | |||
| ssprintf("WINOGRAD_NCHW44:%s:4:2:32", matmul_name).c_str())); | |||
| std::vector<TestArg> quantized_args = | |||
| get_int8_nchw44_args(3, 4, true, true); | |||
| std::vector<TestArg> quantized_args = get_int8_nchw44_args(3, 4, true); | |||
| UniformIntRNG int_rng{-50, 50}; | |||
| checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng); | |||
| run(handle(), quantized_args, {2}, dtype::QuantizedS8(0.41113496f), | |||
| run(quantized_args, dtype::QuantizedS8(0.41113496f), | |||
| dtype::QuantizedS8(0.01887994f), | |||
| dtype::QuantizedS32(0.41113496f * 0.01887994f), | |||
| dtype::QuantizedS8(0.49550694f), param::MatrixMul::Format::MK4, | |||
| epsilon); | |||
| dtype::QuantizedS8(0.49550694f), epsilon); | |||
| } | |||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F16_F23) { | |||
| using namespace conv_bias; | |||
| @@ -1170,7 +1056,6 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F16_8x8_2) { | |||
| check_winograd_fp16("8:2:32", checker, args_back_half, rng, 0.25, | |||
| param::MatrixMul::Format::MK8); | |||
| } | |||
| #endif | |||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_INT8_8X8) { | |||
| using namespace conv_bias; | |||
| @@ -1187,6 +1072,7 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_INT8_8X8) { | |||
| check_winograd("8:2:32", checker, args, param::MatrixMul::Format::MK8); | |||
| } | |||
| TEST_F(ARM_COMMON_MULTI_THREADS, | |||
| CONV_BIAS_WINOGRAD_INT8_8X8_WEIGHT_PREPROCESS) { | |||
| using namespace conv_bias; | |||
| @@ -83,56 +83,12 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_PREPROCESS_NCHW44) { | |||
| Checker<ConvBiasForward> checker(handle()); | |||
| auto extra_impl = [](const TensorNDArray& tensors, uint32_t m, | |||
| param::ConvBias param, Handle* handle) { | |||
| megdnn_assert(param.format == param::ConvBias::Format::NCHW44); | |||
| auto winograd_preprocess_opr = | |||
| handle->create_operator<WinogradFilterPreprocess>(); | |||
| winograd_preprocess_opr->param().output_block_size = m; | |||
| winograd_preprocess_opr->param().format = param::MatrixMul::Format::MK4; | |||
| TensorLayout filter_transform_layout; | |||
| winograd_preprocess_opr->deduce_layout(tensors[1].layout, | |||
| filter_transform_layout); | |||
| size_t winograd_preprocess_workspace_in_bytes = | |||
| winograd_preprocess_opr->get_workspace_in_bytes( | |||
| tensors[1].layout, filter_transform_layout); | |||
| auto conv_bias_opr = handle->create_operator<ConvBias>(); | |||
| conv_bias_opr->param() = param; | |||
| conv_bias_opr->param().format = | |||
| param::ConvBias::Format::NCHW44_WINOGRAD; | |||
| conv_bias_opr->param().output_block_size = m; | |||
| size_t conv_bias_workspace_in_bytes = | |||
| conv_bias_opr->get_workspace_in_bytes( | |||
| tensors[0].layout, filter_transform_layout, | |||
| tensors[2].layout, tensors[3].layout, tensors[4].layout, | |||
| nullptr); | |||
| WorkspaceBundle wb(nullptr, {filter_transform_layout.span().dist_byte(), | |||
| conv_bias_workspace_in_bytes, | |||
| winograd_preprocess_workspace_in_bytes}); | |||
| wb.set(malloc(wb.total_size_in_bytes())); | |||
| TensorND filter_transform_tensor(wb.get(0), | |||
| std::move(filter_transform_layout)); | |||
| winograd_preprocess_opr->exec(tensors[1], filter_transform_tensor, | |||
| wb.get_workspace(2)); | |||
| conv_bias_opr->exec(tensors[0], filter_transform_tensor, tensors[2], | |||
| tensors[3], tensors[4], nullptr, | |||
| wb.get_workspace(1)); | |||
| free(wb.ptr()); | |||
| }; | |||
| auto run = [&checker, &extra_impl]( | |||
| Handle* handle, const std::vector<TestArg>& args, | |||
| const std::vector<size_t>& out_size, DType A_dtype, | |||
| auto run = [&checker]( | |||
| const std::vector<TestArg>& args, | |||
| DType A_dtype, | |||
| DType B_dtype, DType C_dtype, DType D_dtype, | |||
| const float eps) { | |||
| for (auto&& arg : args) { | |||
| for (uint32_t m : out_size) { | |||
| checker.set_extra_opr_impl(std::bind(extra_impl, | |||
| std::placeholders::_1, m, | |||
| arg.param, handle)); | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| @@ -140,7 +96,6 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_PREPROCESS_NCHW44) { | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| } | |||
| }; | |||
| @@ -149,7 +104,7 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_PREPROCESS_NCHW44) { | |||
| // dtype::Float32(), dtype::Float32(), 1e-2f); | |||
| //! remove this when low precision mode is ok | |||
| run(handle(), nchw44_args, {2, 6}, dtype::Float32(), dtype::Float32(), | |||
| run(nchw44_args, dtype::Float32(), dtype::Float32(), | |||
| dtype::Float32(), dtype::Float32(), 1e-3f); | |||
| } | |||
| TEST_F(ARM_COMMON_MULTI_THREADS, | |||
| @@ -158,31 +113,24 @@ TEST_F(ARM_COMMON_MULTI_THREADS, | |||
| Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker( | |||
| handle()); | |||
| auto run = [&checker](Handle* handle, const std::vector<TestArg>& args, | |||
| const std::vector<size_t>& out_size, DType A_dtype, | |||
| auto run = [&checker](const std::vector<TestArg>& args, DType A_dtype, | |||
| DType B_dtype, DType C_dtype, DType D_dtype, | |||
| param::MatrixMul::Format format, float eps) { | |||
| float eps) { | |||
| for (auto&& arg : args) { | |||
| for (uint32_t m : out_size) { | |||
| checker.set_extra_opr_impl(std::bind( | |||
| winograd_algo_extra_impl, std::placeholders::_1, m, | |||
| arg.param, handle, format)); | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| }; | |||
| std::vector<TestArg> args = get_winograd_mk_packed_args(8); | |||
| std::vector<TestArg> args_first_half(args.begin(), | |||
| args.begin() + args.size() / 2); | |||
| run(handle(), args_first_half, {2, 6}, dtype::Float32{}, dtype::Float32{}, | |||
| dtype::Float32{}, dtype::Float32{}, param::MatrixMul::Format::MK4, | |||
| 1e-3f); | |||
| run(args_first_half, dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, | |||
| dtype::Float32{}, 1e-3f); | |||
| } | |||
| TEST_F(ARM_COMMON_MULTI_THREADS, | |||
| CONV_BIAS_WINOGRAD_MK_PACKED_F32_2_WEIGHT_PREPROCESS) { | |||
| @@ -190,31 +138,24 @@ TEST_F(ARM_COMMON_MULTI_THREADS, | |||
| Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker( | |||
| handle()); | |||
| auto run = [&checker](Handle* handle, const std::vector<TestArg>& args, | |||
| const std::vector<size_t>& out_size, DType A_dtype, | |||
| auto run = [&checker](const std::vector<TestArg>& args, DType A_dtype, | |||
| DType B_dtype, DType C_dtype, DType D_dtype, | |||
| param::MatrixMul::Format format, float eps) { | |||
| float eps) { | |||
| for (auto&& arg : args) { | |||
| for (uint32_t m : out_size) { | |||
| checker.set_extra_opr_impl(std::bind( | |||
| winograd_algo_extra_impl, std::placeholders::_1, m, | |||
| arg.param, handle, format)); | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| }; | |||
| std::vector<TestArg> args = get_winograd_mk_packed_args(8); | |||
| std::vector<TestArg> args_second_half(args.begin() + args.size() / 2, | |||
| args.end()); | |||
| run(handle(), args_second_half, {2, 6}, dtype::Float32{}, dtype::Float32{}, | |||
| dtype::Float32{}, dtype::Float32{}, param::MatrixMul::Format::MK4, | |||
| 1e-3f); | |||
| run(args_second_half, dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, | |||
| dtype::Float32{}, 1e-3f); | |||
| } | |||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| TEST_F(ARM_COMMON_MULTI_THREADS, | |||
| @@ -223,32 +164,25 @@ TEST_F(ARM_COMMON_MULTI_THREADS, | |||
| Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker( | |||
| handle()); | |||
| auto run = [&checker](Handle* handle, const std::vector<TestArg>& args, | |||
| const std::vector<size_t>& out_size, DType A_dtype, | |||
| auto run = [&checker](const std::vector<TestArg>& args, DType A_dtype, | |||
| DType B_dtype, DType C_dtype, DType D_dtype, | |||
| param::MatrixMul::Format format, float eps) { | |||
| float eps) { | |||
| for (auto&& arg : args) { | |||
| for (uint32_t m : out_size) { | |||
| checker.set_extra_opr_impl(std::bind( | |||
| winograd_algo_extra_impl, std::placeholders::_1, m, | |||
| arg.param, handle, format)); | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| }; | |||
| std::vector<TestArg> args = get_winograd_mk_packed_args(8); | |||
| Float16PeriodicalRNG* rng = new Float16PeriodicalRNG(0x3c00); | |||
| checker.set_rng(0, rng).set_rng(1, rng).set_rng(2, rng); | |||
| run(handle(), args, {2}, dtype::Float16{}, dtype::Float16{}, | |||
| dtype::Float16{}, dtype::Float16{}, param::MatrixMul::Format::MK8, | |||
| 0.25); | |||
| run(args, dtype::Float16{}, dtype::Float16{}, dtype::Float16{}, | |||
| dtype::Float16{}, 0.25); | |||
| } | |||
| #endif | |||
| TEST_F(ARM_COMMON_MULTI_THREADS, | |||
| @@ -257,23 +191,17 @@ TEST_F(ARM_COMMON_MULTI_THREADS, | |||
| Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker( | |||
| handle()); | |||
| auto run = [&checker](Handle* handle, const std::vector<TestArg>& args, | |||
| const std::vector<size_t>& out_size, DType A_dtype, | |||
| auto run = [&checker](const std::vector<TestArg>& args, DType A_dtype, | |||
| DType B_dtype, DType C_dtype, DType D_dtype, | |||
| param::MatrixMul::Format format, float eps) { | |||
| float eps) { | |||
| for (auto&& arg : args) { | |||
| for (uint32_t m : out_size) { | |||
| checker.set_extra_opr_impl(std::bind( | |||
| winograd_algo_extra_impl, std::placeholders::_1, m, | |||
| arg.param, handle, format)); | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| }; | |||
| @@ -289,9 +217,8 @@ TEST_F(ARM_COMMON_MULTI_THREADS, | |||
| get_quantized_winograd_mk_packed_args(8); | |||
| UniformIntRNG int_rng{-50, 50}; | |||
| checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng); | |||
| run(handle(), quantized_args, {2}, dtype::QuantizedS8(2.5f), | |||
| dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), | |||
| dtype::QuantizedS8(60.25f), param::MatrixMul::Format::MK8, 1e-3); | |||
| run(quantized_args, dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f), | |||
| dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f), 1e-3); | |||
| } | |||
| TEST_F(ARM_COMMON_MULTI_THREADS, | |||
| CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8_WEIGHT_PREPROCESS) { | |||
| @@ -299,15 +226,11 @@ TEST_F(ARM_COMMON_MULTI_THREADS, | |||
| Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker( | |||
| handle()); | |||
| auto run = [&checker](Handle* handle, const std::vector<TestArg>& args, | |||
| const std::vector<size_t>& out_size, DType A_dtype, | |||
| auto run = [&checker](const std::vector<TestArg>& args, | |||
| DType A_dtype, | |||
| DType B_dtype, DType C_dtype, DType D_dtype, | |||
| param::MatrixMul::Format format, float eps) { | |||
| float eps) { | |||
| for (auto&& arg : args) { | |||
| for (uint32_t m : out_size) { | |||
| checker.set_extra_opr_impl(std::bind( | |||
| winograd_algo_extra_impl, std::placeholders::_1, m, | |||
| arg.param, handle, format)); | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| @@ -315,7 +238,6 @@ TEST_F(ARM_COMMON_MULTI_THREADS, | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| } | |||
| }; | |||
| @@ -330,9 +252,8 @@ TEST_F(ARM_COMMON_MULTI_THREADS, | |||
| std::vector<TestArg> quantized_args = get_int8_nchw44_args(3, 4); | |||
| UniformIntRNG int_rng{-50, 50}; | |||
| checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng); | |||
| run(handle(), quantized_args, {2}, dtype::QuantizedS8(2.5f), | |||
| dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), | |||
| dtype::QuantizedS8(60.25f), param::MatrixMul::Format::MK8, 1e-3); | |||
| run(quantized_args, dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f), | |||
| dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f), 1e-3); | |||
| } | |||
| TEST_F(ARM_COMMON_MULTI_THREADS, | |||
| CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8_GROUPMODE_WEIGHT_PREPROCESS) { | |||
| @@ -340,23 +261,17 @@ TEST_F(ARM_COMMON_MULTI_THREADS, | |||
| Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker( | |||
| handle()); | |||
| auto run = [&checker](Handle* handle, const std::vector<TestArg>& args, | |||
| const std::vector<size_t>& out_size, DType A_dtype, | |||
| auto run = [&checker](const std::vector<TestArg>& args, DType A_dtype, | |||
| DType B_dtype, DType C_dtype, DType D_dtype, | |||
| param::MatrixMul::Format format, float eps) { | |||
| float eps) { | |||
| for (auto&& arg : args) { | |||
| for (uint32_t m : out_size) { | |||
| checker.set_extra_opr_impl(std::bind( | |||
| winograd_algo_extra_impl, std::placeholders::_1, m, | |||
| arg.param, handle, format)); | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| }; | |||
| @@ -372,9 +287,8 @@ TEST_F(ARM_COMMON_MULTI_THREADS, | |||
| get_int8_nchw44_args(3, 4, false, true); | |||
| UniformIntRNG int_rng{-50, 50}; | |||
| checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng); | |||
| run(handle(), quantized_args, {2}, dtype::QuantizedS8(2.5f), | |||
| dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), | |||
| dtype::QuantizedS8(60.25f), param::MatrixMul::Format::MK8, 1e-3); | |||
| run(quantized_args, dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f), | |||
| dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f), 1e-3); | |||
| } | |||
| TEST_F(ARM_COMMON_MULTI_THREADS, | |||
| @@ -383,23 +297,17 @@ TEST_F(ARM_COMMON_MULTI_THREADS, | |||
| Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker( | |||
| handle()); | |||
| auto run = [&checker](Handle* handle, const std::vector<TestArg>& args, | |||
| const std::vector<size_t>& out_size, DType A_dtype, | |||
| auto run = [&checker](const std::vector<TestArg>& args, DType A_dtype, | |||
| DType B_dtype, DType C_dtype, DType D_dtype, | |||
| param::MatrixMul::Format format, float eps) { | |||
| float eps) { | |||
| for (auto&& arg : args) { | |||
| for (uint32_t m : out_size) { | |||
| checker.set_extra_opr_impl(std::bind( | |||
| winograd_algo_extra_impl, std::placeholders::_1, m, | |||
| arg.param, handle, format)); | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| }; | |||
| @@ -414,11 +322,10 @@ TEST_F(ARM_COMMON_MULTI_THREADS, | |||
| std::vector<TestArg> quantized_args = get_int8_nchw44_args(3, 4, true); | |||
| UniformIntRNG int_rng{-50, 50}; | |||
| checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng); | |||
| run(handle(), quantized_args, {2}, dtype::QuantizedS8(0.41113496f), | |||
| run(quantized_args, dtype::QuantizedS8(0.41113496f), | |||
| dtype::QuantizedS8(0.01887994f), | |||
| dtype::QuantizedS32(0.41113496f * 0.01887994f), | |||
| dtype::QuantizedS8(0.49550694f), param::MatrixMul::Format::MK4, | |||
| epsilon); | |||
| dtype::QuantizedS8(0.49550694f), epsilon); | |||
| } | |||
| TEST_F(ARM_COMMON_MULTI_THREADS, | |||
| @@ -427,23 +334,17 @@ TEST_F(ARM_COMMON_MULTI_THREADS, | |||
| Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker( | |||
| handle()); | |||
| auto run = [&checker](Handle* handle, const std::vector<TestArg>& args, | |||
| const std::vector<size_t>& out_size, DType A_dtype, | |||
| auto run = [&checker](const std::vector<TestArg>& args, DType A_dtype, | |||
| DType B_dtype, DType C_dtype, DType D_dtype, | |||
| param::MatrixMul::Format format, float eps) { | |||
| float eps) { | |||
| for (auto&& arg : args) { | |||
| for (uint32_t m : out_size) { | |||
| checker.set_extra_opr_impl(std::bind( | |||
| winograd_algo_extra_impl, std::placeholders::_1, m, | |||
| arg.param, handle, format)); | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| }; | |||
| @@ -459,11 +360,10 @@ TEST_F(ARM_COMMON_MULTI_THREADS, | |||
| get_int8_nchw44_args(3, 4, true, true); | |||
| UniformIntRNG int_rng{-50, 50}; | |||
| checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng); | |||
| run(handle(), quantized_args, {2}, dtype::QuantizedS8(0.41113496f), | |||
| run(quantized_args, dtype::QuantizedS8(0.41113496f), | |||
| dtype::QuantizedS8(0.01887994f), | |||
| dtype::QuantizedS32(0.41113496f * 0.01887994f), | |||
| dtype::QuantizedS8(0.49550694f), param::MatrixMul::Format::MK4, | |||
| epsilon); | |||
| dtype::QuantizedS8(0.49550694f), epsilon); | |||
| } | |||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F16_F23_WEIGHT_PREPROCESS) { | |||
| @@ -1,91 +0,0 @@ | |||
| /** | |||
| * \file dnn/test/arm_common/winograd_filter_preprocess.cpp | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| */ | |||
| #include "test/common/checker.h" | |||
| #include "test/common/benchmarker.h" | |||
| #include "test/common/winograd_filter_preprocess.h" | |||
| #include "test/arm_common/fixture.h" | |||
| using namespace megdnn; | |||
| using namespace test; | |||
| TEST_F(ARM_COMMON, WinogradFilterPreprocessF32) { | |||
| using namespace winograd_filter_preprocess; | |||
| Checker<WinogradFilterPreprocess> checker(handle()); | |||
| // default | |||
| std::vector<TestArg> args = get_args(6, 3); | |||
| std::vector<TestArg> args54 = get_args(5, 4); | |||
| std::vector<TestArg> args45 = get_args(4, 5); | |||
| // mk4 | |||
| std::vector<TestArg> args_mk4_out2 = | |||
| get_mk_packed_args(2, param::Winograd::Format::MK4, 4); | |||
| std::vector<TestArg> args_mk4_out6 = | |||
| get_mk_packed_args(6, param::Winograd::Format::MK4, 4); | |||
| args.insert(args.end(), args54.begin(), args54.end()); | |||
| args.insert(args.end(), args45.begin(), args45.end()); | |||
| args.insert(args.end(), args_mk4_out2.begin(), args_mk4_out2.end()); | |||
| args.insert(args.end(), args_mk4_out6.begin(), args_mk4_out6.end()); | |||
| for (auto&& arg : args) { | |||
| checker.set_param(arg.param) | |||
| .set_dtype(0, dtype::Float32{}) | |||
| .set_dtype(1, dtype::Float32{}) | |||
| .execs({arg.src, {}}); | |||
| } | |||
| } | |||
| TEST_F(ARM_COMMON, WinogradFilterPreprocessQs8) { | |||
| using namespace winograd_filter_preprocess; | |||
| std::vector<TestArg> args = | |||
| get_mk_packed_args(2, param::Winograd::Format::MK8, 8); | |||
| Checker<WinogradFilterPreprocess> checker(handle()); | |||
| UniformIntRNG rng{-50, 50}; | |||
| checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &rng); | |||
| for (auto&& arg : args) { | |||
| checker.set_param(arg.param) | |||
| .set_dtype(0, dtype::QuantizedS8(2.5f)) | |||
| .set_dtype(1, dtype::QuantizedS16(2.5f)) | |||
| .execs({arg.src, {}}); | |||
| } | |||
| } | |||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| TEST_F(ARM_COMMON, WinogradFilterPreprocessF16) { | |||
| using namespace winograd_filter_preprocess; | |||
| Checker<WinogradFilterPreprocess> checker(handle()); | |||
| // default | |||
| std::vector<TestArg> args = get_args(6, 3); | |||
| std::vector<TestArg> args_23 = | |||
| get_mk_packed_args(2, param::Winograd::Format::DEFAULT, 4); | |||
| std::vector<TestArg> args45 = get_args(4, 5); | |||
| // mk8 | |||
| std::vector<TestArg> args_mk8_out2 = | |||
| get_mk_packed_args(2, param::Winograd::Format::MK8, 8); | |||
| args.insert(args.end(), args_23.begin(), args_23.end()); | |||
| args.insert(args.end(), args45.begin(), args45.end()); | |||
| args.insert(args.end(), args_mk8_out2.begin(), args_mk8_out2.end()); | |||
| Float16PeriodicalRNG* rng = new Float16PeriodicalRNG(0x3c00); | |||
| for (auto&& arg : args) { | |||
| checker.set_param(arg.param) | |||
| .set_rng(0, rng) | |||
| .set_dtype(0, dtype::Float16{}) | |||
| .set_dtype(1, dtype::Float16{}) | |||
| .execs({arg.src, {}}); | |||
| } | |||
| } | |||
| #endif | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -1152,50 +1152,6 @@ void check_conv_bias_preprocess(std::vector<conv_bias::TestArg> args, | |||
| } | |||
| void winograd_algo_extra_impl(const TensorNDArray& tensors, uint32_t m, | |||
| param::ConvBias param, Handle* handle, | |||
| param::MatrixMul::Format format) { | |||
| megdnn_assert(param.format == param::ConvBias::Format::NCHW || | |||
| param.format == param::ConvBias::Format::NCHW44); | |||
| auto winograd_preprocess_opr = | |||
| handle->create_operator<WinogradFilterPreprocess>(); | |||
| winograd_preprocess_opr->param().output_block_size = m; | |||
| winograd_preprocess_opr->param().format = format; | |||
| winograd_preprocess_opr->param().compute_mode = param.compute_mode; | |||
| TensorLayout filter_transform_layout; | |||
| winograd_preprocess_opr->deduce_layout(tensors[1].layout, | |||
| filter_transform_layout); | |||
| size_t winograd_preprocess_workspace_in_bytes = | |||
| winograd_preprocess_opr->get_workspace_in_bytes( | |||
| tensors[1].layout, filter_transform_layout); | |||
| auto conv_bias_opr = handle->create_operator<ConvBias>(); | |||
| conv_bias_opr->param() = param; | |||
| if (param.format == param::ConvBias::Format::NCHW) { | |||
| conv_bias_opr->param().format = param::ConvBias::Format::NCHW_WINOGRAD; | |||
| } else { | |||
| conv_bias_opr->param().format = | |||
| param::ConvBias::Format::NCHW44_WINOGRAD; | |||
| } | |||
| conv_bias_opr->param().output_block_size = m; | |||
| size_t conv_bias_workspace_in_bytes = conv_bias_opr->get_workspace_in_bytes( | |||
| tensors[0].layout, filter_transform_layout, tensors[2].layout, | |||
| tensors[3].layout, tensors[4].layout, nullptr); | |||
| WorkspaceBundle wb(nullptr, {filter_transform_layout.span().dist_byte(), | |||
| conv_bias_workspace_in_bytes, | |||
| winograd_preprocess_workspace_in_bytes}); | |||
| wb.set(malloc(wb.total_size_in_bytes())); | |||
| TensorND filter_transform_tensor(wb.get(0), | |||
| std::move(filter_transform_layout)); | |||
| winograd_preprocess_opr->exec(tensors[1], filter_transform_tensor, | |||
| wb.get_workspace(2)); | |||
| conv_bias_opr->exec(tensors[0], filter_transform_tensor, tensors[2], | |||
| tensors[3], tensors[4], nullptr, wb.get_workspace(1)); | |||
| free(wb.ptr()); | |||
| }; | |||
| void checker_conv_bias_common(std::vector<conv_bias::TestArg> args, Handle* handle, | |||
| RNG* rng, float epsilon, DType type0, DType type1, | |||
| DType type2, DType type3, const char* algo_name) { | |||
| @@ -1388,7 +1344,6 @@ std::vector<conv_bias::TestArg> get_nchw44_conv_bias_args( | |||
| } | |||
| return args; | |||
| } | |||
| } // namespace conv_bias | |||
| } // namespace test | |||
| } // namespace megdnn | |||
| @@ -94,9 +94,6 @@ void checker_conv_bias_int8x8x16( | |||
| std::vector<megdnn::test::conv_bias::TestArg> args, | |||
| megdnn::Handle* handle, const char* algo_name); | |||
| void winograd_algo_extra_impl(const TensorNDArray& tensors, uint32_t m, | |||
| param::ConvBias param, Handle* handle, | |||
| param::MatrixMul::Format format); | |||
| void checker_conv_bias_common(std::vector<conv_bias::TestArg> args, | |||
| Handle* handle, RNG* rng, float epsilon, | |||
| DType type0, DType type1, DType type2, | |||
| @@ -95,7 +95,6 @@ DEF(MaskConvolution, 4, true, true); | |||
| DEF(MaskPropagate, 2, true, true); | |||
| DEF(RelayoutFormat, 2, true, true); | |||
| DEF(MaxTensorDiff, 2, true, false); | |||
| DEF(WinogradFilterPreprocess, 2, true, true); | |||
| DEF(LocalShareForward, 3, true, true); | |||
| DEF(LocalShareBackwardData, 3, true, false); | |||
| DEF(LocalShareBackwardFilter, 3, true, false); | |||
| @@ -1814,69 +1814,22 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_WINOGRAD_WEIGHT_PREPROCESS) { | |||
| using namespace conv_bias; | |||
| std::vector<TestArg> args = get_winograd_mk_nchw88_args(); | |||
| Checker<ConvBiasForward> checker(handle()); | |||
| auto extra_impl = [](const TensorNDArray& tensors, uint32_t m, | |||
| param::ConvBias param, Handle* handle) { | |||
| megdnn_assert(param.format == param::ConvBias::Format::NCHW88); | |||
| auto winograd_preprocess_opr = | |||
| handle->create_operator<WinogradFilterPreprocess>(); | |||
| winograd_preprocess_opr->param().output_block_size = m; | |||
| winograd_preprocess_opr->param().format = param::MatrixMul::Format::MK8; | |||
| TensorLayout filter_transform_layout; | |||
| winograd_preprocess_opr->deduce_layout(tensors[1].layout, | |||
| filter_transform_layout); | |||
| size_t winograd_preprocess_workspace_in_bytes = | |||
| winograd_preprocess_opr->get_workspace_in_bytes( | |||
| tensors[1].layout, filter_transform_layout); | |||
| auto conv_bias_opr = handle->create_operator<ConvBias>(); | |||
| conv_bias_opr->param() = param; | |||
| conv_bias_opr->param().format = | |||
| param::ConvBias::Format::NCHW88_WINOGRAD; | |||
| conv_bias_opr->param().output_block_size = m; | |||
| size_t conv_bias_workspace_in_bytes = | |||
| conv_bias_opr->get_workspace_in_bytes( | |||
| tensors[0].layout, filter_transform_layout, | |||
| tensors[2].layout, tensors[3].layout, tensors[4].layout, | |||
| nullptr); | |||
| WorkspaceBundle wb(nullptr, {filter_transform_layout.span().dist_byte(), | |||
| conv_bias_workspace_in_bytes, | |||
| winograd_preprocess_workspace_in_bytes}); | |||
| wb.set(malloc(wb.total_size_in_bytes())); | |||
| TensorND filter_transform_tensor(wb.get(0), | |||
| std::move(filter_transform_layout)); | |||
| winograd_preprocess_opr->exec(tensors[1], filter_transform_tensor, | |||
| wb.get_workspace(2)); | |||
| conv_bias_opr->exec(tensors[0], filter_transform_tensor, tensors[2], | |||
| tensors[3], tensors[4], nullptr, | |||
| wb.get_workspace(1)); | |||
| free(wb.ptr()); | |||
| }; | |||
| auto run = [&checker, &extra_impl]( | |||
| Handle* handle, const std::vector<TestArg>& args, | |||
| const std::vector<size_t>& out_size, DType A_dtype, | |||
| DType B_dtype, DType C_dtype, DType D_dtype, | |||
| const float eps) { | |||
| auto run = [&checker](const std::vector<TestArg>& args, DType A_dtype, | |||
| DType B_dtype, DType C_dtype, DType D_dtype, | |||
| const float eps) { | |||
| for (auto&& arg : args) { | |||
| for (uint32_t m : out_size) { | |||
| checker.set_extra_opr_impl(std::bind(extra_impl, | |||
| std::placeholders::_1, m, | |||
| arg.param, handle)); | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| checker.set_dtype(0, A_dtype) | |||
| .set_dtype(1, B_dtype) | |||
| .set_dtype(2, C_dtype) | |||
| .set_dtype(4, D_dtype) | |||
| .set_epsilon(eps) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||
| } | |||
| }; | |||
| run(handle(), args, {2, 6}, dtype::Float32(), dtype::Float32(), | |||
| dtype::Float32(), dtype::Float32(), 1e-3f); | |||
| run(args, dtype::Float32(), dtype::Float32(), dtype::Float32(), | |||
| dtype::Float32(), 1e-3f); | |||
| } | |||
| /*********************************** End winograd ************************/ | |||
| @@ -32,7 +32,6 @@ | |||
| #include "megbrain/jit/fusion_pass.h" | |||
| #endif | |||
| #include "megbrain/gopt/weights_preprocess.h" | |||
| using namespace mgb; | |||
| using namespace cg; | |||
| @@ -14,7 +14,6 @@ | |||
| #include "megbrain/gopt/gtrans.h" | |||
| #include "megbrain/gopt/inference.h" | |||
| #include "megbrain/gopt/misc.h" | |||
| #include "megbrain/gopt/weights_preprocess.h" | |||
| #include "megbrain/graph/cg.h" | |||
| #include "megbrain/graph/event.h" | |||
| #include "megbrain/graph/exc_extra_info.h" | |||
| @@ -780,8 +779,6 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options( | |||
| add_pass<FuseConvBiasZPass>(); | |||
| }); | |||
| cb(weight_winograd_transform, | |||
| { add_pass<WinogradTransformReplacePass>(); }); | |||
| #undef cb | |||
| if (need_param_fuse) { | |||
| @@ -1,206 +0,0 @@ | |||
| /** | |||
| * \file src/gopt/impl/weights_preprocess.cpp | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| */ | |||
| #include "megbrain/gopt/weights_preprocess.h" | |||
| #include "megbrain/gopt/inference.h" | |||
| #include "megbrain/opr/dnn/convolution.h" | |||
| #include "megbrain/opr/tensor_manip.h" | |||
| #include "megbrain/utils/hash_ct.h" | |||
| #include "midout.h" | |||
| MIDOUT_DECL(megbrain_weight_preprocess) | |||
| #define MIDOUT_B(tag) \ | |||
| MIDOUT_BEGIN(megbrain_weight_preprocess, midout_iv(MGB_HASH_STR(tag))) { | |||
| #define MIDOUT_E \ | |||
| } \ | |||
| MIDOUT_END(); | |||
| using namespace mgb; | |||
| using namespace gopt; | |||
| using namespace cg; | |||
| const char* WinogradTransformReplacePass::name() const { | |||
| return "winograd_transform"; | |||
| } | |||
| void WinogradTransformReplacePass::apply(OptState& opt) const { | |||
| MIDOUT_B("WinogradTransformReplacePass::apply") | |||
| auto rewriter = opt.graph().make_rewriter(); | |||
| ConstVarPropogate cvprop{ConstVarType::IMMUTABLE_AND_PARAM}; | |||
| opt.graph().iter([&cvprop](OperatorNodeBase *opr) { | |||
| cvprop.add_opr(opr); | |||
| }); | |||
| auto get_algo = [](const opr::ConvBias& opr) -> std::string { | |||
| auto&& inputs = opr.input(); | |||
| SmallVector<TensorLayout> layouts; | |||
| mgb_assert(inputs.size() >= 2 && inputs.size() <= 4); | |||
| auto&& mo = opr.megdnn_opr(); | |||
| for (size_t i = 0; i < 4; i++) { | |||
| if (inputs.size() <= i) { | |||
| if (i == 2) { | |||
| //! bias | |||
| DType dtype; | |||
| mo->deduce_dtype(inputs[0]->dtype(), inputs[1]->dtype(), | |||
| DType{}, DType{}, dtype); | |||
| layouts.emplace_back(TensorShape{}, dtype); | |||
| } else { | |||
| layouts.emplace_back(TensorShape{}, opr.output(0)->dtype(), | |||
| opr.output(0)->format()); | |||
| } | |||
| } else { | |||
| layouts.emplace_back(inputs[i]->shape(), inputs[i]->dtype(), | |||
| inputs[i]->format()); | |||
| } | |||
| } | |||
| layouts.emplace_back(opr.output(0)->shape(), opr.output(0)->dtype(), | |||
| opr.output(0)->format()); | |||
| AlgoChooserProfileCache& cache = opr.profile_cache(); | |||
| auto param_blob = opr.param_blob(); | |||
| AlgoChooserProfileCache::Key cache_key{layouts.data(), layouts.size(), | |||
| param_blob.first, | |||
| param_blob.second}; | |||
| auto&& rst = cache.get(cache_key); | |||
| if (!rst.valid()) | |||
| return ""; | |||
| auto prof = rst.val(); | |||
| if (prof.empty()) | |||
| return ""; | |||
| return prof[0].algo; | |||
| }; | |||
| auto on_opr = [&](OperatorNodeBase* opr) { | |||
| auto type = opr->dyn_typeinfo(); | |||
| do { | |||
| if (type != opr::ConvBias::typeinfo()) | |||
| break; | |||
| auto&& conv_bias_opr = opr->cast_final_safe<opr::ConvBias>(); | |||
| auto&& inputs = conv_bias_opr.input(); | |||
| VarNodeArray new_inp; | |||
| new_inp.reserve(inputs.size()); | |||
| for (auto i : inputs) { | |||
| new_inp.push_back(rewriter.get_var(i)); | |||
| } | |||
| if (!(cvprop.is_midconst(inputs[1]) || | |||
| cvprop.is_const(inputs[1]))) { | |||
| break; | |||
| } | |||
| auto algo_name = get_algo(conv_bias_opr); | |||
| auto winograd_param = | |||
| megdnn::ConvBias::parse_winograd_name(algo_name); | |||
| if (winograd_param == megdnn::ConvBias::INVALID_WINOGRAD_PARAM) | |||
| break; | |||
| mgb_assert( | |||
| conv_bias_opr.param().format == | |||
| megdnn::ConvBias::Param::Format::NCHW || | |||
| conv_bias_opr.param().format == | |||
| megdnn::ConvBias::Param::Format::NCHW88 || | |||
| conv_bias_opr.param().format == | |||
| megdnn::ConvBias::Param::Format::NCHW44, | |||
| "currently winograd only suppport NCHW and NCHW44 and " | |||
| "NCHW88"); | |||
| opr::ConvBiasForward::check_winograd_param_valid( | |||
| winograd_param, conv_bias_opr.input(0)->dtype()); | |||
| megdnn::param::Winograd winograd_preprocess_param; | |||
| winograd_preprocess_param.format = | |||
| opr::ConvBiasForward::get_matmul_format(winograd_param); | |||
| winograd_preprocess_param.output_block_size = | |||
| winograd_param.output_block_size; | |||
| auto conv_bias_param = conv_bias_opr.param(); | |||
| //! If input dtype is Qint8 and matmul format is MK4, the winograd | |||
| //! compute type is float. | |||
| if (conv_bias_opr.input(0)->dtype().enumv() == | |||
| DTypeEnum::QuantizedS8 && | |||
| winograd_preprocess_param.format == | |||
| megdnn::param::MatrixMul::Format::MK4) { | |||
| winograd_preprocess_param.compute_mode = | |||
| megdnn::param::ConvBias::ComputeMode::FLOAT32; | |||
| conv_bias_param.compute_mode = | |||
| megdnn::param::ConvBias::ComputeMode::FLOAT32; | |||
| } | |||
| auto winograd_preprocess_opr = opr::WinogradFilterPreprocess::make( | |||
| new_inp[1], winograd_preprocess_param); | |||
| mgb_assert(inputs.size() == 2 || inputs.size() == 3, | |||
| "input size need to be 2/3, but got: %zu", | |||
| inputs.size()); | |||
| SymbolVar new_conv_bias_opr; | |||
| if (new_inp[0]->shape().ndim == 4) { | |||
| conv_bias_param.format = | |||
| megdnn::ConvBias::Param::Format::NCHW_WINOGRAD; | |||
| } else { | |||
| mgb_assert(new_inp[0]->shape().ndim == 5); | |||
| size_t pack_size = new_inp[0]->shape()[4]; | |||
| if (pack_size == 8) { | |||
| conv_bias_param.format = | |||
| megdnn::ConvBias::Param::Format::NCHW88_WINOGRAD; | |||
| } else if (pack_size == 4) { | |||
| conv_bias_param.format = | |||
| megdnn::ConvBias::Param::Format::NCHW44_WINOGRAD; | |||
| } else { | |||
| mgb_assert(0, "Invalid pack size %zu in algo %s", pack_size, | |||
| algo_name.c_str()); | |||
| } | |||
| } | |||
| conv_bias_param.output_block_size = | |||
| winograd_param.output_block_size; | |||
| if (inputs.size() == 2) { | |||
| new_conv_bias_opr = opr::ConvBias::make( | |||
| new_inp[0], winograd_preprocess_opr.node(), | |||
| conv_bias_param, conv_bias_opr.execution_policy(), | |||
| conv_bias_opr.config()); | |||
| } else { | |||
| new_conv_bias_opr = opr::ConvBias::make( | |||
| new_inp[0], winograd_preprocess_opr.node(), new_inp[2], | |||
| conv_bias_param, conv_bias_opr.execution_policy(), | |||
| conv_bias_opr.config()); | |||
| } | |||
| auto&& origin_out = conv_bias_opr.output(); | |||
| auto&& cur_out = new_conv_bias_opr.node()->owner_opr()->output(); | |||
| mgb_assert(origin_out.size() == cur_out.size()); | |||
| for (size_t i = 0; i < origin_out.size(); i++) { | |||
| if (!origin_out[i]->contain_flag( | |||
| VarNode::Flag::VOLATILE_CONTENT)) { | |||
| rewriter.replace_var(origin_out[i], cur_out[i], nullptr); | |||
| } | |||
| } | |||
| return; | |||
| } while (0); | |||
| rewriter.auto_replace_outputs(opr); | |||
| }; | |||
| opt.graph().iter(on_opr); | |||
| rewriter.apply_inplace(); | |||
| MIDOUT_E | |||
| } | |||
| /** | |||
| * \warning WinogradTransformReplacePass implies that we run ParamFuse pass | |||
| * before(currently run ParamFuse in optimize_for_inference when dump model), | |||
| * otherwise it cannot deal with \c ConvBias(x, W+1), as the node of W+1 has no | |||
| * flag PERSISTENT_DEVICE_VALUE, it's a mid-const node, we should use | |||
| * ConstVarPropogate strictly speaking. | |||
| */ | |||
| void gopt::transform_vars_inplace_with_winograd( | |||
| mgb::cg::VarNodeArray& dest_vars) { | |||
| gopt::GraphOptimizer optimizer; | |||
| optimizer.add_pass<WinogradTransformReplacePass>(); | |||
| optimizer.add_pass<ParamFusePass>(); | |||
| optimizer.apply_inplace(dest_vars); | |||
| } | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -1,32 +0,0 @@ | |||
| /** | |||
| * \file src/gopt/include/megbrain/gopt/weights_preprocess.h | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| */ | |||
| #pragma once | |||
| #include "megbrain/gopt/framework.h" | |||
| namespace mgb { | |||
| namespace gopt { | |||
| class WinogradTransformReplacePass final : public Pass { | |||
| class Impl; | |||
| public: | |||
| const char* name() const override; | |||
| void apply(OptState& opt) const override; | |||
| }; | |||
| void transform_vars_inplace_with_winograd(mgb::cg::VarNodeArray& dest_vars); | |||
| } // namespace gopt | |||
| } // namespace mgb | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -46,7 +46,6 @@ AlgoChooserProfileCache::Result AlgoChooser<Opr>::get_profile_result( | |||
| ConvTensorLayouts origin_layouts = ctx.layouts(); | |||
| typename Opr::Param origin_param = ctx.mgb_opr()->param(); | |||
| get_origin_param_and_layouts(ctx, origin_layouts, origin_param); | |||
| AlgoChooserProfileCache::Key cache_key{origin_layouts.data(), | |||
| origin_layouts.size(), &origin_param, | |||
| sizeof(origin_param)}; | |||
| @@ -104,18 +103,6 @@ AlgoChooserProfileCache::Result AlgoChooser<Opr>::get_profile_result( | |||
| return prof_rst; | |||
| } | |||
| template <> | |||
| void AlgoChooser<megdnn::ConvBias>::get_origin_param_and_layouts( | |||
| const ExeContext& ctx, ConvTensorLayouts& layouts, | |||
| megdnn::ConvBias::Param& param) { | |||
| auto format = static_cast<megdnn::param::ConvBias::Format>( | |||
| ctx.megdnn_opr()->param().format); | |||
| size_t output_block_size = ctx.megdnn_opr()->param().output_block_size; | |||
| megdnn::ConvBias::deduce_winograd_origin_layout_and_param( | |||
| format, output_block_size, ctx.layouts()[0], ctx.layouts()[1], | |||
| layouts[1], param); | |||
| } | |||
| template <typename Opr> | |||
| typename AlgoChooser<Opr>::ImplAlgo AlgoChooser<Opr>::choose_by_profile( | |||
| ExeContext& ctx, bool require_reproducible, bool enable_update) { | |||
| @@ -1607,15 +1607,5 @@ void RelayoutFormat::init_output_format() { | |||
| } | |||
| // f}}} | |||
| // | |||
| /* f{{{ ===================== WinogradFilterPreprocess ===================== */ | |||
| MGB_DYN_TYPE_OBJ_FINAL_IMPL(WinogradFilterPreprocess); | |||
| MEGDNN_OPR_INIT1(WinogradFilterPreprocess, "winograd_filter_preprocess") | |||
| void WinogradFilterPreprocess::init_output_dtype() { | |||
| TensorLayout dst; | |||
| TensorLayout src{input(0)->shape(), input(0)->dtype(), input(0)->format()}; | |||
| megdnn_opr()->deduce_layout(src, dst); | |||
| output(0)->dtype(dst.dtype); | |||
| } | |||
| // f}}} | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -184,7 +184,6 @@ namespace opr { | |||
| MGB_REG_OPR_SHALLOW_COPY(ParamPackConcat, opr_shallow_copy_param_pack_concat); | |||
| MGB_SEREG_OPR(RelayoutFormat, 1); | |||
| MGB_SEREG_OPR(WinogradFilterPreprocess, 1); | |||
| } // namespace opr | |||
| } // namespace mgb | |||
| @@ -113,10 +113,6 @@ class AlgoChooser { | |||
| //! entrance for getting algorithm according to execution strategy | |||
| static ImplAlgo get_algo(ExeContext& ctx); | |||
| static void get_origin_param_and_layouts(const ExeContext&, | |||
| ConvTensorLayouts&, | |||
| typename Opr::Param&) {} | |||
| //! get all profile result, either by retrieving cache or profiling | |||
| static AlgoChooserProfileCache::Result get_profile_result( | |||
| ExeContext& ctx, bool enable_update); | |||
| @@ -635,22 +635,6 @@ MGB_DEFINE_OPR_CLASS(RelayoutFormat, | |||
| const OperatorNodeConfig &config = {}); | |||
| void init_output_format() override final; | |||
| }; | |||
| /*! | |||
| * \brief change conv weights layout base on winograd transform. | |||
| * | |||
| * See docs of megdnn params for more details | |||
| */ | |||
| MGB_DEFINE_OPR_CLASS(WinogradFilterPreprocess, | |||
| intl::MegDNNOprWrapperFwd<megdnn::WinogradFilterPreprocess>) | |||
| public: | |||
| WinogradFilterPreprocess(VarNode* p0, const Param& param, | |||
| const OperatorNodeConfig& config); | |||
| static SymbolVar make(SymbolVar p0, const Param& param = {}, | |||
| const OperatorNodeConfig& config = {}); | |||
| void init_output_dtype() override final; | |||
| }; | |||
| } // opr | |||
| } // mgb | |||
| @@ -171,12 +171,6 @@ uint64_t eval_conv_computation(const TensorShape& src_shape, | |||
| cpos = 1; | |||
| spatial_start = 2; | |||
| break; | |||
| case Param::Format::NCHW_WINOGRAD: | |||
| case Param::Format::NCHW44_WINOGRAD: | |||
| case Param::Format::NCHW88_WINOGRAD: | |||
| cpos = 1; | |||
| spatial_start = 0; | |||
| break; | |||
| case Param::Format::NHWC: | |||
| cpos = 3; | |||
| spatial_start = 1; | |||
| @@ -203,29 +197,9 @@ uint64_t eval_conv_computation(const TensorShape& src_shape, | |||
| uint64_t fh = static_cast<uint64_t>(filter_shape[spatial_start]); | |||
| uint64_t fw = static_cast<uint64_t>(filter_shape[spatial_start + 1]); | |||
| if (param.format == Param::Format::NCHW_WINOGRAD || | |||
| param.format == Param::Format::NCHW44_WINOGRAD || | |||
| param.format == Param::Format::NCHW88_WINOGRAD) { | |||
| mgb_assert(opr->same_type<opr::ConvBias>(), | |||
| "Only conv bias support WINOGRAD"); | |||
| auto&& conv_bias_opr = opr->cast_final_safe<opr::ConvBias>(); | |||
| uint32_t output_block_size = conv_bias_opr.param().output_block_size; | |||
| mgb_assert(fh == fw, | |||
| "NCHW_WINOGRAD, NCHW88_WINOGRAD need fw==fh, got fw: %u fh " | |||
| "%u\n", | |||
| static_cast<uint32_t>(fh), static_cast<uint32_t>(fw)); | |||
| fh = fh + 1 - output_block_size; | |||
| fw = fw + 1 - output_block_size; | |||
| } | |||
| // mul and add are counted as 2 operations | |||
| if(param.format == Param::Format::NCHW88_WINOGRAD){ | |||
| return dst_shape.total_nr_elems() * fh * fw * | |||
| static_cast<uint64_t>(src_shape[cpos] * 8) / group * 2; | |||
| } | |||
| if (param.format == Param::Format::NCHW44_WINOGRAD) { | |||
| return dst_shape.total_nr_elems() * fh * fw * | |||
| static_cast<uint64_t>(src_shape[cpos] * 4) / group * 2; | |||
| } | |||
| return dst_shape.total_nr_elems() * fh * fw * | |||
| static_cast<uint64_t>(src_shape[cpos]) / group * 2; | |||
| } | |||
| @@ -28,6 +28,7 @@ table Blob { | |||
| } | |||
| table Reserved0 {} | |||
| table DeprecatedParam {} | |||
| union OperatorParam { | |||
| param.Empty = 1, | |||
| @@ -50,7 +51,8 @@ union OperatorParam { | |||
| param.ElemwiseMultiType = 18, | |||
| param.PowC = 19, | |||
| param.MatrixMul = 20, | |||
| param.Winograd = 21, | |||
| //Reserved for param.Winograd = 21, | |||
| DeprecatedParam = 21, | |||
| param.SVD = 22, | |||
| param.Reduce = 23, | |||
| param.Cumsum = 24, | |||