| @@ -387,7 +387,9 @@ public: | |||
| //! get algo name, the format is ParamTrait<T>::category:base:p.to_string() | |||
| //! \warning: base must not contain :. | |||
| template <typename T> | |||
| static std::string algo_name(const std::string& base, const T& p); | |||
| static std::string algo_name( | |||
| const std::string& base, const T& p, | |||
| param::ConvBias::Format format = param::ConvBias::Format::NCHW); | |||
| /*! | |||
| * \brief parse algo_name and get WinogradParam from algo name. | |||
| * | |||
| @@ -388,6 +388,166 @@ ConvBiasImpl::AlgoFP32WinogradF63_4x4::dispatch_kerns( | |||
| return {}; | |||
| } | |||
| /* =================== AlgoFP32WinogradF23_4x4_NCHW44 =================== */ | |||
//! Whether the F(2x2, 3x3) NCHW44 winograd algo can run on this problem.
//! Requires IC/OC per group to be multiples of 4, a usable no-pack MK4
//! matmul algo, 3x3 stride-1 undilated CROSS_CORRELATION on fp32 input.
bool ConvBiasImpl::AlgoFP32WinogradF23_4x4_NCHW44::usable(
        fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param,
        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
    MEGDNN_MARK_USED_VAR(opr);
    MEGDNN_MARK_USED_VAR(param);
    MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32,
                 midout_iv("AlgoFP32WinogradF23_4x4_NCHW44"_hash)) {
        //! NCHW44 packs channels in groups of 4, so both IC and OC per
        //! group must be multiples of 4
        if (param.filter_meta.icpg % 4 != 0 || param.filter_meta.ocpg % 4 != 0)
            return false;
        using Strategy = winograd::winograd_F23_mk4_f_nchw44;
        Strategy strategy(param.src_type, param.filter_type, param.dst_type);
        //! Build the matmul kern param the winograd helper would use, to ask
        //! the matmul algo whether it can handle it
        auto&& matmul_param =
                megdnn::winograd::ConvBias<Strategy,
                                           param::MatrixMul::Format::MK4>(
                        strategy, m_tile_size, param.nr_threads, param.osz[0],
                        param.osz[1], param.filter_meta.ocpg)
                        .get_matmul_kern_param(param);
        //! Accept plain NCHW44, or NCHW44_WINOGRAD whose filter was
        //! pre-transformed with output_block_size == 2 in MK4 format
        return m_matmul_algo->usable(matmul_param) &&
               m_matmul_algo->packmode() ==
                       fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK &&
               (opr->param().format == param::ConvBias::Format::NCHW44 ||
                (opr->param().format ==
                         param::ConvBias::Format::NCHW44_WINOGRAD &&
                 opr->param().output_block_size == 2 &&
                 param.winograd_matmul_format ==
                         param::MatrixMul::Format::MK4)) &&
               opr->param().mode == param::ConvBias::Mode::CROSS_CORRELATION &&
               (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
                param.filter_meta.spatial[0] == 3) &&
               (param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
                param.filter_meta.stride[0] == 1) &&
               (param.filter_meta.dilation[0] ==
                        param.filter_meta.dilation[1] &&
                param.filter_meta.dilation[0] == 1) &&
               param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
               param.src_type.enumv() == DTypeEnum::Float32;
    }
    MIDOUT_END();
    return false;
}
| size_t ConvBiasImpl::AlgoFP32WinogradF23_4x4_NCHW44::get_workspace( | |||
| fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { | |||
| MEGDNN_MARK_USED_VAR(param); | |||
| MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, | |||
| midout_iv("AlgoFP32WinogradF23_4x4_NCHW44"_hash)) { | |||
| winograd::winograd_F23_mk4_f_nchw44 strategy( | |||
| param.src_type, param.filter_type, param.dst_type); | |||
| return megdnn::winograd::ConvBias<winograd::winograd_F23_mk4_f_nchw44, | |||
| param::MatrixMul::Format::MK4>( | |||
| strategy, m_tile_size, param.nr_threads, param.osz[0], | |||
| param.osz[1], param.filter_meta.ocpg) | |||
| .get_workspace_size(param, m_matmul_algo); | |||
| } | |||
| MIDOUT_END(); | |||
| return 0; | |||
| } | |||
| SmallVector<ConvBiasImpl::NCBKern> | |||
| ConvBiasImpl::AlgoFP32WinogradF23_4x4_NCHW44::dispatch_kerns( | |||
| fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { | |||
| MEGDNN_MARK_USED_VAR(param); | |||
| MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, | |||
| midout_iv("AlgoFP32WinogradF23_4x4_NCHW44"_hash)) { | |||
| winograd::winograd_F23_mk4_f_nchw44 strategy( | |||
| param.src_type, param.filter_type, param.dst_type); | |||
| auto winograd_impl = | |||
| megdnn::winograd::ConvBias<winograd::winograd_F23_mk4_f_nchw44, | |||
| param::MatrixMul::Format::MK4>( | |||
| strategy, m_tile_size, param.nr_threads, param.osz[0], | |||
| param.osz[1], param.filter_meta.ocpg); | |||
| return winograd_impl.get_kerns(param, m_matmul_algo); | |||
| } | |||
| MIDOUT_END(); | |||
| return {}; | |||
| } | |||
| /* =================== AlgoFP32WinogradF63_4x4_NCHW44 ===================== */ | |||
| bool ConvBiasImpl::AlgoFP32WinogradF63_4x4_NCHW44::usable( | |||
| fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param, | |||
| AlgoSelectionStrategy /*algo_selection_strategy*/) const { | |||
| MEGDNN_MARK_USED_VAR(param); | |||
| MEGDNN_MARK_USED_VAR(opr); | |||
| MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, | |||
| midout_iv("AlgoFP32WinogradF63_4x4_NCHW44"_hash)) { | |||
| if (param.filter_meta.icpg % 4 != 0 || param.filter_meta.ocpg % 4 != 0) | |||
| return false; | |||
| using Strategy = winograd::winograd_F63_mk4_f_nchw44; | |||
| Strategy strategy(param.src_type, param.filter_type, param.dst_type); | |||
| auto&& matmul_param = | |||
| megdnn::winograd::ConvBias<Strategy, | |||
| param::MatrixMul::Format::MK4>( | |||
| strategy, m_tile_size, param.nr_threads, param.osz[0], | |||
| param.osz[1], param.filter_meta.ocpg) | |||
| .get_matmul_kern_param(param); | |||
| return m_matmul_algo->usable(matmul_param) && | |||
| m_matmul_algo->packmode() == | |||
| fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK && | |||
| (opr->param().format == param::ConvBias::Format::NCHW44 || | |||
| (opr->param().format == | |||
| param::ConvBias::Format::NCHW44_WINOGRAD && | |||
| opr->param().output_block_size == 6 && | |||
| param.winograd_matmul_format == | |||
| param::MatrixMul::Format::MK4)) && | |||
| opr->param().mode == param::ConvBias::Mode::CROSS_CORRELATION && | |||
| (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && | |||
| param.filter_meta.spatial[0] == 3) && | |||
| (param.filter_meta.stride[0] == param.filter_meta.stride[1] && | |||
| param.filter_meta.stride[0] == 1) && | |||
| (param.filter_meta.dilation[0] == | |||
| param.filter_meta.dilation[1] && | |||
| param.filter_meta.dilation[0] == 1) && | |||
| param.compute_mode == param::ConvBias::ComputeMode::DEFAULT && | |||
| param.src_type.enumv() == DTypeEnum::Float32 && | |||
| param.filter_meta.icpg % 4 == 0 && | |||
| param.filter_meta.ocpg % 4 == 0; | |||
| } | |||
| MIDOUT_END(); | |||
| return false; | |||
| } | |||
| size_t ConvBiasImpl::AlgoFP32WinogradF63_4x4_NCHW44::get_workspace( | |||
| fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { | |||
| MEGDNN_MARK_USED_VAR(param); | |||
| MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, | |||
| midout_iv("AlgoFP32WinogradF63_4x4_NCHW44"_hash)) { | |||
| winograd::winograd_F63_mk4_f_nchw44 strategy( | |||
| param.src_type, param.filter_type, param.dst_type); | |||
| return megdnn::winograd::ConvBias<winograd::winograd_F63_mk4_f_nchw44, | |||
| param::MatrixMul::Format::MK4>( | |||
| strategy, m_tile_size, param.nr_threads, param.osz[0], | |||
| param.osz[1], param.filter_meta.ocpg) | |||
| .get_workspace_size(param, m_matmul_algo); | |||
| } | |||
| MIDOUT_END(); | |||
| return 0; | |||
| } | |||
| SmallVector<ConvBiasImpl::NCBKern> | |||
| ConvBiasImpl::AlgoFP32WinogradF63_4x4_NCHW44::dispatch_kerns( | |||
| fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { | |||
| MEGDNN_MARK_USED_VAR(param); | |||
| MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, | |||
| midout_iv("AlgoFP32WinogradF63_4x4_NCHW44"_hash)) { | |||
| winograd::winograd_F63_mk4_f_nchw44 strategy( | |||
| param.src_type, param.filter_type, param.dst_type); | |||
| auto winograd_impl = | |||
| megdnn::winograd::ConvBias<winograd::winograd_F63_mk4_f_nchw44, | |||
| param::MatrixMul::Format::MK4>( | |||
| strategy, m_tile_size, param.nr_threads, param.osz[0], | |||
| param.osz[1], param.filter_meta.ocpg); | |||
| return winograd_impl.get_kerns(param, m_matmul_algo); | |||
| } | |||
| MIDOUT_END(); | |||
| return {}; | |||
| } | |||
| /* ===================== direct algo ===================== */ | |||
| MIDOUT_DECL(megdnn_arm_common_conv_bias_f32_kimpl); | |||
| @@ -157,6 +157,64 @@ private: | |||
| uint32_t m_tile_size; | |||
| }; | |||
| //===================== NCHW44 Winograd Support =====================// | |||
//! F(2x2, 3x3) fp32 winograd algo for the NCHW44 layout; the inner matmul
//! runs in MK4 format via the supplied matmul algo.
class ConvBiasImpl::AlgoFP32WinogradF23_4x4_NCHW44 final : public AlgoBase {
public:
    //! \param matmul_algo matmul backend used for the batched gemm stage
    //! \param tile_size number of winograd units processed per tile
    AlgoFP32WinogradF23_4x4_NCHW44(
            fallback::MatrixMulImpl::AlgoBase* matmul_algo, uint32_t tile_size)
            : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {}
    bool is_reproducible() const override { return true; }
    //! Name is derived lazily from the matmul algo name plus the winograd
    //! param {pack=4, output_block=2, tile_size} in NCHW44 format.
    const char* name() const override {
        if (m_name.empty()) {
            m_name = ConvBiasImpl::algo_name<ConvBias::WinogradParam>(
                    m_matmul_algo->name(), {4, 2, m_tile_size},
                    param::ConvBias::Format::NCHW44);
        }
        return m_name.c_str();
    }
    bool usable(fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param,
                AlgoSelectionStrategy algo_selection_strategy) const override;
    size_t get_workspace(fallback::ConvBiasImpl*,
                         const NCBKernSizeParam& param) const override;
    virtual SmallVector<NCBKern> dispatch_kerns(
            fallback::ConvBiasImpl* opr,
            const NCBKernSizeParam& param) const override;
private:
    fallback::MatrixMulImpl::AlgoBase* m_matmul_algo;
    mutable std::string m_name;  //! cached lazily-built algo name
    uint32_t m_tile_size;
};
//! F(6x6, 3x3) fp32 winograd algo for the NCHW44 layout; the inner matmul
//! runs in MK4 format via the supplied matmul algo.
class ConvBiasImpl::AlgoFP32WinogradF63_4x4_NCHW44 final : public AlgoBase {
public:
    //! \param matmul_algo matmul backend used for the batched gemm stage
    //! \param tile_size number of winograd units processed per tile
    AlgoFP32WinogradF63_4x4_NCHW44(
            fallback::MatrixMulImpl::AlgoBase* matmul_algo, uint32_t tile_size)
            : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {}
    bool is_reproducible() const override { return true; }
    //! Name is derived lazily from the matmul algo name plus the winograd
    //! param {pack=4, output_block=6, tile_size} in NCHW44 format.
    const char* name() const override {
        if (m_name.empty()) {
            m_name = ConvBiasImpl::algo_name<ConvBias::WinogradParam>(
                    m_matmul_algo->name(), {4, 6, m_tile_size},
                    param::ConvBias::Format::NCHW44);
        }
        return m_name.c_str();
    }
    bool usable(fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param,
                AlgoSelectionStrategy algo_selection_strategy) const override;
    size_t get_workspace(fallback::ConvBiasImpl*,
                         const NCBKernSizeParam& param) const override;
    virtual SmallVector<NCBKern> dispatch_kerns(
            fallback::ConvBiasImpl* opr,
            const NCBKernSizeParam& param) const override;
private:
    fallback::MatrixMulImpl::AlgoBase* m_matmul_algo;
    mutable std::string m_name;  //! cached lazily-built algo name
    uint32_t m_tile_size;
};
| // ================================================================= // | |||
| class ConvBiasImpl::AlgoF32Direct final : public AlgoBase { | |||
| SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const; | |||
| bool m_large_group; | |||
| @@ -32,6 +32,12 @@ MEGDNN_REG_WINOGRAD_STRATEGY(float, float, float, float, 5, 4, 1, 1, | |||
| MEGDNN_REG_WINOGRAD_STRATEGY(float, float, float, float, 4, 5, 1, 1, | |||
| winograd_4x5_1x1_f) | |||
| MEGDNN_REG_WINOGRAD_STRATEGY(float, float, float, float, 2, 3, 4, 4, | |||
| winograd_F23_mk4_f_nchw44) | |||
| MEGDNN_REG_WINOGRAD_STRATEGY(float, float, float, float, 6, 3, 4, 4, | |||
| winograd_F63_mk4_f_nchw44) | |||
| } // namespace winograd | |||
| } // namespace arm_common | |||
| } // namespace megdnn | |||
| @@ -0,0 +1,349 @@ | |||
| /** | |||
| * \file dnn/src/arm_common/conv_bias/fp32/strategy_f23_mk4_nchw44.cpp | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
| */ | |||
| #include "src/arm_common/conv_bias/fp32/strategy.h" | |||
| #include "src/arm_common/simd_macro/marm_neon.h" | |||
| #include "src/arm_common/utils.h" | |||
| #include "src/common/unroll_macro.h" | |||
| #include "src/common/utils.h" | |||
| #include "src/fallback/conv_bias/winograd/winograd.h" | |||
| #include "src/naive/matrix_mul/matrix_mul_helper.h" | |||
| #include "src/arm_common/elemwise_helper/op_unary.h" | |||
| #include "src/arm_common/conv_bias/fp32/helper.h" | |||
| #include "midout.h" | |||
| MIDOUT_DECL(megdnn_arm_common_winograd_nchw44_fp32_F23_mk4) | |||
| using namespace megdnn; | |||
| using namespace arm_common; | |||
| namespace { | |||
| constexpr size_t alpha = 2 + 3 - 1; | |||
| constexpr size_t pack_size = 4; | |||
//! Stages one alpha x alpha NCHW44 input patch and applies the
//! F(2x2, 3x3) winograd input transform B^T * d * B on it.
struct InputTransformF23_NCHW44 {
    //! Copy an alpha x alpha x pack_size patch into the contiguous staging
    //! buffer patchT. When \p inner is true the patch lies fully inside the
    //! (IH, IW) image and a straight vector copy per row is used; otherwise
    //! only the in-bounds region is copied and the rest stays zeroed.
    template <bool inner>
    static void prepare(const float* input, float* patch, float* patchT,
                        int ih_start, int iw_start, size_t IH, size_t IW,
                        size_t ic, size_t IC) {
        MEGDNN_MARK_USED_VAR(patch);
        size_t IW4 = IW * pack_size;        //! row stride in floats (NCHW44)
        size_t iw4_start = iw_start * pack_size;
        size_t icb = ic / pack_size;        //! channel-block index
        //! NOTE(review): uses `<` rather than `<=`, so the last channel
        //! block is zero-filled even for inner patches — confirm intended.
        if (!(inner && ic + pack_size < IC)) {
            memset(patchT, 0, sizeof(float) * pack_size * alpha * alpha);
        }
        if (inner) {
            const float* input_ptr =
                    input + icb * IH * IW4 + ih_start * IW4 + iw4_start;
            for (size_t ih = 0; ih < alpha; ih++) {
#define cb(i) auto v##i = vld1q_f32(input_ptr + pack_size * i);
                UNROLL_CALL_NOWRAPPER(4, cb);
#undef cb
#define cb(i) vst1q_f32(patchT + ih * alpha * pack_size + i * pack_size, v##i);
                UNROLL_CALL_NOWRAPPER(4, cb);
#undef cb
                input_ptr += IW4;
            }
        } else {
            //! clamp the patch window to the valid image region
            int ih0_act = std::max<int>(ih_start, 0),
                ih1_act = std::min<int>(ih_start + alpha, IH),
                iw0_act = std::max<int>(iw_start, 0),
                iw1_act = std::min<int>(iw_start + alpha, IW);
            const float* input_ptr = input + icb * IH * IW4;
            // partial copy
            for (int ih = ih0_act; ih < ih1_act; ++ih) {
                for (int iw = iw0_act; iw < iw1_act; ++iw) {
                    size_t iho = ih - ih_start, iwo = iw - iw_start;
                    auto src = vld1q_f32(input_ptr + ih * IW4 + iw * pack_size);
                    vst1q_f32(
                            patchT + iho * alpha * pack_size + iwo * pack_size,
                            src);
                }
            }
        }
    }
    //! Apply B^T * d * B to the staged patch and scatter the 4x4 result into
    //! input_transform_buf, which is laid out as
    //! (alpha*alpha, ICB, nr_units_in_tile, pack_size).
    static void transform(const float* patchT, float* input_transform_buf,
                          size_t unit_idx, size_t nr_units_in_tile, size_t ic,
                          size_t IC) {
        // BT * d * B
#define cb(m, n)                                                    \
    Vector<float, 4> d##m##n = Vector<float, 4>::load(              \
            patchT + m * alpha * pack_size + n * pack_size);
        UNROLL_CALL_NOWRAPPER_D2(4, 4, cb);
#undef cb
        //! 1   0 -1 0    d00 d01 d02 d03     1  0  0  0
        //! 0   1  1 0    d10 d11 d12 d13     0  1 -1 -1
        //! 0  -1  1 0    d20 d21 d22 d23    -1  1  1  0
        //! 0  -1  0 1    d30 d31 d32 d33     0  0  0  1
        //! left-multiply by B^T (rows)
#define cb(m)                        \
    auto t0##m = d0##m - d2##m;      \
    auto t1##m = d1##m + d2##m;      \
    auto t2##m = d2##m - d1##m;      \
    auto t3##m = d3##m - d1##m;
        UNROLL_CALL_NOWRAPPER(4, cb);
#undef cb
        //! right-multiply by B (columns), reusing d as the result
#define cb(m)                          \
    d##m##0 = t##m##0 - t##m##2;       \
    d##m##1 = t##m##1 + t##m##2;       \
    d##m##2 = t##m##2 - t##m##1;       \
    d##m##3 = t##m##3 - t##m##1;
        UNROLL_CALL_NOWRAPPER(4, cb);
#undef cb
        size_t ICB = IC / 4;
        size_t icb = ic / 4;
        //! scatter: one (ICB, nr_units, 4) plane per (m, n) transform cell
#define cb(m, n)                                                          \
    d##m##n.save(input_transform_buf +                                    \
                 (m * alpha + n) * ICB * nr_units_in_tile * pack_size +   \
                 icb * nr_units_in_tile * pack_size + unit_idx * pack_size);
        UNROLL_CALL_NOWRAPPER_D2(4, 4, cb)
#undef cb
    }
};
| #define CONCAT(a, idx) a##idx | |||
//! F(2x2, 3x3) winograd output transform A^T * m * A for NCHW44, plus
//! bias addition and the element-wise nonlinearity Op.
template <BiasMode bmode, typename Op>
struct OutputTransformF23_NCHW44 {
    //! Gather one 4x4 transformed unit from output_transform_buf, reduce it
    //! to the 2x2 output block, add bias / apply Op per bmode, and store the
    //! in-bounds part at (oh_start, ow_start) of the NCHW44 output.
    static void transform(const float* output_transform_buf, const float* bias,
                          float* output, float* transform_mid_buf,
                          size_t oh_start, size_t ow_start, size_t OH,
                          size_t OW, size_t oc_start, size_t oc_end,
                          size_t oc_index, size_t unit_idx,
                          size_t nr_units_in_tile, const DType& src_dtype,
                          const DType& dst_dtype) {
        MEGDNN_MARK_USED_VAR(transform_mid_buf);
        Op op(src_dtype, dst_dtype);
        //! AT * m * A
        size_t OCB = (oc_end - oc_start) / pack_size;
        size_t oc = oc_start + oc_index;     //! absolute output channel
        size_t ocb = oc_index / pack_size;   //! channel block within range
#define cb(m, n)                                                          \
    auto v##m##n = Vector<float, 4>::load(                                \
            output_transform_buf +                                        \
            (m * alpha + n) * OCB * nr_units_in_tile * pack_size +        \
            ocb * nr_units_in_tile * pack_size + unit_idx * pack_size);
        UNROLL_CALL_NOWRAPPER_D2(4, 4, cb);
#undef cb
        //! 1  1  1 0    v00 v01 v02 v03    1  0
        //! 0  1 -1 1    v10 v11 v12 v13    1  1
        //!              v20 v21 v22 v23    1 -1
        //!              v30 v31 v32 v33    0  1
        //! left-multiply by A^T (rows 0..1)
#define cb(m)                                  \
    auto t0##m = v0##m + v1##m + v2##m;        \
    auto t1##m = v1##m - v2##m + v3##m;
        UNROLL_CALL_NOWRAPPER(4, cb);
#undef cb
        //! right-multiply by A (columns 0..1)
#define cb(m)                                     \
    v##m##0 = t##m##0 + t##m##1 + t##m##2;        \
    v##m##1 = t##m##1 - t##m##2 + t##m##3;
        UNROLL_CALL_NOWRAPPER(2, cb);
#undef cb
        Vector<float, 4> vbias;
        //! per-channel bias is added here; per-element BIAS is handled in
        //! out_save because it depends on the output position
        if (bmode == BiasMode::BROADCAST_CHANNEL_BIAS) {
            vbias = Vector<float, 4>::load(bias + oc);
#define cb(m, n) v##m##n += vbias;
            UNROLL_CALL_RAW_D2(2, 2, cb);
#undef cb
        }
        if (bmode != BiasMode::BIAS) {
#define cb(m, n) v##m##n = op(CONCAT(v##m, n).value);
            UNROLL_CALL_RAW_D2(2, 2, cb);
#undef cb
        }
        //! bounds-checked store of the 2x2 output block (edge tiles may
        //! extend past OH/OW)
#define out_save(oho, owo)                                                \
    do {                                                                  \
        size_t oh = oh_start + oho;                                       \
        size_t ow = ow_start + owo;                                       \
        if (oh < OH && ow < OW) {                                         \
            if (bmode == BiasMode::BIAS) {                                \
                v##oho##owo += Vector<float, 4>::load(bias + oc * OH * OW + \
                                                      oh * OW * pack_size + \
                                                      ow * pack_size);    \
                v##oho##owo = op(v##oho##owo.value);                      \
            }                                                             \
            v##oho##owo.save(output + oc * OH * OW + oh * OW * pack_size + \
                             ow * pack_size);                             \
        }                                                                 \
    } while (0);
        UNROLL_CALL_RAW_D2(2, 2, out_save);
#undef out_save
    }
};
| #undef CONCAT | |||
| } // namespace | |||
| namespace megdnn { | |||
| namespace arm_common { | |||
| namespace winograd { | |||
| MEGDNN_REG_WINOGRAD_STRATEGY_IMPL(winograd_F23_mk4_f_nchw44) | |||
//! F(2x2, 3x3) winograd filter transform G * g * G^T for NCHW44 layout.
//! Processes output channels in [oc_start, oc_end); result is written to
//! filter_transform_buf laid out as (ALPHA*ALPHA, OCB, ICB, 4, 4).
void winograd_F23_mk4_f_nchw44::filter(const float* filter,
                                       float* filter_transform_buf,
                                       float* transform_mid_buf, size_t OC,
                                       size_t IC, size_t oc_start,
                                       size_t oc_end) {
    //! G                 g              G^T
    //! 1    0    0     v00 v01 v02    1 0.5  0.5 0
    //! 0.5  0.5  0.5   v10 v11 v12    0 0.5 -0.5 0
    //! 0.5 -0.5  0.5   v20 v21 v22    0 0.5  0.5 1
    //! 0    0    1
    constexpr size_t pack_size = 4;
    MEGDNN_MARK_USED_VAR(transform_mid_buf);
    megdnn_assert((oc_end - oc_start) % pack_size == 0 &&
                          oc_start % pack_size == 0 &&
                          oc_end % pack_size == 0 && IC % pack_size == 0 &&
                          OC % pack_size == 0,
                  "NCHW44 Winograd filter transform requires both OC and IC "
                  "are times of 4");
    size_t OCB = OC / pack_size;
    size_t ICB = IC / pack_size;
    for (size_t ocb = oc_start / pack_size; ocb < oc_end / pack_size; ocb++) {
        for (size_t icb = 0; icb < ICB; icb++) {
            for (size_t ic_inner = 0; ic_inner < pack_size; ic_inner++) {
                //! NCHW44 filter layout: (OCB, ICB, KH, KW, 4ic, 4oc);
                //! each load below picks up the 4-oc vector for one tap
                const float* fptr = filter +
                                    (ocb * ICB + icb) * KERNEL_SIZE *
                                            KERNEL_SIZE * pack_size *
                                            pack_size +
                                    ic_inner * pack_size;
#define cb(m, n)                                                        \
    Vector<float, 4> g##m##n = Vector<float, 4>::load(                  \
            fptr + (m * KERNEL_SIZE + n) * pack_size * pack_size);
                UNROLL_CALL_NOWRAPPER_D2(3, 3, cb)
#undef cb
                //! applied twice: first G * g (wd), then (G*g) * G^T (ret)
#define FILTER_TRANSFORM(n, wd, g)          \
    auto wd##n##0 = g##0##n;                \
    tmp0 = (g##0##n + g##2##n) * 0.5;       \
    tmp1 = g##1##n * 0.5;                   \
    auto wd##n##1 = tmp0 + tmp1;            \
    auto wd##n##2 = tmp0 - tmp1;            \
    auto wd##n##3 = g##2##n;
                Vector<float, 4> tmp0, tmp1;
                UNROLL_CALL_RAW(3, FILTER_TRANSFORM, wd, g);
                UNROLL_CALL_RAW(4, FILTER_TRANSFORM, ret, wd);
#undef FILTER_TRANSFORM
#define cb_save(m, n)                                                     \
    ret##m##n.save(filter_transform_buf +                                 \
                   (m * ALPHA + n) * OCB * ICB * pack_size * pack_size +  \
                   ocb * ICB * pack_size * pack_size +                    \
                   icb * pack_size * pack_size + ic_inner * pack_size);
                UNROLL_CALL_NOWRAPPER_D2(4, 4, cb_save)
#undef cb_save
            }
        }
    }
}
| void winograd_F23_mk4_f_nchw44::input(const float* input, | |||
| float* input_transform_buf, | |||
| float* transform_mid_buf, size_t IH, | |||
| size_t IW, size_t IC, size_t PH, | |||
| size_t PW, size_t unit_start_idx, | |||
| size_t nr_units_in_tile) { | |||
| megdnn_assert(IC % 4 == 0); | |||
| constexpr int alpha = 3 + 2 - 1; | |||
| // OW = IW + 2 * PW - KERNEL_SIZE + 1 | |||
| auto units_w = | |||
| div_ceil<size_t>(IW + 2 * PW - KERNEL_SIZE + 1, OUTPUT_BLOCK_SIZE); | |||
| float* patch = transform_mid_buf; | |||
| float* patchT = transform_mid_buf + 4 * alpha * alpha; | |||
| for (size_t ic = 0; ic < IC; ic += 4) { | |||
| rep(unit_idx, nr_units_in_tile) { | |||
| size_t index = unit_start_idx + unit_idx; | |||
| size_t nh = index / units_w; | |||
| size_t nw = index % units_w; | |||
| int ih_start = nh * OUTPUT_BLOCK_SIZE - PH; | |||
| int iw_start = nw * OUTPUT_BLOCK_SIZE - PW; | |||
| if (ih_start >= 0 && ih_start + alpha <= static_cast<int>(IH) && | |||
| iw_start >= 0 && iw_start + alpha <= static_cast<int>(IW)) { | |||
| InputTransformF23_NCHW44::prepare<true>(input, patch, patchT, | |||
| ih_start, iw_start, IH, | |||
| IW, ic, IC); | |||
| InputTransformF23_NCHW44::transform(patchT, input_transform_buf, | |||
| unit_idx, nr_units_in_tile, | |||
| ic, IC); | |||
| } else { | |||
| InputTransformF23_NCHW44::prepare<false>(input, patch, patchT, | |||
| ih_start, iw_start, IH, | |||
| IW, ic, IC); | |||
| InputTransformF23_NCHW44::transform(patchT, input_transform_buf, | |||
| unit_idx, nr_units_in_tile, | |||
| ic, IC); | |||
| } | |||
| } | |||
| } | |||
| } | |||
//! F(2x2, 3x3) winograd output stage for NCHW44: for every 4-channel block
//! in [oc_start, oc_end) and every unit in the tile, dispatch the output
//! transform specialised for the (bias mode, nonlinearity) pair.
void winograd_F23_mk4_f_nchw44::output(const float* output_transform_buf,
                                       const float* bias, float* output,
                                       float* transform_mid_buf, BiasMode bmode,
                                       NonlineMode nonline_mode, size_t OH,
                                       size_t OW, size_t oc_start,
                                       size_t oc_end, size_t unit_start_idx,
                                       size_t nr_units_in_tile) {
//! cb instantiates OutputTransformF23_NCHW44 for the dispatched bmode/op
#define cb(_bmode, _nonline_op, ...)                                       \
    OutputTransformF23_NCHW44<_bmode MEGDNN_COMMA _nonline_op>::transform( \
            __VA_ARGS__);
    auto units_w = div_ceil<size_t>(OW, OUTPUT_BLOCK_SIZE);
    constexpr size_t pack_size = 4;
    size_t OC = oc_end - oc_start;
    megdnn_assert(OC % pack_size == 0 && oc_start % pack_size == 0 &&
                          oc_end % pack_size == 0,
                  "NCHW44 Winograd filter transform requires OC is times of 4");
    for (size_t oc = oc_start; oc < oc_end; oc += 4) {
        size_t oc_index = oc - oc_start;   //! offset within this oc range
        rep(unit_idx, nr_units_in_tile) {
            //! map the flat unit index back to its output block position
            size_t index = unit_start_idx + unit_idx;
            auto nh = index / units_w;
            auto nw = index % units_w;
            size_t oh_start = nh * OUTPUT_BLOCK_SIZE;
            size_t ow_start = nw * OUTPUT_BLOCK_SIZE;
            DISPATCH_CONV_WINOGRAD_BIAS(
                    megdnn_arm_common_winograd_nchw44_fp32_F23_mk4, cb, float,
                    float, bmode, nonline_mode, output_transform_buf, bias,
                    output, transform_mid_buf, oh_start, ow_start, OH, OW,
                    oc_start, oc_end, oc_index, unit_idx, nr_units_in_tile,
                    src_dtype, dst_dtype);
        }
    }
#undef cb
}
| } // namespace winograd | |||
| } // namespace arm_common | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -0,0 +1,410 @@ | |||
| /** | |||
 * \file dnn/src/arm_common/conv_bias/fp32/strategy_f63_mk4_nchw44.cpp
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
| */ | |||
| #include "src/arm_common/conv_bias/fp32/filter_transform.h" | |||
| #include "src/arm_common/conv_bias/fp32/helper.h" | |||
| #include "src/arm_common/conv_bias/fp32/strategy.h" | |||
| #include "src/arm_common/elemwise_helper/op_unary.h" | |||
| #include "src/arm_common/simd_macro/marm_neon.h" | |||
| #include "src/arm_common/utils.h" | |||
| #include "src/common/unroll_macro.h" | |||
| #include "src/common/utils.h" | |||
| #include "src/common/winograd/winograd_helper.h" | |||
| #include "src/fallback/conv_bias/winograd/winograd.h" | |||
| #include "midout.h" | |||
| MIDOUT_DECL(megdnn_arm_common_winograd_fp32_F63_mk4) | |||
| using namespace megdnn; | |||
| using namespace arm_common; | |||
| namespace { | |||
| constexpr size_t alpha = 6 + 3 - 1; | |||
| constexpr size_t pack_size = 4; | |||
//! Stages one alpha x alpha NCHW44 input patch and applies the
//! F(6x6, 3x3) winograd input transform B^T * d * B on it (alpha == 8).
struct InputTransformF63_NCHW44 {
    //! Copy an alpha x alpha x pack_size patch into the contiguous staging
    //! buffer patchT. When \p inner is true the patch lies fully inside the
    //! (IH, IW) image and a straight vector copy per row is used; otherwise
    //! only the in-bounds region is copied and the rest stays zeroed.
    template <bool inner>
    static void prepare(const float* input, float* patch, float* patchT,
                        int ih_start, int iw_start, size_t IH, size_t IW,
                        size_t ic, size_t IC) {
        MEGDNN_MARK_USED_VAR(patch);
        size_t IW4 = IW * pack_size;        //! row stride in floats (NCHW44)
        size_t iw4_start = iw_start * pack_size;
        size_t icb = ic / pack_size;        //! channel-block index
        //! NOTE(review): uses `<` rather than `<=`, so the last channel
        //! block is zero-filled even for inner patches — confirm intended.
        if (!(inner && ic + pack_size < IC)) {
            memset(patchT, 0, sizeof(float) * pack_size * alpha * alpha);
        }
        if (inner) {
            const float* input_ptr =
                    input + icb * IH * IW4 + ih_start * IW4 + iw4_start;
            for (size_t ih = 0; ih < alpha; ih++) {
#define cb(i) auto v##i = vld1q_f32(input_ptr + pack_size * i);
                UNROLL_CALL_NOWRAPPER(8, cb);
#undef cb
#define cb(i) vst1q_f32(patchT + ih * pack_size * alpha + i * pack_size, v##i);
                UNROLL_CALL_NOWRAPPER(8, cb);
#undef cb
                input_ptr += IW4;
            }
        } else {
            //! clamp the patch window to the valid image region
            int ih0_act = std::max<int>(ih_start, 0),
                ih1_act = std::min<int>(ih_start + alpha, IH),
                iw0_act = std::max<int>(iw_start, 0),
                iw1_act = std::min<int>(iw_start + alpha, IW);
            const float* input_ptr = input + icb * IH * IW4;
            // partial copy
            for (int ih = ih0_act; ih < ih1_act; ++ih) {
                for (int iw = iw0_act; iw < iw1_act; ++iw) {
                    size_t iho = ih - ih_start, iwo = iw - iw_start;
                    auto src = vld1q_f32(input_ptr + ih * IW4 + iw * pack_size);
                    vst1q_f32(
                            patchT + iho * pack_size * alpha + iwo * pack_size,
                            src);
                }
            }
        }
    }
    //! Apply B^T * d * B to the staged patch and scatter the 8x8 result into
    //! input_transform_buf, which is laid out as
    //! (alpha*alpha, ICB, nr_units_in_tile, pack_size).
    static void transform(const float* patchT, float* input_transform_buf,
                          size_t unit_idx, size_t nr_units_in_tile, size_t ic,
                          size_t IC) {
        // BT * d * B
#define cb(m, n)                                                    \
    Vector<float, 4> d##m##n = Vector<float, 4>::load(              \
            patchT + m * alpha * pack_size + n * pack_size);
        UNROLL_CALL_NOWRAPPER_D2(8, 8, cb);
#undef cb
        //! B
        //! 1     0     0     0     0    0    0     0
        //! 0     1    -1     0.5  -0.5  2   -2    -1
        //! -5.25 1     1     0.25  0.25 4    4     0
        //! 0    -4.25  4.25 -2.5   2.5 -2.5  2.5   5.25
        //! 5.25 -4.25 -4.25 -1.25 -1.25 -5  -5     0
        //! 0     1    -1     2    -2    0.5 -0.5  -5.25
        //! -1    1     1     1     1    1    1     0
        //! 0     0     0     0     0    0    0     1
        //! left-multiply by B^T (rows)
#define cb(m)                                                                 \
    auto t0##m = d0##m + (d4##m - d2##m) * 5.25f - d6##m;                     \
    auto t1##m = d1##m + d2##m + d5##m + d6##m - (d3##m + d4##m) * 4.25f;     \
    auto t2##m = d2##m + d6##m - (d1##m + d5##m) + (d3##m - d4##m) * 4.25f;   \
    auto t3##m = d1##m * 0.5f + d2##m * 0.25f - d3##m * 2.5f - d4##m * 1.25f + \
                 d5##m * 2.f + d6##m;                                         \
    auto t4##m = d1##m * (-0.5f) + d2##m * 0.25f + d3##m * 2.5f -             \
                 d4##m * 1.25f - d5##m * 2.f + d6##m;                         \
    auto t5##m = d1##m * 2.f + d2##m * 4.f - d3##m * 2.5f - d4##m * 5.f +     \
                 d5##m * 0.5f + d6##m;                                        \
    auto t6##m = d1##m * (-2.f) + d2##m * 4.f + d3##m * 2.5f - d4##m * 5.f -  \
                 d5##m * 0.5f + d6##m;                                        \
    auto t7##m = (d7##m - d1##m) + (d3##m - d5##m) * 5.25f;
        UNROLL_CALL_NOWRAPPER(8, cb);
#undef cb
        //! right-multiply by B (columns), reusing d as the result
#define cb(m)                                                                 \
    d##m##0 = t##m##0 + (t##m##4 - t##m##2) * 5.25f - t##m##6;                \
    d##m##1 = t##m##1 + t##m##2 + t##m##5 + t##m##6 -                         \
              (t##m##3 + t##m##4) * 4.25f;                                    \
    d##m##2 = t##m##2 + t##m##6 - (t##m##1 + t##m##5) +                       \
              (t##m##3 - t##m##4) * 4.25f;                                    \
    d##m##3 = t##m##1 * 0.5f + t##m##2 * 0.25f - t##m##3 * 2.5f -             \
              t##m##4 * 1.25f + t##m##5 * 2.f + t##m##6;                      \
    d##m##4 = t##m##1 * (-0.5f) + t##m##2 * 0.25f + t##m##3 * 2.5f -          \
              t##m##4 * 1.25f - t##m##5 * 2.f + t##m##6;                      \
    d##m##5 = t##m##1 * 2.f + t##m##2 * 4.f - t##m##3 * 2.5f - t##m##4 * 5.f + \
              t##m##5 * 0.5f + t##m##6;                                       \
    d##m##6 = t##m##1 * (-2.f) + t##m##2 * 4.f + t##m##3 * 2.5f -             \
              t##m##4 * 5.f - t##m##5 * 0.5f + t##m##6;                       \
    d##m##7 = (t##m##7 - t##m##1) + (t##m##3 - t##m##5) * 5.25f;
        UNROLL_CALL_NOWRAPPER(8, cb);
#undef cb
        size_t ICB = IC / pack_size;
        size_t icb = ic / pack_size;
        //! scatter: one (ICB, nr_units, 4) plane per (m, n) transform cell
#define cb(m, n)                                                          \
    d##m##n.save(input_transform_buf +                                    \
                 (m * alpha + n) * ICB * nr_units_in_tile * pack_size +   \
                 icb * nr_units_in_tile * pack_size + unit_idx * pack_size);
        UNROLL_CALL_NOWRAPPER_D2(8, 8, cb)
#undef cb
    }
};
//! F(6x6, 3x3) winograd output transform A^T * m * A for NCHW44, plus
//! bias addition and the element-wise nonlinearity Op.
//! NOTE(review): this file's visible portion does not define CONCAT (the
//! F23 file defines and #undefs its own) — presumably provided by an
//! included helper header; confirm it is in scope here.
template <BiasMode bmode, typename Op>
struct OutputTransformF63_NCHW44 {
    //! Gather one 8x8 transformed unit from output_transform_buf, reduce it
    //! to the 6x6 output block, add bias / apply Op per bmode, and store the
    //! in-bounds part at (oh_start, ow_start) of the NCHW44 output.
    static void transform(const float* output_transform_buf, const float* bias,
                          float* output, float* transform_mid_buf,
                          size_t oh_start, size_t ow_start, size_t OH,
                          size_t OW, size_t oc_start, size_t oc_end,
                          size_t oc_index, size_t unit_idx,
                          size_t nr_units_in_tile, const DType& src_dtype,
                          const DType& dst_dtype) {
        MEGDNN_MARK_USED_VAR(transform_mid_buf);
        Op op(src_dtype, dst_dtype);
        //! AT * m * A
        size_t oc = oc_start + oc_index;     //! absolute output channel
        size_t OCB = (oc_end - oc_start) / pack_size;
        size_t ocb = oc_index / pack_size;   //! channel block within range
#define cb(m, n)                                                          \
    auto v##m##n = Vector<float, 4>::load(                                \
            output_transform_buf +                                        \
            (m * alpha + n) * OCB * nr_units_in_tile * pack_size +        \
            ocb * nr_units_in_tile * pack_size + unit_idx * pack_size);
        UNROLL_CALL_NOWRAPPER_D2(8, 8, cb);
#undef cb
        /**
         * A
         *
         * 1    0    0      0       0        0
         * 1    1    1      1       1        1
         * 1   -1    1     -1       1       -1
         * 1    2    4      8      16       32
         * 1   -2    4     -8      16      -32
         * 1    0.5  0.25   0.125   0.0625   0.03125
         * 1   -0.5  0.25  -0.125   0.0625  -0.03125
         * 0    0    0      0       0        1
         */
        //! pairwise sums/differences shared by all six output columns
        Vector<float, 4> v1addv2, v1subv2, v3addv4, v3subv4, v5addv6, v5subv6;
#define cb(m)                                                          \
    v1addv2 = v1##m + v2##m;                                           \
    v1subv2 = v1##m - v2##m;                                           \
    v3addv4 = v3##m + v4##m;                                           \
    v3subv4 = v3##m - v4##m;                                           \
    v5addv6 = v5##m + v6##m;                                           \
    v5subv6 = v5##m - v6##m;                                           \
    auto t0##m = v0##m + v1addv2 + v3addv4 + v5addv6;                  \
    auto t1##m = v1subv2 + v3subv4 * 2.f + v5subv6 * 0.5f;             \
    auto t2##m = v1addv2 + v3addv4 * 4.f + v5addv6 * 0.25f;            \
    auto t3##m = v1subv2 + v3subv4 * 8.f + v5subv6 * 0.125f;           \
    auto t4##m = v1addv2 + v3addv4 * 16.f + v5addv6 * 0.0625f;         \
    auto t5##m = v1subv2 + v3subv4 * 32.f + v5subv6 * 0.03125f + v7##m;
        UNROLL_CALL_NOWRAPPER(8, cb);
#undef cb
        //! second pass: right-multiply by A (columns), reusing v as result
#define cb(m)                                                            \
    v1addv2 = t##m##1 + t##m##2;                                         \
    v1subv2 = t##m##1 - t##m##2;                                         \
    v3addv4 = t##m##3 + t##m##4;                                         \
    v3subv4 = t##m##3 - t##m##4;                                         \
    v5addv6 = t##m##5 + t##m##6;                                         \
    v5subv6 = t##m##5 - t##m##6;                                         \
    v##m##0 = t##m##0 + v1addv2 + v3addv4 + v5addv6;                     \
    v##m##1 = v1subv2 + v3subv4 * 2.f + v5subv6 * 0.5f;                  \
    v##m##2 = v1addv2 + v3addv4 * 4.f + v5addv6 * 0.25f;                 \
    v##m##3 = v1subv2 + v3subv4 * 8.f + v5subv6 * 0.125f;                \
    v##m##4 = v1addv2 + v3addv4 * 16.f + v5addv6 * 0.0625f;              \
    v##m##5 = v1subv2 + v3subv4 * 32.f + v5subv6 * 0.03125f + t##m##7;
        UNROLL_CALL_NOWRAPPER(6, cb);
#undef cb
        Vector<float, 4> vbias;
        //! per-channel bias is added here; per-element BIAS is handled in
        //! out_save because it depends on the output position
        if (bmode == BiasMode::BROADCAST_CHANNEL_BIAS) {
            vbias = Vector<float, 4>::load(bias + oc);
#define cb(m, n) v##m##n += vbias;
            UNROLL_CALL_RAW_D2(6, 6, cb);
#undef cb
        }
        if (bmode != BiasMode::BIAS) {
#define cb(m, n) v##m##n = op(CONCAT(v##m, n).value);
            UNROLL_CALL_RAW_D2(6, 6, cb);
#undef cb
        }
        //! bounds-checked store of the 6x6 output block (edge tiles may
        //! extend past OH/OW)
#define out_save(oho, owo)                                                \
    do {                                                                  \
        size_t oh = oh_start + oho;                                       \
        size_t ow = ow_start + owo;                                       \
        if (oh < OH && ow < OW) {                                         \
            if (bmode == BiasMode::BIAS) {                                \
                v##oho##owo += Vector<float, 4>::load(bias + oc * OH * OW + \
                                                      oh * OW * pack_size + \
                                                      ow * pack_size);    \
                v##oho##owo = op(v##oho##owo.value);                      \
            }                                                             \
            v##oho##owo.save(output + oc * OH * OW + oh * OW * pack_size + \
                             ow * pack_size);                             \
        }                                                                 \
    } while (0);
        UNROLL_CALL_RAW_D2(6, 6, out_save);
    }
//! NOTE(review): #undef sits outside the member function but inside the
//! struct; harmless to the preprocessor, kept as-is.
#undef out_save
};
| } // namespace | |||
| namespace megdnn { | |||
| namespace arm_common { | |||
| namespace winograd { | |||
| MEGDNN_REG_WINOGRAD_STRATEGY_IMPL(winograd_F63_mk4_f_nchw44) | |||
//! \brief Winograd F(6,3) filter transform for the NCHW44 layout.
//!
//! For each 3x3 filter block this computes G * g * G^T, producing an
//! alpha x alpha (8x8) transformed filter written into
//! filter_transform_buf. Channels are processed in packs of 4 to match
//! the MK4 matmul format; the inner 4-float vector holds the packed OC
//! lane while ic_inner walks the packed IC lane.
//!
//! \param filter               source filter; indexing below matches layout
//!                             {OC/4, IC/4, FH, FW, 4, 4} (dense; the caller
//!                             restricts a group via oc_start/oc_end)
//! \param filter_transform_buf destination, indexed as
//!                             (m*alpha+n)*OC*IC + ocb*IC*4 + icb*16 + ic_inner*4
//! \param transform_mid_buf    unused scratch, kept for interface parity
//! \param OC,IC                total output/input channels; multiples of 4
//! \param oc_start,oc_end      half-open OC range handled by this call;
//!                             both must be multiples of 4
void winograd_F63_mk4_f_nchw44::filter(const float* filter,
                                       float* filter_transform_buf,
                                       float* transform_mid_buf, size_t OC,
                                       size_t IC, size_t oc_start,
                                       size_t oc_end) {
    constexpr size_t pack_size = 4;
    //! The transform computed per 3x3 block is G * g * G^T with
    // G
    // 1.0000000 0.0000000 0.0000000
    // -0.2222222 -0.2222222 -0.2222222
    // -0.2222222 0.2222222 -0.2222222
    // 0.0111111 0.0222222 0.0444444
    // 0.0111111 -0.0222222 0.0444444
    // 0.7111111 0.3555556 0.1777778
    // 0.7111111 -0.3555556 0.1777778
    // 0.0000000 0.0000000 1.0000000
    MEGDNN_MARK_USED_VAR(transform_mid_buf);
    megdnn_assert((oc_end - oc_start) % pack_size == 0 &&
                  oc_start % pack_size == 0 &&
                  oc_end % pack_size == 0 && IC % pack_size == 0 &&
                  OC % pack_size == 0,
                  "NCHW44 Winograd filter transform requires both OC and IC "
                  "are times of 4");
    size_t ICB = IC / pack_size;
    for (size_t ocb = oc_start / pack_size; ocb < oc_end / pack_size; ocb++) {
        for (size_t icb = 0; icb < ICB; icb++) {
            for (size_t ic_inner = 0; ic_inner < pack_size; ic_inner++) {
                //! base of the (ocb, icb) 3x3 block, offset to the current
                //! packed-IC lane
                const float* fptr = filter +
                                    (ocb * ICB + icb) * KERNEL_SIZE *
                                            KERNEL_SIZE * pack_size *
                                            pack_size +
                                    ic_inner * pack_size;
//! load the nine 4-wide filter vectors g00..g22
#define cb(m, n)                                                   \
    Vector<float, 4> g##m##n = Vector<float, 4>::load(             \
            fptr + (m * KERNEL_SIZE + n) * pack_size * pack_size);
                UNROLL_CALL_NOWRAPPER_D2(3, 3, cb)
#undef cb
//! one column of the 8x3 product with G; applied twice below it yields
//! the full 8x8 result (first pass builds wd = G*g, second ret = wd*G^T)
#define FILTER_TRANSFORM(n, wd, g)                        \
    auto wd##n##0 = g##0##n;                              \
    tmp0 = (g##0##n + g##2##n) * -0.2222222f;             \
    tmp1 = g##1##n * -0.2222222f;                         \
    auto wd##n##1 = tmp0 + tmp1;                          \
    auto wd##n##2 = tmp0 - tmp1;                          \
    tmp0 = g##0##n * 0.0111111f + g##2##n * 0.0444444f;   \
    tmp1 = g##1##n * 0.0222222f;                          \
    auto wd##n##3 = tmp0 + tmp1;                          \
    auto wd##n##4 = tmp0 - tmp1;                          \
    tmp0 = g##0##n * 0.7111111f + g##2##n * 0.1777778f;   \
    tmp1 = g##1##n * 0.3555556f;                          \
    auto wd##n##5 = tmp0 + tmp1;                          \
    auto wd##n##6 = tmp0 - tmp1;                          \
    auto wd##n##7 = g##2##n;
                Vector<float, 4> tmp0, tmp1;
                UNROLL_CALL_RAW(3, FILTER_TRANSFORM, wd, g);
                UNROLL_CALL_RAW(8, FILTER_TRANSFORM, ret, wd);
#undef FILTER_TRANSFORM
//! scatter the 8x8 result; outermost stride is the (m, n) transform
//! coordinate so the matmul can address one coordinate contiguously
#define cb_save(m, n)                                                     \
    ret##m##n.save(filter_transform_buf + (m * alpha + n) * OC * IC +     \
                   ocb * IC * pack_size + icb * pack_size * pack_size +   \
                   ic_inner * pack_size);
                UNROLL_CALL_NOWRAPPER_D2(8, 8, cb_save)
#undef cb_save
            }
        }
    }
}
//! \brief Winograd F(6,3) input transform for the NCHW44 layout.
//!
//! Walks the requested range of output tiles, extracts the corresponding
//! alpha x alpha (8x8) input patch for each packed group of 4 input
//! channels, and transforms it into input_transform_buf. Tiles that lie
//! fully inside the image take the prepare<true> path; tiles touching
//! the padded border take prepare<false>, which must handle
//! out-of-range rows/columns.
//!
//! \param input              source feature map (NCHW44; IC % 4 == 0)
//! \param input_transform_buf destination for transformed patches
//! \param transform_mid_buf  scratch: one alpha*alpha patch plus its
//!                           repacked copy (patch / patchT below)
//! \param IH,IW,IC           input spatial size and channel count
//! \param PH,PW              top/left padding
//! \param unit_start_idx     index of the first output tile to process
//! \param nr_units_in_tile   number of consecutive output tiles to process
void winograd_F63_mk4_f_nchw44::input(const float* input,
                                      float* input_transform_buf,
                                      float* transform_mid_buf, size_t IH,
                                      size_t IW, size_t IC, size_t PH,
                                      size_t PW, size_t unit_start_idx,
                                      size_t nr_units_in_tile) {
    constexpr size_t pack_size = 4;
    megdnn_assert(IC % pack_size == 0);
    constexpr int alpha = 3 + 6 - 1;  //! kernel(3) + output block(6) - 1
    // OW = IW + 2 * PW - KERNEL_SIZE + 1
    auto units_w =
            div_ceil<size_t>(IW + 2 * PW - KERNEL_SIZE + 1, OUTPUT_BLOCK_SIZE);
    float* patch = transform_mid_buf;
    float* patchT = transform_mid_buf + pack_size * alpha * alpha;
    for (size_t ic = 0; ic < IC; ic += pack_size) {
        rep(unit_idx, nr_units_in_tile) {
            size_t index = unit_start_idx + unit_idx;
            //! tile coordinates in the units grid (row-major over width)
            size_t nh = index / units_w;
            size_t nw = index % units_w;
            //! top-left corner of the patch; may be negative with padding
            int ih_start = nh * OUTPUT_BLOCK_SIZE - PH;
            int iw_start = nw * OUTPUT_BLOCK_SIZE - PW;
            if (ih_start >= 0 && ih_start + alpha <= static_cast<int>(IH) &&
                iw_start >= 0 && iw_start + alpha <= static_cast<int>(IW)) {
                //! fast path: whole patch is in bounds
                InputTransformF63_NCHW44::prepare<true>(input, patch, patchT,
                                                        ih_start, iw_start, IH,
                                                        IW, ic, IC);
                InputTransformF63_NCHW44::transform(patchT, input_transform_buf,
                                                    unit_idx, nr_units_in_tile,
                                                    ic, IC);
            } else {
                //! border path: prepare<false> handles padding
                InputTransformF63_NCHW44::prepare<false>(input, patch, patchT,
                                                         ih_start, iw_start, IH,
                                                         IW, ic, IC);
                InputTransformF63_NCHW44::transform(patchT, input_transform_buf,
                                                    unit_idx, nr_units_in_tile,
                                                    ic, IC);
            }
        }
    }
}
//! \brief Winograd F(6,3) output transform for the NCHW44 layout.
//!
//! For every packed group of 4 output channels and every output tile in
//! the requested range, applies the inverse Winograd transform (plus
//! bias and nonlinearity) via OutputTransformF63_NCHW44, selected at
//! runtime from (bmode, nonline_mode) by DISPATCH_CONV_WINOGRAD_BIAS.
//!
//! \param output_transform_buf result of the batched matmul, still in the
//!                             Winograd domain
//! \param bias                 bias data, interpreted according to bmode
//! \param output               destination feature map (OH x OW per channel)
//! \param transform_mid_buf    scratch forwarded to the transform kernel
//! \param oc_start,oc_end      half-open OC range handled by this call
//! \param unit_start_idx       index of the first output tile to process
//! \param nr_units_in_tile     number of consecutive output tiles
//!
//! NOTE(review): src_dtype/dst_dtype are not parameters here — presumably
//! members supplied by MEGDNN_REG_WINOGRAD_STRATEGY_IMPL; confirm in the
//! strategy base declaration.
void winograd_F63_mk4_f_nchw44::output(const float* output_transform_buf,
                                       const float* bias, float* output,
                                       float* transform_mid_buf, BiasMode bmode,
                                       NonlineMode nonline_mode, size_t OH,
                                       size_t OW, size_t oc_start,
                                       size_t oc_end, size_t unit_start_idx,
                                       size_t nr_units_in_tile) {
    constexpr size_t pack_size = 4;
//! invoked by DISPATCH_CONV_WINOGRAD_BIAS with the concrete template
//! arguments resolved from the runtime (bmode, nonline_mode) pair
#define cb(_bmode, _nonline_op, ...)                                        \
    OutputTransformF63_NCHW44<_bmode MEGDNN_COMMA _nonline_op>::transform(  \
            __VA_ARGS__);
    auto units_w = div_ceil<size_t>(OW, OUTPUT_BLOCK_SIZE);
    for (size_t oc = oc_start; oc < oc_end; oc += pack_size) {
        size_t oc_index = oc - oc_start;
        rep(unit_idx, nr_units_in_tile) {
            size_t index = unit_start_idx + unit_idx;
            //! tile position in the output units grid
            auto nh = index / units_w;
            auto nw = index % units_w;
            size_t oh_start = nh * OUTPUT_BLOCK_SIZE;
            size_t ow_start = nw * OUTPUT_BLOCK_SIZE;
            DISPATCH_CONV_WINOGRAD_BIAS(
                    megdnn_arm_common_winograd_fp32_F63_mk4, cb, float, float,
                    bmode, nonline_mode, output_transform_buf, bias, output,
                    transform_mid_buf, oh_start, ow_start, OH, OW, oc_start,
                    oc_end, oc_index, unit_idx, nr_units_in_tile, src_dtype,
                    dst_dtype);
        }
    }
#undef cb
}
| } // namespace winograd | |||
| } // namespace arm_common | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -169,6 +169,14 @@ public: | |||
| static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo), | |||
| tile_size)); | |||
| winograd_algos.emplace_back(refhold.back().get()); | |||
| refhold.emplace_back(new AlgoFP32WinogradF23_4x4_NCHW44( | |||
| static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo), | |||
| tile_size)); | |||
| winograd_algos.emplace_back(refhold.back().get()); | |||
| refhold.emplace_back(new AlgoFP32WinogradF63_4x4_NCHW44( | |||
| static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo), | |||
| tile_size)); | |||
| winograd_algos.emplace_back(refhold.back().get()); | |||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| refhold.emplace_back(new AlgoFP16WinogradF23( | |||
| static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo), | |||
| @@ -49,6 +49,9 @@ private: | |||
| class AlgoFP32WinogradF54; | |||
| class AlgoFP32WinogradF45; | |||
| class AlgoFP32WinogradF23_4x4_NCHW44; | |||
| class AlgoFP32WinogradF63_4x4_NCHW44; | |||
| class AlgoS8ChanWiseStride1NCHW44; | |||
| class AlgoS8ChanWiseStride2NCHW44; | |||
| @@ -28,13 +28,22 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src, | |||
| using namespace winograd; | |||
| check_exec(src.layout, dst.layout, workspace.size); | |||
| //! NCHW44 group conv or NCHW group conv or both dense conv | |||
| size_t flt_start = 0; | |||
| size_t pack_c_size = 1; | |||
| size_t group = 1; | |||
| if (src.layout.ndim == 5) { | |||
| if (src.layout.ndim == 5) { //! {g, OC, IC, FH, FW} | |||
| flt_start = 1; | |||
| group = src.layout[0]; | |||
| } else if (src.layout.ndim == 6) { //! {OC/4, IC/4, FH, FW, 4, 4} | |||
| pack_c_size = src.layout[5]; | |||
| } else if (src.layout.ndim == 7) { //! {g, OC/4, IC/4, FH, FW, 4, 4} | |||
| flt_start = 1; | |||
| group = src.layout[0]; | |||
| pack_c_size = src.layout[6]; | |||
| } | |||
| size_t OC = src.layout[flt_start], IC = src.layout[flt_start + 1], | |||
| size_t OC = src.layout[flt_start] * pack_c_size, | |||
| IC = src.layout[flt_start + 1] * pack_c_size, | |||
| FW = src.layout[flt_start + 3]; | |||
| size_t m = param().output_block_size; | |||
| @@ -68,13 +77,23 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src, | |||
| float* workspace_ptr = workspace.ptr<float>(); | |||
| if (FW == 3) { | |||
| if (m == 2) { | |||
| DISPATCH(winograd_2x3_4x4_f, param::Winograd::Format::MK4, 0, | |||
| 0); | |||
| if (pack_c_size == 1) { | |||
| DISPATCH(winograd_2x3_4x4_f, param::Winograd::Format::MK4, | |||
| 0, 0); | |||
| } else if (pack_c_size == 4) { | |||
| DISPATCH(winograd_F23_mk4_f_nchw44, | |||
| param::Winograd::Format::MK4, 0, 5); | |||
| } | |||
| } else if (m == 6) { | |||
| DISPATCH(winograd_6x3_1x1_f, param::Winograd::Format::DEFAULT, | |||
| 0, 1); | |||
| DISPATCH(winograd_6x3_4x4_f, param::Winograd::Format::MK4, 0, | |||
| 2); | |||
| if (pack_c_size == 1) { | |||
| DISPATCH(winograd_6x3_4x4_f, param::Winograd::Format::MK4, | |||
| 0, 2); | |||
| } else if (pack_c_size == 4) { | |||
| DISPATCH(winograd_F63_mk4_f_nchw44, | |||
| param::Winograd::Format::MK4, 0, 6); | |||
| } | |||
| } | |||
| } else if (FW == 4) { | |||
| if (m == 5) { | |||
| @@ -158,7 +158,10 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec( | |||
| } | |||
| template <typename T> | |||
| struct ParamTrait; | |||
| struct NCHWParamTrait; | |||
| template <typename T> | |||
| struct NCHW44ParamTrait; | |||
| std::string ConvBias::WinogradParam::to_string() const { | |||
| return ssprintf("%u:%u:%u", channel_block_size, output_block_size, | |||
| @@ -166,32 +169,51 @@ std::string ConvBias::WinogradParam::to_string() const { | |||
| } | |||
| template <typename T> | |||
| std::string ConvBias::algo_name(const std::string& base, const T& p) { | |||
| return ssprintf("%s:%s:%s", ParamTrait<T>::category.c_str(), base.c_str(), | |||
| p.to_string().c_str()); | |||
| std::string ConvBias::algo_name(const std::string& base, const T& p, | |||
| param::ConvBias::Format format) { | |||
| if (format == param::ConvBias::Format::NCHW) { | |||
| return ssprintf("%s:%s:%s", NCHWParamTrait<T>::category.c_str(), | |||
| base.c_str(), p.to_string().c_str()); | |||
| } else if (format == param::ConvBias::Format::NCHW44) { | |||
| return ssprintf("%s:%s:%s", NCHW44ParamTrait<T>::category.c_str(), | |||
| base.c_str(), p.to_string().c_str()); | |||
| } | |||
| megdnn_throw("Invalid format"); | |||
| return ""; | |||
| } | |||
| #define FOREACH_CONV_BIAS_PARAM(cb) \ | |||
| cb(WinogradParam) cb(DirectParam) cb(MatmulParam) cb(DefaultParam) | |||
| #define cb(pt) \ | |||
| template <> \ | |||
| struct ParamTrait<ConvBias::pt> { \ | |||
| static const std::string category; \ | |||
| #define cb(pt) \ | |||
| template <> \ | |||
| struct NCHWParamTrait<ConvBias::pt> { \ | |||
| static const std::string category; \ | |||
| }; \ | |||
| template <> \ | |||
| struct NCHW44ParamTrait<ConvBias::pt> { \ | |||
| static const std::string category; \ | |||
| }; | |||
| FOREACH_CONV_BIAS_PARAM(cb) | |||
| #undef cb | |||
| #define cb(pt, ct) const std::string ParamTrait<ConvBias::pt>::category = ct | |||
| cb(WinogradParam, "WINOGRAD"); | |||
| #define cb(pt, ct) \ | |||
| const std::string NCHWParamTrait<ConvBias::pt>::category = ct; \ | |||
| const std::string NCHW44ParamTrait<ConvBias::pt>::category = ct | |||
| cb(DirectParam, "DIRECT"); | |||
| cb(MatmulParam, "MATMUL"); | |||
| cb(DefaultParam, "DEFAULT"); | |||
| #undef cb | |||
| const std::string NCHWParamTrait<ConvBias::WinogradParam>::category = | |||
| "WINOGRAD"; | |||
| const std::string NCHW44ParamTrait<ConvBias::WinogradParam>::category = | |||
| "WINOGRAD_NCHW44"; | |||
| #define cb(t) \ | |||
| template std::string ConvBias::algo_name<ConvBias::t>( \ | |||
| const std::string& base, const ConvBias::t& p); | |||
| const std::string& base, const ConvBias::t& p, \ | |||
| param::ConvBias::Format format); | |||
| FOREACH_CONV_BIAS_PARAM(cb) | |||
| #undef cb | |||
| @@ -199,17 +221,37 @@ ConvBias::WinogradParam ConvBias::parse_winograd_name( | |||
| const std::string& algo_name) { | |||
| ConvBias::WinogradParam ret = INVALID_WINOGRAD_PARAM; | |||
| char base[128]; | |||
| sscanf(algo_name.c_str(), "WINOGRAD:%[^:]:%u:%u:%u", base, | |||
| &(ret.channel_block_size), &(ret.output_block_size), | |||
| &(ret.tile_size)); | |||
| if (ret.tile_size == 0 || ret.output_block_size == 0 || | |||
| ret.channel_block_size == 0) { | |||
| megdnn_log_warn("the algo name %s is not suitable for winograd", | |||
| algo_name.c_str()); | |||
| return INVALID_WINOGRAD_PARAM; | |||
| char name[128]; | |||
| auto parse = [&](const std::string& algo_name, | |||
| const std::string& pre) -> auto { | |||
| memset(name, 0, 128); | |||
| sscanf(algo_name.c_str(), "%[^:]:%[^:]:%u:%u:%u", name, base, | |||
| &(ret.channel_block_size), &(ret.output_block_size), | |||
| &(ret.tile_size)); | |||
| if (strcmp(name, pre.c_str())) { | |||
| megdnn_log_warn("algo %s is not %s algo", name, pre.c_str()); | |||
| ret = INVALID_WINOGRAD_PARAM; | |||
| return false; | |||
| } | |||
| if (ret.tile_size == 0 || ret.output_block_size == 0 || | |||
| ret.channel_block_size == 0) { | |||
| megdnn_log_warn("the algo name %s is not suitable for %s", | |||
| algo_name.c_str(), pre.c_str()); | |||
| ret = INVALID_WINOGRAD_PARAM; | |||
| return false; | |||
| } | |||
| return true; | |||
| }; | |||
| if (parse(algo_name, "WINOGRAD_NCHW44")) { | |||
| return ret; | |||
| } else { | |||
| parse(algo_name, "WINOGRAD"); | |||
| return ret; | |||
| } | |||
| return ret; | |||
| } | |||
| constexpr ConvBias::WinogradParam ConvBias::INVALID_WINOGRAD_PARAM; | |||
| void handle_bias_and_nonlinear(Handle* handle, param::ConvBias args, | |||
| @@ -299,6 +299,7 @@ void make_canonized_filter_meta_nchwxx( | |||
| megdnn_assert(param.format == Param::Format::NCHW88 || | |||
| param.format == Param::Format::NCHW44 || | |||
| param.format == Param::Format::NCHW44_WINOGRAD || | |||
| param.format == Param::Format::NCHW88_WINOGRAD); | |||
| size_t img_ndim = 2; | |||
| size_t flt_start = 0; | |||
| @@ -663,6 +664,7 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src, | |||
| param().format == Param::Format::NCHW32 || | |||
| param().format == Param::Format::NCHW88 || | |||
| param().format == Param::Format::NCHW88_WINOGRAD || | |||
| param().format == Param::Format::NCHW44_WINOGRAD || | |||
| param().format == Param::Format::CHWN4); | |||
| img_dim = src.ndim - 3; | |||
| if ((param().format == Param::Format::NCHW88 || | |||
| @@ -68,6 +68,7 @@ constexpr size_t layout_pack_size(param::ConvBias::Format layout) { | |||
| case param::ConvBias::Format::NHWCD4: | |||
| return 4; | |||
| case param::ConvBias::Format::NCHW4: | |||
| case param::ConvBias::Format::NCHW44: | |||
| return 4; | |||
| case param::ConvBias::Format::NCHW32: | |||
| return 32; | |||
| @@ -363,6 +364,7 @@ INST(uint8_t, uint8_t, int16_t, int) | |||
| _ctype, _dst_type, _input_filter_compute_type, \ | |||
| _output_compute_type, layout, param::MatrixMul::Format::MK4>; | |||
| INST(float, float, float, float, param::ConvBias::Format::NCHW) | |||
| INST(float, float, float, float, param::ConvBias::Format::NCHW44) | |||
| #undef INST | |||
| #define INST(_ctype, _dst_type, _input_filter_compute_type, \ | |||
| @@ -425,6 +425,7 @@ const T* ConvBiasImpl::NCBKernParam::filter(size_t group_pack_id, | |||
| break; | |||
| } | |||
| case ConvBiasImpl::Param::Format::NCHW_WINOGRAD: | |||
| case ConvBiasImpl::Param::Format::NCHW44_WINOGRAD: | |||
| case ConvBiasImpl::Param::Format::NCHW88_WINOGRAD: { | |||
| //! four format of weight layout | |||
| //! 1. {g, alpha, alpha, ocpg/8, icpg/8, 8, 8} | |||
| @@ -198,7 +198,8 @@ void ConvolutionImpl::exec_with_ncb_kern(const NCBKernParam& param, | |||
| for (auto kernel : kerns) { | |||
| megdnn_assert(param.filter_meta.format == Param::Format::NCHW || | |||
| param.filter_meta.format == Param::Format::NHWC || | |||
| param.filter_meta.format == Param::Format::NCHW88, | |||
| param.filter_meta.format == Param::Format::NCHW88 || | |||
| param.filter_meta.format == Param::Format::NCHW44, | |||
| "invalid conv format"); | |||
| auto run = [param, kernel](size_t index, size_t thread_id) { | |||
| CpuNDRange ndrange_id(kernel.global_size, index); | |||
| @@ -137,6 +137,7 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src, | |||
| #undef DISPATCH_FORMAT_MK8 | |||
| #undef DISPATCH_DTYPE | |||
| } else { | |||
| megdnn_assert(src.layout.ndim == 6 || src.layout.ndim == 7); | |||
| #define cb(_ctype, _dst_type, _input_filter_compute_type, \ | |||
| _output_compute_type, _format, rescale) \ | |||
| if (param().format == _format) { \ | |||
| @@ -158,20 +159,58 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src, | |||
| DISPATCH_KERNEL(dt_float32, dt_float32, dt_float32, dt_float32, \ | |||
| DISPATCH_FORMAT_MK8, 1.0f, _midout_tag, 0); \ | |||
| } | |||
| megdnn_assert(src.layout.ndim == 6 || src.layout.ndim == 7); | |||
| if (FW == 3) { | |||
| if (m == 2) { | |||
| std::vector<float> interp_points = {0, 1, -1}; | |||
| DISPATCH_DTYPE(4); | |||
| } else if (m == 6) { | |||
| std::vector<float> interp_points = {0, 1, -1, 2, -2, 0.5, -0.5}; | |||
| DISPATCH_DTYPE(5); | |||
| if (pack_c_size == 8) { //! NCHW88 | |||
| if (FW == 3) { | |||
| if (m == 2) { | |||
| std::vector<float> interp_points = {0, 1, -1}; | |||
| DISPATCH_DTYPE(4); | |||
| } else if (m == 6) { | |||
| std::vector<float> interp_points = {0, 1, -1, 2, | |||
| -2, 0.5, -0.5}; | |||
| DISPATCH_DTYPE(5); | |||
| } | |||
| } | |||
| #undef cb | |||
| #undef DISPATCH_FORMAT_MK8 | |||
| #undef DISPATCH_DTYPE | |||
| } | |||
| else if (pack_c_size == 4) { //! NCHW44 | |||
| #define cb(_ctype, _dst_type, _input_filter_compute_type, \ | |||
| _output_compute_type, _format, rescale) \ | |||
| if (param().format == _format) { \ | |||
| return winograd::StrategyHelper< \ | |||
| _ctype, _dst_type, _input_filter_compute_type, \ | |||
| _output_compute_type, param::ConvBias::Format::NCHW44, \ | |||
| _format>::filter(src_ptr, dst_ptr, workspace_ptr, OC, IC, 0, \ | |||
| OC, m, FW, interp_points, src.layout.dtype, \ | |||
| rescale); \ | |||
| } | |||
| #define DISPATCH_FORMAT_MK4(_ctype, _dst_type, _input_filter_compute_type, \ | |||
| _output_compute_type, _rescale) \ | |||
| cb(_ctype, _dst_type, _input_filter_compute_type, _output_compute_type, \ | |||
| param::Winograd::Format::MK4, _rescale); | |||
| #define DISPATCH_DTYPE(_midout_tag) \ | |||
| if (src.layout.dtype.enumv() == DTypeEnum::Float32) { \ | |||
| DISPATCH_KERNEL(dt_float32, dt_float32, dt_float32, dt_float32, \ | |||
| DISPATCH_FORMAT_MK4, 1.0f, _midout_tag, 0); \ | |||
| } | |||
| if (FW == 3) { | |||
| if (m == 2) { | |||
| std::vector<float> interp_points = {0, 1, -1}; | |||
| DISPATCH_DTYPE(6); | |||
| } else if (m == 6) { | |||
| std::vector<float> interp_points = {0, 1, -1, 2, | |||
| -2, 0.5, -0.5}; | |||
| DISPATCH_DTYPE(7); | |||
| } | |||
| } | |||
| #undef cb | |||
| #undef DISPATCH_FORMAT_MK8 | |||
| #undef DISPATCH_KERNEL | |||
| #undef DISPATCH_DTYPE | |||
| } | |||
| } | |||
| megdnn_assert(execed, | |||
| "Unsupport winograd filter preprocess. m: %zu src: %s", m, | |||
| @@ -699,6 +699,135 @@ TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F16_F23_8x8) { | |||
| } | |||
| #endif | |||
| void benchmark_winograd_nchw_vs_nchw44(const char* algo_name, Handle* handle) { | |||
| using namespace conv_bias; | |||
| using NLMode = param::ConvBias::NonlineMode; | |||
| std::vector<conv_bias::TestArg> args_nchw44; | |||
| std::vector<conv_bias::TestArg> args_nchw; | |||
| auto pack = [&](size_t n, size_t oc, size_t ic, size_t h, size_t w, | |||
| size_t group, NLMode nlmode) { | |||
| param::ConvBias param; | |||
| param.format = param::ConvBias::Format::NCHW44; | |||
| param.stride_h = 1; | |||
| param.stride_w = 1; | |||
| param.pad_h = 1; | |||
| param.pad_w = 1; | |||
| param.nonlineMode = nlmode; | |||
| if (group == 1) { | |||
| param.sparse = param::ConvBias::Sparse::DENSE; | |||
| args_nchw44.emplace_back(param, TensorShape{n, ic / 4, h, w, 4}, | |||
| TensorShape{oc / 4, ic / 4, 3, 3, 4, 4}, | |||
| TensorShape{}); | |||
| param.format = param::ConvBias::Format::NCHW; | |||
| args_nchw.emplace_back(param, TensorShape{n, ic, h, w}, | |||
| TensorShape{oc, ic, 3, 3}, TensorShape{}); | |||
| } else { | |||
| auto oc_per_group = oc / group; | |||
| auto ic_per_group = ic / group; | |||
| param.sparse = param::ConvBias::Sparse::GROUP; | |||
| args_nchw44.emplace_back(param, | |||
| TensorShape{n, ic_per_group / 4, h, w, 4}, | |||
| TensorShape{group, oc_per_group / 4, | |||
| ic_per_group / 4, 3, 3, 4, 4}, | |||
| TensorShape{}); | |||
| param.format = param::ConvBias::Format::NCHW; | |||
| args_nchw.emplace_back( | |||
| param, TensorShape{n, ic, h, w}, | |||
| TensorShape{group, oc_per_group, ic_per_group, 3, 3}, | |||
| TensorShape{}); | |||
| } | |||
| }; | |||
| std::vector<NLMode> nonlinemode = {NLMode::IDENTITY}; | |||
| for (auto nlmode : nonlinemode) | |||
| for (size_t n : {1, 2}) | |||
| for (size_t group = 1; group <= 2; ++group) { | |||
| pack(n, 512, 512, 15, 15, group, nlmode); | |||
| pack(n, 512, 256, 15, 15, group, nlmode); | |||
| pack(n, 256, 256, 29, 29, group, nlmode); | |||
| pack(n, 256, 128, 29, 29, group, nlmode); | |||
| pack(n, 128, 128, 57, 57, group, nlmode); | |||
| pack(n, 128, 64, 57, 57, group, nlmode); | |||
| pack(n, 24, 24, 224, 224, group, nlmode); | |||
| pack(n, 64, 24, 123, 123, group, nlmode); | |||
| pack(n, 64, 64, 56, 56, group, nlmode); | |||
| pack(n, 128, 128, 28, 28, group, nlmode); | |||
| pack(n, 256, 256, 14, 14, group, nlmode); | |||
| pack(n, 512, 512, 7, 7, group, nlmode); | |||
| } | |||
| using namespace conv_bias; | |||
| constexpr size_t RUN = 10; | |||
| Benchmarker<ConvBias> benchmark_winograd_nchw(handle); | |||
| benchmark_winograd_nchw.set_display(false); | |||
| benchmark_winograd_nchw.set_times(RUN); | |||
| Benchmarker<ConvBias> benchmark_winograd_nchw44(handle); | |||
| benchmark_winograd_nchw44.set_display(false); | |||
| benchmark_winograd_nchw44.set_times(RUN); | |||
| std::string winograd_nchw_algo_name = ssprintf("WINOGRAD:%s", algo_name); | |||
| std::string winograd_nchw44_algo_name = | |||
| ssprintf("WINOGRAD_NCHW44:%s", algo_name); | |||
| for (size_t i = 0; i < args_nchw.size(); ++i) { | |||
| auto arg_nchw = args_nchw[i]; | |||
| auto arg_nchw44 = args_nchw44[i]; | |||
| TensorLayout dst_layout; | |||
| auto opr = handle->create_operator<ConvBias>(); | |||
| opr->param() = arg_nchw.param; | |||
| opr->deduce_layout({arg_nchw.src, dtype::Float32()}, | |||
| {arg_nchw.filter, dtype::Float32()}, | |||
| {arg_nchw.bias, dtype::Float32()}, {}, dst_layout); | |||
| //! dst.nr_elems * IC * FH * FW * 2 | |||
| float computations = dst_layout.total_nr_elems() * arg_nchw.filter[1] * | |||
| arg_nchw.filter[2] * arg_nchw.filter[3] * 2.0 / | |||
| (1024 * 1024 * 1024) * 1e3; | |||
| benchmark_winograd_nchw.set_param(arg_nchw.param); | |||
| auto nchw_used = algo_benchmark<ConvBias>( | |||
| benchmark_winograd_nchw, | |||
| {arg_nchw.src, arg_nchw.filter, {}, {}, {}}, | |||
| winograd_nchw_algo_name.c_str()) / | |||
| RUN; | |||
| benchmark_winograd_nchw44.set_param(arg_nchw44.param); | |||
| auto nchw44_used = | |||
| algo_benchmark<ConvBias>( | |||
| benchmark_winograd_nchw44, | |||
| {arg_nchw44.src, arg_nchw44.filter, {}, {}, {}}, | |||
| winograd_nchw44_algo_name.c_str()) / | |||
| RUN; | |||
| printf("%s %s: nchw: %f ms %f Gflops nchw44: %f ms %f GFlops " | |||
| "speedup: " | |||
| "%f\n", | |||
| arg_nchw.src.to_string().c_str(), | |||
| arg_nchw.filter.to_string().c_str(), nchw_used, | |||
| computations / nchw_used, nchw44_used, | |||
| computations / nchw44_used, nchw_used / nchw44_used); | |||
| } | |||
| } | |||
//! Benchmark: Winograd F(2,3) MK4, NCHW vs NCHW44, using the
//! architecture-specific MK4 matmul ("...:4:2" = channel block 4,
//! output block 2 — presumably matching parse_winograd_name; verify).
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F23_MK4_NCHW_VS_NCHW44) {
#if MEGDNN_AARCH64
    benchmark_winograd_nchw_vs_nchw44("AARCH64_F32_MK4_4x16:4:2", handle());
#else
    benchmark_winograd_nchw_vs_nchw44("ARMV7_F32_MK4_4x8:4:2", handle());
#endif
}
//! Benchmark: Winograd F(6,3) MK4, NCHW vs NCHW44 ("...:4:6" = channel
//! block 4, output block 6).
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F63_MK4_NCHW_VS_NCHW44) {
#if MEGDNN_AARCH64
    benchmark_winograd_nchw_vs_nchw44("AARCH64_F32_MK4_4x16:4:6", handle());
#else
    benchmark_winograd_nchw_vs_nchw44("ARMV7_F32_MK4_4x8:4:6", handle());
#endif
}
| TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F23_8x8) { | |||
| auto benchmark_winograd_quantized = [](const char* algo_name_fp32, | |||
| const char* algo_name_quantized, | |||
| @@ -654,6 +654,15 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F23_4) { | |||
| check_winograd("4:2:32", checker, args, param::MatrixMul::Format::MK4); | |||
| } | |||
//! Correctness: Winograd F(2,3) with MK4 matmul on NCHW44-format conv
//! ("4:2:32" = channel block 4, output block 2, tile size 32).
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F23_4_NCHW44) {
    using namespace conv_bias;
    std::vector<TestArg> args = get_nchw44_conv_bias_args({3}, 1);
    Checker<ConvBiasForward> checker(handle());
    check_winograd("4:2:32", checker, args, param::MatrixMul::Format::MK4,
                   param::ConvBias::Format::NCHW44);
}
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F63) { | |||
| using namespace conv_bias; | |||
| std::vector<TestArg> args = get_winograd_args(3); | |||
| @@ -667,7 +676,15 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F63_4) { | |||
| std::vector<TestArg> args = get_winograd_mk_packed_args(); | |||
| Checker<ConvBiasForward> checker(handle()); | |||
| check_winograd("4:6:32", checker, args, param::MatrixMul::Format::MK4); | |||
| check_winograd("4:6:16", checker, args, param::MatrixMul::Format::MK4); | |||
| } | |||
//! Correctness: Winograd F(6,3) with MK4 matmul on NCHW44-format conv
//! ("4:6:16" = channel block 4, output block 6, tile size 16).
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F63_4_NCHW44) {
    using namespace conv_bias;
    std::vector<TestArg> args = get_nchw44_conv_bias_args({3}, 1);
    Checker<ConvBiasForward> checker(handle());
    check_winograd("4:6:16", checker, args, param::MatrixMul::Format::MK4,
                   param::ConvBias::Format::NCHW44);
}
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F54) { | |||
| @@ -761,6 +778,75 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD) { | |||
| #endif | |||
| } | |||
//! End-to-end check of the NCHW44 winograd path when the filter is
//! pre-transformed by WinogradFilterPreprocess: the extra_impl reference
//! runs preprocess + NCHW44_WINOGRAD conv explicitly, and the checker
//! compares it against the normal NCHW44 conv for m in {2, 6}.
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_PREPROCESS_NCHW44) {
    using namespace conv_bias;
    std::vector<TestArg> nchw44_args = get_nchw44_conv_bias_args({3}, 1);
    Checker<ConvBiasForward> checker(handle());
    //! reference implementation: filter preprocess followed by a conv in
    //! NCHW44_WINOGRAD format with output_block_size = m
    auto extra_impl = [](const TensorNDArray& tensors, uint32_t m,
                         param::ConvBias param, Handle* handle) {
        megdnn_assert(param.format == param::ConvBias::Format::NCHW44);
        auto winograd_preprocess_opr =
                handle->create_operator<WinogradFilterPreprocess>();
        winograd_preprocess_opr->param().output_block_size = m;
        winograd_preprocess_opr->param().format = param::MatrixMul::Format::MK4;
        TensorLayout filter_transform_layout;
        winograd_preprocess_opr->deduce_layout(tensors[1].layout,
                                               filter_transform_layout);
        size_t winograd_preprocess_workspace_in_bytes =
                winograd_preprocess_opr->get_workspace_in_bytes(
                        tensors[1].layout, filter_transform_layout);
        auto conv_bias_opr = handle->create_operator<ConvBias>();
        conv_bias_opr->param() = param;
        conv_bias_opr->param().format = param::ConvBias::Format::NCHW44_WINOGRAD;
        conv_bias_opr->param().output_block_size = m;
        size_t conv_bias_workspace_in_bytes =
                conv_bias_opr->get_workspace_in_bytes(
                        tensors[0].layout, filter_transform_layout,
                        tensors[2].layout, tensors[3].layout,
                        tensors[4].layout, nullptr);
        //! one raw allocation split three ways: transformed filter,
        //! conv workspace, preprocess workspace
        WorkspaceBundle wb(nullptr, {filter_transform_layout.span().dist_byte(),
                                     conv_bias_workspace_in_bytes,
                                     winograd_preprocess_workspace_in_bytes});
        //! NOTE(review): raw malloc/free is not exception-safe — if either
        //! exec throws, the buffer leaks; acceptable in a test, but a
        //! smart pointer/RAII wrapper would be safer.
        wb.set(malloc(wb.total_size_in_bytes()));
        TensorND filter_transform_tensor(wb.get(0),
                                         std::move(filter_transform_layout));
        winograd_preprocess_opr->exec(tensors[1], filter_transform_tensor,
                                      wb.get_workspace(2));
        conv_bias_opr->exec(tensors[0], filter_transform_tensor, tensors[2],
                            tensors[3], tensors[4], nullptr,
                            wb.get_workspace(1));
        free(wb.ptr());
    };
    //! run all args through the checker for each output block size
    auto run = [&checker, &extra_impl](
                       Handle* handle, const std::vector<TestArg>& args,
                       const std::vector<size_t>& out_size, DType A_dtype,
                       DType B_dtype, DType C_dtype, DType D_dtype,
                       const float eps) {
        for (auto&& arg : args) {
            for (uint32_t m : out_size) {
                checker.set_extra_opr_impl(std::bind(extra_impl,
                                                     std::placeholders::_1, m,
                                                     arg.param, handle));
                checker.set_dtype(0, A_dtype)
                        .set_dtype(1, B_dtype)
                        .set_dtype(2, C_dtype)
                        .set_dtype(4, D_dtype)
                        .set_epsilon(eps)
                        .set_param(arg.param)
                        .execs({arg.src, arg.filter, arg.bias, {}, {}});
            }
        }
    };
    run(handle(), nchw44_args, {2, 6}, dtype::Float32(), dtype::Float32(),
        dtype::Float32(), dtype::Float32(), 1e-3f);
}
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_MK_PACKED_F32_1) { | |||
| using namespace conv_bias; | |||
| @@ -72,7 +72,7 @@ void WinogradTransformReplacePass::apply(OptState& opt) const { | |||
| auto&& inputs = conv_bias_opr.input(); | |||
| VarNodeArray new_inp; | |||
| new_inp.reserve(inputs.size()); | |||
| for (auto i: inputs) { | |||
| for (auto i : inputs) { | |||
| new_inp.push_back(rewriter.get_var(i)); | |||
| } | |||
| @@ -86,11 +86,15 @@ void WinogradTransformReplacePass::apply(OptState& opt) const { | |||
| megdnn::ConvBias::parse_winograd_name(algo_name); | |||
| if (winograd_param == megdnn::ConvBias::INVALID_WINOGRAD_PARAM) | |||
| break; | |||
| mgb_assert(conv_bias_opr.param().format == | |||
| megdnn::ConvBias::Param::Format::NCHW || | |||
| conv_bias_opr.param().format == | |||
| megdnn::ConvBias::Param::Format::NCHW88, | |||
| "currently winograd only suppport NCHW and nchw88"); | |||
| mgb_assert( | |||
| conv_bias_opr.param().format == | |||
| megdnn::ConvBias::Param::Format::NCHW || | |||
| conv_bias_opr.param().format == | |||
| megdnn::ConvBias::Param::Format::NCHW88 || | |||
| conv_bias_opr.param().format == | |||
| megdnn::ConvBias::Param::Format::NCHW44, | |||
| "currently winograd only suppport NCHW and NCHW44 and " | |||
| "NCHW88"); | |||
| opr::ConvBiasForward::check_winograd_param_valid( | |||
| winograd_param, conv_bias_opr.input(0)->dtype()); | |||
| megdnn::param::Winograd winograd_preprocess_param; | |||
| @@ -110,8 +114,17 @@ void WinogradTransformReplacePass::apply(OptState& opt) const { | |||
| megdnn::ConvBias::Param::Format::NCHW_WINOGRAD; | |||
| } else { | |||
| mgb_assert(new_inp[0]->shape().ndim == 5); | |||
| conv_bias_param.format = | |||
| megdnn::ConvBias::Param::Format::NCHW88_WINOGRAD; | |||
| size_t pack_size = new_inp[0]->shape()[4]; | |||
| if (pack_size == 8) { | |||
| conv_bias_param.format = | |||
| megdnn::ConvBias::Param::Format::NCHW88_WINOGRAD; | |||
| } else if (pack_size == 4) { | |||
| conv_bias_param.format = | |||
| megdnn::ConvBias::Param::Format::NCHW44_WINOGRAD; | |||
| } else { | |||
| mgb_assert(0, "Invalid pack size %zu in algo %s", pack_size, | |||
| algo_name.c_str()); | |||
| } | |||
| } | |||
| conv_bias_param.output_block_size = | |||
| winograd_param.output_block_size; | |||