@@ -281,6 +281,13 @@ struct TensorLayout : public TensorShape {
         add_axis_inplace(axis, 1, stride[axis] * shape[axis]);
     }
 
+    /*!
+     * \brief modify data type of the layout inplace
+     *
+     * Note that this API also modifies the format according to the data type.
+     */
+    void modify_dtype_inplace(DType dtype);
+
     /* =================== generate new layout =================== */
 
     /**
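A minimal usage sketch of the new API (assuming the semantics described in the comment above): switching a layout to a lowbit dtype also swaps in the matching lowbit-aligned format, so strides recomputed afterwards respect byte alignment.

    TensorLayout layout({16, 32, 7, 7}, dtype::QuantizedS8(1.f));
    layout.modify_dtype_inplace(dtype::QuantizedS4(1.f));  // format re-derived from the new dtype
    layout.init_contiguous_stride();                       // innermost rows now padded to whole bytes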
@@ -513,6 +513,15 @@ class DType {
     bool is_low_bit() const { return low_bit() != 0; }
 
+    bool is_quantized_lowbit() const {
+        return low_bit() != 0 &&
+#if MEGDNN_CC_HOST
+               category() == DTypeCategory::QUANTIZED;
+#else
+               category().ev == DTypeCategory::Ev::QUANTIZED;
+#endif
+    }
+
     /*!
      * \brief size of this data type, in bytes
     */
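A rough illustration of the distinction the new helper draws (host-side build assumed): parametrized quantized lowbit dtypes report true, while plain lowbit dtypes such as IntB2 only report is_low_bit().

    megdnn_assert(dtype::QuantizedS4(1.f).is_quantized_lowbit());
    megdnn_assert(dtype::Quantized4Asymm(1.f, static_cast<uint8_t>(8)).is_quantized_lowbit());
    megdnn_assert(dtype::IntB2().is_low_bit() && !dtype::IntB2().is_quantized_lowbit());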
@@ -226,7 +226,7 @@ public:
     std::string to_string() const override;
 
     //! raise exception if given layout is illegal
-    void assert_valid(const TensorLayout& layout) const;
+    void assert_valid(const TensorLayout& layout) const override;
 
     void serialize_append(std::string& result) const override;
@@ -282,6 +282,11 @@ void TensorLayout::add_axis_inplace(size_t axis, size_t shape,
     this->stride[axis] = stride;
 }
 
+void TensorLayout::modify_dtype_inplace(DType dtype_) {
+    dtype = dtype_;
+    format = Format(dtype);
+}
+
 bool TensorLayout::is_contiguous() const {
     return format.impl()->is_contiguous_spec(*this);
 }
@@ -952,7 +952,12 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src,
         megdnn_assert(src[4] == 4);
         dst[4] = 4;
     }
-    dst.format = src.format;
+    if (!src.format.is_default() &&
+        !src.format.is_lowbit_aligned()) {  // propagate
+        dst.format = src.format;
+    } else {  // determined by dtype
+        dst.format = TensorFormat(dst.dtype);
+    }
     dst.init_contiguous_stride();
     return cflt;
 }
@@ -46,14 +46,15 @@ TensorFormat TensorFormat::deserialize(const std::string& bin,
 TensorFormat::Format() : m_impl{DefaultTensorFormat::make().m_impl} {}
 
 TensorFormat::Format(DType dtype) {
-    megdnn_assert(dtype.valid());
-    if (dtype.is_low_bit()) {
+    if (dtype.valid() &&
+        dtype.is_quantized_lowbit()) {  // quantized lowbit, by default
+                                        // aligned to bytes
         size_t size_nbits = dtype.low_bit();
         megdnn_assert(size_nbits == 1 || size_nbits == 2 || size_nbits == 4,
                       "unsupported lowbits data type(%s, size in bits: %zu)",
                       dtype.name(), size_nbits);
         m_impl = LowbitsAlignedToBytesTensorFormat::make(size_nbits).m_impl;
-    } else {
+    } else {  // non parameterized lowbit, default format
         m_impl = DefaultTensorFormat::make().m_impl;
     }
 }
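A short sketch of the dispatch this constructor now implements (relying on the helpers added earlier in this diff): quantized lowbit dtypes get the bytes-aligned lowbit format, while non-quantized lowbit and ordinary dtypes keep the default format.

    TensorFormat f1{dtype::QuantizedS4(1.f)};  // LowbitsAlignedToBytesTensorFormat (4 bits)
    TensorFormat f2{dtype::IntB2()};           // DefaultTensorFormat
    TensorFormat f3{dtype::Float32()};         // DefaultTensorFormat
    megdnn_assert(f1.is_lowbit_aligned() && !f2.is_lowbit_aligned() && !f3.is_lowbit_aligned());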
@@ -89,8 +90,8 @@ bool TensorFormat::is_lowbit_aligned() const {
 /* ===================== DefaultFormat ===================== */
 void DefaultTensorFormat::assert_valid(const TensorLayout& layout) const {
     megdnn_assert(
-            !layout.dtype.valid() || !layout.dtype.is_low_bit(),
-            "DefaultTensorFormat does not support low-bits tensor(dtype:%s)",
+            !layout.dtype.valid() || !layout.dtype.is_quantized_lowbit(),
+            "DefaultTensorFormat does not support quantized lowbit tensor(dtype:%s)",
             layout.dtype.name());
 }
@@ -271,7 +272,8 @@ void Image2DPackedTensorFormatBase<PIXEL_SIZE>::assert_valid(
     auto m_align_axis = align_axis();
     megdnn_assert(!(layout.shape[layout.ndim - 1] % PIXEL_SIZE),
                   "bad shape: %zu", layout.shape[layout.ndim - 1]);
-    megdnn_assert(layout.dtype.valid() && layout.ndim > m_align_axis);
+    megdnn_assert(layout.dtype.valid() && !layout.dtype.is_quantized_lowbit() &&
+                  layout.ndim > m_align_axis);
     ptrdiff_t first_non_zero_stride = 0;
     for (int i = layout.ndim - 1; i >= 0; --i) {
         megdnn_assert(layout.shape[i] && layout.stride[i] >= 0);
@@ -478,6 +480,7 @@ void LowbitsAlignedTensorFormatBase::assert_valid(
     megdnn_assert(layout.dtype.valid() && layout.dtype.is_low_bit() &&
                   layout.dtype.low_bit() == m_size_nbits);
     bool has_dim_unity_stride = false;
+    bool has_dim_aligned_stride = false;
     for (int i = layout.ndim - 1; i >= 0; --i) {
         if (!has_dim_unity_stride && layout.stride[i] == 1)
             has_dim_unity_stride = true;
@@ -485,15 +488,16 @@ void LowbitsAlignedTensorFormatBase::assert_valid(
                 layout.stride[i] >= 0 &&
                         (layout.stride[i] % m_align_size_in_elements == 0 ||
                          layout.stride[i] == 1),
-                "bad stride:%s, %zu", layout.to_string().c_str(),
-                layout.stride[i]);
+                "bad stride:%s, %ld", layout.to_string().c_str(),
+                static_cast<long>(layout.stride[i]));
+        if (!has_dim_aligned_stride &&
+            static_cast<size_t>(layout.stride[i]) == m_align_size_in_elements)
+            has_dim_aligned_stride = true;
     }
-    if (!has_dim_unity_stride &&
-        (int)layout.stride[layout.ndim - 1] ==
-                round_up(1, (int)m_align_size_in_elements))
-        has_dim_unity_stride = true;
-    megdnn_assert(layout.ndim == 0 || has_dim_unity_stride,
-                  "innermost dim not contiguous");
+    megdnn_assert(
+            layout.ndim == 0 || has_dim_unity_stride || has_dim_aligned_stride,
+            "innermost dim not contiguous");
 }
 
 void LowbitsAlignedTensorFormatBase::serialize_append(
@@ -542,6 +546,7 @@ size_t LowbitsAlignedTensorFormatBase::init_contiguous_stride(
         multiplier = round_up(multiplier, m_align_size_in_elements);
         accum = mul(accum, multiplier);
     }
+    assert_valid(layout);
     return accum;
 }
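Hedged sketch of why the extra has_dim_aligned_stride path above matters: a broadcast QuantizedS4 layout collapses to strides such as {0, 2, 0} (two 4-bit elements per byte), so the innermost non-zero stride equals the alignment rather than 1 and must still be accepted as contiguous, which is exactly what the updated BASIC_TYPES test further below expects.

    TensorLayout ly{{1, 32, 1, 1}, dtype::QuantizedS4{1.2f}};
    ly = ly.broadcast({16, 32, 7, 7}).collapse_contiguous();  // shape {16, 32, 49}, strides {0, 2, 0}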
@@ -12,6 +12,7 @@
 #include "./algo.h"
 #include "src/cuda/utils.h"
+#include "src/common/conv_bias.h"
 
 using namespace megdnn;
 using namespace cuda;
@@ -27,8 +28,7 @@ bool ConvBiasForwardImpl::AlgoFallbackNCHWQS4::is_available(
     bool available = true;
     auto&& param = args.opr->param();
     auto&& fm = args.filter_meta;
-    if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout),
-                                                param.format))
+    if (!check_bias_share_in_channel(*(args.bias_layout), param.format))
         return false;
     if (param.format != Format::NCHW)
         return false;
@@ -128,7 +128,7 @@ void ConvBiasForwardImpl::AlgoFallbackNCHWQS4::exec(
     conv_op->param() = args.opr->param();
     using Format = param::ConvBias::Format;
     conv_op->param().format = Format::NCHW64;
-    ExecArgs args_{dynamic_cast<ConvBiasForwardImpl*>(conv_op.get()),
+    ExecArgs args_{reinterpret_cast<ConvBiasForwardImpl*>(conv_op.get()),
                    src_,
                    filter_,
                    bias_,
@@ -190,7 +190,7 @@ WorkspaceBundle ConvBiasForwardImpl::AlgoFallbackNCHWQS4::get_workspace_bundle(
     conv_op->param() = args.opr->param();
     using Format = param::ConvBias::Format;
     conv_op->param().format = Format::NCHW64;
-    SizeArgs args_{dynamic_cast<ConvBiasForwardImpl*>(conv_op.get()),
+    SizeArgs args_{reinterpret_cast<ConvBiasForwardImpl*>(conv_op.get()),
                    layouts[0],
                    layouts[1],
                    layouts[2],
@@ -64,7 +64,6 @@ public:
     class AlgoInt8CHWN4IMMAImplicitGemmReorderFilter;
     class AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth;
     class AlgoInt8NCHW32IMMAImplicitGemm;
-    class AlgoFallbackNCHWQS4;
     class AlgoBFloat16;
 
     class AlgoPack;
@@ -151,9 +151,9 @@ void exec_matrix_mul_quint4x4x32_helper(
     MEGDNN_MARK_USED_VAR(format);
     MEGDNN_MARK_USED_VAR(compute_mode);
     auto convert_layout = [](const TensorLayout& layout) {
-        auto ret = layout;
         auto param = layout.dtype.param<dtype::Quantized4Asymm>();
-        ret.dtype = dtype::Quantized8Asymm(param.scale, param.zero_point);
+        TensorLayout ret(layout,
+                         dtype::Quantized8Asymm(param.scale, param.zero_point));
         return ret;
     };
     TensorLayout A_layout, B_layout;
@@ -205,9 +205,8 @@ void exec_matrix_mul_qint4x4x16_helper(
     MEGDNN_MARK_USED_VAR(format);
     MEGDNN_MARK_USED_VAR(compute_mode);
     auto convert_layout = [](const TensorLayout& layout) {
-        auto ret = layout;
         auto param = layout.dtype.param<dtype::QuantizedS4>();
-        ret.dtype = dtype::QuantizedS8(param.scale);
+        TensorLayout ret(layout, dtype::QuantizedS8(param.scale));
         return ret;
     };
     TensorLayout A_layout, B_layout;
@@ -406,8 +406,7 @@ size_t PoolingForwardImpl::get_workspace_in_bytes(const TensorLayout& src,
 }
 
 namespace {
-void post_process(const TensorND& dst, TensorND& comp_dst, Handle* handle,
-                  WorkspaceBundle& workspace_bundle) {
+void post_process(const TensorND& dst, TensorND& comp_dst) {
     if (dst.layout.dtype.enumv() == DTypeEnum::QuantizedS4) {
         int8_to_int4(comp_dst, dst);
     } else if (dst.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
@@ -427,8 +426,8 @@ void PoolingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
     if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS4) {
         float scale = src.layout.dtype.param<dtype::QuantizedS4>().scale;
         comp_src.layout.dtype = dtype::QuantizedS8(scale);
-        comp_src.layout.init_contiguous_stride();
         comp_src.layout.format = TensorLayout::Format(comp_src.layout.dtype);
+        comp_src.layout.init_contiguous_stride();
         comp_src.raw_ptr = wsb.get(0);
         comp_dst.layout.dtype = dtype::QuantizedS8(scale);
         comp_dst.layout.format = TensorLayout::Format(comp_dst.layout.dtype);
@@ -571,7 +570,7 @@ void PoolingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
             default:                                          \
                 megdnn_assert(0, "not support mode");         \
         }                                                     \
-        post_process(dst, comp_dst, handle(), wsb);           \
+        post_process(dst, comp_dst);                          \
         return;                                               \
     }
@@ -132,7 +132,6 @@ public:
                                   : dtype::Float32());
             if (m_fmt.find(i) == m_fmt.end()) {
                 layouts[i] = TensorLayout(shapes[i], dt);
-                layouts[i].init_contiguous_stride();
             } else
                 layouts[i] = TensorLayout(shapes[i], dt, m_fmt[i]);
         }
@@ -325,13 +325,8 @@ TEST(BASIC_TYPES, TENSOR_LAYOUT_FMT_LOW_BITS) {
     layout = TensorLayout{{1, 32, 1, 1}, dtype::QuantizedS4{1.2f}};
     layout = layout.broadcast({16, 32, 7, 7});
-    EXPECT_EQ(make_layout({16, 32, 49}, {0, 1, 0}, dtype::QuantizedS4{1.2}),
+    EXPECT_EQ(make_layout({16, 32, 49}, {0, 2, 0}, dtype::QuantizedS4{1.2}),
               layout.collapse_contiguous());
-    layout = TensorLayout{{1, 32, 1, 1}, dtype::QuantizedS4{1.2f}};
-    layout.init_contiguous_stride();
-    layout = layout.broadcast({16, 32, 7, 7});
-    ASSERT_THROW(layout.span(), MegDNNError);
 }
 
 TEST(BASIC_TYPES, TENSOR_LAYOUT_FMT_LOW_BITS_VALID) {
@@ -342,7 +337,7 @@ TEST(BASIC_TYPES, TENSOR_LAYOUT_FMT_LOW_BITS_VALID) {
                               LowbitsAlignedToBytesTensorFormat::make(4_z)),
                  MegDNNError);
     ASSERT_THROW(TensorLayout({16, 32, 7, 7}, dtype::IntB2{},
-                              LowbitsAlignedToBytesTensorFormat::make(2_z)),
+                              LowbitsAlignedToBytesTensorFormat::make(4_z)),
                  MegDNNError);
 }
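For reference, a worked example of the layout this byte-aligned format produces when it is valid (numbers assumed from the rounding rule in init_contiguous_stride): a QuantizedS4 tensor of shape {16, 32, 7, 7} pads each 7-element row to 8 elements so every row starts on a byte boundary, giving element strides {1792, 56, 8, 1} instead of the compact {1568, 49, 7, 1}.

    TensorLayout ly({16, 32, 7, 7}, dtype::QuantizedS4{1.f},
                    LowbitsAlignedToBytesTensorFormat::make(4_z));
    megdnn_assert(ly.stride[3] == 1 && ly.stride[2] == 8 &&
                  ly.stride[1] == 56 && ly.stride[0] == 1792);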
@@ -343,6 +343,14 @@ static inline bool good_float(dt_qint32) {
     return true;
 }
 
+static inline bool good_float(dt_qint4) {
+    return true;
+}
+
+static inline bool good_float(dt_quint4) {
+    return true;
+}
+
 // A hack for the (x+0) promote to int trick on dt_quint8.
 static inline int operator+(dt_quint8 lhs, int rhs) {
     megdnn_assert(rhs == 0, "unexpected rhs");
@@ -545,12 +545,12 @@ TEST_F(NAIVE, WARP_PERSPECTIVE_NCHW64) {
     using Param = WarpPerspective::Param;
 
     auto convert_true_format = [](const TensorLayout& layout) {
-        if (layout.ndim == 4)
-            return layout
-                    .reshape({layout[0], layout[1] / 64, layout[2], layout[3],
-                              64})
-                    .dimshuffle({0, 1, 4, 2, 3});
-        else
+        if (layout.ndim == 4) {
+            TensorLayout ret{
+                    {layout[0], layout[1] / 64, layout[2], layout[3], 64},
+                    layout.dtype};
+            return ret.dimshuffle({0, 1, 4, 2, 3});
+        } else
             return layout;
     };
@@ -563,15 +563,16 @@ TEST_F(NAIVE, WARP_PERSPECTIVE_NCHW64) {
     TensorNDArray nchw_tensors;
     for (size_t i = 0; i < tensors.size(); ++i) {
+        TensorLayout ly;
         auto layout = tensors[i].layout;
-        if (layout.dtype.enumv() == DTypeEnum::QuantizedS4)
-            layout.dtype = dtype::QuantizedS4();
-        if (layout.ndim == 5) {
-            layout = layout.reshape({layout[0], layout[1] * layout[4],
-                                     layout[2], layout[3]});
+        if (tensors[i].layout.ndim == 5) {
+            ly = TensorLayout{{layout[0], layout[1] * layout[4], layout[2],
+                               layout[3]},
+                              layout.dtype};
+        } else {
+            ly = layout;
         }
-        nchw_tensors.emplace_back(malloc(layout.span().dist_byte()),
-                                  layout);
+        nchw_tensors.emplace_back(malloc(ly.span().dist_byte()), ly);
     }
     TensorNDArray nchw64_tensors;
     for (size_t i = 0; i < tensors.size(); ++i) {
@@ -617,13 +618,11 @@ TEST_F(NAIVE, WARP_PERSPECTIVE_NCHW64) {
         checker.set_param(param);
         checker.execs({{2, 1, 10, 10, 64}, {2, 3, 3}, {2, 1, 10, 12, 64}});
         checker.execs(
-                {{20, 30, 10, 12, 64}, {20, 3, 3}, {20, 30, 11, 12, 64}});
-        checker.execs(
-                {{220, 3, 10, 10, 64}, {220, 3, 3}, {220, 3, 10, 12, 64}});
-        checker.execs({{1, 25, 25, 24, 64}, {1, 3, 3}, {1, 25, 25, 510, 64}});
-        checker.execs({{1, 25, 25, 510, 64}, {1, 3, 3}, {1, 25, 25, 24, 64}});
-        checker.execs({{1, 25, 25, 24, 64}, {1, 3, 3}, {1, 25, 51, 50, 64}});
-        checker.execs({{1, 25, 51, 50, 64}, {1, 3, 3}, {1, 25, 25, 24, 64}});
+                {{20, 3, 10, 12, 64}, {20, 3, 3}, {20, 3, 11, 12, 64}});
+        checker.execs({{1, 3, 25, 24, 64}, {1, 3, 3}, {1, 3, 25, 51, 64}});
+        checker.execs({{1, 3, 25, 51, 64}, {1, 3, 3}, {1, 3, 25, 24, 64}});
+        checker.execs({{1, 3, 25, 24, 64}, {1, 3, 3}, {1, 3, 51, 50, 64}});
+        checker.execs({{1, 3, 51, 50, 64}, {1, 3, 3}, {1, 3, 25, 24, 64}});
     }
 }
 
 // vim: syntax=cpp.doxygen
@@ -18,6 +18,7 @@
 #include "megbrain/graph/cg.h"
 #include "megbrain/tensor.h"
 #include "megbrain/utils/mempool.h"
 
 #include "./numpy_dtypes.h"
 
 namespace py = pybind11;
@@ -390,16 +391,24 @@ HostTensorND lowbit_ndarray_to_host_tensor(
     } else {
         mgb_assert(layout.ndim && layout.ndim <= TensorShape::MAX_NDIM,
                    "unsupported ndim %zu", layout.ndim);
-        for (size_t i = 0; i < layout.ndim; ++ i) {
-            layout.shape[i] = PyArray_SHAPE(input)[i];
-            layout.stride[i] = PyArray_STRIDE(input, i);
+        TensorLayout ly;
+        ly.ndim = layout.ndim;
+        for (size_t i = 0; i < layout.ndim; ++i) {
+            ly.shape[i] = layout.shape[i] = PyArray_SHAPE(input)[i];
+            ly.stride[i] = PyArray_STRIDE(input, i);
             mgb_assert(layout.shape[i], "zero shape not supported");
         }
-        mgb_assert(layout.is_contiguous());
+        mgb_assert(ly.is_physical_contiguous());
+        layout.init_contiguous_stride();
     }
     HostTensorND ret{comp_node, layout};
-    lowbit_memcpy_byte2compact(layout.dtype, ret.raw_ptr(), src_ptr,
-                               layout.total_nr_elems());
+    if (layout.format.is_lowbit_aligned()) {
+        mgb_assert(layout.is_contiguous());
+        lowbit_memcpy_byte2aligned(ret.raw_ptr(), src_ptr, layout);
+    } else {
+        lowbit_memcpy_byte2compact(layout.dtype, ret.raw_ptr(), src_ptr,
+                                   layout.total_nr_elems());
+    }
     return ret;
 }
@@ -423,10 +432,8 @@ std::pair<HostTensorND, bool> np2tensor_try_borrow(
     }
 
     // make result from PyArrayObject; its reference may be stolen
-    auto make_from_arr = [&](PyArrayObject *input, bool allow_borrow) {
-        TensorLayout layout;
-        layout.dtype = dtype_np2mgb_descr(PyArray_DESCR(input));
+    auto make_from_arr = [&](PyArrayObject* input, bool allow_borrow) {
+        TensorLayout layout{{}, dtype_np2mgb_descr(PyArray_DESCR(input))};
         if (dtype.valid())
             mgb_assert(dtype == layout.dtype);
         layout.ndim = PyArray_NDIM(input);
@@ -605,8 +612,15 @@ PyObject* ndarray_from_tensor(
     if (val.dtype().is_low_bit()) {
         mgb_assert(share_type != ShareType::MUST_SHARE,
                    "can not share memory for lowbit dtype");
-        lowbit_memcpy_compact2byte(val.dtype(), alloc_new_ret(), val.raw_ptr(),
-                                   val.layout().total_nr_elems());
+        const auto& layout = val.layout();
+        if (layout.format.is_lowbit_aligned()) {
+            lowbit_memcpy_aligned2byte(alloc_new_ret(), val.raw_ptr(),
+                                       val.layout());
+        } else {
+            lowbit_memcpy_compact2byte(val.dtype(), alloc_new_ret(),
+                                       val.raw_ptr(),
+                                       val.layout().total_nr_elems());
+        }
     } else if (share_type == ShareType::MUST_UNSHARE) {
         memcpy(alloc_new_ret(), val.raw_ptr(), val.layout().span().dist_byte());
     } else {
@@ -290,7 +290,7 @@ Tensor::Tensor(const DeviceTensorND &dv, const HostTensorND& hv) {
 }
 
 Tensor::Tensor(const TensorLayout& layout, const CompNode& cn)
-        : m_layout{layout}, m_blob{Blob::make(cn, layout.dtype.size(layout.total_nr_elems()))},
+        : m_layout{layout}, m_blob{Blob::make(cn, layout.span().dist_byte())},
          m_offset{0} {}
 
 Tensor::Tensor(const BlobPtr blob, const size_t offset, const TensorLayout& layout)
@@ -359,19 +359,6 @@ struct LowbitMemcpy<bits, true> {
     }
 };
 
-template<typename DT>
-struct QuantizedLowbitTrait;
-
-template<>
-struct QuantizedLowbitTrait<dtype::Quantized4Asymm> {
-    static constexpr int8_t SHIFT = 0;
-};
-
-template<>
-struct QuantizedLowbitTrait<dtype::QuantizedS4> {
-    static constexpr int8_t SHIFT = 8;
-};
-
 template <typename DT, bool div_byte = (DTypeTrait<DT>::category ==
                                         DTypeCategory::QUANTIZED) &&
                                        (8 % DTypeTrait<DT>::low_bit == 0)>
@@ -452,4 +439,44 @@ void mgb::lowbit_memcpy_compact2byte(
     mgb_throw(MegBrainError, "bad dtype for lowbit: %s", dtype.name());
 }
 
+void mgb::lowbit_memcpy_byte2aligned(void* dest, const void* src,
+                                     const ::megdnn::TensorLayout& layout) {
+    size_t low_bit = layout.dtype.low_bit();
+    size_t dim = layout.shape[layout.ndim - 1];
+    if ((dim * low_bit) % 8) {  // padding
+        size_t n = layout.total_nr_elems();
+        size_t stride = divup<size_t>(dim * low_bit, 8);
+        dt_byte* dest_ptr = reinterpret_cast<dt_byte*>(dest);
+        const dt_byte* src_ptr = reinterpret_cast<const dt_byte*>(src);
+        for (size_t i = 0; i < n / dim; ++i) {
+            lowbit_memcpy_byte2compact(layout.dtype, dest_ptr, src_ptr, dim);
+            dest_ptr += stride;
+            src_ptr += dim;
+        }
+    } else {
+        lowbit_memcpy_byte2compact(layout.dtype, dest, src,
+                                   layout.total_nr_elems());
+    }
+}
+
+void mgb::lowbit_memcpy_aligned2byte(void* dest, const void* src,
+                                     const ::megdnn::TensorLayout& layout) {
+    size_t low_bit = layout.dtype.low_bit();
+    size_t dim = layout.shape[layout.ndim - 1];
+    if ((dim * low_bit) % 8) {  // padding
+        size_t n = layout.total_nr_elems();
+        size_t stride = divup<size_t>(dim * low_bit, 8);
+        dt_byte* dest_ptr = reinterpret_cast<dt_byte*>(dest);
+        const dt_byte* src_ptr = reinterpret_cast<const dt_byte*>(src);
+        for (size_t i = 0; i < n / dim; ++i) {
+            lowbit_memcpy_compact2byte(layout.dtype, dest_ptr, src_ptr, dim);
+            dest_ptr += dim;
+            src_ptr += stride;
+        }
+    } else {
+        lowbit_memcpy_compact2byte(layout.dtype, dest, src,
+                                   layout.total_nr_elems());
+    }
+}
+
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
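A hedged sketch of the copy the new byte2aligned helper performs in the padded case: for a QuantizedS4 layout whose innermost dim is 7, each row occupies divup(7 * 4, 8) = 4 bytes in the aligned buffer, so the destination pointer advances 4 bytes per row while the byte-expanded source advances 7 (the local buffer names below are illustrative only).

    TensorLayout ly({2, 7}, dtype::QuantizedS4{1.f});     // lowbit-aligned format derived from the dtype
    std::vector<uint8_t> expanded(2 * 7);                 // one int4 value stored per byte
    std::vector<uint8_t> packed(ly.span().dist_byte());   // 2 rows * 4 bytes = 8 bytes
    mgb::lowbit_memcpy_byte2aligned(packed.data(), expanded.data(), ly);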
@@ -1340,15 +1340,19 @@ void VarNodeMemManager::make_dev_tensor_from_mem_plan_single(
 void VarNodeMemManager::var_alloc_with_shape(VarNode* var,
                                              const TensorShape& shape,
                                              size_t size_req) {
-    mgb_assert(var->format().is_default(),
+    bool cond_default = var->format().is_default();
+    bool cond_lowbit = var->dtype().is_quantized_lowbit() &&
+                       var->format().is_lowbit_aligned();
+    mgb_assert(cond_default || cond_lowbit,
                "dynamic shape is currently only supported for var with "
                "default format; got %s",
                var->format().to_string().c_str());
     var->shape(shape);
+    TensorLayout ly{shape, var->dtype()};
     if (size_req != 0) {
-        mgb_assert(var->dtype().size(shape.total_nr_elems()) <= size_req);
+        mgb_assert(ly.span().dist_byte() <= size_req);
     } else {
-        size_req = var->dtype().size(shape.total_nr_elems());
+        size_req = ly.span().dist_byte();
     }
 
     auto&& mplan = var->m_mem_plan;
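Worked example of why the byte count is now taken from the layout span rather than from the raw element count (shape and scale below are illustrative): for shape {32, 3, 3} with QuantizedS4, dtype().size(total_nr_elems()) = 288 * 4 / 8 = 144 bytes, whereas the lowbit-aligned layout pads every 3-element row to 4 elements, so span().dist_byte() = 32 * 12 * 4 / 8 = 192 bytes.

    TensorLayout ly{{32, 3, 3}, dtype::QuantizedS4{1.f}};
    // ly.span().dist_byte() == 192, while ly.dtype.size(ly.total_nr_elems()) == 144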
@@ -202,6 +202,17 @@ void lowbit_memcpy_byte2compact(
 void lowbit_memcpy_compact2byte(
         DType dtype, void *dest, const void *src, size_t n);
 
+/*!
+ * \brief copy from byte representation to an aligned tensor for lowbit types
+ */
+void lowbit_memcpy_byte2aligned(void* dest, const void* src,
+                                const ::megdnn::TensorLayout& ly);
+
+/*!
+ * \brief copy from an aligned tensor to byte representation for lowbit types
+ */
+void lowbit_memcpy_aligned2byte(void* dest, const void* src,
+                                const ::megdnn::TensorLayout& ly);
+
 }  // namespace mgb
@@ -4454,314 +4454,6 @@ TEST(TestGoptInference, PaddingChannelsWithWarpPerspective) {
     MGB_ASSERT_TENSOR_EQ(t1, t2);
 }
 
-TEST(TestGoptInference, EnableNCHW64Basic) {
-    REQUIRE_GPU(1);
-    auto cn = CompNode::load("gpu0");
-    cn.activate();
-    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
-    HostTensorGenerator<dtype::Int8> gen;
-    auto graph = ComputingGraph::make();
-    graph->options().graph_opt_level = 0;
-    auto mkvar = [&](const char* name, const TensorShape& shp,
-                     const DType& dtype) {
-        return opr::TypeCvt::make(
-                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
-                dtype);
-    };
-    auto mkcvar = [&](const char* name, const TensorShape& shp,
-                      const DType& dtype) {
-        return opr::TypeCvt::make(
-                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
-                        .rename(name),
-                dtype);
-    };
-    auto x = mkvar("x", {16, 4, 14, 14}, dtype::QuantizedS8(2.5f)),
-         w = mkcvar("w", {16, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
-         b = mkcvar("b", {1, 16, 1, 1}, dtype::QuantizedS32(6.25f));
-    opr::ConvBias::Param param;
-    param.format = opr::ConvBias::Param::Format::NCHW;
-    param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
-    param.stride_h = param.stride_w = 1;
-    param.pad_h = param.pad_w = 1;
-    auto y = opr::ConvBias::make(x, w, b, param, {},
-                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
-    auto w1 = mkcvar("w1", {32, 16, 3, 3}, dtype::QuantizedS8(2.5f)),
-         b1 = mkcvar("b1", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f));
-    auto y1 = opr::ConvBias::make(y, w1, b1, param, {},
-                                  OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
-    auto w2 = mkcvar("w2", {64, 32, 3, 3}, dtype::QuantizedS8(2.5f)),
-         b2 = mkcvar("b2", {1, 64, 1, 1}, dtype::QuantizedS32(6.25f));
-    auto y2 = opr::ConvBias::make(y1, w2, b2, param, {},
-                                  OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
-    y2 = opr::TypeCvt::make(y2, dtype::QuantizedS4{40.f});
-    auto w3 = mkcvar("w3", {64, 64, 3, 3}, dtype::QuantizedS4(2.5f)),
-         b3 = mkcvar("b3", {1, 64, 1, 1}, dtype::QuantizedS32(100.f));
-    auto y3 = opr::ConvBias::make(y2, w3, b3, param, {},
-                                  OperatorNodeConfig{dtype::QuantizedS4{40.f}});
-    y3 = opr::TypeCvt::make(y3, dtype::QuantizedS8{2.5f});
-    auto w4 = mkcvar("w4", {16, 64, 3, 3}, dtype::QuantizedS8(2.5f)),
-         b4 = mkcvar("b4", {1, 16, 1, 1}, dtype::QuantizedS32(6.25f));
-    auto y4 = opr::ConvBias::make(y3, w4, b4, param, {},
-                                  OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
-    using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode;
-    auto y5 = opr::ElemwiseMultiType::make(
-            {y, y4}, {ElemMultiMode::QFUSE_ADD_RELU},
-            OperatorNodeConfig{dtype::QuantizedS8{1.3f}});
-    y5 = opr::TypeCvt::make(y5, dtype::Float32());
-    SymbolVar y5_pad;
-    unpack_vector(
-            gopt::GraphOptimizer{}
-                    .add_pass(gopt::EnableNCHW64Pass::make_nchw64_converter())
-                    .apply({{y5}})
-                    .endpoint_vars(),
-            y5_pad);
-    EXPECT_TRUE(y5.node()->shape().eq_shape(y5_pad.node()->shape()));
-    SmallVector<cg::OperatorNodeBase*> oprs;
-    auto cb = [&oprs](cg::OperatorNodeBase* opr) {
-        if (opr->same_type<opr::ConvBias>()) {
-            oprs.push_back(opr);
-        }
-    };
-    cg::DepOprIter{cb}.add(y5_pad.node()->owner_opr());
-    ASSERT_EQ(oprs.size(), 5);
-    using Format = opr::ConvBiasForward::Param::Format;
-#define CHECK(_i, _fmt)                                            \
-    {                                                              \
-        const auto& o = oprs[_i]->cast_final<opr::ConvBias>();     \
-        ASSERT_EQ(o.param().format, Format::_fmt);                 \
-    }
-    CHECK(0, NCHW4);
-    CHECK(1, NCHW4);
-    CHECK(2, NCHW32);
-    CHECK(3, NCHW64);
-    CHECK(4, NCHW4);
-#undef CHECK
-    HostTensorND t1, t2;
-    auto func1 = graph->compile({make_callback_copy(y5, t1)});
-    func1->execute();
-    auto func2 = graph->compile({make_callback_copy(y5_pad, t2)});
-    func2->execute();
-    MGB_ASSERT_TENSOR_EQ(t1, t2);
-}
-
-TEST(TestGoptInference, EnableNCHW64PaddingChannel) {
-    REQUIRE_GPU(1);
-    auto cn = CompNode::load("gpu0");
-    cn.activate();
-    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
-    HostTensorGenerator<dtype::Int8> gen;
-    auto graph = ComputingGraph::make();
-    graph->options().graph_opt_level = 0;
-    auto mkvar = [&](const char* name, const TensorShape& shp,
-                     const DType& dtype) {
-        return opr::TypeCvt::make(
-                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
-                dtype);
-    };
-    auto mkcvar = [&](const char* name, const TensorShape& shp,
-                      const DType& dtype) {
-        return opr::TypeCvt::make(
-                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
-                        .rename(name),
-                dtype);
-    };
-    auto x = mkvar("x", {16, 4, 14, 14}, dtype::QuantizedS8(2.5f)),
-         w = mkcvar("w", {20, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
-         b = mkcvar("b", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f));
-    opr::ConvBias::Param param;
-    param.format = opr::ConvBias::Param::Format::NCHW;
-    param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
-    param.stride_h = param.stride_w = 1;
-    param.pad_h = param.pad_w = 1;
-    auto y = opr::ConvBias::make(x, w, b, param, {},
-                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
-    opr::Pooling::Param pool;
-    pool.format = opr::Pooling::Param::Format::NCHW;
-    y = opr::Pooling::make(y, pool);
-    auto w1 = mkcvar("w1", {24, 20, 3, 3}, dtype::QuantizedS8(2.5f)),
-         b1 = mkcvar("b1", {1, 24, 1, 1}, dtype::QuantizedS32(6.25f));
-    auto y1 = opr::ConvBias::make(y, w1, b1, param, {},
-                                  OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
-    auto w2 = mkcvar("w2", {20, 24, 3, 3}, dtype::QuantizedS8(2.5f)),
-         b2 = mkcvar("b2", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f));
-    auto y2 = opr::ConvBias::make(y1, w2, b2, param, {},
-                                  OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
-    y2 = opr::TypeCvt::make(y2, dtype::QuantizedS4{40.f});
-    auto w3 = mkcvar("w3", {64, 20, 3, 3}, dtype::QuantizedS4(2.5f)),
-         b3 = mkcvar("b3", {1, 64, 1, 1}, dtype::QuantizedS32(100.f));
-    auto y3 = opr::ConvBias::make(y2, w3, b3, param, {},
-                                  OperatorNodeConfig{dtype::QuantizedS4{40.f}});
-    auto w4 = mkcvar("w4", {20, 64, 3, 3}, dtype::QuantizedS4(2.5f)),
-         b4 = mkcvar("b4", {1, 20, 1, 1}, dtype::QuantizedS32(100.f));
-    auto y4 = opr::ConvBias::make(y3, w4, b4, param, {},
-                                  OperatorNodeConfig{dtype::QuantizedS4{40.f}});
-    y4 = opr::TypeCvt::make(y4, dtype::QuantizedS8{2.5f});
-    using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode;
-    auto y5 = opr::ElemwiseMultiType::make(
-            {y, y4}, {ElemMultiMode::QFUSE_ADD_RELU},
-            OperatorNodeConfig{dtype::QuantizedS8{1.2f}});
-    opr::ConvolutionBackwardData::Param deconv;
-    deconv.format = opr::ConvolutionBackwardData::Param::Format::NCHW;
-    deconv.stride_h = deconv.stride_w = 2;
-    deconv.pad_h = deconv.pad_w = 1;
-    auto w6 = mkcvar("w6", {20, 64, 4, 4}, dtype::QuantizedS8{2.5f});
-    auto y6 = opr::ConvolutionBackwardData::make(
-            w6, y5, deconv, {},
-            OperatorNodeConfig{dtype::QuantizedS8(2.0f)});
-    y6 = opr::TypeCvt::make(y6, dtype::QuantizedS4{32.f});
-    std::shared_ptr<HostTensorND> mat = std::make_shared<HostTensorND>(
-            cn, TensorShape{16, 3, 3}, dtype::Float32());
-    warp_perspective_mat_gen(*mat, 16, 14, 14);
-    auto mat_var = opr::Host2DeviceCopy::make(*graph, mat).rename("mat");
-    opr::WarpPerspective::Param warp_param;
-    warp_param.format = opr::WarpPerspective::Param::Format::NCHW;
-    auto y7 = opr::WarpPerspective::make(y6, mat_var, TensorShape{14, 14},
-                                         warp_param);
-    y7 = opr::TypeCvt::make(y7, dtype::Float32());
-    SymbolVar y7_pad;
-    auto opt = gopt::OptimizeForInferenceOptions{};
-    opt.enable_nchw64();
-    unpack_vector(gopt::optimize_for_inference({y7}, opt), y7_pad);
-    EXPECT_TRUE(y7.node()->shape().eq_shape(y7_pad.node()->shape()));
-    HostTensorND t1, t2;
-    auto func1 = graph->compile({make_callback_copy(y7, t1)});
-    func1->execute();
-    auto func2 = graph->compile({make_callback_copy(y7_pad, t2)});
-    func2->execute();
-    MGB_ASSERT_TENSOR_EQ(t1, t2);
-    using Format = opr::ConvBiasForward::Param::Format;
-    SmallVector<cg::OperatorNodeBase*> oprs;
-    auto cb = [&oprs](cg::OperatorNodeBase* opr) {
-        if (opr->same_type<opr::ConvBias>()) {
-            oprs.push_back(opr);
-        }
-    };
-    cg::DepOprIter{cb}.add(y7_pad.node()->owner_opr());
-    ASSERT_EQ(oprs.size(), 5);
-#define CHECK(_i, _fmt)                                            \
-    {                                                              \
-        const auto& o = oprs[_i]->cast_final<opr::ConvBias>();     \
-        ASSERT_EQ(o.param().format, Format::_fmt);                 \
-    }
-    CHECK(0, NCHW4);
-    CHECK(1, NCHW32);
-    CHECK(2, NCHW32);
-    CHECK(3, NCHW64);
-    CHECK(4, NCHW64);
-#undef CHECK
-    {
-        const auto& deconv = find_opr<opr::ConvolutionBackwardData>(y7_pad);
-        ASSERT_EQ(deconv.param().format, Format::NCHW4);
-        const auto& pool = find_opr<opr::PoolingForward>(y7_pad);
-        ASSERT_EQ(pool.param().format, Format::NCHW4);
-        const auto& warp = find_opr<opr::WarpPerspectiveForward>(y7_pad);
-        ASSERT_EQ(warp.param().format, Format::NCHW64);
-    }
-    size_t nr_dimshuffle = find_opr_num<opr::Dimshuffle>(y7_pad);
-    ASSERT_EQ(nr_dimshuffle, 8);
-}
-
-TEST(TestGoptInference, EnableNCHW64FuseConvBiasZ) {
-    REQUIRE_GPU(1);
-    auto cn = CompNode::load("gpu0");
-    cn.activate();
-    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
-    HostTensorND t1, t2;
-    HostTensorGenerator<dtype::Int8> gen;
-    auto graph = ComputingGraph::make();
-    graph->options().graph_opt_level = 0;
-    auto mkvar = [&](const char* name, const TensorShape& shp,
-                     const DType& dtype) {
-        return opr::TypeCvt::make(
-                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
-                dtype);
-    };
-    auto mkcvar = [&](const char* name, const TensorShape& shp,
-                      const DType& dtype) {
-        return opr::TypeCvt::make(
-                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
-                        .rename(name),
-                dtype);
-    };
-    auto x = mkvar("x", {16, 4, 14, 14}, dtype::QuantizedS8(2.5f)),
-         w = mkcvar("w", {32, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
-         b = mkcvar("b", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f));
-    opr::ConvBias::Param param;
-    param.format = opr::ConvBias::Param::Format::NCHW;
-    param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
-    param.stride_h = param.stride_w = 1;
-    param.pad_h = param.pad_w = 1;
-    auto y = opr::ConvBias::make(x, w, b, param, {},
-                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
-    auto w1 = mkcvar("w1", {64, 32, 3, 3}, dtype::QuantizedS8(2.5f)),
-         b1 = mkcvar("b1", {1, 64, 1, 1}, dtype::QuantizedS32(6.25f));
-    auto y1 = opr::ConvBias::make(y, w1, b1, param, {},
-                                  OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
-    y1 = opr::TypeCvt::make(y1, dtype::QuantizedS4{40.f});
-    auto w2 = mkcvar("w2", {64, 64, 3, 3}, dtype::QuantizedS4(2.5f)),
-         b2 = mkcvar("b2", {1, 64, 1, 1}, dtype::QuantizedS32(100.f));
-    auto y2 = opr::ConvBias::make(y1, w2, b2, param, {},
-                                  OperatorNodeConfig{dtype::QuantizedS4{40.f}});
-    auto w3 = mkcvar("w3", {64, 64, 3, 3}, dtype::QuantizedS4(2.5f)),
-         b3 = mkcvar("b3", {1, 64, 1, 1}, dtype::QuantizedS32(100.f));
-    auto y3 = opr::ConvBias::make(y2, w3, b3, param, {},
-                                  OperatorNodeConfig{dtype::QuantizedS4(40.f)});
-    using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode;
-    auto y4 = opr::ElemwiseMultiType::make(
-            {y1, y3}, {ElemMultiMode::QFUSE_ADD_RELU},
-            OperatorNodeConfig{dtype::QuantizedS4{40.f}});
-    y4 = opr::TypeCvt::make(y4, dtype::Float32());
-    auto y5 = opr::ConvBias::make(y2, w3, b3, y1, param, {},
-                                  OperatorNodeConfig{dtype::QuantizedS4(40.f)});
-    y5 = opr::TypeCvt::make(y5, dtype::Float32());
-    SymbolVar y4_pad;
-    auto opt = gopt::OptimizeForInferenceOptions{};
-    opt.enable_nchw64();
-    unpack_vector(gopt::optimize_for_inference({y4}, opt), y4_pad);
-    EXPECT_TRUE(y4.node()->shape().eq_shape(y4_pad.node()->shape()));
-    size_t nr_elem_mult_type = find_opr_num<opr::ElemwiseMultiType>(y4_pad);
-    ASSERT_EQ(nr_elem_mult_type, 0);
-    auto func = graph->compile({make_callback_copy(y4_pad, t1)});
-    func->execute();
-    {
-        opr::ConvBias::Param param;
-        param.format = opr::ConvBias::Param::Format::NCHW;
-        param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
-        param.stride_h = param.stride_w = 1;
-        param.pad_h = param.pad_w = 1;
-        auto y = opr::ConvBias::make(
-                x, w, b, param, {},
-                OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
-        auto y1 = opr::ConvBias::make(
-                y, w1, b1, param, {},
-                OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
-        y1 = opr::TypeCvt::make(y1, dtype::QuantizedS4{40.f});
-        auto y2 = opr::ConvBias::make(
-                y1, w2, b2, param, {},
-                OperatorNodeConfig{dtype::QuantizedS4{40.f}});
-        param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
-        auto y3 = opr::ConvBias::make(
-                y2, w3, b3, y1, param, {},
-                OperatorNodeConfig{dtype::QuantizedS4(40.f)});
-        y3 = opr::TypeCvt::make(y3, dtype::Float32());
-        auto func = graph->compile({make_callback_copy(y3, t2)});
-        func->execute();
-    }
-    MGB_ASSERT_TENSOR_EQ(t1, t2);
-}
-
 #endif
@@ -2604,174 +2604,6 @@ TEST_F(TestNoWeightPreprocess, NoPreprocess) {
 #endif
 
 namespace {
-TEST(TestOprDNN, ConvBiasInt4NCHW) {
-    REQUIRE_GPU(1);
-    auto cn = CompNode::load("gpu0");
-    cn.activate();
-    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
-    auto sm_ver = prop.major * 10 + prop.minor;
-    if (sm_ver != 75) {
-        printf("This testcast ignored due to insufficient cuda cap(got: %d, "
-               "expected: %d)\n",
-               sm_ver, 75);
-        return;
-    }
-    auto run = [&cn](size_t N, size_t C, size_t H, size_t W, size_t F, size_t S,
-                     size_t P) {
-        auto graph = ComputingGraph::make();
-        HostTensorGenerator<dtype::Int8> gen;
-        auto mkvar = [&gen](const char* name, const TensorShape& shp,
-                            const DType& dtype,
-                            std::shared_ptr<ComputingGraph> graph,
-                            const CompNode& cn) {
-            return opr::TypeCvt::make(
-                    opr::Host2DeviceCopy::make(*graph, gen(shp, cn))
-                            .rename(name),
-                    dtype);
-        };
-        auto mkcvar = [&gen](const char* name, const TensorShape& shp,
-                             const DType& dtype,
-                             std::shared_ptr<ComputingGraph> graph,
-                             const CompNode& cn) {
-            return opr::TypeCvt::make(
-                    opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
-                            .rename(name),
-                    dtype);
-        };
-        using Policy = opr::ConvBias::ExecutionPolicy;
-        using Strategy = Policy::Strategy;
-        auto x = mkvar("x", {N, C * 4, H, W}, dtype::QuantizedS4(1.19960327f),
-                       graph, cn),
-             w = mkcvar("w1", {C, C * 4, F, F}, dtype::QuantizedS4(1.19970327f),
-                        graph, cn),
-             b = mkcvar("b1", {1, C, 1, 1},
-                        dtype::QuantizedS32(1.19960327f * 1.19970327f), graph,
-                        cn);
-        opr::ConvBias::Param param;
-        param.format = opr::ConvBias::Param::Format::NCHW;
-        param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
-        param.stride_h = param.stride_w = S;
-        param.pad_h = param.pad_w = P;
-        Policy policy;
-        policy.strategy = Strategy::PROFILE;
-        auto y = opr::ConvBias::make(
-                x, w, b, param, policy,
-                OperatorNodeConfig{dtype::QuantizedS4(11.9960501f)});
-        y = opr::TypeCvt::make(y, dtype::Float32());
-        auto x_f32 = opr::TypeCvt::make(x, dtype::Float32()),
-             w_f32 = opr::TypeCvt::make(w, dtype::Float32()),
-             b_f32 = opr::TypeCvt::make(b, dtype::Float32());
-        auto y_f32 = opr::ConvBias::make(x_f32, w_f32, b_f32, param, policy);
-        auto y_q4 = opr::TypeCvt::make(y_f32, dtype::QuantizedS4{11.9960501f});
-        y_q4 = opr::TypeCvt::make(y_q4, dtype::Float32());
-        HostTensorND host_y, host_y_q4;
-        auto func = graph->compile({make_callback_copy(y, host_y),
-                                    make_callback_copy(y_q4, host_y_q4)});
-        func->execute();
-        MGB_ASSERT_TENSOR_NEAR(host_y, host_y_q4, 1e-3);
-    };
-    run(2, 64, 14, 14, 3, 2, 1);
-    run(2, 64, 7, 7, 3, 1, 1);
-    run(2, 64, 14, 14, 1, 2, 0);
-    run(2, 64, 7, 7, 1, 1, 0);
-}
-
-TEST(TestOprDNN, ConvBiasInt4NCHW64) {
-    REQUIRE_GPU(1);
-    auto cn = CompNode::load("gpu0");
-    cn.activate();
-    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
-    auto sm_ver = prop.major * 10 + prop.minor;
-    if (sm_ver != 75) {
-        printf("This testcast ignored due to insufficient cuda cap(got: %d, "
-               "expected: %d)\n",
-               sm_ver, 75);
-        return;
-    }
-    auto nchw2nchw64 = [](SymbolVar x) {
-        auto y = opr::RelayoutFormat::make(
-                x, opr::RelayoutFormat::Param::Mode::NCHW_NCHW64);
-        return y;
-    };
-    auto nchw642nchw = [](SymbolVar x) {
-        auto y = opr::RelayoutFormat::make(
-                x, opr::RelayoutFormat::Param::Mode::NCHW64_NCHW);
-        return y;
-    };
-    auto run = [&](size_t N, size_t C, size_t H, size_t W, size_t F, size_t S,
-                   size_t P) {
-        auto graph = ComputingGraph::make();
-        HostTensorGenerator<dtype::Int8> gen;
-        auto mkvar = [&gen](const char* name, const TensorShape& shp,
-                            const DType& dtype,
-                            std::shared_ptr<ComputingGraph> graph,
-                            const CompNode& cn) {
-            return opr::TypeCvt::make(
-                    opr::Host2DeviceCopy::make(*graph, gen(shp, cn))
-                            .rename(name),
-                    dtype);
-        };
-        auto mkcvar = [&gen](const char* name, const TensorShape& shp,
-                             const DType& dtype,
-                             std::shared_ptr<ComputingGraph> graph,
-                             const CompNode& cn) {
-            return opr::TypeCvt::make(
-                    opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
-                            .rename(name),
-                    dtype);
-        };
-        using Policy = opr::ConvBias::ExecutionPolicy;
-        using Strategy = Policy::Strategy;
-        auto x = mkvar("x", {N, C / 16, H, W, 64},
-                       dtype::QuantizedS4(1.19960327f), graph, cn),
-             w = mkcvar("w1", {C, C / 16, F, F, 64},
-                        dtype::QuantizedS4(1.19970327f), graph, cn),
-             b = mkcvar("b1", {1, C / 64, 1, 1, 64},
-                        dtype::QuantizedS32(1.19960327f * 1.19970327f), graph,
-                        cn);
-        opr::ConvBias::Param param;
-        param.format = opr::ConvBias::Param::Format::NCHW64;
-        param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
-        param.stride_h = param.stride_w = S;
-        param.pad_h = param.pad_w = P;
-        Policy policy;
-        policy.strategy = Strategy::PROFILE;
-        auto y = opr::ConvBias::make(
-                x, w, b, param, policy,
-                OperatorNodeConfig{dtype::QuantizedS4(11.9960501f)});
-        y = opr::TypeCvt::make(y, dtype::Float32());
-        x = nchw642nchw(x);
-        w = nchw642nchw(w);
-        b = nchw642nchw(b);
-        auto x_f32 = opr::TypeCvt::make(x, dtype::Float32()),
-             w_f32 = opr::TypeCvt::make(w, dtype::Float32()),
-             b_f32 = opr::TypeCvt::make(b, dtype::Float32());
-        param.format = opr::ConvBias::Param::Format::NCHW;
-        auto y_f32 = opr::ConvBias::make(x_f32, w_f32, b_f32, param, policy);
-        auto y_q4 = opr::TypeCvt::make(y_f32, dtype::QuantizedS4{11.9960501f});
-        y_q4 = opr::TypeCvt::make(y_q4, dtype::Float32());
-        y_q4 = nchw2nchw64(y_q4);
-        HostTensorND host_y, host_y_q4;
-        auto func = graph->compile({make_callback_copy(y, host_y),
-                                    make_callback_copy(y_q4, host_y_q4)});
-        func->execute();
-        MGB_ASSERT_TENSOR_NEAR(host_y, host_y_q4, 1e-3);
-    };
-    run(2, 64, 14, 14, 3, 2, 1);
-    run(2, 64, 7, 7, 3, 1, 1);
-    run(2, 64, 14, 14, 1, 2, 0);
-    run(2, 64, 7, 7, 1, 1, 0);
-}
-
 TEST(TestOprDNN, ConvBiasInt4Serialize) {
     using namespace serialization;
@@ -2783,7 +2615,7 @@ TEST(TestOprDNN, ConvBiasInt4Serialize) {
     HostTensorGenerator<dtype::Int8> gen;
     std::shared_ptr<HostTensorND> xv;
 
-    auto mkvar = [&gen](const char* name, const DType& dtype,
+    auto mkvar = [](const char* name, const DType& dtype,
                     std::shared_ptr<ComputingGraph> graph,
                     std::shared_ptr<HostTensorND> val) {
         return opr::TypeCvt::make(
@@ -2856,9 +2688,9 @@ TEST(TestOprDNN, ConvBiasInt4SerializeWithParamFuse) {
     HostTensorGenerator<dtype::Int8> gen;
     std::shared_ptr<HostTensorND> xv;
 
-    auto mkvar = [&gen](const char* name, const DType& dtype,
-                        std::shared_ptr<ComputingGraph> graph,
-                        std::shared_ptr<HostTensorND> val) {
+    auto mkvar = [](const char* name, const DType& dtype,
+                    std::shared_ptr<ComputingGraph> graph,
+                    std::shared_ptr<HostTensorND> val) {
         return opr::TypeCvt::make(
                 opr::Host2DeviceCopy::make(*graph, val).rename(name), dtype);
     };
@@ -62,7 +62,12 @@ bool contains_any_in_set(const SmallVector<T>& list,
 void check_tensor_value_valid(const std::string& name,
                               const HostTensorND& tensor) {
-    mgb_assert(tensor.layout().is_physical_contiguous(),
+    bool cond_normal = tensor.layout().format.is_default() &&
+                       tensor.layout().is_physical_contiguous();
+    bool cond_lowbit = tensor.layout().dtype.is_quantized_lowbit() &&
+                       tensor.layout().format.is_lowbit_aligned() &&
+                       tensor.layout().is_contiguous();
+    mgb_assert(cond_normal || cond_lowbit,
                "non-contiguous tensor: name=%s layout=%s", name.c_str(),
                tensor.layout().to_string().c_str());
     if (tensor.dtype() == dtype::Float32()) {
@@ -585,11 +590,12 @@ TensorLayout load_tensor_layout(const fbs::Tensor* tensor) {
         layout.ndim = tensor->shape()->size();
         std::copy(tensor->shape()->begin(), tensor->shape()->end(),
                   layout.shape);
-        layout.init_contiguous_stride();
     }
     if (tensor->dtype()) {
-        layout.dtype = fbs::intl::load_dtype(tensor->dtype());
+        // modify data type inplace for TensorLayout
+        layout.modify_dtype_inplace(fbs::intl::load_dtype(tensor->dtype()));
     }
+    layout.init_contiguous_stride();
     return layout;
 }