GitOrigin-RevId: 7b77579acd
tags/v1.1.0
@@ -428,6 +428,11 @@ public:
     void exec(const ExecArgs& args) const override;
     const char* name() const override { return m_name.c_str(); }
    bool is_reproducible() const override { return true; }
+    size_t get_preprocess_workspace_in_bytes(
+            const SizeArgs& args) const override;
+    SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
+            const SizeArgs& args) const override;
+    void exec_preprocess(const ExecArgs& args) const override;
 
 private:
     WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
@@ -560,6 +565,11 @@ public:
     const char* name() const override { return m_name.c_str(); }
     bool is_reproducible() const override { return true; }
     static std::string to_string(AlgoParam algo_param);
+    size_t get_preprocess_workspace_in_bytes(
+            const SizeArgs& args) const override;
+    SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
+            const SizeArgs& args) const override;
+    void exec_preprocess(const ExecArgs& args) const override;
 
 private:
     WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
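The three overrides added to each algorithm implement megdnn's weight-preprocess protocol: the caller asks the algorithm which layouts it wants the filter stored in, runs the relayout once ahead of time, and then hands the result back through args.preprocessed_filter on every subsequent exec, which can skip the per-call reformat. A rough sketch of that driver flow, under the assumption that the caller owns the storage (alloc_on_device is a hypothetical helper, not a megdnn API):

    // Sketch only: how a caller drives the weight-preprocess interface.
    // alloc_on_device() stands in for whatever allocator the caller uses.
    SmallVector<TensorLayout> layouts =
            algo->deduce_preprocessed_filter_layout(size_args);
    PreprocessedFilter filter;
    for (auto&& layout : layouts)
        filter.tensors.push_back(alloc_on_device(layout));
    algo->exec_preprocess(exec_args);         // relayout the weights once
    exec_args.preprocessed_filter = &filter;  // reused on every inference call
    algo->exec(exec_args);                    // no per-call filter reformat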
@@ -65,8 +65,12 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::is_available(
 WorkspaceBundle
 ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::get_workspace_bundle(
         dt_byte* raw_ptr, const SizeArgs& args) const {
-    size_t ws_filter = args.filter_layout->span().dist_byte();
-    return WorkspaceBundle{raw_ptr, {ws_filter}};
+    if (args.preprocessed_filter) {
+        return WorkspaceBundle{raw_ptr, {}};
+    } else {
+        size_t ws_filter = args.filter_layout->span().dist_byte();
+        return WorkspaceBundle{raw_ptr, {ws_filter}};
+    }
 }
 
 size_t
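This gating is the heart of the change: once a preprocessed filter is supplied, exec reads the weights directly and the per-call workspace request drops to zero. The same logic in isolation (types simplified for illustration):

    // Illustration only, with simplified types: the per-exec scratch
    // request is zero when the weights were reformatted ahead of time.
    size_t filter_workspace_bytes(const TensorLayout& filter_layout,
                                  bool has_preprocessed_filter) {
        return has_preprocessed_filter ? 0 : filter_layout.span().dist_byte();
    }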
@@ -82,12 +86,12 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec(
     auto&& fm = args.filter_meta;
     UNPACK_CONV_BIAS_NCHW32_PARAM(*(args.src_layout), fm, *(args.dst_layout),
                                   param);
-    auto ws = get_workspace_bundle(args.workspace.raw_ptr, args);
-    auto ws_filter = ws.get(0);
     auto&& stream = cuda_stream(args.opr->handle());
-    // reformat filter from nchw32 to chwn32
-    {
+    int8_t* filter_ptr = nullptr;
+    if (args.preprocessed_filter == nullptr) {
+        filter_ptr = reinterpret_cast<int8_t*>(args.workspace.raw_ptr);
+        // reformat filter from nchw32 to chwn32
         TensorLayout src{{co, ci / 32, fh, fw, 32}, dtype::Int8()};
         src.init_contiguous_stride();
         TensorLayout dst = src;
@@ -99,11 +103,14 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec(
         TensorND ts_src, ts_dst;
         ts_src.raw_ptr = args.filter_tensor->raw_ptr;
         ts_src.layout = src;
-        ts_dst.raw_ptr = ws_filter;
+        ts_dst.raw_ptr = args.workspace.raw_ptr;
         ts_dst.layout = dst;
         auto&& transpose =
                 args.opr->handle()->create_operator<RelayoutForward>();
         transpose->exec(ts_src, ts_dst);
+    } else {
+        filter_ptr = reinterpret_cast<int8_t*>(
+                args.preprocessed_filter->tensors[0].raw_ptr);
     }
 
     ConvParam kern_param;
@@ -131,8 +138,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec(
     uint32_t nonlinear_mode = static_cast<uint32_t>(param.nonlineMode);
     if (fh == 1 && fw == 1) {
         cutlass_wrapper::do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32<
-                false>(args.src_tensor->compatible_ptr<int8_t>(),
-                       reinterpret_cast<int8_t*>(ws_filter),
+                false>(args.src_tensor->compatible_ptr<int8_t>(), filter_ptr,
                        args.bias_tensor->compatible_ptr<int32_t>(), z_dev_ptr,
                        args.dst_tensor->compatible_ptr<int8_t>(), nullptr,
                        kern_param, nonlinear_mode, alpha, beta, gamma,
@@ -146,8 +152,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec(
                 stream);
     } else {
         cutlass_wrapper::do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32<true>(
-                args.src_tensor->compatible_ptr<int8_t>(),
-                reinterpret_cast<int8_t*>(ws_filter),
+                args.src_tensor->compatible_ptr<int8_t>(), filter_ptr,
                 args.bias_tensor->compatible_ptr<int32_t>(), z_dev_ptr,
                 args.dst_tensor->compatible_ptr<int8_t>(), nullptr, kern_param,
                 nonlinear_mode, alpha, beta, gamma, dst_scale,
@@ -167,6 +172,41 @@ std::string ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::to_string(
             algo_param.threadblock_n, algo_param.threadblock_k,
             algo_param.warp_m, algo_param.warp_n, algo_param.warp_k);
 }
+
+size_t ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::
+        get_preprocess_workspace_in_bytes(const SizeArgs& args) const {
+    return 0_z;
+}
+
+SmallVector<TensorLayout> ConvBiasForwardImpl::
+        AlgoInt8NCHW32IMMAImplicitGemm::deduce_preprocessed_filter_layout(
+                const SizeArgs& args) const {
+    return {args.filter_layout->collapse_contiguous()};
+}
+
+void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec_preprocess(
+        const ExecArgs& args) const {
+    using Format = Param::Format;
+    auto&& param = args.opr->param();
+    auto&& fm = args.filter_meta;
+    UNPACK_CONV_BIAS_NCHW32_PARAM(*(args.src_layout), fm, *(args.dst_layout),
+                                  param);
+    TensorLayout src{{co, ci / 32, fh, fw, 32}, dtype::Int8()};
+    src.init_contiguous_stride();
+    TensorLayout dst = src;
+    dst.stride[0] = 32;
+    dst.stride[1] = co * fh * fw * 32;
+    dst.stride[2] = co * fw * 32;
+    dst.stride[3] = co * 32;
+    dst.stride[4] = 1;
+    TensorND ts_src, ts_dst;
+    ts_src.raw_ptr = args.filter_tensor->raw_ptr;
+    ts_src.layout = src;
+    ts_dst.raw_ptr = args.preprocessed_filter->tensors[0].raw_ptr;
+    ts_dst.layout = dst;
+    auto&& transpose = args.opr->handle()->create_operator<RelayoutForward>();
+    transpose->exec(ts_src, ts_dst);
+}
 #endif
 
 // vim: syntax=cpp.doxygen
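The stride table in exec_preprocess is what turns the RelayoutForward call into an NCHW32-to-CHWN32 transpose: dst keeps the shape {co, ci/32, fh, fw, 32} but makes the output-channel axis the second-fastest-varying one. A small host-side sketch of the offset mapping those strides encode:

    // For shape {co, ci/32, fh, fw, 32}, the dst strides above place
    // filter element (o, c, h, w, v) at this CHWN32 offset.
    size_t chwn32_offset(size_t o, size_t c, size_t h, size_t w, size_t v,
                         size_t co, size_t fh, size_t fw) {
        return c * (co * fh * fw * 32)  // dst.stride[1]
             + h * (co * fw * 32)       // dst.stride[2]
             + w * (co * 32)            // dst.stride[3]
             + o * 32                   // dst.stride[0]
             + v;                       // dst.stride[4]
    }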
@@ -62,8 +62,12 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::is_available(
 WorkspaceBundle
 ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::get_workspace_bundle(
         dt_byte* raw_ptr, const SizeArgs& args) const {
-    size_t ws_filter = args.filter_layout->span().dist_byte();
-    return WorkspaceBundle{raw_ptr, {ws_filter}};
+    if (args.preprocessed_filter) {
+        return WorkspaceBundle{raw_ptr, {}};
+    } else {
+        size_t ws_filter = args.filter_layout->span().dist_byte();
+        return WorkspaceBundle{raw_ptr, {ws_filter}};
+    }
 }
 
 size_t
@@ -79,12 +83,12 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
     auto&& fm = args.filter_meta;
     UNPACK_CONV_BIAS_NCHW4_PARAM(*(args.src_layout), fm, *(args.dst_layout),
                                  param);
-    auto ws = get_workspace_bundle(args.workspace.raw_ptr, args);
-    auto ws_filter = ws.get(0);
     auto&& stream = cuda_stream(args.opr->handle());
-    // reformat filter from nchw4 to chwn4
-    {
+    int8_t* filter_ptr = nullptr;
+    if (args.preprocessed_filter == nullptr) {
+        filter_ptr = reinterpret_cast<int8_t*>(args.workspace.raw_ptr);
+        // reformat filter from nchw4 to chwn4
         TensorLayout src{{co, ci / 4 * fh * fw}, dtype::Int32()};
         src.init_contiguous_stride();
         TensorLayout dst = src;
@@ -92,11 +96,14 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
         TensorND ts_src, ts_dst;
         ts_src.raw_ptr = args.filter_tensor->raw_ptr;
         ts_src.layout = src;
-        ts_dst.raw_ptr = ws_filter;
+        ts_dst.raw_ptr = args.workspace.raw_ptr;
         ts_dst.layout = dst;
         auto&& transpose =
                 args.opr->handle()->create_operator<RelayoutForward>();
         transpose->exec(ts_src, ts_dst);
+    } else {
+        filter_ptr = reinterpret_cast<int8_t*>(
+                args.preprocessed_filter->tensors[0].raw_ptr);
     }
 
     convolution::ConvParam kern_param;
@@ -124,8 +131,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
     uint32_t nonlinear_mode = static_cast<uint32_t>(param.nonlineMode);
     if (fh == 1 && fw == 1) {
         cutlass_wrapper::do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4<false>(
-                args.src_tensor->compatible_ptr<int8_t>(),
-                reinterpret_cast<int8_t*>(ws_filter),
+                args.src_tensor->compatible_ptr<int8_t>(), filter_ptr,
                 args.bias_tensor->compatible_ptr<int32_t>(), z_dev_ptr,
                 args.dst_tensor->compatible_ptr<int8_t>(), nullptr, kern_param,
                 nonlinear_mode, alpha, beta, gamma, dst_scale,
@@ -138,8 +144,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
                 stream);
     } else {
         cutlass_wrapper::do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4<true>(
-                args.src_tensor->compatible_ptr<int8_t>(),
-                reinterpret_cast<int8_t*>(ws_filter),
+                args.src_tensor->compatible_ptr<int8_t>(), filter_ptr,
                 args.bias_tensor->compatible_ptr<int32_t>(), z_dev_ptr,
                 args.dst_tensor->compatible_ptr<int8_t>(), nullptr, kern_param,
                 nonlinear_mode, alpha, beta, gamma, dst_scale,
@@ -153,4 +158,35 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
     }
 }
+
+size_t ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::
+        get_preprocess_workspace_in_bytes(const SizeArgs& args) const {
+    return 0_z;
+}
+
+SmallVector<TensorLayout> ConvBiasForwardImpl::
+        AlgoInt8NCHW4DotProdImplicitGemm::deduce_preprocessed_filter_layout(
+                const SizeArgs& args) const {
+    return {args.filter_layout->collapse_contiguous()};
+}
+
+void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec_preprocess(
+        const ExecArgs& args) const {
+    using Format = Param::Format;
+    auto&& param = args.opr->param();
+    auto&& fm = args.filter_meta;
+    UNPACK_CONV_BIAS_NCHW4_PARAM(*(args.src_layout), fm, *(args.dst_layout),
+                                 param);
+    TensorLayout src{{co, ci / 4 * fh * fw}, dtype::Int32()};
+    src.init_contiguous_stride();
+    TensorLayout dst = src;
+    dst.stride[0] = 1, dst.stride[1] = dst[0];
+    TensorND ts_src, ts_dst;
+    ts_src.raw_ptr = args.filter_tensor->raw_ptr;
+    ts_src.layout = src;
+    ts_dst.raw_ptr = args.preprocessed_filter->tensors[0].raw_ptr;
+    ts_dst.layout = dst;
+    auto&& transpose = args.opr->handle()->create_operator<RelayoutForward>();
+    transpose->exec(ts_src, ts_dst);
+}
+
 // vim: syntax=cpp.doxygen
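Here the filter is viewed as a {co, ci/4 * fh * fw} matrix of Int32 (each Int32 packing one 4-wide group of int8 values), so `dst.stride[0] = 1, dst.stride[1] = dst[0]` is simply a matrix transpose, which yields the CHWN4 order. The relayout written out as an explicit loop, assuming contiguous int32_t buffers:

    // Equivalent of the RelayoutForward call in exec_preprocess, written
    // as a plain transpose of a rows x cols int32 matrix (rows = co,
    // cols = ci/4 * fh * fw); each int32 carries four packed int8 values.
    void nchw4_to_chwn4(const int32_t* src, int32_t* dst,
                        size_t rows, size_t cols) {
        for (size_t r = 0; r < rows; ++r)
            for (size_t c = 0; c < cols; ++c)
                dst[c * rows + r] = src[r * cols + c];
    }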
@@ -1084,6 +1084,42 @@ TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_2) {
 }
 
+TEST_F(CUDA, CUTLASS_WEIGHT_PREPROCESS) {
+    require_compute_capability(6, 1);
+    Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
+            handle_cuda());
+    auto check = [&checker](const std::string& algo) {
+        checker.set_before_exec_callback(
+                conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
+        UniformIntRNG rng{-16, 16};
+        UniformIntRNG bias_rng{-50, 50};
+        UniformIntRNG const_rng{1, 1};
+        checker.set_rng(0, &rng)
+                .set_rng(1, &rng)
+                .set_rng(2, &bias_rng)
+                .set_rng(3, &rng)
+                .set_dtype(0, dtype::QuantizedS8{1.2f})
+                .set_dtype(1, dtype::QuantizedS8{1.3f})
+                .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
+                .set_dtype(3, dtype::QuantizedS8{1.3f})
+                .set_dtype(4, dtype::QuantizedS8{1.0f})
+                .set_epsilon(1 + 1e-3)
+                .set_max_avg_error(1e-1)
+                .set_max_avg_biased_error(1e-3);
+        param::ConvBias param;
+        param.pad_h = param.pad_w = 1;
+        param.stride_h = param.stride_w = 2;
+        param.format = param::ConvBias::Format::NCHW4;
+        checker.set_param(param).execs({{16, 4, 14, 14, 4},
+                                        {16, 4, 3, 3, 4},
+                                        {1, 4, 1, 1, 4},
+                                        {},
+                                        {}});
+    };
+    check("INT8_NCHW4_DOTPROD_IMPLICIT_GEMM_128X32X32_64X32X32");
+    check("INT8_NCHW4_DOTPROD_IMPLICIT_GEMM_16X64X8_16X64X8");
+}
+
 #if CUDA_VERSION >= 10020
 /// \note: we only check several cases and block sizes in megdnn_test, the
 ///        full testcases are written in cutlass repository