@@ -104,20 +104,19 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() {
 ConvBiasForwardImpl::AlgoPack ConvBiasForwardImpl::sm_algo_pack;
 
-ConvBiasForwardImpl::AlgoBase::SizeArgs::SizeArgs(ConvBiasForwardImpl* o,
-                                                  const TensorLayout& src,
-                                                  const TensorLayout& filter,
-                                                  const TensorLayout& bias,
-                                                  const TensorLayout& z,
-                                                  const TensorLayout& dst)
+ConvBiasForwardImpl::AlgoBase::SizeArgs::SizeArgs(
+        ConvBiasForwardImpl* o, const TensorLayout& src,
+        const TensorLayout& filter, const TensorLayout& bias,
+        const TensorLayout& z, const TensorLayout& dst,
+        const PreprocessedFilter* preprocessed_filter)
         : SizeArgs(o, src, filter, o->check_layout_fwd(src, filter, dst), bias,
-                   z, dst) {}
+                   z, dst, preprocessed_filter) {}
 
 ConvBiasForwardImpl::AlgoBase::SizeArgs::SizeArgs(
         ConvBiasForwardImpl* o, const TensorLayout& src,
         const TensorLayout& filter, const CanonizedFilterMeta& filter_meta,
         const TensorLayout& bias, const TensorLayout& z,
-        const TensorLayout& dst)
+        const TensorLayout& dst, const PreprocessedFilter* preprocessed_filter)
         : BiasForwardSizeArgs{concrete_handle(o->handle()),
                               &src,
                               &filter,
@@ -126,14 +125,16 @@ ConvBiasForwardImpl::AlgoBase::SizeArgs::SizeArgs(
                               filter_meta,
                               &dst,
                               o->param().nonlineMode},
-          opr{o} {}
+          opr{o},
+          preprocessed_filter{preprocessed_filter} {}
 
 ConvBiasForwardImpl::AlgoBase::ExecArgs::ExecArgs(
         ConvBiasForwardImpl* opr, _megdnn_tensor_in src,
         _megdnn_tensor_in filter, _megdnn_tensor_in bias, _megdnn_tensor_in z,
-        _megdnn_tensor_out dst, _megdnn_workspace workspace)
+        _megdnn_tensor_out dst, _megdnn_workspace workspace,
+        const PreprocessedFilter* preprocessed_filter)
         : SizeArgs(opr, src.layout, filter.layout, bias.layout, z.layout,
-                   dst.layout),
+                   dst.layout, preprocessed_filter),
          src_tensor{&src},
          filter_tensor{&filter},
          bias_tensor{&bias},
@@ -41,16 +41,19 @@ public:
     AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::CUDA; }
     struct SizeArgs : public conv_bias::BiasForwardSizeArgs {
         ConvBiasForwardImpl* opr;
+        const PreprocessedFilter* preprocessed_filter;
 
         std::string to_string() const;
         SizeArgs(ConvBiasForwardImpl* opr, const TensorLayout& src,
                  const TensorLayout& filter, const TensorLayout& bias,
-                 const TensorLayout& z, const TensorLayout& dst);
+                 const TensorLayout& z, const TensorLayout& dst,
+                 const PreprocessedFilter* preprocessed_filter = nullptr);
         SizeArgs(ConvBiasForwardImpl* opr, const TensorLayout& src,
                  const TensorLayout& filter,
                  const CanonizedFilterMeta& filter_meta,
                  const TensorLayout& bias, const TensorLayout& z,
-                 const TensorLayout& dst);
+                 const TensorLayout& dst,
+                 const PreprocessedFilter* preprocessed_filter = nullptr);
 
         void init_conv_bias_desc(conv_bias::CUDNNForwardDescs& desc) const {
             desc.set_conv_bias(*src_layout, filter_meta, *dst_layout,
@@ -69,11 +72,21 @@ public:
         ExecArgs(ConvBiasForwardImpl* opr, _megdnn_tensor_in src,
                  _megdnn_tensor_in filter, _megdnn_tensor_in bias,
                  _megdnn_tensor_in z, _megdnn_tensor_out dst,
-                 _megdnn_workspace workspace);
+                 _megdnn_workspace workspace,
+                 const PreprocessedFilter* preprocessed_filter = nullptr);
     };
     virtual bool is_available(const SizeArgs& args) const = 0;
     virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0;
     virtual void exec(const ExecArgs& args) const = 0;
+    virtual size_t get_preprocess_workspace_in_bytes(
+            const SizeArgs& args) const {
+        return 0;
+    }
+    virtual SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
+            const SizeArgs& args) const {
+        return {};
+    }
+    virtual void exec_preprocess(const ExecArgs& args) const {}
 
     bool is_available_wk(const SizeArgs& args, size_t limit) {
         return is_available(args) && get_workspace_in_bytes(args) <= limit;
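
The three hooks added above default to "no preprocessing", so existing algorithms keep their behaviour unless they override them. Below is a minimal sketch of how a concrete algorithm could opt in; the class name, the reorder kernel, and the `filter_layout` / `tensors` member names are assumptions for illustration — only the override signatures come from the hunk above.

// Illustrative sketch only; no such algorithm exists in this patch.
class AlgoExampleWithPreprocess : public ConvBiasForwardImpl::AlgoBase {
public:
    SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
            const SizeArgs& args) const override {
        // request one caller-allocated tensor shaped like the original filter
        return {*args.filter_layout};
    }
    void exec_preprocess(const ExecArgs& args) const override {
        // fill the buffer the caller allocated from the layout deduced above;
        // example_reorder_filter() stands in for a real device kernel
        example_reorder_filter(*args.filter_tensor,
                               args.preprocessed_filter->tensors[0]);
    }
    // is_available(), get_workspace_in_bytes() and exec() -- which would read
    // args.preprocessed_filter->tensors[0] when it is non-null -- are omitted.
};
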
@@ -29,7 +29,8 @@ void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
                                _megdnn_workspace workspace) {
     check_exec(src.layout, filter.layout, bias.layout, z.layout, dst.layout,
                workspace.size, preprocessed_filter);
-    AlgoBase::ExecArgs args(this, src, filter, bias, z, dst, workspace);
+    AlgoBase::ExecArgs args(this, src, filter, bias, z, dst, workspace,
+                            preprocessed_filter);
     auto algo = get_algorithm(this, src.layout, filter.layout, bias.layout,
                               z.layout, dst.layout);
     algo->check_workspace(args, workspace).exec(args);
@@ -205,17 +206,50 @@ const char* ConvBiasForwardImpl::get_algorithm_set_name() const {
     return "CONV_BIAS_CUDA";
 }
 
-size_t ConvBiasForwardImpl::get_workspace_in_bytes(const TensorLayout& src,
-                                                   const TensorLayout& filter,
-                                                   const TensorLayout& bias,
-                                                   const TensorLayout& z,
-                                                   const TensorLayout& dst,
-                                                   const PreprocessedFilter*) {
-    AlgoBase::SizeArgs args{this, src, filter, bias, z, dst};
+size_t ConvBiasForwardImpl::get_workspace_in_bytes(
+        const TensorLayout& src, const TensorLayout& filter,
+        const TensorLayout& bias, const TensorLayout& z,
+        const TensorLayout& dst,
+        const PreprocessedFilter* preprocessed_filter) {
+    AlgoBase::SizeArgs args{
+            this, src, filter, bias, z, dst, preprocessed_filter};
     return get_algorithm(this, src, filter, bias, z, dst)
             ->get_workspace_in_bytes(args);
 };
 
+size_t ConvBiasForwardImpl::get_preprocess_workspace_in_bytes(
+        const TensorLayout& src, const TensorLayout& filter,
+        const TensorLayout& bias, const TensorLayout& z,
+        const TensorLayout& dst) {
+    AlgoBase::SizeArgs args{this, src, filter, bias, z, dst};
+    return get_algorithm(this, src, filter, bias, z, dst)
+            ->get_preprocess_workspace_in_bytes(args);
+}
+
+SmallVector<TensorLayout>
+ConvBiasForwardImpl::deduce_preprocessed_filter_layout(
+        const TensorLayout& src, const TensorLayout& filter,
+        const TensorLayout& bias, const TensorLayout& z,
+        const TensorLayout& dst) {
+    AlgoBase::SizeArgs args{this, src, filter, bias, z, dst};
+    return get_algorithm(this, src, filter, bias, z, dst)
+            ->deduce_preprocessed_filter_layout(args);
+}
+
+void ConvBiasForwardImpl::exec_preprocess(
+        const TensorLayout& src_layout, _megdnn_tensor_in filter,
+        const TensorLayout& bias_layout, const TensorLayout& z_layout,
+        const TensorLayout& dst_layout, PreprocessedFilter* preprocessed_filter,
+        _megdnn_workspace workspace) {
+    TensorND src{nullptr, src_layout}, dst{nullptr, dst_layout},
+            z{nullptr, z_layout}, bias{nullptr, bias_layout};
+    AlgoBase::ExecArgs args(this, src, filter, bias, z, dst, workspace,
+                            preprocessed_filter);
+    auto algo = get_algorithm(this, src.layout, filter.layout, bias.layout,
+                              z.layout, dst.layout);
+    return algo->exec_preprocess(args);
+}
+
 } // namespace cuda
 } // namespace megdnn
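
For reference, these entry points are meant to be driven in the following order. This is a caller-side sketch, assuming the surrounding tensors and layouts already exist, that `PreprocessedFilter` exposes a `tensors` array, and that `alloc_tensor()` / `make_workspace()` are hypothetical helpers; only the megdnn calls themselves appear in this patch.

// Caller-side sketch of the weight-preprocess flow (illustrative only).
PreprocessedFilter pf;
for (auto&& layout : opr->deduce_preprocessed_filter_layout(
             src_layout, filter_layout, bias_layout, z_layout, dst_layout)) {
    // one device buffer per layout the algorithm asked for
    pf.tensors.push_back(alloc_tensor(layout));
}
auto preprocess_wk = make_workspace(opr->get_preprocess_workspace_in_bytes(
        src_layout, filter_layout, bias_layout, z_layout, dst_layout));
// run once: only the filter data is read, the other arguments are layout-only
opr->exec_preprocess(src_layout, filter, bias_layout, z_layout, dst_layout,
                     &pf, preprocess_wk);
// every later forward call can reuse the preprocessed filter
opr->exec(src, filter, bias, z, dst, &pf, workspace);
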
@@ -44,21 +44,14 @@ public:
                                              const TensorLayout&,
                                              const TensorLayout&,
                                              const TensorLayout&,
-                                             const TensorLayout&) override {
-        return 0;
-    };
+                                             const TensorLayout&) override;
 
     SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
             const TensorLayout&, const TensorLayout&, const TensorLayout&,
-            const TensorLayout&, const TensorLayout&) override {
-        return {};
-    }
+            const TensorLayout&, const TensorLayout&) override;
 
     void exec_preprocess(const TensorLayout&, _megdnn_tensor_in,
                          const TensorLayout&, const TensorLayout&,
                          const TensorLayout&, PreprocessedFilter*,
-                         _megdnn_workspace) override {
-        megdnn_throw("cuda conv_bias exec_preprocess has not implemeted yet");
-    }
+                         _megdnn_workspace) override;
 
     const char* get_algorithm_set_name() const override;
 
     class AlgoBase;
@@ -32,6 +32,22 @@ class DefaultComparator<dt_float32> {
     }
 };
 
+template <>
+class DefaultComparator<dt_qint8> {
+public:
+    bool is_same(dt_qint8 expected, dt_qint8 actual) const {
+        return expected.as_int8() == actual.as_int8();
+    }
+};
+
+template <>
+class DefaultComparator<dt_qint32> {
+public:
+    bool is_same(dt_qint32 expected, dt_qint32 actual) const {
+        return expected.as_int32() == actual.as_int32();
+    }
+};
+
 } // namespace test
 } // namespace megdnn
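
These specializations let the test `Tensor` wrapper compare quantized values on their stored integers. A usage sketch, assuming a valid `handle`, placeholder shapes, and that `DefaultComparator` is the `Tensor` class's default comparator parameter:

// Illustrative only: bit-exact comparison of two quantized test tensors.
TensorLayout layout{{16, 16, 7, 7}, dtype::QuantizedS8(1.2f)};
Tensor<dt_qint8> expected(handle, layout);
Tensor<dt_qint8> computed(handle, layout);
// ... fill `expected` with the reference result and `computed` with the
// tested kernel's output ...
expected.check_with(computed);  // asserts as_int8() equality at every index

The quantized dtype carried by `layout` is kept intact by the `Tensor` constructor change further down in this diff.
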
@@ -741,10 +741,12 @@ void check_conv_bias(DType src_dtype, DType filter_dtype, DType bias_dtype,
     std::unique_ptr<RNG> rng;
     std::unique_ptr<RNG> bias_rng;
     std::unique_ptr<RNG> const_rng;
+    std::unique_ptr<RNG> zero_rng;
     // TODO: check range of rng
     if (src_dtype.enumv() == DTypeEnum::QuantizedS8) {
         rng = std::make_unique<UniformIntRNG>(-3, 3);
         const_rng = std::make_unique<UniformIntRNG>(1, 1);
+        zero_rng = std::make_unique<UniformIntRNG>(0, 0);
         megdnn_assert(bias_dtype.enumv() == DTypeEnum::QuantizedS32);
         bias_rng = std::make_unique<UniformIntRNG>(-50, 50);
         checker.set_epsilon(1 + 1e-3)
@@ -775,6 +777,12 @@ void check_conv_bias(DType src_dtype, DType filter_dtype, DType bias_dtype,
             fh = arg.filter[2];
             fw = arg.filter[3];
             z[1] = arg.filter[0] / 4;
+        } else if (format == Format::NCHW32) {
+            hi = arg.src[2];
+            wi = arg.src[3];
+            fh = arg.filter[2];
+            fw = arg.filter[3];
+            z[1] = arg.filter[0] / 32;
         } else {
             megdnn_assert(format == Format::CHWN4);
             hi = arg.src[1];
@@ -798,7 +806,7 @@ void check_conv_bias(DType src_dtype, DType filter_dtype, DType bias_dtype,
     megdnn_assert(rng != nullptr && bias_rng != nullptr);
     checker.set_rng(0, rng.get())
             .set_rng(1, rng.get())
-            .set_rng(2, rng.get())
+            .set_rng(2, bias_rng.get())
             .set_rng(3, rng.get());
     if (args.empty()) {
         std::vector<TestArg> default_args;
@@ -24,7 +24,8 @@ Tensor<T, C>::Tensor(Handle *handle, TensorLayout layout):
     m_handle(handle),
     m_comparator(C())
 {
-    layout.dtype = get_dtype_from_static_type<T>();
+    if (!layout.dtype.valid())
+        layout.dtype = get_dtype_from_static_type<T>();
     m_tensornd.raw_ptr = megdnn_malloc(m_handle, layout.span().dist_byte());
     m_tensornd.layout = layout;
 }
@@ -67,10 +68,10 @@ void Tensor<T, C>::check_with(const Tensor<T, C_> &rhs) const
         auto index = Index(m_tensornd.layout, linear_idx);
         auto offset = index.positive_offset();
         ASSERT_TRUE(m_comparator.is_same(p0[offset], p1[offset]))
-            << "Index is " << index.to_string()
-            << "; layout is " << m_tensornd.layout.to_string()
-            << "; this->ptr()[offset] is " << this->ptr()[offset]
-            << "; rhs.ptr()[offset] is " << rhs.ptr()[offset];
+                << "Index is " << index.to_string() << "; layout is "
+                << m_tensornd.layout.to_string() << "; this->ptr()[offset] is "
+                << this->ptr()[offset] << "; rhs.ptr()[offset] is "
+                << rhs.ptr()[offset];
     }
 }
@@ -18,6 +18,8 @@
 #include "test/cuda/benchmark.h"
 #include "test/cuda/fixture.h"
 #include "test/cuda/utils.h"
+#include "test/common/tensor.h"
+#include "test/common/workspace_wrapper.h"
 
 #define V1(x) #x
 #define V(x) V1(x)
@@ -34,7 +36,6 @@ struct BenchArgs {
 std::vector<BenchArgs> get_resnet50_bench_args(size_t batch = 64) {
     std::vector<BenchArgs> args;
     args.emplace_back(BenchArgs{batch, 64, 56, 56, 256, 1, 1});
     args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 1});
     args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 2});
     args.emplace_back(BenchArgs{batch, 4, 256, 256, 32, 7, 2});
@@ -44,7 +45,6 @@ std::vector<BenchArgs> get_resnet50_bench_args(size_t batch = 64) {
     args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 1});
     args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 2});
     args.emplace_back(BenchArgs{batch, 256, 56, 56, 64, 3, 2});
-    args.emplace_back(BenchArgs{batch, 64, 56, 56, 256, 1, 1});
     args.emplace_back(BenchArgs{batch, 256, 56, 56, 512, 1, 2});
     args.emplace_back(BenchArgs{batch, 256, 56, 56, 128, 1, 2});
@@ -57,6 +57,7 @@ std::vector<BenchArgs> get_resnet50_bench_args(size_t batch = 64) {
     args.emplace_back(BenchArgs{batch, 1024, 14, 14, 256, 1, 1});
     args.emplace_back(BenchArgs{batch, 256, 14, 14, 256, 3, 1});
     args.emplace_back(BenchArgs{batch, 256, 14, 14, 1024, 1, 1});
+    args.emplace_back(BenchArgs{batch, 256, 14, 14, 1024, 1, 2});
     args.emplace_back(BenchArgs{batch, 1024, 14, 14, 2048, 1, 2});
     args.emplace_back(BenchArgs{batch, 1024, 14, 14, 512, 1, 2});
@@ -331,6 +332,12 @@ void benchmark_target_algo_with_cudnn_tsc(
         if ((format == Format::CHWN4 || format == Format::NCHW4) &&
             (arg.ci % 16 != 0))
             continue;
+
+        Format format_cudnn = arg.ci % 32 == 0 && arg.co % 32 == 0
+                                      ? Format::NCHW32
+                                      : Format::NCHW4;
+        param.format = format_cudnn;
+        benchmarker_cudnn.set_param(param);
         float time_in_ms = 0.f;
         if (algo) {
             time_in_ms =
@@ -351,18 +358,14 @@ void benchmark_target_algo_with_cudnn_tsc(
                                              {}}) /
                     RUNS;
         }
-        Format format_cudnn = arg.ci % 32 == 0 && arg.co % 32 == 0
-                                      ? Format::NCHW32
-                                      : Format::NCHW4;
-        param.format = format_cudnn;
-        benchmarker_cudnn.set_param(param);
-        auto time_in_ms_cudnn =
+        float time_in_ms_cudnn =
                 benchmarker_cudnn.execs({get_tensor_shape(src, format_cudnn),
                                          get_tensor_shape(filter, format_cudnn),
                                          get_tensor_shape(bias, format_cudnn),
                                          {},
                                          {}}) /
                 RUNS;
+
         float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f * arg.f /
                     (1e12);
         printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
@@ -1075,8 +1078,8 @@ TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_2) {
 #if CUDA_VERSION >= 10020
-/// \note: we only check several cases and block sizes in megdnn_test, the full
-/// testcases are written in cutlass repository
+/// \note: we only check several cases and block sizes in megdnn_test, the
+/// full testcases are written in cutlass repository
 TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_IMMA) {
     require_compute_capability_eq(7, 5);
     Checker<ConvBiasForward> checker(handle_cuda());