
tensorrt_runtime_opr.cpp 9.5 kB

/**
 * \file src/tensorrt/impl/tensorrt_runtime_opr.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

#include "megbrain/tensorrt/tensorrt_runtime_opr.h"
#include "megbrain/serialization/opr_load_dump.h"
#include "megbrain/common.h"
#include "megbrain/plugin/profiler.h"
#include "megbrain/version_symbol.h"
#include "megdnn/basic_types.h"

#include <cinttypes>

#if MGB_ENABLE_TENSOR_RT

using namespace mgb;
using namespace opr;
using TensorRTManager = intl::TensorRTManager;

namespace {
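// Map a TensorRT binding data type to the MegBrain DType used for the
// corresponding var node.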
DType get_dtype_from_trt(nvinfer1::DataType trt_dtype) {
    switch (trt_dtype) {
        case nvinfer1::DataType::kFLOAT:
            return dtype::Float32();
        case nvinfer1::DataType::kHALF:
#if !MEGDNN_DISABLE_FLOAT16
            return dtype::Float16();
#else
            mgb_throw(MegBrainError, "Float16 support is disabled.");
#endif
        // We cannot get the scale of a tensor from the TensorRT engine, so the
        // scale here is not correct. When researchers build a TensorRT engine,
        // they should make sure the scales of quantized int8 tensors in
        // MegBrain match the dynamic ranges of the TensorRT tensors.
        case nvinfer1::DataType::kINT8:
            return dtype::QuantizedS8(1.f);
        case nvinfer1::DataType::kINT32:
            return dtype::Int32();
        default:
            mgb_assert(false, "DataType of trt engine is unknown.");
    }
    return DType();
}
}  // anonymous namespace

/* ========================== TensorRTRuntimeOpr ========================== */

MGB_DYN_TYPE_OBJ_FINAL_IMPL(TensorRTRuntimeOpr);

TensorRTRuntimeOpr::TensorRTRuntimeOpr(
        std::shared_ptr<nvinfer1::ICudaEngine> engine,
        std::shared_ptr<GpuAllocator> gpu_allocator, const VarNodeArray& inputs,
        const OperatorNodeConfig& config)
        : Super(inputs.at(0)->owner_graph(), config, "tensor_rt",
                {inputs.at(0)}),
          m_gpu_allocator{std::move(gpu_allocator)},
          m_engine{std::move(engine)},
          m_trt_engine_has_batch{false} {
    mgb_assert(
            inputs[0]->comp_node().device_type() == CompNode::DeviceType::CUDA,
            "TensorRTRuntimeOpr can only be used on cuda comp nodes; got %s",
            inputs[0]->comp_node().to_string().c_str());
    size_t nr_input = 0;
    bool is_input = true;
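    // Scan the engine bindings: TensorRT lists all input bindings before the
    // output bindings; a 3-dim (CHW) input binding indicates an engine built
    // with an implicit batch dimension.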
    for (int i = 0; i < m_engine->getNbBindings(); ++i) {
        if (m_engine->bindingIsInput(i)) {
            mgb_assert(is_input, "mixed input/output bindings");
            // nbDims == 3 means CHW, without batch
            if (m_engine->getBindingDimensions(i).nbDims != 3)
                m_trt_engine_has_batch = true;
            ++nr_input;
        } else {
            is_input = false;
        }
    }
    size_t nr_output = m_engine->getNbBindings() - nr_input;
    mgb_assert(nr_input == inputs.size(),
               "inputs size not equal: expect=%zu got=%zu", nr_input,
               inputs.size());
    for (auto i : inputs) {
        add_input({i});
    }
    if (nr_output == 1) {
        add_output(None);
    } else {
        for (size_t i = 0; i < nr_output; ++i)
            add_output(ssprintf("o%zu", i));
    }
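    // The last output var holds the scratch workspace required by the
    // TensorRT execution context.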
    cg::add_workspace_output(this);
    add_equivalence_component<mgb::ScalarHash<void*>>(m_engine.get());
}

void TensorRTRuntimeOpr::get_output_var_shape(
        const TensorShapeArray& inp_shape, TensorShapeArray& out_shape) const {
    auto batch = inp_shape.at(0)[0];
    auto get_mgb_shape = [this, batch](int binding_idx) -> TensorShape {
        auto dims = m_engine->getBindingDimensions(binding_idx);
#if NV_TENSOR_RT_VERSION >= 6001
        auto format = m_engine->getBindingFormat(binding_idx);
        // converting dims to nchw4 format
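        // kCHW4 packs groups of 4 channels into the innermost axis, so divide
        // the channel dim by 4 and append a trailing axis of length 4.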
        if (format == nvinfer1::TensorFormat::kCHW4) {
            mgb_assert(dims.nbDims == 3 || dims.nbDims == 4,
                       "Tensor with NCHW4 format should have dimensions of "
                       "3/4 (got: %d)",
                       dims.nbDims);
            int chan_pos = 0;
            if (dims.nbDims == 4) {
                chan_pos = 1;
            }
            dims.nbDims = dims.nbDims + 1;
            dims.d[chan_pos] = dims.d[chan_pos] / 4;
            dims.d[dims.nbDims - 1] = 4;
        }
#endif
        return m_trt_engine_has_batch ? TensorRTOpr::dims2shape(dims)
                                      : TensorRTOpr::dims2shape(dims, batch);
    };
    for (size_t i = 0; i < inp_shape.size(); ++i) {
        mgb_assert(batch == inp_shape[i][0], "input batchsize not equal");
        TensorShape shp = get_mgb_shape(i);
        mgb_assert(shp.eq_shape(inp_shape[i]),
                   "input shape mismatch: expect=%s got=%s",
                   shp.to_string().c_str(), inp_shape[i].to_string().c_str());
    }
    for (size_t i = 0; i < out_shape.size() - 1; ++i) {
        out_shape[i] = get_mgb_shape(i + input().size());
    }
    out_shape.back() = {intl::workspace_size(m_engine.get())};
}

void TensorRTRuntimeOpr::add_input_layout_constraint() {
    for (auto i : input()) {
        i->add_layout_constraint_contiguous();
    }
}

void TensorRTRuntimeOpr::scn_do_execute() {
    auto batch = this->input(0)->shape()[0];
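    // Explicit-batch engines already encode the batch in their binding dims;
    // implicit-batch engines need the runtime batch size passed to exec().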
    if (m_trt_engine_has_batch)
        m_manager.exec(this,
                       m_gpu_allocator ? m_gpu_allocator->comp_node() : CompNode{},
                       m_engine.get());
    else
        m_manager.exec(this,
                       m_gpu_allocator ? m_gpu_allocator->comp_node() : CompNode{},
                       m_engine.get(), batch);
}

void TensorRTRuntimeOpr::init_output_dtype() {
    DType dt_trt, dt_input;
    int idx = 0;
    for (auto inp : input()) {
        dt_trt = get_dtype_from_trt(m_engine->getBindingDataType(idx));
        dt_input = inp->dtype();
        mgb_assert(dt_trt.valid() && dt_input.valid() &&
                           dt_trt.enumv() == dt_input.enumv(),
                   "Input %d Dtype is not expected in trt engine: expected %s, "
                   "got %s",
                   idx, dt_trt.name(), dt_input.name());
        idx++;
    }
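    // Output scales cannot be recovered from the engine (see
    // get_dtype_from_trt above), so quantized outputs must come with a
    // user-specified dtype.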
    for (size_t i = 0; i < output().size(); ++i) {
        dt_trt = get_dtype_from_trt(m_engine->getBindingDataType(idx));
        mgb_assert(dt_trt.valid(),
                   "output dtype checking failed: invalid dtype returned.");
        if (dt_trt.enumv() == DTypeEnum::QuantizedS8) {
            mgb_assert(output(i)->dtype().valid(),
                       "user should specify scale of output tensor of "
                       "TensorRTRuntimeOpr.");
        }
        if (!output(i)->dtype().valid())
            output(i)->dtype(dt_trt);
        idx++;
    }
}

SymbolVarArray TensorRTRuntimeOpr::make(
        std::shared_ptr<nvinfer1::ICudaEngine> engine,
        std::shared_ptr<GpuAllocator> gpu_allocator, const SymbolVarArray& src,
        const OperatorNodeConfig& config) {
    VarNodeArray var_node_array = cg::to_var_node_array(src);
    auto tensor_rt_opr = std::make_unique<TensorRTRuntimeOpr>(
            std::move(engine), std::move(gpu_allocator), var_node_array,
            config);
    auto ret = cg::to_symbol_var_array(
            src[0].node()
                    ->owner_graph()
                    ->insert_opr(std::move(tensor_rt_opr))
                    ->output());
    ret.pop_back();  // remove workspace
    return ret;
}

SymbolVarArray TensorRTRuntimeOpr::make(const void* buf, size_t buf_size,
                                        const SymbolVarArray& src,
                                        const OperatorNodeConfig& config) {
    mgb_throw_if(
            !CompNode::get_device_count(CompNode::DeviceType::CUDA),
            SystemError,
            "can not create TensorRTRuntimeOpr when CUDA is not available");
    mgb_assert(!src.empty(), "no inputs provided");
    TensorRTUniquePtr<nvinfer1::IRuntime> runtime{
            nvinfer1::createInferRuntime(TensorRTOpr::Logger::instance()), {}};
    auto gpu_allocator =
            std::make_shared<GpuAllocator>(src[0].node()->comp_node());
    runtime->setGpuAllocator(gpu_allocator.get());
    auto engine = runtime->deserializeCudaEngine(buf, buf_size, nullptr);
    mgb_assert(engine, "failed to deserialize ICudaEngine");
    return make(to_shared_ptr_engine(engine), gpu_allocator, src, config);
}

void TensorRTRuntimeOpr::LoadDumpImpl::dump(serialization::OprDumpContext& ctx,
                                            const cg::OperatorNodeBase& opr) {
    TensorRTUniquePtr<nvinfer1::IHostMemory> buf{
            opr.cast_final_safe<Opr>().trt_cuda_engine()->serialize(), {}};
    mgb_assert(buf, "failed to serialize ICudaEngine");
    ctx.dump_buf_with_len(buf->data(), buf->size());
}

cg::OperatorNodeBase* TensorRTRuntimeOpr::LoadDumpImpl::load(
        serialization::OprLoadContext& ctx, const cg::VarNodeArray& inputs,
        const OperatorNodeConfig& config) {
    inputs.at(0)->comp_node().activate();
    auto buf = ctx.load_shared_buf_with_len();
    return Opr::make(buf.data(), buf.size(), cg::to_symbol_var_array(inputs),
                     config)
            .at(0)
            .node()
            ->owner_opr();
}

#endif  // MGB_ENABLE_TENSOR_RT

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
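
For orientation, below is a minimal usage sketch (not part of the file above) of how this operator is typically created from a serialized TensorRT engine. The helper `read_engine`, the wrapper `build_trt_opr`, and the file name "model.trt" are hypothetical; the buffer overload `TensorRTRuntimeOpr::make(buf, buf_size, src, config)` is the one defined above.

    #include <fstream>
    #include <iterator>
    #include <vector>

    #include "megbrain/tensorrt/tensorrt_runtime_opr.h"

    // Hypothetical helper: read a serialized TensorRT engine ("plan") file.
    static std::vector<char> read_engine(const char* path) {
        std::ifstream f(path, std::ios::binary);
        return {std::istreambuf_iterator<char>(f),
                std::istreambuf_iterator<char>()};
    }

    // `x` is assumed to be an existing SymbolVar on a CUDA comp node. The
    // buffer overload of make() deserializes the engine, checks the bindings
    // against the inputs, and returns one var per engine output (the internal
    // workspace var has already been popped).
    mgb::SymbolVarArray build_trt_opr(mgb::SymbolVar x) {
        auto buf = read_engine("model.trt");
        return mgb::opr::TensorRTRuntimeOpr::make(
                buf.data(), buf.size(), mgb::SymbolVarArray{x}, {});
    }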
