GitOrigin-RevId: f2f1a10762
tags/v1.5.0
@@ -7,7 +7,6 @@ dnn/src/cuda/matrix_mul/fp32_simt/kimpl/* binary
 dnn/src/cuda/sass/prebuilt/map_defs.cpp binary
 dnn/src/cuda/convolution/backward_data/int8/kimpl/* binary
 tools/mlir/mlir-tblgen filter=lfs diff=lfs merge=lfs -text
-*.caffemodel filter=lfs diff=lfs merge=lfs -text
 imperative/python/test/integration/data/*.mge filter=lfs diff=lfs merge=lfs -text
 ci/resource/models/float/mobilenet_v2.pkl filter=lfs diff=lfs merge=lfs -text
 ci/resource/models/float/shufflenet_v2.pkl filter=lfs diff=lfs merge=lfs -text
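For reference, the attribute being dropped here is what routes matching paths through Git LFS: `filter=lfs diff=lfs merge=lfs -text` makes Git apply the LFS clean/smudge filters and treat the file as binary. With the `*.caffemodel` rule removed, newly added caffemodel files are committed as ordinary blobs, while paths matched by the remaining patterns stay LFS-tracked.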
@@ -72,12 +72,11 @@ TensorRTRuntimeOpr::TensorRTRuntimeOpr(
     size_t nr_input = 0;
     bool is_input = true;
     for (int i = 0; i < m_engine->getNbBindings(); ++i) {
-        // nbDims == 3, means CHW, without batch
-        if (m_engine->getBindingDimensions(i).nbDims != 3)
-            m_trt_engine_has_batch = true;
         if (m_engine->bindingIsInput(nr_input)) {
             mgb_assert(is_input, "mixed input/output bindings");
+            // nbDims == 3, means CHW, without batch
+            if (m_engine->getBindingDimensions(nr_input).nbDims != 3)
+                m_trt_engine_has_batch = true;
             ++nr_input;
         } else {
             is_input = false;
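The hunk above narrows the implicit-batch heuristic to input bindings only: under TensorRT's implicit-batch mode an engine built from a CHW network reports 3-D dimensions for its inputs, so a different rank on an input means the engine already carries a batch axis, while output bindings (for example a reduced tensor) may legitimately have fewer dimensions. Since inputs are bound before outputs (enforced by the `mgb_assert`), `nr_input` equals the binding index for inputs. A minimal standalone sketch of the same check, assuming a TensorRT 6/7-style `ICudaEngine`; the helper name is illustrative and not part of the patch:

```cpp
#include <NvInfer.h>

// Hypothetical helper mirroring the check above: scan only the *input*
// bindings, because output bindings may have fewer than three dimensions
// even when the engine was built without a batch axis.
static bool engine_has_batch_dim(const nvinfer1::ICudaEngine& engine) {
    for (int i = 0; i < engine.getNbBindings(); ++i) {
        if (!engine.bindingIsInput(i))
            continue;  // skip output bindings
        if (engine.getBindingDimensions(i).nbDims != 3)
            return true;  // input is not plain CHW, so a batch axis is present
    }
    return false;
}
```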
@@ -106,6 +106,70 @@ intl::SimpleTensorRTNetwork::create_trt_network(bool has_batch_dim) {
     return std::make_pair(builder, network);
 }
+
+intl::BatchedTensorRTNetwork::BatchedTensorRTNetwork() {
+    host_x = gen({23, 28, 28});
+
+    graph = ComputingGraph::make();
+    x = Host2DeviceCopy::make(*graph, host_x);
+    opr::Reduce::Param param1{Reduce::Mode::SUM, 0, Reduce::Param::DataType::DEFAULT};
+    opr::Reduce::Param param2{Reduce::Mode::SUM, 1, Reduce::Param::DataType::DEFAULT};
+    auto y0 = opr::Reduce::make(x, param1);
+    auto y1 = opr::Reduce::make(y0, param2);
+    TensorShape tshp{1, 28};
+    y = opr::Reshape::make(y1, tshp);
+}
+
+std::pair<nvinfer1::IBuilder*, INetworkDefinition*>
+intl::BatchedTensorRTNetwork::create_trt_network(bool has_batch_dim) {
+    CompNode::load("xpu0").activate();
+    auto builder = createInferBuilder(TensorRTOpr::Logger::instance());
+#if NV_TENSOR_RT_VERSION >= 6001
+    nvinfer1::NetworkDefinitionCreationFlags flags;
+    ::memset(&flags, 0, sizeof(nvinfer1::NetworkDefinitionCreationFlags));
+    if (has_batch_dim)
+        flags = 1 << static_cast<int>(nvinfer1::NetworkDefinitionCreationFlag::
+                                              kEXPLICIT_BATCH);
+    auto network = builder->createNetworkV2(flags);
+#else
+    auto network = builder->createNetwork();
+#endif
+
+    nvinfer1::ITensor* data;
+#if NV_TENSOR_RT_VERSION >= 6001
+    if (has_batch_dim) {
+        data = network->addInput("data", DataType::kFLOAT,
+                                 Dims4{1, 23, 28, 28});
+    } else {
+        data = network->addInput("data", DataType::kFLOAT, Dims3{23, 28, 28});
+    }
+    {
+        nvinfer1::TensorFormats formats =
+                1 << static_cast<int>(nvinfer1::TensorFormat::kLINEAR);
+        data->setAllowedFormats(formats);
+    }
+#else
+    if (has_batch_dim) {
+        data = network->addInput("data", DataType::kFLOAT,
+                                 DimsNCHW{1, 23, 28, 28});
+    } else {
+        data = network->addInput("data", DataType::kFLOAT, DimsCHW{23, 28, 28});
+    }
+#endif
+    mgb_assert(data != nullptr, "data is invalid");
+    auto reduce1 = network->addReduce(*data, nvinfer1::ReduceOperation::kSUM, 3, false);
+    mgb_assert(reduce1 != nullptr, "reduce1 is invalid");
+    reduce1->getOutput(0)->setName("prob");
+    network->markOutput(*reduce1->getOutput(0));
+#if NV_TENSOR_RT_VERSION >= 6001
+    {
+        nvinfer1::TensorFormats formats =
+                1 << static_cast<int>(nvinfer1::TensorFormat::kLINEAR);
+        reduce1->getOutput(0)->setAllowedFormats(formats);
+    }
+#endif
+    return std::make_pair(builder, network);
+}
 
 intl::SimpleQuantizedTensorRTNetwork::SimpleQuantizedTensorRTNetwork() {
     host_x = range_gen({32, 8, 28, 28});
     host_w = weight_gen({8, 8, 3, 3});
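A note on the two halves of the `BatchedTensorRTNetwork` helper added above: the MegEngine reference graph sums over axes 0 and 1, and the final `Reshape` to `{1, 28}` implies the reduced axes are kept with size 1, while the TensorRT network reaches the same result with a single `addReduce` whose axes argument is a bitmask (bit i selects dimension i of the network tensor; the batch axis is excluded in implicit-batch mode) and `keepDimensions = false`. A small sketch of that bitmask and the shape flow, with an illustrative variable name:

```cpp
#include <cstdint>

// Reference-graph shapes (inferred from the Reshape to {1, 28}):
//   x  : (23, 28, 28)
//   y0 = Reduce(x,  SUM, axis=0) -> (1, 28, 28)
//   y1 = Reduce(y0, SUM, axis=1) -> (1, 1, 28)
//   y  = Reshape(y1, {1, 28})    -> (1, 28)
//
// TensorRT side: addReduce(..., /*reduceAxes=*/3, /*keepDimensions=*/false)
// collapses the (23, 28, 28) input to a length-28 vector; the literal 3 is
// simply the bitmask selecting dimensions 0 and 1:
constexpr std::uint32_t reduce_axes = (1u << 0) | (1u << 1);  // == 3
```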
@@ -48,6 +48,20 @@ struct SimpleTensorRTNetwork {
             create_trt_network(bool has_batch_dim);
 };
 
+struct BatchedTensorRTNetwork {
+    HostTensorGenerator<> gen;
+    std::shared_ptr<HostTensorND> host_x, host_w, host_b;
+    std::shared_ptr<ComputingGraph> graph;
+    SymbolVar x, y;
+    HostTensorND host_z1;
+
+    BatchedTensorRTNetwork();
+
+    std::pair<nvinfer1::IBuilder*, INetworkDefinition*>
+            create_trt_network(bool has_batch_dim);
+};
+
 struct SimpleQuantizedTensorRTNetwork {
     HostTensorGenerator<dtype::Float32, RandomDistribution::UNIFORM> weight_gen{
             1*1.1f, 127*1.1f};
@@ -62,6 +62,37 @@ TEST(TestOprTensorRT, RuntimeBasic) {
 }
 
+TEST(TestOprTensorRT, RuntimeBasicBatched) {
+    REQUIRE_GPU(1);
+    intl::BatchedTensorRTNetwork net;
+    auto make_trt = [&net]() {
+        auto p = net.create_trt_network(false);
+        TensorRTUniquePtr<INetworkDefinition> trt_net{p.second, {}};
+        TensorRTUniquePtr<IBuilder> builder{p.first, {}};
+        builder->setMaxBatchSize(5);
+#if NV_TENSOR_RT_VERSION >= 6001
+        TensorRTUniquePtr<IBuilderConfig> build_config{
+                builder->createBuilderConfig()};
+        TensorRTUniquePtr<ICudaEngine> cuda_engine{
+                builder->buildEngineWithConfig(*trt_net, *build_config)};
+#else
+        TensorRTUniquePtr<ICudaEngine> cuda_engine{
+                builder->buildCudaEngine(*trt_net)};
+#endif
+        TensorRTUniquePtr<IHostMemory> mem{cuda_engine->serialize(), {}};
+        auto nx = opr::Broadcast::make(net.x, {1, net.x.shape()[0], net.x.shape()[1], net.x.shape()[2]});
+        return TensorRTRuntimeOpr::make(mem->data(), mem->size(), {nx})[0];
+    };
+    auto y2 = make_trt();
+
+    HostTensorND host_z1;
+    HostTensorND host_z2;
+    auto func = net.graph->compile({make_callback_copy(net.y, host_z1),
+                                    make_callback_copy(y2, host_z2)});
+    func->execute();
+    MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 5e-4);
+}
+
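In the test above, the engine is built from the batch-free variant of the network (`create_trt_network(false)`) but with `setMaxBatchSize(5)`, so the resulting `TensorRTRuntimeOpr` is expected to consume an input with an explicit leading batch axis; that is presumably why `opr::Broadcast` prepends a batch of 1 to the (23, 28, 28) shape of `net.x` before the runtime operator is constructed. The 5e-4 tolerance in `MGB_ASSERT_TENSOR_NEAR` leaves room for accumulation-order differences between the TensorRT reduction and the two MegEngine reductions it is checked against.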
 TEST(TestOprTensorRT, ConcatRuntimeBasic) {
     REQUIRE_GPU(1);