@@ -50,17 +50,6 @@ void TensorRTProfiler::print_layer_times() {
    printf("Total time: %4.3fms\n", total_time);
}
std::shared_ptr<json::Value> TensorRTProfiler::to_json() {
    using namespace json;
    auto prof_arr = Array::make();
    for (auto&& rec : profile) {
        auto&& item = Array::make();
        item->add(String::make(rec.first));
        item->add(Number::make(rec.second));
        prof_arr->add(item);
    }
    return prof_arr;
}
#endif // MGB_ENABLE_JSON
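Note: the removed to_json() packs each profiling record as a two-element array (layer name, then time in ms), so the value it returns is an array of such pairs; this is the shape the removed Profile/RuntimeProfile tests below read the timing from as layer_info_arr[1]. A hedged illustration with made-up layer names and timings:

[["conv_0", 0.042], ["relu_0", 0.011], ["concat_1", 0.007]]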
@@ -168,7 +157,7 @@ void TensorRTOpr::GpuAllocator::free(void* memory) {
void TensorRTManager::exec(cg::SingleCNOperatorNodeBase* opr,
                           CompNode comp_node_check,
                           nvinfer1::ICudaEngine* engine,
                           size_t batch) {
                           size_t batch, bool use_trt_profiler) {
    auto comp_node = opr->comp_node();
    // ICudaEngine is bound to the currently active device
@@ -180,22 +169,11 @@ void TensorRTManager::exec(cg::SingleCNOperatorNodeBase* opr,
                  comp_node_check.to_string().c_str(),
                  comp_node.to_string().c_str());
    }
#if MGB_ENABLE_JSON
    auto pf_holder_pair =
            opr->owner_graph()
                    ->options()
                    .user_data.get_user_data<opr_profile::OprProfileHolder>();
    if (m_has_profiler && !pf_holder_pair.second) {
        m_context.reset();
        m_has_profiler = false;
    }
#endif
    auto workspace_ptr = opr->output().back()->dev_tensor().raw_ptr();
    bool should_reinit_device_memory =
            !m_context || m_device_workspace_memory_ptr != workspace_ptr;
    if (!m_context) {
        m_context = {engine->createExecutionContextWithoutDeviceMemory(), {}};
        m_has_profiler = false;
    }
    m_trt_iobuf.resize(opr->input().size() + opr->output().size() - 1);
    bool is_trt_opr = false;
@@ -235,11 +213,7 @@ void TensorRTManager::exec(cg::SingleCNOperatorNodeBase* opr,
    bool exec_success = false;
#if MGB_ENABLE_JSON
    if (!pf_holder_pair.second) {
        mgb_assert(!m_has_profiler,
                   "Invalid state of TensorRTRuntimeOpr: should not have "
                   "profiler.");
    if (!use_trt_profiler) {
#if NV_TENSOR_RT_VERSION >= 6001
        if (is_trt_opr)
            exec_success = m_context->enqueueV2(m_trt_iobuf.data(),
@@ -255,7 +229,6 @@ void TensorRTManager::exec(cg::SingleCNOperatorNodeBase* opr,
    } else {
        TensorRTProfiler trt_profiler;
        m_context->setProfiler(&trt_profiler);
        m_has_profiler = true;
        // TensorRT documentation stated that IExecutionContext->execute
        // "Synchronously execute inference on a batch", and it does not take a
        // cudaStream_t, we expect it do a device synchronize. But it seems like
@@ -272,24 +245,9 @@ void TensorRTManager::exec(cg::SingleCNOperatorNodeBase* opr,
        exec_success = m_context->execute(batch, m_trt_iobuf.data());
#endif
        mgb_assert(exec_success, "trt execution failed: opr=%s", opr->cname());
        pf_holder_pair.first[0]->id2object_map[opr] = trt_profiler.to_json();
        printf("TRT profile info of opr %s:\n", opr->name().c_str());
        trt_profiler.print_layer_times();
    }
#else
#if NV_TENSOR_RT_VERSION >= 6001
    if (is_trt_opr)
        exec_success = m_context->enqueueV2(m_trt_iobuf.data(),
                                            env.cuda_env().stream, nullptr);
    else
        exec_success = m_context->enqueue(batch, m_trt_iobuf.data(),
                                          env.cuda_env().stream, nullptr);
#else
    exec_success = m_context->enqueue(batch, m_trt_iobuf.data(),
                                      env.cuda_env().stream, nullptr);
#endif
    mgb_assert(exec_success, "trt execution failed: opr=%s", opr->cname());
#endif
}
/* ========================== TensorRTOpr ========================== */
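For context, the TensorRTProfiler referenced above is an nvinfer1::IProfiler that collects per-layer timings. Below is a minimal sketch of such a profiler, assuming the vector-of-(name, time) storage that print_layer_times() and the removed to_json() iterate over; the class and member names here are illustrative, not the actual MegBrain definitions.

#include <cstdio>
#include <string>
#include <utility>
#include <vector>
#include <NvInfer.h>

// Illustrative stand-in for the TensorRTProfiler used above. TensorRT calls
// reportLayerTime() once per layer after execution completes, and the records
// are accumulated so they can be printed (or serialized) afterwards.
struct SketchTRTProfiler : public nvinfer1::IProfiler {
    std::vector<std::pair<std::string, float>> profile;

    // Add `noexcept` here when building against TensorRT 8 or newer.
    void reportLayerTime(const char* layer_name, float ms) override {
        profile.emplace_back(layer_name, ms);
    }

    void print_layer_times() const {
        float total_time = 0.f;
        for (auto&& rec : profile) {
            printf("%s: %4.3fms\n", rec.first.c_str(), rec.second);
            total_time += rec.second;
        }
        printf("Total time: %4.3fms\n", total_time);
    }
};

The profiling branch above then only needs m_context->setProfiler(&trt_profiler) before a synchronous execute(); older TensorRT releases document per-layer profiling for the synchronous path, which is presumably why that branch calls execute() rather than enqueue().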
@@ -50,11 +50,11 @@ class TensorRTManager {
    std::vector<void*> m_trt_iobuf;
    TensorRTUniquePtr<nvinfer1::IExecutionContext> m_context;
    void* m_device_workspace_memory_ptr;
    bool m_has_profiler;
public:
    void exec(cg::SingleCNOperatorNodeBase* opr, CompNode comp_node_check,
              nvinfer1::ICudaEngine* engine, size_t batch = 1);
              nvinfer1::ICudaEngine* engine, size_t batch = 1,
              bool use_trt_profiler = false);
    void clear_trt_context() { m_context.reset(); }
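With the new declaration, the caller opts into per-layer profiling per call instead of relying on a profiler flag cached on the manager. A hedged usage sketch follows; opr, check_cn and engine are placeholders standing in for the surrounding operator code, not identifiers from this diff:

// Hypothetical call site.
TensorRTManager manager;
manager.exec(opr, check_cn, engine);             // defaults: batch = 1, no profiler
manager.exec(opr, check_cn, engine, /*batch=*/1,
             /*use_trt_profiler=*/true);         // also prints per-layer times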
@@ -28,50 +28,6 @@ using namespace mgb;
using namespace nvinfer1;
using namespace opr;
TEST(TestOprTensorRT, Profile) {
    REQUIRE_GPU(1);
    intl::ConcatConvTensorRTNetwork net;
    auto p = net.create_trt_network(true);
    auto y2 = TensorRTOpr::make(TensorRTOpr::to_shared_ptr_builder(p.first),
                                TensorRTOpr::to_shared_ptr_network(p.second),
                                intl::TensorRTGraphFeatureBits::NCHW_FLOAT, {},
                                {net.x0, net.x1})[0];
    HostTensorND host_z1;
    HostTensorND host_z2;
    auto func = net.graph->compile({make_callback_copy(net.y, host_z1),
                                    make_callback_copy(y2, host_z2)});
    {
        mgb::GraphProfiler profiler(net.graph.get());
        func->execute();
        profiler.to_json()->writeto_fpath(
                output_file("TestOprTensorRT.Profile.FromProfiler.json"));
        auto prof_obj = *static_cast<json::Object*>(profiler.to_json().get());
        auto record_obj =
                *static_cast<json::Object*>(prof_obj["opr_internal_pf"].get());
        auto opr_prof_arr = *static_cast<json::Array*>(
                record_obj[y2.node()->owner_opr()->id_str()].get());
        for (auto item_arr : opr_prof_arr.get_impl()) {
            auto layer_info_arr = *static_cast<json::Array*>(item_arr.get());
            auto layer_time =
                    *static_cast<json::Number*>(layer_info_arr[1].get());
            mgb_assert(layer_time.get_impl() > 0, "Error occured in json.");
        }
        MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 1e-4);
    }
    // Run it again after profiler is not in existance.
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 1e-4);
}
TEST(TestOprTensorRT, Basic) {
    REQUIRE_GPU(1);
    intl::SimpleTensorRTNetwork net;
@@ -10,7 +10,6 @@
 */
#include "megbrain/comp_node_env.h"
#include "megbrain/plugin/profiler.h"
#include "megbrain/test/autocheck.h"
#include "megbrain/test/helper.h"
#include "megbrain/test/megdnn_helper.h"
@@ -102,69 +101,6 @@ TEST(TestOprTensorRT, ConcatRuntimeBasic) {
    MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 1e-4);
}
TEST(TestOprTensorRT, RuntimeProfile) {
    REQUIRE_GPU(1);
    intl::ConcatConvTensorRTNetwork net;
    SymbolVar y2;
    {
        auto p = net.create_trt_network(false);
        TensorRTUniquePtr<INetworkDefinition> trt_net{p.second, {}};
        TensorRTUniquePtr<IBuilder> builder{p.first, {}};
        builder->setMaxBatchSize(5);
#if NV_TENSOR_RT_VERSION >= 6001
        TensorRTUniquePtr<IBuilderConfig> build_config{
                builder->createBuilderConfig()};
        auto cuda_engine =
                builder->buildEngineWithConfig(*trt_net, *build_config);
#else
        auto cuda_engine = builder->buildCudaEngine(*trt_net);
#endif
        TensorRTUniquePtr<IHostMemory> mem{cuda_engine->serialize(), {}};
        FILE* fout = fopen(output_file("trt_cuda_engine").c_str(), "wb");
        auto wr = fwrite(mem->data(), 1, mem->size(), fout);
        mgb_assert(wr == mem->size());
        fclose(fout);
        y2 = TensorRTRuntimeOpr::make(
                TensorRTRuntimeOpr::to_shared_ptr_engine(cuda_engine), {},
                {net.x0, net.x1})[0];
    }
    HostTensorND host_z1;
    HostTensorND host_z2;
    auto func = net.graph->compile({make_callback_copy(net.y, host_z1),
                                    make_callback_copy(y2, host_z2)});
    {
        mgb::GraphProfiler profiler(net.graph.get());
        func->execute();
        profiler.to_json()->writeto_fpath(output_file(
                "TestOprTensorRT.RuntimeProfile.FromProfiler.json"));
        auto prof_obj = *static_cast<json::Object*>(profiler.to_json().get());
        auto record_obj =
                *static_cast<json::Object*>(prof_obj["opr_internal_pf"].get());
        auto opr_prof_arr = *static_cast<json::Array*>(
                record_obj[y2.node()->owner_opr()->id_str()].get());
        for (auto item_arr : opr_prof_arr.get_impl()) {
            auto layer_info_arr = *static_cast<json::Array*>(item_arr.get());
            auto layer_time =
                    *static_cast<json::Number*>(layer_info_arr[1].get());
            mgb_assert(layer_time.get_impl() > 0, "Error occured in json.");
        }
        MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 1e-4);
    }
    // Run it again after profiler is not in existance.
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 1e-4);
}
TEST(TestOprTensorRT, RuntimeChangeBatchSize) {
    REQUIRE_GPU(1);
    intl::SimpleTensorRTNetwork net;