Browse Source

!22414 [MSLITE] add support for setting GPU device info via the device context for TensorRT

Merge pull request !22414 from Liu_Xuu/trt_0825_cudadevice
tags/v1.5.0-rc1
i-robot Gitee 4 years ago
parent
commit
207ac21853
14 changed files with 64 additions and 34 deletions
  1. +2
    -2
      mindspore/lite/build_lite.sh
  2. +1
    -0
      mindspore/lite/include/context.h
  3. +1
    -0
      mindspore/lite/src/common/context_util.cc
  4. +14
    -3
      mindspore/lite/src/cxx_api/context.cc
  5. +1
    -1
      mindspore/lite/src/cxx_api/converters.cc
  6. +11
    -6
      mindspore/lite/src/delegate/tensorrt/tensorrt_allocator.cc
  7. +1
    -1
      mindspore/lite/src/delegate/tensorrt/tensorrt_allocator.h
  8. +2
    -0
      mindspore/lite/src/delegate/tensorrt/tensorrt_delegate.cc
  9. +2
    -15
      mindspore/lite/src/delegate/tensorrt/tensorrt_subgraph.cc
  10. +0
    -2
      mindspore/lite/src/delegate/tensorrt/tensorrt_subgraph.h
  11. +20
    -0
      mindspore/lite/src/delegate/tensorrt/tensorrt_utils.cc
  12. +2
    -0
      mindspore/lite/src/delegate/tensorrt/tensorrt_utils.h
  13. +5
    -4
      mindspore/lite/tools/benchmark/benchmark_unified_api.cc
  14. +2
    -0
      mindspore/lite/tools/providers/NNIE/Hi3516D/compile_nnie.sh

+ 2
- 2
mindspore/lite/build_lite.sh View File

@@ -230,13 +230,13 @@ build_lite() {
compile_nnie_script=${BASEPATH}/mindspore/lite/tools/providers/NNIE/Hi3516D/compile_nnie.sh
cd ${BASEPATH}/../
if [[ "${local_lite_platform}" == "x86_64" ]]; then
sh ${compile_nnie_script} -I x86_64 -b nnie_3516_master -j $THREAD_NUM
sh ${compile_nnie_script} -I x86_64 -b nnie_3516_master_dev -j $THREAD_NUM
if [[ $? -ne 0 ]]; then
echo "compile x86_64 for nnie failed."
exit 1
fi
elif [[ "${local_lite_platform}" == "arm32" ]]; then
sh ${compile_nnie_script} -I arm32 -b nnie_3516_master -j $THREAD_NUM
sh ${compile_nnie_script} -I arm32 -b nnie_3516_master_dev -j $THREAD_NUM
if [[ $? -ne 0 ]]; then
echo "compile arm32 for nnie failed."
exit 1


+ 1
- 0
mindspore/lite/include/context.h View File

@@ -31,6 +31,7 @@ typedef struct CpuDeviceInfo {
/// \brief GpuDeviceInfo defined for GPU's configuration information.
typedef struct GpuDeviceInfo {
bool enable_float16_ = false; /**< prior enable float16 inference */
uint32_t gpu_device_id_ = 0;
} GpuDeviceInfo;

/// \brief NpuDeviceInfo defined for NPU's configuration information.


+ 1
- 0
mindspore/lite/src/common/context_util.cc View File

@@ -49,6 +49,7 @@ std::shared_ptr<mindspore::GPUDeviceInfo> GPUDeviceInfoFromGPUDeviceContext(cons
}
auto gpu_info = std::make_shared<mindspore::GPUDeviceInfo>();
gpu_info->SetEnableFP16(gpu_context.device_info_.gpu_device_info_.enable_float16_);
gpu_info->SetDeviceID(gpu_context.device_info_.gpu_device_info_.gpu_device_id_);
PassBasicProperties(gpu_info, gpu_context);
return gpu_info;
}


+ 14
- 3
mindspore/lite/src/cxx_api/context.cc View File

@@ -30,6 +30,7 @@
namespace mindspore {
constexpr auto kModelOptionCpuEnableFP16 = "mindspore.option.cpu.enable_fp16";
constexpr auto kModelOptionGPUEnableFP16 = "mindspore.option.gpu.enable_fp16";
constexpr auto kModelOptionGPUDeviceID = "mindspore.option.gpu.device_id";
constexpr auto kModelOptionKirinNpuFrequency = "mindspore.option.kirin_npu.frequency";
constexpr auto kModelOptionProvider = "mindspore.option.provider";
constexpr auto kModelOptionProviderDevice = "mindspore.option.provider.device";
@@ -276,10 +277,20 @@ int KirinNPUDeviceInfo::GetFrequency() const {
return GetValue<int>(data_, kModelOptionKirinNpuFrequency);
}

void GPUDeviceInfo::SetDeviceID(uint32_t device_id) { MS_LOG(ERROR) << "Unsupported Feature."; }
// Stores the target GPU device id into the context's option map under
// kModelOptionGPUDeviceID; consumed later when converting to the inner context.
void GPUDeviceInfo::SetDeviceID(uint32_t device_id) {
// data_ is the backing option store; without it there is nowhere to record the id.
if (data_ == nullptr) {
MS_LOG(ERROR) << "Invalid context.";
return;
}
data_->params[kModelOptionGPUDeviceID] = device_id;
}

// Returns the GPU device id previously stored by SetDeviceID, or 0 on error.
// NOTE(review): this span is a diff-rendering artifact — the next two statements
// are the REMOVED pre-change stub ("Unsupported Feature"), and the lines after
// them are the ADDED post-change implementation; they never coexist in the file.
uint32_t GPUDeviceInfo::GetDeviceID() const {
MS_LOG(ERROR) << "Unsupported Feature.";
return 0;
// Added implementation: read the id back out of the option map.
if (data_ == nullptr) {
MS_LOG(ERROR) << "Invalid context.";
return 0;
}
return GetValue<uint32_t>(data_, kModelOptionGPUDeviceID);
}

void GPUDeviceInfo::SetGpuTrtInferMode(bool gpu_trt_infer_mode) { MS_LOG(ERROR) << "Unsupported Feature."; }


+ 1
- 1
mindspore/lite/src/cxx_api/converters.cc View File

@@ -56,7 +56,7 @@ Status AddCpuDevice(Context *a_context, lite::InnerContext *l_context, DeviceInf
Status AddGpuDevice(Context *a_context, lite::InnerContext *l_context, DeviceInfoContext *device) {
lite::DeviceInfo device_info = {0};
auto gpu_context = device->Cast<GPUDeviceInfo>();
device_info.gpu_device_info_ = {gpu_context->GetEnableFP16()};
device_info.gpu_device_info_ = {gpu_context->GetEnableFP16(), gpu_context->GetDeviceID()};
l_context->device_list_.push_back({lite::DT_GPU, device_info, gpu_context->GetProvider(),
gpu_context->GetProviderDevice(), gpu_context->GetAllocator()});
return kSuccess;


+ 11
- 6
mindspore/lite/src/delegate/tensorrt/tensorrt_allocator.cc View File

@@ -49,18 +49,23 @@ void *TensorRTAllocator::MallocDeviceMem(const std::string &name, size_t size, D
if (cuda_tensor_map_[name].data != nullptr) {
cuda_ret = cudaFree(cuda_tensor_map_[name].data);
if (cuda_ret != cudaSuccess && cuda_ret != cudaErrorCudartUnloading) {
MS_LOG(ERROR) << "free cuda failed for " << cudaGetErrorName(cuda_ret);
MS_LOG(ERROR) << "free old cuda device_ptr failed for " << cudaGetErrorName(cuda_ret);
cuda_ret = cudaFree(device_ptr);
if (cuda_ret != cudaSuccess) {
MS_LOG(ERROR) << "free new cuda device_ptr failed for " << cudaGetErrorName(cuda_ret);
return nullptr;
}
return nullptr;
}
}
cuda_tensor_map_[name].data = device_ptr;
cuda_tensor_map_[name].isValidMem = false;
cuda_tensor_map_[name].is_valid_mem = false;
cuda_tensor_map_[name].size = size;
return device_ptr;
}

// Tags the named device buffer's validity flag so SyncMemInHostAndDevice can
// skip redundant host<->device memcpys.
// NOTE(review): the two assignments below are the REMOVED (isValidMem) and
// ADDED (is_valid_mem) forms of the same line — a diff-rendering artifact of
// the field rename; only one exists in the real file.
void TensorRTAllocator::MarkMemValid(const std::string &name, bool isValid) {
cuda_tensor_map_[name].isValidMem = isValid;
cuda_tensor_map_[name].is_valid_mem = isValid;
return;
}

@@ -83,8 +88,8 @@ int TensorRTAllocator::SyncMemInHostAndDevice(mindspore::MSTensor host_tensor, c
}
CudaTensorParam &current_cuda_tensor = cuda_tensor_map_.find(device_tensor_name)->second;
// is memcpy from device to host, the host mem is valid, change tag for mem pool.
current_cuda_tensor.isValidMem = is_host2device ? current_cuda_tensor.isValidMem : true;
if (is_host2device && current_cuda_tensor.isValidMem) {
current_cuda_tensor.is_valid_mem = is_host2device ? current_cuda_tensor.is_valid_mem : true;
if (is_host2device && current_cuda_tensor.is_valid_mem) {
MS_LOG(INFO) << "no need memcpy for: " << device_tensor_name;
return RET_OK;
}
@@ -108,7 +113,7 @@ int TensorRTAllocator::ClearDeviceMem() {
MS_LOG(WARNING) << "free cuda failed for " << cudaGetErrorName(cuda_ret);
}
iter.second.data = nullptr;
iter.second.isValidMem = false;
iter.second.is_valid_mem = false;
}
return RET_OK;
}


+ 1
- 1
mindspore/lite/src/delegate/tensorrt/tensorrt_allocator.h View File

@@ -25,7 +25,7 @@
namespace mindspore::lite {
// Bookkeeping record for one CUDA device buffer managed by TensorRTAllocator.
// NOTE(review): the two bool lines below are the REMOVED (isValidMem) and
// ADDED (is_valid_mem) spellings of the same member — a diff-rendering
// artifact of the snake_case rename; only one exists in the real file.
struct CudaTensorParam {
void *data = nullptr;   // device pointer returned by cudaMalloc; nullptr when freed
bool isValidMem = false;
bool is_valid_mem = false;   // true when device contents mirror the host tensor
size_t size = 0;   // allocation size in bytes
};
class TensorRTAllocator {


+ 2
- 0
mindspore/lite/src/delegate/tensorrt/tensorrt_delegate.cc View File

@@ -98,6 +98,7 @@ Status TensorRTDelegate::Init() {
{schema::PrimitiveType_Flatten, GetTensorRTOp<ShuffleTensorRT>},
{schema::PrimitiveType_Sqrt, GetTensorRTOp<UnaryTensorRT>},
};
lite::SetCudaDevice(device_info_);
if (runtime_ == nullptr) {
runtime_ = new (std::nothrow) TensorRTRuntime();
}
@@ -109,6 +110,7 @@ Status TensorRTDelegate::Init() {
}

Status TensorRTDelegate::Build(DelegateModel *model) {
lite::SetCudaDevice(device_info_);
KernelIter from, end;
std::vector<TensorRTOp *> tensorrt_ops;
int graph_index = 0;


+ 2
- 15
mindspore/lite/src/delegate/tensorrt/tensorrt_subgraph.cc View File

@@ -256,6 +256,7 @@ int TensorRTSubGraph::BuildTensorRTGraph() {
}

int TensorRTSubGraph::Prepare() {
lite::SetCudaDevice(device_info_);
if (this->engine_ == nullptr) {
MS_LOG(ERROR) << "engine_ is null in this builder_";
return RET_ERROR;
@@ -342,6 +343,7 @@ int TensorRTSubGraph::ReSize() {
}

int TensorRTSubGraph::Execute() {
lite::SetCudaDevice(device_info_);
if (runtime_->GetBatchSize() <= 0) {
MS_LOG(ERROR) << "TensorRTSubGraph has invalid batch size.";
return RET_ERROR;
@@ -422,19 +424,4 @@ nvinfer1::ITensor *TensorRTSubGraph::FindTensorRTInputs(TensorRTOp *cur_op, cons
}
return nullptr;
}
// (REMOVED by this commit — superseded by the shared lite::SetCudaDevice
// helper in tensorrt_utils.cc.) Binds the calling thread's CUDA context to
// the device id configured in device_info_; CUDA errors are logged as
// warnings but never abort execution.
void TensorRTSubGraph::SetCudaDevice() {
int device = 0;
// Query the device currently bound to this thread.
if (cudaGetDevice(&device) != cudaSuccess) {
MS_LOG(WARNING) << "cudaGetDevice failed, device is untrustable.";
}
// Switch only when the configured id differs from the current one.
if (device != static_cast<int>(device_info_->GetDeviceID())) {
if (cudaSetDevice(device_info_->GetDeviceID()) != cudaSuccess) {
MS_LOG(WARNING) << "cudaSetDevice failed.";
}
}
// Re-query to report which device is actually in effect.
if (cudaGetDevice(&device) != cudaSuccess) {
MS_LOG(WARNING) << "cudaGetDevice failed, device is untrustable.";
}
MS_LOG(INFO) << "cuda is running on device: " << device;
}
} // namespace mindspore::lite

+ 0
- 2
mindspore/lite/src/delegate/tensorrt/tensorrt_subgraph.h View File

@@ -67,8 +67,6 @@ class TensorRTSubGraph : public kernel::Kernel {

static nvinfer1::ITensor *FindTensorRTInputs(TensorRTOp *cur_op, const mindspore::MSTensor &in_tensor);

void SetCudaDevice();

std::vector<TensorRTOp *> all_ops_{};
// subgraph input nodes.
std::vector<TensorRTOp *> in_ops_{};


+ 20
- 0
mindspore/lite/src/delegate/tensorrt/tensorrt_utils.cc View File

@@ -15,6 +15,7 @@
*/

#include "src/delegate/tensorrt/tensorrt_utils.h"
#include <cuda_runtime_api.h>
#include <map>

namespace mindspore::lite {
@@ -234,4 +235,23 @@ nvinfer1::Weights ConvertWeight(const mindspore::MSTensor &ms_tensor) {
}
return weights;
}

void SetCudaDevice(std::shared_ptr<GPUDeviceInfo> device_info_) {
int device = 0;
auto ret = cudaGetDevice(&device);
if (ret != cudaSuccess) {
MS_LOG(WARNING) << "cudaGetDevice failed, device is untrustable. error code: " << ret;
}
int set_device_id = static_cast<int>(device_info_->GetDeviceID());
if (device != set_device_id) {
ret = cudaSetDevice(set_device_id);
if (ret != cudaSuccess) {
MS_LOG(WARNING) << "cudaSetDevice failed, error code: " << ret;
}
}
if (cudaGetDevice(&device) != cudaSuccess) {
MS_LOG(WARNING) << "cudaGetDevice failed, device is untrustable.";
}
MS_LOG(INFO) << "cuda is running on device: " << device;
}
} // namespace mindspore::lite

+ 2
- 0
mindspore/lite/src/delegate/tensorrt/tensorrt_utils.h View File

@@ -17,6 +17,7 @@
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_UTILS_H_
#include <vector>
#include <NvInfer.h>
#include <memory>
#include "src/delegate/tensorrt/op/tensorrt_op.h"
#include "mindspore/core/ir/dtype/type_id.h"
#include "schema/ops_generated.h"
@@ -59,5 +60,6 @@ nvinfer1::Weights TransposeWeight(const mindspore::MSTensor &ms_tensor, float **

nvinfer1::Weights ConvertWeight(const mindspore::MSTensor &ms_tensor);

void SetCudaDevice(std::shared_ptr<GPUDeviceInfo> device_info_);
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_UTILS_H_

+ 5
- 4
mindspore/lite/tools/benchmark/benchmark_unified_api.cc View File

@@ -165,10 +165,6 @@ void BenchmarkUnifiedApi::InitMSContext(const std::shared_ptr<mindspore::Context
context->SetThreadAffinity(flags_->cpu_bind_mode_);
auto &device_list = context->MutableDeviceInfo();

std::shared_ptr<CPUDeviceInfo> device_info = std::make_shared<CPUDeviceInfo>();
device_info->SetEnableFP16(flags_->enable_fp16_);
device_list.push_back(device_info);

if (flags_->device_ == "GPU") {
std::shared_ptr<GPUDeviceInfo> gpu_device_info = std::make_shared<GPUDeviceInfo>();
gpu_device_info->SetEnableFP16(flags_->enable_fp16_);
@@ -180,6 +176,11 @@ void BenchmarkUnifiedApi::InitMSContext(const std::shared_ptr<mindspore::Context
npu_device_info->SetFrequency(kFrequencyDefault);
device_list.push_back(npu_device_info);
}

// CPU priority is behind GPU and NPU
std::shared_ptr<CPUDeviceInfo> device_info = std::make_shared<CPUDeviceInfo>();
device_info->SetEnableFP16(flags_->enable_fp16_);
device_list.push_back(device_info);
}

int BenchmarkUnifiedApi::CompareOutput() {


+ 2
- 0
mindspore/lite/tools/providers/NNIE/Hi3516D/compile_nnie.sh View File

@@ -19,6 +19,8 @@ function Run_Build_x86() {
export MSLITE_ENABLE_NNIE=on
export MSLITE_ENABLE_CONVERTER=off
export MSLITE_ENABLE_TRAIN=off
export MSLITE_ENABLE_TOOLS=off
export MSLITE_ENABLE_TESTCASES=off
bash ${nnie_code_path}/mindspore/build.sh -I x86_64 -j ${thread_num}
if [ $? = 0 ]; then
echo "build x86 for nnie success"


Loading…
Cancel
Save