Browse Source

!22414 [MSLITE] add support for setting GPU device info via the device context for TensorRT

Merge pull request !22414 from Liu_Xuu/trt_0825_cudadevice
tags/v1.5.0-rc1
i-robot Gitee 4 years ago
parent
commit
207ac21853
14 changed files with 64 additions and 34 deletions
  1. +2
    -2
      mindspore/lite/build_lite.sh
  2. +1
    -0
      mindspore/lite/include/context.h
  3. +1
    -0
      mindspore/lite/src/common/context_util.cc
  4. +14
    -3
      mindspore/lite/src/cxx_api/context.cc
  5. +1
    -1
      mindspore/lite/src/cxx_api/converters.cc
  6. +11
    -6
      mindspore/lite/src/delegate/tensorrt/tensorrt_allocator.cc
  7. +1
    -1
      mindspore/lite/src/delegate/tensorrt/tensorrt_allocator.h
  8. +2
    -0
      mindspore/lite/src/delegate/tensorrt/tensorrt_delegate.cc
  9. +2
    -15
      mindspore/lite/src/delegate/tensorrt/tensorrt_subgraph.cc
  10. +0
    -2
      mindspore/lite/src/delegate/tensorrt/tensorrt_subgraph.h
  11. +20
    -0
      mindspore/lite/src/delegate/tensorrt/tensorrt_utils.cc
  12. +2
    -0
      mindspore/lite/src/delegate/tensorrt/tensorrt_utils.h
  13. +5
    -4
      mindspore/lite/tools/benchmark/benchmark_unified_api.cc
  14. +2
    -0
      mindspore/lite/tools/providers/NNIE/Hi3516D/compile_nnie.sh

+ 2
- 2
mindspore/lite/build_lite.sh View File

@@ -230,13 +230,13 @@ build_lite() {
compile_nnie_script=${BASEPATH}/mindspore/lite/tools/providers/NNIE/Hi3516D/compile_nnie.sh
cd ${BASEPATH}/../
if [[ "${local_lite_platform}" == "x86_64" ]]; then
sh ${compile_nnie_script} -I x86_64 -b nnie_3516_master -j $THREAD_NUM
sh ${compile_nnie_script} -I x86_64 -b nnie_3516_master_dev -j $THREAD_NUM
if [[ $? -ne 0 ]]; then
echo "compile x86_64 for nnie failed."
exit 1
fi
elif [[ "${local_lite_platform}" == "arm32" ]]; then
sh ${compile_nnie_script} -I arm32 -b nnie_3516_master -j $THREAD_NUM
sh ${compile_nnie_script} -I arm32 -b nnie_3516_master_dev -j $THREAD_NUM
if [[ $? -ne 0 ]]; then
echo "compile arm32 for nnie failed."
exit 1


+ 1
- 0
mindspore/lite/include/context.h View File

@@ -31,6 +31,7 @@ typedef struct CpuDeviceInfo {
/// \brief GpuDeviceInfo defined for GPU's configuration information.
typedef struct GpuDeviceInfo {
bool enable_float16_ = false; /**< prior enable float16 inference */
uint32_t gpu_device_id_ = 0;
} GpuDeviceInfo;

/// \brief NpuDeviceInfo defined for NPU's configuration information.


+ 1
- 0
mindspore/lite/src/common/context_util.cc View File

@@ -49,6 +49,7 @@ std::shared_ptr<mindspore::GPUDeviceInfo> GPUDeviceInfoFromGPUDeviceContext(cons
}
auto gpu_info = std::make_shared<mindspore::GPUDeviceInfo>();
gpu_info->SetEnableFP16(gpu_context.device_info_.gpu_device_info_.enable_float16_);
gpu_info->SetDeviceID(gpu_context.device_info_.gpu_device_info_.gpu_device_id_);
PassBasicProperties(gpu_info, gpu_context);
return gpu_info;
}


+ 14
- 3
mindspore/lite/src/cxx_api/context.cc View File

@@ -30,6 +30,7 @@
namespace mindspore {
constexpr auto kModelOptionCpuEnableFP16 = "mindspore.option.cpu.enable_fp16";
constexpr auto kModelOptionGPUEnableFP16 = "mindspore.option.gpu.enable_fp16";
constexpr auto kModelOptionGPUDeviceID = "mindspore.option.gpu.device_id";
constexpr auto kModelOptionKirinNpuFrequency = "mindspore.option.kirin_npu.frequency";
constexpr auto kModelOptionProvider = "mindspore.option.provider";
constexpr auto kModelOptionProviderDevice = "mindspore.option.provider.device";
@@ -276,10 +277,20 @@ int KirinNPUDeviceInfo::GetFrequency() const {
return GetValue<int>(data_, kModelOptionKirinNpuFrequency);
}

void GPUDeviceInfo::SetDeviceID(uint32_t device_id) { MS_LOG(ERROR) << "Unsupported Feature."; }
// Stores the target GPU device id into the context's option map under
// kModelOptionGPUDeviceID; consumed later when converting to the inner context.
void GPUDeviceInfo::SetDeviceID(uint32_t device_id) {
// data_ is the backing option store; without it there is nowhere to record the id.
if (data_ == nullptr) {
MS_LOG(ERROR) << "Invalid context.";
return;
}
data_->params[kModelOptionGPUDeviceID] = device_id;
}

// Returns the GPU device id previously stored by SetDeviceID, or 0 on error.
// NOTE(review): this span is a diff-rendering artifact — the next two statements
// are the REMOVED pre-change stub ("Unsupported Feature"), and the lines after
// them are the ADDED post-change implementation; they never coexist in the file.
uint32_t GPUDeviceInfo::GetDeviceID() const {
MS_LOG(ERROR) << "Unsupported Feature.";
return 0;
// Added implementation: read the id back out of the option map.
if (data_ == nullptr) {
MS_LOG(ERROR) << "Invalid context.";
return 0;
}
return GetValue<uint32_t>(data_, kModelOptionGPUDeviceID);
}

void GPUDeviceInfo::SetGpuTrtInferMode(bool gpu_trt_infer_mode) { MS_LOG(ERROR) << "Unsupported Feature."; }


+ 1
- 1
mindspore/lite/src/cxx_api/converters.cc View File

@@ -56,7 +56,7 @@ Status AddCpuDevice(Context *a_context, lite::InnerContext *l_context, DeviceInf
Status AddGpuDevice(Context *a_context, lite::InnerContext *l_context, DeviceInfoContext *device) {
lite::DeviceInfo device_info = {0};
auto gpu_context = device->Cast<GPUDeviceInfo>();
device_info.gpu_device_info_ = {gpu_context->GetEnableFP16()};
device_info.gpu_device_info_ = {gpu_context->GetEnableFP16(), gpu_context->GetDeviceID()};
l_context->device_list_.push_back({lite::DT_GPU, device_info, gpu_context->GetProvider(),
gpu_context->GetProviderDevice(), gpu_context->GetAllocator()});
return kSuccess;


+ 11
- 6
mindspore/lite/src/delegate/tensorrt/tensorrt_allocator.cc View File

@@ -49,18 +49,23 @@ void *TensorRTAllocator::MallocDeviceMem(const std::string &name, size_t size, D
if (cuda_tensor_map_[name].data != nullptr) {
cuda_ret = cudaFree(cuda_tensor_map_[name].data);
if (cuda_ret != cudaSuccess && cuda_ret != cudaErrorCudartUnloading) {
MS_LOG(ERROR) << "free cuda failed for " << cudaGetErrorName(cuda_ret);
MS_LOG(ERROR) << "free old cuda device_ptr failed for " << cudaGetErrorName(cuda_ret);
cuda_ret = cudaFree(device_ptr);
if (cuda_ret != cudaSuccess) {
MS_LOG(ERROR) << "free new cuda device_ptr failed for " << cudaGetErrorName(cuda_ret);
return nullptr;
}
return nullptr;
}
}
cuda_tensor_map_[name].data = device_ptr;
cuda_tensor_map_[name].isValidMem = false;
cuda_tensor_map_[name].is_valid_mem = false;
cuda_tensor_map_[name].size = size;
return device_ptr;
}

// Tags the named device buffer's validity flag so SyncMemInHostAndDevice can
// skip redundant host<->device memcpys.
// NOTE(review): the two assignments below are the REMOVED (isValidMem) and
// ADDED (is_valid_mem) forms of the same line — a diff-rendering artifact of
// the field rename; only one exists in the real file.
void TensorRTAllocator::MarkMemValid(const std::string &name, bool isValid) {
cuda_tensor_map_[name].isValidMem = isValid;
cuda_tensor_map_[name].is_valid_mem = isValid;
return;
}

@@ -83,8 +88,8 @@ int TensorRTAllocator::SyncMemInHostAndDevice(mindspore::MSTensor host_tensor, c
}
CudaTensorParam &current_cuda_tensor = cuda_tensor_map_.find(device_tensor_name)->second;
// is memcpy from device to host, the host mem is valid, change tag for mem pool.
current_cuda_tensor.isValidMem = is_host2device ? current_cuda_tensor.isValidMem : true;
if (is_host2device && current_cuda_tensor.isValidMem) {
current_cuda_tensor.is_valid_mem = is_host2device ? current_cuda_tensor.is_valid_mem : true;
if (is_host2device && current_cuda_tensor.is_valid_mem) {
MS_LOG(INFO) << "no need memcpy for: " << device_tensor_name;
return RET_OK;
}
@@ -108,7 +113,7 @@ int TensorRTAllocator::ClearDeviceMem() {
MS_LOG(WARNING) << "free cuda failed for " << cudaGetErrorName(cuda_ret);
}
iter.second.data = nullptr;
iter.second.isValidMem = false;
iter.second.is_valid_mem = false;
}
return RET_OK;
}


+ 1
- 1
mindspore/lite/src/delegate/tensorrt/tensorrt_allocator.h View File

@@ -25,7 +25,7 @@
namespace mindspore::lite {
// Bookkeeping record for one CUDA device buffer managed by TensorRTAllocator.
// NOTE(review): the two bool lines below are the REMOVED (isValidMem) and
// ADDED (is_valid_mem) spellings of the same member — a diff-rendering
// artifact of the snake_case rename; only one exists in the real file.
struct CudaTensorParam {
void *data = nullptr;   // device pointer returned by cudaMalloc; nullptr when freed
bool isValidMem = false;
bool is_valid_mem = false;   // true when device contents mirror the host tensor
size_t size = 0;   // allocation size in bytes
};
class TensorRTAllocator {


+ 2
- 0
mindspore/lite/src/delegate/tensorrt/tensorrt_delegate.cc View File

@@ -98,6 +98,7 @@ Status TensorRTDelegate::Init() {
{schema::PrimitiveType_Flatten, GetTensorRTOp<ShuffleTensorRT>},
{schema::PrimitiveType_Sqrt, GetTensorRTOp<UnaryTensorRT>},
};
lite::SetCudaDevice(device_info_);
if (runtime_ == nullptr) {
runtime_ = new (std::nothrow) TensorRTRuntime();
}
@@ -109,6 +110,7 @@ Status TensorRTDelegate::Init() {
}

Status TensorRTDelegate::Build(DelegateModel *model) {
lite::SetCudaDevice(device_info_);
KernelIter from, end;
std::vector<TensorRTOp *> tensorrt_ops;
int graph_index = 0;


+ 2
- 15
mindspore/lite/src/delegate/tensorrt/tensorrt_subgraph.cc View File

@@ -256,6 +256,7 @@ int TensorRTSubGraph::BuildTensorRTGraph() {
}

int TensorRTSubGraph::Prepare() {
lite::SetCudaDevice(device_info_);
if (this->engine_ == nullptr) {
MS_LOG(ERROR) << "engine_ is null in this builder_";
return RET_ERROR;
@@ -342,6 +343,7 @@ int TensorRTSubGraph::ReSize() {
}

int TensorRTSubGraph::Execute() {
lite::SetCudaDevice(device_info_);
if (runtime_->GetBatchSize() <= 0) {
MS_LOG(ERROR) << "TensorRTSubGraph has invalid batch size.";
return RET_ERROR;
@@ -422,19 +424,4 @@ nvinfer1::ITensor *TensorRTSubGraph::FindTensorRTInputs(TensorRTOp *cur_op, cons
}
return nullptr;
}
// (REMOVED by this commit — superseded by the shared lite::SetCudaDevice
// helper in tensorrt_utils.cc.) Binds the calling thread's CUDA context to
// the device id configured in device_info_; CUDA errors are logged as
// warnings but never abort execution.
void TensorRTSubGraph::SetCudaDevice() {
int device = 0;
// Query the device currently bound to this thread.
if (cudaGetDevice(&device) != cudaSuccess) {
MS_LOG(WARNING) << "cudaGetDevice failed, device is untrustable.";
}
// Switch only when the configured id differs from the current one.
if (device != static_cast<int>(device_info_->GetDeviceID())) {
if (cudaSetDevice(device_info_->GetDeviceID()) != cudaSuccess) {
MS_LOG(WARNING) << "cudaSetDevice failed.";
}
}
// Re-query to report which device is actually in effect.
if (cudaGetDevice(&device) != cudaSuccess) {
MS_LOG(WARNING) << "cudaGetDevice failed, device is untrustable.";
}
MS_LOG(INFO) << "cuda is running on device: " << device;
}
} // namespace mindspore::lite

+ 0
- 2
mindspore/lite/src/delegate/tensorrt/tensorrt_subgraph.h View File

@@ -67,8 +67,6 @@ class TensorRTSubGraph : public kernel::Kernel {

static nvinfer1::ITensor *FindTensorRTInputs(TensorRTOp *cur_op, const mindspore::MSTensor &in_tensor);

void SetCudaDevice();

std::vector<TensorRTOp *> all_ops_{};
// subgraph input nodes.
std::vector<TensorRTOp *> in_ops_{};


+ 20
- 0
mindspore/lite/src/delegate/tensorrt/tensorrt_utils.cc View File

@@ -15,6 +15,7 @@
*/

#include "src/delegate/tensorrt/tensorrt_utils.h"
#include <cuda_runtime_api.h>
#include <map>

namespace mindspore::lite {
@@ -234,4 +235,23 @@ nvinfer1::Weights ConvertWeight(const mindspore::MSTensor &ms_tensor) {
}
return weights;
}

void SetCudaDevice(std::shared_ptr<GPUDeviceInfo> device_info_) {
int device = 0;
auto ret = cudaGetDevice(&device);
if (ret != cudaSuccess) {
MS_LOG(WARNING) << "cudaGetDevice failed, device is untrustable. error code: " << ret;
}
int set_device_id = static_cast<int>(device_info_->GetDeviceID());
if (device != set_device_id) {
ret = cudaSetDevice(set_device_id);
if (ret != cudaSuccess) {
MS_LOG(WARNING) << "cudaSetDevice failed, error code: " << ret;
}
}
if (cudaGetDevice(&device) != cudaSuccess) {
MS_LOG(WARNING) << "cudaGetDevice failed, device is untrustable.";
}
MS_LOG(INFO) << "cuda is running on device: " << device;
}
} // namespace mindspore::lite

+ 2
- 0
mindspore/lite/src/delegate/tensorrt/tensorrt_utils.h View File

@@ -17,6 +17,7 @@
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_UTILS_H_
#include <vector>
#include <NvInfer.h>
#include <memory>
#include "src/delegate/tensorrt/op/tensorrt_op.h"
#include "mindspore/core/ir/dtype/type_id.h"
#include "schema/ops_generated.h"
@@ -59,5 +60,6 @@ nvinfer1::Weights TransposeWeight(const mindspore::MSTensor &ms_tensor, float **

nvinfer1::Weights ConvertWeight(const mindspore::MSTensor &ms_tensor);

void SetCudaDevice(std::shared_ptr<GPUDeviceInfo> device_info_);
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_UTILS_H_

+ 5
- 4
mindspore/lite/tools/benchmark/benchmark_unified_api.cc View File

@@ -165,10 +165,6 @@ void BenchmarkUnifiedApi::InitMSContext(const std::shared_ptr<mindspore::Context
context->SetThreadAffinity(flags_->cpu_bind_mode_);
auto &device_list = context->MutableDeviceInfo();

std::shared_ptr<CPUDeviceInfo> device_info = std::make_shared<CPUDeviceInfo>();
device_info->SetEnableFP16(flags_->enable_fp16_);
device_list.push_back(device_info);

if (flags_->device_ == "GPU") {
std::shared_ptr<GPUDeviceInfo> gpu_device_info = std::make_shared<GPUDeviceInfo>();
gpu_device_info->SetEnableFP16(flags_->enable_fp16_);
@@ -180,6 +176,11 @@ void BenchmarkUnifiedApi::InitMSContext(const std::shared_ptr<mindspore::Context
npu_device_info->SetFrequency(kFrequencyDefault);
device_list.push_back(npu_device_info);
}

// CPU priority is behind GPU and NPU
std::shared_ptr<CPUDeviceInfo> device_info = std::make_shared<CPUDeviceInfo>();
device_info->SetEnableFP16(flags_->enable_fp16_);
device_list.push_back(device_info);
}

int BenchmarkUnifiedApi::CompareOutput() {


+ 2
- 0
mindspore/lite/tools/providers/NNIE/Hi3516D/compile_nnie.sh View File

@@ -19,6 +19,8 @@ function Run_Build_x86() {
export MSLITE_ENABLE_NNIE=on
export MSLITE_ENABLE_CONVERTER=off
export MSLITE_ENABLE_TRAIN=off
export MSLITE_ENABLE_TOOLS=off
export MSLITE_ENABLE_TESTCASES=off
bash ${nnie_code_path}/mindspore/build.sh -I x86_64 -j ${thread_num}
if [ $? = 0 ]; then
echo "build x86 for nnie success"


Loading…
Cancel
Save