Browse Source

!16570 gpu inference

From: @wilfchen
Reviewed-by: @limingqi107,@cristoval
Signed-off-by: @cristoval
tags/v1.3.0
mindspore-ci-bot Gitee 4 years ago
parent
commit
ac9754b7c8
7 changed files with 84 additions and 3 deletions
  1. +2
    -1
      mindspore/ccsrc/backend/optimizer/trt_pass/trt_op_factory.h
  2. +55
    -0
      mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc
  3. +2
    -0
      mindspore/ccsrc/backend/session/anf_runtime_algorithm.h
  4. +3
    -0
      mindspore/ccsrc/backend/session/kernel_graph.h
  5. +21
    -1
      mindspore/ccsrc/cxx_api/graph/gpu/gpu_graph_impl.cc
  6. +1
    -0
      mindspore/ccsrc/cxx_api/graph/gpu/gpu_graph_impl.h
  7. +0
    -1
      mindspore/ccsrc/runtime/device/gpu/trt_loader.cc

+ 2
- 1
mindspore/ccsrc/backend/optimizer/trt_pass/trt_op_factory.h View File

@@ -50,7 +50,8 @@ class TrtOpFactory {
// Look up the TensorRT converter registered for `op_name`.
// Returns nullptr (with a warning) when the operator has no TRT converter,
// letting the caller fall back to the native backend instead of aborting.
ConvertFunc GetConvertFunc(const std::string &op_name) const {
auto iter = op_convert_map_.find(op_name);
if (iter == op_convert_map_.end()) {
// Unsupported op is not fatal here: report and let the caller decide.
MS_LOG(WARNING) << "Operator: " << op_name << " not support.";
return nullptr;
}
return iter->second;
}


+ 55
- 0
mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc View File

@@ -107,6 +107,53 @@ std::vector<size_t> TransShapeToSizet(const abstract::ShapePtr &shape) {
}

enum ShapeType { kMaxShape, kMinShape };

// Resolve `node`'s output at `output_index` down to the "real" producing
// node(s), appending each (node, index) pair to `inputs`. Transparent
// wrappers are traversed: Depend/Load/UpdateState are skipped, TupleGetItem
// is peeled (and a MakeTuple+TupleGetItem pair is cancelled out), and a
// MakeTuple output is flattened into its elements.
void GetRealOutputRecursively(const AnfNodePtr &node, size_t output_index,
std::vector<session::KernelWithIndex> *inputs) {
MS_EXCEPTION_IF_NULL(node);
// Leaves: constants and parameters are already real outputs (index 0).
if (node->isa<ValueNode>() || node->isa<Parameter>()) {
return inputs->push_back(std::make_pair(node, 0));
}

// Skip control node
// Depend/Load/UpdateState only order execution; follow their real input.
// NOTE(review): Load/UpdateState are recursed via kRealInputIndexInDepend —
// assumes the real-data input sits at the same index for all three prims.
if (AnfAlgo::CheckPrimitiveType(node, prim::kPrimDepend) || AnfAlgo::CheckPrimitiveType(node, prim::kPrimLoad) ||
AnfAlgo::CheckPrimitiveType(node, prim::kPrimUpdateState)) {
return GetRealOutputRecursively(node->cast<CNodePtr>()->input(kRealInputIndexInDepend), 0, inputs);
}

// Bypass TupleGetItem
if (AnfAlgo::CheckPrimitiveType(node, prim::kPrimTupleGetItem)) {
auto tuple_get_item = node->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(tuple_get_item);
auto input = AnfAlgo::GetTupleGetItemRealInput(tuple_get_item);
auto index = AnfAlgo::GetTupleGetItemOutIndex(tuple_get_item);

// Conceal MakeTuple + TupleGetItem pair.
// TupleGetItem(MakeTuple(...), i) is the i-th MakeTuple input directly.
if (AnfAlgo::CheckPrimitiveType(input, prim::kPrimMakeTuple)) {
auto make_tuple = input->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(make_tuple);
auto real_input = AnfAlgo::GetInputNode(make_tuple, index);
return GetRealOutputRecursively(real_input, 0, inputs);
}

// Skip TupleGetItem.
return GetRealOutputRecursively(input, index, inputs);
}

// Flatten MakeTuple inputs.
// Each element of the tuple contributes its own real output(s), in order.
if (AnfAlgo::CheckPrimitiveType(node, prim::kPrimMakeTuple)) {
auto make_tuple = node->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(make_tuple);
size_t input_num = AnfAlgo::GetInputTensorNum(make_tuple);
for (size_t input_index = 0; input_index < input_num; ++input_index) {
auto input_node = AnfAlgo::GetInputNode(make_tuple, input_index);
GetRealOutputRecursively(input_node, 0, inputs);
}
return;
}

// Any other node is a real kernel: record it with the requested index.
return inputs->push_back(std::make_pair(node, output_index));
}
} // namespace

AnfNodePtr AnfRuntimeAlgorithm::MakeMonadValueNode(const KernelGraphPtr &kg) {
@@ -1956,5 +2003,13 @@ AnfNodeIndexSet AnfRuntimeAlgorithm::GetUpdateStateUsers(const FuncGraphManagerP
}
return update_states;
}

void AnfRuntimeAlgorithm::GetRealInputs(const AnfNodePtr &node, std::vector<session::KernelWithIndex> *inputs) {
size_t input_num = AnfAlgo::GetInputTensorNum(node);
for (size_t input_index = 0; input_index < input_num; ++input_index) {
auto input_node = AnfAlgo::GetInputNode(node->cast<CNodePtr>(), input_index);
GetRealOutputRecursively(input_node, 0, inputs);
}
}
} // namespace session
} // namespace mindspore

+ 2
- 0
mindspore/ccsrc/backend/session/anf_runtime_algorithm.h View File

@@ -273,6 +273,8 @@ class AnfRuntimeAlgorithm {
std::set<AnfNodePtr> *visited);
static void InsertMakeTupleForOutput(NotNull<KernelGraphPtr> root_graph);
static AnfNodeIndexSet GetUpdateStateUsers(const FuncGraphManagerPtr &manager, const AnfNodePtr &node);
// Get node real inputs, skip `MakeTuple`, `TupleGetItem`, `Depend`, `Load`, `UpdateState` etc.
static void GetRealInputs(const AnfNodePtr &anf_node, std::vector<session::KernelWithIndex> *inputs);
};
} // namespace session
using AnfAlgo = session::AnfRuntimeAlgorithm;


+ 3
- 0
mindspore/ccsrc/backend/session/kernel_graph.h View File

@@ -98,6 +98,9 @@ class KernelGraph : public FuncGraph {

const std::vector<AnfNodePtr> &inputs() const;
std::vector<AnfNodePtr> *MutableInputs() const { return inputs_.get(); }
// Replace the graph's input list wholesale (takes a copy of `inputs`).
void SetGraphInputs(const std::vector<AnfNodePtr> &inputs) {
inputs_ = std::make_shared<std::vector<AnfNodePtr>>(inputs);
}
void ReplaceGraphInput(const AnfNodePtr &old_parameter, const AnfNodePtr &new_parameter);
std::vector<AnfNodePtr> outputs() const;
CNodePtr NewCNode(const std::vector<AnfNodePtr> &inputs) override;


+ 21
- 1
mindspore/ccsrc/cxx_api/graph/gpu/gpu_graph_impl.cc View File

@@ -23,6 +23,7 @@
#include "backend/session/session_factory.h"
#include "backend/session/executor_manager.h"
#include "runtime/device/kernel_runtime_manager.h"
#include "runtime/device/gpu/cuda_driver.h"

namespace mindspore {
API_FACTORY_REG(GraphCell::GraphImpl, GPU, GPUGraphImpl);
@@ -36,7 +37,8 @@ GPUGraphImpl::GPUGraphImpl()
input_names_(),
output_names_(),
init_flag_(false),
load_flag_(false) {}
load_flag_(false),
set_device_id_flag_(false) {}

Status GPUGraphImpl::InitEnv() {
if (init_flag_) {
@@ -55,6 +57,13 @@ Status GPUGraphImpl::InitEnv() {
ms_context->set_param<uint32_t>(MS_CTX_DEVICE_ID, device_id_);
ms_context->set_param<std::string>(MS_CTX_DEVICE_TARGET, kGPUDevice);

// Set device id for sync data to host as cudaSetDevice is thread level config.
bool ret = device::gpu::CudaDriver::SetDevice(UintToInt(device_id_));
if (!ret) {
MS_LOG(ERROR) << "Failed to set device id:" << device_id_;
return kMCDeviceError;
}

auto &device_infos = graph_context_->MutableDeviceInfo();
if (device_infos.size() != 1) {
return kMCDeviceError;
@@ -194,6 +203,17 @@ Status GPUGraphImpl::Run(const std::vector<MSTensor> &inputs, std::vector<MSTens
}
}

// The `Load()` and `Run()` running in two threads. `Run()` always running in same thread.
// It should set device id once.
if (!set_device_id_flag_) {
bool ret = device::gpu::CudaDriver::SetDevice(UintToInt(device_id_));
if (!ret) {
MS_LOG(ERROR) << "Failed to set device id:" << device_id_;
return kMCDeviceError;
}
set_device_id_flag_ = true;
}

if (inputs.size() != inputs_info_.size()) {
MS_LOG(ERROR) << "inputs count not match, required count " << inputs_info_.size() << ", given count "
<< inputs.size();


+ 1
- 0
mindspore/ccsrc/cxx_api/graph/gpu/gpu_graph_impl.h View File

@@ -57,6 +57,7 @@ class GPUGraphImpl : public GraphCell::GraphImpl {
std::vector<std::string> output_names_;
bool init_flag_;
bool load_flag_;
bool set_device_id_flag_;

// tensor-rt
uint32_t batch_size_;


+ 0
- 1
mindspore/ccsrc/runtime/device/gpu/trt_loader.cc View File

@@ -30,7 +30,6 @@ TrtLoader::TrtLoader()
if (nvinfer_handle_ == nullptr) {
MS_LOG(WARNING) << "Can not open libnvinfer.so. " << dlerror()
<< ". Install Tensor-RT and export LD_LIBRARY_PATH=${TENSORRT_HOME}/lib:$LD_LIBRARY_PATH.";
MS_LOG(WARNING) << "Inference with native backend.";
return;
}



Loading…
Cancel
Save