
executable_cuda.cpp
/**
 * \file src/jit/impl/mlir/executable_cuda.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */

#include <vector>
#include "megbrain_build_config.h"
#include "megdnn/dtype.h"

#if MGB_JIT && MGB_JIT_MLIR
#if MGB_CUDA

#include "./executable_cuda.h"

#include "megbrain/comp_node_env.h"
#include "megbrain/jit/mlir/ir/utils.h"
#include "megbrain/utils/persistent_cache.h"
#include "megbrain/utils/timer.h"

#include <mlir/Dialect/GPU/GPUDialect.h>
#include <mlir/ExecutionEngine/CRunnerUtils.h>
#include <mlir/ExecutionEngine/OptUtils.h>
#include <mlir/IR/OpDefinition.h>

using namespace mgb;
using namespace jit;

namespace {

template <int out_dim, typename ctype>
void setup_and_launch(const JITExecutor* fusion_opr, CUfunction func,
                      int block_size) {
    auto&& args = fusion_opr->args();
    std::vector<StridedMemRefType<ctype, out_dim>> param_holders;
    std::vector<void*> params;

    // Pack one tensor into an MLIR StridedMemRefType descriptor and append a
    // pointer to each of its fields to the kernel parameter list.
    auto set_params = [&param_holders, &params](
                              void* ptr, const megdnn::TensorLayout& layout) {
        param_holders.push_back(StridedMemRefType<ctype, out_dim>{});
        StridedMemRefType<ctype, out_dim>& desc = param_holders.back();
        desc.basePtr = static_cast<ctype*>(ptr);
        params.push_back(&(desc.basePtr));
        desc.data = static_cast<ctype*>(ptr);
        params.push_back(&(desc.data));
        desc.offset = 0;
        params.push_back(&(desc.offset));
        for (size_t i = 0; i < layout.ndim; i++) {
            desc.sizes[i] = layout.shape[i];
            params.push_back(&(desc.sizes[i]));
            desc.strides[i] = layout.stride[i];
            params.push_back(&(desc.strides[i]));
        }
    };

    for (const auto& arg : args.inputs) {
        set_params(arg.from->dev_tensor().raw_ptr(), arg.layout);
    }

    int64_t nr_elements = 0;
    for (const auto& arg : args.outputs) {
        if (nr_elements == 0) {
            nr_elements = arg.layout.total_nr_elems();
        } else {
            mgb_assert(static_cast<size_t>(nr_elements) ==
                               arg.layout.total_nr_elems(),
                       "The number of elements of outputs mismatch, expected: "
                       "%zu got: %zu(%s)",
                       static_cast<size_t>(nr_elements),
                       arg.layout.total_nr_elems(),
                       arg.layout.to_string().c_str());
        }
        set_params(arg.from->dev_tensor().raw_ptr(), arg.layout);
    }

    const CompNodeEnv& env =
            CompNodeEnv::from_comp_node(fusion_opr->comp_node());

    // Launch enough threads to cover every output element; the element count
    // is prepended as the first kernel argument.
    int64_t num_block = (nr_elements - 1) / block_size + 1;
    params.insert(params.begin(), &nr_elements);
    MGB_CUDA_CU_CHECK(cuLaunchKernel(func, num_block, 1, 1, block_size, 1, 1, 0,
                                     env.cuda_env().stream, params.data(), 0));
}

}  // namespace

const std::string MLIRCUDAExecutable::sm_blob_annotation = "nvvm.cubin";

MLIRCUDAExecutable::MLIRCUDAExecutable(mlir::OwningModuleRef& module,
                                       const std::string& kernel_name) {
    m_kernel_name = kernel_name + "_kernel";
    auto kernel_module =
            module->lookupSymbol<mlir::gpu::GPUModuleOp>(m_kernel_name);
    mgb_assert(kernel_module, "Expected gpu kernel module");

    auto binary_attr = kernel_module.getAttrOfType<mlir::StringAttr>(
            llvm::StringRef(sm_blob_annotation));
    mgb_assert(binary_attr, "Missing %s attribute in gpu kernel module",
               sm_blob_annotation.c_str());
    m_kernel_data = binary_attr.getValue().str();
}

void MLIRCUDAExecutable::execute(JITExecutor* fusion_opr) {
    FuncCache* func;
    auto cn = fusion_opr->comp_node();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    func = &m_func_cache[{prop.major, prop.minor}];
    func->kernel_data = m_kernel_data;
    func->exec(fusion_opr, this);
}

MLIRCUDAExecutable::~MLIRCUDAExecutable() {}

void MLIRCUDAExecutable::FuncCache::exec(const JITExecutor* fusion_opr,
                                         const MLIRCUDAExecutable* cuda_exe) {
    Func* func;
    {
        MGB_LOCK_GUARD(mtx);
        auto ins = cn2func.insert({fusion_opr->comp_node(), {}});
        func = &ins.first->second;
        if (ins.second) {
            // First use on this comp node: load the cubin, resolve the kernel
            // and pick a block size from the occupancy calculator.
            MGB_CUDA_CU_CHECK(
                    cuModuleLoadData(&func->module, kernel_data.data()));
            MGB_CUDA_CU_CHECK(
                    cuModuleGetFunction(&func->func, func->module,
                                        cuda_exe->m_kernel_name.c_str()));
            int min_grid_size = 0;
            MGB_CUDA_CU_CHECK(cuOccupancyMaxPotentialBlockSize(
                    &min_grid_size, &func->block_size, func->func, nullptr, 0,
                    0));
        }
    }

    mgb_assert(fusion_opr->args().outputs.size() == 1,
               "Currently only support 1 output, got %zu",
               fusion_opr->args().outputs.size());
    int out_dim = fusion_opr->args().outputs[0].layout.ndim;
    DType dtype = fusion_opr->args().outputs[0].layout.dtype;

#define cb_outdim(_ndim, _dtype)                                    \
    if (_ndim == out_dim) {                                         \
        setup_and_launch<_ndim, _dtype>(fusion_opr, func->func,     \
                                        func->block_size);          \
        return;                                                     \
    }

#define cb(_dtype)                                                  \
    cb_outdim(1, _dtype);                                           \
    cb_outdim(2, _dtype);                                           \
    cb_outdim(3, _dtype);                                           \
    cb_outdim(4, _dtype);                                           \
    mgb_throw(InternalError, "unsupported out_dim=%zu",             \
              static_cast<size_t>(out_dim));                        \
    return;

    switch (dtype.enumv()) {
        case DTypeEnum::Float32:
            cb(float);
        default:
            mgb_throw(InternalError, "unsupported dtype: %s", dtype.name());
    }
#undef cb
#undef cb_outdim
}

#endif  // MGB_CUDA
#endif  // MGB_JIT && MGB_JIT_MLIR

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

The MegEngine installation package bundles the CUDA environment needed to run code on the GPU, so there is no separate CPU or GPU build to choose from. To run GPU programs, make sure the machine actually has a GPU and that the driver is properly installed. If you would like to try deep-learning development on a cloud GPU platform, you are welcome to visit the MegStudio platform.
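As a quick sanity check after installation, the following minimal Python sketch prints the installed version and whether MegEngine can see a usable CUDA device; it assumes the megengine package is installed and that the is_cuda_available helper exists in your MegEngine version.

    import megengine as mge

    # Print the installed MegEngine version.
    print(mge.__version__)

    # True only if a CUDA-capable GPU and a working driver are visible to
    # MegEngine (is_cuda_available is assumed to be present in this version).
    print(mge.is_cuda_available())

If the second line prints False, verify the GPU hardware and driver installation before running GPU programs.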