You can not select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

compiler.cpp 8.8 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261
  1. /**
  2. * \file src/jit/impl/mlir/compiler.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #include "megbrain_build_config.h"
  13. #if MGB_JIT && MGB_JIT_MLIR
  14. #include "./compiler.h"
  15. #include "./executable_cpu.h"
  16. #include "./executable_cuda.h"
  17. #include "./mlir_gen.h"
  18. #include "megbrain/common.h"
  19. #include "megbrain/comp_node_env.h"
  20. #include "megbrain/jit/mlir/ir/dialect.h"
  21. #include "megbrain/jit/mlir/ir/passes.h"
  22. #include "megbrain/utils/timer.h"
  23. #include <mlir/Conversion/GPUCommon/GPUCommonPass.h>
  24. #include <mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h>
  25. #include <mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h>
  26. #include <mlir/Dialect/GPU/Passes.h>
  27. #include <mlir/IR/Dialect.h>
  28. #include <mlir/IR/MLIRContext.h>
  29. #include <mlir/IR/Module.h>
  30. #include <mlir/InitAllDialects.h>
  31. #include <mlir/Pass/PassManager.h>
  32. #include <mlir/Support/LogicalResult.h>
  33. #include <mlir/Target/NVVMIR.h>
  34. #include <mlir/Transforms/Passes.h>
  35. #include <llvm/Support/TargetSelect.h>
  36. #include <llvm/IRReader/IRReader.h>
  37. #include <llvm/Linker/Linker.h>
  38. #include <llvm/Pass.h>
  39. #include <dlfcn.h>
  40. #include <dirent.h>
  41. using namespace mgb;
  42. using namespace jit;
  43. namespace {
  44. struct LLVMInitializer {
  45. LLVMInitializer() {
  46. llvm::InitializeNativeTarget();
  47. llvm::InitializeNativeTargetAsmPrinter();
  48. }
  49. };
  50. static LLVMInitializer initializer;
  51. #if MGB_CUDA
  52. mlir::OwnedBlob compile_ptx_to_cubin(const std::string ptx, mlir::Location,
  53. llvm::StringRef) {
  54. OwnedBlob result = std::make_unique<std::vector<char>>(
  55. ptx.data(), ptx.data() + ptx.size());
  56. return result;
  57. }
/*!
 * \brief translate an MLIR module to LLVM NVVM IR and link in libdevice
 *
 * \param m the module operation to translate
 * \return the translated llvm::Module; libdevice bitcode is linked into it
 *         when the file can be located, otherwise a warning is logged and the
 *         unlinked module is returned
 */
std::unique_ptr<llvm::Module> translate_module_to_nvvm_ir_and_link_device(
        Operation* m) {
    std::unique_ptr<llvm::Module> module = mlir::translateModuleToNVVMIR(m);
    //! locate the libdevice bitcode shipped with the CUDA toolkit;
    //! returns an empty string when it cannot be found
    auto get_device_path = []() -> std::string {
        auto cuda_path = getenv("CUDA_BIN_PATH");
        std::string device_dir;
        if (!cuda_path) {
            // CUDA_BIN_PATH unset: infer the toolkit root from where
            // libcudart.so was loaded from (dlinfo with RTLD_DI_ORIGIN
            // yields the shared object's directory)
            char cuda_lib_path[PATH_MAX];
            auto handle = dlopen("libcudart.so", RTLD_GLOBAL | RTLD_LAZY);
            mgb_assert(handle != nullptr, "%s", dlerror());
            mgb_assert(dlinfo(handle, RTLD_DI_ORIGIN, &cuda_lib_path) != -1,
                       "%s", dlerror());
            device_dir =
                    std::string(cuda_lib_path) + "/../../../nvvm/libdevice/";
            mgb_assert(!dlclose(handle), "fail to dlclose handle");
        } else {
            device_dir = std::string(cuda_path) + "/nvvm/libdevice/";
        }
        // pick the first directory entry whose name starts with "libdevice"
        // (the file is typically versioned, e.g. libdevice.10.bc)
        DIR* dirp;
        struct dirent* directory;
        dirp = opendir(device_dir.c_str());
        if (dirp) {
            while ((directory = readdir(dirp)) != nullptr) {
                if (!strncmp(directory->d_name, "libdevice", 9)) {
                    closedir(dirp);
                    return device_dir + std::string(directory->d_name);
                }
            }
            closedir(dirp);
        }
        return {};
    };
    //! load libdevice.bc
    llvm::SMDiagnostic err;
    auto libdevice_path = get_device_path();
    std::unique_ptr<llvm::Module> mlib = llvm::parseIRFile(
            libdevice_path.c_str(), err, module->getContext());
    if (mlib.get()) {
        // libdevice must match the destination module's triple/layout
        // before linking
        mlib->setTargetTriple(module->getTargetTriple());
        mlib->setDataLayout(module->getDataLayout());
        RealTimer timer;
        // LinkOnlyNeeded: pull in only the libdevice symbols actually
        // referenced by the kernel; linkModules returns true on error
        mgb_assert(
                !llvm::Linker::linkModules(*module, std::move(mlib),
                                           llvm::Linker::Flags::LinkOnlyNeeded),
                "failed to parse ir file libdevice.bc");
        mgb_log("MLIR JIT: link libdevice.bc, used: %.3fms", timer.get_msecs());
    } else {
        mgb_log_warn("Fail to load bitcode file %s", libdevice_path.c_str());
    }
    return module;
}
  109. #endif
  110. void add_cpu_lowering_pass(mlir::PassManager& manager) {
  111. {
  112. mlir::OpPassManager& opt_pm = manager.nest<mlir::FuncOp>();
  113. opt_pm.addPass(mlir::createCanonicalizerPass());
  114. opt_pm.addPass(mlir::createCSEPass());
  115. }
  116. manager.addPass(create_lower_to_affine_pass());
  117. {
  118. mlir::OpPassManager& opt_pm = manager.nest<mlir::FuncOp>();
  119. opt_pm.addPass(mlir::createCanonicalizerPass());
  120. opt_pm.addPass(mlir::createCSEPass());
  121. opt_pm.addPass(mlir::createLoopFusionPass());
  122. opt_pm.addPass(mlir::createMemRefDataFlowOptPass());
  123. }
  124. manager.addPass(create_lower_to_llvm_pass());
  125. }
  126. #if MGB_CUDA
  127. void add_cuda_lowering_pass(mlir::PassManager& manager,
  128. const std::string& target_chip) {
  129. {
  130. mlir::OpPassManager& opt_pm = manager.nest<mlir::FuncOp>();
  131. opt_pm.addPass(mlir::createCanonicalizerPass());
  132. opt_pm.addPass(mlir::createCSEPass());
  133. }
  134. manager.addPass(create_lower_to_gpu_pass());
  135. {
  136. mlir::OpPassManager& opt_pm = manager.nest<mlir::FuncOp>();
  137. opt_pm.addPass(mlir::createCanonicalizerPass());
  138. opt_pm.addPass(mlir::createCSEPass());
  139. opt_pm.addPass(mlir::createLoopFusionPass());
  140. opt_pm.addPass(mlir::createMemRefDataFlowOptPass());
  141. }
  142. manager.addPass(create_gpu_kernel_outlining_pass());
  143. {
  144. auto& kernel_pm = manager.nest<gpu::GPUModuleOp>();
  145. kernel_pm.addPass(mlir::createLowerGpuOpsToNVVMOpsPass());
  146. kernel_pm.addPass(mlir::createConvertGPUKernelToBlobPass(
  147. translate_module_to_nvvm_ir_and_link_device,
  148. compile_ptx_to_cubin, "nvptx64-nvidia-cuda", target_chip,
  149. "+ptx60", MLIRCUDAExecutable::sm_blob_annotation));
  150. }
  151. }
  152. #endif
  153. } // namespace
/* ==================== MLIRCompiler ===================== */
//! Per-thread MLIR context — presumably because MLIRContext is not safe to
//! share across compiling threads (confirm against MLIR docs).
//! NOTE(review): sm_ctx is not referenced in this file's visible code;
//! do_compile() constructs its own local context.
thread_local mlir::MLIRContext MLIRCompiler::sm_ctx;
/*!
 * \brief construct a compiler bound to one device type
 *
 * Registers all builtin MLIR dialects plus MegBrain's own dialect; for a
 * CUDA compiler it additionally initializes LLVM's NVPTX backend
 * (target, target info, MC layer and asm printer) so PTX can be emitted.
 */
MLIRCompiler::MLIRCompiler(CompNode::DeviceType device_type)
        : m_device_type{device_type} {
    mlir::registerAllDialects();
    mlir::registerDialect<MgbDialect>();
#if MGB_CUDA
    if (m_device_type == CompNode::DeviceType::CUDA) {
        LLVMInitializeNVPTXTarget();
        LLVMInitializeNVPTXTargetInfo();
        LLVMInitializeNVPTXTargetMC();
        LLVMInitializeNVPTXAsmPrinter();
    }
#endif
}
  169. void MLIRCompiler::run_lowering_pass(mlir::OwningModuleRef& module,
  170. CompNode cn) {
  171. mgb_assert(cn.device_type() == m_device_type);
  172. mlir::PassManager manager(module->getContext());
  173. std::string target_chip;
  174. switch (m_device_type) {
  175. case CompNode::DeviceType::CPU:
  176. add_cpu_lowering_pass(manager);
  177. break;
  178. #if MGB_CUDA
  179. case CompNode::DeviceType::CUDA: {
  180. auto&& prop =
  181. CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
  182. std::string target_chip =
  183. ssprintf("sm_%d%d", prop.major, prop.minor);
  184. add_cuda_lowering_pass(manager, target_chip);
  185. break;
  186. }
  187. #endif
  188. default:
  189. mgb_throw(InternalError, "Unsupport device type: %d",
  190. static_cast<int>(m_device_type));
  191. break;
  192. }
  193. RealTimer timer;
  194. mgb_assert(mlir::succeeded(manager.run(*module)));
  195. mgb_log("MLIR JIT: run lowering pass used: %.3f ms", timer.get_msecs());
  196. }
  197. std::unique_ptr<Executable> MLIRCompiler::do_compile(
  198. const InternalGraph& graph, const JITExecutor::Args& args) {
  199. mlir::MLIRContext ctx;
  200. ctx.printStackTraceOnDiagnostic(true);
  201. ctx.printOpOnDiagnostic(true);
  202. auto&& res = mlir_gen(ctx, graph, args);
  203. mgb_assert(res.second, "failed to generate module");
  204. CompNode cn = args.owner->comp_node();
  205. run_lowering_pass(res.second, cn);
  206. switch (cn.device_type()) {
  207. case CompNode::DeviceType::CPU:
  208. return std::make_unique<MLIRCPUExecutable>(res.second,
  209. res.first.str());
  210. #if MGB_CUDA
  211. case CompNode::DeviceType::CUDA:
  212. return std::make_unique<MLIRCUDAExecutable>(res.second,
  213. res.first.str());
  214. #endif
  215. default:
  216. mgb_throw(InternalError, "Unsupport device type: %d",
  217. static_cast<int>(cn.device_type()));
  218. return nullptr;
  219. }
  220. }
  221. size_t MLIRCompiler::get_nr_workspace_outputs(JITExecutor* opr) const {
  222. MGB_MARK_USED_VAR(opr);
  223. return 0;
  224. }
  225. void MLIRCompiler::init_workspace_size_infer(JITExecutor* opr) {
  226. MGB_MARK_USED_VAR(opr);
  227. }
  228. #endif // MGB_JIT && MGB_JIT_MLIR
  229. // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台