You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

gpu_kernel_runtime.cc 18 kB

6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432
  1. /**
  2. * Copyright 2019 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "device/gpu/gpu_kernel_runtime.h"
  17. #include "device/gpu/gpu_device_address.h"
  18. #include "device/gpu/cuda_driver.h"
  19. #include "device/gpu/gpu_buffer_mgr.h"
  20. #include "device/gpu/gpu_device_manager.h"
  21. #include "device/gpu/gpu_memory_allocator.h"
  22. #include "device/gpu/distribution/collective_init.h"
  23. #include "utils/convert_utils.h"
  24. #include "utils/context/ms_context.h"
  25. #include "device/kernel_runtime_manager.h"
  26. #include "device/gpu/gpu_common.h"
  27. #include "common/utils.h"
  28. #include "device/gpu/gpu_memory_manager.h"
  29. namespace mindspore {
  30. namespace device {
  31. namespace gpu {
  32. bool GPUKernelRuntime::SyncStream() { return GPUDeviceManager::GetInstance().SyncStream(stream_); }
  33. bool GPUKernelRuntime::Init() {
  34. if (device_init_ == true) {
  35. return true;
  36. }
  37. auto ret = InitDevice();
  38. if (!ret) {
  39. MS_LOG(ERROR) << "InitDevice error.";
  40. return ret;
  41. }
  42. mem_manager_ = std::make_shared<GPUMemoryManager>();
  43. MS_EXCEPTION_IF_NULL(mem_manager_);
  44. mem_manager_->MallocDeviceMemory();
  45. const void *collective_handle_ = CollectiveInitializer::instance().collective_handle();
  46. bool collective_inited = CollectiveInitializer::instance().collective_inited();
  47. if (collective_inited && collective_handle_ != nullptr) {
  48. auto init_nccl_comm_funcptr =
  49. reinterpret_cast<InitNCCLComm>(dlsym(const_cast<void *>(collective_handle_), "InitNCCLComm"));
  50. MS_EXCEPTION_IF_NULL(init_nccl_comm_funcptr);
  51. (*init_nccl_comm_funcptr)();
  52. }
  53. device_init_ = true;
  54. return ret;
  55. }
  56. DeviceAddressPtr GPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
  57. TypeId type_id) {
  58. return std::make_shared<GPUDeviceAddress>(device_ptr, device_size, format, type_id);
  59. }
  60. bool GPUKernelRuntime::InitDevice() {
  61. if (GPUDeviceManager::GetInstance().device_count() <= 0) {
  62. MS_LOG(ERROR) << "No GPU device found.";
  63. return false;
  64. }
  65. const void *collective_handle_ = CollectiveInitializer::instance().collective_handle();
  66. bool collective_inited = CollectiveInitializer::instance().collective_inited();
  67. if (collective_inited && collective_handle_ != nullptr) {
  68. auto get_local_rank_funcptr =
  69. reinterpret_cast<GetLocalRankId>(dlsym(const_cast<void *>(collective_handle_), "local_rank_id"));
  70. MS_EXCEPTION_IF_NULL(get_local_rank_funcptr);
  71. device_id_ = IntToUint((*get_local_rank_funcptr)());
  72. }
  73. if (!GPUDeviceManager::GetInstance().is_device_id_init()) {
  74. if (!GPUDeviceManager::GetInstance().set_cur_device_id(device_id_)) {
  75. MS_LOG(ERROR) << "Failed to set current device to " << SizeToInt(device_id_);
  76. return false;
  77. }
  78. }
  79. GPUDeviceManager::GetInstance().InitDevice();
  80. stream_ = GPUDeviceManager::GetInstance().default_stream();
  81. if (stream_ == nullptr) {
  82. MS_LOG(ERROR) << "No default CUDA stream found.";
  83. return false;
  84. }
  85. return true;
  86. }
// Tears down device-side resources owned by this runtime, in order:
// 1) closes (if still open) and destroys the GPU data queue used in
//    dataset-sink mode,
// 2) releases the device through the device manager,
// 3) frees the device memory held by the memory manager.
void GPUKernelRuntime::ReleaseDeviceRes() {
  // For dataset mode.
  if (GpuBufferMgr::GetInstance().IsInit()) {
    if (!GpuBufferMgr::GetInstance().IsClosed()) {
      if (!GpuBufferMgr::GetInstance().CloseNotify()) {
        MS_LOG(EXCEPTION) << "Could not close gpu data queue.";
      }
    }
    CHECK_OP_RET_WITH_EXCEPT(GpuBufferMgr::GetInstance().Destroy(), "Could not destroy gpu data queue.");
  }
  GPUDeviceManager::GetInstance().ReleaseDevice();
  // mem_manager_ may be null if Init() was never called or failed early.
  if (mem_manager_ != nullptr) {
    mem_manager_->FreeDeviceMemory();
  }
}
  102. void GPUKernelRuntime::AssignMemory(session::KernelGraph *graph) {
  103. auto context_ptr = MsContext::GetInstance();
  104. MS_EXCEPTION_IF_NULL(context_ptr);
  105. MS_EXCEPTION_IF_NULL(mem_manager_);
  106. mem_manager_->ResetDynamicMemory();
  107. AssignStaticMemory(graph);
  108. bool is_enable_dynamic_mem = context_ptr->enable_dynamic_mem_pool();
  109. if (is_enable_dynamic_mem) {
  110. // Use the dynamic memory pool.
  111. InitKernelRefCount(graph);
  112. InitKernelOutputAddress(graph);
  113. } else {
  114. AssignDynamicMemory(graph);
  115. }
  116. }
  117. bool GPUKernelRuntime::Run(session::KernelGraph *graph) {
  118. bool ret;
  119. auto context_ptr = MsContext::GetInstance();
  120. MS_EXCEPTION_IF_NULL(context_ptr);
  121. bool is_enable_dynamic_mem = context_ptr->enable_dynamic_mem_pool();
  122. bool is_enable_pynative_infer = context_ptr->enable_pynative_infer();
  123. struct timeval start_time, end_time;
  124. (void)gettimeofday(&start_time, nullptr);
  125. if (is_enable_dynamic_mem && !is_enable_pynative_infer) {
  126. ret = LaunchKernelDynamic(graph);
  127. } else {
  128. ret = LaunchKernel(graph);
  129. }
  130. (void)gettimeofday(&end_time, nullptr);
  131. const uint64_t kUSecondInSecond = 1000000;
  132. uint64_t cost = kUSecondInSecond * static_cast<uint64_t>(end_time.tv_sec - start_time.tv_sec);
  133. cost += static_cast<uint64_t>(end_time.tv_usec - start_time.tv_usec);
  134. MS_LOG(DEBUG) << "kernel runtime run graph in " << cost << " us";
  135. return ret;
  136. }
  137. void GPUKernelRuntime::InitKernelRefCount(const session::KernelGraph *graph) {
  138. MS_EXCEPTION_IF_NULL(graph);
  139. MemReuseUtilPtr mem_reuse_util_ptr = std::make_shared<memreuse::MemReuseUtil>();
  140. MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr);
  141. // Init the kernel reference count.
  142. if (!mem_reuse_util_ptr->InitDynamicKernelRef(graph)) {
  143. MS_LOG(EXCEPTION) << "Init kernel reference count failed";
  144. }
  145. mem_reuse_util_ptr->SetKernelDefMap();
  146. mem_reuse_util_ptr->SetReuseRefCount();
  147. // Can't free the device address of graph output, so set the reference count of graph output specially.
  148. mem_reuse_util_ptr->SetGraphOutputRefCount();
  149. auto graph_id = graph->graph_id();
  150. mem_reuse_util_map_[graph_id] = mem_reuse_util_ptr;
  151. }
  152. void GPUKernelRuntime::InitKernelOutputAddress(const session::KernelGraph *graph) {
  153. MS_EXCEPTION_IF_NULL(graph);
  154. auto &kernels = graph->execution_order();
  155. for (const auto &kernel : kernels) {
  156. auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  157. MS_EXCEPTION_IF_NULL(kernel_mod);
  158. auto output_sizes = kernel_mod->GetOutputSizeList();
  159. for (size_t i = 0; i < output_sizes.size(); ++i) {
  160. if (AnfAlgo::OutputAddrExist(kernel, i)) {
  161. continue;
  162. }
  163. std::string output_format = AnfAlgo::GetOutputFormat(kernel, i);
  164. auto output_type = AnfAlgo::GetOutputDeviceDataType(kernel, i);
  165. auto device_address = CreateDeviceAddress(nullptr, output_sizes[i], output_format, output_type);
  166. AnfAlgo::SetOutputAddr(device_address, i, kernel.get());
  167. }
  168. }
  169. }
  170. bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph) {
  171. MS_EXCEPTION_IF_NULL(graph);
  172. auto graph_id = graph->graph_id();
  173. // The inputs and outputs memory of communication kernel are special, so separate processing.
  174. AllocCommunicationOpDynamicRes(graph);
  175. auto &kernels = graph->execution_order();
  176. for (const auto &kernel : kernels) {
  177. auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  178. MS_EXCEPTION_IF_NULL(kernel_mod);
  179. AddressPtrList kernel_inputs;
  180. AddressPtrList kernel_workspaces;
  181. AddressPtrList kernel_outputs;
  182. AllocKernelDynamicRes(*kernel_mod, kernel, &kernel_inputs, &kernel_workspaces, &kernel_outputs);
  183. if (!kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, reinterpret_cast<uintptr_t>(stream_))) {
  184. MS_LOG(ERROR) << "Launch kernel failed.";
  185. return false;
  186. }
  187. FreeKernelDynamicRes(kernel, kernel_workspaces, graph_id);
  188. }
  189. if (!SyncStream()) {
  190. MS_LOG(ERROR) << "SyncStream failed.";
  191. return false;
  192. }
  193. return true;
  194. }
// Gathers the device addresses needed to launch one kernel: inputs come from
// producer kernels' already-allocated outputs; outputs and workspaces are
// allocated from the memory pool on demand. The three output lists are
// appended to in order.
void GPUKernelRuntime::AllocKernelDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
                                             const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs,
                                             AddressPtrList *kernel_workspaces, AddressPtrList *kernel_outputs) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(kernel_inputs);
  MS_EXCEPTION_IF_NULL(kernel_workspaces);
  MS_EXCEPTION_IF_NULL(kernel_outputs);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  // Inputs: the producing kernel must already own device memory (ptr_ non-null).
  for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
    auto device_address = AnfAlgo::GetPrevNodeOutputAddr(kernel, i);
    MS_EXCEPTION_IF_NULL(device_address);
    MS_EXCEPTION_IF_NULL(device_address->ptr_);
    kernel::AddressPtr input = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(input);
    input->addr = device_address->ptr_;
    input->size = device_address->size_;
    kernel_inputs->push_back(input);
  }
  // Outputs: allocate from the pool only when no memory is attached yet
  // (communication-op outputs may have been allocated contiguously already).
  auto output_sizes = kernel_mod.GetOutputSizeList();
  for (size_t i = 0; i < output_sizes.size(); ++i) {
    auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i);
    MS_EXCEPTION_IF_NULL(device_address);
    auto device_ptr = device_address->ptr_;
    if (device_ptr == nullptr) {
      device_ptr = mem_manager_->MallocMemFromMemPool(output_sizes[i]);
      MS_EXCEPTION_IF_NULL(device_ptr);
      device_address->ptr_ = device_ptr;
    }
    kernel::AddressPtr output = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(output);
    output->addr = device_ptr;
    output->size = output_sizes[i];
    kernel_outputs->push_back(output);
  }
  // Workspaces: zero-sized entries get a null placeholder so the list stays
  // index-aligned with GetWorkspaceSizeList().
  auto workspace_sizes = kernel_mod.GetWorkspaceSizeList();
  for (size_t i = 0; i < workspace_sizes.size(); ++i) {
    if (workspace_sizes[i] == 0) {
      kernel_workspaces->emplace_back(nullptr);
      continue;
    }
    auto device_ptr = mem_manager_->MallocMemFromMemPool(workspace_sizes[i]);
    MS_EXCEPTION_IF_NULL(device_ptr);
    kernel::AddressPtr workspace = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(workspace);
    workspace->addr = device_ptr;
    workspace->size = workspace_sizes[i];
    kernel_workspaces->push_back(workspace);
  }
}
// Pre-allocates the contiguous input/output buffers for the AllReduce kernel
// before any kernel launches.
// NOTE(review): the early `return` means only the FIRST AllReduce found in
// execution order is handled — presumably one AllReduce per graph is assumed
// (the helpers' ref-count checks also only support one); confirm.
void GPUKernelRuntime::AllocCommunicationOpDynamicRes(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (auto &kernel : kernels) {
    MS_EXCEPTION_IF_NULL(kernel);
    auto kernel_name = AnfAlgo::GetCNodeName(kernel);
    if (kernel_name == kAllReduceOpName) {
      AllocCommunicationOpInputDynamicRes(kernel);
      AllocCommunicationOpOutputDynamicRes(kernel);
      return;
    }
  }
}
// Allocates ONE contiguous pool buffer covering all inputs of a communication
// kernel and points each input's device address at its slice — presumably so
// the collective op can treat the inputs as a single buffer (confirm).
// On success, communication_op_input_ref_count_ equals the input count; it is
// decremented as inputs are freed in FreeCommunicationOpDynamicRes.
void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  // The reference count of communication kernel input is not 0.
  if (communication_op_input_ref_count_ != 0) {
    MS_LOG(ERROR) << "The reference count of communication kernel input is not 0.";
    return;
  }
  size_t total = 0;
  std::vector<std::pair<mindspore::device::DeviceAddress *, size_t>> addr_size;
  // First pass: sum the slice sizes and remember each address to patch.
  for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
    auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i);
    MS_EXCEPTION_IF_NULL(device_address);
    // The inputs of communication kernel are not released.
    if ((i == 0) && (device_address->ptr_ != nullptr)) {
      MS_LOG(ERROR) << "The inputs of communication kernel are not released.";
      return;
    }
    auto output_size = device_address->size_;
    total += output_size;
    addr_size.emplace_back(device_address.get(), output_size);
  }
  // Single pool allocation for the whole input set.
  auto device_mem_ptr = mem_manager_->MallocMemFromMemPool(total);
  MS_EXCEPTION_IF_NULL(device_mem_ptr);
  // Second pass: carve the buffer into consecutive slices, bumping the
  // ref count once per input.
  for (const auto &iter : addr_size) {
    MS_EXCEPTION_IF_NULL(iter.first);
    iter.first->set_ptr(device_mem_ptr);
    communication_op_input_ref_count_++;
    device_mem_ptr = AddressOffset(device_mem_ptr, iter.second);
  }
}
// Mirror of AllocCommunicationOpInputDynamicRes for outputs: allocates ONE
// contiguous pool buffer covering all outputs of a communication kernel and
// points each output's device address at its slice. On success,
// communication_op_output_ref_count_ equals the output count.
void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  // The reference count of communication kernel output is not 0.
  if (communication_op_output_ref_count_ != 0) {
    MS_LOG(ERROR) << "The reference count of communication kernel output is not 0.";
    return;
  }
  size_t total = 0;
  std::vector<std::pair<mindspore::device::DeviceAddress *, size_t>> addr_size;
  auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  auto output_sizes = kernel_mod->GetOutputSizeList();
  // First pass: sum the slice sizes and remember each address to patch.
  for (size_t i = 0; i < output_sizes.size(); ++i) {
    auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i);
    MS_EXCEPTION_IF_NULL(device_address);
    // The outputs of communication kernel are not released.
    if ((i == 0) && (device_address->ptr_ != nullptr)) {
      MS_LOG(ERROR) << "The outputs of communication kernel are not released.";
      return;
    }
    total += output_sizes[i];
    addr_size.emplace_back(device_address.get(), output_sizes[i]);
  }
  // Single pool allocation, then carve into consecutive slices, bumping the
  // ref count once per output.
  auto device_mem_ptr = mem_manager_->MallocMemFromMemPool(total);
  MS_EXCEPTION_IF_NULL(device_mem_ptr);
  for (const auto &iter : addr_size) {
    MS_EXCEPTION_IF_NULL(iter.first);
    iter.first->set_ptr(device_mem_ptr);
    communication_op_output_ref_count_++;
    device_mem_ptr = AddressOffset(device_mem_ptr, iter.second);
  }
}
// Releases the dynamically pooled memory a kernel no longer needs after its
// launch: inputs whose reference count drops to zero, outputs that have no
// consumers, and all non-null workspaces. Graph outputs (marked with
// kMaxRefCount) are never freed.
void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel,
                                            const AddressPtrList &kernel_workspaces, uint32_t graph_id) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  auto mem_reuse_util_ptr = mem_reuse_util_map_[graph_id];
  MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr);
  auto cnode = kernel->cast<CNodePtr>();
  MS_EXCEPTION_IF_NULL(cnode);
  // Free the input of kernel by reference count.
  for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
    auto kernel_ref_count_ptr = mem_reuse_util_ptr->GetKernelInputRef(cnode, i);
    if (kernel_ref_count_ptr == nullptr) {
      continue;
    }
    // Can't free the output of graph.
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ == memreuse::kMaxRefCount) {
      continue;
    }
    kernel_ref_count_ptr->ref_count_dynamic_use_--;
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) {
      // Reset the reference count (so the next graph run starts fresh).
      kernel_ref_count_ptr->ref_count_dynamic_use_ = kernel_ref_count_ptr->ref_count_;
      bool is_communication_op = false;
      // Communication-op buffers are one contiguous piece and are released
      // as a whole by FreeCommunicationOpDynamicRes when their count hits 0.
      FreeCommunicationOpDynamicRes(kernel, i, &is_communication_op);
      if (!is_communication_op) {
        auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i);
        mem_manager_->FreeMemFromMemPool(device_address);
      }
    }
  }
  // Free the output of kernel, if output has no reference.
  for (size_t i = 0; i < AnfAlgo::GetOutputTensorNum(kernel); ++i) {
    auto kernel_ref_count_ptr = mem_reuse_util_ptr->GetRef(cnode, i);
    if (kernel_ref_count_ptr == nullptr) {
      continue;
    }
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) {
      auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i);
      mem_manager_->FreeMemFromMemPool(device_address);
    }
  }
  // Free the workspace of kernel.
  for (size_t i = 0; i < kernel_workspaces.size(); ++i) {
    auto workspace = kernel_workspaces[i];
    if (workspace != nullptr) {
      MS_EXCEPTION_IF_NULL(workspace->addr);
      mem_manager_->FreeMemFromMemPool(workspace->addr);
      // Null out the entry so a double free is impossible.
      workspace->addr = nullptr;
    }
  }
}
// Handles freeing for the contiguous communication-op buffers. Sets
// *is_communication_op to true when `kernel` is an AllReduce (its packed
// input buffer is freed once all inputs are released) or when the input at
// input_idx is produced by an AllReduce (its packed output buffer is freed
// once all consumers are done). Otherwise leaves the flag untouched so the
// caller frees the memory normally.
void GPUKernelRuntime::FreeCommunicationOpDynamicRes(const mindspore::AnfNodePtr &kernel, size_t input_idx,
                                                     bool *is_communication_op) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  // The inputs memory of communication kernel is one piece memory, need release together.
  if (AnfAlgo::GetCNodeName(kernel) == kAllReduceOpName) {
    communication_op_input_ref_count_--;
    if (communication_op_input_ref_count_ == 0) {
      // Input 0 holds the base pointer of the whole contiguous buffer.
      auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, 0);
      mem_manager_->FreeMemFromMemPool(device_address);
    }
    *is_communication_op = true;
    return;
  }
  auto cnode = kernel->cast<CNodePtr>();
  MS_EXCEPTION_IF_NULL(cnode);
  // cnode inputs are 1-based (input 0 is the primitive), hence input_idx + 1.
  if (input_idx + 1 >= cnode->inputs().size()) {
    MS_LOG(EXCEPTION) << "Input index " << input_idx << " is larger than input number " << cnode->inputs().size() - 1
                      << ".";
  }
  auto input_node = cnode->input(input_idx + 1);
  auto kernel_input = AnfAlgo::VisitKernel(input_node, 0);
  // The outputs memory of communication kernel is one piece memory, need release together.
  if (AnfAlgo::GetCNodeName(kernel_input.first) == kAllReduceOpName) {
    communication_op_output_ref_count_--;
    if (communication_op_output_ref_count_ == 0) {
      // Output 0 holds the base pointer of the whole contiguous buffer.
      auto device_address = AnfAlgo::GetMutableOutputAddr(kernel_input.first, 0);
      mem_manager_->FreeMemFromMemPool(device_address);
    }
    *is_communication_op = true;
  }
}
  404. } // namespace gpu
  405. } // namespace device
  406. } // namespace mindspore