You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

gpu_kernel_runtime.cc 26 kB

6 years ago
6 years ago
6 years ago
6 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646
  1. /**
  2. * Copyright 2019 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "device/gpu/gpu_kernel_runtime.h"
  17. #include "device/gpu/gpu_device_address.h"
  18. #include "device/gpu/cuda_driver.h"
  19. #include "device/gpu/gpu_buffer_mgr.h"
  20. #include "device/gpu/gpu_device_manager.h"
  21. #include "device/gpu/gpu_memory_allocator.h"
  22. #include "device/gpu/distribution/collective_init.h"
  23. #include "utils/convert_utils.h"
  24. #include "utils/context/ms_context.h"
  25. #include "device/kernel_runtime_manager.h"
  26. #include "device/gpu/gpu_common.h"
  27. #include "common/utils.h"
  28. #include "device/gpu/gpu_memory_manager.h"
  29. #include "kernel/common_utils.h"
  30. #include "device/gpu/gpu_memory_copy_manager.h"
  31. namespace mindspore {
  32. namespace device {
  33. namespace gpu {
  34. using mindspore::device::memswap::MemSwapManager;
  35. using mindspore::device::memswap::SwapKind;
  36. bool GPUKernelRuntime::SyncStream() { return GPUDeviceManager::GetInstance().SyncStream(stream_); }
  37. bool GPUKernelRuntime::Init() {
  38. if (device_init_ == true) {
  39. GPUMemoryAllocator::GetInstance().CheckMaxDeviceMemory();
  40. return true;
  41. }
  42. auto ret = InitDevice();
  43. if (!ret) {
  44. MS_LOG(ERROR) << "InitDevice error.";
  45. return ret;
  46. }
  47. mem_manager_ = std::make_shared<GPUMemoryManager>();
  48. MS_EXCEPTION_IF_NULL(mem_manager_);
  49. mem_manager_->MallocDeviceMemory();
  50. const void *collective_handle_ = CollectiveInitializer::instance().collective_handle();
  51. bool collective_inited = CollectiveInitializer::instance().collective_inited();
  52. if (collective_inited && collective_handle_ != nullptr) {
  53. auto init_nccl_comm_funcptr =
  54. reinterpret_cast<InitNCCLComm>(dlsym(const_cast<void *>(collective_handle_), "InitNCCLComm"));
  55. MS_EXCEPTION_IF_NULL(init_nccl_comm_funcptr);
  56. (*init_nccl_comm_funcptr)();
  57. }
  58. device_init_ = true;
  59. return ret;
  60. }
  61. DeviceAddressPtr GPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
  62. TypeId type_id) {
  63. return std::make_shared<GPUDeviceAddress>(device_ptr, device_size, format, type_id);
  64. }
  65. bool GPUKernelRuntime::InitDevice() {
  66. if (GPUDeviceManager::GetInstance().device_count() <= 0) {
  67. MS_LOG(ERROR) << "No GPU device found.";
  68. return false;
  69. }
  70. const void *collective_handle_ = CollectiveInitializer::instance().collective_handle();
  71. bool collective_inited = CollectiveInitializer::instance().collective_inited();
  72. if (collective_inited && collective_handle_ != nullptr) {
  73. auto get_local_rank_funcptr =
  74. reinterpret_cast<GetLocalRankId>(dlsym(const_cast<void *>(collective_handle_), "local_rank_id"));
  75. MS_EXCEPTION_IF_NULL(get_local_rank_funcptr);
  76. device_id_ = IntToUint((*get_local_rank_funcptr)());
  77. }
  78. if (!GPUDeviceManager::GetInstance().is_device_id_init()) {
  79. if (!GPUDeviceManager::GetInstance().set_cur_device_id(device_id_)) {
  80. MS_LOG(ERROR) << "Failed to set current device to " << SizeToInt(device_id_);
  81. return false;
  82. }
  83. }
  84. GPUDeviceManager::GetInstance().InitDevice();
  85. stream_ = GPUDeviceManager::GetInstance().default_stream();
  86. if (stream_ == nullptr) {
  87. MS_LOG(ERROR) << "No default CUDA stream found.";
  88. return false;
  89. }
  90. return true;
  91. }
// Releases every device-side resource held by this runtime: the data queue
// used in dataset mode, host pinned memory owned by the memory-swap managers,
// the CUDA device/stream state, the device memory pool, and cached kernel
// binaries. Called during shutdown; order matters (queue first, device last).
void GPUKernelRuntime::ReleaseDeviceRes() {
  // For dataset mode.
  if (GpuBufferMgr::GetInstance().IsInit()) {
    if (!GpuBufferMgr::GetInstance().IsClosed()) {
      if (!GpuBufferMgr::GetInstance().CloseNotify()) {
        MS_LOG(EXCEPTION) << "Could not close gpu data queue.";
      }
    }
    CHECK_OP_RET_WITH_EXCEPT(GpuBufferMgr::GetInstance().Destroy(), "Could not destroy gpu data queue.");
  }
  // Destroy remaining memory swap events and free host memory.
  for (auto &item : mem_swap_map_) {
    auto &mem_swap_manager = item.second;
    MS_EXCEPTION_IF_NULL(mem_swap_manager);
    if (mem_swap_manager->trigger_swap()) {
      mem_swap_manager->ClearSwapQueue();
      mem_swap_manager->ReleaseHostPinnedMem();
    }
  }
  GPUDeviceManager::GetInstance().ReleaseDevice();
  if (mem_manager_ != nullptr) {
    mem_manager_->FreeDeviceMemory();
  }
  // Drop compiled kernel cache files so a later run starts from a clean state.
  kernel::KernelMeta *bin_map = kernel::KernelMeta::GetInstance();
  MS_EXCEPTION_IF_NULL(bin_map);
  bin_map->RemoveKernelCache();
}
  119. void GPUKernelRuntime::AssignMemory(session::KernelGraph *graph) {
  120. auto context_ptr = MsContext::GetInstance();
  121. MS_EXCEPTION_IF_NULL(context_ptr);
  122. MS_EXCEPTION_IF_NULL(mem_manager_);
  123. mem_manager_->ResetDynamicMemory();
  124. AssignStaticMemoryInput(graph);
  125. AssignStaticMemoryValueNode(graph);
  126. bool is_enable_dynamic_mem = context_ptr->enable_dynamic_mem_pool();
  127. if (is_enable_dynamic_mem) {
  128. // Use the dynamic memory pool.
  129. InitKernelRefCount(graph);
  130. InitMemorySwapInfo(graph);
  131. InitKernelOutputAddress(graph);
  132. } else {
  133. AssignDynamicMemory(graph);
  134. }
  135. }
  136. bool GPUKernelRuntime::Run(session::KernelGraph *graph) {
  137. struct timeval start_time, end_time;
  138. (void)gettimeofday(&start_time, nullptr);
  139. bool ret = true;
  140. auto context_ptr = MsContext::GetInstance();
  141. MS_EXCEPTION_IF_NULL(context_ptr);
  142. bool is_enable_dynamic_mem = context_ptr->enable_dynamic_mem_pool();
  143. bool is_enable_pynative_infer = context_ptr->enable_pynative_infer();
  144. if (is_enable_dynamic_mem && !is_enable_pynative_infer) {
  145. auto graph_id = graph->graph_id();
  146. auto iter = mem_swap_map_.find(graph_id);
  147. if (iter == mem_swap_map_.end()) {
  148. MS_LOG(EXCEPTION) << "Find memory swap map failed.";
  149. }
  150. mem_swap_manager_ = iter->second;
  151. MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  152. while (!LaunchKernelDynamic(graph)) {
  153. MS_LOG(WARNING) << "Run out of memory and try memory swapping, it may take some time, please wait a moment.";
  154. if (!UpdateMemorySwapInfo(graph)) {
  155. return false;
  156. }
  157. }
  158. } else {
  159. ret = LaunchKernel(graph);
  160. }
  161. (void)gettimeofday(&end_time, nullptr);
  162. const uint64_t kUSecondInSecond = 1000000;
  163. uint64_t cost = kUSecondInSecond * static_cast<uint64_t>(end_time.tv_sec - start_time.tv_sec);
  164. cost += static_cast<uint64_t>(end_time.tv_usec - start_time.tv_usec);
  165. MS_LOG(DEBUG) << "GPU kernel runtime run graph in " << cost << " us";
  166. return ret;
  167. }
  168. void GPUKernelRuntime::InitKernelRefCount(const session::KernelGraph *graph) {
  169. MS_EXCEPTION_IF_NULL(graph);
  170. MemReuseUtilPtr mem_reuse_util_ptr = std::make_shared<memreuse::MemReuseUtil>();
  171. MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr);
  172. // Init the kernel reference count.
  173. if (!mem_reuse_util_ptr->InitDynamicKernelRef(graph)) {
  174. MS_LOG(EXCEPTION) << "Init kernel reference count failed";
  175. }
  176. mem_reuse_util_ptr->SetKernelDefMap();
  177. mem_reuse_util_ptr->SetReuseRefCount();
  178. // Can't free the device address of graph output, so set the reference count of graph output specially.
  179. mem_reuse_util_ptr->SetGraphOutputRefCount();
  180. // Can't free the device address of summary nodes, so set the reference count of summary nodes specially.
  181. mem_reuse_util_ptr->SetSummaryNodesRefCount();
  182. auto graph_id = graph->graph_id();
  183. mem_reuse_util_map_[graph_id] = mem_reuse_util_ptr;
  184. }
  185. void GPUKernelRuntime::InitMemorySwapInfo(const session::KernelGraph *graph) {
  186. MS_EXCEPTION_IF_NULL(graph);
  187. GPUMemCopyManagerPtr gpu_mem_copy_manager = std::make_shared<GPUMemCopyManager>();
  188. MS_EXCEPTION_IF_NULL(gpu_mem_copy_manager);
  189. MemSwapManagerPtr mem_swap_manager = std::make_shared<MemSwapManager>(gpu_mem_copy_manager);
  190. MS_EXCEPTION_IF_NULL(mem_swap_manager);
  191. auto graph_id = graph->graph_id();
  192. mem_swap_map_[graph_id] = mem_swap_manager;
  193. }
// Creates placeholder device addresses (null pointer, known size/format/type)
// for every kernel output that does not have one yet. Actual device memory is
// attached later, right before each kernel launches.
void GPUKernelRuntime::InitKernelOutputAddress(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    auto output_sizes = kernel_mod->GetOutputSizeList();
    for (size_t i = 0; i < output_sizes.size(); ++i) {
      // Keep any address assigned earlier (e.g. statically allocated outputs).
      if (AnfAlgo::OutputAddrExist(kernel, i)) {
        continue;
      }
      std::string output_format = AnfAlgo::GetOutputFormat(kernel, i);
      auto output_type = AnfAlgo::GetOutputDeviceDataType(kernel, i);
      auto device_address = CreateDeviceAddress(nullptr, output_sizes[i], output_format, output_type);
      AnfAlgo::SetOutputAddr(device_address, i, kernel.get());
    }
  }
}
  212. void GPUKernelRuntime::ClearKernelOutputAddress(const session::KernelGraph *graph) {
  213. MS_EXCEPTION_IF_NULL(graph);
  214. auto &kernels = graph->execution_order();
  215. for (const auto &kernel : kernels) {
  216. auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  217. MS_EXCEPTION_IF_NULL(kernel_mod);
  218. auto output_sizes = kernel_mod->GetOutputSizeList();
  219. for (size_t i = 0; i < output_sizes.size(); ++i) {
  220. if (!AnfAlgo::OutputAddrExist(kernel, i)) {
  221. continue;
  222. }
  223. auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
  224. if (device_address->ptr_) {
  225. mem_manager_->FreeMemFromMemPool(device_address);
  226. }
  227. device_address->set_status(DeviceAddressStatus::kInDevice);
  228. }
  229. }
  230. }
// Launches every kernel of the graph using memory from the dynamic pool.
// Returns false (without throwing) when an allocation fails, so the caller
// can retreat the memory-swap plan and retry the whole graph.
bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto graph_id = graph->graph_id();
  auto iter = mem_reuse_util_map_.find(graph_id);
  if (iter == mem_reuse_util_map_.end()) {
    MS_LOG(EXCEPTION) << "Find memory reuse map failed.";
  }
  auto mem_reuse_util_ptr = iter->second;
  MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr);
  // Reset the reference count.
  mem_reuse_util_ptr->ResetDynamicUsedRefCount();
  // The inputs and outputs memory of communication kernel need be continuous, so separate processing.
  AllocCommunicationOpDynamicRes(graph);
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    AddressPtrList kernel_inputs;
    AddressPtrList kernel_workspaces;
    AddressPtrList kernel_outputs;
    auto ret = AllocKernelDynamicRes(*kernel_mod, kernel, &kernel_inputs, &kernel_workspaces, &kernel_outputs);
    if (!ret) {
      // Out of device memory: report failure so the swap plan can be updated.
      return false;
    }
    if (!kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_)) {
      MS_LOG(EXCEPTION) << "Launch kernel failed.";
    }
    // Release inputs/workspaces whose dynamic reference count reached zero,
    // then enqueue any swap-out/swap-in work this kernel triggers.
    FreeKernelDynamicRes(kernel, kernel_workspaces, graph_id);
    UpdateMemorySwapTask(kernel);
  }
  CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed.");
  ClearSwapQueue();
  return true;
}
// Enqueues the swap operations recorded for `kernel`: device-to-host tasks
// copy tensors out to pinned host buffers; host-to-device tasks restore
// previously swapped-out tensors. Returns false only when a swap-in needs
// device memory that cannot be allocated even after swapping.
bool GPUKernelRuntime::AddMemorySwapTask(const AnfNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  auto &mem_swap_info_list = mem_swap_manager_->QueryKernelMemSwapInfo(kernel);
  for (auto &mem_swap_info : mem_swap_info_list) {
    auto &kernel_exec_info = mem_swap_manager_->SearchKernelExecutionInfo(mem_swap_info.kernel_);
    const HostAddress &host_address = kernel_exec_info.host_addrs_[mem_swap_info.output_idx_];
    auto device_address = AnfAlgo::GetMutableOutputAddr(mem_swap_info.kernel_, mem_swap_info.output_idx_, false);
    if (mem_swap_info.swap_kind_ == SwapKind::kDeviceToHost) {
      mem_swap_manager_->AddMemSwapTask(SwapKind::kDeviceToHost, device_address, host_address);
    } else if (mem_swap_info.swap_kind_ == SwapKind::kHostToDevice) {
      auto status = device_address->status();
      if (status == DeviceAddressStatus::kInDeviceToHost) {
        // The tensor is mid-swap-out but is needed on device again: blacklist
        // its pointer so the pending swap-out never frees it, and keep it
        // marked as resident.
        mem_swap_manager_->InsertSwapInBlackList(device_address->ptr_);
        device_address->set_status(DeviceAddressStatus::kInDevice);
      } else if (status == DeviceAddressStatus::kInHost) {
        // Allocate fresh device memory for the swap-in if the address lost its
        // device pointer when it was swapped out.
        if (!device_address->ptr_ && !AttemptMallocMem(device_address, device_address->size_)) {
          return false;
        }
        if (!mem_swap_manager_->FindInSwapInBlackList(device_address->ptr_)) {
          mem_swap_manager_->AddMemSwapTask(SwapKind::kHostToDevice, device_address, host_address);
        }
      }
    }
  }
  return true;
}
  291. bool GPUKernelRuntime::UpdateMemorySwapInfo(const session::KernelGraph *graph) {
  292. MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  293. ClearKernelOutputAddress(graph);
  294. if (!mem_swap_manager_->mem_swap_init()) {
  295. mem_swap_manager_->Init(graph);
  296. }
  297. return mem_swap_manager_->RetreatSwapInfo();
  298. }
  299. bool GPUKernelRuntime::UpdateMemorySwapTask(const AnfNodePtr &kernel) {
  300. MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  301. if (!mem_swap_manager_->trigger_swap()) {
  302. return true;
  303. }
  304. if (mem_swap_manager_->QueryKernelTriggerSwap(kernel)) {
  305. CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed.");
  306. if (!AddMemorySwapTask(kernel)) {
  307. return false;
  308. }
  309. }
  310. CHECK_OP_RET_WITH_EXCEPT(mem_swap_manager_->SyncMemCopyStream(SwapKind::kDeviceToHost), "SyncCopyStream failed.");
  311. return true;
  312. }
  313. void GPUKernelRuntime::UpdateHostSwapQueue(const DeviceAddressPtr device_address) {
  314. MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  315. if (!mem_swap_manager_->trigger_swap()) {
  316. return;
  317. }
  318. while (auto device_address_swap_in = mem_swap_manager_->UpdateSwapQueue(SwapKind::kHostToDevice)) {
  319. device_address_swap_in->set_status(DeviceAddressStatus::kInDevice);
  320. }
  321. auto status = device_address->status();
  322. switch (status) {
  323. case DeviceAddressStatus::kInDevice:
  324. break;
  325. case DeviceAddressStatus::kInDeviceToHost: {
  326. mem_swap_manager_->InsertSwapInBlackList(device_address->ptr_);
  327. device_address->set_status(DeviceAddressStatus::kInDevice);
  328. break;
  329. }
  330. case DeviceAddressStatus::kInHostToDevice: {
  331. while (device_address->status() != DeviceAddressStatus::kInDevice) {
  332. while (auto device_address_swap_in = mem_swap_manager_->UpdateSwapQueue(SwapKind::kHostToDevice)) {
  333. device_address_swap_in->set_status(DeviceAddressStatus::kInDevice);
  334. }
  335. }
  336. break;
  337. }
  338. case DeviceAddressStatus::kInHost:
  339. MS_LOG(ERROR) << "Invaild device address status:" << status;
  340. break;
  341. default:
  342. MS_LOG(EXCEPTION) << "Invaild device address status:" << status;
  343. }
  344. }
  345. void GPUKernelRuntime::UpdateDeviceSwapQueue() {
  346. MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  347. if (!mem_swap_manager_->trigger_swap()) {
  348. return;
  349. }
  350. while (auto device_address_swap_out = mem_swap_manager_->UpdateSwapQueue(SwapKind::kDeviceToHost)) {
  351. if (!mem_swap_manager_->FindInSwapInBlackList(device_address_swap_out->ptr_) && device_address_swap_out->ptr_) {
  352. device_address_swap_out->set_status(DeviceAddressStatus::kInHost);
  353. mem_manager_->FreeMemFromMemPool(device_address_swap_out);
  354. }
  355. }
  356. }
  357. void GPUKernelRuntime::ClearSwapQueue() {
  358. MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  359. if (!mem_swap_manager_->trigger_swap()) {
  360. return;
  361. }
  362. mem_swap_manager_->ClearSwapQueue();
  363. }
// Tries to allocate `size` bytes from the memory pool into `device_address`.
// On failure, waits for the device-to-host copy stream, frees every address
// that finished swapping out, and retries once. Returns false if memory is
// still unavailable (or swapping is disabled).
bool GPUKernelRuntime::AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size) {
  MS_EXCEPTION_IF_NULL(mem_manager_);
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  auto ret = mem_manager_->MallocMemFromMemPool(device_address, size);
  if (!ret) {
    if (!mem_swap_manager_->trigger_swap()) {
      return false;
    }
    // Make sure all in-flight swap-outs have landed in host memory, then
    // reclaim the device memory of each completed, non-blacklisted swap-out.
    mem_swap_manager_->SyncMemCopyStream(SwapKind::kDeviceToHost);
    while (auto device_address_swap_out = mem_swap_manager_->UpdateSwapQueue(SwapKind::kDeviceToHost)) {
      if (!mem_swap_manager_->FindInSwapInBlackList(device_address_swap_out->ptr_) && device_address_swap_out->ptr_) {
        device_address_swap_out->set_status(DeviceAddressStatus::kInHost);
        mem_manager_->FreeMemFromMemPool(device_address_swap_out);
      }
    }
    // Single retry after reclaiming swapped-out memory.
    ret = mem_manager_->MallocMemFromMemPool(device_address, size);
    if (!ret) {
      return false;
    }
  }
  return true;
}
// Raw-pointer overload used for workspaces: tries to allocate `size` bytes
// from the pool; on failure reclaims completed swap-outs (as in the
// DeviceAddressPtr overload above) and retries once. Returns nullptr when
// memory is still unavailable or swapping is disabled.
void *GPUKernelRuntime::AttemptMallocMem(size_t size) {
  MS_EXCEPTION_IF_NULL(mem_manager_);
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  auto device_ptr = mem_manager_->MallocMemFromMemPool(size);
  if (!device_ptr) {
    if (!mem_swap_manager_->trigger_swap()) {
      return nullptr;
    }
    // Wait for in-flight swap-outs, then free each completed, non-blacklisted
    // swapped-out address to make room.
    mem_swap_manager_->SyncMemCopyStream(SwapKind::kDeviceToHost);
    while (auto device_address_swap_out = mem_swap_manager_->UpdateSwapQueue(SwapKind::kDeviceToHost)) {
      if (!mem_swap_manager_->FindInSwapInBlackList(device_address_swap_out->ptr_) && device_address_swap_out->ptr_) {
        device_address_swap_out->set_status(DeviceAddressStatus::kInHost);
        mem_manager_->FreeMemFromMemPool(device_address_swap_out);
      }
    }
    // Single retry after reclaiming swapped-out memory.
    device_ptr = mem_manager_->MallocMemFromMemPool(size);
    if (!device_ptr) {
      return nullptr;
    }
  }
  return device_ptr;
}
  408. bool GPUKernelRuntime::AllocKernelDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
  409. const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs,
  410. AddressPtrList *kernel_workspaces, AddressPtrList *kernel_outputs) {
  411. if (!AllocKernelInputDynamicRes(kernel, kernel_inputs)) {
  412. return false;
  413. }
  414. if (!AllocKernelOutputDynamicRes(kernel_mod, kernel, kernel_outputs)) {
  415. return false;
  416. }
  417. if (!AllocKernelWorkspaceDynamicRes(kernel_mod, kernel, kernel_workspaces)) {
  418. return false;
  419. }
  420. return true;
  421. }
// Collects the device addresses of the kernel's inputs (the outputs of its
// producer nodes), making sure each one is device-resident first. Input
// memory is never allocated here — producers own it.
bool GPUKernelRuntime::AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(kernel_inputs);
  for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
    // Graph may be all nop nodes and not remove nop node, so this can not skip nop node.
    auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false);
    MS_EXCEPTION_IF_NULL(device_address);
    // Wait for a pending swap-in (or cancel a pending swap-out) of this input.
    UpdateHostSwapQueue(device_address);
    MS_EXCEPTION_IF_NULL(device_address->ptr_);
    kernel::AddressPtr input = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(input);
    input->addr = device_address->ptr_;
    input->size = device_address->size_;
    kernel_inputs->emplace_back(input);
  }
  return true;
}
// Allocates device memory for each kernel output that lacks it and fills
// kernel_outputs with the resulting addresses. Returns false (not throwing)
// when the pool is exhausted even after swapping, so the launch can retry.
bool GPUKernelRuntime::AllocKernelOutputDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
                                                   const mindspore::AnfNodePtr &kernel,
                                                   AddressPtrList *kernel_outputs) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(kernel_outputs);
  // Reclaim memory of completed swap-outs before trying to allocate.
  UpdateDeviceSwapQueue();
  auto output_sizes = kernel_mod.GetOutputSizeList();
  for (size_t i = 0; i < output_sizes.size(); ++i) {
    auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
    MS_EXCEPTION_IF_NULL(device_address);
    if (device_address->ptr_ == nullptr && !AttemptMallocMem(device_address, output_sizes[i])) {
      return false;
    }
    kernel::AddressPtr output = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(output);
    output->addr = device_address->ptr_;
    output->size = output_sizes[i];
    kernel_outputs->emplace_back(output);
  }
  return true;
}
// Allocates scratch (workspace) memory for the kernel. Zero-sized workspaces
// get a nullptr placeholder so workspace indices keep lining up with the
// kernel's expectations. Returns false when allocation fails even after
// swapping.
bool GPUKernelRuntime::AllocKernelWorkspaceDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
                                                      const mindspore::AnfNodePtr &kernel,
                                                      AddressPtrList *kernel_workspaces) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(kernel_workspaces);
  auto workspace_sizes = kernel_mod.GetWorkspaceSizeList();
  for (size_t i = 0; i < workspace_sizes.size(); ++i) {
    if (workspace_sizes[i] == 0) {
      // Placeholder keeps positional correspondence with workspace_sizes.
      kernel_workspaces->emplace_back(nullptr);
      continue;
    }
    auto device_ptr = AttemptMallocMem(workspace_sizes[i]);
    if (!device_ptr) {
      return false;
    }
    kernel::AddressPtr workspace = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(workspace);
    workspace->addr = device_ptr;
    workspace->size = workspace_sizes[i];
    kernel_workspaces->emplace_back(workspace);
  }
  return true;
}
  483. void GPUKernelRuntime::AllocCommunicationOpDynamicRes(const session::KernelGraph *graph) {
  484. MS_EXCEPTION_IF_NULL(graph);
  485. auto &kernels = graph->execution_order();
  486. for (auto &kernel : kernels) {
  487. MS_EXCEPTION_IF_NULL(kernel);
  488. if (AnfAlgo::IsCommunicationOp(kernel)) {
  489. AllocCommunicationOpInputDynamicRes(kernel);
  490. AllocCommunicationOpOutputDynamicRes(kernel);
  491. }
  492. }
  493. }
// Gathers the device addresses and sizes of a communication kernel's inputs
// and requests one contiguous allocation covering all of them.
void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(kernel);
  // alloc: at least one input has no device memory yet.
  // free: at least one input already has memory that must be released before
  // the whole set can be re-allocated contiguously.
  bool is_need_alloc_memory = false;
  bool is_need_free_memory = false;
  size_t total_size = 0;
  std::vector<size_t> size_list;
  DeviceAddressPtrList addr_list;
  for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
    auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false);
    MS_EXCEPTION_IF_NULL(device_address);
    if (device_address->ptr_ == nullptr) {
      is_need_alloc_memory = true;
    } else {
      is_need_free_memory = true;
    }
    total_size += device_address->size_;
    size_list.emplace_back(device_address->size_);
    addr_list.emplace_back(device_address);
  }
  AllocCommunicationOpMemory(is_need_alloc_memory, is_need_free_memory, addr_list, total_size, size_list);
}
// Gathers the device addresses and sizes of a communication kernel's outputs
// and requests one contiguous allocation covering all of them.
void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(kernel);
  // See AllocCommunicationOpInputDynamicRes for the meaning of these flags.
  bool is_need_alloc_memory = false;
  bool is_need_free_memory = false;
  size_t total_size = 0;
  std::vector<size_t> size_list;
  DeviceAddressPtrList addr_list;
  auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  auto output_sizes = kernel_mod->GetOutputSizeList();
  for (size_t i = 0; i < output_sizes.size(); ++i) {
    auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
    MS_EXCEPTION_IF_NULL(device_address);
    if (device_address->ptr_ == nullptr) {
      is_need_alloc_memory = true;
    } else {
      is_need_free_memory = true;
    }
    total_size += output_sizes[i];
    size_list.emplace_back(output_sizes[i]);
    addr_list.emplace_back(device_address);
  }
  AllocCommunicationOpMemory(is_need_alloc_memory, is_need_free_memory, addr_list, total_size, size_list);
}
// Allocates one contiguous device buffer covering every address in addr_list
// (the inputs or outputs of a communication kernel). No-op when nothing needs
// allocation; if only some addresses already have memory, those are freed
// first so the whole set can be re-allocated contiguously.
// NOTE(review): addr_list and size_list are taken by value, copying both
// containers per call; switching to const& would require the matching change
// in the header declaration — confirm before altering.
void GPUKernelRuntime::AllocCommunicationOpMemory(bool is_need_alloc_memory, bool is_need_free_memory,
                                                  const DeviceAddressPtrList addr_list, size_t total_size,
                                                  std::vector<size_t> size_list) {
  MS_EXCEPTION_IF_NULL(mem_manager_);
  if (!is_need_alloc_memory) {
    return;
  }
  if (is_need_free_memory) {
    for (const auto &iter : addr_list) {
      MS_EXCEPTION_IF_NULL(iter);
      // Free the inputs/outputs of communication kernel which are not released.
      if (iter->ptr_ != nullptr) {
        mem_manager_->FreeMemFromMemPool(iter);
      }
    }
  }
  auto ret = mem_manager_->MallocContinuousMemFromMemPool(addr_list, total_size, size_list);
  if (!ret) {
    MS_LOG(EXCEPTION) << "Malloc device memory failed.";
  }
}
// After `kernel` has launched: decrements the dynamic reference count of each
// of its inputs and frees any whose count hits zero, frees outputs that have
// no consumers at all, and releases all workspace memory. Communication
// kernels are skipped entirely — their contiguous buffers are managed
// separately.
void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel,
                                            const AddressPtrList &kernel_workspaces, uint32_t graph_id) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  auto mem_reuse_util_ptr = mem_reuse_util_map_[graph_id];
  MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr);
  auto cnode = kernel->cast<CNodePtr>();
  MS_EXCEPTION_IF_NULL(cnode);
  if (AnfAlgo::IsCommunicationOp(kernel)) {
    return;
  }
  // Free the input of kernel by reference count.
  for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
    auto kernel_ref_count_ptr = mem_reuse_util_ptr->GetKernelInputRef(cnode, i);
    if (kernel_ref_count_ptr == nullptr) {
      // Inputs with no ref-count entry (e.g. graph outputs/summary nodes) are
      // never freed here.
      continue;
    }
    kernel_ref_count_ptr->ref_count_dynamic_use_--;
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ < 0) {
      // Going negative means the count was decremented more often than the
      // input has consumers — the reuse bookkeeping is corrupt.
      MS_LOG(EXCEPTION) << "Check dynamic reference count failed.";
    }
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) {
      // Last consumer just ran: return the producer's output memory to the pool.
      auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false);
      mem_manager_->FreeMemFromMemPool(device_address);
      device_address->set_status(DeviceAddressStatus::kInDevice);
    }
  }
  // Free the output of kernel, if output has no reference.
  for (size_t i = 0; i < AnfAlgo::GetOutputTensorNum(kernel); ++i) {
    auto kernel_ref_count_ptr = mem_reuse_util_ptr->GetRef(cnode, i);
    if (kernel_ref_count_ptr == nullptr) {
      continue;
    }
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) {
      auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
      mem_manager_->FreeMemFromMemPool(device_address);
      device_address->set_status(DeviceAddressStatus::kInDevice);
    }
  }
  // Free the workspace of kernel.
  for (size_t i = 0; i < kernel_workspaces.size(); ++i) {
    auto workspace = kernel_workspaces[i];
    if (workspace != nullptr) {
      MS_EXCEPTION_IF_NULL(workspace->addr);
      mem_manager_->FreeMemFromMemPool(workspace->addr);
      // Null the pointer so a stale workspace entry can't be double-freed.
      workspace->addr = nullptr;
    }
  }
}
  609. } // namespace gpu
  610. } // namespace device
  611. } // namespace mindspore