You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

gpu_kernel_runtime.cc 24 kB

6 years ago
6 years ago
6 years ago
6 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607
  1. /**
  2. * Copyright 2019 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "device/gpu/gpu_kernel_runtime.h"
  17. #include "device/gpu/gpu_device_address.h"
  18. #include "device/gpu/cuda_driver.h"
  19. #include "device/gpu/gpu_buffer_mgr.h"
  20. #include "device/gpu/gpu_device_manager.h"
  21. #include "device/gpu/gpu_memory_allocator.h"
  22. #include "device/gpu/distribution/collective_init.h"
  23. #include "utils/convert_utils.h"
  24. #include "utils/context/ms_context.h"
  25. #include "device/kernel_runtime_manager.h"
  26. #include "device/gpu/gpu_common.h"
  27. #include "common/utils.h"
  28. #include "device/gpu/gpu_memory_manager.h"
  29. #include "kernel/common_utils.h"
  30. #include "device/gpu/gpu_memory_copy_manager.h"
  31. namespace mindspore {
  32. namespace device {
  33. namespace gpu {
  34. using mindspore::device::memswap::MemSwapManager;
  35. using mindspore::device::memswap::SwapKind;
// Blocks the host until all work queued on this runtime's CUDA stream has completed.
bool GPUKernelRuntime::SyncStream() { return GPUDeviceManager::GetInstance().SyncStream(stream_); }
  37. bool GPUKernelRuntime::Init() {
  38. if (device_init_ == true) {
  39. return true;
  40. }
  41. auto ret = InitDevice();
  42. if (!ret) {
  43. MS_LOG(ERROR) << "InitDevice error.";
  44. return ret;
  45. }
  46. mem_manager_ = std::make_shared<GPUMemoryManager>();
  47. MS_EXCEPTION_IF_NULL(mem_manager_);
  48. mem_manager_->MallocDeviceMemory();
  49. const void *collective_handle_ = CollectiveInitializer::instance().collective_handle();
  50. bool collective_inited = CollectiveInitializer::instance().collective_inited();
  51. if (collective_inited && collective_handle_ != nullptr) {
  52. auto init_nccl_comm_funcptr =
  53. reinterpret_cast<InitNCCLComm>(dlsym(const_cast<void *>(collective_handle_), "InitNCCLComm"));
  54. MS_EXCEPTION_IF_NULL(init_nccl_comm_funcptr);
  55. (*init_nccl_comm_funcptr)();
  56. }
  57. device_init_ = true;
  58. return ret;
  59. }
  60. DeviceAddressPtr GPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
  61. TypeId type_id) {
  62. return std::make_shared<GPUDeviceAddress>(device_ptr, device_size, format, type_id);
  63. }
  64. bool GPUKernelRuntime::InitDevice() {
  65. if (GPUDeviceManager::GetInstance().device_count() <= 0) {
  66. MS_LOG(ERROR) << "No GPU device found.";
  67. return false;
  68. }
  69. const void *collective_handle_ = CollectiveInitializer::instance().collective_handle();
  70. bool collective_inited = CollectiveInitializer::instance().collective_inited();
  71. if (collective_inited && collective_handle_ != nullptr) {
  72. auto get_local_rank_funcptr =
  73. reinterpret_cast<GetLocalRankId>(dlsym(const_cast<void *>(collective_handle_), "local_rank_id"));
  74. MS_EXCEPTION_IF_NULL(get_local_rank_funcptr);
  75. device_id_ = IntToUint((*get_local_rank_funcptr)());
  76. }
  77. if (!GPUDeviceManager::GetInstance().is_device_id_init()) {
  78. if (!GPUDeviceManager::GetInstance().set_cur_device_id(device_id_)) {
  79. MS_LOG(ERROR) << "Failed to set current device to " << SizeToInt(device_id_);
  80. return false;
  81. }
  82. }
  83. GPUDeviceManager::GetInstance().InitDevice();
  84. stream_ = GPUDeviceManager::GetInstance().default_stream();
  85. if (stream_ == nullptr) {
  86. MS_LOG(ERROR) << "No default CUDA stream found.";
  87. return false;
  88. }
  89. return true;
  90. }
// Releases every device-side resource this runtime owns: the dataset data
// queue, per-graph memory-swap state, the device itself, the memory pool and
// the compiled-kernel cache. Order matters: consumers (queue, swap queues)
// are shut down before the device and pool are released.
void GPUKernelRuntime::ReleaseDeviceRes() {
  // For dataset mode: notify and destroy the GPU data queue if it was in use.
  if (GpuBufferMgr::GetInstance().IsInit()) {
    if (!GpuBufferMgr::GetInstance().IsClosed()) {
      if (!GpuBufferMgr::GetInstance().CloseNotify()) {
        MS_LOG(EXCEPTION) << "Could not close gpu data queue.";
      }
    }
    CHECK_OP_RET_WITH_EXCEPT(GpuBufferMgr::GetInstance().Destroy(), "Could not destroy gpu data queue.");
  }
  // Destroy remaining memory swap events and free host memory.
  for (auto &item : mem_swap_map_) {
    auto &mem_swap_manager = item.second;
    MS_EXCEPTION_IF_NULL(mem_swap_manager);
    if (mem_swap_manager->trigger_swap()) {
      mem_swap_manager->ClearSwapQueue();
      mem_swap_manager->ReleaseHostPinnedMem();
    }
  }
  GPUDeviceManager::GetInstance().ReleaseDevice();
  // mem_manager_ may be null if Init() was never called; guard before freeing.
  if (mem_manager_ != nullptr) {
    mem_manager_->FreeDeviceMemory();
  }
  // Drop cached compiled kernels so a later re-init starts clean.
  kernel::KernelMeta *bin_map = kernel::KernelMeta::GetInstance();
  MS_EXCEPTION_IF_NULL(bin_map);
  bin_map->RemoveKernelCache();
}
  118. void GPUKernelRuntime::AssignMemory(session::KernelGraph *graph) {
  119. auto context_ptr = MsContext::GetInstance();
  120. MS_EXCEPTION_IF_NULL(context_ptr);
  121. MS_EXCEPTION_IF_NULL(mem_manager_);
  122. mem_manager_->ResetDynamicMemory();
  123. AssignStaticMemoryInput(graph);
  124. AssignStaticMemoryValueNode(graph);
  125. bool is_enable_dynamic_mem = context_ptr->enable_dynamic_mem_pool();
  126. if (is_enable_dynamic_mem) {
  127. // Use the dynamic memory pool.
  128. InitKernelRefCount(graph);
  129. InitKernelOutputAddress(graph);
  130. } else {
  131. AssignDynamicMemory(graph);
  132. }
  133. }
// Executes |graph| once and logs the wall-clock duration. With the dynamic
// memory pool enabled (and not in PyNative inference), kernels are launched
// one by one with on-demand allocation; when a launch fails (out of device
// memory), kernel outputs are cleared and the memory-swap plan is retreated
// before retrying. Returns false when the swap planner cannot make further
// progress; otherwise returns the launch result.
bool GPUKernelRuntime::Run(session::KernelGraph *graph) {
  bool ret = true;
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  bool is_enable_dynamic_mem = context_ptr->enable_dynamic_mem_pool();
  bool is_enable_pynative_infer = context_ptr->enable_pynative_infer();
  // Lazily create one memory-swap manager per graph and select it as current.
  auto iter = mem_swap_map_.find(graph);
  if (iter == mem_swap_map_.end()) {
    GPUMemCopyManagerPtr gpu_mem_copy_manager = std::make_shared<GPUMemCopyManager>();
    iter = mem_swap_map_.emplace(graph, std::make_shared<MemSwapManager>(gpu_mem_copy_manager)).first;
  }
  mem_swap_manager_ = iter->second;
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  struct timeval start_time, end_time;
  (void)gettimeofday(&start_time, nullptr);
  if (is_enable_dynamic_mem && !is_enable_pynative_infer) {
    // Retry loop: each failed dynamic launch frees kernel output memory and
    // asks the swap manager for a more aggressive plan, until it gives up.
    while (!LaunchKernelDynamic(graph)) {
      ClearKernelOutputAddress(graph);
      if (!mem_swap_manager_->mem_swap_init()) {
        mem_swap_manager_->Init(graph);
      }
      if (!mem_swap_manager_->RetreatSwapInfo()) {
        return false;
      }
    }
  } else {
    ret = LaunchKernel(graph);
  }
  (void)gettimeofday(&end_time, nullptr);
  // Report elapsed time in microseconds.
  const uint64_t kUSecondInSecond = 1000000;
  uint64_t cost = kUSecondInSecond * static_cast<uint64_t>(end_time.tv_sec - start_time.tv_sec);
  cost += static_cast<uint64_t>(end_time.tv_usec - start_time.tv_usec);
  MS_LOG(DEBUG) << "GPU kernel runtime run graph in " << cost << " us";
  return ret;
}
  169. void GPUKernelRuntime::InitKernelRefCount(const session::KernelGraph *graph) {
  170. MS_EXCEPTION_IF_NULL(graph);
  171. MemReuseUtilPtr mem_reuse_util_ptr = std::make_shared<memreuse::MemReuseUtil>();
  172. MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr);
  173. // Init the kernel reference count.
  174. if (!mem_reuse_util_ptr->InitDynamicKernelRef(graph)) {
  175. MS_LOG(EXCEPTION) << "Init kernel reference count failed";
  176. }
  177. mem_reuse_util_ptr->SetKernelDefMap();
  178. mem_reuse_util_ptr->SetReuseRefCount();
  179. // Can't free the device address of graph output, so set the reference count of graph output specially.
  180. mem_reuse_util_ptr->SetGraphOutputRefCount();
  181. auto graph_id = graph->graph_id();
  182. mem_reuse_util_map_[graph_id] = mem_reuse_util_ptr;
  183. }
  184. void GPUKernelRuntime::InitKernelOutputAddress(const session::KernelGraph *graph) {
  185. MS_EXCEPTION_IF_NULL(graph);
  186. auto &kernels = graph->execution_order();
  187. for (const auto &kernel : kernels) {
  188. auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  189. MS_EXCEPTION_IF_NULL(kernel_mod);
  190. auto output_sizes = kernel_mod->GetOutputSizeList();
  191. for (size_t i = 0; i < output_sizes.size(); ++i) {
  192. if (AnfAlgo::OutputAddrExist(kernel, i)) {
  193. continue;
  194. }
  195. std::string output_format = AnfAlgo::GetOutputFormat(kernel, i);
  196. auto output_type = AnfAlgo::GetOutputDeviceDataType(kernel, i);
  197. auto device_address = CreateDeviceAddress(nullptr, output_sizes[i], output_format, output_type);
  198. AnfAlgo::SetOutputAddr(device_address, i, kernel.get());
  199. }
  200. }
  201. }
  202. void GPUKernelRuntime::ClearKernelOutputAddress(const session::KernelGraph *graph) {
  203. MS_EXCEPTION_IF_NULL(graph);
  204. auto &kernels = graph->execution_order();
  205. for (const auto &kernel : kernels) {
  206. auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  207. MS_EXCEPTION_IF_NULL(kernel_mod);
  208. auto output_sizes = kernel_mod->GetOutputSizeList();
  209. for (size_t i = 0; i < output_sizes.size(); ++i) {
  210. if (!AnfAlgo::OutputAddrExist(kernel, i)) {
  211. continue;
  212. }
  213. auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i);
  214. if (device_address->ptr_) {
  215. mem_manager_->FreeMemFromMemPool(device_address);
  216. }
  217. device_address->set_status(DeviceAddressStatus::kInDevice);
  218. }
  219. }
  220. }
// Launches every kernel of |graph| in execution order, allocating inputs,
// outputs and workspaces on demand from the memory pool and freeing them by
// reference count as soon as they are dead. When memory swapping is active,
// device-to-host swap tasks are issued after each kernel that triggers them.
// Returns false when an allocation or swap step fails, so the caller (Run)
// can retreat the swap plan and retry the whole graph.
bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  auto graph_id = graph->graph_id();
  auto mem_reuse_util_ptr = mem_reuse_util_map_[graph_id];
  MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr);
  // Reset the reference count.
  mem_reuse_util_ptr->ResetDynamicUsedRefCount();
  // The inputs and outputs memory of communication kernel need be continuous, so separate processing.
  AllocCommunicationOpDynamicRes(graph);
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    AddressPtrList kernel_inputs;
    AddressPtrList kernel_workspaces;
    AddressPtrList kernel_outputs;
    auto ret = AllocKernelDynamicRes(*kernel_mod, kernel, &kernel_inputs, &kernel_workspaces, &kernel_outputs);
    if (!ret) {
      // Out of memory: bail out so Run() can replan swapping and retry.
      return false;
    }
    if (!kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_)) {
      MS_LOG(EXCEPTION) << "Launch kernel failed.";
    }
    // Release inputs/workspaces whose dynamic reference count dropped to zero.
    FreeKernelDynamicRes(kernel, kernel_workspaces, graph_id);
    if (mem_swap_manager_->trigger_swap() && mem_swap_manager_->QueryKernelTriggerSwap(kernel)) {
      // Swap tasks must not start until the kernel's own work is complete.
      CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed.");
      if (!AddMemSwapTask(kernel)) {
        return false;
      }
    }
    if (mem_swap_manager_->trigger_swap()) {
      mem_swap_manager_->SyncMemCopyStream(SwapKind::kDeviceToHost);
    }
  }
  // Wait for the whole graph, then drop any swap work still queued.
  CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed.");
  if (mem_swap_manager_->trigger_swap()) {
    mem_swap_manager_->ClearSwapQueue();
  }
  return true;
}
// Issues the swap tasks recorded for |kernel|: device-to-host copies that
// evict producer outputs, and host-to-device copies that restore previously
// swapped-out outputs (re-allocating device memory when needed). An address
// already being copied out is black-listed instead of swapped back in.
// Returns false only when re-allocating device memory for a swap-in fails.
bool GPUKernelRuntime::AddMemSwapTask(const AnfNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  auto &mem_swap_info_list = mem_swap_manager_->QueryKernelMemSwapInfo(kernel);
  for (auto &mem_swap_info : mem_swap_info_list) {
    auto &kernel_exec_info = mem_swap_manager_->SearchKernelExecutionInfo(mem_swap_info.kernel_);
    // Pinned host buffer paired with this producer output.
    const HostAddress &host_address = kernel_exec_info.host_addrs_[mem_swap_info.output_idx_];
    auto device_address = AnfAlgo::GetMutableOutputAddr(mem_swap_info.kernel_, mem_swap_info.output_idx_);
    if (mem_swap_info.swap_kind_ == SwapKind::kDeviceToHost) {
      mem_swap_manager_->AddMemSwapTask(SwapKind::kDeviceToHost, device_address, host_address);
    } else if (mem_swap_info.swap_kind_ == SwapKind::kHostToDevice) {
      auto status = device_address->status();
      if (status == DeviceAddressStatus::kInDeviceToHost) {
        // A swap-out of this address is in flight; cancel it by black-listing
        // the pointer and treating the data as still resident on device.
        mem_swap_manager_->InsertSwapInBlackList(device_address->ptr_);
        device_address->set_status(DeviceAddressStatus::kInDevice);
      } else if (status == DeviceAddressStatus::kInHost) {
        // Data lives on host: ensure device memory exists, then copy back in.
        if (!device_address->ptr_ && !AttemptMallocMem(device_address, device_address->size_)) {
          return false;
        }
        if (!mem_swap_manager_->FindInSwapInBlackList(device_address->ptr_)) {
          mem_swap_manager_->AddMemSwapTask(SwapKind::kHostToDevice, device_address, host_address);
        }
      }
    }
  }
  return true;
}
  288. bool GPUKernelRuntime::AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size) {
  289. MS_EXCEPTION_IF_NULL(mem_manager_);
  290. auto ret = mem_manager_->MallocMemFromMemPool(device_address, size);
  291. if (!ret) {
  292. if (!mem_swap_manager_->trigger_swap()) {
  293. return false;
  294. }
  295. mem_swap_manager_->SyncMemCopyStream(SwapKind::kDeviceToHost);
  296. while (auto device_address_swap_out = mem_swap_manager_->UpdateSwapQueue(SwapKind::kDeviceToHost)) {
  297. if (!mem_swap_manager_->FindInSwapInBlackList(device_address_swap_out->ptr_) && device_address_swap_out->ptr_) {
  298. device_address_swap_out->set_status(DeviceAddressStatus::kInHost);
  299. mem_manager_->FreeMemFromMemPool(device_address_swap_out);
  300. }
  301. }
  302. ret = mem_manager_->MallocMemFromMemPool(device_address, size);
  303. if (!ret) {
  304. return false;
  305. }
  306. }
  307. return true;
  308. }
  309. void *GPUKernelRuntime::AttemptMallocMem(size_t size) {
  310. MS_EXCEPTION_IF_NULL(mem_manager_);
  311. auto device_ptr = mem_manager_->MallocMemFromMemPool(size);
  312. if (!device_ptr) {
  313. if (!mem_swap_manager_->trigger_swap()) {
  314. return nullptr;
  315. }
  316. mem_swap_manager_->SyncMemCopyStream(SwapKind::kDeviceToHost);
  317. while (auto device_address_swap_out = mem_swap_manager_->UpdateSwapQueue(SwapKind::kDeviceToHost)) {
  318. if (!mem_swap_manager_->FindInSwapInBlackList(device_address_swap_out->ptr_) && device_address_swap_out->ptr_) {
  319. device_address_swap_out->set_status(DeviceAddressStatus::kInHost);
  320. mem_manager_->FreeMemFromMemPool(device_address_swap_out);
  321. }
  322. }
  323. device_ptr = mem_manager_->MallocMemFromMemPool(size);
  324. if (!device_ptr) {
  325. return nullptr;
  326. }
  327. }
  328. return device_ptr;
  329. }
  330. bool GPUKernelRuntime::AllocKernelDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
  331. const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs,
  332. AddressPtrList *kernel_workspaces, AddressPtrList *kernel_outputs) {
  333. if (!AllocKernelInputDynamicRes(kernel, kernel_inputs)) {
  334. return false;
  335. }
  336. if (!AllocKernelOutputDynamicRes(kernel_mod, kernel, kernel_outputs)) {
  337. return false;
  338. }
  339. if (!AllocKernelWorkspaceDynamicRes(kernel_mod, kernel, kernel_workspaces)) {
  340. return false;
  341. }
  342. return true;
  343. }
  344. bool GPUKernelRuntime::AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs) {
  345. MS_EXCEPTION_IF_NULL(kernel);
  346. MS_EXCEPTION_IF_NULL(kernel_inputs);
  347. MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  348. for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
  349. auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i);
  350. MS_EXCEPTION_IF_NULL(device_address);
  351. if (mem_swap_manager_->trigger_swap()) {
  352. while (auto device_address_swap_in = mem_swap_manager_->UpdateSwapQueue(SwapKind::kHostToDevice)) {
  353. device_address_swap_in->set_status(DeviceAddressStatus::kInDevice);
  354. }
  355. auto status = device_address->status();
  356. switch (status) {
  357. case DeviceAddressStatus::kInDevice:
  358. break;
  359. case DeviceAddressStatus::kInHost:
  360. break;
  361. case DeviceAddressStatus::kInDeviceToHost: {
  362. mem_swap_manager_->InsertSwapInBlackList(device_address->ptr_);
  363. device_address->set_status(DeviceAddressStatus::kInDevice);
  364. break;
  365. }
  366. case DeviceAddressStatus::kInHostToDevice: {
  367. while (device_address->status() != DeviceAddressStatus::kInDevice) {
  368. while (auto device_address_swap_in = mem_swap_manager_->UpdateSwapQueue(SwapKind::kHostToDevice)) {
  369. device_address_swap_in->set_status(DeviceAddressStatus::kInDevice);
  370. }
  371. }
  372. break;
  373. }
  374. default:
  375. MS_LOG(ERROR) << "Invaild device address status";
  376. return false;
  377. }
  378. }
  379. MS_EXCEPTION_IF_NULL(device_address->ptr_);
  380. kernel::AddressPtr input = std::make_shared<kernel::Address>();
  381. MS_EXCEPTION_IF_NULL(input);
  382. input->addr = device_address->ptr_;
  383. input->size = device_address->size_;
  384. kernel_inputs->emplace_back(input);
  385. }
  386. return true;
  387. }
// Allocates device memory for |kernel|'s outputs and fills |kernel_outputs|.
// Before allocating, completed device-to-host swap-outs are reclaimed so the
// pool has as much free memory as possible. Returns false when an output
// buffer cannot be allocated even after the swap fallback in AttemptMallocMem.
bool GPUKernelRuntime::AllocKernelOutputDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
                                                   const mindspore::AnfNodePtr &kernel,
                                                   AddressPtrList *kernel_outputs) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(kernel_outputs);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
  if (mem_swap_manager_->trigger_swap()) {
    // Free every address whose swap-out has finished (unless black-listed).
    while (auto device_address_swap_out = mem_swap_manager_->UpdateSwapQueue(SwapKind::kDeviceToHost)) {
      if (!mem_swap_manager_->FindInSwapInBlackList(device_address_swap_out->ptr_) && device_address_swap_out->ptr_) {
        device_address_swap_out->set_status(DeviceAddressStatus::kInHost);
        mem_manager_->FreeMemFromMemPool(device_address_swap_out);
      }
    }
  }
  auto output_sizes = kernel_mod.GetOutputSizeList();
  for (size_t i = 0; i < output_sizes.size(); ++i) {
    auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i);
    MS_EXCEPTION_IF_NULL(device_address);
    // Allocate only when the output has no backing memory yet.
    if (device_address->ptr_ == nullptr && !AttemptMallocMem(device_address, output_sizes[i])) {
      return false;
    }
    kernel::AddressPtr output = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(output);
    output->addr = device_address->ptr_;
    output->size = output_sizes[i];
    kernel_outputs->emplace_back(output);
  }
  return true;
}
  418. bool GPUKernelRuntime::AllocKernelWorkspaceDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
  419. const mindspore::AnfNodePtr &kernel,
  420. AddressPtrList *kernel_workspaces) {
  421. MS_EXCEPTION_IF_NULL(kernel);
  422. MS_EXCEPTION_IF_NULL(kernel_workspaces);
  423. auto workspace_sizes = kernel_mod.GetWorkspaceSizeList();
  424. for (size_t i = 0; i < workspace_sizes.size(); ++i) {
  425. if (workspace_sizes[i] == 0) {
  426. kernel_workspaces->emplace_back(nullptr);
  427. continue;
  428. }
  429. auto device_ptr = AttemptMallocMem(workspace_sizes[i]);
  430. if (!device_ptr) {
  431. return false;
  432. }
  433. kernel::AddressPtr workspace = std::make_shared<kernel::Address>();
  434. MS_EXCEPTION_IF_NULL(workspace);
  435. workspace->addr = device_ptr;
  436. workspace->size = workspace_sizes[i];
  437. kernel_workspaces->emplace_back(workspace);
  438. }
  439. return true;
  440. }
  441. void GPUKernelRuntime::AllocCommunicationOpDynamicRes(const session::KernelGraph *graph) {
  442. MS_EXCEPTION_IF_NULL(graph);
  443. auto &kernels = graph->execution_order();
  444. for (auto &kernel : kernels) {
  445. MS_EXCEPTION_IF_NULL(kernel);
  446. if (AnfAlgo::IsCommunicationOp(kernel)) {
  447. AllocCommunicationOpInputDynamicRes(kernel);
  448. AllocCommunicationOpOutputDynamicRes(kernel);
  449. }
  450. }
  451. }
  452. void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel) {
  453. MS_EXCEPTION_IF_NULL(kernel);
  454. bool is_need_alloc_memory = false;
  455. bool is_need_free_memory = false;
  456. size_t total_size = 0;
  457. std::vector<size_t> size_list;
  458. DeviceAddressPtrList addr_list;
  459. for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
  460. auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i);
  461. MS_EXCEPTION_IF_NULL(device_address);
  462. if (device_address->ptr_ == nullptr) {
  463. is_need_alloc_memory = true;
  464. } else {
  465. is_need_free_memory = true;
  466. }
  467. total_size += device_address->size_;
  468. size_list.emplace_back(device_address->size_);
  469. addr_list.emplace_back(device_address);
  470. }
  471. AllocCommunicationOpMemory(is_need_alloc_memory, is_need_free_memory, addr_list, total_size, size_list);
  472. }
  473. void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel) {
  474. MS_EXCEPTION_IF_NULL(kernel);
  475. bool is_need_alloc_memory = false;
  476. bool is_need_free_memory = false;
  477. size_t total_size = 0;
  478. std::vector<size_t> size_list;
  479. DeviceAddressPtrList addr_list;
  480. auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  481. MS_EXCEPTION_IF_NULL(kernel_mod);
  482. auto output_sizes = kernel_mod->GetOutputSizeList();
  483. for (size_t i = 0; i < output_sizes.size(); ++i) {
  484. auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i);
  485. MS_EXCEPTION_IF_NULL(device_address);
  486. if (device_address->ptr_ == nullptr) {
  487. is_need_alloc_memory = true;
  488. } else {
  489. is_need_free_memory = true;
  490. }
  491. total_size += output_sizes[i];
  492. size_list.emplace_back(output_sizes[i]);
  493. addr_list.emplace_back(device_address);
  494. }
  495. AllocCommunicationOpMemory(is_need_alloc_memory, is_need_free_memory, addr_list, total_size, size_list);
  496. }
// Allocates one contiguous device block spanning every address in |addr_list|
// (communication kernels require continuous inputs/outputs). Any pieces that
// were already allocated individually are freed first so they can be replaced
// by slices of the continuous block. No-op when nothing needs allocating.
// NOTE(review): addr_list and size_list are taken by value, copying both
// containers per call; a const reference would avoid that, but the signature
// must match the declaration in the header — confirm there before changing.
void GPUKernelRuntime::AllocCommunicationOpMemory(bool is_need_alloc_memory, bool is_need_free_memory,
                                                  const DeviceAddressPtrList addr_list, size_t total_size,
                                                  std::vector<size_t> size_list) {
  MS_EXCEPTION_IF_NULL(mem_manager_);
  if (!is_need_alloc_memory) {
    return;
  }
  if (is_need_free_memory) {
    for (const auto &iter : addr_list) {
      MS_EXCEPTION_IF_NULL(iter);
      // Free the inputs/outputs of communication kernel which are not released.
      if (iter->ptr_ != nullptr) {
        mem_manager_->FreeMemFromMemPool(iter);
      }
    }
  }
  auto ret = mem_manager_->MallocContinuousMemFromMemPool(addr_list, total_size, size_list);
  if (!ret) {
    MS_LOG(EXCEPTION) << "Malloc device memory failed.";
  }
}
// Releases the memory a just-launched kernel no longer needs: inputs whose
// dynamic reference count drops to zero, outputs that have no consumers, and
// all of the kernel's workspaces. AllReduce kernels are skipped entirely
// because their continuous input/output blocks must not be freed piecemeal.
void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel,
                                            const AddressPtrList &kernel_workspaces, uint32_t graph_id) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  auto mem_reuse_util_ptr = mem_reuse_util_map_[graph_id];
  MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr);
  auto cnode = kernel->cast<CNodePtr>();
  MS_EXCEPTION_IF_NULL(cnode);
  // AllReduce uses contiguous input/output blocks; leave its memory alone.
  if (AnfAlgo::GetCNodeName(kernel) == kAllReduceOpName) {
    return;
  }
  // Free the input of kernel by reference count.
  for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
    auto kernel_ref_count_ptr = mem_reuse_util_ptr->GetKernelInputRef(cnode, i);
    if (kernel_ref_count_ptr == nullptr) {
      // No reuse record for this input (e.g. it is not dynamically managed).
      continue;
    }
    kernel_ref_count_ptr->ref_count_dynamic_use_--;
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ < 0) {
      // Going negative means the bookkeeping double-freed a reference.
      MS_LOG(EXCEPTION) << "Check dynamic reference count failed.";
    }
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) {
      // Last consumer: return the producer's output buffer to the pool.
      auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i);
      mem_manager_->FreeMemFromMemPool(device_address);
      device_address->set_status(DeviceAddressStatus::kInDevice);
    }
  }
  // Free the output of kernel, if output has no reference.
  for (size_t i = 0; i < AnfAlgo::GetOutputTensorNum(kernel); ++i) {
    auto kernel_ref_count_ptr = mem_reuse_util_ptr->GetRef(cnode, i);
    if (kernel_ref_count_ptr == nullptr) {
      continue;
    }
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) {
      auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i);
      mem_manager_->FreeMemFromMemPool(device_address);
      device_address->set_status(DeviceAddressStatus::kInDevice);
    }
  }
  // Free the workspace of kernel.
  for (size_t i = 0; i < kernel_workspaces.size(); ++i) {
    auto workspace = kernel_workspaces[i];
    if (workspace != nullptr) {
      MS_EXCEPTION_IF_NULL(workspace->addr);
      mem_manager_->FreeMemFromMemPool(workspace->addr);
      // Null the pointer so a later pass cannot double-free it.
      workspace->addr = nullptr;
    }
  }
}
  567. } // namespace gpu
  568. } // namespace device
  569. } // namespace mindspore