You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

gpu_kernel_runtime.cc 16 kB

6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399
  1. /**
  2. * Copyright 2019 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "device/gpu/gpu_kernel_runtime.h"
  17. #include "device/gpu/gpu_device_address.h"
  18. #include "device/gpu/cuda_driver.h"
  19. #include "device/gpu/gpu_buffer_mgr.h"
  20. #include "device/gpu/gpu_device_manager.h"
  21. #include "device/gpu/gpu_memory_allocator.h"
  22. #include "device/gpu/distribution/collective_init.h"
  23. #include "utils/convert_utils.h"
  24. #include "utils/context/ms_context.h"
  25. #include "device/kernel_runtime_manager.h"
  26. #include "device/gpu/gpu_common.h"
  27. #include "common/utils.h"
  28. #include "device/gpu/gpu_memory_manager.h"
  29. #include "kernel/common_utils.h"
  30. namespace mindspore {
  31. namespace device {
  32. namespace gpu {
  33. bool GPUKernelRuntime::SyncStream() { return GPUDeviceManager::GetInstance().SyncStream(stream_); }
  34. bool GPUKernelRuntime::Init() {
  35. if (device_init_ == true) {
  36. return true;
  37. }
  38. auto ret = InitDevice();
  39. if (!ret) {
  40. MS_LOG(ERROR) << "InitDevice error.";
  41. return ret;
  42. }
  43. mem_manager_ = std::make_shared<GPUMemoryManager>();
  44. MS_EXCEPTION_IF_NULL(mem_manager_);
  45. mem_manager_->MallocDeviceMemory();
  46. const void *collective_handle_ = CollectiveInitializer::instance().collective_handle();
  47. bool collective_inited = CollectiveInitializer::instance().collective_inited();
  48. if (collective_inited && collective_handle_ != nullptr) {
  49. auto init_nccl_comm_funcptr =
  50. reinterpret_cast<InitNCCLComm>(dlsym(const_cast<void *>(collective_handle_), "InitNCCLComm"));
  51. MS_EXCEPTION_IF_NULL(init_nccl_comm_funcptr);
  52. (*init_nccl_comm_funcptr)();
  53. }
  54. device_init_ = true;
  55. return ret;
  56. }
  57. DeviceAddressPtr GPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
  58. TypeId type_id) {
  59. return std::make_shared<GPUDeviceAddress>(device_ptr, device_size, format, type_id);
  60. }
  61. bool GPUKernelRuntime::InitDevice() {
  62. if (GPUDeviceManager::GetInstance().device_count() <= 0) {
  63. MS_LOG(ERROR) << "No GPU device found.";
  64. return false;
  65. }
  66. const void *collective_handle_ = CollectiveInitializer::instance().collective_handle();
  67. bool collective_inited = CollectiveInitializer::instance().collective_inited();
  68. if (collective_inited && collective_handle_ != nullptr) {
  69. auto get_local_rank_funcptr =
  70. reinterpret_cast<GetLocalRankId>(dlsym(const_cast<void *>(collective_handle_), "local_rank_id"));
  71. MS_EXCEPTION_IF_NULL(get_local_rank_funcptr);
  72. device_id_ = IntToUint((*get_local_rank_funcptr)());
  73. }
  74. if (!GPUDeviceManager::GetInstance().is_device_id_init()) {
  75. if (!GPUDeviceManager::GetInstance().set_cur_device_id(device_id_)) {
  76. MS_LOG(ERROR) << "Failed to set current device to " << SizeToInt(device_id_);
  77. return false;
  78. }
  79. }
  80. GPUDeviceManager::GetInstance().InitDevice();
  81. stream_ = GPUDeviceManager::GetInstance().default_stream();
  82. if (stream_ == nullptr) {
  83. MS_LOG(ERROR) << "No default CUDA stream found.";
  84. return false;
  85. }
  86. return true;
  87. }
  88. void GPUKernelRuntime::ReleaseDeviceRes() {
  89. // For dataset mode.
  90. if (GpuBufferMgr::GetInstance().IsInit()) {
  91. if (!GpuBufferMgr::GetInstance().IsClosed()) {
  92. if (!GpuBufferMgr::GetInstance().CloseNotify()) {
  93. MS_LOG(EXCEPTION) << "Could not close gpu data queue.";
  94. }
  95. }
  96. CHECK_OP_RET_WITH_EXCEPT(GpuBufferMgr::GetInstance().Destroy(), "Could not destroy gpu data queue.");
  97. }
  98. GPUDeviceManager::GetInstance().ReleaseDevice();
  99. if (mem_manager_ != nullptr) {
  100. mem_manager_->FreeDeviceMemory();
  101. }
  102. kernel::KernelMeta::GetInstance()->RemoveKernelCache();
  103. }
  104. void GPUKernelRuntime::AssignMemory(session::KernelGraph *graph) {
  105. auto context_ptr = MsContext::GetInstance();
  106. MS_EXCEPTION_IF_NULL(context_ptr);
  107. MS_EXCEPTION_IF_NULL(mem_manager_);
  108. mem_manager_->ResetDynamicMemory();
  109. AssignStaticMemoryInput(graph);
  110. AssignStaticMemoryValueNode(graph);
  111. bool is_enable_dynamic_mem = context_ptr->enable_dynamic_mem_pool();
  112. if (is_enable_dynamic_mem) {
  113. // Use the dynamic memory pool.
  114. InitKernelRefCount(graph);
  115. InitKernelOutputAddress(graph);
  116. } else {
  117. AssignDynamicMemory(graph);
  118. }
  119. }
  120. bool GPUKernelRuntime::Run(session::KernelGraph *graph) {
  121. bool ret;
  122. auto context_ptr = MsContext::GetInstance();
  123. MS_EXCEPTION_IF_NULL(context_ptr);
  124. bool is_enable_dynamic_mem = context_ptr->enable_dynamic_mem_pool();
  125. bool is_enable_pynative_infer = context_ptr->enable_pynative_infer();
  126. struct timeval start_time, end_time;
  127. (void)gettimeofday(&start_time, nullptr);
  128. if (is_enable_dynamic_mem && !is_enable_pynative_infer) {
  129. ret = LaunchKernelDynamic(graph);
  130. } else {
  131. ret = LaunchKernel(graph);
  132. }
  133. (void)gettimeofday(&end_time, nullptr);
  134. const uint64_t kUSecondInSecond = 1000000;
  135. uint64_t cost = kUSecondInSecond * static_cast<uint64_t>(end_time.tv_sec - start_time.tv_sec);
  136. cost += static_cast<uint64_t>(end_time.tv_usec - start_time.tv_usec);
  137. MS_LOG(DEBUG) << "kernel runtime run graph in " << cost << " us";
  138. return ret;
  139. }
  140. void GPUKernelRuntime::InitKernelRefCount(const session::KernelGraph *graph) {
  141. MS_EXCEPTION_IF_NULL(graph);
  142. MemReuseUtilPtr mem_reuse_util_ptr = std::make_shared<memreuse::MemReuseUtil>();
  143. MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr);
  144. // Init the kernel reference count.
  145. if (!mem_reuse_util_ptr->InitDynamicKernelRef(graph)) {
  146. MS_LOG(EXCEPTION) << "Init kernel reference count failed";
  147. }
  148. mem_reuse_util_ptr->SetKernelDefMap();
  149. mem_reuse_util_ptr->SetReuseRefCount();
  150. // Can't free the device address of graph output, so set the reference count of graph output specially.
  151. mem_reuse_util_ptr->SetGraphOutputRefCount();
  152. auto graph_id = graph->graph_id();
  153. mem_reuse_util_map_[graph_id] = mem_reuse_util_ptr;
  154. }
  155. void GPUKernelRuntime::InitKernelOutputAddress(const session::KernelGraph *graph) {
  156. MS_EXCEPTION_IF_NULL(graph);
  157. auto &kernels = graph->execution_order();
  158. for (const auto &kernel : kernels) {
  159. auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  160. MS_EXCEPTION_IF_NULL(kernel_mod);
  161. auto output_sizes = kernel_mod->GetOutputSizeList();
  162. for (size_t i = 0; i < output_sizes.size(); ++i) {
  163. if (AnfAlgo::OutputAddrExist(kernel, i)) {
  164. continue;
  165. }
  166. std::string output_format = AnfAlgo::GetOutputFormat(kernel, i);
  167. auto output_type = AnfAlgo::GetOutputDeviceDataType(kernel, i);
  168. auto device_address = CreateDeviceAddress(nullptr, output_sizes[i], output_format, output_type);
  169. AnfAlgo::SetOutputAddr(device_address, i, kernel.get());
  170. }
  171. }
  172. }
  173. bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph) {
  174. MS_EXCEPTION_IF_NULL(graph);
  175. auto graph_id = graph->graph_id();
  176. auto mem_reuse_util_ptr = mem_reuse_util_map_[graph_id];
  177. MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr);
  178. // Reset the reference count.
  179. mem_reuse_util_ptr->ResetDynamicUsedRefCount();
  180. // The inputs and outputs memory of communication kernel need be continuous, so separate processing.
  181. AllocCommunicationOpDynamicRes(graph);
  182. auto &kernels = graph->execution_order();
  183. for (const auto &kernel : kernels) {
  184. auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  185. MS_EXCEPTION_IF_NULL(kernel_mod);
  186. AddressPtrList kernel_inputs;
  187. AddressPtrList kernel_workspaces;
  188. AddressPtrList kernel_outputs;
  189. AllocKernelDynamicRes(*kernel_mod, kernel, &kernel_inputs, &kernel_workspaces, &kernel_outputs);
  190. if (!kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_)) {
  191. MS_LOG(ERROR) << "Launch kernel failed.";
  192. return false;
  193. }
  194. FreeKernelDynamicRes(kernel, kernel_workspaces, graph_id);
  195. }
  196. if (!SyncStream()) {
  197. MS_LOG(ERROR) << "SyncStream failed.";
  198. return false;
  199. }
  200. return true;
  201. }
// Prepare one kernel's launch resources in dynamic-memory-pool mode:
// collect the (already allocated) input addresses, allocate device memory for
// any output that has none yet, and allocate the workspaces.
// All three output lists are appended to in kernel-mod order.
// Raises (MS_LOG(EXCEPTION)) when a pool allocation fails.
void GPUKernelRuntime::AllocKernelDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
                                             const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs,
                                             AddressPtrList *kernel_workspaces, AddressPtrList *kernel_outputs) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(kernel_inputs);
  MS_EXCEPTION_IF_NULL(kernel_workspaces);
  MS_EXCEPTION_IF_NULL(kernel_outputs);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  // Inputs: the producing kernels have already run, so each input address must
  // already hold device memory (ptr_ non-null is asserted).
  for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
    auto device_address = AnfAlgo::GetPrevNodeOutputAddr(kernel, i);
    MS_EXCEPTION_IF_NULL(device_address);
    MS_EXCEPTION_IF_NULL(device_address->ptr_);
    kernel::AddressPtr input = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(input);
    input->addr = device_address->ptr_;
    input->size = device_address->size_;
    kernel_inputs->emplace_back(input);
  }
  // Outputs: allocate from the pool only where no memory is attached yet
  // (communication ops may have been pre-allocated contiguously).
  auto output_sizes = kernel_mod.GetOutputSizeList();
  for (size_t i = 0; i < output_sizes.size(); ++i) {
    auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i);
    MS_EXCEPTION_IF_NULL(device_address);
    if (device_address->ptr_ == nullptr) {
      auto ret = mem_manager_->MallocMemFromMemPool(device_address, output_sizes[i]);
      if (!ret) {
        MS_LOG(EXCEPTION) << "Malloc device memory failed.";
      }
    }
    kernel::AddressPtr output = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(output);
    output->addr = device_address->ptr_;
    output->size = output_sizes[i];
    kernel_outputs->emplace_back(output);
  }
  // Workspaces: a zero-sized workspace is represented by a nullptr entry so
  // the list stays index-aligned with the kernel-mod's workspace list.
  auto workspace_sizes = kernel_mod.GetWorkspaceSizeList();
  for (size_t i = 0; i < workspace_sizes.size(); ++i) {
    if (workspace_sizes[i] == 0) {
      kernel_workspaces->emplace_back(nullptr);
      continue;
    }
    auto device_ptr = mem_manager_->MallocMemFromMemPool(workspace_sizes[i]);
    if (!device_ptr) {
      MS_LOG(EXCEPTION) << "Malloc device memory failed.";
    }
    kernel::AddressPtr workspace = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(workspace);
    workspace->addr = device_ptr;
    workspace->size = workspace_sizes[i];
    kernel_workspaces->emplace_back(workspace);
  }
}
  253. void GPUKernelRuntime::AllocCommunicationOpDynamicRes(const session::KernelGraph *graph) {
  254. MS_EXCEPTION_IF_NULL(graph);
  255. auto &kernels = graph->execution_order();
  256. for (auto &kernel : kernels) {
  257. MS_EXCEPTION_IF_NULL(kernel);
  258. if (AnfAlgo::IsCommunicationOp(kernel)) {
  259. AllocCommunicationOpInputDynamicRes(kernel);
  260. AllocCommunicationOpOutputDynamicRes(kernel);
  261. }
  262. }
  263. }
  264. void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel) {
  265. MS_EXCEPTION_IF_NULL(kernel);
  266. MS_EXCEPTION_IF_NULL(mem_manager_);
  267. bool is_need_alloc_memory = false;
  268. bool is_need_free_memory = false;
  269. size_t total_size = 0;
  270. std::vector<size_t> size_list;
  271. DeviceAddressPtrList addr_list;
  272. for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
  273. auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i);
  274. MS_EXCEPTION_IF_NULL(device_address);
  275. if (device_address->ptr_ == nullptr) {
  276. is_need_alloc_memory = true;
  277. } else {
  278. is_need_free_memory = true;
  279. }
  280. total_size += device_address->size_;
  281. size_list.emplace_back(device_address->size_);
  282. addr_list.emplace_back(device_address);
  283. }
  284. AllocCommunicationOpMemory(is_need_alloc_memory, is_need_free_memory, addr_list, total_size, size_list);
  285. }
  286. void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel) {
  287. MS_EXCEPTION_IF_NULL(kernel);
  288. MS_EXCEPTION_IF_NULL(mem_manager_);
  289. bool is_need_alloc_memory = false;
  290. bool is_need_free_memory = false;
  291. size_t total_size = 0;
  292. std::vector<size_t> size_list;
  293. DeviceAddressPtrList addr_list;
  294. auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  295. MS_EXCEPTION_IF_NULL(kernel_mod);
  296. auto output_sizes = kernel_mod->GetOutputSizeList();
  297. for (size_t i = 0; i < output_sizes.size(); ++i) {
  298. auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i);
  299. MS_EXCEPTION_IF_NULL(device_address);
  300. if (device_address->ptr_ == nullptr) {
  301. is_need_alloc_memory = true;
  302. } else {
  303. is_need_free_memory = true;
  304. }
  305. total_size += output_sizes[i];
  306. size_list.emplace_back(output_sizes[i]);
  307. addr_list.emplace_back(device_address);
  308. }
  309. AllocCommunicationOpMemory(is_need_alloc_memory, is_need_free_memory, addr_list, total_size, size_list);
  310. }
// Allocate one contiguous device buffer covering all addresses in addr_list
// and bind each slice to its address. No-op when nothing needs allocating.
// Raises (MS_LOG(EXCEPTION)) when the contiguous pool allocation fails.
// NOTE(review): addr_list and size_list are taken by value (full copies per
// call). Switching to const& would change the function type, so it must be
// done in the header declaration too — confirm before changing.
void GPUKernelRuntime::AllocCommunicationOpMemory(bool is_need_alloc_memory, bool is_need_free_memory,
                                                  const DeviceAddressPtrList addr_list, size_t total_size,
                                                  std::vector<size_t> size_list) {
  // All addresses already hold memory: keep them as-is.
  if (!is_need_alloc_memory) {
    return;
  }
  if (is_need_free_memory) {
    for (const auto &iter : addr_list) {
      MS_EXCEPTION_IF_NULL(iter);
      // Free the inputs/outputs of communication kernel which are not released.
      if (iter->ptr_ != nullptr) {
        mem_manager_->FreeMemFromMemPool(iter);
      }
    }
  }
  auto ret = mem_manager_->MallocContinuousMemFromMemPool(addr_list, total_size, size_list);
  if (!ret) {
    MS_LOG(EXCEPTION) << "Malloc device memory failed.";
  }
}
// Release a just-launched kernel's device memory back to the pool:
// inputs whose dynamic reference count drops to zero, outputs that have no
// remaining references, and all non-null workspaces.
// graph_id selects the memory-reuse bookkeeping built by InitKernelRefCount.
void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel,
                                            const AddressPtrList &kernel_workspaces, uint32_t graph_id) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  auto mem_reuse_util_ptr = mem_reuse_util_map_[graph_id];
  MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr);
  auto cnode = kernel->cast<CNodePtr>();
  MS_EXCEPTION_IF_NULL(cnode);
  // AllReduce inputs/outputs live in pre-allocated contiguous buffers and are
  // not freed here. NOTE(review): this early return also skips the workspace
  // loop below — confirm AllReduce kernels have no workspaces to release.
  if (AnfAlgo::GetCNodeName(kernel) == kAllReduceOpName) {
    return;
  }
  // Free the input of kernel by reference count.
  for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
    // A null ref-count entry means this input is not tracked for reuse
    // (e.g. graph inputs/weights) and must not be freed.
    auto kernel_ref_count_ptr = mem_reuse_util_ptr->GetKernelInputRef(cnode, i);
    if (kernel_ref_count_ptr == nullptr) {
      continue;
    }
    kernel_ref_count_ptr->ref_count_dynamic_use_--;
    // Going negative means the bookkeeping is corrupt — fail loudly.
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ < 0) {
      MS_LOG(EXCEPTION) << "Check dynamic reference count failed.";
    }
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) {
      auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i);
      mem_manager_->FreeMemFromMemPool(device_address);
    }
  }
  // Free the output of kernel, if output has no reference.
  for (size_t i = 0; i < AnfAlgo::GetOutputTensorNum(kernel); ++i) {
    auto kernel_ref_count_ptr = mem_reuse_util_ptr->GetRef(cnode, i);
    if (kernel_ref_count_ptr == nullptr) {
      continue;
    }
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) {
      auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i);
      mem_manager_->FreeMemFromMemPool(device_address);
    }
  }
  // Free the workspace of kernel. Entries may be nullptr for zero-sized
  // workspaces (see AllocKernelDynamicRes).
  for (size_t i = 0; i < kernel_workspaces.size(); ++i) {
    auto workspace = kernel_workspaces[i];
    if (workspace != nullptr) {
      MS_EXCEPTION_IF_NULL(workspace->addr);
      mem_manager_->FreeMemFromMemPool(workspace->addr);
      workspace->addr = nullptr;
    }
  }
}
  378. } // namespace gpu
  379. } // namespace device
  380. } // namespace mindspore