You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-'), and can be up to 35 characters long.

gpu_kernel_runtime.cc 16 kB

6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390
  1. /**
  2. * Copyright 2019 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
#include "device/gpu/gpu_kernel_runtime.h"
#include <chrono>
#include "device/gpu/gpu_device_address.h"
#include "device/gpu/cuda_driver.h"
#include "device/gpu/gpu_buffer_mgr.h"
#include "device/gpu/gpu_device_manager.h"
#include "device/gpu/gpu_memory_allocator.h"
#include "device/gpu/distribution/collective_init.h"
#include "utils/convert_utils.h"
#include "utils/context/ms_context.h"
#include "device/kernel_runtime_manager.h"
#include "device/gpu/gpu_common.h"
#include "common/utils.h"
#include "device/gpu/gpu_memory_manager.h"
  29. namespace mindspore {
  30. namespace device {
  31. namespace gpu {
  32. bool GPUKernelRuntime::SyncStream() { return GPUDeviceManager::GetInstance().SyncStream(stream_); }
  33. bool GPUKernelRuntime::Init() {
  34. if (device_init_ == true) {
  35. return true;
  36. }
  37. auto ret = InitDevice();
  38. if (!ret) {
  39. MS_LOG(ERROR) << "InitDevice error.";
  40. return ret;
  41. }
  42. mem_manager_ = std::make_shared<GPUMemoryManager>();
  43. MS_EXCEPTION_IF_NULL(mem_manager_);
  44. mem_manager_->MallocDeviceMemory();
  45. const void *collective_handle_ = CollectiveInitializer::instance().collective_handle();
  46. bool collective_inited = CollectiveInitializer::instance().collective_inited();
  47. if (collective_inited && collective_handle_ != nullptr) {
  48. auto init_nccl_comm_funcptr =
  49. reinterpret_cast<InitNCCLComm>(dlsym(const_cast<void *>(collective_handle_), "InitNCCLComm"));
  50. MS_EXCEPTION_IF_NULL(init_nccl_comm_funcptr);
  51. (*init_nccl_comm_funcptr)();
  52. }
  53. device_init_ = true;
  54. return ret;
  55. }
  56. DeviceAddressPtr GPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
  57. TypeId type_id) {
  58. return std::make_shared<GPUDeviceAddress>(device_ptr, device_size, format, type_id);
  59. }
  60. bool GPUKernelRuntime::InitDevice() {
  61. if (GPUDeviceManager::GetInstance().device_count() <= 0) {
  62. MS_LOG(ERROR) << "No GPU device found.";
  63. return false;
  64. }
  65. const void *collective_handle_ = CollectiveInitializer::instance().collective_handle();
  66. bool collective_inited = CollectiveInitializer::instance().collective_inited();
  67. if (collective_inited && collective_handle_ != nullptr) {
  68. auto get_local_rank_funcptr =
  69. reinterpret_cast<GetLocalRankId>(dlsym(const_cast<void *>(collective_handle_), "local_rank_id"));
  70. MS_EXCEPTION_IF_NULL(get_local_rank_funcptr);
  71. device_id_ = IntToUint((*get_local_rank_funcptr)());
  72. }
  73. if (!GPUDeviceManager::GetInstance().is_device_id_init()) {
  74. if (!GPUDeviceManager::GetInstance().set_cur_device_id(device_id_)) {
  75. MS_LOG(ERROR) << "Failed to set current device to " << SizeToInt(device_id_);
  76. return false;
  77. }
  78. }
  79. GPUDeviceManager::GetInstance().InitDevice();
  80. stream_ = GPUDeviceManager::GetInstance().default_stream();
  81. if (stream_ == nullptr) {
  82. MS_LOG(ERROR) << "No default CUDA stream found.";
  83. return false;
  84. }
  85. return true;
  86. }
// Tears down GPU-side resources in dependency order: the dataset data queue
// first (notify, then destroy), then the device itself, finally the pooled
// device memory.
void GPUKernelRuntime::ReleaseDeviceRes() {
  // For dataset mode.
  if (GpuBufferMgr::GetInstance().IsInit()) {
    // Only notify consumers if the queue has not already been closed.
    if (!GpuBufferMgr::GetInstance().IsClosed()) {
      if (!GpuBufferMgr::GetInstance().CloseNotify()) {
        MS_LOG(EXCEPTION) << "Could not close gpu data queue.";
      }
    }
    CHECK_OP_RET_WITH_EXCEPT(GpuBufferMgr::GetInstance().Destroy(), "Could not destroy gpu data queue.");
  }
  GPUDeviceManager::GetInstance().ReleaseDevice();
  // mem_manager_ is only set after a successful Init(); guard for the case
  // where release happens without init.
  if (mem_manager_ != nullptr) {
    mem_manager_->FreeDeviceMemory();
  }
}
  102. void GPUKernelRuntime::AssignMemory(session::KernelGraph *graph) {
  103. auto context_ptr = MsContext::GetInstance();
  104. MS_EXCEPTION_IF_NULL(context_ptr);
  105. MS_EXCEPTION_IF_NULL(mem_manager_);
  106. mem_manager_->ResetDynamicMemory();
  107. AssignStaticMemoryInput(graph);
  108. AssignStaticMemoryValueNode(graph);
  109. bool is_enable_dynamic_mem = context_ptr->enable_dynamic_mem_pool();
  110. if (is_enable_dynamic_mem) {
  111. // Use the dynamic memory pool.
  112. InitKernelRefCount(graph);
  113. InitKernelOutputAddress(graph);
  114. } else {
  115. AssignDynamicMemory(graph);
  116. }
  117. }
  118. bool GPUKernelRuntime::Run(session::KernelGraph *graph) {
  119. bool ret;
  120. auto context_ptr = MsContext::GetInstance();
  121. MS_EXCEPTION_IF_NULL(context_ptr);
  122. bool is_enable_dynamic_mem = context_ptr->enable_dynamic_mem_pool();
  123. bool is_enable_pynative_infer = context_ptr->enable_pynative_infer();
  124. struct timeval start_time, end_time;
  125. (void)gettimeofday(&start_time, nullptr);
  126. if (is_enable_dynamic_mem && !is_enable_pynative_infer) {
  127. ret = LaunchKernelDynamic(graph);
  128. } else {
  129. ret = LaunchKernel(graph);
  130. }
  131. (void)gettimeofday(&end_time, nullptr);
  132. const uint64_t kUSecondInSecond = 1000000;
  133. uint64_t cost = kUSecondInSecond * static_cast<uint64_t>(end_time.tv_sec - start_time.tv_sec);
  134. cost += static_cast<uint64_t>(end_time.tv_usec - start_time.tv_usec);
  135. MS_LOG(DEBUG) << "kernel runtime run graph in " << cost << " us";
  136. return ret;
  137. }
  138. void GPUKernelRuntime::InitKernelRefCount(const session::KernelGraph *graph) {
  139. MS_EXCEPTION_IF_NULL(graph);
  140. MemReuseUtilPtr mem_reuse_util_ptr = std::make_shared<memreuse::MemReuseUtil>();
  141. MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr);
  142. // Init the kernel reference count.
  143. if (!mem_reuse_util_ptr->InitDynamicKernelRef(graph)) {
  144. MS_LOG(EXCEPTION) << "Init kernel reference count failed";
  145. }
  146. mem_reuse_util_ptr->SetKernelDefMap();
  147. mem_reuse_util_ptr->SetReuseRefCount();
  148. // Can't free the device address of graph output, so set the reference count of graph output specially.
  149. mem_reuse_util_ptr->SetGraphOutputRefCount();
  150. auto graph_id = graph->graph_id();
  151. mem_reuse_util_map_[graph_id] = mem_reuse_util_ptr;
  152. }
  153. void GPUKernelRuntime::InitKernelOutputAddress(const session::KernelGraph *graph) {
  154. MS_EXCEPTION_IF_NULL(graph);
  155. auto &kernels = graph->execution_order();
  156. for (const auto &kernel : kernels) {
  157. auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  158. MS_EXCEPTION_IF_NULL(kernel_mod);
  159. auto output_sizes = kernel_mod->GetOutputSizeList();
  160. for (size_t i = 0; i < output_sizes.size(); ++i) {
  161. if (AnfAlgo::OutputAddrExist(kernel, i)) {
  162. continue;
  163. }
  164. std::string output_format = AnfAlgo::GetOutputFormat(kernel, i);
  165. auto output_type = AnfAlgo::GetOutputDeviceDataType(kernel, i);
  166. auto device_address = CreateDeviceAddress(nullptr, output_sizes[i], output_format, output_type);
  167. AnfAlgo::SetOutputAddr(device_address, i, kernel.get());
  168. }
  169. }
  170. }
// Launches every kernel in the graph's execution order with the dynamic
// memory pool: resources are allocated immediately before each launch and
// released (per reference count) right after it; the stream is synchronized
// once at the end. Returns false on the first launch or sync failure.
bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto graph_id = graph->graph_id();
  // The inputs and outputs memory of communication kernel need be continuous, so separate processing.
  AllocCommunicationOpDynamicRes(graph);
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    AddressPtrList kernel_inputs;
    AddressPtrList kernel_workspaces;
    AddressPtrList kernel_outputs;
    AllocKernelDynamicRes(*kernel_mod, kernel, &kernel_inputs, &kernel_workspaces, &kernel_outputs);
    if (!kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, reinterpret_cast<uintptr_t>(stream_))) {
      MS_LOG(ERROR) << "Launch kernel failed.";
      return false;
    }
    // Return no-longer-needed inputs/outputs and this kernel's workspaces to
    // the pool so later kernels can reuse the memory.
    FreeKernelDynamicRes(kernel, kernel_workspaces, graph_id);
  }
  if (!SyncStream()) {
    MS_LOG(ERROR) << "SyncStream failed.";
    return false;
  }
  return true;
}
// Fills the input/workspace/output address lists needed to launch one kernel.
// Inputs must already hold device memory (produced by upstream kernels or
// static allocation); outputs are allocated from the pool on first use;
// workspaces are freshly allocated each call, with a nullptr placeholder for
// zero-sized entries so list positions stay aligned with the size list.
void GPUKernelRuntime::AllocKernelDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
                                             const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs,
                                             AddressPtrList *kernel_workspaces, AddressPtrList *kernel_outputs) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(kernel_inputs);
  MS_EXCEPTION_IF_NULL(kernel_workspaces);
  MS_EXCEPTION_IF_NULL(kernel_outputs);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  // Inputs: reuse the producing kernel's output addresses; a null ptr_ here
  // would indicate a scheduling/allocation bug, hence the hard check.
  for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
    auto device_address = AnfAlgo::GetPrevNodeOutputAddr(kernel, i);
    MS_EXCEPTION_IF_NULL(device_address);
    MS_EXCEPTION_IF_NULL(device_address->ptr_);
    kernel::AddressPtr input = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(input);
    input->addr = device_address->ptr_;
    input->size = device_address->size_;
    kernel_inputs->push_back(input);
  }
  // Outputs: allocate from the pool only if the placeholder address created in
  // InitKernelOutputAddress has not been populated yet.
  auto output_sizes = kernel_mod.GetOutputSizeList();
  for (size_t i = 0; i < output_sizes.size(); ++i) {
    auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i);
    MS_EXCEPTION_IF_NULL(device_address);
    if (device_address->ptr_ == nullptr) {
      mem_manager_->MallocMemFromMemPool(device_address, output_sizes[i]);
    }
    kernel::AddressPtr output = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(output);
    output->addr = device_address->ptr_;
    output->size = output_sizes[i];
    kernel_outputs->push_back(output);
  }
  // Workspaces: always freshly allocated; freed again in FreeKernelDynamicRes
  // right after the launch.
  auto workspace_sizes = kernel_mod.GetWorkspaceSizeList();
  for (size_t i = 0; i < workspace_sizes.size(); ++i) {
    if (workspace_sizes[i] == 0) {
      // Keep a placeholder so index i still matches workspace_sizes[i].
      kernel_workspaces->emplace_back(nullptr);
      continue;
    }
    auto device_ptr = mem_manager_->MallocMemFromMemPool(workspace_sizes[i]);
    MS_EXCEPTION_IF_NULL(device_ptr);
    kernel::AddressPtr workspace = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(workspace);
    workspace->addr = device_ptr;
    workspace->size = workspace_sizes[i];
    kernel_workspaces->push_back(workspace);
  }
}
  242. void GPUKernelRuntime::AllocCommunicationOpDynamicRes(const session::KernelGraph *graph) {
  243. MS_EXCEPTION_IF_NULL(graph);
  244. auto &kernels = graph->execution_order();
  245. for (auto &kernel : kernels) {
  246. MS_EXCEPTION_IF_NULL(kernel);
  247. if (AnfAlgo::IsCommunicationOp(kernel)) {
  248. AllocCommunicationOpInputDynamicRes(kernel);
  249. AllocCommunicationOpOutputDynamicRes(kernel);
  250. }
  251. }
  252. }
  253. void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel) {
  254. MS_EXCEPTION_IF_NULL(kernel);
  255. MS_EXCEPTION_IF_NULL(mem_manager_);
  256. bool is_need_alloc_memory = false;
  257. bool is_need_free_memory = false;
  258. size_t total_size = 0;
  259. std::vector<size_t> size_list;
  260. DeviceAddressPtrList addr_list;
  261. for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
  262. auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i);
  263. MS_EXCEPTION_IF_NULL(device_address);
  264. if (device_address->ptr_ == nullptr) {
  265. is_need_alloc_memory = true;
  266. } else {
  267. is_need_free_memory = true;
  268. }
  269. total_size += device_address->size_;
  270. size_list.emplace_back(device_address->size_);
  271. addr_list.emplace_back(device_address);
  272. }
  273. AllocCommunicationOpMemory(is_need_alloc_memory, is_need_free_memory, addr_list, total_size, size_list);
  274. }
  275. void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel) {
  276. MS_EXCEPTION_IF_NULL(kernel);
  277. MS_EXCEPTION_IF_NULL(mem_manager_);
  278. bool is_need_alloc_memory = false;
  279. bool is_need_free_memory = false;
  280. size_t total_size = 0;
  281. std::vector<size_t> size_list;
  282. DeviceAddressPtrList addr_list;
  283. auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  284. MS_EXCEPTION_IF_NULL(kernel_mod);
  285. auto output_sizes = kernel_mod->GetOutputSizeList();
  286. for (size_t i = 0; i < output_sizes.size(); ++i) {
  287. auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i);
  288. MS_EXCEPTION_IF_NULL(device_address);
  289. if (device_address->ptr_ == nullptr) {
  290. is_need_alloc_memory = true;
  291. } else {
  292. is_need_free_memory = true;
  293. }
  294. total_size += output_sizes[i];
  295. size_list.emplace_back(output_sizes[i]);
  296. addr_list.emplace_back(device_address);
  297. }
  298. AllocCommunicationOpMemory(is_need_alloc_memory, is_need_free_memory, addr_list, total_size, size_list);
  299. }
// Allocates one contiguous pool region covering every address in addr_list
// (total_size bytes, partitioned per size_list). No-op when nothing needs
// allocating; any already-populated addresses are freed first so the fresh
// region can be contiguous.
// NOTE(review): addr_list and size_list are passed by value, copying both
// containers on every call; changing them to const& would also require
// updating the declaration in the header, so it is flagged rather than fixed
// here.
void GPUKernelRuntime::AllocCommunicationOpMemory(bool is_need_alloc_memory, bool is_need_free_memory,
                                                  const DeviceAddressPtrList addr_list, size_t total_size,
                                                  std::vector<size_t> size_list) {
  if (!is_need_alloc_memory) {
    return;
  }
  if (is_need_free_memory) {
    for (const auto &iter : addr_list) {
      MS_EXCEPTION_IF_NULL(iter);
      // Free the inputs/outputs of communication kernel which are not released.
      if (iter->ptr_ != nullptr) {
        mem_manager_->FreeMemFromMemPool(iter);
      }
    }
  }
  mem_manager_->MallocContinuousMemFromMemPool(addr_list, total_size, size_list);
}
// Returns to the memory pool everything this kernel no longer needs:
// inputs whose dynamic reference count drops to zero, outputs that nothing
// references, and the kernel's own workspace buffers.
void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel,
                                            const AddressPtrList &kernel_workspaces, uint32_t graph_id) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  auto mem_reuse_util_ptr = mem_reuse_util_map_[graph_id];
  MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr);
  auto cnode = kernel->cast<CNodePtr>();
  MS_EXCEPTION_IF_NULL(cnode);
  // AllReduce is skipped entirely — presumably because its buffers belong to
  // the contiguous region built in AllocCommunicationOpDynamicRes and must
  // not be freed piecemeal; TODO(review) confirm.
  if (AnfAlgo::GetCNodeName(kernel) == kAllReduceOpName) {
    return;
  }
  // Free the input of kernel by reference count.
  for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
    auto kernel_ref_count_ptr = mem_reuse_util_ptr->GetKernelInputRef(cnode, i);
    if (kernel_ref_count_ptr == nullptr) {
      continue;
    }
    // Can't free the output of graph.
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ == memreuse::kMaxRefCount) {
      continue;
    }
    kernel_ref_count_ptr->ref_count_dynamic_use_--;
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) {
      auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i);
      mem_manager_->FreeMemFromMemPool(device_address);
      // Reset the reference count.
      kernel_ref_count_ptr->ref_count_dynamic_use_ = kernel_ref_count_ptr->ref_count_;
    }
  }
  // Free the output of kernel, if output has no reference.
  for (size_t i = 0; i < AnfAlgo::GetOutputTensorNum(kernel); ++i) {
    auto kernel_ref_count_ptr = mem_reuse_util_ptr->GetRef(cnode, i);
    if (kernel_ref_count_ptr == nullptr) {
      continue;
    }
    if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) {
      auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i);
      mem_manager_->FreeMemFromMemPool(device_address);
    }
  }
  // Free the workspace of kernel.
  for (size_t i = 0; i < kernel_workspaces.size(); ++i) {
    auto workspace = kernel_workspaces[i];
    // Entries may be nullptr placeholders for zero-sized workspaces.
    if (workspace != nullptr) {
      MS_EXCEPTION_IF_NULL(workspace->addr);
      mem_manager_->FreeMemFromMemPool(workspace->addr);
      // Null the pointer so a double free is impossible.
      workspace->addr = nullptr;
    }
  }
}
  367. } // namespace gpu
  368. } // namespace device
  369. } // namespace mindspore