| @@ -64,7 +64,7 @@ bool DatasetIteratorKernel::Init(const CNodePtr &kernel_node) { | |||||
| void DatasetIteratorKernel::InitSizeLists() { return; } | void DatasetIteratorKernel::InitSizeLists() { return; } | ||||
| bool DatasetIteratorKernel::Launch(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &, | bool DatasetIteratorKernel::Launch(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &, | ||||
| const std::vector<AddressPtr> &outputs, void *) { | |||||
| const std::vector<AddressPtr> &outputs, void *stream) { | |||||
| void *addr = nullptr; | void *addr = nullptr; | ||||
| size_t len = 0; | size_t len = 0; | ||||
| @@ -96,11 +96,14 @@ bool DatasetIteratorKernel::Launch(const std::vector<AddressPtr> &, const std::v | |||||
| } | } | ||||
| for (size_t i = 0; i < output_size_list_.size(); i++) { | for (size_t i = 0; i < output_size_list_.size(); i++) { | ||||
| CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpy(outputs[i]->addr, addr, output_size_list_[i], cudaMemcpyDeviceToDevice), | |||||
| CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(outputs[i]->addr, addr, output_size_list_[i], cudaMemcpyDeviceToDevice, | |||||
| reinterpret_cast<cudaStream_t>(stream)), | |||||
| "Cuda Memcpy Failed"); | "Cuda Memcpy Failed"); | ||||
| addr = reinterpret_cast<unsigned char *>(addr) + output_size_list_[i]; | addr = reinterpret_cast<unsigned char *>(addr) + output_size_list_[i]; | ||||
| } | } | ||||
| CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream)), | |||||
| "cudaStreamSynchronize failed"); | |||||
| (void)GpuBufferMgr::GetInstance().Pop(handle_); | (void)GpuBufferMgr::GetInstance().Pop(handle_); | ||||
| return true; | return true; | ||||
| } | } | ||||