| @@ -13,21 +13,24 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/gpu/data/dataset_iterator_kernel.h" | |||
| #include <cuda_runtime_api.h> | |||
| #include <memory> | |||
| #include <string> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/gpu/data/dataset_utils.h" | |||
| #include "profiler/device/gpu/gpu_profiling.h" | |||
| #include "runtime/device/gpu/gpu_buffer_mgr.h" | |||
| #include "runtime/device/gpu/gpu_common.h" | |||
| #include "backend/kernel_compiler/gpu/data/dataset_utils.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| using mindspore::device::GpuBufferMgr; | |||
| using mindspore::device::HandleMgr; | |||
| DatasetIteratorKernel::DatasetIteratorKernel() : handle_(HandleMgr::INVALID_HANDLE), total_bytes_(0) {} | |||
| DatasetIteratorKernel::DatasetIteratorKernel() | |||
| : handle_(HandleMgr::INVALID_HANDLE), total_bytes_(0), profiling_enable_(false), profiling_op_(nullptr) {} | |||
| DatasetIteratorKernel::~DatasetIteratorKernel() { GpuBufferMgr::GetInstance().Close(handle_); } | |||
| @@ -60,6 +63,14 @@ bool DatasetIteratorKernel::Init(const CNodePtr &kernel_node) { | |||
| MS_LOG(EXCEPTION) << "Gpu Queue(" << queue_name_ << ") Open Failed"; | |||
| } | |||
| auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(profiler_inst); | |||
| profiling_enable_ = profiler_inst->GetEnableFlag(); | |||
| if (profiling_enable_) { | |||
| std::string path = profiler_inst->ProfileDataPath(); | |||
| profiling_op_ = std::make_shared<GetNextProfiling>(path); | |||
| profiler_inst->RegisterProfilingOp(profiling_op_); | |||
| } | |||
| return true; | |||
| } | |||
| @@ -69,11 +80,21 @@ bool DatasetIteratorKernel::Launch(const std::vector<AddressPtr> &, const std::v | |||
| const std::vector<AddressPtr> &outputs, void *stream) { | |||
| void *addr = nullptr; | |||
| size_t len = 0; | |||
| uint64_t start_time_stamp = 0; | |||
| uint32_t queue_size = 0; | |||
| int repeat = 0; | |||
| while (true) { | |||
| if (profiling_enable_) { | |||
| start_time_stamp = profiling_op_->GetTimeStamp(); | |||
| queue_size = GpuBufferMgr::GetInstance().Size(handle_); | |||
| } | |||
| auto ret = GpuBufferMgr::GetInstance().Front(handle_, &addr, &len); | |||
| if (ret == device::SUCCESS) { | |||
| if (profiling_enable_) { | |||
| uint64_t end_time_stamp = profiling_op_->GetTimeStamp(); | |||
| profiling_op_->RecordData(queue_size, start_time_stamp, end_time_stamp); | |||
| } | |||
| break; | |||
| } | |||
| @@ -84,10 +105,18 @@ bool DatasetIteratorKernel::Launch(const std::vector<AddressPtr> &, const std::v | |||
| continue; | |||
| } else { | |||
| MS_LOG(ERROR) << "Get data timeout"; | |||
| if (profiling_enable_) { | |||
| uint64_t end_time_stamp = profiling_op_->GetTimeStamp(); | |||
| profiling_op_->RecordData(queue_size, start_time_stamp, end_time_stamp); | |||
| } | |||
| return false; | |||
| } | |||
| } | |||
| if (profiling_enable_) { | |||
| uint64_t end_time_stamp = profiling_op_->GetTimeStamp(); | |||
| profiling_op_->RecordData(queue_size, start_time_stamp, end_time_stamp); | |||
| } | |||
| MS_LOG(ERROR) << "Get data failed, errcode " << ret; | |||
| return false; | |||
| } | |||
| @@ -17,8 +17,10 @@ | |||
| #ifndef MINDSPORE_GET_NEXT_KERNEL_H | |||
| #define MINDSPORE_GET_NEXT_KERNEL_H | |||
| #include <memory> | |||
| #include <string> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/gpu/data/dataset_profiling.h" | |||
| #include "backend/kernel_compiler/gpu/gpu_kernel.h" | |||
| #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h" | |||
| @@ -44,6 +46,8 @@ class DatasetIteratorKernel : public GpuKernel { | |||
| std::string queue_name_; | |||
| unsigned int handle_; | |||
| size_t total_bytes_; | |||
| bool profiling_enable_; | |||
| std::shared_ptr<GetNextProfiling> profiling_op_; | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| @@ -0,0 +1,70 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/gpu/data/dataset_profiling.h" | |||
| #include <fstream> | |||
| #include <memory> | |||
| #include <string> | |||
| #include <utility> | |||
| #include "utils/log_adapter.h" | |||
| #include "utils/ms_utils.h" | |||
| #include "utils/utils.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| GetNextProfiling::GetNextProfiling(const std::string &path) : profiling_path_(path) {} | |||
| void GetNextProfiling::GetDeviceId() { | |||
| // If DEVICE_ID is not set, the default value is 0 | |||
| device_id_ = common::GetEnv("DEVICE_ID"); | |||
| if (device_id_.empty()) { | |||
| device_id_ = "0"; | |||
| } | |||
| } | |||
| void GetNextProfiling::Init() { | |||
| GetDeviceId(); | |||
| file_name_ = profiling_path_ + "/minddata_getnext_profiling_" + device_id_ + ".txt"; | |||
| op_name_ = kGetNextOpName; | |||
| } | |||
| void GetNextProfiling::SaveProfilingData() { | |||
| std::ofstream handle(file_name_, std::ios::trunc); | |||
| if (!handle.is_open()) { | |||
| MS_LOG(ERROR) << "Open get-next profiling file failed."; | |||
| return; | |||
| } | |||
| for (uint32_t index = 0; index < queue_size_.size(); index++) { | |||
| handle << Name() << " " << time_stamp_[index].first << " " << time_stamp_[index].second << " " << queue_size_[index] | |||
| << std::endl; | |||
| } | |||
| handle.close(); | |||
| } | |||
| void GetNextProfiling::RecordData(uint32_t queue_size, uint64_t start_time_stamp, uint64_t end_time_stamp) { | |||
| queue_size_.emplace_back(queue_size); | |||
| std::pair<uint64_t, uint64_t> time_stamp(start_time_stamp, end_time_stamp); | |||
| time_stamp_.emplace_back(time_stamp); | |||
| } | |||
| uint64_t GetNextProfiling::GetTimeStamp() const { | |||
| auto cur_sys_clock = std::chrono::system_clock::now(); | |||
| uint64_t time_stamp = std::chrono::duration_cast<std::chrono::nanoseconds>(cur_sys_clock.time_since_epoch()).count(); | |||
| return time_stamp; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,50 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_DATASET_DATASET_PROFILING_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_DATASET_DATASET_PROFILING_H_ | |||
| #include <memory> | |||
| #include <string> | |||
| #include <utility> | |||
| #include <vector> | |||
| #include "profiler/device/gpu/gpu_profiling.h" | |||
| using mindspore::profiler::gpu::ProfilingOp; | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class GetNextProfiling : public ProfilingOp { | |||
| public: | |||
| explicit GetNextProfiling(const std::string &path); | |||
| ~GetNextProfiling() = default; | |||
| void SaveProfilingData(); | |||
| void GetDeviceId(); | |||
| uint64_t GetTimeStamp() const; | |||
| void RecordData(uint32_t queue_size, uint64_t start_time_stamp, uint64_t end_time_stamp); | |||
| void Init(); | |||
| private: | |||
| std::string profiling_path_; | |||
| std::string file_name_; | |||
| std::vector<uint32_t> queue_size_; | |||
| std::vector<std::pair<uint64_t, uint64_t>> time_stamp_; // First value of std::pair is the start time stamp, | |||
| // Second value of std::pair is the stop time stamp | |||
| std::string device_id_; | |||
| }; | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_DATASET_DATASET_PROFILING_H_ | |||
| @@ -14,18 +14,19 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #include "minddata/dataset/engine/datasetops/device_queue_op.h" | |||
| #include <iomanip> | |||
| #include <iostream> | |||
| #include <memory> | |||
| #include "minddata/dataset/core/config_manager.h" | |||
| #include "minddata/dataset/core/global_context.h" | |||
| #include "minddata/dataset/engine/datasetops/device_queue_op.h" | |||
| #include "minddata/dataset/engine/data_buffer.h" | |||
| #include "minddata/dataset/engine/dataset_iterator.h" | |||
| #include "minddata/dataset/engine/datasetops/epoch_ctrl_op.h" | |||
| #include "minddata/dataset/engine/opt/pass.h" | |||
| #include "minddata/dataset/engine/perf/profiling.h" | |||
| #include "minddata/dataset/engine/perf/device_queue_tracing.h" | |||
| #include "minddata/dataset/engine/datasetops/epoch_ctrl_op.h" | |||
| #include "minddata/dataset/engine/perf/profiling.h" | |||
| #include "minddata/dataset/util/status.h" | |||
| #include "minddata/dataset/util/task_manager.h" | |||
| @@ -197,6 +198,19 @@ Status DeviceQueueOp::SendDataToGPU() { | |||
| bool is_open = false; | |||
| uint32_t handle = INVALID_HANDLE; | |||
| auto release_function = std::bind(&DeviceQueueOp::ReleaseData, this, std::placeholders::_1); | |||
| double batch_start_time, end_time; | |||
| int32_t batch_cost, push_cost; | |||
| int32_t connector_size = 0; | |||
| int32_t connector_capacity; | |||
| std::shared_ptr<DeviceQueueTracing> profiling_node; | |||
| bool isProfilingEnable = tree_->GetProfilingManager()->IsProfilingEnable(); | |||
| if (isProfilingEnable) { | |||
| std::shared_ptr<Tracing> node; | |||
| RETURN_IF_NOT_OK(tree_->GetProfilingManager()->GetTracingNode(kDeviceQueueTracingName, &node)); | |||
| profiling_node = std::dynamic_pointer_cast<DeviceQueueTracing>(node); | |||
| batch_start_time = ProfilingTime::GetCurMilliSecond(); | |||
| connector_capacity = ChildOpConnectorCapacity(); | |||
| } | |||
| std::unique_ptr<DataBuffer> current_buffer; | |||
| RETURN_IF_NOT_OK(GetNextInput(¤t_buffer)); | |||
| @@ -220,20 +234,44 @@ Status DeviceQueueOp::SendDataToGPU() { | |||
| } | |||
| is_open = true; | |||
| } | |||
| RETURN_IF_NOT_OK(RetryPushGPUData(data_size, curr_row, handle)); | |||
| RETURN_IF_NOT_OK(RetryPushGPUData(data_size, curr_row, handle, isProfilingEnable, &push_cost)); | |||
| total_batch++; | |||
| if (isProfilingEnable) { | |||
| end_time = ProfilingTime::GetCurMilliSecond(); | |||
| // record push data time | |||
| profiling_node->Record(TIME, TDT_PUSH_TIME, total_batch, push_cost); | |||
| batch_cost = (int32_t)(end_time - batch_start_time); | |||
| // record batch time | |||
| profiling_node->Record(TIME, BATCH_TIME, total_batch, batch_cost); | |||
| // record pipeline time | |||
| profiling_node->Record(TIME, PIPELINE_TIME, total_batch, batch_cost - push_cost); | |||
| batch_start_time = end_time; | |||
| // record connector depth | |||
| profiling_node->Record(CONNECTOR_DEPTH, connector_capacity, total_batch, connector_size); | |||
| } | |||
| } | |||
| if (!TaskManager::FindMe()->Interrupted() && !GpuBufferMgr::GetInstance().IsClosed()) | |||
| if (!TaskManager::FindMe()->Interrupted() && !GpuBufferMgr::GetInstance().IsClosed()) { | |||
| if (isProfilingEnable) { | |||
| connector_size = ChildOpConnectorSize(); | |||
| connector_capacity = ChildOpConnectorCapacity(); | |||
| } | |||
| RETURN_IF_NOT_OK(GetNextInput(¤t_buffer)); | |||
| else | |||
| } else { | |||
| is_break_loop = true; | |||
| } | |||
| } | |||
| if (!TaskManager::FindMe()->Interrupted() && !GpuBufferMgr::GetInstance().IsClosed()) | |||
| if (!TaskManager::FindMe()->Interrupted() && !GpuBufferMgr::GetInstance().IsClosed()) { | |||
| if (isProfilingEnable) { | |||
| connector_size = ChildOpConnectorSize(); | |||
| connector_capacity = ChildOpConnectorCapacity(); | |||
| } | |||
| RETURN_IF_NOT_OK(GetNextInput(¤t_buffer)); | |||
| else | |||
| } else { | |||
| is_break_loop = true; | |||
| } | |||
| } | |||
| tree_->SetFinished(); | |||
| MS_LOG(INFO) << "Device queue total batch is " << total_batch << "."; | |||
| GpuBufferMgr::GetInstance().Close(handle); | |||
| @@ -241,9 +279,10 @@ Status DeviceQueueOp::SendDataToGPU() { | |||
| return Status::OK(); | |||
| } | |||
| Status DeviceQueueOp::RetryPushGPUData(const std::vector<size_t> &data_size, const TensorRow &curr_row, | |||
| uint32_t handle) { | |||
| Status DeviceQueueOp::RetryPushGPUData(const std::vector<size_t> &data_size, const TensorRow &curr_row, uint32_t handle, | |||
| bool profiling, int32_t *push_time) { | |||
| std::vector<device::DataItemGpu> items; | |||
| double start_time; | |||
| for (int i = 0; i < data_size.size(); i++) { | |||
| device::DataItemGpu data_item; | |||
| data_item.data_len_ = data_size[i]; | |||
| @@ -253,7 +292,14 @@ Status DeviceQueueOp::RetryPushGPUData(const std::vector<size_t> &data_size, con | |||
| while (!GpuBufferMgr::GetInstance().IsClosed() && !TaskManager::FindMe()->Interrupted()) { | |||
| RETURN_IF_NOT_OK(MallocForGPUData(&items, curr_row)); | |||
| if (profiling) { | |||
| start_time = ProfilingTime::GetCurMilliSecond(); | |||
| } | |||
| BlockQueueStatus_T ret = GpuBufferMgr::GetInstance().Push(handle, items, WAIT_TIME); | |||
| if (profiling) { | |||
| double end_time = ProfilingTime::GetCurMilliSecond(); | |||
| *push_time = (int32_t)(end_time - start_time); | |||
| } | |||
| if (ret) { | |||
| for (int i = 0; i < items.size(); i++) { | |||
| ReleaseData(items[i].data_ptr_); | |||
| @@ -168,7 +168,8 @@ class DeviceQueueOp : public PipelineOp { | |||
| #ifdef ENABLE_GPUQUE | |||
| Status SendDataToGPU(); | |||
| Status RetryPushGPUData(const std::vector<size_t> &data_size, const TensorRow &curr_row, uint32_t handle); | |||
| Status RetryPushGPUData(const std::vector<size_t> &data_size, const TensorRow &curr_row, uint32_t handle, | |||
| bool profiling, int32_t *push_time); | |||
| Status MallocForGPUData(std::vector<device::DataItemGpu> *items, const TensorRow &curr_row); | |||
| void ReleaseData(void *addr); | |||
| @@ -14,14 +14,16 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #include "profiler/device/gpu/gpu_profiling.h" | |||
| #include <cxxabi.h> | |||
| #include <cmath> | |||
| #include <chrono> | |||
| #include "profiler/device/gpu/gpu_profiling.h" | |||
| #include <cmath> | |||
| #include "profiler/device/gpu/cupti_interface.h" | |||
| #include "profiler/device/gpu/data_saver.h" | |||
| #include "utils/log_adapter.h" | |||
| #include "pybind_api/api_register.h" | |||
| #include "utils/log_adapter.h" | |||
| #include "utils/utils.h" | |||
| namespace mindspore { | |||
| namespace profiler { | |||
| @@ -456,6 +458,13 @@ void GPUProfiler::Stop() { | |||
| ClearInst(); | |||
| } | |||
| void GPUProfiler::SaveExtraProfileData() { | |||
| for (auto op : profiling_op_) { | |||
| op.second->SaveProfilingData(); | |||
| } | |||
| MS_LOG(INFO) << "Save extra profiling data end."; | |||
| } | |||
| void GPUProfiler::SaveProfileData() { | |||
| if (profile_data_path_.empty()) { | |||
| MS_LOG(WARNING) << "Profile data path is empty, skip save profile data."; | |||
| @@ -464,6 +473,7 @@ void GPUProfiler::SaveProfileData() { | |||
| dataSaver.ParseOpInfo(op_info_map_); | |||
| dataSaver.ParseEvent(events_); | |||
| dataSaver.WriteFile(profile_data_path_); | |||
| SaveExtraProfileData(); | |||
| } | |||
| } | |||
| @@ -639,6 +649,13 @@ void GPUProfiler::HandleActivityRecord(CUpti_Activity *record) { | |||
| AddEvent(std::move(profilingData)); | |||
| } | |||
| void GPUProfiler::RegisterProfilingOp(std::shared_ptr<ProfilingOp> node) { | |||
| if (profiling_op_.find(node->Name()) != profiling_op_.end()) { | |||
| return; | |||
| } | |||
| node->Init(); | |||
| profiling_op_[node->Name()] = node; | |||
| } | |||
| void CUPTIAPI GPUProfiler::AllocBuffer(uint8_t **buffer, size_t *size, size_t *maxNumRecords) { | |||
| int stat = posix_memalign(reinterpret_cast<void **>(buffer), ALIGN_SIZE, BUF_SIZE); | |||
| @@ -18,14 +18,15 @@ | |||
| #define MINDSPORE_GPU_PROFILING_H | |||
| #include <cuda.h> | |||
| #include <cupti.h> | |||
| #include <algorithm> | |||
| #include <cstdio> | |||
| #include <unordered_map> | |||
| #include <string> | |||
| #include <vector> | |||
| #include <mutex> | |||
| #include <map> | |||
| #include <memory> | |||
| #include <algorithm> | |||
| #include <mutex> | |||
| #include <string> | |||
| #include <unordered_map> | |||
| #include <utility> | |||
| #include <vector> | |||
| namespace mindspore { | |||
| namespace profiler { | |||
| @@ -109,6 +110,18 @@ struct BaseTime { | |||
| const float kTimeUnit = 1000; | |||
| class ProfilingOp { | |||
| public: | |||
| ProfilingOp() = default; | |||
| virtual ~ProfilingOp() = default; | |||
| virtual void SaveProfilingData() = 0; | |||
| virtual void Init() = 0; | |||
| std::string Name() const { return op_name_; } | |||
| protected: | |||
| std::string op_name_; | |||
| }; | |||
| class GPUProfiler { | |||
| public: | |||
| static std::shared_ptr<GPUProfiler> GetInstance(); | |||
| @@ -130,6 +143,8 @@ class GPUProfiler { | |||
| void OpDataProducerBegin(const std::string op_name, void *stream); | |||
| void OpDataProducerEnd(); | |||
| void ProcessEvents(); | |||
| void RegisterProfilingOp(std::shared_ptr<ProfilingOp> node); | |||
| std::string ProfileDataPath() const { return profile_data_path_; } | |||
| private: | |||
| GPUProfiler() = default; | |||
| @@ -153,6 +168,7 @@ class GPUProfiler { | |||
| std::string op_name_; | |||
| void *stream_; | |||
| void SaveProfileData(); | |||
| void SaveExtraProfileData(); | |||
| std::mutex event_mutex_; | |||
| std::vector<CUpti_ActivityKind> activities_enable_; | |||
| @@ -172,6 +188,7 @@ class GPUProfiler { | |||
| uint64_t op_host_time_stop_; | |||
| uint64_t op_cupti_time_start_; | |||
| std::string profile_data_path_; | |||
| std::map<std::string, std::shared_ptr<ProfilingOp>> profiling_op_; | |||
| }; | |||
| } // namespace gpu | |||
| } // namespace profiler | |||