Merge pull request !21362 from lvchangquan/profiling_refactor (tag: v1.5.0-rc1)
| @@ -32,6 +32,7 @@ | |||
| #include "ir/tensor.h" | |||
| #include "utils/any.h" | |||
| #include "utils/utils.h" | |||
| #include "utils/profile.h" | |||
| #include "utils/ms_context.h" | |||
| #include "utils/check_convert_utils.h" | |||
| #include "utils/context/context_extends.h" | |||
| @@ -67,6 +68,7 @@ | |||
| #include "debug/anf_ir_dump.h" | |||
| #include "runtime/hardware/device_context_manager.h" | |||
| #include "runtime/device/pynative_profiling.h" | |||
| using mindspore::tensor::TensorPy; | |||
| @@ -702,6 +704,9 @@ py::object GetDstType(const TypeId &type_id) { | |||
| } // namespace | |||
| py::object RealRunOp(const py::args &args) { | |||
| auto real_run_op_start = GetTime(); | |||
| auto &profiler_inst = device::PynativeProfiler::GetInstance(); | |||
| profiler_inst.AddRealRunOpIndex(); | |||
| CheckPyNativeContext(); | |||
| auto executor = PynativeExecutor::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(executor); | |||
| @@ -709,6 +714,10 @@ py::object RealRunOp(const py::args &args) { | |||
| MS_EXCEPTION_IF_NULL(op_exec_info); | |||
| py::object ret = py::none(); | |||
| PynativeExecutorTry(executor->forward_executor()->RunOpS, &ret, op_exec_info); | |||
| auto real_run_op_end = GetTime(); | |||
| profiler_inst.SetRealRunOpName(op_exec_info->op_name); | |||
| profiler_inst.SetRealRunOpTime(std::make_pair(real_run_op_start, real_run_op_end)); | |||
| profiler_inst.SingleOpProfilingData(); | |||
| return ret; | |||
| } | |||
| @@ -25,6 +25,7 @@ | |||
| #include "pybind_api/api_register.h" | |||
| #include "utils/log_adapter.h" | |||
| #include "utils/utils.h" | |||
| #include "utils/profile.h" | |||
| #include "utils/ms_context.h" | |||
| namespace mindspore { | |||
| @@ -446,6 +447,12 @@ void GPUProfiler::OpDataProducerBegin(const std::string op_name, void *stream) { | |||
| } | |||
| } | |||
| void GPUProfiler::SingleOpLaunchTimeProcess(float op_time_elapsed) { | |||
| auto launch_end_time = GetTime(); | |||
| double launch_start_time = launch_end_time - op_time_elapsed / kTimeUnit / kTimeUnit; | |||
| SetSingleOpLaunchTime(std::make_pair(launch_start_time, launch_end_time)); | |||
| } | |||
| void GPUProfiler::OpDataProducerEnd() { | |||
| float op_time_elapsed = 0; | |||
| if (sync_enable_flag_) { | |||
| @@ -459,9 +466,11 @@ void GPUProfiler::OpDataProducerEnd() { | |||
| CHECK_CUDA_RET_WITH_ERROR(cudaEventDestroy(op_event_stop_), "cudaEventDestroy op event stop failed"); | |||
| op_time_elapsed = op_time_elapsed * kTimeUnit; | |||
| op_host_time_stop_ = GetHostTimeStamp(); | |||
| SingleOpLaunchTimeProcess(op_time_elapsed); | |||
| } else { | |||
| op_host_time_stop_ = GetHostTimeStamp(); | |||
| op_time_elapsed = (op_host_time_stop_ - op_host_time_start_) / kTimeUnit; | |||
| SingleOpLaunchTimeProcess(op_time_elapsed); | |||
| } | |||
| MS_LOG(DEBUG) << "Host Time Elapsed(us)," << op_name_ << "," << op_time_elapsed; | |||
| Profiler::SetRunTimeData(op_name_, op_time_elapsed); | |||
| @@ -135,6 +135,7 @@ class GPUProfiler : public Profiler { | |||
| std::string ProfileDataPath() const { return profile_data_path_; } | |||
| private: | |||
| void SingleOpLaunchTimeProcess(float op_time_elapsed); | |||
| void OpsParser(); | |||
| void EventLog(const Event &event); | |||
| void ClearInst() override; | |||
| @@ -79,6 +79,10 @@ class Profiler { | |||
| bool GetEnableFlag() const { return enable_flag_; } | |||
| std::string ProfileDataPath() const { return profile_data_path_; } | |||
| void RecordOneStepStartEndInfo(std::string op_name); | |||
| std::pair<double, double> GetSingleOpLaunchTime() { return single_op_launch_start_time_end_time_; } | |||
  // Stores the (launch start, launch end) timestamp pair for the current
  // single-op launch; later read back via GetSingleOpLaunchTime().
  void SetSingleOpLaunchTime(const std::pair<double, double> &launch_start_end) {
    single_op_launch_start_time_end_time_ = launch_start_end;
  }
| protected: | |||
| void SetRunTimeData(const std::string &op_name, const float time_elapsed); | |||
| @@ -86,6 +90,7 @@ class Profiler { | |||
| uint64_t GetHostMonoTimeStamp() const; | |||
| virtual void SaveProfileData() = 0; | |||
| virtual void ClearInst() = 0; | |||
| std::pair<double, double> single_op_launch_start_time_end_time_; | |||
| bool enable_flag_ = false; | |||
| std::string profile_data_path_; | |||
| std::unordered_map<std::string, OpInfo> op_info_map_; | |||
| @@ -1,7 +1,7 @@ | |||
| file(GLOB_RECURSE DEVICE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "common/*.cc" | |||
| "kernel_info.cc" "executor/dynamic_kernel.cc" "executor/executor_callback.cc" "kernel_runtime.cc" | |||
| "memory_manager.cc" "kernel_runtime_manager.cc" "convert_tensor_utils.cc" | |||
| "bucket.cc" "launch_kernel.cc" "launch_mul.cc" | |||
| "bucket.cc" "launch_kernel.cc" "launch_mul.cc" "pynative_profiling.cc" | |||
| ) | |||
| if("${ENABLE_HIDDEN}" STREQUAL "OFF") | |||
| @@ -29,6 +29,14 @@ AscendEvent::AscendEvent() { | |||
| } | |||
| } | |||
| AscendTimeEvent::AscendTimeEvent() { | |||
| auto ret = rtEventCreateWithFlag(&event_, RT_EVENT_TIME_LINE); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_LOG(ERROR) << "rtEventCreate failed, ret:" << ret; | |||
| event_ = nullptr; | |||
| } | |||
| } | |||
| AscendEvent::~AscendEvent() { | |||
| auto ret = rtEventDestroy(event_); | |||
| if (ret != RT_ERROR_NONE) { | |||
| @@ -60,5 +68,24 @@ void AscendEvent::WaitEvent() { | |||
| need_wait_ = false; | |||
| } | |||
| void AscendEvent::SyncEvent() { | |||
| MS_EXCEPTION_IF_NULL(event_); | |||
| auto ret = rtEventSynchronize(event_); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_LOG(EXCEPTION) << "rtEventSynchronize failed, ret:" << ret; | |||
| } | |||
| } | |||
| void AscendEvent::ElapsedTime(float *cost_time, DeviceEvent *other) { | |||
| MS_EXCEPTION_IF_NULL(event_); | |||
| auto ascend_other = static_cast<AscendEvent *>(other); | |||
| MS_EXCEPTION_IF_NULL(ascend_other); | |||
| MS_EXCEPTION_IF_NULL(ascend_other->event_); | |||
| auto ret = rtEventElapsedTime(cost_time, event_, ascend_other->event_); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_LOG(EXCEPTION) << "rtEventElapsedTime failed, ret:" << ret; | |||
| } | |||
| } | |||
| bool AscendEvent::NeedWait() { return need_wait_; } | |||
| } // namespace mindspore::device::ascend | |||
| @@ -19,6 +19,7 @@ | |||
| #include "runtime/base.h" | |||
| #include "ir/device_event.h" | |||
| namespace mindspore::device::ascend { | |||
| class AscendEvent : public DeviceEvent { | |||
| public: | |||
| @@ -28,14 +29,22 @@ class AscendEvent : public DeviceEvent { | |||
| void WaitEvent() override; | |||
| void RecordEvent() override; | |||
| bool NeedWait() override; | |||
| void SyncEvent() override; | |||
| void ElapsedTime(float *cost_time, DeviceEvent *other) override; | |||
| void set_wait_stream(rtStream_t wait_stream) override { wait_stream_ = wait_stream; } | |||
| void set_record_stream(rtStream_t record_stream) override { record_stream_ = record_stream; } | |||
| private: | |||
| protected: | |||
| rtEvent_t event_{nullptr}; | |||
| rtStream_t wait_stream_{nullptr}; | |||
| rtStream_t record_stream_{nullptr}; | |||
| bool need_wait_{false}; | |||
| }; | |||
// AscendEvent whose underlying runtime event is created with the
// RT_EVENT_TIME_LINE flag (see the constructor in ascend_event.cc), making it
// usable with ElapsedTime(); inherits all other behavior from AscendEvent.
class AscendTimeEvent : public AscendEvent {
 public:
  AscendTimeEvent();
  ~AscendTimeEvent() override = default;
};
| } // namespace mindspore::device::ascend | |||
| #endif // MINDSPORE_ASCEND_EVENT_H | |||
| @@ -282,6 +282,13 @@ void AscendKernelRuntime::PreInit() { | |||
| } | |||
| bool AscendKernelRuntime::Init() { | |||
| auto ms_context = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(ms_context); | |||
| auto execution_mode = ms_context->get_param<int>(MS_CTX_EXECUTION_MODE); | |||
| auto profiling_flag = ms_context->get_param<bool>(MS_CTX_ENABLE_PROFILING); | |||
| if (execution_mode == kPynativeMode && profiling_flag) { | |||
| pynative_mode_profiling_flag_ = true; | |||
| } | |||
| if (initialized_) { | |||
| SetCurrentContext(); | |||
| return true; | |||
| @@ -946,6 +953,12 @@ std::shared_ptr<DeviceEvent> AscendKernelRuntime::CreateDeviceEvent() { | |||
| return ascend_event; | |||
| } | |||
| std::shared_ptr<DeviceEvent> AscendKernelRuntime::CreateDeviceTimeEvent() { | |||
| auto ascend_time_event = std::make_shared<AscendTimeEvent>(); | |||
| MS_EXCEPTION_IF_NULL(ascend_time_event); | |||
| return ascend_time_event; | |||
| } | |||
| uint64_t AscendKernelRuntime::GetAvailableMemMaxSize() const { | |||
| auto ascend_mem_manager = std::dynamic_pointer_cast<AscendMemoryManager>(mem_manager_); | |||
| MS_EXCEPTION_IF_NULL(ascend_mem_manager); | |||
| @@ -58,6 +58,7 @@ class AscendKernelRuntime : public KernelRuntime { | |||
| uint64_t GetAvailableMemMaxSize() const override; | |||
| DeviceAddressType GetTargetDeviceAddressType() const override { return DeviceAddressType::kAscend; }; | |||
| std::shared_ptr<DeviceEvent> CreateDeviceEvent() override; | |||
| std::shared_ptr<DeviceEvent> CreateDeviceTimeEvent() override; | |||
| void *compute_stream() const override { return stream_; } | |||
| void *communication_stream() const override { return communication_stream_; } | |||
| @@ -42,5 +42,19 @@ void GpuEvent::RecordEvent() { | |||
| need_wait_ = true; | |||
| } | |||
// Blocks the host until all device work captured by event_ has completed;
// throws (via the CHECK macro) if cudaEventSynchronize reports an error.
void GpuEvent::SyncEvent() {
  MS_EXCEPTION_IF_NULL(event_);
  CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(cudaEventSynchronize(event_), "cudaEventSynchronize failed");
}
| void GpuEvent::ElapsedTime(float *cost_time, DeviceEvent *other) { | |||
| MS_EXCEPTION_IF_NULL(event_); | |||
| auto gpu_event = static_cast<GpuEvent *>(other); | |||
| MS_EXCEPTION_IF_NULL(gpu_event); | |||
| MS_EXCEPTION_IF_NULL(gpu_event->event_); | |||
| CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(cudaEventElapsedTime(cost_time, event_, gpu_event->event_), | |||
| "cudaEventElapsedTime failed"); | |||
| } | |||
| bool GpuEvent::NeedWait() { return need_wait_; } | |||
| } // namespace mindspore::device::gpu | |||
| @@ -29,6 +29,8 @@ class GpuEvent : public DeviceEvent { | |||
| void WaitEvent() override; | |||
| void RecordEvent() override; | |||
| bool NeedWait() override; | |||
| void SyncEvent() override; | |||
| void ElapsedTime(float *cost_time, DeviceEvent *other) override; | |||
| void set_wait_stream(void *wait_stream) override { wait_stream_ = static_cast<cudaStream_t>(wait_stream); } | |||
| void set_record_stream(void *record_stream) override { record_stream_ = static_cast<cudaStream_t>(record_stream); } | |||
| @@ -32,6 +32,7 @@ | |||
| #include "utils/utils.h" | |||
| #include "frontend/parallel/context.h" | |||
| #include "debug/env_config_parser.h" | |||
| #include "runtime/device/pynative_profiling.h" | |||
| #if ((defined ENABLE_CPU) && (!defined _WIN32)) | |||
| #include "ps/ps_cache/ps_cache_manager.h" | |||
| #endif | |||
| @@ -966,6 +967,36 @@ void KernelRuntime::LaunchKernelEvent(const std::vector<std::vector<std::functio | |||
| } | |||
| } | |||
// Launches |kernel_mod| on |stream| bracketed by two device time events,
// computes the device-side launch interval, and records
// (op_name, (start, end)) on the PynativeProfiler singleton.
// Throws if the launch failed; otherwise returns true.
bool KernelRuntime::LaunchKernelWithPynativeProfiling(kernel::KernelMod *kernel_mod, const std::string &op_name,
                                                      const std::vector<AddressPtr> &inputs,
                                                      const std::vector<AddressPtr> &workspace,
                                                      const std::vector<AddressPtr> &outputs, void *stream) {
  MS_EXCEPTION_IF_NULL(kernel_mod);
  MS_EXCEPTION_IF_NULL(stream);
  float cost_time = 0;
  // Record both events on the launch stream so they bracket exactly this kernel.
  auto start = CreateDeviceTimeEvent();
  auto end = CreateDeviceTimeEvent();
  MS_EXCEPTION_IF_NULL(start);
  MS_EXCEPTION_IF_NULL(end);
  start->set_record_stream(stream);
  end->set_record_stream(stream);
  start->RecordEvent();
  bool ret = kernel_mod->Launch(inputs, workspace, outputs, stream);
  end->RecordEvent();
  // Wait for both events to complete before querying the elapsed time.
  start->SyncEvent();
  end->SyncEvent();
  start->ElapsedTime(&cost_time, end.get());
  auto launch_end_time = GetTime();
  auto &profiler_inst = PynativeProfiler::GetInstance();
  // cost_time is presumably in milliseconds (cudaEventElapsedTime-style API —
  // confirm for Ascend); kBasicTimeTransferUnit (1000) converts it to seconds,
  // the unit GetTime() uses, so start = end - elapsed is consistent.
  double launch_start_time = launch_end_time - cost_time / kBasicTimeTransferUnit;
  auto op_launch_start_time_end_time = std::make_pair(launch_start_time, launch_end_time);
  profiler_inst.SetOpNameAndLaunchTime(std::make_pair(op_name, op_launch_start_time_end_time));
  if (!ret) {
    MS_LOG(EXCEPTION) << "Launch kernel failed, kernel name is : " << op_name;
  }
  return ret;
}
| bool KernelRuntime::LaunchKernelMod(const session::KernelGraph &graph) { | |||
| const auto &kernels = graph.execution_order(); | |||
| std::vector<DynamicKernelPtr> dynamic_kernel_list; | |||
| @@ -1020,9 +1051,19 @@ bool KernelRuntime::LaunchKernelMod(const session::KernelGraph &graph) { | |||
| GenLaunchArgs(*kernel_mod, kernel, &kernel_inputs, &kernel_workspaces, &kernel_outputs); | |||
| bool ret; | |||
| if (AnfAlgo::IsCommunicationOp(kernel)) { | |||
| ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, communication_stream_); | |||
| if (pynative_mode_profiling_flag_) { | |||
| ret = LaunchKernelWithPynativeProfiling(kernel_mod, kernel->fullname_with_scope(), kernel_inputs, | |||
| kernel_workspaces, kernel_outputs, communication_stream_); | |||
| } else { | |||
| ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, communication_stream_); | |||
| } | |||
| } else { | |||
| ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_); | |||
| if (pynative_mode_profiling_flag_) { | |||
| ret = LaunchKernelWithPynativeProfiling(kernel_mod, kernel->fullname_with_scope(), kernel_inputs, | |||
| kernel_workspaces, kernel_outputs, stream_); | |||
| } else { | |||
| ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_); | |||
| } | |||
| } | |||
| if (!ret) { | |||
| MS_LOG(ERROR) << "Launch kernel failed."; | |||
| @@ -98,6 +98,7 @@ class KernelRuntime { | |||
| virtual uint64_t GetAvailableMemMaxSize() const { return 0; } | |||
| void GenKernelEvents(const session::KernelGraph *graph); | |||
| virtual std::shared_ptr<DeviceEvent> CreateDeviceEvent() { return nullptr; } | |||
| virtual std::shared_ptr<DeviceEvent> CreateDeviceTimeEvent() { return nullptr; } | |||
| virtual DeviceAddressType GetTargetDeviceAddressType() const = 0; | |||
| virtual void *compute_stream() const { return nullptr; } | |||
| virtual void *communication_stream() const { return nullptr; } | |||
| @@ -134,6 +135,10 @@ class KernelRuntime { | |||
| void RunOpAssignOutputNodeMemory(const ValuePtr &pre_output_value, session::KernelGraph *graph); | |||
| void AssignValueNodeTensor(const ValueNodePtr &value_node, const ValuePtr &node_value, size_t output_idx); | |||
| DeviceAddressPtr PreAssignCNodeMemory(const AnfNodePtr &anf_node, size_t index); | |||
| bool LaunchKernelWithPynativeProfiling(kernel::KernelMod *kernel_mod, const std::string &op_name, | |||
| const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream); | |||
| #if (ENABLE_CPU && !_WIN32) | |||
| void GetFirstPSEmbeddingCache(const session::KernelGraph *graph, AnfNodePtr *const first_cache_input_index, | |||
| size_t *const first_cache_size); | |||
| @@ -143,6 +148,7 @@ class KernelRuntime { | |||
| protected: | |||
| uint32_t device_id_{0}; | |||
| bool pynative_mode_profiling_flag_{false}; | |||
| #if !defined(_WIN32) && !defined(_WIN64) | |||
| std::shared_ptr<Debugger> debugger_; | |||
| #endif | |||
| @@ -0,0 +1,49 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "runtime/device/pynative_profiling.h" | |||
| #include <iostream> | |||
| #include <fstream> | |||
| #include <utility> | |||
| #include <memory> | |||
| #include <string> | |||
| #include "utils/profile.h" | |||
| #include "utils/utils.h" | |||
| namespace mindspore { | |||
| namespace device { | |||
// Flushes the profiling data collected for the current RealRunOp call as CSV
// rows, then clears the per-call launch-time vector. The header row is written
// only on the first call (real_run_op_index_ == 1), so AddRealRunOpIndex()
// must have been called before this.
// NOTE(review): the function-local static ofstream and the unsynchronized
// member reads make this single-threaded by construction — confirm it is only
// invoked from the pynative forward thread.
void PynativeProfiler::SingleOpProfilingData() {
  // One stream for the whole process lifetime; flushed/closed at exit.
  static std::ofstream of("pynative_forward_profiling_data.csv");
  of.setf(std::ios::fixed, std::ios::floatfield);
  if (real_run_op_index_ == 1) {
    of << "RealRunOpIndex" << ',' << "RealRunOpName" << ',' << "OpName" << ',' << "RealRunOpStartTime(s)" << ','
       << "OpDeviceStartTime(s)" << ',' << "OpDeviceEndTime(s)" << ',' << "RealRunOpEndTime(s)" << std::endl;
  }
  if (op_name_launch_start_time_end_time_vec_.empty()) {
    // No device kernel was launched for this op: still emit one row, with
    // "nopnode" placeholders for the kernel name and device times.
    of << real_run_op_index_ << ',' << real_run_op_name_ << ',' << "nopnode" << ','
       << real_run_op_start_time_end_time_.first << ',' << "nopnode" << ',' << "nopnode" << ','
       << real_run_op_start_time_end_time_.second << std::endl;
    return;
  }
  // One row per launched kernel, all sharing the RealRunOp index/name/times.
  for (const auto &i : op_name_launch_start_time_end_time_vec_) {
    of << real_run_op_index_ << ',' << real_run_op_name_ << ',' << i.first << ','
       << real_run_op_start_time_end_time_.first << ',' << i.second.first << ',' << i.second.second << ','
       << real_run_op_start_time_end_time_.second << std::endl;
  }
  op_name_launch_start_time_end_time_vec_.clear();
}
| } // namespace device | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,56 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_PYNATIVE_PROFILING_H_ | |||
| #define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_PYNATIVE_PROFILING_H_ | |||
| #include <memory> | |||
| #include <string> | |||
| #include <vector> | |||
| #include <utility> | |||
| #include <unordered_map> | |||
| namespace mindspore { | |||
| namespace device { | |||
// Process-wide collector for pynative (eager mode) single-op profiling data.
// RealRunOp records its index/name/host times, kernel launches append
// per-kernel device launch intervals, and SingleOpProfilingData() flushes one
// CSV row per kernel (implementation in pynative_profiling.cc).
class PynativeProfiler {
 public:
  // Meyers singleton: thread-safe construction, lives for the whole process.
  static PynativeProfiler &GetInstance() {
    static PynativeProfiler instance;
    return instance;
  }
  // Bumps the 1-based RealRunOp counter; must run before SingleOpProfilingData()
  // so the CSV header is emitted exactly once (on index 1).
  void AddRealRunOpIndex() { ++real_run_op_index_; }
  void SetRealRunOpName(const std::string &name) { real_run_op_name_ = name; }
  // start_end is the (start, end) host timestamp pair of the RealRunOp call.
  void SetRealRunOpTime(const std::pair<double, double> &start_end) { real_run_op_start_time_end_time_ = start_end; }
  // Appends one (kernel name, (launch start, launch end)) record for the
  // current RealRunOp; cleared by SingleOpProfilingData().
  void SetOpNameAndLaunchTime(const std::pair<std::string, std::pair<double, double>> &name_start_end) {
    op_name_launch_start_time_end_time_vec_.push_back(name_start_end);
  }
  // Writes the collected records as CSV rows and clears the per-call vector.
  void SingleOpProfilingData();

 private:
  PynativeProfiler() = default;
  ~PynativeProfiler() = default;
  PynativeProfiler(const PynativeProfiler &) = delete;
  PynativeProfiler &operator=(const PynativeProfiler &) = delete;
  int real_run_op_index_ = 0;                                  // 1-based after the first AddRealRunOpIndex()
  std::string real_run_op_name_;                               // name of the op handled by RealRunOp
  std::pair<double, double> real_run_op_start_time_end_time_;  // host (start, end) of RealRunOp
  std::vector<std::pair<std::string, std::pair<double, double>>> op_name_launch_start_time_end_time_vec_;
};
| } // namespace device | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_PYNATIVE_PROFILING_H_ | |||
| @@ -16,6 +16,8 @@ | |||
| #include "runtime/hardware/gpu/gpu_device_context.h" | |||
| #include <dlfcn.h> | |||
| #include <utility> | |||
| #include "runtime/device/pynative_profiling.h" | |||
| #include "runtime/device/gpu/kernel_info_setter.h" | |||
| #include "runtime/device/gpu/gpu_kernel_build.h" | |||
| #include "runtime/device/gpu/gpu_device_address.h" | |||
| @@ -432,6 +434,11 @@ bool GPUDeviceContext::LaunchKernelWithProfiling(const CNodePtr &kernel, const s | |||
| bool ret = DoLaunchKernel(kernel_mod, inputs, workspace, outputs); | |||
| profiler_inst->OpDataProducerEnd(); | |||
| auto op_launch_start_end_time = profiler_inst->GetSingleOpLaunchTime(); | |||
| auto &pynative_profiler = PynativeProfiler::GetInstance(); | |||
| std::string op_name = kernel->fullname_with_scope(); | |||
| pynative_profiler.SetOpNameAndLaunchTime(std::make_pair(op_name, op_launch_start_end_time)); | |||
| if (profiler_inst->GetSyncEnableFlag()) { | |||
| CHECK_RET_WITH_RETURN_ERROR(SyncStream(), "Profiler SyncStream failed."); | |||
| } | |||
| @@ -507,6 +507,8 @@ constexpr auto kUpdateStateRealInput = 2; | |||
| // index define of Load | |||
| constexpr auto kLoadRealInput = 1; | |||
| constexpr auto kLoadStateInput = 2; | |||
| // time transfer unit | |||
| constexpr int kBasicTimeTransferUnit = 1000; | |||
| // index of input or output | |||
| enum Index : size_t { | |||
| kIndex0 = 0, | |||
| @@ -24,6 +24,8 @@ class DeviceEvent { | |||
| virtual void WaitEvent() = 0; | |||
| virtual void RecordEvent() = 0; | |||
| virtual bool NeedWait() = 0; | |||
| virtual void SyncEvent() = 0; | |||
| virtual void ElapsedTime(float *cost_time, DeviceEvent *other) = 0; | |||
| virtual void set_wait_stream(void *stream) = 0; | |||
| virtual void set_record_stream(void *stream) = 0; | |||
| }; | |||
| @@ -25,6 +25,10 @@ | |||
| rtError_t rtEventSynchronize(rtEvent_t event) { return RT_ERROR_NONE; } | |||
| rtError_t rtEventCreateWithFlag(rtEvent_t *event, uint32_t flag) { return RT_ERROR_NONE; } | |||
| rtError_t rtEventElapsedTime(float *time, rtEvent_t start, rtEvent_t end) { return RT_ERROR_NONE; } | |||
| rtError_t rtMalloc(void **devPtr, uint64_t size, rtMemType_t type) { return RT_ERROR_NONE; } | |||
| rtError_t rtMemcpy(void *dst, uint64_t destMax, const void *src, uint64_t count, rtMemcpyKind_t kind) { | |||