Merge pull request !21362 from lvchangquan/profiling_refactor (tag: v1.5.0-rc1)
| @@ -32,6 +32,7 @@ | |||
| #include "ir/tensor.h" | |||
| #include "utils/any.h" | |||
| #include "utils/utils.h" | |||
| #include "utils/profile.h" | |||
| #include "utils/ms_context.h" | |||
| #include "utils/check_convert_utils.h" | |||
| #include "utils/context/context_extends.h" | |||
| @@ -67,6 +68,7 @@ | |||
| #include "debug/anf_ir_dump.h" | |||
| #include "runtime/hardware/device_context_manager.h" | |||
| #include "runtime/device/pynative_profiling.h" | |||
| using mindspore::tensor::TensorPy; | |||
| @@ -702,6 +704,9 @@ py::object GetDstType(const TypeId &type_id) { | |||
| } // namespace | |||
| py::object RealRunOp(const py::args &args) { | |||
| auto real_run_op_start = GetTime(); | |||
| auto &profiler_inst = device::PynativeProfiler::GetInstance(); | |||
| profiler_inst.AddRealRunOpIndex(); | |||
| CheckPyNativeContext(); | |||
| auto executor = PynativeExecutor::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(executor); | |||
| @@ -709,6 +714,10 @@ py::object RealRunOp(const py::args &args) { | |||
| MS_EXCEPTION_IF_NULL(op_exec_info); | |||
| py::object ret = py::none(); | |||
| PynativeExecutorTry(executor->forward_executor()->RunOpS, &ret, op_exec_info); | |||
| auto real_run_op_end = GetTime(); | |||
| profiler_inst.SetRealRunOpName(op_exec_info->op_name); | |||
| profiler_inst.SetRealRunOpTime(std::make_pair(real_run_op_start, real_run_op_end)); | |||
| profiler_inst.SingleOpProfilingData(); | |||
| return ret; | |||
| } | |||
| @@ -25,6 +25,7 @@ | |||
| #include "pybind_api/api_register.h" | |||
| #include "utils/log_adapter.h" | |||
| #include "utils/utils.h" | |||
| #include "utils/profile.h" | |||
| #include "utils/ms_context.h" | |||
| namespace mindspore { | |||
| @@ -446,6 +447,12 @@ void GPUProfiler::OpDataProducerBegin(const std::string op_name, void *stream) { | |||
| } | |||
| } | |||
| void GPUProfiler::SingleOpLaunchTimeProcess(float op_time_elapsed) { | |||
| auto launch_end_time = GetTime(); | |||
| double launch_start_time = launch_end_time - op_time_elapsed / kTimeUnit / kTimeUnit; | |||
| SetSingleOpLaunchTime(std::make_pair(launch_start_time, launch_end_time)); | |||
| } | |||
| void GPUProfiler::OpDataProducerEnd() { | |||
| float op_time_elapsed = 0; | |||
| if (sync_enable_flag_) { | |||
| @@ -459,9 +466,11 @@ void GPUProfiler::OpDataProducerEnd() { | |||
| CHECK_CUDA_RET_WITH_ERROR(cudaEventDestroy(op_event_stop_), "cudaEventDestroy op event stop failed"); | |||
| op_time_elapsed = op_time_elapsed * kTimeUnit; | |||
| op_host_time_stop_ = GetHostTimeStamp(); | |||
| SingleOpLaunchTimeProcess(op_time_elapsed); | |||
| } else { | |||
| op_host_time_stop_ = GetHostTimeStamp(); | |||
| op_time_elapsed = (op_host_time_stop_ - op_host_time_start_) / kTimeUnit; | |||
| SingleOpLaunchTimeProcess(op_time_elapsed); | |||
| } | |||
| MS_LOG(DEBUG) << "Host Time Elapsed(us)," << op_name_ << "," << op_time_elapsed; | |||
| Profiler::SetRunTimeData(op_name_, op_time_elapsed); | |||
| @@ -135,6 +135,7 @@ class GPUProfiler : public Profiler { | |||
| std::string ProfileDataPath() const { return profile_data_path_; } | |||
| private: | |||
| void SingleOpLaunchTimeProcess(float op_time_elapsed); | |||
| void OpsParser(); | |||
| void EventLog(const Event &event); | |||
| void ClearInst() override; | |||
| @@ -79,6 +79,10 @@ class Profiler { | |||
| bool GetEnableFlag() const { return enable_flag_; } | |||
| std::string ProfileDataPath() const { return profile_data_path_; } | |||
| void RecordOneStepStartEndInfo(std::string op_name); | |||
| std::pair<double, double> GetSingleOpLaunchTime() { return single_op_launch_start_time_end_time_; } | |||
  // Stores the (launch start, launch end) timestamp pair for the current
  // single-op launch; later read back via GetSingleOpLaunchTime().
  void SetSingleOpLaunchTime(const std::pair<double, double> &launch_start_end) {
    single_op_launch_start_time_end_time_ = launch_start_end;
  }
| protected: | |||
| void SetRunTimeData(const std::string &op_name, const float time_elapsed); | |||
| @@ -86,6 +90,7 @@ class Profiler { | |||
| uint64_t GetHostMonoTimeStamp() const; | |||
| virtual void SaveProfileData() = 0; | |||
| virtual void ClearInst() = 0; | |||
| std::pair<double, double> single_op_launch_start_time_end_time_; | |||
| bool enable_flag_ = false; | |||
| std::string profile_data_path_; | |||
| std::unordered_map<std::string, OpInfo> op_info_map_; | |||
| @@ -1,7 +1,7 @@ | |||
| file(GLOB_RECURSE DEVICE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "common/*.cc" | |||
| "kernel_info.cc" "executor/dynamic_kernel.cc" "executor/executor_callback.cc" "kernel_runtime.cc" | |||
| "memory_manager.cc" "kernel_runtime_manager.cc" "convert_tensor_utils.cc" | |||
| "bucket.cc" "launch_kernel.cc" "launch_mul.cc" | |||
| "bucket.cc" "launch_kernel.cc" "launch_mul.cc" "pynative_profiling.cc" | |||
| ) | |||
| if("${ENABLE_HIDDEN}" STREQUAL "OFF") | |||
| @@ -29,6 +29,14 @@ AscendEvent::AscendEvent() { | |||
| } | |||
| } | |||
| AscendTimeEvent::AscendTimeEvent() { | |||
| auto ret = rtEventCreateWithFlag(&event_, RT_EVENT_TIME_LINE); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_LOG(ERROR) << "rtEventCreate failed, ret:" << ret; | |||
| event_ = nullptr; | |||
| } | |||
| } | |||
| AscendEvent::~AscendEvent() { | |||
| auto ret = rtEventDestroy(event_); | |||
| if (ret != RT_ERROR_NONE) { | |||
| @@ -60,5 +68,24 @@ void AscendEvent::WaitEvent() { | |||
| need_wait_ = false; | |||
| } | |||
| void AscendEvent::SyncEvent() { | |||
| MS_EXCEPTION_IF_NULL(event_); | |||
| auto ret = rtEventSynchronize(event_); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_LOG(EXCEPTION) << "rtEventSynchronize failed, ret:" << ret; | |||
| } | |||
| } | |||
| void AscendEvent::ElapsedTime(float *cost_time, DeviceEvent *other) { | |||
| MS_EXCEPTION_IF_NULL(event_); | |||
| auto ascend_other = static_cast<AscendEvent *>(other); | |||
| MS_EXCEPTION_IF_NULL(ascend_other); | |||
| MS_EXCEPTION_IF_NULL(ascend_other->event_); | |||
| auto ret = rtEventElapsedTime(cost_time, event_, ascend_other->event_); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_LOG(EXCEPTION) << "rtEventElapsedTime failed, ret:" << ret; | |||
| } | |||
| } | |||
| bool AscendEvent::NeedWait() { return need_wait_; } | |||
| } // namespace mindspore::device::ascend | |||
| @@ -19,6 +19,7 @@ | |||
| #include "runtime/base.h" | |||
| #include "ir/device_event.h" | |||
| namespace mindspore::device::ascend { | |||
| class AscendEvent : public DeviceEvent { | |||
| public: | |||
| @@ -28,14 +29,22 @@ class AscendEvent : public DeviceEvent { | |||
| void WaitEvent() override; | |||
| void RecordEvent() override; | |||
| bool NeedWait() override; | |||
| void SyncEvent() override; | |||
| void ElapsedTime(float *cost_time, DeviceEvent *other) override; | |||
| void set_wait_stream(rtStream_t wait_stream) override { wait_stream_ = wait_stream; } | |||
| void set_record_stream(rtStream_t record_stream) override { record_stream_ = record_stream; } | |||
| private: | |||
| protected: | |||
| rtEvent_t event_{nullptr}; | |||
| rtStream_t wait_stream_{nullptr}; | |||
| rtStream_t record_stream_{nullptr}; | |||
| bool need_wait_{false}; | |||
| }; | |||
// AscendEvent whose underlying runtime event is created with the
// RT_EVENT_TIME_LINE flag (see the constructor in ascend_event.cc), making it
// usable with ElapsedTime(); inherits all other behavior from AscendEvent.
class AscendTimeEvent : public AscendEvent {
 public:
  AscendTimeEvent();
  ~AscendTimeEvent() override = default;
};
| } // namespace mindspore::device::ascend | |||
| #endif // MINDSPORE_ASCEND_EVENT_H | |||
| @@ -282,6 +282,13 @@ void AscendKernelRuntime::PreInit() { | |||
| } | |||
| bool AscendKernelRuntime::Init() { | |||
| auto ms_context = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(ms_context); | |||
| auto execution_mode = ms_context->get_param<int>(MS_CTX_EXECUTION_MODE); | |||
| auto profiling_flag = ms_context->get_param<bool>(MS_CTX_ENABLE_PROFILING); | |||
| if (execution_mode == kPynativeMode && profiling_flag) { | |||
| pynative_mode_profiling_flag_ = true; | |||
| } | |||
| if (initialized_) { | |||
| SetCurrentContext(); | |||
| return true; | |||
| @@ -946,6 +953,12 @@ std::shared_ptr<DeviceEvent> AscendKernelRuntime::CreateDeviceEvent() { | |||
| return ascend_event; | |||
| } | |||
| std::shared_ptr<DeviceEvent> AscendKernelRuntime::CreateDeviceTimeEvent() { | |||
| auto ascend_time_event = std::make_shared<AscendTimeEvent>(); | |||
| MS_EXCEPTION_IF_NULL(ascend_time_event); | |||
| return ascend_time_event; | |||
| } | |||
| uint64_t AscendKernelRuntime::GetAvailableMemMaxSize() const { | |||
| auto ascend_mem_manager = std::dynamic_pointer_cast<AscendMemoryManager>(mem_manager_); | |||
| MS_EXCEPTION_IF_NULL(ascend_mem_manager); | |||
| @@ -58,6 +58,7 @@ class AscendKernelRuntime : public KernelRuntime { | |||
| uint64_t GetAvailableMemMaxSize() const override; | |||
| DeviceAddressType GetTargetDeviceAddressType() const override { return DeviceAddressType::kAscend; }; | |||
| std::shared_ptr<DeviceEvent> CreateDeviceEvent() override; | |||
| std::shared_ptr<DeviceEvent> CreateDeviceTimeEvent() override; | |||
| void *compute_stream() const override { return stream_; } | |||
| void *communication_stream() const override { return communication_stream_; } | |||
| @@ -42,5 +42,19 @@ void GpuEvent::RecordEvent() { | |||
| need_wait_ = true; | |||
| } | |||
// Blocks the host until all device work captured by event_ has completed;
// throws (via the CHECK macro) if cudaEventSynchronize reports an error.
void GpuEvent::SyncEvent() {
  MS_EXCEPTION_IF_NULL(event_);
  CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(cudaEventSynchronize(event_), "cudaEventSynchronize failed");
}
| void GpuEvent::ElapsedTime(float *cost_time, DeviceEvent *other) { | |||
| MS_EXCEPTION_IF_NULL(event_); | |||
| auto gpu_event = static_cast<GpuEvent *>(other); | |||
| MS_EXCEPTION_IF_NULL(gpu_event); | |||
| MS_EXCEPTION_IF_NULL(gpu_event->event_); | |||
| CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(cudaEventElapsedTime(cost_time, event_, gpu_event->event_), | |||
| "cudaEventElapsedTime failed"); | |||
| } | |||
| bool GpuEvent::NeedWait() { return need_wait_; } | |||
| } // namespace mindspore::device::gpu | |||
| @@ -29,6 +29,8 @@ class GpuEvent : public DeviceEvent { | |||
| void WaitEvent() override; | |||
| void RecordEvent() override; | |||
| bool NeedWait() override; | |||
| void SyncEvent() override; | |||
| void ElapsedTime(float *cost_time, DeviceEvent *other) override; | |||
| void set_wait_stream(void *wait_stream) override { wait_stream_ = static_cast<cudaStream_t>(wait_stream); } | |||
| void set_record_stream(void *record_stream) override { record_stream_ = static_cast<cudaStream_t>(record_stream); } | |||
| @@ -32,6 +32,7 @@ | |||
| #include "utils/utils.h" | |||
| #include "frontend/parallel/context.h" | |||
| #include "debug/env_config_parser.h" | |||
| #include "runtime/device/pynative_profiling.h" | |||
| #if ((defined ENABLE_CPU) && (!defined _WIN32)) | |||
| #include "ps/ps_cache/ps_cache_manager.h" | |||
| #endif | |||
| @@ -966,6 +967,36 @@ void KernelRuntime::LaunchKernelEvent(const std::vector<std::vector<std::functio | |||
| } | |||
| } | |||
// Launches |kernel_mod| on |stream| bracketed by two device time events,
// computes the device-side launch interval, and records
// (op_name, (start, end)) on the PynativeProfiler singleton.
// Throws if the launch failed; otherwise returns true.
bool KernelRuntime::LaunchKernelWithPynativeProfiling(kernel::KernelMod *kernel_mod, const std::string &op_name,
                                                      const std::vector<AddressPtr> &inputs,
                                                      const std::vector<AddressPtr> &workspace,
                                                      const std::vector<AddressPtr> &outputs, void *stream) {
  MS_EXCEPTION_IF_NULL(kernel_mod);
  MS_EXCEPTION_IF_NULL(stream);
  float cost_time = 0;
  // Record both events on the launch stream so they bracket exactly this kernel.
  auto start = CreateDeviceTimeEvent();
  auto end = CreateDeviceTimeEvent();
  MS_EXCEPTION_IF_NULL(start);
  MS_EXCEPTION_IF_NULL(end);
  start->set_record_stream(stream);
  end->set_record_stream(stream);
  start->RecordEvent();
  bool ret = kernel_mod->Launch(inputs, workspace, outputs, stream);
  end->RecordEvent();
  // Wait for both events to complete before querying the elapsed time.
  start->SyncEvent();
  end->SyncEvent();
  start->ElapsedTime(&cost_time, end.get());
  auto launch_end_time = GetTime();
  auto &profiler_inst = PynativeProfiler::GetInstance();
  // cost_time is presumably in milliseconds (cudaEventElapsedTime-style API —
  // confirm for Ascend); kBasicTimeTransferUnit (1000) converts it to seconds,
  // the unit GetTime() uses, so start = end - elapsed is consistent.
  double launch_start_time = launch_end_time - cost_time / kBasicTimeTransferUnit;
  auto op_launch_start_time_end_time = std::make_pair(launch_start_time, launch_end_time);
  profiler_inst.SetOpNameAndLaunchTime(std::make_pair(op_name, op_launch_start_time_end_time));
  if (!ret) {
    MS_LOG(EXCEPTION) << "Launch kernel failed, kernel name is : " << op_name;
  }
  return ret;
}
| bool KernelRuntime::LaunchKernelMod(const session::KernelGraph &graph) { | |||
| const auto &kernels = graph.execution_order(); | |||
| std::vector<DynamicKernelPtr> dynamic_kernel_list; | |||
| @@ -1020,9 +1051,19 @@ bool KernelRuntime::LaunchKernelMod(const session::KernelGraph &graph) { | |||
| GenLaunchArgs(*kernel_mod, kernel, &kernel_inputs, &kernel_workspaces, &kernel_outputs); | |||
| bool ret; | |||
| if (AnfAlgo::IsCommunicationOp(kernel)) { | |||
| ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, communication_stream_); | |||
| if (pynative_mode_profiling_flag_) { | |||
| ret = LaunchKernelWithPynativeProfiling(kernel_mod, kernel->fullname_with_scope(), kernel_inputs, | |||
| kernel_workspaces, kernel_outputs, communication_stream_); | |||
| } else { | |||
| ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, communication_stream_); | |||
| } | |||
| } else { | |||
| ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_); | |||
| if (pynative_mode_profiling_flag_) { | |||
| ret = LaunchKernelWithPynativeProfiling(kernel_mod, kernel->fullname_with_scope(), kernel_inputs, | |||
| kernel_workspaces, kernel_outputs, stream_); | |||
| } else { | |||
| ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_); | |||
| } | |||
| } | |||
| if (!ret) { | |||
| MS_LOG(ERROR) << "Launch kernel failed."; | |||
| @@ -98,6 +98,7 @@ class KernelRuntime { | |||
| virtual uint64_t GetAvailableMemMaxSize() const { return 0; } | |||
| void GenKernelEvents(const session::KernelGraph *graph); | |||
| virtual std::shared_ptr<DeviceEvent> CreateDeviceEvent() { return nullptr; } | |||
| virtual std::shared_ptr<DeviceEvent> CreateDeviceTimeEvent() { return nullptr; } | |||
| virtual DeviceAddressType GetTargetDeviceAddressType() const = 0; | |||
| virtual void *compute_stream() const { return nullptr; } | |||
| virtual void *communication_stream() const { return nullptr; } | |||
| @@ -134,6 +135,10 @@ class KernelRuntime { | |||
| void RunOpAssignOutputNodeMemory(const ValuePtr &pre_output_value, session::KernelGraph *graph); | |||
| void AssignValueNodeTensor(const ValueNodePtr &value_node, const ValuePtr &node_value, size_t output_idx); | |||
| DeviceAddressPtr PreAssignCNodeMemory(const AnfNodePtr &anf_node, size_t index); | |||
| bool LaunchKernelWithPynativeProfiling(kernel::KernelMod *kernel_mod, const std::string &op_name, | |||
| const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream); | |||
| #if (ENABLE_CPU && !_WIN32) | |||
| void GetFirstPSEmbeddingCache(const session::KernelGraph *graph, AnfNodePtr *const first_cache_input_index, | |||
| size_t *const first_cache_size); | |||
| @@ -143,6 +148,7 @@ class KernelRuntime { | |||
| protected: | |||
| uint32_t device_id_{0}; | |||
| bool pynative_mode_profiling_flag_{false}; | |||
| #if !defined(_WIN32) && !defined(_WIN64) | |||
| std::shared_ptr<Debugger> debugger_; | |||
| #endif | |||
| @@ -0,0 +1,49 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "runtime/device/pynative_profiling.h" | |||
| #include <iostream> | |||
| #include <fstream> | |||
| #include <utility> | |||
| #include <memory> | |||
| #include <string> | |||
| #include "utils/profile.h" | |||
| #include "utils/utils.h" | |||
| namespace mindspore { | |||
| namespace device { | |||
// Flushes the profiling data collected for the current RealRunOp call as CSV
// rows, then clears the per-call launch-time vector. The header row is written
// only on the first call (real_run_op_index_ == 1), so AddRealRunOpIndex()
// must have been called before this.
// NOTE(review): the function-local static ofstream and the unsynchronized
// member reads make this single-threaded by construction — confirm it is only
// invoked from the pynative forward thread.
void PynativeProfiler::SingleOpProfilingData() {
  // One stream for the whole process lifetime; flushed/closed at exit.
  static std::ofstream of("pynative_forward_profiling_data.csv");
  of.setf(std::ios::fixed, std::ios::floatfield);
  if (real_run_op_index_ == 1) {
    of << "RealRunOpIndex" << ',' << "RealRunOpName" << ',' << "OpName" << ',' << "RealRunOpStartTime(s)" << ','
       << "OpDeviceStartTime(s)" << ',' << "OpDeviceEndTime(s)" << ',' << "RealRunOpEndTime(s)" << std::endl;
  }
  if (op_name_launch_start_time_end_time_vec_.empty()) {
    // No device kernel was launched for this op: still emit one row, with
    // "nopnode" placeholders for the kernel name and device times.
    of << real_run_op_index_ << ',' << real_run_op_name_ << ',' << "nopnode" << ','
       << real_run_op_start_time_end_time_.first << ',' << "nopnode" << ',' << "nopnode" << ','
       << real_run_op_start_time_end_time_.second << std::endl;
    return;
  }
  // One row per launched kernel, all sharing the RealRunOp index/name/times.
  for (const auto &i : op_name_launch_start_time_end_time_vec_) {
    of << real_run_op_index_ << ',' << real_run_op_name_ << ',' << i.first << ','
       << real_run_op_start_time_end_time_.first << ',' << i.second.first << ',' << i.second.second << ','
       << real_run_op_start_time_end_time_.second << std::endl;
  }
  op_name_launch_start_time_end_time_vec_.clear();
}
| } // namespace device | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,56 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_PYNATIVE_PROFILING_H_ | |||
| #define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_PYNATIVE_PROFILING_H_ | |||
| #include <memory> | |||
| #include <string> | |||
| #include <vector> | |||
| #include <utility> | |||
| #include <unordered_map> | |||
| namespace mindspore { | |||
| namespace device { | |||
// Process-wide collector for pynative (eager mode) single-op profiling data.
// RealRunOp records its index/name/host times, kernel launches append
// per-kernel device launch intervals, and SingleOpProfilingData() flushes one
// CSV row per kernel (implementation in pynative_profiling.cc).
class PynativeProfiler {
 public:
  // Meyers singleton: thread-safe construction, lives for the whole process.
  static PynativeProfiler &GetInstance() {
    static PynativeProfiler instance;
    return instance;
  }
  // Bumps the 1-based RealRunOp counter; must run before SingleOpProfilingData()
  // so the CSV header is emitted exactly once (on index 1).
  void AddRealRunOpIndex() { ++real_run_op_index_; }
  void SetRealRunOpName(const std::string &name) { real_run_op_name_ = name; }
  // start_end is the (start, end) host timestamp pair of the RealRunOp call.
  void SetRealRunOpTime(const std::pair<double, double> &start_end) { real_run_op_start_time_end_time_ = start_end; }
  // Appends one (kernel name, (launch start, launch end)) record for the
  // current RealRunOp; cleared by SingleOpProfilingData().
  void SetOpNameAndLaunchTime(const std::pair<std::string, std::pair<double, double>> &name_start_end) {
    op_name_launch_start_time_end_time_vec_.push_back(name_start_end);
  }
  // Writes the collected records as CSV rows and clears the per-call vector.
  void SingleOpProfilingData();

 private:
  PynativeProfiler() = default;
  ~PynativeProfiler() = default;
  PynativeProfiler(const PynativeProfiler &) = delete;
  PynativeProfiler &operator=(const PynativeProfiler &) = delete;
  int real_run_op_index_ = 0;                                  // 1-based after the first AddRealRunOpIndex()
  std::string real_run_op_name_;                               // name of the op handled by RealRunOp
  std::pair<double, double> real_run_op_start_time_end_time_;  // host (start, end) of RealRunOp
  std::vector<std::pair<std::string, std::pair<double, double>>> op_name_launch_start_time_end_time_vec_;
};
| } // namespace device | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_PYNATIVE_PROFILING_H_ | |||
| @@ -16,6 +16,8 @@ | |||
| #include "runtime/hardware/gpu/gpu_device_context.h" | |||
| #include <dlfcn.h> | |||
| #include <utility> | |||
| #include "runtime/device/pynative_profiling.h" | |||
| #include "runtime/device/gpu/kernel_info_setter.h" | |||
| #include "runtime/device/gpu/gpu_kernel_build.h" | |||
| #include "runtime/device/gpu/gpu_device_address.h" | |||
| @@ -432,6 +434,11 @@ bool GPUDeviceContext::LaunchKernelWithProfiling(const CNodePtr &kernel, const s | |||
| bool ret = DoLaunchKernel(kernel_mod, inputs, workspace, outputs); | |||
| profiler_inst->OpDataProducerEnd(); | |||
| auto op_launch_start_end_time = profiler_inst->GetSingleOpLaunchTime(); | |||
| auto &pynative_profiler = PynativeProfiler::GetInstance(); | |||
| std::string op_name = kernel->fullname_with_scope(); | |||
| pynative_profiler.SetOpNameAndLaunchTime(std::make_pair(op_name, op_launch_start_end_time)); | |||
| if (profiler_inst->GetSyncEnableFlag()) { | |||
| CHECK_RET_WITH_RETURN_ERROR(SyncStream(), "Profiler SyncStream failed."); | |||
| } | |||
| @@ -507,6 +507,8 @@ constexpr auto kUpdateStateRealInput = 2; | |||
| // index define of Load | |||
| constexpr auto kLoadRealInput = 1; | |||
| constexpr auto kLoadStateInput = 2; | |||
| // time transfer unit | |||
| constexpr int kBasicTimeTransferUnit = 1000; | |||
| // index of input or output | |||
| enum Index : size_t { | |||
| kIndex0 = 0, | |||
| @@ -24,6 +24,8 @@ class DeviceEvent { | |||
| virtual void WaitEvent() = 0; | |||
| virtual void RecordEvent() = 0; | |||
| virtual bool NeedWait() = 0; | |||
| virtual void SyncEvent() = 0; | |||
| virtual void ElapsedTime(float *cost_time, DeviceEvent *other) = 0; | |||
| virtual void set_wait_stream(void *stream) = 0; | |||
| virtual void set_record_stream(void *stream) = 0; | |||
| }; | |||
| @@ -25,6 +25,10 @@ | |||
| rtError_t rtEventSynchronize(rtEvent_t event) { return RT_ERROR_NONE; } | |||
| rtError_t rtEventCreateWithFlag(rtEvent_t *event, uint32_t flag) { return RT_ERROR_NONE; } | |||
| rtError_t rtEventElapsedTime(float *time, rtEvent_t start, rtEvent_t end) { return RT_ERROR_NONE; } | |||
| rtError_t rtMalloc(void **devPtr, uint64_t size, rtMemType_t type) { return RT_ERROR_NONE; } | |||
| rtError_t rtMemcpy(void *dst, uint64_t destMax, const void *src, uint64_t count, rtMemcpyKind_t kind) { | |||