| @@ -214,12 +214,16 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::Profilin | |||
| uint32_t block_dim = task.block_dim; | |||
| uint32_t task_id = task.task_id; | |||
| uint32_t stream_id = task.stream_id; | |||
| std::string shape_type = task.shape_type; | |||
| uint64_t cur_iter_num = task.cur_iter_num; | |||
| data = model_name.append(" ") | |||
| .append(op_name).append(" ") | |||
| .append(std::to_string(block_dim).append(" ") | |||
| .append(std::to_string(block_dim)).append(" ") | |||
| .append(std::to_string(task_id)).append(" ") | |||
| .append(std::to_string(stream_id)).append(" ") | |||
| .append(std::to_string(model_id)).append("\n")); | |||
| .append(std::to_string(model_id)).append(" ") | |||
| .append(shape_type).append(" ") | |||
| .append(std::to_string(cur_iter_num)).append("\n"); | |||
| ReporterData reporter_data{}; | |||
| reporter_data.deviceId = device_id; | |||
| @@ -3161,8 +3161,7 @@ Status DavinciModel::DistributeTask() { | |||
| auto task_type = static_cast<rtModelTaskType_t>(task_def.type()); | |||
| bool no_need_profiling = (task_type != RT_MODEL_TASK_KERNEL) | |||
| && (task_type != RT_MODEL_TASK_KERNEL_EX) | |||
| && (task_type != RT_MODEL_TASK_HCCL); | |||
| && (task_type != RT_MODEL_TASK_KERNEL_EX); | |||
| GE_IF_BOOL_EXEC(no_need_profiling, continue); | |||
| SaveDumpOpInfo(runtime_param_, op, task->GetTaskID(), task->GetStreamId()); | |||
| @@ -3177,6 +3176,8 @@ Status DavinciModel::DistributeTask() { | |||
| task_desc_info.block_dim = task_def.kernel().block_dim(); | |||
| task_desc_info.task_id = task->GetTaskID(); | |||
| task_desc_info.stream_id = task->GetStreamId(); | |||
| task_desc_info.shape_type = "static"; | |||
| task_desc_info.cur_iter_num = 0; | |||
| task_desc_info_.emplace_back(task_desc_info); | |||
| if (flag) { | |||
| if (task->GetSktTaskID() != 0xFFFFFFFF) { | |||
| @@ -74,6 +74,7 @@ class NodeDoneCallback { | |||
| std::vector<ComputeGraphDescInfo> &compute_graph_info); | |||
| Status GetTaskDescInfo(const NodePtr node, const HybridModel *model, | |||
| std::vector<TaskDescInfo> &task_desc_info); | |||
| Status GetNodeCurIterNum(uint64_t &cur_iter_num); | |||
| GraphExecutionContext *graph_context_; | |||
| std::shared_ptr<TaskContext> context_; | |||
| DumpOp dump_op_; | |||
| @@ -151,29 +152,42 @@ Status NodeDoneCallback::GetTaskDescInfo(const NodePtr node, const HybridModel * | |||
| GE_CHECK_NOTNULL(node); | |||
| GE_CHECK_NOTNULL(model); | |||
| // only report aicpu and aicore node | |||
| auto task_defs = model->GetTaskDefs(node); | |||
| if (task_defs == nullptr || (*task_defs).size() == 0) { | |||
| GELOGD("Node[%s] does not need to report data.", node->GetName().c_str()); | |||
| return SUCCESS; | |||
| } | |||
| const auto &task_def = (*task_defs)[0]; | |||
| auto task_type = static_cast<rtModelTaskType_t>(task_def.type()); | |||
| bool is_profiling_report = (task_type == RT_MODEL_TASK_KERNEL) || (task_type == RT_MODEL_TASK_KERNEL_EX); | |||
| if (!is_profiling_report) { | |||
| GELOGD("Task type[%d] of Node[%s] is not aicore or aicpu, and no need to report data.", | |||
| static_cast<int>(task_type), node->GetName().c_str()); | |||
| return SUCCESS; | |||
| } | |||
| GELOGD("GetTaskDescInfo of node [%s] start.", node->GetName().c_str()); | |||
| auto op_desc = node->GetOpDesc(); | |||
| std::string op_name = op_desc->GetName(); | |||
| std::string dynamic_model_name = model->GetModelName(); | |||
| uint32_t task_id = 0; | |||
| uint32_t stream_id = 0; | |||
| if (rtGetTaskIdAndStreamID(&task_id, &stream_id) != RT_ERROR_NONE) { | |||
| GELOGE(PARAM_INVALID, "Get task_id and stream_id failed."); | |||
| uint32_t task_id = context_->GetTaskId(); | |||
| uint32_t stream_id = context_->GetStreamId(); | |||
| uint64_t cur_iter_num = 0; | |||
| if (GetNodeCurIterNum(cur_iter_num) != SUCCESS) { | |||
| GELOGE(PARAM_INVALID, "Get cur iter num failed."); | |||
| return PARAM_INVALID; | |||
| } | |||
| TaskDescInfo tmp_task_desc_info; | |||
| tmp_task_desc_info.model_name = dynamic_model_name; | |||
| tmp_task_desc_info.op_name = op_name; | |||
| tmp_task_desc_info.block_dim = 0; | |||
| auto task_defs = model->GetTaskDefs(node); | |||
| if (task_defs != nullptr && (*task_defs).size() > 0) { | |||
| const auto &task_def = (*task_defs)[0]; | |||
| tmp_task_desc_info.block_dim = task_def.kernel().block_dim(); | |||
| } | |||
| tmp_task_desc_info.block_dim = task_def.kernel().block_dim(); | |||
| tmp_task_desc_info.task_id = task_id; | |||
| tmp_task_desc_info.stream_id = stream_id; | |||
| tmp_task_desc_info.shape_type = "dynamic"; | |||
| tmp_task_desc_info.cur_iter_num = cur_iter_num; | |||
| GELOGD("GetTaskDescInfo of node [%s] end, task_id[%u], stream_id[%u]", | |||
| node->GetName().c_str(), task_id, stream_id); | |||
| task_desc_info.emplace_back(tmp_task_desc_info); | |||
| @@ -224,6 +238,60 @@ Status NodeDoneCallback::GetGraphDescInfo(const NodePtr node, const HybridModel | |||
| return SUCCESS; | |||
| } | |||
| Status NodeDoneCallback::GetNodeCurIterNum(uint64_t &cur_iter_num) { | |||
| GE_CHECK_NOTNULL(context_); | |||
| uint64_t global_step = 0; | |||
| TensorValue *varible_global_step = context_->GetVariable(NODE_NAME_GLOBAL_STEP); | |||
| if (varible_global_step != nullptr) { | |||
| size_t global_step_size = varible_global_step->GetSize(); | |||
| if (global_step_size > 0) { | |||
| std::unique_ptr<uint8_t[]> data_buf(new (std::nothrow) uint8_t[global_step_size]); | |||
| GE_CHECK_NOTNULL(data_buf); | |||
| GE_CHK_RT_RET(rtMemcpy(data_buf.get(), global_step_size, varible_global_step->GetData(), global_step_size, | |||
| RT_MEMCPY_DEVICE_TO_HOST)); | |||
| global_step = *reinterpret_cast<uint64_t *>(data_buf.get()); | |||
| } | |||
| } | |||
| uint64_t loop_per_iter = 0; | |||
| TensorValue *varible_loop_per_iter = context_->GetVariable(NODE_NAME_FLOWCTRL_LOOP_PER_ITER); | |||
| if (varible_loop_per_iter != nullptr) { | |||
| size_t varible_loop_per_iter_size = varible_loop_per_iter->GetSize(); | |||
| if (varible_loop_per_iter_size > 0) { | |||
| std::unique_ptr<uint8_t[]> data_buf(new (std::nothrow) uint8_t[varible_loop_per_iter_size]); | |||
| GE_CHECK_NOTNULL(data_buf); | |||
| GE_CHK_RT_RET(rtMemcpy(data_buf.get(), varible_loop_per_iter_size, varible_loop_per_iter->GetData(), | |||
| varible_loop_per_iter_size, RT_MEMCPY_DEVICE_TO_HOST)); | |||
| loop_per_iter = *reinterpret_cast<uint64_t *>(data_buf.get()); | |||
| } | |||
| } | |||
| uint64_t loop_cond = 0; | |||
| TensorValue *varible_loop_cond = context_->GetVariable(NODE_NAME_FLOWCTRL_LOOP_COND); | |||
| if (varible_loop_cond != nullptr) { | |||
| size_t varible_loop_cond_size = varible_loop_cond->GetSize(); | |||
| if (varible_loop_cond_size > 0) { | |||
| std::unique_ptr<uint8_t[]> data_buf(new (std::nothrow) uint8_t[varible_loop_cond_size]); | |||
| GE_CHECK_NOTNULL(data_buf); | |||
| GE_CHK_RT_RET(rtMemcpy(data_buf.get(), varible_loop_cond_size, varible_loop_cond->GetData(), | |||
| varible_loop_cond_size, RT_MEMCPY_DEVICE_TO_HOST)); | |||
| loop_cond = *reinterpret_cast<uint64_t *>(data_buf.get()); | |||
| } | |||
| } | |||
| auto node = context_->GetNodeItem().node; | |||
| if (node == nullptr) { | |||
| GELOGE(PARAM_INVALID, "Node is nullptr."); | |||
| return PARAM_INVALID; | |||
| } | |||
| GELOGD("Node[%s] has global_step: %lu, loop_per_iter:%lu, loop_cond: %lu", | |||
| node->GetName().c_str(), global_step, loop_per_iter, loop_cond); | |||
| cur_iter_num = global_step * (loop_per_iter + 1) + loop_cond + 1; | |||
| return SUCCESS; | |||
| } | |||
| Status NodeDoneCallback::ProfilingReport() { | |||
| auto node = context_->GetNodeItem().node; | |||
| if (node == nullptr) { | |||
| @@ -165,6 +165,16 @@ Status AiCoreNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> | |||
| } | |||
| RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[AiCoreNodeLaunchKernel] Start"); | |||
| GE_CHK_STATUS_RET_NOLOG((*it)->LaunchKernel(context.GetStream())); | |||
| uint32_t task_id = 0; | |||
| uint32_t stream_id = 0; | |||
| rtError_t rt_ret = rtGetTaskIdAndStreamID(&task_id, &stream_id); | |||
| if (rt_ret != RT_ERROR_NONE) { | |||
| GELOGE(rt_ret, "Get task_id and stream_id failed."); | |||
| return rt_ret; | |||
| } | |||
| context.SetTaskId(task_id); | |||
| context.SetStreamId(stream_id); | |||
| GELOGD("AiCore node[%s] task_id: %u, stream_id: %u.", context.GetNodeName(), task_id, stream_id); | |||
| RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[AiCoreNodeLaunchKernel] End"); | |||
| RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[AiCoreNodeLaunchKernel] End"); | |||
| } | |||
| @@ -189,6 +189,17 @@ Status AicpuNodeTaskBase::ExecuteAsync(TaskContext &context, std::function<void( | |||
| GE_CHK_STATUS_RET(LaunchTask(context)); | |||
| uint32_t task_id = 0; | |||
| uint32_t stream_id = 0; | |||
| rtError_t rt_ret = rtGetTaskIdAndStreamID(&task_id, &stream_id); | |||
| if (rt_ret != RT_ERROR_NONE) { | |||
| GELOGE(rt_ret, "Get task_id and stream_id failed."); | |||
| return rt_ret; | |||
| } | |||
| context.SetTaskId(task_id); | |||
| context.SetStreamId(stream_id); | |||
| GELOGD("AiCpu node[%s] task_id: %u, stream_id: %u.", context.GetNodeName(), task_id, stream_id); | |||
| auto callback = [=, &context]() { | |||
| GELOGD("Node[%s] callback start.", node_name_.c_str()); | |||
| RECORD_CALLBACK_EVENT(context.GetExecutionContext(), node_name_.c_str(), "[TaskCallback] Start"); | |||
| @@ -319,6 +319,22 @@ void TaskContext::SetStatus(Status status) { | |||
| } | |||
| } | |||
| // Runtime task id of the most recently launched kernel; recorded by the node | |||
| // task's ExecuteAsync via rtGetTaskIdAndStreamID and read back for profiling. | |||
| uint32_t TaskContext::GetTaskId() const { | |||
| return task_id_; | |||
| } | |||
| // Stores the runtime-assigned task id after kernel launch. | |||
| void TaskContext::SetTaskId(uint32_t task_id) { | |||
| task_id_ = task_id; | |||
| } | |||
| // Runtime stream id paired with the task id above; defaults to 0 until set. | |||
| uint32_t TaskContext::GetStreamId() const { | |||
| return stream_id_; | |||
| } | |||
| // Stores the runtime-assigned stream id after kernel launch. | |||
| void TaskContext::SetStreamId(uint32_t stream_id) { | |||
| stream_id_ = stream_id; | |||
| } | |||
| Status TaskContext::AllocateWorkspace(size_t size, void **buffer, void *ori_addr) { | |||
| GE_CHECK_NOTNULL(buffer); | |||
| if (ori_addr == nullptr) { | |||
| @@ -96,6 +96,12 @@ class TaskContext { | |||
| void SetStatus(Status status); | |||
| uint32_t GetTaskId() const; | |||
| void SetTaskId(uint32_t task_id); | |||
| uint32_t GetStreamId() const; | |||
| void SetStreamId(uint32_t stream_id); | |||
| bool IsForceInferShape() const; | |||
| void SetForceInferShape(bool force_infer_shape); | |||
| void *handle_ = nullptr; | |||
| @@ -117,6 +123,8 @@ class TaskContext { | |||
| Status status_ = SUCCESS; | |||
| std::vector<void *> workspaces_; | |||
| uint64_t iteration_ = 0; | |||
| uint32_t task_id_= 0; | |||
| uint32_t stream_id_ = 0; | |||
| }; | |||
| } // namespace hybrid | |||
| } // namespace ge | |||
| @@ -32,13 +32,15 @@ namespace ge { | |||
| namespace { | |||
| const size_t kDataMemAlignSize = 32; | |||
| const size_t kDataMemAlignUnit = 2; | |||
| const string kShapeTypeDynamic = "dynamic"; | |||
| const string kShapeTypeStatic = "static"; | |||
| size_t GetAlignedSize(size_t size) { | |||
| size_t aligned_size = (size + kDataMemAlignUnit * kDataMemAlignSize - 1) / kDataMemAlignSize * kDataMemAlignSize; | |||
| return aligned_size; | |||
| } | |||
| Status ProfilingTaskInfo(OpTask *op_task) { | |||
| Status ProfilingTaskInfo(OpTask *op_task, string shape_type) { | |||
| if (!ProfilingManager::Instance().ProfilingModelExecuteOn()) { | |||
| return SUCCESS; | |||
| } | |||
| @@ -66,6 +68,8 @@ Status ProfilingTaskInfo(OpTask *op_task) { | |||
| tmp_task_desc_info.block_dim = block_dim; | |||
| tmp_task_desc_info.task_id = task_id; | |||
| tmp_task_desc_info.stream_id = stream_id; | |||
| tmp_task_desc_info.shape_type = shape_type; | |||
| tmp_task_desc_info.cur_iter_num = 0; | |||
| GELOGD("GetTaskDescInfo of op [%s] end, task_id[%u], stream_id[%u]", op_name.c_str(), task_id, stream_id); | |||
| task_desc_info.emplace_back(tmp_task_desc_info); | |||
| @@ -193,7 +197,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status SingleOp::ExecuteAsync(c | |||
| if (ret != SUCCESS) { | |||
| return ret; | |||
| } | |||
| GE_CHK_STATUS_RET_NOLOG(ProfilingTaskInfo(task)); | |||
| GE_CHK_STATUS_RET_NOLOG(ProfilingTaskInfo(task, kShapeTypeStatic)); | |||
| } | |||
| return ret; | |||
| @@ -255,7 +259,7 @@ Status DynamicSingleOp::ExecuteAsync(const vector<GeTensorDesc> &input_desc, | |||
| std::lock_guard<std::mutex> lk(*stream_mutex_); | |||
| GE_CHK_STATUS_RET_NOLOG(op_task_->LaunchKernel(input_desc, input_buffers, output_desc, output_buffers, stream_)); | |||
| GE_CHK_STATUS_RET_NOLOG(ProfilingTaskInfo(op_task_.get())); | |||
| GE_CHK_STATUS_RET_NOLOG(ProfilingTaskInfo(op_task_.get(), kShapeTypeDynamic)); | |||
| return SUCCESS; | |||
| } | |||
| } // namespace ge | |||
| @@ -245,6 +245,8 @@ struct TaskDescInfo { | |||
| uint32_t block_dim; | |||
| uint32_t task_id; | |||
| uint32_t stream_id; | |||
| std::string shape_type; | |||
| uint64_t cur_iter_num; | |||
| }; | |||
| // Profiling info of graph | |||