| @@ -702,7 +702,7 @@ void AscendStreamAssign::PrintGraphExeOrders(const shared_ptr<mindspore::session | |||||
| << AnfAlgo::GetStreamId(cur_cnode_ptr) << "], event_id[" | << AnfAlgo::GetStreamId(cur_cnode_ptr) << "], event_id[" | ||||
| << GetValue<uint32_t>(primitive->GetAttr(kAttrEventId)) << "]"; | << GetValue<uint32_t>(primitive->GetAttr(kAttrEventId)) << "]"; | ||||
| } else { | } else { | ||||
| MS_LOG(INFO) << "node name[" << AnfAlgo::GetCNodeName(cur_cnode_ptr) << "], logic id[" | |||||
| MS_LOG(INFO) << "node name[" << cur_cnode_ptr->fullname_with_scope() << "], logic id[" | |||||
| << AnfAlgo::GetStreamDistinctionLabel(cur_cnode_ptr.get()) << "], stream id[" | << AnfAlgo::GetStreamDistinctionLabel(cur_cnode_ptr.get()) << "], stream id[" | ||||
| << AnfAlgo::GetStreamId(cur_cnode_ptr) << "]"; | << AnfAlgo::GetStreamId(cur_cnode_ptr) << "]"; | ||||
| } | } | ||||
| @@ -29,10 +29,6 @@ namespace ascend { | |||||
| // PROFILING_CUSTOM_LOGID_START 3 | // PROFILING_CUSTOM_LOGID_START 3 | ||||
| const uint64_t kProfilingFpStartLogId = 1; | const uint64_t kProfilingFpStartLogId = 1; | ||||
| const uint64_t kProfilingBpEndLogId = 2; | const uint64_t kProfilingBpEndLogId = 2; | ||||
| const uint64_t kProfilingAllReduce1Start = 3; | |||||
| const uint64_t kProfilingAllReduce1End = 4; | |||||
| const uint64_t kProfilingAllReduce2Start = 5; | |||||
| const uint64_t kProfilingAllReduce2End = 6; | |||||
| const uint64_t kProfilingIterEndLogId = 255; | const uint64_t kProfilingIterEndLogId = 255; | ||||
| class ProfilingEngineImpl; | class ProfilingEngineImpl; | ||||
| @@ -14,10 +14,8 @@ | |||||
| * limitations under the License. | * limitations under the License. | ||||
| */ | */ | ||||
| #include "device/ascend/profiling/profiling_utils.h" | |||||
| #include <map> | #include <map> | ||||
| #include "device/ascend/profiling/profiling_utils.h" | |||||
| #include "kernel/kernel.h" | #include "kernel/kernel.h" | ||||
| #include "device/ascend/profiling/profiling_manager.h" | #include "device/ascend/profiling/profiling_manager.h" | ||||
| #include "session/anf_runtime_algorithm.h" | #include "session/anf_runtime_algorithm.h" | ||||
| @@ -27,82 +25,61 @@ | |||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace device { | namespace device { | ||||
| namespace ascend { | namespace ascend { | ||||
| const char ProfilingUtils::kProfiling[] = "Profiling"; | |||||
| const char ProfilingUtils::kNotify[] = "notify"; | |||||
| const char ProfilingUtils::kProfilerTraceId[] = "profiler_trace_id"; | |||||
| const char ProfilingUtils::kFlags[] = "flags"; | |||||
| constexpr uint32_t kMaxProfilingNodeNum = 100; | |||||
| constexpr char kCustomNode[] = "PROFILING_CUSTOM_"; | |||||
| constexpr char kFpStartNode[] = "PROFILING_FP_START"; | |||||
| constexpr char kBpEndNode[] = "PROFILING_BP_END"; | |||||
| constexpr char kIterEndNode[] = "PROFILING_ITER_END"; | |||||
| std::unordered_map<uint32_t, std::vector<std::string>> ProfilingUtils::graph_kernel_name_; | std::unordered_map<uint32_t, std::vector<std::string>> ProfilingUtils::graph_kernel_name_; | ||||
| bool ProfilingUtils::GetProfilingTraceInfo(const std::shared_ptr<session::KernelGraph> &graph_ptr, | |||||
| ProfilingTraceInfo *profiling_trace_info) { | |||||
| MS_EXCEPTION_IF_NULL(profiling_trace_info); | |||||
| MS_EXCEPTION_IF_NULL(graph_ptr); | |||||
| bool find_begin = false; | |||||
| bool first_allreduce = true; | |||||
| for (const auto &anf_node : graph_ptr->execution_order()) { | |||||
| if (anf_node->isa<CNode>()) { | |||||
| const std::string kernel_name = AnfAlgo::GetCNodeName(anf_node); | |||||
| if ((kernel_name == "Cast" || kernel_name == "Four2Five") && !find_begin) { | |||||
| profiling_trace_info->profiling_trace_begin = anf_node->fullname_with_scope(); | |||||
| find_begin = true; | |||||
| } | |||||
| if (kernel_name == "Conv2DBackpropFilter") { | |||||
| profiling_trace_info->profiling_trace_bp_end = anf_node->fullname_with_scope(); | |||||
| } | |||||
| if (kernel_name == kFusedMulApplyMomentumOpName || kernel_name == kApplyMomentumOpName) { | |||||
| profiling_trace_info->profiling_trace_netoutput = anf_node->fullname_with_scope(); | |||||
| } | |||||
| if (kernel_name == kAllReduceOpName) { | |||||
| if (first_allreduce) { | |||||
| profiling_trace_info->profiling_allreduce1_start = anf_node->fullname_with_scope(); | |||||
| profiling_trace_info->profiling_allreduce1_end = anf_node->fullname_with_scope(); | |||||
| first_allreduce = false; | |||||
| } else { | |||||
| profiling_trace_info->profiling_allreduce2_start = anf_node->fullname_with_scope(); | |||||
| profiling_trace_info->profiling_allreduce2_end = anf_node->fullname_with_scope(); | |||||
| } | |||||
| } | |||||
| uint32_t ProfilingUtils::custom_node_index_ = 1; | |||||
| ProfilingTraceInfo ProfilingUtils::GetProfilingTraceFromEnv(NotNull<session::KernelGraph *> graph_ptr) { | |||||
| MS_LOG(INFO) << "get env start"; | |||||
| custom_node_index_ = 1; | |||||
| auto &cnode_exec_order = graph_ptr->execution_order(); | |||||
| ProfilingTraceInfo profiling_trace; | |||||
| profiling_trace.trace_begin = GetTraceBegin(cnode_exec_order); | |||||
| profiling_trace.trace_bp_end = GetTraceBpEnd(); | |||||
| profiling_trace.trace_netoutput = GetTraceNetoutput(cnode_exec_order); | |||||
| MS_LOG(INFO) << "[profiling] trace_begin:" << profiling_trace.trace_begin | |||||
| << " trace_bp_end:" << profiling_trace.trace_bp_end | |||||
| << " trace_netoutput:" << profiling_trace.trace_netoutput; | |||||
| for (uint32_t i = 1; i <= kMaxProfilingNodeNum; ++i) { | |||||
| std::string env_str = std::string(kCustomNode) + std::to_string(i); | |||||
| const char *node_full_name = std::getenv(env_str.c_str()); | |||||
| if (node_full_name == nullptr) { | |||||
| break; | |||||
| } | } | ||||
| MS_LOG(INFO) << "Get profiling node:" << node_full_name; | |||||
| profiling_trace.trace_custom_node.insert(node_full_name); | |||||
| } | } | ||||
| MS_LOG(INFO) << "[profiling]begin:" << profiling_trace_info->profiling_trace_begin | |||||
| << ", net_output:" << profiling_trace_info->profiling_trace_netoutput | |||||
| << ", end:" << profiling_trace_info->profiling_trace_bp_end | |||||
| << ", allreduce1:" << profiling_trace_info->profiling_allreduce1_start | |||||
| << ", allreduce2:" << profiling_trace_info->profiling_allreduce2_start; | |||||
| return profiling_trace_info->IsValid(); | |||||
| MS_LOG(INFO) << "get env end"; | |||||
| return profiling_trace; | |||||
| } | } | ||||
| bool ProfilingUtils::GetNetOutput(AnfNodePtr anf_node, std::string *profiling_trace_net_output) { | |||||
| MS_EXCEPTION_IF_NULL(anf_node); | |||||
| MS_EXCEPTION_IF_NULL(profiling_trace_net_output); | |||||
| MS_LOG(INFO) << "[profiling]Anf node's full name with scope:" << anf_node->fullname_with_scope(); | |||||
| if (!profiling_trace_net_output->empty()) { | |||||
| MS_LOG(INFO) << "[profiling]Has got the net_output:" << profiling_trace_net_output->c_str(); | |||||
| return true; | |||||
| } | |||||
| if (AnfAlgo::IsRealKernel(anf_node)) { | |||||
| *profiling_trace_net_output = anf_node->fullname_with_scope(); | |||||
| return true; | |||||
| } | |||||
| std::string ProfilingUtils::GetTraceBegin(const std::vector<CNodePtr> &cnode_exec_order) { | |||||
| const char *trace_begin = std::getenv(kFpStartNode); | |||||
| auto &first_cnode = cnode_exec_order.front(); | |||||
| MS_EXCEPTION_IF_NULL(first_cnode); | |||||
| return trace_begin == nullptr ? first_cnode->fullname_with_scope() : std::string(trace_begin); | |||||
| } | |||||
| auto cnode = anf_node->cast<CNodePtr>(); | |||||
| if (cnode == nullptr) { | |||||
| MS_LOG(ERROR) << "[profiling]Anf node should be a CNode"; | |||||
| return false; | |||||
| } | |||||
| std::string ProfilingUtils::GetTraceBpEnd() { | |||||
| const char *trace_bp_end = std::getenv(kBpEndNode); | |||||
| return trace_bp_end == nullptr ? "" : std::string(trace_bp_end); | |||||
| } | |||||
| auto inputs = cnode->inputs(); | |||||
| auto input_size = inputs.size(); | |||||
| if (input_size < 2) { | |||||
| MS_LOG(ERROR) << "[profiling]Anf node' input size(" << input_size << ") < 2, don't support get apply kernel node."; | |||||
| return false; | |||||
| } | |||||
| return GetNetOutput(inputs[1], profiling_trace_net_output); | |||||
| std::string ProfilingUtils::GetTraceNetoutput(const std::vector<CNodePtr> &cnode_exec_order) { | |||||
| const char *trace_netoutput = std::getenv(kIterEndNode); | |||||
| auto &last_cnode = cnode_exec_order.back(); | |||||
| MS_EXCEPTION_IF_NULL(last_cnode); | |||||
| return trace_netoutput == nullptr ? last_cnode->fullname_with_scope() : std::string(trace_netoutput); | |||||
| } | } | ||||
| CNodePtr ProfilingUtils::CreateProfilingCNode(const std::shared_ptr<session::KernelGraph> &graph_ptr, bool notify, | |||||
| uint64_t profiler_trace_id, uint32_t flags) { | |||||
| MS_EXCEPTION_IF_NULL(graph_ptr); | |||||
| NotNull<CNodePtr> ProfilingUtils::CreateProfilingCNode(const ProfilingContent &profiling_content, | |||||
| NotNull<session::KernelGraph *> graph_ptr) { | |||||
| kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder; | kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder; | ||||
| selected_kernel_builder.SetInputsFormat({kOpFormat_DEFAULT, kOpFormat_DEFAULT}); | selected_kernel_builder.SetInputsFormat({kOpFormat_DEFAULT, kOpFormat_DEFAULT}); | ||||
| selected_kernel_builder.SetInputsDeviceType({TypeId::kNumberTypeInt32, TypeId::kNumberTypeInt32}); | selected_kernel_builder.SetInputsDeviceType({TypeId::kNumberTypeInt32, TypeId::kNumberTypeInt32}); | ||||
| @@ -118,75 +95,79 @@ CNodePtr ProfilingUtils::CreateProfilingCNode(const std::shared_ptr<session::Ker | |||||
| AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), cnode_ptr.get()); | AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), cnode_ptr.get()); | ||||
| cnode_ptr->set_abstract(type_none_abstract); | cnode_ptr->set_abstract(type_none_abstract); | ||||
| // set attr | // set attr | ||||
| ValuePtr notify_value = MakeValue(notify); | |||||
| ValuePtr trace_id_value = MakeValue(profiler_trace_id); | |||||
| ValuePtr flags_value = MakeValue(flags); | |||||
| ValuePtr notify_value = MakeValue(profiling_content.notify); | |||||
| ValuePtr trace_id_value = MakeValue(profiling_content.profiler_trace_id); | |||||
| ValuePtr flags_value = MakeValue(profiling_content.flags); | |||||
| AnfAlgo::SetNodeAttr(ProfilingUtils::kNotify, notify_value, cnode_ptr); | AnfAlgo::SetNodeAttr(ProfilingUtils::kNotify, notify_value, cnode_ptr); | ||||
| AnfAlgo::SetNodeAttr(ProfilingUtils::kProfilerTraceId, trace_id_value, cnode_ptr); | AnfAlgo::SetNodeAttr(ProfilingUtils::kProfilerTraceId, trace_id_value, cnode_ptr); | ||||
| AnfAlgo::SetNodeAttr(ProfilingUtils::kFlags, flags_value, cnode_ptr); | AnfAlgo::SetNodeAttr(ProfilingUtils::kFlags, flags_value, cnode_ptr); | ||||
| return cnode_ptr; | |||||
| return NOT_NULL(cnode_ptr); | |||||
| } | } | ||||
| void ProfilingUtils::ProfilingTraceFpStart(const std::shared_ptr<mindspore::session::KernelGraph> &graph_ptr, | |||||
| const mindspore::AnfNodePtr &anf_node, | |||||
| const mindspore::device::ascend::ProfilingTraceInfo &profiling_trace_info, | |||||
| std::vector<mindspore::CNodePtr> *kernel_list) { | |||||
| if (profiling_trace_info.IsValid() && profiling_trace_info.profiling_trace_begin == anf_node->fullname_with_scope()) { | |||||
| if (graph_ptr == nullptr || kernel_list == nullptr || anf_node == nullptr) { | |||||
| MS_LOG(ERROR) << "[profiling]input param invalid"; | |||||
| return; | |||||
| } | |||||
| void ProfilingUtils::ProfilingTraceFpStart(const mindspore::AnfNodePtr &anf_node, | |||||
| const ProfilingTraceInfo &profiling_trace_info, | |||||
| NotNull<session::KernelGraph *> graph_ptr, | |||||
| NotNull<std::vector<mindspore::CNodePtr> *> kernel_list) { | |||||
| if (profiling_trace_info.trace_begin == anf_node->fullname_with_scope()) { | |||||
| auto job_id = ProfilingManager::GetInstance().GetJobId(); | auto job_id = ProfilingManager::GetInstance().GetJobId(); | ||||
| // job task info | |||||
| CNodePtr job_kernel_ptr = CreateProfilingCNode(graph_ptr, false, job_id, 0); | |||||
| AnfAlgo::SetStreamDistinctionLabel(AnfAlgo::GetStreamDistinctionLabel(anf_node.get()), job_kernel_ptr.get()); | |||||
| AnfAlgo::SetStreamId(AnfAlgo::GetStreamId(anf_node), job_kernel_ptr.get()); | |||||
| // fp task info | |||||
| CNodePtr start_kernel_ptr = CreateProfilingCNode(graph_ptr, false, kProfilingFpStartLogId, 0); | |||||
| AnfAlgo::SetStreamDistinctionLabel(AnfAlgo::GetStreamDistinctionLabel(anf_node.get()), start_kernel_ptr.get()); | |||||
| AnfAlgo::SetStreamId(AnfAlgo::GetStreamId(anf_node), start_kernel_ptr.get()); | |||||
| kernel_list->emplace_back(job_kernel_ptr); | |||||
| kernel_list->emplace_back(start_kernel_ptr); | |||||
| ProfilingContent job_profiling_context = {false, job_id, 0}; | |||||
| auto job_profiling_node = CreateProfilingCNodeWithStream(anf_node, job_profiling_context, graph_ptr); | |||||
| kernel_list->emplace_back(job_profiling_node); | |||||
| ProfilingContent fp_profiling_content = {false, kProfilingFpStartLogId, 0}; | |||||
| auto fp_profiling_node = CreateProfilingCNodeWithStream(anf_node, fp_profiling_content, graph_ptr); | |||||
| kernel_list->emplace_back(fp_profiling_node); | |||||
| } | } | ||||
| } | } | ||||
| void ProfilingUtils::ProfilingAllReduce(const std::shared_ptr<session::KernelGraph> &graph_ptr, | |||||
| const AnfNodePtr &anf_node, int job_id, const std::string &profiling_node_name, | |||||
| std::vector<CNodePtr> *kernel_list) { | |||||
| MS_EXCEPTION_IF_NULL(graph_ptr); | |||||
| CNodePtr ProfilingUtils::CreateProfilingCNodeWithStream(const mindspore::AnfNodePtr &anf_node, | |||||
| const ProfilingContent &profiling_content, | |||||
| NotNull<session::KernelGraph *> graph_ptr) { | |||||
| CNodePtr profiling_node = CreateProfilingCNode(profiling_content, graph_ptr); | |||||
| AnfAlgo::SetStreamDistinctionLabel(AnfAlgo::GetStreamDistinctionLabel(anf_node.get()), profiling_node.get()); | |||||
| AnfAlgo::SetStreamId(AnfAlgo::GetStreamId(anf_node), profiling_node.get()); | |||||
| return profiling_node; | |||||
| } | |||||
| void ProfilingUtils::ProfilingCustomOp(const AnfNodePtr &anf_node, const ProfilingTraceInfo &profiling_trace_info, | |||||
| NotNull<session::KernelGraph *> graph_ptr, | |||||
| NotNull<std::vector<CNodePtr> *> kernel_list) { | |||||
| MS_EXCEPTION_IF_NULL(anf_node); | MS_EXCEPTION_IF_NULL(anf_node); | ||||
| MS_EXCEPTION_IF_NULL(kernel_list); | |||||
| auto full_scope_name = anf_node->fullname_with_scope(); | |||||
| if (profiling_node_name == full_scope_name) { | |||||
| CNodePtr allreduce_kernel_ptr = CreateProfilingCNode(graph_ptr, false, job_id, 0); | |||||
| AnfAlgo::SetStreamDistinctionLabel(AnfAlgo::GetStreamDistinctionLabel(anf_node.get()), allreduce_kernel_ptr.get()); | |||||
| AnfAlgo::SetStreamId(AnfAlgo::GetStreamId(anf_node), allreduce_kernel_ptr.get()); | |||||
| kernel_list->emplace_back(allreduce_kernel_ptr); | |||||
| auto iter = profiling_trace_info.trace_custom_node.find(anf_node->fullname_with_scope()); | |||||
| if (iter == profiling_trace_info.trace_custom_node.end()) { | |||||
| return; | |||||
| } | } | ||||
| // custom op profiling job start from 3. | |||||
| ProfilingContent front_profiling_content = {false, 2 * custom_node_index_ + 1, 0}; | |||||
| CNodePtr front_node = CreateProfilingCNodeWithStream(anf_node, front_profiling_content, graph_ptr); | |||||
| kernel_list->insert(kernel_list->end() - 1, front_node); | |||||
| ProfilingContent back_profiling_content = {false, 2 * custom_node_index_ + 2, 0}; | |||||
| CNodePtr back_node = CreateProfilingCNodeWithStream(anf_node, back_profiling_content, graph_ptr); | |||||
| kernel_list->insert(kernel_list->end(), back_node); | |||||
| ++custom_node_index_; | |||||
| } | } | ||||
| void ProfilingUtils::ProfilingTraceEnd(const std::shared_ptr<mindspore::session::KernelGraph> &graph_ptr, | |||||
| const mindspore::AnfNodePtr &anf_node, | |||||
| const mindspore::device::ascend::ProfilingTraceInfo &profiling_trace_info, | |||||
| std::vector<mindspore::CNodePtr> *kernel_list) { | |||||
| MS_EXCEPTION_IF_NULL(graph_ptr); | |||||
| void ProfilingUtils::ProfilingTraceBpEnd(const AnfNodePtr &anf_node, const ProfilingTraceInfo &profiling_trace_info, | |||||
| NotNull<session::KernelGraph *> graph_ptr, | |||||
| NotNull<std::vector<CNodePtr> *> kernel_list) { | |||||
| MS_EXCEPTION_IF_NULL(anf_node); | MS_EXCEPTION_IF_NULL(anf_node); | ||||
| MS_EXCEPTION_IF_NULL(kernel_list); | |||||
| if (profiling_trace_info.IsValid()) { | |||||
| auto full_scope_name = anf_node->fullname_with_scope(); | |||||
| if (profiling_trace_info.profiling_trace_netoutput == full_scope_name) { | |||||
| CNodePtr bp_kernel_ptr = CreateProfilingCNode(graph_ptr, true, kProfilingIterEndLogId, 0); | |||||
| AnfAlgo::SetStreamDistinctionLabel(AnfAlgo::GetStreamDistinctionLabel(anf_node.get()), bp_kernel_ptr.get()); | |||||
| AnfAlgo::SetStreamId(AnfAlgo::GetStreamId(anf_node), bp_kernel_ptr.get()); | |||||
| kernel_list->emplace_back(bp_kernel_ptr); | |||||
| } | |||||
| if (profiling_trace_info.trace_bp_end == anf_node->fullname_with_scope()) { | |||||
| ProfilingContent bp_end_profiling_content = {false, kProfilingBpEndLogId, 0}; | |||||
| CNodePtr bp_end_node = CreateProfilingCNodeWithStream(anf_node, bp_end_profiling_content, graph_ptr); | |||||
| kernel_list->emplace_back(bp_end_node); | |||||
| } | |||||
| } | |||||
| if (profiling_trace_info.profiling_trace_bp_end == full_scope_name) { | |||||
| CNodePtr end_task_info = CreateProfilingCNode(graph_ptr, false, kProfilingBpEndLogId, 0); | |||||
| AnfAlgo::SetStreamDistinctionLabel(AnfAlgo::GetStreamDistinctionLabel(anf_node.get()), end_task_info.get()); | |||||
| AnfAlgo::SetStreamId(AnfAlgo::GetStreamId(anf_node), end_task_info.get()); | |||||
| kernel_list->emplace_back(end_task_info); | |||||
| } | |||||
| void ProfilingUtils::ProfilingTraceEnd(const AnfNodePtr &anf_node, const ProfilingTraceInfo &profiling_trace_info, | |||||
| NotNull<session::KernelGraph *> graph_ptr, | |||||
| NotNull<std::vector<mindspore::CNodePtr> *> kernel_list) { | |||||
| MS_EXCEPTION_IF_NULL(anf_node); | |||||
| auto full_scope_name = anf_node->fullname_with_scope(); | |||||
| if (profiling_trace_info.trace_netoutput == full_scope_name) { | |||||
| ProfilingContent bp_end_profiling_content = {true, kProfilingIterEndLogId, 0}; | |||||
| CNodePtr bp_kernel_ptr = CreateProfilingCNodeWithStream(anf_node, bp_end_profiling_content, graph_ptr); | |||||
| kernel_list->emplace_back(bp_kernel_ptr); | |||||
| } | } | ||||
| } | } | ||||
| @@ -19,63 +19,102 @@ | |||||
| #include <memory> | #include <memory> | ||||
| #include <string> | #include <string> | ||||
| #include <vector> | #include <vector> | ||||
| #include <set> | |||||
| #include <unordered_map> | #include <unordered_map> | ||||
| #include "session/kernel_graph.h" | #include "session/kernel_graph.h" | ||||
| #include "utils/contract.h" | |||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace device { | namespace device { | ||||
| namespace ascend { | namespace ascend { | ||||
| struct ProfilingTraceInfo { | struct ProfilingTraceInfo { | ||||
| // execute order's first execute op(like: Cast or Four2Five ...), except tdt op(GetNext ...) | // execute order's first execute op(like: Cast or Four2Five ...), except tdt op(GetNext ...) | ||||
| std::string profiling_trace_begin; | |||||
| std::string trace_begin; | |||||
| // get first net_output(apply kernel) from graph outputs: fp ->net_output<- bp | // get first net_output(apply kernel) from graph outputs: fp ->net_output<- bp | ||||
| std::string profiling_trace_bp_end; | |||||
| std::string trace_bp_end; | |||||
| // execute order's end execute (like: Conv2DBackpropFilter) | // execute order's end execute (like: Conv2DBackpropFilter) | ||||
| std::string profiling_trace_netoutput; | |||||
| std::string trace_netoutput; | |||||
| std::string profiling_allreduce1_start; | |||||
| std::string profiling_allreduce1_end; | |||||
| std::string profiling_allreduce2_start; | |||||
| std::string profiling_allreduce2_end; | |||||
| // profiling specific op, such as AllReduce; | |||||
| std::set<std::string> trace_custom_node; | |||||
| // 1. insert profiling_trace_begin if profiling_trace_bp_end is not empty. | // 1. insert profiling_trace_begin if profiling_trace_bp_end is not empty. | ||||
| // 2. op launch get task info with callback func. | // 2. op launch get task info with callback func. | ||||
| // 3. insert profiling_trace_bp_end. | // 3. insert profiling_trace_bp_end. | ||||
| // 4. insert profiling_trace_net_output if profiling_trace_bp_end is not empty. | // 4. insert profiling_trace_net_output if profiling_trace_bp_end is not empty. | ||||
| bool IsValid() const { return !(profiling_trace_begin.empty() || profiling_trace_bp_end.empty()); } | |||||
| bool IsValid() const { return !(trace_begin.empty() || trace_bp_end.empty() || trace_netoutput.empty()); } | |||||
| }; | |||||
// Attribute values attached to a single profiling node.
// Every member has a default so that a default-constructed object never holds
// indeterminate values; brace initialization such as {false, id, 0} is unaffected.
struct ProfilingContent {
  // true - send data from device to host and finish profiling
  bool notify = false;
  // Value stored in the node's "profiler_trace_id" attribute.
  uint64_t profiler_trace_id = 0;
  // Value stored in the node's "flags" attribute.
  uint32_t flags = 0;
};
| class ProfilingUtils { | class ProfilingUtils { | ||||
| public: | public: | ||||
| ProfilingUtils() = default; | ProfilingUtils() = default; | ||||
| ~ProfilingUtils() = default; | ~ProfilingUtils() = default; | ||||
| static bool GetProfilingTraceInfo(const std::shared_ptr<session::KernelGraph> &graph_ptr, | |||||
| ProfilingTraceInfo *profiling_trace_info); | |||||
| static void ProfilingTraceFpStart(const std::shared_ptr<session::KernelGraph> &graph_ptr, const AnfNodePtr &anf_node, | |||||
| const ProfilingTraceInfo &profiling_trace_info, std::vector<CNodePtr> *kernel_list); | |||||
| static void ProfilingAllReduce(const std::shared_ptr<session::KernelGraph> &graph_ptr, const AnfNodePtr &anf_node, | |||||
| int job_id, const std::string &profiling_node_name, | |||||
| std::vector<CNodePtr> *kernel_list); | |||||
| static void ProfilingTraceEnd(const std::shared_ptr<session::KernelGraph> &graph_ptr, const AnfNodePtr &anf_node, | |||||
| const ProfilingTraceInfo &profiling_trace_info, std::vector<CNodePtr> *kernel_list); | |||||
| // Insert job_id profiling node and fp_start profiling node. | |||||
| // Job_id is read from envs and should be a number greater than 255. | |||||
| // Fp_start node should be inserted at the start of a network, and the log_id is hard-coded to 1. | |||||
| static void ProfilingTraceFpStart(const AnfNodePtr &anf_node, const ProfilingTraceInfo &profiling_trace_info, | |||||
| NotNull<session::KernelGraph *> graph_ptr, | |||||
| NotNull<std::vector<CNodePtr> *> kernel_list); | |||||
| // Insert net output profiling node, which tells the device to stop profiling. | |||||
| // The notify in struct ProfilingContent should be 'true', which tells the device to send data to host. | |||||
| static void ProfilingTraceEnd(const AnfNodePtr &anf_node, const ProfilingTraceInfo &profiling_trace_info, | |||||
| NotNull<session::KernelGraph *> graph_ptr, | |||||
| NotNull<std::vector<CNodePtr> *> kernel_list); | |||||
| // Insert bp_end profiling node, which should be inserted after the last backpropagation CNode in the network. | |||||
| static void ProfilingTraceBpEnd(const mindspore::AnfNodePtr &anf_node, const ProfilingTraceInfo &profiling_trace_info, | |||||
| NotNull<session::KernelGraph *> graph_ptr, | |||||
| NotNull<std::vector<mindspore::CNodePtr> *> kernel_list); | |||||
| // Mapping graph id and the kernels' name in the graph | |||||
| static void SetGraphKernelName(uint32_t graph_id, const std::vector<std::string> &kernel_names); | static void SetGraphKernelName(uint32_t graph_id, const std::vector<std::string> &kernel_names); | ||||
| // Mapping task_id and kernel name for device to generate the time cost of specific kernel. | |||||
| // Device calculate the time cost of the task which is marked by task id. | |||||
| // But we need data of (kernel name , time cost) | |||||
| static void ReportProfilingData(uint32_t graph_id, const std::vector<uint32_t> &task_ids); | static void ReportProfilingData(uint32_t graph_id, const std::vector<uint32_t> &task_ids); | ||||
| static const char kProfiling[]; | |||||
| static const char kNotify[]; | |||||
| static const char kProfilerTraceId[]; | |||||
| static const char kFlags[]; | |||||
| // Get profiling trace point from envs. | |||||
| // export PROFILING_FP_START='full name of the first cnode to execute' | |||||
| // export PROFILING_BP_END='full name of the last backpropagation cnode to execute' | |||||
| // export PROFILING_ITER_END='full name of last cnode in graph to execute' | |||||
| // And other cnode, like AllReduce, export PROFILING_CUSTOM_1='full name of AllReduce cnode' | |||||
| // GetNext, export PROFILING_CUSTOM_2='full name of GetNext cnode' | |||||
| // The variable i in PROFILING_CUSTOM_i should start from 1 without interruption. | |||||
| static ProfilingTraceInfo GetProfilingTraceFromEnv(NotNull<session::KernelGraph *> graph_ptr); | |||||
| // Insert two profiling trace points, one in front and one behind | |||||
| static void ProfilingCustomOp(const mindspore::AnfNodePtr &anf_node, const ProfilingTraceInfo &profiling_trace_info, | |||||
| NotNull<session::KernelGraph *> graph_ptr, | |||||
| NotNull<std::vector<mindspore::CNodePtr> *> kernel_list); | |||||
| inline static constexpr char kProfiling[] = "Profiling"; | |||||
| inline static constexpr char kNotify[] = "notify"; | |||||
| inline static constexpr char kProfilerTraceId[] = "profiler_trace_id"; | |||||
| inline static constexpr char kFlags[] = "flags"; | |||||
| private: | private: | ||||
| static bool GetNetOutput(AnfNodePtr anf_node, std::string *profiling_trace_net_output); | |||||
| static CNodePtr CreateProfilingCNode(const std::shared_ptr<session::KernelGraph> &graph_ptr, bool notify, | |||||
| uint64_t profiler_trace_id, uint32_t flags); | |||||
| static NotNull<CNodePtr> CreateProfilingCNode(const ProfilingContent &profiling_content, | |||||
| NotNull<session::KernelGraph *> graph_ptr); | |||||
| static CNodePtr CreateProfilingCNodeWithStream(const AnfNodePtr &anf_node, const ProfilingContent &profiling_content, | |||||
| NotNull<session::KernelGraph *> graph_ptr); | |||||
| static std::string GetTraceBegin(const std::vector<CNodePtr> &cnode_exec_order); | |||||
| static std::string GetTraceBpEnd(); | |||||
| static std::string GetTraceNetoutput(const std::vector<CNodePtr> &cnode_exec_order); | |||||
| // graph id --> (kernel name list) | // graph id --> (kernel name list) | ||||
| static std::unordered_map<uint32_t, std::vector<std::string>> graph_kernel_name_; | static std::unordered_map<uint32_t, std::vector<std::string>> graph_kernel_name_; | ||||
| static uint32_t custom_node_index_; | |||||
| }; | }; | ||||
| } // namespace ascend | } // namespace ascend | ||||
| } // namespace device | } // namespace device | ||||
| @@ -438,23 +438,22 @@ void KernelAdjust::LoadSwitchInputs(std::vector<tensor::TensorPtr> *inputs) { | |||||
| MS_LOG(INFO) << "---------------- LoadSwitchInputs End--"; | MS_LOG(INFO) << "---------------- LoadSwitchInputs End--"; | ||||
| } | } | ||||
| void KernelAdjust::Profiling(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) { | |||||
| void KernelAdjust::Profiling(NotNull<session::KernelGraph *> kernel_graph_ptr) { | |||||
| if (!ascend::ProfilingManager::GetInstance().IsProfiling()) { | if (!ascend::ProfilingManager::GetInstance().IsProfiling()) { | ||||
| MS_LOG(INFO) << "No need to profiling"; | MS_LOG(INFO) << "No need to profiling"; | ||||
| return; | return; | ||||
| } | } | ||||
| ProfilingTraceInfo profiling_trace_info; | |||||
| if (ProfilingUtils::GetProfilingTraceInfo(kernel_graph_ptr, &profiling_trace_info)) { | |||||
| InsertProfilingKernel(kernel_graph_ptr, profiling_trace_info); | |||||
| } else { | |||||
| MS_LOG(WARNING) << "[profiling] GetProfilingTraceInfo failed"; | |||||
| ProfilingTraceInfo profiling_trace_info = ProfilingUtils::GetProfilingTraceFromEnv(kernel_graph_ptr); | |||||
| if (!profiling_trace_info.IsValid()) { | |||||
| MS_LOG(WARNING) << "[profiling] no profiling node found!"; | |||||
| return; | |||||
| } | } | ||||
| InsertProfilingKernel(profiling_trace_info, kernel_graph_ptr); | |||||
| } | } | ||||
| void KernelAdjust::InsertProfilingKernel(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr, | |||||
| const ProfilingTraceInfo &profiling_trace_info) { | |||||
| void KernelAdjust::InsertProfilingKernel(const ProfilingTraceInfo &profiling_trace_info, | |||||
| NotNull<session::KernelGraph *> kernel_graph_ptr) { | |||||
| MS_LOG(INFO) << "[profiling] Insert profiling kernel start"; | MS_LOG(INFO) << "[profiling] Insert profiling kernel start"; | ||||
| MS_EXCEPTION_IF_NULL(kernel_graph_ptr); | |||||
| if (!profiling_trace_info.IsValid()) { | if (!profiling_trace_info.IsValid()) { | ||||
| MS_LOG(WARNING) << "Profiling trace point not found"; | MS_LOG(WARNING) << "Profiling trace point not found"; | ||||
| return; | return; | ||||
| @@ -462,18 +461,12 @@ void KernelAdjust::InsertProfilingKernel(const std::shared_ptr<session::KernelGr | |||||
| std::vector<CNodePtr> new_cnode_list; | std::vector<CNodePtr> new_cnode_list; | ||||
| std::vector<CNodePtr> cnode_ptr_list = kernel_graph_ptr->execution_order(); | std::vector<CNodePtr> cnode_ptr_list = kernel_graph_ptr->execution_order(); | ||||
| for (const auto &cnode_ptr : cnode_ptr_list) { | for (const auto &cnode_ptr : cnode_ptr_list) { | ||||
| ProfilingUtils::ProfilingTraceFpStart(kernel_graph_ptr, cnode_ptr, profiling_trace_info, &new_cnode_list); | |||||
| ProfilingUtils::ProfilingAllReduce(kernel_graph_ptr, cnode_ptr, ascend::kProfilingAllReduce1Start, | |||||
| profiling_trace_info.profiling_allreduce1_start, &new_cnode_list); | |||||
| ProfilingUtils::ProfilingAllReduce(kernel_graph_ptr, cnode_ptr, ascend::kProfilingAllReduce2Start, | |||||
| profiling_trace_info.profiling_allreduce2_start, &new_cnode_list); | |||||
| ProfilingUtils::ProfilingTraceFpStart(cnode_ptr, profiling_trace_info, kernel_graph_ptr, NOT_NULL(&new_cnode_list)); | |||||
| new_cnode_list.emplace_back(cnode_ptr); | new_cnode_list.emplace_back(cnode_ptr); | ||||
| ProfilingUtils::ProfilingAllReduce(kernel_graph_ptr, cnode_ptr, ascend::kProfilingAllReduce1End, | |||||
| profiling_trace_info.profiling_allreduce1_end, &new_cnode_list); | |||||
| ProfilingUtils::ProfilingAllReduce(kernel_graph_ptr, cnode_ptr, ascend::kProfilingAllReduce2End, | |||||
| profiling_trace_info.profiling_allreduce2_end, &new_cnode_list); | |||||
| ProfilingUtils::ProfilingTraceEnd(kernel_graph_ptr, cnode_ptr, profiling_trace_info, &new_cnode_list); | |||||
| ProfilingUtils::ProfilingCustomOp(cnode_ptr, profiling_trace_info, kernel_graph_ptr, NOT_NULL(&new_cnode_list)); | |||||
| ProfilingUtils::ProfilingTraceBpEnd(cnode_ptr, profiling_trace_info, kernel_graph_ptr, NOT_NULL(&new_cnode_list)); | |||||
| ProfilingUtils::ProfilingTraceEnd(cnode_ptr, profiling_trace_info, kernel_graph_ptr, NOT_NULL(&new_cnode_list)); | |||||
| } | } | ||||
| kernel_graph_ptr->set_execution_order(new_cnode_list); | kernel_graph_ptr->set_execution_order(new_cnode_list); | ||||
| } | } | ||||
| @@ -48,7 +48,7 @@ class KernelAdjust { | |||||
| void SetStreamSwitchOps(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr); | void SetStreamSwitchOps(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr); | ||||
| bool StepLoadCtrlInputs(const std::shared_ptr<session::Context> &context, | bool StepLoadCtrlInputs(const std::shared_ptr<session::Context> &context, | ||||
| const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr); | const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr); | ||||
| void Profiling(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr); | |||||
| void Profiling(NotNull<session::KernelGraph *> kernel_graph_ptr); | |||||
| static bool NeedInsertSwitch(); | static bool NeedInsertSwitch(); | ||||
| CNodePtr CreateSteamActiveOp(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr); | CNodePtr CreateSteamActiveOp(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr); | ||||
| @@ -66,8 +66,8 @@ class KernelAdjust { | |||||
| kernel::KernelBuildInfo::KernelBuildInfoBuilder CreateMngKernelBuilder(const std::vector<std::string> &formats, | kernel::KernelBuildInfo::KernelBuildInfoBuilder CreateMngKernelBuilder(const std::vector<std::string> &formats, | ||||
| const std::vector<TypeId> &type_ids); | const std::vector<TypeId> &type_ids); | ||||
| void LoadSwitchInputs(std::vector<tensor::TensorPtr> *inputs); | void LoadSwitchInputs(std::vector<tensor::TensorPtr> *inputs); | ||||
| void InsertProfilingKernel(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr, | |||||
| const ProfilingTraceInfo &profiling_trace_info); | |||||
| void InsertProfilingKernel(const ProfilingTraceInfo &profiling_trace_info, | |||||
| NotNull<session::KernelGraph *> kernel_graph_ptr); | |||||
| }; | }; | ||||
| } // namespace device | } // namespace device | ||||
| } // namespace mindspore | } // namespace mindspore | ||||
| @@ -246,7 +246,7 @@ void AscendBackendOptimization(const std::shared_ptr<session::KernelGraph> &kern | |||||
| kernel_graph->SetExecOrderByDefault(); | kernel_graph->SetExecOrderByDefault(); | ||||
| if (save_graphs) { | if (save_graphs) { | ||||
| std::string file_path = save_graphs_path + "/" + "hwopt_d_end.ir"; | std::string file_path = save_graphs_path + "/" + "hwopt_d_end.ir"; | ||||
| DumpIR(file_path, kernel_graph); | |||||
| DumpIR(file_path, kernel_graph, true); | |||||
| DumpIRProto(kernel_graph, "after_hwopt"); | DumpIRProto(kernel_graph, "after_hwopt"); | ||||
| } | } | ||||
| } | } | ||||
| @@ -136,7 +136,7 @@ void AscendSession::BuildGraph(GraphId graph_id) { | |||||
| // Assign streams for control sink and hccl and so on | // Assign streams for control sink and hccl and so on | ||||
| AssignStream(graph); | AssignStream(graph); | ||||
| device::KernelAdjust::GetInstance().Profiling(graph); | |||||
| device::KernelAdjust::GetInstance().Profiling(NOT_NULL(graph.get())); | |||||
| // build kernel if node is cnode | // build kernel if node is cnode | ||||
| BuildKernel(graph); | BuildKernel(graph); | ||||
| auto ms_context = MsContext::GetInstance(); | auto ms_context = MsContext::GetInstance(); | ||||
| @@ -42,6 +42,6 @@ bool KernelAdjust::StepLoadCtrlInputs(const std::shared_ptr<session::Context> &c | |||||
| return true; | return true; | ||||
| } | } | ||||
| bool KernelAdjust::NeedInsertSwitch() { return true; } | bool KernelAdjust::NeedInsertSwitch() { return true; } | ||||
| void KernelAdjust::Profiling(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) { return; } | |||||
| void KernelAdjust::Profiling(NotNull<session::KernelGraph *> kernel_graph_ptr) { return; } | |||||
| } // namespace device | } // namespace device | ||||
| } // namespace mindspore | } // namespace mindspore | ||||