| @@ -453,25 +453,26 @@ bool AscendKernelRuntime::HcclInit() { | |||||
| } | } | ||||
| MS_LOG(INFO) << "do hcom init"; | MS_LOG(INFO) << "do hcom init"; | ||||
| std::string path; | |||||
| const char *config_path_str = std::getenv("MINDSPORE_HCCL_CONFIG_PATH"); | const char *config_path_str = std::getenv("MINDSPORE_HCCL_CONFIG_PATH"); | ||||
| if (config_path_str == nullptr) { | if (config_path_str == nullptr) { | ||||
| MS_LOG(ERROR) << "get hccl json config failed, please set env MINDSPORE_HCCL_CONFIG_PATH"; | MS_LOG(ERROR) << "get hccl json config failed, please set env MINDSPORE_HCCL_CONFIG_PATH"; | ||||
| return false; | return false; | ||||
| } | } | ||||
| path = config_path_str; | |||||
| char fullPath[PATH_MAX] = {0}; | |||||
| if (path.size() > PATH_MAX || realpath(path.c_str(), fullPath) == nullptr) { | |||||
| MS_LOG(ERROR) << "file " << path << " is not exist"; | |||||
| auto full_path = realpath(config_path_str, nullptr); | |||||
| if (full_path == nullptr) { | |||||
| MS_LOG(ERROR) << "file path " << config_path_str << " does not exist"; | |||||
| return false; | return false; | ||||
| } | } | ||||
| const char *identify = std::getenv("RANK_ID"); | const char *identify = std::getenv("RANK_ID"); | ||||
| if (identify == nullptr) { | if (identify == nullptr) { | ||||
| MS_LOG(ERROR) << "get hccl rankid failed, please set env RANK_ID"; | MS_LOG(ERROR) << "get hccl rankid failed, please set env RANK_ID"; | ||||
| free(full_path); | |||||
| return false; | return false; | ||||
| } | } | ||||
| MS_LOG(INFO) << "MINDSPORE_HCCL_CONFIG_PATH : " << fullPath << ", RANK_ID: " << identify; | |||||
| hcclResult_t res = hcom_init(fullPath, identify); | |||||
| MS_LOG(INFO) << "MINDSPORE_HCCL_CONFIG_PATH : " << full_path << ", RANK_ID: " << identify; | |||||
| hcclResult_t res = hcom_init(full_path, identify); | |||||
| free(full_path); | |||||
| if (res != HCCL_SUCCESS) { | if (res != HCCL_SUCCESS) { | ||||
| MS_LOG(ERROR) << "hcom init failed, res is " << static_cast<int>(res); | MS_LOG(ERROR) << "hcom init failed, res is " << static_cast<int>(res); | ||||
| return false; | return false; | ||||
| @@ -33,7 +33,7 @@ constexpr char kIterEndNode[] = "PROFILING_ITER_END"; | |||||
| std::unordered_map<uint32_t, std::vector<std::string>> ProfilingUtils::graph_kernel_name_; | std::unordered_map<uint32_t, std::vector<std::string>> ProfilingUtils::graph_kernel_name_; | ||||
| uint32_t ProfilingUtils::custom_node_index_ = 1; | uint32_t ProfilingUtils::custom_node_index_ = 1; | ||||
| ProfilingTraceInfo ProfilingUtils::GetProfilingTraceFromEnv(NotNull<session::KernelGraph *> graph_ptr) { | |||||
| ProfilingTraceInfo ProfilingUtils::GetProfilingTraceFromEnv(const NotNull<session::KernelGraph *> graph_ptr) { | |||||
| MS_LOG(INFO) << "get env start"; | MS_LOG(INFO) << "get env start"; | ||||
| custom_node_index_ = 1; | custom_node_index_ = 1; | ||||
| auto &cnode_exec_order = graph_ptr->execution_order(); | auto &cnode_exec_order = graph_ptr->execution_order(); | ||||
| @@ -94,7 +94,7 @@ class ProfilingUtils { | |||||
| // And other cnode, like AllReduce, export PROFILING_CUSTOM_1='full name of AllReduce cnode' | // And other cnode, like AllReduce, export PROFILING_CUSTOM_1='full name of AllReduce cnode' | ||||
| // GetNext, export PROFILING_CUSTOM_2='full name of GetNext cnode' | // GetNext, export PROFILING_CUSTOM_2='full name of GetNext cnode' | ||||
| // The variable i in PROFILING_CUSTOM_i should start from 1 without interruption. | // The variable i in PROFILING_CUSTOM_i should start from 1 without interruption. | ||||
| static ProfilingTraceInfo GetProfilingTraceFromEnv(NotNull<session::KernelGraph *> graph_ptr); | |||||
| static ProfilingTraceInfo GetProfilingTraceFromEnv(const NotNull<session::KernelGraph *> graph_ptr); | |||||
| // Insert two profiling trace points, one in front and one behind | // Insert two profiling trace points, one in front and one behind | ||||
| static void ProfilingCustomOp(const mindspore::AnfNodePtr &anf_node, const ProfilingTraceInfo &profiling_trace_info, | static void ProfilingCustomOp(const mindspore::AnfNodePtr &anf_node, const ProfilingTraceInfo &profiling_trace_info, | ||||
| @@ -121,8 +121,10 @@ bool TaskGenerator::LaunchKernel(const CNodePtr &anf_node_ptr, uint32_t stream_i | |||||
| LaunchAddrCleanKernel(anf_node_ptr, &kernel_inputs); | LaunchAddrCleanKernel(anf_node_ptr, &kernel_inputs); | ||||
| } | } | ||||
| std::vector<TaskInfoPtr> task_info_ptrs = dynamic_cast<kernel::AscendKernelMod *>(kernel_mod) | |||||
| ->GenTask(kernel_inputs, kernel_workspaces, kernel_outputs, stream_id); | |||||
| auto ascend_kernel_mod = dynamic_cast<kernel::AscendKernelMod *>(kernel_mod); | |||||
| MS_EXCEPTION_IF_NULL(ascend_kernel_mod); | |||||
| std::vector<TaskInfoPtr> task_info_ptrs = | |||||
| ascend_kernel_mod->GenTask(kernel_inputs, kernel_workspaces, kernel_outputs, stream_id); | |||||
| task_info_list->insert(task_info_list->end(), task_info_ptrs.begin(), task_info_ptrs.end()); | task_info_list->insert(task_info_list->end(), task_info_ptrs.begin(), task_info_ptrs.end()); | ||||
| return true; | return true; | ||||
| } | } | ||||
| @@ -136,7 +136,7 @@ bool HcomUtil::GetHcomCount(const AnfNodePtr &anf_node, const vector<hcclDataTyp | |||||
| } | } | ||||
| } | } | ||||
| if (total_size % type_size != 0) { | |||||
| if (type_size == 0 || total_size % type_size != 0) { | |||||
| MS_LOG(ERROR) << "Total_size[" << total_size << "],Type_size[" << type_size << "] != 0, fail!"; | MS_LOG(ERROR) << "Total_size[" << total_size << "],Type_size[" << type_size << "] != 0, fail!"; | ||||
| return false; | return false; | ||||
| } | } | ||||