From: @zhoufeng54 Reviewed-by: Signed-off-by:tags/v1.1.0
| @@ -6,10 +6,15 @@ include(${GE_SOURCE_DIR}/cmake/ge_utils.cmake) | |||
| include(${GE_SOURCE_DIR}/cmake/external_libs/json.cmake) | |||
| include(${GE_SOURCE_DIR}/cmake/external_libs/eigen.cmake) | |||
| include(${GE_SOURCE_DIR}/cmake/external_libs/gtest.cmake) | |||
| include(${GE_SOURCE_DIR}/cmake/external_libs/protobuf.cmake) | |||
| include(${GE_SOURCE_DIR}/cmake/external_libs/onnx.cmake) | |||
| include(${GE_SOURCE_DIR}/cmake/external_libs/securec.cmake) | |||
| if (ENABLE_D) | |||
| set(AS_MS_COMP TRUE) | |||
| include(${GE_SOURCE_DIR}/cmake/external_libs/protobuf.cmake) | |||
| unset(AS_MS_COMP) | |||
| else () | |||
| include(${GE_SOURCE_DIR}/cmake/external_libs/protobuf.cmake) | |||
| endif () | |||
| # for UT, find slog and error_manager from local prebuild | |||
| if (NOT ENABLE_D AND NOT ENABLE_ACL) | |||
| set(GE_PREBUILD_PATH ${GE_SOURCE_DIR}/third_party/prebuild/${CMAKE_HOST_SYSTEM_PROCESSOR}) | |||
| @@ -79,8 +84,12 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__FILE__='\"$(subst $(realpath ${CMAKE | |||
| add_subdirectory(${GE_SOURCE_DIR}/src/common/graph) | |||
| if (ENABLE_ACL OR ENABLE_D) | |||
| add_subdirectory(${GE_SOURCE_DIR}/src/ge/common) | |||
| target_compile_definitions(graph PRIVATE google=ascend_private) | |||
| set_target_properties(graph PROPERTIES SKIP_BUILD_RPATH TRUE) | |||
| if (ENABLE_D) | |||
| add_subdirectory(${GE_SOURCE_DIR}/src/ge/ge_runtime) | |||
| target_compile_definitions(ge_runtime PRIVATE google=ascend_private) | |||
| set_target_properties(ge_runtime PROPERTIES SKIP_BUILD_RPATH TRUE) | |||
| endif () | |||
| endif () | |||
| @@ -216,7 +216,7 @@ if (NOT ENABLE_GE) | |||
| if (ENABLE_D) | |||
| install( | |||
| TARGETS ms_profile | |||
| TARGETS ms_profile hccl_adapter | |||
| DESTINATION ${INSTALL_LIB_DIR} | |||
| COMPONENT mindspore | |||
| ) | |||
| @@ -1 +1 @@ | |||
| Subproject commit 412ebe82c96620b5f7c942a7ab87a45bf14c5621 | |||
| Subproject commit 383f7f751d6612e9dbde9e22a2960098fdbf3792 | |||
| @@ -174,7 +174,7 @@ foreach (_comp ${SUB_COMP}) | |||
| string(REPLACE "/" "_" sub ${_comp}) | |||
| if (TARGET _mindspore_${sub}_obj) | |||
| list(APPEND SUB_OBJECTS_SRC $<TARGET_OBJECTS:_mindspore_${sub}_obj>) | |||
| add_dependencies(_mindspore_${sub}_obj proto_input ) | |||
| add_dependencies(_mindspore_${sub}_obj proto_input) | |||
| endif () | |||
| endforeach () | |||
| @@ -229,28 +229,26 @@ if (ENABLE_D) | |||
| endif() | |||
| MESSAGE("USE DAV LIB PATH: ${ASCEND_PATH}") | |||
| find_library(HCCL hccl ${ASCEND_RUNTIME_PATH}) | |||
| find_library(CCE_LIB cce ${ASCEND_RUNTIME_PATH}) | |||
| find_library(RUNTIME_LIB runtime ${ASCEND_RUNTIME_PATH}) | |||
| find_library(TSDCLIENT tsdclient HINTS ${ASCEND_RUNTIME_PATH} ${ASCEND_DRIVER_BACK_PATH}) | |||
| find_library(DATATRANSFER datatransfer HINTS ${ASCEND_RUNTIME_PATH} ${ASCEND_DRIVER_BACK_PATH}) | |||
| find_library(PROFILING msprofiler ${ASCEND_RUNTIME_PATH}) | |||
| # for atlas env | |||
| find_library(HCCL hccl ${ASCEND_TOOLKIT_RUNTIME_PATH}) | |||
| find_library(CCE_LIB cce ${ASCEND_TOOLKIT_RUNTIME_PATH}) | |||
| find_library(RUNTIME_LIB runtime ${ASCEND_TOOLKIT_RUNTIME_PATH}) | |||
| find_library(TSDCLIENT tsdclient HINTS ${ASCEND_TOOLKIT_RUNTIME_PATH} ${ASCEND_DRIVER_BACK_PATH}) | |||
| find_library(DATATRANSFER datatransfer HINTS ${ASCEND_TOOLKIT_RUNTIME_PATH} ${ASCEND_DRIVER_BACK_PATH}) | |||
| find_library(PROFILING msprofiler ${ASCEND_TOOLKIT_RUNTIME_PATH}) | |||
| find_library(HCCL hccl ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) | |||
| find_library(CCE_LIB cce ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) | |||
| find_library(RUNTIME_LIB runtime ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) | |||
| find_library(TSDCLIENT tsdclient HINTS ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH} ${ASCEND_DRIVER_BACK_PATH}) | |||
| find_library(DATATRANSFER datatransfer HINTS ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH} ${ASCEND_DRIVER_BACK_PATH}) | |||
| find_library(PROFILING msprofiler ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) | |||
| find_library(PROFILING_SHARED msprof ${ASCEND_DRIVER_PATH}) | |||
| find_library(REGISTER register ${ASCEND_RUNTIME_PATH}) | |||
| find_library(REGISTER register ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) | |||
| find_library(OPTILING optiling ${ASCEND_OPP_PATH}) | |||
| add_library(ms_profile SHARED ${PROFILING}) | |||
| set_target_properties(ms_profile PROPERTIES LINKER_LANGUAGE CXX) | |||
| target_link_libraries(ms_profile -Wl,--start-group ${PROFILING_SHARED} ${PROFILING} mindspore::protobuf -Wl,--end-group) | |||
| target_link_libraries(mindspore ms_profile ge_runtime ${CCE_LIB} ${RUNTIME_LIB} ${TSDCLIENT} ${HCCL} ${DATATRANSFER} ${REGISTER} -Wl,--no-as-needed ${OPTILING} -Wl,--as-needed) | |||
| target_link_libraries(mindspore ms_profile ge_runtime ${CCE_LIB} ${RUNTIME_LIB} ${TSDCLIENT} ${HCCL} ${DATATRANSFER} | |||
| ${REGISTER} -Wl,--no-as-needed ${OPTILING} -Wl,--as-needed) | |||
| target_link_libraries(mindspore -Wl,--start-group proto_input mindspore::protobuf -Wl,--end-group) | |||
| # hccl_adpter | |||
| find_library(HCCL_ADPTER hcom_graph_adaptor ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) | |||
| find_library(HCCL_BUILDER hcom_opskernel_builder ${ASCEND_RUNTIME_PATH}/plugin/opskernel ${ASCEND_TOOLKIT_RUNTIME_PATH}/plugin/opskernel) | |||
| add_subdirectory(runtime/hccl_adapter) | |||
| target_link_libraries(hccl_adapter PRIVATE mindspore ${HCCL_ADPTER} ${REGISTER} -Wl,--no-as-needed ${HCCL_BUILDER}) | |||
| elseif (CMAKE_SYSTEM_NAME MATCHES "Windows") | |||
| target_link_libraries(mindspore -Wl,--start-group proto_input mindspore::protobuf mindspore::sentencepiece -Wl,--end-group) | |||
| else () | |||
| @@ -274,11 +272,14 @@ elseif (CMAKE_SYSTEM_NAME MATCHES "Windows") | |||
| else () | |||
| MESSAGE(FATAL_ERROR "other platform: ${CMAKE_SYSTEM_NAME}") | |||
| endif () | |||
| set(MINDSPORE_RPATH ${ORIGIN_PATH}/lib) | |||
| if (ENABLE_D) | |||
| set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/nnae/latest/fwkacllib/lib64) | |||
| set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64) | |||
| set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/fwkacllib/lib64) | |||
| set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel) | |||
| set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel) | |||
| set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/fwkacllib/lib64/plugin/opskernel) | |||
| set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/add-ons) | |||
| set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/opp/op_impl/built-in/ai_core/tbe/op_tiling) | |||
| set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/op_tiling) | |||
| @@ -286,9 +287,16 @@ if (ENABLE_D) | |||
| elseif (ENABLE_GPU) | |||
| set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/cuda/lib64) | |||
| endif () | |||
| set(HCCL_ADPT_RPATH ${ORIGIN_PATH}:${MINDSPORE_RPATH}) | |||
| set(MINDSPORE_RPATH ${ORIGIN_PATH}/lib:${MINDSPORE_RPATH}) | |||
| set_target_properties(_c_expression PROPERTIES INSTALL_RPATH ${MINDSPORE_RPATH}) | |||
| if (ENABLE_D) | |||
| set_target_properties(hccl_adapter PROPERTIES INSTALL_RPATH ${HCCL_ADPT_RPATH}) | |||
| target_link_libraries(_c_expression PRIVATE hccl_adapter) | |||
| endif () | |||
| if (CMAKE_SYSTEM_NAME MATCHES "Windows") | |||
| target_link_libraries(mindspore mindspore::pybind11_module) | |||
| target_link_libraries(mindspore mindspore_gvar) | |||
| @@ -352,6 +360,7 @@ if (ENABLE_D) | |||
| find_library(adump_server libadump_server.a ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) | |||
| target_link_libraries(_c_expression PRIVATE ${adump_server}) | |||
| target_link_libraries(inference PRIVATE ${adump_server}) | |||
| target_link_libraries(inference PRIVATE mindspore_core hccl_adapter) | |||
| endif() | |||
| if (ENABLE_CPU) | |||
| @@ -17,16 +17,15 @@ | |||
| #include "backend/kernel_compiler/hccl/hccl_kernel.h" | |||
| #include <map> | |||
| #include "runtime/device/ascend/tasksink/runtime_utils.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "utils/utils.h" | |||
| #include "utils/ms_context.h" | |||
| #include "runtime/device/kernel_runtime.h" | |||
| #include "runtime/device/ascend/executor/hccl_dynamic_kernel.h" | |||
| #include "runtime/hccl_adapter/hccl_adapter.h" | |||
| using HcclTaskInfoPtr = std::shared_ptr<ge::model_runner::HcclTaskInfo>; | |||
| using ge::model_runner::HcclTaskInfo; | |||
| using mindspore::device::ascend::tasksink::RuntimeUtils; | |||
| namespace { | |||
| static std::map<std::string, std::string> kMsOpNameToHcomHcclType = { | |||
| @@ -145,35 +144,45 @@ const std::vector<size_t> &HcclKernel::GetOutputSizeList() const { | |||
| const std::vector<size_t> &HcclKernel::GetWorkspaceSizeList() const { return workspace_size_list_; } | |||
| std::vector<TaskInfoPtr> HcclKernel::GenTask(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &workspace, | |||
| std::vector<TaskInfoPtr> HcclKernel::GenTask(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs, uint32_t stream_id) { | |||
| if (inputs.empty() || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "Inputs or outputs is empty"; | |||
| } | |||
| stream_id_ = stream_id; | |||
| std::string hccl_type = AnfAlgo::GetCNodeName(anf_node_); | |||
| MS_EXCEPTION_IF_NULL(inputs.at(0)); | |||
| auto input_data_addr = inputs.at(0)->addr; | |||
| MS_EXCEPTION_IF_NULL(outputs.at(0)); | |||
| auto output_data_addr = outputs.at(0)->addr; | |||
| void *workspace_address = nullptr; | |||
| const int64_t workspace_num = 0; | |||
| std::vector<uint8_t> private_def; | |||
| HcclDataType data_type = hccl_data_type_list_[0]; | |||
| MS_LOG(INFO) << "HCCL Task : stream_id=" << stream_id << ", ws_num=" << workspace_num << ", count=" << hccl_count_ | |||
| << ", root_id=" << root_id_ << ", op_type=" << static_cast<int>(op_type_) | |||
| << ", data_type=" << static_cast<int>(data_type); | |||
| auto context_ptr = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(context_ptr); | |||
| HcclTaskInfoPtr task_info_ptr = std::make_shared<HcclTaskInfo>( | |||
| kernel_name_, stream_id, hccl_type, input_data_addr, output_data_addr, workspace_address, workspace_num, 0, | |||
| private_def, nullptr, hccl_count_, root_id_, op_type_, data_type, group_, RuntimeUtils::HcomBindModel, | |||
| RuntimeUtils::HcomUnbindModel, RuntimeUtils::HcomDistribute, NeedDump()); | |||
| MS_EXCEPTION_IF_NULL(task_info_ptr); | |||
| return {task_info_ptr}; | |||
| std::vector<hccl::HcclTaskInfo> task_info; | |||
| bool ret = hccl::GenTask(anf_node_, data_type, &task_info); | |||
| if (!ret) { | |||
| MS_LOG(EXCEPTION) << "Gen Task for " << anf_node_->DebugString() << " failed."; | |||
| } | |||
| std::vector<TaskInfoPtr> results; | |||
| for (auto &task : task_info) { | |||
| MS_LOG(INFO) << "HCCL Task : stream_id=" << stream_id << ", count=" << hccl_count_ << ", root_id=" << root_id_ | |||
| << ", op_type=" << static_cast<int>(op_type_) << ", data_type=" << static_cast<int>(data_type) | |||
| << ", workspace_size=" << task.workspace_size << ", stream_num=" << task.stream_num | |||
| << ", private_def_size=" << task.private_def.size(); | |||
| private_def.resize(task.private_def.size()); | |||
| auto sec_ret = memcpy_s(private_def.data(), private_def.size(), task.private_def.data(), task.private_def.size()); | |||
| if (sec_ret != 0) { | |||
| MS_LOG(EXCEPTION) << "Set data memcpy_s failed, ret = " << sec_ret; | |||
| } | |||
| results.emplace_back(std::make_shared<HcclTaskInfo>( | |||
| kernel_name_, stream_id, hccl::GetHcclType(anf_node_), input_data_addr, output_data_addr, task.workspace_size, | |||
| task.stream_num, private_def, hccl::GetHcclOpsKernelInfoStore(), hccl_count_, root_id_, op_type_, data_type, | |||
| group_, NeedDump())); | |||
| } | |||
| return results; | |||
| } | |||
| device::DynamicKernelPtr HcclKernel::GenDynamicKernel(const CNodePtr &cnode_ptr, void *stream_ptr) { | |||
| @@ -20,26 +20,10 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| bool HcomAllBroadCastKernel::Launch(const std::vector<AddressPtr> &inputs, | |||
| bool HcomAllBroadCastKernel::Launch(const std::vector<AddressPtr> & /*inputs*/, | |||
| const std::vector<AddressPtr> & /*workspace*/, | |||
| const std::vector<AddressPtr> & /*outputs*/, void *stream_ptr) { | |||
| auto context_ptr = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(context_ptr); | |||
| if (context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK)) { | |||
| return true; | |||
| } | |||
| if (inputs.empty() || hccl_data_type_list_.empty()) { | |||
| MS_LOG(ERROR) << "BroadCast param is empty"; | |||
| return false; | |||
| } | |||
| const char *tag = "Hccl-BroadCast"; | |||
| MS_EXCEPTION_IF_NULL(inputs[0]); | |||
| HcclResult ret = | |||
| hcom_broadcast(tag, inputs[0]->addr, hccl_count_, hccl_data_type_list_[0], root_id_, nullptr, stream_ptr); | |||
| if (ret != HCCL_SUCCESS) { | |||
| MS_LOG(ERROR) << "HcomBroadcastOp : hcom_broadcast fail, return: " << static_cast<int>(ret); | |||
| return false; | |||
| } | |||
| const std::vector<AddressPtr> & /*outputs*/, void * /*stream_ptr*/) { | |||
| MS_LOG(INFO) << "HcomAllBroadCast launch"; | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| @@ -20,24 +20,10 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| bool HcomAllGatherKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> & /*workspace*/, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) { | |||
| auto context_ptr = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(context_ptr); | |||
| if (context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK)) { | |||
| return true; | |||
| } | |||
| if (inputs.empty() || hccl_data_type_list_.empty()) { | |||
| MS_LOG(ERROR) << "AllGather param is empty"; | |||
| return false; | |||
| } | |||
| const char *tag = "Hccl-AllGather"; | |||
| HcclResult ret = | |||
| hcom_all_gather(tag, inputs[0]->addr, outputs[0]->addr, hccl_count_, hccl_data_type_list_[0], nullptr, stream_ptr); | |||
| if (ret != HCCL_SUCCESS) { | |||
| MS_LOG(ERROR) << "HcomAllGatherKernelOp : hcom_all_gather fail, return: " << static_cast<int>(ret); | |||
| return false; | |||
| } | |||
| bool HcomAllGatherKernel::Launch(const std::vector<AddressPtr> & /*inputs*/, | |||
| const std::vector<AddressPtr> & /*workspace*/, | |||
| const std::vector<AddressPtr> & /*outputs*/, void * /*stream_ptr*/) { | |||
| MS_LOG(INFO) << "HcomAllGather launch"; | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| @@ -20,24 +20,10 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| bool HcomAllReduceKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> & /*workspace*/, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) { | |||
| auto context_ptr = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(context_ptr); | |||
| if (context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK)) { | |||
| return true; | |||
| } | |||
| if (inputs.empty() || outputs.empty() || hccl_data_type_list_.empty()) { | |||
| MS_LOG(ERROR) << "AllReduce param is empty"; | |||
| return false; | |||
| } | |||
| const char *tag = "Hccl-AllReduce"; | |||
| HcclResult ret = hcom_all_reduce(tag, inputs[0]->addr, outputs[0]->addr, hccl_count_, hccl_data_type_list_[0], | |||
| op_type_, nullptr, stream_ptr); | |||
| if (ret != HCCL_SUCCESS) { | |||
| MS_LOG(ERROR) << "HcomAllReduceKernelOp : hcom_all_reduce fail, return: " << static_cast<int>(ret); | |||
| return false; | |||
| } | |||
| bool HcomAllReduceKernel::Launch(const std::vector<AddressPtr> & /*inputs*/, | |||
| const std::vector<AddressPtr> & /*workspace*/, | |||
| const std::vector<AddressPtr> & /*outputs*/, void * /*stream_ptr*/) { | |||
| MS_LOG(INFO) << "HcomAllReduce launch"; | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| @@ -20,25 +20,10 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| bool HcomAllReduceScatterKernel::Launch(const std::vector<AddressPtr> &inputs, | |||
| bool HcomAllReduceScatterKernel::Launch(const std::vector<AddressPtr> & /*inputs*/, | |||
| const std::vector<AddressPtr> & /*workspace*/, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) { | |||
| auto context_ptr = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(context_ptr); | |||
| if (context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK)) { | |||
| return true; | |||
| } | |||
| if (inputs.empty() || outputs.empty() || hccl_data_type_list_.empty()) { | |||
| MS_LOG(ERROR) << "ReduceScatter param is empty"; | |||
| return false; | |||
| } | |||
| const char *tag = "Hccl-ReduceScatter"; | |||
| HcclResult ret = hcom_reduce_scatter(tag, inputs[0]->addr, outputs[0]->addr, hccl_count_, hccl_data_type_list_[0], | |||
| op_type_, nullptr, stream_ptr); | |||
| if (ret != HCCL_SUCCESS) { | |||
| MS_LOG(ERROR) << "HcomReduceScatterOp : hcom_reduce_scatter fail, return: " << static_cast<int>(ret); | |||
| return false; | |||
| } | |||
| const std::vector<AddressPtr> & /*outputs*/, void * /*stream_ptr*/) { | |||
| MS_LOG(INFO) << "HcomAllReduceScatter launch"; | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| @@ -55,6 +55,7 @@ | |||
| #include "profiler/device/ascend/rt_callback_manager.h" | |||
| #include "utils/config_manager.h" | |||
| #include "runtime/device/ascend/profiling/reporter/op_name_task_stream_reporter.h" | |||
| #include "runtime/hccl_adapter/hccl_adapter.h" | |||
| using ge::model_runner::ModelRunner; | |||
| using mindspore::device::ascend::ProfilingManager; | |||
| @@ -796,10 +797,10 @@ bool AscendKernelRuntime::HcclInit() { | |||
| return false; | |||
| } | |||
| MS_LOG(INFO) << "MINDSPORE_HCCL_CONFIG_PATH : " << full_path << ", RANK_ID: " << rank_id_str; | |||
| HcclResult res = hcom_init(full_path, rank_id_str.c_str()); | |||
| bool ret = hccl::InitHccl(context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID), rank_id_str, full_path); | |||
| free(full_path); | |||
| if (res != HCCL_SUCCESS) { | |||
| MS_LOG(ERROR) << "Hcom init failed, res is " << static_cast<int>(res); | |||
| if (!ret) { | |||
| MS_LOG(ERROR) << "Hcom init failed."; | |||
| return false; | |||
| } | |||
| return true; | |||
| @@ -816,12 +817,14 @@ bool AscendKernelRuntime::DestroyHccl() { | |||
| if (!HcclExecutorManager::GetInstance().Finalize()) { | |||
| MS_LOG(ERROR) << "Dynamic Shape Hccl Finalize Failed"; | |||
| } | |||
| HcclResult res = hcom_destroy(); | |||
| if (res != HCCL_SUCCESS) { | |||
| bool res = hccl::FinalizeHccl(); | |||
| if (!res) { | |||
| MS_LOG(ERROR) << "Hccl destroy failed"; | |||
| return false; | |||
| } | |||
| MS_LOG(INFO) << "Hccl destroy successful, status = " << res << "."; | |||
| MS_LOG(INFO) << "Hccl destroy successful."; | |||
| context_ptr->set_param<bool>(MS_CTX_ENABLE_HCCL, false); | |||
| return true; | |||
| } | |||
| @@ -855,7 +858,7 @@ void AscendKernelRuntime::KernelLaunchProfiling(const std::string &kernel_name) | |||
| auto try_emplace_ret = stream_id_task_id_op_name_map_.try_emplace(stream_task_pair, kernel_name); | |||
| if (!try_emplace_ret.second) { | |||
| MS_LOG(WARNING) << "Profiling duplicate key, task_id:" << stream_task_pair.second | |||
| << " stream_id:" << stream_task_pair.first << " name:" << kernel_name; | |||
| << " stream_id:" << stream_task_pair.first << " name:" << kernel_name; | |||
| } | |||
| if (stream_id_task_id_op_name_map_.size() > kProfilingMaxTaskIdInStream) { | |||
| MS_LOG(EXCEPTION) << "Too many profiling data"; | |||
| @@ -1,106 +0,0 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "runtime/device/ascend/tasksink/runtime_utils.h" | |||
| #include <string> | |||
| #include "hccl/hcom.h" | |||
| #include "utils/log_adapter.h" | |||
| #include "hccl/hccl_types.h" | |||
| #include "utils/utils.h" | |||
| constexpr auto kHcomBroadcast = "hcom_broadcast_"; | |||
| constexpr auto kHcomAllGather = "hcom_all_gather_"; | |||
| constexpr auto kHcomAllReduce = "hcom_all_reduce_"; | |||
| constexpr auto kHcomReduceScatter = "hcom_reduce_scatter_"; | |||
| constexpr auto kUnderline = "_"; | |||
| namespace mindspore { | |||
| namespace device { | |||
| namespace ascend { | |||
| namespace tasksink { | |||
| bool RuntimeUtils::HcomBindModel(rtModel_t model, rtStream_t stream) { | |||
| HcclResult ret = hcom_bind_model(model, stream); | |||
| if (ret != HCCL_SUCCESS) { | |||
| MS_LOG(ERROR) << "Call hcom_bind_model failed, ret: 0x" << static_cast<int>(ret); | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| bool RuntimeUtils::HcomUnbindModel(rtModel_t model) { | |||
| HcclResult ret = hcom_unbind_model(model); | |||
| if (ret != HCCL_SUCCESS) { | |||
| MS_LOG(ERROR) << "Call hcom_unbind_model failed, ret: 0x" << static_cast<int>(ret); | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| bool RuntimeUtils::HcomDistribute(const std::shared_ptr<HcclTaskInfo> &task_info, rtStream_t stream) { | |||
| MS_LOG(INFO) << "hccl distribute start"; | |||
| MS_EXCEPTION_IF_NULL(task_info); | |||
| HcclResult ret; | |||
| static uint32_t task_counter = 0; | |||
| auto hccl_group = task_info->group(); | |||
| if (task_info->hccl_type() == kBroadcastOpName) { | |||
| // call hcom broadcast interface to run op | |||
| const string tag_broadcast = kHcomBroadcast + std::to_string(task_counter++) + kUnderline + std::to_string(0); | |||
| ret = hcom_broadcast(tag_broadcast.c_str(), task_info->input_data_addr(), static_cast<u64>(task_info->count()), | |||
| static_cast<HcclDataType>(task_info->data_type()), static_cast<u32>(task_info->root_id()), | |||
| hccl_group.c_str(), stream); | |||
| if (ret != HCCL_SUCCESS) { | |||
| MS_LOG(ERROR) << "hcom_broadcast fail, return ret: " << static_cast<int>(ret); | |||
| return false; | |||
| } | |||
| } else if (task_info->hccl_type() == kAllGatherOpName) { | |||
| // call hcom allgather interface to run op | |||
| const string tag_all_gather = kHcomAllGather + std::to_string(task_counter++) + kUnderline + std::to_string(0); | |||
| ret = hcom_all_gather(tag_all_gather.c_str(), task_info->input_data_addr(), task_info->output_data_addr(), | |||
| static_cast<u64>(task_info->count()), static_cast<HcclDataType>(task_info->data_type()), | |||
| hccl_group.c_str(), stream); | |||
| if (ret != HCCL_SUCCESS) { | |||
| MS_LOG(ERROR) << "hcom_all_gather fail, return ret: " << ret; | |||
| return false; | |||
| } | |||
| } else if (task_info->hccl_type() == kAllReduceOpName) { | |||
| // call hcom allreduce interface to run op | |||
| const string tag_all_reduce = kHcomAllReduce + std::to_string(task_counter++) + kUnderline + std::to_string(0); | |||
| ret = hcom_all_reduce(tag_all_reduce.c_str(), task_info->input_data_addr(), task_info->output_data_addr(), | |||
| static_cast<u64>(task_info->count()), static_cast<HcclDataType>(task_info->data_type()), | |||
| static_cast<HcclReduceOp>(task_info->op_type()), hccl_group.c_str(), stream); | |||
| if (ret != HCCL_SUCCESS) { | |||
| MS_LOG(ERROR) << "hcom_all_reduce fail, return ret: " << ret; | |||
| return false; | |||
| } | |||
| } else if (task_info->hccl_type() == kReduceScatterOpName) { | |||
| // call hcom reducescatter interface to run op | |||
| const string tag_reduce_scatter = | |||
| kHcomReduceScatter + std::to_string(task_counter++) + kUnderline + std::to_string(0); | |||
| ret = hcom_reduce_scatter(tag_reduce_scatter.c_str(), task_info->input_data_addr(), task_info->output_data_addr(), | |||
| static_cast<u64>(task_info->count()), static_cast<HcclDataType>(task_info->data_type()), | |||
| static_cast<HcclReduceOp>(task_info->op_type()), hccl_group.c_str(), stream); | |||
| if (ret != HCCL_SUCCESS) { | |||
| MS_LOG(ERROR) << "hcom_reduce_scatter fail, return ret: " << ret; | |||
| return false; | |||
| } | |||
| } | |||
| return true; | |||
| } | |||
| } // namespace tasksink | |||
| } // namespace ascend | |||
| } // namespace device | |||
| } // namespace mindspore | |||
| @@ -1,39 +0,0 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_TASKSINK_RUNTIME_UTILS_H_ | |||
| #define MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_TASKSINK_RUNTIME_UTILS_H_ | |||
| #include <memory> | |||
| #include "runtime/rt.h" | |||
| #include "framework/ge_runtime/task_info.h" | |||
| using ge::model_runner::HcclTaskInfo; | |||
| namespace mindspore { | |||
| namespace device { | |||
| namespace ascend { | |||
| namespace tasksink { | |||
| class RuntimeUtils { | |||
| public: | |||
| static bool HcomBindModel(rtModel_t model, rtStream_t stream); | |||
| static bool HcomUnbindModel(rtModel_t model); | |||
| static bool HcomDistribute(const std::shared_ptr<HcclTaskInfo> &task_info, rtStream_t stream); | |||
| }; | |||
| } // namespace tasksink | |||
| } // namespace ascend | |||
| } // namespace device | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_TASKSINK_RUNTIME_UTILS_H_ | |||
| @@ -0,0 +1,8 @@ | |||
| file(GLOB_RECURSE HCCL_ADAPTER_SRC_LIST ./*.cc) | |||
| set_property(SOURCE ${HCCL_ADAPTER_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_HCCL_ADPT) | |||
| add_library(hccl_adapter SHARED ${HCCL_ADAPTER_SRC_LIST}) | |||
| target_include_directories(hccl_adapter PRIVATE ${CMAKE_BINARY_DIR}/proto/ge) | |||
| add_dependencies(hccl_adapter graph) | |||
| if (CMAKE_SYSTEM_NAME MATCHES "Linux") | |||
| target_link_options(hccl_adapter PRIVATE -Wl,-init,mindspore_log_init) | |||
| endif () | |||
| @@ -0,0 +1,129 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "runtime/hccl_adapter/converter.h" | |||
| #include <map> | |||
| #include <algorithm> | |||
| #include <tuple> | |||
| #define google ascend_private | |||
| #include "register/ops_kernel_builder_registry.h" | |||
| #include "graph/compute_graph.h" | |||
| #include "graph/debug/ge_attr_define.h" | |||
| #undef google | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "utils/log_adapter.h" | |||
| #include "utils/ms_utils.h" | |||
| #include "mindspore/core/base/core_ops.h" | |||
| #include "transform/graph_ir/util.h" | |||
| static constexpr char kGeOpNameHcclAllRudece[] = "HcomAllReduce"; | |||
| static constexpr char kGeOpNameHcclAllGather[] = "HcomAllGather"; | |||
| static constexpr char kGeOpNameHcclBroadcast[] = "HcomBroadcast"; | |||
| static constexpr char kGeOpNameHcclReduceScatter[] = "HcomReduceScatter"; | |||
| static constexpr char kGeNodeAttrUsedStreamNum[] = "used_stream_num"; | |||
| static constexpr char kStubDataStructureName[] = "any_name_can_work"; | |||
| static ge::DataType ConvertHcclDTypeToGeDType(HcclDataType datatype) { | |||
| static map<HcclDataType, ge::DataType> kHcomDataTypeMap = { | |||
| {HCCL_DATA_TYPE_FP32, ge::DT_FLOAT}, | |||
| {HCCL_DATA_TYPE_FP16, ge::DT_FLOAT16}, | |||
| {HCCL_DATA_TYPE_INT8, ge::DT_INT8}, | |||
| {HCCL_DATA_TYPE_INT32, ge::DT_INT32}, | |||
| }; | |||
| auto iter = kHcomDataTypeMap.find(datatype); | |||
| if (iter == kHcomDataTypeMap.end()) { | |||
| MS_LOG(EXCEPTION) << "Unknown hccl data type " << datatype; | |||
| } | |||
| return iter->second; | |||
| } | |||
| namespace mindspore::hccl { | |||
| std::string GetGeNodeName(const CNodePtr &cnode) { | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| if (IsPrimitiveCNode(cnode, prim::kPrimAllReduce)) { | |||
| return kGeOpNameHcclAllRudece; | |||
| } else if (IsPrimitiveCNode(cnode, prim::kPrimAllGather)) { | |||
| return kGeOpNameHcclAllGather; | |||
| } else if (IsPrimitiveCNode(cnode, prim::kPrimBroadcast)) { | |||
| return kGeOpNameHcclBroadcast; | |||
| } else if (IsPrimitiveCNode(cnode, prim::kPrimReduceScatter)) { | |||
| return kGeOpNameHcclReduceScatter; | |||
| } | |||
| MS_LOG(EXCEPTION) << "Unknown hccl node type " << cnode->DebugString(); | |||
| } | |||
| std::tuple<ge::NodePtr, ge::ComputeGraphPtr> GenerateStubGeNode(const AnfNodePtr &anf_node, HcclDataType datatype) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| auto cnode = anf_node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| std::string ge_node_name = GetGeNodeName(cnode); | |||
| ge::OpDescPtr op_desc = std::make_shared<ge::OpDesc>(kStubDataStructureName, ge_node_name); | |||
| MS_EXCEPTION_IF_NULL(op_desc); | |||
| for (size_t i = 1; i < cnode->size(); ++i) { | |||
| auto &input = cnode->input(i); | |||
| std::vector<int64_t> ge_shape; | |||
| auto ms_shape = AnfAlgo::GetOutputInferShape(input, 0); | |||
| std::transform(ms_shape.begin(), ms_shape.end(), std::back_inserter(ge_shape), | |||
| [](size_t in) { return static_cast<int64_t>(in); }); | |||
| op_desc->AddInputDesc( | |||
| ge::GeTensorDesc(ge::GeShape(ge_shape), ge::Format::FORMAT_NCHW, | |||
| transform::TransformUtil::ConvertDataType(AnfAlgo::GetOutputInferDataType(input, 0)))); | |||
| } | |||
| // set node data type | |||
| bool ret = ge::AttrUtils::SetDataType(*op_desc, ge::HCOM_ATTR_DATA_TYPE, ConvertHcclDTypeToGeDType(datatype)); | |||
| if (!ret) { | |||
| MS_LOG(EXCEPTION) << "Set attr " << ge::HCOM_ATTR_DATA_TYPE << " for ge node of " << cnode->DebugString() | |||
| << " failed."; | |||
| } | |||
| // set rank size | |||
| if (AnfAlgo::HasNodeAttr(kAttrRankSize, cnode)) { | |||
| auto rank_size = AnfAlgo::GetNodeAttr<int64_t>(cnode, kAttrRankSize); | |||
| ret = ge::AttrUtils::SetInt(*op_desc, ge::HCOM_ATTR_RANK_SIZE, rank_size); | |||
| if (!ret) { | |||
| MS_LOG(EXCEPTION) << "Set attr " << ge::HCOM_ATTR_RANK_SIZE << " for ge node of " << cnode->DebugString() | |||
| << " failed."; | |||
| } | |||
| } | |||
| ge::ComputeGraphPtr ge_graph = std::make_shared<ge::ComputeGraph>(kStubDataStructureName); | |||
| MS_EXCEPTION_IF_NULL(ge_graph); | |||
| auto ge_node = ge_graph->AddNode(op_desc); | |||
| return {ge_node, ge_graph}; | |||
| } | |||
| HcclTaskInfo ParseDomiTask(const ge::OpDescPtr &op, const domi::TaskDef &task_def) { | |||
| MS_EXCEPTION_IF_NULL(op); | |||
| // workspace size | |||
| auto workspace_sizes = op->GetWorkspaceBytes(); | |||
| if (workspace_sizes.size() != 1) { | |||
| MS_LOG(EXCEPTION) << "Unexpected workspace size " << workspace_sizes.size(); | |||
| } | |||
| int64_t workspace_size = workspace_sizes[0]; | |||
| // stream num | |||
| int64_t stream_num; | |||
| bool ret = ge::AttrUtils::GetInt(*op, kGeNodeAttrUsedStreamNum, stream_num); | |||
| if (!ret) { | |||
| MS_LOG(EXCEPTION) << "Get attr " << kGeNodeAttrUsedStreamNum << " for ge node " << op->GetType() << " failed."; | |||
| } | |||
| return {task_def.private_def(), workspace_size, stream_num}; | |||
| } | |||
| } // namespace mindspore::hccl | |||
| @@ -0,0 +1,38 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
#ifndef MINDSPORE_RUNTIME_HCCL_ADAPTER_CONVERTER_H
#define MINDSPORE_RUNTIME_HCCL_ADAPTER_CONVERTER_H
#include <string>
#include <vector>
#include <memory>
#include <tuple>
// The GE headers below are built against the ascend_private protobuf; remap
// the `google` namespace only for these includes so the rename does not leak
// into the rest of the translation unit.
#define google ascend_private
#include "graph/node.h"
#include "common/opskernel/ops_kernel_info_types.h"
#include "proto/task.pb.h"
#undef google
#include "runtime/hccl_adapter/hccl_adapter.h"
#include "mindspore/core/ir/anf.h"
namespace mindspore::hccl {
// Build a stub GE node mirroring the given MindSpore hccl cnode. The owning
// graph is returned alongside the node to keep its reference count alive.
std::tuple<ge::NodePtr, ge::ComputeGraphPtr> GenerateStubGeNode(const AnfNodePtr &anf_node, HcclDataType datatype);
// Translate a GE/domi task definition into MindSpore's HcclTaskInfo.
HcclTaskInfo ParseDomiTask(const ge::OpDescPtr &op, const domi::TaskDef &task_def);
// Map a MindSpore hccl cnode to the corresponding GE op type name.
std::string GetGeNodeName(const CNodePtr &cnode);
} // namespace mindspore::hccl
#endif // MINDSPORE_RUNTIME_HCCL_ADAPTER_CONVERTER_H
| @@ -0,0 +1,165 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "runtime/hccl_adapter/hccl_adapter.h" | |||
| #include <map> | |||
| #include <algorithm> | |||
| #define google ascend_private | |||
| #include "register/ops_kernel_builder_registry.h" | |||
| #include "common/opskernel/ops_kernel_info_store.h" | |||
| #include "external/ge/ge_api_types.h" | |||
| #undef google | |||
| #include "utils/log_adapter.h" | |||
| #include "utils/ms_utils.h" | |||
| #include "runtime/hccl_adapter/converter.h" | |||
| #include "runtime/hccl_adapter/hcom_graph_adaptor.h" | |||
// Name under which the plugin registers the hccl ops kernel info store.
static constexpr const char *kHcclOpsKernelInfoStore = "ops_kernel_info_hccl";
// Environment variable read to choose the hccl deploy mode option.
static constexpr const char *kHcclDeployModeEnv = "DEPLOY_MODE";
// Cached GE handles, set by InitHccl and cleared by FinalizeHccl.
// NOTE: access to the following global vars is not synchronized; thread safety is not guaranteed.
static std::shared_ptr<ge::OpsKernelInfoStore> ops_kernel_info_store = nullptr;
static ge::OpsKernelBuilderPtr ops_kernel_builder = nullptr;
| namespace mindspore::hccl { | |||
| static std::map<std::string, std::string> GenHcclOptions(uint32_t device_id, std::string_view rank_id, | |||
| std::string_view rank_file) { | |||
| auto env_deploy_mode = common::GetEnv(kHcclDeployModeEnv); | |||
| if (env_deploy_mode.empty()) { | |||
| MS_LOG(WARNING) << kHcclDeployModeEnv << " is not set in ENV. Now set to default value 0"; | |||
| env_deploy_mode = "0"; | |||
| } | |||
| return std::map<std::string, std::string>({{ge::OPTION_EXEC_IS_USEHCOM, "1"}, | |||
| {ge::OPTION_EXEC_IS_USEHVD, "0"}, | |||
| {ge::OPTION_EXEC_HCCL_FLAG, "1"}, | |||
| {ge::OPTION_EXEC_DEVICE_ID, std::to_string(device_id)}, | |||
| {ge::OPTION_EXEC_RANK_ID, rank_id.data()}, | |||
| {ge::OPTION_EXEC_POD_NAME, rank_id.data()}, | |||
| {ge::OPTION_EXEC_RANK_TABLE_FILE, rank_file.data()}, | |||
| {ge::OPTION_GRAPH_RUN_MODE, "1"}, | |||
| {ge::OPTION_EXEC_HCCL_FLAG, "1"}, | |||
| {ge::OPTION_EXEC_DEPLOY_MODE, env_deploy_mode}}); | |||
| } | |||
// Initialize the hccl adapter: locate the single registered GE ops kernel
// builder, initialize it and the plugin with options derived from the
// device/rank configuration, then fetch and initialize the hccl ops kernel
// info store. Populates the file-level globals; throws on any failure, so a
// return value of true is the only possible return.
bool InitHccl(uint32_t device_id, std::string_view rank_id, std::string_view rank_file) {
  MS_LOG(INFO) << "Start init hccl adapter.";
  // get ops_kernel_builder: exactly one builder (the hccl one) must be registered
  std::map<std::string, ge::OpsKernelBuilderPtr> all_builders = ge::OpsKernelBuilderRegistry::GetInstance().GetAll();
  if (all_builders.size() != 1) {
    MS_LOG(EXCEPTION) << "Builders size should be 1 (hccl builder), but is " << all_builders.size();
  }
  MS_LOG(INFO) << "Get builder " << all_builders.begin()->first;
  ops_kernel_builder = all_builders.begin()->second;
  MS_EXCEPTION_IF_NULL(ops_kernel_builder);
  // init ops_kernel_builder
  auto options = GenHcclOptions(device_id, rank_id, rank_file);
  auto ret = ops_kernel_builder->Initialize(options);
  if (ret != ge::SUCCESS) {
    MS_LOG(EXCEPTION) << "Init builder failed, ret = " << ret;
  }
  // initialize the plugin itself (extern "C" entry declared in hcom_graph_adaptor.h)
  ret = ::Initialize(options);
  if (ret != ge::SUCCESS) {
    MS_LOG(EXCEPTION) << "Init plugin so failed, ret = " << ret;
  }
  // get ops_kernel_info_store: pick the hccl store out of everything the plugin exposes
  std::map<std::string, std::shared_ptr<ge::OpsKernelInfoStore>> all_ops_kernel_info_stores;
  ::GetOpsKernelInfoStores(all_ops_kernel_info_stores);
  for (auto &[name, ptr] : all_ops_kernel_info_stores) {
    if (name == kHcclOpsKernelInfoStore) {
      ops_kernel_info_store = ptr;
      break;
    }
  }
  MS_EXCEPTION_IF_NULL(ops_kernel_info_store);
  ret = ops_kernel_info_store->Initialize(options);
  if (ret != ge::SUCCESS) {
    MS_LOG(EXCEPTION) << "Init info store failed, ret = " << ret;
  }
  MS_LOG(INFO) << "Init hccl adapter success.";
  return true;
}
| bool FinalizeHccl() { | |||
| MS_LOG(INFO) << "Start destroy hccl adapter."; | |||
| if (ops_kernel_info_store != nullptr) { | |||
| auto ret = ops_kernel_info_store->Finalize(); | |||
| if (ret != ge::SUCCESS) { | |||
| MS_LOG(ERROR) << "Destory info store failed, ret = " << ret; | |||
| return false; | |||
| } | |||
| } | |||
| if (ops_kernel_builder != nullptr) { | |||
| auto ret = ops_kernel_builder->Finalize(); | |||
| if (ret != ge::SUCCESS) { | |||
| MS_LOG(ERROR) << "Destory builder failed, ret = " << ret; | |||
| return false; | |||
| } | |||
| } | |||
| ::Finalize(); | |||
| ge::OpsKernelBuilderRegistry::GetInstance().UnregisterAll(); | |||
| ops_kernel_info_store.reset(); | |||
| ops_kernel_builder.reset(); | |||
| MS_LOG(INFO) << "Destroy hccl adapter success."; | |||
| return true; | |||
| } | |||
| bool GenTask(const AnfNodePtr &node, HcclDataType datatype, std::vector<HcclTaskInfo> *task_info_lists) { | |||
| MS_EXCEPTION_IF_NULL(ops_kernel_builder); | |||
| MS_EXCEPTION_IF_NULL(task_info_lists); | |||
| MS_LOG(INFO) << "Start generate task for hccl node " << node->DebugString(); | |||
| auto [ge_node, ge_graph] = GenerateStubGeNode(node, datatype); | |||
| MS_EXCEPTION_IF_NULL(ge_node); | |||
| auto op = ge_node->GetOpDesc(); | |||
| MS_EXCEPTION_IF_NULL(op); | |||
| MS_LOG(INFO) << "Start to call CalcOpRunningParam"; | |||
| ge::Status ret = ops_kernel_builder->CalcOpRunningParam(*ge_node); | |||
| if (ret != ge::SUCCESS) { | |||
| MS_LOG(ERROR) << "OpsKernelBuilder CalcOpRunningParam failed, ret = " << ret; | |||
| return false; | |||
| } | |||
| MS_LOG(INFO) << "Start to call GenerateTask"; | |||
| ge::RunContext unused_ctx; | |||
| std::vector<domi::TaskDef> domi_tasks; | |||
| ret = ops_kernel_builder->GenerateTask(*ge_node, unused_ctx, domi_tasks); | |||
| if (ret != ge::SUCCESS) { | |||
| MS_LOG(ERROR) << "OpsKernelBuilder GenerateTask failed, ret = " << ret; | |||
| return false; | |||
| } | |||
| task_info_lists->clear(); | |||
| std::transform(domi_tasks.begin(), domi_tasks.end(), std::back_inserter(*task_info_lists), | |||
| [&op](const domi::TaskDef &task_def) -> HcclTaskInfo { return ParseDomiTask(op, task_def); }); | |||
| MS_LOG(INFO) << "Generate task for node " << node->DebugString() << " success."; | |||
| ge_graph.reset(); | |||
| return true; | |||
| } | |||
// No per-node pre-computation is done at this layer: the real sizing happens
// inside GenTask via the GE builder's CalcOpRunningParam, so always succeed.
bool CalcOpRunningParam(const AnfNodePtr &node) { return true; }
// Expose the raw pointer of the cached hccl ops kernel info store (nullptr
// before InitHccl / after FinalizeHccl); ownership stays with this module.
void *GetHcclOpsKernelInfoStore() { return ops_kernel_info_store.get(); }
| std::string GetHcclType(const AnfNodePtr &node) { | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| return GetGeNodeName(cnode); | |||
| } | |||
| } // namespace mindspore::hccl | |||
| @@ -0,0 +1,43 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_RUNTIME_HCCL_ADAPTER_HCCL_ADAPTER_H | |||
| #define MINDSPORE_RUNTIME_HCCL_ADAPTER_HCCL_ADAPTER_H | |||
| #include <string> | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "mindspore/core/ir/anf.h" | |||
| #include "external/hccl/hccl_types.h" | |||
| #define MS_API __attribute__((visibility("default"))) | |||
| namespace mindspore::hccl { | |||
| struct MS_API HcclTaskInfo { | |||
| std::string private_def; | |||
| int64_t workspace_size; | |||
| int64_t stream_num; | |||
| }; | |||
| MS_API bool InitHccl(uint32_t device_id, std::string_view rank_id, std::string_view rank_file); | |||
| MS_API bool FinalizeHccl(); | |||
| MS_API bool GenTask(const AnfNodePtr &node, HcclDataType datatype, std::vector<HcclTaskInfo> *task_info_lists); | |||
| MS_API bool CalcOpRunningParam(const AnfNodePtr &node); | |||
| MS_API void *GetHcclOpsKernelInfoStore(); | |||
| MS_API std::string GetHcclType(const AnfNodePtr &node); | |||
| } // namespace mindspore::hccl | |||
| #undef MS_API | |||
| #endif // MINDSPORE_RUNTIME_HCCL_ADAPTER_HCCL_ADAPTER_H | |||
| @@ -0,0 +1,32 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
#ifndef MINDSPORE_RUNTIME_HCCL_ADAPTER_HCOM_GRAPH_ADAPTOR_H
#define MINDSPORE_RUNTIME_HCCL_ADAPTER_HCOM_GRAPH_ADAPTOR_H
#include <string>
#include <map>
#include <memory>
#include "mindspore/core/ir/anf.h"
#include "common/opskernel/ops_kernel_info_store.h"
// C entry points exported by the hcom graph adaptor plugin library; they are
// resolved at link time and invoked from hccl_adapter.cc as ::Initialize,
// ::Finalize and ::GetOpsKernelInfoStores.
extern "C" {
// Initialize the plugin with GE-style string options.
ge::Status Initialize(const std::map<std::string, std::string> &);
// Tear the plugin down.
ge::Status Finalize();
// Fill the map with the plugin's ops kernel info stores, keyed by name.
void GetOpsKernelInfoStores(std::map<std::string, std::shared_ptr<ge::OpsKernelInfoStore>> &);
}
#endif // MINDSPORE_RUNTIME_HCCL_ADAPTER_HCOM_GRAPH_ADAPTOR_H
| @@ -181,7 +181,8 @@ static const char *GetSubModuleName(SubModuleId module_id) { | |||
| "VM", // SM_VM | |||
| "PROFILER", // SM_PROFILER | |||
| "PS", // SM_PS | |||
| "LITE" // SM_LITE | |||
| "LITE", // SM_LITE | |||
| "HCCL_ADPT" // SM_HCCL_ADPT | |||
| }; | |||
| return sub_module_names[module_id % NUM_SUBMODUES]; | |||
| @@ -125,6 +125,7 @@ enum SubModuleId : int { | |||
| SM_PROFILER, // profiler | |||
| SM_PS, // Parameter Server | |||
| SM_LITE, // LITE | |||
| SM_HCCL_ADPT, // Hccl Adapter | |||
| NUM_SUBMODUES // number of submodules | |||
| }; | |||
| @@ -15,7 +15,7 @@ | |||
| */ | |||
| #include <vector> | |||
| #include "framework/ge_runtime/model_runner.h" | |||
| #include "runtime/device/ascend/tasksink/runtime_utils.h" | |||
| #include "runtime/hccl_adapter/hccl_adapter.h" | |||
| namespace ge { | |||
| namespace model_runner { | |||
| @@ -60,15 +60,12 @@ const std::map<std::string, std::shared_ptr<RuntimeInfo>> &ModelRunner::GetRunti | |||
| } // namespace ge | |||
| namespace mindspore { | |||
| namespace device { | |||
| namespace ascend { | |||
| namespace tasksink { | |||
| bool RuntimeUtils::HcomBindModel(rtModel_t model, rtStream_t stream) { return true; } | |||
| bool RuntimeUtils::HcomUnbindModel(rtModel_t model) { return true; } | |||
| bool RuntimeUtils::HcomDistribute(const std::shared_ptr<HcclTaskInfo> &task_info, rtStream_t stream) { return true; } | |||
| } // namespace tasksink | |||
| } // namespace ascend | |||
| } // namespace device | |||
namespace hccl {
// Stub implementations of the hccl adapter interface for builds that do not
// load the real GE/hccl plugin (presumably unit-test builds — verify against
// the build configuration): every entry point trivially succeeds or returns
// an empty/null value.
bool InitHccl(uint32_t, std::string_view, std::string_view) { return true; }
bool FinalizeHccl() { return true; }
bool GenTask(const AnfNodePtr &, HcclDataType, std::vector<HcclTaskInfo> *) { return true; }
bool CalcOpRunningParam(const AnfNodePtr &) { return true; }
void *GetHcclOpsKernelInfoStore() { return nullptr; }
std::string GetHcclType(const AnfNodePtr &) { return ""; }
} // namespace hccl
| } // namespace mindspore | |||