From 8d147deb07fcf58cccd0118a0e76b1e44f11aba8 Mon Sep 17 00:00:00 2001 From: yanghaitao1 Date: Tue, 15 Dec 2020 17:21:24 +0800 Subject: [PATCH] profiler memory --- mindspore/ccsrc/CMakeLists.txt | 208 ++++++++++-------- .../ccsrc/backend/optimizer/somas/somas.cc | 48 ++++ .../ccsrc/backend/optimizer/somas/somas.h | 2 + .../ccsrc/backend/session/ascend_session.cc | 15 ++ mindspore/ccsrc/profiler/CMakeLists.txt | 17 +- .../device/common/memory_profiling.cc | 97 ++++++++ .../profiler/device/common/memory_profiling.h | 124 +++++++++++ .../device/common/memory_profiling.proto | 50 +++++ .../ccsrc/pybind_api/utils/ms_context_py.cc | 4 +- .../device/ascend/ascend_kernel_runtime.cc | 5 + .../device/ascend/ascend_kernel_runtime.h | 1 + .../device/ascend/ascend_memory_manager.cc | 30 ++- .../device/ascend/ascend_memory_manager.h | 4 +- .../runtime/device/cpu/cpu_memory_manager.cc | 2 +- .../runtime/device/cpu/cpu_memory_manager.h | 2 +- .../runtime/device/gpu/gpu_memory_manager.cc | 2 +- .../runtime/device/gpu/gpu_memory_manager.h | 2 +- .../ccsrc/runtime/device/kernel_runtime.cc | 12 +- .../ccsrc/runtime/device/kernel_runtime.h | 1 + .../ccsrc/runtime/device/memory_manager.cc | 8 +- .../ccsrc/runtime/device/memory_manager.h | 7 +- mindspore/core/utils/ms_context.cc | 1 + mindspore/core/utils/ms_context.h | 1 + mindspore/profiler/profiling.py | 3 +- tests/ut/cpp/CMakeLists.txt | 61 ++--- 25 files changed, 560 insertions(+), 147 deletions(-) create mode 100644 mindspore/ccsrc/profiler/device/common/memory_profiling.cc create mode 100644 mindspore/ccsrc/profiler/device/common/memory_profiling.h create mode 100644 mindspore/ccsrc/profiler/device/common/memory_profiling.proto diff --git a/mindspore/ccsrc/CMakeLists.txt b/mindspore/ccsrc/CMakeLists.txt index 3abc9ea49d..6144104821 100644 --- a/mindspore/ccsrc/CMakeLists.txt +++ b/mindspore/ccsrc/CMakeLists.txt @@ -3,33 +3,34 @@ include_directories(${CMAKE_SOURCE_DIR}/mindspore/core) 
include_directories(${CMAKE_CURRENT_SOURCE_DIR}) include_directories(${CMAKE_BINARY_DIR}) -if (ENABLE_ACL) +if(ENABLE_ACL) set(ASCEND_PATH /usr/local/Ascend) include_directories(${ASCEND_PATH}/acllib/include) link_directories(${ASCEND_PATH}/acllib/lib64/) find_library(ascendcl acl_dvpp ${ASCEND_PATH}/acllib/lib64) -endif () +endif() -if (NOT(CMAKE_SYSTEM_NAME MATCHES "Darwin")) +if(NOT(CMAKE_SYSTEM_NAME MATCHES "Darwin")) link_directories(${CMAKE_SOURCE_DIR}/build/mindspore/graphengine) else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-delete-non-abstract-non-virtual-dtor") -endif () +endif() -if (CMAKE_SYSTEM_NAME MATCHES "Windows") +if(CMAKE_SYSTEM_NAME MATCHES "Windows") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-attributes -DHAVE_SNPRINTF") add_compile_definitions(BUILDING_DLL) endif() -if (ENABLE_MPI) +if(ENABLE_MPI) add_compile_definitions(ENABLE_MPI) -endif () +endif() if(ENABLE_GPU) find_package(CUDA REQUIRED) find_package(Threads) if(${CUDA_VERSION} VERSION_LESS ${MS_REQUIRE_CUDA_VERSION}) - message(FATAL_ERROR "The minimum CUDA version ${MS_REQUIRE_CUDA_VERSION} is required, but only CUDA ${CUDA_VERSION} found.") + message(FATAL_ERROR "The minimum CUDA version ${MS_REQUIRE_CUDA_VERSION} is required, \ + but only CUDA ${CUDA_VERSION} found.") endif() enable_language(CUDA) if(NOT CUDA_PATH OR CUDA_PATH STREQUAL "") @@ -40,31 +41,36 @@ if(ENABLE_GPU) endif() endif() - if (DEFINED ENV{CUDNN_HOME} AND NOT $ENV{CUDNN_HOME} STREQUAL "") + if(DEFINED ENV{CUDNN_HOME} AND NOT $ENV{CUDNN_HOME} STREQUAL "") set(CUDNN_INCLUDE_DIR $ENV{CUDNN_HOME}/include) set(CUDNN_LIBRARY_DIR $ENV{CUDNN_HOME}/lib64) find_path(CUDNN_INCLUDE_PATH cudnn.h HINTS ${CUDNN_INCLUDE_DIR} NO_DEFAULT_PATH) find_library(CUDNN_LIBRARY_PATH "cudnn" HINTS ${CUDNN_LIBRARY_DIR} NO_DEFAULT_PATH) - if (CUDNN_INCLUDE_PATH STREQUAL CUDNN_INCLUDE_PATH-NOTFOUND) - message(FATAL_ERROR "Failed to find cudnn header file, please set environment variable CUDNN_HOME to cudnn installation position.") + 
if(CUDNN_INCLUDE_PATH STREQUAL CUDNN_INCLUDE_PATH-NOTFOUND) + message(FATAL_ERROR "Failed to find cudnn header file, please set environment variable CUDNN_HOME to \ + cudnn installation position.") endif() - if (CUDNN_LIBRARY_PATH STREQUAL CUDNN_LIBRARY_PATH-NOTFOUND) - message(FATAL_ERROR "Failed to find cudnn library file, please set environment variable CUDNN_HOME to cudnn installation position.") + if(CUDNN_LIBRARY_PATH STREQUAL CUDNN_LIBRARY_PATH-NOTFOUND) + message(FATAL_ERROR "Failed to find cudnn library file, please set environment variable CUDNN_HOME to \ + cudnn installation position.") endif() else() list(APPEND CMAKE_PREFIX_PATH ${CUDA_TOOLKIT_ROOT_DIR}) find_path(CUDNN_INCLUDE_PATH cudnn.h PATH_SUFFIXES cuda/inclulde include cuda) find_library(CUDNN_LIBRARY_PATH "cudnn" PATH_SUFFIXES cuda/lib64 lib64 lib cuda/lib lib/x86_64-linux-gnu) - if (CUDNN_INCLUDE_PATH STREQUAL CUDNN_INCLUDE_PATH-NOTFOUND) - message(FATAL_ERROR "Failed to find cudnn header file, if cudnn library is not installed, please put cudnn header file in cuda include path \ - or user include path(eg. /usr/local/cuda/include; /usr/local/include; /usr/include), if cudnn library is installed in other position,\ - please set environment variable CUDNN_HOME to cudnn installation position, there should be cudnn.h in {CUDNN_HOME}/include.") + if(CUDNN_INCLUDE_PATH STREQUAL CUDNN_INCLUDE_PATH-NOTFOUND) + message(FATAL_ERROR "Failed to find cudnn header file, if cudnn library is not installed, please put \ + cudnn header file in cuda include path or user include path(eg. 
/usr/local/cuda/include; \ + /usr/local/include; /usr/include), if cudnn library is installed in other position, please \ + set environment variable CUDNN_HOME to cudnn installation position, there should be cudnn.h \ + in {CUDNN_HOME}/include.") endif() - if (CUDNN_LIBRARY_PATH STREQUAL CUDNN_LIBRARY_PATH-NOTFOUND) - message(FATAL_ERROR "Failed to find cudnn library file, if cudnn library is not installed, please put cudnn library file in \ - cuda library path or user library path(eg. /usr/local/cuda/lib64; /usr/local/lib64; /usr/lib64; /usr/local/lib; /usr/lib),\ - if cudnn library is installed in other position, please set environment variable CUDNN_HOME to cudnn installation position, \ - there should be cudnn library file in {CUDNN_HOME}/lib64.") + if(CUDNN_LIBRARY_PATH STREQUAL CUDNN_LIBRARY_PATH-NOTFOUND) + message(FATAL_ERROR "Failed to find cudnn library file, if cudnn library is not installed, please put \ + cudnn library file in cuda library path or user library path(eg. /usr/local/cuda/lib64; \ + /usr/local/lib64; /usr/lib64; /usr/local/lib; /usr/lib), if cudnn library is installed in other \ + position, please set environment variable CUDNN_HOME to cudnn installation position, there should \ + be cudnn library file in {CUDNN_HOME}/lib64.") endif() endif() @@ -102,7 +108,7 @@ if(ENABLE_GPU) cuda_add_library(gpu_cuda_lib STATIC ${GPU_SRC_LIST}) set(CMAKE_CXX_FLAGS ${NVCC_TMP_CMAKE_CXX_FLAGS}) add_compile_definitions(ENABLE_GPU) -endif () +endif() ## make protobuf files @@ -117,7 +123,13 @@ file(GLOB_RECURSE COMM_PROTO_IN RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "ps/core/pr ms_protobuf_generate(COMM_PROTO_SRCS COMM_PROTO_HDRS ${COMM_PROTO_IN}) list(APPEND MINDSPORE_PROTO_LIST ${COMM_PROTO_SRCS}) -if (ENABLE_DEBUGGER) +include_directories("${CMAKE_BINARY_DIR}/profiler/device/common") +file(GLOB_RECURSE PROFILER_PROTO_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} + "profiler/device/common/memory_profiling.proto") +ms_protobuf_generate(PROFILER_MEM_PROTO_SRCS 
PROFILER_MEM_PROTO_HDRS ${PROFILER_PROTO_LIST}) +list(APPEND MINDSPORE_PROTO_LIST ${PROFILER_MEM_PROTO_SRCS}) + +if(ENABLE_DEBUGGER) # debugger: compile proto files include_directories("${CMAKE_BINARY_DIR}/debug/debugger") file(GLOB_RECURSE DEBUGGER_PROTO_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "debug/debugger/debug_graph.proto") @@ -126,9 +138,9 @@ if (ENABLE_DEBUGGER) ms_grpc_generate(DEBUGGER_GRPC_SRCS DEBUGGER_GRPC_HDRS ${DEBUGGER_GRPC_LIST}) list(APPEND MINDSPORE_PROTO_LIST ${DEBUGGER_PROTO_SRCS}) list(APPEND MINDSPORE_PROTO_LIST ${DEBUGGER_GRPC_SRCS}) -endif () +endif() -if (ENABLE_DUMP_PROTO) +if(ENABLE_DUMP_PROTO) include_directories(${CMAKE_BINARY_DIR}) file(GLOB_RECURSE PROTO_PY RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} @@ -144,9 +156,9 @@ if (ENABLE_DUMP_PROTO) list(APPEND MINDSPORE_PROTO_LIST ${PROTO_SRCS}) list(APPEND MINDSPORE_PROTO_LIST ${PY_SRCS}) -endif () +endif() -if (ENABLE_D) +if(ENABLE_D) include_directories("${CMAKE_BINARY_DIR}/backend/kernel_compiler/aicpu") file(GLOB_RECURSE PROTO_IN RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "backend/kernel_compiler/aicpu/proto/*.proto") ms_protobuf_generate(PROTOSRCS PROTOHDRS ${PROTO_IN}) @@ -159,9 +171,9 @@ if (ENABLE_D) list(APPEND MINDSPORE_PROTO_LIST ${DUMP_PROTOSRCS}) add_compile_definitions(ENABLE_D) -endif () +endif() -if (MINDSPORE_PROTO_LIST) +if(MINDSPORE_PROTO_LIST) add_library(proto_input STATIC ${MINDSPORE_PROTO_LIST}) set_target_properties(proto_input PROPERTIES COMPILE_FLAGS "-Wno-unused-variable") endif() @@ -183,58 +195,58 @@ set(SUB_COMP common debug pybind_api utils vm profiler ps ) -foreach (_comp ${SUB_COMP}) +foreach(_comp ${SUB_COMP}) add_subdirectory(${_comp}) string(REPLACE "/" "_" sub ${_comp}) - if (TARGET _mindspore_${sub}_obj) + if(TARGET _mindspore_${sub}_obj) list(APPEND SUB_OBJECTS_SRC $) add_dependencies(_mindspore_${sub}_obj proto_input) - endif () -endforeach () + endif() +endforeach() set_property(SOURCE ${SUB_OBJECTS_SRC} PROPERTY COMPILE_DEFINITIONS 
SUBMODULE_ID=mindspore::SubModuleId::SM_ME) add_library(mindspore STATIC ${SUB_OBJECTS_SRC}) target_link_libraries(mindspore mindspore_core) -if (ENABLE_DEBUGGER) +if(ENABLE_DEBUGGER) # debugger: link grpc target_link_libraries(proto_input mindspore::grpc++) endif() target_link_libraries(mindspore securec mindspore::flatbuffers) -if (NOT WIN32) +if(NOT WIN32) target_link_libraries(mindspore dl) endif() -if (ENABLE_GE) +if(ENABLE_GE) if(ENABLE_TRAIN) target_link_libraries(mindspore ge_runner hccl) - else () + else() target_link_libraries(mindspore ge_client) - endif () + endif() target_link_libraries(mindspore graph tsdclient datatransfer) endif() -if (ENABLE_D) - if (DEFINED ENV{D_LINK_PATH}) - if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") +if(ENABLE_D) + if(DEFINED ENV{D_LINK_PATH}) + if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") MESSAGE("system processor matches aarch64") set(D_LIB_PATH $ENV{D_LINK_PATH}/aarch64) - elseif (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64") + elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64") MESSAGE("system processor matches x86_64") set(D_LIB_PATH $ENV{D_LINK_PATH}/x86_64) - else () + else() MESSAGE("system ${CMAKE_HOST_SYSTEM_PROCESSOR} not support") endif() - else () + else() MESSAGE("use system default lib") - if (DEFINED ENV{ASCEND_CUSTOM_PATH}) + if(DEFINED ENV{ASCEND_CUSTOM_PATH}) set(ASCEND_PATH $ENV{ASCEND_CUSTOM_PATH}) - else () + else() set(ASCEND_PATH /usr/local/Ascend) - endif () + endif() set(ASCEND_DRIVER_PATH ${ASCEND_PATH}/driver/lib64/common) set(ASCEND_DRIVER_BACK_PATH ${ASCEND_PATH}/driver/lib64/driver) set(ASCEND_RUNTIME_PATH ${ASCEND_PATH}/fwkacllib/lib64) @@ -246,8 +258,10 @@ if (ENABLE_D) find_library(HCCL hccl ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) find_library(CCE_LIB cce ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) find_library(RUNTIME_LIB runtime ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) - find_library(TSDCLIENT tsdclient HINTS ${ASCEND_RUNTIME_PATH} 
${ASCEND_TOOLKIT_RUNTIME_PATH} ${ASCEND_DRIVER_BACK_PATH}) - find_library(DATATRANSFER datatransfer HINTS ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH} ${ASCEND_DRIVER_BACK_PATH}) + find_library(TSDCLIENT tsdclient HINTS ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH} + ${ASCEND_DRIVER_BACK_PATH}) + find_library(DATATRANSFER datatransfer HINTS ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH} + ${ASCEND_DRIVER_BACK_PATH}) find_library(PROFILING msprofiler_fwk ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) find_library(REGISTER register ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) find_library(PLATFORM platform ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) @@ -255,42 +269,48 @@ if (ENABLE_D) # hccl_adpter find_library(HCCL_ADPTER hcom_graph_adaptor ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) find_library(HCCL_RA ra ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) - find_library(HCCL_BUILDER hcom_opskernel_builder ${ASCEND_RUNTIME_PATH}/plugin/opskernel ${ASCEND_TOOLKIT_RUNTIME_PATH}/plugin/opskernel) + find_library(HCCL_BUILDER hcom_opskernel_builder ${ASCEND_RUNTIME_PATH}/plugin/opskernel + ${ASCEND_TOOLKIT_RUNTIME_PATH}/plugin/opskernel) - add_library(ms_profile SHARED ${CMAKE_CURRENT_SOURCE_DIR}/runtime/device/ascend/profiling/profiling_callback_register.cc) + add_library(ms_profile SHARED + ${CMAKE_CURRENT_SOURCE_DIR}/runtime/device/ascend/profiling/profiling_callback_register.cc) set_target_properties(ms_profile PROPERTIES LINKER_LANGUAGE CXX) target_link_options(ms_profile PRIVATE -Wl,-init,common_log_init) - target_link_libraries(ms_profile -Wl,--start-group -Wl,--whole-archive ${PROFILING} -Wl,--no-whole-archive mindspore::protobuf -Wl,--end-group) + target_link_libraries(ms_profile -Wl,--start-group -Wl,--whole-archive ${PROFILING} -Wl,--no-whole-archive + mindspore::protobuf -Wl,--end-group) target_link_libraries(mindspore ge_runtime ${CCE_LIB} ${RUNTIME_LIB} ${TSDCLIENT} ${HCCL} ${DATATRANSFER} 
- ${HCCL_ADPTER} ${REGISTER} -Wl,--no-as-needed ${OPTILING} ${HCCL_BUILDER} ${HCCL_RA} ${PLATFORM}) + ${HCCL_ADPTER} ${REGISTER} -Wl,--no-as-needed ${OPTILING} ${HCCL_BUILDER} + ${HCCL_RA} ${PLATFORM}) target_link_libraries(mindspore -Wl,--start-group proto_input mindspore::protobuf -Wl,--end-group) -elseif (CMAKE_SYSTEM_NAME MATCHES "Windows") - target_link_libraries(mindspore -Wl,--start-group proto_input mindspore::protobuf mindspore::sentencepiece -Wl,--end-group) -elseif (CMAKE_SYSTEM_NAME MATCHES "Darwin") +elseif(CMAKE_SYSTEM_NAME MATCHES "Windows") + target_link_libraries(mindspore -Wl,--start-group proto_input mindspore::protobuf mindspore::sentencepiece + -Wl,--end-group) +elseif(CMAKE_SYSTEM_NAME MATCHES "Darwin") target_link_libraries(mindspore -Wl proto_input mindspore::protobuf mindspore::sentencepiece -Wl) -else () +else() target_link_libraries(mindspore -Wl,--start-group proto_input mindspore::protobuf -Wl,--end-group) -endif () +endif() # set c_expression building set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) -set_property(SOURCE "pipeline/jit/init.cc" PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_PIPELINE) +set_property(SOURCE "pipeline/jit/init.cc" PROPERTY + COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_PIPELINE) pybind11_add_module(_c_expression "pipeline/jit/init.cc") MESSAGE(STATUS "operation system is ${CMAKE_SYSTEM}") -if (CMAKE_SYSTEM_NAME MATCHES "Linux") +if(CMAKE_SYSTEM_NAME MATCHES "Linux") target_link_options(_c_expression PRIVATE -Wl,-init,mindspore_log_init) set(ORIGIN_PATH $ORIGIN) -elseif (CMAKE_SYSTEM_NAME MATCHES "Darwin") +elseif(CMAKE_SYSTEM_NAME MATCHES "Darwin") set_target_properties(_c_expression PROPERTIES MACOSX_RPATH ON) set(ORIGIN_PATH @loader_path) -elseif (CMAKE_SYSTEM_NAME MATCHES "Windows") +elseif(CMAKE_SYSTEM_NAME MATCHES "Windows") set(ORIGIN_PATH $ORIGIN) -else () +else() MESSAGE(FATAL_ERROR "other platform: ${CMAKE_SYSTEM_NAME}") -endif () +endif() -if (ENABLE_D) +if(ENABLE_D) 
set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/nnae/latest/fwkacllib/lib64) set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64) set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/fwkacllib/lib64) @@ -300,45 +320,47 @@ if (ENABLE_D) set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/add-ons) set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/opp/op_impl/built-in/ai_core/tbe/op_tiling) set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/op_tiling) - set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe/op_tiling) -elseif (ENABLE_GPU) + set(MINDSPORE_RPATH + ${MINDSPORE_RPATH}:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe/op_tiling) +elseif(ENABLE_GPU) set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/cuda/lib64) -endif () +endif() set(MINDSPORE_RPATH ${ORIGIN_PATH}/lib:${MINDSPORE_RPATH}) set_target_properties(_c_expression PROPERTIES INSTALL_RPATH ${MINDSPORE_RPATH}) -if (CMAKE_SYSTEM_NAME MATCHES "Windows") +if(CMAKE_SYSTEM_NAME MATCHES "Windows") target_link_libraries(mindspore mindspore::pybind11_module) target_link_libraries(mindspore mindspore_gvar) target_link_libraries(_c_expression PRIVATE -Wl,--whole-archive mindspore -Wl,--no-whole-archive) -elseif (CMAKE_SYSTEM_NAME MATCHES "Darwin") +elseif(CMAKE_SYSTEM_NAME MATCHES "Darwin") target_link_libraries(mindspore mindspore::pybind11_module) target_link_libraries(mindspore mindspore_gvar) target_link_libraries(_c_expression PRIVATE -Wl,-force_load mindspore -Wl,-noall_load) -else () - if (ENABLE_CPU AND (ENABLE_D OR ENABLE_GPU)) - target_link_libraries(mindspore mindspore::pslite proto_input mindspore::protobuf mindspore::event mindspore::event_pthreads ${zeromq_DIRPATH}/zmq_install/lib/libzmq.a) +else() + if(ENABLE_CPU AND (ENABLE_D OR ENABLE_GPU)) + target_link_libraries(mindspore mindspore::pslite 
proto_input mindspore::protobuf + mindspore::event mindspore::event_pthreads ${zeromq_DIRPATH}/zmq_install/lib/libzmq.a) target_link_libraries(mindspore -Wl,--no-as-needed mindspore::event_core ps_cache) - if (${ENABLE_IBVERBS} STREQUAL "ON") + if(${ENABLE_IBVERBS} STREQUAL "ON") target_link_libraries(mindspore ibverbs rdmacm) endif() endif() target_link_libraries(_c_expression PRIVATE -Wl,--whole-archive mindspore proto_input -Wl,--no-whole-archive) target_link_libraries(_c_expression PRIVATE mindspore::pybind11_module) target_link_libraries(_c_expression PRIVATE mindspore_gvar) - if (ENABLE_D) + if(ENABLE_D) target_link_libraries(_c_expression PRIVATE -Wl,--no-as-needed ms_profile) - endif () - if (ENABLE_ACL) + endif() + if(ENABLE_ACL) target_link_libraries(_c_expression PRIVATE -Wl,--no-as-needed graph) - endif () -endif () + endif() +endif() -if (USE_GLOG) +if(USE_GLOG) target_link_libraries(_c_expression PRIVATE mindspore::glog) -endif () +endif() -if (ENABLE_GPU) +if(ENABLE_GPU) message("add gpu lib to c_expression") target_link_libraries(_c_expression PRIVATE gpu_cuda_lib gpu_queue cublas ${CUDA_PATH}/lib64/libcurand.so @@ -346,27 +368,27 @@ if (ENABLE_GPU) ${CUDA_PATH}/lib64/libcudart.so ${CUDA_PATH}/lib64/stubs/libcuda.so ${CUDA_PATH}/lib64/libcusolver.so) - if (ENABLE_MPI) + if(ENABLE_MPI) set_target_properties(_ms_mpi PROPERTIES INSTALL_RPATH ${MINDSPORE_RPATH}) - endif () -endif () + endif() +endif() -if (CMAKE_SYSTEM_NAME MATCHES "Darwin") +if(CMAKE_SYSTEM_NAME MATCHES "Darwin") set(CMAKE_MACOSX_RPATH 1) set(CMAKE_INSTALL_RPATH "@loader_path/lib;@loader_path") set_target_properties(_c_expression PROPERTIES INSTALL_RPATH "${CMAKE_INSTALL_RPATH}") -endif () +endif() -if (ENABLE_CPU) +if(ENABLE_CPU) target_link_libraries(_c_expression PRIVATE mindspore::dnnl mindspore::mkldnn) -endif () +endif() -if (ENABLE_MINDDATA) +if(ENABLE_MINDDATA) add_subdirectory(minddata/mindrecord) add_subdirectory(minddata/dataset) -endif () +endif() -if (ENABLE_D) 
+if(ENABLE_D) find_library(adump_server libadump_server.a ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) target_link_libraries(_c_expression PRIVATE ${adump_server}) endif() diff --git a/mindspore/ccsrc/backend/optimizer/somas/somas.cc b/mindspore/ccsrc/backend/optimizer/somas/somas.cc index e6483e05ed..b21711cdc6 100644 --- a/mindspore/ccsrc/backend/optimizer/somas/somas.cc +++ b/mindspore/ccsrc/backend/optimizer/somas/somas.cc @@ -35,6 +35,11 @@ #include "utils/ms_context.h" #include "debug/common.h" #include "common/thread_pool.h" +#include "profiler/device/common/memory_profiling.h" + +using mindspore::profiler::MemoryProfiling; +using mindspore::profiler::NodeMemory; +using mindspore::profiler::TensorMemory; namespace mindspore { namespace somas { @@ -49,6 +54,11 @@ std::map tensor_type_name_map = {{kCommon, "Common"}, {kRefNodeOutput, "RefNodeOutput"}, {kUnknown, "Unknown"}}; +std::map life_long_name_map = {{kLifeLongNone, "LifeLongNone"}, + {kLifeLongGraphAll, "LifeLongGraphAll"}, + {kLifeLongGraphStart, "LifeLongGraphStart"}, + {kLifeLongGraphEnd, "LifeLongGraphEnd"}}; + bool Somas::Allocate(const session::KernelGraph *graph) { auto ret = InitSomasTensors(graph); if (!ret) { @@ -1413,5 +1423,43 @@ uint8_t *Somas::GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) const } return ptr; } + +void Somas::ConvertToProfilingNode(uint32_t graph_id) { +#ifdef ENABLE_D + auto graph_node = MemoryProfiling::GetInstance().GetGraphMemoryNode(graph_id); + if (graph_node == nullptr) { + graph_node = MemoryProfiling::GetInstance().AddGraphMemoryNode(graph_id); + MS_LOG(INFO) << "Add graph memory node for dynamic memory profiling, graph id is " << graph_id; + } + + for (const auto &tensor : tensors_list_) { + TensorMemory tensor_memory; + tensor_memory.SetTensorId(tensor->GetId()); + tensor_memory.SetAlignedSize(tensor->GetAlignedSize()); + tensor_memory.SetType(tensor_type_name_map[tensor->type_]); + tensor_memory.SetLifeStart(tensor->lifetime_.start_); + 
tensor_memory.SetLifeEnd(tensor->lifetime_.end_); + tensor_memory.SetLifeLong(life_long_name_map[tensor->lifelong_value_]); + graph_node->AddTensorMemory(tensor_memory); + } + + for (const auto &node : nodes_list_) { + NodeMemory node_memory; + std::string name = GetSplitName(node->scope_full_name_); + node_memory.SetNodeName(name); + node_memory.SetNodeId(node->GetId()); + for (const auto &tensor : node->input_tensors_) { + node_memory.AddInputTensorId(tensor->GetId()); + } + for (const auto &tensor : node->output_tensors_) { + node_memory.AddOutputTensorId(tensor->GetId()); + } + for (const auto &tensor : node->workspace_tensors_) { + node_memory.AddWorkSpaceTensorId(tensor->GetId()); + } + graph_node->AddNodeMemory(node_memory); + } +#endif +} } // namespace somas } // namespace mindspore diff --git a/mindspore/ccsrc/backend/optimizer/somas/somas.h b/mindspore/ccsrc/backend/optimizer/somas/somas.h index b0c7536c33..4cf54c41c8 100644 --- a/mindspore/ccsrc/backend/optimizer/somas/somas.h +++ b/mindspore/ccsrc/backend/optimizer/somas/somas.h @@ -54,6 +54,8 @@ class Somas { static bool NodeSort(SomasNodePtr, SomasNodePtr); std::vector reuse_matrix_; + std::vector tensor_relation; + void ConvertToProfilingNode(uint32_t graph_id); private: // Maps diff --git a/mindspore/ccsrc/backend/session/ascend_session.cc b/mindspore/ccsrc/backend/session/ascend_session.cc index c20a266379..477be37ed5 100644 --- a/mindspore/ccsrc/backend/session/ascend_session.cc +++ b/mindspore/ccsrc/backend/session/ascend_session.cc @@ -30,6 +30,7 @@ #include "runtime/device/ascend/kernel_select_ascend.h" #include "runtime/device/ascend/kernel_build_ascend.h" #include "runtime/device/ascend/ascend_kernel_runtime.h" +#include "runtime/device/ascend/profiling/profiling_manager.h" #include "backend/optimizer/ascend/ascend_backend_optimization.h" #include "backend/optimizer/common/common_backend_optimization.h" #include "backend/optimizer/ascend/mindir/dropout_unify_mindir.h" @@ -65,6 +66,11 @@ 
#include "ps/util.h" #include "ps/ps_cache/ps_cache_manager.h" #endif +#include "profiler/device/common/memory_profiling.h" + +using mindspore::device::ascend::ProfilingManager; +using mindspore::profiler::MemoryProfiling; + static constexpr uint32_t kLabelSwitchLabelId = 2; namespace mindspore { namespace session { @@ -649,6 +655,15 @@ GraphId AscendSession::CompileGraphImpl(NotNull func_graph) { root_graph->SetInputNodes(); root_graph->SetOptimizerFlag(); DumpAllGraphs(all_graphs); + // Save memory profiling data to proto file + if (ProfilingManager::GetInstance().IsProfiling()) { + auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_); + MS_EXCEPTION_IF_NULL(runtime_instance); + uint64_t mem_size = runtime_instance->GetAvailableMemMaxSize(); + auto instance = MemoryProfiling::GetInstance(); + instance.SetDeviceMemSize(mem_size); + instance.SaveMemoryProfiling(); + } // return the root_graph id to backend auto graph_id = root_graph->graph_id(); return graph_id; diff --git a/mindspore/ccsrc/profiler/CMakeLists.txt b/mindspore/ccsrc/profiler/CMakeLists.txt index 12b4bc1f6f..852fc6efe5 100644 --- a/mindspore/ccsrc/profiler/CMakeLists.txt +++ b/mindspore/ccsrc/profiler/CMakeLists.txt @@ -1,11 +1,14 @@ -if (ENABLE_GPU) +if(ENABLE_GPU) file(GLOB_RECURSE PROFILER_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "device/gpu/*.cc") - set_property(SOURCE ${PROFILER_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_PROFILER) + set_property(SOURCE ${PROFILER_SRC_LIST} PROPERTY COMPILE_DEFINITIONS + SUBMODULE_ID=mindspore::SubModuleId::SM_PROFILER) add_library(_mindspore_profiler_obj OBJECT ${PROFILER_SRC_LIST}) -endif () +endif() -if (ENABLE_D) - file(GLOB_RECURSE PROFILER_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "device/ascend/*.cc") - set_property(SOURCE ${PROFILER_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_PROFILER) +if(ENABLE_D) + file(GLOB_RECURSE 
PROFILER_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "device/ascend/*.cc" "device/common/*.cc") + set_property(SOURCE ${PROFILER_SRC_LIST} PROPERTY COMPILE_DEFINITIONS + SUBMODULE_ID=mindspore::SubModuleId::SM_PROFILER) add_library(_mindspore_profiler_obj OBJECT ${PROFILER_SRC_LIST}) -endif () \ No newline at end of file + add_dependencies(_mindspore_profiler_obj mindspore::protobuf) +endif() diff --git a/mindspore/ccsrc/profiler/device/common/memory_profiling.cc b/mindspore/ccsrc/profiler/device/common/memory_profiling.cc new file mode 100644 index 0000000000..c467915fcc --- /dev/null +++ b/mindspore/ccsrc/profiler/device/common/memory_profiling.cc @@ -0,0 +1,97 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "profiler/device/common/memory_profiling.h"
+#include <fstream>
+#include <memory>
+#include "utils/log_adapter.h"
+#include "utils/ms_context.h"
+
+namespace mindspore {
+namespace profiler {
+
+std::shared_ptr<GraphMemory> MemoryProfiling::AddGraphMemoryNode(uint32_t graph_id) {
+  std::shared_ptr<GraphMemory> node = std::make_shared<GraphMemory>(graph_id);
+  graph_memory_[graph_id] = node;  // replaces any node previously stored under this graph id
+  return node;
+}
+
+std::shared_ptr<GraphMemory> MemoryProfiling::GetGraphMemoryNode(uint32_t graph_id) {
+  auto node = graph_memory_.find(graph_id);
+  if (node != graph_memory_.end()) {
+    return node->second;
+  }
+
+  return nullptr;
+}
+
+void MemoryProfiling::MemoryToPB() {
+  memory_proto_.set_total_mem(device_mem_size_);
+  for (const auto &graph : graph_memory_) {
+    GraphMemProto *graph_proto = memory_proto_.add_graph_mem();
+    graph_proto->set_graph_id(graph.second->GetGraphId());
+    graph_proto->set_static_mem(graph.second->GetStaticMemSize());
+    // node memory to PB
+    for (const auto &node : graph.second->GetNodeMemory()) {
+      NodeMemProto *node_mem = graph_proto->add_node_mems();
+      node_mem->set_node_name(node.GetNodeName());
+      node_mem->set_node_id(node.GetNodeId());
+      for (const auto &id : node.GetInputTensorId()) {
+        node_mem->add_input_tensor_id(id);
+      }
+      for (const auto &id : node.GetOutputTensorId()) {
+        node_mem->add_output_tensor_id(id);
+      }
+      for (const auto &id : node.GetWorkspaceTensorId()) {  // workspace ids, not the output ids again
+        node_mem->add_workspace_tensor_id(id);
+      }
+    }
+    // tensor memory to PB
+    for (const auto &node : graph.second->GetTensorMemory()) {
+      TensorMemProto *tensor_mem = graph_proto->add_tensor_mems();
+      tensor_mem->set_tensor_id(node.GetTensorId());
+      tensor_mem->set_size(node.GetAlignedSize());
+      std::string type = node.GetType();
+      tensor_mem->set_type(type);
+      tensor_mem->set_life_start(node.GetLifeStart());
+      tensor_mem->set_life_end(node.GetLifeEnd());
+      std::string life_long = node.GetLifeLong();
+      tensor_mem->set_life_long(life_long);
+    }
+  }
+  MS_LOG(INFO) << "Memory profiling data to PB end";
+  return;
+}
+
+void MemoryProfiling::SaveMemoryProfiling() {
+  auto context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(context);
+  std::string dir_path = context->get_param<std::string>(MS_CTX_PROFILING_DIR_PATH);
+  auto device_id = context->get_param<uint32_t>(MS_CTX_DEVICE_ID);  // NOTE(review): uint32_t assumed - confirm
+  std::string file = dir_path + std::string("/memory_usage_") + std::to_string(device_id) + std::string(".pb");
+
+  MemoryToPB();
+
+  std::fstream handle(file, std::ios::out | std::ios::trunc | std::ios::binary);
+  if (!memory_proto_.SerializeToOstream(&handle)) {
+    MS_LOG(ERROR) << "Save memory profiling data to file failed";
+  }
+  handle.close();
+  MS_LOG(INFO) << "Start save memory profiling data to " << file << " end";
+  return;
+}
+}  // namespace profiler
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/profiler/device/common/memory_profiling.h b/mindspore/ccsrc/profiler/device/common/memory_profiling.h
new file mode 100644
index 0000000000..0e5470fd3e
--- /dev/null
+++ b/mindspore/ccsrc/profiler/device/common/memory_profiling.h
@@ -0,0 +1,124 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_PROFILER_DEVICE_COMMON_PROFILING_MEMORY_H
+#define MINDSPORE_PROFILER_DEVICE_COMMON_PROFILING_MEMORY_H
+
+#include "proto/memory_profiling.pb.h"
+#include <string>
+#include <map>
+#include <memory>
+#include <vector>
+#include "utils/ms_context.h"
+
+namespace mindspore {
+namespace profiler {
+
+class NodeMemory {
+ public:
+  NodeMemory() : node_name_(""), node_id_(0) {}
+  ~NodeMemory() = default;
+
+  void SetNodeName(const std::string &name) { node_name_ = name; }
+  void SetNodeId(uint64_t node_id) { node_id_ = node_id; }
+  void AddInputTensorId(uint64_t node_id) { input_tensor_id_.emplace_back(node_id); }
+  void AddOutputTensorId(uint64_t node_id) { output_tensor_id_.emplace_back(node_id); }
+  void AddWorkSpaceTensorId(uint64_t node_id) { workspace_tensor_id_.emplace_back(node_id); }
+  std::string GetNodeName() const { return node_name_; }
+  uint64_t GetNodeId() const { return node_id_; }
+  std::vector<uint64_t> GetInputTensorId() const { return input_tensor_id_; }
+  std::vector<uint64_t> GetOutputTensorId() const { return output_tensor_id_; }
+  std::vector<uint64_t> GetWorkspaceTensorId() const { return workspace_tensor_id_; }
+
+ private:
+  std::string node_name_;
+  uint64_t node_id_;
+  std::vector<uint64_t> input_tensor_id_;
+  std::vector<uint64_t> output_tensor_id_;
+  std::vector<uint64_t> workspace_tensor_id_;
+};
+
+class TensorMemory {
+ public:
+  TensorMemory() : tensor_id_(0), size_(0), type_(""), life_start_(0), life_end_(0), life_long_("") {}
+  ~TensorMemory() = default;
+
+  void SetTensorId(uint64_t tensor_id) { tensor_id_ = tensor_id; }
+  void SetAlignedSize(uint64_t size) { size_ = size; }
+  void SetType(const std::string &type) { type_ = type; }
+  void SetLifeStart(uint64_t start) { life_start_ = start; }
+  void SetLifeEnd(uint64_t end) { life_end_ = end; }
+  void SetLifeLong(const std::string &life_long) { life_long_ = life_long; }
+  uint64_t GetTensorId() const { return tensor_id_; }
+  uint64_t GetAlignedSize() const { return size_; }
+  std::string GetType() const { return type_; }
+  uint64_t GetLifeStart() const { return life_start_; }
+  uint64_t GetLifeEnd() const { return life_end_; }
+  std::string GetLifeLong() const { return life_long_; }
+
+ private:
+  uint64_t tensor_id_;
+  uint64_t size_;          // aligned tensor size
+  std::string type_;       // see TensorType in somas_tensor.h
+  uint64_t life_start_;    // the exe node id at which tensor memory allocated
+  uint64_t life_end_;      // the exe node id at which tensor memory deallocated
+  std::string life_long_;  // see LifeLongType in somas_tensor.h
+};
+
+class GraphMemory {
+ public:
+  explicit GraphMemory(uint32_t graph_id) : graph_id_(graph_id), static_mem_size_(0) {}
+  ~GraphMemory() = default;
+  void AddStaticMemorySize(uint64_t size) { static_mem_size_ += size; }
+  void AddNodeMemory(const NodeMemory &node) { node_memory_.emplace_back(node); }
+  void AddTensorMemory(const TensorMemory &node) { tensor_memory_.emplace_back(node); }
+  uint32_t GetGraphId() const { return graph_id_; }
+  uint64_t GetStaticMemSize() const { return static_mem_size_; }
+  std::vector<NodeMemory> GetNodeMemory() const { return node_memory_; }
+  std::vector<TensorMemory> GetTensorMemory() const { return tensor_memory_; }
+
+ private:
+  uint32_t graph_id_;
+  uint64_t static_mem_size_;  // 64-bit so an accumulated size of 4 GiB or more does not wrap
+  std::vector<NodeMemory> node_memory_;
+  std::vector<TensorMemory> tensor_memory_;
+};
+
+class MemoryProfiling {
+ public:
+  MemoryProfiling() = default;
+  ~MemoryProfiling() = default;
+
+  static MemoryProfiling &GetInstance() {
+    static MemoryProfiling instance;
+    return instance;
+  }
+
+  MemoryProto &GetMemProto() { return memory_proto_; }
+  std::shared_ptr<GraphMemory> AddGraphMemoryNode(uint32_t graph_id);
+  std::shared_ptr<GraphMemory> GetGraphMemoryNode(uint32_t graph_id);
+  void SetDeviceMemSize(uint64_t size) { device_mem_size_ = size; }
+  void MemoryToPB();
+  void SaveMemoryProfiling();
+
+ private:
+  MemoryProto memory_proto_;
+  std::map<uint32_t, std::shared_ptr<GraphMemory>> graph_memory_;
+  uint64_t device_mem_size_{0};  // written to total_mem by MemoryToPB(); 0 until SetDeviceMemSize() is called
+};
+}  // namespace profiler
+}  // namespace mindspore
+#endif
diff --git a/mindspore/ccsrc/profiler/device/common/memory_profiling.proto
b/mindspore/ccsrc/profiler/device/common/memory_profiling.proto new file mode 100644 index 0000000000..eb596e62c8 --- /dev/null +++ b/mindspore/ccsrc/profiler/device/common/memory_profiling.proto @@ -0,0 +1,50 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +syntax = "proto3"; + +package mindspore.profiler; + +message MemoryProto { + repeated GraphMemProto graph_mem = 1; // memory usage of multiple graphs + int64 total_mem = 2; // total allocated device memory +} + +message GraphMemProto { + int64 graph_id = 1; // graph id + int64 static_mem = 2; // size of allocated static memory for current graph + repeated NodeMemProto node_mems = 3; // execution nodes + repeated TensorMemProto tensor_mems = 4; // all tensors + string fp_start = 5; // node name of fp start + string bp_end = 6; // node name of bp end +} + +message NodeMemProto { + string node_name = 1; // node name + int64 node_id = 2; // node id with respect to the execution order + repeated int64 input_tensor_id = 3; // input tensor id + repeated int64 output_tensor_id = 4; // output tensor id + repeated int64 workspace_tensor_id = 5; // workspace tensor id +} + +message TensorMemProto { + int64 tensor_id = 1; // tensor id + int64 size = 2; // aligned tensor size + string type = 3; // tensor type, e.g. 
Common, OutputOnly + int64 life_start = 4; // the exe node id at which tensor memory allocated + int64 life_end = 5; // the exe node id at which tensor memory deallocated + string life_long = 6; // see LifeLongType enum +} diff --git a/mindspore/ccsrc/pybind_api/utils/ms_context_py.cc b/mindspore/ccsrc/pybind_api/utils/ms_context_py.cc index 006f4ab578..bd1df8b8c6 100644 --- a/mindspore/ccsrc/pybind_api/utils/ms_context_py.cc +++ b/mindspore/ccsrc/pybind_api/utils/ms_context_py.cc @@ -94,8 +94,8 @@ REGISTER_PYBIND_DEFINE(MsContextPy, ([](const py::module *m) { .value("save_graphs_path", MsCtxParam::MS_CTX_SAVE_GRAPHS_PATH) .value("variable_memory_max_size", MsCtxParam::MS_CTX_VARIABLE_MEMORY_MAX_SIZE) .value("device_id", MsCtxParam::MS_CTX_DEVICE_ID) - .value("max_call_depth", MsCtxParam::MS_CTX_MAX_CALL_DEPTH); - + .value("max_call_depth", MsCtxParam::MS_CTX_MAX_CALL_DEPTH) + .value("profiling_dir_path", MsCtxParam::MS_CTX_PROFILING_DIR_PATH); (void)py::class_>(*m, "MSContext") .def_static("get_instance", &mindspore::MsContext::GetInstance, "Get ms context instance.") .def("get_param", &mindspore::MsCtxGetParameter, "Get value of specified paramter.") diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc index 890fad9b74..fd116d1ec7 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc @@ -895,4 +895,9 @@ void AscendKernelRuntime::KernelLaunchProfiling(const std::string &kernel_name) MS_LOG(EXCEPTION) << "Too many profiling data"; } } + +uint64_t AscendKernelRuntime::GetAvailableMemMaxSize() const { + auto ascend_mem_manager = dynamic_pointer_cast<AscendMemoryManager>(mem_manager_); // null if unset or another type + return ascend_mem_manager == nullptr ? 0 : ascend_mem_manager->GetDeviceMemSize(); +} } // namespace mindspore::device::ascend diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h
index bb4c2c9206..4d7955c865 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h @@ -55,6 +55,7 @@ class AscendKernelRuntime : public KernelRuntime { void CreateContext() override; void *context() const override { return rt_context_; } void PreInit() override; + uint64_t GetAvailableMemMaxSize() const override; protected: DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_memory_manager.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_memory_manager.cc index 58b6e4fbde..46a64dda83 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_memory_manager.cc +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_memory_manager.cc @@ -18,6 +18,12 @@ #include "runtime/device/ascend/ascend_memory_pool.h" #include "utils/ms_context.h" #include "runtime/mem.h" +#include "runtime/device/ascend/profiling/profiling_manager.h" +#include "profiler/device/common/memory_profiling.h" + +using mindspore::device::ascend::ProfilingManager; +using mindspore::profiler::MemoryProfiling; + namespace mindspore { namespace device { namespace ascend { @@ -44,6 +50,11 @@ void AscendMemoryManager::MallocDeviceMemory() { AscendMemoryPool::GetInstance().Init(device_mem_base_, device_mem_size_, dynamic_mem_offset_); } +uint64_t AscendMemoryManager::GetDeviceMemSize() { + auto mem_size = GetDeviceMemSizeFromContext(); + return mem_size == 0 ?
kAscendDeviceMemSize : mem_size; +} + uint64_t AscendMemoryManager::GetDeviceMemSizeFromContext() { auto context = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context); @@ -88,7 +99,7 @@ void *AscendMemoryManager::MallocMemFromMemPool(size_t size) { return AscendMemoryPool::GetInstance().AllocTensorMem(align_size); } -uint8_t *AscendMemoryManager::MallocStaticMem(size_t size, bool communication_mem) { +uint8_t *AscendMemoryManager::MallocStaticMem(size_t size, bool communication_mem, uint32_t graph_id) { size_t align_size = 0; if (communication_mem) { align_size = GetCommunicationAlignSize(size); @@ -96,6 +107,16 @@ uint8_t *AscendMemoryManager::MallocStaticMem(size_t size, bool communication_me align_size = GetCommonAlignSize(size); } + if (ProfilingManager::GetInstance().IsProfiling() && graph_id != kInvalidGraphId) { + auto node = MemoryProfiling::GetInstance().GetGraphMemoryNode(graph_id); + if (node == nullptr) { + node = MemoryProfiling::GetInstance().AddGraphMemoryNode(graph_id); + MS_LOG(INFO) << "Add graph memory node for static memory profiling, graph id is " << graph_id; + } + + node->AddStaticMemorySize(align_size); + } + auto device_mem_pool_offset = AscendMemoryPool::GetInstance().device_mem_pool_offset(); MS_LOG(INFO) << "Malloc Memory: Static, total[" << device_mem_size_ << "] (dynamic[" << total_dynamic_size_ << "] memory pool[" << device_mem_size_ - device_mem_pool_offset << "])" @@ -139,6 +160,13 @@ uint8_t *AscendMemoryManager::MallocDynamicMem(size_t size, bool communication_m return device_mem_base_ + offset; } } + +void AscendMemoryManager::MallocSomasDynamicMem(const session::KernelGraph *graph) { + MemoryManager::MallocSomasDynamicMem(graph); + if (ProfilingManager::GetInstance().IsProfiling()) { + somas_reuse_util_ptr_->ConvertToProfilingNode(graph->graph_id()); + } +} } // namespace ascend } // namespace device } // namespace mindspore diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_memory_manager.h 
b/mindspore/ccsrc/runtime/device/ascend/ascend_memory_manager.h index 93bb2951e2..14f5f29f93 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_memory_manager.h +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_memory_manager.h @@ -31,9 +31,11 @@ class AscendMemoryManager : public MemoryManager { void ResetDynamicMemory() override; void ClearGlobalIdleMem() override; void *MallocMemFromMemPool(size_t size) override; + uint64_t GetDeviceMemSize(); + void MallocSomasDynamicMem(const session::KernelGraph *graph) override; protected: - uint8_t *MallocStaticMem(size_t size, bool communication_mem) override; + uint8_t *MallocStaticMem(size_t size, bool communication_mem, uint32_t graph_id = kInvalidGraphId) override; uint8_t *MallocDynamicMem(size_t size, bool communication_mem) override; private: diff --git a/mindspore/ccsrc/runtime/device/cpu/cpu_memory_manager.cc b/mindspore/ccsrc/runtime/device/cpu/cpu_memory_manager.cc index c1c5008717..242880b161 100644 --- a/mindspore/ccsrc/runtime/device/cpu/cpu_memory_manager.cc +++ b/mindspore/ccsrc/runtime/device/cpu/cpu_memory_manager.cc @@ -22,7 +22,7 @@ namespace mindspore { namespace device { namespace cpu { -uint8_t *CPUMemoryManager::MallocStaticMem(size_t size, bool) { +uint8_t *CPUMemoryManager::MallocStaticMem(size_t size, bool, uint32_t) { void *ptr = malloc(size); if (ptr != nullptr) { memset_s(ptr, size, 0, size); diff --git a/mindspore/ccsrc/runtime/device/cpu/cpu_memory_manager.h b/mindspore/ccsrc/runtime/device/cpu/cpu_memory_manager.h index 08f0052b7a..f384244d2b 100644 --- a/mindspore/ccsrc/runtime/device/cpu/cpu_memory_manager.h +++ b/mindspore/ccsrc/runtime/device/cpu/cpu_memory_manager.h @@ -44,7 +44,7 @@ class CPUMemoryManager : public MemoryManager { void DecreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs); protected: - uint8_t *MallocStaticMem(size_t size, bool communication_mem) override; + uint8_t *MallocStaticMem(size_t size, bool communication_mem, uint32_t graph_id =
kInvalidGraphId) override; uint8_t *MallocDynamicMem(size_t size, bool communication_mem) override; private: diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_memory_manager.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_memory_manager.cc index d11ec324cf..1a8da802d5 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_memory_manager.cc +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_memory_manager.cc @@ -101,7 +101,7 @@ void GPUMemoryManager::FreeDeviceMemory() { GPUMemoryAllocator::GetInstance().ReleaseDeviceRes(); } -uint8_t *GPUMemoryManager::MallocStaticMem(size_t size, bool) { +uint8_t *GPUMemoryManager::MallocStaticMem(size_t size, bool, uint32_t) { auto context_ptr = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context_ptr); if (context_ptr->get_param(MS_CTX_ENABLE_DYNAMIC_MEM_POOL)) { diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_memory_manager.h b/mindspore/ccsrc/runtime/device/gpu/gpu_memory_manager.h index e0cc2988ed..11dba08981 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_memory_manager.h +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_memory_manager.h @@ -36,7 +36,7 @@ class GPUMemoryManager : public MemoryManager { std::vector size_list) override; protected: - uint8_t *MallocStaticMem(size_t size, bool communication_mem) override; + uint8_t *MallocStaticMem(size_t size, bool communication_mem, uint32_t graph_id = kInvalidGraphId) override; }; } // namespace gpu } // namespace device diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.cc b/mindspore/ccsrc/runtime/device/kernel_runtime.cc index c32c0b5ae1..609a8f316f 100644 --- a/mindspore/ccsrc/runtime/device/kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/kernel_runtime.cc @@ -360,7 +360,7 @@ void KernelRuntime::AssignStaticMemoryInput(const session::KernelGraph *graph) { auto tensor_size = CountNodeDeviceMemorySize(item, index); device_address = CreateDeviceAddress(nullptr, tensor_size, AnfAlgo::GetOutputFormat(item, index), output_type_id); MS_LOG(DEBUG) << "Malloc static 
memory for " << item->fullname_with_scope(); - if (mem_manager_->MallocMem(kStaticMem, tensor_size, device_address) == nullptr) { + if (mem_manager_->MallocMem(kStaticMem, tensor_size, device_address, graph->graph_id()) == nullptr) { MS_LOG(EXCEPTION) << "Cannot alloc address when flag is: " << kStaticMem << ", tensor size is: " << tensor_size; } MS_LOG(INFO) << "Malloc Input for graph " << graph->graph_id() << ", node: " << item->fullname_with_scope() @@ -629,6 +629,10 @@ void KernelRuntime::AssignValueNodeTensor(const ValueNodePtr &value_node, const MS_EXCEPTION_IF_NULL(ms_context); std::vector tensors; TensorValueToTensor(node_value, &tensors); + // Graph id should be passed to record static memory if profiling is enabled. + auto kernel_info = static_cast(value_node->kernel_info()); + MS_EXCEPTION_IF_NULL(kernel_info); + uint32_t graph_id = kernel_info->graph_id(); for (const auto &tensor : tensors) { if (tensor == nullptr) { MS_LOG(WARNING) << "Tensor is null"; @@ -651,7 +655,7 @@ void KernelRuntime::AssignValueNodeTensor(const ValueNodePtr &value_node, const if (ms_context->get_param(MS_CTX_ENABLE_PYNATIVE_INFER) && !mem_manager_->MallocMemFromMemPool(address, node_size)) { MS_LOG(EXCEPTION) << "Cannot alloc address from memory pool when tensor size is: " << node_size; - } else if (mem_manager_->MallocMem(kStaticMem, node_size, address) == nullptr) { + } else if (mem_manager_->MallocMem(kStaticMem, node_size, address, graph_id) == nullptr) { MS_LOG(EXCEPTION) << "Cannot alloc address when flag is: " << kStaticMem << ", tensor size is: " << node_size; } AnfAlgo::SetOutputAddr(address, output_idx, value_node.get()); @@ -662,6 +666,8 @@ void KernelRuntime::AssignValueNodeTensor(const ValueNodePtr &value_node, const << "node dtype is " << AnfAlgo::GetOutputInferDataType(value_node, output_idx); } } + + return; } void KernelRuntime::AssignStaticMemoryValueNode(session::KernelGraph *graph) { @@ -690,7 +696,7 @@ void 
KernelRuntime::AssignStaticMemoryValueNode(session::KernelGraph *graph) { if (ms_context->get_param(MS_CTX_ENABLE_PYNATIVE_INFER) && !mem_manager_->MallocMemFromMemPool(address, tensor_size)) { MS_LOG(EXCEPTION) << "Cannot alloc address from memory pool when tensor size is: " << tensor_size; - } else if (mem_manager_->MallocMem(kStaticMem, tensor_size, address) == nullptr) { + } else if (mem_manager_->MallocMem(kStaticMem, tensor_size, address, graph->graph_id()) == nullptr) { MS_LOG(EXCEPTION) << "Cannot alloc address when flag is: " << kStaticMem << ", tensor size is: " << tensor_size; } AnfAlgo::SetOutputAddr(address, 0, value_node.get()); diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.h b/mindspore/ccsrc/runtime/device/kernel_runtime.h index 1b046cc9b2..f759db9e88 100644 --- a/mindspore/ccsrc/runtime/device/kernel_runtime.h +++ b/mindspore/ccsrc/runtime/device/kernel_runtime.h @@ -100,6 +100,7 @@ class KernelRuntime { } virtual void PreInit() {} + virtual uint64_t GetAvailableMemMaxSize() const { return 0; } protected: virtual DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, diff --git a/mindspore/ccsrc/runtime/device/memory_manager.cc b/mindspore/ccsrc/runtime/device/memory_manager.cc index b497b5f353..b7422fca4e 100644 --- a/mindspore/ccsrc/runtime/device/memory_manager.cc +++ b/mindspore/ccsrc/runtime/device/memory_manager.cc @@ -18,8 +18,10 @@ #include #include "backend/session/anf_runtime_algorithm.h" #include "utils/ms_context.h" + using mindspore::memreuse::BestFitMemReuse; using mindspore::memreuse::MemReuseUtilPtr; + namespace mindspore { namespace device { size_t MemoryManager::GetCommonAlignSize(size_t input_size) const { @@ -139,11 +141,11 @@ uint8_t *MemoryManager::MallocWorkSpaceMem(const AnfNodePtr &node, size_t index, return MallocDynamicMem(size, false); } -uint8_t *MemoryManager::MallocMem(MemType type, size_t size, const DeviceAddressPtr &address) { +uint8_t 
*MemoryManager::MallocMem(MemType type, size_t size, const DeviceAddressPtr &address, uint32_t graph_id) { MS_EXCEPTION_IF_NULL(address); uint8_t *ptr = nullptr; if (type == kStaticMem) { - ptr = MallocStaticMem(size, false); + ptr = MallocStaticMem(size, false, graph_id); address->from_mem_pool_ = true; } else if (type == kDynamicMem) { ptr = MallocDynamicMem(size, false); @@ -152,7 +154,7 @@ uint8_t *MemoryManager::MallocMem(MemType type, size_t size, const DeviceAddress return ptr; } -uint8_t *MemoryManager::MallocStaticMem(size_t size, bool communication_mem) { +uint8_t *MemoryManager::MallocStaticMem(size_t size, bool communication_mem, uint32_t graph_id) { size_t align_size = 0; if (communication_mem) { align_size = GetCommunicationAlignSize(size); diff --git a/mindspore/ccsrc/runtime/device/memory_manager.h b/mindspore/ccsrc/runtime/device/memory_manager.h index 6fce89c881..7f0564c581 100644 --- a/mindspore/ccsrc/runtime/device/memory_manager.h +++ b/mindspore/ccsrc/runtime/device/memory_manager.h @@ -44,11 +44,12 @@ class MemoryManager { virtual void ClearGlobalIdleMem() {} void MallocReusedDynamicMem(const session::KernelGraph *graph); - void MallocSomasDynamicMem(const session::KernelGraph *graph); + virtual void MallocSomasDynamicMem(const session::KernelGraph *graph); uint8_t *MallocOutputMem(const AnfNodePtr &node, size_t index, MemType type, size_t size, const DeviceAddressPtr &address, bool comm_mem); uint8_t *MallocWorkSpaceMem(const AnfNodePtr &node, size_t index, MemType type, size_t size); - virtual uint8_t *MallocMem(MemType type, size_t size, const DeviceAddressPtr &address); + virtual uint8_t *MallocMem(MemType type, size_t size, const DeviceAddressPtr &address, + uint32_t graph_id = kInvalidGraphId); virtual bool MallocMemFromMemPool(const DeviceAddressPtr address, size_t size); virtual void *MallocMemFromMemPool(size_t size); @@ -62,7 +63,7 @@ class MemoryManager { size_t GetCommunicationAlignSize(size_t input_size) const; protected: - 
virtual uint8_t *MallocStaticMem(size_t size, bool communication_mem); + virtual uint8_t *MallocStaticMem(size_t size, bool communication_mem, uint32_t graph_id = kInvalidGraphId); virtual uint8_t *MallocDynamicMem(size_t size, bool communication_mem); uint8_t *device_mem_base_{nullptr}; uint64_t device_mem_size_{0}; diff --git a/mindspore/core/utils/ms_context.cc b/mindspore/core/utils/ms_context.cc index ed12e9111c..9bd0766936 100644 --- a/mindspore/core/utils/ms_context.cc +++ b/mindspore/core/utils/ms_context.cc @@ -73,6 +73,7 @@ MsContext::MsContext(const std::string &policy, const std::string &target) { set_param(MS_CTX_ENABLE_GRAPH_KERNEL, false); set_param(MS_CTX_ENABLE_SPARSE, false); set_param(MS_CTX_ENABLE_PARALLEL_SPLIT, false); + set_param(MS_CTX_PROFILING_DIR_PATH, ""); backend_policy_ = policy_map_[policy]; } diff --git a/mindspore/core/utils/ms_context.h b/mindspore/core/utils/ms_context.h index 98948d10e8..3ece84d965 100644 --- a/mindspore/core/utils/ms_context.h +++ b/mindspore/core/utils/ms_context.h @@ -104,6 +104,7 @@ enum MsCtxParam : unsigned { MS_CTX_SAVE_GRAPHS_PATH, MS_CTX_VARIABLE_MEMORY_MAX_SIZE, MS_CTX_PYTHON_EXE_PATH, + MS_CTX_PROFILING_DIR_PATH, MS_CTX_TYPE_STRING_END, // parameter numbers of each type diff --git a/mindspore/profiler/profiling.py b/mindspore/profiler/profiling.py index 4048c3d2d3..6dd2e76207 100644 --- a/mindspore/profiler/profiling.py +++ b/mindspore/profiler/profiling.py @@ -140,7 +140,8 @@ class Profiler: logger.error(msg) raise ValueError(msg) # use context interface to open profiling, for the new mindspore version(after 2020.5.21) - context.set_context(enable_profiling=True, profiling_options=profiling_options) + context.set_context(enable_profiling=True, profiling_options=profiling_options, + profiling_dir_path=self._output_path) base_profiling_container_path = os.path.join(self._output_path, "container") container_path = os.path.join(base_profiling_container_path, self._dev_id) data_path = 
os.path.join(container_path, "data") diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt index 485a281765..3e233fb202 100644 --- a/tests/ut/cpp/CMakeLists.txt +++ b/tests/ut/cpp/CMakeLists.txt @@ -4,12 +4,12 @@ message("build ut testcases...") project(ut) set(PROJECT_DIR "${PROJECT_SOURCE_DIR}/../../..") -if (ENABLE_DUMP_IR) +if(ENABLE_DUMP_IR) add_compile_definitions(ENABLE_DUMP_IR) -endif (ENABLE_DUMP_IR) -if (ENABLE_D) +endif() +if(ENABLE_D) add_compile_definitions(ENABLE_D) -endif () +endif() #add python lib and include for all ut executables; message("PYTHON_INCLUDE_DIRS = ${PYTHON_INCLUDE_DIRS}") @@ -25,13 +25,13 @@ MESSAGE("check ut_test ${CMAKE_BINARY_DIR}") link_directories(${MS_CCSRC_BUILD_PATH}) -if (ENABLE_MINDDATA) +if(ENABLE_MINDDATA) add_definitions(-D ENABLE_MINDDATA) link_directories(${MS_CCSRC_BUILD_PATH}/minddata/dataset) link_directories(${MS_CCSRC_BUILD_PATH}/minddata/mindrecord) -endif () +endif() # fetch ut test files -if (ENABLE_MINDDATA) +if(ENABLE_MINDDATA) include_directories(${CMAKE_SOURCE_DIR}/mindspore/ccsrc/minddata/dataset/kernels/image) file(GLOB_RECURSE UT_SRCS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ./stub/*.cc @@ -61,7 +61,7 @@ if (ENABLE_MINDDATA) ./cxx_api/*.cc ) - if (NOT ENABLE_PYTHON) + if(NOT ENABLE_PYTHON) set(PYTHON_RELATED_SRCS dataset/filter_op_test.cc dataset/voc_op_test.cc @@ -69,15 +69,15 @@ if (ENABLE_MINDDATA) dataset/sentence_piece_vocab_op_test.cc ) list(REMOVE_ITEM UT_SRCS ${PYTHON_RELATED_SRCS}) - endif () -else () + endif() +else() file(GLOB_RECURSE TEMP_UT_SRCS ./*.cc) - foreach (OBJ ${TEMP_UT_SRCS}) - if (NOT ${OBJ} MATCHES "./dataset/" AND NOT ${OBJ} MATCHES "./mindrecord/") + foreach(OBJ ${TEMP_UT_SRCS}) + if(NOT ${OBJ} MATCHES "./dataset/" AND NOT ${OBJ} MATCHES "./mindrecord/") list(APPEND UT_SRCS ${OBJ}) - endif () - endforeach () -endif () + endif() + endforeach() +endif() file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} 
"../../../mindspore/ccsrc/pybind_api/*.cc" @@ -133,9 +133,11 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "../../../mindspore/ccsrc/transform/graph_ir/*.cc" "../../../mindspore/ccsrc/transform/graph_ir/op_declare/*.cc" "../../../mindspore/ccsrc/ps/*.cc" + "../../../mindspore/ccsrc/profiler/device/common/*.cc" ) -list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/frontend/parallel/strategy_checkpoint/parallel_strategy_checkpoint.cc") +list(REMOVE_ITEM MINDSPORE_SRC_LIST + "../../../mindspore/ccsrc/frontend/parallel/strategy_checkpoint/parallel_strategy_checkpoint.cc") list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/ps/util.cc") list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/ps/scheduler.cc") list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/ps/optimizer_info.cc") @@ -154,31 +156,32 @@ add_dependencies(_ut_ut_obj engine-cache-server) add_executable(ut_tests $ $) -if (ENABLE_GE) - if (ENABLE_TRAIN) +if(ENABLE_GE) + if(ENABLE_TRAIN) target_link_libraries(ut_tests PRIVATE graph ge_runner) - else () + else() target_link_libraries(ut_tests PRIVATE graph ge_client) - endif () + endif() target_link_libraries(mindspore PRIVATE tsdclient) -endif () +endif() -if (CMAKE_SYSTEM_NAME MATCHES "Linux") - target_link_libraries(ut_tests PRIVATE mindspore::gtest mindspore::event mindspore::event_pthreads mindspore_gvar ${PYTHON_LIBRARIES} pthread util dl) - if (ENABLE_MINDDATA) +if(CMAKE_SYSTEM_NAME MATCHES "Linux") + target_link_libraries(ut_tests PRIVATE mindspore::gtest mindspore::event mindspore::event_pthreads + mindspore_gvar ${PYTHON_LIBRARIES} pthread util dl) + if(ENABLE_MINDDATA) # AUX_SOURCE_DIRECTORY(LITE_CV_FILES) # message(STATUS "xxxxxxxxxxxxxxxxx"${LITE_CV_FILES} ) # add_library(_live_cv OBJECT ${LITE_CV_FILES}) target_link_libraries(ut_tests PRIVATE _c_dataengine _c_mindrecord) - endif () -else () + endif() +else() target_link_libraries(ut_tests PRIVATE mindspore::gtest mindspore_gvar 
${PYTHON_LIBRARIES}) -endif () -if (USE_GLOG) +endif() +if(USE_GLOG) target_link_libraries(ut_tests PRIVATE mindspore::glog) -endif () +endif() target_link_libraries(ut_tests PRIVATE mindspore mindspore_shared_lib securec graph)