| @@ -24,9 +24,6 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") | |||
| -Wno-return-std-move -Wno-unused-private-field -Wno-unused-lambda-capture -Wno-sign-compare \ | |||
| -Wno-overloaded-virtual -Wno-unneeded-internal-declaration -Wno-unused-variable -Wno-pessimizing-move \ | |||
| -Wno-inconsistent-missing-override -DHALF_ENABLE_CPP11_USER_LITERALS=0 -D_FORTIFY_SOURCE=2") | |||
| elseif(ENABLE_SYM_FILE) | |||
| set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O2 -g -ggdb -Wl,--allow-shlib-undefined \ | |||
| -DHALF_ENABLE_CPP11_USER_LITERALS=0 -D_FORTIFY_SOURCE=2") | |||
| else() | |||
| set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O2 -Wl,--allow-shlib-undefined \ | |||
| -DHALF_ENABLE_CPP11_USER_LITERALS=0 -D_FORTIFY_SOURCE=2") | |||
| @@ -18,7 +18,7 @@ | |||
| SET BASE_PATH=%CD% | |||
| SET BUILD_PATH=%BASE_PATH%/build | |||
| SET threads=8 | |||
| SET threads=6 | |||
| SET ENABLE_GITEE=OFF | |||
| set VERSION_MAJOR='' | |||
| @@ -27,7 +27,7 @@ usage() | |||
| echo " [-P on|off] [-z [on|off]] [-M on|off] [-V 10.1|11.1|310|910] [-I arm64|arm32|x86_64] [-K] \\" | |||
| echo " [-B on|off] [-E] [-l on|off] [-n full|lite|off] [-H on|off] \\" | |||
| echo " [-A on|off] [-S on|off] [-k on|off] [-W sse|neon|avx|avx512|off] \\" | |||
| echo " [-L Tensor-RT path] [-y on|off] \\" | |||
| echo " [-L Tensor-RT path] \\" | |||
| echo "" | |||
| echo "Options:" | |||
| echo " -d Debug mode" | |||
| @@ -61,10 +61,9 @@ usage() | |||
| echo " -l Compile with python dependency, default on" | |||
| echo " -S Enable enable download cmake compile dependency from gitee , default off" | |||
| echo " -k Enable make clean, clean up compilation generated cache " | |||
| echo " -W Enable SIMD instruction set, use [sse|neon|avx|avx512|off], default avx for cloud CPU backend" | |||
| echo " -W Enable x86_64 SSE or AVX instruction set, use [sse|neon|avx|avx512|off], default off for lite and avx for CPU" | |||
| echo " -H Enable hidden" | |||
| echo " -L Link and specify Tensor-RT library path, default disable Tensor-RT lib linking" | |||
| echo " -y Compile the symbol table switch and save the symbol table to the directory output" | |||
| } | |||
| # check value of input is 'on' or 'off' | |||
| @@ -123,9 +122,8 @@ checkopts() | |||
| TENSORRT_HOME="" | |||
| USER_ENABLE_DUMP_IR=false | |||
| USER_ENABLE_DEBUGGER=false | |||
| ENABLE_SYM_FILE="off" | |||
| # Process the options | |||
| while getopts 'drvj:c:t:hb:s:a:g:p:ie:m:l:I:RP:D:zM:V:K:B:En:A:S:k:W:H:L:y' opt | |||
| while getopts 'drvj:c:t:hb:s:a:g:p:ie:m:l:I:RP:D:zM:V:K:B:En:A:S:k:W:H:L:' opt | |||
| do | |||
| CASE_SENSIVE_ARG=${OPTARG} | |||
| OPTARG=$(echo ${OPTARG} | tr '[A-Z]' '[a-z]') | |||
| @@ -142,9 +140,6 @@ checkopts() | |||
| exit 1 | |||
| fi | |||
| ;; | |||
| y) | |||
| ENABLE_SYM_FILE="on" | |||
| ;; | |||
| r) | |||
| DEBUG_MODE="off" | |||
| ;; | |||
| @@ -447,9 +442,6 @@ build_mindspore() | |||
| if [[ -n "$TRAIN_MODE" ]]; then | |||
| CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_${TRAIN_MODE}=ON" | |||
| fi | |||
| if [[ "X$ENABLE_SYM_FILE" = "Xon" ]]; then | |||
| CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_SYM_FILE=ON" | |||
| fi | |||
| if [[ "X$ENABLE_ASAN" = "Xon" ]]; then | |||
| CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_ASAN=ON" | |||
| fi | |||
| @@ -0,0 +1,44 @@ | |||
| set(FFMPEG_FLAGS | |||
| --disable-programs | |||
| --disable-doc | |||
| --disable-debug | |||
| --disable-avdevice | |||
| --disable-postproc | |||
| --disable-avfilter | |||
| --disable-network | |||
| --disable-encoders | |||
| --disable-hwaccels | |||
| --disable-muxers | |||
| --disable-bsfs | |||
| --disable-protocols | |||
| --enable-protocol=file | |||
| --enable-protocol=pipe | |||
| --disable-indevs | |||
| --disable-outdevs | |||
| --disable-devices | |||
| --disable-filters | |||
| --disable-bzlib | |||
| --disable-iconv | |||
| --disable-libxcb | |||
| --disable-lzma | |||
| --disable-sdl2 | |||
| --disable-xlib | |||
| --disable-zlib) | |||
| set(REQ_URL "https://github.com/FFmpeg/FFmpeg/archive/n4.3.1.tar.gz") | |||
| set(MD5 "426ca412ca61634a248c787e29507206") | |||
| mindspore_add_pkg(ffmpeg | |||
| VER 4.3.1 | |||
| LIBS avcodec avformat avutil swresample swscale | |||
| URL ${REQ_URL} | |||
| MD5 ${MD5} | |||
| CONFIGURE_COMMAND ./configure --disable-static --enable-shared --disable-x86asm ${FFMPEG_FLAGS} | |||
| ) | |||
| include_directories(${ffmpeg_INC}) | |||
| add_library(mindspore::avcodec ALIAS ffmpeg::avcodec) | |||
| add_library(mindspore::avformat ALIAS ffmpeg::avformat) | |||
| add_library(mindspore::avutil ALIAS ffmpeg::avutil) | |||
| add_library(mindspore::swresample ALIAS ffmpeg::swresample) | |||
| add_library(mindspore::swscale ALIAS ffmpeg::swscale) | |||
| @@ -1,10 +1,10 @@ | |||
| if(MSVC) | |||
| set(flatbuffers_CXXFLAGS "${CMAKE_CXX_FLAGS}") | |||
| set(flatbuffers_CFLAGS "${CMAKE_C_FLAGS}") | |||
| set(flatbuffers_CFLAGS "${CMAKE_CXX_FLAGS}") | |||
| set(flatbuffers_LDFLAGS "${CMAKE_SHARED_LINKER_FLAGS}") | |||
| else() | |||
| set(flatbuffers_CXXFLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O2 -fstack-protector-strong") | |||
| set(flatbuffers_CFLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O2 -fstack-protector-strong") | |||
| set(flatbuffers_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2") | |||
| set(flatbuffers_CFLAGS "-D_FORTIFY_SOURCE=2 -O2") | |||
| endif() | |||
| if(WIN32) | |||
| @@ -1,15 +1,13 @@ | |||
| set(glog_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2 ${SECURE_CXX_FLAGS} -Dgoogle=mindspore_private") | |||
| set(glog_CFLAGS "-D_FORTIFY_SOURCE=2 -O2") | |||
| if(NOT ENABLE_GLIBCXX) | |||
| set(glog_CXXFLAGS "${glog_CXXFLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") | |||
| endif() | |||
| if(BUILD_LITE) | |||
| set(glog_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2 ${SECURE_CXX_FLAGS} -Dgoogle=mindspore_private") | |||
| set(glog_CFLAGS "-D_FORTIFY_SOURCE=2 -O2 ${SECURE_C_FLAGS}") | |||
| set(glog_LDFLAGS "${SECURE_SHARED_LINKER_FLAGS}") | |||
| set(glog_patch "") | |||
| set(glog_lib glog) | |||
| else() | |||
| set(glog_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2 ${SECURE_CXX_FLAGS} -Dgoogle=mindspore_private") | |||
| set(glog_CFLAGS "-D_FORTIFY_SOURCE=2 -O2") | |||
| if(NOT ENABLE_GLIBCXX) | |||
| set(glog_CXXFLAGS "${glog_CXXFLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") | |||
| endif() | |||
| set(glog_patch ${CMAKE_SOURCE_DIR}/third_party/patch/glog/glog.patch001) | |||
| set(glog_lib mindspore_glog) | |||
| endif() | |||
| @@ -9,7 +9,7 @@ endif() | |||
| if(ENABLE_GITEE) | |||
| set(REQ_URL "https://gitee.com/mirrors/JSON-for-Modern-CPP/repository/archive/v3.6.1.zip") | |||
| set(MD5 "36ea0d9a709c6667b2798a62f6b197ae") | |||
| set(MD5 "5bda78ce308e6cfcf614dcf1d5ff27a7") | |||
| set(INCLUDE "./include") | |||
| else() | |||
| set(REQ_URL "https://github.com/nlohmann/json/releases/download/v3.6.1/include.zip") | |||
| @@ -23,4 +23,4 @@ mindspore_add_pkg(nlohmann_json | |||
| URL ${REQ_URL} | |||
| MD5 ${MD5}) | |||
| include_directories(${nlohmann_json_INC}) | |||
| add_library(mindspore::json ALIAS nlohmann_json) | |||
| add_library(mindspore::json ALIAS nlohmann_json) | |||
| @@ -89,6 +89,7 @@ if(ENABLE_MINDDATA) | |||
| include(${CMAKE_SOURCE_DIR}/cmake/external_libs/tinyxml2.cmake) | |||
| include(${CMAKE_SOURCE_DIR}/cmake/external_libs/cppjieba.cmake) | |||
| include(${CMAKE_SOURCE_DIR}/cmake/external_libs/sentencepiece.cmake) | |||
| include(${CMAKE_SOURCE_DIR}/cmake/external_libs/ffmpeg.cmake) | |||
| endif() | |||
| if(ENABLE_MINDDATA) | |||
| @@ -25,7 +25,6 @@ option(ENABLE_ACL "enable acl" OFF) | |||
| option(ENABLE_GLIBCXX "enable_glibcxx" OFF) | |||
| option(MODE_ASCEND_ALL "supports all ascend platform" OFF) | |||
| option(MODE_ASCEND_ACL "supports ascend acl mode only" OFF) | |||
| option(ENABLE_SYM_FILE "enable sym file" OFF) | |||
| if(NOT ENABLE_D AND NOT ENABLE_TESTCASES AND NOT ENABLE_ACL AND NOT ENABLE_GE) | |||
| set(ENABLE_GLIBCXX ON) | |||
| @@ -12,8 +12,6 @@ set(CPACK_TEMPORARY_PACKAGE_FILE_NAME ${BUILD_PATH}/package/mindspore) | |||
| set(CPACK_TEMPORARY_INSTALL_DIRECTORY ${BUILD_PATH}/package/mindspore) | |||
| set(CPACK_PACK_ROOT_DIR ${BUILD_PATH}/package/) | |||
| set(CPACK_CMAKE_SOURCE_DIR ${CMAKE_SOURCE_DIR}) | |||
| set(CPACK_ENABLE_SYM_FILE ${ENABLE_SYM_FILE}) | |||
| set(CPACK_CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE}) | |||
| if(ENABLE_GE) | |||
| set(CPACK_MS_BACKEND "ge") | |||
| set(CPACK_MS_TARGET "ascend or cpu") | |||
| @@ -127,6 +125,17 @@ if(ENABLE_MINDDATA) | |||
| DESTINATION ${INSTALL_LIB_DIR} RENAME libicudata.so.67 COMPONENT mindspore) | |||
| install(FILES ${icu4c_LIBPATH}/libicui18n.so.67.1 | |||
| DESTINATION ${INSTALL_LIB_DIR} RENAME libicui18n.so.67 COMPONENT mindspore) | |||
| install(FILES ${ffmpeg_LIBPATH}/libavcodec.so.58.91.100 | |||
| DESTINATION ${INSTALL_LIB_DIR} RENAME libavcodec.so.58 COMPONENT mindspore) | |||
| install(FILES ${ffmpeg_LIBPATH}/libavformat.so.58.45.100 | |||
| DESTINATION ${INSTALL_LIB_DIR} RENAME libavformat.so.58 COMPONENT mindspore) | |||
| install(FILES ${ffmpeg_LIBPATH}/libavutil.so.56.51.100 | |||
| DESTINATION ${INSTALL_LIB_DIR} RENAME libavutil.so.56 COMPONENT mindspore) | |||
| install(FILES ${ffmpeg_LIBPATH}/libswresample.so.3.7.100 | |||
| DESTINATION ${INSTALL_LIB_DIR} RENAME libswresample.so.3 COMPONENT mindspore) | |||
| install(FILES ${ffmpeg_LIBPATH}/libswscale.so.5.7.100 | |||
| DESTINATION ${INSTALL_LIB_DIR} RENAME libswscale.so.5 COMPONENT mindspore) | |||
| endif() | |||
| if(ENABLE_CPU) | |||
| @@ -198,6 +207,12 @@ if(NOT ENABLE_GE) | |||
| set(ASCEND_DRIVER_PATH ${ASCEND_PATH}/driver/lib64/common) | |||
| if(ENABLE_D) | |||
| install( | |||
| TARGETS ms_profile | |||
| DESTINATION ${INSTALL_LIB_DIR} | |||
| COMPONENT mindspore | |||
| ) | |||
| install( | |||
| TARGETS hccl_plugin | |||
| DESTINATION ${INSTALL_LIB_DIR} | |||
| @@ -330,6 +330,8 @@ elseif(WIN32) | |||
| DESTINATION ${CONVERTER_ROOT_DIR}/include COMPONENT ${RUNTIME_COMPONENT_NAME}) | |||
| install(FILES ${TOP_DIR}/mindspore/lite/tools/converter/model_parser.h | |||
| DESTINATION ${CONVERTER_ROOT_DIR}/include COMPONENT ${RUNTIME_COMPONENT_NAME}) | |||
| install(FILES ${TOP_DIR}/mindspore/lite/tools/converter/dump_graph.h | |||
| DESTINATION ${CONVERTER_ROOT_DIR}/include COMPONENT ${RUNTIME_COMPONENT_NAME}) | |||
| install(FILES ${TOP_DIR}/mindspore/lite/tools/converter/ops/ops_def.h | |||
| DESTINATION ${CONVERTER_ROOT_DIR}/include COMPONENT ${RUNTIME_COMPONENT_NAME}) | |||
| install(DIRECTORY ${TOP_DIR}/build/mindspore/schema/ DESTINATION ${CONVERTER_ROOT_DIR}/include/schema | |||
| @@ -460,6 +462,8 @@ else() | |||
| DESTINATION ${CONVERTER_ROOT_DIR}/include COMPONENT ${RUNTIME_COMPONENT_NAME}) | |||
| install(FILES ${TOP_DIR}/mindspore/lite/tools/converter/model_parser.h | |||
| DESTINATION ${CONVERTER_ROOT_DIR}/include COMPONENT ${RUNTIME_COMPONENT_NAME}) | |||
| install(FILES ${TOP_DIR}/mindspore/lite/tools/converter/dump_graph.h | |||
| DESTINATION ${CONVERTER_ROOT_DIR}/include COMPONENT ${RUNTIME_COMPONENT_NAME}) | |||
| install(FILES ${TOP_DIR}/mindspore/lite/tools/converter/ops/ops_def.h | |||
| DESTINATION ${CONVERTER_ROOT_DIR}/include COMPONENT ${RUNTIME_COMPONENT_NAME}) | |||
| install(DIRECTORY ${TOP_DIR}/mindspore/lite/build/schema/ DESTINATION ${CONVERTER_ROOT_DIR}/include/schema | |||
| @@ -77,48 +77,6 @@ set(ENV{BACKEND_TARGET} ${CPACK_MS_TARGET}) | |||
| set(ENV{MS_PACKAGE_NAME} ${CPACK_MS_PACKAGE_NAME}) | |||
| set(ENV{COMMIT_ID} ${GIT_COMMIT_ID}) | |||
| file(GLOB DEBUG_SYM | |||
| ${MS_PACK_ROOT_DIR}/mindspore/*.so | |||
| ${MS_PACK_ROOT_DIR}/mindspore/lib/*.so | |||
| ) | |||
| file(GLOB DEBUG_STRIP_SYM | |||
| ${MS_PACK_ROOT_DIR}/mindspore/*.so | |||
| ${MS_PACK_ROOT_DIR}/mindspore/lib/*.so* | |||
| ) | |||
| set(CMAKE_OBJCOPY $ENV{CROSS_COMPILE}objcopy) | |||
| set(CMAKE_STRIP $ENV{CROSS_COMPILE}strip) | |||
| if(CPACK_ENABLE_SYM_FILE) | |||
| foreach(schema ${DEBUG_SYM}) | |||
| execute_process( | |||
| COMMAND ${CMAKE_OBJCOPY} "--only-keep-debug" ${schema} ${schema}.sym | |||
| WORKING_DIRECTORY ${MS_PACK_ROOT_DIR} | |||
| ) | |||
| endforeach() | |||
| endif() | |||
| if("${CPACK_CMAKE_BUILD_TYPE}" STREQUAL "Release") | |||
| foreach(schema ${DEBUG_STRIP_SYM}) | |||
| execute_process( | |||
| COMMAND ${CMAKE_STRIP} ${schema} | |||
| WORKING_DIRECTORY ${MS_PACK_ROOT_DIR} | |||
| ) | |||
| endforeach() | |||
| endif() | |||
| file(GLOB DEBUG_SYM_FILE | |||
| ${MS_PACK_ROOT_DIR}/mindspore/*.sym | |||
| ${MS_PACK_ROOT_DIR}/mindspore/lib/*.sym | |||
| ) | |||
| if(CPACK_ENABLE_SYM_FILE) | |||
| file(MAKE_DIRECTORY ${MS_ROOT_DIR}/debug_info) | |||
| file(COPY ${DEBUG_SYM_FILE} DESTINATION ${MS_ROOT_DIR}/debug_info/) | |||
| file(REMOVE_RECURSE ${DEBUG_SYM_FILE}) | |||
| endif() | |||
| execute_process( | |||
| COMMAND ${PYTHON} ${MS_ROOT_DIR}/setup.py "bdist_wheel" | |||
| WORKING_DIRECTORY ${MS_PACK_ROOT_DIR} | |||
| @@ -146,16 +104,3 @@ file(COPY ${MS_PACK_ROOT_DIR}/${NEW_FILE_NAME} DESTINATION ${MS_ROOT_DIR}/output | |||
| file(SHA256 ${MS_ROOT_DIR}/output/${NEW_FILE_NAME} SHA256_VAR) | |||
| file(WRITE ${MS_ROOT_DIR}/output/${NEW_FILE_NAME}.sha256 ${SHA256_VAR} " " ${NEW_FILE_NAME}) | |||
| set(CMAKE_TAR $ENV{CROSS_COMPILE}tar) | |||
| if(CPACK_ENABLE_SYM_FILE) | |||
| file(MAKE_DIRECTORY ${MS_ROOT_DIR}/output/${PACKAGE_NAME}-${VERSION}-${PY_TAGS}-${PLATFORM_TAG}) | |||
| file(COPY ${MS_ROOT_DIR}/debug_info/ DESTINATION | |||
| ${MS_ROOT_DIR}/output/${PACKAGE_NAME}-${VERSION}-${PY_TAGS}-${PLATFORM_TAG}/) | |||
| execute_process(COMMAND | |||
| ${CMAKE_COMMAND} -E ${CMAKE_TAR} cfv | |||
| ${MS_ROOT_DIR}/output/${PACKAGE_NAME}-${VERSION}-${PY_TAGS}-${PLATFORM_TAG}.zip | |||
| ${MS_ROOT_DIR}/output/${PACKAGE_NAME}-${VERSION}-${PY_TAGS}-${PLATFORM_TAG}/ --format=zip | |||
| WORKING_DIRECTORY ${MS_ROOT_DIR}) | |||
| file(REMOVE_RECURSE ${MS_ROOT_DIR}/debug_info) | |||
| file(REMOVE_RECURSE ${MS_ROOT_DIR}/output/${PACKAGE_NAME}-${VERSION}-${PY_TAGS}-${PLATFORM_TAG}) | |||
| endif() | |||
| @@ -91,6 +91,18 @@ if(ENABLE_MINDDATA) | |||
| DESTINATION ${INSTALL_LIB_DIR} | |||
| COMPONENT mindspore | |||
| ) | |||
| file(GLOB_RECURSE FFMPEG_LIB_LIST | |||
| ${ffmpeg_LIBPATH}/libavcodec* | |||
| ${ffmpeg_LIBPATH}/libavformat* | |||
| ${ffmpeg_LIBPATH}/libavutil* | |||
| ${ffmpeg_LIBPATH}/libswresample* | |||
| ${ffmpeg_LIBPATH}/libswscale* | |||
| ) | |||
| install( | |||
| FILES ${FFMPEG_LIB_LIST} | |||
| DESTINATION ${INSTALL_LIB_DIR} | |||
| COMPONENT mindspore | |||
| ) | |||
| endif() | |||
| # CPU mode | |||
| @@ -42,6 +42,7 @@ set(opencv_LIBPATH ${opencv_LIBPATH}/../bin/) | |||
| set(jpeg_turbo_LIBPATH ${jpeg_turbo_LIBPATH}/../bin/) | |||
| set(sqlite_LIBPATH ${sqlite_LIBPATH}/../bin/) | |||
| set(tinyxml2_LIBPATH ${tinyxml2_LIBPATH}/../bin/) | |||
| set(ffmpeg_LIBPATH ${ffmpeg_LIBPATH}/../bin/) | |||
| message("offline debugger does not support windows system temporarily") | |||
| @@ -97,6 +98,18 @@ if(ENABLE_MINDDATA) | |||
| DESTINATION ${INSTALL_LIB_DIR} | |||
| COMPONENT mindspore | |||
| ) | |||
| file(GLOB_RECURSE FFMPEG_LIB_LIST | |||
| ${ffmpeg_LIBPATH}/libavcodec* | |||
| ${ffmpeg_LIBPATH}/libavformat* | |||
| ${ffmpeg_LIBPATH}/libavutil* | |||
| ${ffmpeg_LIBPATH}/libswresample* | |||
| ${ffmpeg_LIBPATH}/libswscale* | |||
| ) | |||
| install( | |||
| FILES ${FFMPEG_LIB_LIST} | |||
| DESTINATION ${INSTALL_LIB_DIR} | |||
| COMPONENT mindspore | |||
| ) | |||
| endif() | |||
| if(ENABLE_CPU) | |||
| @@ -1,4 +1,2 @@ | |||
| approvers: | |||
| - zhoufeng54 | |||
| reviewers: | |||
| - HW_KK | |||
| - HW_KK | |||
| @@ -58,11 +58,8 @@ RUN apt install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \ | |||
| && make install -j4 \ | |||
| && rm -f /usr/local/bin/python \ | |||
| && rm -f /usr/local/bin/pip \ | |||
| && rm -f /usr/local/lib/libpython3.7m.so.1.0 \ | |||
| && ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \ | |||
| && ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \ | |||
| && ln -s ${PYTHON_ROOT_PATH}/lib/libpython3.7m.so.1.0 /usr/local/lib/libpython3.7m.so.1.0 \ | |||
| && ldconfig \ | |||
| && rm -rf /tmp/cpython-3.7.5 \ | |||
| && rm -f /tmp/v3.7.5.tar.gz | |||
| @@ -51,16 +51,13 @@ RUN apt install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \ | |||
| && tar -xvf v3.7.5.tar.gz \ | |||
| && cd /tmp/cpython-3.7.5 \ | |||
| && mkdir -p ${PYTHON_ROOT_PATH} \ | |||
| && ./configure --prefix=${PYTHON_ROOT_PATH} --enable-shared \ | |||
| && ./configure --prefix=${PYTHON_ROOT_PATH} \ | |||
| && make -j4 \ | |||
| && make install -j4 \ | |||
| && rm -f /usr/local/bin/python \ | |||
| && rm -f /usr/local/bin/pip \ | |||
| && rm -f /usr/local/lib/libpython3.7m.so.1.0 \ | |||
| && ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \ | |||
| && ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \ | |||
| && ln -s ${PYTHON_ROOT_PATH}/lib/libpython3.7m.so.1.0 /usr/local/lib/libpython3.7m.so.1.0 \ | |||
| && ldconfig \ | |||
| && rm -rf /tmp/cpython-3.7.5 \ | |||
| && rm -f /tmp/v3.7.5.tar.gz | |||
| @@ -1,4 +1,4 @@ | |||
| FROM nvidia/cuda:11.1-cudnn8-devel-ubuntu18.04 | |||
| FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04 | |||
| MAINTAINER leonwanghui <leon.wanghui@huawei.com> | |||
| @@ -43,7 +43,7 @@ RUN DEBIAN_FRONTEND=noninteractive apt install -y \ | |||
| libnuma-dev | |||
| # Configure cuDNN (v7.6.5) | |||
| RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.8.0.5 /usr/local/cuda/lib64/libcudnn.so | |||
| RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7.6.5 /usr/local/cuda/lib64/libcudnn.so | |||
| # Set bash | |||
| RUN echo "dash dash/sh boolean false" | debconf-set-selections | |||
| @@ -62,11 +62,8 @@ RUN apt install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \ | |||
| && make install -j4 \ | |||
| && rm -f /usr/local/bin/python \ | |||
| && rm -f /usr/local/bin/pip \ | |||
| && rm -f /usr/local/lib/libpython3.7m.so.1.0 \ | |||
| && ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \ | |||
| && ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \ | |||
| && ln -s ${PYTHON_ROOT_PATH}/lib/libpython3.7m.so.1.0 /usr/local/lib/libpython3.7m.so.1.0 \ | |||
| && ldconfig \ | |||
| && rm -rf /tmp/cpython-3.7.5 \ | |||
| && rm -f /tmp/v3.7.5.tar.gz | |||
| @@ -1,4 +1,4 @@ | |||
| FROM nvidia/cuda:11.1-cudnn8-devel-ubuntu18.04 | |||
| FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04 | |||
| MAINTAINER leonwanghui <leon.wanghui@huawei.com> | |||
| @@ -53,16 +53,13 @@ RUN apt install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \ | |||
| && tar -xvf v3.7.5.tar.gz \ | |||
| && cd /tmp/cpython-3.7.5 \ | |||
| && mkdir -p ${PYTHON_ROOT_PATH} \ | |||
| && ./configure --prefix=${PYTHON_ROOT_PATH} --enable-shared \ | |||
| && ./configure --prefix=${PYTHON_ROOT_PATH} \ | |||
| && make -j4 \ | |||
| && make install -j4 \ | |||
| && rm -f /usr/local/bin/python \ | |||
| && rm -f /usr/local/bin/pip \ | |||
| && rm -f /usr/local/lib/libpython3.7m.so.1.0 \ | |||
| && ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \ | |||
| && ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \ | |||
| && ln -s ${PYTHON_ROOT_PATH}/lib/libpython3.7m.so.1.0 /usr/local/lib/libpython3.7m.so.1.0 \ | |||
| && ldconfig \ | |||
| && rm -rf /tmp/cpython-3.7.5 \ | |||
| && rm -f /tmp/v3.7.5.tar.gz | |||
| @@ -23,6 +23,12 @@ | |||
| #include "include/api/data_type.h" | |||
| #include "include/api/dual_abi_helper.h" | |||
| #ifdef _WIN32 | |||
| #define MS_API __declspec(dllexport) | |||
| #else | |||
| #define MS_API __attribute__((visibility("default"))) | |||
| #endif | |||
| namespace mindspore { | |||
| class Model; | |||
| class ModelImpl; | |||
| @@ -22,6 +22,12 @@ | |||
| #include <memory> | |||
| #include "include/api/callback/callback.h" | |||
| #ifdef _WIN32 | |||
| #define MS_API __declspec(dllexport) | |||
| #else | |||
| #define MS_API __attribute__((visibility("default"))) | |||
| #endif | |||
| namespace mindspore { | |||
| class CkptSaver: public TrainCallBack { | |||
| @@ -21,6 +21,12 @@ | |||
| #include <utility> | |||
| #include "include/api/callback/callback.h" | |||
| #ifdef _WIN32 | |||
| #define MS_API __declspec(dllexport) | |||
| #else | |||
| #define MS_API __attribute__((visibility("default"))) | |||
| #endif | |||
| using GraphPoint = std::pair<int, float>; | |||
| namespace mindspore { | |||
| @@ -22,6 +22,12 @@ | |||
| #include <memory> | |||
| #include "include/api/callback/callback.h" | |||
| #ifdef _WIN32 | |||
| #define MS_API __declspec(dllexport) | |||
| #else | |||
| #define MS_API __attribute__((visibility("default"))) | |||
| #endif | |||
| namespace mindspore { | |||
| constexpr int DONT_UPDATE_LR = 0; | |||
| @@ -22,6 +22,12 @@ | |||
| #include <memory> | |||
| #include "include/api/callback/callback.h" | |||
| #ifdef _WIN32 | |||
| #define MS_API __declspec(dllexport) | |||
| #else | |||
| #define MS_API __attribute__((visibility("default"))) | |||
| #endif | |||
| namespace mindspore { | |||
| class TimeMonitor: public TrainCallBack { | |||
| @@ -24,6 +24,12 @@ | |||
| #include "include/api/callback/callback.h" | |||
| #include "include/api/metrics/accuracy.h" | |||
| #ifdef _WIN32 | |||
| #define MS_API __declspec(dllexport) | |||
| #else | |||
| #define MS_API __attribute__((visibility("default"))) | |||
| #endif | |||
| using GraphPoint = std::pair<int, float>; | |||
| namespace mindspore { | |||
| @@ -23,6 +23,12 @@ | |||
| #include "include/api/data_type.h" | |||
| #include "include/api/dual_abi_helper.h" | |||
| #ifdef _WIN32 | |||
| #define MS_API __declspec(dllexport) | |||
| #else | |||
| #define MS_API __attribute__((visibility("default"))) | |||
| #endif | |||
| namespace mindspore { | |||
| class MixPrecisionCfg { | |||
| @@ -38,19 +38,12 @@ class Allocator; | |||
| class Delegate; | |||
| class DeviceInfoContext; | |||
| /// \brief Context is used to store environment variables during execution. | |||
| class MS_API Context { | |||
| public: | |||
| Context(); | |||
| ~Context() = default; | |||
| /// \brief Set the number of threads at runtime. This option is only valid for MindSpore Lite. | |||
| /// | |||
| /// \param[in] thread_num the number of threads at runtime. | |||
| void SetThreadNum(int32_t thread_num); | |||
| /// \brief Get the current thread number setting. | |||
| /// | |||
| /// \return The current thread number setting. | |||
| int32_t GetThreadNum() const; | |||
| /// \brief Set the thread affinity to CPU cores. | |||
| @@ -67,10 +60,6 @@ class MS_API Context { | |||
| void SetDelegate(const std::shared_ptr<Delegate> &delegate); | |||
| std::shared_ptr<Delegate> GetDelegate() const; | |||
| /// \brief Get a mutable reference of DeviceInfoContext vector in this context. Only MindSpore Lite supports | |||
| /// heterogeneous scenarios with multiple members in the vector. | |||
| /// | |||
| /// \return Mutable reference of DeviceInfoContext vector in this context. | |||
| std::vector<std::shared_ptr<DeviceInfoContext>> &MutableDeviceInfo(); | |||
| private: | |||
| @@ -78,24 +67,14 @@ class MS_API Context { | |||
| std::shared_ptr<Data> data_; | |||
| }; | |||
| /// \brief DeviceInfoContext defines different device contexts. | |||
| class MS_API DeviceInfoContext : public std::enable_shared_from_this<DeviceInfoContext> { | |||
| public: | |||
| struct Data; | |||
| DeviceInfoContext(); | |||
| virtual ~DeviceInfoContext() = default; | |||
| /// \brief Get the type of this DeviceInfoContext. | |||
| /// | |||
| /// \return Type of this DeviceInfoContext. | |||
| virtual enum DeviceType GetDeviceType() const = 0; | |||
| /// \brief A similar function to RTTI is provided when the -fno-rtti compilation option is turned on, which converts | |||
| /// DeviceInfoContext to a shared pointer of type T, and returns nullptr if the conversion fails. | |||
| /// | |||
| /// \param T Type | |||
| /// \return A pointer of type T after conversion. If the conversion fails, it will be nullptr. | |||
| template <class T> | |||
| std::shared_ptr<T> Cast() { | |||
| static_assert(std::is_base_of<DeviceInfoContext, T>::value, "Wrong cast type."); | |||
| @@ -105,89 +84,41 @@ class MS_API DeviceInfoContext : public std::enable_shared_from_this<DeviceInfoC | |||
| return std::static_pointer_cast<T>(shared_from_this()); | |||
| } | |||
| /// \brief obtain provider's name | |||
| /// | |||
| /// \return provider's name. | |||
| std::string GetProvider() const; | |||
| /// \brief set provider's name. | |||
| /// | |||
| /// \param[in] provider define the provider's name. | |||
| void SetProvider(const std::string &provider); | |||
| /// \brief obtain provider's device type. | |||
| /// | |||
| /// \return provider's device type. | |||
| std::string GetProviderDevice() const; | |||
| /// \brief set provider's device type. | |||
| /// | |||
| /// \param[in] device define the provider's device type.EG: CPU. | |||
| void SetProviderDevice(const std::string &device); | |||
| /// \brief set memory allocator. | |||
| /// | |||
| /// \param[in] allocator define the memory allocator which can be defined by user. | |||
| void SetAllocator(const std::shared_ptr<Allocator> &allocator); | |||
| /// \brief obtain memory allocator. | |||
| /// | |||
| /// \return memory allocator. | |||
| std::shared_ptr<Allocator> GetAllocator() const; | |||
| protected: | |||
| std::shared_ptr<Data> data_; | |||
| }; | |||
| /// \brief Derived from DeviceInfoContext, The configuration of the model running on the CPU. This option is only valid | |||
| /// for MindSpore Lite. | |||
| class MS_API CPUDeviceInfo : public DeviceInfoContext { | |||
| public: | |||
| /// \brief Get the type of this DeviceInfoContext. | |||
| /// | |||
| /// \return Type of this DeviceInfoContext. | |||
| enum DeviceType GetDeviceType() const override { return DeviceType::kCPU; }; | |||
| /// \brief Set enables to perform the float16 inference | |||
| /// | |||
| /// \param[in] is_fp16 Enable float16 inference or not. | |||
| void SetEnableFP16(bool is_fp16); | |||
| /// \brief Get enables to perform the float16 inference | |||
| /// | |||
| /// \return Whether enable float16 inference. | |||
| bool GetEnableFP16() const; | |||
| }; | |||
| /// \brief Derived from DeviceInfoContext, The configuration of the model running on the NPU. This option is only valid | |||
| /// for MindSpore Lite. | |||
| class MS_API KirinNPUDeviceInfo : public DeviceInfoContext { | |||
| public: | |||
| /// \brief Get the type of this DeviceInfoContext. | |||
| /// | |||
| /// \return Type of this DeviceInfoContext. | |||
| enum DeviceType GetDeviceType() const override { return DeviceType::kKirinNPU; }; | |||
| /// \brief Set the NPU frequency. | |||
| /// | |||
| /// \param[in] frequency Can be set to 1 (low power consumption), 2 (balanced), 3 (high performance), 4 (extreme | |||
| /// performance), default as 3. | |||
| void SetFrequency(int frequency); | |||
| /// \brief Get the NPU frequency. | |||
| /// | |||
| /// \return NPU frequency | |||
| int GetFrequency() const; | |||
| }; | |||
| /// \brief Derived from DeviceInfoContext, The configuration of the model running on the GPU. | |||
| class MS_API GPUDeviceInfo : public DeviceInfoContext { | |||
| public: | |||
| /// \brief Get the type of this DeviceInfoContext. | |||
| /// | |||
| /// \return Type of this DeviceInfoContext. | |||
| enum DeviceType GetDeviceType() const override { return DeviceType::kGPU; }; | |||
| /// \brief Set device id. | |||
| /// | |||
| /// \param[in] device_id The device id. | |||
| void SetDeviceID(uint32_t device_id); | |||
| /// \brief Get the device id. | |||
| /// | |||
| /// \return The device id. | |||
| uint32_t GetDeviceID() const; | |||
| void SetGpuTrtInferMode(bool gpu_trt_infer_mode); | |||
| @@ -196,15 +127,8 @@ class MS_API GPUDeviceInfo : public DeviceInfoContext { | |||
| inline void SetPrecisionMode(const std::string &precison_mode); | |||
| inline std::string GetPrecisionMode() const; | |||
| /// \brief Set enables to perform the float16 inference | |||
| /// | |||
| /// \param[in] is_fp16 Enable float16 inference or not. | |||
| void SetEnableFP16(bool is_fp16); | |||
| /// \brief Get enables to perform the float16 inference | |||
| /// | |||
| /// \return Whether enable float16 inference. | |||
| bool GetEnableFP16() const; | |||
| private: | |||
| void SetPrecisionMode(const std::vector<char> &precision_mode); | |||
| std::vector<char> GetPrecisionModeChar() const; | |||
| @@ -215,113 +139,52 @@ void GPUDeviceInfo::SetPrecisionMode(const std::string &precision_mode) { | |||
| } | |||
| std::string GPUDeviceInfo::GetPrecisionMode() const { return CharToString(GetPrecisionModeChar()); } | |||
| /// \brief Derived from DeviceInfoContext, The configuration of the model running on the Ascend910. This option is | |||
| /// invalid for MindSpore Lite. | |||
| class MS_API Ascend910DeviceInfo : public DeviceInfoContext { | |||
| public: | |||
| /// \brief Get the type of this DeviceInfoContext. | |||
| /// | |||
| /// \return Type of this DeviceInfoContext. | |||
| enum DeviceType GetDeviceType() const override { return DeviceType::kAscend910; }; | |||
| /// \brief Set device id. | |||
| /// | |||
| /// \param[in] device_id The device id. | |||
| void SetDeviceID(uint32_t device_id); | |||
| /// \brief Get the device id. | |||
| /// | |||
| /// \return The device id. | |||
| uint32_t GetDeviceID() const; | |||
| }; | |||
| /// \brief Derived from DeviceInfoContext, The configuration of the model running on the Ascend310. This option is | |||
| /// invalid for MindSpore Lite. | |||
| class MS_API Ascend310DeviceInfo : public DeviceInfoContext { | |||
| public: | |||
| /// \brief Get the type of this DeviceInfoContext. | |||
| /// | |||
| /// \return Type of this DeviceInfoContext. | |||
| enum DeviceType GetDeviceType() const override { return DeviceType::kAscend310; }; | |||
| /// \brief Set device id. | |||
| /// | |||
| /// \param[in] device_id The device id. | |||
| void SetDeviceID(uint32_t device_id); | |||
| /// \brief Get the device id. | |||
| /// | |||
| /// \return The device id. | |||
| uint32_t GetDeviceID() const; | |||
| inline void SetDumpConfigPath(const std::string &cfg_path); | |||
| inline std::string GetDumpConfigPath() const; | |||
| /// \brief Set AIPP configuration file path. | |||
| /// | |||
| /// \param[in] cfg_path AIPP configuration file path. | |||
| // aipp config file | |||
| inline void SetInsertOpConfigPath(const std::string &cfg_path); | |||
| /// \brief Get AIPP configuration file path. | |||
| /// | |||
| /// \return AIPP configuration file path. | |||
| inline std::string GetInsertOpConfigPath() const; | |||
| /// \brief Set format of model inputs. | |||
| /// | |||
| /// \param[in] format Optional "NCHW", "NHWC", etc. | |||
| // nchw or nhwc | |||
| inline void SetInputFormat(const std::string &format); | |||
| /// \brief Get format of model inputs. | |||
| /// | |||
| /// \return The format of model inputs. | |||
| inline std::string GetInputFormat() const; | |||
| /// \brief Set shape of model inputs. | |||
| /// | |||
| /// \param[in] shape e.g. "input_op_name1: 1,2,3,4;input_op_name2: 4,3,2,1". | |||
| // Mandatory while dynamic batch: e.g. "input_op_name1: 1,2,3,4;input_op_name2: 4,3,2,1" | |||
| inline void SetInputShape(const std::string &shape); | |||
| /// \brief Get shape of model inputs. | |||
| /// | |||
| /// \return The shape of model inputs. | |||
| inline std::string GetInputShape() const; | |||
| /// \brief Set shape of model inputs. | |||
| /// | |||
| /// \param[in] shape e.g. {{1, {1,2,3,4}}, {2, {4,3,2,1}}} means the first input shape 1,2,3,4 and the second input | |||
| /// shape 4,3,2,1. | |||
| void SetInputShapeMap(const std::map<int, std::vector<int>> &shape); | |||
| /// \brief Get shape of model inputs. | |||
| /// | |||
| /// \return The shape of model inputs. | |||
| std::map<int, std::vector<int>> GetInputShapeMap() const; | |||
| void SetDynamicBatchSize(const std::vector<size_t> &dynamic_batch_size); | |||
| inline std::string GetDynamicBatchSize() const; | |||
| /// \brief Set type of model outputs. | |||
| /// | |||
| /// \param[in] output_type FP32, UINT8 or FP16, default as FP32. | |||
| // FP32, UINT8 or FP16, default as FP32 | |||
| void SetOutputType(enum DataType output_type); | |||
| /// \brief Get type of model outputs. | |||
| /// | |||
| /// \return The set type of model outputs. | |||
| enum DataType GetOutputType() const; | |||
| /// \brief Set precision mode of model. | |||
| /// | |||
| /// \param[in] precision_mode Optional "force_fp16", "allow_fp32_to_fp16", "must_keep_origin_dtype" and | |||
| /// "allow_mix_precision", "force_fp16" is set as default | |||
| // "force_fp16", "allow_fp32_to_fp16", "must_keep_origin_dtype" or "allow_mix_precision", default as "force_fp16" | |||
| inline void SetPrecisionMode(const std::string &precision_mode); | |||
| /// \brief Get precision mode of model. | |||
| /// | |||
| /// \return The set type of model outputs | |||
| inline std::string GetPrecisionMode() const; | |||
| /// \brief Set op select implementation mode. | |||
| /// | |||
| /// \param[in] op_select_impl_mode Optional "high_performance" and "high_precision", "high_performance" is set as | |||
| /// default. | |||
| // Optional "high_performance" and "high_precision", "high_performance" is set as default | |||
| inline void SetOpSelectImplMode(const std::string &op_select_impl_mode); | |||
| /// \brief Get op select implementation mode. | |||
| /// | |||
| /// \return The set op select implementation mode. | |||
| inline std::string GetOpSelectImplMode() const; | |||
| inline void SetFusionSwitchConfigPath(const std::string &cfg_path); | |||
| @@ -24,16 +24,9 @@ | |||
| #include "include/api/context.h" | |||
| namespace mindspore::kernel { | |||
| /// \brief The Kernel class is used to define a MindSpore Kernel. | |||
| class Kernel { | |||
| public: | |||
| Kernel() = default; | |||
| /// \brief Constructor. | |||
| /// | |||
| /// \param[in] inputs define the input tensors for kernel. | |||
| /// \param[in] outputs define the output tensors for kernel. | |||
| /// \param[in] primitive define the primitive of kernel generated by flatbuffers. | |||
| /// \param[in] ctx define the context for kernel. | |||
| Kernel(const std::vector<mindspore::MSTensor> &inputs, const std::vector<mindspore::MSTensor> &outputs, | |||
| const schema::Primitive *primitive, const mindspore::Context *ctx) | |||
| : context_(ctx), inputs_(std::move(inputs)), outputs_(std::move(outputs)), primitive_(primitive) { | |||
| @@ -41,65 +34,32 @@ class Kernel { | |||
| type_ = primitive->value_type(); | |||
| } | |||
| } | |||
| /// \brief Destructor. | |||
| virtual ~Kernel() = default; | |||
| /// \brief prepare for executing kernel. | |||
| /// | |||
| /// \return result code. | |||
| virtual int Prepare() = 0; | |||
| /// \brief execute the kernel. | |||
| /// | |||
| /// \return result code. | |||
| virtual int Execute() = 0; | |||
| /// \brief resize the kernel input shape, memory need to refresh. | |||
| /// | |||
| /// \return result code. | |||
| virtual int ReSize() = 0; | |||
| /// \brief set kernel's input tensors. | |||
| /// | |||
| /// \param[in] in_tensors define the input tensors. | |||
| virtual void set_inputs(const std::vector<mindspore::MSTensor> &in_tensors) { this->inputs_ = in_tensors; } | |||
| /// \brief set kernel's input tensor. | |||
| /// | |||
| /// \param[in] in_tensor define the input tensor. | |||
| /// \param[in] index define the index of the input tensor. | |||
| virtual void set_input(mindspore::MSTensor in_tensor, int index) { this->inputs_[index] = in_tensor; } | |||
| /// \brief set kernel's output tensors. | |||
| /// | |||
| /// \param[in] out_tensors define the output tensors. | |||
| virtual void set_outputs(const std::vector<mindspore::MSTensor> &out_tensors) { this->outputs_ = out_tensors; } | |||
| /// \brief set kernel's output tensor. | |||
| /// | |||
| /// \param[in] out_tensor define the output tensor. | |||
| /// \param[in] index define the index of the output tensor. | |||
| virtual void set_output(mindspore::MSTensor out_tensor, int index) { this->outputs_[index] = out_tensor; } | |||
| /// \brief obtain kernel's input tensors. | |||
| /// | |||
| /// \return input tensors. | |||
| virtual const std::vector<mindspore::MSTensor> &inputs() { return this->inputs_; } | |||
| /// \brief obtain kernel's output tensors. | |||
| /// | |||
| /// \return output tensors. | |||
| virtual const std::vector<mindspore::MSTensor> &outputs() { return this->outputs_; } | |||
| /// \brief obtain kernel's name. | |||
| /// | |||
| /// \return kernel's name. | |||
| std::string name() const { return this->name_; } | |||
| /// \brief set kernel's name. | |||
| /// | |||
| /// \param[in] name define the kernel's name. | |||
| void set_name(const std::string &name) { this->name_ = name; } | |||
| /// \brief obtain kernel's context. | |||
| /// | |||
| /// \return kernel's context. | |||
| const mindspore::Context *context() const { return this->context_; } | |||
| /// \brief obtain kernel's type. | |||
| /// | |||
| /// \return kernel's type. | |||
| virtual schema::PrimitiveType type() const { return type_; } | |||
| /// \brief obtain the primitive of kernel generated by flatbuffers. | |||
| /// | |||
| /// \return the primitive of kernel generated by flatbuffers. | |||
| const schema::Primitive *primitive() const { return this->primitive_; } | |||
| protected: | |||
| @@ -37,75 +37,32 @@ class Metrics; | |||
| namespace dataset { | |||
| class Dataset; | |||
| } // namespace dataset | |||
| /// \brief The Model class is used to define a MindSpore model, facilitating computational graph management. | |||
| class MS_API Model { | |||
| public: | |||
| Model(); | |||
| ~Model(); | |||
| Model(const Model &) = delete; | |||
| void operator=(const Model &) = delete; | |||
| /// \brief Builds a model so that it can run on a device. | |||
| /// | |||
| /// \param[in] graph GraphCell is a derivative of Cell. Cell is not available currently. GraphCell can be constructed | |||
| /// from Graph, for example, model.Build(GraphCell(graph), context). | |||
| /// \param[in] model_context A context used to store options during execution. | |||
| /// \param[in] train_cfg A config used by training. | |||
| /// | |||
| /// \return Status. | |||
| Status Build(GraphCell graph, const std::shared_ptr<Context> &model_context = nullptr, | |||
| const std::shared_ptr<TrainCfg> &train_cfg = nullptr); | |||
| /// \brief Resizes the shapes of inputs. | |||
| /// | |||
| /// \param[in] inputs A vector that includes all input tensors in order. | |||
| /// \param[in] dims Defines the new shapes of inputs, should be consistent with inputs. | |||
| /// | |||
| /// \return Status. | |||
| Status Resize(const std::vector<MSTensor> &inputs, const std::vector<std::vector<int64_t>> &dims); | |||
| /// \brief Inference model. | |||
| /// | |||
| /// \param[in] inputs A vector where model inputs are arranged in sequence. | |||
| /// \param[out] outputs Which is a pointer to a vector. The model outputs are filled in the container in sequence. | |||
| /// \param[in] before CallBack before predict. | |||
| /// \param[in] after CallBack after predict. | |||
| /// | |||
| /// \return Status. | |||
| Status Predict(const std::vector<MSTensor> &inputs, std::vector<MSTensor> *outputs, | |||
| const MSKernelCallBack &before = nullptr, const MSKernelCallBack &after = nullptr); | |||
| /// \brief Obtains all input tensors of the model. | |||
| /// | |||
| /// \return The vector that includes all input tensors. | |||
| std::vector<MSTensor> GetInputs(); | |||
| /// \brief Obtains the input tensor of the model by name. | |||
| /// | |||
| /// \return The input tensor with the given name, if the name is not found, an invalid tensor is returned. | |||
| inline MSTensor GetInputByTensorName(const std::string &tensor_name); | |||
| Status InitMetrics(std::vector<Metrics *> metrics); | |||
| std::vector<Metrics *> GetMetrics(); | |||
| /// \brief Obtains all output tensors of the model. | |||
| /// | |||
| /// \return The vector that includes all output tensors. | |||
| std::vector<MSTensor> GetOutputs(); | |||
| /// \brief Obtains names of all output tensors of the model. | |||
| /// | |||
| /// \return A vector that includes names of all output tensors. | |||
| inline std::vector<std::string> GetOutputTensorNames(); | |||
| /// \brief Obtains the output tensor of the model by name. | |||
| /// | |||
| /// \return The output tensor with the given name, if the name is not found, an invalid tensor is returned. | |||
| inline MSTensor GetOutputByTensorName(const std::string &tensor_name); | |||
| inline std::vector<MSTensor> GetOutputsByNodeName(const std::string &tensor_name); | |||
| /// \brief Inference model. | |||
| /// | |||
| /// \param[in] device_type Device type,options are kGPU, kAscend910, etc. | |||
| /// \param[in] model_type The type of model file, options are ModelType::kMindIR, ModelType::kOM. | |||
| /// | |||
| /// \return Is supported or not. | |||
| static bool CheckModelSupport(enum DeviceType device_type, ModelType model_type); | |||
| Status SetTrainMode(bool train); | |||
| @@ -27,43 +27,13 @@ | |||
| #include "include/api/dual_abi_helper.h" | |||
| namespace mindspore { | |||
| /// \brief The Serialization class is used to summarize methods for reading and writing model files. | |||
| class MS_API Serialization { | |||
| public: | |||
| /// \brief Loads a model file from memory buffer. | |||
| /// | |||
| /// \param[in] model_data A buffer filled by model file. | |||
| /// \param[in] data_size The size of the buffer. | |||
| /// \param[in] model_type The Type of model file, options are ModelType::kMindIR, ModelType::kOM. | |||
| /// \param[out] graph The output parameter, an object saves graph data. | |||
| /// \param[in] dec_key The decryption key, key length is 16, 24, or 32. | |||
| /// \param[in] dec_mode The decryption mode, optional options are AES-GCM, AES-CBC. | |||
| /// | |||
| /// \return Status. | |||
| inline static Status Load(const void *model_data, size_t data_size, ModelType model_type, Graph *graph, | |||
| const Key &dec_key = {}, const std::string &dec_mode = kDecModeAesGcm); | |||
| /// \brief Loads a model file from path, is not supported on MindSpore Lite. | |||
| /// | |||
| /// \param[in] file The path of model file. | |||
| /// \param[in] model_type The Type of model file, options are ModelType::kMindIR, ModelType::kOM. | |||
| /// \param[out] graph The output parameter, an object saves graph data. | |||
| /// \param[in] dec_key The decryption key, key length is 16, 24, or 32. | |||
| /// \param[in] dec_mode The decryption mode, optional options are AES-GCM, AES-CBC. | |||
| /// | |||
| /// \return Status. | |||
| inline static Status Load(const std::string &file, ModelType model_type, Graph *graph, const Key &dec_key = {}, | |||
| const std::string &dec_mode = kDecModeAesGcm); | |||
| /// \brief Load multiple models from multiple files, MindSpore Lite does not provide this feature. | |||
| /// | |||
| /// \param[in] files The path of model files. | |||
| /// \param[in] model_type The Type of model file, options are ModelType::kMindIR, ModelType::kOM. | |||
| /// \param[out] graph The output parameter, an object saves graph data. | |||
| /// \param[in] dec_key The decryption key, key length is 16, 24, or 32. | |||
| /// \param[in] dec_mode The decryption mode, optional options are AES-GCM, AES-CBC. | |||
| /// | |||
| /// \return Status. | |||
| inline static Status Load(const std::vector<std::string> &files, ModelType model_type, std::vector<Graph> *graphs, | |||
| const Key &dec_key = {}, const std::string &dec_mode = kDecModeAesGcm); | |||
| static Status SetParameters(const std::map<std::string, Buffer> ¶meters, Model *model); | |||
| @@ -25,21 +25,11 @@ | |||
| #include "include/api/dual_abi_helper.h" | |||
| #include "include/api/format.h" | |||
| #ifndef MS_API | |||
| #ifdef _WIN32 | |||
| #ifdef _MSC_VER | |||
| #ifdef BUILDING_DLL | |||
| #define MS_API __declspec(dllexport) | |||
| #else | |||
| #define MS_API __declspec(dllimport) | |||
| #endif | |||
| #else | |||
| #define MS_API __declspec(dllexport) | |||
| #endif | |||
| #else | |||
| #define MS_API __attribute__((visibility("default"))) | |||
| #endif | |||
| #endif | |||
| namespace mindspore { | |||
| enum ModelType : uint32_t { | |||
| @@ -74,64 +64,18 @@ struct QuantParam { | |||
| }; | |||
| class Allocator; | |||
| /// \brief The MSTensor class defines a tensor in MindSpore. | |||
| class MS_API MSTensor { | |||
| public: | |||
| class Impl; | |||
| /// \brief Creates a MSTensor object, whose data need to be copied before accessed by Model, must be used in pairs | |||
| /// with DestroyTensorPtr. | |||
| /// | |||
| /// \param[in] name The name of the MSTensor. | |||
| /// \param[in] type The data type of the MSTensor. | |||
| /// \param[in] shape The shape of the MSTensor. | |||
| /// \param[in] data The data pointer that points to allocated memory. | |||
| /// \param[in] data_len The length of the memory, in bytes. | |||
| /// | |||
| /// \return A pointer of MSTensor. | |||
| static inline MSTensor *CreateTensor(const std::string &name, DataType type, const std::vector<int64_t> &shape, | |||
| const void *data, size_t data_len) noexcept; | |||
| /// \brief Creates a MSTensor object, whose data can be directly accessed by Model, must be used in pairs with | |||
| /// DestroyTensorPtr. | |||
| /// | |||
| /// \param[in] name The name of the MSTensor. | |||
| /// \param[in] type The data type of the MSTensor. | |||
| /// \param[in] shape The shape of the MSTensor. | |||
| /// \param[in] data The data pointer that points to allocated memory. | |||
| /// \param[in] data_len The length of the memory, in bytes. | |||
| /// | |||
| /// \return A pointer of MSTensor. | |||
| static inline MSTensor *CreateRefTensor(const std::string &name, DataType type, const std::vector<int64_t> &shape, | |||
| const void *data, size_t data_len) noexcept; | |||
| /// \brief Creates a MSTensor object, whose device data can be directly accessed by Model, must be used in pairs with | |||
| /// DestroyTensorPtr. | |||
| /// | |||
| /// \param[in] name The name of the MSTensor. | |||
| /// \param[in] type The data type of the MSTensor. | |||
| /// \param[in] shape The shape of the MSTensor. | |||
| /// \param[in] data The data pointer that points to device memory. | |||
| /// \param[in] data_len The length of the memory, in bytes. | |||
| /// | |||
| /// \return A pointer of MSTensor. | |||
| static inline MSTensor *CreateDevTensor(const std::string &name, DataType type, const std::vector<int64_t> &shape, | |||
| const void *data, size_t data_len) noexcept; | |||
| /// \brief Create a string type MSTensor object whose data can be accessed by Model only after being copied, must be | |||
| /// used in pair with DestroyTensorPtr. | |||
| /// | |||
| /// \param[in] name The name of the MSTensor. | |||
| /// \param[in] str A vector container containing several strings. | |||
| /// | |||
| /// \return A pointer of MSTensor. | |||
| static inline MSTensor *StringsToTensor(const std::string &name, const std::vector<std::string> &str); | |||
| /// \brief Parse the string type MSTensor object into strings. | |||
| /// | |||
| /// \param[in] tensor A MSTensor object. | |||
| /// | |||
| /// \return A vector container containing several strings. | |||
| static inline std::vector<std::string> TensorToStrings(const MSTensor &tensor); | |||
| /// \brief Destroy an object created by Clone, StringsToTensor, CreateRefTensor, CreateDevTensor or CreateTensor. Do | |||
| /// not use it to destroy MSTensor from other sources. | |||
| /// | |||
| /// \param[in] tensor A MSTensor object. | |||
| static void DestroyTensorPtr(MSTensor *tensor) noexcept; | |||
| MSTensor(); | |||
| @@ -141,51 +85,19 @@ class MS_API MSTensor { | |||
| explicit MSTensor(std::nullptr_t); | |||
| ~MSTensor(); | |||
| /// \brief Obtains the name of the MSTensor. | |||
| /// | |||
| /// \return The name of the MSTensor. | |||
| inline std::string Name() const; | |||
| /// \brief Obtains the data type of the MSTensor. | |||
| /// | |||
| /// \return The data type of the MSTensor. | |||
| enum DataType DataType() const; | |||
| /// \brief Obtains the shape of the MSTensor. | |||
| /// | |||
| /// \return The shape of the MSTensor. | |||
| const std::vector<int64_t> &Shape() const; | |||
| /// \brief Obtains the number of elements of the MSTensor. | |||
| /// | |||
| /// \return The number of elements of the MSTensor. | |||
| int64_t ElementNum() const; | |||
| /// \brief Obtains a shared pointer to the copy of data of the MSTensor. The data can be read on host. | |||
| /// | |||
| /// \return A shared pointer to the copy of data of the MSTensor. | |||
| std::shared_ptr<const void> Data() const; | |||
| /// \brief Obtains the pointer to the data of the MSTensor. If the MSTensor is a device tensor, the data cannot be | |||
| /// accessed directly on host. | |||
| /// | |||
| /// \return A pointer to the data of the MSTensor. | |||
| void *MutableData(); | |||
| /// \brief Obtains the length of the data of the MSTensor, in bytes. | |||
| /// | |||
| /// \return The length of the data of the MSTensor, in bytes. | |||
| size_t DataSize() const; | |||
| /// \brief Gets the boolean value that indicates whether the memory of MSTensor is on device. | |||
| /// | |||
| /// \return The boolean value that indicates whether the memory of MSTensor is on device. | |||
| bool IsDevice() const; | |||
| /// \brief Gets a deep copy of the MSTensor, must be used in pair with DestroyTensorPtr. | |||
| /// | |||
| /// \return A pointer points to a deep copy of the MSTensor. | |||
| MSTensor *Clone() const; | |||
| /// \brief Gets the boolean value that indicates whether the MSTensor is valid. | |||
| /// | |||
| /// \return The boolean value that indicates whether the MSTensor is valid. | |||
| bool operator==(std::nullptr_t) const; | |||
| /// \brief Gets the boolean value that indicates whether the MSTensor is valid. | |||
| /// | |||
| /// \return The boolean value that indicates whether the MSTensor is valid. | |||
| bool operator!=(std::nullptr_t) const; | |||
| bool operator==(const MSTensor &tensor) const; | |||
| @@ -23,7 +23,6 @@ from itertools import repeat, zip_longest | |||
| from collections import deque | |||
| from collections.abc import Iterable | |||
| import numpy as np | |||
| from mindspore import context | |||
| from mindspore import log as logger | |||
| from mindspore.common import dtype as mstype | |||
| from mindspore._c_expression import Tensor as Tensor_ | |||
| @@ -148,7 +147,7 @@ def check_number(arg_value, value, rel, arg_type=int, arg_name=None, prim_name=N | |||
| Check argument integer. | |||
| Example: | |||
| - number = check_number(number, 0, Rel.GE, "number", None) # number >= 0 | |||
| - number = check_int(number, 0, Rel.GE, "number", None) # number >= 0 | |||
| """ | |||
| rel_fn = Rel.get_fns(rel) | |||
| prim_name = f'in `{prim_name}`' if prim_name else '' | |||
| @@ -847,10 +846,6 @@ class Validator: | |||
| """Returns an empty Tensor.""" | |||
| return Tensor_(dtype, shape) | |||
| @staticmethod | |||
| def check_type_support(dtype, device, supported_dtypes): | |||
| return dtype in supported_dtypes or not context.get_context('device_target') == device | |||
| def check_input_format(input_param): | |||
| """Judge input format.""" | |||
| @@ -18,6 +18,7 @@ from .addn import AddN | |||
| from .assign_add import AssignAdd | |||
| from .batchnorm import BatchNorm | |||
| from .batchnorm_grad import BatchNormGrad | |||
| from .bias_add import BiasAdd | |||
| from .bias_add_grad import BiasAddGrad | |||
| from .clip_by_norm_no_div_sum import ClipByNormNoDivSum | |||
| from .conv2d import Conv2D | |||
| @@ -25,6 +26,7 @@ from .complex import CAbs, CAdd, CDiv, CMul, CSub | |||
| from .dropout_grad import DropoutGrad | |||
| from .equal_count import EqualCount | |||
| from .erfc import Erfc | |||
| from .expand_dims import ExpandDims | |||
| from .fused_adam import FusedAdam | |||
| from .fused_adam_weight_decay import FusedAdamWeightDecay | |||
| from .fused_mul_add import FusedMulAdd | |||
| @@ -49,7 +51,6 @@ from .sigmoid import Sigmoid | |||
| from .sigmoid_cross_entropy_with_logits import SigmoidCrossEntropyWithLogits | |||
| from .sigmoid_cross_entropy_with_logits_grad import SigmoidCrossEntropyWithLogitsGrad | |||
| from .sigmoid_grad import SigmoidGrad | |||
| from .slice import Slice | |||
| from .softmax import Softmax | |||
| from .softmax_cross_entropy_with_logits import SoftmaxCrossEntropyWithLogits | |||
| from .softmax_grad_ext import SoftmaxGradExt | |||
| @@ -80,9 +80,6 @@ class Expander: | |||
| class ExpanderInfoValidator: | |||
| """ExpanderInfoValidator is the utility class which defines the validator decorator for expanders""" | |||
| def __init__(self): | |||
| """Init""" | |||
| @staticmethod | |||
| def _add_check_function(kls, func): | |||
| """ | |||
| @@ -201,8 +198,8 @@ def to_frac_z_axis(ori_shape, ori_axis): | |||
| return frac_z_axis | |||
| def infer_shape_from_fractalnz(fractal): | |||
| "get original shape from fractalnz shape" | |||
| def infer_shape_from_fractalNz(fractal): | |||
| "get original shape from fractalNz shape" | |||
| shape = [] | |||
| dims = len(fractal) | |||
| batch = dims - 4 | |||
| @@ -24,7 +24,6 @@ from .expand_dims import ExpandDims | |||
| @VLD.check_attrs('is_training', 'momentum', 'epsilon') | |||
| class BatchNorm(Expander): | |||
| """BatchNorm expander""" | |||
| def _expand(self, graph_builder): | |||
| # get op info | |||
| input_x = self.inputs[0] | |||
| @@ -43,8 +42,81 @@ class BatchNorm(Expander): | |||
| input_x = graph_builder.emit('Cast', [input_x], attrs={'dst_type': input_x_new_type}) | |||
| if self.attrs['is_training']: | |||
| self.inputs[0] = input_x | |||
| res_y, mean_res, variance_res, mean_muls, y_sqrt_rec = self._bn_train(graph_builder) | |||
| reduce_axis = () | |||
| shape_x = input_x.shape | |||
| if input_x.data_format == DF.NHWC: | |||
| reduce_axis = (0, 1, 2) | |||
| num = shape_x[0] * shape_x[1] * shape_x[2] | |||
| else: | |||
| reduce_axis = (0, 2, 3) | |||
| num = shape_x[0] * shape_x[2] * shape_x[3] | |||
| num_rec = 1.0 / num | |||
| num_rec_v = graph_builder.value(input_scale.dtype, num_rec) | |||
| # compute mean value of input_x | |||
| mean_sum = graph_builder.emit( | |||
| 'ReduceSum', [input_x], attrs={'reduce_axis': reduce_axis, 'keep_dims': False}) | |||
| mean_muls = graph_builder.emit('Mul', [mean_sum, num_rec_v]) | |||
| # compute variance of input_x | |||
| if input_x.data_format in (DF.DEFAULT, DF.NCHW): | |||
| mean_muls_expand = graph_builder.emit( | |||
| 'Reshape', [mean_muls], attrs={'shape': ExpandDims.infer_shape(mean_muls.shape, [-1, -1])}) | |||
| else: | |||
| mean_muls_expand = mean_muls | |||
| var_sub = graph_builder.emit('Sub', [input_x, mean_muls_expand]) | |||
| var_mul = graph_builder.emit('Mul', [var_sub, var_sub]) | |||
| var_sum = graph_builder.emit('ReduceSum', [var_mul], attrs={'reduce_axis': reduce_axis, 'keep_dims': False}) | |||
| var_mul = graph_builder.emit('Mul', [var_sum, num_rec_v]) | |||
| # y_sqrt_rec means 1 / sqrt(variance + epsilon), which is calculated in backward pass | |||
| scalar_one = 1.0 | |||
| scalar_one_v = graph_builder.value(input_scale.dtype, scalar_one) | |||
| y_add = graph_builder.emit('Add', [var_mul, epsilon_v]) | |||
| y_sqrt = graph_builder.emit('Sqrt', [y_add]) | |||
| y_sqrt_rec = graph_builder.emit('RealDiv', [scalar_one_v, y_sqrt]) | |||
| # compute res_y | |||
| tmp_sub = graph_builder.emit('Sub', [input_x, mean_muls_expand]) | |||
| if input_x.data_format in (DF.DEFAULT, DF.NCHW): | |||
| y_sqrt_rec_expand = graph_builder.emit( | |||
| 'Reshape', [y_sqrt_rec], attrs={'shape': ExpandDims.infer_shape(y_sqrt_rec.shape, [-1, -1])}) | |||
| else: | |||
| y_sqrt_rec_expand = y_sqrt_rec | |||
| y_norm = graph_builder.emit('Mul', [tmp_sub, y_sqrt_rec_expand]) | |||
| if input_x.data_format in (DF.DEFAULT, DF.NCHW): | |||
| input_scale_expand = graph_builder.emit( | |||
| 'Reshape', [input_scale], attrs={'shape': ExpandDims.infer_shape(input_scale.shape, [-1, -1])}) | |||
| else: | |||
| input_scale_expand = input_scale | |||
| res_y_mul = graph_builder.emit('Mul', [input_scale_expand, y_norm]) | |||
| if input_x.data_format in (DF.DEFAULT, DF.NCHW): | |||
| input_offset_expand = graph_builder.emit( | |||
| 'Reshape', [input_offset], attrs={'shape': ExpandDims.infer_shape(input_offset.shape, [-1, -1])}) | |||
| else: | |||
| input_offset_expand = input_offset | |||
| res_y = graph_builder.emit('Add', [res_y_mul, input_offset_expand]) | |||
| # compute mean_res | |||
| momentum_sub = scalar_one - self.attrs['momentum'] | |||
| momentum_v_sub = graph_builder.value(input_scale.dtype, momentum_sub) | |||
| new_running_mean_tmp = graph_builder.emit('Mul', [momentum_v_sub, input_mean]) | |||
| momentum_v = graph_builder.value(input_scale.dtype, self.attrs['momentum']) | |||
| current_mean_tmp = graph_builder.emit('Mul', [momentum_v, mean_muls]) | |||
| updated_moving_mean = graph_builder.emit('Add', [new_running_mean_tmp, current_mean_tmp]) | |||
| mean_res = graph_builder.emit( | |||
| 'InplaceAssign', [input_mean, updated_moving_mean, updated_moving_mean], attrs={'fake_output': True}) | |||
| # variance_res is calculated by sample variance, and need to multiply by num / (num - 1) | |||
| var_num = float(num) / (num - 1) | |||
| var_num_v = graph_builder.value(input_scale.dtype, var_num) | |||
| var_mul_update = graph_builder.emit('Mul', [var_num_v, var_mul]) | |||
| new_running_var_tmp = graph_builder.emit('Mul', [momentum_v_sub, input_variance]) | |||
| current_var_tmp = graph_builder.emit('Mul', [momentum_v, var_mul_update]) | |||
| updated_moving_variance = graph_builder.emit('Add', [new_running_var_tmp, current_var_tmp]) | |||
| variance_res = graph_builder.emit( | |||
| 'InplaceAssign', [input_variance, updated_moving_variance, updated_moving_variance], | |||
| attrs={'fake_output': True}) | |||
| if input_x_new_type != input_x_ori_type: | |||
| res_y = graph_builder.emit('Cast', [res_y], attrs={'dst_type': input_x_ori_type}) | |||
| return res_y, mean_res, variance_res, mean_muls, y_sqrt_rec | |||
| @@ -68,88 +140,3 @@ class BatchNorm(Expander): | |||
| if input_x_new_type != input_x_ori_type: | |||
| res_y = graph_builder.emit('Cast', [res_y], attrs={'dst_type': input_x_ori_type}) | |||
| return res_y, var_add, var_add, var_add, var_add | |||
| def _bn_train(self, graph_builder): | |||
| """expand BatchNorm for training mode""" | |||
| input_x = self.inputs[0] | |||
| input_scale = self.inputs[1] | |||
| input_offset = self.inputs[2] | |||
| input_mean = self.inputs[3] | |||
| input_variance = self.inputs[4] | |||
| epsilon_v = graph_builder.value(input_scale.dtype, self.attrs['epsilon']) | |||
| reduce_axis = () | |||
| shape_x = input_x.shape | |||
| if input_x.data_format == DF.NHWC: | |||
| reduce_axis = (0, 1, 2) | |||
| num = shape_x[0] * shape_x[1] * shape_x[2] | |||
| else: | |||
| reduce_axis = (0, 2, 3) | |||
| num = shape_x[0] * shape_x[2] * shape_x[3] | |||
| num_rec = 1.0 / num | |||
| num_rec_v = graph_builder.value(input_scale.dtype, num_rec) | |||
| # compute mean value of input_x | |||
| mean_sum = graph_builder.emit( | |||
| 'ReduceSum', [input_x], attrs={'reduce_axis': reduce_axis, 'keep_dims': False}) | |||
| mean_muls = graph_builder.emit('Mul', [mean_sum, num_rec_v]) | |||
| # compute variance of input_x | |||
| if input_x.data_format in (DF.DEFAULT, DF.NCHW): | |||
| mean_muls_expand = graph_builder.emit( | |||
| 'Reshape', [mean_muls], attrs={'shape': ExpandDims.infer_shape(mean_muls.shape, [-1, -1])}) | |||
| else: | |||
| mean_muls_expand = mean_muls | |||
| var_sub = graph_builder.emit('Sub', [input_x, mean_muls_expand]) | |||
| var_mul = graph_builder.emit('Mul', [var_sub, var_sub]) | |||
| var_sum = graph_builder.emit('ReduceSum', [var_mul], attrs={'reduce_axis': reduce_axis, 'keep_dims': False}) | |||
| var_mul = graph_builder.emit('Mul', [var_sum, num_rec_v]) | |||
| # y_sqrt_rec means 1 / sqrt(variance + epsilon), which is calculated in backward pass | |||
| scalar_one = 1.0 | |||
| scalar_one_v = graph_builder.value(input_scale.dtype, scalar_one) | |||
| y_add = graph_builder.emit('Add', [var_mul, epsilon_v]) | |||
| y_sqrt = graph_builder.emit('Sqrt', [y_add]) | |||
| y_sqrt_rec = graph_builder.emit('RealDiv', [scalar_one_v, y_sqrt]) | |||
| # compute res_y | |||
| tmp_sub = graph_builder.emit('Sub', [input_x, mean_muls_expand]) | |||
| if input_x.data_format in (DF.DEFAULT, DF.NCHW): | |||
| y_sqrt_rec_expand = graph_builder.emit( | |||
| 'Reshape', [y_sqrt_rec], attrs={'shape': ExpandDims.infer_shape(y_sqrt_rec.shape, [-1, -1])}) | |||
| else: | |||
| y_sqrt_rec_expand = y_sqrt_rec | |||
| y_norm = graph_builder.emit('Mul', [tmp_sub, y_sqrt_rec_expand]) | |||
| if input_x.data_format in (DF.DEFAULT, DF.NCHW): | |||
| input_scale_expand = graph_builder.emit( | |||
| 'Reshape', [input_scale], attrs={'shape': ExpandDims.infer_shape(input_scale.shape, [-1, -1])}) | |||
| else: | |||
| input_scale_expand = input_scale | |||
| res_y_mul = graph_builder.emit('Mul', [input_scale_expand, y_norm]) | |||
| if input_x.data_format in (DF.DEFAULT, DF.NCHW): | |||
| input_offset_expand = graph_builder.emit( | |||
| 'Reshape', [input_offset], attrs={'shape': ExpandDims.infer_shape(input_offset.shape, [-1, -1])}) | |||
| else: | |||
| input_offset_expand = input_offset | |||
| res_y = graph_builder.emit('Add', [res_y_mul, input_offset_expand]) | |||
| # compute mean_res | |||
| momentum_sub = scalar_one - self.attrs['momentum'] | |||
| momentum_v_sub = graph_builder.value(input_scale.dtype, momentum_sub) | |||
| new_running_mean_tmp = graph_builder.emit('Mul', [momentum_v_sub, input_mean]) | |||
| momentum_v = graph_builder.value(input_scale.dtype, self.attrs['momentum']) | |||
| current_mean_tmp = graph_builder.emit('Mul', [momentum_v, mean_muls]) | |||
| updated_moving_mean = graph_builder.emit('Add', [new_running_mean_tmp, current_mean_tmp]) | |||
| mean_res = graph_builder.emit( | |||
| 'InplaceAssign', [input_mean, updated_moving_mean, updated_moving_mean], attrs={'fake_output': True}) | |||
| # variance_res is calculated by sample variance, and need to multiply by num / (num - 1) | |||
| var_num = float(num) / (num - 1) | |||
| var_num_v = graph_builder.value(input_scale.dtype, var_num) | |||
| var_mul_update = graph_builder.emit('Mul', [var_num_v, var_mul]) | |||
| new_running_var_tmp = graph_builder.emit('Mul', [momentum_v_sub, input_variance]) | |||
| current_var_tmp = graph_builder.emit('Mul', [momentum_v, var_mul_update]) | |||
| updated_moving_variance = graph_builder.emit('Add', [new_running_var_tmp, current_var_tmp]) | |||
| variance_res = graph_builder.emit( | |||
| 'InplaceAssign', [input_variance, updated_moving_variance, updated_moving_variance], | |||
| attrs={'fake_output': True}) | |||
| return res_y, mean_res, variance_res, mean_muls, y_sqrt_rec | |||
| @@ -17,14 +17,12 @@ from mindspore._extends.graph_kernel.model.model import DataFormat as DF | |||
| from ._utils import Expander, ExpanderInfoValidator as VLD | |||
| from .expand_dims import ExpandDims | |||
| @VLD.add_format(DF.NHWC, DF.NHWC, DF.DEFAULT, DF.DEFAULT, DF.DEFAULT, DF.DEFAULT) | |||
| @VLD.add_format(DF.NCHW, DF.NCHW, DF.DEFAULT, DF.DEFAULT, DF.DEFAULT, DF.DEFAULT) | |||
| @VLD.add_format(DF.DEFAULT, DF.DEFAULT, DF.DEFAULT, DF.DEFAULT, DF.DEFAULT, DF.DEFAULT) | |||
| @VLD.check_attrs('is_training', 'epsilon') | |||
| class BatchNormGrad(Expander): | |||
| """BatchNormGrad expander""" | |||
| def _expand(self, graph_builder): | |||
| # get op info | |||
| input_dy = self.inputs[0] | |||
| @@ -0,0 +1,48 @@ | |||
| # Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # =========================================================================== | |||
| """generate json desc for bias_add""" | |||
| from mindspore._extends.graph_kernel.model.model import DataFormat as DF | |||
| from ._utils import Expander, ExpanderInfoValidator as VLD | |||
| from .expand_dims import ExpandDims | |||
| @VLD.add_format(DF.DEFAULT, DF.DEFAULT) | |||
| @VLD.add_format(DF.NCHW, DF.DEFAULT) | |||
| @VLD.add_format(DF.NHWC, DF.DEFAULT) | |||
| class BiasAdd(Expander): | |||
| """BiasAdd expander""" | |||
| def _expand(self, graph_builder): | |||
| input_x, input_y = self.inputs | |||
| if input_x.data_format == DF.NCHW: | |||
| input_y_expand = graph_builder.emit( | |||
| 'Reshape', [input_y], attrs={'shape': ExpandDims.infer_shape(input_y.shape, [1, 2])}) | |||
| result = graph_builder.emit('Add', [input_x, input_y_expand]) | |||
| elif input_x.data_format == DF.DEFAULT: | |||
| if len(input_x.shape) == 2: | |||
| result = graph_builder.emit('Add', [input_x, input_y]) | |||
| elif len(input_x.shape) == 3: | |||
| input_y_expand = graph_builder.emit( | |||
| 'Reshape', [input_y], attrs={'shape': ExpandDims.infer_shape(input_y.shape, 1)}) | |||
| result = graph_builder.emit('Add', [input_x, input_y_expand]) | |||
| else: # len == 4 | |||
| input_y_expand = graph_builder.emit( | |||
| 'Reshape', [input_y], attrs={'shape': ExpandDims.infer_shape(input_y.shape, [1, 2])}) | |||
| result = graph_builder.emit('Add', [input_x, input_y_expand]) | |||
| else: # NHWC | |||
| result = graph_builder.emit('Add', [input_x, input_y]) | |||
| return result | |||
| @@ -15,7 +15,6 @@ | |||
| """generate json desc for FusedMulAdd""" | |||
| from ._utils import Expander | |||
| class FusedMulAdd(Expander): | |||
| """FusedMulAdd expander""" | |||
| @@ -15,15 +15,13 @@ | |||
| """generate json desc for LayerNorm""" | |||
| from mindspore._extends.graph_kernel.model.model import DataFormat as DF | |||
| from ._utils import Expander, ExpanderInfoValidator as VLD | |||
| from ._utils import infer_shape_from_fractalnz, get_reduced_ori_shape, to_frac_z_axis | |||
| from ._utils import infer_shape_from_fractalNz, get_reduced_ori_shape, to_frac_z_axis | |||
| @VLD.add_format(DF.FRAC_NZ, DF.DEFAULT, DF.DEFAULT) | |||
| @VLD.add_format(DF.DEFAULT, DF.DEFAULT, DF.DEFAULT) | |||
| @VLD.check_attrs('begin_norm_axis', 'begin_params_axis', 'epsilon') | |||
| class LayerNorm(Expander): | |||
| """LayerNorm expander""" | |||
| def _expand(self, graph_builder): | |||
| input_x, input_gamma, input_beta = self.inputs | |||
| processor = self.processor | |||
| @@ -38,7 +36,7 @@ class LayerNorm(Expander): | |||
| ori_shape_x = input_x.shape | |||
| if input_x.data_format == DF.FRAC_NZ: | |||
| ori_shape_x = infer_shape_from_fractalnz(ori_shape_x) | |||
| ori_shape_x = infer_shape_from_fractalNz(ori_shape_x) | |||
| # Calculate the scaling ratio of the average | |||
| if begin_norm_axis < 0: | |||
| @@ -17,7 +17,6 @@ from mindspore._extends.graph_kernel.model.model import DataFormat as DF | |||
| from mindspore._extends.graph_kernel.model.model import GraphKernelUnsupportedException as GKException | |||
| from ._utils import Expander, ExpanderInfoValidator as VLD | |||
| @VLD.check_attrs('transpose_a', 'transpose_b', 'left_format', 'right_format') | |||
| class MatMul(Expander): | |||
| """ | |||
| @@ -25,7 +24,7 @@ class MatMul(Expander): | |||
| """ | |||
| def __init__(self, expand_info): | |||
| super(MatMul, self).__init__(expand_info) | |||
| super().__init__(expand_info) | |||
| self.transpose_a = self.attrs['transpose_a'] | |||
| self.transpose_b = self.attrs['transpose_b'] | |||
| self.left_format = self.attrs['left_format'] | |||
| @@ -48,28 +47,28 @@ class MatMul(Expander): | |||
| if input_num < 2: | |||
| raise GKException("matul inputs number should bigger than 1, but got {}.".format(input_num)) | |||
| def _trans_shape(self, shape): | |||
| trans_shape = list(shape) | |||
| trans_shape[-2] = shape[-1] | |||
| trans_shape[-1] = shape[-2] | |||
| return trans_shape | |||
| def _expand(self, graph_builder): | |||
| def transpose(shape): | |||
| trans_shape = list(shape) | |||
| trans_shape[-2] = shape[-1] | |||
| trans_shape[-1] = shape[-2] | |||
| return trans_shape | |||
| if not self._optimize_to_mul(): | |||
| raise GKException("MatMul/BatchMatMul do not need to be replaced by Mul") | |||
| # Matmul is replaced by Mul([b m k], [b k n]) when k==1 | |||
| #Matmul is replaced by Mul([b m k], [b k n]) when k==1 | |||
| input_a = self.inputs[0] | |||
| input_b = self.inputs[1] | |||
| if self.transpose_a: | |||
| shape_a_trans = transpose(self.shape_a) | |||
| shape_a_trans = self._trans_shape(self.shape_a) | |||
| input_a = graph_builder.emit('Reshape', [input_a], attrs={'shape': shape_a_trans}) | |||
| if self.transpose_b: | |||
| shape_b_trans = transpose(self.shape_b) | |||
| shape_b_trans = self._trans_shape(self.shape_b) | |||
| input_b = graph_builder.emit('Reshape', [input_b], attrs={'shape': shape_b_trans}) | |||
| result = graph_builder.emit('Mul', [input_a, input_b]) | |||
| if 'dst_type' in self.attrs and self.inputs[0].dtype != self.attrs['dst_type']: | |||
| result = graph_builder.emit('Cast', [result], attrs={'dst_type': self.attrs['dst_type']}) | |||
| return result | |||
| class BatchMatMul(MatMul): | |||
| """BatchMatMul expander""" | |||
| @@ -24,7 +24,7 @@ class MinimumGrad(Expander): | |||
| def _check(self): | |||
| if not self.attrs.get('grad_x', True) and not self.attrs.get('grad_y', True): | |||
| raise GKException("both grad_x and grad_y are False.") | |||
| return super(MinimumGrad, self)._check() | |||
| return super()._check() | |||
| def _expand(self, graph_builder): | |||
| input_x, input_y, input_dout = self.inputs | |||
| @@ -34,8 +34,7 @@ class MinimumGrad(Expander): | |||
| dx = graph_builder.emit('Mul', [le_result, input_dout]) | |||
| dy = graph_builder.emit('Sub', [input_dout, dx]) | |||
| # for minimumgrad op, output_shape should be equal to input_shape, | |||
| # but some elementwise operating may broadcast input_shape | |||
| # for minimumgrad op, output_shape should be equal to input_shape, but some elementwise operating may broadcast input_shape | |||
| # then output_shape not equal to original input_shape, so need to reduce output to let them equal | |||
| reduce_axis_x = self.get_reduce_axis(input_x.shape, dx.shape) | |||
| reduce_axis_y = self.get_reduce_axis(input_y.shape, dy.shape) | |||
| @@ -15,8 +15,7 @@ | |||
| """generate json desc for softmax""" | |||
| from mindspore._extends.graph_kernel.model.model import DataFormat as DF | |||
| from ._utils import Expander, ExpanderInfoValidator as VLD | |||
| from ._utils import infer_shape_from_fractalnz, get_reduced_ori_shape, to_frac_z_axis | |||
| from ._utils import infer_shape_from_fractalNz, get_reduced_ori_shape, to_frac_z_axis | |||
| @VLD.add_format(DF.FRAC_NZ) | |||
| @VLD.add_format(DF.DEFAULT) | |||
| @@ -31,7 +30,7 @@ class Softmax(Expander): | |||
| ori_shape = input_x.shape | |||
| if input_x.data_format == DF.FRAC_NZ: | |||
| ori_shape = infer_shape_from_fractalnz(input_x.shape) | |||
| ori_shape = infer_shape_from_fractalNz(input_x.shape) | |||
| for i, _ in enumerate(list(axis)): | |||
| if axis[i] < 0: | |||
| @@ -15,8 +15,7 @@ | |||
| """generate json desc for SoftmaxGradExt""" | |||
| from mindspore._extends.graph_kernel.model.model import DataFormat as DF | |||
| from ._utils import Expander, ExpanderInfoValidator as VLD | |||
| from ._utils import infer_shape_from_fractalnz, get_reduced_ori_shape, to_frac_z_axis | |||
| from ._utils import infer_shape_from_fractalNz, get_reduced_ori_shape, to_frac_z_axis | |||
| @VLD.add_format(DF.FRAC_NZ, DF.FRAC_NZ, DF.DEFAULT) | |||
| @VLD.add_format(DF.DEFAULT, DF.DEFAULT, DF.DEFAULT) | |||
| @@ -30,7 +29,7 @@ class SoftmaxGradExt(Expander): | |||
| ori_shape = x.shape | |||
| if x.data_format == DF.FRAC_NZ: | |||
| ori_shape = infer_shape_from_fractalnz(ori_shape) | |||
| ori_shape = infer_shape_from_fractalNz(ori_shape) | |||
| if not axis: | |||
| axis = [] | |||
| for i, _ in enumerate(ori_shape): | |||
| @@ -15,7 +15,7 @@ | |||
| """generate json desc for SquareSumV1""" | |||
| from mindspore._extends.graph_kernel.model.model import DataFormat as DF | |||
| from ._utils import Expander, ExpanderInfoValidator as VLD | |||
| from ._utils import infer_shape_from_fractalnz, get_reduced_ori_shape, to_frac_z_axis | |||
| from ._utils import infer_shape_from_fractalNz, get_reduced_ori_shape, to_frac_z_axis | |||
| @VLD.add_format(DF.FRAC_NZ) | |||
| @@ -30,7 +30,7 @@ class SquareSumV1(Expander): | |||
| ori_shape = x.shape | |||
| if x.data_format == DF.FRAC_NZ: | |||
| ori_shape = infer_shape_from_fractalnz(ori_shape) | |||
| ori_shape = infer_shape_from_fractalNz(ori_shape) | |||
| if not axis: | |||
| axis = [] | |||
| for i, _ in enumerate(ori_shape): | |||
| @@ -17,8 +17,6 @@ from .model import PrimLib | |||
| class ParalGain: | |||
| """Paral Gain""" | |||
| def __init__(self, fusion_type, bottleneck, gain, block_assign, type_info): | |||
| self.fusion_type = fusion_type | |||
| self.bottleneck = bottleneck | |||
| @@ -43,9 +41,7 @@ class ScheduleAnalyzer: | |||
| self.ops = graph.ops | |||
| self.dom_op = [out.op for out in outputs] | |||
| @staticmethod | |||
| def prod(shape): | |||
| """Compute shape product""" | |||
| def prod(self, shape): | |||
| res = shape[0] | |||
| for i in range(1, len(shape)): | |||
| res = res * shape[i] | |||
| @@ -258,7 +254,7 @@ class ScheduleAnalyzer: | |||
| fusion_type = "block_fusion" | |||
| type_info = None | |||
| activate_pipeline_optimization = False # Disable pipeline optimization for now. | |||
| activate_pipeline_optimization = False # Disable pipeline optimization for now. | |||
| if activate_pipeline_optimization: | |||
| pipeline_info = ScheduleAnalyzer.pipeline_fusion_analyze( | |||
| blocks, op_sizes, exclude_gid) | |||
| @@ -291,5 +287,4 @@ def block_parallel_estimate(graphs): | |||
| def parallel_estimate(graphs): | |||
| """Estimate parallel gain""" | |||
| return block_parallel_estimate(graphs) | |||
| @@ -13,6 +13,7 @@ | |||
| # limitations under the License. | |||
| # =========================================================================== | |||
| """Cost model splitter""" | |||
| import os | |||
| from functools import reduce as prod_reduce | |||
| from mindspore import log as logger | |||
| from .model import PrimLib, Graph, Tensor, Operator | |||
| @@ -38,24 +39,20 @@ class GraphSplitByPattern: | |||
| def sync(self, x, y): | |||
| """sync from y to x""" | |||
| for i in self.alive: | |||
| self._link(self.map[y][i], x, i) | |||
| def _link(self, cond, f, t): | |||
| """link from `f` to `t`""" | |||
| if cond: | |||
| self.map[f][t] = True | |||
| if self.map[y][i] and not self.map[x][i]: | |||
| self.map[x][i] = True | |||
| def fuse(self, x, y): | |||
| """fuse y to x""" | |||
| for i in self.alive: | |||
| # i is the succeeding node of y, links the x's previous nodes to i | |||
| if self.map[y][i] and not self.map[x][i]: | |||
| for pre in self.alive: | |||
| self._link(self.map[pre][x], pre, i) | |||
| # i is the previous node of y, link i to x's succeeding nodes | |||
| if self.map[pre][x] and not self.map[pre][i]: | |||
| self.map[pre][i] = True | |||
| if self.map[i][y] and not self.map[i][x]: | |||
| for suc in self.alive: | |||
| self._link(self.map[x][suc], i, suc) | |||
| if self.map[x][suc] and not self.map[i][suc]: | |||
| self.map[i][suc] = True | |||
| self.alive.remove(y) | |||
| class Area: | |||
| @@ -70,10 +67,6 @@ class GraphSplitByPattern: | |||
| self.stitch_ops = set() | |||
| self.stitch_atomic_ops = set() | |||
| def has_stitch_op(self): | |||
| """check stitch_op exists""" | |||
| return self.stitch_ops or self.stitch_atomic_ops | |||
| def __init__(self, init_op, is_output, unique_id, reach_tab, recompute_ops=None): | |||
| self.pattern = PrimLib.iter_type(init_op) if init_op is not None else PrimLib.UNKNOWN | |||
| self.ops = [] if init_op is None else [init_op] | |||
| @@ -293,35 +286,31 @@ class GraphSplitByPattern: | |||
| def fuse(self, selector): | |||
| """Fuse areas""" | |||
| def _fuse_area(): | |||
| changed = False | |||
| while True: | |||
| for dominant in self.areas: | |||
| result = selector(dominant) | |||
| if result is None or not result[0]: | |||
| continue | |||
| fuse_areas, is_forward = result | |||
| fuse_areas = self.limit_area_size(dominant, fuse_areas) | |||
| if not fuse_areas: | |||
| continue | |||
| if is_forward: | |||
| for area in fuse_areas: | |||
| dominant.fuse(area) | |||
| self.set_area_map(area.ops, dominant) | |||
| self.areas.remove(area) | |||
| else: | |||
| forward_area = dominant | |||
| for area in fuse_areas: | |||
| area.fuse(forward_area) | |||
| self.set_area_map(forward_area.ops, area) | |||
| self.areas.remove(forward_area) | |||
| forward_area = area | |||
| return True | |||
| return False | |||
| changed, do_again = False, True | |||
| while do_again: | |||
| do_again = _fuse_area() | |||
| changed = changed or do_again | |||
| return changed | |||
| if result is not None and result[0]: | |||
| fuse_areas, is_forward = result | |||
| fuse_areas = self.limit_area_size(dominant, fuse_areas) | |||
| if not fuse_areas: | |||
| continue | |||
| if is_forward: | |||
| for area in fuse_areas: | |||
| dominant.fuse(area) | |||
| self.set_area_map(area.ops, dominant) | |||
| self.areas.remove(area) | |||
| else: | |||
| forward_area = dominant | |||
| for area in fuse_areas: | |||
| area.fuse(forward_area) | |||
| self.set_area_map(forward_area.ops, area) | |||
| self.areas.remove(forward_area) | |||
| forward_area = area | |||
| changed = True | |||
| break | |||
| else: | |||
| return changed | |||
| def fuse_recom(self, selector): | |||
| """Fuse recompute area to its user""" | |||
| @@ -359,6 +348,21 @@ class GraphSplitByPattern: | |||
| graphmodes.append("basic" if area.mode == self.Area.MODE_BASIC else "composite") | |||
| return subgraphs, graphmodes | |||
| def dump_subgraphs(self, subgraphs): | |||
| """Dump subgraphs""" | |||
| if os.environ.get("ENABLE_SUBGRAPHS", "off") == "on": | |||
| subgraphs_str = "subgraphs:\nlen: " + str(len(subgraphs)) + "\n" | |||
| for i, sub in enumerate(subgraphs): | |||
| subgraphs_str += str("============") + str(i) + "\n" | |||
| subgraphs_str += str(sub) | |||
| dirname = 'subgraphs' | |||
| if not os.path.exists(dirname): | |||
| os.makedirs(dirname) | |||
| graphname = self.graph.name | |||
| filename = dirname + '/' + graphname + '.log' | |||
| with os.fdopen(os.open(filename, os.O_RDWR | os.O_CREAT), 'w+') as f: | |||
| f.write(subgraphs_str) | |||
| def pattern_fuse(self, fuse_func=None): | |||
| """fuse Areas by pattern repeatedly""" | |||
| del fuse_func | |||
| @@ -372,38 +376,34 @@ class GraphSplitByPattern: | |||
| # Note: after this function, the input output relation is not maintained. | |||
| self.split_output_reshapes() | |||
| subgraphs, graphmodes = self.to_subgraphs() | |||
| self.dump_subgraphs(subgraphs) | |||
| return subgraphs, graphmodes | |||
| def split_output_reshapes(self): | |||
| """Force split the output Reshapes into other new area""" | |||
| def _remove_output_reshape(reshape_ops, other_ops): | |||
| def _run(): | |||
| for op in reshape_ops: | |||
| if any([to_op in other_ops for to_op in op.output.to_ops]): | |||
| reshape_ops.remove(op) | |||
| other_ops.append(op) | |||
| return True | |||
| return False | |||
| while _run(): | |||
| pass | |||
| """Force split the output reshapes into other new """ | |||
| new_areas = [] | |||
| for area in self.areas: | |||
| reshape_ops = [op for op in area.ops if PrimLib.iter_type(op) == PrimLib.RESHAPE] | |||
| other_ops = [op for op in area.ops if op not in reshape_ops] | |||
| if not other_ops or not reshape_ops: | |||
| out_reshape_ops = [op for op in area.ops if PrimLib.iter_type(op) == PrimLib.RESHAPE] | |||
| remain_ops = [op for op in area.ops if op not in out_reshape_ops] | |||
| if not remain_ops or not out_reshape_ops: | |||
| continue | |||
| # remove the output reshape from "reshape_ops" and add it into "other_ops" | |||
| _remove_output_reshape(reshape_ops, other_ops) | |||
| if not reshape_ops: | |||
| continue | |||
| for op in reshape_ops: | |||
| a = self.Area(op, False, 0, self.reach_tab) | |||
| self.set_default_mode(a) | |||
| new_areas.append(a) | |||
| area.ops = other_ops | |||
| if len(other_ops) == 1: | |||
| self.set_default_mode(area) | |||
| changed = True | |||
| while changed: | |||
| changed = False | |||
| for op in out_reshape_ops: | |||
| if any([to_op in remain_ops for to_op in op.output.to_ops]): | |||
| out_reshape_ops.remove(op) | |||
| remain_ops.append(op) | |||
| changed = True | |||
| break | |||
| if out_reshape_ops: | |||
| for op in out_reshape_ops: | |||
| a = self.Area(op, False, 0, self.reach_tab) | |||
| self.set_default_mode(a) | |||
| new_areas.append(a) | |||
| area.ops = remain_ops | |||
| if len(remain_ops) == 1: | |||
| self.set_default_mode(area) | |||
| if new_areas: | |||
| self.areas += new_areas | |||
| @@ -472,8 +472,8 @@ class GraphSplitByPattern: | |||
| region_ops.append(op) | |||
| return False, None, weight, True | |||
| # region fails to grow | |||
| max_weight = 20 | |||
| if weight > max_weight or len(op.inputs) > 1 or PrimLib.iter_type(op) > PrimLib.BROADCAST: | |||
| MAX_WEIGHT = 20 | |||
| if weight > MAX_WEIGHT or len(op.inputs) > 1 or PrimLib.iter_type(op) > PrimLib.BROADCAST: | |||
| return False, None, weight, False | |||
| # region grows successfully | |||
| weight = weight + 1 | |||
| @@ -486,7 +486,7 @@ class GraphSplitByPattern: | |||
| cheap_regions = [] | |||
| for output in outputs: | |||
| # tensor should have user other than user_area to be fused | |||
| if len(output.to_ops) < 2: | |||
| if output.para_type != Tensor.PARA_OUTPUT and len(output.to_ops) < 2: | |||
| continue | |||
| region_ops = [] | |||
| grow = True | |||
| @@ -533,7 +533,14 @@ class GraphSplitByPattern: | |||
| """find recompute regions and copy them out to new Areas""" | |||
| def do_recompute_fuse(): | |||
| """split the unfusing pattern by add recompute area""" | |||
| def recompute_cheap_region(dom): | |||
| recompute_suc = False | |||
| orig_areas = [] | |||
| orig_areas.extend(self.areas) | |||
| for dom in orig_areas: | |||
| if dom not in self.areas or not dom.out_relations: | |||
| continue | |||
| cheap_regions = self.find_cheap_regions(dom) | |||
| dom_changed = False | |||
| for cheap_region in cheap_regions: | |||
| user_areas = self.select_user_area(cheap_region[-1].output) | |||
| if not user_areas: | |||
| @@ -543,17 +550,12 @@ class GraphSplitByPattern: | |||
| self.pattern_fuse(self.fuse_recom) | |||
| self.clear_recompute() | |||
| if self.recom_res: | |||
| return True | |||
| return False | |||
| recompute_suc = False | |||
| orig_areas = [] | |||
| orig_areas.extend(self.areas) | |||
| for dom in orig_areas: | |||
| if dom not in self.areas or not dom.out_relations: | |||
| continue | |||
| cheap_regions = self.find_cheap_regions(dom) | |||
| if recompute_cheap_region(dom): | |||
| recompute_suc = True | |||
| recompute_suc = True | |||
| # Copy region at most once for this dom | |||
| dom_changed = True | |||
| break | |||
| if dom_changed: | |||
| break | |||
| return recompute_suc | |||
| if self.enable_recompute: | |||
| @@ -561,6 +563,9 @@ class GraphSplitByPattern: | |||
| self.pattern_fuse() | |||
| use_poly_reduce = True | |||
| class GraphSplitGpu(GraphSplitByPattern): | |||
| """Graph splitter""" | |||
| BORADCAST_FUSE_DEPTH = 20 | |||
| @@ -611,7 +616,7 @@ class GraphSplitGpu(GraphSplitByPattern): | |||
| return fused, True | |||
| def _broadcast_pat_exclude(dom, a, r): | |||
| if a.pattern == PrimLib.REDUCE: | |||
| if use_poly_reduce and a.pattern == PrimLib.REDUCE: | |||
| return dom.pattern > PrimLib.ELEMWISE or r > PrimLib.ELEMWISE | |||
| return a.pattern > PrimLib.REDUCE or r > PrimLib.BROADCAST | |||
| @@ -636,14 +641,34 @@ class GraphSplitGpu(GraphSplitByPattern): | |||
| fused.append(a) | |||
| return fused, False | |||
| def _check_reduce_exclude(dom): | |||
| if use_poly_reduce: | |||
| return False | |||
| # exclude large all-reduce | |||
| if len(dom.ops[0].inputs[0].shape) == len(dom.ops[0].attrs["reduce_axis"]) and \ | |||
| dom.ops[0].inputs[0].get_size() > 10000: | |||
| return True | |||
| # exclude multi output | |||
| for a in dom.in_relations.keys(): | |||
| if len(a.out_relations) > 1: | |||
| return True | |||
| if any([op.output.para_type == Tensor.PARA_OUTPUT for op in a.ops]): | |||
| return True | |||
| return False | |||
| def _reduce_pat_exclude(_, a, r): | |||
| if len(a.ops) > self.REDUCE_FUSE_DEPTH: | |||
| return True | |||
| return a.pattern > PrimLib.ELEMWISE or r > PrimLib.REDUCE or r == PrimLib.BROADCAST | |||
| if use_poly_reduce: | |||
| return a.pattern > PrimLib.ELEMWISE or r > PrimLib.REDUCE or r == PrimLib.BROADCAST | |||
| return a.pattern > PrimLib.BROADCAST or r > PrimLib.REDUCE | |||
| def _reduce_depth(dom): | |||
| if dom.pattern != PrimLib.REDUCE or len(dom.in_relations) != 1: | |||
| return None | |||
| if _check_reduce_exclude(dom): | |||
| return None | |||
| a, r = list(dom.in_relations.items())[0] | |||
| if dom.ops[0].inputs[0].dtype == "float16" and a.is_output and len(a.ops) >= 10 and \ | |||
| _is_atomic_add_available(dom): | |||
| @@ -656,6 +681,8 @@ class GraphSplitGpu(GraphSplitByPattern): | |||
| def _reduce_width(dom): | |||
| if dom.pattern != PrimLib.REDUCE: | |||
| return None | |||
| if _check_reduce_exclude(dom): | |||
| return None | |||
| fused = [] | |||
| for a, r in dom.in_relations.items(): | |||
| if dom.ops[0].inputs[0].dtype == "float16" and a.is_output and len(a.ops) >= 10 and \ | |||
| @@ -736,16 +763,16 @@ class GraphSplitGpu(GraphSplitByPattern): | |||
| def _may_stitch(dom, a, r): | |||
| if a.pattern <= PrimLib.REDUCE and r <= PrimLib.BROADCAST and dom.check_acyclic(a): | |||
| if _reduce_nums(a.ops) >= 2: | |||
| return False | |||
| dom_outs = [op.output for op in dom.ops] | |||
| a_ins = [op_input for op in a.ops for op_input in op.inputs] | |||
| a_outs = [op.output for op in a.ops] | |||
| a_final_outs = [tensor for tensor in a_outs if tensor not in a_ins] | |||
| stitch_tensors = [tensor for tensor in dom_outs if tensor in a_ins] | |||
| if not _same_stitch_axis(stitch_tensors, a_final_outs): | |||
| return False | |||
| return any([_tensor_size(tensor) >= 1024 * 1024 for tensor in stitch_tensors]) | |||
| if _reduce_nums(a.ops) < 2: | |||
| dom_outs = [op.output for op in dom.ops] | |||
| a_ins = [op_input for op in a.ops for op_input in op.inputs] | |||
| a_outs = [op.output for op in a.ops] | |||
| a_final_outs = [tensor for tensor in a_outs if tensor not in a_ins] | |||
| stitch_tensors = [tensor for tensor in dom_outs if tensor in a_ins] | |||
| if _same_stitch_axis(stitch_tensors, a_final_outs): | |||
| for tensor in stitch_tensors: | |||
| if _tensor_size(tensor) >= 1024 * 1024: | |||
| return True | |||
| return False | |||
| def _reduce_stitch(dom): | |||
| @@ -758,15 +785,14 @@ class GraphSplitGpu(GraphSplitByPattern): | |||
| fused = [] | |||
| for a, r in dom.out_relations.items(): | |||
| if not _may_stitch(dom, a, r): | |||
| continue | |||
| if a.pattern == PrimLib.REDUCE: | |||
| if a.ops[0].attrs['reduce_axis'] == dom.ops[0].attrs['reduce_axis']: | |||
| if _may_stitch(dom, a, r): | |||
| if a.pattern == PrimLib.REDUCE: | |||
| if a.ops[0].attrs['reduce_axis'] == dom.ops[0].attrs['reduce_axis']: | |||
| dom.stitch_info.stitch_ops.add(dom.ops[0].output.name) | |||
| fused.append(a) | |||
| elif a.pattern == PrimLib.BROADCAST: | |||
| dom.stitch_info.stitch_ops.add(dom.ops[0].output.name) | |||
| fused.append(a) | |||
| elif a.pattern == PrimLib.BROADCAST: | |||
| dom.stitch_info.stitch_ops.add(dom.ops[0].output.name) | |||
| fused.append(a) | |||
| return fused, False | |||
| def _transpose(dom): | |||
| @@ -778,16 +804,6 @@ class GraphSplitGpu(GraphSplitByPattern): | |||
| fused.append(a) | |||
| return fused, True | |||
| def _strided_slice(dom): | |||
| if dom.dom_op().prim != "StridedSlice": | |||
| return None | |||
| fused = [] | |||
| for a, _ in dom.in_relations.items(): | |||
| if a.pattern <= PrimLib.BROADCAST and a.check_acyclic(dom) and \ | |||
| len(a.out_relations) == 1 and not a.is_output: | |||
| fused.append(a) | |||
| return fused, True | |||
| def _fuse_loop(): | |||
| changed = True | |||
| while changed: | |||
| @@ -798,10 +814,10 @@ class GraphSplitGpu(GraphSplitByPattern): | |||
| changed = self.fuse(_reduce_width) or changed | |||
| changed = self.fuse(_broadcast_depth) or changed | |||
| changed = self.fuse(_broadcast_width) or changed | |||
| changed = self.fuse(_strided_slice) or changed | |||
| changed = self.fuse(_reduce_output) or changed | |||
| if enable_stitch_fusion: | |||
| changed = self.fuse(_reduce_stitch) or changed | |||
| if use_poly_reduce: | |||
| changed = self.fuse(_reduce_output) or changed | |||
| if enable_stitch_fusion: | |||
| changed = self.fuse(_reduce_stitch) or changed | |||
| self.fuse(_transpose) | |||
| def _fuse_once(fuse_func): | |||
| @@ -809,8 +825,9 @@ class GraphSplitGpu(GraphSplitByPattern): | |||
| fuse_func(_reduce_depth) or fuse_func(_reduce_width) or fuse_func(_broadcast_depth) or \ | |||
| fuse_func(_broadcast_width): | |||
| return | |||
| if fuse_func(_reduce_output) or (enable_stitch_fusion and fuse_func(_reduce_stitch)): | |||
| return | |||
| if use_poly_reduce: | |||
| if fuse_func(_reduce_output) or (enable_stitch_fusion and fuse_func(_reduce_stitch)): | |||
| return | |||
| fuse_func(_transpose) | |||
| return | |||
| @@ -216,7 +216,6 @@ class PrimLib: | |||
| 'Transpose': Prim(OPAQUE), | |||
| 'Tile': Prim(BROADCAST), | |||
| 'BroadcastTo': Prim(BROADCAST), | |||
| 'StridedSlice': Prim(OPAQUE), | |||
| 'MatMul': Prim(OPAQUE), | |||
| 'TransData': Prim(OPAQUE), | |||
| 'BatchMatMul': Prim(OPAQUE), | |||
| @@ -422,13 +421,14 @@ class Graph: | |||
| for t in op.inputs: | |||
| if t not in inputs and t.op not in self.ops: | |||
| inputs.append(t) | |||
| if op.output in outputs: | |||
| continue | |||
| if op.output.para_type == Tensor.PARA_OUTPUT or not op.output.to_ops: | |||
| outputs.append(op.output) | |||
| continue | |||
| if any([succ not in self.ops for succ in op.output.to_ops]): | |||
| outputs.append(op.output) | |||
| if op.output not in outputs: | |||
| if op.output.para_type == Tensor.PARA_OUTPUT or not op.output.to_ops: | |||
| outputs.append(op.output) | |||
| else: | |||
| for d in op.output.to_ops: | |||
| if d not in self.ops: | |||
| outputs.append(op.output) | |||
| break | |||
| if self.inputs: | |||
| inputs = self.inputs | |||
| @@ -28,13 +28,11 @@ class GraphBuilder: | |||
| self.graph = Graph(name, []) | |||
| def set_input(self, *para): | |||
| """set input to graph inputs""" | |||
| for t in para: | |||
| t.para_type = Tensor.PARA_INPUT | |||
| self.graph.inputs.append(t) | |||
| def set_output(self, *para): | |||
| """set output to graph inputs""" | |||
| for t in para: | |||
| t.para_type = Tensor.PARA_OUTPUT | |||
| self.graph.outputs.append(t) | |||
| @@ -52,8 +50,6 @@ class GraphBuilder: | |||
| def graph_scope(self, name): | |||
| """The graph scope to be processed""" | |||
| class GraphScope: | |||
| """Graph Scope""" | |||
| def __init__(self, gb): | |||
| self.gb = gb | |||
| @@ -81,6 +77,7 @@ class GraphBuilder: | |||
| """Create a new Value""" | |||
| if name in (None, ''): | |||
| name = self._alloc_tensor_name() | |||
| v = Value(name, dtype, value) | |||
| return v | |||
| @@ -108,7 +105,6 @@ class GraphBuilder: | |||
| return output | |||
| def get(self): | |||
| """Get graphs""" | |||
| return self.graphs | |||
| @@ -127,14 +123,34 @@ class CompositeGraph: | |||
| def load(self, desc): | |||
| """Load Graph from json""" | |||
| def _attr_of(op): | |||
| if not op['attr']: | |||
| return dict() | |||
| attr = {} | |||
| for a in op['attr']: | |||
| if a['name'] == 'axis' and op['name'] in ('ReduceSum', 'ReduceMax', 'ReduceMin'): | |||
| attr['reduce_axis'] = a['value'] | |||
| def _attr_of(op, inputs, output): | |||
| def _get_axis_while_none(input_shape, output_shape): | |||
| red_axis = [] | |||
| if len(output_shape) == len(input_shape): | |||
| for i, s in enumerate(output_shape): | |||
| if s == 1 and input_shape[i] > 1: | |||
| red_axis.append(i) | |||
| else: | |||
| red_axis = list(range(len(output_shape))) | |||
| return red_axis | |||
| attr = {} | |||
| if op['name'] in ('ReduceSum', 'ReduceMax', 'ReduceMin'): | |||
| for a in op['attr']: | |||
| if a['name'] == 'axis': | |||
| red_axis, dim_size = [], len(inputs[0].shape) | |||
| if not a['value']: | |||
| red_axis = _get_axis_while_none(inputs[0].shape, output.shape) | |||
| else: | |||
| if isinstance(a['value'], int): | |||
| a['value'] = [a['value']] | |||
| for i in a['value']: | |||
| red_axis.append(i if i >= 0 else dim_size + i) | |||
| attr['reduce_axis'] = red_axis | |||
| if a['name'] == "reduce_output_fuse": | |||
| attr['reduce_output_fuse'] = a['value'] | |||
| elif op['attr']: | |||
| for a in op['attr']: | |||
| attr[a['name']] = a['value'] | |||
| return attr | |||
| @@ -150,6 +166,7 @@ class CompositeGraph: | |||
| 'shape'], out_desc['data_type'], out_desc['format'] | |||
| self.tensors[name] = builder.tensor( | |||
| shape, dtype, data_format, name=name, para_type=Tensor.PARA_OUTPUT) | |||
| cur_fusion = None | |||
| for op in desc['op_desc']: | |||
| inputs = [self.tensors[d['tensor_name']] for x in op['input_desc'] for d in x if 'value' not in d] | |||
| out_desc = op['output_desc'] | |||
| @@ -160,17 +177,25 @@ class CompositeGraph: | |||
| inputs[1].para_type = Tensor.PARA_OUTPUT | |||
| output = inputs[2] | |||
| self.tensors[name] = output | |||
| continue | |||
| output = self.tensors.get(name, None) | |||
| if not output: | |||
| output = builder.tensor(shape, dtype, data_format, name=name) | |||
| self.tensors[name] = output | |||
| builder.op(op['name'], output, inputs, attrs=_attr_of(op)) | |||
| else: | |||
| output = self.tensors.get(name, None) | |||
| if not output: | |||
| output = builder.tensor( | |||
| shape, dtype, data_format, name=name) | |||
| self.tensors[name] = output | |||
| builder.op(op['name'], output, inputs, | |||
| attrs=_attr_of(op, inputs, output)) | |||
| if 'fusion' in op: | |||
| if cur_fusion is None: | |||
| cur_fusion = output | |||
| else: | |||
| cur_fusion.add_buddy(output) | |||
| if op['fusion'].endswith('_end'): | |||
| cur_fusion = None | |||
| self.graph = builder.get()[0] | |||
| self.desc = desc | |||
| def add_stitch_info(self, subgraph, desc): | |||
| """add stitch info to desc""" | |||
| if subgraph.stitch_info and subgraph.stitch_info.stitch_ops: | |||
| buffer_stitch = {'stitch_op': list(subgraph.stitch_info.stitch_ops)} | |||
| if subgraph.stitch_info.stitch_atomic_ops: | |||
| @@ -179,7 +204,6 @@ class CompositeGraph: | |||
| return desc | |||
| def add_recompute_ops(self, subgraph, desc): | |||
| """add recompute ops to desc""" | |||
| if subgraph.recompute_ops: | |||
| desc['recompute_ops'] = [op.output.name for op in subgraph.recompute_ops] | |||
| return desc | |||
| @@ -203,40 +227,43 @@ class CompositeGraph: | |||
| inputs, outputs = subgraph.deduce_parameters() | |||
| graph_ops = set(subgraph.ops) | |||
| inplace_assign, inplace_assign_z = self._pre_dump(outputs) | |||
| def dump_output(t): | |||
| if t.name in inplace_assign: | |||
| z = inplace_assign_z if inplace_assign_z is not None else self.tensors[t.name] | |||
| return {'data_type': z.dtype, 'shape': z.shape, 'tensor_name': inplace_assign[t.name]} | |||
| return {'data_type': t.dtype, 'shape': t.shape, 'tensor_name': t.name} | |||
| def dump_op_desc(d): | |||
| if d['name'] == 'InplaceAssign': | |||
| y = d['input_desc'][1][0]['tensor_name'] | |||
| if self.tensors[y].op in graph_ops: | |||
| z, fake = (inplace_assign_z, False) if inplace_assign_z is not None else (self.tensors[y], True) | |||
| inplace_desc = copy.deepcopy(d) | |||
| inplace_desc['attr'] = {'name': 'fake_output', 'value': fake} | |||
| z_desc, out_desc = inplace_desc['input_desc'][2][0], inplace_desc['output_desc'][0] | |||
| z_desc['shape'] = z.shape | |||
| z_desc['data_type'] = z.dtype | |||
| z_desc['tensor_name'] = z.name | |||
| out_desc['shape'] = z.shape | |||
| out_desc['data_type'] = z.dtype | |||
| return inplace_desc | |||
| op = self.tensors[d['output_desc'][0]['tensor_name']].op | |||
| if op in graph_ops or op in subgraph.recompute_ops: | |||
| return d | |||
| return None | |||
| for key in self.desc.keys(): | |||
| for key in self.desc: | |||
| if key == 'input_desc': | |||
| desc[key] = [[{'data_type': t.dtype, 'shape': t.shape, 'tensor_name': t.name}] for t in inputs] | |||
| desc[key] = [ | |||
| [{'data_type': t.dtype, 'shape': t.shape, 'tensor_name': t.name}] for t in inputs] | |||
| elif key == 'output_desc': | |||
| desc[key] = list(map(dump_output, outputs)) | |||
| out_desc = [] | |||
| for t in outputs: | |||
| if t.name in inplace_assign: | |||
| z = inplace_assign_z if inplace_assign_z is not None else self.tensors[t.name] | |||
| out_desc.append( | |||
| {'data_type': z.dtype, 'shape': z.shape, 'tensor_name': inplace_assign[t.name]}) | |||
| else: | |||
| out_desc.append( | |||
| {'data_type': t.dtype, 'shape': t.shape, 'tensor_name': t.name}) | |||
| desc[key] = out_desc | |||
| elif key == 'op_desc': | |||
| op_desc = map(dump_op_desc, self.desc[key]) | |||
| desc[key] = [d for d in op_desc if d is not None] | |||
| op_desc = [] | |||
| for d in self.desc[key]: | |||
| if d['name'] == 'InplaceAssign': | |||
| y = d['input_desc'][1][0]['tensor_name'] | |||
| if self.tensors[y].op in graph_ops: | |||
| z, fake = (inplace_assign_z, False) if inplace_assign_z is not None else ( | |||
| self.tensors[y], True) | |||
| inplace_desc = copy.deepcopy(d) | |||
| inplace_desc['attr'] = {'name': 'fake_output', 'value': fake} | |||
| z_desc, out_desc = inplace_desc['input_desc'][2][0], inplace_desc['output_desc'][0] | |||
| z_desc['shape'] = z.shape | |||
| z_desc['data_type'] = z.dtype | |||
| z_desc['tensor_name'] = z.name | |||
| out_desc['shape'] = z.shape | |||
| out_desc['data_type'] = z.dtype | |||
| op_desc.append(inplace_desc) | |||
| else: | |||
| op = self.tensors[d['output_desc'][0]['tensor_name']].op | |||
| if op in graph_ops or op in subgraph.recompute_ops: | |||
| op_desc.append(d) | |||
| desc[key] = op_desc | |||
| elif key == 'op': | |||
| desc[key] = subgraph.name | |||
| else: | |||
| @@ -16,7 +16,7 @@ | |||
| import copy | |||
| import sys | |||
| from functools import reduce as prod_reduce | |||
| from functools import reduce | |||
| from .model import GraphKernelUnsupportedException as GKException | |||
| from .model import PrimLib, DataFormat as DF | |||
| @@ -101,24 +101,22 @@ class OpInfer: | |||
| class _Elemwise(OpInfer): | |||
| """Common infer for elementwise operators""" | |||
| @staticmethod | |||
| def broadcast_shape(shapes): | |||
| def _broadcast_shape(self, shapes): | |||
| """deduce broadcast shape using same rules as numpy""" | |||
| dim_size = max([len(shape) for shape in shapes]) | |||
| align_shapes = [[1] * (dim_size - len(shape)) + shape for shape in shapes] | |||
| out_shape = [1] * dim_size | |||
| for i in range(dim_size): | |||
| for align_shape in align_shapes: | |||
| if align_shape[i] == 1: | |||
| continue | |||
| if out_shape[i] == 1: | |||
| out_shape[i] = align_shape[i] | |||
| elif out_shape[i] != align_shape[i]: | |||
| raise GKException("shape broadcast failed!") | |||
| if align_shape[i] > 1: | |||
| if out_shape[i] == 1: | |||
| out_shape[i] = align_shape[i] | |||
| if out_shape[i] != align_shape[i]: | |||
| raise GKException("shape broadcast failed!") | |||
| return out_shape | |||
| @staticmethod | |||
| def defaultformat_to_nz(default_shape): | |||
| def _to_nz(self, default_shape): | |||
| """default format shape to fractal_Nz format shape""" | |||
| if len(default_shape) not in (1, 2): | |||
| raise GKException("shape is too long!") | |||
| @@ -144,17 +142,17 @@ class _Elemwise(OpInfer): | |||
| """returns the output shape with broadcast""" | |||
| # in case all inputs are default format/NHWC/NCHW | |||
| is_default = [op_input.data_format in (DF.DEFAULT, DF.NHWC, DF.NCHW) for op_input in self.inputs] | |||
| is_default = [input.data_format in (DF.DEFAULT, DF.NHWC, DF.NCHW) for input in self.inputs] | |||
| if all(is_default): | |||
| return self.broadcast_shape([op_input.shape for op_input in self.inputs]) | |||
| return self._broadcast_shape([input.shape for input in self.inputs]) | |||
| # in case formats are fractal_nz, default_fromat/NHWC/HCHW(optional) | |||
| is_default_frac_nz = [op_input.data_format in (DF.DEFAULT, DF.NHWC, DF.NCHW, DF.FRAC_NZ) | |||
| for op_input in self.inputs] | |||
| is_default_frac_nz = [input.data_format in (DF.DEFAULT, DF.NHWC, DF.NCHW, DF.FRAC_NZ) | |||
| for input in self.inputs] | |||
| if all(is_default_frac_nz): | |||
| nz_shapes = [self.defaultformat_to_nz(op_input.shape) if op_input.data_format != DF.FRAC_NZ | |||
| else op_input.shape for op_input in self.inputs] | |||
| return self.broadcast_shape(nz_shapes) | |||
| nz_shapes = [self._to_nz(input.shape) if input.data_format != DF.FRAC_NZ else input.shape | |||
| for input in self.inputs] | |||
| return self._broadcast_shape(nz_shapes) | |||
| raise GKException("Only support default and fractal_nz") | |||
| @@ -216,11 +214,9 @@ class _Reshape(OpInfer): | |||
| class Reshape(_Reshape): | |||
| """Reshape op infer""" | |||
| def _check_shape(self): | |||
| size_before_reshape = prod_reduce(lambda x, y: x * y, self.inputs[0].shape) | |||
| size_after_reshape = prod_reduce(lambda x, y: x * y, self.attrs["shape"]) | |||
| size_before_reshape = reduce(lambda x, y: x * y, self.inputs[0].shape) | |||
| size_after_reshape = reduce(lambda x, y: x * y, self.attrs["shape"]) | |||
| if size_before_reshape != size_after_reshape: | |||
| raise GKException("The shape product before and after reshaping should be equal") | |||
| @@ -229,15 +225,11 @@ class Reshape(_Reshape): | |||
| class Cast(_Elemwise): | |||
| """Cast op infer""" | |||
| def _infer_type(self): | |||
| return self.attrs["dst_type"] | |||
| class InplaceAssign(_Elemwise): | |||
| """InplaceAssign op infer""" | |||
| def _infer_shape(self): | |||
| return self.inputs[2].shape | |||
| @@ -249,8 +241,6 @@ class InplaceAssign(_Elemwise): | |||
| class BroadcastTo(OpInfer): | |||
| """BroadcastTo op infer""" | |||
| def _infer_shape(self): | |||
| return self.attrs["shape"] | |||
| @@ -266,8 +256,6 @@ class _CompareOp(_Elemwise): | |||
| class CImag(OpInfer): | |||
| """CImag op infer""" | |||
| def _check_type(self): | |||
| if self.inputs[0].dtype != "complex64": | |||
| raise GKException( | |||
| @@ -278,8 +266,6 @@ class CImag(OpInfer): | |||
| class CReal(OpInfer): | |||
| """CReal op infer""" | |||
| def _check_type(self): | |||
| if self.inputs[0].dtype != "complex64": | |||
| raise GKException( | |||
| @@ -290,8 +276,6 @@ class CReal(OpInfer): | |||
| class Complex(OpInfer): | |||
| """Complex op infer""" | |||
| def _check_type(self): | |||
| if self.inputs[0].dtype != "float32": | |||
| raise GKException( | |||
| @@ -304,28 +288,26 @@ class Complex(OpInfer): | |||
| class Less(_CompareOp): | |||
| """Less op infer""" | |||
| pass | |||
| class LessEqual(_CompareOp): | |||
| """LessEqual op infer""" | |||
| pass | |||
| class Equal(_CompareOp): | |||
| """Equal op infer""" | |||
| pass | |||
| class Greater(_CompareOp): | |||
| """Greater op infer""" | |||
| pass | |||
| class GreaterEqual(_CompareOp): | |||
| """GreaterEqual op infer""" | |||
| pass | |||
| class Select(_Elemwise): | |||
| """Select op infer""" | |||
| def _check_type(self): | |||
| if self.inputs[0].dtype != "bool": | |||
| raise GKException("Select's input[0] should be a bool condition but got {}".format(self.inputs[0].dtype)) | |||
| @@ -337,7 +319,6 @@ class Select(_Elemwise): | |||
| def check_format_any(formats, checked_format): | |||
| """Check whether input format in formats list""" | |||
| if not isinstance(formats, (list, tuple)): | |||
| raise GKException("formats {} should be list or tuple, but got {}.".format(formats, type(formats))) | |||
| if checked_format not in formats: | |||
| @@ -345,13 +326,11 @@ def check_format_any(formats, checked_format): | |||
| def check_nd(data, nd): | |||
| """Check whether data are nd format""" | |||
| if not isinstance(data, (list, tuple)) or len(data) != nd: | |||
| raise GKException("input should be {}D list or tuple, but got {}.".format(nd, data)) | |||
| def conv_had_pad(pad_list, pad_mode): | |||
| """Check whether conv need to add pad""" | |||
| if not isinstance(pad_list, (list, tuple)) or len(pad_list) != 4: | |||
| raise GKException("pad_list should be 4D list or tuple, but got {}".format(pad_list)) | |||
| if pad_list[0] != pad_list[1] or pad_list[2] != pad_list[3]: | |||
| @@ -21,7 +21,7 @@ from . import model | |||
| def estimate_ops(json_str: str): | |||
| """Call cost model to estimate ops.""" | |||
| """Call costmodel to estimate ops.""" | |||
| try: | |||
| json_obj = json.loads(json_str) | |||
| graph_descs = json_obj["graph_desc"] | |||
| @@ -38,7 +38,7 @@ def estimate_ops(json_str: str): | |||
| def estimate_calulation_amount(json_str: str): | |||
| """Call cost model to estimate calculation amount of op.""" | |||
| """Call costmodel to estimate calculation amount of op.""" | |||
| try: | |||
| graph_desc = json.loads(json_str) | |||
| comp = model.load_composite(graph_desc) | |||
| @@ -24,7 +24,7 @@ from . import utils | |||
| def split_with_json(json_str, flags_str): | |||
| """Call cost model to split GraphKernel""" | |||
| """Call costmodel to split GraphKernel""" | |||
| try: | |||
| graph_desc = json.loads(json_str) | |||
| flags = json.loads(flags_str) | |||
| @@ -57,11 +57,11 @@ def _dump_split_info(flags, graph_json, graph_desc, subgraphs, graph_mode): | |||
| return | |||
| utils.create_dir(utils.GRAPH_KERNEL_DUMP_PATH) | |||
| filename = os.path.join(utils.GRAPH_KERNEL_DUMP_PATH, "graph_kernel_split_mode.txt") | |||
| with os.fdopen(os.open(filename, os.O_WRONLY | os.O_CREAT), "a+") as f: | |||
| with open(filename, "a+") as f: | |||
| f.write("********** main graph: {} **********\n".format(graph_desc.name)) | |||
| f.write("input json:\n{}\n".format(graph_json)) | |||
| f.write("graph desc:\n{}\n".format(str(graph_desc))) | |||
| if len(subgraphs) > 1 or subgraphs[0].stitch_info.has_stitch_op(): | |||
| if len(subgraphs) > 1: | |||
| for i, g in enumerate(subgraphs): | |||
| f.write("-------- subgraph {}, mode: {} --------\n".format(i, graph_mode[i])) | |||
| f.write("{}\n".format(str(g))) | |||
| @@ -26,5 +26,3 @@ def create_dir(pathname): | |||
| os.mkdir(pathname) | |||
| except OSError: | |||
| pass | |||
| finally: | |||
| pass | |||
| @@ -50,6 +50,11 @@ def _compile_akg_task_gpu(json_strs, attrs): | |||
| if not res: | |||
| raise ValueError("Compile error, args: {}! build attrs: {}".format(json_str, attrs)) | |||
| pid_path = os.path.realpath("./cuda_meta_" + str(os.getpid())) | |||
| if os.path.exists(pid_path): | |||
| copy_json(pid_path, os.path.realpath("./cuda_meta_" + str(os.getppid()))) | |||
| shutil.rmtree(pid_path) | |||
| def _compile_akg_task_ascend(json_strs, attrs): | |||
| """ | |||
| @@ -32,7 +32,7 @@ from te_fusion.parallel_compilation import init_multi_process_env, start_ga_mult | |||
| get_finished_compilation_task | |||
| from .tbe_helper import get_soc_info, assemble_op_args, get_compute_op_list, get_options_info, get_fuzz_build_info, \ | |||
| BuildType, adjust_custom_op_info, pack_op_args, get_module_name | |||
| BuildType, adjust_custom_op_info, pack_op_args | |||
| from .tbe_job import TbeJob, JobStatus | |||
| PLATFORM_FLAG = ["Ascend310", "Ascend910", "Hi3796CV300ES", "Ascend710", "Ascend610", "Hi3796CV300CS", "SD3403"] | |||
| @@ -242,7 +242,7 @@ def check_support(job: TbeJob): | |||
| op_func_name = compute_op_info["func_name"] | |||
| if op_func_name in ("resize_nearest_neighbor_v2_grad_d", "resize_bilinear_v2_grad"): | |||
| attrs.pop(-2) | |||
| op_module_name = get_module_name(compute_op_info) | |||
| op_module_name = compute_op_info["module_name"] | |||
| py_module_path = compute_op_info["py_module_path"] | |||
| _normalize_module_name(op_module_name, py_module_path) | |||
| func_name = "check_supported" | |||
| @@ -281,7 +281,7 @@ def select_op_format(job: TbeJob): | |||
| compute_op_info = compute_op_info_list[0] | |||
| adjust_custom_op_info(compute_op_info) | |||
| inputs, outputs, attrs = assemble_op_args(compute_op_info) | |||
| op_module_name = get_module_name(compute_op_info) | |||
| op_module_name = compute_op_info["module_name"] | |||
| py_module_path = compute_op_info["py_module_path"] | |||
| _normalize_module_name(op_module_name, py_module_path) | |||
| op_func_name = "op_select_format" | |||
| @@ -317,7 +317,7 @@ def _pre_build_compute_op_info(compute_op, job): | |||
| if l1_size != -1: | |||
| set_L1_info("op_L1_space", -1) | |||
| inputs, outputs, attrs = assemble_op_args(compute_op) | |||
| op_module_name = get_module_name(compute_op) | |||
| op_module_name = compute_op["module_name"] | |||
| py_module_path = compute_op["py_module_path"] | |||
| op_func_name = compute_op["func_name"] | |||
| op_type = compute_op["type"] | |||
| @@ -340,8 +340,8 @@ def _pre_build_compute_op_info(compute_op, job): | |||
| job.info("OpType {} support op_impl_mode, current op_impl_mode:{}".format(op_type, op_impl_mode)) | |||
| options = get_options_info(job.content) | |||
| dispatch_prebuild_task(job.source_id, job.id, l1_size, op_module_name, op_type, op_func_name, unknown_shape, | |||
| (inputs, outputs, attrs, options), int64_mode, dynamic_compile_static, unknown_shape, | |||
| job.rl_tune_switch, job.rl_tune_list, job.pass_list, job.op_tune_switch, job.op_tune_list) | |||
| (inputs, outputs, attrs, options), int64_mode, dynamic_compile_static, job.rl_tune_switch, | |||
| job.rl_tune_list, job.pass_list, job.op_tune_switch, job.op_tune_list) | |||
| def get_prebuild_output(op_name): | |||
| @@ -391,7 +391,7 @@ def build_single_pre_op(job: TbeJob): | |||
| inputs, outputs, attrs = assemble_op_args(compute_op_info) | |||
| op_type = compute_op_info["type"] | |||
| l1_size = job.content["l1_size"] | |||
| op_module_name = get_module_name(compute_op_info) | |||
| op_module_name = compute_op_info["module_name"] | |||
| op_kernel_name = compute_op_info["op_name"] | |||
| py_module_path = compute_op_info["py_module_path"] | |||
| op_func_name = compute_op_info["func_name"] | |||
| @@ -404,9 +404,9 @@ def build_single_pre_op(job: TbeJob): | |||
| fuzz_build_info = get_fuzz_build_info(job.content) | |||
| dispatch_single_op_compile_task(job.source_id, job.id, l1_size, op_module_name, op_type, op_func_name, | |||
| op_kernel_name, unknown_shape, (inputs, outputs, attrs, options), int64_mode, | |||
| None, None, dynamic_compile_static, unknown_shape, op_pattern, | |||
| json.dumps(fuzz_build_info), job.rl_tune_switch, job.rl_tune_list, job.pass_list, | |||
| job.op_tune_switch, job.op_tune_list) | |||
| None, None, dynamic_compile_static, op_pattern, json.dumps(fuzz_build_info), | |||
| job.rl_tune_switch, job.rl_tune_list, job.pass_list, job.op_tune_switch, | |||
| job.op_tune_list) | |||
| return True | |||
| @@ -487,7 +487,7 @@ def rl_tune_single_op(job: TbeJob): | |||
| inputs, outputs, attrs = assemble_op_args(compute_op_info) | |||
| op_type = compute_op_info["type"] | |||
| l1_size = job.content["l1_size"] | |||
| op_module_name = get_module_name(compute_op_info) | |||
| op_module_name = compute_op_info["module_name"] | |||
| op_kernel_name = compute_op_info["op_name"] | |||
| full_name = compute_op_info["name"] | |||
| py_module_path = compute_op_info["py_module_path"] | |||
| @@ -503,7 +503,7 @@ def rl_tune_single_op(job: TbeJob): | |||
| device_id = job.content["SocInfo"]["deviceId"] | |||
| try: | |||
| build_single_op_from_c(op_module_name, op_func_name, op_type, "build", unknown_shape, | |||
| (inputs, outputs, attrs), int64_mode, dynamic_compile_static, unknown_shape, op_pattern, | |||
| (inputs, outputs, attrs), int64_mode, dynamic_compile_static, op_pattern, | |||
| auto_tiling_mode, device_id, json.dumps(fuzz_build_info)) | |||
| # pylint: disable=broad-except | |||
| except Exception: | |||
| @@ -547,7 +547,7 @@ def rl_tune_fusion_op(job: TbeJob): | |||
| compute_op_list = get_compute_op_list(job.content) | |||
| op_module_names_str = "" | |||
| for op in compute_op_list: | |||
| op_module_names_str = op_module_names_str + "," + get_module_name(op) | |||
| op_module_names_str = op_module_names_str + "," + op["module_name"] | |||
| op_module_names_str = op_module_names_str[1:] | |||
| from schedule_search.rl_online_tune import dispatch_fusion_tune_task | |||
| res = dispatch_fusion_tune_task(job.source_id, job.id, l1_size, base_kernel, op_kernel_name, op_module_names_str, | |||
| @@ -179,6 +179,8 @@ def get_options_info(job_content): | |||
| options["op_debug_level"] = job_content["SocInfo"]["op_debug_level"] | |||
| options["op_impl_mode"] = job_content["SocInfo"]["op_impl_mode"] | |||
| options["op_debug_dir"] = job_content["SocInfo"]["op_debug_dir"] | |||
| options["op_compiler_cache_dir"] = job_content["SocInfo"]["op_compiler_cache_dir"] | |||
| options["op_compiler_cache_mode"] = job_content["SocInfo"]["op_compiler_cache_mode"] | |||
| options["mdl_bank_path"] = job_content["SocInfo"]["op_debug_level"] | |||
| options["op_bank_path"] = job_content["SocInfo"]["op_bank_path"] | |||
| options["deviceId"] = job_content["SocInfo"]["deviceId"] | |||
| @@ -218,19 +220,6 @@ def get_func_names(job_content): | |||
| return func_names | |||
| def get_module_name(compute_op_info): | |||
| """ | |||
| get compute_op_info | |||
| :param compute_op_info: | |||
| :return: | |||
| """ | |||
| unknown_shape = compute_op_info["unknown_shape"] | |||
| op_module_name = compute_op_info["module_name"] | |||
| if unknown_shape: | |||
| op_module_name = op_module_name.split(".")[0] + ".dynamic." + op_module_name.split(".")[-1] | |||
| return op_module_name | |||
| def adjust_custom_op_info(compute_op_info): | |||
| """ | |||
| adjust custom op info | |||
| @@ -71,13 +71,12 @@ def _get_message(msg, args): | |||
| class TbeJob: | |||
| """ Tbe compilation job """ | |||
| def __init__(self, source_id, job_id, job_type, content, fusion_op_name, json_str, sys_info): | |||
| def __init__(self, source_id, job_id, job_type, content, json_str, sys_info): | |||
| self.source_id = source_id | |||
| self.id = job_id | |||
| self.type = JobType(job_type) | |||
| self.status = JobStatus.JOB_INITIAL | |||
| self.content = content | |||
| self.fusion_op_name = fusion_op_name | |||
| self.result = "" | |||
| self.process_info = [] | |||
| self.json_string = json_str | |||
| @@ -150,8 +149,8 @@ class TbeJob: | |||
| result["source_id"] = self.source_id | |||
| result["job_id"] = self.id | |||
| result["job_type"] = self.type.value | |||
| result["fusion_op_name"] = self.fusion_op_name | |||
| result["result"] = self.result | |||
| self.debug("Resp result:{}".format(json.dumps(result))) | |||
| process_info = [] | |||
| for info in self.process_info: | |||
| msg = {"index": info.index, "level": info.level.value, "message": info.info} | |||
| @@ -102,9 +102,8 @@ class TbeJobManager: | |||
| source_id = job_json["source_id"] | |||
| job_type = job_json["job_type"] | |||
| sys_info = self._get_job_sys_info() | |||
| fusion_op_name = "NA" if "fusion_op_name" not in job_json["job_content"] else job_json["job_content"][ | |||
| "fusion_op_name"] | |||
| job = TbeJob(source_id, job_id, job_type, job_json["job_content"], fusion_op_name, job_str, sys_info) | |||
| job = TbeJob(source_id, job_id, job_type, job_json["job_content"], job_str, sys_info) | |||
| job.debug("Req job string: {}".format(job_str)) | |||
| post_job(self._all_jobs, job) | |||
| if not self.tbe_initialize and job.type != JobType.INITIALIZE_JOB: | |||
| job.error( | |||
| @@ -116,7 +115,6 @@ class TbeJobManager: | |||
| return res | |||
| # pylint: disable=broad-except | |||
| except Exception: | |||
| # pylint: disable=no-value-for-parameter | |||
| sys_info = self._get_job_sys_info() | |||
| job = TbeJob(-1, -1, "", None, job_str, sys_info) if job is None else job | |||
| job.status = JobStatus.JOB_FAILED | |||
| @@ -263,6 +261,9 @@ class TbeJobManager: | |||
| return self.add_to_finished_jobs(query_job, JobStatus.JOB_SUCCESS) | |||
| target_job = get_job(self._running_jobs, target_source_id, target_job_id) | |||
| if target_job: | |||
| query_job.debug("Found job in Running jobs, source_id:{}, job_id:{}".format(target_source_id, | |||
| target_job_id)) | |||
| target_job.debug("Be Queried") | |||
| query_job.result = target_job.get_result() | |||
| return self.add_to_finished_jobs(query_job, JobStatus.JOB_SUCCESS) | |||
| target_job = get_job(self._all_jobs, target_source_id, target_job_id) | |||
| @@ -159,17 +159,12 @@ def resolve_symbol(namespace, symbol): | |||
| if getattr(resolve_, "__hash__") is None: | |||
| return resolve_ | |||
| # Raise NotImplementedError when parsing the numpy methods, but not the numpy constant. | |||
| if namespace.name == "numpy" and isinstance(resolve_, (types.FunctionType, types.MethodType, types.ModuleType)): | |||
| raise NotImplementedError( | |||
| f"MindSpore does not support to use the numpy methods in the function construct with the graph mode.") | |||
| # If need trope the obj | |||
| if resolve_ in convert_object_map: | |||
| resolve_ = convert_object_map.get(resolve_) | |||
| logger.debug("convert resolve = %r", resolve_) | |||
| if resolve_ == NO_IMPLEMENT: | |||
| raise NotImplementedError(f"Not support for `{symbol}`.") | |||
| raise NotImplementedError(f"Not support for `{symbol}`") | |||
| except Exception as e: | |||
| if isinstance(e, NotImplementedError): | |||
| raise e | |||
| @@ -1312,8 +1312,7 @@ def sum(x, axis=None, dtype=None, keepdims=False, initial=None): # pylint: disab | |||
| >>> print(input_x.sum(axis=1)) | |||
| [10. 35.] | |||
| """ | |||
| input_x = x.astype(mstype.int32) if x.dtype == mstype.bool_ else x | |||
| dtype = input_x.dtype if dtype is None else dtype | |||
| dtype = x.dtype if dtype is None else dtype | |||
| if not isinstance(keepdims, int): | |||
| const_utils.raise_type_error("integer argument expected") | |||
| if initial is not None and not isinstance(initial, (int, float, bool)): | |||
| @@ -1323,14 +1322,14 @@ def sum(x, axis=None, dtype=None, keepdims=False, initial=None): # pylint: disab | |||
| else: | |||
| axis = check_and_canonicalize_axes(axis, x.ndim) | |||
| if not check_type_support(input_x.dtype, 'GPU', (mstype.float64, mstype.float32, mstype.float16)): | |||
| input_x = input_x.astype(mstype.float32) | |||
| if x.dtype == mstype.bool_: | |||
| x = x.astype("int32") | |||
| if 0 in x.shape: | |||
| x = const_utils.make_tensor([0], x.dtype) | |||
| if keepdims: | |||
| res = _reduce_sum_keepdims(input_x, axis) | |||
| res = _reduce_sum_keepdims(x, axis) | |||
| else: | |||
| res = _reduce_sum_default(input_x, axis) | |||
| res = _reduce_sum_default(x, axis) | |||
| if initial is not None: | |||
| res += initial | |||
| return res.astype(dtype) | |||
| @@ -1649,7 +1648,6 @@ get_log2_size = constexpr(validator.get_log2_size) | |||
| check_axis_type = constexpr(validator.check_axis_type) | |||
| check_and_canonicalize_axes = constexpr(validator.check_and_canonicalize_axes) | |||
| empty_compile = constexpr(validator.empty_compile) | |||
| check_type_support = constexpr(validator.check_type_support) | |||
| def tensor_bool(x): | |||
| @@ -16,6 +16,7 @@ | |||
| import os | |||
| from mindspore import log as logger | |||
| from mindspore._extends.parallel_compile.akg_compiler.akg_process import create_akg_parallel_process | |||
| from mindspore._extends.parallel_compile.akg_compiler.compiler import run_compiler as akg_compile_single | |||
| class Messager: | |||
| @@ -145,7 +146,9 @@ class AkgBuilder(): | |||
| def handle(self, messager, arg): | |||
| """Handle message about akg""" | |||
| if arg == 'AKG/START': | |||
| if arg == 'AKG/PID': | |||
| messager.send_res(os.getpid()) | |||
| elif arg == 'AKG/START': | |||
| messager.send_ack() | |||
| process_num_str = messager.get_message() | |||
| messager.send_ack() | |||
| @@ -170,8 +173,17 @@ class AkgBuilder(): | |||
| else: | |||
| messager.send_ack(False) | |||
| break | |||
| else: | |||
| raise RuntimeError("Unknown message type: %s" % arg) | |||
| elif arg == 'AKG/COMPILE': | |||
| messager.send_ack() | |||
| json = messager.get_message() | |||
| try: | |||
| akg_compile_single(json, self.attrs) | |||
| except ValueError: | |||
| messager.send_ack(False) | |||
| messager.exit() | |||
| finally: | |||
| pass | |||
| messager.send_ack() | |||
| def get_logger(): | |||
| @@ -297,14 +297,20 @@ if(MODE_ASCEND_ALL) | |||
| ${ASCEND_DRIVER_BACK_PATH}) | |||
| find_library(DATATRANSFER datatransfer HINTS ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH} | |||
| ${ASCEND_DRIVER_BACK_PATH}) | |||
| find_library(PROFILING msprofiler ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) | |||
| find_library(PROFILING msprofiler_fwkacl ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) | |||
| find_library(ACL ascendcl ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) | |||
| find_library(PLATFORM platform ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) | |||
| find_library(OPTILING optiling ${ASCEND_OPP_PATH} ${ASCEND_TOOLKIT_OPP_PATH}) | |||
| find_library(OPT_FEATURE opt_feature ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) | |||
| add_library(ms_profile SHARED | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/runtime/device/ascend/profiling/profiling_callback_register.cc) | |||
| set_target_properties(ms_profile PROPERTIES LINKER_LANGUAGE CXX) | |||
| target_link_options(ms_profile PRIVATE -Wl,-init,common_log_init) | |||
| target_link_libraries(ms_profile -Wl,--start-group -Wl,--whole-archive ${PROFILING} -Wl,--no-whole-archive | |||
| mindspore::protobuf -Wl,--end-group) | |||
| target_link_libraries(mindspore ${RUNTIME_LIB} ${TSDCLIENT} ${DATATRANSFER} ${ERROR_MANAGER} -Wl,--no-as-needed | |||
| ${OPTILING} ${PLATFORM} ${ACL} ${OPT_FEATURE} ${PROFILING}) | |||
| ${OPTILING} ${PLATFORM} ${ACL} ${OPT_FEATURE}) | |||
| target_link_libraries(mindspore -Wl,--start-group proto_input mindspore::protobuf -Wl,--end-group) | |||
| elseif(CMAKE_SYSTEM_NAME MATCHES "Windows") | |||
| target_link_libraries(mindspore -Wl,--start-group proto_input mindspore::protobuf mindspore::sentencepiece | |||
| @@ -319,7 +325,7 @@ endif() | |||
| set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) | |||
| set_property(SOURCE "pipeline/jit/init.cc" PROPERTY | |||
| COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_PIPELINE) | |||
| pybind11_add_module(_c_expression NO_EXTRAS "pipeline/jit/init.cc" NO_EXTRAS) | |||
| pybind11_add_module(_c_expression "pipeline/jit/init.cc") | |||
| MESSAGE(STATUS "operation system is ${CMAKE_SYSTEM}") | |||
| if(CMAKE_SYSTEM_NAME MATCHES "Linux") | |||
| @@ -369,6 +375,9 @@ else() | |||
| proto_input -Wl,--no-whole-archive) | |||
| target_link_libraries(_c_expression PRIVATE mindspore::pybind11_module) | |||
| target_link_libraries(_c_expression PRIVATE mindspore_gvar) | |||
| if(MODE_ASCEND_ALL) | |||
| target_link_libraries(_c_expression PRIVATE -Wl,--no-as-needed ms_profile) | |||
| endif() | |||
| endif() | |||
| if(USE_GLOG) | |||
| @@ -35,8 +35,6 @@ if(ENABLE_CPU) | |||
| "cpu/fl/*.cc" | |||
| "cpu/ps/*.cc" | |||
| "cpu/quantum/*.cc" | |||
| "cpu/pyfunc/*.cc" | |||
| "cpu/rl/*.cc" | |||
| ) | |||
| if(NOT ENABLE_MPI) | |||
| @@ -85,7 +83,6 @@ if(NOT ENABLE_CPU OR WIN32) | |||
| list(REMOVE_ITEM CPU_SRC_LIST "cpu/fl/get_model_kernel.cc") | |||
| list(REMOVE_ITEM CPU_SRC_LIST "cpu/fl/start_fl_job_kernel.cc") | |||
| list(REMOVE_ITEM CPU_SRC_LIST "cpu/fl/update_model_kernel.cc") | |||
| list(REMOVE_ITEM CPU_SRC_LIST "cpu/fl/push_metrics_kernel.cc") | |||
| endif() | |||
| if(ENABLE_GPU) | |||
| @@ -16,11 +16,6 @@ | |||
| #include "backend/kernel_compiler/akg/akg_kernel_build.h" | |||
| #include <stdio.h> | |||
| #include <errno.h> | |||
| #include <fcntl.h> | |||
| #include <unistd.h> | |||
| #include <algorithm> | |||
| #include <map> | |||
| #include <memory> | |||
| @@ -28,7 +23,6 @@ | |||
| #include <unordered_set> | |||
| #include <utility> | |||
| #include <vector> | |||
| #include <iostream> | |||
| #include "nlohmann/json.hpp" | |||
| #include "ir/dtype.h" | |||
| #include "ir/func_graph.h" | |||
| @@ -40,346 +34,17 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| #define INIT_SET_FROM_2D_ARRAY(set_var, list_idx) \ | |||
| std::set<size_t> set_var(kernel_lists_[list_idx], kernel_lists_[list_idx] + kernel_lists_[list_idx][kMaxKernelNum_]); | |||
| #define LIST_BEGIN(list_idx) kernel_lists_[list_idx] | |||
| #define LIST_END(list_idx) (kernel_lists_[list_idx] + kernel_lists_[list_idx][kMaxKernelNum_]) | |||
| #define RESET_LIST_SIZE(list_idx, val) kernel_lists_[list_idx][kMaxKernelNum_] = val | |||
| #define INCREASE_LIST_SIZE(list_idx, val) kernel_lists_[list_idx][kMaxKernelNum_] += val | |||
| constexpr int32_t PROCESS_NUM = 16; | |||
| constexpr int32_t TIME_OUT = 300; | |||
| bool AkgKernelPool::LockMng::TryLock() { | |||
| // Try to lock 100 times. Return errno if lock unsuccessfully | |||
| uint32_t trial = 100; | |||
| int32_t ret = -1; | |||
| while (trial > 0) { | |||
| ret = lockf(fd_, F_TLOCK, 0); | |||
| if (ret == 0 || (errno != EACCES && errno != EAGAIN)) { | |||
| break; | |||
| } | |||
| trial--; | |||
| usleep(5000); | |||
| } | |||
| if (ret == -1) { | |||
| MS_LOG(ERROR) << "Failed to acquire the lock, errno:" << strerror(errno) << "."; | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| void AkgKernelPool::LockMng::Unlock() { | |||
| auto ret = lockf(fd_, F_ULOCK, 0); | |||
| if (ret == -1) { | |||
| MS_LOG(ERROR) << "Failed to release the lock, errno:" << strerror(errno); | |||
| } | |||
| } | |||
| std::string AkgKernelPool::GetCurrentPath() { | |||
| char cwd[PATH_MAX]; | |||
| char *ret = getcwd(cwd, sizeof(cwd)); | |||
| if (ret == nullptr) { | |||
| MS_LOG(ERROR) << "Get current work directory failed, errno:" << strerror(errno); | |||
| return ""; | |||
| } | |||
| char abspath[PATH_MAX]; | |||
| char *res = realpath(cwd, abspath); | |||
| if (res == nullptr) { | |||
| MS_LOG(ERROR) << "Change to realpath failed, errno:" << strerror(errno); | |||
| return ""; | |||
| } | |||
| return std::string(abspath); | |||
| } | |||
| void *AkgKernelPool::CreateSharedMem(const std::string &path) { | |||
| is_creator_ = false; | |||
| auto hash_id = std::hash<std::string>()(path); | |||
| auto key_id = static_cast<key_t>(hash_id); | |||
| auto mem_size = sizeof(size_t) * kListNum_ * (kMaxKernelNum_ + 1) + 512; | |||
| { | |||
| LockMng lock(fd_); | |||
| if (!lock.locked_) { | |||
| MS_LOG(ERROR) << "Failed to acquire lock."; | |||
| return nullptr; | |||
| } | |||
| // check if the shared memory exists or not. | |||
| // remove shared memory if exists and the nattach is 0 | |||
| struct shmid_ds buf; | |||
| auto id = shmget(key_id, mem_size, 0); | |||
| if (id != -1) { | |||
| auto ret = shmctl(id, IPC_STAT, &buf); | |||
| if (ret == -1) { | |||
| MS_LOG(ERROR) << "Failed to get the info of shared memory, errno:" << strerror(errno); | |||
| return nullptr; | |||
| } | |||
| if (buf.shm_nattch == 0) { | |||
| ret = shmctl(id, IPC_RMID, nullptr); | |||
| if (ret < 0) { | |||
| MS_LOG(EXCEPTION) << "Realse shared_mem failed, errno:" << strerror(errno); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| LockMng lock(fd_); | |||
| if (!lock.locked_) { | |||
| MS_LOG(ERROR) << "Failed to acquire lock."; | |||
| return nullptr; | |||
| } | |||
| shm_id_ = shmget(key_id, mem_size, IPC_CREAT | IPC_EXCL | 0600); | |||
| if (shm_id_ == -1) { | |||
| if (errno == EEXIST) { | |||
| shm_id_ = shmget(key_id, mem_size, 0); | |||
| } | |||
| if (shm_id_ == -1) { | |||
| MS_LOG(ERROR) << "Create shared_mem failed, error no:" << strerror(errno); | |||
| return nullptr; | |||
| } | |||
| } else { | |||
| is_creator_ = true; | |||
| } | |||
| auto local_addr = shmat(shm_id_, nullptr, 0); | |||
| if (local_addr == reinterpret_cast<void *>(-1)) { | |||
| MS_LOG(ERROR) << "Attach to shared_mem failed, error no:" << strerror(errno); | |||
| return nullptr; | |||
| } | |||
| if (is_creator_) { | |||
| (void)memset(local_addr, 0, mem_size); | |||
| } | |||
| return local_addr; | |||
| } | |||
| int32_t AkgKernelPool::Init(const std::vector<JsonNodePair> &build_args) { | |||
| auto cp = GetCurrentPath(); | |||
| if (cp.empty()) { | |||
| return -1; | |||
| } | |||
| fd_ = open(kKeyName_, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR); | |||
| if (fd_ == -1) { | |||
| MS_LOG(ERROR) << "open file <" << kKeyName_ << "> failed, errno:" << strerror(errno); | |||
| return -1; | |||
| } | |||
| auto addr = CreateSharedMem(cp); | |||
| if (addr == nullptr) { | |||
| return -1; | |||
| } | |||
| InitKernelLists(addr); | |||
| auto ret = AddKernels(build_args); | |||
| if (ret != 0) { | |||
| MS_LOG(ERROR) << "AkgKernelPool AddKernels failed."; | |||
| return false; | |||
| } | |||
| return 0; | |||
| } | |||
| AkgKernelPool::~AkgKernelPool() { | |||
| { | |||
| LockMng lock(fd_); | |||
| if (!lock.locked_) { | |||
| MS_LOG(EXCEPTION) << "Failed to acquire lock."; | |||
| } | |||
| struct shmid_ds buf; | |||
| auto ret = shmctl(shm_id_, IPC_STAT, &buf); | |||
| if (ret == -1) { | |||
| MS_LOG(EXCEPTION) << "Failed to get the info of shared memory, errno:" << strerror(errno); | |||
| } | |||
| bool need_delete_by_last = false; | |||
| // if the creator exits unexpectedly and fails to delete the shm, the last process will try to delete the shm | |||
| if (((buf.shm_perm.mode & SHM_DEST) == 0) && (buf.shm_nattch == 1)) { | |||
| need_delete_by_last = true; | |||
| } | |||
| // Detach shared memory | |||
| ret = shmdt(reinterpret_cast<void *>(kernel_lists_[0])); | |||
| if (ret < 0) { | |||
| MS_LOG(EXCEPTION) << "Shared_mem detach failed, errno:" << strerror(errno); | |||
| } | |||
| // Realse shared_memroy | |||
| if (is_creator_ || need_delete_by_last) { | |||
| ret = shmctl(shm_id_, IPC_RMID, nullptr); | |||
| if (ret < 0) { | |||
| MS_LOG(EXCEPTION) << "Realse shared_mem failed, errno:" << strerror(errno); | |||
| } | |||
| } | |||
| } | |||
| // Close key file | |||
| if (fd_ != -1) { | |||
| (void)close(fd_); | |||
| } | |||
| } | |||
| int32_t AkgKernelPool::AddKernels(const std::vector<JsonNodePair> &build_args) { | |||
| LockMng lock(fd_); | |||
| if (!lock.locked_) { | |||
| MS_LOG(ERROR) << "Failed to acquire lock."; | |||
| return -1; | |||
| } | |||
| INIT_SET_FROM_2D_ARRAY(todo_list, kToDoIdx_); | |||
| INIT_SET_FROM_2D_ARRAY(doing_list, kDoingIdx_); | |||
| INIT_SET_FROM_2D_ARRAY(done_list, kDoneIdx_); | |||
| for (const auto &[json_generator, anf_node] : build_args) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| auto kernel_name = json_generator.kernel_name(); | |||
| auto hash_id = std::hash<std::string>()(kernel_name); | |||
| if (self_kernel_ids_.count(hash_id) != 0) { | |||
| MS_LOG(ERROR) << "Duplicated hash_id in list."; | |||
| return -1; | |||
| } | |||
| self_kernel_ids_.emplace(hash_id); | |||
| } | |||
| std::set<size_t> diff_from_todo; | |||
| std::set<size_t> diff_from_doing; | |||
| std::set<size_t> diff_from_done; | |||
| // add the unique kernel only once, so need to check if it exists in todo_list, doing_list, or done_list | |||
| std::set_difference(self_kernel_ids_.begin(), self_kernel_ids_.end(), todo_list.begin(), todo_list.end(), | |||
| std::inserter(diff_from_todo, diff_from_todo.begin())); | |||
| std::set_difference(diff_from_todo.begin(), diff_from_todo.end(), doing_list.begin(), doing_list.end(), | |||
| std::inserter(diff_from_doing, diff_from_doing.begin())); | |||
| std::set_difference(diff_from_doing.begin(), diff_from_doing.end(), done_list.begin(), done_list.end(), | |||
| std::inserter(diff_from_done, diff_from_done.begin())); | |||
| auto new_kernel_size = diff_from_done.size(); | |||
| if (new_kernel_size + todo_list.size() > static_cast<size_t>(kMaxKernelNum_)) { | |||
| MS_LOG(ERROR) << "The size of kernels is " << new_kernel_size << ", while the left space of the pool is " | |||
| << kMaxKernelNum_ - todo_list.size(); | |||
| return -1; | |||
| } | |||
| std::copy(diff_from_done.begin(), diff_from_done.end(), LIST_END(kToDoIdx_)); | |||
| INCREASE_LIST_SIZE(kToDoIdx_, new_kernel_size); | |||
| return 0; | |||
| } | |||
| int32_t AkgKernelPool::FetchKernels(std::set<size_t> *out) { | |||
| LockMng lock(fd_); | |||
| if (!lock.locked_) { | |||
| MS_LOG(ERROR) << "Failed to acquire lock."; | |||
| return -1; | |||
| } | |||
| std::set<size_t> left_in_todo_list; | |||
| // filter out kernels which belongs to other processes | |||
| auto FilterBySelfList = [&left_in_todo_list, &out, this](size_t id) { | |||
| if (this->self_kernel_ids_.count(id) != 0) { | |||
| out->emplace(id); | |||
| } else { | |||
| left_in_todo_list.emplace(id); | |||
| } | |||
| }; | |||
| std::for_each(LIST_BEGIN(kToDoIdx_), LIST_END(kToDoIdx_), FilterBySelfList); | |||
| std::copy(out->begin(), out->end(), LIST_END(kDoingIdx_)); | |||
| INCREASE_LIST_SIZE(kDoingIdx_, out->size()); | |||
| std::copy(left_in_todo_list.begin(), left_in_todo_list.end(), LIST_BEGIN(kToDoIdx_)); | |||
| RESET_LIST_SIZE(kToDoIdx_, left_in_todo_list.size()); | |||
| return 0; | |||
| } | |||
| int32_t AkgKernelPool::UpdateAndWait(const std::set<size_t> &ids) { | |||
| if (!ids.empty()) { | |||
| LockMng lock(fd_); | |||
| if (!lock.locked_) { | |||
| MS_LOG(ERROR) << "Failed to acquire lock."; | |||
| return -1; | |||
| } | |||
| // update the state of finished kernels to `done` | |||
| std::copy(ids.begin(), ids.end(), LIST_END(kDoneIdx_)); | |||
| INCREASE_LIST_SIZE(kDoneIdx_, ids.size()); | |||
| // delete the finished kernels from doing_list | |||
| std::vector<size_t> left_in_doing_list; | |||
| INIT_SET_FROM_2D_ARRAY(doing_list, kDoingIdx_); | |||
| std::set_difference(doing_list.begin(), doing_list.end(), ids.begin(), ids.end(), | |||
| std::inserter(left_in_doing_list, left_in_doing_list.begin())); | |||
| std::copy(left_in_doing_list.begin(), left_in_doing_list.end(), LIST_BEGIN(kDoingIdx_)); | |||
| RESET_LIST_SIZE(kDoingIdx_, left_in_doing_list.size()); | |||
| } | |||
| auto ret = Wait(); | |||
| if (ret != 0) { | |||
| MS_LOG(ERROR) << "AkgKernelPool Wait failed."; | |||
| return -1; | |||
| } | |||
| return 0; | |||
| } | |||
| int32_t AkgKernelPool::Wait() { | |||
| // wait until all the kernels which belong to this process finish compiling | |||
| uint32_t trials = 1000; | |||
| while (trials > 0) { | |||
| { | |||
| LockMng lock(fd_); | |||
| if (!lock.locked_) { | |||
| MS_LOG(ERROR) << "Failed to acquire lock."; | |||
| return -1; | |||
| } | |||
| INIT_SET_FROM_2D_ARRAY(done_list, kDoneIdx_); | |||
| if (std::all_of(self_kernel_ids_.begin(), self_kernel_ids_.end(), | |||
| [&done_list](size_t id) { return done_list.count(id) != 0; })) { | |||
| return 0; | |||
| } | |||
| } | |||
| usleep(1000000); | |||
| trials--; | |||
| } | |||
| MS_LOG(ERROR) << "Time out while wait kernel compiling"; | |||
| return -1; | |||
| } | |||
| std::vector<JsonNodePair> AkgKernelBuilder::GetNotCachedKernels(const std::vector<JsonNodePair> &build_args) { | |||
| std::vector<std::string> AkgKernelBuilder::GetNotCachedKernelJsons(const std::vector<JsonNodePair> &build_args) { | |||
| // Remove cached nodes, gether unique nodes, and collect repeated nodes which need postprecess. | |||
| std::vector<std::string> jsons; | |||
| std::unordered_set<std::string> kernel_name_set; | |||
| std::vector<JsonNodePair> new_build_args; | |||
| for (const auto &[json_generator, anf_node] : build_args) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| auto kernel_name = json_generator.kernel_name(); | |||
| MS_LOG(DEBUG) << "Akg start compile op: " << kernel_name; | |||
| auto cached_kernel_pack = AkgSearchCache(kernel_name); | |||
| if (cached_kernel_pack != nullptr) { | |||
| @@ -394,9 +59,11 @@ std::vector<JsonNodePair> AkgKernelBuilder::GetNotCachedKernels(const std::vecto | |||
| continue; | |||
| } | |||
| kernel_name_set.insert(kernel_name); | |||
| new_build_args.push_back({json_generator, anf_node}); | |||
| auto kernel_json = json_generator.kernel_json_str(); | |||
| AkgSaveJsonInfo(kernel_name, kernel_json); | |||
| jsons.push_back(kernel_json); | |||
| } | |||
| return new_build_args; | |||
| return jsons; | |||
| } | |||
| bool AkgKernelBuilder::InsertToCache(const std::vector<JsonNodePair> &build_args) { | |||
| @@ -423,84 +90,39 @@ bool AkgKernelBuilder::HandleRepeatNodes() { | |||
| << anf_node->fullname_with_scope() << "]."; | |||
| return false; | |||
| } | |||
| MS_LOG(DEBUG) << "Use just compiled kernel, kernel_name[" << kernel_name << "], fullname_with_scope[" | |||
| << anf_node->fullname_with_scope() << "]."; | |||
| MS_LOG(INFO) << "Use just compiled kernel, kernel_name[" << kernel_name << "], fullname_with_scope[" | |||
| << anf_node->fullname_with_scope() << "]."; | |||
| AkgSetKernelMod(cached_kernel_pack, json_generator, anf_node); | |||
| } | |||
| return true; | |||
| } | |||
| std::vector<std::string> AkgKernelBuilder::GetKernelJsonsByHashId(const std::vector<JsonNodePair> &build_args, | |||
| std::set<size_t> fetched_ids) { | |||
| std::vector<std::string> jsons; | |||
| for (const auto &[json_generator, anf_node] : build_args) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| auto kernel_name = json_generator.kernel_name(); | |||
| auto hash_id = std::hash<std::string>()(kernel_name); | |||
| if (fetched_ids.count(hash_id) == 0) { | |||
| continue; | |||
| } | |||
| auto kernel_json = json_generator.kernel_json_str(); | |||
| AkgSaveJsonInfo(kernel_name, kernel_json); | |||
| jsons.push_back(kernel_json); | |||
| } | |||
| return jsons; | |||
| } | |||
| bool AkgKernelBuilder::AkgOpParallelBuild(const std::vector<JsonNodePair> &build_args) { | |||
| repeat_nodes_.clear(); | |||
| auto new_build_args = GetNotCachedKernels(build_args); | |||
| if (new_build_args.empty()) { | |||
| auto jsons = GetNotCachedKernelJsons(build_args); | |||
| if (jsons.empty()) { | |||
| return true; | |||
| } | |||
| AkgKernelPool kp; | |||
| auto ret = kp.Init(new_build_args); | |||
| if (ret != 0) { | |||
| MS_LOG(ERROR) << "AkgKernelPool init failed."; | |||
| auto client = GetClient(); | |||
| MS_EXCEPTION_IF_NULL(client); | |||
| if (!client->AkgStart(PROCESS_NUM, TIME_OUT)) { | |||
| MS_LOG(ERROR) << "Akg start failed."; | |||
| return false; | |||
| } | |||
| std::set<size_t> fetched_ids; | |||
| ret = kp.FetchKernels(&fetched_ids); | |||
| if (ret != 0) { | |||
| MS_LOG(ERROR) << "AkgKernelPool FetchKernels failed."; | |||
| auto attrs = CollectBuildAttrs(); | |||
| if (!attrs.empty() && !client->AkgSendAttr(attrs)) { | |||
| MS_LOG(ERROR) << "Akg send attr failed."; | |||
| return false; | |||
| } | |||
| if (!fetched_ids.empty()) { | |||
| auto jsons = GetKernelJsonsByHashId(new_build_args, fetched_ids); | |||
| auto client = GetClient(); | |||
| MS_EXCEPTION_IF_NULL(client); | |||
| if (!client->AkgStart(PROCESS_NUM, TIME_OUT)) { | |||
| MS_LOG(ERROR) << "Akg start failed."; | |||
| return false; | |||
| } | |||
| auto attrs = CollectBuildAttrs(); | |||
| if (!attrs.empty() && !client->AkgSendAttr(attrs)) { | |||
| MS_LOG(ERROR) << "Akg send attr failed."; | |||
| return false; | |||
| } | |||
| if (!client->AkgSendData(jsons)) { | |||
| MS_LOG(ERROR) << "Akg send data failed."; | |||
| return false; | |||
| } | |||
| if (!client->AkgWait()) { | |||
| MS_LOG(ERROR) << "Akg compile failed."; | |||
| return false; | |||
| } | |||
| if (!client->AkgSendData(jsons)) { | |||
| MS_LOG(ERROR) << "Akg send data failed."; | |||
| return false; | |||
| } | |||
| ret = kp.UpdateAndWait(fetched_ids); | |||
| if (ret != 0) { | |||
| MS_LOG(ERROR) << "AkgKernelPool UpdateAndWait failed."; | |||
| if (!client->AkgWait()) { | |||
| MS_LOG(ERROR) << "Akg compile failed."; | |||
| return false; | |||
| } | |||
| // All unique done here, cache them and set kernel. | |||
| if (!InsertToCache(build_args)) { | |||
| MS_LOG(ERROR) << "Insert cache failed."; | |||
| @@ -546,7 +168,7 @@ bool AkgKernelBuilder::AkgKernelParallelBuild(const std::vector<AnfNodePtr> &anf | |||
| } | |||
| if (json_and_node.empty()) { | |||
| MS_LOG(INFO) << "There is no akg kernel to be compiled."; | |||
| MS_LOG(DEBUG) << "There is no kernel needed to be compiled."; | |||
| return true; | |||
| } | |||
| @@ -17,13 +17,10 @@ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKG_KERNEL_BUILD_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKG_KERNEL_BUILD_H_ | |||
| #include <sys/shm.h> | |||
| #include <string> | |||
| #include <utility> | |||
| #include <vector> | |||
| #include <map> | |||
| #include <set> | |||
| #include "ir/anf.h" | |||
| #include "backend/kernel_compiler/kernel.h" | |||
| #include "backend/session/kernel_build_client.h" | |||
| @@ -47,83 +44,13 @@ class AkgKernelBuilder { | |||
| bool AkgKernelParallelBuild(const std::vector<AnfNodePtr> &anf_nodes); | |||
| private: | |||
| std::vector<JsonNodePair> GetNotCachedKernels(const std::vector<JsonNodePair> &build_args); | |||
| std::vector<std::string> GetKernelJsonsByHashId(const std::vector<JsonNodePair> &build_args, | |||
| std::set<size_t> fetched_ids); | |||
| std::vector<std::string> GetNotCachedKernelJsons(const std::vector<JsonNodePair> &build_args); | |||
| bool InsertToCache(const std::vector<JsonNodePair> &build_args); | |||
| bool HandleRepeatNodes(); | |||
| bool AkgOpParallelBuild(const std::vector<JsonNodePair> &build_args); | |||
| std::vector<JsonNodePair> repeat_nodes_; | |||
| std::string CollectBuildAttrs(); | |||
| }; | |||
| class AkgKernelPool { | |||
| public: | |||
| class LockMng { | |||
| public: | |||
| explicit LockMng(int32_t fd) { | |||
| fd_ = fd; | |||
| locked_ = TryLock(); | |||
| } | |||
| virtual ~LockMng() { | |||
| if (locked_) { | |||
| Unlock(); | |||
| } | |||
| } | |||
| bool locked_{false}; | |||
| private: | |||
| bool TryLock(); | |||
| void Unlock(); | |||
| int32_t fd_{-1}; | |||
| }; | |||
| public: | |||
| AkgKernelPool() = default; | |||
| virtual ~AkgKernelPool(); | |||
| int32_t Init(const std::vector<JsonNodePair> &build_args); | |||
| int32_t FetchKernels(std::set<size_t> *out); | |||
| int32_t UpdateAndWait(const std::set<size_t> &ids); | |||
| constexpr inline static size_t kMaxKernelNum_{1000}; | |||
| // allocate memory for todo_list, doing_list, done_list | |||
| constexpr inline static size_t kListNum_{3}; | |||
| constexpr inline static auto kKeyName_ = "./akg_build_tmp.key"; | |||
| constexpr inline static int32_t kToDoIdx_ = 0; | |||
| constexpr inline static int32_t kDoingIdx_ = 1; | |||
| constexpr inline static int32_t kDoneIdx_ = 2; | |||
| private: | |||
| void *CreateSharedMem(const std::string &path); | |||
| std::string GetCurrentPath(); | |||
| inline void InitKernelLists(void *addr) { | |||
| kernel_lists_[kToDoIdx_] = reinterpret_cast<size_t *>(addr); | |||
| kernel_lists_[kDoingIdx_] = kernel_lists_[kToDoIdx_] + kMaxKernelNum_ + 1; | |||
| kernel_lists_[kDoneIdx_] = kernel_lists_[kDoingIdx_] + kMaxKernelNum_ + 1; | |||
| } | |||
| int32_t AddKernels(const std::vector<JsonNodePair> &kernel_jsons); | |||
| int32_t Wait(); | |||
| int32_t shm_id_{-1}; | |||
| bool is_creator_{false}; | |||
| int32_t fd_{-1}; | |||
| // includes 3 lists: todo_list, doing_list, done_list. | |||
| // each list has kMaxKernelNum_ + 1 elements and, the count of elements in each list | |||
| // is stored in kernel_lists_[xx][kMaxKernelNum_] | |||
| size_t *kernel_lists_[kListNum_]{nullptr, nullptr, nullptr}; | |||
| std::set<size_t> self_kernel_ids_; | |||
| }; | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -15,6 +15,12 @@ | |||
| */ | |||
| #include "backend/kernel_compiler/akg/akg_kernel_json_decoder.h" | |||
| #include <algorithm> | |||
| #include <memory> | |||
| #include <sstream> | |||
| #include <string> | |||
| #include <map> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/akg/akg_kernel_json_generator.h" | |||
| #include "backend/kernel_compiler/common_utils.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| @@ -16,6 +16,12 @@ | |||
| #include "backend/kernel_compiler/akg/akg_kernel_json_generator.h" | |||
| #include <algorithm> | |||
| #include <functional> | |||
| #include <map> | |||
| #include <set> | |||
| #include <sstream> | |||
| #include <tuple> | |||
| #if ENABLE_GPU | |||
| #include <cuda.h> | |||
| #endif | |||
| @@ -15,6 +15,7 @@ | |||
| */ | |||
| #include "backend/kernel_compiler/akg/akg_kernel_metadata.h" | |||
| #include <memory> | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "backend/kernel_compiler/oplib/oplib.h" | |||
| #include "backend/kernel_compiler/common_utils.h" | |||
| @@ -16,6 +16,13 @@ | |||
| #include "backend/kernel_compiler/akg/ascend/akg_ascend_kernel_build.h" | |||
| #include <algorithm> | |||
| #include <map> | |||
| #include <memory> | |||
| #include <string> | |||
| #include <unordered_set> | |||
| #include <utility> | |||
| #include <vector> | |||
| #include "ir/dtype.h" | |||
| #include "ir/func_graph.h" | |||
| #include "backend/kernel_compiler/common_utils.h" | |||
| @@ -27,20 +34,18 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| KernelPackPtr AkgAscendKernelBuilder::AkgSearchCache(const std::string &kernel_name) { | |||
| return tbe::TbeUtils::SearchCache(kernel_name, true); | |||
| return tbe::TbeUtils::SearchCache(kernel_name, kProcessorAiCore); | |||
| } | |||
| KernelPackPtr AkgAscendKernelBuilder::AkgInsertCache(const std::string &kernel_name) { | |||
| return tbe::TbeUtils::InsertCache(kernel_name, kProcessorAiCore, true); | |||
| return tbe::TbeUtils::InsertCache(kernel_name, kProcessorAiCore); | |||
| } | |||
| void AkgAscendKernelBuilder::AkgSetKernelMod(const KernelPackPtr &kernel_pack, | |||
| const AkgKernelJsonGenerator &json_generator, const AnfNodePtr &anf_node) { | |||
| auto kernel_mod_ptr = std::make_shared<AkgKernelMod>(kernel_pack); | |||
| auto kernel_json_info = kernel_pack->kernel_json_info(); | |||
| kernel_mod_ptr->SetInputSizeList(json_generator.input_size_list()); | |||
| kernel_mod_ptr->SetOutputSizeList(json_generator.output_size_list()); | |||
| kernel_mod_ptr->SetWorkspaceSizeList(kernel_json_info.workspaces); | |||
| AnfAlgo::SetKernelMod(kernel_mod_ptr, anf_node.get()); | |||
| } | |||
| @@ -49,7 +49,7 @@ const std::vector<size_t> &AkgKernelMod::GetOutputSizeList() const { return outp | |||
| const std::vector<size_t> &AkgKernelMod::GetWorkspaceSizeList() const { return workspace_size_list_; } | |||
| bool AkgKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| bool AkgKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) { | |||
| if (stream_ptr == nullptr) { | |||
| MS_LOG(ERROR) << "stream_ptr should not be nullptr."; | |||
| @@ -74,10 +74,6 @@ bool AkgKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vect | |||
| [](const AddressPtr &input) -> void * { return input->addr; }); | |||
| (void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(runtime_args), | |||
| [](const AddressPtr &output) -> void * { return output->addr; }); | |||
| if (!workspace.empty()) { | |||
| (void)std::transform(std::begin(workspace), std::end(workspace), std::back_inserter(runtime_args), | |||
| [](const AddressPtr &addr) -> void * { return addr->addr; }); | |||
| } | |||
| rtL2Ctrl_t *l2ctrl = nullptr; | |||
| auto stream = static_cast<rtStream_t *>(stream_ptr); | |||
| @@ -90,8 +86,7 @@ bool AkgKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vect | |||
| return true; | |||
| } | |||
| std::vector<TaskInfoPtr> AkgKernelMod::GenTask(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &workspace, | |||
| std::vector<TaskInfoPtr> AkgKernelMod::GenTask(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs, uint32_t stream_id) { | |||
| if (kernel_pack_ == nullptr) { | |||
| MS_LOG(EXCEPTION) << "kernel pack should not be nullptr."; | |||
| @@ -112,10 +107,6 @@ std::vector<TaskInfoPtr> AkgKernelMod::GenTask(const std::vector<AddressPtr> &in | |||
| [](const AddressPtr &input) -> void * { return input->addr; }); | |||
| (void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(output_data_addrs), | |||
| [](const AddressPtr &output) -> void * { return output->addr; }); | |||
| if (!workspace.empty()) { | |||
| (void)std::transform(std::begin(workspace), std::end(workspace), std::back_inserter(workspace_addrs), | |||
| [](const AddressPtr &workspace) -> void * { return workspace->addr; }); | |||
| } | |||
| uint32_t block_dim = DEFAULT_BLOCK_DIM; // default blockdim equal to 1. | |||
| auto func_stub = KernelManager::GenFuncStub(*kernel_pack_, false, &block_dim); | |||
| @@ -39,15 +39,14 @@ KernelPackPtr AkgGpuKernelBuilder::AkgInsertCache(const std::string &kernel_name | |||
| void AkgGpuKernelBuilder::AkgSetKernelMod(const KernelPackPtr &kernel_pack, | |||
| const AkgKernelJsonGenerator &json_generator, const AnfNodePtr &anf_node) { | |||
| auto kernel_mod_ptr = std::make_shared<GpuKernelMod>(kernel_pack); | |||
| auto kernel_json_info = kernel_pack->kernel_json_info(); | |||
| kernel_mod_ptr->SetInputSizeList(json_generator.input_size_list()); | |||
| kernel_mod_ptr->SetOutputSizeList(json_generator.output_size_list()); | |||
| kernel_mod_ptr->SetWorkspaceSizeList(kernel_json_info.workspaces); | |||
| AnfAlgo::SetKernelMod(kernel_mod_ptr, anf_node.get()); | |||
| } | |||
| void AkgGpuKernelBuilder::AkgSaveJsonInfo(const string &kernel_name, const string &kernel_json) { | |||
| kernel::SaveJsonInfo(kernel_name, kernel_json, kernel::KernelMeta::GetInstance()->kernel_meta_path()); | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -15,7 +15,8 @@ | |||
| */ | |||
| #include "backend/kernel_compiler/akg/gpu/akg_gpu_kernel_mod.h" | |||
| #include <fstream> | |||
| #include <algorithm> | |||
| #include "nlohmann/json.hpp" | |||
| #include "utils/ms_utils.h" | |||
| @@ -91,15 +92,13 @@ void GpuKernelMod::SetInputSizeList(const std::vector<size_t> &size_list) { inpu | |||
| void GpuKernelMod::SetOutputSizeList(const std::vector<size_t> &size_list) { output_size_list_ = size_list; } | |||
| void GpuKernelMod::SetWorkspaceSizeList(const std::vector<size_t> &size_list) { workspace_size_list_ = size_list; } | |||
| const std::vector<size_t> &GpuKernelMod::GetInputSizeList() const { return input_size_list_; } | |||
| const std::vector<size_t> &GpuKernelMod::GetOutputSizeList() const { return output_size_list_; } | |||
| const std::vector<size_t> &GpuKernelMod::GetWorkspaceSizeList() const { return workspace_size_list_; } | |||
| bool GpuKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| bool GpuKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) { | |||
| if (stream_ptr == 0) { | |||
| MS_LOG(ERROR) << "stream_ptr should not be nullptr."; | |||
| @@ -123,10 +122,6 @@ bool GpuKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vect | |||
| [](const AddressPtr &input) -> void * { return reinterpret_cast<void *>(&(input->addr)); }); | |||
| (void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(runtimeargs), | |||
| [](const AddressPtr &output) -> void * { return reinterpret_cast<void *>(&(output->addr)); }); | |||
| if (!workspace.empty()) { | |||
| (void)std::transform(std::begin(workspace), std::end(workspace), std::back_inserter(runtimeargs), | |||
| [](const AddressPtr &addr) -> void * { return reinterpret_cast<void *>(&(addr->addr)); }); | |||
| } | |||
| result = cuLaunchKernel(kernel_addr, thread_info[0], thread_info[1], thread_info[2], thread_info[3], thread_info[4], | |||
| thread_info[5], 0, reinterpret_cast<CUstream>(stream_ptr), | |||
| reinterpret_cast<void **>(&runtimeargs[0]), 0); | |||
| @@ -60,7 +60,6 @@ class GpuKernelMod : public KernelMod { | |||
| void SetInputSizeList(const std::vector<size_t> &size_list); | |||
| void SetOutputSizeList(const std::vector<size_t> &size_list); | |||
| void SetWorkspaceSizeList(const std::vector<size_t> &size_list); | |||
| const std::vector<size_t> &GetInputSizeList() const override; | |||
| const std::vector<size_t> &GetOutputSizeList() const override; | |||
| const std::vector<size_t> &GetWorkspaceSizeList() const override; | |||
| @@ -141,8 +141,14 @@ FusionType GetFusionTypeByName(const std::string &name) { | |||
| return iter->first; | |||
| } | |||
| void KernelMeta::Initialize() { | |||
| kernel_meta_path_ = std::string(kGpuKernelMeta) + "/"; | |||
| void KernelMeta::Initialize(int pid) { | |||
| if (pid == -1) { | |||
| kernel_meta_path_ = std::string(kGpuKernelMeta) + "_" + std::to_string(getpid()) + "/"; | |||
| } else { | |||
| kernel_meta_path_ = std::string(kGpuKernelMeta) + "_" + std::to_string(pid) + "/"; | |||
| } | |||
| // remove old kernel cache | |||
| RemoveKernelCache(); | |||
| #if defined(_WIN32) || defined(_WIN64) | |||
| auto ret = mkdir(kernel_meta_path_.c_str()); | |||
| @@ -155,6 +161,21 @@ void KernelMeta::Initialize() { | |||
| initialized_ = true; | |||
| } | |||
| void KernelMeta::RemoveKernelCache() { | |||
| DIR *dir = opendir(kernel_meta_path_.c_str()); | |||
| if (dir == nullptr) { | |||
| return; | |||
| } | |||
| struct dirent *entry; | |||
| while ((entry = readdir(dir)) != nullptr) { | |||
| std::string kernel_file = entry->d_name; | |||
| std::string kernel_file_realpath = kernel_meta_path_ + kernel_file; | |||
| (void)remove(kernel_file_realpath.c_str()); | |||
| } | |||
| (void)closedir(dir); | |||
| (void)rmdir(kernel_meta_path_.c_str()); | |||
| } | |||
| std::string KernelMeta::Search(const std::string &kernel_name) const { | |||
| if (!initialized_) { | |||
| return ""; | |||
| @@ -206,7 +227,7 @@ KernelPackPtr SearchCache(const std::string &kernel_name, const std::string &pro | |||
| KernelPackPtr kernel_pack = std::make_shared<KernelPack>(); | |||
| // just a tmp solution. | |||
| if (!kernel_pack->ReadFromJsonFile(kernel_json, processor)) { | |||
| MS_LOG(ERROR) << "Read cache json and bin file failed[" << kernel_json << "]."; | |||
| MS_LOG(DEBUG) << "Read cache json and bin file failed[" << kernel_json << "]."; | |||
| return nullptr; | |||
| } else { | |||
| return kernel_pack; | |||
| @@ -229,7 +250,7 @@ KernelPackPtr InsertCache(const std::string &kernel_name, const std::string &pro | |||
| (void)kernel_json.append(kernel_name).append(kJsonSuffix); | |||
| KernelPackPtr kernel_pack = std::make_shared<KernelPack>(); | |||
| if (!kernel_pack->ReadFromJsonFile(kernel_json, processor)) { | |||
| MS_LOG(ERROR) << "Read json and bin file failed[" << kernel_json << "]."; | |||
| MS_LOG(DEBUG) << "Read json and bin file failed[" << kernel_json << "]."; | |||
| return nullptr; | |||
| } | |||
| @@ -693,9 +714,6 @@ void GetFuncGraphOutputNodes(const FuncGraphPtr &func_graph, std::vector<AnfNode | |||
| for (size_t input_idx = 1; input_idx < cnode->inputs().size(); ++input_idx) { | |||
| auto input_node = cnode->input(input_idx); | |||
| MS_EXCEPTION_IF_NULL(input_node); | |||
| if (input_node->isa<CNode>() && AnfAlgo::GetInputTensorNum(input_node) == 0) { | |||
| continue; | |||
| } | |||
| output_list->push_back(AnfAlgo::VisitKernel(input_node, 0).first); | |||
| } | |||
| } else { | |||
| @@ -970,39 +988,5 @@ size_t CalOffset(const std::vector<int64_t> &start, const std::vector<int64_t> & | |||
| } | |||
| return offset; | |||
| } | |||
| size_t UnitSizeInBytes(const mindspore::TypeId &t) { | |||
| size_t bytes = 0; | |||
| switch (t) { | |||
| case kNumberTypeBool: | |||
| case kNumberTypeInt8: | |||
| case kNumberTypeUInt8: | |||
| bytes = sizeof(int8_t); | |||
| break; | |||
| case kNumberTypeInt16: | |||
| case kNumberTypeUInt16: | |||
| case kNumberTypeFloat16: | |||
| bytes = sizeof(int16_t); | |||
| break; | |||
| case kNumberTypeInt: | |||
| case kNumberTypeUInt: | |||
| case kNumberTypeInt32: | |||
| case kNumberTypeUInt32: | |||
| case kNumberTypeFloat: | |||
| case kNumberTypeFloat32: | |||
| bytes = sizeof(int32_t); | |||
| break; | |||
| case kNumberTypeUInt64: | |||
| case kNumberTypeInt64: | |||
| case kNumberTypeFloat64: | |||
| bytes = sizeof(int64_t); | |||
| break; | |||
| default: | |||
| MS_LOG(EXCEPTION) << "Invalid types " << t; | |||
| break; | |||
| } | |||
| return bytes; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -55,7 +55,8 @@ using KernelMetaPtr = std::shared_ptr<KernelMetaInfo>; | |||
| class KernelMeta { | |||
| public: | |||
| KernelMeta() = default; | |||
| void Initialize(); | |||
| void Initialize(int pid); | |||
| void RemoveKernelCache(); | |||
| std::string Search(const std::string &kernel_name) const; | |||
| bool Insert(const std::string &kernel_name, const std::string &kernel_json); | |||
| std::string kernel_meta_path() const { return kernel_meta_path_; } | |||
| @@ -143,7 +144,6 @@ size_t CalOffset(const std::vector<int64_t> &start, const std::vector<int64_t> & | |||
| std::vector<int64_t> CalDimOffset(const std::vector<int64_t> &input_shape); | |||
| size_t GetCopySize(const std::vector<int64_t> &dim_offset, const std::vector<int64_t> &start, | |||
| const std::vector<int64_t> &stop); | |||
| size_t UnitSizeInBytes(const mindspore::TypeId &t); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -83,7 +83,7 @@ void AdamCPUKernel::LaunchAdamNnacl(const std::vector<kernel::AddressPtr> &input | |||
| MS_LOG(EXCEPTION) << "AdamFp32 failed."; | |||
| } | |||
| }; | |||
| ParallelLaunchAutoSearch(task, lens, this, ¶llel_search_info_); | |||
| CPUKernelUtils::ParallelForAutoSearch(task, lens, ¶llel_search_info_); | |||
| } | |||
| void AdamCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| @@ -26,26 +26,46 @@ namespace mindspore { | |||
| namespace kernel { | |||
| constexpr size_t kSizeFloat16 = sizeof(float16); | |||
| constexpr size_t kSizeFloat32 = sizeof(float); | |||
| constexpr size_t kScalarIndex = 0; | |||
| constexpr size_t kAdamWeightDecayInputSize = 9; | |||
| constexpr size_t kAdamWeightDecayOutputSize = 3; | |||
| void AdamWeightDecayCPUKernel::ParallelForAdam(const CTask &task, size_t count) { | |||
| auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum(); | |||
| const float block_size = 128.0; | |||
| const float align_size = 16.0; | |||
| size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num; | |||
| std::vector<common::Task> tasks; | |||
| size_t start = 0; | |||
| size_t once_compute_size = align_size * std::ceil(count / (align_size * thread_num)); | |||
| while (start < count) { | |||
| size_t end = (start + once_compute_size) > count ? count : (start + once_compute_size); | |||
| auto block = [&, start, end]() { | |||
| task(start, end); | |||
| return common::SUCCESS; | |||
| }; | |||
| tasks.emplace_back(block); | |||
| start += once_compute_size; | |||
| } | |||
| common::ThreadPool::GetInstance().SyncRun(tasks); | |||
| } | |||
| template <typename T, typename S> | |||
| void AdamWeightDecayCPUKernel::LaunchFusedAdam(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &) { | |||
| auto var = reinterpret_cast<T *>(inputs[VAR]->addr); | |||
| auto m = reinterpret_cast<T *>(inputs[M]->addr); | |||
| auto v = reinterpret_cast<T *>(inputs[V]->addr); | |||
| auto lr = reinterpret_cast<T *>(inputs[LR]->addr)[kScalarIndex]; | |||
| auto beta1 = reinterpret_cast<T *>(inputs[BETA1]->addr)[kScalarIndex]; | |||
| auto beta2 = reinterpret_cast<T *>(inputs[BETA2]->addr)[kScalarIndex]; | |||
| auto epsilon = reinterpret_cast<T *>(inputs[EPSILON]->addr)[kScalarIndex]; | |||
| auto decay = reinterpret_cast<T *>(inputs[DECAY]->addr)[kScalarIndex]; | |||
| auto gradient16 = reinterpret_cast<S *>(inputs[GRAD]->addr); | |||
| void AdamWeightDecayCPUKernel::LaunchFusedAdam(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| auto var = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto m = reinterpret_cast<T *>(inputs[1]->addr); | |||
| auto v = reinterpret_cast<T *>(inputs[2]->addr); | |||
| auto lr = reinterpret_cast<T *>(inputs[3]->addr)[0]; | |||
| auto beta1 = reinterpret_cast<T *>(inputs[4]->addr)[0]; | |||
| auto beta2 = reinterpret_cast<T *>(inputs[5]->addr)[0]; | |||
| auto epsilon = reinterpret_cast<T *>(inputs[6]->addr)[0]; | |||
| auto decay = reinterpret_cast<T *>(inputs[7]->addr)[0]; | |||
| auto gradient16 = reinterpret_cast<S *>(inputs[8]->addr); | |||
| const auto beta1_minus = 1 - beta1; | |||
| const auto beta2_minus = 1 - beta2; | |||
| // multithreading | |||
| size_t lens = inputs[VAR]->size > 0 ? static_cast<size_t>(inputs[VAR]->size / sizeof(float)) : 1; | |||
| size_t lens = inputs[0]->size > 0 ? static_cast<size_t>(inputs[0]->size / sizeof(float)) : 1; | |||
| std::function<void(size_t, size_t)> task; | |||
| task = [&](size_t start, size_t end) { | |||
| @@ -61,27 +81,28 @@ void AdamWeightDecayCPUKernel::LaunchFusedAdam(const std::vector<AddressPtr> &in | |||
| var[i] -= lr * update; | |||
| } | |||
| }; | |||
| CPUKernelUtils::ParallelFor(task, lens); | |||
| ParallelForAdam(task, lens); | |||
| } | |||
| template <typename T> | |||
| void AdamWeightDecayCPUKernel::LaunchAdamWeightDecay(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &) { | |||
| auto var = reinterpret_cast<T *>(inputs[VAR]->addr); | |||
| auto m = reinterpret_cast<T *>(inputs[M]->addr); | |||
| auto v = reinterpret_cast<T *>(inputs[V]->addr); | |||
| auto lr = reinterpret_cast<T *>(inputs[LR]->addr)[kScalarIndex]; | |||
| auto beta1 = reinterpret_cast<T *>(inputs[BETA1]->addr)[kScalarIndex]; | |||
| auto beta2 = reinterpret_cast<T *>(inputs[BETA2]->addr)[kScalarIndex]; | |||
| auto epsilon = reinterpret_cast<T *>(inputs[EPSILON]->addr)[kScalarIndex]; | |||
| auto decay = reinterpret_cast<T *>(inputs[DECAY]->addr)[kScalarIndex]; | |||
| auto gradient = reinterpret_cast<T *>(inputs[GRAD]->addr); | |||
| const std::vector<AddressPtr> &outputs) { | |||
| auto var = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto m = reinterpret_cast<T *>(inputs[1]->addr); | |||
| auto v = reinterpret_cast<T *>(inputs[2]->addr); | |||
| auto lr = reinterpret_cast<T *>(inputs[3]->addr)[0]; | |||
| auto beta1 = reinterpret_cast<T *>(inputs[4]->addr)[0]; | |||
| auto beta2 = reinterpret_cast<T *>(inputs[5]->addr)[0]; | |||
| auto epsilon = reinterpret_cast<T *>(inputs[6]->addr)[0]; | |||
| auto decay = reinterpret_cast<T *>(inputs[7]->addr)[0]; | |||
| auto gradient = reinterpret_cast<T *>(inputs[8]->addr); | |||
| const auto beta1_minus = 1 - beta1; | |||
| const auto beta2_minus = 1 - beta2; | |||
| // multithreading | |||
| size_t lens = inputs[VAR]->size > 0 ? static_cast<size_t>(inputs[VAR]->size / sizeof(float)) : 1; | |||
| size_t lens = inputs[0]->size > 0 ? static_cast<size_t>(inputs[0]->size / sizeof(float)) : 1; | |||
| std::function<void(size_t, size_t)> task; | |||
| task = [&](size_t start, size_t end) { | |||
| size_t i = AdamWeightDecayFp32(var, m, v, lr, beta1, beta2, epsilon, decay, gradient, start, end); | |||
| // remaining | |||
| @@ -93,14 +114,14 @@ void AdamWeightDecayCPUKernel::LaunchAdamWeightDecay(const std::vector<AddressPt | |||
| var[i] -= lr * update; | |||
| } | |||
| }; | |||
| CPUKernelUtils::ParallelFor(task, lens); | |||
| ParallelForAdam(task, lens); | |||
| } | |||
| void AdamWeightDecayCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| std::vector<size_t> var_shape = AnfAlgo::GetInputDeviceShape(kernel_node, VAR); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, VAR); | |||
| gradient_dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, GRAD); | |||
| std::vector<size_t> var_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| gradient_dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 8); | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != kAdamWeightDecayInputSize) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but AdamWeightDecay needs 9 inputs."; | |||
| @@ -134,12 +155,12 @@ void AdamWeightDecayCPUKernel::CheckParam(const std::vector<kernel::AddressPtr> | |||
| } | |||
| size_t elem1_size = elem_num_ * kSizeFloat32; | |||
| size_t elem2_size = gradient_dtype_ == kNumberTypeFloat16 ? elem_num_ * kSizeFloat16 : elem1_size; | |||
| if (inputs[VAR]->size != elem1_size || inputs[M]->size != elem1_size || inputs[V]->size != elem1_size || | |||
| inputs[GRAD]->size != elem2_size) { | |||
| if (inputs[0]->size != elem1_size || inputs[1]->size != elem1_size || inputs[2]->size != elem1_size || | |||
| inputs[8]->size != elem2_size) { | |||
| MS_LOG(EXCEPTION) << "Error input data size!"; | |||
| } | |||
| if (inputs[LR]->size != kSizeFloat32 || inputs[BETA1]->size != kSizeFloat32 || inputs[BETA2]->size != kSizeFloat32 || | |||
| inputs[EPSILON]->size != kSizeFloat32 || inputs[DECAY]->size != kSizeFloat32) { | |||
| if (inputs[3]->size != kSizeFloat32 || inputs[4]->size != kSizeFloat32 || inputs[5]->size != kSizeFloat32 || | |||
| inputs[6]->size != kSizeFloat32 || inputs[7]->size != kSizeFloat32) { | |||
| MS_LOG(EXCEPTION) << "The attribute beta, lr, epsilon and weight decay must be float!"; | |||
| } | |||
| } | |||
| @@ -32,6 +32,7 @@ class AdamWeightDecayCPUKernel : public CPUKernel { | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| void ParallelForAdam(const CTask &task, size_t count); | |||
| void CheckParam(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| template <typename T, typename S> | |||
| void LaunchFusedAdam(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| @@ -40,7 +41,6 @@ class AdamWeightDecayCPUKernel : public CPUKernel { | |||
| size_t elem_num_{0}; | |||
| TypeId dtype_{kTypeUnknown}; | |||
| TypeId gradient_dtype_{kTypeUnknown}; | |||
| enum input_list_ { VAR, M, V, LR, BETA1, BETA2, EPSILON, DECAY, GRAD }; | |||
| }; | |||
| MS_REG_CPU_KERNEL(AdamWeightDecay, | |||
| @@ -76,10 +76,27 @@ void ApplyAdagradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| // multithreading | |||
| size_t length = inputs[0]->size / sizeof(T); | |||
| auto task = [this, &var, &accum, lr, gradient](size_t start, size_t end) { | |||
| LaunchApplyAdagrad(var, accum, lr, gradient, start, end); | |||
| }; | |||
| CPUKernelUtils::ParallelForAutoSearch(task, length, ¶llel_search_info_); | |||
| size_t max_thread_num = std::thread::hardware_concurrency(); | |||
| size_t use_thread_num = length < 128 * max_thread_num ? std::ceil(length / 128.0) : max_thread_num; | |||
| std::vector<std::thread> threads; | |||
| threads.reserve(use_thread_num); | |||
| size_t start = 0; | |||
| const size_t batch_size = (length + use_thread_num - 1) / use_thread_num; | |||
| if (batch_size == 0) { | |||
| MS_LOG(EXCEPTION) << "Error occur in launch kernel"; | |||
| return; | |||
| } | |||
| while (start < length) { | |||
| size_t end = (start + batch_size) > length ? length : (start + batch_size); | |||
| threads.emplace_back( | |||
| std::thread(&ApplyAdagradCPUKernel::LaunchApplyAdagrad<T *>, this, var, accum, lr, gradient, start, end)); | |||
| start += batch_size; | |||
| } | |||
| for (auto &it : threads) { | |||
| it.join(); | |||
| } | |||
| // Copy result to output tensor | |||
| auto output_var = reinterpret_cast<T *>(outputs[0]->addr); | |||
| @@ -19,7 +19,6 @@ | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "nnacl/fp32/power_fp32.h" | |||
| #include "nnacl/fp32/sub_fp32.h" | |||
| #include "nnacl/fp32/mul_fp32.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| @@ -55,7 +54,7 @@ void ArithmeticCPUKernel<T>::Sub(const T *input1, const T *input2, T *out) { | |||
| auto task = [&](size_t start, size_t end) { | |||
| ElementSub(input1 + start, input2 + start, out + start, end - start); | |||
| }; | |||
| ParallelLaunchAutoSearch(task, output_size_, this, ¶llel_search_info_); | |||
| CPUKernelUtils::ParallelFor(task, output_size_, MAX_SUB_SERIAL_SIZE); | |||
| return; | |||
| } | |||
| if (op_para.in_elements_num0_ == 1 || op_para.in_elements_num1_ == 1) { | |||
| @@ -66,7 +65,7 @@ void ArithmeticCPUKernel<T>::Sub(const T *input1, const T *input2, T *out) { | |||
| ElementOptSub(input1 + start, input2, out + start, end - start, &op_para); | |||
| } | |||
| }; | |||
| ParallelLaunchAutoSearch(task, output_size_, this, ¶llel_search_info_); | |||
| CPUKernelUtils::ParallelFor(task, output_size_, MAX_SUB_SERIAL_SIZE); | |||
| return; | |||
| } | |||
| } | |||
| @@ -85,26 +84,6 @@ void ArithmeticCPUKernel<T>::Sub(const T *input1, const T *input2, T *out) { | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::Mul(const T *input1, const T *input2, T *out) { | |||
| if constexpr (std::is_same_v<T, float>) { | |||
| if (input_shape1_ == input_shape2_) { | |||
| auto task = [&](size_t start, size_t end) { | |||
| ElementMul(input1 + start, input2 + start, out + start, end - start); | |||
| }; | |||
| ParallelLaunchAutoSearch(task, output_size_, this, ¶llel_search_info_); | |||
| return; | |||
| } | |||
| if (op_para.in_elements_num0_ == 1 || op_para.in_elements_num1_ == 1) { | |||
| auto task = [&](size_t start, size_t end) { | |||
| if (op_para.in_elements_num0_ == 1) { | |||
| ElementOptMul(input1, input2 + start, out + start, end - start, &op_para); | |||
| } else { | |||
| ElementOptMul(input1 + start, input2, out + start, end - start, &op_para); | |||
| } | |||
| }; | |||
| ParallelLaunchAutoSearch(task, output_size_, this, ¶llel_search_info_); | |||
| return; | |||
| } | |||
| } | |||
| BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_); | |||
| auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| @@ -149,21 +128,21 @@ void ArithmeticCPUKernel<T>::RealDiv(const T *input1, const T *input2, T *out) { | |||
| auto task = [&](size_t start, size_t end) { | |||
| ElementRealDiv<T>(input1 + start, input2 + start, out + start, end - start, 1, 1); | |||
| }; | |||
| ParallelLaunchAutoSearch(task, output_size_, this, ¶llel_search_info_); | |||
| CPUKernelUtils::ParallelFor(task, output_size_, MAX_DIV_SERIAL_SIZE); | |||
| return; | |||
| } | |||
| if (op_para.in_elements_num0_ == 1) { | |||
| auto task = [&](size_t start, size_t end) { | |||
| ElementRealDiv<T>(input1, input2 + start, out + start, end - start, 0, 1); | |||
| }; | |||
| ParallelLaunchAutoSearch(task, output_size_, this, ¶llel_search_info_); | |||
| CPUKernelUtils::ParallelFor(task, output_size_, MAX_DIV_SERIAL_SIZE); | |||
| return; | |||
| } | |||
| if (op_para.in_elements_num1_ == 1) { | |||
| auto task = [&](size_t start, size_t end) { | |||
| ElementRealDiv<T>(input1 + start, input2, out + start, end - start, 1, 0); | |||
| }; | |||
| ParallelLaunchAutoSearch(task, output_size_, this, ¶llel_search_info_); | |||
| CPUKernelUtils::ParallelFor(task, output_size_, MAX_DIV_SERIAL_SIZE); | |||
| return; | |||
| } | |||
| @@ -360,7 +339,7 @@ void ArithmeticCPUKernel<T>::SquaredDifference(const T *input1, const T *input2, | |||
| iter.GenNextPos(); | |||
| } | |||
| }; | |||
| ParallelLaunchAutoSearch(task, output_size_, this, ¶llel_search_info_); | |||
| CPUKernelUtils::ParallelFor(task, output_size_); | |||
| } | |||
| template <typename T> | |||
| @@ -77,8 +77,6 @@ MS_REG_CPU_KERNEL_T(RealDiv, KernelAttr(), ArithmeticCPUKernel, int64_t); | |||
| MS_REG_CPU_KERNEL_T(Div, KernelAttr(), ArithmeticCPUKernel, int32_t); | |||
| MS_REG_CPU_KERNEL_T(Div, KernelAttr(), ArithmeticCPUKernel, float); | |||
| MS_REG_CPU_KERNEL_T(Div, KernelAttr(), ArithmeticCPUKernel, int64_t); | |||
| MS_REG_CPU_KERNEL_T(Mul, KernelAttr(), ArithmeticCPUKernel, float); | |||
| MS_REG_CPU_KERNEL_T(Mul, KernelAttr(), ArithmeticCPUKernel, int32_t); | |||
| MS_REG_CPU_KERNEL_T( | |||
| FloorDiv, KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64), | |||
| ArithmeticCPUKernel, int64_t); | |||
| @@ -13,12 +13,10 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/arithmetic_logic_cpu_kernel.h" | |||
| #include <cmath> | |||
| #include <string> | |||
| #include <map> | |||
| #include <functional> | |||
| #include "backend/kernel_compiler/cpu/arithmetic_logic_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| @@ -31,9 +29,7 @@ void ArithmeticLogicCPUKernel<T>::Less(const T *input1, const T *input2, bool *o | |||
| auto iter = base_iter; | |||
| iter.SetPos(start); | |||
| for (size_t i = start; i < end; i++) { | |||
| auto x = input1[iter.GetInputPosA()]; | |||
| auto y = input2[iter.GetInputPosB()]; | |||
| out[i] = std::less<T>()(x, y); | |||
| out[i] = input1[iter.GetInputPosA()] < input2[iter.GetInputPosB()]; | |||
| iter.GenNextPos(); | |||
| } | |||
| }; | |||
| @@ -41,9 +37,7 @@ void ArithmeticLogicCPUKernel<T>::Less(const T *input1, const T *input2, bool *o | |||
| } else { | |||
| base_iter.SetPos(0); | |||
| for (size_t i = 0; i < output_size_; i++) { | |||
| auto x = input1[base_iter.GetInputPosA()]; | |||
| auto y = input2[base_iter.GetInputPosB()]; | |||
| out[i] = std::less<T>()(x, y); | |||
| out[i] = input1[base_iter.GetInputPosA()] < input2[base_iter.GetInputPosB()]; | |||
| base_iter.GenNextPos(); | |||
| } | |||
| } | |||
| @@ -56,9 +50,7 @@ void ArithmeticLogicCPUKernel<T>::Equal(const T *input1, const T *input2, bool * | |||
| auto iter = base_iter; | |||
| iter.SetPos(start); | |||
| for (size_t i = start; i < end; i++) { | |||
| auto x = input1[iter.GetInputPosA()]; | |||
| auto y = input2[iter.GetInputPosB()]; | |||
| out[i] = std::equal_to<T>()(x, y); | |||
| out[i] = input1[iter.GetInputPosA()] == input2[iter.GetInputPosB()]; | |||
| iter.GenNextPos(); | |||
| } | |||
| }; | |||
| @@ -72,9 +64,7 @@ void ArithmeticLogicCPUKernel<T>::NotEqual(const T *input1, const T *input2, boo | |||
| auto iter = base_iter; | |||
| iter.SetPos(start); | |||
| for (size_t i = start; i < end; i++) { | |||
| auto x = input1[iter.GetInputPosA()]; | |||
| auto y = input2[iter.GetInputPosB()]; | |||
| out[i] = std::not_equal_to<T>()(x, y); | |||
| out[i] = input1[iter.GetInputPosA()] != input2[iter.GetInputPosB()]; | |||
| iter.GenNextPos(); | |||
| } | |||
| }; | |||
| @@ -116,9 +106,7 @@ void ArithmeticLogicCPUKernel<T>::Greater(const T *input1, const T *input2, bool | |||
| auto iter = base_iter; | |||
| iter.SetPos(start); | |||
| for (size_t i = start; i < end; i++) { | |||
| auto x = input1[iter.GetInputPosA()]; | |||
| auto y = input2[iter.GetInputPosB()]; | |||
| out[i] = std::greater<T>()(x, y); | |||
| out[i] = input1[iter.GetInputPosA()] > input2[iter.GetInputPosB()]; | |||
| iter.GenNextPos(); | |||
| } | |||
| }; | |||
| @@ -132,9 +120,7 @@ void ArithmeticLogicCPUKernel<T>::GreaterEqual(const T *input1, const T *input2, | |||
| auto iter = base_iter; | |||
| iter.SetPos(start); | |||
| for (size_t i = start; i < end; i++) { | |||
| auto x = input1[iter.GetInputPosA()]; | |||
| auto y = input2[iter.GetInputPosB()]; | |||
| out[i] = std::greater_equal<T>()(x, y); | |||
| out[i] = input1[iter.GetInputPosA()] >= input2[iter.GetInputPosB()]; | |||
| iter.GenNextPos(); | |||
| } | |||
| }; | |||
| @@ -148,9 +134,7 @@ void ArithmeticLogicCPUKernel<T>::LessEqual(const T *input1, const T *input2, bo | |||
| auto iter = base_iter; | |||
| iter.SetPos(start); | |||
| for (size_t i = start; i < end; i++) { | |||
| auto x = input1[iter.GetInputPosA()]; | |||
| auto y = input2[iter.GetInputPosB()]; | |||
| out[i] = std::less_equal<T>()(x, y); | |||
| out[i] = input1[iter.GetInputPosA()] <= input2[iter.GetInputPosB()]; | |||
| iter.GenNextPos(); | |||
| } | |||
| }; | |||
| @@ -20,7 +20,6 @@ | |||
| #include <map> | |||
| #include "backend/kernel_compiler/cpu/arithmetic_self_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "nnacl/fp32/exp_fp32.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| @@ -32,15 +31,7 @@ void Square(const T *in, T *out, size_t size) { | |||
| out[i] = in[i] * in[i]; | |||
| } | |||
| }; | |||
| ParallelLaunch(task, size, MAX_SQUARE_SERIAL_SIZE); | |||
| } | |||
| template <typename T> | |||
| void Exp(const T *in, T *out, size_t size) { | |||
| if constexpr (std::is_same_v<T, float>) { | |||
| auto task = [&in, &out](size_t start, size_t end) { ExpFp32(in + start, out + start, end - start); }; | |||
| ParallelLaunch(task, size, MAX_EXP_SERIAL_SIZE); | |||
| } | |||
| CPUKernelUtils::ParallelFor(task, size, MAX_SQUARE_SERIAL_SIZE); | |||
| } | |||
| template <typename T> | |||
| @@ -66,7 +57,7 @@ void Neg(const T *in, T *out, size_t size) { | |||
| out[i] = -in[i]; | |||
| } | |||
| }; | |||
| ParallelLaunch(task, size, MAX_NEG_SERIAL_SIZE); | |||
| CPUKernelUtils::ParallelFor(task, size, MAX_NEG_SERIAL_SIZE); | |||
| } | |||
| template <typename T> | |||
| @@ -271,7 +262,6 @@ void Identity(const T *in, T *out, size_t size) { | |||
| static const std::map<std::string, OperateType> kArithmeticOpTypeMap = {{prim::kPrimNeg->name(), NEG}, | |||
| {prim::kPrimSquare->name(), SQUARE}, | |||
| {prim::kPrimOnesLike->name(), ONESLIKE}, | |||
| {prim::kPrimExp->name(), EXP}, | |||
| {prim::kPrimZerosLike->name(), ZEROSLIKE}, | |||
| {prim::kPrimLogicalNot->name(), LOGICALNOT}, | |||
| {prim::kPrimSign->name(), SIGN}, | |||
| @@ -334,29 +324,17 @@ void ArithmeticSelfCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs | |||
| T *output = reinterpret_cast<T *>(outputs[0]->addr); | |||
| size_t lens = outputs[0]->size > 0 ? static_cast<size_t>(outputs[0]->size / sizeof(T)) : 1; | |||
| static const std::map<OperateType, std::function<void(const T *in, T *out, size_t size)>> kArithmeticOpFuncMap = { | |||
| {SQUARE, Square<T>}, | |||
| {SIGN, Sign<T>}, | |||
| {NEG, Neg<T>}, | |||
| {LOGICALNOT, LogicalNot<T>}, | |||
| {ONESLIKE, OnesLike<T>}, | |||
| {ZEROSLIKE, ZerosLike<T>}, | |||
| {FLOOR, Floor<T>}, | |||
| {RECIPROCAL, Reciprocal<T>}, | |||
| {GELU, Gelu<T>}, | |||
| {SIN, Sin<T>}, | |||
| {COS, Cos<T>}, | |||
| {TAN, Tan<T>}, | |||
| {ASIN, Asin<T>}, | |||
| {ACOS, ACos<T>}, | |||
| {ATAN, Atan<T>}, | |||
| {SINH, Sinh<T>}, | |||
| {COSH, Cosh<T>}, | |||
| {ASINH, Asinh<T>}, | |||
| {ACOSH, Acosh<T>}, | |||
| {ATANH, Atanh<T>}, | |||
| {RINT, Rint<T>}, | |||
| {ROUND, Round<T>}, | |||
| {EXP, Exp<T>}}; | |||
| {SQUARE, Square<T>}, {SIGN, Sign<T>}, | |||
| {NEG, Neg<T>}, {LOGICALNOT, LogicalNot<T>}, | |||
| {ONESLIKE, OnesLike<T>}, {ZEROSLIKE, ZerosLike<T>}, | |||
| {FLOOR, Floor<T>}, {RECIPROCAL, Reciprocal<T>}, | |||
| {GELU, Gelu<T>}, {SIN, Sin<T>}, | |||
| {COS, Cos<T>}, {TAN, Tan<T>}, | |||
| {ASIN, Asin<T>}, {ACOS, ACos<T>}, | |||
| {ATAN, Atan<T>}, {SINH, Sinh<T>}, | |||
| {COSH, Cosh<T>}, {ASINH, Asinh<T>}, | |||
| {ACOSH, Acosh<T>}, {ATANH, Atanh<T>}, | |||
| {RINT, Rint<T>}, {ROUND, Round<T>}}; | |||
| if (kArithmeticOpFuncMap.find(operate_type_) != kArithmeticOpFuncMap.end()) { | |||
| kArithmeticOpFuncMap.at(operate_type_)(input, output, lens); | |||
| } else { | |||
| @@ -20,9 +20,8 @@ | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| const float MAX_NEG_SERIAL_SIZE = 5000; | |||
| const float MAX_SQUARE_SERIAL_SIZE = 5000; | |||
| const float MAX_EXP_SERIAL_SIZE = 15000; | |||
| const float MAX_NEG_SERIAL_SIZE = 20000; | |||
| const float MAX_SQUARE_SERIAL_SIZE = 20000; | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| @@ -59,10 +58,6 @@ class IdentityCPUKernel : public ArithmeticSelfCPUKernel { | |||
| MS_REG_CPU_KERNEL(Square, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), | |||
| ArithmeticSelfCPUKernel); | |||
| MS_REG_CPU_KERNEL(Square, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| ArithmeticSelfCPUKernel); | |||
| MS_REG_CPU_KERNEL(Exp, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| ArithmeticSelfCPUKernel); | |||
| MS_REG_CPU_KERNEL(Neg, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| ArithmeticSelfCPUKernel); | |||
| MS_REG_CPU_KERNEL(Neg, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), | |||
| @@ -90,7 +90,7 @@ bool BiasAddCPUKernel::Launch(const std::vector<AddressPtr> &inputs, const std:: | |||
| ElementAdd(src_addr + n_offset, bias_addr, output_addr + n_offset, input_shape_[1]); | |||
| } | |||
| }; | |||
| ParallelLaunchAutoSearch(task, input_shape_[0], this, ¶llel_search_info_); | |||
| CPUKernelUtils::ParallelForAutoSearch(task, input_shape_[0], ¶llel_search_info_); | |||
| } | |||
| return true; | |||
| } | |||
| @@ -55,7 +55,7 @@ bool BiasAddGradCPUKernel::Launch(const std::vector<AddressPtr> &inputs, const s | |||
| auto task = [&](size_t start, size_t end) { | |||
| ReduceSumDim2Axis0(end - start, input_shape_[1], input_shape_[0], input_addr + start, output_addr + start); | |||
| }; | |||
| ParallelLaunchAutoSearch(task, input_shape_[1], this, ¶llel_search_info_); | |||
| CPUKernelUtils::ParallelForAutoSearch(task, input_shape_[1], ¶llel_search_info_); | |||
| } | |||
| return true; | |||
| } | |||
| @@ -74,7 +74,7 @@ bool ConcatCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, c | |||
| } | |||
| } | |||
| }; | |||
| ParallelLaunchAutoSearch(task, before_axis, this, ¶llel_search_info_); | |||
| CPUKernelUtils::ParallelForAutoSearch(task, before_axis, ¶llel_search_info_); | |||
| return true; | |||
| } | |||
| @@ -138,77 +138,6 @@ void CPUKernelUtils::ParallelForAutoSearch(const CTask &task, size_t count, Para | |||
| } | |||
| } | |||
| ActorThreadPool *GetActorMgrInnerThreadPool() { | |||
| auto actor_manager = ActorMgr::GetActorMgrRef(); | |||
| auto thread_pool = actor_manager->GetActorThreadPool(); | |||
| // Init thread_pool if env is windows or ascend, in case that it won't be init in graph_scheduler. | |||
| if (thread_pool == nullptr) { | |||
| const size_t kMaxThreadNum = 23; | |||
| size_t max_thread_num = std::thread::hardware_concurrency() - 1; | |||
| if (max_thread_num < 1) { | |||
| max_thread_num = 1; | |||
| } | |||
| max_thread_num = max_thread_num < kMaxThreadNum ? max_thread_num : kMaxThreadNum; | |||
| actor_manager->Initialize(true, 0, max_thread_num); | |||
| thread_pool = actor_manager->GetActorThreadPool(); | |||
| MS_EXCEPTION_IF_NULL(thread_pool); | |||
| } | |||
| return thread_pool; | |||
| } | |||
| // Use threadpool of mindrt | |||
| void ParallelLaunch(const CTask &task, size_t count, float block_size, Content content) { | |||
| auto thread_pool = GetActorMgrInnerThreadPool(); | |||
| size_t kernel_thread_num = thread_pool->GetKernelThreadNum(); | |||
| if (kernel_thread_num == 0) { | |||
| MS_LOG(EXCEPTION) << "Actor inner pool has been init, but kernel thread is 0!"; | |||
| } | |||
| size_t thread_num = count < block_size * kernel_thread_num ? std::ceil(count / block_size) : kernel_thread_num; | |||
| size_t once_compute_size = (count + thread_num - 1) / thread_num; | |||
| size_t task_num = count / once_compute_size; | |||
| if (count % once_compute_size != 0) { | |||
| task_num += 1; | |||
| } | |||
| auto func = [&](void *, int task_id, float, float) { | |||
| size_t start = task_id * once_compute_size; | |||
| size_t end = (start + once_compute_size) > count ? count : (start + once_compute_size); | |||
| task(start, end); | |||
| return common::SUCCESS; | |||
| }; | |||
| thread_pool->ParallelLaunch(func, content, task_num); | |||
| } | |||
| void ParallelLaunchAutoSearch(const CTask &task, size_t count, Content content, | |||
| ParallelSearchInfo *parallel_search_info) { | |||
| const size_t MAX_POW = 6; | |||
| const size_t AVG_COUNT = 5; | |||
| size_t current_pow = parallel_search_info->search_count / AVG_COUNT; | |||
| if (current_pow < MAX_POW) { | |||
| if (parallel_search_info->search_count % AVG_COUNT == 0) { | |||
| parallel_search_info->tmp_sum_cost_time = 0; | |||
| } | |||
| float block_size = static_cast<float>(count) / std::pow(2.0f, current_pow); | |||
| double start_time = GetTime(); | |||
| ParallelLaunch(task, count, block_size, content); | |||
| double cost_time = GetTime() - start_time; | |||
| parallel_search_info->tmp_sum_cost_time += cost_time; | |||
| parallel_search_info->search_count++; | |||
| if (parallel_search_info->search_count % AVG_COUNT == 0) { | |||
| double avg_time = parallel_search_info->tmp_sum_cost_time / AVG_COUNT; | |||
| if (parallel_search_info->min_cost_time > avg_time) { | |||
| parallel_search_info->min_cost_time = avg_time; | |||
| parallel_search_info->best_block_size = block_size; | |||
| parallel_search_info->best_pow = current_pow; | |||
| } else if (current_pow - parallel_search_info->best_pow >= 2) { | |||
| parallel_search_info->search_count = AVG_COUNT * MAX_POW; | |||
| } | |||
| } | |||
| } else { | |||
| ParallelLaunch(task, count, parallel_search_info->best_block_size, content); | |||
| } | |||
| } | |||
| std::vector<size_t> CPUKernelUtils::FlatShapeByAxis(const std::vector<size_t> &shape, int axis) { | |||
| if (axis < 0) { | |||
| axis = axis + SizeToInt(shape.size()); | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -25,8 +25,6 @@ | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "backend/kernel_compiler/common_utils.h" | |||
| #include "ir/anf.h" | |||
| #include "runtime/framework/graph_scheduler.h" | |||
| #include "actor/actormgr.h" | |||
| using mindspore::kernel::Address; | |||
| using mindspore::kernel::AddressPtr; | |||
| @@ -64,7 +62,6 @@ const char DELTA[] = "delta"; | |||
| const char SORTED[] = "sorted"; | |||
| const char ADJ_ST[] = "adjoint_st"; | |||
| const char ADJ_dT[] = "adjoint_dt"; | |||
| const char PERIODS[] = "periods"; | |||
| enum OperateType { | |||
| ADD = 0, | |||
| @@ -122,7 +119,6 @@ enum OperateType { | |||
| ATAN2, | |||
| RINT, | |||
| ROUND, | |||
| EXP, | |||
| IDENTITY, | |||
| }; | |||
| @@ -156,19 +152,6 @@ class CPUKernel : public kernel::KernelMod { | |||
| std::vector<size_t> output_size_list_; | |||
| std::vector<size_t> workspace_size_list_; | |||
| ParallelSearchInfo parallel_search_info_; | |||
| template <typename T> | |||
| inline T *GetDeviceAddress(const std::vector<AddressPtr> &addr_list, size_t index) { | |||
| if (index >= addr_list.size()) { | |||
| MS_LOG(EXCEPTION) << "Address index(" << index << ") out of range(" << addr_list.size() << ")"; | |||
| } | |||
| if ((addr_list[index] == nullptr) || (addr_list[index]->addr == nullptr) || (addr_list[index]->size == 0)) { | |||
| MS_LOG(EXCEPTION) << "The device address is empty, address index: " << index; | |||
| } | |||
| return reinterpret_cast<T *>(addr_list[index]->addr); | |||
| } | |||
| }; | |||
| class CPUKernelUtils { | |||
| @@ -226,12 +209,6 @@ class TransposeIterator { | |||
| std::vector<size_t> axes_; | |||
| size_t pos_{0}; | |||
| }; | |||
| ActorThreadPool *GetActorMgrInnerThreadPool(); | |||
| void ParallelLaunch(const CTask &task, size_t count, float block_size = 128.0, Content content = nullptr); | |||
| void ParallelLaunchAutoSearch(const CTask &task, size_t count, Content content, | |||
| ParallelSearchInfo *parallel_search_info); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -21,7 +21,6 @@ | |||
| #include <string> | |||
| #include "runtime/device/kernel_info.h" | |||
| #include "runtime/device/cpu/kernel_select_cpu.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| @@ -112,11 +111,6 @@ std::pair<bool, size_t> CPUKernelFactory::CPUKernelAttrCheck(const std::string & | |||
| MS_LOG(INFO) << "Not registered CPU kernel: op[" << kernel_name << "]!"; | |||
| return std::make_pair(false, 0); | |||
| } | |||
| if (device::cpu::IsDynamicParamKernel(kernel_name)) { | |||
| return std::make_pair(true, 0); | |||
| } | |||
| auto kernel_attrs = GetSupportedKernelAttrList(kernel_name); | |||
| if (kernel_attrs[0].GetInputSize() == 0 && kernel_attrs[0].GetOutputSize() == 0) { | |||
| auto op_info_ptr = mindspore::kernel::OpLib::FindOp(kernel_name, kernel::OpImplyType::kCPU); | |||
| @@ -144,7 +144,8 @@ bool CropAndResizeCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &in | |||
| const int bottom_y_index = ceilf(target_y); | |||
| const int left_x_index = floorf(target_x); | |||
| const int right_x_index = ceilf(target_x); | |||
| const float y_lerp = target_y - top_y_index; | |||
| const float x_lerp = target_x - left_x_index; | |||
| const float top_left = static_cast<float>( | |||
| input_image[((box_index * input_height_ + top_y_index) * input_width_ + left_x_index) * channel_ + | |||
| pos_channel]); | |||
| @@ -157,9 +158,9 @@ bool CropAndResizeCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &in | |||
| const float bottom_right = static_cast<float>( | |||
| input_image[((box_index * input_height_ + bottom_y_index) * input_width_ + right_x_index) * channel_ + | |||
| pos_channel]); | |||
| const float top = top_left + (top_right - top_left) * (target_x - left_x_index); | |||
| const float bottom = bottom_left + (bottom_right - bottom_left) * (target_x - left_x_index); | |||
| output[pos] = top + (bottom - top) * (target_y - top_y_index); | |||
| const float top = top_left + (top_right - top_left) * x_lerp; | |||
| const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp; | |||
| output[pos] = top + (bottom - top) * y_lerp; | |||
| } else if (method_ == 3) { | |||
| int y1h = static_cast<int>(y1 * input_height_); | |||
| int x1w = static_cast<int>(x1 * input_width_); | |||
| @@ -169,37 +170,36 @@ bool CropAndResizeCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &in | |||
| int h = ((y2h - y1h + 1) > 1) ? y2h - y1h + 1 : 1; | |||
| float y_point = (pos_y + 0.5) * (h / static_cast<float>(final_height_)) - 0.5; | |||
| int top_y_index = std::min(std::max(0, static_cast<int>(floorf(y_point))), h - 1); | |||
| int bottom_y_index = std::min(std::max(0, static_cast<int>(ceilf(y_point))), h - 1); | |||
| int top_y_index = floorf(y_point); | |||
| top_y_index = std::min(std::max(0, top_y_index), h - 1); | |||
| int bottom_y_index = ceilf(y_point); | |||
| bottom_y_index = std::min(std::max(0, bottom_y_index), h - 1); | |||
| float x_point = (pos_x + 0.5) * (w / static_cast<float>(final_width_)) - 0.5; | |||
| int left_x_index = std::min(std::max(0, static_cast<int>(floorf(x_point))), w - 1); | |||
| int right_x_index = std::min(std::max(0, static_cast<int>(ceilf(x_point))), w - 1); | |||
| int left_x_index = floorf(x_point); | |||
| left_x_index = std::min(std::max(0, left_x_index), w - 1); | |||
| int right_x_index = ceilf(x_point); | |||
| right_x_index = std::min(std::max(0, right_x_index), w - 1); | |||
| const float y_lerp = y_point - top_y_index; | |||
| const float x_lerp = x_point - left_x_index; | |||
| const int y_top_index = box_index * input_height_ + y1h + top_y_index; | |||
| const int y_bottom_index = box_index * input_height_ + y1h + bottom_y_index; | |||
| const int y_top_index = std::max(0, y1h + top_y_index); | |||
| const int y_bottom_index = std::max(0, y1h + bottom_y_index); | |||
| const int x_left_index = std::max(0, x1w + left_x_index); | |||
| const int x_right_index = std::max(0, x1w + right_x_index); | |||
| const float top_left = static_cast<float>( | |||
| input_image[((box_index * input_height_ + y_top_index) * input_width_ + x_left_index) * channel_ + | |||
| pos_channel]); | |||
| const float top_right = static_cast<float>( | |||
| input_image[((box_index * input_height_ + y_top_index) * input_width_ + x_right_index) * channel_ + | |||
| pos_channel]); | |||
| const float top_left = | |||
| static_cast<float>(input_image[(y_top_index * input_width_ + x1w + left_x_index) * channel_ + pos_channel]); | |||
| const float top_right = | |||
| static_cast<float>(input_image[(y_top_index * input_width_ + x1w + right_x_index) * channel_ + pos_channel]); | |||
| const float bottom_left = static_cast<float>( | |||
| input_image[((box_index * input_height_ + y_bottom_index) * input_width_ + x_left_index) * channel_ + | |||
| pos_channel]); | |||
| input_image[(y_bottom_index * input_width_ + x1w + left_x_index) * channel_ + pos_channel]); | |||
| const float bottom_right = static_cast<float>( | |||
| input_image[((box_index * input_height_ + y_bottom_index) * input_width_ + x_right_index) * channel_ + | |||
| pos_channel]); | |||
| output[pos] = top_left * (1 - y_lerp) * (1 - x_lerp) + bottom_right * y_lerp * x_lerp + | |||
| top_right * (1 - y_lerp) * x_lerp + bottom_left * y_lerp * (1 - x_lerp); | |||
| input_image[(y_bottom_index * input_width_ + x1w + right_x_index) * channel_ + pos_channel]); | |||
| float ret = top_left * (1 - y_lerp) * (1 - x_lerp) + bottom_right * y_lerp * x_lerp + | |||
| top_right * (1 - y_lerp) * x_lerp + bottom_left * y_lerp * (1 - x_lerp); | |||
| output[pos] = ret; | |||
| } else { | |||
| // Nearest Neighbour | |||
| const int closest_x_index = roundf(target_x); | |||
| @@ -35,14 +35,15 @@ class CropAndResizeCPUKernel : public CPUKernel { | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| int method_{1}; | |||
| float extrapolation_value_{0.0}; | |||
| int output_size_{0}; | |||
| int input_height_{0}; | |||
| int input_width_{0}; | |||
| int final_height_{0}; | |||
| int final_width_{0}; | |||
| int channel_{0}; | |||
| int method_; | |||
| float extrapolation_value_; | |||
| int input_crop_size_; | |||
| int output_size_; | |||
| int input_height_; | |||
| int input_width_; | |||
| int final_height_; | |||
| int final_width_; | |||
| int channel_; | |||
| }; | |||
| MS_REG_CPU_KERNEL_T(CropAndResize, | |||
| @@ -43,9 +43,9 @@ void DropoutGradCpuBwdKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool DropoutGradCpuBwdKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| if (dtype_ == kNumberTypeFloat16) { | |||
| DropoutBackwardKernel<float16>(inputs, outputs, keep_prob_); | |||
| DropoutBackwardKernel<float16>(inputs, outputs, num_count_, keep_prob_); | |||
| } else if (dtype_ == kNumberTypeFloat32) { | |||
| DropoutBackwardKernel<float>(inputs, outputs, keep_prob_); | |||
| DropoutBackwardKernel<float>(inputs, outputs, num_count_, keep_prob_); | |||
| } else { | |||
| MS_LOG(ERROR) << "Input data type: " << dtype_ << " is not supported for DropoutGrad kernel for CPU."; | |||
| } | |||
| @@ -55,7 +55,8 @@ bool DropoutGradCpuBwdKernel::Launch(const std::vector<AddressPtr> &inputs, cons | |||
| template <typename T> | |||
| void DropoutGradCpuBwdKernel::DropoutBackwardKernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs, float keep_prob) { | |||
| const std::vector<AddressPtr> &outputs, size_t num_count, | |||
| float keep_prob) { | |||
| auto *output = reinterpret_cast<T *>(outputs[0]->addr); | |||
| const auto *input = reinterpret_cast<T *>(inputs[0]->addr); | |||
| const auto *mask = reinterpret_cast<T *>(inputs[1]->addr); | |||
| @@ -69,7 +70,7 @@ void DropoutGradCpuBwdKernel::DropoutBackwardKernel(const std::vector<AddressPtr | |||
| input_tmp[i] = static_cast<float>(input[i]); | |||
| mask_tmp[i] = static_cast<float>(mask[i]); | |||
| } | |||
| DropoutGrad(input_tmp, mask_tmp, output_tmp, SizeToInt(num_count_), scale); | |||
| DropoutGrad(input_tmp, mask_tmp, output_tmp, num_count_, scale); | |||
| for (size_t i = 0; i < num_count_; ++i) { | |||
| output[i] = static_cast<float16>(output_tmp[i]); | |||
| } | |||
| @@ -77,7 +78,7 @@ void DropoutGradCpuBwdKernel::DropoutBackwardKernel(const std::vector<AddressPtr | |||
| delete[] output_tmp; | |||
| delete[] mask_tmp; | |||
| } else if constexpr (std::is_same_v<T, float>) { | |||
| DropoutGrad(input, mask, output, SizeToInt(num_count_), scale); | |||
| DropoutGrad(input, mask, output, num_count_, scale); | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| @@ -40,7 +40,7 @@ class DropoutGradCpuBwdKernel : public CPUKernel { | |||
| TypeId dtype_{kTypeUnknown}; | |||
| template <typename T> | |||
| void DropoutBackwardKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs, | |||
| float keep_prob); | |||
| size_t num_count, float keep_prob); | |||
| }; | |||
| MS_REG_CPU_KERNEL(DropoutGrad, KernelAttr(), DropoutGradCpuBwdKernel); | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,10 +13,8 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/eltwise_grad_cpu_kernel.h" | |||
| #include <string> | |||
| #include <map> | |||
| #include "backend/kernel_compiler/cpu/eltwise_grad_cpu_kernel.h" | |||
| #include "common/thread_pool.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "nnacl/fp32_grad/activation_grad.h" | |||
| @@ -27,50 +25,50 @@ namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T> | |||
| void EltWiseGradCPUKernel<T>::ReluGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const { | |||
| if constexpr (!std::is_same<T, float>::value) { | |||
| if constexpr (std::is_same_v<T, float>) { | |||
| int ret = ::ReluGrad(input1 + start, input2 + start, end - start, out + start); | |||
| if (ret == NNACL_ERR) { | |||
| MS_LOG(EXCEPTION) << "ReLUGrad failed."; | |||
| } | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "ReLUGrad only support float"; | |||
| } | |||
| int ret = ::ReluGrad(input1 + start, input2 + start, end - start, out + start); | |||
| if (ret == NNACL_ERR) { | |||
| MS_LOG(EXCEPTION) << "ReLUGrad execute failed."; | |||
| } | |||
| } | |||
| template <typename T> | |||
| void EltWiseGradCPUKernel<T>::ReLU6Grad(const T *input1, const T *input2, T *out, size_t start, size_t end) const { | |||
| if constexpr (!std::is_same<T, float>::value) { | |||
| if constexpr (std::is_same_v<T, float>) { | |||
| int ret = ::Relu6Grad(input1 + start, input2 + start, end - start, out + start); | |||
| if (ret == NNACL_ERR) { | |||
| MS_LOG(EXCEPTION) << "ReLU6Grad failed."; | |||
| } | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "ReLU6Grad only support float"; | |||
| } | |||
| int ret = ::Relu6Grad(input1 + start, input2 + start, end - start, out + start); | |||
| if (ret == NNACL_ERR) { | |||
| MS_LOG(EXCEPTION) << "ReLU6Grad execute failed."; | |||
| } | |||
| } | |||
| template <typename T> | |||
| void EltWiseGradCPUKernel<T>::AbsGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const { | |||
| if constexpr (!std::is_same<T, float>::value) { | |||
| if constexpr (std::is_same_v<T, float>) { | |||
| int ret = ::ElementAbsGrad(input1 + start, input2 + start, out + start, end - start); | |||
| if (ret == NNACL_ERR) { | |||
| MS_LOG(EXCEPTION) << "AbsGrad failed."; | |||
| } | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "AbsGrad only support float"; | |||
| } | |||
| int ret = ::ElementAbsGrad(input1 + start, input2 + start, out + start, end - start); | |||
| if (ret == NNACL_ERR) { | |||
| MS_LOG(EXCEPTION) << "AbsGrad execute failed."; | |||
| } | |||
| } | |||
| template <typename T> | |||
| void EltWiseGradCPUKernel<T>::SigmoidGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const { | |||
| if constexpr (!std::is_same<T, float>::value) { | |||
| if constexpr (std::is_same_v<T, float>) { | |||
| int ret = ::SigmoidGrad(input2 + start, input1 + start, end - start, out + start); | |||
| if (ret == NNACL_ERR) { | |||
| MS_LOG(EXCEPTION) << "SigmoidGrad failed."; | |||
| } | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "SigmoidGrad only support float"; | |||
| } | |||
| int ret = ::SigmoidGrad(input2 + start, input1 + start, end - start, out + start); | |||
| if (ret == NNACL_ERR) { | |||
| MS_LOG(EXCEPTION) << "SigmoidGrad execute failed."; | |||
| } | |||
| } | |||
| template <typename T> | |||
| @@ -82,14 +80,14 @@ void EltWiseGradCPUKernel<T>::SqrtGrad(const T *input1, const T *input2, T *out, | |||
| template <typename T> | |||
| void EltWiseGradCPUKernel<T>::TanhGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const { | |||
| if constexpr (!std::is_same<T, float>::value) { | |||
| if constexpr (std::is_same_v<T, float>) { | |||
| int ret = ::TanhGrad(input2 + start, input1 + start, end - start, out + start); | |||
| if (ret == NNACL_ERR) { | |||
| MS_LOG(EXCEPTION) << "TanhGrad failed."; | |||
| } | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "TanhGrad only support float"; | |||
| } | |||
| int ret = ::TanhGrad(input2 + start, input1 + start, end - start, out + start); | |||
| if (ret == NNACL_ERR) { | |||
| MS_LOG(EXCEPTION) << "TanhGrad execute failed."; | |||
| } | |||
| } | |||
| template <typename T> | |||
| @@ -209,18 +207,6 @@ void EltWiseGradCPUKernel<T>::AcoshGrad(const T *input1, const T *input2, T *out | |||
| } | |||
| } | |||
| template <typename T> | |||
| void EltWiseGradCPUKernel<T>::SoftplusGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const { | |||
| if constexpr (!std::is_same<T, float>::value) { | |||
| MS_LOG(EXCEPTION) << "SoftplusGrad only support float"; | |||
| } | |||
| int ret = ::SoftplusGrad(input1 + start, input2 + start, end - start, out + start); | |||
| if (ret == NNACL_ERR) { | |||
| MS_LOG(EXCEPTION) << "SoftplusGrad execute failed."; | |||
| } | |||
| } | |||
| template <typename T> | |||
| void EltWiseGradCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| @@ -233,19 +219,12 @@ bool EltWiseGradCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inpu | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| static const std::map<std::string, | |||
| std::function<void(EltWiseGradCPUKernel *, const T *, const T *, T *, size_t, size_t)>> | |||
| elt_map{{prim::kPrimReluGrad->name(), &EltWiseGradCPUKernel<T>::ReluGrad}, | |||
| {prim::kPrimRelu6Grad->name(), &EltWiseGradCPUKernel<T>::ReLU6Grad}, | |||
| {prim::kPrimSigmoidGrad->name(), &EltWiseGradCPUKernel<T>::SigmoidGrad}, | |||
| {prim::kPrimAbsGrad->name(), &EltWiseGradCPUKernel<T>::AbsGrad}, | |||
| {prim::kPrimTanhGrad->name(), &EltWiseGradCPUKernel<T>::TanhGrad}, | |||
| {prim::kPrimSqrtGrad->name(), &EltWiseGradCPUKernel<T>::SqrtGrad}, | |||
| {prim::kPrimGeLUGrad->name(), &EltWiseGradCPUKernel<T>::GeluGrad}, | |||
| {prim::kPrimAsinGrad->name(), &EltWiseGradCPUKernel<T>::AsinGrad}, | |||
| {prim::kPrimACosGrad->name(), &EltWiseGradCPUKernel<T>::ACosGrad}, | |||
| {prim::kPrimAtanGrad->name(), &EltWiseGradCPUKernel<T>::AtanGrad}, | |||
| {prim::kPrimAsinhGrad->name(), &EltWiseGradCPUKernel<T>::AsinhGrad}, | |||
| {prim::kPrimAcoshGrad->name(), &EltWiseGradCPUKernel<T>::AcoshGrad}, | |||
| {prim::kPrimSoftplusGrad->name(), &EltWiseGradCPUKernel<T>::SoftplusGrad}}; | |||
| elt_map{{"ReluGrad", &EltWiseGradCPUKernel<T>::ReluGrad}, {"ReLU6Grad", &EltWiseGradCPUKernel<T>::ReLU6Grad}, | |||
| {"SigmoidGrad", &EltWiseGradCPUKernel<T>::SigmoidGrad}, {"AbsGrad", &EltWiseGradCPUKernel<T>::AbsGrad}, | |||
| {"TanhGrad", &EltWiseGradCPUKernel<T>::TanhGrad}, {"SqrtGrad", &EltWiseGradCPUKernel<T>::SqrtGrad}, | |||
| {"GeLUGrad", &EltWiseGradCPUKernel<T>::GeluGrad}, {"AsinGrad", &EltWiseGradCPUKernel<T>::AsinGrad}, | |||
| {"ACosGrad", &EltWiseGradCPUKernel<T>::ACosGrad}, {"AtanGrad", &EltWiseGradCPUKernel<T>::AtanGrad}, | |||
| {"AsinhGrad", &EltWiseGradCPUKernel<T>::AsinhGrad}, {"AcoshGrad", &EltWiseGradCPUKernel<T>::AcoshGrad}}; | |||
| if (inputs.size() < 2 || outputs.size() != 1) { | |||
| MS_LOG(ERROR) << kernel_name_ << " requires at least 2 inputs and 1 output, but got " << inputs.size() | |||
| << " inputs and " << outputs.size() << " output."; | |||
| @@ -259,9 +238,9 @@ bool EltWiseGradCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inpu | |||
| const auto input1 = reinterpret_cast<T *>(inputs[1]->addr); | |||
| auto output = reinterpret_cast<T *>(outputs[0]->addr); | |||
| ParallelLaunchAutoSearch( | |||
| CPUKernelUtils::ParallelForAutoSearch( | |||
| std::bind(elt_map.at(kernel_name_), this, input0, input1, output, std::placeholders::_1, std::placeholders::_2), | |||
| outputs[0]->size / sizeof(T), this, ¶llel_search_info_); | |||
| outputs[0]->size / sizeof(T), ¶llel_search_info_); | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -48,7 +48,6 @@ class EltWiseGradCPUKernel : public CPUKernel { | |||
| void AtanGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const; | |||
| void AsinhGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const; | |||
| void AcoshGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const; | |||
| void SoftplusGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const; | |||
| std::string kernel_name_ = ""; | |||
| }; | |||
| @@ -104,10 +103,6 @@ MS_REG_CPU_KERNEL_T( | |||
| AcoshGrad, | |||
| KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| EltWiseGradCPUKernel, float); | |||
| MS_REG_CPU_KERNEL_T( | |||
| SoftplusGrad, | |||
| KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| EltWiseGradCPUKernel, float); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -30,7 +30,7 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| // The duration between two PullWeights requests when return code is ResponseCode_SucNotReady. | |||
| // The duration between two downloading requests when return code is ResponseCode_SucNotReady. | |||
| constexpr int kRetryDurationOfPullWeights = 200; | |||
| template <typename T> | |||
| class FusedPullWeightKernel : public CPUKernel { | |||
| @@ -51,17 +51,19 @@ class FusedPullWeightKernel : public CPUKernel { | |||
| MS_EXCEPTION_IF_NULL(fbb); | |||
| total_iteration_++; | |||
| uint64_t step_num_per_iteration = fl::worker::FLWorker::GetInstance().worker_step_num_per_iteration(); | |||
| // The worker has to train kWorkerTrainStepNum standalone iterations before it communicates with server. | |||
| MS_LOG(INFO) << "Try to pull weights. Local step number: " << total_iteration_ | |||
| << ", step number needs to run per iteration: " << step_num_per_iteration; | |||
| if (step_num_per_iteration != fl::kOneStepPerIteration && | |||
| total_iteration_ % step_num_per_iteration != fl::kTrainBeginStepNum) { | |||
| if (total_iteration_ % fl::worker::FLWorker::GetInstance().worker_step_num_per_iteration() != | |||
| fl::kTrainBeginStepNum) { | |||
| return true; | |||
| } | |||
| fl_iteration_++; | |||
| MS_LOG(INFO) << "Launching pulling weight for federated learning iteration " << fl_iteration_; | |||
| if (fl_iteration_ > ps::PSContext::instance()->fl_iteration_num()) { | |||
| MS_LOG(INFO) << ps::PSContext::instance()->fl_iteration_num() << " iterations are completed."; | |||
| fl_iteration_ = 1; | |||
| } | |||
| MS_LOG(INFO) << "Start pulling weight for federated learning iteration " << fl_iteration_; | |||
| if (!BuildPullWeightReq(fbb)) { | |||
| MS_LOG(EXCEPTION) << "Building request for FusedPullWeight failed."; | |||
| return false; | |||
| @@ -71,16 +73,11 @@ class FusedPullWeightKernel : public CPUKernel { | |||
| const schema::ResponsePullWeight *pull_weight_rsp = nullptr; | |||
| int retcode = schema::ResponseCode_SucNotReady; | |||
| while (retcode == schema::ResponseCode_SucNotReady) { | |||
| if (!fl::worker::FLWorker::GetInstance().running()) { | |||
| MS_LOG(WARNING) << "Worker has finished."; | |||
| return true; | |||
| } | |||
| if (!fl::worker::FLWorker::GetInstance().SendToServer( | |||
| 0, fbb->GetBufferPointer(), fbb->GetSize(), ps::core::TcpUserCommand::kPullWeight, &pull_weight_rsp_msg)) { | |||
| MS_LOG(WARNING) << "Sending request for FusedPullWeight to server 0 failed. Retry later."; | |||
| retcode = schema::ResponseCode_SucNotReady; | |||
| std::this_thread::sleep_for(std::chrono::milliseconds(kRetryDurationOfPullWeights)); | |||
| continue; | |||
| MS_LOG(WARNING) << "Sending request for FusedPullWeight to server 0 failed. This iteration is dropped."; | |||
| fl::worker::FLWorker::GetInstance().SetIterationRunning(); | |||
| return true; | |||
| } | |||
| MS_EXCEPTION_IF_NULL(pull_weight_rsp_msg); | |||
| @@ -91,8 +88,6 @@ class FusedPullWeightKernel : public CPUKernel { | |||
| fl_iteration_ = pull_weight_rsp->iteration(); | |||
| MS_LOG(DEBUG) << "Server is not ready for downloading yet. Reason: " << pull_weight_rsp->reason()->str() | |||
| << ". Retry later."; | |||
| // Recreate fbb to avoid memory leak of FlatBuffers. | |||
| fbb = std::make_shared<fl::FBBuilder>(); | |||
| if (!BuildPullWeightReq(fbb)) { | |||
| MS_LOG(EXCEPTION) << "Building request for FusedDownloadWeightsByKeys failed."; | |||
| return false; | |||
| @@ -121,7 +116,7 @@ class FusedPullWeightKernel : public CPUKernel { | |||
| return false; | |||
| } | |||
| } | |||
| MS_LOG(INFO) << "Pull weights for " << weight_full_names_ << " success. Iteration: " << fl_iteration_; | |||
| MS_LOG(INFO) << "Pull weights for " << weight_full_names_ << " succeed. Iteration: " << fl_iteration_; | |||
| fl::worker::FLWorker::GetInstance().SetIterationRunning(); | |||
| return true; | |||
| } | |||
| @@ -28,7 +28,7 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| // The duration between two PushWeights requests when return code is ResponseCode_SucNotReady. | |||
| // The duration between two uploading requests when return code is ResponseCode_SucNotReady. | |||
| constexpr int kRetryDurationOfPushWeights = 200; | |||
| template <typename T> | |||
| class FusedPushWeightKernel : public CPUKernel { | |||
| @@ -49,17 +49,19 @@ class FusedPushWeightKernel : public CPUKernel { | |||
| MS_EXCEPTION_IF_NULL(fbb); | |||
| total_iteration_++; | |||
| uint64_t step_num_per_iteration = fl::worker::FLWorker::GetInstance().worker_step_num_per_iteration(); | |||
| // The worker has to train kWorkerTrainStepNum standalone iterations before it communicates with server. | |||
| MS_LOG(INFO) << "Try to push weights. Local step number: " << total_iteration_ | |||
| << ", step number needs to run per iteration: " << step_num_per_iteration; | |||
| if (step_num_per_iteration != fl::kOneStepPerIteration && | |||
| total_iteration_ % step_num_per_iteration != fl::kTrainEndStepNum) { | |||
| if (total_iteration_ % fl::worker::FLWorker::GetInstance().worker_step_num_per_iteration() != | |||
| fl::kTrainBeginStepNum) { | |||
| return true; | |||
| } | |||
| fl_iteration_++; | |||
| MS_LOG(INFO) << "Launching pushing weight for federated learning iteration " << fl_iteration_; | |||
| if (fl_iteration_ > ps::PSContext::instance()->fl_iteration_num()) { | |||
| MS_LOG(INFO) << ps::PSContext::instance()->fl_iteration_num() << " iterations are completed."; | |||
| fl_iteration_ = 1; | |||
| } | |||
| MS_LOG(INFO) << "Start pushing weight for federated learning iteration " << fl_iteration_; | |||
| if (!BuildPushWeightReq(fbb, inputs)) { | |||
| MS_LOG(EXCEPTION) << "Building request for FusedPushWeight failed."; | |||
| return false; | |||
| @@ -71,17 +73,13 @@ class FusedPushWeightKernel : public CPUKernel { | |||
| const schema::ResponsePushWeight *push_weight_rsp = nullptr; | |||
| int retcode = schema::ResponseCode_SucNotReady; | |||
| while (retcode == schema::ResponseCode_SucNotReady) { | |||
| if (!fl::worker::FLWorker::GetInstance().running()) { | |||
| MS_LOG(WARNING) << "Worker has finished."; | |||
| return true; | |||
| } | |||
| if (!fl::worker::FLWorker::GetInstance().SendToServer(i, fbb->GetBufferPointer(), fbb->GetSize(), | |||
| ps::core::TcpUserCommand::kPushWeight, | |||
| &push_weight_rsp_msg)) { | |||
| MS_LOG(WARNING) << "Sending request for FusedPushWeight to server " << i << " failed."; | |||
| retcode = schema::ResponseCode_SucNotReady; | |||
| std::this_thread::sleep_for(std::chrono::milliseconds(kRetryDurationOfPushWeights)); | |||
| continue; | |||
| MS_LOG(WARNING) << "Sending request for FusedPushWeight to server " << i | |||
| << " failed. This iteration is dropped."; | |||
| fl::worker::FLWorker::GetInstance().SetIterationCompleted(); | |||
| return true; | |||
| } | |||
| MS_EXCEPTION_IF_NULL(push_weight_rsp_msg); | |||
| @@ -107,7 +105,8 @@ class FusedPushWeightKernel : public CPUKernel { | |||
| } | |||
| } | |||
| MS_LOG(INFO) << "Push weights for " << weight_full_names_ << " success. Iteration: " << fl_iteration_; | |||
| MS_LOG(INFO) << "Push weights for " << weight_full_names_ << " succeed. Iteration: " << fl_iteration_; | |||
| fl::worker::FLWorker::GetInstance().SetIterationCompleted(); | |||
| return true; | |||
| } | |||
| @@ -52,26 +52,6 @@ MS_REG_CPU_KERNEL_T( | |||
| MaskedSelect, | |||
| KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeInt32), | |||
| MaskedSelectCPUKernel, int); | |||
| MS_REG_CPU_KERNEL_T( | |||
| MaskedSelect, | |||
| KernelAttr().AddInputAttr(kNumberTypeInt16).AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeInt16), | |||
| MaskedSelectCPUKernel, int16_t); | |||
| MS_REG_CPU_KERNEL_T( | |||
| MaskedSelect, | |||
| KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeInt64), | |||
| MaskedSelectCPUKernel, int64_t); | |||
| MS_REG_CPU_KERNEL_T( | |||
| MaskedSelect, | |||
| KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeFloat16), | |||
| MaskedSelectCPUKernel, float16); | |||
| MS_REG_CPU_KERNEL_T( | |||
| MaskedSelect, | |||
| KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeFloat64), | |||
| MaskedSelectCPUKernel, double); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MASKED_SELECTED_CPU_KERNEL_H_ | |||
| @@ -58,38 +58,6 @@ MS_REG_CPU_KERNEL_T(MaskedSelectGrad, | |||
| .AddInputAttr(kNumberTypeInt32) | |||
| .AddOutputAttr(kNumberTypeInt32), | |||
| MaskedSelectGradCPUKernel, int); | |||
| MS_REG_CPU_KERNEL_T(MaskedSelectGrad, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeBool) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddOutputAttr(kNumberTypeFloat16), | |||
| MaskedSelectGradCPUKernel, float16); | |||
| MS_REG_CPU_KERNEL_T(MaskedSelectGrad, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeFloat64) | |||
| .AddInputAttr(kNumberTypeBool) | |||
| .AddInputAttr(kNumberTypeFloat64) | |||
| .AddOutputAttr(kNumberTypeFloat64), | |||
| MaskedSelectGradCPUKernel, double); | |||
| MS_REG_CPU_KERNEL_T(MaskedSelectGrad, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeInt16) | |||
| .AddInputAttr(kNumberTypeBool) | |||
| .AddInputAttr(kNumberTypeInt16) | |||
| .AddOutputAttr(kNumberTypeInt16), | |||
| MaskedSelectGradCPUKernel, int16_t); | |||
| MS_REG_CPU_KERNEL_T(MaskedSelectGrad, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeInt64) | |||
| .AddInputAttr(kNumberTypeBool) | |||
| .AddInputAttr(kNumberTypeInt64) | |||
| .AddOutputAttr(kNumberTypeInt64), | |||
| MaskedSelectGradCPUKernel, int64_t); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MASKED_SELECTED_GRAD_CPU_KERNEL_H_ | |||