| @@ -0,0 +1,10 @@ | |||
| test/resource/input_data.npy filter=lfs diff=lfs merge=lfs -text | |||
| test/resource/lite/shufflenet.mge filter=lfs diff=lfs merge=lfs -text | |||
| test/resource/lite/shufflenet_crypt_aes.mge filter=lfs diff=lfs merge=lfs -text | |||
| test/resource/lite/test_packed_model.lite filter=lfs diff=lfs merge=lfs -text | |||
| test/resource/lite/test_packed_model_rc4.lite filter=lfs diff=lfs merge=lfs -text | |||
| test/resource/lite/output_data.npy filter=lfs diff=lfs merge=lfs -text | |||
| test/resource/lite/model.mgb filter=lfs diff=lfs merge=lfs -text | |||
| test/resource/lite/liveness_rgb_nosub128.rknn filter=lfs diff=lfs merge=lfs -text | |||
| third_party/librknn_api filter=lfs diff=lfs merge=lfs -text | |||
| test/resource/lite/model_atlas.mgb filter=lfs diff=lfs merge=lfs -text | |||
| @@ -0,0 +1,135 @@ | |||
| option(LITE_BUILD_WITH_MGE "Build lite with MegEngine." ON) | |||
| # config lite_build_config.h.in | |||
| set(LITE_WITH_OPENCL ${MGE_WITH_OPENCL}) | |||
| set(LITE_WITH_CUDA ${MGE_WITH_CUDA}) | |||
| set(LITE_ENABLE_LOGGING ${MGE_ENABLE_LOGGING}) | |||
| set(LITE_ENABLE_EXCEPTION ${MGE_ENABLE_EXCEPTIONS}) | |||
| set(LITE_ASSERT_LOC ${MGB_ASSERT_LOC}) | |||
| if(NOT MGB_WITH_FLATBUFFERS) | |||
| include(../cmake/flatbuffers.cmake) | |||
| endif() | |||
| file(GLOB_RECURSE SRC_FBS src/**/*.fbs) | |||
| build_flatbuffers( | |||
| "${SRC_FBS}" | |||
| "" | |||
| lite_fbs_generate | |||
| "" | |||
| "${CMAKE_CURRENT_BINARY_DIR}" | |||
| "" | |||
| "" | |||
| ) | |||
| file(GLOB_RECURSE SOURCES_LITE src/*.cpp src/*.cc lite-c/*.cpp) | |||
| if(MGE_WITH_MINIMUM_SIZE) | |||
| set(LITE_ENABLE_LOGGING OFF) | |||
| set(LITE_ENABLE_EXCEPTION OFF) | |||
| endif() | |||
| # Write out lite_build_config.h | |||
| # It defines macros needed by lite | |||
| configure_file(src/lite_build_config.h.in ${CMAKE_CURRENT_BINARY_DIR}/genfiles/lite_build_config.h) | |||
| install(FILES ${CMAKE_CURRENT_BINARY_DIR}/genfiles/lite_build_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||
| # begin config lite | |||
| if(LITE_BUILD_WITH_MGE AND LITE_WITH_CUDA AND NOT WIN32) | |||
# FIXME: third_party cpp_redis does not support building with clang-cl
| file(GLOB_RECURSE SOURCES_CPP_REDIS ${PROJECT_SOURCE_DIR}/third_party/cpp_redis/sources/*.cpp) | |||
| list(APPEND SOURCES_LITE ${SOURCES_CPP_REDIS}) | |||
| file(GLOB_RECURSE SOURCES_TACOPIE ${PROJECT_SOURCE_DIR}/third_party/tacopie/sources/*.cpp) | |||
| list(APPEND SOURCES_LITE ${SOURCES_TACOPIE}) | |||
| endif() | |||
| add_library(lite_static STATIC ${SOURCES_LITE}) | |||
| add_dependencies(lite_static lite_fbs_generate) | |||
| include_directories($<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/genfiles>) | |||
| if(LITE_BUILD_WITH_MGE) | |||
| target_link_libraries(lite_static PRIVATE megbrain megdnn ${MGE_CUDA_LIBS}) | |||
| add_compile_definitions(LITE_BUILD_WITH_MGE=1) | |||
| message(STATUS "build lite with MegEngine.") | |||
| else() | |||
| target_link_libraries(lite_static PUBLIC flatbuffers) | |||
| endif() | |||
| include_directories( | |||
| PUBLIC $<INSTALL_INTERFACE:${CMAKE_INSTALL_PREFIX}/lite/include> | |||
| PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/lite/include> | |||
| PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/lite/include/lite> | |||
| PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/lite/lite-c/include> | |||
| PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/lite/src> | |||
| PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/third_party/Json/include> | |||
| ) | |||
| # end config lite | |||
| # define a shared lib | |||
| add_library(lite_shared SHARED $<TARGET_OBJECTS:lite_static>) | |||
| if(LITE_BUILD_WITH_MGE) | |||
| target_link_libraries(lite_shared PRIVATE megbrain megdnn ${MGE_CUDA_LIBS}) | |||
| endif() | |||
| if(ANDROID) | |||
| link_libraries(log) | |||
| target_link_libraries(lite_static PRIVATE log) | |||
| target_link_libraries(lite_shared PRIVATE log) | |||
| endif() | |||
| if(LITE_BUILD_WITH_MGE AND LITE_WITH_CUDA AND NOT WIN32) | |||
# FIXME: third_party cpp_redis does not support building with clang-cl
| target_include_directories(lite_static PRIVATE ${PROJECT_SOURCE_DIR}/third_party/cpp_redis/includes) | |||
| target_include_directories(lite_static PRIVATE ${PROJECT_SOURCE_DIR}/third_party/tacopie/includes) | |||
| target_include_directories(lite_shared PRIVATE ${PROJECT_SOURCE_DIR}/third_party/cpp_redis/includes) | |||
| target_include_directories(lite_shared PRIVATE ${PROJECT_SOURCE_DIR}/third_party/tacopie/includes) | |||
| endif() | |||
| set(LITE_VERSION_SCRIPT ${PROJECT_SOURCE_DIR}/lite/src/version_lite.ld CACHE INTERNAL "Path to linker version script") | |||
| add_custom_target(_lite_version_ld SOURCES ${LITE_VERSION_SCRIPT}) | |||
| if(NOT MSVC AND NOT WIN32) | |||
| set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden") | |||
| set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden") | |||
| endif() | |||
# TODO: implement version script for other OSes
| if (UNIX AND NOT APPLE) | |||
| target_link_options(lite_shared PRIVATE -Wl,--version-script=${LITE_VERSION_SCRIPT}) | |||
| set_target_properties(lite_shared PROPERTIES LINK_DEPENDS ${LITE_VERSION_SCRIPT}) | |||
| endif() | |||
| # config install | |||
| install(TARGETS lite_static | |||
| LIBRARY DESTINATION lite/lib/${MGE_ARCH} | |||
| FRAMEWORK DESTINATION lite/lib/${MGE_ARCH} | |||
| ARCHIVE DESTINATION lite/lib/${MGE_ARCH}) | |||
| install(TARGETS lite_shared | |||
| LIBRARY DESTINATION lite/lib/${MGE_ARCH} | |||
| FRAMEWORK DESTINATION lite/lib/${MGE_ARCH} | |||
| ARCHIVE DESTINATION lite/lib/${MGE_ARCH} | |||
| ) | |||
| install(FILES ${PROJECT_SOURCE_DIR}/lite/include/lite/common_enum_c.h | |||
| DESTINATION ${CMAKE_INSTALL_PREFIX}/lite/include/lite-c) | |||
| install(DIRECTORY ${PROJECT_SOURCE_DIR}/lite/include | |||
| DESTINATION ${CMAKE_INSTALL_PREFIX}/lite FILES_MATCHING PATTERN "*.h") | |||
| install(DIRECTORY ${PROJECT_SOURCE_DIR}/lite/lite-c/include | |||
| DESTINATION ${CMAKE_INSTALL_PREFIX}/lite FILES_MATCHING PATTERN "*.h") | |||
| add_subdirectory(example) | |||
| if(MGE_WITH_TEST) | |||
| add_subdirectory(test) | |||
| endif() | |||
| # tools and example | |||
| add_executable(rc4_encryptor tools/rc4_encrypt.cpp) | |||
| target_link_libraries(rc4_encryptor lite_static) | |||
| if(LITE_BUILD_WITH_MGE AND MGE_WITH_ROCM) | |||
| # FIXME: hip obj can not find cpp obj only through lite_static | |||
| target_link_libraries(rc4_encryptor megdnn) | |||
| endif() | |||
| target_include_directories(rc4_encryptor PRIVATE | |||
${PROJECT_SOURCE_DIR}/lite/src/decryption)
| install (TARGETS rc4_encryptor | |||
| EXPORT ${LITE_EXPORT_TARGETS} | |||
| RUNTIME DESTINATION lite/tools) | |||
| @@ -0,0 +1,251 @@ | |||
# Lite
Lite is a lightweight wrapper of MegEngine that makes it easy to integrate
MegEngine into a user's SDK.
## bazel build
Both the internal bazel build and the CMake build are supported, with C++/C and Python interfaces.
The commands below build the lite_shared target with bazel and can serve as a reference for
building other targets. They depend on the internal bazel setup and megvii3.
### Set up the build environment
The bazel build requires the megvii3 workspace.
#### Clone megvii3 and install bazel
| ```bash | |||
| git clone git@git-core.megvii-inc.com:brain-sdk/megvii3.git | |||
| ./utils/bazel/get_bazel.sh | |||
| ``` | |||
| #### Clone megbrain | |||
| ``` | |||
| git submodule update brain/megbrain brain/midout | |||
| ``` | |||
### Build the x86 CUDA version
| ```bash | |||
| ./bazel build //brain/megbrain/lite:lite_shared --cpu="k8" \ | |||
| --compiler="gcc7_cuda10" -c opt | |||
| ``` | |||
### Build the x86 CPU version
| ```bash | |||
| ./bazel build //brain/megbrain/lite:lite_shared --cpu="k8" \ | |||
| --compiler="gcc9" -c opt | |||
| ``` | |||
### Build the arm OpenCL version
| ```bash | |||
| ./bazel build //brain/megbrain/lite:lite_shared_shared --cpu=android_aarch64 \ | |||
| -c opt --define enable_opencl=1 --define enable_opencl_search=1 | |||
| ``` | |||
### Build the arm OpenCL lite_examples
```bash
bazel-3.0.0-megvii2 build //brain/megbrain/lite:lite_shared_examples \
    --cpu=android_aarch64 --define enable_opencl=1 --define enable_opencl_search=1
```
#### How to run the snpe_loader lite_examples: see the wiki below
https://wiki.megvii-inc.com/pages/viewpage.action?pageId=268786906
### Build the armv7 CPU version
| ```bash | |||
| ./bazel build //brain/megbrain/lite:lite_shared --cpu=android_armv7 \ | |||
| -c opt | |||
| ``` | |||
### Build the arm64 CPU version
| ```bash | |||
| ./bazel build //brain/megbrain/lite:lite_shared --cpu=android_aarch64 \ | |||
| -c opt | |||
| ``` | |||
### Build the arm64 CPU v8.2 version
| ```bash | |||
| ./bazel build //brain/megbrain/lite:lite_shared --cpu=android_aarch64 \ | |||
| --copt -march=armv8.2-a+fp16+dotprod -c opt | |||
| ``` | |||
## CMake build
CMake builds are also supported; see scripts/cmake-build/BUILD_README.md. The example below builds
a release version with both the MegEngine and RKNPU backends enabled and OpenCL turned on.
| ```bash | |||
EXTRA_CMAKE_ARGS="-DANDROID_NATIVE_API_LEVEL=24 -DLITE_BUILD_WITH_RKNPU=ON -DMGE_WITH_OPENCL=ON \
-DMGE_OPENCL_SEARCH_ALGO=ON -DCUSTOM_C_OPR_INIT_FUNC=custom_loader_func" ./scripts/cmake-build/cross_build_android_arm_inference.sh
| ``` | |||
* To enable the profiling feature, add `--copt -DMGB_ENABLE_JSON=1` at build time.
* To enable the fast-run feature, add `--copt -DMGB_ENABLE_FASTRUN=1` at build time.
* When building for arm64, `--copt -mcpu=cortex-a53` can be added for optimization.
### Building with midout trimming
For how midout trimming works, see the midout documentation in megbrain; for how to apply it,
follow the trimming procedures of MegBrain and MegEngine.
## Models
### Supported models
Lite currently only supports the model format dumped by MegEngine. Loadable model files include
the original model file, the original encrypted model, and packed models (encrypted or not). The
encryption algorithm and key can be user-defined and registered into lite; see the
encryption/decryption part of the examples for details.
* Unencrypted original model: the model dumped directly from a trained model in the MegEngine
  environment.
* Encrypted original model: the dumped model encrypted with an encryption algorithm. Lite provides
  two default algorithms in tools, aes and rc4, implemented by aes_encrypt.sh and rc4_encrypt.cpp
  (rc4_encrypt.cpp must be compiled into an executable). A model encrypted this way needs its
  encryption method configured in Config when it is loaded; see the sketch after this list.
* Packed model: the model structure is described below. An encrypted or unencrypted model can be
  packed together with the json config file defined below into a packed model, using the
  pack_model_and_info.py tool under tools; see its help output for usage.
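For reference, a minimal C++ sketch of loading an encrypted original model, assuming the predefined
"AES_default" decryption name and an illustrative model path; the decryption algorithm is selected
through `Config::bare_model_cryption_name`:
```cpp
#include <memory>
#include "lite/network.h"

// Minimal sketch (path is illustrative): pick the decryption algorithm through
// Config::bare_model_cryption_name before loading an encrypted original model.
void load_encrypted_model_sketch() {
    lite::Config config;
    config.bare_model_cryption_name = "AES_default";
    auto network = std::make_shared<lite::Network>(config);
    network->load_model("shufflenet_crypt_aes.mge");
}
```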
### Model structure
Different model files are mainly distinguished by the model_tag in the packed model file.
* Packed file: packing can be done with the pack_model_and_info.py script, which packs the model
  info file (any format, json recommended, encrypted or not) together with the encrypted or
  unencrypted model file, and prepends a Header to help with parsing.
* Original files and original encrypted files have no Header and no model info part; the
  information needed to load the model can be passed through Config and NetworkIO.
### Header
The Header starts with a fixed plaintext model_tag, currently the string "packed_model". It then
records information about each part of the model file: the encryption method of each part, so that
the corresponding decryption method can be called when the model is loaded, and the parsing method
of the model information part. See lite/src/parse_model/pack_model.fbs for details.
### Info part
The Info part describes the model: information the user cares about, such as the format of the
model's input data and the platform the model runs on. It can also be used to check whether the
model is running under the specified conditions. Since different users need different information
here and the content cannot be unified, lite lets users define their own Info content, specify the
**name of the Info parser** in the Header, and register a parsing function under that name in lite.
Lite also provides a predefined format named "LITE_default" with a built-in parsing function; this
info is in JSON format and is defined as follows:
| ```json | |||
| { | |||
| "name": "shufflenet_test", | |||
| "valid": true, | |||
| "version": "8.9999.0", | |||
| "has_compression": false, | |||
| "device": { | |||
| "type": "CPU", | |||
| "device_id": 0, | |||
| "number_threads": 1, | |||
| "use_tensor_rt": false, | |||
| "enable_inplace_model": false | |||
| }, | |||
| "options":{ | |||
| "weight_preprocess": false, | |||
| "var_sanity_check_first_run": true, | |||
| "const_shape": false, | |||
| "jit_level": 0, | |||
| "record_level": 0 | |||
| }, | |||
| "IO":{ | |||
| "inputs":[ | |||
| { | |||
| "name": "data", | |||
| "io_type": "value", | |||
| "is_host": true, | |||
| "dtype": "float32", | |||
| "shape": { | |||
| "dim0": 1, | |||
| "dim1": 3, | |||
| "dim2": 224, | |||
| "dim3": 224 | |||
| } | |||
| } | |||
| ], | |||
| "outputs":[ | |||
| { | |||
| "name": "TRUE_DIV(EXP[12065],reduce0[12067])[12077]", | |||
| "io_type": "value", | |||
| "is_host": true, | |||
| "dtype": "float32", | |||
| "shape": { | |||
| "dim0": 1, | |||
| "dim1": 1000, | |||
| "dim2": 0, | |||
| "dim3": 0 | |||
| } | |||
| } | |||
| ] | |||
| } | |||
| } | |||
| ``` | |||
* name: the name of the model; the user can compare it with the one in the Header to check that
  the correct model is being run.
* valid: whether the settings in this info file affect the model's Config.
* version: the megbrain version the model corresponds to, checked when the model is loaded.
* has_compression: whether the tensor data in this model file is compressed.
* device: currently supported types are "CPU", "CUDA", "OPENCL" and "ATLAS".
* number_threads and enable_inplace_model: only take effect when device is CPU.
* IO::inputs::io_type: value or shape, see include "network.h".
* IO::inputs::is_host: whether the input data comes from the host side or the device side.
* IO::outputs::is_host: whether the output data is stored on the host side or the device side.
* IO::outputs::shape::dimx: a value of 0 means the corresponding dim is invalid.
### Model part
The model part can be either an encrypted or an unencrypted model file.
## Usage
See the documentation and the corresponding examples in the example directory for detailed usage.
## Tools
Lite currently ships three tools in the tools directory (other megbrain tools are not included):
* pack_model_and_info.py is the model packing tool mentioned above. It is a Python script that
  packs an existing model and a model information file in the format described above; the user can
  specify the model name, the model encryption method, the info file encryption method, the info
  parser and so on, for example:
| ```bash | |||
| python3 pack_model_and_info.py --input-model xxx.mge \ | |||
| --model-name="shufflenet_test" \ | |||
| --model-cryption="RC4_default" \ | |||
| --input-info xxx.json \ | |||
| --info-cryption="RC4_default" \ | |||
| --info-parser="LITE_default" \ | |||
| -o xxx.lite | |||
| ``` | |||
* aes_encrypt.sh is an aes encryption script that encrypts a file with a given key into an
  aes-encrypted file; the key is 32 bytes in hexadecimal.
| ```bash | |||
| aes_encrypt.sh xxx.mdl xxx_encrypted.mdl \ | |||
| 000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F | |||
| ``` | |||
* rc4_encrypt.cpp can be compiled into an rc4 encryption tool that encrypts a given file with a
  specified or a default key; it supports both the rc4 and simple_fast_rc4 methods as well as
  custom keys.
* The bazel command to build it for x86 is:
| ```bash | |||
| bazel build //brain/megbrain/lite:rc4_encryptor \ | |||
| --cpu='k8' --compiler='gcc9' | |||
| ``` | |||
* Encrypt a file (see the help output for details):
| ```bash | |||
| rc4_encryptor encrypt_predefined_rc4 \ | |||
| to_be_encrypt.file encrypted.file | |||
| ``` | |||
| @@ -0,0 +1,32 @@ | |||
| /** | |||
| * \file lite/build_config/lite_build_config.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #ifndef _HEADER_LITE_BUILD_CONFIG | |||
| #define _HEADER_LITE_BUILD_CONFIG | |||
| #ifndef LITE_ENABLE_LOGGING | |||
| #define LITE_ENABLE_LOGGING 1 | |||
| #endif | |||
| #ifndef LITE_ENABLE_EXCEPTION | |||
| #if __cpp_exceptions || __EXCEPTIONS || \ | |||
| (defined(_MSC_VER) && defined(_CPPUNWIND)) | |||
| #define LITE_ENABLE_EXCEPTION 1 | |||
| #else | |||
| #define LITE_ENABLE_EXCEPTION 0 | |||
| #endif | |||
| #endif | |||
| #ifndef LITE_WITH_CUDA | |||
| #define LITE_WITH_CUDA 0 | |||
| #endif | |||
| #ifndef LITE_ASSERT_LOC | |||
| #define LITE_ASSERT_LOC 1 | |||
| #endif | |||
| #endif // _HEADER_LITE_BUILD_CONFIG | |||
| @@ -0,0 +1,47 @@ | |||
| file (GLOB_RECURSE SOURCES ./*.cpp) | |||
| add_executable(lite_examples ${SOURCES}) | |||
| if(LITE_BUILD_WITH_RKNPU) | |||
    # rknn sdk 1.0.0 depends on libc++_shared; use gold to remove the NEEDED shared-object symbol check
| target_link_options(lite_examples PRIVATE "-fuse-ld=gold") | |||
| endif() | |||
| target_link_libraries(lite_examples lite_static) | |||
| if(LITE_BUILD_WITH_MGE AND MGE_WITH_ROCM) | |||
| # FIXME: hip obj can not find cpp obj only through lite_static | |||
| target_link_libraries(lite_examples megdnn) | |||
| endif() | |||
| if(UNIX) | |||
| if(APPLE OR ANDROID) | |||
| target_link_libraries(lite_examples dl) | |||
| else() | |||
| target_link_libraries(lite_examples dl rt) | |||
| endif() | |||
| endif() | |||
| install (TARGETS lite_examples | |||
| EXPORT ${LITE_EXPORT_TARGETS} | |||
| RUNTIME DESTINATION lite/bin) | |||
| # add lite_examples_depends_shared for CI check symbol export valid | |||
| add_executable(lite_examples_depends_shared ${SOURCES}) | |||
| if(LITE_BUILD_WITH_RKNPU) | |||
    # rknn sdk 1.0.0 depends on libc++_shared; use gold to remove the NEEDED shared-object symbol check
| target_link_options(lite_examples_depends_shared PRIVATE "-fuse-ld=gold") | |||
| endif() | |||
| target_link_libraries(lite_examples_depends_shared lite_shared) | |||
| if(UNIX) | |||
| if(APPLE OR ANDROID) | |||
| target_link_libraries(lite_examples_depends_shared dl) | |||
| else() | |||
| target_link_libraries(lite_examples_depends_shared dl rt) | |||
| endif() | |||
| endif() | |||
| install (TARGETS lite_examples_depends_shared | |||
| EXPORT ${LITE_EXPORT_TARGETS} | |||
| RUNTIME DESTINATION lite/bin) | |||
| @@ -0,0 +1,101 @@ | |||
| /** | |||
 * \file example/example.h
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include <lite_build_config.h> | |||
| #include "lite/global.h" | |||
| #include "lite/network.h" | |||
| #include "lite/tensor.h" | |||
| #include "npy.h" | |||
#include <string.h>
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
| namespace lite { | |||
| namespace example { | |||
| void set_cpu_affinity(const std::vector<int>& cpuset); | |||
| struct Args { | |||
| int args_parse_ret = 0; | |||
| std::string example_name; | |||
| std::string model_path; | |||
| std::string input_path; | |||
| std::string output_path; | |||
| std::string loader_path; | |||
| static Args from_argv(int argc, char** argv); | |||
| }; | |||
| std::shared_ptr<Tensor> parse_npy( | |||
| const std::string& path, | |||
| LiteBackend backend = LiteBackend::LITE_DEFAULT); | |||
| using ExampleFunc = std::function<bool(const Args&)>; | |||
| using ExampleFuncMap = std::unordered_map<std::string, ExampleFunc>; | |||
| ExampleFuncMap* get_example_function_map(); | |||
bool register_example(std::string example_name, const ExampleFunc& function);
| template <int> | |||
| struct Register; | |||
| #if LITE_BUILD_WITH_MGE | |||
| #if LITE_WITH_CUDA | |||
| bool load_from_path_run_cuda(const Args& args); | |||
| #endif | |||
| bool basic_load_from_path(const Args& args); | |||
| bool basic_load_from_path_with_loader(const Args& args); | |||
| bool basic_load_from_memory(const Args& args); | |||
| bool cpu_affinity(const Args& args); | |||
| bool network_share_same_weights(const Args& args); | |||
| bool reset_input(const Args& args); | |||
| bool reset_input_output(const Args& args); | |||
| bool config_user_allocator(const Args& args); | |||
| bool register_cryption_method(const Args& args); | |||
| bool update_cryption_key(const Args& args); | |||
| bool async_forward(const Args& args); | |||
| #if LITE_WITH_CUDA | |||
| bool device_input(const Args& args); | |||
| bool device_input_output(const Args& args); | |||
| bool pinned_host_input(const Args& args); | |||
| #endif | |||
| #endif | |||
| } // namespace example | |||
| } // namespace lite | |||
| #if LITE_BUILD_WITH_MGE | |||
| bool basic_c_interface(const lite::example::Args& args); | |||
| bool device_io_c_interface(const lite::example::Args& args); | |||
| bool async_c_interface(const lite::example::Args& args); | |||
| #endif | |||
| #define CONCAT_IMPL(a, b) a##b | |||
| #define MACRO_CONCAT(a, b) CONCAT_IMPL(a, b) | |||
| #define REGIST_EXAMPLE(name_, func_) \ | |||
| REGIST_EXAMPLE_WITH_NUM(__COUNTER__, name_, func_) | |||
| #define REGIST_EXAMPLE_WITH_NUM(number_, name_, func_) \ | |||
| template <> \ | |||
| struct Register<number_> { \ | |||
| Register() { register_example(name_, func_); } \ | |||
| }; \ | |||
| namespace { \ | |||
| Register<number_> MACRO_CONCAT(example_function_, number_); \ | |||
| } | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,172 @@ | |||
| /** | |||
| * \file example/example.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "lite/global.h" | |||
| #include "lite/network.h" | |||
| #include "lite/tensor.h" | |||
| #include "example.h" | |||
| #include "npy.h" | |||
| #include <string.h> | |||
| #include <map> | |||
| #include <memory> | |||
| #include <vector> | |||
| using namespace lite; | |||
| using namespace example; | |||
| Args Args::from_argv(int argc, char** argv) { | |||
| Args ret; | |||
| if (argc < 4) { | |||
| printf("usage: lite_examples <example_name> <model file> <input " | |||
| "file> <output file>.\n"); | |||
| printf("*********The output file is optional.*************\n"); | |||
| printf("The registered examples include:\n"); | |||
| size_t index = 0; | |||
| for (auto it : *get_example_function_map()) { | |||
| printf("%zu : %s\n", index, it.first.c_str()); | |||
| index++; | |||
| } | |||
| ret.args_parse_ret = -1; | |||
| return ret; | |||
| } | |||
| ret.example_name = argv[1]; | |||
| ret.model_path = argv[2]; | |||
| ret.input_path = argv[3]; | |||
| if (argc > 4) { | |||
| ret.output_path = argv[4]; | |||
| } | |||
| if (argc > 5) { | |||
| ret.loader_path = argv[5]; | |||
| } | |||
| return ret; | |||
| } | |||
| ExampleFuncMap* lite::example::get_example_function_map() { | |||
| static ExampleFuncMap static_map; | |||
| return &static_map; | |||
| } | |||
| bool lite::example::register_example(std::string example_name, | |||
                                     const ExampleFunc& function) {
    auto map = get_example_function_map();
    if (map->find(example_name) != map->end()) {
        printf("Error!!! This example has already been registered\n");
        return false;
    }
    (*map)[example_name] = function;
| return true; | |||
| } | |||
| std::shared_ptr<Tensor> lite::example::parse_npy(const std::string& path, | |||
| LiteBackend backend) { | |||
| std::string type_str; | |||
| std::vector<npy::ndarray_len_t> stl_shape; | |||
| std::vector<int8_t> raw; | |||
| npy::LoadArrayFromNumpy(path, type_str, stl_shape, raw); | |||
| auto lite_tensor = | |||
| std::make_shared<Tensor>(backend, LiteDeviceType::LITE_CPU); | |||
| Layout layout; | |||
| layout.ndim = stl_shape.size(); | |||
| const std::map<std::string, LiteDataType> type_map = { | |||
| {"f4", LiteDataType::LITE_FLOAT}, | |||
| {"i4", LiteDataType::LITE_INT}, | |||
| {"i1", LiteDataType::LITE_INT8}, | |||
| {"u1", LiteDataType::LITE_UINT8}}; | |||
| layout.shapes[0] = 1; | |||
| for (size_t i = 0; i < layout.ndim; i++) { | |||
| layout.shapes[i] = static_cast<size_t>(stl_shape[i]); | |||
| } | |||
| for (auto& item : type_map) { | |||
| if (type_str.find(item.first) != std::string::npos) { | |||
| layout.data_type = item.second; | |||
| break; | |||
| } | |||
| } | |||
| lite_tensor->set_layout(layout); | |||
| size_t length = lite_tensor->get_tensor_total_size_in_byte(); | |||
| void* dest = lite_tensor->get_memory_ptr(); | |||
| memcpy(dest, raw.data(), length); | |||
| //! rknn not support reshape now | |||
| if (layout.ndim == 3) { | |||
| lite_tensor->reshape({1, static_cast<int>(layout.shapes[0]), | |||
| static_cast<int>(layout.shapes[1]), | |||
| static_cast<int>(layout.shapes[2])}); | |||
| } | |||
| return lite_tensor; | |||
| } | |||
| void lite::example::set_cpu_affinity(const std::vector<int>& cpuset) { | |||
| #if defined(__APPLE__) || defined(WIN32) | |||
| #pragma message("set_cpu_affinity not enabled on apple and windows platform") | |||
| #else | |||
| cpu_set_t mask; | |||
| CPU_ZERO(&mask); | |||
| for (auto i : cpuset) { | |||
| CPU_SET(i, &mask); | |||
| } | |||
| auto err = sched_setaffinity(0, sizeof(mask), &mask); | |||
| if (err) { | |||
| printf("failed to sched_setaffinity: %s (error ignored)", | |||
| strerror(errno)); | |||
| } | |||
| #endif | |||
| } | |||
| int main(int argc, char** argv) { | |||
| set_log_level(LiteLogLevel::WARN); | |||
| auto&& args = Args::from_argv(argc, argv); | |||
| if (args.args_parse_ret) | |||
| return -1; | |||
| auto map = get_example_function_map(); | |||
| auto example = (*map)[args.example_name]; | |||
| if (example) { | |||
| printf("Begin to run %s example.\n", args.example_name.c_str()); | |||
| return example(args); | |||
| } else { | |||
| printf("The example of %s is not registed.", args.example_name.c_str()); | |||
| return -1; | |||
| } | |||
| } | |||
| namespace lite { | |||
| namespace example { | |||
| #if LITE_BUILD_WITH_MGE | |||
| #if LITE_WITH_CUDA | |||
| REGIST_EXAMPLE("load_from_path_run_cuda", load_from_path_run_cuda); | |||
| #endif | |||
| REGIST_EXAMPLE("basic_load_from_path", basic_load_from_path); | |||
| REGIST_EXAMPLE("basic_load_from_path_with_loader", basic_load_from_path_with_loader); | |||
| REGIST_EXAMPLE("basic_load_from_memory", basic_load_from_memory); | |||
| REGIST_EXAMPLE("cpu_affinity", cpu_affinity); | |||
| REGIST_EXAMPLE("register_cryption_method", register_cryption_method); | |||
| REGIST_EXAMPLE("update_cryption_key", update_cryption_key); | |||
| REGIST_EXAMPLE("network_share_same_weights", network_share_same_weights); | |||
| REGIST_EXAMPLE("reset_input", reset_input); | |||
| REGIST_EXAMPLE("reset_input_output", reset_input_output); | |||
| REGIST_EXAMPLE("config_user_allocator", config_user_allocator); | |||
| REGIST_EXAMPLE("async_forward", async_forward); | |||
| REGIST_EXAMPLE("basic_c_interface", basic_c_interface); | |||
| REGIST_EXAMPLE("device_io_c_interface", device_io_c_interface); | |||
| REGIST_EXAMPLE("async_c_interface", async_c_interface); | |||
| #if LITE_WITH_CUDA | |||
| REGIST_EXAMPLE("device_input", device_input); | |||
| REGIST_EXAMPLE("device_input_output", device_input_output); | |||
| REGIST_EXAMPLE("pinned_host_input", pinned_host_input); | |||
| #endif | |||
| #endif | |||
| } // namespace example | |||
| } // namespace lite | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,166 @@ | |||
# Example
This directory contains a series of examples that call the lite interfaces to run inference; they
mainly demonstrate how the different lite interfaces are used in different situations. All examples
use shufflenet for the demonstration.
## Building and running the examples with bazel
* Follow the README.md in the top-level directory to set up the megvii3 bazel build environment,
  then build the CPU version:
| ```bash | |||
| ./bazel build //brain/megbrain/lite:lite_examples --cpu="k8" \ | |||
| --compiler="gcc9" -c opt | |||
| ``` | |||
* At runtime, specify the name of the example to run, the model to run, and the input data for the
  model.
* List all example names:
```bash
bazel-bin/brain/megbrain/lite/lite_examples
```
* Run an example; the command below runs basic_load_from_memory:
```bash
bazel-bin/brain/megbrain/lite/lite_examples \
    basic_load_from_memory \
    path-to-megbrain/lite/test/resource/lite/shufflenet.mge \
    path-to-megbrain/lite/test/resource/lite/input_data.npy
```
## Basic usage
* **Implemented in basic.cpp, including basic_load_from_path and basic_load_from_memory.**
* These examples use lite to run basic inference: the model is loaded with the default
  configuration, the input data is copied into the input tensor before forward, and after forward
  the result is copied from the output tensor into user memory. Both the input and output tensors
  are obtained from the Network by name, and their layouts can be read directly from the tensors.
  **The layout of an output tensor is only correct after forward has finished.** A minimal sketch
  is shown after this list.
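A minimal C++ sketch of this flow (the input buffer and the printed field are illustrative; the
calls mirror basic_load_from_path in basic.cpp):
```cpp
#include <cstdio>
#include <cstring>
#include <memory>
#include <string>
#include "lite/network.h"
#include "lite/tensor.h"

// Minimal sketch of the basic flow: load, fill the input, forward, read the output.
void basic_inference_sketch(const std::string& model_path, const void* input_data,
                            size_t input_bytes) {
    auto network = std::make_shared<lite::Network>();
    network->load_model(model_path);

    // The input tensor is obtained from the network; the user data is copied into it.
    auto input = network->get_input_tensor(0);
    std::memcpy(input->get_memory_ptr(), input_data, input_bytes);

    network->forward();
    network->wait();

    // The output layout is only valid after forward has finished.
    auto output = network->get_output_tensor(0);
    const float* out = static_cast<const float*>(output->get_memory_ptr());
    size_t out_elems = output->get_tensor_total_size_in_byte() /
                       output->get_layout().get_elem_size();
    std::printf("output has %zu elements, first = %f\n", out_elems, out[0]);
}
```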
## User-provided memory for input and output
* **Implemented in reset_io.cpp, including two examples: reset_input and reset_input_output.**
* These examples show how the input tensor can use user-provided memory (which already holds the
  input data), and how the output tensor can also point to user-provided memory, so that after
  forward the Network writes the result directly into that memory, avoiding unnecessary memory
  copies.
* This is done through the tensor's reset interface, which re-binds the tensor to new memory and an
  optional layout; if no layout is given, the tensor's original layout is kept. See the sketch
  below.
* **Because the memory is allocated by the user, the user must know the layouts of the input and
  output tensors in advance and allocate memory accordingly. Memory set through reset is not
  managed by the tensor; its lifetime is managed by the user.**
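A minimal sketch of reset (the model path and buffer are illustrative; the layout is read from the
tensor first so the user knows how much memory to provide):
```cpp
#include <cstdint>
#include <memory>
#include <vector>
#include "lite/network.h"
#include "lite/tensor.h"

// Minimal sketch: hand user-owned memory to the input tensor via reset.
void reset_input_sketch() {
    auto network = std::make_shared<lite::Network>();
    network->load_model("shufflenet.mge");

    auto input = network->get_input_tensor(0);
    lite::Layout layout = input->get_layout();

    // User-owned buffer; it must stay alive until forward has finished and is assumed
    // to already hold the input data for this layout.
    std::vector<uint8_t> user_buffer(input->get_tensor_total_size_in_byte());
    input->reset(user_buffer.data(), layout);

    network->forward();
    network->wait();
}
```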
## Device memory for input and output
* **Implemented in device_io.cpp: the device_input and device_input_output examples.**
* These examples configure the model to run on a device (CUDA) and use device memory allocated by
  the user in advance as the model's input and output. The input/output must be declared as being
  on the device when the Network is constructed (the default is CPU); everything else is the same
  as using user-provided host memory. A sketch follows.
* The tensor's is_host() interface tells whether a tensor lives on the device side or the host
  side.
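A minimal sketch of a device-side input, assuming an input named "data" as in the examples, an
already-filled host tensor host_src, and the Config initializer pattern used in the example code:
```cpp
#include <memory>
#include <string>
#include "lite/network.h"
#include "lite/tensor.h"

// Minimal sketch: declare the input as device-resident, then feed it device memory.
void device_input_sketch(const std::string& model_path, const lite::Tensor& host_src) {
    lite::Config config{false, -1, LiteDeviceType::LITE_CUDA};
    lite::NetworkIO io;
    bool is_host = false;  // the input tensor will live on the device
    io.inputs.push_back(lite::IO{"data", is_host});

    auto network = std::make_shared<lite::Network>(config, io);
    network->load_model(model_path);

    auto input = network->get_input_tensor(0);
    lite::Layout layout = input->get_layout();

    // Copy the host data into device memory and hand that memory to the input tensor.
    lite::Tensor device_tensor(LiteDeviceType::LITE_CUDA, layout);
    device_tensor.copy_from(host_src);
    input->reset(device_tensor.get_memory_ptr(), layout);

    network->forward();
    network->wait();
}
```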
## Pinned host memory as input
* **Implemented in device_io.cpp, in the function pinned_host_input.**
* In this example the model runs on a device (CUDA) while the input and output stay on the CPU. To
  speed up the host2device copy, the memory of the CPU input tensor is allocated in advance as cuda
  pinned memory. Currently, when the output tensor is not on the device, it is pinned host memory
  by default.
* To allocate pinned host memory, construct the tensor with the device, the layout and the
  is_pinned_host flag:
| ```C | |||
| bool is_pinned_host = true; | |||
| auto tensor_pinned_input = | |||
| Tensor(LiteDeviceType::LITE_CUDA, input_layout, is_pinned_host); | |||
| ``` | |||
## User-defined memory allocator
* **Implemented in user_allocator.cpp, in the function config_user_allocator.**
* This example uses a user-defined CPU memory allocator to show how to set a custom Allocator. A
  user-defined allocator must inherit from lite's Allocator base class and implement the allocate
  and free interfaces. This has currently been verified on CPU; other devices remain to be tested.
  A hedged sketch of such an allocator follows the declaration below.
* The custom allocator is set through the following Network interface:
| ```C | |||
| Network& set_memory_allocator(std::shared_ptr<Allocator> user_allocator); | |||
| ``` | |||
## Multiple Networks sharing the same model weights
* **Implemented in network_share_weights.cpp, in the function network_share_same_weights.**
* Users often want several Networks to share one copy of the weights: since the weights in a model
  are read-only, this reduces the runtime memory usage. This example shows how to do that in lite:
  a new Network is created, and the user can give it a new Config, NetworkIO and other settings so
  that the new Network serves a different purpose.
* The interface for loading a new Network from an existing one is the following Network interface:
| ```C | |||
| static void shared_weight_with_network( | |||
| std::shared_ptr<Network> dst_network, | |||
| const std::shared_ptr<Network> src_network); | |||
| ``` | |||
* dst_network: the newly loaded Network
* src_network: the already loaded Network (a usage sketch follows)
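A hedged usage sketch based on the declaration above; it assumes the static method is accessible as
Network::shared_weight_with_network (in some lite versions this helper may live in the Runtime
class instead), and the model path is illustrative:
```cpp
#include <memory>
#include "lite/network.h"

// Hedged sketch: load one network, then create a second one that reuses its weights.
void share_weights_sketch() {
    auto src_network = std::make_shared<lite::Network>();
    src_network->load_model("shufflenet.mge");

    lite::Config new_config;  // the new network may use a different Config / NetworkIO
    auto dst_network = std::make_shared<lite::Network>(new_config);
    // Loads dst_network from src_network while sharing the read-only weights.
    lite::Network::shared_weight_with_network(dst_network, src_network);
}
```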
## CPU core binding
* **Implemented in cpu_affinity.cpp, in the function cpu_affinity.**
* This example runs the model on multiple CPU threads and uses Runtime::set_runtime_thread_affinity
  to set a core-binding callback. The callback receives the id of the current thread, and the user
  decides how to bind that thread; with n threads in total, the thread with id n-1 is the main
  thread. A sketch follows.
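A minimal sketch mirroring cpu_affinity.cpp; set_cpu_affinity here is the helper from the examples,
and the core ids and include path are illustrative:
```cpp
#include <memory>
#include <string>
#include <vector>
#include "../example.h"  // for lite::example::set_cpu_affinity (path is illustrative)
#include "lite/network.h"

// Minimal sketch: run on 4 CPU threads and bind each runtime thread to one core.
void cpu_affinity_sketch(const std::string& model_path) {
    auto network = std::make_shared<lite::Network>();
    lite::Runtime::set_cpu_threads_number(network, 4);
    network->load_model(model_path);

    std::vector<int> core_ids = {0, 1, 2, 3};
    lite::Runtime::set_runtime_thread_affinity(network, [core_ids](int thread_id) {
        // With n threads, the thread with id n-1 is the main thread.
        lite::example::set_cpu_affinity({core_ids[thread_id]});
    });
}
```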
## Registering a user-defined decryption algorithm and key
* **Implemented in user_cryption.cpp, in the functions register_cryption_method and
  update_aes_key.**
* These two examples use lite's interfaces for registering a custom decryption algorithm and for
  updating one, in order to load a model with a user-defined decryption method. A custom decryption
  function is defined (it does essentially nothing: it XORs the model with the key twice and
  returns it, i.e. it returns the original model unchanged), registered into lite, and the
  algorithm name is then given in bare_model_cryption_name of the Config when the Network is
  created. The second example shows how to update the key.
Lite currently predefines several decryption algorithms:
* AES_default: the key is 32 unsigned char values, 0 to 31 by default.
* RC4_default: the key is 8 unsigned char values, a hash key followed by an enc_key.
* SIMPLE_FAST_RC4_default: the key has the same structure as RC4_default.
The naming convention is roughly: the uppercase part is the algorithm name, and the lowercase part
after '_' identifies the decryption key.
The interfaces are:
| ```C | |||
| bool register_decryption_and_key(std::string decrypt_name, | |||
| const DecryptionFunc& func, | |||
| const std::vector<uint8_t>& key); | |||
| bool update_decryption_or_key(std::string decrypt_name, | |||
| const DecryptionFunc& func, | |||
| const std::vector<uint8_t>& key); | |||
| ``` | |||
The register interface requires all three arguments to be valid. In update, decrypt_name must be an
already registered decryption algorithm; the non-empty parts of func and key are used to update
that algorithm. A hedged sketch follows.
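A hedged sketch of registering a trivial "decryption" function; the DecryptionFunc signature used
here, (const void*, size_t, const std::vector&lt;uint8_t&gt;&amp;) returning std::vector&lt;uint8_t&gt;, and the
name "plain_copy" are assumptions for illustration:
```cpp
#include <cstdint>
#include <vector>
#include "lite/global.h"
#include "lite/network.h"

// Hedged sketch: a "decryption" that simply copies the model bytes unchanged.
void register_plain_copy_decryption() {
    std::vector<uint8_t> key(16, 0);  // illustrative key
    lite::register_decryption_and_key(
            "plain_copy",
            [](const void* model, size_t size,
               const std::vector<uint8_t>& /*key*/) -> std::vector<uint8_t> {
                const uint8_t* begin = static_cast<const uint8_t*>(model);
                return std::vector<uint8_t>(begin, begin + size);
            },
            key);
}

// When loading such a model, select the algorithm in the Config:
//   config.bare_model_cryption_name = "plain_copy";
```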
## Asynchronous execution mode
* **Implemented in basic.cpp, in the function async_forward.**
* Registering an asynchronous callback switches the Network's forward to asynchronous execution
  mode. Asynchronous mode is currently only supported on CPU and on CUDA 10.0 and above. During
  inference in this mode the main thread can do other work while the worker thread is computing,
  avoiding long waits, although there is no benefit on some single-core processors. A sketch
  follows.
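A minimal sketch mirroring async_forward in basic.cpp (input handling is left out for brevity):
```cpp
#include <memory>
#include <string>
#include "lite/network.h"

// Minimal sketch: register an async callback, then poll a flag while doing other work.
void async_forward_sketch(const std::string& model_path) {
    lite::Config config;
    config.options.var_sanity_check_first_run = false;  // as in the example
    auto network = std::make_shared<lite::Network>(config);
    network->load_model(model_path);
    // ... fill the input tensor as in the basic example ...

    volatile bool finished = false;
    network->set_async_callback([&finished]() { finished = true; });

    network->forward();  // returns immediately in asynchronous mode
    while (!finished) {
        // the main thread is free to do other work here
    }
    // outputs can be read once the callback has fired
}
```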
## Pure C examples
* **Implemented in lite_c_interface.cpp, in the functions basic_c_interface,
  device_io_c_interface and async_c_interface.**
* Lite wraps the C++ interface and exposes a pure C interface. Users who do not depend on lite at
  the source level should integrate through the pure C interface.
* Every pure C interface returns an int; if it is non-zero, an error occurred and
  LITE_get_last_error must be called to get the error message.
* Every pure C get function requires the caller to define the corresponding object first and pass a
  pointer to it into the interface; lite writes the result to the address that pointer refers to.
  A minimal sketch of the C flow follows.
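A minimal sketch of the pure C flow (mirrors basic_c_interface; the input name "data" and the
reduced error handling are illustrative):
```C
#include <stdio.h>
#include "lite-c/global_c.h"
#include "lite-c/network_c.h"
#include "lite-c/tensor_c.h"

/* Minimal sketch: every call returns an int, and non-zero means an error occurred. */
int run_with_c_api(const char* model_path) {
    LiteNetwork network;
    if (LITE_make_network(&network, *default_config(), *default_network_io()) ||
        LITE_load_model_from_path(network, model_path)) {
        printf("lite error: %s\n", LITE_get_last_error());
        return -1;
    }

    LiteTensor input;
    void* input_ptr = NULL;
    size_t input_bytes = 0;
    LITE_get_io_tensor(network, "data", LITE_IO, &input);
    LITE_get_tensor_memory(input, &input_ptr);
    LITE_get_tensor_total_size_in_byte(input, &input_bytes);
    /* ... copy input_bytes of user data into input_ptr ... */

    LITE_forward(network);
    LITE_wait(network);
    return 0;
}
```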
| @@ -0,0 +1,370 @@ | |||
| /** | |||
| * \file example/basic.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include <thread> | |||
| #include "../example.h" | |||
| #if LITE_BUILD_WITH_MGE | |||
| #include <cstdio> | |||
| #include "misc.h" | |||
| using namespace lite; | |||
| using namespace example; | |||
| namespace { | |||
| void output_info(std::shared_ptr<Network> network, size_t output_size) { | |||
| for (size_t index = 0; index < output_size; index++) { | |||
| printf("output[%zu] names %s \n", index, | |||
| network->get_all_output_name()[index].c_str()); | |||
| std::shared_ptr<Tensor> output_tensor = | |||
| network->get_output_tensor(index); | |||
| size_t ndim = output_tensor->get_layout().ndim; | |||
| for (size_t i = 0; i < ndim; i++) { | |||
| printf("output[%zu] tensor.shape[%zu] %zu \n", index, i, | |||
| output_tensor->get_layout().shapes[i]); | |||
| } | |||
| } | |||
| } | |||
| void output_data_info(std::shared_ptr<Network> network, size_t output_size) { | |||
| for (size_t index = 0; index < output_size; index++) { | |||
| auto output_tensor = network->get_output_tensor(index); | |||
| void* out_data = output_tensor->get_memory_ptr(); | |||
| size_t out_length = output_tensor->get_tensor_total_size_in_byte() / | |||
| output_tensor->get_layout().get_elem_size(); | |||
| LiteDataType dtype = output_tensor->get_layout().data_type; | |||
| float max = -1000.0f; | |||
| float min = 1000.0f; | |||
| int max_idx = 0; | |||
| int min_idx = 0; | |||
| float sum = 0.0f; | |||
| #define cb(_dtype, _real_dtype) \ | |||
| case LiteDataType::_dtype: { \ | |||
| for (size_t i = 0; i < out_length; i++) { \ | |||
| _real_dtype data = static_cast<_real_dtype*>(out_data)[i]; \ | |||
| sum += data; \ | |||
| if (max < data) { \ | |||
| max = data; \ | |||
| max_idx = i; \ | |||
| } \ | |||
| if (min > data) { \ | |||
| min = data; \ | |||
| min_idx = i; \ | |||
| } \ | |||
| } \ | |||
| } break; | |||
| switch (dtype) { | |||
| cb(LITE_FLOAT, float); | |||
| cb(LITE_INT, int); | |||
| cb(LITE_INT8, int8_t); | |||
| cb(LITE_UINT8, uint8_t); | |||
| default: | |||
| printf("unknow datatype"); | |||
| } | |||
| printf("output_length %zu index %zu max=%e , max idx=%d, min=%e , min_idx=%d, sum=%e\n", | |||
| out_length, index, max, max_idx, min, min_idx, sum); | |||
| } | |||
| #undef cb | |||
| } | |||
| } // namespace | |||
| #if LITE_WITH_CUDA | |||
| bool lite::example::load_from_path_run_cuda(const Args& args) { | |||
| std::string network_path = args.model_path; | |||
| std::string input_path = args.input_path; | |||
| set_log_level(LiteLogLevel::DEBUG); | |||
| //! config the network running in CUDA device | |||
| lite::Config config{false, -1, LiteDeviceType::LITE_CUDA}; | |||
| //! set NetworkIO | |||
| NetworkIO network_io; | |||
| std::string input_name = "img0_comp_fullface"; | |||
| bool is_host = false; | |||
| IO device_input{input_name, is_host}; | |||
| network_io.inputs.push_back(device_input); | |||
| //! create and load the network | |||
| std::shared_ptr<Network> network = | |||
| std::make_shared<Network>(config, network_io); | |||
| network->load_model(network_path); | |||
| std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0); | |||
| Layout input_layout = input_tensor->get_layout(); | |||
| //! read data from numpy data file | |||
| auto src_tensor = parse_npy(input_path); | |||
| //! malloc the device memory | |||
| auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout); | |||
| //! copy to the device memory | |||
| tensor_device.copy_from(*src_tensor); | |||
    //! Now the device memory is filled with user input data, set it to the
| //! input tensor | |||
| input_tensor->reset(tensor_device.get_memory_ptr(), input_layout); | |||
| //! forward | |||
| { | |||
| lite::Timer ltimer("warmup"); | |||
| network->forward(); | |||
| network->wait(); | |||
| ltimer.print_used_time(0); | |||
| } | |||
| lite::Timer ltimer("forward_iter"); | |||
| for (int i = 0; i < 10; i++) { | |||
| ltimer.reset_start(); | |||
| network->forward(); | |||
| network->wait(); | |||
| ltimer.print_used_time(i); | |||
| } | |||
| //! get the output data or read tensor set in network_in | |||
| size_t output_size = network->get_all_output_name().size(); | |||
| output_info(network, output_size); | |||
| output_data_info(network, output_size); | |||
| return true; | |||
| } | |||
| #endif | |||
| bool lite::example::basic_load_from_path(const Args& args) { | |||
| set_log_level(LiteLogLevel::DEBUG); | |||
| std::string network_path = args.model_path; | |||
| std::string input_path = args.input_path; | |||
| //! create and load the network | |||
| std::shared_ptr<Network> network = std::make_shared<Network>(); | |||
| network->load_model(network_path); | |||
| //! set input data to input tensor | |||
| std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0); | |||
| auto layout = input_tensor->get_layout(); | |||
| for (size_t i = 0; i < layout.ndim; i++) { | |||
| printf("model input shape[%zu]=%zu \n", i, layout.shapes[i]); | |||
| } | |||
| //! copy or forward data to network | |||
| size_t length = input_tensor->get_tensor_total_size_in_byte(); | |||
| void* dst_ptr = input_tensor->get_memory_ptr(); | |||
| auto src_tensor = parse_npy(input_path); | |||
| auto layout0 = src_tensor->get_layout(); | |||
| for (size_t i = 0; i < layout0.ndim; i++) { | |||
| printf("src shape[%zu]=%zu \n", i, layout0.shapes[i]); | |||
| } | |||
| void* src = src_tensor->get_memory_ptr(); | |||
| memcpy(dst_ptr, src, length); | |||
| //! forward | |||
| { | |||
| lite::Timer ltimer("warmup"); | |||
| network->forward(); | |||
| network->wait(); | |||
| ltimer.print_used_time(0); | |||
| } | |||
| lite::Timer ltimer("forward_iter"); | |||
| for (int i = 0; i < 10; i++) { | |||
| network->forward(); | |||
| network->wait(); | |||
| ltimer.print_used_time(i); | |||
| } | |||
| //! forward | |||
| { | |||
| lite::Timer ltimer("warmup"); | |||
| network->forward(); | |||
| network->wait(); | |||
| ltimer.print_used_time(0); | |||
| } | |||
| for (int i = 0; i < 10; i++) { | |||
| ltimer.reset_start(); | |||
| network->forward(); | |||
| network->wait(); | |||
| ltimer.print_used_time(i); | |||
| } | |||
| //! get the output data or read tensor set in network_in | |||
| size_t output_size = network->get_all_output_name().size(); | |||
| output_info(network, output_size); | |||
| output_data_info(network, output_size); | |||
| return true; | |||
| } | |||
| bool lite::example::basic_load_from_path_with_loader(const Args& args) { | |||
| set_log_level(LiteLogLevel::DEBUG); | |||
| lite::set_loader_lib_path(args.loader_path); | |||
| std::string network_path = args.model_path; | |||
| std::string input_path = args.input_path; | |||
| //! create and load the network | |||
| std::shared_ptr<Network> network = std::make_shared<Network>(); | |||
| network->load_model(network_path); | |||
| //! set input data to input tensor | |||
| std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0); | |||
| auto input_layout = input_tensor->get_layout(); | |||
| //! copy or forward data to network | |||
| auto src_tensor = parse_npy(input_path); | |||
| auto src_layout = src_tensor->get_layout(); | |||
| if (src_layout.ndim != input_layout.ndim) { | |||
| printf("src dim is not equal model input dim\n"); | |||
| } | |||
| //! pay attention the input shape can change | |||
| for (size_t i = 0; i < input_layout.ndim; i++) { | |||
| if (input_layout.shapes[i] != src_layout.shapes[i]) { | |||
| printf("src shape not equal input shape"); | |||
| } | |||
| } | |||
| input_tensor->set_layout(src_tensor->get_layout()); | |||
| //! reset or forward data to network | |||
| input_tensor->reset(src_tensor->get_memory_ptr(), src_tensor->get_layout()); | |||
| //! forward | |||
| network->forward(); | |||
| network->wait(); | |||
| //! forward | |||
| { | |||
| lite::Timer ltimer("warmup"); | |||
| network->forward(); | |||
| network->wait(); | |||
| ltimer.print_used_time(0); | |||
| } | |||
| lite::Timer ltimer("forward_iter"); | |||
| for (int i = 0; i < 10; i++) { | |||
| ltimer.reset_start(); | |||
| network->forward(); | |||
| network->wait(); | |||
| ltimer.print_used_time(i); | |||
| } | |||
| //! get the output data or read tensor set in network_in | |||
| size_t output_size = network->get_all_output_name().size(); | |||
| output_info(network, output_size); | |||
| output_data_info(network, output_size); | |||
| return true; | |||
| } | |||
| bool lite::example::basic_load_from_memory(const Args& args) { | |||
| std::string network_path = args.model_path; | |||
| std::string input_path = args.input_path; | |||
| //! create and load the network | |||
| std::shared_ptr<Network> network = std::make_shared<Network>(); | |||
| FILE* fin = fopen(network_path.c_str(), "rb"); | |||
| if (!fin) { | |||
| printf("failed to open %s.", network_path.c_str()); | |||
| } | |||
| fseek(fin, 0, SEEK_END); | |||
| size_t size = ftell(fin); | |||
| fseek(fin, 0, SEEK_SET); | |||
| void* ptr = malloc(size); | |||
| std::shared_ptr<void> buf{ptr, ::free}; | |||
| auto len = fread(buf.get(), 1, size, fin); | |||
| if (len < 1) { | |||
| printf("read file failed.\n"); | |||
| } | |||
| fclose(fin); | |||
| network->load_model(buf.get(), size); | |||
| //! set input data to input tensor | |||
| std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0); | |||
| //! copy or forward data to network | |||
| size_t length = input_tensor->get_tensor_total_size_in_byte(); | |||
| void* dst_ptr = input_tensor->get_memory_ptr(); | |||
| auto src_tensor = parse_npy(input_path); | |||
| void* src = src_tensor->get_memory_ptr(); | |||
| memcpy(dst_ptr, src, length); | |||
| //! forward | |||
| network->forward(); | |||
| network->wait(); | |||
| //! get the output data or read tensor set in network_in | |||
| std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
| void* out_data = output_tensor->get_memory_ptr(); | |||
| size_t out_length = output_tensor->get_tensor_total_size_in_byte() / | |||
| output_tensor->get_layout().get_elem_size(); | |||
| printf("length=%zu\n", length); | |||
| float max = -1.0f; | |||
| float sum = 0.0f; | |||
| for (size_t i = 0; i < out_length; i++) { | |||
| float data = static_cast<float*>(out_data)[i]; | |||
| sum += data; | |||
| if (max < data) | |||
| max = data; | |||
| } | |||
| printf("max=%e, sum=%e\n", max, sum); | |||
| return true; | |||
| } | |||
| bool lite::example::async_forward(const Args& args) { | |||
| std::string network_path = args.model_path; | |||
| std::string input_path = args.input_path; | |||
| Config config; | |||
| config.options.var_sanity_check_first_run = false; | |||
| //! create and load the network | |||
| std::shared_ptr<Network> network = std::make_shared<Network>(config); | |||
| network->load_model(network_path); | |||
| //! set input data to input tensor | |||
| std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0); | |||
| //! copy or forward data to network | |||
| size_t length = input_tensor->get_tensor_total_size_in_byte(); | |||
| void* dst_ptr = input_tensor->get_memory_ptr(); | |||
| auto src_tensor = parse_npy(input_path); | |||
| void* src = src_tensor->get_memory_ptr(); | |||
| memcpy(dst_ptr, src, length); | |||
| //! set async mode and callback | |||
| volatile bool finished = false; | |||
| network->set_async_callback([&finished]() { | |||
| #if !__DEPLOY_ON_XP_SP2__ | |||
| std::cout << "worker thread_id:" << std::this_thread::get_id() | |||
| << std::endl; | |||
| #endif | |||
| finished = true; | |||
| }); | |||
| #if !__DEPLOY_ON_XP_SP2__ | |||
| std::cout << "out thread_id:" << std::this_thread::get_id() << std::endl; | |||
| #endif | |||
| //! forward | |||
| network->forward(); | |||
| size_t count = 0; | |||
| while (finished == false) { | |||
| count++; | |||
| } | |||
| printf("Forward finish, count is %zu\n", count); | |||
| //! get the output data or read tensor set in network_in | |||
| std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
| void* out_data = output_tensor->get_memory_ptr(); | |||
| size_t out_length = output_tensor->get_tensor_total_size_in_byte() / | |||
| output_tensor->get_layout().get_elem_size(); | |||
| printf("length=%zu\n", length); | |||
| float max = -1.0f; | |||
| float sum = 0.0f; | |||
| for (size_t i = 0; i < out_length; i++) { | |||
| float data = static_cast<float*>(out_data)[i]; | |||
| sum += data; | |||
| if (max < data) | |||
| max = data; | |||
| } | |||
| printf("max=%e, sum=%e\n", max, sum); | |||
| return true; | |||
| } | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,69 @@ | |||
| /** | |||
| * \file example/cpu_affinity.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "../example.h" | |||
| #if LITE_BUILD_WITH_MGE | |||
| using namespace lite; | |||
| using namespace example; | |||
| bool lite::example::cpu_affinity(const Args& args) { | |||
| std::string network_path = args.model_path; | |||
| std::string input_path = args.input_path; | |||
| //! create and load the network | |||
| std::shared_ptr<Network> network = std::make_shared<Network>(); | |||
    //! run with multi threads
| Runtime::set_cpu_threads_number(network, 4); | |||
| network->load_model(network_path); | |||
| std::vector<int> core_ids = {0, 1, 2, 3}; | |||
| auto affinity = [core_ids](int id) { | |||
| //! add user define affinity function | |||
| set_cpu_affinity({core_ids[id]}); | |||
| printf("set thread id = %d with the affinity of core %d.\n", id, | |||
| core_ids[id]); | |||
| }; | |||
| Runtime::set_runtime_thread_affinity(network, affinity); | |||
| //! set input data to input tensor | |||
| std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0); | |||
| //! copy or forward data to network | |||
| size_t length = input_tensor->get_tensor_total_size_in_byte(); | |||
| void* dst_ptr = input_tensor->get_memory_ptr(); | |||
| auto src_tensor = parse_npy(input_path); | |||
| void* src = src_tensor->get_memory_ptr(); | |||
| memcpy(dst_ptr, src, length); | |||
| //! forward | |||
| network->forward(); | |||
| network->wait(); | |||
| //! get the output data or read tensor set in network_in | |||
| std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
| void* out_data = output_tensor->get_memory_ptr(); | |||
| size_t out_length = output_tensor->get_tensor_total_size_in_byte() / | |||
| output_tensor->get_layout().get_elem_size(); | |||
| printf("length=%zu\n", length); | |||
| float max = -1.0f; | |||
| float sum = 0.0f; | |||
| for (size_t i = 0; i < out_length; i++) { | |||
| float data = static_cast<float*>(out_data)[i]; | |||
| sum += data; | |||
| if (max < data) | |||
| max = data; | |||
| } | |||
| printf("max=%e, sum=%e\n", max, sum); | |||
| return true; | |||
| } | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,189 @@ | |||
| /** | |||
| * \file example/device_io.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include <thread> | |||
| #include "../example.h" | |||
| #if LITE_BUILD_WITH_MGE | |||
| using namespace lite; | |||
| using namespace example; | |||
| #if LITE_WITH_CUDA | |||
| bool lite::example::device_input(const Args& args) { | |||
| std::string network_path = args.model_path; | |||
| std::string input_path = args.input_path; | |||
| //! config the network running in CUDA device | |||
    lite::Config config{false, -1, LiteDeviceType::LITE_CUDA};
| //! set NetworkIO | |||
| NetworkIO network_io; | |||
| std::string input_name = "data"; | |||
| bool is_host = false; | |||
| IO device_input{input_name, is_host}; | |||
| network_io.inputs.push_back(device_input); | |||
| //! create and load the network | |||
| std::shared_ptr<Network> network = | |||
| std::make_shared<Network>(config, network_io); | |||
| network->load_model(network_path); | |||
| std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0); | |||
| Layout input_layout = input_tensor->get_layout(); | |||
| //! read data from numpy data file | |||
| auto src_tensor = parse_npy(input_path); | |||
| //! malloc the device memory | |||
| auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout); | |||
| //! copy to the device memory | |||
| tensor_device.copy_from(*src_tensor); | |||
| //! Now the device memory if filled with user input data, set it to the | |||
| //! input tensor | |||
| input_tensor->reset(tensor_device.get_memory_ptr(), input_layout); | |||
| //! forward | |||
| network->forward(); | |||
| network->wait(); | |||
| //! get the output data or read tensor set in network_in | |||
| std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
| void* out_data = output_tensor->get_memory_ptr(); | |||
| size_t out_length = output_tensor->get_tensor_total_size_in_byte() / | |||
| output_tensor->get_layout().get_elem_size(); | |||
| float max = -1.0f; | |||
| float sum = 0.0f; | |||
| for (size_t i = 0; i < out_length; i++) { | |||
| float data = static_cast<float*>(out_data)[i]; | |||
| sum += data; | |||
| if (max < data) | |||
| max = data; | |||
| } | |||
| printf("max=%e, sum=%e\n", max, sum); | |||
| return true; | |||
| } | |||
| bool lite::example::device_input_output(const Args& args) { | |||
| std::string network_path = args.model_path; | |||
| std::string input_path = args.input_path; | |||
| //! config the network running in CUDA device | |||
    lite::Config config{false, -1, LiteDeviceType::LITE_CUDA};
| //! set NetworkIO include input and output | |||
| NetworkIO network_io; | |||
| std::string input_name = "data"; | |||
| std::string output_name = "TRUE_DIV(EXP[12065],reduce0[12067])[12077]"; | |||
| bool is_host = false; | |||
| IO device_input{input_name, is_host}; | |||
| IO device_output{output_name, is_host}; | |||
| network_io.inputs.push_back(device_input); | |||
| network_io.outputs.push_back(device_output); | |||
| //! create and load the network | |||
| std::shared_ptr<Network> network = | |||
| std::make_shared<Network>(config, network_io); | |||
| network->load_model(network_path); | |||
| std::shared_ptr<Tensor> input_tensor_device = network->get_input_tensor(0); | |||
| Layout input_layout = input_tensor_device->get_layout(); | |||
| //! read data from numpy data file | |||
| auto src_tensor = parse_npy(input_path); | |||
| //! malloc the device memory | |||
| auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout); | |||
| //! copy to the device memory | |||
| tensor_device.copy_from(*src_tensor); | |||
| //! Now the device memory is filled with user input data, set it to the | |||
| //! input tensor | |||
| input_tensor_device->reset(tensor_device.get_memory_ptr(), input_layout); | |||
| //! forward | |||
| network->forward(); | |||
| network->wait(); | |||
| //! output is in device, should copy it to host | |||
| std::shared_ptr<Tensor> output_tensor_device = | |||
| network->get_io_tensor(output_name); | |||
| auto output_tensor = std::make_shared<Tensor>(); | |||
| output_tensor->copy_from(*output_tensor_device); | |||
| //! get the output data or read tensor set in network_in | |||
| void* out_data = output_tensor->get_memory_ptr(); | |||
| size_t out_length = output_tensor->get_tensor_total_size_in_byte() / | |||
| output_tensor->get_layout().get_elem_size(); | |||
| float max = -1.0f; | |||
| float sum = 0.0f; | |||
| for (size_t i = 0; i < out_length; i++) { | |||
| float data = static_cast<float*>(out_data)[i]; | |||
| sum += data; | |||
| if (max < data) | |||
| max = data; | |||
| } | |||
| printf("max=%e, sum=%e\n", max, sum); | |||
| return true; | |||
| } | |||
| bool lite::example::pinned_host_input(const Args& args) { | |||
| std::string network_path = args.model_path; | |||
| std::string input_path = args.input_path; | |||
| //! config the network running in CUDA device | |||
    lite::Config config{false, -1, LiteDeviceType::LITE_CUDA};
| //! create and load the network | |||
| std::shared_ptr<Network> network = std::make_shared<Network>(config); | |||
| network->load_model(network_path); | |||
| std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0); | |||
| Layout input_layout = input_tensor->get_layout(); | |||
| //! read data from numpy data file | |||
| auto src_tensor = parse_npy(input_path); | |||
| //! malloc the pinned host memory | |||
| bool is_pinned_host = true; | |||
| auto tensor_pinned_input = | |||
| Tensor(LiteDeviceType::LITE_CUDA, input_layout, is_pinned_host); | |||
| //! copy to the pinned memory | |||
| tensor_pinned_input.copy_from(*src_tensor); | |||
| //! set the pinned host memory to the network as input | |||
| input_tensor->reset(tensor_pinned_input.get_memory_ptr(), input_layout); | |||
| //! forward | |||
| network->forward(); | |||
| network->wait(); | |||
| //! get the output data or read tensor set in network_in | |||
| std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
| void* out_data = output_tensor->get_memory_ptr(); | |||
| size_t out_length = output_tensor->get_tensor_total_size_in_byte() / | |||
| output_tensor->get_layout().get_elem_size(); | |||
| float max = -1.0f; | |||
| float sum = 0.0f; | |||
| for (size_t i = 0; i < out_length; i++) { | |||
| float data = static_cast<float*>(out_data)[i]; | |||
| sum += data; | |||
| if (max < data) | |||
| max = data; | |||
| } | |||
| printf("max=%e, sum=%e\n", max, sum); | |||
| return true; | |||
| } | |||
| #endif | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,224 @@ | |||
| /** | |||
 * \file example/lite_c_interface.cpp
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "../example.h" | |||
| #include "misc.h" | |||
| #if LITE_BUILD_WITH_MGE | |||
| #include "lite-c/global_c.h" | |||
| #include "lite-c/network_c.h" | |||
| #include "lite-c/tensor_c.h" | |||
| #include <thread> | |||
| #define LITE_CAPI_CHECK(_expr) \ | |||
| do { \ | |||
| int _ret = (_expr); \ | |||
| if (_ret) { \ | |||
| LITE_THROW(LITE_get_last_error()); \ | |||
| } \ | |||
| } while (0) | |||
| bool basic_c_interface(const lite::example::Args& args) { | |||
| std::string network_path = args.model_path; | |||
| std::string input_path = args.input_path; | |||
| //! read input data to lite::tensor | |||
| auto src_tensor = lite::example::parse_npy(input_path); | |||
| void* src_ptr = src_tensor->get_memory_ptr(); | |||
| //! create and load the network | |||
| LiteNetwork c_network; | |||
| LITE_CAPI_CHECK( | |||
| LITE_make_network(&c_network, *default_config(), *default_network_io())); | |||
| LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, network_path.c_str())); | |||
| //! set input data to input tensor | |||
| LiteTensor c_input_tensor; | |||
| LITE_CAPI_CHECK( | |||
| LITE_get_io_tensor(c_network, "data", LITE_IO, &c_input_tensor)); | |||
| void* dst_ptr; | |||
| size_t length_in_byte; | |||
| LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_input_tensor, | |||
| &length_in_byte)); | |||
| LITE_CAPI_CHECK(LITE_get_tensor_memory(c_input_tensor, &dst_ptr)); | |||
| //! copy or forward data to network | |||
| memcpy(dst_ptr, src_ptr, length_in_byte); | |||
| //! forward | |||
| LITE_CAPI_CHECK(LITE_forward(c_network)); | |||
| LITE_CAPI_CHECK(LITE_wait(c_network)); | |||
| //! get the output data or read tensor data | |||
| const char* output_name; | |||
| LiteTensor c_output_tensor; | |||
| //! get the first output tensor name | |||
| LITE_CAPI_CHECK(LITE_get_output_name(c_network, 0, &output_name)); | |||
| LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_name, LITE_IO, | |||
| &c_output_tensor)); | |||
| void* output_ptr; | |||
| size_t length_output_in_byte; | |||
| LITE_CAPI_CHECK(LITE_get_tensor_memory(c_output_tensor, &output_ptr)); | |||
| LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_output_tensor, | |||
| &length_output_in_byte)); | |||
| size_t out_length = length_output_in_byte / sizeof(float); | |||
| printf("length=%zu\n", out_length); | |||
| float max = -1.0f; | |||
| float sum = 0.0f; | |||
| for (size_t i = 0; i < out_length; i++) { | |||
| float data = static_cast<float*>(output_ptr)[i]; | |||
| sum += data; | |||
| if (max < data) | |||
| max = data; | |||
| } | |||
| printf("max=%e, sum=%e\n", max, sum); | |||
| return true; | |||
| } | |||
| bool device_io_c_interface(const lite::example::Args& args) { | |||
| std::string network_path = args.model_path; | |||
| std::string input_path = args.input_path; | |||
| //! read input data to lite::tensor | |||
| auto src_tensor = lite::example::parse_npy(input_path); | |||
| void* src_ptr = src_tensor->get_memory_ptr(); | |||
| size_t length_read_in = src_tensor->get_tensor_total_size_in_byte(); | |||
| //! create and load the network | |||
| LiteNetwork c_network; | |||
| LITE_CAPI_CHECK( | |||
| LITE_make_network(&c_network, *default_config(), *default_network_io())); | |||
| LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, network_path.c_str())); | |||
| //! set input data to input tensor | |||
| LiteTensor c_input_tensor; | |||
| size_t length_tensor_in; | |||
| LITE_CAPI_CHECK( | |||
| LITE_get_io_tensor(c_network, "data", LITE_IO, &c_input_tensor)); | |||
| LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_input_tensor, | |||
| &length_tensor_in)); | |||
| if (length_read_in != length_tensor_in) { | |||
| LITE_THROW("The input data size is not match the network input tensro " | |||
| "size,\n"); | |||
| } | |||
| LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor, src_ptr, | |||
| length_tensor_in)); | |||
| //! reset the output tensor memory with user allocated memory | |||
| size_t out_length = 1000; | |||
| LiteLayout output_layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}; | |||
| std::shared_ptr<float> ptr(new float[out_length], | |||
| [](float* ptr) { delete[] ptr; }); | |||
| const char* output_name; | |||
| LiteTensor c_output_tensor; | |||
| LITE_CAPI_CHECK(LITE_get_output_name(c_network, 0, &output_name)); | |||
| LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_name, LITE_IO, | |||
| &c_output_tensor)); | |||
| LITE_CAPI_CHECK( | |||
| LITE_reset_tensor(c_output_tensor, output_layout, ptr.get())); | |||
| //! forward | |||
| LITE_CAPI_CHECK(LITE_forward(c_network)); | |||
| LITE_CAPI_CHECK(LITE_wait(c_network)); | |||
| printf("length=%zu\n", out_length); | |||
| float max = -1.0f; | |||
| float sum = 0.0f; | |||
| void* out_data = ptr.get(); | |||
| for (size_t i = 0; i < out_length; i++) { | |||
| float data = static_cast<float*>(out_data)[i]; | |||
| sum += data; | |||
| if (max < data) | |||
| max = data; | |||
| } | |||
| printf("max=%e, sum=%e\n", max, sum); | |||
| return true; | |||
| } | |||
| namespace { | |||
| volatile bool finished = false; | |||
| int async_callback(void) { | |||
| #if !__DEPLOY_ON_XP_SP2__ | |||
| std::cout << "worker thread_id:" << std::this_thread::get_id() << std::endl; | |||
| #endif | |||
| finished = true; | |||
| return 0; | |||
| } | |||
| } // namespace | |||
| bool async_c_interface(const lite::example::Args& args) { | |||
| std::string network_path = args.model_path; | |||
| std::string input_path = args.input_path; | |||
| //! read input data to lite::tensor | |||
| auto src_tensor = lite::example::parse_npy(input_path); | |||
| void* src_ptr = src_tensor->get_memory_ptr(); | |||
| LiteNetwork c_network; | |||
| LiteConfig config = *default_config(); | |||
| config.options.var_sanity_check_first_run = false; | |||
| LITE_CAPI_CHECK(LITE_make_network(&c_network, config, *default_network_io())); | |||
| LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, network_path.c_str())); | |||
| //! set input data to input tensor | |||
| LiteTensor c_input_tensor; | |||
| size_t length_tensor_in; | |||
| LITE_CAPI_CHECK( | |||
| LITE_get_io_tensor(c_network, "data", LITE_IO, &c_input_tensor)); | |||
| LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_input_tensor, | |||
| &length_tensor_in)); | |||
| LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor, src_ptr, | |||
| length_tensor_in)); | |||
| #if !__DEPLOY_ON_XP_SP2__ | |||
| std::cout << "user thread_id:" << std::this_thread::get_id() << std::endl; | |||
| #endif | |||
| LITE_CAPI_CHECK(LITE_set_async_callback(c_network, async_callback)); | |||
| //! forward | |||
| LITE_CAPI_CHECK(LITE_forward(c_network)); | |||
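| //! LITE_forward returns before the computation finishes once an async | |||
| //! callback is set; the busy-wait below only demonstrates that the caller | |||
| //! keeps running until async_callback flips the volatile flag. | |||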
| size_t count = 0; | |||
| while (finished == false) { | |||
| count++; | |||
| } | |||
| printf("The count is %zu\n", count); | |||
| finished = false; | |||
| //! get the output data or read tensor data | |||
| const char* output_name; | |||
| LiteTensor c_output_tensor; | |||
| //! get the first output tensor name | |||
| LITE_CAPI_CHECK(LITE_get_output_name(c_network, 0, &output_name)); | |||
| LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_name, LITE_IO, | |||
| &c_output_tensor)); | |||
| void* output_ptr; | |||
| size_t length_output_in_byte; | |||
| LITE_CAPI_CHECK(LITE_get_tensor_memory(c_output_tensor, &output_ptr)); | |||
| LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_output_tensor, | |||
| &length_output_in_byte)); | |||
| size_t out_length = length_output_in_byte / sizeof(float); | |||
| printf("length=%zu\n", out_length); | |||
| float max = -1.0f; | |||
| float sum = 0.0f; | |||
| for (size_t i = 0; i < out_length; i++) { | |||
| float data = static_cast<float*>(output_ptr)[i]; | |||
| sum += data; | |||
| if (max < data) | |||
| max = data; | |||
| } | |||
| printf("max=%e, sum=%e\n", max, sum); | |||
| return true; | |||
| } | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,78 @@ | |||
| /** | |||
| * \file example/network_share_weights.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "../example.h" | |||
| #if LITE_BUILD_WITH_MGE | |||
| using namespace lite; | |||
| using namespace example; | |||
| bool lite::example::network_share_same_weights(const Args& args) { | |||
| std::string network_path = args.model_path; | |||
| std::string input_path = args.input_path; | |||
| //! create and load the network | |||
| std::shared_ptr<Network> network = std::make_shared<Network>(); | |||
| network->load_model(network_path); | |||
| //! load a new network from the created network and share the same weights, | |||
| Config config_new; | |||
| config_new.options.const_shape = true; | |||
| NetworkIO network_io_new; | |||
| std::shared_ptr<Network> weight_shared_network = | |||
| std::make_shared<Network>(config_new, network_io_new); | |||
| Runtime::shared_weight_with_network(weight_shared_network, network); | |||
| //! set input data to input tensor | |||
| std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0); | |||
| void* dst_ptr = input_tensor->get_memory_ptr(); | |||
| std::shared_ptr<Tensor> input_tensor2 = | |||
| weight_shared_network->get_input_tensor(0); | |||
| void* dst_ptr2 = input_tensor2->get_memory_ptr(); | |||
| //! copy or forward data to network | |||
| size_t length = input_tensor->get_tensor_total_size_in_byte(); | |||
| auto src_tensor = parse_npy(input_path); | |||
| void* src = src_tensor->get_memory_ptr(); | |||
| memcpy(dst_ptr, src, length); | |||
| memcpy(dst_ptr2, src, length); | |||
| //! forward | |||
| network->forward(); | |||
| network->wait(); | |||
| weight_shared_network->forward(); | |||
| weight_shared_network->wait(); | |||
| //! get the output data or read tensor set in network_in | |||
| std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
| std::shared_ptr<Tensor> output_tensor2 = | |||
| weight_shared_network->get_output_tensor(0); | |||
| void* out_data = output_tensor->get_memory_ptr(); | |||
| void* out_data2 = output_tensor2->get_memory_ptr(); | |||
| size_t out_length = output_tensor->get_tensor_total_size_in_byte() / | |||
| output_tensor->get_layout().get_elem_size(); | |||
| printf("length=%zu\n", length); | |||
| float max = -1.0f; | |||
| float sum = 0.0f; | |||
| for (size_t i = 0; i < out_length; i++) { | |||
| float data = static_cast<float*>(out_data)[i]; | |||
| float data2 = static_cast<float*>(out_data2)[i]; | |||
| if (data != data2) { | |||
| printf("the result between the origin network and weight share " | |||
| "netwrok is different.\n"); | |||
| } | |||
| sum += data; | |||
| if (max < data) | |||
| max = data; | |||
| } | |||
| printf("max=%e, sum=%e\n", max, sum); | |||
| return true; | |||
| } | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,95 @@ | |||
| /** | |||
| * \file example/reset_io.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "../example.h" | |||
| #if LITE_BUILD_WITH_MGE | |||
| using namespace lite; | |||
| using namespace example; | |||
| bool lite::example::reset_input(const Args& args) { | |||
| std::string network_path = args.model_path; | |||
| std::string input_path = args.input_path; | |||
| lite::Config config; | |||
| //! create and load the network | |||
| std::shared_ptr<Network> network = std::make_shared<Network>(config); | |||
| network->load_model(network_path); | |||
| //! set input data to input tensor | |||
| std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0); | |||
| auto layout = input_tensor->get_layout(); | |||
| auto src_tensor = parse_npy(input_path); | |||
| void* src = src_tensor->get_memory_ptr(); | |||
| input_tensor->reset(src, layout); | |||
| //! forward | |||
| network->forward(); | |||
| network->wait(); | |||
| //! 6. get the output data or read tensor set in network_in | |||
| std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
| void* out_data = output_tensor->get_memory_ptr(); | |||
| size_t out_length = output_tensor->get_tensor_total_size_in_byte() / | |||
| output_tensor->get_layout().get_elem_size(); | |||
| float max = -1.0f; | |||
| float sum = 0.0f; | |||
| for (size_t i = 0; i < out_length; i++) { | |||
| float data = static_cast<float*>(out_data)[i]; | |||
| sum += data; | |||
| if (max < data) | |||
| max = data; | |||
| } | |||
| printf("max=%e, sum=%e\n", max, sum); | |||
| return true; | |||
| } | |||
| bool lite::example::reset_input_output(const Args& args) { | |||
| std::string network_path = args.model_path; | |||
| std::string input_path = args.input_path; | |||
| lite::Config config; | |||
| //! create and load the network | |||
| std::shared_ptr<Network> network = std::make_shared<Network>(config); | |||
| network->load_model(network_path); | |||
| //! set input data to input tensor | |||
| std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0); | |||
| auto layout = input_tensor->get_layout(); | |||
| auto src_tensor = parse_npy(input_path); | |||
| void* src = src_tensor->get_memory_ptr(); | |||
| input_tensor->reset(src, layout); | |||
| //! set output ptr to store the network output | |||
| std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
| auto result_tensor = std::make_shared<Tensor>( | |||
| LiteDeviceType::LITE_CPU, | |||
| Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); | |||
| void* out_data = result_tensor->get_memory_ptr(); | |||
| output_tensor->reset(out_data, result_tensor->get_layout()); | |||
| network->forward(); | |||
| network->wait(); | |||
| float max = -1.0f; | |||
| float sum = 0.0f; | |||
| for (size_t i = 0; i < 1000; i++) { | |||
| float data = static_cast<float*>(out_data)[i]; | |||
| sum += data; | |||
| if (max < data) | |||
| max = data; | |||
| } | |||
| printf("max=%e, sum=%e\n", max, sum); | |||
| return true; | |||
| } | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,89 @@ | |||
| /** | |||
| * \file example/user_allocator.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "../example.h" | |||
| #if LITE_BUILD_WITH_MGE | |||
| using namespace lite; | |||
| using namespace example; | |||
| namespace { | |||
| class CheckAllocator : public lite::Allocator { | |||
| public: | |||
| //! allocate memory of size in the given device with the given align | |||
| void* allocate(LiteDeviceType, int, size_t size, size_t align) override { | |||
| #ifdef WIN32 | |||
| return _aligned_malloc(size, align); | |||
| #elif defined(__ANDROID__) || defined(ANDROID) | |||
| return memalign(align, size); | |||
| #else | |||
| void* ptr = nullptr; | |||
| auto err = posix_memalign(&ptr, align, size); | |||
| if (err) { | |||
| printf("failed to malloc %zu bytes with align %zu\n", size, align); | |||
| } | |||
| return ptr; | |||
| #endif | |||
| }; | |||
| //! free the memory pointed by ptr in the given device | |||
| void free(LiteDeviceType, int, void* ptr) override { | |||
| #ifdef WIN32 | |||
| _aligned_free(ptr); | |||
| #else | |||
| ::free(ptr); | |||
| #endif | |||
| }; | |||
| }; | |||
| } // namespace | |||
| bool lite::example::config_user_allocator(const Args& args) { | |||
| std::string network_path = args.model_path; | |||
| std::string input_path = args.input_path; | |||
| auto allocator = std::make_shared<CheckAllocator>(); | |||
| //! create and load the network | |||
| std::shared_ptr<Network> network = std::make_shared<Network>(); | |||
| Runtime::set_memory_allocator(network, allocator); | |||
| network->load_model(network_path); | |||
| //! set input data to input tensor | |||
| std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0); | |||
| //! copy or forward data to network | |||
| size_t length = input_tensor->get_tensor_total_size_in_byte(); | |||
| void* dst_ptr = input_tensor->get_memory_ptr(); | |||
| auto src_tensor = parse_npy(input_path); | |||
| void* src = src_tensor->get_memory_ptr(); | |||
| memcpy(dst_ptr, src, length); | |||
| //! forward | |||
| network->forward(); | |||
| network->wait(); | |||
| //! get the output data or read tensor set in network_in | |||
| std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
| void* out_data = output_tensor->get_memory_ptr(); | |||
| size_t out_length = output_tensor->get_tensor_total_size_in_byte() / | |||
| output_tensor->get_layout().get_elem_size(); | |||
| printf("length=%zu\n", length); | |||
| float max = -1.0f; | |||
| float sum = 0.0f; | |||
| for (size_t i = 0; i < out_length; i++) { | |||
| float data = static_cast<float*>(out_data)[i]; | |||
| sum += data; | |||
| if (max < data) | |||
| max = data; | |||
| } | |||
| printf("max=%e, sum=%e\n", max, sum); | |||
| return true; | |||
| } | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,122 @@ | |||
| /** | |||
| * \file example/user_cryption.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "../example.h" | |||
| #if LITE_BUILD_WITH_MGE | |||
| using namespace lite; | |||
| using namespace example; | |||
| namespace { | |||
| std::vector<uint8_t> decrypt_model(const void* model_mem, size_t size, | |||
| const std::vector<uint8_t>& key) { | |||
| if (key.size() == 1) { | |||
| std::vector<uint8_t> ret(size, 0); | |||
| const uint8_t* ptr = static_cast<const uint8_t*>(model_mem); | |||
| uint8_t key_data = key[0]; | |||
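| //! note: XOR-ing each byte with the same key byte twice is a no-op, so the | |||
| //! model bytes come back unchanged; this placeholder only shows how a real | |||
| //! decryption routine would be plugged in. | |||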
| for (size_t i = 0; i < size; i++) { | |||
| ret[i] = ptr[i] ^ key_data ^ key_data; | |||
| } | |||
| return ret; | |||
| } else { | |||
| printf("the user define decrypt method key length is wrong.\n"); | |||
| return {}; | |||
| } | |||
| } | |||
| } // namespace | |||
| bool lite::example::register_cryption_method(const Args& args) { | |||
| std::string network_path = args.model_path; | |||
| std::string input_path = args.input_path; | |||
| //! register the decryption method | |||
| register_decryption_and_key("just_for_test", decrypt_model, {15}); | |||
| lite::Config config; | |||
| config.bare_model_cryption_name = "just_for_test"; | |||
| //! create and load the network | |||
| std::shared_ptr<Network> network = std::make_shared<Network>(config); | |||
| network->load_model(network_path); | |||
| //! set input data to input tensor | |||
| std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0); | |||
| auto layout = input_tensor->get_layout(); | |||
| auto src_tensor = parse_npy(input_path); | |||
| void* src = src_tensor->get_memory_ptr(); | |||
| input_tensor->reset(src, layout); | |||
| //! forward | |||
| network->forward(); | |||
| network->wait(); | |||
| //! get the output data or read tensor set in network_in | |||
| std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
| void* out_data = output_tensor->get_memory_ptr(); | |||
| size_t out_length = output_tensor->get_tensor_total_size_in_byte() / | |||
| output_tensor->get_layout().get_elem_size(); | |||
| float max = -1.0f; | |||
| float sum = 0.0f; | |||
| for (size_t i = 0; i < out_length; i++) { | |||
| float data = static_cast<float*>(out_data)[i]; | |||
| sum += data; | |||
| if (max < data) | |||
| max = data; | |||
| } | |||
| printf("max=%e, sum=%e\n", max, sum); | |||
| return true; | |||
| } | |||
| bool lite::example::update_cryption_key(const Args& args) { | |||
| std::string network_path = args.model_path; | |||
| std::string input_path = args.input_path; | |||
| //! update the decryption method key | |||
| std::vector<uint8_t> key(32, 0); | |||
| for (size_t i = 0; i < 32; i++) { | |||
| key[i] = 31 - i; | |||
| } | |||
| update_decryption_or_key("AES_default", nullptr, key); | |||
| lite::Config config; | |||
| config.bare_model_cryption_name = "AES_default"; | |||
| //! create and load the network | |||
| std::shared_ptr<Network> network = std::make_shared<Network>(config); | |||
| network->load_model(network_path); | |||
| //! set input data to input tensor | |||
| std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0); | |||
| auto layout = input_tensor->get_layout(); | |||
| auto src_tensor = parse_npy(input_path); | |||
| void* src = src_tensor->get_memory_ptr(); | |||
| input_tensor->reset(src, layout); | |||
| //! forward | |||
| network->forward(); | |||
| network->wait(); | |||
| //! get the output data or read tensor set in network_in | |||
| std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
| void* out_data = output_tensor->get_memory_ptr(); | |||
| size_t out_length = output_tensor->get_tensor_total_size_in_byte() / | |||
| output_tensor->get_layout().get_elem_size(); | |||
| float max = -1.0f; | |||
| float sum = 0.0f; | |||
| for (size_t i = 0; i < out_length; i++) { | |||
| float data = static_cast<float*>(out_data)[i]; | |||
| sum += data; | |||
| if (max < data) | |||
| max = data; | |||
| } | |||
| printf("max=%e, sum=%e\n", max, sum); | |||
| return true; | |||
| } | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,638 @@ | |||
| /* | |||
| Copyright 2017 Leon Merten Lohse | |||
| Permission is hereby granted, free of charge, to any person obtaining a copy | |||
| of this software and associated documentation files (the "Software"), to deal | |||
| in the Software without restriction, including without limitation the rights | |||
| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
| copies of the Software, and to permit persons to whom the Software is | |||
| furnished to do so, subject to the following conditions: | |||
| The above copyright notice and this permission notice shall be included in | |||
| all copies or substantial portions of the Software. | |||
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
| SOFTWARE. | |||
| */ | |||
| /* | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #ifndef NPY_H | |||
| #define NPY_H | |||
| #include <algorithm> | |||
| #include <complex> | |||
| #include <cstdint> | |||
| #include <cstring> | |||
| #include <fstream> | |||
| #include <iostream> | |||
| #include <regex> | |||
| #include <sstream> | |||
| #include <stdexcept> | |||
| #include <string> | |||
| #include <unordered_map> | |||
| #include <vector> | |||
| namespace npy { | |||
| /* Compile-time test for byte order. | |||
| If your compiler does not define these per default, you may want to define | |||
| one of these constants manually. | |||
| Defaults to little endian order. */ | |||
| #if defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN || \ | |||
| defined(__BIG_ENDIAN__) || defined(__ARMEB__) || \ | |||
| defined(__THUMBEB__) || defined(__AARCH64EB__) || defined(_MIBSEB) || \ | |||
| defined(__MIBSEB) || defined(__MIBSEB__) | |||
| const bool big_endian = true; | |||
| #else | |||
| const bool big_endian = false; | |||
| #endif | |||
| const char magic_string[] = "\x93NUMPY"; | |||
| const size_t magic_string_length = 6; | |||
| const char little_endian_char = '<'; | |||
| const char big_endian_char = '>'; | |||
| const char no_endian_char = '|'; | |||
| constexpr char host_endian_char = | |||
| (big_endian ? big_endian_char : little_endian_char); | |||
| /* npy array length */ | |||
| typedef unsigned long int ndarray_len_t; | |||
| inline void write_magic(std::ostream& ostream, unsigned char v_major = 1, | |||
| unsigned char v_minor = 0) { | |||
| ostream.write(magic_string, magic_string_length); | |||
| ostream.put(v_major); | |||
| ostream.put(v_minor); | |||
| } | |||
| inline void read_magic(std::istream& istream, unsigned char& v_major, | |||
| unsigned char& v_minor) { | |||
| char buf[magic_string_length + 2]; | |||
| istream.read(buf, magic_string_length + 2); | |||
| if (!istream) { | |||
| fprintf(stderr, "io error: failed reading file"); | |||
| } | |||
| if (0 != std::memcmp(buf, magic_string, magic_string_length)) { | |||
| fprintf(stderr, "this file does not have a valid npy format."); | |||
| } | |||
| v_major = buf[magic_string_length]; | |||
| v_minor = buf[magic_string_length + 1]; | |||
| } | |||
| // typestring magic | |||
| struct Typestring { | |||
| private: | |||
| char c_endian; | |||
| char c_type; | |||
| int len; | |||
| public: | |||
| inline std::string str() { | |||
| const size_t max_buflen = 16; | |||
| char buf[max_buflen]; | |||
| std::sprintf(buf, "%c%c%u", c_endian, c_type, len); | |||
| return std::string(buf); | |||
| } | |||
| Typestring(const std::vector<float>&) | |||
| : c_endian{host_endian_char}, c_type{'f'}, len{sizeof(float)} {} | |||
| Typestring(const std::vector<double>&) | |||
| : c_endian{host_endian_char}, c_type{'f'}, len{sizeof(double)} {} | |||
| Typestring(const std::vector<long double>&) | |||
| : c_endian{host_endian_char}, | |||
| c_type{'f'}, | |||
| len{sizeof(long double)} {} | |||
| Typestring(const std::vector<char>&) | |||
| : c_endian{no_endian_char}, c_type{'i'}, len{sizeof(char)} {} | |||
| Typestring(const std::vector<short>&) | |||
| : c_endian{host_endian_char}, c_type{'i'}, len{sizeof(short)} {} | |||
| Typestring(const std::vector<int>&) | |||
| : c_endian{host_endian_char}, c_type{'i'}, len{sizeof(int)} {} | |||
| Typestring(const std::vector<long>&) | |||
| : c_endian{host_endian_char}, c_type{'i'}, len{sizeof(long)} {} | |||
| Typestring(const std::vector<long long>&) | |||
| : c_endian{host_endian_char}, c_type{'i'}, len{sizeof(long long)} {} | |||
| Typestring(const std::vector<unsigned char>&) | |||
| : c_endian{no_endian_char}, | |||
| c_type{'u'}, | |||
| len{sizeof(unsigned char)} {} | |||
| Typestring(const std::vector<unsigned short>&) | |||
| : c_endian{host_endian_char}, | |||
| c_type{'u'}, | |||
| len{sizeof(unsigned short)} {} | |||
| Typestring(const std::vector<unsigned int>&) | |||
| : c_endian{host_endian_char}, | |||
| c_type{'u'}, | |||
| len{sizeof(unsigned int)} {} | |||
| Typestring(const std::vector<unsigned long>&) | |||
| : c_endian{host_endian_char}, | |||
| c_type{'u'}, | |||
| len{sizeof(unsigned long)} {} | |||
| Typestring(const std::vector<unsigned long long>&) | |||
| : c_endian{host_endian_char}, | |||
| c_type{'u'}, | |||
| len{sizeof(unsigned long long)} {} | |||
| Typestring(const std::vector<std::complex<float>>&) | |||
| : c_endian{host_endian_char}, | |||
| c_type{'c'}, | |||
| len{sizeof(std::complex<float>)} {} | |||
| Typestring(const std::vector<std::complex<double>>&) | |||
| : c_endian{host_endian_char}, | |||
| c_type{'c'}, | |||
| len{sizeof(std::complex<double>)} {} | |||
| Typestring(const std::vector<std::complex<long double>>&) | |||
| : c_endian{host_endian_char}, | |||
| c_type{'c'}, | |||
| len{sizeof(std::complex<long double>)} {} | |||
| }; | |||
| inline void parse_typestring(std::string typestring) { | |||
| std::regex re("'([<>|])([ifuc])(\\d+)'"); | |||
| std::smatch sm; | |||
| std::regex_match(typestring, sm, re); | |||
| if (sm.size() != 4) { | |||
| fprintf(stderr, "invalid typestring"); | |||
| } | |||
| } | |||
| namespace pyparse { | |||
| /** | |||
| Removes leading and trailing whitespaces | |||
| */ | |||
| inline std::string trim(const std::string& str) { | |||
| const std::string whitespace = " \t"; | |||
| auto begin = str.find_first_not_of(whitespace); | |||
| if (begin == std::string::npos) | |||
| return ""; | |||
| auto end = str.find_last_not_of(whitespace); | |||
| return str.substr(begin, end - begin + 1); | |||
| } | |||
| inline std::string get_value_from_map(const std::string& mapstr) { | |||
| size_t sep_pos = mapstr.find_first_of(":"); | |||
| if (sep_pos == std::string::npos) | |||
| return ""; | |||
| std::string tmp = mapstr.substr(sep_pos + 1); | |||
| return trim(tmp); | |||
| } | |||
| /** | |||
| Parses the string representation of a Python dict | |||
| The keys need to be known and may not appear anywhere else in the data. | |||
| */ | |||
| inline std::unordered_map<std::string, std::string> parse_dict( | |||
| std::string in, std::vector<std::string>& keys) { | |||
| std::unordered_map<std::string, std::string> map; | |||
| if (keys.size() == 0) | |||
| return map; | |||
| in = trim(in); | |||
| // unwrap dictionary | |||
| if ((in.front() == '{') && (in.back() == '}')) | |||
| in = in.substr(1, in.length() - 2); | |||
| else { | |||
| fprintf(stderr, "Not a Python dictionary."); | |||
| } | |||
| std::vector<std::pair<size_t, std::string>> positions; | |||
| for (auto const& value : keys) { | |||
| size_t pos = in.find("'" + value + "'"); | |||
| if (pos == std::string::npos) { | |||
| fprintf(stderr, "Missing %s key.", value.c_str()); | |||
| } | |||
| std::pair<size_t, std::string> position_pair{pos, value}; | |||
| positions.push_back(position_pair); | |||
| } | |||
| // sort by position in dict | |||
| std::sort(positions.begin(), positions.end()); | |||
| for (size_t i = 0; i < positions.size(); ++i) { | |||
| std::string raw_value; | |||
| size_t begin{positions[i].first}; | |||
| size_t end{std::string::npos}; | |||
| std::string key = positions[i].second; | |||
| if (i + 1 < positions.size()) | |||
| end = positions[i + 1].first; | |||
| raw_value = in.substr(begin, end - begin); | |||
| raw_value = trim(raw_value); | |||
| if (raw_value.back() == ',') | |||
| raw_value.pop_back(); | |||
| map[key] = get_value_from_map(raw_value); | |||
| } | |||
| return map; | |||
| } | |||
| /** | |||
| Parses the string representation of a Python boolean | |||
| */ | |||
| inline bool parse_bool(const std::string& in) { | |||
| if (in == "True") | |||
| return true; | |||
| if (in == "False") | |||
| return false; | |||
| fprintf(stderr, "Invalid python boolan."); | |||
| return false; | |||
| } | |||
| /** | |||
| Parses the string representation of a Python str | |||
| */ | |||
| inline std::string parse_str(const std::string& in) { | |||
| if ((in.front() == '\'') && (in.back() == '\'')) | |||
| return in.substr(1, in.length() - 2); | |||
| fprintf(stderr, "Invalid python string."); | |||
| return ""; | |||
| } | |||
| /** | |||
| Parses the string representation of a Python tuple into a vector of its items | |||
| */ | |||
| inline std::vector<std::string> parse_tuple(std::string in) { | |||
| std::vector<std::string> v; | |||
| const char separator = ','; | |||
| in = trim(in); | |||
| if ((in.front() == '(') && (in.back() == ')')) | |||
| in = in.substr(1, in.length() - 2); | |||
| else { | |||
| fprintf(stderr, "Invalid Python tuple."); | |||
| } | |||
| std::istringstream iss(in); | |||
| for (std::string token; std::getline(iss, token, separator);) { | |||
| v.push_back(token); | |||
| } | |||
| return v; | |||
| } | |||
| template <typename T> | |||
| inline std::string write_tuple(const std::vector<T>& v) { | |||
| if (v.size() == 0) | |||
| return ""; | |||
| std::ostringstream ss; | |||
| if (v.size() == 1) { | |||
| ss << "(" << v.front() << ",)"; | |||
| } else { | |||
| const std::string delimiter = ", "; | |||
| // v.size() > 1 | |||
| ss << "("; | |||
| std::copy(v.begin(), v.end() - 1, | |||
| std::ostream_iterator<T>(ss, delimiter.c_str())); | |||
| ss << v.back(); | |||
| ss << ")"; | |||
| } | |||
| return ss.str(); | |||
| } | |||
| inline std::string write_boolean(bool b) { | |||
| if (b) | |||
| return "True"; | |||
| else | |||
| return "False"; | |||
| } | |||
| } // namespace pyparse | |||
| inline void parse_header(std::string header, std::string& descr) { | |||
| /* | |||
| The first 6 bytes are a magic string: exactly "\x93NUMPY". | |||
| The next 1 byte is an unsigned byte: the major version number of the file | |||
| format, e.g. \x01. The next 1 byte is an unsigned byte: the minor version | |||
| number of the file format, e.g. \x00. Note: the version of the file format | |||
| is not tied to the version of the numpy package. The next 2 bytes form a | |||
| little-endian unsigned short int: the length of the header data | |||
| HEADER_LEN. The next HEADER_LEN bytes form the header data describing the | |||
| array's format. It is an ASCII string which contains a Python literal | |||
| expression of a dictionary. It is terminated by a newline ('\n') and | |||
| padded with spaces | |||
| ('\x20') to make the total length of the magic string + 4 + HEADER_LEN be | |||
| evenly divisible by 16 for alignment purposes. The dictionary contains | |||
| three keys: | |||
| "descr" : dtype.descr | |||
| An object that can be passed as an argument to the numpy.dtype() | |||
| constructor to create the array's dtype. For repeatability and | |||
| readability, this dictionary is formatted using pprint.pformat() so the | |||
| keys are in alphabetic order. | |||
| */ | |||
| // remove trailing newline | |||
| if (header.back() != '\n') | |||
| fprintf(stderr, "invalid header"); | |||
| header.pop_back(); | |||
| // parse the dictionary | |||
| std::vector<std::string> keys{"descr"}; | |||
| auto dict_map = npy::pyparse::parse_dict(header, keys); | |||
| if (dict_map.size() == 0) | |||
| fprintf(stderr, "invalid dictionary in header"); | |||
| std::string descr_s = dict_map["descr"]; | |||
| parse_typestring(descr_s); | |||
| // remove | |||
| descr = npy::pyparse::parse_str(descr_s); | |||
| return; | |||
| } | |||
| inline void parse_header(std::string header, std::string& descr, | |||
| bool& fortran_order, | |||
| std::vector<ndarray_len_t>& shape) { | |||
| /* | |||
| The first 6 bytes are a magic string: exactly "\x93NUMPY". | |||
| The next 1 byte is an unsigned byte: the major version number of the file | |||
| format, e.g. \x01. The next 1 byte is an unsigned byte: the minor version | |||
| number of the file format, e.g. \x00. Note: the version of the file format | |||
| is not tied to the version of the numpy package. The next 2 bytes form a | |||
| little-endian unsigned short int: the length of the header data | |||
| HEADER_LEN. The next HEADER_LEN bytes form the header data describing the | |||
| array's format. It is an ASCII string which contains a Python literal | |||
| expression of a dictionary. It is terminated by a newline ('\n') and | |||
| padded with spaces | |||
| ('\x20') to make the total length of the magic string + 4 + HEADER_LEN be | |||
| evenly divisible by 16 for alignment purposes. The dictionary contains | |||
| three keys: | |||
| "descr" : dtype.descr | |||
| An object that can be passed as an argument to the numpy.dtype() | |||
| constructor to create the array's dtype. "fortran_order" : bool Whether | |||
| the array data is Fortran-contiguous or not. Since Fortran-contiguous | |||
| arrays are a common form of non-C-contiguity, we allow them to be written | |||
| directly to disk for efficiency. "shape" : tuple of int The shape of the | |||
| array. For repeatability and readability, this dictionary is formatted | |||
| using pprint.pformat() so the keys are in alphabetic order. | |||
| */ | |||
| // remove trailing newline | |||
| if (header.back() != '\n') | |||
| fprintf(stderr, "invalid header"); | |||
| header.pop_back(); | |||
| // parse the dictionary | |||
| std::vector<std::string> keys{"descr", "fortran_order", "shape"}; | |||
| auto dict_map = npy::pyparse::parse_dict(header, keys); | |||
| if (dict_map.size() == 0) | |||
| fprintf(stderr, "invalid dictionary in header"); | |||
| std::string descr_s = dict_map["descr"]; | |||
| std::string fortran_s = dict_map["fortran_order"]; | |||
| std::string shape_s = dict_map["shape"]; | |||
| // TODO: extract info from typestring | |||
| parse_typestring(descr_s); | |||
| // remove | |||
| descr = npy::pyparse::parse_str(descr_s); | |||
| // convert literal Python bool to C++ bool | |||
| fortran_order = npy::pyparse::parse_bool(fortran_s); | |||
| // parse the shape tuple | |||
| auto shape_v = npy::pyparse::parse_tuple(shape_s); | |||
| if (shape_v.size() == 0) | |||
| fprintf(stderr, "invalid shape tuple in header"); | |||
| for (auto item : shape_v) { | |||
| ndarray_len_t dim = static_cast<ndarray_len_t>(std::stoul(item)); | |||
| shape.push_back(dim); | |||
| } | |||
| } | |||
| inline std::string write_header_dict(const std::string& descr, | |||
| bool fortran_order, | |||
| const std::vector<ndarray_len_t>& shape) { | |||
| std::string s_fortran_order = npy::pyparse::write_boolean(fortran_order); | |||
| std::string shape_s = npy::pyparse::write_tuple(shape); | |||
| return "{'descr': '" + descr + "', 'fortran_order': " + s_fortran_order + | |||
| ", 'shape': " + shape_s + ", }"; | |||
| } | |||
| inline void write_header(std::ostream& out, const std::string& descr, | |||
| bool fortran_order, | |||
| const std::vector<ndarray_len_t>& shape_v) { | |||
| std::string header_dict = write_header_dict(descr, fortran_order, shape_v); | |||
| size_t length = magic_string_length + 2 + 2 + header_dict.length() + 1; | |||
| unsigned char version[2] = {1, 0}; | |||
| if (length >= 255 * 255) { | |||
| length = magic_string_length + 2 + 4 + header_dict.length() + 1; | |||
| version[0] = 2; | |||
| version[1] = 0; | |||
| } | |||
| size_t padding_len = 16 - length % 16; | |||
| std::string padding(padding_len, ' '); | |||
| // write magic | |||
| write_magic(out, version[0], version[1]); | |||
| // write header length | |||
| if (version[0] == 1 && version[1] == 0) { | |||
| char header_len_le16[2]; | |||
| uint16_t header_len = static_cast<uint16_t>(header_dict.length() + | |||
| padding.length() + 1); | |||
| header_len_le16[0] = (header_len >> 0) & 0xff; | |||
| header_len_le16[1] = (header_len >> 8) & 0xff; | |||
| out.write(reinterpret_cast<char*>(header_len_le16), 2); | |||
| } else { | |||
| char header_len_le32[4]; | |||
| uint32_t header_len = static_cast<uint32_t>(header_dict.length() + | |||
| padding.length() + 1); | |||
| header_len_le32[0] = (header_len >> 0) & 0xff; | |||
| header_len_le32[1] = (header_len >> 8) & 0xff; | |||
| header_len_le32[2] = (header_len >> 16) & 0xff; | |||
| header_len_le32[3] = (header_len >> 24) & 0xff; | |||
| out.write(reinterpret_cast<char*>(header_len_le32), 4); | |||
| } | |||
| out << header_dict << padding << '\n'; | |||
| } | |||
| inline std::string read_header(std::istream& istream) { | |||
| // check magic bytes and version number | |||
| unsigned char v_major, v_minor; | |||
| read_magic(istream, v_major, v_minor); | |||
| uint32_t header_length = 0; | |||
| if (v_major == 1 && v_minor == 0) { | |||
| char header_len_le16[2]; | |||
| istream.read(header_len_le16, 2); | |||
| header_length = (static_cast<unsigned char>(header_len_le16[0]) << 0) | | |||
| (static_cast<unsigned char>(header_len_le16[1]) << 8); | |||
| if ((magic_string_length + 2 + 2 + header_length) % 16 != 0) { | |||
| // TODO: display warning | |||
| } | |||
| } else if (v_major == 2 && v_minor == 0) { | |||
| char header_len_le32[4]; | |||
| istream.read(header_len_le32, 4); | |||
| header_length = (static_cast<unsigned char>(header_len_le32[0]) << 0) | | |||
| (static_cast<unsigned char>(header_len_le32[1]) << 8) | | |||
| (static_cast<unsigned char>(header_len_le32[2]) << 16) | | |||
| (static_cast<unsigned char>(header_len_le32[3]) << 24); | |||
| if ((magic_string_length + 2 + 4 + header_length) % 16 != 0) { | |||
| // TODO: display warning | |||
| } | |||
| } else { | |||
| fprintf(stderr, "unsupported file format version"); | |||
| } | |||
| auto buf_v = std::vector<char>(); | |||
| buf_v.resize(header_length); | |||
| istream.read(buf_v.data(), header_length); | |||
| std::string header(buf_v.data(), header_length); | |||
| return header; | |||
| } | |||
| inline ndarray_len_t comp_size(const std::vector<ndarray_len_t>& shape) { | |||
| ndarray_len_t size = 1; | |||
| for (ndarray_len_t i : shape) | |||
| size *= i; | |||
| return size; | |||
| } | |||
| template <typename Scalar> | |||
| inline void SaveArrayAsNumpy(const std::string& filename, bool fortran_order, | |||
| unsigned int n_dims, const unsigned long shape[], | |||
| const std::vector<Scalar>& data) { | |||
| Typestring typestring_o(data); | |||
| std::string typestring = typestring_o.str(); | |||
| std::ofstream stream(filename, std::ofstream::binary); | |||
| if (!stream) { | |||
| fprintf(stderr, "io error: failed to open a file."); | |||
| } | |||
| std::vector<ndarray_len_t> shape_v(shape, shape + n_dims); | |||
| write_header(stream, typestring, fortran_order, shape_v); | |||
| auto size = static_cast<size_t>(comp_size(shape_v)); | |||
| stream.write(reinterpret_cast<const char*>(data.data()), | |||
| sizeof(Scalar) * size); | |||
| } | |||
| template <typename Scalar> | |||
| inline void LoadArrayFromNumpy(const std::string& filename, | |||
| std::vector<unsigned long>& shape, | |||
| std::vector<Scalar>& data) { | |||
| bool fortran_order; | |||
| LoadArrayFromNumpy<Scalar>(filename, shape, fortran_order, data); | |||
| } | |||
| template <typename Scalar> | |||
| inline void LoadArrayFromNumpy(const std::string& filename, | |||
| std::vector<unsigned long>& shape, | |||
| bool& fortran_order, std::vector<Scalar>& data) { | |||
| std::ifstream stream(filename, std::ifstream::binary); | |||
| if (!stream) { | |||
| fprintf(stderr, "io error: failed to open a file."); | |||
| } | |||
| std::string header = read_header(stream); | |||
| // parse header | |||
| std::string typestr; | |||
| parse_header(header, typestr, fortran_order, shape); | |||
| // check if the typestring matches the given one | |||
| Typestring typestring_o{data}; | |||
| std::string expect_typestr = typestring_o.str(); | |||
| if (typestr != expect_typestr) { | |||
| fprintf(stderr, "formatting error: typestrings not matching"); | |||
| } | |||
| // compute the data size based on the shape | |||
| auto size = static_cast<size_t>(comp_size(shape)); | |||
| data.resize(size); | |||
| // read the data | |||
| stream.read(reinterpret_cast<char*>(data.data()), sizeof(Scalar) * size); | |||
| } | |||
| inline void LoadArrayFromNumpy(const std::string& filename, | |||
| std::string& type_str, | |||
| std::vector<ndarray_len_t>& shape, | |||
| std::vector<int8_t>& data) { | |||
| std::ifstream stream(filename, std::ifstream::binary); | |||
| if (!stream) { | |||
| fprintf(stderr, "io error: failed to open a file."); | |||
| } | |||
| std::string header = read_header(stream); | |||
| bool fortran_order; | |||
| // parse header | |||
| parse_header(header, type_str, fortran_order, shape); | |||
| // the element size is the numeric suffix of the typestring, e.g. "4" in "<f4" | |||
| std::string size_str = type_str.substr(2); | |||
| size_t elem_size = atoi(size_str.c_str()); | |||
| // compute the data size based on the shape | |||
| auto byte_size = elem_size * static_cast<size_t>(comp_size(shape)); | |||
| data.resize(byte_size); | |||
| // read the data | |||
| stream.read(reinterpret_cast<char*>(data.data()), byte_size); | |||
| } | |||
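| // A minimal usage sketch of the save/load helpers above (the file name | |||
| // "data.npy" is illustrative only): | |||
| // | |||
| //   std::vector<unsigned long> shape{1, 3, 224, 224}; | |||
| //   std::vector<float> values(npy::comp_size(shape), 0.f); | |||
| //   npy::SaveArrayAsNumpy("data.npy", false, shape.size(), shape.data(), values); | |||
| //   std::vector<unsigned long> loaded_shape; | |||
| //   std::vector<float> loaded; | |||
| //   npy::LoadArrayFromNumpy("data.npy", loaded_shape, loaded); | |||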
| } // namespace npy | |||
| #endif // NPY_H | |||
| @@ -0,0 +1,97 @@ | |||
| /** | |||
| * \file include/lite/common_enum_c.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #ifndef LITE_COMMON_ENUM_C_H_ | |||
| #define LITE_COMMON_ENUM_C_H_ | |||
| /*! | |||
| * \brief The log level. | |||
| */ | |||
| typedef enum LiteLogLevel { | |||
| DEBUG = 0, /*!< The lowest level and most verbose */ | |||
| INFO = 1, /*!< Print informational messages, warnings and errors */ | |||
| WARN = 2, /*!< Print only warnings and errors */ | |||
| ERROR = 3, /*!< Print only errors */ | |||
| } LiteLogLevel; | |||
| typedef enum LiteBackend { | |||
| LITE_DEFAULT = 0, //! default backend is mge | |||
| } LiteBackend; | |||
| typedef enum LiteDeviceType { | |||
| LITE_CPU = 0, | |||
| LITE_CUDA = 1, | |||
| LITE_ATLAS = 3, | |||
| LITE_NPU = 4, | |||
| //! when the device information is set in the model, use LITE_DEVICE_DEFAULT | |||
| //! in lite | |||
| LITE_DEVICE_DEFAULT = 5, | |||
| } LiteDeviceType; | |||
| typedef enum LiteDataType { | |||
| LITE_FLOAT = 0, | |||
| LITE_HALF = 1, | |||
| LITE_INT = 2, | |||
| LITE_INT16 = 3, | |||
| LITE_INT8 = 4, | |||
| LITE_UINT8 = 5, | |||
| LITE_UINT = 6, | |||
| LITE_UINT16 = 7, | |||
| LITE_INT64 = 8, | |||
| } LiteCDataType; | |||
| typedef enum LiteTensorPhase { | |||
| //! Tensor maybe input or output | |||
| LITE_IO = 0, | |||
| //! Tensor is input | |||
| LITE_INPUT = 1, | |||
| //! Tensor is output | |||
| LITE_OUTPUT = 2, | |||
| } LiteTensorPhase; | |||
| /*! | |||
| * \brief the input and output type, including SHAPE and VALUE; | |||
| * sometimes the user only needs the shape of the output tensor | |||
| */ | |||
| typedef enum LiteIOType { | |||
| LITE_IO_VALUE = 0, | |||
| LITE_IO_SHAPE = 1, | |||
| } LiteIOType; | |||
| /*! | |||
| * \brief operation algorithm selection strategy type, some operations have | |||
| * multiple algorithms, and different algorithms have different attributes; | |||
| * according to the strategy, the best algorithm will be selected. | |||
| * | |||
| * Note: These strategies can be combined | |||
| * | |||
| * 1. LITE_ALGO_HEURISTIC | LITE_ALGO_PROFILE means: if the profile cache is not | |||
| * valid, use the heuristic instead | |||
| * | |||
| * 2. LITE_ALGO_HEURISTIC | LITE_ALGO_REPRODUCIBLE means: heuristically choose a | |||
| * reproducible algorithm | |||
| * | |||
| * 3. LITE_ALGO_PROFILE | LITE_ALGO_REPRODUCIBLE means: profile the best | |||
| * algorithm from the reproducible algorithm set | |||
| * | |||
| * 4. LITE_ALGO_PROFILE | LITE_ALGO_OPTIMIZED means: profile the best | |||
| * algorithm from the optimized algorithms, so profiling runs faster | |||
| * | |||
| * 5. LITE_ALGO_PROFILE | LITE_ALGO_OPTIMIZED | LITE_ALGO_REPRODUCIBLE means: | |||
| * profile the best algorithm from the optimized and reproducible algorithms | |||
| */ | |||
| typedef enum LiteAlgoSelectStrategy { | |||
| LITE_ALGO_HEURISTIC = 1 << 0, | |||
| LITE_ALGO_PROFILE = 1 << 1, | |||
| LITE_ALGO_REPRODUCIBLE = 1 << 2, | |||
| LITE_ALGO_OPTIMIZED = 1 << 3, | |||
| } LiteAlgoSelectStrategy; | |||
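| /*! An illustrative combination (not part of the original header), matching | |||
|  * note 3 above: profile only the reproducible algorithms. | |||
|  * | |||
|  *   LiteAlgoSelectStrategy strategy = (LiteAlgoSelectStrategy)( | |||
|  *           LITE_ALGO_PROFILE | LITE_ALGO_REPRODUCIBLE); | |||
|  */ | |||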
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,157 @@ | |||
| /** | |||
| * \file include/lite/global.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include "macro.h" | |||
| #include "network.h" | |||
| #include <functional> | |||
| #include <memory> | |||
| #include <vector> | |||
| namespace lite { | |||
| /** | |||
| * \brief Model decryption function | |||
| * | |||
| * \param[in] const void* is the encrypted model memory pointer | |||
| * \param[in] size_t the size of the encrypted model memory in bytes | |||
| * \param[in] const std::vector<uint8_t>& the decryption key vector | |||
| */ | |||
| using DecryptionFunc = std::function<std::vector<uint8_t>( | |||
| const void*, size_t, const std::vector<uint8_t>&)>; | |||
| /** | |||
| * \brief register a custom decryption method and key to lite. | |||
| * | |||
| * \param[in] decrypt_name the name of the decryption, which will act as the | |||
| * hash key to find the decryption method. | |||
| * | |||
| * \param[in] func the decryption function, which will decrypt the model with | |||
| * the registered key and return a vector that contains the decrypted model. | |||
| * | |||
| * \param[in] key the decryption key of the method | |||
| */ | |||
| LITE_API bool register_decryption_and_key(std::string decrypt_name, | |||
| const DecryptionFunc& func, | |||
| const std::vector<uint8_t>& key); | |||
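| //! A minimal registration sketch; the method name "my_xor" and the one-byte | |||
| //! key are illustrative, not something defined by this header: | |||
| //! | |||
| //!   lite::register_decryption_and_key( | |||
| //!           "my_xor", | |||
| //!           [](const void* mem, size_t size, const std::vector<uint8_t>& key) { | |||
| //!               std::vector<uint8_t> out(size); | |||
| //!               for (size_t i = 0; i < size; ++i) | |||
| //!                   out[i] = static_cast<const uint8_t*>(mem)[i] ^ key[0]; | |||
| //!               return out; | |||
| //!           }, | |||
| //!           {0x5a}); | |||
| //! | |||
| //! A model encrypted this way is then loaded by setting | |||
| //! Config::bare_model_cryption_name to "my_xor". | |||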
| /** | |||
| * \brief update decryption function or key of a custom decryption method. | |||
| * | |||
| * \param[in] decrypt_name the name of the decryption, which will act as the | |||
| * hash key to find the decryption method. | |||
| * | |||
| * \param[in] func the decryption function, which will decrypt the model with | |||
| * the registered key and return a vector that contains the decrypted model. If | |||
| * the function is nullptr, it will not be updated. | |||
| * | |||
| * \param[in] key the decryption key of the method, if the size of key is zero, | |||
| * it will not be updated | |||
| */ | |||
| LITE_API bool update_decryption_or_key(std::string decrypt_name, | |||
| const DecryptionFunc& func, | |||
| const std::vector<uint8_t>& key); | |||
| /** | |||
| * \brief Model information parse function | |||
| * | |||
| * \param[in] const void* is the information memory | |||
| * \param[in] size_t the size of the information memory | |||
| * \param[in] const std::string the model name, used to check whether the | |||
| * information matches the model | |||
| * \param[in] Config the model config, ParseInfoFunc can fill it with the | |||
| * information in json, the config will influence Network loading later | |||
| * \param[in] NetworkIO the model IO, ParseInfoFunc can fill it with the | |||
| * information in json, the networkio will influence Network forwarding later | |||
| * \param[in] std::unordered_map<std::string, LiteAny>& isolated_config_map, the | |||
| * other configs not included in Config and NetworkIO, ParseInfoFunc can fill it | |||
| * with the information in json, now supported: | |||
| * "device_id" : int, default 0 | |||
| * "number_threads" : size_t, default 1 | |||
| * "is_inplace_model" : bool, default false | |||
| * "use_tensorrt" : bool, default false | |||
| */ | |||
| using ParseInfoFunc = std::function<bool( | |||
| const void*, size_t, const std::string model_name, Config& config, | |||
| NetworkIO& network_io, | |||
| std::unordered_map<std::string, LiteAny>& isolated_config_map, | |||
| std::string& extra_info)>; | |||
| /** | |||
| * \brief register a custom parser function to lite. | |||
| * | |||
| * \param[in] info_type the name of the parser function, which will act as the | |||
| * hash key to find the parser method. | |||
| * | |||
| * \param[in] parse_func the parser function, which will parse the given | |||
| * information and modify the Network Config and IO. | |||
| * | |||
| */ | |||
| LITE_API bool register_parse_info_func(std::string info_type, | |||
| const ParseInfoFunc& parse_func); | |||
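| //! A rough registration sketch; the info type "my_info" and the trivial | |||
| //! parser body are illustrative only: | |||
| //! | |||
| //!   lite::register_parse_info_func( | |||
| //!           "my_info", | |||
| //!           [](const void*, size_t, const std::string, lite::Config& config, | |||
| //!              lite::NetworkIO&, | |||
| //!              std::unordered_map<std::string, lite::LiteAny>&, | |||
| //!              std::string&) -> bool { | |||
| //!               config.options.var_sanity_check_first_run = false; | |||
| //!               return true; | |||
| //!           }); | |||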
| /*! \brief Get version | |||
| */ | |||
| LITE_API void get_version(int& major, int& minor, int& patch); | |||
| /*! \brief Set the current log level. | |||
| * \param[in] level The new log level | |||
| */ | |||
| LITE_API void set_log_level(LiteLogLevel level); | |||
| /*! \brief Get the current log level. | |||
| * \return The current log level | |||
| */ | |||
| LITE_API LiteLogLevel get_log_level(); | |||
| /*! \brief Get device count | |||
| * \param[in] device_type device type | |||
| * \return the device count | |||
| */ | |||
| LITE_API size_t get_device_count(LiteDeviceType device_type); | |||
| /*! \brief try to coalesce all free memory in MegEngine | |||
| */ | |||
| LITE_API void try_coalesce_all_free_memory(); | |||
| /*! | |||
| * \brief Set the loader library to lite | |||
| * \param loader_path is the file path of the loader library | |||
| */ | |||
| LITE_API void set_loader_lib_path(const std::string& loader_path); | |||
| /*! | |||
| * \brief Set the algo policy cache file for CPU/CUDA ... | |||
| * \param cache_path is the file path which stores the cache | |||
| * \param always_sync sync the cache to the file whenever the model runs | |||
| */ | |||
| LITE_API void set_persistent_cache(const std::string& cache_path, | |||
| bool always_sync = false); | |||
| /*! | |||
| * \brief dump the PersistentCache policy cache to a file; if the network is set | |||
| * to profile when forwarding, the profiled algo policy will be dumped to this file | |||
| */ | |||
| LITE_API void dump_persistent_cache(const std::string& cache_path); | |||
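| //! A typical pairing (the cache path "algo.cache" is illustrative): call | |||
| //! set_persistent_cache("algo.cache") once at startup so previously profiled | |||
| //! algo policies are reused, and dump_persistent_cache("algo.cache") after a | |||
| //! profiling run to persist the newly collected policies. | |||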
| /*! | |||
| * \brief Set the TensorRT engine cache path for serialized prebuilt ICudaEngine | |||
| */ | |||
| LITE_API void set_tensor_rt_cache(std::string tensorrt_cache_path); | |||
| /*! | |||
| * \brief dump the TensorRT cache to the file set in set_tensor_rt_cache | |||
| */ | |||
| LITE_API void dump_tensor_rt_cache(); | |||
| } // namespace lite | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,20 @@ | |||
| /** | |||
| * \file include/lite/macro.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #ifndef LITE_MACRO_H_ | |||
| #define LITE_MACRO_H_ | |||
| #if defined(_WIN32) | |||
| #define LITE_API __declspec(dllexport) | |||
| #else | |||
| #define LITE_API __attribute__((visibility("default"))) | |||
| #endif | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,368 @@ | |||
| /** | |||
| * \file include/lite/network.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include "macro.h" | |||
| #include "tensor.h" | |||
| #include <functional> | |||
| #include <memory> | |||
| #include <mutex> | |||
| #include <string> | |||
| #include <unordered_map> | |||
| namespace lite { | |||
| LITE_API inline LiteAlgoSelectStrategy operator|(LiteAlgoSelectStrategy x, | |||
| LiteAlgoSelectStrategy y) { | |||
| return static_cast<LiteAlgoSelectStrategy>(static_cast<uint32_t>(x) | | |||
| static_cast<uint32_t>(y)); | |||
| } | |||
| /*! | |||
| * \brief the inference options which will be translated to MegEngine | |||
| * | |||
| * \param weight_preprocess is the option which optimizes the inference | |||
| * performance by preprocessing the const weights | |||
| * | |||
| * \param fuse_preprocess fuse the preprocess pattern, like astype + pad_channel + | |||
| * dimshuffle | |||
| * | |||
| * \param fake_next_exec whether only to perform non-computing tasks (like | |||
| * memory allocation and queue initialization) for next exec. This would be | |||
| * reset to false when the graph is executed. | |||
| * | |||
| * \param var_sanity_check_first_run Disable var sanity check on the first run. | |||
| * Var sanity check is enabled on the first-time execution by default, and can | |||
| * be used to find some potential memory access errors in the operator | |||
| * implementation. | |||
| * | |||
| * \param const_shape This can be used to reduce memory usage since some | |||
| * static inference data structures can be omitted. | |||
| * | |||
| * \param force_dynamic_alloc force dynamic memory alloc for all vars | |||
| * | |||
| * \param force_output_dynamic_alloc force dynamic memory alloc for output vars | |||
| * which are used as CallbackCaller input when call compile() function | |||
| * | |||
| * \param no_profiling_on_shape_change do not re-profile to select best impl | |||
| * algo when input shape changes (use previous algo) | |||
| * | |||
| * \param jit_level Execute supported operators with JIT (support MLIR, | |||
| * NVRTC). Can only be used on Nvidia GPUs, this value indicates the JIT level: | |||
| * 1 for basic elemwise opr; | |||
| * 2 for including the reduce operator | |||
| * | |||
| * \param record_level flag to optimize the inference performance by recording | |||
| * the kernel tasks in the first run; afterwards the inference only needs to | |||
| * execute the recorded tasks. | |||
| * level = 0 means normal inference, | |||
| * level = 1 means use record inference, | |||
| * level = 2 means record inference and free the extra memory | |||
| * | |||
| * \param graph_opt_level optimization level: | |||
| * 0: disable | |||
| * 1: level-1: inplace arith transformations during graph | |||
| * construction | |||
| * 2: level-2: level-1, plus global optimization before graph | |||
| * compiling | |||
| * 3: also enable JIT | |||
| * <0: corresponding level, with result check for debug | |||
| * | |||
| * \param async_exec_level exec: dispatch on separate threads for different | |||
| * comp_node. | |||
| * 0: do not perform async dispatch | |||
| * 1: dispatch async if there is more than one comp node with a limited queue | |||
| * mask 0b10: async if there are multiple comp nodes | |||
| * mask 0b100: always async | |||
| */ | |||
| struct LITE_API Options { | |||
| bool weight_preprocess = false; | |||
| bool fuse_preprocess = false; | |||
| bool fake_next_exec = false; | |||
| bool var_sanity_check_first_run = true; | |||
| bool const_shape = false; | |||
| bool force_dynamic_alloc = false; | |||
| bool force_output_dynamic_alloc = false; | |||
| bool no_profiling_on_shape_change = false; | |||
| uint8_t jit_level = 0; | |||
| uint8_t comp_node_seq_record_level = 0; | |||
| uint8_t graph_opt_level = 2; | |||
| uint16_t async_exec_level = 1; | |||
| //! layout transform options | |||
| bool enable_nchw44 = false; | |||
| bool enable_nchw44_dot = false; | |||
| bool enable_nchw88 = false; | |||
| bool enable_nhwcd4 = false; | |||
| bool enable_nchw4 = false; | |||
| bool enable_nchw32 = false; | |||
| bool enable_nchw64 = false; | |||
| }; | |||
| /*! | |||
| * \brief Configuration used when loading and compiling the graph | |||
| * | |||
| * \param bare_model_cryption_name is the bare model cryption method name; a | |||
| * bare model has no json info packed inside | |||
| * | |||
| * \param has_compression flag of whether the model is compressed; the | |||
| * compression method will be read from the model | |||
| */ | |||
| struct LITE_API Config { | |||
| bool has_compression = false; | |||
| int device_id = 0; | |||
| LiteDeviceType device_type = LiteDeviceType::LITE_CPU; | |||
| LiteBackend backend = LiteBackend::LITE_DEFAULT; | |||
| std::string bare_model_cryption_name = {}; | |||
| Options options = {}; | |||
| }; | |||
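| //! A minimal configuration sketch (the chosen option values are illustrative): | |||
| //! | |||
| //!   lite::Config config; | |||
| //!   config.device_type = LiteDeviceType::LITE_CPU; | |||
| //!   config.options.weight_preprocess = true; | |||
| //!   config.options.comp_node_seq_record_level = 1; | |||
| //!   auto network = std::make_shared<lite::Network>(config); | |||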
| /*! | |||
| * \brief config the network input and output item | |||
| * | |||
| */ | |||
| struct LITE_API IO { | |||
| //! the tensor name in the graph corresponding to the IO | |||
| std::string name; | |||
| //! Used to mark where the input tensor comes from and where the output is | |||
| //! copied to. If is_host is true, the input is from the host and the output is | |||
| //! copied to the host, otherwise the device. Sometimes the input is from the | |||
| //! device and the output need not be copied to the host; default is true. | |||
| bool is_host = true; | |||
| //! The IO type, it can be SHAPE or VALUE; when SHAPE is set, the value of the | |||
| //! input or output tensor is invalid, only the shape will be set. Default is VALUE | |||
| LiteIOType io_type = LiteIOType::LITE_IO_VALUE; | |||
| //! The layout configured by the user. If another layout is set before forward, | |||
| //! or obtained after forward by resetting the input tensor, this layout will | |||
| //! be bypassed; if no other layout is set before forward, this layout will | |||
| //! take effect; if this layout is not set, the model forwards with its | |||
| //! original layout. For outputs, it is used for checking. | |||
| Layout config_layout = {}; | |||
| }; | |||
| /*! | |||
| * \brief the input and output information used when loading the network; | |||
| * the NetworkIO will remain in the network until the network is destroyed | |||
| */ | |||
| struct LITE_API NetworkIO { | |||
| std::vector<IO> inputs = {}; | |||
| std::vector<IO> outputs = {}; | |||
| }; | |||
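| /*! | |||
| * Example sketch (the tensor name "data" is a placeholder): declare an input | |||
| * whose memory already lives on the device, so lite will not copy it from the | |||
| * host. | |||
| * \code | |||
| *     lite::IO input; | |||
| *     input.name = "data";     // must match a tensor name in the model | |||
| *     input.is_host = false;   // the input memory is provided on the device | |||
| *     lite::NetworkIO network_io; | |||
| *     network_io.inputs.push_back(input); | |||
| *     // pass network_io to the Network constructor together with a Config | |||
| * \endcode | |||
| */ | |||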
| /*! | |||
| * \brief A user-implemented allocator interface | |||
| */ | |||
| class LITE_API Allocator { | |||
| public: | |||
| virtual ~Allocator() = default; | |||
| //! allocate memory of size in the given device with the given align | |||
| virtual void* allocate(LiteDeviceType device_type, int device_id, | |||
| size_t size, size_t align) = 0; | |||
| //! free the memory pointed by ptr in the given device | |||
| virtual void free(LiteDeviceType device_type, int device_id, void* ptr) = 0; | |||
| }; | |||
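| /*! | |||
| * Sketch of a CPU-only user allocator, assuming the align value is a power of | |||
| * two and the platform provides ::aligned_alloc (C11/C++17); error handling is | |||
| * omitted. | |||
| * \code | |||
| *     class MyAllocator : public lite::Allocator { | |||
| *     public: | |||
| *         void* allocate(LiteDeviceType, int, size_t size, size_t align) override { | |||
| *             // round size up to a multiple of align, as aligned_alloc requires | |||
| *             size_t rounded = (size + align - 1) / align * align; | |||
| *             return ::aligned_alloc(align, rounded); | |||
| *         } | |||
| *         void free(LiteDeviceType, int, void* ptr) override { ::free(ptr); } | |||
| *     }; | |||
| *     // registered later through Runtime::set_memory_allocator | |||
| * \endcode | |||
| */ | |||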
| /*! | |||
| * \brief the thread affinity callback type | |||
| * \param thread_id thread_id is a number from 0 to (nr_threads - 1); the | |||
| * thread with thread_id of (nr_threads - 1) is the main worker thread. | |||
| */ | |||
| using ThreadAffinityCallback = std::function<void(int thread_id)>; | |||
| using AsyncCallback = std::function<void(void)>; | |||
| /*! | |||
| * \brief the start/finish callback function | |||
| * \param unordered_map map from the IO tensor name to a pair consisting of the | |||
| * corresponding user-configured IO and the real input or output tensor. | |||
| */ | |||
| using StartCallback = std::function<void( | |||
| const std::unordered_map<std::string, | |||
| std::pair<IO, std::shared_ptr<Tensor>>>&)>; | |||
| using FinishCallback = std::function<void( | |||
| const std::unordered_map<std::string, | |||
| std::pair<IO, std::shared_ptr<Tensor>>>&)>; | |||
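| /*! | |||
| * Sketch of a start callback that prints the configured input layouts before | |||
| * each forward; purely illustrative. | |||
| * \code | |||
| *     lite::StartCallback start_cb = | |||
| *             [](const std::unordered_map<std::string, | |||
| *                        std::pair<lite::IO, std::shared_ptr<lite::Tensor>>>& ios) { | |||
| *                 for (auto& kv : ios) { | |||
| *                     printf("input %s ndim=%zu\n", kv.first.c_str(), | |||
| *                            kv.second.second->get_layout().ndim); | |||
| *                 } | |||
| *             }; | |||
| *     // installed with Network::set_start_callback(start_cb) | |||
| * \endcode | |||
| */ | |||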
| /*! | |||
| * \brief The network is constructed from a model; it implements model loading, | |||
| * initialization and forward, and can display some model information | |||
| */ | |||
| class LITE_API Network { | |||
| public: | |||
| class NetworkImplBase; | |||
| ~Network(); | |||
| Network(const Config& config = {}, const NetworkIO& networkio = {}); | |||
| Network(const NetworkIO& networkio, const Config& config = {}); | |||
| //! load the model from memory | |||
| void load_model(void* model_mem, size_t size); | |||
| //! load the model from a model path | |||
| void load_model(std::string model_path); | |||
| //! only compute the output tensors configured by the user | |||
| void compute_only_configured_output(); | |||
| //! get a network input or output tensor, the layout of which is synced | |||
| //! from the mge tensor; when an input and an output tensor have the same | |||
| //! name, use LiteTensorPhase to distinguish them | |||
| std::shared_ptr<Tensor> get_io_tensor( | |||
| std::string io_name, | |||
| LiteTensorPhase phase = LiteTensorPhase::LITE_IO); | |||
| //! get the network input by index | |||
| std::shared_ptr<Tensor> get_input_tensor(size_t index); | |||
| //! get the network output tensor by index | |||
| std::shared_ptr<Tensor> get_output_tensor(size_t index); | |||
| //! set the network forward in async mode and set the async callback | |||
| //! function | |||
| Network& set_async_callback(const AsyncCallback& async_callback); | |||
| //! set the start forward callback function, which will be executed before | |||
| //! forward; this can be used to check network inputs or dump model inputs | |||
| //! for debug | |||
| Network& set_start_callback(const StartCallback& start_callback); | |||
| //! set the finish forward callback function, which will be executed after | |||
| //! forward; this can be used to dump model outputs for debug | |||
| Network& set_finish_callback(const FinishCallback& finish_callback); | |||
| //! forward the network with filled input data and fill the output data | |||
| //! to the output tensor | |||
| void forward(); | |||
| //! wait until forward finishes in sync mode | |||
| void wait(); | |||
| //! get the input tensor name in the order in load return | |||
| std::string get_input_name(size_t index) const; | |||
| //! get the output tensor name in the order in load return | |||
| std::string get_output_name(size_t index) const; | |||
| //! get all the input tensor name in the order in load return | |||
| std::vector<std::string> get_all_input_name() const; | |||
| //! get all the output tensor name in the order in load return | |||
| std::vector<std::string> get_all_output_name() const; | |||
| //! set/get device id, default device id = 0 | |||
| Network& set_device_id(int device_id); | |||
| int get_device_id() const; | |||
| //! set/get stream id, default stream id = 0 | |||
| Network& set_stream_id(int stream_id); | |||
| int get_stream_id() const; | |||
| //! enable profiling the network, a profile file will be generated | |||
| void enable_profile_performance(std::string profile_file_path); | |||
| //! get model extra info | |||
| const std::string& get_model_extra_info(); | |||
| //! get device type | |||
| LiteDeviceType get_device_type() const; | |||
| public: | |||
| friend class NetworkHelper; | |||
| private: | |||
| //! update members from the implementation | |||
| void update_from_implement(); | |||
| //! decrypt and parse the model file | |||
| void prase_model(std::shared_ptr<void> model_data, size_t size); | |||
| private: | |||
| bool m_loaded = false; | |||
| Config m_config; | |||
| NetworkIO m_network_io; | |||
| std::unique_ptr<NetworkImplBase> m_impl; | |||
| std::string m_extra_info; | |||
| }; | |||
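| /*! | |||
| * End-to-end sketch of the common synchronous flow; "model.lite" and "data" | |||
| * are placeholder names and the input data is assumed to be prepared by the | |||
| * caller. | |||
| * \code | |||
| *     lite::Network network;                  // default Config / NetworkIO | |||
| *     network.load_model("model.lite"); | |||
| *     auto input = network.get_io_tensor("data"); | |||
| *     float* in_ptr = static_cast<float*>(input->get_memory_ptr()); | |||
| *     // ... fill in_ptr with input->get_tensor_total_size_in_byte() bytes ... | |||
| *     network.forward(); | |||
| *     network.wait(); | |||
| *     auto output = network.get_output_tensor(0); | |||
| *     float* out_ptr = static_cast<float*>(output->get_memory_ptr()); | |||
| * \endcode | |||
| */ | |||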
| /*********************** MGE special network function ***************/ | |||
| class LITE_API Runtime { | |||
| public: | |||
| //! When the device is CPU, this interface sets the to-be-loaded model to | |||
| //! run in multi-thread mode with the given thread number. | |||
| static void set_cpu_threads_number(std::shared_ptr<Network> dst_network, | |||
| size_t nr_threads); | |||
| static size_t get_cpu_threads_number(std::shared_ptr<Network> dst_network); | |||
| //! set threads affinity callback; | |||
| static void set_runtime_thread_affinity( | |||
| std::shared_ptr<Network> network, | |||
| const ThreadAffinityCallback& thread_affinity_callback); | |||
| //! Set the CPU inplace mode when the device is CPU; on some low-computation | |||
| //! or single-core devices this mode gives good performance | |||
| static void set_cpu_inplace_mode(std::shared_ptr<Network> dst_network); | |||
| static bool is_cpu_inplace_mode(std::shared_ptr<Network> dst_network); | |||
| //! Set use tensorrt forward | |||
| static void use_tensorrt(std::shared_ptr<Network> dst_network); | |||
| //! set opr algorithm selection strategy in the network | |||
| //! shared_batch_size: the batch size used by fastrun, | |||
| //! Non-zero value means that fastrun use this batch size | |||
| //! regardless of the batch size of the model. Zero means | |||
| //! fastrun use batch size of the model | |||
| //! binary_equal_between_batch: if the content of each input batch is binary | |||
| //!                             equal, whether the content of each output | |||
| //!                             batch is promised to be equal | |||
| static void set_network_algo_policy( | |||
| std::shared_ptr<Network> dst_network, | |||
| LiteAlgoSelectStrategy strategy, uint32_t shared_batch_size = 0, | |||
| bool binary_equal_between_batch = false); | |||
| //! set workspace_limit for oprs with multiple algorithms; setting a | |||
| //! workspace limit can save memory but may influence the performance | |||
| static void set_network_algo_workspace_limit( | |||
| std::shared_ptr<Network> dst_network, size_t workspace_limit); | |||
| //! set the network memory allocator, the allocator is defined by the user | |||
| static void set_memory_allocator(std::shared_ptr<Network> dst_network, | |||
| std::shared_ptr<Allocator> user_allocator); | |||
| //! share the runtime memory with another network, the weights are not shared | |||
| static void share_runtime_memory_with(std::shared_ptr<Network> dst_network, | |||
| std::shared_ptr<Network> src_network); | |||
| //! Dump input/output values of all internal variables to output | |||
| //! file, in txt format | |||
| static void enable_io_txt_dump(std::shared_ptr<Network> dst_network, | |||
| std::string io_txt_out_file); | |||
| //! Dump input/output values of all internal variables to output | |||
| //! directory, in binary format | |||
| static void enable_io_bin_dump(std::shared_ptr<Network> dst_network, | |||
| std::string io_bin_out_dir); | |||
| //! load a new network which will share weights with src network | |||
| static void shared_weight_with_network( | |||
| std::shared_ptr<Network> dst_network, | |||
| const std::shared_ptr<Network> src_network); | |||
| }; | |||
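| /*! | |||
| * Sketch of the MGE-specific runtime knobs; most of them configure the | |||
| * to-be-loaded model, so they are typically called before load_model. The | |||
| * model path and the numbers are placeholders. | |||
| * \code | |||
| *     auto network = std::make_shared<lite::Network>(); | |||
| *     lite::Runtime::set_cpu_threads_number(network, 4); | |||
| *     lite::Runtime::set_network_algo_workspace_limit(network, 100 * 1024 * 1024); | |||
| *     network->load_model("model.lite"); | |||
| * \endcode | |||
| */ | |||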
| } // namespace lite | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,224 @@ | |||
| /** | |||
| * \file include/lite/tensor.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include "common_enum_c.h" | |||
| #include "macro.h" | |||
| #include <memory> | |||
| #include <unordered_map> | |||
| #include <vector> | |||
| namespace lite { | |||
| /*! | |||
| * \brief the simple layout description | |||
| */ | |||
| struct LITE_API Layout { | |||
| static constexpr uint32_t MAXDIM = 7; | |||
| size_t shapes[MAXDIM]; | |||
| size_t ndim = 0; | |||
| LiteDataType data_type = LiteDataType::LITE_FLOAT; | |||
| //! get the total byte of a layout | |||
| size_t get_elem_size() const; | |||
| //! compare whether the two layouts are equal | |||
| bool operator==(const Layout& other) const; | |||
| }; | |||
| /*! | |||
| * \brief wrapper of the MegEngine Tensor | |||
| * | |||
| * The memory is not allocated directly; when get_memory_ptr() is called, the | |||
| * memory will be allocated in the tensor implementation and will be freed | |||
| * automatically | |||
| * | |||
| * Note: if the tensor memory is set through the reset() interface, the memory | |||
| * is managed by the user and will not be freed by the tensor | |||
| * | |||
| * If the device or layout is not set, when copying from another source tensor | |||
| * its device and layout will be copied from the source tensor | |||
| * | |||
| * if is_pinned_host is set, the storage memory of the tensor is pinned memory; | |||
| * this is used to optimize the H2D or D2H memory copy. If the device or layout | |||
| * is not set, when copying from another device (CUDA) tensor, this tensor | |||
| * will automatically be set to a pinned tensor | |||
| */ | |||
| class LITE_API Tensor { | |||
| class TensorImpl; | |||
| public: | |||
| class TensorImplBase; | |||
| Tensor(); | |||
| Tensor(LiteDeviceType device_type, bool is_pinned_host = false); | |||
| Tensor(LiteDeviceType device_type, const Layout& layout, | |||
| bool is_pinned_host = false); | |||
| Tensor(int device_id, LiteDeviceType device_type, const Layout& layout = {}, | |||
| bool is_pinned_host = false); | |||
| Tensor(int device_id, int stream_id, LiteDeviceType device_type, | |||
| bool is_pinned_host = false); | |||
| Tensor(LiteBackend backend, | |||
| LiteDeviceType device_type = LiteDeviceType::LITE_CPU, | |||
| int device_id = 0, const Layout& layout = {}, | |||
| bool is_pinned_host = false); | |||
| ~Tensor(); | |||
| LiteDeviceType get_device_type() const { return m_device_type; }; | |||
| int get_device_id() const { return m_device_id; }; | |||
| Layout get_layout() const { return m_layout; }; | |||
| bool is_pinned_host() const { return m_is_pinned_host; }; | |||
| //! setting the layout will change the layout and reallocate the memory of the tensor | |||
| void set_layout(const Layout& layout); | |||
| //! get the pointer to the tensor memory, which will trigger memory allocation | |||
| //! in the tensor implementation | |||
| void* get_memory_ptr() const; | |||
| //! get the memory pointer at the offset described by idx | |||
| void* get_memory_ptr(const std::vector<size_t>& idx) const; | |||
| //! get the tensor capacity in byte | |||
| size_t get_tensor_total_size_in_byte() const; | |||
| //! use the user-allocated data to reset the memory of the tensor, the | |||
| //! memory will not be managed by lite; the user should free it later. | |||
| void reset(void* prepared_data, size_t data_length_in_byte); | |||
| //! use the user-allocated data and the corresponding layout to reset the | |||
| //! data and layout of the tensor, the memory will not be managed by lite; | |||
| //! the user should free it later. | |||
| void reset(void* prepared_data, const Layout& layout); | |||
| //! reshape the tensor with new shape, keep the data_type the same | |||
| void reshape(const std::vector<int>& shape); | |||
| //! get a new tensor slice from the origin tensor | |||
| std::shared_ptr<Tensor> slice(const std::vector<size_t>& start, | |||
| const std::vector<size_t>& end, | |||
| const std::vector<size_t>& step = {}); | |||
| //! set the tensor memory with zero | |||
| void fill_zero(); | |||
| //! copy tensor from another tensor | |||
| //! Note: the best way to copy a tensor is to just set the dst device and | |||
| //! leave the layout empty; the dst layout will then be set the same as the | |||
| //! src layout during the copy | |||
| void copy_from(const Tensor& src); | |||
| //! share memory with other tensor | |||
| void share_memory_with(const Tensor& src_tensor); | |||
| //! whether the memory of the tensor is contiguous | |||
| bool is_continue_memory() const; | |||
| //! update the members from the implementation | |||
| void update_from_implement(); | |||
| public: | |||
| friend class TensorHelper; | |||
| private: | |||
| std::shared_ptr<TensorImplBase> m_tensor_impl; | |||
| //! flag whether the storage of the tensor is pinned, this is only used | |||
| //! when the compnode is not in CPU | |||
| bool m_is_pinned_host = false; | |||
| int m_device_id = 0; | |||
| Layout m_layout; | |||
| //! the device of the tensor should not be changed after the tensor has | |||
| //! been constructed | |||
| LiteDeviceType m_device_type = LiteDeviceType::LITE_CPU; | |||
| }; | |||
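| /*! | |||
| * Sketch: wrap caller-owned memory in a Tensor via reset(); the shape values | |||
| * are arbitrary and the buffer stays owned by the caller, so it must outlive | |||
| * the tensor's use of it. | |||
| * \code | |||
| *     lite::Layout layout; | |||
| *     layout.ndim = 2; | |||
| *     layout.shapes[0] = 1; | |||
| *     layout.shapes[1] = 1000; | |||
| *     layout.data_type = LiteDataType::LITE_FLOAT; | |||
| *     std::vector<float> buffer(1000); | |||
| *     lite::Tensor host_tensor(LiteDeviceType::LITE_CPU, layout); | |||
| *     host_tensor.reset(buffer.data(), layout); | |||
| *     // later: another_tensor.copy_from(host_tensor); | |||
| * \endcode | |||
| */ | |||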
| /** | |||
| * \brief a class that can hold data of any type, but does not check whether | |||
| * the visited type is valid | |||
| */ | |||
| class LITE_API LiteAny { | |||
| public: | |||
| LiteAny() = default; | |||
| template <class T> | |||
| LiteAny(T value) : m_holder(new AnyHolder<T>(value)) { | |||
| m_is_string = std::is_same<std::string, T>(); | |||
| } | |||
| LiteAny(const LiteAny& any) { | |||
| m_holder = any.m_holder->clone(); | |||
| m_is_string = any.is_string(); | |||
| } | |||
| LiteAny& operator=(const LiteAny& any) { | |||
| m_holder = any.m_holder->clone(); | |||
| m_is_string = any.is_string(); | |||
| return *this; | |||
| } | |||
| bool is_string() const { return m_is_string; } | |||
| class HolderBase { | |||
| public: | |||
| virtual ~HolderBase() = default; | |||
| virtual std::shared_ptr<HolderBase> clone() = 0; | |||
| virtual size_t type_length() const = 0; | |||
| }; | |||
| template<class T> | |||
| class AnyHolder : public HolderBase { | |||
| public: | |||
| AnyHolder(const T value) : | |||
| m_value(value) { | |||
| } | |||
| virtual std::shared_ptr<HolderBase> clone() override { | |||
| return std::make_shared<AnyHolder>(m_value); | |||
| } | |||
| virtual size_t type_length() const override { return sizeof(T); } | |||
| public: | |||
| T m_value; | |||
| }; | |||
| //! if the types mismatch, it will throw | |||
| void type_missmatch(size_t expect, size_t get) const; | |||
| //! only check the storage type and the visit type length, so it's not safe | |||
| template <class T> | |||
| T unsafe_cast() const { | |||
| if (sizeof(T) != m_holder->type_length()) { | |||
| type_missmatch(m_holder->type_length(), sizeof(T)); | |||
| } | |||
| return static_cast<LiteAny::AnyHolder<T>*>(m_holder.get())->m_value; | |||
| } | |||
| //! only check the storage type and the visit type length, so it's not safe | |||
| void* cast_void_ptr() const { | |||
| return &static_cast<LiteAny::AnyHolder<char>*>(m_holder.get())->m_value; | |||
| } | |||
| private: | |||
| std::shared_ptr<HolderBase> m_holder; | |||
| bool m_is_string = false; | |||
| }; | |||
| /*********************** special tensor function ***************/ | |||
| class LITE_API TensorUtils { | |||
| public: | |||
| //! concat all the input tensors into one along the specified dim, the result | |||
| //! tensor resides on dst_device_id of dst_device; if dst_device is | |||
| //! LITE_DEVICE_DEFAULT, the device will be taken from the first tensor | |||
| static std::shared_ptr<Tensor> concat( | |||
| const std::vector<Tensor>& tensors, int dim, | |||
| LiteDeviceType dst_device = LiteDeviceType::LITE_DEVICE_DEFAULT, | |||
| int dst_device_id = -1); | |||
| }; | |||
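| /*! | |||
| * Sketch: concatenate two host tensors along the batch dimension (dim 0); | |||
| * tensor_a and tensor_b are hypothetical tensors that already agree on the | |||
| * other dimensions. | |||
| * \code | |||
| *     std::vector<lite::Tensor> parts = {tensor_a, tensor_b}; | |||
| *     auto merged = lite::TensorUtils::concat(parts, 0); | |||
| *     // merged resides on the device of tensor_a (LITE_DEVICE_DEFAULT) | |||
| * \endcode | |||
| */ | |||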
| } // namespace lite | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,169 @@ | |||
| /** | |||
| * \file lite-c/include/lite-c/global_c.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #ifndef LITE_C_GLOBAL_H_ | |||
| #define LITE_C_GLOBAL_H_ | |||
| #include "macro.h" | |||
| #include "network_c.h" | |||
| #ifdef __cplusplus | |||
| extern "C" { | |||
| #endif | |||
| /*! \brief Get version | |||
| */ | |||
| LITE_API int LITE_get_version(int* major, int* minor, int* patch); | |||
| /*! \brief Get the last error message. | |||
| * \return the message pointer | |||
| */ | |||
| LITE_API const char* LITE_get_last_error(); | |||
| /*! \brief Get device count | |||
| * \param[in] device_type device type | |||
| * \return the device count | |||
| */ | |||
| LITE_API int LITE_get_device_count(LiteDeviceType device_type, size_t* count); | |||
| /*! \brief try to coalesce all free memory in megengine | |||
| */ | |||
| LITE_API int LITE_try_coalesce_all_free_memory(); | |||
| /** | |||
| * \brief Model decryption function | |||
| * | |||
| * \param[in] input_data the model memory to be decrypted | |||
| * \param[in] input_size the size of the model memory in bytes | |||
| * \param[in] key_data decryption key data | |||
| * \param[in] key_size the size of the decryption key data | |||
| * \param[out] output_data the decrypted data; if output_data is nullptr, only | |||
| * the length of the output memory is queried, otherwise the decrypted data is | |||
| * written to output_data | |||
| * \return the size of the decrypted data | |||
| */ | |||
| typedef size_t (*LiteDecryptionFunc)(const void* input_data, size_t input_size, | |||
| const uint8_t* key_data, size_t key_size, | |||
| const void* output_data); | |||
| /** | |||
| * \brief Model information parse function | |||
| * | |||
| * \param[in] info_data is the information memory | |||
| * \param[in] info_size the size of the information memory | |||
| * \param[in] model_name the model name, used to check whether the | |||
| * information matches the model | |||
| * \param[in] config the model config, ParseInfoFunc can fill it with the | |||
| * information in json, the config will influence Network loading later | |||
| * \param[in] network_io the model IO, ParseInfoFunc can fill it with the | |||
| * information in json, the networkio will influence Network forwarding later | |||
| * \param[in] device_id the address to store device_id, default 0 | |||
| * \param[in] nr_threads the address to store nr_threads, default 1 | |||
| * \param[in] is_inplace_model the address to store is_cpu_inplace_mode, | |||
| * default false | |||
| * \param[in] use_tensorrt the address to store use_tensorrt, default false | |||
| */ | |||
| typedef int (*LiteParseInfoFunc)(const void* info_data, size_t info_size, | |||
| const char* model_name, LiteConfig* config, | |||
| LiteNetworkIO* network_io, int* device_id, | |||
| size_t* nr_threads, int* is_cpu_inplace_mode, | |||
| int* use_tensorrt); | |||
| /** | |||
| * \brief register a custom decryption method and key to lite. | |||
| * | |||
| * \param[in] decrypt_name the name of the decryption, which will act as the | |||
| * hash key to find the decryption method. | |||
| * | |||
| * \param[in] func the decryption function, which will decrypt the model with | |||
| * the registered key and return a vector that contains the decrypted model. | |||
| * \param[in] key_data the decryption key of the method | |||
| * \param[in] key_size the size of decryption key | |||
| */ | |||
| LITE_API int LITE_register_decryption_and_key(const char* decrypt_name, | |||
| const LiteDecryptionFunc func, | |||
| const uint8_t* key_data, | |||
| size_t key_size); | |||
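| /** | |||
| * Sketch of a trivial XOR "decryption" to show the calling convention: the | |||
| * callback is invoked once with output_data == NULL to query the size and once | |||
| * again to produce the bytes. The scheme, name and key below are placeholders. | |||
| * \code | |||
| *     static size_t xor_decrypt(const void* input, size_t input_size, | |||
| *                               const uint8_t* key, size_t key_size, | |||
| *                               const void* output) { | |||
| *         if (output) { | |||
| *             uint8_t* dst = (uint8_t*)output; | |||
| *             const uint8_t* src = (const uint8_t*)input; | |||
| *             for (size_t i = 0; i < input_size; ++i) | |||
| *                 dst[i] = src[i] ^ key[i % key_size]; | |||
| *         } | |||
| *         return input_size;  // the decrypted size equals the input size here | |||
| *     } | |||
| *     // LITE_register_decryption_and_key("xor", xor_decrypt, key, key_len); | |||
| * \endcode | |||
| */ | |||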
| /** | |||
| * \brief update decryption function or key of a custom decryption method. | |||
| * | |||
| * \param[in] decrypt_name the name of the decryption, which will act as the | |||
| * hash key to find the decryption method. | |||
| * | |||
| * \param[in] func the decryption function, which will decrypt the model with | |||
| * the registered key and return a vector that contains the decrypted model. | |||
| * If func is nullptr, it will not be updated. | |||
| * | |||
| * \param[in] key the decryption key of the method, if the size of key is zero, | |||
| * it will not be updated | |||
| */ | |||
| LITE_API int LITE_update_decryption_or_key(const char* decrypt_name, | |||
| const LiteDecryptionFunc func, | |||
| const uint8_t* key_data, | |||
| size_t key_size); | |||
| /** | |||
| * \brief register a custom parser function to lite. | |||
| * | |||
| * \param[in] info_type the name of the parser function, which will act as the | |||
| * hash key to find the parser method. | |||
| * | |||
| * \param[in] parse_func the parser function, which will parse the given | |||
| * information and modify the Network Config and IO. | |||
| * | |||
| */ | |||
| LITE_API int LITE_register_parse_info_func(const char* info_type, | |||
| const LiteParseInfoFunc parse_func); | |||
| /*! | |||
| * \brief Set the loader library path to lite | |||
| * \param[in] loader_path the file path of the loader library | |||
| */ | |||
| LITE_API int LITE_set_loader_lib_path(const char* loader_path); | |||
| /*! | |||
| * \brief Set the algo policy cache file for CPU/CUDA ... | |||
| * \param[in] cache_path the file path which stores the cache | |||
| * \param[in] always_sync whether to sync the cache to file whenever it is updated | |||
| */ | |||
| LITE_API int LITE_set_persistent_cache(const char* cache_path, int always_sync); | |||
| /*! | |||
| * \brief Set the TensorRT engine cache file | |||
| * \param[in] cache_path the file path which stores the cache | |||
| */ | |||
| LITE_API int LITE_set_tensor_rt_cache(const char* cache_path); | |||
| /*! \brief Set the current log level. | |||
| * \param[in] level The new log level | |||
| */ | |||
| LITE_API int LITE_set_log_level(LiteLogLevel level); | |||
| /*! \brief Get the current log level. | |||
| * \param[in] level The pointer to log level | |||
| */ | |||
| LITE_API int LITE_get_log_level(LiteLogLevel* level); | |||
| /*! | |||
| * \brief dump the algo policy cache to file; if the network is set to profile | |||
| * when forwarding, the profiled algo policy will be dumped to this file | |||
| * \param[in] cache_path the file path which stores the cache | |||
| */ | |||
| LITE_API int LITE_dump_persistent_cache(const char* cache_path); | |||
| /*! | |||
| * \brief dump the tensorrt policy cache to file | |||
| */ | |||
| LITE_API int LITE_dump_tensor_rt_cache(); | |||
| #ifdef __cplusplus | |||
| } | |||
| #endif | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,525 @@ | |||
| /** | |||
| * \file lite-c/include/lite-c/network_c.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #ifndef LITE_C_NETWORK_H_ | |||
| #define LITE_C_NETWORK_H_ | |||
| #include "tensor_c.h" | |||
| #ifdef __cplusplus | |||
| extern "C" { | |||
| #endif | |||
| /*! | |||
| * \brief the inference options which will be translated to megengine | |||
| * | |||
| * \param weight_preprocess the option which optimizes the inference performance | |||
| * by preprocessing the const weights | |||
| * | |||
| * \param fuse_preprocess fuse preprocess pattern, like astype + pad_channel + | |||
| * dimshuffle | |||
| * | |||
| * \param fake_next_exec whether only to perform non-computing tasks (like | |||
| * memory allocation and queue initialization) for next exec. This would be | |||
| * reset to false when the graph is executed. | |||
| * | |||
| * \param var_sanity_check_first_run Disable var sanity check on the first run. | |||
| * Var sanity check is enabled on the first-time execution by default, and can | |||
| * be used to find some potential memory access errors in the operator | |||
| * implementation. | |||
| * | |||
| * \param const_shape This can be used to reduce memory usage since some | |||
| * static inference data structures can be omitted. | |||
| * | |||
| * \param force_dynamic_alloc force dynamic memory alloc for all vars | |||
| * | |||
| * \param force_output_dynamic_alloc force dynamic memory alloc for output vars | |||
| * which are used as CallbackCaller input when call compile() function | |||
| * | |||
| * \param no_profiling_on_shape_change do not re-profile to select best impl | |||
| * algo when input shape changes (use previous algo) | |||
| * | |||
| * \param jit_level Execute supported operators with JIT (support MLIR, | |||
| * NVRTC). Can only be used on Nvidia GPUs, this value indicates JIT level: | |||
| * 1 for basic elemwise opr; | |||
| * 2 for including reduce operator | |||
| * | |||
| * \param comp_node_seq_record_level flag to optimize the inference performance | |||
| * by recording the kernel tasks in the first run; afterwards inference only | |||
| * needs to execute the recorded tasks. | |||
| * level = 0 means normal inference, | |||
| * level = 1 means inference with record, | |||
| * level = 2 means inference with record and free the extra memory | |||
| * | |||
| * \param graph_opt_level optimization level: | |||
| * 0: disable | |||
| * 1: level-1: inplace arith transformations during graph | |||
| * construction | |||
| * 2: level-2: level-1, plus global optimization before graph | |||
| * compiling | |||
| * 3: also enable JIT | |||
| * <0: corresponding level, with result check for debug | |||
| * | |||
| * \param async_exec_level exec: dispatch on separate threads for different | |||
| * comp_node. | |||
| * 0: do not perform async dispatch | |||
| * 1: dispatch async if there is more than one comp node, with a limited queue | |||
| * mask 0b10: async if there are multiple comp nodes | |||
| * mask 0b100: always async | |||
| */ | |||
| typedef struct Options { | |||
| int weight_preprocess; | |||
| int fuse_preprocess; | |||
| int fake_next_exec; | |||
| int var_sanity_check_first_run; | |||
| int const_shape; | |||
| int force_dynamic_alloc; | |||
| int force_output_dynamic_alloc; | |||
| int no_profiling_on_shape_change; | |||
| int jit_level; | |||
| int comp_node_seq_record_level; | |||
| int graph_opt_level; | |||
| int async_exec_level; | |||
| //! layout transform options | |||
| int enable_nchw44; | |||
| int enable_nchw44_dot; | |||
| int enable_nchw88; | |||
| int enable_nhwcd4; | |||
| int enable_nchw4; | |||
| int enable_nchw32; | |||
| int enable_nchw64; | |||
| } LiteOptions; | |||
| //! define a default Options | |||
| extern LITE_API const LiteOptions default_option; | |||
| /*! | |||
| * \brief Configuration when load and compile the graph | |||
| * | |||
| * \param bare_model_cryption_name the name of the cryption method of a bare | |||
| * model (a bare model has no json information packed inside) | |||
| * | |||
| * \param has_compression flag whether the model is compressed, the compression | |||
| * method will be read from the model | |||
| */ | |||
| typedef struct LiteConfig { | |||
| int has_compression; | |||
| int device_id; | |||
| LiteDeviceType device_type; | |||
| LiteBackend backend; | |||
| const char* bare_model_cryption_name; | |||
| LiteOptions options; | |||
| } LiteConfig; | |||
| //! get default config | |||
| LITE_API LiteConfig* default_config(); | |||
| /*! | |||
| * \brief configuration of one network input or output item | |||
| * | |||
| */ | |||
| typedef struct LiteIO { | |||
| //! the tensor name in the graph corresponding to the IO | |||
| const char* name; | |||
| //! Used to mark where the input tensor comes from and where the output | |||
| //! tensor will be copied to: if is_host is true, the input comes from the | |||
| //! host and the output is copied to the host, otherwise the device. Sometimes | |||
| //! the input comes from the device and the output need not be copied to the | |||
| //! host. Default is true. | |||
| int is_host; | |||
| //! The IO type, it can be SHAPE or VALUE; when SHAPE is set, the value of the | |||
| //! input or output tensor is invalid, only the shape will be set. Default is VALUE | |||
| LiteIOType io_type; | |||
| //! The layout configured by the user. If another layout is set before forward, | |||
| //! or obtained after forward, this layout will be bypassed; if no other layout | |||
| //! is set before forward, this layout will take effect; if this layout is not | |||
| //! set, the model forwards with its original layout. For outputs, it is used | |||
| //! for checking. | |||
| LiteLayout config_layout; | |||
| } LiteIO; | |||
| //! define a default IO | |||
| extern LITE_API const LiteIO default_io; | |||
| /*! | |||
| * \brief the input and output information used when loading the network; | |||
| * the NetworkIO will remain in the network until the network is destroyed | |||
| */ | |||
| typedef struct LiteNetworkIO { | |||
| LiteIO* inputs; | |||
| LiteIO* outputs; | |||
| size_t input_size;  //! the number of IOs in inputs | |||
| size_t output_size; //! the number of IOs in outputs | |||
| } LiteNetworkIO; | |||
| //! get default NetworkIO | |||
| LITE_API LiteNetworkIO* default_network_io(); | |||
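| /*! | |||
| * Sketch of building a LiteNetworkIO by hand; "data" is a placeholder tensor | |||
| * name, and default_io / default_network_io() supply the remaining fields. | |||
| * \code | |||
| *     LiteIO input = default_io; | |||
| *     input.name = "data"; | |||
| *     input.is_host = 1; | |||
| * | |||
| *     LiteNetworkIO io = *default_network_io(); | |||
| *     io.inputs = &input; | |||
| *     io.input_size = 1; | |||
| *     // pass io to LITE_make_network together with a LiteConfig | |||
| * \endcode | |||
| */ | |||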
| /*! | |||
| * \brief A user-implemented allocator function | |||
| */ | |||
| //! allocate memory of size in the given device with the given align | |||
| typedef void* (*LiteAllocate)(LiteDeviceType device_type, int device_id, | |||
| size_t size, size_t align); | |||
| //! free the memory pointed by ptr in the given device | |||
| typedef void (*LiteFree)(LiteDeviceType device_type, int device_id, void* ptr); | |||
| /*! | |||
| * \brief the thread affinity callback type | |||
| * \param thread_id thread_id is a number from 0 to (nr_threads - 1); the | |||
| * thread with thread_id of (nr_threads - 1) is the main worker thread. | |||
| */ | |||
| typedef int (*LiteThreadAffinityCallback)(int thread_id); | |||
| typedef int (*LiteAsyncCallback)(); | |||
| /*! | |||
| * \brief the start/finish callback function | |||
| * \param the array of user-configured IOs, the corresponding real input or | |||
| * output tensors, and their count. | |||
| */ | |||
| typedef int (*LiteStartCallback)(const LiteIO* inputs, | |||
| const LiteTensor* input_tensors, size_t size); | |||
| typedef int (*LiteFinishCallback)(const LiteIO* outputs, | |||
| const LiteTensor* output_tensors, | |||
| size_t size); | |||
| /*! | |||
| * \brief The network is constructed from a model; it implements model loading, | |||
| * initialization and forward, and can display some model information | |||
| */ | |||
| typedef void* LiteNetwork; | |||
| /** | |||
| * \brief Create a lite Network object with default config and networkIO. | |||
| * \param[out] network The network pointer | |||
| * \return int if the return is not zero, error happened, the error message | |||
| * can get by LITE_get_last_error | |||
| */ | |||
| LITE_API int LITE_make_default_network(LiteNetwork* network); | |||
| /** | |||
| * \brief Create a lite Network object from the given config and networkIO. | |||
| * \param[in] config The configuration to create the network | |||
| * \param[in] network_io The IO configuration to create the network | |||
| * \param[out] network The network pointer | |||
| */ | |||
| LITE_API int LITE_make_network(LiteNetwork* network, const LiteConfig config, | |||
| const LiteNetworkIO network_io); | |||
| /** | |||
| * \brief Create a lite Network object from the given config and networkIO. | |||
| * \param[in] config The configuration to create the network | |||
| * \param[out] network The network pointer | |||
| */ | |||
| LITE_API int LITE_make_network_config(LiteNetwork* network, const LiteConfig config); | |||
| /** | |||
| * \brief load the model to the network from memory | |||
| * \param[in] model_mem The model in memory | |||
| * \param[in] size The size of the model memory | |||
| * \param[out] network The network to be load model in | |||
| */ | |||
| LITE_API int LITE_load_model_from_mem(LiteNetwork network, void* model_mem, | |||
| size_t size); | |||
| /** | |||
| * \brief load the model to the network from the given path | |||
| * \param[in] model_path The model path | |||
| * \param[out] network The network to be load model in | |||
| */ | |||
| LITE_API int LITE_load_model_from_path(LiteNetwork network, | |||
| const char* model_path); | |||
| /** | |||
| * \brief load a new network which will share weights with src network | |||
| * \param[in] origin_network The origin network pointer | |||
| * \param[out] network The network pointer | |||
| */ | |||
| LITE_API int LITE_shared_weight_with_network(LiteNetwork dst_network, | |||
| const LiteNetwork src_network); | |||
| /** | |||
| * \brief Destroy a lite network object. | |||
| * \param[in] network The network pointer | |||
| * \return int if the return is not zero, error happened, the error message | |||
| * can get by LITE_get_last_error | |||
| */ | |||
| LITE_API int LITE_destroy_network(LiteNetwork network); | |||
| /** | |||
| * \brief forward the network with filled input data and fill the output data | |||
| * to the output tensor | |||
| * \param[in] network The loaded model | |||
| */ | |||
| LITE_API int LITE_forward(const LiteNetwork network); | |||
| /** | |||
| * \brief wait until forward finishes in sync mode | |||
| * \param[in] network The loaded model | |||
| */ | |||
| LITE_API int LITE_wait(const LiteNetwork network); | |||
| /** | |||
| * \brief get the network input and output tensor, the layout of which is | |||
| * obtained from the model | |||
| * \param[in] network The loaded model | |||
| * \param[in] io_name The input or output name | |||
| * \param[in] phase The tensor phase | |||
| * \param[out] tensor The IO tensor get from the network | |||
| */ | |||
| LITE_API int LITE_get_io_tensor(LiteNetwork network, const char* io_name, | |||
| LiteTensorPhase phase, LiteTensor* tensor); | |||
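| /* | |||
|  * Sketch of the basic C flow (error checks elided; "model.lite" and "data" are | |||
|  * placeholders, and LITE_IO is assumed to be the default phase constant from | |||
|  * common_enum_c.h). Every call returns 0 on success, otherwise consult | |||
|  * LITE_get_last_error(). | |||
|  * | |||
|  *     LiteNetwork network; | |||
|  *     LITE_make_default_network(&network); | |||
|  *     LITE_load_model_from_path(network, "model.lite"); | |||
|  *     LiteTensor input; | |||
|  *     LITE_get_io_tensor(network, "data", LITE_IO, &input); | |||
|  *     void* in_ptr; | |||
|  *     LITE_get_tensor_memory(input, &in_ptr); | |||
|  *     // ... fill in_ptr ... | |||
|  *     LITE_forward(network); | |||
|  *     LITE_wait(network); | |||
|  *     LITE_destroy_network(network); | |||
|  */ | |||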
| /** | |||
| * \brief get the input tensor name in the order in loaded model | |||
| * \param[in] network The loaded model | |||
| * \param[in] index The index of input tensor | |||
| * \param[out] name The input tensor name | |||
| */ | |||
| LITE_API int LITE_get_input_name(const LiteNetwork network, size_t index, | |||
| const char** name); | |||
| /** | |||
| * \brief get the output tensor name in the order in loaded model | |||
| * \param[in] network The loaded model | |||
| * \param[in] index The index of output tensor | |||
| * \param[out] name The output tensor name | |||
| */ | |||
| LITE_API int LITE_get_output_name(const LiteNetwork network, size_t index, | |||
| const char** name); | |||
| /** | |||
| * \brief get all the input tensor name in the order in loaded model | |||
| * \param[in] network The loaded model | |||
| * \param[in] size The number of the input tensor | |||
| * \param[out] name The input tensor names | |||
| */ | |||
| LITE_API int LITE_get_all_input_name(const LiteNetwork network, size_t* size, | |||
| const char** name); | |||
| /** | |||
| * \brief get all the output tensor name in the order in loaded model | |||
| * \param[in] network The loaded model | |||
| * \param[in] size The number of output tensor | |||
| * \param[out] name The output tensor name | |||
| */ | |||
| LITE_API int LITE_get_all_output_name(const LiteNetwork network, size_t* size, | |||
| const char** name); | |||
| /** | |||
| * \brief get whether the model is running in cpu inplace mode | |||
| * \param[in] network The loaded model | |||
| * \param[out] is_cpu_inplace_mode whether is in cpu inplace mode | |||
| */ | |||
| LITE_API int LITE_is_cpu_inplace_mode(const LiteNetwork network, | |||
| int* is_cpu_inplace_mode); | |||
| /** | |||
| * \brief get the number of thread the network will run with | |||
| * \param[in] network The loaded model | |||
| * \param[out] nr_threads the thread number when the network running | |||
| */ | |||
| LITE_API int LITE_get_cpu_threads_number(const LiteNetwork network, | |||
| size_t* nr_threads); | |||
| /** | |||
| * \brief get the device id the network will run with | |||
| * \param[in] network The loaded model | |||
| * \param[out] device_id the device id of the network will run | |||
| */ | |||
| LITE_API int LITE_get_device_id(const LiteNetwork network, int* device_id); | |||
| /** | |||
| * \brief get the stream id the network will run with | |||
| * \param[in] network The loaded model | |||
| * \param[out] stream_id the stream id of the network will run | |||
| */ | |||
| LITE_API int LITE_get_stream_id(const LiteNetwork network, int* stream_id); | |||
| /** | |||
| * \brief get the device type the network will run with | |||
| * \param[in] network The loaded model | |||
| * \param[out] device_type the device type of the network will run | |||
| */ | |||
| LITE_API int LITE_get_device_type(const LiteNetwork network, | |||
| LiteDeviceType* device_type); | |||
| /** | |||
| * \brief get the model extra info in json format | |||
| * \param[in] network The loaded model | |||
| * \param[out] info the json format memory | |||
| * \param[out] info_size the json format memory size | |||
| */ | |||
| LITE_API int LITE_get_model_extra_info(const LiteNetwork network, | |||
| const char** info, int* info_size); | |||
| /** | |||
| * \brief Set the CPU inplace mode when the device is CPU; on some | |||
| * low-computation or single-core devices this mode gives good performance | |||
| * \param[in] network The loaded model | |||
| */ | |||
| LITE_API int LITE_set_cpu_inplace_mode(LiteNetwork network); | |||
| /** | |||
| * \brief When the device is CPU, this interface sets the to-be-loaded model to | |||
| * run in multi-thread mode with the given thread number. | |||
| * \param[in] network The loaded model | |||
| * \param[in] nr_threads The threads number | |||
| */ | |||
| LITE_API int LITE_set_cpu_threads_number(LiteNetwork network, | |||
| size_t nr_threads); | |||
| /** | |||
| * \brief set device id, default device id = 0 | |||
| * \param[in] network The loaded model | |||
| * \param[in] device_id The device id to be set | |||
| */ | |||
| LITE_API int LITE_set_device_id(LiteNetwork network, int device_id); | |||
| /** | |||
| * \brief set stream id, default stream id = 0 | |||
| * \param[in] network The loaded model | |||
| * \param[in] stream_id The stream id to be set | |||
| */ | |||
| LITE_API int LITE_set_stream_id(LiteNetwork network, int stream_id); | |||
| /** | |||
| * \brief enable tensorrt | |||
| * \param[in] network The loaded model | |||
| */ | |||
| LITE_API int LITE_use_tensorrt(LiteNetwork network); | |||
| /** | |||
| * \brief set opr algorithm selection strategy in the network | |||
| * \param[in] network The loaded model | |||
| * \param[in] select_strategy The operator algorithm selection strategy | |||
| */ | |||
| LITE_API int LITE_set_network_algo_policy(LiteNetwork network, | |||
| LiteAlgoSelectStrategy strategy); | |||
| /** | |||
| * \brief set opr algorithm selection strategy in the network | |||
| * \param[in] network The loaded model | |||
| * \param[in] shared_batch_size: the batch size used by fastrun, | |||
| * Non-zero value means that fastrun use this batch size | |||
| * regardless of the batch size of the model. Zero means | |||
| * fastrun use batch size of the model | |||
| * \param[in] binary_equal_between_batch: if the content of each input batch is | |||
| * binary equal, whether the content of each output batch is | |||
| * promised to be equal | |||
| */ | |||
| LITE_API int LITE_set_network_algo_fastrun_config( | |||
| LiteNetwork network, unsigned int shared_batch_size, | |||
| int binary_equal_between_batch); | |||
| /** | |||
| * \brief set workspace_limit for oprs with multiple algorithms; setting a | |||
| * workspace limit can save memory but may influence the performance | |||
| * \param[in] network The loaded model | |||
| * \param[in] workspace_limit The operator algorithm workspace limit | |||
| */ | |||
| LITE_API int LITE_set_network_algo_workspace_limit(LiteNetwork network, | |||
| size_t workspace_limit); | |||
| /** | |||
| * \brief set the network forward in async mode and set the async callback | |||
| * function | |||
| * \param[in] network The loaded model | |||
| * \param[in] async_callback when the network finishes forwarding, the callback | |||
| * will be called | |||
| */ | |||
| LITE_API int LITE_set_async_callback(LiteNetwork network, | |||
| const LiteAsyncCallback async_callback); | |||
| /** | |||
| * \brief set the start forward callback function, which will be executed before | |||
| * forward; this can be used to check network inputs or dump model inputs | |||
| * for debug | |||
| * \param[in] network The loaded model | |||
| * \param[in] start_callback when the network starts forwarding, the callback | |||
| * will be called | |||
| */ | |||
| LITE_API int LITE_set_start_callback(LiteNetwork network, | |||
| const LiteStartCallback start_callback); | |||
| /** | |||
| * \brief set the finish forward callback function, which will be executed after | |||
| * forward; this can be used to dump model outputs for debug | |||
| * \param[in] network The loaded model | |||
| * \param[in] finish_callback when the network finishes forwarding, the callback | |||
| * will be called | |||
| */ | |||
| LITE_API int LITE_set_finish_callback(LiteNetwork network, | |||
| const LiteFinishCallback finish_callback); | |||
| /** | |||
| * \brief set threads affinity callback | |||
| * \param[in] network The loaded model | |||
| * \param[in] thread_affinity_callback | |||
| */ | |||
| LITE_API int LITE_set_runtime_thread_affinity( | |||
| LiteNetwork network, | |||
| const LiteThreadAffinityCallback thread_affinity_callback); | |||
| /** | |||
| * \brief set the network memory allocator, the allocator is defined by the user | |||
| * \param[in] network The loaded model | |||
| * \param[in] allocate_fun The allocate function of the user defined allocator | |||
| * \param[in] free_fun The free function of the user defined allocator | |||
| */ | |||
| LITE_API int LITE_set_memory_allocator(LiteNetwork network, | |||
| const LiteAllocate allocate_fun, | |||
| const LiteFree free_fun); | |||
| /** | |||
| * \brief the dst_network shares the runtime memory with src_network | |||
| * \param[in] src_network The source network | |||
| * \param[in] dst_network The dst network to shared memory with src_network | |||
| */ | |||
| LITE_API int LITE_share_runtime_memroy(LiteNetwork src_network, | |||
| LiteNetwork dst_network); | |||
| /** | |||
| * \brief enable profile the network, a JSON format file will be generated | |||
| * \param[in] network The loaded model | |||
| * \param[in] profile_json_file_path The profile result file path | |||
| */ | |||
| LITE_API int LITE_enable_profile_performance( | |||
| LiteNetwork network, const char* profile_json_file_path); | |||
| /** | |||
| * \brief Dump input/output values of all internal variables to output file, | |||
| * in text format | |||
| * \param[in] network The loaded model | |||
| * \param[in] io_txt_out_file The dumped txt file name | |||
| */ | |||
| LITE_API int LITE_enable_io_txt_dump(LiteNetwork network, | |||
| const char* io_txt_out_file); | |||
| /** | |||
| * \brief Dump input/output values of all internal variables to output | |||
| * directory, in binary format | |||
| * \param[in] network The loaded model | |||
| * \param[in] io_bin_out_dir The dumped bin file directory | |||
| */ | |||
| LITE_API int LITE_enable_io_bin_dump(LiteNetwork network, | |||
| const char* io_bin_out_dir); | |||
| #ifdef __cplusplus | |||
| } | |||
| #endif | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,251 @@ | |||
| /** | |||
| * \file lite-c/include/lite-c/tensor_c.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #ifndef LITE_TENSOR_C_H_ | |||
| #define LITE_TENSOR_C_H_ | |||
| #include "common_enum_c.h" | |||
| #include "macro.h" | |||
| #ifdef __cplusplus | |||
| extern "C" { | |||
| #endif | |||
| #include "stddef.h" | |||
| #include "stdint.h" | |||
| #define LAYOUT_MAX_DIM (7) | |||
| /*! | |||
| * \brief the simple layout description | |||
| */ | |||
| typedef struct LiteLayout { | |||
| size_t shapes[LAYOUT_MAX_DIM]; | |||
| size_t ndim; | |||
| LiteDataType data_type; | |||
| } LiteLayout; | |||
| //! define a default LiteLayout | |||
| extern LITE_API const LiteLayout default_layout; | |||
| /*! | |||
| * \brief wrapper of the MegEngine Tensor | |||
| * | |||
| * if is_pinned_host is set, the storage memory of the tensor is pinned memory; | |||
| * this is used to optimize the H2D or D2H memory copy. If the device or layout | |||
| * is not set, when copying from another device (CUDA, OpenCL) tensor, this | |||
| * tensor will automatically be set to a pinned tensor | |||
| */ | |||
| typedef struct LiteTensorDesc { | |||
| //! flag whether the storage of the tensor is pinned, this is only used when | |||
| //! the compnode is not in CPU | |||
| int is_pinned_host; | |||
| //! the layout of the tensor | |||
| LiteLayout layout; | |||
| //! the device of the tensor should not be changed after the tensor has | |||
| //! constructed | |||
| LiteDeviceType device_type; | |||
| //! device id of the tensor | |||
| int device_id; | |||
| } LiteTensorDesc; | |||
| //! define a default TensorDesc | |||
| extern LITE_API const LiteTensorDesc default_desc; | |||
| /*! | |||
| * \brief The pointer to a Lite Tensor object | |||
| */ | |||
| typedef void* LiteTensor; | |||
| /** | |||
| * \brief Create a lite tensor object from the given describe. | |||
| * \param[in] tensor_describe The description to create the Tensor | |||
| * \param[out] tensor The Tensor pointer | |||
| * \return int if the return is not zero, error happened, the error message | |||
| * can get by LITE_get_last_error | |||
| */ | |||
| LITE_API int LITE_make_tensor(const LiteTensorDesc tensor_describe, | |||
| LiteTensor* tensor); | |||
| /** | |||
| * \brief Destroy a lite tensor object. | |||
| * \param[in] tensor The Tensor pointer | |||
| * \return int if the return is not zero, error happened, the error message | |||
| * can get by LITE_get_last_error | |||
| */ | |||
| LITE_API int LITE_destroy_tensor(LiteTensor tensor); | |||
| /** | |||
| * \brief change the layout of a Tensor object. | |||
| * \param[in] tensor The Tensor | |||
| * \param[in] layout The Layout to be set to the tensor | |||
| */ | |||
| LITE_API int LITE_set_tensor_layout(LiteTensor tensor, const LiteLayout layout); | |||
| /** | |||
| * \brief use the user allocated data to reset the memory of the tensor, the | |||
| * memory will not be managed by the lite, later, the user should delete | |||
| * it. | |||
| * \param[in] tensor The Tensor | |||
| * \param[in] prepared_data The allocated memory which satisfy the Tensor | |||
| * \param[in] data_length_in_byte The length of the allocated memory | |||
| * layout | |||
| */ | |||
| LITE_API int LITE_reset_tensor_memory(LiteTensor tensor, void* prepared_data, | |||
| size_t data_length_in_byte); | |||
| /** | |||
| * \brief use the user allocated data and corresponding layout to reset the | |||
| * data and layout of the tensor, the memory will not be managed by lite, later, | |||
| * the user should delete it. | |||
| * \param[in] tensor The Tensor | |||
| * \param[in] layout The Layout to be set to the tensor | |||
| * \param[in] prepared_data The allocated memory which satisfy the layout to be | |||
| * set | |||
| */ | |||
| LITE_API int LITE_reset_tensor(LiteTensor tensor, const LiteLayout layout, | |||
| void* prepared_data); | |||
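| /* | |||
|  * Sketch: wrap a caller-owned buffer with LITE_reset_tensor; the shape values | |||
|  * are arbitrary, LITE_FLOAT is assumed to be the float constant from | |||
|  * common_enum_c.h, and the buffer must stay valid while the tensor uses it. | |||
|  * | |||
|  *     LiteLayout layout = default_layout; | |||
|  *     layout.ndim = 2; | |||
|  *     layout.shapes[0] = 1; | |||
|  *     layout.shapes[1] = 1000; | |||
|  *     layout.data_type = LITE_FLOAT; | |||
|  * | |||
|  *     LiteTensorDesc desc = default_desc; | |||
|  *     desc.layout = layout; | |||
|  *     LiteTensor tensor; | |||
|  *     LITE_make_tensor(desc, &tensor); | |||
|  *     static float buffer[1000]; | |||
|  *     LITE_reset_tensor(tensor, layout, buffer); | |||
|  *     // ... use the tensor, then LITE_destroy_tensor(tensor); the buffer stays owned by the caller | |||
|  */ | |||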
| /** | |||
| * \brief reshape a tensor without changing its memory; the total number of | |||
| * elements in the reshaped tensor must equal that of the origin tensor, and | |||
| * the input shape must contain at most one -1 to flag the dimension that can | |||
| * be deduced automatically. | |||
| * \param[in] tensor The Tensor to be reshape | |||
| * \param[in] shape the user input shape | |||
| * \param[in] size the number of data in shape, | |||
| */ | |||
| LITE_API int LITE_tensor_reshape(LiteTensor tensor, const int* shape, int size); | |||
| /** | |||
| * \brief slice a tensor with input param | |||
| * \param[in] tensor The Tensor to be slice | |||
| * \param[in] start start index of every axis to be sliced | |||
| * \param[in] end end index of every axis to be sliced | |||
| * \param[in] step step of every axis to be sliced, if nullptr, the step will | |||
| * be 1 | |||
| * \param[in] size the number of axes to be sliced | |||
| * \param[out] slice_tensor the result tensor sliced from the origin tensor | |||
| */ | |||
| LITE_API int LITE_tensor_slice(const LiteTensor tensor, const size_t* start, | |||
| const size_t* end, const size_t* step, | |||
| size_t size, LiteTensor* slice_tensor); | |||
| /** | |||
| * \brief fill zero to the tensor | |||
| * \param[in] tensor The Tensor to be memset | |||
| */ | |||
| LITE_API int LITE_tensor_fill_zero(LiteTensor tensor); | |||
| /** | |||
| * \brief copy tensor from another tensor | |||
| * \param[out] dst_tensor The Tensor to copy into | |||
| * \param[in] src_tensor The Tensor to copy from | |||
| */ | |||
| LITE_API int LITE_tensor_copy(LiteTensor dst_tensor, | |||
| const LiteTensor src_tensor); | |||
| /** | |||
| * \brief share memory from another tensor | |||
| * \param[out] dst_tensor The Tensor to share into | |||
| * \param[in] src_tensor The Tensor to be shared | |||
| */ | |||
| LITE_API int LITE_tensor_share_memory_with(LiteTensor dst_tensor, | |||
| const LiteTensor src_tensor); | |||
| /** | |||
| * \brief get the memory pointer of a Tensor object. | |||
| * \param[in] tensor The input Tensor | |||
| * \param[out] data a pointer to void pointer | |||
| */ | |||
| LITE_API int LITE_get_tensor_memory(const LiteTensor tensor, void** data); | |||
| /** | |||
| * \brief get the memory pointer of a Tensor object. | |||
| * \param[in] tensor The input Tensor | |||
| * \param[in] index The coordinate in the tensor | |||
| * \param[in] size The length of the coordinate | |||
| * \param[out] data a pointer to void pointer | |||
| */ | |||
| LITE_API int LITE_get_tensor_memory_with_index(const LiteTensor tensor, | |||
| const size_t* index, size_t size, | |||
| void** data); | |||
| /** | |||
| * \brief get the tensor capacity in byte of a Tensor object. | |||
| * \param[in] tensor The input Tensor | |||
| * \param[out] size_ptr a pointer to the return size | |||
| */ | |||
| LITE_API int LITE_get_tensor_total_size_in_byte(const LiteTensor tensor, | |||
| size_t* size); | |||
| /** | |||
| * \brief get the tensor layout of a Tensor object. | |||
| * \param[in] tensor The input Tensor | |||
| * \param[out] layout_ptr a pointer will be write with the layout of the tensor | |||
| */ | |||
| LITE_API int LITE_get_tensor_layout(const LiteTensor tensor, | |||
| LiteLayout* layout); | |||
| /** | |||
| * \brief get the tensor device of a Tensor object. | |||
| * \param[in] tensor The input Tensor | |||
| * \param[out] device_ptr a pointer will be write with the device of the tensor | |||
| */ | |||
| LITE_API int LITE_get_tensor_device_type(const LiteTensor tensor, | |||
| LiteDeviceType* device_type); | |||
| /** | |||
| * \brief get the tensor device id of a Tensor object. | |||
| * \param[in] tensor The input Tensor | |||
| * \param[out] device_id a pointer will be write with the device id of the | |||
| * tensor | |||
| */ | |||
| LITE_API int LITE_get_tensor_device_id(const LiteTensor tensor, int* device_id); | |||
| /** | |||
| * \brief whether the tensor memory is pinned host memory. | |||
| * \param[in] tensor The input Tensor | |||
| * \param[out] is_pinned_host an int pointer which will be written with whether | |||
| * the tensor is pinned host | |||
| */ | |||
| LITE_API int LITE_is_pinned_host(const LiteTensor tensor, int* is_pinned_host); | |||
| /** | |||
| * \brief whether the tensor memory is contiguous. | |||
| * \param[in] tensor The input Tensor | |||
| * \param[out] is_continue an int pointer which will be written with whether the | |||
| * tensor memory is contiguous | |||
| */ | |||
| LITE_API int LITE_is_memory_continue(const LiteTensor tensor, int* is_continue); | |||
| /** | |||
| * \brief concat the inputs tensor to one big tensor | |||
| * \param[in] tensors ptr The input Tensors | |||
| * \param[in] nr_tensors number input Tensor | |||
| * \param[in] dim the dim concat act on | |||
| * \param[in] dst_device the device type of result tensor, when | |||
| * LITE_DEVICE_DEFAULT, the result tensor device type will get from the first | |||
| * tensor | |||
| * \param[in] device_id the device id of result tensor, when -1, the result | |||
| * tensor device id will get from the first tensor | |||
| * \param[out] result_tensor the result tensor after concat | |||
| */ | |||
| LITE_API int LITE_tensor_concat(LiteTensor* tensors, int nr_tensor, int dim, | |||
| LiteDeviceType dst_device, int device_id, | |||
| LiteTensor* result_tensor); | |||
| #ifdef __cplusplus | |||
| } | |||
| #endif | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,73 @@ | |||
| /** | |||
| * \file lite-c/src/common.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #ifndef LITE_C_COMMON_H_ | |||
| #define LITE_C_COMMON_H_ | |||
| #include "../src/misc.h" | |||
| #include "lite-c/network_c.h" | |||
| #include "lite-c/tensor_c.h" | |||
| #include "lite/network.h" | |||
| #include <exception> | |||
| #include <stdexcept> | |||
| //! convert c Layout to lite::Layout | |||
| lite::Layout convert_to_layout(const LiteLayout& layout); | |||
| //! convert lite::Layout to C Layout | |||
| LiteLayout convert_to_clayout(const lite::Layout& layout); | |||
| //! convert c config to lite::config | |||
| lite::Config convert_to_lite_config(const LiteConfig c_config); | |||
| //! convert C NetworkIO io to lite::NetworkIO | |||
| lite::NetworkIO convert_to_lite_io(const LiteNetworkIO c_network_io); | |||
| /*! | |||
| * \brief handle exception | |||
| * \param e the exception | |||
| * \return the return value of the error | |||
| */ | |||
| int LiteHandleException(const std::exception& e); | |||
| #if LITE_ENABLE_EXCEPTION | |||
| /*! \brief macro to guard a function */ | |||
| #define LITE_CAPI_BEGIN() try { | |||
| /*! \brief every function starts with LITE_CAPI_BEGIN(); | |||
| * ends with LITE_CAPI_END or LITE_CAPI_END_WITH_STMS | |||
| */ | |||
| #define LITE_CAPI_END() \ | |||
| } \ | |||
| catch (std::exception & _except_) { \ | |||
| return LiteHandleException(_except_); \ | |||
| } \ | |||
| return 0; | |||
| #else | |||
| /*! \brief macro to guard a function */ | |||
| #define LITE_CAPI_BEGIN() { | |||
| /*! \brief every function starts with LITE_CAPI_BEGIN(); | |||
| * ends with LITE_CAPI_END or LITE_CAPI_END_WITH_STMS | |||
| */ | |||
| #define LITE_CAPI_END() \ | |||
| } \ | |||
| return 0; | |||
| #endif | |||
| /*! | |||
| * \brief catch the exception with stms | |||
| */ | |||
| #define LITE_CAPI_END_WITH_STMS(_stms) \ | |||
| } \ | |||
| catch (std::exception & _except_) { \ | |||
| _stms; \ | |||
| return LiteHandleException(_except_); \ | |||
| } \ | |||
| return 0; | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
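/*
 * A minimal sketch of how the guard macros above are intended to be used
 * when implementing a C entry point.  LITE_example_api and do_something are
 * hypothetical names; any exception thrown inside the guarded body is turned
 * into a negative return code and its message is cached for
 * LITE_get_last_error.
 */
#if 0  /* illustrative sketch, not part of the original sources */
int LITE_example_api(int* out_value) {
    LITE_CAPI_BEGIN();
    LITE_ASSERT(out_value, "The ptr pass to LITE api is null");
    *out_value = do_something();  /* may throw */
    LITE_CAPI_END();
}
#endif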
| @@ -0,0 +1,192 @@ | |||
| /** | |||
* \file lite-c/src/global.cpp
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "lite/global.h" | |||
| #include "common.h" | |||
| #include "lite-c/global_c.h" | |||
| #include <exception> | |||
| #include <mutex> | |||
| namespace { | |||
| class ErrorMsg { | |||
| public: | |||
| std::string& get_error_msg() { return error_msg; } | |||
| void set_error_msg(const std::string& msg) { error_msg = msg; } | |||
| private: | |||
| std::string error_msg; | |||
| }; | |||
| ErrorMsg& get_global_error() { | |||
| static thread_local ErrorMsg error_msg; | |||
| return error_msg; | |||
| } | |||
| } // namespace | |||
| int LiteHandleException(const std::exception& e) { | |||
| get_global_error().set_error_msg(e.what()); | |||
| return -1; | |||
| } | |||
| const char* LITE_get_last_error() { | |||
| return get_global_error().get_error_msg().c_str(); | |||
| } | |||
| int LITE_get_version(int* major, int* minor, int* patch) { | |||
| LITE_ASSERT(major && minor && patch, "The ptr pass to LITE api is null"); | |||
| lite::get_version(*major, *minor, *patch); | |||
| return 0; | |||
| } | |||
| int LITE_get_device_count(LiteDeviceType device_type, size_t* count) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(count, "The ptr pass to LITE api is null"); | |||
| *count = lite::get_device_count(device_type); | |||
| LITE_CAPI_END(); | |||
| } | |||
int LITE_try_coalesce_all_free_memory() {
| LITE_CAPI_BEGIN(); | |||
| lite::try_coalesce_all_free_memory(); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_register_decryption_and_key(const char* decrypt_name, | |||
| const LiteDecryptionFunc func, | |||
| const uint8_t* key_data, size_t key_size) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(decrypt_name && key_data && func, | |||
| "The ptr pass to LITE api is null"); | |||
| std::vector<uint8_t> key; | |||
| for (size_t i = 0; i < key_size; i++) { | |||
| key.push_back(key_data[i]); | |||
| } | |||
| auto decrypt_func = [func](const void* input_data, size_t input_size, | |||
| const std::vector<uint8_t>& key) { | |||
| auto size = | |||
| func(input_data, input_size, key.data(), key.size(), nullptr); | |||
| std::vector<uint8_t> output(size, 0); | |||
| func(input_data, input_size, key.data(), key.size(), output.data()); | |||
| return output; | |||
| }; | |||
| lite::register_decryption_and_key(decrypt_name, decrypt_func, key); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_update_decryption_or_key(const char* decrypt_name, | |||
| const LiteDecryptionFunc func, | |||
| const uint8_t* key_data, size_t key_size) { | |||
| LITE_CAPI_BEGIN(); | |||
| std::vector<uint8_t> key; | |||
| for (size_t i = 0; i < key_size; i++) { | |||
| key.push_back(key_data[i]); | |||
| } | |||
| lite::DecryptionFunc decrypt_func = nullptr; | |||
| if (func) { | |||
| decrypt_func = [func](const void* input_data, size_t input_size, | |||
| const std::vector<uint8_t>& key) { | |||
| auto size = func(input_data, input_size, key.data(), key.size(), | |||
| nullptr); | |||
| std::vector<uint8_t> output(size, 0); | |||
| func(input_data, input_size, key.data(), key.size(), output.data()); | |||
| return output; | |||
| }; | |||
| } | |||
| lite::update_decryption_or_key(decrypt_name, decrypt_func, key); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_register_parse_info_func(const char* info_type, | |||
| const LiteParseInfoFunc parse_func) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(info_type && parse_func, "The ptr pass to LITE api is null"); | |||
| auto lite_func = [parse_func]( | |||
| const void* info_data, size_t info_size, | |||
| const std::string model_name, lite::Config& config, | |||
| lite::NetworkIO& network_io, | |||
| std::unordered_map<std::string, lite::LiteAny>& | |||
| separate_config_map, | |||
| std::string& extra_info) { | |||
| LITE_MARK_USED_VAR(extra_info); | |||
| size_t nr_threads = 1; | |||
| int device_id = 0, is_cpu_inplace_mode = false, use_tensorrt = false; | |||
| LiteNetworkIO c_io; | |||
| LiteConfig c_config; | |||
| auto ret = parse_func(info_data, info_size, model_name.c_str(), | |||
| &c_config, &c_io, &device_id, &nr_threads, | |||
| &is_cpu_inplace_mode, &use_tensorrt); | |||
| config = convert_to_lite_config(c_config); | |||
| network_io = convert_to_lite_io(c_io); | |||
| if (device_id != 0) { | |||
| separate_config_map["device_id"] = device_id; | |||
| } | |||
| if (nr_threads != 1) { | |||
| separate_config_map["nr_threads"] = nr_threads; | |||
| } | |||
| if (is_cpu_inplace_mode != false) { | |||
| separate_config_map["is_inplace_mode"] = is_cpu_inplace_mode; | |||
| } | |||
| if (use_tensorrt != false) { | |||
| separate_config_map["use_tensorrt"] = use_tensorrt; | |||
| } | |||
| return ret; | |||
| }; | |||
| lite::register_parse_info_func(info_type, lite_func); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_set_loader_lib_path(const char* loader_path) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(loader_path, "The ptr pass to LITE api is null"); | |||
| lite::set_loader_lib_path(loader_path); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_set_persistent_cache(const char* cache_path, int always_sync) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(cache_path, "The ptr pass to LITE api is null"); | |||
| lite::set_persistent_cache(cache_path, always_sync); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_set_tensor_rt_cache(const char* cache_path) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(cache_path, "The ptr pass to LITE api is null"); | |||
| lite::set_tensor_rt_cache(cache_path); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_set_log_level(LiteLogLevel level) { | |||
| LITE_CAPI_BEGIN(); | |||
| lite::set_log_level(level); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_get_log_level(LiteLogLevel* level) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(level, "The ptr pass to LITE api is null"); | |||
| *level = lite::get_log_level(); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_dump_persistent_cache(const char* cache_path) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(cache_path, "The ptr pass to LITE api is null"); | |||
| lite::dump_persistent_cache(cache_path); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_dump_tensor_rt_cache() { | |||
| LITE_CAPI_BEGIN(); | |||
| lite::dump_tensor_rt_cache(); | |||
| LITE_CAPI_END(); | |||
| } | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
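/*
 * Caller-side error handling sketch for the C API above: each function
 * returns 0 on success; on failure the exception message is stored in a
 * thread-local buffer and can be read back with LITE_get_last_error().
 * The cache path below is a placeholder.
 */
#if 0  /* illustrative sketch, not part of the original sources */
if (LITE_set_persistent_cache("algo_cache.bin", 0) != 0) {
    fprintf(stderr, "lite error: %s\n", LITE_get_last_error());
}
#endif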
| @@ -0,0 +1,580 @@ | |||
| /** | |||
| * \file lite-c/src/network.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "lite/network.h" | |||
| #include "common.h" | |||
| #include "lite-c/network_c.h" | |||
| #include "../../src/network_impl_base.h" | |||
| #include <memory> | |||
| #include <mutex> | |||
| #include <unordered_map> | |||
| #include <string.h> | |||
| //! define a default Options | |||
| const LiteOptions default_option = { | |||
| .weight_preprocess = false, | |||
| .fuse_preprocess = false, | |||
| .fake_next_exec = false, | |||
| .var_sanity_check_first_run = true, | |||
| .const_shape = false, | |||
| .force_dynamic_alloc = false, | |||
| .force_output_dynamic_alloc = false, | |||
| .no_profiling_on_shape_change = false, | |||
| .jit_level = 0, | |||
| .comp_node_seq_record_level = 0, | |||
| .graph_opt_level = 2, | |||
| .async_exec_level = 1, | |||
| //! layout transform options | |||
| .enable_nchw44 = 0, | |||
| .enable_nchw44_dot = 0, | |||
| .enable_nchw88 = 0, | |||
| .enable_nhwcd4 = 0, | |||
| .enable_nchw4 = 0, | |||
| .enable_nchw32 = 0, | |||
| .enable_nchw64 = 0, | |||
| }; | |||
| //! define a default config | |||
| LiteConfig default_config_t = {.has_compression = false, | |||
| .device_id = -1, | |||
| .device_type = LiteDeviceType::LITE_CPU, | |||
| .backend = LiteBackend::LITE_DEFAULT, | |||
| .bare_model_cryption_name = nullptr, | |||
| .options = default_option}; | |||
| LiteConfig* default_config() { | |||
| return &default_config_t; | |||
| } | |||
| //! define a default IO | |||
| const LiteIO default_io = {.name = nullptr, | |||
| .is_host = true, | |||
| .io_type = LiteIOType::LITE_IO_VALUE, | |||
| .config_layout = default_layout}; | |||
| //! define a default NetworkIO | |||
| LiteNetworkIO default_network_io_t = {.inputs = nullptr, | |||
| .outputs = nullptr, | |||
| .input_size = 0, | |||
| .output_size = 0}; | |||
| LiteNetworkIO* default_network_io() { | |||
| return &default_network_io_t; | |||
| } | |||
| namespace { | |||
std::unordered_map<void*, std::shared_ptr<lite::Network>>&
get_global_network_holder() {
| static thread_local std::unordered_map<void*, | |||
| std::shared_ptr<lite::Network>> | |||
| network_holder; | |||
| return network_holder; | |||
| } | |||
| /*! | |||
| * \brief A user-implemented allocator interface | |||
| */ | |||
| class UserAllocator : public lite::Allocator { | |||
| public: | |||
| UserAllocator(LiteAllocate allocate_func, LiteFree free_func) | |||
| : m_allocator(allocate_func), m_free(free_func) { | |||
| LITE_ASSERT(m_allocator && m_free); | |||
| } | |||
| //! allocate memory of size in the given device with the given align | |||
| void* allocate(LiteDeviceType device_type, int device_id, size_t size, | |||
| size_t align) override { | |||
| return m_allocator(device_type, device_id, size, align); | |||
| } | |||
| //! free the memory pointed by ptr in the given device | |||
| void free(LiteDeviceType device_type, int device_id, void* ptr) override { | |||
| m_free(device_type, device_id, ptr); | |||
| } | |||
| private: | |||
| LiteAllocate m_allocator; | |||
| LiteFree m_free; | |||
| }; | |||
| } // namespace | |||
| //! convert c config to lite::config | |||
| lite::Config convert_to_lite_config(const LiteConfig c_config) { | |||
| lite::Config lite_config; | |||
| lite_config.device_type = c_config.device_type; | |||
| if (c_config.bare_model_cryption_name) { | |||
| lite_config.bare_model_cryption_name = | |||
| c_config.bare_model_cryption_name; | |||
| } | |||
| lite_config.backend = c_config.backend; | |||
| lite_config.has_compression = c_config.has_compression; | |||
| lite_config.device_id = c_config.device_id; | |||
| lite_config.options.weight_preprocess = c_config.options.weight_preprocess; | |||
| lite_config.options.fuse_preprocess = c_config.options.fuse_preprocess; | |||
| lite_config.options.fake_next_exec = c_config.options.fake_next_exec; | |||
| lite_config.options.var_sanity_check_first_run = | |||
| c_config.options.var_sanity_check_first_run; | |||
| lite_config.options.const_shape = c_config.options.const_shape; | |||
lite_config.options.force_dynamic_alloc = c_config.options.force_dynamic_alloc;
| lite_config.options.force_output_dynamic_alloc = | |||
| c_config.options.force_output_dynamic_alloc; | |||
| lite_config.options.no_profiling_on_shape_change = | |||
| c_config.options.no_profiling_on_shape_change; | |||
| lite_config.options.jit_level = c_config.options.jit_level; | |||
| lite_config.options.comp_node_seq_record_level = | |||
| c_config.options.comp_node_seq_record_level; | |||
| lite_config.options.graph_opt_level = c_config.options.graph_opt_level; | |||
| lite_config.options.async_exec_level = c_config.options.async_exec_level; | |||
| lite_config.options.enable_nchw44 = c_config.options.enable_nchw44; | |||
| lite_config.options.enable_nchw44_dot = c_config.options.enable_nchw44_dot; | |||
| lite_config.options.enable_nchw88 = c_config.options.enable_nchw88; | |||
| lite_config.options.enable_nchw4 = c_config.options.enable_nchw4; | |||
| lite_config.options.enable_nhwcd4 = c_config.options.enable_nhwcd4; | |||
| lite_config.options.enable_nchw32 = c_config.options.enable_nchw32; | |||
| lite_config.options.enable_nchw64 = c_config.options.enable_nchw64; | |||
| return lite_config; | |||
| } | |||
| //! convert C NetworkIO io to lite::NetworkIO | |||
| lite::NetworkIO convert_to_lite_io(const LiteNetworkIO c_network_io) { | |||
| lite::NetworkIO network_io; | |||
| for (size_t i = 0; i < c_network_io.input_size; i++) { | |||
| LiteIO* c_io = c_network_io.inputs + i; | |||
LITE_ASSERT(c_io->name, "input name of io tensor must be set.");
| network_io.inputs.push_back( | |||
| {c_io->name, static_cast<bool>(c_io->is_host), c_io->io_type, | |||
| convert_to_layout(c_io->config_layout)}); | |||
| } | |||
| for (size_t i = 0; i < c_network_io.output_size; i++) { | |||
| LiteIO* c_io = c_network_io.outputs + i; | |||
LITE_ASSERT(c_io->name, "output name of io tensor must be set.");
| network_io.outputs.push_back( | |||
| {c_io->name, static_cast<bool>(c_io->is_host), c_io->io_type, | |||
| convert_to_layout(c_io->config_layout)}); | |||
| } | |||
| return network_io; | |||
| } | |||
| int LITE_make_default_network(LiteNetwork* network) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
| auto lite_network = std::make_shared<lite::Network>(); | |||
get_global_network_holder()[lite_network.get()] = lite_network;
| *network = lite_network.get(); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_make_network(LiteNetwork* network, const LiteConfig config, | |||
| const LiteNetworkIO network_io) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
| auto lite_network = std::make_shared<lite::Network>( | |||
| convert_to_lite_config(config), convert_to_lite_io(network_io)); | |||
get_global_network_holder()[lite_network.get()] = lite_network;
| *network = lite_network.get(); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_make_network_config(LiteNetwork* network, const LiteConfig config) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
| auto lite_network = | |||
| std::make_shared<lite::Network>(convert_to_lite_config(config)); | |||
get_global_network_holder()[lite_network.get()] = lite_network;
| *network = lite_network.get(); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_load_model_from_mem(LiteNetwork network, void* model_mem, | |||
| size_t size) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
| LITE_ASSERT(model_mem, "The model memory pass to LITE api is null"); | |||
| static_cast<lite::Network*>(network)->load_model(model_mem, size); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_load_model_from_path(LiteNetwork network, const char* model_path) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
| LITE_ASSERT(model_path, "The model path pass to LITE api is null"); | |||
| static_cast<lite::Network*>(network)->load_model(model_path); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_destroy_network(LiteNetwork network) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
get_global_network_holder().erase(network);
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_forward(const LiteNetwork network) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
| static_cast<lite::Network*>(network)->forward(); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_wait(const LiteNetwork network) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
| static_cast<lite::Network*>(network)->wait(); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_get_io_tensor(LiteNetwork network, const char* io_name, | |||
| LiteTensorPhase phase, LiteTensor* tensor) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
| auto io_tensor = | |||
| static_cast<lite::Network*>(network)->get_io_tensor(io_name, phase); | |||
| *tensor = io_tensor.get(); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_get_input_name(const LiteNetwork network, size_t index, | |||
| const char** name) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network && name, "The network pass to LITE api is null"); | |||
| *name = lite::NetworkHelper::implement(static_cast<lite::Network*>(network)) | |||
| ->get_input_name(index); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_get_output_name(const LiteNetwork network, size_t index, | |||
| const char** name) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
| LITE_ASSERT(name, "The name ptr pass to LITE api is null"); | |||
| *name = lite::NetworkHelper::implement(static_cast<lite::Network*>(network)) | |||
| ->get_output_name(index); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_get_all_input_name(const LiteNetwork network, size_t* size, | |||
| const char** name) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
| auto&& names = | |||
| lite::NetworkHelper::implement(static_cast<lite::Network*>(network)) | |||
| ->get_all_input_name(); | |||
| if (size) | |||
| *size = names.size(); | |||
| if (name) { | |||
| for (auto in_name : names) { | |||
| *name = in_name; | |||
| name++; | |||
| } | |||
| } | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_get_all_output_name(const LiteNetwork network, size_t* size, | |||
| const char** name) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
| auto&& names = | |||
| lite::NetworkHelper::implement(static_cast<lite::Network*>(network)) | |||
| ->get_all_output_name(); | |||
| if (size) | |||
| *size = names.size(); | |||
| if (name) { | |||
| for (auto in_name : names) { | |||
| *name = in_name; | |||
| name++; | |||
| } | |||
| } | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_set_device_id(LiteNetwork network, int device_id) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
| static_cast<lite::Network*>(network)->set_device_id(device_id); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_get_device_id(const LiteNetwork network, int* device_id) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
| LITE_ASSERT(device_id, "The device_id pass to LITE api is null"); | |||
| *device_id = static_cast<lite::Network*>(network)->get_device_id(); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_set_stream_id(LiteNetwork network, int stream_id) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
| static_cast<lite::Network*>(network)->set_stream_id(stream_id); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_get_stream_id(const LiteNetwork network, int* stream_id) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
| LITE_ASSERT(stream_id, "The stream_id pass to LITE api is null"); | |||
| *stream_id = static_cast<lite::Network*>(network)->get_stream_id(); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_get_model_extra_info(const LiteNetwork network, const char** info, | |||
| int* info_size) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
LITE_ASSERT(info && info_size, "The info or info_size pass to LITE api is null");
| auto& extra_info = | |||
| static_cast<lite::Network*>(network)->get_model_extra_info(); | |||
| *info_size = extra_info.size(); | |||
| *info = extra_info.c_str(); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_get_device_type(const LiteNetwork network, | |||
| LiteDeviceType* device_type) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
| LITE_ASSERT(device_type, "The device_type pass to LITE api is null"); | |||
| *device_type = static_cast<lite::Network*>(network)->get_device_type(); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_set_async_callback(LiteNetwork network, | |||
| const LiteAsyncCallback async_callback) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
| LITE_ASSERT(async_callback, "The ptr pass to LITE api is null"); | |||
| static_cast<lite::Network*>(network)->set_async_callback( | |||
| std::move(async_callback)); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_set_start_callback(LiteNetwork network, | |||
| const LiteStartCallback start_callback) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
| auto lite_start_callback = | |||
| [start_callback]( | |||
| const std::unordered_map< | |||
| std::string, | |||
| std::pair<lite::IO, std::shared_ptr<lite::Tensor>>>& | |||
| inputs_map) -> void { | |||
| std::vector<LiteIO> ios; | |||
| std::vector<LiteTensor> io_tensors; | |||
| size_t nr_io = 0; | |||
| for (const auto& io : inputs_map) { | |||
| nr_io++; | |||
| auto&& lite_io = io.second.first; | |||
| ios.push_back({lite_io.name.c_str(), lite_io.is_host, | |||
| lite_io.io_type, | |||
| convert_to_clayout(lite_io.config_layout)}); | |||
| io_tensors.push_back(io.second.second.get()); | |||
| } | |||
| start_callback(ios.data(), io_tensors.data(), nr_io); | |||
| }; | |||
| static_cast<lite::Network*>(network)->set_start_callback( | |||
| lite_start_callback); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_set_finish_callback(LiteNetwork network, | |||
| const LiteFinishCallback finish_callback) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
| auto lite_finish_callback = | |||
| [finish_callback]( | |||
| const std::unordered_map< | |||
| std::string, | |||
| std::pair<lite::IO, std::shared_ptr<lite::Tensor>>>& | |||
| outputs_map) -> void { | |||
| std::vector<LiteIO> ios; | |||
| std::vector<LiteTensor> io_tensors; | |||
| size_t nr_io = 0; | |||
| for (const auto& io : outputs_map) { | |||
| nr_io++; | |||
| auto&& lite_io = io.second.first; | |||
| ios.push_back({lite_io.name.c_str(), lite_io.is_host, | |||
| lite_io.io_type, | |||
| convert_to_clayout(lite_io.config_layout)}); | |||
| io_tensors.push_back(io.second.second.get()); | |||
| } | |||
| finish_callback(ios.data(), io_tensors.data(), nr_io); | |||
| }; | |||
| static_cast<lite::Network*>(network)->set_finish_callback( | |||
| lite_finish_callback); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_enable_profile_performance(LiteNetwork network, | |||
| const char* profile_json_file_path) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
| static_cast<lite::Network*>(network)->enable_profile_performance( | |||
| profile_json_file_path); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_is_cpu_inplace_mode(const LiteNetwork network, | |||
| int* is_cpu_inplace_mode) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network && is_cpu_inplace_mode, | |||
| "The network pass to LITE api is null"); | |||
| std::shared_ptr<lite::Network> network_shared{ | |||
| static_cast<lite::Network*>(network), [](void*) {}}; | |||
| *is_cpu_inplace_mode = lite::Runtime::is_cpu_inplace_mode(network_shared); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_get_cpu_threads_number(const LiteNetwork network, size_t* nr_threads) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
| LITE_ASSERT(nr_threads, "The ptr pass to LITE api is null"); | |||
| std::shared_ptr<lite::Network> network_shared{ | |||
| static_cast<lite::Network*>(network), [](void*) {}}; | |||
| *nr_threads = lite::Runtime::get_cpu_threads_number(network_shared); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_set_cpu_inplace_mode(LiteNetwork network) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
| std::shared_ptr<lite::Network> network_shared{ | |||
| static_cast<lite::Network*>(network), [](void*) {}}; | |||
| lite::Runtime::set_cpu_inplace_mode(network_shared); | |||
| LITE_CAPI_END(); | |||
| } | |||
int LITE_use_tensorrt(LiteNetwork network) {
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
| std::shared_ptr<lite::Network> network_shared{ | |||
| static_cast<lite::Network*>(network), [](void*) {}}; | |||
| lite::Runtime::use_tensorrt(network_shared); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_set_cpu_threads_number(LiteNetwork network, size_t nr_threads) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
| std::shared_ptr<lite::Network> network_shared{ | |||
| static_cast<lite::Network*>(network), [](void*) {}}; | |||
| lite::Runtime::set_cpu_threads_number(network_shared, nr_threads); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_set_network_algo_policy(LiteNetwork network, | |||
| LiteAlgoSelectStrategy strategy) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
| std::shared_ptr<lite::Network> network_shared{ | |||
| static_cast<lite::Network*>(network), [](void*) {}}; | |||
| lite::Runtime::set_network_algo_policy(network_shared, strategy); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_set_network_algo_fastrun_config(LiteNetwork network, | |||
| unsigned int shared_batch_size, | |||
| int binary_equal_between_batch) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
| std::shared_ptr<lite::Network> network_shared{ | |||
| static_cast<lite::Network*>(network), [](void*) {}}; | |||
| lite::Runtime::set_network_algo_policy( | |||
| network_shared, LiteAlgoSelectStrategy(0), shared_batch_size, | |||
| binary_equal_between_batch); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_set_network_algo_workspace_limit(LiteNetwork network, | |||
| size_t workspace_limit) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
| std::shared_ptr<lite::Network> network_shared{ | |||
| static_cast<lite::Network*>(network), [](void*) {}}; | |||
| lite::Runtime::set_network_algo_workspace_limit(network_shared, | |||
| workspace_limit); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_set_runtime_thread_affinity( | |||
| LiteNetwork network, | |||
| const LiteThreadAffinityCallback thread_affinity_callback) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
| std::shared_ptr<lite::Network> network_shared{ | |||
| static_cast<lite::Network*>(network), [](void*) {}}; | |||
| lite::Runtime::set_runtime_thread_affinity( | |||
| network_shared, std::move(thread_affinity_callback)); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_set_memory_allocator(LiteNetwork network, | |||
| const LiteAllocate allocate_fun, | |||
| const LiteFree free_fun) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network && allocate_fun && free_fun, | |||
| "The ptr pass to LITE api is null"); | |||
| std::shared_ptr<lite::Network> network_shared{ | |||
| static_cast<lite::Network*>(network), [](void*) {}}; | |||
| lite::Runtime::set_memory_allocator( | |||
| network_shared, | |||
| std::make_shared<UserAllocator>(allocate_fun, free_fun)); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_enable_io_txt_dump(LiteNetwork network, const char* io_txt_out_file) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
| std::shared_ptr<lite::Network> network_shared{ | |||
| static_cast<lite::Network*>(network), [](void*) {}}; | |||
| lite::Runtime::enable_io_txt_dump(network_shared, io_txt_out_file); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_enable_io_bin_dump(LiteNetwork network, const char* io_bin_out_dir) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
| std::shared_ptr<lite::Network> network_shared{ | |||
| static_cast<lite::Network*>(network), [](void*) {}}; | |||
| lite::Runtime::enable_io_bin_dump(network_shared, io_bin_out_dir); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_shared_weight_with_network(LiteNetwork dst_network, | |||
| const LiteNetwork src_network) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(dst_network && src_network, | |||
| "The network pass to LITE api is null"); | |||
| const std::shared_ptr<lite::Network> src_shared_net{ | |||
| static_cast<lite::Network*>(src_network), [](void*) {}}; | |||
| std::shared_ptr<lite::Network> dst_shared_net{ | |||
| static_cast<lite::Network*>(dst_network), [](void*) {}}; | |||
| lite::Runtime::shared_weight_with_network(dst_shared_net, src_shared_net); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_share_runtime_memroy(LiteNetwork dst_network, | |||
| LiteNetwork src_network) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(src_network && dst_network, | |||
| "The network pass to LITE api is null"); | |||
| std::shared_ptr<lite::Network> src_shared{ | |||
| static_cast<lite::Network*>(src_network), [](void*) {}}; | |||
| std::shared_ptr<lite::Network> dst_shared{ | |||
| static_cast<lite::Network*>(dst_network), [](void*) {}}; | |||
| lite::Runtime::share_runtime_memory_with(dst_shared, src_shared); | |||
| LITE_CAPI_END(); | |||
| } | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
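/*
 * A typical end-to-end flow for the network C API above, assuming an
 * unencrypted model file and a single host input.  The model path, the input
 * name "data", the phase enumerator and the input buffer are placeholders;
 * default_config() and default_network_io() are assumed to be declared in
 * network_c.h.
 */
#if 0  /* illustrative sketch, not part of the original sources */
LiteNetwork net = NULL;
LITE_make_network(&net, *default_config(), *default_network_io());
LITE_load_model_from_path(net, "shufflenet.mge");

LiteTensor input = NULL;
LITE_get_io_tensor(net, "data", LITE_IO /* placeholder phase */, &input);
LITE_reset_tensor_memory(input, input_data, input_size_in_byte);

LITE_forward(net);
LITE_wait(net);
LITE_destroy_network(net);
#endif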
| @@ -0,0 +1,257 @@ | |||
| /** | |||
| * \file lite-c/src/tensor.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "lite/tensor.h" | |||
| #include "../../src/tensor_impl_base.h" | |||
| #include "common.h" | |||
| #include "lite-c/tensor_c.h" | |||
| #include <set> | |||
| #include <string> | |||
| #include <unordered_map> | |||
| const LiteLayout default_layout = {.shapes = {0, 0, 0, 0, 0}, | |||
| .ndim = 0, | |||
| .data_type = LiteDataType::LITE_FLOAT}; | |||
| const LiteTensorDesc default_desc = {.is_pinned_host = false, | |||
| .layout = default_layout, | |||
| .device_type = LiteDeviceType::LITE_CPU, | |||
| .device_id = 0}; | |||
| namespace { | |||
| std::unordered_map<void*, std::shared_ptr<lite::Tensor>>& | |||
| get_global_tensor_holder() { | |||
| static thread_local std::unordered_map<void*, std::shared_ptr<lite::Tensor>> | |||
| global_holder; | |||
| return global_holder; | |||
| } | |||
| std::unordered_map<std::string, lite::LiteAny>& | |||
| get_global_tensor_attr_holder() { | |||
| static thread_local std::unordered_map<std::string, lite::LiteAny> | |||
| global_holder; | |||
| return global_holder; | |||
| } | |||
| } // namespace | |||
| //! convert the lite::Layout to Layout | |||
| LiteLayout convert_to_clayout(const lite::Layout& layout) { | |||
| LiteLayout clayout; | |||
| clayout.ndim = layout.ndim; | |||
LITE_ASSERT(layout.ndim < LAYOUT_MAX_DIM, "layout ndim is too large");
| for (size_t i = 0; i < layout.ndim; i++) { | |||
| clayout.shapes[i] = layout.shapes[i]; | |||
| } | |||
| clayout.data_type = layout.data_type; | |||
| return clayout; | |||
| } | |||
| //! convert the C Layout to lite::Layout | |||
| lite::Layout convert_to_layout(const LiteLayout& clayout) { | |||
| lite::Layout layout; | |||
| layout.ndim = clayout.ndim; | |||
LITE_ASSERT(layout.ndim < LAYOUT_MAX_DIM, "clayout ndim is too large");
| for (size_t i = 0; i < layout.ndim; i++) { | |||
| layout.shapes[i] = clayout.shapes[i]; | |||
| } | |||
| layout.data_type = clayout.data_type; | |||
| return layout; | |||
| } | |||
| int LITE_make_tensor(const LiteTensorDesc tensor_describe, LiteTensor* tensor) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(tensor, "The tensor pass to LITE_make_tensor is null"); | |||
| lite::Layout layout = convert_to_layout(tensor_describe.layout); | |||
| auto lite_tensor = std::make_shared<lite::Tensor>( | |||
| tensor_describe.device_id, tensor_describe.device_type, layout, | |||
| tensor_describe.is_pinned_host); | |||
| get_global_tensor_holder()[lite_tensor.get()] = lite_tensor; | |||
| *tensor = lite_tensor.get(); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_destroy_tensor(LiteTensor tensor) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); | |||
| get_global_tensor_holder().erase(tensor); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_set_tensor_layout(LiteTensor tensor, const LiteLayout layout) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); | |||
| auto tensor_ptr = static_cast<lite::Tensor*>(tensor); | |||
| tensor_ptr->set_layout(convert_to_layout(layout)); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_reset_tensor_memory(LiteTensor tensor, void* prepared_data, | |||
| size_t data_length_in_byte) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); | |||
| LITE_ASSERT(prepared_data, "The prepared_data pass to LITE c_api is null"); | |||
| static_cast<lite::Tensor*>(tensor)->reset(prepared_data, | |||
| data_length_in_byte); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_reset_tensor(LiteTensor tensor, const LiteLayout layout, | |||
| void* prepared_data) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); | |||
| LITE_ASSERT(prepared_data, "The prepared_data pass to LITE c_api is null"); | |||
| static_cast<lite::Tensor*>(tensor)->reset(prepared_data, | |||
| convert_to_layout(layout)); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_tensor_reshape(LiteTensor tensor, const int* shape, int size) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(tensor && shape, "The tensor pass to LITE c_api is null"); | |||
| std::vector<int> shapes; | |||
| for (int i = 0; i < size; i++) { | |||
| shapes.push_back(shape[i]); | |||
| } | |||
| static_cast<lite::Tensor*>(tensor)->reshape(shapes); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_tensor_slice(const LiteTensor tensor, const size_t* start, | |||
| const size_t* end, const size_t* step, size_t size, | |||
| LiteTensor* slice_tensor) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(tensor && start && end && slice_tensor, | |||
| "The tensor pass to LITE c_api is null"); | |||
| std::vector<size_t> starts, ends, steps; | |||
| for (size_t i = 0; i < size; i++) { | |||
| starts.push_back(start[i]); | |||
| ends.push_back(end[i]); | |||
| if (step) { | |||
| steps.push_back(step[i]); | |||
| } | |||
| } | |||
| auto ret_tensor = | |||
| static_cast<lite::Tensor*>(tensor)->slice(starts, ends, steps); | |||
| get_global_tensor_holder()[ret_tensor.get()] = ret_tensor; | |||
| *slice_tensor = ret_tensor.get(); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_tensor_fill_zero(LiteTensor tensor) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); | |||
| static_cast<lite::Tensor*>(tensor)->fill_zero(); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_tensor_copy(LiteTensor dst_tensor, const LiteTensor src_tensor) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(dst_tensor && src_tensor, | |||
| "The tensor pass to LITE c_api is null"); | |||
| static_cast<lite::Tensor*>(dst_tensor) | |||
| ->copy_from(*static_cast<lite::Tensor*>(src_tensor)); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_tensor_share_memory_with(LiteTensor dst_tensor, | |||
| const LiteTensor src_tensor) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(dst_tensor && src_tensor, | |||
| "The tensor pass to LITE c_api is null"); | |||
| static_cast<lite::Tensor*>(dst_tensor) | |||
| ->share_memory_with(*static_cast<lite::Tensor*>(src_tensor)); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_get_tensor_memory(const LiteTensor tensor, void** data) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); | |||
| LITE_ASSERT(data, "The data ptr pass to LITE c_api is null"); | |||
| *data = static_cast<lite::Tensor*>(tensor)->get_memory_ptr(); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_get_tensor_memory_with_index(const LiteTensor tensor, | |||
| const size_t* index, size_t size, | |||
| void** data) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(tensor && index && data, | |||
| "The tensor pass to LITE c_api is null"); | |||
| std::vector<size_t> index_v; | |||
| for (size_t i = 0; i < size; i++) { | |||
| index_v.push_back(index[i]); | |||
| } | |||
| *data = static_cast<lite::Tensor*>(tensor)->get_memory_ptr(index_v); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_get_tensor_total_size_in_byte(const LiteTensor tensor, size_t* size) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); | |||
| LITE_ASSERT(size, "The size ptr pass to LITE c_api is null"); | |||
| *size = static_cast<lite::Tensor*>(tensor)->get_tensor_total_size_in_byte(); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_get_tensor_layout(const LiteTensor tensor, LiteLayout* layout) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); | |||
| LITE_ASSERT(layout, "The layout ptr pass to LITE c_api is null"); | |||
| *layout = convert_to_clayout( | |||
| static_cast<lite::Tensor*>(tensor)->get_layout()); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_get_tensor_device_type(const LiteTensor tensor, | |||
| LiteDeviceType* device_type) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); | |||
| LITE_ASSERT(device_type, "The device ptr pass to LITE c_api is null"); | |||
| *device_type = static_cast<lite::Tensor*>(tensor)->get_device_type(); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_get_tensor_device_id(const LiteTensor tensor, int* device_id) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(tensor && device_id, "The tensor pass to LITE c_api is null"); | |||
| *device_id = static_cast<lite::Tensor*>(tensor)->get_device_id(); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_is_pinned_host(const LiteTensor tensor, int* is_pinned_host) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); | |||
| LITE_ASSERT(is_pinned_host, | |||
| "The is_pinned_host ptr pass to LITE c_api is null"); | |||
| *is_pinned_host = static_cast<lite::Tensor*>(tensor)->is_pinned_host(); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_is_memory_continue(const LiteTensor tensor, int* is_continue) { | |||
| LITE_CAPI_BEGIN(); | |||
| LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); | |||
| LITE_ASSERT(is_continue, "The is_continue ptr pass to LITE c_api is null"); | |||
| *is_continue = static_cast<lite::Tensor*>(tensor)->is_continue_memory(); | |||
| LITE_CAPI_END(); | |||
| } | |||
| int LITE_tensor_concat(LiteTensor* tensors, int nr_tensor, int dim, | |||
| LiteDeviceType dst_device, int device_id, | |||
| LiteTensor* result_tensor) { | |||
| LITE_CAPI_BEGIN(); | |||
| std::vector<lite::Tensor> v_tensors; | |||
| for (int i = 0; i < nr_tensor; i++) { | |||
| v_tensors.push_back(*static_cast<lite::Tensor*>(tensors[i])); | |||
| } | |||
| auto tensor = | |||
| lite::TensorUtils::concat(v_tensors, dim, dst_device, device_id); | |||
| get_global_tensor_holder()[tensor.get()] = tensor; | |||
| *result_tensor = tensor.get(); | |||
| LITE_CAPI_END() | |||
| } | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
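/*
 * A minimal sketch of creating a host tensor with the tensor C API above and
 * attaching caller-owned memory to it.  The buffer, the 1x3x224x224 shape and
 * the plain-C enumerator spellings (LITE_CPU, LITE_FLOAT) are assumptions;
 * consult tensor_c.h for the exact enum names.
 */
#if 0  /* illustrative sketch, not part of the original sources */
LiteLayout layout = {.shapes = {1, 3, 224, 224}, .ndim = 4,
                     .data_type = LITE_FLOAT};
LiteTensorDesc desc = {.is_pinned_host = 0, .layout = layout,
                       .device_type = LITE_CPU, .device_id = 0};
LiteTensor tensor = NULL;
LITE_make_tensor(desc, &tensor);
LITE_reset_tensor_memory(tensor, buffer, 1 * 3 * 224 * 224 * sizeof(float));
/* ... fill and use the tensor ... */
LITE_destroy_tensor(tensor);
#endif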
| @@ -0,0 +1,12 @@ | |||
| # -*- coding: utf-8 -*- | |||
| # This file is part of MegEngine, a deep learning framework developed by | |||
| # Megvii. | |||
| # | |||
# Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
| from .base import * | |||
| from .global_setting import * | |||
| from .network import * | |||
| from .struct import * | |||
| from .tensor import * | |||
| from .utils import * | |||
| @@ -0,0 +1,152 @@ | |||
| # -*- coding: utf-8 -*- | |||
| # This file is part of MegEngine, a deep learning framework developed by | |||
| # Megvii. | |||
| # | |||
# Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
| import ctypes | |||
| import glob | |||
| import logging | |||
| import os | |||
| import sys | |||
| from ctypes import * | |||
| if sys.platform == "win32": | |||
| lib_path = os.path.join(os.path.dirname(__file__), "libs") | |||
| dll_paths = list(filter(os.path.exists, [lib_path,])) | |||
| assert len(dll_paths) > 0 | |||
| kernel32 = ctypes.WinDLL("kernel32.dll", use_last_error=True) | |||
| has_load_library_attr = hasattr(kernel32, "AddDllDirectory") | |||
| old_error_mode = kernel32.SetErrorMode(0x0001) | |||
| kernel32.LoadLibraryW.restype = ctypes.c_void_p | |||
| if has_load_library_attr: | |||
| kernel32.AddDllDirectory.restype = ctypes.c_void_p | |||
| kernel32.LoadLibraryExW.restype = ctypes.c_void_p | |||
| for dll_path in dll_paths: | |||
| if sys.version_info >= (3, 8): | |||
| os.add_dll_directory(dll_path) | |||
| elif has_load_library_attr: | |||
| res = kernel32.AddDllDirectory(dll_path) | |||
| if res is None: | |||
| err = ctypes.WinError(ctypes.get_last_error()) | |||
| err.strerror += ' Error adding "{}" to the DLL search PATH.'.format( | |||
| dll_path | |||
| ) | |||
| raise err | |||
| else: | |||
print("WARN: python or OS environment has some issue, loading DLLs may fail!")
| dlls = glob.glob(os.path.join(lib_path, "*.dll")) | |||
| path_patched = False | |||
| for dll in dlls: | |||
| is_loaded = False | |||
| if has_load_library_attr: | |||
| res = kernel32.LoadLibraryExW(dll, None, 0x00001100) | |||
| last_error = ctypes.get_last_error() | |||
| if res is None and last_error != 126: | |||
| err = ctypes.WinError(last_error) | |||
| err.strerror += ' Error loading "{}" or one of its dependencies.'.format( | |||
| dll | |||
| ) | |||
| raise err | |||
| elif res is not None: | |||
| is_loaded = True | |||
| if not is_loaded: | |||
| if not path_patched: | |||
| os.environ["PATH"] = ";".join(dll_paths + [os.environ["PATH"]]) | |||
| path_patched = True | |||
| res = kernel32.LoadLibraryW(dll) | |||
| if res is None: | |||
| err = ctypes.WinError(ctypes.get_last_error()) | |||
| err.strerror += ' Error loading "{}" or one of its dependencies.'.format( | |||
| dll | |||
| ) | |||
| raise err | |||
| kernel32.SetErrorMode(old_error_mode) | |||
| class _LiteCLib: | |||
| def __init__(self): | |||
| cwd = os.getcwd() | |||
| package_dir = os.path.dirname(os.path.realpath(__file__)) | |||
| debug_path = os.getenv("LITE_LIB_PATH") | |||
| os.chdir(package_dir) | |||
| lite_libs = glob.glob("libs/liblite*") | |||
| os.chdir(cwd) | |||
| if debug_path is None: | |||
| assert len(lite_libs) == 1 | |||
| self._lib = CDLL(os.path.join(package_dir, lite_libs[0])) | |||
| else: | |||
| self._lib = CDLL(debug_path) | |||
| self._register_api( | |||
| "LITE_get_version", [POINTER(c_int), POINTER(c_int), POINTER(c_int)] | |||
| ) | |||
| self.lib.LITE_get_version.restype = None | |||
| self._register_api("LITE_set_log_level", [c_int]) | |||
self._register_api("LITE_get_log_level", [POINTER(c_int)])
| self._register_api("LITE_get_last_error", [], False) | |||
| self.lib.LITE_get_last_error.restype = c_char_p | |||
| def _errcheck(self, result, func, args): | |||
| if result: | |||
| error = self.lib.LITE_get_last_error() | |||
| msg = error.decode("utf-8") | |||
| logging.error("{}".format(msg)) | |||
| raise RuntimeError("{}".format(msg)) | |||
| return result | |||
| def _register_api(self, api_name, arg_types, error_check=True): | |||
| func = getattr(self.lib, api_name) | |||
| func.argtypes = arg_types | |||
| func.restype = c_int | |||
| if error_check: | |||
| func.errcheck = self._errcheck | |||
| @property | |||
| def lib(self): | |||
| return self._lib | |||
| @property | |||
| def version(self): | |||
| major = c_int() | |||
| minor = c_int() | |||
| patch = c_int() | |||
| self.lib.LITE_get_version(byref(major), byref(minor), byref(patch)) | |||
| return "{}.{}.{}".format(major.value, minor.value, patch.value) | |||
| def set_log_level(self, level): | |||
| self.lib.LITE_set_log_level(level) | |||
| def get_log_level(self): | |||
level = c_int()
self.lib.LITE_get_log_level(byref(level))
return level.value
| _lib = _LiteCLib() | |||
| version = _lib.version | |||
| set_log_level = _lib.set_log_level | |||
| get_log_level = _lib.get_log_level | |||
| _Cnetwork = c_void_p | |||
| _Ctensor = c_void_p | |||
| class _LiteCObjMetaClass(type): | |||
| """metaclass for lite object""" | |||
| def __new__(cls, name, bases, attrs): | |||
| for api in attrs["_api_"]: | |||
| _lib._register_api(*api) | |||
| del attrs["_api_"] | |||
| attrs["_lib"] = _lib.lib | |||
| return super().__new__(cls, name, bases, attrs) | |||
| class _LiteCObjBase(metaclass=_LiteCObjMetaClass): | |||
| _api_ = [] | |||
| @@ -0,0 +1,120 @@ | |||
| # -*- coding: utf-8 -*- | |||
| # This file is part of MegEngine, a deep learning framework developed by | |||
| # Megvii. | |||
| # | |||
# Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
| from ctypes import * | |||
| import numpy as np | |||
| from .base import _Ctensor, _lib, _LiteCObjBase | |||
| from .network import * | |||
| from .struct import LiteDataType, LiteDeviceType, LiteIOType, Structure | |||
| from .tensor import * | |||
| LiteDecryptionFunc = CFUNCTYPE( | |||
| c_size_t, c_void_p, c_size_t, POINTER(c_uint8), c_size_t, c_void_p | |||
| ) | |||
| class _GlobalAPI(_LiteCObjBase): | |||
| """ | |||
| get the api from the lib | |||
| """ | |||
| _api_ = [ | |||
| ("LITE_get_device_count", [c_int, POINTER(c_size_t)]), | |||
| ("LITE_try_coalesce_all_free_memory", []), | |||
| ( | |||
| "LITE_register_decryption_and_key", | |||
| [c_char_p, LiteDecryptionFunc, POINTER(c_uint8), c_size_t], | |||
| ), | |||
| ( | |||
| "LITE_update_decryption_or_key", | |||
| [c_char_p, c_void_p, POINTER(c_uint8), c_size_t], | |||
| ), | |||
| ("LITE_set_loader_lib_path", [c_char_p]), | |||
| ("LITE_set_persistent_cache", [c_char_p, c_int]), | |||
| # ('LITE_set_tensor_rt_cache', [c_char_p]), | |||
| ("LITE_dump_persistent_cache", [c_char_p]), | |||
("LITE_dump_tensor_rt_cache", []),
| ] | |||
| def decryption_func(func): | |||
| """the decryption function decorator | |||
:type func: a function that accepts three arrays, in_arr, key_arr and out_arr; if out_arr is None, it just queries the output array length in bytes
| """ | |||
| @CFUNCTYPE(c_size_t, c_void_p, c_size_t, POINTER(c_uint8), c_size_t, c_void_p) | |||
| def wrapper(c_in_data, in_length, c_key_data, key_length, c_out_data): | |||
| in_arr = np.frombuffer(c_in_data, dtype=np.uint8, count=in_length) | |||
| key_arr = np.frombuffer(c_key_data, dtype=np.uint8, count=key_length) | |||
| if c_out_data: | |||
out_length = func(in_arr, key_arr, None)
| out_arr = np.frombuffer(c_out_data, dtype=np.uint8, count=out_length) | |||
| return func(in_arr, key_arr, out_arr) | |||
| # just query the output length | |||
| else: | |||
| return func(in_arr, key_arr, None) | |||
| return wrapper | |||
| class LiteGlobal(object): | |||
| """ | |||
| some global config in lite | |||
| """ | |||
| _api = _GlobalAPI()._lib | |||
| @staticmethod | |||
| def register_decryption_and_key(decryption_name, decryption_func, key): | |||
| c_name = c_char_p(decryption_name.encode("utf-8")) | |||
| key_length = len(key) | |||
| c_key = (c_uint8 * key_length)(*key) | |||
| LiteGlobal._api.LITE_register_decryption_and_key( | |||
| c_name, decryption_func, c_key, key_length | |||
| ) | |||
| @staticmethod | |||
| def update_decryption_key(decryption_name, key): | |||
| c_name = c_char_p(decryption_name.encode("utf-8")) | |||
| key_length = len(key) | |||
| c_key = (c_uint8 * key_length)(*key) | |||
| LiteGlobal._api.LITE_update_decryption_or_key(c_name, None, c_key, key_length) | |||
| @staticmethod | |||
| def set_loader_lib_path(path): | |||
| c_path = c_char_p(path.encode("utf-8")) | |||
| LiteGlobal._api.LITE_set_loader_lib_path(c_path) | |||
| @staticmethod | |||
| def set_persistent_cache(path, always_sync=False): | |||
| c_path = c_char_p(path.encode("utf-8")) | |||
| LiteGlobal._api.LITE_set_persistent_cache(c_path, always_sync) | |||
| @staticmethod | |||
| def set_tensorrt_cache(path): | |||
| c_path = c_char_p(path.encode("utf-8")) | |||
LiteGlobal._api.LITE_set_tensor_rt_cache(c_path)
| @staticmethod | |||
| def dump_persistent_cache(path): | |||
| c_path = c_char_p(path.encode("utf-8")) | |||
| LiteGlobal._api.LITE_dump_persistent_cache(c_path) | |||
| @staticmethod | |||
| def dump_tensorrt_cache(): | |||
LiteGlobal._api.LITE_dump_tensor_rt_cache()
| @staticmethod | |||
| def get_device_count(device_type): | |||
| count = c_size_t() | |||
| LiteGlobal._api.LITE_get_device_count(device_type, byref(count)) | |||
| return count.value | |||
| @staticmethod | |||
| def try_coalesce_all_free_memory(): | |||
| LiteGlobal._api.LITE_try_coalesce_all_free_memory() | |||
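/*
 * For reference, a hedged C-level sketch of the call that
 * LiteGlobal.register_decryption_and_key above binds to, written against the
 * LITE_register_decryption_and_key signature defined earlier in this patch.
 * my_decrypt and the method name are hypothetical; per the LiteDecryptionFunc
 * convention, a NULL output pointer only queries the decrypted size.
 */
#if 0  /* illustrative sketch, not part of the original sources */
static size_t my_decrypt(const void* in, size_t in_len, const uint8_t* key,
                         size_t key_len, void* out) {
    if (!out)
        return in_len;  /* here the decrypted size equals the input size */
    /* ... write the decrypted bytes into `out` ... */
    return in_len;
}

const uint8_t key[] = {0x01, 0x02, 0x03, 0x04};
LITE_register_decryption_and_key("my_decrypt_method", my_decrypt, key, sizeof(key));
#endif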
| @@ -0,0 +1,531 @@ | |||
| # -*- coding: utf-8 -*- | |||
| # This file is part of MegEngine, a deep learning framework developed by | |||
| # Megvii. | |||
| # | |||
# Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
| from ctypes import * | |||
| import numpy as np | |||
| from .base import _Cnetwork, _Ctensor, _lib, _LiteCObjBase | |||
| from .struct import * | |||
| from .tensor import * | |||
| class LiteOptions(Structure): | |||
| """ | |||
the inference options used to configure a network
| """ | |||
| _fields_ = [ | |||
| ("weight_preprocess", c_int), | |||
| ("fuse_preprocess", c_int), | |||
| ("fake_next_exec", c_int), | |||
| ("var_sanity_check_first_run", c_int), | |||
| ("const_shape", c_int), | |||
| ("force_dynamic_alloc", c_int), | |||
| ("force_output_dynamic_alloc", c_int), | |||
| ("no_profiling_on_shape_change", c_int), | |||
| ("jit_level", c_int), | |||
| ("comp_node_seq_record_level", c_int), | |||
| ("graph_opt_level", c_int), | |||
| ("async_exec_level", c_int), | |||
| # layout transform options | |||
| ("enable_nchw44", c_int), | |||
| ("enable_nchw44_dot", c_int), | |||
| ("enable_nchw88", c_int), | |||
| ("enable_nhwcd4", c_int), | |||
| ("enable_nchw4", c_int), | |||
| ("enable_nchw32", c_int), | |||
| ("enable_nchw64", c_int), | |||
| ] | |||
| def __init__(self): | |||
| self.weight_preprocess = False | |||
| self.fuse_preprocess = False | |||
| self.fake_next_exec = False | |||
| self.var_sanity_check_first_run = True | |||
| self.const_shape = False | |||
| self.force_dynamic_alloc = False | |||
| self.force_output_dynamic_alloc = False | |||
| self.no_profiling_on_shape_change = False | |||
| self.jit_level = 0 | |||
| self.comp_node_seq_record_level = 0 | |||
| self.graph_opt_level = 2 | |||
| self.async_exec_level = 1 | |||
| def __repr__(self): | |||
| data = { | |||
| "weight_preprocess": bool(self.weight_preprocess), | |||
| "fuse_preprocess": bool(self.fuse_preprocess), | |||
| "fake_next_exec": bool(self.fake_next_exec), | |||
| "var_sanity_check_first_run": bool(self.var_sanity_check_first_run), | |||
| "const_shape": bool(self.const_shape), | |||
| "force_dynamic_alloc": bool(self.force_dynamic_alloc), | |||
| "force_output_dynamic_alloc": bool(self.force_output_dynamic_alloc), | |||
| "no_profiling_on_shape_change": bool(self.no_profiling_on_shape_change), | |||
| "jit_level": self.jit_level, | |||
| "comp_node_seq_record_level": self.comp_node_seq_record_level, | |||
| "graph_opt_level": self.graph_opt_level, | |||
| "async_exec_level": self.async_exec_level, | |||
| } | |||
| return data.__repr__() | |||
| class LiteConfig(Structure): | |||
| """ | |||
Configuration used when loading and compiling the graph
bare_model_cryption_name: the name of the cryption method used on the bare
model; a bare model has no model info packed inside
use_loader_dynamic_param: when the model forwards with the device loader of
an NPU, this flags whether the loader uses device input or output; set it
non-zero if device input or output is used, otherwise set it to zero
has_compression: flags whether the model is compressed; the compression
method will be used when reading the model
| """ | |||
| _fields_ = [ | |||
| ("has_compression", c_int), | |||
| ("device_id", c_int), | |||
| ("device_type", c_int), | |||
| ("backend", c_int), | |||
| ("bare_model_cryption_name", c_char_p), | |||
| ("options", LiteOptions), | |||
| ] | |||
| def __init__(self, device_type=LiteDeviceType.LITE_CPU, option=None): | |||
| self.device_type = device_type | |||
| if option: | |||
| self.options = option | |||
| else: | |||
| self.options = LiteOptions() | |||
| self.bare_model_cryption_name = c_char_p(b"") | |||
| self.use_loader_dynamic_param = 0 | |||
| self.has_compression = 0 | |||
| self.backend = LiteBackend.LITE_DEFAULT | |||
| def __repr__(self): | |||
| data = { | |||
| "has_compression": bool(self.has_compression), | |||
| "device_id": LiteDeviceType(self.device_id), | |||
| "device_type": LiteDeviceType(self.device_type), | |||
| "backend": LiteBackend(self.backend), | |||
| "bare_model_cryption_name": self.bare_model_cryption_name.decode("utf-8"), | |||
| "options": self.options, | |||
| } | |||
| return data.__repr__() | |||
| class LiteIO(Structure): | |||
| """ | |||
configure a network input or output item
name: the tensor name in the graph corresponding to the IO
is_host: marks where the input tensor comes from and where the output is
copied to; if is_host is true, the input comes from the host and the output
is copied to the host, otherwise the device. Sometimes the input comes from
the device and the output does not need to be copied to the host. Default is
true.
io_type: the IO type, SHAPE or VALUE; when SHAPE is set, the value of the
input or output tensor is invalid and only the shape will be set. Default is
VALUE.
config_layout: the layout configured by the user; if another layout is set
before forward or got after forward, this layout is bypassed. If no other
layout is set before forward, this layout takes effect. If it is not set, the
model forwards with its original layout. For outputs, it is used for checking.
| """ | |||
| _fields_ = [ | |||
| ("name", c_char_p), | |||
| ("is_host", c_int), | |||
| ("io_type", c_int), | |||
| ("config_layout", LiteLayout), | |||
| ] | |||
| def __init__( | |||
| self, name, is_host=True, io_type=LiteIOType.LITE_IO_VALUE, layout=None | |||
| ): | |||
| if type(name) == str: | |||
| self.name = c_char_p(name.encode("utf-8")) | |||
| else: | |||
| self.name = c_char_p(name) | |||
| if layout: | |||
| self.config_layout = layout | |||
| else: | |||
| self.config_layout = LiteLayout() | |||
| self.is_host = is_host | |||
| self.io_type = io_type | |||
| def __repr__(self): | |||
| data = { | |||
| "name": self.name, | |||
| "is_host": bool(self.is_host), | |||
| "io_type": LiteIOType(self.io_type), | |||
| "config_layout": self.config_layout, | |||
| } | |||
| return data.__repr__() | |||
| def __hash__(self): | |||
| return hash(self.name) | |||
| class _LiteNetworkIO(Structure): | |||
| """ | |||
the input and output information used when loading the network
| """ | |||
| _fields_ = [ | |||
| ("inputs", POINTER(LiteIO)), | |||
| ("outputs", POINTER(LiteIO)), | |||
| ("input_size", c_size_t), | |||
| ("output_size", c_size_t), | |||
| ] | |||
| def __init__(self): | |||
| self.inputs = POINTER(LiteIO)() | |||
| self.outputs = POINTER(LiteIO)() | |||
| self.input_size = 0 | |||
| self.output_size = 0 | |||
| class LiteNetworkIO(object): | |||
| """ | |||
the input and output information for the user to construct _LiteNetworkIO
| """ | |||
| def __init__(self): | |||
| self.inputs = [] | |||
| self.outputs = [] | |||
| def add_input(self, input_io): | |||
| assert isinstance(input_io, LiteIO) | |||
| self.inputs.append(input_io) | |||
| def add_output(self, output_io): | |||
| assert isinstance(output_io, LiteIO) | |||
| self.outputs.append(output_io) | |||
| def _create_network_io(self): | |||
| network_io = _LiteNetworkIO() | |||
| length = 1 if len(self.inputs) == 0 else len(self.inputs) | |||
| self.c_inputs = (LiteIO * length)(*self.inputs) | |||
| length = 1 if len(self.outputs) == 0 else len(self.outputs) | |||
| self.c_outputs = (LiteIO * length)(*self.outputs) | |||
| network_io.inputs = pointer(self.c_inputs[0]) | |||
| network_io.outputs = pointer(self.c_outputs[0]) | |||
| network_io.input_size = len(self.inputs) | |||
| network_io.output_size = len(self.outputs) | |||
| return network_io | |||
| def __repr__(self): | |||
| data = {"inputs": list(self.inputs), "outputs": list(self.outputs)} | |||
| return data.__repr__() | |||
| LiteAsyncCallback = CFUNCTYPE(c_int) | |||
| def start_finish_callback(func): | |||
| @CFUNCTYPE(c_int, POINTER(LiteIO), POINTER(_Ctensor), c_size_t) | |||
| def wrapper(c_ios, c_tensors, size): | |||
| ios = {} | |||
| for i in range(size): | |||
| tensor = LiteTensor() | |||
| tensor._tensor = c_tensors[i] | |||
| tensor.update() | |||
| io = c_ios[i] | |||
| ios[io] = tensor | |||
| return func(ios) | |||
| return wrapper | |||
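| # Usage sketch (assumes a LiteNetwork instance named `network`, a class defined | |||
| # later in this module, that has already loaded a model): decorate a plain | |||
| # Python function with start_finish_callback so it can be passed to | |||
| # set_start_callback or set_finish_callback; the wrapped function receives a | |||
| # dict mapping each LiteIO to its LiteTensor and should return 0. | |||
| # | |||
| #     @start_finish_callback | |||
| #     def on_start(ios): | |||
| #         for io, tensor in ios.items(): | |||
| #             print(io.name, tensor.layout) | |||
| #         return 0 | |||
| # | |||
| #     network.set_start_callback(on_start) | |||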
| class _NetworkAPI(_LiteCObjBase): | |||
| """ | |||
| get the network api from the lib | |||
| """ | |||
| _api_ = [ | |||
| ("LITE_make_default_network", [POINTER(_Cnetwork)]), | |||
| ("LITE_make_network", [POINTER(_Cnetwork), LiteConfig, _LiteNetworkIO]), | |||
| ("LITE_load_model_from_mem", [_Cnetwork, c_void_p, c_size_t]), | |||
| ("LITE_load_model_from_path", [_Cnetwork, c_char_p]), | |||
| ("LITE_shared_weight_with_network", [_Cnetwork, _Ctensor]), | |||
| ("LITE_destroy_network", [_Cnetwork]), | |||
| ("LITE_forward", [_Cnetwork]), | |||
| ("LITE_wait", [_Cnetwork]), | |||
| ("LITE_get_io_tensor", [_Cnetwork, c_char_p, c_int, POINTER(_Ctensor)]), | |||
| ("LITE_get_input_name", [_Cnetwork, c_size_t, POINTER(c_char_p)]), | |||
| ("LITE_get_output_name", [_Cnetwork, c_size_t, POINTER(c_char_p)]), | |||
| ("LITE_get_all_input_name", [_Cnetwork, POINTER(c_size_t), POINTER(c_char_p)]), | |||
| ("LITE_get_all_output_name", [_Cnetwork, POINTER(c_size_t), POINTER(c_char_p)]), | |||
| ("LITE_is_cpu_inplace_mode", [_Cnetwork, POINTER(c_int)]), | |||
| ("LITE_get_cpu_threads_number", [_Cnetwork, POINTER(c_size_t)]), | |||
| ("LITE_get_device_id", [_Cnetwork, POINTER(c_int)]), | |||
| ("LITE_set_device_id", [_Cnetwork, c_int]), | |||
| ("LITE_set_cpu_inplace_mode", [_Cnetwork]), | |||
| ("LITE_use_tensorrt", [_Cnetwork]), | |||
| ("LITE_set_cpu_threads_number", [_Cnetwork, c_size_t]), | |||
| ("LITE_set_stream_id", [_Cnetwork, c_int]), | |||
| ("LITE_get_stream_id", [_Cnetwork, POINTER(c_int)]), | |||
| ("LITE_set_network_algo_policy", [_Cnetwork, c_int]), | |||
| ("LITE_set_network_algo_fastrun_config", [_Cnetwork, c_int, c_int]), | |||
| ("LITE_set_network_algo_workspace_limit", [_Cnetwork, c_size_t]), | |||
| ("LITE_share_runtime_memroy", [_Cnetwork, _Cnetwork]), | |||
| ("LITE_enable_profile_performance", [_Cnetwork, c_char_p]), | |||
| ("LITE_enable_io_txt_dump", [_Cnetwork, c_char_p]), | |||
| ("LITE_enable_io_bin_dump", [_Cnetwork, c_char_p]), | |||
| ("LITE_set_async_callback", [_Cnetwork, LiteAsyncCallback]), | |||
| ("LITE_set_start_callback", [_Cnetwork]), | |||
| ("LITE_set_finish_callback", [_Cnetwork]), | |||
| ] | |||
| class LiteNetwork(object): | |||
| """ | |||
| the network to load a model and forward | |||
| """ | |||
| _api = _NetworkAPI()._lib | |||
| def __init__(self, config=None, io=None): | |||
| """ | |||
| create a network with config and networkio | |||
| """ | |||
| self._network = _Cnetwork() | |||
| if config: | |||
| self.config = config | |||
| else: | |||
| self.config = LiteConfig() | |||
| if io: | |||
| self.network_io = io | |||
| else: | |||
| self.network_io = LiteNetworkIO() | |||
| c_network_io = self.network_io._create_network_io() | |||
| self._api.LITE_make_network(byref(self._network), self.config, c_network_io) | |||
| def __repr__(self): | |||
| data = {"config": self.config, "IOs": self.network_io} | |||
| return data.__repr__() | |||
| def __del__(self): | |||
| self._api.LITE_destroy_network(self._network) | |||
| def load(self, path): | |||
| c_path = c_char_p(path.encode("utf-8")) | |||
| self._api.LITE_load_model_from_path(self._network, c_path) | |||
| def forward(self): | |||
| self._api.LITE_forward(self._network) | |||
| def wait(self): | |||
| self._api.LITE_wait(self._network) | |||
| def is_cpu_inplace_mode(self): | |||
| """ | |||
| whether the network runs in CPU inplace mode | |||
| """ | |||
| inplace = c_int() | |||
| self._api.LITE_is_cpu_inplace_mode(self._network, byref(inplace)) | |||
| return bool(inplace.value) | |||
| def enable_cpu_inplace_mode(self): | |||
| """ | |||
| set CPU forward to inplace mode, in which CPU forward creates only one | |||
| thread | |||
| Note: this must be set before the network loaded | |||
| """ | |||
| self._api.LITE_set_cpu_inplace_mode(self._network) | |||
| def use_tensorrt(self): | |||
| """ | |||
| Note: this must be set before the network loaded | |||
| """ | |||
| self._api.LITE_use_tensorrt(self._network) | |||
| @property | |||
| def device_id(self): | |||
| """ | |||
| get the device id | |||
| """ | |||
| device_id = c_int() | |||
| self._api.LITE_get_device_id(self._network, byref(device_id)) | |||
| return device_id.value | |||
| @device_id.setter | |||
| def device_id(self, device_id): | |||
| """ | |||
| set the device id | |||
| Note: this must be set before the network loaded | |||
| """ | |||
| self._api.LITE_set_device_id(self._network, device_id) | |||
| @property | |||
| def stream_id(self): | |||
| """ | |||
| get the stream id | |||
| """ | |||
| stream_id = c_int() | |||
| self._api.LITE_get_stream_id(self._network, byref(stream_id)) | |||
| return stream_id.value | |||
| @stream_id.setter | |||
| def stream_id(self, stream_id): | |||
| """ | |||
| set the stream id | |||
| Note: this must be set before the network loaded | |||
| """ | |||
| self._api.LITE_set_stream_id(self._network, stream_id) | |||
| @property | |||
| def threads_number(self): | |||
| """ | |||
| get the thread number of the network | |||
| """ | |||
| nr_thread = c_size_t() | |||
| self._api.LITE_get_cpu_threads_number(self._network, byref(nr_thread)) | |||
| return nr_thread.value | |||
| @threads_number.setter | |||
| def threads_number(self, nr_threads): | |||
| """ | |||
| set the network to forward in multithread mode with the given thread number | |||
| Note: this must be set before the network loaded | |||
| """ | |||
| self._api.LITE_set_cpu_threads_number(self._network, nr_threads) | |||
| def get_io_tensor(self, name, phase=LiteTensorPhase.LITE_IO): | |||
| """ | |||
| get input or output tensor by its name | |||
| """ | |||
| if type(name) == str: | |||
| c_name = c_char_p(name.encode("utf-8")) | |||
| else: | |||
| c_name = c_char_p(name) | |||
| tensor = LiteTensor() | |||
| self._api.LITE_get_io_tensor( | |||
| self._network, c_name, phase, byref(tensor._tensor) | |||
| ) | |||
| tensor.update() | |||
| return tensor | |||
| def get_input_name(self, index): | |||
| """ | |||
| get the input name by the index in the network | |||
| """ | |||
| c_name = c_char_p() | |||
| self._api.LITE_get_input_name(self._network, index, byref(c_name)) | |||
| return c_name.value.decode("utf-8") | |||
| def get_output_name(self, index): | |||
| """ | |||
| get the output name by the index in the network | |||
| """ | |||
| c_name = c_char_p() | |||
| self._api.LITE_get_output_name(self._network, index, byref(c_name)) | |||
| return c_name.value.decode("utf-8") | |||
| def get_all_input_name(self): | |||
| """ | |||
| get all the input tensor name in the network | |||
| """ | |||
| nr_input = c_size_t() | |||
| self._api.LITE_get_all_input_name(self._network, byref(nr_input), None) | |||
| if nr_input.value > 0: | |||
| names = (c_char_p * nr_input.value)() | |||
| self._api.LITE_get_all_input_name(self._network, None, names) | |||
| ret_name = [names[i].decode("utf-8") for i in range(nr_input.value)] | |||
| return ret_name | |||
| def get_all_output_name(self): | |||
| """ | |||
| get all the output tensor name in the network | |||
| """ | |||
| nr_output = c_size_t() | |||
| self._api.LITE_get_all_output_name(self._network, byref(nr_output), None) | |||
| if nr_output.value > 0: | |||
| names = (c_char_p * nr_output.value)() | |||
| self._api.LITE_get_all_output_name(self._network, None, names) | |||
| ret_name = [names[i].decode("utf-8") for i in range(nr_output.value)] | |||
| return ret_name | |||
| def share_weights_with(self, src_network): | |||
| """ | |||
| share weights with the loaded network | |||
| """ | |||
| assert isinstance(src_network, LiteNetwork) | |||
| self._api.LITE_shared_weight_with_network(self._network, src_network._network) | |||
| def share_runtime_memroy(self, src_network): | |||
| """ | |||
| share runtime memory with the source network | |||
| """ | |||
| assert isinstance(src_network, LiteNetwork) | |||
| self._api.LITE_share_runtime_memroy(self._network, src_network._network) | |||
| def async_with_callback(self, async_callback): | |||
| # keep a reference on self so the ctypes callback is not garbage collected | |||
| self._async_callback = LiteAsyncCallback(async_callback) | |||
| self._api.LITE_set_async_callback(self._network, self._async_callback) | |||
| def set_start_callback(self, start_callback): | |||
| """ | |||
| when the network starts to forward, this callback is called. The | |||
| start_callback receives a dict mapping each LiteIO to its corresponding | |||
| LiteTensor as its parameter | |||
| """ | |||
| self._api.LITE_set_start_callback(self._network, start_callback) | |||
| def set_finish_callback(self, finish_callback): | |||
| """ | |||
| when the network finishes forward, this callback is called. The | |||
| finish_callback receives a dict mapping each LiteIO to its corresponding | |||
| LiteTensor as its parameter | |||
| """ | |||
| self._api.LITE_set_finish_callback(self._network, finish_callback) | |||
| def enable_profile_performance(self, profile_file): | |||
| c_file = profile_file.encode("utf-8") | |||
| self._api.LITE_enable_profile_performance(self._network, c_file) | |||
| def set_network_algo_workspace_limit(self, size_limit): | |||
| self._api.LITE_set_network_algo_workspace_limit(self._network, size_limit) | |||
| def set_network_algo_policy( | |||
| self, policy, shared_batch_size=0, binary_equal_between_batch=False | |||
| ): | |||
| """ | |||
| shared_batch_size: the batch size used by fastrun. A non-zero value means | |||
| fastrun uses this batch size regardless of the batch size of the model; | |||
| zero means fastrun uses the batch size of the model. | |||
| binary_equal_between_batch: if the content of each input batch is binary | |||
| equal, whether the content of each output batch is guaranteed to be | |||
| equal as well | |||
| """ | |||
| self._api.LITE_set_network_algo_policy(self._network, policy) | |||
| self._api.LITE_set_network_algo_fastrun_config( | |||
| self._network, shared_batch_size, binary_equal_between_batch | |||
| ) | |||
| def io_txt_dump(self, txt_file): | |||
| c_file = txt_file.encode("utf-8") | |||
| self._api.LITE_enable_io_txt_dump(self._network, c_file) | |||
| def io_bin_dump(self, bin_dir): | |||
| c_dir = bin_dir.encode("utf-8") | |||
| self._api.LITE_enable_io_bin_dump(self._network, c_dir) | |||
| @@ -0,0 +1,90 @@ | |||
| # -*- coding: utf-8 -*- | |||
| # This file is part of MegEngine, a deep learning framework developed by | |||
| # Megvii. | |||
| # | |||
| # Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| import logging | |||
| from ctypes import * | |||
| from enum import Enum, IntEnum | |||
| class LiteBackend(IntEnum): | |||
| LITE_DEFAULT = 0 | |||
| class LiteDeviceType(IntEnum): | |||
| LITE_CPU = 0 | |||
| LITE_CUDA = 1 | |||
| LITE_ATLAS = 3 | |||
| LITE_NPU = 4 | |||
| LITE_DEVICE_DEFAULT = 5 | |||
| class LiteDataType(IntEnum): | |||
| LITE_FLOAT = 0 | |||
| LITE_HALF = 1 | |||
| LITE_INT = 2 | |||
| LITE_INT16 = 3 | |||
| LITE_INT8 = 4 | |||
| LITE_UINT8 = 5 | |||
| class LiteTensorPhase(IntEnum): | |||
| LITE_IO = 0 | |||
| LITE_INPUT = 1 | |||
| LITE_OUTPUT = 2 | |||
| class LiteIOType(IntEnum): | |||
| """ | |||
| the input and output type, including SHAPE and VALUE; | |||
| sometimes the user only needs the shape of the output tensor | |||
| """ | |||
| LITE_IO_VALUE = 0 | |||
| LITE_IO_SHAPE = 1 | |||
| class LiteAlgoSelectStrategy(IntEnum): | |||
| """ | |||
| operation algorithm selection strategy type. Some operations have multiple | |||
| algorithms, and different algorithms have different attributes; according to | |||
| the strategy, the best algorithm will be selected. | |||
| Note: these strategies can be combined | |||
| LITE_ALGO_HEURISTIC | LITE_ALGO_PROFILE means: if the profile cache is not | |||
| valid, use heuristic instead | |||
| LITE_ALGO_HEURISTIC | LITE_ALGO_REPRODUCIBLE means: heuristic chooses the | |||
| reproducible algorithm | |||
| LITE_ALGO_PROFILE | LITE_ALGO_REPRODUCIBLE means: profile the best | |||
| algorithm from the set of reproducible algorithms | |||
| LITE_ALGO_PROFILE | LITE_ALGO_OPTIMIZED means: profile the best | |||
| algorithm from the optimized algorithms, so profiling runs faster | |||
| LITE_ALGO_PROFILE | LITE_ALGO_OPTIMIZED | LITE_ALGO_REPRODUCIBLE means: | |||
| profile the best algorithm from the optimized and reproducible algorithms | |||
| """ | |||
| LITE_ALGO_HEURISTIC = 1 | |||
| LITE_ALGO_PROFILE = 2 | |||
| LITE_ALGO_REPRODUCIBLE = 4 | |||
| LITE_ALGO_OPTIMIZED = 8 | |||
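| # Combination sketch (assumes a loaded LiteNetwork named `network` from | |||
| # network.py): the strategies are bit flags and can be OR-ed together before | |||
| # being passed to LiteNetwork.set_network_algo_policy. | |||
| #     policy = ( | |||
| #         LiteAlgoSelectStrategy.LITE_ALGO_PROFILE | |||
| #         | LiteAlgoSelectStrategy.LITE_ALGO_REPRODUCIBLE | |||
| #     ) | |||
| #     network.set_network_algo_policy(policy) | |||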
| class LiteLogLevel(IntEnum): | |||
| """ | |||
| DEBUG: The most verbose level, printing debugging info | |||
| INFO: The default level | |||
| WARN: Printing warnings | |||
| ERROR: The least verbose level, printing errors only | |||
| """ | |||
| DEBUG = 0 | |||
| INFO = 1 | |||
| WARN = 2 | |||
| ERROR = 3 | |||
| @@ -0,0 +1,471 @@ | |||
| # -*- coding: utf-8 -*- | |||
| # This file is part of MegEngine, a deep learning framework developed by | |||
| # Megvii. | |||
| # | |||
| # Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| from ctypes import * | |||
| import numpy as np | |||
| from .base import _Ctensor, _lib, _LiteCObjBase | |||
| from .struct import LiteDataType, LiteDeviceType, LiteIOType, Structure | |||
| MAX_DIM = 7 | |||
| _lite_type_to_nptypes = { | |||
| LiteDataType.LITE_INT: np.int32, | |||
| LiteDataType.LITE_FLOAT: np.float32, | |||
| LiteDataType.LITE_UINT8: np.uint8, | |||
| LiteDataType.LITE_INT8: np.int8, | |||
| LiteDataType.LITE_INT16: np.int16, | |||
| LiteDataType.LITE_HALF: np.float16, | |||
| } | |||
| _nptype_to_lite_type = {val: key for key, val in _lite_type_to_nptypes.items()} | |||
| _str_nptypes_to_lite_nptypes = { | |||
| np.dtype("int32"): LiteDataType.LITE_INT, | |||
| np.dtype("float32"): LiteDataType.LITE_FLOAT, | |||
| np.dtype("uint8"): LiteDataType.LITE_UINT8, | |||
| np.dtype("int8"): LiteDataType.LITE_INT8, | |||
| np.dtype("int16"): LiteDataType.LITE_INT16, | |||
| np.dtype("float16"): LiteDataType.LITE_HALF, | |||
| } | |||
| ctype_to_lite_dtypes = { | |||
| c_int: LiteDataType.LITE_INT, | |||
| c_uint: LiteDataType.LITE_INT, | |||
| c_float: LiteDataType.LITE_FLOAT, | |||
| c_ubyte: LiteDataType.LITE_UINT8, | |||
| c_byte: LiteDataType.LITE_INT8, | |||
| c_short: LiteDataType.LITE_INT16, | |||
| c_ushort: LiteDataType.LITE_INT16, | |||
| } | |||
| class LiteLayout(Structure): | |||
| """ | |||
| the simple layout description | |||
| """ | |||
| _fields_ = [ | |||
| ("shapes", c_size_t * MAX_DIM), | |||
| ("ndim", c_size_t), | |||
| ("data_type", c_int), | |||
| ] | |||
| def __init__(self, shape=None, dtype=None): | |||
| if shape: | |||
| shape = list(shape) | |||
| assert len(shape) <= MAX_DIM, "Layout max dim is 7." | |||
| self.shapes = (c_size_t * MAX_DIM)(*shape) | |||
| self.ndim = len(shape) | |||
| else: | |||
| self.shapes = (c_size_t * MAX_DIM)() | |||
| self.ndim = 0 | |||
| if not dtype: | |||
| self.data_type = LiteDataType.LITE_FLOAT | |||
| elif isinstance(dtype, LiteDataType): | |||
| self.data_type = dtype | |||
| elif type(dtype) == str: | |||
| self.data_type = _str_nptypes_to_lite_nptypes[np.dtype(dtype)] | |||
| elif isinstance(dtype, np.dtype): | |||
| ctype = np.ctypeslib.as_ctypes_type(dtype) | |||
| self.data_type = ctype_to_lite_dtypes[ctype] | |||
| elif isinstance(dtype, type): | |||
| self.data_type = _nptype_to_lite_type[dtype] | |||
| else: | |||
| raise RuntimeError("unkonw data type") | |||
| def __repr__(self): | |||
| data = { | |||
| "shapes": list(self.shapes), | |||
| "ndim": self.ndim, | |||
| "data_type": _lite_type_to_nptypes[LiteDataType(self.data_type)], | |||
| } | |||
| return data.__repr__() | |||
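| # Construction sketches (illustrative shapes): the dtype argument may be a | |||
| # string, a numpy dtype, a numpy scalar type or a LiteDataType. | |||
| #     LiteLayout([2, 16], "int8") | |||
| #     LiteLayout([1, 3, 224, 224], np.float32) | |||
| #     LiteLayout((4, 32), dtype=np.dtype("int16")) | |||
| #     LiteLayout([8], LiteDataType.LITE_FLOAT) | |||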
| class _LiteTensorDesc(Structure): | |||
| """ | |||
| wrapper of the MegEngine Tensor | |||
| :is_pinned_host: when set, the storage memory of the tensor is pinned memory, | |||
| which is used to optimize the H2D or D2H memory copy. If the device or layout | |||
| is not set, when copying from another device (CUDA) tensor, this tensor | |||
| will automatically be set to a pinned tensor | |||
| """ | |||
| _fields_ = [ | |||
| ("is_pinned_host", c_int), | |||
| ("layout", LiteLayout), | |||
| ("device_type", c_int), | |||
| ("device_id", c_int), | |||
| ] | |||
| def __init__(self): | |||
| self.layout = LiteLayout() | |||
| self.device_type = LiteDeviceType.LITE_CPU | |||
| self.is_pinned_host = False | |||
| self.device_id = 0 | |||
| def __repr__(self): | |||
| data = { | |||
| "is_pinned_host": self.is_pinned_host, | |||
| "layout": LiteLayout(self.layout), | |||
| "device_type": LiteDeviceType(self.device_type.value), | |||
| "device_id": self.device_id, | |||
| } | |||
| return data.__repr__() | |||
| class _TensorAPI(_LiteCObjBase): | |||
| """ | |||
| get the api from the lib | |||
| """ | |||
| _api_ = [ | |||
| ("LITE_make_tensor", [_LiteTensorDesc, POINTER(_Ctensor)]), | |||
| ("LITE_set_tensor_layout", [_Ctensor, LiteLayout]), | |||
| ("LITE_reset_tensor_memory", [_Ctensor, c_void_p, c_size_t]), | |||
| ("LITE_reset_tensor", [_Ctensor, LiteLayout, c_void_p]), | |||
| ("LITE_tensor_reshape", [_Ctensor, POINTER(c_int), c_int]), | |||
| ( | |||
| "LITE_tensor_slice", | |||
| [ | |||
| _Ctensor, | |||
| POINTER(c_size_t), | |||
| POINTER(c_size_t), | |||
| POINTER(c_size_t), | |||
| c_size_t, | |||
| POINTER(_Ctensor), | |||
| ], | |||
| ), | |||
| ( | |||
| "LITE_tensor_concat", | |||
| [POINTER(_Ctensor), c_int, c_int, c_int, c_int, POINTER(_Ctensor),], | |||
| ), | |||
| ("LITE_tensor_fill_zero", [_Ctensor]), | |||
| ("LITE_tensor_copy", [_Ctensor, _Ctensor]), | |||
| ("LITE_tensor_share_memory_with", [_Ctensor, _Ctensor]), | |||
| ("LITE_get_tensor_memory", [_Ctensor, POINTER(c_void_p)]), | |||
| ("LITE_get_tensor_total_size_in_byte", [_Ctensor, POINTER(c_size_t)]), | |||
| ("LITE_get_tensor_layout", [_Ctensor, POINTER(LiteLayout)]), | |||
| ("LITE_get_tensor_device_type", [_Ctensor, POINTER(c_int)]), | |||
| ("LITE_get_tensor_device_id", [_Ctensor, POINTER(c_int)]), | |||
| ("LITE_destroy_tensor", [_Ctensor]), | |||
| ("LITE_is_pinned_host", [_Ctensor, POINTER(c_int)]), | |||
| ] | |||
| class LiteTensor(object): | |||
| """ | |||
| the tensor to hold a block of data | |||
| """ | |||
| _api = _TensorAPI()._lib | |||
| def __init__( | |||
| self, | |||
| layout=None, | |||
| device_type=LiteDeviceType.LITE_CPU, | |||
| device_id=0, | |||
| is_pinned_host=False, | |||
| ): | |||
| """ | |||
| create a Tensor with layout, device, is_pinned_host param | |||
| """ | |||
| self._tensor = _Ctensor() | |||
| if layout: | |||
| self._layout = layout | |||
| else: | |||
| self._layout = LiteLayout() | |||
| self._device_type = device_type | |||
| self._device_id = device_id | |||
| self._is_pinned_host = is_pinned_host | |||
| tensor_desc = _LiteTensorDesc() | |||
| tensor_desc.layout = self._layout | |||
| tensor_desc.device_type = device_type | |||
| tensor_desc.device_id = device_id | |||
| tensor_desc.is_pinned_host = is_pinned_host | |||
| self._api.LITE_make_tensor(tensor_desc, byref(self._tensor)) | |||
| def __del__(self): | |||
| self._api.LITE_destroy_tensor(self._tensor) | |||
| def fill_zero(self): | |||
| """ | |||
| fill the buffer memory with zero | |||
| """ | |||
| self._api.LITE_tensor_fill_zero(self._tensor) | |||
| self.update() | |||
| def share_memory_with(self, src_tensor): | |||
| """ | |||
| share the same memory with src_tensor; the memory owned by self will be freed | |||
| """ | |||
| assert isinstance(src_tensor, LiteTensor) | |||
| self._api.LITE_tensor_share_memory_with(self._tensor, src_tensor._tensor) | |||
| self.update() | |||
| @property | |||
| def layout(self): | |||
| self._api.LITE_get_tensor_layout(self._tensor, byref(self._layout)) | |||
| return self._layout | |||
| @layout.setter | |||
| def layout(self, layout): | |||
| assert isinstance(layout, LiteLayout) | |||
| self._layout = layout | |||
| self._api.LITE_set_tensor_layout(self._tensor, layout) | |||
| @property | |||
| def is_pinned_host(self): | |||
| """ | |||
| whether the tensor is a pinned tensor | |||
| """ | |||
| pinned = c_int() | |||
| self._api.LITE_is_pinned_host(self._tensor, byref(pinned)) | |||
| self._is_pinned_host = pinned | |||
| return bool(self._is_pinned_host) | |||
| @property | |||
| def device_type(self): | |||
| """ | |||
| get device of the tensor | |||
| """ | |||
| device_type = c_int() | |||
| self._api.LITE_get_tensor_device_type(self._tensor, byref(device_type)) | |||
| self._device_type = device_type | |||
| return LiteDeviceType(device_type.value) | |||
| @property | |||
| def device_id(self): | |||
| """ | |||
| get device id of the tensor | |||
| """ | |||
| device_id = c_int() | |||
| self._api.LITE_get_tensor_device_id(self._tensor, byref(device_id)) | |||
| self._device_id = device_id.value | |||
| return device_id.value | |||
| @property | |||
| def is_continue(self): | |||
| """ | |||
| whether the tensor memory is contiguous | |||
| """ | |||
| is_continue = c_int() | |||
| self._api.LITE_is_memory_continue(self._tensor, byref(is_continue)) | |||
| return bool(is_continue.value) | |||
| @property | |||
| def nbytes(self): | |||
| """ | |||
| get the length of the memory in bytes | |||
| """ | |||
| self.update() | |||
| length = c_size_t() | |||
| self._api.LITE_get_tensor_total_size_in_byte(self._tensor, byref(length)) | |||
| return length.value | |||
| def update(self): | |||
| """ | |||
| update the members from C; this is called automatically after slice and share | |||
| """ | |||
| pinned = c_int() | |||
| self._api.LITE_is_pinned_host(self._tensor, byref(pinned)) | |||
| self._is_pinned_host = pinned | |||
| device_type = c_int() | |||
| self._api.LITE_get_tensor_device_type(self._tensor, byref(device_type)) | |||
| self._device_type = device_type | |||
| self._api.LITE_get_tensor_layout(self._tensor, byref(self._layout)) | |||
| def copy_from(self, src_tensor): | |||
| """ | |||
| copy memory from the src_tensor | |||
| """ | |||
| assert isinstance(src_tensor, LiteTensor) | |||
| self._api.LITE_tensor_copy(self._tensor, src_tensor._tensor) | |||
| self.update() | |||
| def reshape(self, shape): | |||
| """ | |||
| reshape the tensor without changing the data, only the shape changes | |||
| :param shape: int array of the destination shape | |||
| """ | |||
| shape = list(shape) | |||
| length = len(shape) | |||
| c_shape = (c_int * length)(*shape) | |||
| self._api.LITE_tensor_reshape(self._tensor, c_shape, length) | |||
| self.update() | |||
| def slice(self, start, end, step=None): | |||
| """ | |||
| slice the tensor with the given start, end and step | |||
| :param start: slice begin index of each dim | |||
| :param end: slice end index of each dim | |||
| :param step: slice step of each dim | |||
| """ | |||
| start = list(start) | |||
| end = list(end) | |||
| length = len(start) | |||
| assert length == len(end), "slice with different length of start and end." | |||
| if step: | |||
| assert length == len(step), "slice with different length of start and step." | |||
| step = list(step) | |||
| else: | |||
| step = [1 for i in range(length)] | |||
| c_start = (c_size_t * length)(*start) | |||
| c_end = (c_size_t * length)(*end) | |||
| c_step = (c_size_t * length)(*step) | |||
| slice_tensor = LiteTensor() | |||
| self._api.LITE_tensor_slice( | |||
| self._tensor, c_start, c_end, c_step, length, byref(slice_tensor._tensor) | |||
| ) | |||
| slice_tensor.update() | |||
| return slice_tensor | |||
| def get_ctypes_memory(self): | |||
| """ | |||
| get the memory of the tensor, return c_void_p of the tensor memory | |||
| """ | |||
| self.update() | |||
| mem = c_void_p() | |||
| self._api.LITE_get_tensor_memory(self._tensor, byref(mem)) | |||
| return mem | |||
| def set_data_by_share(self, data, length=0, layout=None): | |||
| """ | |||
| share the data with the tensor | |||
| param data: the data to be shared with the tensor; it should be a | |||
| numpy.ndarray or ctypes data | |||
| """ | |||
| self.update() | |||
| if isinstance(data, np.ndarray): | |||
| assert ( | |||
| self.is_continue | |||
| ), "set_data_by_share can only apply in continue tensor." | |||
| assert ( | |||
| self.is_pinned_host or self.device_type == LiteDeviceType.LITE_CPU | |||
| ), "set_data_by_share can only apply in cpu tensor or pinned tensor." | |||
| np_type = _lite_type_to_nptypes[LiteDataType(self._layout.data_type)] | |||
| c_type = np.ctypeslib.as_ctypes_type(np_type) | |||
| if self.nbytes != data.nbytes: | |||
| self.layout = LiteLayout(data.shape, ctype_to_lite_dtypes[c_type]) | |||
| self._shared_data = data | |||
| data = data.ctypes.data_as(POINTER(c_type)) | |||
| if layout is not None: | |||
| self.layout = layout | |||
| else: | |||
| assert length == 0 or length == self.nbytes, "the data length is not match." | |||
| self._api.LITE_reset_tensor_memory(self._tensor, data, self.nbytes) | |||
| def set_data_by_copy(self, data, data_length=0, layout=None): | |||
| """ | |||
| copy the data to the tensor | |||
| param data: the data to copy into the tensor; it should be a list, a | |||
| numpy.ndarray or ctypes data with a length | |||
| """ | |||
| self.update() | |||
| if layout is not None: | |||
| self.layout = layout | |||
| assert self.is_continue, "set_data_by_copy can only apply in continue tensor." | |||
| assert ( | |||
| self.is_pinned_host or self.device_type == LiteDeviceType.LITE_CPU | |||
| ), "set_data_by_copy can only apply in cpu tensor or pinned tensor." | |||
| np_type = _lite_type_to_nptypes[LiteDataType(self._layout.data_type)] | |||
| c_type = np.ctypeslib.as_ctypes_type(np_type) | |||
| tensor_memory = c_void_p() | |||
| if type(data) == list: | |||
| length = len(data) | |||
| self._api.LITE_get_tensor_memory(self._tensor, byref(tensor_memory)) | |||
| tensor_length = self.nbytes | |||
| assert ( | |||
| length * sizeof(c_type) <= tensor_length | |||
| ), "the length of input data to set to the tensor is too large." | |||
| arr = (c_type * length)(*data) | |||
| memmove(tensor_memory, arr, sizeof(c_type) * length) | |||
| elif type(data) == np.ndarray: | |||
| if self.nbytes != data.nbytes: | |||
| self.layout = LiteLayout(data.shape, data.dtype) | |||
| arr = data.ctypes.data_as(POINTER(c_type)) | |||
| self._api.LITE_get_tensor_memory(self._tensor, byref(tensor_memory)) | |||
| assert self.nbytes == data.nbytes | |||
| memmove(tensor_memory, arr, self.nbytes) | |||
| else: | |||
| assert ( | |||
| data_length == self.nbytes or layout is not None | |||
| ), "when input data is ctypes, the length of input data or layout must set" | |||
| self._api.LITE_get_tensor_memory(self._tensor, byref(tensor_memory)) | |||
| memmove(tensor_memory, data, data_length) | |||
| def to_numpy(self): | |||
| """ | |||
| get the buffer of the tensor | |||
| """ | |||
| self.update() | |||
| if self.nbytes <= 0: | |||
| return np.array([]) | |||
| if self.is_continue and ( | |||
| self.is_pinned_host or self.device_type == LiteDeviceType.LITE_CPU | |||
| ): | |||
| ptr = c_void_p() | |||
| self._api.LITE_get_tensor_memory(self._tensor, byref(ptr)) | |||
| np_type = _lite_type_to_nptypes[LiteDataType(self._layout.data_type)] | |||
| shape = [self._layout.shapes[i] for i in range(self._layout.ndim)] | |||
| np_arr = np.zeros(shape, np_type) | |||
| if np_arr.nbytes: | |||
| memmove(np_arr.ctypes.data_as(c_void_p), ptr, np_arr.nbytes) | |||
| return np_arr | |||
| else: | |||
| tmp_tensor = LiteTensor(self.layout) | |||
| tmp_tensor.copy_from(self) | |||
| return tmp_tensor.to_numpy() | |||
| def __repr__(self): | |||
| self.update() | |||
| data = { | |||
| "layout": self._layout, | |||
| "device_type": LiteDeviceType(self._device_type.value), | |||
| "device_id": int(self.device_id), | |||
| "is_pinned_host": bool(self._is_pinned_host), | |||
| } | |||
| return data.__repr__() | |||
| def LiteTensorConcat( | |||
| tensors, dim, device_type=LiteDeviceType.LITE_DEVICE_DEFAULT, device_id=-1 | |||
| ): | |||
| """ | |||
| concatenate the input tensors along the given dim into one tensor | |||
| dim: the dim along which to concatenate | |||
| device_type: the device type of the result tensor | |||
| device_id: the device id of the result tensor | |||
| """ | |||
| api = _TensorAPI()._lib | |||
| length = len(tensors) | |||
| c_tensors = [t._tensor for t in tensors] | |||
| c_tensors = (_Ctensor * length)(*c_tensors) | |||
| result_tensor = LiteTensor() | |||
| api.LITE_tensor_concat( | |||
| cast(byref(c_tensors), POINTER(c_void_p)), | |||
| length, | |||
| dim, | |||
| device_type, | |||
| device_id, | |||
| byref(result_tensor._tensor), | |||
| ) | |||
| result_tensor.update() | |||
| return result_tensor | |||
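| # Usage sketch (illustrative shapes): concatenate two host tensors along dim 0. | |||
| #     a = LiteTensor(LiteLayout([1, 8], "float32")) | |||
| #     b = LiteTensor(LiteLayout([1, 8], "float32")) | |||
| #     a.fill_zero() | |||
| #     b.fill_zero() | |||
| #     c = LiteTensorConcat([a, b], 0, device_type=LiteDeviceType.LITE_CPU) | |||
| #     # c.layout.shapes now starts with [2, 8] | |||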
| @@ -0,0 +1,122 @@ | |||
| # -*- coding: utf-8 -*- | |||
| # This file is part of MegEngine, a deep learning framework developed by | |||
| # Megvii. | |||
| # | |||
| # Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| import threading | |||
| import numpy as np | |||
| from .base import * | |||
| from .struct import * | |||
| from .tensor import * | |||
| class TensorBatchCollector: | |||
| """ | |||
| a tensor utility that collects sub-tensors into one contiguous batch tensor | |||
| """ | |||
| def __init__( | |||
| self, | |||
| shape, | |||
| dtype=LiteDataType.LITE_INT8, | |||
| device_type=LiteDeviceType.LITE_CUDA, | |||
| device_id=0, | |||
| is_pinned_host=False, | |||
| tensor=None, | |||
| ): | |||
| self._mutex = threading.Lock() | |||
| self.dev_type = device_type | |||
| self.is_pinned_host = is_pinned_host | |||
| self.dev_id = device_id | |||
| self.shape = shape | |||
| self.dtype = LiteLayout(dtype=dtype).data_type | |||
| self._free_list = list(range(self.shape[0])) | |||
| if tensor is not None: | |||
| assert ( | |||
| tensor.layout.shapes[0 : tensor.layout.ndim] == shape | |||
| ), "The tensor set to TensorBatchCollector is not right." | |||
| self._tensor = tensor | |||
| self.dtype = tensor.layout.data_type | |||
| self.dev_type = tensor.device_type | |||
| self.dev_id = tensor.device_id | |||
| else: | |||
| self._tensor = LiteTensor( | |||
| LiteLayout(shape, dtype), device_type, device_id, is_pinned_host | |||
| ) | |||
| def collect_id(self, array, batch_id): | |||
| if isinstance(array, np.ndarray): | |||
| shape = array.shape | |||
| assert list(shape) == self.shape[1:] | |||
| in_dtype = ctype_to_lite_dtypes[np.ctypeslib.as_ctypes_type(array.dtype)] | |||
| assert in_dtype == self.dtype | |||
| # get the batch index | |||
| with self._mutex: | |||
| if batch_id in self._free_list: | |||
| self._free_list.remove(batch_id) | |||
| # get the subtensor | |||
| subtensor = self._tensor.slice([batch_id], [batch_id + 1]) | |||
| if subtensor.device_type == LiteDeviceType.LITE_CPU: | |||
| subtensor.set_data_by_copy(array) | |||
| else: | |||
| pinned_tensor = LiteTensor( | |||
| subtensor.layout, self.dev_type, self.dev_id, True | |||
| ) | |||
| pinned_tensor.set_data_by_share(array) | |||
| subtensor.copy_from(pinned_tensor) | |||
| else: | |||
| assert isinstance(array, LiteTensor) | |||
| ndim = array.layout.ndim | |||
| shape = list(array.layout.shapes)[0:ndim] | |||
| assert list(shape) == self.shape[1:] | |||
| in_dtype = array.layout.data_type | |||
| assert in_dtype == self.dtype | |||
| # get the batch index | |||
| with self._mutex: | |||
| if batch_id in self._free_list: | |||
| self._free_list.remove(batch_id) | |||
| # get the subtensor | |||
| subtensor = self._tensor.slice([batch_id], [batch_id + 1]) | |||
| subtensor.copy_from(array) | |||
| return batch_id | |||
| def collect(self, array): | |||
| with self._mutex: | |||
| if len(self._free_list) == 0: | |||
| return -1 | |||
| idx = self._free_list.pop(0) | |||
| return self.collect_id(array, idx) | |||
| def collect_by_ctypes(self, data, length): | |||
| """ | |||
| collect with ctypes data input | |||
| """ | |||
| with self._mutex: | |||
| if len(self._free_list) == 0: | |||
| return -1 | |||
| idx = self._free_list.pop(0) | |||
| # get the subtensor | |||
| subtensor = self._tensor.slice([idx], [idx + 1]) | |||
| if subtensor.device_type == LiteDeviceType.LITE_CPU: | |||
| subtensor.set_data_by_copy(data, length) | |||
| else: | |||
| pinned_tensor = LiteTensor( | |||
| subtensor.layout, self.dev_type, self.dev_id, True | |||
| ) | |||
| pinned_tensor.set_data_by_share(data, length) | |||
| subtensor.copy_from(pinned_tensor) | |||
| def free(self, indexes): | |||
| with self._mutex: | |||
| self._free_list.extend(indexes) | |||
| def get(self): | |||
| return self._tensor | |||
| def to_numpy(self): | |||
| return self._tensor.to_numpy() | |||
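| # Usage sketch (illustrative shapes; device_type is overridden from its | |||
| # LITE_CUDA default so the sketch also works on a CPU-only build): | |||
| #     collector = TensorBatchCollector( | |||
| #         [4, 8, 8], | |||
| #         dtype=LiteDataType.LITE_INT8, | |||
| #         device_type=LiteDeviceType.LITE_CPU, | |||
| #     ) | |||
| #     for batch in range(4): | |||
| #         collector.collect(np.ones([8, 8], "int8") * batch) | |||
| #     whole_batch = collector.to_numpy()  # shape (4, 8, 8) | |||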
| @@ -0,0 +1,199 @@ | |||
| # PyLite | |||
| The Python interface of Lite provides a more convenient and flexible way to run model inference with Lite. It supports running on various platforms: X86-CUDA, X86-CPU, Arm-CPU and Arm-CUDA. | |||
| ## Installation | |||
| ### Install from the whl package | |||
| The whl package of the Lite Python interface is released along with megbrain releases, and its version number stays consistent with megbrain. The released Lite whl packages currently cover the Linux, Windows and macOS platforms and can be installed directly with pip3. | |||
| ```shell | |||
| python3 -m pip install --upgrade pip | |||
| python3 -m pip install megenginelite -i https://pypi.megvii-inc.com/simple | |||
| ``` | |||
| ### Develop install | |||
| In development mode, CMake can be used to build the Lite dynamic library liblite.so/liblite.dll/liblite_shared.dylib, which can then be used for development and debugging. A pylite installed this way can only be used on the local machine and cannot be copied to other machines. | |||
| * Build liblite.so with CMake. | |||
| * Clone the megbrain project to the local machine | |||
| ```shell | |||
| git clone git@git-core.megvii-inc.com:brain-sdk/MegBrain.git | |||
| ``` | |||
| * Build with CMake. The CMake build here is the same as megbrain's CMake build, with exactly the same parameters and macros | |||
| * Build preparation | |||
| ```shell | |||
| cd MegBrain | |||
| sh ./third_party/prepare.sh | |||
| mkdir build | |||
| cd build | |||
| ``` | |||
| * Build the X86-CUDA version | |||
| ```shell | |||
| cmake .. -DMGE_WITH_CUDA=ON -DMGE_WITH_TEST=ON -DCMAKE_BUILD_TYPE=Release && make -j$(nproc) | |||
| ``` | |||
| * Build the X86 CPU-only version | |||
| ```shell | |||
| cmake .. -DMGE_WITH_CUDA=OFF -DMGE_WITH_TEST=ON -DCMAKE_BUILD_TYPE=Release && make -j$(nproc) | |||
| ``` | |||
| * After the build finishes, liblite.so is located under the lite directory inside the build directory | |||
| * Copy liblite.so into the megenginelite Python source directory, and megenginelite is ready to use. | |||
| ```shell | |||
| # the MegBrain project directory is ${mgb_home} | |||
| cp ${mgb_home}/build/lite/liblite.so ${mgb_home}/lite/pylite/megenginelite/ | |||
| cd ${mgb_home}/lite/pylite | |||
| python3 -m "import megenginelite" | |||
| ``` | |||
| After this, the Python interface of Lite can be developed and debugged under the ${mgb_home}/lite/pylite directory. | |||
| ## Using megenginelite in python3 | |||
| The Python interface of Lite is a wrapper over its C/C++ interface, and they use the same model format. megenginelite provides two main interfaces: LiteTensor and LiteNetwork. | |||
| ### LiteTensor | |||
| LiteTensor provides the interfaces for users to operate on data, including: | |||
| * fill_zero: set the tensor memory to all zeros | |||
| * share_memory_with: share memory with another LiteTensor | |||
| * copy_from: copy data from another LiteTensor into its own memory | |||
| * reshape: change the shape of the LiteTensor while keeping the memory data unchanged | |||
| * slice: slice the data in the LiteTensor; the start, end and step of each dim must be specified | |||
| * set_data_by_share: after this call the memory of the LiteTensor is shared from the memory of the input array; the input array must be a numpy ndarray and the tensor must be on the CPU | |||
| * set_data_by_copy: the LiteTensor copies data from the input data; data can be a list or a numpy ndarray, the amount of data must not exceed the capacity of the tensor, and the tensor must be on the CPU | |||
| * to_numpy: copy the data of the LiteTensor into a numpy array and return it to the user; a non-contiguous LiteTensor, e.g. one produced by slice, is copied into a contiguous numpy array. This interface is mainly for debugging and has performance costs. | |||
| #### Usage examples | |||
| * LiteTensor set-data example | |||
| ```python | |||
| def test_tensor_set_data(): | |||
| layout = LiteLayout([2, 16], "int8") | |||
| tensor = LiteTensor(layout) | |||
| assert tensor.nbytes == 2 * 16 | |||
| data = [i for i in range(32)] | |||
| tensor.set_data_by_copy(data) | |||
| real_data = tensor.to_numpy() | |||
| for i in range(32): | |||
| assert real_data[i // 16][i % 16] == i | |||
| arr = np.ones([2, 16], "int8") | |||
| tensor.set_data_by_copy(arr) | |||
| real_data = tensor.to_numpy() | |||
| for i in range(32): | |||
| assert real_data[i // 16][i % 16] == 1 | |||
| for i in range(32): | |||
| arr[i // 16][i % 16] = i | |||
| tensor.set_data_by_share(arr) | |||
| real_data = tensor.to_numpy() | |||
| for i in range(32): | |||
| assert real_data[i // 16][i % 16] == i | |||
| arr[0][8] = 100 | |||
| arr[1][3] = 20 | |||
| real_data = tensor.to_numpy() | |||
| assert real_data[0][8] == 100 | |||
| assert real_data[1][3] == 20 | |||
| ``` | |||
| * Tensor shared-memory example | |||
| ```python | |||
| def test_tensor_share_memory_with(): | |||
| layout = LiteLayout([4, 32], "int16") | |||
| tensor = LiteTensor(layout) | |||
| assert tensor.nbytes == 4 * 32 * 2 | |||
| arr = np.ones([4, 32], "int16") | |||
| for i in range(128): | |||
| arr[i // 32][i % 32] = i | |||
| tensor.set_data_by_share(arr) | |||
| real_data = tensor.to_numpy() | |||
| for i in range(128): | |||
| assert real_data[i // 32][i % 32] == i | |||
| tensor2 = LiteTensor(layout) | |||
| tensor2.share_memory_with(tensor) | |||
| real_data = tensor.to_numpy() | |||
| real_data2 = tensor2.to_numpy() | |||
| for i in range(128): | |||
| assert real_data[i // 32][i % 32] == i | |||
| assert real_data2[i // 32][i % 32] == i | |||
| arr[1][18] = 5 | |||
| arr[3][7] = 345 | |||
| real_data = tensor2.to_numpy() | |||
| assert real_data[1][18] == 5 | |||
| assert real_data[3][7] == 345 | |||
| ``` | |||
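| * LiteTensor reshape and slice example (a minimal sketch with made-up shapes, exercising the reshape and slice interfaces listed above) | |||
| ```python | |||
| def test_tensor_reshape_slice(): | |||
|     layout = LiteLayout([4, 8], "int32") | |||
|     tensor = LiteTensor(layout) | |||
|     tensor.set_data_by_copy(list(range(32))) | |||
|     # reshape keeps the data and only changes the shape | |||
|     tensor.reshape([2, 16]) | |||
|     assert list(tensor.layout.shapes)[:2] == [2, 16] | |||
|     # slice row 1 (start, end and step are given per dim, end is exclusive) | |||
|     sub = tensor.slice([1, 0], [2, 16]) | |||
|     sub_data = sub.to_numpy() | |||
|     assert sub_data.flatten()[0] == 16 | |||
| ``` | |||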
| For more usage, see test/test_tensor.py in pylite. | |||
| ### LiteNetwork | |||
| LiteNetwork mainly provides model loading and running functionality. For the models it uses, see the model section in the Lite readme. | |||
| * Basic example of loading and running a model on CPU | |||
| ```python | |||
| def test_network_basic(): | |||
| source_dir = os.getenv("LITE_TEST_RESOUCE") | |||
| input_data_path = os.path.join(source_dir, "input_data.npy") | |||
| # read input to input_data | |||
| input_data = np.load(input_data_path) | |||
| model_path = os.path.join(source_dir, "shufflenet.mge") | |||
| network = LiteNetwork() | |||
| network.load(model_path) | |||
| input_name = network.get_input_name(0) | |||
| input_tensor = network.get_io_tensor(input_name) | |||
| output_name = network.get_output_name(0) | |||
| output_tensor = network.get_io_tensor(output_name) | |||
| assert input_tensor.layout.shapes[0] == 1 | |||
| assert input_tensor.layout.shapes[1] == 3 | |||
| assert input_tensor.layout.shapes[2] == 224 | |||
| assert input_tensor.layout.shapes[3] == 224 | |||
| assert input_tensor.layout.data_type == LiteDataType.LITE_FLOAT | |||
| assert input_tensor.layout.ndim == 4 | |||
| # copy input data to input_tensor of the network | |||
| input_tensor.set_data_by_copy(input_data) | |||
| for i in range(3): | |||
| network.forward() | |||
| network.wait() | |||
| output_data = output_tensor.to_numpy() | |||
| print('shufflenet output max={}, sum={}'.format(output_data.max(), output_data.sum())) | |||
| ``` | |||
| * Using device memory as the model input on CUDA; the config and IO information must be set when constructing the network | |||
| ```python | |||
| def test_network_device_IO(): | |||
| source_dir = os.getenv("LITE_TEST_RESOUCE") | |||
| input_data_path = os.path.join(source_dir, "input_data.npy") | |||
| model_path = os.path.join(source_dir, "shufflenet.mge") | |||
| # read input to input_data | |||
| input_data = np.load(input_data_path) | |||
| input_layout = LiteLayout([1, 3, 224, 224]) | |||
| host_input_data = LiteTensor(layout=input_layout) | |||
| host_input_data.set_data_by_share(input_data) | |||
| dev_input_data = LiteTensor(layout=input_layout, device_type=LiteDeviceType.LITE_CUDA) | |||
| dev_input_data.copy_from(host_input_data) | |||
| # construct LiteOption | |||
| options = LiteOptions() | |||
| options.weight_preprocess = 1 | |||
| options.var_sanity_check_first_run = 0 | |||
| net_config = LiteConfig(device_type=LiteDeviceType.LITE_CUDA, option=options) | |||
| # construct LiteIO, is_host=False means the input tensor will use device memory | |||
| input_io = LiteIO("data", is_host=False) | |||
| ios = LiteNetworkIO() | |||
| ios.add_input(input_io) | |||
| network = LiteNetwork(config=net_config, io=ios) | |||
| network.load(model_path) | |||
| input_name = network.get_input_name(0) | |||
| dev_input_tensor = network.get_io_tensor(input_name) | |||
| output_name = network.get_output_name(0) | |||
| output_tensor = network.get_io_tensor(output_name) | |||
| # copy input data to input_tensor of the network | |||
| dev_input_tensor.share_memory_with(dev_input_data) | |||
| for i in range(3): | |||
| network.forward() | |||
| network.wait() | |||
| output_data = output_tensor.to_numpy() | |||
| print('shufflenet output max={}, sum={}'.format(output_data.max(), output_data.sum())) | |||
| ``` | |||
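| * Multi-threaded CPU inference example (a sketch reusing the shufflenet.mge model above; note that threads_number must be set before load) | |||
| ```python | |||
| def test_network_multithread(): | |||
|     source_dir = os.getenv("LITE_TEST_RESOUCE") | |||
|     model_path = os.path.join(source_dir, "shufflenet.mge") | |||
|     input_data = np.load(os.path.join(source_dir, "input_data.npy")) | |||
|     network = LiteNetwork() | |||
|     # the thread number must be configured before the model is loaded | |||
|     network.threads_number = 4 | |||
|     network.load(model_path) | |||
|     input_tensor = network.get_io_tensor(network.get_input_name(0)) | |||
|     output_tensor = network.get_io_tensor(network.get_output_name(0)) | |||
|     input_tensor.set_data_by_copy(input_data) | |||
|     network.forward() | |||
|     network.wait() | |||
|     print('shufflenet output sum={}'.format(output_tensor.to_numpy().sum())) | |||
| ``` | |||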
| For more usage, see test/test_network.py and test/test_network_cuda.py in pylite. | |||
| @@ -0,0 +1 @@ | |||
| numpy>=1.18 | |||
| @@ -0,0 +1,20 @@ | |||
| #!/usr/bin/env bash | |||
| set -e | |||
| cd $(dirname $0)/.. | |||
| ISORT_ARG="" | |||
| BLACK_ARG="" | |||
| while getopts 'd' OPT; do | |||
| case $OPT in | |||
| d) | |||
| ISORT_ARG="--diff --check-only" | |||
| BLACK_ARG="--diff --check" | |||
| ;; | |||
| ?) | |||
| echo "Usage: `basename $0` [-d]" | |||
| esac | |||
| done | |||
| isort $ISORT_ARG -j $(nproc) -rc megenginelite test | |||
| black $BLACK_ARG --target-version=py35 -- megenginelite test | |||
| @@ -0,0 +1,127 @@ | |||
| # -*- coding: utf-8 -*- | |||
| # This file is part of MegEngine, a deep learning framework developed by | |||
| # Megvii. | |||
| # | |||
| # Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| import os | |||
| import re | |||
| import pathlib | |||
| import platform | |||
| from distutils.file_util import copy_file | |||
| from setuptools import setup, find_packages, Extension | |||
| from setuptools.command.build_ext import build_ext as _build_ext | |||
| class PrecompiledExtesion(Extension): | |||
| def __init__(self, name): | |||
| super().__init__(name, sources=[]) | |||
| class build_ext(_build_ext): | |||
| def build_extension(self, ext): | |||
| if not isinstance(ext, PrecompiledExtesion): | |||
| return super().build_extension(ext) | |||
| if not self.inplace: | |||
| fullpath = self.get_ext_fullpath(ext.name) | |||
| extdir = pathlib.Path(fullpath) | |||
| extdir.parent.mkdir(parents=True, exist_ok=True) | |||
| modpath = self.get_ext_fullname(ext.name).split('.') | |||
| if platform.system() == 'Windows': | |||
| modpath[-1] += '.dll' | |||
| elif platform.system() == 'Darwin': | |||
| modpath[-1] += '.dylib' | |||
| else: | |||
| modpath[-1] += '.so' | |||
| modpath = str(pathlib.Path(*modpath).resolve()) | |||
| copy_file(modpath, fullpath, verbose=self.verbose, dry_run=self.dry_run) | |||
| v = {} | |||
| with open("megenginelite/version.py") as fp: | |||
| exec(fp.read(), v) | |||
| __version__ = v['__version__'] | |||
| email = 'megengine@megvii.com' | |||
| # https://www.python.org/dev/peps/pep-0440 | |||
| # Public version identifiers: [N!]N(.N)*[{a|b|rc}N][.postN][.devN] | |||
| # Local version identifiers: <public version identifier>[+<local version label>] | |||
| # PUBLIC_VERSION_POSTFIX use to handle rc or dev info | |||
| public_version_postfix = os.environ.get('PUBLIC_VERSION_POSTFIX') | |||
| if public_version_postfix: | |||
| __version__ = '{}{}'.format(__version__, public_version_postfix) | |||
| local_version = [] | |||
| strip_sdk_info = os.environ.get('STRIP_SDK_INFO', 'False').lower() | |||
| sdk_name = os.environ.get('SDK_NAME', 'cpu') | |||
| if 'true' == strip_sdk_info: | |||
| print('wheel version strip sdk info') | |||
| else: | |||
| local_version.append(sdk_name) | |||
| local_postfix = os.environ.get('LOCAL_VERSION') | |||
| if local_postfix: | |||
| local_version.append(local_postfix) | |||
| if len(local_version): | |||
| __version__ = '{}+{}'.format(__version__, '.'.join(local_version)) | |||
| packages = find_packages() | |||
| megenginelite_data = [ | |||
| str(f.relative_to('megenginelite')) | |||
| for f in pathlib.Path('megenginelite').glob('**/*') | |||
| ] | |||
| if platform.system() == 'Windows': | |||
| megenginelite_data.remove('libs\\liblite_shared.dll') | |||
| elif platform.system() == 'Darwin': | |||
| megenginelite_data.remove('libs/liblite_shared.dylib') | |||
| else: | |||
| megenginelite_data.remove('libs/liblite_shared.so') | |||
| with open('requires.txt') as f: | |||
| requires = f.read().splitlines() | |||
| prebuild_modules=[PrecompiledExtesion('megenginelite.libs.liblite_shared')] | |||
| setup_kwargs = dict( | |||
| name=package_name, | |||
| version=__version__, | |||
| description='Inference Framework for MegEngine', | |||
| author='Megvii Engine Team', | |||
| author_email=email, | |||
| packages=packages, | |||
| package_data={ | |||
| 'megenginelite': megenginelite_data, | |||
| }, | |||
| ext_modules=prebuild_modules, | |||
| install_requires=requires, | |||
| cmdclass={'build_ext': build_ext}, | |||
| ) | |||
| setup_kwargs.update(dict( | |||
| classifiers=[ | |||
| 'Development Status :: 3 - Alpha', | |||
| 'Intended Audience :: Developers', | |||
| 'Intended Audience :: Education', | |||
| 'Intended Audience :: Science/Research', | |||
| 'License :: OSI Approved :: Apache Software License', | |||
| 'Programming Language :: C++', | |||
| 'Programming Language :: Python :: 3', | |||
| 'Programming Language :: Python :: 3.5', | |||
| 'Programming Language :: Python :: 3.6', | |||
| 'Programming Language :: Python :: 3.7', | |||
| 'Programming Language :: Python :: 3.8', | |||
| 'Topic :: Scientific/Engineering', | |||
| 'Topic :: Scientific/Engineering :: Mathematics', | |||
| 'Topic :: Scientific/Engineering :: Artificial Intelligence', | |||
| 'Topic :: Software Development', | |||
| 'Topic :: Software Development :: Libraries', | |||
| 'Topic :: Software Development :: Libraries :: Python Modules', | |||
| ], | |||
| license='Apache 2.0', | |||
| keywords='megengine deep learning', | |||
| data_files = [("megengine", [ | |||
| "../LICENSE", | |||
| "../ACKNOWLEDGMENTS", | |||
| ])] | |||
| )) | |||
| setup(**setup_kwargs) | |||
| @@ -0,0 +1,92 @@ | |||
| # -*- coding: utf-8 -*- | |||
| # This file is part of MegEngine, a deep learning framework developed by | |||
| # Megvii. | |||
| # | |||
| # Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| import os | |||
| import unittest | |||
| import numpy as np | |||
| from megenginelite import * | |||
| set_log_level(2) | |||
| class TestShuffleNet(unittest.TestCase): | |||
| source_dir = os.getenv("LITE_TEST_RESOUCE") | |||
| input_data_path = os.path.join(source_dir, "input_data.npy") | |||
| correct_data_path = os.path.join(source_dir, "output_data.npy") | |||
| correct_data = np.load(correct_data_path).flatten() | |||
| input_data = np.load(input_data_path) | |||
| def check_correct(self, out_data, error=1e-4): | |||
| out_data = out_data.flatten() | |||
| assert np.isfinite(out_data.sum()) | |||
| assert self.correct_data.size == out_data.size | |||
| for i in range(out_data.size): | |||
| assert abs(out_data[i] - self.correct_data[i]) < error | |||
| def do_forward(self, network, times=3): | |||
| input_name = network.get_input_name(0) | |||
| input_tensor = network.get_io_tensor(input_name) | |||
| output_name = network.get_output_name(0) | |||
| output_tensor = network.get_io_tensor(output_name) | |||
| input_tensor.set_data_by_copy(self.input_data) | |||
| for i in range(times): | |||
| network.forward() | |||
| network.wait() | |||
| output_data = output_tensor.to_numpy() | |||
| self.check_correct(output_data) | |||
| class TestGlobal(TestShuffleNet): | |||
| def test_device_count(self): | |||
| LiteGlobal.try_coalesce_all_free_memory() | |||
| count = LiteGlobal.get_device_count(LiteDeviceType.LITE_CPU) | |||
| assert count > 0 | |||
| def test_register_decryption_method(self): | |||
| @decryption_func | |||
| def function(in_arr, key_arr, out_arr): | |||
| if not out_arr: | |||
| return in_arr.size | |||
| else: | |||
| for i in range(in_arr.size): | |||
| out_arr[i] = in_arr[i] ^ key_arr[0] ^ key_arr[0] | |||
| return out_arr.size | |||
| LiteGlobal.register_decryption_and_key("just_for_test", function, [15]) | |||
| config = LiteConfig() | |||
| config.bare_model_cryption_name = "just_for_test".encode("utf-8") | |||
| network = LiteNetwork() | |||
| model_path = os.path.join(self.source_dir, "shufflenet.mge") | |||
| network.load(model_path) | |||
| self.do_forward(network) | |||
| def test_update_decryption_key(self): | |||
| wrong_key = [0] * 32 | |||
| LiteGlobal.update_decryption_key("AES_default", wrong_key) | |||
| with self.assertRaises(RuntimeError): | |||
| config = LiteConfig() | |||
| config.bare_model_cryption_name = "AES_default".encode("utf-8") | |||
| network = LiteNetwork(config) | |||
| model_path = os.path.join(self.source_dir, "shufflenet_crypt_aes.mge") | |||
| network.load(model_path) | |||
| right_key = [i for i in range(32)] | |||
| LiteGlobal.update_decryption_key("AES_default", right_key) | |||
| config = LiteConfig() | |||
| config.bare_model_cryption_name = "AES_default".encode("utf-8") | |||
| network = LiteNetwork(config) | |||
| model_path = os.path.join(self.source_dir, "shufflenet_crypt_aes.mge") | |||
| network.load(model_path) | |||
| self.do_forward(network) | |||
| @@ -0,0 +1,405 @@ | |||
| # -*- coding: utf-8 -*- | |||
| # This file is part of MegEngine, a deep learning framework developed by | |||
| # Megvii. | |||
| # | |||
| # Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| import os | |||
| import unittest | |||
| import numpy as np | |||
| from megenginelite import * | |||
| set_log_level(2) | |||
| def test_version(): | |||
| print("Lite verson: {}".format(version)) | |||
| def test_network_io(): | |||
| input_io1 = LiteIO("data1", is_host=False, io_type=LiteIOType.LITE_IO_VALUE) | |||
| input_io2 = LiteIO( | |||
| "data2", | |||
| is_host=True, | |||
| io_type=LiteIOType.LITE_IO_SHAPE, | |||
| layout=LiteLayout([2, 4, 4]), | |||
| ) | |||
| io = LiteNetworkIO() | |||
| io.add_input(input_io1) | |||
| io.add_input(input_io2) | |||
| output_io1 = LiteIO("out1", is_host=False) | |||
| output_io2 = LiteIO("out2", is_host=True, layout=LiteLayout([1, 1000])) | |||
| io.add_output(output_io1) | |||
| io.add_output(output_io2) | |||
| assert len(io.inputs) == 2 | |||
| assert len(io.outputs) == 2 | |||
| assert io.inputs[0] == input_io1 | |||
| assert io.outputs[0] == output_io1 | |||
| c_io = io._create_network_io() | |||
| assert c_io.input_size == 2 | |||
| assert c_io.output_size == 2 | |||
| class TestShuffleNet(unittest.TestCase): | |||
| source_dir = os.getenv("LITE_TEST_RESOUCE") | |||
| input_data_path = os.path.join(source_dir, "input_data.npy") | |||
| correct_data_path = os.path.join(source_dir, "output_data.npy") | |||
| model_path = os.path.join(source_dir, "shufflenet.mge") | |||
| correct_data = np.load(correct_data_path).flatten() | |||
| input_data = np.load(input_data_path) | |||
| def check_correct(self, out_data, error=1e-4): | |||
| out_data = out_data.flatten() | |||
| assert np.isfinite(out_data.sum()) | |||
| assert self.correct_data.size == out_data.size | |||
| for i in range(out_data.size): | |||
| assert abs(out_data[i] - self.correct_data[i]) < error | |||
| def do_forward(self, network, times=3): | |||
| input_name = network.get_input_name(0) | |||
| input_tensor = network.get_io_tensor(input_name) | |||
| output_name = network.get_output_name(0) | |||
| output_tensor = network.get_io_tensor(output_name) | |||
| input_tensor.set_data_by_copy(self.input_data) | |||
| for i in range(times): | |||
| network.forward() | |||
| network.wait() | |||
| output_data = output_tensor.to_numpy() | |||
| self.check_correct(output_data) | |||
| class TestNetwork(TestShuffleNet): | |||
| def test_decryption(self): | |||
| model_path = os.path.join(self.source_dir, "shufflenet_crypt_aes.mge") | |||
| config = LiteConfig() | |||
| config.bare_model_cryption_name = "AES_default".encode("utf-8") | |||
| network = LiteNetwork(config) | |||
| network.load(model_path) | |||
| self.do_forward(network) | |||
| def test_pack_model(self): | |||
| model_path = os.path.join(self.source_dir, "test_packed_model_rc4.lite") | |||
| network = LiteNetwork() | |||
| network.load(model_path) | |||
| self.do_forward(network) | |||
| def test_network_basic(self): | |||
| network = LiteNetwork() | |||
| network.load(self.model_path) | |||
| input_name = network.get_input_name(0) | |||
| input_tensor = network.get_io_tensor(input_name) | |||
| output_name = network.get_output_name(0) | |||
| output_tensor = network.get_io_tensor(output_name) | |||
| assert input_tensor.layout.shapes[0] == 1 | |||
| assert input_tensor.layout.shapes[1] == 3 | |||
| assert input_tensor.layout.shapes[2] == 224 | |||
| assert input_tensor.layout.shapes[3] == 224 | |||
| assert input_tensor.layout.data_type == LiteDataType.LITE_FLOAT | |||
| assert input_tensor.layout.ndim == 4 | |||
| self.do_forward(network) | |||
| def test_network_shared_data(self): | |||
| network = LiteNetwork() | |||
| network.load(self.model_path) | |||
| input_name = network.get_input_name(0) | |||
| input_tensor = network.get_io_tensor(input_name) | |||
| output_name = network.get_output_name(0) | |||
| output_tensor = network.get_io_tensor(output_name) | |||
| input_tensor.set_data_by_share(self.input_data) | |||
| for i in range(3): | |||
| network.forward() | |||
| network.wait() | |||
| output_data = output_tensor.to_numpy() | |||
| self.check_correct(output_data) | |||
| def test_network_get_name(self): | |||
| network = LiteNetwork() | |||
| network.load(self.model_path) | |||
| input_names = network.get_all_input_name() | |||
| assert input_names[0] == "data" | |||
| output_names = network.get_all_output_name() | |||
| assert output_names[0] == network.get_output_name(0) | |||
| self.do_forward(network) | |||
| def test_network_set_device_id(self): | |||
| network = LiteNetwork() | |||
| assert network.device_id == 0 | |||
| network.device_id = 1 | |||
| network.load(self.model_path) | |||
| assert network.device_id == 1 | |||
| with self.assertRaises(RuntimeError): | |||
| network.device_id = 1 | |||
| self.do_forward(network) | |||
| def test_network_set_stream_id(self): | |||
| network = LiteNetwork() | |||
| assert network.stream_id == 0 | |||
| network.stream_id = 1 | |||
| network.load(self.model_path) | |||
| assert network.stream_id == 1 | |||
| with self.assertRaises(RuntimeError): | |||
| network.stream_id = 1 | |||
| self.do_forward(network) | |||
| def test_network_set_thread_number(self): | |||
| network = LiteNetwork() | |||
| assert network.threads_number == 1 | |||
| network.threads_number = 2 | |||
| network.load(self.model_path) | |||
| assert network.threads_number == 2 | |||
| with self.assertRaises(RuntimeError): | |||
| network.threads_number = 2 | |||
| self.do_forward(network) | |||
| def test_network_cpu_inplace(self): | |||
| network = LiteNetwork() | |||
| assert network.is_cpu_inplace_mode() == False | |||
| network.enable_cpu_inplace_mode() | |||
| network.load(self.model_path) | |||
| assert network.is_cpu_inplace_mode() == True | |||
| with self.assertRaises(RuntimeError): | |||
| network.enable_cpu_inplace_mode() | |||
| self.do_forward(network) | |||
| def test_network_option(self): | |||
| option = LiteOptions() | |||
| option.weight_preprocess = 1 | |||
| option.var_sanity_check_first_run = 0 | |||
| config = LiteConfig(option=option) | |||
| network = LiteNetwork(config=config) | |||
| network.load(self.model_path) | |||
| self.do_forward(network) | |||
| def test_network_reset_io(self): | |||
| option = LiteOptions() | |||
| option.var_sanity_check_first_run = 0 | |||
| config = LiteConfig(option=option) | |||
| input_io = LiteIO("data") | |||
| ios = LiteNetworkIO() | |||
| ios.add_input(input_io) | |||
| network = LiteNetwork(config=config, io=ios) | |||
| network.load(self.model_path) | |||
| input_tensor = network.get_io_tensor("data") | |||
| assert input_tensor.device_type == LiteDeviceType.LITE_CPU | |||
| self.do_forward(network) | |||
| def test_network_by_share(self): | |||
| network = LiteNetwork() | |||
| network.load(self.model_path) | |||
| input_name = network.get_input_name(0) | |||
| input_tensor = network.get_io_tensor(input_name) | |||
| output_name = network.get_output_name(0) | |||
| output_tensor = network.get_io_tensor(output_name) | |||
| assert input_tensor.device_type == LiteDeviceType.LITE_CPU | |||
| layout = LiteLayout(self.input_data.shape, self.input_data.dtype) | |||
| tensor_tmp = LiteTensor(layout=layout) | |||
| tensor_tmp.set_data_by_share(self.input_data) | |||
| input_tensor.share_memory_with(tensor_tmp) | |||
| for i in range(3): | |||
| network.forward() | |||
| network.wait() | |||
| output_data = output_tensor.to_numpy() | |||
| self.check_correct(output_data) | |||
| def test_network_share_weights(self): | |||
| option = LiteOptions() | |||
| option.var_sanity_check_first_run = 0 | |||
| config = LiteConfig(option=option) | |||
| src_network = LiteNetwork(config=config) | |||
| src_network.load(self.model_path) | |||
| new_network = LiteNetwork() | |||
| new_network.enable_cpu_inplace_mode() | |||
| new_network.share_weights_with(src_network) | |||
| self.do_forward(src_network) | |||
| self.do_forward(new_network) | |||
| def test_network_share_runtime_memory(self): | |||
| option = LiteOptions() | |||
| option.var_sanity_check_first_run = 0 | |||
| config = LiteConfig(option=option) | |||
| src_network = LiteNetwork(config=config) | |||
| src_network.load(self.model_path) | |||
| new_network = LiteNetwork() | |||
| new_network.enable_cpu_inplace_mode() | |||
| new_network.share_runtime_memroy(src_network) | |||
| new_network.load(self.model_path) | |||
| self.do_forward(src_network) | |||
| self.do_forward(new_network) | |||
| # def test_network_async(self): | |||
| # count = 0 | |||
| # finished = False | |||
| # | |||
| # def async_callback(): | |||
| # nonlocal finished | |||
| # finished = True | |||
| # return 0 | |||
| # | |||
| # option = LiteOptions() | |||
| # option.var_sanity_check_first_run = 0 | |||
| # config = LiteConfig(option=option) | |||
| # | |||
| # network = LiteNetwork(config=config) | |||
| # network.load(self.model_path) | |||
| # | |||
| # network.async_with_callback(async_callback) | |||
| # | |||
| # input_tensor = network.get_io_tensor(network.get_input_name(0)) | |||
| # output_tensor = network.get_io_tensor(network.get_output_name(0)) | |||
| # | |||
| # input_tensor.set_data_by_share(self.input_data) | |||
| # network.forward() | |||
| # | |||
| # while not finished: | |||
| # count += 1 | |||
| # | |||
| # assert count > 0 | |||
| # output_data = output_tensor.to_numpy() | |||
| # self.check_correct(output_data) | |||
| # | |||
| # def test_network_start_callback(self): | |||
| # network = LiteNetwork() | |||
| # network.load(self.model_path) | |||
| # start_checked = False | |||
| # | |||
| # @start_finish_callback | |||
| # def start_callback(ios): | |||
| # nonlocal start_checked | |||
| # start_checked = True | |||
| # assert len(ios) == 1 | |||
| # for key in ios: | |||
| # io = key | |||
| # data = ios[key].to_numpy().flatten() | |||
| # input_data = self.input_data.flatten() | |||
| # assert data.size == input_data.size | |||
| # assert io.name.decode("utf-8") == "data" | |||
| # for i in range(data.size): | |||
| # assert data[i] == input_data[i] | |||
| # return 0 | |||
| # | |||
| # network.set_start_callback(start_callback) | |||
| # self.do_forward(network, 1) | |||
| # assert start_checked == True | |||
| # | |||
| # def test_network_finish_callback(self): | |||
| # network = LiteNetwork() | |||
| # network.load(self.model_path) | |||
| # finish_checked = False | |||
| # | |||
| # @start_finish_callback | |||
| # def finish_callback(ios): | |||
| # nonlocal finish_checked | |||
| # finish_checked = True | |||
| # assert len(ios) == 1 | |||
| # for key in ios: | |||
| # io = key | |||
| # data = ios[key].to_numpy().flatten() | |||
| # output_data = self.correct_data.flatten() | |||
| # assert data.size == output_data.size | |||
| # for i in range(data.size): | |||
| # assert data[i] == output_data[i] | |||
| # return 0 | |||
| # | |||
| # network.set_finish_callback(finish_callback) | |||
| # self.do_forward(network, 1) | |||
| # assert finish_checked == True | |||
| def test_enable_profile(self): | |||
| network = LiteNetwork() | |||
| network.load(self.model_path) | |||
| network.enable_profile_performance("./profile.json") | |||
| self.do_forward(network) | |||
| fi = open("./profile.json", "r") | |||
| fi.close() | |||
| os.remove("./profile.json") | |||
| def test_io_txt_dump(self): | |||
| network = LiteNetwork() | |||
| network.load(self.model_path) | |||
| network.io_txt_dump("./io_txt.txt") | |||
| self.do_forward(network) | |||
| def test_io_bin_dump(self): | |||
| import shutil | |||
| folder = "./out" | |||
| network = LiteNetwork() | |||
| network.load(self.model_path) | |||
| if not os.path.exists(folder): | |||
| os.mkdir(folder) | |||
| network.io_bin_dump(folder) | |||
| self.do_forward(network) | |||
| shutil.rmtree(folder) | |||
| def test_algo_workspace_limit(self): | |||
| network = LiteNetwork() | |||
| network.load(self.model_path) | |||
| print("modify the workspace limit.") | |||
| network.set_network_algo_workspace_limit(10000) | |||
| self.do_forward(network) | |||
| def test_network_algo_policy(self): | |||
| network = LiteNetwork() | |||
| network.load(self.model_path) | |||
| network.set_network_algo_policy( | |||
| LiteAlgoSelectStrategy.LITE_ALGO_PROFILE | |||
| | LiteAlgoSelectStrategy.LITE_ALGO_REPRODUCIBLE | |||
| ) | |||
| self.do_forward(network) | |||
| def test_network_algo_policy_ignore_batch(self): | |||
| network = LiteNetwork() | |||
| network.load(self.model_path) | |||
| network.set_network_algo_policy( | |||
| LiteAlgoSelectStrategy.LITE_ALGO_PROFILE, | |||
| shared_batch_size=1, | |||
| binary_equal_between_batch=True, | |||
| ) | |||
| self.do_forward(network) | |||
| @@ -0,0 +1,220 @@ | |||
| # -*- coding: utf-8 -*- | |||
| # This file is part of MegEngine, a deep learning framework developed by | |||
| # Megvii. | |||
| # | |||
| # Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| import functools | |||
| import os | |||
| import unittest | |||
| import numpy as np | |||
| from megenginelite import * | |||
| set_log_level(2) | |||
| def require_cuda(ngpu=1): | |||
| """a decorator that disables a testcase if cuda is not enabled""" | |||
| def dector(func): | |||
| @functools.wraps(func) | |||
| def wrapped(*args, **kwargs): | |||
| if LiteGlobal.get_device_count(LiteDeviceType.LITE_CUDA) >= ngpu: | |||
| return func(*args, **kwargs) | |||
| return wrapped | |||
| return dector | |||
| class TestShuffleNetCuda(unittest.TestCase): | |||
| source_dir = os.getenv("LITE_TEST_RESOUCE") | |||
| input_data_path = os.path.join(source_dir, "input_data.npy") | |||
| correct_data_path = os.path.join(source_dir, "output_data.npy") | |||
| model_path = os.path.join(source_dir, "shufflenet.mge") | |||
| correct_data = np.load(correct_data_path).flatten() | |||
| input_data = np.load(input_data_path) | |||
| def check_correct(self, out_data, error=1e-4): | |||
| out_data = out_data.flatten() | |||
| assert np.isfinite(out_data.sum()) | |||
| assert self.correct_data.size == out_data.size | |||
| for i in range(out_data.size): | |||
| assert abs(out_data[i] - self.correct_data[i]) < error | |||
| def do_forward(self, network, times=3): | |||
| input_name = network.get_input_name(0) | |||
| input_tensor = network.get_io_tensor(input_name) | |||
| output_name = network.get_output_name(0) | |||
| output_tensor = network.get_io_tensor(output_name) | |||
| input_tensor.set_data_by_copy(self.input_data) | |||
| for i in range(times): | |||
| network.forward() | |||
| network.wait() | |||
| output_data = output_tensor.to_numpy() | |||
| self.check_correct(output_data) | |||
| class TestNetwork(TestShuffleNetCuda): | |||
| @require_cuda() | |||
| def test_network_basic(self): | |||
| config = LiteConfig() | |||
| config.device_type = LiteDeviceType.LITE_CUDA | |||
| network = LiteNetwork(config) | |||
| network.load(self.model_path) | |||
| input_name = network.get_input_name(0) | |||
| input_tensor = network.get_io_tensor(input_name) | |||
| output_name = network.get_output_name(0) | |||
| output_tensor = network.get_io_tensor(output_name) | |||
| assert input_tensor.layout.shapes[0] == 1 | |||
| assert input_tensor.layout.shapes[1] == 3 | |||
| assert input_tensor.layout.shapes[2] == 224 | |||
| assert input_tensor.layout.shapes[3] == 224 | |||
| assert input_tensor.layout.data_type == LiteDataType.LITE_FLOAT | |||
| assert input_tensor.layout.ndim == 4 | |||
| self.do_forward(network) | |||
| @require_cuda() | |||
| def test_network_shared_data(self): | |||
| config = LiteConfig() | |||
| config.device_type = LiteDeviceType.LITE_CUDA | |||
| network = LiteNetwork(config) | |||
| network.load(self.model_path) | |||
| input_name = network.get_input_name(0) | |||
| input_tensor = network.get_io_tensor(input_name) | |||
| output_name = network.get_output_name(0) | |||
| output_tensor = network.get_io_tensor(output_name) | |||
| input_tensor.set_data_by_share(self.input_data) | |||
| for i in range(3): | |||
| network.forward() | |||
| network.wait() | |||
| output_data = output_tensor.to_numpy() | |||
| self.check_correct(output_data) | |||
| @require_cuda(2) | |||
| def test_network_set_device_id(self): | |||
| config = LiteConfig() | |||
| config.device_type = LiteDeviceType.LITE_CUDA | |||
| network = LiteNetwork(config) | |||
| assert network.device_id == 0 | |||
| network.device_id = 1 | |||
| network.load(self.model_path) | |||
| assert network.device_id == 1 | |||
| with self.assertRaises(RuntimeError): | |||
| network.device_id = 1 | |||
| self.do_forward(network) | |||
| @require_cuda() | |||
| def test_network_option(self): | |||
| option = LiteOptions() | |||
| option.weight_preprocess = 1 | |||
| option.var_sanity_check_first_run = 0 | |||
| config = LiteConfig(option=option) | |||
| config.device_type = LiteDeviceType.LITE_CUDA | |||
| network = LiteNetwork(config=config) | |||
| network.load(self.model_path) | |||
| self.do_forward(network) | |||
| @require_cuda() | |||
| def test_network_reset_io(self): | |||
| option = LiteOptions() | |||
| option.var_sanity_check_first_run = 0 | |||
| config = LiteConfig(option=option) | |||
| config.device_type = LiteDeviceType.LITE_CUDA | |||
| input_io = LiteIO("data") | |||
| ios = LiteNetworkIO() | |||
| ios.add_input(input_io) | |||
| network = LiteNetwork(config=config, io=ios) | |||
| network.load(self.model_path) | |||
| input_tensor = network.get_io_tensor("data") | |||
| assert input_tensor.device_type == LiteDeviceType.LITE_CPU | |||
| self.do_forward(network) | |||
| @require_cuda() | |||
| def test_network_share_weights(self): | |||
| option = LiteOptions() | |||
| option.var_sanity_check_first_run = 0 | |||
| config = LiteConfig(option=option) | |||
| config.device_type = LiteDeviceType.LITE_CUDA | |||
| src_network = LiteNetwork(config=config) | |||
| src_network.load(self.model_path) | |||
| new_network = LiteNetwork() | |||
| new_network.enable_cpu_inplace_mode() | |||
| new_network.share_weights_with(src_network) | |||
| self.do_forward(src_network) | |||
| self.do_forward(new_network) | |||
| @require_cuda() | |||
| def test_network_share_runtime_memory(self): | |||
| option = LiteOptions() | |||
| option.var_sanity_check_first_run = 0 | |||
| config = LiteConfig(option=option) | |||
| config.device_type = LiteDeviceType.LITE_CUDA | |||
| src_network = LiteNetwork(config=config) | |||
| src_network.load(self.model_path) | |||
| new_network = LiteNetwork() | |||
| new_network.enable_cpu_inplace_mode() | |||
| new_network.share_runtime_memroy(src_network) | |||
| new_network.load(self.model_path) | |||
| self.do_forward(src_network) | |||
| self.do_forward(new_network) | |||
| @require_cuda() | |||
| def test_enable_profile(self): | |||
| config = LiteConfig() | |||
| config.device_type = LiteDeviceType.LITE_CUDA | |||
| network = LiteNetwork(config) | |||
| network.load(self.model_path) | |||
| network.enable_profile_performance("./profile.json") | |||
| self.do_forward(network) | |||
| fi = open("./profile.json", "r") | |||
| fi.close() | |||
| os.remove("./profile.json") | |||
| @require_cuda() | |||
| def test_algo_workspace_limit(self): | |||
| config = LiteConfig() | |||
| config.device_type = LiteDeviceType.LITE_CUDA | |||
| network = LiteNetwork(config) | |||
| network.load(self.model_path) | |||
| print("modify the workspace limit.") | |||
| network.set_network_algo_workspace_limit(10000) | |||
| self.do_forward(network) | |||
| @require_cuda() | |||
| def test_network_algo_policy(self): | |||
| config = LiteConfig() | |||
| config.device_type = LiteDeviceType.LITE_CUDA | |||
| network = LiteNetwork(config) | |||
| network.load(self.model_path) | |||
| network.set_network_algo_policy( | |||
| LiteAlgoSelectStrategy.LITE_ALGO_PROFILE | |||
| | LiteAlgoSelectStrategy.LITE_ALGO_REPRODUCIBLE | |||
| ) | |||
| self.do_forward(network) | |||
| @@ -0,0 +1,291 @@ | |||
| # -*- coding: utf-8 -*- | |||
| # This file is part of MegEngine, a deep learning framework developed by | |||
| # Megvii. | |||
| # | |||
| # Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| import functools | |||
| import numpy as np | |||
| from megenginelite import * | |||
| def require_cuda(func): | |||
| """a decorator that disables a testcase if cuda is not enabled""" | |||
| @functools.wraps(func) | |||
| def wrapped(*args, **kwargs): | |||
| if LiteGlobal.get_device_count(LiteDeviceType.LITE_CUDA): | |||
| return func(*args, **kwargs) | |||
| return wrapped | |||
| def test_tensor_make(): | |||
| empty_layout = LiteLayout() | |||
| assert empty_layout.ndim == 0 | |||
| assert empty_layout.data_type == int(LiteDataType.LITE_FLOAT) | |||
| empty_tensor = LiteTensor() | |||
| assert empty_tensor.layout.ndim == empty_layout.ndim | |||
| assert empty_tensor.layout.data_type == empty_layout.data_type | |||
| layout = LiteLayout([4, 16]) | |||
| layout = LiteLayout(dtype="float32") | |||
| layout = LiteLayout([4, 16], "float32") | |||
| layout = LiteLayout([4, 16], "float16") | |||
| layout = LiteLayout([4, 16], np.float32) | |||
| layout = LiteLayout([4, 16], np.int8) | |||
| layout = LiteLayout([4, 16], LiteDataType.LITE_FLOAT) | |||
| tensor = LiteTensor(layout) | |||
| tensor = LiteTensor(layout, LiteDeviceType.LITE_CPU) | |||
| assert tensor.layout == layout | |||
| assert tensor.device_type == LiteDeviceType.LITE_CPU | |||
| assert tensor.is_continue == True | |||
| assert tensor.is_pinned_host == False | |||
| assert tensor.nbytes == 4 * 16 * 4 | |||
| assert tensor.device_id == 0 | |||
| tensor = LiteTensor(layout, device_id=1) | |||
| assert tensor.device_id == 1 | |||
| def test_tensor_set_data(): | |||
| layout = LiteLayout([2, 16], "int8") | |||
| tensor = LiteTensor(layout) | |||
| assert tensor.nbytes == 2 * 16 | |||
| data = [i for i in range(32)] | |||
| tensor.set_data_by_copy(data) | |||
| real_data = tensor.to_numpy() | |||
| for i in range(32): | |||
| assert real_data[i // 16][i % 16] == i | |||
| arr = np.ones([2, 16], "int8") | |||
| tensor.set_data_by_copy(arr) | |||
| real_data = tensor.to_numpy() | |||
| for i in range(32): | |||
| assert real_data[i // 16][i % 16] == 1 | |||
| for i in range(32): | |||
| arr[i // 16][i % 16] = i | |||
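| # set_data_by_share keeps a reference to arr's buffer, so later in-place | |||
| # writes to arr (see arr[0][8] / arr[1][3] below) are visible through the | |||
| # tensor without another copy. | |||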
| tensor.set_data_by_share(arr) | |||
| real_data = tensor.to_numpy() | |||
| for i in range(32): | |||
| assert real_data[i // 16][i % 16] == i | |||
| arr[0][8] = 100 | |||
| arr[1][3] = 20 | |||
| real_data = tensor.to_numpy() | |||
| assert real_data[0][8] == 100 | |||
| assert real_data[1][3] == 20 | |||
| def test_fill_zero(): | |||
| layout = LiteLayout([4, 8], "int16") | |||
| tensor1 = LiteTensor(layout) | |||
| assert tensor1.nbytes == 4 * 8 * 2 | |||
| tensor1.set_data_by_copy([i for i in range(32)]) | |||
| real_data = tensor1.to_numpy() | |||
| for i in range(32): | |||
| assert real_data[i // 8][i % 8] == i | |||
| tensor1.fill_zero() | |||
| real_data = tensor1.to_numpy() | |||
| for i in range(32): | |||
| assert real_data[i // 8][i % 8] == 0 | |||
| def test_copy_from(): | |||
| layout = LiteLayout([4, 8], "int16") | |||
| tensor1 = LiteTensor(layout) | |||
| tensor2 = LiteTensor(layout) | |||
| assert tensor1.nbytes == 4 * 8 * 2 | |||
| assert tensor2.nbytes == 4 * 8 * 2 | |||
| tensor1.set_data_by_copy([i for i in range(32)]) | |||
| tensor2.copy_from(tensor1) | |||
| real_data = tensor2.to_numpy() | |||
| for i in range(32): | |||
| assert real_data[i // 8][i % 8] == i | |||
| tensor1.set_data_by_copy([i + 5 for i in range(32)]) | |||
| tensor2.copy_from(tensor1) | |||
| real_data = tensor2.to_numpy() | |||
| for i in range(32): | |||
| assert real_data[i // 8][i % 8] == i + 5 | |||
| def test_reshape(): | |||
| layout = LiteLayout([4, 8], "int16") | |||
| tensor1 = LiteTensor(layout) | |||
| assert tensor1.nbytes == 4 * 8 * 2 | |||
| tensor1.set_data_by_copy([i for i in range(32)]) | |||
| real_data = tensor1.to_numpy() | |||
| for i in range(32): | |||
| assert real_data[i // 8][i % 8] == i | |||
| tensor1.reshape([8, 4]) | |||
| real_data = tensor1.to_numpy() | |||
| for i in range(32): | |||
| assert real_data[i // 4][i % 4] == i | |||
| def test_slice(): | |||
| layout = LiteLayout([4, 8], "int32") | |||
| tensor1 = LiteTensor(layout) | |||
| assert tensor1.nbytes == 4 * 8 * 4 | |||
| tensor1.set_data_by_copy([i for i in range(32)]) | |||
| real_data_org = tensor1.to_numpy() | |||
| for i in range(32): | |||
| assert real_data_org[i // 8][i % 8] == i | |||
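| # slice() takes per-axis start and (exclusive) end indices: [1, 4] to [3, 8] | |||
| # selects rows 1-2 and columns 4-7, i.e. a non-contiguous 2x4 view of | |||
| # tensor1, as the assertions below check. | |||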
| tensor2 = tensor1.slice([1, 4], [3, 8]) | |||
| assert tensor2.layout.shapes[0] == 2 | |||
| assert tensor2.layout.shapes[1] == 4 | |||
| assert tensor2.is_continue == False | |||
| real_data = tensor2.to_numpy() | |||
| for i in range(8): | |||
| row = i // 4 | |||
| col = i % 4 | |||
| assert real_data[row][col] == real_data_org[row + 1][col + 4] | |||
| def test_tensor_share_memory(): | |||
| layout = LiteLayout([4, 8], "int16") | |||
| tensor1 = LiteTensor(layout) | |||
| tensor2 = LiteTensor(layout) | |||
| assert tensor1.nbytes == 4 * 8 * 2 | |||
| assert tensor2.nbytes == 4 * 8 * 2 | |||
| tensor1.set_data_by_copy([i for i in range(32)]) | |||
| tensor2.share_memory_with(tensor1) | |||
| real_data = tensor2.to_numpy() | |||
| for i in range(32): | |||
| assert real_data[i // 8][i % 8] == i | |||
| tensor1.set_data_by_copy([i + 5 for i in range(32)]) | |||
| real_data = tensor2.to_numpy() | |||
| for i in range(32): | |||
| assert real_data[i // 8][i % 8] == i + 5 | |||
| def test_tensor_share_ctype_memory(): | |||
| layout = LiteLayout([4, 8], "int16") | |||
| tensor1 = LiteTensor(layout) | |||
| assert tensor1.nbytes == 4 * 8 * 2 | |||
| arr = np.ones([4, 8], "int16") | |||
| for i in range(32): | |||
| arr[i // 8][i % 8] = i | |||
| tensor1.set_data_by_share(arr.ctypes.data, 4 * 8 * 2) | |||
| real_data = tensor1.to_numpy() | |||
| for i in range(32): | |||
| assert real_data[i // 8][i % 8] == i | |||
| @require_cuda | |||
| def test_tensor_share_ctype_memory_device(): | |||
| layout = LiteLayout([4, 8], "int16") | |||
| tensor_cpu = LiteTensor( | |||
| layout=layout, device_type=LiteDeviceType.LITE_CUDA, is_pinned_host=True | |||
| ) | |||
| tensor_cuda1 = LiteTensor(layout=layout, device_type=LiteDeviceType.LITE_CUDA) | |||
| tensor_cuda2 = LiteTensor(layout=layout, device_type=LiteDeviceType.LITE_CUDA) | |||
| assert tensor_cpu.nbytes == 4 * 8 * 2 | |||
| assert tensor_cuda1.nbytes == 4 * 8 * 2 | |||
| assert tensor_cuda2.nbytes == 4 * 8 * 2 | |||
| arr = np.ones([4, 8], "int16") | |||
| for i in range(32): | |||
| arr[i // 8][i % 8] = i | |||
| tensor_cpu.set_data_by_share(arr.ctypes.data, 4 * 8 * 2) | |||
| tensor_cuda1.copy_from(tensor_cpu) | |||
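| # get_ctypes_memory() exposes the raw device pointer of tensor_cuda1; | |||
| # sharing it below makes tensor_cuda2 alias the same CUDA buffer instead of | |||
| # taking a copy, so both tensors should read back identical data. | |||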
| device_mem = tensor_cuda1.get_ctypes_memory() | |||
| tensor_cuda2.set_data_by_share(device_mem, tensor_cuda1.nbytes) | |||
| real_data1 = tensor_cuda1.to_numpy() | |||
| real_data2 = tensor_cuda2.to_numpy() | |||
| for i in range(32): | |||
| assert real_data1[i // 8][i % 8] == i | |||
| assert real_data2[i // 8][i % 8] == i | |||
| def test_tensor_share_memory_with(): | |||
| layout = LiteLayout([4, 32], "int16") | |||
| tensor = LiteTensor(layout) | |||
| assert tensor.nbytes == 4 * 32 * 2 | |||
| arr = np.ones([4, 32], "int16") | |||
| for i in range(128): | |||
| arr[i // 32][i % 32] = i | |||
| tensor.set_data_by_share(arr) | |||
| real_data = tensor.to_numpy() | |||
| for i in range(128): | |||
| assert real_data[i // 32][i % 32] == i | |||
| tensor2 = LiteTensor(layout) | |||
| tensor2.share_memory_with(tensor) | |||
| real_data = tensor.to_numpy() | |||
| real_data2 = tensor2.to_numpy() | |||
| for i in range(128): | |||
| assert real_data[i // 32][i % 32] == i | |||
| assert real_data2[i // 32][i % 32] == i | |||
| arr[1][18] = 5 | |||
| arr[3][7] = 345 | |||
| real_data = tensor2.to_numpy() | |||
| assert real_data[1][18] == 5 | |||
| assert real_data[3][7] == 345 | |||
| def test_empty_tensor(): | |||
| empty_tensor = LiteTensor() | |||
| assert empty_tensor.layout.ndim == 0 | |||
| assert empty_tensor.layout.data_type == int(LiteDataType.LITE_FLOAT) | |||
| # converting an empty tensor to numpy should not raise | |||
| data = empty_tensor.to_numpy() | |||
| def test_tensor_by_set_copy_with_new_layout(): | |||
| layout = LiteLayout([4, 32], "int16") | |||
| tensor = LiteTensor(layout) | |||
| assert tensor.nbytes == 4 * 32 * 2 | |||
| arr = np.ones([8, 64], "int32") | |||
| tensor.set_data_by_copy(arr) | |||
| new_layout = tensor.layout | |||
| assert new_layout.ndim == 2 | |||
| assert new_layout.shapes[0] == 8 | |||
| assert new_layout.shapes[1] == 64 | |||
| tensor = LiteTensor(layout) | |||
| tensor.set_data_by_share(arr) | |||
| new_layout = tensor.layout | |||
| assert new_layout.ndim == 2 | |||
| assert new_layout.shapes[0] == 8 | |||
| assert new_layout.shapes[1] == 64 | |||
| def test_tensor_concat(): | |||
| layout = LiteLayout([4, 32], "int16") | |||
| tensors = [] | |||
| arr = np.ones([4, 32], "int16") | |||
| for j in range(4): | |||
| for i in range(128): | |||
| arr[i // 32][i % 32] = j | |||
| tensor = LiteTensor(layout) | |||
| tensor.set_data_by_copy(arr) | |||
| tensors.append(tensor) | |||
| new_tensor = LiteTensorConcat(tensors, 0) | |||
| real_data = new_tensor.to_numpy() | |||
| for j in range(4): | |||
| for i in range(128): | |||
| index = j * 128 + i | |||
| assert real_data[index // 32][index % 32] == j | |||
| @@ -0,0 +1,199 @@ | |||
| # -*- coding: utf-8 -*- | |||
| # This file is part of MegEngine, a deep learning framework developed by | |||
| # Megvii. | |||
| # | |||
| # Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| import functools | |||
| import numpy as np | |||
| from megenginelite import * | |||
| def require_cuda(func): | |||
| """a decorator that disables a testcase if cuda is not enabled""" | |||
| @functools.wraps(func) | |||
| def wrapped(*args, **kwargs): | |||
| if LiteGlobal.get_device_count(LiteDeviceType.LITE_CUDA): | |||
| return func(*args, **kwargs) | |||
| return wrapped | |||
| @require_cuda | |||
| def test_tensor_collect_batch(): | |||
| batch_tensor = TensorBatchCollector( | |||
| [4, 8, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CUDA | |||
| ) | |||
| arr = np.ones([8, 8], "int32") | |||
| for i in range(4): | |||
| batch_tensor.collect(arr) | |||
| arr += 1 | |||
| data = batch_tensor.to_numpy() | |||
| assert data.shape[0] == 4 | |||
| assert data.shape[1] == 8 | |||
| assert data.shape[2] == 8 | |||
| for i in range(4): | |||
| for j in range(64): | |||
| assert data[i][j // 8][j % 8] == i + 1 | |||
| def test_tensor_collect_batch_cpu(): | |||
| batch_tensor = TensorBatchCollector( | |||
| [4, 8, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CPU | |||
| ) | |||
| arr = np.ones([8, 8], "int32") | |||
| for i in range(4): | |||
| batch_tensor.collect(arr) | |||
| arr += 1 | |||
| data = batch_tensor.to_numpy() | |||
| assert data.shape[0] == 4 | |||
| assert data.shape[1] == 8 | |||
| assert data.shape[2] == 8 | |||
| for i in range(4): | |||
| for j in range(64): | |||
| assert data[i][j // 8][j % 8] == i + 1 | |||
| @require_cuda | |||
| def test_tensor_collect_batch_by_index(): | |||
| batch_tensor = TensorBatchCollector( | |||
| [4, 8, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CUDA | |||
| ) | |||
| arr = np.ones([8, 8], "int32") | |||
| arr += 1 # ==2 | |||
| batch_tensor.collect_id(arr, 1) | |||
| arr -= 1 # ==1 | |||
| batch_tensor.collect_id(arr, 0) | |||
| arr += 2 # ==3 | |||
| batch_tensor.collect_id(arr, 2) | |||
| arr += 1 # ==4 | |||
| batch_tensor.collect_id(arr, 3) | |||
| data = batch_tensor.to_numpy() | |||
| assert data.shape[0] == 4 | |||
| assert data.shape[1] == 8 | |||
| assert data.shape[2] == 8 | |||
| for i in range(4): | |||
| for j in range(64): | |||
| assert data[i][j // 8][j % 8] == i + 1 | |||
| @require_cuda | |||
| def test_tensor_collect_batch_tensor(): | |||
| batch_tensor = TensorBatchCollector( | |||
| [4, 6, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CUDA | |||
| ) | |||
| nparr = np.ones([6, 8], "int32") | |||
| tensor = LiteTensor(LiteLayout([6, 8], LiteDataType.LITE_INT)) | |||
| for i in range(4): | |||
| tensor.set_data_by_share(nparr) | |||
| batch_tensor.collect(tensor) | |||
| nparr += 1 | |||
| data = batch_tensor.to_numpy() | |||
| assert data.shape[0] == 4 | |||
| assert data.shape[1] == 6 | |||
| assert data.shape[2] == 8 | |||
| for i in range(4): | |||
| for j in range(48): | |||
| assert data[i][j // 8][j % 8] == i + 1 | |||
| def test_tensor_collect_batch_tensor_cpu(): | |||
| batch_tensor = TensorBatchCollector( | |||
| [4, 6, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CPU | |||
| ) | |||
| nparr = np.ones([6, 8], "int32") | |||
| tensor = LiteTensor(LiteLayout([6, 8], LiteDataType.LITE_INT)) | |||
| for i in range(4): | |||
| tensor.set_data_by_share(nparr) | |||
| batch_tensor.collect(tensor) | |||
| nparr += 1 | |||
| data = batch_tensor.to_numpy() | |||
| assert data.shape[0] == 4 | |||
| assert data.shape[1] == 6 | |||
| assert data.shape[2] == 8 | |||
| for i in range(4): | |||
| for j in range(48): | |||
| assert data[i][j // 8][j % 8] == i + 1 | |||
| @require_cuda | |||
| def test_tensor_collect_batch_ctypes(): | |||
| batch_tensor = TensorBatchCollector( | |||
| [4, 6, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CUDA | |||
| ) | |||
| nparr = np.ones([6, 8], "int32") | |||
| for i in range(4): | |||
| in_data = nparr.ctypes.data | |||
| batch_tensor.collect_by_ctypes(in_data, nparr.nbytes) | |||
| nparr += 1 | |||
| data = batch_tensor.to_numpy() | |||
| assert data.shape[0] == 4 | |||
| assert data.shape[1] == 6 | |||
| assert data.shape[2] == 8 | |||
| for i in range(4): | |||
| for j in range(48): | |||
| assert data[i][j // 8][j % 8] == i + 1 | |||
| def test_tensor_collect_batch_ctypes_cpu(): | |||
| batch_tensor = TensorBatchCollector( | |||
| [4, 6, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CPU | |||
| ) | |||
| nparr = np.ones([6, 8], "int32") | |||
| for i in range(4): | |||
| in_data = nparr.ctypes.data | |||
| batch_tensor.collect_by_ctypes(in_data, nparr.nbytes) | |||
| nparr += 1 | |||
| data = batch_tensor.to_numpy() | |||
| assert data.shape[0] == 4 | |||
| assert data.shape[1] == 6 | |||
| assert data.shape[2] == 8 | |||
| for i in range(4): | |||
| for j in range(48): | |||
| assert data[i][j // 8][j % 8] == i + 1 | |||
| @require_cuda | |||
| def test_tensor_collect_batch_device_tensor(): | |||
| all_tensor = LiteTensor( | |||
| LiteLayout([4, 6, 8], dtype=LiteDataType.LITE_INT), | |||
| device_type=LiteDeviceType.LITE_CUDA, | |||
| ) | |||
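| # Passing a pre-allocated device tensor via `tensor=` makes the collector | |||
| # write each collected batch directly into that CUDA buffer rather than | |||
| # allocating its own storage. | |||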
| batch_tensor = TensorBatchCollector([4, 6, 8], tensor=all_tensor) | |||
| nparr = np.ones([6, 8], "int32") | |||
| tensor = LiteTensor(LiteLayout([6, 8], LiteDataType.LITE_INT)) | |||
| for i in range(4): | |||
| tensor.set_data_by_share(nparr) | |||
| batch_tensor.collect(tensor) | |||
| nparr += 1 | |||
| data = batch_tensor.to_numpy() | |||
| assert data.shape[0] == 4 | |||
| assert data.shape[1] == 6 | |||
| assert data.shape[2] == 8 | |||
| for i in range(4): | |||
| for j in range(48): | |||
| assert data[i][j // 8][j % 8] == i + 1 | |||
| @require_cuda | |||
| def test_tensor_collect_batch_device_numpy(): | |||
| all_tensor = LiteTensor( | |||
| LiteLayout([4, 6, 8], dtype=LiteDataType.LITE_INT), | |||
| device_type=LiteDeviceType.LITE_CUDA, | |||
| ) | |||
| batch_tensor = TensorBatchCollector([4, 6, 8], tensor=all_tensor) | |||
| nparr = np.ones([6, 8], "int32") | |||
| for i in range(4): | |||
| batch_tensor.collect(nparr) | |||
| nparr += 1 | |||
| data = batch_tensor.to_numpy() | |||
| assert data.shape[0] == 4 | |||
| assert data.shape[1] == 6 | |||
| assert data.shape[2] == 8 | |||
| for i in range(4): | |||
| for j in range(48): | |||
| assert data[i][j // 8][j % 8] == i + 1 | |||
| @@ -0,0 +1,53 @@ | |||
| /** | |||
| * \file src/decryption/aes_decrypt.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "./mbedtls/aes.h" | |||
| #include "decrypt_base.h" | |||
| namespace lite { | |||
| class AESDcryption { | |||
| public: | |||
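| //! Assumed layout of an AES-encrypted model (inferred from the code below): | |||
| //! [16-byte IV][AES-256-CBC ciphertext, padded to a 16-byte multiple] | |||
| //! [8-byte big-endian plaintext length]; the trailing length is used to | |||
| //! strip the CBC padding after decryption. | |||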
| static std::vector<uint8_t> decrypt_model(const void* model_mem, | |||
| size_t size, | |||
| const std::vector<uint8_t>& key) { | |||
| mbedtls_aes_context ctx; | |||
| mbedtls_aes_init(&ctx); | |||
| mbedtls_aes_setkey_dec(&ctx, key.data(), 256); | |||
| auto data = static_cast<const uint8_t*>(model_mem); | |||
| //! the first 16 bytes are the IV | |||
| uint8_t iv[16]; | |||
| //! the last 8 bytes store the plaintext length, big-endian | |||
| auto length_ptr = data + size - 8; | |||
| size_t length = 0; | |||
| for (int i = 0; i < 8; i++) { | |||
| // cast before shifting: a plain uint8_t is promoted to int, and shifting | |||
| // it by 32 or more bits would be undefined behaviour | |||
| length |= static_cast<size_t>(length_ptr[i]) << (8 * (7 - i)); | |||
| } | |||
| std::copy(data, data + 16, iv); | |||
| auto output = std::vector<uint8_t>(size - 24); | |||
| mbedtls_aes_crypt_cbc(&ctx, MBEDTLS_AES_DECRYPT, size - 24, iv, | |||
| data + 16, output.data()); | |||
| mbedtls_aes_free(&ctx); | |||
| output.erase(output.begin() + length, output.end()); | |||
| return output; | |||
| } | |||
| static std::vector<uint8_t> get_decrypt_key() { | |||
| std::vector<uint8_t> key = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, | |||
| 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, | |||
| 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, | |||
| 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, | |||
| 0x1C, 0x1D, 0x1E, 0x1F}; | |||
| return key; | |||
| } | |||
| }; | |||
| } // namespace lite | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,49 @@ | |||
| /** | |||
| * \file src/decryption/decrypt_base.h | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include "lite/global.h" | |||
| #include "misc.h" | |||
| namespace lite { | |||
| struct DecryptionStaticData { | |||
| std::unordered_map< | |||
| std::string, | |||
| std::pair<DecryptionFunc, std::shared_ptr<std::vector<uint8_t>>>> | |||
| decryption_methods; | |||
| LITE_MUTEX map_mutex; | |||
| }; | |||
| DecryptionStaticData& decryption_static_data(); | |||
| template <int count> | |||
| struct DecryptionRegister; | |||
| } // namespace lite | |||
| #define CONCAT_IMPL(a, b) a##b | |||
| #define MACRO_CONCAT(a, b) CONCAT_IMPL(a, b) | |||
| #define REGIST_DECRYPTION_METHOD(name_, func_, key_) \ | |||
| REGIST_DECRYPTION_METHOD_WITH_NUM(__COUNTER__, name_, func_, key_) | |||
| #define REGIST_DECRYPTION_METHOD_WITH_NUM(number_, name_, func_, key_) \ | |||
| template <> \ | |||
| struct DecryptionRegister<number_> { \ | |||
| DecryptionRegister() { \ | |||
| register_decryption_and_key(name_, func_, key_); \ | |||
| } \ | |||
| }; \ | |||
| namespace { \ | |||
| DecryptionRegister<number_> MACRO_CONCAT(decryption_, number_); \ | |||
| } | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,349 @@ | |||
| /** | |||
| * \file aes.h | |||
| * | |||
| * \brief AES block cipher | |||
| * | |||
| * Copyright (C) 2006-2015, ARM Limited, All Rights Reserved | |||
| * SPDX-License-Identifier: Apache-2.0 | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); you may | |||
| * not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | |||
| * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * | |||
| * This file is part of mbed TLS (https://tls.mbed.org) | |||
| */ | |||
| /** | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #ifndef MBEDTLS_AES_H | |||
| #define MBEDTLS_AES_H | |||
| #if !defined(MBEDTLS_CONFIG_FILE) | |||
| #include "config.h" | |||
| #else | |||
| #include MBEDTLS_CONFIG_FILE | |||
| #endif | |||
| #include <stddef.h> | |||
| #include <stdint.h> | |||
| /* padlock.c and aesni.c rely on these values! */ | |||
| #define MBEDTLS_AES_ENCRYPT 1 | |||
| #define MBEDTLS_AES_DECRYPT 0 | |||
| #define MBEDTLS_ERR_AES_INVALID_KEY_LENGTH -0x0020 /**< Invalid key length. */ | |||
| #define MBEDTLS_ERR_AES_INVALID_INPUT_LENGTH \ | |||
| -0x0022 /**< Invalid data input length. */ | |||
| #if (defined(__ARMCC_VERSION) || defined(_MSC_VER)) && !defined(inline) && \ | |||
| !defined(__cplusplus) | |||
| #define inline __inline | |||
| #endif | |||
| #if !defined(MBEDTLS_AES_ALT) | |||
| // Regular implementation | |||
| // | |||
| #ifdef __cplusplus | |||
| extern "C" { | |||
| #endif | |||
| /** | |||
| * \brief AES context structure | |||
| * | |||
| * \note buf is able to hold 32 extra bytes, which can be used: | |||
| * - for alignment purposes if VIA padlock is used, and/or | |||
| * - to simplify key expansion in the 256-bit case by | |||
| * generating an extra round key | |||
| */ | |||
| typedef struct { | |||
| int nr; /*!< number of rounds */ | |||
| uint32_t* rk; /*!< AES round keys */ | |||
| uint32_t buf[68]; /*!< unaligned data */ | |||
| } mbedtls_aes_context; | |||
| /** | |||
| * \brief Initialize AES context | |||
| * | |||
| * \param ctx AES context to be initialized | |||
| */ | |||
| void mbedtls_aes_init(mbedtls_aes_context* ctx); | |||
| /** | |||
| * \brief Clear AES context | |||
| * | |||
| * \param ctx AES context to be cleared | |||
| */ | |||
| void mbedtls_aes_free(mbedtls_aes_context* ctx); | |||
| /** | |||
| * \brief AES key schedule (encryption) | |||
| * | |||
| * \param ctx AES context to be initialized | |||
| * \param key encryption key | |||
| * \param keybits must be 128, 192 or 256 | |||
| * | |||
| * \return 0 if successful, or MBEDTLS_ERR_AES_INVALID_KEY_LENGTH | |||
| */ | |||
| int mbedtls_aes_setkey_enc(mbedtls_aes_context* ctx, const unsigned char* key, | |||
| unsigned int keybits); | |||
| /** | |||
| * \brief AES key schedule (decryption) | |||
| * | |||
| * \param ctx AES context to be initialized | |||
| * \param key decryption key | |||
| * \param keybits must be 128, 192 or 256 | |||
| * | |||
| * \return 0 if successful, or MBEDTLS_ERR_AES_INVALID_KEY_LENGTH | |||
| */ | |||
| int mbedtls_aes_setkey_dec(mbedtls_aes_context* ctx, const unsigned char* key, | |||
| unsigned int keybits); | |||
| /** | |||
| * \brief AES-ECB block encryption/decryption | |||
| * | |||
| * \param ctx AES context | |||
| * \param mode MBEDTLS_AES_ENCRYPT or MBEDTLS_AES_DECRYPT | |||
| * \param input 16-byte input block | |||
| * \param output 16-byte output block | |||
| * | |||
| * \return 0 if successful | |||
| */ | |||
| int mbedtls_aes_crypt_ecb(mbedtls_aes_context* ctx, int mode, | |||
| const unsigned char input[16], | |||
| unsigned char output[16]); | |||
| #if defined(MBEDTLS_CIPHER_MODE_CBC) | |||
| /** | |||
| * \brief AES-CBC buffer encryption/decryption | |||
| * Length should be a multiple of the block | |||
| * size (16 bytes) | |||
| * | |||
| * \note Upon exit, the content of the IV is updated so that you can | |||
| * call the same function again on the following | |||
| * block(s) of data and get the same result as if it was | |||
| * encrypted in one call. This allows a "streaming" usage. | |||
| * If on the other hand you need to retain the contents of the | |||
| * IV, you should either save it manually or use the cipher | |||
| * module instead. | |||
| * | |||
| * \param ctx AES context | |||
| * \param mode MBEDTLS_AES_ENCRYPT or MBEDTLS_AES_DECRYPT | |||
| * \param length length of the input data | |||
| * \param iv initialization vector (updated after use) | |||
| * \param input buffer holding the input data | |||
| * \param output buffer holding the output data | |||
| * | |||
| * \return 0 if successful, or MBEDTLS_ERR_AES_INVALID_INPUT_LENGTH | |||
| */ | |||
| int mbedtls_aes_crypt_cbc(mbedtls_aes_context* ctx, int mode, size_t length, | |||
| unsigned char iv[16], const unsigned char* input, | |||
| unsigned char* output); | |||
| #endif /* MBEDTLS_CIPHER_MODE_CBC */ | |||
| #if defined(MBEDTLS_CIPHER_MODE_CFB) | |||
| /** | |||
| * \brief AES-CFB128 buffer encryption/decryption. | |||
| * | |||
| * Note: Due to the nature of CFB you should use the same key schedule for | |||
| * both encryption and decryption. So use a context initialized with | |||
| * mbedtls_aes_setkey_enc() for both MBEDTLS_AES_ENCRYPT and | |||
| * MBEDTLS_AES_DECRYPT. | |||
| * | |||
| * \note Upon exit, the content of the IV is updated so that you can | |||
| * call the same function again on the following | |||
| * block(s) of data and get the same result as if it was | |||
| * encrypted in one call. This allows a "streaming" usage. | |||
| * If on the other hand you need to retain the contents of the | |||
| * IV, you should either save it manually or use the cipher | |||
| * module instead. | |||
| * | |||
| * \param ctx AES context | |||
| * \param mode MBEDTLS_AES_ENCRYPT or MBEDTLS_AES_DECRYPT | |||
| * \param length length of the input data | |||
| * \param iv_off offset in IV (updated after use) | |||
| * \param iv initialization vector (updated after use) | |||
| * \param input buffer holding the input data | |||
| * \param output buffer holding the output data | |||
| * | |||
| * \return 0 if successful | |||
| */ | |||
| int mbedtls_aes_crypt_cfb128(mbedtls_aes_context* ctx, int mode, size_t length, | |||
| size_t* iv_off, unsigned char iv[16], | |||
| const unsigned char* input, unsigned char* output); | |||
| /** | |||
| * \brief AES-CFB8 buffer encryption/decryption. | |||
| * | |||
| * Note: Due to the nature of CFB you should use the same key schedule for | |||
| * both encryption and decryption. So use a context initialized with | |||
| * mbedtls_aes_setkey_enc() for both MBEDTLS_AES_ENCRYPT and | |||
| * MBEDTLS_AES_DECRYPT. | |||
| * | |||
| * \note Upon exit, the content of the IV is updated so that you can | |||
| * call the same function again on the following | |||
| * block(s) of data and get the same result as if it was | |||
| * encrypted in one call. This allows a "streaming" usage. | |||
| * If on the other hand you need to retain the contents of the | |||
| * IV, you should either save it manually or use the cipher | |||
| * module instead. | |||
| * | |||
| * \param ctx AES context | |||
| * \param mode MBEDTLS_AES_ENCRYPT or MBEDTLS_AES_DECRYPT | |||
| * \param length length of the input data | |||
| * \param iv initialization vector (updated after use) | |||
| * \param input buffer holding the input data | |||
| * \param output buffer holding the output data | |||
| * | |||
| * \return 0 if successful | |||
| */ | |||
| int mbedtls_aes_crypt_cfb8(mbedtls_aes_context* ctx, int mode, size_t length, | |||
| unsigned char iv[16], const unsigned char* input, | |||
| unsigned char* output); | |||
| #endif /*MBEDTLS_CIPHER_MODE_CFB */ | |||
| #if defined(MBEDTLS_CIPHER_MODE_CTR) | |||
| /** | |||
| * \brief AES-CTR buffer encryption/decryption | |||
| * | |||
| * Warning: You have to keep the maximum use of your counter in mind! | |||
| * | |||
| * Note: Due to the nature of CTR you should use the same key schedule for | |||
| * both encryption and decryption. So use a context initialized with | |||
| * mbedtls_aes_setkey_enc() for both MBEDTLS_AES_ENCRYPT and | |||
| * MBEDTLS_AES_DECRYPT. | |||
| * | |||
| * \param ctx AES context | |||
| * \param length The length of the data | |||
| * \param nc_off The offset in the current stream_block (for resuming | |||
| * within current cipher stream). The offset pointer to | |||
| * should be 0 at the start of a stream. | |||
| * \param nonce_counter The 128-bit nonce and counter. | |||
| * \param stream_block The saved stream-block for resuming. Is overwritten | |||
| * by the function. | |||
| * \param input The input data stream | |||
| * \param output The output data stream | |||
| * | |||
| * \return 0 if successful | |||
| */ | |||
| int mbedtls_aes_crypt_ctr(mbedtls_aes_context* ctx, size_t length, | |||
| size_t* nc_off, unsigned char nonce_counter[16], | |||
| unsigned char stream_block[16], | |||
| const unsigned char* input, unsigned char* output); | |||
| #endif /* MBEDTLS_CIPHER_MODE_CTR */ | |||
| /** | |||
| * \brief Internal AES block encryption function | |||
| * (Only exposed to allow overriding it, | |||
| * see MBEDTLS_AES_ENCRYPT_ALT) | |||
| * | |||
| * \param ctx AES context | |||
| * \param input Plaintext block | |||
| * \param output Output (ciphertext) block | |||
| * | |||
| * \return 0 if successful | |||
| */ | |||
| int mbedtls_internal_aes_encrypt(mbedtls_aes_context* ctx, | |||
| const unsigned char input[16], | |||
| unsigned char output[16]); | |||
| /** | |||
| * \brief Internal AES block decryption function | |||
| * (Only exposed to allow overriding it, | |||
| * see MBEDTLS_AES_DECRYPT_ALT) | |||
| * | |||
| * \param ctx AES context | |||
| * \param input Ciphertext block | |||
| * \param output Output (plaintext) block | |||
| * | |||
| * \return 0 if successful | |||
| */ | |||
| int mbedtls_internal_aes_decrypt(mbedtls_aes_context* ctx, | |||
| const unsigned char input[16], | |||
| unsigned char output[16]); | |||
| #if !defined(MBEDTLS_DEPRECATED_REMOVED) | |||
| #if defined(MBEDTLS_DEPRECATED_WARNING) | |||
| #define MBEDTLS_DEPRECATED __attribute__((deprecated)) | |||
| #else | |||
| #define MBEDTLS_DEPRECATED | |||
| #endif | |||
| /** | |||
| * \brief Internal AES block encryption function | |||
| * (Only exposed to allow overriding it, | |||
| * see MBEDTLS_AES_ENCRYPT_ALT) | |||
| * | |||
| * \deprecated Superseded by mbedtls_aes_encrypt_ext() in 2.5.0 | |||
| * | |||
| * \param ctx AES context | |||
| * \param input Plaintext block | |||
| * \param output Output (ciphertext) block | |||
| */ | |||
| MBEDTLS_DEPRECATED static inline void mbedtls_aes_encrypt( | |||
| mbedtls_aes_context* ctx, const unsigned char input[16], | |||
| unsigned char output[16]) { | |||
| mbedtls_internal_aes_encrypt(ctx, input, output); | |||
| } | |||
| /** | |||
| * \brief Internal AES block decryption function | |||
| * (Only exposed to allow overriding it, | |||
| * see MBEDTLS_AES_DECRYPT_ALT) | |||
| * | |||
| * \deprecated Superseded by mbedtls_aes_decrypt_ext() in 2.5.0 | |||
| * | |||
| * \param ctx AES context | |||
| * \param input Ciphertext block | |||
| * \param output Output (plaintext) block | |||
| */ | |||
| MBEDTLS_DEPRECATED static inline void mbedtls_aes_decrypt( | |||
| mbedtls_aes_context* ctx, const unsigned char input[16], | |||
| unsigned char output[16]) { | |||
| mbedtls_internal_aes_decrypt(ctx, input, output); | |||
| } | |||
| #undef MBEDTLS_DEPRECATED | |||
| #endif /* !MBEDTLS_DEPRECATED_REMOVED */ | |||
| #ifdef __cplusplus | |||
| } | |||
| #endif | |||
| #else /* MBEDTLS_AES_ALT */ | |||
| #include "aes_alt.h" | |||
| #endif /* MBEDTLS_AES_ALT */ | |||
| #ifdef __cplusplus | |||
| extern "C" { | |||
| #endif | |||
| /** | |||
| * \brief Checkup routine | |||
| * | |||
| * \return 0 if successful, or 1 if the test failed | |||
| */ | |||
| int mbedtls_aes_self_test(int verbose); | |||
| #ifdef __cplusplus | |||
| } | |||
| #endif | |||
| #endif /* aes.h */ | |||
| @@ -0,0 +1,5 @@ | |||
| #pragma once | |||
| #define MBEDTLS_AES_C | |||
| #define MBEDTLS_AES_ROM_TABLES | |||
| #define MBEDTLS_CIPHER_MODE_CBC | |||
| @@ -0,0 +1,156 @@ | |||
| /** | |||
| * \file src/decryption/rc4/rc4_cryption_base.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include <algorithm> | |||
| #include <cstdint> | |||
| namespace lite { | |||
| namespace rc4 { | |||
| #define m256(x) static_cast<uint8_t>(x) | |||
| /*! \brief Pseudo-random byte stream for RC4. | |||
| */ | |||
| class RC4RandStream { | |||
| public: | |||
| RC4RandStream() = default; | |||
| RC4RandStream(uint64_t key) { reset(key); } | |||
| void reset(uint64_t init_key) { | |||
| i_ = j_ = 0; | |||
| for (int i = 0; i < 256; i++) | |||
| s_[i] = i; | |||
| uint8_t j = 0; | |||
| for (int i = 0; i < 256; i++) { | |||
| j = j + s_[i] + m256(init_key >> ((i % 8) * 8)); | |||
| std::swap(s_[i], s_[j]); | |||
| } | |||
| // RC4-drop: discard the first 768 keystream bytes, plus a key-dependent | |||
| // 0-255 extra bytes, to skip the biased initial output | |||
| for (int i = 0; i < 768; i++) { | |||
| next8(); | |||
| } | |||
| for (int i = 0, t = next8(); i < t; i++) { | |||
| next8(); | |||
| } | |||
| } | |||
| uint8_t next8() { | |||
| i_++; | |||
| uint8_t a = s_[i_]; | |||
| j_ += a; | |||
| uint8_t b = s_[j_]; | |||
| s_[i_] = b; | |||
| s_[j_] = a; | |||
| uint8_t c = s_[m256((i_ << 5) ^ (j_ >> 3))] + | |||
| s_[m256((j_ << 5) ^ (i_ >> 3))]; | |||
| return (s_[m256(a + b)] + s_[c ^ 0xAA]) ^ s_[m256(j_ + b)]; | |||
| } | |||
| uint64_t next64() { | |||
| uint64_t rst; | |||
| uint8_t* buf = reinterpret_cast<uint8_t*>(&rst); | |||
| for (int i = 0; i < 8; i++) { | |||
| buf[i] = next8(); | |||
| } | |||
| return rst; | |||
| } | |||
| private: | |||
| uint8_t s_[256], i_ = 0, j_ = 0; | |||
| }; | |||
| #undef m256 | |||
| /*! | |||
| * \brief fast and secure 64-bit hash | |||
| * see https://code.google.com/p/fast-hash/ | |||
| */ | |||
| class FastHash64 { | |||
| public: | |||
| FastHash64(uint64_t seed) | |||
| : hash_{seed}, | |||
| mul0_{key_gen_hash_mul0()}, | |||
| mul1_{key_gen_hash_mul1()} {} | |||
| void feed(uint64_t val) { | |||
| val ^= val >> 23; | |||
| val *= mul0_; | |||
| val ^= val >> 47; | |||
| hash_ ^= val; | |||
| hash_ *= mul1_; | |||
| } | |||
| uint64_t get() { return hash_; } | |||
| private: | |||
| uint64_t hash_; | |||
| const uint64_t mul0_, mul1_; | |||
| static uint64_t key_gen_hash_mul0() { | |||
| uint64_t rst; | |||
| uint8_t volatile* buf = reinterpret_cast<uint8_t*>(&rst); | |||
| buf[2] = 50; | |||
| buf[3] = 244; | |||
| buf[6] = 39; | |||
| buf[1] = 92; | |||
| buf[5] = 89; | |||
| buf[4] = 155; | |||
| buf[0] = 55; | |||
| buf[7] = 33; | |||
| return rst; | |||
| } | |||
| static uint64_t key_gen_hash_mul1() { | |||
| uint64_t rst; | |||
| uint8_t volatile* buf = reinterpret_cast<uint8_t*>(&rst); | |||
| buf[6] = 3; | |||
| buf[2] = 109; | |||
| buf[7] = 136; | |||
| buf[1] = 25; | |||
| buf[5] = 85; | |||
| buf[0] = 101; | |||
| buf[4] = 242; | |||
| buf[3] = 30; | |||
| return rst; | |||
| } | |||
| }; | |||
| // The encryption keys are always inlined. | |||
| static inline uint64_t key_gen_enc_key() { | |||
| uint64_t rst; | |||
| uint8_t volatile* buf = reinterpret_cast<uint8_t*>(&rst); | |||
| buf[4] = 120; | |||
| buf[3] = 121; | |||
| buf[7] = 122; | |||
| buf[6] = 123; | |||
| buf[0] = 124; | |||
| buf[5] = 125; | |||
| buf[2] = 126; | |||
| buf[1] = 127; | |||
| return rst; | |||
| } | |||
| static inline uint64_t key_gen_hash_key() { | |||
| uint64_t rst; | |||
| uint8_t volatile* buf = reinterpret_cast<uint8_t*>(&rst); | |||
| buf[2] = 101; | |||
| buf[5] = 102; | |||
| buf[4] = 103; | |||
| buf[7] = 104; | |||
| buf[1] = 105; | |||
| buf[3] = 106; | |||
| buf[6] = 107; | |||
| buf[0] = 108; | |||
| return rst; | |||
| } | |||
| } // namespace rc4 | |||
| } // namespace lite | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,219 @@ | |||
| /** | |||
| * \file src/decryption/rc4/rc4_cryption_impl.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "rc4_cryption_impl.h" | |||
| #include "../../misc.h" | |||
| #include <cstring> | |||
| using namespace lite; | |||
| /*! | |||
| * \brief Read the input stream once in order to initialize the decryption | |||
| * state. | |||
| */ | |||
| void RC4Impl::init_rc4_state() { | |||
| rc4::RC4RandStream enc_stream(m_enc_key); | |||
| rc4::FastHash64 dechash(m_hash_key); | |||
| size_t offset = 0; | |||
| std::vector<uint64_t> buffer(128); | |||
| size_t remaining = m_model_length - sizeof(uint64_t); | |||
| while (remaining > 0) { | |||
| size_t toread = std::min(remaining, buffer.size() * sizeof(uint64_t)); | |||
| memcpy(buffer.data(), static_cast<const uint8_t*>(m_model_mem) + offset, | |||
| toread); | |||
| offset += toread; | |||
| remaining -= toread; | |||
| for (size_t i = 0; i < toread / sizeof(uint64_t); ++i) { | |||
| uint64_t value = buffer[i]; | |||
| value ^= enc_stream.next64(); | |||
| dechash.feed(value); | |||
| } | |||
| } | |||
| uint64_t hashvalue; | |||
| memcpy(&hashvalue, static_cast<const uint8_t*>(m_model_mem) + offset, | |||
| sizeof(hashvalue)); | |||
| offset += sizeof(hashvalue); | |||
| hashvalue ^= dechash.get() ^ enc_stream.next64(); | |||
| m_state.hash_stream.reset(hashvalue); | |||
| m_state.enc_stream.reset(m_enc_key); | |||
| } | |||
| std::vector<uint8_t> RC4Impl::decrypt_model() { | |||
| std::vector<uint8_t> result(m_model_length, 0); | |||
| uint8_t* ptr = result.data(); | |||
| for (size_t i = 0; i < m_model_length; ++i) { | |||
| ptr[i] = static_cast<const uint8_t*>(m_model_mem)[i]; | |||
| ptr[i] ^= m_state.hash_stream.next8() ^ m_state.enc_stream.next8(); | |||
| } | |||
| return result; | |||
| } | |||
| /*! \brief Encrypt the data in m_buffer. | |||
| * | |||
| * The basic idea is to calculate a 64-bit hash from the buffer and append | |||
| * it to the end of the buffer. The basic requirement is that changing any | |||
| * byte, including the hash value, destroys every byte of the decrypted | |||
| * model. | |||
| * | |||
| * Encryption: | |||
| * | |||
| * 1. First calculate a 64-bit hash, called plain hash value, from the | |||
| * buffer. | |||
| * 2. Initialize a RC4 stream with the plain hash value. | |||
| * 3. Obfuscate the model body with the RC4 stream defined in step 2. | |||
| * 4. Calculate the hash value of the obfuscated model, called hash value | |||
| * after hashing. | |||
| * 5. Encrypt the model body with a RC4 stream made from the encryption key. | |||
| * 6. Bit-xor the hash value after hashing with the plain hash value, called | |||
| * mixed hash. | |||
| * 7. Encrypt the mixed hash with the RC4 stream defined in step 5, called | |||
| * the protected hash. | |||
| * 8. Append the protected hash to the buffer. | |||
| * | |||
| * Decryption: | |||
| * 1. Decrypt the model body with a RC4 stream made from the encryption key, | |||
| * which is the reverse of steps 5 and 7 of encryption, and get the mixed | |||
| * hash. | |||
| * 2. Calculate the hash value of the decrypted model, which equals the | |||
| * hash value after hashing in step 4 of encryption. | |||
| * 3. Bit-xor the hash value after hashing and the mixed hash to get the | |||
| * plain hash value, which is the reverse of step 6 of encryption. | |||
| * 4. Un-obfuscate the model body with the plain hash value, which is the | |||
| * reverse of step 3 of encryption. | |||
| * | |||
| * Think: | |||
| * 1. If any byte in the model body is broken, the hash value after hashing | |||
| * will be broken in step 2, and hence the plain hash value in step 3 | |||
| * will also be broken, and finally, the model body will be broken in | |||
| * step 4. | |||
| * 2. If the protected hash is broken, the plain hash value in step 3 will | |||
| * be broken, and finally the model body will be broken. | |||
| */ | |||
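| /* A small worked check of the algebra above (illustration only): let P be | |||
| * the plain hash from step 1 and H the hash value after hashing from step 4. | |||
| * The stored trailer is (P ^ H) re-encrypted by the outer RC4 stream | |||
| * (steps 6-7). During decryption the outer stream is removed first, hashing | |||
| * the decrypted body reproduces the same H, and (P ^ H) ^ H == P recovers | |||
| * the plain hash, which then seeds the stream that un-obfuscates the body. | |||
| */ | |||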
| std::vector<uint8_t> RC4Impl::encrypt_model() { | |||
| size_t total_length = (m_model_length + (sizeof(size_t) - 1)) / | |||
| sizeof(size_t) * sizeof(size_t); | |||
| std::vector<uint8_t> pad_model(total_length, 0); | |||
| memcpy(pad_model.data(), m_model_mem, m_model_length); | |||
| // Calculate the hash of the model. | |||
| rc4::FastHash64 plainhash(m_hash_key); | |||
| uint64_t* ptr = reinterpret_cast<uint64_t*>(pad_model.data()); | |||
| size_t len = pad_model.size() / sizeof(uint64_t); | |||
| for (size_t i = 0; i < len; ++i) | |||
| plainhash.feed(ptr[i]); | |||
| uint64_t plainhash_value = plainhash.get(); | |||
| // Encrypt the model. | |||
| rc4::RC4RandStream hash_enc(plainhash_value); | |||
| rc4::RC4RandStream outmost_enc(m_enc_key); | |||
| rc4::FastHash64 afterhashenc_hash(m_hash_key); | |||
| for (size_t i = 0; i < len; ++i) { | |||
| uint64_t value = ptr[i] ^ hash_enc.next64(); | |||
| afterhashenc_hash.feed(value); | |||
| ptr[i] = value ^ outmost_enc.next64(); | |||
| } | |||
| uint64_t protected_hash = | |||
| plainhash_value ^ afterhashenc_hash.get() ^ outmost_enc.next64(); | |||
| size_t end = pad_model.size(); | |||
| pad_model.resize(pad_model.size() + sizeof(uint64_t)); | |||
| ptr = reinterpret_cast<uint64_t*>(&pad_model[end]); | |||
| *ptr = protected_hash; | |||
| return pad_model; | |||
| } | |||
| /*! | |||
| * \brief Read the input stream once in order to initialize the decryption | |||
| * state. | |||
| */ | |||
| void SimpleFastRC4Impl::init_sfrc4_state() { | |||
| rc4::RC4RandStream enc_stream(m_enc_key); | |||
| rc4::FastHash64 dechash(m_hash_key); | |||
| size_t offset = 0; | |||
| std::vector<uint64_t> buffer(128); | |||
| size_t remaining = m_model_length - sizeof(uint64_t); | |||
| while (remaining > 0) { | |||
| size_t toread = std::min(remaining, buffer.size() * sizeof(uint64_t)); | |||
| memcpy(buffer.data(), static_cast<const uint8_t*>(m_model_mem) + offset, | |||
| toread); | |||
| offset += toread; | |||
| remaining -= toread; | |||
| for (size_t i = 0; i < toread / sizeof(uint64_t); ++i) { | |||
| uint64_t value = buffer[i]; | |||
| dechash.feed(value); | |||
| } | |||
| } | |||
| uint64_t hashvalue; | |||
| memcpy(&hashvalue, static_cast<const uint8_t*>(m_model_mem) + offset, | |||
| sizeof(hashvalue)); | |||
| offset += sizeof(hashvalue); | |||
| //! verify the stored checksum against the recomputed hash | |||
| if (hashvalue != dechash.get()) | |||
| LITE_THROW( | |||
| "The checksum of the file cannot be verified. The file may " | |||
| "have been encrypted with a different algorithm or key."); | |||
| m_state.hash_stream.reset(m_hash_key); | |||
| m_state.enc_stream.reset(m_enc_key); | |||
| } | |||
| std::vector<uint8_t> SimpleFastRC4Impl::decrypt_model() { | |||
| std::vector<uint8_t> result(m_model_length, 0); | |||
| uint8_t* ptr = result.data(); | |||
| for (size_t i = 0; i < m_model_length; ++i) { | |||
| ptr[i] = static_cast<const uint8_t*>(m_model_mem)[i]; | |||
| ptr[i] ^= m_state.enc_stream.next8(); | |||
| } | |||
| return result; | |||
| } | |||
| std::vector<uint8_t> SimpleFastRC4Impl::encrypt_model() { | |||
| size_t total_length = (m_model_length + (sizeof(size_t) - 1)) / | |||
| sizeof(size_t) * sizeof(size_t); | |||
| std::vector<uint8_t> pad_model(total_length, 0); | |||
| memcpy(pad_model.data(), m_model_mem, m_model_length); | |||
| // Calculate the hash of the model. | |||
| rc4::FastHash64 enchash(m_hash_key); | |||
| uint64_t* ptr = reinterpret_cast<uint64_t*>(pad_model.data()); | |||
| size_t len = pad_model.size() / sizeof(uint64_t); | |||
| // Encrypt the model. | |||
| rc4::RC4RandStream out_enc(m_enc_key); | |||
| for (size_t i = 0; i < len; ++i) { | |||
| ptr[i] = ptr[i] ^ out_enc.next64(); | |||
| enchash.feed(ptr[i]); | |||
| } | |||
| uint64_t hash_value = enchash.get(); | |||
| size_t end = pad_model.size(); | |||
| pad_model.resize(pad_model.size() + sizeof(uint64_t)); | |||
| ptr = reinterpret_cast<uint64_t*>(&pad_model[end]); | |||
| *ptr = hash_value; | |||
| return pad_model; | |||
| } | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,79 @@ | |||
| /** | |||
| * \file src/decryption/rc4/rc4_cryption_impl.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include "rc4_cryption_base.h" | |||
| #include <memory> | |||
| #include <vector> | |||
| namespace lite { | |||
| class RC4Impl { | |||
| struct RC4State { | |||
| rc4::RC4RandStream enc_stream; | |||
| rc4::RC4RandStream hash_stream; | |||
| } m_state; | |||
| public: | |||
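| /*! \brief Construct from a model buffer and a key whose first 8 bytes | |||
| * are the hash key and next 8 bytes the encryption key. | |||
| */ | |||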
| RC4Impl(const void* model_mem, size_t size, const std::vector<uint8_t>& key) | |||
| : m_model_mem(model_mem), m_model_length(size) { | |||
| const uint8_t* data = key.data(); | |||
| m_hash_key = *reinterpret_cast<const uint64_t*>(data); | |||
| m_enc_key = *reinterpret_cast<const uint64_t*>(data + 8); | |||
| } | |||
| std::vector<uint8_t> encrypt_model(); | |||
| std::vector<uint8_t> decrypt_model(); | |||
| /*! \brief Read the input stream once in order to initialize the decryption | |||
| * state. | |||
| */ | |||
| void init_rc4_state(); | |||
| private: | |||
| const void* m_model_mem; | |||
| size_t m_model_length; | |||
| uint64_t m_hash_key; | |||
| uint64_t m_enc_key; | |||
| }; | |||
| class SimpleFastRC4Impl { | |||
| struct SFRC4State { | |||
| rc4::RC4RandStream enc_stream; | |||
| rc4::RC4RandStream hash_stream; | |||
| } m_state; | |||
| public: | |||
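| /*! \brief Same key layout as RC4Impl: bytes [0, 8) are the hash key, | |||
| * bytes [8, 16) the encryption key. | |||
| */ | |||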
| SimpleFastRC4Impl(const void* model_mem, size_t size, | |||
| const std::vector<uint8_t>& key) | |||
| : m_model_mem(model_mem), m_model_length(size) { | |||
| const uint8_t* data = key.data(); | |||
| m_hash_key = *reinterpret_cast<const uint64_t*>(data); | |||
| m_enc_key = *reinterpret_cast<const uint64_t*>(data + 8); | |||
| } | |||
| std::vector<uint8_t> encrypt_model(); | |||
| std::vector<uint8_t> decrypt_model(); | |||
| /*! \brief Read the input stream once in order to initialize the decryption | |||
| * state. | |||
| */ | |||
| void init_sfrc4_state(); | |||
| private: | |||
| const void* m_model_mem; | |||
| size_t m_model_length; | |||
| uint64_t m_hash_key; | |||
| uint64_t m_enc_key; | |||
| }; | |||
| } // namespace lite | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,58 @@ | |||
| /** | |||
| * \file src/decryption/rc4_cryption.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "rc4_cryption.h" | |||
| #include "rc4/rc4_cryption_impl.h" | |||
| #include <vector> | |||
| using namespace lite; | |||
| std::vector<uint8_t> RC4::decrypt_model(const void* model_mem, size_t size, | |||
| const std::vector<uint8_t>& key) { | |||
| RC4Impl rc4_impl(model_mem, size, key); | |||
| rc4_impl.init_rc4_state(); | |||
| return rc4_impl.decrypt_model(); | |||
| } | |||
| std::vector<uint8_t> RC4::encrypt_model(const void* model_mem, size_t size, | |||
| const std::vector<uint8_t>& key) { | |||
| RC4Impl rc4_impl(model_mem, size, key); | |||
| return rc4_impl.encrypt_model(); | |||
| } | |||
| std::vector<uint8_t> RC4::get_decrypt_key() { | |||
| std::vector<uint8_t> keys(128, 0); | |||
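| // Only the first 16 bytes are meaningful: a generated hash key | |||
| // followed by a generated encryption key. | |||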
| uint64_t* data = reinterpret_cast<uint64_t*>(keys.data()); | |||
| data[0] = rc4::key_gen_hash_key(); | |||
| data[1] = rc4::key_gen_enc_key(); | |||
| return keys; | |||
| }; | |||
| std::vector<uint8_t> SimpleFastRC4::decrypt_model( | |||
| const void* model_mem, size_t size, const std::vector<uint8_t>& key) { | |||
| SimpleFastRC4Impl simple_fast_rc4_impl(model_mem, size, key); | |||
| simple_fast_rc4_impl.init_sfrc4_state(); | |||
| return simple_fast_rc4_impl.decrypt_model(); | |||
| } | |||
| std::vector<uint8_t> SimpleFastRC4::encrypt_model( | |||
| const void* model_mem, size_t size, const std::vector<uint8_t>& key) { | |||
| SimpleFastRC4Impl simple_fast_rc4_impl(model_mem, size, key); | |||
| return simple_fast_rc4_impl.encrypt_model(); | |||
| } | |||
| std::vector<uint8_t> SimpleFastRC4::get_decrypt_key() { | |||
| std::vector<uint8_t> keys(128, 0); | |||
| uint64_t* data = reinterpret_cast<uint64_t*>(keys.data()); | |||
| data[0] = rc4::key_gen_hash_key(); | |||
| data[1] = rc4::key_gen_enc_key(); | |||
| return keys; | |||
| } | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,44 @@ | |||
| /** | |||
| * \file src/decryption/rc4_cryption.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include "rc4/rc4_cryption_base.h" | |||
| #include <vector> | |||
| namespace lite { | |||
| class RC4 { | |||
| public: | |||
| static std::vector<uint8_t> decrypt_model(const void* model_mem, | |||
| size_t size, | |||
| const std::vector<uint8_t>& key); | |||
| static std::vector<uint8_t> encrypt_model(const void* model_mem, | |||
| size_t size, | |||
| const std::vector<uint8_t>& key); | |||
| static std::vector<uint8_t> get_decrypt_key(); | |||
| }; | |||
| class SimpleFastRC4 { | |||
| public: | |||
| static std::vector<uint8_t> decrypt_model(const void* model_mem, | |||
| size_t size, | |||
| const std::vector<uint8_t>& key); | |||
| static std::vector<uint8_t> encrypt_model(const void* model_mem, | |||
| size_t size, | |||
| const std::vector<uint8_t>& key); | |||
| static std::vector<uint8_t> get_decrypt_key(); | |||
| }; | |||
| } // namespace lite | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,53 @@ | |||
| /** | |||
| * \file src/function_base.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include <unordered_map> | |||
| #include "misc.h" | |||
| #include "type_info.h" | |||
| // template <typename tensor_type, typename ...Arg> | |||
| namespace lite { | |||
| class TensorImplDft; | |||
| class NetworkImplDft; | |||
| namespace { | |||
| template <typename class_type> | |||
| struct class_type_name { | |||
| std::string operator()() { return ""; } | |||
| }; | |||
| #define ADD_STATEMENT(class_name, backend_name) \ | |||
| template <> \ | |||
| struct class_type_name<class_name> { \ | |||
| std::string operator()() { return #backend_name; } \ | |||
| } | |||
| ADD_STATEMENT(TensorImplDft, Dft); | |||
| ADD_STATEMENT(NetworkImplDft, Dft); | |||
| #undef ADD_STATEMENT | |||
| } // namespace | |||
| // if the backend does not provide the requested function, ignore the call | |||
| template <typename tensor_type, typename ret_type, typename... Args> | |||
| ret_type try_call_func(std::string func_name, Args... args) { | |||
| mark_used_variable(func_name); | |||
| mark_used_variable(args...); | |||
| return nullptr; | |||
| } | |||
| // if the backend does not provide the requested function, throw an error | |||
| template <typename tensor_type, typename ret_type, typename... Args> | |||
| ret_type call_func(std::string func_name, Args... args) { | |||
| mark_used_variable(args...); | |||
| auto backend_name = class_type_name<tensor_type>()(); | |||
| auto msg_info = | |||
| func_name + " is not aviliable in " + backend_name + " backend."; | |||
| LITE_THROW(msg_info.c_str()); | |||
| } | |||
| } // namespace lite | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,256 @@ | |||
| /** | |||
| * \file src/global.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include <lite_build_config.h> | |||
| #include "lite/global.h" | |||
| #include "decryption/aes_decrypt.h" | |||
| #include "decryption/decrypt_base.h" | |||
| #include "decryption/rc4_cryption.h" | |||
| #include "misc.h" | |||
| #include "parse_info/parse_info_base.h" | |||
| #include "parse_info/default_parse.h" | |||
| #if LITE_BUILD_WITH_MGE | |||
| #include "megbrain/common.h" | |||
| #include "megbrain/comp_node.h" | |||
| #include "megbrain/serialization/extern_c_opr.h" | |||
| #include "megbrain/version.h" | |||
| #include "megcore_opencl.h" | |||
| #include "mge/algo_cache/file_cache.h" | |||
| #include "mge/common.h" | |||
| #if MGB_ENABLE_TENSOR_RT | |||
| #include "megbrain/tensorrt/tensorrt_engine_cache.h" | |||
| #endif | |||
| #if LITE_WITH_CUDA | |||
| #include "mge/algo_cache/redis_cache.h" | |||
| #endif | |||
| #endif | |||
| #include <mutex> | |||
| #include <unordered_map> | |||
| using namespace lite; | |||
| lite::DecryptionStaticData& lite::decryption_static_data() { | |||
| static lite::DecryptionStaticData global_map; | |||
| return global_map; | |||
| } | |||
| void lite::get_version(int& major, int& minor, int& patch) { | |||
| #if LITE_BUILD_WITH_MGE | |||
| auto version = mgb::get_version(); | |||
| major = version.major; | |||
| minor = version.minor; | |||
| patch = version.patch; | |||
| #else | |||
| //! without MegEngine, report a placeholder maximum version | |||
| major = 8; | |||
| minor = 9999; | |||
| patch = 0; | |||
| #endif | |||
| } | |||
| size_t lite::get_device_count(LiteDeviceType device_type) { | |||
| #if LITE_BUILD_WITH_MGE | |||
| auto mgb_device_type = to_compnode_locator(device_type).type; | |||
| return mgb::CompNode::get_device_count(mgb_device_type); | |||
| #else | |||
| LITE_MARK_USED_VAR(device_type); | |||
| LITE_THROW("no lite backend avialible, please check build macro."); | |||
| #endif | |||
| } | |||
| bool lite::register_decryption_and_key(std::string decrypt_name, | |||
| const DecryptionFunc& func, | |||
| const std::vector<uint8_t>& key) { | |||
| LITE_LOCK_GUARD(decryption_static_data().map_mutex); | |||
| auto& global_map = decryption_static_data().decryption_methods; | |||
| if (global_map.find(decrypt_name) != global_map.end()) { | |||
| LITE_THROW(ssprintf("The decryption method %s is already registered.", | |||
| decrypt_name.c_str())); | |||
| return false; | |||
| } else { | |||
| auto key_pointer = std::make_shared<std::vector<uint8_t>>(key); | |||
| global_map[decrypt_name] = {func, key_pointer}; | |||
| LITE_LOG("Registered ecryption method %s.", decrypt_name.c_str()); | |||
| return true; | |||
| } | |||
| } | |||
| bool lite::update_decryption_or_key(std::string decrypt_name, | |||
| const DecryptionFunc& func, | |||
| const std::vector<uint8_t>& key) { | |||
| LITE_LOCK_GUARD(decryption_static_data().map_mutex); | |||
| auto& global_map = decryption_static_data().decryption_methods; | |||
| if (global_map.find(decrypt_name) != global_map.end()) { | |||
| std::shared_ptr<std::vector<uint8_t>> key_pointer; | |||
| DecryptionFunc new_func; | |||
| if (func) { | |||
| new_func = func; | |||
| LITE_LOG("%s decryption function is updated.", | |||
| decrypt_name.c_str()); | |||
| } else { | |||
| new_func = global_map[decrypt_name].first; | |||
| } | |||
| if (key.size()) { | |||
| key_pointer = std::make_shared<std::vector<uint8_t>>(key); | |||
| LITE_LOG("%s decryption key is updated.", decrypt_name.c_str()); | |||
| } else { | |||
| key_pointer = global_map[decrypt_name].second; | |||
| } | |||
| global_map[decrypt_name] = {new_func, key_pointer}; | |||
| return true; | |||
| } else { | |||
| LITE_THROW(ssprintf("The decryption method %s is not registered.", | |||
| decrypt_name.c_str())); | |||
| return false; | |||
| } | |||
| } | |||
| lite::ParseInfoStaticData& lite::parse_info_static_data() { | |||
| static lite::ParseInfoStaticData global_map; | |||
| return global_map; | |||
| } | |||
| bool lite::register_parse_info_func(std::string info_type, | |||
| const ParseInfoFunc& parse_func) { | |||
| LITE_LOCK_GUARD(parse_info_static_data().map_mutex); | |||
| auto& global_map = parse_info_static_data().parse_info_methods; | |||
| if (global_map.find(info_type) != global_map.end()) { | |||
| LITE_THROW(ssprintf("The parse info method %s is already registered.", | |||
| info_type.c_str())); | |||
| return false; | |||
| } else { | |||
| global_map[info_type] = parse_func; | |||
| LITE_LOG("Registered infomation parser method %s.", info_type.c_str()); | |||
| return true; | |||
| } | |||
| } | |||
| #if LITE_BUILD_WITH_MGE | |||
| namespace { | |||
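| // Tracks the cache type and how many times the algo / TensorRT caches | |||
| // have been configured, so repeated configuration can be warned about. | |||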
| struct CacheControl { | |||
| LITE_MUTEX cache_mutex; | |||
| std::string cache_type = "file"; | |||
| std::atomic_size_t config_algo_times{0}; | |||
| std::atomic_size_t config_trt_times{0}; | |||
| }; | |||
| CacheControl cache_control; | |||
| } // namespace | |||
| void lite::try_coalesce_all_free_memory() { | |||
| mgb::CompNode::try_coalesce_all_free_memory(); | |||
| } | |||
| void lite::set_loader_lib_path(const std::string& loader_path) { | |||
| const char* lib_path = loader_path.c_str(); | |||
| LITE_LOG("load a device loader of path %s.", lib_path); | |||
| auto handle = dlopen(lib_path, RTLD_LAZY); | |||
| LITE_ASSERT(handle, "failed to open c opr lib %s: %s", lib_path, dlerror()); | |||
| const char* entry = MGB_C_OPR_INIT_FUNC_STR; | |||
| auto func = dlsym(handle, entry); | |||
| LITE_ASSERT(func, "can not resolve %s: %s", entry, dlerror()); | |||
| typedef void (*entry_f_t)(void*); | |||
| reinterpret_cast<entry_f_t>(func)( | |||
| reinterpret_cast<void*>(&mgb_get_extern_c_opr_api_versioned)); | |||
| } | |||
| void lite::set_persistent_cache(const std::string& cache_path, | |||
| bool always_sync) { | |||
| LITE_LOCK_GUARD(cache_control.cache_mutex); | |||
| cache_control.cache_type = "file"; | |||
| if (cache_control.config_algo_times >= 1) { | |||
| LITE_WARN( | |||
| "The cache has been set,maybe some model is using now, change " | |||
| "it now may cause unknow error!!"); | |||
| } | |||
| cache_control.config_algo_times++; | |||
| mgb::PersistentCache::set_impl(std::make_shared<InFilePersistentCache>( | |||
| cache_path.c_str(), always_sync)); | |||
| } | |||
| void lite::dump_persistent_cache(const std::string& cache_path) { | |||
| LITE_LOCK_GUARD(cache_control.cache_mutex); | |||
| LITE_ASSERT(cache_control.cache_type == "file", | |||
| "now cache type is redis, it can't be dumped."); | |||
| static_cast<InFilePersistentCache&>(mgb::PersistentCache::inst()) | |||
| .dump_cache(cache_path.c_str()); | |||
| } | |||
| //! Set the TensorRT engine cache path for serialized prebuilt ICudaEngine | |||
| void lite::set_tensor_rt_cache(std::string tensorrt_cache_path) { | |||
| #if MGB_ENABLE_TENSOR_RT | |||
| LITE_LOCK_GUARD(cache_control.cache_mutex); | |||
| if (cache_control.config_trt_times >= 1) { | |||
| LITE_WARN( | |||
| "The trt cache has been set,maybe some model is using now, " | |||
| "change it now may cause unknow error!!"); | |||
| } | |||
| cache_control.config_trt_times++; | |||
| mgb::TensorRTEngineCache::enable_engine_cache(true); | |||
| mgb::TensorRTEngineCache::set_impl( | |||
| std::make_shared<mgb::TensorRTEngineCacheIO>(tensorrt_cache_path)); | |||
| #else | |||
| LITE_MARK_USED_VAR(tensorrt_cache_path); | |||
| LITE_THROW("TensorRT is disable at compile time."); | |||
| #endif | |||
| } | |||
| void lite::dump_tensor_rt_cache() { | |||
| #if MGB_ENABLE_TENSOR_RT | |||
| if (mgb::TensorRTEngineCache::enable_engine_cache()) { | |||
| mgb::TensorRTEngineCache::inst().dump_cache(); | |||
| } | |||
| #else | |||
| LITE_THROW("TensorRT is disable at compile time."); | |||
| #endif | |||
| } | |||
| #else //LITE_BUILD_WITH_MGE | |||
| void lite::try_coalesce_all_free_memory() {} | |||
| void lite::set_loader_lib_path(const std::string& ) { | |||
| LITE_THROW("mge is disbale at build time, please build with mge"); | |||
| } | |||
| void lite::set_persistent_cache(const std::string&, bool) { | |||
| LITE_THROW("mge is disbale at build time, please build with mge"); | |||
| } | |||
| void lite::dump_persistent_cache(const std::string& ) { | |||
| LITE_THROW("mge is disbale at build time, please build with mge"); | |||
| } | |||
| //! Set the TensorRT engine cache path for serialized prebuilt ICudaEngine | |||
| void lite::set_tensor_rt_cache(std::string ) { | |||
| LITE_THROW("mge is disbale at build time, please build with mge"); | |||
| } | |||
| void lite::dump_tensor_rt_cache() { | |||
| LITE_THROW("mge is disbale at build time, please build with mge"); | |||
| } | |||
| #endif | |||
| namespace lite { | |||
| REGIST_DECRYPTION_METHOD("AES_default", lite::AESDcryption::decrypt_model, | |||
| lite::AESDcryption::get_decrypt_key()); | |||
| REGIST_DECRYPTION_METHOD("RC4_default", lite::RC4::decrypt_model, | |||
| lite::RC4::get_decrypt_key()); | |||
| REGIST_DECRYPTION_METHOD("SIMPLE_FAST_RC4_default", | |||
| lite::SimpleFastRC4::decrypt_model, | |||
| lite::SimpleFastRC4::get_decrypt_key()); | |||
| REGIST_PARSE_INFO_FUNCTION("LITE_default", lite::default_parse_info); | |||
| } // namespace lite | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,37 @@ | |||
| /** | |||
| * \file lite/src/lite_build_config.h.in | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #ifndef _HEADER_LITE_BUILD_CONFIG | |||
| #define _HEADER_LITE_BUILD_CONFIG | |||
| #cmakedefine01 LITE_ENABLE_LOGGING | |||
| #cmakedefine01 LITE_ENABLE_EXCEPTION | |||
| #cmakedefine01 LITE_WITH_CUDA | |||
| #cmakedefine01 LITE_ASSERT_LOC | |||
| #ifndef LITE_ENABLE_LOGGING | |||
| #define LITE_ENABLE_LOGGING 1 | |||
| #endif | |||
| #ifndef LITE_ENABLE_EXCEPTION | |||
| #if __cpp_exceptions || __EXCEPTIONS || \ | |||
| (defined(_MSC_VER) && defined(_CPPUNWIND)) | |||
| #define LITE_ENABLE_EXCEPTION 1 | |||
| #else | |||
| #define LITE_ENABLE_EXCEPTION 0 | |||
| #endif | |||
| #endif | |||
| #ifndef LITE_WITH_CUDA | |||
| #define LITE_WITH_CUDA 0 | |||
| #endif | |||
| #ifndef LITE_ASSERT_LOC | |||
| #define LITE_ASSERT_LOC 0 | |||
| #endif | |||
| #endif // _HEADER_LITE_BUILD_CONFIG | |||
| @@ -0,0 +1,254 @@ | |||
| /** | |||
| * \file lite/src/mge/algo_cache/file_cache.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2020 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "lite_build_config.h" | |||
| #if LITE_BUILD_WITH_MGE | |||
| #include "../common.h" | |||
| #include "file_cache.h" | |||
| using namespace lite; | |||
| //////////////////////// InFilePersistentCache::InputMemory /////////////// | |||
| class InFilePersistentCache::InputMemory { | |||
| const uint8_t* m_ptr; | |||
| size_t m_offset = 0; | |||
| size_t m_size; | |||
| public: | |||
| InputMemory(const uint8_t* bin, size_t size) : m_ptr{bin}, m_size{size} {} | |||
| template <typename T> | |||
| void read(T& val) { | |||
| static_assert(std::is_trivially_copyable<T>::value, | |||
| "only support trivially copyable type"); | |||
| LITE_ASSERT(m_offset + sizeof(T) <= m_size); | |||
| memcpy(&val, m_ptr, sizeof(T)); | |||
| m_offset += sizeof(T); | |||
| m_ptr += sizeof(T); | |||
| } | |||
| template <typename T> | |||
| void read(T* buf, size_t size) { | |||
| static_assert(std::is_trivially_copyable<T>::value && sizeof(T) == 1, | |||
| "only support read bytes"); | |||
| LITE_ASSERT(m_offset + size <= m_size); | |||
| memcpy(buf, m_ptr, size); | |||
| m_offset += size; | |||
| m_ptr += size; | |||
| } | |||
| }; | |||
| //////////////////////// InFilePersistentCache::InputFile /////////////// | |||
| class InFilePersistentCache::InputFile { | |||
| FILE* m_fp; | |||
| public: | |||
| InputFile(const char* path) : m_fp{fopen(path, "rb")} { | |||
| LITE_ASSERT(m_fp, "failed to open %s: %s", path, strerror(errno)); | |||
| } | |||
| ~InputFile() { | |||
| if (m_fp) { | |||
| fclose(m_fp); | |||
| } | |||
| } | |||
| template <typename T> | |||
| void read(T& val) { | |||
| static_assert(std::is_trivially_copyable<T>::value, | |||
| "only support trivially copyable type"); | |||
| auto ret = fread(&val, sizeof(T), 1, m_fp); | |||
| LITE_ASSERT(ret == 1); | |||
| } | |||
| template <typename T> | |||
| void read(T* buf, size_t size) { | |||
| static_assert(std::is_trivially_copyable<T>::value && sizeof(T) == 1, | |||
| "only support read bytes"); | |||
| auto ret = fread(buf, size, 1, m_fp); | |||
| LITE_ASSERT(ret == 1); | |||
| } | |||
| }; | |||
| //////////////////////// InFilePersistentCache::OutputFile /////////////// | |||
| class InFilePersistentCache::OutputFile { | |||
| FILE* m_fp; | |||
| public: | |||
| OutputFile(const char* path) : m_fp{fopen(path, "wb")} { | |||
| LITE_ASSERT(m_fp, "failed to open %s: %s", path, strerror(errno)); | |||
| } | |||
| ~OutputFile() { | |||
| if (m_fp) { | |||
| fclose(m_fp); | |||
| } | |||
| } | |||
| template <typename T> | |||
| void write(T val) { | |||
| auto ret = fwrite(&val, sizeof(T), 1, m_fp); | |||
| LITE_ASSERT(ret == 1); | |||
| } | |||
| template <typename T> | |||
| void write(const T* buf, size_t size) { | |||
| static_assert(sizeof(T) == 1, "only support write bytes"); | |||
| auto ret = fwrite(buf, size, 1, m_fp); | |||
| LITE_ASSERT(ret == 1); | |||
| } | |||
| void flush() { fflush(m_fp); } | |||
| void set_head() { fseek(m_fp, 0, SEEK_SET); } | |||
| }; | |||
| //////////////////////// InFilePersistentCache::BlobStorage /////////////// | |||
| template <typename Input> | |||
| InFilePersistentCache::BlobStorage& | |||
| InFilePersistentCache::BlobStorage::init_from_input(Input& inp) { | |||
| uint32_t data_size; | |||
| inp.read(data_size); | |||
| size = data_size; | |||
| data_refhold = std::make_unique<uint8_t[]>(size); | |||
| inp.read(data_refhold.get(), size); | |||
| ptr = data_refhold.get(); | |||
| return *this; | |||
| } | |||
| void InFilePersistentCache::BlobStorage::write_to_file( | |||
| OutputFile& out_file) const { | |||
| uint32_t u_size = size; | |||
| out_file.write(u_size); | |||
| out_file.write(data_refhold.get(), u_size); | |||
| } | |||
| InFilePersistentCache::BlobStorage& | |||
| InFilePersistentCache::BlobStorage::init_data_ref(const Blob& b) { | |||
| data_refhold = std::make_unique<uint8_t[]>(b.size + 1); | |||
| memcpy(data_refhold.get(), b.ptr, b.size); | |||
| data_refhold.get()[b.size] = 0; // for C-string safety | |||
| ptr = data_refhold.get(); | |||
| size = b.size; | |||
| return *this; | |||
| } | |||
| //////////////////////// InFilePersistentCache ////////////////////// | |||
| template <typename Input> | |||
| void InFilePersistentCache::read_cache(Input& inp) { | |||
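| // On-disk layout (documented in file_cache.h): <nr_category> followed, | |||
| // for each category, by its name and its key/value blob pairs. | |||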
| uint32_t nr_category; | |||
| inp.read(nr_category); | |||
| char category_buf[256]; | |||
| for (uint32_t i = 0; i < nr_category; i++) { | |||
| uint32_t category_size; | |||
| inp.read(category_size); | |||
| inp.read(category_buf, category_size); | |||
| category_buf[category_size] = '\0'; | |||
| std::string category(category_buf); | |||
| mgb_log_debug("load new category: %s", category_buf); | |||
| // read blobs | |||
| uint32_t nr_bobs; | |||
| inp.read(nr_bobs); | |||
| for (uint32_t j = 0; j < nr_bobs; j++) { | |||
| BlobStorage key_storage; | |||
| key_storage.init_from_input(inp).init_hash(); | |||
| mgb_log_debug("read key: %zu", key_storage.hash); | |||
| m_cache[category][std::move(key_storage)].init_from_input(inp); | |||
| } | |||
| } | |||
| } | |||
| InFilePersistentCache::InFilePersistentCache(const char* path, | |||
| bool always_open) { | |||
| if (!access(path, F_OK)) { | |||
| mgb_log_debug("use fastrun cache: %s", path); | |||
| InputFile inp(path); | |||
| read_cache<InputFile>(inp); | |||
| } | |||
| if (always_open) { | |||
| m_always_open_file = std::make_shared<OutputFile>(path); | |||
| } | |||
| } | |||
| InFilePersistentCache::InFilePersistentCache(const uint8_t* bin, size_t size) { | |||
| LITE_ASSERT(bin); | |||
| InputMemory inp(bin, size); | |||
| read_cache<InputMemory>(inp); | |||
| } | |||
| void InFilePersistentCache::dump_cache(const char* path) { | |||
| OutputFile out_file(path); | |||
| dump_cache(&out_file); | |||
| } | |||
| void InFilePersistentCache::dump_cache(OutputFile* out_file) { | |||
| uint32_t nr_category = m_cache.size(); | |||
| out_file->write(nr_category); | |||
| for (const auto& cached_category : m_cache) { | |||
| uint32_t category_size = cached_category.first.size(); | |||
| out_file->write(category_size); | |||
| out_file->write(cached_category.first.data(), category_size); | |||
| mgb_log_debug("write new category: %s", cached_category.first.c_str()); | |||
| uint32_t nr_bobs = cached_category.second.size(); | |||
| out_file->write(nr_bobs); | |||
| for (const auto& item : cached_category.second) { | |||
| mgb_log_debug("dump key: %zu", item.first.hash); | |||
| item.first.write_to_file(*out_file); | |||
| item.second.write_to_file(*out_file); | |||
| } | |||
| } | |||
| } | |||
| mgb::Maybe<InFilePersistentCache::Blob> InFilePersistentCache::get( | |||
| const std::string& category, const Blob& key) { | |||
| decltype(m_cache.begin()) iter0; | |||
| { | |||
| MGB_LOCK_GUARD(m_mtx); | |||
| iter0 = m_cache.find(category); | |||
| if (iter0 == m_cache.end()) | |||
| return mgb::None; | |||
| } | |||
| BlobStorage key_storage; | |||
| key_storage.Blob::operator=(key); | |||
| key_storage.init_hash(); | |||
| MGB_LOCK_GUARD(m_mtx); | |||
| auto iter1 = iter0->second.find(key_storage); | |||
| if (iter1 == iter0->second.end()) | |||
| return mgb::None; | |||
| return iter1->second; | |||
| } | |||
| void InFilePersistentCache::put(const std::string& category, const Blob& key, | |||
| const Blob& value) { | |||
| BlobStorage key_storage; | |||
| key_storage.init_data_ref(key).init_hash(); | |||
| MGB_LOCK_GUARD(m_mtx); | |||
| auto size0 = m_cache.size(); | |||
| m_cache[category][std::move(key_storage)].init_data_ref(value); | |||
| if (m_cache.size() > size0) { | |||
| mgb_log_debug("new cache category: %s", category.c_str()); | |||
| } | |||
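| // When an always-open output file was requested, rewrite the whole | |||
| // cache file after every insertion. | |||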
| if (m_always_open_file) { | |||
| m_always_open_file->set_head(); | |||
| dump_cache(m_always_open_file.get()); | |||
| m_always_open_file->flush(); | |||
| } | |||
| } | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,85 @@ | |||
| /** | |||
| * \file lite/src/mge/algo_cache/file_cache.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2020 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include "lite_build_config.h" | |||
| #if LITE_BUILD_WITH_MGE | |||
| #include "megbrain/utils/persistent_cache.h" | |||
| namespace lite { | |||
| /** | |||
| * Dump format of the persistent cache file: | |||
| * | |||
| * all integers are stored in native endianness (effectively little | |||
| * endian on the supported platforms) | |||
| * | |||
| * <nr_category|uint32_t>[<category_size|uint32_t><category|uint8_t*> | |||
| * <nr_blob|uint32_t>[<key_size|uint32_t><key|uint8_t*><data_size| | |||
| * uint32_t><data|uint8_t*>]*]* | |||
| */ | |||
| //! TODO: handle one thread setting the cache while other threads are still using the old cache | |||
| class InFilePersistentCache final : public mgb::PersistentCache { | |||
| class InputFile; | |||
| class InputMemory; | |||
| class OutputFile; | |||
| struct BlobStorage : public Blob { | |||
| std::unique_ptr<uint8_t[]> data_refhold; | |||
| size_t hash = 0; | |||
| template <typename Input> | |||
| BlobStorage& init_from_input(Input& inp); | |||
| void write_to_file(OutputFile& out_file) const; | |||
| BlobStorage& init_data_ref(const Blob& b); | |||
| BlobStorage& init_hash() { | |||
| hash = mgb::XXHash{}.update(ptr, size).digest(); | |||
| return *this; | |||
| } | |||
| bool operator==(const BlobStorage& rhs) const { | |||
| return size == rhs.size && !memcmp(ptr, rhs.ptr, size); | |||
| } | |||
| struct Hash { | |||
| size_t operator()(const BlobStorage& b) const { return b.hash; } | |||
| }; | |||
| }; | |||
| std::unordered_map<std::string, std::unordered_map<BlobStorage, BlobStorage, | |||
| BlobStorage::Hash>> | |||
| m_cache; | |||
| LITE_MUTEX m_mtx; | |||
| std::shared_ptr<OutputFile> m_always_open_file; | |||
| template <typename Input> | |||
| void read_cache(Input& inp); | |||
| public: | |||
| InFilePersistentCache() = default; | |||
| InFilePersistentCache(const char* path, bool always_open = false); | |||
| InFilePersistentCache(const uint8_t* bin, size_t size); | |||
| /** | |||
| * \warning You should invoke \c dump_cache manually to save the cache | |||
| * file. | |||
| */ | |||
| void dump_cache(const char* path); | |||
| void dump_cache(OutputFile* out_file); | |||
| mgb::Maybe<Blob> get(const std::string& category, const Blob& key) override; | |||
| void put(const std::string& category, const Blob& key, | |||
| const Blob& value) override; | |||
| }; | |||
| } // namespace lite | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,241 @@ | |||
| /** | |||
| * \file lite/src/mge/algo_cache/redis_cache.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2020 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "lite_build_config.h" | |||
| #if !defined(WIN32) && LITE_BUILD_WITH_MGE && LITE_WITH_CUDA | |||
| #include "../../misc.h" | |||
| #include "redis_cache.h" | |||
| #include <iostream> | |||
| #include <vector> | |||
| namespace { | |||
| /* | |||
| ** Translation Table as described in RFC1113 | |||
| */ | |||
| static const char cb64[] = | |||
| "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; | |||
| /* | |||
| ** Translation Table to decode: | |||
| *https://github.com/dgiardini/imgcalkap/blob/master/base64.c | |||
| */ | |||
| static const char cd64[] = | |||
| "|$$$}rstuvwxyz{$$$$$$$>?@ABCDEFGHIJKLMNOPQRSTUVW$$$$$$XYZ[\\]^_`" | |||
| "abcdefghijklmnopq"; | |||
| /* | |||
| ** encodeblock | |||
| ** | |||
| ** encode 3 8-bit binary bytes as 4 '6-bit' characters | |||
| */ | |||
| void encodeblock(unsigned char in[3], unsigned char out[4], int len) { | |||
| out[0] = cb64[in[0] >> 2]; | |||
| out[1] = cb64[((in[0] & 0x03) << 4) | ((in[1] & 0xf0) >> 4)]; | |||
| out[2] = (unsigned char)(len > 1 ? cb64[((in[1] & 0x0f) << 2) | | |||
| ((in[2] & 0xc0) >> 6)] | |||
| : '='); | |||
| out[3] = (unsigned char)(len > 2 ? cb64[in[2] & 0x3f] : '='); | |||
| } | |||
| /* | |||
| ** decodeblock | |||
| ** | |||
| ** decode 4 '6-bit' characters into 3 8-bit binary bytes | |||
| */ | |||
| void decodeblock(unsigned char in[4], unsigned char out[3]) { | |||
| out[0] = (unsigned char)(in[0] << 2 | in[1] >> 4); | |||
| out[1] = (unsigned char)(in[1] << 4 | in[2] >> 2); | |||
| out[2] = (unsigned char)(((in[2] << 6) & 0xc0) | in[3]); | |||
| } | |||
| /** | |||
| * Encode binary data to a base64 buffer | |||
| * @param input - source data | |||
| * @param outdata - target base64 buffer | |||
| * @param linesize - max length of an output line | |||
| */ | |||
| void encode(const std::vector<std::uint8_t>& input, | |||
| std::vector<std::uint8_t>& outdata, int linesize = 76) { | |||
| outdata.clear(); | |||
| unsigned char in[3], out[4]; | |||
| int i, len, blocksout = 0; | |||
| size_t j = 0; | |||
| auto* indata = reinterpret_cast<const unsigned char*>(input.data()); | |||
| unsigned int insize = input.size(); | |||
| while (j <= insize) { | |||
| len = 0; | |||
| for (i = 0; i < 3; i++) { | |||
| in[i] = (unsigned char)indata[j]; | |||
| j++; | |||
| if (j <= insize) { | |||
| len++; | |||
| } else { | |||
| in[i] = 0; | |||
| } | |||
| } | |||
| if (len) { | |||
| encodeblock(in, out, len); | |||
| for (i = 0; i < 4; i++) { | |||
| outdata.push_back(out[i]); | |||
| } | |||
| blocksout++; | |||
| } | |||
| if (blocksout >= (linesize / 4) || (j == insize)) { | |||
| if (blocksout) { | |||
| outdata.push_back('\r'); | |||
| outdata.push_back('\n'); | |||
| } | |||
| blocksout = 0; | |||
| } | |||
| } | |||
| } | |||
| /** | |||
| * Decode a base64 string back to the source data | |||
| * @param input - base64 string | |||
| * @param outdata - source string | |||
| */ | |||
| void decode(const std::vector<std::uint8_t>& input, | |||
| std::vector<std::uint8_t>& outdata) { | |||
| outdata.clear(); | |||
| unsigned char in[4], out[3], v; | |||
| int i, len; | |||
| size_t j = 0; | |||
| auto* indata = reinterpret_cast<const unsigned char*>(input.data()); | |||
| unsigned int insize = input.size(); | |||
| while (j <= insize) { | |||
| for (len = 0, i = 0; i < 4 && (j <= insize); i++) { | |||
| v = 0; | |||
| while ((j <= insize) && v == 0) { | |||
| v = (unsigned char)indata[j++]; | |||
| v = (unsigned char)((v < 43 || v > 122) ? 0 : cd64[v - 43]); | |||
| if (v) { | |||
| v = (unsigned char)((v == '$') ? 0 : v - 61); | |||
| } | |||
| } | |||
| if (j <= insize) { | |||
| len++; | |||
| if (v) { | |||
| in[i] = (unsigned char)(v - 1); | |||
| } | |||
| } else { | |||
| in[i] = 0; | |||
| } | |||
| } | |||
| if (len) { | |||
| decodeblock(in, out); | |||
| for (i = 0; i < len - 1; i++) { | |||
| outdata.push_back(out[i]); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| /** | |||
| * Encode a string to a base64 string | |||
| * @param input - source string | |||
| * @param outdata - target base64 string | |||
| * @param linesize - max length of an output line | |||
| */ | |||
| void encode(const std::string& input, std::string& outdata, int linesize = 76) { | |||
| std::vector<std::uint8_t> out; | |||
| std::vector<std::uint8_t> in(input.begin(), input.end()); | |||
| encode(in, out, linesize); | |||
| outdata = std::string(out.begin(), out.end()); | |||
| } | |||
| /** | |||
| * Decode base64 buffer to source binary data | |||
| * @param input - base64 buffer | |||
| * @param outdata - source binary data | |||
| */ | |||
| void decode(const std::string& input, std::string& outdata) { | |||
| std::vector<std::uint8_t> in(input.begin(), input.end()); | |||
| std::vector<std::uint8_t> out; | |||
| decode(in, out); | |||
| outdata = std::string(out.begin(), out.end()); | |||
| } | |||
| } // namespace | |||
| using namespace lite; | |||
| RedisCache::RedisCache(std::string redis_ip, size_t port, std::string password) | |||
| : m_ip(redis_ip), m_port(port), m_password(password) { | |||
| m_client.auth(password); | |||
| m_client.connect( | |||
| m_ip, m_port, | |||
| [](const std::string& host, std::size_t port, | |||
| cpp_redis::connect_state status) { | |||
| if (status == cpp_redis::connect_state::dropped) { | |||
| LITE_LOG("client disconnected from %s.", host.c_str()); | |||
| LITE_LOG("Redis server connect to %s :%zu failed.", | |||
| host.c_str(), port); | |||
| } | |||
| }, | |||
| std::uint32_t(200)); | |||
| } | |||
| mgb::Maybe<mgb::PersistentCache::Blob> RedisCache::get( | |||
| const std::string& category, const mgb::PersistentCache::Blob& key) { | |||
| LITE_LOCK_GUARD(m_mtx); | |||
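| // Two-level lookup: consult the wrapped in-memory cache first, then | |||
| // Redis; a Redis hit is written back into the in-memory cache. | |||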
| if (m_old == nullptr) { | |||
| return mgb::None; | |||
| } | |||
| auto mem_result = m_old->get(category, key); | |||
| if (mem_result.valid()) | |||
| return mem_result; | |||
| std::string key_str(static_cast<const char*>(key.ptr), key.size); | |||
| std::string redis_key_str; | |||
| encode(category + '@' + key_str, redis_key_str, 24); | |||
| auto result = m_client.get(redis_key_str); | |||
| m_client.sync_commit<double, std::milli>(std::chrono::milliseconds(100)); | |||
| LITE_ASSERT(is_valid()); | |||
| auto content = result.get(); | |||
| if (content.is_null()) | |||
| return mgb::None; | |||
| std::string decode_content; | |||
| decode(content.as_string(), decode_content); | |||
| m_old->put(category, key, {decode_content.data(), decode_content.length()}); | |||
| return m_old->get(category, key); | |||
| } | |||
| void RedisCache::put(const std::string& category, const Blob& key, | |||
| const mgb::PersistentCache::Blob& value) { | |||
| // ScopedTimer t1(std::string("put") + category); | |||
| LITE_LOCK_GUARD(m_mtx); | |||
| std::string key_str(static_cast<const char*>(key.ptr), key.size); | |||
| std::string redis_key_str; | |||
| encode(category + '@' + key_str, redis_key_str); | |||
| std::string value_str(static_cast<const char*>(value.ptr), value.size); | |||
| std::string redis_value_str; | |||
| encode(value_str, redis_value_str); | |||
| auto result = m_client.set(redis_key_str, redis_value_str); | |||
| if (m_old == nullptr) { | |||
| return; | |||
| } | |||
| m_old->put(category, key, value); | |||
| m_client.sync_commit<double, std::milli>(std::chrono::milliseconds(100)); | |||
| LITE_ASSERT(is_valid()); | |||
| } | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,47 @@ | |||
| /** | |||
| * \file lite/src/mge/algo_cache/redis_cache.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2020 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include "lite_build_config.h" | |||
| #if !defined(WIN32) && LITE_BUILD_WITH_MGE && LITE_WITH_CUDA | |||
| #include <cpp_redis/cpp_redis> | |||
| #include <string> | |||
| #include <vector> | |||
| #include "megbrain/utils/persistent_cache.h" | |||
| namespace lite { | |||
| //! TODO: handle one thread setting the cache while other threads are still using the old cache | |||
| class RedisCache final : public mgb::PersistentCache { | |||
| public: | |||
| RedisCache(std::string redis_ip, size_t port, std::string password); | |||
| bool is_valid() { return m_client.is_connected(); } | |||
| ~RedisCache() {} | |||
| void init(std::shared_ptr<mgb::PersistentCache> old) { m_old = old; } | |||
| mgb::Maybe<Blob> get(const std::string& category, const Blob& key) override; | |||
| void put(const std::string& category, const Blob& key, | |||
| const Blob& value) override; | |||
| private: | |||
| std::shared_ptr<mgb::PersistentCache> m_old; | |||
| LITE_MUTEX m_mtx; | |||
| cpp_redis::client m_client; | |||
| const std::string m_ip; | |||
| const size_t m_port; | |||
| const std::string m_password; | |||
| }; | |||
| } // namespace lite | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,191 @@ | |||
| /** | |||
| * \file src/mge/common.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "lite_build_config.h" | |||
| #if LITE_BUILD_WITH_MGE | |||
| #include "common.h" | |||
| #include "megdnn/dtype.h" | |||
| using namespace lite; | |||
| using namespace mgb; | |||
| enum class CompressionMethod { | |||
| NO_COMPRESSION = 0, | |||
| FLOAT32_STRIDE_FLOAT32_BASE_UINT8_WEIGHTS = 1, | |||
| FLOAT32_STRIDE_FLOAT32_BASE_UINT16_WEIGHTS = 2, | |||
| }; | |||
| void lite::decompressed_tensor_value_loader( | |||
| void* ptr_, const mgb::TensorLayout& layout, | |||
| mgb::serialization::InputFile& fin) { | |||
| uint8_t compress_flag; | |||
| fin.read(&compress_flag, sizeof(compress_flag)); | |||
| size_t num_weights = layout.total_nr_elems(); | |||
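| // The tensor payload starts with a one-byte compression flag; the | |||
| // quantized variants store a float stride, a float base and the raw | |||
| // integer weights, decompressed as stride * w + base. | |||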
| switch (CompressionMethod(compress_flag)) { | |||
| case CompressionMethod::NO_COMPRESSION: { | |||
| mgb::serialization::GraphLoadConfig::default_tensor_value_loader( | |||
| ptr_, layout, fin); | |||
| break; | |||
| } | |||
| case CompressionMethod::FLOAT32_STRIDE_FLOAT32_BASE_UINT8_WEIGHTS: { | |||
| if (ptr_) { | |||
| float stride, base; | |||
| std::vector<uint8_t> weights(num_weights); | |||
| fin.read(&stride, sizeof(stride)); | |||
| fin.read(&base, sizeof(base)); | |||
| fin.read(weights.data(), num_weights * sizeof(uint8_t)); | |||
| auto* ptr = static_cast<float*>(ptr_); | |||
| for (size_t i = 0; i < num_weights; ++i) | |||
| ptr[i] = stride * weights[i] + base; | |||
| } else { | |||
| fin.skip(sizeof(float) * 2 + num_weights * sizeof(uint8_t)); | |||
| } | |||
| break; | |||
| } | |||
| case CompressionMethod::FLOAT32_STRIDE_FLOAT32_BASE_UINT16_WEIGHTS: { | |||
| if (ptr_) { | |||
| float stride, base; | |||
| std::vector<uint16_t> weights(num_weights); | |||
| fin.read(&stride, sizeof(stride)); | |||
| fin.read(&base, sizeof(base)); | |||
| fin.read(weights.data(), num_weights * sizeof(uint16_t)); | |||
| auto* ptr = static_cast<float*>(ptr_); | |||
| for (size_t i = 0; i < num_weights; ++i) | |||
| ptr[i] = stride * weights[i] + base; | |||
| } else { | |||
| fin.skip(sizeof(float) * 2 + num_weights * sizeof(uint16_t)); | |||
| } | |||
| break; | |||
| } | |||
| default: | |||
| LITE_THROW("Unexpected compression method"); | |||
| } | |||
| } | |||
| LTensorLayout lite::to_impl_layout(const Layout& layout) { | |||
| mgb::TensorLayout mge_layout; | |||
| mge_layout.ndim = layout.ndim; | |||
| LITE_ASSERT(layout.ndim < TensorShape::MAX_NDIM, | |||
| "lite layout ndim is to large"); | |||
| for (size_t i = 0; i < layout.ndim; i++) { | |||
| mge_layout.shape[i] = layout.shapes[i]; | |||
| } | |||
| mge_layout.init_contiguous_stride(); | |||
| switch (layout.data_type) { | |||
| case LiteDataType::LITE_FLOAT: | |||
| mge_layout.dtype = mgb::dtype::Float32(); | |||
| break; | |||
| case LiteDataType::LITE_HALF: | |||
| mge_layout.dtype = mgb::dtype::Float16(); | |||
| break; | |||
| case LiteDataType::LITE_INT: | |||
| mge_layout.dtype = mgb::dtype::Int32(); | |||
| break; | |||
| case LiteDataType::LITE_INT8: | |||
| mge_layout.dtype = mgb::dtype::Int8(); | |||
| break; | |||
| case LiteDataType::LITE_UINT8: | |||
| mge_layout.dtype = mgb::dtype::Uint8(); | |||
| break; | |||
| case LiteDataType::LITE_INT16: | |||
| mge_layout.dtype = mgb::dtype::Int16(); | |||
| break; | |||
| default: | |||
| LITE_THROW(mgb::ssprintf("unsupport dtype in lite enum id is %d.", | |||
| static_cast<int>(layout.data_type))); | |||
| } | |||
| return mge_layout; | |||
| } | |||
| Layout lite::to_lite_layout(const LTensorLayout& mge_layout) { | |||
| Layout layout; | |||
| if (!mge_layout.dtype.valid()) { | |||
| return layout; | |||
| } | |||
| layout.ndim = mge_layout.ndim; | |||
| LITE_ASSERT(layout.ndim < layout.MAXDIM, "tensor layout ndim is too large"); | |||
| for (size_t i = 0; i < layout.ndim; i++) { | |||
| layout.shapes[i] = mge_layout.shape[i]; | |||
| } | |||
| switch (mge_layout.dtype.enumv()) { | |||
| case mgb::DTypeEnum::Float32: | |||
| layout.data_type = LiteDataType::LITE_FLOAT; | |||
| break; | |||
| case mgb::DTypeEnum::Float16: | |||
| layout.data_type = LiteDataType::LITE_HALF; | |||
| break; | |||
| case mgb::DTypeEnum::Int32: | |||
| layout.data_type = LiteDataType::LITE_INT; | |||
| break; | |||
| case mgb::DTypeEnum::Int16: | |||
| layout.data_type = LiteDataType::LITE_INT16; | |||
| break; | |||
| case mgb::DTypeEnum::Int8: | |||
| layout.data_type = LiteDataType::LITE_INT8; | |||
| break; | |||
| case mgb::DTypeEnum::Uint8: | |||
| layout.data_type = LiteDataType::LITE_UINT8; | |||
| break; | |||
| default: | |||
| LITE_THROW(mgb::ssprintf("unsupport dtype in lite : %s.", | |||
| mge_layout.to_string().c_str())); | |||
| } | |||
| return layout; | |||
| } | |||
| mgb::CompNode::Locator lite::to_compnode_locator(const LiteDeviceType& device) { | |||
| mgb::CompNode::Locator loc; | |||
| switch (device) { | |||
| case LiteDeviceType::LITE_CPU: | |||
| loc.type = mgb::CompNode::DeviceType::CPU; | |||
| break; | |||
| case LiteDeviceType::LITE_CUDA: | |||
| loc.type = mgb::CompNode::DeviceType::CUDA; | |||
| break; | |||
| case LiteDeviceType::LITE_ATLAS: | |||
| loc.type = mgb::CompNode::DeviceType::ATLAS; | |||
| break; | |||
| case LiteDeviceType::LITE_OPENCL: | |||
| loc.type = mgb::CompNode::DeviceType::OPENCL; | |||
| break; | |||
| case LiteDeviceType::LITE_DEVICE_DEFAULT: | |||
| loc.type = mgb::CompNode::DeviceType::UNSPEC; | |||
| break; | |||
| default: | |||
| LITE_THROW( | |||
| ssprintf("lite unsupported compnode type: enum value: %d.", | |||
| (int)(device))); | |||
| } | |||
| return loc; | |||
| } | |||
| LiteDeviceType lite::get_device_from_locator( | |||
| const mgb::CompNode::Locator& locator) { | |||
| switch (locator.type) { | |||
| case mgb::CompNode::DeviceType::CPU: | |||
| case mgb::CompNode::DeviceType::MULTITHREAD: | |||
| return LiteDeviceType::LITE_CPU; | |||
| case mgb::CompNode::DeviceType::CUDA: | |||
| return LiteDeviceType::LITE_CUDA; | |||
| case mgb::CompNode::DeviceType::ATLAS: | |||
| return LiteDeviceType::LITE_ATLAS; | |||
| case mgb::CompNode::DeviceType::OPENCL: | |||
| return LiteDeviceType::LITE_OPENCL; | |||
| case mgb::CompNode::DeviceType::UNSPEC: | |||
| return LiteDeviceType::LITE_DEVICE_DEFAULT; | |||
| default: | |||
| LITE_THROW( | |||
| ssprintf("lite unsupported compnode type: enum value: %d.", | |||
| (int)(locator.type))); | |||
| } | |||
| } | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,66 @@ | |||
| /** | |||
| * \file src/mge/common.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include "lite_build_config.h" | |||
| #if LITE_BUILD_WITH_MGE | |||
| #include "../misc.h" | |||
| #include "lite/network.h" | |||
| #include "lite/tensor.h" | |||
| #include "megbrain/comp_node.h" | |||
| #include "megbrain/serialization/serializer.h" | |||
| #include "megbrain/tensor.h" | |||
| //! alias the MegEngine (mgb) types with an L prefix | |||
| namespace lite { | |||
| using LTensorLayout = mgb::TensorLayout; | |||
| using LComputingGraph = mgb::ComputingGraph; | |||
| using LDeviceTensorStorage = mgb::DeviceTensorStorage; | |||
| } // namespace lite | |||
| namespace lite { | |||
| /*! | |||
| * \brief transform mgelite Layout to mgb TensorLayout | |||
| */ | |||
| LTensorLayout to_impl_layout(const Layout& layout); | |||
| /*! | |||
| * \brief transform mgb TensorLayout to mgelite Layout | |||
| */ | |||
| Layout to_lite_layout(const mgb::TensorLayout& mge_layout); | |||
| /*! | |||
| * \brief transform mgelite device to mgb CompNode Locator | |||
| */ | |||
| mgb::CompNode::Locator to_compnode_locator(const LiteDeviceType& device); | |||
| /*! | |||
| * \brief transform mgb CompNode Locator to lite Device | |||
| */ | |||
| LiteDeviceType get_device_from_locator(const mgb::CompNode::Locator& locator); | |||
| /*! \brief A megbrain tensor loader with weight decompression. | |||
| * | |||
| * The weight to be compressed must start with a byte of compression flag (CF). | |||
| * | |||
| * 1. CF = 0: no compression. | |||
| * 2. CF = 1: float32 stride + float32 base + uint8 weight (return s*w+b) | |||
| * 3. CF = 2: float32 stride + float32 base + uint16 weight (return s*w+b) | |||
| * | |||
| */ | |||
| void decompressed_tensor_value_loader(void* ptr_, | |||
| const mgb::TensorLayout& layout, | |||
| mgb::serialization::InputFile& fin); | |||
| } // namespace lite | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,212 @@ | |||
| /** | |||
| * \file src/mge/function_dft.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #if LITE_BUILD_WITH_MGE | |||
| #include "function_base.h" | |||
| #include "network_impl.h" | |||
| #include "network_impl_base.h" | |||
| #include "tensor_impl.h" | |||
| namespace lite { | |||
| #define THROW_FUNC_ERROR(func_name) \ | |||
| auto msg_info = func_name + " is not aviliable in Dft backend."; \ | |||
| LITE_THROW(msg_info.c_str()) | |||
| // the functions used for dft's tensor.cpp are as followed: | |||
| template <> | |||
| inline std::shared_ptr<Tensor::TensorImplBase> | |||
| call_func<TensorImplDft, std::shared_ptr<Tensor::TensorImplBase>>( | |||
| std::string func_name) { | |||
| if (func_name == "create_tensor") { | |||
| return std::make_shared<TensorImplDft>(); | |||
| } | |||
| THROW_FUNC_ERROR(func_name); | |||
| } | |||
| template <> | |||
| inline std::shared_ptr<Tensor::TensorImplBase> | |||
| call_func<TensorImplDft, std::shared_ptr<Tensor::TensorImplBase>>( | |||
| std::string func_name, LiteDeviceType device_type, | |||
| bool is_pinned_host) { | |||
| if (func_name == "create_tensor") { | |||
| return std::make_shared<TensorImplDft>(device_type, is_pinned_host); | |||
| } | |||
| THROW_FUNC_ERROR(func_name); | |||
| } | |||
| template <> | |||
| inline std::shared_ptr<Tensor::TensorImplBase> | |||
| call_func<TensorImplDft, std::shared_ptr<Tensor::TensorImplBase>>( | |||
| std::string func_name, int device_id, LiteDeviceType device_type, | |||
| const Layout layout, bool is_pinned_host) { | |||
| if (func_name == "create_tensor") { | |||
| return std::make_shared<TensorImplDft>(device_id, device_type, layout, | |||
| is_pinned_host); | |||
| } | |||
| THROW_FUNC_ERROR(func_name); | |||
| } | |||
| template <> | |||
| inline std::shared_ptr<Tensor::TensorImplBase> | |||
| call_func<TensorImplDft, std::shared_ptr<Tensor::TensorImplBase>>( | |||
| std::string func_name, LiteDeviceType device_type, const Layout layout, | |||
| bool is_pinned_host) { | |||
| if (func_name == "create_tensor") { | |||
| return std::make_shared<TensorImplDft>(device_type, layout, | |||
| is_pinned_host); | |||
| } | |||
| THROW_FUNC_ERROR(func_name); | |||
| } | |||
| template <> | |||
| inline std::shared_ptr<Tensor::TensorImplBase> | |||
| call_func<TensorImplDft, std::shared_ptr<Tensor::TensorImplBase>>( | |||
| std::string func_name, int device_id, int stream_id, | |||
| LiteDeviceType device_type, bool is_pinned_host) { | |||
| if (func_name == "create_tensor") { | |||
| return std::make_shared<TensorImplDft>(device_id, stream_id, | |||
| device_type, is_pinned_host); | |||
| } | |||
| THROW_FUNC_ERROR(func_name); | |||
| } | |||
| // the functions used for dft's network.cpp are as followed: | |||
| template <> | |||
| inline std::unique_ptr<Network::NetworkImplBase> | |||
| call_func<NetworkImplDft, std::unique_ptr<Network::NetworkImplBase>>( | |||
| std::string func_name) { | |||
| if (func_name == "create_network") { | |||
| return std::make_unique<NetworkImplDft>(); | |||
| } | |||
| THROW_FUNC_ERROR(func_name); | |||
| } | |||
| template <> | |||
| inline Network::NetworkImplBase* | |||
| try_call_func<NetworkImplDft, Network::NetworkImplBase*>( | |||
| std::string func_name) { | |||
| if (func_name == "parse_model") { | |||
| return new NetworkImplDft(); | |||
| } | |||
| THROW_FUNC_ERROR(func_name); | |||
| } | |||
| #define CALL_FUNC(func_name, ...) \ | |||
| network_impl->cast_final_safe<NetworkImplDft>().func_name(__VA_ARGS__) | |||
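| // Forward the call to the Dft (MegEngine) backend implementation via a | |||
| // checked downcast of the network implementation. | |||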
| template <> | |||
| inline void call_func<NetworkImplDft, void>( | |||
| std::string func_name, Network::NetworkImplBase* network_impl, | |||
| size_t num) { | |||
| if (func_name == "set_cpu_threads_number") { | |||
| CALL_FUNC(set_cpu_threads_number, num); | |||
| } else if (func_name == "set_network_algo_workspace_limit") { | |||
| CALL_FUNC(set_network_algo_workspace_limit, num); | |||
| } else { | |||
| THROW_FUNC_ERROR(func_name); | |||
| } | |||
| } | |||
| template <> | |||
| inline void call_func<NetworkImplDft, void>( | |||
| std::string func_name, Network::NetworkImplBase* network_impl) { | |||
| if (func_name == "use_tensorrt") { | |||
| CALL_FUNC(use_tensorrt); | |||
| } else if (func_name == "set_cpu_inplace_mode") { | |||
| CALL_FUNC(set_cpu_inplace_mode); | |||
| } else { | |||
| THROW_FUNC_ERROR(func_name); | |||
| } | |||
| } | |||
| template <> | |||
| inline size_t call_func<NetworkImplDft, size_t>( | |||
| std::string func_name, Network::NetworkImplBase* network_impl) { | |||
| if (func_name == "get_cpu_threads_number") { | |||
| return CALL_FUNC(get_cpu_threads_number); | |||
| } | |||
| THROW_FUNC_ERROR(func_name); | |||
| } | |||
| template <> | |||
| inline bool call_func<NetworkImplDft, bool>( | |||
| std::string func_name, Network::NetworkImplBase* network_impl) { | |||
| if (func_name == "is_cpu_inplace_mode") { | |||
| return CALL_FUNC(is_cpu_inplace_mode); | |||
| } | |||
| THROW_FUNC_ERROR(func_name); | |||
| } | |||
| template <> | |||
| inline void call_func<NetworkImplDft, void>( | |||
| std::string func_name, Network::NetworkImplBase* network_impl, | |||
| ThreadAffinityCallback thread_affinity_callback) { | |||
| if (func_name == "set_runtime_thread_affinity") { | |||
| return CALL_FUNC(set_runtime_thread_affinity, | |||
| std::move(thread_affinity_callback)); | |||
| } | |||
| THROW_FUNC_ERROR(func_name); | |||
| } | |||
| template <> | |||
| inline void call_func<NetworkImplDft, void>( | |||
| std::string func_name, Network::NetworkImplBase* network_impl, | |||
| LiteAlgoSelectStrategy strategy, uint32_t shared_batch_size, | |||
| bool binary_equal_between_batch) { | |||
| if (func_name == "set_network_algo_policy") { | |||
| return CALL_FUNC(set_network_algo_policy, strategy, shared_batch_size, | |||
| binary_equal_between_batch); | |||
| } | |||
| THROW_FUNC_ERROR(func_name); | |||
| } | |||
| template <> | |||
| inline void call_func<NetworkImplDft, void>( | |||
| std::string func_name, Network::NetworkImplBase* network_impl, | |||
| std::shared_ptr<Allocator> user_allocator) { | |||
| if (func_name == "set_memory_allocator") { | |||
| return CALL_FUNC(set_memory_allocator, user_allocator); | |||
| } | |||
| THROW_FUNC_ERROR(func_name); | |||
| } | |||
| template <> | |||
| inline void call_func<NetworkImplDft, void>( | |||
| std::string func_name, Network::NetworkImplBase* network_impl, | |||
| std::string file_name) { | |||
| if (func_name == "enable_io_txt_dump") { | |||
| return CALL_FUNC(enable_io_txt_dump, file_name); | |||
| } else if (func_name == "enable_io_bin_dump") { | |||
| return CALL_FUNC(enable_io_bin_dump, file_name); | |||
| } | |||
| THROW_FUNC_ERROR(func_name); | |||
| } | |||
| template <> | |||
| inline void call_func<NetworkImplDft, void>( | |||
| std::string func_name, Network::NetworkImplBase* network_impl, | |||
| Network::NetworkImplBase* src_network_impl) { | |||
| if (func_name == "share_runtime_memory_with") { | |||
| CALL_FUNC(share_runtime_memory_with, src_network_impl); | |||
| } else if (func_name == "shared_weight_with") { | |||
| CALL_FUNC(shared_weight_with, src_network_impl); | |||
| } else { | |||
| THROW_FUNC_ERROR(func_name); | |||
| } | |||
| } | |||
| #undef THROW_FUNC_ERROR | |||
| } // namespace lite | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,69 @@ | |||
| /** | |||
| * \file src/mge/memory_allocator.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include "lite_build_config.h" | |||
| #if LITE_BUILD_WITH_MGE | |||
| #include "common.h" | |||
| #include "megbrain/dtype.h" | |||
| #include "network_impl.h" | |||
| #include "megbrain/graph/cg.h" | |||
| namespace lite { | |||
| class UserStaticMemAlloc final : public mgb::cg::DeviceMemoryAllocator { | |||
| std::shared_ptr<Allocator> m_allocator = nullptr; | |||
| public: | |||
| UserStaticMemAlloc(std::shared_ptr<Allocator> allocator) | |||
| : m_allocator(allocator) {} | |||
| void alloc_static(LComputingGraph*, LDeviceTensorStorage& dest, | |||
| size_t size) override { | |||
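| // Reuse the existing storage when it is already large enough. | |||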
| if (size < dest.size()) { | |||
| return; | |||
| } | |||
| auto cn = dest.comp_node_allow_invalid(); | |||
| LITE_ASSERT(cn.valid(), "The compnode is invalid when alloc memory."); | |||
| LiteDeviceType device_type = | |||
| get_device_from_locator(cn.locator_logical()); | |||
| int device_id = cn.locator_logical().device; | |||
| auto ptr_alloc = static_cast<mgb::dt_byte*>(m_allocator->allocate( | |||
| device_type, device_id, size, cn.get_mem_addr_alignment())); | |||
| auto storage = std::shared_ptr<mgb::dt_byte>( | |||
| ptr_alloc, | |||
| [allocator = m_allocator, device_type, device_id](void* ptr) { | |||
| allocator->free(device_type, device_id, ptr); | |||
| }); | |||
| dest.reset(cn, size, storage); | |||
| } | |||
| void alloc_dynamic(mgb::VarNode*, mgb::DeviceTensorStorage& dest, | |||
| size_t size) override { | |||
| alloc_static(nullptr, dest, size); | |||
| } | |||
| void defrag_prealloc_contig(mgb::ComputingGraph*, mgb::CompNode comp_node, | |||
| size_t size) override { | |||
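| // Ask the user allocator for a contiguous block of the requested size | |||
| // and release it immediately. | |||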
| LiteDeviceType device_type = | |||
| get_device_from_locator(comp_node.locator_logical()); | |||
| int device_id = comp_node.locator_logical().device; | |||
| auto ptr_tmp = | |||
| m_allocator->allocate(device_type, device_id, size, | |||
| comp_node.get_mem_addr_alignment()); | |||
| m_allocator->free(device_type, device_id, ptr_tmp); | |||
| } | |||
| }; | |||
| } // namespace lite | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,781 @@ | |||
| /** | |||
| * \file src/mge/network_impl.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "lite_build_config.h" | |||
| #if LITE_BUILD_WITH_MGE | |||
| #include "network_impl.h" | |||
| #include "common.h" | |||
| #include "lite/network.h" | |||
| #include "memory_allocator.h" | |||
| #include "parse_model/model_parser.h" | |||
| #include "parse_info/parse_info_base.h" | |||
| #include "megbrain/common.h" | |||
| #include "megbrain/comp_node.h" | |||
| #include "megbrain/comp_node_env.h" | |||
| #include "megbrain/gopt/inference.h" | |||
| #include "megbrain/graph.h" | |||
| #include "megbrain/graph/cg.h" | |||
| #include "megbrain/opr/io.h" | |||
| #include "megbrain/tensor.h" | |||
| #if MGB_OPENCL | |||
| #include "megcore_opencl.h" | |||
| #endif | |||
| #include <fstream> | |||
| #include <memory> | |||
| #include <set> | |||
| using namespace lite; | |||
| using namespace mgb; | |||
| LITE_DYN_TYPE_OBJ_FINAL_IMPL(NetworkImplDft); | |||
| void NetworkImplDft::set_config(const Config& config) { | |||
| m_user_config = std::make_unique<Config>(); | |||
| *m_user_config = config; | |||
| m_load_config.comp_graph = mgb::ComputingGraph::make(); | |||
| m_compnode_locator = to_compnode_locator(m_user_config->device_type); | |||
| m_compnode_locator.device = config.device_id; | |||
| } | |||
| void NetworkImplDft::shared_weight_with(const NetworkImplBase* src_network) { | |||
| application_config(); | |||
| const auto& src_impl = src_network->cast_final_safe<NetworkImplDft>(); | |||
| LITE_ASSERT(src_impl.m_loader, | |||
| "Clone network must after the network is loaded."); | |||
| m_load_result = src_impl.m_loader->load(m_load_config, true); | |||
| //! flag whether the model is a cross-compnode model | |||
| cross_compnode_model_detect(); | |||
| //! update the IO of the network | |||
| update_io(); | |||
| //! replace the IO when there is device input or output | |||
| compile_graph(); | |||
| } | |||
| void NetworkImplDft::application_config() { | |||
| auto device_type = m_user_config->device_type; | |||
| m_compnode_locator.type = to_compnode_locator(device_type).type; | |||
| m_compnode_locator.device = m_user_config->device_id; | |||
| if (m_nr_threads > 1 && device_type == LiteDeviceType::LITE_CPU) { | |||
| m_compnode_locator.type = mgb::CompNode::DeviceType::MULTITHREAD; | |||
| m_compnode_locator.device = m_user_config->device_id; | |||
| } | |||
| //! model options | |||
| #define ConfigOption(mge_name, lite_name) \ | |||
| options.mge_name = m_user_config->options.lite_name; | |||
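| //! e.g. ConfigOption(graph_opt.jit, jit_level) expands to | |||
| //!     options.graph_opt.jit = m_user_config->options.jit_level; | |||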
| auto&& options = m_load_config.comp_graph->options(); | |||
| ConfigOption(graph_opt.weight_preprocess, weight_preprocess); | |||
| ConfigOption(graph_opt.fuse_preprocess, fuse_preprocess); | |||
| ConfigOption(fake_next_exec, fake_next_exec); | |||
| ConfigOption(var_sanity_check_first_run, var_sanity_check_first_run); | |||
| m_load_config.const_var_shape = m_user_config->options.const_shape; | |||
| ConfigOption(force_dynamic_alloc, force_dynamic_alloc); | |||
| ConfigOption(force_output_dynamic_alloc, force_output_dynamic_alloc); | |||
| ConfigOption(no_profiling_on_shape_change, no_profiling_on_shape_change); | |||
| LITE_ASSERT(m_user_config->options.jit_level == 0 || | |||
| (m_user_config->options.jit_level > 0 && | |||
| device_type == LiteDeviceType::LITE_CUDA), | |||
| "jit only support in cuda device."); | |||
| ConfigOption(graph_opt.jit, jit_level); | |||
| ConfigOption(comp_node_seq_record_level, comp_node_seq_record_level); | |||
| ConfigOption(graph_opt_level, graph_opt_level); | |||
| ConfigOption(async_exec_level, async_exec_level); | |||
| #undef ConfigOption | |||
| #define ConfigOptionLayoutTransform(name) \ | |||
| if (m_user_config->options.name) { \ | |||
| options.graph_opt.name(); \ | |||
| } | |||
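| //! e.g. ConfigOptionLayoutTransform(enable_nchw44) expands to | |||
| //!     if (m_user_config->options.enable_nchw44) { options.graph_opt.enable_nchw44(); } | |||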
| ConfigOptionLayoutTransform(enable_nchw44); | |||
| ConfigOptionLayoutTransform(enable_nchw44_dot); | |||
| ConfigOptionLayoutTransform(enable_nchw88); | |||
| ConfigOptionLayoutTransform(enable_nhwcd4); | |||
| ConfigOptionLayoutTransform(enable_nchw4); | |||
| ConfigOptionLayoutTransform(enable_nchw32); | |||
| ConfigOptionLayoutTransform(enable_nchw64); | |||
| #undef ConfigOptionLayoutTransform | |||
| if (m_user_config->has_compression) { | |||
| m_load_config.tensor_value_loader = decompressed_tensor_value_loader; | |||
| } | |||
| //! if device is LITE_DEVICE_DEFAULT, keep the compnode information stored in the model | |||
| if (device_type != LiteDeviceType::LITE_DEVICE_DEFAULT) { | |||
| //! currently the Locator type is not set because an atlas mgb model is a | |||
| //! cross-compnode graph | |||
| if (device_type == LiteDeviceType::LITE_ATLAS) { | |||
| m_load_config.comp_node_mapper = | |||
| [this](mgb::CompNode::Locator& loc) { | |||
| if (loc.type == mgb::CompNode::DeviceType::ATLAS) { | |||
| loc.device = m_compnode_locator.device; | |||
| loc.stream = m_compnode_locator.stream; | |||
| } else if (loc.type == | |||
| mgb::CompNode::DeviceType::MULTITHREAD) { | |||
| loc.stream = m_nr_threads; | |||
| } | |||
| }; | |||
| } else { | |||
| m_load_config.comp_node_mapper = | |||
| [this](mgb::CompNode::Locator& loc) { | |||
| loc = m_compnode_locator; | |||
| }; | |||
| } | |||
| } | |||
| } | |||
| void NetworkImplDft::set_memory_allocator( | |||
| std::shared_ptr<Allocator> user_allocator) { | |||
| auto allocator = std::make_shared<UserStaticMemAlloc>(user_allocator); | |||
| LITE_ASSERT(m_load_config.comp_graph); | |||
| m_load_config.comp_graph->set_device_memory_allocator(allocator); | |||
| } | |||
| //! share the runtime memory with another network; the weights are not shared | |||
| void NetworkImplDft::share_runtime_memory_with( | |||
| Network::NetworkImplBase* network_impl) { | |||
| LITE_ASSERT(network_impl); | |||
| LITE_ASSERT(m_load_config.comp_graph); | |||
| m_load_config.comp_graph->share_device_memory_with( | |||
| *(network_impl->cast_final_safe<NetworkImplDft>() | |||
| .m_load_config.comp_graph)); | |||
| } | |||
| void NetworkImplDft::set_cpu_inplace_mode() { | |||
| LITE_ASSERT(m_user_config->device_type == LiteDeviceType::LITE_CPU, | |||
| "cpu inplace mode is only avaliable in CPU."); | |||
| m_is_cpu_inplace_mode = true; | |||
| if (m_compnode_locator.type == mgb::CompNode::DeviceType::CPU) { | |||
| m_compnode_locator.device = mgb::CompNode::Locator::DEVICE_CPU_DEFAULT; | |||
| } else { | |||
| LITE_ASSERT( | |||
| m_compnode_locator.type == CompNode::DeviceType::MULTITHREAD, | |||
| "cpu inplace mode is only avaliable in CPU."); | |||
| m_compnode_locator.device = | |||
| mgb::CompNode::Locator::DEVICE_MULTITHREAD_DEFAULT; | |||
| } | |||
| } | |||
| void NetworkImplDft::set_cpu_threads_number(size_t nr_threads) { | |||
| LITE_ASSERT(m_user_config->device_type == LiteDeviceType::LITE_CPU, | |||
| "multi threads mode is only avaliable in CPU."); | |||
| if (nr_threads > 1) { | |||
| m_nr_threads = nr_threads; | |||
| m_compnode_locator.type = mgb::CompNode::DeviceType::MULTITHREAD; | |||
| m_compnode_locator.nr_threads = nr_threads; | |||
| } | |||
| } | |||
| void NetworkImplDft::set_runtime_thread_affinity( | |||
| const ThreadAffinityCallback& thread_affinity_callback) { | |||
| LITE_ASSERT(m_user_config->device_type == LiteDeviceType::LITE_CPU, | |||
| "multi threads mode is only avaliable in CPU."); | |||
| mgb::CompNode::Locator loc; | |||
| m_load_config.comp_node_mapper(loc); | |||
| auto cn = mgb::CompNode::load(loc); | |||
| if (m_nr_threads > 1) { | |||
| mgb::CompNodeEnv::from_comp_node(cn).cpu_env().set_affinity( | |||
| thread_affinity_callback); | |||
| } else { | |||
| mgb::CompNodeEnv::from_comp_node(cn).cpu_env().dispatch( | |||
| [thread_affinity_callback](void) { | |||
| thread_affinity_callback(0); | |||
| }); | |||
| } | |||
| } | |||
| void NetworkImplDft::set_device_id(int device_id) { | |||
| m_compnode_locator.device = device_id; | |||
| m_user_config->device_id = device_id; | |||
| } | |||
| void NetworkImplDft::set_stream_id(int stream_id) { | |||
| m_compnode_locator.stream = stream_id; | |||
| } | |||
| void NetworkImplDft::use_tensorrt() { | |||
| auto&& options = m_load_config.comp_graph->options(); | |||
| options.graph_opt.tensorrt = true; | |||
| } | |||
| //! set the callback in async mode | |||
| void NetworkImplDft::set_async_callback(const AsyncCallback& callback) { | |||
| LITE_ASSERT(!m_is_cpu_inplace_mode, | |||
| "cpu inplace mode not support async mode"); | |||
| LITE_ASSERT(m_user_config->device_type == LiteDeviceType::LITE_CPU || | |||
| m_user_config->device_type == LiteDeviceType::LITE_CUDA, | |||
| "Now only cpu and cuda>10.0 support async mode"); | |||
| m_async = true; | |||
| m_async_callback = std::move(callback); | |||
| } | |||
| void NetworkImplDft::make_output_spec() { | |||
| m_output_spec.clear(); | |||
| for (auto&& out : m_network_io->outputs) { | |||
| if (m_load_result.output_var_map.count(out.name)) { | |||
| auto&& load_out = m_load_result.output_var_map[out.name]; | |||
| auto cb = [&out, this](const mgb::DeviceTensorND& dv) mutable { | |||
| mgb::CompNode comp_node = dv.comp_node(); | |||
| if (out.io_type == LiteIOType::LITE_IO_SHAPE) { | |||
| auto mgb_layout = dv.layout(); | |||
| out.lite_tensor->set_layout(to_lite_layout(mgb_layout)); | |||
| } else { | |||
| TensorHelper::implement(out.lite_tensor) | |||
| ->cast_final_safe<TensorImplDft>() | |||
| .copy_from_mge_tensor(dv); | |||
| out.lite_tensor->update_from_implement(); | |||
| } | |||
| if (m_async) { | |||
| out.have_sync = true; | |||
| bool need_exec_cb = true; | |||
| for (auto&& j : m_network_io->outputs) { | |||
| if (!j.have_sync) { | |||
| need_exec_cb = false; | |||
| } | |||
| } | |||
| if (need_exec_cb) { | |||
| for (auto&& j : m_network_io->outputs) { | |||
| j.have_sync = false; | |||
| } | |||
| comp_node.add_callback([this]() { finish(); }); | |||
| } | |||
| } | |||
| }; | |||
| m_output_spec.emplace_back(load_out, std::move(cb)); | |||
| } else { | |||
| LITE_THROW(ssprintf("no output named : %s in the mode", | |||
| out.name.c_str())); | |||
| } | |||
| } | |||
| } | |||
| void NetworkImplDft::replace_dev_input_pass() { | |||
| mgb::CompNode::Locator locator; | |||
| m_load_config.comp_node_mapper(locator); | |||
| //! CPU does not need device input | |||
| if (locator.type == mgb::CompNode::DeviceType::CPU) { | |||
| return; | |||
| } | |||
| //! replace the H2D with VolatileSharedDeviceTensor, and keep the dev tensor | |||
| //! in m_network_io.input, so the user can directly change the dev tensor | |||
| //! storage through m_network_io.input.lite_tensor->reset() before forward | |||
| using DeviceTensorMap = | |||
| std::unordered_map<std::string, | |||
| std::shared_ptr<mgb::DeviceTensorND>>; | |||
| DeviceTensorMap name2dev_tensor; | |||
| mgb::ThinHashMap<mgb::HostTensorND*, mgb::SymbolVar> host_val2var; | |||
| //! construct host_val2var that maps from host tensor to corresponding var | |||
| auto on_opr = [&](mgb::cg::OperatorNodeBase* opr) { | |||
| if (opr->same_type<mgb::opr::Host2DeviceCopy>()) { | |||
| mgb::HostTensorND* tensor = | |||
| opr->cast_final<mgb::opr::Host2DeviceCopy>() | |||
| .host_data() | |||
| .get(); | |||
| host_val2var[tensor] = opr->output(0); | |||
| } | |||
| }; | |||
| mgb::cg::DepOprIter dep_iter{on_opr}; | |||
| for (auto i : m_load_result.output_var_list) { | |||
| dep_iter.add(i.node()->owner_opr()); | |||
| } | |||
| mgb::ThinHashMap<mgb::SymbolVar, mgb::SymbolVar> inp_var_map, out_var_map; | |||
| mgb::SmallVector<std::string> to_clear; | |||
| for (auto&& config_in : m_network_io->inputs) { | |||
| if (!config_in.is_host) { | |||
| auto host_val = m_load_result.tensor_map[config_in.name]; | |||
| auto dev_val = TensorHelper::implement(config_in.lite_tensor) | |||
| ->cast_final_safe<TensorImplDft>() | |||
| .m_dev_tensor; | |||
| auto dev_var = mgb::opr::VolatileSharedDeviceTensor::make( | |||
| *m_load_result.graph, dev_val, {config_in.name}); | |||
| inp_var_map[host_val2var.at(host_val.get())] = dev_var; | |||
| name2dev_tensor[config_in.name] = dev_val; | |||
| } | |||
| } | |||
| auto new_ovar = | |||
| mgb::cg::replace_vars(m_load_result.output_var_list, inp_var_map); | |||
| for (size_t i = 0; i < new_ovar.size(); ++i) { | |||
| out_var_map[m_load_result.output_var_list[i]] = new_ovar[i]; | |||
| } | |||
| for (auto&& i : m_load_result.output_var_map) { | |||
| i.second = out_var_map.at(i.second); | |||
| } | |||
| for (auto&& i : m_load_result.output_var_map_id) { | |||
| i.second = out_var_map.at(i.second); | |||
| } | |||
| for (size_t i = 0; i < m_load_result.output_var_list.size(); i++) { | |||
| new_ovar[i].rename(m_load_result.output_var_list[i].node()->name()); | |||
| } | |||
| m_load_result.output_var_list = std::move(new_ovar); | |||
| } | |||
| void NetworkImplDft::cross_compnode_model_detect() { | |||
| mgb::ThinHashSet<LiteDeviceType> nr_used_device_type; | |||
| auto on_opr = [&](mgb::cg::OperatorNodeBase* opr) { | |||
| for (auto j : opr->output()) { | |||
| if (j->comp_node() != mgb::CompNode::default_cpu()) { | |||
| nr_used_device_type.insert( | |||
| get_device_from_locator(j->comp_node().locator())); | |||
| } | |||
| } | |||
| }; | |||
| mgb::cg::DepOprIter dep_iter{on_opr}; | |||
| for (auto i : m_load_result.output_var_list) { | |||
| dep_iter.add(i.node()->owner_opr()); | |||
| } | |||
| m_nr_device_type = nr_used_device_type.size(); | |||
| } | |||
| void NetworkImplDft::load_model( | |||
| std::shared_ptr<void> model_mem, size_t size, | |||
| std::unordered_map<std::string, LiteAny> separate_config_map) { | |||
| if (!m_loader) { | |||
| m_input_file = mgb::serialization::InputFile::make_mem_proxy( | |||
| model_mem, size, false); | |||
| auto format = | |||
| mgb::serialization::GraphLoader::identify_graph_dump_format( | |||
| *m_input_file); | |||
| if (!format.valid()) { | |||
| LITE_THROW("invalid model format"); | |||
| } | |||
| m_loader = mgb::serialization::GraphLoader::make( | |||
| std::move(m_input_file), format.val()); | |||
| } | |||
| //! apply the user configuration to the mge model | |||
| application_config(); | |||
| //! configure some flags read from the json config file | |||
| if (separate_config_map.find("device_id") != separate_config_map.end()) { | |||
| set_device_id(separate_config_map["device_id"].unsafe_cast<int>()); | |||
| } | |||
| if (separate_config_map.find("number_threads") != | |||
| separate_config_map.end() && | |||
| separate_config_map["number_threads"].unsafe_cast<size_t>() > 1) { | |||
| set_cpu_threads_number( | |||
| separate_config_map["number_threads"].unsafe_cast<size_t>()); | |||
| } | |||
| if (separate_config_map.find("enable_inplace_model") != | |||
| separate_config_map.end() && | |||
| separate_config_map["enable_inplace_model"].unsafe_cast<bool>()) { | |||
| set_cpu_inplace_mode(); | |||
| } | |||
| if (separate_config_map.find("use_tensorrt") != separate_config_map.end() && | |||
| separate_config_map["use_tensorrt"].unsafe_cast<bool>()) { | |||
| use_tensorrt(); | |||
| } | |||
| m_load_result = m_loader->load(m_load_config, true); | |||
| cross_compnode_model_detect(); | |||
| //! update the IO of the network | |||
| update_io(); | |||
| //! replace the IO when there is device input or output | |||
| compile_graph(); | |||
| } | |||
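| //! illustrative sketch of the separate_config_map handled above (the keys | |||
| //! are the ones checked in load_model; constructing LiteAny from plain | |||
| //! values and the variable names are assumptions): | |||
| //!     std::unordered_map<std::string, LiteAny> cfg; | |||
| //!     cfg["device_id"] = 0;               // int | |||
| //!     cfg["number_threads"] = size_t(4);  // > 1 enables multi-thread mode | |||
| //!     cfg["enable_inplace_model"] = true; | |||
| //!     network_impl->load_model(model_mem, size, cfg); | |||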
| void NetworkImplDft::compile_graph() { | |||
| modify_exection_policy(); | |||
| replace_dev_input_pass(); | |||
| make_output_spec(); | |||
| m_execute_func = m_load_result.graph_compile(m_output_spec); | |||
| } | |||
| void NetworkImplDft::start() const { | |||
| if (m_start_callback) { | |||
| std::unordered_map<std::string, std::pair<IO, std::shared_ptr<Tensor>>> | |||
| input_io_map; | |||
| for (auto&& io_inner : m_network_io->inputs) { | |||
| input_io_map[io_inner.name] = { | |||
| IO{io_inner.name, io_inner.is_host, io_inner.io_type, | |||
| io_inner.config_layout}, | |||
| io_inner.lite_tensor}; | |||
| } | |||
| m_start_callback(input_io_map); | |||
| } | |||
| } | |||
| void NetworkImplDft::forward() { | |||
| start(); | |||
| LITE_ASSERT(m_execute_func, "forward must be called after the network is loaded."); | |||
| m_execute_func->execute(); | |||
| } | |||
| void NetworkImplDft::wait() { | |||
| if (!m_async) { | |||
| m_execute_func->wait(); | |||
| } | |||
| finish(); | |||
| } | |||
| void NetworkImplDft::finish() const { | |||
| if (m_async) { | |||
| LITE_ASSERT(m_async_callback, | |||
| "The callback func must set when async mode."); | |||
| m_async_callback(); | |||
| } | |||
| if (m_finish_callback) { | |||
| std::unordered_map<std::string, std::pair<IO, std::shared_ptr<Tensor>>> | |||
| output_io_map; | |||
| for (auto&& io_inner : m_network_io->outputs) { | |||
| output_io_map[io_inner.name] = { | |||
| IO{io_inner.name, io_inner.is_host, io_inner.io_type, | |||
| io_inner.config_layout}, | |||
| io_inner.lite_tensor}; | |||
| } | |||
| m_finish_callback(output_io_map); | |||
| } | |||
| output_plugin_result(); | |||
| } | |||
| void NetworkImplDft::set_io(const NetworkIO& network_io) { | |||
| m_network_io = std::make_unique<NetworkIOInner>(); | |||
| for (auto&& in : network_io.inputs) { | |||
| m_network_io->inputs.emplace_back(in); | |||
| } | |||
| for (auto&& out : network_io.outputs) { | |||
| m_network_io->outputs.emplace_back(out); | |||
| } | |||
| } | |||
| void NetworkImplDft::update_io() { | |||
| update_input(); | |||
| update_output(); | |||
| } | |||
| void NetworkImplDft::update_input() { | |||
| auto device_type = m_user_config->device_type; | |||
| auto device_id = m_compnode_locator.device; | |||
| auto stream_id = m_compnode_locator.stream; | |||
| //! if the device is cpu, all inputs and outputs are host tensors | |||
| if (device_type == LiteDeviceType::LITE_CPU) { | |||
| for (auto&& in : m_network_io->inputs) { | |||
| in.is_host = true; | |||
| } | |||
| } | |||
| //! for a cross-compnode model, modify the device input if it is not valid | |||
| if (m_nr_device_type > 1) { | |||
| for (auto&& in_tensor_iter : m_load_result.tensor_map) { | |||
| for (auto&& config_in : m_network_io->inputs) { | |||
| //! if tensor is set to device input | |||
| if (in_tensor_iter.first == config_in.name && | |||
| !config_in.is_host) { | |||
| //! if the origin compnode of the tensor is not the device, | |||
| //! set the input to host | |||
| if (get_device_from_locator( | |||
| in_tensor_iter.second->comp_node().locator()) == | |||
| LiteDeviceType::LITE_CPU) { | |||
| config_in.is_host = true; | |||
| LITE_WARN( | |||
| "The input tensor %s of the cross device model " | |||
| "should not from device.", | |||
| config_in.name.c_str()); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| for (auto&& in_tensor_iter : m_load_result.tensor_map) { | |||
| bool found = false; | |||
| for (auto&& config_in : m_network_io->inputs) { | |||
| if (in_tensor_iter.first == config_in.name) { | |||
| found = true; | |||
| if (config_in.is_host) { | |||
| config_in.lite_tensor = std::make_shared<Tensor>( | |||
| device_id, stream_id, device_type, true); | |||
| TensorHelper::implement(config_in.lite_tensor) | |||
| ->cast_final_safe<TensorImplDft>() | |||
| .m_host_tensor = in_tensor_iter.second; | |||
| config_in.lite_tensor->update_from_implement(); | |||
| } else { | |||
| config_in.lite_tensor = std::make_shared<Tensor>( | |||
| device_id, stream_id, device_type); | |||
| config_in.lite_tensor->set_layout( | |||
| to_lite_layout(in_tensor_iter.second->layout())); | |||
| } | |||
| if (config_in.config_layout.ndim && | |||
| !(config_in.config_layout == | |||
| config_in.lite_tensor->get_layout())) { | |||
| config_in.lite_tensor->set_layout(config_in.config_layout); | |||
| } | |||
| } | |||
| } | |||
| if (!found) { | |||
| IOInner io_in; | |||
| io_in.name = in_tensor_iter.first; | |||
| io_in.lite_tensor = std::make_shared<Tensor>(device_id, stream_id, | |||
| device_type, true); | |||
| TensorHelper::implement(io_in.lite_tensor) | |||
| ->cast_final_safe<TensorImplDft>() | |||
| .m_host_tensor = in_tensor_iter.second; | |||
| io_in.lite_tensor->update_from_implement(); | |||
| m_network_io->inputs.push_back(io_in); | |||
| } | |||
| } | |||
| //! delete the IO that is not in the network | |||
| for (auto it = m_network_io->inputs.begin(); | |||
| it != m_network_io->inputs.end();) { | |||
| if (it->lite_tensor == nullptr) { | |||
| LITE_LOG("%s is not the network input, ignore it.", | |||
| it->name.c_str()); | |||
| it = m_network_io->inputs.erase(it); | |||
| } else { | |||
| it++; | |||
| } | |||
| } | |||
| } | |||
| void NetworkImplDft::update_output() { | |||
| auto device_type = m_user_config->device_type; | |||
| auto device_id = m_compnode_locator.device; | |||
| auto stream_id = m_compnode_locator.stream; | |||
| if (device_type == LiteDeviceType::LITE_CPU) { | |||
| for (auto&& out : m_network_io->outputs) { | |||
| out.is_host = true; | |||
| } | |||
| } | |||
| //! delete the outputs that are not in the network | |||
| for (auto out_it = m_network_io->outputs.begin(); | |||
| out_it != m_network_io->outputs.end();) { | |||
| if (std::find_if(m_load_result.output_var_list.begin(), | |||
| m_load_result.output_var_list.end(), | |||
| [out_it](const mgb::SymbolVar var) { | |||
| return var.node()->name() == out_it->name; | |||
| }) == m_load_result.output_var_list.end()) { | |||
| LITE_LOG("%s is not the network output, ignore it.", | |||
| out_it->name.c_str()); | |||
| out_it = m_network_io->outputs.erase(out_it); | |||
| } else { | |||
| out_it++; | |||
| } | |||
| } | |||
| //! the user configured the output tensors, so only compute the configured outputs | |||
| if (m_compute_configured_output_only) { | |||
| LITE_ASSERT(m_network_io->outputs.size() > 0, | |||
| "compute configured output only with no configure output."); | |||
| for (auto out_it = m_network_io->outputs.begin(); | |||
| out_it != m_network_io->outputs.end(); out_it++) { | |||
| //! use pinned memory to copy from device | |||
| if (out_it->is_host) { | |||
| out_it->lite_tensor = std::make_shared<Tensor>( | |||
| device_id, stream_id, device_type, true); | |||
| } else { | |||
| out_it->lite_tensor = std::make_shared<Tensor>( | |||
| device_id, stream_id, device_type); | |||
| } | |||
| } | |||
| //! user did not set outputs, use the default network outputs | |||
| } else { | |||
| for (auto&& out : m_load_result.output_var_list) { | |||
| auto it = std::find_if(m_network_io->outputs.begin(), | |||
| m_network_io->outputs.end(), | |||
| [&out](const IOInner io) { | |||
| return io.name == out.node()->name(); | |||
| }); | |||
| if (it != m_network_io->outputs.end()) { | |||
| if (it->is_host) { | |||
| it->lite_tensor = std::make_shared<Tensor>( | |||
| device_id, stream_id, device_type, true); | |||
| } else { | |||
| it->lite_tensor = std::make_shared<Tensor>( | |||
| device_id, stream_id, device_type); | |||
| } | |||
| } else { | |||
| IOInner output; | |||
| output.name = out.node()->name(); | |||
| output.lite_tensor = std::make_shared<Tensor>( | |||
| device_id, stream_id, device_type, true); | |||
| m_network_io->outputs.push_back({output}); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| std::shared_ptr<Tensor> NetworkImplDft::get_io_tensor(std::string io_name, | |||
| LiteTensorPhase phase) { | |||
| if (phase == LiteTensorPhase::LITE_INPUT || | |||
| phase == LiteTensorPhase::LITE_IO) { | |||
| for (auto&& config_in : m_network_io->inputs) { | |||
| if (io_name == config_in.name) { | |||
| return config_in.lite_tensor; | |||
| } | |||
| } | |||
| } | |||
| if (phase == LiteTensorPhase::LITE_OUTPUT || | |||
| phase == LiteTensorPhase::LITE_IO) { | |||
| for (auto&& config_out : m_network_io->outputs) { | |||
| if (io_name == config_out.name) { | |||
| config_out.lite_tensor->update_from_implement(); | |||
| return config_out.lite_tensor; | |||
| } | |||
| } | |||
| } | |||
| LITE_THROW(mgb::ssprintf( | |||
| "tensor name must be %s input tensor name or the registered " | |||
| "output tensor name if NetworkIO is set, if NetworkIO is not set, " | |||
| "the output tensor is all the network output tensor, or the output " | |||
| "tensor is only the registered tensor.", | |||
| io_name.c_str())); | |||
| return nullptr; | |||
| } | |||
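| //! usage sketch (illustrative; the tensor names "data" and "prob" and the | |||
| //! variable network_impl are hypothetical): | |||
| //!     auto in = network_impl->get_io_tensor("data", LiteTensorPhase::LITE_INPUT); | |||
| //!     auto out = network_impl->get_io_tensor("prob", LiteTensorPhase::LITE_OUTPUT); | |||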
| std::shared_ptr<Tensor> NetworkImplDft::get_input_tensor(size_t index) { | |||
| return get_io_tensor(get_input_name(index)); | |||
| } | |||
| std::shared_ptr<Tensor> NetworkImplDft::get_output_tensor(size_t index) { | |||
| return get_io_tensor(get_output_name(index)); | |||
| } | |||
| //! set opr algorithm selection strategy in the network | |||
| void NetworkImplDft::set_network_algo_policy(LiteAlgoSelectStrategy strategy, | |||
| uint32_t shared_batch_size, | |||
| bool binary_equal_between_batch) { | |||
| using S = megdnn::param::ExecutionPolicy::Strategy; | |||
| auto dst_strategy = static_cast<S>(0); | |||
| if (static_cast<uint32_t>(strategy) & | |||
| LiteAlgoSelectStrategy::LITE_ALGO_HEURISTIC) { | |||
| dst_strategy = dst_strategy | S::HEURISTIC; | |||
| } | |||
| if (static_cast<uint32_t>(strategy) & | |||
| LiteAlgoSelectStrategy::LITE_ALGO_PROFILE) { | |||
| dst_strategy = dst_strategy | S::PROFILE; | |||
| } | |||
| if (static_cast<uint32_t>(strategy) & | |||
| LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE) { | |||
| dst_strategy = dst_strategy | S::REPRODUCIBLE; | |||
| } | |||
| if (static_cast<uint32_t>(strategy) & | |||
| LiteAlgoSelectStrategy::LITE_ALGO_OPTIMIZED) { | |||
| dst_strategy = dst_strategy | S::OPTIMIZED; | |||
| } | |||
| m_execution_policy = dst_strategy; | |||
| auto&& fast_run_config = | |||
| m_load_config.comp_graph->options().fast_run_config; | |||
| fast_run_config.binary_equal_between_batch = binary_equal_between_batch; | |||
| fast_run_config.shared_batch_size = shared_batch_size; | |||
| if (m_execute_func) { | |||
| LITE_WARN( | |||
| "set_network_algo_policy maybe cause error after loaded " | |||
| "network!!!!"); | |||
| modify_exection_policy(); | |||
| } | |||
| } | |||
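| //! usage sketch (illustrative): profile-based algorithm selection with a | |||
| //! shared batch size of 1 and no cross-batch binary check; the strategy | |||
| //! values are tested as bit flags above, so they can be combined where the | |||
| //! enum allows it: | |||
| //!     network_impl->set_network_algo_policy( | |||
| //!             LiteAlgoSelectStrategy::LITE_ALGO_PROFILE, 1, false); | |||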
| void NetworkImplDft::modify_exection_policy() { | |||
| mgb::SymbolVarArray vars; | |||
| for (auto i : m_output_spec) { | |||
| vars.push_back(i.first); | |||
| } | |||
| if (static_cast<uint32_t>(m_execution_policy) != 0) | |||
| mgb::gopt::modify_opr_algo_strategy_inplace(vars, m_execution_policy); | |||
| } | |||
| //! set opr algorithm selection strategy in the network | |||
| void NetworkImplDft::set_network_algo_workspace_limit(size_t workspace_limit) { | |||
| mgb::SymbolVarArray vars; | |||
| for (auto i : m_output_spec) { | |||
| vars.push_back(i.first); | |||
| } | |||
| mgb::gopt::set_opr_algo_workspace_limit_inplace(vars, workspace_limit); | |||
| } | |||
| //! get all the output tensor names in the order of the graph | |||
| std::vector<const char*> NetworkImplDft::get_all_output_name() const { | |||
| std::vector<const char*> output_names; | |||
| for (auto& output : m_network_io->outputs) { | |||
| output_names.push_back(output.name.c_str()); | |||
| } | |||
| return output_names; | |||
| } | |||
| //! get all the input tensor names in the order of the graph | |||
| std::vector<const char*> NetworkImplDft::get_all_input_name() const { | |||
| std::vector<const char*> input_names; | |||
| for (auto& input : m_load_result.tensor_map) { | |||
| input_names.push_back(input.first.c_str()); | |||
| } | |||
| return input_names; | |||
| } | |||
| //! get the output tensor name in the order of graph | |||
| const char* NetworkImplDft::get_output_name(size_t index) const { | |||
| LITE_ASSERT( | |||
| index < m_load_result.output_var_list.size(), | |||
| "The output tensor index is large than the total outputs number."); | |||
| return m_load_result.output_var_list[index].node()->name().c_str(); | |||
| } | |||
| //! get the input tensor name in the order of graph | |||
| const char* NetworkImplDft::get_input_name(size_t index) const { | |||
| LITE_ASSERT( | |||
| index < m_load_result.tensor_map.size(), | |||
| "The input tensor index is large than the total inputs number."); | |||
| size_t i = 0; | |||
| for (auto& input : m_load_result.tensor_map) { | |||
| if (i == index) { | |||
| return input.first.c_str(); | |||
| } | |||
| i++; | |||
| } | |||
| LITE_THROW(ssprintf("no input tensor of index %zu.", index)); | |||
| } | |||
| //! Plugin part | |||
| void NetworkImplDft::enable_profile_performance(std::string profile_json_file) { | |||
| #if MGB_ENABLE_JSON | |||
| #if MGB_OPENCL | |||
| mgb::CompNode::enable_opencl_profile(true); | |||
| #endif | |||
| m_profiler = std::make_unique<mgb::GraphProfiler>( | |||
| m_load_config.comp_graph.get()); | |||
| m_profiler_output_file = profile_json_file; | |||
| #else | |||
| LITE_MARK_USED_VAR(profile_json_file); | |||
| LITE_THROW("JSON is disable at compile time."); | |||
| #endif | |||
| } | |||
| void NetworkImplDft::enable_io_txt_dump(std::string io_txt_out_file) { | |||
| auto iodump = std::make_unique<mgb::TextOprIODump>( | |||
| m_load_config.comp_graph.get(), io_txt_out_file.c_str()); | |||
| iodump->print_addr(false); | |||
| m_iodump = std::move(iodump); | |||
| } | |||
| void NetworkImplDft::enable_io_bin_dump(std::string io_bin_out_dir) { | |||
| m_iodump = std::make_unique<mgb::BinaryOprIODump>( | |||
| m_load_config.comp_graph.get(), io_bin_out_dir.c_str()); | |||
| } | |||
| void inline NetworkImplDft::output_plugin_result() const { | |||
| #if MGB_ENABLE_JSON | |||
| if (m_profiler && m_execute_func) { | |||
| m_profiler->to_json_full(m_execute_func.get()) | |||
| ->writeto_fpath(m_profiler_output_file); | |||
| } | |||
| #endif | |||
| } | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,242 @@ | |||
| /** | |||
| * \file src/mge/network_impl.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include "lite_build_config.h" | |||
| #if LITE_BUILD_WITH_MGE | |||
| #include "lite/network.h" | |||
| #include "network_impl_base.h" | |||
| #include "tensor_impl.h" | |||
| #include "megbrain/graph/bases.h" | |||
| #include "megbrain/plugin/opr_io_dump.h" | |||
| #include "megbrain/plugin/profiler.h" | |||
| #include "megbrain/serialization/extern_c_opr.h" | |||
| #include "megbrain/serialization/file.h" | |||
| #include "megbrain/serialization/load_dump_config.h" | |||
| #include "megbrain/serialization/serializer.h" | |||
| #include "megbrain/utils/thin/hash_table.h" | |||
| #include <memory> | |||
| #include <unordered_map> | |||
| namespace lite { | |||
| /*! | |||
| * \brief implement the Network, contain the mgb related member | |||
| */ | |||
| class NetworkImplDft final : public Network::NetworkImplBase { | |||
| LITE_DYN_TYPE_OBJ_FINAL_DECL; | |||
| public: | |||
| using S = megdnn::param::ExecutionPolicy::Strategy; | |||
| //! set the config of the network, include: | |||
| //! the inference device | |||
| //! the other inference options, such as record_level, weight_preprocess... | |||
| void set_config(const Config& config) override; | |||
| //! set the special io information; if not set, the default io tensors will | |||
| //! be used. This is needed when an input/output is not a host tensor; by | |||
| //! default the input/output tensors are host tensors | |||
| void set_io(const NetworkIO& network_io) override; | |||
| //! only compute the output tensors configured by the user | |||
| void compute_only_configured_output() override { | |||
| m_compute_configured_output_only = true; | |||
| } | |||
| //! get the network input and output tensor, the layout of which is | |||
| //! synced from the mge tensor | |||
| std::shared_ptr<Tensor> get_io_tensor( | |||
| std::string io_name, | |||
| LiteTensorPhase phase = LiteTensorPhase::LITE_IO) override; | |||
| //! get the input tensor by index in the load_result tensormap | |||
| std::shared_ptr<Tensor> get_input_tensor(size_t index) override; | |||
| //! get the output tensor by index in the load_result output_var_list | |||
| std::shared_ptr<Tensor> get_output_tensor(size_t index) override; | |||
| //! get all the input tensor name in the order in load return | |||
| std::vector<const char*> get_all_input_name() const override; | |||
| //! get all the output tensor name in the order in load return | |||
| std::vector<const char*> get_all_output_name() const override; | |||
| //! get the input tensor name in the order in load return | |||
| const char* get_input_name(size_t index) const override; | |||
| //! get the output tensor name in the order in load return | |||
| const char* get_output_name(size_t index) const override; | |||
| //! set the callback in async mode | |||
| void set_async_callback(const AsyncCallback& callback) override; | |||
| //! set the start callback which will execute before network forward | |||
| void set_start_callback(const StartCallback& callback) override { | |||
| m_start_callback = std::move(callback); | |||
| } | |||
| //! set the finish callback which will execute after network forward | |||
| void set_finish_callback(const FinishCallback& callback) override { | |||
| m_finish_callback = std::move(callback); | |||
| } | |||
| //! load the model and get the m_load_result | |||
| void load_model(std::shared_ptr<void> model_mem, size_t size, | |||
| std::unordered_map<std::string, LiteAny> | |||
| separate_config_map = {}) override; | |||
| //! forward the network with filled input data and fill the output data | |||
| //! to the output tensor | |||
| void forward() override; | |||
| //! in sync mode, wait until the inference finishes | |||
| void wait() override; | |||
| virtual LiteDeviceType get_device_type() const override { | |||
| return m_user_config->device_type; | |||
| } | |||
| //! Set cpu inplace mode when device is CPU; on some low-computation | |||
| //! or single-core devices, this mode gives good performance | |||
| void set_cpu_inplace_mode(); | |||
| bool is_cpu_inplace_mode() const { return m_is_cpu_inplace_mode; } | |||
| //! When device is CPU, this interface will set the to-be-loaded model | |||
| //! to run in multi-thread mode with the given thread number. | |||
| void set_cpu_threads_number(size_t nr_threads); | |||
| size_t get_cpu_threads_number() const { return m_nr_threads; } | |||
| //! set device id, default device id = 0 | |||
| void set_device_id(int device_id) override; | |||
| int get_device_id() const override { return m_compnode_locator.device; }; | |||
| LiteBackend get_backend_type() const override { | |||
| return LiteBackend::LITE_DEFAULT; | |||
| } | |||
| //! set stream id, default stream id = 0 | |||
| void set_stream_id(int stream_id) override; | |||
| int get_stream_id() const override { return m_compnode_locator.stream; }; | |||
| //! enable tensorrt | |||
| void use_tensorrt(); | |||
| //! enable profile the network, a JSON format file will be generated | |||
| void enable_profile_performance( | |||
| std::string profile_json_file_path) override; | |||
| /********************** mge special function ************************/ | |||
| //! load a new network which will share weights with src network | |||
| void shared_weight_with(const NetworkImplBase* src_network); | |||
| //! share the runtime memory with another network; the weights are not shared | |||
| void share_runtime_memory_with(NetworkImplBase* network); | |||
| //! set the thread affinity callback | |||
| void set_runtime_thread_affinity( | |||
| const ThreadAffinityCallback& thread_affinity_callback); | |||
| //! set the network memory allocator; the allocator is defined by the user | |||
| void set_memory_allocator(std::shared_ptr<Allocator> user_allocator); | |||
| //! set opr algorithm selection strategy in the network | |||
| void set_network_algo_policy(LiteAlgoSelectStrategy strategy, | |||
| uint32_t shared_batch_size, | |||
| bool binary_equal_between_batch); | |||
| //! set workspace_limit for oprs with multiple algorithms; setting a | |||
| //! workspace limit can save memory but may influence the performance | |||
| void set_network_algo_workspace_limit(size_t workspace_limit); | |||
| //! Dump input/output values of all internal variables to output file, | |||
| //! in text format | |||
| void enable_io_txt_dump(std::string io_txt_out_file); | |||
| //! Dump input/output values of all internal variables to output | |||
| //! directory, in binary format | |||
| void enable_io_bin_dump(std::string io_bin_out_dir); | |||
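| //! typical call sequence (illustrative sketch): set_config -> [set_io] -> | |||
| //! load_model -> fill inputs via get_io_tensor -> forward -> wait -> read | |||
| //! outputs via get_io_tensor / get_output_tensor | |||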
| private: | |||
| //! construct the output spec according to m_network_io, and set the | |||
| //! callback on the output spec | |||
| void make_output_spec(); | |||
| //! modify the execution policy | |||
| void modify_exection_policy(); | |||
| //! if the input is dev tensor, the pass will replace the H2D Opr to | |||
| //! VolatileSharedDeviceTensor Opr | |||
| void replace_dev_input_pass(); | |||
| //! check whether the model is cross compnode | |||
| void cross_compnode_model_detect(); | |||
| //! when the model has been loaded, update the IO; if NetworkIO is not set, | |||
| //! update it with the IO of the loaded model | |||
| void update_io(); | |||
| void update_input(); | |||
| void update_output(); | |||
| //! when the model info has been loaded, update the config according to the | |||
| //! model info, and finally use it in the computing graph | |||
| void application_config(); | |||
| //! after forwarding the network finishes, output the plugin results to file | |||
| void output_plugin_result() const; | |||
| //! when finish forwarding the network, the function will be called | |||
| void finish() const; | |||
| //! before forwarding the network, the function will be called | |||
| void start() const; | |||
| //! compile the graph to get the execute function | |||
| void compile_graph(); | |||
| private: | |||
| bool m_async = false; | |||
| bool m_is_cpu_inplace_mode = false; | |||
| int m_nr_device_type = 0; | |||
| size_t m_nr_threads = 1; | |||
| bool m_compute_configured_output_only = false; | |||
| mgb::CompNode::Locator m_compnode_locator; | |||
| AsyncCallback m_async_callback = nullptr; | |||
| std::unique_ptr<NetworkIOInner> m_network_io; | |||
| std::unique_ptr<Config> m_user_config; | |||
| std::unique_ptr<mgb::cg::AsyncExecutable> m_execute_func; | |||
| //! The model load related data | |||
| S m_execution_policy = static_cast<S>(0); | |||
| std::unique_ptr<mgb::serialization::InputFile> m_input_file; | |||
| mgb::serialization::GraphLoadConfig m_load_config; | |||
| mgb::serialization::GraphLoader::LoadResult m_load_result; | |||
| mgb::ComputingGraph::OutputSpec m_output_spec; | |||
| std::shared_ptr<mgb::serialization::GraphLoader> m_loader; | |||
| //! start and finish callback | |||
| StartCallback m_start_callback = nullptr; | |||
| FinishCallback m_finish_callback = nullptr; | |||
| //! profile and io dump related data | |||
| #if MGB_ENABLE_JSON | |||
| std::unique_ptr<mgb::GraphProfiler> m_profiler; | |||
| std::string m_profiler_output_file; | |||
| #endif | |||
| std::unique_ptr<mgb::OprIODumpBase> m_iodump; | |||
| }; | |||
| } // namespace lite | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,435 @@ | |||
| /** | |||
| * \file src/mge/tensor_impl.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "lite_build_config.h" | |||
| #if LITE_BUILD_WITH_MGE | |||
| #include "tensor_impl.h" | |||
| #include "common.h" | |||
| #include "lite/tensor.h" | |||
| #include "megbrain/comp_node.h" | |||
| #include "megbrain/tensor.h" | |||
| #include <memory> | |||
| using namespace lite; | |||
| /**********************TensorImpl****************************/ | |||
| LITE_DYN_TYPE_OBJ_FINAL_IMPL(TensorImplDft); | |||
| TensorImplDft::TensorImplDft() { | |||
| m_host_tensor = | |||
| std::make_shared<mgb::HostTensorND>(mgb::CompNode::default_cpu()); | |||
| } | |||
| TensorImplDft::TensorImplDft(LiteDeviceType device, bool is_pinned_host) { | |||
| auto cn = mgb::CompNode::load(to_compnode_locator(device)); | |||
| if (device == LiteDeviceType::LITE_DEVICE_DEFAULT) { | |||
| device = LiteDeviceType::LITE_CPU; | |||
| } | |||
| if (device == LiteDeviceType::LITE_CPU) { | |||
| m_host_tensor = std::make_shared<mgb::HostTensorND>( | |||
| mgb::CompNode::default_cpu()); | |||
| } else if (is_pinned_host) { | |||
| m_host_tensor = std::make_shared<mgb::HostTensorND>(cn); | |||
| } else { | |||
| m_dev_tensor = std::make_shared<mgb::DeviceTensorND>(cn); | |||
| } | |||
| } | |||
| TensorImplDft::TensorImplDft(LiteDeviceType device, const Layout& layout, | |||
| bool is_pinned_host) { | |||
| auto cn = mgb::CompNode::load(to_compnode_locator(device)); | |||
| auto mge_layout = to_impl_layout(layout); | |||
| if (device == LiteDeviceType::LITE_DEVICE_DEFAULT) { | |||
| device = LiteDeviceType::LITE_CPU; | |||
| } | |||
| if (device == LiteDeviceType::LITE_CPU) { | |||
| m_host_tensor = std::make_shared<mgb::HostTensorND>( | |||
| mgb::CompNode::default_cpu(), mge_layout); | |||
| } else if (is_pinned_host) { | |||
| m_host_tensor = std::make_shared<mgb::HostTensorND>(cn, mge_layout); | |||
| } else { | |||
| m_dev_tensor = std::make_shared<mgb::DeviceTensorND>(cn, mge_layout); | |||
| } | |||
| } | |||
| TensorImplDft::TensorImplDft(int device_id, LiteDeviceType device_type, | |||
| const Layout& layout, bool is_pinned_host) { | |||
| auto locator = to_compnode_locator(device_type); | |||
| locator.device = device_id; | |||
| auto cn = mgb::CompNode::load(locator); | |||
| if (device_type == LiteDeviceType::LITE_DEVICE_DEFAULT) { | |||
| device_type = LiteDeviceType::LITE_CPU; | |||
| } | |||
| if (layout.ndim) { | |||
| auto mge_layout = to_impl_layout(layout); | |||
| if (device_type == LiteDeviceType::LITE_CPU) { | |||
| m_host_tensor = std::make_shared<mgb::HostTensorND>( | |||
| mgb::CompNode::default_cpu(), mge_layout); | |||
| } else if (is_pinned_host) { | |||
| m_host_tensor = std::make_shared<mgb::HostTensorND>(cn, mge_layout); | |||
| } else { | |||
| m_dev_tensor = | |||
| std::make_shared<mgb::DeviceTensorND>(cn, mge_layout); | |||
| } | |||
| } else { | |||
| if (device_type == LiteDeviceType::LITE_CPU) { | |||
| m_host_tensor = std::make_shared<mgb::HostTensorND>( | |||
| mgb::CompNode::default_cpu()); | |||
| } else if (is_pinned_host) { | |||
| m_host_tensor = std::make_shared<mgb::HostTensorND>(cn); | |||
| } else { | |||
| m_dev_tensor = std::make_shared<mgb::DeviceTensorND>(cn); | |||
| } | |||
| } | |||
| } | |||
| TensorImplDft::TensorImplDft(int device_id, int stream_id, | |||
| LiteDeviceType device_type, bool is_pinned_host) { | |||
| auto locator = to_compnode_locator(device_type); | |||
| locator.device = device_id; | |||
| locator.stream = stream_id; | |||
| auto cn = mgb::CompNode::load(locator); | |||
| if (get_device_from_locator(locator) == LiteDeviceType::LITE_CPU) { | |||
| m_host_tensor = std::make_shared<mgb::HostTensorND>( | |||
| mgb::CompNode::default_cpu()); | |||
| } else if (is_pinned_host) { | |||
| m_host_tensor = std::make_shared<mgb::HostTensorND>(cn); | |||
| } else { | |||
| m_dev_tensor = std::make_shared<mgb::DeviceTensorND>(cn); | |||
| } | |||
| } | |||
| LiteDeviceType TensorImplDft::get_device_type() const { | |||
| if (is_host()) { | |||
| return LiteDeviceType::LITE_CPU; | |||
| } else { | |||
| return get_device_from_locator(m_dev_tensor->comp_node().locator()); | |||
| } | |||
| } | |||
| int TensorImplDft::get_device_id() const { | |||
| if (is_host()) { | |||
| return m_host_tensor->comp_node().locator().device; | |||
| } else { | |||
| return m_dev_tensor->comp_node().locator().device; | |||
| } | |||
| } | |||
| bool TensorImplDft::is_pinned_host() const { | |||
| return is_host() && | |||
| get_device_from_locator(m_host_tensor->comp_node().locator()) != | |||
| LiteDeviceType::LITE_CPU; | |||
| } | |||
| void TensorImplDft::set_mge_tensor_compnode(const mgb::CompNode& comp_node) { | |||
| if (is_host()) { | |||
| m_host_tensor->comp_node(comp_node, true); | |||
| } else { | |||
| m_dev_tensor->comp_node(comp_node, true); | |||
| } | |||
| } | |||
| Layout TensorImplDft::get_layout() const { | |||
| if (is_host()) { | |||
| return to_lite_layout(m_host_tensor->layout()); | |||
| } else { | |||
| return to_lite_layout(m_dev_tensor->layout()); | |||
| } | |||
| } | |||
| void* TensorImplDft::get_memory_ptr() const { | |||
| if (is_host()) { | |||
| return static_cast<void*>(m_host_tensor->raw_ptr()); | |||
| } else { | |||
| return static_cast<void*>(m_dev_tensor->raw_ptr()); | |||
| } | |||
| } | |||
| void* TensorImplDft::get_memory_ptr(const std::vector<size_t>& idx) const { | |||
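| //! dispatch on log2(element size): case 0 covers 1-byte, case 1 covers | |||
| //! 2-byte and case 2 covers 4-byte element types; the returned pointer | |||
| //! addresses the element at the given idx | |||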
| if (is_host()) { | |||
| auto elemsize_log = m_host_tensor->layout().dtype.size_log(); | |||
| switch (elemsize_log) { | |||
| case 0: | |||
| return static_cast<void*>( | |||
| m_host_tensor->ptr<uint8_t>(idx.begin(), idx.end())); | |||
| break; | |||
| case 1: | |||
| return static_cast<void*>( | |||
| m_host_tensor->ptr<short>(idx.begin(), idx.end())); | |||
| break; | |||
| case 2: | |||
| return static_cast<void*>( | |||
| m_host_tensor->ptr<float>(idx.begin(), idx.end())); | |||
| break; | |||
| default: | |||
| LITE_THROW("not supported data_type."); | |||
| } | |||
| } else { | |||
| auto elemsize_log = m_dev_tensor->layout().dtype.size_log(); | |||
| switch (elemsize_log) { | |||
| case 0: | |||
| return static_cast<void*>( | |||
| m_dev_tensor->ptr<uint8_t>(idx.begin(), idx.end())); | |||
| break; | |||
| case 1: | |||
| return static_cast<void*>( | |||
| m_dev_tensor->ptr<short>(idx.begin(), idx.end())); | |||
| break; | |||
| case 2: | |||
| return static_cast<void*>( | |||
| m_dev_tensor->ptr<float>(idx.begin(), idx.end())); | |||
| break; | |||
| default: | |||
| LITE_THROW("not supported data_type."); | |||
| } | |||
| } | |||
| } | |||
| std::shared_ptr<Tensor> TensorImplDft::slice( | |||
| const std::vector<size_t>& start, const std::vector<size_t>& end, | |||
| const std::vector<size_t>& step) { | |||
| Layout layout; | |||
| mgb::TensorLayout layout_mge; | |||
| if (is_host()) { | |||
| layout_mge = m_host_tensor->layout(); | |||
| layout = to_lite_layout(m_host_tensor->layout()); | |||
| } else { | |||
| layout_mge = m_dev_tensor->layout(); | |||
| layout = to_lite_layout(m_dev_tensor->layout()); | |||
| } | |||
| size_t length = start.size(); | |||
| LITE_ASSERT(length == end.size() && length <= layout.ndim, | |||
| "The start and end must be the same size and less than layout " | |||
| "ndim."); | |||
| std::vector<mgb::Slice> slices; | |||
| if (step.size()) { | |||
| LITE_ASSERT(length == step.size(), | |||
| "The start and step must be the same size."); | |||
| for (size_t i = 0; i < length; i++) { | |||
| slices.push_back(mgb::Slice{start[i], end[i], step[i]}); | |||
| } | |||
| } else { | |||
| for (size_t i = 0; i < length; i++) { | |||
| slices.push_back(mgb::Slice{start[i], end[i]}); | |||
| } | |||
| } | |||
| auto subspec = mgb::SubTensorSpec::make_from_offset_elem(layout_mge, 0); | |||
| size_t axis = 0; | |||
| for (auto&& i : slices) { | |||
| subspec.merge_with(i.apply(subspec.layout(), axis)); | |||
| axis++; | |||
| } | |||
| auto ret = std::make_shared<Tensor>(); | |||
| auto& impl = TensorHelper::implement(ret)->cast_final_safe<TensorImplDft>(); | |||
| if (is_host()) { | |||
| *impl.m_host_tensor = m_host_tensor->sub(subspec); | |||
| } else { | |||
| impl.m_dev_tensor = std::make_shared<mgb::DeviceTensorND>( | |||
| m_dev_tensor->sub(subspec)); | |||
| impl.m_host_tensor = nullptr; | |||
| } | |||
| LITE_ASSERT(is_host() == impl.is_host()); | |||
| return ret; | |||
| } | |||
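| //! usage sketch (illustrative; the variable names are hypothetical): take | |||
| //! rows [0, 2) and every second column of a 2-D tensor; the result is a | |||
| //! sub-view of the original storage: | |||
| //!     auto sub = tensor->slice({0, 0}, {2, 4}, {1, 2}); | |||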
| void TensorImplDft::fill_zero() { | |||
| if (is_host()) { | |||
| auto mge_layout = m_host_tensor->layout(); | |||
| if (m_host_tensor->layout().is_physical_contiguous()) { | |||
| auto ptr = get_memory_ptr(); | |||
| std::memset(ptr, 0, | |||
| mge_layout.dtype.size(mge_layout.total_nr_elems())); | |||
| } else { | |||
| TensorImplDft tmp(LiteDeviceType::LITE_CPU, | |||
| to_lite_layout(mge_layout), true); | |||
| tmp.fill_zero(); | |||
| this->copy_from(&tmp); | |||
| } | |||
| } else { | |||
| mgb::dev_tensor_memset(*m_dev_tensor, 0); | |||
| m_dev_tensor->sync(); | |||
| } | |||
| } | |||
| void TensorImplDft::share_memory_with(const TensorImplBase* src_tensor_impl) { | |||
| auto src_dft_tensor = static_cast<const TensorImplDft*>(src_tensor_impl); | |||
| LITE_ASSERT(is_host() == src_dft_tensor->is_host(), | |||
| "share memory must happen in same device"); | |||
| //! make sure the src memory is ready | |||
| src_tensor_impl->get_memory_ptr(); | |||
| if (is_host()) { | |||
| *m_host_tensor = *src_dft_tensor->m_host_tensor; | |||
| } else { | |||
| *m_dev_tensor = *src_dft_tensor->m_dev_tensor; | |||
| } | |||
| } | |||
| void TensorImplDft::set_layout(const Layout& layout) { | |||
| bool host = is_host(); | |||
| auto mgb_layout = to_impl_layout(layout); | |||
| if (host) { | |||
| m_host_tensor->dtype(mgb_layout.dtype); | |||
| m_host_tensor->resize(mgb_layout); | |||
| } else { | |||
| m_dev_tensor->dtype(mgb_layout.dtype); | |||
| m_dev_tensor->resize(mgb_layout); | |||
| } | |||
| } | |||
| void TensorImplDft::reshape(const Layout& layout) { | |||
| auto mgb_layout = to_impl_layout(layout); | |||
| bool host = is_host(); | |||
| if (host) { | |||
| m_host_tensor->resize(mgb_layout); | |||
| } else { | |||
| m_dev_tensor->resize(mgb_layout); | |||
| } | |||
| } | |||
| void TensorImplDft::reset(void* prepared_data) { | |||
| auto raw_ptr = static_cast<mgb::dt_byte*>(prepared_data); | |||
| auto raw_storage = std::shared_ptr<mgb::dt_byte>(raw_ptr, [](void*) {}); | |||
| bool host = is_host(); | |||
| if (host) { | |||
| auto cn = m_host_tensor->comp_node(); | |||
| auto mge_layout = m_host_tensor->layout(); | |||
| size_t size = mge_layout.span().dist_byte(); | |||
| mgb::HostTensorStorage storage; | |||
| storage.reset(cn, size, raw_storage); | |||
| m_host_tensor->reset(storage, mge_layout); | |||
| } else { | |||
| auto cn = m_dev_tensor->comp_node(); | |||
| auto mge_layout = m_dev_tensor->layout(); | |||
| size_t size = mge_layout.span().dist_byte(); | |||
| mgb::DeviceTensorStorage storage; | |||
| storage.reset(cn, size, raw_storage); | |||
| m_dev_tensor->reset(storage, mge_layout); | |||
| } | |||
| } | |||
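| //! usage sketch (illustrative; buf is a hypothetical user buffer sized to | |||
| //! the current layout, which must outlive the tensor's use of it because | |||
| //! the deleter above is a no-op): | |||
| //!     tensor->reset(buf.data()); | |||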
| void TensorImplDft::reset(void* prepared_data, const Layout& layout) { | |||
| set_layout(layout); | |||
| reset(prepared_data); | |||
| } | |||
| bool TensorImplDft::is_continue_memory() const { | |||
| if (is_host()) { | |||
| return m_host_tensor->layout().is_physical_contiguous(); | |||
| } else { | |||
| return m_dev_tensor->layout().is_physical_contiguous(); | |||
| } | |||
| } | |||
| void TensorImplDft::copy_from(const TensorImplBase* src_impl) { | |||
| if (is_continue_memory()) { | |||
| copy_from_continue(src_impl); | |||
| } else { | |||
| copy_from_fixlayout(src_impl); | |||
| } | |||
| } | |||
| void TensorImplDft::copy_from_continue(const TensorImplBase* src_impl) { | |||
| auto src = static_cast<const TensorImplDft*>(src_impl); | |||
| if (is_host()) { | |||
| //! host to host | |||
| if (src->is_host()) { | |||
| m_host_tensor->copy_from(*src->m_host_tensor); | |||
| //! device to host | |||
| } else { | |||
| auto src_cn = src->m_dev_tensor->comp_node(); | |||
| auto dst_cn = m_host_tensor->comp_node(); | |||
| if (src_cn != dst_cn && m_host_tensor->layout().ndim > 0) { | |||
| LITE_WARN( | |||
| "The dst tensor memroy is alloced before coping, " | |||
| "then pinned memroy would not use to optmize the " | |||
| "copy performance."); | |||
| //! When D2H in megbrain and the compnode of src and dst is not | |||
| //! equal, there must be one compnode that is cpu-default, so | |||
| //! here, we use temp tensor for transition | |||
| auto tmp_impl = std::make_shared<TensorImplDft>(); | |||
| tmp_impl->set_mge_tensor_compnode(src_cn); | |||
| tmp_impl->m_host_tensor->copy_from(*src->m_dev_tensor).sync(); | |||
| m_host_tensor->copy_from(*tmp_impl->m_host_tensor); | |||
| } else { | |||
| //! if dst compnode is not valid (memory is not allocated), the | |||
| //! tensor is a pinned host tensor | |||
| m_host_tensor->comp_node(src_cn, true); | |||
| m_host_tensor->copy_from(*src->m_dev_tensor).sync(); | |||
| } | |||
| } | |||
| } else { | |||
| //! host to device | |||
| if (src->is_host()) { | |||
| m_dev_tensor->copy_from(*src->m_host_tensor).sync(); | |||
| //! device to device | |||
| } else { | |||
| m_dev_tensor->copy_from(*src->m_dev_tensor).sync(); | |||
| } | |||
| } | |||
| } | |||
| void TensorImplDft::copy_from_fixlayout(const TensorImplBase* src_impl) { | |||
| auto src = static_cast<const TensorImplDft*>(src_impl); | |||
| if (is_host()) { | |||
| //! host to host | |||
| if (src->is_host()) { | |||
| m_host_tensor->copy_from_fixlayout(*src->m_host_tensor); | |||
| //! device to host | |||
| } else { | |||
| auto src_cn = src->m_dev_tensor->comp_node(); | |||
| auto dst_cn = m_host_tensor->comp_node(); | |||
| if (src_cn != dst_cn && m_host_tensor->layout().ndim > 0) { | |||
| LITE_WARN( | |||
| "The dst tensor memroy is alloced before coping, " | |||
| "then pinned memroy would not use to optmize the " | |||
| "copy performance."); | |||
| //! When D2H in megbrain and the compnode of src and dst is not | |||
| //! equal, there must be one compnode that is cpu-default, so | |||
| //! here, we use temp tensor for transition | |||
| auto tmp_impl = std::make_shared<TensorImplDft>(); | |||
| tmp_impl->set_mge_tensor_compnode(src_cn); | |||
| tmp_impl->m_host_tensor->copy_from(*src->m_dev_tensor).sync(); | |||
| m_host_tensor->copy_from_fixlayout(*tmp_impl->m_host_tensor); | |||
| } else { | |||
| //! if dst compnode is not valid (memory is not allocated), the | |||
| //! tensor is a pinned host tensor | |||
| m_host_tensor->comp_node(src_cn, true); | |||
| m_host_tensor->copy_from_fixlayout(*src->m_dev_tensor).sync(); | |||
| } | |||
| } | |||
| } else { | |||
| //! host to device | |||
| if (src->is_host()) { | |||
| m_dev_tensor->copy_from_fixlayout(*src->m_host_tensor).sync(); | |||
| //! device to device | |||
| } else { | |||
| m_dev_tensor->copy_from_fixlayout(*src->m_dev_tensor).sync(); | |||
| } | |||
| } | |||
| } | |||
| void TensorImplDft::copy_from_mge_tensor(const mgb::DeviceTensorND& dv) { | |||
| if (is_host()) { | |||
| auto src_cn = dv.comp_node(); | |||
| m_host_tensor->comp_node(src_cn, true); | |||
| m_host_tensor->copy_from(dv); | |||
| } else { | |||
| m_dev_tensor->copy_from(dv); | |||
| } | |||
| } | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,128 @@ | |||
| /** | |||
| * \file src/mge/tensor_impl.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include "lite_build_config.h" | |||
| #if LITE_BUILD_WITH_MGE | |||
| #include "lite/tensor.h" | |||
| #include "tensor_impl_base.h" | |||
| #include "megbrain/tensor.h" | |||
| #include <unordered_map> | |||
| namespace lite { | |||
| /*! | |||
| * \brief implement the Tensor in mge | |||
| */ | |||
| class TensorImplDft final : public Tensor::TensorImplBase { | |||
| LITE_DYN_TYPE_OBJ_FINAL_DECL; | |||
| public: | |||
| TensorImplDft(); | |||
| TensorImplDft(LiteDeviceType device, bool is_pinned_host = false); | |||
| TensorImplDft(LiteDeviceType device, const Layout& layout, | |||
| bool is_pinned_host = false); | |||
| TensorImplDft(int device_id, LiteDeviceType device, | |||
| const Layout& layout = {}, bool is_pinned_host = false); | |||
| TensorImplDft(int device_id, int stream_id, LiteDeviceType device, | |||
| bool is_pinned_host = false); | |||
| virtual ~TensorImplDft() = default; | |||
| LiteDeviceType get_device_type() const override; | |||
| int get_device_id() const override; | |||
| LiteBackend get_backend_type() const override { | |||
| return LiteBackend::LITE_DEFAULT; | |||
| } | |||
| Layout get_layout() const override; | |||
| bool is_pinned_host() const override; | |||
| //! this will trigger memory allocation in the tensor implementation | |||
| void* get_memory_ptr() const override; | |||
| //! this will trigger memory allocation in the tensor implementation if | |||
| //! memory is not allocated, and compute the ptr at the given idx | |||
| void* get_memory_ptr(const std::vector<size_t>& idx) const override; | |||
| //! set layout will change the layout and reallocate memory of the tensor | |||
| void set_layout(const Layout& layout) override; | |||
| //! use the user-allocated data to reset the memory of the tensor; the | |||
| //! memory will not be managed by lite, so the user should free it later | |||
| void reset(void* prepared_data) override; | |||
| //! use the user-allocated data and the corresponding layout to reset the | |||
| //! data and layout of the tensor; the memory will not be managed by lite, | |||
| //! so the user should free it later. | |||
| void reset(void* prepared_data, const Layout& layout) override; | |||
| //! get a new tensor slice from the original tensor | |||
| std::shared_ptr<Tensor> slice( | |||
| const std::vector<size_t>& start, const std::vector<size_t>& end, | |||
| const std::vector<size_t>& step = {}) override; | |||
| //! set the tensor memory with zero | |||
| void fill_zero() override; | |||
| //! reshape the tensor with new shape, keep the data_type the same | |||
| void reshape(const Layout& layout) override; | |||
| //! copy tensor from another tensor | |||
| //! Note: the best way to copy a tensor is to set only the dst device and | |||
| //! leave the layout empty; when copying, the dst layout will be set to | |||
| //! the same as the src (see the illustrative sketch after this class) | |||
| void copy_from(const TensorImplBase* src_impl) override; | |||
| //! share memory with other tensor | |||
| void share_memory_with(const TensorImplBase* src_impl) override; | |||
| //! whether the memory of the tensor is contiguous | |||
| bool is_continue_memory() const override; | |||
| //! get host tensor | |||
| std::shared_ptr<mgb::HostTensorND> host_tensor() const { | |||
| return m_host_tensor; | |||
| } | |||
| //! get device tensor | |||
| std::shared_ptr<mgb::DeviceTensorND> dev_tensor() const { | |||
| return m_dev_tensor; | |||
| } | |||
| //! copy from mgb tensor | |||
| void copy_from_mge_tensor(const mgb::DeviceTensorND& dv); | |||
| public: | |||
| friend class NetworkImplDft; | |||
| private: | |||
| bool is_host() const { return m_host_tensor != nullptr; } | |||
| void copy_from_continue(const TensorImplBase* src_impl); | |||
| void copy_from_fixlayout(const TensorImplBase* src_impl); | |||
| void set_mge_tensor_compnode(const mgb::CompNode& comp_node); | |||
| private: | |||
| std::shared_ptr<mgb::HostTensorND> m_host_tensor; | |||
| std::shared_ptr<mgb::DeviceTensorND> m_dev_tensor; | |||
| }; | |||
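| //! Illustrative sketch (added for this document, not in the original header), | |||
| //! assuming the public lite::Tensor API mirrors the copy_from() note above; | |||
| //! the device types and layout below are hypothetical: | |||
| //! | |||
| //!     lite::Tensor host_src(LiteDeviceType::LITE_CPU, layout);  // filled by the user | |||
| //!     lite::Tensor dev_dst(LiteDeviceType::LITE_CUDA);          // layout left empty | |||
| //!     dev_dst.copy_from(host_src);   // dst layout is taken from src during the copy | |||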
| } // namespace lite | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,154 @@ | |||
| /** | |||
| * \file src/misc.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "./misc.h" | |||
| #include "lite/global.h" | |||
| #include <time.h> | |||
| #include <chrono> | |||
| #include <cstdarg> | |||
| #if LITE_BUILD_WITH_MGE | |||
| #include "megbrain/common.h" | |||
| #endif | |||
| #ifdef __ANDROID__ | |||
| #include <android/log.h> | |||
| #endif | |||
| using namespace lite; | |||
| namespace lite { | |||
| namespace log_detail { | |||
| LiteLogLevel current_log_level = LiteLogLevel::ERROR; | |||
| template <class T, size_t N> | |||
| constexpr size_t countof(T (&)[N]) { | |||
| return N; | |||
| } | |||
| } // namespace log_detail | |||
| } // namespace lite | |||
| namespace { | |||
| std::string svsprintf(const char* fmt, va_list ap_orig) { | |||
| int size = 100; /* Guess we need no more than 100 bytes */ | |||
| char* p; | |||
| if ((p = (char*)malloc(size)) == nullptr) | |||
| return "svsprintf: malloc failed"; | |||
| for (;;) { | |||
| va_list ap; | |||
| va_copy(ap, ap_orig); | |||
| int n = vsnprintf(p, size, fmt, ap); | |||
| va_end(ap); | |||
| if (n < 0) | |||
| return "svsprintf: vsnprintf failed"; | |||
| if (n < size) { | |||
| std::string rst(p); | |||
| free(p); | |||
| return rst; | |||
| } | |||
| size = n + 1; | |||
| char* np = (char*)realloc(p, size); | |||
| if (!np) { | |||
| free(p); | |||
| return "svsprintf: realloc failed"; | |||
| } else | |||
| p = np; | |||
| } | |||
| } | |||
| } // namespace | |||
| void lite::set_log_level(LiteLogLevel l) { | |||
| log_detail::current_log_level = l; | |||
| #if LITE_BUILD_WITH_MGE | |||
| mgb::LogLevel lite_log_level = mgb::LogLevel::DEBUG; | |||
| switch (l) { | |||
| case LiteLogLevel::DEBUG: | |||
| lite_log_level = mgb::LogLevel::DEBUG; | |||
| break; | |||
| case LiteLogLevel::INFO: | |||
| lite_log_level = mgb::LogLevel::INFO; | |||
| break; | |||
| case LiteLogLevel::WARN: | |||
| lite_log_level = mgb::LogLevel::WARN; | |||
| break; | |||
| case LiteLogLevel::ERROR: | |||
| lite_log_level = mgb::LogLevel::ERROR; | |||
| break; | |||
| default: | |||
| LITE_THROW("unkonw loglevel"); | |||
| } | |||
| mgb::set_log_level(lite_log_level); | |||
| #endif | |||
| } | |||
| LiteLogLevel lite::get_log_level() { | |||
| return log_detail::current_log_level; | |||
| } | |||
| std::string lite::ssprintf(const char* format, ...) { | |||
| va_list ap; | |||
| va_start(ap, format); | |||
| auto ret = svsprintf(format, ap); | |||
| va_end(ap); | |||
| return ret; | |||
| } | |||
| void lite::print_log(LiteLogLevel level, const char* format, ...) { | |||
| if (static_cast<uint32_t>(level) < static_cast<uint32_t>(get_log_level())) { | |||
| return; | |||
| } | |||
| using namespace std::chrono; | |||
| auto now = system_clock::now(); | |||
| auto now_time_t = system_clock::to_time_t(now); | |||
| tm now_tm; | |||
| #if _WIN32 | |||
| localtime_s(&now_tm, &now_time_t); | |||
| #else | |||
| localtime_r(&now_time_t, &now_tm); | |||
| #endif | |||
| auto now_trunc_to_sec = system_clock::from_time_t(mktime(&now_tm)); | |||
| auto microsec = duration_cast<microseconds>(now - now_trunc_to_sec); | |||
| char time_buffer[100]; | |||
| snprintf(time_buffer, log_detail::countof(time_buffer), | |||
| "%02d:%02d:%02d.%06ld ", now_tm.tm_hour, now_tm.tm_min, | |||
| now_tm.tm_sec, long(microsec.count())); | |||
| const char* prefix[] = {"LITE[DBG] ", "LITE[INF] ", "LITE[WRN] ", | |||
| "LITE[ERR] "}; | |||
| std::string out; | |||
| out += prefix[int(level)]; | |||
| out += time_buffer; | |||
| va_list ap; | |||
| va_start(ap, format); | |||
| auto ret = svsprintf(format, ap); | |||
| va_end(ap); | |||
| out += ret; | |||
| #ifdef __ANDROID__ | |||
| __android_log_print(ANDROID_LOG_INFO, "lite", "%s", out.c_str()); | |||
| #else | |||
| fprintf(stderr, "%s\n", out.c_str()); | |||
| #endif | |||
| } | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,254 @@ | |||
| /** | |||
| * \file include/misc.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include "lite_build_config.h" | |||
| #include <chrono> | |||
| #include <exception> | |||
| #include <stdexcept> | |||
| #include <string> | |||
| #include "lite/common_enum_c.h" | |||
| #include "lite/global.h" | |||
| namespace lite { | |||
| #if LITE_ENABLE_EXCEPTION | |||
| /*! \brief The error class in lite. | |||
| * | |||
| * It can be used to represent both an error caused by the invalid | |||
| * input of the caller or an invalid runtime condition. | |||
| * | |||
| * The necessary presumption should be guaranteed by assertions instead of | |||
| * exceptions. | |||
| */ | |||
| class Error : public std::exception { | |||
| public: | |||
| Error(const std::string& msg) : m_msg("Error: " + msg) {} | |||
| const char* what() const noexcept override { return m_msg.c_str(); } | |||
| private: | |||
| std::string m_msg; | |||
| }; | |||
| #endif | |||
| std::string ssprintf(const char* fmt = 0, ...) | |||
| __attribute__((format(printf, 1, 2))); | |||
| /*! | |||
| * \brief Print a message. | |||
| * | |||
| * The message is printed only if level is above or equals to the current log | |||
| * level. | |||
| */ | |||
| void print_log(LiteLogLevel level, const char* format = 0, ...) | |||
| __attribute__((format(printf, 2, 3))); | |||
| } // namespace lite | |||
| #if LITE_ENABLE_LOGGING | |||
| #define LITE_LOG_(level, msg...) \ | |||
| do { \ | |||
| lite::print_log(LiteLogLevel::level, ##msg); \ | |||
| } while (0) | |||
| #else | |||
| #define LITE_LOG_(level, msg...) (void)0 | |||
| #endif | |||
| #define LITE_LOG(fmt...) LITE_LOG_(DEBUG, fmt); | |||
| #define LITE_DEBUG(fmt...) LITE_LOG_(DEBUG, fmt); | |||
| #define LITE_WARN(fmt...) LITE_LOG_(WARN, fmt); | |||
| #define LITE_ERROR(fmt...) LITE_LOG_(ERROR, fmt); | |||
| #if LITE_ENABLE_EXCEPTION | |||
| #define LITE_THROW(msg) throw lite::Error(msg) | |||
| #else | |||
| #define LITE_THROW(msg) \ | |||
| do { \ | |||
| LITE_ERROR(msg); \ | |||
| __builtin_trap(); \ | |||
| } while (0) | |||
| #endif | |||
| #if LITE_ENABLE_EXCEPTION | |||
| #define LITE_ERROR_HANDLER_BEGIN try { | |||
| #define LITE_ERROR_HANDLER_END \ | |||
| } \ | |||
| catch (const ::lite::Error& e) { \ | |||
| std::string msg = std::string("Lite exception: ") + e.what(); \ | |||
| LITE_ERROR("%s.", msg.c_str()); \ | |||
| throw; \ | |||
| } | |||
| #else | |||
| #define LITE_ERROR_HANDLER_BEGIN | |||
| #define LITE_ERROR_HANDLER_END | |||
| #endif | |||
| /*! \brief Return an error if the given pointer is null pointer. | |||
| * | |||
| * The macro is used to ensure the validity of the passing context pointer. | |||
| */ | |||
| #define LITE_CHECK_NON_NULL_POINTER(ptr) \ | |||
| LITE_ASSERT(ptr != nullptr, "Input ptr is null.") | |||
| //! branch prediction hint: likely to take | |||
| #define lite_likely(v) __builtin_expect(static_cast<bool>(v), 1) | |||
| //! branch prediction hint: unlikely to take | |||
| #define lite_unlikely(v) __builtin_expect(static_cast<bool>(v), 0) | |||
| #if LITE_ENABLE_LOGGING | |||
| #if LITE_ASSERT_LOC | |||
| #define LITE_ASSERT(expr, msg...) \ | |||
| do { \ | |||
| if (lite_unlikely(!(expr))) { \ | |||
| auto info = lite::ssprintf(msg); \ | |||
| LITE_THROW( \ | |||
| lite::ssprintf("Assert \' %s \' failed at file : %s \n" \ | |||
| "line %d : %s,\nextra " \ | |||
| "message: %s", \ | |||
| #expr, __FILE__, __LINE__, \ | |||
| __PRETTY_FUNCTION__, info.c_str())); \ | |||
| } \ | |||
| } while (0) | |||
| #else | |||
| #define LITE_ASSERT(expr, msg...) \ | |||
| do { \ | |||
| if (lite_unlikely(!(expr))) { \ | |||
| auto info = lite::ssprintf(msg); \ | |||
| LITE_THROW(lite::ssprintf( \ | |||
| "Assert \' %s \' failed at file : %s \n" \ | |||
| "line %d : %s,\nextra " \ | |||
| "message: %s", \ | |||
| #expr, "about location info, please build with debug", \ | |||
| __LINE__, __PRETTY_FUNCTION__, info.c_str())); \ | |||
| } \ | |||
| } while (0) | |||
| #endif | |||
| #else | |||
| #define LITE_ASSERT(expr, msg...) \ | |||
| do { \ | |||
| if (lite_unlikely(!(expr))) { \ | |||
| auto msg_string = lite::ssprintf(msg); \ | |||
| LITE_THROW(msg_string.c_str()); \ | |||
| } \ | |||
| } while (0) | |||
| #endif | |||
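| //! Illustrative sketch (added for this document, not part of the original | |||
| //! header): how LITE_ASSERT and the error-handler macros are combined; the | |||
| //! function is hypothetical, the pattern mirrors src/network.cpp below. | |||
| inline int lite_checked_div_example(int a, int b) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| LITE_ASSERT(b != 0, "divisor must be non-zero, got %d", b); | |||
| return a / b; | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||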
| #define LITE_MARK_USED_VAR(var) ((void)var) | |||
| namespace lite { | |||
| class ScopedTimer { | |||
| public: | |||
| typedef std::chrono::system_clock Clock; | |||
| typedef std::chrono::nanoseconds Nsec; | |||
| ScopedTimer(std::string name) : m_name(name) { m_start = Clock::now(); } | |||
| ~ScopedTimer() { | |||
| m_stop = Clock::now(); | |||
| std::chrono::duration<double> elapsed = m_stop - m_start; | |||
| Nsec u = std::chrono::duration_cast<Nsec>(elapsed); | |||
| auto msg = ssprintf("%s used time %fms.", m_name.c_str(), | |||
| static_cast<double>(u.count()) / 1000000.f); | |||
| LITE_LOG("%s", msg.c_str()); | |||
| } | |||
| private: | |||
| std::chrono::time_point<std::chrono::system_clock> m_start, m_stop; | |||
| const std::string m_name; | |||
| }; | |||
| class Timer { | |||
| public: | |||
| typedef std::chrono::system_clock Clock; | |||
| typedef std::chrono::nanoseconds Nsec; | |||
| Timer(std::string name) : m_name(name) { m_start = Clock::now(); } | |||
| double get_used_time() { | |||
| m_stop = Clock::now(); | |||
| std::chrono::duration<double> elapsed = m_stop - m_start; | |||
| Nsec u = std::chrono::duration_cast<Nsec>(elapsed); | |||
| return static_cast<double>(u.count()) / 1000000.0; | |||
| } | |||
| void print_used_time(int iter) { | |||
| m_stop = Clock::now(); | |||
| std::chrono::duration<double> elapsed = m_stop - m_start; | |||
| Nsec u = std::chrono::duration_cast<Nsec>(elapsed); | |||
| printf("%s used time %f ms\n", (m_name + std::to_string(iter)).c_str(), | |||
| static_cast<double>(u.count()) / 1000000.0); | |||
| } | |||
| void reset_start() { m_start = Clock::now(); } | |||
| private: | |||
| std::chrono::time_point<std::chrono::system_clock> m_start, m_stop; | |||
| const std::string m_name; | |||
| }; | |||
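| //! Illustrative sketch (added for this document, not part of the original | |||
| //! header): the intended use of ScopedTimer and Timer for coarse profiling; | |||
| //! the function and the measured work are hypothetical. | |||
| inline double timer_usage_example_sketch() { | |||
| ScopedTimer scoped("example_scope");     // logs the elapsed time on destruction | |||
| Timer timer("example_iter"); | |||
| //! ... run the work to be measured here ... | |||
| double used_ms = timer.get_used_time();  // elapsed milliseconds so far | |||
| timer.reset_start();                     // restart the measurement window | |||
| return used_ms; | |||
| } | |||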
| inline void mark_used_variable() {} | |||
| template <typename T, typename... Arg> | |||
| inline void mark_used_variable(T firstArg, Arg... args) { | |||
| LITE_MARK_USED_VAR(firstArg); | |||
| mark_used_variable(args...); | |||
| } | |||
| } // namespace lite | |||
| #if defined(_WIN32) | |||
| #include <io.h> | |||
| #include <windows.h> | |||
| #undef CONST | |||
| #define F_OK 0 | |||
| #define RTLD_LAZY 0 | |||
| // On the Windows platform we use a lib_filename without a full path, so | |||
| // the win-api "LoadLibrary" uses its standard search strategy to | |||
| // find the lib module. As we cannot access the lib_filename without a | |||
| // full path, we should not use "access(a, b)" to verify it. | |||
| #define access(a, b) false | |||
| static inline void* dlopen(const char* file, int) { | |||
| return static_cast<void*>(LoadLibrary(file)); | |||
| } | |||
| static inline char* dlerror() { | |||
| const char* errmsg = "dlerror not aviable in windows"; | |||
| return const_cast<char*>(errmsg); | |||
| } | |||
| static inline void* dlsym(void* handle, const char* name) { | |||
| FARPROC symbol = GetProcAddress((HMODULE)handle, name); | |||
| return reinterpret_cast<void*>(symbol); | |||
| } | |||
| #elif __linux__ || __unix__ || __APPLE__ | |||
| #include <dlfcn.h> | |||
| #include <unistd.h> | |||
| #endif | |||
| #if __DEPLOY_ON_XP_SP2__ | |||
| //! refer to | |||
| //! https://docs.microsoft.com/en-us/cpp/build/configuring-programs-for-windows-xp?view=msvc-160 | |||
| //! XP SP2 does not fully support the VC runtime: KERNEL32.dll does not | |||
| //! implement some base APIs needed by C++ std facilities, for example | |||
| //! std::mutex/std::thread/std::condition_variable. As a workaround, we | |||
| //! disable some MegEngine features (for example, multi-threading) on XP SP2. | |||
| #define LITE_MUTEX size_t | |||
| #define LITE_RECURSIVE_MUTEX size_t | |||
| #define LITE_LOCK_GUARD(mtx) LITE_MARK_USED_VAR(mtx) | |||
| #define LITE_LOCK_GUARD_UNIQUE(mtx) LITE_MARK_USED_VAR(mtx) | |||
| #define LITE_LOCK_GUARD_SHARED(mtx) LITE_MARK_USED_VAR(mtx) | |||
| #else | |||
| #define LITE_MUTEX std::mutex | |||
| #define LITE_RECURSIVE_MUTEX std::recursive_mutex | |||
| #define LITE_LOCK_GUARD(mtx) \ | |||
| std::lock_guard<decltype(mtx)> LITE_LOCK_GUARD_CTOR(mtx) | |||
| #define LITE_LOCK_GUARD_UNIQUE(mtx) \ | |||
| std::unique_lock<decltype(mtx)> LITE_LOCK_GUARD_CTOR(mtx) | |||
| #define LITE_LOCK_GUARD_SHARED(mtx) \ | |||
| std::shared_lock<decltype(mtx)> LITE_LOCK_GUARD_CTOR(mtx) | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,501 @@ | |||
| /** | |||
| * \file src/network.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "lite/network.h" | |||
| #include "function_base.h" | |||
| #include "network_impl_base.h" | |||
| #include "parse_info/parse_info_base.h" | |||
| #include "parse_model/model_parser.h" | |||
| #include "type_info.h" | |||
| #if LITE_BUILD_WITH_MGE | |||
| #include "mge/function_dft.h" | |||
| #include "mge/network_impl.h" | |||
| #endif | |||
| #include <cerrno> | |||
| #include <cstdio> | |||
| #include <cstdlib> | |||
| #include <cstring> | |||
| #include <fstream> | |||
| #include <memory> | |||
| using namespace lite; | |||
| /** | |||
| * \brief Construct the network implementation. | |||
| * The order must be: | |||
| * 1. create the implementation | |||
| * 2. config and load | |||
| * 3. set_io | |||
| * (an illustrative usage sketch follows the first constructor below) | |||
| */ | |||
| Network::Network(const Config& config, const NetworkIO& network_io) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| m_config = config; | |||
| m_network_io = network_io; | |||
| if (config.backend == LiteBackend::LITE_DEFAULT) { | |||
| m_impl = call_func<NetworkImplDft, | |||
| std::unique_ptr<lite::Network::NetworkImplBase>>( | |||
| "create_network"); | |||
| } else if (config.backend == LiteBackend::LITE_RK_NPU) { | |||
| m_impl = call_func<NetworkImplRK, | |||
| std::unique_ptr<lite::Network::NetworkImplBase>>( | |||
| "create_network"); | |||
| } | |||
| m_impl->set_config(config); | |||
| m_impl->set_io(network_io); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
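| /* | |||
| * Illustrative usage sketch (added for this document): the call order the | |||
| * comment above describes. The model path and input name are hypothetical, | |||
| * and the default arguments are assumed to be declared in lite/network.h. | |||
| * | |||
| *   lite::Config config; | |||
| *   auto network = std::make_shared<lite::Network>(config); | |||
| *   network->load_model("./example_model.lite");   // hypothetical path | |||
| *   auto input = network->get_io_tensor("data");   // hypothetical tensor name | |||
| *   // ... fill the input through input->get_memory_ptr() ... | |||
| *   network->forward(); | |||
| *   network->wait(); | |||
| *   auto output = network->get_output_tensor(0); | |||
| */ | |||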
| Network::Network(const NetworkIO& network_io, const Config& config) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| m_config = config; | |||
| m_network_io = network_io; | |||
| if (config.backend == LiteBackend::LITE_DEFAULT) { | |||
| m_impl = call_func<NetworkImplDft, | |||
| std::unique_ptr<lite::Network::NetworkImplBase>>( | |||
| "create_network"); | |||
| } else if (config.backend == LiteBackend::LITE_RK_NPU) { | |||
| m_impl = call_func<NetworkImplRK, | |||
| std::unique_ptr<lite::Network::NetworkImplBase>>( | |||
| "create_network"); | |||
| } | |||
| m_impl->set_config(config); | |||
| m_impl->set_io(network_io); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| void Network::load_model(void* model_mem, size_t size) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
| //! this model_mem is managed by user | |||
| std::shared_ptr<void> model{model_mem, [](void*) {}}; | |||
| prase_model(model, size); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| void Network::load_model(std::string model_path) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
| FILE* fin = fopen(model_path.c_str(), "rb"); | |||
| LITE_ASSERT(fin, "failed to open %s: %s", model_path.c_str(), | |||
| strerror(errno)); | |||
| fseek(fin, 0, SEEK_END); | |||
| size_t size = ftell(fin); | |||
| fseek(fin, 0, SEEK_SET); | |||
| void* ptr = malloc(size); | |||
| std::shared_ptr<void> buf{ptr, ::free}; | |||
| auto nr = fread(buf.get(), 1, size, fin); | |||
| LITE_ASSERT(nr == size); | |||
| fclose(fin); | |||
| prase_model(buf, size); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| void Network::prase_model(std::shared_ptr<void> model_data, size_t size) { | |||
| std::unordered_map<std::string, LiteAny> separate_config_map; | |||
| ModelParser model_parser(model_data, size); | |||
| //! parse the model info | |||
| if (model_parser.parse_model_info(m_config, m_network_io, | |||
| separate_config_map, m_extra_info)) { | |||
| if (m_config.backend == LiteBackend::LITE_DEFAULT && | |||
| m_impl->get_backend_type() != LiteBackend::LITE_DEFAULT) { | |||
| m_impl.reset(try_call_func<NetworkImplDft, | |||
| lite::Network::NetworkImplBase*>( | |||
| "parse_model")); | |||
| } else if (m_config.backend == LiteBackend::LITE_RK_NPU && | |||
| m_impl->get_backend_type() != LiteBackend::LITE_RK_NPU) { | |||
| m_impl.reset(try_call_func<NetworkImplRK, | |||
| lite::Network::NetworkImplBase*>( | |||
| "parse_model")); | |||
| } | |||
| m_impl->set_config(m_config); | |||
| m_impl->set_io(m_network_io); | |||
| } | |||
| //! decrypt the model | |||
| size_t model_length; | |||
| auto&& model_shared_ptr = model_parser.parse_model(model_length, m_config); | |||
| m_impl->load_model(model_shared_ptr, model_length, separate_config_map); | |||
| m_loaded = true; | |||
| update_from_implement(); | |||
| } | |||
| Network::~Network() = default; | |||
| void Network::update_from_implement() { | |||
| m_config.device_type = m_impl->get_device_type(); | |||
| } | |||
| void Network::compute_only_configured_output() { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| LITE_ASSERT(!m_loaded, | |||
| "compute_only_configured_output should be used before model " | |||
| "loaded."); | |||
| LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
| return m_impl->compute_only_configured_output(); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| std::shared_ptr<Tensor> Network::get_io_tensor(std::string name, | |||
| LiteTensorPhase phase) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| LITE_ASSERT(m_loaded, "get_io_tensor should be used after model loaded."); | |||
| LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
| return m_impl->get_io_tensor(name, phase); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| std::shared_ptr<Tensor> Network::get_input_tensor(size_t index) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| LITE_ASSERT(m_loaded, | |||
| "get_input_tensor should be used after model loaded."); | |||
| LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
| return m_impl->get_input_tensor(index); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| std::shared_ptr<Tensor> Network::get_output_tensor(size_t index) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| LITE_ASSERT(m_loaded, | |||
| "get_output_tensor should be used after model loaded."); | |||
| LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
| return m_impl->get_output_tensor(index); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| Network& Network::set_async_callback(const AsyncCallback& callback) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
| m_impl->set_async_callback(std::move(callback)); | |||
| return *this; | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| Network& Network::set_start_callback(const StartCallback& callback) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
| m_impl->set_start_callback(std::move(callback)); | |||
| return *this; | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| Network& Network::set_finish_callback(const FinishCallback& callback) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
| m_impl->set_finish_callback(std::move(callback)); | |||
| return *this; | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| Network& Network::set_device_id(int device_id) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| LITE_ASSERT(!m_loaded, "set_device_id should be used before model loaded."); | |||
| LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
| m_impl->set_device_id(device_id); | |||
| return *this; | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| Network& Network::set_stream_id(int stream_id) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| LITE_ASSERT(!m_loaded, "set_stream_id should be used before model loaded."); | |||
| LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
| m_impl->set_stream_id(stream_id); | |||
| return *this; | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| void Network::forward() { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| LITE_ASSERT(m_loaded, "forward should be used after model loaded."); | |||
| LITE_CHECK_NON_NULL_POINTER(m_impl.get()); | |||
| m_impl->forward(); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| void Network::wait() { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| LITE_ASSERT(m_loaded, "wait should be used after model loaded."); | |||
| LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
| m_impl->wait(); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| std::string Network::get_input_name(size_t index) const { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| LITE_ASSERT(m_loaded, "get_input_name should be used after model loaded."); | |||
| LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
| return m_impl->get_input_name(index); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| std::string Network::get_output_name(size_t index) const { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| LITE_ASSERT(m_loaded, "get_output_name should be used after model loaded."); | |||
| LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
| return m_impl->get_output_name(index); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| std::vector<std::string> Network::get_all_input_name() const { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| LITE_ASSERT(m_loaded, | |||
| "get_all_input_name should be used after model loaded."); | |||
| LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
| auto all_input_name = m_impl->get_all_input_name(); | |||
| std::vector<std::string> all_names; | |||
| for (auto& name : all_input_name) { | |||
| all_names.push_back(name); | |||
| } | |||
| return all_names; | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| std::vector<std::string> Network::get_all_output_name() const { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| LITE_ASSERT(m_loaded, | |||
| "get_all_output_name should be used after model loaded."); | |||
| LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
| auto all_output_name = m_impl->get_all_output_name(); | |||
| std::vector<std::string> all_names; | |||
| for (auto& name : all_output_name) { | |||
| all_names.push_back(name); | |||
| } | |||
| return all_names; | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| int Network::get_device_id() const { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
| return m_impl->get_device_id(); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| int Network::get_stream_id() const { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
| return m_impl->get_stream_id(); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| void Network::enable_profile_performance(std::string profile_file_path) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| m_impl->enable_profile_performance(profile_file_path); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| const std::string& Network::get_model_extra_info() { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| return m_extra_info; | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| LiteDeviceType Network::get_device_type() const { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| return m_impl->get_device_type(); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| /*********************** MGE special network function ***************/ | |||
| void Runtime::set_cpu_threads_number(std::shared_ptr<Network> network, | |||
| size_t nr_threads) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| auto network_impl = NetworkHelper::implement(network); | |||
| if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { | |||
| LITE_ASSERT( | |||
| !NetworkHelper::loaded(network), | |||
| "set_cpu_threads_number should be used before model loaded."); | |||
| call_func<NetworkImplDft, void>("set_cpu_threads_number", network_impl, | |||
| nr_threads); | |||
| return; | |||
| } | |||
| LITE_THROW("set_cpu_threads_number is not aviliable in the backend."); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| void Runtime::use_tensorrt(std::shared_ptr<Network> network) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| auto network_impl = NetworkHelper::implement(network); | |||
| if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { | |||
| LITE_ASSERT(!NetworkHelper::loaded(network), | |||
| "use_tensorrt should be used before model loaded."); | |||
| call_func<NetworkImplDft, void>("use_tensorrt", network_impl); | |||
| return; | |||
| } | |||
| LITE_THROW("use_tensorrt is not aviliable in the backend."); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| size_t Runtime::get_cpu_threads_number(const std::shared_ptr<Network> network) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| auto network_impl = NetworkHelper::implement(network); | |||
| if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { | |||
| return call_func<NetworkImplDft, size_t>("get_cpu_threads_number", | |||
| network_impl); | |||
| } | |||
| LITE_THROW("get_cpu_threads_number is not aviliable in the backend."); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| void Runtime::set_runtime_thread_affinity( | |||
| std::shared_ptr<Network> network, | |||
| const ThreadAffinityCallback& thread_affinity_callback) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| auto network_impl = NetworkHelper::implement(network); | |||
| if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { | |||
| LITE_ASSERT(NetworkHelper::loaded(network), | |||
| "set_runtime_thread_affinity should be used after model " | |||
| "loaded."); | |||
| call_func<NetworkImplDft, void>("set_runtime_thread_affinity", | |||
| network_impl, thread_affinity_callback); | |||
| return; | |||
| } | |||
| LITE_THROW("set_runtime_thread_affinity is not aviliable in the backend."); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| void Runtime::set_cpu_inplace_mode(std::shared_ptr<Network> network) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| auto network_impl = NetworkHelper::implement(network); | |||
| if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { | |||
| LITE_ASSERT(!NetworkHelper::loaded(network), | |||
| "set_cpu_inplace_mode should be used before model loaded."); | |||
| call_func<NetworkImplDft, void>("set_cpu_inplace_mode", network_impl); | |||
| return; | |||
| } | |||
| LITE_THROW("set_cpu_inplace_mode is not aviliable in the backend."); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| bool Runtime::is_cpu_inplace_mode(const std::shared_ptr<Network> network) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| auto network_impl = NetworkHelper::implement(network); | |||
| if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { | |||
| return call_func<NetworkImplDft, bool>("is_cpu_inplace_mode", | |||
| network_impl); | |||
| } | |||
| LITE_THROW("is_cpu_inplace_mode is not aviliable in the backend."); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| //! set opr algorithm selection strategy in the network | |||
| void Runtime::set_network_algo_policy(std::shared_ptr<Network> network, | |||
| LiteAlgoSelectStrategy strategy, | |||
| uint32_t shared_batch_size, | |||
| bool binary_equal_between_batch) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| auto network_impl = NetworkHelper::implement(network); | |||
| if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { | |||
| call_func<NetworkImplDft, void>("set_network_algo_policy", network_impl, | |||
| strategy, shared_batch_size, | |||
| binary_equal_between_batch); | |||
| return; | |||
| } | |||
| LITE_THROW("set_network_algo_policy is not aviliable in the backend."); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| //! set opr algorithm selection strategy in the network | |||
| void Runtime::set_network_algo_workspace_limit(std::shared_ptr<Network> network, | |||
| size_t workspace_limit) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| auto network_impl = NetworkHelper::implement(network); | |||
| if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { | |||
| LITE_ASSERT(NetworkHelper::loaded(network), | |||
| "set_network_algo_policy should be used after model " | |||
| "loaded."); | |||
| call_func<NetworkImplDft, void>("set_network_algo_workspace_limit", | |||
| network_impl, workspace_limit); | |||
| return; | |||
| } | |||
| LITE_THROW( | |||
| "set_network_algo_workspace_limit is not aviliable in the " | |||
| "backend."); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| //! set the network memory allocator; the allocator is defined by the user | |||
| void Runtime::set_memory_allocator(std::shared_ptr<Network> network, | |||
| std::shared_ptr<Allocator> user_allocator) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| auto network_impl = NetworkHelper::implement(network); | |||
| if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { | |||
| LITE_ASSERT(!NetworkHelper::loaded(network), | |||
| "set_memory_allocator should be used before model loaded."); | |||
| call_func<NetworkImplDft, void>("set_memory_allocator", network_impl, | |||
| user_allocator); | |||
| return; | |||
| } | |||
| LITE_THROW("set_memory_allocator is not aviliable in the backend."); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| void Runtime::share_runtime_memory_with(std::shared_ptr<Network> dst_network, | |||
| std::shared_ptr<Network> src_network) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| auto network_impl_dst = NetworkHelper::implement(dst_network); | |||
| if (network_impl_dst->get_backend_type() == LiteBackend::LITE_DEFAULT) { | |||
| LITE_ASSERT(!NetworkHelper::loaded(dst_network), | |||
| "share_runtime_memory_with should be used before model " | |||
| "loaded."); | |||
| call_func<NetworkImplDft, void>("share_runtime_memory_with", | |||
| network_impl_dst, | |||
| NetworkHelper::implement(src_network)); | |||
| return; | |||
| } | |||
| LITE_THROW("share_runtime_memory_with is not aviliable in the backend."); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| void Runtime::enable_io_txt_dump(std::shared_ptr<Network> network, | |||
| std::string io_txt_out_file) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| auto network_impl = NetworkHelper::implement(network); | |||
| if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { | |||
| call_func<NetworkImplDft, void>("enable_io_txt_dump", network_impl, | |||
| io_txt_out_file); | |||
| return; | |||
| } | |||
| LITE_THROW("enable_io_txt_dump is not aviliable in the backend."); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| void Runtime::enable_io_bin_dump(std::shared_ptr<Network> network, | |||
| std::string io_bin_out_dir) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| auto network_impl = NetworkHelper::implement(network); | |||
| if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { | |||
| call_func<NetworkImplDft, void>("enable_io_bin_dump", network_impl, | |||
| io_bin_out_dir); | |||
| return; | |||
| } | |||
| LITE_THROW("enable_io_bin_dump is not aviliable in the backend."); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| void Runtime::shared_weight_with_network( | |||
| std::shared_ptr<Network> dst_network, | |||
| const std::shared_ptr<Network> src_network) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| auto network_impl_dst = NetworkHelper::implement(dst_network); | |||
| if (network_impl_dst->get_backend_type() == LiteBackend::LITE_DEFAULT) { | |||
| LITE_ASSERT(NetworkHelper::loaded(src_network), | |||
| "shared_weight_with_network should be used after the src " | |||
| "network " | |||
| "loaded."); | |||
| auto src_implement = NetworkHelper::implement(src_network); | |||
| call_func<NetworkImplDft, void>("shared_weight_with", network_impl_dst, | |||
| src_implement); | |||
| NetworkHelper::loaded(dst_network, true); | |||
| return; | |||
| } | |||
| LITE_THROW("shared_weight_with_network is not aviliable in the backend."); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,161 @@ | |||
| /** | |||
| * \file src/network_impl_base.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include "lite/network.h" | |||
| #include "misc.h" | |||
| #include "tensor_impl_base.h" | |||
| #include "type_info.h" | |||
| #include <unordered_map> | |||
| namespace lite { | |||
| /*! | |||
| * \brief the inner IO data struct, which adds some internal data to IO | |||
| */ | |||
| class IOInner : public IO { | |||
| public: | |||
| //! used to flag whether the corresponding lite_tensor is filled: when the | |||
| //! value of lite_tensor is filled, have_sync is true, otherwise false; | |||
| //! this is used in async mode | |||
| bool have_sync = false; | |||
| //! Real input and output data location | |||
| std::shared_ptr<Tensor> lite_tensor = nullptr; | |||
| IOInner() = default; | |||
| IOInner(const IO& io) { | |||
| name = io.name; | |||
| is_host = io.is_host; | |||
| io_type = io.io_type; | |||
| config_layout = io.config_layout; | |||
| } | |||
| }; | |||
| /*! | |||
| * \brief the real network IO info used when the network runs | |||
| */ | |||
| struct NetworkIOInner { | |||
| std::vector<IOInner> inputs; | |||
| std::vector<IOInner> outputs; | |||
| }; | |||
| /*! | |||
| * \brief implement the Network, contain the mgb related member | |||
| */ | |||
| class Network::NetworkImplBase : public DynTypeObj { | |||
| public: | |||
| virtual ~NetworkImplBase() = default; | |||
| //! set the config of the network, include: | |||
| //! the inference device | |||
| //! the other inference options, such as record_level, weight_preprocess... | |||
| virtual void set_config(const Config& config) = 0; | |||
| //! set the special IO information; if not set, the default IO tensors will | |||
| //! be used. This is for the case where the input/output is not a host | |||
| //! tensor; by default the input/output tensors are host tensors | |||
| virtual void set_io(const NetworkIO& network_io) = 0; | |||
| //! only compute the output tensors configured by the user | |||
| virtual void compute_only_configured_output() = 0; | |||
| //! get the network input and output tensor; its layout is | |||
| //! synced from the mge tensor | |||
| virtual std::shared_ptr<Tensor> get_io_tensor( | |||
| std::string io_name, | |||
| LiteTensorPhase phase = LiteTensorPhase::LITE_IO) = 0; | |||
| //! get the input tensor by index in the load_result tensormap | |||
| virtual std::shared_ptr<Tensor> get_input_tensor(size_t index) = 0; | |||
| //! get the output tensor by index in the load_result output_var_list | |||
| virtual std::shared_ptr<Tensor> get_output_tensor(size_t index) = 0; | |||
| //! get all the input tensor names, in the order returned by load | |||
| virtual std::vector<const char*> get_all_input_name() const = 0; | |||
| //! get all the output tensor names, in the order returned by load | |||
| virtual std::vector<const char*> get_all_output_name() const = 0; | |||
| //! get the input tensor name, in the order returned by load | |||
| virtual const char* get_input_name(size_t index) const = 0; | |||
| //! get the output tensor name, in the order returned by load | |||
| virtual const char* get_output_name(size_t index) const = 0; | |||
| //! set the callback in async mode | |||
| virtual void set_async_callback(const AsyncCallback& callback) = 0; | |||
| //! set the start callback which will execute before network forward | |||
| virtual void set_start_callback(const StartCallback& callback) = 0; | |||
| //! set the finish callback which will execute after network forward | |||
| virtual void set_finish_callback(const FinishCallback& callback) = 0; | |||
| //! load the model and get the m_load_result | |||
| virtual void load_model(std::shared_ptr<void> model_mem, size_t size, | |||
| std::unordered_map<std::string, LiteAny> | |||
| separate_config_map = {}) = 0; | |||
| //! forward the network with filled input data and fill the output data | |||
| //! to the output tensor | |||
| virtual void forward() = 0; | |||
| //! in sync mode, wait until the inference finishes | |||
| virtual void wait() = 0; | |||
| //! set device id, default device id = 0 | |||
| virtual void set_device_id(int device_id) = 0; | |||
| virtual int get_device_id() const = 0; | |||
| virtual LiteBackend get_backend_type() const = 0; | |||
| //! set stream id, default stream id = 0 | |||
| virtual void set_stream_id(int stream_id) = 0; | |||
| virtual int get_stream_id() const = 0; | |||
| virtual LiteDeviceType get_device_type() const = 0; | |||
| //! enable profiling of the network; a profile file will be generated | |||
| virtual void enable_profile_performance(std::string profile_file_path) = 0; | |||
| }; | |||
| /******************************** friend class *****************************/ | |||
| /*! | |||
| * \brief friend class of Network, for conveniently accessing the Network members | |||
| */ | |||
| class NetworkHelper { | |||
| public: | |||
| static bool loaded(const std::shared_ptr<Network> network) { | |||
| LITE_ASSERT(network); | |||
| return network->m_loaded; | |||
| } | |||
| static void loaded(const std::shared_ptr<Network> network, bool loaded) { | |||
| LITE_ASSERT(network); | |||
| network->m_loaded = loaded; | |||
| } | |||
| static Network::NetworkImplBase* implement(const Network* network) { | |||
| LITE_ASSERT(network); | |||
| return network->m_impl.get(); | |||
| } | |||
| static Network::NetworkImplBase* implement( | |||
| const std::shared_ptr<Network> network) { | |||
| LITE_ASSERT(network); | |||
| return network->m_impl.get(); | |||
| } | |||
| static void implement(const std::shared_ptr<Network> network, | |||
| std::unique_ptr<Network::NetworkImplBase> impl) { | |||
| LITE_ASSERT(network); | |||
| network->m_impl = std::move(impl); | |||
| } | |||
| }; | |||
| } // namespace lite | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,246 @@ | |||
| /** | |||
| * \file src/parse_info/default_parse.h | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include "../misc.h" | |||
| #include "lite/global.h" | |||
| #include "lite/network.h" | |||
| #include "nlohmann/json.hpp" | |||
| namespace lite { | |||
| //! The LITE_default parse info function | |||
| bool default_parse_info( | |||
| const void* info_ptr, size_t length, const std::string& model_name, | |||
| Config& config, NetworkIO& network_io, | |||
| std::unordered_map<std::string, LiteAny>& separate_config_map, | |||
| std::string& extra_info) { | |||
| using json = nlohmann::json; | |||
| std::string json_string(static_cast<const char*>(info_ptr), length); | |||
| auto info = json::parse(json_string); | |||
| if (!info["valid"]) { | |||
| return false; | |||
| } | |||
| auto info_model_name = info["name"]; | |||
| if (info_model_name != model_name) { | |||
| LITE_THROW( | |||
| ssprintf("infomation of model name is not match, packed model " | |||
| "is %s, but json info get %s.", | |||
| model_name.c_str(), | |||
| static_cast<std::string>(info_model_name).c_str())); | |||
| } | |||
| //! check version | |||
| std::string model_version = info["version"]; | |||
| int major = std::stoi(model_version.substr(0, model_version.find("."))); | |||
| int start = model_version.find(".") + 1; | |||
| int minor = std::stoi( | |||
| model_version.substr(start, model_version.find(".", start))); | |||
| start = model_version.find(".", start) + 1; | |||
| int patch = std::stoi(model_version.substr(start)); | |||
| int lite_major, lite_minor, lite_patch; | |||
| lite::get_version(lite_major, lite_minor, lite_patch); | |||
| size_t model_version_sum = (major * 10000 + minor) * 100 + patch; | |||
| size_t lite_version_sum = | |||
| (lite_major * 10000 + lite_minor) * 100 + lite_patch; | |||
| if (model_version_sum > lite_version_sum) { | |||
| LITE_WARN("Lite load the future version model !!!!!!!!!!!!!"); | |||
| } | |||
| if (info.contains("has_compression")) { | |||
| config.has_compression = info["has_compression"]; | |||
| } | |||
| if (info.contains("backend")) { | |||
| if (info["backend"] == "MGE") { | |||
| config.backend = LiteBackend::LITE_DEFAULT; | |||
| } | |||
| if (info["backend"] == "RK") { | |||
| config.backend = LiteBackend::LITE_RK_NPU; | |||
| } | |||
| } | |||
| auto get_device_type = [](std::string type) -> LiteDeviceType { | |||
| if (type == "CPU") | |||
| return LiteDeviceType::LITE_CPU; | |||
| if (type == "CUDA") | |||
| return LiteDeviceType::LITE_CUDA; | |||
| if (type == "OPENCL") | |||
| return LiteDeviceType::LITE_OPENCL; | |||
| if (type == "ATLAS") | |||
| return LiteDeviceType::LITE_ATLAS; | |||
| if (type == "NPU") | |||
| return LiteDeviceType::LITE_NPU; | |||
| else { | |||
| LITE_THROW(ssprintf("LITE not support device type of %s.", | |||
| type.c_str())); | |||
| } | |||
| }; | |||
| if (info.contains("device")) { | |||
| auto device_json = info["device"]; | |||
| config.device_type = get_device_type(device_json["type"]); | |||
| if (device_json.contains("device_id")) { | |||
| separate_config_map["device_id"] = | |||
| static_cast<int>(device_json["device_id"]); | |||
| } | |||
| if (device_json.contains("number_threads")) { | |||
| separate_config_map["number_threads"] = | |||
| static_cast<size_t>(device_json["number_threads"]); | |||
| } | |||
| if (device_json.contains("enable_inplace_model")) { | |||
| separate_config_map["enable_inplace_model"] = | |||
| static_cast<bool>(device_json["enable_inplace_model"]); | |||
| } | |||
| if (device_json.contains("use_tensorrt")) { | |||
| separate_config_map["use_tensorrt"] = | |||
| static_cast<bool>(device_json["use_tensorrt"]); | |||
| } | |||
| } | |||
| //! options | |||
| if (info.contains("options")) { | |||
| auto options = info["options"]; | |||
| if (options.contains("weight_preprocess")) | |||
| config.options.weight_preprocess = options["weight_preprocess"]; | |||
| if (options.contains("fuse_preprocess")) | |||
| config.options.fuse_preprocess = options["fuse_preprocess"]; | |||
| if (options.contains("fake_next_exec")) | |||
| config.options.fake_next_exec = options["fake_next_exec"]; | |||
| if (options.contains("var_sanity_check_first_run")) | |||
| config.options.var_sanity_check_first_run = | |||
| options["var_sanity_check_first_run"]; | |||
| if (options.contains("const_shape")) | |||
| config.options.const_shape = options["const_shape"]; | |||
| if (options.contains("force_dynamic_alloc")) | |||
| config.options.force_dynamic_alloc = options["force_dynamic_alloc"]; | |||
| if (options.contains("force_output_dynamic_alloc")) | |||
| config.options.force_output_dynamic_alloc = | |||
| options["force_output_dynamic_alloc"]; | |||
| if (options.contains("no_profiling_on_shape_change")) | |||
| config.options.no_profiling_on_shape_change = | |||
| options["no_profiling_on_shape_change"]; | |||
| if (options.contains("jit_level")) | |||
| config.options.jit_level = options["jit_level"]; | |||
| if (options.contains("comp_node_seq_record_level")) | |||
| config.options.comp_node_seq_record_level = | |||
| options["comp_node_seq_record_level"]; | |||
| if (options.contains("graph_opt_level")) | |||
| config.options.graph_opt_level = options["graph_opt_level"]; | |||
| if (options.contains("async_exec_level")) | |||
| config.options.async_exec_level = options["async_exec_level"]; | |||
| } | |||
| //! IO | |||
| auto get_io_type = [](std::string type) -> LiteIOType { | |||
| if (type == "value") | |||
| return LiteIOType::LITE_IO_VALUE; | |||
| if (type == "shape") | |||
| return LiteIOType::LITE_IO_SHAPE; | |||
| else { | |||
| LITE_THROW( | |||
| ssprintf("LITE not support IO type of %s.", type.c_str())); | |||
| } | |||
| }; | |||
| auto get_data_type = [](std::string type) -> LiteDataType { | |||
| if (type == "float32") | |||
| return LiteDataType::LITE_FLOAT; | |||
| if (type == "float16") | |||
| return LiteDataType::LITE_HALF; | |||
| if (type == "int32") | |||
| return LiteDataType::LITE_INT; | |||
| if (type == "int16") | |||
| return LiteDataType::LITE_INT16; | |||
| if (type == "int8") | |||
| return LiteDataType::LITE_INT8; | |||
| if (type == "uint8") | |||
| return LiteDataType::LITE_UINT8; | |||
| else { | |||
| LITE_THROW(ssprintf("LITE not support data type of %s.", | |||
| type.c_str())); | |||
| } | |||
| }; | |||
| #define SET_SHAPE(shape_json_, config_) \ | |||
| do { \ | |||
| int ndim = 0; \ | |||
| for (int i = 0; i < 4; i++) { \ | |||
| if (shape_json_.contains(shape_name[i])) { \ | |||
| ndim++; \ | |||
| config_.config_layout.shapes[i] = shape_json_[shape_name[i]]; \ | |||
| } else { \ | |||
| break; \ | |||
| } \ | |||
| } \ | |||
| config_.config_layout.ndim = ndim; \ | |||
| } while (0) | |||
| #define Config_IO(io_json_, io_config_) \ | |||
| if (io_json_.contains("is_host")) \ | |||
| io_config_.is_host = io_json_["is_host"]; \ | |||
| if (io_json_.contains("io_type")) \ | |||
| io_config_.io_type = get_io_type(io_json_["io_type"]); \ | |||
| if (io_json_.contains("dtype")) \ | |||
| io_config_.config_layout.data_type = get_data_type(io_json_["dtype"]); \ | |||
| if (io_json_.contains("shape")) { \ | |||
| auto shape_json = io_json_["shape"]; \ | |||
| SET_SHAPE(shape_json, io_config_); \ | |||
| } | |||
| const std::string shape_name[] = {"dim0", "dim1", "dim2", "dim3"}; | |||
| if(info.contains("IO")){ | |||
| auto IOs = info["IO"]; | |||
| if(IOs.contains("inputs")){ | |||
| auto inputs = IOs["inputs"]; | |||
| for (size_t i = 0; i < inputs.size(); i++) { | |||
| auto input_json = inputs[i]; | |||
| bool found = false; | |||
| for (auto&& io_config : network_io.inputs) { | |||
| if (io_config.name == input_json["name"]) { | |||
| found = true; | |||
| Config_IO(input_json, io_config); | |||
| } | |||
| } | |||
| if (!found) { | |||
| IO input; | |||
| input.name = input_json["name"]; | |||
| Config_IO(input_json, input); | |||
| network_io.inputs.push_back(input); | |||
| } | |||
| } | |||
| } | |||
| if (IOs.contains("outputs")) { | |||
| auto outputs = IOs["outputs"]; | |||
| for (size_t i = 0; i < outputs.size(); i++) { | |||
| auto output_json = outputs[i]; | |||
| bool found = false; | |||
| for (auto&& io_config : network_io.outputs) { | |||
| if (io_config.name == output_json["name"]) { | |||
| found = true; | |||
| Config_IO(output_json, io_config); | |||
| } | |||
| } | |||
| if (!found) { | |||
| IO output; | |||
| output.name = output_json["name"]; | |||
| Config_IO(output_json, output); | |||
| network_io.outputs.push_back(output); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| //! extra_info | |||
| if (info.contains("extra_info")) { | |||
| extra_info = info["extra_info"].dump(); | |||
| } | |||
| return true; | |||
| #undef SET_SHAPE | |||
| #undef Config_IO | |||
| } | |||
| } // namespace lite | |||
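| //! Illustrative sketch (added for this document): a minimal JSON info blob | |||
| //! accepted by default_parse_info() above; all field values are hypothetical. | |||
| //! | |||
| //!     { | |||
| //!         "valid": true, | |||
| //!         "name": "example_model", | |||
| //!         "version": "8.9.1", | |||
| //!         "backend": "MGE", | |||
| //!         "device": {"type": "CPU", "number_threads": 2}, | |||
| //!         "options": {"weight_preprocess": true}, | |||
| //!         "IO": { | |||
| //!             "inputs": [{"name": "data", "dtype": "float32", | |||
| //!                         "shape": {"dim0": 1, "dim1": 3, "dim2": 224, "dim3": 224}}] | |||
| //!         } | |||
| //!     } | |||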
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,40 @@ | |||
| /** | |||
| * \file src/parse_info/parse_info_base.h | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include "lite/global.h" | |||
| #include "mutex" | |||
| namespace lite { | |||
| struct ParseInfoStaticData { | |||
| std::unordered_map<std::string, ParseInfoFunc> parse_info_methods; | |||
| LITE_MUTEX map_mutex; | |||
| }; | |||
| ParseInfoStaticData& parse_info_static_data(); | |||
| template <int count> | |||
| struct ParseInfoRegister; | |||
| } // namespace lite | |||
| #define REGIST_PARSE_INFO_FUNCTION(name_, func_) \ | |||
| REGIST_PARSE_INFO_FUNCTION_WITH_NUM(__COUNTER__, name_, func_) | |||
| #define REGIST_PARSE_INFO_FUNCTION_WITH_NUM(number_, name_, func_) \ | |||
| template <> \ | |||
| struct ParseInfoRegister<number_> { \ | |||
| ParseInfoRegister() { register_parse_info_func(name_, func_); } \ | |||
| }; \ | |||
| namespace { \ | |||
| ParseInfoRegister<number_> parse_info_##number_; \ | |||
| } | |||
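| //! Illustrative sketch (added for this document): registering a custom info | |||
| //! parser under a hypothetical name; the callback signature mirrors | |||
| //! default_parse_info() in parse_info/default_parse.h. | |||
| //! | |||
| //!     bool my_parse_info(const void* info, size_t length, | |||
| //!                        const std::string& model_name, lite::Config& config, | |||
| //!                        lite::NetworkIO& io, | |||
| //!                        std::unordered_map<std::string, lite::LiteAny>& cfg_map, | |||
| //!                        std::string& extra_info) { | |||
| //!         /* interpret info/length and fill config, io and cfg_map */ | |||
| //!         return true; | |||
| //!     } | |||
| //!     REGIST_PARSE_INFO_FUNCTION("my_parse_method", my_parse_info) | |||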
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,134 @@ | |||
| /** | |||
| * \file src/model_parser.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "model_parser.h" | |||
| #include "decryption/decrypt_base.h" | |||
| #include "parse_info/parse_info_base.h" | |||
| using namespace lite; | |||
| using namespace model_parse; | |||
| std::string ModelParser::sm_model_tag = "packed_model"; | |||
| void ModelParser::parse_header() { | |||
| size_t tag_length = sm_model_tag.size(); | |||
| //! parse model tag | |||
| const char* ptr = static_cast<char*>(m_model.get()); | |||
| std::string tag(static_cast<const char*>(ptr), tag_length); | |||
| if (sm_model_tag == tag) { | |||
| m_is_bare_model = false; | |||
| } else { | |||
| //! if no tag, the model is bare model, return | |||
| m_is_bare_model = true; | |||
| return; | |||
| } | |||
| uint8_t* buffer = static_cast<uint8_t*>(m_model.get()) + tag_length; | |||
| auto packed_model = GetPackModel(buffer); | |||
| auto models = packed_model->models(); | |||
| LITE_ASSERT(models->size() == 1, "Now only support one model"); | |||
| auto model = models->Get(0); | |||
| m_model_name = model->header()->name()->c_str(); | |||
| m_model_decryption_name = | |||
| model->header()->model_decryption_method()->c_str(); | |||
| m_info_decryption_name = model->header()->info_decryption_method()->c_str(); | |||
| m_info_parse_func_name = model->header()->info_parse_method()->c_str(); | |||
| m_info = model->info(); | |||
| m_model_data = model->data(); | |||
| } | |||
| bool ModelParser::parse_model_info( | |||
| Config& network_config, NetworkIO& network_io, | |||
| std::unordered_map<std::string, LiteAny>& isolated_config_map, | |||
| std::string& extra_info) const { | |||
| //! no model info, no parse, direct return | |||
| if (m_is_bare_model || !m_info) { | |||
| return false; | |||
| } | |||
| size_t info_length = m_info->data()->size(); | |||
| const uint8_t* info_data = m_info->data()->Data(); | |||
| //! decrypt the info | |||
| auto info_ptr = decrypt_memory(info_data, info_length, | |||
| m_info_decryption_name, info_length); | |||
| //! parse the info | |||
| LITE_LOCK_GUARD(parse_info_static_data().map_mutex); | |||
| auto it_parse = parse_info_static_data().parse_info_methods.find( | |||
| m_info_parse_func_name); | |||
| if (it_parse == parse_info_static_data().parse_info_methods.end()) { | |||
| LITE_THROW(ssprintf("can't find model info parse function %s.", | |||
| m_info_parse_func_name.c_str())); | |||
| } | |||
| auto model_info_parse_func = | |||
| parse_info_static_data().parse_info_methods[m_info_parse_func_name]; | |||
| //! call the registered parse function to fill the config, network_io and | |||
| //! the separate config map | |||
| if (model_info_parse_func) { | |||
| model_info_parse_func(info_ptr.get(), info_length, m_model_name, | |||
| network_config, network_io, isolated_config_map, | |||
| extra_info); | |||
| } else { | |||
| LITE_THROW(ssprintf("model info parse function of %s is empty", | |||
| m_info_parse_func_name.c_str())); | |||
| } | |||
| return true; | |||
| } | |||
| std::shared_ptr<void> ModelParser::parse_model(size_t& model_length, | |||
| const Config& config) const { | |||
| if (m_is_bare_model) { | |||
| if (config.bare_model_cryption_name.size() == 0) { | |||
| model_length = m_total_length; | |||
| return m_model; | |||
| } else { | |||
| return decrypt_memory( | |||
| static_cast<uint8_t*>(m_model.get()), m_total_length, | |||
| config.bare_model_cryption_name, model_length); | |||
| } | |||
| } | |||
| LITE_ASSERT(m_model_data, "packed model parse error!"); | |||
| model_length = m_model_data->data()->size(); | |||
| const uint8_t* model_data = m_model_data->data()->Data(); | |||
| LITE_ASSERT(model_length > 0, "The loaded model is of zero length."); | |||
| return decrypt_memory(model_data, model_length, m_model_decryption_name, | |||
| model_length); | |||
| } | |||
| std::shared_ptr<void> ModelParser::decrypt_memory( | |||
| const uint8_t* data, size_t length, const std::string decryption_name, | |||
| size_t& result_length) const { | |||
| const uint8_t* memory_ptr = data; | |||
| if (decryption_name == "NONE") { | |||
| result_length = length; | |||
| return std::shared_ptr<void>(const_cast<uint8_t*>(memory_ptr), | |||
| [](void*) {}); | |||
| } | |||
| LITE_LOCK_GUARD(decryption_static_data().map_mutex); | |||
| auto it = decryption_static_data().decryption_methods.find(decryption_name); | |||
| if (it == decryption_static_data().decryption_methods.end()) { | |||
| LITE_THROW(ssprintf("The decryption method %s is not registed yet.", | |||
| decryption_name.c_str())); | |||
| } | |||
| auto&& func = it->second.first; | |||
| auto&& key = it->second.second; | |||
| if (func) { | |||
| auto model_vector = func(memory_ptr, length, *key); | |||
| result_length = model_vector.size(); | |||
| auto tmp_model_vector = | |||
| new std::vector<uint8_t>(std::move(model_vector)); | |||
| return std::shared_ptr<void>( | |||
| tmp_model_vector->data(), | |||
| [tmp_model_vector](void*) { delete tmp_model_vector; }); | |||
| } else { | |||
| LITE_THROW(ssprintf("No decryption function in %s method.", | |||
| decryption_name.c_str())); | |||
| } | |||
| } | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,75 @@ | |||
| /** | |||
| * \file src/model_parser.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include "lite/global.h" | |||
| #include "../network_impl_base.h" | |||
| #include "pack_model_generated.h" | |||
| #include <flatbuffers/flatbuffers.h> | |||
| #include <unordered_map> | |||
| namespace lite { | |||
| /*! | |||
| * \brief parse and decrypt the model (see the usage sketch after the class) | |||
| */ | |||
| class ModelParser { | |||
| public: | |||
| ModelParser(std::shared_ptr<void> model_ptr, size_t model_length) | |||
| : m_model(model_ptr), m_total_length(model_length) { | |||
| //! parse the header | |||
| parse_header(); | |||
| } | |||
| //! parse the Info part of the model, update the network_config and | |||
| //! network_io | |||
| bool parse_model_info( | |||
| Config& network_config, NetworkIO& network_io, | |||
| std::unordered_map<std::string, LiteAny>& isolated_config_map, | |||
| std::string& extra_info) const; | |||
| //! parse the model and decrypt the model | |||
| std::shared_ptr<void> parse_model(size_t& model_length, | |||
| const Config& config) const; | |||
| private: | |||
| //! parse the header of the model and store the model-related information | |||
| //! in the member data | |||
| void parse_header(); | |||
| //! decrypt a block of memory of the given length with the decryption | |||
| //! method named decryption_name | |||
| std::shared_ptr<void> decrypt_memory(const uint8_t* data, size_t length, | |||
| const std::string decryption_name, | |||
| size_t& result_length) const; | |||
| private: | |||
| std::string m_model_name; | |||
| //! the decryption method names of the info and the model; the | |||
| //! decryption functions are looked up by these names | |||
| std::string m_info_decryption_name; | |||
| std::string m_model_decryption_name; | |||
| //! the function name to parse the model info | |||
| std::string m_info_parse_func_name; | |||
| //! if no json info is packed with the model and the model is not | |||
| //! encrypted, the model is a bare model | |||
| bool m_is_bare_model = true; | |||
| const model_parse::ModelInfo* m_info = nullptr; | |||
| const model_parse::ModelData* m_model_data = nullptr; | |||
| std::shared_ptr<void> m_model; | |||
| size_t m_total_length; | |||
| static std::string sm_model_tag; | |||
| }; | |||
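| /*! | |||
| * Usage sketch (hypothetical call site, kept as a comment): a loader would | |||
| * typically construct a ModelParser on the raw model buffer, parse the packed | |||
| * info first and then obtain the decrypted model blob. | |||
| * | |||
| *     std::shared_ptr<void> parse_packed(std::shared_ptr<void> buffer, | |||
| *                                        size_t size, Config& config, | |||
| *                                        NetworkIO& io) { | |||
| *         ModelParser parser(buffer, size); | |||
| *         std::unordered_map<std::string, LiteAny> isolated_config_map; | |||
| *         std::string extra_info; | |||
| *         parser.parse_model_info(config, io, isolated_config_map, extra_info); | |||
| *         size_t model_length = 0; | |||
| *         return parser.parse_model(model_length, config); | |||
| *     } | |||
| */ | |||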
| } // namespace lite | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,28 @@ | |||
| namespace model_parse; | |||
| table ModelHeader { | |||
| name:string; | |||
| info_decryption_method:string; | |||
| info_parse_method:string; | |||
| model_decryption_method:string; | |||
| } | |||
| table ModelInfo { | |||
| data:[ubyte]; | |||
| } | |||
| table ModelData { | |||
| data:[ubyte]; | |||
| } | |||
| table Model { | |||
| header:ModelHeader; | |||
| info:ModelInfo; | |||
| data:ModelData; | |||
| } | |||
| table PackModel { | |||
| models:[Model]; | |||
| } | |||
| root_type PackModel; | |||
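| // Illustrative sketch (hypothetical, not part of the schema): packing a single | |||
| // model in C++ with the standard flatc-generated helpers for the tables above. | |||
| // The model name, the "my_json_parser" info parse method and the byte vectors | |||
| // are placeholders; "NONE" means no decryption. | |||
| // | |||
| //     flatbuffers::FlatBufferBuilder fbb; | |||
| //     auto header = model_parse::CreateModelHeaderDirect( | |||
| //             fbb, "shufflenet", "NONE", "my_json_parser", "NONE"); | |||
| //     std::vector<uint8_t> info_bytes, model_bytes;  // filled by the packer | |||
| //     auto info = model_parse::CreateModelInfoDirect(fbb, &info_bytes); | |||
| //     auto data = model_parse::CreateModelDataDirect(fbb, &model_bytes); | |||
| //     std::vector<flatbuffers::Offset<model_parse::Model>> models; | |||
| //     models.push_back(model_parse::CreateModel(fbb, header, info, data)); | |||
| //     auto pack = model_parse::CreatePackModelDirect(fbb, &models); | |||
| //     model_parse::FinishPackModelBuffer(fbb, pack); | |||
| //     // fbb.GetBufferPointer() / fbb.GetSize() now hold the packed model. | |||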
| @@ -0,0 +1,339 @@ | |||
| /** | |||
| * \file src/tensor.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "lite/tensor.h" | |||
| #include "function_base.h" | |||
| #include "tensor_impl_base.h" | |||
| #if LITE_BUILD_WITH_MGE | |||
| #include "megbrain/comp_node.h" | |||
| #include "megbrain/tensor.h" | |||
| #include "mge/function_dft.h" | |||
| #include "mge/tensor_impl.h" | |||
| #endif | |||
| #include <memory> | |||
| using namespace lite; | |||
| size_t Layout::get_elem_size() const { | |||
| size_t elesize = 1; | |||
| switch (data_type) { | |||
| case LiteDataType::LITE_INT64: | |||
| elesize = 8; | |||
| break; | |||
| case LiteDataType::LITE_FLOAT: | |||
| case LiteDataType::LITE_INT: | |||
| case LiteDataType::LITE_UINT: | |||
| elesize = 4; | |||
| break; | |||
| case LiteDataType::LITE_HALF: | |||
| case LiteDataType::LITE_INT16: | |||
| case LiteDataType::LITE_UINT16: | |||
| elesize = 2; | |||
| break; | |||
| case LiteDataType::LITE_INT8: | |||
| case LiteDataType::LITE_UINT8: | |||
| elesize = 1; | |||
| break; | |||
| default: | |||
| LITE_THROW("not support data type."); | |||
| } | |||
| return elesize; | |||
| } | |||
| bool Layout::operator==(const Layout& other) const { | |||
| bool equal = true; | |||
| equal &= (ndim == other.ndim); | |||
| equal &= (data_type == other.data_type); | |||
| for (size_t i = 0; i < ndim; i++) { | |||
| equal &= (shapes[i] == other.shapes[i]); | |||
| } | |||
| return equal; | |||
| } | |||
| Tensor::~Tensor() = default; | |||
| Tensor::Tensor() { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| m_tensor_impl = call_func<TensorImplDft, | |||
| std::shared_ptr<lite::Tensor::TensorImplBase>>( | |||
| "create_tensor"); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| Tensor::Tensor(LiteDeviceType device_type, bool is_pinned_host) | |||
| : m_is_pinned_host(is_pinned_host), m_device_type(device_type) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| m_tensor_impl = call_func<TensorImplDft, | |||
| std::shared_ptr<lite::Tensor::TensorImplBase>>( | |||
| "create_tensor", device_type, is_pinned_host); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| Tensor::Tensor(LiteDeviceType device_type, const Layout& layout, | |||
| bool is_pinned_host) | |||
| : m_is_pinned_host(is_pinned_host), | |||
| m_layout(layout), | |||
| m_device_type(device_type) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| m_tensor_impl = call_func<TensorImplDft, | |||
| std::shared_ptr<lite::Tensor::TensorImplBase>>( | |||
| "create_tensor", device_type, layout, is_pinned_host); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| Tensor::Tensor(int device_id, LiteDeviceType device_type, const Layout& layout, | |||
| bool is_pinned_host) | |||
| : m_is_pinned_host(is_pinned_host), | |||
| m_device_id(device_id), | |||
| m_layout(layout), | |||
| m_device_type(device_type) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| m_tensor_impl = call_func<TensorImplDft, | |||
| std::shared_ptr<lite::Tensor::TensorImplBase>>( | |||
| "create_tensor", device_id, device_type, layout, is_pinned_host); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| Tensor::Tensor(int device_id, int stream_id, LiteDeviceType device_type, | |||
| bool is_pinned_host) | |||
| : m_is_pinned_host(is_pinned_host), | |||
| m_device_id(device_id), | |||
| m_device_type(device_type) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| m_tensor_impl = call_func<TensorImplDft, | |||
| std::shared_ptr<lite::Tensor::TensorImplBase>>( | |||
| "create_tensor", device_id, stream_id, device_type, is_pinned_host); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| Tensor::Tensor(LiteBackend backend, LiteDeviceType device_type, int device_id, | |||
| const Layout& layout, bool is_pinned_host) { | |||
| if (backend == LiteBackend::LITE_DEFAULT) { | |||
| m_tensor_impl = | |||
| call_func<TensorImplDft, | |||
| std::shared_ptr<lite::Tensor::TensorImplBase>>( | |||
| "create_tensor", device_id, device_type, layout, | |||
| is_pinned_host); | |||
| } else { | |||
| LITE_MARK_USED_VAR(device_type); | |||
| LITE_MARK_USED_VAR(is_pinned_host); | |||
| LITE_MARK_USED_VAR(layout); | |||
| LITE_MARK_USED_VAR(device_id); | |||
| LITE_THROW("unknow backend, enum id is : %d."); | |||
| } | |||
| } | |||
| void Tensor::reshape(const std::vector<int>& shape) { | |||
| LITE_ASSERT(m_layout.ndim > 0, "The tensor to be reshaped is empty."); | |||
| uint32_t length = shape.size(); | |||
| LITE_ASSERT(length < Layout::MAXDIM, | |||
| "The ndim of reshape input is too large."); | |||
| Layout new_layout = m_layout; | |||
| new_layout.ndim = length; | |||
| size_t total_length = | |||
| get_tensor_total_size_in_byte() / m_layout.get_elem_size(); | |||
| uint32_t unfixed_number = 0; | |||
| uint32_t unfixed_index = 0; | |||
| for (uint32_t i = 0; i < length; i++) { | |||
| if (shape[i] == -1) { | |||
| unfixed_number += 1; | |||
| unfixed_index = i; | |||
| } else { | |||
| LITE_ASSERT(shape[i] > 0, "The reshape inputs invalid."); | |||
| new_layout.shapes[i] = shape[i]; | |||
| } | |||
| } | |||
| LITE_ASSERT(unfixed_number <= 1, "At most one reshape dimension can be -1."); | |||
| if (unfixed_number) { | |||
| size_t left = total_length; | |||
| for (uint32_t i = 0; i < length; i++) { | |||
| if (i == unfixed_index) { | |||
| continue; | |||
| } else { | |||
| LITE_ASSERT(left > 0 && (left % new_layout.shapes[i] == 0), | |||
| "The reshape inputs invalid."); | |||
| left = left / new_layout.shapes[i]; | |||
| } | |||
| } | |||
| LITE_ASSERT(left > 0, "The reshape inputs invalid."); | |||
| new_layout.shapes[unfixed_index] = left; | |||
| } | |||
| size_t new_total = 1; | |||
| for (uint32_t i = 0; i < length; i++) { | |||
| new_total *= new_layout.shapes[i]; | |||
| } | |||
| LITE_ASSERT(new_total == total_length, "The reshape must keep the total number of elements unchanged."); | |||
| m_layout = new_layout; | |||
| m_tensor_impl->reshape(m_layout); | |||
| } | |||
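| //! Usage sketch for reshape() (hypothetical call site): a 1x3x224x224 float | |||
| //! tensor holds 1 * 3 * 224 * 224 = 150528 elements, so passing -1 for the | |||
| //! second dimension lets reshape() infer it from the total element count. | |||
| //! | |||
| //!     Layout layout; | |||
| //!     layout.ndim = 4; | |||
| //!     layout.shapes[0] = 1; | |||
| //!     layout.shapes[1] = 3; | |||
| //!     layout.shapes[2] = 224; | |||
| //!     layout.shapes[3] = 224; | |||
| //!     layout.data_type = LiteDataType::LITE_FLOAT; | |||
| //!     Tensor tensor(LiteDeviceType::LITE_CPU, layout); | |||
| //!     tensor.reshape({1, -1});  // layout becomes {1, 150528} | |||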
| size_t Tensor::get_tensor_total_size_in_byte() const { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| size_t elemsize = m_layout.get_elem_size(); | |||
| size_t total = m_layout.ndim == 0 ? 0 : 1; | |||
| for (size_t i = 0; i < m_layout.ndim; i++) { | |||
| total *= m_layout.shapes[i]; | |||
| } | |||
| return total * elemsize; | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| void* Tensor::get_memory_ptr() const { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| LITE_ASSERT(m_layout.ndim != 0, | |||
| "Tensor layout is not valid when get memory ptr."); | |||
| return m_tensor_impl->get_memory_ptr(); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| void* Tensor::get_memory_ptr(const std::vector<size_t>& idx) const { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| return m_tensor_impl->get_memory_ptr(idx); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| std::shared_ptr<Tensor> Tensor::slice(const std::vector<size_t>& start, | |||
| const std::vector<size_t>& end, | |||
| const std::vector<size_t>& step) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| auto ret = m_tensor_impl->slice(start, end, step); | |||
| ret->update_from_implement(); | |||
| return ret; | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| void Tensor::fill_zero() { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| LITE_ASSERT(m_layout.ndim > 0, | |||
| "fill_zero can't apply on a tensor with empty layout."); | |||
| m_tensor_impl->fill_zero(); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| void Tensor::share_memory_with(const Tensor& src_tensor) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| LITE_ASSERT(src_tensor.m_layout.ndim > 0, | |||
| "To be shared tensor with empty layout."); | |||
| m_tensor_impl->share_memory_with(src_tensor.m_tensor_impl.get()); | |||
| update_from_implement(); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| void Tensor::set_layout(const Layout& layout) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| m_layout = layout; | |||
| m_tensor_impl->set_layout(layout); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| void Tensor::reset(void* prepared_data, size_t data_length_in_byte) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| LITE_ASSERT(m_layout.ndim, | |||
| "Tensor layout is empty, please reset with layout"); | |||
| LITE_ASSERT(data_length_in_byte >= get_tensor_total_size_in_byte(), | |||
| "the memory reset to the tensor is too small."); | |||
| m_tensor_impl->reset(prepared_data); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| void Tensor::reset(void* prepared_data, const Layout& layout) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| m_layout = layout; | |||
| m_tensor_impl->reset(prepared_data, layout); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| bool Tensor::is_continue_memory() const { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| return m_tensor_impl->is_continue_memory(); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| void Tensor::copy_from(const Tensor& src) { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| LITE_ASSERT(src.get_layout().ndim != 0, | |||
| "when tensor copy, the src tensor layout is empty."); | |||
| m_tensor_impl->copy_from(src.m_tensor_impl.get()); | |||
| update_from_implement(); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| void Tensor::update_from_implement() { | |||
| LITE_ERROR_HANDLER_BEGIN | |||
| m_layout = m_tensor_impl->get_layout(); | |||
| m_device_type = m_tensor_impl->get_device_type(); | |||
| m_device_id = m_tensor_impl->get_device_id(); | |||
| m_is_pinned_host = m_tensor_impl->is_pinned_host(); | |||
| LITE_ERROR_HANDLER_END | |||
| } | |||
| void LiteAny::type_missmatch(size_t expect, size_t get) const { | |||
| LITE_THROW(ssprintf( | |||
| "The type store in LiteAny is not match the visit type, type of " | |||
| "storage length is %zu, type of visit length is %zu.", | |||
| expect, get)); | |||
| } | |||
| std::shared_ptr<Tensor> TensorUtils::concat(const std::vector<Tensor>& tensors, | |||
| int dim, LiteDeviceType dst_device, | |||
| int dst_device_id) { | |||
| if (tensors.size() <= 0) { | |||
| return std::make_shared<Tensor>(); | |||
| } | |||
| if (dst_device == LiteDeviceType::LITE_DEVICE_DEFAULT) { | |||
| dst_device = tensors.front().get_device_type(); | |||
| } | |||
| if (dst_device_id == -1) { | |||
| dst_device_id = tensors.front().get_device_id(); | |||
| } | |||
| bool is_pinned_host = tensors.front().is_pinned_host(); | |||
| auto layout = tensors.front().get_layout(); | |||
| LITE_ASSERT(static_cast<int>(layout.ndim) > dim, | |||
| "the dim in concat is error."); | |||
| size_t sum_in_dim = layout.shapes[dim]; | |||
| for (size_t i = 1; i < tensors.size(); ++i) { | |||
| auto other_layout = tensors[i].get_layout(); | |||
| LITE_ASSERT(other_layout.ndim == layout.ndim, | |||
| "the dim size of tensors is not same!"); | |||
| LITE_ASSERT(other_layout.data_type == layout.data_type, | |||
| "the dtype of tensors is not same!"); | |||
| for (size_t j = 0; j < other_layout.ndim; ++j) { | |||
| if (dim == static_cast<int>(j)) { | |||
| sum_in_dim += other_layout.shapes[j]; | |||
| continue; | |||
| } | |||
| LITE_ASSERT(other_layout.shapes[j] == layout.shapes[j], | |||
| "the shape of tensors is not same!"); | |||
| } | |||
| } | |||
| layout.shapes[dim] = sum_in_dim; | |||
| auto result = std::make_shared<Tensor>(dst_device_id, dst_device, layout, | |||
| is_pinned_host); | |||
| size_t index = 0; | |||
| std::vector<size_t> start(dim + 1, 0); | |||
| std::vector<size_t> end(dim + 1, 0); | |||
| for (int i = 0; i < dim; i++) { | |||
| end[i] = layout.shapes[i]; | |||
| } | |||
| for (size_t i = 0; i < tensors.size(); ++i) { | |||
| auto&& tensor = tensors[i]; | |||
| auto layout = tensor.get_layout(); | |||
| if (layout.shapes[dim] == 0) | |||
| continue; | |||
| start[dim] = index; | |||
| end[dim] = index + layout.shapes[dim]; | |||
| auto&& sub_dst = result->slice(start, end); | |||
| sub_dst->copy_from(tensor); | |||
| index += layout.shapes[dim]; | |||
| } | |||
| return result; | |||
| } | |||
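| //! Usage sketch for TensorUtils::concat() (hypothetical call site): two CPU | |||
| //! tensors of shape {1, 3} concatenated along dim 0 give a {2, 3} result. | |||
| //! | |||
| //!     Layout layout; | |||
| //!     layout.ndim = 2; | |||
| //!     layout.shapes[0] = 1; | |||
| //!     layout.shapes[1] = 3; | |||
| //!     layout.data_type = LiteDataType::LITE_FLOAT; | |||
| //!     std::vector<Tensor> tensors; | |||
| //!     tensors.emplace_back(LiteDeviceType::LITE_CPU, layout); | |||
| //!     tensors.emplace_back(LiteDeviceType::LITE_CPU, layout); | |||
| //!     tensors[0].fill_zero(); | |||
| //!     tensors[1].fill_zero(); | |||
| //!     auto out = TensorUtils::concat(tensors, 0, LiteDeviceType::LITE_CPU, -1); | |||
| //!     // out->get_layout().shapes is now {2, 3} | |||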
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,101 @@ | |||
| /** | |||
| * \file src/tensor_impl_base.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include "lite/tensor.h" | |||
| #include "misc.h" | |||
| #include "type_info.h" | |||
| #include <unordered_map> | |||
| namespace lite { | |||
| /*! | |||
| * \brief implement the Tensor | |||
| */ | |||
| class Tensor::TensorImplBase : public DynTypeObj { | |||
| public: | |||
| virtual ~TensorImplBase() = default; | |||
| virtual LiteDeviceType get_device_type() const = 0; | |||
| virtual int get_device_id() const = 0; | |||
| virtual LiteBackend get_backend_type() const = 0; | |||
| virtual Layout get_layout() const = 0; | |||
| virtual bool is_pinned_host() const = 0; | |||
| virtual void* get_memory_ptr() const = 0; | |||
| virtual void* get_memory_ptr(const std::vector<size_t>& idx) const = 0; | |||
| virtual void set_layout(const Layout& layout) = 0; | |||
| //! use user-allocated data to reset the memory of the tensor; the memory | |||
| //! will not be managed by lite, so the user should free it later | |||
| virtual void reset(void* prepared_data) = 0; | |||
| //! use user-allocated data and the corresponding layout to reset the data | |||
| //! and layout of the tensor; the memory will not be managed by lite, so | |||
| //! the user should free it later | |||
| virtual void reset(void* prepared_data, const Layout& layout) = 0; | |||
| //! reshape the tensor with new shape, keep the data_type the same | |||
| virtual void reshape(const Layout& layout) = 0; | |||
| //! get a new tensor slice from the original tensor | |||
| virtual std::shared_ptr<Tensor> slice( | |||
| const std::vector<size_t>& start, const std::vector<size_t>& end, | |||
| const std::vector<size_t>& step = {}) = 0; | |||
| //! set the tensor memory with zero | |||
| virtual void fill_zero() = 0; | |||
| //! copy tensor from another tensor | |||
| //! Note: the best way to copy a tensor is to set only the dst device and | |||
| //! leave the layout empty; the dst layout will be set to match the src | |||
| //! during the copy (see the sketch after this class) | |||
| virtual void copy_from(const TensorImplBase* src_impl) = 0; | |||
| //! share memory with other tensor | |||
| virtual void share_memory_with(const TensorImplBase* src_impl) = 0; | |||
| //! whether the memory of the tensor is contiguous | |||
| virtual bool is_continue_memory() const = 0; | |||
| }; | |||
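| /*! | |||
| * Usage sketch for the copy_from() note above (hypothetical call site): the | |||
| * destination tensor is created with only a device type, its layout is left | |||
| * empty and is filled in from the source during the copy. | |||
| * | |||
| *     Tensor dst(LiteDeviceType::LITE_CPU);  // no layout given | |||
| *     dst.copy_from(src);  // src: any tensor with a valid layout; dst now | |||
| *                          // has the same layout and holds a copy of the data | |||
| */ | |||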
| /*! | |||
| * \brief friend class of Tensor, for conveniently accessing the private Tensor members | |||
| */ | |||
| class TensorHelper { | |||
| public: | |||
| static inline std::shared_ptr<Tensor::TensorImplBase> implement( | |||
| const std::shared_ptr<Tensor> tensor) { | |||
| LITE_ASSERT(tensor); | |||
| return tensor->m_tensor_impl; | |||
| } | |||
| static inline std::shared_ptr<Tensor::TensorImplBase> implement( | |||
| const Tensor* tensor) { | |||
| LITE_ASSERT(tensor); | |||
| return tensor->m_tensor_impl; | |||
| } | |||
| static inline void implement(const std::shared_ptr<Tensor> tensor, | |||
| std::shared_ptr<Tensor::TensorImplBase> impl) { | |||
| LITE_ASSERT(tensor); | |||
| tensor->m_tensor_impl = impl; | |||
| } | |||
| }; | |||
| } // namespace lite | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,97 @@ | |||
| /** | |||
| * \file src/type_info.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include "misc.h" | |||
| namespace lite { | |||
| /*! | |||
| * \brief an object to represent a type | |||
| * | |||
| * LITE has a lightweight RTTI system. Each type is represented by the | |||
| * address of a Typeinfo object, which is stored in the .bss segment. | |||
| * | |||
| * LITE_TYPEINFO_OBJ_DECL should be placed into the definition of classes that | |||
| * need compile-time type support. | |||
| * | |||
| * For classes that need RTTI, they should be derived from DynTypeObj | |||
| */ | |||
| struct Typeinfo { | |||
| //! name of the corresponding type; nullptr if LITE_ENABLE_LOGGING==0 | |||
| const char* const name; | |||
| /*! | |||
| * \brief whether this is the type of given object | |||
| * \tparam T a class with static typeinfo() method | |||
| */ | |||
| template <typename T> | |||
| bool is() const { | |||
| return T::typeinfo() == this; | |||
| } | |||
| }; | |||
| /*! | |||
| * \brief base class to emulate RTTI without compiler support | |||
| */ | |||
| class DynTypeObj { | |||
| public: | |||
| virtual Typeinfo* dyn_typeinfo() const = 0; | |||
| //! cast this to a final object with type check | |||
| template <class T> | |||
| T& cast_final_safe() { | |||
| LITE_ASSERT(T::typeinfo() == dyn_typeinfo(), | |||
| "can not convert type %s to %s", dyn_typeinfo()->name, | |||
| T::typeinfo()->name); | |||
| return *static_cast<T*>(this); | |||
| } | |||
| template <class T> | |||
| const T& cast_final_safe() const { | |||
| return const_cast<DynTypeObj*>(this)->cast_final_safe<T>(); | |||
| } | |||
| //! check whether this is of the same type as the given type | |||
| template <class T> | |||
| bool same_type() const { | |||
| return dyn_typeinfo() == T::typeinfo(); | |||
| } | |||
| protected: | |||
| ~DynTypeObj() = default; | |||
| }; | |||
| //! put in the declaration of a final class inherited from DynTypeObj | |||
| #define LITE_DYN_TYPE_OBJ_FINAL_DECL \ | |||
| public: \ | |||
| ::lite::Typeinfo* dyn_typeinfo() const override final; \ | |||
| static inline ::lite::Typeinfo* typeinfo() { return &sm_typeinfo; } \ | |||
| \ | |||
| private: \ | |||
| static ::lite::Typeinfo sm_typeinfo | |||
| #if LITE_ENABLE_LOGGING | |||
| //! get class name from class object | |||
| #define _LITE_TYPEINFO_CLASS_NAME(_cls) #_cls | |||
| #else | |||
| #define _LITE_TYPEINFO_CLASS_NAME(_cls) nullptr | |||
| #endif | |||
| //! put in the impl file of a class that needs static typeinfo() | |||
| #define LITE_TYPEINFO_OBJ_IMPL(_cls) \ | |||
| ::lite::Typeinfo _cls::sm_typeinfo { _LITE_TYPEINFO_CLASS_NAME(_cls) } | |||
| //! put in the impl file of a final class inherited from DynTypeObj | |||
| #define LITE_DYN_TYPE_OBJ_FINAL_IMPL(_cls) \ | |||
| ::lite::Typeinfo* _cls::dyn_typeinfo() const { return &sm_typeinfo; } \ | |||
| LITE_TYPEINFO_OBJ_IMPL(_cls) | |||
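| /*! | |||
| * Usage sketch (hypothetical class, kept as a comment): a final class derived | |||
| * from DynTypeObj uses the two macros above, one in its declaration and one in | |||
| * its .cpp file, after which same_type() and cast_final_safe() work on it. | |||
| * | |||
| *     // in the header | |||
| *     class MyImpl final : public DynTypeObj { | |||
| *         LITE_DYN_TYPE_OBJ_FINAL_DECL; | |||
| *     public: | |||
| *         int value = 0; | |||
| *     }; | |||
| * | |||
| *     // in the .cpp file | |||
| *     LITE_DYN_TYPE_OBJ_FINAL_IMPL(MyImpl); | |||
| * | |||
| *     // at a call site that holds a DynTypeObj* obj pointing to a MyImpl | |||
| *     if (obj->same_type<MyImpl>()) | |||
| *         obj->cast_final_safe<MyImpl>().value = 1; | |||
| */ | |||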
| } // namespace lite | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,10 @@ | |||
| { | |||
| global: | |||
| extern "C++" {lite::*;}; | |||
| Lite*; | |||
| LITE*; | |||
| default_config; | |||
| default_network_io; | |||
| local: *; | |||
| }; | |||
| @@ -0,0 +1,23 @@ | |||
| if (MGE_WITH_TEST) | |||
| file (GLOB_RECURSE SOURCES ./*.cpp main.cpp) | |||
| add_executable (lite_test ${SOURCES}) | |||
| target_link_libraries(lite_test gtest) | |||
| target_link_libraries(lite_test lite_static) | |||
| if(LITE_BUILD_WITH_MGE) | |||
| # lite_test will depend on the megbrain interface | |||
| target_link_libraries(lite_test megbrain) | |||
| endif() | |||
| if(UNIX) | |||
| if(APPLE OR ANDROID) | |||
| target_link_libraries(lite_test dl) | |||
| else() | |||
| target_link_libraries(lite_test dl rt) | |||
| endif() | |||
| endif() | |||
| install (TARGETS lite_test | |||
| EXPORT ${LITE_EXPORT_TARGETS} | |||
| RUNTIME DESTINATION lite/bin) | |||
| endif() | |||
| @@ -0,0 +1,33 @@ | |||
| /** | |||
| * \file test/main.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include <gtest/gtest.h> | |||
| #include "../src/misc.h" | |||
| #include "lite/global.h" | |||
| namespace { | |||
| class ResetSeedListener : public ::testing::EmptyTestEventListener { | |||
| void OnTestStart(const ::testing::TestInfo&) override {} | |||
| }; | |||
| } // namespace | |||
| int main(int argc, char** argv) { | |||
| ResetSeedListener listener; | |||
| auto&& listeners = ::testing::UnitTest::GetInstance()->listeners(); | |||
| ::testing::InitGoogleTest(&argc, argv); | |||
| listeners.Append(&listener); | |||
| lite::set_log_level(LiteLogLevel::WARN); | |||
| auto ret = RUN_ALL_TESTS(); | |||
| listeners.Release(&listener); | |||
| return ret; | |||
| } | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,638 @@ | |||
| /* | |||
| Copyright 2017 Leon Merten Lohse | |||
| Permission is hereby granted, free of charge, to any person obtaining a copy | |||
| of this software and associated documentation files (the "Software"), to deal | |||
| in the Software without restriction, including without limitation the rights | |||
| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
| copies of the Software, and to permit persons to whom the Software is | |||
| furnished to do so, subject to the following conditions: | |||
| The above copyright notice and this permission notice shall be included in | |||
| all copies or substantial portions of the Software. | |||
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
| SOFTWARE. | |||
| */ | |||
| /* | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #ifndef NPY_H | |||
| #define NPY_H | |||
| #include <algorithm> | |||
| #include <complex> | |||
| #include <cstdint> | |||
| #include <cstring> | |||
| #include <fstream> | |||
| #include <iostream> | |||
| #include <regex> | |||
| #include <sstream> | |||
| #include <stdexcept> | |||
| #include <string> | |||
| #include <unordered_map> | |||
| #include <vector> | |||
| namespace npy { | |||
| /* Compile-time test for byte order. | |||
| If your compiler does not define these by default, you may want to define | |||
| one of these constants manually. | |||
| Defaults to little endian order. */ | |||
| #if defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN || \ | |||
| defined(__BIG_ENDIAN__) || defined(__ARMEB__) || \ | |||
| defined(__THUMBEB__) || defined(__AARCH64EB__) || defined(_MIBSEB) || \ | |||
| defined(__MIBSEB) || defined(__MIBSEB__) | |||
| const bool big_endian = true; | |||
| #else | |||
| const bool big_endian = false; | |||
| #endif | |||
| const char magic_string[] = "\x93NUMPY"; | |||
| const size_t magic_string_length = 6; | |||
| const char little_endian_char = '<'; | |||
| const char big_endian_char = '>'; | |||
| const char no_endian_char = '|'; | |||
| constexpr char host_endian_char = | |||
| (big_endian ? big_endian_char : little_endian_char); | |||
| /* npy array length */ | |||
| typedef unsigned long int ndarray_len_t; | |||
| inline void write_magic(std::ostream& ostream, unsigned char v_major = 1, | |||
| unsigned char v_minor = 0) { | |||
| ostream.write(magic_string, magic_string_length); | |||
| ostream.put(v_major); | |||
| ostream.put(v_minor); | |||
| } | |||
| inline void read_magic(std::istream& istream, unsigned char& v_major, | |||
| unsigned char& v_minor) { | |||
| char buf[magic_string_length + 2]; | |||
| istream.read(buf, magic_string_length + 2); | |||
| if (!istream) { | |||
| fprintf(stderr, "io error: failed reading file"); | |||
| } | |||
| if (0 != std::memcmp(buf, magic_string, magic_string_length)) { | |||
| fprintf(stderr, "this file does not have a valid npy format."); | |||
| } | |||
| v_major = buf[magic_string_length]; | |||
| v_minor = buf[magic_string_length + 1]; | |||
| } | |||
| // typestring magic | |||
| struct Typestring { | |||
| private: | |||
| char c_endian; | |||
| char c_type; | |||
| int len; | |||
| public: | |||
| inline std::string str() { | |||
| const size_t max_buflen = 16; | |||
| char buf[max_buflen]; | |||
| std::sprintf(buf, "%c%c%u", c_endian, c_type, len); | |||
| return std::string(buf); | |||
| } | |||
| Typestring(const std::vector<float>&) | |||
| : c_endian{host_endian_char}, c_type{'f'}, len{sizeof(float)} {} | |||
| Typestring(const std::vector<double>&) | |||
| : c_endian{host_endian_char}, c_type{'f'}, len{sizeof(double)} {} | |||
| Typestring(const std::vector<long double>&) | |||
| : c_endian{host_endian_char}, | |||
| c_type{'f'}, | |||
| len{sizeof(long double)} {} | |||
| Typestring(const std::vector<char>&) | |||
| : c_endian{no_endian_char}, c_type{'i'}, len{sizeof(char)} {} | |||
| Typestring(const std::vector<short>&) | |||
| : c_endian{host_endian_char}, c_type{'i'}, len{sizeof(short)} {} | |||
| Typestring(const std::vector<int>&) | |||
| : c_endian{host_endian_char}, c_type{'i'}, len{sizeof(int)} {} | |||
| Typestring(const std::vector<long>&) | |||
| : c_endian{host_endian_char}, c_type{'i'}, len{sizeof(long)} {} | |||
| Typestring(const std::vector<long long>&) | |||
| : c_endian{host_endian_char}, c_type{'i'}, len{sizeof(long long)} {} | |||
| Typestring(const std::vector<unsigned char>&) | |||
| : c_endian{no_endian_char}, | |||
| c_type{'u'}, | |||
| len{sizeof(unsigned char)} {} | |||
| Typestring(const std::vector<unsigned short>&) | |||
| : c_endian{host_endian_char}, | |||
| c_type{'u'}, | |||
| len{sizeof(unsigned short)} {} | |||
| Typestring(const std::vector<unsigned int>&) | |||
| : c_endian{host_endian_char}, | |||
| c_type{'u'}, | |||
| len{sizeof(unsigned int)} {} | |||
| Typestring(const std::vector<unsigned long>&) | |||
| : c_endian{host_endian_char}, | |||
| c_type{'u'}, | |||
| len{sizeof(unsigned long)} {} | |||
| Typestring(const std::vector<unsigned long long>&) | |||
| : c_endian{host_endian_char}, | |||
| c_type{'u'}, | |||
| len{sizeof(unsigned long long)} {} | |||
| Typestring(const std::vector<std::complex<float>>&) | |||
| : c_endian{host_endian_char}, | |||
| c_type{'c'}, | |||
| len{sizeof(std::complex<float>)} {} | |||
| Typestring(const std::vector<std::complex<double>>&) | |||
| : c_endian{host_endian_char}, | |||
| c_type{'c'}, | |||
| len{sizeof(std::complex<double>)} {} | |||
| Typestring(const std::vector<std::complex<long double>>&) | |||
| : c_endian{host_endian_char}, | |||
| c_type{'c'}, | |||
| len{sizeof(std::complex<long double>)} {} | |||
| }; | |||
| inline void parse_typestring(std::string typestring) { | |||
| std::regex re("'([<>|])([ifuc])(\\d+)'"); | |||
| std::smatch sm; | |||
| std::regex_match(typestring, sm, re); | |||
| if (sm.size() != 4) { | |||
| fprintf(stderr, "invalid typestring"); | |||
| } | |||
| } | |||
| namespace pyparse { | |||
| /** | |||
| Removes leading and trailing whitespaces | |||
| */ | |||
| inline std::string trim(const std::string& str) { | |||
| const std::string whitespace = " \t"; | |||
| auto begin = str.find_first_not_of(whitespace); | |||
| if (begin == std::string::npos) | |||
| return ""; | |||
| auto end = str.find_last_not_of(whitespace); | |||
| return str.substr(begin, end - begin + 1); | |||
| } | |||
| inline std::string get_value_from_map(const std::string& mapstr) { | |||
| size_t sep_pos = mapstr.find_first_of(":"); | |||
| if (sep_pos == std::string::npos) | |||
| return ""; | |||
| std::string tmp = mapstr.substr(sep_pos + 1); | |||
| return trim(tmp); | |||
| } | |||
| /** | |||
| Parses the string representation of a Python dict | |||
| The keys need to be known and may not appear anywhere else in the data. | |||
| */ | |||
| inline std::unordered_map<std::string, std::string> parse_dict( | |||
| std::string in, std::vector<std::string>& keys) { | |||
| std::unordered_map<std::string, std::string> map; | |||
| if (keys.size() == 0) | |||
| return map; | |||
| in = trim(in); | |||
| // unwrap dictionary | |||
| if ((in.front() == '{') && (in.back() == '}')) | |||
| in = in.substr(1, in.length() - 2); | |||
| else { | |||
| fprintf(stderr, "Not a Python dictionary."); | |||
| } | |||
| std::vector<std::pair<size_t, std::string>> positions; | |||
| for (auto const& value : keys) { | |||
| size_t pos = in.find("'" + value + "'"); | |||
| if (pos == std::string::npos) { | |||
| fprintf(stderr, "Missing %s key.", value.c_str()); | |||
| } | |||
| std::pair<size_t, std::string> position_pair{pos, value}; | |||
| positions.push_back(position_pair); | |||
| } | |||
| // sort by position in dict | |||
| std::sort(positions.begin(), positions.end()); | |||
| for (size_t i = 0; i < positions.size(); ++i) { | |||
| std::string raw_value; | |||
| size_t begin{positions[i].first}; | |||
| size_t end{std::string::npos}; | |||
| std::string key = positions[i].second; | |||
| if (i + 1 < positions.size()) | |||
| end = positions[i + 1].first; | |||
| raw_value = in.substr(begin, end - begin); | |||
| raw_value = trim(raw_value); | |||
| if (raw_value.back() == ',') | |||
| raw_value.pop_back(); | |||
| map[key] = get_value_from_map(raw_value); | |||
| } | |||
| return map; | |||
| } | |||
| /** | |||
| Parses the string representation of a Python boolean | |||
| */ | |||
| inline bool parse_bool(const std::string& in) { | |||
| if (in == "True") | |||
| return true; | |||
| if (in == "False") | |||
| return false; | |||
| fprintf(stderr, "Invalid python boolan."); | |||
| return false; | |||
| } | |||
| /** | |||
| Parses the string representation of a Python str | |||
| */ | |||
| inline std::string parse_str(const std::string& in) { | |||
| if ((in.front() == '\'') && (in.back() == '\'')) | |||
| return in.substr(1, in.length() - 2); | |||
| fprintf(stderr, "Invalid python string."); | |||
| return ""; | |||
| } | |||
| /** | |||
| Parses the string representation of a Python tuple into a vector of its items | |||
| */ | |||
| inline std::vector<std::string> parse_tuple(std::string in) { | |||
| std::vector<std::string> v; | |||
| const char separator = ','; | |||
| in = trim(in); | |||
| if ((in.front() == '(') && (in.back() == ')')) | |||
| in = in.substr(1, in.length() - 2); | |||
| else { | |||
| fprintf(stderr, "Invalid Python tuple."); | |||
| } | |||
| std::istringstream iss(in); | |||
| for (std::string token; std::getline(iss, token, separator);) { | |||
| v.push_back(token); | |||
| } | |||
| return v; | |||
| } | |||
| template <typename T> | |||
| inline std::string write_tuple(const std::vector<T>& v) { | |||
| if (v.size() == 0) | |||
| return ""; | |||
| std::ostringstream ss; | |||
| if (v.size() == 1) { | |||
| ss << "(" << v.front() << ",)"; | |||
| } else { | |||
| const std::string delimiter = ", "; | |||
| // v.size() > 1 | |||
| ss << "("; | |||
| std::copy(v.begin(), v.end() - 1, | |||
| std::ostream_iterator<T>(ss, delimiter.c_str())); | |||
| ss << v.back(); | |||
| ss << ")"; | |||
| } | |||
| return ss.str(); | |||
| } | |||
| inline std::string write_boolean(bool b) { | |||
| if (b) | |||
| return "True"; | |||
| else | |||
| return "False"; | |||
| } | |||
| } // namespace pyparse | |||
| inline void parse_header(std::string header, std::string& descr) { | |||
| /* | |||
| The first 6 bytes are a magic string: exactly "\x93NUMPY". | |||
| The next 1 byte is an unsigned byte: the major version number of the file | |||
| format, e.g. \x01. The next 1 byte is an unsigned byte: the minor version | |||
| number of the file format, e.g. \x00. Note: the version of the file format | |||
| is not tied to the version of the numpy package. The next 2 bytes form a | |||
| little-endian unsigned short int: the length of the header data | |||
| HEADER_LEN. The next HEADER_LEN bytes form the header data describing the | |||
| array's format. It is an ASCII string which contains a Python literal | |||
| expression of a dictionary. It is terminated by a newline ('\n') and | |||
| padded with spaces | |||
| ('\x20') to make the total length of the magic string + 4 + HEADER_LEN be | |||
| evenly divisible by 16 for alignment purposes. The dictionary contains | |||
| three keys: | |||
| "descr" : dtype.descr | |||
| An object that can be passed as an argument to the numpy.dtype() | |||
| constructor to create the array's dtype. For repeatability and | |||
| readability, this dictionary is formatted using pprint.pformat() so the | |||
| keys are in alphabetic order. | |||
| */ | |||
| // remove trailing newline | |||
| if (header.back() != '\n') | |||
| fprintf(stderr, "invalid header"); | |||
| header.pop_back(); | |||
| // parse the dictionary | |||
| std::vector<std::string> keys{"descr"}; | |||
| auto dict_map = npy::pyparse::parse_dict(header, keys); | |||
| if (dict_map.size() == 0) | |||
| fprintf(stderr, "invalid dictionary in header"); | |||
| std::string descr_s = dict_map["descr"]; | |||
| parse_typestring(descr_s); | |||
| // remove | |||
| descr = npy::pyparse::parse_str(descr_s); | |||
| return; | |||
| } | |||
| inline void parse_header(std::string header, std::string& descr, | |||
| bool& fortran_order, | |||
| std::vector<ndarray_len_t>& shape) { | |||
| /* | |||
| The first 6 bytes are a magic string: exactly "\x93NUMPY". | |||
| The next 1 byte is an unsigned byte: the major version number of the file | |||
| format, e.g. \x01. The next 1 byte is an unsigned byte: the minor version | |||
| number of the file format, e.g. \x00. Note: the version of the file format | |||
| is not tied to the version of the numpy package. The next 2 bytes form a | |||
| little-endian unsigned short int: the length of the header data | |||
| HEADER_LEN. The next HEADER_LEN bytes form the header data describing the | |||
| array's format. It is an ASCII string which contains a Python literal | |||
| expression of a dictionary. It is terminated by a newline ('\n') and | |||
| padded with spaces | |||
| ('\x20') to make the total length of the magic string + 4 + HEADER_LEN be | |||
| evenly divisible by 16 for alignment purposes. The dictionary contains | |||
| three keys: | |||
| "descr" : dtype.descr | |||
| An object that can be passed as an argument to the numpy.dtype() | |||
| constructor to create the array's dtype. "fortran_order" : bool Whether | |||
| the array data is Fortran-contiguous or not. Since Fortran-contiguous | |||
| arrays are a common form of non-C-contiguity, we allow them to be written | |||
| directly to disk for efficiency. "shape" : tuple of int The shape of the | |||
| array. For repeatability and readability, this dictionary is formatted | |||
| using pprint.pformat() so the keys are in alphabetic order. | |||
| */ | |||
| // remove trailing newline | |||
| if (header.back() != '\n') | |||
| fprintf(stderr, "invalid header"); | |||
| header.pop_back(); | |||
| // parse the dictionary | |||
| std::vector<std::string> keys{"descr", "fortran_order", "shape"}; | |||
| auto dict_map = npy::pyparse::parse_dict(header, keys); | |||
| if (dict_map.size() == 0) | |||
| fprintf(stderr, "invalid dictionary in header"); | |||
| std::string descr_s = dict_map["descr"]; | |||
| std::string fortran_s = dict_map["fortran_order"]; | |||
| std::string shape_s = dict_map["shape"]; | |||
| // TODO: extract info from typestring | |||
| parse_typestring(descr_s); | |||
| // remove | |||
| descr = npy::pyparse::parse_str(descr_s); | |||
| // convert literal Python bool to C++ bool | |||
| fortran_order = npy::pyparse::parse_bool(fortran_s); | |||
| // parse the shape tuple | |||
| auto shape_v = npy::pyparse::parse_tuple(shape_s); | |||
| if (shape_v.size() == 0) | |||
| fprintf(stderr, "invalid shape tuple in header"); | |||
| for (auto item : shape_v) { | |||
| ndarray_len_t dim = static_cast<ndarray_len_t>(std::stoul(item)); | |||
| shape.push_back(dim); | |||
| } | |||
| } | |||
| inline std::string write_header_dict(const std::string& descr, | |||
| bool fortran_order, | |||
| const std::vector<ndarray_len_t>& shape) { | |||
| std::string s_fortran_order = npy::pyparse::write_boolean(fortran_order); | |||
| std::string shape_s = npy::pyparse::write_tuple(shape); | |||
| return "{'descr': '" + descr + "', 'fortran_order': " + s_fortran_order + | |||
| ", 'shape': " + shape_s + ", }"; | |||
| } | |||
| inline void write_header(std::ostream& out, const std::string& descr, | |||
| bool fortran_order, | |||
| const std::vector<ndarray_len_t>& shape_v) { | |||
| std::string header_dict = write_header_dict(descr, fortran_order, shape_v); | |||
| size_t length = magic_string_length + 2 + 2 + header_dict.length() + 1; | |||
| unsigned char version[2] = {1, 0}; | |||
| if (length >= 255 * 255) { | |||
| length = magic_string_length + 2 + 4 + header_dict.length() + 1; | |||
| version[0] = 2; | |||
| version[1] = 0; | |||
| } | |||
| size_t padding_len = 16 - length % 16; | |||
| std::string padding(padding_len, ' '); | |||
| // write magic | |||
| write_magic(out, version[0], version[1]); | |||
| // write header length | |||
| if (version[0] == 1 && version[1] == 0) { | |||
| char header_len_le16[2]; | |||
| uint16_t header_len = static_cast<uint16_t>(header_dict.length() + | |||
| padding.length() + 1); | |||
| header_len_le16[0] = (header_len >> 0) & 0xff; | |||
| header_len_le16[1] = (header_len >> 8) & 0xff; | |||
| out.write(reinterpret_cast<char*>(header_len_le16), 2); | |||
| } else { | |||
| char header_len_le32[4]; | |||
| uint32_t header_len = static_cast<uint32_t>(header_dict.length() + | |||
| padding.length() + 1); | |||
| header_len_le32[0] = (header_len >> 0) & 0xff; | |||
| header_len_le32[1] = (header_len >> 8) & 0xff; | |||
| header_len_le32[2] = (header_len >> 16) & 0xff; | |||
| header_len_le32[3] = (header_len >> 24) & 0xff; | |||
| out.write(reinterpret_cast<char*>(header_len_le32), 4); | |||
| } | |||
| out << header_dict << padding << '\n'; | |||
| } | |||
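| // Worked example for write_header() above: for a little-endian float32 array | |||
| // of shape (3,) the dict is | |||
| //     {'descr': '<f4', 'fortran_order': False, 'shape': (3,), } | |||
| // which is 57 bytes, so the version 1.0 header is "\x93NUMPY\x01\x00" followed | |||
| // by the 2-byte little-endian HEADER_LEN 70 (= 57 + 12 padding spaces + '\n'), | |||
| // giving a total header size of 6 + 2 + 2 + 70 = 80 bytes, a multiple of 16. | |||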
| inline std::string read_header(std::istream& istream) { | |||
| // check magic bytes and version number | |||
| unsigned char v_major, v_minor; | |||
| read_magic(istream, v_major, v_minor); | |||
| uint32_t header_length = 0; | |||
| if (v_major == 1 && v_minor == 0) { | |||
| char header_len_le16[2]; | |||
| istream.read(header_len_le16, 2); | |||
| header_length = (header_len_le16[0] << 0) | (header_len_le16[1] << 8); | |||
| if ((magic_string_length + 2 + 2 + header_length) % 16 != 0) { | |||
| // TODO: display warning | |||
| } | |||
| } else if (v_major == 2 && v_minor == 0) { | |||
| char header_len_le32[4]; | |||
| istream.read(header_len_le32, 4); | |||
| header_length = (header_len_le32[0] << 0) | (header_len_le32[1] << 8) | | |||
| (header_len_le32[2] << 16) | (header_len_le32[3] << 24); | |||
| if ((magic_string_length + 2 + 4 + header_length) % 16 != 0) { | |||
| // TODO: display warning | |||
| } | |||
| } else { | |||
| fprintf(stderr, "unsupported file format version"); | |||
| } | |||
| auto buf_v = std::vector<char>(); | |||
| buf_v.resize(header_length); | |||
| istream.read(buf_v.data(), header_length); | |||
| std::string header(buf_v.data(), header_length); | |||
| return header; | |||
| } | |||
| inline ndarray_len_t comp_size(const std::vector<ndarray_len_t>& shape) { | |||
| ndarray_len_t size = 1; | |||
| for (ndarray_len_t i : shape) | |||
| size *= i; | |||
| return size; | |||
| } | |||
| template <typename Scalar> | |||
| inline void SaveArrayAsNumpy(const std::string& filename, bool fortran_order, | |||
| unsigned int n_dims, const unsigned long shape[], | |||
| const std::vector<Scalar>& data) { | |||
| Typestring typestring_o(data); | |||
| std::string typestring = typestring_o.str(); | |||
| std::ofstream stream(filename, std::ofstream::binary); | |||
| if (!stream) { | |||
| fprintf(stderr, "io error: failed to open a file."); | |||
| } | |||
| std::vector<ndarray_len_t> shape_v(shape, shape + n_dims); | |||
| write_header(stream, typestring, fortran_order, shape_v); | |||
| auto size = static_cast<size_t>(comp_size(shape_v)); | |||
| stream.write(reinterpret_cast<const char*>(data.data()), | |||
| sizeof(Scalar) * size); | |||
| } | |||
| template <typename Scalar> | |||
| inline void LoadArrayFromNumpy(const std::string& filename, | |||
| std::vector<unsigned long>& shape, | |||
| std::vector<Scalar>& data) { | |||
| bool fortran_order; | |||
| LoadArrayFromNumpy<Scalar>(filename, shape, fortran_order, data); | |||
| } | |||
| template <typename Scalar> | |||
| inline void LoadArrayFromNumpy(const std::string& filename, | |||
| std::vector<unsigned long>& shape, | |||
| bool& fortran_order, std::vector<Scalar>& data) { | |||
| std::ifstream stream(filename, std::ifstream::binary); | |||
| if (!stream) { | |||
| fprintf(stderr, "io error: failed to open a file."); | |||
| } | |||
| std::string header = read_header(stream); | |||
| // parse header | |||
| std::string typestr; | |||
| parse_header(header, typestr, fortran_order, shape); | |||
| // check if the typestring matches the given one | |||
| Typestring typestring_o{data}; | |||
| std::string expect_typestr = typestring_o.str(); | |||
| if (typestr != expect_typestr) { | |||
| fprintf(stderr, "formatting error: typestrings not matching"); | |||
| } | |||
| // compute the data size based on the shape | |||
| auto size = static_cast<size_t>(comp_size(shape)); | |||
| data.resize(size); | |||
| // read the data | |||
| stream.read(reinterpret_cast<char*>(data.data()), sizeof(Scalar) * size); | |||
| } | |||
| inline void LoadArrayFromNumpy(const std::string& filename, | |||
| std::string& type_str, | |||
| std::vector<ndarray_len_t>& shape, | |||
| std::vector<int8_t>& data) { | |||
| std::ifstream stream(filename, std::ifstream::binary); | |||
| if (!stream) { | |||
| fprintf(stderr, "io error: failed to open a file."); | |||
| } | |||
| std::string header = read_header(stream); | |||
| bool fortran_order; | |||
| // parse header | |||
| parse_header(header, type_str, fortran_order, shape); | |||
| // check if the typestring matches the given one | |||
| std::string size_str = type_str.substr(type_str.size() - 1); | |||
| size_t elem_size = atoi(size_str.c_str()); | |||
| // compute the data size based on the shape | |||
| auto byte_size = elem_size * static_cast<size_t>(comp_size(shape)); | |||
| data.resize(byte_size); | |||
| // read the data | |||
| stream.read(reinterpret_cast<char*>(data.data()), byte_size); | |||
| } | |||
| } // namespace npy | |||
| #endif // NPY_H | |||
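| // Usage sketch for the helpers above (the file name "vec.npy" is hypothetical): | |||
| // | |||
| //     std::vector<float> src = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f}; | |||
| //     const unsigned long shape[] = {2, 3}; | |||
| //     npy::SaveArrayAsNumpy("vec.npy", false, 2, shape, src); | |||
| // | |||
| //     std::vector<unsigned long> loaded_shape; | |||
| //     std::vector<float> loaded; | |||
| //     npy::LoadArrayFromNumpy("vec.npy", loaded_shape, loaded); | |||
| //     // loaded_shape == {2, 3} and loaded holds the six values in C order | |||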
| @@ -0,0 +1,184 @@ | |||
| /** | |||
| * \file test/test_common.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include "lite_build_config.h" | |||
| #if LITE_BUILD_WITH_MGE | |||
| #include "../src/misc.h" | |||
| #include "../src/mge/network_impl.h" | |||
| #include "../src/mge/common.h" | |||
| #include "lite/network.h" | |||
| #include "lite/tensor.h" | |||
| #include "megbrain/tensor.h" | |||
| #include "megbrain/graph/bases.h" | |||
| #include "megbrain/plugin/opr_io_dump.h" | |||
| #include "megbrain/plugin/profiler.h" | |||
| #include "megbrain/serialization/extern_c_opr.h" | |||
| #include "megbrain/serialization/file.h" | |||
| #include "megbrain/serialization/load_dump_config.h" | |||
| #include "megbrain/serialization/serializer.h" | |||
| #include "megbrain/utils/thin/hash_table.h" | |||
| #include "npy.h" | |||
| #include <gtest/gtest.h> | |||
| #include <string.h> | |||
| #include <chrono> | |||
| #include <memory> | |||
| #include <random> | |||
| namespace lite { | |||
| template <typename T> | |||
| static ::testing::AssertionResult compare_memory(const void* memory0, | |||
| const void* memory1, | |||
| size_t length, | |||
| float maxerr = 1e-3) { | |||
| const T* data_ptr0 = static_cast<const T*>(memory0); | |||
| const T* data_ptr1 = static_cast<const T*>(memory1); | |||
| for (size_t i = 0; i < length; i++) { | |||
| auto diff = std::abs(data_ptr0[i] - data_ptr1[i]); | |||
| if (diff > maxerr) { | |||
| return ::testing::AssertionFailure() | |||
| << "Unequal value:\n" | |||
| << "value 0 = " << data_ptr0[i] << "\n" | |||
| << "value 1 = " << data_ptr1[i] << "\n" | |||
| << "At index: " << i << "\n"; | |||
| } | |||
| } | |||
| return ::testing::AssertionSuccess(); | |||
| } | |||
| template <typename T> | |||
| void compare_lite_tensor(std::shared_ptr<Tensor> tensor0, | |||
| std::shared_ptr<Tensor> tensor1, float maxerr = 1e-3) { | |||
| size_t elemsize = tensor0->get_layout().get_elem_size(); | |||
| T* data_ptr0 = static_cast<T*>(tensor0->get_memory_ptr()); | |||
| T* data_ptr1 = static_cast<T*>(tensor1->get_memory_ptr()); | |||
| size_t length = tensor0->get_tensor_total_size_in_byte() / elemsize; | |||
| EXPECT_TRUE(compare_memory<T>(data_ptr0, data_ptr1, length, maxerr)); | |||
| } | |||
| __attribute__((unused)) static std::shared_ptr<Tensor> get_input_data( | |||
| std::string path) { | |||
| std::string type_str; | |||
| std::vector<npy::ndarray_len_t> stl_shape; | |||
| std::vector<int8_t> raw; | |||
| npy::LoadArrayFromNumpy(path, type_str, stl_shape, raw); | |||
| auto lite_tensor = std::make_shared<Tensor>(LiteDeviceType::LITE_CPU); | |||
| Layout layout; | |||
| layout.ndim = stl_shape.size(); | |||
| const std::map<std::string, LiteDataType> type_map = { | |||
| {"f4", LiteDataType::LITE_FLOAT}, | |||
| {"i4", LiteDataType::LITE_INT}, | |||
| {"i1", LiteDataType::LITE_INT8}, | |||
| {"u1", LiteDataType::LITE_UINT8}}; | |||
| layout.shapes[0] = 1; | |||
| for (size_t i = 0; i < stl_shape.size(); i++) { | |||
| layout.shapes[i] = static_cast<size_t>(stl_shape[i]); | |||
| } | |||
| for (auto& item : type_map) { | |||
| if (type_str.find(item.first) != std::string::npos) { | |||
| layout.data_type = item.second; | |||
| break; | |||
| } | |||
| } | |||
| lite_tensor->set_layout(layout); | |||
| size_t length = lite_tensor->get_tensor_total_size_in_byte(); | |||
| void* dest = lite_tensor->get_memory_ptr(); | |||
| memcpy(dest, raw.data(), length); | |||
| return lite_tensor; | |||
| } | |||
| __attribute__((unused)) static std::shared_ptr<Tensor> mgelite_lar( | |||
| std::string model_path, const Config& config, std::string, | |||
| std::shared_ptr<Tensor> input) { | |||
| std::unique_ptr<Network> network = std::make_unique<Network>(config); | |||
| network->load_model(model_path); | |||
| std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0); | |||
| auto src_ptr = input->get_memory_ptr(); | |||
| auto src_layout = input->get_layout(); | |||
| input_tensor->reset(src_ptr, src_layout); | |||
| network->forward(); | |||
| network->wait(); | |||
| std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
| Layout out_layout = output_tensor->get_layout(); | |||
| auto ret = std::make_shared<Tensor>(LiteDeviceType::LITE_CPU, out_layout); | |||
| void* out_data = output_tensor->get_memory_ptr(); | |||
| void* dst_data = ret->get_memory_ptr(); | |||
| memcpy(dst_data, out_data, ret->get_tensor_total_size_in_byte()); | |||
| return ret; | |||
| } | |||
| __attribute__((unused)) static std::shared_ptr<Tensor> mgb_lar( | |||
| std::string model_path, const Config& config, std::string input_name, | |||
| std::shared_ptr<Tensor> input) { | |||
| LITE_ASSERT(config.bare_model_cryption_name.size() == 0); | |||
| using namespace mgb; | |||
| serialization::GraphLoader::LoadConfig mgb_config; | |||
| mgb_config.comp_node_mapper = [config](CompNode::Locator& loc) { | |||
| loc = to_compnode_locator(config.device_type); | |||
| }; | |||
| mgb_config.comp_graph = ComputingGraph::make(); | |||
| auto&& graph_opt = mgb_config.comp_graph->options(); | |||
| if (config.options.weight_preprocess) { | |||
| graph_opt.graph_opt.enable_weight_preprocess(); | |||
| } | |||
| graph_opt.comp_node_seq_record_level = | |||
| config.options.comp_node_seq_record_level; | |||
| auto inp_file = mgb::serialization::InputFile::make_fs(model_path.c_str()); | |||
| auto format = | |||
| serialization::GraphLoader::identify_graph_dump_format(*inp_file); | |||
| mgb_assert(format.valid(), | |||
| "invalid model: unknown model format, please make sure input " | |||
| "file is generated by GraphDumper"); | |||
| auto loader = | |||
| serialization::GraphLoader::make(std::move(inp_file), format.val()); | |||
| auto load_ret = loader->load(mgb_config, false); | |||
| ComputingGraph::OutputSpec out_spec; | |||
| std::vector<HostTensorND> output_tensors(load_ret.output_var_list.size()); | |||
| for (size_t i = 0; i < load_ret.output_var_list.size(); i++) { | |||
| auto cb = [&output_tensors, i](const DeviceTensorND& dv) mutable { | |||
| output_tensors[i].copy_from(dv); | |||
| }; | |||
| out_spec.emplace_back(load_ret.output_var_list[i], std::move(cb)); | |||
| } | |||
| auto func = load_ret.graph_compile(out_spec); | |||
| auto& in = load_ret.tensor_map.find(input_name)->second; | |||
| in->copy_from(*TensorHelper::implement(input) | |||
| ->cast_final_safe<TensorImplDft>() | |||
| .host_tensor()); | |||
| func->execute(); | |||
| func->wait(); | |||
| std::shared_ptr<Tensor> ret = std::make_shared<Tensor>( | |||
| LiteDeviceType::LITE_CPU, | |||
| to_lite_layout(output_tensors[0].layout())); | |||
| auto mge_tensor = TensorHelper::implement(ret) | |||
| ->cast_final_safe<TensorImplDft>() | |||
| .host_tensor(); | |||
| mge_tensor->copy_from(output_tensors[0]); | |||
| return ret; | |||
| } | |||
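| /*! | |||
| * Usage sketch (hypothetical test body using the helpers above; the model, | |||
| * input file and input name mirror the resources used elsewhere in the tests): | |||
| * run the same model through lite and through the raw megbrain path, then | |||
| * compare the outputs. | |||
| * | |||
| *     auto input = get_input_data("./input_data.npy"); | |||
| *     Config config; | |||
| *     auto lite_out = mgelite_lar("./shufflenet.mge", config, "data", input); | |||
| *     auto mgb_out = mgb_lar("./shufflenet.mge", config, "data", input); | |||
| *     compare_lite_tensor<float>(lite_out, mgb_out); | |||
| */ | |||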
| } // namespace lite | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,115 @@ | |||
| /** | |||
| * \file test/test_misc.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "lite_build_config.h" | |||
| #if LITE_BUILD_WITH_MGE | |||
| #include "test_common.h" | |||
| #include "../src/decryption/decrypt_base.h" | |||
| #include "../src/network_impl_base.h" | |||
| #include "megbrain/opr/io.h" | |||
| #include "megbrain/tensor.h" | |||
| #include "megbrain/utils/metahelper.h" | |||
| #include <gtest/gtest.h> | |||
| #include <string.h> | |||
| #include <chrono> | |||
| #include <memory> | |||
| #include <random> | |||
| using namespace lite; | |||
| TEST(TestMisc, DecryptionRegister) { | |||
| size_t number = decryption_static_data().decryption_methods.size(); | |||
| //! At least one method is registered by lite | |||
| ASSERT_GE(number, 1); | |||
| DecryptionFunc func; | |||
| register_decryption_and_key("AllForTest0", func, {}); | |||
| ASSERT_EQ(number + 1, decryption_static_data().decryption_methods.size()); | |||
| } | |||
| TEST(TestMisc, DecryptionUpdate) { | |||
| DecryptionFunc func; | |||
| register_decryption_and_key("AllForTest1", func, {}); | |||
| func = [](const void*, size_t, | |||
| const std::vector<uint8_t>&) -> std::vector<uint8_t> { | |||
| return {}; | |||
| }; | |||
| update_decryption_or_key("AllForTest1", func, {}); | |||
| ASSERT_NE(decryption_static_data().decryption_methods["AllForTest1"].first, | |||
| nullptr); | |||
| ASSERT_EQ(decryption_static_data() | |||
| .decryption_methods["AllForTest1"] | |||
| .second->size(), | |||
| 0); | |||
| update_decryption_or_key("AllForTest1", {}, {1, 2, 3}); | |||
| ASSERT_EQ(decryption_static_data() | |||
| .decryption_methods["AllForTest1"] | |||
| .second->size(), | |||
| 3); | |||
| } | |||
| TEST(TestMisc, SharedSameDeviceTensor) { | |||
| using namespace mgb; | |||
| serialization::GraphLoader::LoadConfig mgb_config; | |||
| mgb_config.comp_node_mapper = [](CompNode::Locator& loc) { | |||
| loc = to_compnode_locator(LiteDeviceType::LITE_CPU); | |||
| }; | |||
| mgb_config.comp_graph = ComputingGraph::make(); | |||
| std::string model_path = "./shufflenet.mge"; | |||
| auto inp_file = mgb::serialization::InputFile::make_fs(model_path.c_str()); | |||
| auto format = | |||
| serialization::GraphLoader::identify_graph_dump_format(*inp_file); | |||
| mgb_assert(format.valid(), | |||
| "invalid model: unknown model format, please make sure input " | |||
| "file is generated by GraphDumper"); | |||
| auto loader = | |||
| serialization::GraphLoader::make(std::move(inp_file), format.val()); | |||
| auto load_ret_1 = loader->load(mgb_config, true); | |||
| auto load_ret_2 = loader->load(mgb_config, true); | |||
| ASSERT_EQ(load_ret_1.output_var_list.size(), | |||
| load_ret_2.output_var_list.size()); | |||
| ComputingGraph::OutputSpec out_spec_1, out_spec_2; | |||
| for (size_t i = 0; i < load_ret_1.output_var_list.size(); i++) { | |||
| out_spec_1.emplace_back(load_ret_1.output_var_list[i], nullptr); | |||
| out_spec_2.emplace_back(load_ret_2.output_var_list[i], nullptr); | |||
| } | |||
| auto func_1 = load_ret_1.graph_compile(out_spec_1); | |||
| auto func_2 = load_ret_2.graph_compile(out_spec_2); | |||
| std::vector<cg::OperatorNodeBase*> oprs_1, oprs_2; | |||
| func_1->iter_opr_seq([&oprs_1](cg::OperatorNodeBase* opr) -> bool { | |||
| if (opr->try_cast_final<opr::ImmutableTensor>()) { | |||
| oprs_1.push_back(opr); | |||
| } | |||
| return true; | |||
| }); | |||
| func_2->iter_opr_seq([&oprs_2](cg::OperatorNodeBase* opr) -> bool { | |||
| if (opr->try_cast_final<opr::ImmutableTensor>()) { | |||
| oprs_2.push_back(opr); | |||
| } | |||
| return true; | |||
| }); | |||
| ASSERT_EQ(oprs_1.size(), oprs_2.size()); | |||
| for (size_t i = 0; i < oprs_1.size(); i++) { | |||
| auto tensor_1 = | |||
| oprs_1[i]->try_cast_final<opr::ImmutableTensor>()->value(); | |||
| auto tensor_2 = | |||
| oprs_2[i]->try_cast_final<opr::ImmutableTensor>()->value(); | |||
| ASSERT_EQ(tensor_1.raw_ptr(), tensor_2.raw_ptr()); | |||
| } | |||
| } | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,895 @@ | |||
| /** | |||
| * \file test/test_network_c.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "../src/misc.h" | |||
| #if LITE_BUILD_WITH_MGE | |||
| #include "../src/common.h" | |||
| #include "../src/mge/network_impl.h" | |||
| #include "../lite-c/src/common.h" | |||
| #include "lite-c/global_c.h" | |||
| #include "lite-c/network_c.h" | |||
| #include "lite-c/tensor_c.h" | |||
| #include "./test_common.h" | |||
| #include "megbrain/tensor.h" | |||
| #include <string.h> | |||
| #include <atomic> | |||
| #include <chrono> | |||
| #include <cmath> | |||
| #include <memory> | |||
| #include <random> | |||
| #include <thread> | |||
| #include <unordered_map> | |||
| namespace { | |||
| bool affinity_set = false; | |||
| int single_thread_affinity(int) { | |||
| affinity_set = true; | |||
| return 0; | |||
| } | |||
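| //! counting allocator used by the UserAllocator test: m_nr_left tracks blocks | |||
| //! still outstanding, m_nr_allocated counts every allocation ever made | |||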
| std::atomic_size_t m_nr_left{0}; | |||
| std::atomic_size_t m_nr_allocated{0}; | |||
| void* allocate(LiteDeviceType device, int, size_t size, size_t align) { | |||
| LITE_ASSERT(device == LiteDeviceType::LITE_CPU); | |||
| m_nr_left++; | |||
| m_nr_allocated++; | |||
| #ifdef WIN32 | |||
| return _aligned_malloc(size, align); | |||
| #elif defined(__ANDROID__) || defined(ANDROID) | |||
| return memalign(align, size); | |||
| #else | |||
| void* ptr = nullptr; | |||
| auto err = posix_memalign(&ptr, align, size); | |||
| mgb_assert(!err, "failed to malloc %zu bytes with align %zu", size, align); | |||
| return ptr; | |||
| #endif | |||
| } | |||
| void free(LiteDeviceType device, int, void* ptr) { | |||
| m_nr_left--; | |||
| LITE_ASSERT(device == LiteDeviceType::LITE_CPU); | |||
| #ifdef WIN32 | |||
| _aligned_free(ptr); | |||
| #else | |||
| ::free(ptr); | |||
| #endif | |||
| } | |||
| #define NUMBER_THREDS (4) | |||
| std::vector<std::thread::id> thread_ids(NUMBER_THREDS); | |||
| int multi_thread_affinity(int id) { | |||
| thread_ids[id] = std::this_thread::get_id(); | |||
| return 0; | |||
| } | |||
| volatile bool finished = false; | |||
| int finish_callback() { | |||
| finished = true; | |||
| return 0; | |||
| } | |||
| volatile bool start_checked = false; | |||
| int start_callback(const LiteIO* inputs, const LiteTensor* input_tensors, | |||
| size_t size) { | |||
| start_checked = true; | |||
| auto check_func = [&]() { | |||
| ASSERT_EQ(size, 1); | |||
| ASSERT_EQ(std::string(inputs->name), "data"); | |||
| LiteLayout layout; | |||
| LITE_get_tensor_layout(*input_tensors, &layout); | |||
| ASSERT_EQ(layout.ndim, 4); | |||
| ASSERT_EQ(layout.shapes[1], 3); | |||
| ASSERT_EQ(layout.shapes[2], 224); | |||
| ASSERT_EQ(layout.shapes[3], 224); | |||
| }; | |||
| check_func(); | |||
| return 0; | |||
| } | |||
| volatile bool finish_checked = false; | |||
| int finish_callback(const LiteIO* outputs, const LiteTensor* output_tensors, | |||
| size_t size) { | |||
| finish_checked = true; | |||
| auto check_func = [&]() { | |||
| ASSERT_EQ(size, 1); | |||
| ASSERT_EQ(std::string(outputs->name), | |||
| "TRUE_DIV(EXP[12065],reduce0[12067])[12077]"); | |||
| LiteLayout layout; | |||
| LITE_get_tensor_layout(*output_tensors, &layout); | |||
| ASSERT_EQ(layout.shapes[1], 1000); | |||
| }; | |||
| check_func(); | |||
| return 0; | |||
| } | |||
| } // namespace | |||
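| //! helper macros shared by the C-API tests below: run the model through the | |||
| //! mgb path first to obtain a reference result, then create/load/run the | |||
| //! network through the C API and compare its output with that reference | |||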
| #define LITE_CAPI_CHECK(_expr) \ | |||
| do { \ | |||
| int _ret = (_expr); \ | |||
| if (_ret) { \ | |||
| LITE_THROW(LITE_get_last_error()); \ | |||
| } \ | |||
| } while (0) | |||
| #define ForwardMgb \ | |||
| lite::Config config; \ | |||
| auto lite_tensor = lite::get_input_data("./input_data.npy"); \ | |||
| size_t data_length_in_byte = lite_tensor->get_tensor_total_size_in_byte(); \ | |||
| std::string model_path = "./shufflenet.mge"; \ | |||
| auto result_mgb = mgb_lar(model_path, config, "data", lite_tensor) | |||
| #define MakeNetwork \ | |||
| LiteNetwork c_network; \ | |||
| LITE_CAPI_CHECK(LITE_make_network(&c_network, *default_config(), \ | |||
| *default_network_io())) | |||
| #define LoadNetwork \ | |||
| LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, model_path.c_str())) | |||
| #define SetInput \ | |||
| LiteTensor c_input_tensor, c_output_tensor; \ | |||
| LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, "data", LITE_INPUT, \ | |||
| &c_input_tensor)); \ | |||
| LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor, \ | |||
| lite_tensor->get_memory_ptr(), \ | |||
| data_length_in_byte)) | |||
| #define ForwardNetwork \ | |||
| LITE_CAPI_CHECK(LITE_forward(c_network)); \ | |||
| LITE_CAPI_CHECK(LITE_wait(c_network)) | |||
| #define GetOutput \ | |||
| const char* output_name; \ | |||
| LITE_CAPI_CHECK(LITE_get_output_name(c_network, 0, &output_name)); \ | |||
| LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_name, LITE_OUTPUT, \ | |||
| &c_output_tensor)); \ | |||
| void* output_ptr; \ | |||
| LITE_CAPI_CHECK(LITE_get_tensor_memory(c_output_tensor, &output_ptr)) | |||
| #define CompareResult \ | |||
| EXPECT_TRUE(lite::compare_memory<float>( \ | |||
| output_ptr, result_mgb->get_memory_ptr(), \ | |||
| result_mgb->get_tensor_total_size_in_byte() / sizeof(float))) | |||
| TEST(TestCapiNetWork, BasicResetInput) { | |||
| ForwardMgb; | |||
| LiteNetwork c_network; | |||
| LITE_CAPI_CHECK(LITE_make_default_network(&c_network)); | |||
| LoadNetwork; | |||
| SetInput; | |||
| ForwardNetwork; | |||
| GetOutput; | |||
| CompareResult; | |||
| LITE_destroy_network(c_network); | |||
| } | |||
| TEST(TestCapiNetWork, GetAllName) { | |||
| std::string model_path = "./shufflenet.mge"; | |||
| LiteNetwork c_network; | |||
| LITE_CAPI_CHECK(LITE_make_default_network(&c_network)); | |||
| LoadNetwork; | |||
| size_t input_size, output_size; | |||
| LITE_get_all_input_name(c_network, &input_size, nullptr); | |||
| LITE_get_all_output_name(c_network, &output_size, nullptr); | |||
| std::vector<const char*> input_names(input_size); | |||
| LITE_get_all_input_name(c_network, nullptr, input_names.data()); | |||
| ASSERT_EQ(input_names.size(), 1); | |||
| ASSERT_TRUE(std::string(input_names[0]) == "data"); | |||
| std::vector<const char*> output_names(output_size); | |||
| LITE_get_all_output_name(c_network, nullptr, output_names.data()); | |||
| ASSERT_TRUE(std::string(output_names[0]) == | |||
| "TRUE_DIV(EXP[12065],reduce0[12067])[12077]"); | |||
| ASSERT_EQ(output_names.size(), 1); | |||
| LITE_destroy_network(c_network); | |||
| } | |||
| #if LITE_BUILD_WITH_RKNPU | |||
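| //! pick the top-N probabilities and their class indices from a raw score buffer | |||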
| static int GetTop(float* pfProb, float* pfMaxProb, uint32_t* pMaxClass, | |||
| uint32_t outputCount, uint32_t topNum) { | |||
| uint32_t i, j; | |||
| #define MAX_TOP_NUM 20 | |||
| if (topNum > MAX_TOP_NUM) | |||
| return 0; | |||
| memset(pfMaxProb, 0, sizeof(float) * topNum); | |||
| memset(pMaxClass, 0xff, sizeof(uint32_t) * topNum); | |||
| for (j = 0; j < topNum; j++) { | |||
| for (i = 0; i < outputCount; i++) { | |||
| if ((i == *(pMaxClass + 0)) || (i == *(pMaxClass + 1)) || | |||
| (i == *(pMaxClass + 2)) || (i == *(pMaxClass + 3)) || | |||
| (i == *(pMaxClass + 4))) { | |||
| continue; | |||
| } | |||
| if (pfProb[i] > *(pfMaxProb + j)) { | |||
| *(pfMaxProb + j) = pfProb[i]; | |||
| *(pMaxClass + j) = i; | |||
| } | |||
| } | |||
| } | |||
| return 1; | |||
| } | |||
| TEST(TestCapiNetWork, rknntest_set_info) { | |||
| #define SET_INFO_SIZE 2 | |||
| #define TENSOR_TYPE_UINT8 3 | |||
| #define TENSOR_FORMAT_NHWC 1 | |||
| LiteConfig config; | |||
| config.backend = LiteBackend::LITE_RK_NPU; | |||
| config.device_type = LiteDeviceType::LITE_NPU; | |||
| config.bare_model_cryption_name = nullptr; | |||
| auto lite_tensor = lite::get_input_data("./model/cat_224x224.npy"); | |||
| auto true_tensor = lite::get_input_data("./output_data.npy"); | |||
| auto rknn_model = "./model/mobilenet_v1.rknn"; | |||
| LiteNetwork c_network; | |||
| LITE_CAPI_CHECK(LITE_make_network_config(&c_network, config)); | |||
| LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, rknn_model)); | |||
| size_t input_size, output_size; | |||
| LITE_get_all_input_name(c_network, &input_size, nullptr); | |||
| LITE_get_all_output_name(c_network, &output_size, nullptr); | |||
| std::vector<const char*> input_names(input_size); | |||
| std::vector<const char*> output_names(output_size); | |||
| LiteTensor c_input_tensor, c_output_tensor; | |||
| LITE_get_all_input_name(c_network, nullptr, input_names.data()); | |||
| LITE_get_all_output_name(c_network, nullptr, output_names.data()); | |||
| LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, input_names[0], LITE_IO, | |||
| &c_input_tensor)); | |||
| size_t input_length = 0; | |||
| LITE_get_tensor_total_size_in_byte(c_input_tensor, &input_length); | |||
| size_t data_length_in_byte = lite_tensor->get_tensor_total_size_in_byte(); | |||
| { | |||
| LiteLayout input_layout; | |||
| LITE_get_tensor_layout(c_input_tensor, &input_layout); | |||
| ASSERT_TRUE(input_layout.data_type == LITE_INT8); | |||
| std::vector<int> input_shape = {1, 224, 224, 3}; | |||
| for (size_t i = 0; i < input_layout.ndim; i++) { | |||
| ASSERT_TRUE(input_layout.shapes[i] == input_shape[i]); | |||
| } | |||
| } | |||
| { | |||
| int size_attr = 0; | |||
| LITE_CAPI_CHECK(LITE_get_tensor_attribute(c_input_tensor, nullptr, nullptr, | |||
| &size_attr)); | |||
| ASSERT_TRUE(size_attr > 0); | |||
| const char* keys[size_attr]; | |||
| void* values[size_attr]; | |||
| LITE_CAPI_CHECK(LITE_get_tensor_attribute(c_input_tensor, keys, values, | |||
| &size_attr)); | |||
| ASSERT_TRUE(size_attr > 5); | |||
| std::unordered_map<std::string, uint32_t> result_map = { | |||
| {"zp", 0}, | |||
| {"index", 0}, | |||
| {"size_with_stride", 150528}, | |||
| {"stride", 224}, | |||
| {"n_size", 150528}, | |||
| {"n_elems", 150528}, | |||
| {"qnt_type", 2}, | |||
| {"n_dims", 4}, | |||
| {"type", 2}, | |||
| {"fmt", 1}, | |||
| {"dims0", 1}, | |||
| {"dims1", 224}, | |||
| {"dims2", 224}, | |||
| {"dims3", 3}, | |||
| }; | |||
| for (int i = 0; i < size_attr; i++) { | |||
| std::string key(keys[i]); | |||
| if (key == "names") { | |||
| ASSERT_TRUE(std::string("input") == | |||
| std::string(static_cast<const char*>(values[i]))); | |||
| } else if (key == "scale") { | |||
| float scale = *static_cast<float*>(values[i]); | |||
| ASSERT_TRUE(std::fabs(scale - 0.007812) < 0.00001); | |||
| } else if (key == "fl" || key == "pass_through") { | |||
| uint8_t val = *static_cast<uint8_t*>(values[i]); | |||
| if (key == "fl") { | |||
| ASSERT_TRUE(val == 0); | |||
| } else { | |||
| ASSERT_TRUE(val == 1); | |||
| } | |||
| } else { | |||
| uint32_t val = *static_cast<uint32_t*>(values[i]); | |||
| ASSERT_TRUE(result_map[std::string(keys[i])] == val); | |||
| } | |||
| } | |||
| } | |||
| const char* keys[] = {"type", "fmt"}; | |||
| int info_size = SET_INFO_SIZE; | |||
| int type = TENSOR_TYPE_UINT8; | |||
| int fmt = TENSOR_FORMAT_NHWC; | |||
| void* values[] = {static_cast<void*>(&type), static_cast<void*>(&fmt)}; | |||
| LITE_CAPI_CHECK(LITE_set_tensor_information(c_input_tensor, keys, values, | |||
| info_size)); | |||
| ASSERT_TRUE(std::string(output_names[0]) == | |||
| std::string("MobilenetV1/Predictions/Reshape_1")); | |||
| LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_names[0], LITE_IO, | |||
| &c_output_tensor)); | |||
| LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor, | |||
| lite_tensor->get_memory_ptr(), | |||
| data_length_in_byte)); | |||
| LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_names[0], LITE_IO, | |||
| &c_output_tensor)); | |||
| //LiteLayout tmp_output_layout; | |||
| //LITE_get_tensor_layout(c_output_tensor, &tmp_output_layout); | |||
| //tmp_output_layout.data_type = LiteDataType::LITE_FLOAT; | |||
| //LITE_set_tensor_layout(c_output_tensor, tmp_output_layout); | |||
| { | |||
| const char* keys[] = {"want_float"}; | |||
| uint8_t want_float = 1; | |||
| void* values[] = {static_cast<void*>(&want_float)}; | |||
| LITE_CAPI_CHECK( | |||
| LITE_set_tensor_information(c_output_tensor, keys, values, 1)); | |||
| } | |||
| LITE_CAPI_CHECK(LITE_forward(c_network)); | |||
| LITE_CAPI_CHECK(LITE_wait(c_network)); | |||
| ASSERT_TRUE(std::string(output_names[0]) == "MobilenetV1/Predictions/Reshape_1"); | |||
| ASSERT_EQ(output_names.size(), 1); | |||
| { | |||
| LiteLayout output_layout; | |||
| LITE_get_tensor_layout(c_output_tensor, &output_layout); | |||
| ASSERT_TRUE(output_layout.data_type == LITE_FLOAT); | |||
| int size_attr = 0; | |||
| LITE_CAPI_CHECK(LITE_get_tensor_attribute(c_output_tensor, nullptr, nullptr, | |||
| &size_attr)); | |||
| ASSERT_TRUE(size_attr > 0); | |||
| const char* keys[size_attr]; | |||
| void* values[size_attr]; | |||
| LITE_CAPI_CHECK(LITE_get_tensor_attribute(c_output_tensor, keys, values, | |||
| &size_attr)); | |||
| ASSERT_TRUE(size_attr > 5); | |||
| std::unordered_map<std::string, uint32_t> result_map = { | |||
| {"zp", 0}, | |||
| {"index", 0}, | |||
| {"size_with_stride", 2002}, | |||
| {"stride", 0}, | |||
| {"n_size", 2002}, | |||
| {"n_elems", 1001}, | |||
| {"qnt_type", 2}, | |||
| {"n_dims", 2}, | |||
| {"type", 0}, | |||
| {"fmt", 2}, | |||
| {"dims0", 1}, | |||
| {"dims1", 1001}, | |||
| }; | |||
| for (int i = 0; i < size_attr; i++) { | |||
| std::string key(keys[i]); | |||
| if (key == "names") { | |||
| ASSERT_TRUE("MobilenetV1/Predictions/Reshape_1" == | |||
| std::string(static_cast<const char*>(values[i]))); | |||
| } else if (key == "scale") { | |||
| float scale = *static_cast<float*>(values[i]); | |||
| ASSERT_TRUE(std::fabs(scale - 1.0) < 0.00001); | |||
| } else if (key == "fl" || key == "pass_through") { | |||
| uint8_t val = *static_cast<uint8_t*>(values[i]); | |||
| ASSERT_TRUE(val == 0); | |||
| } else { | |||
| uint32_t val = *static_cast<uint32_t*>(values[i]); | |||
| ASSERT_TRUE(result_map[std::string(keys[i])] == val); | |||
| } | |||
| } | |||
| } | |||
| { | |||
| uint32_t MaxClass[5]; | |||
| float fMaxProb[5]; | |||
| void* output_ptr; | |||
| LITE_get_tensor_memory(c_output_tensor, &output_ptr); | |||
| float* buffer = (float*)output_ptr; | |||
| uint32_t sz = true_tensor->get_tensor_total_size_in_byte() / sizeof(float); | |||
| GetTop(buffer, fMaxProb, MaxClass, sz, 5); | |||
| std::vector<uint32_t> result_class = { | |||
| 286, 464, 282, 357, 285, | |||
| }; | |||
| std::vector<float> result_prob = { | |||
| 0.407227, 0.365723, 0.090454, 0.018051, 0.013069, | |||
| }; | |||
| for (int i = 0; i < 5; i++) { | |||
| ASSERT_TRUE(result_class[i] == MaxClass[i]); | |||
| ASSERT_TRUE(std::fabs(result_prob[i] - fMaxProb[i]) < 0.0001); | |||
| } | |||
| } | |||
| { | |||
| float* true_data = static_cast<float*>(true_tensor->get_memory_ptr()); | |||
| void* output_ptr; | |||
| LITE_get_tensor_memory(c_output_tensor, &output_ptr); | |||
| float* data1 = static_cast<float*>(output_ptr); | |||
| size_t length = | |||
| true_tensor->get_tensor_total_size_in_byte() / sizeof(float); | |||
| for (size_t i = 0; i < length; i++) { | |||
| ASSERT_LT(std::abs(data1[i] - true_data[i]), 1e-3); | |||
| } | |||
| } | |||
| LITE_destroy_network(c_network); | |||
| #undef SET_INFO_SIZE | |||
| #undef TENSOR_FORMAT_NHWC | |||
| #undef TENSOR_TYPE_UINT8 | |||
| } | |||
| TEST(TestCapiNetWork, rknntest_set_info_two_input) { | |||
| #define SET_INFO_SIZE 2 | |||
| #define TENSOR_TYPE_UINT8 3 | |||
| #define TENSOR_FORMAT_NHWC 1 | |||
| LiteConfig config; | |||
| config.backend = LiteBackend::LITE_RK_NPU; | |||
| config.device_type = LiteDeviceType::LITE_NPU; | |||
| config.bare_model_cryption_name = nullptr; | |||
| auto lite_tensor = lite::get_input_data("./model/cat_224x224.npy"); | |||
| auto lite_tensor_dog = lite::get_input_data("./model/dog_224x224.npy"); | |||
| auto true_tensor = lite::get_input_data("./output_data.npy"); | |||
| auto rknn_model = "./model/mobilenet_v1.rknn"; | |||
| LiteNetwork c_network; | |||
| LITE_CAPI_CHECK(LITE_make_network_config(&c_network, config)); | |||
| LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, rknn_model)); | |||
| size_t input_size, output_size; | |||
| LITE_get_all_input_name(c_network, &input_size, nullptr); | |||
| LITE_get_all_output_name(c_network, &output_size, nullptr); | |||
| std::vector<const char*> input_names(input_size); | |||
| std::vector<const char*> output_names(output_size); | |||
| LiteTensor c_input_tensor, c_output_tensor; | |||
| LITE_get_all_input_name(c_network, nullptr, input_names.data()); | |||
| LITE_get_all_output_name(c_network, nullptr, output_names.data()); | |||
| LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, input_names[0], LITE_IO, | |||
| &c_input_tensor)); | |||
| size_t input_length = 0; | |||
| LITE_get_tensor_total_size_in_byte(c_input_tensor, &input_length); | |||
| size_t data_length_in_byte = lite_tensor->get_tensor_total_size_in_byte(); | |||
| { | |||
| LiteLayout input_layout; | |||
| LITE_get_tensor_layout(c_input_tensor, &input_layout); | |||
| ASSERT_TRUE(input_layout.data_type == LITE_INT8); | |||
| std::vector<int> input_shape = {1, 224, 224, 3}; | |||
| for (size_t i = 0; i < input_layout.ndim; i++) { | |||
| ASSERT_TRUE(input_layout.shapes[i] == input_shape[i]); | |||
| } | |||
| } | |||
| const char* keys[] = {"type", "fmt"}; | |||
| int info_size = SET_INFO_SIZE; | |||
| int type = TENSOR_TYPE_UINT8; | |||
| int fmt = TENSOR_FORMAT_NHWC; | |||
| void* values[] = {static_cast<void*>(&type), static_cast<void*>(&fmt)}; | |||
| LITE_CAPI_CHECK(LITE_set_tensor_information(c_input_tensor, keys, values, | |||
| info_size)); | |||
| ASSERT_TRUE(std::string(output_names[0]) == | |||
| std::string("MobilenetV1/Predictions/Reshape_1")); | |||
| LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_names[0], LITE_IO, | |||
| &c_output_tensor)); | |||
| LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor, | |||
| lite_tensor->get_memory_ptr(), | |||
| data_length_in_byte)); | |||
| LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_names[0], LITE_IO, | |||
| &c_output_tensor)); | |||
| { | |||
| const char* keys[] = {"want_float"}; | |||
| uint8_t want_float = 1; | |||
| void* values[] = {static_cast<void*>(&want_float)}; | |||
| LITE_CAPI_CHECK( | |||
| LITE_set_tensor_information(c_output_tensor, keys, values, 1)); | |||
| } | |||
| LITE_CAPI_CHECK(LITE_forward(c_network)); | |||
| LITE_CAPI_CHECK(LITE_wait(c_network)); | |||
| ASSERT_TRUE(std::string(output_names[0]) == | |||
| "MobilenetV1/Predictions/Reshape_1"); | |||
| ASSERT_EQ(output_names.size(), 1); | |||
| { | |||
| uint32_t MaxClass[5]; | |||
| float fMaxProb[5]; | |||
| void* output_ptr; | |||
| LITE_get_tensor_memory(c_output_tensor, &output_ptr); | |||
| float* buffer = (float*)output_ptr; | |||
| uint32_t sz = | |||
| true_tensor->get_tensor_total_size_in_byte() / sizeof(float); | |||
| GetTop(buffer, fMaxProb, MaxClass, sz, 5); | |||
| std::vector<uint32_t> result_class = { | |||
| 286, 464, 282, 357, 285, | |||
| }; | |||
| std::vector<float> result_prob = { | |||
| 0.407227, 0.365723, 0.090454, 0.018051, 0.013069, | |||
| }; | |||
| for (int i = 0; i < 5; i++) { | |||
| ASSERT_TRUE(result_class[i] == MaxClass[i]); | |||
| ASSERT_TRUE(std::fabs(result_prob[i] - fMaxProb[i]) < 0.0001); | |||
| } | |||
| } | |||
| { | |||
| float* true_data = static_cast<float*>(true_tensor->get_memory_ptr()); | |||
| void* output_ptr; | |||
| LITE_get_tensor_memory(c_output_tensor, &output_ptr); | |||
| float* data1 = static_cast<float*>(output_ptr); | |||
| size_t length = | |||
| true_tensor->get_tensor_total_size_in_byte() / sizeof(float); | |||
| for (size_t i = 0; i < length; i++) { | |||
| ASSERT_LT(std::abs(data1[i] - true_data[i]), 1e-3); | |||
| } | |||
| } | |||
| LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor, | |||
| lite_tensor_dog->get_memory_ptr(), | |||
| data_length_in_byte)); | |||
| LITE_CAPI_CHECK(LITE_forward(c_network)); | |||
| LITE_CAPI_CHECK(LITE_wait(c_network)); | |||
| ASSERT_TRUE(std::string(output_names[0]) == | |||
| "MobilenetV1/Predictions/Reshape_1"); | |||
| ASSERT_EQ(output_names.size(), 1); | |||
| { | |||
| uint32_t MaxClass[5]; | |||
| float fMaxProb[5]; | |||
| void* output_ptr; | |||
| LITE_get_tensor_memory(c_output_tensor, &output_ptr); | |||
| float* buffer = (float*)output_ptr; | |||
| uint32_t sz = | |||
| true_tensor->get_tensor_total_size_in_byte() / sizeof(float); | |||
| GetTop(buffer, fMaxProb, MaxClass, sz, 5); | |||
| std::vector<float> result_prob = { | |||
| 0.407227, 0.365723, 0.090454, 0.018051, 0.013069, | |||
| }; | |||
| for (int i = 0; i < 5; i++) { | |||
| ASSERT_FALSE(std::fabs(result_prob[i] - fMaxProb[i]) < 0.0001); | |||
| } | |||
| } | |||
| LITE_destroy_network(c_network); | |||
| #undef SET_INFO_SIZE | |||
| #undef TENSOR_FORMAT_NHWC | |||
| #undef TENSOR_TYPE_UINT8 | |||
| } | |||
| #endif | |||
| TEST(TestCapiNetWork, BasicResetOutput) { | |||
| ForwardMgb; | |||
| LiteNetwork c_network; | |||
| LITE_CAPI_CHECK(LITE_make_default_network(&c_network)); | |||
| LoadNetwork; | |||
| SetInput; | |||
| LiteLayout output_layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}; | |||
| std::shared_ptr<float> ptr(new float[1000], | |||
| [](float* ptr) { delete[] ptr; }); | |||
| const char* output_name; | |||
| LITE_CAPI_CHECK(LITE_get_output_name(c_network, 0, &output_name)); | |||
| LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_name, LITE_IO, | |||
| &c_output_tensor)); | |||
| LITE_CAPI_CHECK( | |||
| LITE_reset_tensor(c_output_tensor, output_layout, ptr.get())); | |||
| ForwardNetwork; | |||
| EXPECT_TRUE(lite::compare_memory<float>( | |||
| ptr.get(), result_mgb->get_memory_ptr(), | |||
| result_mgb->get_tensor_total_size_in_byte() / sizeof(float))); | |||
| LITE_CAPI_CHECK(LITE_destroy_network(c_network)); | |||
| } | |||
| TEST(TestCapiNetWork, BasicInplaceAndSingleThreadAffinity) { | |||
| ForwardMgb; | |||
| MakeNetwork; | |||
| //! config the network with cpu inplace mode | |||
| LITE_CAPI_CHECK(LITE_set_cpu_inplace_mode(c_network)); | |||
| LoadNetwork; | |||
| //! set the single-thread affinity callback | |||
| LITE_CAPI_CHECK(LITE_set_runtime_thread_affinity(c_network, | |||
| single_thread_affinity)); | |||
| SetInput; | |||
| ForwardNetwork; | |||
| ASSERT_EQ(affinity_set, true); | |||
| affinity_set = false; | |||
| GetOutput; | |||
| CompareResult; | |||
| LITE_destroy_network(c_network); | |||
| } | |||
| TEST(TestCapiNetWork, UserAllocator) { | |||
| ForwardMgb; | |||
| MakeNetwork; | |||
| LITE_CAPI_CHECK(LITE_set_memory_allocator(c_network, allocate, free)); | |||
| LoadNetwork; | |||
| SetInput; | |||
| ForwardNetwork; | |||
| ASSERT_GE(m_nr_allocated, 1); | |||
| GetOutput; | |||
| CompareResult; | |||
| LITE_CAPI_CHECK(LITE_destroy_network(c_network)); | |||
| ASSERT_EQ(m_nr_left, 0); | |||
| } | |||
| TEST(TestCapiNetWork, BasicMultiThread) { | |||
| ForwardMgb; | |||
| MakeNetwork; | |||
| LITE_CAPI_CHECK(LITE_set_cpu_threads_number(c_network, NUMBER_THREDS)); | |||
| LoadNetwork; | |||
| LITE_CAPI_CHECK( | |||
| LITE_set_runtime_thread_affinity(c_network, multi_thread_affinity)); | |||
| SetInput; | |||
| ForwardNetwork; | |||
| for (size_t i = 0; i < NUMBER_THREDS; i++) { | |||
| for (size_t j = i + 1; j < NUMBER_THREDS; j++) { | |||
| ASSERT_NE(thread_ids[i], thread_ids[j]); | |||
| } | |||
| } | |||
| for (size_t i = 0; i < NUMBER_THREDS; i++) { | |||
| thread_ids[i] = std::thread::id(); | |||
| } | |||
| GetOutput; | |||
| CompareResult; | |||
| LITE_CAPI_CHECK(LITE_destroy_network(c_network)); | |||
| } | |||
| TEST(TestCapiNetWork, DeviceIO) { | |||
| ForwardMgb; | |||
| LiteNetwork c_network; | |||
| LiteIO input_io = default_io; | |||
| input_io.is_host = true; | |||
| input_io.name = "data"; | |||
| LiteNetworkIO network_io = *default_network_io(); | |||
| network_io.inputs = &input_io; | |||
| network_io.input_size = 1; | |||
| LITE_CAPI_CHECK(LITE_make_network(&c_network, *default_config(), network_io)); | |||
| LoadNetwork; | |||
| SetInput; | |||
| ForwardNetwork; | |||
| GetOutput; | |||
| CompareResult; | |||
| LITE_CAPI_CHECK(LITE_destroy_network(c_network)); | |||
| } | |||
| TEST(TestCapiNetWork, StartCallBack) { | |||
| ForwardMgb; | |||
| MakeNetwork; | |||
| LoadNetwork; | |||
| LITE_CAPI_CHECK(LITE_set_start_callback(c_network, start_callback)); | |||
| SetInput; | |||
| ForwardNetwork; | |||
| GetOutput; | |||
| CompareResult; | |||
| ASSERT_TRUE(start_checked); | |||
| LITE_CAPI_CHECK(LITE_destroy_network(c_network)); | |||
| } | |||
| TEST(TestCapiNetWork, FinishCallBack) { | |||
| ForwardMgb; | |||
| MakeNetwork; | |||
| LoadNetwork; | |||
| LITE_CAPI_CHECK(LITE_set_finish_callback(c_network, finish_callback)); | |||
| SetInput; | |||
| ForwardNetwork; | |||
| GetOutput; | |||
| CompareResult; | |||
| ASSERT_TRUE(finish_checked); | |||
| LITE_CAPI_CHECK(LITE_destroy_network(c_network)); | |||
| } | |||
| TEST(TestCapiNetWork, BasicCryptAes) { | |||
| ForwardMgb; | |||
| LiteConfig c_config = *default_config(); | |||
| c_config.bare_model_cryption_name = "AES_default"; | |||
| LiteNetwork c_network; | |||
| LITE_CAPI_CHECK( | |||
| LITE_make_network(&c_network, c_config, *default_network_io())); | |||
| std::string model_crypt_path = "./shufflenet_crypt_aes.mge"; | |||
| LITE_CAPI_CHECK( | |||
| LITE_load_model_from_path(c_network, model_crypt_path.c_str())); | |||
| SetInput; | |||
| ForwardNetwork; | |||
| GetOutput; | |||
| CompareResult; | |||
| LITE_CAPI_CHECK(LITE_destroy_network(c_network)); | |||
| } | |||
| TEST(TestCapiNetWork, PackedCryptRc4) { | |||
| ForwardMgb; | |||
| MakeNetwork; | |||
| std::string model_crypt_path = "./test_packed_model_rc4.lite"; | |||
| LITE_CAPI_CHECK( | |||
| LITE_load_model_from_path(c_network, model_crypt_path.c_str())); | |||
| SetInput; | |||
| ForwardNetwork; | |||
| GetOutput; | |||
| CompareResult; | |||
| LITE_CAPI_CHECK(LITE_destroy_network(c_network)); | |||
| } | |||
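| //! asynchronous forward: LITE_forward returns immediately and the async | |||
| //! callback sets the volatile flag that the busy-wait loop below polls | |||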
| TEST(TestCapiNetWork, AsyncExec) { | |||
| finished = false; | |||
| ForwardMgb; | |||
| LiteNetwork c_network; | |||
| LiteConfig c_config = *default_config(); | |||
| c_config.options.var_sanity_check_first_run = false; | |||
| LITE_CAPI_CHECK( | |||
| LITE_make_network(&c_network, c_config, *default_network_io())); | |||
| LITE_CAPI_CHECK(LITE_set_async_callback(c_network, finish_callback)); | |||
| LoadNetwork; | |||
| SetInput; | |||
| LITE_forward(c_network); | |||
| size_t count = 0; | |||
| while (finished == false) { | |||
| count++; | |||
| } | |||
| ASSERT_GT(count, 0); | |||
| finished = false; | |||
| GetOutput; | |||
| CompareResult; | |||
| LITE_CAPI_CHECK(LITE_destroy_network(c_network)); | |||
| } | |||
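| //! declare the output as LITE_IO_SHAPE and check the reported output size | |||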
| TEST(TestCapiNetWork, OutputShapeOnly) { | |||
| ForwardMgb; | |||
| LiteNetwork c_network; | |||
| LiteNetworkIO c_network_io = *default_network_io(); | |||
| LiteIO io_output = default_io; | |||
| io_output.io_type = LiteIOType::LITE_IO_SHAPE; | |||
| io_output.name = "TRUE_DIV(EXP[12065],reduce0[12067])[12077]"; | |||
| c_network_io.outputs = &io_output; | |||
| c_network_io.output_size = 1; | |||
| LITE_CAPI_CHECK( | |||
| LITE_make_network(&c_network, *default_config(), c_network_io)); | |||
| LoadNetwork; | |||
| SetInput; | |||
| ForwardNetwork; | |||
| GetOutput; | |||
| size_t length = 0; | |||
| LITE_CAPI_CHECK( | |||
| LITE_get_tensor_total_size_in_byte(c_output_tensor, &length)); | |||
| ASSERT_EQ(length / sizeof(float), 1000); | |||
| LITE_CAPI_CHECK(LITE_destroy_network(c_network)); | |||
| } | |||
| TEST(TestCapiNetWork, ProfileIOdump) { | |||
| ForwardMgb; | |||
| MakeNetwork; | |||
| LITE_CAPI_CHECK( | |||
| LITE_enable_profile_performance(c_network, "./profile.json")); | |||
| LoadNetwork; | |||
| SetInput; | |||
| ForwardNetwork; | |||
| ASSERT_TRUE(fopen("./profile.json", "r")); | |||
| LITE_CAPI_CHECK(LITE_enable_io_txt_dump(c_network, "./io_txt_dump.txt")); | |||
| ForwardNetwork; | |||
| ASSERT_TRUE(fopen("./io_txt_dump.txt", "r")); | |||
| GetOutput; | |||
| CompareResult; | |||
| LITE_CAPI_CHECK(LITE_destroy_network(c_network)); | |||
| } | |||
| TEST(TestCapiNetWork, GetDeviceType) { | |||
| lite::Config config; | |||
| auto lite_tensor = lite::get_input_data("./input_data.npy"); | |||
| std::string model_path = "./shufflenet.mge"; | |||
| MakeNetwork; | |||
| LoadNetwork; | |||
| LiteDeviceType devicetype; | |||
| LITE_CAPI_CHECK(LITE_get_device_type(c_network, &devicetype)); | |||
| ASSERT_TRUE(devicetype == LiteDeviceType::LITE_CPU); | |||
| LITE_CAPI_CHECK(LITE_destroy_network(c_network)); | |||
| } | |||
| TEST(TestCapiNetWork, GetModelExtraInfo) { | |||
| lite::Config config; | |||
| std::string model_path = "./track_640_320_pack_model_rc4_with_info.lite"; | |||
| MakeNetwork; | |||
| LITE_load_model_from_path(c_network, model_path.c_str()); | |||
| const char* info = nullptr; | |||
| int info_size = 0; | |||
| LITE_CAPI_CHECK(LITE_get_model_extra_info(c_network, &info, &info_size)); | |||
| ASSERT_TRUE(info_size > 0); | |||
| printf("info %s \n", info); | |||
| LITE_CAPI_CHECK(LITE_destroy_network(c_network)); | |||
| } | |||
| TEST(TestCapiNetWork, TestWorkSpaceLimit) { | |||
| lite::Config config; | |||
| auto lite_tensor = lite::get_input_data("./input_data.npy"); | |||
| size_t data_length_in_byte = lite_tensor->get_tensor_total_size_in_byte(); | |||
| std::string model_path = "./shufflenet.mge"; | |||
| MakeNetwork; | |||
| LoadNetwork; | |||
| printf("go to config workspace limit\n"); | |||
| LITE_CAPI_CHECK(LITE_set_network_algo_workspace_limit(c_network, 1000)); | |||
| SetInput; | |||
| ForwardNetwork; | |||
| GetOutput; | |||
| LITE_CAPI_CHECK(LITE_destroy_network(c_network)); | |||
| } | |||
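| //! build a second network that shares weights with the first one (in cpu | |||
| //! inplace mode) and check that it produces the same output | |||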
| TEST(TestCapiNetWork, TestShareWeights) { | |||
| ForwardMgb; | |||
| MakeNetwork; | |||
| LoadNetwork; | |||
| SetInput; | |||
| ForwardNetwork; | |||
| GetOutput; | |||
| CompareResult; | |||
| LiteNetwork c_network2; | |||
| LITE_CAPI_CHECK( | |||
| LITE_make_network(&c_network2, *default_config(), *default_network_io())); | |||
| LITE_CAPI_CHECK(LITE_set_cpu_inplace_mode(c_network2)); | |||
| LITE_CAPI_CHECK(LITE_shared_weight_with_network(c_network2, c_network)); | |||
| int is_cpu_inplace_mode = false; | |||
| LITE_CAPI_CHECK(LITE_is_cpu_inplace_mode(c_network2, &is_cpu_inplace_mode)); | |||
| ASSERT_EQ(is_cpu_inplace_mode, true); | |||
| LiteTensor c_input_tensor2, c_output_tensor2; | |||
| LITE_CAPI_CHECK( | |||
| LITE_get_io_tensor(c_network2, "data", LITE_IO, &c_input_tensor2)); | |||
| LITE_CAPI_CHECK(LITE_reset_tensor_memory( | |||
| c_input_tensor2, lite_tensor->get_memory_ptr(), | |||
| lite_tensor->get_tensor_total_size_in_byte())); | |||
| LITE_CAPI_CHECK(LITE_forward(c_network2)); | |||
| LITE_CAPI_CHECK(LITE_wait(c_network2)); | |||
| LITE_CAPI_CHECK(LITE_get_io_tensor(c_network2, output_name, LITE_IO, | |||
| &c_output_tensor2)); | |||
| void* output_ptr2; | |||
| LITE_CAPI_CHECK(LITE_get_tensor_memory(c_output_tensor2, &output_ptr2)); | |||
| EXPECT_TRUE(lite::compare_memory<float>( | |||
| output_ptr2, result_mgb->get_memory_ptr(), | |||
| result_mgb->get_tensor_total_size_in_byte() / sizeof(float))); | |||
| LITE_CAPI_CHECK(LITE_destroy_network(c_network)); | |||
| LITE_CAPI_CHECK(LITE_destroy_network(c_network2)); | |||
| } | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,351 @@ | |||
| /** | |||
| * \file test/test_network_options.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "lite_build_config.h" | |||
| #if LITE_BUILD_WITH_MGE | |||
| #include "../src/common.h" | |||
| #include "../src/misc.h" | |||
| #include "../src/mge/network_impl.h" | |||
| #include "lite/global.h" | |||
| #include "megbrain/tensor.h" | |||
| #include "test_common.h" | |||
| #include <string.h> | |||
| #include <chrono> | |||
| #include <memory> | |||
| #include <random> | |||
| using namespace lite; | |||
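| //! each test below enables one Config/Runtime option, runs shufflenet on the | |||
| //! same input and compares the result with a plain mgb_lar forward | |||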
| TEST(TestNetWorkOptions, no_var_sanity_check_and_record) { | |||
| Config config; | |||
| auto tensor = get_input_data("./input_data.npy"); | |||
| std::string model_path = "./shufflenet.mge"; | |||
| std::string input_name = "data"; | |||
| auto result_mgb = mgb_lar(model_path, config, input_name, tensor); | |||
| config.options.var_sanity_check_first_run = false; | |||
| config.options.comp_node_seq_record_level = 1; | |||
| std::shared_ptr<Network> network = std::make_shared<Network>(config); | |||
| network->load_model(model_path); | |||
| std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name); | |||
| auto src_ptr = tensor->get_memory_ptr(); | |||
| auto src_layout = tensor->get_layout(); | |||
| input_tensor->reset(src_ptr, src_layout); | |||
| std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
| auto result_tensor = std::make_shared<Tensor>( | |||
| LiteDeviceType::LITE_CPU, | |||
| Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); | |||
| void* out_data = result_tensor->get_memory_ptr(); | |||
| output_tensor->reset(out_data, result_tensor->get_layout()); | |||
| network->forward(); | |||
| network->wait(); | |||
| compare_lite_tensor<float>(output_tensor, result_mgb); | |||
| } | |||
| TEST(TestNetWorkOptions, const_shape) { | |||
| Config config; | |||
| auto tensor = get_input_data("./input_data.npy"); | |||
| std::string model_path = "./shufflenet.mge"; | |||
| std::string input_name = "data"; | |||
| auto result_mgb = mgb_lar(model_path, config, input_name, tensor); | |||
| config.options.var_sanity_check_first_run = false; | |||
| config.options.const_shape = true; | |||
| std::shared_ptr<Network> network = std::make_shared<Network>(config); | |||
| network->load_model(model_path); | |||
| std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name); | |||
| auto src_ptr = tensor->get_memory_ptr(); | |||
| auto src_layout = tensor->get_layout(); | |||
| input_tensor->reset(src_ptr, src_layout); | |||
| std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
| auto result_tensor = std::make_shared<Tensor>( | |||
| LiteDeviceType::LITE_CPU, | |||
| Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); | |||
| void* out_data = result_tensor->get_memory_ptr(); | |||
| output_tensor->reset(out_data, result_tensor->get_layout()); | |||
| network->forward(); | |||
| network->wait(); | |||
| compare_lite_tensor<float>(output_tensor, result_mgb); | |||
| } | |||
| TEST(TestNetWorkOptions, NCHW44) { | |||
| Config config; | |||
| auto tensor = get_input_data("./input_data.npy"); | |||
| std::string model_path = "./shufflenet.mge"; | |||
| std::string input_name = "data"; | |||
| auto result_mgb = mgb_lar(model_path, config, input_name, tensor); | |||
| config.options.var_sanity_check_first_run = false; | |||
| config.options.enable_nchw44 = true; | |||
| std::shared_ptr<Network> network = std::make_shared<Network>(config); | |||
| Runtime::set_network_algo_policy( | |||
| network, LiteAlgoSelectStrategy::LITE_ALGO_PROFILE | | |||
| LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE); | |||
| network->load_model(model_path); | |||
| std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name); | |||
| auto src_ptr = tensor->get_memory_ptr(); | |||
| auto src_layout = tensor->get_layout(); | |||
| input_tensor->reset(src_ptr, src_layout); | |||
| std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
| auto result_tensor = std::make_shared<Tensor>( | |||
| LiteDeviceType::LITE_CPU, | |||
| Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); | |||
| void* out_data = result_tensor->get_memory_ptr(); | |||
| output_tensor->reset(out_data, result_tensor->get_layout()); | |||
| network->forward(); | |||
| network->wait(); | |||
| compare_lite_tensor<float>(output_tensor, result_mgb); | |||
| } | |||
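| //! persistent algo-cache round trip: profile with fast-run, dump the cache to | |||
| //! a file, reload it and forward again | |||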
| TEST(TestNetWorkOptions, test_cache) { | |||
| Config config; | |||
| auto tensor = get_input_data("./input_data.npy"); | |||
| std::string model_path = "./shufflenet.mge"; | |||
| std::string input_name = "data"; | |||
| auto result_mgb = mgb_lar(model_path, config, input_name, tensor); | |||
| std::shared_ptr<Network> network = std::make_shared<Network>(config); | |||
| set_persistent_cache("./algo_cache.txt", true); | |||
| network->load_model(model_path); | |||
| Runtime::set_network_algo_policy( | |||
| network, LiteAlgoSelectStrategy::LITE_ALGO_PROFILE | | |||
| LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE); | |||
| std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name); | |||
| auto src_ptr = tensor->get_memory_ptr(); | |||
| auto src_layout = tensor->get_layout(); | |||
| input_tensor->reset(src_ptr, src_layout); | |||
| std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
| auto result_tensor = std::make_shared<Tensor>( | |||
| LiteDeviceType::LITE_CPU, | |||
| Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); | |||
| void* out_data = result_tensor->get_memory_ptr(); | |||
| output_tensor->reset(out_data, result_tensor->get_layout()); | |||
| network->forward(); | |||
| network->wait(); | |||
| compare_lite_tensor<float>(output_tensor, result_mgb); | |||
| dump_persistent_cache("./algo_cache.txt"); | |||
| ASSERT_TRUE(fopen("./algo_cache.txt", "r")); | |||
| set_persistent_cache("./algo_cache.txt"); | |||
| network->forward(); | |||
| network->wait(); | |||
| compare_lite_tensor<float>(output_tensor, result_mgb); | |||
| } | |||
| TEST(TestNetWorkOptions, FastRunIgnorBatch) { | |||
| Config config; | |||
| auto tensor = get_input_data("./input_data.npy"); | |||
| std::string model_path = "./shufflenet.mge"; | |||
| std::string input_name = "data"; | |||
| auto result_mgb = mgb_lar(model_path, config, input_name, tensor); | |||
| std::shared_ptr<Network> network = std::make_shared<Network>(config); | |||
| set_persistent_cache("./algo_cache.txt"); | |||
| network->load_model(model_path); | |||
| Runtime::set_network_algo_policy( | |||
| network, | |||
| LiteAlgoSelectStrategy::LITE_ALGO_PROFILE | | |||
| LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE, | |||
| 1, true); | |||
| std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name); | |||
| auto src_ptr = tensor->get_memory_ptr(); | |||
| auto src_layout = tensor->get_layout(); | |||
| input_tensor->reset(src_ptr, src_layout); | |||
| std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
| auto result_tensor = std::make_shared<Tensor>( | |||
| LiteDeviceType::LITE_CPU, | |||
| Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); | |||
| void* out_data = result_tensor->get_memory_ptr(); | |||
| output_tensor->reset(out_data, result_tensor->get_layout()); | |||
| network->forward(); | |||
| network->wait(); | |||
| compare_lite_tensor<float>(output_tensor, result_mgb); | |||
| dump_persistent_cache("./algo_cache.txt"); | |||
| ASSERT_TRUE(fopen("./algo_cache.txt", "r")); | |||
| } | |||
| #if LITE_WITH_CUDA | |||
| TEST(TestNetWorkOptions, NCHW4) { | |||
| Config config; | |||
| config.device_type = LiteDeviceType::LITE_CUDA; | |||
| auto tensor = get_input_data("./input_data.npy"); | |||
| std::string model_path = "./shufflenet.mge"; | |||
| std::string input_name = "data"; | |||
| auto result_mgb = mgb_lar(model_path, config, input_name, tensor); | |||
| config.options.enable_nchw4 = 1; | |||
| std::shared_ptr<Network> network = std::make_shared<Network>(config); | |||
| network->load_model(model_path); | |||
| std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name); | |||
| auto src_ptr = tensor->get_memory_ptr(); | |||
| auto src_layout = tensor->get_layout(); | |||
| input_tensor->reset(src_ptr, src_layout); | |||
| std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
| auto result_tensor = std::make_shared<Tensor>( | |||
| LiteDeviceType::LITE_CPU, | |||
| Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); | |||
| void* out_data = result_tensor->get_memory_ptr(); | |||
| output_tensor->reset(out_data, result_tensor->get_layout()); | |||
| network->forward(); | |||
| network->wait(); | |||
| compare_lite_tensor<float>(output_tensor, result_mgb); | |||
| } | |||
| TEST(TestNetWorkOptions, NCHW32) { | |||
| Config config; | |||
| config.device_type = LiteDeviceType::LITE_CUDA; | |||
| auto tensor = get_input_data("./input_data.npy"); | |||
| std::string model_path = "./shufflenet.mge"; | |||
| std::string input_name = "data"; | |||
| auto result_mgb = mgb_lar(model_path, config, input_name, tensor); | |||
| config.options.enable_nchw32 = 1; | |||
| std::shared_ptr<Network> network = std::make_shared<Network>(config); | |||
| Runtime::set_network_algo_policy( | |||
| network, LiteAlgoSelectStrategy::LITE_ALGO_PROFILE | | |||
| LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE); | |||
| network->load_model(model_path); | |||
| std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name); | |||
| auto src_ptr = tensor->get_memory_ptr(); | |||
| auto src_layout = tensor->get_layout(); | |||
| input_tensor->reset(src_ptr, src_layout); | |||
| std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
| auto result_tensor = std::make_shared<Tensor>( | |||
| LiteDeviceType::LITE_CPU, | |||
| Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); | |||
| void* out_data = result_tensor->get_memory_ptr(); | |||
| output_tensor->reset(out_data, result_tensor->get_layout()); | |||
| network->forward(); | |||
| network->wait(); | |||
| compare_lite_tensor<float>(output_tensor, result_mgb); | |||
| } | |||
| TEST(TestNetWorkOptions, jit_level) { | |||
| Config config; | |||
| config.device_type = LiteDeviceType::LITE_CUDA; | |||
| auto tensor = get_input_data("./input_data.npy"); | |||
| std::string model_path = "./shufflenet.mge"; | |||
| std::string input_name = "data"; | |||
| auto result_mgb = mgb_lar(model_path, config, input_name, tensor); | |||
| config.options.jit_level = 1; | |||
| std::shared_ptr<Network> network = std::make_shared<Network>(config); | |||
| network->load_model(model_path); | |||
| std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name); | |||
| auto src_ptr = tensor->get_memory_ptr(); | |||
| auto src_layout = tensor->get_layout(); | |||
| input_tensor->reset(src_ptr, src_layout); | |||
| std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
| auto result_tensor = std::make_shared<Tensor>( | |||
| LiteDeviceType::LITE_CPU, | |||
| Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); | |||
| void* out_data = result_tensor->get_memory_ptr(); | |||
| output_tensor->reset(out_data, result_tensor->get_layout()); | |||
| network->forward(); | |||
| network->wait(); | |||
| compare_lite_tensor<float>(output_tensor, result_mgb); | |||
| } | |||
| #endif | |||
| #if MGB_ENABLE_TENSOR_RT && LITE_WITH_CUDA | |||
| TEST(TestNetWorkOptions, TensorRT) { | |||
| Config config; | |||
| config.device_type = LiteDeviceType::LITE_CUDA; | |||
| auto tensor = get_input_data("./input_data.npy"); | |||
| std::string model_path = "./shufflenet.mge"; | |||
| std::string input_name = "data"; | |||
| auto result_mgb = mgb_lar(model_path, config, input_name, tensor); | |||
| std::shared_ptr<Network> network = std::make_shared<Network>(config); | |||
| Runtime::use_tensorrt(network); | |||
| set_tensor_rt_cache("./tensorrt_cache.txt"); | |||
| network->load_model(model_path); | |||
| std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name); | |||
| auto src_ptr = tensor->get_memory_ptr(); | |||
| auto src_layout = tensor->get_layout(); | |||
| input_tensor->reset(src_ptr, src_layout); | |||
| std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
| auto result_tensor = std::make_shared<Tensor>( | |||
| LiteDeviceType::LITE_CPU, | |||
| Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); | |||
| void* out_data = result_tensor->get_memory_ptr(); | |||
| output_tensor->reset(out_data, result_tensor->get_layout()); | |||
| network->forward(); | |||
| network->wait(); | |||
| dump_tensor_rt_cache(); | |||
| ASSERT_TRUE(fopen("./tensorrt_cache.txt", "r")); | |||
| compare_lite_tensor<float>(output_tensor, result_mgb); | |||
| } | |||
| #endif | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,589 @@ | |||
| /** | |||
| * \file test/test_tensor.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "lite_build_config.h" | |||
| #if LITE_BUILD_WITH_MGE | |||
| #include "../src/misc.h" | |||
| #include "../src/mge/common.h" | |||
| #include "../src/mge/network_impl.h" | |||
| #include "lite/tensor.h" | |||
| #include <gtest/gtest.h> | |||
| #include <string.h> | |||
| #include <cmath> | |||
| #include <memory> | |||
| using namespace lite; | |||
| TEST(TestTensor, Basic) { | |||
| Layout layout{{1, 3, 224, 224}, 4}; | |||
| Tensor tensor1(LiteDeviceType::LITE_CPU); | |||
| Tensor tensor2(LiteDeviceType::LITE_CPU, layout); | |||
| Tensor tensor3(LiteDeviceType::LITE_CPU, layout); | |||
| //! mge tensor has created | |||
| ASSERT_TRUE(TensorHelper::implement(&tensor1)); | |||
| ASSERT_TRUE(TensorHelper::implement(&tensor2)); | |||
| ASSERT_TRUE(TensorHelper::implement(&tensor3)); | |||
| //! check member | |||
| ASSERT_EQ(tensor2.get_device_type(), LiteDeviceType::LITE_CPU); | |||
| ASSERT_EQ(tensor2.get_layout(), layout); | |||
| ASSERT_EQ(tensor3.get_layout(), layout); | |||
| //! check the real tensor | |||
| ASSERT_EQ(tensor2.get_tensor_total_size_in_byte(), 1 * 3 * 224 * 224 * 4); | |||
| ASSERT_EQ(tensor3.get_tensor_total_size_in_byte(), 1 * 3 * 224 * 224 * 4); | |||
| ASSERT_TRUE(TensorHelper::implement(&tensor1) | |||
| ->cast_final_safe<TensorImplDft>() | |||
| .host_tensor()); | |||
| ASSERT_FALSE(TensorHelper::implement(&tensor1) | |||
| ->cast_final_safe<TensorImplDft>() | |||
| .dev_tensor()); | |||
| ASSERT_FALSE(TensorHelper::implement(&tensor1) | |||
| ->cast_final_safe<TensorImplDft>() | |||
| .dev_tensor()); | |||
| ASSERT_TRUE(TensorHelper::implement(&tensor1) | |||
| ->cast_final_safe<TensorImplDft>() | |||
| .host_tensor()); | |||
| } | |||
| TEST(TestTensor, SetLayoutReAlloc) { | |||
| Layout layout{{1, 3, 224, 224}, 4}; | |||
| Tensor tensor1; | |||
| Tensor tensor2(LiteDeviceType::LITE_CPU, layout); | |||
| Tensor tensor3(LiteDeviceType::LITE_CPU, layout); | |||
| auto old_ptr2 = tensor2.get_memory_ptr(); | |||
| auto old_ptr3 = tensor3.get_memory_ptr(); | |||
| //! setting a smaller layout should go through without reallocation | |||
| Layout layout1{{1, 3, 100, 100}, 4, LiteDataType::LITE_INT8}; | |||
| tensor1.set_layout(layout1); | |||
| tensor2.set_layout(layout1); | |||
| tensor3.set_layout(layout1); | |||
| ASSERT_EQ(tensor2.get_tensor_total_size_in_byte(), 1 * 3 * 100 * 100); | |||
| ASSERT_EQ(tensor3.get_tensor_total_size_in_byte(), 1 * 3 * 100 * 100); | |||
| auto layout2 = TensorHelper::implement(&tensor2) | |||
| ->cast_final_safe<TensorImplDft>() | |||
| .host_tensor() | |||
| ->layout(); | |||
| auto layout3 = TensorHelper::implement(&tensor3) | |||
| ->cast_final_safe<TensorImplDft>() | |||
| .host_tensor() | |||
| ->layout(); | |||
| ASSERT_EQ(to_lite_layout(layout2), layout1); | |||
| ASSERT_EQ(to_lite_layout(layout3), layout1); | |||
| auto new_ptr2 = tensor2.get_memory_ptr(); | |||
| auto new_ptr3 = tensor3.get_memory_ptr(); | |||
| ASSERT_EQ(old_ptr2, new_ptr2); | |||
| ASSERT_EQ(old_ptr3, new_ptr3); | |||
| } | |||
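| //! reset lets the caller supply external storage: the tensor adopts the new | |||
| //! pointer and layout without freeing the user-provided memory | |||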
| TEST(TestTensor, Reset) { | |||
| Layout layout{{3, 20}, 2, LiteDataType::LITE_FLOAT}; | |||
| Tensor tensor1; | |||
| Tensor tensor2(LiteDeviceType::LITE_CPU, layout); | |||
| Tensor tensor3(LiteDeviceType::LITE_CPU, layout); | |||
| auto old_ptr2 = tensor2.get_memory_ptr(); | |||
| auto old_ptr3 = tensor3.get_memory_ptr(); | |||
| //! make sure memory is allocated | |||
| ASSERT_NO_THROW(memcpy(old_ptr2, old_ptr3, 3 * 20 * 2)); | |||
| std::shared_ptr<float> new_ptr2(new float[3 * 20], | |||
| [](float* ptr) { delete[] ptr; }); | |||
| std::shared_ptr<float> new_ptr3(new float[3 * 20], | |||
| [](float* ptr) { delete[] ptr; }); | |||
| tensor1.reset(new_ptr2.get(), layout); | |||
| tensor2.reset(new_ptr2.get(), 3 * 20 * 4); | |||
| tensor3.reset(new_ptr3.get(), 3 * 20 * 4); | |||
| //! After reset the original mem is freed | |||
| /*ASSERT_EXIT((memcpy(old_ptr2, old_ptr3, 3 * 20 * 2), exit(0)), | |||
| ::testing::KilledBySignal(SIGSEGV), ".*");*/ | |||
| ASSERT_EQ(tensor2.get_memory_ptr(), new_ptr2.get()); | |||
| ASSERT_EQ(tensor3.get_memory_ptr(), new_ptr3.get()); | |||
| ASSERT_NO_THROW(memcpy(new_ptr2.get(), new_ptr3.get(), 3 * 20 * 2)); | |||
| Layout layout1{{6, 20}, 2, LiteDataType::LITE_FLOAT}; | |||
| std::shared_ptr<float> ptr2(new float[6 * 20], | |||
| [](float* ptr) { delete[] ptr; }); | |||
| std::shared_ptr<float> ptr3(new float[6 * 20], | |||
| [](float* ptr) { delete[] ptr; }); | |||
| tensor2.reset(ptr2.get(), layout1); | |||
| tensor3.reset(ptr3.get(), layout1); | |||
| //! memory is not freed by Tensor reset | |||
| ASSERT_NO_THROW(memcpy(new_ptr2.get(), new_ptr3.get(), 3 * 20 * 2)); | |||
| auto host_layout2 = TensorHelper::implement(&tensor2) | |||
| ->cast_final_safe<TensorImplDft>() | |||
| .host_tensor() | |||
| ->layout(); | |||
| auto host_layout3 = TensorHelper::implement(&tensor3) | |||
| ->cast_final_safe<TensorImplDft>() | |||
| .host_tensor() | |||
| ->layout(); | |||
| ASSERT_EQ(to_lite_layout(host_layout2), layout1); | |||
| ASSERT_EQ(to_lite_layout(host_layout3), layout1); | |||
| } | |||
| TEST(TestTensor, CrossCNCopy) { | |||
| Layout layout{{1, 3, 224, 224}, 4}; | |||
| Tensor tensor1(LiteDeviceType::LITE_CPU); | |||
| Tensor tensor2(LiteDeviceType::LITE_CPU, layout); | |||
| Tensor tensor3(LiteDeviceType::LITE_CPU, layout); | |||
| tensor2.copy_from(tensor3); | |||
| tensor3.copy_from(tensor2); | |||
| auto old_ptr2 = tensor2.get_memory_ptr(); | |||
| auto old_ptr3 = tensor3.get_memory_ptr(); | |||
| //! copying from an empty source tensor should throw | |||
| ASSERT_THROW(tensor2.copy_from(tensor1), std::exception); | |||
| tensor1.copy_from(tensor2); | |||
| tensor2.copy_from(tensor3); | |||
| tensor3.copy_from(tensor2); | |||
| ASSERT_EQ(tensor2.get_memory_ptr(), old_ptr2); | |||
| ASSERT_EQ(tensor3.get_memory_ptr(), old_ptr3); | |||
| } | |||
| TEST(TestTensor, SharedTensorMemory) { | |||
| Layout layout{{1, 3, 224, 224}, 4}; | |||
| Tensor tensor1(LiteDeviceType::LITE_CPU); | |||
| { | |||
| Tensor tensor2(LiteDeviceType::LITE_CPU, layout); | |||
| tensor1.share_memory_with(tensor2); | |||
| auto ptr1 = tensor1.get_memory_ptr(); | |||
| auto ptr2 = tensor2.get_memory_ptr(); | |||
| ASSERT_EQ(ptr1, ptr2); | |||
| } | |||
| // check that after tensor2 is destroyed, tensor1 can still access the memory | |||
| auto ptr1 = static_cast<float*>(tensor1.get_memory_ptr()); | |||
| size_t length = tensor1.get_tensor_total_size_in_byte() / | |||
| tensor1.get_layout().get_elem_size(); | |||
| for (size_t i = 0; i < length; i++) { | |||
| ptr1[i] = i; | |||
| } | |||
| } | |||
| TEST(TestTensor, Reshape) { | |||
| Layout layout{{1, 3, 224, 224}, 4}; | |||
| Tensor tensor2(LiteDeviceType::LITE_CPU, layout); | |||
| auto ptr = tensor2.get_memory_ptr(); | |||
| //! test invalid reshape shapes | |||
| ASSERT_THROW(tensor2.reshape({-1, -1, 3 * 224 * 224}), std::exception); | |||
| ASSERT_THROW(tensor2.reshape({-1, 3, 3 * 224 * 224}), std::exception); | |||
| ASSERT_THROW(tensor2.reshape({1, 3, 3 * 224 * 224}), std::exception); | |||
| ASSERT_THROW(tensor2.reshape({3, 3, 3 * 224 * 224}), std::exception); | |||
| tensor2.reshape({3 * 224 * 224}); | |||
| ASSERT_EQ(tensor2.get_layout().ndim, 1); | |||
| ASSERT_EQ(tensor2.get_layout().data_type, LiteDataType::LITE_FLOAT); | |||
| ASSERT_EQ(tensor2.get_layout().shapes[0], 3 * 224 * 224); | |||
| tensor2.reshape({-1, 224, 224}); | |||
| ASSERT_EQ(tensor2.get_layout().ndim, 3); | |||
| ASSERT_EQ(tensor2.get_layout().shapes[0], 3); | |||
| ASSERT_EQ(tensor2.get_layout().shapes[1], 224); | |||
| ASSERT_EQ(tensor2.get_memory_ptr(), ptr); | |||
| } | |||
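| //! slice returns a (possibly non-contiguous) view on the original data; | |||
| //! copying the view into a fresh tensor yields the strided elements in order | |||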
| TEST(TestTensor, Slice) { | |||
| Layout layout{{20, 20}, 2}; | |||
| Tensor tensor2(LiteDeviceType::LITE_CPU, layout); | |||
| auto ptr = tensor2.get_memory_ptr(); | |||
| //! test invalid slice arguments (mismatched start/end/step sizes) | |||
| ASSERT_THROW(tensor2.slice({5, 10, 10}, {10, 15}), std::exception); | |||
| ASSERT_THROW(tensor2.slice({5, 10}, {10, 15}, {5}), std::exception); | |||
| ASSERT_THROW(tensor2.slice({5, 10}, {10, 15, 10}), std::exception); | |||
| for (int i = 0; i < 20 * 20; i++) { | |||
| *(static_cast<float*>(ptr) + i) = i; | |||
| } | |||
| auto check = [&](size_t start, size_t end, size_t step) { | |||
| Tensor tensor3; | |||
| tensor3.copy_from( | |||
| *tensor2.slice({start, start}, {end, end}, {step, step})); | |||
| float* new_ptr = static_cast<float*>(tensor3.get_memory_ptr()); | |||
| for (size_t i = start; i < end; i += step) { | |||
| for (size_t j = start; j < end; j += step) { | |||
| ASSERT_EQ(float(i * 20 + j), *new_ptr); | |||
| ++new_ptr; | |||
| } | |||
| } | |||
| }; | |||
| check(5, 10, 1); | |||
| check(5, 11, 2); | |||
| check(2, 18, 4); | |||
| Tensor tensor3; | |||
| tensor3.copy_from(*tensor2.slice({3}, {9}, {2})); | |||
| float* new_ptr = static_cast<float*>(tensor3.get_memory_ptr()); | |||
| for (size_t i = 3; i < 9; i += 2) { | |||
| for (size_t j = 0; j < 20; j++) { | |||
| ASSERT_EQ(float(i * 20 + j), *new_ptr); | |||
| ++new_ptr; | |||
| } | |||
| } | |||
| } | |||
| TEST(TestTensor, SliceCopy) { | |||
| Layout layout{{20, 20}, 2}; | |||
| Tensor tensor(LiteDeviceType::LITE_CPU, layout); | |||
| //! alloc memory | |||
| auto ptr = static_cast<float*>(tensor.get_memory_ptr()); | |||
| Layout layout_slice{{20, 10}, 2}; | |||
| Tensor tensor0(LiteDeviceType::LITE_CPU, layout_slice); | |||
| auto ptr0 = tensor0.get_memory_ptr(); | |||
| for (int i = 0; i < 10 * 20; i++) { | |||
| *(static_cast<float*>(ptr0) + i) = i; | |||
| } | |||
| Tensor tensor1(LiteDeviceType::LITE_CPU, layout_slice); | |||
| auto ptr1 = tensor1.get_memory_ptr(); | |||
| for (int i = 0; i < 10 * 20; i++) { | |||
| *(static_cast<float*>(ptr1) + i) = i + 200; | |||
| } | |||
| auto slice0 = tensor.slice({0, 0}, {20, 10}); | |||
| auto slice1 = tensor.slice({0, 10}, {20, 20}); | |||
| slice0->copy_from(tensor0); | |||
| slice1->copy_from(tensor1); | |||
| ASSERT_FALSE(slice0->is_continue_memory()); | |||
| ASSERT_FALSE(slice1->is_continue_memory()); | |||
| for (size_t i = 0; i < 20; i++) { | |||
| for (size_t j = 0; j < 10; j++) { | |||
| ASSERT_EQ(float(i * 10 + j), *ptr); | |||
| ++ptr; | |||
| } | |||
| for (size_t j = 0; j < 10; j++) { | |||
| ASSERT_EQ(float(i * 10 + j + 200), *ptr); | |||
| ++ptr; | |||
| } | |||
| } | |||
| slice0->fill_zero(); | |||
| Tensor tmp; | |||
| tmp.copy_from(*slice0); | |||
| float* tmp_ptr = static_cast<float*>(tmp.get_memory_ptr()); | |||
| for (size_t i = 0; i < 20; i++) { | |||
| for (size_t j = 0; j < 10; j++) { | |||
| ASSERT_EQ(float(0), *tmp_ptr); | |||
| ++tmp_ptr; | |||
| } | |||
| } | |||
| } | |||
| TEST(TestTensor, GetPtrOffset) { | |||
| Layout layout{{20, 20}, 2}; | |||
| Tensor tensor(LiteDeviceType::LITE_CPU, layout); | |||
| //! alloc memory | |||
| auto ptr = static_cast<float*>(tensor.get_memory_ptr()); | |||
| auto ptr_offset = tensor.get_memory_ptr({10, 10}); | |||
| ASSERT_EQ(ptr_offset, ptr + 10 * 20 + 10); | |||
| auto slice0 = tensor.slice({0, 0}, {20, 10}); | |||
| auto slice1 = tensor.slice({0, 10}, {20, 20}); | |||
| ASSERT_FALSE(slice0->is_continue_memory()); | |||
| ASSERT_FALSE(slice1->is_continue_memory()); | |||
| auto ptr_offset_slice0 = slice0->get_memory_ptr({6, 5}); | |||
| auto ptr_offset_slice1 = slice1->get_memory_ptr({2, 5}); | |||
| ASSERT_EQ(ptr_offset_slice0, ptr + 6 * 20 + 5); | |||
| ASSERT_EQ(ptr_offset_slice1, ptr + 2 * 20 + 10 + 5); | |||
| } | |||
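| //! concat four {5, 5, 5} tensors along each dimension; the leading blocks of | |||
| //! the result must come from the corresponding source tensors | |||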
| TEST(TestTensor, Concat) { | |||
| Layout layout{{5, 5, 5}, 3}; | |||
| std::vector<Tensor> tensors; | |||
| for (int i = 0; i < 4; i++) { | |||
| Tensor tensor(LiteDeviceType::LITE_CPU, layout); | |||
| auto ptr = static_cast<float*>(tensor.get_memory_ptr()); | |||
| for (int n = 0; n < 5 * 5 * 5; n++) { | |||
| ptr[n] = i; | |||
| } | |||
| tensors.push_back(tensor); | |||
| } | |||
| auto check = [&](int dim) { | |||
| auto new_tensor = TensorUtils::concat(tensors, dim); | |||
| auto ptr = static_cast<float*>(new_tensor->get_memory_ptr()); | |||
| size_t stride = std::pow(5, (3 - dim)); | |||
| for (int i = 0; i < 4; i++) { | |||
| for (size_t j = 0; j < stride; j++) { | |||
| ASSERT_EQ(ptr[i * stride + j], i); | |||
| } | |||
| } | |||
| }; | |||
| check(0); | |||
| check(1); | |||
| check(2); | |||
| } | |||
| #if LITE_WITH_CUDA | |||
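| //! the tests below repeat the basic tensor checks with CUDA device memory, | |||
| //! including pinned-host buffers and cross-device copies | |||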
| TEST(TestTensor, BasicDevice) { | |||
| Layout layout{{1, 3, 224, 224}, 4}; | |||
| Tensor tensor1(LiteDeviceType::LITE_CUDA, layout); | |||
| Tensor tensor2(LiteDeviceType::LITE_CPU, layout); | |||
| //! the underlying mge tensor has been created | |||
| ASSERT_TRUE(TensorHelper::implement(&tensor1)); | |||
| ASSERT_TRUE(TensorHelper::implement(&tensor2)); | |||
| //! check member | |||
| ASSERT_EQ(tensor1.get_device_type(), LiteDeviceType::LITE_CUDA); | |||
| ASSERT_EQ(tensor2.get_device_type(), LiteDeviceType::LITE_CPU); | |||
| ASSERT_EQ(tensor2.get_layout(), layout); | |||
| //! check the real tensor | |||
| ASSERT_EQ(tensor1.get_tensor_total_size_in_byte(), 1 * 3 * 224 * 224 * 4); | |||
| ASSERT_EQ(tensor2.get_tensor_total_size_in_byte(), 1 * 3 * 224 * 224 * 4); | |||
| ASSERT_TRUE(TensorHelper::implement(&tensor2) | |||
| ->cast_final_safe<TensorImplDft>() | |||
| .host_tensor()); | |||
| ASSERT_FALSE(TensorHelper::implement(&tensor2) | |||
| ->cast_final_safe<TensorImplDft>() | |||
| .dev_tensor()); | |||
| ASSERT_TRUE(TensorHelper::implement(&tensor1) | |||
| ->cast_final_safe<TensorImplDft>() | |||
| .dev_tensor()); | |||
| ASSERT_FALSE(TensorHelper::implement(&tensor1) | |||
| ->cast_final_safe<TensorImplDft>() | |||
| .host_tensor()); | |||
| } | |||
| TEST(TestTensor, SetLayoutReAllocDevice) { | |||
| Layout layout{{1, 3, 224, 224}, 4}; | |||
| Tensor tensor2(LiteDeviceType::LITE_CUDA, layout); | |||
| auto old_ptr2 = tensor2.get_memory_ptr(); | |||
| //! set a smaller layout; the previously allocated buffer should be reused | |||
| Layout layout1{{1, 3, 100, 100}, 4, LiteDataType::LITE_INT8}; | |||
| tensor2.set_layout(layout1); | |||
| ASSERT_EQ(tensor2.get_tensor_total_size_in_byte(), 1 * 3 * 100 * 100); | |||
| auto layout2 = TensorHelper::implement(&tensor2) | |||
| ->cast_final_safe<TensorImplDft>() | |||
| .dev_tensor() | |||
| ->layout(); | |||
| ASSERT_EQ(to_lite_layout(layout2), layout1); | |||
| auto new_ptr2 = tensor2.get_memory_ptr(); | |||
| ASSERT_EQ(old_ptr2, new_ptr2); | |||
| } | |||
| TEST(TestTensor, CrossCNCopyDevice) { | |||
| Layout layout{{1, 3, 224, 224}, 4}; | |||
| Tensor tensor0; | |||
| Tensor tensor1(LiteDeviceType::LITE_CPU); | |||
| Tensor tensor2(LiteDeviceType::LITE_CPU, layout); | |||
| Tensor tensor3(LiteDeviceType::LITE_CUDA, layout); | |||
| tensor2.copy_from(tensor3); | |||
| tensor3.copy_from(tensor2); | |||
| auto old_ptr2 = tensor2.get_memory_ptr(); | |||
| auto old_ptr3 = tensor3.get_memory_ptr(); | |||
| ASSERT_THROW(tensor3.copy_from(tensor1), std::exception); | |||
| tensor1.copy_from(tensor3); | |||
| tensor0.copy_from(tensor3); | |||
| tensor2.copy_from(tensor3); | |||
| tensor3.copy_from(tensor2); | |||
| ASSERT_EQ(tensor2.get_memory_ptr(), old_ptr2); | |||
| ASSERT_EQ(tensor3.get_memory_ptr(), old_ptr3); | |||
| } | |||
| TEST(TestTensor, PinnedHostMem) { | |||
| Layout layout{{1, 3, 224, 224}, 4}; | |||
| Tensor tensor1(LiteDeviceType::LITE_CPU); | |||
| bool is_pinned_host = true; | |||
| Tensor tensor2(LiteDeviceType::LITE_CUDA, layout, is_pinned_host); | |||
| Tensor tensor3(LiteDeviceType::LITE_CUDA, layout); | |||
| tensor2.copy_from(tensor3); | |||
| tensor3.copy_from(tensor2); | |||
| ASSERT_EQ(tensor2.is_pinned_host(), true); | |||
| ASSERT_EQ(tensor3.is_pinned_host(), false); | |||
| auto old_ptr2 = tensor2.get_memory_ptr(); | |||
| auto old_ptr3 = tensor3.get_memory_ptr(); | |||
| //! test that copying from an empty source tensor throws | |||
| ASSERT_THROW(tensor2.copy_from(tensor1), std::exception); | |||
| tensor1.copy_from(tensor2); | |||
| tensor2.copy_from(tensor3); | |||
| tensor3.copy_from(tensor2); | |||
| ASSERT_EQ(tensor2.get_memory_ptr(), old_ptr2); | |||
| ASSERT_EQ(tensor3.get_memory_ptr(), old_ptr3); | |||
| } | |||
| TEST(TestTensor, DeviceId) { | |||
| if (get_device_count(LITE_CUDA) <= 1) | |||
| return; | |||
| Layout layout{{1, 3, 224, 224}, 4}; | |||
| Tensor tensor2(0, LiteDeviceType::LITE_CUDA, layout); | |||
| Tensor tensor3(1, LiteDeviceType::LITE_CUDA, layout); | |||
| tensor2.copy_from(tensor3); | |||
| tensor3.copy_from(tensor2); | |||
| Tensor tensor1; | |||
| tensor1.copy_from(tensor2); | |||
| tensor1.copy_from(tensor3); | |||
| } | |||
| TEST(TestTensor, SliceDevice) { | |||
| Layout layout{{20, 20}, 2}; | |||
| Tensor host_tensor0; | |||
| Tensor dev_tensor0(LiteDeviceType::LITE_CUDA, layout); | |||
| host_tensor0.copy_from(dev_tensor0); | |||
| auto ptr = host_tensor0.get_memory_ptr(); | |||
| for (int i = 0; i < 20 * 20; i++) { | |||
| *(static_cast<float*>(ptr) + i) = i; | |||
| } | |||
| dev_tensor0.copy_from(host_tensor0); | |||
| auto check = [&](size_t start, size_t end, size_t step) { | |||
| Tensor host_tensor; | |||
| host_tensor.copy_from( | |||
| *dev_tensor0.slice({start, start}, {end, end}, {step, step})); | |||
| float* new_ptr = static_cast<float*>(host_tensor.get_memory_ptr()); | |||
| for (size_t i = start; i < end; i += step) { | |||
| for (size_t j = start; j < end; j += step) { | |||
| ASSERT_EQ(float(i * 20 + j), *new_ptr); | |||
| ++new_ptr; | |||
| } | |||
| } | |||
| }; | |||
| check(5, 10, 1); | |||
| check(5, 11, 2); | |||
| check(2, 18, 4); | |||
| } | |||
| TEST(TestTensor, MemSetDevice) { | |||
| Layout layout{{20, 20}, 2, LiteDataType::LITE_INT8}; | |||
| Tensor host_tensor0(LiteDeviceType::LITE_CPU, layout); | |||
| Tensor dev_tensor0(LiteDeviceType::LITE_CUDA, layout); | |||
| auto check = [&](uint8_t val, const Tensor& tensor) { | |||
| auto ptr = static_cast<uint8_t*>(tensor.get_memory_ptr()); | |||
| for (int i = 0; i < 20 * 20; i++) { | |||
| ASSERT_EQ(val, *(ptr + i)); | |||
| } | |||
| }; | |||
| host_tensor0.fill_zero(); | |||
| check(0, host_tensor0); | |||
| Tensor host_tensor1; | |||
| dev_tensor0.fill_zero(); | |||
| host_tensor1.copy_from(dev_tensor0); | |||
| check(0, host_tensor1); | |||
| } | |||
| TEST(TestTensor, DeviceSliceCopy) { | |||
| Layout layout{{20, 20}, 2}; | |||
| Tensor tensor(LiteDeviceType::LITE_CUDA, layout); | |||
| //! alloc memory | |||
| tensor.get_memory_ptr(); | |||
| Layout layout_slice{{20, 10}, 2}; | |||
| Tensor tensor0(LiteDeviceType::LITE_CPU, layout_slice); | |||
| auto ptr0 = tensor0.get_memory_ptr(); | |||
| for (int i = 0; i < 10 * 20; i++) { | |||
| *(static_cast<float*>(ptr0) + i) = i; | |||
| } | |||
| Tensor tensor1(LiteDeviceType::LITE_CPU, layout_slice); | |||
| auto ptr1 = tensor1.get_memory_ptr(); | |||
| for (int i = 0; i < 10 * 20; i++) { | |||
| *(static_cast<float*>(ptr1) + i) = i + 200; | |||
| } | |||
| auto slice0 = tensor.slice({0, 0}, {20, 10}); | |||
| auto slice1 = tensor.slice({0, 10}, {20, 20}); | |||
| slice0->copy_from(tensor0); | |||
| slice1->copy_from(tensor1); | |||
| ASSERT_FALSE(slice0->is_continue_memory()); | |||
| ASSERT_FALSE(slice1->is_continue_memory()); | |||
| Tensor host_tensor; | |||
| host_tensor.copy_from(tensor); | |||
| auto ptr = static_cast<float*>(host_tensor.get_memory_ptr()); | |||
| for (size_t i = 0; i < 20; i++) { | |||
| for (size_t j = 0; j < 10; j++) { | |||
| ASSERT_EQ(float(i * 10 + j), *ptr); | |||
| ++ptr; | |||
| } | |||
| for (size_t j = 0; j < 10; j++) { | |||
| ASSERT_EQ(float(i * 10 + j + 200), *ptr); | |||
| ++ptr; | |||
| } | |||
| } | |||
| slice0->fill_zero(); | |||
| Tensor tmp; | |||
| tmp.copy_from(*slice0); | |||
| float* tmp_ptr = static_cast<float*>(tmp.get_memory_ptr()); | |||
| for (size_t i = 0; i < 20; i++) { | |||
| for (size_t j = 0; j < 10; j++) { | |||
| ASSERT_EQ(float(0), *tmp_ptr); | |||
| ++tmp_ptr; | |||
| } | |||
| } | |||
| } | |||
| TEST(TestTensor, ConcatDevice) { | |||
| Layout layout{{5, 5, 5}, 3}; | |||
| std::vector<Tensor> tensors; | |||
| for (int i = 0; i < 4; i++) { | |||
| Tensor tensor(LiteDeviceType::LITE_CPU, layout); | |||
| auto ptr = static_cast<float*>(tensor.get_memory_ptr()); | |||
| for (int n = 0; n < 5 * 5 * 5; n++) { | |||
| ptr[n] = i; | |||
| } | |||
| tensors.push_back(tensor); | |||
| } | |||
| auto check = [&](int dim) { | |||
| auto new_tensor = | |||
| TensorUtils::concat(tensors, dim, LiteDeviceType::LITE_CUDA, 0); | |||
| Tensor tensor(LiteDeviceType::LITE_CPU); | |||
| tensor.copy_from(*new_tensor); | |||
| auto ptr = static_cast<float*>(tensor.get_memory_ptr()); | |||
| size_t stride = std::pow(5, (3 - dim)); | |||
| for (int i = 0; i < 4; i++) { | |||
| for (size_t j = 0; j < stride; j++) { | |||
| ASSERT_EQ(ptr[i * stride + j], i); | |||
| } | |||
| } | |||
| ASSERT_EQ(new_tensor->get_device_type(), LiteDeviceType::LITE_CUDA); | |||
| ASSERT_EQ(new_tensor->get_device_id(), 0); | |||
| }; | |||
| check(0); | |||
| check(1); | |||
| check(2); | |||
| } | |||
| #endif | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,316 @@ | |||
| /** | |||
| * \file test/test_tensor_c.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "lite_build_config.h" | |||
| #if LITE_BUILD_WITH_MGE | |||
| #include "../src/misc.h" | |||
| #include "lite-c/global_c.h" | |||
| #include "lite-c/tensor_c.h" | |||
| #include <gtest/gtest.h> | |||
| #include <memory> | |||
| TEST(TestCapiTensor, Basic) { | |||
| LiteTensor c_tensor0, c_tensor1; | |||
| LiteTensorDesc description = default_desc; | |||
| LITE_make_tensor(description, &c_tensor0); | |||
| int is_pinned_host = false; | |||
| LITE_is_pinned_host(c_tensor0, &is_pinned_host); | |||
| ASSERT_FALSE(is_pinned_host); | |||
| LiteDeviceType device_type; | |||
| LITE_get_tensor_device_type(c_tensor0, &device_type); | |||
| ASSERT_EQ(device_type, LiteDeviceType::LITE_CPU); | |||
| size_t length = 0; | |||
| LITE_get_tensor_total_size_in_byte(c_tensor0, &length); | |||
| ASSERT_EQ(length, 0); | |||
| LiteLayout layout{{1, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT}; | |||
| description.device_type = LiteDeviceType::LITE_CPU; | |||
| description.layout = layout; | |||
| description.is_pinned_host = true; | |||
| LITE_make_tensor(description, &c_tensor1); | |||
| LITE_is_pinned_host(c_tensor1, &is_pinned_host); | |||
| ASSERT_TRUE(is_pinned_host); | |||
| LITE_get_tensor_total_size_in_byte(c_tensor1, &length); | |||
| ASSERT_EQ(length, 1 * 3 * 224 * 224 * 4); | |||
| LiteLayout get_layout; | |||
| LITE_get_tensor_layout(c_tensor1, &get_layout); | |||
| ASSERT_EQ(get_layout.ndim, layout.ndim); | |||
| ASSERT_EQ(get_layout.data_type, layout.data_type); | |||
| ASSERT_EQ(get_layout.shapes[0], layout.shapes[0]); | |||
| ASSERT_EQ(get_layout.shapes[1], layout.shapes[1]); | |||
| ASSERT_EQ(get_layout.shapes[2], layout.shapes[2]); | |||
| ASSERT_EQ(get_layout.shapes[3], layout.shapes[3]); | |||
| //! test error | |||
| ASSERT_EQ(LITE_is_pinned_host(c_tensor0, nullptr), -1); | |||
| ASSERT_NE(strlen(LITE_get_last_error()), 0); | |||
| printf("The last error is: %s\n", LITE_get_last_error()); | |||
| LITE_destroy_tensor(c_tensor0); | |||
| LITE_destroy_tensor(c_tensor1); | |||
| } | |||
| TEST(TestCapiTensor, SetLayoutReAlloc) { | |||
| LiteTensor c_tensor0; | |||
| LiteTensorDesc description = default_desc; | |||
| description.layout = | |||
| LiteLayout{{1, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT}; | |||
| LITE_make_tensor(description, &c_tensor0); | |||
| void *old_ptr, *new_ptr; | |||
| LITE_get_tensor_memory(c_tensor0, &old_ptr); | |||
| LiteLayout new_layout = | |||
| LiteLayout{{1, 3, 100, 100}, 4, LiteDataType::LITE_INT8}; | |||
| LITE_set_tensor_layout(c_tensor0, new_layout); | |||
| LITE_get_tensor_memory(c_tensor0, &new_ptr); | |||
| size_t length = 0; | |||
| LITE_get_tensor_total_size_in_byte(c_tensor0, &length); | |||
| ASSERT_EQ(length, 1 * 3 * 100 * 100); | |||
| ASSERT_EQ(old_ptr, new_ptr); | |||
| } | |||
| TEST(TestCapiTensor, Reset) { | |||
| LiteTensor c_tensor0, c_tensor1; | |||
| LiteTensorDesc description = default_desc; | |||
| description.layout = LiteLayout{{3, 20}, 2, LiteDataType::LITE_FLOAT}; | |||
| LITE_make_tensor(description, &c_tensor0); | |||
| LITE_make_tensor(description, &c_tensor1); | |||
| void *old_ptr0, *old_ptr1; | |||
| LITE_get_tensor_memory(c_tensor0, &old_ptr0); | |||
| LITE_get_tensor_memory(c_tensor1, &old_ptr1); | |||
| //! make sure memory is allocated | |||
| ASSERT_NO_THROW(memcpy(old_ptr0, old_ptr1, 3 * 20 * 4)); | |||
| std::shared_ptr<float> new_ptr0(new float[3 * 20], | |||
| [](float* ptr) { delete[] ptr; }); | |||
| std::shared_ptr<float> new_ptr1(new float[3 * 20], | |||
| [](float* ptr) { delete[] ptr; }); | |||
| LITE_reset_tensor_memory(c_tensor0, new_ptr0.get(), 3 * 20 * 4); | |||
| LITE_reset_tensor_memory(c_tensor1, new_ptr1.get(), 3 * 20 * 4); | |||
| void *tmp_ptr0, *tmp_ptr1; | |||
| LITE_get_tensor_memory(c_tensor0, &tmp_ptr0); | |||
| LITE_get_tensor_memory(c_tensor1, &tmp_ptr1); | |||
| ASSERT_EQ(tmp_ptr0, new_ptr0.get()); | |||
| ASSERT_EQ(tmp_ptr1, new_ptr1.get()); | |||
| ASSERT_NO_THROW(memcpy(new_ptr0.get(), new_ptr1.get(), 3 * 20 * 4)); | |||
| LiteLayout layout1{{6, 20}, 2, LiteDataType::LITE_FLOAT}; | |||
| std::shared_ptr<float> ptr2(new float[6 * 20], | |||
| [](float* ptr) { delete[] ptr; }); | |||
| std::shared_ptr<float> ptr3(new float[6 * 20], | |||
| [](float* ptr) { delete[] ptr; }); | |||
| LITE_reset_tensor(c_tensor0, layout1, new_ptr0.get()); | |||
| LITE_reset_tensor(c_tensor1, layout1, new_ptr1.get()); | |||
| //! memory is not freed by Tensor reset | |||
| ASSERT_NO_THROW(memcpy(new_ptr0.get(), new_ptr1.get(), 3 * 20 * 4)); | |||
| LiteLayout tmp_layout0, tmp_layout1; | |||
| LITE_get_tensor_layout(c_tensor0, &tmp_layout0); | |||
| LITE_get_tensor_layout(c_tensor1, &tmp_layout1); | |||
| ASSERT_EQ(tmp_layout0.ndim, tmp_layout1.ndim); | |||
| ASSERT_EQ(tmp_layout0.data_type, tmp_layout1.data_type); | |||
| ASSERT_EQ(tmp_layout0.shapes[0], tmp_layout1.shapes[0]); | |||
| ASSERT_EQ(tmp_layout0.shapes[1], tmp_layout1.shapes[1]); | |||
| LITE_destroy_tensor(c_tensor0); | |||
| LITE_destroy_tensor(c_tensor1); | |||
| } | |||
| TEST(TestCapiTensor, CrossCNCopy) { | |||
| LiteTensor c_tensor0, c_tensor1, c_tensor2; | |||
| LiteTensorDesc description = default_desc; | |||
| LITE_make_tensor(description, &c_tensor0); | |||
| description.layout = | |||
| LiteLayout{{1, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT}; | |||
| LITE_make_tensor(description, &c_tensor1); | |||
| LITE_make_tensor(description, &c_tensor2); | |||
| LITE_tensor_copy(c_tensor1, c_tensor2); | |||
| LITE_tensor_copy(c_tensor2, c_tensor1); | |||
| void *old_ptr1, *old_ptr2, *new_ptr1, *new_ptr2; | |||
| LITE_get_tensor_memory(c_tensor1, &old_ptr1); | |||
| LITE_get_tensor_memory(c_tensor2, &old_ptr2); | |||
| //! test that copying from an empty source tensor throws | |||
| ASSERT_EQ(LITE_tensor_copy(c_tensor1, c_tensor0), -1); | |||
| ASSERT_NE(strlen(LITE_get_last_error()), 0); | |||
| printf("The last error is: %s\n", LITE_get_last_error()); | |||
| LITE_tensor_copy(c_tensor0, c_tensor1); | |||
| LITE_tensor_copy(c_tensor1, c_tensor2); | |||
| LITE_tensor_copy(c_tensor2, c_tensor0); | |||
| LITE_get_tensor_memory(c_tensor1, &new_ptr1); | |||
| LITE_get_tensor_memory(c_tensor2, &new_ptr2); | |||
| ASSERT_EQ(old_ptr1, new_ptr1); | |||
| ASSERT_EQ(old_ptr2, new_ptr2); | |||
| LITE_destroy_tensor(c_tensor0); | |||
| LITE_destroy_tensor(c_tensor1); | |||
| LITE_destroy_tensor(c_tensor2); | |||
| } | |||
| TEST(TestCapiTensor, ShareMemoryWith) { | |||
| LiteTensor c_tensor0, c_tensor1; | |||
| LiteTensorDesc description = default_desc; | |||
| LITE_make_tensor(description, &c_tensor0); | |||
| description.layout = | |||
| LiteLayout{{1, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT}; | |||
| LITE_make_tensor(description, &c_tensor1); | |||
| ASSERT_EQ(LITE_tensor_share_memory_with(c_tensor1, c_tensor0), -1); | |||
| LITE_tensor_share_memory_with(c_tensor0, c_tensor1); | |||
| void *ptr0, *ptr1; | |||
| LITE_get_tensor_memory(c_tensor0, &ptr0); | |||
| LITE_get_tensor_memory(c_tensor1, &ptr1); | |||
| ASSERT_EQ(ptr0, ptr1); | |||
| LITE_destroy_tensor(c_tensor0); | |||
| LITE_destroy_tensor(c_tensor1); | |||
| } | |||
| TEST(TestCapiTensor, Reshape) { | |||
| LiteTensor c_tensor0; | |||
| LiteTensorDesc description = default_desc; | |||
| description.layout = | |||
| LiteLayout{{8, 8, 100, 100}, 4, LiteDataType::LITE_FLOAT}; | |||
| LITE_make_tensor(description, &c_tensor0); | |||
| void* old_ptr; | |||
| LITE_get_tensor_memory(c_tensor0, &old_ptr); | |||
| auto check = [&](std::vector<size_t> expect, const LiteTensor& tensor) { | |||
| LiteLayout get_layout; | |||
| LITE_get_tensor_layout(tensor, &get_layout); | |||
| ASSERT_EQ(get_layout.ndim, expect.size()); | |||
| for (size_t i = 0; i < expect.size(); i++) { | |||
| ASSERT_EQ(get_layout.shapes[i], expect[i]); | |||
| } | |||
| void* new_ptr; | |||
| LITE_get_tensor_memory(tensor, &new_ptr); | |||
| ASSERT_EQ(old_ptr, new_ptr); | |||
| }; | |||
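| //! a -1 entry in the requested shape is assumed to mean "infer this dimension | |||
| //! from the total element count", which the expected shapes below rely on | |||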
| { | |||
| int shape[2] = {-1, 50}; | |||
| LITE_tensor_reshape(c_tensor0, shape, 2); | |||
| check({8 * 8 * 100 * 2, 50}, c_tensor0); | |||
| } | |||
| { | |||
| int shape[3] = {64, 100, 100}; | |||
| LITE_tensor_reshape(c_tensor0, shape, 3); | |||
| check({8 * 8, 100, 100}, c_tensor0); | |||
| } | |||
| { | |||
| int shape[3] = {16, 100, -1}; | |||
| LITE_tensor_reshape(c_tensor0, shape, 3); | |||
| check({16, 100, 400}, c_tensor0); | |||
| } | |||
| LITE_destroy_tensor(c_tensor0); | |||
| } | |||
| TEST(TestCapiTensor, Slice) { | |||
| LiteTensor c_tensor0; | |||
| LiteTensorDesc description = default_desc; | |||
| description.layout = LiteLayout{{20, 20}, 2, LiteDataType::LITE_FLOAT}; | |||
| LITE_make_tensor(description, &c_tensor0); | |||
| void* old_ptr; | |||
| LITE_get_tensor_memory(c_tensor0, &old_ptr); | |||
| for (size_t i = 0; i < 20 * 20; i++) { | |||
| *(static_cast<float*>(old_ptr) + i) = i; | |||
| } | |||
| auto check = [&](size_t start, size_t end, size_t step, bool have_step) { | |||
| LiteTensor tensor, slice_tensor; | |||
| LITE_make_tensor(default_desc, &tensor); | |||
| size_t start_ptr[2] = {start, start}; | |||
| size_t end_ptr[2] = {end, end}; | |||
| size_t step_ptr[2] = {step, step}; | |||
| if (have_step) { | |||
| LITE_tensor_slice(c_tensor0, start_ptr, end_ptr, step_ptr, 2, | |||
| &slice_tensor); | |||
| } else { | |||
| LITE_tensor_slice(c_tensor0, start_ptr, end_ptr, nullptr, 2, | |||
| &slice_tensor); | |||
| } | |||
| int is_continue = true; | |||
| LITE_is_memory_continue(slice_tensor, &is_continue); | |||
| ASSERT_FALSE(is_continue); | |||
| LITE_tensor_copy(tensor, slice_tensor); | |||
| void* new_ptr; | |||
| LITE_get_tensor_memory(tensor, &new_ptr); | |||
| float* ptr = static_cast<float*>(new_ptr); | |||
| for (size_t i = start; i < end; i += step) { | |||
| for (size_t j = start; j < end; j += step) { | |||
| ASSERT_EQ(float(i * 20 + j), *ptr); | |||
| ++ptr; | |||
| } | |||
| } | |||
| LITE_destroy_tensor(tensor); | |||
| }; | |||
| check(1, 8, 1, true); | |||
| check(1, 8, 1, false); | |||
| check(2, 10, 2, true); | |||
| check(10, 18, 4, true); | |||
| check(10, 18, 1, false); | |||
| LITE_destroy_tensor(c_tensor0); | |||
| } | |||
| TEST(TestCapiTensor, Memset) { | |||
| LiteTensor c_tensor0; | |||
| LiteTensorDesc description = default_desc; | |||
| description.layout = LiteLayout{{20, 20}, 2, LiteDataType::LITE_FLOAT}; | |||
| LITE_make_tensor(description, &c_tensor0); | |||
| void* ptr; | |||
| uint8_t* uint8_ptr; | |||
| LITE_get_tensor_memory(c_tensor0, &ptr); | |||
| LITE_tensor_fill_zero(c_tensor0); | |||
| uint8_ptr = static_cast<uint8_t*>(ptr); | |||
| for (size_t i = 0; i < 20 * 20; i++) { | |||
| ASSERT_EQ(0, *uint8_ptr); | |||
| uint8_ptr++; | |||
| } | |||
| LITE_destroy_tensor(c_tensor0); | |||
| } | |||
| TEST(TestCapiTensor, GetMemoryByIndex) { | |||
| LiteTensor c_tensor0; | |||
| LiteTensorDesc description = default_desc; | |||
| description.layout = LiteLayout{{20, 20}, 2, LiteDataType::LITE_FLOAT}; | |||
| LITE_make_tensor(description, &c_tensor0); | |||
| void *ptr0, *ptr1, *ptr2, *ptr3; | |||
| LITE_get_tensor_memory(c_tensor0, &ptr0); | |||
| size_t index0[] = {3, 4}; | |||
| LITE_get_tensor_memory_with_index(c_tensor0, &index0[0], 2, &ptr1); | |||
| size_t index1[] = {5, 7}; | |||
| LITE_get_tensor_memory_with_index(c_tensor0, &index1[0], 2, &ptr2); | |||
| size_t index2[] = {5}; | |||
| LITE_get_tensor_memory_with_index(c_tensor0, &index2[0], 1, &ptr3); | |||
| ASSERT_EQ(ptr1, static_cast<float*>(ptr0) + 3 * 20 + 4); | |||
| ASSERT_EQ(ptr2, static_cast<float*>(ptr0) + 5 * 20 + 7); | |||
| ASSERT_EQ(ptr3, static_cast<float*>(ptr0) + 5 * 20); | |||
| LITE_destroy_tensor(c_tensor0); | |||
| } | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,26 @@ | |||
| #! /bin/bash -e | |||
| set -e | |||
| if [ $# -lt 2 ] ; then | |||
| echo "USAGE: $0 src dst" | |||
| echo " e.g.: $0 ~/xxx.mdl ~/xxx.encrypted.mdl" | |||
| echo " e.g.: $0 ~/xxx.mdl ~/xxx.encrypted.mdl key" | |||
| exit 1; | |||
| fi | |||
| IV=`openssl rand -hex 16` | |||
| Key=000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F | |||
| if [ $# == 3 ] ; then | |||
| Key=$3 | |||
| fi | |||
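| # Output layout produced by the commands below (assumed convention expected by | |||
| # the lite AES decryptor): [16-byte raw IV][AES-256-CBC ciphertext][8-byte size] | |||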
| # get file size | |||
| size=`wc -c $1` | |||
| echo "encrypt aes-256-cbc ..." | |||
| openssl enc -e -aes-256-cbc -in $1 -out $1.tmp -K $Key -iv $IV | |||
| echo $IV | xxd -r -p | cat - $1.tmp > $2 | |||
| # write size into file | |||
| printf "%016x" ${size%\ *} | xxd -r -p >> $2 | |||
| rm -f $1.tmp | |||
| @@ -0,0 +1,134 @@ | |||
| #!/usr/bin/env mdl | |||
| # -*- coding: utf-8 -*- | |||
| # MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| # | |||
| # Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, | |||
| # software distributed under the License is distributed on an | |||
| # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| from megskull.graph import NodeFilter, FpropEnv | |||
| from megskull.opr.all import AssertEqual, DataProvider, BatchNormalization | |||
| from megskull.utils.logconf import get_logger | |||
| from meghair.utils import io | |||
| import megbrain as mgb | |||
| import argparse | |||
| import struct | |||
| import re | |||
| import os | |||
| import numpy as np | |||
| import cv2 | |||
| logger = get_logger(__name__) | |||
| def optimize_for_inference(args, outputs): | |||
| args_map = { | |||
| 'enable_io16xc32': 'f16_io_f32_comp', | |||
| 'enable_ioc16': 'f16_io_comp', | |||
| 'enable_hwcd4': 'use_nhwcd4', | |||
| 'enable_nchw4': 'use_nchw4', | |||
| 'enable_nchw88': 'use_nchw88', | |||
| 'enable_nchw44': 'use_nchw44', | |||
| 'enable_nchw44_dot': 'use_nchw44_dot', | |||
| 'enable_nchw32': 'use_nchw32', | |||
| 'enable_chwn4': 'use_chwn4', | |||
| 'enable_fuse_conv_bias_nonlinearity': 'fuse_conv_bias_nonlinearity', | |||
| 'enable_fuse_conv_bias_with_z': 'fuse_conv_bias_with_z', | |||
| } | |||
| kwargs = {} | |||
| for k, v in args_map.items(): | |||
| if getattr(args, k): | |||
| assert args.optimize_for_inference, ( | |||
| 'optimize_for_inference should be set when {} is given'.format( | |||
| k)) | |||
| kwargs[v] = True | |||
| if args.optimize_for_inference: | |||
| return mgb.optimize_for_inference(outputs, **kwargs) | |||
| return outputs | |||
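| # A typical invocation might look like the following (the script and file names | |||
| # here are illustrative only): | |||
| #   ./dump_model.py net.pkl -o net.mgb --optimize-for-inference --enable-io16xc32 | |||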
| def main(): | |||
| parser = argparse.ArgumentParser( | |||
| description='Dump the Python MegBrain model to a C++ loadable model, ' | |||
| 'optimizing it for inference along the way', | |||
| formatter_class=argparse.ArgumentDefaultsHelpFormatter | |||
| ) | |||
| parser.add_argument('input', help='input pkl model file ') | |||
| parser.add_argument('-o', '--output', help='output file', required=True) | |||
| parser.add_argument('--init-bn', action='store_true', | |||
| help='initialize untrained batch-normalization, to ' | |||
| 'avoid NaN or Inf results') | |||
| parser.add_argument('--silent', action='store_true', | |||
| help='set verbose to False in AssertEqual opr') | |||
| parser.add_argument('--optimize-for-inference', action='store_true', | |||
| help='enable optimization for inference') | |||
| parser.add_argument('--discard-var-name', action='store_true', | |||
| help='discard variable and param names in the ' | |||
| 'generated output') | |||
| parser.add_argument('--output-strip-info', action='store_true', | |||
| help='output code strip information') | |||
| parser.add_argument('--enable-io16xc32', action='store_true', | |||
| help='transform the model to float16 io float32 compute') | |||
| parser.add_argument('--enable-ioc16', action='store_true', | |||
| help='transform the dtype of the model to float16 io ' | |||
| 'and compute') | |||
| parser.add_argument('--enable-fuse-conv-bias-nonlinearity', | |||
| action='store_true', | |||
| help='fuse convolution, bias and nonlinearity oprs into a ' | |||
| 'single conv_bias opr') | |||
| parser.add_argument('--enable-hwcd4', action='store_true', | |||
| help='transform the model format from NCHW to NHWCD4 ' | |||
| 'for inference; you may need to disable CUDA and set ' | |||
| 'MGB_USE_MEGDNN_DBG=2') | |||
| parser.add_argument('--enable-nchw4', action='store_true', | |||
| help='transform the model format from NCHW to NCHW4 ' | |||
| 'for inference') | |||
| parser.add_argument('--enable-nchw88', action='store_true', | |||
| help='transform the model format from NCHW to NCHW88 ' | |||
| 'for inference') | |||
| parser.add_argument('--enable-nchw44', action='store_true', | |||
| help='transform the model format from NCHW to NCHW44 ' | |||
| 'for inference') | |||
| parser.add_argument('--enable-nchw44-dot', action='store_true', | |||
| help='transform the model format from NCHW to NCHW44_DOT ' | |||
| 'for optimizing armv8.2 dot in inference') | |||
| parser.add_argument('--enable-chwn4', action='store_true', | |||
| help='transform the model format to CHWN4 ' | |||
| 'for inference, mainly used for nvidia tensorcore') | |||
| parser.add_argument('--enable-nchw32', action='store_true', | |||
| help='transform the model format from NCHW4 to NCHW32 ' | |||
| 'for inference on nvidia TensorCore') | |||
| parser.add_argument('--enable-fuse-conv-bias-with-z', action='store_true', | |||
| help='fuse conv_bias with z input for inference on ' | |||
| 'nvidia GPU (this optimization pass will result in mismatch ' | |||
| 'of the precision of output of training and inference)') | |||
| args = parser.parse_args() | |||
| env = FpropEnv(verbose_fprop=False) | |||
| outputs = io.load_network(args.input).outputs | |||
| output_mgbvars = list(map(env.get_mgbvar, outputs)) | |||
| output_mgbvars = optimize_for_inference(args, output_mgbvars) | |||
| if args.discard_var_name: | |||
| sereg_kwargs = dict(keep_var_name=0, keep_param_name=False) | |||
| else: | |||
| sereg_kwargs = dict(keep_var_name=2, keep_param_name=True) | |||
| stat = mgb.serialize_comp_graph_to_file( | |||
| args.output, output_mgbvars, append=False, | |||
| output_strip_info=args.output_strip_info, | |||
| **sereg_kwargs) | |||
| logger.info('graph dump sizes: tot_size={:.3f}KiB overhead={:.3f}KiB'. | |||
| format(stat.tot_bytes / 1024, | |||
| (stat.tot_bytes - stat.tensor_value_bytes) / 1024)) | |||
| if __name__ == '__main__': | |||
| main() | |||
| @@ -0,0 +1,75 @@ | |||
| #!/usr/bin/env bash | |||
| set -e | |||
| function usage() { | |||
| echo "$0 args1 args2 .." | |||
| echo "available args detail:" | |||
| echo "-i info.json : input info.json file" | |||
| echo "-m model: model name" | |||
| echo "-e encryption mode: encryption mode rc4 encrypt_predefined_rc4 " | |||
| echo "-o output name: output name" | |||
| echo "-n input model name: input model name match with info.json" | |||
| echo "-h : show usage" | |||
| exit -1 | |||
| } | |||
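| # Example invocation (file names are illustrative only): | |||
| #   ./pack_model.sh -i info.json -m net.mge -e encrypt_predefined_rc4 -o packed.lite -n net | |||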
| while getopts "i:m:e:o:n:h" arg | |||
| do | |||
| case $arg in | |||
| i) | |||
| INFO_NAME=$OPTARG | |||
| ;; | |||
| m) | |||
| MODEL_NAME=$OPTARG | |||
| ;; | |||
| n) | |||
| INPUT_MODEL_NAME=$OPTARG | |||
| ;; | |||
| e) | |||
| ENCRYPT_MODE=$OPTARG | |||
| ;; | |||
| o) | |||
| OUTPUT_NAME=$OPTARG | |||
| ;; | |||
| h) | |||
| usage | |||
| ;; | |||
| \?) | |||
| echo "show usage" | |||
| usage | |||
| ;; | |||
| esac | |||
| done | |||
| echo "----------------------------------------------------" | |||
| echo "commad args summary:" | |||
| echo "INFO_NAME: $INFO_NAME" | |||
| echo "MODEL_NAME: $MODEL_NAME" | |||
| echo "ENCRYPT_MODE: $ENCRYPT_MODE" | |||
| echo "OUTPUT_NAME: $OUTPUT_NAME" | |||
| echo "INPUT_MODEL_NAME: $INPUT_MODEL_NAME" | |||
| echo "----------------------------------------------------" | |||
| if [[ $INFO_NAME == '' ]]; then | |||
| echo "INFO_NAME is NULL,exit now..." | |||
| exit -1 | |||
| fi | |||
| if [[ $MODEL_NAME == '' ]]; then | |||
| echo "MODEL_NAME is NULL,exit now..." | |||
| exit -1 | |||
| fi | |||
| if [[ $INPUT_MODEL_NAME == '' ]]; then | |||
| echo "INPUT_MODEL_NAME is NULL,exit now..." | |||
| exit -1 | |||
| fi | |||
| if [[ $OUTPUT_NAME == '' ]]; then | |||
| echo "OUTPUT_NAME is NULL,exit now..." | |||
| exit -1 | |||
| fi | |||
| ENCRYPT_INFO_NAME=$INFO_NAME.pr_rc4.emod | |||
| ENCRYPT_MODEL_NAME=$MODEL_NAME.pr_rc4.emod | |||
| ./rc4_encryptor $ENCRYPT_MODE $INFO_NAME $INFO_NAME.pr_rc4.emod | |||
| ./rc4_encryptor $ENCRYPT_MODE $MODEL_NAME $MODEL_NAME.pr_rc4.emod | |||
| python3 pack_model_and_info.py --input-model=$ENCRYPT_MODEL_NAME --model-name=$INPUT_MODEL_NAME --model-cryption="RC4_default" --info-cryption="RC4_default" --input-info=$ENCRYPT_INFO_NAME --info-parser="LITE_default" -o $OUTPUT_NAME | |||
| @@ -0,0 +1,135 @@ | |||
| #!/usr/bin/python3 | |||
| # -*- coding: utf-8 -*- | |||
| # | |||
| # This file is part of MegEngine, a deep learning framework developed by | |||
| # Megvii. | |||
| # | |||
| # copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| import argparse | |||
| import struct | |||
| import os | |||
| import subprocess | |||
| import flatbuffers | |||
| def generate_flatbuffer(): | |||
| status, path = subprocess.getstatusoutput('which flatc') | |||
| if not status: | |||
| cwd = os.path.dirname(os.path.dirname(__file__)) | |||
| fbs_file = os.path.abspath(os.path.join(cwd, | |||
| "../../src/parse_model/pack_model.fbs")) | |||
| cmd = path + ' -p -b '+fbs_file | |||
| ret, _ = subprocess.getstatusoutput(str(cmd)) | |||
| if ret: | |||
| raise Exception("flatc generate error!") | |||
| else: | |||
| raise Exception('no flatc found in the current environment, please build flatc ' | |||
| 'and put it in the system PATH!') | |||
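| # Example invocation (argument values are illustrative only): | |||
| #   python3 pack_model_and_info.py --input-model=net.mge.rc4 --input-info=info.json.rc4 \ | |||
| #       --model-name=net --model-cryption=RC4_default --info-cryption=RC4_default \ | |||
| #       --info-parser=LITE_default -o packed.lite | |||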
| def main(): | |||
| parser = argparse.ArgumentParser( | |||
| description='load an encrypted or unencrypted model together with a ' | |||
| 'JSON file describing the model information, and pack them into a ' | |||
| 'single file that can be loaded by lite.') | |||
| parser.add_argument('--input-model', help='input an encrypted or unencrypted model') | |||
| parser.add_argument('--input-info', help='input an encrypted or unencrypted ' | |||
| 'JSON format file.') | |||
| parser.add_argument('--model-name', help='the model name; this must match ' | |||
| 'the model name in the model info', default = 'NONE') | |||
| parser.add_argument('--model-cryption', help='the model encryption method ' | |||
| 'name, used to find the right decryption method, e.g. ' | |||
| '--model-cryption="AES_default"; default is NONE.', default = | |||
| 'NONE') | |||
| parser.add_argument('--info-cryption', help='the info encryption method ' | |||
| 'name, used to find the right decryption method, e.g. ' | |||
| '--info-cryption="AES_default"; default is NONE.', default = | |||
| 'NONE') | |||
| parser.add_argument('--info-parser', help='The information parse method name ' | |||
| 'default is "LITE_default". ', default = 'LITE_default') | |||
| parser.add_argument('--append', '-a', help='append another model to a ' | |||
| 'packed model.') | |||
| parser.add_argument('--output', '-o', help='output file of packed model.') | |||
| args = parser.parse_args() | |||
| generate_flatbuffer() | |||
| assert not args.append, ('--append is not supported yet') | |||
| assert args.input_model, ('--input_model must be given') | |||
| with open(args.input_model, 'rb') as fin: | |||
| raw_model = fin.read() | |||
| model_length = len(raw_model) | |||
| if args.input_info: | |||
| with open(args.input_info, 'rb') as fin: | |||
| raw_info = fin.read() | |||
| info_length = len(raw_info) | |||
| else: | |||
| raw_info = None | |||
| info_length = 0 | |||
| # Generated by `flatc`. | |||
| from model_parse import Model, ModelData, ModelHeader, ModelInfo, PackModel | |||
| builder = flatbuffers.Builder(1024) | |||
| model_name = builder.CreateString(args.model_name) | |||
| model_cryption = builder.CreateString(args.model_cryption) | |||
| info_cryption = builder.CreateString(args.info_cryption) | |||
| info_parser = builder.CreateString(args.info_parser) | |||
| # raw_info may be None when --input-info is omitted; pack an empty buffer then | |||
| info_data = builder.CreateByteVector(raw_info if raw_info is not None else b'') | |||
| arr_data = builder.CreateByteVector(raw_model) | |||
| #model header | |||
| ModelHeader.ModelHeaderStart(builder) | |||
| ModelHeader.ModelHeaderAddName(builder, model_name) | |||
| ModelHeader.ModelHeaderAddModelDecryptionMethod(builder, model_cryption) | |||
| ModelHeader.ModelHeaderAddInfoDecryptionMethod(builder, info_cryption) | |||
| ModelHeader.ModelHeaderAddInfoParseMethod(builder, info_parser) | |||
| model_header = ModelHeader.ModelHeaderEnd(builder) | |||
| #model info | |||
| ModelInfo.ModelInfoStart(builder) | |||
| ModelInfo.ModelInfoAddData(builder, info_data) | |||
| model_info = ModelInfo.ModelInfoEnd(builder) | |||
| #model data | |||
| ModelData.ModelDataStart(builder) | |||
| ModelData.ModelDataAddData(builder, arr_data) | |||
| model_data = ModelData.ModelDataEnd(builder) | |||
| Model.ModelStart(builder) | |||
| Model.ModelAddHeader(builder, model_header) | |||
| Model.ModelAddData(builder, model_data) | |||
| Model.ModelAddInfo(builder, model_info) | |||
| model = Model.ModelEnd(builder) | |||
| PackModel.PackModelStartModelsVector(builder, 1) | |||
| builder.PrependUOffsetTRelative(model) | |||
| models = builder.EndVector(1) | |||
| PackModel.PackModelStart(builder) | |||
| PackModel.PackModelAddModels(builder, models) | |||
| packed_model = PackModel.PackModelEnd(builder) | |||
| builder.Finish(packed_model) | |||
| buff = builder.Output() | |||
| result = struct.pack(str(len("packed_model")) + 's', "packed_model".encode('ascii')) | |||
| result += buff | |||
| assert args.output, ('--output must be given') | |||
| with open(args.output, 'wb') as fin: | |||
| fin.write(result) | |||
| print("Model packaged successfully!!!") | |||
| print("model name is: {}.".format(args.model_name)) | |||
| print("model encryption method is: {}. ".format(args.model_cryption)) | |||
| print("model json infomation encryption method is: {}. ".format(args.info_cryption)) | |||
| print("model json infomation parse method is: {}. ".format(args.info_parser)) | |||
| print("packed model is write to {} ".format(args.output)) | |||
| if __name__ == '__main__': | |||
| main() | |||
| @@ -0,0 +1,211 @@ | |||
| /** \file tools/rc4_encrypt.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include <stdio.h> | |||
| #include <algorithm> | |||
| #include <string> | |||
| #include <unordered_map> | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "../src/decryption/rc4/rc4_cryption_base.h" | |||
| #include "../src/decryption/rc4_cryption.h" | |||
| using namespace lite; | |||
| std::shared_ptr<void> read_file(std::string file_path, size_t& size) { | |||
| FILE* fin = fopen(file_path.c_str(), "rb"); | |||
| if (!fin) { | |||
| printf("failed to open %s.\n", file_path.c_str()); | |||
| size = 0; | |||
| return nullptr; | |||
| } | |||
| fseek(fin, 0, SEEK_END); | |||
| size = ftell(fin); | |||
| fseek(fin, 0, SEEK_SET); | |||
| void* ptr = malloc(size); | |||
| std::shared_ptr<void> buf{ptr, ::free}; | |||
| fread(buf.get(), 1, size, fin); | |||
| fclose(fin); | |||
| return buf; | |||
| } | |||
| void write_file(std::string file_path, const std::vector<uint8_t>& data) { | |||
| FILE* fin = fopen(file_path.c_str(), "wb"); | |||
| if (!fin) { | |||
| printf("failed to open %s.\n", file_path.c_str()); | |||
| return; | |||
| } | |||
| fwrite(data.data(), 1, data.size(), fin); | |||
| fclose(fin); | |||
| } | |||
| typedef int (*CommandHandler)(int, char**); | |||
| const char* usage = | |||
| "Usage:\n" | |||
| " rc4_encryptor encrypt_predefined_rc4 <input file> <output file>\n" | |||
| " rc4_encryptor encrypt_rc4 <hash key> <enc key> <input file> <output " | |||
| "file>\n" | |||
| " rc4_encryptor encrypt_predefined_sfrc4 <input file> <output file>\n" | |||
| " rc4_encryptor encrypt_sfrc4 <hash key> <enc key> <input file> " | |||
| "<output " | |||
| "file>\n" | |||
| " rc4_encryptor hash <input file>\n"; | |||
| int command_encrypt_predefined_rc4(int argc, char** argv) { | |||
| if (argc != 4) { | |||
| printf("Invalid encrypt_predefined_rc4 arguments.\n"); | |||
| return 1; | |||
| } | |||
| const char* input_file_path = argv[2]; | |||
| const char* output_file_path = argv[3]; | |||
| size_t size = 0; | |||
| auto keys = RC4::get_decrypt_key(); | |||
| auto input = read_file(input_file_path, size); | |||
| printf("Reading input file ...\n"); | |||
| auto output = RC4::encrypt_model(input.get(), size, keys); | |||
| write_file(output_file_path, output); | |||
| printf("Done.\n"); | |||
| return 0; | |||
| } | |||
| int command_encrypt_rc4(int argc, char** argv) { | |||
| if (argc != 6) { | |||
| printf("Invalid encrypt_rc4 arguments.\n"); | |||
| return 1; | |||
| } | |||
| uint64_t hash_key = std::stoull(argv[2], 0, 0); | |||
| uint64_t enc_key = std::stoull(argv[3], 0, 0); | |||
| const char* input_file_path = argv[4]; | |||
| const char* output_file_path = argv[5]; | |||
| std::vector<uint8_t> keys(128, 0); | |||
| uint64_t* data = reinterpret_cast<uint64_t*>(keys.data()); | |||
| data[0] = hash_key; | |||
| data[1] = enc_key; | |||
| size_t size = 0; | |||
| auto input = read_file(input_file_path, size); | |||
| printf("Reading input file ...\n"); | |||
| auto output = RC4::encrypt_model(input.get(), size, keys); | |||
| printf("Encrypting ...\n"); | |||
| write_file(output_file_path, output); | |||
| printf("Done.\n"); | |||
| return 0; | |||
| } | |||
| int command_encrypt_predefined_sfrc4(int argc, char** argv) { | |||
| if (argc != 4) { | |||
| printf("Invalid encrypt_predefined_rc4 arguments.\n"); | |||
| return 1; | |||
| } | |||
| const char* input_file_path = argv[2]; | |||
| const char* output_file_path = argv[3]; | |||
| size_t size = 0; | |||
| auto keys = SimpleFastRC4::get_decrypt_key(); | |||
| auto input = read_file(input_file_path, size); | |||
| printf("Reading input file ...\n"); | |||
| auto output = SimpleFastRC4::encrypt_model(input.get(), size, keys); | |||
| write_file(output_file_path, output); | |||
| printf("Done.\n"); | |||
| return 0; | |||
| } | |||
| int command_encrypt_sfrc4(int argc, char** argv) { | |||
| if (argc != 6) { | |||
| printf("Invalid encrypt_rc4 arguments.\n"); | |||
| return 1; | |||
| } | |||
| uint64_t hash_key = std::stoull(argv[2], 0, 0); | |||
| uint64_t enc_key = std::stoull(argv[3], 0, 0); | |||
| const char* input_file_path = argv[4]; | |||
| const char* output_file_path = argv[5]; | |||
| std::vector<uint8_t> keys(128, 0); | |||
| uint64_t* data = reinterpret_cast<uint64_t*>(keys.data()); | |||
| data[0] = hash_key; | |||
| data[1] = enc_key; | |||
| size_t size = 0; | |||
| auto input = read_file(input_file_path, size); | |||
| printf("Reading input file ...\n"); | |||
| auto output = SimpleFastRC4::encrypt_model(input.get(), size, keys); | |||
| printf("Encrypting ...\n"); | |||
| write_file(output_file_path, output); | |||
| printf("Done.\n"); | |||
| return 0; | |||
| } | |||
| int command_hash(int argc, char** argv) { | |||
| if (argc != 3) { | |||
| printf("Invalid hash arguments.\n"); | |||
| return 1; | |||
| } | |||
| const char* input_file_path = argv[2]; | |||
| size_t len = 0; | |||
| auto input = read_file(input_file_path, len); | |||
| rc4::FastHash64 hasher(rc4::key_gen_hash_key()); | |||
| auto start = static_cast<const char*>(input.get()); | |||
| auto ptr = reinterpret_cast<const uint64_t*>(start); | |||
| while (reinterpret_cast<const char*>(ptr + 1) <= start + len) { | |||
| hasher.feed(*ptr); | |||
| ++ptr; | |||
| } | |||
| auto cptr = reinterpret_cast<const char*>(ptr); | |||
| if (cptr < start + len) { | |||
| uint64_t v = 0; | |||
| std::copy(cptr, start + len, reinterpret_cast<char*>(&v)); | |||
| hasher.feed(v); | |||
| } | |||
| printf("%llx\n", static_cast<unsigned long long>(hasher.get())); | |||
| return 0; | |||
| } | |||
| std::unordered_map<std::string, CommandHandler> commands = { | |||
| {"encrypt_predefined_rc4", command_encrypt_predefined_rc4}, | |||
| {"encrypt_rc4", command_encrypt_rc4}, | |||
| {"encrypt_predefined_sfrc4", command_encrypt_predefined_sfrc4}, | |||
| {"encrypt_sfrc4", command_encrypt_sfrc4}, | |||
| {"hash", command_hash}, | |||
| }; | |||
| int main(int argc, char** argv) { | |||
| if (argc == 1) { | |||
| printf("%s", usage); | |||
| return 1; | |||
| } | |||
| auto it = commands.find(argv[1]); | |||
| if (it == commands.end()) { | |||
| printf("Invalid command arguments.\n"); | |||
| printf("%s", usage); | |||
| return 1; | |||
| } | |||
| return it->second(argc, argv); | |||
| } | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -209,6 +209,35 @@ function do_build() { | |||
| echo "comapt whl name: ${compat_whl_name}" | |||
| cp ${BUILD_DIR}/staging/dist/Meg*.whl ${MACOS_WHL_HOME}/${compat_whl_name} | |||
| # handle megenginelite | |||
| cd ${BUILD_DIR} | |||
| rm -rf lite_staging | |||
| mkdir -p lite_staging/megenginelite | |||
| cp ${SRC_DIR}/lite/pylite/megenginelite/* lite_staging/megenginelite/ | |||
| cp ${SRC_DIR}/lite/pylite/setup.py lite_staging/ | |||
| cp ${SRC_DIR}/lite/pylite/requires.txt lite_staging/ | |||
| VER_FILE=${SRC_DIR}/imperative/python/megengine/version.py | |||
| if [ -f ${VER_FILE} ];then | |||
| cp ${VER_FILE} lite_staging/megenginelite | |||
| else | |||
| echo "ERROR: can not find version file" | |||
| exit -1 | |||
| fi | |||
| mkdir -p ${BUILD_DIR}/lite_staging/megenginelite/libs | |||
| LITE_LIB=${BUILD_DIR}/lite_staging/megenginelite/libs/liblite_shared.dylib | |||
| cp ${SRC_DIR}/build_dir/host/MGE_WITH_CUDA_OFF/MGE_INFERENCE_ONLY_OFF/Release/build/lite/liblite_shared.dylib ${LITE_LIB} | |||
| llvm-strip -s ${LITE_LIB} | |||
| cd ${BUILD_DIR}/lite_staging/ | |||
| ${PYTHON_DIR}/bin/python3 setup.py bdist_wheel | |||
| cd ${BUILD_DIR}/lite_staging/dist/ | |||
| org_whl_name=`ls Meg*.whl` | |||
| index=`awk -v a="${org_whl_name}" -v b="-macosx" 'BEGIN{print index(a,b)}'` | |||
| compat_whl_name=`echo ${org_whl_name} |cut -b -$index`macosx_10_14_x86_64.whl | |||
| echo "megenginelite org whl name: ${org_whl_name}" | |||
| echo "megenginelite comapt whl name: ${compat_whl_name}" | |||
| cp ${BUILD_DIR}/lite_staging/dist/Meg*.whl ${MACOS_WHL_HOME}/${compat_whl_name} | |||
| cd ${SRC_DIR} | |||
| echo "" | |||
| echo "##############################################################################################" | |||
| @@ -155,6 +155,33 @@ do | |||
| echo "comapt whl name: ${compat_whl_name}" | |||
| mv ${org_whl_name} ${SRC_DIR}/scripts/whl/manylinux2014/output/wheelhouse/${SDK_NAME}/${compat_whl_name} | |||
| # handle megenginelite | |||
| cd ${BUILD_DIR} | |||
| rm -rf lite_staging | |||
| mkdir -p lite_staging/megenginelite | |||
| cp ${SRC_DIR}/lite/pylite/megenginelite/* lite_staging/megenginelite/ | |||
| cp ${SRC_DIR}/lite/pylite/setup.py lite_staging/ | |||
| cp ${SRC_DIR}/lite/pylite/requires.txt lite_staging/ | |||
| VER_FILE=${SRC_DIR}/imperative/python/megengine/version.py | |||
| if [ -f ${VER_FILE} ];then | |||
| cp ${VER_FILE} lite_staging/megenginelite | |||
| else | |||
| echo "ERROR: can not find version file" | |||
| exit -1 | |||
| fi | |||
| patch_elf_depend_lib_megenginelite | |||
| cd ${BUILD_DIR}/lite_staging/ | |||
| ${PYTHON_DIR}/bin/python setup.py bdist_wheel | |||
| cd /home/output | |||
| mkdir -p ${SRC_DIR}/scripts/whl/manylinux2014/output/wheelhouse/${SDK_NAME} | |||
| cd ${BUILD_DIR}/lite_staging/dist/ | |||
| org_whl_name=`ls Meg*${ver}*.whl` | |||
| compat_whl_name=`echo ${org_whl_name} | sed 's/linux/manylinux2014/'` | |||
| echo "megenginelite org whl name: ${org_whl_name}" | |||
| echo "megenginelite comapt whl name: ${compat_whl_name}" | |||
| mv ${org_whl_name} ${SRC_DIR}/scripts/whl/manylinux2014/output/wheelhouse/${SDK_NAME}/${compat_whl_name} | |||
| cd /home/output | |||
| chown -R ${UID}.${UID} . | |||
| # compat for root-less docker env to remove output at host side | |||