| @@ -26,7 +26,7 @@ project(akg C CXX) | |||||
| set(AKG_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}") | set(AKG_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}") | ||||
| set(TVM_DIR "${AKG_SOURCE_DIR}/third_party/incubator-tvm") | set(TVM_DIR "${AKG_SOURCE_DIR}/third_party/incubator-tvm") | ||||
| if(NOT CMAKE_SYSTEM_NAME MATCHES "Windows" AND NOT USE_KC_AIR) | |||||
| if(NOT CMAKE_SYSTEM_NAME MATCHES "Windows" AND NOT USE_KC_AIR AND NOT USE_CCE_PROFILING) | |||||
| add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=0) | add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=0) | ||||
| endif() | endif() | ||||
| @@ -63,21 +63,36 @@ link_directories(${AKG_RPATH}) | |||||
| # Search AKG_EXTEND by order | # Search AKG_EXTEND by order | ||||
| set(AKG_EXTEND ) | set(AKG_EXTEND ) | ||||
| if(NOT USE_CUDA) | if(NOT USE_CUDA) | ||||
| set(AKG_EXTEND_FILE ${AKG_SOURCE_DIR}/libakg_ext.a) # Search libakg_ext.a in directory akg | |||||
| if(NOT EXISTS ${AKG_EXTEND_FILE}) | |||||
| set(AKG_EXTEND_FILE ${AKG_SOURCE_DIR}/build/libakg_ext.a) # Search libakg_ext.a in directory akg/build | |||||
| if(NOT EXISTS ${AKG_EXTEND_FILE} AND NOT USE_KC_AIR) # Download libakg_ext.a to directory akg/build | |||||
| set(AKG_EXTEND_FILE ) | |||||
| set(LIB_PATH1 ${AKG_SOURCE_DIR}/libakg_ext.a) | |||||
| set(LIB_PATH2 ${AKG_SOURCE_DIR}/build/libakg_ext.a) | |||||
| if(EXISTS ${LIB_PATH1}) # Search libakg_ext.a in akg/ | |||||
| set(AKG_EXTEND_FILE ${LIB_PATH1}) | |||||
| else() | |||||
| if(EXISTS ${LIB_PATH2}) # Search libakg_ext.a in akg/build/ | |||||
| set(AKG_EXTEND_FILE ${LIB_PATH2}) | |||||
| elseif(NOT USE_KC_AIR AND NOT USE_CCE_PROFILING) # Download libakg_ext.a to akg/build/ | |||||
| execute_process(COMMAND bash ${AKG_SOURCE_DIR}/build.sh -a | execute_process(COMMAND bash ${AKG_SOURCE_DIR}/build.sh -a | ||||
| WORKING_DIRECTORY ${AKG_SOURCE_DIR} | WORKING_DIRECTORY ${AKG_SOURCE_DIR} | ||||
| OUTPUT_VARIABLE OUTPUT_URL | |||||
| RESULT_VARIABLE RESULT) | RESULT_VARIABLE RESULT) | ||||
| if(RESULT EQUAL 0) | |||||
| set(AKG_EXTEND_FILE ${AKG_SOURCE_DIR}/build/libakg_ext.a) | |||||
| if(RESULT EQUAL 0 AND OUTPUT_URL MATCHES "libakg_ext.a") | |||||
| # Download library | |||||
| string(STRIP ${OUTPUT_URL} LIB_URL) | |||||
| message("-- Downloading ${LIB_URL} --> ${LIB_PATH2}") | |||||
| file(DOWNLOAD ${LIB_URL} ${LIB_PATH2} STATUS DOWNLOAD_STATUS) | |||||
| message("-- Download status: ${DOWNLOAD_STATUS}") | |||||
| list(GET DOWNLOAD_STATUS 0 DOWNLOAD_CODE) | |||||
| if(DOWNLOAD_CODE EQUAL 0) | |||||
| set(AKG_EXTEND_FILE ${LIB_PATH2}) | |||||
| endif() | |||||
| endif() | endif() | ||||
| endif() | endif() | ||||
| endif() | endif() | ||||
| message("-- AKG_EXTEND_FILE: ${AKG_EXTEND_FILE}") | |||||
| if(EXISTS ${AKG_EXTEND_FILE}) | if(EXISTS ${AKG_EXTEND_FILE}) | ||||
| message("-- AKG_EXTEND_FILE: ${AKG_EXTEND_FILE}") | |||||
| file(COPY ${AKG_EXTEND_FILE} DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/akg_extend) | file(COPY ${AKG_EXTEND_FILE} DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/akg_extend) | ||||
| execute_process(COMMAND ar -x libakg_ext.a | execute_process(COMMAND ar -x libakg_ext.a | ||||
| WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/akg_extend) | WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/akg_extend) | ||||
| @@ -1,121 +1,24 @@ | |||||
| # Release 1.1.1 | |||||
| ## Major Features and Improvements | |||||
| * Enable Tensor core when processing GEMM operators in AKG by using poly to create the schedule needed by tensor core pass automatically; | |||||
| * Implemented an akg mma lib with inlined ptx codes instead of wmma interface of cuda; | |||||
| * Enable one-dimensional mapping to optimize memory promotion. | |||||
| ## Bugfixes | |||||
| * Fix Segmentation fault in Mapping OuterBand in mindspore (!321). | |||||
| * Fix bugs for memory promotion issues (!306). | |||||
| * Fix bugs during gen tuning space for scalar ops (!326). | |||||
| ## Contributors | |||||
| Thanks goes to these wonderful people: | |||||
| chengyun, chendeshi, chenlei_autodiff, gengzhen, hanhuifeng, lvwenyuan, lishanni513, hujiahui8, polyhedral, shiliang, wYann, xixixian, xxxxxxw, xuhui, xiaruijie, yangsijia, yiyanzhi, zhangzhaochuang, zhengzuohe | |||||
| Contributions of any kind are welcome! | |||||
| # Release 1.1.0 | |||||
| ## Major Features and Improvements | |||||
| * GPU operators improvements | |||||
| * Propose a new strategy to handle the reduction operators: The reduce axises would be detected and rescheduled as a seperated band in the schedule tree and then mapping to blocks, then it will call the akg_reduce_lib which using atomic operation to do reduction in the codegen pass. The experimental results show that AKG improves the execution performance relative to cudnn in the large shape cases; | |||||
| * Optimize the auto-tiling algorithms which can improve the performance of reduction operators dramatically in most scenarios. | |||||
| * Support AutoTuning for composite operators on GPU; | |||||
| * Refactor composite framework to enable optimization in DSL level; | |||||
| * Enhance CSE to support eliminating redundant vmadd on Ascend; | |||||
| * Update scipy to 1.5.3. | |||||
| ## Bugfixes | |||||
| * TensorAdd support FRACTAL_NZ and DefaultFormat(!228). | |||||
| * GPU fix cast: fp32 -> uint8(!216). | |||||
| * bugfix: Fix bug in opt_broadcast(!272). | |||||
| * fix vadds for int32(!250). | |||||
| ## Contributors | |||||
| ## Release 1.2.0 | |||||
| ### Major Features and Improvements | |||||
| * [STABLE] Rebuild the AKG repository for providing a new way to support ascend backend by linking a static library contained all the ascend passes. (Ascend) | |||||
| * [STABLE] Optimize the reduction add operation in ascend backend. (Ascend) | |||||
| * [STABLE] Add support for tuning elemwise&&reduction operators. (GPU) | |||||
| ### Bug fixes | |||||
| * Fixed a problem that data prefetch cannot be enabled by attributes in DSL. | |||||
| * Fixed bugs of autotiling algorithms (tiling too small, cannot adapted matmul+bias, etc.) in Ascend platform. | |||||
| * Fixed local memory promotion for large thread (2980!) | |||||
| * Fixed reduce binding dimension issue on gpu platform (ff38!) | |||||
| ### Contributors | |||||
| Thanks goes to these wonderful people: | Thanks goes to these wonderful people: | ||||
| chengyun, chendeshi, chenlei_autodiff, gaoxiong, gengzhen, guanxiaowei, hanhuifeng, laekov, luoyin, lvwenyuan, liuchang, lishanni513, lingyunli63, polyhedral, shiliang, wYann, wangrao124, xiaruijie, xixixian, xuhui, 要术甲杰, yiyanzhi_akane, yangshuo, yangsijia, zhangzhaochuang, zhengzuohe, zhangrenwei, zengzitao | |||||
| Contributions of any kind are welcome! | |||||
| # Release 1.0.0 | |||||
| ## Major Features and Improvements | |||||
| * GPU Support | |||||
| * AKG now can generate gpu cuda kernel with no-schedule by using polyhedral techniques, which will create initial schedule, tile outerBands, map with blocks and threads and memory promotion automatically in the AutoPoly pass. | |||||
| * Some primitive and fused operators(most are element-wise operators and reduce operators) were added, as well as corresponding testcases. | |||||
| * Schedule-templates enhancement | |||||
| * Optimize the TVM original schedule-templates to get better performance in some reduce cases. | |||||
| * Support fusing multi-outputs into one kernel for element-wise operators. | |||||
| * Davinci Enhancement | |||||
| * Eliminate unnecessary broadcast by transforming the element-wise computation, such as `D[i, j] = A[i] + B[i, j] + C[i]` -> `D[i, j] = A[i] + C[i] + B[i, j]`, which satisfies commutative law and associative law. | |||||
| * Enhance the pass to_three_address to match more cases for vmadd. | |||||
| ## Bugfixes | |||||
| * fix a bug that random test case segment_max failed(!127). | |||||
| * fix the permisson denied error of rewriting meta_file with same name(!147). | |||||
| * fix warning for unsupported gpu built-in ops(!148). | |||||
| ## Contributors | |||||
| Thanks goes to these wonderful people: | |||||
| baita, ConnZhai, gengzhen, guanxiaowei, hanhuifeng, hujiahui8, laekov, lvwenyuan, lishanni513, lingyunli63, polyhedral, wYann, wangrao124, xixixian, xuhui, 要术甲杰, yiyanzhi_akane, yangsijia, zhengzuohe, zhangrenwei, zengzitao | |||||
| yangsijia, xxxxxxw, polyhedral, zhangrenwei, yiyanzhi, xixixian, hujiahui8, zhengzuohe, lishanni, zhangzhaochuang, xuhui, liuchao, gengzhen, xiaruijie, | |||||
| chenlei_autodiff, lingyunli63, wYann, lvwenyuan, peiwenfang, hanhuifeng, gaoxiong, chengyun | |||||
| Contributions of any kind are welcome! | Contributions of any kind are welcome! | ||||
| # Release 0.7.0-beta | |||||
| ## Major Features and Improvements | |||||
| * Backend refactoring | |||||
| * Rewrite instruction args calculation module in EmitInsn by implementing a new computing strategy based on axis spliting, which achieved improvement both on performance and code simplicity. | |||||
| ## Bugfixes | |||||
| * fix dump code error when running gpu operators and set env MS_AKG_DUMP_CODE=ON(!113). | |||||
| ## Contributors | |||||
| Thanks goes to these wonderful people: | |||||
| lvwenyuan, shiliang, xuhui, wYann | |||||
| Contributions of any kind are welcome! | |||||
| # Release 0.6.0-beta | |||||
| ## Major Features and Improvements | |||||
| * AutoPoly refactor to support integrating multi-backend targets easily | |||||
| * Employ a pass/passmgr framework to manage all the transformations of ISL schedule tree in which transformation such as InitialSchTree and tileOuterBand would be considered as a pass to schedule tree. | |||||
| * Refactor some data structure of poly so that they can de-couple with Davinci chips. | |||||
| * Backend refactoring | |||||
| * Enhance min alignment analysis with more accurate propagate conditions. | |||||
| * Finetune pragma using alignment information before EmitInsn pass. | |||||
| * Simplify EmitInsn pass by unifying the emit method for different patterns. | |||||
| * Change the way of using TVM | |||||
| * Delete the repository ktvm and reference TVM directly in sourcecode(third_party/incubator-tvm). | |||||
| * Enable GPU operators generation which was tailored in ktvm. | |||||
| ## Bugfixes | |||||
| * fix wrong hoist problem in multicore loop switch hoist pass(!87). | |||||
| * fix scalar rearrange bug(!84). | |||||
| * fix matmul tuning and support all space tuning(!73). | |||||
| * fix variable broadcast_idx redefinition error when pragma dma_copy is replaced by opt_broadcast(!45). | |||||
| * fix the bug in broadcast_rewrite(!22). | |||||
| * fix bugs of multi-core processing(!33). | |||||
| * fix a bug that extra pipe_barrier inserted in the loop(!30). | |||||
| * fix inefficient auto tiling for axis with tail and remove duplicated check(!6). | |||||
| ## Contributors | |||||
| Thanks goes to these wonderful people: | |||||
| brovensmile, chengyun, chenlei_autodiff, chengbin, ConnZhai, fuxiaoteng, gaoxiong, gengzhen, hanhuifeng, KasonChan, luoyin, lvwenyuan, peiwenfang, xuhui, yangsijia, wangzhuo325, wYann | |||||
| Contributions of any kind are welcome! | |||||
| # Release 0.5.0-beta | |||||
| ## Major Features | |||||
| * Support auto-schedule and code-generation on Ascend platform. | |||||
| * Provide C++ APIs of basic operators used in MindSpore. | |||||
| * Support Elementwise-Elementwise, Reduce-Elementwise fusion patterns in Bert. | |||||
| * Support LambUpdateWithLR, LambNextMv, BatchMatmul optimazition for Bert. | |||||
| ## Initial Version | ## Initial Version | ||||
| * Upload the initial framework | * Upload the initial framework | ||||
| * Basic support for Ascend910 platform | |||||
| * Integration with GraphKernel | |||||
| * Basic support for Ascend910 platform and gpu v100 | |||||
| * Integration with GraphKernel fusion of MindSpore. | |||||
| @@ -50,22 +50,23 @@ write_checksum() | |||||
| done | done | ||||
| } | } | ||||
| download_lib() | |||||
| acquire_lib_url() | |||||
| { | { | ||||
| uname_info=`uname -a | tr '[A-Z]' '[a-z]'` | |||||
| os_info=`cat /etc/os-release | grep '^NAME=' | tr '[A-Z]' '[a-z]'` | |||||
| os_name="" | os_name="" | ||||
| arch_name="" | |||||
| if [[ "${uname_info}" =~ "ubuntu" ]]; then | |||||
| if [[ "${os_info}" =~ "ubuntu" ]]; then | |||||
| os_name="ubuntu" | os_name="ubuntu" | ||||
| elif [[ "${uname_info}" =~ "euleros" ]]; then | |||||
| elif [[ "${os_info}" =~ "euleros" ]]; then | |||||
| os_name="euleros" | os_name="euleros" | ||||
| elif [[ "${uname_info}" =~ "centos" ]]; then | |||||
| elif [[ "${os_info}" =~ "centos" ]]; then | |||||
| os_name="centos" | os_name="centos" | ||||
| fi | fi | ||||
| if [[ "${uname_info}" =~ "aarch64" ]]; then | |||||
| arch_info=`arch | tr '[A-Z]' '[a-z]'` | |||||
| arch_name="" | |||||
| if [[ "${arch_info}" =~ "aarch64" ]]; then | |||||
| arch_name="aarch64" | arch_name="aarch64" | ||||
| elif [[ "${uname_info}" =~ "x86_64" ]]; then | |||||
| elif [[ "${arch_info}" =~ "x86" ]]; then | |||||
| arch_name="x86" | arch_name="x86" | ||||
| fi | fi | ||||
| @@ -73,32 +74,7 @@ download_lib() | |||||
| url_prefix="https://repo.mindspore.cn/public/ms-incubator/akg-binary/version" | url_prefix="https://repo.mindspore.cn/public/ms-incubator/akg-binary/version" | ||||
| lib_mark="202103/20210318/master_20210318142553_3e77f3a799ca87c23f1a906eaad5ec4c1f78bc95" | lib_mark="202103/20210318/master_20210318142553_3e77f3a799ca87c23f1a906eaad5ec4c1f78bc95" | ||||
| lib_url="${url_prefix}/${lib_mark}/lib/${os_arch}/libakg_ext.a" | lib_url="${url_prefix}/${lib_mark}/lib/${os_arch}/libakg_ext.a" | ||||
| hash_url="${url_prefix}/${lib_mark}/lib/${os_arch}/libakg_ext.a.sha256" | |||||
| if [ ! -d ${BUILD_DIR} ]; then | |||||
| mkdir -pv ${BUILD_DIR} | |||||
| fi | |||||
| # Download libakg_ext.a.sha256 | |||||
| wget -P ${BUILD_DIR} --waitretry=10 --tries=3 ${hash_url} | |||||
| if [ $? -ne 0 ]; then | |||||
| echo "Fail to download ${hash_url}" | |||||
| return 1 | |||||
| fi | |||||
| # Download libakg_ext.a | |||||
| wget -P ${BUILD_DIR} --waitretry=10 --tries=3 ${lib_url} | |||||
| if [ $? -ne 0 ]; then | |||||
| echo "Fail to download ${lib_url}" | |||||
| return 1 | |||||
| fi | |||||
| # Check hash | |||||
| cur_hash=`sha256sum -b ${BUILD_DIR}/libakg_ext.a | awk '{print $1}'` | |||||
| orig_hash=`grep libakg_ext.a ${BUILD_DIR}/libakg_ext.a.sha256 | awk '{print $1}'` | |||||
| if [ "${cur_hash}" != "${orig_hash}" ]; then | |||||
| echo "Hash check failed!" | |||||
| return 1 | |||||
| fi | |||||
| echo "${lib_url}" | |||||
| } | } | ||||
| if [ ! -n "$1" ]; then | if [ ! -n "$1" ]; then | ||||
| @@ -130,10 +106,7 @@ do | |||||
| t) | t) | ||||
| ;; | ;; | ||||
| a) | a) | ||||
| download_lib | |||||
| if [ $? -ne 0 ]; then | |||||
| exit 1 | |||||
| fi | |||||
| acquire_lib_url | |||||
| exit 0 | exit 0 | ||||
| ;; | ;; | ||||
| *) | *) | ||||
| @@ -50,7 +50,17 @@ def dump_tiling_info(level): | |||||
| logging.info(info, tuning_spaces["index"][i][0], tuning_spaces["index"][i][1], | logging.info(info, tuning_spaces["index"][i][0], tuning_spaces["index"][i][1], | ||||
| tuning_spaces["c1_range"][i][0], tuning_spaces["c1_range"][i][1], | tuning_spaces["c1_range"][i][0], tuning_spaces["c1_range"][i][1], | ||||
| tuning_spaces["c1_mod"][i][0], tuning_spaces["c0_range"][i][0], | tuning_spaces["c1_mod"][i][0], tuning_spaces["c0_range"][i][0], | ||||
| tuning_spaces["c0_range"][i][1], tuning_spaces["c0_mod"][i][0]) | |||||
| tuning_spaces["c0_range"][i][1], tuning_spaces["c0_mod"][i][0], | |||||
| ) | |||||
| idx_to_str = {0: "x", 1: "y", 2: "z"} | |||||
| for i in range(len(tuning_spaces["thread_range"])): | |||||
| info = "[thread.%s] range [%d, %d](jump by %d), " | |||||
| logging.info(info, idx_to_str[i], tuning_spaces["thread_range"][i][0], tuning_spaces["thread_range"][i][1], | |||||
| tuning_spaces['thread_mod'][i][0], ) | |||||
| for i in range(len(tuning_spaces["block_range"])): | |||||
| info = "[block.%s] range [%d, %d](jump by %d)" | |||||
| logging.info(info, idx_to_str[i], tuning_spaces["block_range"][i][0], | |||||
| tuning_spaces["block_range"][i][1], tuning_spaces['block_mod'][i][0],) | |||||
| logging.info("===============================================") | logging.info("===============================================") | ||||
| elif isinstance(indice, int) and indice == EMPTY_CODE: | elif isinstance(indice, int) and indice == EMPTY_CODE: | ||||
| logging.info("Empty tiling space.") | logging.info("Empty tiling space.") | ||||
| @@ -108,6 +118,10 @@ def lower(sch, args, shape_params=None, name="default_function", binds=None, att | |||||
| tuning_spaces["c0_range"] = ret.c0_tile_range_table.asnumpy().tolist() | tuning_spaces["c0_range"] = ret.c0_tile_range_table.asnumpy().tolist() | ||||
| tuning_spaces["c1_mod"] = ret.c1_tile_mod_table.asnumpy().tolist() | tuning_spaces["c1_mod"] = ret.c1_tile_mod_table.asnumpy().tolist() | ||||
| tuning_spaces["c0_mod"] = ret.c0_tile_mod_table.asnumpy().tolist() | tuning_spaces["c0_mod"] = ret.c0_tile_mod_table.asnumpy().tolist() | ||||
| tuning_spaces["thread_range"] = ret.gpu_thread_range_table.asnumpy().tolist() | |||||
| tuning_spaces["block_range"] = ret.gpu_block_range_table.asnumpy().tolist() | |||||
| tuning_spaces["thread_mod"] = ret.gpu_thread_mod_table.asnumpy().tolist() | |||||
| tuning_spaces["block_mod"] = ret.gpu_block_mod_table.asnumpy().tolist() | |||||
| if level >= help_tiling_level["Candidates"]: | if level >= help_tiling_level["Candidates"]: | ||||
| tuning_spaces["tuning_space"] = ret.tiling_candidate.asnumpy().tolist() | tuning_spaces["tuning_space"] = ret.tiling_candidate.asnumpy().tolist() | ||||
| if not tuning: | if not tuning: | ||||
| @@ -70,15 +70,33 @@ class TileConstraint(Enum): | |||||
| SET_EXPANSION = "SET_EXPANSION" | SET_EXPANSION = "SET_EXPANSION" | ||||
| SET_MEM_RATIO = "SET_MEM_RATIO" | SET_MEM_RATIO = "SET_MEM_RATIO" | ||||
| SET_AXIS_INFO = "SET_AXIS_INFO" | SET_AXIS_INFO = "SET_AXIS_INFO" | ||||
| THREAD_MIN = "THREAD_MIN" | |||||
| THREAD_MAX = "THREAD_MAX" | |||||
| THREAD_MOD = "THREAD_MOD" | |||||
| BLOCK_MIN = "BLOCK_MIN" | |||||
| BLOCK_MAX = "BLOCK_MAX" | |||||
| BLOCK_MOD = "BLOCK_MOD" | |||||
| @check_input_type((double, float, int), TileConstraint, TileLevel) | |||||
| @check_input_type((double, float, int, list), TileConstraint, TileLevel) | |||||
| def modify_common_constraints(value, constraint, level=TileLevel.C1): | def modify_common_constraints(value, constraint, level=TileLevel.C1): | ||||
| """api for dsl to modify some default constraint used in auto tiling.""" | """api for dsl to modify some default constraint used in auto tiling.""" | ||||
| if constraint not in TileConstraint: | if constraint not in TileConstraint: | ||||
| raise ValueError("Tile constraints must be chosen from {0}".format(TileConstraint)) | raise ValueError("Tile constraints must be chosen from {0}".format(TileConstraint)) | ||||
| if constraint == TileConstraint.SET_MEM_RATIO: | if constraint == TileConstraint.SET_MEM_RATIO: | ||||
| return create_custom_tiling_node(TileMode.COMMON, tile_level=level, mem_ratio=double(value)) | return create_custom_tiling_node(TileMode.COMMON, tile_level=level, mem_ratio=double(value)) | ||||
| if constraint == TileConstraint.THREAD_MIN: | |||||
| return create_custom_tiling_node(TileMode.COMMON, thread_min=value) | |||||
| if constraint == TileConstraint.THREAD_MAX: | |||||
| return create_custom_tiling_node(TileMode.COMMON, thread_max=value) | |||||
| if constraint == TileConstraint.THREAD_MOD: | |||||
| return create_custom_tiling_node(TileMode.COMMON, thread_mod=value) | |||||
| if constraint == TileConstraint.BLOCK_MIN: | |||||
| return create_custom_tiling_node(TileMode.COMMON, block_min=value) | |||||
| if constraint == TileConstraint.BLOCK_MAX: | |||||
| return create_custom_tiling_node(TileMode.COMMON, block_max=value) | |||||
| if constraint == TileConstraint.BLOCK_MOD: | |||||
| return create_custom_tiling_node(TileMode.COMMON, block_mod=value) | |||||
| raise TypeError("Constraint {} is not supported in this api, please use other api" | raise TypeError("Constraint {} is not supported in this api, please use other api" | ||||
| .format(constraint.value)) | .format(constraint.value)) | ||||
| @@ -233,7 +251,13 @@ def create_custom_tiling_node(tile_mode, | |||||
| axis_info=DEFAULT_STRING, | axis_info=DEFAULT_STRING, | ||||
| priority=DEFAULT_VALUE, | priority=DEFAULT_VALUE, | ||||
| expansion=DEFAULT_VALUE, | expansion=DEFAULT_VALUE, | ||||
| mem_ratio=double(DEFAULT_VALUE)): | |||||
| mem_ratio=double(DEFAULT_VALUE), | |||||
| thread_min=[], | |||||
| thread_max=[], | |||||
| thread_mod=[], | |||||
| block_min=[], | |||||
| block_max=[], | |||||
| block_mod=[]): | |||||
| """default method to create custom tiling node, all values are default except tile mode.""" | """default method to create custom tiling node, all values are default except tile mode.""" | ||||
| tile_min = to_tvm_type(tile_min, "tile_min") | tile_min = to_tvm_type(tile_min, "tile_min") | ||||
| @@ -257,7 +281,13 @@ def create_custom_tiling_node(tile_mode, | |||||
| axis_info=akg.tvm.expr.StringImm(axis_info), | axis_info=akg.tvm.expr.StringImm(axis_info), | ||||
| priority=priority, | priority=priority, | ||||
| expansion=expansion, | expansion=expansion, | ||||
| mem_ratio=mem_ratio) | |||||
| mem_ratio=mem_ratio, | |||||
| thread_min=thread_min, | |||||
| thread_max=thread_max, | |||||
| thread_mod=thread_mod, | |||||
| block_min=block_min, | |||||
| block_max=block_max, | |||||
| block_mod=block_mod) | |||||
| def template_nc1hwc0(tensor_name, level): | def template_nc1hwc0(tensor_name, level): | ||||
| @@ -35,6 +35,7 @@ import numpy as np | |||||
| import akg | import akg | ||||
| from akg.build_module import help_tiling_level | from akg.build_module import help_tiling_level | ||||
| from akg import backend as cce | |||||
| import akg.tvm | import akg.tvm | ||||
| from akg.tvm import autotvm | from akg.tvm import autotvm | ||||
| from akg.tvm import rpc | from akg.tvm import rpc | ||||
| @@ -88,7 +89,6 @@ def debug_mode(debug_flag): | |||||
| pass_list.append((0, ir_pass.inject_dma_intrin)) | pass_list.append((0, ir_pass.inject_dma_intrin)) | ||||
| return pass_list | return pass_list | ||||
| def func_time_required(func_name): | def func_time_required(func_name): | ||||
| """Checking the Time Required for Function Running.""" | """Checking the Time Required for Function Running.""" | ||||
| def wrapper(*args, **kwargs): | def wrapper(*args, **kwargs): | ||||
| @@ -467,7 +467,7 @@ def mod_launch_air(mod, args, outputs): | |||||
| return None | return None | ||||
| @func_time_required | @func_time_required | ||||
| def mod_launch(mod, args, outputs=(-1,), tuning=False, device_id=0, expect=None): | |||||
| def mod_launch(mod, args, outputs=(-1,), tuning=False, device_id=0, expect=None, repeat_time=400): | |||||
| """ | """ | ||||
| unified run CCE kernel api. | unified run CCE kernel api. | ||||
| @@ -492,7 +492,7 @@ def mod_launch(mod, args, outputs=(-1,), tuning=False, device_id=0, expect=None) | |||||
| if not tuning: | if not tuning: | ||||
| return out_list[0] if len(out_list) == 1 else tuple(out_list) | return out_list[0] if len(out_list) == 1 else tuple(out_list) | ||||
| else: | else: | ||||
| cycles = get_gpu_cycles(mod, *mod_args, device_id=device_id, save_log=True) | |||||
| cycles = get_gpu_cycles(mod, *mod_args, device_id=device_id, save_log=True, repeat_time=repeat_time) | |||||
| return out_list[0] if len(out_list) == 1 else tuple(out_list), {'run_time': cycles} | return out_list[0] if len(out_list) == 1 else tuple(out_list), {'run_time': cycles} | ||||
| stat_info = {} | stat_info = {} | ||||
| @@ -996,7 +996,6 @@ def op_build(op_func, input_shapes, input_types, op_attrs=None, kernel_name="", | |||||
| level = attrs.get("help_tiling") if attrs and "help_tiling" in attrs else None | level = attrs.get("help_tiling") if attrs and "help_tiling" in attrs else None | ||||
| if tuning or (level is not None and level > help_tiling_level['None']): | if tuning or (level is not None and level > help_tiling_level['None']): | ||||
| return gen_spaces_dim_key(op_func, args, s, op_var, kernel_name, attrs, polyhedral, tuning, target) | return gen_spaces_dim_key(op_func, args, s, op_var, kernel_name, attrs, polyhedral, tuning, target) | ||||
| mode = get_runtime_mode() | mode = get_runtime_mode() | ||||
| if mode == "cpu": | if mode == "cpu": | ||||
| mod = akg.tvm.build(s, op_var, "llvm") | mod = akg.tvm.build(s, op_var, "llvm") | ||||
| @@ -1069,12 +1068,12 @@ def get_device_id(): | |||||
| logging.error(e) | logging.error(e) | ||||
| return 0 | return 0 | ||||
| def get_gpu_cycles(mod, *mod_args, device_id=0, save_log=False): | |||||
| def get_gpu_cycles(mod, *mod_args, device_id=0, save_log=False, repeat_time=400): | |||||
| "get gpu profiling cycles." | "get gpu profiling cycles." | ||||
| func = tvm.get_global_func('GPUProfilerInit') | func = tvm.get_global_func('GPUProfilerInit') | ||||
| func("") | func("") | ||||
| from akg.utils.result_analysis import gpu_profiling | from akg.utils.result_analysis import gpu_profiling | ||||
| gpu_profiling(mod, *mod_args, repeat_time=400, device_id=device_id) | |||||
| gpu_profiling(mod, *mod_args, repeat_time=repeat_time, device_id=device_id) | |||||
| func = tvm.get_global_func('GPUProfilerStop') | func = tvm.get_global_func('GPUProfilerStop') | ||||
| a = func() | a = func() | ||||
| return int(a) | return int(a) | ||||
| @@ -1,177 +0,0 @@ | |||||
| /** | |||||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include <iostream> | |||||
| #include <chrono> | |||||
| #include <m16n16k4.hpp> | |||||
| // Usage: nvcc -std=c++11 -lineinfo -lcublas -arch=sm_70 -DCUDA_ARCH_SM=70 -I./ mma_test.cu -o mma_test | |||||
| // const int WARP_SIZE = 32; | |||||
| const int M = 16; | |||||
| const int N = 16; | |||||
| const int K = 4; | |||||
| const int MMA_M = 16; | |||||
| const int MMA_N = 16; | |||||
| const int MMA_K = 4; | |||||
| template<typename CType=float, typename ABType=half> | |||||
| __global__ void wmma_test_kernel(CType *const c_ptr, const ABType *const a_ptr, const ABType *const b_ptr) { | |||||
| akg::wmma::fragment<nvcuda::wmma::matrix_a, MMA_M, MMA_N, MMA_K, ABType, nvcuda::wmma::row_major> frag_a_row; | |||||
| akg::wmma::fragment<nvcuda::wmma::matrix_a, MMA_M, MMA_N, MMA_K, ABType, nvcuda::wmma::col_major> frag_a_col; | |||||
| akg::wmma::fragment<nvcuda::wmma::matrix_b, MMA_M, MMA_N, MMA_K, ABType, nvcuda::wmma::row_major> frag_b_row; | |||||
| akg::wmma::fragment<nvcuda::wmma::matrix_b, MMA_M, MMA_N, MMA_K, ABType, nvcuda::wmma::col_major> frag_b_col; | |||||
| akg::wmma::fragment<nvcuda::wmma::accumulator, MMA_M, MMA_N, MMA_K, CType> frag_c; | |||||
| akg::wmma::load_matrix_sync(frag_a_row, a_ptr, K); | |||||
| akg::wmma::load_matrix_sync(frag_a_col, a_ptr, M); | |||||
| akg::wmma::load_matrix_sync(frag_b_row, b_ptr, N); | |||||
| akg::wmma::load_matrix_sync(frag_b_col, b_ptr, K); | |||||
| akg::wmma::load_matrix_sync<CType>(frag_c, c_ptr, N, nvcuda::wmma::mem_row_major); | |||||
| print_fragment(frag_c, "frag_c"); | |||||
| akg::wmma::fill_fragment<CType>(frag_c, 0.0f); | |||||
| akg::wmma::mma_sync(frag_c, frag_a_col, frag_b_col, frag_c); | |||||
| print_fragment(frag_a_row, "frag_a_row"); | |||||
| print_fragment(frag_a_col, "frag_a_col"); | |||||
| print_fragment(frag_b_row, "frag_b_row"); | |||||
| print_fragment(frag_b_col, "frag_b_col"); | |||||
| akg::wmma::store_matrix_sync(frag_c, c_ptr, N, nvcuda::wmma::mem_row_major); | |||||
| } | |||||
| #define FP16_EXPONENT_BITS 0x1F | |||||
| #define FP16_EXPONENT_SHIFT 10 | |||||
| #define FP16_EXPONENT_BIAS 15 | |||||
| #define FP16_MANTISSA_BITS 0x3ff | |||||
| #define FP16_MANTISSA_SHIFT (23 - FP16_EXPONENT_SHIFT) | |||||
| #define FP16_MAX_EXPONENT (FP16_EXPONENT_BITS << FP16_EXPONENT_SHIFT) | |||||
| inline half FP32toFP16(float val) { | |||||
| unsigned int f32 = (*(unsigned int *)&val); | |||||
| unsigned short f16 = 0; | |||||
| /* Decode IEEE 754 little-endian 32-bit floating-point value */ | |||||
| int sign = (f32 >> 16) & 0x8000; | |||||
| /* Map exponent to the range [-127,128] */ | |||||
| int exponent = ((f32 >> 23) & 0xff) - 127; | |||||
| int mantissa = f32 & 0x007fffff; | |||||
| if (exponent == 128) { /* Infinity or NaN */ | |||||
| f16 = sign | FP16_MAX_EXPONENT; | |||||
| if (mantissa) f16 |= (mantissa & FP16_MANTISSA_BITS); | |||||
| } else if (exponent > 15) { /* Overflow - flush to Infinity */ | |||||
| f16 = sign | FP16_MAX_EXPONENT; | |||||
| } else if (exponent > -15) { /* Representable value */ | |||||
| exponent += FP16_EXPONENT_BIAS; | |||||
| mantissa >>= FP16_MANTISSA_SHIFT; | |||||
| f16 = sign | exponent << FP16_EXPONENT_SHIFT | mantissa; | |||||
| } else { | |||||
| f16 = sign; | |||||
| } | |||||
| return *(half *)&f16; | |||||
| } | |||||
| template <class T> | |||||
| void oneInit(T *data, int size) { | |||||
| for (int i = 0; i < size; ++i) { | |||||
| data[i] = (T)FP32toFP16(1.f); | |||||
| } | |||||
| } | |||||
| template <class T> | |||||
| void randomInit(T *data, int size) { | |||||
| for (int i = 0; i < size; ++i) { | |||||
| data[i] = (T)FP32toFP16(i); | |||||
| } | |||||
| } | |||||
| using stype = half; | |||||
| using dtype = float; | |||||
| int main() { | |||||
| half *da; | |||||
| half *db; | |||||
| float *dc; | |||||
| half *dc_fp16; | |||||
| unsigned int size_A = M * K; | |||||
| unsigned int size_B = K * N; | |||||
| unsigned int size_C = M * N; | |||||
| unsigned int size_C_fp16 = M * N; | |||||
| unsigned int mem_size_A = sizeof(stype) * size_A; | |||||
| unsigned int mem_size_B = sizeof(stype) * size_B; | |||||
| unsigned int mem_size_C = sizeof(dtype) * size_C; | |||||
| unsigned int mem_size_C_fp16 = sizeof(stype) * size_C_fp16; | |||||
| printf("M = %d, N = %d, K = %d\n", M, N, K); | |||||
| printf("size_A = %d, size_B = %d, size_C = %d, size_C_fp16 = %d\n", mem_size_A, mem_size_B, mem_size_C, mem_size_C_fp16); | |||||
| stype *h_A = (stype *)malloc(mem_size_A); | |||||
| stype *h_B = (stype *)malloc(mem_size_B); | |||||
| dtype *h_C = (dtype *)malloc(mem_size_C); | |||||
| stype *h_C_fp16 = (stype *)malloc(mem_size_C_fp16); | |||||
| // dtype *reference = (dtype *)malloc(mem_size_C); | |||||
| // stype *reference = (stype *)malloc(mem_size_C_fp16); | |||||
| randomInit<stype>(h_A, size_A); | |||||
| randomInit<stype>(h_B, size_B); | |||||
| randomInit<dtype>(h_C, size_C); | |||||
| randomInit<stype>(h_C_fp16, size_C_fp16); | |||||
| cudaMalloc(&da, mem_size_A); | |||||
| cudaMalloc(&db, mem_size_B); | |||||
| cudaMalloc(&dc, mem_size_C); | |||||
| cudaMalloc(&dc_fp16, mem_size_C_fp16); | |||||
| // copy host memory to device | |||||
| cudaMemcpy(da, h_A, mem_size_A, cudaMemcpyHostToDevice); | |||||
| cudaMemcpy(db, h_B, mem_size_B, cudaMemcpyHostToDevice); | |||||
| cudaMemcpy(dc, h_C, mem_size_C, cudaMemcpyHostToDevice); | |||||
| cudaMemcpy(dc_fp16, h_C_fp16, mem_size_C_fp16, cudaMemcpyHostToDevice); | |||||
| dim3 threads, grid; | |||||
| threads = dim3(32); | |||||
| grid = dim3(1, 1); | |||||
| // CType == fp32 | |||||
| wmma_test_kernel<float, half><<<grid, threads>>>(dc, da, db); | |||||
| cudaDeviceSynchronize(); | |||||
| auto error_code = cudaGetLastError(); | |||||
| printf("CType == fp32, last error: %d\n", error_code); | |||||
| cudaMemcpy(h_C, dc, mem_size_C, cudaMemcpyDeviceToHost); | |||||
| // CType == fp16 | |||||
| wmma_test_kernel<half, half><<<grid, threads>>>(dc_fp16, da, db); | |||||
| cudaDeviceSynchronize(); | |||||
| error_code = cudaGetLastError(); | |||||
| printf("CType == fp16, last error: %d\n", error_code); | |||||
| cudaMemcpy(h_C_fp16, dc_fp16, mem_size_C_fp16, cudaMemcpyDeviceToHost); | |||||
| free(h_A); | |||||
| free(h_B); | |||||
| free(h_C); | |||||
| free(h_C_fp16); | |||||
| // free(reference); | |||||
| cudaFree(da); | |||||
| cudaFree(db); | |||||
| cudaFree(dc); | |||||
| cudaFree(dc_fp16); | |||||
| } | |||||
| @@ -356,7 +356,7 @@ void RegisterMemoryManager::IsOutofMemory(std::vector<BufferDefInfo> promoted_in | |||||
| auto tensor_size = std::accumulate(box_sizes.begin(), box_sizes.end(), 1, std::multiplies<size_t>()); | auto tensor_size = std::accumulate(box_sizes.begin(), box_sizes.end(), 1, std::multiplies<size_t>()); | ||||
| auto data_bytes = scop_info_.user_config_.GetDataType(promoted_info.tensor_id.get_name()); | auto data_bytes = scop_info_.user_config_.GetDataType(promoted_info.tensor_id.get_name()); | ||||
| total_alloc_size += tensor_size * std::max<int>(1, data_bytes / BYTES_PER_REGISTER); | total_alloc_size += tensor_size * std::max<int>(1, data_bytes / BYTES_PER_REGISTER); | ||||
| if (total_alloc_size * alloc_threads > MAX_REGISTER_PER_THREAD_BLOCK * REGISTER_ALLOC_RATIO) { | |||||
| if (total_alloc_size * alloc_threads >= MAX_REGISTER_PER_THREAD_BLOCK * REGISTER_ALLOC_RATIO) { | |||||
| memory_exceeding_ = true; | memory_exceeding_ = true; | ||||
| break; | break; | ||||
| } | } | ||||
| @@ -80,6 +80,24 @@ class CustomTilingNode : public Node { | |||||
| * default is 0.5 which is reserved for double buffer*/ | * default is 0.5 which is reserved for double buffer*/ | ||||
| double mem_ratio; | double mem_ratio; | ||||
| /*! \brief minimal thread binding factor on gpu, greater than 0*/ | |||||
| Array<Expr> thread_min; | |||||
| /*! \brief maximal thread binding factor on gpu*/ | |||||
| Array<Expr> thread_max; | |||||
| /*! \brief constraint thread binding factor % thread_mod == 0*/ | |||||
| Array<Expr> thread_mod; | |||||
| /*! \brief minimal block binding factor on gpu, greater than 0*/ | |||||
| Array<Expr> block_min; | |||||
| /*! \brief maximal block binding factor on gpu*/ | |||||
| Array<Expr> block_max; | |||||
| /*! \brief constraint block binding factor % block_mod == 0*/ | |||||
| Array<Expr> block_mod; | |||||
| void VisitAttrs(AttrVisitor *v) { | void VisitAttrs(AttrVisitor *v) { | ||||
| v->Visit("tile_level", &tile_level); | v->Visit("tile_level", &tile_level); | ||||
| v->Visit("tile_mode", &tile_mode); | v->Visit("tile_mode", &tile_mode); | ||||
| @@ -97,6 +115,12 @@ class CustomTilingNode : public Node { | |||||
| v->Visit("priority", &priority); | v->Visit("priority", &priority); | ||||
| v->Visit("expansion", &expansion); | v->Visit("expansion", &expansion); | ||||
| v->Visit("mem_ratio", &mem_ratio); | v->Visit("mem_ratio", &mem_ratio); | ||||
| v->Visit("thread_min", &thread_min); | |||||
| v->Visit("thread_max", &thread_max); | |||||
| v->Visit("thread_mod", &thread_mod); | |||||
| v->Visit("block_min", &block_min); | |||||
| v->Visit("block_max", &block_max); | |||||
| v->Visit("block_mod", &block_mod); | |||||
| } | } | ||||
| static constexpr const char *_type_key = "CustomTilingNode"; | static constexpr const char *_type_key = "CustomTilingNode"; | ||||
| @@ -36,6 +36,15 @@ class TileSpaceCollector { | |||||
| space_->c1_tile_mod_table = init_array; | space_->c1_tile_mod_table = init_array; | ||||
| space_->c0_tile_mod_table = init_array; | space_->c0_tile_mod_table = init_array; | ||||
| space_->tiling_candidate = init_array; | space_->tiling_candidate = init_array; | ||||
| space_->gpu_thread_range_table = init_array; | |||||
| space_->gpu_block_range_table = init_array; | |||||
| space_->gpu_thread_mod_table = init_array; | |||||
| space_->gpu_block_mod_table = init_array; | |||||
| if (analyzer_.scop_info_.user_config_.GetTarget() == TARGET_CUDA) { | |||||
| cared_info_ = {"index", "C1_range", "C0_range", "C1_mod", "C0_mod", "gpu_thread_range", "gpu_block_range", "gpu_thread_mod", "gpu_block_mod"}; | |||||
| } else { | |||||
| cared_info_ = {"index", "C1_range", "C0_range", "C1_mod", "C0_mod"}; | |||||
| } | |||||
| } | } | ||||
| ~TileSpaceCollector() = default; | ~TileSpaceCollector() = default; | ||||
| @@ -122,38 +131,61 @@ class TileSpaceCollector { | |||||
| // step 2. collect cared info from each axis | // step 2. collect cared info from each axis | ||||
| for (const auto &con : cared_info_) { | for (const auto &con : cared_info_) { | ||||
| int length = con.find("mod") != std::string::npos ? 1 : 2; | int length = con.find("mod") != std::string::npos ? 1 : 2; | ||||
| auto array = air::runtime::NDArray::Empty({static_cast<int64_t>(tile_size), length}, type, ctx); | |||||
| auto size = static_cast<int64_t>(tile_size); | |||||
| if (con.find("gpu") != std::string::npos) { | |||||
| size = std::max<int64_t>(3, size); | |||||
| } | |||||
| auto array = air::runtime::NDArray::Empty({size, length}, type, ctx); | |||||
| auto spaceDlPack = array.ToDLPack(); | auto spaceDlPack = array.ToDLPack(); | ||||
| auto ptr = reinterpret_cast<int *>(spaceDlPack->dl_tensor.data); | auto ptr = reinterpret_cast<int *>(spaceDlPack->dl_tensor.data); | ||||
| for (size_t b_idx = 0; b_idx < all_axes.size(); ++b_idx) { | |||||
| for (size_t a_idx = 0; a_idx < all_axes[b_idx].size(); ++a_idx) { | |||||
| if (con == "index") { | |||||
| *ptr++ = b_idx; | |||||
| *ptr++ = a_idx; | |||||
| if (con.find("gpu") != std::string::npos) { | |||||
| size_t s = con.find("thread") != std::string::npos ? 0 : 3; | |||||
| size_t e = con.find("thread") != std::string::npos ? 3 : 6; | |||||
| for (size_t i = s; i < e; ++i) { | |||||
| if (length == 1) { | |||||
| *ptr++ = analyzer_.binding_spaces_[i].map_mod_; | |||||
| } else { | } else { | ||||
| if (con == "C1_range") { | |||||
| TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE1); | |||||
| *ptr++ = const_cons.tile_min_.as<IntImm>()->value; | |||||
| *ptr++ = const_cons.tile_extent_.as<IntImm>()->value; | |||||
| } else if (con == "C0_range") { | |||||
| TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE0); | |||||
| *ptr++ = const_cons.tile_min_.as<IntImm>()->value; | |||||
| *ptr++ = const_cons.tile_extent_.as<IntImm>()->value; | |||||
| } else if (con == "C1_mod") { | |||||
| TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE1); | |||||
| *ptr++ = const_cons.tile_mod_.as<IntImm>()->value; | |||||
| } else if (con == "C0_mod") { | |||||
| TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE0); | |||||
| *ptr++ = const_cons.tile_mod_.as<IntImm>()->value; | |||||
| *ptr++ = analyzer_.binding_spaces_[i].map_min_; | |||||
| *ptr++ = analyzer_.binding_spaces_[i].map_extent_; | |||||
| } | |||||
| } | |||||
| } else { | |||||
| for (size_t b_idx = 0; b_idx < all_axes.size(); ++b_idx) { | |||||
| for (size_t a_idx = 0; a_idx < all_axes[b_idx].size(); ++a_idx) { | |||||
| if (con == "index") { | |||||
| *ptr++ = b_idx; | |||||
| *ptr++ = a_idx; | |||||
| } else { | |||||
| if (con == "C1_range") { | |||||
| TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE1); | |||||
| *ptr++ = const_cons.tile_min_.as<IntImm>()->value; | |||||
| *ptr++ = const_cons.tile_extent_.as<IntImm>()->value; | |||||
| } else if (con == "C0_range") { | |||||
| TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE0); | |||||
| *ptr++ = const_cons.tile_min_.as<IntImm>()->value; | |||||
| *ptr++ = const_cons.tile_extent_.as<IntImm>()->value; | |||||
| } else if (con == "C1_mod") { | |||||
| TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE1); | |||||
| *ptr++ = const_cons.tile_mod_.as<IntImm>()->value; | |||||
| } else if (con == "C0_mod") { | |||||
| TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE0); | |||||
| *ptr++ = const_cons.tile_mod_.as<IntImm>()->value; | |||||
| } | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| if (con == "index") space_->index_table = array; | if (con == "index") space_->index_table = array; | ||||
| if (con == "C1_range") space_->c1_tile_range_table = array; | if (con == "C1_range") space_->c1_tile_range_table = array; | ||||
| if (con == "C0_range") space_->c0_tile_range_table = array; | if (con == "C0_range") space_->c0_tile_range_table = array; | ||||
| if (con == "C1_mod") space_->c1_tile_mod_table = array; | if (con == "C1_mod") space_->c1_tile_mod_table = array; | ||||
| if (con == "C0_mod") space_->c0_tile_mod_table = array; | if (con == "C0_mod") space_->c0_tile_mod_table = array; | ||||
| if (con == "gpu_thread_range") space_->gpu_thread_range_table = array; | |||||
| if (con == "gpu_block_range") space_->gpu_block_range_table = array; | |||||
| if (con == "gpu_thread_mod") space_->gpu_thread_mod_table = array; | |||||
| if (con == "gpu_block_mod") space_->gpu_block_mod_table = array; | |||||
| delete spaceDlPack; | delete spaceDlPack; | ||||
| } | } | ||||
| } | } | ||||
| @@ -196,7 +228,8 @@ class TileSpaceCollector { | |||||
| bool min_tile_ok = false; | bool min_tile_ok = false; | ||||
| for (int64_t tile = tile_min->value; tile <= tile_extent->value; ++tile) { | for (int64_t tile = tile_min->value; tile <= tile_extent->value; ++tile) { | ||||
| bool break_constraint = | bool break_constraint = | ||||
| (tile != tile_min->value) && (tile != tile_extent->value) && (tile % tile_mod->value != 0); | |||||
| ((tile != tile_min->value) && (tile != tile_extent->value) && (tile % tile_mod->value != 0)) || | |||||
| (axis->forbid_iso && tile_extent->value % tile != 0); | |||||
| if (analyzer_.scop_info_.user_config_.GetPruneTuningSpace() && break_constraint) { | if (analyzer_.scop_info_.user_config_.GetPruneTuningSpace() && break_constraint) { | ||||
| continue; | continue; | ||||
| } | } | ||||
| @@ -365,7 +398,7 @@ class TileSpaceCollector { | |||||
| DLContext ctx = {kDLCPU, 0}; | DLContext ctx = {kDLCPU, 0}; | ||||
| std::vector<TileAxis *> tile_axes_; | std::vector<TileAxis *> tile_axes_; | ||||
| std::vector<bool> is_shared_; | std::vector<bool> is_shared_; | ||||
| std::unordered_set<std::string> cared_info_ = {"index", "C1_range", "C0_range", "C1_mod", "C0_mod"}; | |||||
| std::unordered_set<std::string> cared_info_; | |||||
| struct Result { | struct Result { | ||||
| std::vector<int> tile; | std::vector<int> tile; | ||||
| @@ -28,6 +28,11 @@ class TileSpaceNode : public Node { | |||||
| air::runtime::NDArray c1_tile_mod_table; | air::runtime::NDArray c1_tile_mod_table; | ||||
| air::runtime::NDArray c0_tile_mod_table; | air::runtime::NDArray c0_tile_mod_table; | ||||
| air::runtime::NDArray tiling_candidate; | air::runtime::NDArray tiling_candidate; | ||||
| air::runtime::NDArray gpu_thread_range_table; | |||||
| air::runtime::NDArray gpu_block_range_table; | |||||
| air::runtime::NDArray gpu_thread_mod_table; | |||||
| air::runtime::NDArray gpu_block_mod_table; | |||||
| void VisitAttrs(AttrVisitor *v) { | void VisitAttrs(AttrVisitor *v) { | ||||
| v->Visit("index_table", &index_table); | v->Visit("index_table", &index_table); | ||||
| @@ -36,6 +41,11 @@ class TileSpaceNode : public Node { | |||||
| v->Visit("c1_tile_mod_table", &c1_tile_mod_table); | v->Visit("c1_tile_mod_table", &c1_tile_mod_table); | ||||
| v->Visit("c0_tile_mod_table", &c0_tile_mod_table); | v->Visit("c0_tile_mod_table", &c0_tile_mod_table); | ||||
| v->Visit("tiling_candidate", &tiling_candidate); | v->Visit("tiling_candidate", &tiling_candidate); | ||||
| v->Visit("gpu_thread_range_table", &gpu_thread_range_table); | |||||
| v->Visit("gpu_block_range_table", &gpu_block_range_table); | |||||
| v->Visit("gpu_thread_mod_table", &gpu_thread_mod_table); | |||||
| v->Visit("gpu_block_mod_table", &gpu_block_mod_table); | |||||
| } | } | ||||
| static constexpr const char *_type_key = "TileSpace"; | static constexpr const char *_type_key = "TileSpace"; | ||||
| TVM_DECLARE_NODE_TYPE_INFO(TileSpaceNode, Node); | TVM_DECLARE_NODE_TYPE_INFO(TileSpaceNode, Node); | ||||
| @@ -1351,19 +1351,34 @@ void TilingAnalyzer::AddPostTilingConstraints() { | |||||
| if (scop_info_.user_config_.GetTarget() == TARGET_CUDA) { | if (scop_info_.user_config_.GetTarget() == TARGET_CUDA) { | ||||
| ReduceStrategy reduce_strategy(this); | ReduceStrategy reduce_strategy(this); | ||||
| actived_strategies.push_back(&reduce_strategy); | |||||
| ModStrategy mod_strategy(this); | ModStrategy mod_strategy(this); | ||||
| actived_strategies.push_back(&mod_strategy); | |||||
| GemmStrategy gemm_strategy(this); | |||||
| GpuDmaAnalysisStrategy dma_analysis_strategy(this); | GpuDmaAnalysisStrategy dma_analysis_strategy(this); | ||||
| CustomTilingStrategy custom_strategy(this); | |||||
| GpuStrategy gpu_strategy(this); | GpuStrategy gpu_strategy(this); | ||||
| if (scop_info_.analysis_result_.GetIsGpuDmaAnalysed()) { | if (scop_info_.analysis_result_.GetIsGpuDmaAnalysed()) { | ||||
| actived_strategies.push_back(&dma_analysis_strategy); | actived_strategies.push_back(&dma_analysis_strategy); | ||||
| } else { | } else { | ||||
| if (scop_info_.user_config_.GetIsTuning()) { | |||||
| actived_strategies.push_back(&custom_strategy); | |||||
| } else { | |||||
| actived_strategies.push_back(&reduce_strategy); | |||||
| actived_strategies.push_back(&mod_strategy); | |||||
| actived_strategies.push_back(&gemm_strategy); | |||||
| } | |||||
| actived_strategies.push_back(&gpu_strategy); | actived_strategies.push_back(&gpu_strategy); | ||||
| } | } | ||||
| strategy_manager->SetStrategies(actived_strategies); | strategy_manager->SetStrategies(actived_strategies); | ||||
| strategy_manager->ExecuteGpu(); | strategy_manager->ExecuteGpu(); | ||||
| if (scop_info_.user_config_.GetIsTuning()) { | |||||
| binding_spaces_.clear(); | |||||
| for (auto i : gpu_strategy.thread_binding_spaces_) { | |||||
| UpdateBindingSpace(i); | |||||
| } | |||||
| for (auto i : gpu_strategy.block_binding_spaces_) { | |||||
| UpdateBindingSpace(i); | |||||
| } | |||||
| } | |||||
| return; | return; | ||||
| } | } | ||||
| } | } | ||||
| @@ -1376,7 +1391,6 @@ void TilingAnalyzer::AddTilingConstraints() { | |||||
| if (scop_info_.user_config_.GetTarget() == TARGET_CUDA) { | if (scop_info_.user_config_.GetTarget() == TARGET_CUDA) { | ||||
| CastStrategy cast_strategy(this); | CastStrategy cast_strategy(this); | ||||
| actived_strategies.push_back(&cast_strategy); | actived_strategies.push_back(&cast_strategy); | ||||
| strategy_manager->SetStrategies(actived_strategies); | strategy_manager->SetStrategies(actived_strategies); | ||||
| strategy_manager->ExecuteGpu(); | strategy_manager->ExecuteGpu(); | ||||
| return; | return; | ||||
| @@ -1429,7 +1443,7 @@ void TilingAnalyzer::AddTilingConstraints() { | |||||
| bool TilingAnalyzer::Prepare() { | bool TilingAnalyzer::Prepare() { | ||||
| logger_ = std::unique_ptr<TileLogger>(new (std::nothrow) TileLogger( | logger_ = std::unique_ptr<TileLogger>(new (std::nothrow) TileLogger( | ||||
| scop_info_.AddDumpDir("tiling.log"), !scop_info_.user_config_.GetDumpPolyDir().empty())); | |||||
| scop_info_.AddDumpDir("tiling.log"), !scop_info_.user_config_.GetDumpPolyDir().empty())); | |||||
| CHECK(logger_) << "memory alloc fail."; | CHECK(logger_) << "memory alloc fail."; | ||||
| // Stage 1: Analyze schedule tree. | // Stage 1: Analyze schedule tree. | ||||
| ScheduleTreeAnalyzer sch_ana(this, this->sch_); | ScheduleTreeAnalyzer sch_ana(this, this->sch_); | ||||
| @@ -64,7 +64,7 @@ inline int64_t GetAlignBytes(const int64_t dtype) { | |||||
| return (ALIGN_BYTES + dtype - 1) / dtype; | return (ALIGN_BYTES + dtype - 1) / dtype; | ||||
| } | } | ||||
| inline int64_t GetMaxAlignBytes(std::unordered_map<std::string, std::vector<int>> dtypes) { | |||||
| inline int64_t GetMinBytes(std::unordered_map<std::string, std::vector<int>> dtypes) { | |||||
| int64_t min_byte = -1; | int64_t min_byte = -1; | ||||
| for (const auto &it : dtypes) { | for (const auto &it : dtypes) { | ||||
| if (it.second.empty()) { | if (it.second.empty()) { | ||||
| @@ -75,7 +75,11 @@ inline int64_t GetMaxAlignBytes(std::unordered_map<std::string, std::vector<int> | |||||
| min_byte = min_elem; | min_byte = min_elem; | ||||
| } | } | ||||
| } | } | ||||
| return GetAlignBytes(min_byte); | |||||
| return min_byte; | |||||
| } | |||||
| inline int64_t GetMaxAlignBytes(std::unordered_map<std::string, std::vector<int>> dtypes) { | |||||
| return GetAlignBytes(GetMinBytes(dtypes)); | |||||
| } | } | ||||
| inline Expr CastToExpr(const std::string &value) { | inline Expr CastToExpr(const std::string &value) { | ||||
| @@ -134,6 +138,12 @@ constexpr auto AT_DYNAMIC_BOUND = "DYNAMIC_BOUND"; | |||||
| constexpr auto AT_MOD = "MOD"; | constexpr auto AT_MOD = "MOD"; | ||||
| constexpr auto AT_CAST = "CAST"; | constexpr auto AT_CAST = "CAST"; | ||||
| constexpr auto AT_MEM_RATIO = "MEM_RATIO"; | constexpr auto AT_MEM_RATIO = "MEM_RATIO"; | ||||
| constexpr auto AT_THREAD_MIN = "THREAD_MIN"; | |||||
| constexpr auto AT_THREAD_MAX = "THREAD_MAX"; | |||||
| constexpr auto AT_THREAD_MOD = "THREAD_MOD"; | |||||
| constexpr auto AT_BLOCK_MIN = "BLOCK_MIN"; | |||||
| constexpr auto AT_BLOCK_MAX = "BLOCK_MAX"; | |||||
| constexpr auto AT_BLOCK_MOD = "BLOCK_MOD"; | |||||
| class TilingAnalyzer; | class TilingAnalyzer; | ||||
| @@ -233,12 +243,12 @@ class TilingAnalyzer { | |||||
| sch_(sch), | sch_(sch), | ||||
| scop_info_(scop_info), | scop_info_(scop_info), | ||||
| is_retry_(!global_attrs.GetStringAttr(kErrorInfo, "").empty()) { | is_retry_(!global_attrs.GetStringAttr(kErrorInfo, "").empty()) { | ||||
| if (scop_info.mmu_info_.IsGemm()) { | |||||
| op_type_ = GEMM_OP; | |||||
| } else if (scop_info.mmu_info_.IsConv()) { | |||||
| op_type_ = CONV_OP; | |||||
| } else { | |||||
| op_type_ = VECTOR_OP; | |||||
| if (scop_info.mmu_info_.IsGemm()) { | |||||
| op_type_ = GEMM_OP; | |||||
| } else if (scop_info.mmu_info_.IsConv()) { | |||||
| op_type_ = CONV_OP; | |||||
| } else { | |||||
| op_type_ = VECTOR_OP; | |||||
| } | } | ||||
| } | } | ||||
| @@ -292,7 +302,7 @@ class TilingAnalyzer { | |||||
| CHECK(logger_); | CHECK(logger_); | ||||
| return *(logger_.get()); | return *(logger_.get()); | ||||
| } | } | ||||
| void UpdateBindingSpace(TileAxis::MappingConstraint constraint) { binding_spaces_.emplace_back(constraint); } | |||||
| Stmt body_; | Stmt body_; | ||||
| Binds &binds_; | Binds &binds_; | ||||
| isl::schedule sch_; | isl::schedule sch_; | ||||
| @@ -306,9 +316,8 @@ class TilingAnalyzer { | |||||
| std::unordered_map<TilingAnalyzer::BufferEntry *, std::pair<int, int>> buffer_usage_timetable_; | std::unordered_map<TilingAnalyzer::BufferEntry *, std::pair<int, int>> buffer_usage_timetable_; | ||||
| std::unordered_map<std::string, std::shared_ptr<BufferEntry>> buf_info_; | std::unordered_map<std::string, std::shared_ptr<BufferEntry>> buf_info_; | ||||
| bool is_retry_{false}; | bool is_retry_{false}; | ||||
| std::vector<TileAxis::MappingConstraint> binding_spaces_; // [thread.x[min, max, mod], thread.y, thread.z, block.x, block.y, block.z] | |||||
| private: | private: | ||||
| void AddTilingConstraints(); | void AddTilingConstraints(); | ||||
| void AddPostTilingConstraints(); | void AddPostTilingConstraints(); | ||||
| @@ -58,8 +58,10 @@ class TilingStrategy { | |||||
| // gpu configs | // gpu configs | ||||
| int64_t warp_sizes_ = 32; | int64_t warp_sizes_ = 32; | ||||
| int64_t max_num_blocks_ = 256 * 256; | |||||
| int64_t max_num_threads_ = 1024; | |||||
| int64_t max_x_dim_block_ = pow(2, 31) - 1; | |||||
| int64_t max_y_z_dim_block_ = 65535; | |||||
| int64_t max_x_y_dim_thread_ = 1024; | |||||
| int64_t max_z_dim_thread_ = 64; | |||||
| size_t max_dim_ = 3; | size_t max_dim_ = 3; | ||||
| int64_t max_elem_per_thread_ = 1024; | int64_t max_elem_per_thread_ = 1024; | ||||
| }; | }; | ||||
| @@ -284,8 +286,6 @@ class GemmStrategy : public TilingStrategy { | |||||
| ~GemmStrategy() {} | ~GemmStrategy() {} | ||||
| void AddNpuConstraint(); | void AddNpuConstraint(); | ||||
| void AddGpuConstraint(); | void AddGpuConstraint(); | ||||
| std::string interested_attr_key = AT_GEMM; | |||||
| }; | }; | ||||
| class GpuStrategy : public TilingStrategy { | class GpuStrategy : public TilingStrategy { | ||||
| @@ -306,6 +306,8 @@ class GpuStrategy : public TilingStrategy { | |||||
| }; | }; | ||||
| void AddNpuConstraint(); | void AddNpuConstraint(); | ||||
| void AddGpuConstraint(); | void AddGpuConstraint(); | ||||
| std::vector<TileAxis::MappingConstraint> thread_binding_spaces_; // [thread.x, thread.y, thread.z] | |||||
| std::vector<TileAxis::MappingConstraint> block_binding_spaces_; // [block.x, block.y, block.z] | |||||
| private: | private: | ||||
| void DetermineTemplate(); | void DetermineTemplate(); | ||||
| @@ -326,6 +328,8 @@ class GpuStrategy : public TilingStrategy { | |||||
| // Step 1. Collect axes and sort them from inner to outer | // Step 1. Collect axes and sort them from inner to outer | ||||
| void BuildAxesQueue(); | void BuildAxesQueue(); | ||||
| void ApplyCustomConstraint(); | |||||
| /* | /* | ||||
| * Step 2. Tile inner axes first and map them to threads, and then tile outer axis and map the rest of them to blocks. | * Step 2. Tile inner axes first and map them to threads, and then tile outer axis and map the rest of them to blocks. | ||||
| * e.g. | * e.g. | ||||
| @@ -342,6 +346,8 @@ class GpuStrategy : public TilingStrategy { | |||||
| // Step 3. Transform list of integer into string mapping config. | // Step 3. Transform list of integer into string mapping config. | ||||
| void SetMappingConfig(); | void SetMappingConfig(); | ||||
| int GetLocalAllocBufCount(); | |||||
| Template template_{Template::DEFAULT}; | Template template_{Template::DEFAULT}; | ||||
| bool is_reduce_op_[TEMPLATE_BULK] = {false, false, true, true, true, false}; | bool is_reduce_op_[TEMPLATE_BULK] = {false, false, true, true, true, false}; | ||||
| @@ -350,13 +356,12 @@ class GpuStrategy : public TilingStrategy { | |||||
| std::vector<int64_t> thread_limit_; | std::vector<int64_t> thread_limit_; | ||||
| std::vector<int64_t> block_cfg_; | std::vector<int64_t> block_cfg_; | ||||
| std::vector<int64_t> thread_cfg_; | std::vector<int64_t> thread_cfg_; | ||||
| int64_t max_x_y_dim_thread_ = 1024; | |||||
| int64_t max_z_dim_thread_ = 64; | |||||
| int block_count_{0}; // number of mapped blocks | int block_count_{0}; // number of mapped blocks | ||||
| int64_t elem_per_thread_[3]{SpItemPerThread::AUTO}; | int64_t elem_per_thread_[3]{SpItemPerThread::AUTO}; | ||||
| int64_t min_elem_for_io_bound_ = 2; | int64_t min_elem_for_io_bound_ = 2; | ||||
| size_t depth_{0}; | size_t depth_{0}; | ||||
| bool need_reverse_{false}; | bool need_reverse_{false}; | ||||
| bool reverse_binding_{false}; | |||||
| int64_t fused_size_{1}; | int64_t fused_size_{1}; | ||||
| std::unordered_map<int, std::string> template_map_ = {{0, "DEFAULT"}, {1, "PURE_ELEM"}, {2, "BROADCAST_OP"}, | std::unordered_map<int, std::string> template_map_ = {{0, "DEFAULT"}, {1, "PURE_ELEM"}, {2, "BROADCAST_OP"}, | ||||
| {3, "REDUCTION"}, {4, "ALL_REDUCE"}, {5, "BITWISE_REDUCTION"}, | {3, "REDUCTION"}, {4, "ALL_REDUCE"}, {5, "BITWISE_REDUCTION"}, | ||||
| @@ -378,7 +383,7 @@ class MulticoreStrategy { | |||||
| class TilingPriorityScorer { | class TilingPriorityScorer { | ||||
| public: | public: | ||||
| TilingPriorityScorer(TilingAnalyzer &analyzer) : analyzer_(analyzer), logger_(analyzer.GetTileLogger()) {} | |||||
| TilingPriorityScorer(TilingAnalyzer &analyzer) : analyzer_(analyzer), logger_(analyzer.GetTileLogger()) {} | |||||
| ~TilingPriorityScorer() {} | ~TilingPriorityScorer() {} | ||||
| /* | /* | ||||
| @@ -18,7 +18,6 @@ | |||||
| #include <numeric> | #include <numeric> | ||||
| #include "tiling_analyzer.h" | #include "tiling_analyzer.h" | ||||
| namespace akg { | namespace akg { | ||||
| namespace ir { | namespace ir { | ||||
| namespace poly { | namespace poly { | ||||
| @@ -174,13 +173,13 @@ void ReduceStrategy::AkgReduceLibStrategyOnGpu() { | |||||
| int64_t min_blocks = square_thread ? 32 : 512; | int64_t min_blocks = square_thread ? 32 : 512; | ||||
| int64_t min_elem_per_thread = use_local ? 2 : 8; | int64_t min_elem_per_thread = use_local ? 2 : 8; | ||||
| int64_t min_ty = 8; | int64_t min_ty = 8; | ||||
| if (total_injective_size * total_reduce_size / min_blocks / max_num_threads_ < min_elem_per_thread) { | |||||
| if (total_injective_size * total_reduce_size / min_blocks / max_x_y_dim_thread_ < min_elem_per_thread) { | |||||
| min_blocks = 32; | min_blocks = 32; | ||||
| min_ty = square_thread ? min_ty : 1; | min_ty = square_thread ? min_ty : 1; | ||||
| } | } | ||||
| std::pair<int64_t, int64_t> tx_range{1, max_num_threads_}; | |||||
| std::pair<int64_t, int64_t> ty_range{1, max_num_threads_}; | |||||
| std::pair<int64_t, int64_t> tx_range{1, max_x_y_dim_thread_}; | |||||
| std::pair<int64_t, int64_t> ty_range{1, max_x_y_dim_thread_}; | |||||
| auto AlignToPowerOfTwo = [](int64_t original_factor) -> int64_t { | auto AlignToPowerOfTwo = [](int64_t original_factor) -> int64_t { | ||||
| while ((original_factor) & (original_factor - 1)) { | while ((original_factor) & (original_factor - 1)) { | ||||
| --original_factor; | --original_factor; | ||||
| @@ -340,9 +339,9 @@ void ReduceStrategy::DealWith4DFusedReduce() { | |||||
| continue; | continue; | ||||
| } | } | ||||
| axis->TileRestrainToSingleValue(CastIntToExpr(last_mod_value), TileLevel::CACHE1); | axis->TileRestrainToSingleValue(CastIntToExpr(last_mod_value), TileLevel::CACHE1); | ||||
| if (last_mod_value > max_num_threads_) { | |||||
| if (last_mod_value > max_x_y_dim_thread_) { | |||||
| LOG(WARNING) << "Cannot bind axis to " << last_mod_value << " threads, maximal thread number is " | LOG(WARNING) << "Cannot bind axis to " << last_mod_value << " threads, maximal thread number is " | ||||
| << max_num_threads_ | |||||
| << max_x_y_dim_thread_ | |||||
| << ". If fusing more than two axes together, footprint box calculated by isl may not be correct."; | << ". If fusing more than two axes together, footprint box calculated by isl may not be correct."; | ||||
| continue; | continue; | ||||
| } | } | ||||
| @@ -377,13 +376,141 @@ void ReduceStrategy::DealWithPostReduceTensors() { | |||||
| } | } | ||||
| } | } | ||||
| int GpuStrategy::GetLocalAllocBufCount() { | |||||
| int count = 0; | |||||
| for (auto &it : analyzer_->buf_info_) { | |||||
| auto buf = it.second.get(); | |||||
| CHECK(buf); | |||||
| if (buf->scope == TilingMemScope::MEM_SCOPE_LOCAL) { | |||||
| count++; | |||||
| } | |||||
| } | |||||
| return count; | |||||
| } | |||||
| void GpuStrategy::ApplyCustomConstraint() { | |||||
| auto ParseBindingConstraint = [](const std::string constraint, size_t max_size) { | |||||
| std::vector<std::string> sp = akg::common::Split(constraint, ","); | |||||
| std::vector<int64_t> ret; | |||||
| for (auto val : sp) { | |||||
| if (ret.size() == max_size) { | |||||
| break; | |||||
| } | |||||
| CHECK(!val.empty()); | |||||
| ret.emplace_back(static_cast<int>(std::strtol(val.c_str(), nullptr, 10))); | |||||
| } | |||||
| return ret; | |||||
| }; | |||||
| // init binding space through template-determined limit | |||||
| thread_binding_spaces_.clear(); | |||||
| block_binding_spaces_.clear(); | |||||
| for (size_t i = 0; i < thread_limit_.size(); ++i) { | |||||
| TileAxis::MappingConstraint elem; | |||||
| elem.map_extent_ = thread_limit_[i]; | |||||
| thread_binding_spaces_.emplace_back(elem); | |||||
| } | |||||
| for (size_t i = 0; i < std::min(depth_, block_limit_.size()); ++i) { | |||||
| TileAxis::MappingConstraint elem; | |||||
| elem.map_extent_ = block_limit_[i]; | |||||
| block_binding_spaces_.emplace_back(elem); | |||||
| } | |||||
| // add constraints to binding space according to custom tiling | |||||
| std::unordered_set<std::string> thread_keys = {AT_THREAD_MIN, AT_THREAD_MAX, AT_THREAD_MOD}; | |||||
| std::unordered_set<std::string> block_keys = {AT_BLOCK_MIN, AT_BLOCK_MAX, AT_BLOCK_MOD}; | |||||
| for (const auto attr : analyzer_->RootAxis()->attrs) { | |||||
| std::vector<int64_t> constraint; | |||||
| std::vector<TileAxis::MappingConstraint> target; | |||||
| if (thread_keys.find(attr.attr_key) != thread_keys.end()) { | |||||
| constraint = ParseBindingConstraint(attr.attr_value, thread_binding_spaces_.size()); | |||||
| target = thread_binding_spaces_; | |||||
| } else if (block_keys.find(attr.attr_key) != block_keys.end()) { | |||||
| constraint = ParseBindingConstraint(attr.attr_value, block_binding_spaces_.size()); | |||||
| target = block_binding_spaces_; | |||||
| } | |||||
| if (constraint.empty()) { | |||||
| continue; | |||||
| } | |||||
| for (size_t i = 0; i < constraint.size(); ++i) { | |||||
| if (attr.attr_key.find("MIN") != std::string::npos) { | |||||
| target[i].map_min_ = std::max<int64_t>(target[i].map_min_, constraint[i]); | |||||
| } else if (attr.attr_key.find("MAX") != std::string::npos && constraint[i] > 0) { | |||||
| target[i].map_extent_ = std::min<int64_t>(target[i].map_extent_, constraint[i]); | |||||
| } else if (attr.attr_key.find("MOD") != std::string::npos) { | |||||
| target[i].map_mod_ = std::max<int64_t>(1, constraint[i]); | |||||
| } | |||||
| } | |||||
| if (thread_keys.find(attr.attr_key) != thread_keys.end()) { | |||||
| thread_binding_spaces_ = target; | |||||
| } else if (block_keys.find(attr.attr_key) != block_keys.end()) { | |||||
| block_binding_spaces_ = target; | |||||
| } | |||||
| } | |||||
| // apply custom constraint to corresponding axis and modify binding space according to tile range of axis | |||||
| size_t cur_depth = 0; | |||||
| analyzer_->ForEachAxisTopDown([this, &cur_depth](TileAxis *axis) { | |||||
| if (axis == analyzer_->RootAxis()) { | |||||
| return; | |||||
| } | |||||
| auto cons = axis->GetConstConstraint(CACHE1); | |||||
| auto range_extent = axis->GetConstExtent(); | |||||
| int tile_min = cons.tile_min_.as<IntImm>()->value; | |||||
| int tile_extent = cons.tile_extent_.as<IntImm>()->value; | |||||
| auto idx = reverse_binding_ ? cur_depth : depth_ - 1 - cur_depth; | |||||
| auto thread_extent = tile_extent; | |||||
| if (idx < thread_binding_spaces_.size()) { | |||||
| thread_extent = std::min<int64_t>(thread_extent, thread_binding_spaces_[idx].map_extent_); | |||||
| thread_binding_spaces_[idx].map_extent_ = thread_extent; | |||||
| } | |||||
| auto block_extent = range_extent / tile_min; | |||||
| if (idx < block_binding_spaces_.size()) { | |||||
| block_extent = std::min<int64_t>(block_extent, block_binding_spaces_[idx].map_extent_); | |||||
| block_binding_spaces_[idx].map_extent_ = block_extent; | |||||
| } | |||||
| auto block_min = block_extent / std::max<int64_t>(1, thread_extent); | |||||
| if (idx < block_binding_spaces_.size()) { | |||||
| block_min = std::max<int64_t>(block_min, block_binding_spaces_[idx].map_min_); | |||||
| block_binding_spaces_[idx].map_min_ = block_min; | |||||
| } | |||||
| axis->thread_constraints.map_extent_ = thread_extent; | |||||
| axis->block_constraints.map_extent_ = block_extent; | |||||
| axis->block_constraints.map_min_ = block_min; | |||||
| if (idx < thread_binding_spaces_.size()) { | |||||
| axis->thread_constraints.map_mod_ = thread_binding_spaces_[idx].map_mod_; | |||||
| } | |||||
| if (idx < block_binding_spaces_.size()) { | |||||
| axis->block_constraints.map_mod_ = block_binding_spaces_[idx].map_mod_; | |||||
| } | |||||
| ++cur_depth; | |||||
| }); | |||||
| } | |||||
| void GpuStrategy::AddGpuConstraint() { | void GpuStrategy::AddGpuConstraint() { | ||||
| InitMappingLimit(); | InitMappingLimit(); | ||||
| if (template_ == Template::BROADCAST_OP || template_ == Template::CUSTOM_CONFIG) { | |||||
| if (!analyzer_->scop_info_.user_config_.GetIsTuning() && | |||||
| (template_ == Template::BROADCAST_OP || template_ == Template::CUSTOM_CONFIG)) { | |||||
| BroadcastSpeedup(); | BroadcastSpeedup(); | ||||
| } | } | ||||
| BuildAxesQueue(); | BuildAxesQueue(); | ||||
| if (analyzer_->scop_info_.user_config_.GetIsTuning()) { | if (analyzer_->scop_info_.user_config_.GetIsTuning()) { | ||||
| ApplyCustomConstraint(); | |||||
| for (size_t i = 0; i < max_dim_; ++i) { | |||||
| TileAxis::MappingConstraint pad; | |||||
| if (i >= thread_binding_spaces_.size()) { | |||||
| thread_binding_spaces_.emplace_back(pad); | |||||
| } | |||||
| if (i >= block_binding_spaces_.size()) { | |||||
| block_binding_spaces_.emplace_back(pad); | |||||
| } | |||||
| } | |||||
| return; | return; | ||||
| } | } | ||||
| InnerThreadOuterBlock(); | InnerThreadOuterBlock(); | ||||
| @@ -391,19 +518,36 @@ void GpuStrategy::AddGpuConstraint() { | |||||
| InjectiveSpeedup(); | InjectiveSpeedup(); | ||||
| } | } | ||||
| SetMappingConfig(); | SetMappingConfig(); | ||||
| if (template_ != Template::MATMUL || !analyzer_->scop_info_.user_config_.GetEnableTensorCore()) { | |||||
| analyzer_->ForEachAxisTopDown([this](TileAxis *axis) { | |||||
| if (axis == analyzer_->RootAxis()) { | |||||
| return; | |||||
| } | |||||
| axis->TileRestrainToSingleValue(axis->c1_constraints.tile_min_, TileLevel::CACHE0); | |||||
| }); | |||||
| } | |||||
| // TODO: This is a very naive strategy to avoid cuda launch out of resources | |||||
| // and we should fix this in register memory promotion pass. | |||||
| if (template_ != Template::REDUCTION && template_ != Template::ALL_REDUCE) { | |||||
| auto local_buf_count = GetLocalAllocBufCount(); | |||||
| auto thread_size = std::accumulate(thread_cfg_.begin(), thread_cfg_.end(), 1, std::multiplies<int>()); | |||||
| if (local_buf_count >= 4 || local_buf_count * 4 * thread_size >= 65536) { | |||||
| analyzer_->scop_info_.user_config_.SetUseRegisterMemory(false); | |||||
| } | |||||
| } | |||||
| } | } | ||||
| void GpuStrategy::InitMappingLimit() { | void GpuStrategy::InitMappingLimit() { | ||||
| max_num_threads_ = analyzer_->scop_info_.user_config_.GetMaxElemPerThread(); | |||||
| max_x_y_dim_thread_ = analyzer_->scop_info_.user_config_.GetMaxElemPerThread(); | |||||
| DetermineTemplate(); | DetermineTemplate(); | ||||
| std::stringstream ss; | std::stringstream ss; | ||||
| need_reverse_ = analyzer_->scop_info_.user_config_.GetEnableAkgReduceLib() && | |||||
| analyzer_->scop_info_.analysis_result_.GetReduceDirection() == Y_DIRECTION; | |||||
| reverse_binding_ = analyzer_->scop_info_.user_config_.GetEnableAkgReduceLib() && | |||||
| analyzer_->scop_info_.analysis_result_.GetReduceDirection() == Y_DIRECTION; | |||||
| if (template_ == Template::CUSTOM_CONFIG) { | if (template_ == Template::CUSTOM_CONFIG) { | ||||
| auto thread_config = analyzer_->scop_info_.user_config_.GetThreadConfig(); | auto thread_config = analyzer_->scop_info_.user_config_.GetThreadConfig(); | ||||
| for (size_t i = 0; i < thread_config->bound; ++i) { | for (size_t i = 0; i < thread_config->bound; ++i) { | ||||
| auto idx = need_reverse_ ? thread_config->bound - 1 - i : i; | |||||
| auto idx = reverse_binding_ ? thread_config->bound - 1 - i : i; | |||||
| if (idx >= depth_) { | if (idx >= depth_) { | ||||
| continue; | continue; | ||||
| } | } | ||||
| @@ -427,29 +571,33 @@ void GpuStrategy::InitMappingLimit() { | |||||
| } else if (template_ == Template::MATMUL) { | } else if (template_ == Template::MATMUL) { | ||||
| // This is a naive tiling strategy used in gpu when thread and block configs are already set. | // This is a naive tiling strategy used in gpu when thread and block configs are already set. | ||||
| // This strategy will tile up to three inner-most axes to 32 (for thread binding). | // This strategy will tile up to three inner-most axes to 32 (for thread binding). | ||||
| thread_limit_ = {32, 8}; | |||||
| if (analyzer_->scop_info_.user_config_.GetEnableTensorCore()) { | |||||
| thread_limit_ = {warp_sizes_, 16}; | |||||
| } else { | |||||
| thread_limit_ = {warp_sizes_, 8}; | |||||
| } | |||||
| } else { | } else { | ||||
| thread_limit_ = {max_x_y_dim_thread_, max_x_y_dim_thread_, max_z_dim_thread_}; | thread_limit_ = {max_x_y_dim_thread_, max_x_y_dim_thread_, max_z_dim_thread_}; | ||||
| } | } | ||||
| if (template_ != Template::CUSTOM_CONFIG) { | |||||
| if (template_ != Template::CUSTOM_CONFIG && !analyzer_->scop_info_.user_config_.GetEnableTensorCore()) { | |||||
| AdjustThreadMappingLimit(); | AdjustThreadMappingLimit(); | ||||
| } | } | ||||
| if (template_ == Template::CUSTOM_CONFIG) { | if (template_ == Template::CUSTOM_CONFIG) { | ||||
| auto block_config = analyzer_->scop_info_.user_config_.GetBlockConfig(); | auto block_config = analyzer_->scop_info_.user_config_.GetBlockConfig(); | ||||
| for (int i = block_config->bound - 1; i >= 0; --i) { | |||||
| for (int i = 0; i < static_cast<int>(block_config->bound) - 1; ++i) { | |||||
| if (i >= static_cast<int>(depth_)) { | if (i >= static_cast<int>(depth_)) { | ||||
| continue; | |||||
| break; | |||||
| } | } | ||||
| block_limit_.emplace_back(block_config->GetAt(i).second); | block_limit_.emplace_back(block_config->GetAt(i).second); | ||||
| } | } | ||||
| } else if (template_ <= Template::REDUCTION) { | } else if (template_ <= Template::REDUCTION) { | ||||
| block_limit_ = {max_num_blocks_, max_num_blocks_, max_num_blocks_}; | |||||
| block_limit_ = {max_x_dim_block_, max_y_z_dim_block_, max_y_z_dim_block_}; | |||||
| } else if (template_ == Template::ALL_REDUCE && !analyzer_->scop_info_.user_config_.GetEnableAkgReduceLib()) { | } else if (template_ == Template::ALL_REDUCE && !analyzer_->scop_info_.user_config_.GetEnableAkgReduceLib()) { | ||||
| block_limit_ = {1}; | block_limit_ = {1}; | ||||
| } else { | } else { | ||||
| block_limit_ = {max_num_blocks_, max_num_blocks_, max_num_blocks_}; | |||||
| block_limit_ = {max_x_dim_block_, max_y_z_dim_block_, max_y_z_dim_block_}; | |||||
| } | } | ||||
| std::vector<std::string> elem_cfg = common::Split(analyzer_->scop_info_.user_config_.GetElemPerThread(), " "); | std::vector<std::string> elem_cfg = common::Split(analyzer_->scop_info_.user_config_.GetElemPerThread(), " "); | ||||
| @@ -490,13 +638,20 @@ void GpuStrategy::InnerThreadOuterBlock() { | |||||
| auto block_dim = std::min(block_limit_.size(), max_dim_); | auto block_dim = std::min(block_limit_.size(), max_dim_); | ||||
| // tile from inner to outer and map to thread | // tile from inner to outer and map to thread | ||||
| analyzer_->GetTileLogger().AppendLine(GPU_MAPPING, "-----Map to thread-----"); | |||||
| ss << "[Thread Limit]: "; | |||||
| for (auto l : thread_limit_) { | |||||
| ss << l << ", "; | |||||
| } | |||||
| analyzer_->GetTileLogger().AppendLog(GPU_MAPPING, ss); | |||||
| size_t ori_size = pending_axes_.size(); | size_t ori_size = pending_axes_.size(); | ||||
| size_t inner_dim = 0; | size_t inner_dim = 0; | ||||
| for (size_t i = 0; i < ori_size; ++i) { | for (size_t i = 0; i < ori_size; ++i) { | ||||
| TileAxis *axis; | TileAxis *axis; | ||||
| int64_t shape; | int64_t shape; | ||||
| std::tie(axis, shape) = pending_axes_[i]; | std::tie(axis, shape) = pending_axes_[i]; | ||||
| int64_t rest_threads = std::min(max_num_threads_ / activated_threads, thread_limit_[thread_cfg_.size()]); | |||||
| int64_t rest_threads = std::min(max_x_y_dim_thread_ / activated_threads, thread_limit_[thread_cfg_.size()]); | |||||
| ss << "axis " << axis->index << "_" << axis->dim_axis << " shape = " << shape | ss << "axis " << axis->index << "_" << axis->dim_axis << " shape = " << shape | ||||
| << ", rest_threads = " << rest_threads; | << ", rest_threads = " << rest_threads; | ||||
| auto SkipMapping = [this, &axis, &shape, &ss, &inner_dim, &thread_dim]() { | auto SkipMapping = [this, &axis, &shape, &ss, &inner_dim, &thread_dim]() { | ||||
| @@ -505,16 +660,26 @@ void GpuStrategy::InnerThreadOuterBlock() { | |||||
| tile = tile == SpItemPerThread::AUTO ? std::min(axis->thread_constraints.item_process_, max_elem_per_thread_) | tile = tile == SpItemPerThread::AUTO ? std::min(axis->thread_constraints.item_process_, max_elem_per_thread_) | ||||
| : tile == SpItemPerThread::FULL ? std::min(shape, max_elem_per_thread_) | : tile == SpItemPerThread::FULL ? std::min(shape, max_elem_per_thread_) | ||||
| : 1; | : 1; | ||||
| if (axis->block_constraints.map_extent_ > 1) { | |||||
| tile = | |||||
| std::max(tile, std::max<int64_t>(ceil(static_cast<float>(shape) / axis->block_constraints.map_extent_), 1)); | |||||
| pending_axes_.push_back(std::make_pair(axis, std::max<int64_t>(ceil(static_cast<float>(shape) / tile), 1))); | |||||
| ss << ", map to block."; | |||||
| auto tile_min = axis->c1_constraints.tile_min_.as<IntImm>()->value; | |||||
| auto tile_extent = axis->c1_constraints.tile_extent_.as<IntImm>()->value; | |||||
| if (tile_min == tile_extent && tile_extent != MIN_TILE) { | |||||
| ss << "tile extent is already determined = " << tile_extent; | |||||
| analyzer_->GetTileLogger().AppendLog(GPU_MAPPING, ss); | |||||
| tile = tile_min; | |||||
| } else { | } else { | ||||
| tile = std::min(tile, shape); | |||||
| if (axis->block_constraints.map_extent_ > 1) { | |||||
| tile = | |||||
| std::max(tile, std::max<int64_t>(ceil(static_cast<float>(shape) / axis->block_constraints.map_extent_), 1)); | |||||
| } else { | |||||
| tile = std::min(tile, shape); | |||||
| } | |||||
| } | } | ||||
| axis->TileRestrainLower(tile, TileLevel::CACHE1); | axis->TileRestrainLower(tile, TileLevel::CACHE1); | ||||
| ss << ", tile = " << tile; | ss << ", tile = " << tile; | ||||
| if (axis->block_constraints.map_extent_ > 1) { | |||||
| pending_axes_.push_back(std::make_pair(axis, std::max<int64_t>(ceil(static_cast<float>(shape) / tile), 1))); | |||||
| ss << ", map to block."; | |||||
| } | |||||
| analyzer_->GetTileLogger().AppendLog(GPU_MAPPING, ss); | analyzer_->GetTileLogger().AppendLog(GPU_MAPPING, ss); | ||||
| }; | }; | ||||
| @@ -535,6 +700,7 @@ void GpuStrategy::InnerThreadOuterBlock() { | |||||
| SkipMapping(); | SkipMapping(); | ||||
| continue; | continue; | ||||
| } | } | ||||
| auto item = elem_per_thread_[inner_dim] == SpItemPerThread::AUTO ? axis->thread_constraints.item_process_ | auto item = elem_per_thread_[inner_dim] == SpItemPerThread::AUTO ? axis->thread_constraints.item_process_ | ||||
| : elem_per_thread_[inner_dim]; | : elem_per_thread_[inner_dim]; | ||||
| item = std::min(item, max_elem_per_thread_); | item = std::min(item, max_elem_per_thread_); | ||||
| @@ -559,8 +725,9 @@ void GpuStrategy::InnerThreadOuterBlock() { | |||||
| if (template_ == Template::PURE_ELEM) { | if (template_ == Template::PURE_ELEM) { | ||||
| std::map<int64_t, std::vector<size_t>, std::greater<int64_t>> sorted_by_gcd; | std::map<int64_t, std::vector<size_t>, std::greater<int64_t>> sorted_by_gcd; | ||||
| for (size_t i = pending_axes_.size() - 1; i >= ori_size; --i) { | for (size_t i = pending_axes_.size() - 1; i >= ori_size; --i) { | ||||
| auto use = (max_num_blocks_ > 0 && pending_axes_[i].second > 0) | |||||
| ? TilingAnalyzer::FindDivisibleTilingFactor(max_num_blocks_, pending_axes_[i].second) | |||||
| auto block_limit = i == 0 ? max_x_dim_block_ : max_y_z_dim_block_; | |||||
| auto use = (block_limit > 0 && pending_axes_[i].second > 0) | |||||
| ? TilingAnalyzer::FindDivisibleTilingFactor(block_limit, pending_axes_[i].second) | |||||
| : 1; | : 1; | ||||
| if (sorted_by_gcd.find(use) == sorted_by_gcd.end()) { | if (sorted_by_gcd.find(use) == sorted_by_gcd.end()) { | ||||
| sorted_by_gcd[use] = {i}; | sorted_by_gcd[use] = {i}; | ||||
| @@ -575,6 +742,7 @@ void GpuStrategy::InnerThreadOuterBlock() { | |||||
| if (pending_axes_.size() - i > block_dim) { | if (pending_axes_.size() - i > block_dim) { | ||||
| auto axis = pending_axes_[i].first; | auto axis = pending_axes_[i].first; | ||||
| ss << "axis " << axis->index << "_" << axis->dim_axis | ss << "axis " << axis->index << "_" << axis->dim_axis | ||||
| << " exceeded block dim and should be mapped to block for higher performance, consider flatten"; | << " exceeded block dim and should be mapped to block for higher performance, consider flatten"; | ||||
| analyzer_->GetTileLogger().AppendLog(GPU_MAPPING, ss); | analyzer_->GetTileLogger().AppendLog(GPU_MAPPING, ss); | ||||
| continue; | continue; | ||||
| @@ -584,20 +752,30 @@ void GpuStrategy::InnerThreadOuterBlock() { | |||||
| } | } | ||||
| } else { | } else { | ||||
| for (size_t i = pending_axes_.size() - 1; i >= ori_size; --i) { | for (size_t i = pending_axes_.size() - 1; i >= ori_size; --i) { | ||||
| if (pending_axes_[i].second <= 1 && indexing.size() == block_limit_.size()) { | |||||
| continue; | |||||
| } | |||||
| indexing.emplace_back(i); | indexing.emplace_back(i); | ||||
| } | } | ||||
| } | } | ||||
| // map outer band to block according to predefined indice | // map outer band to block according to predefined indice | ||||
| analyzer_->GetTileLogger().AppendLine(GPU_MAPPING, "-----Map to block-----"); | |||||
| ss << "[Block Limit]: "; | |||||
| for (auto l : block_limit_) { | |||||
| ss << l << ", "; | |||||
| } | |||||
| analyzer_->GetTileLogger().AppendLog(GPU_MAPPING, ss); | |||||
| for (const auto &i : indexing) { | for (const auto &i : indexing) { | ||||
| TileAxis *axis; | TileAxis *axis; | ||||
| int64_t shape; | int64_t shape; | ||||
| std::tie(axis, shape) = pending_axes_[i]; | std::tie(axis, shape) = pending_axes_[i]; | ||||
| auto idx = pending_axes_.size() - 1 - i; | |||||
| idx = need_reverse_ ? block_limit_.size() - 1 - idx : idx; | |||||
| auto rest_blocks = std::min(max_num_blocks_ / activated_blocks, block_limit_[idx]); | |||||
| rest_blocks = std::min(rest_blocks, axis->block_constraints.map_extent_); | |||||
| ss << "axis " << axis->index << "_" << axis->dim_axis << " shape = " << shape << ", rest blocks = " << rest_blocks; | |||||
| auto idx = depth_ - 1 - (pending_axes_.size() - 1 - i); | |||||
| idx = reverse_binding_ ? std::min(depth_, block_limit_.size()) - 1 - idx : idx; | |||||
| auto rest_blocks = idx < block_limit_.size() ? std::min(block_limit_[idx], axis->block_constraints.map_extent_) : 1; | |||||
| ss << "axis " << axis->index << "_" << axis->dim_axis << " shape = " << shape << ", block_idx = " << idx | |||||
| << ", rest blocks = " << rest_blocks; | |||||
| if (block_count_ >= static_cast<int>(block_dim)) { | if (block_count_ >= static_cast<int>(block_dim)) { | ||||
| ss << "-> No mapping."; | ss << "-> No mapping."; | ||||
| analyzer_->GetTileLogger().AppendLog(GPU_MAPPING, ss); | analyzer_->GetTileLogger().AppendLog(GPU_MAPPING, ss); | ||||
| @@ -635,11 +813,9 @@ void GpuStrategy::SetMappingConfig() { | |||||
| if (block_cfg_.empty()) { | if (block_cfg_.empty()) { | ||||
| block_cfg_.emplace_back(1); | block_cfg_.emplace_back(1); | ||||
| } | } | ||||
| bool reverse_binding = (analyzer_->scop_info_.user_config_.GetEnableAkgReduceLib() && | |||||
| analyzer_->scop_info_.analysis_result_.GetReduceDirection() == Y_DIRECTION); | |||||
| std::string block_str = ""; | std::string block_str = ""; | ||||
| std::string thread_str = ""; | std::string thread_str = ""; | ||||
| if (reverse_binding) { | |||||
| if (reverse_binding_) { | |||||
| for (int i = 0; i < static_cast<int>(block_cfg_.size()); ++i) { | for (int i = 0; i < static_cast<int>(block_cfg_.size()); ++i) { | ||||
| if (i >= block_count_) { | if (i >= block_count_) { | ||||
| continue; | continue; | ||||
| @@ -753,7 +929,7 @@ int64_t GpuStrategy::TileAfterThreadMapping(TileAxis *axis, size_t inner_dim, in | |||||
| tile = thread_size; | tile = thread_size; | ||||
| ss << "tile = thread size, "; | ss << "tile = thread size, "; | ||||
| } else { | } else { | ||||
| auto block_dim = need_reverse_ ? inner_dim : block_limit_.size() - 1 - inner_dim; | |||||
| auto block_dim = reverse_binding_ ? block_limit_.size() - 1 - inner_dim : inner_dim; | |||||
| int64_t least_blocks; | int64_t least_blocks; | ||||
| if (block_dim >= 0 && block_dim < block_limit_.size()) { | if (block_dim >= 0 && block_dim < block_limit_.size()) { | ||||
| least_blocks = block_limit_[block_dim]; | least_blocks = block_limit_[block_dim]; | ||||
| @@ -903,7 +1079,7 @@ void GpuStrategy::InjectiveSpeedup() { | |||||
| while (shape % lower != 0) { | while (shape % lower != 0) { | ||||
| --lower; | --lower; | ||||
| } | } | ||||
| bool is_efficient = lower * 2 > thread_size || total_threads / thread_size * lower * 2 >= max_num_threads_; | |||||
| bool is_efficient = lower * 2 > thread_size || total_threads / thread_size * lower * 2 >= max_x_y_dim_thread_; | |||||
| if (is_efficient) { | if (is_efficient) { | ||||
| ss << "align thread from " << thread_size << " to " << lower << " according to shape " << shape; | ss << "align thread from " << thread_size << " to " << lower << " according to shape " << shape; | ||||
| analyzer_->GetTileLogger().AppendLog(GPU_MAPPING, ss); | analyzer_->GetTileLogger().AppendLog(GPU_MAPPING, ss); | ||||
| @@ -923,8 +1099,8 @@ void GpuStrategy::InjectiveSpeedup() { | |||||
| auto coaleasced_size = injective_axes.back()->thread_constraints.map_extent_; | auto coaleasced_size = injective_axes.back()->thread_constraints.map_extent_; | ||||
| auto proposal_blocks = coaleasced_size >= warp_sizes_ ? 256 : 512; | auto proposal_blocks = coaleasced_size >= warp_sizes_ ? 256 : 512; | ||||
| auto proposal_threads = (coaleasced_size >= warp_sizes_ && injective_axes.size() > 1U) ? 128 | auto proposal_threads = (coaleasced_size >= warp_sizes_ && injective_axes.size() > 1U) ? 128 | ||||
| : coaleasced_size < max_num_threads_ ? 512 | |||||
| : max_num_threads_; | |||||
| : coaleasced_size < max_x_y_dim_thread_ ? 512 | |||||
| : max_x_y_dim_thread_; | |||||
| auto total_blocks = std::accumulate(block_cfg_.begin(), block_cfg_.end(), 1, std::multiplies<int>()); | auto total_blocks = std::accumulate(block_cfg_.begin(), block_cfg_.end(), 1, std::multiplies<int>()); | ||||
| auto proposal_elem_per_thread = coaleasced_size < warp_sizes_ ? 1 | auto proposal_elem_per_thread = coaleasced_size < warp_sizes_ ? 1 | ||||
| : total_blocks < proposal_blocks * 8 ? min_elem_for_io_bound_ | : total_blocks < proposal_blocks * 8 ? min_elem_for_io_bound_ | ||||
| @@ -1091,7 +1267,7 @@ void GpuStrategy::GpuVectorBroadcastStrategy() { | |||||
| int total_injective_size = 1; | int total_injective_size = 1; | ||||
| auto broadcast_innermost = broadcast_idx_.find(original_shape.size() - 1) != broadcast_idx_.end(); | auto broadcast_innermost = broadcast_idx_.find(original_shape.size() - 1) != broadcast_idx_.end(); | ||||
| for (size_t i = 0; i < original_shape.size(); ++i) { | for (size_t i = 0; i < original_shape.size(); ++i) { | ||||
| if (original_shape[i] * possible_threads <= max_num_threads_) { | |||||
| if (original_shape[i] * possible_threads <= max_x_y_dim_thread_) { | |||||
| possible_threads *= original_shape[i]; | possible_threads *= original_shape[i]; | ||||
| } | } | ||||
| auto rev_idx = original_shape.size() - 1 - i; | auto rev_idx = original_shape.size() - 1 - i; | ||||
| @@ -1100,7 +1276,7 @@ void GpuStrategy::GpuVectorBroadcastStrategy() { | |||||
| coalesced_size = coalesced_size == 0 ? original_shape[i] : coalesced_size; | coalesced_size = coalesced_size == 0 ? original_shape[i] : coalesced_size; | ||||
| if (broadcast_innermost) { | if (broadcast_innermost) { | ||||
| auto prev_extent = axis->thread_constraints.map_extent_ > 0 ? axis->thread_constraints.map_extent_ : 1; | auto prev_extent = axis->thread_constraints.map_extent_ > 0 ? axis->thread_constraints.map_extent_ : 1; | ||||
| auto thread_limit = max_num_threads_ / prev_extent; | |||||
| auto thread_limit = max_x_y_dim_thread_ / prev_extent; | |||||
| auto coef = analyzer_->FindDivisibleTilingFactor(thread_limit, original_shape[i]); | auto coef = analyzer_->FindDivisibleTilingFactor(thread_limit, original_shape[i]); | ||||
| axis->thread_constraints.map_extent_ = prev_extent * coef; | axis->thread_constraints.map_extent_ = prev_extent * coef; | ||||
| possible_threads = axis->thread_constraints.map_extent_; | possible_threads = axis->thread_constraints.map_extent_; | ||||
| @@ -1108,7 +1284,7 @@ void GpuStrategy::GpuVectorBroadcastStrategy() { | |||||
| } else if (broadcast_innermost) { | } else if (broadcast_innermost) { | ||||
| auto prev_extent = axis->thread_constraints.map_extent_ > 0 ? axis->thread_constraints.map_extent_ : 1; | auto prev_extent = axis->thread_constraints.map_extent_ > 0 ? axis->thread_constraints.map_extent_ : 1; | ||||
| axis->thread_constraints.map_extent_ = | axis->thread_constraints.map_extent_ = | ||||
| prev_extent * original_shape[i] <= max_num_threads_ ? prev_extent * original_shape[i] : prev_extent; | |||||
| prev_extent * original_shape[i] <= max_x_y_dim_thread_ ? prev_extent * original_shape[i] : prev_extent; | |||||
| possible_threads = axis->thread_constraints.map_extent_; | possible_threads = axis->thread_constraints.map_extent_; | ||||
| } | } | ||||
| coalesced_size = coalesced_size == 0 ? 1 : coalesced_size; | coalesced_size = coalesced_size == 0 ? 1 : coalesced_size; | ||||
| @@ -1121,10 +1297,10 @@ void GpuStrategy::GpuVectorBroadcastStrategy() { | |||||
| std::min(elem_per_thread, std::max<int>((fused_size_ / possible_threads / min_block + 1) / 2 * 2, 1)); | std::min(elem_per_thread, std::max<int>((fused_size_ / possible_threads / min_block + 1) / 2 * 2, 1)); | ||||
| ss << "thread for-loop speedup = " << axis->thread_constraints.item_process_; | ss << "thread for-loop speedup = " << axis->thread_constraints.item_process_; | ||||
| } else if (total_injective_size > min_block) { | } else if (total_injective_size > min_block) { | ||||
| while (possible_threads % warp_sizes_ != 0 && possible_threads < max_num_threads_) { | |||||
| while (possible_threads % warp_sizes_ != 0 && possible_threads < max_x_y_dim_thread_) { | |||||
| ++possible_threads; | ++possible_threads; | ||||
| } | } | ||||
| int elem_per_block = std::max<int>(16 / (max_num_threads_ / possible_threads), 1); | |||||
| int elem_per_block = std::max<int>(16 / (max_x_y_dim_thread_ / possible_threads), 1); | |||||
| auto proposal_blocks = std::max(min_block, std::max<int>(fused_size_ / possible_threads / elem_per_block, 1)); | auto proposal_blocks = std::max(min_block, std::max<int>(fused_size_ / possible_threads / elem_per_block, 1)); | ||||
| axis->block_constraints.map_extent_ = proposal_blocks; | axis->block_constraints.map_extent_ = proposal_blocks; | ||||
| axis->thread_constraints.map_extent_ = possible_threads; | axis->thread_constraints.map_extent_ = possible_threads; | ||||
| @@ -1139,12 +1315,70 @@ void GpuStrategy::GpuVectorBroadcastStrategy() { | |||||
| } | } | ||||
| } | } | ||||
// Applies user-provided custom tiling constraints to each interested axis on GPU.
// Each attr value encodes a constraint string of the form
//   [prefix->]LEVEL:<C1|C0>[_ITEM:VALUE]...
// where ITEM is one of MIN / FACTOR / FORBIDISO / MAX / AT_MOD.
void CustomTilingStrategy::AddGpuConstraint() {
  auto interested_info = GetInterestedInfo(interested_attr_key, false);
  for (auto it : interested_info) {
    TileAxis *axis = it.first;
    for (auto attr : it.second) {
      // attr_key is expected to be "<key>:<mode>"; only its shape is validated here.
      std::vector<std::string> modes = akg::common::Split(attr.attr_key, ":");
      CHECK_EQ(modes.size(), 2U);
      std::string constraint_str = attr.attr_value;
      // "lhs->rhs" form: only the part after "->" carries the constraint.
      if (constraint_str.find("->") != std::string::npos) {
        std::vector<std::string> res = akg::common::Split(constraint_str, "->");
        constraint_str = res[1];
      }
      // Items are separated by '_'; the first one must be the tile level.
      std::vector<std::string> constraints = akg::common::Split(constraint_str, "_");
      CHECK_GE(constraints.size(), 1U);
      std::vector<std::string> level = akg::common::Split(constraints[0], ":");
      CHECK(level.size() == 2U && level[0] == "LEVEL");
      CHECK(level[1] == "C1" || level[1] == "C0");
      TileLevel lv = level[1] == "C1" ? CACHE1 : CACHE0;
      constraints.erase(constraints.begin());  // drop the LEVEL item; the rest are ITEM:VALUE pairs
      for (const auto &con : constraints) {
        std::vector<std::string> items = akg::common::Split(con, ":");
        CHECK_EQ(items.size(), 2U);
        CHECK_NE(items[0], "");
        CHECK_NE(items[1], "");
        if (items[0] == "MIN") {
          if (items[1] == "MIN") {
            // "MIN:MIN" pins the tile extent down to the current tile minimum.
            if (lv == CACHE1) {
              axis->c1_constraints.tile_extent_ = axis->c1_constraints.tile_min_;
            } else if (lv == CACHE0) {
              axis->c0_constraints.tile_extent_ = axis->c0_constraints.tile_min_;
            }
          } else {
            // Otherwise the value is an explicit lower bound for the tile size.
            if (lv == CACHE1) {
              axis->c1_constraints.tile_min_ = CastToExpr(items[1]);
            } else if (lv == CACHE0) {
              axis->c0_constraints.tile_min_ = CastToExpr(items[1]);
            }
          }
        } else if (items[0] == "FACTOR") {
          // Fix the tile size to a single value at this level.
          axis->TileRestrainToSingleValue(CastToExpr(items[1]), lv);
        } else if (items[0] == "FORBIDISO") {
          axis->forbid_iso = true;
        } else if (items[0] == "MAX") {
          if (items[1] == "FULL") {
            // "MAX:FULL" lets the tile cover the whole axis at this level.
            axis->TileRestrainEntire(lv);
          } else {
            // Otherwise the value is an explicit upper bound for the tile size.
            if (lv == CACHE1) {
              axis->c1_constraints.tile_extent_ = CastToExpr(items[1]);
            } else if (lv == CACHE0) {
              axis->c0_constraints.tile_extent_ = CastToExpr(items[1]);
            }
          }
        } else if (items[0] == AT_MOD) {
          // Constrain the tile size to be a multiple of the given modulus.
          axis->TileRestrainMod(CastToExpr(items[1]), lv);
        }
      }
    }
  }
}
// No constraint found in cuda: these strategies are no-ops for the GPU backend.
void ModStrategy::AddGpuConstraint() {}

void ConflictTreeRangeStrategy::AddGpuConstraint() {}
void VectorizedStrategy::AddGpuConstraint() {}
| @@ -0,0 +1,17 @@ | |||||
import sys


def sort_profiling_log(from_log_file, sorted_log_file):
    """Sort a '|'-delimited tuning log and write it back out.

    Each input line looks like ``<prefix>|<config>|<time>|...``; records are
    keyed by the config string (later duplicates win) and written sorted by
    (time, config) as ``|<config>|<time>`` lines.

    Args:
        from_log_file: path of the raw log to read.
        sorted_log_file: path of the sorted log to write.
    """
    records = {}
    # Use context managers so the files are closed even if a line is malformed.
    with open(from_log_file, 'r') as f_in:
        for line in f_in:
            config = line.split("|")
            records[str(config[1])] = float(config[2])
    with open(sorted_log_file, "wt") as f_out:
        # Primary sort key: measured time; tie-break on the config string.
        for k, v in sorted(records.items(), key=lambda item: (item[1], item[0])):
            f_out.write("|" + str(k) + "|" + str(v) + "\n")


if __name__ == "__main__":
    sort_profiling_log(str(sys.argv[1]), str(sys.argv[2]))
| @@ -0,0 +1,95 @@ | |||||
| from .kernel_compiler import compile_kernel | |||||
| from collections import namedtuple | |||||
| from .space import ListConfigSpace | |||||
def get_reduce_axis_length(in_shape, reduce_axis):
    """Split a reduce op's shape into (lx, ly) extents for GPU mapping.

    Three patterns are distinguished:
      * all-reduce (``reduce_axis`` is None or covers every axis): the whole
        shape is flattened into ``lx`` and ``ly`` stays 1;
      * reduce-x (the innermost axis is reduced): reduced extents multiply
        into ``lx``, kept extents into ``ly``;
      * reduce-y (only outer axes reduced): reduced extents multiply into
        ``ly``, kept extents into ``lx``.

    Args:
        in_shape: sequence of axis extents.
        reduce_axis: iterable of reduced axis indices, or None for all-reduce.

    Returns:
        Tuple ``(lx, ly)`` of products of extents.
    """
    lx, ly = 1, 1
    if reduce_axis is None or len(reduce_axis) == len(in_shape):
        # all-reduce: everything folds into x
        for v in in_shape:
            lx *= v
    elif (len(in_shape) - 1) in reduce_axis:
        # reduce-x: innermost axis is reduced
        for i, v in enumerate(in_shape):
            if i in reduce_axis:
                lx *= v
            else:
                ly *= v
    else:
        # reduce-y: only outer axes are reduced
        for i, v in enumerate(in_shape):
            if i in reduce_axis:
                ly *= v
            else:
                lx *= v
    return lx, ly
def _get_space_reduce_gpu_manually(op_type: str, op_desc, tuning_attrs=[], tuning_attrs_info=None):
    """get config space of reduce_sum operators in gpu

    Enumerates tiling/mapping candidates (tiling dims, block x/y/z, thread
    x/y/z, plus any extra tuning attrs) for the three reduce patterns:
    all-reduce, reduce along the innermost axis (reduce-x) and reduce along
    outer axes only (reduce-y).

    Args:
        op_type: operator name; also used as the namedtuple type name.
        op_desc: operator description; op_desc[2] must expose in_shape/axis.
        tuning_attrs: unused here; kept for interface compatibility.
        tuning_attrs_info: optional (attr_names, attr_configs) pair; each
            tiling space is crossed with every attr config when present.

    Returns:
        (index_table, space, key, expect, input_for_mod) — the last three come
        straight from compile_kernel.
    """
    space_res, key, expect, input_for_mod = compile_kernel(op_type, op_desc, None, None, None, 0,
                                                           gen_tiling_spaces=True)
    in_shape, reduce_axis = op_desc[2].in_shape, op_desc[2].axis
    # One tiling dim for all-reduce, two when only part of the axes is reduced.
    dim_len = 1 if reduce_axis is None or len(reduce_axis) == len(in_shape) else 2
    dim_names = ['tiling_' + str(i) for i in range(dim_len)]
    dim_names.append("block_x")
    dim_names.append("block_y")
    dim_names.append("block_z")
    dim_names.append("thread_x")
    dim_names.append("thread_y")
    dim_names.append("thread_z")
    # Guard the documented default: the original crashed when
    # tuning_attrs_info was None.
    attr_names = tuning_attrs_info[0] if tuning_attrs_info else []
    attr_configs = tuning_attrs_info[1] if tuning_attrs_info else []
    # BUGFIX: the original looped `for key in tuning_attrs_info[0]`, shadowing
    # the `key` returned by compile_kernel and corrupting the return value
    # whenever extra tuning attrs were present.
    for attr_name in attr_names:
        dim_names.append(attr_name)
    lx, ly = get_reduce_axis_length(in_shape, reduce_axis)
    tiling_spaces = []
    if reduce_axis is None or len(reduce_axis) == len(in_shape):
        # all-reduce: try power-of-two thread_x from 16 up to 1024
        for tx in [2**i for i in range(4, 11)]:
            if tx > lx:
                break
            possible_dim0_list = [d0 for d0 in range(tx, lx + 1, tx)]
            if possible_dim0_list[-1] != lx:
                possible_dim0_list.append(lx)
            for d0 in possible_dim0_list:
                bx = lx // d0 if lx % d0 == 0 else lx // d0 + 1  # ceil(lx / d0)
                tiling_spaces.append([d0, bx, 1, 1, tx, 1, 1])
    elif (len(in_shape) - 1) in reduce_axis:
        # reduce-x: threads cover the reduced (inner) dimension
        for tx in [2**i for i in range(4, 11)]:
            if tx > lx:
                break
            ty = 1
            by = ly
            possible_dim1_list = [d1 for d1 in range(tx, lx + 1, tx)]
            if possible_dim1_list[-1] != lx:
                possible_dim1_list.append(lx)
            for d1 in possible_dim1_list:
                bx = lx // d1 if lx % d1 == 0 else lx // d1 + 1
                tiling_spaces.append([1, d1, bx, by, 1, tx, ty, 1])
    else:
        # reduce-y: thread_x fixed to (at most) a warp; sweep thread_y
        tx = min(32, lx)
        bx = lx // tx if lx % tx == 0 else lx // tx + 1
        d0 = tx
        for ty in range(min(8, ly), 1025):
            if ty * tx > 1024:  # stay within the 1024-threads-per-block limit
                break
            for d1 in range(ty, ly + 1, ty):
                by = ly // d1 if ly % d1 == 0 else ly // d1 + 1
                tiling_spaces.append([d0, d1, bx, by, 1, tx, ty, 1])
    input_type = namedtuple(op_type, dim_names)
    space = ListConfigSpace(input_type)
    if attr_names:
        # Cross every tiling space with every tuning-attr configuration.
        for tiling_space in tiling_spaces:
            for tuning_attrs_config in attr_configs:
                tmp = tiling_space[:]
                tmp.extend(tuning_attrs_config)
                space.add(input_type(*tmp))
    else:
        for tiling_space in tiling_spaces:
            space.add(input_type(*tiling_space))
    return space_res.index_table, space, key, expect, input_for_mod
| @@ -0,0 +1,501 @@ | |||||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| """AutoTuning job""" | |||||
| import os | |||||
| import json | |||||
| import time | |||||
| import datetime | |||||
| import importlib | |||||
| import logging | |||||
| import pandas as pd | |||||
| import subprocess | |||||
| import numpy as np | |||||
| from collections import namedtuple | |||||
| from multiprocessing import Process, Manager | |||||
| from akg import composite | |||||
| from akg.utils import kernel_exec as utils | |||||
| from akg.composite.build_module import generate_trait | |||||
| from autotuning.runner import KernelRunner, error_time_list, error_time_string | |||||
| from autotuning.tuner import ModelBasedTuner, Tuner | |||||
| from autotuning.type_definitions import ConvDesc, ConvBackpropDesc, MatmulCubeDesc | |||||
| from autotuning.space_generators import get_space | |||||
| from autotuning.space import ListConfigSpace | |||||
| from autotuning.test_data_generators import gen_data | |||||
| from autotuning.space_generators import gen_bool_list | |||||
| from autotuning.tuning_utils import * | |||||
# Module-wide logging at DEBUG level so tuning progress is visible.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger('fuzz.tune.autotuning.job')
# Directory where tuning results are written; created at import time if absent.
storage_dir = './res/'
if not os.path.exists(storage_dir):
    os.makedirs(storage_dir)
# Path templates: per-op-type exported config file and shape-description input.
json_file = './res/' + "{0}" + ".json"
json_load = './autotuning/shapes/' + "{0}"
def get_repo(repo, keys, default=None):
    """Walk a nested dict along `keys`; return the found value or `default`.

    Any missing (or falsy) intermediate value short-circuits to `default`.
    """
    node = repo
    for k in keys:
        node = node.get(k)
        if not node:
            return default
    return node
def get_json_space(json_input, space_dict):
    """Query the composite tiling space for `json_input`; store it under 'res'.

    Runs inside a worker Process; `space_dict` is a Manager dict proxy used
    to hand the result back to the parent.
    """
    space_dict['res'] = composite.get_tiling_space(json_input, 2)
def launch_json(debug_mode: bool = True, save_res: bool = False, json_dir="", repo_path="", all_space=False,
                skip_exist=True, extra_tune=False, self_attrs=[], tuning_attrs=[]):
    """composite json tuning launch

    Tunes every composite-op json file found in `json_dir`.

    Args:
        debug_mode: use tiny iteration budgets [3, 3, 3] instead of [80, 160, 320].
        save_res: persist the best result via save_tuning_result.
        json_dir: directory holding op-description json files.
        repo_path: path of the json repo used to skip already-tuned ops.
        all_space: exhaustive Tuner over the space vs ModelBasedTuner.
        skip_exist: skip files whose (compute, shape, dtype) trait is in the repo.
        extra_tune: tune only boolean combinations of `self_attrs` instead of
            querying a tiling space.
        self_attrs: attribute names used when extra_tune is True.
        tuning_attrs: extra boolean attrs appended to the tiling space.
            NOTE(review): self_attrs/tuning_attrs use mutable default lists,
            shared across calls — callers must not mutate them.
    """
    subprocess.run("mkdir -p res/", shell=True)
    # iteration budget per space-size bucket (<1e4, <1e5, >=1e5)
    iter_times = [3, 3, 3] if debug_mode else [80, 160, 320]
    files = os.listdir(json_dir)
    with open(repo_path, 'r') as f:
        repo = json.loads(f.read())
    for input_file in files:
        print("----Start tuning for ", input_file)
        with open(json_dir + '/' + input_file, 'r') as f:
            json_input = f.read()
        json_content = json.loads(json_input)
        # scalar inputs are stored with shape []; normalize them to [1]
        for input_desc in json_content["input_desc"]:
            if input_desc[0]["shape"] == []:
                input_desc[0]["shape"] = [1]
        json_input = json.dumps(json_content)
        # skip tuning for info in repo
        if skip_exist:
            compute, shape, dtype = generate_trait(json_content)
            if get_repo(repo, [compute, shape, dtype]):
                print("Info for %s already exists" % input_file)
                print("ops are ", str(compute))
                print("shape is ", str(shape))
                print("dtype is ", str(dtype))
                with open('res/skip_file.txt', 'a') as fe:
                    fe.write(input_file)
                    fe.write("\n")
                continue
        # generate tuning space
        if not extra_tune:
            time_start_get_space = time.time()
            # run the space query in a subprocess with a 600s timeout;
            # a missing 'res' key afterwards means it timed out or crashed
            with Manager() as manager:
                space_dict = manager.dict()
                p = Process(target=get_json_space,
                            args=(json_input, space_dict))
                p.start()
                p.join(600)
                if 'res' not in space_dict:
                    with open('res/error_space_list.txt', 'a') as fe:
                        fe.write(input_file)
                        fe.write("\n")
                    continue
                space_res = space_dict['res']
            time_end_get_space = time.time()
            print("get space time: ", time_end_get_space - time_start_get_space)
            index_table = space_res['index']
            tiling_spaces = space_res['tuning_space']
            if not isinstance(tiling_spaces, list):
                with open('res/empty_space_list.txt', 'a') as fe:
                    fe.write(input_file)
                    fe.write("\n")
                continue
            dim_names = ['tiling_' + str(i)
                         for i in range(len(tiling_spaces[0]))]
            # only cross in the bool attrs when the tiling space is small (<1e5)
            use_tuning_attrs = len(tiling_spaces) < 10 ** 5
            if tuning_attrs and use_tuning_attrs:
                dim_names.extend(tuning_attrs)
            input_type = namedtuple("json", dim_names)
            space = ListConfigSpace(input_type)
            if tuning_attrs and use_tuning_attrs:
                attr_options = gen_bool_list(tuning_attrs)
                # cross-product of tiling configs and attr combinations
                for tiling_space in tiling_spaces:
                    for attr_option in attr_options:
                        tmp = tiling_space[:]
                        tmp.extend(attr_option)
                        config = input_type(*tmp)
                        space.add(config)
            else:
                for tiling_space in tiling_spaces:
                    config = input_type(*tiling_space)
                    space.add(config)
        else:
            # extra-tune mode: the space is every bool combination of self_attrs
            index_table = []
            pre_lists = gen_bool_list(self_attrs)
            pre_input_type = namedtuple("extra_tune", self_attrs)
            space = ListConfigSpace(pre_input_type)
            for item in pre_lists:
                config = pre_input_type(*item)
                space.add(config)
        key = json_content["op"]
        try:
            input_for_mod, expect = gen_data(
                op_type="json", op_desc=json_input)
        except BaseException as e:
            logger.debug(
                "gen numpy data from [%s] failed: %s", input_file, str(e))
            with open('res/error_gen_data_list.txt', 'a') as fe:
                fe.write(input_file)
                fe.write(": ")
                fe.write(str(e))
                fe.write("\n")
            continue
        print('space size:', space.length)
        print('index table:', index_table)
        output_para = None  # this is for multi-output
        if len(json_content["output_desc"]) > 1:
            output_para = []
            # negative positions of the outputs in the mod argument list
            for i in range(len(json_content["output_desc"])):
                output_para.append(i - len(json_content["output_desc"]))
        runner = KernelRunner(op_type="json", op_desc=json_input, index_table=index_table, self_attrs=self_attrs,
                              input_data=input_for_mod, expect=expect, mod_output_param=output_para, timeout=180,
                              repeat_times=1)
        # we can only get a valid tiling, or accurate get cycles
        is_truly_profiling = utils.get_profiling_mode(
        ) or os.environ['RUNTIME_MODE'] == "gpu"
        # available device numbers, normally is 8 or 1
        available_device_numbers = utils.get_available_devices_num()
        if all_space:
            tuner = Tuner(runner, index_table, space,
                          n_parallel=available_device_numbers)
            least_try_times = 3  # space.length
        else:
            tuner = ModelBasedTuner(runner, index_table, space,
                                    n_parallel=available_device_numbers if is_truly_profiling else 1,
                                    plan_size=64, pre_model=None)
            least_try_times = iter_times[0 if space.length <
                                         10 ** 4 else 1 if space.length < 10 ** 5 else 2]
        tuner.tune(least_try_times, output_file="json.log")
        print_tuning_result("json", space, index_table, tuner, key)
        if save_res:
            if extra_tune:
                save_tuning_result(key, "extra_tune",
                                   json_content, index_table, tuner, repo_path)
            else:
                save_tuning_result(key, "json", json_content,
                                   index_table, tuner, repo_path)
def jobs(op_type: str = 'add', desc=None, debug_mode: bool = True, save_res: bool = False,
         all_space: bool = True, insert_key='', conf_of_set_dim="", tuning_attrs=[], skip_config_set=None, tuning_attrs_info=None):
    """AutoTuning jobs

    Builds the tuning space for (op_type, desc), runs the tuner, and prints
    the result.

    Args:
        op_type: operator type; selects space generation and kernel build.
        desc: operator description (namedtuple or raw shape record).
        debug_mode: kept for signature compatibility.
        save_res: kept for signature compatibility (saving is commented out).
        all_space: exhaustive Tuner vs ModelBasedTuner.
        insert_key: overrides the key produced by get_space when non-empty.
        conf_of_set_dim: previously tuned configs; a hit skips tuning.
        tuning_attrs: extra attributes forwarded to space generation.
            NOTE(review): mutable default list, shared across calls.
        skip_config_set: configs the runner should skip.
        tuning_attrs_info: optional sequence whose element [2] is the
            need-tune json forwarded to the runner; may be None.
    """
    time_start_get_space = time.time()
    index_table, space, key, expect, input_for_mod = get_space(
        op_type, desc, tuning_attrs=tuning_attrs, tuning_attrs_info=tuning_attrs_info)
    time_end_get_space = time.time()
    print("get space time: ", time_end_get_space - time_start_get_space)
    print('space size:', space.length)
    print('index table:', index_table)
    key = key if insert_key == '' else insert_key
    # filter already tuned shape
    if isinstance(conf_of_set_dim, dict) and key in conf_of_set_dim.keys():
        if isinstance(conf_of_set_dim[key], (list, tuple)) and conf_of_set_dim[key]:
            return
        if isinstance(conf_of_set_dim[key], dict):
            return
    output_para = None  # this is for multi-output
    if isinstance(input_for_mod, dict):
        input_for_mod, output_para = input_for_mod['args'], input_for_mod['outputs']
    # BUGFIX: tuning_attrs_info defaults to None; indexing it unconditionally
    # raised TypeError for call paths that never pass it (e.g. cube ops).
    need_tune_json = tuning_attrs_info[2] if tuning_attrs_info is not None else None
    runner = KernelRunner(op_type, desc, index_table,
                          self_attrs=None, input_data=input_for_mod,
                          expect=expect, mod_output_param=output_para,
                          timeout=30, repeat_times=1,
                          is_all_space=all_space,
                          skip_config_set=skip_config_set,
                          need_tune_json=need_tune_json)
    # we can only get a valid tiling, or accurate get cycles
    is_truly_profiling = utils.get_profiling_mode()
    # number of multi-processing for build kernels
    available_device_numbers = get_parallel_build_num()
    time_start_tuning = time.time()
    if all_space:
        tuner = Tuner(runner, index_table, space,
                      n_parallel=available_device_numbers)
    else:
        tuner = ModelBasedTuner(runner, index_table, space,
                                n_parallel=available_device_numbers if is_truly_profiling else 1,
                                plan_size=100, pre_model=None)
    least_try_times = space.length
    tuner.tune(least_try_times, output_file=op_type + ".log")
    time_end_tuning = time.time()
    print("tuning time: ", time_end_tuning - time_start_tuning)
    print_tuning_result(op_type, space, index_table, tuner, key)
    # save_results_to_csv(op_type, space, index_table, tuner, key)
    # if save_res:
    #     save_tuning_result(key, op_type, desc, index_table, tuner)
def print_tuning_result(op_type, space, index_table, tuner, key):
    """Print a summary of the tuning run: best config/time and every trial."""
    def readable(t):
        # map sentinel error codes to their message; pass real times through
        return error_time_string[t] if t in error_time_string.keys() else t

    print(op_type + " shape is:", key)
    print('space size:', space.length)
    print('index table:', index_table)
    print('best config:', tuner.best_config)
    print('best time:', readable(tuner.best_time))
    print('original time:', tuner.original_time)
    print('optimal result is ', tuner.original_time / tuner.best_time, "faster then auto set dim.")
    print("total try times", len(tuner.xs))
    for x, y in zip(tuner.xs, tuner.ys):
        print(space.get(x), readable(y))
def save_results_to_csv(op_type, space, index_table, tuner, key):
    """Persist every (config, time) trial of the run to `<op_type>_<key>.csv`.

    Error sentinels are recorded as 9999999 so the column stays numeric.
    """
    rows = [[space.get(x), 9999999 if y in error_time_string.keys() else y]
            for x, y in zip(tuner.xs, tuner.ys)]
    frame = pd.DataFrame(rows, columns=["config", "time"])
    frame.to_csv(op_type + "_" + key + ".csv")
def save_tuning_result(key, op_type, desc, index_table, tuner, repo_path="", extra_tune=False, platform="gpu"):
    """save tuning result

    Translates tuner.best_config into the tiling/attr format expected by each
    op type and exports it with the tuner's export helpers.  When there is no
    valid result, the final export may reference an unbound `config`; that
    UnboundLocalError is caught and logged deliberately.
    """
    if tuner.best_config is not None and tuner.best_time not in error_time_list:
        set_dim_configs = tuner.best_config.input
        if op_type == "matmul":
            # (L1, L0) pairs: one (1, 1) per batch dim, optional n/m tiles,
            # two fixed (16, 16) cube blocks, then the k tile
            param = []
            for _ in range(len(desc.x_shape) - 2):
                param.append((1, 1))
            if set_dim_configs.n_l1 > 0:
                param.append((set_dim_configs.n_l1, set_dim_configs.n_l0))
            if set_dim_configs.m_l1 > 0:
                param.append((set_dim_configs.m_l1, set_dim_configs.m_l0))
            param.extend(
                [(16, 16), (16, 16), (set_dim_configs.k_l1, set_dim_configs.k_l0)])
            tiling_param = (param, {"bypass": set_dim_configs.bypass})
        # special case with different tiling parameter format
        elif op_type in ("conv", "conv_bn1"):
            param = []
            tile_hh = set_dim_configs.tile_h
            tile_coco = set_dim_configs.tile_co
            tile_mm = set_dim_configs.tile_m
            tile_kk = set_dim_configs.tile_k
            tile_nn = set_dim_configs.tile_n
            tile_ww = set_dim_configs.tile_w
            param = [tile_hh, tile_coco, tile_mm, tile_kk, tile_nn, tile_ww]
            tiling_param = (param, {"bypass": set_dim_configs.bypass})
        elif op_type == "conv_backprop_input":
            param = []
            tile_hh = set_dim_configs.tile_h
            tile_coco = set_dim_configs.tile_co
            tile_mm = set_dim_configs.tile_m
            tile_kk = set_dim_configs.tile_k
            tile_nn = set_dim_configs.tile_n
            tile_ww = set_dim_configs.tile_w
            param = [tile_hh, tile_coco, tile_mm, tile_kk, tile_nn, tile_ww]
            tiling_param = (param)
        elif op_type == "conv_backprop_filter":
            param = []
            tile_cici = set_dim_configs.tile_ci
            tile_khkh = set_dim_configs.tile_kh
            tile_kwkw = set_dim_configs.tile_kw
            tile_coco = set_dim_configs.tile_co
            tile_bb = set_dim_configs.tile_batch
            tile_hh = set_dim_configs.tile_h
            tile_ww = set_dim_configs.tile_w
            tile_mm = set_dim_configs.tile_m
            tile_kk = set_dim_configs.tile_k
            tile_nn = set_dim_configs.tile_n
            param = [tile_cici, tile_khkh, tile_kwkw, tile_coco,
                     tile_bb, tile_hh, tile_ww, tile_mm, tile_kk, tile_nn]
            tiling_param = (param)
        elif ("batch_matmul" in op_type) and (platform == "gpu"):
            # encode every 'tiling*' field into a "0 <i> <value>" string form
            tiling = [str(getattr(set_dim_configs, name)) for name in getattr(
                set_dim_configs, "_fields") if name.startswith("tiling")]
            tiling_param = ""
            for i, tile_v in enumerate(tiling):
                if i % 2 == 0:
                    tiling_param += "0 " + str(i) + " "
                tiling_param += tile_v + " "
            block_param = get_block_str_from_config(set_dim_configs)
            thread_param = get_thread_str_from_config(set_dim_configs)
            config = {
                'attrs': {
                    'dim': tiling_param,
                    'bind_block': block_param,
                    'bind_thread': thread_param
                },
                'best_cycles': tuner.best_time,
                'original_cycles': tuner.original_time,
                'date': str(datetime.datetime.now()),
                'tuning_time': tuner.tuning_time,
            }
        elif op_type == "json":
            from autotuning.runner import get_attr_from_config
            tiling_param = get_attr_from_config(set_dim_configs, index_table)
        elif op_type == "reduce_sum_gpu":
            print(set_dim_configs)
            tiling = [str(getattr(set_dim_configs, name))
                      for name in getattr(set_dim_configs, '_fields') if name.startswith('tiling')]
            tiling_param = ""
            for i, tile_v in enumerate(tiling):
                tiling_param += "0 " + str(i) + " "
                tiling_param += tile_v + " 1 "
            block_param = get_block_str_from_config(set_dim_configs)
            thread_param = get_thread_str_from_config(set_dim_configs)
            config = {
                'attrs': {
                    'dim': tiling_param,
                    'bind_block': block_param,
                    'bind_thread': thread_param
                },
                'best_cycles': tuner.best_time,
                'original_cycles': tuner.original_time,
                'date': str(datetime.datetime.now()),
                'tuning_time': tuner.tuning_time,
            }
        else:
            print(set_dim_configs)
            # generic path: index_table prefix + [value, 1] per tiling field
            tiling = [[getattr(set_dim_configs, name), 1]
                      for name in getattr(set_dim_configs, '_fields') if name.startswith('tiling')]
            tiling_param = []
            for i, tile_v in enumerate(tiling):
                tiling_param.append(index_table[i] + tile_v)
            config = []
    else:
        tiling_param = []
    # when there is a valid result, save the result
    if op_type in ("json", "extra_tune") and tuner.best_time not in error_time_list:
        config = {'attrs': tiling_param,
                  'best_cycles': tuner.best_time,
                  'original_cycles': tuner.original_time,
                  "date": str(datetime.datetime.now()),
                  "tuning time": tuner.tuning_time,
                  }
        if op_type == "json":
            config["file_name"] = str(key)
        compute, shape, dtype = generate_trait(desc)
        tuner.export_dim_configs(
            config, json_file.format(op_type), False, str(key))
        save_file = "autotuning/extra_tune.json" if extra_tune else repo_path
        with open(save_file, 'r') as f:
            repo = json.loads(f.read())
        # only overwrite the repo entry when this run beat the recorded cycles
        if len(tiling_param) != 0 and (get_repo(repo, [compute, shape, dtype]) is None or
                                       int(tuner.best_time) < int(repo[compute][shape][dtype]["metadata"]["best_cycles"])):
            tuner.export_dim_configs_for_keys(config, save_file, False, [
                compute, shape, dtype, "metadata"])
    else:
        try:
            tuner.export_dim_configs(
                config, json_file.format(op_type), False, str(key))
        except UnboundLocalError as e:
            logger.warning(e)
            print("[save_tuning_result]: ", "no result is saved.")
def load_json_configs(op_type):
    """Load previously exported tiling configs for `op_type`.

    Returns:
        The parsed dict, or {} when the file is absent, unreadable, or
        contains invalid JSON.
    """
    dim_file = json_file.format(op_type)
    file_path = os.path.realpath(dim_file)
    if not os.path.isfile(file_path):
        return {}
    try:
        with open(file_path, 'r') as f:
            return json.load(f)
    except (IOError, ValueError) as e:
        # BUGFIX: json.load raises json.JSONDecodeError (a ValueError) on a
        # corrupt cache file; fall back to "no configs" instead of crashing.
        logger.debug(e)
        return {}
def read_shapes_from_file(debug_mode, save_res, all_space, conf_of_set_dim, op_type):
    """Profile every shape listed in the autotuning.shapes.<op_type> module."""
    shape_module = importlib.import_module('autotuning.shapes.' + op_type)
    for shp in shape_module.shapes:
        do_profiling(shp, debug_mode, save_res,
                     all_space, op_type, conf_of_set_dim)
def do_profiling(shp, debug_mode, save_res, all_space, op_type, conf_of_set_dim=None, tuning_attrs=None, skip_config_set=None, tuning_attrs_info=None):
    """Run one profiling/tuning job for a single shape record."""
    # remove undeleted JOB files left over from previous shapes
    subprocess.run("rm -rf /var/log/npu/profiling/JOB*", shell=True)
    if op_type == 'matmul':
        key = shp[2][0:-1]
        logger.debug("start profiling: [%s]", str(key))
        jobs(op_type, MatmulCubeDesc(*key), debug_mode, save_res,
             all_space, str(key), conf_of_set_dim)
        logger.debug("end profiling: [%s]", str(key))
    elif op_type.startswith('conv_backprop'):
        key = shp[2]
        logger.debug("start profiling: [%s]", str(key))
        jobs(op_type, ConvBackpropDesc(*key), debug_mode, save_res,
             all_space, str(key), conf_of_set_dim)
        logger.debug("end profiling: [%s]", str(key))
    elif op_type.startswith('conv') and "gpu" not in op_type:
        key = shp[2]
        logger.debug("start profiling: [%s]", str(key))
        jobs(op_type, ConvDesc(*key), debug_mode, save_res,
             all_space, str(key), conf_of_set_dim)
        logger.debug("end profiling: [%s]", str(key))
    elif op_type in ["batch_matmul_gpu", "conv_image2col_gemm_gpu", "reduce_sum_gpu"]:
        # gpu ops pass the raw shape record straight through as the desc
        logger.debug("start profiling: [%s]", str(shp))
        jobs(op_type, shp, debug_mode, save_res, all_space,
             conf_of_set_dim=conf_of_set_dim, tuning_attrs=tuning_attrs,
             skip_config_set=skip_config_set, tuning_attrs_info=tuning_attrs_info)
    else:
        key = shp
        logger.debug("start profiling: [%s]", str(key))
        jobs(op_type, key, debug_mode, save_res,
             all_space, conf_of_set_dim=conf_of_set_dim, skip_config_set=skip_config_set)
        logger.debug("end profiling: [%s]", str(key))
def launch(op_type, debug_mode, save_res=False, desc=None, all_space=False,
           from_json=False, tuning_attrs=None, skip_config_set=None, tuning_attrs_info=None):
    """Entry point: tune shapes from file (desc is None) or a single desc."""
    # reuse previously exported tiling configs when requested
    conf_of_set_dim = load_json_configs(op_type) if from_json else None
    if desc is None:
        read_shapes_from_file(debug_mode, save_res,
                              all_space, conf_of_set_dim, op_type)
        return
    do_profiling(desc, debug_mode, save_res, all_space, op_type,
                 tuning_attrs=tuning_attrs, skip_config_set=skip_config_set,
                 tuning_attrs_info=tuning_attrs_info)
| @@ -0,0 +1,407 @@ | |||||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| """Compile kernel module for operator""" | |||||
| import os | |||||
| from typing import NamedTuple | |||||
| from base import TestBase | |||||
| from akg.utils import kernel_exec as utils | |||||
| from akg.utils import custom_tiling as ct_util | |||||
| from akg.ops.nn import conv_bn1 | |||||
| from akg.ops.nn import conv, conv_backprop_input, conv_backprop_filter, batchmatmul | |||||
| from test_op.batch_matmul import batch_matmul | |||||
| from akg.ops.math_gpu.reduce_sum import reduce_sum | |||||
| from akg.build_module import tuning_spaces | |||||
| from akg.ops.nn import matmul | |||||
| from test_run import batchmatmul_run, matmul_run | |||||
| from .type_definitions import ConvDesc, ConvBackpropDesc, MatmulCubeDesc, ConvConfig, ConvBackpropInputConfig, ConvBackpropFilterConfig, MatmulCubeConfig | |||||
| import numpy as np | |||||
| from gen_random import random_gaussian | |||||
| from .tuning_utils import merge_attrs | |||||
def get_spaces_gpu_manually(op_type: str, op_desc: NamedTuple = None):
    """Placeholder for manual GPU space generation; not implemented yet."""
    return None
def gen_kernel_conv(op_desc: ConvDesc, input_shape, index_table,
                    config: ConvConfig = None, idx=None, gen_tiling_spaces=False):
    """Compile kernel module for conv"""
    if index_table is not None:
        raise RuntimeError('index_table should be none')
    kernel_name = "conv_poly" if idx is None else "conv_poly" + str(idx)
    if config is None:
        attrs = {'dim': ""}
    else:
        # tiling order: h, co, m, k, n, w
        tiling_param = [config.tile_h, config.tile_co, config.tile_m,
                        config.tile_k, config.tile_n, config.tile_w]
        attrs = {'conv_tile': tiling_param, 'bypass': config.bypass}
    # bias adds a third input tensor
    if op_desc.use_bias:
        shape = [input_shape[0], input_shape[1], input_shape[2]]
    else:
        shape = [input_shape[0], input_shape[1]]
    conv_dtype = 'float16'
    return utils.op_build(conv.conv, [shape], [conv_dtype],
                          op_attrs=[op_desc.fmap_shape, op_desc.filter_shape, op_desc.pad, op_desc.stride,
                                    op_desc.dilation, op_desc.use_bias, attrs],
                          kernel_name=kernel_name, attrs=attrs, polyhedral=True, tuning=gen_tiling_spaces)
def gen_kernel_conv_bn1(op_desc: ConvDesc, input_shape, index_table, config: ConvConfig = None,
                        idx=None, gen_tiling_spaces=False):
    """Compile kernel module for conv_bn1"""
    if index_table is not None:
        raise RuntimeError('index_table should be none')
    kernel_name = "conv_bn1_poly" if idx is None else "conv_bn1_poly" + str(idx)
    if config is None:
        attrs = {'dim': ""}
    else:
        # tiling order: h, co, m, k, n, w
        tiling_param = [config.tile_h, config.tile_co, config.tile_m,
                        config.tile_k, config.tile_n, config.tile_w]
        attrs = {'conv_tile': tiling_param, 'bypass': config.bypass}
    # bias adds a third input tensor
    if op_desc.use_bias:
        shape = [input_shape[0], input_shape[1], input_shape[2]]
    else:
        shape = [input_shape[0], input_shape[1]]
    conv_dtype = 'float16'
    return utils.op_build(conv_bn1.conv_bn1, [shape], [conv_dtype],
                          op_attrs=[op_desc.fmap_shape, op_desc.filter_shape, op_desc.pad, op_desc.stride,
                                    op_desc.dilation, op_desc.use_bias, attrs],
                          kernel_name=kernel_name, attrs=attrs, polyhedral=True, tuning=gen_tiling_spaces)
def gen_kernel_matmul_cube(op_desc: MatmulCubeDesc, _, index_table,
                           config: MatmulCubeConfig = None, idx=None, gen_tiling_spaces=False):
    """Compile kernel module for matmul_cube"""
    if index_table is not None:
        raise RuntimeError('index_table should be none')
    kernel_name = "matmul_cube_poly" if idx is None else "matmul_cube_poly" + str(idx)
    if config is None:
        attrs = {'dim': ""}
    else:
        # one (1, 1) pair per batch dim, optional n/m tiles, fixed cube
        # blocks, then the k tile
        batch_dims = len(op_desc.x_shape) - 2
        tiling_param = [(1, 1)] * batch_dims
        if config.n_l1 > 0:
            tiling_param.append((config.n_l1, config.n_l0))
        if config.m_l1 > 0:
            tiling_param.append((config.m_l1, config.m_l0))
        tiling_param += [(16, 16), (16, 16), (config.k_l1, config.k_l0)]
        dim_info = ct_util.set_dims(tuple(tiling_param))
        attrs = {'dim': dim_info, 'bypass': config.bypass}
    return matmul_run.matmul_compile(op_desc.x_shape, op_desc.y_shape, op_desc.bias, op_desc.left_format,
                                     op_desc.right_format, op_desc.out_format, op_desc.adj_x, op_desc.adj_y,
                                     op_desc.dtype, op_desc.bias_dtype, op_desc.out_dtype, kernel_name,
                                     attrs, tuning=gen_tiling_spaces)
def gen_kernel_conv_backprop_input(op_desc: ConvBackpropDesc, _, index_table, config: ConvBackpropInputConfig = None,
                                   idx=None, gen_tiling_spaces=False):
    """Compile kernel module for conv_backprop_input

    Derives the forward-conv output shape from op_desc, converts it and the
    filter into the 5D/fractal layouts expected by the op, then builds.
    The second parameter is an unused placeholder kept for the uniform
    gen_kernel_* signature.
    """
    if index_table is not None:
        raise RuntimeError('index_table should be none')
    kernel_name = "conv_backprop_input_poly"
    if idx is not None:
        kernel_name += str(idx)
    if config is None:
        attrs = {'dim': ""}
    else:
        # tiling order: h, co, m, k, n, w
        tile_hh = config.tile_h
        tile_coco = config.tile_co
        tile_mm = config.tile_m
        tile_kk = config.tile_k
        tile_nn = config.tile_n
        tile_ww = config.tile_w
        tiling_param = [tile_hh, tile_coco, tile_mm, tile_kk, tile_nn, tile_ww]
        attrs = {'conv_tile': tiling_param}
    conv_dtype = 'float16'
    block_size = 16
    in_n, in_c, in_h, in_w = op_desc.fmap_shape
    cout, _, w_h, w_w = op_desc.filter_shape
    # round channel counts up to a multiple of the 16-element block
    in_c = (in_c + block_size - 1) // block_size * block_size
    cout = (cout + block_size - 1) // block_size * block_size
    pad_top, pad_bottom, pad_left, pad_right = op_desc.pad
    stride_h, stride_w = op_desc.stride
    # forward-conv output dims: this op's data input has that shape
    out_n = in_n
    out_c = cout
    out_h = (in_h + pad_top + pad_bottom - w_h) // stride_h + 1
    out_w = (in_w + pad_left + pad_right - w_w) // stride_w + 1
    x_shape = (out_n, out_c, out_h, out_w)
    w_shape = (cout, in_c, w_h, w_w)
    in_nn, in_cc, in_hh, in_ww = x_shape
    # split C into C1 x C0 (C0 = block_size)
    input_shape_nc1hwc0 = (in_nn, in_cc // block_size,
                           in_hh, in_ww, block_size)
    k_n, k_c, k_h, k_w = w_shape
    kernel_shape_nc1hwc0 = (k_n, k_c // block_size, k_h, k_w, block_size)
    k_n, _, k_h, k_w, _ = kernel_shape_nc1hwc0
    kernel_shape_fractal = (k_c // block_size * k_h *
                            k_w, k_n // block_size, block_size, block_size)
    shape = [input_shape_nc1hwc0, kernel_shape_fractal]
    return utils.op_build(conv_backprop_input.conv_backprop_input, [shape], [conv_dtype],
                          op_attrs=[op_desc.fmap_shape, op_desc.filter_shape, op_desc.pad,
                                    op_desc.stride, op_desc.dilation, attrs],
                          kernel_name=kernel_name, attrs=attrs, polyhedral=True, tuning=gen_tiling_spaces)
def gen_kernel_conv_backprop_filter(op_desc: ConvBackpropDesc, _, index_table, config: ConvBackpropFilterConfig = None,
                                    idx=None, gen_tiling_spaces=False):
    """Compile kernel module for conv_backprop_filter

    Derives the forward-conv output shape from op_desc and converts the two
    inputs into 5D/fractal layouts before building.  The second parameter is
    an unused placeholder kept for the uniform gen_kernel_* signature.
    """
    if index_table is not None:
        raise RuntimeError('index_table should be none')
    kernel_name = "conv_backprop_filter_poly"
    if idx is not None:
        kernel_name += str(idx)
    if config is None:
        attrs = {'dim': ""}
    else:
        # tiling order: ci, kh, kw, co, batch, h, w, m, k, n
        tile_cici = config.tile_ci
        tile_khkh = config.tile_kh
        tile_kwkw = config.tile_kw
        tile_coco = config.tile_co
        tile_bb = config.tile_batch
        tile_hh = config.tile_h
        tile_ww = config.tile_w
        tile_mm = config.tile_m
        tile_kk = config.tile_k
        tile_nn = config.tile_n
        tiling_param = [tile_cici, tile_khkh, tile_kwkw, tile_coco, tile_bb, tile_hh, tile_ww,
                        tile_mm, tile_kk, tile_nn]
        attrs = {'conv_tile': tiling_param}
    conv_dtype = 'float16'
    block_size = 16
    in_n, in_c, in_h, in_w = op_desc.fmap_shape
    cout, _, w_h, w_w = op_desc.filter_shape
    # round channel counts up to a multiple of the 16-element block
    in_c = (in_c + block_size - 1) // block_size * block_size
    cout = (cout + block_size - 1) // block_size * block_size
    pad_top, pad_bottom, pad_left, pad_right = op_desc.pad
    stride_h, stride_w = op_desc.stride
    # forward-conv output dims
    out_n = in_n
    out_c = cout
    out_h = (in_h + pad_top + pad_bottom - w_h) // stride_h + 1
    out_w = (in_w + pad_left + pad_right - w_w) // stride_w + 1
    x_shape = (in_n, in_c, in_h, in_w)
    y_shape = (out_n, out_c, out_h, out_w)
    in_n, in_c, in_h, in_w = x_shape
    input_shape_nc1hwc0 = (in_n, in_c // block_size, in_h, in_w, block_size)
    o_n, o_c, o_h, o_w = y_shape
    kernel_shape_nc1hwc0 = (o_n, o_c // block_size, o_h, o_w, block_size)
    o_n, o_c1, o_h, o_w, o_c0 = kernel_shape_nc1hwc0
    # fractal M axis: H*W rounded up to whole 16-element blocks
    mo = (o_h * o_w + block_size - 1) // block_size
    mi = block_size
    kernel_shape_fractal = (o_n, o_c1, mo, mi, o_c0)
    input_shape = [kernel_shape_fractal, input_shape_nc1hwc0]
    return utils.op_build(conv_backprop_filter.conv_backprop_filter, [input_shape], [conv_dtype],
                          op_attrs=[op_desc.fmap_shape, op_desc.filter_shape, op_desc.pad,
                                    op_desc.stride, op_desc.dilation, attrs],
                          kernel_name=kernel_name, attrs=attrs, polyhedral=True, tuning=gen_tiling_spaces)
def gen_kernel_for_vector(op_desc, _, index_table=None, config: NamedTuple = None, idx=None, gen_tiling_spaces=False):
    """Compile kernel module for vector ops described by a test-case record.

    Args:
        op_desc: test-case record consumed by TestBase.ana_args.
        _: unused placeholder (kept for the uniform gen_kernel_* signature).
        index_table: per-dim index prefixes combined with config tiling values.
        config: namedtuple whose 'tiling*' fields give the dim sizes.
        idx: optional suffix for the kernel name.
        gen_tiling_spaces: when True the build returns (mod, expect, params)
            and this function returns them appended into a single list.

    Raises:
        Exception: when compilation fails; the original error is chained.
    """
    test_base = TestBase()
    test_base.params_init(op_desc[0][0:4] + str(idx), os.getcwd())
    kernel_name = "poly_"
    if idx is not None:
        kernel_name += str(idx)
    if config is None:
        attrs = {'dim': ""}
    else:
        tiling = [[getattr(config, name), 1] for name in getattr(
            config, '_fields') if name.startswith('tiling')]
        tiling_param = []
        for i, element in enumerate(tiling):
            tiling_param.append(index_table[i] + element)
        dim_info = ct_util.set_dims(tuple(tiling_param))
        attrs = {'dim': dim_info}
    _, func, args, kwargs = test_base.ana_args(op_desc)
    # inject dim/tuning/kernel_name into the attrs kwarg when present,
    # otherwise into the first dict positional argument
    if 'attrs' in kwargs.keys():
        kwargs['attrs']['dim'] = attrs['dim']
        kwargs['attrs']['tuning'] = gen_tiling_spaces
        kwargs['attrs']['kernel_name'] = kernel_name
    else:
        for _, arg_ in enumerate(args):
            if isinstance(arg_, dict):
                arg_['dim'] = attrs['dim']
                arg_['tuning'] = gen_tiling_spaces
                arg_['kernel_name'] = kernel_name
                break
    try:
        if gen_tiling_spaces:
            mod, expect, param_for_mod = func(*args, **kwargs)
            mod = list(mod)
            mod.append(expect)
            mod.append(param_for_mod)
        else:
            mod = func(*args, **kwargs)
    except BaseException as e:
        print("Compile ERROR message:", e)
        print(func)
        print("Compile ERROR")
        # BUGFIX: chain the original exception so the real compile failure
        # stays visible to callers instead of being swallowed.
        raise Exception("Compile ERROR") from e
    return mod
def gen_kernel_batch_matmul_gpu(op_desc, _, index_table=None,
                                config: NamedTuple = None, idx=None,
                                gen_tiling_spaces=False, need_tune_json=None):
    """Build the kernel module for a GPU batch_matmul op (not implemented yet)."""
    kernel_name = "batch_matmul_gpu_"  # reserved for the future implementation
    return None
def gen_kernel_reduce_sum_gpu(op_desc, _, index_table=None,
                              config: NamedTuple = None, idx=None, gen_tiling_spaces=False, need_tune_json=None):
    """Compile kernel module for reduce_sum in gpu

    Parameters mirror the other gen_kernel_* helpers: the operator attrs
    live in op_desc[2] and may be overridden by `config` via merge_attrs.
    When gen_tiling_spaces is True, returns
    [spaces, set_dim_key, expect, [input_for_mod, output]];
    otherwise returns the built module.

    Raises
    ------
    Exception
        on compile failure; the original error is chained as __cause__.
    """
    kernel_name = "reduce_sum_gpu_"
    if idx is not None:
        kernel_name += str(idx)
    # NOTE(review): kernel_name is computed but op_build below hardcodes
    # kernel_name="reduce_sum" -- confirm whether that is intentional.
    attrs = op_desc[2]
    if config is not None:
        attrs = merge_attrs(attrs, config, need_tune_json)
    try:
        if gen_tiling_spaces:
            # NOTE: don't use this process for reduce spaces generation,
            # see function: "_get_space_reduce_gpu_manually".
            from .tiling_strategies_gpu import reduce_gpu_tiling_strategy
            spaces, set_dim_key = utils.op_build(reduce_sum, (attrs.in_shape, ),
                                                 (attrs.in_dtype, ),
                                                 kernel_name="reduce_sum",
                                                 op_attrs=[attrs.axis, attrs.keepdims],
                                                 attrs={"target": "cuda",
                                                        "enable_akg_reduce_lib": attrs.enable_akg_reduce_lib,
                                                        "enable_atomic_add": attrs.enable_atomic_add,
                                                        "custom_tiling": reduce_gpu_tiling_strategy(attrs.in_shape, attrs.axis)},
                                                 tuning=True)
            from test_ms_reduce_sum import gen_data
            input_for_mod, output, expect = gen_data(
                attrs.in_shape, attrs.in_dtype, attrs.axis, attrs.keepdims)
            return [spaces, set_dim_key, expect, [input_for_mod, output]]
        mod = utils.op_build(reduce_sum, (attrs.in_shape, ),
                             (attrs.in_dtype, ),
                             kernel_name="reduce_sum",
                             op_attrs=[attrs.axis, attrs.keepdims],
                             attrs={"target": "cuda",
                                    "enable_akg_reduce_lib": attrs.enable_akg_reduce_lib,
                                    "dim": attrs.dim,
                                    "bind_block": attrs.bind_block,
                                    "bind_thread": attrs.bind_thread,
                                    "enable_atomic_add": attrs.enable_atomic_add})
        return mod
    except BaseException as e:
        print("Compile ERROR message:", e)
        print(reduce_sum)
        print("Compile ERROR")
        # chain the original exception so the real compile failure is kept
        raise Exception("Compile ERROR") from e
def gen_kernel_conv_image2col_gemm_gpu(op_desc, _, index_table=None,
                                       config: NamedTuple = None, idx=None,
                                       gen_tiling_spaces=False, need_tune_json=None):
    """Build the kernel module for GPU convolution via image2col + gemm (stub)."""
    # wait for implementation
    return None
# Dispatch table: op_type -> specialized compile routine. Any op_type not
# listed here falls back to gen_kernel_for_vector (see compile_kernel).
_compile_kernel_func = dict(
    conv=gen_kernel_conv,
    conv_bn1=gen_kernel_conv_bn1,
    conv_backprop_input=gen_kernel_conv_backprop_input,
    conv_backprop_filter=gen_kernel_conv_backprop_filter,
    matmul=gen_kernel_matmul_cube,
    reduce_sum_gpu=gen_kernel_reduce_sum_gpu,
    batch_matmul_gpu=gen_kernel_batch_matmul_gpu,
    conv_image2col_gemm_gpu=gen_kernel_conv_image2col_gemm_gpu,
)
def compile_kernel(op_type: str, op_desc: NamedTuple, input_shape=None, index_table=None,
                   config_param: NamedTuple = None, idx: int = None, gen_tiling_spaces: bool = False, need_tune_json=None):
    """Generate kernel module for operator

    Parameters
    ----------
    op_type: str
        operator name
    op_desc: NamedTuple
        operator definition parameters
    config_param: NamedTuple
        operator config parameters
    idx: int
        operator idx(th) kernel
    gen_tiling_spaces: bool
        parameter passed to utils.op_build, whether to get spaces instead of stmt
    ----------
    Returns:
        [space_res, key, expect, input_for_mod] when gen_tiling_spaces is True,
        otherwise the built kernel module
    """
    # unknown op types are handled by the generic vector path
    gen_func = _compile_kernel_func.get(op_type, gen_kernel_for_vector)
    if gen_tiling_spaces:
        space_res, key, expect, input_for_mod = gen_func(
            op_desc, input_shape, index_table, config_param, idx, gen_tiling_spaces)
        return [space_res, key, expect, input_for_mod]
    if "gpu" in op_type:
        # gpu compile routines additionally consume the tuning json
        return gen_func(op_desc, input_shape, index_table, config_param,
                        idx, gen_tiling_spaces, need_tune_json=need_tune_json)
    return gen_func(op_desc, input_shape, index_table, config_param,
                    idx, gen_tiling_spaces)
| @@ -0,0 +1,243 @@ | |||||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| """Runner for compile and execute a configs of an operator on device""" | |||||
| import time | |||||
| import multiprocessing | |||||
| import logging | |||||
| import json | |||||
| import os | |||||
| import subprocess | |||||
| import time | |||||
| from typing import NamedTuple | |||||
| import numpy as np | |||||
| from akg import composite | |||||
| from akg.utils import custom_tiling as ct_util | |||||
| from akg.utils import kernel_exec as utils | |||||
| from .kernel_compiler import compile_kernel | |||||
| from .test_data_generators import gen_data | |||||
| from .tuning_utils import * | |||||
logger = logging.getLogger('fuzz.tune.autotuning.runner')

# Sentinel "run times" reported when a config cannot produce a real timing;
# the huge magnitudes guarantee any genuine measurement beats them.
run_failed_time = 9999999999.0
precision_error_time = 9999999998.0
compile_fail_time = 9999999997.0
timeout_time = 9999999996.0
error_time_list = [run_failed_time, precision_error_time,
                   compile_fail_time, timeout_time]
# Human-readable label for each sentinel, used when logging results.
error_time_string = dict(zip(error_time_list,
                             ('run_failed', 'precision_error',
                              'compile_failed', 'timeout')))
def get_attr_from_config(config, index_table):
    """Translate a tuning config namedtuple into a build-attribute dict.

    Fields whose name starts with 'tiling' are collected as [value, 1]
    pairs, prefixed with their index_table entry and folded into
    attrs['dim'] via ct_util.set_dims; every other field is copied into
    the returned dict unchanged.
    """
    attrs = {}
    tile_values = []
    for name, value in config._asdict().items():
        if name.startswith('tiling'):
            tile_values.append([value, 1])
        else:
            attrs[name] = value
    if tile_values:
        tiling_param = [index_table[i] + pair
                        for i, pair in enumerate(tile_values)]
        attrs['dim'] = ct_util.set_dims(tuple(tiling_param))
    else:
        print("No tiling info. Use auto tiling.")
    return attrs
class KernelRunner:
    """Compile-and-measure runner for batches of tuning configs.

    Each config of an operator is compiled and launched in its own
    subprocess; the observed run times (or failure sentinels from
    error_time_list) are collected into a shared list.

    Parameters
    ----------
    op_type: str
        The name of operator ("json"/"extra_tune" take the composite path)
    op_desc: NamedTuple
        The definition parameters of operator
    index_table: list
        per-dimension prefixes used by get_attr_from_config
    self_attrs: list
        extra operator attributes (stored; not used in the code visible here)
    timeout: int
        Timeout in seconds for running one config
    repeat_times: int
        Run one config repeat_times
    input_data / expect / mod_output_param:
        optional pre-generated inputs, expected outputs and output params;
        generated via gen_data when input_data is None
    """
    def __init__(self, op_type: str, op_desc: NamedTuple,
                 index_table: list, self_attrs: list, timeout: int = 600,
                 repeat_times: int = 2, input_data=None,
                 expect=None, mod_output_param=None, is_all_space=True,
                 skip_config_set=None, need_tune_json=None):
        self.op_type = op_type
        self.op_desc = op_desc
        self._index_table = index_table
        self.self_attrs = self_attrs
        self.run_kernel_time = 0.0   # cumulative wall time spent in run()
        self.tune_self_attrs = True
        self.timeout = timeout
        self.repeat_times = repeat_times
        self.mod_output_param = mod_output_param
        self.is_all_space = is_all_space
        self.skip_config_set = skip_config_set
        self.need_tune_json = need_tune_json
        if input_data is None:
            # generate inputs/expected outputs for this op; a dict result
            # additionally carries the output-parameter description
            self.input, self.expect = gen_data(op_type, op_desc)
            if isinstance(self.input, dict):
                self.input, self.mod_output_param = self.input['args'], self.input['outputs']
        else:
            self.input, self.expect = input_data, expect
        # assumes every element of self.input has a .shape (e.g. ndarray)
        self.input_shape = [x.shape for x in self.input]

    def info(self):
        # simple diagnostic dump of the accumulated run time
        print('run kernel time:', self.run_kernel_time)

    def run_one_kernel(self, run_times, idx, config, best_time=np.inf, is_auto=False):
        """Compile and execute a config of the operator on device.

        Runs inside a subprocess; results are written to run_times[idx]
        (a Manager list shared with the parent).
        NOTE(review): best_time is accepted but never read in this body.
        """
        # configs already seen are skipped and marked with -1
        if json.dumps(config.input._asdict()) in self.skip_config_set:
            print("CONFIG SKIP:", json.dumps(config.input._asdict()))
            run_times[idx] = -1
            return
        time_one_kernel_start = time.time()
        logger.debug('compile %dth kernel', idx)
        # round-robin assignment of this kernel to an available GPU
        gpu_devices_list = get_available_gpu_num()
        device_id = gpu_devices_list[idx % len(gpu_devices_list)]
        logger.debug('run %dth kernel', idx)
        logger.debug('++++++++++++++++++++++=device_id')
        logger.debug(device_id)
        logger.debug('++++++++++++++++++++++=device_id')
        try:
            time_start_build = time.time()
            logger.debug(config)
            if self.op_type in ["json", "extra_tune"]:
                if is_auto:
                    # auto mode: let composite.build pick tiling itself
                    mod = composite.build(self.op_desc)
                    if self.op_type == "extra_tune":
                        # NOTE(review): mutates the process environment --
                        # presumably to disable repo tiling; confirm intent
                        del os.environ['MS_GRAPH_KERNEL_TILING']
                else:
                    attrs = get_attr_from_config(
                        config.input, self._index_table)
                    if os.environ['RUNTIME_MODE'] == "gpu":
                        attrs['target'] = "cuda"
                    mod = composite.build(self.op_desc, attrs, use_repo=False)
            else:
                mod = compile_kernel(self.op_type, self.op_desc, self.input_shape, self._index_table,
                                     None if is_auto else config.input, idx, need_tune_json=self.need_tune_json)
            time_end_build = time.time()
            logger.debug("build module time: %f",
                         time_end_build - time_start_build)
            logger.debug('finished compile %dth kernel', idx)
        except BaseException as e:
            logger.debug("Compile Failed: [%s] : %s", "origin" if is_auto else str(
                config.input), str(e))
            run_times[idx] = compile_fail_time
            return
        # default to "run failed" until a real measurement lands
        run_times[idx] = run_failed_time
        try:
            # NOTE: in gpu tuning, it is no need to use this repeat_times,
            # repeat_time has been setted in mod_launch in tuning mode.
            for _ in range(self.repeat_times):
                stat_info = {}
                try:
                    time_start_launch = time.time()
                    if self.mod_output_param is not None:
                        # NOTE(review): this branch leaves stat_info empty, so
                        # the np.minimum line below would raise KeyError --
                        # confirm whether this path is ever taken
                        pass
                    else:
                        output, stat_info = utils.mod_launch(
                            mod, self.input, tuning=True, device_id=device_id, repeat_time=40)
                        # loose tolerance comparison against the expected output
                        if not np.allclose(output, self.expect, rtol=5e-03, atol=5e-03, equal_nan=True):
                            stat_info['run_time'] = precision_error_time
                            logger.debug("Precision Error: [%s]",
                                         "origin" if config is None else str(config.input))
                    time_end_launch = time.time()
                    logger.debug("mod launch time: %f",
                                 time_end_launch - time_start_launch)
                except BaseException as e:
                    logger.debug("Run Failed: [%s] : %s", str(
                        config.input), str(e))
                    stat_info['run_time'] = run_failed_time
                # keep the best (smallest) time across repeats
                run_times[idx] = np.minimum(
                    run_times[idx], stat_info['run_time'])
        finally:
            logger.debug('end of %dth kernel', idx)
            time_one_kernel_end = time.time()
            logger.debug('run one kernel time: %f',
                         time_one_kernel_end - time_one_kernel_start)
        return

    def run(self, configs, best_time=np.inf, is_auto_set_dim=False, all_space=False):
        """Compile and execute a batch config of the operator on device.

        Spawns one subprocess per config, joins them with a shared timeout,
        and returns the shared run_times list (sentinels for failures).
        """
        start = time.time()
        logger.setLevel(logging.DEBUG)
        logger.debug("gen cce kernels batch: %d kernels", len(configs))
        subprocess.run("rm -rf ./jobs/JOB*", shell=True)
        process_jobs = []
        # shared list, pre-filled with the compile-failure sentinel
        run_times = multiprocessing.Manager().list(
            np.full((len(configs),), compile_fail_time))
        for idx, config in enumerate(configs):
            p = multiprocessing.Process(target=self.run_one_kernel,
                                        args=(run_times, idx, config, best_time, is_auto_set_dim))
            process_jobs.append(p)
            p.start()
        timeout_error = False
        # once one process times out, later ones are only polled (no join wait)
        for idx, p in enumerate(process_jobs):
            if not timeout_error:
                p.join(timeout=self.timeout)
            if p.is_alive():
                timeout_error = True
                logger.debug("Timeout Error: [%s]", str(configs[idx].input))
                run_times[idx] = timeout_time
                p.terminate()
        process_end = time.time()
        logger.debug("process time: %f", process_end - start)
        # clean the profiling directory
        # NOTE(review): tune_device/tune_num are read but unused below --
        # possibly leftovers of a non-gpu cleanup path; confirm
        tune_device = int(os.environ['DEVICE_ID'])
        tune_num = int(os.environ['DEVICE_TOTAL_NUM'])
        if os.environ['RUNTIME_MODE'] == "gpu":
            subprocess.run("rm -rf cuda_meta_*", shell=True)
        else:
            pass
        end = time.time()
        logger.debug("run kernels time: %f", end - start)
        self.run_kernel_time += end - start
        for idx, config in enumerate(configs):
            if run_times[idx] not in error_time_list:
                logger.debug("KernelRunTime : [%s] : %s", str(
                    configs[idx].input), str(run_times[idx]))
            else:
                logger.debug("KernelRunTime : [%s] : %s",
                             str(configs[idx].input), str(error_time_string[run_times[idx]]))
        return run_times
| @@ -0,0 +1,217 @@ | |||||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| """Config space""" | |||||
| from abc import ABCMeta, abstractmethod | |||||
| from typing import NamedTuple, List | |||||
| import random | |||||
| import numpy as np | |||||
class ConfigEntity:
    """A single immutable point in a config search space.

    Wraps one namedtuple of tuning parameters together with its index in
    the owning space and the namedtuple type itself.
    """

    def __init__(self, input_id: int, input_space: NamedTuple):
        # internal state: the raw tuple, its id in the space, and its type
        self._space = input_space
        self._id = input_id
        self._type = type(input_space)

    def __len__(self):
        return len(self._space)

    def __str__(self):
        return '{}: {}'.format(self._id, self._space)

    def __repr__(self):
        return self.__str__()

    @property
    def input_id(self):
        return self._id

    @property
    def input_type(self):
        return self._type

    @property
    def input(self):
        return self._space

    @property
    def feature(self):
        # the feature representation is the raw input tuple itself
        return self._space
class ConfigSpace(metaclass=ABCMeta):
    """Abstract base for a search space of tuning configs.

    Subclasses decide how configs are stored and fetched; this base keeps
    the shared bookkeeping: the config input type, its dimension names and
    the list of known ConfigEntity objects.
    """

    def __init__(self, input_type):
        self._input_type = input_type
        self._dim_names = getattr(self._input_type, '_fields')
        # every ConfigEntity registered so far, indexed by input_id
        self._configs = []

    @abstractmethod
    def reset_fetch(self):
        """restart fetching over the whole space"""

    @abstractmethod
    def has_next(self) -> bool:
        """whether any config is left to fetch"""

    @abstractmethod
    def fetch_index(self) -> int:
        """fetch a random index of config"""

    @abstractmethod
    def fetch_config(self) -> ConfigEntity:
        """fetch a random config"""

    @abstractmethod
    def random_walk(self, p: int) -> int:
        """find a neighbor hood of the p-th ConfigEntity, which only
        differs with p in at most one dimension"""

    def get(self, idx: int) -> ConfigEntity:
        """get the `idx`-th config of the space"""
        return self._configs[idx]

    @property
    def configs(self):
        return self._configs

    @property
    def dim_names(self):
        return self._dim_names

    @property
    def input_type(self):
        return self._input_type

    @property
    def length(self):
        return len(self.configs)
class ConfigTrie:
    """Trie over config tuples (with one dimension skipped) -> config ids."""

    def __init__(self):
        # non-leaf: dict mapping dim value -> child ConfigTrie;
        # leaf: list of config input_ids (converted lazily in add)
        self.ch = dict()

    def add(self, config: ConfigEntity, last_dim: int):
        """add a ConfigEntity"""
        node = self
        for dim, val in enumerate(config.input):
            if dim == last_dim:
                continue
            if val not in node.ch:
                node.ch[val] = ConfigTrie()
            if not isinstance(node.ch, dict):
                raise TypeError('none-leaf node should have a dict of childs')
            node = node.ch[val]
        # terminal node stores the list of matching config ids
        if not isinstance(node.ch, list):
            node.ch = []
        node.ch.append(config.input_id)

    def fetch_random(self, config: ConfigEntity, last_dim: int) -> int:
        """randomly fetch the index of a ConfigEntity the same with `config` except for the `last_dim`-th dimension"""
        node = self
        for dim, val in enumerate(config.input):
            if dim == last_dim:
                continue
            if not isinstance(node.ch, dict):
                raise TypeError('none leaf node should have a dict of childs')
            if val not in node.ch:
                raise RuntimeError('no element found')
            node = node.ch[val]
        if not node.ch:
            raise RuntimeError('no element found')
        if len(node.ch) == 1:
            return node.ch[0]
        # several candidates: pick one that differs from config itself
        picked = config.input_id
        while picked == config.input_id:
            picked = random.choice(node.ch)
        return picked
class ListConfigSpace(ConfigSpace):
    """Searching space of configs, which stores all possible configs in a list"""

    def __init__(self, input_type):
        super(ListConfigSpace, self).__init__(input_type)
        # one trie per dimension i, indexing configs by every dim except i
        # (used by random_walk to find one-dimension neighbors)
        self.__config_tries = [ConfigTrie() for _ in range(len(self._dim_names))]
        # indices of configs not yet fetched in the current sweep
        self.__fetch_pool = []

    def reset_fetch(self):
        """reset fetch state"""
        self.__fetch_pool = [i for i in range(len(self._configs))]

    def fetch_scope(self, start, end):
        # restrict fetching to config indices in [start, end)
        self.__fetch_pool = [i for i in range(start, end)]

    def has_next(self) -> bool:
        return len(self.__fetch_pool) > 0

    def fetch_index(self) -> int:
        """fetch a random index of config"""
        # swap-and-pop: O(1) removal of a random element from the pool
        idx = np.random.randint(len(self.__fetch_pool))
        ret = self.__fetch_pool[idx]
        self.__fetch_pool[idx] = self.__fetch_pool[-1]
        self.__fetch_pool.pop()
        return ret

    def fetch_next_index(self) -> int:
        """fetch next index of config"""
        # NOTE(review): assumes __fetch_pool is a contiguous ascending range
        # (as built by reset_fetch/fetch_scope); yields its last value and
        # shrinks the pool -- confirm this is never mixed with fetch_index
        idx = len(self.__fetch_pool) - 1 + self.__fetch_pool[0]
        self.__fetch_pool.pop()
        return idx

    def fetch_config(self) -> ConfigEntity:
        """fetch a random config"""
        return self.get(self.fetch_index())

    def add(self, input_space: NamedTuple):
        """add a new config to space"""
        if not isinstance(input_space, self._input_type):
            raise TypeError('invalid config input space type, got {} expected {}'.format(type(input_space),
                                                                                         self._input_type))
        config = ConfigEntity(len(self._configs), input_space)
        # the new config is immediately fetchable
        self.__fetch_pool.append(len(self._configs))
        for i in range(len(self._dim_names)):
            self.__config_tries[i].add(config, i)
        self._configs.append(config)

    def random_walk(self, p: int) -> int:
        """find a neighbor hood of the p-th ConfigEntity, which only differs with p in at most one dimension"""
        dim = np.random.randint(len(self._dim_names))
        return self.__config_tries[dim].fetch_random(self._configs[p], dim)

    @property
    def length(self):
        return len(self._configs)

    @classmethod
    def from_list(cls, configs: List[NamedTuple]):
        # build a space whose input type is taken from the first element
        if not isinstance(configs, list):
            raise TypeError('configs must be of list type, got %s' % type(configs))
        if not configs:
            raise ValueError('configs must be non-empty')
        space = cls(type(configs[0]))
        for config in configs:
            space.add(config)
        return space
| @@ -0,0 +1,753 @@ | |||||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| """space generating functions for operators""" | |||||
| from functools import partial | |||||
| from typing import NamedTuple | |||||
| from collections import namedtuple | |||||
| from test_run import matmul_run | |||||
| from akg.utils import validation_check as vc_util | |||||
| from .type_definitions import ConvDesc, ConvBackpropDesc, MatmulCubeDesc, ConvConfig, ConvBackpropInputConfig, ConvBackpropFilterConfig, MatmulCubeConfig | |||||
| from .space import ListConfigSpace | |||||
| from .kernel_compiler import compile_kernel | |||||
| from .gen_spaces_gpu import _get_space_reduce_gpu_manually | |||||
| from tqdm import tqdm | |||||
| from enum import Enum | |||||
# Maps a GPU axis index to its letter name (0 -> "x", 1 -> "y", 2 -> "z").
GPU_IDX_TO_STR = dict(enumerate("xyz"))
class GpuSpacePolicy(Enum):
    """Policy to expand tile candidates with block and thread."""
    FULL = "FULL"              # expand over the full candidate set
    BMM = "BMM"                # batch-matmul-specific expansion
    REDUCE_ALL = "REDUCE_ALL"  # reduction over all axes
    REDUCE_X = "REDUCE_X"      # reduction along x
    REDUCE_Y = "REDUCE_Y"      # reduction along y
def gen_bool_list(attr_list):
    """Enumerate every True/False combination for the given attributes.

    Parameters
    ----------
    attr_list : sequence
        Only its length matters: one boolean slot is produced per element.

    Returns
    -------
    list of list of bool
        All 2**len(attr_list) combinations, with True ordered before False
        in each position (identical to the original hand-rolled expansion,
        which matches itertools.product order). An empty attr_list yields
        [] rather than [[]], preserving the original edge-case behavior.
    """
    from itertools import product  # local import keeps this fix self-contained
    if not attr_list:
        return []
    return [list(combo) for combo in product([True, False], repeat=len(attr_list))]
def _get_space_vector(op_type: str, op_desc):
    """get config space of vector operator"""
    # ask the compiler for the raw tuning space of this op
    space_res, key, expect, input_for_mod = compile_kernel(
        op_type, op_desc, None, None, None, 0, gen_tiling_spaces=True)
    if space_res is None:
        raise RuntimeError('no space returned')
    if 'index' not in space_res or 'tuning_space' not in space_res:
        raise RuntimeError('invalid space returned')
    index_table = space_res['index']
    tiling_spaces = space_res['tuning_space']
    if not tiling_spaces:
        raise RuntimeError('empty tiling spaces')
    # one 'tiling_i' field per dimension of the first candidate
    dim_count = len(tiling_spaces[0])
    input_type = namedtuple(op_type, ['tiling_' + str(i) for i in range(dim_count)])
    space = ListConfigSpace(input_type)
    for candidate in tiling_spaces:
        space.add(input_type(*candidate))
    return index_table, space, key, expect, input_for_mod
def _get_space_conv(op_desc: ConvDesc):
    """get config space of convolution

    Enumerates every (tile_h, tile_co, tile_m, tile_k, tile_n, tile_w,
    bypass) combination that fits the on-chip buffer budgets below and
    adds it as a ConvConfig to a ListConfigSpace.
    Returns (None, config_space, str(op_desc), None, None) to match the
    other _get_space_* helpers' 5-tuple shape.
    """
    if not isinstance(op_desc, ConvDesc):
        raise TypeError('op_desc must be ConvDesc')
    stride_ = op_desc.stride
    pad_ = op_desc.pad
    dilation_ = op_desc.dilation
    vc_util.convolution_format_check(
        op_desc.fmap_shape, op_desc.filter_shape, pad_, stride_, dilation_)
    config_space = ListConfigSpace(ConvConfig)
    # if double buff is not enabled, set it's value to 1
    size_scale = 1
    # byte budgets per buffer -- names suggest L1/L0A/L0B/L0C on-chip
    # memories (presumably Davinci/Ascend sizes; confirm against HW spec)
    l1_max_size = (1024 * 1024) // size_scale
    l0a_max_size = (64 * 1024) // size_scale
    l0b_max_size = (64 * 1024) // size_scale
    l0c_max_size = ((256 - 8) * 1024) // size_scale // 2
    _, in_c, in_h, in_w = op_desc.fmap_shape
    k_n, _, k_h, k_w = op_desc.filter_shape
    padding = (pad_[0], pad_[1], pad_[2], pad_[3])
    p_top, p_bottom, p_left, p_right = padding
    s_h, s_w = stride_
    # round channels up to a multiple of 16 (block alignment)
    in_c = ((in_c - 1) // 16 + 1) * 16
    tile_c = in_c
    tile_co_start = 16
    data_len = 2  # bytes per element (fp16)
    # largest usable tile extents and output window sizes
    h_max = in_h + p_top + p_bottom
    win_h = (h_max - k_h) // s_h + 1
    h_max = (h_max - k_h) // s_h * s_h + k_h
    w_max = in_w + p_left + p_right
    win_w = (w_max - k_w) // s_w + 1
    w_max = (w_max - k_w) // s_w * s_w + k_w
    bypass_options = [0, 1]
    for bypass in bypass_options:
        for tile_h in range(h_max, k_h - 1, -s_h):
            size_h = tile_h
            if tile_h == h_max:
                # full-height tile: also sweep the width dimension
                w_range = range(w_max, k_w - 1, -s_w)
                size_h = in_h
            else:
                w_range = [w_max]
                win_tile_h = (tile_h - k_h) // s_h + 1
                h_tiles = (win_h + win_tile_h - 1) // win_tile_h
                if h_tiles == 2:
                    # exactly two h-tiles: size is the larger of the halves
                    size_h = max(tile_h - p_top, in_h +
                                 p_top - tile_h + k_h - s_h)
            for tile_w in w_range:
                size_w = tile_w
                if size_w == w_max:
                    size_w = in_w
                else:
                    win_tile_w = (tile_w - k_w) // s_w + 1
                    w_tiles = (win_w + win_tile_w - 1) // win_tile_w
                    if w_tiles == 2:
                        size_w = max(tile_w - p_left, in_w +
                                     p_left - tile_w + k_w - s_w)
                k_n_ = ((k_n - 1) // 16 + 1) * 16
                co_range = range(k_n_, tile_co_start - 1, -16)
                for tile_co in co_range:
                    if bypass == 1:
                        # bypass mode: weights skip L1, only feature map counts
                        if tile_co != k_n:
                            continue
                        l1_size = data_len * (size_h * size_w * in_c)
                    else:
                        l1_size = data_len * (size_h * size_w * in_c +
                                              tile_co * tile_c * k_h * k_w)
                    if l1_size > l1_max_size:
                        continue
                    tile_co_ = ((tile_co - 1) // 16 + 1) * 16
                    for tile_n in range(tile_co_, 15, -16):
                        # K axis bounded by both the problem and L0B capacity
                        k_max = in_c * k_h * k_w
                        k_max_ = ((k_max - 1) // 16 + 1) * 16
                        k_size = l0b_max_size // data_len // tile_n
                        k_size_ = k_size // 16 * 16
                        for tile_k in range(min(k_max_, k_size_), 15, -16):
                            # M axis bounded by the output window, L0A and L0C
                            m_max = (int(((tile_h - k_h) // (s_h)) + 1)) * \
                                (int(((tile_w - k_w) // (s_w)) + 1))
                            m_max_ = ((m_max - 1) // 16 + 1) * 16
                            m_size1 = l0a_max_size // data_len // tile_k
                            m_size1_ = m_size1 // 16 * 16
                            m_size2 = l0c_max_size // data_len // tile_n
                            m_size2_ = m_size2 // 16 * 16
                            for tile_m in range(min(m_max_, m_size1_, m_size2_), 15, -16):
                                config_space.add(ConvConfig(tile_h, tile_co, tile_m, tile_k,
                                                            tile_n, tile_w, bypass))
    return None, config_space, op_desc.__str__(), None, None
def _get_space_conv_bn1(op_desc: ConvDesc):
    """get config space of convolution fused with bn1

    Nearly identical to _get_space_conv; the only difference is the L0C
    budget divided by an extra 4 (presumably because the fused bn1 outputs
    share L0C -- confirm). Returns the same 5-tuple shape:
    (None, config_space, str(op_desc), None, None).
    """
    if not isinstance(op_desc, ConvDesc):
        raise TypeError('op_desc must be ConvDesc')
    stride_ = op_desc.stride
    pad_ = op_desc.pad
    dilation_ = op_desc.dilation
    vc_util.convolution_format_check(
        op_desc.fmap_shape, op_desc.filter_shape, pad_, stride_, dilation_)
    config_space = ListConfigSpace(ConvConfig)
    # if double buff is not enabled, set it's value to 1
    size_scale = 1
    # buffer budgets as in _get_space_conv, with L0C quartered for bn1
    l1_max_size = (1024 * 1024) // size_scale
    l0a_max_size = (64 * 1024) // size_scale
    l0b_max_size = (64 * 1024) // size_scale
    l0c_max_size = ((256 - 8) * 1024) // size_scale // 2 // 4
    _, in_c, in_h, in_w = op_desc.fmap_shape
    k_n, _, k_h, k_w = op_desc.filter_shape
    padding = (pad_[0], pad_[1], pad_[2], pad_[3])
    p_top, p_bottom, p_left, p_right = padding
    s_h, s_w = stride_
    # round channels up to a multiple of 16 (block alignment)
    in_c = ((in_c - 1) // 16 + 1) * 16
    tile_c = in_c
    tile_co_start = 16
    data_len = 2  # bytes per element (fp16)
    h_max = in_h + p_top + p_bottom
    win_h = (h_max - k_h) // s_h + 1
    h_max = (h_max - k_h) // s_h * s_h + k_h
    w_max = in_w + p_left + p_right
    win_w = (w_max - k_w) // s_w + 1
    w_max = (w_max - k_w) // s_w * s_w + k_w
    bypass_options = [0, 1]
    for bypass in bypass_options:
        h_range = range(h_max, k_h - 1, -s_h)
        for tile_h in h_range:
            size_h = tile_h
            if tile_h == h_max:
                # full-height tile: also sweep the width dimension
                w_range = range(w_max, k_w - 1, -s_w)
                size_h = in_h
            else:
                w_range = [w_max]
                win_tile_h = (tile_h - k_h) // s_h + 1
                h_tiles = (win_h + win_tile_h - 1) // win_tile_h
                if h_tiles == 2:
                    size_h = max(tile_h - p_top, in_h +
                                 p_top - tile_h + k_h - s_h)
            for tile_w in w_range:
                size_w = tile_w
                if size_w == w_max:
                    size_w = in_w
                else:
                    win_tile_w = (tile_w - k_w) // s_w + 1
                    w_tiles = (win_w + win_tile_w - 1) // win_tile_w
                    if w_tiles == 2:
                        size_w = max(tile_w - p_left, in_w +
                                     p_left - tile_w + k_w - s_w)
                k_n_ = ((k_n - 1) // 16 + 1) * 16
                co_range = range(k_n_, tile_co_start - 1, -16)
                for tile_co in co_range:
                    if bypass == 1:
                        # bypass mode: weights skip L1, only feature map counts
                        if tile_co != k_n:
                            continue
                        l1_size = data_len * (size_h * size_w * in_c)
                    else:
                        l1_size = data_len * (size_h * size_w * in_c +
                                              tile_co * tile_c * k_h * k_w)
                    if l1_size > l1_max_size:
                        continue
                    tile_co_ = ((tile_co - 1) // 16 + 1) * 16
                    for tile_n in range(tile_co_, 15, -16):
                        # K axis bounded by both the problem and L0B capacity
                        k_max = in_c * k_h * k_w
                        k_max_ = ((k_max - 1) // 16 + 1) * 16
                        k_size = l0b_max_size // data_len // tile_n
                        k_size_ = k_size // 16 * 16
                        for tile_k in range(min(k_max_, k_size_), 15, -16):
                            # M axis bounded by the output window, L0A and L0C
                            m_max = (int(((tile_h - k_h) // (s_h)) + 1)) * \
                                (int(((tile_w - k_w) // (s_w)) + 1))
                            m_max_ = ((m_max - 1) // 16 + 1) * 16
                            m_size1 = l0a_max_size // data_len // tile_k
                            m_size1_ = m_size1 // 16 * 16
                            m_size2 = l0c_max_size // data_len // tile_n
                            m_size2_ = m_size2 // 16 * 16
                            for tile_m in range(min(m_max_, m_size1_, m_size2_), 15, -16):
                                config_space.add(ConvConfig(tile_h, tile_co, tile_m, tile_k,
                                                            tile_n, tile_w, bypass))
    return None, config_space, op_desc.__str__(), None, None
def _get_space_conv_backprop_input(op_desc: ConvBackpropDesc):
    """Get the config space of convolution backprop-input (dx) tiling.

    Enumerates (tile_h, tile_co, tile_m, tile_k, tile_n, tile_w) candidates
    that fit the on-chip buffer budgets and adds each one to a
    ``ListConfigSpace`` of ``ConvBackpropInputConfig``.

    Returns: (None, config_space, str(op_desc), None, None).
    Raises: TypeError if op_desc is not a ConvBackpropDesc.
    """
    if not isinstance(op_desc, ConvBackpropDesc):
        # Bug fix: message previously said 'ConvDesc' although the check is
        # against ConvBackpropDesc.
        raise TypeError('op_desc must be ConvBackpropDesc')
    stride_ = op_desc.stride
    pad_ = op_desc.pad
    dilation_ = op_desc.dilation
    vc_util.convolution_format_check(
        op_desc.fmap_shape, op_desc.filter_shape, pad_, stride_, dilation_)
    config_space = ListConfigSpace(ConvBackpropInputConfig)
    # if double buff is not enabled, set its value to 1
    size_scale = 1
    block_size = 16
    # On-chip buffer budgets in bytes (halved for L0C: fp32 accumulation).
    l1_max_size = (1024 * 1024) // size_scale
    l0a_max_size = (64 * 1024) // size_scale
    l0b_max_size = (64 * 1024) // size_scale
    l0c_max_size = ((256 - 8) * 1024) // size_scale // 2
    ub_max_size = l0c_max_size
    _, in_c, in_h, in_w = op_desc.fmap_shape
    k_n, _, k_h, k_w = op_desc.filter_shape
    # Round channel counts up to a multiple of the cube block size.
    in_c = (in_c + block_size - 1) // block_size * block_size
    k_n = (k_n + block_size - 1) // block_size * block_size
    pad_top, pad_bottom, pad_left, pad_right = pad_
    stride_h, stride_w = stride_
    out_c = k_n
    # Forward output extent, then re-expanded by stride (backprop input walks
    # the stride-dilated output grid).
    out_h = (in_h + pad_top + pad_bottom - k_h) // stride_h + 1
    out_w = (in_w + pad_left + pad_right - k_w) // stride_w + 1
    out_h = out_h * stride_h
    out_w = out_w * stride_w
    # Effective pads of the equivalent transposed convolution.
    p_top = k_h - pad_[0] - 1
    p_bottom = in_h + pad_[0] - stride_[0] * \
        ((in_h + pad_[0] + pad_[1] - k_h) // stride_[0] + 1)
    p_left = k_w - pad_[2] - 1
    p_right = in_w + pad_[2] - stride_[1] * \
        ((in_w + pad_[2] + pad_[3] - k_w) // stride_[1] + 1)
    # The equivalent convolution runs with unit stride.
    s_h = 1
    s_w = 1
    tile_c = out_c
    tile_co_start = 16
    data_len = 2  # bytes per fp16 element
    h_max = out_h + p_top + p_bottom
    win_h = (h_max - k_h) // s_h + 1
    h_max = (h_max - k_h) // s_h * s_h + k_h
    w_max = out_w + p_left + p_right
    win_w = (w_max - k_w) // s_w + 1
    w_max = (w_max - k_w) // s_w * s_w + k_w
    for tile_h in range(h_max, k_h - 1, -s_h):
        size_h = tile_h
        if tile_h == h_max:
            # Full-height tile: sweep all W tile sizes.
            w_range = range(w_max, k_w - 1, -s_w)
            size_h = in_h
        else:
            w_range = [w_max]
            win_tile_h = (tile_h - k_h) // s_h + 1
            h_tiles = (win_h + win_tile_h - 1) // win_tile_h
            if h_tiles == 2:
                # Two tiles along H: L1 must hold the larger of head/tail.
                size_h = max(tile_h - p_top, in_h + p_top - tile_h + k_h - s_h)
        for tile_w in w_range:
            size_w = tile_w
            if size_w == w_max:
                size_w = in_w
            else:
                win_tile_w = (tile_w - k_w) // s_w + 1
                w_tiles = (win_w + win_tile_w - 1) // win_tile_w
                if w_tiles == 2:
                    size_w = max(tile_w - p_left, in_w +
                                 p_left - tile_w + k_w - s_w)
            k_n_ = ((k_n - 1) // 16 + 1) * 16
            co_range = range(k_n_, tile_co_start - 1, -16)
            for tile_co in co_range:
                # L1 holds the input tile plus the filter tile.
                l1_size = data_len * (size_h * size_w * out_c +
                                      tile_co * tile_c * k_h * k_w)
                if l1_size > l1_max_size:
                    continue
                ub_size = data_len * (size_h * size_w * out_c)
                if ub_size > ub_max_size:
                    continue
                tile_co_ = ((tile_co - 1) // 16 + 1) * 16
                for tile_n in range(tile_co_, 15, -16):
                    k_max = out_c * k_h * k_w
                    # K axis is stepped in whole filter windows (16*k_h*k_w).
                    k_base = 16 * k_h * k_w
                    k_max_ = ((k_max - 1) // k_base + 1) * k_base
                    k_size = l0b_max_size // data_len // tile_n
                    k_size_ = k_size // k_base * k_base
                    for tile_k in range(min(k_max_, k_size_), k_base - 1, -k_base):
                        m_max = (int(((tile_h - k_h) // (s_h)) + 1)) * \
                            (int(((tile_w - k_w) // (s_w)) + 1))
                        m_max_ = ((m_max - 1) // 16 + 1) * 16
                        m_size1 = l0a_max_size // data_len // tile_k
                        m_size1_ = m_size1 // 16 * 16
                        m_size2 = l0c_max_size // data_len // tile_n
                        m_size2_ = m_size2 // 16 * 16
                        for tile_m in range(min(m_max_, m_size1_, m_size2_), 15, -16):
                            config_space.add(ConvBackpropInputConfig(tile_h, tile_co, tile_m,
                                                                     tile_k, tile_n, tile_w))
    return None, config_space, op_desc.__str__(), None, None
def _get_space_conv_backprop_filter(op_desc: ConvBackpropDesc):
    """Get the config space of convolution backprop filter (dw).

    Enumerates tiling candidates that fit the on-chip buffer budgets and
    adds each one as a ``ConvBackpropFilterConfig``.

    Returns: (None, config_space, str(op_desc), None, None).
    Raises: TypeError if op_desc is not a ConvBackpropDesc.
    """
    if not isinstance(op_desc, ConvBackpropDesc):
        raise TypeError('op_desc must be ConvBackpropDesc')
    stride_ = op_desc.stride
    pad_ = op_desc.pad
    dilation_ = op_desc.dilation
    vc_util.convolution_format_check(
        op_desc.fmap_shape, op_desc.filter_shape, pad_, stride_, dilation_)
    config_space = ListConfigSpace(ConvBackpropFilterConfig)
    # if double buff is not enabled, set its value to 1
    size_scale = 1
    block_size = 16
    # On-chip buffer budgets in bytes (L0C halved: fp32 accumulation).
    l1_max_size = (1024 * 1024) // size_scale
    l0a_max_size = (64 * 1024) // size_scale
    l0b_max_size = (64 * 1024) // size_scale
    l0c_max_size = ((256 - 8) * 1024) // size_scale // 2
    in_n, in_c, in_h, in_w = op_desc.fmap_shape
    cout, _, k_h, k_w = op_desc.filter_shape
    k_n = cout
    # Round channel counts up to a multiple of the cube block size.
    in_c = (in_c + block_size - 1) // block_size * block_size
    cout = (cout + block_size - 1) // block_size * block_size
    pad_top, pad_bottom, pad_left, pad_right = pad_
    s_h, s_w = stride_
    tile_co_start = 16
    tile_ci_start = 16
    data_len = 2  # bytes per fp16 element
    h_max = in_h + pad_top + pad_bottom
    win_h = (h_max - k_h) // s_h + 1
    h_max = (h_max - k_h) // s_h * s_h + k_h
    w_max = in_w + pad_left + pad_right
    win_w = (w_max - k_w) // s_w + 1
    w_max = (w_max - k_w) // s_w * s_w + k_w
    for tile_h in range(h_max, k_h - 1, -s_h):
        size_h = tile_h
        win_tile_h = (tile_h - k_h) // s_h + 1
        # Only one head for cut H axis
        if win_tile_h * s_h < pad_top:
            continue
        # Only one tail for cut H axis
        if (((win_h + win_tile_h - 1) // win_tile_h - 1) * win_tile_h - 1) * s_h + k_h > in_h + pad_top:
            continue
        if tile_h == h_max:
            # Full-height tile: sweep all W tile sizes.
            w_range = range(w_max, k_w - 1, -s_w)
            size_h = in_h
        else:
            w_range = [w_max]
            h_tiles = (win_h + win_tile_h - 1) // win_tile_h
            if h_tiles == 2:
                # Two tiles along H: L1 must hold the larger of head/tail.
                size_h = max(tile_h - pad_top, in_h +
                             pad_top - tile_h + k_h - s_h)
        for tile_w in w_range:
            size_w = tile_w
            win_tile_w = (tile_w - k_w) // s_w + 1
            # Only one head for cut W axis
            if win_tile_w * s_w < pad_left:
                continue
            # Only one tail for cut W axis
            if (((win_w + win_tile_w - 1) // win_tile_w - 1) * win_tile_w - 1) * s_w + k_w > in_w + pad_left:
                continue
            if size_w == w_max:
                size_w = in_w
            else:
                w_tiles = (win_w + win_tile_w - 1) // win_tile_w
                if w_tiles == 2:
                    size_w = max(tile_w - pad_left, in_w +
                                 pad_left - tile_w + k_w - s_w)
            for tile_kh in range(k_h, 0, -1):
                for tile_kw in range(k_w, 0, -1):
                    k_n_ = ((k_n - 1) // 16 + 1) * 16
                    co_range = range(k_n_, tile_co_start - 1, -16)
                    for tile_co in co_range:
                        in_c_ = ((in_c - 1) // 16 + 1) * 16
                        ci_range = range(in_c_, tile_ci_start - 1, -16)
                        for tile_ci in ci_range:
                            tile_batch = 1
                            # L1 holds the dy tile plus the fmap tile.
                            l1_size = data_len * tile_batch * (tile_co * win_tile_h * win_tile_w +
                                                               tile_ci * size_h * size_w)
                            if l1_size > l1_max_size:
                                continue
                            if (tile_batch != in_n or tile_co != k_n_ or tile_ci != in_c_):
                                # Partial tiling: M/N are fixed by the tile,
                                # only K is swept.
                                tile_m = tile_co
                                tile_n = tile_ci * tile_kh * tile_kw
                                l0c_size = data_len * tile_n * tile_m
                                if l0c_size > l0c_max_size:
                                    continue
                                k_max = tile_batch * tile_h * tile_w
                                k_max_ = ((k_max - 1) // 16 + 1) * 16
                                k_size1 = l0a_max_size // data_len // tile_m
                                k_size1_ = k_size1 // 16 * 16
                                k_size2 = l0b_max_size // data_len // tile_n
                                k_size2_ = k_size2 // 16 * 16
                                for tile_k in range(min(k_max_, k_size1_, k_size2_), 15, -16):
                                    config_space.add(ConvBackpropFilterConfig(tile_ci, tile_kh, tile_kw, tile_co,
                                                                              tile_batch, tile_h, tile_w, tile_m,
                                                                              tile_k, tile_n))
                            else:
                                # Whole-tensor tiling: sweep N, K and M.
                                for tile_n in range(tile_ci * tile_kh * tile_kw, 15, -16):
                                    k_max = tile_batch * tile_h * tile_w
                                    k_max_ = ((k_max - 1) // 16 + 1) * 16
                                    k_size = l0b_max_size // data_len // tile_n
                                    k_size_ = k_size // 16 * 16
                                    for tile_k in range(min(k_max_, k_size_), 15, -16):
                                        m_max = tile_co
                                        m_max_ = ((m_max - 1) // 16 + 1) * 16
                                        m_size1 = l0a_max_size // data_len // tile_k
                                        m_size1_ = m_size1 // 16 * 16
                                        m_size2 = l0c_max_size // data_len // tile_n
                                        m_size2_ = m_size2 // 16 * 16
                                        for tile_m in range(min(m_max_, m_size1_, m_size2_), 15, -16):
                                            config_space.add(ConvBackpropFilterConfig(tile_ci, tile_kh, tile_kw,
                                                                                      tile_co, tile_batch, tile_h,
                                                                                      tile_w, tile_m, tile_k, tile_n))
    return None, config_space, op_desc.__str__(), None, None
def _get_space_matmul_cube(op_desc: MatmulCubeDesc):
    """Get the config space of matmul_cube.

    Enumerates (n_l1, n_l0, m_l1, m_l0, k_l1, k_l0, bypass) tilings that fit
    the on-chip buffers and adds each one as a ``MatmulCubeConfig``.

    Returns: (None, config_space, str(converted-shape tuple), None, None).
    Raises: TypeError if op_desc is not a MatmulCubeDesc.
    """
    if not isinstance(op_desc, MatmulCubeDesc):
        raise TypeError('op_desc must be MatmulCubeDesc')
    config_space = ListConfigSpace(MatmulCubeConfig)
    batch_tuple, m, k, n = matmul_run.extract_dim(
        op_desc.x_shape, op_desc.y_shape, op_desc.adj_x, op_desc.adj_y)
    # Dimension sizes in units of 16-element cube blocks (rounded up).
    mmax = (m + 15) // 16
    nmax = (n + 15) // 16
    kmax = (k + 15) // 16
    double_buffer = True
    mad_fp32 = True
    l1_max_size = (1024 * 1024)  # L1 MEM 1024KB
    l0a_max_size = (64 * 1024)  # L0A MEM 64KB
    l0b_max_size = (64 * 1024)  # L0B MEM 64KB
    l0c_max_size = (256 * 1024)  # L0C MEM 256KB
    # UB MEM 248KB, 8KB reserved for compiler
    ub_max_size = ((256 - 8) * 1024)
    if double_buffer:
        # Double buffering halves the usable capacity of every buffer.
        l1_max_size = l1_max_size // 2
        l0a_max_size = l0a_max_size // 2
        l0b_max_size = l0b_max_size // 2
        l0c_max_size = l0c_max_size // 2
        ub_max_size = ub_max_size // 2
    if mad_fp32:
        # fp32 accumulation doubles each element's footprint in L0C.
        l0c_max_size = l0c_max_size // 2
    if op_desc.out_dtype == 'float32':
        ub_max_size = ub_max_size // 2
    bypass_options = [0, 1, 2]
    for bypass in bypass_options:
        # Skip bypass modes incompatible with the operand layout/transpose.
        if (bypass == 2) and ((op_desc.adj_x == False and op_desc.left_format[0].lower() == 'n') or
                              (op_desc.adj_x == True and op_desc.left_format[0].lower() == 'z')):
            continue
        if (bypass == 1) and ((op_desc.adj_y == False and op_desc.right_format[0].lower() == 'z') or
                              (op_desc.adj_y == True and op_desc.right_format[0].lower() == 'n')):
            continue
        for k_l1 in range(1, kmax + 1):
            if kmax % k_l1 != 0:
                continue
            for k_l0 in range(1, k_l1 + 1):
                if k_l1 % k_l0 != 0:
                    continue
                # no need to cut from l1 to l0 for m and n when k is cut
                for m_l1 in range(1, mmax + 1):
                    if mmax % m_l1 != 0:
                        continue
                    m_l0_range = [m_l1] if k_l1 != kmax else range(1, m_l1 + 1)
                    for m_l0 in m_l0_range:
                        if m_l1 % m_l0 != 0:
                            continue
                        for n_l1 in range(1, nmax + 1):
                            if nmax % n_l1 != 0:
                                continue
                            n_l0_range = [n_l1] if k_l1 != kmax else range(
                                1, n_l1 + 1)
                            for n_l0 in n_l0_range:
                                if n_l1 % n_l0 != 0:
                                    continue
                                # Reject tiles that overflow any on-chip buffer.
                                if m_l0 * 16 * k_l0 * 16 > l0a_max_size:
                                    continue
                                if n_l0 * 16 * k_l0 * 16 > l0b_max_size:
                                    continue
                                if m_l0 * 16 * n_l0 * 16 > l0c_max_size:
                                    continue
                                if m_l0 * 16 * n_l0 * 16 > ub_max_size:
                                    continue
                                # Bypassed operand is not staged through L1.
                                if bypass == 2:
                                    l1_size = n_l1 * 16 * k_l1 * 16
                                elif bypass == 1:
                                    l1_size = m_l1 * 16 * k_l1 * 16
                                else:
                                    l1_size = (m_l1 * 16 + n_l1 *
                                               16) * k_l1 * 16
                                if l1_size > l1_max_size:
                                    continue
                                # NOTE(review): dims that fit in one block are
                                # rewritten to sentinel values (0, or 16 for k)
                                # before being recorded; this overwrites the
                                # loop variables, which is harmless only because
                                # those loops then have a single iteration --
                                # confirm this is intentional.
                                if nmax == 1:
                                    n_l1 = 0
                                    n_l0 = 0
                                if mmax == 1:
                                    m_l1 = 0
                                    m_l0 = 0
                                if kmax == 1:
                                    k_l1 = 16
                                    k_l0 = 16
                                config_space.add(MatmulCubeConfig(
                                    n_l1, n_l0, m_l1, m_l0, k_l1, k_l0, bypass))
    shape_xx, shape_yy, _, _, k = matmul_run.get_converted_shapes(m, n, k, batch_tuple, op_desc.adj_x, op_desc.adj_y,
                                                                  op_desc.bias, op_desc.left_format,
                                                                  op_desc.right_format, op_desc.out_format)
    return None, config_space, str((shape_xx, shape_yy, op_desc.bias, op_desc.left_format, op_desc.right_format,
                                    op_desc.out_format, op_desc.adj_x, op_desc.adj_y, op_desc.dtype,
                                    op_desc.out_dtype)), None, None
| def _get_space_batch_matmul_gpu(op_type: str, op_desc, tuning_attrs=[], tuning_attrs_info=None): | |||||
| """get config space of batch_matmul operator in gpu""" | |||||
| return | |||||
def get_range_block(space_res):
    """Build the candidate GPU block ranges (x, y, z) from the space result.

    Reads the block range/mod tables from ``space_res`` and returns three
    ``range`` objects; empty y/z ranges fall back to the single value 1.
    """
    bounds = space_res.gpu_block_range_table.asnumpy().tolist()
    mods = space_res.gpu_block_mod_table.asnumpy().tolist()

    def axis_range(i):
        # Inclusive upper bound, stepped by the axis modulus.
        return range(bounds[i][0], bounds[i][1] + 1, mods[i][0])

    x_range = axis_range(0)
    y_range = axis_range(1)
    z_range = axis_range(2)
    if not y_range:
        y_range = range(1, 2)
    if not z_range:
        z_range = range(1, 2)
    return x_range, y_range, z_range
def get_range_thread(space_res):
    """Build the candidate GPU thread ranges (x, y, z) from the space result.

    Reads the thread range/mod tables from ``space_res`` and returns three
    ``range`` objects; empty y/z ranges fall back to the single value 1.
    """
    bounds = space_res.gpu_thread_range_table.asnumpy().tolist()
    mods = space_res.gpu_thread_mod_table.asnumpy().tolist()

    def axis_range(i):
        # Inclusive upper bound, stepped by the axis modulus.
        return range(bounds[i][0], bounds[i][1] + 1, mods[i][0])

    x_range = axis_range(0)
    y_range = axis_range(1)
    z_range = axis_range(2)
    if not y_range:
        y_range = range(1, 2)
    if not z_range:
        z_range = range(1, 2)
    return x_range, y_range, z_range
def get_space_with_block_thread(tiling_spaces, space_res, policy=GpuSpacePolicy.FULL):
    """Extend every tiling space with GPU block/thread dimensions.

    Each entry of ``tiling_spaces`` (a list of ints) is copied and extended
    with [bx, by, bz, tx, ty, tz] candidates chosen according to ``policy``.
    Returns the list of extended spaces.
    Raises: ValueError for an unknown policy.
    """
    total_shape = max([max(v) for v in tiling_spaces])
    new_spaces = []
    block_x_range, block_y_range, block_z_range = get_range_block(space_res)
    thread_x_range, thread_y_range, thread_z_range = get_range_thread(space_res)
    pbar = tqdm(total=len(tiling_spaces))
    # Hardware cap on threads per block (tx * ty * tz).
    max_thread = 1024
    for space in tiling_spaces:
        pbar.set_description("Adding block, thread to spaces")
        if policy == GpuSpacePolicy.REDUCE_ALL:
            # Only the block-x count that exactly covers total_shape for this
            # tile size is tried (a one-element range).
            for bx in range((total_shape-1)//space[0]+1, (total_shape-1)//space[0]+2):
                for by in block_y_range:
                    for bz in block_z_range:
                        for tx in thread_x_range:
                            for ty in thread_y_range:
                                for tz in thread_z_range:
                                    if tx * ty * tz > max_thread:
                                        continue
                                    tmp_space = space[:]
                                    tmp_space.append(bx)
                                    tmp_space.append(by)
                                    tmp_space.append(bz)
                                    tmp_space.append(tx)
                                    tmp_space.append(ty)
                                    tmp_space.append(tz)
                                    new_spaces.append(tmp_space)
        elif policy == GpuSpacePolicy.BMM:
            # Block counts are derived from the innermost tile extents and the
            # chosen thread counts rather than enumerated.
            for tx in thread_x_range:
                for ty in thread_y_range:
                    for tz in thread_z_range:
                        if tx * ty * tz > max_thread:
                            continue
                        tmp_space = space[:]
                        # Threads must not exceed the tile extents they cover.
                        if tx > tmp_space[-1] or (len(tmp_space) >= 2 and ty > tmp_space[-2]) or (len(tmp_space) >= 3 and tz > tmp_space[-3]):
                            continue
                        bx = max(1, tmp_space[-1] // tx)
                        by = max(1, tmp_space[-2] // ty) if len(tmp_space) >= 2 else 1
                        bz = max(1, tmp_space[-3] // tz) if len(tmp_space) >= 3 else 1
                        # Derived block counts must stay below the table's
                        # (exclusive) range stop.
                        if bx >= block_x_range.stop or by >= block_y_range.stop or bz >= block_z_range.stop:
                            continue
                        tmp_space.append(bx)
                        tmp_space.append(by)
                        tmp_space.append(bz)
                        tmp_space.append(tx)
                        tmp_space.append(ty)
                        tmp_space.append(tz)
                        new_spaces.append(tmp_space)
        elif policy == GpuSpacePolicy.FULL:
            # Full cross-product of all block and thread candidates.
            for bx in block_x_range:
                for by in block_y_range:
                    for bz in block_z_range:
                        for tx in thread_x_range:
                            for ty in thread_y_range:
                                for tz in thread_z_range:
                                    tmp_space = space[:]
                                    tmp_space.append(bx)
                                    tmp_space.append(by)
                                    tmp_space.append(bz)
                                    tmp_space.append(tx)
                                    tmp_space.append(ty)
                                    tmp_space.append(tz)
                                    new_spaces.append(tmp_space)
        else:
            raise ValueError("Policy {} is not defined.".format(policy))
        pbar.update(1)
    print("total spaces size is: ",len(new_spaces))
    return new_spaces
| def _get_space_conv_image2col_gemm_gpu(op_type: str, op_desc, tuning_attrs=[], tuning_attrs_info=None): | |||||
| """get config space of conv_image2col_gemm operators in gpu""" | |||||
| return | |||||
# Dispatch table: op type name -> space-building function.
# Op types missing here fall back to the generic vector space in get_space().
_get_space_func = {
    'conv': _get_space_conv,
    'conv_bn1': _get_space_conv_bn1,
    'conv_backprop_input': _get_space_conv_backprop_input,
    'conv_backprop_filter': _get_space_conv_backprop_filter,
    'matmul': _get_space_matmul_cube,
    "reduce_sum_gpu": _get_space_reduce_gpu_manually,
    "batch_matmul_gpu": _get_space_batch_matmul_gpu,
    "conv_image2col_gemm_gpu": _get_space_conv_image2col_gemm_gpu,
}
def get_space(op_type: str, op_desc: NamedTuple, tuning_attrs=None, tuning_attrs_info=None):
    """Get the tuning space of an operator.

    Parameters
    ----------
    op_type: str
        operator name; selects the space-building function
    op_desc: NamedTuple
        operator definition parameters
    tuning_attrs: list, optional
        extra tuning attributes forwarded to GPU space builders
    tuning_attrs_info: optional
        extra tuning attribute info forwarded to GPU space builders

    Returns
    -------
    Whatever the matching ``_get_space_*`` function returns.
    """
    # Fix: the previous `tuning_attrs=[]` default shared one mutable list
    # across calls; default to None and create a fresh list per call.
    if tuning_attrs is None:
        tuning_attrs = []
    func = _get_space_func.get(op_type, None)
    if func is None:
        # Unknown op types fall back to the generic vector-op space builder.
        func = partial(_get_space_vector, op_type=op_type)
    if "gpu" in op_type:
        return func(op_type=op_type, op_desc=op_desc, tuning_attrs=tuning_attrs, tuning_attrs_info=tuning_attrs_info)
    return func(op_desc=op_desc)
| @@ -0,0 +1,147 @@ | |||||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| """Generating test data for operators""" | |||||
| from typing import NamedTuple | |||||
| import numpy as np | |||||
| from gen_json_data import gen_json_data | |||||
| from test_run import batchmatmul_run, conv_run, conv_backprop_input_run, conv_backprop_filter_run, matmul_run | |||||
| from .type_definitions import ConvDesc, ConvBackpropDesc, MatmulCubeDesc | |||||
def _gen_data_json(op_desc):
    """Generating test data for composite json"""
    inputs, expect, _ = gen_json_data(op_desc)
    return inputs, expect
def _gen_data_conv(op_desc: ConvDesc):
    """Generating test data for conv"""
    fmap, weight, bias, expect = conv_run.gen_data(op_desc.fmap_shape, op_desc.filter_shape,
                                                   op_desc.pad, op_desc.stride, op_desc.dilation,
                                                   op_desc.use_bias)
    # Zero-initialized fp16 output buffer; the kernel under test fills it.
    output = np.full(expect.shape, 0, 'float16')
    if op_desc.use_bias:
        args = (fmap, weight, bias, output)
    else:
        args = (fmap, weight, output)
    return args, expect
def _gen_data_conv_bn1(op_desc: ConvDesc):
    """Generating test data for conv_bn1"""
    fmap, weight, bias, conv_expect = conv_run.gen_data(op_desc.fmap_shape, op_desc.filter_shape,
                                                        op_desc.pad, op_desc.stride, op_desc.dilation,
                                                        op_desc.use_bias)
    # Reduce over N, H, W to get per-channel statistics.
    reduce_axes = (0, 2, 3)
    conv_mean = np.mean(conv_expect, axis=reduce_axes, keepdims=True)
    conv_var_part = np.mean(np.power(conv_expect, 2), axis=reduce_axes, keepdims=True)
    expects = (conv_expect, conv_var_part, conv_mean)
    out_bufs = [np.full(e.shape, 0, 'float16') for e in expects]
    # The var_part / mean outputs are accumulated in fp32.
    out_bufs[1] = out_bufs[1].astype(np.float32)
    out_bufs[2] = out_bufs[2].astype(np.float32)
    if op_desc.use_bias:
        inputs = [fmap, weight, bias]
    else:
        inputs = [fmap, weight]
    args = tuple(inputs + out_bufs)
    # The last three args are the output buffers.
    return {"args": args, 'outputs': (-3, -2, -1)}, expects
def _gen_data_conv_backprop_input(op_desc: ConvBackpropDesc):
    """Generating test data for conv_backprop_input"""
    dout, weight, dx = conv_backprop_input_run.gen_data(op_desc.fmap_shape, op_desc.filter_shape, op_desc.pad,
                                                        op_desc.stride, op_desc.dilation)
    # Zero-initialized fp16 output buffer; the kernel under test fills it.
    dx_buf = np.full(dx.shape, 0, 'float16')
    return (dout, weight, dx_buf), dx
def _gen_data_conv_backprop_filter(op_desc: ConvBackpropDesc):
    """Generating test data for conv_backprop_filter"""
    block_size = 16
    batch, c_in, height, width = op_desc.fmap_shape
    c_out, _, kernel_h, kernel_w = op_desc.filter_shape
    # Round channel counts up to a multiple of the cube block size.
    c_in = (c_in + block_size - 1) // block_size * block_size
    c_out = (c_out + block_size - 1) // block_size * block_size
    dy_data, dx_data, expect = conv_backprop_filter_run.gen_data((batch, c_in, height, width),
                                                                 (c_out, c_in, kernel_h, kernel_w),
                                                                 op_desc.pad, op_desc.stride, op_desc.dilation)
    # Zero-initialized fp32 output buffer; the kernel under test fills it.
    out_buf = np.full(expect.shape, 0, 'float32')
    return (dy_data, dx_data, out_buf), expect
def _gen_data_matmul_cube(op_desc: MatmulCubeDesc):
    """Generating test data for matmul_cube"""
    batch_tuple, m, k, n = matmul_run.extract_dim(op_desc.x_shape, op_desc.y_shape, op_desc.adj_x, op_desc.adj_y)
    # Round every dimension up to a multiple of the 16-element cube block.
    m, n, k = ((dim + 15) // 16 * 16 for dim in (m, n, k))
    _, _, _, out_shape, k = matmul_run.get_converted_shapes(m, n, k, batch_tuple, op_desc.adj_x, op_desc.adj_y,
                                                            op_desc.bias, op_desc.left_format, op_desc.right_format,
                                                            op_desc.out_format)
    m_x, m_y, bench_mark, bias_data = matmul_run.matmul_data(batch_tuple, m, k, n, op_desc.dtype, op_desc.bias_dtype,
                                                             op_desc.out_dtype, op_desc.bias, op_desc.adj_x,
                                                             op_desc.adj_y, op_desc.left_format,
                                                             op_desc.right_format, op_desc.out_format)
    # NaN-filled output buffer makes unwritten elements stand out.
    out_data = np.full(out_shape, np.nan, op_desc.out_dtype)
    if op_desc.bias:
        args = (m_x, m_y, bias_data, out_data)
    else:
        args = (m_x, m_y, out_data)
    return args, bench_mark
# Dispatch table: op type name -> test-data generator for that operator.
_gen_data_func = {
    'json': _gen_data_json,
    'conv': _gen_data_conv,
    'conv_bn1': _gen_data_conv_bn1,
    'conv_backprop_input': _gen_data_conv_backprop_input,
    'conv_backprop_filter': _gen_data_conv_backprop_filter,
    'matmul': _gen_data_matmul_cube,
}
def gen_data(op_type: str, op_desc: NamedTuple):
    """Generate test data for an operator.

    Parameters
    ----------
    op_type: str
        operator name (key into the generator dispatch table)
    op_desc: NamedTuple
        operator definition parameters

    Raises
    ------
    ValueError
        if no test-data generator is registered for ``op_type``
    """
    gen_func = _gen_data_func.get(op_type, None)
    if gen_func is None:
        raise ValueError('Unsupported op type for test data generating: %s' % op_type)
    return gen_func(op_desc)
| @@ -0,0 +1,84 @@ | |||||
| from akg.utils import custom_tiling as ct_util | |||||
def reduce_gpu_tiling_strategy(in_shape, reduce_axis):
    """Custom tiling strategy for reduce op in gpu.

    Parameters
    ----------
    in_shape: sequence
        shape of the input tensor
    reduce_axis: sequence or None
        axes being reduced; None (or all axes) means an all-reduce

    Returns
    -------
    list of tiling constraints built with ``ct_util``.
    """
    strategy = list()
    # Fix: `reduce_axis == None` replaced with the idiomatic `is None`.
    if reduce_axis is None or len(reduce_axis) == len(in_shape):
        # All-reduce: tile axis 0 in multiples of 32 and constrain threads.
        strategy.append(
            ct_util.create_constraint_on_axis(
                values=32, constraints=ct_util.TileConstraint.MOD, band=0, axis=0
            )[0]
        )
        strategy.append(
            ct_util.modify_common_constraints(
                value=[32, 1, 1], constraint=ct_util.TileConstraint.THREAD_MOD
            )
        )
        strategy.append(
            ct_util.modify_common_constraints(
                value=[1024, 1, 1], constraint=ct_util.TileConstraint.THREAD_MAX
            )
        )
        strategy.append(
            ct_util.modify_common_constraints(
                value=[32, 1, 1], constraint=ct_util.TileConstraint.THREAD_MIN
            )
        )
    else:
        # Reduce-X (innermost axis reduced) and Reduce-Y previously had two
        # byte-identical branches; they share one dummy strategy for the
        # hand-written space.
        strategy.append(
            ct_util.create_constraint_on_axis(
                values=1, constraints=ct_util.TileConstraint.MAX, band=0, axis=0
            )[0]
        )
        strategy.append(
            ct_util.create_constraint_on_axis(
                values=1, constraints=ct_util.TileConstraint.MAX, band=0, axis=1
            )[0]
        )
        strategy.append(
            ct_util.modify_common_constraints(
                value=[1, 1, 1], constraint=ct_util.TileConstraint.THREAD_MAX
            )
        )
        strategy.append(
            ct_util.modify_common_constraints(
                value=[1, 1, 1], constraint=ct_util.TileConstraint.BLOCK_MAX
            )
        )
    return strategy
def conv_dummy_strategy():
    """Conv strategy: dummy strategy"""
    # Placeholder implementation; yields None.
    return None
def batch_matmul_gpu_tiling_strategy(desc):
    """Custom tiling strategy for batch matmul in gpu with or without tensor core"""
    # Placeholder implementation; yields None.
    return None
| @@ -0,0 +1,359 @@ | |||||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| """Tuner for finding best config for operators""" | |||||
| import logging | |||||
| import time | |||||
| import json | |||||
| import os | |||||
| import numpy as np | |||||
| from multiprocessing import Process | |||||
| from tvm.autotvm.tuner.xgboost_cost_model import XgbCostModel | |||||
| from tvm.autotvm.tuner.sa_model_optimizer import SimulatedAnnealingOptimizer | |||||
| from .space import ConfigSpace | |||||
| from .runner import KernelRunner | |||||
| from tqdm import tqdm | |||||
# Module-level logger for the autotuning tuner.
logger = logging.getLogger('fuzz.tune.autotuning.tuner')
class Tuner:
    """Basic tuner class.

    Parameters
    ----------
    runner: KernelRunner
        Runs kernels on the physical device.
    index_table: list
        Index table stored on the tuner; its use is outside this view --
        confirm against subclasses/callers.
    config_space: ConfigSpace
        The space of configs.
    n_parallel: int
        How many kernels are processed in a turn.
    skip_config_set: set, optional
        Configs to skip during tuning -- presumably already measured;
        confirm against callers.
    """
    def __init__(self, runner: KernelRunner, index_table: list, config_space: ConfigSpace, n_parallel: int = 1, skip_config_set=None):
        self._runner = runner
        self._index_table = index_table
        self._space = config_space
        self._n_parallel = n_parallel
        # trial plan: planned config indices and the read pointer into them
        self._trials = []
        self._trial_pt = 0
        self._visited = set()
        # observed samples (inputs and measured results)
        self._xs = []
        self._ys = []
        # keep the current best; times start at +inf so any measurement wins
        self._best_config = None  # type: ConfigEntity
        self._best_time = np.inf
        self._best_iter = 0
        self._tuning_time = 0.0
        self._original_time = np.inf
        self._skip_config_set = skip_config_set
    @property
    def best_config(self):
        """Best config found so far (None until one is recorded)."""
        return self._best_config

    @property
    def best_time(self):
        """Time of the best config; np.inf before any measurement."""
        return self._best_time

    @property
    def best_iter(self):
        """Iteration at which the best config was recorded (0 initially)."""
        return self._best_iter

    @property
    def tuning_time(self):
        """Accumulated tuning time in seconds."""
        return self._tuning_time

    @property
    def original_time(self):
        """Baseline time before tuning; np.inf until set."""
        return self._original_time

    @property
    def xs(self):
        """Observed sample inputs."""
        return self._xs

    @property
    def ys(self):
        """Observed sample results."""
        return self._ys
| def info(self): | |||||
| print('space size:', self._space.length) | |||||
| print('best config:', self._best_config) | |||||
| print('best time:', self._best_time) | |||||
| print('best_iter:', self._best_iter) | |||||
| print('tuning time:', self._tuning_time, 'secs') | |||||
    def next_batch(self, batch_size: int, is_add_visited=True):
        """extract next batch with xgboost model

        Returns up to ``batch_size`` configs. With ``is_add_visited=False``
        the first configs of the space are returned without being recorded;
        otherwise planned trial indices are preferred and each pick is added
        to the visited set.
        """
        ret = []
        counter = 0
        if not is_add_visited:
            # Preview mode: return the leading configs without marking them.
            return [self._space.get(index) for index in range(min(batch_size, self._space.length))]
        while counter < batch_size and self._space.has_next():
            index = 0
            # Advance the trial pointer to the next unvisited planned index.
            while self._trial_pt < len(self._trials):
                index = self._trials[self._trial_pt]
                if index not in self._visited:
                    break
                self._trial_pt += 1
            if self._trial_pt >= len(self._trials):
                # if the trial list is empty choose randomly
                index = self._space.fetch_index()
            ret.append(self._space.get(index))
            self._visited.add(index)
            counter += 1
        return ret
| def next_config(self, batch_size: int): | |||||
| """extract next config orderly""" | |||||
| ret = [] | |||||
| counter = 0 | |||||
| while counter < batch_size and self._space.has_next(): | |||||
| index = self._space.fetch_next_index() | |||||
| ret.append(self._space.get(index)) | |||||
| self._visited.add(index) | |||||
| counter += 1 | |||||
| return ret | |||||
| def export_configs(self, configs: list, output_file: str, append: bool = True, desc=""): | |||||
| """export configs""" | |||||
| mode = "a" if append else "w" | |||||
| with open(output_file, mode) as f: | |||||
| for x, y in configs: | |||||
| if y != -1: | |||||
| f.write("{} | {} | {}\n".format(desc, json.dumps(x._asdict()), y)) | |||||
| def export_dim_configs(self, configs, output_file: str, append: bool = True, key=""): | |||||
| """export dim configs""" | |||||
| mode = "a" if append else "w" | |||||
| data = {} | |||||
| try: | |||||
| if os.path.isfile(output_file): | |||||
| with open(output_file, 'r') as f: | |||||
| data = json.load(f) | |||||
| except IOError as e: | |||||
| logger.debug("get dim info from [%s] failed: %s", output_file, str(e)) | |||||
| with open(output_file, mode) as f: | |||||
| import re | |||||
| data[key] = configs | |||||
| s = json.dumps(data, sort_keys=True) | |||||
| s = re.sub(r',\s*"', ',\n"', s) | |||||
| s = '{\n' + s[1:-1] + '\n}' | |||||
| f.write(s) | |||||
| def export_dim_configs_for_keys(self, configs, output_file: str, append: bool = True, keys=[]): | |||||
| """export dim configs""" | |||||
| mode = "a" if append else "w" | |||||
| data = {} | |||||
| try: | |||||
| if os.path.isfile(output_file): | |||||
| with open(output_file, 'r') as f: | |||||
| data = json.load(f) | |||||
| except IOError as e: | |||||
| logger.debug("get dim info from [%s] failed: %s", output_file, str(e)) | |||||
| with open(output_file, mode) as f: | |||||
| import copy | |||||
| tmp = copy.deepcopy(configs) | |||||
| for key in reversed(keys): | |||||
| info = {key: tmp} | |||||
| tmp = copy.deepcopy(info) | |||||
| data.update(info) | |||||
| s = json.dumps(data, sort_keys=True, indent=4) | |||||
| print(s) | |||||
| f.write(s) | |||||
| def load_configs(self, input_file: str): | |||||
| """load configs""" | |||||
| configs = [] | |||||
| file_path = os.path.realpath(input_file) | |||||
| if os.path.isfile(file_path): | |||||
| with open(file_path, "r") as f: | |||||
| for line in f: | |||||
| x, y, _ = line.split('|') | |||||
| configs.append((self._space.input_type(**json.loads(x)), np.float64(y))) | |||||
| return configs | |||||
    def tune(self, least_try_times: int, output_file: str = None):
        """grid search all configs

        Measures configs in space order until least_try_times configs have
        been tried or the space is exhausted, tracking the fastest config.

        NOTE(review): if the while body never executes (space already
        exhausted), `run_times` is unbound and the final return raises
        NameError — confirm callers never hit that path.
        """
        i = 0
        pbar = tqdm(total=least_try_times)
        while i < least_try_times:
            if not self._space.has_next():
                break
            configs = self.next_config(min(self._n_parallel, least_try_times - i))
            run_times = self._runner.run(configs, self._best_time)
            results = []
            for idx, conf in enumerate(configs):
                results.append((conf.input_id, run_times[idx]))
                # keep best config
                if self.best_time > run_times[idx]:
                    self._best_time = run_times[idx]
                    self._best_iter = i + idx
                    self._best_config = conf
            i += len(results)
            pbar.update(len(results))
            # update measured samples (inputs/labels for a cost model)
            for res in results:
                self._xs.append(res[0])
                self._ys.append(res[1])
            if output_file:
                configs = [(self._space.get(res[0]).input, res[1]) for res in results]
                self.export_configs(configs, output_file)
        return run_times
class ModelBasedTuner(Tuner):
    """Model based tuner

    This tuner will fit a cost model and use an optimizer to find the maximums
    of the cost model as next trials

    Parameters
    ----------
    plan_size: int
        Tuner will re-fit model per `plan_size` new measure samples
    pre_model: CostModel
        The cost model that predicts the speed of a config (IR)
    """
    def __init__(self, runner, index_table, config_space, n_parallel=1, plan_size=32, pre_model=None):
        super(ModelBasedTuner, self).__init__(runner, index_table, config_space, n_parallel)
        self.__plan_size = plan_size
        if pre_model is not None:
            # Reuse a pre-trained cost model, rebinding it to this space.
            self.__cost_model = pre_model
            self.__cost_model.reset_space(self._space)
        else:
            self.__cost_model = XgbCostModel(self._space)
        self.__model_optimizer = SimulatedAnnealingOptimizer(self._space)
        self.__train_ct = 0
        self.__is_auto_set_dim = False  # NOTE(review): was toggled from True; auto dim baseline disabled
        # time to leave
        self.__ttl = None
        self.__least_try_times = None
        self.__early_stopping = None
        self.__model_run_time = 0.0
    def info(self):
        """Print the base summary plus total model-fitting time."""
        super(ModelBasedTuner, self).info()
        print('model run time:', self.__model_run_time, 'secs')
    def model_res(self):
        """Fit the cost model on measured samples and refresh the trial list."""
        self.__cost_model.fit(self._xs, self._ys, self.__plan_size)
        best_configs = self.__model_optimizer.find_best(
            self.__cost_model, self.__plan_size, self._visited)
        self._trials = best_configs
    def tune(self, least_try_times: int, output_file: str = None):
        """Run model-guided tuning for up to ~3x least_try_times measurements.

        NOTE(review): several oddities worth confirming:
        - model_res() runs in a child Process; its mutations of self._trials
          and the cost model do not propagate back to this parent process, so
          the model guidance may be ineffective as written.
        - old_level is read AFTER logger.setLevel(logging.DEBUG), so the
          "restore" at the loop tail keeps DEBUG permanently.
        - error_ct is never incremented, so the >150 branch is unreachable.
        - the 7200-second wall-clock cutoff is hard-coded.
        """
        early_stopping = least_try_times
        self.__least_try_times = least_try_times
        self.__early_stopping = early_stopping
        logger.setLevel(logging.DEBUG)
        old_level = logger.level
        i = 0
        error_ct = 0
        tuning_start = time.time()
        # Keep going past least_try_times (up to 3x) while no config has beaten
        # the baseline time by at least 0.9.
        while (i < self._space.length and (i < least_try_times
                                           or (self._best_time > self._original_time - 0.9
                                               and i < least_try_times * 3))):
            if not self._space.has_next():
                break
            iter_start = time.time()
            if not self.__is_auto_set_dim:
                configs = self.next_batch(min(self._n_parallel, self._space.length - i))
            else:
                # Baseline pass: take configs without marking them visited.
                configs = self.next_batch(min(self._n_parallel, self._space.length - i), False)
            logger.debug('--indexes: %s', str([x.input_id for x in configs]))
            run_times = self._runner.run(configs, self._best_time, self.__is_auto_set_dim)
            if self.__is_auto_set_dim:
                # First pass only: record the average as the baseline, then restart.
                from operator import add
                from functools import reduce
                self._original_time = reduce(add, run_times) / len(run_times)
                self._best_time = self._original_time
                self._best_iter = -1
                self._best_config = None
                run_times = None
                self.__is_auto_set_dim = False
                continue
            results = []
            for idx, conf in enumerate(configs):
                if run_times[idx] == -1:
                    # -1 marks a failed measurement; drop it.
                    continue
                results.append((conf.input_id, run_times[idx]))
                # keep best config
                if self._best_time > run_times[idx]:
                    self._best_time = run_times[idx]
                    self._best_iter = i + idx
                    self._best_config = conf
            i += len(results)
            # Remaining budget before early stopping would trigger.
            self.__ttl = min(early_stopping + self.best_iter, self._space.length) - i
            start = time.time()
            # update measured samples
            for res in results:
                self._xs.append(res[0])
                self._ys.append(res[1])
            if output_file:
                configs = [(self._space.get(res[0]).input, res[1]) for res in results]
                desc = str(self._runner.op_desc)
                self.export_configs(configs, output_file, desc=desc)
            # if we have enough new training samples
            if len(self._xs) >= self.__plan_size * (self.__train_ct + 1):
                # NOTE(review): child-process mutations are lost — see docstring.
                p = Process(target=self.model_res)
                p.start()
                p.join()
                self._trial_pt = 0
                self.__train_ct += 1
            end = time.time()
            logger.debug('model running time: %f seconds', end - start)
            self.__model_run_time += end - start
            iter_end = time.time()
            logger.debug('iter time: %f seconds', iter_end - iter_start)
            if self._best_iter > 0 and i >= self.best_iter + early_stopping:
                logger.debug('Early stopped. Best iter: %d', self._best_iter)
                return
            print("tuning time already, ", time.time() - tuning_start)
            if time.time() - tuning_start > 7200:
                logger.debug('Early stopped because of too long time. Best iter: %d', self._best_iter)
                return
            if error_ct > 150:
                logging.warning('Too many errors happen in the tuning. Now is in debug mode')
                logger.setLevel(logging.DEBUG)
            else:
                logger.setLevel(old_level)
        self._tuning_time += time.time() - tuning_start
| @@ -0,0 +1,9 @@ | |||||
| { | |||||
| "enable_atomic_add": { | |||||
| "dtype": "bool", | |||||
| "options": [ | |||||
| "False", | |||||
| "True" | |||||
| ] | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,155 @@ | |||||
| from collections import namedtuple | |||||
| import os | |||||
| import logging | |||||
def get_block_str_from_config(config: namedtuple):
    """Join the block_x/block_y/block_z fields present on `config` into a
    space-terminated string (e.g. "4 2 "); empty string when none exist."""
    fields = getattr(config, "_fields")
    pieces = [str(getattr(config, name))
              for name in ("block_x", "block_y", "block_z") if name in fields]
    return "".join(piece + " " for piece in pieces)
def get_thread_str_from_config(config: namedtuple):
    """Join the thread_x/thread_y/thread_z fields present on `config` into a
    space-terminated string (e.g. "32 8 "); empty string when none exist."""
    fields = getattr(config, "_fields")
    pieces = [str(getattr(config, name))
              for name in ("thread_x", "thread_y", "thread_z") if name in fields]
    return "".join(piece + " " for piece in pieces)
def get_parallel_build_num():
    """get the num of parallel build

    Reads BUILD_PARALLEL_NUM from the environment. Returns 1 when the
    variable is unset, empty, or not a valid integer.
    """
    value = os.environ.get('BUILD_PARALLEL_NUM')
    if not value:
        return 1
    try:
        # BUG FIX: int() raises ValueError on bad input, not NameError, so the
        # original except clause could never fire; the .lower() call was a no-op.
        return int(value)
    except ValueError as e:
        logging.error(e)
        return 1
def get_available_gpu_num():
    """Get the list of GPU device ids to use.

    Reads the comma-separated USE_GPU_DEVICES environment variable. Despite
    the function name, callers receive a list of device ids, not a count.
    Returns [0] when the variable is unset or malformed.
    """
    value = os.environ.get('USE_GPU_DEVICES')
    if not value:
        return [0, ]
    try:
        # BUG FIX: parse errors raise ValueError (not NameError), and the
        # fallback must be a list like the success path, not the int 1.
        return [int(dev) for dev in value.split(",")]
    except ValueError as e:
        logging.error(e)
        return [0, ]
def get_real_attr(value, key, need_tune_json, need_tune_keys):
    """Map a tuned option index back to its concrete attribute value.

    Args:
        value: option index chosen by the tuner, or a passthrough value when
            `key` is not tuned.
        key: attribute name.
        need_tune_json: {key: {"dtype": ..., "options": [...]}} description.
        need_tune_keys: the keys that are actually tuned.

    Returns:
        The concrete bool/str/int option, or `value` unchanged when `key` is
        not in need_tune_keys.

    Raises:
        TypeError: when an option does not match its declared dtype, or the
            dtype itself is unsupported (the original silently returned None).
    """
    if key not in need_tune_keys:
        return value
    desc = need_tune_json[key]
    dtype = desc['dtype']
    option = desc['options'][value]
    if dtype == "bool":
        # Boolean options are stored as the strings "True"/"False".
        if option.lower() == "true":
            return True
        if option.lower() == "false":
            return False
        raise TypeError("Wrong boolean type, please check json file")
    if dtype == "str":
        if isinstance(option, str):
            return option
        raise TypeError("Wrong str type, please check json file")
    if dtype == "int":
        if isinstance(option, int):
            return option
        raise TypeError("Wrong int type, please check json file")
    # BUG FIX: unknown dtypes used to fall off the end and return None.
    raise TypeError("Unsupported dtype '{}', please check json file".format(dtype))
def merge_attrs(attrs, config, need_tune_json):
    """Build a new attrs namedtuple merging a tuning `config` into `attrs`.

    Derives the 'dim', 'bind_block' and 'bind_thread' strings from the
    config's tiling/block/thread fields, copies each tuned attribute listed
    in need_tune_json, and rebuilds a namedtuple of the same type as `attrs`.

    NOTE(review): rebuilding with positional args relies on attrs._asdict()
    preserving the namedtuple field order (true on CPython 3.1+).
    """
    tiling = [getattr(config, name) for name in getattr(
        config, '_fields') if name.startswith('tiling')]
    dim_str = ''
    d_config = config._asdict()
    d_attrs = attrs._asdict()
    # 2-D tiling fields carry two underscores (e.g. tiling_0_1); detect from
    # the first tiling field only.
    is_2d_tiling = False
    for name in getattr(config, '_fields'):
        if name.startswith('tiling'):
            if name.count("_") == 2:
                is_2d_tiling = True
            break
    for i, element in enumerate(tiling):
        if is_2d_tiling:
            # Each pair of values shares one "0 <axis>" prefix.
            if i % 2 == 0:
                dim_str += "0 " + str(i//2) + " "
            dim_str += str(element) + " "
        else:
            # 1d tiling
            dim_str += "0 " + str(i) + " " + str(element) + " 1 "
    # add block, thread information
    block = [str(getattr(config, name)) for name in getattr(
        config, '_fields') if name.startswith('block')]
    bind_block_str = ' '.join(block)
    thread = [str(getattr(config, name)) for name in getattr(
        config, '_fields') if name.startswith('thread')]
    bind_thread_str = ' '.join(thread)
    d_attrs['dim'] = dim_str
    d_attrs['bind_block'] = bind_block_str
    d_attrs['bind_thread'] = bind_thread_str
    need_tune_keys = need_tune_json.keys()
    # Copy tuned attributes (still as option indexes) from the config.
    for key in need_tune_keys:
        d_attrs[key] = d_config[key]
    # make a new attrs with config info; get_real_attr resolves option indexes
    # to their concrete values, passing untuned attributes through unchanged.
    attrs_type = type(attrs)
    config_list = [get_real_attr(d_attrs[k],k,need_tune_json, need_tune_keys) for k in d_attrs]
    new_attrs = attrs_type(*config_list)
    return new_attrs
def get_skip_configs_from_log(skip_configs_log):
    """Collect already-tried config strings from a tuning log.

    Each log line has the form "desc | config | time"; the middle field is
    collected into a set. An empty path yields an empty set.
    """
    seen = set()
    if skip_configs_log != "":
        with open(skip_configs_log, 'r') as log:
            seen.update(str(entry.split("|")[1]).strip() for entry in log)
    print("SKIP CONFIGS NUMBER:", len(seen))
    return seen
def get_tuning_attrs_from_json(tuning_attrs_json):
    """Parse a tuning-attrs JSON description.

    Returns a 3-tuple (keys, spaces, raw_json): the attribute names, the
    cartesian product of option indexes (one inner list per combination),
    and the parsed JSON dict. An empty path yields ([], [[]], {}).
    """
    import json
    keys = []
    spaces = [[]]
    desc = dict()
    if tuning_attrs_json != "":
        with open(tuning_attrs_json, 'r') as f:
            desc = json.load(f)
        for key, entry in desc.items():
            keys.append(key)
            # Extend every partial combination with each option index of this key.
            expanded = []
            for partial in spaces:
                for option_idx in range(len(entry['options'])):
                    expanded.append(partial + [option_idx])
            spaces = expanded
    return (keys, spaces, desc)
if __name__ == "__main__":
    # Manual smoke test of the tuning-attrs JSON parser.
    file_name = "tuning_attrs_descs/reduce_tuning_attrs_desc.json"
    # BUG FIX: get_tuning_attrs_from_json returns a 3-tuple; the original
    # 2-name unpacking raised ValueError before anything was printed.
    keys, need_tune_spaces, json_string = get_tuning_attrs_from_json(file_name)
    print(keys)
    print(need_tune_spaces)
| @@ -0,0 +1,49 @@ | |||||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
"""operator description and config param definitions"""
from collections import namedtuple
# op desc for ascend
# Conv2d workload description (shapes plus pad/stride/dilation and bias flag).
ConvDesc = namedtuple("ConvDesc", [
    'fmap_shape', 'filter_shape', 'pad', 'stride', 'dilation', 'use_bias'])
# Conv2d backprop workload description (no bias).
ConvBackpropDesc = namedtuple(
    "ConvBackpropDesc", ['fmap_shape', 'filter_shape', 'pad', 'stride', 'dilation'])
# Cube matmul workload description: operand shapes/formats, transpose flags
# (adj_x/adj_y) and dtypes.
MatmulCubeDesc = namedtuple("MatmulCubeDesc", ["x_shape", "y_shape", "bias", "left_format", "right_format",
                                               "out_format", "adj_x", "adj_y", "dtype", "bias_dtype", "out_dtype"])
# op desc for gpu
# Reduce workload description plus the tunable GPU mapping attributes
# (dim / bind_block / bind_thread strings and reduce-lib switches).
ReduceGpuDesc = namedtuple("ReduceGpuDesc", [
    "in_shape", "in_dtype", "axis", "keepdims",
    "poly_sch", "dim", "bind_block", "bind_thread",
    "enable_akg_reduce_lib", "enable_atomic_add"])
# config param definitions for ascend
ConvConfig = namedtuple('ConvConfig', [
    'tile_h', 'tile_co', 'tile_m', 'tile_k', 'tile_n', 'tile_w', 'bypass'])
ConvBackpropInputConfig = namedtuple('ConvBackpropInputConfig',
                                     ['tile_h', 'tile_co', 'tile_m', 'tile_k', 'tile_n', 'tile_w'])
ConvBackpropFilterConfig = namedtuple('ConvBackpropFilterConfig',
                                      ['tile_ci', 'tile_kh', 'tile_kw', 'tile_co', 'tile_batch',
                                       'tile_h', 'tile_w', 'tile_m', 'tile_k', 'tile_n'])
MatmulCubeConfig = namedtuple(
    'MatmulCubeConfig', ['n_l1', 'n_l0', 'm_l1', 'm_l0', 'k_l1', 'k_l0', 'bypass'])
# config param definitions for gpu
# NOTE(review): the typename 'empty' differs from the variable name
# EmptyConfig; harmless, but repr() of instances will read "empty()".
EmptyConfig = namedtuple('empty', [])
| @@ -0,0 +1,16 @@ | |||||
# how many multi-processing workers to build with
export BUILD_PARALLEL_NUM=4
# set the default gpu devices, please never change it
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# set the real devices you want to use (subset of the visible devices above)
export USE_GPU_DEVICES=0,1,2,3
# NOTE(review): presumably selects the GPU runtime backend for the tuner — confirm
export RUNTIME_MODE=gpu
# NOTE(review): presumably enables kernel-timing collection via profiling — confirm
export PROFILING_MODE=true
# ascend config
export DEVICE_ID=0
export DEVICE_TOTAL_NUM=8
| @@ -0,0 +1,67 @@ | |||||
| # Copyright 2019-2021 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| """test""" | |||||
| import time | |||||
| from autotuning.job import launch | |||||
| from akg.utils import kernel_exec | |||||
| from akg.ops.math_gpu import reduce_sum | |||||
| from autotuning.type_definitions import ReduceGpuDesc | |||||
| import numpy as np | |||||
| import sys | |||||
| import argparse | |||||
| from autotuning.tuning_utils import get_skip_configs_from_log, get_tuning_attrs_from_json | |||||
def reduce_sum_gpu_execute(in_shape, dtype, axis=None, keepdims=False, attrs=False):
    """Build the reduce_sum GPU kernel module for the given shape/dtype.

    Args:
        in_shape: input tensor shape.
        dtype: input data type string (e.g. "float32").
        axis: reduction axis/axes passed through as op attrs.
        keepdims: whether reduced dims are kept.
        attrs: unused; kept for the tuner's common build-function signature.

    Returns:
        The built kernel module.
    """
    # BUG FIX: the original referenced the undefined names `utils` (the file
    # imports `kernel_exec`) and `in_dtype` (the parameter is `dtype`), so
    # every call raised NameError.
    mod = kernel_exec.op_build_test(reduce_sum, (in_shape, ), (dtype, ),
                                    kernel_name="reduce_sum_gpu", op_attrs=[axis, keepdims],
                                    attrs={"target": "cuda", "enable_akg_reduce_lib": True})
    return mod
def run_test_reduce_sum(in_shape, in_dtype, axis=None, keepdims=False, skip_config_set=None, tuning_attrs_info=None):
    """Launch an autotuning job for reduce_sum on GPU and report total time.

    Args:
        in_shape: input tensor shape.
        in_dtype: input data type string.
        axis: reduction axis/axes.
        keepdims: whether reduced dims are kept.
        skip_config_set: config strings to skip (from a previous log).
        tuning_attrs_info: (keys, spaces, json) tuple from
            get_tuning_attrs_from_json.
    """
    time_start = time.time()
    op_type_ = 'reduce_sum_gpu'
    debug_mode_ = True
    save_res_ = True
    all_space_ = True
    # Positional layout must match ReduceGpuDesc: the three "" are the
    # dim / bind_block / bind_thread strings, the three True are poly_sch,
    # enable_akg_reduce_lib and enable_atomic_add.
    op_config = [in_shape, in_dtype, axis, keepdims,
                 "", "", "",
                 True, True, True]
    op_config = ReduceGpuDesc(*op_config)
    # NOTE(review): tuning_attrs_info appears both inside desc_ and as a
    # separate launch() kwarg — confirm launch() expects this duplication.
    desc_ = ('reduce_sum_gpu', reduce_sum_gpu_execute,
             op_config, tuning_attrs_info)
    launch(op_type=op_type_, debug_mode=debug_mode_,
           save_res=save_res_, desc=desc_, all_space=all_space_,
           from_json=False, skip_config_set=skip_config_set,
           tuning_attrs_info=tuning_attrs_info)
    time_end = time.time()
    print("total tuning time: ", time_end - time_start)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--skip_configs_log", type=str,
                        default="", help="skip those configs in .log file")
    # BUG FIX: typo in the user-facing help text ("atttrs" -> "attrs").
    parser.add_argument("--tuning_attrs_json", type=str, default="",
                        help="the json file to describe the tuning attrs")
    args = parser.parse_args()
    # check whether we have configs that need to be skipped
    skip_config_set = get_skip_configs_from_log(args.skip_configs_log)
    # add tuning_attrs from json file
    tuning_attrs_info = get_tuning_attrs_from_json(args.tuning_attrs_json)
    run_test_reduce_sum((1024, 1024), "float32", (1,),
                        False, skip_config_set=skip_config_set, tuning_attrs_info=tuning_attrs_info)
| @@ -0,0 +1 @@ | |||||
| {"composite":true,"composite_graph":"11288","input_desc":[[{"data_type":"float32","shape":[1024],"tensor_name":"input_1"}],[{"data_type":"float32","shape":[1024],"tensor_name":"input_0"}],[{"data_type":"float16","shape":[8192,1024],"tensor_name":"input_2"}]],"op":"Fused_Cast_Cast_Mul_TensorAdd___12292245117929986167","op_desc":[{"attr":[{"name":"dst_type","value":"float16"}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1024],"tensor_name":"input_0"}]],"name":"Cast","output_desc":[{"data_type":"float16","name":"output","shape":[1024],"tensor_name":"output_0_0"}]},{"attr":[{"name":"dst_type","value":"float16"}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1024],"tensor_name":"input_1"}]],"name":"Cast","output_desc":[{"data_type":"float16","name":"output","shape":[1024],"tensor_name":"output_0_1"}]},{"attr":[{"name":"x_shape","value":[8192,1024]},{"name":"y_shape","value":[1024]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float16","name":"x","shape":[8192,1024],"tensor_name":"input_2"}],[{"data_type":"float16","name":"y","shape":[1024],"tensor_name":"output_0_1"}]],"name":"Mul","output_desc":[{"data_type":"float16","name":"output","shape":[8192,1024],"tensor_name":"output_0_2"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float16","name":"x","shape":[8192,1024],"tensor_name":"output_0_2"}],[{"data_type":"float16","name":"y","shape":[1024],"tensor_name":"output_0_0"}]],"name":"TensorAdd","output_desc":[{"data_type":"float16","name":"output","shape":[8192,1024],"tensor_name":"output_0_3"}]}],"output_desc":[{"data_type":"float16","shape":[1024],"tensor_name":"output_0_1"},{"data_type":"float16","shape":[8192,1024],"tensor_name":"output_0_3"}],"platform":"AKG","process":"aicore"} | |||||
| @@ -0,0 +1 @@ | |||||
| {"composite":true,"composite_graph":"44349.44349","id":1550,"input_desc":[[{"data_type":"float16","format":"DefaultFormat","shape":[4,1024,1],"tensor_name":"input_2"}],[{"data_type":"float16","format":"DefaultFormat","shape":[1024],"tensor_name":"input_4"}],[{"data_type":"float16","format":"DefaultFormat","shape":[4,1024,1024],"tensor_name":"input_1"}],[{"data_type":"float32","format":"DefaultFormat","shape":[1024],"tensor_name":"input_0"}]],"op":"Fused_Cast_RealDiv_Mul_TensorAdd_split_16909220147165618805","op_desc":[{"attr":[{"data_type":"bool","name":"is_backed_cast","value":false},{"data_type":"str","name":"pri_format","value":"NC1HWC0"},{"data_type":"str","name":"dst_type","value":"float16"}],"impl_path":"","input_desc":[[{"data_type":"float32","format":"DefaultFormat","name":"input_0","shape":[1024],"tensor_name":"input_0"}]],"name":"Cast","output_desc":[{"data_type":"float16","format":"DefaultFormat","name":"output","shape":[1024],"tensor_name":"output_0_0"}]},{"attr":[{"data_type":"str","name":"pri_format","value":"DefaultFormat"}],"impl_path":"","input_desc":[[{"data_type":"float16","format":"DefaultFormat","name":"input_0","shape":[4,1024,1024],"tensor_name":"input_1"}],[{"data_type":"float16","format":"DefaultFormat","name":"input_1","shape":[4,1024,1],"tensor_name":"input_2"}]],"name":"RealDiv","output_desc":[{"data_type":"float16","format":"DefaultFormat","name":"output","shape":[4,1024,1024],"tensor_name":"output_0_1"}]},{"attr":[{"data_type":"str","name":"pri_format","value":"DefaultFormat"}],"impl_path":"","input_desc":[[{"data_type":"float16","format":"DefaultFormat","name":"input_0","shape":[4,1024,1024],"tensor_name":"output_0_1"}],[{"data_type":"float16","format":"DefaultFormat","name":"input_1","shape":[1024],"tensor_name":"input_4"}]],"name":"Mul","output_desc":[{"data_type":"float16","format":"DefaultFormat","name":"output","shape":[4,1024,1024],"tensor_name":"output_0_2"}]},{"attr":[{"data_type":"str","name":"pri_format","value":"DefaultFor
mat"}],"impl_path":"","input_desc":[[{"data_type":"float16","format":"DefaultFormat","name":"input_0","shape":[4,1024,1024],"tensor_name":"output_0_2"}],[{"data_type":"float16","format":"DefaultFormat","name":"input_1","shape":[1024],"tensor_name":"output_0_0"}]],"name":"TensorAdd","output_desc":[{"data_type":"float16","format":"DefaultFormat","name":"output","shape":[4,1024,1024],"tensor_name":"output_0_3"}]}],"output_desc":[{"data_type":"float16","format":"DefaultFormat","shape":[4,1024,1024],"tensor_name":"output_0_1"},{"data_type":"float16","format":"DefaultFormat","shape":[4,1024,1024],"tensor_name":"output_0_3"}],"platform":"AKG","process":"aicore"} | |||||
| @@ -0,0 +1 @@ | |||||
| {"composite":true,"composite_graph":"26625.40461","id":2555,"input_desc":[[{"data_type":"float32","format":"DefaultFormat","shape":[1,1],"tensor_name":"input_0"}],[{"data_type":"float32","format":"DefaultFormat","shape":[1024,1024],"tensor_name":"input_8"}]],"op":"Fused_ClipByNormNoDivSum_RealDiv_fusion_4181144419579591378","op_desc":[{"attr":[{"data_type":"str","name":"fusion","value":"SelectGT_000"}],"fusion":"SelectGT_000","impl_path":"","input_desc":[[{"data_type":"float32","format":"DefaultFormat","name":"input_0","shape":[1,1],"tensor_name":"input_0"}],[{"data_type":"float32","format":"DefaultFormat","name":"input_1","shape":[1],"tensor_name":"input_1","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"float32","format":"DefaultFormat","name":"output","shape":[1,1],"tensor_name":"output_0_0"}]},{"attr":[{"data_type":"str","name":"fusion","value":"SelectGT_000_end"}],"fusion":"SelectGT_000_end","impl_path":"","input_desc":[[{"data_type":"float32","format":"DefaultFormat","name":"input_0","shape":[1,1],"tensor_name":"output_0_0"}],[{"data_type":"float32","format":"DefaultFormat","name":"input_1","shape":[1,1],"tensor_name":"input_0"}],[{"data_type":"float32","format":"DefaultFormat","name":"input_2","shape":[1],"tensor_name":"input_3","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","format":"DefaultFormat","name":"output","shape":[1,1],"tensor_name":"output_0_1"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","format":"DefaultFormat","name":"input_0","shape":[1,1],"tensor_name":"output_0_1"}]],"name":"Sqrt","output_desc":[{"data_type":"float32","format":"DefaultFormat","name":"output","shape":[1,1],"tensor_name":"output_0_2"}]},{"attr":[{"data_type":"str","name":"fusion","value":"SelectGT_000_end"}],"fusion":"SelectGT_000_end","impl_path":"","input_desc":[[{"data_type":"float32","format":"DefaultFormat","name":"input_0","shape":[1,1],"tensor_name":"output_0_0"}],[{"data_type":"float32","format":"DefaultFormat
","name":"input_1","shape":[1,1],"tensor_name":"output_0_2"}],[{"data_type":"float32","format":"DefaultFormat","name":"input_2","shape":[1,1],"tensor_name":"input_0"}]],"name":"Select","output_desc":[{"data_type":"float32","format":"DefaultFormat","name":"output","shape":[1,1],"tensor_name":"output_0_3"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","format":"DefaultFormat","name":"input_0","shape":[1,1],"tensor_name":"output_0_3"}],[{"data_type":"float32","format":"DefaultFormat","name":"input_1","shape":[1],"tensor_name":"input_7","value":1.0}]],"name":"Maximum","output_desc":[{"data_type":"float32","format":"DefaultFormat","name":"output","shape":[1,1],"tensor_name":"output_0_4"}]},{"attr":[{"data_type":"str","name":"pri_format","value":"FRACTAL_NZ"}],"impl_path":"","input_desc":[[{"data_type":"float32","format":"DefaultFormat","name":"input_0","shape":[1024,1024],"tensor_name":"input_8"}],[{"data_type":"float32","format":"DefaultFormat","name":"input_1","shape":[1,1],"tensor_name":"output_0_4"}]],"name":"RealDiv","output_desc":[{"data_type":"float32","format":"DefaultFormat","name":"output","shape":[1024,1024],"tensor_name":"output_0_5"}]}],"output_desc":[{"data_type":"float32","format":"DefaultFormat","shape":[1024,1024],"tensor_name":"output_0_5"}],"platform":"AKG","process":"aicore"} | |||||
| @@ -0,0 +1 @@ | |||||
| {"composite":true,"composite_graph":"26625.40534","id":2566,"input_desc":[[{"data_type":"float32","format":"DefaultFormat","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","format":"DefaultFormat","shape":[4096],"tensor_name":"input_7"}]],"op":"Fused_ClipByNormNoDivSum_RealDiv_fusion_8238389606767005164","op_desc":[{"attr":[{"data_type":"str","name":"fusion","value":"SelectGT_000"}],"fusion":"SelectGT_000","impl_path":"","input_desc":[[{"data_type":"float32","format":"DefaultFormat","name":"input_0","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","format":"DefaultFormat","name":"input_1","shape":[1],"tensor_name":"input_1","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"float32","format":"DefaultFormat","name":"output","shape":[1],"tensor_name":"output_0_0"}]},{"attr":[{"data_type":"str","name":"fusion","value":"SelectGT_000_end"}],"fusion":"SelectGT_000_end","impl_path":"","input_desc":[[{"data_type":"float32","format":"DefaultFormat","name":"input_0","shape":[1],"tensor_name":"output_0_0"}],[{"data_type":"float32","format":"DefaultFormat","name":"input_1","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","format":"DefaultFormat","name":"input_2","shape":[1],"tensor_name":"input_3","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","format":"DefaultFormat","name":"output","shape":[1],"tensor_name":"output_0_1"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","format":"DefaultFormat","name":"input_0","shape":[1],"tensor_name":"output_0_1"}]],"name":"Sqrt","output_desc":[{"data_type":"float32","format":"DefaultFormat","name":"output","shape":[1],"tensor_name":"output_0_2"}]},{"attr":[{"data_type":"str","name":"fusion","value":"SelectGT_000_end"}],"fusion":"SelectGT_000_end","impl_path":"","input_desc":[[{"data_type":"float32","format":"DefaultFormat","name":"input_0","shape":[1],"tensor_name":"output_0_0"}],[{"data_type":"float32","format":"DefaultFormat","name":"input_1","sha
pe":[1],"tensor_name":"output_0_2"}],[{"data_type":"float32","format":"DefaultFormat","name":"input_2","shape":[1],"tensor_name":"input_0"}]],"name":"Select","output_desc":[{"data_type":"float32","format":"DefaultFormat","name":"output","shape":[1],"tensor_name":"output_0_3"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","format":"DefaultFormat","name":"input_0","shape":[1],"tensor_name":"output_0_3"}],[{"data_type":"float32","format":"DefaultFormat","name":"input_1","shape":[1],"tensor_name":"input_3","value":1.0}]],"name":"Maximum","output_desc":[{"data_type":"float32","format":"DefaultFormat","name":"output","shape":[1],"tensor_name":"output_0_4"}]},{"attr":[{"data_type":"str","name":"pri_format","value":"NC1HWC0"}],"impl_path":"","input_desc":[[{"data_type":"float32","format":"DefaultFormat","name":"input_0","shape":[4096],"tensor_name":"input_7"}],[{"data_type":"float32","format":"DefaultFormat","name":"input_1","shape":[1],"tensor_name":"output_0_4"}]],"name":"RealDiv","output_desc":[{"data_type":"float32","format":"DefaultFormat","name":"output","shape":[4096],"tensor_name":"output_0_5"}]}],"output_desc":[{"data_type":"float32","format":"DefaultFormat","shape":[4096],"tensor_name":"output_0_5"}],"platform":"AKG","process":"aicore"} | |||||
| @@ -0,0 +1 @@ | |||||
| {"composite":true,"composite_graph":"1907_sp_1847_1787_1728_1704_1680_1656_1587_sp_552_construct","input_desc":[[{"data_type":"float32","shape":[],"tensor_name":"input_2"}],[{"data_type":"float32","shape":[],"tensor_name":"input_0"}],[{"data_type":"float32","shape":[],"tensor_name":"input_1"}],[{"data_type":"float32","shape":[1],"tensor_name":"input_14"}],[{"data_type":"float32","shape":[2],"tensor_name":"input_16"}],[{"data_type":"float32","shape":[2],"tensor_name":"input_17"}]],"op":"Fused_LambUpdateWithLR_12073466097680829202","op_desc":[{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_1"}]],"name":"RealDiv","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_0"}],"process":"aicore"},{"attr":null,"fusion":"SelectGT_0019","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_1"}],"process":"aicore"},{"attr":null,"fusion":"SelectGT_0019_end","impl_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_1"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_6","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_2"}],"process":"aicore"},{"attr":null,"fusion":"SelectGT_0019","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_3"}]
,"process":"aicore"},{"attr":null,"fusion":"SelectGT_0009_end","impl_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_3"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_9","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_4"}],"process":"aicore"},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_4"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_11","value":10.0}]],"name":"Minimum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_5"}],"process":"aicore"},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_5"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Maximum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_6"}],"process":"aicore"},{"attr":[{"name":"x_shape","value":[]},{"name":"y_shape","value":[1]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_6"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_14"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[1],"tensor_name":"output_0_7"}],"process":"aicore"},{"attr":[{"name":"x_shape","value":[1]},{"name":"y_shape","value":[2]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_7"}],[{"data_type":"float32","name":"y","shape":[2],"tensor_name":"input_16"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[2],"tensor_name":"output_0
_8"}],"process":"aicore"},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[2],"tensor_name":"input_17"}],[{"data_type":"float32","name":"y","shape":[2],"tensor_name":"output_0_8"}]],"name":"Sub","output_desc":[{"data_type":"float32","name":"output","shape":[2],"tensor_name":"output_0_9"}],"process":"aicore"},{"attr":[{"name":"fake_output","value":true}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[2],"tensor_name":"input_17"}],[{"data_type":"float32","name":"y","shape":[2],"tensor_name":"output_0_9"}],[{"data_type":"float32","name":"z","shape":[2],"tensor_name":"output_0_9"}]],"name":"InplaceAssign","output_desc":[{"data_type":"float32","name":"output","shape":[2],"tensor_name":"output_0_10"}],"process":"aicore"}],"output_desc":[{"data_type":"float32","shape":[2],"tensor_name":"output_0_10"}],"platform":"AKG","process":"aicore"} | |||||
| @@ -0,0 +1 @@ | |||||
| {"composite":true,"composite_graph":"9739_sp_9282_8825_8368_7912_7889_7864_sp_2735_construct","input_desc":[[{"data_type":"float32","shape":[],"tensor_name":"input_2"}],[{"data_type":"float32","shape":[],"tensor_name":"input_0"}],[{"data_type":"float32","shape":[],"tensor_name":"input_1"}],[{"data_type":"float32","shape":[1],"tensor_name":"input_14"}],[{"data_type":"float32","shape":[512,1024],"tensor_name":"input_16"}],[{"data_type":"float32","shape":[512,1024],"tensor_name":"input_17"}]],"op":"Fused_LambUpdateWithLR_13492243466190004284","op_desc":[{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_1"}]],"name":"RealDiv","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_0"}]},{"attr":null,"fusion":"SelectGT_0013","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_1"}]},{"attr":null,"fusion":"SelectGT_0013_end","impl_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_1"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_6","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_2"}]},{"attr":null,"fusion":"SelectGT_0013","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_3"}]},{"attr":null,"fusion":"SelectGT_0002_end","im
pl_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_3"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_9","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_4"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_4"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_11","value":10.0}]],"name":"Minimum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_5"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_5"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Maximum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_6"}]},{"attr":[{"name":"x_shape","value":[]},{"name":"y_shape","value":[1]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_6"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_14"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[1],"tensor_name":"output_0_7"}]},{"attr":[{"name":"x_shape","value":[1]},{"name":"y_shape","value":[512,1024]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_7"}],[{"data_type":"float32","name":"y","shape":[512,1024],"tensor_name":"input_16"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[512,1024],"tensor_name":"output_0_8"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[512,1024],"tensor_name":"inp
ut_17"}],[{"data_type":"float32","name":"y","shape":[512,1024],"tensor_name":"output_0_8"}]],"name":"Sub","output_desc":[{"data_type":"float32","name":"output","shape":[512,1024],"tensor_name":"output_0_9"}]},{"attr":[{"name":"fake_output","value":true}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[512,1024],"tensor_name":"input_17"}],[{"data_type":"float32","name":"y","shape":[512,1024],"tensor_name":"output_0_9"}],[{"data_type":"float32","name":"z","shape":[512,1024],"tensor_name":"output_0_9"}]],"name":"InplaceAssign","output_desc":[{"data_type":"float32","name":"output","shape":[512,1024],"tensor_name":"output_0_10"}]}],"output_desc":[{"data_type":"float32","shape":[512,1024],"tensor_name":"output_0_10"}],"platform":"AKG","process":"aicore"} | |||||
| @@ -0,0 +1 @@ | |||||
| {"composite":true,"composite_graph":"1944_sp_1884_1824_1747_1716_1692_1668_1612_sp_655_construct","input_desc":[[{"data_type":"float32","shape":[],"tensor_name":"input_2"}],[{"data_type":"float32","shape":[],"tensor_name":"input_0"}],[{"data_type":"float32","shape":[],"tensor_name":"input_1"}],[{"data_type":"float32","shape":[1],"tensor_name":"input_14"}],[{"data_type":"float32","shape":[4096],"tensor_name":"input_16"}],[{"data_type":"float32","shape":[4096],"tensor_name":"input_17"}]],"op":"Fused_LambUpdateWithLR_1445905573061742177","op_desc":[{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_1"}]],"name":"RealDiv","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_0"}],"process":"aicore"},{"attr":null,"fusion":"SelectGT_0016","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_1"}],"process":"aicore"},{"attr":null,"fusion":"SelectGT_0016_end","impl_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_1"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_6","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_2"}],"process":"aicore"},{"attr":null,"fusion":"SelectGT_0016","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0
_3"}],"process":"aicore"},{"attr":null,"fusion":"SelectGT_0006_end","impl_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_3"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_9","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_4"}],"process":"aicore"},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_4"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_11","value":10.0}]],"name":"Minimum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_5"}],"process":"aicore"},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_5"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Maximum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_6"}],"process":"aicore"},{"attr":[{"name":"x_shape","value":[]},{"name":"y_shape","value":[1]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_6"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_14"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[1],"tensor_name":"output_0_7"}],"process":"aicore"},{"attr":[{"name":"x_shape","value":[1]},{"name":"y_shape","value":[4096]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_7"}],[{"data_type":"float32","name":"y","shape":[4096],"tensor_name":"input_16"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[4096],"tensor_n
ame":"output_0_8"}],"process":"aicore"},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[4096],"tensor_name":"input_17"}],[{"data_type":"float32","name":"y","shape":[4096],"tensor_name":"output_0_8"}]],"name":"Sub","output_desc":[{"data_type":"float32","name":"output","shape":[4096],"tensor_name":"output_0_9"}],"process":"aicore"},{"attr":[{"name":"fake_output","value":true}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[4096],"tensor_name":"input_17"}],[{"data_type":"float32","name":"y","shape":[4096],"tensor_name":"output_0_9"}],[{"data_type":"float32","name":"z","shape":[4096],"tensor_name":"output_0_9"}]],"name":"InplaceAssign","output_desc":[{"data_type":"float32","name":"output","shape":[4096],"tensor_name":"output_0_10"}],"process":"aicore"}],"output_desc":[{"data_type":"float32","shape":[4096],"tensor_name":"output_0_10"}],"platform":"AKG","process":"aicore"} | |||||
| @@ -0,0 +1 @@ | |||||
| {"composite":true,"composite_graph":"1957_sp_1897_1837_1732_1708_1684_1660_1593_sp_587_construct","input_desc":[[{"data_type":"float32","shape":[],"tensor_name":"input_2"}],[{"data_type":"float32","shape":[],"tensor_name":"input_0"}],[{"data_type":"float32","shape":[],"tensor_name":"input_1"}],[{"data_type":"float32","shape":[1],"tensor_name":"input_14"}],[{"data_type":"float32","shape":[1024],"tensor_name":"input_16"}],[{"data_type":"float32","shape":[1024],"tensor_name":"input_17"}]],"op":"Fused_LambUpdateWithLR_15689878575778426853","op_desc":[{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_1"}]],"name":"RealDiv","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_0"}],"process":"aicore"},{"attr":null,"fusion":"SelectGT_0013","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_1"}],"process":"aicore"},{"attr":null,"fusion":"SelectGT_0013_end","impl_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_1"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_6","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_2"}],"process":"aicore"},{"attr":null,"fusion":"SelectGT_0013","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_
0_3"}],"process":"aicore"},{"attr":null,"fusion":"SelectGT_0003_end","impl_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_3"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_9","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_4"}],"process":"aicore"},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_4"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_11","value":10.0}]],"name":"Minimum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_5"}],"process":"aicore"},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_5"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Maximum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_6"}],"process":"aicore"},{"attr":[{"name":"x_shape","value":[]},{"name":"y_shape","value":[1]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_6"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_14"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[1],"tensor_name":"output_0_7"}],"process":"aicore"},{"attr":[{"name":"x_shape","value":[1]},{"name":"y_shape","value":[1024]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_7"}],[{"data_type":"float32","name":"y","shape":[1024],"tensor_name":"input_16"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[1024],"tensor_
name":"output_0_8"}],"process":"aicore"},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1024],"tensor_name":"input_17"}],[{"data_type":"float32","name":"y","shape":[1024],"tensor_name":"output_0_8"}]],"name":"Sub","output_desc":[{"data_type":"float32","name":"output","shape":[1024],"tensor_name":"output_0_9"}],"process":"aicore"},{"attr":[{"name":"fake_output","value":true}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1024],"tensor_name":"input_17"}],[{"data_type":"float32","name":"y","shape":[1024],"tensor_name":"output_0_9"}],[{"data_type":"float32","name":"z","shape":[1024],"tensor_name":"output_0_9"}]],"name":"InplaceAssign","output_desc":[{"data_type":"float32","name":"output","shape":[1024],"tensor_name":"output_0_10"}],"process":"aicore"}],"output_desc":[{"data_type":"float32","shape":[1024],"tensor_name":"output_0_10"}],"platform":"AKG","process":"aicore"} | |||||
| @@ -0,0 +1 @@ | |||||
| {"composite":true,"composite_graph":"9298_sp_8841_8384_7927_7902_7879_7441_sp_2650_construct","input_desc":[[{"data_type":"float32","shape":[],"tensor_name":"input_2"}],[{"data_type":"float32","shape":[],"tensor_name":"input_0"}],[{"data_type":"float32","shape":[],"tensor_name":"input_1"}],[{"data_type":"float32","shape":[1],"tensor_name":"input_14"}],[{"data_type":"float32","shape":[21128],"tensor_name":"input_16"}],[{"data_type":"float32","shape":[21128],"tensor_name":"input_17"}]],"op":"Fused_LambUpdateWithLR_16040335705910473299","op_desc":[{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_1"}]],"name":"RealDiv","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_0"}]},{"attr":null,"fusion":"SelectGT_0020","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_1"}]},{"attr":null,"fusion":"SelectGT_0020_end","impl_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_1"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_6","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_2"}]},{"attr":null,"fusion":"SelectGT_0020","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_3"}]},{"attr":null,"fusion":"SelectGT_0009_end","impl_pat
h":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_3"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_9","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_4"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_4"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_11","value":10.0}]],"name":"Minimum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_5"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_5"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Maximum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_6"}]},{"attr":[{"name":"x_shape","value":[]},{"name":"y_shape","value":[1]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_6"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_14"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[1],"tensor_name":"output_0_7"}]},{"attr":[{"name":"x_shape","value":[1]},{"name":"y_shape","value":[21128]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_7"}],[{"data_type":"float32","name":"y","shape":[21128],"tensor_name":"input_16"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[21128],"tensor_name":"output_0_8"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[21128],"tensor_name":"input_17"}],[{"data_t
ype":"float32","name":"y","shape":[21128],"tensor_name":"output_0_8"}]],"name":"Sub","output_desc":[{"data_type":"float32","name":"output","shape":[21128],"tensor_name":"output_0_9"}]},{"attr":[{"name":"fake_output","value":true}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[21128],"tensor_name":"input_17"}],[{"data_type":"float32","name":"y","shape":[21128],"tensor_name":"output_0_9"}],[{"data_type":"float32","name":"z","shape":[21128],"tensor_name":"output_0_9"}]],"name":"InplaceAssign","output_desc":[{"data_type":"float32","name":"output","shape":[21128],"tensor_name":"output_0_10"}]}],"output_desc":[{"data_type":"float32","shape":[21128],"tensor_name":"output_0_10"}],"platform":"AKG","process":"aicore"} | |||||
| @@ -0,0 +1 @@ | |||||
| {"composite":true,"composite_graph":"1917_sp_1857_1797_1738_1712_1688_1664_1601_sp_621_construct","input_desc":[[{"data_type":"float32","shape":[],"tensor_name":"input_2"}],[{"data_type":"float32","shape":[],"tensor_name":"input_0"}],[{"data_type":"float32","shape":[],"tensor_name":"input_1"}],[{"data_type":"float32","shape":[1],"tensor_name":"input_14"}],[{"data_type":"float32","shape":[30522],"tensor_name":"input_16"}],[{"data_type":"float32","shape":[30522],"tensor_name":"input_17"}]],"op":"Fused_LambUpdateWithLR_3830386909471115343","op_desc":[{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_1"}]],"name":"RealDiv","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_0"}],"process":"aicore"},{"attr":null,"fusion":"SelectGT_0018","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_1"}],"process":"aicore"},{"attr":null,"fusion":"SelectGT_0018_end","impl_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_1"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_6","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_2"}],"process":"aicore"},{"attr":null,"fusion":"SelectGT_0018","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output
_0_3"}],"process":"aicore"},{"attr":null,"fusion":"SelectGT_0008_end","impl_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_3"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_9","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_4"}],"process":"aicore"},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_4"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_11","value":10.0}]],"name":"Minimum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_5"}],"process":"aicore"},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_5"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Maximum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_6"}],"process":"aicore"},{"attr":[{"name":"x_shape","value":[]},{"name":"y_shape","value":[1]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_6"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_14"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[1],"tensor_name":"output_0_7"}],"process":"aicore"},{"attr":[{"name":"x_shape","value":[1]},{"name":"y_shape","value":[30522]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_7"}],[{"data_type":"float32","name":"y","shape":[30522],"tensor_name":"input_16"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[30522],"ten
sor_name":"output_0_8"}],"process":"aicore"},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[30522],"tensor_name":"input_17"}],[{"data_type":"float32","name":"y","shape":[30522],"tensor_name":"output_0_8"}]],"name":"Sub","output_desc":[{"data_type":"float32","name":"output","shape":[30522],"tensor_name":"output_0_9"}],"process":"aicore"},{"attr":[{"name":"fake_output","value":true}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[30522],"tensor_name":"input_17"}],[{"data_type":"float32","name":"y","shape":[30522],"tensor_name":"output_0_9"}],[{"data_type":"float32","name":"z","shape":[30522],"tensor_name":"output_0_9"}]],"name":"InplaceAssign","output_desc":[{"data_type":"float32","name":"output","shape":[30522],"tensor_name":"output_0_10"}],"process":"aicore"}],"output_desc":[{"data_type":"float32","shape":[30522],"tensor_name":"output_0_10"}],"platform":"AKG","process":"aicore"} | |||||
| @@ -0,0 +1 @@ | |||||
| {"composite":true,"composite_graph":"9673_sp_9262_8851_8049_8024_8001_7586_sp_2782_construct","input_desc":[[{"data_type":"float32","shape":[],"tensor_name":"input_2"}],[{"data_type":"float32","shape":[],"tensor_name":"input_0"}],[{"data_type":"float32","shape":[],"tensor_name":"input_1"}],[{"data_type":"float32","shape":[1],"tensor_name":"input_14"}],[{"data_type":"float32","shape":[1024,1024],"tensor_name":"input_16"}],[{"data_type":"float32","shape":[1024,1024],"tensor_name":"input_17"}]],"op":"Fused_LambUpdateWithLR_4148121723898026533","op_desc":[{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_1"}]],"name":"RealDiv","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_0"}]},{"attr":null,"fusion":"SelectGT_0014","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_1"}]},{"attr":null,"fusion":"SelectGT_0014_end","impl_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_1"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_6","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_2"}]},{"attr":null,"fusion":"SelectGT_0014","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_3"}]},{"attr":null,"fusion":"SelectGT_0004_end","i
mpl_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_3"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_9","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_4"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_4"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_11","value":10.0}]],"name":"Minimum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_5"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_5"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Maximum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_6"}]},{"attr":[{"name":"x_shape","value":[]},{"name":"y_shape","value":[1]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_6"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_14"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[1],"tensor_name":"output_0_7"}]},{"attr":[{"name":"x_shape","value":[1]},{"name":"y_shape","value":[1024,1024]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_7"}],[{"data_type":"float32","name":"y","shape":[1024,1024],"tensor_name":"input_16"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[1024,1024],"tensor_name":"output_0_8"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1024,1024],"tensor_name"
:"input_17"}],[{"data_type":"float32","name":"y","shape":[1024,1024],"tensor_name":"output_0_8"}]],"name":"Sub","output_desc":[{"data_type":"float32","name":"output","shape":[1024,1024],"tensor_name":"output_0_9"}]},{"attr":[{"name":"fake_output","value":true}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1024,1024],"tensor_name":"input_17"}],[{"data_type":"float32","name":"y","shape":[1024,1024],"tensor_name":"output_0_9"}],[{"data_type":"float32","name":"z","shape":[1024,1024],"tensor_name":"output_0_9"}]],"name":"InplaceAssign","output_desc":[{"data_type":"float32","name":"output","shape":[1024,1024],"tensor_name":"output_0_10"}]}],"output_desc":[{"data_type":"float32","shape":[1024,1024],"tensor_name":"output_0_10"}],"platform":"AKG","process":"aicore"} | |||||
| @@ -0,0 +1 @@ | |||||
| {"composite":true,"composite_graph":"9290_sp_8833_8376_7919_7896_7873_7430_sp_2599_construct","input_desc":[[{"data_type":"float32","shape":[],"tensor_name":"input_2"}],[{"data_type":"float32","shape":[],"tensor_name":"input_0"}],[{"data_type":"float32","shape":[],"tensor_name":"input_1"}],[{"data_type":"float32","shape":[1],"tensor_name":"input_14"}],[{"data_type":"float32","shape":[2,1024],"tensor_name":"input_16"}],[{"data_type":"float32","shape":[2,1024],"tensor_name":"input_17"}]],"op":"Fused_LambUpdateWithLR_5080003035626701281","op_desc":[{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_1"}]],"name":"RealDiv","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_0"}]},{"attr":null,"fusion":"SelectGT_0012","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_1"}]},{"attr":null,"fusion":"SelectGT_0012_end","impl_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_1"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_6","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_2"}]},{"attr":null,"fusion":"SelectGT_0012","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_3"}]},{"attr":null,"fusion":"SelectGT_0001_end","impl_pa
th":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_3"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_9","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_4"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_4"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_11","value":10.0}]],"name":"Minimum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_5"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_5"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Maximum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_6"}]},{"attr":[{"name":"x_shape","value":[]},{"name":"y_shape","value":[1]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_6"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_14"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[1],"tensor_name":"output_0_7"}]},{"attr":[{"name":"x_shape","value":[1]},{"name":"y_shape","value":[2,1024]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_7"}],[{"data_type":"float32","name":"y","shape":[2,1024],"tensor_name":"input_16"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[2,1024],"tensor_name":"output_0_8"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[2,1024],"tensor_name":"input_17"}],[{"d
ata_type":"float32","name":"y","shape":[2,1024],"tensor_name":"output_0_8"}]],"name":"Sub","output_desc":[{"data_type":"float32","name":"output","shape":[2,1024],"tensor_name":"output_0_9"}]},{"attr":[{"name":"fake_output","value":true}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[2,1024],"tensor_name":"input_17"}],[{"data_type":"float32","name":"y","shape":[2,1024],"tensor_name":"output_0_9"}],[{"data_type":"float32","name":"z","shape":[2,1024],"tensor_name":"output_0_9"}]],"name":"InplaceAssign","output_desc":[{"data_type":"float32","name":"output","shape":[2,1024],"tensor_name":"output_0_10"}]}],"output_desc":[{"data_type":"float32","shape":[2,1024],"tensor_name":"output_0_10"}],"platform":"AKG","process":"aicore"} | |||||
| @@ -0,0 +1 @@ | |||||
| {"composite":true,"composite_graph":"9742_sp_9285_8828_8371_7914_7891_7868_sp_2752_construct","input_desc":[[{"data_type":"float32","shape":[],"tensor_name":"input_2"}],[{"data_type":"float32","shape":[],"tensor_name":"input_0"}],[{"data_type":"float32","shape":[],"tensor_name":"input_1"}],[{"data_type":"float32","shape":[1],"tensor_name":"input_14"}],[{"data_type":"float32","shape":[21128,1024],"tensor_name":"input_16"}],[{"data_type":"float32","shape":[21128,1024],"tensor_name":"input_17"}]],"op":"Fused_LambUpdateWithLR_8456945009561581117","op_desc":[{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_1"}]],"name":"RealDiv","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_0"}]},{"attr":null,"fusion":"SelectGT_0011","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_1"}]},{"attr":null,"fusion":"SelectGT_0011_end","impl_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_1"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_6","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_2"}]},{"attr":null,"fusion":"SelectGT_0011","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_3"}]},{"attr":null,"fusion":"SelectGT_0000_end",
"impl_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_3"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_9","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_4"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_4"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_11","value":10.0}]],"name":"Minimum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_5"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_5"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Maximum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_6"}]},{"attr":[{"name":"x_shape","value":[]},{"name":"y_shape","value":[1]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_6"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_14"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[1],"tensor_name":"output_0_7"}]},{"attr":[{"name":"x_shape","value":[1]},{"name":"y_shape","value":[21128,1024]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_7"}],[{"data_type":"float32","name":"y","shape":[21128,1024],"tensor_name":"input_16"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[21128,1024],"tensor_name":"output_0_8"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[21128,1024],"tensor
_name":"input_17"}],[{"data_type":"float32","name":"y","shape":[21128,1024],"tensor_name":"output_0_8"}]],"name":"Sub","output_desc":[{"data_type":"float32","name":"output","shape":[21128,1024],"tensor_name":"output_0_9"}]},{"attr":[{"name":"fake_output","value":true}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[21128,1024],"tensor_name":"input_17"}],[{"data_type":"float32","name":"y","shape":[21128,1024],"tensor_name":"output_0_9"}],[{"data_type":"float32","name":"z","shape":[21128,1024],"tensor_name":"output_0_9"}]],"name":"InplaceAssign","output_desc":[{"data_type":"float32","name":"output","shape":[21128,1024],"tensor_name":"output_0_10"}]}],"output_desc":[{"data_type":"float32","shape":[21128,1024],"tensor_name":"output_0_10"}],"platform":"AKG","process":"aicore"} | |||||
| @@ -0,0 +1 @@ | |||||
| {"composite":true,"composite_graph":"10313","input_desc":[[{"data_type":"float16","shape":[16,1,1,512],"tensor_name":"input_2"}],[{"data_type":"float16","shape":[16,16,512,512],"tensor_name":"input_1"}]],"op":"Fused_Mul_Mul_TensorAdd__4400644352246048056","op_desc":[{"attr":[{"name":"x_shape","value":[1]},{"name":"y_shape","value":[16,16,512,512]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float16","name":"x","shape":[1],"tensor_name":"input_0","value":0.125}],[{"data_type":"float16","name":"y","shape":[16,16,512,512],"tensor_name":"input_1"}]],"name":"Mul","output_desc":[{"data_type":"float16","name":"output","shape":[16,16,512,512],"tensor_name":"output_0_0"}]},{"attr":[{"name":"x_shape","value":[16,1,1,512]},{"name":"y_shape","value":[1]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float16","name":"x","shape":[16,1,1,512],"tensor_name":"input_2"}],[{"data_type":"float16","name":"y","shape":[1],"tensor_name":"input_3","value":-10000.0}]],"name":"Mul","output_desc":[{"data_type":"float16","name":"output","shape":[16,1,1,512],"tensor_name":"output_0_1"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float16","name":"x","shape":[16,1,1,512],"tensor_name":"output_0_1"}],[{"data_type":"float16","name":"y","shape":[16,16,512,512],"tensor_name":"output_0_0"}]],"name":"TensorAdd","output_desc":[{"data_type":"float16","name":"output","shape":[16,16,512,512],"tensor_name":"output_0_2"}]}],"output_desc":[{"data_type":"float16","shape":[16,1,1,512],"tensor_name":"output_0_1"},{"data_type":"float16","shape":[16,16,512,512],"tensor_name":"output_0_2"}],"platform":"AKG","process":"aicore"} | |||||
| @@ -0,0 +1,707 @@ | |||||
| { | |||||
| "composite": true, | |||||
| "input_desc": [ | |||||
| [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "shape": [ | |||||
| 32, | |||||
| 16, | |||||
| 56, | |||||
| 56, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "input_14" | |||||
| } | |||||
| ], | |||||
| [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "shape": [ | |||||
| 1, | |||||
| 16, | |||||
| 1, | |||||
| 1, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "input_0" | |||||
| } | |||||
| ], | |||||
| [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "shape": [ | |||||
| 1, | |||||
| 16, | |||||
| 1, | |||||
| 1, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "input_3" | |||||
| } | |||||
| ], | |||||
| [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "shape": [ | |||||
| 1, | |||||
| 16, | |||||
| 1, | |||||
| 1, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "input_9" | |||||
| } | |||||
| ], | |||||
| [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "shape": [ | |||||
| 1, | |||||
| 16, | |||||
| 1, | |||||
| 1, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "input_12" | |||||
| } | |||||
| ] | |||||
| ], | |||||
| "op": "Fused_Poly_Schedule_Opt_001", | |||||
| "op_desc": [ | |||||
| { | |||||
| "attr": [ | |||||
| { | |||||
| "name": "x_shape", | |||||
| "value": [ | |||||
| 256 | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "name": "y_shape", | |||||
| "value": [ | |||||
| 1 | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "name": "data_format", | |||||
| "value": [ | |||||
| "NC1HWC0", | |||||
| "NC1HWC0" | |||||
| ] | |||||
| } | |||||
| ], | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [ | |||||
| 1, | |||||
| 16, | |||||
| 1, | |||||
| 1, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "input_0" | |||||
| } | |||||
| ], | |||||
| [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "y", | |||||
| "shape": [ | |||||
| 1 | |||||
| ], | |||||
| "tensor_name": "input_1", | |||||
| "value": 9.964923265215475e-06 | |||||
| } | |||||
| ] | |||||
| ], | |||||
| "name": "Mul", | |||||
| "output_desc": [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [ | |||||
| 1, | |||||
| 16, | |||||
| 1, | |||||
| 1, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "output_0_0" | |||||
| } | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "attr": [ | |||||
| { | |||||
| "name": "x_shape", | |||||
| "value": [ | |||||
| 256 | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "name": "y_shape", | |||||
| "value": [ | |||||
| 256 | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "name": "data_format", | |||||
| "value": [ | |||||
| "NC1HWC0", | |||||
| "NC1HWC0" | |||||
| ] | |||||
| } | |||||
| ], | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [ | |||||
| 1, | |||||
| 16, | |||||
| 1, | |||||
| 1, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "output_0_0" | |||||
| } | |||||
| ], | |||||
| [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "y", | |||||
| "shape": [ | |||||
| 1, | |||||
| 16, | |||||
| 1, | |||||
| 1, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "output_0_0" | |||||
| } | |||||
| ] | |||||
| ], | |||||
| "name": "Mul", | |||||
| "output_desc": [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [ | |||||
| 1, | |||||
| 16, | |||||
| 1, | |||||
| 1, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "output_0_1" | |||||
| } | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "attr": [ | |||||
| { | |||||
| "name": "x_shape", | |||||
| "value": [ | |||||
| 256 | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "name": "y_shape", | |||||
| "value": [ | |||||
| 1 | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "name": "data_format", | |||||
| "value": [ | |||||
| "NC1HWC0", | |||||
| "NC1HWC0" | |||||
| ] | |||||
| } | |||||
| ], | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [ | |||||
| 1, | |||||
| 16, | |||||
| 1, | |||||
| 1, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "input_3" | |||||
| } | |||||
| ], | |||||
| [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "y", | |||||
| "shape": [ | |||||
| 1 | |||||
| ], | |||||
| "tensor_name": "input_1", | |||||
| "value": 9.964923265215475e-06 | |||||
| } | |||||
| ] | |||||
| ], | |||||
| "name": "Mul", | |||||
| "output_desc": [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [ | |||||
| 1, | |||||
| 16, | |||||
| 1, | |||||
| 1, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "output_0_2" | |||||
| } | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "attr": null, | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [ | |||||
| 1, | |||||
| 16, | |||||
| 1, | |||||
| 1, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "output_0_2" | |||||
| } | |||||
| ], | |||||
| [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "y", | |||||
| "shape": [ | |||||
| 1, | |||||
| 16, | |||||
| 1, | |||||
| 1, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "output_0_1" | |||||
| } | |||||
| ] | |||||
| ], | |||||
| "name": "Sub", | |||||
| "output_desc": [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [ | |||||
| 1, | |||||
| 16, | |||||
| 1, | |||||
| 1, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "output_0_3" | |||||
| } | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "attr": null, | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [ | |||||
| 1, | |||||
| 16, | |||||
| 1, | |||||
| 1, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "output_0_3" | |||||
| } | |||||
| ], | |||||
| [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "y", | |||||
| "shape": [ | |||||
| 1 | |||||
| ], | |||||
| "tensor_name": "input_7", | |||||
| "value": 9.999999747378752e-06 | |||||
| } | |||||
| ] | |||||
| ], | |||||
| "name": "TensorAdd", | |||||
| "output_desc": [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [ | |||||
| 1, | |||||
| 16, | |||||
| 1, | |||||
| 1, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "output_0_4" | |||||
| } | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "attr": null, | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [ | |||||
| 1, | |||||
| 16, | |||||
| 1, | |||||
| 1, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "output_0_4" | |||||
| } | |||||
| ] | |||||
| ], | |||||
| "name": "Sqrt", | |||||
| "output_desc": [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [ | |||||
| 1, | |||||
| 16, | |||||
| 1, | |||||
| 1, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "output_0_5" | |||||
| } | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "attr": null, | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [ | |||||
| 1, | |||||
| 16, | |||||
| 1, | |||||
| 1, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "input_9" | |||||
| } | |||||
| ], | |||||
| [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "y", | |||||
| "shape": [ | |||||
| 1, | |||||
| 16, | |||||
| 1, | |||||
| 1, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "output_0_5" | |||||
| } | |||||
| ] | |||||
| ], | |||||
| "name": "RealDiv", | |||||
| "output_desc": [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [ | |||||
| 1, | |||||
| 16, | |||||
| 1, | |||||
| 1, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "output_0_6" | |||||
| } | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "attr": [ | |||||
| { | |||||
| "name": "x_shape", | |||||
| "value": [ | |||||
| 256 | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "name": "y_shape", | |||||
| "value": [ | |||||
| 256 | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "name": "data_format", | |||||
| "value": [ | |||||
| "NC1HWC0", | |||||
| "NC1HWC0" | |||||
| ] | |||||
| } | |||||
| ], | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [ | |||||
| 1, | |||||
| 16, | |||||
| 1, | |||||
| 1, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "output_0_6" | |||||
| } | |||||
| ], | |||||
| [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "y", | |||||
| "shape": [ | |||||
| 1, | |||||
| 16, | |||||
| 1, | |||||
| 1, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "output_0_0" | |||||
| } | |||||
| ] | |||||
| ], | |||||
| "name": "Mul", | |||||
| "output_desc": [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [ | |||||
| 1, | |||||
| 16, | |||||
| 1, | |||||
| 1, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "output_0_7" | |||||
| } | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "attr": null, | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [ | |||||
| 1, | |||||
| 16, | |||||
| 1, | |||||
| 1, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "input_12" | |||||
| } | |||||
| ], | |||||
| [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "y", | |||||
| "shape": [ | |||||
| 1, | |||||
| 16, | |||||
| 1, | |||||
| 1, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "output_0_7" | |||||
| } | |||||
| ] | |||||
| ], | |||||
| "name": "Sub", | |||||
| "output_desc": [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [ | |||||
| 1, | |||||
| 16, | |||||
| 1, | |||||
| 1, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "output_0_8" | |||||
| } | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "attr": [ | |||||
| { | |||||
| "name": "x_shape", | |||||
| "value": [ | |||||
| 256 | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "name": "y_shape", | |||||
| "value": [ | |||||
| 32, | |||||
| 256, | |||||
| 56, | |||||
| 56 | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "name": "data_format", | |||||
| "value": [ | |||||
| "NC1HWC0", | |||||
| "NC1HWC0" | |||||
| ] | |||||
| } | |||||
| ], | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [ | |||||
| 1, | |||||
| 16, | |||||
| 1, | |||||
| 1, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "output_0_6" | |||||
| } | |||||
| ], | |||||
| [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "y", | |||||
| "shape": [ | |||||
| 32, | |||||
| 16, | |||||
| 56, | |||||
| 56, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "input_14" | |||||
| } | |||||
| ] | |||||
| ], | |||||
| "name": "Mul", | |||||
| "output_desc": [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [ | |||||
| 32, | |||||
| 16, | |||||
| 56, | |||||
| 56, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "output_0_9" | |||||
| } | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "attr": null, | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [ | |||||
| 32, | |||||
| 16, | |||||
| 56, | |||||
| 56, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "output_0_9" | |||||
| } | |||||
| ], | |||||
| [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "y", | |||||
| "shape": [ | |||||
| 1, | |||||
| 16, | |||||
| 1, | |||||
| 1, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "output_0_8" | |||||
| } | |||||
| ] | |||||
| ], | |||||
| "name": "TensorAdd", | |||||
| "output_desc": [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [ | |||||
| 32, | |||||
| 16, | |||||
| 56, | |||||
| 56, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "output_0_10" | |||||
| } | |||||
| ] | |||||
| } | |||||
| ], | |||||
| "output_desc": [ | |||||
| { | |||||
| "data_type": "float32", | |||||
| "shape": [ | |||||
| 32, | |||||
| 16, | |||||
| 56, | |||||
| 56, | |||||
| 16 | |||||
| ], | |||||
| "tensor_name": "output_0_10" | |||||
| } | |||||
| ], | |||||
| "platform": "AKG", | |||||
| "process": "aicore" | |||||
| } | |||||
| @@ -0,0 +1,320 @@ | |||||
| { | |||||
| "composite": true, | |||||
| "input_desc": [ | |||||
| [{ | |||||
| "data_type": "float16", | |||||
| "shape": [32, 16, 56, 56, 16], | |||||
| "tensor_name": "input_0" | |||||
| }], | |||||
| [{ | |||||
| "data_type": "float16", | |||||
| "shape": [32, 16, 56, 56, 16], | |||||
| "tensor_name": "input_9" | |||||
| }], | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "shape": [1, 16, 1, 1, 16], | |||||
| "tensor_name": "input_7" | |||||
| }], | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "shape": [1, 16, 1, 1, 16], | |||||
| "tensor_name": "input_2" | |||||
| }] | |||||
| ], | |||||
| "op": "Fused_Poly_Schedule_Opt_002", | |||||
| "op_desc": [{ | |||||
| "attr": [{ | |||||
| "name": "dst_type", | |||||
| "value": "float32" | |||||
| }], | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [{ | |||||
| "data_type": "float16", | |||||
| "name": "x", | |||||
| "shape": [32, 16, 56, 56, 16], | |||||
| "tensor_name": "input_0" | |||||
| }] | |||||
| ], | |||||
| "name": "Cast", | |||||
| "output_desc": [{ | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [32, 16, 56, 56, 16], | |||||
| "tensor_name": "output_0_0" | |||||
| }] | |||||
| }, { | |||||
| "attr": [{ | |||||
| "name": "axis", | |||||
| "value": [0, 2, 3] | |||||
| }, { | |||||
| "name": "keep_dims", | |||||
| "value": true | |||||
| }], | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [32, 16, 56, 56, 16], | |||||
| "tensor_name": "output_0_0" | |||||
| }] | |||||
| ], | |||||
| "name": "ReduceSum", | |||||
| "output_desc": [{ | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [1, 16, 1, 1, 16], | |||||
| "tensor_name": "output_0_1" | |||||
| }] | |||||
| }, { | |||||
| "attr": null, | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [1, 16, 1, 1, 16], | |||||
| "tensor_name": "input_2" | |||||
| }], | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "y", | |||||
| "shape": [1], | |||||
| "tensor_name": "input_3", | |||||
| "value": 9.999999747378752e-05 | |||||
| }] | |||||
| ], | |||||
| "name": "TensorAdd", | |||||
| "output_desc": [{ | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [1, 16, 1, 1, 16], | |||||
| "tensor_name": "output_0_2" | |||||
| }] | |||||
| }, { | |||||
| "attr": null, | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [1, 16, 1, 1, 16], | |||||
| "tensor_name": "output_0_2" | |||||
| }] | |||||
| ], | |||||
| "name": "Sqrt", | |||||
| "output_desc": [{ | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [1, 16, 1, 1, 16], | |||||
| "tensor_name": "output_0_3" | |||||
| }] | |||||
| }, { | |||||
| "attr": null, | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [1], | |||||
| "tensor_name": "input_5", | |||||
| "value": 1.0 | |||||
| }], | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "y", | |||||
| "shape": [1, 16, 1, 1, 16], | |||||
| "tensor_name": "output_0_3" | |||||
| }] | |||||
| ], | |||||
| "name": "RealDiv", | |||||
| "output_desc": [{ | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [1, 16, 1, 1, 16], | |||||
| "tensor_name": "output_0_4" | |||||
| }] | |||||
| }, { | |||||
| "attr": [{ | |||||
| "name": "x_shape", | |||||
| "value": [256] | |||||
| }, { | |||||
| "name": "y_shape", | |||||
| "value": [] | |||||
| }, { | |||||
| "name": "data_format", | |||||
| "value": ["NC1HWC0", "NC1HWC0"] | |||||
| }], | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [1, 16, 1, 1, 16], | |||||
| "tensor_name": "input_7" | |||||
| }], | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "y", | |||||
| "shape": [1], | |||||
| "tensor_name": "input_8", | |||||
| "value": -1.0 | |||||
| }] | |||||
| ], | |||||
| "name": "Mul", | |||||
| "output_desc": [{ | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [1, 16, 1, 1, 16], | |||||
| "tensor_name": "output_0_5" | |||||
| }] | |||||
| }, { | |||||
| "attr": [{ | |||||
| "name": "dst_type", | |||||
| "value": "float32" | |||||
| }], | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [{ | |||||
| "data_type": "float16", | |||||
| "name": "x", | |||||
| "shape": [32, 16, 56, 56, 16], | |||||
| "tensor_name": "input_9" | |||||
| }] | |||||
| ], | |||||
| "name": "Cast", | |||||
| "output_desc": [{ | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [32, 16, 56, 56, 16], | |||||
| "tensor_name": "output_0_6" | |||||
| }] | |||||
| }, { | |||||
| "attr": null, | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [32, 16, 56, 56, 16], | |||||
| "tensor_name": "output_0_6" | |||||
| }], | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "y", | |||||
| "shape": [1, 16, 1, 1, 16], | |||||
| "tensor_name": "output_0_5" | |||||
| }] | |||||
| ], | |||||
| "name": "TensorAdd", | |||||
| "output_desc": [{ | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [32, 16, 56, 56, 16], | |||||
| "tensor_name": "output_0_7" | |||||
| }] | |||||
| }, { | |||||
| "attr": [{ | |||||
| "name": "x_shape", | |||||
| "value": [32, 256, 56, 56] | |||||
| }, { | |||||
| "name": "y_shape", | |||||
| "value": [256] | |||||
| }, { | |||||
| "name": "data_format", | |||||
| "value": ["NC1HWC0", "NC1HWC0"] | |||||
| }], | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [32, 16, 56, 56, 16], | |||||
| "tensor_name": "output_0_7" | |||||
| }], | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "y", | |||||
| "shape": [1, 16, 1, 1, 16], | |||||
| "tensor_name": "output_0_4" | |||||
| }] | |||||
| ], | |||||
| "name": "Mul", | |||||
| "output_desc": [{ | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [32, 16, 56, 56, 16], | |||||
| "tensor_name": "output_0_8" | |||||
| }] | |||||
| }, { | |||||
| "attr": [{ | |||||
| "name": "x_shape", | |||||
| "value": [32, 256, 56, 56] | |||||
| }, { | |||||
| "name": "y_shape", | |||||
| "value": [32, 256, 56, 56] | |||||
| }, { | |||||
| "name": "data_format", | |||||
| "value": ["NC1HWC0", "NC1HWC0"] | |||||
| }], | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [32, 16, 56, 56, 16], | |||||
| "tensor_name": "output_0_0" | |||||
| }], | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "y", | |||||
| "shape": [32, 16, 56, 56, 16], | |||||
| "tensor_name": "output_0_8" | |||||
| }] | |||||
| ], | |||||
| "name": "Mul", | |||||
| "output_desc": [{ | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [32, 16, 56, 56, 16], | |||||
| "tensor_name": "output_0_9" | |||||
| }] | |||||
| }, { | |||||
| "attr": [{ | |||||
| "name": "axis", | |||||
| "value": [0, 2, 3] | |||||
| }, { | |||||
| "name": "keep_dims", | |||||
| "value": true | |||||
| }], | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [32, 16, 56, 56, 16], | |||||
| "tensor_name": "output_0_9" | |||||
| }] | |||||
| ], | |||||
| "name": "ReduceSum", | |||||
| "output_desc": [{ | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [1, 16, 1, 1, 16], | |||||
| "tensor_name": "output_0_10" | |||||
| }] | |||||
| }], | |||||
| "output_desc": [{ | |||||
| "data_type": "float32", | |||||
| "shape": [1, 16, 1, 1, 16], | |||||
| "tensor_name": "output_0_10" | |||||
| }, { | |||||
| "data_type": "float32", | |||||
| "shape": [1, 16, 1, 1, 16], | |||||
| "tensor_name": "output_0_1" | |||||
| }], | |||||
| "platform": "AKG", | |||||
| "process": "aicore" | |||||
| } | |||||
| @@ -0,0 +1,500 @@ | |||||
| { | |||||
| "composite": true, | |||||
| "input_desc": [ | |||||
| [{ | |||||
| "data_type": "float16", | |||||
| "shape": [32, 4, 56, 56, 16], | |||||
| "tensor_name": "input_19" | |||||
| }], | |||||
| [{ | |||||
| "data_type": "float16", | |||||
| "shape": [32, 4, 56, 56, 16], | |||||
| "tensor_name": "input_14" | |||||
| }], | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "input_7" | |||||
| }], | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "input_5" | |||||
| }], | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "input_3" | |||||
| }], | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "input_9" | |||||
| }], | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "input_0" | |||||
| }] | |||||
| ], | |||||
| "op": "Fused_Poly_Schedule_Opt_003", | |||||
| "op_desc": [{ | |||||
| "attr": null, | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "input_0" | |||||
| }], | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "y", | |||||
| "shape": [1], | |||||
| "tensor_name": "input_1", | |||||
| "value": 9.999999747378752e-05 | |||||
| }] | |||||
| ], | |||||
| "name": "TensorAdd", | |||||
| "output_desc": [{ | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "output_0_0" | |||||
| }] | |||||
| }, { | |||||
| "attr": null, | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "output_0_0" | |||||
| }] | |||||
| ], | |||||
| "name": "Sqrt", | |||||
| "output_desc": [{ | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "output_0_1" | |||||
| }] | |||||
| }, { | |||||
| "attr": null, | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "input_3" | |||||
| }], | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "y", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "output_0_1" | |||||
| }] | |||||
| ], | |||||
| "name": "RealDiv", | |||||
| "output_desc": [{ | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "output_0_2" | |||||
| }] | |||||
| }, { | |||||
| "attr": [{ | |||||
| "name": "x_shape", | |||||
| "value": [64] | |||||
| }, { | |||||
| "name": "y_shape", | |||||
| "value": [1] | |||||
| }, { | |||||
| "name": "data_format", | |||||
| "value": ["NC1HWC0", "NC1HWC0"] | |||||
| }], | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "input_5" | |||||
| }], | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "y", | |||||
| "shape": [1], | |||||
| "tensor_name": "input_6", | |||||
| "value": -9.964923265215475e-06 | |||||
| }] | |||||
| ], | |||||
| "name": "Mul", | |||||
| "output_desc": [{ | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "output_0_3" | |||||
| }] | |||||
| }, { | |||||
| "attr": [{ | |||||
| "name": "x_shape", | |||||
| "value": [64] | |||||
| }, { | |||||
| "name": "y_shape", | |||||
| "value": [1] | |||||
| }, { | |||||
| "name": "data_format", | |||||
| "value": ["NC1HWC0", "NC1HWC0"] | |||||
| }], | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "input_7" | |||||
| }], | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "y", | |||||
| "shape": [1], | |||||
| "tensor_name": "input_8", | |||||
| "value": 9.964923265215475e-06 | |||||
| }] | |||||
| ], | |||||
| "name": "Mul", | |||||
| "output_desc": [{ | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "output_0_4" | |||||
| }] | |||||
| }, { | |||||
| "attr": null, | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "input_9" | |||||
| }], | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "y", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "output_0_1" | |||||
| }] | |||||
| ], | |||||
| "name": "RealDiv", | |||||
| "output_desc": [{ | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "output_0_5" | |||||
| }] | |||||
| }, { | |||||
| "attr": [{ | |||||
| "name": "x_shape", | |||||
| "value": [64] | |||||
| }, { | |||||
| "name": "y_shape", | |||||
| "value": [64] | |||||
| }, { | |||||
| "name": "data_format", | |||||
| "value": ["NC1HWC0", "NC1HWC0"] | |||||
| }], | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "output_0_5" | |||||
| }], | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "y", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "output_0_4" | |||||
| }] | |||||
| ], | |||||
| "name": "Mul", | |||||
| "output_desc": [{ | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "output_0_6" | |||||
| }] | |||||
| }, { | |||||
| "attr": null, | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "output_0_6" | |||||
| }], | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "y", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "output_0_3" | |||||
| }] | |||||
| ], | |||||
| "name": "TensorAdd", | |||||
| "output_desc": [{ | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "output_0_7" | |||||
| }] | |||||
| }, { | |||||
| "attr": [{ | |||||
| "name": "dst_type", | |||||
| "value": "float32" | |||||
| }], | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [{ | |||||
| "data_type": "float16", | |||||
| "name": "x", | |||||
| "shape": [32, 4, 56, 56, 16], | |||||
| "tensor_name": "input_14" | |||||
| }] | |||||
| ], | |||||
| "name": "Cast", | |||||
| "output_desc": [{ | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [32, 4, 56, 56, 16], | |||||
| "tensor_name": "output_0_8" | |||||
| }] | |||||
| }, { | |||||
| "attr": [{ | |||||
| "name": "x_shape", | |||||
| "value": [64] | |||||
| }, { | |||||
| "name": "y_shape", | |||||
| "value": [1] | |||||
| }, { | |||||
| "name": "data_format", | |||||
| "value": ["NC1HWC0", "NC1HWC0"] | |||||
| }], | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "input_7" | |||||
| }], | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "y", | |||||
| "shape": [1], | |||||
| "tensor_name": "input_15", | |||||
| "value": -9.964923265215475e-06 | |||||
| }] | |||||
| ], | |||||
| "name": "Mul", | |||||
| "output_desc": [{ | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "output_0_9" | |||||
| }] | |||||
| }, { | |||||
| "attr": null, | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "output_0_9" | |||||
| }], | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "y", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "output_0_1" | |||||
| }] | |||||
| ], | |||||
| "name": "RealDiv", | |||||
| "output_desc": [{ | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "output_0_10" | |||||
| }] | |||||
| }, { | |||||
| "attr": [{ | |||||
| "name": "x_shape", | |||||
| "value": [64] | |||||
| }, { | |||||
| "name": "y_shape", | |||||
| "value": [32, 64, 56, 56] | |||||
| }, { | |||||
| "name": "data_format", | |||||
| "value": ["NC1HWC0", "NC1HWC0"] | |||||
| }], | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "output_0_10" | |||||
| }], | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "y", | |||||
| "shape": [32, 4, 56, 56, 16], | |||||
| "tensor_name": "output_0_8" | |||||
| }] | |||||
| ], | |||||
| "name": "Mul", | |||||
| "output_desc": [{ | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [32, 4, 56, 56, 16], | |||||
| "tensor_name": "output_0_11" | |||||
| }] | |||||
| }, { | |||||
| "attr": [{ | |||||
| "name": "dst_type", | |||||
| "value": "float32" | |||||
| }], | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [{ | |||||
| "data_type": "float16", | |||||
| "name": "x", | |||||
| "shape": [32, 4, 56, 56, 16], | |||||
| "tensor_name": "input_19" | |||||
| }] | |||||
| ], | |||||
| "name": "Cast", | |||||
| "output_desc": [{ | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [32, 4, 56, 56, 16], | |||||
| "tensor_name": "output_0_12" | |||||
| }] | |||||
| }, { | |||||
| "attr": null, | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [32, 4, 56, 56, 16], | |||||
| "tensor_name": "output_0_12" | |||||
| }], | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "y", | |||||
| "shape": [32, 4, 56, 56, 16], | |||||
| "tensor_name": "output_0_11" | |||||
| }] | |||||
| ], | |||||
| "name": "TensorAdd", | |||||
| "output_desc": [{ | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [32, 4, 56, 56, 16], | |||||
| "tensor_name": "output_0_13" | |||||
| }] | |||||
| }, { | |||||
| "attr": null, | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [32, 4, 56, 56, 16], | |||||
| "tensor_name": "output_0_13" | |||||
| }], | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "y", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "output_0_7" | |||||
| }] | |||||
| ], | |||||
| "name": "TensorAdd", | |||||
| "output_desc": [{ | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [32, 4, 56, 56, 16], | |||||
| "tensor_name": "output_0_14" | |||||
| }] | |||||
| }, { | |||||
| "attr": [{ | |||||
| "name": "x_shape", | |||||
| "value": [32, 64, 56, 56] | |||||
| }, { | |||||
| "name": "y_shape", | |||||
| "value": [64] | |||||
| }, { | |||||
| "name": "data_format", | |||||
| "value": ["NC1HWC0", "NC1HWC0"] | |||||
| }], | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [32, 4, 56, 56, 16], | |||||
| "tensor_name": "output_0_14" | |||||
| }], | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "y", | |||||
| "shape": [1, 4, 1, 1, 16], | |||||
| "tensor_name": "output_0_2" | |||||
| }] | |||||
| ], | |||||
| "name": "Mul", | |||||
| "output_desc": [{ | |||||
| "data_type": "float32", | |||||
| "name": "output", | |||||
| "shape": [32, 4, 56, 56, 16], | |||||
| "tensor_name": "output_0_15" | |||||
| }] | |||||
| }, { | |||||
| "attr": [{ | |||||
| "name": "dst_type", | |||||
| "value": "float16" | |||||
| }], | |||||
| "impl_path": "", | |||||
| "input_desc": [ | |||||
| [{ | |||||
| "data_type": "float32", | |||||
| "name": "x", | |||||
| "shape": [32, 4, 56, 56, 16], | |||||
| "tensor_name": "output_0_15" | |||||
| }] | |||||
| ], | |||||
| "name": "Cast", | |||||
| "output_desc": [{ | |||||
| "data_type": "float16", | |||||
| "name": "output", | |||||
| "shape": [32, 4, 56, 56, 16], | |||||
| "tensor_name": "output_0_16" | |||||
| }] | |||||
| }], | |||||
| "output_desc": [{ | |||||
| "data_type": "float16", | |||||
| "shape": [32, 4, 56, 56, 16], | |||||
| "tensor_name": "output_0_16" | |||||
| }], | |||||
| "platform": "AKG", | |||||
| "process": "aicore" | |||||
| } | |||||
| @@ -0,0 +1 @@ | |||||
| {"composite":true,"composite_graph":"12299","input_desc":[[{"data_type":"float32","shape":[],"tensor_name":"input_0"}],[{"data_type":"float32","shape":[1216,30522],"tensor_name":"input_1"}]],"op":"Fused_Reciprocal_ReduceSum_Mul___1222261331617186059","op_desc":[{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}]],"name":"Reciprocal","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_0"}]},{"attr":[{"name":"axis","value":[0]},{"name":"keep_dims","value":false}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1216,30522],"tensor_name":"input_1"}]],"name":"ReduceSum","output_desc":[{"data_type":"float32","name":"output","shape":[30522],"tensor_name":"output_0_1"}]},{"attr":[{"name":"x_shape","value":[30522]},{"name":"y_shape","value":[]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[30522],"tensor_name":"output_0_1"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"output_0_0"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[30522],"tensor_name":"output_0_2"}]}],"output_desc":[{"data_type":"float32","shape":[1],"tensor_name":"output_0_0"},{"data_type":"float32","shape":[30522],"tensor_name":"output_0_2"}],"platform":"AKG","process":"aicore"} | |||||
| @@ -25,7 +25,7 @@ else | |||||
| TVM_ROOT="${AKG_DIR}/third_party/incubator-tvm" | TVM_ROOT="${AKG_DIR}/third_party/incubator-tvm" | ||||
| export LD_LIBRARY_PATH=${AKG_BUILD_DIR}:${LD_LIBRARY_PATH} | export LD_LIBRARY_PATH=${AKG_BUILD_DIR}:${LD_LIBRARY_PATH} | ||||
| export PYTHONPATH=${TVM_ROOT}/python:${TVM_ROOT}/topi:${TVM_ROOT}/topi/python:${AKG_DIR}:${AKG_DIR}/python:${PYTHONPATH} | |||||
| export PYTHONPATH=${TVM_ROOT}/python:${TVM_ROOT}/topi:${TVM_ROOT}/topi/python:${AKG_DIR}:${AKG_DIR}/tests/common:${AKG_DIR}/python:${AKG_DIR}/tests/operators/gpu:${AKG_DIR}/tests/fuzz/tune_for_gpu:${PYTHONPATH} | |||||
| if [ $# -eq 1 ] && [ $1 = "gpu" ]; then | if [ $# -eq 1 ] && [ $1 = "gpu" ]; then | ||||
| export LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:${LD_LIBRARY_PATH} | export LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:${LD_LIBRARY_PATH} | ||||
| fi | fi | ||||
| @@ -20,23 +20,25 @@ | |||||
| /*! | /*! | ||||
| * \file cuda_module.cc | * \file cuda_module.cc | ||||
| * 2020.09.19 - Modify operator() for kc_air. | * 2020.09.19 - Modify operator() for kc_air. | ||||
| * 2020.09.22 - Separate the implementation of KC and GPU. | |||||
| * 2020.09.22 - Separate the implementation of KC and GPU. | |||||
| */ | */ | ||||
| #include "cuda_module.h" | #include "cuda_module.h" | ||||
| #include <tvm/runtime/registry.h> | |||||
| #include <cuda.h> | #include <cuda.h> | ||||
| #include <cuda_runtime.h> | #include <cuda_runtime.h> | ||||
| #include <vector> | |||||
| #include <tvm/runtime/registry.h> | |||||
| #include <array> | #include <array> | ||||
| #include <string> | |||||
| #include <mutex> | #include <mutex> | ||||
| #include <string> | |||||
| #include <unordered_map> | #include <unordered_map> | ||||
| #include "cuda_common.h" | |||||
| #include <vector> | |||||
| #include "../file_util.h" | |||||
| #include "../meta_data.h" | |||||
| #include "../pack_args.h" | #include "../pack_args.h" | ||||
| #include "../thread_storage_scope.h" | #include "../thread_storage_scope.h" | ||||
| #include "../meta_data.h" | |||||
| #include "../file_util.h" | |||||
| #include "cuda_common.h" | |||||
| namespace air { | namespace air { | ||||
| namespace runtime { | namespace runtime { | ||||
| @@ -47,8 +49,7 @@ namespace runtime { | |||||
| // The modules will be lazily loaded | // The modules will be lazily loaded | ||||
| class CUDAModuleNode : public runtime::ModuleNode { | class CUDAModuleNode : public runtime::ModuleNode { | ||||
| public: | public: | ||||
| explicit CUDAModuleNode(std::string data, | |||||
| std::string fmt, | |||||
| explicit CUDAModuleNode(std::string data, std::string fmt, | |||||
| std::unordered_map<std::string, FunctionInfo> fmap, | std::unordered_map<std::string, FunctionInfo> fmap, | ||||
| std::string cuda_source) | std::string cuda_source) | ||||
| : data_(data), fmt_(fmt), fmap_(fmap), cuda_source_(cuda_source) { | : data_(data), fmt_(fmt), fmap_(fmap), cuda_source_(cuda_source) { | ||||
| @@ -65,16 +66,11 @@ class CUDAModuleNode : public runtime::ModuleNode { | |||||
| } | } | ||||
| } | } | ||||
| const char* type_key() const final { | |||||
| return "cuda"; | |||||
| } | |||||
| const char* type_key() const final { return "cuda"; } | |||||
| PackedFunc GetFunction( | |||||
| const std::string& name, | |||||
| const ObjectPtr<Object>& sptr_to_self) final; | |||||
| PackedFunc GetFunction(const std::string& name, const ObjectPtr<Object>& sptr_to_self) final; | |||||
| void SaveToFile(const std::string& file_name, | |||||
| const std::string& format) final { | |||||
| void SaveToFile(const std::string& file_name, const std::string& format) final { | |||||
| std::string fmt = GetFileFormat(file_name, format); | std::string fmt = GetFileFormat(file_name, format); | ||||
| std::string meta_file = GetMetaFilePath(file_name); | std::string meta_file = GetMetaFilePath(file_name); | ||||
| if (fmt == "cu") { | if (fmt == "cu") { | ||||
| @@ -82,8 +78,7 @@ class CUDAModuleNode : public runtime::ModuleNode { | |||||
| SaveMetaDataToFile(meta_file, fmap_); | SaveMetaDataToFile(meta_file, fmap_); | ||||
| SaveBinaryToFile(file_name, cuda_source_); | SaveBinaryToFile(file_name, cuda_source_); | ||||
| } else { | } else { | ||||
| CHECK_EQ(fmt, fmt_) | |||||
| << "Can only save to format=" << fmt_; | |||||
| CHECK_EQ(fmt, fmt_) << "Can only save to format=" << fmt_; | |||||
| SaveMetaDataToFile(meta_file, fmap_); | SaveMetaDataToFile(meta_file, fmap_); | ||||
| SaveBinaryToFile(file_name, data_); | SaveBinaryToFile(file_name, data_); | ||||
| } | } | ||||
| @@ -106,11 +101,18 @@ class CUDAModuleNode : public runtime::ModuleNode { | |||||
| } | } | ||||
| // get a CUfunction from primary context in device_id | // get a CUfunction from primary context in device_id | ||||
| CUfunction GetFunc(int device_id, const std::string& func_name) { | |||||
| CUfunction GetFunc(int device_id, const std::string& func_name, ThreadWorkLoad wl) { | |||||
| std::lock_guard<std::mutex> lock(mutex_); | std::lock_guard<std::mutex> lock(mutex_); | ||||
| // must recheck under the lock scope | // must recheck under the lock scope | ||||
| if (module_[device_id] == nullptr) { | if (module_[device_id] == nullptr) { | ||||
| CUDA_DRIVER_CALL(cuModuleLoadData(&(module_[device_id]), data_.c_str())); | |||||
| CUjit_option options[1]; | |||||
| options[0] = CU_JIT_MAX_REGISTERS; | |||||
| void* values[1]; | |||||
| long register_nums = | |||||
| MAX_REGISTER_PER_THREAD_BLOCK / (wl.block_dim(0) * wl.block_dim(1) * wl.block_dim(2)); | |||||
| values[0] = (void*)register_nums; | |||||
| CUDA_DRIVER_CALL( | |||||
| cuModuleLoadDataEx(&(module_[device_id]), data_.c_str(), 1, options, values)); | |||||
| } | } | ||||
| CUresult result = CUDA_SUCCESS; | CUresult result = CUDA_SUCCESS; | ||||
| CUfunction func = nullptr; | CUfunction func = nullptr; | ||||
| @@ -122,11 +124,9 @@ class CUDAModuleNode : public runtime::ModuleNode { | |||||
| #endif | #endif | ||||
| } | } | ||||
| if (result != CUDA_SUCCESS) { | if (result != CUDA_SUCCESS) { | ||||
| const char *msg; | |||||
| const char* msg; | |||||
| cuGetErrorName(result, &msg); | cuGetErrorName(result, &msg); | ||||
| LOG(FATAL) | |||||
| << "CUDAError: cuModuleGetFunction " << func_name | |||||
| << " failed with error: " << msg; | |||||
| LOG(FATAL) << "CUDAError: cuModuleGetFunction " << func_name << " failed with error: " << msg; | |||||
| } | } | ||||
| #ifdef USE_KC_AIR | #ifdef USE_KC_AIR | ||||
| return func_[device_id]; | return func_[device_id]; | ||||
| @@ -135,9 +135,7 @@ class CUDAModuleNode : public runtime::ModuleNode { | |||||
| #endif | #endif | ||||
| } | } | ||||
| // get a global var from primary context in device_id | // get a global var from primary context in device_id | ||||
| CUdeviceptr GetGlobal(int device_id, | |||||
| const std::string& global_name, | |||||
| size_t expect_nbytes) { | |||||
| CUdeviceptr GetGlobal(int device_id, const std::string& global_name, size_t expect_nbytes) { | |||||
| std::lock_guard<std::mutex> lock(mutex_); | std::lock_guard<std::mutex> lock(mutex_); | ||||
| // must recheck under the lock scope | // must recheck under the lock scope | ||||
| if (module_[device_id] == nullptr) { | if (module_[device_id] == nullptr) { | ||||
| @@ -146,15 +144,12 @@ class CUDAModuleNode : public runtime::ModuleNode { | |||||
| CUdeviceptr global; | CUdeviceptr global; | ||||
| size_t nbytes; | size_t nbytes; | ||||
| CUresult result = cuModuleGetGlobal(&global, &nbytes, | |||||
| module_[device_id], global_name.c_str()); | |||||
| CUresult result = cuModuleGetGlobal(&global, &nbytes, module_[device_id], global_name.c_str()); | |||||
| CHECK_EQ(nbytes, expect_nbytes); | CHECK_EQ(nbytes, expect_nbytes); | ||||
| if (result != CUDA_SUCCESS) { | if (result != CUDA_SUCCESS) { | ||||
| const char *msg; | |||||
| const char* msg; | |||||
| cuGetErrorName(result, &msg); | cuGetErrorName(result, &msg); | ||||
| LOG(FATAL) | |||||
| << "CUDAError: cuModuleGetGlobal " << global_name | |||||
| << " failed with error: " << msg; | |||||
| LOG(FATAL) << "CUDAError: cuModuleGetGlobal " << global_name << " failed with error: " << msg; | |||||
| } | } | ||||
| return global; | return global; | ||||
| } | } | ||||
| @@ -173,17 +168,15 @@ class CUDAModuleNode : public runtime::ModuleNode { | |||||
| // internal mutex when updating the module | // internal mutex when updating the module | ||||
| std::mutex mutex_; | std::mutex mutex_; | ||||
| std::array<CUfunction, kMaxNumGPUs> func_; | std::array<CUfunction, kMaxNumGPUs> func_; | ||||
| const int MAX_REGISTER_PER_THREAD_BLOCK = 65536; | |||||
| }; | }; | ||||
| // a wrapped function class to get packed func. | // a wrapped function class to get packed func. | ||||
| class CUDAWrappedFunc { | class CUDAWrappedFunc { | ||||
| public: | public: | ||||
| // initialize the CUDA function. | // initialize the CUDA function. | ||||
| void Init(CUDAModuleNode* m, | |||||
| ObjectPtr<Object> sptr, | |||||
| const std::string& func_name, | |||||
| size_t num_void_args, | |||||
| std::vector<size_t> arg_size, | |||||
| void Init(CUDAModuleNode* m, ObjectPtr<Object> sptr, const std::string& func_name, | |||||
| size_t num_void_args, std::vector<size_t> arg_size, | |||||
| const std::vector<std::string>& thread_axis_tags) { | const std::vector<std::string>& thread_axis_tags) { | ||||
| m_ = m; | m_ = m; | ||||
| sptr_ = sptr; | sptr_ = sptr; | ||||
| @@ -194,65 +187,49 @@ class CUDAWrappedFunc { | |||||
| thread_axis_cfg_.Init(num_void_args, thread_axis_tags); | thread_axis_cfg_.Init(num_void_args, thread_axis_tags); | ||||
| } | } | ||||
| // invoke the function with void arguments | // invoke the function with void arguments | ||||
| void operator()(TVMArgs args, | |||||
| TVMRetValue* rv, | |||||
| void** void_args) const { | |||||
| void operator()(TVMArgs args, TVMRetValue* rv, void** void_args) const { | |||||
| int device_id; | int device_id; | ||||
| CUDA_CALL(cudaGetDevice(&device_id)); | CUDA_CALL(cudaGetDevice(&device_id)); | ||||
| ThreadWorkLoad wl = thread_axis_cfg_.Extract(args); | |||||
| if (fcache_[device_id] == nullptr) { | if (fcache_[device_id] == nullptr) { | ||||
| fcache_[device_id] = m_->GetFunc(device_id, func_name_); | |||||
| fcache_[device_id] = m_->GetFunc(device_id, func_name_, wl); | |||||
| } | } | ||||
| ThreadWorkLoad wl = thread_axis_cfg_.Extract(args); | |||||
| CUstream strm = static_cast<CUstream>(CUDAThreadEntry::ThreadLocal()->stream); | CUstream strm = static_cast<CUstream>(CUDAThreadEntry::ThreadLocal()->stream); | ||||
| CUresult result; | CUresult result; | ||||
| #ifdef USE_KC_AIR | #ifdef USE_KC_AIR | ||||
| size_t raw_size = num_void_args_; | size_t raw_size = num_void_args_; | ||||
| void** raw_args = new (std::nothrow) void*[raw_size]; | |||||
| void** raw_args = new (std::nothrow) void*[raw_size]; | |||||
| if (*raw_args == nullptr) { | if (*raw_args == nullptr) { | ||||
| LOG(FATAL) << "Memory alloc fail."; | |||||
| LOG(FATAL) << "Memory alloc fail."; | |||||
| } | } | ||||
| size_t args_size = 0; | size_t args_size = 0; | ||||
| for (size_t i = 0; i < raw_size; ++i) | |||||
| { | |||||
| for (size_t i = 0; i < raw_size; ++i) { | |||||
| args_size += arg_size_[i]; | args_size += arg_size_[i]; | ||||
| void** ptr = reinterpret_cast<void**>(void_args[i]); | void** ptr = reinterpret_cast<void**>(void_args[i]); | ||||
| raw_args[i] = *ptr; | raw_args[i] = *ptr; | ||||
| } | } | ||||
| result = cuLaunchKernel( | |||||
| fcache_[device_id], | |||||
| wl.grid_dim(0), | |||||
| wl.grid_dim(1), | |||||
| wl.grid_dim(2), | |||||
| wl.block_dim(0), | |||||
| wl.block_dim(1), | |||||
| wl.block_dim(2), | |||||
| (static_cast<uint32_t>(args_size)/sizeof(void *)), strm, raw_args, 0); | |||||
| result = cuLaunchKernel(fcache_[device_id], wl.grid_dim(0), wl.grid_dim(1), wl.grid_dim(2), | |||||
| wl.block_dim(0), wl.block_dim(1), wl.block_dim(2), | |||||
| (static_cast<uint32_t>(args_size) / sizeof(void*)), strm, raw_args, 0); | |||||
| if (raw_args != NULL) { | if (raw_args != NULL) { | ||||
| free(raw_args); | free(raw_args); | ||||
| raw_args = NULL; | raw_args = NULL; | ||||
| } | } | ||||
| #else | #else | ||||
| result = cuLaunchKernel( | |||||
| fcache_[device_id], | |||||
| wl.grid_dim(0), | |||||
| wl.grid_dim(1), | |||||
| wl.grid_dim(2), | |||||
| wl.block_dim(0), | |||||
| wl.block_dim(1), | |||||
| wl.block_dim(2), | |||||
| 0, strm, void_args, 0); | |||||
| result = | |||||
| cuLaunchKernel(fcache_[device_id], wl.grid_dim(0), wl.grid_dim(1), wl.grid_dim(2), | |||||
| wl.block_dim(0), wl.block_dim(1), wl.block_dim(2), 0, strm, void_args, 0); | |||||
| #endif | #endif | ||||
| if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) { | if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) { | ||||
| const char *msg; | |||||
| const char* msg; | |||||
| cuGetErrorName(result, &msg); | cuGetErrorName(result, &msg); | ||||
| std::ostringstream os; | std::ostringstream os; | ||||
| os << "CUDALaunch Error: " << msg << "\n" | os << "CUDALaunch Error: " << msg << "\n" | ||||
| << " grid=(" << wl.grid_dim(0) << "," | |||||
| << wl.grid_dim(1) << "," << wl.grid_dim(2) << "), " | |||||
| << " block=(" << wl.block_dim(0) << "," | |||||
| << wl.block_dim(1) << "," << wl.block_dim(2) << ")\n"; | |||||
| << " grid=(" << wl.grid_dim(0) << "," << wl.grid_dim(1) << "," << wl.grid_dim(2) << "), " | |||||
| << " block=(" << wl.block_dim(0) << "," << wl.block_dim(1) << "," << wl.block_dim(2) | |||||
| << ")\n"; | |||||
| std::string cuda = m_->GetSource(""); | std::string cuda = m_->GetSource(""); | ||||
| if (cuda.length() != 0) { | if (cuda.length() != 0) { | ||||
| os << "// func_name=" << func_name_ << "\n" | os << "// func_name=" << func_name_ << "\n" | ||||
| @@ -283,9 +260,7 @@ class CUDAWrappedFunc { | |||||
| class CUDAPrepGlobalBarrier { | class CUDAPrepGlobalBarrier { | ||||
| public: | public: | ||||
| CUDAPrepGlobalBarrier(CUDAModuleNode* m, | |||||
| ObjectPtr<Object> sptr) | |||||
| : m_(m), sptr_(sptr) { | |||||
| CUDAPrepGlobalBarrier(CUDAModuleNode* m, ObjectPtr<Object> sptr) : m_(m), sptr_(sptr) { | |||||
| std::fill(pcache_.begin(), pcache_.end(), 0); | std::fill(pcache_.begin(), pcache_.end(), 0); | ||||
| } | } | ||||
| @@ -293,8 +268,8 @@ class CUDAPrepGlobalBarrier { | |||||
| int device_id; | int device_id; | ||||
| CUDA_CALL(cudaGetDevice(&device_id)); | CUDA_CALL(cudaGetDevice(&device_id)); | ||||
| if (pcache_[device_id] == 0) { | if (pcache_[device_id] == 0) { | ||||
| pcache_[device_id] = m_->GetGlobal( | |||||
| device_id, runtime::symbol::tvm_global_barrier_state, sizeof(unsigned)); | |||||
| pcache_[device_id] = | |||||
| m_->GetGlobal(device_id, runtime::symbol::tvm_global_barrier_state, sizeof(unsigned)); | |||||
| } | } | ||||
| CUDA_DRIVER_CALL(cuMemsetD32(pcache_[device_id], 0, 1)); | CUDA_DRIVER_CALL(cuMemsetD32(pcache_[device_id], 0, 1)); | ||||
| } | } | ||||
| @@ -308,12 +283,10 @@ class CUDAPrepGlobalBarrier { | |||||
| mutable std::array<CUdeviceptr, kMaxNumGPUs> pcache_; | mutable std::array<CUdeviceptr, kMaxNumGPUs> pcache_; | ||||
| }; | }; | ||||
| PackedFunc CUDAModuleNode::GetFunction( | |||||
| const std::string& name, | |||||
| const ObjectPtr<Object>& sptr_to_self) { | |||||
| PackedFunc CUDAModuleNode::GetFunction(const std::string& name, | |||||
| const ObjectPtr<Object>& sptr_to_self) { | |||||
| CHECK_EQ(sptr_to_self.get(), this); | CHECK_EQ(sptr_to_self.get(), this); | ||||
| CHECK_NE(name, symbol::tvm_module_main) | |||||
| << "Device function do not have main"; | |||||
| CHECK_NE(name, symbol::tvm_module_main) << "Device function do not have main"; | |||||
| if (name == symbol::tvm_prepare_global_barrier) { | if (name == symbol::tvm_prepare_global_barrier) { | ||||
| return PackedFunc(CUDAPrepGlobalBarrier(this, sptr_to_self)); | return PackedFunc(CUDAPrepGlobalBarrier(this, sptr_to_self)); | ||||
| } | } | ||||
| @@ -322,7 +295,7 @@ PackedFunc CUDAModuleNode::GetFunction( | |||||
| const FunctionInfo& info = it->second; | const FunctionInfo& info = it->second; | ||||
| CUDAWrappedFunc f; | CUDAWrappedFunc f; | ||||
| std::vector<size_t> arg_size(info.arg_types.size()); | std::vector<size_t> arg_size(info.arg_types.size()); | ||||
| for (int i=0; i<static_cast<int>(info.arg_types.size()); ++i){ | |||||
| for (int i = 0; i < static_cast<int>(info.arg_types.size()); ++i) { | |||||
| TVMType t = info.arg_types[i]; | TVMType t = info.arg_types[i]; | ||||
| CHECK_EQ(t.lanes, 1U); | CHECK_EQ(t.lanes, 1U); | ||||
| uint32_t bits = t.bits; | uint32_t bits = t.bits; | ||||
| @@ -333,18 +306,15 @@ PackedFunc CUDAModuleNode::GetFunction( | |||||
| return PackFuncVoidAddr(f, info.arg_types); | return PackFuncVoidAddr(f, info.arg_types); | ||||
| } | } | ||||
| Module CUDAModuleCreate( | |||||
| std::string data, | |||||
| std::string fmt, | |||||
| std::unordered_map<std::string, FunctionInfo> fmap, | |||||
| std::string cuda_source) { | |||||
| Module CUDAModuleCreate(std::string data, std::string fmt, | |||||
| std::unordered_map<std::string, FunctionInfo> fmap, | |||||
| std::string cuda_source) { | |||||
| auto n = make_object<CUDAModuleNode>(data, fmt, fmap, cuda_source); | auto n = make_object<CUDAModuleNode>(data, fmt, fmap, cuda_source); | ||||
| return Module(n); | return Module(n); | ||||
| } | } | ||||
| // Load module from module. | // Load module from module. | ||||
| Module CUDAModuleLoadFile(const std::string& file_name, | |||||
| const std::string& format) { | |||||
| Module CUDAModuleLoadFile(const std::string& file_name, const std::string& format) { | |||||
| std::string data; | std::string data; | ||||
| std::unordered_map<std::string, FunctionInfo> fmap; | std::unordered_map<std::string, FunctionInfo> fmap; | ||||
| std::string fmt = GetFileFormat(file_name, format); | std::string fmt = GetFileFormat(file_name, format); | ||||
| @@ -365,14 +335,10 @@ Module CUDAModuleLoadBinary(void* strm) { | |||||
| return CUDAModuleCreate(data, fmt, fmap, std::string()); | return CUDAModuleCreate(data, fmt, fmap, std::string()); | ||||
| } | } | ||||
| TVM_REGISTER_GLOBAL("module.loadfile_cubin") | |||||
| .set_body_typed(CUDAModuleLoadFile); | |||||
| TVM_REGISTER_GLOBAL("module.loadfile_cubin").set_body_typed(CUDAModuleLoadFile); | |||||
| TVM_REGISTER_GLOBAL("module.loadfile_ptx") | |||||
| .set_body_typed(CUDAModuleLoadFile); | |||||
| TVM_REGISTER_GLOBAL("module.loadfile_ptx").set_body_typed(CUDAModuleLoadFile); | |||||
| TVM_REGISTER_GLOBAL("module.loadbinary_cuda") | |||||
| .set_body_typed(CUDAModuleLoadBinary); | |||||
| TVM_REGISTER_GLOBAL("module.loadbinary_cuda").set_body_typed(CUDAModuleLoadBinary); | |||||
| } // namespace runtime | } // namespace runtime | ||||
| } // namespace air | } // namespace air | ||||