Compare commits

...

20 Commits
master ... r1.2

Author SHA1 Message Date
  mindspore-ci-bot a5a856cd2c !63 Update releasenote 1.2.0 5 years ago
  anyrenwei 14ebec0fe0 update releasenote 1.2.0 5 years ago
  mindspore-ci-bot c1140a5fe1 !61 Update releasenote of AKG 5 years ago
  anyrenwei 0c97701b01 UPDATE releaseNote 1.2 of AKG 5 years ago
  mindspore-ci-bot a386371413 !52 enhance mapping: drop axes that cannot be mapped to block before block mapping 5 years ago
  dabaiji 10c10ecfe2 enhance mapping: drop axes that cannot be mapped to block before block mapping 5 years ago
  mindspore-ci-bot 26f709e19c !36 fix local memory promotion for large thread 5 years ago
  dabaiji 2980041c17 fix local memory promotion for large thread 5 years ago
  mindspore-ci-bot 4cc3940fb9 !35 enhance auto tiling : bind block for outer-fused-loop and correct the max block limit of y and z dim 5 years ago
  dabaiji da1475d51a enhance auto tiling : bind block for outer-fused-loop and correct the max block limit of y and z dim 5 years ago
  mindspore-ci-bot 0d8a729591 !25 bug fix for reduce binding dimension 5 years ago
  dabaiji ff38b77728 bug fix for reduce binding dimension 5 years ago
  mindspore-ci-bot 502d0a2dee !18 Can not set _GLIBCXX_USE_CXX11_ABI=0 in current profiling 5 years ago
  mindspore-ci-bot 0737733069 !17 [TUNING] add the gpu-tuning process to master 5 years ago
  looop5 1cac564f5c Can not set _GLIBCXX_USE_CXX11_ABI=0 in current profiling 5 years ago
  yiyanzhi_akane 964188ef45 [TUNING] add the gpu-tuning process to master 5 years ago
  mindspore-ci-bot e072be7c4a !14 Delete redundant test file 5 years ago
  lishanni513 421b3a1efa Delete redundant test file 5 years ago
  mindspore-ci-bot 8d56f4052d !6 sync master 5 years ago
  looop5 f013f7dbd5 sync master 5 years ago
68 changed files with 5266 additions and 536 deletions
Split View
  1. +23
    -8
      CMakeLists.txt
  2. +18
    -115
      RELEASE.md
  3. +11
    -38
      build.sh
  4. +15
    -1
      python/akg/build_module.py
  5. +33
    -3
      python/akg/utils/custom_tiling.py
  6. +5
    -6
      python/akg/utils/kernel_exec.py
  7. +0
    -177
      src/akg_mma_lib/mma_test.cu
  8. +1
    -1
      src/poly/schedule_pass_gpu/register_memory_manager.cc
  9. +24
    -0
      src/poly/tiling/custom_tiling.h
  10. +55
    -22
      src/poly/tiling/gen_tiling_space.cc
  11. +10
    -0
      src/poly/tiling/tile_space.h
  12. +19
    -5
      src/poly/tiling/tiling_analyzer.cc
  13. +20
    -11
      src/poly/tiling/tiling_analyzer.h
  14. +12
    -7
      src/poly/tiling/tiling_strategy_manager.h
  15. +279
    -45
      src/poly/tiling/tiling_strategy_manager_gpu.cc
  16. +0
    -0
      tests/fuzz/tune_for_gpu/__init__.py
  17. +17
    -0
      tests/fuzz/tune_for_gpu/autotuning/data_utils/sort_log.py
  18. +95
    -0
      tests/fuzz/tune_for_gpu/autotuning/gen_spaces_gpu.py
  19. +501
    -0
      tests/fuzz/tune_for_gpu/autotuning/job.py
  20. +407
    -0
      tests/fuzz/tune_for_gpu/autotuning/kernel_compiler.py
  21. +243
    -0
      tests/fuzz/tune_for_gpu/autotuning/runner.py
  22. +217
    -0
      tests/fuzz/tune_for_gpu/autotuning/space.py
  23. +753
    -0
      tests/fuzz/tune_for_gpu/autotuning/space_generators.py
  24. +147
    -0
      tests/fuzz/tune_for_gpu/autotuning/test_data_generators.py
  25. +84
    -0
      tests/fuzz/tune_for_gpu/autotuning/tiling_strategies_gpu.py
  26. +359
    -0
      tests/fuzz/tune_for_gpu/autotuning/tuner.py
  27. +9
    -0
      tests/fuzz/tune_for_gpu/autotuning/tuning_attrs_descs/reduce_tuning_attrs_desc.json
  28. +155
    -0
      tests/fuzz/tune_for_gpu/autotuning/tuning_utils.py
  29. +49
    -0
      tests/fuzz/tune_for_gpu/autotuning/type_definitions.py
  30. +16
    -0
      tests/fuzz/tune_for_gpu/config_gpu.sh
  31. +67
    -0
      tests/fuzz/tune_for_gpu/test_gpu.py
  32. +1
    -0
      tests/st/composite/need_adapt/Fused_Cast_Cast_Mul_TensorAdd___12292245117929986167.info
  33. +1
    -0
      tests/st/composite/need_adapt/Fused_Cast_RealDiv_Mul_TensorAdd_split_16909220147165618805.info
  34. +1
    -0
      tests/st/composite/need_adapt/Fused_ClipByNormNoDivSum_RealDiv_fusion_4181144419579591378.info
  35. +1
    -0
      tests/st/composite/need_adapt/Fused_ClipByNormNoDivSum_RealDiv_fusion_8238389606767005164.info
  36. +1
    -0
      tests/st/composite/need_adapt/Fused_LambNextMV_10229161408386697243.info
  37. +1
    -0
      tests/st/composite/need_adapt/Fused_LambNextMV_10637164683062061938.info
  38. +1
    -0
      tests/st/composite/need_adapt/Fused_LambNextMV_11007228773993183427.info
  39. +1
    -0
      tests/st/composite/need_adapt/Fused_LambNextMV_12006221044534455340.info
  40. +1
    -0
      tests/st/composite/need_adapt/Fused_LambNextMV_13257561028613500504.info
  41. +1
    -0
      tests/st/composite/need_adapt/Fused_LambNextMV_13769955845847610041.info
  42. +1
    -0
      tests/st/composite/need_adapt/Fused_LambNextMV_14492938012907533443.info
  43. +1
    -0
      tests/st/composite/need_adapt/Fused_LambNextMV_14969106078297683510.info
  44. +1
    -0
      tests/st/composite/need_adapt/Fused_LambNextMV_17143661508892073848.info
  45. +1
    -0
      tests/st/composite/need_adapt/Fused_LambNextMV_17552201251937562766.info
  46. +1
    -0
      tests/st/composite/need_adapt/Fused_LambNextMV_1932853756890330796.info
  47. +1
    -0
      tests/st/composite/need_adapt/Fused_LambNextMV_2109265793585062708.info
  48. +1
    -0
      tests/st/composite/need_adapt/Fused_LambNextMV_312286377788017483.info
  49. +1
    -0
      tests/st/composite/need_adapt/Fused_LambNextMV_6348844499000494196.info
  50. +1
    -0
      tests/st/composite/need_adapt/Fused_LambUpdateWithLR_12073466097680829202.info
  51. +1
    -0
      tests/st/composite/need_adapt/Fused_LambUpdateWithLR_13492243466190004284.info
  52. +1
    -0
      tests/st/composite/need_adapt/Fused_LambUpdateWithLR_1445905573061742177.info
  53. +1
    -0
      tests/st/composite/need_adapt/Fused_LambUpdateWithLR_15600956116817642484.info
  54. +1
    -0
      tests/st/composite/need_adapt/Fused_LambUpdateWithLR_15689878575778426853.info
  55. +1
    -0
      tests/st/composite/need_adapt/Fused_LambUpdateWithLR_16040335705910473299.info
  56. +1
    -0
      tests/st/composite/need_adapt/Fused_LambUpdateWithLR_16084070961688803476.info
  57. +1
    -0
      tests/st/composite/need_adapt/Fused_LambUpdateWithLR_3830386909471115343.info
  58. +1
    -0
      tests/st/composite/need_adapt/Fused_LambUpdateWithLR_4148121723898026533.info
  59. +1
    -0
      tests/st/composite/need_adapt/Fused_LambUpdateWithLR_5080003035626701281.info
  60. +1
    -0
      tests/st/composite/need_adapt/Fused_LambUpdateWithLR_8456945009561581117.info
  61. +1
    -0
      tests/st/composite/need_adapt/Fused_LambUpdateWithLR_8598988701258930330.info
  62. +1
    -0
      tests/st/composite/need_adapt/Fused_Mul_Mul_TensorAdd__4400644352246048056.info
  63. +707
    -0
      tests/st/composite/need_adapt/Fused_Poly_Schedule_Opt_001.info
  64. +320
    -0
      tests/st/composite/need_adapt/Fused_Poly_Schedule_Opt_002.info
  65. +500
    -0
      tests/st/composite/need_adapt/Fused_Poly_Schedule_Opt_003.info
  66. +1
    -0
      tests/st/composite/need_adapt/Fused_Reciprocal_ReduceSum_Mul___1222261331617186059.info
  67. +1
    -1
      tests/test_env.sh
  68. +62
    -96
      third_party/incubator-tvm/src/runtime/cuda/cuda_module.cc

+ 23
- 8
CMakeLists.txt View File

@@ -26,7 +26,7 @@ project(akg C CXX)
set(AKG_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
set(TVM_DIR "${AKG_SOURCE_DIR}/third_party/incubator-tvm")

if(NOT CMAKE_SYSTEM_NAME MATCHES "Windows" AND NOT USE_KC_AIR)
if(NOT CMAKE_SYSTEM_NAME MATCHES "Windows" AND NOT USE_KC_AIR AND NOT USE_CCE_PROFILING)
add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=0)
endif()

@@ -63,21 +63,36 @@ link_directories(${AKG_RPATH})
# Search AKG_EXTEND by order
set(AKG_EXTEND )
if(NOT USE_CUDA)
set(AKG_EXTEND_FILE ${AKG_SOURCE_DIR}/libakg_ext.a) # Search libakg_ext.a in directory akg
if(NOT EXISTS ${AKG_EXTEND_FILE})
set(AKG_EXTEND_FILE ${AKG_SOURCE_DIR}/build/libakg_ext.a) # Search libakg_ext.a in directory akg/build
if(NOT EXISTS ${AKG_EXTEND_FILE} AND NOT USE_KC_AIR) # Download libakg_ext.a to directory akg/build
set(AKG_EXTEND_FILE )
set(LIB_PATH1 ${AKG_SOURCE_DIR}/libakg_ext.a)
set(LIB_PATH2 ${AKG_SOURCE_DIR}/build/libakg_ext.a)

if(EXISTS ${LIB_PATH1}) # Search libakg_ext.a in akg/
set(AKG_EXTEND_FILE ${LIB_PATH1})
else()
if(EXISTS ${LIB_PATH2}) # Search libakg_ext.a in akg/build/
set(AKG_EXTEND_FILE ${LIB_PATH2})
elseif(NOT USE_KC_AIR AND NOT USE_CCE_PROFILING) # Download libakg_ext.a to akg/build/
execute_process(COMMAND bash ${AKG_SOURCE_DIR}/build.sh -a
WORKING_DIRECTORY ${AKG_SOURCE_DIR}
OUTPUT_VARIABLE OUTPUT_URL
RESULT_VARIABLE RESULT)
if(RESULT EQUAL 0)
set(AKG_EXTEND_FILE ${AKG_SOURCE_DIR}/build/libakg_ext.a)
if(RESULT EQUAL 0 AND OUTPUT_URL MATCHES "libakg_ext.a")
# Download library
string(STRIP ${OUTPUT_URL} LIB_URL)
message("-- Downloading ${LIB_URL} --> ${LIB_PATH2}")
file(DOWNLOAD ${LIB_URL} ${LIB_PATH2} STATUS DOWNLOAD_STATUS)
message("-- Download status: ${DOWNLOAD_STATUS}")
list(GET DOWNLOAD_STATUS 0 DOWNLOAD_CODE)
if(DOWNLOAD_CODE EQUAL 0)
set(AKG_EXTEND_FILE ${LIB_PATH2})
endif()
endif()
endif()
endif()

message("-- AKG_EXTEND_FILE: ${AKG_EXTEND_FILE}")
if(EXISTS ${AKG_EXTEND_FILE})
message("-- AKG_EXTEND_FILE: ${AKG_EXTEND_FILE}")
file(COPY ${AKG_EXTEND_FILE} DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/akg_extend)
execute_process(COMMAND ar -x libakg_ext.a
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/akg_extend)


+ 18
- 115
RELEASE.md View File

@@ -1,121 +1,24 @@
# Release 1.1.1
## Major Features and Improvements
* Enable Tensor core when processing GEMM operators in AKG by using poly to create the schedule needed by tensor core pass automatically;
* Implemented an akg mma lib with inlined ptx codes instead of wmma interface of cuda;
* Enable one-dimensional mapping to optimize memory promotion.

## Bugfixes
* Fix Segmentation fault in Mapping OuterBand in mindspore (!321).
* Fix bugs for memory promotion issues (!306).
* Fix bugs during gen tuning space for scalar ops (!326).

## Contributors
Thanks goes to these wonderful people:

chengyun, chendeshi, chenlei_autodiff, gengzhen, hanhuifeng, lvwenyuan, lishanni513, hujiahui8, polyhedral, shiliang, wYann, xixixian, xxxxxxw, xuhui, xiaruijie, yangsijia, yiyanzhi, zhangzhaochuang, zhengzuohe

Contributions of any kind are welcome!

# Release 1.1.0
## Major Features and Improvements
* GPU operators improvements
* Propose a new strategy to handle the reduction operators: The reduce axes are detected and rescheduled as a separate band in the schedule tree and then mapped to blocks; it then calls the akg_reduce_lib, which uses atomic operations to do reduction in the codegen pass. The experimental results show that AKG improves the execution performance relative to cudnn in the large shape cases;
* Optimize the auto-tiling algorithms which can improve the performance of reduction operators dramatically in most scenarios.
* Support AutoTuning for composite operators on GPU;
* Refactor composite framework to enable optimization in DSL level;
* Enhance CSE to support eliminating redundant vmadd on Ascend;
* Update scipy to 1.5.3.

## Bugfixes
* TensorAdd support FRACTAL_NZ and DefaultFormat(!228).
* GPU fix cast: fp32 -> uint8(!216).
* bugfix: Fix bug in opt_broadcast(!272).
* fix vadds for int32(!250).

## Contributors
## Release 1.2.0
### Major Features and Improvements
* [STABLE] Rebuild the AKG repository to provide a new way to support the Ascend backend by linking a static library containing all the Ascend passes. (Ascend)
* [STABLE] Optimize the reduction add operation in ascend backend. (Ascend)
* [STABLE] Add support for tuning elemwise&&reduction operators. (GPU)

### Bug fixes
* Fixed a problem that data prefetch cannot be enabled by attributes in DSL.
* Fixed bugs in the autotiling algorithms (tiling too small, cannot adapt to matmul+bias, etc.) on the Ascend platform.
* Fixed local memory promotion for large thread (2980!)
* Fixed reduce binding dimension issue on gpu platform (ff38!)

### Contributors
Thanks goes to these wonderful people:

chengyun, chendeshi, chenlei_autodiff, gaoxiong, gengzhen, guanxiaowei, hanhuifeng, laekov, luoyin, lvwenyuan, liuchang, lishanni513, lingyunli63, polyhedral, shiliang, wYann, wangrao124, xiaruijie, xixixian, xuhui, 要术甲杰, yiyanzhi_akane, yangshuo, yangsijia, zhangzhaochuang, zhengzuohe, zhangrenwei, zengzitao

Contributions of any kind are welcome!

# Release 1.0.0
## Major Features and Improvements
* GPU Support
* AKG now can generate gpu cuda kernel with no-schedule by using polyhedral techniques, which will create initial schedule, tile outerBands, map with blocks and threads and memory promotion automatically in the AutoPoly pass.
* Some primitive and fused operators(most are element-wise operators and reduce operators) were added, as well as corresponding testcases.
* Schedule-templates enhancement
* Optimize the TVM original schedule-templates to get better performance in some reduce cases.
* Support fusing multi-outputs into one kernel for element-wise operators.
* Davinci Enhancement
* Eliminate unnecessary broadcast by transforming the element-wise computation, such as `D[i, j] = A[i] + B[i, j] + C[i]` -> `D[i, j] = A[i] + C[i] + B[i, j]`, which satisfies commutative law and associative law.
* Enhance the pass to_three_address to match more cases for vmadd.

## Bugfixes
* fix a bug that random test case segment_max failed(!127).
* fix the permission denied error when rewriting meta_file with the same name(!147).
* fix warning for unsupported gpu built-in ops(!148).

## Contributors
Thanks goes to these wonderful people:

baita, ConnZhai, gengzhen, guanxiaowei, hanhuifeng, hujiahui8, laekov, lvwenyuan, lishanni513, lingyunli63, polyhedral, wYann, wangrao124, xixixian, xuhui, 要术甲杰, yiyanzhi_akane, yangsijia, zhengzuohe, zhangrenwei, zengzitao

yangsijia, xxxxxxw, polyhedral, zhangrenwei, yiyanzhi, xixixian, hujiahui8, zhengzuohe, lishanni, zhangzhaochuang, xuhui, liuchao, gengzhen, xiaruijie,
chenlei_autodiff, lingyunli63, wYann, lvwenyuan, peiwenfang, hanhuifeng, gaoxiong, chengyun
Contributions of any kind are welcome!

# Release 0.7.0-beta
## Major Features and Improvements
* Backend refactoring
* Rewrite instruction args calculation module in EmitInsn by implementing a new computing strategy based on axis spliting, which achieved improvement both on performance and code simplicity.

## Bugfixes
* fix dump code error when running gpu operators and set env MS_AKG_DUMP_CODE=ON(!113).

## Contributors
Thanks goes to these wonderful people:

lvwenyuan, shiliang, xuhui, wYann

Contributions of any kind are welcome!

# Release 0.6.0-beta
## Major Features and Improvements
* AutoPoly refactor to support integrating multi-backend targets easily
* Employ a pass/passmgr framework to manage all the transformations of ISL schedule tree in which transformation such as InitialSchTree and tileOuterBand would be considered as a pass to schedule tree.
* Refactor some data structure of poly so that they can de-couple with Davinci chips.
* Backend refactoring
* Enhance min alignment analysis with more accurate propagate conditions.
* Finetune pragma using alignment information before EmitInsn pass.
* Simplify EmitInsn pass by unifying the emit method for different patterns.
* Change the way of using TVM
* Delete the repository ktvm and reference TVM directly in source code(third_party/incubator-tvm).
* Enable GPU operators generation which was tailored in ktvm.

## Bugfixes
* fix wrong hoist problem in multicore loop switch hoist pass(!87).
* fix scalar rearrange bug(!84).
* fix matmul tuning and support all space tuning(!73).
* fix variable broadcast_idx redefinition error when pragma dma_copy is replaced by opt_broadcast(!45).
* fix the bug in broadcast_rewrite(!22).
* fix bugs of multi-core processing(!33).
* fix a bug that extra pipe_barrier inserted in the loop(!30).
* fix inefficient auto tiling for axis with tail and remove duplicated check(!6).

## Contributors
Thanks goes to these wonderful people:

brovensmile, chengyun, chenlei_autodiff, chengbin, ConnZhai, fuxiaoteng, gaoxiong, gengzhen, hanhuifeng, KasonChan, luoyin, lvwenyuan, peiwenfang, xuhui, yangsijia, wangzhuo325, wYann

Contributions of any kind are welcome!

# Release 0.5.0-beta
## Major Features
* Support auto-schedule and code-generation on Ascend platform.
* Provide C++ APIs of basic operators used in MindSpore.
* Support Elementwise-Elementwise, Reduce-Elementwise fusion patterns in Bert.
* Support LambUpdateWithLR, LambNextMv, BatchMatmul optimization for Bert.

## Initial Version
* Upload the initial framework
* Basic support for Ascend910 platform
* Integration with GraphKernel
* Basic support for Ascend910 platform and gpu v100
* Integration with GraphKernel fusion of MindSpore.


+ 11
- 38
build.sh View File

@@ -50,22 +50,23 @@ write_checksum()
done
}

download_lib()
acquire_lib_url()
{
uname_info=`uname -a | tr '[A-Z]' '[a-z]'`
os_info=`cat /etc/os-release | grep '^NAME=' | tr '[A-Z]' '[a-z]'`
os_name=""
arch_name=""
if [[ "${uname_info}" =~ "ubuntu" ]]; then
if [[ "${os_info}" =~ "ubuntu" ]]; then
os_name="ubuntu"
elif [[ "${uname_info}" =~ "euleros" ]]; then
elif [[ "${os_info}" =~ "euleros" ]]; then
os_name="euleros"
elif [[ "${uname_info}" =~ "centos" ]]; then
elif [[ "${os_info}" =~ "centos" ]]; then
os_name="centos"
fi

if [[ "${uname_info}" =~ "aarch64" ]]; then
arch_info=`arch | tr '[A-Z]' '[a-z]'`
arch_name=""
if [[ "${arch_info}" =~ "aarch64" ]]; then
arch_name="aarch64"
elif [[ "${uname_info}" =~ "x86_64" ]]; then
elif [[ "${arch_info}" =~ "x86" ]]; then
arch_name="x86"
fi

@@ -73,32 +74,7 @@ download_lib()
url_prefix="https://repo.mindspore.cn/public/ms-incubator/akg-binary/version"
lib_mark="202103/20210318/master_20210318142553_3e77f3a799ca87c23f1a906eaad5ec4c1f78bc95"
lib_url="${url_prefix}/${lib_mark}/lib/${os_arch}/libakg_ext.a"
hash_url="${url_prefix}/${lib_mark}/lib/${os_arch}/libakg_ext.a.sha256"

if [ ! -d ${BUILD_DIR} ]; then
mkdir -pv ${BUILD_DIR}
fi

# Download libakg_ext.a.sha256
wget -P ${BUILD_DIR} --waitretry=10 --tries=3 ${hash_url}
if [ $? -ne 0 ]; then
echo "Fail to download ${hash_url}"
return 1
fi
# Download libakg_ext.a
wget -P ${BUILD_DIR} --waitretry=10 --tries=3 ${lib_url}
if [ $? -ne 0 ]; then
echo "Fail to download ${lib_url}"
return 1
fi

# Check hash
cur_hash=`sha256sum -b ${BUILD_DIR}/libakg_ext.a | awk '{print $1}'`
orig_hash=`grep libakg_ext.a ${BUILD_DIR}/libakg_ext.a.sha256 | awk '{print $1}'`
if [ "${cur_hash}" != "${orig_hash}" ]; then
echo "Hash check failed!"
return 1
fi
echo "${lib_url}"
}

if [ ! -n "$1" ]; then
@@ -130,10 +106,7 @@ do
t)
;;
a)
download_lib
if [ $? -ne 0 ]; then
exit 1
fi
acquire_lib_url
exit 0
;;
*)


+ 15
- 1
python/akg/build_module.py View File

@@ -50,7 +50,17 @@ def dump_tiling_info(level):
logging.info(info, tuning_spaces["index"][i][0], tuning_spaces["index"][i][1],
tuning_spaces["c1_range"][i][0], tuning_spaces["c1_range"][i][1],
tuning_spaces["c1_mod"][i][0], tuning_spaces["c0_range"][i][0],
tuning_spaces["c0_range"][i][1], tuning_spaces["c0_mod"][i][0])
tuning_spaces["c0_range"][i][1], tuning_spaces["c0_mod"][i][0],
)
idx_to_str = {0: "x", 1: "y", 2: "z"}
for i in range(len(tuning_spaces["thread_range"])):
info = "[thread.%s] range [%d, %d](jump by %d), "
logging.info(info, idx_to_str[i], tuning_spaces["thread_range"][i][0], tuning_spaces["thread_range"][i][1],
tuning_spaces['thread_mod'][i][0], )
for i in range(len(tuning_spaces["block_range"])):
info = "[block.%s] range [%d, %d](jump by %d)"
logging.info(info, idx_to_str[i], tuning_spaces["block_range"][i][0],
tuning_spaces["block_range"][i][1], tuning_spaces['block_mod'][i][0],)
logging.info("===============================================")
elif isinstance(indice, int) and indice == EMPTY_CODE:
logging.info("Empty tiling space.")
@@ -108,6 +118,10 @@ def lower(sch, args, shape_params=None, name="default_function", binds=None, att
tuning_spaces["c0_range"] = ret.c0_tile_range_table.asnumpy().tolist()
tuning_spaces["c1_mod"] = ret.c1_tile_mod_table.asnumpy().tolist()
tuning_spaces["c0_mod"] = ret.c0_tile_mod_table.asnumpy().tolist()
tuning_spaces["thread_range"] = ret.gpu_thread_range_table.asnumpy().tolist()
tuning_spaces["block_range"] = ret.gpu_block_range_table.asnumpy().tolist()
tuning_spaces["thread_mod"] = ret.gpu_thread_mod_table.asnumpy().tolist()
tuning_spaces["block_mod"] = ret.gpu_block_mod_table.asnumpy().tolist()
if level >= help_tiling_level["Candidates"]:
tuning_spaces["tuning_space"] = ret.tiling_candidate.asnumpy().tolist()
if not tuning:


+ 33
- 3
python/akg/utils/custom_tiling.py View File

@@ -70,15 +70,33 @@ class TileConstraint(Enum):
SET_EXPANSION = "SET_EXPANSION"
SET_MEM_RATIO = "SET_MEM_RATIO"
SET_AXIS_INFO = "SET_AXIS_INFO"
THREAD_MIN = "THREAD_MIN"
THREAD_MAX = "THREAD_MAX"
THREAD_MOD = "THREAD_MOD"
BLOCK_MIN = "BLOCK_MIN"
BLOCK_MAX = "BLOCK_MAX"
BLOCK_MOD = "BLOCK_MOD"


@check_input_type((double, float, int), TileConstraint, TileLevel)
@check_input_type((double, float, int, list), TileConstraint, TileLevel)
def modify_common_constraints(value, constraint, level=TileLevel.C1):
"""api for dsl to modify some default constraint used in auto tiling."""
if constraint not in TileConstraint:
raise ValueError("Tile constraints must be chosen from {0}".format(TileConstraint))
if constraint == TileConstraint.SET_MEM_RATIO:
return create_custom_tiling_node(TileMode.COMMON, tile_level=level, mem_ratio=double(value))
if constraint == TileConstraint.THREAD_MIN:
return create_custom_tiling_node(TileMode.COMMON, thread_min=value)
if constraint == TileConstraint.THREAD_MAX:
return create_custom_tiling_node(TileMode.COMMON, thread_max=value)
if constraint == TileConstraint.THREAD_MOD:
return create_custom_tiling_node(TileMode.COMMON, thread_mod=value)
if constraint == TileConstraint.BLOCK_MIN:
return create_custom_tiling_node(TileMode.COMMON, block_min=value)
if constraint == TileConstraint.BLOCK_MAX:
return create_custom_tiling_node(TileMode.COMMON, block_max=value)
if constraint == TileConstraint.BLOCK_MOD:
return create_custom_tiling_node(TileMode.COMMON, block_mod=value)
raise TypeError("Constraint {} is not supported in this api, please use other api"
.format(constraint.value))

@@ -233,7 +251,13 @@ def create_custom_tiling_node(tile_mode,
axis_info=DEFAULT_STRING,
priority=DEFAULT_VALUE,
expansion=DEFAULT_VALUE,
mem_ratio=double(DEFAULT_VALUE)):
mem_ratio=double(DEFAULT_VALUE),
thread_min=[],
thread_max=[],
thread_mod=[],
block_min=[],
block_max=[],
block_mod=[]):
"""default method to create custom tiling node, all values are default except tile mode."""

tile_min = to_tvm_type(tile_min, "tile_min")
@@ -257,7 +281,13 @@ def create_custom_tiling_node(tile_mode,
axis_info=akg.tvm.expr.StringImm(axis_info),
priority=priority,
expansion=expansion,
mem_ratio=mem_ratio)
mem_ratio=mem_ratio,
thread_min=thread_min,
thread_max=thread_max,
thread_mod=thread_mod,
block_min=block_min,
block_max=block_max,
block_mod=block_mod)


def template_nc1hwc0(tensor_name, level):


+ 5
- 6
python/akg/utils/kernel_exec.py View File

@@ -35,6 +35,7 @@ import numpy as np

import akg
from akg.build_module import help_tiling_level
from akg import backend as cce
import akg.tvm
from akg.tvm import autotvm
from akg.tvm import rpc
@@ -88,7 +89,6 @@ def debug_mode(debug_flag):
pass_list.append((0, ir_pass.inject_dma_intrin))
return pass_list


def func_time_required(func_name):
"""Checking the Time Required for Function Running."""
def wrapper(*args, **kwargs):
@@ -467,7 +467,7 @@ def mod_launch_air(mod, args, outputs):
return None

@func_time_required
def mod_launch(mod, args, outputs=(-1,), tuning=False, device_id=0, expect=None):
def mod_launch(mod, args, outputs=(-1,), tuning=False, device_id=0, expect=None, repeat_time=400):
"""
unified run CCE kernel api.

@@ -492,7 +492,7 @@ def mod_launch(mod, args, outputs=(-1,), tuning=False, device_id=0, expect=None)
if not tuning:
return out_list[0] if len(out_list) == 1 else tuple(out_list)
else:
cycles = get_gpu_cycles(mod, *mod_args, device_id=device_id, save_log=True)
cycles = get_gpu_cycles(mod, *mod_args, device_id=device_id, save_log=True, repeat_time=repeat_time)
return out_list[0] if len(out_list) == 1 else tuple(out_list), {'run_time': cycles}

stat_info = {}
@@ -996,7 +996,6 @@ def op_build(op_func, input_shapes, input_types, op_attrs=None, kernel_name="",
level = attrs.get("help_tiling") if attrs and "help_tiling" in attrs else None
if tuning or (level is not None and level > help_tiling_level['None']):
return gen_spaces_dim_key(op_func, args, s, op_var, kernel_name, attrs, polyhedral, tuning, target)

mode = get_runtime_mode()
if mode == "cpu":
mod = akg.tvm.build(s, op_var, "llvm")
@@ -1069,12 +1068,12 @@ def get_device_id():
logging.error(e)
return 0

def get_gpu_cycles(mod, *mod_args, device_id=0, save_log=False):
def get_gpu_cycles(mod, *mod_args, device_id=0, save_log=False, repeat_time=400):
"get gpu profiling cycles."
func = tvm.get_global_func('GPUProfilerInit')
func("")
from akg.utils.result_analysis import gpu_profiling
gpu_profiling(mod, *mod_args, repeat_time=400, device_id=device_id)
gpu_profiling(mod, *mod_args, repeat_time=repeat_time, device_id=device_id)
func = tvm.get_global_func('GPUProfilerStop')
a = func()
return int(a)


+ 0
- 177
src/akg_mma_lib/mma_test.cu View File

@@ -1,177 +0,0 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <iostream>
#include <chrono>
#include <m16n16k4.hpp>

// Usage: nvcc -std=c++11 -lineinfo -lcublas -arch=sm_70 -DCUDA_ARCH_SM=70 -I./ mma_test.cu -o mma_test

// const int WARP_SIZE = 32;
const int M = 16;
const int N = 16;
const int K = 4;
const int MMA_M = 16;
const int MMA_N = 16;
const int MMA_K = 4;

template<typename CType=float, typename ABType=half>
__global__ void wmma_test_kernel(CType *const c_ptr, const ABType *const a_ptr, const ABType *const b_ptr) {
akg::wmma::fragment<nvcuda::wmma::matrix_a, MMA_M, MMA_N, MMA_K, ABType, nvcuda::wmma::row_major> frag_a_row;
akg::wmma::fragment<nvcuda::wmma::matrix_a, MMA_M, MMA_N, MMA_K, ABType, nvcuda::wmma::col_major> frag_a_col;
akg::wmma::fragment<nvcuda::wmma::matrix_b, MMA_M, MMA_N, MMA_K, ABType, nvcuda::wmma::row_major> frag_b_row;
akg::wmma::fragment<nvcuda::wmma::matrix_b, MMA_M, MMA_N, MMA_K, ABType, nvcuda::wmma::col_major> frag_b_col;
akg::wmma::fragment<nvcuda::wmma::accumulator, MMA_M, MMA_N, MMA_K, CType> frag_c;

akg::wmma::load_matrix_sync(frag_a_row, a_ptr, K);
akg::wmma::load_matrix_sync(frag_a_col, a_ptr, M);
akg::wmma::load_matrix_sync(frag_b_row, b_ptr, N);
akg::wmma::load_matrix_sync(frag_b_col, b_ptr, K);

akg::wmma::load_matrix_sync<CType>(frag_c, c_ptr, N, nvcuda::wmma::mem_row_major);
print_fragment(frag_c, "frag_c");

akg::wmma::fill_fragment<CType>(frag_c, 0.0f);

akg::wmma::mma_sync(frag_c, frag_a_col, frag_b_col, frag_c);

print_fragment(frag_a_row, "frag_a_row");
print_fragment(frag_a_col, "frag_a_col");
print_fragment(frag_b_row, "frag_b_row");
print_fragment(frag_b_col, "frag_b_col");

akg::wmma::store_matrix_sync(frag_c, c_ptr, N, nvcuda::wmma::mem_row_major);
}

#define FP16_EXPONENT_BITS 0x1F
#define FP16_EXPONENT_SHIFT 10
#define FP16_EXPONENT_BIAS 15
#define FP16_MANTISSA_BITS 0x3ff
#define FP16_MANTISSA_SHIFT (23 - FP16_EXPONENT_SHIFT)
#define FP16_MAX_EXPONENT (FP16_EXPONENT_BITS << FP16_EXPONENT_SHIFT)

inline half FP32toFP16(float val) {
unsigned int f32 = (*(unsigned int *)&val);
unsigned short f16 = 0;

/* Decode IEEE 754 little-endian 32-bit floating-point value */
int sign = (f32 >> 16) & 0x8000;

/* Map exponent to the range [-127,128] */
int exponent = ((f32 >> 23) & 0xff) - 127;
int mantissa = f32 & 0x007fffff;

if (exponent == 128) { /* Infinity or NaN */
f16 = sign | FP16_MAX_EXPONENT;
if (mantissa) f16 |= (mantissa & FP16_MANTISSA_BITS);
} else if (exponent > 15) { /* Overflow - flush to Infinity */
f16 = sign | FP16_MAX_EXPONENT;
} else if (exponent > -15) { /* Representable value */
exponent += FP16_EXPONENT_BIAS;
mantissa >>= FP16_MANTISSA_SHIFT;
f16 = sign | exponent << FP16_EXPONENT_SHIFT | mantissa;

} else {
f16 = sign;
}
return *(half *)&f16;
}

template <class T>
void oneInit(T *data, int size) {
for (int i = 0; i < size; ++i) {
data[i] = (T)FP32toFP16(1.f);
}
}

template <class T>
void randomInit(T *data, int size) {
for (int i = 0; i < size; ++i) {
data[i] = (T)FP32toFP16(i);
}
}

using stype = half;
using dtype = float;

int main() {
half *da;
half *db;
float *dc;
half *dc_fp16;

unsigned int size_A = M * K;
unsigned int size_B = K * N;
unsigned int size_C = M * N;
unsigned int size_C_fp16 = M * N;
unsigned int mem_size_A = sizeof(stype) * size_A;
unsigned int mem_size_B = sizeof(stype) * size_B;
unsigned int mem_size_C = sizeof(dtype) * size_C;
unsigned int mem_size_C_fp16 = sizeof(stype) * size_C_fp16;
printf("M = %d, N = %d, K = %d\n", M, N, K);
printf("size_A = %d, size_B = %d, size_C = %d, size_C_fp16 = %d\n", mem_size_A, mem_size_B, mem_size_C, mem_size_C_fp16);
stype *h_A = (stype *)malloc(mem_size_A);
stype *h_B = (stype *)malloc(mem_size_B);
dtype *h_C = (dtype *)malloc(mem_size_C);
stype *h_C_fp16 = (stype *)malloc(mem_size_C_fp16);
// dtype *reference = (dtype *)malloc(mem_size_C);
// stype *reference = (stype *)malloc(mem_size_C_fp16);

randomInit<stype>(h_A, size_A);
randomInit<stype>(h_B, size_B);
randomInit<dtype>(h_C, size_C);
randomInit<stype>(h_C_fp16, size_C_fp16);

cudaMalloc(&da, mem_size_A);
cudaMalloc(&db, mem_size_B);
cudaMalloc(&dc, mem_size_C);
cudaMalloc(&dc_fp16, mem_size_C_fp16);

// copy host memory to device
cudaMemcpy(da, h_A, mem_size_A, cudaMemcpyHostToDevice);
cudaMemcpy(db, h_B, mem_size_B, cudaMemcpyHostToDevice);
cudaMemcpy(dc, h_C, mem_size_C, cudaMemcpyHostToDevice);
cudaMemcpy(dc_fp16, h_C_fp16, mem_size_C_fp16, cudaMemcpyHostToDevice);

dim3 threads, grid;
threads = dim3(32);
grid = dim3(1, 1);

// CType == fp32
wmma_test_kernel<float, half><<<grid, threads>>>(dc, da, db);
cudaDeviceSynchronize();
auto error_code = cudaGetLastError();
printf("CType == fp32, last error: %d\n", error_code);
cudaMemcpy(h_C, dc, mem_size_C, cudaMemcpyDeviceToHost);

// CType == fp16
wmma_test_kernel<half, half><<<grid, threads>>>(dc_fp16, da, db);
cudaDeviceSynchronize();
error_code = cudaGetLastError();
printf("CType == fp16, last error: %d\n", error_code);
cudaMemcpy(h_C_fp16, dc_fp16, mem_size_C_fp16, cudaMemcpyDeviceToHost);

free(h_A);
free(h_B);
free(h_C);
free(h_C_fp16);
// free(reference);

cudaFree(da);
cudaFree(db);
cudaFree(dc);
cudaFree(dc_fp16);
}

+ 1
- 1
src/poly/schedule_pass_gpu/register_memory_manager.cc View File

@@ -356,7 +356,7 @@ void RegisterMemoryManager::IsOutofMemory(std::vector<BufferDefInfo> promoted_in
auto tensor_size = std::accumulate(box_sizes.begin(), box_sizes.end(), 1, std::multiplies<size_t>());
auto data_bytes = scop_info_.user_config_.GetDataType(promoted_info.tensor_id.get_name());
total_alloc_size += tensor_size * std::max<int>(1, data_bytes / BYTES_PER_REGISTER);
if (total_alloc_size * alloc_threads > MAX_REGISTER_PER_THREAD_BLOCK * REGISTER_ALLOC_RATIO) {
if (total_alloc_size * alloc_threads >= MAX_REGISTER_PER_THREAD_BLOCK * REGISTER_ALLOC_RATIO) {
memory_exceeding_ = true;
break;
}


+ 24
- 0
src/poly/tiling/custom_tiling.h View File

@@ -80,6 +80,24 @@ class CustomTilingNode : public Node {
* default is 0.5 which is reserved for double buffer*/
double mem_ratio;

/*! \brief minimal thread binding factor on gpu, greater than 0*/
Array<Expr> thread_min;

/*! \brief maximal thread binding factor on gpu*/
Array<Expr> thread_max;

/*! \brief constraint thread binding factor % thread_mod == 0*/
Array<Expr> thread_mod;

/*! \brief minimal block binding factor on gpu, greater than 0*/
Array<Expr> block_min;

/*! \brief maximal block binding factor on gpu*/
Array<Expr> block_max;

/*! \brief constraint block binding factor % block_mod == 0*/
Array<Expr> block_mod;

void VisitAttrs(AttrVisitor *v) {
v->Visit("tile_level", &tile_level);
v->Visit("tile_mode", &tile_mode);
@@ -97,6 +115,12 @@ class CustomTilingNode : public Node {
v->Visit("priority", &priority);
v->Visit("expansion", &expansion);
v->Visit("mem_ratio", &mem_ratio);
v->Visit("thread_min", &thread_min);
v->Visit("thread_max", &thread_max);
v->Visit("thread_mod", &thread_mod);
v->Visit("block_min", &block_min);
v->Visit("block_max", &block_max);
v->Visit("block_mod", &block_mod);
}

static constexpr const char *_type_key = "CustomTilingNode";


+ 55
- 22
src/poly/tiling/gen_tiling_space.cc View File

@@ -36,6 +36,15 @@ class TileSpaceCollector {
space_->c1_tile_mod_table = init_array;
space_->c0_tile_mod_table = init_array;
space_->tiling_candidate = init_array;
space_->gpu_thread_range_table = init_array;
space_->gpu_block_range_table = init_array;
space_->gpu_thread_mod_table = init_array;
space_->gpu_block_mod_table = init_array;
if (analyzer_.scop_info_.user_config_.GetTarget() == TARGET_CUDA) {
cared_info_ = {"index", "C1_range", "C0_range", "C1_mod", "C0_mod", "gpu_thread_range", "gpu_block_range", "gpu_thread_mod", "gpu_block_mod"};
} else {
cared_info_ = {"index", "C1_range", "C0_range", "C1_mod", "C0_mod"};
}
}
~TileSpaceCollector() = default;

@@ -122,38 +131,61 @@ class TileSpaceCollector {
// step 2. collect cared info from each axis
for (const auto &con : cared_info_) {
int length = con.find("mod") != std::string::npos ? 1 : 2;
auto array = air::runtime::NDArray::Empty({static_cast<int64_t>(tile_size), length}, type, ctx);
auto size = static_cast<int64_t>(tile_size);
if (con.find("gpu") != std::string::npos) {
size = std::max<int64_t>(3, size);
}
auto array = air::runtime::NDArray::Empty({size, length}, type, ctx);
auto spaceDlPack = array.ToDLPack();
auto ptr = reinterpret_cast<int *>(spaceDlPack->dl_tensor.data);
for (size_t b_idx = 0; b_idx < all_axes.size(); ++b_idx) {
for (size_t a_idx = 0; a_idx < all_axes[b_idx].size(); ++a_idx) {
if (con == "index") {
*ptr++ = b_idx;
*ptr++ = a_idx;
if (con.find("gpu") != std::string::npos) {
size_t s = con.find("thread") != std::string::npos ? 0 : 3;
size_t e = con.find("thread") != std::string::npos ? 3 : 6;
for (size_t i = s; i < e; ++i) {
if (length == 1) {
*ptr++ = analyzer_.binding_spaces_[i].map_mod_;
} else {
if (con == "C1_range") {
TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE1);
*ptr++ = const_cons.tile_min_.as<IntImm>()->value;
*ptr++ = const_cons.tile_extent_.as<IntImm>()->value;
} else if (con == "C0_range") {
TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE0);
*ptr++ = const_cons.tile_min_.as<IntImm>()->value;
*ptr++ = const_cons.tile_extent_.as<IntImm>()->value;
} else if (con == "C1_mod") {
TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE1);
*ptr++ = const_cons.tile_mod_.as<IntImm>()->value;
} else if (con == "C0_mod") {
TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE0);
*ptr++ = const_cons.tile_mod_.as<IntImm>()->value;
*ptr++ = analyzer_.binding_spaces_[i].map_min_;
*ptr++ = analyzer_.binding_spaces_[i].map_extent_;
}
}
} else {
for (size_t b_idx = 0; b_idx < all_axes.size(); ++b_idx) {
for (size_t a_idx = 0; a_idx < all_axes[b_idx].size(); ++a_idx) {
if (con == "index") {
*ptr++ = b_idx;
*ptr++ = a_idx;
} else {
if (con == "C1_range") {
TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE1);
*ptr++ = const_cons.tile_min_.as<IntImm>()->value;
*ptr++ = const_cons.tile_extent_.as<IntImm>()->value;
} else if (con == "C0_range") {
TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE0);
*ptr++ = const_cons.tile_min_.as<IntImm>()->value;
*ptr++ = const_cons.tile_extent_.as<IntImm>()->value;
} else if (con == "C1_mod") {
TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE1);
*ptr++ = const_cons.tile_mod_.as<IntImm>()->value;
} else if (con == "C0_mod") {
TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE0);
*ptr++ = const_cons.tile_mod_.as<IntImm>()->value;
}
}
}
}
}

if (con == "index") space_->index_table = array;
if (con == "C1_range") space_->c1_tile_range_table = array;
if (con == "C0_range") space_->c0_tile_range_table = array;
if (con == "C1_mod") space_->c1_tile_mod_table = array;
if (con == "C0_mod") space_->c0_tile_mod_table = array;
if (con == "gpu_thread_range") space_->gpu_thread_range_table = array;
if (con == "gpu_block_range") space_->gpu_block_range_table = array;
if (con == "gpu_thread_mod") space_->gpu_thread_mod_table = array;
if (con == "gpu_block_mod") space_->gpu_block_mod_table = array;

delete spaceDlPack;
}
}
@@ -196,7 +228,8 @@ class TileSpaceCollector {
bool min_tile_ok = false;
for (int64_t tile = tile_min->value; tile <= tile_extent->value; ++tile) {
bool break_constraint =
(tile != tile_min->value) && (tile != tile_extent->value) && (tile % tile_mod->value != 0);
((tile != tile_min->value) && (tile != tile_extent->value) && (tile % tile_mod->value != 0)) ||
(axis->forbid_iso && tile_extent->value % tile != 0);
if (analyzer_.scop_info_.user_config_.GetPruneTuningSpace() && break_constraint) {
continue;
}
@@ -365,7 +398,7 @@ class TileSpaceCollector {
DLContext ctx = {kDLCPU, 0};
std::vector<TileAxis *> tile_axes_;
std::vector<bool> is_shared_;
std::unordered_set<std::string> cared_info_ = {"index", "C1_range", "C0_range", "C1_mod", "C0_mod"};
std::unordered_set<std::string> cared_info_;

struct Result {
std::vector<int> tile;


+ 10
- 0
src/poly/tiling/tile_space.h View File

@@ -28,6 +28,11 @@ class TileSpaceNode : public Node {
air::runtime::NDArray c1_tile_mod_table;
air::runtime::NDArray c0_tile_mod_table;
air::runtime::NDArray tiling_candidate;
air::runtime::NDArray gpu_thread_range_table;
air::runtime::NDArray gpu_block_range_table;
air::runtime::NDArray gpu_thread_mod_table;
air::runtime::NDArray gpu_block_mod_table;


void VisitAttrs(AttrVisitor *v) {
v->Visit("index_table", &index_table);
@@ -36,6 +41,11 @@ class TileSpaceNode : public Node {
v->Visit("c1_tile_mod_table", &c1_tile_mod_table);
v->Visit("c0_tile_mod_table", &c0_tile_mod_table);
v->Visit("tiling_candidate", &tiling_candidate);
v->Visit("gpu_thread_range_table", &gpu_thread_range_table);
v->Visit("gpu_block_range_table", &gpu_block_range_table);
v->Visit("gpu_thread_mod_table", &gpu_thread_mod_table);
v->Visit("gpu_block_mod_table", &gpu_block_mod_table);

}
static constexpr const char *_type_key = "TileSpace";
TVM_DECLARE_NODE_TYPE_INFO(TileSpaceNode, Node);


+ 19
- 5
src/poly/tiling/tiling_analyzer.cc View File

@@ -1351,19 +1351,34 @@ void TilingAnalyzer::AddPostTilingConstraints() {

if (scop_info_.user_config_.GetTarget() == TARGET_CUDA) {
ReduceStrategy reduce_strategy(this);
actived_strategies.push_back(&reduce_strategy);
ModStrategy mod_strategy(this);
actived_strategies.push_back(&mod_strategy);

GemmStrategy gemm_strategy(this);
GpuDmaAnalysisStrategy dma_analysis_strategy(this);
CustomTilingStrategy custom_strategy(this);
GpuStrategy gpu_strategy(this);
if (scop_info_.analysis_result_.GetIsGpuDmaAnalysed()) {
actived_strategies.push_back(&dma_analysis_strategy);
} else {
if (scop_info_.user_config_.GetIsTuning()) {
actived_strategies.push_back(&custom_strategy);
} else {
actived_strategies.push_back(&reduce_strategy);
actived_strategies.push_back(&mod_strategy);
actived_strategies.push_back(&gemm_strategy);
}
actived_strategies.push_back(&gpu_strategy);
}
strategy_manager->SetStrategies(actived_strategies);
strategy_manager->ExecuteGpu();
if (scop_info_.user_config_.GetIsTuning()) {
binding_spaces_.clear();
for (auto i : gpu_strategy.thread_binding_spaces_) {
UpdateBindingSpace(i);
}
for (auto i : gpu_strategy.block_binding_spaces_) {
UpdateBindingSpace(i);
}
}
return;
}
}
@@ -1376,7 +1391,6 @@ void TilingAnalyzer::AddTilingConstraints() {
if (scop_info_.user_config_.GetTarget() == TARGET_CUDA) {
CastStrategy cast_strategy(this);
actived_strategies.push_back(&cast_strategy);

strategy_manager->SetStrategies(actived_strategies);
strategy_manager->ExecuteGpu();
return;
@@ -1429,7 +1443,7 @@ void TilingAnalyzer::AddTilingConstraints() {

bool TilingAnalyzer::Prepare() {
logger_ = std::unique_ptr<TileLogger>(new (std::nothrow) TileLogger(
scop_info_.AddDumpDir("tiling.log"), !scop_info_.user_config_.GetDumpPolyDir().empty()));
scop_info_.AddDumpDir("tiling.log"), !scop_info_.user_config_.GetDumpPolyDir().empty()));
CHECK(logger_) << "memory alloc fail.";
// Stage 1: Analyze schedule tree.
ScheduleTreeAnalyzer sch_ana(this, this->sch_);


+ 20
- 11
src/poly/tiling/tiling_analyzer.h View File

@@ -64,7 +64,7 @@ inline int64_t GetAlignBytes(const int64_t dtype) {
return (ALIGN_BYTES + dtype - 1) / dtype;
}

inline int64_t GetMaxAlignBytes(std::unordered_map<std::string, std::vector<int>> dtypes) {
inline int64_t GetMinBytes(std::unordered_map<std::string, std::vector<int>> dtypes) {
int64_t min_byte = -1;
for (const auto &it : dtypes) {
if (it.second.empty()) {
@@ -75,7 +75,11 @@ inline int64_t GetMaxAlignBytes(std::unordered_map<std::string, std::vector<int>
min_byte = min_elem;
}
}
return GetAlignBytes(min_byte);
return min_byte;
}

inline int64_t GetMaxAlignBytes(std::unordered_map<std::string, std::vector<int>> dtypes) {
return GetAlignBytes(GetMinBytes(dtypes));
}

inline Expr CastToExpr(const std::string &value) {
@@ -134,6 +138,12 @@ constexpr auto AT_DYNAMIC_BOUND = "DYNAMIC_BOUND";
constexpr auto AT_MOD = "MOD";
constexpr auto AT_CAST = "CAST";
constexpr auto AT_MEM_RATIO = "MEM_RATIO";
constexpr auto AT_THREAD_MIN = "THREAD_MIN";
constexpr auto AT_THREAD_MAX = "THREAD_MAX";
constexpr auto AT_THREAD_MOD = "THREAD_MOD";
constexpr auto AT_BLOCK_MIN = "BLOCK_MIN";
constexpr auto AT_BLOCK_MAX = "BLOCK_MAX";
constexpr auto AT_BLOCK_MOD = "BLOCK_MOD";

class TilingAnalyzer;

@@ -233,12 +243,12 @@ class TilingAnalyzer {
sch_(sch),
scop_info_(scop_info),
is_retry_(!global_attrs.GetStringAttr(kErrorInfo, "").empty()) {
if (scop_info.mmu_info_.IsGemm()) {
op_type_ = GEMM_OP;
} else if (scop_info.mmu_info_.IsConv()) {
op_type_ = CONV_OP;
} else {
op_type_ = VECTOR_OP;
if (scop_info.mmu_info_.IsGemm()) {
op_type_ = GEMM_OP;
} else if (scop_info.mmu_info_.IsConv()) {
op_type_ = CONV_OP;
} else {
op_type_ = VECTOR_OP;
}
}

@@ -292,7 +302,7 @@ class TilingAnalyzer {
CHECK(logger_);
return *(logger_.get());
}
void UpdateBindingSpace(TileAxis::MappingConstraint constraint) { binding_spaces_.emplace_back(constraint); }
Stmt body_;
Binds &binds_;
isl::schedule sch_;
@@ -306,9 +316,8 @@ class TilingAnalyzer {

std::unordered_map<TilingAnalyzer::BufferEntry *, std::pair<int, int>> buffer_usage_timetable_;
std::unordered_map<std::string, std::shared_ptr<BufferEntry>> buf_info_;

bool is_retry_{false};
std::vector<TileAxis::MappingConstraint> binding_spaces_; // [thread.x[min, max, mod], thread.y, thread.z, block.x, block.y, block.z]
private:
void AddTilingConstraints();
void AddPostTilingConstraints();


+ 12
- 7
src/poly/tiling/tiling_strategy_manager.h View File

@@ -58,8 +58,10 @@ class TilingStrategy {

// gpu configs
int64_t warp_sizes_ = 32;
int64_t max_num_blocks_ = 256 * 256;
int64_t max_num_threads_ = 1024;
int64_t max_x_dim_block_ = pow(2, 31) - 1;
int64_t max_y_z_dim_block_ = 65535;
int64_t max_x_y_dim_thread_ = 1024;
int64_t max_z_dim_thread_ = 64;
size_t max_dim_ = 3;
int64_t max_elem_per_thread_ = 1024;
};
@@ -284,8 +286,6 @@ class GemmStrategy : public TilingStrategy {
~GemmStrategy() {}
void AddNpuConstraint();
void AddGpuConstraint();

std::string interested_attr_key = AT_GEMM;
};

class GpuStrategy : public TilingStrategy {
@@ -306,6 +306,8 @@ class GpuStrategy : public TilingStrategy {
};
void AddNpuConstraint();
void AddGpuConstraint();
std::vector<TileAxis::MappingConstraint> thread_binding_spaces_; // [thread.x, thread.y, thread.z]
std::vector<TileAxis::MappingConstraint> block_binding_spaces_; // [block.x, block.y, block.z]

private:
void DetermineTemplate();
@@ -326,6 +328,8 @@ class GpuStrategy : public TilingStrategy {
// Step 1. Collect axes and sort them from inner to outer
void BuildAxesQueue();

void ApplyCustomConstraint();

/*
* Step 2. Tile inner axes first and map them to threads, and then tile outer axis and map the rest of them to blocks.
* e.g.
@@ -342,6 +346,8 @@ class GpuStrategy : public TilingStrategy {
// Step 3. Transform list of integer into string mapping config.
void SetMappingConfig();


int GetLocalAllocBufCount();
Template template_{Template::DEFAULT};
bool is_reduce_op_[TEMPLATE_BULK] = {false, false, true, true, true, false};

@@ -350,13 +356,12 @@ class GpuStrategy : public TilingStrategy {
std::vector<int64_t> thread_limit_;
std::vector<int64_t> block_cfg_;
std::vector<int64_t> thread_cfg_;
int64_t max_x_y_dim_thread_ = 1024;
int64_t max_z_dim_thread_ = 64;
int block_count_{0}; // number of mapped blocks
int64_t elem_per_thread_[3]{SpItemPerThread::AUTO};
int64_t min_elem_for_io_bound_ = 2;
size_t depth_{0};
bool need_reverse_{false};
bool reverse_binding_{false};
int64_t fused_size_{1};
std::unordered_map<int, std::string> template_map_ = {{0, "DEFAULT"}, {1, "PURE_ELEM"}, {2, "BROADCAST_OP"},
{3, "REDUCTION"}, {4, "ALL_REDUCE"}, {5, "BITWISE_REDUCTION"},
@@ -378,7 +383,7 @@ class MulticoreStrategy {

class TilingPriorityScorer {
public:
TilingPriorityScorer(TilingAnalyzer &analyzer) : analyzer_(analyzer), logger_(analyzer.GetTileLogger()) {}
TilingPriorityScorer(TilingAnalyzer &analyzer) : analyzer_(analyzer), logger_(analyzer.GetTileLogger()) {}
~TilingPriorityScorer() {}

/*


+ 279
- 45
src/poly/tiling/tiling_strategy_manager_gpu.cc View File

@@ -18,7 +18,6 @@
#include <numeric>

#include "tiling_analyzer.h"

namespace akg {
namespace ir {
namespace poly {
@@ -174,13 +173,13 @@ void ReduceStrategy::AkgReduceLibStrategyOnGpu() {
int64_t min_blocks = square_thread ? 32 : 512;
int64_t min_elem_per_thread = use_local ? 2 : 8;
int64_t min_ty = 8;
if (total_injective_size * total_reduce_size / min_blocks / max_num_threads_ < min_elem_per_thread) {
if (total_injective_size * total_reduce_size / min_blocks / max_x_y_dim_thread_ < min_elem_per_thread) {
min_blocks = 32;
min_ty = square_thread ? min_ty : 1;
}

std::pair<int64_t, int64_t> tx_range{1, max_num_threads_};
std::pair<int64_t, int64_t> ty_range{1, max_num_threads_};
std::pair<int64_t, int64_t> tx_range{1, max_x_y_dim_thread_};
std::pair<int64_t, int64_t> ty_range{1, max_x_y_dim_thread_};
auto AlignToPowerOfTwo = [](int64_t original_factor) -> int64_t {
while ((original_factor) & (original_factor - 1)) {
--original_factor;
@@ -340,9 +339,9 @@ void ReduceStrategy::DealWith4DFusedReduce() {
continue;
}
axis->TileRestrainToSingleValue(CastIntToExpr(last_mod_value), TileLevel::CACHE1);
if (last_mod_value > max_num_threads_) {
if (last_mod_value > max_x_y_dim_thread_) {
LOG(WARNING) << "Cannot bind axis to " << last_mod_value << " threads, maximal thread number is "
<< max_num_threads_
<< max_x_y_dim_thread_
<< ". If fusing more than two axes together, footprint box calculated by isl may not be correct.";
continue;
}
@@ -377,13 +376,141 @@ void ReduceStrategy::DealWithPostReduceTensors() {
}
}

int GpuStrategy::GetLocalAllocBufCount() {
int count = 0;
for (auto &it : analyzer_->buf_info_) {
auto buf = it.second.get();
CHECK(buf);
if (buf->scope == TilingMemScope::MEM_SCOPE_LOCAL) {
count++;
}
}
return count;
}

void GpuStrategy::ApplyCustomConstraint() {
auto ParseBindingConstraint = [](const std::string constraint, size_t max_size) {
std::vector<std::string> sp = akg::common::Split(constraint, ",");
std::vector<int64_t> ret;
for (auto val : sp) {
if (ret.size() == max_size) {
break;
}
CHECK(!val.empty());
ret.emplace_back(static_cast<int>(std::strtol(val.c_str(), nullptr, 10)));
}
return ret;
};

// init binding space through template-determined limit
thread_binding_spaces_.clear();
block_binding_spaces_.clear();
for (size_t i = 0; i < thread_limit_.size(); ++i) {
TileAxis::MappingConstraint elem;
elem.map_extent_ = thread_limit_[i];
thread_binding_spaces_.emplace_back(elem);
}
for (size_t i = 0; i < std::min(depth_, block_limit_.size()); ++i) {
TileAxis::MappingConstraint elem;
elem.map_extent_ = block_limit_[i];
block_binding_spaces_.emplace_back(elem);
}

// add constraints to binding space according to custom tiling
std::unordered_set<std::string> thread_keys = {AT_THREAD_MIN, AT_THREAD_MAX, AT_THREAD_MOD};
std::unordered_set<std::string> block_keys = {AT_BLOCK_MIN, AT_BLOCK_MAX, AT_BLOCK_MOD};
for (const auto attr : analyzer_->RootAxis()->attrs) {
std::vector<int64_t> constraint;
std::vector<TileAxis::MappingConstraint> target;
if (thread_keys.find(attr.attr_key) != thread_keys.end()) {
constraint = ParseBindingConstraint(attr.attr_value, thread_binding_spaces_.size());
target = thread_binding_spaces_;
} else if (block_keys.find(attr.attr_key) != block_keys.end()) {
constraint = ParseBindingConstraint(attr.attr_value, block_binding_spaces_.size());
target = block_binding_spaces_;
}
if (constraint.empty()) {
continue;
}

for (size_t i = 0; i < constraint.size(); ++i) {
if (attr.attr_key.find("MIN") != std::string::npos) {
target[i].map_min_ = std::max<int64_t>(target[i].map_min_, constraint[i]);
} else if (attr.attr_key.find("MAX") != std::string::npos && constraint[i] > 0) {
target[i].map_extent_ = std::min<int64_t>(target[i].map_extent_, constraint[i]);
} else if (attr.attr_key.find("MOD") != std::string::npos) {
target[i].map_mod_ = std::max<int64_t>(1, constraint[i]);
}
}

if (thread_keys.find(attr.attr_key) != thread_keys.end()) {
thread_binding_spaces_ = target;
} else if (block_keys.find(attr.attr_key) != block_keys.end()) {
block_binding_spaces_ = target;
}
}

// apply custom constraint to corresponding axis and modify binding space according to tile range of axis
size_t cur_depth = 0;
analyzer_->ForEachAxisTopDown([this, &cur_depth](TileAxis *axis) {
if (axis == analyzer_->RootAxis()) {
return;
}
auto cons = axis->GetConstConstraint(CACHE1);
auto range_extent = axis->GetConstExtent();
int tile_min = cons.tile_min_.as<IntImm>()->value;
int tile_extent = cons.tile_extent_.as<IntImm>()->value;
auto idx = reverse_binding_ ? cur_depth : depth_ - 1 - cur_depth;

auto thread_extent = tile_extent;
if (idx < thread_binding_spaces_.size()) {
thread_extent = std::min<int64_t>(thread_extent, thread_binding_spaces_[idx].map_extent_);
thread_binding_spaces_[idx].map_extent_ = thread_extent;
}

auto block_extent = range_extent / tile_min;
if (idx < block_binding_spaces_.size()) {
block_extent = std::min<int64_t>(block_extent, block_binding_spaces_[idx].map_extent_);
block_binding_spaces_[idx].map_extent_ = block_extent;
}

auto block_min = block_extent / std::max<int64_t>(1, thread_extent);
if (idx < block_binding_spaces_.size()) {
block_min = std::max<int64_t>(block_min, block_binding_spaces_[idx].map_min_);
block_binding_spaces_[idx].map_min_ = block_min;
}

axis->thread_constraints.map_extent_ = thread_extent;
axis->block_constraints.map_extent_ = block_extent;
axis->block_constraints.map_min_ = block_min;
if (idx < thread_binding_spaces_.size()) {
axis->thread_constraints.map_mod_ = thread_binding_spaces_[idx].map_mod_;
}
if (idx < block_binding_spaces_.size()) {
axis->block_constraints.map_mod_ = block_binding_spaces_[idx].map_mod_;
}
++cur_depth;
});
}

void GpuStrategy::AddGpuConstraint() {
InitMappingLimit();
if (template_ == Template::BROADCAST_OP || template_ == Template::CUSTOM_CONFIG) {
if (!analyzer_->scop_info_.user_config_.GetIsTuning() &&
(template_ == Template::BROADCAST_OP || template_ == Template::CUSTOM_CONFIG)) {
BroadcastSpeedup();
}
BuildAxesQueue();
if (analyzer_->scop_info_.user_config_.GetIsTuning()) {
ApplyCustomConstraint();
for (size_t i = 0; i < max_dim_; ++i) {
TileAxis::MappingConstraint pad;
if (i >= thread_binding_spaces_.size()) {
thread_binding_spaces_.emplace_back(pad);
}
if (i >= block_binding_spaces_.size()) {
block_binding_spaces_.emplace_back(pad);
}
}
return;
}
InnerThreadOuterBlock();
@@ -391,19 +518,36 @@ void GpuStrategy::AddGpuConstraint() {
InjectiveSpeedup();
}
SetMappingConfig();
if (template_ != Template::MATMUL || !analyzer_->scop_info_.user_config_.GetEnableTensorCore()) {
analyzer_->ForEachAxisTopDown([this](TileAxis *axis) {
if (axis == analyzer_->RootAxis()) {
return;
}
axis->TileRestrainToSingleValue(axis->c1_constraints.tile_min_, TileLevel::CACHE0);
});
}
// TODO: This is a very naive strategy to avoid cuda launch out of resources
// and we should fix this in register memory promotion pass.
if (template_ != Template::REDUCTION && template_ != Template::ALL_REDUCE) {
auto local_buf_count = GetLocalAllocBufCount();
auto thread_size = std::accumulate(thread_cfg_.begin(), thread_cfg_.end(), 1, std::multiplies<int>());
if (local_buf_count >= 4 || local_buf_count * 4 * thread_size >= 65536) {
analyzer_->scop_info_.user_config_.SetUseRegisterMemory(false);
}
}
}

void GpuStrategy::InitMappingLimit() {
max_num_threads_ = analyzer_->scop_info_.user_config_.GetMaxElemPerThread();
max_x_y_dim_thread_ = analyzer_->scop_info_.user_config_.GetMaxElemPerThread();
DetermineTemplate();
std::stringstream ss;
need_reverse_ = analyzer_->scop_info_.user_config_.GetEnableAkgReduceLib() &&
analyzer_->scop_info_.analysis_result_.GetReduceDirection() == Y_DIRECTION;
reverse_binding_ = analyzer_->scop_info_.user_config_.GetEnableAkgReduceLib() &&
analyzer_->scop_info_.analysis_result_.GetReduceDirection() == Y_DIRECTION;

if (template_ == Template::CUSTOM_CONFIG) {
auto thread_config = analyzer_->scop_info_.user_config_.GetThreadConfig();
for (size_t i = 0; i < thread_config->bound; ++i) {
auto idx = need_reverse_ ? thread_config->bound - 1 - i : i;
auto idx = reverse_binding_ ? thread_config->bound - 1 - i : i;
if (idx >= depth_) {
continue;
}
@@ -427,29 +571,33 @@ void GpuStrategy::InitMappingLimit() {
} else if (template_ == Template::MATMUL) {
// This is a naive tiling strategy used in gpu when thread and block configs are already set.
// This strategy will tile up to three inner-most axes to 32 (for thread binding).
thread_limit_ = {32, 8};
if (analyzer_->scop_info_.user_config_.GetEnableTensorCore()) {
thread_limit_ = {warp_sizes_, 16};
} else {
thread_limit_ = {warp_sizes_, 8};
}
} else {
thread_limit_ = {max_x_y_dim_thread_, max_x_y_dim_thread_, max_z_dim_thread_};
}

if (template_ != Template::CUSTOM_CONFIG) {
if (template_ != Template::CUSTOM_CONFIG && !analyzer_->scop_info_.user_config_.GetEnableTensorCore()) {
AdjustThreadMappingLimit();
}

if (template_ == Template::CUSTOM_CONFIG) {
auto block_config = analyzer_->scop_info_.user_config_.GetBlockConfig();
for (int i = block_config->bound - 1; i >= 0; --i) {
for (int i = 0; i < static_cast<int>(block_config->bound) - 1; ++i) {
if (i >= static_cast<int>(depth_)) {
continue;
break;
}
block_limit_.emplace_back(block_config->GetAt(i).second);
}
} else if (template_ <= Template::REDUCTION) {
block_limit_ = {max_num_blocks_, max_num_blocks_, max_num_blocks_};
block_limit_ = {max_x_dim_block_, max_y_z_dim_block_, max_y_z_dim_block_};
} else if (template_ == Template::ALL_REDUCE && !analyzer_->scop_info_.user_config_.GetEnableAkgReduceLib()) {
block_limit_ = {1};
} else {
block_limit_ = {max_num_blocks_, max_num_blocks_, max_num_blocks_};
block_limit_ = {max_x_dim_block_, max_y_z_dim_block_, max_y_z_dim_block_};
}

std::vector<std::string> elem_cfg = common::Split(analyzer_->scop_info_.user_config_.GetElemPerThread(), " ");
@@ -490,13 +638,20 @@ void GpuStrategy::InnerThreadOuterBlock() {
auto block_dim = std::min(block_limit_.size(), max_dim_);

// tile from inner to outer and map to thread
analyzer_->GetTileLogger().AppendLine(GPU_MAPPING, "-----Map to thread-----");
ss << "[Thread Limit]: ";
for (auto l : thread_limit_) {
ss << l << ", ";
}
analyzer_->GetTileLogger().AppendLog(GPU_MAPPING, ss);

size_t ori_size = pending_axes_.size();
size_t inner_dim = 0;
for (size_t i = 0; i < ori_size; ++i) {
TileAxis *axis;
int64_t shape;
std::tie(axis, shape) = pending_axes_[i];
int64_t rest_threads = std::min(max_num_threads_ / activated_threads, thread_limit_[thread_cfg_.size()]);
int64_t rest_threads = std::min(max_x_y_dim_thread_ / activated_threads, thread_limit_[thread_cfg_.size()]);
ss << "axis " << axis->index << "_" << axis->dim_axis << " shape = " << shape
<< ", rest_threads = " << rest_threads;
auto SkipMapping = [this, &axis, &shape, &ss, &inner_dim, &thread_dim]() {
@@ -505,16 +660,26 @@ void GpuStrategy::InnerThreadOuterBlock() {
tile = tile == SpItemPerThread::AUTO ? std::min(axis->thread_constraints.item_process_, max_elem_per_thread_)
: tile == SpItemPerThread::FULL ? std::min(shape, max_elem_per_thread_)
: 1;
if (axis->block_constraints.map_extent_ > 1) {
tile =
std::max(tile, std::max<int64_t>(ceil(static_cast<float>(shape) / axis->block_constraints.map_extent_), 1));
pending_axes_.push_back(std::make_pair(axis, std::max<int64_t>(ceil(static_cast<float>(shape) / tile), 1)));
ss << ", map to block.";
auto tile_min = axis->c1_constraints.tile_min_.as<IntImm>()->value;
auto tile_extent = axis->c1_constraints.tile_extent_.as<IntImm>()->value;
if (tile_min == tile_extent && tile_extent != MIN_TILE) {
ss << "tile extent is already determined = " << tile_extent;
analyzer_->GetTileLogger().AppendLog(GPU_MAPPING, ss);
tile = tile_min;
} else {
tile = std::min(tile, shape);
if (axis->block_constraints.map_extent_ > 1) {
tile =
std::max(tile, std::max<int64_t>(ceil(static_cast<float>(shape) / axis->block_constraints.map_extent_), 1));
} else {
tile = std::min(tile, shape);
}
}
axis->TileRestrainLower(tile, TileLevel::CACHE1);
ss << ", tile = " << tile;
if (axis->block_constraints.map_extent_ > 1) {
pending_axes_.push_back(std::make_pair(axis, std::max<int64_t>(ceil(static_cast<float>(shape) / tile), 1)));
ss << ", map to block.";
}
analyzer_->GetTileLogger().AppendLog(GPU_MAPPING, ss);
};

@@ -535,6 +700,7 @@ void GpuStrategy::InnerThreadOuterBlock() {
SkipMapping();
continue;
}

auto item = elem_per_thread_[inner_dim] == SpItemPerThread::AUTO ? axis->thread_constraints.item_process_
: elem_per_thread_[inner_dim];
item = std::min(item, max_elem_per_thread_);
@@ -559,8 +725,9 @@ void GpuStrategy::InnerThreadOuterBlock() {
if (template_ == Template::PURE_ELEM) {
std::map<int64_t, std::vector<size_t>, std::greater<int64_t>> sorted_by_gcd;
for (size_t i = pending_axes_.size() - 1; i >= ori_size; --i) {
auto use = (max_num_blocks_ > 0 && pending_axes_[i].second > 0)
? TilingAnalyzer::FindDivisibleTilingFactor(max_num_blocks_, pending_axes_[i].second)
auto block_limit = i == 0 ? max_x_dim_block_ : max_y_z_dim_block_;
auto use = (block_limit > 0 && pending_axes_[i].second > 0)
? TilingAnalyzer::FindDivisibleTilingFactor(block_limit, pending_axes_[i].second)
: 1;
if (sorted_by_gcd.find(use) == sorted_by_gcd.end()) {
sorted_by_gcd[use] = {i};
@@ -575,6 +742,7 @@ void GpuStrategy::InnerThreadOuterBlock() {
if (pending_axes_.size() - i > block_dim) {
auto axis = pending_axes_[i].first;
ss << "axis " << axis->index << "_" << axis->dim_axis

<< " exceeded block dim and should be mapped to block for higher performance, consider flatten";
analyzer_->GetTileLogger().AppendLog(GPU_MAPPING, ss);
continue;
@@ -584,20 +752,30 @@ void GpuStrategy::InnerThreadOuterBlock() {
}
} else {
for (size_t i = pending_axes_.size() - 1; i >= ori_size; --i) {
if (pending_axes_[i].second <= 1 && indexing.size() == block_limit_.size()) {
continue;
}
indexing.emplace_back(i);
}
}

// map outer band to block according to predefined indice
analyzer_->GetTileLogger().AppendLine(GPU_MAPPING, "-----Map to block-----");
ss << "[Block Limit]: ";
for (auto l : block_limit_) {
ss << l << ", ";
}
analyzer_->GetTileLogger().AppendLog(GPU_MAPPING, ss);

for (const auto &i : indexing) {
TileAxis *axis;
int64_t shape;
std::tie(axis, shape) = pending_axes_[i];
auto idx = pending_axes_.size() - 1 - i;
idx = need_reverse_ ? block_limit_.size() - 1 - idx : idx;
auto rest_blocks = std::min(max_num_blocks_ / activated_blocks, block_limit_[idx]);
rest_blocks = std::min(rest_blocks, axis->block_constraints.map_extent_);
ss << "axis " << axis->index << "_" << axis->dim_axis << " shape = " << shape << ", rest blocks = " << rest_blocks;
auto idx = depth_ - 1 - (pending_axes_.size() - 1 - i);
idx = reverse_binding_ ? std::min(depth_, block_limit_.size()) - 1 - idx : idx;
auto rest_blocks = idx < block_limit_.size() ? std::min(block_limit_[idx], axis->block_constraints.map_extent_) : 1;
ss << "axis " << axis->index << "_" << axis->dim_axis << " shape = " << shape << ", block_idx = " << idx
<< ", rest blocks = " << rest_blocks;
if (block_count_ >= static_cast<int>(block_dim)) {
ss << "-> No mapping.";
analyzer_->GetTileLogger().AppendLog(GPU_MAPPING, ss);
@@ -635,11 +813,9 @@ void GpuStrategy::SetMappingConfig() {
if (block_cfg_.empty()) {
block_cfg_.emplace_back(1);
}
bool reverse_binding = (analyzer_->scop_info_.user_config_.GetEnableAkgReduceLib() &&
analyzer_->scop_info_.analysis_result_.GetReduceDirection() == Y_DIRECTION);
std::string block_str = "";
std::string thread_str = "";
if (reverse_binding) {
if (reverse_binding_) {
for (int i = 0; i < static_cast<int>(block_cfg_.size()); ++i) {
if (i >= block_count_) {
continue;
@@ -753,7 +929,7 @@ int64_t GpuStrategy::TileAfterThreadMapping(TileAxis *axis, size_t inner_dim, in
tile = thread_size;
ss << "tile = thread size, ";
} else {
auto block_dim = need_reverse_ ? inner_dim : block_limit_.size() - 1 - inner_dim;
auto block_dim = reverse_binding_ ? block_limit_.size() - 1 - inner_dim : inner_dim;
int64_t least_blocks;
if (block_dim >= 0 && block_dim < block_limit_.size()) {
least_blocks = block_limit_[block_dim];
@@ -903,7 +1079,7 @@ void GpuStrategy::InjectiveSpeedup() {
while (shape % lower != 0) {
--lower;
}
bool is_efficient = lower * 2 > thread_size || total_threads / thread_size * lower * 2 >= max_num_threads_;
bool is_efficient = lower * 2 > thread_size || total_threads / thread_size * lower * 2 >= max_x_y_dim_thread_;
if (is_efficient) {
ss << "align thread from " << thread_size << " to " << lower << " according to shape " << shape;
analyzer_->GetTileLogger().AppendLog(GPU_MAPPING, ss);
@@ -923,8 +1099,8 @@ void GpuStrategy::InjectiveSpeedup() {
auto coaleasced_size = injective_axes.back()->thread_constraints.map_extent_;
auto proposal_blocks = coaleasced_size >= warp_sizes_ ? 256 : 512;
auto proposal_threads = (coaleasced_size >= warp_sizes_ && injective_axes.size() > 1U) ? 128
: coaleasced_size < max_num_threads_ ? 512
: max_num_threads_;
: coaleasced_size < max_x_y_dim_thread_ ? 512
: max_x_y_dim_thread_;
auto total_blocks = std::accumulate(block_cfg_.begin(), block_cfg_.end(), 1, std::multiplies<int>());
auto proposal_elem_per_thread = coaleasced_size < warp_sizes_ ? 1
: total_blocks < proposal_blocks * 8 ? min_elem_for_io_bound_
@@ -1091,7 +1267,7 @@ void GpuStrategy::GpuVectorBroadcastStrategy() {
int total_injective_size = 1;
auto broadcast_innermost = broadcast_idx_.find(original_shape.size() - 1) != broadcast_idx_.end();
for (size_t i = 0; i < original_shape.size(); ++i) {
if (original_shape[i] * possible_threads <= max_num_threads_) {
if (original_shape[i] * possible_threads <= max_x_y_dim_thread_) {
possible_threads *= original_shape[i];
}
auto rev_idx = original_shape.size() - 1 - i;
@@ -1100,7 +1276,7 @@ void GpuStrategy::GpuVectorBroadcastStrategy() {
coalesced_size = coalesced_size == 0 ? original_shape[i] : coalesced_size;
if (broadcast_innermost) {
auto prev_extent = axis->thread_constraints.map_extent_ > 0 ? axis->thread_constraints.map_extent_ : 1;
auto thread_limit = max_num_threads_ / prev_extent;
auto thread_limit = max_x_y_dim_thread_ / prev_extent;
auto coef = analyzer_->FindDivisibleTilingFactor(thread_limit, original_shape[i]);
axis->thread_constraints.map_extent_ = prev_extent * coef;
possible_threads = axis->thread_constraints.map_extent_;
@@ -1108,7 +1284,7 @@ void GpuStrategy::GpuVectorBroadcastStrategy() {
} else if (broadcast_innermost) {
auto prev_extent = axis->thread_constraints.map_extent_ > 0 ? axis->thread_constraints.map_extent_ : 1;
axis->thread_constraints.map_extent_ =
prev_extent * original_shape[i] <= max_num_threads_ ? prev_extent * original_shape[i] : prev_extent;
prev_extent * original_shape[i] <= max_x_y_dim_thread_ ? prev_extent * original_shape[i] : prev_extent;
possible_threads = axis->thread_constraints.map_extent_;
}
coalesced_size = coalesced_size == 0 ? 1 : coalesced_size;
@@ -1121,10 +1297,10 @@ void GpuStrategy::GpuVectorBroadcastStrategy() {
std::min(elem_per_thread, std::max<int>((fused_size_ / possible_threads / min_block + 1) / 2 * 2, 1));
ss << "thread for-loop speedup = " << axis->thread_constraints.item_process_;
} else if (total_injective_size > min_block) {
while (possible_threads % warp_sizes_ != 0 && possible_threads < max_num_threads_) {
while (possible_threads % warp_sizes_ != 0 && possible_threads < max_x_y_dim_thread_) {
++possible_threads;
}
int elem_per_block = std::max<int>(16 / (max_num_threads_ / possible_threads), 1);
int elem_per_block = std::max<int>(16 / (max_x_y_dim_thread_ / possible_threads), 1);
auto proposal_blocks = std::max(min_block, std::max<int>(fused_size_ / possible_threads / elem_per_block, 1));
axis->block_constraints.map_extent_ = proposal_blocks;
axis->thread_constraints.map_extent_ = possible_threads;
@@ -1139,12 +1315,70 @@ void GpuStrategy::GpuVectorBroadcastStrategy() {
}
}

// Parses the user-supplied custom-tiling attribute attached to each axis and
// turns it into concrete tiling constraints (min / max / factor / mod) on
// that axis for the GPU backend.
//
// Grammar handled here (after an optional "<lhs>-><rhs>" rewrite, of which
// only the rhs is kept):
//   "LEVEL:<C1|C0>_<ITEM>:<VALUE>[_<ITEM>:<VALUE>...]"
// where ITEM is one of MIN, MAX, FACTOR, FORBIDISO, or the AT_MOD key.
void CustomTilingStrategy::AddGpuConstraint() {
  auto interested_info = GetInterestedInfo(interested_attr_key, false);
  for (auto it : interested_info) {
    TileAxis *axis = it.first;
    for (auto attr : it.second) {
      // attr_key is expected to be "<a>:<b>"; only its arity is checked.
      std::vector<std::string> modes = akg::common::Split(attr.attr_key, ":");
      CHECK_EQ(modes.size(), 2U);
      std::string constraint_str = attr.attr_value;
      // "a->b" marks a rewritten constraint; keep only the right-hand side.
      if (constraint_str.find("->") != std::string::npos) {
        std::vector<std::string> res = akg::common::Split(constraint_str, "->");
        constraint_str = res[1];
      }
      std::vector<std::string> constraints = akg::common::Split(constraint_str, "_");
      CHECK_GE(constraints.size(), 1U);
      // First token selects the cache level the remaining items apply to.
      std::vector<std::string> level = akg::common::Split(constraints[0], ":");
      CHECK(level.size() == 2U && level[0] == "LEVEL");
      CHECK(level[1] == "C1" || level[1] == "C0");
      TileLevel lv = level[1] == "C1" ? CACHE1 : CACHE0;
      constraints.erase(constraints.begin());
      for (const auto &con : constraints) {
        std::vector<std::string> items = akg::common::Split(con, ":");
        CHECK_EQ(items.size(), 2U);
        CHECK_NE(items[0], "");
        CHECK_NE(items[1], "");
        if (items[0] == "MIN") {
          if (items[1] == "MIN") {
            // "MIN:MIN": pin the tile extent down to the axis' own minimum.
            if (lv == CACHE1) {
              axis->c1_constraints.tile_extent_ = axis->c1_constraints.tile_min_;
            } else if (lv == CACHE0) {
              axis->c0_constraints.tile_extent_ = axis->c0_constraints.tile_min_;
            }
          } else {
            // "MIN:<n>": raise the lower bound of the tile size.
            if (lv == CACHE1) {
              axis->c1_constraints.tile_min_ = CastToExpr(items[1]);
            } else if (lv == CACHE0) {
              axis->c0_constraints.tile_min_ = CastToExpr(items[1]);
            }
          }
        } else if (items[0] == "FACTOR") {
          // Force a single fixed tile size for this level.
          axis->TileRestrainToSingleValue(CastToExpr(items[1]), lv);
        } else if (items[0] == "FORBIDISO") {
          // Disallow isolated (remainder) tiles on this axis.
          axis->forbid_iso = true;
        } else if (items[0] == "MAX") {
          if (items[1] == "FULL") {
            // "MAX:FULL": tile the whole extent of the axis.
            axis->TileRestrainEntire(lv);
          } else {
            // "MAX:<n>": cap the tile size.
            if (lv == CACHE1) {
              axis->c1_constraints.tile_extent_ = CastToExpr(items[1]);
            } else if (lv == CACHE0) {
              axis->c0_constraints.tile_extent_ = CastToExpr(items[1]);
            }
          }
        } else if (items[0] == AT_MOD) {
          // Constrain the tile size to be a multiple of the given modulus.
          axis->TileRestrainMod(CastToExpr(items[1]), lv);
        }
      }
    }
  }
}

// The strategies below impose no additional constraints on CUDA targets,
// so their AddGpuConstraint overrides are intentionally empty.

void ModStrategy::AddGpuConstraint() {}

// NOTE(review): this empty definition duplicates the full
// CustomTilingStrategy::AddGpuConstraint defined above; a single translation
// unit cannot contain both. This looks like a diff-rendering artifact where
// the old (removed) side was captured alongside the new one — confirm
// against the actual file which definition survives.
void CustomTilingStrategy::AddGpuConstraint() {}

void ConflictTreeRangeStrategy::AddGpuConstraint() {}

void VectorizedStrategy::AddGpuConstraint() {}


+ 0
- 0
tests/fuzz/tune_for_gpu/__init__.py View File


+ 17
- 0
tests/fuzz/tune_for_gpu/autotuning/data_utils/sort_log.py View File

@@ -0,0 +1,17 @@
"""Sort a tuning log by (time, config) and write the result to a new file.

Usage: python sort_log.py <input_log> <output_log>

Each input line is expected to look like "|<config>|<time>|..."; output
lines are "|<config>|<time>" ordered by ascending time, ties broken by
the config string. A config appearing more than once keeps its last time.
"""
import sys

if __name__ == "__main__":
    from_log_file = str(sys.argv[1])
    sorted_log_file = str(sys.argv[2])
    entries = {}
    # 'with' guarantees the handles are closed even if a line is malformed
    # (the original left files open on a parse error).
    with open(from_log_file, 'r') as f_in:
        for line in f_in:
            fields = line.split("|")
            entries[str(fields[1])] = float(fields[2])
    with open(sorted_log_file, "wt") as f_out:
        # Sort by (time, config); no need to materialise an ordered dict.
        for cfg, cost in sorted(entries.items(), key=lambda item: (item[1], item[0])):
            f_out.write("|" + str(cfg) + "|" + str(cost) + "\n")

+ 95
- 0
tests/fuzz/tune_for_gpu/autotuning/gen_spaces_gpu.py View File

@@ -0,0 +1,95 @@
from .kernel_compiler import compile_kernel
from collections import namedtuple
from .space import ListConfigSpace

def get_reduce_axis_length(in_shape, reduce_axis):
    """Split a shape into reduction lengths (lx, ly).

    Args:
        in_shape: list of axis extents.
        reduce_axis: list of reduced axis indices, or None for all-reduce.

    Returns:
        (lx, ly) where, for an all-reduce, lx is the full element count and
        ly is 1; when the innermost axis is reduced ("reduce-x"), lx is the
        product of reduced extents and ly the product of kept extents; in
        the remaining case ("reduce-y") the roles are swapped.
    """
    lx, ly = 1, 1
    # Use `is None`: identity test is the idiomatic (and safe) None check.
    if reduce_axis is None or len(reduce_axis) == len(in_shape):
        # all-reduce: every element is folded into lx.
        for extent in in_shape:
            lx *= extent
    elif (len(in_shape) - 1) in reduce_axis:
        # reduce-x: reduced extents accumulate in lx, kept extents in ly.
        for i, extent in enumerate(in_shape):
            if i in reduce_axis:
                lx *= extent
            else:
                ly *= extent
    else:
        # reduce-y: reduced extents accumulate in ly, kept extents in lx.
        for i, extent in enumerate(in_shape):
            if i in reduce_axis:
                ly *= extent
            else:
                lx *= extent
    return lx, ly

def _get_space_reduce_gpu_manually(op_type: str, op_desc, tuning_attrs=[], tuning_attrs_info=None):
    """Build the hand-enumerated config space of reduce operators on GPU.

    Args:
        op_type: operator name, also used as the namedtuple type name.
        op_desc: op description; op_desc[2] must carry .in_shape and .axis.
        tuning_attrs: unused here; kept for interface compatibility.
        tuning_attrs_info: pair-like; [0] holds extra attr names, [1] their
            value combinations. Must not be None.

    Returns:
        (index_table, space, key, expect, input_for_mod) as produced by
        compile_kernel plus the enumerated ListConfigSpace.
    """
    space_res, key, expect, input_for_mod = compile_kernel(op_type, op_desc, None, None, None, 0,
                                                           gen_tiling_spaces=True)
    in_shape, reduce_axis = op_desc[2].in_shape, op_desc[2].axis
    all_reduce = reduce_axis is None or len(reduce_axis) == len(in_shape)
    # all-reduce has one tiling dim; the other patterns have two.
    dim_len = 1 if all_reduce else 2
    dim_names = ['tiling_' + str(i) for i in range(dim_len)]
    dim_names.extend(["block_x", "block_y", "block_z",
                      "thread_x", "thread_y", "thread_z"])
    # BUG FIX: the original iterated with `for key in ...`, clobbering the
    # kernel `key` returned by compile_kernel before it was returned below.
    for attr_name in tuning_attrs_info[0]:
        dim_names.append(attr_name)
    lx, ly = get_reduce_axis_length(in_shape, reduce_axis)

    tiling_spaces = []
    if all_reduce:
        # all-reduce: sweep thread_x over powers of two (16..1024) and
        # dim0 over multiples of tx; entries are
        # [dim0, block_x, block_y, block_z, thread_x, thread_y, thread_z].
        for tx in [2 ** i for i in range(4, 11)]:
            if tx > lx:
                break
            possible_dim0_list = [d0 for d0 in range(tx, lx + 1, tx)]
            if possible_dim0_list[-1] != lx:
                possible_dim0_list.append(lx)
            for d0 in possible_dim0_list:
                bx = lx // d0 if lx % d0 == 0 else lx // d0 + 1
                tiling_spaces.append([d0, bx, 1, 1, tx, 1, 1])
    elif (len(in_shape) - 1) in reduce_axis:
        # reduce-x: thread_y/block_y are pinned (1, ly); sweep tx and dim1.
        for tx in [2 ** i for i in range(4, 11)]:
            if tx > lx:
                break
            ty = 1
            by = ly
            possible_dim1_list = [d1 for d1 in range(tx, lx + 1, tx)]
            if possible_dim1_list[-1] != lx:
                possible_dim1_list.append(lx)
            for d1 in possible_dim1_list:
                bx = lx // d1 if lx % d1 == 0 else lx // d1 + 1
                tiling_spaces.append([1, d1, bx, by, 1, tx, ty, 1])
    else:
        # reduce-y: fix tx/bx from lx, sweep ty (capped at 1024 threads
        # per block in total) and dim1 over multiples of ty.
        tx = min(32, lx)
        bx = lx // tx if lx % tx == 0 else lx // tx + 1
        d0 = tx
        for ty in range(min(8, ly), 1025):
            if ty * tx > 1024:
                break
            for d1 in range(ty, ly + 1, ty):
                by = ly // d1 if ly % d1 == 0 else ly // d1 + 1
                tiling_spaces.append([d0, d1, bx, by, 1, tx, ty, 1])

    input_type = namedtuple(op_type, dim_names)
    space = ListConfigSpace(input_type)
    if len(tuning_attrs_info[0]) != 0:
        # Cross every tiling entry with every extra-attr combination.
        for tiling_space in tiling_spaces:
            for tuning_attrs_config in tuning_attrs_info[1]:
                tmp = tiling_space[:]
                tmp.extend(tuning_attrs_config)
                space.add(input_type(*tmp))
    else:
        for tiling_space in tiling_spaces:
            space.add(input_type(*tiling_space))
    return space_res.index_table, space, key, expect, input_for_mod

+ 501
- 0
tests/fuzz/tune_for_gpu/autotuning/job.py View File

@@ -0,0 +1,501 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""AutoTuning job"""
import os
import json
import time
import datetime
import importlib
import logging
import pandas as pd
import subprocess
import numpy as np
from collections import namedtuple
from multiprocessing import Process, Manager
from akg import composite
from akg.utils import kernel_exec as utils
from akg.composite.build_module import generate_trait
from autotuning.runner import KernelRunner, error_time_list, error_time_string
from autotuning.tuner import ModelBasedTuner, Tuner
from autotuning.type_definitions import ConvDesc, ConvBackpropDesc, MatmulCubeDesc
from autotuning.space_generators import get_space
from autotuning.space import ListConfigSpace
from autotuning.test_data_generators import gen_data
from autotuning.space_generators import gen_bool_list
from autotuning.tuning_utils import *

# Module-wide logging and result-directory setup for the autotuning job.
logging.basicConfig(level=logging.DEBUG)

logger = logging.getLogger('fuzz.tune.autotuning.job')

# All tuning artifacts (results, error lists) are written under ./res/.
storage_dir = './res/'

if not os.path.exists(storage_dir):
    os.makedirs(storage_dir)

# Templates: per-op result file and the shapes directory used for tuning input.
json_file = './res/' + "{0}" + ".json"
json_load = './autotuning/shapes/' + "{0}"


def get_repo(repo, keys, default=None):
    """Walk a nested dict along ``keys``.

    Returns the value reached after following every key, or ``default``
    as soon as any hop yields a missing or falsy value.
    """
    node = repo
    for k in keys:
        node = node.get(k)
        if not node:
            return default
    return node


def get_json_space(json_input, space_dict):
    """Query the composite tiling space for ``json_input`` and stash it.

    Intended to run in a worker process (see launch_json): the result is
    handed back to the parent through the shared ``space_dict`` proxy under
    key 'res'. The literal 2 selects the space level/mode passed through to
    composite.get_tiling_space — confirm its meaning against that API.
    """
    space_res = composite.get_tiling_space(json_input, 2)
    space_dict['res'] = space_res


def launch_json(debug_mode: bool = True, save_res: bool = False, json_dir="", repo_path="", all_space=False,
                skip_exist=True, extra_tune=False, self_attrs=[], tuning_attrs=[]):
    """composite json tuning launch

    Tunes every composite-op json file found in ``json_dir``: builds (or
    reuses) a config space, runs a Tuner/ModelBasedTuner over it, prints the
    outcome and optionally saves it. Ops already present in the repo at
    ``repo_path`` are skipped when ``skip_exist``. Failures (space/empty
    space/data generation) are appended to error-list files under res/ and
    the op is skipped.
    """
    subprocess.run("mkdir -p res/", shell=True)
    # Model-based tuning budget tiers, selected later by space size.
    iter_times = [3, 3, 3] if debug_mode else [80, 160, 320]
    files = os.listdir(json_dir)
    with open(repo_path, 'r') as f:
        repo = json.loads(f.read())
    for input_file in files:
        print("----Start tuning for ", input_file)
        with open(json_dir + '/' + input_file, 'r') as f:
            json_input = f.read()
        json_content = json.loads(json_input)
        # Scalar inputs come in with shape []; normalize to [1].
        for input_desc in json_content["input_desc"]:
            if input_desc[0]["shape"] == []:
                input_desc[0]["shape"] = [1]
        json_input = json.dumps(json_content)

        # skip tuning for info in repo
        if skip_exist:
            compute, shape, dtype = generate_trait(json_content)
            if get_repo(repo, [compute, shape, dtype]):
                print("Info for %s already exists" % input_file)
                print("ops are ", str(compute))
                print("shape is ", str(shape))
                print("dtype is ", str(dtype))
                with open('res/skip_file.txt', 'a') as fe:
                    fe.write(input_file)
                    fe.write("\n")
                continue

        # generate tuning space
        if not extra_tune:
            time_start_get_space = time.time()
            # Query the space in a subprocess with a 10-minute timeout so a
            # hung space generation cannot stall the whole launch.
            with Manager() as manager:
                space_dict = manager.dict()
                p = Process(target=get_json_space,
                            args=(json_input, space_dict))
                p.start()
                p.join(600)
                if 'res' not in space_dict:
                    with open('res/error_space_list.txt', 'a') as fe:
                        fe.write(input_file)
                        fe.write("\n")
                    continue
                space_res = space_dict['res']
            time_end_get_space = time.time()
            print("get space time: ", time_end_get_space - time_start_get_space)
            index_table = space_res['index']
            tiling_spaces = space_res['tuning_space']
            if not isinstance(tiling_spaces, list):
                with open('res/empty_space_list.txt', 'a') as fe:
                    fe.write(input_file)
                    fe.write("\n")
                continue
            dim_names = ['tiling_' + str(i)
                         for i in range(len(tiling_spaces[0]))]
            # Crossing in extra attrs multiplies the space; only do it for
            # spaces below 10^5 entries.
            use_tuning_attrs = len(tiling_spaces) < 10 ** 5
            if tuning_attrs and use_tuning_attrs:
                dim_names.extend(tuning_attrs)
            input_type = namedtuple("json", dim_names)
            space = ListConfigSpace(input_type)
            if tuning_attrs and use_tuning_attrs:
                attr_options = gen_bool_list(tuning_attrs)
                for tiling_space in tiling_spaces:
                    for attr_option in attr_options:
                        tmp = tiling_space[:]
                        tmp.extend(attr_option)
                        config = input_type(*tmp)
                        space.add(config)
            else:
                for tiling_space in tiling_spaces:
                    config = input_type(*tiling_space)
                    space.add(config)
        else:
            # extra_tune mode: space is just the boolean combinations of
            # the caller-provided attributes.
            index_table = []
            pre_lists = gen_bool_list(self_attrs)
            pre_input_type = namedtuple("extra_tune", self_attrs)
            space = ListConfigSpace(pre_input_type)
            for item in pre_lists:
                config = pre_input_type(*item)
                space.add(config)

        key = json_content["op"]
        try:
            input_for_mod, expect = gen_data(
                op_type="json", op_desc=json_input)
        except BaseException as e:
            logger.debug(
                "gen numpy data from [%s] failed: %s", input_file, str(e))
            with open('res/error_gen_data_list.txt', 'a') as fe:
                fe.write(input_file)
                fe.write(": ")
                fe.write(str(e))
                fe.write("\n")
            continue
        print('space size:', space.length)
        print('index table:', index_table)

        output_para = None  # this is for multi-output
        if len(json_content["output_desc"]) > 1:
            # Negative indices of the outputs within the argument list.
            output_para = []
            for i in range(len(json_content["output_desc"])):
                output_para.append(i - len(json_content["output_desc"]))
        runner = KernelRunner(op_type="json", op_desc=json_input, index_table=index_table, self_attrs=self_attrs,
                              input_data=input_for_mod, expect=expect, mod_output_param=output_para, timeout=180,
                              repeat_times=1)

        # we can only get a valid tiling, or accurate get cycles
        is_truly_profiling = utils.get_profiling_mode() or os.environ['RUNTIME_MODE'] == "gpu"

        # available device numbers, normally is 8 or 1
        available_device_numbers = utils.get_available_devices_num()

        if all_space:
            tuner = Tuner(runner, index_table, space,
                          n_parallel=available_device_numbers)
            least_try_times = 3  # space.length
        else:
            tuner = ModelBasedTuner(runner, index_table, space,
                                    n_parallel=available_device_numbers if is_truly_profiling else 1,
                                    plan_size=64, pre_model=None)
            # Budget grows with the size of the space (see iter_times above).
            least_try_times = iter_times[0 if space.length < 10 ** 4 else 1 if space.length < 10 ** 5 else 2]
        tuner.tune(least_try_times, output_file="json.log")

        print_tuning_result("json", space, index_table, tuner, key)

        if save_res:
            if extra_tune:
                save_tuning_result(key, "extra_tune",
                                   json_content, index_table, tuner, repo_path)
            else:
                save_tuning_result(key, "json", json_content,
                                   index_table, tuner, repo_path)


def jobs(op_type: str = 'add', desc=None, debug_mode: bool = True, save_res: bool = False,
         all_space: bool = True, insert_key='', conf_of_set_dim="", tuning_attrs=[], skip_config_set=None, tuning_attrs_info=None):
    """AutoTuning jobs

    Generates the config space for one operator, skips it when an entry
    already exists in ``conf_of_set_dim``, then runs an exhaustive Tuner
    (``all_space``) or a ModelBasedTuner and prints the outcome.

    NOTE(review): ``tuning_attrs_info`` is indexed ([2]) without a None
    check, so callers must always pass it — confirm against call sites.
    """
    iter_times = [3, 3, 3] if debug_mode else [80, 160, 320]
    time_start_get_space = time.time()
    index_table, space, key, expect, input_for_mod = get_space(
        op_type, desc, tuning_attrs=tuning_attrs, tuning_attrs_info=tuning_attrs_info)
    time_end_get_space = time.time()
    print("get space time: ", time_end_get_space - time_start_get_space)
    print('space size:', space.length)
    print('index table:', index_table)
    key = key if insert_key == '' else insert_key

    # filter already tuned shape
    if isinstance(conf_of_set_dim, dict) and key in conf_of_set_dim.keys():
        if isinstance(conf_of_set_dim[key], (list, tuple)) and conf_of_set_dim[key]:
            return

        if isinstance(conf_of_set_dim[key], dict):
            return

    output_para = None  # this is for multi-output
    if isinstance(input_for_mod, dict):
        input_for_mod, output_para = input_for_mod['args'], input_for_mod['outputs']
    runner = KernelRunner(op_type, desc, index_table,
                          self_attrs=None, input_data=input_for_mod,
                          expect=expect, mod_output_param=output_para,
                          timeout=30, repeat_times=1,
                          is_all_space=all_space,
                          skip_config_set=skip_config_set,
                          need_tune_json=tuning_attrs_info[2])

    # we can only get a valid tiling, or accurate get cycles
    is_truly_profiling = utils.get_profiling_mode()

    # number of multi-processing for build kernels
    available_device_numbers = get_parallel_build_num()

    time_start_tuning = time.time()
    if all_space:
        tuner = Tuner(runner, index_table, space,
                      n_parallel=available_device_numbers)
        least_try_times = space.length
    else:
        tuner = ModelBasedTuner(runner, index_table, space,
                                n_parallel=available_device_numbers if is_truly_profiling else 1,
                                plan_size=100, pre_model=None)
        least_try_times = space.length
    tuner.tune(least_try_times, output_file=op_type + ".log")

    time_end_tuning = time.time()
    print("tuning time: ", time_end_tuning - time_start_tuning)
    print_tuning_result(op_type, space, index_table, tuner, key)
    # save_results_to_csv(op_type, space, index_table, tuner, key)

    # if save_res:
    #     save_tuning_result(key, op_type, desc, index_table, tuner)


def print_tuning_result(op_type, space, index_table, tuner, key):
    """Print a human-readable summary of one tuning run.

    Shows the searched space, the best config/time, the speed-up versus
    the automatically set dim, and every tried (config, time) pair.
    """
    def _describe(t):
        # Sentinel error codes are replaced by their description strings.
        return error_time_string[t] if t in error_time_string.keys() else t

    print(op_type + " shape is:", key)
    print('space size:', space.length)
    print('index table:', index_table)
    print('best config:', tuner.best_config)
    print('best time:', _describe(tuner.best_time))
    print('original time:', tuner.original_time)
    print('optimal result is ', tuner.original_time / tuner.best_time, "faster then auto set dim.")
    print("total try times", len(tuner.xs))
    for cfg_idx, cost in zip(tuner.xs, tuner.ys):
        print(space.get(cfg_idx), _describe(cost))


def save_results_to_csv(op_type, space, index_table, tuner, key):
    """save all results to csv

    Writes every tried (config, time) pair to '<op_type>_<key>.csv';
    sentinel error times are normalised to 9999999.
    """
    rows = []
    for cfg_idx, cost in zip(tuner.xs, tuner.ys):
        normalised = cost if cost not in error_time_string.keys() else 9999999
        rows.append([space.get(cfg_idx), normalised])
    frame = pd.DataFrame(rows, columns=["config", "time"])
    frame.to_csv(op_type + "_" + key + ".csv")


def save_tuning_result(key, op_type, desc, index_table, tuner, repo_path="", extra_tune=False, platform="gpu"):
    """save tuning result

    Converts the tuner's best config into the op-specific tiling/attr
    format, then persists it: json/extra_tune results go to the per-op
    json file and (when better than the stored entry) into the repo;
    other op types are exported directly, with UnboundLocalError used to
    detect that no config was ever built.
    """
    if tuner.best_config is not None and tuner.best_time not in error_time_list:
        set_dim_configs = tuner.best_config.input
        if op_type == "matmul":
            # (1,1) placeholders for batch dims, then n/m/k tiles plus the
            # fixed 16x16 cube fractal tiles.
            param = []
            for _ in range(len(desc.x_shape) - 2):
                param.append((1, 1))
            if set_dim_configs.n_l1 > 0:
                param.append((set_dim_configs.n_l1, set_dim_configs.n_l0))
            if set_dim_configs.m_l1 > 0:
                param.append((set_dim_configs.m_l1, set_dim_configs.m_l0))
            param.extend(
                [(16, 16), (16, 16), (set_dim_configs.k_l1, set_dim_configs.k_l0)])
            tiling_param = (param, {"bypass": set_dim_configs.bypass})

        # special case with different tiling parameter format
        elif op_type in ("conv", "conv_bn1"):
            param = []
            tile_hh = set_dim_configs.tile_h
            tile_coco = set_dim_configs.tile_co
            tile_mm = set_dim_configs.tile_m
            tile_kk = set_dim_configs.tile_k
            tile_nn = set_dim_configs.tile_n
            tile_ww = set_dim_configs.tile_w
            param = [tile_hh, tile_coco, tile_mm, tile_kk, tile_nn, tile_ww]
            tiling_param = (param, {"bypass": set_dim_configs.bypass})
        elif op_type == "conv_backprop_input":
            param = []
            tile_hh = set_dim_configs.tile_h
            tile_coco = set_dim_configs.tile_co
            tile_mm = set_dim_configs.tile_m
            tile_kk = set_dim_configs.tile_k
            tile_nn = set_dim_configs.tile_n
            tile_ww = set_dim_configs.tile_w
            param = [tile_hh, tile_coco, tile_mm, tile_kk, tile_nn, tile_ww]
            tiling_param = (param)
        elif op_type == "conv_backprop_filter":
            param = []
            tile_cici = set_dim_configs.tile_ci
            tile_khkh = set_dim_configs.tile_kh
            tile_kwkw = set_dim_configs.tile_kw
            tile_coco = set_dim_configs.tile_co
            tile_bb = set_dim_configs.tile_batch
            tile_hh = set_dim_configs.tile_h
            tile_ww = set_dim_configs.tile_w
            tile_mm = set_dim_configs.tile_m
            tile_kk = set_dim_configs.tile_k
            tile_nn = set_dim_configs.tile_n
            param = [tile_cici, tile_khkh, tile_kwkw, tile_coco,
                     tile_bb, tile_hh, tile_ww, tile_mm, tile_kk, tile_nn]
            tiling_param = (param)
        elif ("batch_matmul" in op_type) and (platform == "gpu"):
            # Build the "dim" attr string: "0 <idx> <tile> ..." with a new
            # "0 <i>" band prefix before every other tile value.
            tiling = [str(getattr(set_dim_configs, name)) for name in getattr(
                set_dim_configs, "_fields") if name.startswith("tiling")]
            tiling_param = ""
            for i, tile_v in enumerate(tiling):
                if i % 2 == 0:
                    tiling_param += "0 " + str(i) + " "
                tiling_param += tile_v + " "

            block_param = get_block_str_from_config(set_dim_configs)
            thread_param = get_thread_str_from_config(set_dim_configs)
            config = {
                'attrs': {
                    'dim': tiling_param,
                    'bind_block': block_param,
                    'bind_thread': thread_param
                },
                'best_cycles': tuner.best_time,
                'original_cycles': tuner.original_time,
                'date': str(datetime.datetime.now()),
                'tuning_time': tuner.tuning_time,
            }
        elif op_type == "json":
            from autotuning.runner import get_attr_from_config
            tiling_param = get_attr_from_config(set_dim_configs, index_table)
        elif op_type == "reduce_sum_gpu":
            print(set_dim_configs)
            # Each tile gets its own "0 <i>" prefix and a trailing " 1".
            tiling = [str(getattr(set_dim_configs, name))
                      for name in getattr(set_dim_configs, '_fields') if name.startswith('tiling')]
            tiling_param = ""
            for i, tile_v in enumerate(tiling):
                tiling_param += "0 " + str(i) + " "
                tiling_param += tile_v + " 1 "

            block_param = get_block_str_from_config(set_dim_configs)
            thread_param = get_thread_str_from_config(set_dim_configs)
            config = {
                'attrs': {
                    'dim': tiling_param,
                    'bind_block': block_param,
                    'bind_thread': thread_param
                },
                'best_cycles': tuner.best_time,
                'original_cycles': tuner.original_time,
                'date': str(datetime.datetime.now()),
                'tuning_time': tuner.tuning_time,
            }
        else:
            print(set_dim_configs)
            tiling = [[getattr(set_dim_configs, name), 1]
                      for name in getattr(set_dim_configs, '_fields') if name.startswith('tiling')]
            tiling_param = []
            for i, tile_v in enumerate(tiling):
                tiling_param.append(index_table[i] + tile_v)
            config = []
    else:
        # No valid result: an empty tiling_param marks "nothing to save".
        tiling_param = []

    # when there is a valid result, save the result
    if op_type in ("json", "extra_tune") and tuner.best_time not in error_time_list:
        config = {'attrs': tiling_param,
                  'best_cycles': tuner.best_time,
                  'original_cycles': tuner.original_time,
                  "date": str(datetime.datetime.now()),
                  "tuning time": tuner.tuning_time,
                  }
        if op_type == "json":
            config["file_name"] = str(key)
        compute, shape, dtype = generate_trait(desc)
        tuner.export_dim_configs(
            config, json_file.format(op_type), False, str(key))
        save_file = "autotuning/extra_tune.json" if extra_tune else repo_path
        with open(save_file, 'r') as f:
            repo = json.loads(f.read())
        # Only overwrite the repo entry when this run beat the stored cycles.
        if len(tiling_param) != 0 and (get_repo(repo, [compute, shape, dtype]) is None or
                                       int(tuner.best_time) < int(repo[compute][shape][dtype]["metadata"]["best_cycles"])):
            tuner.export_dim_configs_for_keys(config, save_file, False, [
                compute, shape, dtype, "metadata"])
    else:
        try:
            tuner.export_dim_configs(
                config, json_file.format(op_type), False, str(key))
        except UnboundLocalError as e:
            # `config` was never assigned above, i.e. no valid result.
            logger.warning(e)
            print("[save_tuning_result]: ", "no result is saved.")


def load_json_configs(op_type):
    """Load previously saved tiling configs for ``op_type``.

    Returns:
        The parsed dict from './res/<op_type>.json', or {} when the file
        is absent, unreadable, or contains invalid JSON.
    """
    dim_file = json_file.format(op_type)
    file_path = os.path.realpath(dim_file)
    if not os.path.isfile(file_path):
        return {}
    try:
        with open(file_path, 'r') as f:
            return json.load(f)
    # BUG FIX: the original caught only IOError, so a corrupt cache file
    # (json.JSONDecodeError, a ValueError subclass) crashed the launch
    # instead of degrading to "no configs".
    except (IOError, ValueError) as e:
        logger.debug(e)
        return {}


def read_shapes_from_file(debug_mode, save_res, all_space, conf_of_set_dim, op_type):
    """read tuning shapes from file

    Imports autotuning/shapes/<op_type>.py, reads its module-level
    ``shapes`` list, and profiles every entry.
    """
    file = importlib.import_module('autotuning.shapes.' + op_type)
    shapes = file.shapes
    for _, shp in enumerate(shapes):
        do_profiling(shp, debug_mode, save_res,
                     all_space, op_type, conf_of_set_dim)


def do_profiling(shp, debug_mode, save_res, all_space, op_type, conf_of_set_dim=None, tuning_attrs=None, skip_config_set=None, tuning_attrs_info=None):
    """do profiling

    Dispatches one shape entry to jobs() with the op-specific descriptor
    type (MatmulCubeDesc / ConvDesc / ConvBackpropDesc) or passes the raw
    shape through for GPU ops and the generic case.
    """
    # remove undeleted JOB files for previous shapes
    subprocess.run("rm -rf /var/log/npu/profiling/JOB*", shell=True)
    if op_type == 'matmul':
        # shp[2] holds the key tuple; the last element is dropped here —
        # presumably non-descriptor metadata (confirm against shapes files).
        key = shp[2][0:-1]
        logger.debug("start profiling: [%s]", str(key))
        desc = MatmulCubeDesc(*key)
        jobs(op_type, desc, debug_mode, save_res,
             all_space, key.__str__(), conf_of_set_dim)
        logger.debug("end profiling: [%s]", str(key))
    elif op_type.startswith('conv_backprop'):
        key = shp[2]
        logger.debug("start profiling: [%s]", str(key))
        desc = ConvBackpropDesc(*key)
        jobs(op_type, desc, debug_mode, save_res,
             all_space, key.__str__(), conf_of_set_dim)
        logger.debug("end profiling: [%s]", str(key))
    elif op_type.startswith('conv') and "gpu" not in op_type:
        key = shp[2]
        logger.debug("start profiling: [%s]", str(key))
        desc = ConvDesc(*key)
        jobs(op_type, desc, debug_mode, save_res,
             all_space, key.__str__(), conf_of_set_dim)
        logger.debug("end profiling: [%s]", str(key))
    elif op_type in ["batch_matmul_gpu", "conv_image2col_gemm_gpu", "reduce_sum_gpu"]:
        # GPU ops take the raw shape and forward the tuning-attr plumbing.
        logger.debug("start profiling: [%s]", str(shp))
        jobs(op_type, shp, debug_mode, save_res,
             all_space, conf_of_set_dim=conf_of_set_dim, tuning_attrs=tuning_attrs, skip_config_set=skip_config_set, tuning_attrs_info=tuning_attrs_info)
    else:
        key = shp
        logger.debug("start profiling: [%s]", str(key))
        desc = key
        jobs(op_type, desc, debug_mode, save_res,
             all_space, conf_of_set_dim=conf_of_set_dim, skip_config_set=skip_config_set)
        logger.debug("end profiling: [%s]", str(key))


def launch(op_type, debug_mode, save_res=False, desc=None, all_space=False,
           from_json=False, tuning_attrs=None, skip_config_set=None, tuning_attrs_info=None):
    """Entry point: tune one descriptor, or every shape listed for op_type.

    When ``desc`` is None the shapes are read from
    autotuning/shapes/<op_type>.py; otherwise only ``desc`` is profiled.
    ``from_json`` preloads previously saved tiling configs so already-tuned
    shapes can be skipped.
    """
    # get the existed tiling
    conf_of_set_dim = load_json_configs(op_type) if from_json else None

    if desc is None:
        read_shapes_from_file(debug_mode, save_res,
                              all_space, conf_of_set_dim, op_type)
    else:
        shp = desc
        do_profiling(shp, debug_mode, save_res, all_space, op_type,
                     tuning_attrs=tuning_attrs, skip_config_set=skip_config_set, tuning_attrs_info=tuning_attrs_info)

+ 407
- 0
tests/fuzz/tune_for_gpu/autotuning/kernel_compiler.py View File

@@ -0,0 +1,407 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Compile kernel module for operator"""
import os
from typing import NamedTuple
from base import TestBase
from akg.utils import kernel_exec as utils
from akg.utils import custom_tiling as ct_util
from akg.ops.nn import conv_bn1
from akg.ops.nn import conv, conv_backprop_input, conv_backprop_filter, batchmatmul
from test_op.batch_matmul import batch_matmul
from akg.ops.math_gpu.reduce_sum import reduce_sum
from akg.build_module import tuning_spaces
from akg.ops.nn import matmul
from test_run import batchmatmul_run, matmul_run
from .type_definitions import ConvDesc, ConvBackpropDesc, MatmulCubeDesc, ConvConfig, ConvBackpropInputConfig, ConvBackpropFilterConfig, MatmulCubeConfig
import numpy as np
from gen_random import random_gaussian
from .tuning_utils import merge_attrs


def get_spaces_gpu_manually(op_type: str, op_desc: NamedTuple = None):
    """Placeholder: manual GPU space generation is not implemented yet."""
    return None


def gen_kernel_conv(op_desc: ConvDesc, input_shape, index_table,
                    config: ConvConfig = None, idx=None, gen_tiling_spaces=False):
    """Compile kernel module for conv

    Builds the 'conv_poly' kernel from ``op_desc``; a non-None ``config``
    supplies the conv tile sizes and bypass flag, otherwise an empty 'dim'
    attr lets the scheduler decide. ``idx`` suffixes the kernel name so
    parallel builds do not collide; ``gen_tiling_spaces`` switches op_build
    into tuning (space-generation) mode.
    """
    # conv has no index table; getting one indicates a caller bug.
    if index_table is not None:
        raise RuntimeError('index_table should be none')
    kernel_name = "conv_poly"
    if idx is not None:
        kernel_name += str(idx)

    if config is None:
        attrs = {'dim': ""}
    else:
        tile_hh = config.tile_h
        tile_coco = config.tile_co
        tile_mm = config.tile_m
        tile_kk = config.tile_k
        tile_nn = config.tile_n
        tile_ww = config.tile_w
        tiling_param = [tile_hh, tile_coco, tile_mm, tile_kk, tile_nn, tile_ww]
        attrs = {'conv_tile': tiling_param, 'bypass': config.bypass}

    # With bias there is a third input tensor shape.
    if op_desc.use_bias:
        shape = [input_shape[0], input_shape[1], input_shape[2]]
    else:
        shape = [input_shape[0], input_shape[1]]
    conv_dtype = 'float16'

    return utils.op_build(conv.conv, [shape], [conv_dtype],
                          op_attrs=[op_desc.fmap_shape, op_desc.filter_shape, op_desc.pad, op_desc.stride,
                                    op_desc.dilation, op_desc.use_bias, attrs],
                          kernel_name=kernel_name, attrs=attrs, polyhedral=True, tuning=gen_tiling_spaces)


def gen_kernel_conv_bn1(op_desc: ConvDesc, input_shape, index_table, config: ConvConfig = None,
                        idx=None, gen_tiling_spaces=False):
    """Compile kernel module for conv_bn1

    Same flow as gen_kernel_conv, but builds the fused conv+bn1 operator
    under the kernel name 'conv_bn1_poly'.
    """
    # conv_bn1 has no index table; getting one indicates a caller bug.
    if index_table is not None:
        raise RuntimeError('index_table should be none')
    kernel_name = "conv_bn1_poly"
    if idx is not None:
        kernel_name += str(idx)

    if config is None:
        attrs = {'dim': ""}
    else:
        tile_hh = config.tile_h
        tile_coco = config.tile_co
        tile_mm = config.tile_m
        tile_kk = config.tile_k
        tile_nn = config.tile_n
        tile_ww = config.tile_w
        tiling_param = [tile_hh, tile_coco, tile_mm, tile_kk, tile_nn, tile_ww]
        attrs = {'conv_tile': tiling_param, 'bypass': config.bypass}

    # With bias there is a third input tensor shape.
    if op_desc.use_bias:
        shape = [input_shape[0], input_shape[1], input_shape[2]]
    else:
        shape = [input_shape[0], input_shape[1]]
    conv_dtype = 'float16'

    return utils.op_build(conv_bn1.conv_bn1, [shape], [conv_dtype],
                          op_attrs=[op_desc.fmap_shape, op_desc.filter_shape, op_desc.pad, op_desc.stride,
                                    op_desc.dilation, op_desc.use_bias, attrs],
                          kernel_name=kernel_name, attrs=attrs, polyhedral=True, tuning=gen_tiling_spaces)


def gen_kernel_matmul_cube(op_desc: MatmulCubeDesc, _, index_table,
                           config: MatmulCubeConfig = None, idx=None, gen_tiling_spaces=False):
    """Compile kernel module for matmul_cube

    Assembles the per-level tiling (batch placeholders, n/m tiles, the
    fixed 16x16 fractal tiles, and the k tiles) into a dim string and
    delegates to matmul_run.matmul_compile.
    """
    # matmul_cube has no index table; getting one indicates a caller bug.
    if index_table is not None:
        raise RuntimeError('index_table should be none')
    kernel_name = "matmul_cube_poly"
    if idx is not None:
        kernel_name += str(idx)
    if config is None:
        attrs = {'dim': ""}
    else:
        tiling_param = []
        # (1, 1) placeholders for each leading batch dimension.
        for _ in range(len(op_desc.x_shape) - 2):
            tiling_param.append((1, 1))
        # Non-positive l1 tile means "skip this dimension".
        if config.n_l1 > 0:
            tiling_param.append((config.n_l1, config.n_l0))
        if config.m_l1 > 0:
            tiling_param.append((config.m_l1, config.m_l0))
        tiling_param.extend([(16, 16), (16, 16), (config.k_l1, config.k_l0)])
        dim_info = ct_util.set_dims(tuple(tiling_param))
        attrs = {'dim': dim_info, 'bypass': config.bypass}
    return matmul_run.matmul_compile(op_desc.x_shape, op_desc.y_shape, op_desc.bias, op_desc.left_format,
                                     op_desc.right_format, op_desc.out_format, op_desc.adj_x, op_desc.adj_y,
                                     op_desc.dtype, op_desc.bias_dtype, op_desc.out_dtype, kernel_name,
                                     attrs, tuning=gen_tiling_spaces)


def gen_kernel_conv_backprop_input(op_desc: ConvBackpropDesc, _, index_table, config: ConvBackpropInputConfig = None,
                                   idx=None, gen_tiling_spaces=False):
    """Compile kernel module for conv_backprop_input

    Derives the NC1HWC0 dout shape and the fractal filter shape from the
    forward conv geometry in ``op_desc``, then builds the
    'conv_backprop_input_poly' kernel. ``config`` (when given) supplies the
    conv tile sizes.
    """
    # conv_backprop_input has no index table; getting one is a caller bug.
    if index_table is not None:
        raise RuntimeError('index_table should be none')
    kernel_name = "conv_backprop_input_poly"
    if idx is not None:
        kernel_name += str(idx)

    if config is None:
        attrs = {'dim': ""}
    else:
        tile_hh = config.tile_h
        tile_coco = config.tile_co
        tile_mm = config.tile_m
        tile_kk = config.tile_k
        tile_nn = config.tile_n
        tile_ww = config.tile_w
        tiling_param = [tile_hh, tile_coco, tile_mm, tile_kk, tile_nn, tile_ww]
        attrs = {'conv_tile': tiling_param}

    conv_dtype = 'float16'
    block_size = 16

    in_n, in_c, in_h, in_w = op_desc.fmap_shape
    cout, _, w_h, w_w = op_desc.filter_shape

    # Round channels up to a multiple of the cube block size.
    in_c = (in_c + block_size - 1) // block_size * block_size
    cout = (cout + block_size - 1) // block_size * block_size

    pad_top, pad_bottom, pad_left, pad_right = op_desc.pad
    stride_h, stride_w = op_desc.stride

    # Forward-conv output geometry: that output is this kernel's input.
    out_n = in_n
    out_c = cout
    out_h = (in_h + pad_top + pad_bottom - w_h) // stride_h + 1
    out_w = (in_w + pad_left + pad_right - w_w) // stride_w + 1

    x_shape = (out_n, out_c, out_h, out_w)
    w_shape = (cout, in_c, w_h, w_w)
    in_nn, in_cc, in_hh, in_ww = x_shape
    input_shape_nc1hwc0 = (in_nn, in_cc // block_size,
                           in_hh, in_ww, block_size)
    k_n, k_c, k_h, k_w = w_shape
    kernel_shape_nc1hwc0 = (k_n, k_c // block_size, k_h, k_w, block_size)
    k_n, _, k_h, k_w, _ = kernel_shape_nc1hwc0
    # Filter laid out in cube fractal format.
    kernel_shape_fractal = (k_c // block_size * k_h *
                            k_w, k_n // block_size, block_size, block_size)

    shape = [input_shape_nc1hwc0, kernel_shape_fractal]

    return utils.op_build(conv_backprop_input.conv_backprop_input, [shape], [conv_dtype],
                          op_attrs=[op_desc.fmap_shape, op_desc.filter_shape, op_desc.pad,
                                    op_desc.stride, op_desc.dilation, attrs],
                          kernel_name=kernel_name, attrs=attrs, polyhedral=True, tuning=gen_tiling_spaces)


def gen_kernel_conv_backprop_filter(op_desc: ConvBackpropDesc, _, index_table, config: ConvBackpropFilterConfig = None,
                                    idx=None, gen_tiling_spaces=False):
    """Compile kernel module for conv_backprop_filter.

    Parameters
    ----------
    op_desc: ConvBackpropDesc
        fmap/filter shapes plus pad, stride and dilation
    _:
        unused placeholder (kept for a uniform gen-function signature)
    index_table:
        must be None here; tiling comes from `config` instead
    config: ConvBackpropFilterConfig
        tiling configuration; when None, auto tiling is used ('dim': "")
    idx: int
        optional kernel index appended to the kernel name
    gen_tiling_spaces: bool
        forwarded to utils.op_build as `tuning`

    Returns
    -------
    result of utils.op_build (kernel module, or tuning spaces)
    """
    if index_table is not None:
        raise RuntimeError('index_table should be none')
    kernel_name = "conv_backprop_filter_poly"
    if idx is not None:
        kernel_name += str(idx)

    # Translate the tiling config into the 'conv_tile' attribute; the
    # order of tiling_param is the order the poly compiler expects.
    if config is None:
        attrs = {'dim': ""}
    else:
        tile_cici = config.tile_ci
        tile_khkh = config.tile_kh
        tile_kwkw = config.tile_kw
        tile_coco = config.tile_co
        tile_bb = config.tile_batch
        tile_hh = config.tile_h
        tile_ww = config.tile_w
        tile_mm = config.tile_m
        tile_kk = config.tile_k
        tile_nn = config.tile_n
        tiling_param = [tile_cici, tile_khkh, tile_kwkw, tile_coco, tile_bb, tile_hh, tile_ww,
                        tile_mm, tile_kk, tile_nn]
        attrs = {'conv_tile': tiling_param}

    conv_dtype = 'float16'
    block_size = 16  # 16-element alignment unit of the 5D/fractal layouts

    in_n, in_c, in_h, in_w = op_desc.fmap_shape
    cout, _, w_h, w_w = op_desc.filter_shape

    # Round channel counts up to a multiple of block_size.
    in_c = (in_c + block_size - 1) // block_size * block_size
    cout = (cout + block_size - 1) // block_size * block_size

    pad_top, pad_bottom, pad_left, pad_right = op_desc.pad
    stride_h, stride_w = op_desc.stride

    # Forward-conv output shape; its gradient is one input of this op.
    out_n = in_n
    out_c = cout
    out_h = (in_h + pad_top + pad_bottom - w_h) // stride_h + 1
    out_w = (in_w + pad_left + pad_right - w_w) // stride_w + 1

    x_shape = (in_n, in_c, in_h, in_w)
    y_shape = (out_n, out_c, out_h, out_w)
    in_n, in_c, in_h, in_w = x_shape
    # NC1HWC0: channel dim split into C1 (= C // block_size) x C0 (= block_size).
    input_shape_nc1hwc0 = (in_n, in_c // block_size, in_h, in_w, block_size)
    o_n, o_c, o_h, o_w = y_shape
    kernel_shape_nc1hwc0 = (o_n, o_c // block_size, o_h, o_w, block_size)
    o_n, o_c1, o_h, o_w, o_c0 = kernel_shape_nc1hwc0
    # Fractal layout of the output gradient: H*W rounded up into
    # block_size-sized M tiles.
    mo = (o_h * o_w + block_size - 1) // block_size
    mi = block_size
    kernel_shape_fractal = (o_n, o_c1, mo, mi, o_c0)

    input_shape = [kernel_shape_fractal, input_shape_nc1hwc0]

    return utils.op_build(conv_backprop_filter.conv_backprop_filter, [input_shape], [conv_dtype],
                          op_attrs=[op_desc.fmap_shape, op_desc.filter_shape, op_desc.pad,
                                    op_desc.stride, op_desc.dilation, attrs],
                          kernel_name=kernel_name, attrs=attrs, polyhedral=True, tuning=gen_tiling_spaces)


def gen_kernel_for_vector(op_desc, _, index_table=None, config: NamedTuple = None, idx=None, gen_tiling_spaces=False):
    """Compile kernel module for vector operators.

    Parameters
    ----------
    op_desc:
        test-case description consumed by TestBase.ana_args
    _:
        unused placeholder (kept for a uniform gen-function signature)
    index_table: list
        per-axis index prefixes used to build the 'dim' attribute
    config: NamedTuple
        tiling configuration; fields whose name starts with 'tiling' are used
    idx: int
        optional kernel index appended to the kernel name
    gen_tiling_spaces: bool
        when True, the op function returns (spaces, expect, params) and this
        function returns them as one list

    Returns
    -------
    kernel module, or [*spaces, expect, param_for_mod] when gen_tiling_spaces

    Raises
    ------
    Exception
        when the underlying op function fails to compile (original error
        is chained as __cause__)
    """
    test_base = TestBase()
    test_base.params_init(op_desc[0][0:4] + str(idx), os.getcwd())
    kernel_name = "poly_"
    if idx is not None:
        kernel_name += str(idx)
    if config is None:
        attrs = {'dim': ""}
    else:
        # Pair every 'tiling*' field with a cut factor of 1, prefix each
        # entry with its axis index, and render the result into a 'dim'
        # string via ct_util.set_dims.
        tiling = [[getattr(config, name), 1] for name in getattr(
            config, '_fields') if name.startswith('tiling')]
        tiling_param = []
        for i, element in enumerate(tiling):
            tiling_param.append(index_table[i] + element)
        dim_info = ct_util.set_dims(tuple(tiling_param))
        attrs = {'dim': dim_info}
    _, func, args, kwargs = test_base.ana_args(op_desc)
    # Inject the tuning attributes either into the explicit 'attrs' kwarg
    # or into the first dict positional argument, whichever the case uses.
    if 'attrs' in kwargs.keys():
        kwargs['attrs']['dim'] = attrs['dim']
        kwargs['attrs']['tuning'] = gen_tiling_spaces
        kwargs['attrs']['kernel_name'] = kernel_name
    else:
        for arg_ in args:
            if isinstance(arg_, dict):
                arg_['dim'] = attrs['dim']
                arg_['tuning'] = gen_tiling_spaces
                arg_['kernel_name'] = kernel_name
                break
    try:
        if gen_tiling_spaces:
            mod, expect, param_for_mod = func(*args, **kwargs)
            mod = list(mod)
            mod.append(expect)
            mod.append(param_for_mod)
        else:
            mod = func(*args, **kwargs)
    except BaseException as e:
        print("Compile ERROR message:", e)
        print(func)
        print("Compile ERROR")
        # Chain the original exception so the root cause is not lost.
        raise Exception("Compile ERROR") from e

    return mod


def gen_kernel_batch_matmul_gpu(op_desc, _, index_table=None,
                                config: NamedTuple = None, idx=None,
                                gen_tiling_spaces=False, need_tune_json=None):
    """Compile kernel module for batch_matmul in gpu.

    Placeholder: batch_matmul tuning on GPU is not implemented yet, so
    this always returns None regardless of its arguments.
    """
    return None


def gen_kernel_reduce_sum_gpu(op_desc, _, index_table=None,
                              config: NamedTuple = None, idx=None, gen_tiling_spaces=False, need_tune_json=None):
    """Compile kernel module for reduce_sum in gpu.

    Parameters
    ----------
    op_desc:
        operator description; op_desc[2] holds the build attributes
    _:
        unused placeholder (kept for a uniform gen-function signature)
    index_table: list
        unused here; tiling is carried in the attrs/config
    config: NamedTuple
        tuning configuration merged into the attrs when given
    idx: int
        optional kernel index appended to the (currently unused) kernel name
    gen_tiling_spaces: bool
        when True, build in tuning mode and return spaces plus test data
    need_tune_json:
        extra tuning description forwarded to merge_attrs

    Returns
    -------
    kernel module, or [spaces, set_dim_key, expect, [input_for_mod, output]]
    when gen_tiling_spaces

    Raises
    ------
    Exception
        when op_build fails (original error is chained as __cause__)
    """
    # NOTE(review): kernel_name is computed but op_build below is called
    # with kernel_name="reduce_sum" — confirm whether the index suffix is
    # intentionally dropped.
    kernel_name = "reduce_sum_gpu_"
    if idx is not None:
        kernel_name += str(idx)
    attrs = op_desc[2]
    if config is not None:
        attrs = merge_attrs(attrs, config, need_tune_json)

    try:
        if gen_tiling_spaces:
            # NOTE: don't use this process for reduce spaces generation,
            # see function: "_get_space_reduce_gpu_manually".
            from .tiling_strategies_gpu import reduce_gpu_tiling_strategy
            spaces, set_dim_key = utils.op_build(reduce_sum, (attrs.in_shape, ),
                                                 (attrs.in_dtype,
                                                  ), kernel_name="reduce_sum",
                                                 op_attrs=[
                                                     attrs.axis, attrs.keepdims],
                                                 attrs={"target": "cuda",
                                                        "enable_akg_reduce_lib": attrs.enable_akg_reduce_lib,
                                                        "enable_atomic_add": attrs.enable_atomic_add,
                                                        "custom_tiling": reduce_gpu_tiling_strategy(attrs.in_shape, attrs.axis)}, tuning=True)

            # Generate reference data so the caller can verify candidates.
            from test_ms_reduce_sum import gen_data
            input_for_mod, output, expect = gen_data(
                attrs.in_shape, attrs.in_dtype, attrs.axis, attrs.keepdims)
            return [spaces, set_dim_key, expect, [input_for_mod, output]]
        else:
            mod = utils.op_build(reduce_sum, (attrs.in_shape, ),
                                 (attrs.in_dtype,
                                  ), kernel_name="reduce_sum",
                                 op_attrs=[
                                     attrs.axis, attrs.keepdims],
                                 attrs={"target": "cuda",
                                        "enable_akg_reduce_lib": attrs.enable_akg_reduce_lib,
                                        "dim": attrs.dim,
                                        "bind_block": attrs.bind_block,
                                        "bind_thread": attrs.bind_thread,
                                        "enable_atomic_add": attrs.enable_atomic_add})
            return mod
    except BaseException as e:
        print("Compile ERROR message:", e)
        print(reduce_sum)
        print("Compile ERROR")
        # Chain the original exception so the root cause is not lost.
        raise Exception("Compile ERROR") from e


def gen_kernel_conv_image2col_gemm_gpu(op_desc, _, index_table=None, config: NamedTuple = None, idx=None, gen_tiling_spaces=False, need_tune_json=None):
    """Compile kernel module for convolution in gpu using image2col+gemm.

    Placeholder: this path is not implemented yet, so the function always
    returns None regardless of its arguments.
    """
    return None


# Dispatch table: operator type -> kernel-compile function.  Operator
# types not listed here fall back to gen_kernel_for_vector (see
# compile_kernel below).
_compile_kernel_func = {
    'conv': gen_kernel_conv,
    'conv_bn1': gen_kernel_conv_bn1,
    'conv_backprop_input': gen_kernel_conv_backprop_input,
    'conv_backprop_filter': gen_kernel_conv_backprop_filter,
    'matmul': gen_kernel_matmul_cube,
    'reduce_sum_gpu': gen_kernel_reduce_sum_gpu,
    'batch_matmul_gpu': gen_kernel_batch_matmul_gpu,
    'conv_image2col_gemm_gpu': gen_kernel_conv_image2col_gemm_gpu,
}


def compile_kernel(op_type: str, op_desc: NamedTuple, input_shape=None, index_table=None,
                   config_param: NamedTuple = None, idx: int = None, gen_tiling_spaces: bool = False, need_tune_json=None):
    """Generate kernel module for operator.

    Parameters
    ----------
    op_type: str
        operator name; looked up in _compile_kernel_func, with
        gen_kernel_for_vector as the fallback for unknown types
    op_desc: NamedTuple
        operator definition parameters
    input_shape:
        shapes of the operator inputs (forwarded to the gen function)
    index_table:
        per-axis index prefixes for tiling dims (forwarded)
    config_param: NamedTuple
        operator config parameters
    idx: int
        operator idx(th) kernel
    gen_tiling_spaces: bool
        parameter passed to utils.op_build, whether to get spaces instead of stmt
    need_tune_json:
        extra tuning description, forwarded only to gpu operators

    Returns
    -------
    kernel module if gen_tiling_spaces is False,
    else [space_res, key, expect, input_for_mod]
    """
    # Unknown operator types use the generic vector path.
    gen_func = _compile_kernel_func.get(op_type, gen_kernel_for_vector)
    if gen_tiling_spaces:
        space_res, key, expect, input_for_mod = gen_func(op_desc, input_shape, index_table, config_param,
                                                         idx, gen_tiling_spaces)
        return [space_res, key, expect, input_for_mod]
    # GPU gen functions additionally accept the tuning json.
    if "gpu" in op_type:
        return gen_func(op_desc, input_shape, index_table,
                        config_param, idx, gen_tiling_spaces, need_tune_json=need_tune_json)
    return gen_func(op_desc, input_shape, index_table,
                    config_param, idx, gen_tiling_spaces)

+ 243
- 0
tests/fuzz/tune_for_gpu/autotuning/runner.py View File

@@ -0,0 +1,243 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Runner for compile and execute a configs of an operator on device"""
import time
import multiprocessing
import logging
import json
import os
import subprocess
import time
from typing import NamedTuple
import numpy as np
from akg import composite
from akg.utils import custom_tiling as ct_util
from akg.utils import kernel_exec as utils
from .kernel_compiler import compile_kernel
from .test_data_generators import gen_data
from .tuning_utils import *

# Module logger for the autotuning runner.
logger = logging.getLogger('fuzz.tune.autotuning.runner')

# Sentinel "run times" used to encode failures; order matters — the
# per-kind aliases below are unpacked positionally from this list.
error_time_list = [
    9999999999.0,
    9999999998.0,
    9999999997.0,
    9999999996.0,
]

# Human-readable label for each failure sentinel above.
error_time_string = dict(zip(error_time_list,
                             ('run_failed', 'precision_error',
                              'compile_failed', 'timeout')))

run_failed_time, precision_error_time, compile_fail_time, timeout_time = error_time_list


def get_attr_from_config(config, index_table):
    """Split a config NamedTuple into build attrs and a 'dim' tiling string.

    Fields whose name starts with 'tiling' are collected (each paired with
    a cut factor of 1), prefixed with their axis index from `index_table`,
    and rendered via ct_util.set_dims into attrs['dim'].  All other fields
    are copied into the returned dict unchanged.

    Parameters
    ----------
    config: NamedTuple
        one tuning configuration
    index_table: list
        per-axis index prefixes; only consulted when tiling fields exist

    Returns
    -------
    dict of build attributes ('dim' is set only when tiling info exists)
    """
    tiling = []
    attrs = {}
    tuning_dict = config._asdict()
    for key, value in tuning_dict.items():
        if key.startswith('tiling'):
            tiling.append([value, 1])
        else:
            attrs[key] = value
    if tiling:
        tiling_param = []
        for i, element in enumerate(tiling):
            tiling_param.append(index_table[i] + element)
        attrs['dim'] = ct_util.set_dims(tuple(tiling_param))
    else:
        print("No tiling info. Use auto tiling.")
    return attrs


class KernelRunner:
    """kernel runner
    This runner will compile and execute configs of an operator, and return their running times.

    Parameters
    ----------
    op_type: str
        The name of operator
    op_desc: NamedTuple
        The definition parameters of operator
    timeout: int
        Timeout for running one config
    repeat_times:
        Run one config repeat_times
    """

    def __init__(self, op_type: str, op_desc: NamedTuple,
                 index_table: list, self_attrs: list, timeout: int = 600,
                 repeat_times: int = 2, input_data=None,
                 expect=None, mod_output_param=None, is_all_space=True,
                 skip_config_set=None, need_tune_json=None):
        self.op_type = op_type
        self.op_desc = op_desc
        self._index_table = index_table
        self.self_attrs = self_attrs
        self.run_kernel_time = 0.0
        self.tune_self_attrs = True
        self.timeout = timeout
        self.repeat_times = repeat_times
        self.mod_output_param = mod_output_param
        self.is_all_space = is_all_space
        self.skip_config_set = skip_config_set
        self.need_tune_json = need_tune_json
        # Generate test data on demand when the caller did not supply any;
        # gen_data may return a dict carrying explicit output buffers.
        if input_data is None:
            self.input, self.expect = gen_data(op_type, op_desc)
            if isinstance(self.input, dict):
                self.input, self.mod_output_param = self.input['args'], self.input['outputs']
        else:
            self.input, self.expect = input_data, expect
        self.input_shape = [x.shape for x in self.input]

    def info(self):
        # Print the accumulated wall time spent in run().
        print('run kernel time:', self.run_kernel_time)

    def run_one_kernel(self, run_times, idx, config, best_time=np.inf, is_auto=False):
        """Compile and execute a config of the operator on device"""

        # Configs already known to be bad are skipped; -1 marks "skipped".
        if json.dumps(config.input._asdict()) in self.skip_config_set:
            print("CONFIG SKIP:", json.dumps(config.input._asdict()))
            run_times[idx] = -1
            return

        time_one_kernel_start = time.time()
        logger.debug('compile %dth kernel', idx)
        # Round-robin the available GPUs across worker processes.
        gpu_devices_list = get_available_gpu_num()
        device_id = gpu_devices_list[idx % len(gpu_devices_list)]
        logger.debug('run %dth kernel', idx)
        logger.debug('++++++++++++++++++++++=device_id')
        logger.debug(device_id)
        logger.debug('++++++++++++++++++++++=device_id')
        try:
            time_start_build = time.time()
            logger.debug(config)
            # Composite (json/extra_tune) ops build through composite.build;
            # all other op types go through compile_kernel.
            if self.op_type in ["json", "extra_tune"]:
                if is_auto:
                    mod = composite.build(self.op_desc)
                    if self.op_type == "extra_tune":
                        del os.environ['MS_GRAPH_KERNEL_TILING']
                else:
                    attrs = get_attr_from_config(
                        config.input, self._index_table)
                    if os.environ['RUNTIME_MODE'] == "gpu":
                        attrs['target'] = "cuda"
                    mod = composite.build(self.op_desc, attrs, use_repo=False)
            else:
                mod = compile_kernel(self.op_type, self.op_desc, self.input_shape, self._index_table,
                                     None if is_auto else config.input, idx, need_tune_json=self.need_tune_json)
            time_end_build = time.time()
            logger.debug("build module time: %f",
                         time_end_build - time_start_build)
            logger.debug('finished compile %dth kernel', idx)
        except BaseException as e:
            logger.debug("Compile Failed: [%s] : %s", "origin" if is_auto else str(
                config.input), str(e))
            run_times[idx] = compile_fail_time
            return

        # Assume failure; a successful launch below overwrites this.
        run_times[idx] = run_failed_time

        try:
            # NOTE: in gpu tuning, it is no need to use this repeat_times,
            # repeat_time has been setted in mod_launch in tuning mode.
            for _ in range(self.repeat_times):
                stat_info = {}
                try:
                    time_start_launch = time.time()
                    # NOTE(review): when mod_output_param is set, nothing is
                    # launched and stat_info has no 'run_time', so the
                    # np.minimum below raises KeyError — confirm this path
                    # is unused in gpu tuning.
                    if self.mod_output_param is not None:
                        pass
                    else:
                        output, stat_info = utils.mod_launch(
                            mod, self.input, tuning=True, device_id=device_id, repeat_time=40)
                        # Wrong numerics count as a distinct failure kind.
                        if not np.allclose(output, self.expect, rtol=5e-03, atol=5e-03, equal_nan=True):
                            stat_info['run_time'] = precision_error_time
                            logger.debug("Precision Error: [%s]",
                                         "origin" if config is None else str(config.input))

                        time_end_launch = time.time()
                        logger.debug("mod launch time: %f",
                                     time_end_launch - time_start_launch)
                except BaseException as e:
                    logger.debug("Run Failed: [%s] : %s", str(
                        config.input), str(e))
                    stat_info['run_time'] = run_failed_time
                # Keep the best (smallest) time over the repeats.
                run_times[idx] = np.minimum(
                    run_times[idx], stat_info['run_time'])
        finally:
            logger.debug('end of %dth kernel', idx)
            time_one_kernel_end = time.time()
            logger.debug('run one kernel time: %f',
                         time_one_kernel_end - time_one_kernel_start)
        return

    def run(self, configs, best_time=np.inf, is_auto_set_dim=False, all_space=False):
        """Compile and execute a batch config of the operator on device"""
        start = time.time()
        logger.setLevel(logging.DEBUG)
        logger.debug("gen cce kernels batch: %d kernels", len(configs))
        subprocess.run("rm -rf ./jobs/JOB*", shell=True)

        # One process per config; run_times is shared so children can
        # report their results back to this process.
        process_jobs = []
        run_times = multiprocessing.Manager().list(
            np.full((len(configs),), compile_fail_time))
        for idx, config in enumerate(configs):
            p = multiprocessing.Process(target=self.run_one_kernel,
                                        args=(run_times, idx, config, best_time, is_auto_set_dim))
            process_jobs.append(p)
            p.start()
        # After the first timeout, remaining processes are not waited on:
        # any that are still alive are marked timed out and terminated.
        timeout_error = False
        for idx, p in enumerate(process_jobs):
            if not timeout_error:
                p.join(timeout=self.timeout)
            if p.is_alive():
                timeout_error = True
                logger.debug("Timeout Error: [%s]", str(configs[idx].input))
                run_times[idx] = timeout_time
                p.terminate()

        process_end = time.time()
        logger.debug("process time: %f", process_end - start)
        # clean the profiling directory
        # NOTE(review): tune_device / tune_num are read but never used here
        # — possibly a leftover from the Ascend profiling cleanup.
        tune_device = int(os.environ['DEVICE_ID'])
        tune_num = int(os.environ['DEVICE_TOTAL_NUM'])
        if os.environ['RUNTIME_MODE'] == "gpu":
            subprocess.run("rm -rf cuda_meta_*", shell=True)
        else:
            pass
        end = time.time()
        logger.debug("run kernels time: %f", end - start)
        self.run_kernel_time += end - start

        # Log every result, resolving failure sentinels to their labels.
        for idx, config in enumerate(configs):
            if run_times[idx] not in error_time_list:
                logger.debug("KernelRunTime : [%s] : %s", str(
                    configs[idx].input), str(run_times[idx]))
            else:
                logger.debug("KernelRunTime : [%s] : %s",
                             str(configs[idx].input), str(error_time_string[run_times[idx]]))

        return run_times

+ 217
- 0
tests/fuzz/tune_for_gpu/autotuning/space.py View File

@@ -0,0 +1,217 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Config space"""
from abc import ABCMeta, abstractmethod
from typing import NamedTuple, List
import random
import numpy as np


class ConfigEntity:
    """A single point in a config search space.

    Wraps one NamedTuple of tuning parameters together with its position
    (id) inside the owning space.
    """

    def __init__(self, input_id: int, input_space: NamedTuple):
        self._id = input_id
        self._space = input_space
        self._space_type = type(input_space)

    def __len__(self):
        return len(self._space)

    def __str__(self):
        return '{}: {}'.format(self._id, self._space)

    def __repr__(self):
        return self.__str__()

    @property
    def input_id(self):
        """Index of this config inside its space."""
        return self._id

    @property
    def input_type(self):
        """The NamedTuple subclass the config was built from."""
        return self._space_type

    @property
    def input(self):
        """The underlying NamedTuple of parameter values."""
        return self._space

    @property
    def feature(self):
        """Feature vector used by tuners; identical to `input`."""
        return self._space


class ConfigSpace(metaclass=ABCMeta):
    """Searching space of configs.

    Abstract base class: concrete spaces (e.g. ListConfigSpace) decide how
    configs are stored and fetched.  `input_type` is the NamedTuple
    subclass that every config in the space must share; its field names
    become the space's dimension names.
    """

    def __init__(self, input_type):
        self._input_type = input_type
        # Dimension names come from the NamedTuple's field names.
        self._dim_names = getattr(self._input_type, '_fields')

        self._configs = []  # List[ConfigEntity]

    @abstractmethod
    def reset_fetch(self):
        """reset fetch state so all configs become fetchable again"""
        pass

    @abstractmethod
    def has_next(self) -> bool:
        """whether any config remains to be fetched"""
        pass

    @abstractmethod
    def fetch_index(self) -> int:
        """fetch a random index of config"""

    @abstractmethod
    def fetch_config(self) -> ConfigEntity:
        """fetch a random config"""

    @abstractmethod
    def random_walk(self, p: int) -> int:
        """find a neighbor hood of the p-th ConfigEntity, which only
        differs with p in at most one dimension"""

    def get(self, idx: int) -> ConfigEntity:
        """get the `idx`-th config of the space"""
        return self._configs[idx]

    @property
    def configs(self):
        # All ConfigEntity objects added so far, in insertion order.
        return self._configs

    @property
    def dim_names(self):
        # Field names of the config NamedTuple type.
        return self._dim_names

    @property
    def input_type(self):
        # The NamedTuple subclass configs are instances of.
        return self._input_type

    @property
    # @abstractmethod
    def length(self):
        # Number of configs currently in the space.
        return len(self.configs)


class ConfigTrie:
    """Trie node for config entities.

    Each trie level corresponds to one dimension of the config tuple
    (skipping `last_dim`); leaf nodes hold lists of config ids.  Used by
    ListConfigSpace.random_walk to draw a config differing from a given
    one in at most a single dimension.
    """

    def __init__(self):
        # Children: value -> ConfigTrie for inner nodes; replaced by a
        # list of config ids once this node becomes a leaf.
        self.ch = dict()

    def add(self, config: ConfigEntity, last_dim: int):
        """add a ConfigEntity"""
        cur = self
        for i, x in enumerate(config.input):
            if i == last_dim:
                # The varying dimension is excluded from the key path.
                continue
            if x not in cur.ch:
                cur.ch[x] = ConfigTrie()
            if not isinstance(cur.ch, dict):
                raise TypeError('none-leaf node should have a dict of childs')
            cur = cur.ch[x]

        if not isinstance(cur.ch, list):
            # First config reaching this leaf: switch storage to an id list.
            cur.ch = []
        cur.ch.append(config.input_id)

    def fetch_random(self, config: ConfigEntity, last_dim: int) -> int:
        """randomly fetch the index of a ConfigEntity the same with `config` except for the `last_dim`-th dimension"""
        cur = self
        for i, x in enumerate(config.input):
            if i == last_dim:
                continue
            if not isinstance(cur.ch, dict):
                raise TypeError('none leaf node should have a dict of childs')
            if x not in cur.ch:
                raise RuntimeError('no element found')
            cur = cur.ch[x]
        if not cur.ch:
            raise RuntimeError('no element found')
        if len(cur.ch) == 1:
            # Only the query config itself lives at this leaf.
            return cur.ch[0]
        # Re-draw until we pick a config other than the query one.
        idx = config.input_id
        while idx == config.input_id:
            idx = random.choice(cur.ch)
        return idx


class ListConfigSpace(ConfigSpace):
    """Searching space of configs, which stores all possible configs in a list"""

    def __init__(self, input_type):
        super(ListConfigSpace, self).__init__(input_type)

        # One trie per dimension: trie i indexes configs by all dims != i,
        # which lets random_walk vary exactly one dimension.
        self.__config_tries = [ConfigTrie() for _ in range(len(self._dim_names))]
        # Indices of configs not yet handed out by fetch_index.
        self.__fetch_pool = []

    def reset_fetch(self):
        """reset fetch state"""
        self.__fetch_pool = [i for i in range(len(self._configs))]

    def fetch_scope(self, start, end):
        # Restrict fetching to config indices in [start, end).
        self.__fetch_pool = [i for i in range(start, end)]

    def has_next(self) -> bool:
        return len(self.__fetch_pool) > 0

    def fetch_index(self) -> int:
        """fetch a random index of config"""
        # Swap-with-last removal keeps each draw O(1).
        idx = np.random.randint(len(self.__fetch_pool))
        ret = self.__fetch_pool[idx]
        self.__fetch_pool[idx] = self.__fetch_pool[-1]
        self.__fetch_pool.pop()
        return ret

    def fetch_next_index(self) -> int:
        """fetch next index of config"""
        # NOTE(review): this assumes __fetch_pool is a contiguous index
        # range starting at __fetch_pool[0] (as set by reset_fetch or
        # fetch_scope) and returns its last element — confirm callers
        # never mix this with fetch_index, which breaks that invariant.
        idx = len(self.__fetch_pool) - 1 + self.__fetch_pool[0]
        self.__fetch_pool.pop()
        return idx

    def fetch_config(self) -> ConfigEntity:
        """fetch a random config"""
        return self.get(self.fetch_index())

    def add(self, input_space: NamedTuple):
        """add a new config to space"""
        if not isinstance(input_space, self._input_type):
            raise TypeError('invalid config input space type, got {} expected {}'.format(type(input_space),
                                                                                         self._input_type))
        config = ConfigEntity(len(self._configs), input_space)
        # New config is immediately fetchable and indexed in every trie.
        self.__fetch_pool.append(len(self._configs))
        for i in range(len(self._dim_names)):
            self.__config_tries[i].add(config, i)
        self._configs.append(config)

    def random_walk(self, p: int) -> int:
        """find a neighbor hood of the p-th ConfigEntity, which only differs with p in at most one dimension"""
        dim = np.random.randint(len(self._dim_names))
        return self.__config_tries[dim].fetch_random(self._configs[p], dim)

    @property
    def length(self):
        return len(self._configs)

    @classmethod
    def from_list(cls, configs: List[NamedTuple]):
        """build a space containing every config of a non-empty list"""
        if not isinstance(configs, list):
            raise TypeError('configs must be of list type, got %s' % type(configs))
        if not configs:
            raise ValueError('configs must be non-empty')
        space = cls(type(configs[0]))
        for config in configs:
            space.add(config)
        return space

+ 753
- 0
tests/fuzz/tune_for_gpu/autotuning/space_generators.py View File

@@ -0,0 +1,753 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""space generating functions for operators"""
from functools import partial
from typing import NamedTuple
from collections import namedtuple
from test_run import matmul_run
from akg.utils import validation_check as vc_util
from .type_definitions import ConvDesc, ConvBackpropDesc, MatmulCubeDesc, ConvConfig, ConvBackpropInputConfig, ConvBackpropFilterConfig, MatmulCubeConfig
from .space import ListConfigSpace
from .kernel_compiler import compile_kernel
from .gen_spaces_gpu import _get_space_reduce_gpu_manually
from tqdm import tqdm
from enum import Enum

# Axis index -> GPU dimension suffix; presumably used to name block/thread
# bindings ("x"/"y"/"z") — verify at use sites.
GPU_IDX_TO_STR = {0: "x", 1: "y", 2: "z"}

class GpuSpacePolicy(Enum):
    """Policy to expand tile candidates with block and thread.

    Values are the literal policy names (FULL / BMM / REDUCE_ALL /
    REDUCE_X / REDUCE_Y) as they appear in tuning descriptions.
    """
    FULL = "FULL"
    BMM = "BMM"
    REDUCE_ALL = "REDUCE_ALL"
    REDUCE_X = "REDUCE_X"
    REDUCE_Y = "REDUCE_Y"


def gen_bool_list(attr_list):
    """Enumerate all True/False assignments for the given attributes.

    Produces one boolean list per combination, in the same order as
    itertools.product([True, False], repeat=len(attr_list)), which matches
    the historical hand-rolled expansion.  An empty attr_list yields an
    empty list (no combinations), preserving the original behavior.

    Parameters
    ----------
    attr_list: sequence
        attributes to enumerate; only its length is used

    Returns
    -------
    list of list of bool, of length 2 ** len(attr_list)
    """
    from itertools import product  # local import: keep module deps unchanged
    if not attr_list:
        return []
    return [list(combo) for combo in product([True, False], repeat=len(attr_list))]


def _get_space_vector(op_type: str, op_desc):
    """Build the tiling config space of a vector operator.

    Compiles the operator once in space-generation mode, validates the
    returned description, and wraps every tiling candidate into a
    ListConfigSpace of dynamically named tuples.
    """
    space_res, key, expect, input_for_mod = compile_kernel(op_type, op_desc, None, None, None, 0,
                                                           gen_tiling_spaces=True)

    if space_res is None:
        raise RuntimeError('no space returned')
    if any(field not in space_res for field in ('index', 'tuning_space')):
        raise RuntimeError('invalid space returned')

    index_table = space_res['index']
    tiling_spaces = space_res['tuning_space']
    if not tiling_spaces:
        raise RuntimeError('empty tiling spaces')

    # One 'tiling_i' dimension per axis of the first candidate.
    input_type = namedtuple(
        op_type, ['tiling_%d' % i for i in range(len(tiling_spaces[0]))])
    space = ListConfigSpace(input_type)
    for candidate in tiling_spaces:
        space.add(input_type(*candidate))
    return index_table, space, key, expect, input_for_mod


def _get_space_conv(op_desc: ConvDesc):
    """get config space of convolution.

    Enumerates every (tile_h, tile_co, tile_m, tile_k, tile_n, tile_w,
    bypass) combination that fits the on-chip buffer budgets and collects
    them into a ListConfigSpace of ConvConfig entries.

    Returns
    -------
    tuple (None, config_space, str(op_desc), None, None) — only the space
    and its key are meaningful for convolution.
    """
    if not isinstance(op_desc, ConvDesc):
        raise TypeError('op_desc must be ConvDesc')

    stride_ = op_desc.stride
    pad_ = op_desc.pad
    dilation_ = op_desc.dilation
    vc_util.convolution_format_check(
        op_desc.fmap_shape, op_desc.filter_shape, pad_, stride_, dilation_)
    config_space = ListConfigSpace(ConvConfig)

    # if double buff is not enabled, set its value to 1
    size_scale = 1

    # Buffer budgets in bytes; names follow the on-chip memories L1/L0A/L0B/L0C.
    l1_max_size = (1024 * 1024) // size_scale
    l0a_max_size = (64 * 1024) // size_scale
    l0b_max_size = (64 * 1024) // size_scale
    l0c_max_size = ((256 - 8) * 1024) // size_scale // 2

    _, in_c, in_h, in_w = op_desc.fmap_shape
    k_n, _, k_h, k_w = op_desc.filter_shape
    padding = (pad_[0], pad_[1], pad_[2], pad_[3])
    p_top, p_bottom, p_left, p_right = padding
    s_h, s_w = stride_

    in_c = ((in_c - 1) // 16 + 1) * 16  # channels rounded up to 16
    tile_c = in_c
    tile_co_start = 16

    data_len = 2  # bytes per fp16 element

    # Padded extents, window counts and largest useful H/W tiles.
    h_max = in_h + p_top + p_bottom
    win_h = (h_max - k_h) // s_h + 1
    h_max = (h_max - k_h) // s_h * s_h + k_h
    w_max = in_w + p_left + p_right
    win_w = (w_max - k_w) // s_w + 1
    w_max = (w_max - k_w) // s_w * s_w + k_w

    bypass_options = [0, 1]  # 1: weights not staged in L1 (full Co required)

    for bypass in bypass_options:
        for tile_h in range(h_max, k_h - 1, -s_h):
            size_h = tile_h
            if tile_h == h_max:
                w_range = range(w_max, k_w - 1, -s_w)
                size_h = in_h
            else:
                w_range = [w_max]
                win_tile_h = (tile_h - k_h) // s_h + 1
                h_tiles = (win_h + win_tile_h - 1) // win_tile_h
                # With exactly two H tiles, only the larger slice must fit.
                if h_tiles == 2:
                    size_h = max(tile_h - p_top, in_h +
                                 p_top - tile_h + k_h - s_h)

            for tile_w in w_range:
                size_w = tile_w
                if size_w == w_max:
                    size_w = in_w
                else:
                    win_tile_w = (tile_w - k_w) // s_w + 1
                    w_tiles = (win_w + win_tile_w - 1) // win_tile_w
                    if w_tiles == 2:
                        size_w = max(tile_w - p_left, in_w +
                                     p_left - tile_w + k_w - s_w)

                k_n_ = ((k_n - 1) // 16 + 1) * 16
                co_range = range(k_n_, tile_co_start - 1, -16)
                for tile_co in co_range:
                    # L1 footprint: fmap slice (+ weights unless bypassed).
                    if bypass == 1:
                        if tile_co != k_n:
                            continue
                        l1_size = data_len * (size_h * size_w * in_c)
                    else:
                        l1_size = data_len * (size_h * size_w * in_c +
                                              tile_co * tile_c * k_h * k_w)

                    if l1_size > l1_max_size:
                        continue

                    # Inner cube tiles (multiples of 16) bounded by L0A/L0B/L0C.
                    tile_co_ = ((tile_co - 1) // 16 + 1) * 16
                    for tile_n in range(tile_co_, 15, -16):
                        k_max = in_c * k_h * k_w
                        k_max_ = ((k_max - 1) // 16 + 1) * 16
                        k_size = l0b_max_size // data_len // tile_n
                        k_size_ = k_size // 16 * 16
                        for tile_k in range(min(k_max_, k_size_), 15, -16):
                            m_max = (int(((tile_h - k_h) // (s_h)) + 1)) * \
                                (int(((tile_w - k_w) // (s_w)) + 1))
                            m_max_ = ((m_max - 1) // 16 + 1) * 16
                            m_size1 = l0a_max_size // data_len // tile_k
                            m_size1_ = m_size1 // 16 * 16
                            m_size2 = l0c_max_size // data_len // tile_n
                            m_size2_ = m_size2 // 16 * 16
                            for tile_m in range(min(m_max_, m_size1_, m_size2_), 15, -16):
                                config_space.add(ConvConfig(tile_h, tile_co, tile_m, tile_k,
                                                            tile_n, tile_w, bypass))

    return None, config_space, op_desc.__str__(), None, None


def _get_space_conv_bn1(op_desc: ConvDesc):
    """get config space of convolution fused with bn1.

    Near-duplicate of _get_space_conv; the only differences are the
    reduced L0C budget (extra // 4, presumably for the fused bn outputs —
    confirm) and a named h_range.  Keep the two in sync when editing.

    Returns
    -------
    tuple (None, config_space, str(op_desc), None, None)
    """
    if not isinstance(op_desc, ConvDesc):
        raise TypeError('op_desc must be ConvDesc')

    stride_ = op_desc.stride
    pad_ = op_desc.pad
    dilation_ = op_desc.dilation
    vc_util.convolution_format_check(
        op_desc.fmap_shape, op_desc.filter_shape, pad_, stride_, dilation_)
    config_space = ListConfigSpace(ConvConfig)

    # if double buff is not enabled, set its value to 1
    size_scale = 1

    # Buffer budgets in bytes; L0C is further divided by 4 vs plain conv.
    l1_max_size = (1024 * 1024) // size_scale
    l0a_max_size = (64 * 1024) // size_scale
    l0b_max_size = (64 * 1024) // size_scale
    l0c_max_size = ((256 - 8) * 1024) // size_scale // 2 // 4

    _, in_c, in_h, in_w = op_desc.fmap_shape
    k_n, _, k_h, k_w = op_desc.filter_shape
    padding = (pad_[0], pad_[1], pad_[2], pad_[3])
    p_top, p_bottom, p_left, p_right = padding
    s_h, s_w = stride_

    in_c = ((in_c - 1) // 16 + 1) * 16  # channels rounded up to 16
    tile_c = in_c
    tile_co_start = 16

    data_len = 2  # bytes per fp16 element

    # Padded extents, window counts and largest useful H/W tiles.
    h_max = in_h + p_top + p_bottom
    win_h = (h_max - k_h) // s_h + 1
    h_max = (h_max - k_h) // s_h * s_h + k_h
    w_max = in_w + p_left + p_right
    win_w = (w_max - k_w) // s_w + 1
    w_max = (w_max - k_w) // s_w * s_w + k_w

    bypass_options = [0, 1]  # 1: weights not staged in L1 (full Co required)

    for bypass in bypass_options:
        h_range = range(h_max, k_h - 1, -s_h)
        for tile_h in h_range:
            size_h = tile_h
            if tile_h == h_max:
                w_range = range(w_max, k_w - 1, -s_w)
                size_h = in_h
            else:
                w_range = [w_max]
                win_tile_h = (tile_h - k_h) // s_h + 1
                h_tiles = (win_h + win_tile_h - 1) // win_tile_h
                # With exactly two H tiles, only the larger slice must fit.
                if h_tiles == 2:
                    size_h = max(tile_h - p_top, in_h +
                                 p_top - tile_h + k_h - s_h)

            for tile_w in w_range:
                size_w = tile_w
                if size_w == w_max:
                    size_w = in_w
                else:
                    win_tile_w = (tile_w - k_w) // s_w + 1
                    w_tiles = (win_w + win_tile_w - 1) // win_tile_w
                    if w_tiles == 2:
                        size_w = max(tile_w - p_left, in_w +
                                     p_left - tile_w + k_w - s_w)

                k_n_ = ((k_n - 1) // 16 + 1) * 16
                co_range = range(k_n_, tile_co_start - 1, -16)
                for tile_co in co_range:
                    # L1 footprint: fmap slice (+ weights unless bypassed).
                    if bypass == 1:
                        if tile_co != k_n:
                            continue
                        l1_size = data_len * (size_h * size_w * in_c)
                    else:
                        l1_size = data_len * (size_h * size_w * in_c +
                                              tile_co * tile_c * k_h * k_w)

                    if l1_size > l1_max_size:
                        continue

                    # Inner cube tiles (multiples of 16) bounded by L0A/L0B/L0C.
                    tile_co_ = ((tile_co - 1) // 16 + 1) * 16
                    for tile_n in range(tile_co_, 15, -16):
                        k_max = in_c * k_h * k_w
                        k_max_ = ((k_max - 1) // 16 + 1) * 16
                        k_size = l0b_max_size // data_len // tile_n
                        k_size_ = k_size // 16 * 16
                        for tile_k in range(min(k_max_, k_size_), 15, -16):
                            m_max = (int(((tile_h - k_h) // (s_h)) + 1)) * \
                                (int(((tile_w - k_w) // (s_w)) + 1))
                            m_max_ = ((m_max - 1) // 16 + 1) * 16
                            m_size1 = l0a_max_size // data_len // tile_k
                            m_size1_ = m_size1 // 16 * 16
                            m_size2 = l0c_max_size // data_len // tile_n
                            m_size2_ = m_size2 // 16 * 16
                            for tile_m in range(min(m_max_, m_size1_, m_size2_), 15, -16):
                                config_space.add(ConvConfig(tile_h, tile_co, tile_m, tile_k,
                                                            tile_n, tile_w, bypass))

    return None, config_space, op_desc.__str__(), None, None


def _get_space_conv_backprop_input(op_desc: ConvBackpropDesc):
    """get config space of convolution backprop input.

    Rewrites backprop-input as a stride-1 convolution over the
    stride-expanded forward output with derived padding, then enumerates
    (tile_h, tile_co, tile_m, tile_k, tile_n, tile_w) candidates that fit
    the on-chip buffer budgets into a ListConfigSpace.

    Returns
    -------
    tuple (None, config_space, str(op_desc), None, None)
    """
    if not isinstance(op_desc, ConvBackpropDesc):
        raise TypeError('op_desc must be ConvDesc')

    stride_ = op_desc.stride
    pad_ = op_desc.pad
    dilation_ = op_desc.dilation
    vc_util.convolution_format_check(
        op_desc.fmap_shape, op_desc.filter_shape, pad_, stride_, dilation_)
    config_space = ListConfigSpace(ConvBackpropInputConfig)

    # if double buff is not enabled, set its value to 1
    size_scale = 1
    block_size = 16

    # Buffer budgets in bytes; UB shares the L0C budget here.
    l1_max_size = (1024 * 1024) // size_scale
    l0a_max_size = (64 * 1024) // size_scale
    l0b_max_size = (64 * 1024) // size_scale
    l0c_max_size = ((256 - 8) * 1024) // size_scale // 2
    ub_max_size = l0c_max_size

    _, in_c, in_h, in_w = op_desc.fmap_shape
    k_n, _, k_h, k_w = op_desc.filter_shape

    # Round channel counts up to a multiple of block_size.
    in_c = (in_c + block_size - 1) // block_size * block_size
    k_n = (k_n + block_size - 1) // block_size * block_size

    pad_top, pad_bottom, pad_left, pad_right = pad_
    stride_h, stride_w = stride_

    # Forward-conv output, then expanded by the stride (transposed conv view).
    out_c = k_n
    out_h = (in_h + pad_top + pad_bottom - k_h) // stride_h + 1
    out_w = (in_w + pad_left + pad_right - k_w) // stride_w + 1

    out_h = out_h * stride_h
    out_w = out_w * stride_w

    # Derived padding of the equivalent stride-1 convolution.
    p_top = k_h - pad_[0] - 1
    p_bottom = in_h + pad_[0] - stride_[0] * \
        ((in_h + pad_[0] + pad_[1] - k_h) // stride_[0] + 1)
    p_left = k_w - pad_[2] - 1
    p_right = in_w + pad_[2] - stride_[1] * \
        ((in_w + pad_[2] + pad_[3] - k_w) // stride_[1] + 1)

    # The rewritten convolution always has stride 1.
    s_h = 1
    s_w = 1

    tile_c = out_c
    tile_co_start = 16

    data_len = 2  # bytes per fp16 element

    # Padded extents, window counts and largest useful H/W tiles.
    h_max = out_h + p_top + p_bottom
    win_h = (h_max - k_h) // s_h + 1
    h_max = (h_max - k_h) // s_h * s_h + k_h
    w_max = out_w + p_left + p_right
    win_w = (w_max - k_w) // s_w + 1
    w_max = (w_max - k_w) // s_w * s_w + k_w

    for tile_h in range(h_max, k_h - 1, -s_h):
        size_h = tile_h
        if tile_h == h_max:
            w_range = range(w_max, k_w - 1, -s_w)
            size_h = in_h
        else:
            w_range = [w_max]
            win_tile_h = (tile_h - k_h) // s_h + 1
            h_tiles = (win_h + win_tile_h - 1) // win_tile_h
            # With exactly two H tiles, only the larger slice must fit.
            if h_tiles == 2:
                size_h = max(tile_h - p_top, in_h + p_top - tile_h + k_h - s_h)

        for tile_w in w_range:
            size_w = tile_w
            if size_w == w_max:
                size_w = in_w
            else:
                win_tile_w = (tile_w - k_w) // s_w + 1
                w_tiles = (win_w + win_tile_w - 1) // win_tile_w
                if w_tiles == 2:
                    size_w = max(tile_w - p_left, in_w +
                                 p_left - tile_w + k_w - s_w)

            k_n_ = ((k_n - 1) // 16 + 1) * 16
            co_range = range(k_n_, tile_co_start - 1, -16)
            for tile_co in co_range:
                # L1 must hold the gradient slice plus the weight tile.
                l1_size = data_len * (size_h * size_w * out_c +
                                      tile_co * tile_c * k_h * k_w)
                if l1_size > l1_max_size:
                    continue
                ub_size = data_len * (size_h * size_w * out_c)
                if ub_size > ub_max_size:
                    continue

                # Inner cube tiles bounded by L0A/L0B/L0C; K advances in
                # whole 16 * k_h * k_w chunks.
                tile_co_ = ((tile_co - 1) // 16 + 1) * 16
                for tile_n in range(tile_co_, 15, -16):
                    k_max = out_c * k_h * k_w
                    k_base = 16 * k_h * k_w
                    k_max_ = ((k_max - 1) // k_base + 1) * k_base
                    k_size = l0b_max_size // data_len // tile_n
                    k_size_ = k_size // k_base * k_base
                    for tile_k in range(min(k_max_, k_size_), k_base - 1, -k_base):
                        m_max = (int(((tile_h - k_h) // (s_h)) + 1)) * \
                            (int(((tile_w - k_w) // (s_w)) + 1))
                        m_max_ = ((m_max - 1) // 16 + 1) * 16
                        m_size1 = l0a_max_size // data_len // tile_k
                        m_size1_ = m_size1 // 16 * 16
                        m_size2 = l0c_max_size // data_len // tile_n
                        m_size2_ = m_size2 // 16 * 16
                        for tile_m in range(min(m_max_, m_size1_, m_size2_), 15, -16):
                            config_space.add(ConvBackpropInputConfig(tile_h, tile_co, tile_m,
                                                                     tile_k, tile_n, tile_w))
    return None, config_space, op_desc.__str__(), None, None


def _get_space_conv_backprop_filter(op_desc: ConvBackpropDesc):
    """get config space of convolution backprop filter

    Enumerates candidate tilings (H/W tiles, kernel tiles, cout/cin tiles and
    L0 m/k/n tiles) and adds every candidate that fits the on-chip memory
    budgets into a ListConfigSpace of ConvBackpropFilterConfig.
    Returns the 5-tuple (None, config_space, str(op_desc), None, None).
    """
    if not isinstance(op_desc, ConvBackpropDesc):
        raise TypeError('op_desc must be ConvBackpropDesc')

    stride_ = op_desc.stride
    pad_ = op_desc.pad
    dilation_ = op_desc.dilation
    vc_util.convolution_format_check(
        op_desc.fmap_shape, op_desc.filter_shape, pad_, stride_, dilation_)
    config_space = ListConfigSpace(ConvBackpropFilterConfig)

    # if double buff is not enabled, set it's value to 1
    size_scale = 1
    block_size = 16

    # on-chip buffer budgets in bytes, divided by size_scale for double buffering
    # (presumably Davinci L1/L0A/L0B/L0C sizes -- confirm against target spec)
    l1_max_size = (1024 * 1024) // size_scale
    l0a_max_size = (64 * 1024) // size_scale
    l0b_max_size = (64 * 1024) // size_scale
    l0c_max_size = ((256 - 8) * 1024) // size_scale // 2

    in_n, in_c, in_h, in_w = op_desc.fmap_shape
    cout, _, k_h, k_w = op_desc.filter_shape
    k_n = cout

    # round channels up to a multiple of the cube block size
    in_c = (in_c + block_size - 1) // block_size * block_size
    cout = (cout + block_size - 1) // block_size * block_size

    pad_top, pad_bottom, pad_left, pad_right = pad_
    s_h, s_w = stride_
    tile_co_start = 16
    tile_ci_start = 16
    data_len = 2  # bytes per element (fp16)
    # padded extents and the number of sliding-window positions per axis
    h_max = in_h + pad_top + pad_bottom
    win_h = (h_max - k_h) // s_h + 1
    h_max = (h_max - k_h) // s_h * s_h + k_h
    w_max = in_w + pad_left + pad_right
    win_w = (w_max - k_w) // s_w + 1
    w_max = (w_max - k_w) // s_w * s_w + k_w

    # enumerate H tile sizes from full height downwards, one stride at a time
    for tile_h in range(h_max, k_h - 1, -s_h):
        size_h = tile_h
        win_tile_h = (tile_h - k_h) // s_h + 1
        # Only one head for cut H axis
        if win_tile_h * s_h < pad_top:
            continue
        # Only one tail for cut H axis
        if (((win_h + win_tile_h - 1) // win_tile_h - 1) * win_tile_h - 1) * s_h + k_h > in_h + pad_top:
            continue
        if tile_h == h_max:
            # full-height tile: W axis may be cut freely
            w_range = range(w_max, k_w - 1, -s_w)
            size_h = in_h
        else:
            # partial-height tile: keep W whole
            w_range = [w_max]
            h_tiles = (win_h + win_tile_h - 1) // win_tile_h
            if h_tiles == 2:
                # with exactly two tiles, the larger of head/tail bounds the footprint
                size_h = max(tile_h - pad_top, in_h +
                             pad_top - tile_h + k_h - s_h)

        for tile_w in w_range:
            size_w = tile_w
            win_tile_w = (tile_w - k_w) // s_w + 1
            # Only one head for cut W axis
            if win_tile_w * s_w < pad_left:
                continue
            # Only one tail for cut W axis
            if (((win_w + win_tile_w - 1) // win_tile_w - 1) * win_tile_w - 1) * s_w + k_w > in_w + pad_left:
                continue
            if size_w == w_max:
                size_w = in_w
            else:
                w_tiles = (win_w + win_tile_w - 1) // win_tile_w
                if w_tiles == 2:
                    size_w = max(tile_w - pad_left, in_w +
                                 pad_left - tile_w + k_w - s_w)
            # kernel-window tiles, then output/input channel tiles in steps of 16
            for tile_kh in range(k_h, 0, -1):
                for tile_kw in range(k_w, 0, -1):
                    k_n_ = ((k_n - 1) // 16 + 1) * 16
                    co_range = range(k_n_, tile_co_start - 1, -16)
                    for tile_co in co_range:
                        in_c_ = ((in_c - 1) // 16 + 1) * 16
                        ci_range = range(in_c_, tile_ci_start - 1, -16)
                        for tile_ci in ci_range:
                            tile_batch = 1
                            # L1 footprint: dout tile + fmap tile
                            l1_size = data_len * tile_batch * (tile_co * win_tile_h * win_tile_w +
                                                               tile_ci * size_h * size_w)
                            if l1_size > l1_max_size:
                                continue

                            # tiled case: m/n are fixed by co/ci, only k is searched
                            if (tile_batch != in_n or tile_co != k_n_ or tile_ci != in_c_):
                                tile_m = tile_co
                                tile_n = tile_ci * tile_kh * tile_kw
                                l0c_size = data_len * tile_n * tile_m
                                if l0c_size > l0c_max_size:
                                    continue
                                k_max = tile_batch * tile_h * tile_w
                                k_max_ = ((k_max - 1) // 16 + 1) * 16
                                k_size1 = l0a_max_size // data_len // tile_m
                                k_size1_ = k_size1 // 16 * 16
                                k_size2 = l0b_max_size // data_len // tile_n
                                k_size2_ = k_size2 // 16 * 16
                                for tile_k in range(min(k_max_, k_size1_, k_size2_), 15, -16):
                                    config_space.add(ConvBackpropFilterConfig(tile_ci, tile_kh, tile_kw, tile_co,
                                                                              tile_batch, tile_h, tile_w, tile_m,
                                                                              tile_k, tile_n))
                            else:
                                # untiled case: search n, k and m independently
                                for tile_n in range(tile_ci * tile_kh * tile_kw, 15, -16):
                                    k_max = tile_batch * tile_h * tile_w
                                    k_max_ = ((k_max - 1) // 16 + 1) * 16
                                    k_size = l0b_max_size // data_len // tile_n
                                    k_size_ = k_size // 16 * 16
                                    for tile_k in range(min(k_max_, k_size_), 15, -16):
                                        m_max = tile_co
                                        m_max_ = ((m_max - 1) // 16 + 1) * 16
                                        m_size1 = l0a_max_size // data_len // tile_k
                                        m_size1_ = m_size1 // 16 * 16
                                        m_size2 = l0c_max_size // data_len // tile_n
                                        m_size2_ = m_size2 // 16 * 16
                                        for tile_m in range(min(m_max_, m_size1_, m_size2_), 15, -16):
                                            config_space.add(ConvBackpropFilterConfig(tile_ci, tile_kh, tile_kw,
                                                                                      tile_co, tile_batch, tile_h,
                                                                                      tile_w, tile_m, tile_k, tile_n))
    return None, config_space, op_desc.__str__(), None, None


def _get_space_matmul_cube(op_desc: MatmulCubeDesc):
    """get config space of matmul_cube

    Enumerates (n_l1, n_l0, m_l1, m_l0, k_l1, k_l0, bypass) tilings of the
    16-aligned m/k/n dimensions that fit the on-chip memory budgets and adds
    them to a ListConfigSpace of MatmulCubeConfig.
    """
    if not isinstance(op_desc, MatmulCubeDesc):
        raise TypeError('op_desc must be MatmulCubeDesc')
    config_space = ListConfigSpace(MatmulCubeConfig)
    batch_tuple, m, k, n = matmul_run.extract_dim(
        op_desc.x_shape, op_desc.y_shape, op_desc.adj_x, op_desc.adj_y)

    # dimensions in units of 16-wide cube blocks, rounded up
    mmax = (m + 15) // 16
    nmax = (n + 15) // 16
    kmax = (k + 15) // 16

    double_buffer = True
    mad_fp32 = True

    l1_max_size = (1024 * 1024)  # L1 MEM 1024KB
    l0a_max_size = (64 * 1024)  # L0A MEM 64KB
    l0b_max_size = (64 * 1024)  # L0B MEM 64KB
    l0c_max_size = (256 * 1024)  # L0C MEM 256KB
    # UB MEM 248KB, 8KB reserved for compiler
    ub_max_size = ((256 - 8) * 1024)

    # double buffering halves every usable buffer
    if double_buffer:
        l1_max_size = l1_max_size // 2
        l0a_max_size = l0a_max_size // 2
        l0b_max_size = l0b_max_size // 2
        l0c_max_size = l0c_max_size // 2
        ub_max_size = ub_max_size // 2

    # fp32 accumulation doubles the element size in L0C (and UB for fp32 output)
    if mad_fp32:
        l0c_max_size = l0c_max_size // 2
    if op_desc.out_dtype == 'float32':
        ub_max_size = ub_max_size // 2

    # bypass: 0 = none, 1 = right operand bypasses L1, 2 = left operand bypasses L1
    # (presumably -- confirm against the conv/matmul bypass convention)
    bypass_options = [0, 1, 2]

    for bypass in bypass_options:
        # skip bypass modes incompatible with the operand format/transpose combo
        if (bypass == 2) and ((op_desc.adj_x == False and op_desc.left_format[0].lower() == 'n') or
                              (op_desc.adj_x == True and op_desc.left_format[0].lower() == 'z')):
            continue

        if (bypass == 1) and ((op_desc.adj_y == False and op_desc.right_format[0].lower() == 'z') or
                              (op_desc.adj_y == True and op_desc.right_format[0].lower() == 'n')):
            continue

        # only exact divisors are considered for every tiling level
        for k_l1 in range(1, kmax + 1):
            if kmax % k_l1 != 0:
                continue
            for k_l0 in range(1, k_l1 + 1):
                if k_l1 % k_l0 != 0:
                    continue

                # no need to cut from l1 to l0 for m and n when k is cut
                for m_l1 in range(1, mmax + 1):
                    if mmax % m_l1 != 0:
                        continue
                    m_l0_range = [m_l1] if k_l1 != kmax else range(1, m_l1 + 1)
                    for m_l0 in m_l0_range:
                        if m_l1 % m_l0 != 0:
                            continue
                        for n_l1 in range(1, nmax + 1):
                            if nmax % n_l1 != 0:
                                continue
                            n_l0_range = [n_l1] if k_l1 != kmax else range(
                                1, n_l1 + 1)
                            for n_l0 in n_l0_range:
                                if n_l1 % n_l0 != 0:
                                    continue

                                # capacity checks, all in 16x16-block elements
                                if m_l0 * 16 * k_l0 * 16 > l0a_max_size:
                                    continue

                                if n_l0 * 16 * k_l0 * 16 > l0b_max_size:
                                    continue

                                if m_l0 * 16 * n_l0 * 16 > l0c_max_size:
                                    continue

                                if m_l0 * 16 * n_l0 * 16 > ub_max_size:
                                    continue

                                # L1 holds only the non-bypassed operand(s)
                                if bypass == 2:
                                    l1_size = n_l1 * 16 * k_l1 * 16
                                elif bypass == 1:
                                    l1_size = m_l1 * 16 * k_l1 * 16
                                else:
                                    l1_size = (m_l1 * 16 + n_l1 *
                                               16) * k_l1 * 16
                                if l1_size > l1_max_size:
                                    continue

                                # degenerate dims use sentinel values; these
                                # reassignments are safe because the loops have
                                # a single iteration when the max is 1
                                if nmax == 1:
                                    n_l1 = 0
                                    n_l0 = 0
                                if mmax == 1:
                                    m_l1 = 0
                                    m_l0 = 0
                                if kmax == 1:
                                    k_l1 = 16
                                    k_l0 = 16
                                config_space.add(MatmulCubeConfig(
                                    n_l1, n_l0, m_l1, m_l0, k_l1, k_l0, bypass))
    shape_xx, shape_yy, _, _, k = matmul_run.get_converted_shapes(m, n, k, batch_tuple, op_desc.adj_x, op_desc.adj_y,
                                                                  op_desc.bias, op_desc.left_format,
                                                                  op_desc.right_format, op_desc.out_format)
    return None, config_space, str((shape_xx, shape_yy, op_desc.bias, op_desc.left_format, op_desc.right_format,
                                    op_desc.out_format, op_desc.adj_x, op_desc.adj_y, op_desc.dtype,
                                    op_desc.out_dtype)), None, None



def _get_space_batch_matmul_gpu(op_type: str, op_desc, tuning_attrs=[], tuning_attrs_info=None):
"""get config space of batch_matmul operator in gpu"""
return

def get_range_block(space_res):
    """Build the (x, y, z) block-dimension candidate ranges from the space tables."""
    bounds = space_res.gpu_block_range_table.asnumpy().tolist()
    steps = space_res.gpu_block_mod_table.asnumpy().tolist()

    def axis_range(idx):
        # inclusive upper bound, stepped by the per-axis mod value
        return range(bounds[idx][0], bounds[idx][1] + 1, steps[idx][0])

    x_range = axis_range(0)
    y_range = axis_range(1)
    z_range = axis_range(2)
    # Empty y/z ranges collapse to the single candidate 1 (x is kept as-is).
    if not y_range:
        y_range = range(1, 2)
    if not z_range:
        z_range = range(1, 2)
    return x_range, y_range, z_range

def get_range_thread(space_res):
    """Build the (x, y, z) thread-dimension candidate ranges from the space tables."""
    bounds = space_res.gpu_thread_range_table.asnumpy().tolist()
    steps = space_res.gpu_thread_mod_table.asnumpy().tolist()

    def axis_range(idx):
        # inclusive upper bound, stepped by the per-axis mod value
        return range(bounds[idx][0], bounds[idx][1] + 1, steps[idx][0])

    x_range = axis_range(0)
    y_range = axis_range(1)
    z_range = axis_range(2)
    # Empty y/z ranges collapse to the single candidate 1 (x is kept as-is).
    if not y_range:
        y_range = range(1, 2)
    if not z_range:
        z_range = range(1, 2)
    return x_range, y_range, z_range

def get_space_with_block_thread(tiling_spaces, space_res, policy=GpuSpacePolicy.FULL):
    """Extend every tiling space with (block_x/y/z, thread_x/y/z) candidates.

    policy selects how the GPU mapping grid is enumerated:
      - REDUCE_ALL: block_x derived from the first tiling value; other dims enumerated.
      - BMM: threads enumerated; blocks derived as tile_size // thread per axis.
      - FULL: full cross product of all block and thread ranges.
    Returns a new list of spaces, each original space extended by six values.
    """
    total_shape = max([max(v) for v in tiling_spaces])
    new_spaces = []
    block_x_range, block_y_range, block_z_range = get_range_block(space_res)
    thread_x_range, thread_y_range, thread_z_range = get_range_thread(space_res)
    pbar = tqdm(total=len(tiling_spaces))
    max_thread = 1024  # hardware limit of threads per block
    for space in tiling_spaces:
        pbar.set_description("Adding block, thread to spaces")
        if policy == GpuSpacePolicy.REDUCE_ALL:
            # block_x is pinned to ceil(total_shape / first tile) -- a single value
            for bx in range((total_shape-1)//space[0]+1,(total_shape-1)//space[0]+2):
                for by in block_y_range:
                    for bz in block_z_range:
                        for tx in thread_x_range:
                            for ty in thread_y_range:
                                for tz in thread_z_range:
                                    if tx * ty * tz > max_thread:
                                        continue
                                    tmp_space = space[:]
                                    tmp_space.append(bx)
                                    tmp_space.append(by)
                                    tmp_space.append(bz)
                                    tmp_space.append(tx)
                                    tmp_space.append(ty)
                                    tmp_space.append(tz)
                                    new_spaces.append(tmp_space)
        elif policy == GpuSpacePolicy.BMM:
            for tx in thread_x_range:
                for ty in thread_y_range:
                    for tz in thread_z_range:
                        if tx * ty * tz > max_thread:
                            continue
                        tmp_space = space[:]
                        # threads must not exceed the innermost tile sizes
                        # (last up to three tiling entries map to x/y/z)
                        if tx > tmp_space[-1] or (len(tmp_space) >= 2 and ty > tmp_space[-2]) or (len(tmp_space) >= 3 and tz > tmp_space[-3]):
                            continue
                        # blocks cover the remainder of each tile after threading
                        bx = max(1, tmp_space[-1] // tx)
                        by = max(1, tmp_space[-2] // ty) if len(tmp_space) >= 2 else 1
                        bz = max(1, tmp_space[-3] // tz) if len(tmp_space) >= 3 else 1
                        if bx >= block_x_range.stop or by >= block_y_range.stop or bz >= block_z_range.stop:
                            continue
                        tmp_space.append(bx)
                        tmp_space.append(by)
                        tmp_space.append(bz)
                        tmp_space.append(tx)
                        tmp_space.append(ty)
                        tmp_space.append(tz)
                        new_spaces.append(tmp_space)
        elif policy == GpuSpacePolicy.FULL:
            # NOTE(review): unlike the other policies, FULL does not apply the
            # max_thread limit -- confirm whether that is intentional.
            for bx in block_x_range:
                for by in block_y_range:
                    for bz in block_z_range:
                        for tx in thread_x_range:
                            for ty in thread_y_range:
                                for tz in thread_z_range:
                                    tmp_space = space[:]
                                    tmp_space.append(bx)
                                    tmp_space.append(by)
                                    tmp_space.append(bz)
                                    tmp_space.append(tx)
                                    tmp_space.append(ty)
                                    tmp_space.append(tz)
                                    new_spaces.append(tmp_space)
        else:
            raise ValueError("Policy {} is not defined.".format(policy))

        pbar.update(1)
    print("total spaces size is: ",len(new_spaces))
    return new_spaces

def _get_space_conv_image2col_gemm_gpu(op_type: str, op_desc, tuning_attrs=[], tuning_attrs_info=None):
"""get config space of conv_image2col_gemm operators in gpu"""
return

# Dispatch table: operator type name -> config-space builder for that operator.
_get_space_func = {
    'conv': _get_space_conv,
    'conv_bn1': _get_space_conv_bn1,
    'conv_backprop_input': _get_space_conv_backprop_input,
    'conv_backprop_filter': _get_space_conv_backprop_filter,
    'matmul': _get_space_matmul_cube,
    "reduce_sum_gpu": _get_space_reduce_gpu_manually,
    "batch_matmul_gpu": _get_space_batch_matmul_gpu,
    "conv_image2col_gemm_gpu": _get_space_conv_image2col_gemm_gpu,
}


def get_space(op_type: str, op_desc: NamedTuple, tuning_attrs=None, tuning_attrs_info=None):
    """get space of an operator

    Looks up the space builder registered for op_type (falling back to the
    generic vector-op builder) and invokes it. GPU builders additionally
    receive the tuning attrs.

    Args:
        op_type: operator name used as the dispatch key.
        op_desc: operator definition parameters.
        tuning_attrs: extra tunable attributes for GPU ops; defaults to [].
            (The previous mutable-default `tuning_attrs=[]` is replaced with a
            None sentinel to avoid the shared-default-list pitfall.)
        tuning_attrs_info: description of the tunable attributes, if any.
    """
    if tuning_attrs is None:
        tuning_attrs = []
    func = _get_space_func.get(op_type, None)
    if func is None:
        # unknown ops fall back to the generic vector space builder
        func = partial(_get_space_vector, op_type=op_type)
    if "gpu" in op_type:
        return func(op_type=op_type, op_desc=op_desc, tuning_attrs=tuning_attrs, tuning_attrs_info=tuning_attrs_info)
    return func(op_desc=op_desc)

+ 147
- 0
tests/fuzz/tune_for_gpu/autotuning/test_data_generators.py View File

@@ -0,0 +1,147 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Generating test data for operators"""
from typing import NamedTuple

import numpy as np
from gen_json_data import gen_json_data
from test_run import batchmatmul_run, conv_run, conv_backprop_input_run, conv_backprop_filter_run, matmul_run
from .type_definitions import ConvDesc, ConvBackpropDesc, MatmulCubeDesc

def _gen_data_json(op_desc):
    """Generating test data for composite json"""
    inputs, expect, _ = gen_json_data(op_desc)
    return inputs, expect

def _gen_data_conv(op_desc: ConvDesc):
    """Generating test data for conv"""
    fmap, weight, bias, expect = conv_run.gen_data(
        op_desc.fmap_shape, op_desc.filter_shape, op_desc.pad,
        op_desc.stride, op_desc.dilation, op_desc.use_bias)
    output = np.full(expect.shape, 0, 'float16')

    # argument order: fmap, filter, [bias,] output buffer
    inputs = [fmap, weight]
    if op_desc.use_bias:
        inputs.append(bias)
    inputs.append(output)
    return tuple(inputs), expect


def _gen_data_conv_bn1(op_desc: ConvDesc):
    """Generating test data for conv_bn1"""
    fmap, weight, bias, conv_expect = conv_run.gen_data(
        op_desc.fmap_shape, op_desc.filter_shape, op_desc.pad,
        op_desc.stride, op_desc.dilation, op_desc.use_bias)
    # batch-norm statistics are reduced over N, H, W
    reduce_axes = (0, 2, 3)
    conv_mean = np.mean(conv_expect, axis=reduce_axes, keepdims=True)
    conv_var_part = np.mean(np.power(conv_expect, 2), axis=reduce_axes, keepdims=True)

    expects = (conv_expect, conv_var_part, conv_mean)

    # output buffers: conv result in fp16, the two statistics in fp32
    out_datas = []
    for pos, exp in enumerate(expects):
        buf = np.full(exp.shape, 0, 'float16')
        if pos > 0:
            buf = buf.astype(np.float32)
        out_datas.append(buf)

    args = [fmap, weight, bias] if op_desc.use_bias else [fmap, weight]
    args.extend(out_datas)

    return {"args": tuple(args), 'outputs': (-3, -2, -1)}, expects


def _gen_data_conv_backprop_input(op_desc: ConvBackpropDesc):
    """Generating test data for conv_backprop_input"""
    dout, weight, dx = conv_backprop_input_run.gen_data(
        op_desc.fmap_shape, op_desc.filter_shape, op_desc.pad,
        op_desc.stride, op_desc.dilation)
    output = np.full(dx.shape, 0, 'float16')
    return (dout, weight, output), dx


def _gen_data_conv_backprop_filter(op_desc: ConvBackpropDesc):
    """Generating test data for conv_backprop_filter"""
    block_size = 16

    in_n, in_c, in_h, in_w = op_desc.fmap_shape
    cout, _, w_h, w_w = op_desc.filter_shape

    def align(value):
        # round channels up to a multiple of the cube block size
        return (value + block_size - 1) // block_size * block_size

    aligned_in_c = align(in_c)
    aligned_cout = align(cout)

    dy_data, dx_data, expect = conv_backprop_filter_run.gen_data(
        (in_n, aligned_in_c, in_h, in_w),
        (aligned_cout, aligned_in_c, w_h, w_w),
        op_desc.pad, op_desc.stride, op_desc.dilation)
    output = np.full(expect.shape, 0, 'float32')

    return (dy_data, dx_data, output), expect


def _gen_data_matmul_cube(op_desc: MatmulCubeDesc):
    """Generating test data for matmul_cube"""
    batch_tuple, m, k, n = matmul_run.extract_dim(op_desc.x_shape, op_desc.y_shape, op_desc.adj_x, op_desc.adj_y)
    # align every dimension up to a multiple of 16 (cube block size)
    m = (m + 15) // 16 * 16
    n = (n + 15) // 16 * 16
    k = (k + 15) // 16 * 16
    _, _, _, out_shape, k = matmul_run.get_converted_shapes(
        m, n, k, batch_tuple, op_desc.adj_x, op_desc.adj_y, op_desc.bias,
        op_desc.left_format, op_desc.right_format, op_desc.out_format)
    m_x, m_y, bench_mark, bias_data = matmul_run.matmul_data(
        batch_tuple, m, k, n, op_desc.dtype, op_desc.bias_dtype,
        op_desc.out_dtype, op_desc.bias, op_desc.adj_x, op_desc.adj_y,
        op_desc.left_format, op_desc.right_format, op_desc.out_format)

    # NaN-filled output buffer so unwritten elements are detectable
    out_data = np.full(out_shape, np.nan, op_desc.out_dtype)

    args = (m_x, m_y, bias_data, out_data) if op_desc.bias else (m_x, m_y, out_data)
    return args, bench_mark


# Dispatch table: operator type name -> test-data generator for that operator.
_gen_data_func = {
    'json': _gen_data_json,
    'conv': _gen_data_conv,
    'conv_bn1': _gen_data_conv_bn1,
    'conv_backprop_input': _gen_data_conv_backprop_input,
    'conv_backprop_filter': _gen_data_conv_backprop_filter,
    'matmul': _gen_data_matmul_cube,
}


def gen_data(op_type: str, op_desc: NamedTuple):
    """Generate test data for operator

    Parameters
    ----------
    op_type: str
        operator name
    op_desc: NamedTuple
        operator definition parameters

    Returns
    -------
    The (inputs, expect) pair produced by the generator registered for op_type.

    Raises
    ------
    ValueError
        If no test-data generator is registered for op_type.
    """
    gen_func = _gen_data_func.get(op_type, None)
    if gen_func is None:
        raise ValueError('Unsupported op type for test data generating: %s' % op_type)
    return gen_func(op_desc)

+ 84
- 0
tests/fuzz/tune_for_gpu/autotuning/tiling_strategies_gpu.py View File

@@ -0,0 +1,84 @@
from akg.utils import custom_tiling as ct_util

def reduce_gpu_tiling_strategy(in_shape, reduce_axis):
    """Custom tiling strategy for reduce op in gpu

    Args:
        in_shape: shape of the reduce op input.
        reduce_axis: axes being reduced; None (or all axes) means all-reduce.

    Returns:
        list: tiling/mapping constraints built with ct_util.

    Fixes: `reduce_axis == None` replaced with the idiomatic `is None`; the
    bare triple-quoted strings inside the branches (executed as no-op
    expressions, not comments) are now real comments; the Reduce-X and
    Reduce-Y branches were character-identical and are merged.
    """
    strategy = list()

    if reduce_axis is None or len(reduce_axis) == len(in_shape):
        # all-reduce: tile axis 0 in multiples of 32 and constrain threads
        # to [32, 1024] with a x32 thread modulus
        strategy.append(
            ct_util.create_constraint_on_axis(
                values=32, constraints=ct_util.TileConstraint.MOD, band=0, axis=0
            )[0]
        )
        strategy.append(
            ct_util.modify_common_constraints(
                value=[32, 1, 1], constraint=ct_util.TileConstraint.THREAD_MOD
            )
        )
        strategy.append(
            ct_util.modify_common_constraints(
                value=[1024, 1, 1], constraint=ct_util.TileConstraint.THREAD_MAX
            )
        )
        strategy.append(
            ct_util.modify_common_constraints(
                value=[32, 1, 1], constraint=ct_util.TileConstraint.THREAD_MIN
            )
        )
    else:
        # Reduce-X (innermost axis reduced) and Reduce-Y: dummy strategy for
        # the hand-written space -- pin axes 0/1 tile sizes and the
        # thread/block maxima to 1 so the hand-written space takes over.
        strategy.append(
            ct_util.create_constraint_on_axis(
                values=1, constraints=ct_util.TileConstraint.MAX, band=0, axis=0
            )[0]
        )
        strategy.append(
            ct_util.create_constraint_on_axis(
                values=1, constraints=ct_util.TileConstraint.MAX, band=0, axis=1
            )[0]
        )
        strategy.append(
            ct_util.modify_common_constraints(
                value=[1, 1, 1], constraint=ct_util.TileConstraint.THREAD_MAX
            )
        )
        strategy.append(
            ct_util.modify_common_constraints(
                value=[1, 1, 1], constraint=ct_util.TileConstraint.BLOCK_MAX
            )
        )

    return strategy


def conv_dummy_strategy():
    """Conv strategy: dummy strategy"""
    # Placeholder: no custom constraints for conv yet.
    return None

def batch_matmul_gpu_tiling_strategy(desc):
    """Custom tiling strategy for batch matmul in gpu with or without tensor core"""
    # Placeholder: no custom constraints for batch matmul yet.
    return None

+ 359
- 0
tests/fuzz/tune_for_gpu/autotuning/tuner.py View File

@@ -0,0 +1,359 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tuner for finding best config for operators"""
import logging
import time
import json
import os
import numpy as np
from multiprocessing import Process
from tvm.autotvm.tuner.xgboost_cost_model import XgbCostModel
from tvm.autotvm.tuner.sa_model_optimizer import SimulatedAnnealingOptimizer
from .space import ConfigSpace
from .runner import KernelRunner
from tqdm import tqdm

logger = logging.getLogger('fuzz.tune.autotuning.tuner')


class Tuner:
    """Basic tuner class

    Parameters
    ----------
    runner: KernelRunner
        This is for run kernels in physical device
    config_space: ConfigSpace
        The space of configs
    n_parallel: int
        How many kernels are processed in a turn
    """

    def __init__(self, runner: KernelRunner, index_table: list, config_space: ConfigSpace, n_parallel: int = 1, skip_config_set=None):
        self._runner = runner
        self._index_table = index_table
        self._space = config_space
        self._n_parallel = n_parallel

        # trial plan
        self._trials = []
        self._trial_pt = 0
        self._visited = set()

        # observed samples
        self._xs = []
        self._ys = []

        # keep the current best
        self._best_config = None  # type: ConfigEntity
        self._best_time = np.inf
        self._best_iter = 0
        self._tuning_time = 0.0
        self._original_time = np.inf
        self._skip_config_set = skip_config_set

    @property
    def best_config(self):
        # best config found so far (None until at least one run completes)
        return self._best_config

    @property
    def best_time(self):
        # best (smallest) measured run time so far
        return self._best_time

    @property
    def best_iter(self):
        # iteration index at which the best config was found
        return self._best_iter

    @property
    def tuning_time(self):
        # accumulated wall-clock tuning time in seconds
        return self._tuning_time

    @property
    def original_time(self):
        # baseline run time measured before tuning (inf until measured)
        return self._original_time

    @property
    def xs(self):
        # observed config indexes (model training inputs)
        return self._xs

    @property
    def ys(self):
        # observed run times (model training targets)
        return self._ys

    def info(self):
        """Print a summary of the tuning session."""
        print('space size:', self._space.length)
        print('best config:', self._best_config)
        print('best time:', self._best_time)
        print('best_iter:', self._best_iter)
        print('tuning time:', self._tuning_time, 'secs')

    def next_batch(self, batch_size: int, is_add_visited=True):
        """extract next batch with xgboost model

        Prefers unvisited indexes from the planned trial list; falls back to
        random picks from the space when the plan is exhausted. When
        is_add_visited is False, simply returns the first batch_size configs
        without marking them visited.
        """
        ret = []
        counter = 0
        if not is_add_visited:
            return [self._space.get(index) for index in range(min(batch_size, self._space.length))]
        while counter < batch_size and self._space.has_next():
            index = 0
            # scan the trial plan for the next unvisited index
            while self._trial_pt < len(self._trials):
                index = self._trials[self._trial_pt]
                if index not in self._visited:
                    break
                self._trial_pt += 1

            if self._trial_pt >= len(self._trials):
                # if the trial list is empty choose randomly
                index = self._space.fetch_index()

            ret.append(self._space.get(index))
            self._visited.add(index)

            counter += 1
        return ret

    def next_config(self, batch_size: int):
        """extract next config orderly"""
        ret = []
        counter = 0
        while counter < batch_size and self._space.has_next():
            index = self._space.fetch_next_index()
            ret.append(self._space.get(index))
            self._visited.add(index)
            counter += 1
        return ret

    def export_configs(self, configs: list, output_file: str, append: bool = True, desc=""):
        """export configs

        Writes one "desc | config-json | time" line per measured config;
        entries with time == -1 (failed runs) are skipped.
        """
        mode = "a" if append else "w"
        with open(output_file, mode) as f:
            for x, y in configs:
                if y != -1:
                    f.write("{} | {} | {}\n".format(desc, json.dumps(x._asdict()), y))

    def export_dim_configs(self, configs, output_file: str, append: bool = True, key=""):
        """export dim configs

        Merges configs under `key` into the JSON already stored in
        output_file (if readable) and rewrites the file with one entry
        per line.
        """
        mode = "a" if append else "w"
        data = {}
        try:
            if os.path.isfile(output_file):
                with open(output_file, 'r') as f:
                    data = json.load(f)
        except IOError as e:
            logger.debug("get dim info from [%s] failed: %s", output_file, str(e))
        with open(output_file, mode) as f:
            import re
            data[key] = configs
            s = json.dumps(data, sort_keys=True)
            # put every top-level entry on its own line
            s = re.sub(r',\s*"', ',\n"', s)
            s = '{\n' + s[1:-1] + '\n}'
            f.write(s)

    def export_dim_configs_for_keys(self, configs, output_file: str, append: bool = True, keys=[]):
        """export dim configs

        Nests configs under the given keys (outermost key first) and merges
        the result into the JSON stored in output_file.
        NOTE(review): the mutable default `keys=[]` is shared across calls --
        harmless while it is never mutated, but worth confirming.
        """
        mode = "a" if append else "w"
        data = {}
        try:
            if os.path.isfile(output_file):
                with open(output_file, 'r') as f:
                    data = json.load(f)
        except IOError as e:
            logger.debug("get dim info from [%s] failed: %s", output_file, str(e))
        with open(output_file, mode) as f:
            import copy
            tmp = copy.deepcopy(configs)
            # wrap configs in nested dicts, innermost key applied first
            for key in reversed(keys):
                info = {key: tmp}
                tmp = copy.deepcopy(info)
            data.update(info)
            s = json.dumps(data, sort_keys=True, indent=4)
            print(s)
            f.write(s)

    def load_configs(self, input_file: str):
        """load configs

        NOTE(review): export_configs writes "desc | json | time" but this
        parses the FIRST field as json and the SECOND as the time -- the two
        formats only line up when desc is itself the config json; confirm.
        """
        configs = []
        file_path = os.path.realpath(input_file)
        if os.path.isfile(file_path):
            with open(file_path, "r") as f:
                for line in f:
                    x, y, _ = line.split('|')
                    configs.append((self._space.input_type(**json.loads(x)), np.float64(y)))
        return configs

    def tune(self, least_try_times: int, output_file: str = None):
        """grid search all configs

        Runs up to least_try_times configs in order, tracking the best one and
        optionally appending measurements to output_file.
        NOTE(review): if the space is exhausted before the first iteration,
        `run_times` is unbound at the return -- confirm callers never hit this.
        """
        i = 0
        pbar = tqdm(total=least_try_times)
        while i < least_try_times:
            if not self._space.has_next():
                break
            configs = self.next_config(min(self._n_parallel, least_try_times - i))
            run_times = self._runner.run(configs, self._best_time)
            results = []
            for idx, conf in enumerate(configs):
                results.append((conf.input_id, run_times[idx]))
                # keep best config
                if self.best_time > run_times[idx]:
                    self._best_time = run_times[idx]
                    self._best_iter = i + idx
                    self._best_config = conf

            i += len(results)
            pbar.update(len(results))

            # update
            for res in results:
                self._xs.append(res[0])
                self._ys.append(res[1])
            if output_file:
                configs = [(self._space.get(res[0]).input, res[1]) for res in results]
                self.export_configs(configs, output_file)
        return run_times


class ModelBasedTuner(Tuner):
    """Model based tuner
    This tuner will fit a cost model and use an optimizer to find the maximums of the cost model as next trials

    Parameters
    ----------
    plan_size: int
        Tuner will re-fit model per `plan_size` new measure samples
    pre_model: CostModel
        The cost model that predicts the speed of a config (IR)
    """

    def __init__(self, runner, index_table, config_space, n_parallel=1, plan_size=32, pre_model=None):
        super(ModelBasedTuner, self).__init__(runner, index_table, config_space, n_parallel)
        self.__plan_size = plan_size

        if pre_model is not None:
            # reuse a pre-trained cost model, re-targeted to this space
            self.__cost_model = pre_model
            self.__cost_model.reset_space(self._space)
        else:
            self.__cost_model = XgbCostModel(self._space)

        self.__model_optimizer = SimulatedAnnealingOptimizer(self._space)
        self.__train_ct = 0  # number of model (re)fits performed

        self.__is_auto_set_dim = False#True

        # time to leave
        self.__ttl = None
        self.__least_try_times = None
        self.__early_stopping = None

        self.__model_run_time = 0.0

    def info(self):
        """Print the base summary plus the cost-model overhead."""
        super(ModelBasedTuner, self).info()
        print('model run time:', self.__model_run_time, 'secs')

    def model_res(self):
        """Fit the cost model on observed samples and plan the next trials."""
        # NOTE(review): this is invoked via multiprocessing.Process in tune(),
        # so the updates to self._trials happen in a CHILD process and are not
        # visible to the parent -- confirm this is intended.
        self.__cost_model.fit(self._xs, self._ys, self.__plan_size)
        best_configs = self.__model_optimizer.find_best(
            self.__cost_model, self.__plan_size, self._visited)
        self._trials = best_configs

    def tune(self, least_try_times: int, output_file: str = None):
        """Run model-guided tuning for up to least_try_times measurements.

        Keeps tuning past least_try_times (up to 3x) while no config beats
        the baseline time by at least 0.9. Stops early on convergence or
        after 7200 seconds of wall clock.
        """
        early_stopping = least_try_times
        self.__least_try_times = least_try_times
        self.__early_stopping = early_stopping

        logger.setLevel(logging.DEBUG)
        # NOTE(review): old_level is read AFTER setLevel(DEBUG), so the
        # "restore" below restores DEBUG -- confirm whether the original
        # level was meant to be captured first.
        old_level = logger.level
        i = 0
        error_ct = 0  # NOTE(review): never incremented; the >150 check below is dead

        tuning_start = time.time()
        while (i < self._space.length and (i < least_try_times
                                           or (self._best_time > self._original_time - 0.9
                                               and i < least_try_times * 3))):
            if not self._space.has_next():
                break
            iter_start = time.time()
            if not self.__is_auto_set_dim:
                configs = self.next_batch(min(self._n_parallel, self._space.length - i))
            else:
                # auto-set-dim warm-up: take configs without marking them visited
                configs = self.next_batch(min(self._n_parallel, self._space.length - i), False)

            logger.debug('--indexes: %s', str([x.input_id for x in configs]))

            run_times = self._runner.run(configs, self._best_time, self.__is_auto_set_dim)
            if self.__is_auto_set_dim:
                # first pass measures the baseline: average the warm-up runs
                # and reset the best-so-far bookkeeping
                from operator import add
                from functools import reduce
                self._original_time = reduce(add, run_times) / len(run_times)
                self._best_time = self._original_time
                self._best_iter = -1
                self._best_config = None
                run_times = None
                self.__is_auto_set_dim = False
                continue

            results = []
            for idx, conf in enumerate(configs):
                # -1 marks a failed run; exclude it from samples
                if run_times[idx] == -1:
                    continue
                results.append((conf.input_id, run_times[idx]))
                # keep best config
                if self._best_time > run_times[idx]:
                    self._best_time = run_times[idx]
                    self._best_iter = i + idx
                    self._best_config = conf

            i += len(results)
            # remaining budget before early stopping would trigger
            self.__ttl = min(early_stopping + self.best_iter, self._space.length) - i

            start = time.time()
            # update
            for res in results:
                self._xs.append(res[0])
                self._ys.append(res[1])
            if output_file:
                configs = [(self._space.get(res[0]).input, res[1]) for res in results]
                desc = str(self._runner.op_desc)
                self.export_configs(configs, output_file, desc=desc)
            # if we have enough new training samples
            if len(self._xs) >= self.__plan_size * (self.__train_ct + 1):
                p = Process(target=self.model_res)
                p.start()
                p.join()
                self._trial_pt = 0
                self.__train_ct += 1

            end = time.time()
            logger.debug('model running time: %f seconds', end - start)
            self.__model_run_time += end - start

            iter_end = time.time()
            logger.debug('iter time: %f seconds', iter_end - iter_start)

            # converged: no improvement within the early-stopping window
            if self._best_iter > 0 and i >= self.best_iter + early_stopping:
                logger.debug('Early stopped. Best iter: %d', self._best_iter)
                return

            print("tuning time already, ", time.time() - tuning_start)
            # hard wall-clock cap of 2 hours
            # NOTE(review): the early returns above/here skip the
            # _tuning_time accumulation at the end -- confirm intent.
            if time.time() - tuning_start > 7200:
                logger.debug('Early stopped because of too long time. Best iter: %d', self._best_iter)
                return

            if error_ct > 150:
                logging.warning('Too many errors happen in the tuning. Now is in debug mode')
                logger.setLevel(logging.DEBUG)
            else:
                logger.setLevel(old_level)

        self._tuning_time += time.time() - tuning_start

+ 9
- 0
tests/fuzz/tune_for_gpu/autotuning/tuning_attrs_descs/reduce_tuning_attrs_desc.json View File

@@ -0,0 +1,9 @@
{
"enable_atomic_add": {
"dtype": "bool",
"options": [
"False",
"True"
]
}
}

+ 155
- 0
tests/fuzz/tune_for_gpu/autotuning/tuning_utils.py View File

@@ -0,0 +1,155 @@
from collections import namedtuple
import os
import logging


def get_block_str_from_config(config: namedtuple):
    """Concatenate the block_x/block_y/block_z fields of config, each followed
    by a space; fields the config does not define are skipped."""
    fields = getattr(config, "_fields")
    pieces = [str(getattr(config, axis))
              for axis in ("block_x", "block_y", "block_z")
              if axis in fields]
    return "".join(piece + " " for piece in pieces)


def get_thread_str_from_config(config: namedtuple):
    """Concatenate the thread_x/thread_y/thread_z fields of config, each
    followed by a space; fields the config does not define are skipped."""
    fields = getattr(config, "_fields")
    pieces = [str(getattr(config, axis))
              for axis in ("thread_x", "thread_y", "thread_z")
              if axis in fields]
    return "".join(piece + " " for piece in pieces)


def get_parallel_build_num():
    """Get the parallel-build worker count from the BUILD_PARALLEL_NUM env var.

    Returns:
        int: the configured value, or 1 when the variable is unset, empty,
        or not a valid integer.

    Fix: the original caught NameError, which int() never raises; a
    non-numeric value raised an uncaught ValueError. ValueError is now
    handled and falls back to serial build.
    """
    value = os.environ.get('BUILD_PARALLEL_NUM')
    if not value:
        return 1
    try:
        return int(value)
    except ValueError as e:
        logging.error(e)
        return 1


def get_available_gpu_num():
    """Get the list of usable GPU device ids from the USE_GPU_DEVICES env var.

    USE_GPU_DEVICES is a comma-separated list of integer device ids.

    Returns:
        list: device ids, defaulting to [0, ] when the variable is unset,
        empty, or malformed.

    Fixes: the original caught NameError, which int() never raises, so a
    malformed list raised an uncaught ValueError; and the error path
    returned the int 1 while every other path returns a list. Both paths
    now return a list.
    """
    value = os.environ.get('USE_GPU_DEVICES')
    if not value:
        return [0, ]
    try:
        return [int(dev_id) for dev_id in value.split(",")]
    except ValueError as e:
        logging.error(e)
        return [0, ]

def get_real_attr(value, key, need_tune_json, need_tune_keys):
    """Map a tuned option index back to the concrete attribute value.

    Args:
        value: option index chosen by the tuner (returned unchanged when key
            is not being tuned).
        key: attribute name.
        need_tune_json: dict of attr descriptions ({'dtype': ..., 'options': [...]}).
        need_tune_keys: keys of need_tune_json currently under tuning.

    Returns:
        The concrete attribute value (bool, str or int).

    Raises:
        TypeError: when the selected option does not match the declared
            dtype, or the dtype itself is unsupported (previously an
            unsupported dtype silently fell through and returned None).
    """
    if key not in need_tune_keys:
        return value
    desc = need_tune_json[key]
    dtype = desc['dtype']
    option = desc['options'][value]
    if dtype == "bool":
        if option.lower() == "true":
            return True
        if option.lower() == "false":
            return False
        raise TypeError("Wrong boolean type, please check json file")
    if dtype == "str":
        if isinstance(option, str):
            return option
        raise TypeError("Wrong str type, please check json file")
    if dtype == "int":
        if isinstance(option, int):
            return option
        raise TypeError("Wrong int type, please check json file")
    raise TypeError("Unsupported dtype '%s', please check json file" % dtype)


def merge_attrs(attrs, config, need_tune_json):
    """Merge a sampled tuning config into an attrs namedtuple.

    Builds the 'dim', 'bind_block' and 'bind_thread' attribute strings from
    the tiling_* / block_* / thread_* fields of config, copies the tunable
    keys from config, and returns a new namedtuple of the same type as attrs.
    """
    # all tiling_* field values, in declaration order
    tiling = [getattr(config, name) for name in getattr(
        config, '_fields') if name.startswith('tiling')]
    dim_str = ''
    d_config = config._asdict()
    d_attrs = attrs._asdict()
    # 2-D tiling fields contain two underscores (e.g. tiling_0_1);
    # checking the first tiling field is enough
    is_2d_tiling = False
    for name in getattr(config, '_fields'):
        if name.startswith('tiling'):
            if name.count("_") == 2:
                is_2d_tiling = True
            break
    for i, element in enumerate(tiling):
        if is_2d_tiling:
            # pairs of values belong to one axis: "0 <axis> <v1> <v2> "
            if i % 2 == 0:
                dim_str += "0 " + str(i//2) + " "
            dim_str += str(element) + " "
        else:
            # 1d tiling
            dim_str += "0 " + str(i) + " " + str(element) + " 1 "

    # add block, thread information
    block = [str(getattr(config, name)) for name in getattr(
        config, '_fields') if name.startswith('block')]
    bind_block_str = ' '.join(block)

    thread = [str(getattr(config, name)) for name in getattr(
        config, '_fields') if name.startswith('thread')]
    bind_thread_str = ' '.join(thread)

    d_attrs['dim'] = dim_str
    d_attrs['bind_block'] = bind_block_str
    d_attrs['bind_thread'] = bind_thread_str

    # copy every tunable attribute value straight from the config
    need_tune_keys = need_tune_json.keys()
    for key in need_tune_keys:
        d_attrs[key] = d_config[key]

    # make a new attrs with config info
    attrs_type = type(attrs)
    config_list = [get_real_attr(d_attrs[k],k,need_tune_json, need_tune_keys) for k in d_attrs]
    new_attrs = attrs_type(*config_list)
    return new_attrs


def get_skip_configs_from_log(skip_configs_log):
    """Collect the set of config strings to skip from a tuning log file.

    Each useful log line looks like ``<prefix>|<config>|<rest>``: the second
    '|'-separated field is the config string. Lines that do not contain a
    '|' separator (blank lines, stray output) are ignored instead of
    raising IndexError, which the previous unguarded ``split("|")[1]`` did.

    Args:
        skip_configs_log (str): path of the log file; "" means no skip list.

    Returns:
        set: config strings to skip (empty when no log file is given).
    """
    skip_config_set = set()
    if skip_configs_log != "":
        with open(skip_configs_log, 'r') as file:
            for line in file:
                parts = line.split("|")
                if len(parts) < 2:
                    # malformed or blank line: nothing to extract
                    continue
                skip_config_set.add(str(parts[1]).strip())
        print("SKIP CONFIGS NUMBER:", len(skip_config_set))
    return skip_config_set

def get_tuning_attrs_from_json(tuning_attrs_json):
    """Load the tunable-attribute search space description from a json file.

    The json maps each tunable key to a dict containing at least an
    'options' list (and a 'dtype'); the search space is the cartesian
    product of the option indices of every key.

    Args:
        tuning_attrs_json (str): path of the json file; "" means nothing to tune.

    Returns:
        tuple: ``(keys, need_tune_spaces, json_string)`` where ``keys`` is the
        list of tunable names, ``need_tune_spaces`` is the list of option-index
        combinations (``[[]]`` when there is nothing to tune, preserving the
        previous behavior), and ``json_string`` is the parsed json dict.
    """
    import json
    import itertools
    json_string = dict()
    if tuning_attrs_json != "":
        with open(tuning_attrs_json, 'r') as file:
            json_string = json.load(file)
    keys = list(json_string.keys())
    # cartesian product of option indices; itertools.product enumerates with
    # the leftmost key varying slowest, the same order the previous
    # key-by-key expansion produced
    option_ranges = [range(len(json_string[key]['options'])) for key in keys]
    need_tune_spaces = [list(combo) for combo in itertools.product(*option_ranges)]
    return (keys, need_tune_spaces, json_string)

if __name__ == "__main__":
    """test components"""
    file_name = "tuning_attrs_descs/reduce_tuning_attrs_desc.json"
    # get_tuning_attrs_from_json returns a 3-tuple (keys, spaces, json dict);
    # the previous 2-name unpack raised ValueError before anything printed.
    keys, need_tune_spaces, need_tune_json = get_tuning_attrs_from_json(file_name)
    print(keys)
    print(need_tune_spaces)

+ 49
- 0
tests/fuzz/tune_for_gpu/autotuning/type_definitions.py View File

@@ -0,0 +1,49 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""operator description and config param definitions"""
from collections import namedtuple

# ---- operator descriptions (ascend) ----

# convolution forward: feature map / filter shapes plus conv hyper-params
ConvDesc = namedtuple("ConvDesc", [
    "fmap_shape",
    "filter_shape",
    "pad",
    "stride",
    "dilation",
    "use_bias",
])

# convolution backprop (shared by input/filter gradients)
ConvBackpropDesc = namedtuple("ConvBackpropDesc", [
    "fmap_shape",
    "filter_shape",
    "pad",
    "stride",
    "dilation",
])

# cube matmul: operand shapes, formats, transpose flags and dtypes
MatmulCubeDesc = namedtuple("MatmulCubeDesc", [
    "x_shape", "y_shape", "bias",
    "left_format", "right_format", "out_format",
    "adj_x", "adj_y",
    "dtype", "bias_dtype", "out_dtype",
])


# ---- operator descriptions (gpu) ----

# gpu reduce op: input/axis info plus poly-scheduling and mapping attrs
ReduceGpuDesc = namedtuple("ReduceGpuDesc", [
    "in_shape", "in_dtype", "axis", "keepdims",
    "poly_sch", "dim", "bind_block", "bind_thread",
    "enable_akg_reduce_lib", "enable_atomic_add",
])


# ---- tuning config params (ascend) ----

ConvConfig = namedtuple("ConvConfig", [
    "tile_h", "tile_co", "tile_m", "tile_k", "tile_n", "tile_w", "bypass",
])
ConvBackpropInputConfig = namedtuple("ConvBackpropInputConfig", [
    "tile_h", "tile_co", "tile_m", "tile_k", "tile_n", "tile_w",
])
ConvBackpropFilterConfig = namedtuple("ConvBackpropFilterConfig", [
    "tile_ci", "tile_kh", "tile_kw", "tile_co", "tile_batch",
    "tile_h", "tile_w", "tile_m", "tile_k", "tile_n",
])
MatmulCubeConfig = namedtuple("MatmulCubeConfig", [
    "n_l1", "n_l0", "m_l1", "m_l0", "k_l1", "k_l0", "bypass",
])

# ---- tuning config params (gpu) ----

# placeholder config with no tunable fields
EmptyConfig = namedtuple("empty", [])

+ 16
- 0
tests/fuzz/tune_for_gpu/config_gpu.sh View File

@@ -0,0 +1,16 @@
# Environment configuration sourced before running the gpu auto-tuning jobs.

# how many multi-processing to build
export BUILD_PARALLEL_NUM=4

# set the default gpu devices, plz never change it
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# set the real devices you want to use
export USE_GPU_DEVICES=0,1,2,3

# run kernels on gpu (as opposed to ascend)
export RUNTIME_MODE=gpu

# collect per-kernel timing during tuning
export PROFILING_MODE=true

# ascend config
export DEVICE_ID=0
export DEVICE_TOTAL_NUM=8

+ 67
- 0
tests/fuzz/tune_for_gpu/test_gpu.py View File

@@ -0,0 +1,67 @@
# Copyright 2019-2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""test"""
import time
from autotuning.job import launch
from akg.utils import kernel_exec
from akg.ops.math_gpu import reduce_sum
from autotuning.type_definitions import ReduceGpuDesc
import numpy as np
import sys
import argparse
from autotuning.tuning_utils import get_skip_configs_from_log, get_tuning_attrs_from_json


def reduce_sum_gpu_execute(in_shape, dtype, axis=None, keepdims=False, attrs=False):
    """Build the reduce_sum gpu kernel module for one tuning trial.

    Args:
        in_shape: input tensor shape tuple.
        dtype: input data type string, e.g. "float32".
        axis: axes to reduce over.
        keepdims: whether reduced axes are kept with size 1.
        attrs: extra build attrs (currently unused by this builder).

    Returns:
        The built kernel module.
    """
    # fix: the original body called `utils.op_build_test` with `in_dtype`,
    # but this module imports `kernel_exec` (not `utils`) and the parameter
    # is named `dtype` — both raised NameError on every call.
    mod = kernel_exec.op_build_test(reduce_sum, (in_shape, ), (dtype, ),
                                    kernel_name="reduce_sum_gpu", op_attrs=[axis, keepdims],
                                    attrs={"target": "cuda", "enable_akg_reduce_lib": True})
    return mod

def run_test_reduce_sum(in_shape, in_dtype, axis=None, keepdims=False, skip_config_set=None, tuning_attrs_info=None):
    """Launch the auto-tuning job for reduce_sum on gpu and print total time.

    Args:
        in_shape: input tensor shape tuple.
        in_dtype: input data type string, e.g. "float32".
        axis: axes to reduce over.
        keepdims: whether reduced axes are kept with size 1.
        skip_config_set: configs (from a previous log) to skip during tuning.
        tuning_attrs_info: (keys, spaces, json) tuple from get_tuning_attrs_from_json.
    """
    time_start = time.time()
    op_type_ = 'reduce_sum_gpu'
    debug_mode_ = True
    save_res_ = True
    all_space_ = True
    # ReduceGpuDesc fields: in_shape, in_dtype, axis, keepdims, poly_sch,
    # dim, bind_block, bind_thread, enable_akg_reduce_lib, enable_atomic_add.
    # NOTE(review): as written, poly_sch receives "" and bind_thread receives
    # True — this looks shifted by one slot (poly_sch=True with empty
    # dim/bind_block/bind_thread seems intended); confirm against launch().
    op_config = [in_shape, in_dtype, axis, keepdims,
                 "", "", "",
                 True, True, True]
    op_config = ReduceGpuDesc(*op_config)
    # desc: (op name, module builder, op description, tuning attrs)
    desc_ = ('reduce_sum_gpu', reduce_sum_gpu_execute,
             op_config, tuning_attrs_info)
    launch(op_type=op_type_, debug_mode=debug_mode_,
           save_res=save_res_, desc=desc_, all_space=all_space_,
           from_json=False, skip_config_set=skip_config_set,
           tuning_attrs_info=tuning_attrs_info)
    time_end = time.time()
    print("total tuning time: ", time_end - time_start)

if __name__ == "__main__":
    # command line: optional skip-list log and tuning-attrs json description
    parser = argparse.ArgumentParser()
    parser.add_argument("--skip_configs_log", type=str,
                        default="", help="skip those configs in .log file")
    parser.add_argument("--tuning_attrs_json", type=str, default="",
                        help="the json file to describe the tuning atttrs")
    args = parser.parse_args()

    # check whether have configs need to skip
    skip_config_set = get_skip_configs_from_log(args.skip_configs_log)

    # add tuning_attrs from json file
    tuning_attrs_info = get_tuning_attrs_from_json(args.tuning_attrs_json)

    # tune reduce_sum over a 1024x1024 float32 input, reducing axis 1
    run_test_reduce_sum((1024, 1024), "float32", (1,),
                        False, skip_config_set=skip_config_set, tuning_attrs_info=tuning_attrs_info)

+ 1
- 0
tests/st/composite/need_adapt/Fused_Cast_Cast_Mul_TensorAdd___12292245117929986167.info View File

@@ -0,0 +1 @@
{"composite":true,"composite_graph":"11288","input_desc":[[{"data_type":"float32","shape":[1024],"tensor_name":"input_1"}],[{"data_type":"float32","shape":[1024],"tensor_name":"input_0"}],[{"data_type":"float16","shape":[8192,1024],"tensor_name":"input_2"}]],"op":"Fused_Cast_Cast_Mul_TensorAdd___12292245117929986167","op_desc":[{"attr":[{"name":"dst_type","value":"float16"}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1024],"tensor_name":"input_0"}]],"name":"Cast","output_desc":[{"data_type":"float16","name":"output","shape":[1024],"tensor_name":"output_0_0"}]},{"attr":[{"name":"dst_type","value":"float16"}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1024],"tensor_name":"input_1"}]],"name":"Cast","output_desc":[{"data_type":"float16","name":"output","shape":[1024],"tensor_name":"output_0_1"}]},{"attr":[{"name":"x_shape","value":[8192,1024]},{"name":"y_shape","value":[1024]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float16","name":"x","shape":[8192,1024],"tensor_name":"input_2"}],[{"data_type":"float16","name":"y","shape":[1024],"tensor_name":"output_0_1"}]],"name":"Mul","output_desc":[{"data_type":"float16","name":"output","shape":[8192,1024],"tensor_name":"output_0_2"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float16","name":"x","shape":[8192,1024],"tensor_name":"output_0_2"}],[{"data_type":"float16","name":"y","shape":[1024],"tensor_name":"output_0_0"}]],"name":"TensorAdd","output_desc":[{"data_type":"float16","name":"output","shape":[8192,1024],"tensor_name":"output_0_3"}]}],"output_desc":[{"data_type":"float16","shape":[1024],"tensor_name":"output_0_1"},{"data_type":"float16","shape":[8192,1024],"tensor_name":"output_0_3"}],"platform":"AKG","process":"aicore"}

+ 1
- 0
tests/st/composite/need_adapt/Fused_Cast_RealDiv_Mul_TensorAdd_split_16909220147165618805.info View File

@@ -0,0 +1 @@
{"composite":true,"composite_graph":"44349.44349","id":1550,"input_desc":[[{"data_type":"float16","format":"DefaultFormat","shape":[4,1024,1],"tensor_name":"input_2"}],[{"data_type":"float16","format":"DefaultFormat","shape":[1024],"tensor_name":"input_4"}],[{"data_type":"float16","format":"DefaultFormat","shape":[4,1024,1024],"tensor_name":"input_1"}],[{"data_type":"float32","format":"DefaultFormat","shape":[1024],"tensor_name":"input_0"}]],"op":"Fused_Cast_RealDiv_Mul_TensorAdd_split_16909220147165618805","op_desc":[{"attr":[{"data_type":"bool","name":"is_backed_cast","value":false},{"data_type":"str","name":"pri_format","value":"NC1HWC0"},{"data_type":"str","name":"dst_type","value":"float16"}],"impl_path":"","input_desc":[[{"data_type":"float32","format":"DefaultFormat","name":"input_0","shape":[1024],"tensor_name":"input_0"}]],"name":"Cast","output_desc":[{"data_type":"float16","format":"DefaultFormat","name":"output","shape":[1024],"tensor_name":"output_0_0"}]},{"attr":[{"data_type":"str","name":"pri_format","value":"DefaultFormat"}],"impl_path":"","input_desc":[[{"data_type":"float16","format":"DefaultFormat","name":"input_0","shape":[4,1024,1024],"tensor_name":"input_1"}],[{"data_type":"float16","format":"DefaultFormat","name":"input_1","shape":[4,1024,1],"tensor_name":"input_2"}]],"name":"RealDiv","output_desc":[{"data_type":"float16","format":"DefaultFormat","name":"output","shape":[4,1024,1024],"tensor_name":"output_0_1"}]},{"attr":[{"data_type":"str","name":"pri_format","value":"DefaultFormat"}],"impl_path":"","input_desc":[[{"data_type":"float16","format":"DefaultFormat","name":"input_0","shape":[4,1024,1024],"tensor_name":"output_0_1"}],[{"data_type":"float16","format":"DefaultFormat","name":"input_1","shape":[1024],"tensor_name":"input_4"}]],"name":"Mul","output_desc":[{"data_type":"float16","format":"DefaultFormat","name":"output","shape":[4,1024,1024],"tensor_name":"output_0_2"}]},{"attr":[{"data_type":"str","name":"pri_format","value":"DefaultForma
t"}],"impl_path":"","input_desc":[[{"data_type":"float16","format":"DefaultFormat","name":"input_0","shape":[4,1024,1024],"tensor_name":"output_0_2"}],[{"data_type":"float16","format":"DefaultFormat","name":"input_1","shape":[1024],"tensor_name":"output_0_0"}]],"name":"TensorAdd","output_desc":[{"data_type":"float16","format":"DefaultFormat","name":"output","shape":[4,1024,1024],"tensor_name":"output_0_3"}]}],"output_desc":[{"data_type":"float16","format":"DefaultFormat","shape":[4,1024,1024],"tensor_name":"output_0_1"},{"data_type":"float16","format":"DefaultFormat","shape":[4,1024,1024],"tensor_name":"output_0_3"}],"platform":"AKG","process":"aicore"}

+ 1
- 0
tests/st/composite/need_adapt/Fused_ClipByNormNoDivSum_RealDiv_fusion_4181144419579591378.info View File

@@ -0,0 +1 @@
{"composite":true,"composite_graph":"26625.40461","id":2555,"input_desc":[[{"data_type":"float32","format":"DefaultFormat","shape":[1,1],"tensor_name":"input_0"}],[{"data_type":"float32","format":"DefaultFormat","shape":[1024,1024],"tensor_name":"input_8"}]],"op":"Fused_ClipByNormNoDivSum_RealDiv_fusion_4181144419579591378","op_desc":[{"attr":[{"data_type":"str","name":"fusion","value":"SelectGT_000"}],"fusion":"SelectGT_000","impl_path":"","input_desc":[[{"data_type":"float32","format":"DefaultFormat","name":"input_0","shape":[1,1],"tensor_name":"input_0"}],[{"data_type":"float32","format":"DefaultFormat","name":"input_1","shape":[1],"tensor_name":"input_1","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"float32","format":"DefaultFormat","name":"output","shape":[1,1],"tensor_name":"output_0_0"}]},{"attr":[{"data_type":"str","name":"fusion","value":"SelectGT_000_end"}],"fusion":"SelectGT_000_end","impl_path":"","input_desc":[[{"data_type":"float32","format":"DefaultFormat","name":"input_0","shape":[1,1],"tensor_name":"output_0_0"}],[{"data_type":"float32","format":"DefaultFormat","name":"input_1","shape":[1,1],"tensor_name":"input_0"}],[{"data_type":"float32","format":"DefaultFormat","name":"input_2","shape":[1],"tensor_name":"input_3","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","format":"DefaultFormat","name":"output","shape":[1,1],"tensor_name":"output_0_1"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","format":"DefaultFormat","name":"input_0","shape":[1,1],"tensor_name":"output_0_1"}]],"name":"Sqrt","output_desc":[{"data_type":"float32","format":"DefaultFormat","name":"output","shape":[1,1],"tensor_name":"output_0_2"}]},{"attr":[{"data_type":"str","name":"fusion","value":"SelectGT_000_end"}],"fusion":"SelectGT_000_end","impl_path":"","input_desc":[[{"data_type":"float32","format":"DefaultFormat","name":"input_0","shape":[1,1],"tensor_name":"output_0_0"}],[{"data_type":"float32","format":"DefaultFormat",
"name":"input_1","shape":[1,1],"tensor_name":"output_0_2"}],[{"data_type":"float32","format":"DefaultFormat","name":"input_2","shape":[1,1],"tensor_name":"input_0"}]],"name":"Select","output_desc":[{"data_type":"float32","format":"DefaultFormat","name":"output","shape":[1,1],"tensor_name":"output_0_3"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","format":"DefaultFormat","name":"input_0","shape":[1,1],"tensor_name":"output_0_3"}],[{"data_type":"float32","format":"DefaultFormat","name":"input_1","shape":[1],"tensor_name":"input_7","value":1.0}]],"name":"Maximum","output_desc":[{"data_type":"float32","format":"DefaultFormat","name":"output","shape":[1,1],"tensor_name":"output_0_4"}]},{"attr":[{"data_type":"str","name":"pri_format","value":"FRACTAL_NZ"}],"impl_path":"","input_desc":[[{"data_type":"float32","format":"DefaultFormat","name":"input_0","shape":[1024,1024],"tensor_name":"input_8"}],[{"data_type":"float32","format":"DefaultFormat","name":"input_1","shape":[1,1],"tensor_name":"output_0_4"}]],"name":"RealDiv","output_desc":[{"data_type":"float32","format":"DefaultFormat","name":"output","shape":[1024,1024],"tensor_name":"output_0_5"}]}],"output_desc":[{"data_type":"float32","format":"DefaultFormat","shape":[1024,1024],"tensor_name":"output_0_5"}],"platform":"AKG","process":"aicore"}

+ 1
- 0
tests/st/composite/need_adapt/Fused_ClipByNormNoDivSum_RealDiv_fusion_8238389606767005164.info View File

@@ -0,0 +1 @@
{"composite":true,"composite_graph":"26625.40534","id":2566,"input_desc":[[{"data_type":"float32","format":"DefaultFormat","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","format":"DefaultFormat","shape":[4096],"tensor_name":"input_7"}]],"op":"Fused_ClipByNormNoDivSum_RealDiv_fusion_8238389606767005164","op_desc":[{"attr":[{"data_type":"str","name":"fusion","value":"SelectGT_000"}],"fusion":"SelectGT_000","impl_path":"","input_desc":[[{"data_type":"float32","format":"DefaultFormat","name":"input_0","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","format":"DefaultFormat","name":"input_1","shape":[1],"tensor_name":"input_1","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"float32","format":"DefaultFormat","name":"output","shape":[1],"tensor_name":"output_0_0"}]},{"attr":[{"data_type":"str","name":"fusion","value":"SelectGT_000_end"}],"fusion":"SelectGT_000_end","impl_path":"","input_desc":[[{"data_type":"float32","format":"DefaultFormat","name":"input_0","shape":[1],"tensor_name":"output_0_0"}],[{"data_type":"float32","format":"DefaultFormat","name":"input_1","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","format":"DefaultFormat","name":"input_2","shape":[1],"tensor_name":"input_3","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","format":"DefaultFormat","name":"output","shape":[1],"tensor_name":"output_0_1"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","format":"DefaultFormat","name":"input_0","shape":[1],"tensor_name":"output_0_1"}]],"name":"Sqrt","output_desc":[{"data_type":"float32","format":"DefaultFormat","name":"output","shape":[1],"tensor_name":"output_0_2"}]},{"attr":[{"data_type":"str","name":"fusion","value":"SelectGT_000_end"}],"fusion":"SelectGT_000_end","impl_path":"","input_desc":[[{"data_type":"float32","format":"DefaultFormat","name":"input_0","shape":[1],"tensor_name":"output_0_0"}],[{"data_type":"float32","format":"DefaultFormat","name":"input_1","shape
":[1],"tensor_name":"output_0_2"}],[{"data_type":"float32","format":"DefaultFormat","name":"input_2","shape":[1],"tensor_name":"input_0"}]],"name":"Select","output_desc":[{"data_type":"float32","format":"DefaultFormat","name":"output","shape":[1],"tensor_name":"output_0_3"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","format":"DefaultFormat","name":"input_0","shape":[1],"tensor_name":"output_0_3"}],[{"data_type":"float32","format":"DefaultFormat","name":"input_1","shape":[1],"tensor_name":"input_3","value":1.0}]],"name":"Maximum","output_desc":[{"data_type":"float32","format":"DefaultFormat","name":"output","shape":[1],"tensor_name":"output_0_4"}]},{"attr":[{"data_type":"str","name":"pri_format","value":"NC1HWC0"}],"impl_path":"","input_desc":[[{"data_type":"float32","format":"DefaultFormat","name":"input_0","shape":[4096],"tensor_name":"input_7"}],[{"data_type":"float32","format":"DefaultFormat","name":"input_1","shape":[1],"tensor_name":"output_0_4"}]],"name":"RealDiv","output_desc":[{"data_type":"float32","format":"DefaultFormat","name":"output","shape":[4096],"tensor_name":"output_0_5"}]}],"output_desc":[{"data_type":"float32","format":"DefaultFormat","shape":[4096],"tensor_name":"output_0_5"}],"platform":"AKG","process":"aicore"}

+ 1
- 0
tests/st/composite/need_adapt/Fused_LambNextMV_10229161408386697243.info
File diff suppressed because it is too large
View File


+ 1
- 0
tests/st/composite/need_adapt/Fused_LambNextMV_10637164683062061938.info
File diff suppressed because it is too large
View File


+ 1
- 0
tests/st/composite/need_adapt/Fused_LambNextMV_11007228773993183427.info
File diff suppressed because it is too large
View File


+ 1
- 0
tests/st/composite/need_adapt/Fused_LambNextMV_12006221044534455340.info
File diff suppressed because it is too large
View File


+ 1
- 0
tests/st/composite/need_adapt/Fused_LambNextMV_13257561028613500504.info
File diff suppressed because it is too large
View File


+ 1
- 0
tests/st/composite/need_adapt/Fused_LambNextMV_13769955845847610041.info
File diff suppressed because it is too large
View File


+ 1
- 0
tests/st/composite/need_adapt/Fused_LambNextMV_14492938012907533443.info
File diff suppressed because it is too large
View File


+ 1
- 0
tests/st/composite/need_adapt/Fused_LambNextMV_14969106078297683510.info
File diff suppressed because it is too large
View File


+ 1
- 0
tests/st/composite/need_adapt/Fused_LambNextMV_17143661508892073848.info
File diff suppressed because it is too large
View File


+ 1
- 0
tests/st/composite/need_adapt/Fused_LambNextMV_17552201251937562766.info
File diff suppressed because it is too large
View File


+ 1
- 0
tests/st/composite/need_adapt/Fused_LambNextMV_1932853756890330796.info
File diff suppressed because it is too large
View File


+ 1
- 0
tests/st/composite/need_adapt/Fused_LambNextMV_2109265793585062708.info
File diff suppressed because it is too large
View File


+ 1
- 0
tests/st/composite/need_adapt/Fused_LambNextMV_312286377788017483.info
File diff suppressed because it is too large
View File


+ 1
- 0
tests/st/composite/need_adapt/Fused_LambNextMV_6348844499000494196.info
File diff suppressed because it is too large
View File


+ 1
- 0
tests/st/composite/need_adapt/Fused_LambUpdateWithLR_12073466097680829202.info View File

@@ -0,0 +1 @@
{"composite":true,"composite_graph":"1907_sp_1847_1787_1728_1704_1680_1656_1587_sp_552_construct","input_desc":[[{"data_type":"float32","shape":[],"tensor_name":"input_2"}],[{"data_type":"float32","shape":[],"tensor_name":"input_0"}],[{"data_type":"float32","shape":[],"tensor_name":"input_1"}],[{"data_type":"float32","shape":[1],"tensor_name":"input_14"}],[{"data_type":"float32","shape":[2],"tensor_name":"input_16"}],[{"data_type":"float32","shape":[2],"tensor_name":"input_17"}]],"op":"Fused_LambUpdateWithLR_12073466097680829202","op_desc":[{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_1"}]],"name":"RealDiv","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_0"}],"process":"aicore"},{"attr":null,"fusion":"SelectGT_0019","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_1"}],"process":"aicore"},{"attr":null,"fusion":"SelectGT_0019_end","impl_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_1"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_6","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_2"}],"process":"aicore"},{"attr":null,"fusion":"SelectGT_0019","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_3"}],"
process":"aicore"},{"attr":null,"fusion":"SelectGT_0009_end","impl_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_3"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_9","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_4"}],"process":"aicore"},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_4"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_11","value":10.0}]],"name":"Minimum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_5"}],"process":"aicore"},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_5"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Maximum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_6"}],"process":"aicore"},{"attr":[{"name":"x_shape","value":[]},{"name":"y_shape","value":[1]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_6"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_14"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[1],"tensor_name":"output_0_7"}],"process":"aicore"},{"attr":[{"name":"x_shape","value":[1]},{"name":"y_shape","value":[2]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_7"}],[{"data_type":"float32","name":"y","shape":[2],"tensor_name":"input_16"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[2],"tensor_name":"output_0_8
"}],"process":"aicore"},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[2],"tensor_name":"input_17"}],[{"data_type":"float32","name":"y","shape":[2],"tensor_name":"output_0_8"}]],"name":"Sub","output_desc":[{"data_type":"float32","name":"output","shape":[2],"tensor_name":"output_0_9"}],"process":"aicore"},{"attr":[{"name":"fake_output","value":true}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[2],"tensor_name":"input_17"}],[{"data_type":"float32","name":"y","shape":[2],"tensor_name":"output_0_9"}],[{"data_type":"float32","name":"z","shape":[2],"tensor_name":"output_0_9"}]],"name":"InplaceAssign","output_desc":[{"data_type":"float32","name":"output","shape":[2],"tensor_name":"output_0_10"}],"process":"aicore"}],"output_desc":[{"data_type":"float32","shape":[2],"tensor_name":"output_0_10"}],"platform":"AKG","process":"aicore"}

+ 1
- 0
tests/st/composite/need_adapt/Fused_LambUpdateWithLR_13492243466190004284.info View File

@@ -0,0 +1 @@
{"composite":true,"composite_graph":"9739_sp_9282_8825_8368_7912_7889_7864_sp_2735_construct","input_desc":[[{"data_type":"float32","shape":[],"tensor_name":"input_2"}],[{"data_type":"float32","shape":[],"tensor_name":"input_0"}],[{"data_type":"float32","shape":[],"tensor_name":"input_1"}],[{"data_type":"float32","shape":[1],"tensor_name":"input_14"}],[{"data_type":"float32","shape":[512,1024],"tensor_name":"input_16"}],[{"data_type":"float32","shape":[512,1024],"tensor_name":"input_17"}]],"op":"Fused_LambUpdateWithLR_13492243466190004284","op_desc":[{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_1"}]],"name":"RealDiv","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_0"}]},{"attr":null,"fusion":"SelectGT_0013","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_1"}]},{"attr":null,"fusion":"SelectGT_0013_end","impl_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_1"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_6","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_2"}]},{"attr":null,"fusion":"SelectGT_0013","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_3"}]},{"attr":null,"fusion":"SelectGT_0002_end","impl
_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_3"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_9","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_4"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_4"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_11","value":10.0}]],"name":"Minimum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_5"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_5"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Maximum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_6"}]},{"attr":[{"name":"x_shape","value":[]},{"name":"y_shape","value":[1]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_6"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_14"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[1],"tensor_name":"output_0_7"}]},{"attr":[{"name":"x_shape","value":[1]},{"name":"y_shape","value":[512,1024]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_7"}],[{"data_type":"float32","name":"y","shape":[512,1024],"tensor_name":"input_16"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[512,1024],"tensor_name":"output_0_8"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[512,1024],"tensor_name":"input
_17"}],[{"data_type":"float32","name":"y","shape":[512,1024],"tensor_name":"output_0_8"}]],"name":"Sub","output_desc":[{"data_type":"float32","name":"output","shape":[512,1024],"tensor_name":"output_0_9"}]},{"attr":[{"name":"fake_output","value":true}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[512,1024],"tensor_name":"input_17"}],[{"data_type":"float32","name":"y","shape":[512,1024],"tensor_name":"output_0_9"}],[{"data_type":"float32","name":"z","shape":[512,1024],"tensor_name":"output_0_9"}]],"name":"InplaceAssign","output_desc":[{"data_type":"float32","name":"output","shape":[512,1024],"tensor_name":"output_0_10"}]}],"output_desc":[{"data_type":"float32","shape":[512,1024],"tensor_name":"output_0_10"}],"platform":"AKG","process":"aicore"}

+ 1
- 0
tests/st/composite/need_adapt/Fused_LambUpdateWithLR_1445905573061742177.info View File

@@ -0,0 +1 @@
{"composite":true,"composite_graph":"1944_sp_1884_1824_1747_1716_1692_1668_1612_sp_655_construct","input_desc":[[{"data_type":"float32","shape":[],"tensor_name":"input_2"}],[{"data_type":"float32","shape":[],"tensor_name":"input_0"}],[{"data_type":"float32","shape":[],"tensor_name":"input_1"}],[{"data_type":"float32","shape":[1],"tensor_name":"input_14"}],[{"data_type":"float32","shape":[4096],"tensor_name":"input_16"}],[{"data_type":"float32","shape":[4096],"tensor_name":"input_17"}]],"op":"Fused_LambUpdateWithLR_1445905573061742177","op_desc":[{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_1"}]],"name":"RealDiv","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_0"}],"process":"aicore"},{"attr":null,"fusion":"SelectGT_0016","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_1"}],"process":"aicore"},{"attr":null,"fusion":"SelectGT_0016_end","impl_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_1"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_6","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_2"}],"process":"aicore"},{"attr":null,"fusion":"SelectGT_0016","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_3
"}],"process":"aicore"},{"attr":null,"fusion":"SelectGT_0006_end","impl_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_3"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_9","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_4"}],"process":"aicore"},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_4"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_11","value":10.0}]],"name":"Minimum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_5"}],"process":"aicore"},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_5"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Maximum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_6"}],"process":"aicore"},{"attr":[{"name":"x_shape","value":[]},{"name":"y_shape","value":[1]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_6"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_14"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[1],"tensor_name":"output_0_7"}],"process":"aicore"},{"attr":[{"name":"x_shape","value":[1]},{"name":"y_shape","value":[4096]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_7"}],[{"data_type":"float32","name":"y","shape":[4096],"tensor_name":"input_16"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[4096],"tensor_nam
e":"output_0_8"}],"process":"aicore"},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[4096],"tensor_name":"input_17"}],[{"data_type":"float32","name":"y","shape":[4096],"tensor_name":"output_0_8"}]],"name":"Sub","output_desc":[{"data_type":"float32","name":"output","shape":[4096],"tensor_name":"output_0_9"}],"process":"aicore"},{"attr":[{"name":"fake_output","value":true}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[4096],"tensor_name":"input_17"}],[{"data_type":"float32","name":"y","shape":[4096],"tensor_name":"output_0_9"}],[{"data_type":"float32","name":"z","shape":[4096],"tensor_name":"output_0_9"}]],"name":"InplaceAssign","output_desc":[{"data_type":"float32","name":"output","shape":[4096],"tensor_name":"output_0_10"}],"process":"aicore"}],"output_desc":[{"data_type":"float32","shape":[4096],"tensor_name":"output_0_10"}],"platform":"AKG","process":"aicore"}

+ 1
- 0
tests/st/composite/need_adapt/Fused_LambUpdateWithLR_15600956116817642484.info
File diff suppressed because it is too large
View File


+ 1
- 0
tests/st/composite/need_adapt/Fused_LambUpdateWithLR_15689878575778426853.info View File

@@ -0,0 +1 @@
{"composite":true,"composite_graph":"1957_sp_1897_1837_1732_1708_1684_1660_1593_sp_587_construct","input_desc":[[{"data_type":"float32","shape":[],"tensor_name":"input_2"}],[{"data_type":"float32","shape":[],"tensor_name":"input_0"}],[{"data_type":"float32","shape":[],"tensor_name":"input_1"}],[{"data_type":"float32","shape":[1],"tensor_name":"input_14"}],[{"data_type":"float32","shape":[1024],"tensor_name":"input_16"}],[{"data_type":"float32","shape":[1024],"tensor_name":"input_17"}]],"op":"Fused_LambUpdateWithLR_15689878575778426853","op_desc":[{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_1"}]],"name":"RealDiv","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_0"}],"process":"aicore"},{"attr":null,"fusion":"SelectGT_0013","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_1"}],"process":"aicore"},{"attr":null,"fusion":"SelectGT_0013_end","impl_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_1"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_6","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_2"}],"process":"aicore"},{"attr":null,"fusion":"SelectGT_0013","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_
3"}],"process":"aicore"},{"attr":null,"fusion":"SelectGT_0003_end","impl_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_3"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_9","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_4"}],"process":"aicore"},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_4"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_11","value":10.0}]],"name":"Minimum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_5"}],"process":"aicore"},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_5"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Maximum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_6"}],"process":"aicore"},{"attr":[{"name":"x_shape","value":[]},{"name":"y_shape","value":[1]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_6"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_14"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[1],"tensor_name":"output_0_7"}],"process":"aicore"},{"attr":[{"name":"x_shape","value":[1]},{"name":"y_shape","value":[1024]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_7"}],[{"data_type":"float32","name":"y","shape":[1024],"tensor_name":"input_16"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[1024],"tensor_na
me":"output_0_8"}],"process":"aicore"},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1024],"tensor_name":"input_17"}],[{"data_type":"float32","name":"y","shape":[1024],"tensor_name":"output_0_8"}]],"name":"Sub","output_desc":[{"data_type":"float32","name":"output","shape":[1024],"tensor_name":"output_0_9"}],"process":"aicore"},{"attr":[{"name":"fake_output","value":true}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1024],"tensor_name":"input_17"}],[{"data_type":"float32","name":"y","shape":[1024],"tensor_name":"output_0_9"}],[{"data_type":"float32","name":"z","shape":[1024],"tensor_name":"output_0_9"}]],"name":"InplaceAssign","output_desc":[{"data_type":"float32","name":"output","shape":[1024],"tensor_name":"output_0_10"}],"process":"aicore"}],"output_desc":[{"data_type":"float32","shape":[1024],"tensor_name":"output_0_10"}],"platform":"AKG","process":"aicore"}

+ 1
- 0
tests/st/composite/need_adapt/Fused_LambUpdateWithLR_16040335705910473299.info View File

@@ -0,0 +1 @@
{"composite":true,"composite_graph":"9298_sp_8841_8384_7927_7902_7879_7441_sp_2650_construct","input_desc":[[{"data_type":"float32","shape":[],"tensor_name":"input_2"}],[{"data_type":"float32","shape":[],"tensor_name":"input_0"}],[{"data_type":"float32","shape":[],"tensor_name":"input_1"}],[{"data_type":"float32","shape":[1],"tensor_name":"input_14"}],[{"data_type":"float32","shape":[21128],"tensor_name":"input_16"}],[{"data_type":"float32","shape":[21128],"tensor_name":"input_17"}]],"op":"Fused_LambUpdateWithLR_16040335705910473299","op_desc":[{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_1"}]],"name":"RealDiv","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_0"}]},{"attr":null,"fusion":"SelectGT_0020","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_1"}]},{"attr":null,"fusion":"SelectGT_0020_end","impl_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_1"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_6","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_2"}]},{"attr":null,"fusion":"SelectGT_0020","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_3"}]},{"attr":null,"fusion":"SelectGT_0009_end","impl_path"
:"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_3"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_9","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_4"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_4"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_11","value":10.0}]],"name":"Minimum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_5"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_5"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Maximum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_6"}]},{"attr":[{"name":"x_shape","value":[]},{"name":"y_shape","value":[1]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_6"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_14"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[1],"tensor_name":"output_0_7"}]},{"attr":[{"name":"x_shape","value":[1]},{"name":"y_shape","value":[21128]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_7"}],[{"data_type":"float32","name":"y","shape":[21128],"tensor_name":"input_16"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[21128],"tensor_name":"output_0_8"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[21128],"tensor_name":"input_17"}],[{"data_typ
e":"float32","name":"y","shape":[21128],"tensor_name":"output_0_8"}]],"name":"Sub","output_desc":[{"data_type":"float32","name":"output","shape":[21128],"tensor_name":"output_0_9"}]},{"attr":[{"name":"fake_output","value":true}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[21128],"tensor_name":"input_17"}],[{"data_type":"float32","name":"y","shape":[21128],"tensor_name":"output_0_9"}],[{"data_type":"float32","name":"z","shape":[21128],"tensor_name":"output_0_9"}]],"name":"InplaceAssign","output_desc":[{"data_type":"float32","name":"output","shape":[21128],"tensor_name":"output_0_10"}]}],"output_desc":[{"data_type":"float32","shape":[21128],"tensor_name":"output_0_10"}],"platform":"AKG","process":"aicore"}

+ 1
- 0
tests/st/composite/need_adapt/Fused_LambUpdateWithLR_16084070961688803476.info
File diff suppressed because it is too large
View File


+ 1
- 0
tests/st/composite/need_adapt/Fused_LambUpdateWithLR_3830386909471115343.info View File

@@ -0,0 +1 @@
{"composite":true,"composite_graph":"1917_sp_1857_1797_1738_1712_1688_1664_1601_sp_621_construct","input_desc":[[{"data_type":"float32","shape":[],"tensor_name":"input_2"}],[{"data_type":"float32","shape":[],"tensor_name":"input_0"}],[{"data_type":"float32","shape":[],"tensor_name":"input_1"}],[{"data_type":"float32","shape":[1],"tensor_name":"input_14"}],[{"data_type":"float32","shape":[30522],"tensor_name":"input_16"}],[{"data_type":"float32","shape":[30522],"tensor_name":"input_17"}]],"op":"Fused_LambUpdateWithLR_3830386909471115343","op_desc":[{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_1"}]],"name":"RealDiv","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_0"}],"process":"aicore"},{"attr":null,"fusion":"SelectGT_0018","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_1"}],"process":"aicore"},{"attr":null,"fusion":"SelectGT_0018_end","impl_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_1"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_6","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_2"}],"process":"aicore"},{"attr":null,"fusion":"SelectGT_0018","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0
_3"}],"process":"aicore"},{"attr":null,"fusion":"SelectGT_0008_end","impl_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_3"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_9","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_4"}],"process":"aicore"},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_4"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_11","value":10.0}]],"name":"Minimum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_5"}],"process":"aicore"},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_5"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Maximum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_6"}],"process":"aicore"},{"attr":[{"name":"x_shape","value":[]},{"name":"y_shape","value":[1]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_6"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_14"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[1],"tensor_name":"output_0_7"}],"process":"aicore"},{"attr":[{"name":"x_shape","value":[1]},{"name":"y_shape","value":[30522]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_7"}],[{"data_type":"float32","name":"y","shape":[30522],"tensor_name":"input_16"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[30522],"tenso
r_name":"output_0_8"}],"process":"aicore"},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[30522],"tensor_name":"input_17"}],[{"data_type":"float32","name":"y","shape":[30522],"tensor_name":"output_0_8"}]],"name":"Sub","output_desc":[{"data_type":"float32","name":"output","shape":[30522],"tensor_name":"output_0_9"}],"process":"aicore"},{"attr":[{"name":"fake_output","value":true}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[30522],"tensor_name":"input_17"}],[{"data_type":"float32","name":"y","shape":[30522],"tensor_name":"output_0_9"}],[{"data_type":"float32","name":"z","shape":[30522],"tensor_name":"output_0_9"}]],"name":"InplaceAssign","output_desc":[{"data_type":"float32","name":"output","shape":[30522],"tensor_name":"output_0_10"}],"process":"aicore"}],"output_desc":[{"data_type":"float32","shape":[30522],"tensor_name":"output_0_10"}],"platform":"AKG","process":"aicore"}

+ 1
- 0
tests/st/composite/need_adapt/Fused_LambUpdateWithLR_4148121723898026533.info View File

@@ -0,0 +1 @@
{"composite":true,"composite_graph":"9673_sp_9262_8851_8049_8024_8001_7586_sp_2782_construct","input_desc":[[{"data_type":"float32","shape":[],"tensor_name":"input_2"}],[{"data_type":"float32","shape":[],"tensor_name":"input_0"}],[{"data_type":"float32","shape":[],"tensor_name":"input_1"}],[{"data_type":"float32","shape":[1],"tensor_name":"input_14"}],[{"data_type":"float32","shape":[1024,1024],"tensor_name":"input_16"}],[{"data_type":"float32","shape":[1024,1024],"tensor_name":"input_17"}]],"op":"Fused_LambUpdateWithLR_4148121723898026533","op_desc":[{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_1"}]],"name":"RealDiv","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_0"}]},{"attr":null,"fusion":"SelectGT_0014","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_1"}]},{"attr":null,"fusion":"SelectGT_0014_end","impl_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_1"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_6","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_2"}]},{"attr":null,"fusion":"SelectGT_0014","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_3"}]},{"attr":null,"fusion":"SelectGT_0004_end","imp
l_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_3"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_9","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_4"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_4"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_11","value":10.0}]],"name":"Minimum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_5"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_5"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Maximum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_6"}]},{"attr":[{"name":"x_shape","value":[]},{"name":"y_shape","value":[1]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_6"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_14"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[1],"tensor_name":"output_0_7"}]},{"attr":[{"name":"x_shape","value":[1]},{"name":"y_shape","value":[1024,1024]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_7"}],[{"data_type":"float32","name":"y","shape":[1024,1024],"tensor_name":"input_16"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[1024,1024],"tensor_name":"output_0_8"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1024,1024],"tensor_name":"
input_17"}],[{"data_type":"float32","name":"y","shape":[1024,1024],"tensor_name":"output_0_8"}]],"name":"Sub","output_desc":[{"data_type":"float32","name":"output","shape":[1024,1024],"tensor_name":"output_0_9"}]},{"attr":[{"name":"fake_output","value":true}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1024,1024],"tensor_name":"input_17"}],[{"data_type":"float32","name":"y","shape":[1024,1024],"tensor_name":"output_0_9"}],[{"data_type":"float32","name":"z","shape":[1024,1024],"tensor_name":"output_0_9"}]],"name":"InplaceAssign","output_desc":[{"data_type":"float32","name":"output","shape":[1024,1024],"tensor_name":"output_0_10"}]}],"output_desc":[{"data_type":"float32","shape":[1024,1024],"tensor_name":"output_0_10"}],"platform":"AKG","process":"aicore"}

+ 1
- 0
tests/st/composite/need_adapt/Fused_LambUpdateWithLR_5080003035626701281.info View File

@@ -0,0 +1 @@
{"composite":true,"composite_graph":"9290_sp_8833_8376_7919_7896_7873_7430_sp_2599_construct","input_desc":[[{"data_type":"float32","shape":[],"tensor_name":"input_2"}],[{"data_type":"float32","shape":[],"tensor_name":"input_0"}],[{"data_type":"float32","shape":[],"tensor_name":"input_1"}],[{"data_type":"float32","shape":[1],"tensor_name":"input_14"}],[{"data_type":"float32","shape":[2,1024],"tensor_name":"input_16"}],[{"data_type":"float32","shape":[2,1024],"tensor_name":"input_17"}]],"op":"Fused_LambUpdateWithLR_5080003035626701281","op_desc":[{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_1"}]],"name":"RealDiv","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_0"}]},{"attr":null,"fusion":"SelectGT_0012","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_1"}]},{"attr":null,"fusion":"SelectGT_0012_end","impl_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_1"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_6","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_2"}]},{"attr":null,"fusion":"SelectGT_0012","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_3"}]},{"attr":null,"fusion":"SelectGT_0001_end","impl_path
":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_3"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_9","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_4"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_4"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_11","value":10.0}]],"name":"Minimum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_5"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_5"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Maximum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_6"}]},{"attr":[{"name":"x_shape","value":[]},{"name":"y_shape","value":[1]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_6"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_14"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[1],"tensor_name":"output_0_7"}]},{"attr":[{"name":"x_shape","value":[1]},{"name":"y_shape","value":[2,1024]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_7"}],[{"data_type":"float32","name":"y","shape":[2,1024],"tensor_name":"input_16"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[2,1024],"tensor_name":"output_0_8"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[2,1024],"tensor_name":"input_17"}],[{"dat
a_type":"float32","name":"y","shape":[2,1024],"tensor_name":"output_0_8"}]],"name":"Sub","output_desc":[{"data_type":"float32","name":"output","shape":[2,1024],"tensor_name":"output_0_9"}]},{"attr":[{"name":"fake_output","value":true}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[2,1024],"tensor_name":"input_17"}],[{"data_type":"float32","name":"y","shape":[2,1024],"tensor_name":"output_0_9"}],[{"data_type":"float32","name":"z","shape":[2,1024],"tensor_name":"output_0_9"}]],"name":"InplaceAssign","output_desc":[{"data_type":"float32","name":"output","shape":[2,1024],"tensor_name":"output_0_10"}]}],"output_desc":[{"data_type":"float32","shape":[2,1024],"tensor_name":"output_0_10"}],"platform":"AKG","process":"aicore"}

+ 1
- 0
tests/st/composite/need_adapt/Fused_LambUpdateWithLR_8456945009561581117.info View File

@@ -0,0 +1 @@
{"composite":true,"composite_graph":"9742_sp_9285_8828_8371_7914_7891_7868_sp_2752_construct","input_desc":[[{"data_type":"float32","shape":[],"tensor_name":"input_2"}],[{"data_type":"float32","shape":[],"tensor_name":"input_0"}],[{"data_type":"float32","shape":[],"tensor_name":"input_1"}],[{"data_type":"float32","shape":[1],"tensor_name":"input_14"}],[{"data_type":"float32","shape":[21128,1024],"tensor_name":"input_16"}],[{"data_type":"float32","shape":[21128,1024],"tensor_name":"input_17"}]],"op":"Fused_LambUpdateWithLR_8456945009561581117","op_desc":[{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_1"}]],"name":"RealDiv","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_0"}]},{"attr":null,"fusion":"SelectGT_0011","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_1"}]},{"attr":null,"fusion":"SelectGT_0011_end","impl_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_1"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_6","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_2"}]},{"attr":null,"fusion":"SelectGT_0011","impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Greater","output_desc":[{"data_type":"bool","name":"output","shape":[],"tensor_name":"output_0_3"}]},{"attr":null,"fusion":"SelectGT_0000_end","i
mpl_path":"","input_desc":[[{"data_type":"bool","name":"condition","shape":[1],"tensor_name":"output_0_3"}],[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_2"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_9","value":1.0}]],"name":"Select","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_4"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_4"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_11","value":10.0}]],"name":"Minimum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_5"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_5"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_3","value":0.0}]],"name":"Maximum","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_6"}]},{"attr":[{"name":"x_shape","value":[]},{"name":"y_shape","value":[1]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_6"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"input_14"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[1],"tensor_name":"output_0_7"}]},{"attr":[{"name":"x_shape","value":[1]},{"name":"y_shape","value":[21128,1024]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"output_0_7"}],[{"data_type":"float32","name":"y","shape":[21128,1024],"tensor_name":"input_16"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[21128,1024],"tensor_name":"output_0_8"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[21128,1024],"tensor_n
ame":"input_17"}],[{"data_type":"float32","name":"y","shape":[21128,1024],"tensor_name":"output_0_8"}]],"name":"Sub","output_desc":[{"data_type":"float32","name":"output","shape":[21128,1024],"tensor_name":"output_0_9"}]},{"attr":[{"name":"fake_output","value":true}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[21128,1024],"tensor_name":"input_17"}],[{"data_type":"float32","name":"y","shape":[21128,1024],"tensor_name":"output_0_9"}],[{"data_type":"float32","name":"z","shape":[21128,1024],"tensor_name":"output_0_9"}]],"name":"InplaceAssign","output_desc":[{"data_type":"float32","name":"output","shape":[21128,1024],"tensor_name":"output_0_10"}]}],"output_desc":[{"data_type":"float32","shape":[21128,1024],"tensor_name":"output_0_10"}],"platform":"AKG","process":"aicore"}

+ 1
- 0
tests/st/composite/need_adapt/Fused_LambUpdateWithLR_8598988701258930330.info
File diff suppressed because it is too large
View File


+ 1
- 0
tests/st/composite/need_adapt/Fused_Mul_Mul_TensorAdd__4400644352246048056.info View File

@@ -0,0 +1 @@
{"composite":true,"composite_graph":"10313","input_desc":[[{"data_type":"float16","shape":[16,1,1,512],"tensor_name":"input_2"}],[{"data_type":"float16","shape":[16,16,512,512],"tensor_name":"input_1"}]],"op":"Fused_Mul_Mul_TensorAdd__4400644352246048056","op_desc":[{"attr":[{"name":"x_shape","value":[1]},{"name":"y_shape","value":[16,16,512,512]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float16","name":"x","shape":[1],"tensor_name":"input_0","value":0.125}],[{"data_type":"float16","name":"y","shape":[16,16,512,512],"tensor_name":"input_1"}]],"name":"Mul","output_desc":[{"data_type":"float16","name":"output","shape":[16,16,512,512],"tensor_name":"output_0_0"}]},{"attr":[{"name":"x_shape","value":[16,1,1,512]},{"name":"y_shape","value":[1]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float16","name":"x","shape":[16,1,1,512],"tensor_name":"input_2"}],[{"data_type":"float16","name":"y","shape":[1],"tensor_name":"input_3","value":-10000.0}]],"name":"Mul","output_desc":[{"data_type":"float16","name":"output","shape":[16,1,1,512],"tensor_name":"output_0_1"}]},{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float16","name":"x","shape":[16,1,1,512],"tensor_name":"output_0_1"}],[{"data_type":"float16","name":"y","shape":[16,16,512,512],"tensor_name":"output_0_0"}]],"name":"TensorAdd","output_desc":[{"data_type":"float16","name":"output","shape":[16,16,512,512],"tensor_name":"output_0_2"}]}],"output_desc":[{"data_type":"float16","shape":[16,1,1,512],"tensor_name":"output_0_1"},{"data_type":"float16","shape":[16,16,512,512],"tensor_name":"output_0_2"}],"platform":"AKG","process":"aicore"}

+ 707
- 0
tests/st/composite/need_adapt/Fused_Poly_Schedule_Opt_001.info View File

@@ -0,0 +1,707 @@
{
"composite": true,
"input_desc": [
[
{
"data_type": "float32",
"shape": [
32,
16,
56,
56,
16
],
"tensor_name": "input_14"
}
],
[
{
"data_type": "float32",
"shape": [
1,
16,
1,
1,
16
],
"tensor_name": "input_0"
}
],
[
{
"data_type": "float32",
"shape": [
1,
16,
1,
1,
16
],
"tensor_name": "input_3"
}
],
[
{
"data_type": "float32",
"shape": [
1,
16,
1,
1,
16
],
"tensor_name": "input_9"
}
],
[
{
"data_type": "float32",
"shape": [
1,
16,
1,
1,
16
],
"tensor_name": "input_12"
}
]
],
"op": "Fused_Poly_Schedule_Opt_001",
"op_desc": [
{
"attr": [
{
"name": "x_shape",
"value": [
256
]
},
{
"name": "y_shape",
"value": [
1
]
},
{
"name": "data_format",
"value": [
"NC1HWC0",
"NC1HWC0"
]
}
],
"impl_path": "",
"input_desc": [
[
{
"data_type": "float32",
"name": "x",
"shape": [
1,
16,
1,
1,
16
],
"tensor_name": "input_0"
}
],
[
{
"data_type": "float32",
"name": "y",
"shape": [
1
],
"tensor_name": "input_1",
"value": 9.964923265215475e-06
}
]
],
"name": "Mul",
"output_desc": [
{
"data_type": "float32",
"name": "output",
"shape": [
1,
16,
1,
1,
16
],
"tensor_name": "output_0_0"
}
]
},
{
"attr": [
{
"name": "x_shape",
"value": [
256
]
},
{
"name": "y_shape",
"value": [
256
]
},
{
"name": "data_format",
"value": [
"NC1HWC0",
"NC1HWC0"
]
}
],
"impl_path": "",
"input_desc": [
[
{
"data_type": "float32",
"name": "x",
"shape": [
1,
16,
1,
1,
16
],
"tensor_name": "output_0_0"
}
],
[
{
"data_type": "float32",
"name": "y",
"shape": [
1,
16,
1,
1,
16
],
"tensor_name": "output_0_0"
}
]
],
"name": "Mul",
"output_desc": [
{
"data_type": "float32",
"name": "output",
"shape": [
1,
16,
1,
1,
16
],
"tensor_name": "output_0_1"
}
]
},
{
"attr": [
{
"name": "x_shape",
"value": [
256
]
},
{
"name": "y_shape",
"value": [
1
]
},
{
"name": "data_format",
"value": [
"NC1HWC0",
"NC1HWC0"
]
}
],
"impl_path": "",
"input_desc": [
[
{
"data_type": "float32",
"name": "x",
"shape": [
1,
16,
1,
1,
16
],
"tensor_name": "input_3"
}
],
[
{
"data_type": "float32",
"name": "y",
"shape": [
1
],
"tensor_name": "input_1",
"value": 9.964923265215475e-06
}
]
],
"name": "Mul",
"output_desc": [
{
"data_type": "float32",
"name": "output",
"shape": [
1,
16,
1,
1,
16
],
"tensor_name": "output_0_2"
}
]
},
{
"attr": null,
"impl_path": "",
"input_desc": [
[
{
"data_type": "float32",
"name": "x",
"shape": [
1,
16,
1,
1,
16
],
"tensor_name": "output_0_2"
}
],
[
{
"data_type": "float32",
"name": "y",
"shape": [
1,
16,
1,
1,
16
],
"tensor_name": "output_0_1"
}
]
],
"name": "Sub",
"output_desc": [
{
"data_type": "float32",
"name": "output",
"shape": [
1,
16,
1,
1,
16
],
"tensor_name": "output_0_3"
}
]
},
{
"attr": null,
"impl_path": "",
"input_desc": [
[
{
"data_type": "float32",
"name": "x",
"shape": [
1,
16,
1,
1,
16
],
"tensor_name": "output_0_3"
}
],
[
{
"data_type": "float32",
"name": "y",
"shape": [
1
],
"tensor_name": "input_7",
"value": 9.999999747378752e-06
}
]
],
"name": "TensorAdd",
"output_desc": [
{
"data_type": "float32",
"name": "output",
"shape": [
1,
16,
1,
1,
16
],
"tensor_name": "output_0_4"
}
]
},
{
"attr": null,
"impl_path": "",
"input_desc": [
[
{
"data_type": "float32",
"name": "x",
"shape": [
1,
16,
1,
1,
16
],
"tensor_name": "output_0_4"
}
]
],
"name": "Sqrt",
"output_desc": [
{
"data_type": "float32",
"name": "output",
"shape": [
1,
16,
1,
1,
16
],
"tensor_name": "output_0_5"
}
]
},
{
"attr": null,
"impl_path": "",
"input_desc": [
[
{
"data_type": "float32",
"name": "x",
"shape": [
1,
16,
1,
1,
16
],
"tensor_name": "input_9"
}
],
[
{
"data_type": "float32",
"name": "y",
"shape": [
1,
16,
1,
1,
16
],
"tensor_name": "output_0_5"
}
]
],
"name": "RealDiv",
"output_desc": [
{
"data_type": "float32",
"name": "output",
"shape": [
1,
16,
1,
1,
16
],
"tensor_name": "output_0_6"
}
]
},
{
"attr": [
{
"name": "x_shape",
"value": [
256
]
},
{
"name": "y_shape",
"value": [
256
]
},
{
"name": "data_format",
"value": [
"NC1HWC0",
"NC1HWC0"
]
}
],
"impl_path": "",
"input_desc": [
[
{
"data_type": "float32",
"name": "x",
"shape": [
1,
16,
1,
1,
16
],
"tensor_name": "output_0_6"
}
],
[
{
"data_type": "float32",
"name": "y",
"shape": [
1,
16,
1,
1,
16
],
"tensor_name": "output_0_0"
}
]
],
"name": "Mul",
"output_desc": [
{
"data_type": "float32",
"name": "output",
"shape": [
1,
16,
1,
1,
16
],
"tensor_name": "output_0_7"
}
]
},
{
"attr": null,
"impl_path": "",
"input_desc": [
[
{
"data_type": "float32",
"name": "x",
"shape": [
1,
16,
1,
1,
16
],
"tensor_name": "input_12"
}
],
[
{
"data_type": "float32",
"name": "y",
"shape": [
1,
16,
1,
1,
16
],
"tensor_name": "output_0_7"
}
]
],
"name": "Sub",
"output_desc": [
{
"data_type": "float32",
"name": "output",
"shape": [
1,
16,
1,
1,
16
],
"tensor_name": "output_0_8"
}
]
},
{
"attr": [
{
"name": "x_shape",
"value": [
256
]
},
{
"name": "y_shape",
"value": [
32,
256,
56,
56
]
},
{
"name": "data_format",
"value": [
"NC1HWC0",
"NC1HWC0"
]
}
],
"impl_path": "",
"input_desc": [
[
{
"data_type": "float32",
"name": "x",
"shape": [
1,
16,
1,
1,
16
],
"tensor_name": "output_0_6"
}
],
[
{
"data_type": "float32",
"name": "y",
"shape": [
32,
16,
56,
56,
16
],
"tensor_name": "input_14"
}
]
],
"name": "Mul",
"output_desc": [
{
"data_type": "float32",
"name": "output",
"shape": [
32,
16,
56,
56,
16
],
"tensor_name": "output_0_9"
}
]
},
{
"attr": null,
"impl_path": "",
"input_desc": [
[
{
"data_type": "float32",
"name": "x",
"shape": [
32,
16,
56,
56,
16
],
"tensor_name": "output_0_9"
}
],
[
{
"data_type": "float32",
"name": "y",
"shape": [
1,
16,
1,
1,
16
],
"tensor_name": "output_0_8"
}
]
],
"name": "TensorAdd",
"output_desc": [
{
"data_type": "float32",
"name": "output",
"shape": [
32,
16,
56,
56,
16
],
"tensor_name": "output_0_10"
}
]
}
],
"output_desc": [
{
"data_type": "float32",
"shape": [
32,
16,
56,
56,
16
],
"tensor_name": "output_0_10"
}
],
"platform": "AKG",
"process": "aicore"
}

+ 320
- 0
tests/st/composite/need_adapt/Fused_Poly_Schedule_Opt_002.info View File

@@ -0,0 +1,320 @@
{
"composite": true,
"input_desc": [
[{
"data_type": "float16",
"shape": [32, 16, 56, 56, 16],
"tensor_name": "input_0"
}],
[{
"data_type": "float16",
"shape": [32, 16, 56, 56, 16],
"tensor_name": "input_9"
}],
[{
"data_type": "float32",
"shape": [1, 16, 1, 1, 16],
"tensor_name": "input_7"
}],
[{
"data_type": "float32",
"shape": [1, 16, 1, 1, 16],
"tensor_name": "input_2"
}]
],
"op": "Fused_Poly_Schedule_Opt_002",
"op_desc": [{
"attr": [{
"name": "dst_type",
"value": "float32"
}],
"impl_path": "",
"input_desc": [
[{
"data_type": "float16",
"name": "x",
"shape": [32, 16, 56, 56, 16],
"tensor_name": "input_0"
}]
],
"name": "Cast",
"output_desc": [{
"data_type": "float32",
"name": "output",
"shape": [32, 16, 56, 56, 16],
"tensor_name": "output_0_0"
}]
}, {
"attr": [{
"name": "axis",
"value": [0, 2, 3]
}, {
"name": "keep_dims",
"value": true
}],
"impl_path": "",
"input_desc": [
[{
"data_type": "float32",
"name": "x",
"shape": [32, 16, 56, 56, 16],
"tensor_name": "output_0_0"
}]
],
"name": "ReduceSum",
"output_desc": [{
"data_type": "float32",
"name": "output",
"shape": [1, 16, 1, 1, 16],
"tensor_name": "output_0_1"
}]
}, {
"attr": null,
"impl_path": "",
"input_desc": [
[{
"data_type": "float32",
"name": "x",
"shape": [1, 16, 1, 1, 16],
"tensor_name": "input_2"
}],
[{
"data_type": "float32",
"name": "y",
"shape": [1],
"tensor_name": "input_3",
"value": 9.999999747378752e-05
}]
],
"name": "TensorAdd",
"output_desc": [{
"data_type": "float32",
"name": "output",
"shape": [1, 16, 1, 1, 16],
"tensor_name": "output_0_2"
}]
}, {
"attr": null,
"impl_path": "",
"input_desc": [
[{
"data_type": "float32",
"name": "x",
"shape": [1, 16, 1, 1, 16],
"tensor_name": "output_0_2"
}]
],
"name": "Sqrt",
"output_desc": [{
"data_type": "float32",
"name": "output",
"shape": [1, 16, 1, 1, 16],
"tensor_name": "output_0_3"
}]
}, {
"attr": null,
"impl_path": "",
"input_desc": [
[{
"data_type": "float32",
"name": "x",
"shape": [1],
"tensor_name": "input_5",
"value": 1.0
}],
[{
"data_type": "float32",
"name": "y",
"shape": [1, 16, 1, 1, 16],
"tensor_name": "output_0_3"
}]
],
"name": "RealDiv",
"output_desc": [{
"data_type": "float32",
"name": "output",
"shape": [1, 16, 1, 1, 16],
"tensor_name": "output_0_4"
}]
}, {
"attr": [{
"name": "x_shape",
"value": [256]
}, {
"name": "y_shape",
"value": []
}, {
"name": "data_format",
"value": ["NC1HWC0", "NC1HWC0"]
}],
"impl_path": "",
"input_desc": [
[{
"data_type": "float32",
"name": "x",
"shape": [1, 16, 1, 1, 16],
"tensor_name": "input_7"
}],
[{
"data_type": "float32",
"name": "y",
"shape": [1],
"tensor_name": "input_8",
"value": -1.0
}]
],
"name": "Mul",
"output_desc": [{
"data_type": "float32",
"name": "output",
"shape": [1, 16, 1, 1, 16],
"tensor_name": "output_0_5"
}]
}, {
"attr": [{
"name": "dst_type",
"value": "float32"
}],
"impl_path": "",
"input_desc": [
[{
"data_type": "float16",
"name": "x",
"shape": [32, 16, 56, 56, 16],
"tensor_name": "input_9"
}]
],
"name": "Cast",
"output_desc": [{
"data_type": "float32",
"name": "output",
"shape": [32, 16, 56, 56, 16],
"tensor_name": "output_0_6"
}]
}, {
"attr": null,
"impl_path": "",
"input_desc": [
[{
"data_type": "float32",
"name": "x",
"shape": [32, 16, 56, 56, 16],
"tensor_name": "output_0_6"
}],
[{
"data_type": "float32",
"name": "y",
"shape": [1, 16, 1, 1, 16],
"tensor_name": "output_0_5"
}]
],
"name": "TensorAdd",
"output_desc": [{
"data_type": "float32",
"name": "output",
"shape": [32, 16, 56, 56, 16],
"tensor_name": "output_0_7"
}]
}, {
"attr": [{
"name": "x_shape",
"value": [32, 256, 56, 56]
}, {
"name": "y_shape",
"value": [256]
}, {
"name": "data_format",
"value": ["NC1HWC0", "NC1HWC0"]
}],
"impl_path": "",
"input_desc": [
[{
"data_type": "float32",
"name": "x",
"shape": [32, 16, 56, 56, 16],
"tensor_name": "output_0_7"
}],
[{
"data_type": "float32",
"name": "y",
"shape": [1, 16, 1, 1, 16],
"tensor_name": "output_0_4"
}]
],
"name": "Mul",
"output_desc": [{
"data_type": "float32",
"name": "output",
"shape": [32, 16, 56, 56, 16],
"tensor_name": "output_0_8"
}]
}, {
"attr": [{
"name": "x_shape",
"value": [32, 256, 56, 56]
}, {
"name": "y_shape",
"value": [32, 256, 56, 56]
}, {
"name": "data_format",
"value": ["NC1HWC0", "NC1HWC0"]
}],
"impl_path": "",
"input_desc": [
[{
"data_type": "float32",
"name": "x",
"shape": [32, 16, 56, 56, 16],
"tensor_name": "output_0_0"
}],
[{
"data_type": "float32",
"name": "y",
"shape": [32, 16, 56, 56, 16],
"tensor_name": "output_0_8"
}]
],
"name": "Mul",
"output_desc": [{
"data_type": "float32",
"name": "output",
"shape": [32, 16, 56, 56, 16],
"tensor_name": "output_0_9"
}]
}, {
"attr": [{
"name": "axis",
"value": [0, 2, 3]
}, {
"name": "keep_dims",
"value": true
}],
"impl_path": "",
"input_desc": [
[{
"data_type": "float32",
"name": "x",
"shape": [32, 16, 56, 56, 16],
"tensor_name": "output_0_9"
}]
],
"name": "ReduceSum",
"output_desc": [{
"data_type": "float32",
"name": "output",
"shape": [1, 16, 1, 1, 16],
"tensor_name": "output_0_10"
}]
}],
"output_desc": [{
"data_type": "float32",
"shape": [1, 16, 1, 1, 16],
"tensor_name": "output_0_10"
}, {
"data_type": "float32",
"shape": [1, 16, 1, 1, 16],
"tensor_name": "output_0_1"
}],
"platform": "AKG",
"process": "aicore"
}

+ 500
- 0
tests/st/composite/need_adapt/Fused_Poly_Schedule_Opt_003.info View File

@@ -0,0 +1,500 @@
{
"composite": true,
"input_desc": [
[{
"data_type": "float16",
"shape": [32, 4, 56, 56, 16],
"tensor_name": "input_19"
}],
[{
"data_type": "float16",
"shape": [32, 4, 56, 56, 16],
"tensor_name": "input_14"
}],
[{
"data_type": "float32",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "input_7"
}],
[{
"data_type": "float32",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "input_5"
}],
[{
"data_type": "float32",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "input_3"
}],
[{
"data_type": "float32",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "input_9"
}],
[{
"data_type": "float32",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "input_0"
}]
],
"op": "Fused_Poly_Schedule_Opt_003",
"op_desc": [{
"attr": null,
"impl_path": "",
"input_desc": [
[{
"data_type": "float32",
"name": "x",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "input_0"
}],
[{
"data_type": "float32",
"name": "y",
"shape": [1],
"tensor_name": "input_1",
"value": 9.999999747378752e-05
}]
],
"name": "TensorAdd",
"output_desc": [{
"data_type": "float32",
"name": "output",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "output_0_0"
}]
}, {
"attr": null,
"impl_path": "",
"input_desc": [
[{
"data_type": "float32",
"name": "x",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "output_0_0"
}]
],
"name": "Sqrt",
"output_desc": [{
"data_type": "float32",
"name": "output",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "output_0_1"
}]
}, {
"attr": null,
"impl_path": "",
"input_desc": [
[{
"data_type": "float32",
"name": "x",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "input_3"
}],
[{
"data_type": "float32",
"name": "y",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "output_0_1"
}]
],
"name": "RealDiv",
"output_desc": [{
"data_type": "float32",
"name": "output",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "output_0_2"
}]
}, {
"attr": [{
"name": "x_shape",
"value": [64]
}, {
"name": "y_shape",
"value": [1]
}, {
"name": "data_format",
"value": ["NC1HWC0", "NC1HWC0"]
}],
"impl_path": "",
"input_desc": [
[{
"data_type": "float32",
"name": "x",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "input_5"
}],
[{
"data_type": "float32",
"name": "y",
"shape": [1],
"tensor_name": "input_6",
"value": -9.964923265215475e-06
}]
],
"name": "Mul",
"output_desc": [{
"data_type": "float32",
"name": "output",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "output_0_3"
}]
}, {
"attr": [{
"name": "x_shape",
"value": [64]
}, {
"name": "y_shape",
"value": [1]
}, {
"name": "data_format",
"value": ["NC1HWC0", "NC1HWC0"]
}],
"impl_path": "",
"input_desc": [
[{
"data_type": "float32",
"name": "x",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "input_7"
}],
[{
"data_type": "float32",
"name": "y",
"shape": [1],
"tensor_name": "input_8",
"value": 9.964923265215475e-06
}]
],
"name": "Mul",
"output_desc": [{
"data_type": "float32",
"name": "output",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "output_0_4"
}]
}, {
"attr": null,
"impl_path": "",
"input_desc": [
[{
"data_type": "float32",
"name": "x",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "input_9"
}],
[{
"data_type": "float32",
"name": "y",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "output_0_1"
}]
],
"name": "RealDiv",
"output_desc": [{
"data_type": "float32",
"name": "output",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "output_0_5"
}]
}, {
"attr": [{
"name": "x_shape",
"value": [64]
}, {
"name": "y_shape",
"value": [64]
}, {
"name": "data_format",
"value": ["NC1HWC0", "NC1HWC0"]
}],
"impl_path": "",
"input_desc": [
[{
"data_type": "float32",
"name": "x",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "output_0_5"
}],
[{
"data_type": "float32",
"name": "y",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "output_0_4"
}]
],
"name": "Mul",
"output_desc": [{
"data_type": "float32",
"name": "output",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "output_0_6"
}]
}, {
"attr": null,
"impl_path": "",
"input_desc": [
[{
"data_type": "float32",
"name": "x",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "output_0_6"
}],
[{
"data_type": "float32",
"name": "y",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "output_0_3"
}]
],
"name": "TensorAdd",
"output_desc": [{
"data_type": "float32",
"name": "output",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "output_0_7"
}]
}, {
"attr": [{
"name": "dst_type",
"value": "float32"
}],
"impl_path": "",
"input_desc": [
[{
"data_type": "float16",
"name": "x",
"shape": [32, 4, 56, 56, 16],
"tensor_name": "input_14"
}]
],
"name": "Cast",
"output_desc": [{
"data_type": "float32",
"name": "output",
"shape": [32, 4, 56, 56, 16],
"tensor_name": "output_0_8"
}]
}, {
"attr": [{
"name": "x_shape",
"value": [64]
}, {
"name": "y_shape",
"value": [1]
}, {
"name": "data_format",
"value": ["NC1HWC0", "NC1HWC0"]
}],
"impl_path": "",
"input_desc": [
[{
"data_type": "float32",
"name": "x",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "input_7"
}],
[{
"data_type": "float32",
"name": "y",
"shape": [1],
"tensor_name": "input_15",
"value": -9.964923265215475e-06
}]
],
"name": "Mul",
"output_desc": [{
"data_type": "float32",
"name": "output",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "output_0_9"
}]
}, {
"attr": null,
"impl_path": "",
"input_desc": [
[{
"data_type": "float32",
"name": "x",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "output_0_9"
}],
[{
"data_type": "float32",
"name": "y",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "output_0_1"
}]
],
"name": "RealDiv",
"output_desc": [{
"data_type": "float32",
"name": "output",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "output_0_10"
}]
}, {
"attr": [{
"name": "x_shape",
"value": [64]
}, {
"name": "y_shape",
"value": [32, 64, 56, 56]
}, {
"name": "data_format",
"value": ["NC1HWC0", "NC1HWC0"]
}],
"impl_path": "",
"input_desc": [
[{
"data_type": "float32",
"name": "x",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "output_0_10"
}],
[{
"data_type": "float32",
"name": "y",
"shape": [32, 4, 56, 56, 16],
"tensor_name": "output_0_8"
}]
],
"name": "Mul",
"output_desc": [{
"data_type": "float32",
"name": "output",
"shape": [32, 4, 56, 56, 16],
"tensor_name": "output_0_11"
}]
}, {
"attr": [{
"name": "dst_type",
"value": "float32"
}],
"impl_path": "",
"input_desc": [
[{
"data_type": "float16",
"name": "x",
"shape": [32, 4, 56, 56, 16],
"tensor_name": "input_19"
}]
],
"name": "Cast",
"output_desc": [{
"data_type": "float32",
"name": "output",
"shape": [32, 4, 56, 56, 16],
"tensor_name": "output_0_12"
}]
}, {
"attr": null,
"impl_path": "",
"input_desc": [
[{
"data_type": "float32",
"name": "x",
"shape": [32, 4, 56, 56, 16],
"tensor_name": "output_0_12"
}],
[{
"data_type": "float32",
"name": "y",
"shape": [32, 4, 56, 56, 16],
"tensor_name": "output_0_11"
}]
],
"name": "TensorAdd",
"output_desc": [{
"data_type": "float32",
"name": "output",
"shape": [32, 4, 56, 56, 16],
"tensor_name": "output_0_13"
}]
}, {
"attr": null,
"impl_path": "",
"input_desc": [
[{
"data_type": "float32",
"name": "x",
"shape": [32, 4, 56, 56, 16],
"tensor_name": "output_0_13"
}],
[{
"data_type": "float32",
"name": "y",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "output_0_7"
}]
],
"name": "TensorAdd",
"output_desc": [{
"data_type": "float32",
"name": "output",
"shape": [32, 4, 56, 56, 16],
"tensor_name": "output_0_14"
}]
}, {
"attr": [{
"name": "x_shape",
"value": [32, 64, 56, 56]
}, {
"name": "y_shape",
"value": [64]
}, {
"name": "data_format",
"value": ["NC1HWC0", "NC1HWC0"]
}],
"impl_path": "",
"input_desc": [
[{
"data_type": "float32",
"name": "x",
"shape": [32, 4, 56, 56, 16],
"tensor_name": "output_0_14"
}],
[{
"data_type": "float32",
"name": "y",
"shape": [1, 4, 1, 1, 16],
"tensor_name": "output_0_2"
}]
],
"name": "Mul",
"output_desc": [{
"data_type": "float32",
"name": "output",
"shape": [32, 4, 56, 56, 16],
"tensor_name": "output_0_15"
}]
}, {
"attr": [{
"name": "dst_type",
"value": "float16"
}],
"impl_path": "",
"input_desc": [
[{
"data_type": "float32",
"name": "x",
"shape": [32, 4, 56, 56, 16],
"tensor_name": "output_0_15"
}]
],
"name": "Cast",
"output_desc": [{
"data_type": "float16",
"name": "output",
"shape": [32, 4, 56, 56, 16],
"tensor_name": "output_0_16"
}]
}],
"output_desc": [{
"data_type": "float16",
"shape": [32, 4, 56, 56, 16],
"tensor_name": "output_0_16"
}],
"platform": "AKG",
"process": "aicore"
}

+ 1
- 0
tests/st/composite/need_adapt/Fused_Reciprocal_ReduceSum_Mul___1222261331617186059.info View File

@@ -0,0 +1 @@
{"composite":true,"composite_graph":"12299","input_desc":[[{"data_type":"float32","shape":[],"tensor_name":"input_0"}],[{"data_type":"float32","shape":[1216,30522],"tensor_name":"input_1"}]],"op":"Fused_Reciprocal_ReduceSum_Mul___1222261331617186059","op_desc":[{"attr":null,"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1],"tensor_name":"input_0"}]],"name":"Reciprocal","output_desc":[{"data_type":"float32","name":"output","shape":[],"tensor_name":"output_0_0"}]},{"attr":[{"name":"axis","value":[0]},{"name":"keep_dims","value":false}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[1216,30522],"tensor_name":"input_1"}]],"name":"ReduceSum","output_desc":[{"data_type":"float32","name":"output","shape":[30522],"tensor_name":"output_0_1"}]},{"attr":[{"name":"x_shape","value":[30522]},{"name":"y_shape","value":[]},{"name":"data_format","value":["DefaultFormat","DefaultFormat"]}],"impl_path":"","input_desc":[[{"data_type":"float32","name":"x","shape":[30522],"tensor_name":"output_0_1"}],[{"data_type":"float32","name":"y","shape":[1],"tensor_name":"output_0_0"}]],"name":"Mul","output_desc":[{"data_type":"float32","name":"output","shape":[30522],"tensor_name":"output_0_2"}]}],"output_desc":[{"data_type":"float32","shape":[1],"tensor_name":"output_0_0"},{"data_type":"float32","shape":[30522],"tensor_name":"output_0_2"}],"platform":"AKG","process":"aicore"}

+ 1
- 1
tests/test_env.sh View File

@@ -25,7 +25,7 @@ else
TVM_ROOT="${AKG_DIR}/third_party/incubator-tvm"

export LD_LIBRARY_PATH=${AKG_BUILD_DIR}:${LD_LIBRARY_PATH}
export PYTHONPATH=${TVM_ROOT}/python:${TVM_ROOT}/topi:${TVM_ROOT}/topi/python:${AKG_DIR}:${AKG_DIR}/python:${PYTHONPATH}
export PYTHONPATH=${TVM_ROOT}/python:${TVM_ROOT}/topi:${TVM_ROOT}/topi/python:${AKG_DIR}:${AKG_DIR}/tests/common:${AKG_DIR}/python:${AKG_DIR}/tests/operators/gpu:${AKG_DIR}/tests/fuzz/tune_for_gpu:${PYTHONPATH}
if [ $# -eq 1 ] && [ $1 = "gpu" ]; then
export LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:${LD_LIBRARY_PATH}
fi


+ 62
- 96
third_party/incubator-tvm/src/runtime/cuda/cuda_module.cc View File

@@ -20,23 +20,25 @@
/*!
* \file cuda_module.cc
* 2020.09.19 - Modify operator() for kc_air.
* 2020.09.22 - Separate the implementation of KC and GPU.
* 2020.09.22 - Separate the implementation of KC and GPU.
*/
#include "cuda_module.h"

#include <tvm/runtime/registry.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
#include <tvm/runtime/registry.h>

#include <array>
#include <string>
#include <mutex>
#include <string>
#include <unordered_map>
#include "cuda_common.h"
#include <vector>

#include "../file_util.h"
#include "../meta_data.h"
#include "../pack_args.h"
#include "../thread_storage_scope.h"
#include "../meta_data.h"
#include "../file_util.h"
#include "cuda_common.h"

namespace air {
namespace runtime {
@@ -47,8 +49,7 @@ namespace runtime {
// The modules will be lazily loaded
class CUDAModuleNode : public runtime::ModuleNode {
public:
explicit CUDAModuleNode(std::string data,
std::string fmt,
explicit CUDAModuleNode(std::string data, std::string fmt,
std::unordered_map<std::string, FunctionInfo> fmap,
std::string cuda_source)
: data_(data), fmt_(fmt), fmap_(fmap), cuda_source_(cuda_source) {
@@ -65,16 +66,11 @@ class CUDAModuleNode : public runtime::ModuleNode {
}
}

const char* type_key() const final {
return "cuda";
}
const char* type_key() const final { return "cuda"; }

PackedFunc GetFunction(
const std::string& name,
const ObjectPtr<Object>& sptr_to_self) final;
PackedFunc GetFunction(const std::string& name, const ObjectPtr<Object>& sptr_to_self) final;

void SaveToFile(const std::string& file_name,
const std::string& format) final {
void SaveToFile(const std::string& file_name, const std::string& format) final {
std::string fmt = GetFileFormat(file_name, format);
std::string meta_file = GetMetaFilePath(file_name);
if (fmt == "cu") {
@@ -82,8 +78,7 @@ class CUDAModuleNode : public runtime::ModuleNode {
SaveMetaDataToFile(meta_file, fmap_);
SaveBinaryToFile(file_name, cuda_source_);
} else {
CHECK_EQ(fmt, fmt_)
<< "Can only save to format=" << fmt_;
CHECK_EQ(fmt, fmt_) << "Can only save to format=" << fmt_;
SaveMetaDataToFile(meta_file, fmap_);
SaveBinaryToFile(file_name, data_);
}
@@ -106,11 +101,18 @@ class CUDAModuleNode : public runtime::ModuleNode {
}

// get a CUfunction from primary context in device_id
CUfunction GetFunc(int device_id, const std::string& func_name) {
CUfunction GetFunc(int device_id, const std::string& func_name, ThreadWorkLoad wl) {
std::lock_guard<std::mutex> lock(mutex_);
// must recheck under the lock scope
if (module_[device_id] == nullptr) {
CUDA_DRIVER_CALL(cuModuleLoadData(&(module_[device_id]), data_.c_str()));
CUjit_option options[1];
options[0] = CU_JIT_MAX_REGISTERS;
void* values[1];
long register_nums =
MAX_REGISTER_PER_THREAD_BLOCK / (wl.block_dim(0) * wl.block_dim(1) * wl.block_dim(2));
values[0] = (void*)register_nums;
CUDA_DRIVER_CALL(
cuModuleLoadDataEx(&(module_[device_id]), data_.c_str(), 1, options, values));
}
CUresult result = CUDA_SUCCESS;
CUfunction func = nullptr;
@@ -122,11 +124,9 @@ class CUDAModuleNode : public runtime::ModuleNode {
#endif
}
if (result != CUDA_SUCCESS) {
const char *msg;
const char* msg;
cuGetErrorName(result, &msg);
LOG(FATAL)
<< "CUDAError: cuModuleGetFunction " << func_name
<< " failed with error: " << msg;
LOG(FATAL) << "CUDAError: cuModuleGetFunction " << func_name << " failed with error: " << msg;
}
#ifdef USE_KC_AIR
return func_[device_id];
@@ -135,9 +135,7 @@ class CUDAModuleNode : public runtime::ModuleNode {
#endif
}
// get a global var from primary context in device_id
CUdeviceptr GetGlobal(int device_id,
const std::string& global_name,
size_t expect_nbytes) {
CUdeviceptr GetGlobal(int device_id, const std::string& global_name, size_t expect_nbytes) {
std::lock_guard<std::mutex> lock(mutex_);
// must recheck under the lock scope
if (module_[device_id] == nullptr) {
@@ -146,15 +144,12 @@ class CUDAModuleNode : public runtime::ModuleNode {
CUdeviceptr global;
size_t nbytes;

CUresult result = cuModuleGetGlobal(&global, &nbytes,
module_[device_id], global_name.c_str());
CUresult result = cuModuleGetGlobal(&global, &nbytes, module_[device_id], global_name.c_str());
CHECK_EQ(nbytes, expect_nbytes);
if (result != CUDA_SUCCESS) {
const char *msg;
const char* msg;
cuGetErrorName(result, &msg);
LOG(FATAL)
<< "CUDAError: cuModuleGetGlobal " << global_name
<< " failed with error: " << msg;
LOG(FATAL) << "CUDAError: cuModuleGetGlobal " << global_name << " failed with error: " << msg;
}
return global;
}
@@ -173,17 +168,15 @@ class CUDAModuleNode : public runtime::ModuleNode {
// internal mutex when updating the module
std::mutex mutex_;
std::array<CUfunction, kMaxNumGPUs> func_;
const int MAX_REGISTER_PER_THREAD_BLOCK = 65536;
};

// a wrapped function class to get packed func.
class CUDAWrappedFunc {
public:
// initialize the CUDA function.
void Init(CUDAModuleNode* m,
ObjectPtr<Object> sptr,
const std::string& func_name,
size_t num_void_args,
std::vector<size_t> arg_size,
void Init(CUDAModuleNode* m, ObjectPtr<Object> sptr, const std::string& func_name,
size_t num_void_args, std::vector<size_t> arg_size,
const std::vector<std::string>& thread_axis_tags) {
m_ = m;
sptr_ = sptr;
@@ -194,65 +187,49 @@ class CUDAWrappedFunc {
thread_axis_cfg_.Init(num_void_args, thread_axis_tags);
}
// invoke the function with void arguments
void operator()(TVMArgs args,
TVMRetValue* rv,
void** void_args) const {
void operator()(TVMArgs args, TVMRetValue* rv, void** void_args) const {
int device_id;
CUDA_CALL(cudaGetDevice(&device_id));
ThreadWorkLoad wl = thread_axis_cfg_.Extract(args);
if (fcache_[device_id] == nullptr) {
fcache_[device_id] = m_->GetFunc(device_id, func_name_);
fcache_[device_id] = m_->GetFunc(device_id, func_name_, wl);
}
ThreadWorkLoad wl = thread_axis_cfg_.Extract(args);
CUstream strm = static_cast<CUstream>(CUDAThreadEntry::ThreadLocal()->stream);
CUresult result;

#ifdef USE_KC_AIR
size_t raw_size = num_void_args_;
void** raw_args = new (std::nothrow) void*[raw_size];
void** raw_args = new (std::nothrow) void*[raw_size];
if (*raw_args == nullptr) {
LOG(FATAL) << "Memory alloc fail.";
LOG(FATAL) << "Memory alloc fail.";
}
size_t args_size = 0;
for (size_t i = 0; i < raw_size; ++i)
{
for (size_t i = 0; i < raw_size; ++i) {
args_size += arg_size_[i];
void** ptr = reinterpret_cast<void**>(void_args[i]);
raw_args[i] = *ptr;
}
result = cuLaunchKernel(
fcache_[device_id],
wl.grid_dim(0),
wl.grid_dim(1),
wl.grid_dim(2),
wl.block_dim(0),
wl.block_dim(1),
wl.block_dim(2),
(static_cast<uint32_t>(args_size)/sizeof(void *)), strm, raw_args, 0);
result = cuLaunchKernel(fcache_[device_id], wl.grid_dim(0), wl.grid_dim(1), wl.grid_dim(2),
wl.block_dim(0), wl.block_dim(1), wl.block_dim(2),
(static_cast<uint32_t>(args_size) / sizeof(void*)), strm, raw_args, 0);
if (raw_args != NULL) {
free(raw_args);
raw_args = NULL;
}
#else
result = cuLaunchKernel(
fcache_[device_id],
wl.grid_dim(0),
wl.grid_dim(1),
wl.grid_dim(2),
wl.block_dim(0),
wl.block_dim(1),
wl.block_dim(2),
0, strm, void_args, 0);
result =
cuLaunchKernel(fcache_[device_id], wl.grid_dim(0), wl.grid_dim(1), wl.grid_dim(2),
wl.block_dim(0), wl.block_dim(1), wl.block_dim(2), 0, strm, void_args, 0);
#endif

if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) {
const char *msg;
const char* msg;
cuGetErrorName(result, &msg);
std::ostringstream os;
os << "CUDALaunch Error: " << msg << "\n"
<< " grid=(" << wl.grid_dim(0) << ","
<< wl.grid_dim(1) << "," << wl.grid_dim(2) << "), "
<< " block=(" << wl.block_dim(0) << ","
<< wl.block_dim(1) << "," << wl.block_dim(2) << ")\n";
<< " grid=(" << wl.grid_dim(0) << "," << wl.grid_dim(1) << "," << wl.grid_dim(2) << "), "
<< " block=(" << wl.block_dim(0) << "," << wl.block_dim(1) << "," << wl.block_dim(2)
<< ")\n";
std::string cuda = m_->GetSource("");
if (cuda.length() != 0) {
os << "// func_name=" << func_name_ << "\n"
@@ -283,9 +260,7 @@ class CUDAWrappedFunc {

class CUDAPrepGlobalBarrier {
public:
CUDAPrepGlobalBarrier(CUDAModuleNode* m,
ObjectPtr<Object> sptr)
: m_(m), sptr_(sptr) {
CUDAPrepGlobalBarrier(CUDAModuleNode* m, ObjectPtr<Object> sptr) : m_(m), sptr_(sptr) {
std::fill(pcache_.begin(), pcache_.end(), 0);
}

@@ -293,8 +268,8 @@ class CUDAPrepGlobalBarrier {
int device_id;
CUDA_CALL(cudaGetDevice(&device_id));
if (pcache_[device_id] == 0) {
pcache_[device_id] = m_->GetGlobal(
device_id, runtime::symbol::tvm_global_barrier_state, sizeof(unsigned));
pcache_[device_id] =
m_->GetGlobal(device_id, runtime::symbol::tvm_global_barrier_state, sizeof(unsigned));
}
CUDA_DRIVER_CALL(cuMemsetD32(pcache_[device_id], 0, 1));
}
@@ -308,12 +283,10 @@ class CUDAPrepGlobalBarrier {
mutable std::array<CUdeviceptr, kMaxNumGPUs> pcache_;
};

PackedFunc CUDAModuleNode::GetFunction(
const std::string& name,
const ObjectPtr<Object>& sptr_to_self) {
PackedFunc CUDAModuleNode::GetFunction(const std::string& name,
const ObjectPtr<Object>& sptr_to_self) {
CHECK_EQ(sptr_to_self.get(), this);
CHECK_NE(name, symbol::tvm_module_main)
<< "Device function do not have main";
CHECK_NE(name, symbol::tvm_module_main) << "Device function do not have main";
if (name == symbol::tvm_prepare_global_barrier) {
return PackedFunc(CUDAPrepGlobalBarrier(this, sptr_to_self));
}
@@ -322,7 +295,7 @@ PackedFunc CUDAModuleNode::GetFunction(
const FunctionInfo& info = it->second;
CUDAWrappedFunc f;
std::vector<size_t> arg_size(info.arg_types.size());
for (int i=0; i<static_cast<int>(info.arg_types.size()); ++i){
for (int i = 0; i < static_cast<int>(info.arg_types.size()); ++i) {
TVMType t = info.arg_types[i];
CHECK_EQ(t.lanes, 1U);
uint32_t bits = t.bits;
@@ -333,18 +306,15 @@ PackedFunc CUDAModuleNode::GetFunction(
return PackFuncVoidAddr(f, info.arg_types);
}

Module CUDAModuleCreate(
std::string data,
std::string fmt,
std::unordered_map<std::string, FunctionInfo> fmap,
std::string cuda_source) {
Module CUDAModuleCreate(std::string data, std::string fmt,
std::unordered_map<std::string, FunctionInfo> fmap,
std::string cuda_source) {
auto n = make_object<CUDAModuleNode>(data, fmt, fmap, cuda_source);
return Module(n);
}

// Load module from module.
Module CUDAModuleLoadFile(const std::string& file_name,
const std::string& format) {
Module CUDAModuleLoadFile(const std::string& file_name, const std::string& format) {
std::string data;
std::unordered_map<std::string, FunctionInfo> fmap;
std::string fmt = GetFileFormat(file_name, format);
@@ -365,14 +335,10 @@ Module CUDAModuleLoadBinary(void* strm) {
return CUDAModuleCreate(data, fmt, fmap, std::string());
}

TVM_REGISTER_GLOBAL("module.loadfile_cubin")
.set_body_typed(CUDAModuleLoadFile);
TVM_REGISTER_GLOBAL("module.loadfile_cubin").set_body_typed(CUDAModuleLoadFile);

TVM_REGISTER_GLOBAL("module.loadfile_ptx")
.set_body_typed(CUDAModuleLoadFile);
TVM_REGISTER_GLOBAL("module.loadfile_ptx").set_body_typed(CUDAModuleLoadFile);

TVM_REGISTER_GLOBAL("module.loadbinary_cuda")
.set_body_typed(CUDAModuleLoadBinary);
TVM_REGISTER_GLOBAL("module.loadbinary_cuda").set_body_typed(CUDAModuleLoadBinary);
} // namespace runtime
} // namespace air


Loading…
Cancel
Save