Browse Source

!17 [TUNING] add the gpu-tuning process to master

From: @yiyanzhi_akane
Reviewed-by: @dylangeng
Signed-off-by:
tags/v1.2.0
mindspore-ci-bot Gitee 5 years ago
parent
commit
0737733069
27 changed files with 3520 additions and 81 deletions
  1. +15
    -1
      python/akg/build_module.py
  2. +33
    -3
      python/akg/utils/custom_tiling.py
  3. +5
    -6
      python/akg/utils/kernel_exec.py
  4. +24
    -0
      src/poly/tiling/custom_tiling.h
  5. +55
    -22
      src/poly/tiling/gen_tiling_space.cc
  6. +10
    -0
      src/poly/tiling/tile_space.h
  7. +19
    -5
      src/poly/tiling/tiling_analyzer.cc
  8. +20
    -11
      src/poly/tiling/tiling_analyzer.h
  9. +6
    -3
      src/poly/tiling/tiling_strategy_manager.h
  10. +213
    -29
      src/poly/tiling/tiling_strategy_manager_gpu.cc
  11. +0
    -0
      tests/fuzz/tune_for_gpu/__init__.py
  12. +17
    -0
      tests/fuzz/tune_for_gpu/autotuning/data_utils/sort_log.py
  13. +95
    -0
      tests/fuzz/tune_for_gpu/autotuning/gen_spaces_gpu.py
  14. +501
    -0
      tests/fuzz/tune_for_gpu/autotuning/job.py
  15. +407
    -0
      tests/fuzz/tune_for_gpu/autotuning/kernel_compiler.py
  16. +243
    -0
      tests/fuzz/tune_for_gpu/autotuning/runner.py
  17. +217
    -0
      tests/fuzz/tune_for_gpu/autotuning/space.py
  18. +753
    -0
      tests/fuzz/tune_for_gpu/autotuning/space_generators.py
  19. +147
    -0
      tests/fuzz/tune_for_gpu/autotuning/test_data_generators.py
  20. +84
    -0
      tests/fuzz/tune_for_gpu/autotuning/tiling_strategies_gpu.py
  21. +359
    -0
      tests/fuzz/tune_for_gpu/autotuning/tuner.py
  22. +9
    -0
      tests/fuzz/tune_for_gpu/autotuning/tuning_attrs_descs/reduce_tuning_attrs_desc.json
  23. +155
    -0
      tests/fuzz/tune_for_gpu/autotuning/tuning_utils.py
  24. +49
    -0
      tests/fuzz/tune_for_gpu/autotuning/type_definitions.py
  25. +16
    -0
      tests/fuzz/tune_for_gpu/config_gpu.sh
  26. +67
    -0
      tests/fuzz/tune_for_gpu/test_gpu.py
  27. +1
    -1
      tests/test_env.sh

+ 15
- 1
python/akg/build_module.py View File

@@ -50,7 +50,17 @@ def dump_tiling_info(level):
logging.info(info, tuning_spaces["index"][i][0], tuning_spaces["index"][i][1],
tuning_spaces["c1_range"][i][0], tuning_spaces["c1_range"][i][1],
tuning_spaces["c1_mod"][i][0], tuning_spaces["c0_range"][i][0],
tuning_spaces["c0_range"][i][1], tuning_spaces["c0_mod"][i][0])
tuning_spaces["c0_range"][i][1], tuning_spaces["c0_mod"][i][0],
)
idx_to_str = {0: "x", 1: "y", 2: "z"}
for i in range(len(tuning_spaces["thread_range"])):
info = "[thread.%s] range [%d, %d](jump by %d), "
logging.info(info, idx_to_str[i], tuning_spaces["thread_range"][i][0], tuning_spaces["thread_range"][i][1],
tuning_spaces['thread_mod'][i][0], )
for i in range(len(tuning_spaces["block_range"])):
info = "[block.%s] range [%d, %d](jump by %d)"
logging.info(info, idx_to_str[i], tuning_spaces["block_range"][i][0],
tuning_spaces["block_range"][i][1], tuning_spaces['block_mod'][i][0],)
logging.info("===============================================")
elif isinstance(indice, int) and indice == EMPTY_CODE:
logging.info("Empty tiling space.")
@@ -108,6 +118,10 @@ def lower(sch, args, shape_params=None, name="default_function", binds=None, att
tuning_spaces["c0_range"] = ret.c0_tile_range_table.asnumpy().tolist()
tuning_spaces["c1_mod"] = ret.c1_tile_mod_table.asnumpy().tolist()
tuning_spaces["c0_mod"] = ret.c0_tile_mod_table.asnumpy().tolist()
tuning_spaces["thread_range"] = ret.gpu_thread_range_table.asnumpy().tolist()
tuning_spaces["block_range"] = ret.gpu_block_range_table.asnumpy().tolist()
tuning_spaces["thread_mod"] = ret.gpu_thread_mod_table.asnumpy().tolist()
tuning_spaces["block_mod"] = ret.gpu_block_mod_table.asnumpy().tolist()
if level >= help_tiling_level["Candidates"]:
tuning_spaces["tuning_space"] = ret.tiling_candidate.asnumpy().tolist()
if not tuning:


+ 33
- 3
python/akg/utils/custom_tiling.py View File

@@ -70,15 +70,33 @@ class TileConstraint(Enum):
SET_EXPANSION = "SET_EXPANSION"
SET_MEM_RATIO = "SET_MEM_RATIO"
SET_AXIS_INFO = "SET_AXIS_INFO"
THREAD_MIN = "THREAD_MIN"
THREAD_MAX = "THREAD_MAX"
THREAD_MOD = "THREAD_MOD"
BLOCK_MIN = "BLOCK_MIN"
BLOCK_MAX = "BLOCK_MAX"
BLOCK_MOD = "BLOCK_MOD"


@check_input_type((double, float, int), TileConstraint, TileLevel)
@check_input_type((double, float, int, list), TileConstraint, TileLevel)
def modify_common_constraints(value, constraint, level=TileLevel.C1):
"""api for dsl to modify some default constraint used in auto tiling."""
if constraint not in TileConstraint:
raise ValueError("Tile constraints must be chosen from {0}".format(TileConstraint))
if constraint == TileConstraint.SET_MEM_RATIO:
return create_custom_tiling_node(TileMode.COMMON, tile_level=level, mem_ratio=double(value))
if constraint == TileConstraint.THREAD_MIN:
return create_custom_tiling_node(TileMode.COMMON, thread_min=value)
if constraint == TileConstraint.THREAD_MAX:
return create_custom_tiling_node(TileMode.COMMON, thread_max=value)
if constraint == TileConstraint.THREAD_MOD:
return create_custom_tiling_node(TileMode.COMMON, thread_mod=value)
if constraint == TileConstraint.BLOCK_MIN:
return create_custom_tiling_node(TileMode.COMMON, block_min=value)
if constraint == TileConstraint.BLOCK_MAX:
return create_custom_tiling_node(TileMode.COMMON, block_max=value)
if constraint == TileConstraint.BLOCK_MOD:
return create_custom_tiling_node(TileMode.COMMON, block_mod=value)
raise TypeError("Constraint {} is not supported in this api, please use other api"
.format(constraint.value))

@@ -233,7 +251,13 @@ def create_custom_tiling_node(tile_mode,
axis_info=DEFAULT_STRING,
priority=DEFAULT_VALUE,
expansion=DEFAULT_VALUE,
mem_ratio=double(DEFAULT_VALUE)):
mem_ratio=double(DEFAULT_VALUE),
thread_min=[],
thread_max=[],
thread_mod=[],
block_min=[],
block_max=[],
block_mod=[]):
"""default method to create custom tiling node, all values are default except tile mode."""

tile_min = to_tvm_type(tile_min, "tile_min")
@@ -257,7 +281,13 @@ def create_custom_tiling_node(tile_mode,
axis_info=akg.tvm.expr.StringImm(axis_info),
priority=priority,
expansion=expansion,
mem_ratio=mem_ratio)
mem_ratio=mem_ratio,
thread_min=thread_min,
thread_max=thread_max,
thread_mod=thread_mod,
block_min=block_min,
block_max=block_max,
block_mod=block_mod)


def template_nc1hwc0(tensor_name, level):


+ 5
- 6
python/akg/utils/kernel_exec.py View File

@@ -35,6 +35,7 @@ import numpy as np

import akg
from akg.build_module import help_tiling_level
from akg import backend as cce
import akg.tvm
from akg.tvm import autotvm
from akg.tvm import rpc
@@ -88,7 +89,6 @@ def debug_mode(debug_flag):
pass_list.append((0, ir_pass.inject_dma_intrin))
return pass_list


def func_time_required(func_name):
"""Checking the Time Required for Function Running."""
def wrapper(*args, **kwargs):
@@ -467,7 +467,7 @@ def mod_launch_air(mod, args, outputs):
return None

@func_time_required
def mod_launch(mod, args, outputs=(-1,), tuning=False, device_id=0, expect=None):
def mod_launch(mod, args, outputs=(-1,), tuning=False, device_id=0, expect=None, repeat_time=400):
"""
unified run CCE kernel api.

@@ -492,7 +492,7 @@ def mod_launch(mod, args, outputs=(-1,), tuning=False, device_id=0, expect=None)
if not tuning:
return out_list[0] if len(out_list) == 1 else tuple(out_list)
else:
cycles = get_gpu_cycles(mod, *mod_args, device_id=device_id, save_log=True)
cycles = get_gpu_cycles(mod, *mod_args, device_id=device_id, save_log=True, repeat_time=repeat_time)
return out_list[0] if len(out_list) == 1 else tuple(out_list), {'run_time': cycles}

stat_info = {}
@@ -996,7 +996,6 @@ def op_build(op_func, input_shapes, input_types, op_attrs=None, kernel_name="",
level = attrs.get("help_tiling") if attrs and "help_tiling" in attrs else None
if tuning or (level is not None and level > help_tiling_level['None']):
return gen_spaces_dim_key(op_func, args, s, op_var, kernel_name, attrs, polyhedral, tuning, target)

mode = get_runtime_mode()
if mode == "cpu":
mod = akg.tvm.build(s, op_var, "llvm")
@@ -1069,12 +1068,12 @@ def get_device_id():
logging.error(e)
return 0

def get_gpu_cycles(mod, *mod_args, device_id=0, save_log=False):
def get_gpu_cycles(mod, *mod_args, device_id=0, save_log=False, repeat_time=400):
"get gpu profiling cycles."
func = tvm.get_global_func('GPUProfilerInit')
func("")
from akg.utils.result_analysis import gpu_profiling
gpu_profiling(mod, *mod_args, repeat_time=400, device_id=device_id)
gpu_profiling(mod, *mod_args, repeat_time=repeat_time, device_id=device_id)
func = tvm.get_global_func('GPUProfilerStop')
a = func()
return int(a)


+ 24
- 0
src/poly/tiling/custom_tiling.h View File

@@ -80,6 +80,24 @@ class CustomTilingNode : public Node {
* default is 0.5 which is reserved for double buffer*/
double mem_ratio;

/*! \brief minimal thread binding factor on gpu, greater than 0*/
Array<Expr> thread_min;

/*! \brief maximal thread binding factor on gpu*/
Array<Expr> thread_max;

/*! \brief constraint thread binding factor % thread_mod == 0*/
Array<Expr> thread_mod;

/*! \brief minimal block binding factor on gpu, greater than 0*/
Array<Expr> block_min;

/*! \brief maximal block binding factor on gpu*/
Array<Expr> block_max;

/*! \brief constraint block binding factor % block_mod == 0*/
Array<Expr> block_mod;

void VisitAttrs(AttrVisitor *v) {
v->Visit("tile_level", &tile_level);
v->Visit("tile_mode", &tile_mode);
@@ -97,6 +115,12 @@ class CustomTilingNode : public Node {
v->Visit("priority", &priority);
v->Visit("expansion", &expansion);
v->Visit("mem_ratio", &mem_ratio);
v->Visit("thread_min", &thread_min);
v->Visit("thread_max", &thread_max);
v->Visit("thread_mod", &thread_mod);
v->Visit("block_min", &block_min);
v->Visit("block_max", &block_max);
v->Visit("block_mod", &block_mod);
}

static constexpr const char *_type_key = "CustomTilingNode";


+ 55
- 22
src/poly/tiling/gen_tiling_space.cc View File

@@ -36,6 +36,15 @@ class TileSpaceCollector {
space_->c1_tile_mod_table = init_array;
space_->c0_tile_mod_table = init_array;
space_->tiling_candidate = init_array;
space_->gpu_thread_range_table = init_array;
space_->gpu_block_range_table = init_array;
space_->gpu_thread_mod_table = init_array;
space_->gpu_block_mod_table = init_array;
if (analyzer_.scop_info_.user_config_.GetTarget() == TARGET_CUDA) {
cared_info_ = {"index", "C1_range", "C0_range", "C1_mod", "C0_mod", "gpu_thread_range", "gpu_block_range", "gpu_thread_mod", "gpu_block_mod"};
} else {
cared_info_ = {"index", "C1_range", "C0_range", "C1_mod", "C0_mod"};
}
}
~TileSpaceCollector() = default;

@@ -122,38 +131,61 @@ class TileSpaceCollector {
// step 2. collect cared info from each axis
for (const auto &con : cared_info_) {
int length = con.find("mod") != std::string::npos ? 1 : 2;
auto array = air::runtime::NDArray::Empty({static_cast<int64_t>(tile_size), length}, type, ctx);
auto size = static_cast<int64_t>(tile_size);
if (con.find("gpu") != std::string::npos) {
size = std::max<int64_t>(3, size);
}
auto array = air::runtime::NDArray::Empty({size, length}, type, ctx);
auto spaceDlPack = array.ToDLPack();
auto ptr = reinterpret_cast<int *>(spaceDlPack->dl_tensor.data);
for (size_t b_idx = 0; b_idx < all_axes.size(); ++b_idx) {
for (size_t a_idx = 0; a_idx < all_axes[b_idx].size(); ++a_idx) {
if (con == "index") {
*ptr++ = b_idx;
*ptr++ = a_idx;
if (con.find("gpu") != std::string::npos) {
size_t s = con.find("thread") != std::string::npos ? 0 : 3;
size_t e = con.find("thread") != std::string::npos ? 3 : 6;
for (size_t i = s; i < e; ++i) {
if (length == 1) {
*ptr++ = analyzer_.binding_spaces_[i].map_mod_;
} else {
if (con == "C1_range") {
TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE1);
*ptr++ = const_cons.tile_min_.as<IntImm>()->value;
*ptr++ = const_cons.tile_extent_.as<IntImm>()->value;
} else if (con == "C0_range") {
TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE0);
*ptr++ = const_cons.tile_min_.as<IntImm>()->value;
*ptr++ = const_cons.tile_extent_.as<IntImm>()->value;
} else if (con == "C1_mod") {
TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE1);
*ptr++ = const_cons.tile_mod_.as<IntImm>()->value;
} else if (con == "C0_mod") {
TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE0);
*ptr++ = const_cons.tile_mod_.as<IntImm>()->value;
*ptr++ = analyzer_.binding_spaces_[i].map_min_;
*ptr++ = analyzer_.binding_spaces_[i].map_extent_;
}
}
} else {
for (size_t b_idx = 0; b_idx < all_axes.size(); ++b_idx) {
for (size_t a_idx = 0; a_idx < all_axes[b_idx].size(); ++a_idx) {
if (con == "index") {
*ptr++ = b_idx;
*ptr++ = a_idx;
} else {
if (con == "C1_range") {
TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE1);
*ptr++ = const_cons.tile_min_.as<IntImm>()->value;
*ptr++ = const_cons.tile_extent_.as<IntImm>()->value;
} else if (con == "C0_range") {
TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE0);
*ptr++ = const_cons.tile_min_.as<IntImm>()->value;
*ptr++ = const_cons.tile_extent_.as<IntImm>()->value;
} else if (con == "C1_mod") {
TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE1);
*ptr++ = const_cons.tile_mod_.as<IntImm>()->value;
} else if (con == "C0_mod") {
TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE0);
*ptr++ = const_cons.tile_mod_.as<IntImm>()->value;
}
}
}
}
}

if (con == "index") space_->index_table = array;
if (con == "C1_range") space_->c1_tile_range_table = array;
if (con == "C0_range") space_->c0_tile_range_table = array;
if (con == "C1_mod") space_->c1_tile_mod_table = array;
if (con == "C0_mod") space_->c0_tile_mod_table = array;
if (con == "gpu_thread_range") space_->gpu_thread_range_table = array;
if (con == "gpu_block_range") space_->gpu_block_range_table = array;
if (con == "gpu_thread_mod") space_->gpu_thread_mod_table = array;
if (con == "gpu_block_mod") space_->gpu_block_mod_table = array;

delete spaceDlPack;
}
}
@@ -196,7 +228,8 @@ class TileSpaceCollector {
bool min_tile_ok = false;
for (int64_t tile = tile_min->value; tile <= tile_extent->value; ++tile) {
bool break_constraint =
(tile != tile_min->value) && (tile != tile_extent->value) && (tile % tile_mod->value != 0);
((tile != tile_min->value) && (tile != tile_extent->value) && (tile % tile_mod->value != 0)) ||
(axis->forbid_iso && tile_extent->value % tile != 0);
if (analyzer_.scop_info_.user_config_.GetPruneTuningSpace() && break_constraint) {
continue;
}
@@ -365,7 +398,7 @@ class TileSpaceCollector {
DLContext ctx = {kDLCPU, 0};
std::vector<TileAxis *> tile_axes_;
std::vector<bool> is_shared_;
std::unordered_set<std::string> cared_info_ = {"index", "C1_range", "C0_range", "C1_mod", "C0_mod"};
std::unordered_set<std::string> cared_info_;

struct Result {
std::vector<int> tile;


+ 10
- 0
src/poly/tiling/tile_space.h View File

@@ -28,6 +28,11 @@ class TileSpaceNode : public Node {
air::runtime::NDArray c1_tile_mod_table;
air::runtime::NDArray c0_tile_mod_table;
air::runtime::NDArray tiling_candidate;
air::runtime::NDArray gpu_thread_range_table;
air::runtime::NDArray gpu_block_range_table;
air::runtime::NDArray gpu_thread_mod_table;
air::runtime::NDArray gpu_block_mod_table;


void VisitAttrs(AttrVisitor *v) {
v->Visit("index_table", &index_table);
@@ -36,6 +41,11 @@ class TileSpaceNode : public Node {
v->Visit("c1_tile_mod_table", &c1_tile_mod_table);
v->Visit("c0_tile_mod_table", &c0_tile_mod_table);
v->Visit("tiling_candidate", &tiling_candidate);
v->Visit("gpu_thread_range_table", &gpu_thread_range_table);
v->Visit("gpu_block_range_table", &gpu_block_range_table);
v->Visit("gpu_thread_mod_table", &gpu_thread_mod_table);
v->Visit("gpu_block_mod_table", &gpu_block_mod_table);

}
static constexpr const char *_type_key = "TileSpace";
TVM_DECLARE_NODE_TYPE_INFO(TileSpaceNode, Node);


+ 19
- 5
src/poly/tiling/tiling_analyzer.cc View File

@@ -1351,19 +1351,34 @@ void TilingAnalyzer::AddPostTilingConstraints() {

if (scop_info_.user_config_.GetTarget() == TARGET_CUDA) {
ReduceStrategy reduce_strategy(this);
actived_strategies.push_back(&reduce_strategy);
ModStrategy mod_strategy(this);
actived_strategies.push_back(&mod_strategy);

GemmStrategy gemm_strategy(this);
GpuDmaAnalysisStrategy dma_analysis_strategy(this);
CustomTilingStrategy custom_strategy(this);
GpuStrategy gpu_strategy(this);
if (scop_info_.analysis_result_.GetIsGpuDmaAnalysed()) {
actived_strategies.push_back(&dma_analysis_strategy);
} else {
if (scop_info_.user_config_.GetIsTuning()) {
actived_strategies.push_back(&custom_strategy);
} else {
actived_strategies.push_back(&reduce_strategy);
actived_strategies.push_back(&mod_strategy);
actived_strategies.push_back(&gemm_strategy);
}
actived_strategies.push_back(&gpu_strategy);
}
strategy_manager->SetStrategies(actived_strategies);
strategy_manager->ExecuteGpu();
if (scop_info_.user_config_.GetIsTuning()) {
binding_spaces_.clear();
for (auto i : gpu_strategy.thread_binding_spaces_) {
UpdateBindingSpace(i);
}
for (auto i : gpu_strategy.block_binding_spaces_) {
UpdateBindingSpace(i);
}
}
return;
}
}
@@ -1376,7 +1391,6 @@ void TilingAnalyzer::AddTilingConstraints() {
if (scop_info_.user_config_.GetTarget() == TARGET_CUDA) {
CastStrategy cast_strategy(this);
actived_strategies.push_back(&cast_strategy);

strategy_manager->SetStrategies(actived_strategies);
strategy_manager->ExecuteGpu();
return;
@@ -1429,7 +1443,7 @@ void TilingAnalyzer::AddTilingConstraints() {

bool TilingAnalyzer::Prepare() {
logger_ = std::unique_ptr<TileLogger>(new (std::nothrow) TileLogger(
scop_info_.AddDumpDir("tiling.log"), !scop_info_.user_config_.GetDumpPolyDir().empty()));
scop_info_.AddDumpDir("tiling.log"), !scop_info_.user_config_.GetDumpPolyDir().empty()));
CHECK(logger_) << "memory alloc fail.";
// Stage 1: Analyze schedule tree.
ScheduleTreeAnalyzer sch_ana(this, this->sch_);


+ 20
- 11
src/poly/tiling/tiling_analyzer.h View File

@@ -64,7 +64,7 @@ inline int64_t GetAlignBytes(const int64_t dtype) {
return (ALIGN_BYTES + dtype - 1) / dtype;
}

inline int64_t GetMaxAlignBytes(std::unordered_map<std::string, std::vector<int>> dtypes) {
inline int64_t GetMinBytes(std::unordered_map<std::string, std::vector<int>> dtypes) {
int64_t min_byte = -1;
for (const auto &it : dtypes) {
if (it.second.empty()) {
@@ -75,7 +75,11 @@ inline int64_t GetMaxAlignBytes(std::unordered_map<std::string, std::vector<int>
min_byte = min_elem;
}
}
return GetAlignBytes(min_byte);
return min_byte;
}

inline int64_t GetMaxAlignBytes(std::unordered_map<std::string, std::vector<int>> dtypes) {
return GetAlignBytes(GetMinBytes(dtypes));
}

inline Expr CastToExpr(const std::string &value) {
@@ -134,6 +138,12 @@ constexpr auto AT_DYNAMIC_BOUND = "DYNAMIC_BOUND";
constexpr auto AT_MOD = "MOD";
constexpr auto AT_CAST = "CAST";
constexpr auto AT_MEM_RATIO = "MEM_RATIO";
constexpr auto AT_THREAD_MIN = "THREAD_MIN";
constexpr auto AT_THREAD_MAX = "THREAD_MAX";
constexpr auto AT_THREAD_MOD = "THREAD_MOD";
constexpr auto AT_BLOCK_MIN = "BLOCK_MIN";
constexpr auto AT_BLOCK_MAX = "BLOCK_MAX";
constexpr auto AT_BLOCK_MOD = "BLOCK_MOD";

class TilingAnalyzer;

@@ -233,12 +243,12 @@ class TilingAnalyzer {
sch_(sch),
scop_info_(scop_info),
is_retry_(!global_attrs.GetStringAttr(kErrorInfo, "").empty()) {
if (scop_info.mmu_info_.IsGemm()) {
op_type_ = GEMM_OP;
} else if (scop_info.mmu_info_.IsConv()) {
op_type_ = CONV_OP;
} else {
op_type_ = VECTOR_OP;
if (scop_info.mmu_info_.IsGemm()) {
op_type_ = GEMM_OP;
} else if (scop_info.mmu_info_.IsConv()) {
op_type_ = CONV_OP;
} else {
op_type_ = VECTOR_OP;
}
}

@@ -292,7 +302,7 @@ class TilingAnalyzer {
CHECK(logger_);
return *(logger_.get());
}
void UpdateBindingSpace(TileAxis::MappingConstraint constraint) { binding_spaces_.emplace_back(constraint); }
Stmt body_;
Binds &binds_;
isl::schedule sch_;
@@ -306,9 +316,8 @@ class TilingAnalyzer {

std::unordered_map<TilingAnalyzer::BufferEntry *, std::pair<int, int>> buffer_usage_timetable_;
std::unordered_map<std::string, std::shared_ptr<BufferEntry>> buf_info_;

bool is_retry_{false};
std::vector<TileAxis::MappingConstraint> binding_spaces_; // [thread.x[min, max, mod], thread.y, thread.z, block.x, block.y, block.z]
private:
void AddTilingConstraints();
void AddPostTilingConstraints();


+ 6
- 3
src/poly/tiling/tiling_strategy_manager.h View File

@@ -284,8 +284,6 @@ class GemmStrategy : public TilingStrategy {
~GemmStrategy() {}
void AddNpuConstraint();
void AddGpuConstraint();

std::string interested_attr_key = AT_GEMM;
};

class GpuStrategy : public TilingStrategy {
@@ -306,6 +304,8 @@ class GpuStrategy : public TilingStrategy {
};
void AddNpuConstraint();
void AddGpuConstraint();
std::vector<TileAxis::MappingConstraint> thread_binding_spaces_; // [thread.x, thread.y, thread.z]
std::vector<TileAxis::MappingConstraint> block_binding_spaces_; // [block.x, block.y, block.z]

private:
void DetermineTemplate();
@@ -326,6 +326,8 @@ class GpuStrategy : public TilingStrategy {
// Step 1. Collect axes and sort them from inner to outer
void BuildAxesQueue();

void ApplyCustomConstraint();

/*
* Step 2. Tile inner axes first and map them to threads, and then tile outer axis and map the rest of them to blocks.
* e.g.
@@ -357,6 +359,7 @@ class GpuStrategy : public TilingStrategy {
int64_t min_elem_for_io_bound_ = 2;
size_t depth_{0};
bool need_reverse_{false};
bool reverse_binding_{false};
int64_t fused_size_{1};
std::unordered_map<int, std::string> template_map_ = {{0, "DEFAULT"}, {1, "PURE_ELEM"}, {2, "BROADCAST_OP"},
{3, "REDUCTION"}, {4, "ALL_REDUCE"}, {5, "BITWISE_REDUCTION"},
@@ -378,7 +381,7 @@ class MulticoreStrategy {

class TilingPriorityScorer {
public:
TilingPriorityScorer(TilingAnalyzer &analyzer) : analyzer_(analyzer), logger_(analyzer.GetTileLogger()) {}
TilingPriorityScorer(TilingAnalyzer &analyzer) : analyzer_(analyzer), logger_(analyzer.GetTileLogger()) {}
~TilingPriorityScorer() {}

/*


+ 213
- 29
src/poly/tiling/tiling_strategy_manager_gpu.cc View File

@@ -18,7 +18,6 @@
#include <numeric>

#include "tiling_analyzer.h"

namespace akg {
namespace ir {
namespace poly {
@@ -377,13 +376,129 @@ void ReduceStrategy::DealWithPostReduceTensors() {
}
}

void GpuStrategy::ApplyCustomConstraint() {
auto ParseBindingConstraint = [](const std::string constraint, size_t max_size) {
std::vector<std::string> sp = akg::common::Split(constraint, ",");
std::vector<int64_t> ret;
for (auto val : sp) {
if (ret.size() == max_size) {
break;
}
CHECK(!val.empty());
ret.emplace_back(static_cast<int>(std::strtol(val.c_str(), nullptr, 10)));
}
return ret;
};

// init binding space through template-determined limit
thread_binding_spaces_.clear();
block_binding_spaces_.clear();
for (size_t i = 0; i < thread_limit_.size(); ++i) {
TileAxis::MappingConstraint elem;
elem.map_extent_ = thread_limit_[i];
thread_binding_spaces_.emplace_back(elem);
}
for (size_t i = 0; i < std::min(depth_, block_limit_.size()); ++i) {
TileAxis::MappingConstraint elem;
elem.map_extent_ = block_limit_[i];
block_binding_spaces_.emplace_back(elem);
}

// add constraints to binding space according to custom tiling
std::unordered_set<std::string> thread_keys = {AT_THREAD_MIN, AT_THREAD_MAX, AT_THREAD_MOD};
std::unordered_set<std::string> block_keys = {AT_BLOCK_MIN, AT_BLOCK_MAX, AT_BLOCK_MOD};
for (const auto attr : analyzer_->RootAxis()->attrs) {
std::vector<int64_t> constraint;
std::vector<TileAxis::MappingConstraint> target;
if (thread_keys.find(attr.attr_key) != thread_keys.end()) {
constraint = ParseBindingConstraint(attr.attr_value, thread_binding_spaces_.size());
target = thread_binding_spaces_;
} else if (block_keys.find(attr.attr_key) != block_keys.end()) {
constraint = ParseBindingConstraint(attr.attr_value, block_binding_spaces_.size());
target = block_binding_spaces_;
}
if (constraint.empty()) {
continue;
}

for (size_t i = 0; i < constraint.size(); ++i) {
if (attr.attr_key.find("MIN") != std::string::npos) {
target[i].map_min_ = std::max<int64_t>(target[i].map_min_, constraint[i]);
} else if (attr.attr_key.find("MAX") != std::string::npos && constraint[i] > 0) {
target[i].map_extent_ = std::min<int64_t>(target[i].map_extent_, constraint[i]);
} else if (attr.attr_key.find("MOD") != std::string::npos) {
target[i].map_mod_ = std::max<int64_t>(1, constraint[i]);
}
}

if (thread_keys.find(attr.attr_key) != thread_keys.end()) {
thread_binding_spaces_ = target;
} else if (block_keys.find(attr.attr_key) != block_keys.end()) {
block_binding_spaces_ = target;
}
}

// apply custom constraint to corresponding axis and modify binding space according to tile range of axis
size_t cur_depth = 0;
analyzer_->ForEachAxisTopDown([this, &cur_depth](TileAxis *axis) {
if (axis == analyzer_->RootAxis()) {
return;
}
auto cons = axis->GetConstConstraint(CACHE1);
auto range_extent = axis->GetConstExtent();
int tile_min = cons.tile_min_.as<IntImm>()->value;
int tile_extent = cons.tile_extent_.as<IntImm>()->value;
auto idx = reverse_binding_ ? cur_depth : depth_ - 1 - cur_depth;

auto thread_extent = tile_extent;
if (idx < thread_binding_spaces_.size()) {
thread_extent = std::min<int64_t>(thread_extent, thread_binding_spaces_[idx].map_extent_);
thread_binding_spaces_[idx].map_extent_ = thread_extent;
}

auto block_extent = range_extent / tile_min;
if (idx < block_binding_spaces_.size()) {
block_extent = std::min<int64_t>(block_extent, block_binding_spaces_[idx].map_extent_);
block_binding_spaces_[idx].map_extent_ = block_extent;
}

auto block_min = block_extent / std::max<int64_t>(1, thread_extent);
if (idx < block_binding_spaces_.size()) {
block_min = std::max<int64_t>(block_min, block_binding_spaces_[idx].map_min_);
block_binding_spaces_[idx].map_min_ = block_min;
}

axis->thread_constraints.map_extent_ = thread_extent;
axis->block_constraints.map_extent_ = block_extent;
axis->block_constraints.map_min_ = block_min;
if (idx < thread_binding_spaces_.size()) {
axis->thread_constraints.map_mod_ = thread_binding_spaces_[idx].map_mod_;
}
if (idx < block_binding_spaces_.size()) {
axis->block_constraints.map_mod_ = block_binding_spaces_[idx].map_mod_;
}
++cur_depth;
});
}

void GpuStrategy::AddGpuConstraint() {
InitMappingLimit();
if (template_ == Template::BROADCAST_OP || template_ == Template::CUSTOM_CONFIG) {
if (!analyzer_->scop_info_.user_config_.GetIsTuning() &&
(template_ == Template::BROADCAST_OP || template_ == Template::CUSTOM_CONFIG)) {
BroadcastSpeedup();
}
BuildAxesQueue();
if (analyzer_->scop_info_.user_config_.GetIsTuning()) {
ApplyCustomConstraint();
for (size_t i = 0; i < max_dim_; ++i) {
TileAxis::MappingConstraint pad;
if (i >= thread_binding_spaces_.size()) {
thread_binding_spaces_.emplace_back(pad);
}
if (i >= block_binding_spaces_.size()) {
block_binding_spaces_.emplace_back(pad);
}
}
return;
}
InnerThreadOuterBlock();
@@ -391,19 +506,27 @@ void GpuStrategy::AddGpuConstraint() {
InjectiveSpeedup();
}
SetMappingConfig();
if (template_ != Template::MATMUL || !analyzer_->scop_info_.user_config_.GetEnableTensorCore()) {
analyzer_->ForEachAxisTopDown([this](TileAxis *axis) {
if (axis == analyzer_->RootAxis()) {
return;
}
axis->TileRestrainToSingleValue(axis->c1_constraints.tile_min_, TileLevel::CACHE0);
});
}
}

void GpuStrategy::InitMappingLimit() {
max_num_threads_ = analyzer_->scop_info_.user_config_.GetMaxElemPerThread();
DetermineTemplate();
std::stringstream ss;
need_reverse_ = analyzer_->scop_info_.user_config_.GetEnableAkgReduceLib() &&
analyzer_->scop_info_.analysis_result_.GetReduceDirection() == Y_DIRECTION;
reverse_binding_ = analyzer_->scop_info_.user_config_.GetEnableAkgReduceLib() &&
analyzer_->scop_info_.analysis_result_.GetReduceDirection() == Y_DIRECTION;

if (template_ == Template::CUSTOM_CONFIG) {
auto thread_config = analyzer_->scop_info_.user_config_.GetThreadConfig();
for (size_t i = 0; i < thread_config->bound; ++i) {
auto idx = need_reverse_ ? thread_config->bound - 1 - i : i;
auto idx = reverse_binding_ ? thread_config->bound - 1 - i : i;
if (idx >= depth_) {
continue;
}
@@ -427,12 +550,16 @@ void GpuStrategy::InitMappingLimit() {
} else if (template_ == Template::MATMUL) {
// This is a naive tiling strategy used in gpu when thread and block configs are already set.
// This strategy will tile up to three inner-most axes to 32 (for thread binding).
thread_limit_ = {32, 8};
if (analyzer_->scop_info_.user_config_.GetEnableTensorCore()) {
thread_limit_ = {warp_sizes_, 16};
} else {
thread_limit_ = {warp_sizes_, 8};
}
} else {
thread_limit_ = {max_x_y_dim_thread_, max_x_y_dim_thread_, max_z_dim_thread_};
}

if (template_ != Template::CUSTOM_CONFIG) {
if (template_ != Template::CUSTOM_CONFIG && !analyzer_->scop_info_.user_config_.GetEnableTensorCore()) {
AdjustThreadMappingLimit();
}

@@ -505,13 +632,21 @@ void GpuStrategy::InnerThreadOuterBlock() {
tile = tile == SpItemPerThread::AUTO ? std::min(axis->thread_constraints.item_process_, max_elem_per_thread_)
: tile == SpItemPerThread::FULL ? std::min(shape, max_elem_per_thread_)
: 1;
if (axis->block_constraints.map_extent_ > 1) {
tile =
std::max(tile, std::max<int64_t>(ceil(static_cast<float>(shape) / axis->block_constraints.map_extent_), 1));
pending_axes_.push_back(std::make_pair(axis, std::max<int64_t>(ceil(static_cast<float>(shape) / tile), 1)));
ss << ", map to block.";
auto tile_min = axis->c1_constraints.tile_min_.as<IntImm>()->value;
auto tile_extent = axis->c1_constraints.tile_extent_.as<IntImm>()->value;
if (tile_min == tile_extent && tile_extent != MIN_TILE) {
ss << "tile extent is already determined = " << tile_extent;
analyzer_->GetTileLogger().AppendLog(GPU_MAPPING, ss);
tile = tile_min;
} else {
tile = std::min(tile, shape);
if (axis->block_constraints.map_extent_ > 1) {
tile =
std::max(tile, std::max<int64_t>(ceil(static_cast<float>(shape) / axis->block_constraints.map_extent_), 1));
pending_axes_.push_back(std::make_pair(axis, std::max<int64_t>(ceil(static_cast<float>(shape) / tile), 1)));
ss << ", map to block.";
} else {
tile = std::min(tile, shape);
}
}
axis->TileRestrainLower(tile, TileLevel::CACHE1);
ss << ", tile = " << tile;
@@ -522,19 +657,11 @@ void GpuStrategy::InnerThreadOuterBlock() {
rest_threads = std::min(rest_threads, axis->thread_constraints.map_extent_);
}

if (thread_cfg_.size() >= thread_dim || inner_dim >= max_dim_) {
if (rest_threads <= 1 || thread_cfg_.size() >= thread_dim || inner_dim >= max_dim_) {
ss << ", no thread/dim rests";
SkipMapping();
continue;
}
if (rest_threads <= 1) {
if (axis->mc_sup ||
(template_ == Template::REDUCTION && analyzer_->scop_info_.user_config_.GetEnableAkgReduceLib())) {
thread_cfg_.emplace_back(1);
}
SkipMapping();
continue;
}
auto item = elem_per_thread_[inner_dim] == SpItemPerThread::AUTO ? axis->thread_constraints.item_process_
: elem_per_thread_[inner_dim];
item = std::min(item, max_elem_per_thread_);
@@ -575,6 +702,7 @@ void GpuStrategy::InnerThreadOuterBlock() {
if (pending_axes_.size() - i > block_dim) {
auto axis = pending_axes_[i].first;
ss << "axis " << axis->index << "_" << axis->dim_axis

<< " exceeded block dim and should be mapped to block for higher performance, consider flatten";
analyzer_->GetTileLogger().AppendLog(GPU_MAPPING, ss);
continue;
@@ -594,7 +722,7 @@ void GpuStrategy::InnerThreadOuterBlock() {
int64_t shape;
std::tie(axis, shape) = pending_axes_[i];
auto idx = pending_axes_.size() - 1 - i;
idx = need_reverse_ ? block_limit_.size() - 1 - idx : idx;
idx = reverse_binding_ ? block_limit_.size() - 1 - idx : idx;
auto rest_blocks = std::min(max_num_blocks_ / activated_blocks, block_limit_[idx]);
rest_blocks = std::min(rest_blocks, axis->block_constraints.map_extent_);
ss << "axis " << axis->index << "_" << axis->dim_axis << " shape = " << shape << ", rest blocks = " << rest_blocks;
@@ -635,11 +763,9 @@ void GpuStrategy::SetMappingConfig() {
if (block_cfg_.empty()) {
block_cfg_.emplace_back(1);
}
bool reverse_binding = (analyzer_->scop_info_.user_config_.GetEnableAkgReduceLib() &&
analyzer_->scop_info_.analysis_result_.GetReduceDirection() == Y_DIRECTION);
std::string block_str = "";
std::string thread_str = "";
if (reverse_binding) {
if (reverse_binding_) {
for (int i = 0; i < static_cast<int>(block_cfg_.size()); ++i) {
if (i >= block_count_) {
continue;
@@ -753,7 +879,7 @@ int64_t GpuStrategy::TileAfterThreadMapping(TileAxis *axis, size_t inner_dim, in
tile = thread_size;
ss << "tile = thread size, ";
} else {
auto block_dim = need_reverse_ ? inner_dim : block_limit_.size() - 1 - inner_dim;
auto block_dim = reverse_binding_ ? inner_dim : block_limit_.size() - 1 - inner_dim;
int64_t least_blocks;
if (block_dim >= 0 && block_dim < block_limit_.size()) {
least_blocks = block_limit_[block_dim];
@@ -1139,12 +1265,70 @@ void GpuStrategy::GpuVectorBroadcastStrategy() {
}
}

// Parse user-supplied custom tiling attributes and translate them into
// per-axis tile constraints (min / max / fixed factor / mod / forbid-isolation).
void CustomTilingStrategy::AddGpuConstraint() {
  auto interested_info = GetInterestedInfo(interested_attr_key, false);
  for (auto it : interested_info) {
    TileAxis *axis = it.first;
    for (auto attr : it.second) {
      // attr_key must be of the form "<prefix>:<suffix>".
      std::vector<std::string> modes = akg::common::Split(attr.attr_key, ":");
      CHECK_EQ(modes.size(), 2U);
      std::string constraint_str = attr.attr_value;
      // "lhs->rhs" form: only the part after "->" carries the constraint.
      if (constraint_str.find("->") != std::string::npos) {
        std::vector<std::string> res = akg::common::Split(constraint_str, "->");
        constraint_str = res[1];
      }
      // Constraint grammar: "LEVEL:<C1|C0>_<ITEM>:<VALUE>_..." joined by '_'.
      std::vector<std::string> constraints = akg::common::Split(constraint_str, "_");
      CHECK_GE(constraints.size(), 1U);
      std::vector<std::string> level = akg::common::Split(constraints[0], ":");
      CHECK(level.size() == 2U && level[0] == "LEVEL");
      CHECK(level[1] == "C1" || level[1] == "C0");
      TileLevel lv = level[1] == "C1" ? CACHE1 : CACHE0;
      // The remaining tokens are ITEM:VALUE pairs applied to the chosen level.
      constraints.erase(constraints.begin());
      for (const auto &con : constraints) {
        std::vector<std::string> items = akg::common::Split(con, ":");
        CHECK_EQ(items.size(), 2U);
        CHECK_NE(items[0], "");
        CHECK_NE(items[1], "");
        if (items[0] == "MIN") {
          if (items[1] == "MIN") {
            // "MIN:MIN" pins the tile extent down to the axis' current minimum.
            if (lv == CACHE1) {
              axis->c1_constraints.tile_extent_ = axis->c1_constraints.tile_min_;
            } else if (lv == CACHE0) {
              axis->c0_constraints.tile_extent_ = axis->c0_constraints.tile_min_;
            }
          } else {
            // Numeric lower bound for the tile size.
            if (lv == CACHE1) {
              axis->c1_constraints.tile_min_ = CastToExpr(items[1]);
            } else if (lv == CACHE0) {
              axis->c0_constraints.tile_min_ = CastToExpr(items[1]);
            }
          }
        } else if (items[0] == "FACTOR") {
          // Fix the tile size to a single value.
          axis->TileRestrainToSingleValue(CastToExpr(items[1]), lv);
        } else if (items[0] == "FORBIDISO") {
          axis->forbid_iso = true;
        } else if (items[0] == "MAX") {
          if (items[1] == "FULL") {
            // Tile the whole axis at this level.
            axis->TileRestrainEntire(lv);
          } else {
            // Numeric upper bound for the tile size.
            if (lv == CACHE1) {
              axis->c1_constraints.tile_extent_ = CastToExpr(items[1]);
            } else if (lv == CACHE0) {
              axis->c0_constraints.tile_extent_ = CastToExpr(items[1]);
            }
          }
        } else if (items[0] == AT_MOD) {
          axis->TileRestrainMod(CastToExpr(items[1]), lv);
        }
      }
    }
  }
}

// No constraint found in cuda

void ModStrategy::AddGpuConstraint() {}

void CustomTilingStrategy::AddGpuConstraint() {}

void ConflictTreeRangeStrategy::AddGpuConstraint() {}

void VectorizedStrategy::AddGpuConstraint() {}


+ 0
- 0
tests/fuzz/tune_for_gpu/__init__.py View File


+ 17
- 0
tests/fuzz/tune_for_gpu/autotuning/data_utils/sort_log.py View File

@@ -0,0 +1,17 @@
import sys

if __name__ == "__main__":
from_log_file = str(sys.argv[1])
sorted_log_file = str(sys.argv[2])
f_in = open(from_log_file, 'r')
f_out = open(sorted_log_file, "wt")
d = dict()
for line in f_in:
config = line.split("|")
d[str(config[1])] = float(config[2])
sorted_dict = {k: v for k, v in sorted(
d.items(), key=lambda item: (item[1], item[0]))}
for k, v in sorted_dict.items():
f_out.write("|" + str(k) + "|" + str(v) + "\n")
f_in.close()
f_out.close()

+ 95
- 0
tests/fuzz/tune_for_gpu/autotuning/gen_spaces_gpu.py View File

@@ -0,0 +1,95 @@
from .kernel_compiler import compile_kernel
from collections import namedtuple
from .space import ListConfigSpace

def get_reduce_axis_length(in_shape, reduce_axis):
    """Return ``(lx, ly)``, the flattened extents for a reduce layout.

    Three cases, mirroring the space generator:
      - all-reduce (``reduce_axis`` is None or covers every axis):
        lx = product of the whole shape, ly = 1;
      - reduce-x (last axis is reduced): lx = product of reduced axes,
        ly = product of kept axes;
      - reduce-y (last axis kept): lx = product of kept axes,
        ly = product of reduced axes.
    """
    lx, ly = 1, 1
    # `is None`, not `== None` (fixed idiom defect from the original).
    if reduce_axis is None or len(reduce_axis) == len(in_shape):
        for extent in in_shape:
            lx *= extent
    elif (len(in_shape) - 1) in reduce_axis:
        for i, extent in enumerate(in_shape):
            if i in reduce_axis:
                lx *= extent
            else:
                ly *= extent
    else:
        for i, extent in enumerate(in_shape):
            if i in reduce_axis:
                ly *= extent
            else:
                lx *= extent
    return lx, ly

def _get_space_reduce_gpu_manually(op_type: str, op_desc, tuning_attrs=None, tuning_attrs_info=None):
    """get config space of reduce_sum operators in gpu

    Builds tiling/block/thread candidates by hand for the three reduce
    layouts (all-reduce, reduce-x, reduce-y) and crosses them with the
    attribute options in ``tuning_attrs_info[1]`` when any are declared.

    Returns ``(index_table, space, key, expect, input_for_mod)``.
    """
    # NOTE: tuning_attrs is accepted for signature parity with the other
    # space generators but is not consulted here.  Default changed from a
    # shared mutable [] to None.
    tuning_attrs = [] if tuning_attrs is None else tuning_attrs
    space_res, key, expect, input_for_mod = compile_kernel(op_type, op_desc, None, None, None, 0,
                                                           gen_tiling_spaces=True)
    in_shape, reduce_axis = op_desc[2].in_shape, op_desc[2].axis
    dim_len = 1 if reduce_axis is None or len(reduce_axis) == len(in_shape) else 2
    dim_names = ['tiling_' + str(i) for i in range(dim_len)]
    dim_names.extend(["block_x", "block_y", "block_z",
                      "thread_x", "thread_y", "thread_z"])
    # BUG FIX: the original iterated `for key in tuning_attrs_info[0]`,
    # clobbering the `key` returned by compile_kernel, so the function
    # returned the last attr name instead of the kernel key.
    for attr_name in tuning_attrs_info[0]:
        dim_names.append(attr_name)
    lx, ly = get_reduce_axis_length(in_shape, reduce_axis)

    tiling_spaces = []
    if reduce_axis is None or len(reduce_axis) == len(in_shape):
        # all-reduce: one tiling dim; thread_x candidates are powers of two
        possible_tx_list = [2 ** i for i in range(4, 11)]
        for tx in possible_tx_list:
            if tx > lx:
                break
            possible_dim0_list = [d0 for d0 in range(tx, lx + 1, tx)]
            if possible_dim0_list[-1] != lx:
                possible_dim0_list.append(lx)
            for d0 in possible_dim0_list:
                bx = lx // d0 if lx % d0 == 0 else lx // d0 + 1
                tiling_spaces.append([d0, bx, 1, 1, tx, 1, 1])
    elif (len(in_shape) - 1) in reduce_axis:
        # reduce-x: tile the reduced (x) dimension, one block row per y
        possible_tx_list = [2 ** i for i in range(4, 11)]
        for tx in possible_tx_list:
            if tx > lx:
                break
            ty = 1
            by = ly
            possible_dim1_list = [d1 for d1 in range(tx, lx + 1, tx)]
            if possible_dim1_list[-1] != lx:
                possible_dim1_list.append(lx)
            for d1 in possible_dim1_list:
                bx = lx // d1 if lx % d1 == 0 else lx // d1 + 1
                tiling_spaces.append([1, d1, bx, by, 1, tx, ty, 1])
    else:
        # reduce-y: fixed thread_x, sweep thread_y up to the 1024-thread cap
        tx = min(32, lx)
        bx = lx // tx if lx % tx == 0 else lx // tx + 1
        d0 = tx
        for ty in range(min(8, ly), 1025):
            if ty * tx > 1024:
                break
            possible_dim1_list = [d1 for d1 in range(ty, ly + 1, ty)]
            for d1 in possible_dim1_list:
                by = ly // d1 if ly % d1 == 0 else ly // d1 + 1
                tiling_spaces.append([d0, d1, bx, by, 1, tx, ty, 1])

    input_type = namedtuple(op_type, dim_names)
    space = ListConfigSpace(input_type)
    if len(tuning_attrs_info[0]) != 0:
        # cross every tiling candidate with every attr combination
        for tiling_space in tiling_spaces:
            for tuning_attrs_config in tuning_attrs_info[1]:
                tmp = tiling_space[:]
                tmp.extend(tuning_attrs_config)
                config = input_type(*tmp)
                space.add(config)
    else:
        for tiling_space in tiling_spaces:
            config = input_type(*tiling_space)
            space.add(config)
    return space_res.index_table, space, key, expect, input_for_mod

+ 501
- 0
tests/fuzz/tune_for_gpu/autotuning/job.py View File

@@ -0,0 +1,501 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""AutoTuning job"""
import os
import json
import time
import datetime
import importlib
import logging
import pandas as pd
import subprocess
import numpy as np
from collections import namedtuple
from multiprocessing import Process, Manager
from akg import composite
from akg.utils import kernel_exec as utils
from akg.composite.build_module import generate_trait
from autotuning.runner import KernelRunner, error_time_list, error_time_string
from autotuning.tuner import ModelBasedTuner, Tuner
from autotuning.type_definitions import ConvDesc, ConvBackpropDesc, MatmulCubeDesc
from autotuning.space_generators import get_space
from autotuning.space import ListConfigSpace
from autotuning.test_data_generators import gen_data
from autotuning.space_generators import gen_bool_list
from autotuning.tuning_utils import *

logging.basicConfig(level=logging.DEBUG)

logger = logging.getLogger('fuzz.tune.autotuning.job')

storage_dir = './res/'

if not os.path.exists(storage_dir):
os.makedirs(storage_dir)

json_file = './res/' + "{0}" + ".json"
json_load = './autotuning/shapes/' + "{0}"


def get_repo(repo, keys, default=None):
    """Walk nested dicts in *repo* along *keys*.

    Returns the value at the end of the path, or *default* as soon as any
    hop is missing or falsy.
    """
    node = repo
    for k in keys:
        node = node.get(k)
        if not node:
            return default
    return node


def get_json_space(json_input, space_dict):
    """Compute the tiling space for *json_input* and stash it under 'res'.

    Runs as the target of a subprocess; *space_dict* is a Manager dict so
    the parent can read the result (or detect a timeout when 'res' is absent).
    """
    space_dict['res'] = composite.get_tiling_space(json_input, 2)


def launch_json(debug_mode: bool = True, save_res: bool = False, json_dir="", repo_path="", all_space=False,
                skip_exist=True, extra_tune=False, self_attrs=None, tuning_attrs=None):
    """composite json tuning launch

    Iterates every json file in *json_dir*: builds a tuning space (in a
    subprocess capped at 600s), tunes the kernel, and optionally saves the
    best result against the repo at *repo_path*.

    Args:
        debug_mode: use tiny iteration counts for a quick smoke run.
        save_res: persist the best config via save_tuning_result.
        json_dir: directory with composite-op json descriptions.
        repo_path: repo json used for skip/compare/save.
        all_space: exhaustive Tuner instead of ModelBasedTuner.
        skip_exist: skip files whose trait already exists in the repo.
        extra_tune: tune only the bool attrs in *self_attrs*, not tiling.
        self_attrs: attribute names used in extra_tune mode.
        tuning_attrs: extra bool attrs crossed with each tiling config.
    """
    # Defaults changed from shared mutable []s to None (mutable-default pitfall).
    self_attrs = [] if self_attrs is None else self_attrs
    tuning_attrs = [] if tuning_attrs is None else tuning_attrs
    subprocess.run("mkdir -p res/", shell=True)
    iter_times = [3, 3, 3] if debug_mode else [80, 160, 320]
    files = os.listdir(json_dir)
    with open(repo_path, 'r') as f:
        repo = json.loads(f.read())
    for input_file in files:
        print("----Start tuning for ", input_file)
        with open(json_dir + '/' + input_file, 'r') as f:
            json_input = f.read()
        json_content = json.loads(json_input)
        # scalar inputs arrive with shape []; normalize to [1]
        for input_desc in json_content["input_desc"]:
            if input_desc[0]["shape"] == []:
                input_desc[0]["shape"] = [1]
        json_input = json.dumps(json_content)

        # skip tuning for info in repo
        if skip_exist:
            compute, shape, dtype = generate_trait(json_content)
            if get_repo(repo, [compute, shape, dtype]):
                print("Info for %s already exists" % input_file)
                print("ops are ", str(compute))
                print("shape is ", str(shape))
                print("dtype is ", str(dtype))
                with open('res/skip_file.txt', 'a') as fe:
                    fe.write(input_file)
                    fe.write("\n")
                continue

        # generate tuning space
        if not extra_tune:
            time_start_get_space = time.time()
            # space generation can hang; run it in a subprocess with a timeout
            with Manager() as manager:
                space_dict = manager.dict()
                p = Process(target=get_json_space,
                            args=(json_input, space_dict))
                p.start()
                p.join(600)
                if 'res' not in space_dict:
                    with open('res/error_space_list.txt', 'a') as fe:
                        fe.write(input_file)
                        fe.write("\n")
                    continue
                space_res = space_dict['res']
            time_end_get_space = time.time()
            print("get space time: ", time_end_get_space - time_start_get_space)
            index_table = space_res['index']
            tiling_spaces = space_res['tuning_space']
            if not isinstance(tiling_spaces, list):
                with open('res/empty_space_list.txt', 'a') as fe:
                    fe.write(input_file)
                    fe.write("\n")
                continue
            dim_names = ['tiling_' + str(i)
                         for i in range(len(tiling_spaces[0]))]
            # huge spaces skip the extra attr dimensions to stay tractable
            use_tuning_attrs = len(tiling_spaces) < 10 ** 5
            if tuning_attrs and use_tuning_attrs:
                dim_names.extend(tuning_attrs)
            input_type = namedtuple("json", dim_names)
            space = ListConfigSpace(input_type)
            if tuning_attrs and use_tuning_attrs:
                attr_options = gen_bool_list(tuning_attrs)
                for tiling_space in tiling_spaces:
                    for attr_option in attr_options:
                        tmp = tiling_space[:]
                        tmp.extend(attr_option)
                        config = input_type(*tmp)
                        space.add(config)
            else:
                for tiling_space in tiling_spaces:
                    config = input_type(*tiling_space)
                    space.add(config)
        else:
            # extra_tune: space is just the boolean combinations of self_attrs
            index_table = []
            pre_lists = gen_bool_list(self_attrs)
            pre_input_type = namedtuple("extra_tune", self_attrs)
            space = ListConfigSpace(pre_input_type)
            for item in pre_lists:
                config = pre_input_type(*item)
                space.add(config)

        key = json_content["op"]
        try:
            input_for_mod, expect = gen_data(
                op_type="json", op_desc=json_input)
        except BaseException as e:
            logger.debug(
                "gen numpy data from [%s] failed: %s", input_file, str(e))
            with open('res/error_gen_data_list.txt', 'a') as fe:
                fe.write(input_file)
                fe.write(": ")
                fe.write(str(e))
                fe.write("\n")
            continue
        print('space size:', space.length)
        print('index table:', index_table)

        output_para = None  # this is for multi-output
        if len(json_content["output_desc"]) > 1:
            output_para = []
            for i in range(len(json_content["output_desc"])):
                output_para.append(i - len(json_content["output_desc"]))
        runner = KernelRunner(op_type="json", op_desc=json_input, index_table=index_table, self_attrs=self_attrs,
                              input_data=input_for_mod, expect=expect, mod_output_param=output_para, timeout=180,
                              repeat_times=1)

        # we can only get a valid tiling, or accurate get cycles
        is_truly_profiling = utils.get_profiling_mode(
        ) or os.environ['RUNTIME_MODE'] == "gpu"

        # available device numbers, normally is 8 or 1
        available_device_numbers = utils.get_available_devices_num()

        if all_space:
            tuner = Tuner(runner, index_table, space,
                          n_parallel=available_device_numbers)
            least_try_times = 3  # space.length
        else:
            tuner = ModelBasedTuner(runner, index_table, space,
                                    n_parallel=available_device_numbers if is_truly_profiling else 1,
                                    plan_size=64, pre_model=None)
            least_try_times = iter_times[0 if space.length <
                                         10 ** 4 else 1 if space.length < 10 ** 5 else 2]
        tuner.tune(least_try_times, output_file="json.log")

        print_tuning_result("json", space, index_table, tuner, key)

        if save_res:
            if extra_tune:
                save_tuning_result(key, "extra_tune",
                                   json_content, index_table, tuner, repo_path)
            else:
                save_tuning_result(key, "json", json_content,
                                   index_table, tuner, repo_path)

def jobs(op_type: str = 'add', desc=None, debug_mode: bool = True, save_res: bool = False,
         all_space: bool = True, insert_key='', conf_of_set_dim="", tuning_attrs=None,
         skip_config_set=None, tuning_attrs_info=None):
    """AutoTuning jobs

    Generates the tuning space for one op description, runs a tuner over
    it and prints the results.  Skips shapes already present in
    *conf_of_set_dim* (keyed by *insert_key* or the generated key).

    Note: default for tuning_attrs changed from a shared mutable [] to
    None; the unused `iter_times` local of the original was removed.
    """
    tuning_attrs = [] if tuning_attrs is None else tuning_attrs
    time_start_get_space = time.time()
    index_table, space, key, expect, input_for_mod = get_space(
        op_type, desc, tuning_attrs=tuning_attrs, tuning_attrs_info=tuning_attrs_info)
    time_end_get_space = time.time()
    print("get space time: ", time_end_get_space - time_start_get_space)
    print('space size:', space.length)
    print('index table:', index_table)
    key = key if insert_key == '' else insert_key

    # filter already tuned shape
    if isinstance(conf_of_set_dim, dict) and key in conf_of_set_dim:
        if isinstance(conf_of_set_dim[key], (list, tuple)) and conf_of_set_dim[key]:
            return

        if isinstance(conf_of_set_dim[key], dict):
            return

    output_para = None  # this is for multi-output
    if isinstance(input_for_mod, dict):
        input_for_mod, output_para = input_for_mod['args'], input_for_mod['outputs']
    # NOTE(review): tuning_attrs_info[2] raises when tuning_attrs_info is
    # None, exactly as in the original -- callers must supply it.
    runner = KernelRunner(op_type, desc, index_table,
                          self_attrs=None, input_data=input_for_mod,
                          expect=expect, mod_output_param=output_para,
                          timeout=30, repeat_times=1,
                          is_all_space=all_space,
                          skip_config_set=skip_config_set,
                          need_tune_json=tuning_attrs_info[2])

    # we can only get a valid tiling, or accurate get cycles
    is_truly_profiling = utils.get_profiling_mode()

    # number of multi-processing for build kernels
    available_device_numbers = get_parallel_build_num()

    time_start_tuning = time.time()
    if all_space:
        tuner = Tuner(runner, index_table, space,
                      n_parallel=available_device_numbers)
        least_try_times = space.length
    else:
        tuner = ModelBasedTuner(runner, index_table, space,
                                n_parallel=available_device_numbers if is_truly_profiling else 1,
                                plan_size=100, pre_model=None)
        least_try_times = space.length
    tuner.tune(least_try_times, output_file=op_type + ".log")

    time_end_tuning = time.time()
    print("tuning time: ", time_end_tuning - time_start_tuning)
    print_tuning_result(op_type, space, index_table, tuner, key)
    # save_results_to_csv(op_type, space, index_table, tuner, key)

    # if save_res:
    #     save_tuning_result(key, op_type, desc, index_table, tuner)


def print_tuning_result(op_type, space, index_table, tuner, key):
    """Print a summary of one tuning run.

    Shows best/original time (error sentinels are rendered via
    error_time_string) and every (config, time) pair tried.
    """
    print(op_type + " shape is:", key)
    print('space size:', space.length)
    print('index table:', index_table)
    print('best config:', tuner.best_config)
    print('best time:',
          tuner.best_time if tuner.best_time not in error_time_string else error_time_string[tuner.best_time])
    print('original time:', tuner.original_time)
    # Typo fixed in the user-facing message: "then" -> "than".
    print('optimal result is ', tuner.original_time /
          tuner.best_time, "faster than auto set dim.")
    print("total try times", len(tuner.xs))
    for x, y in zip(tuner.xs, tuner.ys):
        print(space.get(x), y if y not in error_time_string
              else error_time_string[y])


def save_results_to_csv(op_type, space, index_table, tuner, key):
    """save all results to csv

    Writes every (config, time) pair to "<op_type>_<key>.csv".  Error
    sentinels are recorded as 9999999 so the time column stays numeric.
    Note: *index_table* is unused but kept for signature parity with the
    other reporting helpers.
    """
    data = []
    for x, y in zip(tuner.xs, tuner.ys):
        data.append([space.get(x), y if y not in error_time_string else 9999999])
    df = pd.DataFrame(data, columns=["config", "time"])
    df.to_csv(op_type + "_" + key + ".csv")


def save_tuning_result(key, op_type, desc, index_table, tuner, repo_path="", extra_tune=False, platform="gpu"):
    """save tuning result

    Serializes the tuner's best configuration into the per-op json file
    and, for "json"/"extra_tune" ops, into the repo at *repo_path* when it
    beats the recorded best_cycles.  Each op family has its own translation
    from the tuned namedtuple into the attrs format the compiler consumes.
    """
    # Only serialize when tuning produced a usable best config.
    if tuner.best_config is not None and tuner.best_time not in error_time_list:
        set_dim_configs = tuner.best_config.input
        if op_type == "matmul":
            # (L1, L0) pairs: batch dims first, then n/m when tiled, then the
            # fixed 16x16 fractal dims and k.
            param = []
            for _ in range(len(desc.x_shape) - 2):
                param.append((1, 1))
            if set_dim_configs.n_l1 > 0:
                param.append((set_dim_configs.n_l1, set_dim_configs.n_l0))
            if set_dim_configs.m_l1 > 0:
                param.append((set_dim_configs.m_l1, set_dim_configs.m_l0))
            param.extend(
                [(16, 16), (16, 16), (set_dim_configs.k_l1, set_dim_configs.k_l0)])
            tiling_param = (param, {"bypass": set_dim_configs.bypass})

        # special case with different tiling parameter format
        elif op_type in ("conv", "conv_bn1"):
            param = []
            tile_hh = set_dim_configs.tile_h
            tile_coco = set_dim_configs.tile_co
            tile_mm = set_dim_configs.tile_m
            tile_kk = set_dim_configs.tile_k
            tile_nn = set_dim_configs.tile_n
            tile_ww = set_dim_configs.tile_w
            param = [tile_hh, tile_coco, tile_mm, tile_kk, tile_nn, tile_ww]
            tiling_param = (param, {"bypass": set_dim_configs.bypass})
        elif op_type == "conv_backprop_input":
            param = []
            tile_hh = set_dim_configs.tile_h
            tile_coco = set_dim_configs.tile_co
            tile_mm = set_dim_configs.tile_m
            tile_kk = set_dim_configs.tile_k
            tile_nn = set_dim_configs.tile_n
            tile_ww = set_dim_configs.tile_w
            param = [tile_hh, tile_coco, tile_mm, tile_kk, tile_nn, tile_ww]
            # NOTE(review): (param) is not a tuple -- tiling_param is the bare
            # list here, unlike the (param, {...}) tuple above; confirm intended.
            tiling_param = (param)
        elif op_type == "conv_backprop_filter":
            param = []
            tile_cici = set_dim_configs.tile_ci
            tile_khkh = set_dim_configs.tile_kh
            tile_kwkw = set_dim_configs.tile_kw
            tile_coco = set_dim_configs.tile_co
            tile_bb = set_dim_configs.tile_batch
            tile_hh = set_dim_configs.tile_h
            tile_ww = set_dim_configs.tile_w
            tile_mm = set_dim_configs.tile_m
            tile_kk = set_dim_configs.tile_k
            tile_nn = set_dim_configs.tile_n
            param = [tile_cici, tile_khkh, tile_kwkw, tile_coco,
                     tile_bb, tile_hh, tile_ww, tile_mm, tile_kk, tile_nn]
            # NOTE(review): bare list as above -- confirm intended.
            tiling_param = (param)
        elif ("batch_matmul" in op_type) and (platform == "gpu"):
            # gpu dim string: "<band> <axis> <c1> <c0>" chunks; here every even
            # index starts a new "0 <i>" prefix before the tile value.
            tiling = [str(getattr(set_dim_configs, name)) for name in getattr(
                set_dim_configs, "_fields") if name.startswith("tiling")]
            tiling_param = ""
            for i, tile_v in enumerate(tiling):
                if i % 2 == 0:
                    tiling_param += "0 " + str(i) + " "
                tiling_param += tile_v + " "

            block_param = get_block_str_from_config(set_dim_configs)
            thread_param = get_thread_str_from_config(set_dim_configs)
            config = {
                'attrs': {
                    'dim': tiling_param,
                    'bind_block': block_param,
                    'bind_thread': thread_param
                },
                'best_cycles': tuner.best_time,
                'original_cycles': tuner.original_time,
                'date': str(datetime.datetime.now()),
                'tuning_time': tuner.tuning_time,
            }
        elif op_type == "json":
            from autotuning.runner import get_attr_from_config
            tiling_param = get_attr_from_config(set_dim_configs, index_table)
        elif op_type == "reduce_sum_gpu":
            print(set_dim_configs)
            # each tiling value becomes a "0 <i> <tile> 1" chunk in the dim string
            tiling = [str(getattr(set_dim_configs, name))
                      for name in getattr(set_dim_configs, '_fields') if name.startswith('tiling')]
            tiling_param = ""
            for i, tile_v in enumerate(tiling):
                tiling_param += "0 " + str(i) + " "
                tiling_param += tile_v + " 1 "

            block_param = get_block_str_from_config(set_dim_configs)
            thread_param = get_thread_str_from_config(set_dim_configs)
            config = {
                'attrs': {
                    'dim': tiling_param,
                    'bind_block': block_param,
                    'bind_thread': thread_param
                },
                'best_cycles': tuner.best_time,
                'original_cycles': tuner.original_time,
                'date': str(datetime.datetime.now()),
                'tuning_time': tuner.tuning_time,
            }
        else:
            print(set_dim_configs)
            # generic vector op: pair every tiling value with its index entry
            tiling = [[getattr(set_dim_configs, name), 1]
                      for name in getattr(set_dim_configs, '_fields') if name.startswith('tiling')]
            tiling_param = []
            for i, tile_v in enumerate(tiling):
                tiling_param.append(index_table[i] + tile_v)
            config = []
    else:
        tiling_param = []

    # when there is a valid result, save the result
    if op_type in ("json", "extra_tune") and tuner.best_time not in error_time_list:
        # NOTE(review): key "tuning time" (with a space) differs from the
        # "tuning_time" key used in the gpu branches above -- confirm consumers.
        config = {'attrs': tiling_param,
                  'best_cycles': tuner.best_time,
                  'original_cycles': tuner.original_time,
                  "date": str(datetime.datetime.now()),
                  "tuning time": tuner.tuning_time,
                  }
        if op_type == "json":
            config["file_name"] = str(key)
        compute, shape, dtype = generate_trait(desc)
        tuner.export_dim_configs(
            config, json_file.format(op_type), False, str(key))
        save_file = "autotuning/extra_tune.json" if extra_tune else repo_path
        with open(save_file, 'r') as f:
            repo = json.loads(f.read())
        # only overwrite the repo entry when this run is strictly better
        if len(tiling_param) != 0 and (get_repo(repo, [compute, shape, dtype]) is None or
                                       int(tuner.best_time) < int(repo[compute][shape][dtype]["metadata"]["best_cycles"])):
            tuner.export_dim_configs_for_keys(config, save_file, False, [
                compute, shape, dtype, "metadata"])
    else:
        try:
            # `config` is only bound by some branches above; paths without a
            # best result reach here and the UnboundLocalError is deliberately
            # swallowed ("no result is saved").
            tuner.export_dim_configs(
                config, json_file.format(op_type), False, str(key))
        except UnboundLocalError as e:
            logger.warning(e)
            print("[save_tuning_result]: ", "no result is saved.")


def load_json_configs(op_type):
    """Load previously-saved tiling configs for *op_type*.

    Returns the parsed dict, or {} when the file is absent, unreadable,
    or contains invalid JSON.
    """
    dim_file = json_file.format(op_type)
    file_path = os.path.realpath(dim_file)
    if not os.path.isfile(file_path):
        return {}
    try:
        with open(file_path, 'r') as f:
            return json.load(f)
    except (IOError, ValueError) as e:
        # ValueError also covers json.JSONDecodeError: a corrupt cache file
        # previously crashed the run -- now it degrades to "no configs".
        logger.debug(e)
        return {}


def read_shapes_from_file(debug_mode, save_res, all_space, conf_of_set_dim, op_type):
    """Profile every shape declared in the autotuning.shapes.<op_type> module."""
    shape_module = importlib.import_module('autotuning.shapes.' + op_type)
    for shp in shape_module.shapes:
        do_profiling(shp, debug_mode, save_res,
                     all_space, op_type, conf_of_set_dim)


def do_profiling(shp, debug_mode, save_res, all_space, op_type, conf_of_set_dim=None, tuning_attrs=None, skip_config_set=None, tuning_attrs_info=None):
    """do profiling

    Dispatches one shape description *shp* to jobs() with the argument
    convention each op family expects:
      - matmul / conv / conv_backprop: build the typed desc namedtuple
        and pass str(key) as the insert key;
      - the *_gpu ops: pass *shp* through untouched plus the gpu-only
        tuning kwargs;
      - anything else: pass *shp* as the desc directly.
    """
    # remove undeleted JOB files for previous shapes
    subprocess.run("rm -rf /var/log/npu/profiling/JOB*", shell=True)
    if op_type == 'matmul':
        # NOTE(review): the last element of shp[2] is dropped -- presumably
        # not part of MatmulCubeDesc's fields; confirm against the shape file.
        key = shp[2][0:-1]
        logger.debug("start profiling: [%s]", str(key))
        desc = MatmulCubeDesc(*key)
        jobs(op_type, desc, debug_mode, save_res,
             all_space, key.__str__(), conf_of_set_dim)
        logger.debug("end profiling: [%s]", str(key))
    elif op_type.startswith('conv_backprop'):
        key = shp[2]
        logger.debug("start profiling: [%s]", str(key))
        desc = ConvBackpropDesc(*key)
        jobs(op_type, desc, debug_mode, save_res,
             all_space, key.__str__(), conf_of_set_dim)
        logger.debug("end profiling: [%s]", str(key))
    elif op_type.startswith('conv') and "gpu" not in op_type:
        key = shp[2]
        logger.debug("start profiling: [%s]", str(key))
        desc = ConvDesc(*key)
        jobs(op_type, desc, debug_mode, save_res,
             all_space, key.__str__(), conf_of_set_dim)
        logger.debug("end profiling: [%s]", str(key))
    elif op_type in ["batch_matmul_gpu", "conv_image2col_gemm_gpu", "reduce_sum_gpu"]:
        logger.debug("start profiling: [%s]", str(shp))
        jobs(op_type, shp, debug_mode, save_res,
             all_space, conf_of_set_dim=conf_of_set_dim, tuning_attrs=tuning_attrs, skip_config_set=skip_config_set, tuning_attrs_info=tuning_attrs_info)
    else:
        key = shp
        logger.debug("start profiling: [%s]", str(key))
        desc = key
        jobs(op_type, desc, debug_mode, save_res,
             all_space, conf_of_set_dim=conf_of_set_dim, skip_config_set=skip_config_set)
        logger.debug("end profiling: [%s]", str(key))


def launch(op_type, debug_mode, save_res=False, desc=None, all_space=False,
           from_json=False, tuning_attrs=None, skip_config_set=None, tuning_attrs_info=None):
    """Entry point: tune shapes from the op's shape file (desc is None) or one given desc."""
    # get the existed tiling
    previous_configs = load_json_configs(op_type) if from_json else None

    if desc is None:
        read_shapes_from_file(debug_mode, save_res,
                              all_space, previous_configs, op_type)
        return
    do_profiling(desc, debug_mode, save_res, all_space, op_type,
                 tuning_attrs=tuning_attrs, skip_config_set=skip_config_set,
                 tuning_attrs_info=tuning_attrs_info)

+ 407
- 0
tests/fuzz/tune_for_gpu/autotuning/kernel_compiler.py View File

@@ -0,0 +1,407 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Compile kernel module for operator"""
import os
from typing import NamedTuple
from base import TestBase
from akg.utils import kernel_exec as utils
from akg.utils import custom_tiling as ct_util
from akg.ops.nn import conv_bn1
from akg.ops.nn import conv, conv_backprop_input, conv_backprop_filter, batchmatmul
from test_op.batch_matmul import batch_matmul
from akg.ops.math_gpu.reduce_sum import reduce_sum
from akg.build_module import tuning_spaces
from akg.ops.nn import matmul
from test_run import batchmatmul_run, matmul_run
from .type_definitions import ConvDesc, ConvBackpropDesc, MatmulCubeDesc, ConvConfig, ConvBackpropInputConfig, ConvBackpropFilterConfig, MatmulCubeConfig
import numpy as np
from gen_random import random_gaussian
from .tuning_utils import merge_attrs


def get_spaces_gpu_manually(op_type: str, op_desc: NamedTuple = None):
    """Placeholder for manual GPU space generation; not implemented yet."""
    return None


def gen_kernel_conv(op_desc: ConvDesc, input_shape, index_table,
                    config: ConvConfig = None, idx=None, gen_tiling_spaces=False):
    """Compile kernel module for conv"""
    if index_table is not None:
        raise RuntimeError('index_table should be none')
    kernel_name = "conv_poly" if idx is None else "conv_poly" + str(idx)

    if config is None:
        attrs = {'dim': ""}
    else:
        attrs = {
            'conv_tile': [config.tile_h, config.tile_co, config.tile_m,
                          config.tile_k, config.tile_n, config.tile_w],
            'bypass': config.bypass,
        }

    # bias adds a third input tensor
    if op_desc.use_bias:
        shape = [input_shape[0], input_shape[1], input_shape[2]]
    else:
        shape = [input_shape[0], input_shape[1]]
    conv_dtype = 'float16'

    return utils.op_build(conv.conv, [shape], [conv_dtype],
                          op_attrs=[op_desc.fmap_shape, op_desc.filter_shape, op_desc.pad, op_desc.stride,
                                    op_desc.dilation, op_desc.use_bias, attrs],
                          kernel_name=kernel_name, attrs=attrs, polyhedral=True, tuning=gen_tiling_spaces)


def gen_kernel_conv_bn1(op_desc: ConvDesc, input_shape, index_table, config: ConvConfig = None,
                        idx=None, gen_tiling_spaces=False):
    """Compile kernel module for conv_bn1"""
    if index_table is not None:
        raise RuntimeError('index_table should be none')
    kernel_name = "conv_bn1_poly" if idx is None else "conv_bn1_poly" + str(idx)

    if config is None:
        attrs = {'dim': ""}
    else:
        attrs = {
            'conv_tile': [config.tile_h, config.tile_co, config.tile_m,
                          config.tile_k, config.tile_n, config.tile_w],
            'bypass': config.bypass,
        }

    # bias adds a third input tensor
    if op_desc.use_bias:
        shape = [input_shape[0], input_shape[1], input_shape[2]]
    else:
        shape = [input_shape[0], input_shape[1]]
    conv_dtype = 'float16'

    return utils.op_build(conv_bn1.conv_bn1, [shape], [conv_dtype],
                          op_attrs=[op_desc.fmap_shape, op_desc.filter_shape, op_desc.pad, op_desc.stride,
                                    op_desc.dilation, op_desc.use_bias, attrs],
                          kernel_name=kernel_name, attrs=attrs, polyhedral=True, tuning=gen_tiling_spaces)


def gen_kernel_matmul_cube(op_desc: MatmulCubeDesc, _, index_table,
                           config: MatmulCubeConfig = None, idx=None, gen_tiling_spaces=False):
    """Compile kernel module for matmul_cube"""
    if index_table is not None:
        raise RuntimeError('index_table should be none')
    kernel_name = "matmul_cube_poly" if idx is None else "matmul_cube_poly" + str(idx)
    if config is None:
        attrs = {'dim': ""}
    else:
        # batch dims first, then n/m when tiled, then the fixed 16x16
        # fractal dims and k
        tiling_param = [(1, 1)] * (len(op_desc.x_shape) - 2)
        if config.n_l1 > 0:
            tiling_param.append((config.n_l1, config.n_l0))
        if config.m_l1 > 0:
            tiling_param.append((config.m_l1, config.m_l0))
        tiling_param += [(16, 16), (16, 16), (config.k_l1, config.k_l0)]
        attrs = {'dim': ct_util.set_dims(tuple(tiling_param)), 'bypass': config.bypass}
    return matmul_run.matmul_compile(op_desc.x_shape, op_desc.y_shape, op_desc.bias, op_desc.left_format,
                                     op_desc.right_format, op_desc.out_format, op_desc.adj_x, op_desc.adj_y,
                                     op_desc.dtype, op_desc.bias_dtype, op_desc.out_dtype, kernel_name,
                                     attrs, tuning=gen_tiling_spaces)


def gen_kernel_conv_backprop_input(op_desc: ConvBackpropDesc, _, index_table, config: ConvBackpropInputConfig = None,
                                   idx=None, gen_tiling_spaces=False):
    """Compile kernel module for conv_backprop_input

    Derives the dy (NC1HWC0) and weight (fractal) input layouts from the
    forward conv geometry in *op_desc*, then builds the backward-input
    kernel.  *config*, when given, supplies the conv tiling.
    """
    if index_table is not None:
        raise RuntimeError('index_table should be none')
    kernel_name = "conv_backprop_input_poly"
    if idx is not None:
        kernel_name += str(idx)

    if config is None:
        attrs = {'dim': ""}
    else:
        tile_hh = config.tile_h
        tile_coco = config.tile_co
        tile_mm = config.tile_m
        tile_kk = config.tile_k
        tile_nn = config.tile_n
        tile_ww = config.tile_w
        tiling_param = [tile_hh, tile_coco, tile_mm, tile_kk, tile_nn, tile_ww]
        attrs = {'conv_tile': tiling_param}

    conv_dtype = 'float16'
    block_size = 16

    in_n, in_c, in_h, in_w = op_desc.fmap_shape
    cout, _, w_h, w_w = op_desc.filter_shape

    # round channels up to a multiple of the cube block
    in_c = (in_c + block_size - 1) // block_size * block_size
    cout = (cout + block_size - 1) // block_size * block_size

    pad_top, pad_bottom, pad_left, pad_right = op_desc.pad
    stride_h, stride_w = op_desc.stride

    # forward-conv output geometry doubles as the backward-input x geometry
    out_n = in_n
    out_c = cout
    out_h = (in_h + pad_top + pad_bottom - w_h) // stride_h + 1
    out_w = (in_w + pad_left + pad_right - w_w) // stride_w + 1

    x_shape = (out_n, out_c, out_h, out_w)
    w_shape = (cout, in_c, w_h, w_w)
    in_nn, in_cc, in_hh, in_ww = x_shape
    input_shape_nc1hwc0 = (in_nn, in_cc // block_size,
                           in_hh, in_ww, block_size)
    k_n, k_c, k_h, k_w = w_shape
    kernel_shape_nc1hwc0 = (k_n, k_c // block_size, k_h, k_w, block_size)
    k_n, _, k_h, k_w, _ = kernel_shape_nc1hwc0
    # fractal weight layout: (C1*KH*KW, Cout1, 16, 16)
    kernel_shape_fractal = (k_c // block_size * k_h *
                            k_w, k_n // block_size, block_size, block_size)

    shape = [input_shape_nc1hwc0, kernel_shape_fractal]

    return utils.op_build(conv_backprop_input.conv_backprop_input, [shape], [conv_dtype],
                          op_attrs=[op_desc.fmap_shape, op_desc.filter_shape, op_desc.pad,
                                    op_desc.stride, op_desc.dilation, attrs],
                          kernel_name=kernel_name, attrs=attrs, polyhedral=True, tuning=gen_tiling_spaces)


def gen_kernel_conv_backprop_filter(op_desc: ConvBackpropDesc, _, index_table, config: ConvBackpropFilterConfig = None,
                                    idx=None, gen_tiling_spaces=False):
    """Compile kernel module for conv_backprop_filter

    Derives the dy (fractal) and x (NC1HWC0) input layouts from the
    forward conv geometry in *op_desc*, then builds the backward-filter
    kernel.  *config*, when given, supplies the conv tiling.
    """
    if index_table is not None:
        raise RuntimeError('index_table should be none')
    kernel_name = "conv_backprop_filter_poly"
    if idx is not None:
        kernel_name += str(idx)

    if config is None:
        attrs = {'dim': ""}
    else:
        tile_cici = config.tile_ci
        tile_khkh = config.tile_kh
        tile_kwkw = config.tile_kw
        tile_coco = config.tile_co
        tile_bb = config.tile_batch
        tile_hh = config.tile_h
        tile_ww = config.tile_w
        tile_mm = config.tile_m
        tile_kk = config.tile_k
        tile_nn = config.tile_n
        tiling_param = [tile_cici, tile_khkh, tile_kwkw, tile_coco, tile_bb, tile_hh, tile_ww,
                        tile_mm, tile_kk, tile_nn]
        attrs = {'conv_tile': tiling_param}

    conv_dtype = 'float16'
    block_size = 16

    in_n, in_c, in_h, in_w = op_desc.fmap_shape
    cout, _, w_h, w_w = op_desc.filter_shape

    # round channels up to a multiple of the cube block
    in_c = (in_c + block_size - 1) // block_size * block_size
    cout = (cout + block_size - 1) // block_size * block_size

    pad_top, pad_bottom, pad_left, pad_right = op_desc.pad
    stride_h, stride_w = op_desc.stride

    # forward-conv output geometry = dy geometry
    out_n = in_n
    out_c = cout
    out_h = (in_h + pad_top + pad_bottom - w_h) // stride_h + 1
    out_w = (in_w + pad_left + pad_right - w_w) // stride_w + 1

    x_shape = (in_n, in_c, in_h, in_w)
    y_shape = (out_n, out_c, out_h, out_w)
    in_n, in_c, in_h, in_w = x_shape
    input_shape_nc1hwc0 = (in_n, in_c // block_size, in_h, in_w, block_size)
    o_n, o_c, o_h, o_w = y_shape
    kernel_shape_nc1hwc0 = (o_n, o_c // block_size, o_h, o_w, block_size)
    o_n, o_c1, o_h, o_w, o_c0 = kernel_shape_nc1hwc0
    # fractal dy layout: HW rounded up to whole 16-row blocks
    mo = (o_h * o_w + block_size - 1) // block_size
    mi = block_size
    kernel_shape_fractal = (o_n, o_c1, mo, mi, o_c0)

    input_shape = [kernel_shape_fractal, input_shape_nc1hwc0]

    return utils.op_build(conv_backprop_filter.conv_backprop_filter, [input_shape], [conv_dtype],
                          op_attrs=[op_desc.fmap_shape, op_desc.filter_shape, op_desc.pad,
                                    op_desc.stride, op_desc.dilation, attrs],
                          kernel_name=kernel_name, attrs=attrs, polyhedral=True, tuning=gen_tiling_spaces)


def gen_kernel_for_vector(op_desc, _, index_table=None, config: NamedTuple = None, idx=None, gen_tiling_spaces=False):
    """Compile kernel module for vector operators.

    When ``config`` is given, its ``tiling*`` fields are combined with
    ``index_table`` into the 'dim' attribute; otherwise auto tiling is used.

    Returns
    -------
    the compiled module, or [mod..., expect, param_for_mod] when
    gen_tiling_spaces is True

    Raises
    ------
    Exception
        re-raised (with the original cause chained) on any compile failure
    """
    test_base = TestBase()
    test_base.params_init(op_desc[0][0:4] + str(idx), os.getcwd())
    kernel_name = "poly_"
    if idx is not None:
        kernel_name += str(idx)
    if config is None:
        attrs = {'dim': ""}
    else:
        tiling = [[getattr(config, name), 1] for name in getattr(
            config, '_fields') if name.startswith('tiling')]
        tiling_param = [index_table[i] + element for i, element in enumerate(tiling)]
        dim_info = ct_util.set_dims(tuple(tiling_param))
        attrs = {'dim': dim_info}
    _, func, args, kwargs = test_base.ana_args(op_desc)
    if 'attrs' in kwargs:
        kwargs['attrs']['dim'] = attrs['dim']
        kwargs['attrs']['tuning'] = gen_tiling_spaces
        kwargs['attrs']['kernel_name'] = kernel_name
    else:
        # No explicit attrs kwarg: use the first dict positional arg as carrier.
        for arg_ in args:
            if isinstance(arg_, dict):
                arg_['dim'] = attrs['dim']
                arg_['tuning'] = gen_tiling_spaces
                arg_['kernel_name'] = kernel_name
                break
    try:
        if gen_tiling_spaces:
            mod, expect, param_for_mod = func(*args, **kwargs)
            mod = list(mod)
            mod.append(expect)
            mod.append(param_for_mod)
        else:
            mod = func(*args, **kwargs)
    except BaseException as e:
        print("Compile ERROR message:", e)
        print(func)
        print("Compile ERROR")
        # Chain the original exception so the root cause is not lost.
        raise Exception("Compile ERROR") from e

    return mod


def gen_kernel_batch_matmul_gpu(op_desc, _, index_table=None,
                                config: NamedTuple = None, idx=None,
                                gen_tiling_spaces=False, need_tune_json=None):
    """Compile kernel module for batch_matmul on GPU.

    Placeholder: the implementation is pending; always returns None.
    (The previous stub also assigned an unused local kernel_name.)
    """
    return None


def gen_kernel_reduce_sum_gpu(op_desc, _, index_table=None,
                              config: NamedTuple = None, idx=None, gen_tiling_spaces=False, need_tune_json=None):
    """Compile kernel module for reduce_sum on GPU.

    When ``config`` is given, its fields are merged into the op attributes
    (taken from op_desc[2]) via merge_attrs before compiling.

    Returns
    -------
    [spaces, set_dim_key, expect, [input_for_mod, output]] when
    gen_tiling_spaces is True, else the compiled module

    Raises
    ------
    Exception
        re-raised (with the original cause chained) on any compile failure
    """
    # NOTE(review): kernel_name is built here but op_build below is called
    # with the literal "reduce_sum" — confirm which name is intended.
    kernel_name = "reduce_sum_gpu_"
    if idx is not None:
        kernel_name += str(idx)
    attrs = op_desc[2]
    if config is not None:
        attrs = merge_attrs(attrs, config, need_tune_json)

    try:
        if gen_tiling_spaces:
            # NOTE: don't use this process for reduce spaces generation,
            # see function: "_get_space_reduce_gpu_manually".
            from .tiling_strategies_gpu import reduce_gpu_tiling_strategy
            spaces, set_dim_key = utils.op_build(
                reduce_sum, (attrs.in_shape, ), (attrs.in_dtype, ),
                kernel_name="reduce_sum",
                op_attrs=[attrs.axis, attrs.keepdims],
                attrs={"target": "cuda",
                       "enable_akg_reduce_lib": attrs.enable_akg_reduce_lib,
                       "enable_atomic_add": attrs.enable_atomic_add,
                       "custom_tiling": reduce_gpu_tiling_strategy(attrs.in_shape, attrs.axis)},
                tuning=True)

            from test_ms_reduce_sum import gen_data
            input_for_mod, output, expect = gen_data(
                attrs.in_shape, attrs.in_dtype, attrs.axis, attrs.keepdims)
            return [spaces, set_dim_key, expect, [input_for_mod, output]]
        mod = utils.op_build(
            reduce_sum, (attrs.in_shape, ), (attrs.in_dtype, ),
            kernel_name="reduce_sum",
            op_attrs=[attrs.axis, attrs.keepdims],
            attrs={"target": "cuda",
                   "enable_akg_reduce_lib": attrs.enable_akg_reduce_lib,
                   "dim": attrs.dim,
                   "bind_block": attrs.bind_block,
                   "bind_thread": attrs.bind_thread,
                   "enable_atomic_add": attrs.enable_atomic_add})
        return mod
    except BaseException as e:
        print("Compile ERROR message:", e)
        print(reduce_sum)
        print("Compile ERROR")
        # Chain the original exception so the root cause is not lost.
        raise Exception("Compile ERROR") from e


def gen_kernel_conv_image2col_gemm_gpu(op_desc, _, index_table=None, config: NamedTuple = None, idx=None, gen_tiling_spaces=False, need_tune_json=None):
    """Compile kernel module for convolution in gpu using image2col+gemm.

    Placeholder: implementation pending; always returns None.
    """
    return None


# Dispatch table: operator type -> specialized kernel compiler.
# Operator types not listed here fall back to gen_kernel_for_vector
# (see compile_kernel below).
_compile_kernel_func = {
    'conv': gen_kernel_conv,
    'conv_bn1': gen_kernel_conv_bn1,
    'conv_backprop_input': gen_kernel_conv_backprop_input,
    'conv_backprop_filter': gen_kernel_conv_backprop_filter,
    'matmul': gen_kernel_matmul_cube,
    'reduce_sum_gpu': gen_kernel_reduce_sum_gpu,
    'batch_matmul_gpu': gen_kernel_batch_matmul_gpu,
    'conv_image2col_gemm_gpu': gen_kernel_conv_image2col_gemm_gpu,
}


def compile_kernel(op_type: str, op_desc: NamedTuple, input_shape=None, index_table=None,
                   config_param: NamedTuple = None, idx: int = None, gen_tiling_spaces: bool = False, need_tune_json=None):
    """Generate kernel module for an operator.

    Parameters
    ----------
    op_type: str
        operator name; dispatched through _compile_kernel_func with
        gen_kernel_for_vector as the fallback
    op_desc: NamedTuple
        operator definition parameters
    input_shape:
        input shapes forwarded to the operator-specific compiler
    index_table:
        tiling index table used to translate config tiling values
    config_param: NamedTuple
        operator config parameters
    idx: int
        index of this kernel (used to build a unique kernel name)
    gen_tiling_spaces: bool
        parameter passed to utils.op_build; when True tiling spaces are
        returned instead of the module
    need_tune_json:
        extra tuning description, consumed only by the GPU compilers

    Returns
    -------
    [space_res, key, expect, input_for_mod] when gen_tiling_spaces is True,
    else the compiled kernel module
    """
    gen_func = _compile_kernel_func.get(op_type, gen_kernel_for_vector)
    if gen_tiling_spaces:
        space_res, key, expect, input_for_mod = gen_func(op_desc, input_shape, index_table, config_param,
                                                         idx, gen_tiling_spaces)
        return [space_res, key, expect, input_for_mod]
    # Only the GPU compilers accept the extra need_tune_json argument.
    if "gpu" in op_type:
        return gen_func(op_desc, input_shape, index_table,
                        config_param, idx, gen_tiling_spaces, need_tune_json=need_tune_json)
    return gen_func(op_desc, input_shape, index_table,
                    config_param, idx, gen_tiling_spaces)

+ 243
- 0
tests/fuzz/tune_for_gpu/autotuning/runner.py View File

@@ -0,0 +1,243 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Runner for compile and execute a configs of an operator on device"""
import time
import multiprocessing
import logging
import json
import os
import subprocess
import time
from typing import NamedTuple
import numpy as np
from akg import composite
from akg.utils import custom_tiling as ct_util
from akg.utils import kernel_exec as utils
from .kernel_compiler import compile_kernel
from .test_data_generators import gen_data
from .tuning_utils import *

logger = logging.getLogger('fuzz.tune.autotuning.runner')

# Sentinel "run time" values used to encode failures in the result list.
# They are chosen so large that any real measurement compares smaller,
# which lets np.minimum keep the best (real) time when one repeat succeeds.
error_time_list = [
    9999999999.0,
    9999999998.0,
    9999999997.0,
    9999999996.0,
]

# Human-readable reason for each failure sentinel, used when logging results.
error_time_string = {
    error_time_list[0]: 'run_failed',
    error_time_list[1]: 'precision_error',
    error_time_list[2]: 'compile_failed',
    error_time_list[3]: 'timeout'
}

# Named aliases for the sentinels above.
run_failed_time = error_time_list[0]
precision_error_time = error_time_list[1]
compile_fail_time = error_time_list[2]
timeout_time = error_time_list[3]


def get_attr_from_config(config, index_table):
    """Split a tuning config into tiling dims and plain attributes.

    Fields of ``config`` (a NamedTuple) whose names start with 'tiling' are
    combined with ``index_table`` into the 'dim' attribute via
    ct_util.set_dims; every other field is copied into the returned
    attribute dict unchanged. When no tiling fields exist, auto tiling
    applies and no 'dim' key is added.
    """
    tiling = []
    attrs = {}
    for key, value in config._asdict().items():
        if key.startswith('tiling'):
            tiling.append([value, 1])
        else:
            attrs[key] = value
    if tiling:
        tiling_param = [index_table[i] + element for i, element in enumerate(tiling)]
        attrs['dim'] = ct_util.set_dims(tuple(tiling_param))
    else:
        print("No tiling info. Use auto tiling.")
    return attrs


class KernelRunner:
    """kernel runner
    This runner will compile and execute configs of an operator, and return their running times.

    Parameters
    ----------
    op_type: str
        The name of operator
    op_desc: NamedTuple
        The definition parameters of operator
    timeout: int
        Timeout for running one config
    repeat_times: int
        Run one config repeat_times
    """

    def __init__(self, op_type: str, op_desc: NamedTuple,
                 index_table: list, self_attrs: list, timeout: int = 600,
                 repeat_times: int = 2, input_data=None,
                 expect=None, mod_output_param=None, is_all_space=True,
                 skip_config_set=None, need_tune_json=None):
        self.op_type = op_type
        self.op_desc = op_desc
        self._index_table = index_table
        self.self_attrs = self_attrs
        # Accumulated wall-clock time spent in run() across all batches.
        self.run_kernel_time = 0.0
        self.tune_self_attrs = True
        self.timeout = timeout
        self.repeat_times = repeat_times
        self.mod_output_param = mod_output_param
        self.is_all_space = is_all_space
        # Configs whose JSON serialization is in this set are skipped.
        self.skip_config_set = skip_config_set
        self.need_tune_json = need_tune_json
        if input_data is None:
            # No caller-supplied data: generate inputs and expected outputs.
            self.input, self.expect = gen_data(op_type, op_desc)
            if isinstance(self.input, dict):
                self.input, self.mod_output_param = self.input['args'], self.input['outputs']
        else:
            self.input, self.expect = input_data, expect
        self.input_shape = [x.shape for x in self.input]

    def info(self):
        """Print the accumulated kernel-running time."""
        print('run kernel time:', self.run_kernel_time)

    def run_one_kernel(self, run_times, idx, config, best_time=np.inf, is_auto=False):
        """Compile and execute a config of the operator on device"""
        # NOTE(review): best_time is currently unused in this method.

        # Skip configs already known (e.g. from a previous tuning session).
        if json.dumps(config.input._asdict()) in self.skip_config_set:
            print("CONFIG SKIP:", json.dumps(config.input._asdict()))
            run_times[idx] = -1
            return

        time_one_kernel_start = time.time()
        logger.debug('compile %dth kernel', idx)
        # Round-robin the available GPUs over the worker index.
        gpu_devices_list = get_available_gpu_num()
        device_id = gpu_devices_list[idx % len(gpu_devices_list)]
        logger.debug('run %dth kernel', idx)
        logger.debug('++++++++++++++++++++++=device_id')
        logger.debug(device_id)
        logger.debug('++++++++++++++++++++++=device_id')
        try:
            time_start_build = time.time()
            logger.debug(config)
            if self.op_type in ["json", "extra_tune"]:
                if is_auto:
                    # Auto mode: build with default (repo) tiling.
                    mod = composite.build(self.op_desc)
                    if self.op_type == "extra_tune":
                        del os.environ['MS_GRAPH_KERNEL_TILING']
                else:
                    attrs = get_attr_from_config(
                        config.input, self._index_table)
                    if os.environ['RUNTIME_MODE'] == "gpu":
                        attrs['target'] = "cuda"
                    mod = composite.build(self.op_desc, attrs, use_repo=False)
            else:
                mod = compile_kernel(self.op_type, self.op_desc, self.input_shape, self._index_table,
                                     None if is_auto else config.input, idx, need_tune_json=self.need_tune_json)
            time_end_build = time.time()
            logger.debug("build module time: %f",
                         time_end_build - time_start_build)
            logger.debug('finished compile %dth kernel', idx)
        except BaseException as e:
            logger.debug("Compile Failed: [%s] : %s", "origin" if is_auto else str(
                config.input), str(e))
            run_times[idx] = compile_fail_time
            return

        # Pessimistic default; replaced by the best measured time below.
        run_times[idx] = run_failed_time

        try:
            # NOTE: in gpu tuning, it is no need to use this repeat_times,
            # repeat_time has been setted in mod_launch in tuning mode.
            for _ in range(self.repeat_times):
                stat_info = {}
                try:
                    time_start_launch = time.time()
                    if self.mod_output_param is not None:
                        # NOTE(review): this branch leaves stat_info empty, so
                        # the np.minimum line below would raise KeyError —
                        # confirm this path is never taken in GPU tuning.
                        pass
                    else:
                        output, stat_info = utils.mod_launch(
                            mod, self.input, tuning=True, device_id=device_id, repeat_time=40)
                        if not np.allclose(output, self.expect, rtol=5e-03, atol=5e-03, equal_nan=True):
                            stat_info['run_time'] = precision_error_time
                            logger.debug("Precision Error: [%s]",
                                         "origin" if config is None else str(config.input))

                    time_end_launch = time.time()
                    logger.debug("mod launch time: %f",
                                 time_end_launch - time_start_launch)
                except BaseException as e:
                    logger.debug("Run Failed: [%s] : %s", str(
                        config.input), str(e))
                    stat_info['run_time'] = run_failed_time
                # Keep the best (smallest) time over all repeats.
                run_times[idx] = np.minimum(
                    run_times[idx], stat_info['run_time'])
        finally:
            logger.debug('end of %dth kernel', idx)
            time_one_kernel_end = time.time()
            logger.debug('run one kernel time: %f',
                         time_one_kernel_end - time_one_kernel_start)
        return

    def run(self, configs, best_time=np.inf, is_auto_set_dim=False, all_space=False):
        """Compile and execute a batch config of the operator on device"""
        start = time.time()
        logger.setLevel(logging.DEBUG)
        logger.debug("gen cce kernels batch: %d kernels", len(configs))
        subprocess.run("rm -rf ./jobs/JOB*", shell=True)

        # One process per config; results go into a Manager-backed shared list
        # pre-filled with the compile-failure sentinel.
        process_jobs = []
        run_times = multiprocessing.Manager().list(
            np.full((len(configs),), compile_fail_time))
        for idx, config in enumerate(configs):
            p = multiprocessing.Process(target=self.run_one_kernel,
                                        args=(run_times, idx, config, best_time, is_auto_set_dim))
            process_jobs.append(p)
            p.start()
        # After the first timeout, remaining workers are terminated without
        # waiting (each is given at most self.timeout in total).
        timeout_error = False
        for idx, p in enumerate(process_jobs):
            if not timeout_error:
                p.join(timeout=self.timeout)
            if p.is_alive():
                timeout_error = True
                logger.debug("Timeout Error: [%s]", str(configs[idx].input))
                run_times[idx] = timeout_time
                p.terminate()

        process_end = time.time()
        logger.debug("process time: %f", process_end - start)
        # clean the profiling directory
        # NOTE(review): tune_device / tune_num are read but unused here.
        tune_device = int(os.environ['DEVICE_ID'])
        tune_num = int(os.environ['DEVICE_TOTAL_NUM'])
        if os.environ['RUNTIME_MODE'] == "gpu":
            subprocess.run("rm -rf cuda_meta_*", shell=True)
        else:
            pass
        end = time.time()
        logger.debug("run kernels time: %f", end - start)
        self.run_kernel_time += end - start

        # Log each result, translating failure sentinels to readable strings.
        for idx, config in enumerate(configs):
            if run_times[idx] not in error_time_list:
                logger.debug("KernelRunTime : [%s] : %s", str(
                    configs[idx].input), str(run_times[idx]))
            else:
                logger.debug("KernelRunTime : [%s] : %s",
                             str(configs[idx].input), str(error_time_string[run_times[idx]]))

        return run_times

+ 217
- 0
tests/fuzz/tune_for_gpu/autotuning/space.py View File

@@ -0,0 +1,217 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Config space"""
from abc import ABCMeta, abstractmethod
from typing import NamedTuple, List
import random
import numpy as np


class ConfigEntity:
    """A single immutable point in a config search space.

    Wraps a NamedTuple of dimension values together with its index
    within the owning space.
    """

    def __init__(self, input_id: int, input_space: NamedTuple):
        self.__input_id = input_id
        self.__input = input_space
        self.__input_type = type(input_space)

    def __len__(self):
        return len(self.__input)

    def __str__(self):
        return '{}: {}'.format(self.__input_id, self.__input)

    def __repr__(self):
        return self.__str__()

    @property
    def input_id(self):
        """Index of this config within its space."""
        return self.__input_id

    @property
    def input_type(self):
        """The NamedTuple subclass describing the dimensions."""
        return self.__input_type

    @property
    def input(self):
        """The underlying NamedTuple of dimension values."""
        return self.__input

    @property
    def feature(self):
        """Feature vector used by tuners; identical to ``input``."""
        return self.__input


class ConfigSpace(metaclass=ABCMeta):
    """Abstract base class for a searching space of configs."""

    def __init__(self, input_type):
        self._input_type = input_type
        # Dimension names come from the NamedTuple's field names.
        self._dim_names = getattr(self._input_type, '_fields')
        # All ConfigEntity instances added so far.
        self._configs = []

    @abstractmethod
    def reset_fetch(self):
        """reset the fetch state"""

    @abstractmethod
    def has_next(self) -> bool:
        """whether any config remains fetchable"""

    @abstractmethod
    def fetch_index(self) -> int:
        """fetch a random index of config"""

    @abstractmethod
    def fetch_config(self) -> ConfigEntity:
        """fetch a random config"""

    @abstractmethod
    def random_walk(self, p: int) -> int:
        """find a neighbor hood of the p-th ConfigEntity, which only
        differs with p in at most one dimension"""

    def get(self, idx: int) -> ConfigEntity:
        """get the `idx`-th config of the space"""
        return self._configs[idx]

    @property
    def configs(self):
        return self._configs

    @property
    def dim_names(self):
        return self._dim_names

    @property
    def input_type(self):
        return self._input_type

    @property
    # @abstractmethod
    def length(self):
        return len(self.configs)


class ConfigTrie:
    """Trie over config dimension values.

    Each trie instance skips one dimension; leaves hold the ids of all
    configs that agree on every non-skipped dimension, enabling O(depth)
    lookup of one-dimension neighbors.
    """

    def __init__(self):
        self.ch = dict()

    def add(self, config: ConfigEntity, last_dim: int):
        """Insert ``config``, keying on every dimension except ``last_dim``."""
        node = self
        for dim, value in enumerate(config.input):
            if dim == last_dim:
                continue
            if value not in node.ch:
                node.ch[value] = ConfigTrie()
            if not isinstance(node.ch, dict):
                raise TypeError('none-leaf node should have a dict of childs')
            node = node.ch[value]

        # Leaf: a bucket (list) of config ids sharing all keyed dimensions.
        if not isinstance(node.ch, list):
            node.ch = []
        node.ch.append(config.input_id)

    def fetch_random(self, config: ConfigEntity, last_dim: int) -> int:
        """randomly fetch the index of a ConfigEntity the same with `config` except for the `last_dim`-th dimension"""
        node = self
        for dim, value in enumerate(config.input):
            if dim == last_dim:
                continue
            if not isinstance(node.ch, dict):
                raise TypeError('none leaf node should have a dict of childs')
            if value not in node.ch:
                raise RuntimeError('no element found')
            node = node.ch[value]
        if not node.ch:
            raise RuntimeError('no element found')
        if len(node.ch) == 1:
            return node.ch[0]
        # Re-draw until we pick a neighbor other than config itself; ids in a
        # bucket are unique, so with len > 1 this terminates.
        picked = config.input_id
        while picked == config.input_id:
            picked = random.choice(node.ch)
        return picked


class ListConfigSpace(ConfigSpace):
    """Config space backed by an in-memory list of all candidate configs."""

    def __init__(self, input_type):
        super(ListConfigSpace, self).__init__(input_type)
        # One trie per dimension (each skipping that dimension) for random_walk.
        self.__config_tries = [ConfigTrie() for _ in range(len(self._dim_names))]
        # Indices of configs not yet fetched.
        self.__fetch_pool = []

    def reset_fetch(self):
        """reset fetch state"""
        self.__fetch_pool = [i for i in range(len(self._configs))]

    def fetch_scope(self, start, end):
        """Restrict fetching to config indices in [start, end)."""
        self.__fetch_pool = [i for i in range(start, end)]

    def has_next(self) -> bool:
        return len(self.__fetch_pool) > 0

    def fetch_index(self) -> int:
        """fetch a random index of config"""
        pos = np.random.randint(len(self.__fetch_pool))
        picked = self.__fetch_pool[pos]
        # Swap-remove: overwrite with the last entry, then drop the tail (O(1)).
        self.__fetch_pool[pos] = self.__fetch_pool[-1]
        self.__fetch_pool.pop()
        return picked

    def fetch_next_index(self) -> int:
        """fetch next index of config"""
        # Assumes the pool is a contiguous range: returns its last index.
        picked = len(self.__fetch_pool) - 1 + self.__fetch_pool[0]
        self.__fetch_pool.pop()
        return picked

    def fetch_config(self) -> ConfigEntity:
        """fetch a random config"""
        return self.get(self.fetch_index())

    def add(self, input_space: NamedTuple):
        """add a new config to space"""
        if not isinstance(input_space, self._input_type):
            raise TypeError('invalid config input space type, got {} expected {}'.format(type(input_space),
                                                                                         self._input_type))
        config = ConfigEntity(len(self._configs), input_space)
        self.__fetch_pool.append(len(self._configs))
        # Index the new config in every per-dimension trie.
        for dim in range(len(self._dim_names)):
            self.__config_tries[dim].add(config, dim)
        self._configs.append(config)

    def random_walk(self, p: int) -> int:
        """find a neighbor hood of the p-th ConfigEntity, which only differs with p in at most one dimension"""
        dim = np.random.randint(len(self._dim_names))
        return self.__config_tries[dim].fetch_random(self._configs[p], dim)

    @property
    def length(self):
        return len(self._configs)

    @classmethod
    def from_list(cls, configs: List[NamedTuple]):
        """Build a space from a non-empty list of NamedTuple configs."""
        if not isinstance(configs, list):
            raise TypeError('configs must be of list type, got %s' % type(configs))
        if not configs:
            raise ValueError('configs must be non-empty')
        space = cls(type(configs[0]))
        for config in configs:
            space.add(config)
        return space

+ 753
- 0
tests/fuzz/tune_for_gpu/autotuning/space_generators.py View File

@@ -0,0 +1,753 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""space generating functions for operators"""
from functools import partial
from typing import NamedTuple
from collections import namedtuple
from test_run import matmul_run
from akg.utils import validation_check as vc_util
from .type_definitions import ConvDesc, ConvBackpropDesc, MatmulCubeDesc, ConvConfig, ConvBackpropInputConfig, ConvBackpropFilterConfig, MatmulCubeConfig
from .space import ListConfigSpace
from .kernel_compiler import compile_kernel
from .gen_spaces_gpu import _get_space_reduce_gpu_manually
from tqdm import tqdm
from enum import Enum

# Maps a tiling-axis position to a GPU dimension suffix ("x"/"y"/"z").
GPU_IDX_TO_STR = {0: "x", 1: "y", 2: "z"}

class GpuSpacePolicy(Enum):
    """Policy to expand tile candidates with block and thread.

    Each member's value mirrors its name.
    NOTE(review): the precise semantics of each policy are defined by the
    space generators that consume this enum — confirm there.
    """
    FULL = "FULL"
    BMM = "BMM"
    REDUCE_ALL = "REDUCE_ALL"
    REDUCE_X = "REDUCE_X"
    REDUCE_Y = "REDUCE_Y"


def gen_bool_list(attr_list):
    """Return every True/False combination, one flag per element of attr_list.

    The enumeration order matches itertools.product([True, False], ...),
    which is the same order the previous hand-rolled expansion produced.
    Returns [] for an empty attr_list (kept for backward compatibility —
    product(repeat=0) would yield a single empty combination instead).
    """
    from itertools import product
    if not attr_list:
        return []
    return [list(combo) for combo in product([True, False], repeat=len(attr_list))]


def _get_space_vector(op_type: str, op_desc):
    """get config space of vector operator"""
    space_res, key, expect, input_for_mod = compile_kernel(op_type, op_desc, None, None, None, 0,
                                                           gen_tiling_spaces=True)

    # Validate the returned space structure before using it.
    if space_res is None:
        raise RuntimeError('no space returned')
    if 'index' not in space_res or 'tuning_space' not in space_res:
        raise RuntimeError('invalid space returned')
    index_table = space_res['index']
    tiling_spaces = space_res['tuning_space']
    if not tiling_spaces:
        raise RuntimeError('empty tiling spaces')

    # One tiling_<i> dimension per entry of a tiling-space candidate.
    dim_names = ['tiling_' + str(i) for i in range(len(tiling_spaces[0]))]
    input_type = namedtuple(op_type, dim_names)
    space = ListConfigSpace(input_type)
    for candidate in tiling_spaces:
        space.add(input_type(*candidate))
    return index_table, space, key, expect, input_for_mod


def _get_space_conv(op_desc: ConvDesc):
    """get config space of convolution

    Enumerates (tile_h, tile_co, tile_m, tile_k, tile_n, tile_w, bypass)
    candidates that fit the L1/L0A/L0B/L0C memory budgets and adds each
    to a ListConfigSpace of ConvConfig.
    """
    if not isinstance(op_desc, ConvDesc):
        raise TypeError('op_desc must be ConvDesc')

    stride_ = op_desc.stride
    pad_ = op_desc.pad
    dilation_ = op_desc.dilation
    vc_util.convolution_format_check(
        op_desc.fmap_shape, op_desc.filter_shape, pad_, stride_, dilation_)
    config_space = ListConfigSpace(ConvConfig)

    # if double buffering is not enabled, keep this scale at 1
    size_scale = 1

    # On-chip buffer budgets in bytes (L0C halved for accumulation).
    l1_max_size = (1024 * 1024) // size_scale
    l0a_max_size = (64 * 1024) // size_scale
    l0b_max_size = (64 * 1024) // size_scale
    l0c_max_size = ((256 - 8) * 1024) // size_scale // 2

    _, in_c, in_h, in_w = op_desc.fmap_shape
    k_n, _, k_h, k_w = op_desc.filter_shape
    padding = (pad_[0], pad_[1], pad_[2], pad_[3])
    p_top, p_bottom, p_left, p_right = padding
    s_h, s_w = stride_

    # Round channels up to a multiple of the 16-wide cube block.
    in_c = ((in_c - 1) // 16 + 1) * 16
    tile_c = in_c
    tile_co_start = 16

    # bytes per element (float16)
    data_len = 2

    # Maximum tile extents in H/W and the corresponding output window sizes.
    h_max = in_h + p_top + p_bottom
    win_h = (h_max - k_h) // s_h + 1
    h_max = (h_max - k_h) // s_h * s_h + k_h
    w_max = in_w + p_left + p_right
    win_w = (w_max - k_w) // s_w + 1
    w_max = (w_max - k_w) // s_w * s_w + k_w

    bypass_options = [0, 1]

    for bypass in bypass_options:
        for tile_h in range(h_max, k_h - 1, -s_h):
            size_h = tile_h
            if tile_h == h_max:
                # Full-height tile: also sweep W; L1 holds only in_h rows.
                w_range = range(w_max, k_w - 1, -s_w)
                size_h = in_h
            else:
                w_range = [w_max]
                win_tile_h = (tile_h - k_h) // s_h + 1
                h_tiles = (win_h + win_tile_h - 1) // win_tile_h
                if h_tiles == 2:
                    # Two tiles along H: L1 holds the larger of head/tail tile.
                    size_h = max(tile_h - p_top, in_h +
                                 p_top - tile_h + k_h - s_h)

            for tile_w in w_range:
                size_w = tile_w
                if size_w == w_max:
                    size_w = in_w
                else:
                    win_tile_w = (tile_w - k_w) // s_w + 1
                    w_tiles = (win_w + win_tile_w - 1) // win_tile_w
                    if w_tiles == 2:
                        size_w = max(tile_w - p_left, in_w +
                                     p_left - tile_w + k_w - s_w)

                k_n_ = ((k_n - 1) // 16 + 1) * 16
                co_range = range(k_n_, tile_co_start - 1, -16)
                for tile_co in co_range:
                    if bypass == 1:
                        # Bypass mode keeps the full filter out of L1.
                        if tile_co != k_n:
                            continue
                        l1_size = data_len * (size_h * size_w * in_c)
                    else:
                        l1_size = data_len * (size_h * size_w * in_c +
                                              tile_co * tile_c * k_h * k_w)

                    if l1_size > l1_max_size:
                        continue

                    tile_co_ = ((tile_co - 1) // 16 + 1) * 16
                    for tile_n in range(tile_co_, 15, -16):
                        # K bounded by reduction length and the L0B budget.
                        k_max = in_c * k_h * k_w
                        k_max_ = ((k_max - 1) // 16 + 1) * 16
                        k_size = l0b_max_size // data_len // tile_n
                        k_size_ = k_size // 16 * 16
                        for tile_k in range(min(k_max_, k_size_), 15, -16):
                            # M bounded by output window size, L0A and L0C.
                            m_max = (int(((tile_h - k_h) // (s_h)) + 1)) * \
                                (int(((tile_w - k_w) // (s_w)) + 1))
                            m_max_ = ((m_max - 1) // 16 + 1) * 16
                            m_size1 = l0a_max_size // data_len // tile_k
                            m_size1_ = m_size1 // 16 * 16
                            m_size2 = l0c_max_size // data_len // tile_n
                            m_size2_ = m_size2 // 16 * 16
                            for tile_m in range(min(m_max_, m_size1_, m_size2_), 15, -16):
                                config_space.add(ConvConfig(tile_h, tile_co, tile_m, tile_k,
                                                            tile_n, tile_w, bypass))

    return None, config_space, op_desc.__str__(), None, None


def _get_space_conv_bn1(op_desc: ConvDesc):
    """get config space of convolution (conv + bn1 fused variant)

    Same enumeration as _get_space_conv, but with a quarter of the L0C
    budget to leave room for the fused bn1 computation.
    """
    if not isinstance(op_desc, ConvDesc):
        raise TypeError('op_desc must be ConvDesc')

    stride_ = op_desc.stride
    pad_ = op_desc.pad
    dilation_ = op_desc.dilation
    vc_util.convolution_format_check(
        op_desc.fmap_shape, op_desc.filter_shape, pad_, stride_, dilation_)
    config_space = ListConfigSpace(ConvConfig)

    # if double buffering is not enabled, keep this scale at 1
    size_scale = 1

    l1_max_size = (1024 * 1024) // size_scale
    l0a_max_size = (64 * 1024) // size_scale
    l0b_max_size = (64 * 1024) // size_scale
    # Extra // 4 compared to plain conv: budget shared with the bn1 output.
    l0c_max_size = ((256 - 8) * 1024) // size_scale // 2 // 4

    _, in_c, in_h, in_w = op_desc.fmap_shape
    k_n, _, k_h, k_w = op_desc.filter_shape
    padding = (pad_[0], pad_[1], pad_[2], pad_[3])
    p_top, p_bottom, p_left, p_right = padding
    s_h, s_w = stride_

    in_c = ((in_c - 1) // 16 + 1) * 16
    tile_c = in_c
    tile_co_start = 16

    # bytes per element (float16)
    data_len = 2

    h_max = in_h + p_top + p_bottom
    win_h = (h_max - k_h) // s_h + 1
    h_max = (h_max - k_h) // s_h * s_h + k_h
    w_max = in_w + p_left + p_right
    win_w = (w_max - k_w) // s_w + 1
    w_max = (w_max - k_w) // s_w * s_w + k_w

    bypass_options = [0, 1]

    for bypass in bypass_options:
        h_range = range(h_max, k_h - 1, -s_h)
        for tile_h in h_range:
            size_h = tile_h
            if tile_h == h_max:
                w_range = range(w_max, k_w - 1, -s_w)
                size_h = in_h
            else:
                w_range = [w_max]
                win_tile_h = (tile_h - k_h) // s_h + 1
                h_tiles = (win_h + win_tile_h - 1) // win_tile_h
                if h_tiles == 2:
                    size_h = max(tile_h - p_top, in_h +
                                 p_top - tile_h + k_h - s_h)

            for tile_w in w_range:
                size_w = tile_w
                if size_w == w_max:
                    size_w = in_w
                else:
                    win_tile_w = (tile_w - k_w) // s_w + 1
                    w_tiles = (win_w + win_tile_w - 1) // win_tile_w
                    if w_tiles == 2:
                        size_w = max(tile_w - p_left, in_w +
                                     p_left - tile_w + k_w - s_w)

                k_n_ = ((k_n - 1) // 16 + 1) * 16
                co_range = range(k_n_, tile_co_start - 1, -16)
                for tile_co in co_range:
                    if bypass == 1:
                        # Bypass mode keeps the full filter out of L1.
                        if tile_co != k_n:
                            continue
                        l1_size = data_len * (size_h * size_w * in_c)
                    else:
                        l1_size = data_len * (size_h * size_w * in_c +
                                              tile_co * tile_c * k_h * k_w)

                    if l1_size > l1_max_size:
                        continue

                    tile_co_ = ((tile_co - 1) // 16 + 1) * 16
                    for tile_n in range(tile_co_, 15, -16):
                        k_max = in_c * k_h * k_w
                        k_max_ = ((k_max - 1) // 16 + 1) * 16
                        k_size = l0b_max_size // data_len // tile_n
                        k_size_ = k_size // 16 * 16
                        for tile_k in range(min(k_max_, k_size_), 15, -16):
                            m_max = (int(((tile_h - k_h) // (s_h)) + 1)) * \
                                (int(((tile_w - k_w) // (s_w)) + 1))
                            m_max_ = ((m_max - 1) // 16 + 1) * 16
                            m_size1 = l0a_max_size // data_len // tile_k
                            m_size1_ = m_size1 // 16 * 16
                            m_size2 = l0c_max_size // data_len // tile_n
                            m_size2_ = m_size2 // 16 * 16
                            for tile_m in range(min(m_max_, m_size1_, m_size2_), 15, -16):
                                config_space.add(ConvConfig(tile_h, tile_co, tile_m, tile_k,
                                                            tile_n, tile_w, bypass))

    return None, config_space, op_desc.__str__(), None, None


def _get_space_conv_backprop_input(op_desc: ConvBackpropDesc):
    """get config space of convolution backprop input

    Treats the backprop as a stride-1 "forward" convolution over the
    dilated output gradient and enumerates tilings that fit the on-chip
    memory budgets into a ListConfigSpace of ConvBackpropInputConfig.
    """
    if not isinstance(op_desc, ConvBackpropDesc):
        raise TypeError('op_desc must be ConvDesc')

    stride_ = op_desc.stride
    pad_ = op_desc.pad
    dilation_ = op_desc.dilation
    vc_util.convolution_format_check(
        op_desc.fmap_shape, op_desc.filter_shape, pad_, stride_, dilation_)
    config_space = ListConfigSpace(ConvBackpropInputConfig)

    # if double buffering is not enabled, keep this scale at 1
    size_scale = 1
    block_size = 16

    l1_max_size = (1024 * 1024) // size_scale
    l0a_max_size = (64 * 1024) // size_scale
    l0b_max_size = (64 * 1024) // size_scale
    l0c_max_size = ((256 - 8) * 1024) // size_scale // 2
    ub_max_size = l0c_max_size

    _, in_c, in_h, in_w = op_desc.fmap_shape
    k_n, _, k_h, k_w = op_desc.filter_shape

    # Round channels up to a multiple of the cube block size.
    in_c = (in_c + block_size - 1) // block_size * block_size
    k_n = (k_n + block_size - 1) // block_size * block_size

    pad_top, pad_bottom, pad_left, pad_right = pad_
    stride_h, stride_w = stride_

    out_c = k_n
    out_h = (in_h + pad_top + pad_bottom - k_h) // stride_h + 1
    out_w = (in_w + pad_left + pad_right - k_w) // stride_w + 1

    # Dilate the output gradient by the forward stride.
    out_h = out_h * stride_h
    out_w = out_w * stride_w

    # Equivalent padding of the transposed (backprop) convolution.
    p_top = k_h - pad_[0] - 1
    p_bottom = in_h + pad_[0] - stride_[0] * \
        ((in_h + pad_[0] + pad_[1] - k_h) // stride_[0] + 1)
    p_left = k_w - pad_[2] - 1
    p_right = in_w + pad_[2] - stride_[1] * \
        ((in_w + pad_[2] + pad_[3] - k_w) // stride_[1] + 1)

    # The transposed convolution always has stride 1.
    s_h = 1
    s_w = 1

    tile_c = out_c
    tile_co_start = 16

    # bytes per element (float16)
    data_len = 2

    h_max = out_h + p_top + p_bottom
    win_h = (h_max - k_h) // s_h + 1
    h_max = (h_max - k_h) // s_h * s_h + k_h
    w_max = out_w + p_left + p_right
    win_w = (w_max - k_w) // s_w + 1
    w_max = (w_max - k_w) // s_w * s_w + k_w

    for tile_h in range(h_max, k_h - 1, -s_h):
        size_h = tile_h
        if tile_h == h_max:
            # Full-height tile: also sweep W.
            w_range = range(w_max, k_w - 1, -s_w)
            size_h = in_h
        else:
            w_range = [w_max]
            win_tile_h = (tile_h - k_h) // s_h + 1
            h_tiles = (win_h + win_tile_h - 1) // win_tile_h
            if h_tiles == 2:
                size_h = max(tile_h - p_top, in_h + p_top - tile_h + k_h - s_h)

        for tile_w in w_range:
            size_w = tile_w
            if size_w == w_max:
                size_w = in_w
            else:
                win_tile_w = (tile_w - k_w) // s_w + 1
                w_tiles = (win_w + win_tile_w - 1) // win_tile_w
                if w_tiles == 2:
                    size_w = max(tile_w - p_left, in_w +
                                 p_left - tile_w + k_w - s_w)

            k_n_ = ((k_n - 1) // 16 + 1) * 16
            co_range = range(k_n_, tile_co_start - 1, -16)
            for tile_co in co_range:
                # Tile must fit both L1 and UB budgets.
                l1_size = data_len * (size_h * size_w * out_c +
                                      tile_co * tile_c * k_h * k_w)
                if l1_size > l1_max_size:
                    continue
                ub_size = data_len * (size_h * size_w * out_c)
                if ub_size > ub_max_size:
                    continue

                tile_co_ = ((tile_co - 1) // 16 + 1) * 16
                for tile_n in range(tile_co_, 15, -16):
                    # K stepped in multiples of 16*k_h*k_w (one channel group).
                    k_max = out_c * k_h * k_w
                    k_base = 16 * k_h * k_w
                    k_max_ = ((k_max - 1) // k_base + 1) * k_base
                    k_size = l0b_max_size // data_len // tile_n
                    k_size_ = k_size // k_base * k_base
                    for tile_k in range(min(k_max_, k_size_), k_base - 1, -k_base):
                        m_max = (int(((tile_h - k_h) // (s_h)) + 1)) * \
                            (int(((tile_w - k_w) // (s_w)) + 1))
                        m_max_ = ((m_max - 1) // 16 + 1) * 16
                        m_size1 = l0a_max_size // data_len // tile_k
                        m_size1_ = m_size1 // 16 * 16
                        m_size2 = l0c_max_size // data_len // tile_n
                        m_size2_ = m_size2 // 16 * 16
                        for tile_m in range(min(m_max_, m_size1_, m_size2_), 15, -16):
                            config_space.add(ConvBackpropInputConfig(tile_h, tile_co, tile_m,
                                                                     tile_k, tile_n, tile_w))
    return None, config_space, op_desc.__str__(), None, None


def _get_space_conv_backprop_filter(op_desc: ConvBackpropDesc):
    """get config space of convolution backprop filter

    Enumerates ConvBackpropFilterConfig candidates (ci/kh/kw/co/batch/h/w
    plus the m/k/n cube tiles) that fit the on-chip memory budgets.
    """
    if not isinstance(op_desc, ConvBackpropDesc):
        raise TypeError('op_desc must be ConvBackpropDesc')

    stride_ = op_desc.stride
    pad_ = op_desc.pad
    dilation_ = op_desc.dilation
    vc_util.convolution_format_check(
        op_desc.fmap_shape, op_desc.filter_shape, pad_, stride_, dilation_)
    config_space = ListConfigSpace(ConvBackpropFilterConfig)

    # if double buffering is not enabled, keep this scale at 1
    size_scale = 1
    block_size = 16

    l1_max_size = (1024 * 1024) // size_scale
    l0a_max_size = (64 * 1024) // size_scale
    l0b_max_size = (64 * 1024) // size_scale
    l0c_max_size = ((256 - 8) * 1024) // size_scale // 2

    in_n, in_c, in_h, in_w = op_desc.fmap_shape
    cout, _, k_h, k_w = op_desc.filter_shape
    k_n = cout

    # Round channels up to a multiple of the cube block size.
    in_c = (in_c + block_size - 1) // block_size * block_size
    cout = (cout + block_size - 1) // block_size * block_size

    pad_top, pad_bottom, pad_left, pad_right = pad_
    s_h, s_w = stride_
    tile_co_start = 16
    tile_ci_start = 16
    # bytes per element (float16)
    data_len = 2
    h_max = in_h + pad_top + pad_bottom
    win_h = (h_max - k_h) // s_h + 1
    h_max = (h_max - k_h) // s_h * s_h + k_h
    w_max = in_w + pad_left + pad_right
    win_w = (w_max - k_w) // s_w + 1
    w_max = (w_max - k_w) // s_w * s_w + k_w

    for tile_h in range(h_max, k_h - 1, -s_h):
        size_h = tile_h
        win_tile_h = (tile_h - k_h) // s_h + 1
        # Only one head for cut H axis
        if win_tile_h * s_h < pad_top:
            continue
        # Only one tail for cut H axis
        if (((win_h + win_tile_h - 1) // win_tile_h - 1) * win_tile_h - 1) * s_h + k_h > in_h + pad_top:
            continue
        if tile_h == h_max:
            w_range = range(w_max, k_w - 1, -s_w)
            size_h = in_h
        else:
            w_range = [w_max]
            h_tiles = (win_h + win_tile_h - 1) // win_tile_h
            if h_tiles == 2:
                size_h = max(tile_h - pad_top, in_h +
                             pad_top - tile_h + k_h - s_h)

        for tile_w in w_range:
            size_w = tile_w
            win_tile_w = (tile_w - k_w) // s_w + 1
            # Only one head for cut W axis
            if win_tile_w * s_w < pad_left:
                continue
            # Only one tail for cut W axis
            if (((win_w + win_tile_w - 1) // win_tile_w - 1) * win_tile_w - 1) * s_w + k_w > in_w + pad_left:
                continue
            if size_w == w_max:
                size_w = in_w
            else:
                w_tiles = (win_w + win_tile_w - 1) // win_tile_w
                if w_tiles == 2:
                    size_w = max(tile_w - pad_left, in_w +
                                 pad_left - tile_w + k_w - s_w)
            for tile_kh in range(k_h, 0, -1):
                for tile_kw in range(k_w, 0, -1):
                    k_n_ = ((k_n - 1) // 16 + 1) * 16
                    co_range = range(k_n_, tile_co_start - 1, -16)
                    for tile_co in co_range:
                        in_c_ = ((in_c - 1) // 16 + 1) * 16
                        ci_range = range(in_c_, tile_ci_start - 1, -16)
                        for tile_ci in ci_range:
                            tile_batch = 1
                            l1_size = data_len * tile_batch * (tile_co * win_tile_h * win_tile_w +
                                                               tile_ci * size_h * size_w)
                            if l1_size > l1_max_size:
                                continue

                            # Partial tiling along batch/co/ci forces the cube
                            # m/n tiles to match the co/ci tile exactly.
                            if (tile_batch != in_n or tile_co != k_n_ or tile_ci != in_c_):
                                tile_m = tile_co
                                tile_n = tile_ci * tile_kh * tile_kw
                                l0c_size = data_len * tile_n * tile_m
                                if l0c_size > l0c_max_size:
                                    continue
                                k_max = tile_batch * tile_h * tile_w
                                k_max_ = ((k_max - 1) // 16 + 1) * 16
                                k_size1 = l0a_max_size // data_len // tile_m
                                k_size1_ = k_size1 // 16 * 16
                                k_size2 = l0b_max_size // data_len // tile_n
                                k_size2_ = k_size2 // 16 * 16
                                for tile_k in range(min(k_max_, k_size1_, k_size2_), 15, -16):
                                    config_space.add(ConvBackpropFilterConfig(tile_ci, tile_kh, tile_kw, tile_co,
                                                                              tile_batch, tile_h, tile_w, tile_m,
                                                                              tile_k, tile_n))
                            else:
                                # Full batch/co/ci: m, k, n can be swept freely
                                # within the L0A/L0B/L0C budgets.
                                for tile_n in range(tile_ci * tile_kh * tile_kw, 15, -16):
                                    k_max = tile_batch * tile_h * tile_w
                                    k_max_ = ((k_max - 1) // 16 + 1) * 16
                                    k_size = l0b_max_size // data_len // tile_n
                                    k_size_ = k_size // 16 * 16
                                    for tile_k in range(min(k_max_, k_size_), 15, -16):
                                        m_max = tile_co
                                        m_max_ = ((m_max - 1) // 16 + 1) * 16
                                        m_size1 = l0a_max_size // data_len // tile_k
                                        m_size1_ = m_size1 // 16 * 16
                                        m_size2 = l0c_max_size // data_len // tile_n
                                        m_size2_ = m_size2 // 16 * 16
                                        for tile_m in range(min(m_max_, m_size1_, m_size2_), 15, -16):
                                            config_space.add(ConvBackpropFilterConfig(tile_ci, tile_kh, tile_kw,
                                                                                      tile_co, tile_batch, tile_h,
                                                                                      tile_w, tile_m, tile_k, tile_n))
    return None, config_space, op_desc.__str__(), None, None


def _get_space_matmul_cube(op_desc: MatmulCubeDesc):
    """Get the config space of matmul_cube.

    Enumerates (n_l1, n_l0, m_l1, m_l0, k_l1, k_l0, bypass) tilings that fit
    the on-chip buffers and adds each to a ListConfigSpace.

    Returns a 5-tuple (None, config_space, key_string, None, None) matching
    the other _get_space_* generators.
    """
    if not isinstance(op_desc, MatmulCubeDesc):
        raise TypeError('op_desc must be MatmulCubeDesc')
    config_space = ListConfigSpace(MatmulCubeConfig)
    batch_tuple, m, k, n = matmul_run.extract_dim(
        op_desc.x_shape, op_desc.y_shape, op_desc.adj_x, op_desc.adj_y)

    # dimensions in units of 16x16 cube tiles (rounded up)
    mmax = (m + 15) // 16
    nmax = (n + 15) // 16
    kmax = (k + 15) // 16

    double_buffer = True
    mad_fp32 = True

    l1_max_size = (1024 * 1024)  # L1 MEM 1024KB
    l0a_max_size = (64 * 1024)  # L0A MEM 64KB
    l0b_max_size = (64 * 1024)  # L0B MEM 64KB
    l0c_max_size = (256 * 1024)  # L0C MEM 256KB
    # UB MEM 248KB, 8KB reserved for compiler
    ub_max_size = ((256 - 8) * 1024)

    # double buffering halves every usable buffer
    if double_buffer:
        l1_max_size = l1_max_size // 2
        l0a_max_size = l0a_max_size // 2
        l0b_max_size = l0b_max_size // 2
        l0c_max_size = l0c_max_size // 2
        ub_max_size = ub_max_size // 2

    # fp32 accumulation doubles the element size in L0C (and UB for fp32 out)
    if mad_fp32:
        l0c_max_size = l0c_max_size // 2
        if op_desc.out_dtype == 'float32':
            ub_max_size = ub_max_size // 2

    bypass_options = [0, 1, 2]

    for bypass in bypass_options:
        # bypass 2 (skip L1 for the left operand) is only legal for certain
        # transpose/format combinations; same for bypass 1 on the right side
        if (bypass == 2) and ((op_desc.adj_x == False and op_desc.left_format[0].lower() == 'n') or
                              (op_desc.adj_x == True and op_desc.left_format[0].lower() == 'z')):
            continue

        if (bypass == 1) and ((op_desc.adj_y == False and op_desc.right_format[0].lower() == 'z') or
                              (op_desc.adj_y == True and op_desc.right_format[0].lower() == 'n')):
            continue

        # only divisors of each dimension are considered, L1 tile first,
        # then the L0 tile dividing the L1 tile
        for k_l1 in range(1, kmax + 1):
            if kmax % k_l1 != 0:
                continue
            for k_l0 in range(1, k_l1 + 1):
                if k_l1 % k_l0 != 0:
                    continue

                # no need to cut from l1 to l0 for m and n when k is cut
                for m_l1 in range(1, mmax + 1):
                    if mmax % m_l1 != 0:
                        continue
                    m_l0_range = [m_l1] if k_l1 != kmax else range(1, m_l1 + 1)
                    for m_l0 in m_l0_range:
                        if m_l1 % m_l0 != 0:
                            continue
                        for n_l1 in range(1, nmax + 1):
                            if nmax % n_l1 != 0:
                                continue
                            n_l0_range = [n_l1] if k_l1 != kmax else range(
                                1, n_l1 + 1)
                            for n_l0 in n_l0_range:
                                if n_l1 % n_l0 != 0:
                                    continue

                                # reject tilings that overflow L0A/L0B/L0C/UB
                                if m_l0 * 16 * k_l0 * 16 > l0a_max_size:
                                    continue

                                if n_l0 * 16 * k_l0 * 16 > l0b_max_size:
                                    continue

                                if m_l0 * 16 * n_l0 * 16 > l0c_max_size:
                                    continue

                                if m_l0 * 16 * n_l0 * 16 > ub_max_size:
                                    continue

                                # L1 only holds the operand(s) not bypassed
                                if bypass == 2:
                                    l1_size = n_l1 * 16 * k_l1 * 16
                                elif bypass == 1:
                                    l1_size = m_l1 * 16 * k_l1 * 16
                                else:
                                    l1_size = (m_l1 * 16 + n_l1 *
                                               16) * k_l1 * 16
                                if l1_size > l1_max_size:
                                    continue

                                # sentinel encodings for degenerate dims;
                                # NOTE(review): these overwrite the loop
                                # variables, but each for-loop reassigns them
                                # on the next iteration, so only the emitted
                                # config is affected
                                if nmax == 1:
                                    n_l1 = 0
                                    n_l0 = 0
                                if mmax == 1:
                                    m_l1 = 0
                                    m_l0 = 0
                                if kmax == 1:
                                    k_l1 = 16
                                    k_l0 = 16
                                config_space.add(MatmulCubeConfig(
                                    n_l1, n_l0, m_l1, m_l0, k_l1, k_l0, bypass))
    shape_xx, shape_yy, _, _, k = matmul_run.get_converted_shapes(m, n, k, batch_tuple, op_desc.adj_x, op_desc.adj_y,
                                                                  op_desc.bias, op_desc.left_format,
                                                                  op_desc.right_format, op_desc.out_format)
    return None, config_space, str((shape_xx, shape_yy, op_desc.bias, op_desc.left_format, op_desc.right_format,
                                    op_desc.out_format, op_desc.adj_x, op_desc.adj_y, op_desc.dtype,
                                    op_desc.out_dtype)), None, None



def _get_space_batch_matmul_gpu(op_type: str, op_desc, tuning_attrs=[], tuning_attrs_info=None):
"""get config space of batch_matmul operator in gpu"""
return

def get_range_block(space_res):
    """Build the (x, y, z) GPU block-dimension candidate ranges.

    Reads the block range/mod tables from *space_res*; empty y/z ranges
    fall back to the single candidate [1].
    """
    bounds = space_res.gpu_block_range_table.asnumpy().tolist()
    steps = space_res.gpu_block_mod_table.asnumpy().tolist()

    def _axis_range(i):
        return range(bounds[i][0], bounds[i][1] + 1, steps[i][0])

    x_range = _axis_range(0)
    y_range = _axis_range(1)
    if not y_range:
        y_range = range(1, 2)
    z_range = _axis_range(2)
    if not z_range:
        z_range = range(1, 2)
    return x_range, y_range, z_range

def get_range_thread(space_res):
    """Build the (x, y, z) GPU thread-dimension candidate ranges.

    Reads the thread range/mod tables from *space_res*; empty y/z ranges
    fall back to the single candidate [1].
    """
    bounds = space_res.gpu_thread_range_table.asnumpy().tolist()
    steps = space_res.gpu_thread_mod_table.asnumpy().tolist()

    def _axis_range(i):
        return range(bounds[i][0], bounds[i][1] + 1, steps[i][0])

    x_range = _axis_range(0)
    y_range = _axis_range(1)
    if not y_range:
        y_range = range(1, 2)
    z_range = _axis_range(2)
    if not z_range:
        z_range = range(1, 2)
    return x_range, y_range, z_range

def get_space_with_block_thread(tiling_spaces, space_res, policy=GpuSpacePolicy.FULL):
    """Extend each tiling candidate with GPU block/thread binding candidates.

    Parameters
    ----------
    tiling_spaces: list of list of int
        raw tiling-size candidates
    space_res:
        space query result providing the block/thread range tables
    policy: GpuSpacePolicy
        REDUCE_ALL derives block.x from the total shape and the first tile,
        BMM derives the grid from the innermost tile sizes, FULL enumerates
        the whole cartesian product.

    Returns
    -------
    list of list of int
        each entry is the original tiling followed by
        [block_x, block_y, block_z, thread_x, thread_y, thread_z]

    Fix vs. original: the tqdm progress bar was never closed (leaked even
    when an unknown policy raised).
    """
    total_shape = max(max(v) for v in tiling_spaces)
    new_spaces = []
    block_x_range, block_y_range, block_z_range = get_range_block(space_res)
    thread_x_range, thread_y_range, thread_z_range = get_range_thread(space_res)
    max_thread = 1024  # hardware cap on threads per block

    pbar = tqdm(total=len(tiling_spaces))
    try:
        for space in tiling_spaces:
            pbar.set_description("Adding block, thread to spaces")
            if policy == GpuSpacePolicy.REDUCE_ALL:
                # block.x is pinned to ceil(total_shape / first tile)
                for bx in range((total_shape-1)//space[0]+1, (total_shape-1)//space[0]+2):
                    for by in block_y_range:
                        for bz in block_z_range:
                            for tx in thread_x_range:
                                for ty in thread_y_range:
                                    for tz in thread_z_range:
                                        if tx * ty * tz > max_thread:
                                            continue
                                        new_spaces.append(space + [bx, by, bz, tx, ty, tz])
            elif policy == GpuSpacePolicy.BMM:
                for tx in thread_x_range:
                    for ty in thread_y_range:
                        for tz in thread_z_range:
                            if tx * ty * tz > max_thread:
                                continue
                            # threads must not exceed the innermost tile sizes
                            if tx > space[-1] or (len(space) >= 2 and ty > space[-2]) or (len(space) >= 3 and tz > space[-3]):
                                continue
                            # grid size = tile size / threads, clamped to >= 1
                            bx = max(1, space[-1] // tx)
                            by = max(1, space[-2] // ty) if len(space) >= 2 else 1
                            bz = max(1, space[-3] // tz) if len(space) >= 3 else 1
                            if bx >= block_x_range.stop or by >= block_y_range.stop or bz >= block_z_range.stop:
                                continue
                            new_spaces.append(space + [bx, by, bz, tx, ty, tz])
            elif policy == GpuSpacePolicy.FULL:
                for bx in block_x_range:
                    for by in block_y_range:
                        for bz in block_z_range:
                            for tx in thread_x_range:
                                for ty in thread_y_range:
                                    for tz in thread_z_range:
                                        new_spaces.append(space + [bx, by, bz, tx, ty, tz])
            else:
                raise ValueError("Policy {} is not defined.".format(policy))

            pbar.update(1)
    finally:
        pbar.close()
    print("total spaces size is: ", len(new_spaces))
    return new_spaces

def _get_space_conv_image2col_gemm_gpu(op_type: str, op_desc, tuning_attrs=[], tuning_attrs_info=None):
"""get config space of conv_image2col_gemm operators in gpu"""
return

# Dispatch table: op_type -> hand-written config-space generator.
# Ops without an entry fall back to the generic _get_space_vector.
_get_space_func = {
    'conv': _get_space_conv,
    'conv_bn1': _get_space_conv_bn1,
    'conv_backprop_input': _get_space_conv_backprop_input,
    'conv_backprop_filter': _get_space_conv_backprop_filter,
    'matmul': _get_space_matmul_cube,
    "reduce_sum_gpu": _get_space_reduce_gpu_manually,
    "batch_matmul_gpu": _get_space_batch_matmul_gpu,
    "conv_image2col_gemm_gpu": _get_space_conv_image2col_gemm_gpu,
}


def get_space(op_type: str, op_desc: NamedTuple, tuning_attrs=None, tuning_attrs_info=None):
    """Get the config space of an operator.

    Looks up a dedicated generator in ``_get_space_func``; unknown op types
    fall back to the generic vector-op generator.  GPU generators receive the
    tuning-attribute arguments as well.

    Note: the original used a mutable default (``tuning_attrs=[]``); ``None``
    is the safe sentinel and is normalized to an empty list below.
    """
    if tuning_attrs is None:
        tuning_attrs = []
    func = _get_space_func.get(op_type, None)
    if func is None:
        func = partial(_get_space_vector, op_type=op_type)
    if "gpu" in op_type:
        return func(op_type=op_type, op_desc=op_desc, tuning_attrs=tuning_attrs, tuning_attrs_info=tuning_attrs_info)
    return func(op_desc=op_desc)

+ 147
- 0
tests/fuzz/tune_for_gpu/autotuning/test_data_generators.py View File

@@ -0,0 +1,147 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Generating test data for operators"""
from typing import NamedTuple

import numpy as np
from gen_json_data import gen_json_data
from test_run import batchmatmul_run, conv_run, conv_backprop_input_run, conv_backprop_filter_run, matmul_run
from .type_definitions import ConvDesc, ConvBackpropDesc, MatmulCubeDesc

def _gen_data_json(op_desc):
    """Generate test inputs and the expected result for a composite-json op."""
    inputs, expect, _ = gen_json_data(op_desc)
    return inputs, expect

def _gen_data_conv(op_desc: ConvDesc):
    """Generate inputs, an fp16 output buffer and the expected result for conv."""
    fmap, weight, bias, expect = conv_run.gen_data(op_desc.fmap_shape, op_desc.filter_shape,
                                                   op_desc.pad, op_desc.stride, op_desc.dilation,
                                                   op_desc.use_bias)
    out_buf = np.full(expect.shape, 0, 'float16')

    inputs = (fmap, weight, bias) if op_desc.use_bias else (fmap, weight)
    return inputs + (out_buf,), expect


def _gen_data_conv_bn1(op_desc: ConvDesc):
    """Generate test data for fused conv + first half of batch-norm.

    Returns a dict with the call args and the negative indexes of the three
    output buffers, plus the expected (conv, var_part, mean) tuple.
    """
    fmap, weight, bias, conv_expect = conv_run.gen_data(op_desc.fmap_shape, op_desc.filter_shape,
                                                        op_desc.pad, op_desc.stride, op_desc.dilation,
                                                        op_desc.use_bias)
    # batch-norm statistics are reduced over N, H, W (per channel)
    axes = (0, 2, 3)
    mean = np.mean(conv_expect, axis=axes, keepdims=True)
    var_part = np.mean(np.power(conv_expect, 2), axis=axes, keepdims=True)

    expects = (conv_expect, var_part, mean)

    out_bufs = [np.full(e.shape, 0, 'float16') for e in expects]
    # statistics outputs are accumulated in fp32
    out_bufs[1] = out_bufs[1].astype(np.float32)
    out_bufs[2] = out_bufs[2].astype(np.float32)

    inputs = [fmap, weight, bias] if op_desc.use_bias else [fmap, weight]
    args = tuple(inputs + out_bufs)

    return {"args": args, 'outputs': (-3, -2, -1)}, expects


def _gen_data_conv_backprop_input(op_desc: ConvBackpropDesc):
    """Generate test data for conv_backprop_input."""
    grad_out, weight, dx_expect = conv_backprop_input_run.gen_data(op_desc.fmap_shape, op_desc.filter_shape,
                                                                   op_desc.pad, op_desc.stride, op_desc.dilation)
    out_buf = np.full(dx_expect.shape, 0, 'float16')
    return (grad_out, weight, out_buf), dx_expect


def _gen_data_conv_backprop_filter(op_desc: ConvBackpropDesc):
    """Generate test data for conv_backprop_filter."""
    block_size = 16

    batch, channels, height, width = op_desc.fmap_shape
    out_channels, _, kernel_h, kernel_w = op_desc.filter_shape

    # round channel counts up to a multiple of the cube block size
    def _align(value):
        return (value + block_size - 1) // block_size * block_size

    channels = _align(channels)
    out_channels = _align(out_channels)

    x_shape = (batch, channels, height, width)
    w_shape = (out_channels, channels, kernel_h, kernel_w)

    dy_data, dx_data, expect = conv_backprop_filter_run.gen_data(x_shape, w_shape, op_desc.pad, op_desc.stride,
                                                                 op_desc.dilation)
    out_buf = np.full(expect.shape, 0, 'float32')
    return (dy_data, dx_data, out_buf), expect


def _gen_data_matmul_cube(op_desc: MatmulCubeDesc):
    """Generate test data for matmul_cube."""
    batch_tuple, m, k, n = matmul_run.extract_dim(op_desc.x_shape, op_desc.y_shape, op_desc.adj_x, op_desc.adj_y)
    # align every dimension up to the 16-wide cube tile
    m, k, n = ((dim + 15) // 16 * 16 for dim in (m, k, n))
    _, _, _, out_shape, k = matmul_run.get_converted_shapes(m, n, k, batch_tuple, op_desc.adj_x, op_desc.adj_y,
                                                            op_desc.bias, op_desc.left_format, op_desc.right_format,
                                                            op_desc.out_format)
    m_x, m_y, bench_mark, bias_data = matmul_run.matmul_data(batch_tuple, m, k, n, op_desc.dtype, op_desc.bias_dtype,
                                                             op_desc.out_dtype, op_desc.bias, op_desc.adj_x,
                                                             op_desc.adj_y, op_desc.left_format,
                                                             op_desc.right_format, op_desc.out_format)

    # NaN-filled output buffer makes untouched elements detectable
    out_buf = np.full(out_shape, np.nan, op_desc.out_dtype)

    inputs = (m_x, m_y, bias_data) if op_desc.bias else (m_x, m_y)
    return inputs + (out_buf,), bench_mark


# Dispatch table: op_type -> test-data generator used by gen_data().
_gen_data_func = {
    'json': _gen_data_json,
    'conv': _gen_data_conv,
    'conv_bn1': _gen_data_conv_bn1,
    'conv_backprop_input': _gen_data_conv_backprop_input,
    'conv_backprop_filter': _gen_data_conv_backprop_filter,
    'matmul': _gen_data_matmul_cube,
}


def gen_data(op_type: str, op_desc: NamedTuple):
    """Generate test data for operator.

    Parameters
    ----------
    op_type: str
        operator name
    op_desc: NamedTuple
        operator definition parameters

    Raises
    ------
    ValueError
        if no data generator is registered for *op_type*
    """
    gen_func = _gen_data_func.get(op_type, None)
    if gen_func is None:
        raise ValueError('Unsupported op type for test data generating: %s' % op_type)
    return gen_func(op_desc)

+ 84
- 0
tests/fuzz/tune_for_gpu/autotuning/tiling_strategies_gpu.py View File

@@ -0,0 +1,84 @@
from akg.utils import custom_tiling as ct_util

def reduce_gpu_tiling_strategy(in_shape, reduce_axis):
    """Custom tiling strategy for reduce op in gpu.

    Parameters
    ----------
    in_shape: sequence of int
        input tensor shape
    reduce_axis: sequence of int or None
        axes being reduced; None means reduce over everything

    Returns
    -------
    list
        tiling constraints built via ``ct_util``

    Fixes vs. original: ``reduce_axis == None`` replaced with ``is None``;
    bare string literals that were used as comments (they are no-op
    statements, not comments) replaced with real comments; the Reduce-X and
    Reduce-Y branches were byte-identical, so they are merged into one.
    """
    strategy = list()

    if reduce_axis is None or len(reduce_axis) == len(in_shape):
        # all-reduce: tile the flattened axis in warp-sized (32) multiples
        # and bind between one warp and a full block (1024 threads)
        strategy.append(
            ct_util.create_constraint_on_axis(
                values=32, constraints=ct_util.TileConstraint.MOD, band=0, axis=0
            )[0]
        )
        for value, constraint in (
                ([32, 1, 1], ct_util.TileConstraint.THREAD_MOD),
                ([1024, 1, 1], ct_util.TileConstraint.THREAD_MAX),
                ([32, 1, 1], ct_util.TileConstraint.THREAD_MIN),
        ):
            strategy.append(
                ct_util.modify_common_constraints(value=value, constraint=constraint)
            )
    else:
        # Reduce-X (innermost axis reduced) and Reduce-Y: dummy strategy for
        # the hand-written space — cap tile sizes, threads and blocks at 1 so
        # the tuner's explicit candidates take over.
        for axis in (0, 1):
            strategy.append(
                ct_util.create_constraint_on_axis(
                    values=1, constraints=ct_util.TileConstraint.MAX, band=0, axis=axis
                )[0]
            )
        for constraint in (ct_util.TileConstraint.THREAD_MAX,
                           ct_util.TileConstraint.BLOCK_MAX):
            strategy.append(
                ct_util.modify_common_constraints(value=[1, 1, 1], constraint=constraint)
            )

    return strategy


def conv_dummy_strategy():
    """Conv strategy: dummy strategy (placeholder, produces no constraints)."""
    return None

def batch_matmul_gpu_tiling_strategy(desc):
    """Custom tiling strategy for batch matmul in gpu with or without tensor core.

    Placeholder: produces no constraints yet.
    """
    return None

+ 359
- 0
tests/fuzz/tune_for_gpu/autotuning/tuner.py View File

@@ -0,0 +1,359 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tuner for finding best config for operators"""
import logging
import time
import json
import os
import numpy as np
from multiprocessing import Process
from tvm.autotvm.tuner.xgboost_cost_model import XgbCostModel
from tvm.autotvm.tuner.sa_model_optimizer import SimulatedAnnealingOptimizer
from .space import ConfigSpace
from .runner import KernelRunner
from tqdm import tqdm

logger = logging.getLogger('fuzz.tune.autotuning.tuner')


class Tuner:
    """Basic tuner class

    Grid-searches a config space: fetches configs in batches, measures them
    on the device through *runner*, and tracks the best-performing config.

    Parameters
    ----------
    runner: KernelRunner
        This is for run kernels in physical device
    index_table: list
        table mapping config indexes to axis information
    config_space: ConfigSpace
        The space of configs
    n_parallel: int
        How many kernels are processed in a turn
    skip_config_set: set, optional
        exported config strings that should be skipped during tuning
    """

    def __init__(self, runner: KernelRunner, index_table: list, config_space: ConfigSpace, n_parallel: int = 1, skip_config_set=None):
        self._runner = runner
        self._index_table = index_table
        self._space = config_space
        self._n_parallel = n_parallel

        # trial plan
        self._trials = []
        self._trial_pt = 0
        self._visited = set()

        # observed samples
        self._xs = []
        self._ys = []

        # keep the current best
        self._best_config = None  # type: ConfigEntity
        self._best_time = np.inf
        self._best_iter = 0
        self._tuning_time = 0.0
        self._original_time = np.inf
        self._skip_config_set = skip_config_set

    @property
    def best_config(self):
        return self._best_config

    @property
    def best_time(self):
        return self._best_time

    @property
    def best_iter(self):
        return self._best_iter

    @property
    def tuning_time(self):
        return self._tuning_time

    @property
    def original_time(self):
        return self._original_time

    @property
    def xs(self):
        return self._xs

    @property
    def ys(self):
        return self._ys

    def info(self):
        """Print a summary of the tuning session."""
        print('space size:', self._space.length)
        print('best config:', self._best_config)
        print('best time:', self._best_time)
        print('best_iter:', self._best_iter)
        print('tuning time:', self._tuning_time, 'secs')

    def next_batch(self, batch_size: int, is_add_visited=True):
        """Extract the next batch, preferring indexes planned by the cost model."""
        ret = []
        counter = 0
        if not is_add_visited:
            # warm-up mode: just take the first configs without marking them
            return [self._space.get(index) for index in range(min(batch_size, self._space.length))]
        while counter < batch_size and self._space.has_next():
            index = 0
            # consume the next unvisited index from the trial plan, if any
            while self._trial_pt < len(self._trials):
                index = self._trials[self._trial_pt]
                if index not in self._visited:
                    break
                self._trial_pt += 1

            if self._trial_pt >= len(self._trials):
                # if the trial list is empty choose randomly
                index = self._space.fetch_index()

            ret.append(self._space.get(index))
            self._visited.add(index)

            counter += 1
        return ret

    def next_config(self, batch_size: int):
        """Extract the next configs in space order."""
        ret = []
        counter = 0
        while counter < batch_size and self._space.has_next():
            index = self._space.fetch_next_index()
            ret.append(self._space.get(index))
            self._visited.add(index)
            counter += 1
        return ret

    def export_configs(self, configs: list, output_file: str, append: bool = True, desc=""):
        """Append measured configs as "desc | config-json | time" lines."""
        mode = "a" if append else "w"
        with open(output_file, mode) as f:
            for x, y in configs:
                if y != -1:  # -1 marks a failed run; don't record it
                    f.write("{} | {} | {}\n".format(desc, json.dumps(x._asdict()), y))

    def export_dim_configs(self, configs, output_file: str, append: bool = True, key=""):
        """Export dim configs, merging into any existing json in *output_file*."""
        import re
        mode = "a" if append else "w"
        data = {}
        try:
            if os.path.isfile(output_file):
                with open(output_file, 'r') as f:
                    data = json.load(f)
        except IOError as e:
            logger.debug("get dim info from [%s] failed: %s", output_file, str(e))
        with open(output_file, mode) as f:
            data[key] = configs
            # one key per line for readability
            s = json.dumps(data, sort_keys=True)
            s = re.sub(r',\s*"', ',\n"', s)
            s = '{\n' + s[1:-1] + '\n}'
            f.write(s)

    def export_dim_configs_for_keys(self, configs, output_file: str, append: bool = True, keys=None):
        """Export dim configs nested under the given key path.

        Fixes vs. original: mutable default argument ``keys=[]`` replaced
        with ``None``; with an empty key list the original referenced the
        unbound name ``info`` (NameError) — now the configs are merged
        directly.
        """
        import copy
        keys = [] if keys is None else keys
        mode = "a" if append else "w"
        data = {}
        try:
            if os.path.isfile(output_file):
                with open(output_file, 'r') as f:
                    data = json.load(f)
        except IOError as e:
            logger.debug("get dim info from [%s] failed: %s", output_file, str(e))
        with open(output_file, mode) as f:
            # wrap configs into nested dicts, innermost key last
            info = copy.deepcopy(configs)
            for key in reversed(keys):
                info = {key: copy.deepcopy(info)}
            data.update(info)
            s = json.dumps(data, sort_keys=True, indent=4)
            print(s)
            f.write(s)

    def load_configs(self, input_file: str):
        """Load configs previously written by export_configs."""
        configs = []
        file_path = os.path.realpath(input_file)
        if os.path.isfile(file_path):
            with open(file_path, "r") as f:
                for line in f:
                    # lines look like "desc | config-json | time"; the config
                    # is the second-to-last field.  (The original unpacked the
                    # FIRST field and fed the desc text to json.loads.)
                    *_, x, y = line.split('|')
                    configs.append((self._space.input_type(**json.loads(x)), np.float64(y)))
        return configs

    def tune(self, least_try_times: int, output_file: str = None):
        """Grid search all configs, up to *least_try_times* measurements.

        Returns the run times of the last batch, or None if the space was
        already exhausted (the original raised NameError in that case).
        """
        i = 0
        run_times = None  # guard: the loop may not execute at all
        pbar = tqdm(total=least_try_times)
        while i < least_try_times:
            if not self._space.has_next():
                break
            configs = self.next_config(min(self._n_parallel, least_try_times - i))
            run_times = self._runner.run(configs, self._best_time)
            results = []
            for idx, conf in enumerate(configs):
                results.append((conf.input_id, run_times[idx]))
                # keep best config
                if self.best_time > run_times[idx]:
                    self._best_time = run_times[idx]
                    self._best_iter = i + idx
                    self._best_config = conf

            i += len(results)
            pbar.update(len(results))

            # update observed samples and persist them
            for res in results:
                self._xs.append(res[0])
                self._ys.append(res[1])
            if output_file:
                configs = [(self._space.get(res[0]).input, res[1]) for res in results]
                self.export_configs(configs, output_file)
        pbar.close()
        return run_times


class ModelBasedTuner(Tuner):
    """Model based tuner
    This tuner will fit a cost model and use an optimizer to find the maximums of the cost model as next trials

    Parameters
    ----------
    plan_size: int
        Tuner will re-fit model per `plan_size` new measure samples
    pre_model: CostModel
        The cost model that predicts the speed of a config (IR)
    """

    def __init__(self, runner, index_table, config_space, n_parallel=1, plan_size=32, pre_model=None):
        super(ModelBasedTuner, self).__init__(runner, index_table, config_space, n_parallel)
        self.__plan_size = plan_size

        # reuse a pre-trained cost model when given, otherwise start fresh
        if pre_model is not None:
            self.__cost_model = pre_model
            self.__cost_model.reset_space(self._space)
        else:
            self.__cost_model = XgbCostModel(self._space)

        self.__model_optimizer = SimulatedAnnealingOptimizer(self._space)
        self.__train_ct = 0

        self.__is_auto_set_dim = False  # flip to True to measure a default-dim baseline first

        # time to leave
        self.__ttl = None
        self.__least_try_times = None
        self.__early_stopping = None

        self.__model_run_time = 0.0

    def info(self):
        """Print the base summary plus the time spent in the cost model."""
        super(ModelBasedTuner, self).info()
        print('model run time:', self.__model_run_time, 'secs')

    def model_res(self):
        """Fit the cost model on observed samples and plan the next trials."""
        self.__cost_model.fit(self._xs, self._ys, self.__plan_size)
        best_configs = self.__model_optimizer.find_best(
            self.__cost_model, self.__plan_size, self._visited)
        self._trials = best_configs

    def tune(self, least_try_times: int, output_file: str = None):
        """Run model-guided tuning for at least *least_try_times* measurements.

        Keeps trying (up to 3x the budget) while the best time has not
        improved on the original time; stops early on convergence or after a
        2-hour wall-clock limit.
        """
        early_stopping = least_try_times
        self.__least_try_times = least_try_times
        self.__early_stopping = early_stopping

        # NOTE(review): old_level is read AFTER setLevel(DEBUG), so the
        # "restore" below always restores DEBUG — confirm intended.
        logger.setLevel(logging.DEBUG)
        old_level = logger.level
        i = 0
        # NOTE(review): error_ct is checked at the end of the loop but never
        # incremented anywhere — the debug-mode switch is currently dead code.
        error_ct = 0

        tuning_start = time.time()
        while (i < self._space.length and (i < least_try_times
                                           or (self._best_time > self._original_time - 0.9
                                               and i < least_try_times * 3))):
            if not self._space.has_next():
                break
            iter_start = time.time()
            if not self.__is_auto_set_dim:
                configs = self.next_batch(min(self._n_parallel, self._space.length - i))
            else:
                # baseline pass: take the first configs without visiting them
                configs = self.next_batch(min(self._n_parallel, self._space.length - i), False)

            logger.debug('--indexes: %s', str([x.input_id for x in configs]))

            run_times = self._runner.run(configs, self._best_time, self.__is_auto_set_dim)
            if self.__is_auto_set_dim:
                # record the average default-dim time as the reference, then
                # restart measurement from scratch
                from operator import add
                from functools import reduce
                self._original_time = reduce(add, run_times) / len(run_times)
                self._best_time = self._original_time
                self._best_iter = -1
                self._best_config = None
                run_times = None
                self.__is_auto_set_dim = False
                continue

            results = []
            for idx, conf in enumerate(configs):
                if run_times[idx] == -1:
                    # -1 marks a failed run; skip it
                    continue
                results.append((conf.input_id, run_times[idx]))
                # keep best config
                if self._best_time > run_times[idx]:
                    self._best_time = run_times[idx]
                    self._best_iter = i + idx
                    self._best_config = conf

            i += len(results)
            self.__ttl = min(early_stopping + self.best_iter, self._space.length) - i

            start = time.time()
            # update observed samples and persist them
            for res in results:
                self._xs.append(res[0])
                self._ys.append(res[1])
            if output_file:
                configs = [(self._space.get(res[0]).input, res[1]) for res in results]
                desc = str(self._runner.op_desc)
                self.export_configs(configs, output_file, desc=desc)
            # if we have enough new training samples
            if len(self._xs) >= self.__plan_size * (self.__train_ct + 1):
                # re-fit in a subprocess (joined immediately, so effectively
                # synchronous); it refreshes self._trials via model_res
                p = Process(target=self.model_res)
                p.start()
                p.join()
                self._trial_pt = 0
                self.__train_ct += 1

            end = time.time()
            logger.debug('model running time: %f seconds', end - start)
            self.__model_run_time += end - start

            iter_end = time.time()
            logger.debug('iter time: %f seconds', iter_end - iter_start)

            if self._best_iter > 0 and i >= self.best_iter + early_stopping:
                logger.debug('Early stopped. Best iter: %d', self._best_iter)
                return

            print("tuning time already, ", time.time() - tuning_start)
            # hard wall-clock cap: 2 hours
            if time.time() - tuning_start > 7200:
                logger.debug('Early stopped because of too long time. Best iter: %d', self._best_iter)
                return

            if error_ct > 150:
                logging.warning('Too many errors happen in the tuning. Now is in debug mode')
                logger.setLevel(logging.DEBUG)
            else:
                logger.setLevel(old_level)

        self._tuning_time += time.time() - tuning_start

+ 9
- 0
tests/fuzz/tune_for_gpu/autotuning/tuning_attrs_descs/reduce_tuning_attrs_desc.json View File

@@ -0,0 +1,9 @@
{
"enable_atomic_add": {
"dtype": "bool",
"options": [
"False",
"True"
]
}
}

+ 155
- 0
tests/fuzz/tune_for_gpu/autotuning/tuning_utils.py View File

@@ -0,0 +1,155 @@
from collections import namedtuple
import os
import logging


def get_block_str_from_config(config: namedtuple):
    """Join the block_x/block_y/block_z fields of *config* into a string.

    Each present field contributes "<value> " (trailing space included);
    missing fields are skipped.
    """
    fields = getattr(config, "_fields")
    parts = []
    for name in ("block_x", "block_y", "block_z"):
        if name in fields:
            parts.append(str(getattr(config, name)) + " ")
    return "".join(parts)


def get_thread_str_from_config(config: namedtuple):
    """Join the thread_x/thread_y/thread_z fields of *config* into a string.

    Each present field contributes "<value> " (trailing space included);
    missing fields are skipped.
    """
    fields = getattr(config, "_fields")
    parts = []
    for name in ("thread_x", "thread_y", "thread_z"):
        if name in fields:
            parts.append(str(getattr(config, name)) + " ")
    return "".join(parts)


def get_parallel_build_num():
    """Get the number of parallel build processes.

    Reads BUILD_PARALLEL_NUM from the environment and falls back to 1 when
    the variable is unset or not a valid integer.

    Fix vs. original: it caught NameError, but ``int()`` on a bad value
    raises ValueError, which therefore propagated; the pointless ``.lower()``
    before ``int()`` is also dropped.
    """
    raw = os.environ.get('BUILD_PARALLEL_NUM')
    if not raw:
        return 1
    try:
        return int(raw)
    except ValueError as e:
        logging.error(e)
        return 1


def get_available_gpu_num():
    """Get the list of usable GPU device ids.

    Reads USE_GPU_DEVICES (comma-separated ids) from the environment and
    falls back to ``[0]`` when the variable is unset or malformed.

    Fixes vs. original: it caught NameError, but ``int()`` on a bad token
    raises ValueError; and its error path returned the int ``1`` while the
    normal path returns a list — callers iterating the result would break.
    """
    raw = os.environ.get('USE_GPU_DEVICES')
    if not raw:
        return [0, ]
    try:
        return [int(dev_id) for dev_id in raw.split(",")]
    except ValueError as e:
        logging.error(e)
        return [0, ]

def get_real_attr(value, key, need_tune_json, need_tune_keys):
    """Map a tuning-space option index back to the attribute's real value.

    Parameters
    ----------
    value:
        the option index when *key* is tunable, otherwise the value itself
    key: str
        attribute name
    need_tune_json: dict
        per-key {"dtype": ..., "options": [...]} descriptions
    need_tune_keys:
        the keys that are actually being tuned

    Raises
    ------
    TypeError
        when an option does not match its declared dtype, or the dtype is
        unsupported (the original silently returned None in that case).
    """
    if key not in need_tune_keys:
        return value
    desc = need_tune_json[key]
    if desc['dtype'] == "bool":
        option = desc['options'][value].lower()
        if option == "true":
            return True
        if option == "false":
            return False
        raise TypeError("Wrong boolean type, please check json file")
    if desc['dtype'] == "str":
        if isinstance(desc['options'][value], str):
            return desc['options'][value]
        raise TypeError("Wrong str type, please check json file")
    if desc['dtype'] == "int":
        if isinstance(desc['options'][value], int):
            return desc['options'][value]
        raise TypeError("Wrong int type, please check json file")
    raise TypeError("Unsupported dtype in tuning json: %s" % desc['dtype'])


def merge_attrs(attrs, config, need_tune_json):
    """Merge a sampled *config* into the base *attrs* namedtuple.

    Builds the `dim`, `bind_block` and `bind_thread` strings from the
    config's tiling_* / block_* / thread_* fields, copies the tunable
    attributes listed in *need_tune_json* (translating option indexes via
    get_real_attr), and returns a new namedtuple of the same type as *attrs*.

    NOTE(review): relies on attrs._asdict() preserving field order so that
    attrs_type(*config_list) lines up positionally — true for namedtuples,
    but fragile if attrs becomes another mapping type.
    """
    # all tiling_* field values, in field order
    tiling = [getattr(config, name) for name in getattr(
        config, '_fields') if name.startswith('tiling')]
    dim_str = ''
    d_config = config._asdict()
    d_attrs = attrs._asdict()
    # 2-D tiling fields are named like "tiling_<axis>_<part>" (two
    # underscores); 1-D fields have a single underscore
    is_2d_tiling = False
    for name in getattr(config, '_fields'):
        if name.startswith('tiling'):
            if name.count("_") == 2:
                is_2d_tiling = True
            break
    for i, element in enumerate(tiling):
        if is_2d_tiling:
            # pairs of values share one axis header "0 <axis> "
            if i % 2 == 0:
                dim_str += "0 " + str(i//2) + " "
            dim_str += str(element) + " "
        else:
            # 1d tiling
            dim_str += "0 " + str(i) + " " + str(element) + " 1 "

    # add block, thread information
    block = [str(getattr(config, name)) for name in getattr(
        config, '_fields') if name.startswith('block')]
    bind_block_str = ' '.join(block)

    thread = [str(getattr(config, name)) for name in getattr(
        config, '_fields') if name.startswith('thread')]
    bind_thread_str = ' '.join(thread)

    d_attrs['dim'] = dim_str
    d_attrs['bind_block'] = bind_block_str
    d_attrs['bind_thread'] = bind_thread_str

    # overwrite tunable attributes with the sampled option indexes
    need_tune_keys = need_tune_json.keys()
    for key in need_tune_keys:
        d_attrs[key] = d_config[key]

    # make a new attrs with config info
    attrs_type = type(attrs)
    config_list = [get_real_attr(d_attrs[k],k,need_tune_json, need_tune_keys) for k in d_attrs]
    new_attrs = attrs_type(*config_list)
    return new_attrs


def get_skip_configs_from_log(skip_configs_log):
    """Collect the config strings recorded in a previous tuning log.

    Each log line looks like "desc | config | time"; the middle field is
    collected so those configs can be skipped on the next run.  An empty
    path yields an empty set.
    """
    skip_config_set = set()
    if skip_configs_log == "":
        return skip_config_set
    with open(skip_configs_log, 'r') as file:
        for line in file:
            skip_config_set.add(line.split("|")[1].strip())
    print("SKIP CONFIGS NUMBER:", len(skip_config_set))
    return skip_config_set

def get_tuning_attrs_from_json(tuning_attrs_json):
    """Read a tuning-attribute description json and enumerate option indices.

    Returns a (keys, index_spaces, raw_json) tuple: *keys* lists the tunable
    attribute names, *index_spaces* is the cartesian product of option
    indices (one list per combination), and *raw_json* the parsed file.  An
    empty path yields ([], [[]], {}).
    """
    import json
    keys = []
    spaces = [[]]
    parsed = dict()
    if tuning_attrs_json != "":
        with open(tuning_attrs_json, 'r') as file:
            parsed = json.load(file)
        for key in parsed.keys():
            keys.append(key)
            option_count = len(parsed[key]['options'])
            # extend each existing partial combination with every option index
            spaces = [partial + [idx]
                      for partial in spaces
                      for idx in range(option_count)]
    return (keys, spaces, parsed)

if __name__ == "__main__":
    # smoke-test the json-parsing helper
    file_name = "tuning_attrs_descs/reduce_tuning_attrs_desc.json"
    # get_tuning_attrs_from_json returns THREE values; the original unpacked
    # only two, so this script always crashed with ValueError before printing
    keys, need_tune_spaces, json_string = get_tuning_attrs_from_json(file_name)
    print(keys)
    print(need_tune_spaces)

+ 49
- 0
tests/fuzz/tune_for_gpu/autotuning/type_definitions.py View File

@@ -0,0 +1,49 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""operator description and config param definitions"""
from collections import namedtuple

# op desc for ascend
# ConvDesc: forward-conv problem description (shapes + conv params)
ConvDesc = namedtuple("ConvDesc", [
    'fmap_shape', 'filter_shape', 'pad', 'stride', 'dilation', 'use_bias'])

# ConvBackpropDesc: shared by backprop-input and backprop-filter tuning
ConvBackpropDesc = namedtuple(
    "ConvBackpropDesc", ['fmap_shape', 'filter_shape', 'pad', 'stride', 'dilation'])

# MatmulCubeDesc: cube-unit matmul problem (shapes, formats, transposes, dtypes)
MatmulCubeDesc = namedtuple("MatmulCubeDesc", ["x_shape", "y_shape", "bias", "left_format", "right_format",
                                               "out_format", "adj_x", "adj_y", "dtype", "bias_dtype", "out_dtype"])


# op desc for gpu
# ReduceGpuDesc: reduce-op problem plus scheduling knobs (dim/block/thread
# strings and the akg-reduce-lib / atomic-add switches)
ReduceGpuDesc = namedtuple("ReduceGpuDesc", [
    "in_shape", "in_dtype", "axis", "keepdims",
    "poly_sch", "dim", "bind_block", "bind_thread",
    "enable_akg_reduce_lib", "enable_atomic_add"])


# config param definitions for ascend
ConvConfig = namedtuple('ConvConfig', [
    'tile_h', 'tile_co', 'tile_m', 'tile_k', 'tile_n', 'tile_w', 'bypass'])
ConvBackpropInputConfig = namedtuple('ConvBackpropInputConfig',
                                     ['tile_h', 'tile_co', 'tile_m', 'tile_k', 'tile_n', 'tile_w'])
ConvBackpropFilterConfig = namedtuple('ConvBackpropFilterConfig',
                                      ['tile_ci', 'tile_kh', 'tile_kw', 'tile_co', 'tile_batch',
                                       'tile_h', 'tile_w', 'tile_m', 'tile_k', 'tile_n'])
MatmulCubeConfig = namedtuple(
    'MatmulCubeConfig', ['n_l1', 'n_l0', 'm_l1', 'm_l0', 'k_l1', 'k_l0', 'bypass'])

# config param definitions for gpu
# EmptyConfig: sentinel used when an op's config is built dynamically
EmptyConfig = namedtuple('empty', [])

+ 16
- 0
tests/fuzz/tune_for_gpu/config_gpu.sh View File

@@ -0,0 +1,16 @@
# Number of parallel processes used to compile candidate kernels
export BUILD_PARALLEL_NUM=4

# Make all GPUs visible to CUDA; device selection is done via
# USE_GPU_DEVICES below, so do not change this line
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# Comma-separated ids of the devices the tuner may actually use
export USE_GPU_DEVICES=0,1,2,3

# Run kernels on the GPU backend
export RUNTIME_MODE=gpu

# Collect per-kernel timing via the profiler
export PROFILING_MODE=true

# ascend config (unused for GPU tuning)
export DEVICE_ID=0
export DEVICE_TOTAL_NUM=8

+ 67
- 0
tests/fuzz/tune_for_gpu/test_gpu.py View File

@@ -0,0 +1,67 @@
# Copyright 2019-2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""test"""
import time
from autotuning.job import launch
from akg.utils import kernel_exec
from akg.ops.math_gpu import reduce_sum
from autotuning.type_definitions import ReduceGpuDesc
import numpy as np
import sys
import argparse
from autotuning.tuning_utils import get_skip_configs_from_log, get_tuning_attrs_from_json


def reduce_sum_gpu_execute(in_shape, dtype, axis=None, keepdims=False, attrs=False):
    """Build the reduce_sum GPU kernel module for the given shape and dtype.

    Args:
        in_shape: input tensor shape tuple.
        dtype: input data type string, e.g. "float32".
        axis: reduction axis/axes forwarded to the op via op_attrs.
        keepdims: whether reduced dimensions are kept with size 1.
        attrs: unused here; kept for interface compatibility with the tuner.

    Returns:
        The compiled kernel module produced by op_build_test.
    """
    # Bug fix: the original referenced the undefined names `utils` (the
    # module imports `kernel_exec`) and `in_dtype` (the parameter is
    # named `dtype`), which raised NameError on every call.
    mod = kernel_exec.op_build_test(reduce_sum, (in_shape, ), (dtype, ),
                                    kernel_name="reduce_sum_gpu", op_attrs=[axis, keepdims],
                                    attrs={"target": "cuda", "enable_akg_reduce_lib": True})
    return mod

def run_test_reduce_sum(in_shape, in_dtype, axis=None, keepdims=False, skip_config_set=None, tuning_attrs_info=None):
    """Tune reduce_sum on GPU over the whole config space and print the elapsed time."""
    start = time.time()

    # Assemble the op description consumed by the tuning launcher.
    # Positional order must match ReduceGpuDesc's fields.
    gpu_desc = ReduceGpuDesc(in_shape, in_dtype, axis, keepdims,
                             "", "", "",
                             True, True, True)
    descriptor = ('reduce_sum_gpu', reduce_sum_gpu_execute,
                  gpu_desc, tuning_attrs_info)

    launch(op_type='reduce_sum_gpu',
           debug_mode=True,
           save_res=True,
           desc=descriptor,
           all_space=True,
           from_json=False,
           skip_config_set=skip_config_set,
           tuning_attrs_info=tuning_attrs_info)

    print("total tuning time: ", time.time() - start)

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--skip_configs_log", type=str,
default="", help="skip those configs in .log file")
parser.add_argument("--tuning_attrs_json", type=str, default="",
help="the json file to describe the tuning atttrs")
args = parser.parse_args()

# check whether have configs need to skip
skip_config_set = get_skip_configs_from_log(args.skip_configs_log)

# add tuning_attrs from json file
tuning_attrs_info = get_tuning_attrs_from_json(args.tuning_attrs_json)

run_test_reduce_sum((1024, 1024), "float32", (1,),
False, skip_config_set=skip_config_set, tuning_attrs_info=tuning_attrs_info)

+ 1
- 1
tests/test_env.sh View File

@@ -25,7 +25,7 @@ else
TVM_ROOT="${AKG_DIR}/third_party/incubator-tvm"

export LD_LIBRARY_PATH=${AKG_BUILD_DIR}:${LD_LIBRARY_PATH}
export PYTHONPATH=${TVM_ROOT}/python:${TVM_ROOT}/topi:${TVM_ROOT}/topi/python:${AKG_DIR}:${AKG_DIR}/python:${PYTHONPATH}
export PYTHONPATH=${TVM_ROOT}/python:${TVM_ROOT}/topi:${TVM_ROOT}/topi/python:${AKG_DIR}:${AKG_DIR}/tests/common:${AKG_DIR}/python:${AKG_DIR}/tests/operators/gpu:${AKG_DIR}/tests/fuzz/tune_for_gpu:${PYTHONPATH}
if [ $# -eq 1 ] && [ $1 = "gpu" ]; then
export LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:${LD_LIBRARY_PATH}
fi


Loading…
Cancel
Save