From: @yiyanzhi_akane Reviewed-by: @dylangeng Signed-off-by:tags/v1.2.0
| @@ -50,7 +50,17 @@ def dump_tiling_info(level): | |||
| logging.info(info, tuning_spaces["index"][i][0], tuning_spaces["index"][i][1], | |||
| tuning_spaces["c1_range"][i][0], tuning_spaces["c1_range"][i][1], | |||
| tuning_spaces["c1_mod"][i][0], tuning_spaces["c0_range"][i][0], | |||
| tuning_spaces["c0_range"][i][1], tuning_spaces["c0_mod"][i][0]) | |||
| tuning_spaces["c0_range"][i][1], tuning_spaces["c0_mod"][i][0], | |||
| ) | |||
| idx_to_str = {0: "x", 1: "y", 2: "z"} | |||
| for i in range(len(tuning_spaces["thread_range"])): | |||
| info = "[thread.%s] range [%d, %d](jump by %d), " | |||
| logging.info(info, idx_to_str[i], tuning_spaces["thread_range"][i][0], tuning_spaces["thread_range"][i][1], | |||
| tuning_spaces['thread_mod'][i][0], ) | |||
| for i in range(len(tuning_spaces["block_range"])): | |||
| info = "[block.%s] range [%d, %d](jump by %d)" | |||
| logging.info(info, idx_to_str[i], tuning_spaces["block_range"][i][0], | |||
| tuning_spaces["block_range"][i][1], tuning_spaces['block_mod'][i][0],) | |||
| logging.info("===============================================") | |||
| elif isinstance(indice, int) and indice == EMPTY_CODE: | |||
| logging.info("Empty tiling space.") | |||
| @@ -108,6 +118,10 @@ def lower(sch, args, shape_params=None, name="default_function", binds=None, att | |||
| tuning_spaces["c0_range"] = ret.c0_tile_range_table.asnumpy().tolist() | |||
| tuning_spaces["c1_mod"] = ret.c1_tile_mod_table.asnumpy().tolist() | |||
| tuning_spaces["c0_mod"] = ret.c0_tile_mod_table.asnumpy().tolist() | |||
| tuning_spaces["thread_range"] = ret.gpu_thread_range_table.asnumpy().tolist() | |||
| tuning_spaces["block_range"] = ret.gpu_block_range_table.asnumpy().tolist() | |||
| tuning_spaces["thread_mod"] = ret.gpu_thread_mod_table.asnumpy().tolist() | |||
| tuning_spaces["block_mod"] = ret.gpu_block_mod_table.asnumpy().tolist() | |||
| if level >= help_tiling_level["Candidates"]: | |||
| tuning_spaces["tuning_space"] = ret.tiling_candidate.asnumpy().tolist() | |||
| if not tuning: | |||
| @@ -70,15 +70,33 @@ class TileConstraint(Enum): | |||
| SET_EXPANSION = "SET_EXPANSION" | |||
| SET_MEM_RATIO = "SET_MEM_RATIO" | |||
| SET_AXIS_INFO = "SET_AXIS_INFO" | |||
| THREAD_MIN = "THREAD_MIN" | |||
| THREAD_MAX = "THREAD_MAX" | |||
| THREAD_MOD = "THREAD_MOD" | |||
| BLOCK_MIN = "BLOCK_MIN" | |||
| BLOCK_MAX = "BLOCK_MAX" | |||
| BLOCK_MOD = "BLOCK_MOD" | |||
| @check_input_type((double, float, int), TileConstraint, TileLevel) | |||
| @check_input_type((double, float, int, list), TileConstraint, TileLevel) | |||
| def modify_common_constraints(value, constraint, level=TileLevel.C1): | |||
| """api for dsl to modify some default constraint used in auto tiling.""" | |||
| if constraint not in TileConstraint: | |||
| raise ValueError("Tile constraints must be chosen from {0}".format(TileConstraint)) | |||
| if constraint == TileConstraint.SET_MEM_RATIO: | |||
| return create_custom_tiling_node(TileMode.COMMON, tile_level=level, mem_ratio=double(value)) | |||
| if constraint == TileConstraint.THREAD_MIN: | |||
| return create_custom_tiling_node(TileMode.COMMON, thread_min=value) | |||
| if constraint == TileConstraint.THREAD_MAX: | |||
| return create_custom_tiling_node(TileMode.COMMON, thread_max=value) | |||
| if constraint == TileConstraint.THREAD_MOD: | |||
| return create_custom_tiling_node(TileMode.COMMON, thread_mod=value) | |||
| if constraint == TileConstraint.BLOCK_MIN: | |||
| return create_custom_tiling_node(TileMode.COMMON, block_min=value) | |||
| if constraint == TileConstraint.BLOCK_MAX: | |||
| return create_custom_tiling_node(TileMode.COMMON, block_max=value) | |||
| if constraint == TileConstraint.BLOCK_MOD: | |||
| return create_custom_tiling_node(TileMode.COMMON, block_mod=value) | |||
| raise TypeError("Constraint {} is not supported in this api, please use other api" | |||
| .format(constraint.value)) | |||
| @@ -233,7 +251,13 @@ def create_custom_tiling_node(tile_mode, | |||
| axis_info=DEFAULT_STRING, | |||
| priority=DEFAULT_VALUE, | |||
| expansion=DEFAULT_VALUE, | |||
| mem_ratio=double(DEFAULT_VALUE)): | |||
| mem_ratio=double(DEFAULT_VALUE), | |||
| thread_min=[], | |||
| thread_max=[], | |||
| thread_mod=[], | |||
| block_min=[], | |||
| block_max=[], | |||
| block_mod=[]): | |||
| """default method to create custom tiling node, all values are default except tile mode.""" | |||
| tile_min = to_tvm_type(tile_min, "tile_min") | |||
| @@ -257,7 +281,13 @@ def create_custom_tiling_node(tile_mode, | |||
| axis_info=akg.tvm.expr.StringImm(axis_info), | |||
| priority=priority, | |||
| expansion=expansion, | |||
| mem_ratio=mem_ratio) | |||
| mem_ratio=mem_ratio, | |||
| thread_min=thread_min, | |||
| thread_max=thread_max, | |||
| thread_mod=thread_mod, | |||
| block_min=block_min, | |||
| block_max=block_max, | |||
| block_mod=block_mod) | |||
| def template_nc1hwc0(tensor_name, level): | |||
| @@ -35,6 +35,7 @@ import numpy as np | |||
| import akg | |||
| from akg.build_module import help_tiling_level | |||
| from akg import backend as cce | |||
| import akg.tvm | |||
| from akg.tvm import autotvm | |||
| from akg.tvm import rpc | |||
| @@ -88,7 +89,6 @@ def debug_mode(debug_flag): | |||
| pass_list.append((0, ir_pass.inject_dma_intrin)) | |||
| return pass_list | |||
| def func_time_required(func_name): | |||
| """Checking the Time Required for Function Running.""" | |||
| def wrapper(*args, **kwargs): | |||
| @@ -467,7 +467,7 @@ def mod_launch_air(mod, args, outputs): | |||
| return None | |||
| @func_time_required | |||
| def mod_launch(mod, args, outputs=(-1,), tuning=False, device_id=0, expect=None): | |||
| def mod_launch(mod, args, outputs=(-1,), tuning=False, device_id=0, expect=None, repeat_time=400): | |||
| """ | |||
| unified run CCE kernel api. | |||
| @@ -492,7 +492,7 @@ def mod_launch(mod, args, outputs=(-1,), tuning=False, device_id=0, expect=None) | |||
| if not tuning: | |||
| return out_list[0] if len(out_list) == 1 else tuple(out_list) | |||
| else: | |||
| cycles = get_gpu_cycles(mod, *mod_args, device_id=device_id, save_log=True) | |||
| cycles = get_gpu_cycles(mod, *mod_args, device_id=device_id, save_log=True, repeat_time=repeat_time) | |||
| return out_list[0] if len(out_list) == 1 else tuple(out_list), {'run_time': cycles} | |||
| stat_info = {} | |||
| @@ -996,7 +996,6 @@ def op_build(op_func, input_shapes, input_types, op_attrs=None, kernel_name="", | |||
| level = attrs.get("help_tiling") if attrs and "help_tiling" in attrs else None | |||
| if tuning or (level is not None and level > help_tiling_level['None']): | |||
| return gen_spaces_dim_key(op_func, args, s, op_var, kernel_name, attrs, polyhedral, tuning, target) | |||
| mode = get_runtime_mode() | |||
| if mode == "cpu": | |||
| mod = akg.tvm.build(s, op_var, "llvm") | |||
| @@ -1069,12 +1068,12 @@ def get_device_id(): | |||
| logging.error(e) | |||
| return 0 | |||
| def get_gpu_cycles(mod, *mod_args, device_id=0, save_log=False): | |||
| def get_gpu_cycles(mod, *mod_args, device_id=0, save_log=False, repeat_time=400): | |||
| "get gpu profiling cycles." | |||
| func = tvm.get_global_func('GPUProfilerInit') | |||
| func("") | |||
| from akg.utils.result_analysis import gpu_profiling | |||
| gpu_profiling(mod, *mod_args, repeat_time=400, device_id=device_id) | |||
| gpu_profiling(mod, *mod_args, repeat_time=repeat_time, device_id=device_id) | |||
| func = tvm.get_global_func('GPUProfilerStop') | |||
| a = func() | |||
| return int(a) | |||
| @@ -80,6 +80,24 @@ class CustomTilingNode : public Node { | |||
| * default is 0.5 which is reserved for double buffer*/ | |||
| double mem_ratio; | |||
| /*! \brief minimal thread binding factor on gpu, greater than 0*/ | |||
| Array<Expr> thread_min; | |||
| /*! \brief maximal thread binding factor on gpu*/ | |||
| Array<Expr> thread_max; | |||
| /*! \brief constraint thread binding factor % thread_mod == 0*/ | |||
| Array<Expr> thread_mod; | |||
| /*! \brief minimal block binding factor on gpu, greater than 0*/ | |||
| Array<Expr> block_min; | |||
| /*! \brief maximal block binding factor on gpu*/ | |||
| Array<Expr> block_max; | |||
| /*! \brief constraint block binding factor % block_mod == 0*/ | |||
| Array<Expr> block_mod; | |||
| void VisitAttrs(AttrVisitor *v) { | |||
| v->Visit("tile_level", &tile_level); | |||
| v->Visit("tile_mode", &tile_mode); | |||
| @@ -97,6 +115,12 @@ class CustomTilingNode : public Node { | |||
| v->Visit("priority", &priority); | |||
| v->Visit("expansion", &expansion); | |||
| v->Visit("mem_ratio", &mem_ratio); | |||
| v->Visit("thread_min", &thread_min); | |||
| v->Visit("thread_max", &thread_max); | |||
| v->Visit("thread_mod", &thread_mod); | |||
| v->Visit("block_min", &block_min); | |||
| v->Visit("block_max", &block_max); | |||
| v->Visit("block_mod", &block_mod); | |||
| } | |||
| static constexpr const char *_type_key = "CustomTilingNode"; | |||
| @@ -36,6 +36,15 @@ class TileSpaceCollector { | |||
| space_->c1_tile_mod_table = init_array; | |||
| space_->c0_tile_mod_table = init_array; | |||
| space_->tiling_candidate = init_array; | |||
| space_->gpu_thread_range_table = init_array; | |||
| space_->gpu_block_range_table = init_array; | |||
| space_->gpu_thread_mod_table = init_array; | |||
| space_->gpu_block_mod_table = init_array; | |||
| if (analyzer_.scop_info_.user_config_.GetTarget() == TARGET_CUDA) { | |||
| cared_info_ = {"index", "C1_range", "C0_range", "C1_mod", "C0_mod", "gpu_thread_range", "gpu_block_range", "gpu_thread_mod", "gpu_block_mod"}; | |||
| } else { | |||
| cared_info_ = {"index", "C1_range", "C0_range", "C1_mod", "C0_mod"}; | |||
| } | |||
| } | |||
| ~TileSpaceCollector() = default; | |||
| @@ -122,38 +131,61 @@ class TileSpaceCollector { | |||
| // step 2. collect cared info from each axis | |||
| for (const auto &con : cared_info_) { | |||
| int length = con.find("mod") != std::string::npos ? 1 : 2; | |||
| auto array = air::runtime::NDArray::Empty({static_cast<int64_t>(tile_size), length}, type, ctx); | |||
| auto size = static_cast<int64_t>(tile_size); | |||
| if (con.find("gpu") != std::string::npos) { | |||
| size = std::max<int64_t>(3, size); | |||
| } | |||
| auto array = air::runtime::NDArray::Empty({size, length}, type, ctx); | |||
| auto spaceDlPack = array.ToDLPack(); | |||
| auto ptr = reinterpret_cast<int *>(spaceDlPack->dl_tensor.data); | |||
| for (size_t b_idx = 0; b_idx < all_axes.size(); ++b_idx) { | |||
| for (size_t a_idx = 0; a_idx < all_axes[b_idx].size(); ++a_idx) { | |||
| if (con == "index") { | |||
| *ptr++ = b_idx; | |||
| *ptr++ = a_idx; | |||
| if (con.find("gpu") != std::string::npos) { | |||
| size_t s = con.find("thread") != std::string::npos ? 0 : 3; | |||
| size_t e = con.find("thread") != std::string::npos ? 3 : 6; | |||
| for (size_t i = s; i < e; ++i) { | |||
| if (length == 1) { | |||
| *ptr++ = analyzer_.binding_spaces_[i].map_mod_; | |||
| } else { | |||
| if (con == "C1_range") { | |||
| TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE1); | |||
| *ptr++ = const_cons.tile_min_.as<IntImm>()->value; | |||
| *ptr++ = const_cons.tile_extent_.as<IntImm>()->value; | |||
| } else if (con == "C0_range") { | |||
| TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE0); | |||
| *ptr++ = const_cons.tile_min_.as<IntImm>()->value; | |||
| *ptr++ = const_cons.tile_extent_.as<IntImm>()->value; | |||
| } else if (con == "C1_mod") { | |||
| TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE1); | |||
| *ptr++ = const_cons.tile_mod_.as<IntImm>()->value; | |||
| } else if (con == "C0_mod") { | |||
| TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE0); | |||
| *ptr++ = const_cons.tile_mod_.as<IntImm>()->value; | |||
| *ptr++ = analyzer_.binding_spaces_[i].map_min_; | |||
| *ptr++ = analyzer_.binding_spaces_[i].map_extent_; | |||
| } | |||
| } | |||
| } else { | |||
| for (size_t b_idx = 0; b_idx < all_axes.size(); ++b_idx) { | |||
| for (size_t a_idx = 0; a_idx < all_axes[b_idx].size(); ++a_idx) { | |||
| if (con == "index") { | |||
| *ptr++ = b_idx; | |||
| *ptr++ = a_idx; | |||
| } else { | |||
| if (con == "C1_range") { | |||
| TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE1); | |||
| *ptr++ = const_cons.tile_min_.as<IntImm>()->value; | |||
| *ptr++ = const_cons.tile_extent_.as<IntImm>()->value; | |||
| } else if (con == "C0_range") { | |||
| TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE0); | |||
| *ptr++ = const_cons.tile_min_.as<IntImm>()->value; | |||
| *ptr++ = const_cons.tile_extent_.as<IntImm>()->value; | |||
| } else if (con == "C1_mod") { | |||
| TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE1); | |||
| *ptr++ = const_cons.tile_mod_.as<IntImm>()->value; | |||
| } else if (con == "C0_mod") { | |||
| TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE0); | |||
| *ptr++ = const_cons.tile_mod_.as<IntImm>()->value; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| if (con == "index") space_->index_table = array; | |||
| if (con == "C1_range") space_->c1_tile_range_table = array; | |||
| if (con == "C0_range") space_->c0_tile_range_table = array; | |||
| if (con == "C1_mod") space_->c1_tile_mod_table = array; | |||
| if (con == "C0_mod") space_->c0_tile_mod_table = array; | |||
| if (con == "gpu_thread_range") space_->gpu_thread_range_table = array; | |||
| if (con == "gpu_block_range") space_->gpu_block_range_table = array; | |||
| if (con == "gpu_thread_mod") space_->gpu_thread_mod_table = array; | |||
| if (con == "gpu_block_mod") space_->gpu_block_mod_table = array; | |||
| delete spaceDlPack; | |||
| } | |||
| } | |||
| @@ -196,7 +228,8 @@ class TileSpaceCollector { | |||
| bool min_tile_ok = false; | |||
| for (int64_t tile = tile_min->value; tile <= tile_extent->value; ++tile) { | |||
| bool break_constraint = | |||
| (tile != tile_min->value) && (tile != tile_extent->value) && (tile % tile_mod->value != 0); | |||
| ((tile != tile_min->value) && (tile != tile_extent->value) && (tile % tile_mod->value != 0)) || | |||
| (axis->forbid_iso && tile_extent->value % tile != 0); | |||
| if (analyzer_.scop_info_.user_config_.GetPruneTuningSpace() && break_constraint) { | |||
| continue; | |||
| } | |||
| @@ -365,7 +398,7 @@ class TileSpaceCollector { | |||
| DLContext ctx = {kDLCPU, 0}; | |||
| std::vector<TileAxis *> tile_axes_; | |||
| std::vector<bool> is_shared_; | |||
| std::unordered_set<std::string> cared_info_ = {"index", "C1_range", "C0_range", "C1_mod", "C0_mod"}; | |||
| std::unordered_set<std::string> cared_info_; | |||
| struct Result { | |||
| std::vector<int> tile; | |||
| @@ -28,6 +28,11 @@ class TileSpaceNode : public Node { | |||
| air::runtime::NDArray c1_tile_mod_table; | |||
| air::runtime::NDArray c0_tile_mod_table; | |||
| air::runtime::NDArray tiling_candidate; | |||
| air::runtime::NDArray gpu_thread_range_table; | |||
| air::runtime::NDArray gpu_block_range_table; | |||
| air::runtime::NDArray gpu_thread_mod_table; | |||
| air::runtime::NDArray gpu_block_mod_table; | |||
| void VisitAttrs(AttrVisitor *v) { | |||
| v->Visit("index_table", &index_table); | |||
| @@ -36,6 +41,11 @@ class TileSpaceNode : public Node { | |||
| v->Visit("c1_tile_mod_table", &c1_tile_mod_table); | |||
| v->Visit("c0_tile_mod_table", &c0_tile_mod_table); | |||
| v->Visit("tiling_candidate", &tiling_candidate); | |||
| v->Visit("gpu_thread_range_table", &gpu_thread_range_table); | |||
| v->Visit("gpu_block_range_table", &gpu_block_range_table); | |||
| v->Visit("gpu_thread_mod_table", &gpu_thread_mod_table); | |||
| v->Visit("gpu_block_mod_table", &gpu_block_mod_table); | |||
| } | |||
| static constexpr const char *_type_key = "TileSpace"; | |||
| TVM_DECLARE_NODE_TYPE_INFO(TileSpaceNode, Node); | |||
| @@ -1351,19 +1351,34 @@ void TilingAnalyzer::AddPostTilingConstraints() { | |||
| if (scop_info_.user_config_.GetTarget() == TARGET_CUDA) { | |||
| ReduceStrategy reduce_strategy(this); | |||
| actived_strategies.push_back(&reduce_strategy); | |||
| ModStrategy mod_strategy(this); | |||
| actived_strategies.push_back(&mod_strategy); | |||
| GemmStrategy gemm_strategy(this); | |||
| GpuDmaAnalysisStrategy dma_analysis_strategy(this); | |||
| CustomTilingStrategy custom_strategy(this); | |||
| GpuStrategy gpu_strategy(this); | |||
| if (scop_info_.analysis_result_.GetIsGpuDmaAnalysed()) { | |||
| actived_strategies.push_back(&dma_analysis_strategy); | |||
| } else { | |||
| if (scop_info_.user_config_.GetIsTuning()) { | |||
| actived_strategies.push_back(&custom_strategy); | |||
| } else { | |||
| actived_strategies.push_back(&reduce_strategy); | |||
| actived_strategies.push_back(&mod_strategy); | |||
| actived_strategies.push_back(&gemm_strategy); | |||
| } | |||
| actived_strategies.push_back(&gpu_strategy); | |||
| } | |||
| strategy_manager->SetStrategies(actived_strategies); | |||
| strategy_manager->ExecuteGpu(); | |||
| if (scop_info_.user_config_.GetIsTuning()) { | |||
| binding_spaces_.clear(); | |||
| for (auto i : gpu_strategy.thread_binding_spaces_) { | |||
| UpdateBindingSpace(i); | |||
| } | |||
| for (auto i : gpu_strategy.block_binding_spaces_) { | |||
| UpdateBindingSpace(i); | |||
| } | |||
| } | |||
| return; | |||
| } | |||
| } | |||
| @@ -1376,7 +1391,6 @@ void TilingAnalyzer::AddTilingConstraints() { | |||
| if (scop_info_.user_config_.GetTarget() == TARGET_CUDA) { | |||
| CastStrategy cast_strategy(this); | |||
| actived_strategies.push_back(&cast_strategy); | |||
| strategy_manager->SetStrategies(actived_strategies); | |||
| strategy_manager->ExecuteGpu(); | |||
| return; | |||
| @@ -1429,7 +1443,7 @@ void TilingAnalyzer::AddTilingConstraints() { | |||
| bool TilingAnalyzer::Prepare() { | |||
| logger_ = std::unique_ptr<TileLogger>(new (std::nothrow) TileLogger( | |||
| scop_info_.AddDumpDir("tiling.log"), !scop_info_.user_config_.GetDumpPolyDir().empty())); | |||
| scop_info_.AddDumpDir("tiling.log"), !scop_info_.user_config_.GetDumpPolyDir().empty())); | |||
| CHECK(logger_) << "memory alloc fail."; | |||
| // Stage 1: Analyze schedule tree. | |||
| ScheduleTreeAnalyzer sch_ana(this, this->sch_); | |||
| @@ -64,7 +64,7 @@ inline int64_t GetAlignBytes(const int64_t dtype) { | |||
| return (ALIGN_BYTES + dtype - 1) / dtype; | |||
| } | |||
| inline int64_t GetMaxAlignBytes(std::unordered_map<std::string, std::vector<int>> dtypes) { | |||
| inline int64_t GetMinBytes(std::unordered_map<std::string, std::vector<int>> dtypes) { | |||
| int64_t min_byte = -1; | |||
| for (const auto &it : dtypes) { | |||
| if (it.second.empty()) { | |||
| @@ -75,7 +75,11 @@ inline int64_t GetMaxAlignBytes(std::unordered_map<std::string, std::vector<int> | |||
| min_byte = min_elem; | |||
| } | |||
| } | |||
| return GetAlignBytes(min_byte); | |||
| return min_byte; | |||
| } | |||
| inline int64_t GetMaxAlignBytes(std::unordered_map<std::string, std::vector<int>> dtypes) { | |||
| return GetAlignBytes(GetMinBytes(dtypes)); | |||
| } | |||
| inline Expr CastToExpr(const std::string &value) { | |||
| @@ -134,6 +138,12 @@ constexpr auto AT_DYNAMIC_BOUND = "DYNAMIC_BOUND"; | |||
| constexpr auto AT_MOD = "MOD"; | |||
| constexpr auto AT_CAST = "CAST"; | |||
| constexpr auto AT_MEM_RATIO = "MEM_RATIO"; | |||
| constexpr auto AT_THREAD_MIN = "THREAD_MIN"; | |||
| constexpr auto AT_THREAD_MAX = "THREAD_MAX"; | |||
| constexpr auto AT_THREAD_MOD = "THREAD_MOD"; | |||
| constexpr auto AT_BLOCK_MIN = "BLOCK_MIN"; | |||
| constexpr auto AT_BLOCK_MAX = "BLOCK_MAX"; | |||
| constexpr auto AT_BLOCK_MOD = "BLOCK_MOD"; | |||
| class TilingAnalyzer; | |||
| @@ -233,12 +243,12 @@ class TilingAnalyzer { | |||
| sch_(sch), | |||
| scop_info_(scop_info), | |||
| is_retry_(!global_attrs.GetStringAttr(kErrorInfo, "").empty()) { | |||
| if (scop_info.mmu_info_.IsGemm()) { | |||
| op_type_ = GEMM_OP; | |||
| } else if (scop_info.mmu_info_.IsConv()) { | |||
| op_type_ = CONV_OP; | |||
| } else { | |||
| op_type_ = VECTOR_OP; | |||
| if (scop_info.mmu_info_.IsGemm()) { | |||
| op_type_ = GEMM_OP; | |||
| } else if (scop_info.mmu_info_.IsConv()) { | |||
| op_type_ = CONV_OP; | |||
| } else { | |||
| op_type_ = VECTOR_OP; | |||
| } | |||
| } | |||
| @@ -292,7 +302,7 @@ class TilingAnalyzer { | |||
| CHECK(logger_); | |||
| return *(logger_.get()); | |||
| } | |||
| void UpdateBindingSpace(TileAxis::MappingConstraint constraint) { binding_spaces_.emplace_back(constraint); } | |||
| Stmt body_; | |||
| Binds &binds_; | |||
| isl::schedule sch_; | |||
| @@ -306,9 +316,8 @@ class TilingAnalyzer { | |||
| std::unordered_map<TilingAnalyzer::BufferEntry *, std::pair<int, int>> buffer_usage_timetable_; | |||
| std::unordered_map<std::string, std::shared_ptr<BufferEntry>> buf_info_; | |||
| bool is_retry_{false}; | |||
| std::vector<TileAxis::MappingConstraint> binding_spaces_; // [thread.x[min, max, mod], thread.y, thread.z, block.x, block.y, block.z] | |||
| private: | |||
| void AddTilingConstraints(); | |||
| void AddPostTilingConstraints(); | |||
| @@ -284,8 +284,6 @@ class GemmStrategy : public TilingStrategy { | |||
| ~GemmStrategy() {} | |||
| void AddNpuConstraint(); | |||
| void AddGpuConstraint(); | |||
| std::string interested_attr_key = AT_GEMM; | |||
| }; | |||
| class GpuStrategy : public TilingStrategy { | |||
| @@ -306,6 +304,8 @@ class GpuStrategy : public TilingStrategy { | |||
| }; | |||
| void AddNpuConstraint(); | |||
| void AddGpuConstraint(); | |||
| std::vector<TileAxis::MappingConstraint> thread_binding_spaces_; // [thread.x, thread.y, thread.z] | |||
| std::vector<TileAxis::MappingConstraint> block_binding_spaces_; // [block.x, block.y, block.z] | |||
| private: | |||
| void DetermineTemplate(); | |||
| @@ -326,6 +326,8 @@ class GpuStrategy : public TilingStrategy { | |||
| // Step 1. Collect axes and sort them from inner to outer | |||
| void BuildAxesQueue(); | |||
| void ApplyCustomConstraint(); | |||
| /* | |||
| * Step 2. Tile inner axes first and map them to threads, and then tile outer axis and map the rest of them to blocks. | |||
| * e.g. | |||
| @@ -357,6 +359,7 @@ class GpuStrategy : public TilingStrategy { | |||
| int64_t min_elem_for_io_bound_ = 2; | |||
| size_t depth_{0}; | |||
| bool need_reverse_{false}; | |||
| bool reverse_binding_{false}; | |||
| int64_t fused_size_{1}; | |||
| std::unordered_map<int, std::string> template_map_ = {{0, "DEFAULT"}, {1, "PURE_ELEM"}, {2, "BROADCAST_OP"}, | |||
| {3, "REDUCTION"}, {4, "ALL_REDUCE"}, {5, "BITWISE_REDUCTION"}, | |||
| @@ -378,7 +381,7 @@ class MulticoreStrategy { | |||
| class TilingPriorityScorer { | |||
| public: | |||
| TilingPriorityScorer(TilingAnalyzer &analyzer) : analyzer_(analyzer), logger_(analyzer.GetTileLogger()) {} | |||
| TilingPriorityScorer(TilingAnalyzer &analyzer) : analyzer_(analyzer), logger_(analyzer.GetTileLogger()) {} | |||
| ~TilingPriorityScorer() {} | |||
| /* | |||
| @@ -18,7 +18,6 @@ | |||
| #include <numeric> | |||
| #include "tiling_analyzer.h" | |||
| namespace akg { | |||
| namespace ir { | |||
| namespace poly { | |||
| @@ -377,13 +376,129 @@ void ReduceStrategy::DealWithPostReduceTensors() { | |||
| } | |||
| } | |||
| void GpuStrategy::ApplyCustomConstraint() { | |||
| auto ParseBindingConstraint = [](const std::string constraint, size_t max_size) { | |||
| std::vector<std::string> sp = akg::common::Split(constraint, ","); | |||
| std::vector<int64_t> ret; | |||
| for (auto val : sp) { | |||
| if (ret.size() == max_size) { | |||
| break; | |||
| } | |||
| CHECK(!val.empty()); | |||
| ret.emplace_back(static_cast<int>(std::strtol(val.c_str(), nullptr, 10))); | |||
| } | |||
| return ret; | |||
| }; | |||
| // init binding space through template-determined limit | |||
| thread_binding_spaces_.clear(); | |||
| block_binding_spaces_.clear(); | |||
| for (size_t i = 0; i < thread_limit_.size(); ++i) { | |||
| TileAxis::MappingConstraint elem; | |||
| elem.map_extent_ = thread_limit_[i]; | |||
| thread_binding_spaces_.emplace_back(elem); | |||
| } | |||
| for (size_t i = 0; i < std::min(depth_, block_limit_.size()); ++i) { | |||
| TileAxis::MappingConstraint elem; | |||
| elem.map_extent_ = block_limit_[i]; | |||
| block_binding_spaces_.emplace_back(elem); | |||
| } | |||
| // add constraints to binding space according to custom tiling | |||
| std::unordered_set<std::string> thread_keys = {AT_THREAD_MIN, AT_THREAD_MAX, AT_THREAD_MOD}; | |||
| std::unordered_set<std::string> block_keys = {AT_BLOCK_MIN, AT_BLOCK_MAX, AT_BLOCK_MOD}; | |||
| for (const auto attr : analyzer_->RootAxis()->attrs) { | |||
| std::vector<int64_t> constraint; | |||
| std::vector<TileAxis::MappingConstraint> target; | |||
| if (thread_keys.find(attr.attr_key) != thread_keys.end()) { | |||
| constraint = ParseBindingConstraint(attr.attr_value, thread_binding_spaces_.size()); | |||
| target = thread_binding_spaces_; | |||
| } else if (block_keys.find(attr.attr_key) != block_keys.end()) { | |||
| constraint = ParseBindingConstraint(attr.attr_value, block_binding_spaces_.size()); | |||
| target = block_binding_spaces_; | |||
| } | |||
| if (constraint.empty()) { | |||
| continue; | |||
| } | |||
| for (size_t i = 0; i < constraint.size(); ++i) { | |||
| if (attr.attr_key.find("MIN") != std::string::npos) { | |||
| target[i].map_min_ = std::max<int64_t>(target[i].map_min_, constraint[i]); | |||
| } else if (attr.attr_key.find("MAX") != std::string::npos && constraint[i] > 0) { | |||
| target[i].map_extent_ = std::min<int64_t>(target[i].map_extent_, constraint[i]); | |||
| } else if (attr.attr_key.find("MOD") != std::string::npos) { | |||
| target[i].map_mod_ = std::max<int64_t>(1, constraint[i]); | |||
| } | |||
| } | |||
| if (thread_keys.find(attr.attr_key) != thread_keys.end()) { | |||
| thread_binding_spaces_ = target; | |||
| } else if (block_keys.find(attr.attr_key) != block_keys.end()) { | |||
| block_binding_spaces_ = target; | |||
| } | |||
| } | |||
| // apply custom constraint to corresponding axis and modify binding space according to tile range of axis | |||
| size_t cur_depth = 0; | |||
| analyzer_->ForEachAxisTopDown([this, &cur_depth](TileAxis *axis) { | |||
| if (axis == analyzer_->RootAxis()) { | |||
| return; | |||
| } | |||
| auto cons = axis->GetConstConstraint(CACHE1); | |||
| auto range_extent = axis->GetConstExtent(); | |||
| int tile_min = cons.tile_min_.as<IntImm>()->value; | |||
| int tile_extent = cons.tile_extent_.as<IntImm>()->value; | |||
| auto idx = reverse_binding_ ? cur_depth : depth_ - 1 - cur_depth; | |||
| auto thread_extent = tile_extent; | |||
| if (idx < thread_binding_spaces_.size()) { | |||
| thread_extent = std::min<int64_t>(thread_extent, thread_binding_spaces_[idx].map_extent_); | |||
| thread_binding_spaces_[idx].map_extent_ = thread_extent; | |||
| } | |||
| auto block_extent = range_extent / tile_min; | |||
| if (idx < block_binding_spaces_.size()) { | |||
| block_extent = std::min<int64_t>(block_extent, block_binding_spaces_[idx].map_extent_); | |||
| block_binding_spaces_[idx].map_extent_ = block_extent; | |||
| } | |||
| auto block_min = block_extent / std::max<int64_t>(1, thread_extent); | |||
| if (idx < block_binding_spaces_.size()) { | |||
| block_min = std::max<int64_t>(block_min, block_binding_spaces_[idx].map_min_); | |||
| block_binding_spaces_[idx].map_min_ = block_min; | |||
| } | |||
| axis->thread_constraints.map_extent_ = thread_extent; | |||
| axis->block_constraints.map_extent_ = block_extent; | |||
| axis->block_constraints.map_min_ = block_min; | |||
| if (idx < thread_binding_spaces_.size()) { | |||
| axis->thread_constraints.map_mod_ = thread_binding_spaces_[idx].map_mod_; | |||
| } | |||
| if (idx < block_binding_spaces_.size()) { | |||
| axis->block_constraints.map_mod_ = block_binding_spaces_[idx].map_mod_; | |||
| } | |||
| ++cur_depth; | |||
| }); | |||
| } | |||
| void GpuStrategy::AddGpuConstraint() { | |||
| InitMappingLimit(); | |||
| if (template_ == Template::BROADCAST_OP || template_ == Template::CUSTOM_CONFIG) { | |||
| if (!analyzer_->scop_info_.user_config_.GetIsTuning() && | |||
| (template_ == Template::BROADCAST_OP || template_ == Template::CUSTOM_CONFIG)) { | |||
| BroadcastSpeedup(); | |||
| } | |||
| BuildAxesQueue(); | |||
| if (analyzer_->scop_info_.user_config_.GetIsTuning()) { | |||
| ApplyCustomConstraint(); | |||
| for (size_t i = 0; i < max_dim_; ++i) { | |||
| TileAxis::MappingConstraint pad; | |||
| if (i >= thread_binding_spaces_.size()) { | |||
| thread_binding_spaces_.emplace_back(pad); | |||
| } | |||
| if (i >= block_binding_spaces_.size()) { | |||
| block_binding_spaces_.emplace_back(pad); | |||
| } | |||
| } | |||
| return; | |||
| } | |||
| InnerThreadOuterBlock(); | |||
| @@ -391,19 +506,27 @@ void GpuStrategy::AddGpuConstraint() { | |||
| InjectiveSpeedup(); | |||
| } | |||
| SetMappingConfig(); | |||
| if (template_ != Template::MATMUL || !analyzer_->scop_info_.user_config_.GetEnableTensorCore()) { | |||
| analyzer_->ForEachAxisTopDown([this](TileAxis *axis) { | |||
| if (axis == analyzer_->RootAxis()) { | |||
| return; | |||
| } | |||
| axis->TileRestrainToSingleValue(axis->c1_constraints.tile_min_, TileLevel::CACHE0); | |||
| }); | |||
| } | |||
| } | |||
| void GpuStrategy::InitMappingLimit() { | |||
| max_num_threads_ = analyzer_->scop_info_.user_config_.GetMaxElemPerThread(); | |||
| DetermineTemplate(); | |||
| std::stringstream ss; | |||
| need_reverse_ = analyzer_->scop_info_.user_config_.GetEnableAkgReduceLib() && | |||
| analyzer_->scop_info_.analysis_result_.GetReduceDirection() == Y_DIRECTION; | |||
| reverse_binding_ = analyzer_->scop_info_.user_config_.GetEnableAkgReduceLib() && | |||
| analyzer_->scop_info_.analysis_result_.GetReduceDirection() == Y_DIRECTION; | |||
| if (template_ == Template::CUSTOM_CONFIG) { | |||
| auto thread_config = analyzer_->scop_info_.user_config_.GetThreadConfig(); | |||
| for (size_t i = 0; i < thread_config->bound; ++i) { | |||
| auto idx = need_reverse_ ? thread_config->bound - 1 - i : i; | |||
| auto idx = reverse_binding_ ? thread_config->bound - 1 - i : i; | |||
| if (idx >= depth_) { | |||
| continue; | |||
| } | |||
| @@ -427,12 +550,16 @@ void GpuStrategy::InitMappingLimit() { | |||
| } else if (template_ == Template::MATMUL) { | |||
| // This is a naive tiling strategy used in gpu when thread and block configs are already set. | |||
| // This strategy will tile up to three inner-most axes to 32 (for thread binding). | |||
| thread_limit_ = {32, 8}; | |||
| if (analyzer_->scop_info_.user_config_.GetEnableTensorCore()) { | |||
| thread_limit_ = {warp_sizes_, 16}; | |||
| } else { | |||
| thread_limit_ = {warp_sizes_, 8}; | |||
| } | |||
| } else { | |||
| thread_limit_ = {max_x_y_dim_thread_, max_x_y_dim_thread_, max_z_dim_thread_}; | |||
| } | |||
| if (template_ != Template::CUSTOM_CONFIG) { | |||
| if (template_ != Template::CUSTOM_CONFIG && !analyzer_->scop_info_.user_config_.GetEnableTensorCore()) { | |||
| AdjustThreadMappingLimit(); | |||
| } | |||
| @@ -505,13 +632,21 @@ void GpuStrategy::InnerThreadOuterBlock() { | |||
| tile = tile == SpItemPerThread::AUTO ? std::min(axis->thread_constraints.item_process_, max_elem_per_thread_) | |||
| : tile == SpItemPerThread::FULL ? std::min(shape, max_elem_per_thread_) | |||
| : 1; | |||
| if (axis->block_constraints.map_extent_ > 1) { | |||
| tile = | |||
| std::max(tile, std::max<int64_t>(ceil(static_cast<float>(shape) / axis->block_constraints.map_extent_), 1)); | |||
| pending_axes_.push_back(std::make_pair(axis, std::max<int64_t>(ceil(static_cast<float>(shape) / tile), 1))); | |||
| ss << ", map to block."; | |||
| auto tile_min = axis->c1_constraints.tile_min_.as<IntImm>()->value; | |||
| auto tile_extent = axis->c1_constraints.tile_extent_.as<IntImm>()->value; | |||
| if (tile_min == tile_extent && tile_extent != MIN_TILE) { | |||
| ss << "tile extent is already determined = " << tile_extent; | |||
| analyzer_->GetTileLogger().AppendLog(GPU_MAPPING, ss); | |||
| tile = tile_min; | |||
| } else { | |||
| tile = std::min(tile, shape); | |||
| if (axis->block_constraints.map_extent_ > 1) { | |||
| tile = | |||
| std::max(tile, std::max<int64_t>(ceil(static_cast<float>(shape) / axis->block_constraints.map_extent_), 1)); | |||
| pending_axes_.push_back(std::make_pair(axis, std::max<int64_t>(ceil(static_cast<float>(shape) / tile), 1))); | |||
| ss << ", map to block."; | |||
| } else { | |||
| tile = std::min(tile, shape); | |||
| } | |||
| } | |||
| axis->TileRestrainLower(tile, TileLevel::CACHE1); | |||
| ss << ", tile = " << tile; | |||
| @@ -522,19 +657,11 @@ void GpuStrategy::InnerThreadOuterBlock() { | |||
| rest_threads = std::min(rest_threads, axis->thread_constraints.map_extent_); | |||
| } | |||
| if (thread_cfg_.size() >= thread_dim || inner_dim >= max_dim_) { | |||
| if (rest_threads <= 1 || thread_cfg_.size() >= thread_dim || inner_dim >= max_dim_) { | |||
| ss << ", no thread/dim rests"; | |||
| SkipMapping(); | |||
| continue; | |||
| } | |||
| if (rest_threads <= 1) { | |||
| if (axis->mc_sup || | |||
| (template_ == Template::REDUCTION && analyzer_->scop_info_.user_config_.GetEnableAkgReduceLib())) { | |||
| thread_cfg_.emplace_back(1); | |||
| } | |||
| SkipMapping(); | |||
| continue; | |||
| } | |||
| auto item = elem_per_thread_[inner_dim] == SpItemPerThread::AUTO ? axis->thread_constraints.item_process_ | |||
| : elem_per_thread_[inner_dim]; | |||
| item = std::min(item, max_elem_per_thread_); | |||
| @@ -575,6 +702,7 @@ void GpuStrategy::InnerThreadOuterBlock() { | |||
| if (pending_axes_.size() - i > block_dim) { | |||
| auto axis = pending_axes_[i].first; | |||
| ss << "axis " << axis->index << "_" << axis->dim_axis | |||
| << " exceeded block dim and should be mapped to block for higher performance, consider flatten"; | |||
| analyzer_->GetTileLogger().AppendLog(GPU_MAPPING, ss); | |||
| continue; | |||
| @@ -594,7 +722,7 @@ void GpuStrategy::InnerThreadOuterBlock() { | |||
| int64_t shape; | |||
| std::tie(axis, shape) = pending_axes_[i]; | |||
| auto idx = pending_axes_.size() - 1 - i; | |||
| idx = need_reverse_ ? block_limit_.size() - 1 - idx : idx; | |||
| idx = reverse_binding_ ? block_limit_.size() - 1 - idx : idx; | |||
| auto rest_blocks = std::min(max_num_blocks_ / activated_blocks, block_limit_[idx]); | |||
| rest_blocks = std::min(rest_blocks, axis->block_constraints.map_extent_); | |||
| ss << "axis " << axis->index << "_" << axis->dim_axis << " shape = " << shape << ", rest blocks = " << rest_blocks; | |||
| @@ -635,11 +763,9 @@ void GpuStrategy::SetMappingConfig() { | |||
| if (block_cfg_.empty()) { | |||
| block_cfg_.emplace_back(1); | |||
| } | |||
| bool reverse_binding = (analyzer_->scop_info_.user_config_.GetEnableAkgReduceLib() && | |||
| analyzer_->scop_info_.analysis_result_.GetReduceDirection() == Y_DIRECTION); | |||
| std::string block_str = ""; | |||
| std::string thread_str = ""; | |||
| if (reverse_binding) { | |||
| if (reverse_binding_) { | |||
| for (int i = 0; i < static_cast<int>(block_cfg_.size()); ++i) { | |||
| if (i >= block_count_) { | |||
| continue; | |||
| @@ -753,7 +879,7 @@ int64_t GpuStrategy::TileAfterThreadMapping(TileAxis *axis, size_t inner_dim, in | |||
| tile = thread_size; | |||
| ss << "tile = thread size, "; | |||
| } else { | |||
| auto block_dim = need_reverse_ ? inner_dim : block_limit_.size() - 1 - inner_dim; | |||
| auto block_dim = reverse_binding_ ? inner_dim : block_limit_.size() - 1 - inner_dim; | |||
| int64_t least_blocks; | |||
| if (block_dim >= 0 && block_dim < block_limit_.size()) { | |||
| least_blocks = block_limit_[block_dim]; | |||
| @@ -1139,12 +1265,70 @@ void GpuStrategy::GpuVectorBroadcastStrategy() { | |||
| } | |||
| } | |||
// Apply user-provided custom tiling constraints to each interested tile axis
// on GPU.
//
// attr_value grammar (after optional "<lhs>-><rhs>" indirection, in which only
// the rhs is kept): "LEVEL:<C1|C0>_<ITEM>:<VALUE>[_<ITEM>:<VALUE>...]"
// where <ITEM> is one of MIN, MAX, FACTOR, FORBIDISO, or the AT_MOD key.
void CustomTilingStrategy::AddGpuConstraint() {
  auto interested_info = GetInterestedInfo(interested_attr_key, false);
  for (auto it : interested_info) {
    TileAxis *axis = it.first;
    for (auto attr : it.second) {
      std::vector<std::string> modes = akg::common::Split(attr.attr_key, ":");
      CHECK_EQ(modes.size(), 2U);
      std::string constraint_str = attr.attr_value;
      // "a->b" means the constraint was forwarded; keep only the target part.
      if (constraint_str.find("->") != std::string::npos) {
        std::vector<std::string> res = akg::common::Split(constraint_str, "->");
        constraint_str = res[1];
      }
      std::vector<std::string> constraints = akg::common::Split(constraint_str, "_");
      CHECK_GE(constraints.size(), 1U);
      // The first token selects which cache level the constraints apply to.
      std::vector<std::string> level = akg::common::Split(constraints[0], ":");
      CHECK(level.size() == 2U && level[0] == "LEVEL");
      CHECK(level[1] == "C1" || level[1] == "C0");
      TileLevel lv = level[1] == "C1" ? CACHE1 : CACHE0;
      constraints.erase(constraints.begin());
      // Remaining tokens are "ITEM:VALUE" pairs applied to the chosen level.
      for (const auto &con : constraints) {
        std::vector<std::string> items = akg::common::Split(con, ":");
        CHECK_EQ(items.size(), 2U);
        CHECK_NE(items[0], "");
        CHECK_NE(items[1], "");
        if (items[0] == "MIN") {
          if (items[1] == "MIN") {
            // MIN:MIN pins the tile extent down to its minimal size.
            if (lv == CACHE1) {
              axis->c1_constraints.tile_extent_ = axis->c1_constraints.tile_min_;
            } else if (lv == CACHE0) {
              axis->c0_constraints.tile_extent_ = axis->c0_constraints.tile_min_;
            }
          } else {
            // MIN:<n> raises the lower bound of the tile size.
            if (lv == CACHE1) {
              axis->c1_constraints.tile_min_ = CastToExpr(items[1]);
            } else if (lv == CACHE0) {
              axis->c0_constraints.tile_min_ = CastToExpr(items[1]);
            }
          }
        } else if (items[0] == "FACTOR") {
          // FACTOR:<n> fixes the tile size to exactly n.
          axis->TileRestrainToSingleValue(CastToExpr(items[1]), lv);
        } else if (items[0] == "FORBIDISO") {
          // Forbid isolated (tail) tiles on this axis.
          axis->forbid_iso = true;
        } else if (items[0] == "MAX") {
          if (items[1] == "FULL") {
            // MAX:FULL lets the tile cover the whole axis.
            axis->TileRestrainEntire(lv);
          } else {
            // MAX:<n> caps the upper bound of the tile size.
            if (lv == CACHE1) {
              axis->c1_constraints.tile_extent_ = CastToExpr(items[1]);
            } else if (lv == CACHE0) {
              axis->c0_constraints.tile_extent_ = CastToExpr(items[1]);
            }
          }
        } else if (items[0] == AT_MOD) {
          // Constrain the tile size to be a multiple of the given value.
          axis->TileRestrainMod(CastToExpr(items[1]), lv);
        }
      }
    }
  }
}
// No constraint found in cuda
// NOTE(review): these empty stubs sit next to a full definition of
// CustomTilingStrategy::AddGpuConstraint above — this looks like patch
// residue (the diff likely removed the stub); confirm which version is
// current before building.
void ModStrategy::AddGpuConstraint() {}
void CustomTilingStrategy::AddGpuConstraint() {}
void ConflictTreeRangeStrategy::AddGpuConstraint() {}
void VectorizedStrategy::AddGpuConstraint() {}
| @@ -0,0 +1,17 @@ | |||
import sys


def sort_log(src_path, dst_path):
    """Sort "...|config|time|..." log lines by (time, config) and rewrite them.

    Each input line is split on "|"; field 1 is the config string and field 2
    is its elapsed time.  A config that appears more than once keeps its last
    recorded time.  Output lines have the form "|<config>|<time>\n".

    Args:
        src_path: path of the raw tuning log to read.
        dst_path: path of the sorted log to write (overwritten).
    """
    timings = {}
    # `with` guarantees the handles are closed; the original leaked them on
    # any parsing error.
    with open(src_path, 'r') as f_in:
        for line in f_in:
            fields = line.split("|")
            timings[str(fields[1])] = float(fields[2])
    with open(dst_path, "wt") as f_out:
        # Primary key: elapsed time; tie-break on the config string, exactly
        # as the original dict-rebuild-then-iterate did.
        for cfg, elapsed in sorted(timings.items(), key=lambda kv: (kv[1], kv[0])):
            f_out.write("|" + str(cfg) + "|" + str(elapsed) + "\n")


if __name__ == "__main__":
    sort_log(str(sys.argv[1]), str(sys.argv[2]))
| @@ -0,0 +1,95 @@ | |||
| from .kernel_compiler import compile_kernel | |||
| from collections import namedtuple | |||
| from .space import ListConfigSpace | |||
def get_reduce_axis_length(in_shape, reduce_axis):
    """Return (lx, ly) element counts for a reduce op.

    lx/ly are the products of the shape dims mapped to the x/y directions:
    - all-reduce (reduce_axis is None or covers every dim): everything in lx;
    - innermost dim reduced ("reduce-x"): reduced dims in lx, kept dims in ly;
    - otherwise ("reduce-y"): reduced dims in ly, kept dims in lx.

    Args:
        in_shape: sequence of ints, the input tensor shape.
        reduce_axis: list of reduced dim indices, or None for all-reduce.

    Returns:
        Tuple (lx, ly) of ints; unused direction stays 1.
    """
    lx, ly = 1, 1
    # `is None` instead of `== None` (identity check is the Python idiom).
    if reduce_axis is None or len(reduce_axis) == len(in_shape):
        # All-reduce: flatten everything into the x direction.
        for dim in in_shape:
            lx *= dim
    elif (len(in_shape) - 1) in reduce_axis:
        # Reduce-x: the innermost axis is reduced.
        for i, dim in enumerate(in_shape):
            if i in reduce_axis:
                lx *= dim
            else:
                ly *= dim
    else:
        # Reduce-y: reduction happens on non-innermost axes.
        for i, dim in enumerate(in_shape):
            if i in reduce_axis:
                ly *= dim
            else:
                lx *= dim
    return lx, ly
def _get_space_reduce_gpu_manually(op_type: str, op_desc, tuning_attrs=None, tuning_attrs_info=None):
    """get config space of reduce_sum operators in gpu

    Hand-enumerates tiling/block/thread configs for the three reduce patterns
    (all-reduce, reduce on the innermost axis, reduce elsewhere) instead of
    querying the auto-tiling module.

    Args:
        op_type: operator name used for the config namedtuple.
        op_desc: op descriptor; op_desc[2] carries .in_shape and .axis.
        tuning_attrs: kept for signature parity with other space generators;
            unused here (default changed from a shared mutable [] to None).
        tuning_attrs_info: pair-like; [0] is extra attr names, [1] their
            value combinations.

    Returns:
        (index_table, space, kernel_key, expect, input_for_mod).
    """
    space_res, kernel_key, expect, input_for_mod = compile_kernel(op_type, op_desc, None, None, None, 0,
                                                                  gen_tiling_spaces=True)
    in_shape, reduce_axis = op_desc[2].in_shape, op_desc[2].axis
    # One tiling dim for all-reduce, two for partial reduce.
    dim_len = 1 if reduce_axis is None or len(reduce_axis) == len(in_shape) else 2
    dim_names = ['tiling_' + str(i) for i in range(dim_len)]
    dim_names.extend(["block_x", "block_y", "block_z", "thread_x", "thread_y", "thread_z"])
    # BUGFIX: the original loop variable was `key`, which clobbered the kernel
    # key returned by compile_kernel, so the function returned the last attr
    # name instead of the kernel key whenever tuning attrs were present.
    for attr_name in tuning_attrs_info[0]:
        dim_names.append(attr_name)
    lx, ly = get_reduce_axis_length(in_shape, reduce_axis)
    tiling_spaces = []
    if reduce_axis is None or len(reduce_axis) == len(in_shape):
        """all-reduce"""
        # thread_x candidates: powers of two from 16 to 1024.
        possible_tx_list = [2 ** i for i in range(4, 11)]
        for tx in possible_tx_list:
            if tx > lx:
                break
            possible_dim0_list = [d0 for d0 in range(tx, lx + 1, tx)]
            if possible_dim0_list[-1] != lx:
                possible_dim0_list.append(lx)
            for d0 in possible_dim0_list:
                bx = lx // d0 if lx % d0 == 0 else lx // d0 + 1  # ceil(lx / d0)
                tiling_spaces.append([d0, bx, 1, 1, tx, 1, 1])
    elif (len(in_shape) - 1) in reduce_axis:
        """reduce-x"""
        possible_tx_list = [2 ** i for i in range(4, 11)]
        for tx in possible_tx_list:
            if tx > lx:
                break
            ty = 1
            by = ly
            possible_dim1_list = [d1 for d1 in range(tx, lx + 1, tx)]
            if possible_dim1_list[-1] != lx:
                possible_dim1_list.append(lx)
            for d1 in possible_dim1_list:
                bx = lx // d1 if lx % d1 == 0 else lx // d1 + 1  # ceil(lx / d1)
                tiling_spaces.append([1, d1, bx, by, 1, tx, ty, 1])
    else:
        """reduce-y"""
        tx = min(32, lx)
        bx = lx // tx if lx % tx == 0 else lx // tx + 1  # ceil(lx / tx)
        d0 = tx
        for ty in range(min(8, ly), 1025):
            if ty * tx > 1024:  # cap of 1024 threads per block
                break
            possible_dim1_list = [d1 for d1 in range(ty, ly + 1, ty)]
            for d1 in possible_dim1_list:
                by = ly // d1 if ly % d1 == 0 else ly // d1 + 1  # ceil(ly / d1)
                tiling_spaces.append([d0, d1, bx, by, 1, tx, ty, 1])
    input_type = namedtuple(op_type, dim_names)
    space = ListConfigSpace(input_type)
    if len(tuning_attrs_info[0]) != 0:
        # Cross every tiling candidate with every tuning-attr combination.
        for tiling_space in tiling_spaces:
            for tuning_attrs_config in tuning_attrs_info[1]:
                tmp = tiling_space[:]
                tmp.extend(tuning_attrs_config)
                config = input_type(*tmp)
                space.add(config)
    else:
        for tiling_space in tiling_spaces:
            config = input_type(*tiling_space)
            space.add(config)
    return space_res.index_table, space, kernel_key, expect, input_for_mod
| @@ -0,0 +1,501 @@ | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| """AutoTuning job""" | |||
| import os | |||
| import json | |||
| import time | |||
| import datetime | |||
| import importlib | |||
| import logging | |||
| import pandas as pd | |||
| import subprocess | |||
| import numpy as np | |||
| from collections import namedtuple | |||
| from multiprocessing import Process, Manager | |||
| from akg import composite | |||
| from akg.utils import kernel_exec as utils | |||
| from akg.composite.build_module import generate_trait | |||
| from autotuning.runner import KernelRunner, error_time_list, error_time_string | |||
| from autotuning.tuner import ModelBasedTuner, Tuner | |||
| from autotuning.type_definitions import ConvDesc, ConvBackpropDesc, MatmulCubeDesc | |||
| from autotuning.space_generators import get_space | |||
| from autotuning.space import ListConfigSpace | |||
| from autotuning.test_data_generators import gen_data | |||
| from autotuning.space_generators import gen_bool_list | |||
| from autotuning.tuning_utils import * | |||
# Root logger at DEBUG so tuner internals remain visible during long runs.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger('fuzz.tune.autotuning.job')

# Directory where tuning artifacts (json repos, error lists, logs) land.
storage_dir = './res/'
# exist_ok avoids the check-then-create race of the original
# `if not os.path.exists(...): os.makedirs(...)` pair.
os.makedirs(storage_dir, exist_ok=True)

# Per-op-type result file template and default location of shape files.
json_file = './res/' + "{0}" + ".json"
json_load = './autotuning/shapes/' + "{0}"
def get_repo(repo, keys, default=None):
    """Follow `keys` through nested dicts; return `default` on any miss.

    A falsy intermediate value (None, {}, 0, "") counts as a miss.  With an
    empty `keys`, `repo` itself is returned unchanged.
    """
    node = repo
    for step in keys:
        node = node.get(step)
        if not node:
            return default
    return node
def get_json_space(json_input, space_dict):
    """Query the composite tiling space (level 2) for `json_input` and
    publish it under key 'res' in the shared `space_dict`.

    Runs inside a worker Process; the caller checks for 'res' to detect a
    timeout or crash.
    """
    space_dict['res'] = composite.get_tiling_space(json_input, 2)
def launch_json(debug_mode: bool = True, save_res: bool = False, json_dir="", repo_path="", all_space=False,
                skip_exist=True, extra_tune=False, self_attrs=[], tuning_attrs=[]):
    """composite json tuning launch

    Tunes every composite-op json file in `json_dir`, skipping those already
    recorded in the repo at `repo_path` (when skip_exist).  Failures at each
    stage are appended to res/*_list.txt files and the file is skipped.

    NOTE(review): the mutable defaults `self_attrs=[]` / `tuning_attrs=[]`
    are shared across calls — harmless only as long as they are never
    mutated; confirm before extending.
    """
    subprocess.run("mkdir -p res/", shell=True)
    iter_times = [3, 3, 3] if debug_mode else [80, 160, 320]
    files = os.listdir(json_dir)
    with open(repo_path, 'r') as f:
        repo = json.loads(f.read())
    for input_file in files:
        print("----Start tuning for ", input_file)
        with open(json_dir + '/' + input_file, 'r') as f:
            json_input = f.read()
        json_content = json.loads(json_input)
        # Scalar inputs arrive with shape []; normalize to [1].
        for input_desc in json_content["input_desc"]:
            if input_desc[0]["shape"] == []:
                input_desc[0]["shape"] = [1]
        json_input = json.dumps(json_content)
        # skip tuning for info in repo
        if skip_exist:
            compute, shape, dtype = generate_trait(json_content)
            if get_repo(repo, [compute, shape, dtype]):
                print("Info for %s already exists" % input_file)
                print("ops are ", str(compute))
                print("shape is ", str(shape))
                print("dtype is ", str(dtype))
                with open('res/skip_file.txt', 'a') as fe:
                    fe.write(input_file)
                    fe.write("\n")
                continue
        # generate tuning space
        if not extra_tune:
            time_start_get_space = time.time()
            # Space generation runs in a child process with a 10-minute cap;
            # absence of 'res' afterwards means timeout or crash.
            with Manager() as manager:
                space_dict = manager.dict()
                p = Process(target=get_json_space,
                            args=(json_input, space_dict))
                p.start()
                p.join(600)
                if 'res' not in space_dict:
                    with open('res/error_space_list.txt', 'a') as fe:
                        fe.write(input_file)
                        fe.write("\n")
                    continue
                space_res = space_dict['res']
            time_end_get_space = time.time()
            print("get space time: ", time_end_get_space - time_start_get_space)
            index_table = space_res['index']
            tiling_spaces = space_res['tuning_space']
            # A non-list tuning_space signals an empty space (see EMPTY_CODE
            # handling in the build module).
            if not isinstance(tiling_spaces, list):
                with open('res/empty_space_list.txt', 'a') as fe:
                    fe.write(input_file)
                    fe.write("\n")
                continue
            dim_names = ['tiling_' + str(i)
                         for i in range(len(tiling_spaces[0]))]
            # Only cross in tuning attrs when the tiling space is small enough
            # to keep the product tractable.
            use_tuning_attrs = len(tiling_spaces) < 10 ** 5
            if tuning_attrs and use_tuning_attrs:
                dim_names.extend(tuning_attrs)
            input_type = namedtuple("json", dim_names)
            space = ListConfigSpace(input_type)
            if tuning_attrs and use_tuning_attrs:
                attr_options = gen_bool_list(tuning_attrs)
                for tiling_space in tiling_spaces:
                    for attr_option in attr_options:
                        tmp = tiling_space[:]
                        tmp.extend(attr_option)
                        config = input_type(*tmp)
                        space.add(config)
            else:
                for tiling_space in tiling_spaces:
                    config = input_type(*tiling_space)
                    space.add(config)
        else:
            # extra_tune: tune only over boolean self_attrs combinations.
            index_table = []
            pre_lists = gen_bool_list(self_attrs)
            pre_input_type = namedtuple("extra_tune", self_attrs)
            space = ListConfigSpace(pre_input_type)
            for item in pre_lists:
                config = pre_input_type(*item)
                space.add(config)
        key = json_content["op"]
        try:
            input_for_mod, expect = gen_data(
                op_type="json", op_desc=json_input)
        except BaseException as e:
            # Deliberate best-effort: data-generation failure is logged to a
            # list file and the op is skipped, not fatal.
            logger.debug(
                "gen numpy data from [%s] failed: %s", input_file, str(e))
            with open('res/error_gen_data_list.txt', 'a') as fe:
                fe.write(input_file)
                fe.write(": ")
                fe.write(str(e))
                fe.write("\n")
            continue
        print('space size:', space.length)
        print('index table:', index_table)
        output_para = None  # this is for multi-output
        if len(json_content["output_desc"]) > 1:
            # Negative indices of the outputs within the flat arg list.
            output_para = []
            for i in range(len(json_content["output_desc"])):
                output_para.append(i - len(json_content["output_desc"]))
        runner = KernelRunner(op_type="json", op_desc=json_input, index_table=index_table, self_attrs=self_attrs,
                              input_data=input_for_mod, expect=expect, mod_output_param=output_para, timeout=180,
                              repeat_times=1)
        # we can only get a valid tiling, or accurate get cycles
        # NOTE(review): raises KeyError when RUNTIME_MODE is unset — confirm
        # the environment always defines it.
        is_truly_profiling = utils.get_profiling_mode(
        ) or os.environ['RUNTIME_MODE'] == "gpu"
        # available device numbers, normally is 8 or 1
        available_device_numbers = utils.get_available_devices_num()
        if all_space:
            tuner = Tuner(runner, index_table, space,
                          n_parallel=available_device_numbers)
            least_try_times = 3  # space.length
        else:
            tuner = ModelBasedTuner(runner, index_table, space,
                                    n_parallel=available_device_numbers if is_truly_profiling else 1,
                                    plan_size=64, pre_model=None)
            # Budget scales with space size: <1e4 -> iter_times[0],
            # <1e5 -> [1], otherwise [2].
            least_try_times = iter_times[0 if space.length <
                                         10 ** 4 else 1 if space.length < 10 ** 5 else 2]
        tuner.tune(least_try_times, output_file="json.log")
        print_tuning_result("json", space, index_table, tuner, key)
        if save_res:
            if extra_tune:
                save_tuning_result(key, "extra_tune",
                                   json_content, index_table, tuner, repo_path)
            else:
                save_tuning_result(key, "json", json_content,
                                   index_table, tuner, repo_path)
def jobs(op_type: str = 'add', desc=None, debug_mode: bool = True, save_res: bool = False,
         all_space: bool = True, insert_key='', conf_of_set_dim="", tuning_attrs=None, skip_config_set=None, tuning_attrs_info=None):
    """AutoTuning jobs

    Generates the tiling space for (op_type, desc) and tunes over it with an
    exhaustive Tuner (all_space) or a ModelBasedTuner, then prints the result.
    Returns early without tuning when `key` already exists in conf_of_set_dim.

    NOTE(review): debug_mode/save_res are currently unused (result saving is
    commented out below); retained for signature compatibility with callers.
    """
    # BUGFIX: avoid the shared mutable-default pitfall; the previous default
    # was `tuning_attrs=[]`, evaluated once at definition time.
    tuning_attrs = [] if tuning_attrs is None else tuning_attrs
    time_start_get_space = time.time()
    index_table, space, key, expect, input_for_mod = get_space(
        op_type, desc, tuning_attrs=tuning_attrs, tuning_attrs_info=tuning_attrs_info)
    time_end_get_space = time.time()
    print("get space time: ", time_end_get_space - time_start_get_space)
    print('space size:', space.length)
    print('index table:', index_table)
    key = key if insert_key == '' else insert_key
    # filter already tuned shape
    if isinstance(conf_of_set_dim, dict) and key in conf_of_set_dim:
        if isinstance(conf_of_set_dim[key], (list, tuple)) and conf_of_set_dim[key]:
            return
        if isinstance(conf_of_set_dim[key], dict):
            return
    output_para = None  # this is for multi-output
    if isinstance(input_for_mod, dict):
        input_for_mod, output_para = input_for_mod['args'], input_for_mod['outputs']
    # NOTE(review): tuning_attrs_info[2] assumes the caller always supplies a
    # sequence of at least three entries — confirm against call sites.
    runner = KernelRunner(op_type, desc, index_table,
                          self_attrs=None, input_data=input_for_mod,
                          expect=expect, mod_output_param=output_para,
                          timeout=30, repeat_times=1,
                          is_all_space=all_space,
                          skip_config_set=skip_config_set,
                          need_tune_json=tuning_attrs_info[2])
    # we can only get a valid tiling, or accurate get cycles
    is_truly_profiling = utils.get_profiling_mode()
    # number of multi-processing for build kernels
    available_device_numbers = get_parallel_build_num()
    time_start_tuning = time.time()
    # NOTE(review): the original computed an `iter_times` budget table here
    # but never used it (both branches try the full space); removed as dead.
    if all_space:
        tuner = Tuner(runner, index_table, space,
                      n_parallel=available_device_numbers)
        least_try_times = space.length
    else:
        tuner = ModelBasedTuner(runner, index_table, space,
                                n_parallel=available_device_numbers if is_truly_profiling else 1,
                                plan_size=100, pre_model=None)
        least_try_times = space.length
    tuner.tune(least_try_times, output_file=op_type + ".log")
    time_end_tuning = time.time()
    print("tuning time: ", time_end_tuning - time_start_tuning)
    print_tuning_result(op_type, space, index_table, tuner, key)
    # save_results_to_csv(op_type, space, index_table, tuner, key)
    # if save_res:
    #     save_tuning_result(key, op_type, desc, index_table, tuner)
def print_tuning_result(op_type, space, index_table, tuner, key):
    """Dump a human-readable summary of one tuning session to stdout."""
    print(op_type + " shape is:", key)
    print('space size:', space.length)
    print('index table:', index_table)
    print('best config:', tuner.best_config)
    # Error sentinels are rendered via their descriptive strings.
    best_shown = tuner.best_time
    if best_shown in error_time_string.keys():
        best_shown = error_time_string[best_shown]
    print('best time:', best_shown)
    print('original time:', tuner.original_time)
    print('optimal result is ', tuner.original_time /
          tuner.best_time, "faster then auto set dim.")
    print("total try times", len(tuner.xs))
    # Replay every trial with its (possibly error-sentinel) timing.
    for trial, elapsed in zip(tuner.xs, tuner.ys):
        shown = error_time_string[elapsed] if elapsed in error_time_string.keys() else elapsed
        print(space.get(trial), shown)
def save_results_to_csv(op_type, space, index_table, tuner, key):
    """save all results to csv

    Writes every tried (config, time) pair to "<op_type>_<key>.csv"; errored
    runs are recorded with the sentinel 9999999.

    NOTE(review): index_table is unused here; kept for signature parity with
    print_tuning_result / save_tuning_result.
    """
    data = []
    for x, y in zip(tuner.xs, tuner.ys):
        # `in` tests the dict directly; the original `.keys()` was redundant.
        data.append([space.get(x), y if y not in error_time_string else 9999999])
    df = pd.DataFrame(data, columns=["config", "time"])
    df.to_csv(op_type + "_" + key + ".csv")
def save_tuning_result(key, op_type, desc, index_table, tuner, repo_path="", extra_tune=False, platform="gpu"):
    """save tuning result

    Converts tuner.best_config into the op-specific tiling format and exports
    it: composite "json"/"extra_tune" results go to the shared repo file,
    everything else to the per-op json file.  When tuning produced no valid
    result, nothing usable is saved.
    """
    if tuner.best_config is not None and tuner.best_time not in error_time_list:
        set_dim_configs = tuner.best_config.input
        if op_type == "matmul":
            # Cube matmul: leading batch dims get (1, 1); then optional
            # (n_l1, n_l0) / (m_l1, m_l0) pairs, fixed 16x16 fractal dims,
            # and the (k_l1, k_l0) pair.
            param = []
            for _ in range(len(desc.x_shape) - 2):
                param.append((1, 1))
            if set_dim_configs.n_l1 > 0:
                param.append((set_dim_configs.n_l1, set_dim_configs.n_l0))
            if set_dim_configs.m_l1 > 0:
                param.append((set_dim_configs.m_l1, set_dim_configs.m_l0))
            param.extend(
                [(16, 16), (16, 16), (set_dim_configs.k_l1, set_dim_configs.k_l0)])
            tiling_param = (param, {"bypass": set_dim_configs.bypass})
        # special case with different tiling parameter format
        elif op_type in ("conv", "conv_bn1"):
            param = []
            tile_hh = set_dim_configs.tile_h
            tile_coco = set_dim_configs.tile_co
            tile_mm = set_dim_configs.tile_m
            tile_kk = set_dim_configs.tile_k
            tile_nn = set_dim_configs.tile_n
            tile_ww = set_dim_configs.tile_w
            param = [tile_hh, tile_coco, tile_mm, tile_kk, tile_nn, tile_ww]
            tiling_param = (param, {"bypass": set_dim_configs.bypass})
        elif op_type == "conv_backprop_input":
            param = []
            tile_hh = set_dim_configs.tile_h
            tile_coco = set_dim_configs.tile_co
            tile_mm = set_dim_configs.tile_m
            tile_kk = set_dim_configs.tile_k
            tile_nn = set_dim_configs.tile_n
            tile_ww = set_dim_configs.tile_w
            param = [tile_hh, tile_coco, tile_mm, tile_kk, tile_nn, tile_ww]
            # (param) is just param, not a 1-tuple — no attrs dict here.
            tiling_param = (param)
        elif op_type == "conv_backprop_filter":
            param = []
            tile_cici = set_dim_configs.tile_ci
            tile_khkh = set_dim_configs.tile_kh
            tile_kwkw = set_dim_configs.tile_kw
            tile_coco = set_dim_configs.tile_co
            tile_bb = set_dim_configs.tile_batch
            tile_hh = set_dim_configs.tile_h
            tile_ww = set_dim_configs.tile_w
            tile_mm = set_dim_configs.tile_m
            tile_kk = set_dim_configs.tile_k
            tile_nn = set_dim_configs.tile_n
            param = [tile_cici, tile_khkh, tile_kwkw, tile_coco,
                     tile_bb, tile_hh, tile_ww, tile_mm, tile_kk, tile_nn]
            tiling_param = (param)
        elif ("batch_matmul" in op_type) and (platform == "gpu"):
            # Collect all tiling_* fields of the config namedtuple and encode
            # them as the "0 <idx> <value>" dim-string format.
            tiling = [str(getattr(set_dim_configs, name)) for name in getattr(
                set_dim_configs, "_fields") if name.startswith("tiling")]
            tiling_param = ""
            for i, tile_v in enumerate(tiling):
                if i % 2 == 0:
                    tiling_param += "0 " + str(i) + " "
                tiling_param += tile_v + " "
            block_param = get_block_str_from_config(set_dim_configs)
            thread_param = get_thread_str_from_config(set_dim_configs)
            config = {
                'attrs': {
                    'dim': tiling_param,
                    'bind_block': block_param,
                    'bind_thread': thread_param
                },
                'best_cycles': tuner.best_time,
                'original_cycles': tuner.original_time,
                'date': str(datetime.datetime.now()),
                'tuning_time': tuner.tuning_time,
            }
        elif op_type == "json":
            from autotuning.runner import get_attr_from_config
            tiling_param = get_attr_from_config(set_dim_configs, index_table)
        elif op_type == "reduce_sum_gpu":
            print(set_dim_configs)
            tiling = [str(getattr(set_dim_configs, name))
                      for name in getattr(set_dim_configs, '_fields') if name.startswith('tiling')]
            # Same dim-string format as batch_matmul, but each tile value is
            # followed by a trailing " 1".
            tiling_param = ""
            for i, tile_v in enumerate(tiling):
                tiling_param += "0 " + str(i) + " "
                tiling_param += tile_v + " 1 "
            block_param = get_block_str_from_config(set_dim_configs)
            thread_param = get_thread_str_from_config(set_dim_configs)
            config = {
                'attrs': {
                    'dim': tiling_param,
                    'bind_block': block_param,
                    'bind_thread': thread_param
                },
                'best_cycles': tuner.best_time,
                'original_cycles': tuner.original_time,
                'date': str(datetime.datetime.now()),
                'tuning_time': tuner.tuning_time,
            }
        else:
            # Generic ops: pair each tiling_* value with its index_table row.
            print(set_dim_configs)
            tiling = [[getattr(set_dim_configs, name), 1]
                      for name in getattr(set_dim_configs, '_fields') if name.startswith('tiling')]
            tiling_param = []
            for i, tile_v in enumerate(tiling):
                tiling_param.append(index_table[i] + tile_v)
            config = []
    else:
        tiling_param = []
    # when there is a valid result, save the result
    if op_type in ("json", "extra_tune") and tuner.best_time not in error_time_list:
        config = {'attrs': tiling_param,
                  'best_cycles': tuner.best_time,
                  'original_cycles': tuner.original_time,
                  "date": str(datetime.datetime.now()),
                  "tuning time": tuner.tuning_time,
                  }
        if op_type == "json":
            config["file_name"] = str(key)
        compute, shape, dtype = generate_trait(desc)
        tuner.export_dim_configs(
            config, json_file.format(op_type), False, str(key))
        save_file = "autotuning/extra_tune.json" if extra_tune else repo_path
        with open(save_file, 'r') as f:
            repo = json.loads(f.read())
        # Only overwrite the repo entry if this run beat the recorded cycles.
        if len(tiling_param) != 0 and (get_repo(repo, [compute, shape, dtype]) is None or
                                       int(tuner.best_time) < int(repo[compute][shape][dtype]["metadata"]["best_cycles"])):
            tuner.export_dim_configs_for_keys(config, save_file, False, [
                compute, shape, dtype, "metadata"])
    else:
        try:
            tuner.export_dim_configs(
                config, json_file.format(op_type), False, str(key))
        except UnboundLocalError as e:
            # `config` is never assigned when tuning failed before any branch
            # that builds it — treated as "nothing to save" by design.
            logger.warning(e)
            print("[save_tuning_result]: ", "no result is saved.")
def load_json_configs(op_type):
    """load json configs

    Reads the previously exported per-op tiling file and returns its parsed
    content; returns {} when the file is missing, unreadable, or corrupt.
    """
    dim_file = json_file.format(op_type)
    file_path = os.path.realpath(dim_file)
    if os.path.isfile(file_path):
        try:
            with open(file_path, 'r') as f:
                return json.load(f)
        except (IOError, ValueError) as e:
            # ValueError covers json.JSONDecodeError: a corrupt repo file
            # should degrade to "no configs", not crash the tuning run
            # (the original only caught IOError).
            logger.debug(e)
            return {}
    return {}
def read_shapes_from_file(debug_mode, save_res, all_space, conf_of_set_dim, op_type):
    """Profile every shape listed in autotuning/shapes/<op_type>.py."""
    shape_module = importlib.import_module('autotuning.shapes.' + op_type)
    for shp in shape_module.shapes:
        do_profiling(shp, debug_mode, save_res,
                     all_space, op_type, conf_of_set_dim)
def do_profiling(shp, debug_mode, save_res, all_space, op_type, conf_of_set_dim=None, tuning_attrs=None, skip_config_set=None, tuning_attrs_info=None):
    """do profiling

    Dispatches one tuning job for `shp` based on op_type.  NOTE: branch order
    matters — 'conv_backprop*' must be matched before the generic 'conv'
    prefix test.
    """
    # remove undeleted JOB files for previous shapes
    subprocess.run("rm -rf /var/log/npu/profiling/JOB*", shell=True)
    if op_type == 'matmul':
        # shp[2] carries the shape record; the final element is dropped before
        # building MatmulCubeDesc — presumably it is not a desc field; confirm
        # against the shapes file.
        key = shp[2][0:-1]
        logger.debug("start profiling: [%s]", str(key))
        desc = MatmulCubeDesc(*key)
        jobs(op_type, desc, debug_mode, save_res,
             all_space, key.__str__(), conf_of_set_dim)
        logger.debug("end profiling: [%s]", str(key))
    elif op_type.startswith('conv_backprop'):
        key = shp[2]
        logger.debug("start profiling: [%s]", str(key))
        desc = ConvBackpropDesc(*key)
        jobs(op_type, desc, debug_mode, save_res,
             all_space, key.__str__(), conf_of_set_dim)
        logger.debug("end profiling: [%s]", str(key))
    elif op_type.startswith('conv') and "gpu" not in op_type:
        key = shp[2]
        logger.debug("start profiling: [%s]", str(key))
        desc = ConvDesc(*key)
        jobs(op_type, desc, debug_mode, save_res,
             all_space, key.__str__(), conf_of_set_dim)
        logger.debug("end profiling: [%s]", str(key))
    elif op_type in ["batch_matmul_gpu", "conv_image2col_gemm_gpu", "reduce_sum_gpu"]:
        # GPU ops pass the raw shape record straight through to jobs(),
        # including the tuning-attr plumbing.
        logger.debug("start profiling: [%s]", str(shp))
        jobs(op_type, shp, debug_mode, save_res,
             all_space, conf_of_set_dim=conf_of_set_dim, tuning_attrs=tuning_attrs, skip_config_set=skip_config_set, tuning_attrs_info=tuning_attrs_info)
    else:
        # Fallback: the shape record itself is the op descriptor.
        key = shp
        logger.debug("start profiling: [%s]", str(key))
        desc = key
        jobs(op_type, desc, debug_mode, save_res,
             all_space, conf_of_set_dim=conf_of_set_dim, skip_config_set=skip_config_set)
        logger.debug("end profiling: [%s]", str(key))
def launch(op_type, debug_mode, save_res=False, desc=None, all_space=False,
           from_json=False, tuning_attrs=None, skip_config_set=None, tuning_attrs_info=None):
    """Entry point: tune one descriptor, or every shape from the shapes file
    for op_type when `desc` is None."""
    # Reuse previously exported tilings when requested.
    conf_of_set_dim = load_json_configs(op_type) if from_json else None
    if desc is not None:
        do_profiling(desc, debug_mode, save_res, all_space, op_type,
                     tuning_attrs=tuning_attrs, skip_config_set=skip_config_set,
                     tuning_attrs_info=tuning_attrs_info)
    else:
        read_shapes_from_file(debug_mode, save_res,
                              all_space, conf_of_set_dim, op_type)
| @@ -0,0 +1,407 @@ | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| """Compile kernel module for operator""" | |||
| import os | |||
| from typing import NamedTuple | |||
| from base import TestBase | |||
| from akg.utils import kernel_exec as utils | |||
| from akg.utils import custom_tiling as ct_util | |||
| from akg.ops.nn import conv_bn1 | |||
| from akg.ops.nn import conv, conv_backprop_input, conv_backprop_filter, batchmatmul | |||
| from test_op.batch_matmul import batch_matmul | |||
| from akg.ops.math_gpu.reduce_sum import reduce_sum | |||
| from akg.build_module import tuning_spaces | |||
| from akg.ops.nn import matmul | |||
| from test_run import batchmatmul_run, matmul_run | |||
| from .type_definitions import ConvDesc, ConvBackpropDesc, MatmulCubeDesc, ConvConfig, ConvBackpropInputConfig, ConvBackpropFilterConfig, MatmulCubeConfig | |||
| import numpy as np | |||
| from gen_random import random_gaussian | |||
| from .tuning_utils import merge_attrs | |||
def get_spaces_gpu_manually(op_type: str, op_desc: NamedTuple = None):
    """Placeholder for manual GPU tuning-space generation.

    Not implemented yet; always returns None.
    """
    # wait for implementation
    return None
def gen_kernel_conv(op_desc: ConvDesc, input_shape, index_table,
                    config: ConvConfig = None, idx=None, gen_tiling_spaces=False):
    """Compile kernel module for conv.

    Builds the 'conv_tile' attribute from `config` (or lets auto tiling run
    when `config` is None) and invokes the polyhedral build for conv.
    """
    if index_table is not None:
        raise RuntimeError('index_table should be none')
    kernel_name = "conv_poly"
    if idx is not None:
        kernel_name += str(idx)
    if config is None:
        # No explicit tiling: fall back to auto tiling.
        attrs = {'dim': ""}
    else:
        # Tile sizes in the order expected by the conv schedule.
        tiling_param = [config.tile_h, config.tile_co, config.tile_m,
                        config.tile_k, config.tile_n, config.tile_w]
        attrs = {'conv_tile': tiling_param, 'bypass': config.bypass}
    # With bias there are three input tensors, otherwise two.
    shape = list(input_shape[:3]) if op_desc.use_bias else list(input_shape[:2])
    conv_dtype = 'float16'
    return utils.op_build(conv.conv, [shape], [conv_dtype],
                          op_attrs=[op_desc.fmap_shape, op_desc.filter_shape, op_desc.pad, op_desc.stride,
                                    op_desc.dilation, op_desc.use_bias, attrs],
                          kernel_name=kernel_name, attrs=attrs, polyhedral=True, tuning=gen_tiling_spaces)
def gen_kernel_conv_bn1(op_desc: ConvDesc, input_shape, index_table, config: ConvConfig = None,
                        idx=None, gen_tiling_spaces=False):
    """Compile kernel module for the fused conv + bn1 operator.

    Mirrors gen_kernel_conv but builds conv_bn1; `config` supplies the
    'conv_tile' sizes, or auto tiling is used when it is None.
    """
    if index_table is not None:
        raise RuntimeError('index_table should be none')
    kernel_name = "conv_bn1_poly" if idx is None else "conv_bn1_poly" + str(idx)
    if config is None:
        attrs = {'dim': ""}
    else:
        # Tile sizes in schedule order: h, co, m, k, n, w.
        tiling_param = [config.tile_h, config.tile_co, config.tile_m,
                        config.tile_k, config.tile_n, config.tile_w]
        attrs = {'conv_tile': tiling_param, 'bypass': config.bypass}
    if op_desc.use_bias:
        shape = [input_shape[0], input_shape[1], input_shape[2]]
    else:
        shape = [input_shape[0], input_shape[1]]
    conv_dtype = 'float16'
    return utils.op_build(conv_bn1.conv_bn1, [shape], [conv_dtype],
                          op_attrs=[op_desc.fmap_shape, op_desc.filter_shape, op_desc.pad, op_desc.stride,
                                    op_desc.dilation, op_desc.use_bias, attrs],
                          kernel_name=kernel_name, attrs=attrs, polyhedral=True, tuning=gen_tiling_spaces)
def gen_kernel_matmul_cube(op_desc: MatmulCubeDesc, _, index_table,
                           config: MatmulCubeConfig = None, idx=None, gen_tiling_spaces=False):
    """Compile kernel module for matmul_cube.

    Assembles the (L1, L0) tile pairs for batch/N/M/K axes into a 'dim'
    attribute; auto tiling is used when `config` is None.
    """
    if index_table is not None:
        raise RuntimeError('index_table should be none')
    kernel_name = "matmul_cube_poly"
    if idx is not None:
        kernel_name += str(idx)
    if config is None:
        attrs = {'dim': ""}
    else:
        # One (1, 1) tile per leading batch dimension.
        tiling_param = [(1, 1)] * (len(op_desc.x_shape) - 2)
        # N/M tiles are only emitted when enabled (> 0).
        if config.n_l1 > 0:
            tiling_param.append((config.n_l1, config.n_l0))
        if config.m_l1 > 0:
            tiling_param.append((config.m_l1, config.m_l0))
        # Fixed 16x16 fractal tiles, then the K-axis tile pair.
        tiling_param += [(16, 16), (16, 16), (config.k_l1, config.k_l0)]
        dim_info = ct_util.set_dims(tuple(tiling_param))
        attrs = {'dim': dim_info, 'bypass': config.bypass}
    return matmul_run.matmul_compile(op_desc.x_shape, op_desc.y_shape, op_desc.bias, op_desc.left_format,
                                     op_desc.right_format, op_desc.out_format, op_desc.adj_x, op_desc.adj_y,
                                     op_desc.dtype, op_desc.bias_dtype, op_desc.out_dtype, kernel_name,
                                     attrs, tuning=gen_tiling_spaces)
def gen_kernel_conv_backprop_input(op_desc: ConvBackpropDesc, _, index_table, config: ConvBackpropInputConfig = None,
                                   idx=None, gen_tiling_spaces=False):
    """Compile kernel module for conv_backprop_input.

    Derives the backward-input tensor shapes (NC1HWC0 feature map and
    fractal-layout weights) from the forward conv description, then builds
    the polyhedral kernel.  `config` supplies the 'conv_tile' sizes;
    auto tiling is used when it is None.
    """
    if index_table is not None:
        raise RuntimeError('index_table should be none')
    kernel_name = "conv_backprop_input_poly"
    if idx is not None:
        kernel_name += str(idx)
    if config is None:
        attrs = {'dim': ""}
    else:
        # Tile sizes in schedule order: h, co, m, k, n, w.
        tile_hh = config.tile_h
        tile_coco = config.tile_co
        tile_mm = config.tile_m
        tile_kk = config.tile_k
        tile_nn = config.tile_n
        tile_ww = config.tile_w
        tiling_param = [tile_hh, tile_coco, tile_mm, tile_kk, tile_nn, tile_ww]
        attrs = {'conv_tile': tiling_param}
    conv_dtype = 'float16'
    block_size = 16  # cube unit: channels are padded to multiples of 16
    in_n, in_c, in_h, in_w = op_desc.fmap_shape
    # NOTE: rebinds the unused `_` parameter as the filter's input-channel dim.
    cout, _, w_h, w_w = op_desc.filter_shape
    # Round channel counts up to the block size.
    in_c = (in_c + block_size - 1) // block_size * block_size
    cout = (cout + block_size - 1) // block_size * block_size
    pad_top, pad_bottom, pad_left, pad_right = op_desc.pad
    stride_h, stride_w = op_desc.stride
    # Forward-conv output shape: the backward-input op consumes it as input.
    out_n = in_n
    out_c = cout
    out_h = (in_h + pad_top + pad_bottom - w_h) // stride_h + 1
    out_w = (in_w + pad_left + pad_right - w_w) // stride_w + 1
    x_shape = (out_n, out_c, out_h, out_w)
    w_shape = (cout, in_c, w_h, w_w)
    # Repack the gradient input into 5D NC1HWC0 layout.
    in_nn, in_cc, in_hh, in_ww = x_shape
    input_shape_nc1hwc0 = (in_nn, in_cc // block_size,
                           in_hh, in_ww, block_size)
    k_n, k_c, k_h, k_w = w_shape
    kernel_shape_nc1hwc0 = (k_n, k_c // block_size, k_h, k_w, block_size)
    k_n, _, k_h, k_w, _ = kernel_shape_nc1hwc0
    # Fractal weight layout: (C1*KH*KW, Cout1, 16, 16).
    kernel_shape_fractal = (k_c // block_size * k_h *
                            k_w, k_n // block_size, block_size, block_size)
    shape = [input_shape_nc1hwc0, kernel_shape_fractal]
    return utils.op_build(conv_backprop_input.conv_backprop_input, [shape], [conv_dtype],
                          op_attrs=[op_desc.fmap_shape, op_desc.filter_shape, op_desc.pad,
                                    op_desc.stride, op_desc.dilation, attrs],
                          kernel_name=kernel_name, attrs=attrs, polyhedral=True, tuning=gen_tiling_spaces)
def gen_kernel_conv_backprop_filter(op_desc: ConvBackpropDesc, _, index_table, config: ConvBackpropFilterConfig = None,
                                    idx=None, gen_tiling_spaces=False):
    """Compile kernel module for conv_backprop_filter.

    Derives the backward-filter input shapes (fractal-layout output gradient
    and NC1HWC0 feature map) from the forward conv description, then builds
    the polyhedral kernel.  `config` supplies the 'conv_tile' sizes;
    auto tiling is used when it is None.
    """
    if index_table is not None:
        raise RuntimeError('index_table should be none')
    kernel_name = "conv_backprop_filter_poly"
    if idx is not None:
        kernel_name += str(idx)
    if config is None:
        attrs = {'dim': ""}
    else:
        # Tile sizes in schedule order: ci, kh, kw, co, batch, h, w, m, k, n.
        tile_cici = config.tile_ci
        tile_khkh = config.tile_kh
        tile_kwkw = config.tile_kw
        tile_coco = config.tile_co
        tile_bb = config.tile_batch
        tile_hh = config.tile_h
        tile_ww = config.tile_w
        tile_mm = config.tile_m
        tile_kk = config.tile_k
        tile_nn = config.tile_n
        tiling_param = [tile_cici, tile_khkh, tile_kwkw, tile_coco, tile_bb, tile_hh, tile_ww,
                        tile_mm, tile_kk, tile_nn]
        attrs = {'conv_tile': tiling_param}
    conv_dtype = 'float16'
    block_size = 16  # cube unit: channels are padded to multiples of 16
    in_n, in_c, in_h, in_w = op_desc.fmap_shape
    # NOTE: rebinds the unused `_` parameter as the filter's input-channel dim.
    cout, _, w_h, w_w = op_desc.filter_shape
    # Round channel counts up to the block size.
    in_c = (in_c + block_size - 1) // block_size * block_size
    cout = (cout + block_size - 1) // block_size * block_size
    pad_top, pad_bottom, pad_left, pad_right = op_desc.pad
    stride_h, stride_w = op_desc.stride
    # Forward-conv output shape (the gradient tensor for this backward op).
    out_n = in_n
    out_c = cout
    out_h = (in_h + pad_top + pad_bottom - w_h) // stride_h + 1
    out_w = (in_w + pad_left + pad_right - w_w) // stride_w + 1
    x_shape = (in_n, in_c, in_h, in_w)
    y_shape = (out_n, out_c, out_h, out_w)
    # Feature map in 5D NC1HWC0 layout.
    in_n, in_c, in_h, in_w = x_shape
    input_shape_nc1hwc0 = (in_n, in_c // block_size, in_h, in_w, block_size)
    o_n, o_c, o_h, o_w = y_shape
    kernel_shape_nc1hwc0 = (o_n, o_c // block_size, o_h, o_w, block_size)
    o_n, o_c1, o_h, o_w, o_c0 = kernel_shape_nc1hwc0
    # Fractal layout of the output gradient: H*W rounded up to 16-row blocks.
    mo = (o_h * o_w + block_size - 1) // block_size
    mi = block_size
    kernel_shape_fractal = (o_n, o_c1, mo, mi, o_c0)
    input_shape = [kernel_shape_fractal, input_shape_nc1hwc0]
    return utils.op_build(conv_backprop_filter.conv_backprop_filter, [input_shape], [conv_dtype],
                          op_attrs=[op_desc.fmap_shape, op_desc.filter_shape, op_desc.pad,
                                    op_desc.stride, op_desc.dilation, attrs],
                          kernel_name=kernel_name, attrs=attrs, polyhedral=True, tuning=gen_tiling_spaces)
def gen_kernel_for_vector(op_desc, _, index_table=None, config: NamedTuple = None, idx=None, gen_tiling_spaces=False):
    """Compile kernel module for vector operators.

    Builds the tiling 'dim' attribute from `config` (fields named
    'tiling*'), injects dim/tuning/kernel_name into the test-case
    arguments, then calls the test function to compile the kernel.

    Returns the compiled module; when `gen_tiling_spaces` is True, returns
    a list of [spaces..., expect, param_for_mod].  Raises Exception (with
    the original error chained as __cause__) on compile failure.
    """
    test_base = TestBase()
    test_base.params_init(op_desc[0][0:4] + str(idx), os.getcwd())
    kernel_name = "poly_"
    if idx is not None:
        kernel_name += str(idx)
    if config is None:
        attrs = {'dim': ""}
    else:
        # One [value, 1] tile entry per 'tiling*' field of the config.
        tiling = [[getattr(config, name), 1] for name in getattr(
            config, '_fields') if name.startswith('tiling')]
        tiling_param = [index_table[i] + element for i, element in enumerate(tiling)]
        dim_info = ct_util.set_dims(tuple(tiling_param))
        attrs = {'dim': dim_info}
    _, func, args, kwargs = test_base.ana_args(op_desc)
    if 'attrs' in kwargs:
        kwargs['attrs']['dim'] = attrs['dim']
        kwargs['attrs']['tuning'] = gen_tiling_spaces
        kwargs['attrs']['kernel_name'] = kernel_name
    else:
        # No 'attrs' kwarg: patch the first dict positional argument instead.
        for arg_ in args:
            if isinstance(arg_, dict):
                arg_['dim'] = attrs['dim']
                arg_['tuning'] = gen_tiling_spaces
                arg_['kernel_name'] = kernel_name
                break
    try:
        if gen_tiling_spaces:
            mod, expect, param_for_mod = func(*args, **kwargs)
            mod = list(mod)
            mod.append(expect)
            mod.append(param_for_mod)
        else:
            mod = func(*args, **kwargs)
    except Exception as e:
        # Was `except BaseException` + bare re-raise: narrow the catch so
        # KeyboardInterrupt/SystemExit propagate, and chain the cause so the
        # original traceback is preserved for debugging.
        print("Compile ERROR message:", e)
        print(func)
        print("Compile ERROR")
        raise Exception("Compile ERROR") from e
    return mod
def gen_kernel_batch_matmul_gpu(op_desc, _, index_table=None,
                                config: NamedTuple = None, idx=None,
                                gen_tiling_spaces=False, need_tune_json=None):
    """Placeholder for compiling batch_matmul kernels on GPU.

    Not implemented yet; always returns None.
    """
    kernel_name = "batch_matmul_gpu_"
    # wait for implementation
    return None
def gen_kernel_reduce_sum_gpu(op_desc, _, index_table=None,
                              config: NamedTuple = None, idx=None, gen_tiling_spaces=False, need_tune_json=None):
    """Compile kernel module for reduce_sum in gpu.

    With `gen_tiling_spaces` set, builds in tuning mode and returns
    [spaces, set_dim_key, expect, [input_for_mod, output]]; otherwise
    returns the compiled module using the dim/block/thread bindings from
    the (possibly config-merged) attrs.  Raises Exception on failure.
    """
    kernel_name = "reduce_sum_gpu_"
    if idx is not None:
        kernel_name += str(idx)
    # op_desc[2] carries the operator attrs; a tuning config overrides them.
    attrs = op_desc[2]
    if config is not None:
        attrs = merge_attrs(attrs, config, need_tune_json)
    try:
        if gen_tiling_spaces:
            # NOTE: don't use this process for reduce spaces generation,
            # see function: "_get_space_reduce_gpu_manually".
            from .tiling_strategies_gpu import reduce_gpu_tiling_strategy
            spaces, set_dim_key = utils.op_build(reduce_sum, (attrs.in_shape, ),
                                                 (attrs.in_dtype,
                                                  ), kernel_name="reduce_sum",
                                                 op_attrs=[
                                                     attrs.axis, attrs.keepdims],
                                                 attrs={"target": "cuda",
                                                        "enable_akg_reduce_lib": attrs.enable_akg_reduce_lib,
                                                        "enable_atomic_add": attrs.enable_atomic_add,
                                                        "custom_tiling": reduce_gpu_tiling_strategy(attrs.in_shape, attrs.axis)}, tuning=True)
            # Golden data for later verification of tuned candidates.
            from test_ms_reduce_sum import gen_data
            input_for_mod, output, expect = gen_data(
                attrs.in_shape, attrs.in_dtype, attrs.axis, attrs.keepdims)
            return [spaces, set_dim_key, expect, [input_for_mod, output]]
        else:
            # Normal build: apply the chosen dim and GPU block/thread bindings.
            mod = utils.op_build(reduce_sum, (attrs.in_shape, ),
                                 (attrs.in_dtype,
                                  ), kernel_name="reduce_sum",
                                 op_attrs=[
                                     attrs.axis, attrs.keepdims],
                                 attrs={"target": "cuda",
                                        "enable_akg_reduce_lib": attrs.enable_akg_reduce_lib,
                                        "dim": attrs.dim,
                                        "bind_block": attrs.bind_block,
                                        "bind_thread": attrs.bind_thread,
                                        "enable_atomic_add": attrs.enable_atomic_add})
            return mod
    except BaseException as e:
        print("Compile ERROR message:", e)
        print(reduce_sum)
        print("Compile ERROR")
        raise Exception("Compile ERROR")
def gen_kernel_conv_image2col_gemm_gpu(op_desc, _, index_table=None, config: NamedTuple = None, idx=None, gen_tiling_spaces=False, need_tune_json=None):
    """Placeholder for compiling GPU convolution via image2col + gemm.

    Not implemented yet; always returns None.
    """
    # wait for implementation
    return None
# Dispatch table: operator type -> kernel compile function.  Operator types
# not listed here are treated as generic vector ops (see compile_kernel).
_compile_kernel_func = {
    'conv': gen_kernel_conv,
    'conv_bn1': gen_kernel_conv_bn1,
    'conv_backprop_input': gen_kernel_conv_backprop_input,
    'conv_backprop_filter': gen_kernel_conv_backprop_filter,
    'matmul': gen_kernel_matmul_cube,
    'reduce_sum_gpu': gen_kernel_reduce_sum_gpu,
    'batch_matmul_gpu': gen_kernel_batch_matmul_gpu,
    'conv_image2col_gemm_gpu': gen_kernel_conv_image2col_gemm_gpu,
}
def compile_kernel(op_type: str, op_desc: NamedTuple, input_shape=None, index_table=None,
                   config_param: NamedTuple = None, idx: int = None, gen_tiling_spaces: bool = False, need_tune_json=None):
    """Generate kernel module for operator.

    Parameters
    ----------
    op_type: str
        operator name
    op_desc: NamedTuple
        operator definition parameters
    config_param: NamedTuple
        operator config parameters
    idx: int
        operator idx(th) kernel
    gen_tiling_spaces: bool
        parameter passed to utils.op_build, whether to get spaces instead of stmt

    Returns
    -------
    kernel module if gen_tiling_spaces == False, else
    [space_res, key, expect, input_for_mod]
    """
    # Unknown operator types fall back to the generic vector path.
    gen_func = _compile_kernel_func.get(op_type, gen_kernel_for_vector)
    if gen_tiling_spaces:
        space_res, key, expect, input_for_mod = gen_func(op_desc, input_shape, index_table,
                                                         config_param, idx, gen_tiling_spaces)
        return [space_res, key, expect, input_for_mod]
    if "gpu" in op_type:
        # GPU compile functions additionally accept the need_tune_json hints.
        return gen_func(op_desc, input_shape, index_table,
                        config_param, idx, gen_tiling_spaces, need_tune_json=need_tune_json)
    return gen_func(op_desc, input_shape, index_table,
                    config_param, idx, gen_tiling_spaces)
| @@ -0,0 +1,243 @@ | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| """Runner for compile and execute a configs of an operator on device""" | |||
| import time | |||
| import multiprocessing | |||
| import logging | |||
| import json | |||
| import os | |||
| import subprocess | |||
| import time | |||
| from typing import NamedTuple | |||
| import numpy as np | |||
| from akg import composite | |||
| from akg.utils import custom_tiling as ct_util | |||
| from akg.utils import kernel_exec as utils | |||
| from .kernel_compiler import compile_kernel | |||
| from .test_data_generators import gen_data | |||
| from .tuning_utils import * | |||
logger = logging.getLogger('fuzz.tune.autotuning.runner')
# Sentinel "run time" values encoding failure modes in the results list.
# They are far larger than any real measured time so failed configs always
# sort behind successful ones.
error_time_list = [
    9999999999.0,
    9999999998.0,
    9999999997.0,
    9999999996.0,
]
# Human-readable label for each sentinel above (used when logging results).
error_time_string = {
    error_time_list[0]: 'run_failed',
    error_time_list[1]: 'precision_error',
    error_time_list[2]: 'compile_failed',
    error_time_list[3]: 'timeout'
}
# Named aliases for the sentinels.
run_failed_time = error_time_list[0]
precision_error_time = error_time_list[1]
compile_fail_time = error_time_list[2]
timeout_time = error_time_list[3]
def get_attr_from_config(config, index_table):
    """Split a tuning config into tiling dims and plain attributes.

    Parameters
    ----------
    config: NamedTuple
        Tuning config; fields whose name starts with 'tiling' are treated
        as tile sizes, every other field is passed through unchanged.
    index_table: list
        Per-tiling-dimension prefix that is prepended to each [value, 1]
        tile entry before building the dim string.

    Returns
    -------
    dict
        Attribute dict; contains a 'dim' entry only when tiling fields exist.
    """
    tiling = []
    attrs = {}
    for key, value in config._asdict().items():
        if key.startswith('tiling'):
            # Each tiling field becomes a [tile_size, 1] pair.
            tiling.append([value, 1])
        else:
            attrs[key] = value
    if tiling:  # idiomatic emptiness test (was: if len(tiling):)
        tiling_param = [index_table[i] + element
                        for i, element in enumerate(tiling)]
        attrs['dim'] = ct_util.set_dims(tuple(tiling_param))
    else:
        print("No tiling info. Use auto tiling.")
    return attrs
class KernelRunner:
    """kernel runner

    This runner will compile and execute configs of an operator, and return their running times.

    Parameters
    ----------
    op_type: str
        The name of operator (special values "json"/"extra_tune" build via composite)
    op_desc: NamedTuple
        The definition parameters of operator
    index_table: list
        Per-tiling-dimension prefixes used to turn a config into 'dim' attrs
    self_attrs: list
        Extra operator attributes (semantics defined by callers)
    timeout: int
        Timeout (seconds) for running one config
    repeat_times:
        Run one config repeat_times
    input_data / expect:
        Pre-generated inputs and golden outputs; generated via gen_data when
        input_data is None
    skip_config_set:
        Set of JSON-serialized configs to skip without running
    """
    def __init__(self, op_type: str, op_desc: NamedTuple,
                 index_table: list, self_attrs: list, timeout: int = 600,
                 repeat_times: int = 2, input_data=None,
                 expect=None, mod_output_param=None, is_all_space=True,
                 skip_config_set=None, need_tune_json=None):
        self.op_type = op_type
        self.op_desc = op_desc
        self._index_table = index_table
        self.self_attrs = self_attrs
        self.run_kernel_time = 0.0  # accumulated wall time spent in run()
        self.tune_self_attrs = True
        self.timeout = timeout
        self.repeat_times = repeat_times
        self.mod_output_param = mod_output_param
        self.is_all_space = is_all_space
        self.skip_config_set = skip_config_set
        self.need_tune_json = need_tune_json
        if input_data is None:
            # No inputs supplied: generate inputs and golden outputs.
            self.input, self.expect = gen_data(op_type, op_desc)
            # gen_data may return a dict splitting args from output buffers.
            if isinstance(self.input, dict):
                self.input, self.mod_output_param = self.input['args'], self.input['outputs']
        else:
            self.input, self.expect = input_data, expect
        self.input_shape = [x.shape for x in self.input]
    def info(self):
        """Print the accumulated kernel-running wall time."""
        print('run kernel time:', self.run_kernel_time)
    def run_one_kernel(self, run_times, idx, config, best_time=np.inf, is_auto=False):
        """Compile and execute a config of the operator on device"""
        # Skip configs the caller has blacklisted; -1 marks "skipped".
        if json.dumps(config.input._asdict()) in self.skip_config_set:
            print("CONFIG SKIP:", json.dumps(config.input._asdict()))
            run_times[idx] = -1
            return
        time_one_kernel_start = time.time()
        logger.debug('compile %dth kernel', idx)
        # NOTE(review): despite the name, this appears to return a list of
        # usable GPU device ids (it is indexed below) -- confirm.
        gpu_devices_list = get_available_gpu_num()
        # Round-robin the configs over the available devices.
        device_id = gpu_devices_list[idx % len(gpu_devices_list)]
        logger.debug('run %dth kernel', idx)
        logger.debug('++++++++++++++++++++++=device_id')
        logger.debug(device_id)
        logger.debug('++++++++++++++++++++++=device_id')
        try:
            time_start_build = time.time()
            logger.debug(config)
            if self.op_type in ["json", "extra_tune"]:
                # Composite (fused-graph) build path.
                if is_auto:
                    mod = composite.build(self.op_desc)
                    if self.op_type == "extra_tune":
                        del os.environ['MS_GRAPH_KERNEL_TILING']
                else:
                    attrs = get_attr_from_config(
                        config.input, self._index_table)
                    if os.environ['RUNTIME_MODE'] == "gpu":
                        attrs['target'] = "cuda"
                    mod = composite.build(self.op_desc, attrs, use_repo=False)
            else:
                # Single-operator build path; is_auto means "auto tiling".
                mod = compile_kernel(self.op_type, self.op_desc, self.input_shape, self._index_table,
                                     None if is_auto else config.input, idx, need_tune_json=self.need_tune_json)
            time_end_build = time.time()
            logger.debug("build module time: %f",
                         time_end_build - time_start_build)
            logger.debug('finished compile %dth kernel', idx)
        except BaseException as e:
            logger.debug("Compile Failed: [%s] : %s", "origin" if is_auto else str(
                config.input), str(e))
            run_times[idx] = compile_fail_time
            return
        # Assume failure until a launch succeeds; np.minimum below keeps the
        # best (smallest) observed time across repeats.
        run_times[idx] = run_failed_time
        try:
            # NOTE: in gpu tuning, it is no need to use this repeat_times,
            # repeat_time has been setted in mod_launch in tuning mode.
            for _ in range(self.repeat_times):
                stat_info = {}
                try:
                    time_start_launch = time.time()
                    if self.mod_output_param is not None:
                        pass
                    else:
                        output, stat_info = utils.mod_launch(
                            mod, self.input, tuning=True, device_id=device_id, repeat_time=40)
                        # Verify against the golden output with loose tolerance.
                        if not np.allclose(output, self.expect, rtol=5e-03, atol=5e-03, equal_nan=True):
                            stat_info['run_time'] = precision_error_time
                            logger.debug("Precision Error: [%s]",
                                         "origin" if config is None else str(config.input))
                    time_end_launch = time.time()
                    logger.debug("mod launch time: %f",
                                 time_end_launch - time_start_launch)
                except BaseException as e:
                    logger.debug("Run Failed: [%s] : %s", str(
                        config.input), str(e))
                    stat_info['run_time'] = run_failed_time
                run_times[idx] = np.minimum(
                    run_times[idx], stat_info['run_time'])
        finally:
            logger.debug('end of %dth kernel', idx)
            time_one_kernel_end = time.time()
            logger.debug('run one kernel time: %f',
                         time_one_kernel_end - time_one_kernel_start)
        return
    def run(self, configs, best_time=np.inf, is_auto_set_dim=False, all_space=False):
        """Compile and execute a batch config of the operator on device"""
        start = time.time()
        logger.setLevel(logging.DEBUG)
        logger.debug("gen cce kernels batch: %d kernels", len(configs))
        subprocess.run("rm -rf ./jobs/JOB*", shell=True)
        process_jobs = []
        # Shared list so child processes can report their run times.
        run_times = multiprocessing.Manager().list(
            np.full((len(configs),), compile_fail_time))
        # One process per config; device assignment happens in run_one_kernel.
        for idx, config in enumerate(configs):
            p = multiprocessing.Process(target=self.run_one_kernel,
                                        args=(run_times, idx, config, best_time, is_auto_set_dim))
            process_jobs.append(p)
            p.start()
        # After the first timeout, remaining live processes are killed
        # immediately without waiting another full timeout each.
        timeout_error = False
        for idx, p in enumerate(process_jobs):
            if not timeout_error:
                p.join(timeout=self.timeout)
            if p.is_alive():
                timeout_error = True
                logger.debug("Timeout Error: [%s]", str(configs[idx].input))
                run_times[idx] = timeout_time
                p.terminate()
        process_end = time.time()
        logger.debug("process time: %f", process_end - start)
        # clean the profiling directory
        tune_device = int(os.environ['DEVICE_ID'])
        tune_num = int(os.environ['DEVICE_TOTAL_NUM'])
        if os.environ['RUNTIME_MODE'] == "gpu":
            subprocess.run("rm -rf cuda_meta_*", shell=True)
        else:
            pass
        end = time.time()
        logger.debug("run kernels time: %f", end - start)
        self.run_kernel_time += end - start
        # Log every result, translating failure sentinels to readable labels.
        for idx, config in enumerate(configs):
            if run_times[idx] not in error_time_list:
                logger.debug("KernelRunTime : [%s] : %s", str(
                    configs[idx].input), str(run_times[idx]))
            else:
                logger.debug("KernelRunTime : [%s] : %s",
                             str(configs[idx].input), str(error_time_string[run_times[idx]]))
        return run_times
| @@ -0,0 +1,217 @@ | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| """Config space""" | |||
| from abc import ABCMeta, abstractmethod | |||
| from typing import NamedTuple, List | |||
| import random | |||
| import numpy as np | |||
class ConfigEntity:
    """A single point of a config space: an id plus its input parameters."""

    def __init__(self, input_id: int, input_space: NamedTuple):
        self.__input_id = input_id
        self.__input = input_space
        self.__input_type = type(input_space)

    def __len__(self):
        # Number of tuning dimensions.
        return len(self.__input)

    def __str__(self):
        return '{}: {}'.format(self.__input_id, self.__input)

    def __repr__(self):
        return str(self)

    @property
    def input_id(self):
        """Index of this config inside its space."""
        return self.__input_id

    @property
    def input_type(self):
        """The NamedTuple subclass the input belongs to."""
        return self.__input_type

    @property
    def input(self):
        """The raw input parameters."""
        return self.__input

    @property
    def feature(self):
        """Feature vector used by search algorithms (the raw input)."""
        return self.__input
class ConfigSpace(metaclass=ABCMeta):
    """Abstract searching space of configs.

    Concrete spaces keep ConfigEntity objects in ``self._configs`` and
    decide how random fetching and neighbourhood walks work.
    """

    def __init__(self, input_type):
        # input_type is a NamedTuple subclass; its _fields name the dims.
        self._input_type = input_type
        self._dim_names = input_type._fields
        self._configs = []  # list of ConfigEntity

    @abstractmethod
    def reset_fetch(self):
        """reset the fetch state"""

    @abstractmethod
    def has_next(self) -> bool:
        """whether any config remains fetchable"""

    @abstractmethod
    def fetch_index(self) -> int:
        """fetch a random index of config"""

    @abstractmethod
    def fetch_config(self) -> ConfigEntity:
        """fetch a random config"""

    @abstractmethod
    def random_walk(self, p: int) -> int:
        """find a neighbor hood of the p-th ConfigEntity, which only
        differs with p in at most one dimension"""

    def get(self, idx: int) -> ConfigEntity:
        """get the `idx`-th config of the space"""
        return self._configs[idx]

    @property
    def configs(self):
        return self._configs

    @property
    def dim_names(self):
        return self._dim_names

    @property
    def input_type(self):
        return self._input_type

    @property
    def length(self):
        return len(self.configs)
class ConfigTrie:
    """Trie over config dimension values, used for neighbourhood sampling.

    Internal nodes hold a dict ``ch`` mapping one dimension's value to a
    child node; leaf nodes replace ``ch`` with a list of config ids.  One
    trie is built per ignored dimension so fetch_random can sample a config
    equal to a given one in every dimension except that one.
    """
    def __init__(self):
        self.ch = dict()
    def add(self, config: ConfigEntity, last_dim: int):
        """add a ConfigEntity, skipping its `last_dim`-th dimension"""
        cur = self
        for i, x in enumerate(config.input):
            if i == last_dim:
                continue
            # Validate BEFORE mutating: the original checked after inserting,
            # so a corrupted leaf node failed with an unrelated TypeError
            # instead of this explicit diagnostic.
            if not isinstance(cur.ch, dict):
                raise TypeError('none-leaf node should have a dict of childs')
            if x not in cur.ch:
                cur.ch[x] = ConfigTrie()
            cur = cur.ch[x]
        # Terminal node: morph ch into the id list on first use.
        if not isinstance(cur.ch, list):
            cur.ch = []
        cur.ch.append(config.input_id)
    def fetch_random(self, config: ConfigEntity, last_dim: int) -> int:
        """randomly fetch the index of a ConfigEntity the same with `config` except for the `last_dim`-th dimension"""
        cur = self
        for i, x in enumerate(config.input):
            if i == last_dim:
                continue
            if not isinstance(cur.ch, dict):
                raise TypeError('none leaf node should have a dict of childs')
            if x not in cur.ch:
                raise RuntimeError('no element found')
            cur = cur.ch[x]
        if not cur.ch:
            raise RuntimeError('no element found')
        if len(cur.ch) == 1:
            # Sole candidate may be config itself -- callers accept that.
            return cur.ch[0]
        # Re-sample until the neighbour differs from `config` itself.
        idx = config.input_id
        while idx == config.input_id:
            idx = random.choice(cur.ch)
        return idx
class ListConfigSpace(ConfigSpace):
    """Searching space of configs, which stores all possible configs in a list"""
    def __init__(self, input_type):
        super(ListConfigSpace, self).__init__(input_type)
        # One trie per dimension: trie i indexes configs by all dims except i.
        self.__config_tries = [ConfigTrie() for _ in range(len(self._dim_names))]
        self.__fetch_pool = []  # config indices not yet handed out
    def reset_fetch(self):
        """reset fetch state"""
        # was an identity comprehension: list(range(...)) is the idiom
        self.__fetch_pool = list(range(len(self._configs)))
    def fetch_scope(self, start, end):
        """restrict fetching to config indices in [start, end)"""
        self.__fetch_pool = list(range(start, end))
    def has_next(self) -> bool:
        return len(self.__fetch_pool) > 0
    def fetch_index(self) -> int:
        """fetch a random index of config"""
        idx = np.random.randint(len(self.__fetch_pool))
        ret = self.__fetch_pool[idx]
        # O(1) removal: overwrite with the last element, then pop the tail.
        self.__fetch_pool[idx] = self.__fetch_pool[-1]
        self.__fetch_pool.pop()
        return ret
    def fetch_next_index(self) -> int:
        """fetch next index of config"""
        # NOTE(review): assumes __fetch_pool is a contiguous ascending range
        # and returns its last element -- verify against callers.
        idx = len(self.__fetch_pool) - 1 + self.__fetch_pool[0]
        self.__fetch_pool.pop()
        return idx
    def fetch_config(self) -> ConfigEntity:
        """fetch a random config"""
        return self.get(self.fetch_index())
    def add(self, input_space: NamedTuple):
        """add a new config to space"""
        if not isinstance(input_space, self._input_type):
            raise TypeError('invalid config input space type, got {} expected {}'.format(type(input_space),
                                                                                         self._input_type))
        config = ConfigEntity(len(self._configs), input_space)
        self.__fetch_pool.append(len(self._configs))
        # Register the config in every per-dimension trie.
        for i in range(len(self._dim_names)):
            self.__config_tries[i].add(config, i)
        self._configs.append(config)
    def random_walk(self, p: int) -> int:
        """find a neighbor hood of the p-th ConfigEntity, which only differs with p in at most one dimension"""
        dim = np.random.randint(len(self._dim_names))
        return self.__config_tries[dim].fetch_random(self._configs[p], dim)
    @property
    def length(self):
        return len(self._configs)
    @classmethod
    def from_list(cls, configs: List[NamedTuple]):
        """Build a space from a non-empty list of config NamedTuples."""
        if not isinstance(configs, list):
            raise TypeError('configs must be of list type, got %s' % type(configs))
        if not configs:
            raise ValueError('configs must be non-empty')
        space = cls(type(configs[0]))
        for config in configs:
            space.add(config)
        return space
| @@ -0,0 +1,753 @@ | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| """space generating functions for operators""" | |||
| from functools import partial | |||
| from typing import NamedTuple | |||
| from collections import namedtuple | |||
| from test_run import matmul_run | |||
| from akg.utils import validation_check as vc_util | |||
| from .type_definitions import ConvDesc, ConvBackpropDesc, MatmulCubeDesc, ConvConfig, ConvBackpropInputConfig, ConvBackpropFilterConfig, MatmulCubeConfig | |||
| from .space import ListConfigSpace | |||
| from .kernel_compiler import compile_kernel | |||
| from .gen_spaces_gpu import _get_space_reduce_gpu_manually | |||
| from tqdm import tqdm | |||
| from enum import Enum | |||
# Maps a GPU axis index (0/1/2) to its dimension letter (x/y/z).
GPU_IDX_TO_STR = {0: "x", 1: "y", 2: "z"}
class GpuSpacePolicy(Enum):
    """Policy to expand tile candidates with block and thread.

    Only FULL, BMM and REDUCE_ALL are handled by get_space_with_block_thread;
    the remaining members currently fall into its ValueError branch.
    """
    FULL = "FULL"  # exhaustive cross product of block and thread ranges
    BMM = "BMM"  # threads swept; blocks derived as tile extent // thread
    REDUCE_ALL = "REDUCE_ALL"  # block.x fixed from total shape / first tile value
    REDUCE_X = "REDUCE_X"  # NOTE(review): not handled in get_space_with_block_thread yet
    REDUCE_Y = "REDUCE_Y"  # NOTE(review): not handled in get_space_with_block_thread yet
def gen_bool_list(attr_list):
    """Return every True/False combination for the given attributes.

    For n attributes the result is a list of 2**n lists, each of length n,
    ordered with True before False in every position (the same order the
    original incremental construction produced).  An empty attr_list yields
    [] (not [[]]), matching the original behavior.

    Parameters
    ----------
    attr_list: sequence
        One entry per boolean attribute; only its length is used.
    """
    if not attr_list:
        return []
    combos = [[]]
    for _ in attr_list:
        # Branch every partial combination into a True and a False variant.
        combos = [partial_combo + [flag]
                  for partial_combo in combos
                  for flag in (True, False)]
    return combos
def _get_space_vector(op_type: str, op_desc):
    """Build the tuning config space of a vector operator.

    Compiles the kernel once with gen_tiling_spaces=True, validates the
    returned space description, and converts each tiling candidate into a
    namedtuple entry of a ListConfigSpace.

    Returns (index_table, space, key, expect, input_for_mod).
    """
    space_res, key, expect, input_for_mod = compile_kernel(op_type, op_desc, None, None, None, 0,
                                                           gen_tiling_spaces=True)
    if space_res is None:
        raise RuntimeError('no space returned')
    if 'index' not in space_res or 'tuning_space' not in space_res:
        raise RuntimeError('invalid space returned')
    candidates = space_res['tuning_space']
    if not candidates:
        raise RuntimeError('empty tiling spaces')
    # One named dimension per tiling value in a candidate.
    axis_names = ['tiling_%d' % i for i in range(len(candidates[0]))]
    entry_type = namedtuple(op_type, axis_names)
    space = ListConfigSpace(entry_type)
    for candidate in candidates:
        space.add(entry_type(*candidate))
    return space_res['index'], space, key, expect, input_for_mod
def _get_space_conv(op_desc: ConvDesc):
    """get config space of convolution

    Enumerates (tile_h, tile_co, tile_m, tile_k, tile_n, tile_w, bypass)
    candidates that fit the L1/L0A/L0B/L0C buffer budgets and collects them
    in a ListConfigSpace.  Returns the 5-tuple
    (None, config_space, key, None, None), matching the shape returned by
    _get_space_vector.

    Raises TypeError when op_desc is not a ConvDesc.
    """
    if not isinstance(op_desc, ConvDesc):
        raise TypeError('op_desc must be ConvDesc')
    stride_ = op_desc.stride
    pad_ = op_desc.pad
    dilation_ = op_desc.dilation
    vc_util.convolution_format_check(
        op_desc.fmap_shape, op_desc.filter_shape, pad_, stride_, dilation_)
    config_space = ListConfigSpace(ConvConfig)
    # if double buff is not enabled, set it's value to 1
    size_scale = 1
    # On-chip buffer budgets in bytes; L0C is (256-8)KB halved.
    l1_max_size = (1024 * 1024) // size_scale
    l0a_max_size = (64 * 1024) // size_scale
    l0b_max_size = (64 * 1024) // size_scale
    l0c_max_size = ((256 - 8) * 1024) // size_scale // 2
    _, in_c, in_h, in_w = op_desc.fmap_shape
    k_n, _, k_h, k_w = op_desc.filter_shape
    padding = (pad_[0], pad_[1], pad_[2], pad_[3])
    p_top, p_bottom, p_left, p_right = padding
    s_h, s_w = stride_
    # Round input channels up to a multiple of 16 (cube block size).
    in_c = ((in_c - 1) // 16 + 1) * 16
    tile_c = in_c
    tile_co_start = 16
    data_len = 2  # bytes per element -- presumably fp16; confirm against kernel dtype
    # Padded extents, output window counts, and largest usable tile extents.
    h_max = in_h + p_top + p_bottom
    win_h = (h_max - k_h) // s_h + 1
    h_max = (h_max - k_h) // s_h * s_h + k_h
    w_max = in_w + p_left + p_right
    win_w = (w_max - k_w) // s_w + 1
    w_max = (w_max - k_w) // s_w * s_w + k_w
    bypass_options = [0, 1]
    for bypass in bypass_options:
        for tile_h in range(h_max, k_h - 1, -s_h):
            size_h = tile_h
            if tile_h == h_max:
                # Full-height tile: sweep W instead, and L1 holds the raw input height.
                w_range = range(w_max, k_w - 1, -s_w)
                size_h = in_h
            else:
                w_range = [w_max]
                win_tile_h = (tile_h - k_h) // s_h + 1
                h_tiles = (win_h + win_tile_h - 1) // win_tile_h
                if h_tiles == 2:
                    # Exactly two H tiles: resident size is the larger boundary slice.
                    size_h = max(tile_h - p_top, in_h +
                                 p_top - tile_h + k_h - s_h)
            for tile_w in w_range:
                size_w = tile_w
                if size_w == w_max:
                    size_w = in_w
                else:
                    win_tile_w = (tile_w - k_w) // s_w + 1
                    w_tiles = (win_w + win_tile_w - 1) // win_tile_w
                    if w_tiles == 2:
                        size_w = max(tile_w - p_left, in_w +
                                     p_left - tile_w + k_w - s_w)
                k_n_ = ((k_n - 1) // 16 + 1) * 16
                co_range = range(k_n_, tile_co_start - 1, -16)
                for tile_co in co_range:
                    if bypass == 1:
                        # bypass 1: filter term is excluded from the L1 footprint,
                        # but then the whole Cout extent must be tiled at once.
                        if tile_co != k_n:
                            continue
                        l1_size = data_len * (size_h * size_w * in_c)
                    else:
                        l1_size = data_len * (size_h * size_w * in_c +
                                              tile_co * tile_c * k_h * k_w)
                    if l1_size > l1_max_size:
                        continue
                    tile_co_ = ((tile_co - 1) // 16 + 1) * 16
                    for tile_n in range(tile_co_, 15, -16):
                        # K candidates bounded by both the problem size and L0B.
                        k_max = in_c * k_h * k_w
                        k_max_ = ((k_max - 1) // 16 + 1) * 16
                        k_size = l0b_max_size // data_len // tile_n
                        k_size_ = k_size // 16 * 16
                        for tile_k in range(min(k_max_, k_size_), 15, -16):
                            # M candidates bounded by window count, L0A and L0C.
                            m_max = (int(((tile_h - k_h) // (s_h)) + 1)) * \
                                (int(((tile_w - k_w) // (s_w)) + 1))
                            m_max_ = ((m_max - 1) // 16 + 1) * 16
                            m_size1 = l0a_max_size // data_len // tile_k
                            m_size1_ = m_size1 // 16 * 16
                            m_size2 = l0c_max_size // data_len // tile_n
                            m_size2_ = m_size2 // 16 * 16
                            for tile_m in range(min(m_max_, m_size1_, m_size2_), 15, -16):
                                config_space.add(ConvConfig(tile_h, tile_co, tile_m, tile_k,
                                                            tile_n, tile_w, bypass))
    return None, config_space, op_desc.__str__(), None, None
def _get_space_conv_bn1(op_desc: ConvDesc):
    """get config space of convolution

    Variant of _get_space_conv used for the fused conv_bn1 operator: the
    enumeration is identical except the L0C budget is a quarter of the
    forward-conv one (extra // 4 below).  Returns the 5-tuple
    (None, config_space, key, None, None).

    Raises TypeError when op_desc is not a ConvDesc.
    """
    if not isinstance(op_desc, ConvDesc):
        raise TypeError('op_desc must be ConvDesc')
    stride_ = op_desc.stride
    pad_ = op_desc.pad
    dilation_ = op_desc.dilation
    vc_util.convolution_format_check(
        op_desc.fmap_shape, op_desc.filter_shape, pad_, stride_, dilation_)
    config_space = ListConfigSpace(ConvConfig)
    # if double buff is not enabled, set it's value to 1
    size_scale = 1
    l1_max_size = (1024 * 1024) // size_scale
    l0a_max_size = (64 * 1024) // size_scale
    l0b_max_size = (64 * 1024) // size_scale
    # Quarter of the forward-conv L0C budget (extra outputs of the fused op).
    l0c_max_size = ((256 - 8) * 1024) // size_scale // 2 // 4
    _, in_c, in_h, in_w = op_desc.fmap_shape
    k_n, _, k_h, k_w = op_desc.filter_shape
    padding = (pad_[0], pad_[1], pad_[2], pad_[3])
    p_top, p_bottom, p_left, p_right = padding
    s_h, s_w = stride_
    # Round input channels up to a multiple of 16 (cube block size).
    in_c = ((in_c - 1) // 16 + 1) * 16
    tile_c = in_c
    tile_co_start = 16
    data_len = 2  # bytes per element -- presumably fp16; confirm against kernel dtype
    h_max = in_h + p_top + p_bottom
    win_h = (h_max - k_h) // s_h + 1
    h_max = (h_max - k_h) // s_h * s_h + k_h
    w_max = in_w + p_left + p_right
    win_w = (w_max - k_w) // s_w + 1
    w_max = (w_max - k_w) // s_w * s_w + k_w
    bypass_options = [0, 1]
    for bypass in bypass_options:
        h_range = range(h_max, k_h - 1, -s_h)
        for tile_h in h_range:
            size_h = tile_h
            if tile_h == h_max:
                w_range = range(w_max, k_w - 1, -s_w)
                size_h = in_h
            else:
                w_range = [w_max]
                win_tile_h = (tile_h - k_h) // s_h + 1
                h_tiles = (win_h + win_tile_h - 1) // win_tile_h
                if h_tiles == 2:
                    # Exactly two H tiles: resident size is the larger boundary slice.
                    size_h = max(tile_h - p_top, in_h +
                                 p_top - tile_h + k_h - s_h)
            for tile_w in w_range:
                size_w = tile_w
                if size_w == w_max:
                    size_w = in_w
                else:
                    win_tile_w = (tile_w - k_w) // s_w + 1
                    w_tiles = (win_w + win_tile_w - 1) // win_tile_w
                    if w_tiles == 2:
                        size_w = max(tile_w - p_left, in_w +
                                     p_left - tile_w + k_w - s_w)
                k_n_ = ((k_n - 1) // 16 + 1) * 16
                co_range = range(k_n_, tile_co_start - 1, -16)
                for tile_co in co_range:
                    if bypass == 1:
                        # bypass 1: filter excluded from L1 but Cout must be untiled.
                        if tile_co != k_n:
                            continue
                        l1_size = data_len * (size_h * size_w * in_c)
                    else:
                        l1_size = data_len * (size_h * size_w * in_c +
                                              tile_co * tile_c * k_h * k_w)
                    if l1_size > l1_max_size:
                        continue
                    tile_co_ = ((tile_co - 1) // 16 + 1) * 16
                    for tile_n in range(tile_co_, 15, -16):
                        k_max = in_c * k_h * k_w
                        k_max_ = ((k_max - 1) // 16 + 1) * 16
                        k_size = l0b_max_size // data_len // tile_n
                        k_size_ = k_size // 16 * 16
                        for tile_k in range(min(k_max_, k_size_), 15, -16):
                            m_max = (int(((tile_h - k_h) // (s_h)) + 1)) * \
                                (int(((tile_w - k_w) // (s_w)) + 1))
                            m_max_ = ((m_max - 1) // 16 + 1) * 16
                            m_size1 = l0a_max_size // data_len // tile_k
                            m_size1_ = m_size1 // 16 * 16
                            m_size2 = l0c_max_size // data_len // tile_n
                            m_size2_ = m_size2 // 16 * 16
                            for tile_m in range(min(m_max_, m_size1_, m_size2_), 15, -16):
                                config_space.add(ConvConfig(tile_h, tile_co, tile_m, tile_k,
                                                            tile_n, tile_w, bypass))
    return None, config_space, op_desc.__str__(), None, None
def _get_space_conv_backprop_input(op_desc: ConvBackpropDesc):
    """get config space of convolution backprop input

    Models the backward-input pass as a forward convolution over the
    (stride-expanded) output gradient with unit stride, then enumerates
    ConvBackpropInputConfig candidates that fit the on-chip buffers.
    Returns the 5-tuple (None, config_space, key, None, None).

    Raises TypeError when op_desc is not a ConvBackpropDesc.
    """
    if not isinstance(op_desc, ConvBackpropDesc):
        # Fixed error message: the check is for ConvBackpropDesc, but the
        # original message claimed 'ConvDesc'.
        raise TypeError('op_desc must be ConvBackpropDesc')
    stride_ = op_desc.stride
    pad_ = op_desc.pad
    dilation_ = op_desc.dilation
    vc_util.convolution_format_check(
        op_desc.fmap_shape, op_desc.filter_shape, pad_, stride_, dilation_)
    config_space = ListConfigSpace(ConvBackpropInputConfig)
    # if double buff is not enabled, set it's value to 1
    size_scale = 1
    block_size = 16
    l1_max_size = (1024 * 1024) // size_scale
    l0a_max_size = (64 * 1024) // size_scale
    l0b_max_size = (64 * 1024) // size_scale
    l0c_max_size = ((256 - 8) * 1024) // size_scale // 2
    ub_max_size = l0c_max_size
    _, in_c, in_h, in_w = op_desc.fmap_shape
    k_n, _, k_h, k_w = op_desc.filter_shape
    # Round channel counts up to a multiple of the cube block size.
    in_c = (in_c + block_size - 1) // block_size * block_size
    k_n = (k_n + block_size - 1) // block_size * block_size
    pad_top, pad_bottom, pad_left, pad_right = pad_
    stride_h, stride_w = stride_
    # Extents of the stride-expanded output gradient.
    out_c = k_n
    out_h = (in_h + pad_top + pad_bottom - k_h) // stride_h + 1
    out_w = (in_w + pad_left + pad_right - k_w) // stride_w + 1
    out_h = out_h * stride_h
    out_w = out_w * stride_w
    # Equivalent forward padding for the transposed convolution.
    p_top = k_h - pad_[0] - 1
    p_bottom = in_h + pad_[0] - stride_[0] * \
        ((in_h + pad_[0] + pad_[1] - k_h) // stride_[0] + 1)
    p_left = k_w - pad_[2] - 1
    p_right = in_w + pad_[2] - stride_[1] * \
        ((in_w + pad_[2] + pad_[3] - k_w) // stride_[1] + 1)
    # The equivalent forward convolution always has unit stride.
    s_h = 1
    s_w = 1
    tile_c = out_c
    tile_co_start = 16
    data_len = 2  # bytes per element -- presumably fp16; confirm against kernel dtype
    h_max = out_h + p_top + p_bottom
    win_h = (h_max - k_h) // s_h + 1
    h_max = (h_max - k_h) // s_h * s_h + k_h
    w_max = out_w + p_left + p_right
    win_w = (w_max - k_w) // s_w + 1
    w_max = (w_max - k_w) // s_w * s_w + k_w
    for tile_h in range(h_max, k_h - 1, -s_h):
        size_h = tile_h
        if tile_h == h_max:
            w_range = range(w_max, k_w - 1, -s_w)
            size_h = in_h
        else:
            w_range = [w_max]
            win_tile_h = (tile_h - k_h) // s_h + 1
            h_tiles = (win_h + win_tile_h - 1) // win_tile_h
            if h_tiles == 2:
                # Exactly two H tiles: resident size is the larger boundary slice.
                size_h = max(tile_h - p_top, in_h + p_top - tile_h + k_h - s_h)
        for tile_w in w_range:
            size_w = tile_w
            if size_w == w_max:
                size_w = in_w
            else:
                win_tile_w = (tile_w - k_w) // s_w + 1
                w_tiles = (win_w + win_tile_w - 1) // win_tile_w
                if w_tiles == 2:
                    size_w = max(tile_w - p_left, in_w +
                                 p_left - tile_w + k_w - s_w)
            k_n_ = ((k_n - 1) // 16 + 1) * 16
            co_range = range(k_n_, tile_co_start - 1, -16)
            for tile_co in co_range:
                l1_size = data_len * (size_h * size_w * out_c +
                                      tile_co * tile_c * k_h * k_w)
                if l1_size > l1_max_size:
                    continue
                ub_size = data_len * (size_h * size_w * out_c)
                if ub_size > ub_max_size:
                    continue
                tile_co_ = ((tile_co - 1) // 16 + 1) * 16
                for tile_n in range(tile_co_, 15, -16):
                    # K is tiled in multiples of a whole filter window (16*kh*kw).
                    k_max = out_c * k_h * k_w
                    k_base = 16 * k_h * k_w
                    k_max_ = ((k_max - 1) // k_base + 1) * k_base
                    k_size = l0b_max_size // data_len // tile_n
                    k_size_ = k_size // k_base * k_base
                    for tile_k in range(min(k_max_, k_size_), k_base - 1, -k_base):
                        m_max = (int(((tile_h - k_h) // (s_h)) + 1)) * \
                            (int(((tile_w - k_w) // (s_w)) + 1))
                        m_max_ = ((m_max - 1) // 16 + 1) * 16
                        m_size1 = l0a_max_size // data_len // tile_k
                        m_size1_ = m_size1 // 16 * 16
                        m_size2 = l0c_max_size // data_len // tile_n
                        m_size2_ = m_size2 // 16 * 16
                        for tile_m in range(min(m_max_, m_size1_, m_size2_), 15, -16):
                            config_space.add(ConvBackpropInputConfig(tile_h, tile_co, tile_m,
                                                                     tile_k, tile_n, tile_w))
    return None, config_space, op_desc.__str__(), None, None
def _get_space_conv_backprop_filter(op_desc: ConvBackpropDesc):
    """get config space of convolution backwprop filter

    Enumerates ConvBackpropFilterConfig candidates that fit the on-chip
    buffers and collects them in a ListConfigSpace.  Returns the 5-tuple
    (None, config_space, key, None, None).

    Raises TypeError when op_desc is not a ConvBackpropDesc.
    """
    if not isinstance(op_desc, ConvBackpropDesc):
        raise TypeError('op_desc must be ConvBackpropDesc')
    stride_ = op_desc.stride
    pad_ = op_desc.pad
    dilation_ = op_desc.dilation
    vc_util.convolution_format_check(
        op_desc.fmap_shape, op_desc.filter_shape, pad_, stride_, dilation_)
    config_space = ListConfigSpace(ConvBackpropFilterConfig)
    # if double buff is not enabled, set it's value to 1
    size_scale = 1
    block_size = 16
    l1_max_size = (1024 * 1024) // size_scale
    l0a_max_size = (64 * 1024) // size_scale
    l0b_max_size = (64 * 1024) // size_scale
    l0c_max_size = ((256 - 8) * 1024) // size_scale // 2
    in_n, in_c, in_h, in_w = op_desc.fmap_shape
    cout, _, k_h, k_w = op_desc.filter_shape
    k_n = cout  # un-rounded Cout, used below for the full-co comparison
    # Round channel counts up to a multiple of the cube block size.
    in_c = (in_c + block_size - 1) // block_size * block_size
    cout = (cout + block_size - 1) // block_size * block_size
    pad_top, pad_bottom, pad_left, pad_right = pad_
    s_h, s_w = stride_
    tile_co_start = 16
    tile_ci_start = 16
    data_len = 2  # bytes per element -- presumably fp16; confirm against kernel dtype
    h_max = in_h + pad_top + pad_bottom
    win_h = (h_max - k_h) // s_h + 1
    h_max = (h_max - k_h) // s_h * s_h + k_h
    w_max = in_w + pad_left + pad_right
    win_w = (w_max - k_w) // s_w + 1
    w_max = (w_max - k_w) // s_w * s_w + k_w
    for tile_h in range(h_max, k_h - 1, -s_h):
        size_h = tile_h
        win_tile_h = (tile_h - k_h) // s_h + 1
        # Only one head for cut H axis
        if win_tile_h * s_h < pad_top:
            continue
        # Only one tail for cut H axis
        if (((win_h + win_tile_h - 1) // win_tile_h - 1) * win_tile_h - 1) * s_h + k_h > in_h + pad_top:
            continue
        if tile_h == h_max:
            w_range = range(w_max, k_w - 1, -s_w)
            size_h = in_h
        else:
            w_range = [w_max]
            h_tiles = (win_h + win_tile_h - 1) // win_tile_h
            if h_tiles == 2:
                # Exactly two H tiles: resident size is the larger boundary slice.
                size_h = max(tile_h - pad_top, in_h +
                             pad_top - tile_h + k_h - s_h)
        for tile_w in w_range:
            size_w = tile_w
            win_tile_w = (tile_w - k_w) // s_w + 1
            # Only one head for cut W axis
            if win_tile_w * s_w < pad_left:
                continue
            # Only one tail for cut W axis
            if (((win_w + win_tile_w - 1) // win_tile_w - 1) * win_tile_w - 1) * s_w + k_w > in_w + pad_left:
                continue
            if size_w == w_max:
                size_w = in_w
            else:
                w_tiles = (win_w + win_tile_w - 1) // win_tile_w
                if w_tiles == 2:
                    size_w = max(tile_w - pad_left, in_w +
                                 pad_left - tile_w + k_w - s_w)
            for tile_kh in range(k_h, 0, -1):
                for tile_kw in range(k_w, 0, -1):
                    k_n_ = ((k_n - 1) // 16 + 1) * 16
                    co_range = range(k_n_, tile_co_start - 1, -16)
                    for tile_co in co_range:
                        in_c_ = ((in_c - 1) // 16 + 1) * 16
                        ci_range = range(in_c_, tile_ci_start - 1, -16)
                        for tile_ci in ci_range:
                            tile_batch = 1  # batch is never tiled beyond 1 here
                            l1_size = data_len * tile_batch * (tile_co * win_tile_h * win_tile_w +
                                                               tile_ci * size_h * size_w)
                            if l1_size > l1_max_size:
                                continue
                            if (tile_batch != in_n or tile_co != k_n_ or tile_ci != in_c_):
                                # Tile does NOT cover the full batch/co/ci extent:
                                # M is fixed to tile_co and N to tile_ci*kh*kw.
                                tile_m = tile_co
                                tile_n = tile_ci * tile_kh * tile_kw
                                l0c_size = data_len * tile_n * tile_m
                                if l0c_size > l0c_max_size:
                                    continue
                                k_max = tile_batch * tile_h * tile_w
                                k_max_ = ((k_max - 1) // 16 + 1) * 16
                                k_size1 = l0a_max_size // data_len // tile_m
                                k_size1_ = k_size1 // 16 * 16
                                k_size2 = l0b_max_size // data_len // tile_n
                                k_size2_ = k_size2 // 16 * 16
                                for tile_k in range(min(k_max_, k_size1_, k_size2_), 15, -16):
                                    config_space.add(ConvBackpropFilterConfig(tile_ci, tile_kh, tile_kw, tile_co,
                                                                              tile_batch, tile_h, tile_w, tile_m,
                                                                              tile_k, tile_n))
                            else:
                                # Full-extent tile: additionally sweep M and N.
                                for tile_n in range(tile_ci * tile_kh * tile_kw, 15, -16):
                                    k_max = tile_batch * tile_h * tile_w
                                    k_max_ = ((k_max - 1) // 16 + 1) * 16
                                    k_size = l0b_max_size // data_len // tile_n
                                    k_size_ = k_size // 16 * 16
                                    for tile_k in range(min(k_max_, k_size_), 15, -16):
                                        m_max = tile_co
                                        m_max_ = ((m_max - 1) // 16 + 1) * 16
                                        m_size1 = l0a_max_size // data_len // tile_k
                                        m_size1_ = m_size1 // 16 * 16
                                        m_size2 = l0c_max_size // data_len // tile_n
                                        m_size2_ = m_size2 // 16 * 16
                                        for tile_m in range(min(m_max_, m_size1_, m_size2_), 15, -16):
                                            config_space.add(ConvBackpropFilterConfig(tile_ci, tile_kh, tile_kw,
                                                                                      tile_co, tile_batch, tile_h,
                                                                                      tile_w, tile_m, tile_k, tile_n))
    return None, config_space, op_desc.__str__(), None, None
def _get_space_matmul_cube(op_desc: MatmulCubeDesc):
    """get config space of matmul_cube

    Enumerates (n_l1, n_l0, m_l1, m_l0, k_l1, k_l0, bypass) candidates in
    16-wide block units, constrained by the L1/L0A/L0B/L0C/UB budgets, and
    collects them in a ListConfigSpace.  Returns the 5-tuple
    (None, config_space, key, None, None).

    Raises TypeError when op_desc is not a MatmulCubeDesc.
    """
    if not isinstance(op_desc, MatmulCubeDesc):
        raise TypeError('op_desc must be MatmulCubeDesc')
    config_space = ListConfigSpace(MatmulCubeConfig)
    batch_tuple, m, k, n = matmul_run.extract_dim(
        op_desc.x_shape, op_desc.y_shape, op_desc.adj_x, op_desc.adj_y)
    # Dimension extents in 16-element blocks, rounded up.
    mmax = (m + 15) // 16
    nmax = (n + 15) // 16
    kmax = (k + 15) // 16
    double_buffer = True
    mad_fp32 = True
    l1_max_size = (1024 * 1024)  # L1 MEM 1024KB
    l0a_max_size = (64 * 1024)  # L0A MEM 64KB
    l0b_max_size = (64 * 1024)  # L0B MEM 64KB
    l0c_max_size = (256 * 1024)  # L0C MEM 256KB
    # UB MEM 248KB, 8KB reserved for compiler
    ub_max_size = ((256 - 8) * 1024)
    if double_buffer:
        # Double buffering halves every usable budget.
        l1_max_size = l1_max_size // 2
        l0a_max_size = l0a_max_size // 2
        l0b_max_size = l0b_max_size // 2
        l0c_max_size = l0c_max_size // 2
        ub_max_size = ub_max_size // 2
    if mad_fp32:
        l0c_max_size = l0c_max_size // 2
    if op_desc.out_dtype == 'float32':
        ub_max_size = ub_max_size // 2
    bypass_options = [0, 1, 2]
    for bypass in bypass_options:
        # Bypass modes are only valid for certain operand layouts.
        if (bypass == 2) and ((op_desc.adj_x == False and op_desc.left_format[0].lower() == 'n') or
                              (op_desc.adj_x == True and op_desc.left_format[0].lower() == 'z')):
            continue
        if (bypass == 1) and ((op_desc.adj_y == False and op_desc.right_format[0].lower() == 'z') or
                              (op_desc.adj_y == True and op_desc.right_format[0].lower() == 'n')):
            continue
        # Only divisors of each extent are considered as tile sizes.
        for k_l1 in range(1, kmax + 1):
            if kmax % k_l1 != 0:
                continue
            for k_l0 in range(1, k_l1 + 1):
                if k_l1 % k_l0 != 0:
                    continue
                # no need to cut from l1 to l0 for m and n when k is cut
                for m_l1 in range(1, mmax + 1):
                    if mmax % m_l1 != 0:
                        continue
                    m_l0_range = [m_l1] if k_l1 != kmax else range(1, m_l1 + 1)
                    for m_l0 in m_l0_range:
                        if m_l1 % m_l0 != 0:
                            continue
                        for n_l1 in range(1, nmax + 1):
                            if nmax % n_l1 != 0:
                                continue
                            n_l0_range = [n_l1] if k_l1 != kmax else range(
                                1, n_l1 + 1)
                            for n_l0 in n_l0_range:
                                if n_l1 % n_l0 != 0:
                                    continue
                                # Buffer-fit checks (tiles are n*16 elements wide).
                                if m_l0 * 16 * k_l0 * 16 > l0a_max_size:
                                    continue
                                if n_l0 * 16 * k_l0 * 16 > l0b_max_size:
                                    continue
                                if m_l0 * 16 * n_l0 * 16 > l0c_max_size:
                                    continue
                                if m_l0 * 16 * n_l0 * 16 > ub_max_size:
                                    continue
                                if bypass == 2:
                                    l1_size = n_l1 * 16 * k_l1 * 16
                                elif bypass == 1:
                                    l1_size = m_l1 * 16 * k_l1 * 16
                                else:
                                    l1_size = (m_l1 * 16 + n_l1 *
                                               16) * k_l1 * 16
                                if l1_size > l1_max_size:
                                    continue
                                # NOTE(review): degenerate extents are re-encoded
                                # as special values (0 for single-block m/n,
                                # 16 for single-block k) -- presumably sentinel
                                # values understood downstream; confirm.
                                if nmax == 1:
                                    n_l1 = 0
                                    n_l0 = 0
                                if mmax == 1:
                                    m_l1 = 0
                                    m_l0 = 0
                                if kmax == 1:
                                    k_l1 = 16
                                    k_l0 = 16
                                config_space.add(MatmulCubeConfig(
                                    n_l1, n_l0, m_l1, m_l0, k_l1, k_l0, bypass))
    shape_xx, shape_yy, _, _, k = matmul_run.get_converted_shapes(m, n, k, batch_tuple, op_desc.adj_x, op_desc.adj_y,
                                                                  op_desc.bias, op_desc.left_format,
                                                                  op_desc.right_format, op_desc.out_format)
    return None, config_space, str((shape_xx, shape_yy, op_desc.bias, op_desc.left_format, op_desc.right_format,
                                    op_desc.out_format, op_desc.adj_x, op_desc.adj_y, op_desc.dtype,
                                    op_desc.out_dtype)), None, None
| def _get_space_batch_matmul_gpu(op_type: str, op_desc, tuning_attrs=[], tuning_attrs_info=None): | |||
| """get config space of batch_matmul operator in gpu""" | |||
| return | |||
def get_range_block(space_res):
    """Return (x, y, z) candidate ranges for the GPU block dimensions.

    Ranges are built from the block range/mod tables of ``space_res``.
    An empty y or z range falls back to the single candidate 1; the x
    range is used as-is.
    """
    bounds = space_res.gpu_block_range_table.asnumpy().tolist()
    steps = space_res.gpu_block_mod_table.asnumpy().tolist()
    axes = []
    for axis in range(3):
        candidates = range(bounds[axis][0], bounds[axis][1] + 1, steps[axis][0])
        if axis > 0 and len(candidates) == 0:
            candidates = range(1, 2)
        axes.append(candidates)
    return axes[0], axes[1], axes[2]
def get_range_thread(space_res):
    """Return (x, y, z) candidate ranges for the GPU thread dimensions.

    Ranges are built from the thread range/mod tables of ``space_res``.
    An empty y or z range falls back to the single candidate 1; the x
    range is used as-is.
    """
    bounds = space_res.gpu_thread_range_table.asnumpy().tolist()
    steps = space_res.gpu_thread_mod_table.asnumpy().tolist()
    axes = []
    for axis in range(3):
        candidates = range(bounds[axis][0], bounds[axis][1] + 1, steps[axis][0])
        if axis > 0 and len(candidates) == 0:
            candidates = range(1, 2)
        axes.append(candidates)
    return axes[0], axes[1], axes[2]
def get_space_with_block_thread(tiling_spaces, space_res, policy=GpuSpacePolicy.FULL):
    """Expand tiling candidates with GPU block/thread dimension candidates.

    Each entry of ``tiling_spaces`` (a list of tile values) is copied and
    extended with six values [bx, by, bz, tx, ty, tz] according to
    ``policy``:
      - FULL: full cross product of all block and thread ranges.
      - BMM: threads are swept; blocks derived as tile extent // thread.
      - REDUCE_ALL: block.x fixed to ceil(total_shape / tile[0]).
    Any other policy raises ValueError.  Returns the expanded list.

    NOTE(review): GpuSpacePolicy.REDUCE_X / REDUCE_Y are not handled here
    and fall into the ValueError branch -- confirm intended.
    """
    total_shape = max([max(v) for v in tiling_spaces])
    new_spaces = []
    block_x_range, block_y_range, block_z_range = get_range_block(space_res)
    thread_x_range, thread_y_range, thread_z_range = get_range_thread(space_res)
    pbar = tqdm(total=len(tiling_spaces))
    max_thread = 1024  # cap on tx*ty*tz -- presumably the hw threads-per-block limit
    for space in tiling_spaces:
        pbar.set_description("Adding block, thread to spaces")
        if policy == GpuSpacePolicy.REDUCE_ALL:
            # block.x is the single value ceil(total_shape / tile_x).
            for bx in range((total_shape-1)//space[0]+1,(total_shape-1)//space[0]+2):
                for by in block_y_range:
                    for bz in block_z_range:
                        for tx in thread_x_range:
                            for ty in thread_y_range:
                                for tz in thread_z_range:
                                    if tx * ty * tz > max_thread:
                                        continue
                                    tmp_space = space[:]
                                    tmp_space.append(bx)
                                    tmp_space.append(by)
                                    tmp_space.append(bz)
                                    tmp_space.append(tx)
                                    tmp_space.append(ty)
                                    tmp_space.append(tz)
                                    new_spaces.append(tmp_space)
        elif policy == GpuSpacePolicy.BMM:
            for tx in thread_x_range:
                for ty in thread_y_range:
                    for tz in thread_z_range:
                        if tx * ty * tz > max_thread:
                            continue
                        tmp_space = space[:]
                        # Threads may not exceed the innermost tile extents.
                        if tx > tmp_space[-1] or (len(tmp_space) >= 2 and ty > tmp_space[-2]) or (len(tmp_space) >= 3 and tz > tmp_space[-3]):
                            continue
                        # Blocks cover whatever the threads do not.
                        bx = max(1, tmp_space[-1] // tx)
                        by = max(1, tmp_space[-2] // ty) if len(tmp_space) >= 2 else 1
                        bz = max(1, tmp_space[-3] // tz) if len(tmp_space) >= 3 else 1
                        if bx >= block_x_range.stop or by >= block_y_range.stop or bz >= block_z_range.stop:
                            continue
                        tmp_space.append(bx)
                        tmp_space.append(by)
                        tmp_space.append(bz)
                        tmp_space.append(tx)
                        tmp_space.append(ty)
                        tmp_space.append(tz)
                        new_spaces.append(tmp_space)
        elif policy == GpuSpacePolicy.FULL:
            # Unfiltered cross product (no max_thread check in this branch).
            for bx in block_x_range:
                for by in block_y_range:
                    for bz in block_z_range:
                        for tx in thread_x_range:
                            for ty in thread_y_range:
                                for tz in thread_z_range:
                                    tmp_space = space[:]
                                    tmp_space.append(bx)
                                    tmp_space.append(by)
                                    tmp_space.append(bz)
                                    tmp_space.append(tx)
                                    tmp_space.append(ty)
                                    tmp_space.append(tz)
                                    new_spaces.append(tmp_space)
        else:
            raise ValueError("Policy {} is not defined.".format(policy))
        pbar.update(1)
    print("total spaces size is: ",len(new_spaces))
    return new_spaces
| def _get_space_conv_image2col_gemm_gpu(op_type: str, op_desc, tuning_attrs=[], tuning_attrs_info=None): | |||
| """get config space of conv_image2col_gemm operators in gpu""" | |||
| return | |||
# Dispatch table: operator type name -> space-generating function.
# Op types missing here fall back to the generic _get_space_vector
# (see get_space below).
_get_space_func = {
    'conv': _get_space_conv,
    'conv_bn1': _get_space_conv_bn1,
    'conv_backprop_input': _get_space_conv_backprop_input,
    'conv_backprop_filter': _get_space_conv_backprop_filter,
    'matmul': _get_space_matmul_cube,
    "reduce_sum_gpu": _get_space_reduce_gpu_manually,
    "batch_matmul_gpu": _get_space_batch_matmul_gpu,
    "conv_image2col_gemm_gpu": _get_space_conv_image2col_gemm_gpu,
}
def get_space(op_type: str, op_desc: NamedTuple, tuning_attrs=None, tuning_attrs_info=None):
    """get space of an operator

    Dispatches to the op-specific generator registered in _get_space_func,
    falling back to the generic vector-space generator for unknown op
    types.  GPU op types are additionally passed the tuning attributes.

    `tuning_attrs` defaults to None and is normalized to a fresh list,
    avoiding the shared mutable-default pitfall of the previous `=[]`
    while keeping the same value seen by callees.

    NOTE(review): a GPU op type absent from _get_space_func would reach the
    _get_space_vector fallback with a duplicate op_type keyword -- confirm
    all "*gpu" op types are registered.
    """
    if tuning_attrs is None:
        tuning_attrs = []
    func = _get_space_func.get(op_type, None)
    if func is None:
        func = partial(_get_space_vector, op_type=op_type)
    if "gpu" in op_type:
        return func(op_type=op_type, op_desc=op_desc, tuning_attrs=tuning_attrs, tuning_attrs_info=tuning_attrs_info)
    return func(op_desc=op_desc)
| @@ -0,0 +1,147 @@ | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| """Generating test data for operators""" | |||
| from typing import NamedTuple | |||
| import numpy as np | |||
| from gen_json_data import gen_json_data | |||
| from test_run import batchmatmul_run, conv_run, conv_backprop_input_run, conv_backprop_filter_run, matmul_run | |||
| from .type_definitions import ConvDesc, ConvBackpropDesc, MatmulCubeDesc | |||
def _gen_data_json(op_desc):
    """Generating test data for composite json"""
    inputs, expect, _ = gen_json_data(op_desc)
    return inputs, expect
def _gen_data_conv(op_desc: ConvDesc):
    """Generating test data for conv"""
    fmap, weights, bias, expect = conv_run.gen_data(op_desc.fmap_shape, op_desc.filter_shape,
                                                    op_desc.pad, op_desc.stride,
                                                    op_desc.dilation, op_desc.use_bias)
    # Zero-filled fp16 buffer for the kernel to write its result into.
    output = np.full(expect.shape, 0, 'float16')
    args = (fmap, weights, bias, output) if op_desc.use_bias else (fmap, weights, output)
    return args, expect
def _gen_data_conv_bn1(op_desc: ConvDesc):
    """Generating test data for conv_bn1"""
    fmap, weights, bias, conv_expect = conv_run.gen_data(op_desc.fmap_shape, op_desc.filter_shape,
                                                         op_desc.pad, op_desc.stride,
                                                         op_desc.dilation, op_desc.use_bias)
    # Reduce over N, H, W to get the per-channel BN statistics.
    reduce_axes = (0, 2, 3)
    conv_mean = np.mean(conv_expect, axis=reduce_axes, keepdims=True)
    conv_var_part = np.mean(np.power(conv_expect, 2), axis=reduce_axes, keepdims=True)
    expects = (conv_expect, conv_var_part, conv_mean)
    outputs = [np.full(e.shape, 0, 'float16') for e in expects]
    # The statistics outputs are accumulated in fp32.
    outputs[1] = outputs[1].astype(np.float32)
    outputs[2] = outputs[2].astype(np.float32)
    if op_desc.use_bias:
        args = [fmap, weights, bias]
    else:
        args = [fmap, weights]
    args.extend(outputs)
    # The last three args are the kernel's output slots.
    return {"args": tuple(args), 'outputs': (-3, -2, -1)}, expects
def _gen_data_conv_backprop_input(op_desc: ConvBackpropDesc):
    """Generating test data for conv_backprop_input."""
    grad, weights, dx = conv_backprop_input_run.gen_data(op_desc.fmap_shape, op_desc.filter_shape,
                                                         op_desc.pad, op_desc.stride,
                                                         op_desc.dilation)
    # Zero-filled fp16 buffer for the kernel to write its result into.
    output = np.full(dx.shape, 0, 'float16')
    return (grad, weights, output), dx
def _gen_data_conv_backprop_filter(op_desc: ConvBackpropDesc):
    """Generating test data for conv_backprop_filter"""
    block_size = 16
    in_n, in_c, in_h, in_w = op_desc.fmap_shape
    cout, _, w_h, w_w = op_desc.filter_shape
    # Round channel counts up to a multiple of the cube block size.
    in_c = (in_c + block_size - 1) // block_size * block_size
    cout = (cout + block_size - 1) // block_size * block_size
    dy_data, dx_data, expect = conv_backprop_filter_run.gen_data((in_n, in_c, in_h, in_w),
                                                                 (cout, in_c, w_h, w_w),
                                                                 op_desc.pad, op_desc.stride,
                                                                 op_desc.dilation)
    # Zero-filled fp32 buffer for the kernel to write the filter grad into.
    output = np.full(expect.shape, 0, 'float32')
    return (dy_data, dx_data, output), expect
def _gen_data_matmul_cube(op_desc: MatmulCubeDesc):
    """Generating test data for matmul_cube"""
    batch_tuple, m, k, n = matmul_run.extract_dim(op_desc.x_shape, op_desc.y_shape,
                                                  op_desc.adj_x, op_desc.adj_y)
    # Align every dimension up to a multiple of the 16-wide cube block.
    m, n, k = ((dim + 15) // 16 * 16 for dim in (m, n, k))
    _, _, _, out_shape, k = matmul_run.get_converted_shapes(m, n, k, batch_tuple, op_desc.adj_x,
                                                            op_desc.adj_y, op_desc.bias,
                                                            op_desc.left_format, op_desc.right_format,
                                                            op_desc.out_format)
    m_x, m_y, bench_mark, bias_data = matmul_run.matmul_data(batch_tuple, m, k, n, op_desc.dtype,
                                                             op_desc.bias_dtype, op_desc.out_dtype,
                                                             op_desc.bias, op_desc.adj_x,
                                                             op_desc.adj_y, op_desc.left_format,
                                                             op_desc.right_format, op_desc.out_format)
    # NaN-fill the output buffer so untouched elements are detectable.
    out_data = np.full(out_shape, np.nan, op_desc.out_dtype)
    args = (m_x, m_y, bias_data, out_data) if op_desc.bias else (m_x, m_y, out_data)
    return args, bench_mark
# Dispatch table mapping an op-type name to its test-data generator;
# consumed by gen_data().
_gen_data_func = {
    'json': _gen_data_json,
    'conv': _gen_data_conv,
    'conv_bn1': _gen_data_conv_bn1,
    'conv_backprop_input': _gen_data_conv_backprop_input,
    'conv_backprop_filter': _gen_data_conv_backprop_filter,
    'matmul': _gen_data_matmul_cube,
}
def gen_data(op_type: str, op_desc: NamedTuple):
    """Generate test data for an operator.

    Parameters
    ----------
    op_type: str
        Operator name; must be a key of ``_gen_data_func``.
    op_desc: NamedTuple
        Operator definition parameters.

    Returns
    -------
    Whatever the registered generator returns (typically ``(args, expect)``).

    Raises
    ------
    ValueError
        If ``op_type`` has no registered generator.
    """
    # .get() already defaults to None; the explicit default was redundant.
    gen_func = _gen_data_func.get(op_type)
    if gen_func is None:
        raise ValueError('Unsupported op type for test data generating: %s' % op_type)
    return gen_func(op_desc)
| @@ -0,0 +1,84 @@ | |||
| from akg.utils import custom_tiling as ct_util | |||
def reduce_gpu_tiling_strategy(in_shape, reduce_axis):
    """Custom tiling strategy for reduce op in gpu.

    Parameters
    ----------
    in_shape: sequence
        Shape of the reduced input tensor.
    reduce_axis: sequence or None
        Axes being reduced; None means all axes (all-reduce).

    Returns
    -------
    list
        Tiling constraints built via ct_util.
    """
    strategy = []
    # Fixed: `reduce_axis == None` replaced with the idiomatic `is None`;
    # the bare triple-quoted strings (no-op statements) became comments.
    if reduce_axis is None or len(reduce_axis) == len(in_shape):
        # all-reduce: axis-0 tile must be a multiple of the warp size (32)
        # and thread.x is bounded to [32, 1024].
        strategy.append(
            ct_util.create_constraint_on_axis(
                values=32, constraints=ct_util.TileConstraint.MOD, band=0, axis=0
            )[0]
        )
        strategy.append(
            ct_util.modify_common_constraints(
                value=[32, 1, 1], constraint=ct_util.TileConstraint.THREAD_MOD
            )
        )
        strategy.append(
            ct_util.modify_common_constraints(
                value=[1024, 1, 1], constraint=ct_util.TileConstraint.THREAD_MAX
            )
        )
        strategy.append(
            ct_util.modify_common_constraints(
                value=[32, 1, 1], constraint=ct_util.TileConstraint.THREAD_MIN
            )
        )
    else:
        # Reduce-X (innermost axis reduced) and Reduce-Y previously had two
        # byte-identical branches; both use the same dummy strategy for the
        # hand-written space: cap tiles on axes 0/1 and launch dims at 1.
        for axis in (0, 1):
            strategy.append(
                ct_util.create_constraint_on_axis(
                    values=1, constraints=ct_util.TileConstraint.MAX, band=0, axis=axis
                )[0]
            )
        for constraint in (ct_util.TileConstraint.THREAD_MAX,
                           ct_util.TileConstraint.BLOCK_MAX):
            strategy.append(
                ct_util.modify_common_constraints(value=[1, 1, 1], constraint=constraint)
            )
    return strategy
def conv_dummy_strategy():
    """Dummy conv strategy: placeholder that generates no constraints."""
    return None
def batch_matmul_gpu_tiling_strategy(desc):
    """Placeholder strategy for batch matmul on gpu (tensor core or not).

    Currently generates no constraints regardless of *desc*.
    """
    return None
| @@ -0,0 +1,359 @@ | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| """Tuner for finding best config for operators""" | |||
| import logging | |||
| import time | |||
| import json | |||
| import os | |||
| import numpy as np | |||
| from multiprocessing import Process | |||
| from tvm.autotvm.tuner.xgboost_cost_model import XgbCostModel | |||
| from tvm.autotvm.tuner.sa_model_optimizer import SimulatedAnnealingOptimizer | |||
| from .space import ConfigSpace | |||
| from .runner import KernelRunner | |||
| from tqdm import tqdm | |||
| logger = logging.getLogger('fuzz.tune.autotuning.tuner') | |||
class Tuner:
    """Basic grid-search tuner.

    Walks a ConfigSpace, runs candidate configs on a physical device
    through a KernelRunner, and tracks the best observed run time.

    Parameters
    ----------
    runner: KernelRunner
        Executes kernels on the physical device and reports run times.
    index_table: list
        Index lookup table for the space (stored for runner/subclass use).
    config_space: ConfigSpace
        The space of candidate configs.
    n_parallel: int
        How many kernels are processed in one batch.
    skip_config_set: set or None
        Configs to exclude from tuning (stored; not consulted in this class).
    """
    def __init__(self, runner: KernelRunner, index_table: list, config_space: ConfigSpace, n_parallel: int = 1, skip_config_set=None):
        self._runner = runner
        self._index_table = index_table
        self._space = config_space
        self._n_parallel = n_parallel
        # trial plan: candidate indices proposed ahead of time (e.g. by a cost model)
        self._trials = []
        self._trial_pt = 0  # read pointer into self._trials
        self._visited = set()  # config indices already measured
        # observed samples: parallel lists of config index and measured run time
        self._xs = []
        self._ys = []
        # keep the current best
        self._best_config = None  # type: ConfigEntity
        self._best_time = np.inf
        self._best_iter = 0
        self._tuning_time = 0.0  # accumulated wall-clock tuning time (secs)
        self._original_time = np.inf  # baseline time before tuning (set by subclasses)
        self._skip_config_set = skip_config_set
    @property
    def best_config(self):
        """Best config found so far (None until one is measured)."""
        return self._best_config
    @property
    def best_time(self):
        """Lowest run time observed so far."""
        return self._best_time
    @property
    def best_iter(self):
        """Iteration index at which the best config was found."""
        return self._best_iter
    @property
    def tuning_time(self):
        """Total tuning wall-clock time in seconds."""
        return self._tuning_time
    @property
    def original_time(self):
        """Baseline run time measured before tuning."""
        return self._original_time
    @property
    def xs(self):
        """Measured config indices (parallel to ys)."""
        return self._xs
    @property
    def ys(self):
        """Measured run times (parallel to xs)."""
        return self._ys
    def info(self):
        """Print a summary of the tuning session."""
        print('space size:', self._space.length)
        print('best config:', self._best_config)
        print('best time:', self._best_time)
        print('best_iter:', self._best_iter)
        print('tuning time:', self._tuning_time, 'secs')
    def next_batch(self, batch_size: int, is_add_visited=True):
        """extract next batch with xgboost model

        When is_add_visited is False, returns the first batch_size configs
        of the space without marking them visited (preview mode).
        """
        ret = []
        counter = 0
        if not is_add_visited:
            return [self._space.get(index) for index in range(min(batch_size, self._space.length))]
        while counter < batch_size and self._space.has_next():
            index = 0
            # prefer planned trial indices that have not been measured yet
            while self._trial_pt < len(self._trials):
                index = self._trials[self._trial_pt]
                if index not in self._visited:
                    break
                self._trial_pt += 1
            if self._trial_pt >= len(self._trials):
                # if the trial list is empty choose randomly
                index = self._space.fetch_index()
            ret.append(self._space.get(index))
            self._visited.add(index)
            counter += 1
        return ret
    def next_config(self, batch_size: int):
        """extract next config orderly (sequential walk of the space)"""
        ret = []
        counter = 0
        while counter < batch_size and self._space.has_next():
            index = self._space.fetch_next_index()
            ret.append(self._space.get(index))
            self._visited.add(index)
            counter += 1
        return ret
    def export_configs(self, configs: list, output_file: str, append: bool = True, desc=""):
        """export configs

        Writes one "desc | config-json | time" line per entry; entries with
        a run time of -1 (failed runs) are skipped.
        """
        mode = "a" if append else "w"
        with open(output_file, mode) as f:
            for x, y in configs:
                if y != -1:
                    f.write("{} | {} | {}\n".format(desc, json.dumps(x._asdict()), y))
    def export_dim_configs(self, configs, output_file: str, append: bool = True, key=""):
        """export dim configs

        Merges *configs* under *key* into the JSON already in output_file
        and writes the whole dict back, one key per line.
        NOTE(review): with append=True the full re-dump is appended after the
        existing content, which leaves the file as two concatenated JSON
        documents — confirm whether "w" was intended here.
        """
        mode = "a" if append else "w"
        data = {}
        try:
            if os.path.isfile(output_file):
                with open(output_file, 'r') as f:
                    data = json.load(f)
        except IOError as e:
            logger.debug("get dim info from [%s] failed: %s", output_file, str(e))
        with open(output_file, mode) as f:
            import re
            data[key] = configs
            s = json.dumps(data, sort_keys=True)
            # break the single-line dump so each top-level key sits on its own line
            s = re.sub(r',\s*"', ',\n"', s)
            s = '{\n' + s[1:-1] + '\n}'
            f.write(s)
    def export_dim_configs_for_keys(self, configs, output_file: str, append: bool = True, keys=[]):
        """export dim configs

        Nests *configs* under the given key path (keys[0] outermost) and
        merges only the outermost key into the existing JSON data.
        NOTE(review): same append-vs-overwrite concern as export_dim_configs;
        also the mutable default `keys=[]` is shared across calls.
        """
        mode = "a" if append else "w"
        data = {}
        try:
            if os.path.isfile(output_file):
                with open(output_file, 'r') as f:
                    data = json.load(f)
        except IOError as e:
            logger.debug("get dim info from [%s] failed: %s", output_file, str(e))
        with open(output_file, mode) as f:
            import copy
            tmp = copy.deepcopy(configs)
            # wrap configs innermost-first so keys[0] becomes the outer key
            for key in reversed(keys):
                info = {key: tmp}
                tmp = copy.deepcopy(info)
            data.update(info)
            s = json.dumps(data, sort_keys=True, indent=4)
            print(s)
            f.write(s)
    def load_configs(self, input_file: str):
        """load configs

        NOTE(review): export_configs writes "desc | json | time", but this
        unpack assigns the first field (desc) to x and feeds it to
        json.loads — confirm the logs loaded here omit the desc prefix,
        otherwise this raises on the first line.
        """
        configs = []
        file_path = os.path.realpath(input_file)
        if os.path.isfile(file_path):
            with open(file_path, "r") as f:
                for line in f:
                    x, y, _ = line.split('|')
                    configs.append((self._space.input_type(**json.loads(x)), np.float64(y)))
        return configs
    def tune(self, least_try_times: int, output_file: str = None):
        """grid search all configs

        Runs batches sequentially until least_try_times measurements are
        collected or the space is exhausted, exporting results as it goes.
        Returns the run times of the last batch.
        NOTE(review): if the space is exhausted before the first batch,
        `run_times` is unbound at the return — confirm callers never hit this.
        """
        i = 0
        pbar = tqdm(total=least_try_times)
        while i < least_try_times:
            if not self._space.has_next():
                break
            configs = self.next_config(min(self._n_parallel, least_try_times - i))
            run_times = self._runner.run(configs, self._best_time)
            results = []
            for idx, conf in enumerate(configs):
                results.append((conf.input_id, run_times[idx]))
                # keep best config
                if self.best_time > run_times[idx]:
                    self._best_time = run_times[idx]
                    self._best_iter = i + idx
                    self._best_config = conf
            i += len(results)
            pbar.update(len(results))
            # update
            for res in results:
                self._xs.append(res[0])
                self._ys.append(res[1])
            if output_file:
                configs = [(self._space.get(res[0]).input, res[1]) for res in results]
                self.export_configs(configs, output_file)
        return run_times
class ModelBasedTuner(Tuner):
    """Model based tuner

    This tuner fits a cost model on measured samples and uses a simulated
    annealing optimizer to propose the cost model's maxima as next trials.

    Parameters
    ----------
    plan_size: int
        Tuner will re-fit model per `plan_size` new measure samples
    pre_model: CostModel
        The cost model that predicts the speed of a config (IR)
    """
    def __init__(self, runner, index_table, config_space, n_parallel=1, plan_size=32, pre_model=None):
        super(ModelBasedTuner, self).__init__(runner, index_table, config_space, n_parallel)
        self.__plan_size = plan_size
        if pre_model is not None:
            # reuse a pre-trained cost model, re-bound to this config space
            self.__cost_model = pre_model
            self.__cost_model.reset_space(self._space)
        else:
            self.__cost_model = XgbCostModel(self._space)
        self.__model_optimizer = SimulatedAnnealingOptimizer(self._space)
        self.__train_ct = 0  # how many times the cost model has been (re)fit
        self.__is_auto_set_dim = False  # when True, the first batch measures the untuned baseline
        # time to leave
        self.__ttl = None
        self.__least_try_times = None
        self.__early_stopping = None
        self.__model_run_time = 0.0  # seconds spent fitting/querying the model
    def info(self):
        """Print the base summary plus the model running time."""
        super(ModelBasedTuner, self).info()
        print('model run time:', self.__model_run_time, 'secs')
    def model_res(self):
        """Fit the cost model on observed samples and refresh the trial plan."""
        self.__cost_model.fit(self._xs, self._ys, self.__plan_size)
        best_configs = self.__model_optimizer.find_best(
            self.__cost_model, self.__plan_size, self._visited)
        self._trials = best_configs
    def tune(self, least_try_times: int, output_file: str = None):
        """Run the model-guided tuning loop.

        Measures batches, records (index, time) samples, periodically
        re-fits the cost model (every plan_size new samples), and stops on
        early-stopping, a 7200 s wall-clock limit, or space exhaustion.
        """
        early_stopping = least_try_times
        self.__least_try_times = least_try_times
        self.__early_stopping = early_stopping
        logger.setLevel(logging.DEBUG)
        # NOTE(review): old_level is captured AFTER setLevel(DEBUG), so the
        # "restore" at the loop bottom always restores DEBUG — confirm intended.
        old_level = logger.level
        i = 0
        # NOTE(review): error_ct is checked below but never incremented.
        error_ct = 0
        tuning_start = time.time()
        # keep going past least_try_times (up to 3x) while no config has
        # beaten the original time by at least 0.9
        while (i < self._space.length and (i < least_try_times
                                           or (self._best_time > self._original_time - 0.9
                                               and i < least_try_times * 3))):
            if not self._space.has_next():
                break
            iter_start = time.time()
            if not self.__is_auto_set_dim:
                configs = self.next_batch(min(self._n_parallel, self._space.length - i))
            else:
                # baseline pass: preview configs without marking them visited
                configs = self.next_batch(min(self._n_parallel, self._space.length - i), False)
            logger.debug('--indexes: %s', str([x.input_id for x in configs]))
            run_times = self._runner.run(configs, self._best_time, self.__is_auto_set_dim)
            if self.__is_auto_set_dim:
                # record the untuned average as the baseline, then restart tuning state
                from operator import add
                from functools import reduce
                self._original_time = reduce(add, run_times) / len(run_times)
                self._best_time = self._original_time
                self._best_iter = -1
                self._best_config = None
                run_times = None
                self.__is_auto_set_dim = False
                continue
            results = []
            for idx, conf in enumerate(configs):
                # run time -1 marks a failed measurement; drop it
                if run_times[idx] == -1:
                    continue
                results.append((conf.input_id, run_times[idx]))
                # keep best config
                if self._best_time > run_times[idx]:
                    self._best_time = run_times[idx]
                    self._best_iter = i + idx
                    self._best_config = conf
            i += len(results)
            # remaining budget before early stopping would trigger
            self.__ttl = min(early_stopping + self.best_iter, self._space.length) - i
            start = time.time()
            # update
            for res in results:
                self._xs.append(res[0])
                self._ys.append(res[1])
            if output_file:
                configs = [(self._space.get(res[0]).input, res[1]) for res in results]
                desc = str(self._runner.op_desc)
                self.export_configs(configs, output_file, desc=desc)
            # if we have enough new training samples
            if len(self._xs) >= self.__plan_size * (self.__train_ct + 1):
                # NOTE(review): model_res mutates self._trials inside the child
                # process; with process isolation (fork/spawn) those updates are
                # not visible in this parent process — confirm this is intended
                # (e.g. only for memory isolation) or call model_res directly.
                p = Process(target=self.model_res)
                p.start()
                p.join()
                self._trial_pt = 0
                self.__train_ct += 1
            end = time.time()
            logger.debug('model running time: %f seconds', end - start)
            self.__model_run_time += end - start
            iter_end = time.time()
            logger.debug('iter time: %f seconds', iter_end - iter_start)
            if self._best_iter > 0 and i >= self.best_iter + early_stopping:
                logger.debug('Early stopped. Best iter: %d', self._best_iter)
                return
            print("tuning time already, ", time.time() - tuning_start)
            if time.time() - tuning_start > 7200:
                logger.debug('Early stopped because of too long time. Best iter: %d', self._best_iter)
                return
            if error_ct > 150:
                logging.warning('Too many errors happen in the tuning. Now is in debug mode')
                logger.setLevel(logging.DEBUG)
            else:
                logger.setLevel(old_level)
        self._tuning_time += time.time() - tuning_start
| @@ -0,0 +1,9 @@ | |||
| { | |||
| "enable_atomic_add": { | |||
| "dtype": "bool", | |||
| "options": [ | |||
| "False", | |||
| "True" | |||
| ] | |||
| } | |||
| } | |||
| @@ -0,0 +1,155 @@ | |||
| from collections import namedtuple | |||
| import os | |||
| import logging | |||
def get_block_str_from_config(config: namedtuple):
    """Return the block dims present in *config* as a space-separated
    string (trailing space kept); fields are taken in x, y, z order."""
    parts = ""
    for field in ("block_x", "block_y", "block_z"):
        if field in getattr(config, "_fields"):
            parts += str(getattr(config, field)) + " "
    return parts
def get_thread_str_from_config(config: namedtuple):
    """Return the thread dims present in *config* as a space-separated
    string (trailing space kept); fields are taken in x, y, z order."""
    parts = ""
    for field in ("thread_x", "thread_y", "thread_z"):
        if field in getattr(config, "_fields"):
            parts += str(getattr(config, field)) + " "
    return parts
def get_parallel_build_num():
    """Get the number of parallel build processes from BUILD_PARALLEL_NUM.

    Returns 1 when the variable is unset, empty, or not a valid integer.

    Fixes: the original caught NameError, but int() on a malformed value
    raises ValueError (which previously escaped); the .lower() call on a
    numeric string and the duplicate environment lookup were removed.
    """
    raw = os.environ.get('BUILD_PARALLEL_NUM')
    if not raw:
        return 1
    try:
        return int(raw)
    except ValueError as e:
        logging.error(e)
        return 1
def get_available_gpu_num():
    """Get the list of usable GPU device ids from USE_GPU_DEVICES.

    Returns [0] when the variable is unset, empty, or malformed, so callers
    can always iterate the result.

    Fixes: the original caught NameError while int() raises ValueError
    (which previously escaped), and its error path returned the int 1 even
    though the success path returns a list.
    """
    raw = os.environ.get('USE_GPU_DEVICES')
    if not raw:
        return [0, ]
    try:
        return [int(dev_id) for dev_id in raw.split(",")]
    except ValueError as e:
        logging.error(e)
        return [0, ]
def get_real_attr(value, key, need_tune_json, need_tune_keys):
    """Map a tuning-space option index back to its concrete attribute value.

    Parameters
    ----------
    value:
        For tuned keys, an index into the key's "options" list; for other
        keys, the value itself (returned unchanged).
    key: str
        Attribute name being resolved.
    need_tune_json: dict
        Parsed tuning-attrs json ({key: {"dtype": ..., "options": [...]}}).
    need_tune_keys:
        The keys that are actually being tuned.

    Raises
    ------
    TypeError
        If the selected option does not match the declared dtype, or the
        dtype itself is unsupported (the original silently returned None
        for an unknown dtype).
    """
    if key not in need_tune_keys:
        return value
    dtype = need_tune_json[key]['dtype']
    option = need_tune_json[key]['options'][value]
    if dtype == "bool":
        # options are stored as the strings "True"/"False" in the json
        if option.lower() == "true":
            return True
        if option.lower() == "false":
            return False
        raise TypeError("Wrong boolean type, please check json file")
    if dtype == "str":
        if isinstance(option, str):
            return option
        raise TypeError("Wrong str type, please check json file")
    if dtype == "int":
        if isinstance(option, int):
            return option
        raise TypeError("Wrong int type, please check json file")
    raise TypeError("Unsupported dtype '%s' in tuning json, please check json file" % dtype)
def merge_attrs(attrs, config, need_tune_json):
    """Build a new attrs namedtuple by folding a sampled *config* into *attrs*.

    Encodes the config's tiling fields into a "dim" string, joins block_*/
    thread_* fields into bind strings, copies tuned keys from the config,
    and rebuilds an attrs of the same namedtuple type.

    NOTE(review): the final reconstruction assumes iteration order of
    attrs._asdict() matches the namedtuple's field order (true on Python 3,
    where _asdict preserves declaration order) — confirm no keys are added
    to d_attrs that are not attrs fields, or attrs_type(*config_list) will
    misalign or fail.
    """
    # tiling field values, in field-declaration order
    tiling = [getattr(config, name) for name in getattr(
        config, '_fields') if name.startswith('tiling')]
    dim_str = ''
    d_config = config._asdict()
    d_attrs = attrs._asdict()
    # 2D tiling is signalled by field names of the form tiling_<i>_<j>
    # (two underscores); 1D fields are tiling_<i>
    is_2d_tiling = False
    for name in getattr(config, '_fields'):
        if name.startswith('tiling'):
            if name.count("_") == 2:
                is_2d_tiling = True
            break
    for i, element in enumerate(tiling):
        if is_2d_tiling:
            # pairs of values share one "0 <axis>" prefix
            if i % 2 == 0:
                dim_str += "0 " + str(i//2) + " "
            dim_str += str(element) + " "
        else:
            # 1d tiling
            dim_str += "0 " + str(i) + " " + str(element) + " 1 "
    # add block, thread information
    block = [str(getattr(config, name)) for name in getattr(
        config, '_fields') if name.startswith('block')]
    bind_block_str = ' '.join(block)
    thread = [str(getattr(config, name)) for name in getattr(
        config, '_fields') if name.startswith('thread')]
    bind_thread_str = ' '.join(thread)
    d_attrs['dim'] = dim_str
    d_attrs['bind_block'] = bind_block_str
    d_attrs['bind_thread'] = bind_thread_str
    need_tune_keys = need_tune_json.keys()
    for key in need_tune_keys:
        d_attrs[key] = d_config[key]
    # make a new attrs with config info
    attrs_type = type(attrs)
    config_list = [get_real_attr(d_attrs[k],k,need_tune_json, need_tune_keys) for k in d_attrs]
    new_attrs = attrs_type(*config_list)
    return new_attrs
def get_skip_configs_from_log(skip_configs_log):
    """Collect the config strings recorded in *skip_configs_log*.

    Each line is expected to look like "desc | config | time"; the middle
    field is collected. An empty path yields an empty set.
    """
    skip_config_set = set()
    if skip_configs_log != "":
        with open(skip_configs_log, 'r') as log_file:
            for record in log_file:
                skip_config_set.add(record.split("|")[1].strip())
    print("SKIP CONFIGS NUMBER:", len(skip_config_set))
    return skip_config_set
def get_tuning_attrs_from_json(tuning_attrs_json):
    """Expand a tuning-attrs json description into option-index spaces.

    Returns (keys, need_tune_spaces, json_string): the attribute names in
    file order, the cartesian product of option indices (one inner list per
    combination), and the parsed json dict. An empty path yields
    ([], [[]], {}).
    """
    import json
    keys = []
    need_tune_spaces = [[]]
    json_string = dict()
    if tuning_attrs_json != "":
        with open(tuning_attrs_json, 'r') as file:
            json_string = json.load(file)
    for key in json_string.keys():
        keys.append(key)
        option_count = len(json_string[key]['options'])
        # extend every existing combination with each option index of this key
        need_tune_spaces = [prefix + [idx]
                            for prefix in need_tune_spaces
                            for idx in range(option_count)]
    return (keys, need_tune_spaces, json_string)
if __name__ == "__main__":
    # Smoke-test the helpers above.
    file_name = "tuning_attrs_descs/reduce_tuning_attrs_desc.json"
    # Fixed: get_tuning_attrs_from_json returns a 3-tuple
    # (keys, spaces, parsed json); the original 2-name unpack raised
    # ValueError before printing anything.
    keys, need_tune_spaces, json_string = get_tuning_attrs_from_json(file_name)
    print(keys)
    print(need_tune_spaces)
| @@ -0,0 +1,49 @@ | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| """operator description and config param definitions""" | |||
| from collections import namedtuple | |||
| # op desc for ascend | |||
# op desc for ascend
# Conv forward: feature-map/filter geometry plus optional bias.
ConvDesc = namedtuple("ConvDesc", [
    'fmap_shape', 'filter_shape', 'pad', 'stride', 'dilation', 'use_bias'])
# Shared by conv_backprop_input and conv_backprop_filter (no bias).
ConvBackpropDesc = namedtuple(
    "ConvBackpropDesc", ['fmap_shape', 'filter_shape', 'pad', 'stride', 'dilation'])
# Cube matmul: operand shapes, data formats, transpose flags and dtypes.
MatmulCubeDesc = namedtuple("MatmulCubeDesc", ["x_shape", "y_shape", "bias", "left_format", "right_format",
                                               "out_format", "adj_x", "adj_y", "dtype", "bias_dtype", "out_dtype"])
# op desc for gpu
# GPU reduce: input/axis description plus schedule knobs (dim/bind strings,
# reduce-lib and atomic-add switches).
ReduceGpuDesc = namedtuple("ReduceGpuDesc", [
    "in_shape", "in_dtype", "axis", "keepdims",
    "poly_sch", "dim", "bind_block", "bind_thread",
    "enable_akg_reduce_lib", "enable_atomic_add"])
# config param definitions for ascend
ConvConfig = namedtuple('ConvConfig', [
    'tile_h', 'tile_co', 'tile_m', 'tile_k', 'tile_n', 'tile_w', 'bypass'])
ConvBackpropInputConfig = namedtuple('ConvBackpropInputConfig',
                                     ['tile_h', 'tile_co', 'tile_m', 'tile_k', 'tile_n', 'tile_w'])
ConvBackpropFilterConfig = namedtuple('ConvBackpropFilterConfig',
                                      ['tile_ci', 'tile_kh', 'tile_kw', 'tile_co', 'tile_batch',
                                       'tile_h', 'tile_w', 'tile_m', 'tile_k', 'tile_n'])
MatmulCubeConfig = namedtuple(
    'MatmulCubeConfig', ['n_l1', 'n_l0', 'm_l1', 'm_l0', 'k_l1', 'k_l0', 'bypass'])
# config param definitions for gpu
# Placeholder config with no tunable fields.
EmptyConfig = namedtuple('empty', [])
| @@ -0,0 +1,16 @@ | |||
| # how many multi-processing to build | |||
| export BUILD_PARALLEL_NUM=4 | |||
# set the default GPU devices; please do not change this line
| export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 | |||
| # set the real devices you want to use | |||
| export USE_GPU_DEVICES=0,1,2,3 | |||
| export RUNTIME_MODE=gpu | |||
| export PROFILING_MODE=true | |||
| # ascend config | |||
| export DEVICE_ID=0 | |||
| export DEVICE_TOTAL_NUM=8 | |||
| @@ -0,0 +1,67 @@ | |||
| # Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| """test""" | |||
| import time | |||
| from autotuning.job import launch | |||
| from akg.utils import kernel_exec | |||
| from akg.ops.math_gpu import reduce_sum | |||
| from autotuning.type_definitions import ReduceGpuDesc | |||
| import numpy as np | |||
| import sys | |||
| import argparse | |||
| from autotuning.tuning_utils import get_skip_configs_from_log, get_tuning_attrs_from_json | |||
def reduce_sum_gpu_execute(in_shape, dtype, axis=None, keepdims=False, attrs=False):
    """Build the reduce_sum GPU kernel module for the given shape/dtype.

    Fixes two NameErrors in the original: the build helper module is
    imported as `kernel_exec` (not `utils`), and the dtype parameter is
    named `dtype` (the body referenced an undefined `in_dtype`).
    """
    mod = kernel_exec.op_build_test(reduce_sum, (in_shape, ), (dtype, ),
                                    kernel_name="reduce_sum_gpu", op_attrs=[axis, keepdims],
                                    attrs={"target": "cuda", "enable_akg_reduce_lib": True})
    return mod
def run_test_reduce_sum(in_shape, in_dtype, axis=None, keepdims=False, skip_config_set=None, tuning_attrs_info=None):
    """Launch a tuning job for reduce_sum on GPU and report total time."""
    time_start = time.time()
    op_type_ = 'reduce_sum_gpu'
    debug_mode_ = True
    save_res_ = True
    all_space_ = True
    # NOTE(review): ReduceGpuDesc fields are (in_shape, in_dtype, axis,
    # keepdims, poly_sch, dim, bind_block, bind_thread,
    # enable_akg_reduce_lib, enable_atomic_add); positionally this assigns
    # poly_sch="" and bind_thread=True — confirm the alignment is intended
    # (poly_sch looks like it should be the boolean and bind_thread the
    # empty string).
    op_config = [in_shape, in_dtype, axis, keepdims,
                 "", "", "",
                 True, True, True]
    op_config = ReduceGpuDesc(*op_config)
    desc_ = ('reduce_sum_gpu', reduce_sum_gpu_execute,
             op_config, tuning_attrs_info)
    launch(op_type=op_type_, debug_mode=debug_mode_,
           save_res=save_res_, desc=desc_, all_space=all_space_,
           from_json=False, skip_config_set=skip_config_set,
           tuning_attrs_info=tuning_attrs_info)
    time_end = time.time()
    print("total tuning time: ", time_end - time_start)
if __name__ == "__main__":
    # Command-line options: configs to skip and extra attributes to tune.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--skip_configs_log", type=str,
                            default="", help="skip those configs in .log file")
    arg_parser.add_argument("--tuning_attrs_json", type=str, default="",
                            help="the json file to describe the tuning atttrs")
    cli_args = arg_parser.parse_args()
    # configs already recorded in the log are excluded from the search space
    skip_config_set = get_skip_configs_from_log(cli_args.skip_configs_log)
    # extra tunable attributes described by the json file
    tuning_attrs_info = get_tuning_attrs_from_json(cli_args.tuning_attrs_json)
    run_test_reduce_sum((1024, 1024), "float32", (1,),
                        False, skip_config_set=skip_config_set, tuning_attrs_info=tuning_attrs_info)
| @@ -25,7 +25,7 @@ else | |||
| TVM_ROOT="${AKG_DIR}/third_party/incubator-tvm" | |||
| export LD_LIBRARY_PATH=${AKG_BUILD_DIR}:${LD_LIBRARY_PATH} | |||
| export PYTHONPATH=${TVM_ROOT}/python:${TVM_ROOT}/topi:${TVM_ROOT}/topi/python:${AKG_DIR}:${AKG_DIR}/python:${PYTHONPATH} | |||
| export PYTHONPATH=${TVM_ROOT}/python:${TVM_ROOT}/topi:${TVM_ROOT}/topi/python:${AKG_DIR}:${AKG_DIR}/tests/common:${AKG_DIR}/python:${AKG_DIR}/tests/operators/gpu:${AKG_DIR}/tests/fuzz/tune_for_gpu:${PYTHONPATH} | |||
| if [ $# -eq 1 ] && [ $1 = "gpu" ]; then | |||
| export LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:${LD_LIBRARY_PATH} | |||
| fi | |||