enhance mapping: drop axes that cannot be mapped to block before block mapping

5 years ago · c21f95afb4
--- a/src/poly/tiling/tiling_strategy_manager_gpu.cc
+++ b/src/poly/tiling/tiling_strategy_manager_gpu.cc
@@ -376,7 +376,7 @@ void ReduceStrategy::DealWithPostReduceTensors() {
  }
 }

 int GpuStrategy::GetLocalAllocBufCount () {
 int GpuStrategy::GetLocalAllocBufCount() {
  int count = 0;
  for (auto &it : analyzer_->buf_info_) {
    auto buf = it.second.get();
@@ -526,7 +526,7 @@ void GpuStrategy::AddGpuConstraint() {
      axis->TileRestrainToSingleValue(axis->c1_constraints.tile_min_, TileLevel::CACHE0);
    });
  }
  // TODO: This is a very naive strategy to avoid cuda launch out of resources 
  // TODO: This is a very naive strategy to avoid cuda launch out of resources
  //       and we should fix this in register memory promotion pass.
  if (template_ != Template::REDUCTION && template_ != Template::ALL_REDUCE) {
    auto local_buf_count = GetLocalAllocBufCount();
@@ -639,6 +639,12 @@ void GpuStrategy::InnerThreadOuterBlock() {

  // tile from inner to outer and map to thread
  analyzer_->GetTileLogger().AppendLine(GPU_MAPPING, "-----Map to thread-----");
  ss << "[Thread Limit]: ";
  for (auto l : thread_limit_) {
    ss << l << ", ";
  }
  analyzer_->GetTileLogger().AppendLog(GPU_MAPPING, ss);

  size_t ori_size = pending_axes_.size();
  size_t inner_dim = 0;
  for (size_t i = 0; i < ori_size; ++i) {
@@ -746,20 +752,30 @@ void GpuStrategy::InnerThreadOuterBlock() {
    }
  } else {
    for (size_t i = pending_axes_.size() - 1; i >= ori_size; --i) {
      if (pending_axes_[i].second <= 1 && indexing.size() == block_limit_.size()) {
        continue;
      }
      indexing.emplace_back(i);
    }
  }

  // map outer band to block according to predefined indice
  analyzer_->GetTileLogger().AppendLine(GPU_MAPPING, "-----Map to block-----");
  ss << "[Block Limit]: ";
  for (auto l : block_limit_) {
    ss << l << ", ";
  }
  analyzer_->GetTileLogger().AppendLog(GPU_MAPPING, ss);

  for (const auto &i : indexing) {
    TileAxis *axis;
    int64_t shape;
    std::tie(axis, shape) = pending_axes_[i];
    auto idx = depth_ - 1 - (pending_axes_.size() - 1 - i);
    idx = reverse_binding_ ? std::min(depth_, block_limit_.size()) - 1 - idx : idx;
    auto rest_blocks = std::min(block_limit_[idx], axis->block_constraints.map_extent_);
    ss << "axis " << axis->index << "_" << axis->dim_axis << " shape = " << shape << ", rest blocks = " << rest_blocks;
    auto rest_blocks = idx < block_limit_.size() ? std::min(block_limit_[idx], axis->block_constraints.map_extent_) : 1;
    ss << "axis " << axis->index << "_" << axis->dim_axis << " shape = " << shape << ", block_idx = " << idx
       << ", rest blocks = " << rest_blocks;
    if (block_count_ >= static_cast<int>(block_dim)) {
      ss << "-> No mapping.";
      analyzer_->GetTileLogger().AppendLog(GPU_MAPPING, ss);
--- a/third_party/incubator-tvm/src/runtime/cuda/cuda_module.cc
+++ b/third_party/incubator-tvm/src/runtime/cuda/cuda_module.cc
@@ -109,7 +109,7 @@ class CUDAModuleNode : public runtime::ModuleNode {
      options[0] = CU_JIT_MAX_REGISTERS;
      void* values[1];
      long register_nums =
          MAX_REGISTER_PER_THREAD_BLOCK / wl.block_dim(0) * wl.block_dim(1) * wl.block_dim(2);
          MAX_REGISTER_PER_THREAD_BLOCK / (wl.block_dim(0) * wl.block_dim(1) * wl.block_dim(2));
      values[0] = (void*)register_nums;
      CUDA_DRIVER_CALL(
          cuModuleLoadDataEx(&(module_[device_id]), data_.c_str(), 1, options, values));