Browse Source

unify cpu thread pool

tags/v1.6.0
fanjibin 4 years ago
parent
commit
206e7b9a1e
14 changed files with 44 additions and 34 deletions
  1. +20
    -12
      mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.cc
  2. +2
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.h
  3. +1
    -1
      mindspore/ccsrc/backend/kernel_compiler/cpu/layer_norm_cpu_kernel.cc
  4. +2
    -2
      mindspore/ccsrc/backend/kernel_compiler/cpu/layer_norm_grad_cpu_kernel.cc
  5. +1
    -1
      mindspore/ccsrc/backend/kernel_compiler/cpu/rank_cpu_kernel.cc
  6. +1
    -1
      mindspore/ccsrc/backend/kernel_compiler/cpu/rolling_cpu_kernel.cc
  7. +1
    -1
      mindspore/ccsrc/backend/kernel_compiler/cpu/scatter_nd_update_cpu_kernel.cc
  8. +1
    -1
      mindspore/ccsrc/backend/kernel_compiler/cpu/shift_cpu_kernel.cc
  9. +1
    -1
      mindspore/ccsrc/backend/kernel_compiler/cpu/sort_cpu_kernel.cc
  10. +4
    -4
      mindspore/ccsrc/backend/kernel_compiler/cpu/sparse_optimizer_cpu_kernel.h
  11. +1
    -1
      mindspore/ccsrc/backend/kernel_compiler/cpu/stridedslice_cpu_kernel.cc
  12. +1
    -1
      mindspore/ccsrc/backend/kernel_compiler/cpu/topk_cpu_kernel.cc
  13. +4
    -4
      mindspore/ccsrc/backend/kernel_compiler/cpu/unique_cpu_kernel.h
  14. +4
    -4
      mindspore/ccsrc/runtime/framework/actor/actor_common.h

+ 20
- 12
mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.cc View File

@@ -20,8 +20,8 @@
#include <utility>
#include <cmath>

#include "common/thread_pool.h"
#include "utils/profile.h"
#include "runtime/framework/actor/actor_common.h"

namespace mindspore {
namespace kernel {
@@ -167,17 +167,10 @@ ActorThreadPool *GetActorMgrInnerThreadPool() {
auto thread_pool = actor_manager->GetActorThreadPool();
// Init thread_pool if env is windows or ascend, in case that it won't be init in graph_scheduler.
if (thread_pool == nullptr) {
const size_t kMaxThreadNum = 23;
size_t max_thread_num = std::thread::hardware_concurrency() - 1;
#if ENABLE_D || ENABLE_GPU
const size_t kDeviceNum = 8;
max_thread_num /= kDeviceNum;
#endif
if (max_thread_num < 1) {
max_thread_num = 1;
}
max_thread_num = max_thread_num < kMaxThreadNum ? max_thread_num : kMaxThreadNum;
(void)actor_manager->Initialize(true, 0, max_thread_num);
size_t actor_thread_num = 0;
size_t actor_and_kernel_thread_num = 0;
runtime::ComputeThreadNums(&actor_thread_num, &actor_and_kernel_thread_num);
(void)actor_manager->Initialize(true, actor_thread_num, actor_and_kernel_thread_num);
thread_pool = actor_manager->GetActorThreadPool();
MS_EXCEPTION_IF_NULL(thread_pool);
}
@@ -207,6 +200,21 @@ void ParallelLaunch(const CTask &task, size_t count, float block_size, Content c
(void)thread_pool->ParallelLaunch(func, content, task_num);
}

void ParallelLaunch(const std::vector<common::Task> &tasks, Content content) {
auto thread_pool = GetActorMgrInnerThreadPool();
size_t kernel_thread_num = thread_pool->GetKernelThreadNum();
if (kernel_thread_num == 0) {
MS_LOG(EXCEPTION) << "Actor inner pool has been init, but kernel thread is 0!";
}

size_t task_num = tasks.size();
auto func = [&](void *, int task_id, float, float) {
tasks[task_id]();
return common::SUCCESS;
};
(void)thread_pool->ParallelLaunch(func, content, task_num);
}

void ParallelLaunchAutoSearch(const CTask &task, size_t count, Content content,
ParallelSearchInfo *parallel_search_info) {
const size_t MAX_POW = 6;


+ 2
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.h View File

@@ -30,6 +30,7 @@
#include "ir/anf.h"
#include "runtime/framework/graph_scheduler.h"
#include "actor/actormgr.h"
#include "common/thread_pool.h"
#if defined(__x86_64__) || defined(__amd64__) || defined(_M_IX86) || defined(_M_X64)
#define PLATFORM_86
#endif
@@ -224,6 +225,7 @@ class TransposeIterator {

ActorThreadPool *GetActorMgrInnerThreadPool();
void ParallelLaunch(const CTask &task, size_t count, float block_size = 128.0, Content content = nullptr);
void ParallelLaunch(const std::vector<common::Task> &tasks, Content content = nullptr);
void ParallelLaunchAutoSearch(const CTask &task, size_t count, Content content,
ParallelSearchInfo *parallel_search_info);



+ 1
- 1
mindspore/ccsrc/backend/kernel_compiler/cpu/layer_norm_cpu_kernel.cc View File

@@ -120,7 +120,7 @@ void LayerNormCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, con
};
(void)tasks.emplace_back(block);
}
(void)common::ThreadPool::GetInstance().SyncRun(tasks);
ParallelLaunch(tasks);
}
} // namespace kernel
} // namespace mindspore

+ 2
- 2
mindspore/ccsrc/backend/kernel_compiler/cpu/layer_norm_grad_cpu_kernel.cc View File

@@ -144,7 +144,7 @@ void LayerNormGradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
};
tasks1.emplace_back(block);
}
(void)common::ThreadPool::GetInstance().SyncRun(tasks1);
ParallelLaunch(tasks1);
for (size_t i = 0; i < thread_num2; ++i) {
auto block = [&, i]() {
task2(i);
@@ -152,7 +152,7 @@ void LayerNormGradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
};
tasks2.emplace_back(block);
}
(void)common::ThreadPool::GetInstance().SyncRun(tasks2);
ParallelLaunch(tasks2);
}
} // namespace kernel
} // namespace mindspore

+ 1
- 1
mindspore/ccsrc/backend/kernel_compiler/cpu/rank_cpu_kernel.cc View File

@@ -299,7 +299,7 @@ bool RankCpuKernel<T>::Launch(const std::vector<AddressPtr> &inputs, const std::
});
}
}
(void)common::ThreadPool::GetInstance().SyncRun(tasks);
ParallelLaunch(tasks);
return true;
}
} // namespace kernel


+ 1
- 1
mindspore/ccsrc/backend/kernel_compiler/cpu/rolling_cpu_kernel.cc View File

@@ -201,7 +201,7 @@ bool RollingCpuKernel<T, S>::Launch(const std::vector<AddressPtr> &inputs, const
});
}
}
(void)common::ThreadPool::GetInstance().SyncRun(tasks);
ParallelLaunch(tasks);
return true;
}
} // namespace kernel


+ 1
- 1
mindspore/ccsrc/backend/kernel_compiler/cpu/scatter_nd_update_cpu_kernel.cc View File

@@ -168,7 +168,7 @@ void ScatterNdUpdateCPUKernel::LaunchKernel(const std::vector<AddressPtr> &input
(void)tasks.emplace_back(task);
start += once_compute_size;
}
(void)common::ThreadPool::GetInstance().SyncRun(tasks);
ParallelLaunch(tasks);
}
} // namespace kernel
} // namespace mindspore

+ 1
- 1
mindspore/ccsrc/backend/kernel_compiler/cpu/shift_cpu_kernel.cc View File

@@ -133,7 +133,7 @@ bool ShiftCpuKernel<T>::Launch(const std::vector<AddressPtr> &inputs, const std:
return common::SUCCESS;
});
}
(void)common::ThreadPool::GetInstance().SyncRun(tasks);
ParallelLaunch(tasks);
return true;
}
} // namespace kernel


+ 1
- 1
mindspore/ccsrc/backend/kernel_compiler/cpu/sort_cpu_kernel.cc View File

@@ -113,7 +113,7 @@ bool SortCpuKernel<T>::Launch(const std::vector<AddressPtr> &inputs, const std::
(void)tasks.emplace_back(task);
}
}
(void)common::ThreadPool::GetInstance().SyncRun(tasks);
ParallelLaunch(tasks);
return true;
}
} // namespace kernel


+ 4
- 4
mindspore/ccsrc/backend/kernel_compiler/cpu/sparse_optimizer_cpu_kernel.h View File

@@ -135,7 +135,7 @@ class SparseOptimizerCPUKernel : public CPUKernel {
(void)tasks.emplace_back(task);
start += once_compute_size;
}
(void)common::ThreadPool::GetInstance().SyncRun(tasks);
ParallelLaunch(tasks);
}

private:
@@ -197,7 +197,7 @@ class SparseOptimizerCPUKernel : public CPUKernel {
(void)tasks.emplace_back(task);
current_indices_offset += indices_size;
}
(void)common::ThreadPool::GetInstance().SyncRun(tasks);
ParallelLaunch(tasks);
}

template <typename T>
@@ -279,7 +279,7 @@ class SparseOptimizerCPUKernel : public CPUKernel {
(void)tasks.emplace_back(task);
current_indices_offset += segments[i]->indices_size_;
}
(void)common::ThreadPool::GetInstance().SyncRun(tasks);
ParallelLaunch(tasks);
}

template <typename T>
@@ -407,7 +407,7 @@ class SparseOptimizerCPUKernel : public CPUKernel {
(void)tasks.emplace_back(task);
current_indices_offset += buckets[i]->indices_size_;
}
(void)common::ThreadPool::GetInstance().SyncRun(tasks);
ParallelLaunch(tasks);
}

template <typename T>


+ 1
- 1
mindspore/ccsrc/backend/kernel_compiler/cpu/stridedslice_cpu_kernel.cc View File

@@ -221,7 +221,7 @@ void StridedSliceCPUKernel::ParallelRun(const uint8_t *input_addr, uint8_t *outp
std::bind(execute_func, this, input_addr, output_addr, thread_index * cal_num_per_thread_));
thread_index++;
}
(void)common::ThreadPool::GetInstance().SyncRun(tasks);
ParallelLaunch(tasks);
}

bool StridedSliceCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,


+ 1
- 1
mindspore/ccsrc/backend/kernel_compiler/cpu/topk_cpu_kernel.cc View File

@@ -86,7 +86,7 @@ void TopKCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const st
return common::SUCCESS;
});
}
(void)common::ThreadPool::GetInstance().SyncRun(tasks);
ParallelLaunch(tasks);
}

void TopKCPUKernel::InitKernel(const CNodePtr &kernel_node) {


+ 4
- 4
mindspore/ccsrc/backend/kernel_compiler/cpu/unique_cpu_kernel.h View File

@@ -128,7 +128,7 @@ class UniqueCPUKernel : public CPUKernel {
(void)tasks.emplace_back(task);
current_offset += data_size;
}
(void)common::ThreadPool::GetInstance().SyncRun(tasks);
ParallelLaunch(tasks);
}

template <typename DataType, typename IndexType>
@@ -231,7 +231,7 @@ class UniqueCPUKernel : public CPUKernel {
(void)tasks.emplace_back(task);
current_offset += segments[i]->input_size_;
}
(void)common::ThreadPool::GetInstance().SyncRun(tasks);
ParallelLaunch(tasks);
MS_LOG(DEBUG) << "End";
}

@@ -303,7 +303,7 @@ class UniqueCPUKernel : public CPUKernel {
};
(void)tasks.emplace_back(task);
}
(void)common::ThreadPool::GetInstance().SyncRun(tasks);
ParallelLaunch(tasks);
MS_LOG(DEBUG) << "End";
}

@@ -359,7 +359,7 @@ class UniqueCPUKernel : public CPUKernel {
};
(void)tasks.emplace_back(task);
}
(void)common::ThreadPool::GetInstance().SyncRun(tasks);
ParallelLaunch(tasks);
MS_LOG(DEBUG) << "End";
}



+ 4
- 4
mindspore/ccsrc/runtime/framework/actor/actor_common.h View File

@@ -150,10 +150,10 @@ class ActorDispatcher {

// The first five executions are for warm-up, the next five executions are statistics of multi thread execution time,
// and the next next five executions are statistics of single thread execution time.
static constexpr size_t kMultiThreadExecutionCountBegin{31};
static constexpr size_t kMultiThreadExecutionCountEnd{40};
static constexpr size_t kSingleThreadExecutionCountBegin{41};
static constexpr size_t kSingleThreadExecutionCountEnd{50};
static constexpr size_t kMultiThreadExecutionCountBegin{21};
static constexpr size_t kMultiThreadExecutionCountEnd{30};
static constexpr size_t kSingleThreadExecutionCountBegin{31};
static constexpr size_t kSingleThreadExecutionCountEnd{40};
// The single thread execution constraint.
static constexpr size_t kSingleThreadExecutionActorMaxNum{100};



Loading…
Cancel
Save