| @@ -24,7 +24,11 @@ | |||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace kernel { | namespace kernel { | ||||
| constexpr size_t kAdamDeltaInputSize = 9; | constexpr size_t kAdamDeltaInputSize = 9; | ||||
| #ifdef ENABLE_D | |||||
| constexpr size_t kUsedThreadNum = 23; | constexpr size_t kUsedThreadNum = 23; | ||||
| #else | |||||
| constexpr size_t kUsedThreadNum = 8; | |||||
| #endif | |||||
| namespace { | namespace { | ||||
| struct ComputeParam { | struct ComputeParam { | ||||
| float *delta_{nullptr}; | float *delta_{nullptr}; | ||||
| @@ -22,6 +22,11 @@ | |||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace kernel { | namespace kernel { | ||||
| namespace { | namespace { | ||||
| #ifdef ENABLE_D | |||||
| constexpr size_t kUsedThreadNum = 23; | |||||
| #else | |||||
| constexpr size_t kUsedThreadNum = 8; | |||||
| #endif | |||||
| template <typename T> | template <typename T> | ||||
| void LookUpTableTask(const float *input_addr, const T *indices_addr, float *output_addr, size_t indices_lens, | void LookUpTableTask(const float *input_addr, const T *indices_addr, float *output_addr, size_t indices_lens, | ||||
| size_t outer_dim_size, T offset, size_t first_dim_size) { | size_t outer_dim_size, T offset, size_t first_dim_size) { | ||||
| @@ -92,10 +97,9 @@ void EmbeddingLookUpCPUKernel::LaunchKernel(const std::vector<kernel::AddressPtr | |||||
| auto input_addr = reinterpret_cast<float *>(inputs[0]->addr); | auto input_addr = reinterpret_cast<float *>(inputs[0]->addr); | ||||
| auto indices_addr = reinterpret_cast<T *>(inputs[1]->addr); | auto indices_addr = reinterpret_cast<T *>(inputs[1]->addr); | ||||
| auto output_addr = reinterpret_cast<float *>(outputs[0]->addr); | auto output_addr = reinterpret_cast<float *>(outputs[0]->addr); | ||||
| const size_t kMaxThreadNum = 16; | |||||
| size_t thread_num = indices_lens_ / 10000 + 1; | size_t thread_num = indices_lens_ / 10000 + 1; | ||||
| thread_num = thread_num > kMaxThreadNum ? kMaxThreadNum : thread_num; | |||||
| std::thread threads[kMaxThreadNum]; | |||||
| thread_num = thread_num > kUsedThreadNum ? kUsedThreadNum : thread_num; | |||||
| std::thread threads[kUsedThreadNum]; | |||||
| size_t task_proc_lens = (indices_lens_ + thread_num - 1) / thread_num; | size_t task_proc_lens = (indices_lens_ + thread_num - 1) / thread_num; | ||||
| size_t i; | size_t i; | ||||
| size_t task_offset = 0; | size_t task_offset = 0; | ||||
| @@ -22,6 +22,11 @@ | |||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace kernel { | namespace kernel { | ||||
| namespace { | namespace { | ||||
| #ifdef ENABLE_D | |||||
| constexpr size_t kUsedThreadNum = 23; | |||||
| #else | |||||
| constexpr size_t kUsedThreadNum = 8; | |||||
| #endif | |||||
| template <typename T> | template <typename T> | ||||
| void Compute(const ComputeParams<T> *params, const size_t start, const size_t end) { | void Compute(const ComputeParams<T> *params, const size_t start, const size_t end) { | ||||
| MS_EXCEPTION_IF_NULL(params); | MS_EXCEPTION_IF_NULL(params); | ||||
| @@ -115,10 +120,9 @@ void ScatterNdUpdateCPUKernel::LaunchKernel(const std::vector<AddressPtr> &input | |||||
| params.indices_unit_rank_ = indices_unit_rank_; | params.indices_unit_rank_ = indices_unit_rank_; | ||||
| params.out_strides_ = &out_strides_; | params.out_strides_ = &out_strides_; | ||||
| const size_t thread_num = 24; | |||||
| std::vector<Task> tasks; | std::vector<Task> tasks; | ||||
| size_t start = 0; | size_t start = 0; | ||||
| size_t once_compute_size = (num_units_ + thread_num - 1) / thread_num; | |||||
| size_t once_compute_size = (num_units_ + kUsedThreadNum - 1) / kUsedThreadNum; | |||||
| while (start < num_units_) { | while (start < num_units_) { | ||||
| size_t end = (start + once_compute_size) > num_units_ ? num_units_ : (start + once_compute_size); | size_t end = (start + once_compute_size) > num_units_ ? num_units_ : (start + once_compute_size); | ||||
| auto task = [¶ms, start, end]() -> int { | auto task = [¶ms, start, end]() -> int { | ||||
| @@ -27,6 +27,11 @@ | |||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace kernel { | namespace kernel { | ||||
| #ifdef ENABLE_D | |||||
| constexpr size_t kUsedThreadNum = 23; | |||||
| #else | |||||
| constexpr size_t kUsedThreadNum = 8; | |||||
| #endif | |||||
| template <typename T> | template <typename T> | ||||
| struct SparseGradient { | struct SparseGradient { | ||||
| float *value_{nullptr}; | float *value_{nullptr}; | ||||
| @@ -95,7 +100,7 @@ class SparseOptimizerCPUKernel : public CPUKernel { | |||||
| static void BucketReduceSparseGradient(const ReduceSparseGradientParam<T> ¶m) { | static void BucketReduceSparseGradient(const ReduceSparseGradientParam<T> ¶m) { | ||||
| MS_LOG(DEBUG) << "Start"; | MS_LOG(DEBUG) << "Start"; | ||||
| MS_EXCEPTION_IF_NULL(param.input_grad_); | MS_EXCEPTION_IF_NULL(param.input_grad_); | ||||
| size_t thread_num = 23; | |||||
| size_t thread_num = kUsedThreadNum; | |||||
| if (param.input_grad_->indices_size_ < thread_num) { | if (param.input_grad_->indices_size_ < thread_num) { | ||||
| thread_num = param.input_grad_->indices_size_; | thread_num = param.input_grad_->indices_size_; | ||||
| } | } | ||||
| @@ -120,11 +125,10 @@ class SparseOptimizerCPUKernel : public CPUKernel { | |||||
| template <typename T> | template <typename T> | ||||
| void MultiThreadCompute(const MultiThreadComputeFunc<T> &func, MultiThreadComputeParams<T> *params, | void MultiThreadCompute(const MultiThreadComputeFunc<T> &func, MultiThreadComputeParams<T> *params, | ||||
| size_t total_compute_size) const { | size_t total_compute_size) const { | ||||
| const size_t kThreadNum = 24; | |||||
| std::vector<std::thread> threads; | std::vector<std::thread> threads; | ||||
| threads.reserve(kThreadNum); | |||||
| threads.reserve(kUsedThreadNum); | |||||
| size_t start = 0; | size_t start = 0; | ||||
| size_t once_compute_size = (total_compute_size + kThreadNum - 1) / kThreadNum; | |||||
| size_t once_compute_size = (total_compute_size + kUsedThreadNum - 1) / kUsedThreadNum; | |||||
| while (start < total_compute_size) { | while (start < total_compute_size) { | ||||
| size_t end = (start + once_compute_size) > total_compute_size ? total_compute_size : (start + once_compute_size); | size_t end = (start + once_compute_size) > total_compute_size ? total_compute_size : (start + once_compute_size); | ||||
| threads.emplace_back(std::thread(func, params, start, end)); | threads.emplace_back(std::thread(func, params, start, end)); | ||||
| @@ -20,7 +20,11 @@ | |||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace kernel { | namespace kernel { | ||||
| const size_t kUseBucketUniqueSize = 100000; | const size_t kUseBucketUniqueSize = 100000; | ||||
| const size_t kUniqueThreadNum = 23; | |||||
| #ifdef ENABLE_D | |||||
| constexpr size_t kUniqueThreadNum = 23; | |||||
| #else | |||||
| constexpr size_t kUniqueThreadNum = 8; | |||||
| #endif | |||||
| void UniqueCPUKernel::InitKernel(const CNodePtr &kernel_node) { | void UniqueCPUKernel::InitKernel(const CNodePtr &kernel_node) { | ||||
| node_ = kernel_node; | node_ = kernel_node; | ||||
| CheckParam(kernel_node); | CheckParam(kernel_node); | ||||