Browse Source

!15202 优化 Unstack 算子

From: @he-botao
Reviewed-by: @wuxuejian,@liangchenghui
Signed-off-by: @wuxuejian
pull/15202/MERGE
mindspore-ci-bot Gitee 4 years ago
parent
commit
cd272cf581
2 changed files with 9 additions and 33 deletions
  1. +9
    -32
      mindspore/ccsrc/backend/kernel_compiler/cpu/unpack_cpu_kernel.cc
  2. +0
    -1
      mindspore/ccsrc/backend/kernel_compiler/cpu/unpack_cpu_kernel.h

+ 9
- 32
mindspore/ccsrc/backend/kernel_compiler/cpu/unpack_cpu_kernel.cc View File

@@ -64,38 +64,15 @@ void UnpackCPUKernel<T>::LaunchKernel(const std::vector<AddressPtr> &inputs,
outputs_host_[i] = reinterpret_cast<T *>(outputs[i]->addr);
MS_EXCEPTION_IF_NULL(outputs_host_[i]);
}
auto max_thread_num = std::thread::hardware_concurrency();
size_t thread_num = input_size_ < 128 * max_thread_num ? std::ceil(input_size_ / 128.0) : max_thread_num;
if (thread_num < 1) {
MS_LOG(ERROR) << "Invalid value: thread_num" << thread_num;
return;
}
std::vector<std::thread> threads;
threads.reserve(thread_num);
size_t start = 0;
size_t one_gap = (input_size_ + thread_num - 1) / thread_num;
if (one_gap < 1) {
MS_LOG(ERROR) << "Invalid value: one_gap " << one_gap;
return;
}
while (start < input_size_) {
size_t end = (start + one_gap) > input_size_ ? input_size_ : (start + one_gap);
threads.emplace_back(std::thread(&UnpackCPUKernel::UnpackResult, this, start, end));
start += one_gap;
}
for (size_t i = 0; i < threads.size(); ++i) {
threads[i].join();
}
}

template <typename T>
void UnpackCPUKernel<T>::UnpackResult(const size_t start, const size_t end) {
for (size_t i = start; i < end; ++i) {
size_t output_index = (i / dims_after_axis_) % output_num_;
size_t number_of_reset = output_num_ * dims_after_axis_;
size_t tensor_index = i / number_of_reset * dims_after_axis_ + i % dims_after_axis_;
outputs_host_[output_index][tensor_index] = input_[i];
}
// Size of one group of output_num_ consecutive chunks, each dims_after_axis_ elements long.
// Computed once here so the per-element loop below does not recompute the product.
size_t number_of_reset = output_num_ * dims_after_axis_;
// Copies input elements with flat indices in [start, end) into the per-output buffers.
// `this` is captured because the body reads input_, outputs_host_, output_num_ and dims_after_axis_.
// NOTE(review): the divisions/modulos below assume dims_after_axis_ and output_num_ are non-zero —
// confirm that parameter checking guarantees this before the kernel is launched.
auto task = [this, number_of_reset](const size_t start, const size_t end) {
for (size_t i = start; i < end; ++i) {
// Writing i = g * number_of_reset + k * dims_after_axis_ + r (k < output_num_, r < dims_after_axis_):
// output_index is k, i.e. which output buffer receives this element.
size_t output_index = (i / dims_after_axis_) % output_num_;
// tensor_index is g * dims_after_axis_ + r, i.e. the element's offset inside that output buffer.
// Different values of i therefore never map to the same (output_index, tensor_index) pair.
size_t tensor_index = i / number_of_reset * dims_after_axis_ + i % dims_after_axis_;
outputs_host_[output_index][tensor_index] = input_[i];
}
};
// Apply the task to all input_size_ elements.
// NOTE(review): presumably ParallelFor splits [0, input_size_) into sub-ranges and invokes
// task(start, end) on each, possibly on several threads — confirm against CPUKernelUtils.
CPUKernelUtils::ParallelFor(task, input_size_);
}

template <typename T>


+ 0
- 1
mindspore/ccsrc/backend/kernel_compiler/cpu/unpack_cpu_kernel.h View File

@@ -41,7 +41,6 @@ class UnpackCPUKernel : public CPUKernel {

protected:
virtual void CheckParam(const CNodePtr &kernel_node);
virtual void UnpackResult(const size_t start, const size_t end);
size_t input_size_{1};
size_t output_num_{0};
size_t dims_after_axis_{1};


Loading…
Cancel
Save