| @@ -19,45 +19,67 @@ | |||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace kernel { | namespace kernel { | ||||
| const size_t kUseBucketUniqueSize = 100000; | |||||
| const size_t kUniqueThreadNum = 23; | |||||
| void UniqueCPUKernel::InitKernel(const CNodePtr &kernel_node) { | void UniqueCPUKernel::InitKernel(const CNodePtr &kernel_node) { | ||||
| CheckParam(kernel_node); | CheckParam(kernel_node); | ||||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | ||||
| n_ = input_shape[0]; | |||||
| input_size_ = input_shape[0]; | |||||
| dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0); | dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0); | ||||
| } | } | ||||
| void UniqueCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) { | |||||
| CPUKernel::InitInputOutputSize(kernel_node); | |||||
| workspace_size_list_.emplace_back(input_size_ * sizeof(int64_t)); | |||||
| workspace_size_list_.emplace_back(input_size_ * sizeof(int64_t)); | |||||
| workspace_size_list_.emplace_back(input_size_ * sizeof(int64_t)); | |||||
| } | |||||
| bool UniqueCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | bool UniqueCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | ||||
| const std::vector<kernel::AddressPtr> & /*workspace*/, | |||||
| const std::vector<kernel::AddressPtr> &workspace, | |||||
| const std::vector<kernel::AddressPtr> &outputs) { | const std::vector<kernel::AddressPtr> &outputs) { | ||||
| if (dtype_ == kNumberTypeInt32) { | if (dtype_ == kNumberTypeInt32) { | ||||
| LaunchKernel<int>(inputs, outputs); | |||||
| } else if (dtype_ == kNumberTypeFloat32) { | |||||
| LaunchKernel<float>(inputs, outputs); | |||||
| LaunchKernel<int, int>(inputs, workspace, outputs); | |||||
| } else if (dtype_ == kNumberTypeInt64) { | } else if (dtype_ == kNumberTypeInt64) { | ||||
| LaunchKernel<int64_t>(inputs, outputs); | |||||
| LaunchKernel<int64_t, int>(inputs, workspace, outputs); | |||||
| } else if (dtype_ == kNumberTypeFloat32) { | |||||
| LaunchKernel<float, int>(inputs, workspace, outputs); | |||||
| } | } | ||||
| return true; | return true; | ||||
| } | } | ||||
| template <typename T> | |||||
| void UniqueCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) { | |||||
| auto x_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||||
| auto y_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||||
| auto idx_addr = reinterpret_cast<int64_t *>(outputs[1]->addr); | |||||
| std::unordered_map<T, int64_t> uniq; | |||||
| int n = SizeToInt(n_); | |||||
| uniq.reserve(n * 2); | |||||
| for (int i = 0, j = 0; i < n; ++i) { | |||||
| auto it = uniq.emplace(x_addr[i], j); | |||||
| idx_addr[i] = it.first->second; | |||||
| if (it.second) { | |||||
| ++j; | |||||
| } | |||||
| template <typename DataType, typename IndexType> | |||||
| void UniqueCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||||
| const std::vector<AddressPtr> &outputs) { | |||||
| if (input_size_ == 0) { | |||||
| return; | |||||
| } | |||||
| if (inputs.size() < 1) { | |||||
| MS_LOG(EXCEPTION) << "Input size should be large than 0"; | |||||
| } | |||||
| if (workspace.size() < 3) { | |||||
| MS_LOG(EXCEPTION) << "workspace size should be large than 2"; | |||||
| } | |||||
| if (outputs.size() < 2) { | |||||
| MS_LOG(EXCEPTION) << "Output size should be large than 1"; | |||||
| } | } | ||||
| for (const auto &it : uniq) { | |||||
| y_addr[it.second] = it.first; | |||||
| auto params = std::make_shared<UniqueParam<DataType, IndexType>>(); | |||||
| params->input_ = reinterpret_cast<DataType *>(inputs[0]->addr); | |||||
| params->input_idx_ = reinterpret_cast<IndexType *>(workspace[0]->addr); | |||||
| params->workspace_ = reinterpret_cast<DataType *>(workspace[1]->addr); | |||||
| params->workspace_idx_ = reinterpret_cast<IndexType *>(workspace[2]->addr); | |||||
| params->output_ = reinterpret_cast<DataType *>(outputs[0]->addr); | |||||
| params->inverse_idx_ = reinterpret_cast<IndexType *>(outputs[1]->addr); | |||||
| params->input_size_ = input_size_; | |||||
| params->output_size_ = 0; | |||||
| params->need_sort_ = true; | |||||
| params->thread_num_ = kUniqueThreadNum; | |||||
| if (input_size_ < kUseBucketUniqueSize) { | |||||
| Unique(params); | |||||
| } else { | |||||
| BucketUnique(params); | |||||
| } | } | ||||
| output_size_ = params->output_size_; | |||||
| } | } | ||||
| void UniqueCPUKernel::CheckParam(const CNodePtr &kernel_node) { | void UniqueCPUKernel::CheckParam(const CNodePtr &kernel_node) { | ||||
| @@ -16,31 +16,339 @@ | |||||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_UNIQUE_CPU_KERNEL_H_ | #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_UNIQUE_CPU_KERNEL_H_ | ||||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_UNIQUE_CPU_KERNEL_H_ | #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_UNIQUE_CPU_KERNEL_H_ | ||||
| #include <vector> | |||||
| #include <algorithm> | |||||
| #include <memory> | #include <memory> | ||||
| #include <thread> | |||||
| #include <unordered_map> | #include <unordered_map> | ||||
| #include <vector> | |||||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | #include "backend/kernel_compiler/cpu/cpu_kernel.h" | ||||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | ||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace kernel { | namespace kernel { | ||||
| template <typename DataType, typename IndexType> | |||||
| struct UniqueParam { | |||||
| DataType *input_{nullptr}; | |||||
| IndexType *input_idx_{nullptr}; | |||||
| DataType *output_{nullptr}; | |||||
| IndexType *inverse_idx_{nullptr}; | |||||
| DataType *workspace_{nullptr}; | |||||
| IndexType *workspace_idx_{nullptr}; | |||||
| IndexType input_size_{0}; | |||||
| IndexType output_size_{0}; | |||||
| size_t thread_num_{0}; | |||||
| bool need_sort_{true}; | |||||
| }; | |||||
| class UniqueCPUKernel : public CPUKernel { | class UniqueCPUKernel : public CPUKernel { | ||||
| public: | public: | ||||
| UniqueCPUKernel() = default; | UniqueCPUKernel() = default; | ||||
| ~UniqueCPUKernel() override = default; | ~UniqueCPUKernel() override = default; | ||||
| void InitKernel(const CNodePtr &kernel_node) override; | void InitKernel(const CNodePtr &kernel_node) override; | ||||
| void InitInputOutputSize(const CNodePtr &kernel_node) override; | |||||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | ||||
| const std::vector<AddressPtr> &outputs) override; | const std::vector<AddressPtr> &outputs) override; | ||||
| template <typename T> | |||||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||||
| template <typename DataType, typename IndexType> | |||||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||||
| const std::vector<AddressPtr> &outputs); | |||||
| private: | |||||
| void CheckParam(const CNodePtr &kernel_node); | |||||
| size_t n_{0}; | |||||
| protected: | |||||
| virtual void CheckParam(const CNodePtr &kernel_node); | |||||
| size_t input_size_{0}; | |||||
| TypeId dtype_{kTypeUnknown}; | TypeId dtype_{kTypeUnknown}; | ||||
| size_t output_size_{0}; | |||||
| template <typename DataType> | |||||
| static size_t BucketId(DataType data, size_t bucket_num) { | |||||
| return static_cast<size_t>(data) % bucket_num; | |||||
| } | |||||
| template <typename DataType, typename IndexType> | |||||
| static void CalculateEachBucketSize(const std::shared_ptr<UniqueParam<DataType, IndexType>> ¶ms, | |||||
| std::vector<IndexType> *each_bucket_size) { | |||||
| MS_EXCEPTION_IF_NULL(params); | |||||
| MS_EXCEPTION_IF_NULL(params->input_); | |||||
| MS_EXCEPTION_IF_NULL(each_bucket_size); | |||||
| size_t bucket_num = each_bucket_size->size(); | |||||
| for (IndexType i = 0; i < params->input_size_; ++i) { | |||||
| auto bucket_id = BucketId(params->input_[i], bucket_num); | |||||
| each_bucket_size->at(bucket_id)++; | |||||
| } | |||||
| } | |||||
| template <typename DataType, typename IndexType> | |||||
| static void SplitAndCalculateBucketSize( | |||||
| const std::shared_ptr<UniqueParam<DataType, IndexType>> ¶ms, | |||||
| std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> *segments_ptr, | |||||
| std::vector<std::shared_ptr<std::vector<IndexType>>> *segment_bucket_sizes_ptr) { | |||||
| MS_EXCEPTION_IF_NULL(params); | |||||
| MS_EXCEPTION_IF_NULL(params->input_); | |||||
| MS_EXCEPTION_IF_NULL(segments_ptr); | |||||
| MS_EXCEPTION_IF_NULL(segment_bucket_sizes_ptr); | |||||
| auto &segments = *segments_ptr; | |||||
| auto &segment_bucket_sizes = *segment_bucket_sizes_ptr; | |||||
| IndexType input_size = params->input_size_; | |||||
| size_t thread_num = params->thread_num_; | |||||
| if (thread_num < 1) { | |||||
| MS_LOG(EXCEPTION) << "Thread num must > 0 !"; | |||||
| } | |||||
| IndexType thread_data_size = input_size / thread_num; | |||||
| size_t left_data_size = input_size % thread_num; | |||||
| std::vector<std::thread> threads; | |||||
| threads.reserve(thread_num); | |||||
| segments.reserve(thread_num); | |||||
| segment_bucket_sizes.reserve(thread_num); | |||||
| IndexType current_offset = 0; | |||||
| for (size_t i = 0; i < thread_num; ++i) { | |||||
| segment_bucket_sizes.emplace_back(std::make_shared<std::vector<IndexType>>(thread_num, 0)); | |||||
| IndexType data_size = thread_data_size; | |||||
| if (i < left_data_size) { | |||||
| data_size += 1; | |||||
| } | |||||
| segments.emplace_back(std::make_shared<UniqueParam<DataType, IndexType>>()); | |||||
| segments[i]->input_ = params->input_ + current_offset; | |||||
| segments[i]->input_size_ = data_size; | |||||
| segments[i]->thread_num_ = thread_num; | |||||
| threads.emplace_back( | |||||
| std::thread(CalculateEachBucketSize<DataType, IndexType>, segments[i], segment_bucket_sizes[i].get())); | |||||
| current_offset += data_size; | |||||
| } | |||||
| for (size_t i = 0; i < params->thread_num_; ++i) { | |||||
| threads[i].join(); | |||||
| } | |||||
| } | |||||
| template <typename DataType, typename IndexType> | |||||
| static void SegmentToBuckets(const std::shared_ptr<UniqueParam<DataType, IndexType>> &segment, | |||||
| IndexType segment_offset, | |||||
| const std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> &buckets) { | |||||
| MS_LOG(DEBUG) << "Start"; | |||||
| MS_EXCEPTION_IF_NULL(segment); | |||||
| MS_EXCEPTION_IF_NULL(segment->input_); | |||||
| std::vector<IndexType> bucket_data_num(segment->thread_num_, 0); | |||||
| auto bucket_size = buckets.size(); | |||||
| for (IndexType i = 0; i < segment->input_size_; ++i) { | |||||
| DataType data = segment->input_[i]; | |||||
| auto bucket_id = BucketId(data, segment->thread_num_); | |||||
| auto bucket_index = bucket_data_num[bucket_id]; | |||||
| if (bucket_id >= bucket_size) { | |||||
| MS_LOG(ERROR) << "Error bucket id!"; | |||||
| continue; | |||||
| } | |||||
| auto &bucket = buckets[bucket_id]; | |||||
| MS_EXCEPTION_IF_NULL(bucket); | |||||
| if (bucket_index >= bucket->input_size_) { | |||||
| MS_LOG(ERROR) << "Error bucket index!"; | |||||
| continue; | |||||
| } | |||||
| bucket->input_[bucket_index] = data; | |||||
| bucket->workspace_idx_[bucket_index] = segment_offset + i; | |||||
| bucket_data_num[bucket_id]++; | |||||
| } | |||||
| MS_LOG(DEBUG) << "End"; | |||||
| } | |||||
| template <typename DataType, typename IndexType> | |||||
| static void GatherSegmentsToBuckets(const std::shared_ptr<UniqueParam<DataType, IndexType>> ¶ms, | |||||
| std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> *segments_ptr, | |||||
| std::vector<std::shared_ptr<std::vector<IndexType>>> *segment_bucket_sizes_ptr, | |||||
| std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> *buckets_ptr) { | |||||
| MS_LOG(DEBUG) << "Start"; | |||||
| MS_EXCEPTION_IF_NULL(params); | |||||
| MS_EXCEPTION_IF_NULL(params->workspace_); | |||||
| MS_EXCEPTION_IF_NULL(params->inverse_idx_); | |||||
| MS_EXCEPTION_IF_NULL(params->workspace_idx_); | |||||
| MS_EXCEPTION_IF_NULL(params->output_); | |||||
| MS_EXCEPTION_IF_NULL(params->input_idx_); | |||||
| MS_EXCEPTION_IF_NULL(segments_ptr); | |||||
| MS_EXCEPTION_IF_NULL(segment_bucket_sizes_ptr); | |||||
| MS_EXCEPTION_IF_NULL(buckets_ptr); | |||||
| auto &segments = *segments_ptr; | |||||
| auto &segment_bucket_sizes = *segment_bucket_sizes_ptr; | |||||
| auto &buckets = *buckets_ptr; | |||||
| auto thread_num = segments.size(); | |||||
| buckets.reserve(thread_num); | |||||
| std::vector<IndexType> bucket_data_size(thread_num, 0); | |||||
| for (size_t i = 0; i < thread_num; ++i) { | |||||
| for (size_t j = 0; j < thread_num; ++j) { | |||||
| bucket_data_size[j] += segment_bucket_sizes[i]->at(j); | |||||
| } | |||||
| } | |||||
| IndexType current_offset = 0; | |||||
| for (size_t i = 0; i < thread_num; ++i) { | |||||
| auto bucket = std::make_shared<UniqueParam<DataType, IndexType>>(); | |||||
| bucket->input_ = params->output_ + current_offset; | |||||
| bucket->input_idx_ = params->inverse_idx_ + current_offset; | |||||
| bucket->workspace_idx_ = params->workspace_idx_ + current_offset; | |||||
| bucket->output_ = params->workspace_ + current_offset; | |||||
| bucket->inverse_idx_ = params->input_idx_ + current_offset; | |||||
| bucket->input_size_ = bucket_data_size[i]; | |||||
| current_offset += bucket_data_size[i]; | |||||
| buckets.emplace_back(bucket); | |||||
| } | |||||
| std::vector<IndexType> tmp_bucket_data_size(thread_num, 0); | |||||
| std::vector<std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>>> thread_buckets; | |||||
| for (size_t i = 0; i < thread_num; ++i) { | |||||
| std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> local_buckets; | |||||
| for (size_t j = 0; j < thread_num; ++j) { | |||||
| auto bucket = std::make_shared<UniqueParam<DataType, IndexType>>(); | |||||
| bucket->input_ = buckets[j]->input_ + tmp_bucket_data_size[j]; | |||||
| bucket->input_size_ = buckets[j]->input_size_ - tmp_bucket_data_size[j]; | |||||
| bucket->workspace_idx_ = buckets[j]->workspace_idx_ + tmp_bucket_data_size[j]; | |||||
| local_buckets.emplace_back(bucket); | |||||
| tmp_bucket_data_size[j] += segment_bucket_sizes[i]->at(j); | |||||
| } | |||||
| thread_buckets.emplace_back(local_buckets); | |||||
| } | |||||
| std::vector<std::thread> threads; | |||||
| threads.reserve(thread_num); | |||||
| current_offset = 0; | |||||
| for (size_t i = 0; i < thread_num; ++i) { | |||||
| MS_EXCEPTION_IF_NULL(segments[i]); | |||||
| threads.emplace_back( | |||||
| std::thread(SegmentToBuckets<DataType, IndexType>, segments[i], current_offset, thread_buckets[i])); | |||||
| current_offset += segments[i]->input_size_; | |||||
| } | |||||
| for (size_t i = 0; i < thread_num; ++i) { | |||||
| threads[i].join(); | |||||
| } | |||||
| MS_LOG(DEBUG) << "End"; | |||||
| } | |||||
| template <typename DataType, typename IndexType> | |||||
| static void Unique(const std::shared_ptr<UniqueParam<DataType, IndexType>> ¶ms) { | |||||
| MS_LOG(DEBUG) << "Start"; | |||||
| MS_EXCEPTION_IF_NULL(params); | |||||
| DataType *input = params->input_; | |||||
| IndexType *input_idx = params->input_idx_; | |||||
| DataType *output = params->output_; | |||||
| IndexType *inverse_idx = params->inverse_idx_; | |||||
| MS_EXCEPTION_IF_NULL(input); | |||||
| MS_EXCEPTION_IF_NULL(input_idx); | |||||
| MS_EXCEPTION_IF_NULL(output); | |||||
| MS_EXCEPTION_IF_NULL(inverse_idx); | |||||
| IndexType j = 0; | |||||
| if (params->need_sort_) { | |||||
| for (IndexType i = 0; i < params->input_size_; ++i) { | |||||
| input_idx[i] = i; | |||||
| } | |||||
| std::sort(input_idx, input_idx + params->input_size_, | |||||
| [&](IndexType left, IndexType right) { return input[left] < input[right]; }); | |||||
| DataType last = input[0]; | |||||
| for (IndexType i = 0; i < params->input_size_; ++i) { | |||||
| auto curr = input[input_idx[i]]; | |||||
| if (i == 0 || curr != last) { | |||||
| if (i != 0) { | |||||
| j++; | |||||
| } | |||||
| output[j] = curr; | |||||
| inverse_idx[input_idx[i]] = j; | |||||
| last = curr; | |||||
| } else { | |||||
| inverse_idx[input_idx[i]] = j; | |||||
| } | |||||
| } | |||||
| params->output_size_ = j + 1; | |||||
| } else { | |||||
| std::unordered_map<DataType, IndexType> uniq; | |||||
| uniq.reserve(params->input_size_); | |||||
| for (IndexType i = 0; i < params->input_size_; ++i) { | |||||
| auto it = uniq.emplace(input[i], j); | |||||
| inverse_idx[i] = it.first->second; | |||||
| if (it.second) { | |||||
| ++j; | |||||
| } | |||||
| } | |||||
| for (const auto &it : uniq) { | |||||
| output[it.second] = it.first; | |||||
| } | |||||
| params->output_size_ = j; | |||||
| } | |||||
| MS_LOG(DEBUG) << "End"; | |||||
| } | |||||
| template <typename DataType, typename IndexType> | |||||
| static void UniqueEachBucket(const std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> &buckets) { | |||||
| MS_LOG(DEBUG) << "Start"; | |||||
| size_t thread_num = buckets.size(); | |||||
| std::vector<std::thread> threads; | |||||
| threads.reserve(thread_num); | |||||
| for (size_t i = 0; i < thread_num; ++i) { | |||||
| threads.emplace_back(std::thread(Unique<DataType, IndexType>, buckets[i])); | |||||
| } | |||||
| for (size_t i = 0; i < thread_num; ++i) { | |||||
| threads[i].join(); | |||||
| } | |||||
| MS_LOG(DEBUG) << "End"; | |||||
| } | |||||
| template <typename DataType, typename IndexType> | |||||
| static void TransformBucketReverseIndices(const std::shared_ptr<UniqueParam<DataType, IndexType>> &bucket, | |||||
| const std::shared_ptr<UniqueParam<DataType, IndexType>> &result, | |||||
| IndexType offset) { | |||||
| MS_EXCEPTION_IF_NULL(bucket); | |||||
| MS_EXCEPTION_IF_NULL(bucket->inverse_idx_); | |||||
| MS_EXCEPTION_IF_NULL(bucket->workspace_idx_); | |||||
| MS_EXCEPTION_IF_NULL(result); | |||||
| MS_EXCEPTION_IF_NULL(result->inverse_idx_); | |||||
| for (IndexType i = 0; i < bucket->input_size_; ++i) { | |||||
| auto origin_idx = bucket->workspace_idx_[i]; | |||||
| if (origin_idx >= 0 && origin_idx < result->input_size_) { | |||||
| result->inverse_idx_[origin_idx] = bucket->inverse_idx_[i] + offset; | |||||
| } | |||||
| } | |||||
| } | |||||
| template <typename DataType, typename IndexType> | |||||
| static void MergeBuckets(const std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> &buckets, | |||||
| const std::shared_ptr<UniqueParam<DataType, IndexType>> &result) { | |||||
| MS_LOG(DEBUG) << "Start"; | |||||
| MS_EXCEPTION_IF_NULL(result); | |||||
| MS_EXCEPTION_IF_NULL(result->output_); | |||||
| size_t thread_num = buckets.size(); | |||||
| std::vector<IndexType> bucket_offsets(thread_num); | |||||
| IndexType current_size = 0; | |||||
| for (size_t i = 0; i < thread_num; ++i) { | |||||
| auto bucket = buckets[i]; | |||||
| MS_EXCEPTION_IF_NULL(bucket); | |||||
| MS_EXCEPTION_IF_NULL(bucket->output_); | |||||
| bucket_offsets[i] = current_size; | |||||
| auto ret_code = memcpy_s(result->output_ + current_size, (result->input_size_ - current_size) * sizeof(DataType), | |||||
| bucket->output_, bucket->output_size_ * sizeof(DataType)); | |||||
| if (ret_code != EOK) { | |||||
| MS_LOG(EXCEPTION) << "Failed to copy data!"; | |||||
| } | |||||
| current_size += bucket->output_size_; | |||||
| } | |||||
| result->output_size_ = current_size; | |||||
| std::vector<std::thread> threads; | |||||
| threads.reserve(thread_num); | |||||
| for (size_t i = 0; i < thread_num; ++i) { | |||||
| threads.emplace_back( | |||||
| std::thread(TransformBucketReverseIndices<DataType, IndexType>, buckets[i], result, bucket_offsets[i])); | |||||
| } | |||||
| for (size_t i = 0; i < thread_num; ++i) { | |||||
| threads[i].join(); | |||||
| } | |||||
| MS_LOG(DEBUG) << "End"; | |||||
| } | |||||
| template <typename DataType, typename IndexType> | |||||
| static void BucketUnique(const std::shared_ptr<UniqueParam<DataType, IndexType>> ¶ms) { | |||||
| MS_EXCEPTION_IF_NULL(params); | |||||
| std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> segments; | |||||
| std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> buckets; | |||||
| std::vector<std::shared_ptr<std::vector<IndexType>>> segment_bucket_sizes; | |||||
| SplitAndCalculateBucketSize(params, &segments, &segment_bucket_sizes); | |||||
| GatherSegmentsToBuckets(params, &segments, &segment_bucket_sizes, &buckets); | |||||
| UniqueEachBucket(buckets); | |||||
| MergeBuckets(buckets, params); | |||||
| } | |||||
| }; | }; | ||||
| MS_REG_CPU_KERNEL( | MS_REG_CPU_KERNEL( | ||||
| @@ -19,49 +19,33 @@ | |||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace kernel { | namespace kernel { | ||||
| void UniqueWithPadCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||||
| CheckParam(kernel_node); | |||||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||||
| n_ = SizeToLong(input_shape[0]); | |||||
| dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0); | |||||
| } | |||||
| bool UniqueWithPadCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | bool UniqueWithPadCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | ||||
| const std::vector<kernel::AddressPtr> & /*workspace*/, | |||||
| const std::vector<kernel::AddressPtr> &workspace, | |||||
| const std::vector<kernel::AddressPtr> &outputs) { | const std::vector<kernel::AddressPtr> &outputs) { | ||||
| UniqueCPUKernel::Launch(inputs, workspace, outputs); | |||||
| if (dtype_ == kNumberTypeInt32) { | if (dtype_ == kNumberTypeInt32) { | ||||
| LaunchKernel<int>(inputs, outputs); | |||||
| PadOutput<int>(inputs, outputs); | |||||
| } else if (dtype_ == kNumberTypeInt64) { | } else if (dtype_ == kNumberTypeInt64) { | ||||
| LaunchKernel<int64_t>(inputs, outputs); | |||||
| } else { | |||||
| MS_LOG(EXCEPTION) << "Only unsupported int32 or int64 dtype"; | |||||
| PadOutput<int64_t>(inputs, outputs); | |||||
| } else if (dtype_ == kNumberTypeFloat32) { | |||||
| PadOutput<float>(inputs, outputs); | |||||
| } | } | ||||
| return true; | return true; | ||||
| } | } | ||||
| template <typename T> | template <typename T> | ||||
| void UniqueWithPadCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||||
| const std::vector<AddressPtr> &outputs) { | |||||
| T *a = reinterpret_cast<T *>(inputs[0]->addr); | |||||
| void UniqueWithPadCPUKernel::PadOutput(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) { | |||||
| if (inputs.size() < 2) { | |||||
| MS_LOG(EXCEPTION) << "Input size should be large than 1"; | |||||
| } | |||||
| if (outputs.size() < 1) { | |||||
| MS_LOG(EXCEPTION) << "Output size should be large than 0"; | |||||
| } | |||||
| T pad_num = *reinterpret_cast<T *>(inputs[1]->addr); | T pad_num = *reinterpret_cast<T *>(inputs[1]->addr); | ||||
| T *out = reinterpret_cast<T *>(outputs[0]->addr); | T *out = reinterpret_cast<T *>(outputs[0]->addr); | ||||
| T *idx_vec = reinterpret_cast<T *>(outputs[1]->addr); | |||||
| for (int64_t i = 0; i < n_; ++i) { | |||||
| for (size_t i = output_size_; i < input_size_; ++i) { | |||||
| out[i] = pad_num; | out[i] = pad_num; | ||||
| } | } | ||||
| std::unordered_map<T, int> uniq; | |||||
| uniq.reserve(n_); | |||||
| for (int64_t i = 0, j = 0; i < n_; ++i) { | |||||
| auto it = uniq.emplace(a[i], j); | |||||
| idx_vec[i] = it.first->second; | |||||
| if (it.second) { | |||||
| ++j; | |||||
| } | |||||
| } | |||||
| for (const auto &it : uniq) { | |||||
| out[it.second] = it.first; | |||||
| } | |||||
| } | } | ||||
| void UniqueWithPadCPUKernel::CheckParam(const CNodePtr &kernel_node) { | void UniqueWithPadCPUKernel::CheckParam(const CNodePtr &kernel_node) { | ||||
| @@ -16,31 +16,26 @@ | |||||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_UNIQUE_WITH_PAD_CPU_KERNEL_H_ | #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_UNIQUE_WITH_PAD_CPU_KERNEL_H_ | ||||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_UNIQUE_WITH_PAD_CPU_KERNEL_H_ | #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_UNIQUE_WITH_PAD_CPU_KERNEL_H_ | ||||
| #include <vector> | |||||
| #include <memory> | #include <memory> | ||||
| #include <unordered_map> | #include <unordered_map> | ||||
| #include <vector> | |||||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | #include "backend/kernel_compiler/cpu/cpu_kernel.h" | ||||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | ||||
| #include "backend/kernel_compiler/cpu/unique_cpu_kernel.h" | |||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace kernel { | namespace kernel { | ||||
| class UniqueWithPadCPUKernel : public CPUKernel { | |||||
| class UniqueWithPadCPUKernel : public UniqueCPUKernel { | |||||
| public: | public: | ||||
| UniqueWithPadCPUKernel() = default; | UniqueWithPadCPUKernel() = default; | ||||
| ~UniqueWithPadCPUKernel() override = default; | ~UniqueWithPadCPUKernel() override = default; | ||||
| void InitKernel(const CNodePtr &kernel_node) override; | |||||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | ||||
| const std::vector<AddressPtr> &outputs) override; | const std::vector<AddressPtr> &outputs) override; | ||||
| template <typename T> | template <typename T> | ||||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||||
| void PadOutput(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||||
| private: | |||||
| void CheckParam(const CNodePtr &kernel_node); | |||||
| int64_t n_{0}; | |||||
| TypeId dtype_{kTypeUnknown}; | |||||
| protected: | |||||
| void CheckParam(const CNodePtr &kernel_node) override; | |||||
| }; | }; | ||||
| MS_REG_CPU_KERNEL(UniqueWithPad, | MS_REG_CPU_KERNEL(UniqueWithPad, | ||||
| @@ -56,7 +51,15 @@ MS_REG_CPU_KERNEL(UniqueWithPad, | |||||
| .AddInputAttr(kNumberTypeInt64) | .AddInputAttr(kNumberTypeInt64) | ||||
| .AddInputAttr(kNumberTypeInt64) | .AddInputAttr(kNumberTypeInt64) | ||||
| .AddOutputAttr(kNumberTypeInt64) | .AddOutputAttr(kNumberTypeInt64) | ||||
| .AddOutputAttr(kNumberTypeInt64), | |||||
| .AddOutputAttr(kNumberTypeInt32), | |||||
| UniqueWithPadCPUKernel); | |||||
| MS_REG_CPU_KERNEL(UniqueWithPad, | |||||
| KernelAttr() | |||||
| .AddInputAttr(kNumberTypeFloat32) | |||||
| .AddInputAttr(kNumberTypeFloat32) | |||||
| .AddOutputAttr(kNumberTypeFloat32) | |||||
| .AddOutputAttr(kNumberTypeInt32), | |||||
| UniqueWithPadCPUKernel); | UniqueWithPadCPUKernel); | ||||
| } // namespace kernel | } // namespace kernel | ||||
| } // namespace mindspore | } // namespace mindspore | ||||
| @@ -33,7 +33,7 @@ class Net(nn.Cell): | |||||
| return self.uniq(x) | return self.uniq(x) | ||||
| def test_net(): | |||||
| def test_net_fp32(): | |||||
| x = Tensor(np.array([1, 2, 5, 2]), mstype.float32) | x = Tensor(np.array([1, 2, 5, 2]), mstype.float32) | ||||
| uniq = Net() | uniq = Net() | ||||
| output = uniq(x) | output = uniq(x) | ||||
| @@ -45,3 +45,31 @@ def test_net(): | |||||
| assert (output[0].asnumpy() == expect_y_result).all() | assert (output[0].asnumpy() == expect_y_result).all() | ||||
| assert (output[1].asnumpy() == expect_idx_result).all() | assert (output[1].asnumpy() == expect_idx_result).all() | ||||
| def test_net_int32(): | |||||
| x = Tensor(np.array([1, 2, 5, 2]), mstype.int32) | |||||
| uniq = Net() | |||||
| output = uniq(x) | |||||
| print("x:\n", x) | |||||
| print("y:\n", output[0]) | |||||
| print("idx:\n", output[1]) | |||||
| expect_y_result = [1, 2, 5] | |||||
| expect_idx_result = [0, 1, 2, 1] | |||||
| assert (output[0].asnumpy() == expect_y_result).all() | |||||
| assert (output[1].asnumpy() == expect_idx_result).all() | |||||
| def test_net_int64(): | |||||
| x = Tensor(np.array([1, 2, 5, 2]), mstype.int64) | |||||
| uniq = Net() | |||||
| output = uniq(x) | |||||
| print("x:\n", x) | |||||
| print("y:\n", output[0]) | |||||
| print("idx:\n", output[1]) | |||||
| expect_y_result = [1, 2, 5] | |||||
| expect_idx_result = [0, 1, 2, 1] | |||||
| assert (output[0].asnumpy() == expect_y_result).all() | |||||
| assert (output[1].asnumpy() == expect_idx_result).all() | |||||
| @@ -29,7 +29,7 @@ class UniqueCpuKernelTest : public UT::Common { | |||||
| UniqueCpuKernelTest() : unique_(std::make_shared<UniqueCPUKernel>()) {} | UniqueCpuKernelTest() : unique_(std::make_shared<UniqueCPUKernel>()) {} | ||||
| void SetUp() override { | void SetUp() override { | ||||
| unique_->n_ = 9; | |||||
| unique_->input_size_ = 9; | |||||
| unique_->dtype_ = kNumberTypeFloat32; | unique_->dtype_ = kNumberTypeFloat32; | ||||
| inputs_.clear(); | inputs_.clear(); | ||||
| workspace_.clear(); | workspace_.clear(); | ||||
| @@ -42,16 +42,19 @@ class UniqueCpuKernelTest : public UT::Common { | |||||
| return kernel_addr; | return kernel_addr; | ||||
| } | } | ||||
| void CreateInputAddress() { inputs_.push_back(CreateKernelAddress(x_.data())); } | |||||
| void CreateOutputAddress() { | |||||
| void CreateAddress() { | |||||
| inputs_.push_back(CreateKernelAddress(x_.data())); | |||||
| outputs_.push_back(CreateKernelAddress(y_.data())); | outputs_.push_back(CreateKernelAddress(y_.data())); | ||||
| outputs_.push_back(CreateKernelAddress(idx_.data())); | outputs_.push_back(CreateKernelAddress(idx_.data())); | ||||
| workspace_.push_back(CreateKernelAddress(workspace_idx_.data())); | |||||
| workspace_.push_back(CreateKernelAddress(workspace_idx_.data())); | |||||
| workspace_.push_back(CreateKernelAddress(workspace_idx_.data())); | |||||
| } | } | ||||
| std::vector<float> x_; | std::vector<float> x_; | ||||
| std::vector<float> y_; | std::vector<float> y_; | ||||
| std::vector<int64_t> idx_; | |||||
| std::vector<int> idx_; | |||||
| std::vector<int64_t> workspace_idx_; | |||||
| std::vector<AddressPtr> inputs_; | std::vector<AddressPtr> inputs_; | ||||
| std::vector<AddressPtr> workspace_; | std::vector<AddressPtr> workspace_; | ||||
| std::vector<AddressPtr> outputs_; | std::vector<AddressPtr> outputs_; | ||||
| @@ -62,13 +65,13 @@ TEST_F(UniqueCpuKernelTest, compute_test) { | |||||
| x_ = {1, 1, 2, 4, 4, 4, 7, 8, 8}; | x_ = {1, 1, 2, 4, 4, 4, 7, 8, 8}; | ||||
| y_ = {1, 1, 1, 1, 1}; | y_ = {1, 1, 1, 1, 1}; | ||||
| idx_ = {1, 1, 1, 1, 1, 1, 1, 1, 1}; | idx_ = {1, 1, 1, 1, 1, 1, 1, 1, 1}; | ||||
| CreateInputAddress(); | |||||
| CreateOutputAddress(); | |||||
| workspace_idx_ = {1, 1, 1, 1, 1, 1, 1, 1, 1}; | |||||
| CreateAddress(); | |||||
| unique_->Launch(inputs_, workspace_, outputs_); | unique_->Launch(inputs_, workspace_, outputs_); | ||||
| // check compute result | // check compute result | ||||
| std::vector<float> expect_y{1, 2, 4, 7, 8}; | std::vector<float> expect_y{1, 2, 4, 7, 8}; | ||||
| std::vector<int64_t> expect_idx{0, 0, 1, 2, 2, 2, 3, 4, 4}; | |||||
| std::vector<int> expect_idx{0, 0, 1, 2, 2, 2, 3, 4, 4}; | |||||
| EXPECT_TRUE(y_ == expect_y); | EXPECT_TRUE(y_ == expect_y); | ||||
| EXPECT_TRUE(idx_ == expect_idx); | EXPECT_TRUE(idx_ == expect_idx); | ||||
| } | } | ||||
| @@ -29,7 +29,7 @@ class UniqueWithPadCpuKernelTest : public UT::Common { | |||||
| UniqueWithPadCpuKernelTest() : unique_with_pad_(std::make_shared<UniqueWithPadCPUKernel>()) {} | UniqueWithPadCpuKernelTest() : unique_with_pad_(std::make_shared<UniqueWithPadCPUKernel>()) {} | ||||
| void SetUp() override { | void SetUp() override { | ||||
| unique_with_pad_->n_ = 10; | |||||
| unique_with_pad_->input_size_ = 10; | |||||
| unique_with_pad_->dtype_ = kNumberTypeInt64; | unique_with_pad_->dtype_ = kNumberTypeInt64; | ||||
| inputs_.clear(); | inputs_.clear(); | ||||
| workspace_.clear(); | workspace_.clear(); | ||||
| @@ -42,21 +42,21 @@ class UniqueWithPadCpuKernelTest : public UT::Common { | |||||
| return kernel_addr; | return kernel_addr; | ||||
| } | } | ||||
| void CreateInputAddress() { | |||||
| void CreateAddress() { | |||||
| inputs_.push_back(CreateKernelAddress(x_.data())); | inputs_.push_back(CreateKernelAddress(x_.data())); | ||||
| inputs_.push_back(CreateKernelAddress(&pad_dim_)); | inputs_.push_back(CreateKernelAddress(&pad_dim_)); | ||||
| ; | |||||
| } | |||||
| void CreateOutputAddress() { | |||||
| outputs_.push_back(CreateKernelAddress(out_.data())); | outputs_.push_back(CreateKernelAddress(out_.data())); | ||||
| outputs_.push_back(CreateKernelAddress(idx_.data())); | outputs_.push_back(CreateKernelAddress(idx_.data())); | ||||
| workspace_.push_back(CreateKernelAddress(workspace_idx_.data())); | |||||
| workspace_.push_back(CreateKernelAddress(workspace_idx_.data())); | |||||
| workspace_.push_back(CreateKernelAddress(workspace_idx_.data())); | |||||
| } | } | ||||
| std::vector<int64_t> x_; | std::vector<int64_t> x_; | ||||
| int64_t pad_dim_; | int64_t pad_dim_; | ||||
| std::vector<int64_t> out_; | std::vector<int64_t> out_; | ||||
| std::vector<int64_t> idx_; | |||||
| std::vector<int> idx_; | |||||
| std::vector<int64_t> workspace_idx_; | |||||
| std::vector<AddressPtr> inputs_; | std::vector<AddressPtr> inputs_; | ||||
| std::vector<AddressPtr> workspace_; | std::vector<AddressPtr> workspace_; | ||||
| std::vector<AddressPtr> outputs_; | std::vector<AddressPtr> outputs_; | ||||
| @@ -68,13 +68,13 @@ TEST_F(UniqueWithPadCpuKernelTest, compute_test) { | |||||
| pad_dim_ = 8; | pad_dim_ = 8; | ||||
| out_ = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; | out_ = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; | ||||
| idx_ = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; | idx_ = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; | ||||
| CreateInputAddress(); | |||||
| CreateOutputAddress(); | |||||
| workspace_idx_ = {1, 1, 1, 1, 1, 1, 1, 1, 1}; | |||||
| CreateAddress(); | |||||
| unique_with_pad_->Launch(inputs_, workspace_, outputs_); | unique_with_pad_->Launch(inputs_, workspace_, outputs_); | ||||
| // check compute result | // check compute result | ||||
| std::vector<int64_t> expect_out{1, 5, 4, 3, 2, 8, 8, 8, 8, 8}; | |||||
| std::vector<int64_t> expect_idx{0, 0, 1, 1, 2, 2, 3, 3, 4, 4}; | |||||
| std::vector<int64_t> expect_out{1, 2, 3, 4, 5, 8, 8, 8, 8, 8}; | |||||
| std::vector<int> expect_idx{0, 0, 4, 4, 3, 3, 2, 2, 1, 1}; | |||||
| EXPECT_TRUE(out_ == expect_out); | EXPECT_TRUE(out_ == expect_out); | ||||
| EXPECT_TRUE(idx_ == expect_idx); | EXPECT_TRUE(idx_ == expect_idx); | ||||
| } | } | ||||