From: @zhangzhewei01 Reviewed-by: @wuxuejian Signed-off-by:tags/v1.3.0
| @@ -22,16 +22,16 @@ | |||
| typedef struct TileParameter { | |||
| // primitive parameter | |||
| OpParameter op_parameter_; | |||
| int multiples_[5]; | |||
| int dims_[5]; | |||
| int multiples_[7]; | |||
| int dims_[7]; | |||
| size_t dims_size_; | |||
| size_t multiples_size_; | |||
| // shape correlative | |||
| int in_shape_[5]; | |||
| int out_shape_[5]; | |||
| int in_strides_[5]; | |||
| int out_strides_[5]; | |||
| int in_shape_[7]; | |||
| int out_shape_[7]; | |||
| int in_strides_[7]; | |||
| int out_strides_[7]; | |||
| // other parameter | |||
| int in_dim_; | |||
| @@ -0,0 +1,40 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "nnacl/base/unsorted_segment_sum_base.h" | |||
| #include "nnacl/errorcode.h" | |||
| #define UNSORTEDSEGMENTSUM(type) \ | |||
| int UnsortedSegmentSum_##type(const type *input, int unit_num, int input_dim1, const int *indices, type *output, \ | |||
| int output_dim0, int output_dim1) { \ | |||
| if (input_dim1 == 0) { \ | |||
| return NNACL_ERR; \ | |||
| } \ | |||
| for (int i = 0; i < unit_num; ++i) { \ | |||
| int j = i / input_dim1; \ | |||
| int k = i % input_dim1; \ | |||
| \ | |||
| int index = indices[j]; \ | |||
| if (index < 0 || index >= output_dim0) { \ | |||
| continue; \ | |||
| } \ | |||
| int output_index = index * output_dim1 + k; \ | |||
| output[output_index] += input[i]; \ | |||
| } \ | |||
| return NNACL_OK; \ | |||
| } | |||
| UNSORTEDSEGMENTSUM(int) | |||
| UNSORTEDSEGMENTSUM(float) | |||
| @@ -0,0 +1,32 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_NNACL_UNSORTED_SEGMENT_SUM_BASE_H_ | |||
| #define MINDSPORE_NNACL_UNSORTED_SEGMENT_SUM_BASE_H_ | |||
| #ifdef __cplusplus | |||
| extern "C" { | |||
| #endif | |||
// Type-dispatch helper: UnsortedSegmentSum(T, args...) expands to a call of
// UnsortedSegmentSum_T(args...). Only the element types declared below (int, float)
// have a matching function.
| #define UnsortedSegmentSum(type, input, unit_num, input_dim1, indices, output, output_dim0, output_dim1) \ | |||
| UnsortedSegmentSum_##type(input, unit_num, input_dim1, indices, output, output_dim0, output_dim1) | |||
// int32 variant. Takes a const input buffer, an element count (unit_num), the input
// row width (input_dim1), a const int index buffer, a mutable output buffer and the
// two output dimensions; returns an int status code.
| int UnsortedSegmentSum_int(const int *input, int unit_num, int input_dim1, const int *indices, int *output, | |||
| int output_dim0, int output_dim1); | |||
// float variant with the same parameter layout and int status return.
| int UnsortedSegmentSum_float(const float *input, int unit_num, int input_dim1, const int *indices, float *output, | |||
| int output_dim0, int output_dim1); | |||
| #ifdef __cplusplus | |||
| } | |||
| #endif | |||
| #endif // MINDSPORE_NNACL_UNSORTED_SEGMENT_SUM_BASE_H_ | |||
| @@ -1,36 +0,0 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "nnacl/fp32_grad/unsorted_segment_sum.h" | |||
| #include "nnacl/errorcode.h" | |||
| int UnsortedSegmentSum(const float *input, int unit_num, int input_dim1, const int *indices, float *output, | |||
| int output_dim0, int output_dim1) { | |||
| if (input_dim1 == 0) { | |||
| return NNACL_ERR; | |||
| } | |||
| for (int i = 0; i < unit_num; ++i) { | |||
| int j = i / input_dim1; | |||
| int k = i % input_dim1; | |||
| int index = indices[j]; | |||
| if (index < 0 || index >= output_dim0) { | |||
| continue; | |||
| } | |||
| int output_index = index * output_dim1 + k; | |||
| output[output_index] += input[i]; | |||
| } | |||
| return NNACL_OK; | |||
| } | |||
| @@ -1,29 +0,0 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_NNACL_FP32_GRAD_UNSORTED_SEGMENT_SUM_H_ | |||
| #define MINDSPORE_NNACL_FP32_GRAD_UNSORTED_SEGMENT_SUM_H_ | |||
| #ifdef __cplusplus | |||
| extern "C" { | |||
| #endif | |||
// float-only unsorted segment sum. Takes a const input buffer, an element count
// (unit_num), the input row width (input_dim1), a const int index buffer, a mutable
// output buffer and the two output dimensions; returns an int status code.
| int UnsortedSegmentSum(const float *input, int unit_num, int input_dim1, const int *indices, float *output, | |||
| int output_dim0, int output_dim1); | |||
| #ifdef __cplusplus | |||
| } | |||
| #endif | |||
| #endif // MINDSPORE_NNACL_FP32_GRAD_UNSORTED_SEGMENT_SUM_H_ | |||
| @@ -20,18 +20,69 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void TileCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| CheckParam(kernel_node); | |||
| void TileCPUKernel::TileMultipleCompute(void) { | |||
| int large_one_multiple_count_ = 0; | |||
| int multiple = 0; | |||
| int mul_index = 0; | |||
| for (size_t i = 0; i < multiples_.size(); i++) { | |||
| tile_parameter_.multiples_[i] = multiples_[i]; | |||
| if (tile_parameter_.multiples_[i] > 1) { | |||
| large_one_multiple_count_++; | |||
| multiple = tile_parameter_.multiples_[i]; | |||
| mul_index = i; | |||
| } | |||
| } | |||
| one_dim_tile_ = large_one_multiple_count_ == 1; | |||
| if (one_dim_tile_) { | |||
| tile_parameter_.fast_multiple_ = static_cast<size_t>(multiple); | |||
| tile_parameter_.fast_stride_ = static_cast<size_t>(x_shape_[mul_index] * tile_parameter_.in_strides_[mul_index]); | |||
| tile_parameter_.fast_outer_size_ = static_cast<size_t>(input_size_ / tile_parameter_.fast_stride_); | |||
| } | |||
| } | |||
| void TileCPUKernel::TileTensorParamrInit(const CNodePtr &kernel_node) { | |||
| x_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| y_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| std::vector<int64_t> multiples_me = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, "multiples"); | |||
| (void)std::transform(multiples_me.begin(), multiples_me.end(), std::back_inserter(multiples_), | |||
| [](const int64_t &value) { return static_cast<int>(value); }); | |||
| dtype_ = AnfAlgo ::GetPrevNodeOutputDeviceDataType(kernel_node, 0); | |||
| dtype_ = AnfAlgo::GetPrevNodeOutputDeviceDataType(kernel_node, 0); | |||
| if (dtype_ == kTypeUnknown) { | |||
| dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0); | |||
| } | |||
| size_t ones = multiples_.size() - x_shape_.size(); | |||
| if (ones > 0) { | |||
| for (size_t i = 0; i < ones; ++i) { | |||
| x_shape_.insert(x_shape_.begin(), 1); | |||
| } | |||
| } | |||
| input_size_ = 1; | |||
| tile_parameter_.in_dim_ = x_shape_.size(); | |||
| for (int i = 0; i < tile_parameter_.in_dim_; i++) { | |||
| input_size_ *= x_shape_[i]; | |||
| tile_parameter_.in_shape_[i] = x_shape_[i]; | |||
| tile_parameter_.out_shape_[i] = y_shape_[i]; | |||
| } | |||
| int stridex = 1; | |||
| int stridey = 1; | |||
| for (int i = tile_parameter_.in_dim_ - 1; i >= 0; i--) { | |||
| tile_parameter_.in_strides_[i] = stridex; | |||
| tile_parameter_.out_strides_[i] = stridey; | |||
| stridex *= x_shape_[i]; | |||
| stridey *= y_shape_[i]; | |||
| } | |||
| TileMultipleCompute(); | |||
| } | |||
| void TileCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| CheckParam(kernel_node); | |||
| TileTensorParamrInit(kernel_node); | |||
| launch_map_[kNumberTypeInt8] = &TileCPUKernel::LaunchKernel<int8_t>; | |||
| launch_map_[kNumberTypeInt16] = &TileCPUKernel::LaunchKernel<int16_t>; | |||
| launch_map_[kNumberTypeInt32] = &TileCPUKernel::LaunchKernel<int>; | |||
| @@ -57,54 +108,18 @@ bool TileCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const | |||
| return true; | |||
| } | |||
// Recursive tiling helper.
//
// For axis `dim` it visits every index of x_shape[dim]. On the last axis it
// copies one element from x (located via the coordinates in *pos and the strides
// in cargo_x) to y at *offset and advances *offset by one; on other axes it
// recurses into dim + 1. After the axis has been written once, the block of
// cargo_y[dim] elements just produced is copied multiples[dim] - 1 more times
// directly behind itself, advancing *offset each time.
template <typename T>
void TileRecTask(const T *x, T *y, size_t dim, size_t *offset, std::vector<size_t> *pos,
                 const std::vector<int> &multiples, const std::vector<size_t> &cargo_x,
                 const std::vector<size_t> &cargo_y, const std::vector<size_t> &x_shape) {
  const size_t rank = x_shape.size();
  if (dim == rank) {
    return;
  }

  const bool innermost = (dim == rank - 1);
  std::vector<size_t> &coord = *pos;
  for (size_t idx = 0; idx < x_shape[dim]; ++idx) {
    coord[dim] = idx;
    if (!innermost) {
      TileRecTask(x, y, dim + 1, offset, pos, multiples, cargo_x, cargo_y, x_shape);
      continue;
    }
    // Flatten the current coordinates into a source element offset.
    size_t src = 0;
    for (size_t axis = 0; axis < coord.size(); ++axis) {
      src += coord[axis] * cargo_x[axis];
    }
    memcpy_s(y + *offset, sizeof(T), x + src, sizeof(T));
    *offset += 1;
  }

  // Replicate the block written for this axis.
  const size_t block_elems = cargo_y[dim];
  const size_t block_bytes = block_elems * sizeof(T);
  const int extra_copies = multiples[dim] - 1;
  for (int rep = 0; rep < extra_copies; ++rep) {
    const size_t from = *offset - block_elems;
    memcpy_s(y + *offset, block_bytes, y + from, block_bytes);
    *offset += block_elems;
  }
}
| template <typename T> | |||
| void TileCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) { | |||
| auto x_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto y_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| size_t ones = multiples_.size() - x_shape_.size(); | |||
| if (ones > 0) { | |||
| for (size_t i = 0; i < ones; ++i) { | |||
| x_shape_.insert(x_shape_.begin(), 1); | |||
| } | |||
| } | |||
| int d = multiples_.size(); | |||
| std::vector<size_t> pos(d, 0); | |||
| std::vector<size_t> cargo_x(d, 1); | |||
| std::vector<size_t> cargo_y = x_shape_; | |||
| for (int i = d - 2; i >= 0; --i) { | |||
| cargo_x[i] = x_shape_[i + 1] * cargo_x[i + 1]; | |||
| cargo_y[i] *= cargo_y[i + 1] * multiples_[i + 1]; | |||
| tile_parameter_.data_size_ = sizeof(T); | |||
| if (one_dim_tile_) { | |||
| auto task = [&](size_t start, size_t end) { TileSimple(x_addr, y_addr, start, end, &tile_parameter_); }; | |||
| CPUKernelUtils::ParallelFor(task, tile_parameter_.fast_outer_size_); | |||
| } | |||
| size_t offset = 0; | |||
| TileRecTask<T>(x_addr, y_addr, 0, &offset, &pos, multiples_, cargo_x, cargo_y, x_shape_); | |||
| Tile(x_addr, y_addr, &tile_parameter_); | |||
| } | |||
| void TileCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| @@ -21,6 +21,7 @@ | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| #include "nnacl/base/tile_base.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| @@ -37,6 +38,10 @@ class TileCPUKernel : public CPUKernel { | |||
| template <typename T> | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| void TileTensorParamrInit(const CNodePtr &kernel_node); | |||
| void TileMultipleCompute(void); | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| std::vector<size_t> x_shape_; | |||
| @@ -47,6 +52,9 @@ class TileCPUKernel : public CPUKernel { | |||
| std::function<void(TileCPUKernel *, const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs)>; | |||
| std::unordered_map<TypeId, TypeKernel> launch_map_; | |||
| TypeKernel launch_func_; | |||
| TileParameter tile_parameter_; | |||
| bool one_dim_tile_; | |||
| size_t input_size_; | |||
| }; | |||
| MS_REG_CPU_KERNEL(Tile, KernelAttr().AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8), TileCPUKernel); | |||
| @@ -52,44 +52,38 @@ bool UnsortedSegmentSumCPUKernel::Launch(const std::vector<kernel::AddressPtr> & | |||
| const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| bool ret{true}; | |||
| void *input_addr = inputs[0]->addr; | |||
| const int *indices_addr = reinterpret_cast<const int *>(inputs[1]->addr); | |||
| void *output_addr = outputs[0]->addr; | |||
| auto ret1 = memset_s(output_addr, outputs[0]->size, 0, outputs[0]->size); | |||
| if (ret1 != EOK) { | |||
| MS_LOG(ERROR) << "Output buff memset fail. ret:" << ret1; | |||
| return false; | |||
| } | |||
| if (dtype_ == kNumberTypeInt32 && segment_ids_dtype_ == kNumberTypeInt32) { | |||
| ret = LaunchKernel<int, int>(inputs, outputs); | |||
| ret1 = UnsortedSegmentSum(int, static_cast<const int *>(input_addr), unit_num_, input_dim1_, indices_addr, | |||
| static_cast<int *>(output_addr), output_dim0_, output_dim1_); | |||
| } else if (dtype_ == kNumberTypeFloat32 && segment_ids_dtype_ == kNumberTypeInt32) { | |||
| ret = LaunchKernel<float, int>(inputs, outputs); | |||
| ret1 = UnsortedSegmentSum(float, static_cast<const float *>(input_addr), unit_num_, input_dim1_, indices_addr, | |||
| static_cast<float *>(output_addr), output_dim0_, output_dim1_); | |||
| } else if (dtype_ == kNumberTypeInt32 && segment_ids_dtype_ == kNumberTypeInt64) { | |||
| ret = LaunchKernel<int, int64_t>(inputs, outputs); | |||
| ret1 = UnsortedSegmentSum(int, static_cast<const int *>(input_addr), unit_num_, input_dim1_, indices_addr, | |||
| static_cast<int *>(output_addr), output_dim0_, output_dim1_); | |||
| } else if (dtype_ == kNumberTypeFloat32 && segment_ids_dtype_ == kNumberTypeInt64) { | |||
| ret = LaunchKernel<float, int64_t>(inputs, outputs); | |||
| ret1 = UnsortedSegmentSum(float, static_cast<const float *>(input_addr), unit_num_, input_dim1_, indices_addr, | |||
| static_cast<float *>(output_addr), output_dim0_, output_dim1_); | |||
| } else { | |||
| MS_LOG(ERROR) << "Only support input_x int32 and float32, indices int32 and int64"; | |||
| return false; | |||
| } | |||
| return ret; | |||
| } | |||
| template <typename S, typename T> | |||
| bool UnsortedSegmentSumCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| S *input_addr = reinterpret_cast<S *>(inputs[0]->addr); | |||
| T *indices_addr = reinterpret_cast<T *>(inputs[1]->addr); | |||
| S *output_addr = reinterpret_cast<S *>(outputs[0]->addr); | |||
| auto ret = memset_s(output_addr, outputs[0]->size, 0, outputs[0]->size); | |||
| if (ret != EOK) { | |||
| MS_LOG(ERROR) << "Output buff memset fail. ret:" << ret; | |||
| if (ret1 != EOK) { | |||
| MS_LOG(ERROR) << "unsortedSegmentSum failed. ret:" << ret1; | |||
| return false; | |||
| } | |||
| for (size_t i = 0; i < unit_num_; ++i) { | |||
| size_t j = i / input_dim1_; | |||
| size_t k = i % input_dim1_; | |||
| T index = indices_addr[j]; | |||
| if (index < 0 || index >= SizeToInt(output_dim0_)) { | |||
| continue; | |||
| } | |||
| size_t output_index = index * output_dim1_ + k; | |||
| output_addr[output_index] += input_addr[i]; | |||
| } | |||
| return true; | |||
| return ret; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -21,6 +21,7 @@ | |||
| #include <unordered_map> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| #include "nnacl/base/unsorted_segment_sum_base.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| @@ -19,7 +19,7 @@ | |||
| #include <algorithm> | |||
| #include "schema/model_generated.h" | |||
| #include "src/kernel_registry.h" | |||
| #include "nnacl/fp32_grad/unsorted_segment_sum.h" | |||
| #include "nnacl/base/unsorted_segment_sum_base.h" | |||
| #include "include/errorcode.h" | |||
| #include "src/runtime/runtime_api.h" | |||
| @@ -86,7 +86,7 @@ int UnsortedSegmentSumCPUKernel::Execute(int task_id) { | |||
| int *indices = reinterpret_cast<int *>(indices_tensor->data_c()); | |||
| float *output = reinterpret_cast<float *>(output_tensor->MutableData()); | |||
| std::fill(output, output + output_tensor->ElementsNum(), 0.f); | |||
| ret = UnsortedSegmentSum(input, unit_num_, input_dim1_, indices, output, output_dim0_, output_dim1_); | |||
| ret = UnsortedSegmentSum(float, input, unit_num_, input_dim1_, indices, output, output_dim0_, output_dim1_); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "StridedSliceGrad error error_code[" << ret << "]"; | |||
| return RET_ERROR; | |||