!15898 Convert the implementation of the Tile and UnsortedSegmentSum CPU ops to nnacl

From: @zhangzhewei01 Reviewed-by: @wuxuejian Signed-off-by:
4 years ago · 5e8486adea
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/tile_base.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/tile_base.h
@@ -22,16 +22,16 @@
 typedef struct TileParameter {
  // primitive parameter
  OpParameter op_parameter_;
  int multiples_[5];
  int dims_[5];
  int multiples_[7];
  int dims_[7];
  size_t dims_size_;
  size_t multiples_size_;

  // shape correlative
  int in_shape_[5];
  int out_shape_[5];
  int in_strides_[5];
  int out_strides_[5];
  int in_shape_[7];
  int out_shape_[7];
  int in_strides_[7];
  int out_strides_[7];

  // other parameter
  int in_dim_;
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/unsorted_segment_sum_base.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/unsorted_segment_sum_base.c
@@ -0,0 +1,40 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "nnacl/base/unsorted_segment_sum_base.h"
 #include "nnacl/errorcode.h"

 #define UNSORTEDSEGMENTSUM(type)                                                                                   \
  int UnsortedSegmentSum_##type(const type *input, int unit_num, int input_dim1, const int *indices, type *output, \
                                int output_dim0, int output_dim1) {                                                \
    if (input_dim1 == 0) {                                                                                         \
      return NNACL_ERR;                                                                                            \
    }                                                                                                              \
    for (int i = 0; i < unit_num; ++i) {                                                                           \
      int j = i / input_dim1;                                                                                      \
      int k = i % input_dim1;                                                                                      \
                                                                                                                   \
      int index = indices[j];                                                                                      \
      if (index < 0 || index >= output_dim0) {                                                                     \
        continue;                                                                                                  \
      }                                                                                                            \
      int output_index = index * output_dim1 + k;                                                                  \
      output[output_index] += input[i];                                                                            \
    }                                                                                                              \
    return NNACL_OK;                                                                                               \
  }

 /* Emit the concrete kernels UnsortedSegmentSum_int and UnsortedSegmentSum_float. */
 UNSORTEDSEGMENTSUM(int)
 UNSORTEDSEGMENTSUM(float)
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/unsorted_segment_sum_base.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/unsorted_segment_sum_base.h
@@ -0,0 +1,32 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_NNACL_UNSORTED_SEGMENT_SUM_BASE_H_
 #define MINDSPORE_NNACL_UNSORTED_SEGMENT_SUM_BASE_H_

 #ifdef __cplusplus
 extern "C" {
 #endif
 /*
  * Type-dispatching front end: UnsortedSegmentSum(float, ...) expands to UnsortedSegmentSum_float(...).
  * `type` is pasted into the function name, so it must be a single token for which a kernel is
  * declared below (int or float).
  */
 #define UnsortedSegmentSum(type, input, unit_num, input_dim1, indices, output, output_dim0, output_dim1) \
  UnsortedSegmentSum_##type(input, unit_num, input_dim1, indices, output, output_dim0, output_dim1)
 /*
  * Adds each of the unit_num input elements to output[indices[i / input_dim1] * output_dim1 + i % input_dim1];
  * elements whose segment id is outside [0, output_dim0) are skipped. The output is accumulated into,
  * not cleared. Returns NNACL_ERR when input_dim1 is 0, NNACL_OK otherwise.
  */
 int UnsortedSegmentSum_int(const int *input, int unit_num, int input_dim1, const int *indices, int *output,
                           int output_dim0, int output_dim1);
 /* float variant of UnsortedSegmentSum_int; same contract. */
 int UnsortedSegmentSum_float(const float *input, int unit_num, int input_dim1, const int *indices, float *output,
                             int output_dim0, int output_dim1);
 #ifdef __cplusplus
 }
 #endif
 #endif  //  MINDSPORE_NNACL_UNSORTED_SEGMENT_SUM_BASE_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/unsorted_segment_sum.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/unsorted_segment_sum.c
@@ -1,36 +0,0 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "nnacl/fp32_grad/unsorted_segment_sum.h"
 #include "nnacl/errorcode.h"

 /*
  * Accumulates the unit_num elements of `input` into `output` by segment id.
  *
  * Element i lies in row (i / input_dim1) and column (i % input_dim1); it is added to
  * output[indices[row] * output_dim1 + column]. Rows whose id is outside [0, output_dim0) are
  * skipped. The output buffer is accumulated into (+=) and is not cleared here.
  *
  * Returns NNACL_ERR when input_dim1 is not positive or when a buffer pointer is null while there
  * is work to do; returns NNACL_OK otherwise, including when unit_num <= 0.
  */
 int UnsortedSegmentSum(const float *input, int unit_num, int input_dim1, const int *indices, float *output,
                       int output_dim0, int output_dim1) {
  /* zero would divide by zero; a negative value would yield negative row indices */
  if (input_dim1 <= 0) {
    return NNACL_ERR;
  }
  if (unit_num <= 0) {
    return NNACL_OK;
  }
  if (!input || !indices || !output) {
    return NNACL_ERR;
  }
  for (int i = 0; i < unit_num; ++i) {
    int index = indices[i / input_dim1];
    if (index < 0 || index >= output_dim0) {
      continue;
    }
    /* widen before multiplying so a large output does not overflow int */
    long long output_index = (long long)index * output_dim1 + (i % input_dim1);
    output[output_index] += input[i];
  }
  return NNACL_OK;
 }
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/unsorted_segment_sum.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/unsorted_segment_sum.h
@@ -1,29 +0,0 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_NNACL_FP32_GRAD_UNSORTED_SEGMENT_SUM_H_
 #define MINDSPORE_NNACL_FP32_GRAD_UNSORTED_SEGMENT_SUM_H_

 #ifdef __cplusplus
 extern "C" {
 #endif

 /*
  * Adds each of the unit_num float input elements to
  * output[indices[i / input_dim1] * output_dim1 + i % input_dim1], skipping segment ids outside
  * [0, output_dim0). Returns NNACL_ERR when input_dim1 is 0, NNACL_OK otherwise.
  */
 int UnsortedSegmentSum(const float *input, int unit_num, int input_dim1, const int *indices, float *output,
                       int output_dim0, int output_dim1);
 #ifdef __cplusplus
 }
 #endif
 #endif  //  MINDSPORE_NNACL_FP32_GRAD_UNSORTED_SEGMENT_SUM_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/tile_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/tile_cpu_kernel.cc
@@ -20,18 +20,69 @@

 namespace mindspore {
 namespace kernel {
 void TileCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  CheckParam(kernel_node);
 void TileCPUKernel::TileMultipleCompute(void) {
  int large_one_multiple_count_ = 0;
  int multiple = 0;
  int mul_index = 0;
  for (size_t i = 0; i < multiples_.size(); i++) {
    tile_parameter_.multiples_[i] = multiples_[i];
    if (tile_parameter_.multiples_[i] > 1) {
      large_one_multiple_count_++;
      multiple = tile_parameter_.multiples_[i];
      mul_index = i;
    }
  }

  one_dim_tile_ = large_one_multiple_count_ == 1;
  if (one_dim_tile_) {
    tile_parameter_.fast_multiple_ = static_cast<size_t>(multiple);
    tile_parameter_.fast_stride_ = static_cast<size_t>(x_shape_[mul_index] * tile_parameter_.in_strides_[mul_index]);
    tile_parameter_.fast_outer_size_ = static_cast<size_t>(input_size_ / tile_parameter_.fast_stride_);
  }
 }

 // Reads the input/output shapes, the "multiples" attribute and the input dtype from kernel_node,
 // left-pads x_shape_ with 1s up to the length of multiples_, and fills the shape/stride arrays of
 // tile_parameter_. Finishes by calling TileMultipleCompute().
 // NOTE(review): the in_shape_/out_shape_/in_strides_/out_strides_ arrays are fixed-size and
 // y_shape_ is indexed up to x_shape_.size(); neither is bounds-checked here — confirm the rank
 // is validated elsewhere.
 void TileCPUKernel::TileTensorParamrInit(const CNodePtr &kernel_node) {
  x_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
  y_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0);
  std::vector<int64_t> multiples_me = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, "multiples");
  // Start from an empty vector so a repeated initialisation does not append a second copy.
  multiples_.clear();
  (void)std::transform(multiples_me.begin(), multiples_me.end(), std::back_inserter(multiples_),
                       [](const int64_t &value) { return static_cast<int>(value); });
  dtype_ = AnfAlgo::GetPrevNodeOutputDeviceDataType(kernel_node, 0);
  if (dtype_ == kTypeUnknown) {
    dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0);
  }

  // Pad only when multiples_ is longer than the shape: the sizes are unsigned, so subtracting
  // first would wrap to a huge count when multiples_ is shorter.
  if (multiples_.size() > x_shape_.size()) {
    const size_t ones = multiples_.size() - x_shape_.size();
    x_shape_.insert(x_shape_.begin(), ones, 1);
  }

  input_size_ = 1;
  tile_parameter_.in_dim_ = x_shape_.size();
  for (int i = 0; i < tile_parameter_.in_dim_; i++) {
    input_size_ *= x_shape_[i];
    tile_parameter_.in_shape_[i] = x_shape_[i];
    tile_parameter_.out_shape_[i] = y_shape_[i];
  }

  // Strides are accumulated from the innermost axis outwards.
  int stridex = 1;
  int stridey = 1;
  for (int i = tile_parameter_.in_dim_ - 1; i >= 0; i--) {
    tile_parameter_.in_strides_[i] = stridex;
    tile_parameter_.out_strides_[i] = stridey;
    stridex *= x_shape_[i];
    stridey *= y_shape_[i];
  }

  TileMultipleCompute();
 }

 void TileCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  CheckParam(kernel_node);
  TileTensorParamrInit(kernel_node);

  launch_map_[kNumberTypeInt8] = &TileCPUKernel::LaunchKernel<int8_t>;
  launch_map_[kNumberTypeInt16] = &TileCPUKernel::LaunchKernel<int16_t>;
  launch_map_[kNumberTypeInt32] = &TileCPUKernel::LaunchKernel<int>;
@@ -57,54 +108,18 @@ bool TileCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const
  return true;
 }

 // Recursive tiling helper. For axis `dim` it walks every index of x_shape[dim]; on the last axis
 // it copies one element of x (located via pos and the cargo_x strides) to y at *offset, otherwise
 // it recurses into the next axis. After the walk, the cargo_y[dim] elements that end at *offset
 // are appended again multiples[dim] - 1 times. *offset is advanced past everything written.
 template <typename T>
 void TileRecTask(const T *x, T *y, size_t dim, size_t *offset, std::vector<size_t> *pos,
                 const std::vector<int> &multiples, const std::vector<size_t> &cargo_x,
                 const std::vector<size_t> &cargo_y, const std::vector<size_t> &x_shape) {
  const size_t rank = x_shape.size();
  if (dim == rank) {
    return;
  }
  const bool is_last_axis = (dim + 1 == rank);
  for (size_t idx = 0; idx < x_shape[dim]; ++idx) {
    (*pos)[dim] = idx;
    if (!is_last_axis) {
      TileRecTask(x, y, dim + 1, offset, pos, multiples, cargo_x, cargo_y, x_shape);
      continue;
    }
    // Flatten the current position into a linear offset into x.
    size_t src = 0;
    for (size_t axis = 0; axis < pos->size(); ++axis) {
      src += (*pos)[axis] * cargo_x[axis];
    }
    memcpy_s(y + *offset, sizeof(T), x + src, sizeof(T));
    *offset += 1;
  }
  const size_t chunk_elems = cargo_y[dim];
  const size_t chunk_bytes = chunk_elems * sizeof(T);
  for (int rep = 1; rep < multiples[dim]; ++rep) {
    // The source is the chunk that ends exactly where the next copy begins.
    memcpy_s(y + *offset, chunk_bytes, y + (*offset - chunk_elems), chunk_bytes);
    *offset += chunk_elems;
  }
 }

 // Typed launch body: views inputs[0] and outputs[0] as buffers of T and tiles x into y.
 // NOTE(review): this span appears to interleave two revisions (the recursive TileRecTask path and
 // the nnacl TileSimple/Tile path) and its braces do not balance — confirm which statements are
 // live before relying on it.
 template <typename T>
 void TileCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
  auto x_addr = reinterpret_cast<T *>(inputs[0]->addr);
  auto y_addr = reinterpret_cast<T *>(outputs[0]->addr);
  // Left-pad x_shape_ with 1s up to the length of multiples_ (unsigned subtraction).
  size_t ones = multiples_.size() - x_shape_.size();
  if (ones > 0) {
    for (size_t i = 0; i < ones; ++i) {
      x_shape_.insert(x_shape_.begin(), 1);
    }
  }
  int d = multiples_.size();
  std::vector<size_t> pos(d, 0);
  std::vector<size_t> cargo_x(d, 1);
  std::vector<size_t> cargo_y = x_shape_;
  // cargo_x: element strides of x; cargo_y: element count of one tiled chunk per axis.
  for (int i = d - 2; i >= 0; --i) {
    cargo_x[i] = x_shape_[i + 1] * cargo_x[i + 1];
    cargo_y[i] *= cargo_y[i + 1] * multiples_[i + 1];
  tile_parameter_.data_size_ = sizeof(T);

  // Single-axis case: split fast_outer_size_ across ParallelFor, each task calling TileSimple.
  // NOTE(review): there is no return after this branch, so the calls below still run when
  // one_dim_tile_ is set — confirm that is intended.
  if (one_dim_tile_) {
    auto task = [&](size_t start, size_t end) { TileSimple(x_addr, y_addr, start, end, &tile_parameter_); };
    CPUKernelUtils::ParallelFor(task, tile_parameter_.fast_outer_size_);
  }
  size_t offset = 0;
  TileRecTask<T>(x_addr, y_addr, 0, &offset, &pos, multiples_, cargo_x, cargo_y, x_shape_);

  Tile(x_addr, y_addr, &tile_parameter_);
 }

 void TileCPUKernel::CheckParam(const CNodePtr &kernel_node) {
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/tile_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/tile_cpu_kernel.h
@@ -21,6 +21,7 @@
 #include <vector>
 #include "backend/kernel_compiler/cpu/cpu_kernel.h"
 #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
 #include "nnacl/base/tile_base.h"

 namespace mindspore {
 namespace kernel {
@@ -37,6 +38,10 @@ class TileCPUKernel : public CPUKernel {
  template <typename T>
  void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);

  // Reads shapes, the "multiples" attribute and the dtype from kernel_node and fills
  // tile_parameter_; calls TileMultipleCompute() at the end.
  void TileTensorParamrInit(const CNodePtr &kernel_node);

  // Copies multiples_ into tile_parameter_ and sets one_dim_tile_ plus the fast_* fields when
  // exactly one multiple is greater than 1.
  void TileMultipleCompute(void);

 private:
  void CheckParam(const CNodePtr &kernel_node);
  std::vector<size_t> x_shape_;
@@ -47,6 +52,9 @@ class TileCPUKernel : public CPUKernel {
    std::function<void(TileCPUKernel *, const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs)>;
  std::unordered_map<TypeId, TypeKernel> launch_map_;
  TypeKernel launch_func_;
  // nnacl parameter block handed to Tile / TileSimple.
  TileParameter tile_parameter_;
  // True when exactly one entry of multiples_ is greater than 1.
  bool one_dim_tile_;
  // Product of the extents of x_shape_ after it has been padded to the length of multiples_.
  size_t input_size_;
 };

 // Registers TileCPUKernel for the Tile op with int8 input and int8 output.
 MS_REG_CPU_KERNEL(Tile, KernelAttr().AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8), TileCPUKernel);
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/unsorted_segment_sum_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/unsorted_segment_sum_cpu_kernel.cc
@@ -52,44 +52,38 @@ bool UnsortedSegmentSumCPUKernel::Launch(const std::vector<kernel::AddressPtr> &
                                         const std::vector<kernel::AddressPtr> &,
                                         const std::vector<kernel::AddressPtr> &outputs) {
  // Zeroes the output buffer, then dispatches on (dtype_, segment_ids_dtype_) to the matching
  // accumulation routine.
  // NOTE(review): each branch below shows both a LaunchKernel<...> call and an nnacl
  // UnsortedSegmentSum(...) call; this looks like two revisions interleaved — confirm which is live.
  bool ret{true};
  void *input_addr = inputs[0]->addr;
  // The segment-id buffer is always viewed as 32-bit ints here.
  const int *indices_addr = reinterpret_cast<const int *>(inputs[1]->addr);
  void *output_addr = outputs[0]->addr;
  auto ret1 = memset_s(output_addr, outputs[0]->size, 0, outputs[0]->size);
  if (ret1 != EOK) {
    MS_LOG(ERROR) << "Output buff memset fail. ret:" << ret1;
    return false;
  }

  if (dtype_ == kNumberTypeInt32 && segment_ids_dtype_ == kNumberTypeInt32) {
    ret = LaunchKernel<int, int>(inputs, outputs);
    ret1 = UnsortedSegmentSum(int, static_cast<const int *>(input_addr), unit_num_, input_dim1_, indices_addr,
                              static_cast<int *>(output_addr), output_dim0_, output_dim1_);
  } else if (dtype_ == kNumberTypeFloat32 && segment_ids_dtype_ == kNumberTypeInt32) {
    ret = LaunchKernel<float, int>(inputs, outputs);
    ret1 = UnsortedSegmentSum(float, static_cast<const float *>(input_addr), unit_num_, input_dim1_, indices_addr,
                              static_cast<float *>(output_addr), output_dim0_, output_dim1_);
  } else if (dtype_ == kNumberTypeInt32 && segment_ids_dtype_ == kNumberTypeInt64) {
    ret = LaunchKernel<int, int64_t>(inputs, outputs);
    // NOTE(review): segment ids are int64 in this branch, yet indices_addr is a const int * view of
    // the same buffer, so the nnacl call would read 32-bit halves as ids — confirm / convert first.
    ret1 = UnsortedSegmentSum(int, static_cast<const int *>(input_addr), unit_num_, input_dim1_, indices_addr,
                              static_cast<int *>(output_addr), output_dim0_, output_dim1_);
  } else if (dtype_ == kNumberTypeFloat32 && segment_ids_dtype_ == kNumberTypeInt64) {
    ret = LaunchKernel<float, int64_t>(inputs, outputs);
    // NOTE(review): same int64-ids-through-const-int* concern as the branch above.
    ret1 = UnsortedSegmentSum(float, static_cast<const float *>(input_addr), unit_num_, input_dim1_, indices_addr,
                              static_cast<float *>(output_addr), output_dim0_, output_dim1_);
  } else {
    MS_LOG(ERROR) << "Only support input_x int32 and float32, indices int32 and int64";
    return false;
  }
  return ret;
 }

 // Typed accumulation: S is the element type of input/output, T the segment-id type. Zeroes the
 // output, then adds input[i] to output[indices[i / input_dim1_] * output_dim1_ + i % input_dim1_],
 // skipping ids outside [0, output_dim0_).
 // NOTE(review): this span appears to interleave this template with the tail of another function
 // (the `ret1` check and the second return) and its braces do not balance — confirm which
 // statements are live.
 template <typename S, typename T>
 bool UnsortedSegmentSumCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
                                               const std::vector<kernel::AddressPtr> &outputs) {
  S *input_addr = reinterpret_cast<S *>(inputs[0]->addr);
  T *indices_addr = reinterpret_cast<T *>(inputs[1]->addr);
  S *output_addr = reinterpret_cast<S *>(outputs[0]->addr);
  auto ret = memset_s(output_addr, outputs[0]->size, 0, outputs[0]->size);
  if (ret != EOK) {
    MS_LOG(ERROR) << "Output buff memset fail. ret:" << ret;
  // NOTE(review): ret1 is compared against EOK here; if it holds the nnacl UnsortedSegmentSum
  // result, NNACL_OK is presumably the intended constant — confirm.
  if (ret1 != EOK) {
    MS_LOG(ERROR) << "unsortedSegmentSum failed. ret:" << ret1;
    return false;
  }
  for (size_t i = 0; i < unit_num_; ++i) {
    // j: input row, k: column within that row.
    size_t j = i / input_dim1_;
    size_t k = i % input_dim1_;

    T index = indices_addr[j];
    if (index < 0 || index >= SizeToInt(output_dim0_)) {
      continue;
    }
    size_t output_index = index * output_dim1_ + k;
    output_addr[output_index] += input_addr[i];
  }
  return true;
  return ret;
 }
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/unsorted_segment_sum_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/unsorted_segment_sum_cpu_kernel.h
@@ -21,6 +21,7 @@
 #include <unordered_map>
 #include "backend/kernel_compiler/cpu/cpu_kernel.h"
 #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
 #include "nnacl/base/unsorted_segment_sum_base.h"

 namespace mindspore {
 namespace kernel {
--- a/mindspore/lite/src/runtime/kernel/arm/fp32_grad/unsorted_segment_sum.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32_grad/unsorted_segment_sum.cc
@@ -19,7 +19,7 @@
 #include <algorithm>
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "nnacl/fp32_grad/unsorted_segment_sum.h"
 #include "nnacl/base/unsorted_segment_sum_base.h"
 #include "include/errorcode.h"
 #include "src/runtime/runtime_api.h"

@@ -86,7 +86,7 @@ int UnsortedSegmentSumCPUKernel::Execute(int task_id) {
  int *indices = reinterpret_cast<int *>(indices_tensor->data_c());
  float *output = reinterpret_cast<float *>(output_tensor->MutableData());
  std::fill(output, output + output_tensor->ElementsNum(), 0.f);
  ret = UnsortedSegmentSum(input, unit_num_, input_dim1_, indices, output, output_dim0_, output_dim1_);
  ret = UnsortedSegmentSum(float, input, unit_num_, input_dim1_, indices, output, output_dim0_, output_dim1_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "StridedSliceGrad error error_code[" << ret << "]";
    return RET_ERROR;