Browse Source

!15898 Convert the implementation of the Tile and UnsortedSegmentSum CPU ops to nnacl

From: @zhangzhewei01
Reviewed-by: @wuxuejian
Signed-off-by:
tags/v1.3.0
mindspore-ci-bot Gitee 4 years ago
parent
commit
5e8486adea
10 changed files with 170 additions and 145 deletions
  1. +6
    -6
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/tile_base.h
  2. +40
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/unsorted_segment_sum_base.c
  3. +32
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/unsorted_segment_sum_base.h
  4. +0
    -36
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/unsorted_segment_sum.c
  5. +0
    -29
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/unsorted_segment_sum.h
  6. +61
    -46
      mindspore/ccsrc/backend/kernel_compiler/cpu/tile_cpu_kernel.cc
  7. +8
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/tile_cpu_kernel.h
  8. +20
    -26
      mindspore/ccsrc/backend/kernel_compiler/cpu/unsorted_segment_sum_cpu_kernel.cc
  9. +1
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/unsorted_segment_sum_cpu_kernel.h
  10. +2
    -2
      mindspore/lite/src/runtime/kernel/arm/fp32_grad/unsorted_segment_sum.cc

+ 6
- 6
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/tile_base.h View File

@@ -22,16 +22,16 @@
typedef struct TileParameter {
// primitive parameter
OpParameter op_parameter_;
int multiples_[5];
int dims_[5];
int multiples_[7];
int dims_[7];
size_t dims_size_;
size_t multiples_size_;

// shape correlative
int in_shape_[5];
int out_shape_[5];
int in_strides_[5];
int out_strides_[5];
int in_shape_[7];
int out_shape_[7];
int in_strides_[7];
int out_strides_[7];

// other parameter
int in_dim_;


+ 40
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/unsorted_segment_sum_base.c View File

@@ -0,0 +1,40 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nnacl/base/unsorted_segment_sum_base.h"
#include "nnacl/errorcode.h"

/*
 * Generates the definition of UnsortedSegmentSum_<type>().
 *
 * The generated function walks the first `unit_num` elements of `input`. Element i belongs to row i / input_dim1 and
 * column i % input_dim1; its value is added to output[indices[row] * output_dim1 + column]. Rows whose segment id is
 * outside [0, output_dim0) are ignored. The output is only accumulated into, never cleared here.
 *
 * Returns NNACL_ERR when input_dim1 is 0 (it is used as a divisor), NNACL_OK otherwise.
 */
#define UNSORTEDSEGMENTSUM(type) \
  int UnsortedSegmentSum_##type(const type *input, int unit_num, int input_dim1, const int *indices, type *output, \
                                int output_dim0, int output_dim1) { \
    if (input_dim1 == 0) { \
      return NNACL_ERR; \
    } \
    for (int pos = 0; pos < unit_num; ++pos) { \
      const int row = pos / input_dim1; \
      const int col = pos % input_dim1; \
      const int segment_id = indices[row]; \
      if (segment_id >= 0 && segment_id < output_dim0) { \
        output[segment_id * output_dim1 + col] += input[pos]; \
      } \
    } \
    return NNACL_OK; \
  }

UNSORTEDSEGMENTSUM(int)
UNSORTEDSEGMENTSUM(float)

+ 32
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/unsorted_segment_sum_base.h View File

@@ -0,0 +1,32 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_NNACL_UNSORTED_SEGMENT_SUM_BASE_H_
#define MINDSPORE_NNACL_UNSORTED_SEGMENT_SUM_BASE_H_

#ifdef __cplusplus
extern "C" {
#endif
// Type-dispatching front end: pastes `type` onto the function name, so UnsortedSegmentSum(float, ...) expands to
// UnsortedSegmentSum_float(...). `type` must be one of the element types declared below (int, float).
#define UnsortedSegmentSum(type, input, unit_num, input_dim1, indices, output, output_dim0, output_dim1) \
UnsortedSegmentSum_##type(input, unit_num, input_dim1, indices, output, output_dim0, output_dim1)
// int variant. The definition is generated by the UNSORTEDSEGMENTSUM macro in unsorted_segment_sum_base.c: each of
// the first unit_num input elements is added to output[indices[i / input_dim1] * output_dim1 + i % input_dim1];
// segment ids outside [0, output_dim0) are skipped. Returns NNACL_ERR if input_dim1 is 0, otherwise NNACL_OK.
int UnsortedSegmentSum_int(const int *input, int unit_num, int input_dim1, const int *indices, int *output,
int output_dim0, int output_dim1);
// float variant; same contract as UnsortedSegmentSum_int with float input/output elements.
int UnsortedSegmentSum_float(const float *input, int unit_num, int input_dim1, const int *indices, float *output,
int output_dim0, int output_dim1);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_NNACL_UNSORTED_SEGMENT_SUM_BASE_H_

+ 0
- 36
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/unsorted_segment_sum.c View File

@@ -1,36 +0,0 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nnacl/fp32_grad/unsorted_segment_sum.h"
#include "nnacl/errorcode.h"

/*
 * Scatter-adds the first `unit_num` floats of `input` into `output`.
 *
 * Element `pos` lives in row pos / input_dim1 and column pos % input_dim1. The row's segment id is read from
 * `indices`; when it lies in [0, output_dim0) the element is added to output[segment * output_dim1 + column],
 * otherwise it is dropped. `output` is accumulated into and is not zeroed by this function.
 *
 * Returns NNACL_ERR if input_dim1 is 0 (it is used as a divisor), NNACL_OK otherwise.
 */
int UnsortedSegmentSum(const float *input, int unit_num, int input_dim1, const int *indices, float *output,
                       int output_dim0, int output_dim1) {
  if (input_dim1 == 0) {
    return NNACL_ERR;
  }
  for (int pos = 0; pos < unit_num; ++pos) {
    const int row = pos / input_dim1;
    const int col = pos % input_dim1;
    const int segment = indices[row];
    if (segment >= 0 && segment < output_dim0) {
      output[segment * output_dim1 + col] += input[pos];
    }
  }
  return NNACL_OK;
}

+ 0
- 29
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/unsorted_segment_sum.h View File

@@ -1,29 +0,0 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_NNACL_FP32_GRAD_UNSORTED_SEGMENT_SUM_H_
#define MINDSPORE_NNACL_FP32_GRAD_UNSORTED_SEGMENT_SUM_H_

#ifdef __cplusplus
extern "C" {
#endif

// Adds each of the first unit_num floats of `input` to output[indices[i / input_dim1] * output_dim1 + i % input_dim1],
// skipping rows whose segment id is outside [0, output_dim0). `output` is accumulated into, not cleared.
// Returns NNACL_ERR when input_dim1 is 0, NNACL_OK otherwise.
int UnsortedSegmentSum(const float *input, int unit_num, int input_dim1, const int *indices, float *output,
int output_dim0, int output_dim1);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_NNACL_FP32_GRAD_UNSORTED_SEGMENT_SUM_H_

+ 61
- 46
mindspore/ccsrc/backend/kernel_compiler/cpu/tile_cpu_kernel.cc View File

@@ -20,18 +20,69 @@

namespace mindspore {
namespace kernel {
void TileCPUKernel::InitKernel(const CNodePtr &kernel_node) {
CheckParam(kernel_node);
// Copies multiples_ into tile_parameter_.multiples_ and decides whether the single-axis path can be used.
//
// one_dim_tile_ is set when exactly one multiple is greater than 1. In that case the fast_* fields of
// tile_parameter_ are filled from that axis:
//   fast_multiple_   - the multiple of that axis
//   fast_stride_     - x_shape_[axis] * in_strides_[axis]
//   fast_outer_size_ - input_size_ / fast_stride_ (0 when the stride is 0)
//
// Reads x_shape_, input_size_ and tile_parameter_.in_strides_, so those must already be initialised.
void TileCPUKernel::TileMultipleCompute(void) {
  int tiled_axis_count = 0;
  int multiple = 0;
  size_t mul_index = 0;
  for (size_t i = 0; i < multiples_.size(); i++) {
    tile_parameter_.multiples_[i] = multiples_[i];
    if (tile_parameter_.multiples_[i] > 1) {
      tiled_axis_count++;
      // Remember the last axis seen with a multiple > 1; it is only used when it is the sole such axis.
      multiple = tile_parameter_.multiples_[i];
      mul_index = i;
    }
  }

  one_dim_tile_ = tiled_axis_count == 1;
  if (one_dim_tile_) {
    tile_parameter_.fast_multiple_ = static_cast<size_t>(multiple);
    tile_parameter_.fast_stride_ = static_cast<size_t>(x_shape_[mul_index] * tile_parameter_.in_strides_[mul_index]);
    // A zero extent on the tiled axis (or any axis inside it) yields a zero stride; dividing by it is undefined
    // behaviour, so report zero outer iterations instead.
    if (tile_parameter_.fast_stride_ == 0) {
      tile_parameter_.fast_outer_size_ = 0;
    } else {
      tile_parameter_.fast_outer_size_ = static_cast<size_t>(input_size_ / tile_parameter_.fast_stride_);
    }
  }
}

// Reads shapes, multiples and dtype from kernel_node and fills tile_parameter_ with the shape/stride description.
//
// Steps:
//   1. Fetch the inferred input/output shapes and the "multiples" attribute (converted from int64_t to int and
//      appended to multiples_).
//   2. Resolve dtype_ from the device data type, falling back to the inferred data type when it is unknown.
//   3. Left-pad x_shape_ with 1s so that it has as many dimensions as there are multiples.
//   4. Fill in_dim_, in_shape_, out_shape_ and the row-major in_strides_/out_strides_, and compute input_size_
//      as the product of the (padded) input extents.
//   5. Call TileMultipleCompute() to copy the multiples and set up the single-axis fast path.
void TileCPUKernel::TileTensorParamrInit(const CNodePtr &kernel_node) {
  x_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
  y_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0);
  std::vector<int64_t> multiples_me = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, "multiples");
  (void)std::transform(multiples_me.begin(), multiples_me.end(), std::back_inserter(multiples_),
                       [](const int64_t &value) { return static_cast<int>(value); });
  dtype_ = AnfAlgo::GetPrevNodeOutputDeviceDataType(kernel_node, 0);
  if (dtype_ == kTypeUnknown) {
    dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0);
  }

  // Pad only when there are more multiples than input dimensions. The unguarded subtraction would wrap around in
  // size_t when the input has the higher rank and turn the padding into an effectively unbounded insert.
  if (multiples_.size() > x_shape_.size()) {
    x_shape_.insert(x_shape_.begin(), multiples_.size() - x_shape_.size(), 1);
  }

  input_size_ = 1;
  tile_parameter_.in_dim_ = x_shape_.size();
  for (int i = 0; i < tile_parameter_.in_dim_; i++) {
    input_size_ *= x_shape_[i];
    tile_parameter_.in_shape_[i] = x_shape_[i];
    tile_parameter_.out_shape_[i] = y_shape_[i];
  }

  // Row-major strides: the innermost axis has stride 1 and each outer axis multiplies in the extent below it.
  int stridex = 1;
  int stridey = 1;
  for (int i = tile_parameter_.in_dim_ - 1; i >= 0; i--) {
    tile_parameter_.in_strides_[i] = stridex;
    tile_parameter_.out_strides_[i] = stridey;
    stridex *= x_shape_[i];
    stridey *= y_shape_[i];
  }

  TileMultipleCompute();
}

void TileCPUKernel::InitKernel(const CNodePtr &kernel_node) {
CheckParam(kernel_node);
TileTensorParamrInit(kernel_node);

launch_map_[kNumberTypeInt8] = &TileCPUKernel::LaunchKernel<int8_t>;
launch_map_[kNumberTypeInt16] = &TileCPUKernel::LaunchKernel<int16_t>;
launch_map_[kNumberTypeInt32] = &TileCPUKernel::LaunchKernel<int>;
@@ -57,54 +108,18 @@ bool TileCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const
return true;
}

// Recursively tiles `x` into `y`, one dimension per recursion level.
//
// For dimension `dim` the function first emits one un-tiled copy of that dimension's data: on the innermost
// dimension each element is copied individually from the source offset computed from `pos` and `cargo_x`; on outer
// dimensions the work is delegated to the next level. It then appends multiples[dim] - 1 further copies of the
// cargo_y[dim] elements that were just written. `*offset` is the running write position in `y`, counted in elements.
template <typename T>
void TileRecTask(const T *x, T *y, size_t dim, size_t *offset, std::vector<size_t> *pos,
                 const std::vector<int> &multiples, const std::vector<size_t> &cargo_x,
                 const std::vector<size_t> &cargo_y, const std::vector<size_t> &x_shape) {
  const size_t rank = x_shape.size();
  if (dim == rank) {
    return;
  }

  const bool innermost = (dim == rank - 1);
  for (size_t coord = 0; coord < x_shape[dim]; ++coord) {
    (*pos)[dim] = coord;
    if (!innermost) {
      TileRecTask(x, y, dim + 1, offset, pos, multiples, cargo_x, cargo_y, x_shape);
      continue;
    }
    // Innermost axis: turn the current multi-dimensional position into a flat source offset.
    size_t src_offset = 0;
    for (size_t axis = 0; axis < pos->size(); ++axis) {
      src_offset += (*pos)[axis] * cargo_x[axis];
    }
    memcpy_s(y + *offset, sizeof(T), x + src_offset, sizeof(T));
    *offset += 1;
  }

  // Replicate the block that was just produced for this dimension.
  const size_t block_elems = cargo_y[dim];
  const size_t block_bytes = block_elems * sizeof(T);
  const int extra_copies = multiples[dim] - 1;
  for (int rep = 0; rep < extra_copies; ++rep) {
    const size_t prev_block = *offset - block_elems;
    memcpy_s(y + *offset, block_bytes, y + prev_block, block_bytes);
    *offset += block_elems;
  }
}

template <typename T>
void TileCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
auto x_addr = reinterpret_cast<T *>(inputs[0]->addr);
auto y_addr = reinterpret_cast<T *>(outputs[0]->addr);
size_t ones = multiples_.size() - x_shape_.size();
if (ones > 0) {
for (size_t i = 0; i < ones; ++i) {
x_shape_.insert(x_shape_.begin(), 1);
}
}
int d = multiples_.size();
std::vector<size_t> pos(d, 0);
std::vector<size_t> cargo_x(d, 1);
std::vector<size_t> cargo_y = x_shape_;
for (int i = d - 2; i >= 0; --i) {
cargo_x[i] = x_shape_[i + 1] * cargo_x[i + 1];
cargo_y[i] *= cargo_y[i + 1] * multiples_[i + 1];
tile_parameter_.data_size_ = sizeof(T);

if (one_dim_tile_) {
auto task = [&](size_t start, size_t end) { TileSimple(x_addr, y_addr, start, end, &tile_parameter_); };
CPUKernelUtils::ParallelFor(task, tile_parameter_.fast_outer_size_);
}
size_t offset = 0;
TileRecTask<T>(x_addr, y_addr, 0, &offset, &pos, multiples_, cargo_x, cargo_y, x_shape_);

Tile(x_addr, y_addr, &tile_parameter_);
}

void TileCPUKernel::CheckParam(const CNodePtr &kernel_node) {


+ 8
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/tile_cpu_kernel.h View File

@@ -21,6 +21,7 @@
#include <vector>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
#include "nnacl/base/tile_base.h"

namespace mindspore {
namespace kernel {
@@ -37,6 +38,10 @@ class TileCPUKernel : public CPUKernel {
template <typename T>
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);

void TileTensorParamrInit(const CNodePtr &kernel_node);

void TileMultipleCompute(void);

private:
void CheckParam(const CNodePtr &kernel_node);
std::vector<size_t> x_shape_;
@@ -47,6 +52,9 @@ class TileCPUKernel : public CPUKernel {
std::function<void(TileCPUKernel *, const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs)>;
std::unordered_map<TypeId, TypeKernel> launch_map_;
TypeKernel launch_func_;
TileParameter tile_parameter_;
bool one_dim_tile_;
size_t input_size_;
};

MS_REG_CPU_KERNEL(Tile, KernelAttr().AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8), TileCPUKernel);


+ 20
- 26
mindspore/ccsrc/backend/kernel_compiler/cpu/unsorted_segment_sum_cpu_kernel.cc View File

@@ -52,44 +52,38 @@ bool UnsortedSegmentSumCPUKernel::Launch(const std::vector<kernel::AddressPtr> &
const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
bool ret{true};
void *input_addr = inputs[0]->addr;
const int *indices_addr = reinterpret_cast<const int *>(inputs[1]->addr);
void *output_addr = outputs[0]->addr;
auto ret1 = memset_s(output_addr, outputs[0]->size, 0, outputs[0]->size);
if (ret1 != EOK) {
MS_LOG(ERROR) << "Output buff memset fail. ret:" << ret1;
return false;
}

if (dtype_ == kNumberTypeInt32 && segment_ids_dtype_ == kNumberTypeInt32) {
ret = LaunchKernel<int, int>(inputs, outputs);
ret1 = UnsortedSegmentSum(int, static_cast<const int *>(input_addr), unit_num_, input_dim1_, indices_addr,
static_cast<int *>(output_addr), output_dim0_, output_dim1_);
} else if (dtype_ == kNumberTypeFloat32 && segment_ids_dtype_ == kNumberTypeInt32) {
ret = LaunchKernel<float, int>(inputs, outputs);
ret1 = UnsortedSegmentSum(float, static_cast<const float *>(input_addr), unit_num_, input_dim1_, indices_addr,
static_cast<float *>(output_addr), output_dim0_, output_dim1_);
} else if (dtype_ == kNumberTypeInt32 && segment_ids_dtype_ == kNumberTypeInt64) {
ret = LaunchKernel<int, int64_t>(inputs, outputs);
ret1 = UnsortedSegmentSum(int, static_cast<const int *>(input_addr), unit_num_, input_dim1_, indices_addr,
static_cast<int *>(output_addr), output_dim0_, output_dim1_);
} else if (dtype_ == kNumberTypeFloat32 && segment_ids_dtype_ == kNumberTypeInt64) {
ret = LaunchKernel<float, int64_t>(inputs, outputs);
ret1 = UnsortedSegmentSum(float, static_cast<const float *>(input_addr), unit_num_, input_dim1_, indices_addr,
static_cast<float *>(output_addr), output_dim0_, output_dim1_);
} else {
MS_LOG(ERROR) << "Only support input_x int32 and float32, indices int32 and int64";
return false;
}
return ret;
}

template <typename S, typename T>
bool UnsortedSegmentSumCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &outputs) {
S *input_addr = reinterpret_cast<S *>(inputs[0]->addr);
T *indices_addr = reinterpret_cast<T *>(inputs[1]->addr);
S *output_addr = reinterpret_cast<S *>(outputs[0]->addr);
auto ret = memset_s(output_addr, outputs[0]->size, 0, outputs[0]->size);
if (ret != EOK) {
MS_LOG(ERROR) << "Output buff memset fail. ret:" << ret;
if (ret1 != EOK) {
MS_LOG(ERROR) << "unsortedSegmentSum failed. ret:" << ret1;
return false;
}
for (size_t i = 0; i < unit_num_; ++i) {
size_t j = i / input_dim1_;
size_t k = i % input_dim1_;

T index = indices_addr[j];
if (index < 0 || index >= SizeToInt(output_dim0_)) {
continue;
}
size_t output_index = index * output_dim1_ + k;
output_addr[output_index] += input_addr[i];
}
return true;
return ret;
}
} // namespace kernel
} // namespace mindspore

+ 1
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/unsorted_segment_sum_cpu_kernel.h View File

@@ -21,6 +21,7 @@
#include <unordered_map>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
#include "nnacl/base/unsorted_segment_sum_base.h"

namespace mindspore {
namespace kernel {


+ 2
- 2
mindspore/lite/src/runtime/kernel/arm/fp32_grad/unsorted_segment_sum.cc View File

@@ -19,7 +19,7 @@
#include <algorithm>
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "nnacl/fp32_grad/unsorted_segment_sum.h"
#include "nnacl/base/unsorted_segment_sum_base.h"
#include "include/errorcode.h"
#include "src/runtime/runtime_api.h"

@@ -86,7 +86,7 @@ int UnsortedSegmentSumCPUKernel::Execute(int task_id) {
int *indices = reinterpret_cast<int *>(indices_tensor->data_c());
float *output = reinterpret_cast<float *>(output_tensor->MutableData());
std::fill(output, output + output_tensor->ElementsNum(), 0.f);
ret = UnsortedSegmentSum(input, unit_num_, input_dim1_, indices, output, output_dim0_, output_dim1_);
ret = UnsortedSegmentSum(float, input, unit_num_, input_dim1_, indices, output, output_dim0_, output_dim1_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "StridedSliceGrad error error_code[" << ret << "]";
return RET_ERROR;


Loading…
Cancel
Save