Browse Source

Convert the implementations of the DropoutGrad, Unstack, and Transpose CPU operators to nnacl

tags/v1.3.0
范吉斌 5 years ago
parent
commit
3a23520bba
7 changed files with 516 additions and 41 deletions
  1. +24
    -6
      mindspore/ccsrc/backend/kernel_compiler/cpu/dropout_grad_kernel.cc
  2. +296
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/transpose_base.c
  3. +64
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/transpose_base.h
  4. +104
    -12
      mindspore/ccsrc/backend/kernel_compiler/cpu/transpose_cpu_kernel.cc
  5. +6
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/transpose_cpu_kernel.h
  6. +20
    -19
      mindspore/ccsrc/backend/kernel_compiler/cpu/unpack_cpu_kernel.cc
  7. +2
    -4
      mindspore/ccsrc/backend/kernel_compiler/cpu/unpack_cpu_kernel.h

+ 24
- 6
mindspore/ccsrc/backend/kernel_compiler/cpu/dropout_grad_kernel.cc View File

@@ -16,6 +16,7 @@
#include <vector>
#include "runtime/device/cpu/cpu_device_address.h"
#include "backend/kernel_compiler/cpu/dropout_grad_kernel.h"
#include "nnacl/fp32_grad/dropout_grad.h"

namespace mindspore {
namespace kernel {
@@ -46,6 +47,8 @@ bool DropoutGradCpuBwdKernel::Launch(const std::vector<AddressPtr> &inputs,
DropoutBackwardKernel<float16>(inputs, outputs, num_count_, keep_prob_);
} else if (dtype_ == kNumberTypeFloat32) {
DropoutBackwardKernel<float>(inputs, outputs, num_count_, keep_prob_);
} else {
MS_LOG(ERROR) << "Input data type: " << dtype_ << " is not supported for DropoutGrad kernel for CPU.";
}

return true;
@@ -55,13 +58,28 @@ template <typename T>
void DropoutGradCpuBwdKernel::DropoutBackwardKernel(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &outputs, size_t num_count,
float keep_prob) {
auto dx = reinterpret_cast<T *>(outputs[0]->addr);
auto dy = reinterpret_cast<T *>(inputs[0]->addr);
auto mask = reinterpret_cast<T *>(inputs[1]->addr);

auto *output = reinterpret_cast<T *>(outputs[0]->addr);
const auto *input = reinterpret_cast<T *>(inputs[0]->addr);
const auto *mask = reinterpret_cast<T *>(inputs[1]->addr);
const float scale = 1.f / keep_prob;
for (size_t i = 0; i < num_count; i += 1) {
dx[i] = (T)(scale * static_cast<float>(dy[i] * mask[i]));

if constexpr (std::is_same_v<T, float16>) {
float *input_tmp = new float[num_count_];
float *output_tmp = new float[num_count_];
float *mask_tmp = new float[num_count_];
for (size_t i = 0; i < num_count_; ++i) {
input_tmp[i] = static_cast<float>(input[i]);
mask_tmp[i] = static_cast<float>(mask[i]);
}
DropoutGrad(input_tmp, mask_tmp, output_tmp, num_count_, scale);
for (size_t i = 0; i < num_count_; ++i) {
output[i] = static_cast<float16>(output_tmp[i]);
}
delete[] input_tmp;
delete[] output_tmp;
delete[] mask_tmp;
} else if constexpr (std::is_same_v<T, float>) {
DropoutGrad(input, mask, output, num_count_, scale);
}
}
} // namespace kernel


+ 296
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/transpose_base.c View File

@@ -0,0 +1,296 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "nnacl/base/transpose_base.h"
#include "nnacl/errorcode.h"

/* Generates TransposeDim2<NAME>(in, out, strides, out_strides, perm, output_shape):
 * a 2-D transpose of TYPE elements. Input strides are indexed through perm so
 * each output coordinate maps back to its input offset. out_strides is unused
 * here (the 2-D output row stride is just output_shape[1]); it is kept in the
 * signature for uniformity with the higher-dimension generators. */
#define TRANSPOSE_TWO_DIMS(TYPE, NAME) \
void TransposeDim2##NAME(const TYPE *in_data, TYPE *out_data, const int *strides, const int *out_strides, \
const int *perm, const int *output_shape) { \
const int stride0 = strides[perm[0]]; \
const int stride1 = strides[perm[1]]; \
const int output0 = output_shape[0]; \
const int output1 = output_shape[1]; \
for (int i = 0; i < output0; ++i) { \
int out_stride0_i = i * output1; \
int stride0_i = i * 1 * stride0; \
for (int j = 0; j < output1; ++j) { \
out_data[out_stride0_i + j] = in_data[stride0_i + j * stride1]; \
} \
} \
}

/* Generates TransposeDim3<NAME>: 3-D transpose of TYPE elements. One loop per
 * output axis; the innermost output axis is contiguous (implicit stride 1),
 * while input offsets advance by strides[perm[axis]]. */
#define TRANSPOSE_THREE_DIMS(TYPE, NAME) \
void TransposeDim3##NAME(const TYPE *in_data, TYPE *out_data, const int *strides, const int *out_strides, \
const int *perm, const int *output_shape) { \
const int stride0 = strides[perm[0]]; \
const int stride1 = strides[perm[1]]; \
const int stride2 = strides[perm[2]]; \
const int out_stride0 = out_strides[0]; \
const int out_stride1 = out_strides[1]; \
const int output0 = output_shape[0]; \
const int output1 = output_shape[1]; \
const int output2 = output_shape[2]; \
for (int i = 0; i < output0; ++i) { \
int out_stride0_i = i * out_stride0; \
int stride0_i = i * stride0; \
for (int j = 0; j < output1; ++j) { \
int out_stride1_j = j * out_stride1; \
int stride1_j = j * stride1; \
for (int k = 0; k < output2; ++k) { \
out_data[out_stride0_i + out_stride1_j + k] = in_data[stride0_i + stride1_j + k * stride2]; \
} \
} \
} \
}

/* Generates TransposeDim4<NAME>: 4-D transpose of TYPE elements. Same scheme
 * as the 3-D variant with one more nesting level; partial offsets for the
 * outer axes are hoisted out of the inner loops. */
#define TRANSPOSE_FOUR_DIMS(TYPE, NAME) \
void TransposeDim4##NAME(const TYPE *in_data, TYPE *out_data, const int *strides, const int *out_strides, \
const int *perm, const int *output_shape) { \
const int stride0 = strides[perm[0]]; \
const int stride1 = strides[perm[1]]; \
const int stride2 = strides[perm[2]]; \
const int stride3 = strides[perm[3]]; \
const int out_stride0 = out_strides[0]; \
const int out_stride1 = out_strides[1]; \
const int out_stride2 = out_strides[2]; \
const int output0 = output_shape[0]; \
const int output1 = output_shape[1]; \
const int output2 = output_shape[2]; \
const int output3 = output_shape[3]; \
for (int i = 0; i < output0; ++i) { \
int out_stride0_i = i * out_stride0; \
int stride0_i = i * stride0; \
for (int j = 0; j < output1; ++j) { \
int out_stride1_j = j * out_stride1; \
int stride1_j = j * stride1; \
for (int k = 0; k < output2; ++k) { \
int out_stride2_k = k * out_stride2; \
int stride2_k = k * stride2; \
for (int m = 0; m < output3; ++m) { \
out_data[out_stride0_i + out_stride1_j + out_stride2_k + m] = \
in_data[stride0_i + stride1_j + stride2_k + m * stride3]; \
} \
} \
} \
} \
}

/* Generates TransposeDim5<NAME>: 5-D transpose of TYPE elements; mechanical
 * extension of the 4-D variant with a fifth loop level. */
#define TRANSPOSE_FIVE_DIMS(TYPE, NAME) \
void TransposeDim5##NAME(const TYPE *in_data, TYPE *out_data, const int *strides, const int *out_strides, \
const int *perm, const int *output_shape) { \
const int stride0 = strides[perm[0]]; \
const int stride1 = strides[perm[1]]; \
const int stride2 = strides[perm[2]]; \
const int stride3 = strides[perm[3]]; \
const int stride4 = strides[perm[4]]; \
const int out_stride0 = out_strides[0]; \
const int out_stride1 = out_strides[1]; \
const int out_stride2 = out_strides[2]; \
const int out_stride3 = out_strides[3]; \
const int output0 = output_shape[0]; \
const int output1 = output_shape[1]; \
const int output2 = output_shape[2]; \
const int output3 = output_shape[3]; \
const int output4 = output_shape[4]; \
for (int i = 0; i < output0; ++i) { \
int out_stride0_i = i * out_stride0; \
int stride0_i = i * stride0; \
for (int j = 0; j < output1; ++j) { \
int out_stride1_j = j * out_stride1; \
int stride1_j = j * stride1; \
for (int k = 0; k < output2; ++k) { \
int out_stride2_k = k * out_stride2; \
int stride2_k = k * stride2; \
for (int m = 0; m < output3; ++m) { \
int out_stride3_m = m * out_stride3; \
int stride3_m = m * stride3; \
for (int n = 0; n < output4; ++n) { \
out_data[out_stride0_i + out_stride1_j + out_stride2_k + out_stride3_m + n] = \
in_data[stride0_i + stride1_j + stride2_k + stride3_m + n * stride4]; \
} \
} \
} \
} \
} \
}

/* Generates TransposeDim6<NAME>: 6-D transpose of TYPE elements; the deepest
 * specialized variant (ranks above 6 go through the generic Transpose<NAME>). */
#define TRANSPOSE_SIX_DIMS(TYPE, NAME) \
void TransposeDim6##NAME(const TYPE *in_data, TYPE *out_data, const int *strides, const int *out_strides, \
const int *perm, const int *output_shape) { \
const int stride0 = strides[perm[0]]; \
const int stride1 = strides[perm[1]]; \
const int stride2 = strides[perm[2]]; \
const int stride3 = strides[perm[3]]; \
const int stride4 = strides[perm[4]]; \
const int stride5 = strides[perm[5]]; \
const int out_stride0 = out_strides[0]; \
const int out_stride1 = out_strides[1]; \
const int out_stride2 = out_strides[2]; \
const int out_stride3 = out_strides[3]; \
const int out_stride4 = out_strides[4]; \
const int output0 = output_shape[0]; \
const int output1 = output_shape[1]; \
const int output2 = output_shape[2]; \
const int output3 = output_shape[3]; \
const int output4 = output_shape[4]; \
const int output5 = output_shape[5]; \
for (int i = 0; i < output0; ++i) { \
int out_stride0_i = i * out_stride0; \
int stride0_i = i * stride0; \
for (int j = 0; j < output1; ++j) { \
int out_stride1_j = j * out_stride1; \
int stride1_j = j * stride1; \
for (int k = 0; k < output2; ++k) { \
int out_stride2_k = k * out_stride2; \
int stride2_k = k * stride2; \
for (int m = 0; m < output3; ++m) { \
int out_stride3_m = m * out_stride3; \
int stride3_m = m * stride3; \
for (int n = 0; n < output4; ++n) { \
int out_stride4_n = n * out_stride4; \
int stride4_n = n * stride4; \
for (int g = 0; g < output5; ++g) { \
out_data[out_stride0_i + out_stride1_j + out_stride2_k + out_stride3_m + out_stride4_n + g] = \
in_data[stride0_i + stride1_j + stride2_k + stride3_m + stride4_n + g * stride5]; \
} \
} \
} \
} \
} \
} \
}

/* Generates Transpose<NAME>: generic N-D transpose for dims > 6.
 * size/position are caller-provided scratch arrays of length `dims`:
 * size[i] is the number of output elements covered by one step along output
 * axis i (size[dims-1] == 1); position[i] receives the decoded coordinate.
 * Each flat output index is decoded into coordinates which are mapped back to
 * the input offset through strides[perm[i]].
 * Fix vs. original: the loop bound `(*size) * output_shape[0]` was a signed
 * int product compared against a size_t counter (potential overflow and
 * sign-mismatch), and `int pos = idx` truncated the size_t index. The total
 * and the decoding scratch are now computed in size_t. */
#define TRANSPOSE_MULTI_DIMS(TYPE, NAME) \
void Transpose##NAME(const TYPE *in_data, TYPE *out_data, const int *strides, const int *out_strides, \
const int *perm, const int *output_shape, int dims, int *size, int *position) { \
*(size + dims - 1) = 1; \
for (int i = dims - 1; i > 0; --i) { \
*(size + i - 1) = *(size + i) * output_shape[i]; \
} \
size_t data_size = (size_t)(*size) * (size_t)output_shape[0]; \
for (size_t idx = 0; idx < data_size; ++idx) { \
size_t pos = idx; \
int output_idx = 0; \
int input_idx = 0; \
for (int i = 0; i < dims; ++i) { \
*(position + i) = (int)(pos / (size_t)(*(size + i))); \
int out_stride = i < dims - 1 ? out_strides[i] : 1; \
output_idx += (*(position + i) * out_stride); \
input_idx += (*(position + i) * strides[perm[i]]); \
pos -= (size_t)(*(position + i)) * (size_t)(*(size + i)); \
} \
out_data[output_idx] = in_data[input_idx]; \
} \
}

/* Generates TransposeDims<NAME>: one thread's slice of a generic N-D
 * transpose, driven by TransposeParameter (perm_/strides_/out_strides_/
 * num_axes_). size/position are caller-provided scratch arrays of length
 * num_axes (same meaning as in Transpose<NAME>); thread task_id handles flat
 * output indices [task_offset, task_offset + count).
 * Fixes vs. original: `int count = data_size - task_offset;` wrapped to a
 * huge size_t before conversion to int when task_offset exceeded data_size,
 * so the `count <= 0` guard was unreliable; and thread_num == 0 would divide
 * by zero in UP_DIV. Both paths now bail out explicitly, in size_t math. */
#define TRANSPOSE_DIMS(TYPE, NAME) \
void TransposeDims##NAME(const TYPE *in_data, TYPE *out_data, const int *output_shape, int *size, int *position, \
TransposeParameter *transpose_param, int task_id, int thread_num) { \
int *perm = transpose_param->perm_; \
int *strides = transpose_param->strides_; \
int *out_strides = transpose_param->out_strides_; \
int num_axes = transpose_param->num_axes_; \
if (thread_num <= 0) { \
return; \
} \
size_t data_size = (size_t)(*size) * (size_t)output_shape[0]; \
size_t offset_size = UP_DIV(data_size, thread_num); \
size_t task_offset = offset_size * task_id; \
if (task_offset >= data_size) { \
return; \
} \
size_t count = MSMIN(offset_size, data_size - task_offset); \
for (size_t idx = task_offset; idx < task_offset + count; ++idx) { \
size_t pos = idx; \
int output_idx = 0; \
int input_idx = 0; \
for (int i = 0; i < num_axes; ++i) { \
*(position + i) = (int)(pos / (size_t)(*(size + i))); \
int out_stride = i < num_axes - 1 ? out_strides[i] : 1; \
output_idx += (*(position + i) * out_stride); \
input_idx += (*(position + i) * strides[perm[i]]); \
pos -= (size_t)(*(position + i)) * (size_t)(*(size + i)); \
} \
out_data[output_idx] = in_data[input_idx]; \
} \
}

/* Generates DoTranspose<NAME>: single-threaded transpose entry point.
 * Validates pointers and perm, short-circuits an identity permutation with a
 * flat memcpy (data_size_ is in bytes), dispatches ranks 2-6 to the
 * specialized kernels, and falls back to the generic Transpose<NAME> with
 * heap-allocated scratch for higher ranks.
 * Fix vs. original: the two malloc results were used without a NULL check,
 * dereferencing NULL on allocation failure; now both are checked and freed
 * together before returning NNACL_ERR. */
#define DOTRANSPOSE(TYPE, NAME) \
int DoTranspose##NAME(const TYPE *in_data, TYPE *out_data, const int *output_shape, \
TransposeParameter *transpose_param) { \
if (in_data == NULL || out_data == NULL) { \
return NNACL_ERR; \
} \
const int *perm = transpose_param->perm_; \
const int *strides = transpose_param->strides_; \
const int *out_strides = transpose_param->out_strides_; \
int data_size = transpose_param->data_size_; \
int num_axes = transpose_param->num_axes_; \
if (num_axes < 2) { \
return NNACL_ERR; \
} \
bool needTranspose = false; \
for (int i = 1; i < num_axes; ++i) { \
if (perm[i] - perm[i - 1] != 1) { \
needTranspose = true; \
break; \
} \
} \
if (!needTranspose) { \
(void)memcpy(out_data, in_data, data_size); \
return NNACL_OK; \
} \
for (int i = 0; i < num_axes; ++i) { \
if (perm[i] < 0) { \
return NNACL_PARAM_INVALID; \
} \
} \
if (num_axes == 2) { \
TransposeDim2##NAME(in_data, out_data, strides, out_strides, perm, output_shape); \
} else if (num_axes == 3) { \
TransposeDim3##NAME(in_data, out_data, strides, out_strides, perm, output_shape); \
} else if (num_axes == 4) { \
TransposeDim4##NAME(in_data, out_data, strides, out_strides, perm, output_shape); \
} else if (num_axes == 5) { \
TransposeDim5##NAME(in_data, out_data, strides, out_strides, perm, output_shape); \
} else if (num_axes == 6) { \
TransposeDim6##NAME(in_data, out_data, strides, out_strides, perm, output_shape); \
} else { \
int *size = (int *)(malloc(num_axes * sizeof(int))); \
int *position = (int *)(malloc(num_axes * sizeof(int))); \
if (size == NULL || position == NULL) { \
free(size); \
free(position); \
return NNACL_ERR; \
} \
Transpose##NAME(in_data, out_data, strides, out_strides, perm, output_shape, num_axes, size, position); \
free(size); \
free(position); \
} \
return NNACL_OK; \
}

/* Stamps out the full family of transpose kernels (Dim2..Dim6, generic,
 * threaded TransposeDims, and the DoTranspose dispatcher) for one TYPE/NAME. */
#define TRANSPOSE_TEMPLATE(TYPE, NAME) \
TRANSPOSE_TWO_DIMS(TYPE, NAME) \
TRANSPOSE_THREE_DIMS(TYPE, NAME) \
TRANSPOSE_FOUR_DIMS(TYPE, NAME) \
TRANSPOSE_FIVE_DIMS(TYPE, NAME) \
TRANSPOSE_SIX_DIMS(TYPE, NAME) \
TRANSPOSE_MULTI_DIMS(TYPE, NAME) \
TRANSPOSE_DIMS(TYPE, NAME) \
DOTRANSPOSE(TYPE, NAME)

/* Instantiations for every element type this translation unit serves.
 * NOTE(review): int8_t and float variants appear to live elsewhere
 * (nnacl int8/fp32 transpose units, per the includes in the CPU kernel) --
 * confirm before adding them here. */
TRANSPOSE_TEMPLATE(uint8_t, UInt8)
TRANSPOSE_TEMPLATE(uint16_t, UInt16)
TRANSPOSE_TEMPLATE(uint32_t, UInt32)
TRANSPOSE_TEMPLATE(uint64_t, UInt64)
TRANSPOSE_TEMPLATE(int16_t, Int16)
TRANSPOSE_TEMPLATE(int32_t, Int32)
TRANSPOSE_TEMPLATE(int64_t, Int64)
TRANSPOSE_TEMPLATE(bool, Bool)

+ 64
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/transpose_base.h View File

@@ -0,0 +1,64 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_NNACL_TRANSPOSE_BASE_H_
#define MINDSPORE_NNACL_TRANSPOSE_BASE_H_

#include "nnacl/transpose.h"
#include <string.h>

#ifdef __cplusplus
extern "C" {
#endif

/* Single-threaded N-D transpose entry points, one per element type.
 * output_shape holds num_axes_ ints; returns an nnacl error code
 * (NNACL_OK on success). */
int DoTransposeUInt8(const uint8_t *in_data, uint8_t *out_data, const int *output_shape,
TransposeParameter *transpose_param);
int DoTransposeUInt16(const uint16_t *in_data, uint16_t *out_data, const int *output_shape,
TransposeParameter *transpose_param);
int DoTransposeUInt32(const uint32_t *in_data, uint32_t *out_data, const int *output_shape,
TransposeParameter *transpose_param);
int DoTransposeUInt64(const uint64_t *in_data, uint64_t *out_data, const int *output_shape,
TransposeParameter *transpose_param);
int DoTransposeInt16(const int16_t *in_data, int16_t *out_data, const int *output_shape,
TransposeParameter *transpose_param);
int DoTransposeInt32(const int32_t *in_data, int32_t *out_data, const int *output_shape,
TransposeParameter *transpose_param);
int DoTransposeInt64(const int64_t *in_data, int64_t *out_data, const int *output_shape,
TransposeParameter *transpose_param);
int DoTransposeBool(const bool *in_data, bool *out_data, const int *output_shape, TransposeParameter *transpose_param);

/* Per-task slice of a generic N-D transpose for multi-threaded callers.
 * size and position are caller-provided scratch arrays of length
 * transpose_param->num_axes_; task_id selects which slice of the flattened
 * output the call fills in. */
void TransposeDimsUInt8(const uint8_t *in_data, uint8_t *out_data, const int *output_shape, int *size, int *position,
TransposeParameter *transpose_param, int task_id, int thread_num);
void TransposeDimsUInt16(const uint16_t *in_data, uint16_t *out_data, const int *output_shape, int *size, int *position,
TransposeParameter *transpose_param, int task_id, int thread_num);
void TransposeDimsUInt32(const uint32_t *in_data, uint32_t *out_data, const int *output_shape, int *size, int *position,
TransposeParameter *transpose_param, int task_id, int thread_num);
void TransposeDimsUInt64(const uint64_t *in_data, uint64_t *out_data, const int *output_shape, int *size, int *position,
TransposeParameter *transpose_param, int task_id, int thread_num);
void TransposeDimsInt16(const int16_t *in_data, int16_t *out_data, const int *output_shape, int *size, int *position,
TransposeParameter *transpose_param, int task_id, int thread_num);
void TransposeDimsInt32(const int32_t *in_data, int32_t *out_data, const int *output_shape, int *size, int *position,
TransposeParameter *transpose_param, int task_id, int thread_num);
void TransposeDimsInt64(const int64_t *in_data, int64_t *out_data, const int *output_shape, int *size, int *position,
TransposeParameter *transpose_param, int task_id, int thread_num);
void TransposeDimsBool(const bool *in_data, bool *out_data, const int *output_shape, int *size, int *position,
TransposeParameter *transpose_param, int task_id, int thread_num);

#ifdef __cplusplus
}
#endif

#endif  // MINDSPORE_NNACL_TRANSPOSE_BASE_H_

+ 104
- 12
mindspore/ccsrc/backend/kernel_compiler/cpu/transpose_cpu_kernel.cc View File

@@ -18,6 +18,9 @@
#include <algorithm>
#include <vector>
#include "runtime/device/cpu/cpu_device_address.h"
#include "common/thread_pool.h"
#include "nnacl/fp32/transpose_fp32.h"
#include "nnacl/int8/transpose_int8.h"
namespace mindspore {
namespace kernel {
@@ -31,6 +34,23 @@ void TransposeCPUFwdKernel::InitKernel(const CNodePtr &kernel_node) {
if (dtype_ == kTypeUnknown) {
dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0);
}
if (axes_.size() > MAX_SHAPE_SIZE) {
MS_LOG(EXCEPTION) << "Transpose support max dimension is " << MAX_SHAPE_SIZE << "D, but got " << axes_.size()
<< "D.";
}
for (size_t i = 0; i < axes_.size(); ++i) {
transpose_param_.perm_[i] = SizeToInt(axes_[i]);
}
int num_axes = SizeToInt(input_shape_.size());
transpose_param_.perm_size_ = axes_.size();
transpose_param_.num_axes_ = num_axes;
transpose_param_.strides_[num_axes - 1] = 1;
transpose_param_.out_strides_[num_axes - 1] = 1;
for (int i = num_axes - 2; i >= 0; i--) {
transpose_param_.strides_[i] = input_shape_[i + 1] * transpose_param_.strides_[i + 1];
transpose_param_.out_strides_[i] = output_shape_[i + 1] * transpose_param_.out_strides_[i + 1];
}
launch_map_[kNumberTypeInt8] = &TransposeCPUFwdKernel::LaunchKernel<int8_t>;
launch_map_[kNumberTypeInt16] = &TransposeCPUFwdKernel::LaunchKernel<int16_t>;
@@ -61,19 +81,91 @@ bool TransposeCPUFwdKernel::Launch(const std::vector<kernel::AddressPtr> &inputs
template <typename T>
void TransposeCPUFwdKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &outputs) {
auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
size_t size = IntToSize(inputs[0]->size / sizeof(T));
TransposeIterator base_iter(output_shape_, axes_, input_shape_);
auto task = [&base_iter, input_addr, output_addr](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
for (size_t i = start; i < end; ++i) {
output_addr[i] = input_addr[iter.GetPos()];
iter.GenNextPos();
const auto *input_addr = reinterpret_cast<T *>(inputs[0]->addr);
auto *output_addr = reinterpret_cast<T *>(outputs[0]->addr);
transpose_param_.data_size_ = IntToSize(inputs[0]->size);
int output_shape[SizeToInt(output_shape_.size())];
for (size_t i = 0; i < output_shape_.size(); ++i) {
output_shape[i] = SizeToInt(output_shape_[i]);
}
if (axes_.size() <= MAX_TRANSPOSE_DIM_SIZE) {
if constexpr (std::is_same_v<T, int8_t>) {
DoTransposeInt8(input_addr, output_addr, output_shape, &transpose_param_);
} else if constexpr (std::is_same_v<T, int16_t>) {
DoTransposeInt16(input_addr, output_addr, output_shape, &transpose_param_);
} else if constexpr (std::is_same_v<T, int32_t>) {
DoTransposeInt32(input_addr, output_addr, output_shape, &transpose_param_);
} else if constexpr (std::is_same_v<T, int64_t>) {
DoTransposeInt64(input_addr, output_addr, output_shape, &transpose_param_);
} else if constexpr (std::is_same_v<T, uint8_t>) {
DoTransposeUInt8(input_addr, output_addr, output_shape, &transpose_param_);
} else if constexpr (std::is_same_v<T, uint16_t>) {
DoTransposeUInt16(input_addr, output_addr, output_shape, &transpose_param_);
} else if constexpr (std::is_same_v<T, uint32_t>) {
DoTransposeUInt32(input_addr, output_addr, output_shape, &transpose_param_);
} else if constexpr (std::is_same_v<T, uint64_t>) {
DoTransposeUInt64(input_addr, output_addr, output_shape, &transpose_param_);
} else if constexpr (std::is_same_v<T, float>) {
DoTransposeFp32(input_addr, output_addr, output_shape, &transpose_param_);
} else if constexpr (std::is_same_v<T, bool>) {
DoTransposeBool(input_addr, output_addr, output_shape, &transpose_param_);
}
};
CPUKernelUtils::ParallelFor(task, size);
} else {
size_t data_count = (inputs[0]->size) / sizeof(T);
ParallelRun(input_addr, output_addr, output_shape, data_count);
}
}
template <typename T>
// Multi-threaded generic transpose for ranks above MAX_TRANSPOSE_DIM_SIZE.
// Splits the flattened output across the thread pool; each task decodes its
// slice of indices with the type-matched nnacl TransposeDims* kernel.
// count is the number of T elements in the input/output buffers.
void TransposeCPUFwdKernel::ParallelRun(const T *input_addr, T *output_addr, const int *output_shape, size_t count) {
  auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum();
  const float block_size = 128.0;
  // Aim for ~block_size elements per task, capped by the pool size.
  size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num;
  if (thread_num == 0) {
    return;  // count == 0: nothing to transpose, and avoids zero-length scratch below.
  }
  int dims = SizeToInt(axes_.size());
  // size[i]: number of output elements covered by one step along output axis i.
  std::vector<int> size(dims);
  size[dims - 1] = 1;
  for (int i = dims - 1; i > 0; i--) {
    size[i - 1] = size[i] * output_shape_[i];
  }
  // One coordinate scratch buffer per task. The original new[]'d the inner
  // arrays and only delete[]'d the outer pointer array, leaking every
  // per-task buffer on each launch; vectors release everything on scope exit.
  std::vector<std::vector<int>> position(thread_num, std::vector<int>(dims));
  std::vector<common::Task> tasks;
  std::function<void(const T *, T *, const int *, int *, int *, TransposeParameter *, int, int)> TransposeDims;
  if constexpr (std::is_same_v<T, int8_t>) {
    TransposeDims = &TransposeDimsInt8;
  } else if constexpr (std::is_same_v<T, int16_t>) {
    TransposeDims = &TransposeDimsInt16;
  } else if constexpr (std::is_same_v<T, int32_t>) {
    TransposeDims = &TransposeDimsInt32;
  } else if constexpr (std::is_same_v<T, int64_t>) {
    TransposeDims = &TransposeDimsInt64;
  } else if constexpr (std::is_same_v<T, uint8_t>) {
    TransposeDims = &TransposeDimsUInt8;
  } else if constexpr (std::is_same_v<T, uint16_t>) {
    TransposeDims = &TransposeDimsUInt16;
  } else if constexpr (std::is_same_v<T, uint32_t>) {
    TransposeDims = &TransposeDimsUInt32;
  } else if constexpr (std::is_same_v<T, uint64_t>) {
    TransposeDims = &TransposeDimsUInt64;
  } else if constexpr (std::is_same_v<T, float>) {
    TransposeDims = &TransposeDimsFp32;
  } else if constexpr (std::is_same_v<T, bool>) {
    TransposeDims = &TransposeDimsBool;
  }
  for (int task_id = 0; task_id < SizeToInt(thread_num); ++task_id) {
    // Capturing locals by reference is safe: SyncRun joins before they go out of scope.
    auto task = [&, task_id, thread_num]() {
      TransposeDims(input_addr, output_addr, output_shape, size.data(), position[task_id].data(), &transpose_param_,
                    task_id, SizeToInt(thread_num));
      return common::SUCCESS;
    };
    tasks.emplace_back(task);
  }
  common::ThreadPool::GetInstance().SyncRun(tasks);
}
} // namespace kernel
} // namespace mindspore

+ 6
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/transpose_cpu_kernel.h View File

@@ -21,6 +21,8 @@
#include <string>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
#include "nnacl/base/transpose_base.h"
namespace mindspore {
namespace kernel {
class TransposeCPUFwdKernel : public CPUKernel {
@@ -37,6 +39,10 @@ class TransposeCPUFwdKernel : public CPUKernel {
template <typename T>
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
template <typename T>
void ParallelRun(const T *input_addr, T *output_addr, const int *output_shape, size_t count);
TransposeParameter transpose_param_;
std::vector<size_t> input_shape_;
std::vector<size_t> output_shape_;
std::vector<size_t> axes_;


+ 20
- 19
mindspore/ccsrc/backend/kernel_compiler/cpu/unpack_cpu_kernel.cc View File

@@ -27,12 +27,20 @@ void UnpackCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
if (axis_tmp < 0) {
axis_tmp += SizeToLong(input_shape.size());
}
size_t axis_ = LongToSize(axis_tmp);
output_num_ = LongToSize(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "num"));
unstack_param_.num_ = SizeToInt(output_num_);
unstack_param_.axis_ = LongToSize(axis_tmp);
unstack_param_.pre_dims_ = 1;
unstack_param_.axis_dim_ = 1;
unstack_param_.after_dims_ = 1;

for (size_t i = 0; i < input_shape.size(); i++) {
input_size_ *= input_shape[i];
if (i > IntToSize(axis_)) {
dims_after_axis_ *= input_shape[i];
if (static_cast<int>(i) < unstack_param_.axis_) {
unstack_param_.pre_dims_ *= input_shape[i];
} else if (static_cast<int>(i) > unstack_param_.axis_) {
unstack_param_.after_dims_ *= input_shape[i];
} else {
unstack_param_.axis_dim_ = input_shape[i];
}
}
dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0);
@@ -56,23 +64,16 @@ template <typename T>
void UnpackCPUKernel<T>::LaunchKernel(const std::vector<AddressPtr> &inputs,
                                      const std::vector<kernel::AddressPtr> &workspace,
                                      const std::vector<AddressPtr> &outputs) {
  // Unstacks inputs[0] along unstack_param_.axis_ into outputs[0..num_-1]
  // via the nnacl Unstack kernel. workspace[0] holds the per-output pointer
  // table that Unstack consumes.
  const void *input = reinterpret_cast<void *>(inputs[0]->addr);
  MS_EXCEPTION_IF_NULL(input);
  void **outputs_host = reinterpret_cast<void **>(workspace[0]->addr);
  MS_EXCEPTION_IF_NULL(outputs_host);
  for (size_t i = 0; i < outputs.size(); i++) {
    outputs_host[i] = reinterpret_cast<T *>(outputs[i]->addr);
    MS_EXCEPTION_IF_NULL(outputs_host[i]);
  }
  // nnacl copies raw bytes, so it only needs the element size, not T itself.
  int data_size = SizeToInt(sizeof(T));
  Unstack(input, outputs_host, &unstack_param_, data_size);
}

template <typename T>


+ 2
- 4
mindspore/ccsrc/backend/kernel_compiler/cpu/unpack_cpu_kernel.h View File

@@ -23,6 +23,7 @@
#include <vector>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
#include "nnacl/base/unstack_base.h"

namespace mindspore {
namespace kernel {
@@ -41,11 +42,8 @@ class UnpackCPUKernel : public CPUKernel {

protected:
virtual void CheckParam(const CNodePtr &kernel_node);
size_t input_size_{1};
UnstackParameter unstack_param_;
size_t output_num_{0};
size_t dims_after_axis_{1};
T *input_{nullptr};
T **outputs_host_{nullptr};
TypeId dtype_{kTypeUnknown};
};
MS_REG_CPU_KERNEL_T(Unstack,


Loading…
Cancel
Save