| @@ -16,6 +16,7 @@ | |||
| #include <vector> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "backend/kernel_compiler/cpu/dropout_grad_kernel.h" | |||
| #include "nnacl/fp32_grad/dropout_grad.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| @@ -46,6 +47,8 @@ bool DropoutGradCpuBwdKernel::Launch(const std::vector<AddressPtr> &inputs, | |||
| DropoutBackwardKernel<float16>(inputs, outputs, num_count_, keep_prob_); | |||
| } else if (dtype_ == kNumberTypeFloat32) { | |||
| DropoutBackwardKernel<float>(inputs, outputs, num_count_, keep_prob_); | |||
| } else { | |||
| MS_LOG(ERROR) << "Input data type: " << dtype_ << " is not supported for DropoutGrad kernel for CPU."; | |||
| } | |||
| return true; | |||
| @@ -55,13 +58,28 @@ template <typename T> | |||
| void DropoutGradCpuBwdKernel::DropoutBackwardKernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs, size_t num_count, | |||
| float keep_prob) { | |||
| auto dx = reinterpret_cast<T *>(outputs[0]->addr); | |||
| auto dy = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto mask = reinterpret_cast<T *>(inputs[1]->addr); | |||
| auto *output = reinterpret_cast<T *>(outputs[0]->addr); | |||
| const auto *input = reinterpret_cast<T *>(inputs[0]->addr); | |||
| const auto *mask = reinterpret_cast<T *>(inputs[1]->addr); | |||
| const float scale = 1.f / keep_prob; | |||
| for (size_t i = 0; i < num_count; i += 1) { | |||
| dx[i] = (T)(scale * static_cast<float>(dy[i] * mask[i])); | |||
| if constexpr (std::is_same_v<T, float16>) { | |||
| float *input_tmp = new float[num_count_]; | |||
| float *output_tmp = new float[num_count_]; | |||
| float *mask_tmp = new float[num_count_]; | |||
| for (size_t i = 0; i < num_count_; ++i) { | |||
| input_tmp[i] = static_cast<float>(input[i]); | |||
| mask_tmp[i] = static_cast<float>(mask[i]); | |||
| } | |||
| DropoutGrad(input_tmp, mask_tmp, output_tmp, num_count_, scale); | |||
| for (size_t i = 0; i < num_count_; ++i) { | |||
| output[i] = static_cast<float16>(output_tmp[i]); | |||
| } | |||
| delete[] input_tmp; | |||
| delete[] output_tmp; | |||
| delete[] mask_tmp; | |||
| } else if constexpr (std::is_same_v<T, float>) { | |||
| DropoutGrad(input, mask, output, num_count_, scale); | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| @@ -0,0 +1,296 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "nnacl/base/transpose_base.h" | |||
| #include "nnacl/errorcode.h" | |||
/* 2-D transpose: walk the output row-major and gather from the permuted
 * input strides. Uses out_strides[0] like the higher-rank kernels (the old
 * code ignored the out_strides parameter and hard-coded `i * 1 * stride0`;
 * by construction out_strides[0] == output_shape[1] for rank 2, so the
 * result is identical). */
#define TRANSPOSE_TWO_DIMS(TYPE, NAME)                                                                      \
  void TransposeDim2##NAME(const TYPE *in_data, TYPE *out_data, const int *strides, const int *out_strides, \
                           const int *perm, const int *output_shape) {                                      \
    const int stride0 = strides[perm[0]];                                                                   \
    const int stride1 = strides[perm[1]];                                                                   \
    const int out_stride0 = out_strides[0];                                                                 \
    const int output0 = output_shape[0];                                                                    \
    const int output1 = output_shape[1];                                                                    \
    for (int i = 0; i < output0; ++i) {                                                                     \
      int out_stride0_i = i * out_stride0;                                                                  \
      int stride0_i = i * stride0;                                                                          \
      for (int j = 0; j < output1; ++j) {                                                                   \
        out_data[out_stride0_i + j] = in_data[stride0_i + j * stride1];                                     \
      }                                                                                                     \
    }                                                                                                       \
  }
/* 3-D transpose: iterate the output in row-major order; for each output
 * coordinate, the input offset is accumulated from the strides of the
 * permuted axes. Partial offsets are hoisted out of the inner loops. */
#define TRANSPOSE_THREE_DIMS(TYPE, NAME)                                                                    \
  void TransposeDim3##NAME(const TYPE *in_data, TYPE *out_data, const int *strides, const int *out_strides, \
                           const int *perm, const int *output_shape) {                                      \
    const int stride0 = strides[perm[0]];                                                                   \
    const int stride1 = strides[perm[1]];                                                                   \
    const int stride2 = strides[perm[2]];                                                                   \
    const int out_stride0 = out_strides[0];                                                                 \
    const int out_stride1 = out_strides[1];                                                                 \
    const int output0 = output_shape[0];                                                                    \
    const int output1 = output_shape[1];                                                                    \
    const int output2 = output_shape[2];                                                                    \
    for (int i = 0; i < output0; ++i) {                                                                     \
      int out_stride0_i = i * out_stride0;                                                                  \
      int stride0_i = i * stride0;                                                                          \
      for (int j = 0; j < output1; ++j) {                                                                   \
        int out_stride1_j = j * out_stride1;                                                                \
        int stride1_j = j * stride1;                                                                        \
        for (int k = 0; k < output2; ++k) {                                                                 \
          out_data[out_stride0_i + out_stride1_j + k] = in_data[stride0_i + stride1_j + k * stride2];       \
        }                                                                                                   \
      }                                                                                                     \
    }                                                                                                       \
  }
/* 4-D transpose: same scheme as TransposeDim3 with one more loop level;
 * the innermost axis of the output is contiguous (stride 1). */
#define TRANSPOSE_FOUR_DIMS(TYPE, NAME)                                                                     \
  void TransposeDim4##NAME(const TYPE *in_data, TYPE *out_data, const int *strides, const int *out_strides, \
                           const int *perm, const int *output_shape) {                                      \
    const int stride0 = strides[perm[0]];                                                                   \
    const int stride1 = strides[perm[1]];                                                                   \
    const int stride2 = strides[perm[2]];                                                                   \
    const int stride3 = strides[perm[3]];                                                                   \
    const int out_stride0 = out_strides[0];                                                                 \
    const int out_stride1 = out_strides[1];                                                                 \
    const int out_stride2 = out_strides[2];                                                                 \
    const int output0 = output_shape[0];                                                                    \
    const int output1 = output_shape[1];                                                                    \
    const int output2 = output_shape[2];                                                                    \
    const int output3 = output_shape[3];                                                                    \
    for (int i = 0; i < output0; ++i) {                                                                     \
      int out_stride0_i = i * out_stride0;                                                                  \
      int stride0_i = i * stride0;                                                                          \
      for (int j = 0; j < output1; ++j) {                                                                   \
        int out_stride1_j = j * out_stride1;                                                                \
        int stride1_j = j * stride1;                                                                        \
        for (int k = 0; k < output2; ++k) {                                                                 \
          int out_stride2_k = k * out_stride2;                                                              \
          int stride2_k = k * stride2;                                                                      \
          for (int m = 0; m < output3; ++m) {                                                               \
            out_data[out_stride0_i + out_stride1_j + out_stride2_k + m] =                                   \
              in_data[stride0_i + stride1_j + stride2_k + m * stride3];                                     \
          }                                                                                                 \
        }                                                                                                   \
      }                                                                                                     \
    }                                                                                                       \
  }
/* 5-D transpose: unrolled specialization, same offset-accumulation scheme. */
#define TRANSPOSE_FIVE_DIMS(TYPE, NAME)                                                                     \
  void TransposeDim5##NAME(const TYPE *in_data, TYPE *out_data, const int *strides, const int *out_strides, \
                           const int *perm, const int *output_shape) {                                      \
    const int stride0 = strides[perm[0]];                                                                   \
    const int stride1 = strides[perm[1]];                                                                   \
    const int stride2 = strides[perm[2]];                                                                   \
    const int stride3 = strides[perm[3]];                                                                   \
    const int stride4 = strides[perm[4]];                                                                   \
    const int out_stride0 = out_strides[0];                                                                 \
    const int out_stride1 = out_strides[1];                                                                 \
    const int out_stride2 = out_strides[2];                                                                 \
    const int out_stride3 = out_strides[3];                                                                 \
    const int output0 = output_shape[0];                                                                    \
    const int output1 = output_shape[1];                                                                    \
    const int output2 = output_shape[2];                                                                    \
    const int output3 = output_shape[3];                                                                    \
    const int output4 = output_shape[4];                                                                    \
    for (int i = 0; i < output0; ++i) {                                                                     \
      int out_stride0_i = i * out_stride0;                                                                  \
      int stride0_i = i * stride0;                                                                          \
      for (int j = 0; j < output1; ++j) {                                                                   \
        int out_stride1_j = j * out_stride1;                                                                \
        int stride1_j = j * stride1;                                                                        \
        for (int k = 0; k < output2; ++k) {                                                                 \
          int out_stride2_k = k * out_stride2;                                                              \
          int stride2_k = k * stride2;                                                                      \
          for (int m = 0; m < output3; ++m) {                                                               \
            int out_stride3_m = m * out_stride3;                                                            \
            int stride3_m = m * stride3;                                                                    \
            for (int n = 0; n < output4; ++n) {                                                             \
              out_data[out_stride0_i + out_stride1_j + out_stride2_k + out_stride3_m + n] =                 \
                in_data[stride0_i + stride1_j + stride2_k + stride3_m + n * stride4];                       \
            }                                                                                               \
          }                                                                                                 \
        }                                                                                                   \
      }                                                                                                     \
    }                                                                                                       \
  }
/* 6-D transpose: deepest unrolled specialization; ranks above 6 fall back
 * to the generic Transpose##NAME loop (see TRANSPOSE_MULTI_DIMS). */
#define TRANSPOSE_SIX_DIMS(TYPE, NAME)                                                                      \
  void TransposeDim6##NAME(const TYPE *in_data, TYPE *out_data, const int *strides, const int *out_strides, \
                           const int *perm, const int *output_shape) {                                      \
    const int stride0 = strides[perm[0]];                                                                   \
    const int stride1 = strides[perm[1]];                                                                   \
    const int stride2 = strides[perm[2]];                                                                   \
    const int stride3 = strides[perm[3]];                                                                   \
    const int stride4 = strides[perm[4]];                                                                   \
    const int stride5 = strides[perm[5]];                                                                   \
    const int out_stride0 = out_strides[0];                                                                 \
    const int out_stride1 = out_strides[1];                                                                 \
    const int out_stride2 = out_strides[2];                                                                 \
    const int out_stride3 = out_strides[3];                                                                 \
    const int out_stride4 = out_strides[4];                                                                 \
    const int output0 = output_shape[0];                                                                    \
    const int output1 = output_shape[1];                                                                    \
    const int output2 = output_shape[2];                                                                    \
    const int output3 = output_shape[3];                                                                    \
    const int output4 = output_shape[4];                                                                    \
    const int output5 = output_shape[5];                                                                    \
    for (int i = 0; i < output0; ++i) {                                                                     \
      int out_stride0_i = i * out_stride0;                                                                  \
      int stride0_i = i * stride0;                                                                          \
      for (int j = 0; j < output1; ++j) {                                                                   \
        int out_stride1_j = j * out_stride1;                                                                \
        int stride1_j = j * stride1;                                                                        \
        for (int k = 0; k < output2; ++k) {                                                                 \
          int out_stride2_k = k * out_stride2;                                                              \
          int stride2_k = k * stride2;                                                                      \
          for (int m = 0; m < output3; ++m) {                                                               \
            int out_stride3_m = m * out_stride3;                                                            \
            int stride3_m = m * stride3;                                                                    \
            for (int n = 0; n < output4; ++n) {                                                             \
              int out_stride4_n = n * out_stride4;                                                          \
              int stride4_n = n * stride4;                                                                  \
              for (int g = 0; g < output5; ++g) {                                                           \
                out_data[out_stride0_i + out_stride1_j + out_stride2_k + out_stride3_m + out_stride4_n + g] = \
                  in_data[stride0_i + stride1_j + stride2_k + stride3_m + stride4_n + g * stride5];         \
              }                                                                                             \
            }                                                                                               \
          }                                                                                                 \
        }                                                                                                   \
      }                                                                                                     \
    }                                                                                                       \
  }
/* Generic N-D transpose for ranks above 6. `size` and `position` are
 * caller-provided scratch buffers of length `dims`; size[i] ends up
 * holding the number of output elements spanned by one step of axis i.
 * Fix vs. old code: the loop index was size_t compared against an int
 * product (signed/unsigned mismatch, product re-evaluated every
 * iteration); now both sides are int, matching `int pos = idx`. */
#define TRANSPOSE_MULTI_DIMS(TYPE, NAME)                                                                \
  void Transpose##NAME(const TYPE *in_data, TYPE *out_data, const int *strides, const int *out_strides, \
                       const int *perm, const int *output_shape, int dims, int *size, int *position) {  \
    size[dims - 1] = 1;                                                                                 \
    for (int i = dims - 1; i > 0; --i) {                                                                \
      size[i - 1] = size[i] * output_shape[i];                                                          \
    }                                                                                                   \
    const int data_size = size[0] * output_shape[0];                                                    \
    for (int idx = 0; idx < data_size; ++idx) {                                                         \
      int pos = idx;                                                                                    \
      int output_idx = 0;                                                                               \
      int input_idx = 0;                                                                                \
      for (int i = 0; i < dims; ++i) {                                                                  \
        position[i] = pos / size[i];                                                                    \
        int out_stride = i < dims - 1 ? out_strides[i] : 1;                                             \
        output_idx += position[i] * out_stride;                                                         \
        input_idx += position[i] * strides[perm[i]];                                                    \
        pos -= position[i] * size[i];                                                                   \
      }                                                                                                 \
      out_data[output_idx] = in_data[input_idx];                                                        \
    }                                                                                                   \
  }
/* One worker shard of the generic N-D transpose: thread `task_id` of
 * `thread_num` handles a contiguous slice of the flattened output.
 * Fix vs. old code: `int count = data_size - task_offset` underflowed in
 * size_t arithmetic when task_offset > data_size, so the `count <= 0`
 * guard could be bypassed; the bounds are now checked before subtracting.
 * Also guards against thread_num == 0 (division by zero in UP_DIV). */
#define TRANSPOSE_DIMS(TYPE, NAME)                                                                               \
  void TransposeDims##NAME(const TYPE *in_data, TYPE *out_data, const int *output_shape, int *size, int *position, \
                           TransposeParameter *transpose_param, int task_id, int thread_num) {                   \
    int *perm = transpose_param->perm_;                                                                          \
    int *strides = transpose_param->strides_;                                                                    \
    int *out_strides = transpose_param->out_strides_;                                                            \
    int num_axes = transpose_param->num_axes_;                                                                   \
    if (thread_num <= 0) {                                                                                       \
      return;                                                                                                    \
    }                                                                                                            \
    size_t data_size = (size_t)(size[0] * output_shape[0]);                                                      \
    size_t offset_size = UP_DIV(data_size, (size_t)thread_num);                                                  \
    size_t task_offset = offset_size * (size_t)task_id;                                                          \
    if (task_offset >= data_size) {                                                                              \
      return;                                                                                                    \
    }                                                                                                            \
    size_t count = data_size - task_offset;                                                                      \
    if (count > offset_size) {                                                                                   \
      count = offset_size;                                                                                       \
    }                                                                                                            \
    for (size_t idx = task_offset; idx < task_offset + count; ++idx) {                                           \
      int pos = (int)idx;                                                                                        \
      int output_idx = 0;                                                                                        \
      int input_idx = 0;                                                                                         \
      for (int i = 0; i < num_axes; ++i) {                                                                       \
        position[i] = pos / size[i];                                                                             \
        int out_stride = i < num_axes - 1 ? out_strides[i] : 1;                                                  \
        output_idx += position[i] * out_stride;                                                                  \
        input_idx += position[i] * strides[perm[i]];                                                             \
        pos -= position[i] * size[i];                                                                            \
      }                                                                                                          \
      out_data[output_idx] = in_data[input_idx];                                                                 \
    }                                                                                                            \
  }
/* Single-threaded transpose dispatcher: validates the parameters, takes the
 * identity-permutation fast path (plain memcpy), then dispatches to the
 * rank-specialized kernel (2..6) or the generic fallback.
 * Fixes vs. old code: malloc results are NULL-checked before use (old code
 * dereferenced them unconditionally), and perm entries are validated
 * before the memcpy fast path rather than after it. */
#define DOTRANSPOSE(TYPE, NAME)                                                                           \
  int DoTranspose##NAME(const TYPE *in_data, TYPE *out_data, const int *output_shape,                     \
                        TransposeParameter *transpose_param) {                                            \
    if (in_data == NULL || out_data == NULL) {                                                            \
      return NNACL_ERR;                                                                                   \
    }                                                                                                     \
    const int *perm = transpose_param->perm_;                                                             \
    const int *strides = transpose_param->strides_;                                                       \
    const int *out_strides = transpose_param->out_strides_;                                               \
    int data_size = transpose_param->data_size_;                                                          \
    int num_axes = transpose_param->num_axes_;                                                            \
    if (num_axes < 2) {                                                                                   \
      return NNACL_ERR;                                                                                   \
    }                                                                                                     \
    for (int i = 0; i < num_axes; ++i) {                                                                  \
      if (perm[i] < 0) {                                                                                  \
        return NNACL_PARAM_INVALID;                                                                       \
      }                                                                                                   \
    }                                                                                                     \
    bool needTranspose = false;                                                                           \
    for (int i = 1; i < num_axes; ++i) {                                                                  \
      if (perm[i] - perm[i - 1] != 1) {                                                                   \
        needTranspose = true;                                                                             \
        break;                                                                                            \
      }                                                                                                   \
    }                                                                                                     \
    if (!needTranspose) {                                                                                 \
      (void)memcpy(out_data, in_data, data_size);                                                         \
      return NNACL_OK;                                                                                    \
    }                                                                                                     \
    if (num_axes == 2) {                                                                                  \
      TransposeDim2##NAME(in_data, out_data, strides, out_strides, perm, output_shape);                   \
    } else if (num_axes == 3) {                                                                           \
      TransposeDim3##NAME(in_data, out_data, strides, out_strides, perm, output_shape);                   \
    } else if (num_axes == 4) {                                                                           \
      TransposeDim4##NAME(in_data, out_data, strides, out_strides, perm, output_shape);                   \
    } else if (num_axes == 5) {                                                                           \
      TransposeDim5##NAME(in_data, out_data, strides, out_strides, perm, output_shape);                   \
    } else if (num_axes == 6) {                                                                           \
      TransposeDim6##NAME(in_data, out_data, strides, out_strides, perm, output_shape);                   \
    } else {                                                                                              \
      int *size = (int *)malloc((size_t)num_axes * sizeof(int));                                          \
      int *position = (int *)malloc((size_t)num_axes * sizeof(int));                                      \
      if (size == NULL || position == NULL) {                                                             \
        free(size);                                                                                       \
        free(position);                                                                                   \
        return NNACL_ERR;                                                                                 \
      }                                                                                                   \
      Transpose##NAME(in_data, out_data, strides, out_strides, perm, output_shape, num_axes, size,        \
                      position);                                                                          \
      free(size);                                                                                         \
      free(position);                                                                                     \
    }                                                                                                     \
    return NNACL_OK;                                                                                      \
  }
/* Stamp out the complete kernel family (Dim2..Dim6, generic fallback,
 * threaded shard, and the DoTranspose dispatcher) for one element type. */
#define TRANSPOSE_TEMPLATE(TYPE, NAME) \
  TRANSPOSE_TWO_DIMS(TYPE, NAME)       \
  TRANSPOSE_THREE_DIMS(TYPE, NAME)     \
  TRANSPOSE_FOUR_DIMS(TYPE, NAME)      \
  TRANSPOSE_FIVE_DIMS(TYPE, NAME)      \
  TRANSPOSE_SIX_DIMS(TYPE, NAME)       \
  TRANSPOSE_MULTI_DIMS(TYPE, NAME)     \
  TRANSPOSE_DIMS(TYPE, NAME)           \
  DOTRANSPOSE(TYPE, NAME)
/* Instantiations for every integral/bool element width. Float kernels live
 * elsewhere (nnacl fp32/int8 transpose files); see the includes in the
 * transpose CPU kernel. */
TRANSPOSE_TEMPLATE(uint8_t, UInt8)
TRANSPOSE_TEMPLATE(uint16_t, UInt16)
TRANSPOSE_TEMPLATE(uint32_t, UInt32)
TRANSPOSE_TEMPLATE(uint64_t, UInt64)
TRANSPOSE_TEMPLATE(int16_t, Int16)
TRANSPOSE_TEMPLATE(int32_t, Int32)
TRANSPOSE_TEMPLATE(int64_t, Int64)
TRANSPOSE_TEMPLATE(bool, Bool)
| @@ -0,0 +1,64 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
#ifndef MINDSPORE_NNACL_TRANSPOSE_BASE_H_
#define MINDSPORE_NNACL_TRANSPOSE_BASE_H_
#include "nnacl/transpose.h"
#include <string.h>
#ifdef __cplusplus
extern "C" {
#endif
/* DoTranspose<Type>: single-threaded transpose of in_data into out_data
 * according to transpose_param (perm_/strides_/out_strides_/num_axes_).
 * Returns NNACL_OK on success or an NNACL error code on bad arguments. */
int DoTransposeUInt8(const uint8_t *in_data, uint8_t *out_data, const int *output_shape,
                     TransposeParameter *transpose_param);
int DoTransposeUInt16(const uint16_t *in_data, uint16_t *out_data, const int *output_shape,
                      TransposeParameter *transpose_param);
int DoTransposeUInt32(const uint32_t *in_data, uint32_t *out_data, const int *output_shape,
                      TransposeParameter *transpose_param);
int DoTransposeUInt64(const uint64_t *in_data, uint64_t *out_data, const int *output_shape,
                      TransposeParameter *transpose_param);
int DoTransposeInt16(const int16_t *in_data, int16_t *out_data, const int *output_shape,
                     TransposeParameter *transpose_param);
int DoTransposeInt32(const int32_t *in_data, int32_t *out_data, const int *output_shape,
                     TransposeParameter *transpose_param);
int DoTransposeInt64(const int64_t *in_data, int64_t *out_data, const int *output_shape,
                     TransposeParameter *transpose_param);
int DoTransposeBool(const bool *in_data, bool *out_data, const int *output_shape, TransposeParameter *transpose_param);
/* TransposeDims<Type>: thread-sharded variant — worker task_id of thread_num
 * processes one slice of the flattened output. `size` and `position` are
 * caller-provided scratch buffers of length num_axes_ (position must be
 * distinct per worker; size may be shared once filled). */
void TransposeDimsUInt8(const uint8_t *in_data, uint8_t *out_data, const int *output_shape, int *size, int *position,
                        TransposeParameter *transpose_param, int task_id, int thread_num);
void TransposeDimsUInt16(const uint16_t *in_data, uint16_t *out_data, const int *output_shape, int *size, int *position,
                         TransposeParameter *transpose_param, int task_id, int thread_num);
void TransposeDimsUInt32(const uint32_t *in_data, uint32_t *out_data, const int *output_shape, int *size, int *position,
                         TransposeParameter *transpose_param, int task_id, int thread_num);
void TransposeDimsUInt64(const uint64_t *in_data, uint64_t *out_data, const int *output_shape, int *size, int *position,
                         TransposeParameter *transpose_param, int task_id, int thread_num);
void TransposeDimsInt16(const int16_t *in_data, int16_t *out_data, const int *output_shape, int *size, int *position,
                        TransposeParameter *transpose_param, int task_id, int thread_num);
void TransposeDimsInt32(const int32_t *in_data, int32_t *out_data, const int *output_shape, int *size, int *position,
                        TransposeParameter *transpose_param, int task_id, int thread_num);
void TransposeDimsInt64(const int64_t *in_data, int64_t *out_data, const int *output_shape, int *size, int *position,
                        TransposeParameter *transpose_param, int task_id, int thread_num);
void TransposeDimsBool(const bool *in_data, bool *out_data, const int *output_shape, int *size, int *position,
                       TransposeParameter *transpose_param, int task_id, int thread_num);
#ifdef __cplusplus
}
#endif
#endif  // MINDSPORE_NNACL_TRANSPOSE_BASE_H_
| @@ -18,6 +18,9 @@ | |||
| #include <algorithm> | |||
| #include <vector> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "common/thread_pool.h" | |||
| #include "nnacl/fp32/transpose_fp32.h" | |||
| #include "nnacl/int8/transpose_int8.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| @@ -31,6 +34,23 @@ void TransposeCPUFwdKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| if (dtype_ == kTypeUnknown) { | |||
| dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0); | |||
| } | |||
| if (axes_.size() > MAX_SHAPE_SIZE) { | |||
| MS_LOG(EXCEPTION) << "Transpose support max dimension is " << MAX_SHAPE_SIZE << "D, but got " << axes_.size() | |||
| << "D."; | |||
| } | |||
| for (size_t i = 0; i < axes_.size(); ++i) { | |||
| transpose_param_.perm_[i] = SizeToInt(axes_[i]); | |||
| } | |||
| int num_axes = SizeToInt(input_shape_.size()); | |||
| transpose_param_.perm_size_ = axes_.size(); | |||
| transpose_param_.num_axes_ = num_axes; | |||
| transpose_param_.strides_[num_axes - 1] = 1; | |||
| transpose_param_.out_strides_[num_axes - 1] = 1; | |||
| for (int i = num_axes - 2; i >= 0; i--) { | |||
| transpose_param_.strides_[i] = input_shape_[i + 1] * transpose_param_.strides_[i + 1]; | |||
| transpose_param_.out_strides_[i] = output_shape_[i + 1] * transpose_param_.out_strides_[i + 1]; | |||
| } | |||
| launch_map_[kNumberTypeInt8] = &TransposeCPUFwdKernel::LaunchKernel<int8_t>; | |||
| launch_map_[kNumberTypeInt16] = &TransposeCPUFwdKernel::LaunchKernel<int16_t>; | |||
| @@ -61,19 +81,91 @@ bool TransposeCPUFwdKernel::Launch(const std::vector<kernel::AddressPtr> &inputs | |||
| template <typename T> | |||
| void TransposeCPUFwdKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| auto input_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| size_t size = IntToSize(inputs[0]->size / sizeof(T)); | |||
| TransposeIterator base_iter(output_shape_, axes_, input_shape_); | |||
| auto task = [&base_iter, input_addr, output_addr](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| iter.SetPos(start); | |||
| for (size_t i = start; i < end; ++i) { | |||
| output_addr[i] = input_addr[iter.GetPos()]; | |||
| iter.GenNextPos(); | |||
| const auto *input_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto *output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| transpose_param_.data_size_ = IntToSize(inputs[0]->size); | |||
| int output_shape[SizeToInt(output_shape_.size())]; | |||
| for (size_t i = 0; i < output_shape_.size(); ++i) { | |||
| output_shape[i] = SizeToInt(output_shape_[i]); | |||
| } | |||
| if (axes_.size() <= MAX_TRANSPOSE_DIM_SIZE) { | |||
| if constexpr (std::is_same_v<T, int8_t>) { | |||
| DoTransposeInt8(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, int16_t>) { | |||
| DoTransposeInt16(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, int32_t>) { | |||
| DoTransposeInt32(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, int64_t>) { | |||
| DoTransposeInt64(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, uint8_t>) { | |||
| DoTransposeUInt8(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, uint16_t>) { | |||
| DoTransposeUInt16(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, uint32_t>) { | |||
| DoTransposeUInt32(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, uint64_t>) { | |||
| DoTransposeUInt64(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, float>) { | |||
| DoTransposeFp32(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, bool>) { | |||
| DoTransposeBool(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } | |||
| }; | |||
| CPUKernelUtils::ParallelFor(task, size); | |||
| } else { | |||
| size_t data_count = (inputs[0]->size) / sizeof(T); | |||
| ParallelRun(input_addr, output_addr, output_shape, data_count); | |||
| } | |||
| } | |||
| template <typename T> | |||
| void TransposeCPUFwdKernel::ParallelRun(const T *input_addr, T *output_addr, const int *output_shape, size_t count) { | |||
| auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum(); | |||
| const float block_size = 128.0; | |||
| size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num; | |||
| int dims = SizeToInt(axes_.size()); | |||
| int *size = new int[dims]; | |||
| size[dims - 1] = 1; | |||
| for (int i = dims - 1; i > 0; i--) { | |||
| size[i - 1] = size[i] * output_shape_[i]; | |||
| } | |||
| int **position = new int *[thread_num]; | |||
| for (size_t i = 0; i < thread_num; ++i) { | |||
| position[i] = new int[dims]; | |||
| } | |||
| std::vector<common::Task> tasks; | |||
| std::function<void(const T *, T *, const int *, int *, int *, TransposeParameter *, int, int)> TransposeDims; | |||
| if constexpr (std::is_same_v<T, int8_t>) { | |||
| TransposeDims = &TransposeDimsInt8; | |||
| } else if constexpr (std::is_same_v<T, int16_t>) { | |||
| TransposeDims = &TransposeDimsInt16; | |||
| } else if constexpr (std::is_same_v<T, int32_t>) { | |||
| TransposeDims = &TransposeDimsInt32; | |||
| } else if constexpr (std::is_same_v<T, int64_t>) { | |||
| TransposeDims = &TransposeDimsInt64; | |||
| } else if constexpr (std::is_same_v<T, uint8_t>) { | |||
| TransposeDims = &TransposeDimsUInt8; | |||
| } else if constexpr (std::is_same_v<T, uint16_t>) { | |||
| TransposeDims = &TransposeDimsUInt16; | |||
| } else if constexpr (std::is_same_v<T, uint32_t>) { | |||
| TransposeDims = &TransposeDimsUInt32; | |||
| } else if constexpr (std::is_same_v<T, uint64_t>) { | |||
| TransposeDims = &TransposeDimsUInt64; | |||
| } else if constexpr (std::is_same_v<T, float>) { | |||
| TransposeDims = &TransposeDimsFp32; | |||
| } else if constexpr (std::is_same_v<T, bool>) { | |||
| TransposeDims = &TransposeDimsBool; | |||
| } | |||
| for (int task_id = 0; task_id < SizeToInt(thread_num); ++task_id) { | |||
| auto task = [&, task_id, thread_num]() { | |||
| TransposeDims(input_addr, output_addr, output_shape, size, position[task_id], &transpose_param_, task_id, | |||
| SizeToInt(thread_num)); | |||
| return common::SUCCESS; | |||
| }; | |||
| tasks.emplace_back(task); | |||
| } | |||
| common::ThreadPool::GetInstance().SyncRun(tasks); | |||
| delete[] size; | |||
| delete[] position; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -21,6 +21,8 @@ | |||
| #include <string> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| #include "nnacl/base/transpose_base.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class TransposeCPUFwdKernel : public CPUKernel { | |||
| @@ -37,6 +39,10 @@ class TransposeCPUFwdKernel : public CPUKernel { | |||
| template <typename T> | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| template <typename T> | |||
| void ParallelRun(const T *input_addr, T *output_addr, const int *output_shape, size_t count); | |||
| TransposeParameter transpose_param_; | |||
| std::vector<size_t> input_shape_; | |||
| std::vector<size_t> output_shape_; | |||
| std::vector<size_t> axes_; | |||
| @@ -27,12 +27,20 @@ void UnpackCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| if (axis_tmp < 0) { | |||
| axis_tmp += SizeToLong(input_shape.size()); | |||
| } | |||
| size_t axis_ = LongToSize(axis_tmp); | |||
| output_num_ = LongToSize(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "num")); | |||
| unstack_param_.num_ = SizeToInt(output_num_); | |||
| unstack_param_.axis_ = LongToSize(axis_tmp); | |||
| unstack_param_.pre_dims_ = 1; | |||
| unstack_param_.axis_dim_ = 1; | |||
| unstack_param_.after_dims_ = 1; | |||
| for (size_t i = 0; i < input_shape.size(); i++) { | |||
| input_size_ *= input_shape[i]; | |||
| if (i > IntToSize(axis_)) { | |||
| dims_after_axis_ *= input_shape[i]; | |||
| if (static_cast<int>(i) < unstack_param_.axis_) { | |||
| unstack_param_.pre_dims_ *= input_shape[i]; | |||
| } else if (static_cast<int>(i) > unstack_param_.axis_) { | |||
| unstack_param_.after_dims_ *= input_shape[i]; | |||
| } else { | |||
| unstack_param_.axis_dim_ = input_shape[i]; | |||
| } | |||
| } | |||
| dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0); | |||
| @@ -56,23 +64,16 @@ template <typename T> | |||
| void UnpackCPUKernel<T>::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| input_ = reinterpret_cast<T *>(inputs[0]->addr); | |||
| MS_EXCEPTION_IF_NULL(input_); | |||
| outputs_host_ = reinterpret_cast<T **>(workspace[0]->addr); | |||
| MS_EXCEPTION_IF_NULL(outputs_host_); | |||
| const void *input = reinterpret_cast<void *>(inputs[0]->addr); | |||
| MS_EXCEPTION_IF_NULL(input); | |||
| void **outputs_host = reinterpret_cast<void **>(workspace[0]->addr); | |||
| MS_EXCEPTION_IF_NULL(outputs_host); | |||
| for (size_t i = 0; i < outputs.size(); i++) { | |||
| outputs_host_[i] = reinterpret_cast<T *>(outputs[i]->addr); | |||
| MS_EXCEPTION_IF_NULL(outputs_host_[i]); | |||
| outputs_host[i] = reinterpret_cast<T *>(outputs[i]->addr); | |||
| MS_EXCEPTION_IF_NULL(outputs_host[i]); | |||
| } | |||
| size_t number_of_reset = output_num_ * dims_after_axis_; | |||
| auto task = [this, number_of_reset](const size_t start, const size_t end) { | |||
| for (size_t i = start; i < end; ++i) { | |||
| size_t output_index = (i / dims_after_axis_) % output_num_; | |||
| size_t tensor_index = i / number_of_reset * dims_after_axis_ + i % dims_after_axis_; | |||
| outputs_host_[output_index][tensor_index] = input_[i]; | |||
| } | |||
| }; | |||
| CPUKernelUtils::ParallelFor(task, input_size_); | |||
| int data_size = SizeToInt(sizeof(T)); | |||
| Unstack(input, outputs_host, &unstack_param_, data_size); | |||
| } | |||
| template <typename T> | |||
| @@ -23,6 +23,7 @@ | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| #include "nnacl/base/unstack_base.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| @@ -41,11 +42,8 @@ class UnpackCPUKernel : public CPUKernel { | |||
| protected: | |||
| virtual void CheckParam(const CNodePtr &kernel_node); | |||
| size_t input_size_{1}; | |||
| UnstackParameter unstack_param_; | |||
| size_t output_num_{0}; | |||
| size_t dims_after_axis_{1}; | |||
| T *input_{nullptr}; | |||
| T **outputs_host_{nullptr}; | |||
| TypeId dtype_{kTypeUnknown}; | |||
| }; | |||
| MS_REG_CPU_KERNEL_T(Unstack, | |||