diff --git a/mindspore/lite/nnacl/fp16/transpose_fp16.c b/mindspore/lite/nnacl/fp16/transpose_fp16.c
index dfcad94ac7..aa62d715e9 100644
--- a/mindspore/lite/nnacl/fp16/transpose_fp16.c
+++ b/mindspore/lite/nnacl/fp16/transpose_fp16.c
@@ -18,8 +18,8 @@
 #include <string.h>
 #include "nnacl/errorcode.h"
 
-void Fp16TransposeDim2(float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm,
-                       int *output_shape, int h_start, int h_end) {
+void Fp16TransposeDim2(const float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm,
+                       const int *output_shape) {
   const int stride0 = strides[perm[0]];
   const int stride1 = strides[perm[1]];
   const int output0 = output_shape[0];
@@ -33,8 +33,8 @@ void Fp16TransposeDim2(float16_t *in_data, float16_t *out_data, int *strides, in
   }
 }
 
-void Fp16TransposeDim3(float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm,
-                       int *output_shape, int h_start, int h_end) {
+void Fp16TransposeDim3(const float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm,
+                       const int *output_shape) {
   const int stride0 = strides[perm[0]];
   const int stride1 = strides[perm[1]];
   const int stride2 = strides[perm[2]];
@@ -56,8 +56,8 @@ void Fp16TransposeDim3(float16_t *in_data, float16_t *out_data, int *strides, in
   }
 }
 
-void Fp16TransposeDim4(float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm,
-                       int *output_shape, int h_start, int h_end) {
+void Fp16TransposeDim4(const float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm,
+                       const int *output_shape) {
   const int stride0 = strides[perm[0]];
   const int stride1 = strides[perm[1]];
   const int stride2 = strides[perm[2]];
@@ -88,8 +88,8 @@ void Fp16TransposeDim4(float16_t *in_data, float16_t *out_data, int *strides, in
   }
 }
 
-void Fp16TransposeDim5(float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm,
-                       int *output_shape, int h_start, int h_end) {
+void Fp16TransposeDim5(const float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm,
+                       const int *output_shape) {
   const int stride0 = strides[perm[0]];
   const int stride1 = strides[perm[1]];
   const int stride2 = strides[perm[2]];
@@ -127,8 +127,30 @@ void Fp16TransposeDim5(float16_t *in_data, float16_t *out_data, int *strides, in
   }
 }
 
-int Fp16DoTranspose(float16_t *in_data, float16_t *out_data, int *input_shape, int *output_shape,
-                    TransposeParameter *transpose_param, int h_start, int h_end) {
+void TransposeDimsFp16(const float16_t *in_data, float16_t *out_data, const int *strides, const int *out_strides,
+                       const int *perm, const int *output_shape, int dims, int *size, int *position) {
+  *(size + dims - 1) = 1;
+  for (int i = dims - 1; i > 0; --i) {
+    *(size + i - 1) = *(size + i) * output_shape[i];
+  }
+
+  for (size_t idx = 0; idx < (*size) * output_shape[0]; ++idx) {
+    int pos = idx;
+    int output_idx = 0;
+    int input_idx = 0;
+    for (int i = 0; i < dims; ++i) {
+      *(position + i) = pos / *(size + i);
+      int out_stride = i < dims - 1 ? out_strides[i] : 1;
+      output_idx += (*(position + i) * out_stride);
+      input_idx += (*(position + i) * strides[perm[i]]);
+      pos -= *(position + i) * (*(size + i));
+    }
+    out_data[output_idx] = in_data[input_idx];
+  }
+}
+
+int Fp16DoTranspose(const float16_t *in_data, float16_t *out_data, const int *output_shape,
+                    TransposeParameter *transpose_param, int *size, int *position) {
   if (in_data == NULL || out_data == NULL) {
     return NNACL_ERR;
   }
@@ -138,7 +160,7 @@ int Fp16DoTranspose(float16_t *in_data, float16_t *out_data, int *input_shape, i
 
   int data_size = transpose_param->data_size_;
   int num_axes = transpose_param->num_axes_;
-  if (num_axes < 2 || num_axes > 5) {
+  if (num_axes < 2) {
     return NNACL_ERR;
   }
 
@@ -155,14 +177,21 @@ int Fp16DoTranspose(float16_t *in_data, float16_t *out_data, int *input_shape, i
     (void)memcpy(out_data, in_data, data_size);
     return NNACL_OK;
   }
+  for (int i = 0; i < num_axes; ++i) {
+    if (perm[i] < 0) {
+      return NNACL_PARAM_INVALID;
+    }
+  }
   if (num_axes == 2) {
-    Fp16TransposeDim2(in_data, out_data, strides, out_strides, perm, output_shape, h_start, h_end);
+    Fp16TransposeDim2(in_data, out_data, strides, out_strides, perm, output_shape);
   } else if (num_axes == 3) {
-    Fp16TransposeDim3(in_data, out_data, strides, out_strides, perm, output_shape, h_start, h_end);
+    Fp16TransposeDim3(in_data, out_data, strides, out_strides, perm, output_shape);
   } else if (num_axes == 4) {
-    Fp16TransposeDim4(in_data, out_data, strides, out_strides, perm, output_shape, h_start, h_end);
+    Fp16TransposeDim4(in_data, out_data, strides, out_strides, perm, output_shape);
   } else if (num_axes == 5) {
-    Fp16TransposeDim5(in_data, out_data, strides, out_strides, perm, output_shape, h_start, h_end);
+    Fp16TransposeDim5(in_data, out_data, strides, out_strides, perm, output_shape);
+  } else {
+    TransposeDimsFp16(in_data, out_data, strides, out_strides, perm, output_shape, num_axes, size, position);
   }
   return NNACL_OK;
 }
diff --git a/mindspore/lite/nnacl/fp16/transpose_fp16.h b/mindspore/lite/nnacl/fp16/transpose_fp16.h
index c7d2f631f2..aa7b21b63f 100644
--- a/mindspore/lite/nnacl/fp16/transpose_fp16.h
+++ b/mindspore/lite/nnacl/fp16/transpose_fp16.h
@@ -18,38 +18,16 @@
 #define MINDSPORE_LITE_NNACL_FP16_TRANSPOSE_FP16_H_
 
 #include "nnacl/op_base.h"
+#include "nnacl/transpose.h"
 #ifdef ENABLE_NEON
 #include <arm_neon.h>
 #endif
 
-typedef struct TransposeParameter {
-  // primitive parameter
-  OpParameter op_parameter_;
-  int perm_[8];
-  bool conjugate_;
-
-  // shape correlative
-  int strides_[8];
-  int out_strides_[8];
-
-  // other parameter
-  int num_axes_;
-  int data_size_;
-} TransposeParameter;
-
 #ifdef __cplusplus
 extern "C" {
 #endif
-int Fp16DoTranspose(float16_t *in_data, float16_t *out_data, int *input_shape, int *output_shape,
-                    TransposeParameter *transpose_param, int h_start, int h_end);
-void TransposeDim2(float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm,
-                   int *output_shape, int h_start, int h_end);
-void TransposeDim3(float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm,
-                   int *output_shape, int h_start, int h_end);
-void TransposeDim4(float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm,
-                   int *output_shape, int h_start, int h_end);
-void TransposeDim5(float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm,
-                   int *output_shape, int h_start, int h_end);
+int Fp16DoTranspose(const float16_t *in_data, float16_t *out_data, const int *output_shape,
+                    TransposeParameter *transpose_param, int *size, int *position);
#ifdef __cplusplus
}
#endif
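The new `TransposeDimsFp16` above (and its fp32 twin below) handles arbitrary rank by decomposing each flat output index into per-dimension coordinates and mapping those coordinates back through `perm` to an input offset. Here is a minimal standalone sketch of that arithmetic on a made-up 3D case (shapes and strides invented for illustration; because the output here is contiguous, the flat index is used directly rather than re-accumulated through `out_strides` as the nnacl code does):

```c
#include <stdio.h>

int main(void) {
  enum { DIMS = 3 };
  const int output_shape[DIMS] = {4, 3, 2}; /* input shape {2, 3, 4} permuted by perm */
  const int perm[DIMS] = {2, 1, 0};
  const int strides[DIMS] = {12, 4, 1}; /* row-major strides of the input {2, 3, 4} */
  int size[DIMS], position[DIMS];

  /* size[i] = number of output elements spanned by one step along output dim i */
  size[DIMS - 1] = 1;
  for (int i = DIMS - 1; i > 0; --i) {
    size[i - 1] = size[i] * output_shape[i];
  }
  for (int idx = 0; idx < size[0] * output_shape[0]; ++idx) {
    int pos = idx;
    int input_idx = 0;
    for (int i = 0; i < DIMS; ++i) {
      position[i] = pos / size[i]; /* coordinate along output dim i */
      input_idx += position[i] * strides[perm[i]];
      pos -= position[i] * size[i];
    }
    printf("out[%2d] <- in[%2d]\n", idx, input_idx);
  }
  return 0;
}
```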
diff --git a/mindspore/lite/nnacl/fp32/transpose_fp32.c b/mindspore/lite/nnacl/fp32/transpose_fp32.c
index 011594c630..af8ec38e95 100644
--- a/mindspore/lite/nnacl/fp32/transpose_fp32.c
+++ b/mindspore/lite/nnacl/fp32/transpose_fp32.c
@@ -17,7 +17,7 @@
 #include "nnacl/fp32/transpose_fp32.h"
 
 void TransposeDim2Fp32(const float *in_data, float *out_data, const int *strides, int *out_strides, const int *perm,
-                       const int *output_shape, int h_start, int h_end) {
+                       const int *output_shape) {
   const int stride0 = strides[perm[0]];
   const int stride1 = strides[perm[1]];
   const int output0 = output_shape[0];
@@ -32,7 +32,7 @@ void TransposeDim2Fp32(const float *in_data, float *out_data, const int *strides
 }
 
 void TransposeDim3Fp32(const float *in_data, float *out_data, const int *strides, const int *out_strides,
-                       const int *perm, const int *output_shape, int h_start, int h_end) {
+                       const int *perm, const int *output_shape) {
   const int stride0 = strides[perm[0]];
   const int stride1 = strides[perm[1]];
   const int stride2 = strides[perm[2]];
@@ -55,7 +55,7 @@ void TransposeDim3Fp32(const float *in_data, float *out_data, const int *strides
 }
 
 void TransposeDim4Fp32(const float *in_data, float *out_data, const int *strides, const int *out_strides,
-                       const int *perm, const int *output_shape, int h_start, int h_end) {
+                       const int *perm, const int *output_shape) {
   const int stride0 = strides[perm[0]];
   const int stride1 = strides[perm[1]];
   const int stride2 = strides[perm[2]];
@@ -87,7 +87,7 @@ void TransposeDim4Fp32(const float *in_data, float *out_data, const int *strides
 }
 
 void TransposeDim5Fp32(const float *in_data, float *out_data, const int *strides, const int *out_strides,
-                       const int *perm, const int *output_shape, int h_start, int h_end) {
+                       const int *perm, const int *output_shape) {
   const int stride0 = strides[perm[0]];
   const int stride1 = strides[perm[1]];
   const int stride2 = strides[perm[2]];
@@ -126,8 +126,7 @@ void TransposeDim5Fp32(const float *in_data, float *out_data, const int *strides
 }
 
 void TransposeDimsFp32(const float *in_data, float *out_data, const int *strides, const int *out_strides,
-                       const int *perm, const int *output_shape, int h_start, int h_end, int dims, int *size,
-                       int *position) {
+                       const int *perm, const int *output_shape, int dims, int *size, int *position) {
   *(size + dims - 1) = 1;
   for (int i = dims - 1; i > 0; --i) {
     *(size + i - 1) = *(size + i) * output_shape[i];
@@ -148,8 +147,8 @@ void TransposeDimsFp32(const float *in_data, float *out_data, const int *strides
   }
 }
 
-int DoTransposeFp32(const float *in_data, float *out_data, int *input_shape, const int *output_shape,
-                    TransposeParameter *transpose_param, int h_start, int h_end, int *size, int *position) {
+int DoTransposeFp32(const float *in_data, float *out_data, const int *output_shape, TransposeParameter *transpose_param,
+                    int *size, int *position) {
   if (in_data == NULL || out_data == NULL) {
     return NNACL_ERR;
   }
@@ -182,16 +181,15 @@ int DoTransposeFp32(const float *in_data, float *out_data, int *input_shape, con
     }
   }
   if (num_axes == 2) {
-    TransposeDim2Fp32(in_data, out_data, strides, out_strides, perm, output_shape, h_start, h_end);
+    TransposeDim2Fp32(in_data, out_data, strides, out_strides, perm, output_shape);
   } else if (num_axes == 3) {
-    TransposeDim3Fp32(in_data, out_data, strides, out_strides, perm, output_shape, h_start, h_end);
+    TransposeDim3Fp32(in_data, out_data, strides, out_strides, perm, output_shape);
  } else if (num_axes == 4) {
-    TransposeDim4Fp32(in_data, out_data, strides, out_strides, perm, output_shape, h_start, h_end);
+    TransposeDim4Fp32(in_data, out_data, strides, out_strides, perm, output_shape);
   } else if (num_axes == 5) {
-    TransposeDim5Fp32(in_data, out_data, strides, out_strides, perm, output_shape, h_start, h_end);
+    TransposeDim5Fp32(in_data, out_data, strides, out_strides, perm, output_shape);
   } else {
-    TransposeDimsFp32(in_data, out_data, strides, out_strides, perm, output_shape, h_start, h_end, num_axes, size,
-                      position);
+    TransposeDimsFp32(in_data, out_data, strides, out_strides, perm, output_shape, num_axes, size, position);
   }
   return NNACL_OK;
 }
diff --git a/mindspore/lite/nnacl/fp32/transpose_fp32.h b/mindspore/lite/nnacl/fp32/transpose_fp32.h
index 0ce061740a..9027577b06 100644
--- a/mindspore/lite/nnacl/fp32/transpose_fp32.h
+++ b/mindspore/lite/nnacl/fp32/transpose_fp32.h
@@ -25,8 +25,8 @@
 extern "C" {
 #endif
 
-int DoTransposeFp32(const float *in_data, float *out_data, int *input_shape, const int *output_shape,
-                    TransposeParameter *transpose_param, int h_start, int h_end, int *size, int *position);
+int DoTransposeFp32(const float *in_data, float *out_data, const int *output_shape, TransposeParameter *transpose_param,
+                    int *size, int *position);
 
 #ifdef __cplusplus
 }
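For reference, the `strides_` and `out_strides_` arrays consumed by these functions are plain row-major strides: each entry is the product of all dimensions to its right. A quick sketch of the computation the kernels' `ReSize` performs (the shape here matches the `TransposeFp32_axes4` unit test further below):

```c
#include <stdio.h>

int main(void) {
  enum { NUM_AXES = 4 };
  const int in_shape[NUM_AXES] = {1, 2, 3, 4};
  int strides[NUM_AXES];

  /* innermost dimension is contiguous; each outer stride multiplies in
     the extent of the dimension to its right */
  strides[NUM_AXES - 1] = 1;
  for (int i = NUM_AXES - 2; i >= 0; --i) {
    strides[i] = in_shape[i + 1] * strides[i + 1];
  }
  /* prints "24 12 4 1", matching strides[] in the updated tests below */
  for (int i = 0; i < NUM_AXES; ++i) {
    printf("%d ", strides[i]);
  }
  printf("\n");
  return 0;
}
```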
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.cc
index 03eae3afa1..7b4b7a9c9c 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.cc
@@ -34,141 +34,52 @@ int TransposeFp16CPUKernel::Init() {
   if (!InferShapeDone()) {
     return RET_OK;
   }
-  return ReSize();
+  return TransposeCPUKernel::ReSize();
 }
 
-int TransposeFp16CPUKernel::ReSize() {
-  TransposeParameter *param = reinterpret_cast<TransposeParameter *>(this->op_parameter_);
-  num_unit_ = static_cast<int>(in_tensors_.at(kInputIndex)->shape().at(param->perm_[kNHWC_H]));
-  thread_h_num_ = MSMIN(thread_num_, num_unit_);
-  thread_h_stride_ = UP_DIV(num_unit_, thread_h_num_);
+int TransposeFp16CPUKernel::Run() {
+  MS_ASSERT(in_tensors_.size() == 1);
+  MS_ASSERT(out_tensors_.size() == 1);
   auto &in_tensor = in_tensors_.front();
   auto &out_tensor = out_tensors_.front();
-  auto in_shape = in_tensor->shape();
-  auto out_shape = out_tensor->shape();
-  param->strides_[param->num_axes_ - 1] = 1;
-  param->out_strides_[param->num_axes_ - 1] = 1;
-  param->data_size_ = in_tensor->Size();
-  for (int i = param->num_axes_ - 2; i >= 0; i--) {
-    param->strides_[i] = in_shape.at(i + 1) * param->strides_[i + 1];
-    param->out_strides_[i] = out_shape.at(i + 1) * param->out_strides_[i + 1];
+  if (in_tensor == nullptr || out_tensor == nullptr) {
+    MS_LOG(ERROR) << "null pointer referencing.";
+    return RET_ERROR;
   }
-
-  return RET_OK;
-}
-
-int TransposeFp16CPUKernel::MallocFp16Buffer() {
-  auto &in_tensor = in_tensors_.front();
-  auto &out_tensor = out_tensors_.front();
-
-  if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) {
-    fp16_in_data_ =
-      reinterpret_cast<float16_t *>(context_->allocator->Malloc(sizeof(float16_t) * in_tensor->ElementsNum()));
-    if (fp16_in_data_ == nullptr) {
+  in_data_fp16_ = reinterpret_cast<float16_t *>(in_tensor->MutableData());
+  out_data_fp16_ = reinterpret_cast<float16_t *>(out_tensor->MutableData());
+  int dims = out_tensor->shape().size();
+  if (dims > MAX_TRANSPOSE_DIM_SIZE) {
+    dim_size_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * sizeof(int)));
+    if (dim_size_ == nullptr) {
       MS_LOG(ERROR) << "Malloc data failed";
       return RET_ERROR;
     }
-  }
-  if (out_tensor->data_type() == kNumberTypeFloat || out_tensor->data_type() == kNumberTypeFloat32) {
-    fp16_out_data_ =
-      reinterpret_cast<float16_t *>(context_->allocator->Malloc(sizeof(float16_t) * out_tensor->ElementsNum()));
-    if (fp16_out_data_ == nullptr) {
+    position_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * sizeof(int)));
+    if (position_ == nullptr) {
       MS_LOG(ERROR) << "Malloc data failed";
+      context_->allocator->Free(dim_size_);
+      dim_size_ = nullptr;
       return RET_ERROR;
     }
   }
-  return RET_OK;
-}
-
-void TransposeFp16CPUKernel::FreeFp16Buffer() {
-  auto &in_tensor = in_tensors_.front();
-  auto &out_tensor = out_tensors_.front();
-
-  if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) {
-    if (fp16_in_data_ != nullptr) {
-      context_->allocator->Free(fp16_in_data_);
-      fp16_in_data_ = nullptr;
-    }
-  }
-  if (out_tensor->data_type() == kNumberTypeFloat || out_tensor->data_type() == kNumberTypeFloat32) {
-    if (fp16_out_data_ != nullptr) {
-      context_->allocator->Free(fp16_out_data_);
-      fp16_out_data_ = nullptr;
-    }
-  }
-}
-
-int TransposeFp16CPUKernel::TransposeParallel(int task_id) {
-  int num_unit_thread = MSMIN(thread_h_stride_, num_unit_ - task_id * thread_h_stride_);
-  if (num_unit_thread <= 0) {
-    return RET_OK;
-  }
-  int thread_offset = task_id * thread_h_stride_;
   TransposeParameter *param = reinterpret_cast<TransposeParameter *>(this->op_parameter_);
-
-  auto ret = Fp16DoTranspose(fp16_in_data_, fp16_out_data_, in_shape_, out_shape_, param, thread_offset,
-                             thread_offset + num_unit_thread);
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Transpose error task_id[" << task_id << "] error_code[" << ret << "]";
-    return RET_ERROR;
+  MS_ASSERT(param);
+  MS_ASSERT(in_data_fp16_);
+  MS_ASSERT(out_data_fp16_);
+  MS_ASSERT(out_shape_);
+  auto ret = Fp16DoTranspose(in_data_fp16_, out_data_fp16_, out_shape_, param, dim_size_, position_);
+  if (dims > MAX_TRANSPOSE_DIM_SIZE) {
+    context_->allocator->Free(dim_size_);
+    context_->allocator->Free(position_);
+    dim_size_ = nullptr;
+    position_ = nullptr;
   }
-
-  return RET_OK;
-}
-
-static int TransposeFp16Run(void *cdata, int task_id) {
-  auto g_kernel = reinterpret_cast<TransposeFp16CPUKernel *>(cdata);
-  auto ret = g_kernel->TransposeParallel(task_id);
   if (ret != RET_OK) {
-    MS_LOG(ERROR) << "TransposeRun error task_id[" << task_id << "] error_code[" << ret << "]";
-    return RET_OP_EXECUTE_FAILURE;
-  }
-  return RET_OK;
-}
-
-int TransposeFp16CPUKernel::Run() {
-  MS_ASSERT(in_tensors_.size() == 1);
-  MS_ASSERT(out_tensors_.size() == 1);
-  auto &in_tensor = in_tensors_.front();
-  auto &out_tensor = out_tensors_.front();
-  if (in_tensor == nullptr || out_tensor == nullptr) {
-    MS_LOG(ERROR) << "null pointer referencing.";
+    MS_LOG(ERROR) << "Transpose run failed";
     return RET_ERROR;
   }
-  // malloc when Run
-  auto ret = MallocFp16Buffer();
-  if (ret != RET_OK) {
-    FreeFp16Buffer();
-    return ret;
-  }
-
-  if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) {
-    in_data_ = reinterpret_cast<float *>(in_tensor->MutableData());
-    Float32ToFloat16(in_data_, fp16_in_data_, in_tensor->ElementsNum());
-  } else {
-    fp16_in_data_ = reinterpret_cast<float16_t *>(in_tensor->MutableData());
-  }
-  if (out_tensor->data_type() == kNumberTypeFloat16) {
-    fp16_out_data_ = reinterpret_cast<float16_t *>(out_tensor->MutableData());
-  }
-
-  memcpy(in_shape_, in_tensor->shape().data(), in_tensor->shape().size() * sizeof(int));
-  memcpy(out_shape_, out_tensor->shape().data(), out_tensor->shape().size() * sizeof(int));
-
-  ret = ParallelLaunch(this->context_->thread_pool_, TransposeFp16Run, this, thread_h_num_);
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Tranpose error error_code[" << ret << "]";
-    FreeFp16Buffer();
-    return ret;
-  }
-
-  if (out_tensor->data_type() == kNumberTypeFloat || out_tensor->data_type() == kNumberTypeFloat32) {
-    out_data_ = reinterpret_cast<float *>(out_tensor->MutableData());
-    Float16ToFloat32(fp16_out_data_, out_data_, out_tensor->ElementsNum());
-  }
-  FreeFp16Buffer();
-  return ret;
+  return RET_OK;
 }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.h
index 79251b9486..2695f29476 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.h
@@ -21,35 +21,24 @@
 #include <vector>
 #include "src/lite_kernel.h"
 #include "src/kernel_registry.h"
+#include "src/runtime/kernel/arm/fp32/transpose_fp32.h"
 
 namespace mindspore::kernel {
-class TransposeFp16CPUKernel : public LiteKernel {
+class TransposeFp16CPUKernel : public TransposeCPUKernel {
  public:
   explicit TransposeFp16CPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs,
                                   const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
                                   const mindspore::lite::PrimitiveC *primitive)
-      : LiteKernel(param, inputs, outputs, ctx, primitive), thread_num_(ctx->thread_num_) {}
+      : TransposeCPUKernel(param, inputs, outputs, ctx, primitive) {}
   ~TransposeFp16CPUKernel() = default;
 
   int Init() override;
-  int ReSize() override;
   int Run() override;
-  int TransposeParallel(int task_id);
-  void FreeFp16Buffer();
-  int MallocFp16Buffer();
 
  private:
-  int thread_num_;
-  int thread_h_stride_;
-  int thread_h_num_;
-  int num_unit_;
-  float *in_data_;
-  float *out_data_;
-  float16_t *fp16_in_data_ = nullptr;
-  float16_t *fp16_out_data_ = nullptr;
-  int in_shape_[8];
-  int out_shape_[8];
+  float16_t *in_data_fp16_ = nullptr;
+  float16_t *out_data_fp16_ = nullptr;
 };
 }  // namespace mindspore::kernel
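The fp16 kernel above and the fp32 kernel below now share the same scratch-buffer policy: the specialized Dim2..Dim5 paths need no workspace, while the generic path needs two `dims`-sized int buffers, allocated only when the rank exceeds the specialized maximum. A minimal sketch of that pattern, with plain `malloc` standing in for the kernel's `context_->allocator` and an assumed value for `MAX_TRANSPOSE_DIM_SIZE` (the real macro lives in nnacl):

```c
#include <stdlib.h>

#define MAX_TRANSPOSE_DIM_SIZE 5 /* assumed; matches the Dim2..Dim5 specializations */

int RunWithScratch(int dims) {
  int *size = NULL;
  int *position = NULL;
  /* only ranks above the specialized maximum need scratch buffers */
  if (dims > MAX_TRANSPOSE_DIM_SIZE) {
    size = (int *)malloc(dims * sizeof(int));
    if (size == NULL) {
      return -1;
    }
    position = (int *)malloc(dims * sizeof(int));
    if (position == NULL) {
      free(size);
      return -1;
    }
  }
  /* ... call DoTransposeFp32 / Fp16DoTranspose with size and position ... */
  free(size); /* free(NULL) is a no-op, so both branches are safe */
  free(position);
  return 0;
}
```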
 sizeof(int));
   return RET_OK;
 }
 
 TransposeCPUKernel::~TransposeCPUKernel() {
-  if (this->in_shape_ != nullptr) {
-    free(this->in_shape_);
-  }
   if (this->out_shape_ != nullptr) {
     free(this->out_shape_);
   }
 }
 
-int TransposeCPUKernel::TransposeParallel(int task_id) {
-  int num_unit_thread = MSMIN(thread_h_stride_, num_unit_ - task_id * thread_h_stride_);
-  if (num_unit_thread <= 0) {
-    return RET_OK;
-  }
-  int thread_offset = task_id * thread_h_stride_;
-  TransposeParameter *param = reinterpret_cast<TransposeParameter *>(this->op_parameter_);
-  MS_ASSERT(param);
-  int *size = nullptr;
-  int *position = nullptr;
-  if (this->dim_size_ != nullptr && this->position_ != nullptr) {
-    size = this->dim_size_ + task_id * param->num_axes_;
-    position = this->position_ + task_id * param->num_axes_;
-  }
-  MS_ASSERT(in_data_);
-  MS_ASSERT(out_data_);
-  MS_ASSERT(in_shape_);
-  MS_ASSERT(out_shape_);
-  auto ret = DoTransposeFp32(in_data_, out_data_, in_shape_, out_shape_, param, thread_offset,
-                             thread_offset + num_unit_thread, size, position);
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Transpose error task_id[" << task_id << "] error_code[" << ret << "]";
-    return RET_ERROR;
-  }
-  return RET_OK;
-}
-
-int TransposeFp32Run(void *cdata, int task_id) {
-  auto g_kernel = reinterpret_cast<TransposeCPUKernel *>(cdata);
-  auto ret = g_kernel->TransposeParallel(task_id);
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "TransposeRun error task_id[" << task_id << "] error_code[" << ret << "]";
-    return RET_OP_EXECUTE_FAILURE;
-  }
-  return RET_OK;
-}
-
 int TransposeCPUKernel::Run() {
   MS_ASSERT(in_tensors_.size() == 1 || in_tensors_.size() == 2);
   MS_ASSERT(out_tensors_.size() == 1);
@@ -133,12 +82,12 @@ int TransposeCPUKernel::Run() {
   out_data_ = reinterpret_cast<float *>(out_tensor->MutableData());
   int dims = out_tensor->shape().size();
   if (dims > MAX_TRANSPOSE_DIM_SIZE) {
-    dim_size_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * thread_h_num_ * sizeof(int)));
+    dim_size_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * sizeof(int)));
     if (dim_size_ == nullptr) {
       MS_LOG(ERROR) << "Malloc data failed";
       return RET_ERROR;
     }
-    position_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * thread_h_num_ * sizeof(int)));
+    position_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * sizeof(int)));
     if (position_ == nullptr) {
       MS_LOG(ERROR) << "Malloc data failed";
       context_->allocator->Free(dim_size_);
@@ -147,7 +96,12 @@ int TransposeCPUKernel::Run() {
     }
   }
 
-  auto ret = ParallelLaunch(this->context_->thread_pool_, TransposeFp32Run, this, thread_h_num_);
+  TransposeParameter *param = reinterpret_cast<TransposeParameter *>(this->op_parameter_);
+  MS_ASSERT(param);
+  MS_ASSERT(in_data_);
+  MS_ASSERT(out_data_);
+  MS_ASSERT(out_shape_);
+  auto ret = DoTransposeFp32(in_data_, out_data_, out_shape_, param, dim_size_, position_);
   if (dims > MAX_TRANSPOSE_DIM_SIZE) {
     context_->allocator->Free(dim_size_);
     context_->allocator->Free(position_);
@@ -155,9 +109,10 @@ int TransposeCPUKernel::Run() {
     position_ = nullptr;
   }
   if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Tranpose error error_code[" << ret << "]";
-    return ret;
+    MS_LOG(ERROR) << "Transpose run failed";
+    return RET_ERROR;
   }
+  return ret;
 }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.h
index c459cfa6ee..569a955a18 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.h
@@ -29,22 +29,16 @@ class TransposeCPUKernel : public LiteKernel {
   explicit TransposeCPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs,
                               const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
                               const mindspore::lite::PrimitiveC *primitive)
-      : LiteKernel(param, inputs, outputs, ctx, primitive), thread_num_(ctx->thread_num_) {}
+      : LiteKernel(param, inputs, outputs, ctx, primitive) {}
   ~TransposeCPUKernel() override;
 
   int Init() override;
   int ReSize() override;
   int Run() override;
-  int TransposeParallel(int task_id);
 
- private:
-  int thread_num_ = 1;
-  int thread_h_stride_ = 0;
-  int thread_h_num_ = 0;
-  int num_unit_ = 0;
+ protected:
   float *in_data_ = nullptr;
   float *out_data_ = nullptr;
-  int *in_shape_ = nullptr;
   int *out_shape_ = nullptr;
   int *dim_size_ = nullptr;
   int *position_ = nullptr;
diff --git a/mindspore/lite/test/models_tflite_fp16.cfg b/mindspore/lite/test/models_tflite_fp16.cfg
index eefb04f46e..c337f9041b 100644
--- a/mindspore/lite/test/models_tflite_fp16.cfg
+++ b/mindspore/lite/test/models_tflite_fp16.cfg
@@ -131,4 +131,8 @@ smartreply.tflite 0.1
 mindspore_text_classification_tflite.tflite 4
 #ml_location.tflite 0.1
 ml_text_correction.tflite 1
+# ml_pic_shopping.tflite involves subtracting two close numbers.
+# In the fp16 case, such a subtraction causes a large relative error compared with fp32.
+# e.g. fp32: 27.5 - 27.4 = 0.1
+#      fp16: 27.6 - 27.4 = 0.2
 #ml_pic_shopping.tflite 0.1
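The cancellation effect described in the comment above is easy to reproduce. A small demonstration, assuming a compiler and target with `_Float16` support (for example recent Clang or GCC on AArch64); the operand values are invented:

```c
#include <stdio.h>

int main(void) {
  float a32 = 1.2345f;
  float b32 = 1.2339f;
  /* Both operands round to the same fp16 value (1.234375), so the small
     difference that fp32 still resolves collapses to zero in fp16. */
  _Float16 a16 = (_Float16)a32;
  _Float16 b16 = (_Float16)b32;
  printf("fp32 diff: %.6f\n", (double)(a32 - b32)); /* ~0.000600 */
  printf("fp16 diff: %.6f\n", (double)(a16 - b16)); /* 0.000000 */
  return 0;
}
```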
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/transpose_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/transpose_fp32_tests.cc
index 13748becd5..a409819f7e 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/transpose_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/transpose_fp32_tests.cc
@@ -43,7 +43,6 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes4) {
                       1.2791597,   -1.02032341, 0.17405411, -0.66358529, 1.20223761, -1.65733338,
                       -0.36793608, 1.91074871,  0.42663834, 1.8033383,   0.30183748, 0.3952082};
 
-  int input_shape[4] = {1, 2, 3, 4};
   int output_shape[4] = {4, 3, 2, 1};
   int perm[8] = {3, 2, 1, 0, 0, 0, 0, 0};
   int strides[8] = {24, 12, 4, 1, 1, 1, 1, 1};
@@ -64,7 +63,7 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes4) {
     param->out_strides_[i] = out_strides[i];
   }
 
-  auto ret = DoTransposeFp32(in, out, input_shape, output_shape, param, 0, 3, nullptr, nullptr);
+  auto ret = DoTransposeFp32(in, out, output_shape, param, nullptr, nullptr);
   ASSERT_EQ(ret, 0);
   delete param;
   ASSERT_EQ(0, CompareOutputData(out, correct, 24, 0.000001));
@@ -83,7 +82,6 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes3) {
                       -0.52817175, 1.13376944, 1.74481176,  0.04221375,  1.46210794, 0.90159072,
                       -1.07296862, -1.09989127, -0.7612069, 0.58281521, -2.06014071, 0.50249434};
 
-  int input_shape[3] = {2, 3, 4};
   int output_shape[3] = {4, 3, 2};
   int perm[8] = {2, 1, 0, 0, 0, 0, 0, 0};
   int strides[8] = {12, 4, 1, 1, 1, 1, 1, 1};
@@ -104,7 +102,7 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes3) {
     param->out_strides_[i] = out_strides[i];
   }
 
-  auto ret = DoTransposeFp32(in, out, input_shape, output_shape, param, 0, 3, nullptr, nullptr);
+  auto ret = DoTransposeFp32(in, out, output_shape, param, nullptr, nullptr);
   ASSERT_EQ(ret, 0);
   delete param;
   ASSERT_EQ(0, CompareOutputData(out, correct, 24, 0.000001));
@@ -123,7 +121,6 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes2) {
                       -0.52817175, 1.74481176,  1.46210794,  1.13376944,  0.04221375, 0.90159072,
                       -1.07296862, -0.7612069,  -2.06014071, -1.09989127, 0.58281521, 0.50249434};
 
-  int input_shape[2] = {6, 4};
   int output_shape[2] = {4, 6};
   int perm[8] = {1, 0, 0, 0, 0, 0, 0, 0};
   int strides[8] = {4, 1, 1, 1, 1, 1, 1, 1};
@@ -145,7 +142,7 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes2) {
     param->out_strides_[i] = out_strides[i];
   }
 
-  auto ret = DoTransposeFp32(in, out, input_shape, output_shape, param, 0, 6, nullptr, nullptr);
+  auto ret = DoTransposeFp32(in, out, output_shape, param, nullptr, nullptr);
   ASSERT_EQ(ret, 0);
   delete param;
   ASSERT_EQ(0, CompareOutputData(out, correct, 24, 0.000001));
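Note that the updated tests still exercise only the 2D to 4D fast paths, passing `nullptr` for both scratch buffers. A hedged sketch of how the new `num_axes > 5` path could be driven, assuming the `TransposeParameter` fields shown in the old fp16 header (`perm_[8]`, `strides_[8]`, `out_strides_[8]`); the shapes and values are invented for illustration:

```c
#include <stdlib.h>
#include <string.h>
#include "nnacl/fp32/transpose_fp32.h"

void Transpose6DSketch(void) {
  enum { DIMS = 6, COUNT = 24 };
  const int in_shape[DIMS] = {1, 2, 1, 2, 3, 2};
  const int out_shape[DIMS] = {2, 3, 2, 1, 2, 1}; /* full reversal of in_shape */

  TransposeParameter param;
  memset(&param, 0, sizeof(param));
  param.num_axes_ = DIMS;
  param.data_size_ = COUNT * (int)sizeof(float);
  for (int i = 0; i < DIMS; ++i) {
    param.perm_[i] = DIMS - 1 - i; /* reverse all axes */
  }
  param.strides_[DIMS - 1] = 1;
  param.out_strides_[DIMS - 1] = 1;
  for (int i = DIMS - 2; i >= 0; --i) {
    param.strides_[i] = in_shape[i + 1] * param.strides_[i + 1];
    param.out_strides_[i] = out_shape[i + 1] * param.out_strides_[i + 1];
  }

  float in[COUNT];
  float out[COUNT];
  for (int i = 0; i < COUNT; ++i) {
    in[i] = (float)i;
  }
  /* num_axes > 5 dispatches to TransposeDimsFp32, which requires the two
     caller-provided scratch buffers of DIMS ints each. */
  int *size = (int *)malloc(DIMS * sizeof(int));
  int *position = (int *)malloc(DIMS * sizeof(int));
  if (size != NULL && position != NULL) {
    DoTransposeFp32(in, out, out_shape, &param, size, position);
  }
  free(size);
  free(position);
}
```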