Browse Source

!9813 remove redundant param in transpose

From: @zhaozhenlong
Reviewed-by: @zhang_xue_tong,@zhanghaibo5
Signed-off-by: @zhang_xue_tong
tags/v1.1.0
mindspore-ci-bot Gitee 5 years ago
parent
commit
6e9fd1ef95
10 changed files with 116 additions and 261 deletions
  1. +44
    -15
      mindspore/lite/nnacl/fp16/transpose_fp16.c
  2. +3
    -25
      mindspore/lite/nnacl/fp16/transpose_fp16.h
  3. +12
    -14
      mindspore/lite/nnacl/fp32/transpose_fp32.c
  4. +2
    -2
      mindspore/lite/nnacl/fp32/transpose_fp32.h
  5. +28
    -117
      mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.cc
  6. +5
    -16
      mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.h
  7. +13
    -58
      mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.cc
  8. +2
    -8
      mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.h
  9. +4
    -0
      mindspore/lite/test/models_tflite_fp16.cfg
  10. +3
    -6
      mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/transpose_fp32_tests.cc

+ 44
- 15
mindspore/lite/nnacl/fp16/transpose_fp16.c View File

@@ -18,8 +18,8 @@
#include <string.h> #include <string.h>
#include "nnacl/errorcode.h" #include "nnacl/errorcode.h"


void Fp16TransposeDim2(float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm,
int *output_shape, int h_start, int h_end) {
void Fp16TransposeDim2(const float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm,
const int *output_shape) {
const int stride0 = strides[perm[0]]; const int stride0 = strides[perm[0]];
const int stride1 = strides[perm[1]]; const int stride1 = strides[perm[1]];
const int output0 = output_shape[0]; const int output0 = output_shape[0];
@@ -33,8 +33,8 @@ void Fp16TransposeDim2(float16_t *in_data, float16_t *out_data, int *strides, in
} }
} }


void Fp16TransposeDim3(float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm,
int *output_shape, int h_start, int h_end) {
void Fp16TransposeDim3(const float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm,
const int *output_shape) {
const int stride0 = strides[perm[0]]; const int stride0 = strides[perm[0]];
const int stride1 = strides[perm[1]]; const int stride1 = strides[perm[1]];
const int stride2 = strides[perm[2]]; const int stride2 = strides[perm[2]];
@@ -56,8 +56,8 @@ void Fp16TransposeDim3(float16_t *in_data, float16_t *out_data, int *strides, in
} }
} }


void Fp16TransposeDim4(float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm,
int *output_shape, int h_start, int h_end) {
void Fp16TransposeDim4(const float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm,
const int *output_shape) {
const int stride0 = strides[perm[0]]; const int stride0 = strides[perm[0]];
const int stride1 = strides[perm[1]]; const int stride1 = strides[perm[1]];
const int stride2 = strides[perm[2]]; const int stride2 = strides[perm[2]];
@@ -88,8 +88,8 @@ void Fp16TransposeDim4(float16_t *in_data, float16_t *out_data, int *strides, in
} }
} }


void Fp16TransposeDim5(float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm,
int *output_shape, int h_start, int h_end) {
void Fp16TransposeDim5(const float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm,
const int *output_shape) {
const int stride0 = strides[perm[0]]; const int stride0 = strides[perm[0]];
const int stride1 = strides[perm[1]]; const int stride1 = strides[perm[1]];
const int stride2 = strides[perm[2]]; const int stride2 = strides[perm[2]];
@@ -127,8 +127,30 @@ void Fp16TransposeDim5(float16_t *in_data, float16_t *out_data, int *strides, in
} }
} }


int Fp16DoTranspose(float16_t *in_data, float16_t *out_data, int *input_shape, int *output_shape,
TransposeParameter *transpose_param, int h_start, int h_end) {
void TransposeDimsFp16(const float16_t *in_data, float16_t *out_data, const int *strides, const int *out_strides,
const int *perm, const int *output_shape, int dims, int *size, int *position) {
*(size + dims - 1) = 1;
for (int i = dims - 1; i > 0; --i) {
*(size + i - 1) = *(size + i) * output_shape[i];
}

for (size_t idx = 0; idx < (*size) * output_shape[0]; ++idx) {
int pos = idx;
int output_idx = 0;
int input_idx = 0;
for (int i = 0; i < dims; ++i) {
*(position + i) = pos / *(size + i);
int out_stride = i < dims - 1 ? out_strides[i] : 1;
output_idx += (*(position + i) * out_stride);
input_idx += (*(position + i) * strides[perm[i]]);
pos -= *(position + i) * (*(size + i));
}
out_data[output_idx] = in_data[input_idx];
}
}

int Fp16DoTranspose(const float16_t *in_data, float16_t *out_data, const int *output_shape,
TransposeParameter *transpose_param, int *size, int *position) {
if (in_data == NULL || out_data == NULL) { if (in_data == NULL || out_data == NULL) {
return NNACL_ERR; return NNACL_ERR;
} }
@@ -138,7 +160,7 @@ int Fp16DoTranspose(float16_t *in_data, float16_t *out_data, int *input_shape, i
int data_size = transpose_param->data_size_; int data_size = transpose_param->data_size_;
int num_axes = transpose_param->num_axes_; int num_axes = transpose_param->num_axes_;


if (num_axes < 2 || num_axes > 5) {
if (num_axes < 2) {
return NNACL_ERR; return NNACL_ERR;
} }


@@ -155,14 +177,21 @@ int Fp16DoTranspose(float16_t *in_data, float16_t *out_data, int *input_shape, i
(void)memcpy(out_data, in_data, data_size); (void)memcpy(out_data, in_data, data_size);
return NNACL_OK; return NNACL_OK;
} }
for (int i = 0; i < num_axes; ++i) {
if (perm[i] < 0) {
return NNACL_PARAM_INVALID;
}
}
if (num_axes == 2) { if (num_axes == 2) {
Fp16TransposeDim2(in_data, out_data, strides, out_strides, perm, output_shape, h_start, h_end);
Fp16TransposeDim2(in_data, out_data, strides, out_strides, perm, output_shape);
} else if (num_axes == 3) { } else if (num_axes == 3) {
Fp16TransposeDim3(in_data, out_data, strides, out_strides, perm, output_shape, h_start, h_end);
Fp16TransposeDim3(in_data, out_data, strides, out_strides, perm, output_shape);
} else if (num_axes == 4) { } else if (num_axes == 4) {
Fp16TransposeDim4(in_data, out_data, strides, out_strides, perm, output_shape, h_start, h_end);
Fp16TransposeDim4(in_data, out_data, strides, out_strides, perm, output_shape);
} else if (num_axes == 5) { } else if (num_axes == 5) {
Fp16TransposeDim5(in_data, out_data, strides, out_strides, perm, output_shape, h_start, h_end);
Fp16TransposeDim5(in_data, out_data, strides, out_strides, perm, output_shape);
} else {
TransposeDimsFp16(in_data, out_data, strides, out_strides, perm, output_shape, num_axes, size, position);
} }
return NNACL_OK; return NNACL_OK;
} }

+ 3
- 25
mindspore/lite/nnacl/fp16/transpose_fp16.h View File

@@ -18,38 +18,16 @@
#define MINDSPORE_LITE_NNACL_FP16_TRANSPOSE_FP16_H_ #define MINDSPORE_LITE_NNACL_FP16_TRANSPOSE_FP16_H_


#include "nnacl/op_base.h" #include "nnacl/op_base.h"
#include "nnacl/transpose.h"
#ifdef ENABLE_NEON #ifdef ENABLE_NEON
#include <arm_neon.h> #include <arm_neon.h>
#endif #endif


typedef struct TransposeParameter {
// primitive parameter
OpParameter op_parameter_;
int perm_[8];
bool conjugate_;

// shape correlative
int strides_[8];
int out_strides_[8];

// other parameter
int num_axes_;
int data_size_;
} TransposeParameter;

#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
int Fp16DoTranspose(float16_t *in_data, float16_t *out_data, int *input_shape, int *output_shape,
TransposeParameter *transpose_param, int h_start, int h_end);
void TransposeDim2(float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm,
int *output_shape, int h_start, int h_end);
void TransposeDim3(float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm,
int *output_shape, int h_start, int h_end);
void TransposeDim4(float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm,
int *output_shape, int h_start, int h_end);
void TransposeDim5(float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm,
int *output_shape, int h_start, int h_end);
int Fp16DoTranspose(const float16_t *in_data, float16_t *out_data, const int *output_shape,
TransposeParameter *transpose_param, int *size, int *position);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif


+ 12
- 14
mindspore/lite/nnacl/fp32/transpose_fp32.c View File

@@ -17,7 +17,7 @@
#include "nnacl/fp32/transpose_fp32.h" #include "nnacl/fp32/transpose_fp32.h"


void TransposeDim2Fp32(const float *in_data, float *out_data, const int *strides, int *out_strides, const int *perm, void TransposeDim2Fp32(const float *in_data, float *out_data, const int *strides, int *out_strides, const int *perm,
const int *output_shape, int h_start, int h_end) {
const int *output_shape) {
const int stride0 = strides[perm[0]]; const int stride0 = strides[perm[0]];
const int stride1 = strides[perm[1]]; const int stride1 = strides[perm[1]];
const int output0 = output_shape[0]; const int output0 = output_shape[0];
@@ -32,7 +32,7 @@ void TransposeDim2Fp32(const float *in_data, float *out_data, const int *strides
} }


void TransposeDim3Fp32(const float *in_data, float *out_data, const int *strides, const int *out_strides, void TransposeDim3Fp32(const float *in_data, float *out_data, const int *strides, const int *out_strides,
const int *perm, const int *output_shape, int h_start, int h_end) {
const int *perm, const int *output_shape) {
const int stride0 = strides[perm[0]]; const int stride0 = strides[perm[0]];
const int stride1 = strides[perm[1]]; const int stride1 = strides[perm[1]];
const int stride2 = strides[perm[2]]; const int stride2 = strides[perm[2]];
@@ -55,7 +55,7 @@ void TransposeDim3Fp32(const float *in_data, float *out_data, const int *strides
} }


void TransposeDim4Fp32(const float *in_data, float *out_data, const int *strides, const int *out_strides, void TransposeDim4Fp32(const float *in_data, float *out_data, const int *strides, const int *out_strides,
const int *perm, const int *output_shape, int h_start, int h_end) {
const int *perm, const int *output_shape) {
const int stride0 = strides[perm[0]]; const int stride0 = strides[perm[0]];
const int stride1 = strides[perm[1]]; const int stride1 = strides[perm[1]];
const int stride2 = strides[perm[2]]; const int stride2 = strides[perm[2]];
@@ -87,7 +87,7 @@ void TransposeDim4Fp32(const float *in_data, float *out_data, const int *strides
} }


void TransposeDim5Fp32(const float *in_data, float *out_data, const int *strides, const int *out_strides, void TransposeDim5Fp32(const float *in_data, float *out_data, const int *strides, const int *out_strides,
const int *perm, const int *output_shape, int h_start, int h_end) {
const int *perm, const int *output_shape) {
const int stride0 = strides[perm[0]]; const int stride0 = strides[perm[0]];
const int stride1 = strides[perm[1]]; const int stride1 = strides[perm[1]];
const int stride2 = strides[perm[2]]; const int stride2 = strides[perm[2]];
@@ -126,8 +126,7 @@ void TransposeDim5Fp32(const float *in_data, float *out_data, const int *strides
} }


void TransposeDimsFp32(const float *in_data, float *out_data, const int *strides, const int *out_strides, void TransposeDimsFp32(const float *in_data, float *out_data, const int *strides, const int *out_strides,
const int *perm, const int *output_shape, int h_start, int h_end, int dims, int *size,
int *position) {
const int *perm, const int *output_shape, int dims, int *size, int *position) {
*(size + dims - 1) = 1; *(size + dims - 1) = 1;
for (int i = dims - 1; i > 0; --i) { for (int i = dims - 1; i > 0; --i) {
*(size + i - 1) = *(size + i) * output_shape[i]; *(size + i - 1) = *(size + i) * output_shape[i];
@@ -148,8 +147,8 @@ void TransposeDimsFp32(const float *in_data, float *out_data, const int *strides
} }
} }


int DoTransposeFp32(const float *in_data, float *out_data, int *input_shape, const int *output_shape,
TransposeParameter *transpose_param, int h_start, int h_end, int *size, int *position) {
int DoTransposeFp32(const float *in_data, float *out_data, const int *output_shape, TransposeParameter *transpose_param,
int *size, int *position) {
if (in_data == NULL || out_data == NULL) { if (in_data == NULL || out_data == NULL) {
return NNACL_ERR; return NNACL_ERR;
} }
@@ -182,16 +181,15 @@ int DoTransposeFp32(const float *in_data, float *out_data, int *input_shape, con
} }
} }
if (num_axes == 2) { if (num_axes == 2) {
TransposeDim2Fp32(in_data, out_data, strides, out_strides, perm, output_shape, h_start, h_end);
TransposeDim2Fp32(in_data, out_data, strides, out_strides, perm, output_shape);
} else if (num_axes == 3) { } else if (num_axes == 3) {
TransposeDim3Fp32(in_data, out_data, strides, out_strides, perm, output_shape, h_start, h_end);
TransposeDim3Fp32(in_data, out_data, strides, out_strides, perm, output_shape);
} else if (num_axes == 4) { } else if (num_axes == 4) {
TransposeDim4Fp32(in_data, out_data, strides, out_strides, perm, output_shape, h_start, h_end);
TransposeDim4Fp32(in_data, out_data, strides, out_strides, perm, output_shape);
} else if (num_axes == 5) { } else if (num_axes == 5) {
TransposeDim5Fp32(in_data, out_data, strides, out_strides, perm, output_shape, h_start, h_end);
TransposeDim5Fp32(in_data, out_data, strides, out_strides, perm, output_shape);
} else { } else {
TransposeDimsFp32(in_data, out_data, strides, out_strides, perm, output_shape, h_start, h_end, num_axes, size,
position);
TransposeDimsFp32(in_data, out_data, strides, out_strides, perm, output_shape, num_axes, size, position);
} }
return NNACL_OK; return NNACL_OK;
} }

+ 2
- 2
mindspore/lite/nnacl/fp32/transpose_fp32.h View File

@@ -25,8 +25,8 @@
extern "C" { extern "C" {
#endif #endif


int DoTransposeFp32(const float *in_data, float *out_data, int *input_shape, const int *output_shape,
TransposeParameter *transpose_param, int h_start, int h_end, int *size, int *position);
int DoTransposeFp32(const float *in_data, float *out_data, const int *output_shape, TransposeParameter *transpose_param,
int *size, int *position);


#ifdef __cplusplus #ifdef __cplusplus
} }


+ 28
- 117
mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.cc View File

@@ -34,141 +34,52 @@ int TransposeFp16CPUKernel::Init() {
if (!InferShapeDone()) { if (!InferShapeDone()) {
return RET_OK; return RET_OK;
} }
return ReSize();
return TransposeCPUKernel::ReSize();
} }


int TransposeFp16CPUKernel::ReSize() {
TransposeParameter *param = reinterpret_cast<TransposeParameter *>(this->op_parameter_);
num_unit_ = static_cast<int>(in_tensors_.at(kInputIndex)->shape().at(param->perm_[kNHWC_H]));
thread_h_num_ = MSMIN(thread_num_, num_unit_);
thread_h_stride_ = UP_DIV(num_unit_, thread_h_num_);
int TransposeFp16CPUKernel::Run() {
MS_ASSERT(in_tensors_.size() == 1);
MS_ASSERT(out_tensors_.size() == 1);
auto &in_tensor = in_tensors_.front(); auto &in_tensor = in_tensors_.front();
auto &out_tensor = out_tensors_.front(); auto &out_tensor = out_tensors_.front();
auto in_shape = in_tensor->shape();
auto out_shape = out_tensor->shape();
param->strides_[param->num_axes_ - 1] = 1;
param->out_strides_[param->num_axes_ - 1] = 1;
param->data_size_ = in_tensor->Size();
for (int i = param->num_axes_ - 2; i >= 0; i--) {
param->strides_[i] = in_shape.at(i + 1) * param->strides_[i + 1];
param->out_strides_[i] = out_shape.at(i + 1) * param->out_strides_[i + 1];
if (in_tensor == nullptr || out_tensor == nullptr) {
MS_LOG(ERROR) << "null pointer referencing.";
return RET_ERROR;
} }

return RET_OK;
}

int TransposeFp16CPUKernel::MallocFp16Buffer() {
auto &in_tensor = in_tensors_.front();
auto &out_tensor = out_tensors_.front();

if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) {
fp16_in_data_ =
reinterpret_cast<float16_t *>(context_->allocator->Malloc(sizeof(float16_t) * in_tensor->ElementsNum()));
if (fp16_in_data_ == nullptr) {
in_data_fp16_ = reinterpret_cast<float16_t *>(in_tensor->MutableData());
out_data_fp16_ = reinterpret_cast<float16_t *>(out_tensor->MutableData());
int dims = out_tensor->shape().size();
if (dims > MAX_TRANSPOSE_DIM_SIZE) {
dim_size_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * sizeof(int)));
if (dim_size_ == nullptr) {
MS_LOG(ERROR) << "Malloc data failed"; MS_LOG(ERROR) << "Malloc data failed";
return RET_ERROR; return RET_ERROR;
} }
}
if (out_tensor->data_type() == kNumberTypeFloat || out_tensor->data_type() == kNumberTypeFloat32) {
fp16_out_data_ =
reinterpret_cast<float16_t *>(context_->allocator->Malloc(sizeof(float16_t) * out_tensor->ElementsNum()));
if (fp16_out_data_ == nullptr) {
position_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * sizeof(int)));
if (position_ == nullptr) {
MS_LOG(ERROR) << "Malloc data failed"; MS_LOG(ERROR) << "Malloc data failed";
context_->allocator->Free(dim_size_);
dim_size_ = nullptr;
return RET_ERROR; return RET_ERROR;
} }
} }
return RET_OK;
}

void TransposeFp16CPUKernel::FreeFp16Buffer() {
auto &in_tensor = in_tensors_.front();
auto &out_tensor = out_tensors_.front();

if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) {
if (fp16_in_data_ != nullptr) {
context_->allocator->Free(fp16_in_data_);
fp16_in_data_ = nullptr;
}
}
if (out_tensor->data_type() == kNumberTypeFloat || out_tensor->data_type() == kNumberTypeFloat32) {
if (fp16_out_data_ != nullptr) {
context_->allocator->Free(fp16_out_data_);
fp16_out_data_ = nullptr;
}
}
}

int TransposeFp16CPUKernel::TransposeParallel(int task_id) {
int num_unit_thread = MSMIN(thread_h_stride_, num_unit_ - task_id * thread_h_stride_);
if (num_unit_thread <= 0) {
return RET_OK;
}
int thread_offset = task_id * thread_h_stride_;
TransposeParameter *param = reinterpret_cast<TransposeParameter *>(this->op_parameter_); TransposeParameter *param = reinterpret_cast<TransposeParameter *>(this->op_parameter_);

auto ret = Fp16DoTranspose(fp16_in_data_, fp16_out_data_, in_shape_, out_shape_, param, thread_offset,
thread_offset + num_unit_thread);
if (ret != RET_OK) {
MS_LOG(ERROR) << "Transpose error task_id[" << task_id << "] error_code[" << ret << "]";
return RET_ERROR;
MS_ASSERT(param);
MS_ASSERT(in_data_fp16_);
MS_ASSERT(out_data_fp16_);
MS_ASSERT(out_shape_);
auto ret = Fp16DoTranspose(in_data_fp16_, out_data_fp16_, out_shape_, param, dim_size_, position_);
if (dims > MAX_TRANSPOSE_DIM_SIZE) {
context_->allocator->Free(dim_size_);
context_->allocator->Free(position_);
dim_size_ = nullptr;
position_ = nullptr;
} }

return RET_OK;
}

static int TransposeFp16Run(void *cdata, int task_id) {
auto g_kernel = reinterpret_cast<TransposeFp16CPUKernel *>(cdata);
auto ret = g_kernel->TransposeParallel(task_id);
if (ret != RET_OK) { if (ret != RET_OK) {
MS_LOG(ERROR) << "TransposeRun error task_id[" << task_id << "] error_code[" << ret << "]";
return RET_OP_EXECUTE_FAILURE;
}
return RET_OK;
}

int TransposeFp16CPUKernel::Run() {
MS_ASSERT(in_tensors_.size() == 1);
MS_ASSERT(out_tensors_.size() == 1);
auto &in_tensor = in_tensors_.front();
auto &out_tensor = out_tensors_.front();
if (in_tensor == nullptr || out_tensor == nullptr) {
MS_LOG(ERROR) << "null pointer referencing.";
MS_LOG(ERROR) << "Transpose run failed";
return RET_ERROR; return RET_ERROR;
} }


// malloc when Run
auto ret = MallocFp16Buffer();
if (ret != RET_OK) {
FreeFp16Buffer();
return ret;
}

if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) {
in_data_ = reinterpret_cast<float *>(in_tensor->MutableData());
Float32ToFloat16(in_data_, fp16_in_data_, in_tensor->ElementsNum());
} else {
fp16_in_data_ = reinterpret_cast<float16_t *>(in_tensor->MutableData());
}
if (out_tensor->data_type() == kNumberTypeFloat16) {
fp16_out_data_ = reinterpret_cast<float16_t *>(out_tensor->MutableData());
}

memcpy(in_shape_, in_tensor->shape().data(), in_tensor->shape().size() * sizeof(int));
memcpy(out_shape_, out_tensor->shape().data(), out_tensor->shape().size() * sizeof(int));

ret = ParallelLaunch(this->context_->thread_pool_, TransposeFp16Run, this, thread_h_num_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "Tranpose error error_code[" << ret << "]";
FreeFp16Buffer();
return ret;
}

if (out_tensor->data_type() == kNumberTypeFloat || out_tensor->data_type() == kNumberTypeFloat32) {
out_data_ = reinterpret_cast<float *>(out_tensor->MutableData());
Float16ToFloat32(fp16_out_data_, out_data_, out_tensor->ElementsNum());
}
FreeFp16Buffer();

return ret; return ret;
} }




+ 5
- 16
mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.h View File

@@ -21,35 +21,24 @@
#include <vector> #include <vector>
#include "src/lite_kernel.h" #include "src/lite_kernel.h"
#include "src/kernel_registry.h" #include "src/kernel_registry.h"
#include "src/runtime/kernel/arm/fp32/transpose_fp32.h"


namespace mindspore::kernel { namespace mindspore::kernel {


class TransposeFp16CPUKernel : public LiteKernel {
class TransposeFp16CPUKernel : public TransposeCPUKernel {
public: public:
explicit TransposeFp16CPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs, explicit TransposeFp16CPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx, const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive) const mindspore::lite::PrimitiveC *primitive)
: LiteKernel(param, inputs, outputs, ctx, primitive), thread_num_(ctx->thread_num_) {}
: TransposeCPUKernel(param, inputs, outputs, ctx, primitive) {}
~TransposeFp16CPUKernel() = default; ~TransposeFp16CPUKernel() = default;


int Init() override; int Init() override;
int ReSize() override;
int Run() override; int Run() override;
int TransposeParallel(int task_id);
void FreeFp16Buffer();
int MallocFp16Buffer();


private: private:
int thread_num_;
int thread_h_stride_;
int thread_h_num_;
int num_unit_;
float *in_data_;
float *out_data_;
float16_t *fp16_in_data_ = nullptr;
float16_t *fp16_out_data_ = nullptr;
int in_shape_[8];
int out_shape_[8];
float16_t *in_data_fp16_ = nullptr;
float16_t *out_data_fp16_ = nullptr;
}; };
} // namespace mindspore::kernel } // namespace mindspore::kernel




+ 13
- 58
mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.cc View File

@@ -36,9 +36,6 @@ int TransposeCPUKernel::Init() {


int TransposeCPUKernel::ReSize() { int TransposeCPUKernel::ReSize() {
TransposeParameter *param = reinterpret_cast<TransposeParameter *>(op_parameter_); TransposeParameter *param = reinterpret_cast<TransposeParameter *>(op_parameter_);
num_unit_ = static_cast<int>(in_tensors_.at(kInputIndex)->shape().at(param->perm_[kNHWC_H]));
thread_h_num_ = MSMIN(thread_num_, num_unit_);
thread_h_stride_ = UP_DIV(num_unit_, thread_h_num_);


auto &inTensor = in_tensors_.front(); auto &inTensor = in_tensors_.front();
auto &outTensor = out_tensors_.front(); auto &outTensor = out_tensors_.front();
@@ -51,75 +48,27 @@ int TransposeCPUKernel::ReSize() {
param->strides_[i] = in_shape.at(i + 1) * param->strides_[i + 1]; param->strides_[i] = in_shape.at(i + 1) * param->strides_[i + 1];
param->out_strides_[i] = out_shape.at(i + 1) * param->out_strides_[i + 1]; param->out_strides_[i] = out_shape.at(i + 1) * param->out_strides_[i + 1];
} }
if (this->in_shape_ != nullptr) {
free(this->in_shape_);
in_shape_ = nullptr;
}

if (this->out_shape_ != nullptr) { if (this->out_shape_ != nullptr) {
free(this->out_shape_); free(this->out_shape_);
this->out_shape_ = nullptr; this->out_shape_ = nullptr;
} }
in_shape_ = reinterpret_cast<int *>(malloc(in_shape.size() * sizeof(int)));
if (in_shape_ == nullptr) {
MS_LOG(ERROR) << "malloc in_shape_ failed.";
return RET_ERROR;
}

out_shape_ = reinterpret_cast<int *>(malloc(out_shape.size() * sizeof(int))); out_shape_ = reinterpret_cast<int *>(malloc(out_shape.size() * sizeof(int)));
if (out_shape_ == nullptr) { if (out_shape_ == nullptr) {
MS_LOG(ERROR) << "malloc out_shape_ failed."; MS_LOG(ERROR) << "malloc out_shape_ failed.";
return RET_ERROR; return RET_ERROR;
} }
memcpy(in_shape_, in_shape.data(), in_shape.size() * sizeof(int));
memcpy(out_shape_, out_shape.data(), in_shape.size() * sizeof(int)); memcpy(out_shape_, out_shape.data(), in_shape.size() * sizeof(int));
return RET_OK; return RET_OK;
} }


TransposeCPUKernel::~TransposeCPUKernel() { TransposeCPUKernel::~TransposeCPUKernel() {
if (this->in_shape_ != nullptr) {
free(this->in_shape_);
}
if (this->out_shape_ != nullptr) { if (this->out_shape_ != nullptr) {
free(this->out_shape_); free(this->out_shape_);
} }
} }


int TransposeCPUKernel::TransposeParallel(int task_id) {
int num_unit_thread = MSMIN(thread_h_stride_, num_unit_ - task_id * thread_h_stride_);
if (num_unit_thread <= 0) {
return RET_OK;
}
int thread_offset = task_id * thread_h_stride_;
TransposeParameter *param = reinterpret_cast<TransposeParameter *>(this->op_parameter_);
MS_ASSERT(param);
int *size = nullptr;
int *position = nullptr;
if (this->dim_size_ != nullptr && this->position_ != nullptr) {
size = this->dim_size_ + task_id * param->num_axes_;
position = this->position_ + task_id * param->num_axes_;
}
MS_ASSERT(in_data_);
MS_ASSERT(out_data_);
MS_ASSERT(in_shape_);
MS_ASSERT(out_shape_);
auto ret = DoTransposeFp32(in_data_, out_data_, in_shape_, out_shape_, param, thread_offset,
thread_offset + num_unit_thread, size, position);
if (ret != RET_OK) {
MS_LOG(ERROR) << "Transpose error task_id[" << task_id << "] error_code[" << ret << "]";
return RET_ERROR;
}
return RET_OK;
}

int TransposeFp32Run(void *cdata, int task_id) {
auto g_kernel = reinterpret_cast<TransposeCPUKernel *>(cdata);
auto ret = g_kernel->TransposeParallel(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "TransposeRun error task_id[" << task_id << "] error_code[" << ret << "]";
return RET_OP_EXECUTE_FAILURE;
}
return RET_OK;
}

int TransposeCPUKernel::Run() { int TransposeCPUKernel::Run() {
MS_ASSERT(in_tensors_.size() == 1 || in_tensors_.size() == 2); MS_ASSERT(in_tensors_.size() == 1 || in_tensors_.size() == 2);
MS_ASSERT(out_tensors_.size() == 1); MS_ASSERT(out_tensors_.size() == 1);
@@ -133,12 +82,12 @@ int TransposeCPUKernel::Run() {
out_data_ = reinterpret_cast<float *>(out_tensor->MutableData()); out_data_ = reinterpret_cast<float *>(out_tensor->MutableData());
int dims = out_tensor->shape().size(); int dims = out_tensor->shape().size();
if (dims > MAX_TRANSPOSE_DIM_SIZE) { if (dims > MAX_TRANSPOSE_DIM_SIZE) {
dim_size_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * thread_h_num_ * sizeof(int)));
dim_size_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * sizeof(int)));
if (dim_size_ == nullptr) { if (dim_size_ == nullptr) {
MS_LOG(ERROR) << "Malloc data failed"; MS_LOG(ERROR) << "Malloc data failed";
return RET_ERROR; return RET_ERROR;
} }
position_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * thread_h_num_ * sizeof(int)));
position_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * sizeof(int)));
if (position_ == nullptr) { if (position_ == nullptr) {
MS_LOG(ERROR) << "Malloc data failed"; MS_LOG(ERROR) << "Malloc data failed";
context_->allocator->Free(dim_size_); context_->allocator->Free(dim_size_);
@@ -147,7 +96,12 @@ int TransposeCPUKernel::Run() {
} }
} }


auto ret = ParallelLaunch(this->context_->thread_pool_, TransposeFp32Run, this, thread_h_num_);
TransposeParameter *param = reinterpret_cast<TransposeParameter *>(this->op_parameter_);
MS_ASSERT(param);
MS_ASSERT(in_data_);
MS_ASSERT(out_data_);
MS_ASSERT(out_shape_);
auto ret = DoTransposeFp32(in_data_, out_data_, out_shape_, param, dim_size_, position_);
if (dims > MAX_TRANSPOSE_DIM_SIZE) { if (dims > MAX_TRANSPOSE_DIM_SIZE) {
context_->allocator->Free(dim_size_); context_->allocator->Free(dim_size_);
context_->allocator->Free(position_); context_->allocator->Free(position_);
@@ -155,9 +109,10 @@ int TransposeCPUKernel::Run() {
position_ = nullptr; position_ = nullptr;
} }
if (ret != RET_OK) { if (ret != RET_OK) {
MS_LOG(ERROR) << "Tranpose error error_code[" << ret << "]";
return ret;
MS_LOG(ERROR) << "Transpose run failed";
return RET_ERROR;
} }

return ret; return ret;
} }




+ 2
- 8
mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.h View File

@@ -29,22 +29,16 @@ class TransposeCPUKernel : public LiteKernel {
explicit TransposeCPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs, explicit TransposeCPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx, const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive) const mindspore::lite::PrimitiveC *primitive)
: LiteKernel(param, inputs, outputs, ctx, primitive), thread_num_(ctx->thread_num_) {}
: LiteKernel(param, inputs, outputs, ctx, primitive) {}
~TransposeCPUKernel() override; ~TransposeCPUKernel() override;


int Init() override; int Init() override;
int ReSize() override; int ReSize() override;
int Run() override; int Run() override;
int TransposeParallel(int task_id);


private:
int thread_num_ = 1;
int thread_h_stride_ = 0;
int thread_h_num_ = 0;
int num_unit_ = 0;
protected:
float *in_data_ = nullptr; float *in_data_ = nullptr;
float *out_data_ = nullptr; float *out_data_ = nullptr;
int *in_shape_ = nullptr;
int *out_shape_ = nullptr; int *out_shape_ = nullptr;
int *dim_size_ = nullptr; int *dim_size_ = nullptr;
int *position_ = nullptr; int *position_ = nullptr;


+ 4
- 0
mindspore/lite/test/models_tflite_fp16.cfg View File

@@ -131,4 +131,8 @@ smartreply.tflite 0.1
mindspore_text_classification_tflite.tflite 4 mindspore_text_classification_tflite.tflite 4
#ml_location.tflite 0.1 #ml_location.tflite 0.1
ml_text_correction.tflite 1 ml_text_correction.tflite 1
# ml_pic_shopping.tflite involves subtract two close numbers.
# In fp16 case, such subtract will cause a great relative error comparing to fp32.
# e.g. fp32: 27.5 -27.4 = 0.1
# fp16: 27.6 - 27.4 = 0.2
#ml_pic_shopping.tflite 0.1 #ml_pic_shopping.tflite 0.1

+ 3
- 6
mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/transpose_fp32_tests.cc View File

@@ -43,7 +43,6 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes4) {
1.2791597, -1.02032341, 0.17405411, -0.66358529, 1.20223761, -1.65733338, 1.2791597, -1.02032341, 0.17405411, -0.66358529, 1.20223761, -1.65733338,
-0.36793608, 1.91074871, 0.42663834, 1.8033383, 0.30183748, 0.3952082}; -0.36793608, 1.91074871, 0.42663834, 1.8033383, 0.30183748, 0.3952082};


int input_shape[4] = {1, 2, 3, 4};
int output_shape[4] = {4, 3, 2, 1}; int output_shape[4] = {4, 3, 2, 1};
int perm[8] = {3, 2, 1, 0, 0, 0, 0, 0}; int perm[8] = {3, 2, 1, 0, 0, 0, 0, 0};
int strides[8] = {24, 12, 4, 1, 1, 1, 1, 1}; int strides[8] = {24, 12, 4, 1, 1, 1, 1, 1};
@@ -64,7 +63,7 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes4) {
param->out_strides_[i] = out_strides[i]; param->out_strides_[i] = out_strides[i];
} }


auto ret = DoTransposeFp32(in, out, input_shape, output_shape, param, 0, 3, nullptr, nullptr);
auto ret = DoTransposeFp32(in, out, output_shape, param, nullptr, nullptr);
ASSERT_EQ(ret, 0); ASSERT_EQ(ret, 0);
delete param; delete param;
ASSERT_EQ(0, CompareOutputData(out, correct, 24, 0.000001)); ASSERT_EQ(0, CompareOutputData(out, correct, 24, 0.000001));
@@ -83,7 +82,6 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes3) {
-0.52817175, 1.13376944, 1.74481176, 0.04221375, 1.46210794, 0.90159072, -0.52817175, 1.13376944, 1.74481176, 0.04221375, 1.46210794, 0.90159072,
-1.07296862, -1.09989127, -0.7612069, 0.58281521, -2.06014071, 0.50249434}; -1.07296862, -1.09989127, -0.7612069, 0.58281521, -2.06014071, 0.50249434};


int input_shape[3] = {2, 3, 4};
int output_shape[3] = {4, 3, 2}; int output_shape[3] = {4, 3, 2};
int perm[8] = {2, 1, 0, 0, 0, 0, 0, 0}; int perm[8] = {2, 1, 0, 0, 0, 0, 0, 0};
int strides[8] = {12, 4, 1, 1, 1, 1, 1, 1}; int strides[8] = {12, 4, 1, 1, 1, 1, 1, 1};
@@ -104,7 +102,7 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes3) {
param->out_strides_[i] = out_strides[i]; param->out_strides_[i] = out_strides[i];
} }


auto ret = DoTransposeFp32(in, out, input_shape, output_shape, param, 0, 3, nullptr, nullptr);
auto ret = DoTransposeFp32(in, out, output_shape, param, nullptr, nullptr);
ASSERT_EQ(ret, 0); ASSERT_EQ(ret, 0);
delete param; delete param;
ASSERT_EQ(0, CompareOutputData(out, correct, 24, 0.000001)); ASSERT_EQ(0, CompareOutputData(out, correct, 24, 0.000001));
@@ -123,7 +121,6 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes2) {
-0.52817175, 1.74481176, 1.46210794, 1.13376944, 0.04221375, 0.90159072, -0.52817175, 1.74481176, 1.46210794, 1.13376944, 0.04221375, 0.90159072,
-1.07296862, -0.7612069, -2.06014071, -1.09989127, 0.58281521, 0.50249434}; -1.07296862, -0.7612069, -2.06014071, -1.09989127, 0.58281521, 0.50249434};


int input_shape[2] = {6, 4};
int output_shape[2] = {4, 6}; int output_shape[2] = {4, 6};
int perm[8] = {1, 0, 0, 0, 0, 0, 0, 0}; int perm[8] = {1, 0, 0, 0, 0, 0, 0, 0};
int strides[8] = {4, 1, 1, 1, 1, 1, 1, 1}; int strides[8] = {4, 1, 1, 1, 1, 1, 1, 1};
@@ -145,7 +142,7 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes2) {
param->out_strides_[i] = out_strides[i]; param->out_strides_[i] = out_strides[i];
} }


auto ret = DoTransposeFp32(in, out, input_shape, output_shape, param, 0, 6, nullptr, nullptr);
auto ret = DoTransposeFp32(in, out, output_shape, param, nullptr, nullptr);
ASSERT_EQ(ret, 0); ASSERT_EQ(ret, 0);
delete param; delete param;
ASSERT_EQ(0, CompareOutputData(out, correct, 24, 0.000001)); ASSERT_EQ(0, CompareOutputData(out, correct, 24, 0.000001));


Loading…
Cancel
Save