From: @zhaozhenlong Reviewed-by: @zhang_xue_tong,@zhanghaibo5 Signed-off-by: @zhang_xue_tongtags/v1.1.0
| @@ -18,8 +18,8 @@ | |||||
| #include <string.h> | #include <string.h> | ||||
| #include "nnacl/errorcode.h" | #include "nnacl/errorcode.h" | ||||
| void Fp16TransposeDim2(float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm, | |||||
| int *output_shape, int h_start, int h_end) { | |||||
| void Fp16TransposeDim2(const float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm, | |||||
| const int *output_shape) { | |||||
| const int stride0 = strides[perm[0]]; | const int stride0 = strides[perm[0]]; | ||||
| const int stride1 = strides[perm[1]]; | const int stride1 = strides[perm[1]]; | ||||
| const int output0 = output_shape[0]; | const int output0 = output_shape[0]; | ||||
| @@ -33,8 +33,8 @@ void Fp16TransposeDim2(float16_t *in_data, float16_t *out_data, int *strides, in | |||||
| } | } | ||||
| } | } | ||||
| void Fp16TransposeDim3(float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm, | |||||
| int *output_shape, int h_start, int h_end) { | |||||
| void Fp16TransposeDim3(const float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm, | |||||
| const int *output_shape) { | |||||
| const int stride0 = strides[perm[0]]; | const int stride0 = strides[perm[0]]; | ||||
| const int stride1 = strides[perm[1]]; | const int stride1 = strides[perm[1]]; | ||||
| const int stride2 = strides[perm[2]]; | const int stride2 = strides[perm[2]]; | ||||
| @@ -56,8 +56,8 @@ void Fp16TransposeDim3(float16_t *in_data, float16_t *out_data, int *strides, in | |||||
| } | } | ||||
| } | } | ||||
| void Fp16TransposeDim4(float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm, | |||||
| int *output_shape, int h_start, int h_end) { | |||||
| void Fp16TransposeDim4(const float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm, | |||||
| const int *output_shape) { | |||||
| const int stride0 = strides[perm[0]]; | const int stride0 = strides[perm[0]]; | ||||
| const int stride1 = strides[perm[1]]; | const int stride1 = strides[perm[1]]; | ||||
| const int stride2 = strides[perm[2]]; | const int stride2 = strides[perm[2]]; | ||||
| @@ -88,8 +88,8 @@ void Fp16TransposeDim4(float16_t *in_data, float16_t *out_data, int *strides, in | |||||
| } | } | ||||
| } | } | ||||
| void Fp16TransposeDim5(float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm, | |||||
| int *output_shape, int h_start, int h_end) { | |||||
| void Fp16TransposeDim5(const float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm, | |||||
| const int *output_shape) { | |||||
| const int stride0 = strides[perm[0]]; | const int stride0 = strides[perm[0]]; | ||||
| const int stride1 = strides[perm[1]]; | const int stride1 = strides[perm[1]]; | ||||
| const int stride2 = strides[perm[2]]; | const int stride2 = strides[perm[2]]; | ||||
| @@ -127,8 +127,30 @@ void Fp16TransposeDim5(float16_t *in_data, float16_t *out_data, int *strides, in | |||||
| } | } | ||||
| } | } | ||||
| int Fp16DoTranspose(float16_t *in_data, float16_t *out_data, int *input_shape, int *output_shape, | |||||
| TransposeParameter *transpose_param, int h_start, int h_end) { | |||||
| void TransposeDimsFp16(const float16_t *in_data, float16_t *out_data, const int *strides, const int *out_strides, | |||||
| const int *perm, const int *output_shape, int dims, int *size, int *position) { | |||||
| *(size + dims - 1) = 1; | |||||
| for (int i = dims - 1; i > 0; --i) { | |||||
| *(size + i - 1) = *(size + i) * output_shape[i]; | |||||
| } | |||||
| for (size_t idx = 0; idx < (*size) * output_shape[0]; ++idx) { | |||||
| int pos = idx; | |||||
| int output_idx = 0; | |||||
| int input_idx = 0; | |||||
| for (int i = 0; i < dims; ++i) { | |||||
| *(position + i) = pos / *(size + i); | |||||
| int out_stride = i < dims - 1 ? out_strides[i] : 1; | |||||
| output_idx += (*(position + i) * out_stride); | |||||
| input_idx += (*(position + i) * strides[perm[i]]); | |||||
| pos -= *(position + i) * (*(size + i)); | |||||
| } | |||||
| out_data[output_idx] = in_data[input_idx]; | |||||
| } | |||||
| } | |||||
| int Fp16DoTranspose(const float16_t *in_data, float16_t *out_data, const int *output_shape, | |||||
| TransposeParameter *transpose_param, int *size, int *position) { | |||||
| if (in_data == NULL || out_data == NULL) { | if (in_data == NULL || out_data == NULL) { | ||||
| return NNACL_ERR; | return NNACL_ERR; | ||||
| } | } | ||||
| @@ -138,7 +160,7 @@ int Fp16DoTranspose(float16_t *in_data, float16_t *out_data, int *input_shape, i | |||||
| int data_size = transpose_param->data_size_; | int data_size = transpose_param->data_size_; | ||||
| int num_axes = transpose_param->num_axes_; | int num_axes = transpose_param->num_axes_; | ||||
| if (num_axes < 2 || num_axes > 5) { | |||||
| if (num_axes < 2) { | |||||
| return NNACL_ERR; | return NNACL_ERR; | ||||
| } | } | ||||
| @@ -155,14 +177,21 @@ int Fp16DoTranspose(float16_t *in_data, float16_t *out_data, int *input_shape, i | |||||
| (void)memcpy(out_data, in_data, data_size); | (void)memcpy(out_data, in_data, data_size); | ||||
| return NNACL_OK; | return NNACL_OK; | ||||
| } | } | ||||
| for (int i = 0; i < num_axes; ++i) { | |||||
| if (perm[i] < 0) { | |||||
| return NNACL_PARAM_INVALID; | |||||
| } | |||||
| } | |||||
| if (num_axes == 2) { | if (num_axes == 2) { | ||||
| Fp16TransposeDim2(in_data, out_data, strides, out_strides, perm, output_shape, h_start, h_end); | |||||
| Fp16TransposeDim2(in_data, out_data, strides, out_strides, perm, output_shape); | |||||
| } else if (num_axes == 3) { | } else if (num_axes == 3) { | ||||
| Fp16TransposeDim3(in_data, out_data, strides, out_strides, perm, output_shape, h_start, h_end); | |||||
| Fp16TransposeDim3(in_data, out_data, strides, out_strides, perm, output_shape); | |||||
| } else if (num_axes == 4) { | } else if (num_axes == 4) { | ||||
| Fp16TransposeDim4(in_data, out_data, strides, out_strides, perm, output_shape, h_start, h_end); | |||||
| Fp16TransposeDim4(in_data, out_data, strides, out_strides, perm, output_shape); | |||||
| } else if (num_axes == 5) { | } else if (num_axes == 5) { | ||||
| Fp16TransposeDim5(in_data, out_data, strides, out_strides, perm, output_shape, h_start, h_end); | |||||
| Fp16TransposeDim5(in_data, out_data, strides, out_strides, perm, output_shape); | |||||
| } else { | |||||
| TransposeDimsFp16(in_data, out_data, strides, out_strides, perm, output_shape, num_axes, size, position); | |||||
| } | } | ||||
| return NNACL_OK; | return NNACL_OK; | ||||
| } | } | ||||
| @@ -18,38 +18,16 @@ | |||||
| #define MINDSPORE_LITE_NNACL_FP16_TRANSPOSE_FP16_H_ | #define MINDSPORE_LITE_NNACL_FP16_TRANSPOSE_FP16_H_ | ||||
| #include "nnacl/op_base.h" | #include "nnacl/op_base.h" | ||||
| #include "nnacl/transpose.h" | |||||
| #ifdef ENABLE_NEON | #ifdef ENABLE_NEON | ||||
| #include <arm_neon.h> | #include <arm_neon.h> | ||||
| #endif | #endif | ||||
| typedef struct TransposeParameter { | |||||
| // primitive parameter | |||||
| OpParameter op_parameter_; | |||||
| int perm_[8]; | |||||
| bool conjugate_; | |||||
| // shape correlative | |||||
| int strides_[8]; | |||||
| int out_strides_[8]; | |||||
| // other parameter | |||||
| int num_axes_; | |||||
| int data_size_; | |||||
| } TransposeParameter; | |||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||
| extern "C" { | extern "C" { | ||||
| #endif | #endif | ||||
| int Fp16DoTranspose(float16_t *in_data, float16_t *out_data, int *input_shape, int *output_shape, | |||||
| TransposeParameter *transpose_param, int h_start, int h_end); | |||||
| void TransposeDim2(float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm, | |||||
| int *output_shape, int h_start, int h_end); | |||||
| void TransposeDim3(float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm, | |||||
| int *output_shape, int h_start, int h_end); | |||||
| void TransposeDim4(float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm, | |||||
| int *output_shape, int h_start, int h_end); | |||||
| void TransposeDim5(float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm, | |||||
| int *output_shape, int h_start, int h_end); | |||||
| int Fp16DoTranspose(const float16_t *in_data, float16_t *out_data, const int *output_shape, | |||||
| TransposeParameter *transpose_param, int *size, int *position); | |||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||
| } | } | ||||
| #endif | #endif | ||||
| @@ -17,7 +17,7 @@ | |||||
| #include "nnacl/fp32/transpose_fp32.h" | #include "nnacl/fp32/transpose_fp32.h" | ||||
| void TransposeDim2Fp32(const float *in_data, float *out_data, const int *strides, int *out_strides, const int *perm, | void TransposeDim2Fp32(const float *in_data, float *out_data, const int *strides, int *out_strides, const int *perm, | ||||
| const int *output_shape, int h_start, int h_end) { | |||||
| const int *output_shape) { | |||||
| const int stride0 = strides[perm[0]]; | const int stride0 = strides[perm[0]]; | ||||
| const int stride1 = strides[perm[1]]; | const int stride1 = strides[perm[1]]; | ||||
| const int output0 = output_shape[0]; | const int output0 = output_shape[0]; | ||||
| @@ -32,7 +32,7 @@ void TransposeDim2Fp32(const float *in_data, float *out_data, const int *strides | |||||
| } | } | ||||
| void TransposeDim3Fp32(const float *in_data, float *out_data, const int *strides, const int *out_strides, | void TransposeDim3Fp32(const float *in_data, float *out_data, const int *strides, const int *out_strides, | ||||
| const int *perm, const int *output_shape, int h_start, int h_end) { | |||||
| const int *perm, const int *output_shape) { | |||||
| const int stride0 = strides[perm[0]]; | const int stride0 = strides[perm[0]]; | ||||
| const int stride1 = strides[perm[1]]; | const int stride1 = strides[perm[1]]; | ||||
| const int stride2 = strides[perm[2]]; | const int stride2 = strides[perm[2]]; | ||||
| @@ -55,7 +55,7 @@ void TransposeDim3Fp32(const float *in_data, float *out_data, const int *strides | |||||
| } | } | ||||
| void TransposeDim4Fp32(const float *in_data, float *out_data, const int *strides, const int *out_strides, | void TransposeDim4Fp32(const float *in_data, float *out_data, const int *strides, const int *out_strides, | ||||
| const int *perm, const int *output_shape, int h_start, int h_end) { | |||||
| const int *perm, const int *output_shape) { | |||||
| const int stride0 = strides[perm[0]]; | const int stride0 = strides[perm[0]]; | ||||
| const int stride1 = strides[perm[1]]; | const int stride1 = strides[perm[1]]; | ||||
| const int stride2 = strides[perm[2]]; | const int stride2 = strides[perm[2]]; | ||||
| @@ -87,7 +87,7 @@ void TransposeDim4Fp32(const float *in_data, float *out_data, const int *strides | |||||
| } | } | ||||
| void TransposeDim5Fp32(const float *in_data, float *out_data, const int *strides, const int *out_strides, | void TransposeDim5Fp32(const float *in_data, float *out_data, const int *strides, const int *out_strides, | ||||
| const int *perm, const int *output_shape, int h_start, int h_end) { | |||||
| const int *perm, const int *output_shape) { | |||||
| const int stride0 = strides[perm[0]]; | const int stride0 = strides[perm[0]]; | ||||
| const int stride1 = strides[perm[1]]; | const int stride1 = strides[perm[1]]; | ||||
| const int stride2 = strides[perm[2]]; | const int stride2 = strides[perm[2]]; | ||||
| @@ -126,8 +126,7 @@ void TransposeDim5Fp32(const float *in_data, float *out_data, const int *strides | |||||
| } | } | ||||
| void TransposeDimsFp32(const float *in_data, float *out_data, const int *strides, const int *out_strides, | void TransposeDimsFp32(const float *in_data, float *out_data, const int *strides, const int *out_strides, | ||||
| const int *perm, const int *output_shape, int h_start, int h_end, int dims, int *size, | |||||
| int *position) { | |||||
| const int *perm, const int *output_shape, int dims, int *size, int *position) { | |||||
| *(size + dims - 1) = 1; | *(size + dims - 1) = 1; | ||||
| for (int i = dims - 1; i > 0; --i) { | for (int i = dims - 1; i > 0; --i) { | ||||
| *(size + i - 1) = *(size + i) * output_shape[i]; | *(size + i - 1) = *(size + i) * output_shape[i]; | ||||
| @@ -148,8 +147,8 @@ void TransposeDimsFp32(const float *in_data, float *out_data, const int *strides | |||||
| } | } | ||||
| } | } | ||||
| int DoTransposeFp32(const float *in_data, float *out_data, int *input_shape, const int *output_shape, | |||||
| TransposeParameter *transpose_param, int h_start, int h_end, int *size, int *position) { | |||||
| int DoTransposeFp32(const float *in_data, float *out_data, const int *output_shape, TransposeParameter *transpose_param, | |||||
| int *size, int *position) { | |||||
| if (in_data == NULL || out_data == NULL) { | if (in_data == NULL || out_data == NULL) { | ||||
| return NNACL_ERR; | return NNACL_ERR; | ||||
| } | } | ||||
| @@ -182,16 +181,15 @@ int DoTransposeFp32(const float *in_data, float *out_data, int *input_shape, con | |||||
| } | } | ||||
| } | } | ||||
| if (num_axes == 2) { | if (num_axes == 2) { | ||||
| TransposeDim2Fp32(in_data, out_data, strides, out_strides, perm, output_shape, h_start, h_end); | |||||
| TransposeDim2Fp32(in_data, out_data, strides, out_strides, perm, output_shape); | |||||
| } else if (num_axes == 3) { | } else if (num_axes == 3) { | ||||
| TransposeDim3Fp32(in_data, out_data, strides, out_strides, perm, output_shape, h_start, h_end); | |||||
| TransposeDim3Fp32(in_data, out_data, strides, out_strides, perm, output_shape); | |||||
| } else if (num_axes == 4) { | } else if (num_axes == 4) { | ||||
| TransposeDim4Fp32(in_data, out_data, strides, out_strides, perm, output_shape, h_start, h_end); | |||||
| TransposeDim4Fp32(in_data, out_data, strides, out_strides, perm, output_shape); | |||||
| } else if (num_axes == 5) { | } else if (num_axes == 5) { | ||||
| TransposeDim5Fp32(in_data, out_data, strides, out_strides, perm, output_shape, h_start, h_end); | |||||
| TransposeDim5Fp32(in_data, out_data, strides, out_strides, perm, output_shape); | |||||
| } else { | } else { | ||||
| TransposeDimsFp32(in_data, out_data, strides, out_strides, perm, output_shape, h_start, h_end, num_axes, size, | |||||
| position); | |||||
| TransposeDimsFp32(in_data, out_data, strides, out_strides, perm, output_shape, num_axes, size, position); | |||||
| } | } | ||||
| return NNACL_OK; | return NNACL_OK; | ||||
| } | } | ||||
| @@ -25,8 +25,8 @@ | |||||
| extern "C" { | extern "C" { | ||||
| #endif | #endif | ||||
| int DoTransposeFp32(const float *in_data, float *out_data, int *input_shape, const int *output_shape, | |||||
| TransposeParameter *transpose_param, int h_start, int h_end, int *size, int *position); | |||||
| int DoTransposeFp32(const float *in_data, float *out_data, const int *output_shape, TransposeParameter *transpose_param, | |||||
| int *size, int *position); | |||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||
| } | } | ||||
| @@ -34,141 +34,52 @@ int TransposeFp16CPUKernel::Init() { | |||||
| if (!InferShapeDone()) { | if (!InferShapeDone()) { | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| return ReSize(); | |||||
| return TransposeCPUKernel::ReSize(); | |||||
| } | } | ||||
| int TransposeFp16CPUKernel::ReSize() { | |||||
| TransposeParameter *param = reinterpret_cast<TransposeParameter *>(this->op_parameter_); | |||||
| num_unit_ = static_cast<int>(in_tensors_.at(kInputIndex)->shape().at(param->perm_[kNHWC_H])); | |||||
| thread_h_num_ = MSMIN(thread_num_, num_unit_); | |||||
| thread_h_stride_ = UP_DIV(num_unit_, thread_h_num_); | |||||
| int TransposeFp16CPUKernel::Run() { | |||||
| MS_ASSERT(in_tensors_.size() == 1); | |||||
| MS_ASSERT(out_tensors_.size() == 1); | |||||
| auto &in_tensor = in_tensors_.front(); | auto &in_tensor = in_tensors_.front(); | ||||
| auto &out_tensor = out_tensors_.front(); | auto &out_tensor = out_tensors_.front(); | ||||
| auto in_shape = in_tensor->shape(); | |||||
| auto out_shape = out_tensor->shape(); | |||||
| param->strides_[param->num_axes_ - 1] = 1; | |||||
| param->out_strides_[param->num_axes_ - 1] = 1; | |||||
| param->data_size_ = in_tensor->Size(); | |||||
| for (int i = param->num_axes_ - 2; i >= 0; i--) { | |||||
| param->strides_[i] = in_shape.at(i + 1) * param->strides_[i + 1]; | |||||
| param->out_strides_[i] = out_shape.at(i + 1) * param->out_strides_[i + 1]; | |||||
| if (in_tensor == nullptr || out_tensor == nullptr) { | |||||
| MS_LOG(ERROR) << "null pointer referencing."; | |||||
| return RET_ERROR; | |||||
| } | } | ||||
| return RET_OK; | |||||
| } | |||||
| int TransposeFp16CPUKernel::MallocFp16Buffer() { | |||||
| auto &in_tensor = in_tensors_.front(); | |||||
| auto &out_tensor = out_tensors_.front(); | |||||
| if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) { | |||||
| fp16_in_data_ = | |||||
| reinterpret_cast<float16_t *>(context_->allocator->Malloc(sizeof(float16_t) * in_tensor->ElementsNum())); | |||||
| if (fp16_in_data_ == nullptr) { | |||||
| in_data_fp16_ = reinterpret_cast<float16_t *>(in_tensor->MutableData()); | |||||
| out_data_fp16_ = reinterpret_cast<float16_t *>(out_tensor->MutableData()); | |||||
| int dims = out_tensor->shape().size(); | |||||
| if (dims > MAX_TRANSPOSE_DIM_SIZE) { | |||||
| dim_size_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * sizeof(int))); | |||||
| if (dim_size_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Malloc data failed"; | MS_LOG(ERROR) << "Malloc data failed"; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| } | |||||
| if (out_tensor->data_type() == kNumberTypeFloat || out_tensor->data_type() == kNumberTypeFloat32) { | |||||
| fp16_out_data_ = | |||||
| reinterpret_cast<float16_t *>(context_->allocator->Malloc(sizeof(float16_t) * out_tensor->ElementsNum())); | |||||
| if (fp16_out_data_ == nullptr) { | |||||
| position_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * sizeof(int))); | |||||
| if (position_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Malloc data failed"; | MS_LOG(ERROR) << "Malloc data failed"; | ||||
| context_->allocator->Free(dim_size_); | |||||
| dim_size_ = nullptr; | |||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| } | } | ||||
| return RET_OK; | |||||
| } | |||||
| void TransposeFp16CPUKernel::FreeFp16Buffer() { | |||||
| auto &in_tensor = in_tensors_.front(); | |||||
| auto &out_tensor = out_tensors_.front(); | |||||
| if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) { | |||||
| if (fp16_in_data_ != nullptr) { | |||||
| context_->allocator->Free(fp16_in_data_); | |||||
| fp16_in_data_ = nullptr; | |||||
| } | |||||
| } | |||||
| if (out_tensor->data_type() == kNumberTypeFloat || out_tensor->data_type() == kNumberTypeFloat32) { | |||||
| if (fp16_out_data_ != nullptr) { | |||||
| context_->allocator->Free(fp16_out_data_); | |||||
| fp16_out_data_ = nullptr; | |||||
| } | |||||
| } | |||||
| } | |||||
| int TransposeFp16CPUKernel::TransposeParallel(int task_id) { | |||||
| int num_unit_thread = MSMIN(thread_h_stride_, num_unit_ - task_id * thread_h_stride_); | |||||
| if (num_unit_thread <= 0) { | |||||
| return RET_OK; | |||||
| } | |||||
| int thread_offset = task_id * thread_h_stride_; | |||||
| TransposeParameter *param = reinterpret_cast<TransposeParameter *>(this->op_parameter_); | TransposeParameter *param = reinterpret_cast<TransposeParameter *>(this->op_parameter_); | ||||
| auto ret = Fp16DoTranspose(fp16_in_data_, fp16_out_data_, in_shape_, out_shape_, param, thread_offset, | |||||
| thread_offset + num_unit_thread); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "Transpose error task_id[" << task_id << "] error_code[" << ret << "]"; | |||||
| return RET_ERROR; | |||||
| MS_ASSERT(param); | |||||
| MS_ASSERT(in_data_fp16_); | |||||
| MS_ASSERT(out_data_fp16_); | |||||
| MS_ASSERT(out_shape_); | |||||
| auto ret = Fp16DoTranspose(in_data_fp16_, out_data_fp16_, out_shape_, param, dim_size_, position_); | |||||
| if (dims > MAX_TRANSPOSE_DIM_SIZE) { | |||||
| context_->allocator->Free(dim_size_); | |||||
| context_->allocator->Free(position_); | |||||
| dim_size_ = nullptr; | |||||
| position_ = nullptr; | |||||
| } | } | ||||
| return RET_OK; | |||||
| } | |||||
| static int TransposeFp16Run(void *cdata, int task_id) { | |||||
| auto g_kernel = reinterpret_cast<TransposeFp16CPUKernel *>(cdata); | |||||
| auto ret = g_kernel->TransposeParallel(task_id); | |||||
| if (ret != RET_OK) { | if (ret != RET_OK) { | ||||
| MS_LOG(ERROR) << "TransposeRun error task_id[" << task_id << "] error_code[" << ret << "]"; | |||||
| return RET_OP_EXECUTE_FAILURE; | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| int TransposeFp16CPUKernel::Run() { | |||||
| MS_ASSERT(in_tensors_.size() == 1); | |||||
| MS_ASSERT(out_tensors_.size() == 1); | |||||
| auto &in_tensor = in_tensors_.front(); | |||||
| auto &out_tensor = out_tensors_.front(); | |||||
| if (in_tensor == nullptr || out_tensor == nullptr) { | |||||
| MS_LOG(ERROR) << "null pointer referencing."; | |||||
| MS_LOG(ERROR) << "Transpose run failed"; | |||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| // malloc when Run | |||||
| auto ret = MallocFp16Buffer(); | |||||
| if (ret != RET_OK) { | |||||
| FreeFp16Buffer(); | |||||
| return ret; | |||||
| } | |||||
| if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) { | |||||
| in_data_ = reinterpret_cast<float *>(in_tensor->MutableData()); | |||||
| Float32ToFloat16(in_data_, fp16_in_data_, in_tensor->ElementsNum()); | |||||
| } else { | |||||
| fp16_in_data_ = reinterpret_cast<float16_t *>(in_tensor->MutableData()); | |||||
| } | |||||
| if (out_tensor->data_type() == kNumberTypeFloat16) { | |||||
| fp16_out_data_ = reinterpret_cast<float16_t *>(out_tensor->MutableData()); | |||||
| } | |||||
| memcpy(in_shape_, in_tensor->shape().data(), in_tensor->shape().size() * sizeof(int)); | |||||
| memcpy(out_shape_, out_tensor->shape().data(), out_tensor->shape().size() * sizeof(int)); | |||||
| ret = ParallelLaunch(this->context_->thread_pool_, TransposeFp16Run, this, thread_h_num_); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "Tranpose error error_code[" << ret << "]"; | |||||
| FreeFp16Buffer(); | |||||
| return ret; | |||||
| } | |||||
| if (out_tensor->data_type() == kNumberTypeFloat || out_tensor->data_type() == kNumberTypeFloat32) { | |||||
| out_data_ = reinterpret_cast<float *>(out_tensor->MutableData()); | |||||
| Float16ToFloat32(fp16_out_data_, out_data_, out_tensor->ElementsNum()); | |||||
| } | |||||
| FreeFp16Buffer(); | |||||
| return ret; | return ret; | ||||
| } | } | ||||
| @@ -21,35 +21,24 @@ | |||||
| #include <vector> | #include <vector> | ||||
| #include "src/lite_kernel.h" | #include "src/lite_kernel.h" | ||||
| #include "src/kernel_registry.h" | #include "src/kernel_registry.h" | ||||
| #include "src/runtime/kernel/arm/fp32/transpose_fp32.h" | |||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
| class TransposeFp16CPUKernel : public LiteKernel { | |||||
| class TransposeFp16CPUKernel : public TransposeCPUKernel { | |||||
| public: | public: | ||||
| explicit TransposeFp16CPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs, | explicit TransposeFp16CPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs, | ||||
| const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx, | const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx, | ||||
| const mindspore::lite::PrimitiveC *primitive) | const mindspore::lite::PrimitiveC *primitive) | ||||
| : LiteKernel(param, inputs, outputs, ctx, primitive), thread_num_(ctx->thread_num_) {} | |||||
| : TransposeCPUKernel(param, inputs, outputs, ctx, primitive) {} | |||||
| ~TransposeFp16CPUKernel() = default; | ~TransposeFp16CPUKernel() = default; | ||||
| int Init() override; | int Init() override; | ||||
| int ReSize() override; | |||||
| int Run() override; | int Run() override; | ||||
| int TransposeParallel(int task_id); | |||||
| void FreeFp16Buffer(); | |||||
| int MallocFp16Buffer(); | |||||
| private: | private: | ||||
| int thread_num_; | |||||
| int thread_h_stride_; | |||||
| int thread_h_num_; | |||||
| int num_unit_; | |||||
| float *in_data_; | |||||
| float *out_data_; | |||||
| float16_t *fp16_in_data_ = nullptr; | |||||
| float16_t *fp16_out_data_ = nullptr; | |||||
| int in_shape_[8]; | |||||
| int out_shape_[8]; | |||||
| float16_t *in_data_fp16_ = nullptr; | |||||
| float16_t *out_data_fp16_ = nullptr; | |||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -36,9 +36,6 @@ int TransposeCPUKernel::Init() { | |||||
| int TransposeCPUKernel::ReSize() { | int TransposeCPUKernel::ReSize() { | ||||
| TransposeParameter *param = reinterpret_cast<TransposeParameter *>(op_parameter_); | TransposeParameter *param = reinterpret_cast<TransposeParameter *>(op_parameter_); | ||||
| num_unit_ = static_cast<int>(in_tensors_.at(kInputIndex)->shape().at(param->perm_[kNHWC_H])); | |||||
| thread_h_num_ = MSMIN(thread_num_, num_unit_); | |||||
| thread_h_stride_ = UP_DIV(num_unit_, thread_h_num_); | |||||
| auto &inTensor = in_tensors_.front(); | auto &inTensor = in_tensors_.front(); | ||||
| auto &outTensor = out_tensors_.front(); | auto &outTensor = out_tensors_.front(); | ||||
| @@ -51,75 +48,27 @@ int TransposeCPUKernel::ReSize() { | |||||
| param->strides_[i] = in_shape.at(i + 1) * param->strides_[i + 1]; | param->strides_[i] = in_shape.at(i + 1) * param->strides_[i + 1]; | ||||
| param->out_strides_[i] = out_shape.at(i + 1) * param->out_strides_[i + 1]; | param->out_strides_[i] = out_shape.at(i + 1) * param->out_strides_[i + 1]; | ||||
| } | } | ||||
| if (this->in_shape_ != nullptr) { | |||||
| free(this->in_shape_); | |||||
| in_shape_ = nullptr; | |||||
| } | |||||
| if (this->out_shape_ != nullptr) { | if (this->out_shape_ != nullptr) { | ||||
| free(this->out_shape_); | free(this->out_shape_); | ||||
| this->out_shape_ = nullptr; | this->out_shape_ = nullptr; | ||||
| } | } | ||||
| in_shape_ = reinterpret_cast<int *>(malloc(in_shape.size() * sizeof(int))); | |||||
| if (in_shape_ == nullptr) { | |||||
| MS_LOG(ERROR) << "malloc in_shape_ failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| out_shape_ = reinterpret_cast<int *>(malloc(out_shape.size() * sizeof(int))); | out_shape_ = reinterpret_cast<int *>(malloc(out_shape.size() * sizeof(int))); | ||||
| if (out_shape_ == nullptr) { | if (out_shape_ == nullptr) { | ||||
| MS_LOG(ERROR) << "malloc out_shape_ failed."; | MS_LOG(ERROR) << "malloc out_shape_ failed."; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| memcpy(in_shape_, in_shape.data(), in_shape.size() * sizeof(int)); | |||||
| memcpy(out_shape_, out_shape.data(), in_shape.size() * sizeof(int)); | memcpy(out_shape_, out_shape.data(), in_shape.size() * sizeof(int)); | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| TransposeCPUKernel::~TransposeCPUKernel() { | TransposeCPUKernel::~TransposeCPUKernel() { | ||||
| if (this->in_shape_ != nullptr) { | |||||
| free(this->in_shape_); | |||||
| } | |||||
| if (this->out_shape_ != nullptr) { | if (this->out_shape_ != nullptr) { | ||||
| free(this->out_shape_); | free(this->out_shape_); | ||||
| } | } | ||||
| } | } | ||||
| int TransposeCPUKernel::TransposeParallel(int task_id) { | |||||
| int num_unit_thread = MSMIN(thread_h_stride_, num_unit_ - task_id * thread_h_stride_); | |||||
| if (num_unit_thread <= 0) { | |||||
| return RET_OK; | |||||
| } | |||||
| int thread_offset = task_id * thread_h_stride_; | |||||
| TransposeParameter *param = reinterpret_cast<TransposeParameter *>(this->op_parameter_); | |||||
| MS_ASSERT(param); | |||||
| int *size = nullptr; | |||||
| int *position = nullptr; | |||||
| if (this->dim_size_ != nullptr && this->position_ != nullptr) { | |||||
| size = this->dim_size_ + task_id * param->num_axes_; | |||||
| position = this->position_ + task_id * param->num_axes_; | |||||
| } | |||||
| MS_ASSERT(in_data_); | |||||
| MS_ASSERT(out_data_); | |||||
| MS_ASSERT(in_shape_); | |||||
| MS_ASSERT(out_shape_); | |||||
| auto ret = DoTransposeFp32(in_data_, out_data_, in_shape_, out_shape_, param, thread_offset, | |||||
| thread_offset + num_unit_thread, size, position); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "Transpose error task_id[" << task_id << "] error_code[" << ret << "]"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| int TransposeFp32Run(void *cdata, int task_id) { | |||||
| auto g_kernel = reinterpret_cast<TransposeCPUKernel *>(cdata); | |||||
| auto ret = g_kernel->TransposeParallel(task_id); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "TransposeRun error task_id[" << task_id << "] error_code[" << ret << "]"; | |||||
| return RET_OP_EXECUTE_FAILURE; | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| int TransposeCPUKernel::Run() { | int TransposeCPUKernel::Run() { | ||||
| MS_ASSERT(in_tensors_.size() == 1 || in_tensors_.size() == 2); | MS_ASSERT(in_tensors_.size() == 1 || in_tensors_.size() == 2); | ||||
| MS_ASSERT(out_tensors_.size() == 1); | MS_ASSERT(out_tensors_.size() == 1); | ||||
| @@ -133,12 +82,12 @@ int TransposeCPUKernel::Run() { | |||||
| out_data_ = reinterpret_cast<float *>(out_tensor->MutableData()); | out_data_ = reinterpret_cast<float *>(out_tensor->MutableData()); | ||||
| int dims = out_tensor->shape().size(); | int dims = out_tensor->shape().size(); | ||||
| if (dims > MAX_TRANSPOSE_DIM_SIZE) { | if (dims > MAX_TRANSPOSE_DIM_SIZE) { | ||||
| dim_size_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * thread_h_num_ * sizeof(int))); | |||||
| dim_size_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * sizeof(int))); | |||||
| if (dim_size_ == nullptr) { | if (dim_size_ == nullptr) { | ||||
| MS_LOG(ERROR) << "Malloc data failed"; | MS_LOG(ERROR) << "Malloc data failed"; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| position_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * thread_h_num_ * sizeof(int))); | |||||
| position_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * sizeof(int))); | |||||
| if (position_ == nullptr) { | if (position_ == nullptr) { | ||||
| MS_LOG(ERROR) << "Malloc data failed"; | MS_LOG(ERROR) << "Malloc data failed"; | ||||
| context_->allocator->Free(dim_size_); | context_->allocator->Free(dim_size_); | ||||
| @@ -147,7 +96,12 @@ int TransposeCPUKernel::Run() { | |||||
| } | } | ||||
| } | } | ||||
| auto ret = ParallelLaunch(this->context_->thread_pool_, TransposeFp32Run, this, thread_h_num_); | |||||
| TransposeParameter *param = reinterpret_cast<TransposeParameter *>(this->op_parameter_); | |||||
| MS_ASSERT(param); | |||||
| MS_ASSERT(in_data_); | |||||
| MS_ASSERT(out_data_); | |||||
| MS_ASSERT(out_shape_); | |||||
| auto ret = DoTransposeFp32(in_data_, out_data_, out_shape_, param, dim_size_, position_); | |||||
| if (dims > MAX_TRANSPOSE_DIM_SIZE) { | if (dims > MAX_TRANSPOSE_DIM_SIZE) { | ||||
| context_->allocator->Free(dim_size_); | context_->allocator->Free(dim_size_); | ||||
| context_->allocator->Free(position_); | context_->allocator->Free(position_); | ||||
| @@ -155,9 +109,10 @@ int TransposeCPUKernel::Run() { | |||||
| position_ = nullptr; | position_ = nullptr; | ||||
| } | } | ||||
| if (ret != RET_OK) { | if (ret != RET_OK) { | ||||
| MS_LOG(ERROR) << "Tranpose error error_code[" << ret << "]"; | |||||
| return ret; | |||||
| MS_LOG(ERROR) << "Transpose run failed"; | |||||
| return RET_ERROR; | |||||
| } | } | ||||
| return ret; | return ret; | ||||
| } | } | ||||
| @@ -29,22 +29,16 @@ class TransposeCPUKernel : public LiteKernel { | |||||
| explicit TransposeCPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs, | explicit TransposeCPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs, | ||||
| const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx, | const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx, | ||||
| const mindspore::lite::PrimitiveC *primitive) | const mindspore::lite::PrimitiveC *primitive) | ||||
| : LiteKernel(param, inputs, outputs, ctx, primitive), thread_num_(ctx->thread_num_) {} | |||||
| : LiteKernel(param, inputs, outputs, ctx, primitive) {} | |||||
| ~TransposeCPUKernel() override; | ~TransposeCPUKernel() override; | ||||
| int Init() override; | int Init() override; | ||||
| int ReSize() override; | int ReSize() override; | ||||
| int Run() override; | int Run() override; | ||||
| int TransposeParallel(int task_id); | |||||
| private: | |||||
| int thread_num_ = 1; | |||||
| int thread_h_stride_ = 0; | |||||
| int thread_h_num_ = 0; | |||||
| int num_unit_ = 0; | |||||
| protected: | |||||
| float *in_data_ = nullptr; | float *in_data_ = nullptr; | ||||
| float *out_data_ = nullptr; | float *out_data_ = nullptr; | ||||
| int *in_shape_ = nullptr; | |||||
| int *out_shape_ = nullptr; | int *out_shape_ = nullptr; | ||||
| int *dim_size_ = nullptr; | int *dim_size_ = nullptr; | ||||
| int *position_ = nullptr; | int *position_ = nullptr; | ||||
| @@ -131,4 +131,8 @@ smartreply.tflite 0.1 | |||||
| mindspore_text_classification_tflite.tflite 4 | mindspore_text_classification_tflite.tflite 4 | ||||
| #ml_location.tflite 0.1 | #ml_location.tflite 0.1 | ||||
| ml_text_correction.tflite 1 | ml_text_correction.tflite 1 | ||||
| # ml_pic_shopping.tflite involves subtract two close numbers. | |||||
| # In fp16 case, such subtract will cause a great relative error comparing to fp32. | |||||
| # e.g. fp32: 27.5 -27.4 = 0.1 | |||||
| # fp16: 27.6 - 27.4 = 0.2 | |||||
| #ml_pic_shopping.tflite 0.1 | #ml_pic_shopping.tflite 0.1 | ||||
| @@ -43,7 +43,6 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes4) { | |||||
| 1.2791597, -1.02032341, 0.17405411, -0.66358529, 1.20223761, -1.65733338, | 1.2791597, -1.02032341, 0.17405411, -0.66358529, 1.20223761, -1.65733338, | ||||
| -0.36793608, 1.91074871, 0.42663834, 1.8033383, 0.30183748, 0.3952082}; | -0.36793608, 1.91074871, 0.42663834, 1.8033383, 0.30183748, 0.3952082}; | ||||
| int input_shape[4] = {1, 2, 3, 4}; | |||||
| int output_shape[4] = {4, 3, 2, 1}; | int output_shape[4] = {4, 3, 2, 1}; | ||||
| int perm[8] = {3, 2, 1, 0, 0, 0, 0, 0}; | int perm[8] = {3, 2, 1, 0, 0, 0, 0, 0}; | ||||
| int strides[8] = {24, 12, 4, 1, 1, 1, 1, 1}; | int strides[8] = {24, 12, 4, 1, 1, 1, 1, 1}; | ||||
| @@ -64,7 +63,7 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes4) { | |||||
| param->out_strides_[i] = out_strides[i]; | param->out_strides_[i] = out_strides[i]; | ||||
| } | } | ||||
| auto ret = DoTransposeFp32(in, out, input_shape, output_shape, param, 0, 3, nullptr, nullptr); | |||||
| auto ret = DoTransposeFp32(in, out, output_shape, param, nullptr, nullptr); | |||||
| ASSERT_EQ(ret, 0); | ASSERT_EQ(ret, 0); | ||||
| delete param; | delete param; | ||||
| ASSERT_EQ(0, CompareOutputData(out, correct, 24, 0.000001)); | ASSERT_EQ(0, CompareOutputData(out, correct, 24, 0.000001)); | ||||
| @@ -83,7 +82,6 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes3) { | |||||
| -0.52817175, 1.13376944, 1.74481176, 0.04221375, 1.46210794, 0.90159072, | -0.52817175, 1.13376944, 1.74481176, 0.04221375, 1.46210794, 0.90159072, | ||||
| -1.07296862, -1.09989127, -0.7612069, 0.58281521, -2.06014071, 0.50249434}; | -1.07296862, -1.09989127, -0.7612069, 0.58281521, -2.06014071, 0.50249434}; | ||||
| int input_shape[3] = {2, 3, 4}; | |||||
| int output_shape[3] = {4, 3, 2}; | int output_shape[3] = {4, 3, 2}; | ||||
| int perm[8] = {2, 1, 0, 0, 0, 0, 0, 0}; | int perm[8] = {2, 1, 0, 0, 0, 0, 0, 0}; | ||||
| int strides[8] = {12, 4, 1, 1, 1, 1, 1, 1}; | int strides[8] = {12, 4, 1, 1, 1, 1, 1, 1}; | ||||
| @@ -104,7 +102,7 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes3) { | |||||
| param->out_strides_[i] = out_strides[i]; | param->out_strides_[i] = out_strides[i]; | ||||
| } | } | ||||
| auto ret = DoTransposeFp32(in, out, input_shape, output_shape, param, 0, 3, nullptr, nullptr); | |||||
| auto ret = DoTransposeFp32(in, out, output_shape, param, nullptr, nullptr); | |||||
| ASSERT_EQ(ret, 0); | ASSERT_EQ(ret, 0); | ||||
| delete param; | delete param; | ||||
| ASSERT_EQ(0, CompareOutputData(out, correct, 24, 0.000001)); | ASSERT_EQ(0, CompareOutputData(out, correct, 24, 0.000001)); | ||||
| @@ -123,7 +121,6 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes2) { | |||||
| -0.52817175, 1.74481176, 1.46210794, 1.13376944, 0.04221375, 0.90159072, | -0.52817175, 1.74481176, 1.46210794, 1.13376944, 0.04221375, 0.90159072, | ||||
| -1.07296862, -0.7612069, -2.06014071, -1.09989127, 0.58281521, 0.50249434}; | -1.07296862, -0.7612069, -2.06014071, -1.09989127, 0.58281521, 0.50249434}; | ||||
| int input_shape[2] = {6, 4}; | |||||
| int output_shape[2] = {4, 6}; | int output_shape[2] = {4, 6}; | ||||
| int perm[8] = {1, 0, 0, 0, 0, 0, 0, 0}; | int perm[8] = {1, 0, 0, 0, 0, 0, 0, 0}; | ||||
| int strides[8] = {4, 1, 1, 1, 1, 1, 1, 1}; | int strides[8] = {4, 1, 1, 1, 1, 1, 1, 1}; | ||||
| @@ -145,7 +142,7 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes2) { | |||||
| param->out_strides_[i] = out_strides[i]; | param->out_strides_[i] = out_strides[i]; | ||||
| } | } | ||||
| auto ret = DoTransposeFp32(in, out, input_shape, output_shape, param, 0, 6, nullptr, nullptr); | |||||
| auto ret = DoTransposeFp32(in, out, output_shape, param, nullptr, nullptr); | |||||
| ASSERT_EQ(ret, 0); | ASSERT_EQ(ret, 0); | ||||
| delete param; | delete param; | ||||
| ASSERT_EQ(0, CompareOutputData(out, correct, 24, 0.000001)); | ASSERT_EQ(0, CompareOutputData(out, correct, 24, 0.000001)); | ||||