Merge pull request !4716 from zhaozhenlong/lite/issue/fix_fp16_transpose
tags/v0.7.0-beta
| @@ -43,7 +43,15 @@ int ConcatFp16CPUKernel::Init() { | |||
// Re-creates the kernel's temporary buffers and then defers to the base-class resize.
// Steps: release whatever FreeTmpBuffer() releases, allocate again through
// MallocTmpBuffer(), and on allocation failure release once more and return
// MallocTmpBuffer()'s error code unchanged.
// Returns: the MallocTmpBuffer() error code on failure, otherwise the result of
// ConcatBaseCPUKernel::ReSize().
| int ConcatFp16CPUKernel::ReSize() { | |||
| FreeTmpBuffer(); | |||
| auto ret = MallocTmpBuffer(); | |||
| if (ret != RET_OK) { | |||
// Roll back on failure; presumably MallocTmpBuffer() can leave some buffers
// allocated when it fails part-way — TODO confirm against its body.
| FreeTmpBuffer(); | |||
| return ret; | |||
| } | |||
| return ConcatBaseCPUKernel::ReSize(); | |||
| } | |||
| int ConcatFp16CPUKernel::MallocTmpBuffer() { | |||
| for (const auto &in_tensor : in_tensors_) { | |||
| float16_t *ptr = nullptr; | |||
| if (in_tensor->data_type() == kNumberTypeFloat32 || in_tensor->data_type() == kNumberTypeFloat) { | |||
| @@ -58,10 +66,6 @@ int ConcatFp16CPUKernel::ReSize() { | |||
| auto &out_tensor = out_tensors_.at(0); | |||
| if (out_tensor->data_type() == kNumberTypeFloat32 || out_tensor->data_type() == kNumberTypeFloat) { | |||
| if (fp16_output_ != nullptr) { | |||
| context_->allocator->Free(fp16_output_); | |||
| fp16_output_ = nullptr; | |||
| } | |||
| fp16_output_ = | |||
| reinterpret_cast<float16_t *>(context_->allocator->Malloc(sizeof(float16_t) * out_tensors_[0]->ElementsNum())); | |||
| if (fp16_output_ == nullptr) { | |||
| @@ -70,17 +74,29 @@ int ConcatFp16CPUKernel::ReSize() { | |||
| } | |||
| } | |||
| return ConcatBaseCPUKernel::ReSize(); | |||
| return RET_OK; | |||
| } | |||
| void ConcatFp16CPUKernel::FreeTmpBuffer() { | |||
| for (auto ptr : fp16_inputs_) { | |||
| if (ptr != nullptr) { | |||
| context_->allocator->Free(ptr); | |||
| ptr = nullptr; | |||
| for (auto i = 0; i < fp16_inputs_.size(); i++) { | |||
| auto &in_tensor = in_tensors_.at(i); | |||
| auto in_ptr = fp16_inputs_.at(i); | |||
| if (in_tensor->data_type() == kNumberTypeFloat32 || in_tensor->data_type() == kNumberTypeFloat) { | |||
| if (in_ptr != nullptr) { | |||
| context_->allocator->Free(in_ptr); | |||
| in_ptr = nullptr; | |||
| } | |||
| } | |||
| } | |||
| fp16_inputs_.clear(); | |||
| auto &out_tensor = out_tensors_.at(0); | |||
| if (out_tensor->data_type() == kNumberTypeFloat32 || out_tensor->data_type() == kNumberTypeFloat) { | |||
| if (fp16_output_ != nullptr) { | |||
| context_->allocator->Free(fp16_output_); | |||
| fp16_output_ = nullptr; | |||
| } | |||
| } | |||
| } | |||
| int ConcatFp16CPUKernel::Run() { | |||
| @@ -119,24 +135,10 @@ int ConcatFp16CPUKernel::Run() { | |||
| ConcatFp16(reinterpret_cast<void **>(fp16_inputs_.data()), input_num, axis_, inputs_output_shape.data(), | |||
| output_shape.size(), reinterpret_cast<void *>(fp16_output_)); | |||
| // free fp16 in out buffer | |||
| if (out_tensors_.at(0)->data_type() == kNumberTypeFloat32 || out_tensors_.at(0)->data_type() == kNumberTypeFloat) { | |||
| Float16ToFloat32(fp16_output_, reinterpret_cast<float *>(output_addr), out_tensors_.at(0)->ElementsNum()); | |||
| context_->allocator->Free(fp16_output_); | |||
| fp16_output_ = nullptr; | |||
| } | |||
| for (auto i = 0; i < fp16_inputs_.size(); i++) { | |||
| const auto in_tensor = in_tensors_[i]; | |||
| if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) { | |||
| auto ptr = fp16_inputs_[i]; | |||
| if (ptr != nullptr) { | |||
| context_->allocator->Free(ptr); | |||
| ptr = nullptr; | |||
| } | |||
| } | |||
| } | |||
| fp16_inputs_.clear(); | |||
| FreeTmpBuffer(); | |||
| return RET_OK; | |||
| } | |||
| @@ -164,5 +166,5 @@ kernel::LiteKernel *CpuConcatFp16KernelCreator(const std::vector<lite::tensor::T | |||
| } | |||
| return kernel; | |||
| } | |||
| // REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Concat, CpuConcatFp16KernelCreator) | |||
| REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Concat, CpuConcatFp16KernelCreator) | |||
| } // namespace mindspore::kernel | |||
| @@ -41,6 +41,7 @@ class ConcatFp16CPUKernel : public ConcatBaseCPUKernel { | |||
| int Run() override; | |||
| private: | |||
| int MallocTmpBuffer(); | |||
| void FreeTmpBuffer(); | |||
| private: | |||
| @@ -58,20 +58,13 @@ int ReduceFp16CPUKernel::Init() { | |||
| } | |||
| int ReduceFp16CPUKernel::ReSize() { | |||
| if (fp16_input_ != nullptr) { | |||
| context_->allocator->Free(fp16_input_); | |||
| fp16_input_ = nullptr; | |||
| } | |||
| auto in_tensor = in_tensors_.front(); | |||
| if (in_tensor->data_type() == kNumberTypeFloat32 || in_tensor->data_type() == kNumberTypeFloat) { | |||
| fp16_input_ = | |||
| reinterpret_cast<float16_t *>(context_->allocator->Malloc(in_tensor->ElementsNum() * sizeof(float16_t))); | |||
| if (fp16_input_ == nullptr) { | |||
| return RET_ERROR; | |||
| } | |||
| Float32ToFloat16(reinterpret_cast<float *>(in_tensor->Data()), fp16_input_, in_tensor->ElementsNum()); | |||
| FreeTmpBuffer(); | |||
| auto ret = MallocTmpBuffer(); | |||
| if (ret != RET_OK) { | |||
| FreeTmpBuffer(); | |||
| return ret; | |||
| } | |||
| return MallocTmpBuffer(); | |||
| return RET_OK; | |||
| } | |||
| int ReduceFp16CPUKernel::CallReduceUnit(int task_id) { | |||
| @@ -99,9 +92,13 @@ int ReduceFp16CPUKernel::Run() { | |||
| tmp_shape_ = in_tensors_.at(0)->shape(); | |||
| auto in_tensor = in_tensors_.at(0); | |||
| if (in_tensor->data_type() == kNumberTypeFloat16) { | |||
| if (in_tensor->data_type() == kNumberTypeFloat32 || in_tensor->data_type() == kNumberTypeFloat) { | |||
| auto input_data = reinterpret_cast<float *>(in_tensor->Data()); | |||
| Float32ToFloat16(input_data, fp16_input_, in_tensor->ElementsNum()); | |||
| } else { | |||
| fp16_input_ = reinterpret_cast<float16_t *>(in_tensor->Data()); | |||
| } | |||
| fp16_src_data_ = fp16_input_; | |||
| for (int i = 0; i < data_buffers_.size(); ++i) { | |||
| fp16_dst_data_ = data_buffers_[i]; | |||
| @@ -117,6 +114,7 @@ int ReduceFp16CPUKernel::Run() { | |||
| axis_size_ = tmp_shape_[axis]; | |||
| auto error_code = LiteBackendParallelLaunch(ReduceImpl, this, context_->thread_num_); | |||
| if (error_code != RET_OK) { | |||
| FreeTmpBuffer(); | |||
| MS_LOG(ERROR) << "Reduce run error, error_code[" << error_code << "]"; | |||
| return RET_ERROR; | |||
| } | |||
| @@ -132,16 +130,11 @@ int ReduceFp16CPUKernel::Run() { | |||
| memcpy(out_tensor->Data(), fp16_dst_data_, out_tensor->ElementsNum() * sizeof(float16_t)); | |||
| } | |||
| if (in_tensor->data_type() == kNumberTypeFloat32 || in_tensor->data_type() == kNumberTypeFloat) { | |||
| context_->allocator->Free(fp16_input_); | |||
| } | |||
| fp16_input_ = nullptr; | |||
| FreeTmpBuffer(); | |||
| return RET_OK; | |||
| } | |||
| int ReduceFp16CPUKernel::FreeTmpBuffer() { | |||
| void ReduceFp16CPUKernel::FreeTmpBuffer() { | |||
| for (auto buffer : data_buffers_) { | |||
| if (buffer != nullptr) { | |||
| context_->allocator->Free(buffer); | |||
| @@ -149,12 +142,17 @@ int ReduceFp16CPUKernel::FreeTmpBuffer() { | |||
| } | |||
| } | |||
| data_buffers_.clear(); | |||
| return RET_OK; | |||
| auto in_tensor = in_tensors_.at(0); | |||
| if (in_tensor->data_type() == kNumberTypeFloat32 || in_tensor->data_type() == kNumberTypeFloat) { | |||
| if (fp16_input_ != nullptr) { | |||
| context_->allocator->Free(fp16_input_); | |||
| fp16_input_ = nullptr; | |||
| } | |||
| } | |||
| } | |||
| int ReduceFp16CPUKernel::MallocTmpBuffer() { | |||
| auto ret = FreeTmpBuffer(); | |||
| auto input_shape = in_tensors_.at(0)->shape(); | |||
| for (auto i = 0; i < num_axes_; i++) { | |||
| int axis = axes_[i]; | |||
| @@ -166,13 +164,23 @@ int ReduceFp16CPUKernel::MallocTmpBuffer() { | |||
| } | |||
| float16_t *buffer = reinterpret_cast<float16_t *>(context_->allocator->Malloc(size * sizeof(float16_t))); | |||
| if (buffer == nullptr) { | |||
| MS_LOG(ERROR) << "Malloc data failed."; | |||
| MS_LOG(ERROR) << "Malloc data failed"; | |||
| return RET_ERROR; | |||
| } | |||
| data_buffers_.emplace_back(buffer); | |||
| input_shape[axis] = 1; | |||
| } | |||
| return ret; | |||
| auto in_tensor = in_tensors_.front(); | |||
| if (in_tensor->data_type() == kNumberTypeFloat32 || in_tensor->data_type() == kNumberTypeFloat) { | |||
| fp16_input_ = | |||
| reinterpret_cast<float16_t *>(context_->allocator->Malloc(in_tensor->ElementsNum() * sizeof(float16_t))); | |||
| if (fp16_input_ == nullptr) { | |||
| MS_LOG(ERROR) << "Malloc data failed"; | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| return RET_OK; | |||
| } | |||
| kernel::LiteKernel *CpuReduceFp16KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs, | |||
| @@ -235,6 +243,6 @@ kernel::LiteKernel *CpuMeanFp16KernelCreator(const std::vector<lite::tensor::Ten | |||
| return kernel; | |||
| } | |||
| // REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Reduce, CpuReduceFp16KernelCreator) | |||
| // REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Mean, CpuMeanFp16KernelCreator) | |||
| REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Reduce, CpuReduceFp16KernelCreator) | |||
| REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Mean, CpuMeanFp16KernelCreator) | |||
| } // namespace mindspore::kernel | |||
| @@ -52,7 +52,7 @@ class ReduceFp16CPUKernel : public ReduceBaseCPUKernel { | |||
| private: | |||
| int MallocTmpBuffer(); | |||
| int FreeTmpBuffer(); | |||
| void FreeTmpBuffer(); | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| @@ -72,5 +72,5 @@ int ReshapeCPUKernel::Run() { | |||
| context_->allocator->Free(input_ptr); | |||
| } | |||
| return RET_OK; | |||
| } // namespace mindspore::kernel | |||
| } | |||
| } // namespace mindspore::kernel | |||
| @@ -140,5 +140,4 @@ kernel::LiteKernel *CpuSplitFp16KernelCreator(const std::vector<lite::tensor::Te | |||
| return kernel; | |||
| } | |||
| REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Split, CpuSplitFp16KernelCreator) | |||
| } // namespace mindspore::kernel | |||
| @@ -59,10 +59,19 @@ int TransposeFp16CPUKernel::ReSize() { | |||
| param->out_strides_[i] = out_shape[i + 1] * param->out_strides_[i + 1]; | |||
| } | |||
| if (fp16_in_data_ != nullptr) { | |||
| context_->allocator->Free(fp16_in_data_); | |||
| fp16_in_data_ = nullptr; | |||
| FreeFp16Buffer(); | |||
| auto ret = MallocFp16Buffer(); | |||
| if (ret != RET_OK) { | |||
| FreeFp16Buffer(); | |||
| return ret; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int TransposeFp16CPUKernel::MallocFp16Buffer() { | |||
| auto &in_tensor = in_tensors_.front(); | |||
| auto &out_tensor = out_tensors_.front(); | |||
| if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) { | |||
| fp16_in_data_ = | |||
| reinterpret_cast<float16_t *>(context_->allocator->Malloc(sizeof(float16_t) * in_tensor->ElementsNum())); | |||
| @@ -71,11 +80,6 @@ int TransposeFp16CPUKernel::ReSize() { | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| if (fp16_out_data_ != nullptr) { | |||
| context_->allocator->Free(fp16_out_data_); | |||
| fp16_out_data_ = nullptr; | |||
| } | |||
| if (out_tensor->data_type() == kNumberTypeFloat || out_tensor->data_type() == kNumberTypeFloat32) { | |||
| fp16_out_data_ = | |||
| reinterpret_cast<float16_t *>(context_->allocator->Malloc(sizeof(float16_t) * out_tensor->ElementsNum())); | |||
| @@ -87,6 +91,24 @@ int TransposeFp16CPUKernel::ReSize() { | |||
| return RET_OK; | |||
| } | |||
// Releases the fp16 staging buffers fp16_in_data_ and fp16_out_data_.
// A buffer is handed back to context_->allocator only when the matching tensor
// (first input / first output) has data type kNumberTypeFloat or
// kNumberTypeFloat32 and the pointer is non-null; the pointer is then reset to
// nullptr. For any other tensor data type the pointer is left untouched.
// NOTE(review): presumably in the non-float32 case the pointer aliases memory
// owned by the tensor and must not be freed here — confirm against Run().
| void TransposeFp16CPUKernel::FreeFp16Buffer() { | |||
| auto &in_tensor = in_tensors_.front(); | |||
| auto &out_tensor = out_tensors_.front(); | |||
| if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) { | |||
| if (fp16_in_data_ != nullptr) { | |||
| context_->allocator->Free(fp16_in_data_); | |||
| fp16_in_data_ = nullptr; | |||
| } | |||
| } | |||
| if (out_tensor->data_type() == kNumberTypeFloat || out_tensor->data_type() == kNumberTypeFloat32) { | |||
| if (fp16_out_data_ != nullptr) { | |||
| context_->allocator->Free(fp16_out_data_); | |||
| fp16_out_data_ = nullptr; | |||
| } | |||
| } | |||
| } | |||
| int TransposeFp16CPUKernel::TransposeParallel(int task_id) { | |||
| int num_unit_thread = MSMIN(thread_h_stride_, num_unit_ - task_id * thread_h_stride_); | |||
| if (num_unit_thread <= 0) { | |||
| @@ -95,13 +117,6 @@ int TransposeFp16CPUKernel::TransposeParallel(int task_id) { | |||
| int thread_offset = task_id * thread_h_stride_; | |||
| TransposeParameter *param = reinterpret_cast<TransposeParameter *>(this->op_parameter_); | |||
| if (in_tensors_.at(0)->data_type() == kNumberTypeFloat16) { | |||
| fp16_in_data_ = reinterpret_cast<float16_t *>(in_tensors_.at(0)->Data()); | |||
| } | |||
| if (out_tensors_.at(0)->data_type() == kNumberTypeFloat16) { | |||
| fp16_out_data_ = reinterpret_cast<float16_t *>(out_tensors_.at(0)->Data()); | |||
| } | |||
| auto ret = DoTranspose(fp16_in_data_, fp16_out_data_, in_shape_, out_shape_, param, thread_offset, | |||
| thread_offset + num_unit_thread); | |||
| if (ret != RET_OK) { | |||
| @@ -109,12 +124,6 @@ int TransposeFp16CPUKernel::TransposeParallel(int task_id) { | |||
| return RET_ERROR; | |||
| } | |||
| if (in_tensors_.at(0)->data_type() == kNumberTypeFloat32 || in_tensors_.at(0)->data_type() == kNumberTypeFloat) { | |||
| context_->allocator->Free(fp16_in_data_); | |||
| } | |||
| if (out_tensors_.at(0)->data_type() == kNumberTypeFloat32 || out_tensors_.at(0)->data_type() == kNumberTypeFloat) { | |||
| context_->allocator->Free(fp16_out_data_); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| @@ -139,7 +148,8 @@ int TransposeFp16CPUKernel::Run() { | |||
| auto &in_tensor = in_tensors_.front(); | |||
| auto &out_tensor = out_tensors_.front(); | |||
| if (in_tensor == nullptr || out_tensor == nullptr) { | |||
| MS_LOG(ERROR) << "null pointer dreferencing."; | |||
| MS_LOG(ERROR) << "null pointer referencing."; | |||
| FreeFp16Buffer(); | |||
| return RET_ERROR; | |||
| } | |||
| @@ -159,23 +169,15 @@ int TransposeFp16CPUKernel::Run() { | |||
| ret = LiteBackendParallelLaunch(TransposeRun, this, thread_h_num_); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Tranpose error error_code[" << ret << "]"; | |||
| FreeFp16Buffer(); | |||
| return ret; | |||
| } | |||
| if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) { | |||
| context_->allocator->Free(fp16_in_data_); | |||
| fp16_in_data_ = nullptr; | |||
| } | |||
| if (out_tensor->data_type() == kNumberTypeFloat || out_tensor->data_type() == kNumberTypeFloat32) { | |||
| out_data_ = reinterpret_cast<float *>(out_tensor->Data()); | |||
| if (out_data_ == nullptr) { | |||
| return RET_ERROR; | |||
| } | |||
| Float16ToFloat32(fp16_out_data_, out_data_, out_tensor->ElementsNum()); | |||
| context_->allocator->Free(fp16_out_data_); | |||
| fp16_out_data_ = nullptr; | |||
| } | |||
| FreeFp16Buffer(); | |||
| return ret; | |||
| } | |||
| @@ -206,5 +208,5 @@ kernel::LiteKernel *CpuTransposeFp16KernelCreator(const std::vector<lite::tensor | |||
| return kernel; | |||
| } | |||
| // REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Transpose, CpuTransposeFp16KernelCreator) | |||
| REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Transpose, CpuTransposeFp16KernelCreator) | |||
| } // namespace mindspore::kernel | |||
| @@ -36,6 +36,8 @@ class TransposeFp16CPUKernel : public LiteKernel { | |||
| int ReSize() override; | |||
| int Run() override; | |||
| int TransposeParallel(int task_id); | |||
| void FreeFp16Buffer(); | |||
| int MallocFp16Buffer(); | |||
| private: | |||
| int thread_num_; | |||