From c30824a7a2a27fc956b7942255c760f91ea1bcc6 Mon Sep 17 00:00:00 2001
From: zhaozhenlong
Date: Mon, 17 Aug 2020 16:34:03 +0800
Subject: [PATCH] fp16 ops check input data type

---
 .../runtime/kernel/arm/fp16/concat_fp16.cc    | 89 +++++++++++++------
 .../src/runtime/kernel/arm/fp16/concat_fp16.h | 11 +--
 .../runtime/kernel/arm/fp16/reduce_fp16.cc    | 52 +++++++----
 .../src/runtime/kernel/arm/fp16/reduce_fp16.h | 17 +---
 .../runtime/kernel/arm/fp16/transpose_fp16.cc | 81 ++++++++++++++---
 .../runtime/kernel/arm/fp16/transpose_fp16.h  | 11 +--
 6 files changed, 174 insertions(+), 87 deletions(-)

diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/concat_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/concat_fp16.cc
index 1c31d90c47..d9e61256e5 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/concat_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/concat_fp16.cc
@@ -42,35 +42,47 @@ int ConcatFp16CPUKernel::Init() {
 }
 
 int ConcatFp16CPUKernel::ReSize() {
-  for (auto ptr : fp16_inputs_) {
-    if (ptr != nullptr) {
-      free(ptr);
-      ptr = nullptr;
+  FreeTmpBuffer();
+
+  for (const auto &in_tensor : in_tensors_) {
+    float16_t *ptr = nullptr;
+    if (in_tensor->data_type() == kNumberTypeFloat32 || in_tensor->data_type() == kNumberTypeFloat) {
+      ptr = reinterpret_cast<float16_t *>(context_->allocator->Malloc(sizeof(float16_t) * in_tensor->ElementsNum()));
+      if (ptr == nullptr) {
+        MS_LOG(ERROR) << "malloc failed";
+        return RET_ERROR;
+      }
     }
+    fp16_inputs_.push_back(ptr);
   }
-  fp16_inputs_.clear();
-  for (size_t i = 0; i < in_tensors_.size(); ++i) {
-    float16_t *ptr = nullptr;
-    ptr = reinterpret_cast<float16_t *>(malloc(sizeof(float16_t) * in_tensors_[i]->ElementsNum()));
-    if (ptr == nullptr) {
+
+  auto &out_tensor = out_tensors_.at(0);
+  if (out_tensor->data_type() == kNumberTypeFloat32 || out_tensor->data_type() == kNumberTypeFloat) {
+    if (fp16_output_ != nullptr) {
+      context_->allocator->Free(fp16_output_);
+      fp16_output_ = nullptr;
+    }
+    fp16_output_ =
+      reinterpret_cast<float16_t *>(context_->allocator->Malloc(sizeof(float16_t) * out_tensors_[0]->ElementsNum()));
+    if (fp16_output_ == nullptr) {
       MS_LOG(ERROR) << "malloc failed";
       return RET_ERROR;
     }
-    fp16_inputs_.push_back(ptr);
   }
-  if (fp16_output_ != nullptr) {
-    free(fp16_output_);
-    fp16_output_ = nullptr;
-  }
-  fp16_output_ = reinterpret_cast<float16_t *>(malloc(sizeof(float16_t) * out_tensors_[0]->ElementsNum()));
-  if (fp16_output_ == nullptr) {
-    MS_LOG(ERROR) << "malloc failed";
-    return RET_ERROR;
-  }
   return ConcatBaseCPUKernel::ReSize();
 }
 
+void ConcatFp16CPUKernel::FreeTmpBuffer() {
+  for (auto ptr : fp16_inputs_) {
+    if (ptr != nullptr) {
+      context_->allocator->Free(ptr);
+      ptr = nullptr;
+    }
+  }
+  fp16_inputs_.clear();
+}
+
 int ConcatFp16CPUKernel::Run() {
   auto prepare_ret = Prepare();
   if (prepare_ret != RET_OK) {
@@ -78,28 +90,53 @@ int ConcatFp16CPUKernel::Run() {
     return prepare_ret;
   }
   auto input_num = in_tensors_.size();
-  std::vector<float *> inputs_addr(input_num, nullptr);
   std::vector<int *> inputs_output_shape(input_num + 1, nullptr);
   std::vector<std::vector<int>> shapes;
   for (size_t i = 0; i < input_num; ++i) {
-    inputs_addr[i] = reinterpret_cast<float *>(in_tensors_[i]->Data());
-    if (inputs_addr[i] == nullptr) {
-      MS_LOG(ERROR) << "got nullptr when cast in_tensor to float ptr";
-      return RET_ERROR;
+    const auto in_tensor = in_tensors_[i];
+    if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) {
+      auto in_tensor_data = reinterpret_cast<float *>(in_tensor->Data());
+      if (in_tensor_data == nullptr) {
+        MS_LOG(ERROR) << "got nullptr when cast in_tensor to float ptr";
+        return RET_ERROR;
+      }
+      Float32ToFloat16(in_tensor_data, fp16_inputs_[i], in_tensor->ElementsNum());
+    } else {
+      fp16_inputs_[i] = reinterpret_cast<float16_t *>(in_tensor->Data());
     }
-    Float32ToFloat16(inputs_addr[i], fp16_inputs_[i], in_tensors_[i]->ElementsNum());
     shapes.push_back(in_tensors_[i]->shape());
     inputs_output_shape[i] = shapes[i].data();
   }
   auto output_shape = out_tensors_.at(0)->shape();
   inputs_output_shape[input_num] = output_shape.data();
   auto output_addr = out_tensors_.at(0)->Data();
+  if (out_tensors_.at(0)->data_type() == kNumberTypeFloat16) {
+    fp16_output_ = reinterpret_cast<float16_t *>(out_tensors_.at(0)->Data());
+  }
   ConcatFp16(reinterpret_cast<void **>(fp16_inputs_.data()), input_num, axis_, inputs_output_shape.data(),
              output_shape.size(), reinterpret_cast<void *>(fp16_output_));
-  Float16ToFloat32(fp16_output_, reinterpret_cast<float *>(output_addr), out_tensors_.at(0)->ElementsNum());
+
+  // free fp16 in out buffer
+  if (out_tensors_.at(0)->data_type() == kNumberTypeFloat32 || out_tensors_.at(0)->data_type() == kNumberTypeFloat) {
+    Float16ToFloat32(fp16_output_, reinterpret_cast<float *>(output_addr), out_tensors_.at(0)->ElementsNum());
+    context_->allocator->Free(fp16_output_);
+    fp16_output_ = nullptr;
+  }
+  for (auto i = 0; i < fp16_inputs_.size(); i++) {
+    const auto in_tensor = in_tensors_[i];
+    if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) {
+      auto ptr = fp16_inputs_[i];
+      if (ptr != nullptr) {
+        context_->allocator->Free(ptr);
+        ptr = nullptr;
+      }
+    }
+  }
+  fp16_inputs_.clear();
+
   return RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/concat_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/concat_fp16.h
index 3f12f1998a..598bf3e9cb 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/concat_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/concat_fp16.h
@@ -32,13 +32,7 @@ class ConcatFp16CPUKernel : public ConcatBaseCPUKernel {
                       const lite::Primitive *primitive)
       : ConcatBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
 
-  ~ConcatFp16CPUKernel() {
-    for (auto ptr : fp16_inputs_) {
-      if (ptr != nullptr) {
-        free(ptr);
-      }
-    }
-  }
+  ~ConcatFp16CPUKernel() = default;
 
   int Init() override;
 
@@ -46,6 +40,9 @@ class ConcatFp16CPUKernel : public ConcatBaseCPUKernel {
 
   int Run() override;
 
+ private:
+  void FreeTmpBuffer();
+
  private:
   std::vector<float16_t *> fp16_inputs_;
   float16_t *fp16_output_ = nullptr;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.cc
index 14fe42874c..ce2b517bca 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.cc
@@ -59,14 +59,17 @@ int ReduceFp16CPUKernel::Init() {
 
 int ReduceFp16CPUKernel::ReSize() {
   if (fp16_input_ != nullptr) {
-    free(fp16_input_);
+    context_->allocator->Free(fp16_input_);
     fp16_input_ = nullptr;
   }
-  auto ele_num = in_tensors_.at(0)->ElementsNum();
-  fp16_input_ = reinterpret_cast<float16_t *>(malloc(sizeof(float16_t) * ele_num));
-  if (fp16_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_src_data_ falied";
-    return RET_ERROR;
+  auto in_tensor = in_tensors_.front();
+  if (in_tensor->data_type() == kNumberTypeFloat32 || in_tensor->data_type() == kNumberTypeFloat) {
+    fp16_input_ =
+      reinterpret_cast<float16_t *>(context_->allocator->Malloc(in_tensor->ElementsNum() * sizeof(float16_t)));
+    if (fp16_input_ == nullptr) {
+      return RET_ERROR;
+    }
+    Float32ToFloat16(reinterpret_cast<float *>(in_tensor->Data()), fp16_input_, in_tensor->ElementsNum());
   }
   return MallocTmpBuffer();
 }
@@ -93,10 +96,12 @@ int ReduceFp16CPUKernel::Run() {
     MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
     return prepare_ret;
   }
+
   tmp_shape_ = in_tensors_.at(0)->shape();
-  src_data_ = static_cast<float *>(in_tensors_.at(0)->Data());
-  auto ele_num = in_tensors_.at(0)->ElementsNum();
-  Float32ToFloat16(src_data_, fp16_input_, ele_num);
+  auto in_tensor = in_tensors_.at(0);
+  if (in_tensor->data_type() == kNumberTypeFloat16) {
+    fp16_input_ = reinterpret_cast<float16_t *>(in_tensor->Data());
+  }
   fp16_src_data_ = fp16_input_;
   for (int i = 0; i < data_buffers_.size(); ++i) {
     fp16_dst_data_ = data_buffers_[i];
@@ -119,19 +124,36 @@ int ReduceFp16CPUKernel::Run() {
     fp16_src_data_ = fp16_dst_data_;
   }
 
-  dst_data_ = reinterpret_cast<float *>(out_tensors_.at(0)->Data());
-  Float16ToFloat32(fp16_dst_data_, dst_data_, out_tensors_.at(0)->ElementsNum());
+  auto out_tensor = out_tensors_.at(0);
+  if (out_tensor->data_type() == kNumberTypeFloat32 || out_tensor->data_type() == kNumberTypeFloat) {
+    dst_data_ = reinterpret_cast<float *>(out_tensor->Data());
+    Float16ToFloat32(fp16_dst_data_, dst_data_, out_tensor->ElementsNum());
+  } else {
+    memcpy(out_tensor->Data(), fp16_dst_data_, out_tensor->ElementsNum() * sizeof(float16_t));
+  }
+
+  if (in_tensor->data_type() == kNumberTypeFloat32 || in_tensor->data_type() == kNumberTypeFloat) {
+    context_->allocator->Free(fp16_input_);
+  }
+  fp16_input_ = nullptr;
+
+  FreeTmpBuffer();
   return RET_OK;
 }
 
-int ReduceFp16CPUKernel::MallocTmpBuffer() {
+int ReduceFp16CPUKernel::FreeTmpBuffer() {
   for (auto buffer : data_buffers_) {
     if (buffer != nullptr) {
-      free(buffer);
+      context_->allocator->Free(buffer);
       buffer = nullptr;
     }
   }
   data_buffers_.clear();
+  return RET_OK;
+}
+
+int ReduceFp16CPUKernel::MallocTmpBuffer() {
+  auto ret = FreeTmpBuffer();
 
   auto input_shape = in_tensors_.at(0)->shape();
   for (auto i = 0; i < num_axes_; i++) {
@@ -142,7 +164,7 @@ int ReduceFp16CPUKernel::MallocTmpBuffer() {
         size *= input_shape[j];
       }
     }
-    float16_t *buffer = reinterpret_cast<float16_t *>(malloc(size * sizeof(float16_t)));
+    float16_t *buffer = reinterpret_cast<float16_t *>(context_->allocator->Malloc(size * sizeof(float16_t)));
     if (buffer == nullptr) {
       MS_LOG(ERROR) << "Malloc data failed.";
       return RET_ERROR;
@@ -150,7 +172,7 @@ int ReduceFp16CPUKernel::MallocTmpBuffer() {
     data_buffers_.emplace_back(buffer);
     input_shape[axis] = 1;
   }
-  return RET_OK;
+  return ret;
 }
 
 kernel::LiteKernel *CpuReduceFp16KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.h
index d4d2665001..92a144cd01 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.h
@@ -34,21 +34,7 @@ class ReduceFp16CPUKernel : public ReduceBaseCPUKernel {
                       const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
                       const lite::Primitive *primitive)
       : ReduceBaseCPUKernel(param, inputs, outputs, ctx, primitive) {}
-  ~ReduceFp16CPUKernel() {
-    for (auto i = 0; i < data_buffers_.size(); i++) {
-      float16_t *buffer = data_buffers_[i];
-      if (buffer != nullptr) {
-        free(buffer);
-        buffer = nullptr;
-      }
-    }
-    if (fp16_input_ != nullptr) {
-      free(fp16_input_);
-      fp16_input_ = nullptr;
-    }
-    src_data_ = nullptr;
-    dst_data_ = nullptr;
-  }
+  ~ReduceFp16CPUKernel() = default;
 
   int Init() override;
   int ReSize() override;
@@ -66,6 +52,7 @@ class ReduceFp16CPUKernel : public ReduceBaseCPUKernel {
 
  private:
   int MallocTmpBuffer();
+  int FreeTmpBuffer();
 };
 }  // namespace mindspore::kernel
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.cc
index 1094446ba6..e5fc82ccb6 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.cc
@@ -46,29 +46,44 @@ int TransposeFp16CPUKernel::Init() {
 }
 
 int TransposeFp16CPUKernel::ReSize() {
-  auto &inTensor = in_tensors_.front();
-  auto &outTensor = out_tensors_.front();
+  auto &in_tensor = in_tensors_.front();
+  auto &out_tensor = out_tensors_.front();
   auto param = reinterpret_cast<TransposeParameter *>(op_parameter_);
-  auto in_shape = inTensor->shape();
-  auto out_shape = outTensor->shape();
+  auto in_shape = in_tensor->shape();
+  auto out_shape = out_tensor->shape();
   param->strides_[param->num_axes_ - 1] = 1;
   param->out_strides_[param->num_axes_ - 1] = 1;
-  param->data_size_ = inTensor->Size();
+  param->data_size_ = in_tensor->Size();
   for (int i = param->num_axes_ - 2; i >= 0; i--) {
     param->strides_[i] = in_shape[i + 1] * param->strides_[i + 1];
     param->out_strides_[i] = out_shape[i + 1] * param->out_strides_[i + 1];
   }
 
   if (fp16_in_data_ != nullptr) {
-    free(fp16_in_data_);
+    context_->allocator->Free(fp16_in_data_);
     fp16_in_data_ = nullptr;
   }
-  fp16_in_data_ = reinterpret_cast<float16_t *>(malloc(sizeof(float16_t) * inTensor->ElementsNum()));
+  if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) {
+    fp16_in_data_ =
+      reinterpret_cast<float16_t *>(context_->allocator->Malloc(sizeof(float16_t) * in_tensor->ElementsNum()));
+    if (fp16_in_data_ == nullptr) {
+      MS_LOG(ERROR) << "Malloc data failed";
+      return RET_ERROR;
+    }
+  }
+
   if (fp16_out_data_ != nullptr) {
-    free(fp16_out_data_);
+    context_->allocator->Free(fp16_out_data_);
     fp16_out_data_ = nullptr;
   }
-  fp16_out_data_ = reinterpret_cast<float16_t *>(malloc(sizeof(float16_t) * outTensor->ElementsNum()));
+  if (out_tensor->data_type() == kNumberTypeFloat || out_tensor->data_type() == kNumberTypeFloat32) {
+    fp16_out_data_ =
+      reinterpret_cast<float16_t *>(context_->allocator->Malloc(sizeof(float16_t) * out_tensor->ElementsNum()));
+    if (fp16_out_data_ == nullptr) {
+      MS_LOG(ERROR) << "Malloc data failed";
+      return RET_ERROR;
+    }
+  }
   return RET_OK;
 }
 
@@ -79,12 +94,27 @@ int TransposeFp16CPUKernel::TransposeParallel(int task_id) {
   }
   int thread_offset = task_id * thread_h_stride_;
   TransposeParameter *param = reinterpret_cast<TransposeParameter *>(this->op_parameter_);
+
+  if (in_tensors_.at(0)->data_type() == kNumberTypeFloat16) {
+    fp16_in_data_ = reinterpret_cast<float16_t *>(in_tensors_.at(0)->Data());
+  }
+  if (out_tensors_.at(0)->data_type() == kNumberTypeFloat16) {
+    fp16_out_data_ = reinterpret_cast<float16_t *>(out_tensors_.at(0)->Data());
+  }
+
   auto ret = DoTranspose(fp16_in_data_, fp16_out_data_, in_shape_, out_shape_, param, thread_offset,
                          thread_offset + num_unit_thread);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Transpose error task_id[" << task_id << "] error_code[" << ret << "]";
     return RET_ERROR;
   }
+
+  if (in_tensors_.at(0)->data_type() == kNumberTypeFloat32 || in_tensors_.at(0)->data_type() == kNumberTypeFloat) {
+    context_->allocator->Free(fp16_in_data_);
+  }
+  if (out_tensors_.at(0)->data_type() == kNumberTypeFloat32 || out_tensors_.at(0)->data_type() == kNumberTypeFloat) {
+    context_->allocator->Free(fp16_out_data_);
+  }
   return RET_OK;
 }
 
@@ -112,9 +142,17 @@ int TransposeFp16CPUKernel::Run() {
     MS_LOG(ERROR) << "null pointer dreferencing.";
     return RET_ERROR;
   }
-  in_data_ = reinterpret_cast<float *>(in_tensor->Data());
-  out_data_ = reinterpret_cast<float *>(out_tensor->Data());
-  Float32ToFloat16(in_data_, fp16_in_data_, in_tensor->ElementsNum());
+
+  if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) {
+    in_data_ = reinterpret_cast<float *>(in_tensor->Data());
+    Float32ToFloat16(in_data_, fp16_in_data_, in_tensor->ElementsNum());
+  } else {
+    fp16_in_data_ = reinterpret_cast<float16_t *>(in_tensor->Data());
+  }
+  if (out_tensor->data_type() == kNumberTypeFloat16) {
+    fp16_out_data_ = reinterpret_cast<float16_t *>(out_tensor->Data());
+  }
+
   in_shape_ = const_cast<int *>(in_tensor->shape().data());
   out_shape_ = const_cast<int *>(out_tensor->shape().data());
 
@@ -123,9 +161,24 @@ int TransposeFp16CPUKernel::Run() {
     MS_LOG(ERROR) << "Tranpose error error_code[" << ret << "]";
     return ret;
   }
-  Float16ToFloat32(fp16_out_data_, out_data_, out_tensor->ElementsNum());
+
+  if (in_tensor->data_type() == kNumberTypeFloat || in_tensor->data_type() == kNumberTypeFloat32) {
+    context_->allocator->Free(fp16_in_data_);
+    fp16_in_data_ = nullptr;
+  }
+  if (out_tensor->data_type() == kNumberTypeFloat || out_tensor->data_type() == kNumberTypeFloat32) {
+    out_data_ = reinterpret_cast<float *>(out_tensor->Data());
+    if (out_data_ == nullptr) {
+      return RET_ERROR;
+    }
+    Float16ToFloat32(fp16_out_data_, out_data_, out_tensor->ElementsNum());
+
+    context_->allocator->Free(fp16_out_data_);
+    fp16_out_data_ = nullptr;
+  }
+
   return ret;
-} // namespace mindspore::kernel
+}
 
 kernel::LiteKernel *CpuTransposeFp16KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                                   const std::vector<lite::tensor::Tensor *> &outputs,
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.h
index 69732290ee..49d3e2ee51 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.h
@@ -30,16 +30,7 @@ class TransposeFp16CPUKernel : public LiteKernel {
                          const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
                          const lite::Primitive *primitive)
       : LiteKernel(param, inputs, outputs, ctx, primitive), thread_num_(ctx->thread_num_) {}
-  ~TransposeFp16CPUKernel() {
-    if (fp16_in_data_ != nullptr) {
-      free(fp16_in_data_);
-      fp16_in_data_ = nullptr;
-    }
-    if (fp16_out_data_ != nullptr) {
-      free(fp16_out_data_);
-      fp16_out_data_ = nullptr;
-    }
-  }
+  ~TransposeFp16CPUKernel() = default;
 
   int Init() override;
   int ReSize() override;