From 77ddb17b1a053335b1c29a1e4edfb2b516fead6c Mon Sep 17 00:00:00 2001 From: zhaozhenlong Date: Sat, 26 Dec 2020 14:31:40 +0800 Subject: [PATCH] fix reduce ops buffer_sizes_ and data_buffers_ size --- .../runtime/kernel/arm/base/reduce_base.cc | 3 +- .../runtime/kernel/arm/fp16/reduce_fp16.cc | 42 +++++-------------- .../src/runtime/kernel/arm/fp16/reduce_fp16.h | 2 - .../runtime/kernel/arm/int8/reduce_int8.cc | 3 +- 4 files changed, 15 insertions(+), 35 deletions(-) diff --git a/mindspore/lite/src/runtime/kernel/arm/base/reduce_base.cc b/mindspore/lite/src/runtime/kernel/arm/base/reduce_base.cc index ebe0cd518d..ceb311f830 100644 --- a/mindspore/lite/src/runtime/kernel/arm/base/reduce_base.cc +++ b/mindspore/lite/src/runtime/kernel/arm/base/reduce_base.cc @@ -146,7 +146,8 @@ void ReduceBaseCPUKernel::CalculateInnerOuterSize() { void ReduceBaseCPUKernel::CalculateTmpBufferSize() { buffer_sizes_.clear(); auto input_shape = in_tensors_.at(0)->shape(); - for (auto i = 0; i < num_axes_; i++) { + // calculate size of buffer to malloc for each reducing axis + for (auto i = 0; i < num_axes_ - 1; i++) { int axis = axes_[i]; size_t size = 1; for (size_t j = 0; j < input_shape.size(); j++) { diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.cc index 149550b1e9..9a3d652bc1 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.cc @@ -82,14 +82,7 @@ int ReduceFp16CPUKernel::Run() { } auto in_tensor = in_tensors_.at(0); - if (in_tensor->data_type() == kNumberTypeFloat32 || in_tensor->data_type() == kNumberTypeFloat) { - auto input_data = reinterpret_cast<float *>(in_tensor->MutableData()); - Float32ToFloat16(input_data, fp16_input_, in_tensor->ElementsNum()); - } else { - fp16_input_ = reinterpret_cast<float16_t *>(in_tensor->MutableData()); - } - - fp16_src_data_ = fp16_input_; + fp16_src_data_ = reinterpret_cast<float16_t *>(in_tensor->MutableData()); for
(size_t i = 0; i < data_buffers_.size(); ++i) { fp16_dst_data_ = data_buffers_.at(i); outer_size_ = outer_sizes_.at(i); @@ -105,11 +98,16 @@ int ReduceFp16CPUKernel::Run() { } auto out_tensor = out_tensors_.at(0); - if (out_tensor->data_type() == kNumberTypeFloat32 || out_tensor->data_type() == kNumberTypeFloat) { - dst_data_ = reinterpret_cast<float *>(out_tensor->MutableData()); - Float16ToFloat32(fp16_dst_data_, dst_data_, out_tensor->ElementsNum()); - } else { - memcpy(out_tensor->MutableData(), fp16_dst_data_, out_tensor->ElementsNum() * sizeof(float16_t)); + fp16_dst_data_ = reinterpret_cast<float16_t *>(out_tensor->data_c()); + MS_ASSERT(fp16_dst_data_ != nullptr); + outer_size_ = outer_sizes_.back(); + inner_size_ = inner_sizes_.back(); + axis_size_ = axis_sizes_.back(); + auto error_code = ParallelLaunch(this->context_->thread_pool_, ReduceFp16Impl, this, context_->thread_num_); + if (error_code != RET_OK) { + FreeTmpBuffer(); + MS_LOG(ERROR) << "Reduce run error, error_code[" << error_code << "]"; + return RET_ERROR; } FreeTmpBuffer(); @@ -124,14 +122,6 @@ void ReduceFp16CPUKernel::FreeTmpBuffer() { } } data_buffers_.clear(); - - auto in_tensor = in_tensors_.at(0); - if (in_tensor->data_type() == kNumberTypeFloat32 || in_tensor->data_type() == kNumberTypeFloat) { - if (fp16_input_ != nullptr) { - context_->allocator->Free(fp16_input_); - fp16_input_ = nullptr; - } - } } int ReduceFp16CPUKernel::MallocTmpBuffer() { @@ -144,16 +134,6 @@ int ReduceFp16CPUKernel::MallocTmpBuffer() { } data_buffers_.emplace_back(buffer); } - - auto in_tensor = in_tensors_.front(); - if (in_tensor->data_type() == kNumberTypeFloat32 || in_tensor->data_type() == kNumberTypeFloat) { - fp16_input_ = - reinterpret_cast<float16_t *>(context_->allocator->Malloc(in_tensor->ElementsNum() * sizeof(float16_t))); - if (fp16_input_ == nullptr) { - MS_LOG(ERROR) << "Malloc data failed"; - return RET_ERROR; - } - } return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.h
b/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.h index 3f3295342d..a7c3fbbc1a 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.h @@ -44,8 +44,6 @@ class ReduceFp16CPUKernel : public ReduceBaseCPUKernel { private: Reducer reducer_ = nullptr; std::vector<float16_t *> data_buffers_; - float *dst_data_ = nullptr; - float16_t *fp16_input_ = nullptr; const float16_t *fp16_src_data_ = nullptr; float16_t *fp16_dst_data_ = nullptr; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.cc index 1b4e19362a..76e0848b60 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.cc @@ -315,6 +315,7 @@ int ReduceInt8CPUKernel::CalculateQuantArgs() { int ReduceInt8CPUKernel::MallocTmpBuffer() { data_buffers_.clear(); MS_ASSERT(static_cast<int>(buffer_sizes_.size()) == num_axes_ - 1); + // malloc num_axes_-1 buffers, since reduce on last axis will generate result to out_tensor, no need for buffer. for (auto buffer_size : buffer_sizes_) { int32_t *buffer = reinterpret_cast<int32_t *>(context_->allocator->Malloc(buffer_size * sizeof(int32_t))); if (buffer == nullptr) { @@ -488,7 +489,7 @@ int ReduceInt8CPUKernel::Run() { begin_src_data_[i] = static_cast<int32_t>(input_data[i]); } src_data_ = begin_src_data_; - for (size_t i = 0; i < data_buffers_.size() - 1; ++i) { + for (size_t i = 0; i < data_buffers_.size(); ++i) { GetQuantArgs(i); dst_data_ = data_buffers_[i]; outer_size_ = outer_sizes_[i];