From 5787dd642dadb8733bc2684c51d805762d80d882 Mon Sep 17 00:00:00 2001 From: zhaozhenlong Date: Sat, 15 Aug 2020 22:16:07 +0800 Subject: [PATCH] reduce mean fp16 --- .../runtime/kernel/arm/base/reduce_base.cc | 1 + .../runtime/kernel/arm/fp16/reduce_fp16.cc | 216 ++++++++++++++++++ .../src/runtime/kernel/arm/fp16/reduce_fp16.h | 72 ++++++ .../src/runtime/kernel/arm/fp32/reduce.cc | 2 +- .../kernel/arm/nnacl/fp16/reduce_fp16.c | 41 ++++ .../kernel/arm/nnacl/fp16/reduce_fp16.h | 35 +++ .../kernel/arm/fp16/reduce_fp16_tests.cc | 100 ++++++++ 7 files changed, 466 insertions(+), 1 deletion(-) create mode 100644 mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.cc create mode 100644 mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.h create mode 100644 mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/reduce_fp16.c create mode 100644 mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/reduce_fp16.h create mode 100644 mindspore/lite/test/ut/src/runtime/kernel/arm/fp16/reduce_fp16_tests.cc diff --git a/mindspore/lite/src/runtime/kernel/arm/base/reduce_base.cc b/mindspore/lite/src/runtime/kernel/arm/base/reduce_base.cc index 901113f12c..3e330d80c4 100644 --- a/mindspore/lite/src/runtime/kernel/arm/base/reduce_base.cc +++ b/mindspore/lite/src/runtime/kernel/arm/base/reduce_base.cc @@ -196,4 +196,5 @@ kernel::LiteKernel *CpuReduceInt8KernelCreator(const std::vector(ReduceMode_ReduceMean)) { + MS_LOG(ERROR) << "Reduce fp16 only support ReduceMode_ReduceMean"; + return RET_ERROR; + } + reducer_ = ReduceMean; + + if (!InferShapeDone()) { + return RET_OK; + } + return ReSize(); +} + +int ReduceFp16CPUKernel::ReSize() { + if (fp16_input_ != nullptr) { + free(fp16_input_); + fp16_input_ = nullptr; + } + auto ele_num = in_tensors_.at(0)->ElementsNum(); + fp16_input_ = reinterpret_cast(malloc(sizeof(float16_t) * ele_num)); + if (fp16_input_ == nullptr) { + MS_LOG(ERROR) << "malloc fp16_src_data_ falied"; + return RET_ERROR; + } + return MallocTmpBuffer(); +} + +int ReduceFp16CPUKernel::CallReduceUnit(int task_id) { + auto ret = reducer_(outer_size_, inner_size_, axis_size_, fp16_src_data_, tmp_shape_.data(), fp16_dst_data_, task_id, + context_->thread_num_); + return ret; +} + +int ReduceImpl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { + auto reduce = reinterpret_cast(cdata); + auto error_code = reduce->CallReduceUnit(task_id); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "Reduce Run error task_id[" << task_id << "] error_code[" << error_code << "]"; + return RET_ERROR; + } + return RET_OK; +} + +int ReduceFp16CPUKernel::Run() { + auto prepare_ret = Prepare(); + if (prepare_ret != RET_OK) { + MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret; + return prepare_ret; + } + tmp_shape_ = in_tensors_.at(0)->shape(); + src_data_ = static_cast(in_tensors_.at(0)->Data()); + auto ele_num = in_tensors_.at(0)->ElementsNum(); + Float32ToFloat16(src_data_, fp16_input_, ele_num); + fp16_src_data_ = fp16_input_; + for (int i = 0; i < data_buffers_.size(); ++i) { + fp16_dst_data_ = data_buffers_[i]; + int axis = axes_[i]; + outer_size_ = 1; + for (int j = 0; j < axis; j++) { + outer_size_ *= tmp_shape_[j]; + } + inner_size_ = 1; + for (int k = axis + 1; k < static_cast(tmp_shape_.size()); k++) { + inner_size_ *= tmp_shape_[k]; + } + axis_size_ = tmp_shape_[axis]; + auto error_code = LiteBackendParallelLaunch(ReduceImpl, this, context_->thread_num_); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "Reduce run error, error_code[" << error_code << "]"; + return RET_ERROR; + } + tmp_shape_[axis] = 1; + fp16_src_data_ = fp16_dst_data_; + } + + dst_data_ = reinterpret_cast(out_tensors_.at(0)->Data()); + Float16ToFloat32(fp16_dst_data_, dst_data_, out_tensors_.at(0)->ElementsNum()); + return RET_OK; +} + +int ReduceFp16CPUKernel::MallocTmpBuffer() { + for (auto buffer : data_buffers_) { + if (buffer != nullptr) { + free(buffer); + buffer = nullptr; + } + } + data_buffers_.clear(); + + auto input_shape = in_tensors_.at(0)->shape(); + for (auto i = 0; i < num_axes_; i++) { + int axis = axes_[i]; + size_t size = 1; + for (auto j = 0; j < input_shape.size(); j++) { + if (static_cast(axis) != j) { + size *= input_shape[j]; + } + } + float16_t *buffer = reinterpret_cast(malloc(size * sizeof(float16_t))); + if (buffer == nullptr) { + MS_LOG(ERROR) << "Malloc data failed."; + return RET_ERROR; + } + data_buffers_.emplace_back(buffer); + input_shape[axis] = 1; + } + return RET_OK; +} + +kernel::LiteKernel *CpuReduceFp16KernelCreator(const std::vector &inputs, + const std::vector &outputs, + OpParameter *opParameter, const lite::Context *ctx, + const kernel::KernelKey &desc, const lite::Primitive *primitive) { + MS_ASSERT(opParameter != nullptr); + MS_ASSERT(desc.type == schema::PrimitiveType_Reduce); + if (opParameter == nullptr) { + MS_LOG(ERROR) << "Reduce opParameter nullptr"; + return nullptr; + } + if (desc.type != schema::PrimitiveType_Reduce) { + MS_LOG(ERROR) << "Reduce op desc.type should be PrimitiveType_Reduce, got " << desc.type; + return nullptr; + } + auto *kernel = new (std::nothrow) ReduceFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive); + if (kernel == nullptr) { + MS_LOG(ERROR) << "Reduce new ReduceCPUKernel failed."; + return nullptr; + } + auto ret = kernel->Init(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: " + << schema::EnumNamePrimitiveType(static_cast(opParameter->type_)); + delete kernel; + return nullptr; + } + return kernel; +} + +kernel::LiteKernel *CpuMeanFp16KernelCreator(const std::vector &inputs, + const std::vector &outputs, + OpParameter *opParameter, const lite::Context *ctx, + const kernel::KernelKey &desc, const lite::Primitive *primitive) { + MS_ASSERT(opParameter != nullptr); + MS_ASSERT(desc.type == schema::PrimitiveType_Mean); + if (opParameter == nullptr) { + MS_LOG(ERROR) << "Reduce opParameter nullptr"; + return nullptr; + } + if (desc.type != schema::PrimitiveType_Mean) { + MS_LOG(ERROR) << "Reduce op desc.type should be PrimitiveType_Mean, got " << desc.type; + return nullptr; + } + auto *kernel = new (std::nothrow) ReduceFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive); + if (kernel == nullptr) { + MS_LOG(ERROR) << "Reduce new ReduceCPUKernel failed."; + return nullptr; + } + auto ret = kernel->Init(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: " + << schema::EnumNamePrimitiveType(static_cast(opParameter->type_)); + delete kernel; + return nullptr; + } + return kernel; +} + +REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Reduce, CpuReduceFp16KernelCreator) +REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Mean, CpuMeanFp16KernelCreator) +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.h new file mode 100644 index 0000000000..d4d2665001 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.h @@ -0,0 +1,72 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_REDUCE_FP16_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_REDUCE_FP16_H_ + +#include +#include +#include "src/lite_kernel.h" +#include "src/runtime/kernel/arm/base/reduce_base.h" +#include "ir/anf.h" +using mindspore::schema::ReduceMode; + +namespace mindspore::kernel { +class ReduceFp16CPUKernel : public ReduceBaseCPUKernel { + typedef int (*Reducer)(const int outer_size, const int inner_size, const int axis_size, const float16_t *src_data, + const int *src_shape, float16_t *dst_data, const int tid, const int thread_num); + + public: + ReduceFp16CPUKernel(OpParameter *param, const std::vector &inputs, + const std::vector &outputs, const lite::Context *ctx, + const lite::Primitive *primitive) + : ReduceBaseCPUKernel(param, inputs, outputs, ctx, primitive) {} + ~ReduceFp16CPUKernel() { + for (auto i = 0; i < data_buffers_.size(); i++) { + float16_t *buffer = data_buffers_[i]; + if (buffer != nullptr) { + free(buffer); + buffer = nullptr; + } + } + if (fp16_input_ != nullptr) { + free(fp16_input_); + fp16_input_ = nullptr; + } + src_data_ = nullptr; + dst_data_ = nullptr; + } + + int Init() override; + int ReSize() override; + int Run() override; + int CallReduceUnit(int task_id); + + private: + Reducer reducer_ = nullptr; + std::vector data_buffers_; + const float *src_data_ = nullptr; + float *dst_data_ = nullptr; + float16_t *fp16_input_ = nullptr; + const float16_t *fp16_src_data_ = nullptr; + float16_t *fp16_dst_data_ = nullptr; + + private: + int MallocTmpBuffer(); +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_REDUCE_FP16_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.cc index 45cbb28496..2376c65533 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.cc @@ -19,7 +19,7 @@ #include "src/kernel_registry.h" #include "include/errorcode.h" #include "src/runtime/runtime_api.h" -#include "src/runtime/kernel/arm/nnacl/fp32/reduce.h" +#include "nnacl/fp32/reduce.h" #include "src/runtime/kernel/arm/base/reduce_base.h" using mindspore::kernel::KERNEL_ARCH::kCPU; diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/reduce_fp16.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/reduce_fp16.c new file mode 100644 index 0000000000..6a3bdf5f7c --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/reduce_fp16.c @@ -0,0 +1,41 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "nnacl/fp16/reduce_fp16.h" +#include "nnacl/errorcode.h" + +int ReduceMean(const int outer_size, const int inner_size, const int axis_size, const float16_t *src_data, + const int *src_shape, float16_t *dst_data, const int tid, const int thread_num) { + if (src_data == NULL || src_shape == NULL || dst_data == NULL) { + return NNACL_NULL_PTR; + } + int i, j, k; + for (j = tid; j < outer_size; j += thread_num) { + const float16_t *outer_src = src_data + j * axis_size * inner_size; + float16_t *outer_dst = dst_data + j * inner_size; + for (k = 0; k < inner_size; k++) { + const float16_t *inner_src = outer_src + k; + float16_t *inner_dst = outer_dst + k; + float16_t tmp = 0.0; + for (i = 0; i < axis_size; i++) { + tmp += inner_src[i * inner_size]; + } + *inner_dst = tmp / (float16_t)axis_size; + } + } + return NNACL_OK; +} diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/reduce_fp16.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/reduce_fp16.h new file mode 100644 index 0000000000..12b04ed440 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/reduce_fp16.h @@ -0,0 +1,35 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FP16_REDUCE_FP16_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FP16_REDUCE_FP16_H_ +#include "nnacl/op_base.h" +#include "src/runtime/kernel/arm/nnacl/reduce_parameter.h" + +#ifdef ENABLE_NEON +#include +#endif +#ifdef __cplusplus +extern "C" { +#endif +int ReduceMean(const int outer_size, const int inner_size, const int axis_size, const float16_t *src_data, + const int *src_shape, float16_t *dst_data, const int tid, const int thread_num); + +#ifdef __cplusplus +} +#endif + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FP16_REDUCE_FP16_H_ diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp16/reduce_fp16_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp16/reduce_fp16_tests.cc new file mode 100644 index 0000000000..afba6425a4 --- /dev/null +++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp16/reduce_fp16_tests.cc @@ -0,0 +1,100 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include "utils/log_adapter.h" +#include "common/common_test.h" +#include "src/kernel_registry.h" +#include "mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.h" + +namespace mindspore { + +class TestReduceFp16 : public mindspore::CommonTest { + public: + TestReduceFp16() = default; + void Prepare(const std::vector &input_shape, const std::vector &output_shape, float *input_data, + float *output_data, const int num_axis, const int *axes, const int thread_num); + + void TearDown() override; + + public: + float err_tol = 1e-5; + lite::tensor::Tensor in_tensor_; + lite::tensor::Tensor out_tensor_; + std::vector inputs_{&in_tensor_}; + std::vector outputs_{&out_tensor_}; + ReduceParameter param_ = {{}}; + kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat16, schema::PrimitiveType_Reduce}; + lite::Context ctx_ = lite::Context(); + kernel::KernelCreator creator_ = nullptr; + kernel::LiteKernel *kernel_ = nullptr; +}; + +void TestReduceFp16::TearDown() { + in_tensor_.SetData(nullptr); + out_tensor_.SetData(nullptr); +} + +void TestReduceFp16::Prepare(const std::vector &input_shape, const std::vector &output_shape, + float *input_data, float *output_data, const int num_axis, const int *axes, + const int thread_num) { + in_tensor_.set_data_type(kNumberTypeFloat32); + in_tensor_.set_shape(input_shape); + out_tensor_.set_data_type(kNumberTypeFloat32); + out_tensor_.set_shape(output_shape); + in_tensor_.SetData(input_data); + out_tensor_.SetData(output_data); + + bool keep_axis = false; + + int mode = static_cast(schema::ReduceMode_ReduceMean); + ReduceParameter param_ = {{}}; + param_.keep_dims_ = keep_axis; + for (auto i = 0; i < num_axis; i++) { + param_.axes_[i] = axes[i]; + } + param_.num_axes_ = num_axis; + param_.mode_ = mode; + + desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat16, schema::PrimitiveType_Reduce}; + ctx_ = lite::Context(); + ctx_.thread_num_ = thread_num; + creator_ = lite::KernelRegistry::GetInstance()->GetCreator(desc); + ASSERT_NE(creator_, nullptr); + kernel_ = creator_(inputs_, outputs_, reinterpret_cast(¶m_), &ctx_, desc, nullptr); + ASSERT_NE(kernel_, nullptr); +} +TEST_F(TestReduceFp16, Mean) { + float in[96] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, + 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, + 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, + 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, + 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, 93.0, 94.0, 95.0}; + float out[24] = {0.0f}; + float correct[24] = {18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, + 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0}; + + std::vector input_shape = {2, 4, 4, 3}; + std::vector output_shape = {2, 1, 4, 3}; + + int axes[] = {3}; + int num_axis = 1; + int thread_num = 1; + Prepare(input_shape, output_shape, in, out, num_axis, axes, thread_num); + CompareOutputData(out, correct, 24, 1e-3); +} + +} // namespace mindspore