diff --git a/mindspore/lite/nnacl/fp16/matmul_fp16.c b/mindspore/lite/nnacl/fp16/matmul_fp16.c
index 3181feb978..066f075164 100644
--- a/mindspore/lite/nnacl/fp16/matmul_fp16.c
+++ b/mindspore/lite/nnacl/fp16/matmul_fp16.c
@@ -238,3 +238,63 @@ void RowMajor2Col16MajorFp16(float16_t *src_ptr, float16_t *dst_ptr, size_t row,
   }
   return;
 }
+
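+// Packing helpers for the fp16 GEMM. The *Col16Major variants tile a
+// row-major matrix into blocks of 16 rows, so element (r, c) is stored at
+// dst[(r / 16) * 16 * col + c * 16 + (r % 16)]; the *Row16Major and
+// *Row8Major variants tile columns into blocks of 16 or 8 instead. The Fp32*
+// variants also cast each element from fp32 to fp16 while packing.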
+void Fp32RowMajor2Fp16Col16Major(float *src, float16_t *dst, size_t row, size_t col) {
+  for (int r = 0; r < row; r++) {
+    for (int c = 0; c < col; c++) {
+      int r_div16 = r / 16;
+      int r_mod16 = r % 16;
+      dst[r_div16 * 16 * col + c * 16 + r_mod16] = (float16_t)(src[r * col + c]);
+    }
+  }
+}
+
+void Fp16RowMajor2Fp16Col16Major(float16_t *src, float16_t *dst, size_t row, size_t col) {
+  for (int r = 0; r < row; r++) {
+    for (int c = 0; c < col; c++) {
+      int r_div16 = r / 16;
+      int r_mod16 = r % 16;
+      dst[r_div16 * 16 * col + c * 16 + r_mod16] = src[r * col + c];
+    }
+  }
+}
+
+void Fp32RowMajor2Fp16Row16Major(float *src, float16_t *dst, size_t row, size_t col) {
+  for (int r = 0; r < row; r++) {
+    for (int c = 0; c < col; c++) {
+      int c_div16 = c / 16;
+      int c_mod16 = c % 16;
+      dst[c_div16 * 16 * row + r * 16 + c_mod16] = (float16_t)(src[r * col + c]);
+    }
+  }
+}
+
+void Fp16RowMajor2Fp16Row16Major(float16_t *src, float16_t *dst, size_t row, size_t col) {
+  for (int r = 0; r < row; r++) {
+    for (int c = 0; c < col; c++) {
+      int c_div16 = c / 16;
+      int c_mod16 = c % 16;
+      dst[c_div16 * 16 * row + r * 16 + c_mod16] = src[r * col + c];
+    }
+  }
+}
+
+void Fp32RowMajor2Fp16Row8Major(float *src, float16_t *dst, size_t row, size_t col) {
+  for (int r = 0; r < row; r++) {
+    for (int c = 0; c < col; c++) {
+      int c_div8 = c / 8;
+      int c_mod8 = c % 8;
+      dst[c_div8 * 8 * row + r * 8 + c_mod8] = (float16_t)src[r * col + c];
+    }
+  }
+}
+
+void Fp32RowMajor2Fp16Col8Major(float *src, float16_t *dst, size_t row, size_t col) {
+  for (int r = 0; r < row; r++) {
+    for (int c = 0; c < col; c++) {
+      int r_div8 = r / 8;
+      int r_mod8 = r % 8;
+      dst[r_div8 * 8 * col + c * 8 + r_mod8] = (float16_t)src[r * col + c];
+    }
+  }
+}
diff --git a/mindspore/lite/nnacl/fp16/matmul_fp16.h b/mindspore/lite/nnacl/fp16/matmul_fp16.h
index 0f9212cae8..cc5e5b5e04 100644
--- a/mindspore/lite/nnacl/fp16/matmul_fp16.h
+++ b/mindspore/lite/nnacl/fp16/matmul_fp16.h
@@ -39,6 +39,18 @@ void RowMajor2Col16MajorFp16(float16_t *src_ptr, float16_t *dst_ptr, size_t row,
 void MatmulFp16Neon64(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type,
                       size_t depth, size_t row, size_t col, size_t stride, bool write_nhwc);
 
+void Fp32RowMajor2Fp16Col16Major(float *src, float16_t *dst, size_t row, size_t col);
+
+void Fp16RowMajor2Fp16Col16Major(float16_t *src, float16_t *dst, size_t row, size_t col);
+
+void Fp32RowMajor2Fp16Row16Major(float *src, float16_t *dst, size_t row, size_t col);
+
+void Fp16RowMajor2Fp16Row16Major(float16_t *src, float16_t *dst, size_t row, size_t col);
+
+void Fp32RowMajor2Fp16Row8Major(float *src, float16_t *dst, size_t row, size_t col);
+
+void Fp32RowMajor2Fp16Col8Major(float *src, float16_t *dst, size_t row, size_t col);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc
new file mode 100644
index 0000000000..58f62550a3
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc
@@ -0,0 +1,187 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/arm/fp16/fullconnection_fp16.h"
+#include "nnacl/fp16/matmul_fp16.h"
+#include "nnacl/fp16/cast_fp16.h"
+#include "src/runtime/runtime_api.h"
+#include "include/errorcode.h"
+#include "src/kernel_registry.h"
+
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_INPUT_TENSOR_ERROR;
+using mindspore::lite::RET_MEMORY_FAILED;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_FullConnection;
+
+namespace mindspore::kernel {
+FullconnectionFP16CPUKernel::~FullconnectionFP16CPUKernel() { FreeTmpBuffer(); }
+
+void FullconnectionFP16CPUKernel::FreeTmpBuffer() {
+  if (a_pack_ptr_ != nullptr) {
+    ctx_->allocator->Free(a_pack_ptr_);
+    a_pack_ptr_ = nullptr;
+  }
+  if (b_pack_ptr_ != nullptr) {
+    ctx_->allocator->Free(b_pack_ptr_);
+    b_pack_ptr_ = nullptr;
+  }
+  if (bias_ptr_ != nullptr) {
+    ctx_->allocator->Free(bias_ptr_);
+    bias_ptr_ = nullptr;
+  }
+  if (output_fp16_ != nullptr) {
+    ctx_->allocator->Free(output_fp16_);
+    output_fp16_ = nullptr;
+  }
+}
+
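+// ReSize() rounds row_ up to a multiple of C16NUM and col_ up to a multiple
+// of C8NUM so the packed buffers match the 16-row / 8-column layout consumed
+// by MatMulFp16, packs the constant weight matrix B once, and splits the
+// output columns into thread_stride_-wide slices (a multiple of C8NUM).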
+int FullconnectionFP16CPUKernel::ReSize() {
+  FreeTmpBuffer();
+  fc_param_->row_ = (in_tensors_[0]->shape())[0];
+  fc_param_->col_ = (in_tensors_[1]->shape())[0];
+  fc_param_->deep_ = (in_tensors_[1]->shape())[1];
+  fc_param_->row_16_ = UP_ROUND(fc_param_->row_, C16NUM);
+  fc_param_->col_8_ = UP_ROUND(fc_param_->col_, C8NUM);
+  thread_count_ = MSMIN(thread_count_, UP_DIV(fc_param_->col_, C8NUM));
+  thread_stride_ = UP_DIV(UP_DIV(fc_param_->col_, C8NUM), thread_count_) * C8NUM;
+
+  a_pack_ptr_ =
+    reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(fc_param_->row_16_ * fc_param_->deep_ * sizeof(float16_t)));
+  if (a_pack_ptr_ == nullptr) {
+    FreeTmpBuffer();
+    return RET_MEMORY_FAILED;
+  }
+  memset(a_pack_ptr_, 0, fc_param_->row_16_ * fc_param_->deep_ * sizeof(float16_t));
+
+  b_pack_ptr_ =
+    reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(fc_param_->col_8_ * fc_param_->deep_ * sizeof(float16_t)));
+  if (b_pack_ptr_ == nullptr) {
+    FreeTmpBuffer();
+    return RET_MEMORY_FAILED;
+  }
+  memset(b_pack_ptr_, 0, fc_param_->col_8_ * fc_param_->deep_ * sizeof(float16_t));
+
+  InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->Data()), b_pack_ptr_);
+  if (in_tensors_.size() == 3) {
+    bias_ptr_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(fc_param_->col_8_ * sizeof(float16_t)));
+    if (bias_ptr_ == nullptr) {
+      FreeTmpBuffer();
+      return RET_MEMORY_FAILED;
+    }
+    memset(bias_ptr_, 0, fc_param_->col_8_ * sizeof(float16_t));
+    Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->Data()), bias_ptr_, fc_param_->col_);
+  }
+
+  if (out_tensors_[0]->data_type() == kNumberTypeFloat32) {
+    output_fp16_ =
+      reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(fc_param_->row_ * fc_param_->col_ * sizeof(float16_t)));
+  }
+  return RET_OK;
+}
+
+void FullconnectionFP16CPUKernel::InitMatrixA(float *a_ptr, float16_t *a_pack_ptr) {
+  Fp32RowMajor2Fp16Col16Major(a_ptr, a_pack_ptr, fc_param_->row_, fc_param_->deep_);
+}
+
+void FullconnectionFP16CPUKernel::InitMatrixA(float16_t *a_ptr, float16_t *a_pack_ptr) {
+  Fp16RowMajor2Fp16Col16Major(a_ptr, a_pack_ptr, fc_param_->row_, fc_param_->deep_);
+}
+
+void FullconnectionFP16CPUKernel::InitMatrixB(float *b_ptr, float16_t *b_pack_ptr) {
+  Fp32RowMajor2Fp16Col8Major(b_ptr, b_pack_ptr, fc_param_->col_, fc_param_->deep_);
+}
+
+int FullconnectionFP16CPUKernel::Init() {
+  if (!InferShapeDone()) {
+    return RET_OK;
+  }
+  return ReSize();
+}
+
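+// Each task computes cur_oc output columns starting at
+// task_id * thread_stride_; trailing tasks may receive a short or empty slice
+// when col_ is not an exact multiple of the stride.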
+int FullconnectionFP16CPUKernel::RunImpl(int task_id) {
+  int cur_stride = fc_param_->col_ - task_id * thread_stride_;
+  int cur_oc = MSMIN(thread_stride_, cur_stride);
+  if (cur_oc <= 0) {
+    return RET_OK;
+  }
+  auto b = b_pack_ptr_ + task_id * thread_stride_ * fc_param_->deep_;
+  auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id;
+  auto c = output_ptr_ + task_id * thread_stride_;
+  MatMulFp16(a_pack_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, fc_param_->row_, cur_oc, fc_param_->col_,
+             true);
+  return RET_OK;
+}
+
+int FcFP16Run(void *cdata, int task_id) {
+  auto op = reinterpret_cast<FullconnectionFP16CPUKernel *>(cdata);
+  auto error_code = op->RunImpl(task_id);
+  if (error_code != RET_OK) {
+    MS_LOG(ERROR) << "FcFP16Run error task_id[" << task_id << "] error_code[" << error_code << "]";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int FullconnectionFP16CPUKernel::Run() {
+  auto prepare_ret = Prepare();
+  if (prepare_ret != RET_OK) {
+    MS_LOG(ERROR) << "Prepare failed! ret: " << prepare_ret;
+    return prepare_ret;
+  }
+  auto out_tensor = out_tensors_[0];
+  if (out_tensor->data_type() == kNumberTypeFloat32) {
+    output_ptr_ = output_fp16_;
+  } else {
+    output_ptr_ = reinterpret_cast<float16_t *>(out_tensor->Data());
+  }
+  if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
+    InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->Data()), a_pack_ptr_);
+  } else {
+    InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_[0]->Data()), a_pack_ptr_);
+  }
+  ParallelLaunch(THREAD_POOL_DEFAULT, FcFP16Run, this, thread_count_);
+  if (out_tensor->data_type() == kNumberTypeFloat32) {
+    auto size = out_tensor->ElementsNum();
+    auto out_tensor_data = reinterpret_cast<float *>(out_tensor->Data());
+    Float16ToFloat32(output_fp16_, out_tensor_data, size);
+  }
+  return RET_OK;
+}
+
+kernel::LiteKernel *CpuFullConnectionFp16KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
+                                                       const std::vector<lite::tensor::Tensor *> &outputs,
+                                                       OpParameter *opParameter, const lite::Context *ctx,
+                                                       const kernel::KernelKey &desc,
+                                                       const mindspore::lite::PrimitiveC *primitive) {
+  auto *kernel = new (std::nothrow) FullconnectionFP16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
+  if (kernel == nullptr) {
+    MS_LOG(ERROR) << "kernel is nullptr.";
+    return nullptr;
+  }
+  auto ret = kernel->Init();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
+                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
+    delete kernel;
+    return nullptr;
+  }
+  return kernel;
+}
+
+REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_FullConnection, CpuFullConnectionFp16KernelCreator)
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.h
new file mode 100644
index 0000000000..964a0b017a
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.h
@@ -0,0 +1,56 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_FULLCONNECTION_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_FULLCONNECTION_H_
+
+#ifdef ENABLE_NEON
+#include <arm_neon.h>
+#endif
+#include <vector>
+#include "src/lite_kernel.h"
+#include "nnacl/matmul_parameter.h"
+#include "src/runtime/kernel/arm/base/fullconnection_base.h"
+
+namespace mindspore::kernel {
+class FullconnectionFP16CPUKernel : public FullconnectionBaseCPUKernel {
+ public:
+  explicit FullconnectionFP16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
+                                       const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
+                                       const mindspore::lite::PrimitiveC *primitive)
+      : FullconnectionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
+  ~FullconnectionFP16CPUKernel() override;
+  int Init() override;
+  int ReSize() override;
+  int Run() override;
+  int RunImpl(int task_id);
+
+ private:
+  void InitMatrixA(float *a_ptr, float16_t *a_pack_ptr);
+  void InitMatrixA(float16_t *a_ptr, float16_t *a_pack_ptr);
+  void InitMatrixB(float *b_ptr, float16_t *b_pack_ptr);
+  void FreeTmpBuffer();
+
+ private:
+  float16_t *a_pack_ptr_ = nullptr;
+  float16_t *b_pack_ptr_ = nullptr;
+  float16_t *bias_ptr_ = nullptr;
+  float16_t *output_fp16_ = nullptr;
+  float16_t *output_ptr_ = nullptr;
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_FULLCONNECTION_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc
new file mode 100644
index 0000000000..e1b4fcd7d8
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc
@@ -0,0 +1,250 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/arm/fp16/matmul_fp16.h"
+#include "nnacl/fp16/matmul_fp16.h"
+#include "nnacl/fp16/cast_fp16.h"
+#include "src/runtime/runtime_api.h"
+#include "include/errorcode.h"
+#include "src/kernel_registry.h"
+
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_INPUT_TENSOR_ERROR;
+using mindspore::lite::RET_MEMORY_FAILED;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_MatMul;
+
+namespace mindspore::kernel {
+MatmulFP16CPUKernel::~MatmulFP16CPUKernel() { FreeTmpBuffer(); }
+
+void MatmulFP16CPUKernel::FreeTmpBuffer() {
+  if (a_pack_ptr_ != nullptr) {
+    ctx_->allocator->Free(a_pack_ptr_);
+    a_pack_ptr_ = nullptr;
+  }
+  if (b_pack_ptr_ != nullptr) {
+    ctx_->allocator->Free(b_pack_ptr_);
+    b_pack_ptr_ = nullptr;
+  }
+  if (bias_ptr_ != nullptr) {
+    ctx_->allocator->Free(bias_ptr_);
+    bias_ptr_ = nullptr;
+  }
+  if (output_ptr_ != nullptr) {
+    ctx_->allocator->Free(output_ptr_);
+    output_ptr_ = nullptr;
+  }
+}
+
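+// ReSize() folds all leading dimensions of A into params_->batch, packs any
+// constant input once here, and defers packing of non-constant inputs to
+// Run().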
+int MatmulFP16CPUKernel::ReSize() {
+  FreeTmpBuffer();
+  int batch = 1;
+  auto a_shape = in_tensors_[0]->shape();
+  auto c_shape = out_tensors_[0]->shape();
+  if (in_tensors_.size() == 3) {
+    auto bias_shape = in_tensors_[2]->shape();
+    if (bias_shape[bias_shape.size() - 1] != c_shape[c_shape.size() - 1]) {
+      MS_LOG(ERROR) << "The bias dimension is not equal to the column dimension";
+      return RET_INPUT_TENSOR_ERROR;
+    }
+  }
+
+  for (size_t i = 0; i < a_shape.size() - 2; ++i) {
+    batch *= a_shape[i];
+  }
+  params_->batch = batch;
+  params_->row_ = c_shape[c_shape.size() - 2];
+  params_->col_ = c_shape[c_shape.size() - 1];
+  params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1];
+  params_->row_16_ = UP_ROUND(params_->row_, C16NUM);
+  params_->col_8_ = UP_ROUND(params_->col_, C8NUM);
+  thread_count_ = MSMIN(thread_count_, UP_DIV(params_->col_, C8NUM));
+  thread_stride_ = UP_DIV(UP_DIV(params_->col_, C8NUM), thread_count_) * C8NUM;
+
+  a_pack_ptr_ = reinterpret_cast<float16_t *>(
+    ctx_->allocator->Malloc(params_->batch * params_->row_16_ * params_->deep_ * sizeof(float16_t)));
+  if (a_pack_ptr_ == nullptr) {
+    FreeTmpBuffer();
+    return RET_MEMORY_FAILED;
+  }
+  memset(a_pack_ptr_, 0, params_->batch * params_->row_16_ * params_->deep_ * sizeof(float16_t));
+
+  b_pack_ptr_ = reinterpret_cast<float16_t *>(
+    ctx_->allocator->Malloc(params_->batch * params_->col_8_ * params_->deep_ * sizeof(float16_t)));
+  if (b_pack_ptr_ == nullptr) {
+    FreeTmpBuffer();
+    return RET_MEMORY_FAILED;
+  }
+  memset(b_pack_ptr_, 0, params_->batch * params_->col_8_ * params_->deep_ * sizeof(float16_t));
+
+  params_->a_const_ = (in_tensors_[0]->Data() != nullptr);
+  params_->b_const_ = (in_tensors_[1]->Data() != nullptr);
+  if (params_->a_const_ == true) {
+    if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
+      InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->Data()), a_pack_ptr_);
+    } else {
+      InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_[0]->Data()), a_pack_ptr_);
+    }
+  }
+  if (params_->b_const_ == true) {
+    InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->Data()), b_pack_ptr_);
+  }
+
+  if (in_tensors_.size() == 3) {
+    bias_ptr_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(params_->col_8_ * sizeof(float16_t)));
+    if (bias_ptr_ == nullptr) {
+      FreeTmpBuffer();
+      return RET_MEMORY_FAILED;
+    }
+    memset(bias_ptr_, 0, params_->col_8_ * sizeof(float16_t));
+    Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->Data()), bias_ptr_, params_->col_);
+  }
+
+  if (out_tensors_[0]->data_type() == kNumberTypeFloat32) {
+    output_ptr_ = reinterpret_cast<float16_t *>(
+      ctx_->allocator->Malloc(params_->batch * params_->row_ * params_->col_ * sizeof(float16_t)));
+  }
+  return RET_OK;
+}
+
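+// A is repacked per batch: a transposed A (deep x row) goes through the Row16
+// packers, a plain A (row x deep) through the Col16 packers; B uses the
+// 8-wide packers the same way.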
+void MatmulFP16CPUKernel::InitMatrixA(float *a_ptr, float16_t *a_pack_ptr) {
+  for (int i = 0; i < params_->batch; i++) {
+    float *src = a_ptr + i * params_->deep_ * params_->row_;
+    float16_t *dst = a_pack_ptr + i * params_->deep_ * params_->row_16_;
+    if (params_->a_transpose_) {
+      Fp32RowMajor2Fp16Row16Major(src, dst, params_->deep_, params_->row_);
+    } else {
+      Fp32RowMajor2Fp16Col16Major(src, dst, params_->row_, params_->deep_);
+    }
+  }
+}
+
+void MatmulFP16CPUKernel::InitMatrixA(float16_t *a_ptr, float16_t *a_pack_ptr) {
+  for (int i = 0; i < params_->batch; i++) {
+    float16_t *src = a_ptr + i * params_->deep_ * params_->row_;
+    float16_t *dst = a_pack_ptr + i * params_->deep_ * params_->row_16_;
+    if (params_->a_transpose_) {
+      Fp16RowMajor2Fp16Row16Major(src, dst, params_->deep_, params_->row_);
+    } else {
+      Fp16RowMajor2Fp16Col16Major(src, dst, params_->row_, params_->deep_);
+    }
+  }
+}
+
+void MatmulFP16CPUKernel::InitMatrixB(float *b_ptr, float16_t *b_pack_ptr) {
+  for (int i = 0; i < params_->batch; i++) {
+    float *src = b_ptr + i * params_->deep_ * params_->col_;
+    float16_t *dst = b_pack_ptr + i * params_->deep_ * params_->col_8_;
+    if (params_->b_transpose_) {
+      Fp32RowMajor2Fp16Col8Major(src, dst, params_->col_, params_->deep_);
+    } else {
+      Fp32RowMajor2Fp16Row8Major(src, dst, params_->deep_, params_->col_);
+    }
+  }
+}
+
+int MatmulFP16CPUKernel::Init() {
+  if (!InferShapeDone()) {
+    return RET_OK;
+  }
+  return ReSize();
+}
+
+int MatmulFP16CPUKernel::RunImpl(int task_id) {
+  int cur_stride = params_->col_ - task_id * thread_stride_;
+  int cur_oc = MSMIN(thread_stride_, cur_stride);
+  if (cur_oc <= 0) {
+    return RET_OK;
+  }
+  auto b = current_b_ + task_id * thread_stride_ * params_->deep_;
+  auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id;
+  auto c = current_c_ + task_id * thread_stride_;
+  MatMulFp16(current_a_, b, c, bias, ActType_No, params_->deep_, params_->row_, cur_oc, params_->col_, true);
+
+  return RET_OK;
+}
+
+int MatmulFP16Run(void *cdata, int task_id) {
+  auto op = reinterpret_cast<MatmulFP16CPUKernel *>(cdata);
+  auto error_code = op->RunImpl(task_id);
+  if (error_code != RET_OK) {
+    MS_LOG(ERROR) << "MatmulFP16Run error task_id[" << task_id << "] error_code[" << error_code << "]";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
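+// Run() packs whichever inputs are not constant, then launches the
+// column-sliced worker once per batch with current_a_/current_b_/current_c_
+// pointing at that batch's packed A, packed B, and output block.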
+int MatmulFP16CPUKernel::Run() {
+  auto prepare_ret = Prepare();
+  if (prepare_ret != RET_OK) {
+    MS_LOG(ERROR) << "Prepare failed! ret: " << prepare_ret;
+    return prepare_ret;
+  }
+  auto b = reinterpret_cast<float *>(in_tensors_[1]->Data());
+  auto out_tensor = out_tensors_[0];
+  float16_t *c_ptr;
+  if (out_tensor->data_type() == kNumberTypeFloat32) {
+    c_ptr = output_ptr_;
+  } else {
+    c_ptr = reinterpret_cast<float16_t *>(out_tensor->Data());
+  }
+  if (params_->a_const_ == false) {
+    if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
+      InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->Data()), a_pack_ptr_);
+    } else {
+      InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_[0]->Data()), a_pack_ptr_);
+    }
+  }
+  if (params_->b_const_ == false) {
+    InitMatrixB(b, b_pack_ptr_);
+  }
+  for (int i = 0; i < params_->batch; ++i) {
+    current_a_ = a_pack_ptr_ + i * params_->row_16_ * params_->deep_;
+    current_b_ = b_pack_ptr_ + i * params_->deep_ * params_->col_8_;
+    current_c_ = c_ptr + i * params_->row_ * params_->col_;
+    ParallelLaunch(THREAD_POOL_DEFAULT, MatmulFP16Run, this, thread_count_);
+  }
+  if (out_tensor->data_type() == kNumberTypeFloat32) {
+    auto size = out_tensor->ElementsNum();
+    auto out_tensor_data = reinterpret_cast<float *>(out_tensor->Data());
+    Float16ToFloat32(output_ptr_, out_tensor_data, size);
+  }
+  return RET_OK;
+}
+
+kernel::LiteKernel *CpuMatmulFp16KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
+                                               const std::vector<lite::tensor::Tensor *> &outputs,
+                                               OpParameter *opParameter, const lite::Context *ctx,
+                                               const kernel::KernelKey &desc,
+                                               const mindspore::lite::PrimitiveC *primitive) {
+  auto *kernel = new (std::nothrow) MatmulFP16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
+  if (kernel == nullptr) {
+    MS_LOG(ERROR) << "kernel is nullptr.";
+    return nullptr;
+  }
+  auto ret = kernel->Init();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
+                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
+    delete kernel;
+    return nullptr;
+  }
+  return kernel;
+}
+
+REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_MatMul, CpuMatmulFp16KernelCreator)
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.h
new file mode 100644
index 0000000000..2600f5d6ef
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.h
@@ -0,0 +1,58 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATMUL_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATMUL_H_
+
+#ifdef ENABLE_NEON
+#include <arm_neon.h>
+#endif
+#include <vector>
+#include "src/lite_kernel.h"
+#include "nnacl/matmul_parameter.h"
+#include "src/runtime/kernel/arm/base/matmul_base.h"
+
+namespace mindspore::kernel {
+class MatmulFP16CPUKernel : public MatmulBaseCPUKernel {
+ public:
+  explicit MatmulFP16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
+                               const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
+                               const mindspore::lite::PrimitiveC *primitive)
+      : MatmulBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
+  ~MatmulFP16CPUKernel() override;
+  int Init() override;
+  int ReSize() override;
+  int Run() override;
+  int RunImpl(int task_id);
+
+ private:
+  void InitMatrixA(float *a_ptr, float16_t *a_pack_ptr);
+  void InitMatrixA(float16_t *a_ptr, float16_t *a_pack_ptr);
+  void InitMatrixB(float *b_ptr, float16_t *b_pack_ptr);
+  void FreeTmpBuffer();
+
+ private:
+  float16_t *a_pack_ptr_ = nullptr;
+  float16_t *b_pack_ptr_ = nullptr;
+  float16_t *bias_ptr_ = nullptr;
+  float16_t *output_ptr_ = nullptr;
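+  // Cursors into the packed batch buffers, updated by Run() before each
+  // ParallelLaunch.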
+  float16_t *current_a_ = nullptr;
+  float16_t *current_b_ = nullptr;
+  float16_t *current_c_ = nullptr;
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATMUL_H_