diff --git a/mindspore/lite/nnacl/fp16/matmul_fp16.c b/mindspore/lite/nnacl/fp16/matmul_fp16.c
index 02cc6e48eb..982fc39232 100644
--- a/mindspore/lite/nnacl/fp16/matmul_fp16.c
+++ b/mindspore/lite/nnacl/fp16/matmul_fp16.c
@@ -443,17 +443,20 @@ void RowMajor2Col16MajorFp16Opt(const float16_t *src_ptr, float16_t *dst_ptr, si
 }
 
 void RowMajor2Col16MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) {
-  for (int r = 0; r < row; r++) {
-    for (int c = 0; c < col; c++) {
-      int r_div16 = r / 16;
-      int r_mod16 = r % 16;
-      if (is_fp32_src) {
-        dst[r_div16 * 16 * col + c * 16 + r_mod16] = (float16_t)(((const float *)src)[r * col + c]);
-      } else {
-        dst[r_div16 * 16 * col + c * 16 + r_mod16] = ((const float16_t *)src)[r * col + c];
+  if (is_fp32_src) {
+    const float *fp32_src = (const float *)src;
+    for (int r = 0; r < row; r++) {
+      for (int c = 0; c < col; c++) {
+        int r_div16 = r / 16;
+        int r_mod16 = r % 16;
+        dst[r_div16 * 16 * col + c * 16 + r_mod16] = (float16_t)(fp32_src[r * col + c]);
       }
     }
+  } else {
+    const float16_t *fp16_src = (const float16_t *)src;
+    RowMajor2Col16MajorFp16Opt(fp16_src, dst, row, col);
   }
+  return;
 }
 
 void RowMajor2Row16MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) {
@@ -484,6 +487,18 @@ void RowMajor2Row8MajorFp16(const void *src, float16_t *dst, int row, int col, b
   }
 }
 
+void RowMajor2ColMajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) {
+  for (int r = 0; r < row; ++r) {
+    for (int c = 0; c < col; ++c) {
+      if (is_fp32_src) {
+        dst[c * row + r] = (float16_t)(((const float *)src)[r * col + c]);
+      } else {
+        dst[c * row + r] = ((const float16_t *)src)[r * col + c];
+      }
+    }
+  }
+}
+
 void RowMajor2Col8MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) {
   for (int r = 0; r < row; r++) {
     for (int c = 0; c < col; c++) {
diff --git a/mindspore/lite/nnacl/fp16/matmul_fp16.h b/mindspore/lite/nnacl/fp16/matmul_fp16.h
index 086a1b973d..e2e7a80cc2 100644
--- a/mindspore/lite/nnacl/fp16/matmul_fp16.h
+++ b/mindspore/lite/nnacl/fp16/matmul_fp16.h
@@ -59,6 +59,8 @@ void RowMajor2Row8MajorFp16(const void *src, float16_t *dst, int row, int col, b
 
 void RowMajor2Col8MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src);
 
+void RowMajor2ColMajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src);
+
 #ifdef __cplusplus
 }
 #endif
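Note: the new RowMajor2ColMajorFp16 above is a plain, untiled transpose. Unlike the Col8/Col16 packers it writes exactly col * row elements with no zero padding, which is what the vector-matmul path of the new base kernel relies on. A minimal off-target sketch of the same transform (float stands in for float16_t so it compiles without ARM fp16 support; the helper is hypothetical, not part of the patch):

#include <cstdio>

// Plain row-major -> column-major transpose, no tiling, no padding.
static void RowMajor2ColMajorSketch(const float *src, float *dst, int row, int col) {
  for (int r = 0; r < row; ++r) {
    for (int c = 0; c < col; ++c) {
      dst[c * row + r] = src[r * col + c];
    }
  }
}

int main() {
  const float src[2 * 3] = {1, 2, 3, 4, 5, 6};  // 2x3 row-major
  float dst[3 * 2] = {0};
  RowMajor2ColMajorSketch(src, dst, 2, 3);
  for (int i = 0; i < 6; ++i) printf("%g ", dst[i]);  // prints: 1 4 2 5 3 6
  printf("\n");
  return 0;
}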
diff --git a/mindspore/lite/nnacl/fp32/matmul_fp32.c b/mindspore/lite/nnacl/fp32/matmul_fp32.c
index cc0d492b3b..f4cfaccdc3 100644
--- a/mindspore/lite/nnacl/fp32/matmul_fp32.c
+++ b/mindspore/lite/nnacl/fp32/matmul_fp32.c
@@ -27,11 +27,17 @@ void RowMajor2ColMajor(const float *src_ptr, float *dst_ptr, int row, int col) {
 void RowMajor2Row4Major(const float *src_ptr, float *dst_ptr, int row, int col) {
   for (int r = 0; r < row; r++) {
     const float *src = src_ptr + r * col;
-    for (int c = 0; c < col; c++) {
+    int c = 0;
+    for (; c < col; c++) {
       int cd4 = c / C4NUM;
       int cm4 = c % C4NUM;
       dst_ptr[cd4 * C4NUM * row + r * C4NUM + cm4] = src[c];
     }
+    for (; c < UP_ROUND(col, C4NUM); c++) {
+      int cd4 = c / C4NUM;
+      int cm4 = c % C4NUM;
+      dst_ptr[cd4 * C4NUM * row + r * C4NUM + cm4] = 0;
+    }
   }
   return;
 }
@@ -39,11 +45,17 @@ void RowMajor2Row6Major(const float *src_ptr, float *dst_ptr, int row, int col) {
   for (int r = 0; r < row; r++) {
     const float *src = src_ptr + r * col;
-    for (int c = 0; c < col; c++) {
+    int c = 0;
+    for (; c < col; c++) {
       int cd6 = c / C6NUM;
       int cm6 = c % C6NUM;
       dst_ptr[cd6 * C6NUM * row + r * C6NUM + cm6] = src[c];
     }
+    for (; c < UP_ROUND(col, C6NUM); c++) {
+      int cd6 = c / C6NUM;
+      int cm6 = c % C6NUM;
+      dst_ptr[cd6 * C6NUM * row + r * C6NUM + cm6] = 0;
+    }
   }
   return;
 }
@@ -51,11 +63,17 @@ void RowMajor2Row8Major(const float *src_ptr, float *dst_ptr, int row, int col) {
   for (int r = 0; r < row; r++) {
     const float *src = src_ptr + r * col;
-    for (int c = 0; c < col; c++) {
+    int c = 0;
+    for (; c < col; c++) {
       int cd8 = c / C8NUM;
       int cm8 = c % C8NUM;
       dst_ptr[cd8 * C8NUM * row + r * C8NUM + cm8] = src[c];
     }
+    for (; c < UP_ROUND(col, C8NUM); c++) {
+      int cd8 = c / C8NUM;
+      int cm8 = c % C8NUM;
+      dst_ptr[cd8 * C8NUM * row + r * C8NUM + cm8] = 0;
+    }
   }
   return;
 }
@@ -63,11 +81,17 @@ void RowMajor2Row12Major(const float *src_ptr, float *dst_ptr, int row, int col) {
   for (int r = 0; r < row; r++) {
     const float *src = src_ptr + r * col;
-    for (int c = 0; c < col; c++) {
+    int c = 0;
+    for (; c < col; c++) {
       int cd12 = c / C12NUM;
       int cm12 = c % C12NUM;
       dst_ptr[cd12 * C12NUM * row + r * C12NUM + cm12] = src[c];
     }
+    for (; c < UP_ROUND(col, C12NUM); c++) {
+      int cd12 = c / C12NUM;
+      int cm12 = c % C12NUM;
+      dst_ptr[cd12 * C12NUM * row + r * C12NUM + cm12] = 0;
+    }
   }
   return;
 }
@@ -75,11 +99,17 @@ void RowMajor2Row16Major(const float *src_ptr, float *dst_ptr, int row, int col) {
   for (int r = 0; r < row; r++) {
     const float *src = src_ptr + r * col;
-    for (int c = 0; c < col; c++) {
+    int c = 0;
+    for (; c < col; c++) {
       int cd16 = c / C16NUM;
       int cm16 = c % C16NUM;
       dst_ptr[cd16 * C16NUM * row + r * C16NUM + cm16] = src[c];
     }
+    for (; c < UP_ROUND(col, C16NUM); c++) {
+      int cd16 = c / C16NUM;
+      int cm16 = c % C16NUM;
+      dst_ptr[cd16 * C16NUM * row + r * C16NUM + cm16] = 0;
+    }
   }
   return;
 }
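Note: each fp32 packer now zero-fills the tail of every row block up to the tile boundary instead of leaving it untouched, so the tiled kernels never read stale values from the padded lanes. A sketch of the pattern for the C4NUM case, assuming UP_ROUND carries its usual nnacl definition of rounding up to a multiple:

#include <cstdio>

// Assumed to match nnacl's macro: round x up to the next multiple of y.
#define UP_ROUND(x, y) (((x) + (y) - (1)) / (y) * (y))
#define C4NUM 4

// Pack one row-major row into 4-wide column blocks, zero-filling the tail.
static void PackRow4(const float *src, float *dst, int r, int row, int col) {
  int c = 0;
  for (; c < col; c++) {
    dst[(c / C4NUM) * C4NUM * row + r * C4NUM + (c % C4NUM)] = src[c];
  }
  for (; c < UP_ROUND(col, C4NUM); c++) {
    dst[(c / C4NUM) * C4NUM * row + r * C4NUM + (c % C4NUM)] = 0;  // padded lanes stay zero
  }
}

int main() {
  const float src[3] = {1, 2, 3};   // col = 3, padded up to 4
  float dst[4] = {-1, -1, -1, -1};  // row = 1 keeps the layout trivial
  PackRow4(src, dst, 0, 1, 3);
  for (int i = 0; i < 4; ++i) printf("%g ", dst[i]);  // prints: 1 2 3 0
  printf("\n");
  return 0;
}

The index arithmetic is identical in both loops; only the source of the value changes.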
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc
index 7df1608a53..b1185b1c86 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc
@@ -19,236 +19,61 @@
 #include "src/kernel_registry.h"
 
 using mindspore::lite::KernelRegistrar;
-using mindspore::lite::RET_ERROR;
-using mindspore::lite::RET_INPUT_TENSOR_ERROR;
-using mindspore::lite::RET_MEMORY_FAILED;
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_FullConnection;
 
 namespace mindspore::kernel {
-FullconnectionFP16CPUKernel::~FullconnectionFP16CPUKernel() { FreeTmpBuffer(); }
+void FullconnectionFP16CPUKernel::InitAShape() {
+  auto a_shape = in_tensors_.at(0)->shape();
+  params_->row_ = a_shape[0];
+  params_->deep_ = a_shape[1];
+}
 
-void FullconnectionFP16CPUKernel::FreeTmpBuffer() {
-  if (a_pack_ptr_ != nullptr) {
-    context_->allocator->Free(a_pack_ptr_);
-    a_pack_ptr_ = nullptr;
-  }
-  if (b_pack_ptr_ != nullptr) {
-    context_->allocator->Free(b_pack_ptr_);
-    b_pack_ptr_ = nullptr;
-  }
-  if (bias_ptr_ != nullptr) {
-    context_->allocator->Free(bias_ptr_);
-    bias_ptr_ = nullptr;
-  }
-  if (output_fp16_ != nullptr) {
-    context_->allocator->Free(output_fp16_);
-    output_fp16_ = nullptr;
-  }
+void FullconnectionFP16CPUKernel::InitBShape() {
+  auto b_shape = in_tensors_.at(1)->shape();
+  params_->col_ = b_shape[0];
+  params_->deep_ = b_shape[1];
 }
 
 int FullconnectionFP16CPUKernel::ReSize() {
-  FreeTmpBuffer();
-  int row = 1;
-  for (size_t i = 0; i < out_tensors_.at(0)->shape().size() - 1; ++i) row *= (out_tensors_.at(0)->shape())[i];
-  fc_param_->row_ = row;
-  fc_param_->col_ = out_tensors_.at(0)->shape().back();
-  fc_param_->deep_ = (in_tensors_.at(1)->shape()).at(1);
-  fc_param_->row_16_ = UP_ROUND(fc_param_->row_, C16NUM);
-  fc_param_->col_8_ = UP_ROUND(fc_param_->col_, C8NUM);
-  thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(fc_param_->col_, C8NUM));
-  thread_stride_ = UP_DIV(UP_DIV(fc_param_->col_, C8NUM), thread_count_) * C8NUM;
+  InitAShape();
+  InitBShape();
+  return MatmulBaseFP16CPUKernel::ReSize();
+}
 
-  if (row == 1) is_vector_input_ = true;
-  int a_pack_row = 0;
-  int b_pack_col = 0;
-  if (is_vector_input_) {
-    a_pack_row = 1;
-    b_pack_col = fc_param_->col_;
-  } else {
-    a_pack_row = fc_param_->row_16_;
-    b_pack_col = fc_param_->col_8_;
-  }
-  a_pack_ptr_ =
-    reinterpret_cast<float16_t *>(context_->allocator->Malloc(a_pack_row * fc_param_->deep_ * sizeof(float16_t)));
-  if (a_pack_ptr_ == nullptr) {
-    FreeTmpBuffer();
-    return RET_MEMORY_FAILED;
-  }
-  memset(a_pack_ptr_, 0, a_pack_row * fc_param_->deep_ * sizeof(float16_t));
+int FullconnectionFP16CPUKernel::Init() {
+  params_->batch = 1;
+  params_->a_transpose_ = false;
+  params_->b_transpose_ = true;
 
-  b_pack_ptr_ =
-    reinterpret_cast<float16_t *>(context_->allocator->Malloc(b_pack_col * fc_param_->deep_ * sizeof(float16_t)));
-  if (b_pack_ptr_ == nullptr) {
-    FreeTmpBuffer();
-    return RET_MEMORY_FAILED;
-  }
-  memset(b_pack_ptr_, 0, b_pack_col * fc_param_->deep_ * sizeof(float16_t));
+  MatmulBaseFP16CPUKernel::InitParameter();
 
-  fc_param_->b_const_ = (in_tensors_.at(1)->data_c() != nullptr);
-  if (fc_param_->b_const_) {
-    if (in_tensors_.at(1)->data_type() == kNumberTypeFloat32) {
-      if (is_vector_input_) {
-        Float32ToFloat16(reinterpret_cast<float *>(in_tensors_.at(1)->data_c()), b_pack_ptr_,
-                         fc_param_->col_ * fc_param_->deep_);
-      } else {
-        InitMatrixB(reinterpret_cast<float *>(in_tensors_.at(1)->data_c()), b_pack_ptr_);
-      }
-    } else {
-      if (is_vector_input_) {
-        memcpy(b_pack_ptr_, reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c()),
-               fc_param_->col_ * fc_param_->deep_ * sizeof(float16_t));
-      } else {
-        InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c()), b_pack_ptr_);
-      }
-    }
-    b_ptr_ = b_pack_ptr_;
+  if (params_->a_const_ == true) {
+    InitAShape();
   }
-  if (in_tensors_.size() == 3) {
-    bias_ptr_ = reinterpret_cast<float16_t *>(context_->allocator->Malloc(b_pack_col * sizeof(float16_t)));
-    if (bias_ptr_ == nullptr) {
-      FreeTmpBuffer();
-      return RET_MEMORY_FAILED;
-    }
-    memset(bias_ptr_, 0, b_pack_col * sizeof(float16_t));
-    Float32ToFloat16(reinterpret_cast<float *>(in_tensors_.at(2)->data_c()), bias_ptr_, fc_param_->col_);
+  if (params_->b_const_ == true) {
+    InitBShape();
   }
-  if (out_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
-    output_fp16_ =
-      reinterpret_cast<float16_t *>(context_->allocator->Malloc(fc_param_->row_ * fc_param_->col_ * sizeof(float16_t)));
-    if (output_fp16_ == nullptr) {
-      FreeTmpBuffer();
-      return RET_MEMORY_FAILED;
-    }
+  auto ret = MatmulBaseFP16CPUKernel::Init();
+  if (ret != RET_OK) {
+    return ret;
   }
-  return RET_OK;
-}  // namespace mindspore::kernel
-
-void FullconnectionFP16CPUKernel::InitMatrixA(float *a_ptr, float16_t *a_pack_ptr) {
-  RowMajor2Col16MajorFp16(reinterpret_cast<void *>(a_ptr), a_pack_ptr, fc_param_->row_, fc_param_->deep_, true);
-}
-
-void FullconnectionFP16CPUKernel::InitMatrixA(float16_t *a_ptr, float16_t *a_pack_ptr) {
-  RowMajor2Col16MajorFp16(reinterpret_cast<void *>(a_ptr), a_pack_ptr, fc_param_->row_, fc_param_->deep_, false);
-}
-
-void FullconnectionFP16CPUKernel::InitMatrixB(float *b_ptr, float16_t *b_pack_ptr) {
-  RowMajor2Col8MajorFp16(reinterpret_cast<void *>(b_ptr), b_pack_ptr, fc_param_->col_, fc_param_->deep_, true);
-}
-
-void FullconnectionFP16CPUKernel::InitMatrixB(float16_t *b_ptr, float16_t *b_pack_ptr) {
-  RowMajor2Col8MajorFp16(reinterpret_cast<void *>(b_ptr), b_pack_ptr, fc_param_->col_, fc_param_->deep_, false);
-}
 
-int FullconnectionFP16CPUKernel::Init() {
   if (!InferShapeDone()) {
     return RET_OK;
   }
   return ReSize();
 }
 
-int FullconnectionFP16CPUKernel::RunImpl(int task_id) {
-  int cur_stride = fc_param_->col_ - task_id * thread_stride_;
-  int cur_oc = MSMIN(thread_stride_, cur_stride);
-  if (cur_oc <= 0) {
-    return RET_OK;
-  }
-  auto b = b_ptr_ + task_id * thread_stride_ * fc_param_->deep_;
-  auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id;
-  auto c = output_ptr_ + task_id * thread_stride_;
-  if (is_vector_input_) {
-    MatVecMulFp16(a_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, cur_oc);
-  } else {
-    MatMulFp16(a_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, fc_param_->row_, cur_oc, fc_param_->col_,
-               OutType_Nhwc);
-  }
-
-  return RET_OK;
-}
-
-int FcFP16Run(void *cdata, int task_id) {
-  auto op = reinterpret_cast<FullconnectionFP16CPUKernel *>(cdata);
-  auto error_code = op->RunImpl(task_id);
-  if (error_code != RET_OK) {
-    MS_LOG(ERROR) << "MatmulFp32Run error task_id[" << task_id << "] error_code[" << error_code << "]";
-    return RET_ERROR;
-  }
-  return RET_OK;
-}
-
 int FullconnectionFP16CPUKernel::Run() {
-  auto out_tensor = out_tensors_.at(0);
-  if (out_tensor->data_type() == kNumberTypeFloat32) {
-    output_ptr_ = output_fp16_;
-  } else {
-    output_ptr_ = reinterpret_cast<float16_t *>(out_tensor->data_c());
-  }
-
-  if (in_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
-    if (is_vector_input_) {
-      Float32ToFloat16(reinterpret_cast<float *>(in_tensors_.at(0)->data_c()), a_pack_ptr_, fc_param_->deep_);
-    } else {
-      InitMatrixA(reinterpret_cast<float *>(in_tensors_.at(0)->data_c()), a_pack_ptr_);
-    }
-    a_ptr_ = a_pack_ptr_;
-  } else {
-    if (is_vector_input_) {
-      a_ptr_ = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
-    } else {
-      InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c()), a_pack_ptr_);
-      a_ptr_ = a_pack_ptr_;
-    }
-  }
-
-  if (!fc_param_->b_const_) {
-    if (in_tensors_.at(1)->data_type() == kNumberTypeFloat32) {
-      if (is_vector_input_) {
-        Float32ToFloat16(reinterpret_cast<float *>(in_tensors_.at(1)->data_c()), b_pack_ptr_,
-                         fc_param_->col_ * fc_param_->deep_);
-      } else {
-        InitMatrixB(reinterpret_cast<float *>(in_tensors_.at(1)->data_c()), b_pack_ptr_);
-      }
-      b_ptr_ = b_pack_ptr_;
-    } else {
-      if (is_vector_input_) {
-        b_ptr_ = reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c());
-      } else {
-        InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c()), b_pack_ptr_);
-        b_ptr_ = b_pack_ptr_;
-      }
-    }
-  }
-  ParallelLaunch(this->context_->thread_pool_, FcFP16Run, this, thread_count_);
-  if (out_tensor->data_type() == kNumberTypeFloat32) {
-    auto size = out_tensor->ElementsNum();
-    auto out_tensor_data = reinterpret_cast<float *>(out_tensor->data_c());
-    Float16ToFloat32(output_fp16_, out_tensor_data, size);
-  }
-  return RET_OK;
-}
-
-kernel::LiteKernel *CpuFullConnectionFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
-                                                       const std::vector<lite::Tensor *> &outputs,
-                                                       OpParameter *opParameter, const lite::InnerContext *ctx,
-                                                       const kernel::KernelKey &desc,
-                                                       const mindspore::lite::PrimitiveC *primitive) {
-  auto *kernel = new (std::nothrow) FullconnectionFP16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
-  if (kernel == nullptr) {
-    MS_LOG(ERROR) << "kernel is nullptr.";
-    free(opParameter);
-    return nullptr;
-  }
-  auto ret = kernel->Init();
+  auto ret = MatmulBaseFP16CPUKernel::Run();
   if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
-                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
-    delete kernel;
-    return nullptr;
+    MS_LOG(ERROR) << "FullconnectionFP16CPUKernel run failed";
   }
-  return kernel;
+  return ret;
 }
 
-REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_FullConnection, CpuFullConnectionFp16KernelCreator)
+REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_FullConnection, LiteKernelCreator<FullconnectionFP16CPUKernel>)
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.h
index 146bd34604..38bbce8ce5 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.h
@@ -19,46 +19,24 @@
 #include <arm_neon.h>
 #include <vector>
-#include "include/errorcode.h"
-#include "nnacl/matmul_parameter.h"
-#include "nnacl/fp16/matmul_fp16.h"
-#include "nnacl/fp16/cast_fp16.h"
-#include "src/lite_kernel.h"
+#include "src/runtime/kernel/arm/fp16/matmul_base_fp16.h"
 
 namespace mindspore::kernel {
-class FullconnectionFP16CPUKernel : public LiteKernel {
+class FullconnectionFP16CPUKernel : public MatmulBaseFP16CPUKernel {
  public:
   explicit FullconnectionFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                                        const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
                                        const mindspore::lite::PrimitiveC *primitive)
-      : LiteKernel(parameter, inputs, outputs, ctx, primitive) {
-    fc_param_ = reinterpret_cast<MatMulParameter *>(op_parameter_);
-  }
-  ~FullconnectionFP16CPUKernel() override;
+      : MatmulBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
+  ~FullconnectionFP16CPUKernel() override = default;
 
+  int Init() override;
   int ReSize() override;
   int Run() override;
-  int RunImpl(int task_id);
-
- private:
-  void InitMatrixA(float *a_ptr, float16_t *a_pack_ptr);
-  void InitMatrixA(float16_t *a_ptr, float16_t *a_pack_ptr);
-  void InitMatrixB(float *b_ptr, float16_t *b_pack_ptr);
-  void InitMatrixB(float16_t *b_ptr, float16_t *b_pack_ptr);
-  void FreeTmpBuffer();
 
  private:
-  MatMulParameter *fc_param_ = nullptr;
-  float16_t *a_pack_ptr_ = nullptr;
-  float16_t *b_pack_ptr_ = nullptr;
-  float16_t *bias_ptr_ = nullptr;
-  float16_t *output_fp16_ = nullptr;
-  float16_t *output_ptr_ = nullptr;
-  float16_t *a_ptr_ = nullptr;
-  float16_t *b_ptr_ = nullptr;
-  bool is_vector_input_ = false;
-  int thread_count_ = 1;
-  int thread_stride_ = 0;
+  void InitAShape();
+  void InitBShape();
 };
 }  // namespace mindspore::kernel
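Note: after this refactor the fully-connected kernel is just a batch-1 matmul with a transposed weight matrix. Init() pins batch = 1, a_transpose_ = false and b_transpose_ = true, and InitBShape() reads the weight tensor as col x deep. A plain C++ reference of that contract (fp32 for brevity; the helper is hypothetical, not the kernel's actual code path):

#include <cstdio>

// Reference semantics: A is row x deep, B (weights) is col x deep,
// i.e. b_transpose_ = true, batch = 1, out[r][c] = sum_d A[r][d] * B[c][d] + bias[c].
static void FullConnectionRef(const float *a, const float *b, const float *bias,
                              float *out, int row, int col, int deep) {
  for (int r = 0; r < row; ++r) {
    for (int c = 0; c < col; ++c) {
      float acc = bias != nullptr ? bias[c] : 0.0f;
      for (int d = 0; d < deep; ++d) {
        acc += a[r * deep + d] * b[c * deep + d];  // B indexed as transposed
      }
      out[r * col + c] = acc;
    }
  }
}

int main() {
  const float a[1 * 2] = {1, 2};              // row = 1, deep = 2
  const float b[3 * 2] = {1, 0, 0, 1, 1, 1};  // col = 3, deep = 2
  const float bias[3] = {10, 20, 30};
  float out[3];
  FullConnectionRef(a, b, bias, out, 1, 3, 2);
  for (int i = 0; i < 3; ++i) printf("%g ", out[i]);  // prints: 11 22 33
  printf("\n");
  return 0;
}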
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.cc
new file mode 100644
index 0000000000..89a432d17b
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.cc
@@ -0,0 +1,298 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/arm/fp16/matmul_base_fp16.h"
+#include "nnacl/fp16/matmul_fp16.h"
+#include "nnacl/fp16/cast_fp16.h"
+#include "src/runtime/runtime_api.h"
+#include "include/errorcode.h"
+
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_INPUT_TENSOR_ERROR;
+using mindspore::lite::RET_MEMORY_FAILED;
+using mindspore::lite::RET_OK;
+
+namespace mindspore::kernel {
+int MatmulBaseFP16Run(void *cdata, int task_id) {
+  auto op = reinterpret_cast<MatmulBaseFP16CPUKernel *>(cdata);
+  auto error_code = op->RunImpl(task_id);
+  if (error_code != RET_OK) {
+    MS_LOG(ERROR) << "MatmulFp16Run error task_id[" << task_id << "] error_code[" << error_code << "]";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+MatmulBaseFP16CPUKernel::~MatmulBaseFP16CPUKernel() {
+  if (bias_ptr_ != nullptr) {
+    free(bias_ptr_);
+    bias_ptr_ = nullptr;
+  }
+  FreeResizeBufA();
+  FreeResizeBufB();
+}
+
+void MatmulBaseFP16CPUKernel::FreeResizeBufA() {
+  if (a_pack_ptr_ != nullptr) {
+    context_->allocator->Free(a_pack_ptr_);
+    a_pack_ptr_ = nullptr;
+  }
+  return;
+}
+
+void MatmulBaseFP16CPUKernel::FreeResizeBufB() {
+  if (b_pack_ptr_ != nullptr) {
+    context_->allocator->Free(b_pack_ptr_);
+    b_pack_ptr_ = nullptr;
+  }
+  return;
+}
+
+void MatmulBaseFP16CPUKernel::InitParameter() {
+  params_->a_const_ = (in_tensors_[0]->data_c() != nullptr);
+  params_->b_const_ = (in_tensors_[1]->data_c() != nullptr);
+  return;
+}
+
+int MatmulBaseFP16CPUKernel::InitBias() {
+  if (in_tensors_.size() == 3) {
+    auto bias_tensor = in_tensors_[2];
+    int max_bias_data = UP_ROUND(bias_tensor->ElementsNum(), C8NUM);
+    bias_ptr_ = reinterpret_cast<float16_t *>(malloc(max_bias_data * sizeof(float16_t)));
+    if (bias_ptr_ == nullptr) {
+      MS_LOG(ERROR) << "malloc bias_ptr_ failed";
+      return RET_ERROR;
+    }
+    memset(bias_ptr_, 0, max_bias_data * sizeof(float16_t));
+    Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->data_c()), bias_ptr_, bias_tensor->ElementsNum());
+  }
+  return RET_OK;
+}
+
+int MatmulBaseFP16CPUKernel::ReSize() {
+  ResizeParameter();
+
+  if (params_->b_const_ == true && src_b_ != nullptr) {
+    InitBufferB();
+    InitMatrixB(src_b_, kNumberTypeFloat16);
+    free(src_b_);
+    src_b_ = nullptr;
+  }
+
+  thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(params_->col_, C8NUM));
+  thread_stride_ = UP_DIV(UP_DIV(params_->col_, C8NUM), thread_count_) * C8NUM;
+  return RET_OK;
+}
+
+void MatmulBaseFP16CPUKernel::ResizeParameter() {
+  if (params_->row_ == 1) {
+    vec_matmul_ = true;
+  }
+
+  if (vec_matmul_) {
+    params_->row_align_ = 1;
+    params_->col_align_ = params_->col_;
+  } else {
+    params_->row_align_ = UP_ROUND(params_->row_, C16NUM);
+    params_->col_align_ = UP_ROUND(params_->col_, C8NUM);
+  }
+  return;
+}
+
+int MatmulBaseFP16CPUKernel::InitBufferA() {
+  a_pack_ptr_ = reinterpret_cast<float16_t *>(
+    context_->allocator->Malloc(params_->batch * params_->row_align_ * params_->deep_ * sizeof(float16_t)));
+  if (a_pack_ptr_ == nullptr) {
+    return RET_MEMORY_FAILED;
+  }
+
+  memset(a_pack_ptr_, 0, params_->batch * params_->row_align_ * params_->deep_ * sizeof(float16_t));
+  return RET_OK;
+}
+
+int MatmulBaseFP16CPUKernel::InitBufferB() {
+  if (b_pack_ptr_ != nullptr) {
+    return RET_OK;
+  }
+
+  b_pack_ptr_ = reinterpret_cast<float16_t *>(
+    context_->allocator->Malloc(params_->batch * params_->col_align_ * params_->deep_ * sizeof(float16_t)));
+  if (b_pack_ptr_ == nullptr) {
+    return RET_MEMORY_FAILED;
+  }
+
+  memset(b_pack_ptr_, 0, params_->batch * params_->col_align_ * params_->deep_ * sizeof(float16_t));
+  return RET_OK;
+}
+
+void MatmulBaseFP16CPUKernel::InitMatrixA(void *src_ptr) {
+  auto src_data_type = in_tensors_[0]->data_type();
+
+  if (vec_matmul_) {
+    if (src_data_type == kNumberTypeFloat32) {
+      Float32ToFloat16(reinterpret_cast<float *>(src_ptr), a_pack_ptr_, params_->batch * params_->deep_);
+    } else {
+      memcpy(a_pack_ptr_, src_ptr, params_->batch * params_->deep_ * sizeof(float16_t));
+    }
+    return;
+  }
+
+  int8_t *int8_src = reinterpret_cast<int8_t *>(src_ptr);
+  for (int i = 0; i < params_->batch; i++) {
+    int8_t *src = int8_src + i * params_->deep_ * params_->row_ * lite::DataTypeSize(src_data_type);
+    float16_t *dst = a_pack_ptr_ + i * params_->deep_ * params_->row_align_;
+    if (params_->a_transpose_) {
+      RowMajor2Row16MajorFp16(src, dst, params_->deep_, params_->row_, src_data_type == kNumberTypeFloat32);
+    } else {
+      RowMajor2Col16MajorFp16(src, dst, params_->row_, params_->deep_, src_data_type == kNumberTypeFloat32);
+    }
+  }
+  return;
+}
+
+void MatmulBaseFP16CPUKernel::InitMatrixB(void *src_ptr, TypeId src_data_type) {
+  int8_t *int8_src = reinterpret_cast<int8_t *>(src_ptr);
+
+  if (vec_matmul_) {
+    if (params_->b_transpose_) {
+      if (src_data_type == kNumberTypeFloat32) {
+        Float32ToFloat16(reinterpret_cast<float *>(src_ptr), b_pack_ptr_,
+                         params_->batch * params_->col_ * params_->deep_);
+      } else {
+        memcpy(b_pack_ptr_, src_ptr, params_->batch * params_->col_ * params_->deep_ * sizeof(float16_t));
+      }
+    } else {
+      for (int i = 0; i < params_->batch; i++) {
+        const int8_t *batch_src = int8_src + i * params_->deep_ * params_->col_ * lite::DataTypeSize(src_data_type);
+        float16_t *dst = b_pack_ptr_ + i * params_->deep_ * params_->col_;
+        RowMajor2ColMajorFp16(batch_src, dst, params_->deep_, params_->col_, src_data_type == kNumberTypeFloat32);
+      }
+    }
+    return;
+  }
+
+  for (int i = 0; i < params_->batch; i++) {
+    int8_t *src = int8_src + i * params_->deep_ * params_->col_ * lite::DataTypeSize(src_data_type);
+    float16_t *dst = b_pack_ptr_ + i * params_->deep_ * params_->col_align_;
+    if (params_->b_transpose_) {
+      RowMajor2Col8MajorFp16(src, dst, params_->col_, params_->deep_, src_data_type == kNumberTypeFloat32);
+    } else {
+      RowMajor2Row8MajorFp16(src, dst, params_->deep_, params_->col_, src_data_type == kNumberTypeFloat32);
+    }
+  }
+  return;
+}
+
+int MatmulBaseFP16CPUKernel::Init() {
+  ResizeParameter();
+
+  if (params_->a_const_ == true) {
+    if (RET_OK != InitBufferA()) {
+      return RET_ERROR;
+    }
+    InitMatrixA(reinterpret_cast<void *>(in_tensors_[0]->data_c()));
+  }
+
+  if (params_->b_const_ == true) {
+    /* copy origin b data, pack in resize
+     * pack after infershape is done */
+    auto b_tensor = in_tensors_[1];
+    src_b_ = reinterpret_cast<float16_t *>(malloc(params_->batch * params_->col_ * params_->deep_ * sizeof(float16_t)));
+    if (src_b_ == nullptr) {
+      MS_LOG(ERROR) << "Matmul fp16 malloc src_b_ failed";
+      return RET_ERROR;
+    }
+
+    if (b_tensor->data_type() == kNumberTypeFloat32) {
+      Float32ToFloat16(reinterpret_cast<float *>(b_tensor->data_c()), src_b_,
+                       params_->batch * params_->col_ * params_->deep_);
+    } else {
+      memcpy(src_b_, b_tensor->data_c(), params_->batch * params_->col_ * params_->deep_ * sizeof(float16_t));
+    }
+  }
+
+  auto ret = InitBias();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Matmul fp16 init bias failed";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int MatmulBaseFP16CPUKernel::RunImpl(int task_id) {
+  int cur_stride = params_->col_ - task_id * thread_stride_;
+  int cur_oc = MSMIN(thread_stride_, cur_stride);
+  if (cur_oc <= 0) {
+    return RET_OK;
+  }
+
+  auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id;
+  auto b = batch_b_ptr_ + task_id * thread_stride_ * params_->deep_;
+  auto c = batch_c_ptr_ + task_id * thread_stride_;
+
+  if (vec_matmul_) {
+    MatVecMulFp16(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, cur_oc);
+  } else {
+    MatMulFp16(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, params_->row_, cur_oc, params_->col_,
+               OutType_Nhwc);
+  }
+  return RET_OK;
+}
+
+int MatmulBaseFP16CPUKernel::Run() {
+  auto c_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
+
+  if (params_->a_const_ == false) {
+    if (RET_OK != InitBufferA()) {
+      return RET_ERROR;
+    }
+    InitMatrixA(in_tensors_.at(0)->data_c());
+  }
+  if (params_->b_const_ == false) {
+    if (RET_OK != InitBufferB()) {
+      FreeResizeBufA();
+      return RET_ERROR;
+    }
+    InitMatrixB(in_tensors_.at(1)->data_c(), in_tensors_.at(1)->data_type());
+  }
+
+  for (int i = 0; i < params_->batch; ++i) {
+    if (vec_matmul_) {
+      batch_a_ptr_ = a_pack_ptr_ + i * params_->deep_;
+      batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_;
+      batch_c_ptr_ = c_ptr + i * params_->row_ * params_->col_;
+    } else {
+      batch_a_ptr_ = a_pack_ptr_ + i * params_->row_align_ * params_->deep_;
+      batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_align_;
+      batch_c_ptr_ = c_ptr + i * params_->row_ * params_->col_;
+    }
+    auto ret = ParallelLaunch(this->context_->thread_pool_, MatmulBaseFP16Run, this, thread_count_);
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "MatmulBaseFP16Run failed";
+      return ret;
+    }
+  }
+
+  if (params_->a_const_ == false) {
+    FreeResizeBufA();
+  }
+
+  if (params_->b_const_ == false) {
+    FreeResizeBufB();
+  }
+  return RET_OK;
+}
+
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.h
new file mode 100644
index 0000000000..92eb91b784
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.h
@@ -0,0 +1,74 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATMUL_BASE_FP16_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATMUL_BASE_FP16_H_
+
+#ifdef ENABLE_NEON
+#include <arm_neon.h>
+#endif
+#include <vector>
+#include "src/lite_kernel.h"
+#include "nnacl/matmul_parameter.h"
+
+namespace mindspore::kernel {
+class MatmulBaseFP16CPUKernel : public LiteKernel {
+ public:
+  explicit MatmulBaseFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
+                                   const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
+                                   const mindspore::lite::PrimitiveC *primitive)
+      : LiteKernel(parameter, inputs, outputs, ctx, primitive) {
+    params_ = reinterpret_cast<MatMulParameter *>(op_parameter_);
+  }
+  ~MatmulBaseFP16CPUKernel() override;
+  int Init() override;
+  int ReSize() override;
+  int Run() override;
+
+ public:
+  int RunImpl(int task_id);
+
+ protected:
+  void InitParameter();
+
+ private:
+  int InitBias();
+  void ResizeParameter();
+  int InitBufferA();
+  int InitBufferB();
+  void InitMatrixA(void *src_ptr);
+  void InitMatrixB(void *src_ptr, TypeId data_type);
+  void FreeResizeBufA();
+  void FreeResizeBufB();
+
+ protected:
+  MatMulParameter *params_ = nullptr;
+
+ private:
+  int thread_stride_ = 0;
+  int thread_count_ = 0;
+  bool vec_matmul_ = false;
+  float16_t *a_pack_ptr_ = nullptr;
+  float16_t *b_pack_ptr_ = nullptr;
+  float16_t *src_b_ = nullptr;
+  float16_t *bias_ptr_ = nullptr;
+  float16_t *batch_a_ptr_ = nullptr;
+  float16_t *batch_b_ptr_ = nullptr;
+  float16_t *batch_c_ptr_ = nullptr;
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATMUL_BASE_FP16_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc
index f3f2a8f6b2..f2e9c4ca20 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc
@@ -15,48 +15,20 @@
  */
 
 #include "src/runtime/kernel/arm/fp16/matmul_fp16.h"
-#include "nnacl/fp16/matmul_fp16.h"
-#include "nnacl/fp16/cast_fp16.h"
 #include "src/runtime/runtime_api.h"
 #include "include/errorcode.h"
 #include "src/kernel_registry.h"
 
 using mindspore::lite::KernelRegistrar;
-using mindspore::lite::RET_ERROR;
-using mindspore::lite::RET_INPUT_TENSOR_ERROR;
-using mindspore::lite::RET_MEMORY_FAILED;
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_MatMul;
 
 namespace mindspore::kernel {
-MatmulFP16CPUKernel::~MatmulFP16CPUKernel() {
-  if (a_pack_ptr_ != nullptr) {
-    free(a_pack_ptr_);
-    a_pack_ptr_ = nullptr;
-  }
-  if (b_pack_ptr_ != nullptr) {
-    free(b_pack_ptr_);
-    b_pack_ptr_ = nullptr;
-  }
-  if (bias_ptr_ != nullptr) {
-    free(bias_ptr_);
-    bias_ptr_ = nullptr;
-  }
-}
-
-void MatmulFP16CPUKernel::FreeTmpBuffer() {
-  if (a_pack_ptr_ != nullptr) {
-    params_->a_const_ ? free(a_pack_ptr_) : context_->allocator->Free(a_pack_ptr_);
-    a_pack_ptr_ = nullptr;
-  }
-  if (b_pack_ptr_ != nullptr) {
-    params_->b_const_ ? free(b_pack_ptr_) : context_->allocator->Free(b_pack_ptr_);
-    b_pack_ptr_ = nullptr;
-  }
-}
-
-int MatmulFP16CPUKernel::MallocMatrixABuffer() {
+void MatmulFP16CPUKernel::InitAShape() {
   auto a_shape = in_tensors_[0]->shape();
+  if (a_shape.empty()) {
+    return;
+  }
   int batch = 1;
   for (size_t i = 0; i < a_shape.size() - 2; ++i) {
     batch *= a_shape[i];
@@ -65,25 +37,12 @@ int MatmulFP16CPUKernel::MallocMatrixABuffer() {
   params_->row_ = params_->a_transpose_ ? a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2];
   params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1];
   params_->row_16_ = UP_ROUND(params_->row_, C16NUM);
-  if (params_->a_const_) {
-    a_pack_ptr_ =
-      reinterpret_cast<float16_t *>(malloc(params_->batch * params_->row_16_ * params_->deep_ * sizeof(float16_t)));
-  } else {
-    a_pack_ptr_ = reinterpret_cast<float16_t *>(
-      context_->allocator->Malloc(params_->batch * params_->row_16_ * params_->deep_ * sizeof(float16_t)));
-  }
-  if (a_pack_ptr_ == nullptr) {
-    FreeTmpBuffer();
-    return RET_MEMORY_FAILED;
-  }
-  memset(a_pack_ptr_, 0, params_->batch * params_->row_16_ * params_->deep_ * sizeof(float16_t));
-  return RET_OK;
 }
 
-int MatmulFP16CPUKernel::MallocMatrixBBuffer() {
+void MatmulFP16CPUKernel::InitBShape() {
   auto b_shape = in_tensors_[1]->shape();
   if (b_shape.empty()) {
-    return RET_OK;
+    return;
   }
   int batch = 1;
   for (size_t i = 0; i < b_shape.size() - 2; ++i) {
@@ -93,257 +52,42 @@ void MatmulFP16CPUKernel::InitBShape() {
   params_->col_ = params_->b_transpose_ ? b_shape[b_shape.size() - 2] : b_shape[b_shape.size() - 1];
   params_->col_8_ = UP_ROUND(params_->col_, 8);
   params_->deep_ = params_->b_transpose_ ? b_shape[b_shape.size() - 1] : b_shape[b_shape.size() - 2];
-
-  if (params_->b_const_) {
-    b_pack_ptr_ =
-      reinterpret_cast<float16_t *>(malloc(params_->batch * params_->col_8_ * params_->deep_ * sizeof(float16_t)));
-  } else {
-    b_pack_ptr_ = reinterpret_cast<float16_t *>(
-      context_->allocator->Malloc(params_->batch * params_->col_8_ * params_->deep_ * sizeof(float16_t)));
-  }
-  if (b_pack_ptr_ == nullptr) {
-    FreeTmpBuffer();
-    return RET_MEMORY_FAILED;
-  }
-  memset(b_pack_ptr_, 0, params_->batch * params_->col_8_ * params_->deep_ * sizeof(float16_t));
-  thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(params_->col_, C8NUM));
-  thread_stride_ = UP_DIV(UP_DIV(params_->col_, C8NUM), thread_count_) * C8NUM;
-  return RET_OK;
-}
-
-int MatmulFP16CPUKernel::InitBias() {
-  auto b_shape = in_tensors_[1]->shape();
-  auto c_shape = out_tensors_[0]->shape();
-  params_->col_ = params_->b_const_
-                    ? (params_->b_transpose_ ? b_shape[b_shape.size() - 2] : b_shape[b_shape.size() - 1])
-                    : (c_shape[c_shape.size() - 1]);
-  params_->col_8_ = UP_ROUND(params_->col_, 8);
-  bias_ptr_ = reinterpret_cast<float16_t *>(malloc(params_->col_8_ * sizeof(float16_t)));
-  if (bias_ptr_ == nullptr) {
-    FreeTmpBuffer();
-    return RET_MEMORY_FAILED;
-  }
-  memset(bias_ptr_, 0, params_->col_8_ * sizeof(float16_t));
-  if (in_tensors_.size() == 3) {
-    Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->data_c()), bias_ptr_, params_->col_);
-  }
-  return RET_OK;
-}
-
-int MatmulFP16CPUKernel::ReSize() {
-  if (!params_->b_const_) {
-    auto ret = InitBias();
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "Matmul fp16 init bias failed";
-      return RET_ERROR;
-    }
-  }
-  return RET_OK;
-}
-
-void MatmulFP16CPUKernel::InitMatrixA(float *a_ptr, float16_t *a_pack_ptr) {
-  for (int i = 0; i < params_->batch; i++) {
-    float *src = a_ptr + i * params_->deep_ * params_->row_;
-    float16_t *dst = a_pack_ptr + i * params_->deep_ * params_->row_16_;
-    if (params_->a_transpose_) {
-      RowMajor2Row16MajorFp16(reinterpret_cast<void *>(src), dst, params_->deep_, params_->row_, true);
-    } else {
-      RowMajor2Col16MajorFp16(reinterpret_cast<void *>(src), dst, params_->row_, params_->deep_, true);
-    }
-  }
-}
-
-void MatmulFP16CPUKernel::InitMatrixA(float16_t *a_ptr, float16_t *a_pack_ptr) {
-  for (int i = 0; i < params_->batch; i++) {
-    float16_t *src = a_ptr + i * params_->deep_ * params_->row_;
-    float16_t *dst = a_pack_ptr + i * params_->deep_ * params_->row_16_;
-    if (params_->a_transpose_) {
-      RowMajor2Row16MajorFp16(reinterpret_cast<void *>(src), dst, params_->deep_, params_->row_, false);
-    } else {
-      RowMajor2Col16MajorFp16(reinterpret_cast<void *>(src), dst, params_->row_, params_->deep_, false);
-    }
-  }
-}
-
-void MatmulFP16CPUKernel::InitMatrixB(float *b_ptr, float16_t *b_pack_ptr) {
-  for (int i = 0; i < params_->batch; i++) {
-    float *src = b_ptr + i * params_->deep_ * params_->col_;
-    float16_t *dst = b_pack_ptr + i * params_->deep_ * params_->col_8_;
-    if (params_->b_transpose_) {
-      RowMajor2Col8MajorFp16(reinterpret_cast<void *>(src), dst, params_->col_, params_->deep_, true);
-    } else {
-      RowMajor2Row8MajorFp16(reinterpret_cast<void *>(src), dst, params_->deep_, params_->col_, true);
-    }
-  }
-}
-
-void MatmulFP16CPUKernel::InitMatrixB(float16_t *b_ptr, float16_t *b_pack_ptr) {
-  for (int i = 0; i < params_->batch; i++) {
-    float16_t *src = b_ptr + i * params_->deep_ * params_->col_;
-    float16_t *dst = b_pack_ptr + i * params_->deep_ * params_->col_8_;
-    if (params_->b_transpose_) {
-      RowMajor2Col8MajorFp16(reinterpret_cast<void *>(src), dst, params_->col_, params_->deep_, false);
-    } else {
-      RowMajor2Row8MajorFp16(reinterpret_cast<void *>(src), dst, params_->deep_, params_->col_, false);
-    }
-  }
 }
 
 int MatmulFP16CPUKernel::Init() {
-  params_->a_const_ = (in_tensors_[0]->data_c() != nullptr);
-  params_->b_const_ = (in_tensors_[1]->data_c() != nullptr);
+  MatmulBaseFP16CPUKernel::InitParameter();
+
   if (params_->a_const_) {
-    auto ret = MallocMatrixABuffer();
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "Matmul fp16 malloc matrix A buffer failed";
-      return RET_ERROR;
-    }
-    if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
-      InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->data_c()), a_pack_ptr_);
-    } else {
-      InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_[0]->data_c()), a_pack_ptr_);
-    }
+    InitAShape();
   }
   if (params_->b_const_) {
-    auto ret = MallocMatrixBBuffer();
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "Matmul fp16 malloc matrix B buffer failed";
-      return RET_ERROR;
-    }
-    if (in_tensors_[1]->data_type() == kNumberTypeFloat32) {
-      InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_);
-    } else {
-      InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_[1]->data_c()), b_pack_ptr_);
-    }
-    ret = InitBias();
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "Matmul fp16 init bias failed";
-      return RET_ERROR;
-    }
+    InitBShape();
   }
-  return RET_OK;
-}
 
-int MatmulFP16CPUKernel::MallocFp16Output() {
-  if (out_tensors_[0]->data_type() == kNumberTypeFloat32) {
-    output_ptr_ = reinterpret_cast<float16_t *>(
-      context_->allocator->Malloc(params_->batch * params_->row_ * params_->col_ * sizeof(float16_t)));
-    if (output_ptr_ == nullptr) {
-      MS_LOG(ERROR) << "malloc output_ptr_ failed.";
-      return RET_MEMORY_FAILED;
-    }
+  auto ret = MatmulBaseFP16CPUKernel::Init();
+  if (ret != RET_OK) {
+    return ret;
   }
-  return RET_OK;
-}
 
-int MatmulFP16CPUKernel::RunImpl(int task_id) {
-  int cur_stride = params_->col_ - task_id * thread_stride_;
-  int cur_oc = MSMIN(thread_stride_, cur_stride);
-  if (cur_oc <= 0) {
+  if (!InferShapeDone()) {
     return RET_OK;
   }
-  auto b = current_b_ + task_id * thread_stride_ * params_->deep_;
-  auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id;
-  auto c = current_c_ + task_id * thread_stride_;
-  MatMulFp16(current_a_, b, c, bias, ActType_No, params_->deep_, params_->row_, cur_oc, params_->col_, OutType_Nhwc);
-
-  return RET_OK;
+  return ReSize();
 }
 
-int MatmulFP16Run(void *cdata, int task_id) {
-  auto op = reinterpret_cast<MatmulFP16CPUKernel *>(cdata);
-  auto error_code = op->RunImpl(task_id);
-  if (error_code != RET_OK) {
-    MS_LOG(ERROR) << "MatmulFp16Run error task_id[" << task_id << "] error_code[" << error_code << "]";
-    return RET_ERROR;
-  }
-  return RET_OK;
+int MatmulFP16CPUKernel::ReSize() {
+  InitAShape();
+  InitBShape();
+  return MatmulBaseFP16CPUKernel::ReSize();
 }
 
 int MatmulFP16CPUKernel::Run() {
-  auto out_tensor = out_tensors_.at(0);
-  auto ret = MallocFp16Output();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Matmul MallocFp16Output failed";
-    return RET_ERROR;
-  }
-  float16_t *c_ptr = nullptr;
-  if (out_tensor->data_type() == kNumberTypeFloat32) {
-    c_ptr = output_ptr_;
-  } else {
-    c_ptr = reinterpret_cast<float16_t *>(out_tensor->data_c());
-  }
-  if (!params_->a_const_) {
-    ret = MallocMatrixABuffer();
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "Matmul fp16 malloc matrix A buffer failed";
-      return RET_ERROR;
-    }
-    if (in_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
-      InitMatrixA(reinterpret_cast<float *>(in_tensors_.at(0)->data_c()), a_pack_ptr_);
-    } else {
-      InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c()), a_pack_ptr_);
-    }
-  }
-  if (!params_->b_const_) {
-    ret = MallocMatrixBBuffer();
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "Matmul fp16 malloc matrix B buffer failed";
-      return RET_ERROR;
-    }
-    if (in_tensors_.at(1)->data_type() == kNumberTypeFloat32) {
-      InitMatrixB(reinterpret_cast<float *>(in_tensors_.at(1)->data_c()), b_pack_ptr_);
-    } else {
-      InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c()), b_pack_ptr_);
-    }
-  }
-  for (int i = 0; i < params_->batch; ++i) {
-    current_a_ = a_pack_ptr_ + i * params_->row_16_ * params_->deep_;
-    current_b_ = b_pack_ptr_ + i * params_->deep_ * params_->col_8_;
-    current_c_ = c_ptr + i * params_->row_ * params_->col_;
-    ret = ParallelLaunch(this->context_->thread_pool_, MatmulFP16Run, this, thread_count_);
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "Matmul fp16 run function MatmulFP16Run failed";
-      FreeTmpBuffer();
-      return RET_ERROR;
-    }
-  }
-  if (out_tensor->data_type() == kNumberTypeFloat32) {
-    auto size = out_tensor->ElementsNum();
-    auto out_tensor_data = reinterpret_cast<float *>(out_tensor->data_c());
-    Float16ToFloat32(output_ptr_, out_tensor_data, size);
-    context_->allocator->Free(output_ptr_);
-  }
-  if (!params_->a_const_) {
-    context_->allocator->Free(a_pack_ptr_);
-    a_pack_ptr_ = nullptr;
-  }
-  if (!params_->b_const_) {
-    context_->allocator->Free(b_pack_ptr_);
-    b_pack_ptr_ = nullptr;
-  }
-  return RET_OK;
-}
-
-kernel::LiteKernel *CpuMatmulFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
-                                               const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
-                                               const lite::InnerContext *ctx, const kernel::KernelKey &desc,
-                                               const mindspore::lite::PrimitiveC *primitive) {
-  auto *kernel = new (std::nothrow) MatmulFP16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
-  if (kernel == nullptr) {
-    MS_LOG(ERROR) << "kernel is nullptr.";
-    free(opParameter);
-    return nullptr;
-  }
-  auto ret = kernel->Init();
+  auto ret = MatmulBaseFP16CPUKernel::Run();
   if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
-                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
-    delete kernel;
-    return nullptr;
+    MS_LOG(ERROR) << "MatmulFP16CPUKernel run failed";
   }
-  return kernel;
+  return ret;
 }
 
-REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_MatMul, CpuMatmulFp16KernelCreator)
+REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_MatMul, LiteKernelCreator<MatmulFP16CPUKernel>)
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.h
index ff1ace9398..7a0f8980de 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.h
@@ -17,50 +17,24 @@
 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATMUL_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATMUL_H_
 
-#ifdef ENABLE_NEON
-#include <arm_neon.h>
-#endif
 #include <vector>
-#include "src/lite_kernel.h"
-#include "nnacl/matmul_parameter.h"
+#include "src/runtime/kernel/arm/fp16/matmul_base_fp16.h"
 
 namespace mindspore::kernel {
-class MatmulFP16CPUKernel : public LiteKernel {
+class MatmulFP16CPUKernel : public MatmulBaseFP16CPUKernel {
  public:
   explicit MatmulFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                                const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
                                const mindspore::lite::PrimitiveC *primitive)
-      : LiteKernel(parameter, inputs, outputs, ctx, primitive) {
-    params_ = reinterpret_cast<MatMulParameter *>(op_parameter_);
-  }
-  ~MatmulFP16CPUKernel() override;
+      : MatmulBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
+  ~MatmulFP16CPUKernel() override = default;
 
   int Init() override;
   int ReSize() override;
   int Run() override;
-  int RunImpl(int task_id);
 
  private:
-  int MallocMatrixABuffer();
-  int MallocMatrixBBuffer();
-  int InitBias();
-  int MallocFp16Output();
-  void InitMatrixA(float *a_ptr, float16_t *a_pack_ptr);
-  void InitMatrixA(float16_t *a_ptr, float16_t *a_pack_ptr);
-  void InitMatrixB(float *b_ptr, float16_t *b_pack_ptr);
-  void InitMatrixB(float16_t *b_ptr, float16_t *b_pack_ptr);
-  void FreeTmpBuffer();
-
- private:
-  MatMulParameter *params_ = nullptr;
-  float16_t *a_pack_ptr_ = nullptr;
-  float16_t *b_pack_ptr_ = nullptr;
-  float16_t *bias_ptr_ = nullptr;
-  float16_t *output_ptr_ = nullptr;
-  float16_t *current_a_ = nullptr;
-  float16_t *current_b_ = nullptr;
-  float16_t *current_c_ = nullptr;
-  int thread_stride_ = 0;
-  int thread_count_ = 0;
+  void InitAShape();
+  void InitBShape();
 };
 }  // namespace mindspore::kernel
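Note: InitAShape/InitBShape in both the fp16 and fp32 kernels fold every dimension before the last two into batch, then pick row/deep (or col/deep) according to the transpose flag. A sketch of the A-side derivation, mirroring the kernels above (helper name hypothetical; the loop condition is written as i + 2 < size() to avoid the unsigned wrap the original form would hit on rank < 2 inputs):

#include <cstdio>
#include <vector>

// Fold leading dims into batch and pick row/deep according to a_transpose.
static void DeriveAShape(const std::vector<int> &a_shape, bool a_transpose,
                         int *batch, int *row, int *deep) {
  *batch = 1;
  for (size_t i = 0; i + 2 < a_shape.size(); ++i) {
    *batch *= a_shape[i];
  }
  *row = a_transpose ? a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2];
  *deep = a_transpose ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1];
}

int main() {
  int batch, row, deep;
  DeriveAShape({2, 4, 8, 16}, false, &batch, &row, &deep);
  printf("batch=%d row=%d deep=%d\n", batch, row, deep);  // prints: batch=8 row=8 deep=16
  return 0;
}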
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.cc
index 0239e53fc9..319948dc34 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.cc
@@ -25,29 +25,37 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_MatMul;
 
 namespace mindspore::kernel {
+void MatmulCPUKernel::InitShapeA() {
+  auto a_shape = in_tensors_.at(0)->shape();
+  int batch = 1;
+  for (size_t i = 0; i < a_shape.size() - 2; ++i) {
+    batch *= a_shape[i];
+  }
+  params_->batch = batch;
+  params_->row_ = params_->a_transpose_ ? a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2];
+  params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1];
+}
+
+void MatmulCPUKernel::InitShapeB() {
+  auto b_shape = in_tensors_.at(1)->shape();
+  int batch = 1;
+  for (size_t i = 0; i < b_shape.size() - 2; ++i) {
+    batch *= b_shape[i];
+  }
+  params_->batch = batch;
+  params_->col_ = params_->b_transpose_ ? b_shape[b_shape.size() - 2] : b_shape[b_shape.size() - 1];
+  params_->deep_ = params_->b_transpose_ ? b_shape[b_shape.size() - 1] : b_shape[b_shape.size() - 2];
+}
+
 int MatmulCPUKernel::Init() {
   MatmulFp32BaseCPUKernel::InitParameter();
 
   if (params_->a_const_ == true) {
-    auto a_shape = in_tensors_.at(0)->shape();
-    int batch = 1;
-    for (size_t i = 0; i < a_shape.size() - 2; ++i) {
-      batch *= a_shape[i];
-    }
-    params_->batch = batch;
-    params_->row_ = params_->a_transpose_ ? a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2];
-    params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1];
+    InitShapeA();
   }
   if (params_->b_const_ == true) {
-    auto b_shape = in_tensors_.at(1)->shape();
-    int batch = 1;
-    for (size_t i = 0; i < b_shape.size() - 2; ++i) {
-      batch *= b_shape[i];
-    }
-    params_->batch = batch;
-    params_->col_ = params_->b_transpose_ ? b_shape[b_shape.size() - 2] : b_shape[b_shape.size() - 1];
-    params_->deep_ = params_->b_transpose_ ? b_shape[b_shape.size() - 1] : b_shape[b_shape.size() - 2];
+    InitShapeB();
   }
 
   auto ret = MatmulFp32BaseCPUKernel::Init();
@@ -62,17 +70,8 @@ int MatmulCPUKernel::Init() {
 }
 
 int MatmulCPUKernel::ReSize() {
-  auto a_shape = in_tensors_.at(0)->shape();
-  auto b_shape = in_tensors_.at(1)->shape();
-  int batch = 1;
-  MS_ASSERT(a_shape.size() >= 2);
-  for (size_t i = 0; i < a_shape.size() - 2; ++i) {
-    batch *= a_shape[i];
-  }
-  params_->batch = batch;
-  params_->row_ = params_->a_transpose_ ? a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2];
-  params_->col_ = params_->b_transpose_ ? b_shape[b_shape.size() - 2] : b_shape[b_shape.size() - 1];
-  params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1];
+  InitShapeA();
+  InitShapeB();
 
   return MatmulFp32BaseCPUKernel::ReSize();
 }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.h
index 6a9bb0305a..716bef906a 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.h
@@ -33,6 +33,10 @@ class MatmulCPUKernel : public MatmulFp32BaseCPUKernel {
   int ReSize() override;
   int Run() override;
   int Eval() override;
+
+ private:
+  void InitShapeA();
+  void InitShapeB();
 };
 }  // namespace mindspore::kernel
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_MATMUL_H_
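Note: the rewritten FloatRun below names the two quantities being compared: each task owns a contiguous run of thread_stride_ * col_tile_ output columns, and the final task gets whatever remains, clamped at zero. A sketch of the split (hypothetical helper):

#include <algorithm>
#include <cstdio>

// Columns handled by one task, mirroring the cur_oc computation in FloatRun.
static int ColumnsForTask(int task_id, int col, int thread_stride, int col_tile) {
  int current_stride_oc = thread_stride * col_tile;
  int current_rest_oc = col - task_id * thread_stride * col_tile;
  return std::max(0, std::min(current_stride_oc, current_rest_oc));
}

int main() {
  // col = 20, 8-wide tiles, stride of one tile per task -> 8, 8, 4, 0 columns
  for (int task = 0; task < 4; ++task) {
    printf("task %d: %d cols\n", task, ColumnsForTask(task, 20, 1, 8));
  }
  return 0;
}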
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.cc
index 77bf5b28f1..93869e6cb2 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.cc
@@ -68,8 +68,8 @@ int MatmulFp32BaseCPUKernel::InitBufferA() {
   if (a_pack_ptr_ != nullptr) {
     return RET_OK;
   }
-  a_pack_ptr_ =
-    reinterpret_cast<float *>(malloc(params_->batch * params_->row_align_ * params_->deep_ * sizeof(float)));
+  a_pack_ptr_ = reinterpret_cast<float *>(
+    context_->allocator->Malloc(params_->batch * params_->row_align_ * params_->deep_ * sizeof(float)));
   if (a_pack_ptr_ == nullptr) {
     MS_LOG(ERROR) << "malloc a_pack_ptr_ failed";
     return RET_ERROR;
@@ -81,8 +81,8 @@ int MatmulFp32BaseCPUKernel::InitBufferB() {
   if (b_pack_ptr_ != nullptr) {
     return RET_OK;
   }
-  b_pack_ptr_ =
-    reinterpret_cast<float *>(malloc(params_->batch * params_->col_align_ * params_->deep_ * sizeof(float)));
+  b_pack_ptr_ = reinterpret_cast<float *>(
    context_->allocator->Malloc(params_->batch * params_->col_align_ * params_->deep_ * sizeof(float)));
   if (b_pack_ptr_ == nullptr) {
     MS_LOG(ERROR) << "malloc b_pack_ptr_ failed";
     return RET_ERROR;
@@ -99,6 +99,7 @@ int MatmulFp32BaseCPUKernel::InitBiasData() {
       MS_LOG(ERROR) << "malloc bias_ptr_ failed";
       return RET_ERROR;
     }
+    memset(bias_ptr_, 0, max_bias_data * sizeof(float));
     memcpy(bias_ptr_, bias_tensor->data_c(), bias_tensor->ElementsNum() * sizeof(float));
   }
   return RET_OK;
@@ -201,7 +202,9 @@ void MatmulFp32BaseCPUKernel::FreeResizeBufB() {
 }
 
 int MatmulFp32BaseCPUKernel::FloatRun(int task_id) {
-  int cur_oc = MSMIN(thread_stride_ * col_tile_, params_->col_ - task_id * thread_stride_ * col_tile_);
+  int current_stride_oc = thread_stride_ * col_tile_;
+  int current_rest_oc = params_->col_ - task_id * thread_stride_ * col_tile_;
+  int cur_oc = MSMIN(current_stride_oc, current_rest_oc);
   if (cur_oc <= 0) {
     return RET_OK;
   }
@@ -254,7 +257,7 @@ int MatmulFp32BaseCPUKernel::ReSize() {
 int MatmulFp32BaseCPUKernel::Run() {
   auto a_ptr = reinterpret_cast<float *>(in_tensors_.at(0)->data_c());
   auto b_ptr = reinterpret_cast<float *>(in_tensors_.at(1)->data_c());
-  c_ptr_ = reinterpret_cast<float *>(out_tensors_.at(0)->data_c());
+  auto c_ptr = reinterpret_cast<float *>(out_tensors_.at(0)->data_c());
 
   if (params_->a_const_ == false) {
     if (RET_OK != InitBufferA()) {
@@ -274,11 +277,11 @@ int MatmulFp32BaseCPUKernel::Run() {
     if (vec_matmul_) {
       batch_a_ptr_ = a_pack_ptr_ + i * params_->deep_;
       batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_;
-      batch_c_ptr_ = c_ptr_ + i * params_->row_ * params_->col_;
+      batch_c_ptr_ = c_ptr + i * params_->row_ * params_->col_;
     } else {
       batch_a_ptr_ = a_pack_ptr_ + i * params_->row_align_ * params_->deep_;
       batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_align_;
-      batch_c_ptr_ = c_ptr_ + i * params_->row_ * params_->col_;
+      batch_c_ptr_ = c_ptr + i * params_->row_ * params_->col_;
     }
     auto ret = ParallelLaunch(this->context_->thread_pool_, MatmulBaseFloatRun, this, thread_count_);
     if (ret != RET_OK) {
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.h b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.h
index 188863a5e0..71b44d28cf 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.h
@@ -62,16 +62,17 @@ class MatmulFp32BaseCPUKernel : public LiteKernel {
   MatMulParameter *params_ = nullptr;
   float *a_pack_ptr_ = nullptr;
   float *b_pack_ptr_ = nullptr;
-  float *c_ptr_ = nullptr;
-  float *bias_ptr_ = nullptr;
-  float *batch_a_ptr_ = nullptr;
-  float *batch_b_ptr_ = nullptr;
-  float *batch_c_ptr_ = nullptr;
+
+ private:
   int col_tile_ = 0;
   int row_tile_ = 0;
   int thread_stride_ = 0;
   int thread_count_ = 0;
   bool vec_matmul_ = false;
+  float *bias_ptr_ = nullptr;
+  float *batch_a_ptr_ = nullptr;
+  float *batch_b_ptr_ = nullptr;
+  float *batch_c_ptr_ = nullptr;
 };
 }  // namespace mindspore::kernel
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_MATMUL_FP32_BASE_H_
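Note: the memset added to InitBiasData is not cosmetic. bias_ptr_ holds UP_ROUND(col, C8NUM) floats but the memcpy fills only ElementsNum of them, so without the memset the padded tail lanes would be uninitialized and the tiled kernels could fold garbage into the padded output columns. A sketch of the pattern, assuming nnacl's usual UP_ROUND definition:

#include <cstdio>
#include <cstdlib>
#include <cstring>

#define UP_ROUND(x, y) (((x) + (y) - (1)) / (y) * (y))
#define C8NUM 8

int main() {
  const float bias[3] = {0.5f, 1.5f, 2.5f};            // real bias, col = 3
  int max_bias_data = UP_ROUND(3, C8NUM);              // padded to 8
  float *bias_ptr = reinterpret_cast<float *>(malloc(max_bias_data * sizeof(float)));
  if (bias_ptr == nullptr) return 1;
  memset(bias_ptr, 0, max_bias_data * sizeof(float));  // zero the padded lanes first
  memcpy(bias_ptr, bias, 3 * sizeof(float));           // then copy the real values
  for (int i = 0; i < max_bias_data; ++i) printf("%g ", bias_ptr[i]);  // 0.5 1.5 2.5 0 0 0 0 0
  printf("\n");
  free(bias_ptr);
  return 0;
}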