| @@ -443,17 +443,20 @@ void RowMajor2Col16MajorFp16Opt(const float16_t *src_ptr, float16_t *dst_ptr, si | |||
| } | |||
| void RowMajor2Col16MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) { | |||
| for (int r = 0; r < row; r++) { | |||
| for (int c = 0; c < col; c++) { | |||
| int r_div16 = r / 16; | |||
| int r_mod16 = r % 16; | |||
| if (is_fp32_src) { | |||
| dst[r_div16 * 16 * col + c * 16 + r_mod16] = (float16_t)(((const float *)src)[r * col + c]); | |||
| } else { | |||
| dst[r_div16 * 16 * col + c * 16 + r_mod16] = ((const float16_t *)src)[r * col + c]; | |||
| if (is_fp32_src) { | |||
| const float *fp32_src = (const float *)src; | |||
| for (int r = 0; r < row; r++) { | |||
| for (int c = 0; c < col; c++) { | |||
| int r_div16 = r / 16; | |||
| int r_mod16 = r % 16; | |||
| dst[r_div16 * 16 * col + c * 16 + r_mod16] = (float16_t)(fp32_src[r * col + c]); | |||
| } | |||
| } | |||
| } else { | |||
| const float16_t *fp16_src = (const float16_t *)src; | |||
| RowMajor2Col16MajorFp16Opt(fp16_src, dst, row, col); | |||
| } | |||
| return; | |||
| } | |||
| void RowMajor2Row16MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) { | |||
| @@ -484,6 +487,18 @@ void RowMajor2Row8MajorFp16(const void *src, float16_t *dst, int row, int col, b | |||
| } | |||
| } | |||
| void RowMajor2ColMajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) { | |||
| for (int r = 0; r < row; ++r) { | |||
| for (int c = 0; c < col; ++c) { | |||
| if (is_fp32_src) { | |||
| dst[c * row + r] = (float16_t)(((const float *)src)[r * col + c]); | |||
| } else { | |||
| dst[c * row + r] = ((const float16_t *)src)[r * col + c]; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| void RowMajor2Col8MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) { | |||
| for (int r = 0; r < row; r++) { | |||
| for (int c = 0; c < col; c++) { | |||
| @@ -59,6 +59,8 @@ void RowMajor2Row8MajorFp16(const void *src, float16_t *dst, int row, int col, b | |||
| void RowMajor2Col8MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src); | |||
| void RowMajor2ColMajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src); | |||
| #ifdef __cplusplus | |||
| } | |||
| #endif | |||
| @@ -27,11 +27,17 @@ void RowMajor2ColMajor(const float *src_ptr, float *dst_ptr, int row, int col) { | |||
| void RowMajor2Row4Major(const float *src_ptr, float *dst_ptr, int row, int col) { | |||
| for (int r = 0; r < row; r++) { | |||
| const float *src = src_ptr + r * col; | |||
| for (int c = 0; c < col; c++) { | |||
| int c = 0; | |||
| for (; c < col; c++) { | |||
| int cd4 = c / C4NUM; | |||
| int cm4 = c % C4NUM; | |||
| dst_ptr[cd4 * C4NUM * row + r * C4NUM + cm4] = src[c]; | |||
| } | |||
| for (; c < UP_ROUND(col, C4NUM); c++) { | |||
| int cd4 = c / C4NUM; | |||
| int cm4 = c % C4NUM; | |||
| dst_ptr[cd4 * C4NUM * row + r * C4NUM + cm4] = 0; | |||
| } | |||
| } | |||
| return; | |||
| } | |||
| @@ -39,11 +45,17 @@ void RowMajor2Row4Major(const float *src_ptr, float *dst_ptr, int row, int col) | |||
| void RowMajor2Row6Major(const float *src_ptr, float *dst_ptr, int row, int col) { | |||
| for (int r = 0; r < row; r++) { | |||
| const float *src = src_ptr + r * col; | |||
| for (int c = 0; c < col; c++) { | |||
| int c = 0; | |||
| for (; c < col; c++) { | |||
| int cd6 = c / C6NUM; | |||
| int cm6 = c % C6NUM; | |||
| dst_ptr[cd6 * C6NUM * row + r * C6NUM + cm6] = src[c]; | |||
| } | |||
| for (; c < UP_ROUND(col, C6NUM); c++) { | |||
| int cd6 = c / C6NUM; | |||
| int cm6 = c % C6NUM; | |||
| dst_ptr[cd6 * C6NUM * row + r * C6NUM + cm6] = 0; | |||
| } | |||
| } | |||
| return; | |||
| } | |||
| @@ -51,11 +63,17 @@ void RowMajor2Row6Major(const float *src_ptr, float *dst_ptr, int row, int col) | |||
| void RowMajor2Row8Major(const float *src_ptr, float *dst_ptr, int row, int col) { | |||
| for (int r = 0; r < row; r++) { | |||
| const float *src = src_ptr + r * col; | |||
| for (int c = 0; c < col; c++) { | |||
| int c = 0; | |||
| for (; c < col; c++) { | |||
| int cd8 = c / C8NUM; | |||
| int cm8 = c % C8NUM; | |||
| dst_ptr[cd8 * C8NUM * row + r * C8NUM + cm8] = src[c]; | |||
| } | |||
| for (; c < UP_ROUND(col, C8NUM); c++) { | |||
| int cd8 = c / C8NUM; | |||
| int cm8 = c % C8NUM; | |||
| dst_ptr[cd8 * C8NUM * row + r * C8NUM + cm8] = 0; | |||
| } | |||
| } | |||
| return; | |||
| } | |||
| @@ -63,11 +81,17 @@ void RowMajor2Row8Major(const float *src_ptr, float *dst_ptr, int row, int col) | |||
| void RowMajor2Row12Major(const float *src_ptr, float *dst_ptr, int row, int col) { | |||
| for (int r = 0; r < row; r++) { | |||
| const float *src = src_ptr + r * col; | |||
| for (int c = 0; c < col; c++) { | |||
| int c = 0; | |||
| for (; c < col; c++) { | |||
| int cd12 = c / C12NUM; | |||
| int cm12 = c % C12NUM; | |||
| dst_ptr[cd12 * C12NUM * row + r * C12NUM + cm12] = src[c]; | |||
| } | |||
| for (; c < UP_ROUND(col, C12NUM); c++) { | |||
| int cd12 = c / C12NUM; | |||
| int cm12 = c % C12NUM; | |||
| dst_ptr[cd12 * C12NUM * row + r * C12NUM + cm12] = 0; | |||
| } | |||
| } | |||
| return; | |||
| } | |||
| @@ -75,11 +99,17 @@ void RowMajor2Row12Major(const float *src_ptr, float *dst_ptr, int row, int col) | |||
| void RowMajor2Row16Major(const float *src_ptr, float *dst_ptr, int row, int col) { | |||
| for (int r = 0; r < row; r++) { | |||
| const float *src = src_ptr + r * col; | |||
| for (int c = 0; c < col; c++) { | |||
| int c = 0; | |||
| for (; c < col; c++) { | |||
| int cd16 = c / C16NUM; | |||
| int cm16 = c % C16NUM; | |||
| dst_ptr[cd16 * C16NUM * row + r * C16NUM + cm16] = src[c]; | |||
| } | |||
| for (; c < UP_ROUND(col, C16NUM); c++) { | |||
| int cd16 = c / C16NUM; | |||
| int cm16 = c % C16NUM; | |||
| dst_ptr[cd16 * C16NUM * row + r * C16NUM + cm16] = 0; | |||
| } | |||
| } | |||
| return; | |||
| } | |||
| @@ -19,236 +19,61 @@ | |||
| #include "src/kernel_registry.h" | |||
| using mindspore::lite::KernelRegistrar; | |||
| using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_INPUT_TENSOR_ERROR; | |||
| using mindspore::lite::RET_MEMORY_FAILED; | |||
| using mindspore::lite::RET_OK; | |||
| using mindspore::schema::PrimitiveType_FullConnection; | |||
| namespace mindspore::kernel { | |||
| FullconnectionFP16CPUKernel::~FullconnectionFP16CPUKernel() { FreeTmpBuffer(); } | |||
| void FullconnectionFP16CPUKernel::InitAShape() { | |||
| auto a_shape = in_tensors_.at(0)->shape(); | |||
| params_->row_ = a_shape[0]; | |||
| params_->deep_ = a_shape[1]; | |||
| } | |||
| void FullconnectionFP16CPUKernel::FreeTmpBuffer() { | |||
| if (a_pack_ptr_ != nullptr) { | |||
| context_->allocator->Free(a_pack_ptr_); | |||
| a_pack_ptr_ = nullptr; | |||
| } | |||
| if (b_pack_ptr_ != nullptr) { | |||
| context_->allocator->Free(b_pack_ptr_); | |||
| b_pack_ptr_ = nullptr; | |||
| } | |||
| if (bias_ptr_ != nullptr) { | |||
| context_->allocator->Free(bias_ptr_); | |||
| bias_ptr_ = nullptr; | |||
| } | |||
| if (output_fp16_ != nullptr) { | |||
| context_->allocator->Free(output_fp16_); | |||
| output_fp16_ = nullptr; | |||
| } | |||
| void FullconnectionFP16CPUKernel::InitBShape() { | |||
| auto b_shape = in_tensors_.at(1)->shape(); | |||
| params_->col_ = b_shape[0]; | |||
| params_->deep_ = b_shape[1]; | |||
| } | |||
| int FullconnectionFP16CPUKernel::ReSize() { | |||
| FreeTmpBuffer(); | |||
| int row = 1; | |||
| for (size_t i = 0; i < out_tensors_.at(0)->shape().size() - 1; ++i) row *= (out_tensors_.at(0)->shape())[i]; | |||
| fc_param_->row_ = row; | |||
| fc_param_->col_ = out_tensors_.at(0)->shape().back(); | |||
| fc_param_->deep_ = (in_tensors_.at(1)->shape()).at(1); | |||
| fc_param_->row_16_ = UP_ROUND(fc_param_->row_, C16NUM); | |||
| fc_param_->col_8_ = UP_ROUND(fc_param_->col_, C8NUM); | |||
| thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(fc_param_->col_, C8NUM)); | |||
| thread_stride_ = UP_DIV(UP_DIV(fc_param_->col_, C8NUM), thread_count_) * C8NUM; | |||
| InitAShape(); | |||
| InitBShape(); | |||
| return MatmulBaseFP16CPUKernel::ReSize(); | |||
| } | |||
| if (row == 1) is_vector_input_ = true; | |||
| int a_pack_row = 0; | |||
| int b_pack_col = 0; | |||
| if (is_vector_input_) { | |||
| a_pack_row = 1; | |||
| b_pack_col = fc_param_->col_; | |||
| } else { | |||
| a_pack_row = fc_param_->row_16_; | |||
| b_pack_col = fc_param_->col_8_; | |||
| } | |||
| a_pack_ptr_ = | |||
| reinterpret_cast<float16_t *>(context_->allocator->Malloc(a_pack_row * fc_param_->deep_ * sizeof(float16_t))); | |||
| if (a_pack_ptr_ == nullptr) { | |||
| FreeTmpBuffer(); | |||
| return RET_MEMORY_FAILED; | |||
| } | |||
| memset(a_pack_ptr_, 0, a_pack_row * fc_param_->deep_ * sizeof(float16_t)); | |||
| int FullconnectionFP16CPUKernel::Init() { | |||
| params_->batch = 1; | |||
| params_->a_transpose_ = false; | |||
| params_->b_transpose_ = true; | |||
| b_pack_ptr_ = | |||
| reinterpret_cast<float16_t *>(context_->allocator->Malloc(b_pack_col * fc_param_->deep_ * sizeof(float16_t))); | |||
| if (b_pack_ptr_ == nullptr) { | |||
| FreeTmpBuffer(); | |||
| return RET_MEMORY_FAILED; | |||
| } | |||
| memset(b_pack_ptr_, 0, b_pack_col * fc_param_->deep_ * sizeof(float16_t)); | |||
| MatmulBaseFP16CPUKernel::InitParameter(); | |||
| fc_param_->b_const_ = (in_tensors_.at(1)->data_c() != nullptr); | |||
| if (fc_param_->b_const_) { | |||
| if (in_tensors_.at(1)->data_type() == kNumberTypeFloat32) { | |||
| if (is_vector_input_) { | |||
| Float32ToFloat16(reinterpret_cast<float *>(in_tensors_.at(1)->data_c()), b_pack_ptr_, | |||
| fc_param_->col_ * fc_param_->deep_); | |||
| } else { | |||
| InitMatrixB(reinterpret_cast<float *>(in_tensors_.at(1)->data_c()), b_pack_ptr_); | |||
| } | |||
| } else { | |||
| if (is_vector_input_) { | |||
| memcpy(b_pack_ptr_, reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c()), | |||
| fc_param_->col_ * fc_param_->deep_ * sizeof(float16_t)); | |||
| } else { | |||
| InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c()), b_pack_ptr_); | |||
| } | |||
| } | |||
| b_ptr_ = b_pack_ptr_; | |||
| if (params_->a_const_ == true) { | |||
| InitAShape(); | |||
| } | |||
| if (in_tensors_.size() == 3) { | |||
| bias_ptr_ = reinterpret_cast<float16_t *>(context_->allocator->Malloc(b_pack_col * sizeof(float16_t))); | |||
| if (bias_ptr_ == nullptr) { | |||
| FreeTmpBuffer(); | |||
| return RET_MEMORY_FAILED; | |||
| } | |||
| memset(bias_ptr_, 0, b_pack_col * sizeof(float16_t)); | |||
| Float32ToFloat16(reinterpret_cast<float *>(in_tensors_.at(2)->data_c()), bias_ptr_, fc_param_->col_); | |||
| if (params_->b_const_ == true) { | |||
| InitBShape(); | |||
| } | |||
| if (out_tensors_.at(0)->data_type() == kNumberTypeFloat32) { | |||
| output_fp16_ = | |||
| reinterpret_cast<float16_t *>(context_->allocator->Malloc(fc_param_->row_ * fc_param_->col_ * sizeof(float16_t))); | |||
| if (output_fp16_ == nullptr) { | |||
| FreeTmpBuffer(); | |||
| return RET_MEMORY_FAILED; | |||
| } | |||
| auto ret = MatmulBaseFP16CPUKernel::Init(); | |||
| if (ret != RET_OK) { | |||
| return ret; | |||
| } | |||
| return RET_OK; | |||
| } // namespace mindspore::kernel | |||
| void FullconnectionFP16CPUKernel::InitMatrixA(float *a_ptr, float16_t *a_pack_ptr) { | |||
| RowMajor2Col16MajorFp16(reinterpret_cast<void *>(a_ptr), a_pack_ptr, fc_param_->row_, fc_param_->deep_, true); | |||
| } | |||
| void FullconnectionFP16CPUKernel::InitMatrixA(float16_t *a_ptr, float16_t *a_pack_ptr) { | |||
| RowMajor2Col16MajorFp16(reinterpret_cast<void *>(a_ptr), a_pack_ptr, fc_param_->row_, fc_param_->deep_, false); | |||
| } | |||
| void FullconnectionFP16CPUKernel::InitMatrixB(float *b_ptr, float16_t *b_pack_ptr) { | |||
| RowMajor2Col8MajorFp16(reinterpret_cast<void *>(b_ptr), b_pack_ptr, fc_param_->col_, fc_param_->deep_, true); | |||
| } | |||
| void FullconnectionFP16CPUKernel::InitMatrixB(float16_t *b_ptr, float16_t *b_pack_ptr) { | |||
| RowMajor2Col8MajorFp16(reinterpret_cast<void *>(b_ptr), b_pack_ptr, fc_param_->col_, fc_param_->deep_, false); | |||
| } | |||
| int FullconnectionFP16CPUKernel::Init() { | |||
| if (!InferShapeDone()) { | |||
| return RET_OK; | |||
| } | |||
| return ReSize(); | |||
| } | |||
| int FullconnectionFP16CPUKernel::RunImpl(int task_id) { | |||
| int cur_stride = fc_param_->col_ - task_id * thread_stride_; | |||
| int cur_oc = MSMIN(thread_stride_, cur_stride); | |||
| if (cur_oc <= 0) { | |||
| return RET_OK; | |||
| } | |||
| auto b = b_ptr_ + task_id * thread_stride_ * fc_param_->deep_; | |||
| auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id; | |||
| auto c = output_ptr_ + task_id * thread_stride_; | |||
| if (is_vector_input_) { | |||
| MatVecMulFp16(a_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, cur_oc); | |||
| } else { | |||
| MatMulFp16(a_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, fc_param_->row_, cur_oc, fc_param_->col_, | |||
| OutType_Nhwc); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int FcFP16Run(void *cdata, int task_id) { | |||
| auto op = reinterpret_cast<FullconnectionFP16CPUKernel *>(cdata); | |||
| auto error_code = op->RunImpl(task_id); | |||
| if (error_code != RET_OK) { | |||
| MS_LOG(ERROR) << "MatmulFp32Run error task_id[" << task_id << "] error_code[" << error_code << "]"; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int FullconnectionFP16CPUKernel::Run() { | |||
| auto out_tensor = out_tensors_.at(0); | |||
| if (out_tensor->data_type() == kNumberTypeFloat32) { | |||
| output_ptr_ = output_fp16_; | |||
| } else { | |||
| output_ptr_ = reinterpret_cast<float16_t *>(out_tensor->data_c()); | |||
| } | |||
| if (in_tensors_.at(0)->data_type() == kNumberTypeFloat32) { | |||
| if (is_vector_input_) { | |||
| Float32ToFloat16(reinterpret_cast<float *>(in_tensors_.at(0)->data_c()), a_pack_ptr_, fc_param_->deep_); | |||
| } else { | |||
| InitMatrixA(reinterpret_cast<float *>(in_tensors_.at(0)->data_c()), a_pack_ptr_); | |||
| } | |||
| a_ptr_ = a_pack_ptr_; | |||
| } else { | |||
| if (is_vector_input_) { | |||
| a_ptr_ = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c()); | |||
| } else { | |||
| InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c()), a_pack_ptr_); | |||
| a_ptr_ = a_pack_ptr_; | |||
| } | |||
| } | |||
| if (!fc_param_->b_const_) { | |||
| if (in_tensors_.at(1)->data_type() == kNumberTypeFloat32) { | |||
| if (is_vector_input_) { | |||
| Float32ToFloat16(reinterpret_cast<float *>(in_tensors_.at(1)->data_c()), b_pack_ptr_, | |||
| fc_param_->col_ * fc_param_->deep_); | |||
| } else { | |||
| InitMatrixB(reinterpret_cast<float *>(in_tensors_.at(1)->data_c()), b_pack_ptr_); | |||
| } | |||
| b_ptr_ = b_pack_ptr_; | |||
| } else { | |||
| if (is_vector_input_) { | |||
| b_ptr_ = reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c()); | |||
| } else { | |||
| InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c()), b_pack_ptr_); | |||
| b_ptr_ = b_pack_ptr_; | |||
| } | |||
| } | |||
| } | |||
| ParallelLaunch(this->context_->thread_pool_, FcFP16Run, this, thread_count_); | |||
| if (out_tensor->data_type() == kNumberTypeFloat32) { | |||
| auto size = out_tensor->ElementsNum(); | |||
| auto out_tensor_data = reinterpret_cast<float *>(out_tensor->data_c()); | |||
| Float16ToFloat32(output_fp16_, out_tensor_data, size); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| kernel::LiteKernel *CpuFullConnectionFp16KernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, | |||
| OpParameter *opParameter, const lite::InnerContext *ctx, | |||
| const kernel::KernelKey &desc, | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| auto *kernel = new (std::nothrow) FullconnectionFP16CPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "kernel is nullptr."; | |||
| free(opParameter); | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| auto ret = MatmulBaseFP16CPUKernel::Run(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: " | |||
| << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_)); | |||
| delete kernel; | |||
| return nullptr; | |||
| MS_LOG(ERROR) << "FullconnectionFP16CPUKernel run failed"; | |||
| } | |||
| return kernel; | |||
| return ret; | |||
| } | |||
| REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_FullConnection, CpuFullConnectionFp16KernelCreator) | |||
| REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_FullConnection, LiteKernelCreator<FullconnectionFP16CPUKernel>) | |||
| } // namespace mindspore::kernel | |||
| @@ -19,46 +19,24 @@ | |||
| #include <arm_neon.h> | |||
| #include <vector> | |||
| #include "include/errorcode.h" | |||
| #include "nnacl/matmul_parameter.h" | |||
| #include "nnacl/fp16/matmul_fp16.h" | |||
| #include "nnacl/fp16/cast_fp16.h" | |||
| #include "src/lite_kernel.h" | |||
| #include "src/runtime/kernel/arm/fp16/matmul_base_fp16.h" | |||
| namespace mindspore::kernel { | |||
| class FullconnectionFP16CPUKernel : public LiteKernel { | |||
| class FullconnectionFP16CPUKernel : public MatmulBaseFP16CPUKernel { | |||
| public: | |||
| explicit FullconnectionFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx, | |||
| const mindspore::lite::PrimitiveC *primitive) | |||
| : LiteKernel(parameter, inputs, outputs, ctx, primitive) { | |||
| fc_param_ = reinterpret_cast<MatMulParameter *>(op_parameter_); | |||
| } | |||
| ~FullconnectionFP16CPUKernel() override; | |||
| : MatmulBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {} | |||
| ~FullconnectionFP16CPUKernel() override = default; | |||
| int Init() override; | |||
| int ReSize() override; | |||
| int Run() override; | |||
| int RunImpl(int task_id); | |||
| private: | |||
| void InitMatrixA(float *a_ptr, float16_t *a_pack_ptr); | |||
| void InitMatrixA(float16_t *a_ptr, float16_t *a_pack_ptr); | |||
| void InitMatrixB(float *b_ptr, float16_t *b_pack_ptr); | |||
| void InitMatrixB(float16_t *b_ptr, float16_t *b_pack_ptr); | |||
| void FreeTmpBuffer(); | |||
| private: | |||
| MatMulParameter *fc_param_ = nullptr; | |||
| float16_t *a_pack_ptr_ = nullptr; | |||
| float16_t *b_pack_ptr_ = nullptr; | |||
| float16_t *bias_ptr_ = nullptr; | |||
| float16_t *output_fp16_ = nullptr; | |||
| float16_t *output_ptr_ = nullptr; | |||
| float16_t *a_ptr_ = nullptr; | |||
| float16_t *b_ptr_ = nullptr; | |||
| bool is_vector_input_ = false; | |||
| int thread_count_ = 1; | |||
| int thread_stride_ = 0; | |||
| void InitAShape(); | |||
| void InitBShape(); | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| @@ -0,0 +1,298 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "src/runtime/kernel/arm/fp16/matmul_base_fp16.h" | |||
| #include "nnacl/fp16/matmul_fp16.h" | |||
| #include "nnacl/fp16/cast_fp16.h" | |||
| #include "src/runtime/runtime_api.h" | |||
| #include "include/errorcode.h" | |||
| using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_INPUT_TENSOR_ERROR; | |||
| using mindspore::lite::RET_MEMORY_FAILED; | |||
| using mindspore::lite::RET_OK; | |||
| namespace mindspore::kernel { | |||
| int MatmulBaseFP16Run(void *cdata, int task_id) { | |||
| auto op = reinterpret_cast<MatmulBaseFP16CPUKernel *>(cdata); | |||
| auto error_code = op->RunImpl(task_id); | |||
| if (error_code != RET_OK) { | |||
| MS_LOG(ERROR) << "MatmulFp16Run error task_id[" << task_id << "] error_code[" << error_code << "]"; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| MatmulBaseFP16CPUKernel::~MatmulBaseFP16CPUKernel() { | |||
| if (bias_ptr_ != nullptr) { | |||
| free(bias_ptr_); | |||
| bias_ptr_ = nullptr; | |||
| } | |||
| FreeResizeBufA(); | |||
| FreeResizeBufB(); | |||
| } | |||
| void MatmulBaseFP16CPUKernel::FreeResizeBufA() { | |||
| if (a_pack_ptr_ != nullptr) { | |||
| context_->allocator->Free(a_pack_ptr_); | |||
| a_pack_ptr_ = nullptr; | |||
| } | |||
| return; | |||
| } | |||
| void MatmulBaseFP16CPUKernel::FreeResizeBufB() { | |||
| if (b_pack_ptr_ != nullptr) { | |||
| context_->allocator->Free(b_pack_ptr_); | |||
| b_pack_ptr_ = nullptr; | |||
| } | |||
| return; | |||
| } | |||
| void MatmulBaseFP16CPUKernel::InitParameter() { | |||
| params_->a_const_ = (in_tensors_[0]->data_c() != nullptr); | |||
| params_->b_const_ = (in_tensors_[1]->data_c() != nullptr); | |||
| return; | |||
| } | |||
| int MatmulBaseFP16CPUKernel::InitBias() { | |||
| if (in_tensors_.size() == 3) { | |||
| auto bias_tensor = in_tensors_[2]; | |||
| int max_bias_data = UP_ROUND(bias_tensor->ElementsNum(), C8NUM); | |||
| bias_ptr_ = reinterpret_cast<float16_t *>(malloc(max_bias_data * sizeof(float))); | |||
| if (bias_ptr_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc bias_ptr_ failed"; | |||
| return RET_ERROR; | |||
| } | |||
| memset(bias_ptr_, 0, max_bias_data * sizeof(float16_t)); | |||
| Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->data_c()), bias_ptr_, bias_tensor->ElementsNum()); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int MatmulBaseFP16CPUKernel::ReSize() { | |||
| ResizeParameter(); | |||
| if (params_->b_const_ == true && src_b_ != nullptr) { | |||
| InitBufferB(); | |||
| InitMatrixB(src_b_, kNumberTypeFloat16); | |||
| free(src_b_); | |||
| src_b_ = nullptr; | |||
| } | |||
| thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(params_->col_, C8NUM)); | |||
| thread_stride_ = UP_DIV(UP_DIV(params_->col_, C8NUM), thread_count_) * C8NUM; | |||
| return RET_OK; | |||
| } | |||
| void MatmulBaseFP16CPUKernel::ResizeParameter() { | |||
| if (params_->row_ == 1) { | |||
| vec_matmul_ = true; | |||
| } | |||
| if (vec_matmul_) { | |||
| params_->row_align_ = 1; | |||
| params_->col_align_ = params_->col_; | |||
| } else { | |||
| params_->row_align_ = UP_ROUND(params_->row_, C16NUM); | |||
| params_->col_align_ = UP_ROUND(params_->col_, C8NUM); | |||
| } | |||
| return; | |||
| } | |||
| int MatmulBaseFP16CPUKernel::InitBufferA() { | |||
| a_pack_ptr_ = reinterpret_cast<float16_t *>( | |||
| context_->allocator->Malloc(params_->batch * params_->row_align_ * params_->deep_ * sizeof(float16_t))); | |||
| if (a_pack_ptr_ == nullptr) { | |||
| return RET_MEMORY_FAILED; | |||
| } | |||
| memset(a_pack_ptr_, 0, params_->batch * params_->row_align_ * params_->deep_ * sizeof(float16_t)); | |||
| return RET_OK; | |||
| } | |||
| int MatmulBaseFP16CPUKernel::InitBufferB() { | |||
| if (b_pack_ptr_ != nullptr) { | |||
| return RET_OK; | |||
| } | |||
| b_pack_ptr_ = reinterpret_cast<float16_t *>( | |||
| context_->allocator->Malloc(params_->batch * params_->col_align_ * params_->deep_ * sizeof(float16_t))); | |||
| if (b_pack_ptr_ == nullptr) { | |||
| return RET_MEMORY_FAILED; | |||
| } | |||
| memset(b_pack_ptr_, 0, params_->batch * params_->col_align_ * params_->deep_ * sizeof(float16_t)); | |||
| return RET_OK; | |||
| } | |||
| void MatmulBaseFP16CPUKernel::InitMatrixA(void *src_ptr) { | |||
| auto src_data_type = in_tensors_[0]->data_type(); | |||
| if (vec_matmul_) { | |||
| if (src_data_type == kNumberTypeFloat32) { | |||
| Float32ToFloat16(reinterpret_cast<float *>(src_ptr), a_pack_ptr_, params_->batch * params_->deep_); | |||
| } else { | |||
| memcpy(a_pack_ptr_, src_ptr, params_->batch * params_->deep_ * sizeof(float16_t)); | |||
| } | |||
| return; | |||
| } | |||
| int8_t *int8_src = reinterpret_cast<int8_t *>(src_ptr); | |||
| for (int i = 0; i < params_->batch; i++) { | |||
| int8_t *src = int8_src + i * params_->deep_ * params_->row_ * lite::DataTypeSize(src_data_type); | |||
| float16_t *dst = a_pack_ptr_ + i * params_->deep_ * params_->row_align_; | |||
| if (params_->a_transpose_) { | |||
| RowMajor2Row16MajorFp16(src, dst, params_->deep_, params_->row_, src_data_type == kNumberTypeFloat32); | |||
| } else { | |||
| RowMajor2Col16MajorFp16(src, dst, params_->row_, params_->deep_, src_data_type == kNumberTypeFloat32); | |||
| } | |||
| } | |||
| return; | |||
| } | |||
| void MatmulBaseFP16CPUKernel::InitMatrixB(void *src_ptr, TypeId src_data_type) { | |||
| int8_t *int8_src = reinterpret_cast<int8_t *>(src_ptr); | |||
| if (vec_matmul_) { | |||
| if (params_->b_transpose_) { | |||
| if (src_data_type == kNumberTypeFloat32) { | |||
| Float32ToFloat16(reinterpret_cast<float *>(src_ptr), b_pack_ptr_, | |||
| params_->batch * params_->col_ * params_->deep_); | |||
| } else { | |||
| memcpy(b_pack_ptr_, src_ptr, params_->batch * params_->col_ * params_->deep_ * sizeof(float16_t)); | |||
| } | |||
| } else { | |||
| for (int i = 0; i < params_->batch; i++) { | |||
| const int8_t *batch_src = int8_src + i * params_->deep_ * params_->col_ * lite::DataTypeSize(src_data_type); | |||
| float16_t *dst = b_pack_ptr_ + i * params_->deep_ * params_->col_; | |||
| RowMajor2ColMajorFp16(batch_src, dst, params_->deep_, params_->col_, src_data_type == kNumberTypeFloat32); | |||
| } | |||
| } | |||
| return; | |||
| } | |||
| for (int i = 0; i < params_->batch; i++) { | |||
| int8_t *src = int8_src + i * params_->deep_ * params_->col_ * lite::DataTypeSize(src_data_type); | |||
| float16_t *dst = b_pack_ptr_ + i * params_->deep_ * params_->col_align_; | |||
| if (params_->b_transpose_) { | |||
| RowMajor2Col8MajorFp16(src, dst, params_->col_, params_->deep_, src_data_type == kNumberTypeFloat32); | |||
| } else { | |||
| RowMajor2Row8MajorFp16(src, dst, params_->deep_, params_->col_, src_data_type == kNumberTypeFloat32); | |||
| } | |||
| } | |||
| return; | |||
| } | |||
| int MatmulBaseFP16CPUKernel::Init() { | |||
| ResizeParameter(); | |||
| if (params_->a_const_ == true) { | |||
| if (RET_OK != InitBufferA()) { | |||
| return RET_ERROR; | |||
| } | |||
| InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->data_c())); | |||
| } | |||
| if (params_->b_const_ == true) { | |||
| /* copy origin b data, pack in resize | |||
| * pack after a infershape done */ | |||
| auto b_tensor = in_tensors_[1]; | |||
| src_b_ = reinterpret_cast<float16_t *>(malloc(params_->batch * params_->col_ * params_->deep_ * sizeof(float16_t))); | |||
| if (src_b_ == nullptr) { | |||
| MS_LOG(ERROR) << "Matmul fp16 malloc src_b_ failed"; | |||
| return RET_ERROR; | |||
| } | |||
| if (b_tensor->data_type() == kNumberTypeFloat32) { | |||
| Float32ToFloat16(reinterpret_cast<float *>(b_tensor->data_c()), src_b_, | |||
| params_->batch * params_->col_ * params_->deep_); | |||
| } else { | |||
| memcpy(src_b_, b_tensor->data_c(), params_->batch * params_->col_ * params_->deep_ * sizeof(float16_t)); | |||
| } | |||
| } | |||
| auto ret = InitBias(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Matmul fp16 malloc matrix A buffer failed"; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int MatmulBaseFP16CPUKernel::RunImpl(int task_id) { | |||
| int cur_stride = params_->col_ - task_id * thread_stride_; | |||
| int cur_oc = MSMIN(thread_stride_, cur_stride); | |||
| if (cur_oc <= 0) { | |||
| return RET_OK; | |||
| } | |||
| auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id; | |||
| auto b = batch_b_ptr_ + task_id * thread_stride_ * params_->deep_; | |||
| auto c = batch_c_ptr_ + task_id * thread_stride_; | |||
| if (vec_matmul_) { | |||
| MatVecMulFp16(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, cur_oc); | |||
| } else { | |||
| MatMulFp16(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, params_->row_, cur_oc, params_->col_, | |||
| OutType_Nhwc); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int MatmulBaseFP16CPUKernel::Run() { | |||
| auto c_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c()); | |||
| if (params_->a_const_ == false) { | |||
| if (RET_OK != InitBufferA()) { | |||
| return RET_ERROR; | |||
| } | |||
| InitMatrixA(in_tensors_.at(0)->data_c()); | |||
| } | |||
| if (params_->b_const_ == false) { | |||
| if (RET_OK != InitBufferB()) { | |||
| FreeResizeBufA(); | |||
| return RET_ERROR; | |||
| } | |||
| InitMatrixB(in_tensors_.at(1)->data_c(), in_tensors_.at(1)->data_type()); | |||
| } | |||
| for (int i = 0; i < params_->batch; ++i) { | |||
| if (vec_matmul_) { | |||
| batch_a_ptr_ = a_pack_ptr_ + i * params_->deep_; | |||
| batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_; | |||
| batch_c_ptr_ = c_ptr + i * params_->row_ * params_->col_; | |||
| } else { | |||
| batch_a_ptr_ = a_pack_ptr_ + i * params_->row_align_ * params_->deep_; | |||
| batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_align_; | |||
| batch_c_ptr_ = c_ptr + i * params_->row_ * params_->col_; | |||
| } | |||
| auto ret = ParallelLaunch(this->context_->thread_pool_, MatmulBaseFP16Run, this, thread_count_); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "MatmulBaseFloatRun failed"; | |||
| return ret; | |||
| } | |||
| } | |||
| if (params_->a_const_ == false) { | |||
| FreeResizeBufA(); | |||
| } | |||
| if (params_->b_const_ == false) { | |||
| FreeResizeBufB(); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| } // namespace mindspore::kernel | |||
| @@ -0,0 +1,74 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATMUL_BASE_FP16_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATMUL_BASE_FP16_H_ | |||
| #ifdef ENABLE_NEON | |||
| #include <arm_neon.h> | |||
| #endif | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "nnacl/matmul_parameter.h" | |||
| namespace mindspore::kernel { | |||
| class MatmulBaseFP16CPUKernel : public LiteKernel { | |||
| public: | |||
| explicit MatmulBaseFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx, | |||
| const mindspore::lite::PrimitiveC *primitive) | |||
| : LiteKernel(parameter, inputs, outputs, ctx, primitive) { | |||
| params_ = reinterpret_cast<MatMulParameter *>(op_parameter_); | |||
| } | |||
| ~MatmulBaseFP16CPUKernel() override; | |||
| int Init() override; | |||
| int ReSize() override; | |||
| int Run() override; | |||
| public: | |||
| int RunImpl(int task_id); | |||
| protected: | |||
| void InitParameter(); | |||
| private: | |||
| int InitBias(); | |||
| void ResizeParameter(); | |||
| int InitBufferA(); | |||
| int InitBufferB(); | |||
| void InitMatrixA(void *src_ptr); | |||
| void InitMatrixB(void *src_ptr, TypeId data_type); | |||
| void FreeResizeBufA(); | |||
| void FreeResizeBufB(); | |||
| protected: | |||
| MatMulParameter *params_ = nullptr; | |||
| private: | |||
| int thread_stride_ = 0; | |||
| int thread_count_ = 0; | |||
| bool vec_matmul_ = false; | |||
| float16_t *a_pack_ptr_ = nullptr; | |||
| float16_t *b_pack_ptr_ = nullptr; | |||
| float16_t *src_b_ = nullptr; | |||
| float16_t *bias_ptr_ = nullptr; | |||
| float16_t *batch_a_ptr_ = nullptr; | |||
| float16_t *batch_b_ptr_ = nullptr; | |||
| float16_t *batch_c_ptr_ = nullptr; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATMUL_BASE_FP16_H_ | |||
| @@ -15,48 +15,20 @@ | |||
| */ | |||
| #include "src/runtime/kernel/arm/fp16/matmul_fp16.h" | |||
| #include "nnacl/fp16/matmul_fp16.h" | |||
| #include "nnacl/fp16/cast_fp16.h" | |||
| #include "src/runtime/runtime_api.h" | |||
| #include "include/errorcode.h" | |||
| #include "src/kernel_registry.h" | |||
| using mindspore::lite::KernelRegistrar; | |||
| using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_INPUT_TENSOR_ERROR; | |||
| using mindspore::lite::RET_MEMORY_FAILED; | |||
| using mindspore::lite::RET_OK; | |||
| using mindspore::schema::PrimitiveType_MatMul; | |||
| namespace mindspore::kernel { | |||
| MatmulFP16CPUKernel::~MatmulFP16CPUKernel() { | |||
| if (a_pack_ptr_ != nullptr) { | |||
| free(a_pack_ptr_); | |||
| a_pack_ptr_ = nullptr; | |||
| } | |||
| if (b_pack_ptr_ != nullptr) { | |||
| free(b_pack_ptr_); | |||
| b_pack_ptr_ = nullptr; | |||
| } | |||
| if (bias_ptr_ != nullptr) { | |||
| free(bias_ptr_); | |||
| bias_ptr_ = nullptr; | |||
| } | |||
| } | |||
| void MatmulFP16CPUKernel::FreeTmpBuffer() { | |||
| if (a_pack_ptr_ != nullptr) { | |||
| params_->a_const_ ? free(a_pack_ptr_) : context_->allocator->Free(a_pack_ptr_); | |||
| a_pack_ptr_ = nullptr; | |||
| } | |||
| if (b_pack_ptr_ != nullptr) { | |||
| params_->b_const_ ? free(b_pack_ptr_) : context_->allocator->Free(b_pack_ptr_); | |||
| b_pack_ptr_ = nullptr; | |||
| } | |||
| } | |||
| int MatmulFP16CPUKernel::MallocMatrixABuffer() { | |||
| void MatmulFP16CPUKernel::InitAShape() { | |||
| auto a_shape = in_tensors_[0]->shape(); | |||
| if (a_shape.empty()) { | |||
| return; | |||
| } | |||
| int batch = 1; | |||
| for (size_t i = 0; i < a_shape.size() - 2; ++i) { | |||
| batch *= a_shape[i]; | |||
| @@ -65,25 +37,12 @@ int MatmulFP16CPUKernel::MallocMatrixABuffer() { | |||
| params_->row_ = params_->a_transpose_ ? a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2]; | |||
| params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1]; | |||
| params_->row_16_ = UP_ROUND(params_->row_, C16NUM); | |||
| if (params_->a_const_) { | |||
| a_pack_ptr_ = | |||
| reinterpret_cast<float16_t *>(malloc(params_->batch * params_->row_16_ * params_->deep_ * sizeof(float16_t))); | |||
| } else { | |||
| a_pack_ptr_ = reinterpret_cast<float16_t *>( | |||
| context_->allocator->Malloc(params_->batch * params_->row_16_ * params_->deep_ * sizeof(float16_t))); | |||
| } | |||
| if (a_pack_ptr_ == nullptr) { | |||
| FreeTmpBuffer(); | |||
| return RET_MEMORY_FAILED; | |||
| } | |||
| memset(a_pack_ptr_, 0, params_->batch * params_->row_16_ * params_->deep_ * sizeof(float16_t)); | |||
| return RET_OK; | |||
| } | |||
| int MatmulFP16CPUKernel::MallocMatrixBBuffer() { | |||
| void MatmulFP16CPUKernel::InitBShape() { | |||
| auto b_shape = in_tensors_[1]->shape(); | |||
| if (b_shape.empty()) { | |||
| return RET_OK; | |||
| return; | |||
| } | |||
| int batch = 1; | |||
| for (size_t i = 0; i < b_shape.size() - 2; ++i) { | |||
| @@ -93,257 +52,42 @@ int MatmulFP16CPUKernel::MallocMatrixBBuffer() { | |||
| params_->col_ = params_->b_transpose_ ? b_shape[b_shape.size() - 2] : b_shape[b_shape.size() - 1]; | |||
| params_->col_8_ = UP_ROUND(params_->col_, 8); | |||
| params_->deep_ = params_->b_transpose_ ? b_shape[b_shape.size() - 1] : b_shape[b_shape.size() - 2]; | |||
| if (params_->b_const_) { | |||
| b_pack_ptr_ = | |||
| reinterpret_cast<float16_t *>(malloc(params_->batch * params_->col_8_ * params_->deep_ * sizeof(float16_t))); | |||
| } else { | |||
| b_pack_ptr_ = reinterpret_cast<float16_t *>( | |||
| context_->allocator->Malloc(params_->batch * params_->col_8_ * params_->deep_ * sizeof(float16_t))); | |||
| } | |||
| if (b_pack_ptr_ == nullptr) { | |||
| FreeTmpBuffer(); | |||
| return RET_MEMORY_FAILED; | |||
| } | |||
| memset(b_pack_ptr_, 0, params_->batch * params_->col_8_ * params_->deep_ * sizeof(float16_t)); | |||
| thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(params_->col_, C8NUM)); | |||
| thread_stride_ = UP_DIV(UP_DIV(params_->col_, C8NUM), thread_count_) * C8NUM; | |||
| return RET_OK; | |||
| } | |||
| int MatmulFP16CPUKernel::InitBias() { | |||
| auto b_shape = in_tensors_[1]->shape(); | |||
| auto c_shape = out_tensors_[0]->shape(); | |||
| params_->col_ = params_->b_const_ | |||
| ? (params_->b_transpose_ ? b_shape[b_shape.size() - 2] : b_shape[b_shape.size() - 1]) | |||
| : (c_shape[c_shape.size() - 1]); | |||
| params_->col_8_ = UP_ROUND(params_->col_, 8); | |||
| bias_ptr_ = reinterpret_cast<float16_t *>(malloc(params_->col_8_ * sizeof(float16_t))); | |||
| if (bias_ptr_ == nullptr) { | |||
| FreeTmpBuffer(); | |||
| return RET_MEMORY_FAILED; | |||
| } | |||
| memset(bias_ptr_, 0, params_->col_8_ * sizeof(float16_t)); | |||
| if (in_tensors_.size() == 3) { | |||
| Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->data_c()), bias_ptr_, params_->col_); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int MatmulFP16CPUKernel::ReSize() { | |||
| if (!params_->b_const_) { | |||
| auto ret = InitBias(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Matmul fp16 init bias failed"; | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| return RET_OK; | |||
| } | |||
| void MatmulFP16CPUKernel::InitMatrixA(float *a_ptr, float16_t *a_pack_ptr) { | |||
| for (int i = 0; i < params_->batch; i++) { | |||
| float *src = a_ptr + i * params_->deep_ * params_->row_; | |||
| float16_t *dst = a_pack_ptr + i * params_->deep_ * params_->row_16_; | |||
| if (params_->a_transpose_) { | |||
| RowMajor2Row16MajorFp16(reinterpret_cast<void *>(src), dst, params_->deep_, params_->row_, true); | |||
| } else { | |||
| RowMajor2Col16MajorFp16(reinterpret_cast<void *>(src), dst, params_->row_, params_->deep_, true); | |||
| } | |||
| } | |||
| } | |||
| void MatmulFP16CPUKernel::InitMatrixA(float16_t *a_ptr, float16_t *a_pack_ptr) { | |||
| for (int i = 0; i < params_->batch; i++) { | |||
| float16_t *src = a_ptr + i * params_->deep_ * params_->row_; | |||
| float16_t *dst = a_pack_ptr + i * params_->deep_ * params_->row_16_; | |||
| if (params_->a_transpose_) { | |||
| RowMajor2Row16MajorFp16(reinterpret_cast<void *>(src), dst, params_->deep_, params_->row_, false); | |||
| } else { | |||
| RowMajor2Col16MajorFp16(reinterpret_cast<void *>(src), dst, params_->row_, params_->deep_, false); | |||
| } | |||
| } | |||
| } | |||
| void MatmulFP16CPUKernel::InitMatrixB(float *b_ptr, float16_t *b_pack_ptr) { | |||
| for (int i = 0; i < params_->batch; i++) { | |||
| float *src = b_ptr + i * params_->deep_ * params_->col_; | |||
| float16_t *dst = b_pack_ptr + i * params_->deep_ * params_->col_8_; | |||
| if (params_->b_transpose_) { | |||
| RowMajor2Col8MajorFp16(reinterpret_cast<void *>(src), dst, params_->col_, params_->deep_, true); | |||
| } else { | |||
| RowMajor2Row8MajorFp16(reinterpret_cast<void *>(src), dst, params_->deep_, params_->col_, true); | |||
| } | |||
| } | |||
| } | |||
| void MatmulFP16CPUKernel::InitMatrixB(float16_t *b_ptr, float16_t *b_pack_ptr) { | |||
| for (int i = 0; i < params_->batch; i++) { | |||
| float16_t *src = b_ptr + i * params_->deep_ * params_->col_; | |||
| float16_t *dst = b_pack_ptr + i * params_->deep_ * params_->col_8_; | |||
| if (params_->b_transpose_) { | |||
| RowMajor2Col8MajorFp16(reinterpret_cast<void *>(src), dst, params_->col_, params_->deep_, false); | |||
| } else { | |||
| RowMajor2Row8MajorFp16(reinterpret_cast<void *>(src), dst, params_->deep_, params_->col_, false); | |||
| } | |||
| } | |||
| } | |||
| int MatmulFP16CPUKernel::Init() { | |||
| params_->a_const_ = (in_tensors_[0]->data_c() != nullptr); | |||
| params_->b_const_ = (in_tensors_[1]->data_c() != nullptr); | |||
| MatmulBaseFP16CPUKernel::InitParameter(); | |||
| if (params_->a_const_) { | |||
| auto ret = MallocMatrixABuffer(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Matmul fp16 malloc matrix A buffer failed"; | |||
| return RET_ERROR; | |||
| } | |||
| if (in_tensors_[0]->data_type() == kNumberTypeFloat32) { | |||
| InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->data_c()), a_pack_ptr_); | |||
| } else { | |||
| InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_[0]->data_c()), a_pack_ptr_); | |||
| } | |||
| InitAShape(); | |||
| } | |||
| if (params_->b_const_) { | |||
| auto ret = MallocMatrixBBuffer(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Matmul fp16 malloc matrix B buffer failed"; | |||
| return RET_ERROR; | |||
| } | |||
| if (in_tensors_[1]->data_type() == kNumberTypeFloat32) { | |||
| InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_); | |||
| } else { | |||
| InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_[1]->data_c()), b_pack_ptr_); | |||
| } | |||
| ret = InitBias(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Matmul fp16 init bias failed"; | |||
| return RET_ERROR; | |||
| } | |||
| InitBShape(); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int MatmulFP16CPUKernel::MallocFp16Output() { | |||
| if (out_tensors_[0]->data_type() == kNumberTypeFloat32) { | |||
| output_ptr_ = reinterpret_cast<float16_t *>( | |||
| context_->allocator->Malloc(params_->batch * params_->row_ * params_->col_ * sizeof(float16_t))); | |||
| if (output_ptr_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc output_ptr_ failed."; | |||
| return RET_MEMORY_FAILED; | |||
| } | |||
| auto ret = MatmulBaseFP16CPUKernel::Init(); | |||
| if (ret != RET_OK) { | |||
| return ret; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int MatmulFP16CPUKernel::RunImpl(int task_id) { | |||
| int cur_stride = params_->col_ - task_id * thread_stride_; | |||
| int cur_oc = MSMIN(thread_stride_, cur_stride); | |||
| if (cur_oc <= 0) { | |||
| if (!InferShapeDone()) { | |||
| return RET_OK; | |||
| } | |||
| auto b = current_b_ + task_id * thread_stride_ * params_->deep_; | |||
| auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id; | |||
| auto c = current_c_ + task_id * thread_stride_; | |||
| MatMulFp16(current_a_, b, c, bias, ActType_No, params_->deep_, params_->row_, cur_oc, params_->col_, OutType_Nhwc); | |||
| return RET_OK; | |||
| return ReSize(); | |||
| } | |||
| int MatmulFP16Run(void *cdata, int task_id) { | |||
| auto op = reinterpret_cast<MatmulFP16CPUKernel *>(cdata); | |||
| auto error_code = op->RunImpl(task_id); | |||
| if (error_code != RET_OK) { | |||
| MS_LOG(ERROR) << "MatmulFp16Run error task_id[" << task_id << "] error_code[" << error_code << "]"; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| int MatmulFP16CPUKernel::ReSize() { | |||
| InitAShape(); | |||
| InitBShape(); | |||
| return MatmulBaseFP16CPUKernel::ReSize(); | |||
| } | |||
| int MatmulFP16CPUKernel::Run() { | |||
| auto out_tensor = out_tensors_.at(0); | |||
| auto ret = MallocFp16Output(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Matmul MallocFp16Output failed"; | |||
| return RET_ERROR; | |||
| } | |||
| float16_t *c_ptr = nullptr; | |||
| if (out_tensor->data_type() == kNumberTypeFloat32) { | |||
| c_ptr = output_ptr_; | |||
| } else { | |||
| c_ptr = reinterpret_cast<float16_t *>(out_tensor->data_c()); | |||
| } | |||
| if (!params_->a_const_) { | |||
| ret = MallocMatrixABuffer(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Matmul fp16 malloc matrix A buffer failed"; | |||
| return RET_ERROR; | |||
| } | |||
| if (in_tensors_.at(0)->data_type() == kNumberTypeFloat32) { | |||
| InitMatrixA(reinterpret_cast<float *>(in_tensors_.at(0)->data_c()), a_pack_ptr_); | |||
| } else { | |||
| InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c()), a_pack_ptr_); | |||
| } | |||
| } | |||
| if (!params_->b_const_) { | |||
| ret = MallocMatrixBBuffer(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Matmul fp16 malloc matrix B buffer failed"; | |||
| return RET_ERROR; | |||
| } | |||
| if (in_tensors_.at(1)->data_type() == kNumberTypeFloat32) { | |||
| InitMatrixB(reinterpret_cast<float *>(in_tensors_.at(1)->data_c()), b_pack_ptr_); | |||
| } else { | |||
| InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c()), b_pack_ptr_); | |||
| } | |||
| } | |||
| for (int i = 0; i < params_->batch; ++i) { | |||
| current_a_ = a_pack_ptr_ + i * params_->row_16_ * params_->deep_; | |||
| current_b_ = b_pack_ptr_ + i * params_->deep_ * params_->col_8_; | |||
| current_c_ = c_ptr + i * params_->row_ * params_->col_; | |||
| ret = ParallelLaunch(this->context_->thread_pool_, MatmulFP16Run, this, thread_count_); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Matmul fp16 run function MatmulFP16Run failed"; | |||
| FreeTmpBuffer(); | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| if (out_tensor->data_type() == kNumberTypeFloat32) { | |||
| auto size = out_tensor->ElementsNum(); | |||
| auto out_tensor_data = reinterpret_cast<float *>(out_tensor->data_c()); | |||
| Float16ToFloat32(output_ptr_, out_tensor_data, size); | |||
| context_->allocator->Free(output_ptr_); | |||
| } | |||
| if (!params_->a_const_) { | |||
| context_->allocator->Free(a_pack_ptr_); | |||
| a_pack_ptr_ = nullptr; | |||
| } | |||
| if (!params_->b_const_) { | |||
| context_->allocator->Free(b_pack_ptr_); | |||
| b_pack_ptr_ = nullptr; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| kernel::LiteKernel *CpuMatmulFp16KernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, | |||
| const lite::InnerContext *ctx, const kernel::KernelKey &desc, | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| auto *kernel = new (std::nothrow) MatmulFP16CPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "kernel is nullptr."; | |||
| free(opParameter); | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| auto ret = MatmulBaseFP16CPUKernel::Run(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: " | |||
| << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_)); | |||
| delete kernel; | |||
| return nullptr; | |||
| MS_LOG(ERROR) << "MatmulFP16CPUKernel run failed"; | |||
| } | |||
| return kernel; | |||
| return ret; | |||
| } | |||
| REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_MatMul, CpuMatmulFp16KernelCreator) | |||
| REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_MatMul, LiteKernelCreator<MatmulFP16CPUKernel>) | |||
| } // namespace mindspore::kernel | |||
| @@ -17,50 +17,24 @@ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATMUL_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATMUL_H_ | |||
| #ifdef ENABLE_NEON | |||
| #include <arm_neon.h> | |||
| #endif | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "nnacl/matmul_parameter.h" | |||
| #include "src/runtime/kernel/arm/fp16/matmul_base_fp16.h" | |||
| namespace mindspore::kernel { | |||
| class MatmulFP16CPUKernel : public LiteKernel { | |||
| class MatmulFP16CPUKernel : public MatmulBaseFP16CPUKernel { | |||
| public: | |||
| explicit MatmulFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx, | |||
| const mindspore::lite::PrimitiveC *primitive) | |||
| : LiteKernel(parameter, inputs, outputs, ctx, primitive) { | |||
| params_ = reinterpret_cast<MatMulParameter *>(op_parameter_); | |||
| } | |||
| ~MatmulFP16CPUKernel() override; | |||
| : MatmulBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {} | |||
| ~MatmulFP16CPUKernel() override = default; | |||
| int Init() override; | |||
| int ReSize() override; | |||
| int Run() override; | |||
| int RunImpl(int task_id); | |||
| private: | |||
| int MallocMatrixABuffer(); | |||
| int MallocMatrixBBuffer(); | |||
| int InitBias(); | |||
| int MallocFp16Output(); | |||
| void InitMatrixA(float *a_ptr, float16_t *a_pack_ptr); | |||
| void InitMatrixA(float16_t *a_ptr, float16_t *a_pack_ptr); | |||
| void InitMatrixB(float *b_ptr, float16_t *b_pack_ptr); | |||
| void InitMatrixB(float16_t *b_ptr, float16_t *b_pack_ptr); | |||
| void FreeTmpBuffer(); | |||
| private: | |||
| MatMulParameter *params_ = nullptr; | |||
| float16_t *a_pack_ptr_ = nullptr; | |||
| float16_t *b_pack_ptr_ = nullptr; | |||
| float16_t *bias_ptr_ = nullptr; | |||
| float16_t *output_ptr_ = nullptr; | |||
| float16_t *current_a_ = nullptr; | |||
| float16_t *current_b_ = nullptr; | |||
| float16_t *current_c_ = nullptr; | |||
| int thread_stride_ = 0; | |||
| int thread_count_ = 0; | |||
| void InitAShape(); | |||
| void InitBShape(); | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| @@ -25,29 +25,37 @@ using mindspore::lite::RET_OK; | |||
| using mindspore::schema::PrimitiveType_MatMul; | |||
| namespace mindspore::kernel { | |||
| void MatmulCPUKernel::InitShapeA() { | |||
| auto a_shape = in_tensors_.at(0)->shape(); | |||
| int batch = 1; | |||
| for (size_t i = 0; i < a_shape.size() - 2; ++i) { | |||
| batch *= a_shape[i]; | |||
| } | |||
| params_->batch = batch; | |||
| params_->row_ = params_->a_transpose_ ? a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2]; | |||
| params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1]; | |||
| } | |||
| void MatmulCPUKernel::InitShapeB() { | |||
| auto b_shape = in_tensors_.at(1)->shape(); | |||
| int batch = 1; | |||
| for (size_t i = 0; i < b_shape.size() - 2; ++i) { | |||
| batch *= b_shape[i]; | |||
| } | |||
| params_->batch = batch; | |||
| params_->col_ = params_->b_transpose_ ? b_shape[b_shape.size() - 2] : b_shape[b_shape.size() - 1]; | |||
| params_->deep_ = params_->b_transpose_ ? b_shape[b_shape.size() - 1] : b_shape[b_shape.size() - 2]; | |||
| } | |||
| int MatmulCPUKernel::Init() { | |||
| MatmulFp32BaseCPUKernel::InitParameter(); | |||
| if (params_->a_const_ == true) { | |||
| auto a_shape = in_tensors_.at(0)->shape(); | |||
| int batch = 1; | |||
| for (size_t i = 0; i < a_shape.size() - 2; ++i) { | |||
| batch *= a_shape[i]; | |||
| } | |||
| params_->batch = batch; | |||
| params_->row_ = params_->a_transpose_ ? a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2]; | |||
| params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1]; | |||
| InitShapeA(); | |||
| } | |||
| if (params_->b_const_ == true) { | |||
| auto b_shape = in_tensors_.at(1)->shape(); | |||
| int batch = 1; | |||
| for (size_t i = 0; i < b_shape.size() - 2; ++i) { | |||
| batch *= b_shape[i]; | |||
| } | |||
| params_->batch = batch; | |||
| params_->col_ = params_->b_transpose_ ? b_shape[b_shape.size() - 2] : b_shape[b_shape.size() - 1]; | |||
| params_->deep_ = params_->b_transpose_ ? b_shape[b_shape.size() - 1] : b_shape[b_shape.size() - 2]; | |||
| InitShapeB(); | |||
| } | |||
| auto ret = MatmulFp32BaseCPUKernel::Init(); | |||
| @@ -62,17 +70,8 @@ int MatmulCPUKernel::Init() { | |||
| } | |||
| int MatmulCPUKernel::ReSize() { | |||
| auto a_shape = in_tensors_.at(0)->shape(); | |||
| auto b_shape = in_tensors_.at(1)->shape(); | |||
| int batch = 1; | |||
| MS_ASSERT(a_shape.size() >= 2); | |||
| for (size_t i = 0; i < a_shape.size() - 2; ++i) { | |||
| batch *= a_shape[i]; | |||
| } | |||
| params_->batch = batch; | |||
| params_->row_ = params_->a_transpose_ ? a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2]; | |||
| params_->col_ = params_->b_transpose_ ? b_shape[b_shape.size() - 2] : b_shape[b_shape.size() - 1]; | |||
| params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1]; | |||
| InitShapeA(); | |||
| InitShapeB(); | |||
| return MatmulFp32BaseCPUKernel::ReSize(); | |||
| } | |||
| @@ -33,6 +33,10 @@ class MatmulCPUKernel : public MatmulFp32BaseCPUKernel { | |||
| int ReSize() override; | |||
| int Run() override; | |||
| int Eval() override; | |||
| private: | |||
| void InitShapeA(); | |||
| void InitShapeB(); | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_MATMUL_H_ | |||
| @@ -68,8 +68,8 @@ int MatmulFp32BaseCPUKernel::InitBufferA() { | |||
| if (a_pack_ptr_ != nullptr) { | |||
| return RET_OK; | |||
| } | |||
| a_pack_ptr_ = | |||
| reinterpret_cast<float *>(malloc(params_->batch * params_->row_align_ * params_->deep_ * sizeof(float))); | |||
| a_pack_ptr_ = reinterpret_cast<float *>( | |||
| context_->allocator->Malloc(params_->batch * params_->row_align_ * params_->deep_ * sizeof(float))); | |||
| if (a_pack_ptr_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc a_pack_ptr_ failed"; | |||
| return RET_ERROR; | |||
| @@ -81,8 +81,8 @@ int MatmulFp32BaseCPUKernel::InitBufferB() { | |||
| if (b_pack_ptr_ != nullptr) { | |||
| return RET_OK; | |||
| } | |||
| b_pack_ptr_ = | |||
| reinterpret_cast<float *>(malloc(params_->batch * params_->col_align_ * params_->deep_ * sizeof(float))); | |||
| b_pack_ptr_ = reinterpret_cast<float *>( | |||
| context_->allocator->Malloc(params_->batch * params_->col_align_ * params_->deep_ * sizeof(float))); | |||
| if (b_pack_ptr_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc b_pack_ptr_ failed"; | |||
| return RET_ERROR; | |||
| @@ -99,6 +99,7 @@ int MatmulFp32BaseCPUKernel::InitBiasData() { | |||
| MS_LOG(ERROR) << "malloc bias_ptr_ failed"; | |||
| return RET_ERROR; | |||
| } | |||
| memset(bias_ptr_, 0, max_bias_data * sizeof(float)); | |||
| memcpy(bias_ptr_, bias_tensor->data_c(), bias_tensor->ElementsNum() * sizeof(float)); | |||
| } | |||
| return RET_OK; | |||
| @@ -201,7 +202,9 @@ void MatmulFp32BaseCPUKernel::FreeResizeBufB() { | |||
| } | |||
| int MatmulFp32BaseCPUKernel::FloatRun(int task_id) { | |||
| int cur_oc = MSMIN(thread_stride_ * col_tile_, params_->col_ - task_id * thread_stride_ * col_tile_); | |||
| int current_stride_oc = thread_stride_ * col_tile_; | |||
| int current_rest_oc = params_->col_ - task_id * thread_stride_ * col_tile_; | |||
| int cur_oc = MSMIN(current_stride_oc, current_rest_oc); | |||
| if (cur_oc <= 0) { | |||
| return RET_OK; | |||
| } | |||
| @@ -254,7 +257,7 @@ int MatmulFp32BaseCPUKernel::ReSize() { | |||
| int MatmulFp32BaseCPUKernel::Run() { | |||
| auto a_ptr = reinterpret_cast<float *>(in_tensors_.at(0)->data_c()); | |||
| auto b_ptr = reinterpret_cast<float *>(in_tensors_.at(1)->data_c()); | |||
| c_ptr_ = reinterpret_cast<float *>(out_tensors_.at(0)->data_c()); | |||
| auto c_ptr = reinterpret_cast<float *>(out_tensors_.at(0)->data_c()); | |||
| if (params_->a_const_ == false) { | |||
| if (RET_OK != InitBufferA()) { | |||
| @@ -274,11 +277,11 @@ int MatmulFp32BaseCPUKernel::Run() { | |||
| if (vec_matmul_) { | |||
| batch_a_ptr_ = a_pack_ptr_ + i * params_->deep_; | |||
| batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_; | |||
| batch_c_ptr_ = c_ptr_ + i * params_->row_ * params_->col_; | |||
| batch_c_ptr_ = c_ptr + i * params_->row_ * params_->col_; | |||
| } else { | |||
| batch_a_ptr_ = a_pack_ptr_ + i * params_->row_align_ * params_->deep_; | |||
| batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_align_; | |||
| batch_c_ptr_ = c_ptr_ + i * params_->row_ * params_->col_; | |||
| batch_c_ptr_ = c_ptr + i * params_->row_ * params_->col_; | |||
| } | |||
| auto ret = ParallelLaunch(this->context_->thread_pool_, MatmulBaseFloatRun, this, thread_count_); | |||
| if (ret != RET_OK) { | |||
| @@ -62,16 +62,17 @@ class MatmulFp32BaseCPUKernel : public LiteKernel { | |||
| MatMulParameter *params_ = nullptr; | |||
| float *a_pack_ptr_ = nullptr; | |||
| float *b_pack_ptr_ = nullptr; | |||
| float *c_ptr_ = nullptr; | |||
| float *bias_ptr_ = nullptr; | |||
| float *batch_a_ptr_ = nullptr; | |||
| float *batch_b_ptr_ = nullptr; | |||
| float *batch_c_ptr_ = nullptr; | |||
| private: | |||
| int col_tile_ = 0; | |||
| int row_tile_ = 0; | |||
| int thread_stride_ = 0; | |||
| int thread_count_ = 0; | |||
| bool vec_matmul_ = false; | |||
| float *bias_ptr_ = nullptr; | |||
| float *batch_a_ptr_ = nullptr; | |||
| float *batch_b_ptr_ = nullptr; | |||
| float *batch_c_ptr_ = nullptr; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_MATMUL_FP32_BASE_H_ | |||