
[MSLITE] fp16 matmul

commit 9f53c656bb (tags/v1.2.0-rc1) by ling, 4 years ago
13 changed files with 544 additions and 597 deletions
 1. mindspore/lite/nnacl/fp16/matmul_fp16.c (+23, -8)
 2. mindspore/lite/nnacl/fp16/matmul_fp16.h (+2, -0)
 3. mindspore/lite/nnacl/fp32/matmul_fp32.c (+35, -5)
 4. mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc (+29, -204)
 5. mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.h (+7, -29)
 6. mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.cc (+298, -0)
 7. mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.h (+74, -0)
 8. mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc (+23, -279)
 9. mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.h (+6, -32)
10. mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.cc (+26, -27)
11. mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.h (+4, -0)
12. mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.cc (+11, -8)
13. mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.h (+6, -5)

+23 -8  mindspore/lite/nnacl/fp16/matmul_fp16.c

@@ -443,17 +443,20 @@ void RowMajor2Col16MajorFp16Opt(const float16_t *src_ptr, float16_t *dst_ptr, si
}

void RowMajor2Col16MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) {
- for (int r = 0; r < row; r++) {
- for (int c = 0; c < col; c++) {
- int r_div16 = r / 16;
- int r_mod16 = r % 16;
- if (is_fp32_src) {
- dst[r_div16 * 16 * col + c * 16 + r_mod16] = (float16_t)(((const float *)src)[r * col + c]);
- } else {
- dst[r_div16 * 16 * col + c * 16 + r_mod16] = ((const float16_t *)src)[r * col + c];
+ if (is_fp32_src) {
+ const float *fp32_src = (const float *)src;
+ for (int r = 0; r < row; r++) {
+ for (int c = 0; c < col; c++) {
+ int r_div16 = r / 16;
+ int r_mod16 = r % 16;
+ dst[r_div16 * 16 * col + c * 16 + r_mod16] = (float16_t)(fp32_src[r * col + c]);
}
}
+ } else {
+ const float16_t *fp16_src = (const float16_t *)src;
+ RowMajor2Col16MajorFp16Opt(fp16_src, dst, row, col);
}
return;
}

void RowMajor2Row16MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) {
@@ -484,6 +487,18 @@ void RowMajor2Row8MajorFp16(const void *src, float16_t *dst, int row, int col, b
}
}

+ void RowMajor2ColMajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) {
+ for (int r = 0; r < row; ++r) {
+ for (int c = 0; c < col; ++c) {
+ if (is_fp32_src) {
+ dst[c * row + r] = (float16_t)(((const float *)src)[r * col + c]);
+ } else {
+ dst[c * row + r] = ((const float16_t *)src)[r * col + c];
+ }
+ }
+ }
+ }

void RowMajor2Col8MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) {
for (int r = 0; r < row; r++) {
for (int c = 0; c < col; c++) {
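For reference, both branches of RowMajor2Col16MajorFp16 (and the Opt variant it now calls for fp16 sources) write the same tiled layout: rows are grouped into blocks of 16, and within a block the 16 row-entries of each column are stored contiguously, which is the strip the 16-wide fp16 matmul kernel loads per vector. A minimal standalone sketch of the index math; Col16Offset is a made-up helper name, not part of the patch:

#include <cstdio>

// Sketch only: the packed offset RowMajor2Col16MajorFp16 computes for a
// row-major element (r, c) of a row x col matrix.
static int Col16Offset(int r, int c, int col) {
  return (r / 16) * 16 * col + c * 16 + (r % 16);  // block, column, lane
}

int main() {
  // Element (17, 3) with col = 20: block 1, column 3, lane 1 -> 369.
  printf("%d\n", Col16Offset(17, 3, 20));
  return 0;
}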


+2 -0  mindspore/lite/nnacl/fp16/matmul_fp16.h

@@ -59,6 +59,8 @@ void RowMajor2Row8MajorFp16(const void *src, float16_t *dst, int row, int col, b

void RowMajor2Col8MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src);

+ void RowMajor2ColMajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src);

#ifdef __cplusplus
}
#endif


+35 -5  mindspore/lite/nnacl/fp32/matmul_fp32.c

@@ -27,11 +27,17 @@ void RowMajor2ColMajor(const float *src_ptr, float *dst_ptr, int row, int col) {
void RowMajor2Row4Major(const float *src_ptr, float *dst_ptr, int row, int col) {
for (int r = 0; r < row; r++) {
const float *src = src_ptr + r * col;
- for (int c = 0; c < col; c++) {
+ int c = 0;
+ for (; c < col; c++) {
int cd4 = c / C4NUM;
int cm4 = c % C4NUM;
dst_ptr[cd4 * C4NUM * row + r * C4NUM + cm4] = src[c];
}
+ for (; c < UP_ROUND(col, C4NUM); c++) {
+ int cd4 = c / C4NUM;
+ int cm4 = c % C4NUM;
+ dst_ptr[cd4 * C4NUM * row + r * C4NUM + cm4] = 0;
+ }
}
return;
}
@@ -39,11 +45,17 @@ void RowMajor2Row4Major(const float *src_ptr, float *dst_ptr, int row, int col)
void RowMajor2Row6Major(const float *src_ptr, float *dst_ptr, int row, int col) {
for (int r = 0; r < row; r++) {
const float *src = src_ptr + r * col;
- for (int c = 0; c < col; c++) {
+ int c = 0;
+ for (; c < col; c++) {
int cd6 = c / C6NUM;
int cm6 = c % C6NUM;
dst_ptr[cd6 * C6NUM * row + r * C6NUM + cm6] = src[c];
}
+ for (; c < UP_ROUND(col, C6NUM); c++) {
+ int cd6 = c / C6NUM;
+ int cm6 = c % C6NUM;
+ dst_ptr[cd6 * C6NUM * row + r * C6NUM + cm6] = 0;
+ }
}
return;
}
@@ -51,11 +63,17 @@ void RowMajor2Row6Major(const float *src_ptr, float *dst_ptr, int row, int col)
void RowMajor2Row8Major(const float *src_ptr, float *dst_ptr, int row, int col) {
for (int r = 0; r < row; r++) {
const float *src = src_ptr + r * col;
- for (int c = 0; c < col; c++) {
+ int c = 0;
+ for (; c < col; c++) {
int cd8 = c / C8NUM;
int cm8 = c % C8NUM;
dst_ptr[cd8 * C8NUM * row + r * C8NUM + cm8] = src[c];
}
+ for (; c < UP_ROUND(col, C8NUM); c++) {
+ int cd8 = c / C8NUM;
+ int cm8 = c % C8NUM;
+ dst_ptr[cd8 * C8NUM * row + r * C8NUM + cm8] = 0;
+ }
}
return;
}
@@ -63,11 +81,17 @@ void RowMajor2Row8Major(const float *src_ptr, float *dst_ptr, int row, int col)
void RowMajor2Row12Major(const float *src_ptr, float *dst_ptr, int row, int col) {
for (int r = 0; r < row; r++) {
const float *src = src_ptr + r * col;
- for (int c = 0; c < col; c++) {
+ int c = 0;
+ for (; c < col; c++) {
int cd12 = c / C12NUM;
int cm12 = c % C12NUM;
dst_ptr[cd12 * C12NUM * row + r * C12NUM + cm12] = src[c];
}
+ for (; c < UP_ROUND(col, C12NUM); c++) {
+ int cd12 = c / C12NUM;
+ int cm12 = c % C12NUM;
+ dst_ptr[cd12 * C12NUM * row + r * C12NUM + cm12] = 0;
+ }
}
return;
}
@@ -75,11 +99,17 @@ void RowMajor2Row12Major(const float *src_ptr, float *dst_ptr, int row, int col)
void RowMajor2Row16Major(const float *src_ptr, float *dst_ptr, int row, int col) {
for (int r = 0; r < row; r++) {
const float *src = src_ptr + r * col;
- for (int c = 0; c < col; c++) {
+ int c = 0;
+ for (; c < col; c++) {
int cd16 = c / C16NUM;
int cm16 = c % C16NUM;
dst_ptr[cd16 * C16NUM * row + r * C16NUM + cm16] = src[c];
}
+ for (; c < UP_ROUND(col, C16NUM); c++) {
+ int cd16 = c / C16NUM;
+ int cm16 = c % C16NUM;
+ dst_ptr[cd16 * C16NUM * row + r * C16NUM + cm16] = 0;
+ }
}
return;
}
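All five row-major-to-RowXMajor packers above now use the same split-loop pattern: copy the real columns, then zero-fill up to the tile multiple, so callers no longer depend on a pre-zeroed destination. A small sketch of the pattern, assuming UP_ROUND(x, n) rounds x up to a multiple of n as in nnacl:

#include <cstdio>

#define UP_ROUND(x, n) (((x) + (n)-1) / (n) * (n))  // assumed nnacl semantics

int main() {
  const int col = 10, tile = 8;  // e.g. C8NUM
  int c = 0;
  for (; c < col; c++) { /* dst[...] = src[c]; copy real data */ }
  for (; c < UP_ROUND(col, tile); c++) { /* dst[...] = 0; zero the padded tail */ }
  printf("copied %d, zero-padded %d\n", col, UP_ROUND(col, tile) - col);
  return 0;
}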


+29 -204  mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc

@@ -19,236 +19,61 @@
#include "src/kernel_registry.h"

using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_INPUT_TENSOR_ERROR;
using mindspore::lite::RET_MEMORY_FAILED;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_FullConnection;

namespace mindspore::kernel {
- FullconnectionFP16CPUKernel::~FullconnectionFP16CPUKernel() { FreeTmpBuffer(); }
+ void FullconnectionFP16CPUKernel::InitAShape() {
+ auto a_shape = in_tensors_.at(0)->shape();
+ params_->row_ = a_shape[0];
+ params_->deep_ = a_shape[1];
+ }

- void FullconnectionFP16CPUKernel::FreeTmpBuffer() {
- if (a_pack_ptr_ != nullptr) {
- context_->allocator->Free(a_pack_ptr_);
- a_pack_ptr_ = nullptr;
- }
- if (b_pack_ptr_ != nullptr) {
- context_->allocator->Free(b_pack_ptr_);
- b_pack_ptr_ = nullptr;
- }
- if (bias_ptr_ != nullptr) {
- context_->allocator->Free(bias_ptr_);
- bias_ptr_ = nullptr;
- }
- if (output_fp16_ != nullptr) {
- context_->allocator->Free(output_fp16_);
- output_fp16_ = nullptr;
- }
+ void FullconnectionFP16CPUKernel::InitBShape() {
+ auto b_shape = in_tensors_.at(1)->shape();
+ params_->col_ = b_shape[0];
+ params_->deep_ = b_shape[1];
}

int FullconnectionFP16CPUKernel::ReSize() {
- FreeTmpBuffer();
- int row = 1;
- for (size_t i = 0; i < out_tensors_.at(0)->shape().size() - 1; ++i) row *= (out_tensors_.at(0)->shape())[i];
- fc_param_->row_ = row;
- fc_param_->col_ = out_tensors_.at(0)->shape().back();
- fc_param_->deep_ = (in_tensors_.at(1)->shape()).at(1);
- fc_param_->row_16_ = UP_ROUND(fc_param_->row_, C16NUM);
- fc_param_->col_8_ = UP_ROUND(fc_param_->col_, C8NUM);
- thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(fc_param_->col_, C8NUM));
- thread_stride_ = UP_DIV(UP_DIV(fc_param_->col_, C8NUM), thread_count_) * C8NUM;
+ InitAShape();
+ InitBShape();
+ return MatmulBaseFP16CPUKernel::ReSize();
}

- if (row == 1) is_vector_input_ = true;
- int a_pack_row = 0;
- int b_pack_col = 0;
- if (is_vector_input_) {
- a_pack_row = 1;
- b_pack_col = fc_param_->col_;
- } else {
- a_pack_row = fc_param_->row_16_;
- b_pack_col = fc_param_->col_8_;
- }
- a_pack_ptr_ =
- reinterpret_cast<float16_t *>(context_->allocator->Malloc(a_pack_row * fc_param_->deep_ * sizeof(float16_t)));
- if (a_pack_ptr_ == nullptr) {
- FreeTmpBuffer();
- return RET_MEMORY_FAILED;
- }
- memset(a_pack_ptr_, 0, a_pack_row * fc_param_->deep_ * sizeof(float16_t));
+ int FullconnectionFP16CPUKernel::Init() {
+ params_->batch = 1;
+ params_->a_transpose_ = false;
+ params_->b_transpose_ = true;

- b_pack_ptr_ =
- reinterpret_cast<float16_t *>(context_->allocator->Malloc(b_pack_col * fc_param_->deep_ * sizeof(float16_t)));
- if (b_pack_ptr_ == nullptr) {
- FreeTmpBuffer();
- return RET_MEMORY_FAILED;
- }
- memset(b_pack_ptr_, 0, b_pack_col * fc_param_->deep_ * sizeof(float16_t));
+ MatmulBaseFP16CPUKernel::InitParameter();

- fc_param_->b_const_ = (in_tensors_.at(1)->data_c() != nullptr);
- if (fc_param_->b_const_) {
- if (in_tensors_.at(1)->data_type() == kNumberTypeFloat32) {
- if (is_vector_input_) {
- Float32ToFloat16(reinterpret_cast<float *>(in_tensors_.at(1)->data_c()), b_pack_ptr_,
- fc_param_->col_ * fc_param_->deep_);
- } else {
- InitMatrixB(reinterpret_cast<float *>(in_tensors_.at(1)->data_c()), b_pack_ptr_);
- }
- } else {
- if (is_vector_input_) {
- memcpy(b_pack_ptr_, reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c()),
- fc_param_->col_ * fc_param_->deep_ * sizeof(float16_t));
- } else {
- InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c()), b_pack_ptr_);
- }
- }
- b_ptr_ = b_pack_ptr_;
+ if (params_->a_const_ == true) {
+ InitAShape();
}

- if (in_tensors_.size() == 3) {
- bias_ptr_ = reinterpret_cast<float16_t *>(context_->allocator->Malloc(b_pack_col * sizeof(float16_t)));
- if (bias_ptr_ == nullptr) {
- FreeTmpBuffer();
- return RET_MEMORY_FAILED;
- }
- memset(bias_ptr_, 0, b_pack_col * sizeof(float16_t));
- Float32ToFloat16(reinterpret_cast<float *>(in_tensors_.at(2)->data_c()), bias_ptr_, fc_param_->col_);
+ if (params_->b_const_ == true) {
+ InitBShape();
}

- if (out_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
- output_fp16_ =
- reinterpret_cast<float16_t *>(context_->allocator->Malloc(fc_param_->row_ * fc_param_->col_ * sizeof(float16_t)));
- if (output_fp16_ == nullptr) {
- FreeTmpBuffer();
- return RET_MEMORY_FAILED;
- }
+ auto ret = MatmulBaseFP16CPUKernel::Init();
+ if (ret != RET_OK) {
+ return ret;
}
return RET_OK;
}

- void FullconnectionFP16CPUKernel::InitMatrixA(float *a_ptr, float16_t *a_pack_ptr) {
- RowMajor2Col16MajorFp16(reinterpret_cast<void *>(a_ptr), a_pack_ptr, fc_param_->row_, fc_param_->deep_, true);
- }

- void FullconnectionFP16CPUKernel::InitMatrixA(float16_t *a_ptr, float16_t *a_pack_ptr) {
- RowMajor2Col16MajorFp16(reinterpret_cast<void *>(a_ptr), a_pack_ptr, fc_param_->row_, fc_param_->deep_, false);
- }

- void FullconnectionFP16CPUKernel::InitMatrixB(float *b_ptr, float16_t *b_pack_ptr) {
- RowMajor2Col8MajorFp16(reinterpret_cast<void *>(b_ptr), b_pack_ptr, fc_param_->col_, fc_param_->deep_, true);
- }

- void FullconnectionFP16CPUKernel::InitMatrixB(float16_t *b_ptr, float16_t *b_pack_ptr) {
- RowMajor2Col8MajorFp16(reinterpret_cast<void *>(b_ptr), b_pack_ptr, fc_param_->col_, fc_param_->deep_, false);
- }

- int FullconnectionFP16CPUKernel::Init() {
- if (!InferShapeDone()) {
- return RET_OK;
- }
- return ReSize();
- }

- int FullconnectionFP16CPUKernel::RunImpl(int task_id) {
- int cur_stride = fc_param_->col_ - task_id * thread_stride_;
- int cur_oc = MSMIN(thread_stride_, cur_stride);
- if (cur_oc <= 0) {
- return RET_OK;
- }
- auto b = b_ptr_ + task_id * thread_stride_ * fc_param_->deep_;
- auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id;
- auto c = output_ptr_ + task_id * thread_stride_;
- if (is_vector_input_) {
- MatVecMulFp16(a_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, cur_oc);
- } else {
- MatMulFp16(a_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, fc_param_->row_, cur_oc, fc_param_->col_,
- OutType_Nhwc);
- }

- return RET_OK;
- }

- int FcFP16Run(void *cdata, int task_id) {
- auto op = reinterpret_cast<FullconnectionFP16CPUKernel *>(cdata);
- auto error_code = op->RunImpl(task_id);
- if (error_code != RET_OK) {
- MS_LOG(ERROR) << "MatmulFp32Run error task_id[" << task_id << "] error_code[" << error_code << "]";
- return RET_ERROR;
- }
- return RET_OK;
- }

int FullconnectionFP16CPUKernel::Run() {
- auto out_tensor = out_tensors_.at(0);
- if (out_tensor->data_type() == kNumberTypeFloat32) {
- output_ptr_ = output_fp16_;
- } else {
- output_ptr_ = reinterpret_cast<float16_t *>(out_tensor->data_c());
- }

- if (in_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
- if (is_vector_input_) {
- Float32ToFloat16(reinterpret_cast<float *>(in_tensors_.at(0)->data_c()), a_pack_ptr_, fc_param_->deep_);
- } else {
- InitMatrixA(reinterpret_cast<float *>(in_tensors_.at(0)->data_c()), a_pack_ptr_);
- }
- a_ptr_ = a_pack_ptr_;
- } else {
- if (is_vector_input_) {
- a_ptr_ = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
- } else {
- InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c()), a_pack_ptr_);
- a_ptr_ = a_pack_ptr_;
- }
- }

- if (!fc_param_->b_const_) {
- if (in_tensors_.at(1)->data_type() == kNumberTypeFloat32) {
- if (is_vector_input_) {
- Float32ToFloat16(reinterpret_cast<float *>(in_tensors_.at(1)->data_c()), b_pack_ptr_,
- fc_param_->col_ * fc_param_->deep_);
- } else {
- InitMatrixB(reinterpret_cast<float *>(in_tensors_.at(1)->data_c()), b_pack_ptr_);
- }
- b_ptr_ = b_pack_ptr_;
- } else {
- if (is_vector_input_) {
- b_ptr_ = reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c());
- } else {
- InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c()), b_pack_ptr_);
- b_ptr_ = b_pack_ptr_;
- }
- }
- }
- ParallelLaunch(this->context_->thread_pool_, FcFP16Run, this, thread_count_);
- if (out_tensor->data_type() == kNumberTypeFloat32) {
- auto size = out_tensor->ElementsNum();
- auto out_tensor_data = reinterpret_cast<float *>(out_tensor->data_c());
- Float16ToFloat32(output_fp16_, out_tensor_data, size);
- }
- return RET_OK;
- }

- kernel::LiteKernel *CpuFullConnectionFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
- const std::vector<lite::Tensor *> &outputs,
- OpParameter *opParameter, const lite::InnerContext *ctx,
- const kernel::KernelKey &desc,
- const mindspore::lite::PrimitiveC *primitive) {
- auto *kernel = new (std::nothrow) FullconnectionFP16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
- if (kernel == nullptr) {
- MS_LOG(ERROR) << "kernel is nullptr.";
- free(opParameter);
- return nullptr;
- }
- auto ret = kernel->Init();
+ auto ret = MatmulBaseFP16CPUKernel::Run();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
<< schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
delete kernel;
return nullptr;
MS_LOG(ERROR) << "FullconnectionFP16CPUKernel run failed";
}
- return kernel;
+ return ret;
}

- REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_FullConnection, CpuFullConnectionFp16KernelCreator)
+ REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_FullConnection, LiteKernelCreator<FullconnectionFP16CPUKernel>)
} // namespace mindspore::kernel
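With the shared base kernel, fully-connected becomes the degenerate matmul case: Init() above pins batch = 1, A untransposed, B transposed, and packing, bias handling, and threading all live in MatmulBaseFP16CPUKernel. A plain-C++ sketch (fp32 for readability, values invented) of the computation being delegated, C = A * B^T + bias:

#include <cstdio>

int main() {
  const int row = 2, deep = 3, col = 2;
  float a[row * deep] = {1, 2, 3, 4, 5, 6};  // A: row x deep, row-major
  float b[col * deep] = {1, 0, 1, 0, 1, 0};  // B: col x deep (b_transpose_ = true)
  float bias[col] = {0.5f, -0.5f};
  for (int r = 0; r < row; r++) {
    for (int oc = 0; oc < col; oc++) {
      float acc = bias[oc];
      for (int d = 0; d < deep; d++) acc += a[r * deep + d] * b[oc * deep + d];
      printf("%.1f ", acc);  // C[r][oc]
    }
    printf("\n");
  }
  return 0;
}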

+7 -29  mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.h

@@ -19,46 +19,24 @@

#include <arm_neon.h>
#include <vector>
#include "include/errorcode.h"
#include "nnacl/matmul_parameter.h"
#include "nnacl/fp16/matmul_fp16.h"
#include "nnacl/fp16/cast_fp16.h"
#include "src/lite_kernel.h"
#include "src/runtime/kernel/arm/fp16/matmul_base_fp16.h"

namespace mindspore::kernel {
- class FullconnectionFP16CPUKernel : public LiteKernel {
+ class FullconnectionFP16CPUKernel : public MatmulBaseFP16CPUKernel {
public:
explicit FullconnectionFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
- : LiteKernel(parameter, inputs, outputs, ctx, primitive) {
- fc_param_ = reinterpret_cast<MatMulParameter *>(op_parameter_);
- }
- ~FullconnectionFP16CPUKernel() override;
+ : MatmulBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
+ ~FullconnectionFP16CPUKernel() override = default;

int Init() override;
int ReSize() override;
int Run() override;
- int RunImpl(int task_id);

private:
- void InitMatrixA(float *a_ptr, float16_t *a_pack_ptr);
- void InitMatrixA(float16_t *a_ptr, float16_t *a_pack_ptr);
- void InitMatrixB(float *b_ptr, float16_t *b_pack_ptr);
- void InitMatrixB(float16_t *b_ptr, float16_t *b_pack_ptr);
- void FreeTmpBuffer();

- private:
- MatMulParameter *fc_param_ = nullptr;
- float16_t *a_pack_ptr_ = nullptr;
- float16_t *b_pack_ptr_ = nullptr;
- float16_t *bias_ptr_ = nullptr;
- float16_t *output_fp16_ = nullptr;
- float16_t *output_ptr_ = nullptr;
- float16_t *a_ptr_ = nullptr;
- float16_t *b_ptr_ = nullptr;
- bool is_vector_input_ = false;
- int thread_count_ = 1;
- int thread_stride_ = 0;
+ void InitAShape();
+ void InitBShape();
};
} // namespace mindspore::kernel



+298 -0  mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.cc (new file)

@@ -0,0 +1,298 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "src/runtime/kernel/arm/fp16/matmul_base_fp16.h"
#include "nnacl/fp16/matmul_fp16.h"
#include "nnacl/fp16/cast_fp16.h"
#include "src/runtime/runtime_api.h"
#include "include/errorcode.h"

using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_INPUT_TENSOR_ERROR;
using mindspore::lite::RET_MEMORY_FAILED;
using mindspore::lite::RET_OK;

namespace mindspore::kernel {
int MatmulBaseFP16Run(void *cdata, int task_id) {
auto op = reinterpret_cast<MatmulBaseFP16CPUKernel *>(cdata);
auto error_code = op->RunImpl(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "MatmulFp16Run error task_id[" << task_id << "] error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}

MatmulBaseFP16CPUKernel::~MatmulBaseFP16CPUKernel() {
if (bias_ptr_ != nullptr) {
free(bias_ptr_);
bias_ptr_ = nullptr;
}
FreeResizeBufA();
FreeResizeBufB();
}

void MatmulBaseFP16CPUKernel::FreeResizeBufA() {
if (a_pack_ptr_ != nullptr) {
context_->allocator->Free(a_pack_ptr_);
a_pack_ptr_ = nullptr;
}
return;
}

void MatmulBaseFP16CPUKernel::FreeResizeBufB() {
if (b_pack_ptr_ != nullptr) {
context_->allocator->Free(b_pack_ptr_);
b_pack_ptr_ = nullptr;
}
return;
}

void MatmulBaseFP16CPUKernel::InitParameter() {
params_->a_const_ = (in_tensors_[0]->data_c() != nullptr);
params_->b_const_ = (in_tensors_[1]->data_c() != nullptr);
return;
}

int MatmulBaseFP16CPUKernel::InitBias() {
if (in_tensors_.size() == 3) {
auto bias_tensor = in_tensors_[2];
int max_bias_data = UP_ROUND(bias_tensor->ElementsNum(), C8NUM);
bias_ptr_ = reinterpret_cast<float16_t *>(malloc(max_bias_data * sizeof(float)));
if (bias_ptr_ == nullptr) {
MS_LOG(ERROR) << "malloc bias_ptr_ failed";
return RET_ERROR;
}
memset(bias_ptr_, 0, max_bias_data * sizeof(float16_t));
Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->data_c()), bias_ptr_, bias_tensor->ElementsNum());
}
return RET_OK;
}

int MatmulBaseFP16CPUKernel::ReSize() {
ResizeParameter();

if (params_->b_const_ == true && src_b_ != nullptr) {
InitBufferB();
InitMatrixB(src_b_, kNumberTypeFloat16);
free(src_b_);
src_b_ = nullptr;
}

thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(params_->col_, C8NUM));
thread_stride_ = UP_DIV(UP_DIV(params_->col_, C8NUM), thread_count_) * C8NUM;
return RET_OK;
}

void MatmulBaseFP16CPUKernel::ResizeParameter() {
if (params_->row_ == 1) {
vec_matmul_ = true;
}

if (vec_matmul_) {
params_->row_align_ = 1;
params_->col_align_ = params_->col_;
} else {
params_->row_align_ = UP_ROUND(params_->row_, C16NUM);
params_->col_align_ = UP_ROUND(params_->col_, C8NUM);
}
return;
}

int MatmulBaseFP16CPUKernel::InitBufferA() {
a_pack_ptr_ = reinterpret_cast<float16_t *>(
context_->allocator->Malloc(params_->batch * params_->row_align_ * params_->deep_ * sizeof(float16_t)));
if (a_pack_ptr_ == nullptr) {
return RET_MEMORY_FAILED;
}

memset(a_pack_ptr_, 0, params_->batch * params_->row_align_ * params_->deep_ * sizeof(float16_t));
return RET_OK;
}

int MatmulBaseFP16CPUKernel::InitBufferB() {
if (b_pack_ptr_ != nullptr) {
return RET_OK;
}

b_pack_ptr_ = reinterpret_cast<float16_t *>(
context_->allocator->Malloc(params_->batch * params_->col_align_ * params_->deep_ * sizeof(float16_t)));
if (b_pack_ptr_ == nullptr) {
return RET_MEMORY_FAILED;
}

memset(b_pack_ptr_, 0, params_->batch * params_->col_align_ * params_->deep_ * sizeof(float16_t));
return RET_OK;
}

void MatmulBaseFP16CPUKernel::InitMatrixA(void *src_ptr) {
auto src_data_type = in_tensors_[0]->data_type();

if (vec_matmul_) {
if (src_data_type == kNumberTypeFloat32) {
Float32ToFloat16(reinterpret_cast<float *>(src_ptr), a_pack_ptr_, params_->batch * params_->deep_);
} else {
memcpy(a_pack_ptr_, src_ptr, params_->batch * params_->deep_ * sizeof(float16_t));
}
return;
}

int8_t *int8_src = reinterpret_cast<int8_t *>(src_ptr);
for (int i = 0; i < params_->batch; i++) {
int8_t *src = int8_src + i * params_->deep_ * params_->row_ * lite::DataTypeSize(src_data_type);
float16_t *dst = a_pack_ptr_ + i * params_->deep_ * params_->row_align_;
if (params_->a_transpose_) {
RowMajor2Row16MajorFp16(src, dst, params_->deep_, params_->row_, src_data_type == kNumberTypeFloat32);
} else {
RowMajor2Col16MajorFp16(src, dst, params_->row_, params_->deep_, src_data_type == kNumberTypeFloat32);
}
}
return;
}

void MatmulBaseFP16CPUKernel::InitMatrixB(void *src_ptr, TypeId src_data_type) {
int8_t *int8_src = reinterpret_cast<int8_t *>(src_ptr);

if (vec_matmul_) {
if (params_->b_transpose_) {
if (src_data_type == kNumberTypeFloat32) {
Float32ToFloat16(reinterpret_cast<float *>(src_ptr), b_pack_ptr_,
params_->batch * params_->col_ * params_->deep_);
} else {
memcpy(b_pack_ptr_, src_ptr, params_->batch * params_->col_ * params_->deep_ * sizeof(float16_t));
}
} else {
for (int i = 0; i < params_->batch; i++) {
const int8_t *batch_src = int8_src + i * params_->deep_ * params_->col_ * lite::DataTypeSize(src_data_type);
float16_t *dst = b_pack_ptr_ + i * params_->deep_ * params_->col_;
RowMajor2ColMajorFp16(batch_src, dst, params_->deep_, params_->col_, src_data_type == kNumberTypeFloat32);
}
}
return;
}

for (int i = 0; i < params_->batch; i++) {
int8_t *src = int8_src + i * params_->deep_ * params_->col_ * lite::DataTypeSize(src_data_type);
float16_t *dst = b_pack_ptr_ + i * params_->deep_ * params_->col_align_;
if (params_->b_transpose_) {
RowMajor2Col8MajorFp16(src, dst, params_->col_, params_->deep_, src_data_type == kNumberTypeFloat32);
} else {
RowMajor2Row8MajorFp16(src, dst, params_->deep_, params_->col_, src_data_type == kNumberTypeFloat32);
}
}
return;
}

int MatmulBaseFP16CPUKernel::Init() {
ResizeParameter();
if (params_->a_const_ == true) {
if (RET_OK != InitBufferA()) {
return RET_ERROR;
}
InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->data_c()));
}

if (params_->b_const_ == true) {
/* copy origin b data, pack in resize
* pack after a infershape done */
auto b_tensor = in_tensors_[1];
src_b_ = reinterpret_cast<float16_t *>(malloc(params_->batch * params_->col_ * params_->deep_ * sizeof(float16_t)));
if (src_b_ == nullptr) {
MS_LOG(ERROR) << "Matmul fp16 malloc src_b_ failed";
return RET_ERROR;
}

if (b_tensor->data_type() == kNumberTypeFloat32) {
Float32ToFloat16(reinterpret_cast<float *>(b_tensor->data_c()), src_b_,
params_->batch * params_->col_ * params_->deep_);
} else {
memcpy(src_b_, b_tensor->data_c(), params_->batch * params_->col_ * params_->deep_ * sizeof(float16_t));
}
}

auto ret = InitBias();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Matmul fp16 malloc matrix A buffer failed";
return RET_ERROR;
}
return RET_OK;
}

int MatmulBaseFP16CPUKernel::RunImpl(int task_id) {
int cur_stride = params_->col_ - task_id * thread_stride_;
int cur_oc = MSMIN(thread_stride_, cur_stride);
if (cur_oc <= 0) {
return RET_OK;
}

auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id;
auto b = batch_b_ptr_ + task_id * thread_stride_ * params_->deep_;
auto c = batch_c_ptr_ + task_id * thread_stride_;

if (vec_matmul_) {
MatVecMulFp16(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, cur_oc);
} else {
MatMulFp16(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, params_->row_, cur_oc, params_->col_,
OutType_Nhwc);
}
return RET_OK;
}

int MatmulBaseFP16CPUKernel::Run() {
auto c_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());

if (params_->a_const_ == false) {
if (RET_OK != InitBufferA()) {
return RET_ERROR;
}
InitMatrixA(in_tensors_.at(0)->data_c());
}
if (params_->b_const_ == false) {
if (RET_OK != InitBufferB()) {
FreeResizeBufA();
return RET_ERROR;
}
InitMatrixB(in_tensors_.at(1)->data_c(), in_tensors_.at(1)->data_type());
}

for (int i = 0; i < params_->batch; ++i) {
if (vec_matmul_) {
batch_a_ptr_ = a_pack_ptr_ + i * params_->deep_;
batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_;
batch_c_ptr_ = c_ptr + i * params_->row_ * params_->col_;
} else {
batch_a_ptr_ = a_pack_ptr_ + i * params_->row_align_ * params_->deep_;
batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_align_;
batch_c_ptr_ = c_ptr + i * params_->row_ * params_->col_;
}
auto ret = ParallelLaunch(this->context_->thread_pool_, MatmulBaseFP16Run, this, thread_count_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "MatmulBaseFloatRun failed";
return ret;
}
}

if (params_->a_const_ == false) {
FreeResizeBufA();
}

if (params_->b_const_ == false) {
FreeResizeBufB();
}
return RET_OK;
}

} // namespace mindspore::kernel
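The threading model above: ReSize() carves the output columns into C8NUM-aligned strips, one strip per task, and RunImpl() clamps the final strip; when row_ == 1, ResizeParameter() drops the packing alignment entirely (row_align_ = 1, col_align_ = col_) and the work goes through MatVecMulFp16. A worked sketch of the strip arithmetic, assuming UP_DIV rounds up and MSMIN is min:

#include <algorithm>
#include <cstdio>

#define UP_DIV(x, y) (((x) + (y)-1) / (y))  // assumed nnacl semantics

int main() {
  const int C8NUM = 8, col = 100, thread_num = 4;
  int thread_count = std::min(thread_num, UP_DIV(col, C8NUM));            // 4
  int thread_stride = UP_DIV(UP_DIV(col, C8NUM), thread_count) * C8NUM;   // UP_DIV(13, 4) * 8 = 32
  for (int task_id = 0; task_id < thread_count; task_id++) {
    int cur_oc = std::min(thread_stride, col - task_id * thread_stride);  // clamp last strip
    if (cur_oc <= 0) continue;  // trailing tasks may get nothing
    printf("task %d: columns [%d, %d)\n", task_id, task_id * thread_stride,
           task_id * thread_stride + cur_oc);
  }
  return 0;
}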

+74 -0  mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.h (new file)

@@ -0,0 +1,74 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATMUL_BASE_FP16_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATMUL_BASE_FP16_H_

#ifdef ENABLE_NEON
#include <arm_neon.h>
#endif
#include <vector>
#include "src/lite_kernel.h"
#include "nnacl/matmul_parameter.h"

namespace mindspore::kernel {
class MatmulBaseFP16CPUKernel : public LiteKernel {
public:
explicit MatmulBaseFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: LiteKernel(parameter, inputs, outputs, ctx, primitive) {
params_ = reinterpret_cast<MatMulParameter *>(op_parameter_);
}
~MatmulBaseFP16CPUKernel() override;
int Init() override;
int ReSize() override;
int Run() override;

public:
int RunImpl(int task_id);

protected:
void InitParameter();

private:
int InitBias();
void ResizeParameter();
int InitBufferA();
int InitBufferB();
void InitMatrixA(void *src_ptr);
void InitMatrixB(void *src_ptr, TypeId data_type);
void FreeResizeBufA();
void FreeResizeBufB();

protected:
MatMulParameter *params_ = nullptr;

private:
int thread_stride_ = 0;
int thread_count_ = 0;
bool vec_matmul_ = false;
float16_t *a_pack_ptr_ = nullptr;
float16_t *b_pack_ptr_ = nullptr;
float16_t *src_b_ = nullptr;
float16_t *bias_ptr_ = nullptr;
float16_t *batch_a_ptr_ = nullptr;
float16_t *batch_b_ptr_ = nullptr;
float16_t *batch_c_ptr_ = nullptr;
};
} // namespace mindspore::kernel

#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATMUL_BASE_FP16_H_

+23 -279  mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc

@@ -15,48 +15,20 @@
*/

#include "src/runtime/kernel/arm/fp16/matmul_fp16.h"
#include "nnacl/fp16/matmul_fp16.h"
#include "nnacl/fp16/cast_fp16.h"
#include "src/runtime/runtime_api.h"
#include "include/errorcode.h"
#include "src/kernel_registry.h"

using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_INPUT_TENSOR_ERROR;
using mindspore::lite::RET_MEMORY_FAILED;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_MatMul;

namespace mindspore::kernel {
- MatmulFP16CPUKernel::~MatmulFP16CPUKernel() {
- if (a_pack_ptr_ != nullptr) {
- free(a_pack_ptr_);
- a_pack_ptr_ = nullptr;
- }
- if (b_pack_ptr_ != nullptr) {
- free(b_pack_ptr_);
- b_pack_ptr_ = nullptr;
- }
- if (bias_ptr_ != nullptr) {
- free(bias_ptr_);
- bias_ptr_ = nullptr;
- }
- }

- void MatmulFP16CPUKernel::FreeTmpBuffer() {
- if (a_pack_ptr_ != nullptr) {
- params_->a_const_ ? free(a_pack_ptr_) : context_->allocator->Free(a_pack_ptr_);
- a_pack_ptr_ = nullptr;
- }
- if (b_pack_ptr_ != nullptr) {
- params_->b_const_ ? free(b_pack_ptr_) : context_->allocator->Free(b_pack_ptr_);
- b_pack_ptr_ = nullptr;
- }
- }

- int MatmulFP16CPUKernel::MallocMatrixABuffer() {
+ void MatmulFP16CPUKernel::InitAShape() {
auto a_shape = in_tensors_[0]->shape();
+ if (a_shape.empty()) {
+ return;
+ }
int batch = 1;
for (size_t i = 0; i < a_shape.size() - 2; ++i) {
batch *= a_shape[i];
@@ -65,25 +37,12 @@ int MatmulFP16CPUKernel::MallocMatrixABuffer() {
params_->row_ = params_->a_transpose_ ? a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2];
params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1];
params_->row_16_ = UP_ROUND(params_->row_, C16NUM);
- if (params_->a_const_) {
- a_pack_ptr_ =
- reinterpret_cast<float16_t *>(malloc(params_->batch * params_->row_16_ * params_->deep_ * sizeof(float16_t)));
- } else {
- a_pack_ptr_ = reinterpret_cast<float16_t *>(
- context_->allocator->Malloc(params_->batch * params_->row_16_ * params_->deep_ * sizeof(float16_t)));
- }
- if (a_pack_ptr_ == nullptr) {
- FreeTmpBuffer();
- return RET_MEMORY_FAILED;
- }
- memset(a_pack_ptr_, 0, params_->batch * params_->row_16_ * params_->deep_ * sizeof(float16_t));
- return RET_OK;
}

- int MatmulFP16CPUKernel::MallocMatrixBBuffer() {
+ void MatmulFP16CPUKernel::InitBShape() {
auto b_shape = in_tensors_[1]->shape();
if (b_shape.empty()) {
- return RET_OK;
+ return;
}
int batch = 1;
for (size_t i = 0; i < b_shape.size() - 2; ++i) {
@@ -93,257 +52,42 @@ int MatmulFP16CPUKernel::MallocMatrixBBuffer() {
params_->col_ = params_->b_transpose_ ? b_shape[b_shape.size() - 2] : b_shape[b_shape.size() - 1];
params_->col_8_ = UP_ROUND(params_->col_, 8);
params_->deep_ = params_->b_transpose_ ? b_shape[b_shape.size() - 1] : b_shape[b_shape.size() - 2];

- if (params_->b_const_) {
- b_pack_ptr_ =
- reinterpret_cast<float16_t *>(malloc(params_->batch * params_->col_8_ * params_->deep_ * sizeof(float16_t)));
- } else {
- b_pack_ptr_ = reinterpret_cast<float16_t *>(
- context_->allocator->Malloc(params_->batch * params_->col_8_ * params_->deep_ * sizeof(float16_t)));
- }
- if (b_pack_ptr_ == nullptr) {
- FreeTmpBuffer();
- return RET_MEMORY_FAILED;
- }
- memset(b_pack_ptr_, 0, params_->batch * params_->col_8_ * params_->deep_ * sizeof(float16_t));
- thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(params_->col_, C8NUM));
- thread_stride_ = UP_DIV(UP_DIV(params_->col_, C8NUM), thread_count_) * C8NUM;
- return RET_OK;
}

- int MatmulFP16CPUKernel::InitBias() {
- auto b_shape = in_tensors_[1]->shape();
- auto c_shape = out_tensors_[0]->shape();
- params_->col_ = params_->b_const_
- ? (params_->b_transpose_ ? b_shape[b_shape.size() - 2] : b_shape[b_shape.size() - 1])
- : (c_shape[c_shape.size() - 1]);
- params_->col_8_ = UP_ROUND(params_->col_, 8);
- bias_ptr_ = reinterpret_cast<float16_t *>(malloc(params_->col_8_ * sizeof(float16_t)));
- if (bias_ptr_ == nullptr) {
- FreeTmpBuffer();
- return RET_MEMORY_FAILED;
- }
- memset(bias_ptr_, 0, params_->col_8_ * sizeof(float16_t));
- if (in_tensors_.size() == 3) {
- Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->data_c()), bias_ptr_, params_->col_);
- }
- return RET_OK;
- }

- int MatmulFP16CPUKernel::ReSize() {
- if (!params_->b_const_) {
- auto ret = InitBias();
- if (ret != RET_OK) {
- MS_LOG(ERROR) << "Matmul fp16 init bias failed";
- return RET_ERROR;
- }
- }
- return RET_OK;
- }

- void MatmulFP16CPUKernel::InitMatrixA(float *a_ptr, float16_t *a_pack_ptr) {
- for (int i = 0; i < params_->batch; i++) {
- float *src = a_ptr + i * params_->deep_ * params_->row_;
- float16_t *dst = a_pack_ptr + i * params_->deep_ * params_->row_16_;
- if (params_->a_transpose_) {
- RowMajor2Row16MajorFp16(reinterpret_cast<void *>(src), dst, params_->deep_, params_->row_, true);
- } else {
- RowMajor2Col16MajorFp16(reinterpret_cast<void *>(src), dst, params_->row_, params_->deep_, true);
- }
- }
- }

- void MatmulFP16CPUKernel::InitMatrixA(float16_t *a_ptr, float16_t *a_pack_ptr) {
- for (int i = 0; i < params_->batch; i++) {
- float16_t *src = a_ptr + i * params_->deep_ * params_->row_;
- float16_t *dst = a_pack_ptr + i * params_->deep_ * params_->row_16_;
- if (params_->a_transpose_) {
- RowMajor2Row16MajorFp16(reinterpret_cast<void *>(src), dst, params_->deep_, params_->row_, false);
- } else {
- RowMajor2Col16MajorFp16(reinterpret_cast<void *>(src), dst, params_->row_, params_->deep_, false);
- }
- }
- }

- void MatmulFP16CPUKernel::InitMatrixB(float *b_ptr, float16_t *b_pack_ptr) {
- for (int i = 0; i < params_->batch; i++) {
- float *src = b_ptr + i * params_->deep_ * params_->col_;
- float16_t *dst = b_pack_ptr + i * params_->deep_ * params_->col_8_;
- if (params_->b_transpose_) {
- RowMajor2Col8MajorFp16(reinterpret_cast<void *>(src), dst, params_->col_, params_->deep_, true);
- } else {
- RowMajor2Row8MajorFp16(reinterpret_cast<void *>(src), dst, params_->deep_, params_->col_, true);
- }
- }
- }

- void MatmulFP16CPUKernel::InitMatrixB(float16_t *b_ptr, float16_t *b_pack_ptr) {
- for (int i = 0; i < params_->batch; i++) {
- float16_t *src = b_ptr + i * params_->deep_ * params_->col_;
- float16_t *dst = b_pack_ptr + i * params_->deep_ * params_->col_8_;
- if (params_->b_transpose_) {
- RowMajor2Col8MajorFp16(reinterpret_cast<void *>(src), dst, params_->col_, params_->deep_, false);
- } else {
- RowMajor2Row8MajorFp16(reinterpret_cast<void *>(src), dst, params_->deep_, params_->col_, false);
- }
- }
- }

int MatmulFP16CPUKernel::Init() {
- params_->a_const_ = (in_tensors_[0]->data_c() != nullptr);
- params_->b_const_ = (in_tensors_[1]->data_c() != nullptr);
+ MatmulBaseFP16CPUKernel::InitParameter();

if (params_->a_const_) {
- auto ret = MallocMatrixABuffer();
- if (ret != RET_OK) {
- MS_LOG(ERROR) << "Matmul fp16 malloc matrix A buffer failed";
- return RET_ERROR;
- }
- if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
- InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->data_c()), a_pack_ptr_);
- } else {
- InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_[0]->data_c()), a_pack_ptr_);
- }
+ InitAShape();
}
if (params_->b_const_) {
- auto ret = MallocMatrixBBuffer();
- if (ret != RET_OK) {
- MS_LOG(ERROR) << "Matmul fp16 malloc matrix B buffer failed";
- return RET_ERROR;
- }
- if (in_tensors_[1]->data_type() == kNumberTypeFloat32) {
- InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_);
- } else {
- InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_[1]->data_c()), b_pack_ptr_);
- }
- ret = InitBias();
- if (ret != RET_OK) {
- MS_LOG(ERROR) << "Matmul fp16 init bias failed";
- return RET_ERROR;
- }
+ InitBShape();
}
- return RET_OK;
- }

- int MatmulFP16CPUKernel::MallocFp16Output() {
- if (out_tensors_[0]->data_type() == kNumberTypeFloat32) {
- output_ptr_ = reinterpret_cast<float16_t *>(
- context_->allocator->Malloc(params_->batch * params_->row_ * params_->col_ * sizeof(float16_t)));
- if (output_ptr_ == nullptr) {
- MS_LOG(ERROR) << "malloc output_ptr_ failed.";
- return RET_MEMORY_FAILED;
- }
+ auto ret = MatmulBaseFP16CPUKernel::Init();
+ if (ret != RET_OK) {
+ return ret;
+ }
return RET_OK;
}

- int MatmulFP16CPUKernel::RunImpl(int task_id) {
- int cur_stride = params_->col_ - task_id * thread_stride_;
- int cur_oc = MSMIN(thread_stride_, cur_stride);
- if (cur_oc <= 0) {
+ if (!InferShapeDone()) {
return RET_OK;
}
- auto b = current_b_ + task_id * thread_stride_ * params_->deep_;
- auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id;
- auto c = current_c_ + task_id * thread_stride_;
- MatMulFp16(current_a_, b, c, bias, ActType_No, params_->deep_, params_->row_, cur_oc, params_->col_, OutType_Nhwc);

- return RET_OK;
+ return ReSize();
}

- int MatmulFP16Run(void *cdata, int task_id) {
- auto op = reinterpret_cast<MatmulFP16CPUKernel *>(cdata);
- auto error_code = op->RunImpl(task_id);
- if (error_code != RET_OK) {
- MS_LOG(ERROR) << "MatmulFp16Run error task_id[" << task_id << "] error_code[" << error_code << "]";
- return RET_ERROR;
- }
- return RET_OK;
+ int MatmulFP16CPUKernel::ReSize() {
+ InitAShape();
+ InitBShape();
+ return MatmulBaseFP16CPUKernel::ReSize();
}

int MatmulFP16CPUKernel::Run() {
- auto out_tensor = out_tensors_.at(0);
- auto ret = MallocFp16Output();
- if (ret != RET_OK) {
- MS_LOG(ERROR) << "Matmul MallocFp16Output failed";
- return RET_ERROR;
- }
- float16_t *c_ptr = nullptr;
- if (out_tensor->data_type() == kNumberTypeFloat32) {
- c_ptr = output_ptr_;
- } else {
- c_ptr = reinterpret_cast<float16_t *>(out_tensor->data_c());
- }
- if (!params_->a_const_) {
- ret = MallocMatrixABuffer();
- if (ret != RET_OK) {
- MS_LOG(ERROR) << "Matmul fp16 malloc matrix A buffer failed";
- return RET_ERROR;
- }
- if (in_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
- InitMatrixA(reinterpret_cast<float *>(in_tensors_.at(0)->data_c()), a_pack_ptr_);
- } else {
- InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c()), a_pack_ptr_);
- }
- }
- if (!params_->b_const_) {
- ret = MallocMatrixBBuffer();
- if (ret != RET_OK) {
- MS_LOG(ERROR) << "Matmul fp16 malloc matrix B buffer failed";
- return RET_ERROR;
- }
- if (in_tensors_.at(1)->data_type() == kNumberTypeFloat32) {
- InitMatrixB(reinterpret_cast<float *>(in_tensors_.at(1)->data_c()), b_pack_ptr_);
- } else {
- InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c()), b_pack_ptr_);
- }
- }
- for (int i = 0; i < params_->batch; ++i) {
- current_a_ = a_pack_ptr_ + i * params_->row_16_ * params_->deep_;
- current_b_ = b_pack_ptr_ + i * params_->deep_ * params_->col_8_;
- current_c_ = c_ptr + i * params_->row_ * params_->col_;
- ret = ParallelLaunch(this->context_->thread_pool_, MatmulFP16Run, this, thread_count_);
- if (ret != RET_OK) {
- MS_LOG(ERROR) << "Matmul fp16 run function MatmulFP16Run failed";
- FreeTmpBuffer();
- return RET_ERROR;
- }
- }
- if (out_tensor->data_type() == kNumberTypeFloat32) {
- auto size = out_tensor->ElementsNum();
- auto out_tensor_data = reinterpret_cast<float *>(out_tensor->data_c());
- Float16ToFloat32(output_ptr_, out_tensor_data, size);
- context_->allocator->Free(output_ptr_);
- }
- if (!params_->a_const_) {
- context_->allocator->Free(a_pack_ptr_);
- a_pack_ptr_ = nullptr;
- }
- if (!params_->b_const_) {
- context_->allocator->Free(b_pack_ptr_);
- b_pack_ptr_ = nullptr;
- }
- return RET_OK;
- }

- kernel::LiteKernel *CpuMatmulFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
- const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
- const lite::InnerContext *ctx, const kernel::KernelKey &desc,
- const mindspore::lite::PrimitiveC *primitive) {
- auto *kernel = new (std::nothrow) MatmulFP16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
- if (kernel == nullptr) {
- MS_LOG(ERROR) << "kernel is nullptr.";
- free(opParameter);
- return nullptr;
- }
- auto ret = kernel->Init();
+ auto ret = MatmulBaseFP16CPUKernel::Run();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
<< schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
delete kernel;
return nullptr;
MS_LOG(ERROR) << "MatmulFP16CPUKernel run failed";
}
- return kernel;
+ return ret;
}

- REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_MatMul, CpuMatmulFp16KernelCreator)
+ REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_MatMul, LiteKernelCreator<MatmulFP16CPUKernel>)
} // namespace mindspore::kernel
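InitAShape()/InitBShape() above fold every leading dimension into params_->batch and read row/deep (respectively col/deep) from the last two axes according to the transpose flags. A worked example with a hypothetical 4-D input:

#include <cstdio>
#include <vector>

int main() {
  // Sketch of the shape derivation for an A of shape {2, 4, 3, 5}, a_transpose_ == false.
  std::vector<int> a_shape = {2, 4, 3, 5};
  bool a_transpose = false;
  int batch = 1;
  for (size_t i = 0; i < a_shape.size() - 2; ++i) batch *= a_shape[i];  // 2 * 4
  int row = a_transpose ? a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2];
  int deep = a_transpose ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1];
  printf("batch=%d row=%d deep=%d\n", batch, row, deep);  // batch=8 row=3 deep=5
  return 0;
}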

+6 -32  mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.h

@@ -17,50 +17,24 @@
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATMUL_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATMUL_H_

#ifdef ENABLE_NEON
#include <arm_neon.h>
#endif
#include <vector>
#include "src/lite_kernel.h"
#include "nnacl/matmul_parameter.h"
#include "src/runtime/kernel/arm/fp16/matmul_base_fp16.h"

namespace mindspore::kernel {
- class MatmulFP16CPUKernel : public LiteKernel {
+ class MatmulFP16CPUKernel : public MatmulBaseFP16CPUKernel {
public:
explicit MatmulFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
- : LiteKernel(parameter, inputs, outputs, ctx, primitive) {
- params_ = reinterpret_cast<MatMulParameter *>(op_parameter_);
- }
- ~MatmulFP16CPUKernel() override;
+ : MatmulBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
+ ~MatmulFP16CPUKernel() override = default;
int Init() override;
int ReSize() override;
int Run() override;
- int RunImpl(int task_id);

private:
- int MallocMatrixABuffer();
- int MallocMatrixBBuffer();
- int InitBias();
- int MallocFp16Output();
- void InitMatrixA(float *a_ptr, float16_t *a_pack_ptr);
- void InitMatrixA(float16_t *a_ptr, float16_t *a_pack_ptr);
- void InitMatrixB(float *b_ptr, float16_t *b_pack_ptr);
- void InitMatrixB(float16_t *b_ptr, float16_t *b_pack_ptr);
- void FreeTmpBuffer();

- private:
- MatMulParameter *params_ = nullptr;
- float16_t *a_pack_ptr_ = nullptr;
- float16_t *b_pack_ptr_ = nullptr;
- float16_t *bias_ptr_ = nullptr;
- float16_t *output_ptr_ = nullptr;
- float16_t *current_a_ = nullptr;
- float16_t *current_b_ = nullptr;
- float16_t *current_c_ = nullptr;
- int thread_stride_ = 0;
- int thread_count_ = 0;
+ void InitAShape();
+ void InitBShape();
};
} // namespace mindspore::kernel



+26 -27  mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.cc

@@ -25,29 +25,37 @@ using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_MatMul;

namespace mindspore::kernel {
+ void MatmulCPUKernel::InitShapeA() {
+ auto a_shape = in_tensors_.at(0)->shape();
+ int batch = 1;
+ for (size_t i = 0; i < a_shape.size() - 2; ++i) {
+ batch *= a_shape[i];
+ }
+ params_->batch = batch;
+ params_->row_ = params_->a_transpose_ ? a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2];
+ params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1];
+ }

+ void MatmulCPUKernel::InitShapeB() {
+ auto b_shape = in_tensors_.at(1)->shape();
+ int batch = 1;
+ for (size_t i = 0; i < b_shape.size() - 2; ++i) {
+ batch *= b_shape[i];
+ }
+ params_->batch = batch;
+ params_->col_ = params_->b_transpose_ ? b_shape[b_shape.size() - 2] : b_shape[b_shape.size() - 1];
+ params_->deep_ = params_->b_transpose_ ? b_shape[b_shape.size() - 1] : b_shape[b_shape.size() - 2];
+ }

int MatmulCPUKernel::Init() {
MatmulFp32BaseCPUKernel::InitParameter();

if (params_->a_const_ == true) {
- auto a_shape = in_tensors_.at(0)->shape();
- int batch = 1;
- for (size_t i = 0; i < a_shape.size() - 2; ++i) {
- batch *= a_shape[i];
- }
- params_->batch = batch;
- params_->row_ = params_->a_transpose_ ? a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2];
- params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1];
+ InitShapeA();
}

if (params_->b_const_ == true) {
- auto b_shape = in_tensors_.at(1)->shape();
- int batch = 1;
- for (size_t i = 0; i < b_shape.size() - 2; ++i) {
- batch *= b_shape[i];
- }
- params_->batch = batch;
- params_->col_ = params_->b_transpose_ ? b_shape[b_shape.size() - 2] : b_shape[b_shape.size() - 1];
- params_->deep_ = params_->b_transpose_ ? b_shape[b_shape.size() - 1] : b_shape[b_shape.size() - 2];
+ InitShapeB();
}

auto ret = MatmulFp32BaseCPUKernel::Init();
@@ -62,17 +70,8 @@ int MatmulCPUKernel::Init() {
}

int MatmulCPUKernel::ReSize() {
- auto a_shape = in_tensors_.at(0)->shape();
- auto b_shape = in_tensors_.at(1)->shape();
- int batch = 1;
- MS_ASSERT(a_shape.size() >= 2);
- for (size_t i = 0; i < a_shape.size() - 2; ++i) {
- batch *= a_shape[i];
- }
- params_->batch = batch;
- params_->row_ = params_->a_transpose_ ? a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2];
- params_->col_ = params_->b_transpose_ ? b_shape[b_shape.size() - 2] : b_shape[b_shape.size() - 1];
- params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1];
+ InitShapeA();
+ InitShapeB();

return MatmulFp32BaseCPUKernel::ReSize();
}


+4 -0  mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.h

@@ -33,6 +33,10 @@ class MatmulCPUKernel : public MatmulFp32BaseCPUKernel {
int ReSize() override;
int Run() override;
int Eval() override;

+ private:
+ void InitShapeA();
+ void InitShapeB();
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_MATMUL_H_

+11 -8  mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.cc

@@ -68,8 +68,8 @@ int MatmulFp32BaseCPUKernel::InitBufferA() {
if (a_pack_ptr_ != nullptr) {
return RET_OK;
}
- a_pack_ptr_ =
- reinterpret_cast<float *>(malloc(params_->batch * params_->row_align_ * params_->deep_ * sizeof(float)));
+ a_pack_ptr_ = reinterpret_cast<float *>(
+ context_->allocator->Malloc(params_->batch * params_->row_align_ * params_->deep_ * sizeof(float)));
if (a_pack_ptr_ == nullptr) {
MS_LOG(ERROR) << "malloc a_pack_ptr_ failed";
return RET_ERROR;
@@ -81,8 +81,8 @@ int MatmulFp32BaseCPUKernel::InitBufferB() {
if (b_pack_ptr_ != nullptr) {
return RET_OK;
}
- b_pack_ptr_ =
- reinterpret_cast<float *>(malloc(params_->batch * params_->col_align_ * params_->deep_ * sizeof(float)));
+ b_pack_ptr_ = reinterpret_cast<float *>(
+ context_->allocator->Malloc(params_->batch * params_->col_align_ * params_->deep_ * sizeof(float)));
if (b_pack_ptr_ == nullptr) {
MS_LOG(ERROR) << "malloc b_pack_ptr_ failed";
return RET_ERROR;
@@ -99,6 +99,7 @@ int MatmulFp32BaseCPUKernel::InitBiasData() {
MS_LOG(ERROR) << "malloc bias_ptr_ failed";
return RET_ERROR;
}
+ memset(bias_ptr_, 0, max_bias_data * sizeof(float));
memcpy(bias_ptr_, bias_tensor->data_c(), bias_tensor->ElementsNum() * sizeof(float));
}
return RET_OK;
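The memset added to InitBiasData() matters because the bias buffer is over-allocated to a multiple of the column tile (max_bias_data = UP_ROUND of the element count) and the tiled kernels read bias in full strips; memcpy alone would leave the padded tail uninitialized. A sketch with col = 10 and an assumed 8-wide tile:

#include <cstdio>
#include <cstring>

int main() {
  const int col = 10, tile = 8;
  const int max_bias_data = ((col + tile - 1) / tile) * tile;  // UP_ROUND -> 16
  float bias[max_bias_data];
  memset(bias, 0, max_bias_data * sizeof(float));  // zero everything, including the tail...
  float src[col] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  memcpy(bias, src, col * sizeof(float));          // ...then copy only the real values
  printf("%g %g\n", bias[9], bias[15]);            // 10 0
  return 0;
}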
@@ -201,7 +202,9 @@ void MatmulFp32BaseCPUKernel::FreeResizeBufB() {
}

int MatmulFp32BaseCPUKernel::FloatRun(int task_id) {
- int cur_oc = MSMIN(thread_stride_ * col_tile_, params_->col_ - task_id * thread_stride_ * col_tile_);
+ int current_stride_oc = thread_stride_ * col_tile_;
+ int current_rest_oc = params_->col_ - task_id * thread_stride_ * col_tile_;
+ int cur_oc = MSMIN(current_stride_oc, current_rest_oc);
if (cur_oc <= 0) {
return RET_OK;
}
@@ -254,7 +257,7 @@ int MatmulFp32BaseCPUKernel::ReSize() {
int MatmulFp32BaseCPUKernel::Run() {
auto a_ptr = reinterpret_cast<float *>(in_tensors_.at(0)->data_c());
auto b_ptr = reinterpret_cast<float *>(in_tensors_.at(1)->data_c());
- c_ptr_ = reinterpret_cast<float *>(out_tensors_.at(0)->data_c());
+ auto c_ptr = reinterpret_cast<float *>(out_tensors_.at(0)->data_c());

if (params_->a_const_ == false) {
if (RET_OK != InitBufferA()) {
@@ -274,11 +277,11 @@ int MatmulFp32BaseCPUKernel::Run() {
if (vec_matmul_) {
batch_a_ptr_ = a_pack_ptr_ + i * params_->deep_;
batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_;
- batch_c_ptr_ = c_ptr_ + i * params_->row_ * params_->col_;
+ batch_c_ptr_ = c_ptr + i * params_->row_ * params_->col_;
} else {
batch_a_ptr_ = a_pack_ptr_ + i * params_->row_align_ * params_->deep_;
batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_align_;
- batch_c_ptr_ = c_ptr_ + i * params_->row_ * params_->col_;
+ batch_c_ptr_ = c_ptr + i * params_->row_ * params_->col_;
}
auto ret = ParallelLaunch(this->context_->thread_pool_, MatmulBaseFloatRun, this, thread_count_);
if (ret != RET_OK) {


+6 -5  mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.h

@@ -62,16 +62,17 @@ class MatmulFp32BaseCPUKernel : public LiteKernel {
MatMulParameter *params_ = nullptr;
float *a_pack_ptr_ = nullptr;
float *b_pack_ptr_ = nullptr;
- float *c_ptr_ = nullptr;
+ float *bias_ptr_ = nullptr;
+ float *batch_a_ptr_ = nullptr;
+ float *batch_b_ptr_ = nullptr;
+ float *batch_c_ptr_ = nullptr;

private:
int col_tile_ = 0;
int row_tile_ = 0;
int thread_stride_ = 0;
int thread_count_ = 0;
bool vec_matmul_ = false;
- float *bias_ptr_ = nullptr;
- float *batch_a_ptr_ = nullptr;
- float *batch_b_ptr_ = nullptr;
- float *batch_c_ptr_ = nullptr;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_MATMUL_FP32_BASE_H_
