Browse Source

!8268 [MSLITE][Develop] fix bug of arm cpu fp16 op matmul: init const tensors before judging whether infer shape is done

Merge pull request !8268 from yangruoqi713/lite
tags/v1.1.0
mindspore-ci-bot Gitee 5 years ago
parent
commit
8e284859c7
4 changed files with 141 additions and 55 deletions
  1. +2
    -0
      mindspore/lite/nnacl/assembly/fp16/Float16ToFloat32.S
  2. +2
    -0
      mindspore/lite/nnacl/assembly/fp16/Float32ToFloat16.S
  3. +133
    -55
      mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc
  4. +4
    -0
      mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.h

+ 2
- 0
mindspore/lite/nnacl/assembly/fp16/Float16ToFloat32.S View File

@@ -14,6 +14,8 @@ Float16ToFloat32:
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved
// whereas our coding style do not permit such amount of parameters
cmp x2, #0
beq LoopEnd
cmp x2, #64
blt Loop
Loop64:


+ 2
- 0
mindspore/lite/nnacl/assembly/fp16/Float32ToFloat16.S View File

@@ -14,6 +14,8 @@ Float32ToFloat16:
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved
// whereas our coding style do not permit such amount of parameters
cmp x2, #0
beq LoopEnd
cmp x2, #64
blt Loop
Loop64:


+ 133
- 55
mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc View File

@@ -34,98 +34,119 @@ MatmulFP16CPUKernel::~MatmulFP16CPUKernel() { FreeTmpBuffer(); }

// Releases every temporary buffer owned by the kernel (packed A/B matrices,
// bias, and the fp16 output staging buffer) and resets the pointers so a
// later ReSize() can re-allocate them safely.
// NOTE(review): this section looks like a rendered diff whose removed
// (ctx_->allocator->Free) and added (free) lines are interleaved — each
// pointer is released twice, which would be a double-free if compiled
// literally. Confirm against the real file; the commit switches these
// buffers from the allocator to plain malloc/free.
void MatmulFP16CPUKernel::FreeTmpBuffer() {
if (a_pack_ptr_ != nullptr) {
ctx_->allocator->Free(a_pack_ptr_);
free(a_pack_ptr_);
a_pack_ptr_ = nullptr;
}
if (b_pack_ptr_ != nullptr) {
ctx_->allocator->Free(b_pack_ptr_);
free(b_pack_ptr_);
b_pack_ptr_ = nullptr;
}
if (bias_ptr_ != nullptr) {
ctx_->allocator->Free(bias_ptr_);
free(bias_ptr_);
bias_ptr_ = nullptr;
}
// output_ptr_ stays allocator-managed (see MallocFp16Output / Run).
if (output_ptr_ != nullptr) {
ctx_->allocator->Free(output_ptr_);
output_ptr_ = nullptr;
}
}

// NOTE(review): the three lines below appear to be leftover removed lines of
// the old ReSize() from a rendered diff; they would not compile ahead of the
// new function signature that follows — confirm against the real file.
int MatmulFP16CPUKernel::ReSize() {
FreeTmpBuffer();
int batch = 1;
// Computes the packed-A geometry (batch, row_, deep_, row_16_) from the input
// shapes, validates the optional bias, and allocates the zero-initialized
// packed buffer for matrix A.
int MatmulFP16CPUKernel::MallocMatrixABuffer() {
auto a_shape = in_tensors_[0]->shape();
auto c_shape = out_tensors_[0]->shape();
// With three inputs, the third is the bias; its last dimension must match
// the output's column count.
if (in_tensors_.size() == 3) {
auto bias_shape = in_tensors_[2]->shape();
if (bias_shape[bias_shape.size() - 1] != c_shape[c_shape.size() - 1]) {
MS_LOG(ERROR) << "The bias' dimension is not equal with column";
return RET_INPUT_TENSOR_ERROR;
}
}

// All leading dimensions before the last two form the batch.
int batch = 1;
for (size_t i = 0; i < a_shape.size() - 2; ++i) {
batch *= a_shape[i];
}
params_->batch = batch;
// NOTE(review): the old (c_shape-based) and new (a_shape-based, transpose-
// aware) row_/col_ assignments appear interleaved below, as do the old
// allocator-based and new malloc-based allocations — diff-rendering
// artifact; confirm which lines survive in the real file.
params_->row_ = c_shape[c_shape.size() - 2];
params_->col_ = c_shape[c_shape.size() - 1];
params_->row_ = params_->a_transpose_ ? a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2];
params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1];
// Pad row to a multiple of 16 and col to a multiple of 8 for the packed layout.
params_->row_16_ = UP_ROUND(params_->row_, C16NUM);
params_->col_8_ = UP_ROUND(params_->col_, C8NUM);
thread_count_ = MSMIN(thread_count_, UP_DIV(params_->col_, C8NUM));
thread_stride_ = UP_DIV(UP_DIV(params_->col_, C8NUM), thread_count_) * C8NUM;

a_pack_ptr_ = reinterpret_cast<float16_t *>(
ctx_->allocator->Malloc(params_->batch * params_->row_16_ * params_->deep_ * sizeof(float16_t)));
a_pack_ptr_ =
reinterpret_cast<float16_t *>(malloc(params_->batch * params_->row_16_ * params_->deep_ * sizeof(float16_t)));
if (a_pack_ptr_ == nullptr) {
FreeTmpBuffer();
return RET_MEMORY_FAILED;
}
// Zero the whole buffer so the row/col padding does not contain garbage.
memset(a_pack_ptr_, 0, params_->batch * params_->row_16_ * params_->deep_ * sizeof(float16_t));
return RET_OK;
}

// Computes the packed-B geometry (batch, col_, col_8_, deep_) from the second
// input's shape and allocates the zero-initialized packed buffer for matrix B.
// Returns RET_OK immediately when B's shape is not yet known (empty shape).
int MatmulFP16CPUKernel::MallocMatrixBBuffer() {
auto b_shape = in_tensors_[1]->shape();
if (b_shape.empty()) {
return RET_OK;
}
// All leading dimensions before the last two form the batch.
int batch = 1;
for (size_t i = 0; i < b_shape.size() - 2; ++i) {
batch *= b_shape[i];
}
params_->batch = batch;
// col_/deep_ depend on whether B is stored transposed.
params_->col_ = params_->b_transpose_ ? b_shape[b_shape.size() - 2] : b_shape[b_shape.size() - 1];
params_->col_8_ = UP_ROUND(params_->col_, 8);
params_->deep_ = params_->b_transpose_ ? b_shape[b_shape.size() - 1] : b_shape[b_shape.size() - 2];

// NOTE(review): the old allocator-based and new malloc-based allocations
// appear interleaved below — diff-rendering artifact; the commit keeps the
// malloc version. Confirm against the real file.
b_pack_ptr_ = reinterpret_cast<float16_t *>(
ctx_->allocator->Malloc(params_->batch * params_->col_8_ * params_->deep_ * sizeof(float16_t)));
b_pack_ptr_ =
reinterpret_cast<float16_t *>(malloc(params_->batch * params_->col_8_ * params_->deep_ * sizeof(float16_t)));
if (b_pack_ptr_ == nullptr) {
FreeTmpBuffer();
return RET_MEMORY_FAILED;
}
// Zero the whole buffer so the column padding does not contain garbage.
memset(b_pack_ptr_, 0, params_->batch * params_->col_8_ * params_->deep_ * sizeof(float16_t));
// Re-derive the per-thread work split now that col_ is known.
thread_count_ = MSMIN(thread_count_, UP_DIV(params_->col_, C8NUM));
thread_stride_ = UP_DIV(UP_DIV(params_->col_, C8NUM), thread_count_) * C8NUM;
return RET_OK;
}

params_->a_const_ = (in_tensors_[0]->data_c() != nullptr);
params_->b_const_ = (in_tensors_[1]->data_c() != nullptr);
if (params_->a_const_ == true) {
if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->data_c()), a_pack_ptr_);
} else {
InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_[0]->data_c()), a_pack_ptr_);
}
}
if (params_->b_const_ == true) {
if (in_tensors_[1]->data_type() == kNumberTypeFloat32) {
InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_);
} else {
InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_[1]->data_c()), b_pack_ptr_);
}
}

// Allocates a col_8-padded fp16 bias buffer and fills it from the third
// (float32) input tensor. No-op when the kernel has no bias input.
int MatmulFP16CPUKernel::InitBias() {
if (in_tensors_.size() == 3) {
// NOTE(review): old and new lines appear interleaved in this section —
// the allocator-based Malloc, the in_tensors_[1]-based shape check, and
// the params_-based memset/convert look like removed lines of the old
// code, superseded by the malloc/col_8-local versions below. Note the
// old check reads the bias shape from in_tensors_[1] while the new
// validation (in MallocMatrixABuffer) uses in_tensors_[2] — presumably
// part of the bug being fixed; confirm against the real file.
bias_ptr_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(params_->col_8_ * sizeof(float16_t)));
auto c_shape = out_tensors_[0]->shape();
auto bias_shape = in_tensors_[1]->shape();
if (bias_shape[bias_shape.size() - 1] != c_shape[c_shape.size() - 1]) {
MS_LOG(ERROR) << "The bias'dimension is not equal with colum";
FreeTmpBuffer();
return RET_INPUT_TENSOR_ERROR;
}
// Column count comes from the output shape; pad to a multiple of 8.
auto col = c_shape[c_shape.size() - 1];
auto col_8 = UP_ROUND(col, 8);
bias_ptr_ = reinterpret_cast<float16_t *>(malloc(col_8 * sizeof(float16_t)));
if (bias_ptr_ == nullptr) {
FreeTmpBuffer();
return RET_MEMORY_FAILED;
}
memset(bias_ptr_, 0, params_->col_8_ * sizeof(float16_t));
Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->data_c()), bias_ptr_, params_->col_);
// Zero first so the pad tail is clean, then convert the fp32 bias to fp16.
memset(bias_ptr_, 0, col_8 * sizeof(float16_t));
Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->data_c()), bias_ptr_, col);
}
return RET_OK;
}

if (out_tensors_[0]->data_type() == kNumberTypeFloat32) {
output_ptr_ = reinterpret_cast<float16_t *>(
ctx_->allocator->Malloc(params_->batch * params_->row_ * params_->col_ * sizeof(float16_t)));
if (output_ptr_ == nullptr) {
MS_LOG(ERROR) << "malloc output_ptr_ failed.";
return RET_MEMORY_FAILED;
// Re-allocates the kernel's packed buffers after a (possible) shape change.
// The A/B packed buffers are rebuilt only when the corresponding tensor is
// not a constant whose shape was already known at Init() time; the bias
// buffer is always dropped and rebuilt.
int MatmulFP16CPUKernel::ReSize() {
  const bool reuse_a = params_->a_const_ && params_->a_init_shape_;
  if (!reuse_a) {
    // Drop the stale packed-A buffer before re-allocating with the new shape.
    if (a_pack_ptr_ != nullptr) {
      free(a_pack_ptr_);
      a_pack_ptr_ = nullptr;
    }
    if (MallocMatrixABuffer() != RET_OK) {
      MS_LOG(ERROR) << "Matmul fp16 malloc matrix a buffer failed";
      return RET_ERROR;
    }
  }

  const bool reuse_b = params_->b_const_ && params_->b_init_shape_;
  if (!reuse_b) {
    // Same treatment for the packed-B buffer.
    if (b_pack_ptr_ != nullptr) {
      free(b_pack_ptr_);
      b_pack_ptr_ = nullptr;
    }
    if (MallocMatrixBBuffer() != RET_OK) {
      MS_LOG(ERROR) << "Matmul fp16 malloc matrix b buffer failed";
      return RET_ERROR;
    }
  }

  // The bias buffer is unconditionally rebuilt from the (fp32) bias input.
  if (bias_ptr_ != nullptr) {
    free(bias_ptr_);
    bias_ptr_ = nullptr;
  }
  if (InitBias() != RET_OK) {
    MS_LOG(ERROR) << "Matmul fp16 init bias failed";
    return RET_ERROR;
  }
  return RET_OK;
}
@@ -179,10 +200,61 @@ void MatmulFP16CPUKernel::InitMatrixB(float16_t *b_ptr, float16_t *b_pack_ptr) {
}

// One-time kernel setup: records whether each input's shape/data is already
// available, allocates packed buffers for shape-known inputs, and pre-packs
// any constant input tensors so Run() does not repack them every invocation.
// This is the core of the fix described in the commit title: constant
// tensors are initialized BEFORE checking whether infer shape is done.
int MatmulFP16CPUKernel::Init() {
// A shape of size 0 means infer shape has not produced this tensor yet.
params_->a_init_shape_ = (in_tensors_[0]->shape().size() != 0);
params_->b_init_shape_ = (in_tensors_[1]->shape().size() != 0);
if (params_->a_init_shape_ == true) {
auto ret = MallocMatrixABuffer();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Matmul fp16 malloc matrix a buffer failed";
return RET_ERROR;
}
}
if (params_->b_init_shape_ == true) {
auto ret = MallocMatrixBBuffer();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Matmul fp16 malloc matrix b buffer failed";
return RET_ERROR;
}
}

// A tensor with data already attached is a constant (weight); pack it once
// here. Both fp32 and fp16 source data are supported.
params_->a_const_ = (in_tensors_[0]->data_c() != nullptr);
params_->b_const_ = (in_tensors_[1]->data_c() != nullptr);
if (params_->a_const_ == true) {
if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->data_c()), a_pack_ptr_);
} else {
InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_[0]->data_c()), a_pack_ptr_);
}
}
if (params_->b_const_ == true) {
if (in_tensors_[1]->data_type() == kNumberTypeFloat32) {
InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_);
} else {
InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_[1]->data_c()), b_pack_ptr_);
}
}

if (!InferShapeDone()) {
return RET_OK;
}
// NOTE(review): `return ReSize();` followed by more statements would make
// the code below unreachable — this looks like the removed old line of a
// rendered diff, replaced by the InitBias() path below. Confirm against
// the real file.
return ReSize();
auto ret = InitBias();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Matmul fp16 init bias failed";
return RET_ERROR;
}
return RET_OK;
}

// When the output tensor is fp32, allocates a temporary fp16 staging buffer
// (via the context allocator) that receives the kernel result before Run()
// converts it back to fp32. No-op for fp16 outputs, which write in place.
int MatmulFP16CPUKernel::MallocFp16Output() {
  if (out_tensors_[0]->data_type() != kNumberTypeFloat32) {
    return RET_OK;
  }
  const auto elem_count = params_->batch * params_->row_ * params_->col_;
  output_ptr_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(elem_count * sizeof(float16_t)));
  if (output_ptr_ == nullptr) {
    MS_LOG(ERROR) << "malloc output_ptr_ failed.";
    return RET_MEMORY_FAILED;
  }
  return RET_OK;
}

int MatmulFP16CPUKernel::RunImpl(int task_id) {
@@ -211,6 +283,11 @@ int MatmulFP16Run(void *cdata, int task_id) {

int MatmulFP16CPUKernel::Run() {
auto out_tensor = out_tensors_[0];
auto ret = MallocFp16Output();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Matmul MallocFp16Output failed";
return RET_ERROR;
}
float16_t *c_ptr = nullptr;
if (out_tensor->data_type() == kNumberTypeFloat32) {
c_ptr = output_ptr_;
@@ -241,6 +318,7 @@ int MatmulFP16CPUKernel::Run() {
auto size = out_tensor->ElementsNum();
auto out_tensor_data = reinterpret_cast<float *>(out_tensor->data_c());
Float16ToFloat32(output_ptr_, out_tensor_data, size);
ctx_->allocator->Free(output_ptr_);
}
return RET_OK;
}


+ 4
- 0
mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.h View File

@@ -39,6 +39,10 @@ class MatmulFP16CPUKernel : public MatmulBaseCPUKernel {
int RunImpl(int task_id);

private:
int MallocMatrixABuffer();
int MallocMatrixBBuffer();
int InitBias();
int MallocFp16Output();
void InitMatrixA(float *a_ptr, float16_t *a_pack_ptr);
void InitMatrixA(float16_t *a_ptr, float16_t *a_pack_ptr);
void InitMatrixB(float *b_ptr, float16_t *b_pack_ptr);


Loading…
Cancel
Save