|
|
|
@@ -34,98 +34,119 @@ MatmulFP16CPUKernel::~MatmulFP16CPUKernel() { FreeTmpBuffer(); } |
|
|
|
|
|
|
|
void MatmulFP16CPUKernel::FreeTmpBuffer() { |
|
|
|
if (a_pack_ptr_ != nullptr) { |
|
|
|
ctx_->allocator->Free(a_pack_ptr_); |
|
|
|
free(a_pack_ptr_); |
|
|
|
a_pack_ptr_ = nullptr; |
|
|
|
} |
|
|
|
if (b_pack_ptr_ != nullptr) { |
|
|
|
ctx_->allocator->Free(b_pack_ptr_); |
|
|
|
free(b_pack_ptr_); |
|
|
|
b_pack_ptr_ = nullptr; |
|
|
|
} |
|
|
|
if (bias_ptr_ != nullptr) { |
|
|
|
ctx_->allocator->Free(bias_ptr_); |
|
|
|
free(bias_ptr_); |
|
|
|
bias_ptr_ = nullptr; |
|
|
|
} |
|
|
|
if (output_ptr_ != nullptr) { |
|
|
|
ctx_->allocator->Free(output_ptr_); |
|
|
|
output_ptr_ = nullptr; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
int MatmulFP16CPUKernel::ReSize() { |
|
|
|
FreeTmpBuffer(); |
|
|
|
int batch = 1; |
|
|
|
int MatmulFP16CPUKernel::MallocMatrixABuffer() { |
|
|
|
auto a_shape = in_tensors_[0]->shape(); |
|
|
|
auto c_shape = out_tensors_[0]->shape(); |
|
|
|
if (in_tensors_.size() == 3) { |
|
|
|
auto bias_shape = in_tensors_[2]->shape(); |
|
|
|
if (bias_shape[bias_shape.size() - 1] != c_shape[c_shape.size() - 1]) { |
|
|
|
MS_LOG(ERROR) << "The bias' dimension is not equal with column"; |
|
|
|
return RET_INPUT_TENSOR_ERROR; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
int batch = 1; |
|
|
|
for (size_t i = 0; i < a_shape.size() - 2; ++i) { |
|
|
|
batch *= a_shape[i]; |
|
|
|
} |
|
|
|
params_->batch = batch; |
|
|
|
params_->row_ = c_shape[c_shape.size() - 2]; |
|
|
|
params_->col_ = c_shape[c_shape.size() - 1]; |
|
|
|
params_->row_ = params_->a_transpose_ ? a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2]; |
|
|
|
params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1]; |
|
|
|
params_->row_16_ = UP_ROUND(params_->row_, C16NUM); |
|
|
|
params_->col_8_ = UP_ROUND(params_->col_, C8NUM); |
|
|
|
thread_count_ = MSMIN(thread_count_, UP_DIV(params_->col_, C8NUM)); |
|
|
|
thread_stride_ = UP_DIV(UP_DIV(params_->col_, C8NUM), thread_count_) * C8NUM; |
|
|
|
|
|
|
|
a_pack_ptr_ = reinterpret_cast<float16_t *>( |
|
|
|
ctx_->allocator->Malloc(params_->batch * params_->row_16_ * params_->deep_ * sizeof(float16_t))); |
|
|
|
a_pack_ptr_ = |
|
|
|
reinterpret_cast<float16_t *>(malloc(params_->batch * params_->row_16_ * params_->deep_ * sizeof(float16_t))); |
|
|
|
if (a_pack_ptr_ == nullptr) { |
|
|
|
FreeTmpBuffer(); |
|
|
|
return RET_MEMORY_FAILED; |
|
|
|
} |
|
|
|
memset(a_pack_ptr_, 0, params_->batch * params_->row_16_ * params_->deep_ * sizeof(float16_t)); |
|
|
|
return RET_OK; |
|
|
|
} |
|
|
|
|
|
|
|
int MatmulFP16CPUKernel::MallocMatrixBBuffer() { |
|
|
|
auto b_shape = in_tensors_[1]->shape(); |
|
|
|
if (b_shape.empty()) { |
|
|
|
return RET_OK; |
|
|
|
} |
|
|
|
int batch = 1; |
|
|
|
for (size_t i = 0; i < b_shape.size() - 2; ++i) { |
|
|
|
batch *= b_shape[i]; |
|
|
|
} |
|
|
|
params_->batch = batch; |
|
|
|
params_->col_ = params_->b_transpose_ ? b_shape[b_shape.size() - 2] : b_shape[b_shape.size() - 1]; |
|
|
|
params_->col_8_ = UP_ROUND(params_->col_, 8); |
|
|
|
params_->deep_ = params_->b_transpose_ ? b_shape[b_shape.size() - 1] : b_shape[b_shape.size() - 2]; |
|
|
|
|
|
|
|
b_pack_ptr_ = reinterpret_cast<float16_t *>( |
|
|
|
ctx_->allocator->Malloc(params_->batch * params_->col_8_ * params_->deep_ * sizeof(float16_t))); |
|
|
|
b_pack_ptr_ = |
|
|
|
reinterpret_cast<float16_t *>(malloc(params_->batch * params_->col_8_ * params_->deep_ * sizeof(float16_t))); |
|
|
|
if (b_pack_ptr_ == nullptr) { |
|
|
|
FreeTmpBuffer(); |
|
|
|
return RET_MEMORY_FAILED; |
|
|
|
} |
|
|
|
memset(b_pack_ptr_, 0, params_->batch * params_->col_8_ * params_->deep_ * sizeof(float16_t)); |
|
|
|
thread_count_ = MSMIN(thread_count_, UP_DIV(params_->col_, C8NUM)); |
|
|
|
thread_stride_ = UP_DIV(UP_DIV(params_->col_, C8NUM), thread_count_) * C8NUM; |
|
|
|
return RET_OK; |
|
|
|
} |
|
|
|
|
|
|
|
params_->a_const_ = (in_tensors_[0]->data_c() != nullptr); |
|
|
|
params_->b_const_ = (in_tensors_[1]->data_c() != nullptr); |
|
|
|
if (params_->a_const_ == true) { |
|
|
|
if (in_tensors_[0]->data_type() == kNumberTypeFloat32) { |
|
|
|
InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->data_c()), a_pack_ptr_); |
|
|
|
} else { |
|
|
|
InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_[0]->data_c()), a_pack_ptr_); |
|
|
|
} |
|
|
|
} |
|
|
|
if (params_->b_const_ == true) { |
|
|
|
if (in_tensors_[1]->data_type() == kNumberTypeFloat32) { |
|
|
|
InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_); |
|
|
|
} else { |
|
|
|
InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_[1]->data_c()), b_pack_ptr_); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
int MatmulFP16CPUKernel::InitBias() { |
|
|
|
if (in_tensors_.size() == 3) { |
|
|
|
bias_ptr_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(params_->col_8_ * sizeof(float16_t))); |
|
|
|
auto c_shape = out_tensors_[0]->shape(); |
|
|
|
auto bias_shape = in_tensors_[1]->shape(); |
|
|
|
if (bias_shape[bias_shape.size() - 1] != c_shape[c_shape.size() - 1]) { |
|
|
|
MS_LOG(ERROR) << "The bias'dimension is not equal with colum"; |
|
|
|
FreeTmpBuffer(); |
|
|
|
return RET_INPUT_TENSOR_ERROR; |
|
|
|
} |
|
|
|
auto col = c_shape[c_shape.size() - 1]; |
|
|
|
auto col_8 = UP_ROUND(col, 8); |
|
|
|
bias_ptr_ = reinterpret_cast<float16_t *>(malloc(col_8 * sizeof(float16_t))); |
|
|
|
if (bias_ptr_ == nullptr) { |
|
|
|
FreeTmpBuffer(); |
|
|
|
return RET_MEMORY_FAILED; |
|
|
|
} |
|
|
|
memset(bias_ptr_, 0, params_->col_8_ * sizeof(float16_t)); |
|
|
|
Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->data_c()), bias_ptr_, params_->col_); |
|
|
|
memset(bias_ptr_, 0, col_8 * sizeof(float16_t)); |
|
|
|
Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->data_c()), bias_ptr_, col); |
|
|
|
} |
|
|
|
return RET_OK; |
|
|
|
} |
|
|
|
|
|
|
|
if (out_tensors_[0]->data_type() == kNumberTypeFloat32) { |
|
|
|
output_ptr_ = reinterpret_cast<float16_t *>( |
|
|
|
ctx_->allocator->Malloc(params_->batch * params_->row_ * params_->col_ * sizeof(float16_t))); |
|
|
|
if (output_ptr_ == nullptr) { |
|
|
|
MS_LOG(ERROR) << "malloc output_ptr_ failed."; |
|
|
|
return RET_MEMORY_FAILED; |
|
|
|
int MatmulFP16CPUKernel::ReSize() { |
|
|
|
if (params_->a_const_ == false || params_->a_init_shape_ == false) { |
|
|
|
if (a_pack_ptr_ != nullptr) { |
|
|
|
free(a_pack_ptr_); |
|
|
|
a_pack_ptr_ = nullptr; |
|
|
|
} |
|
|
|
auto ret = MallocMatrixABuffer(); |
|
|
|
if (ret != RET_OK) { |
|
|
|
MS_LOG(ERROR) << "Matmul fp16 malloc matrix a buffer failed"; |
|
|
|
return RET_ERROR; |
|
|
|
} |
|
|
|
} |
|
|
|
if (params_->b_const_ == false || params_->b_init_shape_ == false) { |
|
|
|
if (b_pack_ptr_ != nullptr) { |
|
|
|
free(b_pack_ptr_); |
|
|
|
b_pack_ptr_ = nullptr; |
|
|
|
} |
|
|
|
auto ret = MallocMatrixBBuffer(); |
|
|
|
if (ret != RET_OK) { |
|
|
|
MS_LOG(ERROR) << "Matmul fp16 malloc matrix b buffer failed"; |
|
|
|
return RET_ERROR; |
|
|
|
} |
|
|
|
} |
|
|
|
if (bias_ptr_ != nullptr) { |
|
|
|
free(bias_ptr_); |
|
|
|
bias_ptr_ = nullptr; |
|
|
|
} |
|
|
|
auto ret = InitBias(); |
|
|
|
if (ret != RET_OK) { |
|
|
|
MS_LOG(ERROR) << "Matmul fp16 init bias failed"; |
|
|
|
return RET_ERROR; |
|
|
|
} |
|
|
|
return RET_OK; |
|
|
|
} |
|
|
|
@@ -179,10 +200,61 @@ void MatmulFP16CPUKernel::InitMatrixB(float16_t *b_ptr, float16_t *b_pack_ptr) { |
|
|
|
} |
|
|
|
|
|
|
|
int MatmulFP16CPUKernel::Init() { |
|
|
|
params_->a_init_shape_ = (in_tensors_[0]->shape().size() != 0); |
|
|
|
params_->b_init_shape_ = (in_tensors_[1]->shape().size() != 0); |
|
|
|
if (params_->a_init_shape_ == true) { |
|
|
|
auto ret = MallocMatrixABuffer(); |
|
|
|
if (ret != RET_OK) { |
|
|
|
MS_LOG(ERROR) << "Matmul fp16 malloc matrix a buffer failed"; |
|
|
|
return RET_ERROR; |
|
|
|
} |
|
|
|
} |
|
|
|
if (params_->b_init_shape_ == true) { |
|
|
|
auto ret = MallocMatrixBBuffer(); |
|
|
|
if (ret != RET_OK) { |
|
|
|
MS_LOG(ERROR) << "Matmul fp16 malloc matrix b buffer failed"; |
|
|
|
return RET_ERROR; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
params_->a_const_ = (in_tensors_[0]->data_c() != nullptr); |
|
|
|
params_->b_const_ = (in_tensors_[1]->data_c() != nullptr); |
|
|
|
if (params_->a_const_ == true) { |
|
|
|
if (in_tensors_[0]->data_type() == kNumberTypeFloat32) { |
|
|
|
InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->data_c()), a_pack_ptr_); |
|
|
|
} else { |
|
|
|
InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_[0]->data_c()), a_pack_ptr_); |
|
|
|
} |
|
|
|
} |
|
|
|
if (params_->b_const_ == true) { |
|
|
|
if (in_tensors_[1]->data_type() == kNumberTypeFloat32) { |
|
|
|
InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_); |
|
|
|
} else { |
|
|
|
InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_[1]->data_c()), b_pack_ptr_); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
if (!InferShapeDone()) { |
|
|
|
return RET_OK; |
|
|
|
} |
|
|
|
return ReSize(); |
|
|
|
auto ret = InitBias(); |
|
|
|
if (ret != RET_OK) { |
|
|
|
MS_LOG(ERROR) << "Matmul fp16 init bias failed"; |
|
|
|
return RET_ERROR; |
|
|
|
} |
|
|
|
return RET_OK; |
|
|
|
} |
|
|
|
|
|
|
|
int MatmulFP16CPUKernel::MallocFp16Output() { |
|
|
|
if (out_tensors_[0]->data_type() == kNumberTypeFloat32) { |
|
|
|
output_ptr_ = reinterpret_cast<float16_t *>( |
|
|
|
ctx_->allocator->Malloc(params_->batch * params_->row_ * params_->col_ * sizeof(float16_t))); |
|
|
|
if (output_ptr_ == nullptr) { |
|
|
|
MS_LOG(ERROR) << "malloc output_ptr_ failed."; |
|
|
|
return RET_MEMORY_FAILED; |
|
|
|
} |
|
|
|
} |
|
|
|
return RET_OK; |
|
|
|
} |
|
|
|
|
|
|
|
int MatmulFP16CPUKernel::RunImpl(int task_id) { |
|
|
|
@@ -211,6 +283,11 @@ int MatmulFP16Run(void *cdata, int task_id) { |
|
|
|
|
|
|
|
int MatmulFP16CPUKernel::Run() { |
|
|
|
auto out_tensor = out_tensors_[0]; |
|
|
|
auto ret = MallocFp16Output(); |
|
|
|
if (ret != RET_OK) { |
|
|
|
MS_LOG(ERROR) << "Matmul MallocFp16Output failed"; |
|
|
|
return RET_ERROR; |
|
|
|
} |
|
|
|
float16_t *c_ptr = nullptr; |
|
|
|
if (out_tensor->data_type() == kNumberTypeFloat32) { |
|
|
|
c_ptr = output_ptr_; |
|
|
|
@@ -241,6 +318,7 @@ int MatmulFP16CPUKernel::Run() { |
|
|
|
auto size = out_tensor->ElementsNum(); |
|
|
|
auto out_tensor_data = reinterpret_cast<float *>(out_tensor->data_c()); |
|
|
|
Float16ToFloat32(output_ptr_, out_tensor_data, size); |
|
|
|
ctx_->allocator->Free(output_ptr_); |
|
|
|
} |
|
|
|
return RET_OK; |
|
|
|
} |
|
|
|
|