|
|
|
@@ -29,15 +29,15 @@ MatmulCPUKernel::~MatmulCPUKernel() { FreeTmpBuffer(); } |
|
|
|
|
|
|
|
void MatmulCPUKernel::FreeTmpBuffer() { |
|
|
|
if (a_c12_ptr_ != nullptr) { |
|
|
|
ctx_->allocator->Free(a_c12_ptr_); |
|
|
|
free(a_c12_ptr_); |
|
|
|
a_c12_ptr_ = nullptr; |
|
|
|
} |
|
|
|
if (b_r8_ptr_ != nullptr) { |
|
|
|
ctx_->allocator->Free(b_r8_ptr_); |
|
|
|
free(b_r8_ptr_); |
|
|
|
b_r8_ptr_ = nullptr; |
|
|
|
} |
|
|
|
if (bias_ptr_ != nullptr) { |
|
|
|
ctx_->allocator->Free(bias_ptr_); |
|
|
|
free(bias_ptr_); |
|
|
|
bias_ptr_ = nullptr; |
|
|
|
} |
|
|
|
} |
|
|
|
@@ -67,23 +67,28 @@ int MatmulCPUKernel::ReSize() { |
|
|
|
thread_count_ = MSMIN(thread_count_, UP_DIV(params_->col_8_, 8)); |
|
|
|
thread_stride_ = UP_DIV(UP_DIV(params_->col_8_, 8), thread_count_); |
|
|
|
|
|
|
|
a_c12_ptr_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(params_->row_12_ * params_->deep_ * sizeof(float))); |
|
|
|
a_c12_ptr_ = reinterpret_cast<float *>(malloc(params_->batch * params_->row_12_ * params_->deep_ * sizeof(float))); |
|
|
|
if (a_c12_ptr_ == nullptr) { |
|
|
|
FreeTmpBuffer(); |
|
|
|
return RET_MEMORY_FAILED; |
|
|
|
} |
|
|
|
memset(a_c12_ptr_, 0, params_->row_12_ * params_->deep_ * sizeof(float)); |
|
|
|
b_r8_ptr_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(params_->col_8_ * params_->deep_ * sizeof(float))); |
|
|
|
|
|
|
|
b_r8_ptr_ = reinterpret_cast<float *>(malloc(params_->batch * params_->col_8_ * params_->deep_ * sizeof(float))); |
|
|
|
if (b_r8_ptr_ == nullptr) { |
|
|
|
FreeTmpBuffer(); |
|
|
|
return RET_MEMORY_FAILED; |
|
|
|
} |
|
|
|
memset(b_r8_ptr_, 0, params_->col_8_ * params_->deep_ * sizeof(float)); |
|
|
|
|
|
|
|
params_->a_const_ = false; |
|
|
|
params_->b_const_ = false; |
|
|
|
InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->Data()), a_c12_ptr_); |
|
|
|
InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->Data()), b_r8_ptr_); |
|
|
|
params_->a_const_ = (in_tensors_[0]->Data() != nullptr); |
|
|
|
params_->b_const_ = (in_tensors_[1]->Data() != nullptr); |
|
|
|
if (params_->a_const_ == true) { |
|
|
|
InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->Data()), a_c12_ptr_); |
|
|
|
} |
|
|
|
if (params_->b_const_ == true) { |
|
|
|
InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->Data()), b_r8_ptr_); |
|
|
|
} |
|
|
|
|
|
|
|
bias_ptr_ = reinterpret_cast<float *>(malloc(params_->col_8_ * sizeof(float))); |
|
|
|
if (bias_ptr_ == nullptr) { |
|
|
|
@@ -99,35 +104,27 @@ int MatmulCPUKernel::ReSize() { |
|
|
|
} |
|
|
|
|
|
|
|
void MatmulCPUKernel::InitMatrixA(float *src_ptr, float *dst_ptr) { |
|
|
|
if (params_->a_const_ == true) { |
|
|
|
return; |
|
|
|
} |
|
|
|
if (src_ptr == nullptr) { |
|
|
|
return; |
|
|
|
} |
|
|
|
params_->a_const_ = true; |
|
|
|
|
|
|
|
if (params_->a_transpose_) { |
|
|
|
RowMajor2Row12Major(src_ptr, dst_ptr, params_->deep_, params_->row_); |
|
|
|
} else { |
|
|
|
RowMajor2Col12Major(src_ptr, dst_ptr, params_->row_, params_->deep_); |
|
|
|
for (int i = 0; i < params_->batch; i++) { |
|
|
|
float *src = src_ptr + i * params_->deep_ * params_->row_; |
|
|
|
float *dst = dst_ptr + i * params_->deep_ * params_->row_12_; |
|
|
|
if (params_->a_transpose_) { |
|
|
|
RowMajor2Row12Major(src, dst, params_->deep_, params_->row_); |
|
|
|
} else { |
|
|
|
RowMajor2Col12Major(src, dst, params_->row_, params_->deep_); |
|
|
|
} |
|
|
|
} |
|
|
|
return; |
|
|
|
} |
|
|
|
|
|
|
|
void MatmulCPUKernel::InitMatrixB(float *src_ptr, float *dst_ptr) { |
|
|
|
if (params_->b_const_ == true) { |
|
|
|
return; |
|
|
|
} |
|
|
|
if (src_ptr == nullptr) { |
|
|
|
return; |
|
|
|
} |
|
|
|
params_->b_const_ = true; |
|
|
|
|
|
|
|
if (params_->b_transpose_) { |
|
|
|
RowMajor2Col8Major(src_ptr, dst_ptr, params_->col_, params_->deep_); |
|
|
|
} else { |
|
|
|
RowMajor2Row8Major(src_ptr, dst_ptr, params_->deep_, params_->col_); |
|
|
|
for (int i = 0; i < params_->batch; i++) { |
|
|
|
float *src = src_ptr + i * params_->deep_ * params_->col_; |
|
|
|
float *dst = dst_ptr + i * params_->deep_ * params_->col_8_; |
|
|
|
if (params_->b_transpose_) { |
|
|
|
RowMajor2Col8Major(src, dst, params_->col_, params_->deep_); |
|
|
|
} else { |
|
|
|
RowMajor2Row8Major(src, dst, params_->deep_, params_->col_); |
|
|
|
} |
|
|
|
} |
|
|
|
return; |
|
|
|
} |
|
|
|
@@ -144,8 +141,8 @@ int MatmulCPUKernel::RunImpl(int task_id) { |
|
|
|
if (cur_oc <= 0) { |
|
|
|
return RET_OK; |
|
|
|
} |
|
|
|
MatMulOpt(a_c12_ptr_, b_r8_ptr_ + task_id * thread_stride_ * C8NUM * params_->deep_, |
|
|
|
c_r_ptr_ + task_id * thread_stride_ * C8NUM, bias_ptr_ + task_id * thread_stride_ * C8NUM, ActType_No, |
|
|
|
MatMulOpt(a_ptr_, b_ptr_ + task_id * thread_stride_ * C8NUM * params_->deep_, |
|
|
|
c_ptr_ + task_id * thread_stride_ * C8NUM, bias_ptr_ + task_id * thread_stride_ * C8NUM, ActType_No, |
|
|
|
params_->deep_, params_->row_, cur_oc, params_->col_, OutType_Nhwc); |
|
|
|
return RET_OK; |
|
|
|
} |
|
|
|
@@ -166,20 +163,21 @@ int MatmulCPUKernel::Run() { |
|
|
|
MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret; |
|
|
|
return prepare_ret; |
|
|
|
} |
|
|
|
auto a_ptr = reinterpret_cast<float *>(in_tensors_[0]->Data()); |
|
|
|
auto b_ptr = reinterpret_cast<float *>(in_tensors_[1]->Data()); |
|
|
|
auto c_ptr = reinterpret_cast<float *>(out_tensors_[0]->Data()); |
|
|
|
auto a_stride = params_->row_ * params_->deep_; |
|
|
|
auto b_stride = params_->deep_ * params_->col_; |
|
|
|
auto c_stride = params_->row_ * params_->col_; |
|
|
|
for (int i = 0; i < params_->batch; ++i) { |
|
|
|
auto cur_a_ptr = a_ptr + i * a_stride; |
|
|
|
auto cur_b_ptr = b_ptr + i * b_stride; |
|
|
|
c_r_ptr_ = c_ptr + i * c_stride; |
|
|
|
auto a_src = reinterpret_cast<float *>(in_tensors_[0]->Data()); |
|
|
|
auto b_src = reinterpret_cast<float *>(in_tensors_[1]->Data()); |
|
|
|
auto c_src = reinterpret_cast<float *>(out_tensors_[0]->Data()); |
|
|
|
|
|
|
|
InitMatrixA(cur_a_ptr, a_c12_ptr_); |
|
|
|
InitMatrixB(cur_b_ptr, b_r8_ptr_); |
|
|
|
if (params_->a_const_ == false) { |
|
|
|
InitMatrixA(a_src, a_c12_ptr_); |
|
|
|
} |
|
|
|
if (params_->b_const_ == false) { |
|
|
|
InitMatrixB(b_src, b_r8_ptr_); |
|
|
|
} |
|
|
|
|
|
|
|
for (int i = 0; i < params_->batch; ++i) { |
|
|
|
a_ptr_ = a_c12_ptr_ + i * params_->row_12_ * params_->deep_; |
|
|
|
b_ptr_ = b_r8_ptr_ + i * params_->deep_ * params_->col_8_; |
|
|
|
c_ptr_ = c_src + i * params_->row_ * params_->col_; |
|
|
|
LiteBackendParallelLaunch(MatmulFloatRun, this, thread_count_); |
|
|
|
} |
|
|
|
return RET_OK; |
|
|
|
|