|
|
@@ -15,123 +15,230 @@ |
|
|
*/ |
|
|
*/ |
|
|
|
|
|
|
|
|
#include "src/runtime/kernel/arm/int8/fullconnection_int8.h" |
|
|
#include "src/runtime/kernel/arm/int8/fullconnection_int8.h" |
|
|
#include "nnacl/int8/matmul_int8.h" |
|
|
|
|
|
#include "nnacl/common_func.h" |
|
|
|
|
|
#include "src/runtime/runtime_api.h" |
|
|
#include "src/runtime/runtime_api.h" |
|
|
#include "include/errorcode.h" |
|
|
|
|
|
#include "src/kernel_registry.h" |
|
|
#include "src/kernel_registry.h" |
|
|
|
|
|
|
|
|
using mindspore::lite::RET_MEMORY_FAILED; |
|
|
|
|
|
using mindspore::lite::RET_OK; |
|
|
|
|
|
|
|
|
|
|
|
using mindspore::lite::KernelRegistrar; |
|
|
using mindspore::lite::KernelRegistrar; |
|
|
using mindspore::lite::RET_ERROR; |
|
|
using mindspore::lite::RET_ERROR; |
|
|
|
|
|
using mindspore::lite::RET_MEMORY_FAILED; |
|
|
using mindspore::lite::RET_OK; |
|
|
using mindspore::lite::RET_OK; |
|
|
using mindspore::schema::PrimitiveType_FullConnection; |
|
|
using mindspore::schema::PrimitiveType_FullConnection; |
|
|
|
|
|
|
|
|
namespace mindspore::kernel { |
|
|
namespace mindspore::kernel { |
|
|
|
|
|
void FullconnectionInt8CPUKernel::FreeQuantParam() { |
|
|
|
|
|
if (quant_.filter_scale_ != nullptr) { |
|
|
|
|
|
free(quant_.filter_scale_); |
|
|
|
|
|
quant_.filter_scale_ = nullptr; |
|
|
|
|
|
} |
|
|
|
|
|
if (quant_.filter_zp_ != nullptr) { |
|
|
|
|
|
free(quant_.filter_zp_); |
|
|
|
|
|
quant_.filter_zp_ = nullptr; |
|
|
|
|
|
} |
|
|
|
|
|
if (quant_.left_shift_ != nullptr) { |
|
|
|
|
|
free(quant_.left_shift_); |
|
|
|
|
|
quant_.left_shift_ = nullptr; |
|
|
|
|
|
} |
|
|
|
|
|
if (quant_.right_shift_ != nullptr) { |
|
|
|
|
|
free(quant_.right_shift_); |
|
|
|
|
|
quant_.right_shift_ = nullptr; |
|
|
|
|
|
} |
|
|
|
|
|
if (quant_.quant_multiplier_ != nullptr) { |
|
|
|
|
|
free(quant_.quant_multiplier_); |
|
|
|
|
|
quant_.quant_multiplier_ = nullptr; |
|
|
|
|
|
} |
|
|
|
|
|
return; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
void FullconnectionInt8CPUKernel::FreeTmpBuffer() { |
|
|
|
|
|
if (pack_a_ptr_ != nullptr) { |
|
|
|
|
|
free(pack_a_ptr_); |
|
|
|
|
|
pack_a_ptr_ = nullptr; |
|
|
|
|
|
} |
|
|
|
|
|
if (pack_b_ptr_ != nullptr) { |
|
|
|
|
|
free(pack_b_ptr_); |
|
|
|
|
|
pack_b_ptr_ = nullptr; |
|
|
|
|
|
} |
|
|
|
|
|
if (input_sums_ != nullptr) { |
|
|
|
|
|
free(input_sums_); |
|
|
|
|
|
input_sums_ = nullptr; |
|
|
|
|
|
} |
|
|
|
|
|
if (weight_bias_sums_ != nullptr) { |
|
|
|
|
|
free(weight_bias_sums_); |
|
|
|
|
|
weight_bias_sums_ = nullptr; |
|
|
|
|
|
} |
|
|
|
|
|
if (bias_ptr_ != nullptr) { |
|
|
|
|
|
free(bias_ptr_); |
|
|
|
|
|
bias_ptr_ = nullptr; |
|
|
|
|
|
} |
|
|
|
|
|
return; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
int FullconnectionInt8CPUKernel::MallocQuantParam() { |
|
|
|
|
|
auto weight_tensor = in_tensors_[1]; |
|
|
|
|
|
auto weight_quant_params = weight_tensor->quant_params(); |
|
|
|
|
|
int col = weight_tensor->shape().front(); |
|
|
|
|
|
filter_per_channel_ = (weight_quant_params.size() > 1); |
|
|
|
|
|
|
|
|
|
|
|
int init_size = filter_per_channel_ ? col : 1; |
|
|
|
|
|
|
|
|
|
|
|
quant_.filter_scale_ = reinterpret_cast<float *>(malloc(init_size * sizeof(float))); |
|
|
|
|
|
if (quant_.filter_scale_ == nullptr) { |
|
|
|
|
|
return RET_ERROR; |
|
|
|
|
|
} |
|
|
|
|
|
quant_.filter_zp_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t))); |
|
|
|
|
|
if (quant_.filter_zp_ == nullptr) { |
|
|
|
|
|
return RET_ERROR; |
|
|
|
|
|
} |
|
|
|
|
|
quant_.left_shift_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t))); |
|
|
|
|
|
if (quant_.left_shift_ == nullptr) { |
|
|
|
|
|
return RET_ERROR; |
|
|
|
|
|
} |
|
|
|
|
|
quant_.right_shift_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t))); |
|
|
|
|
|
if (quant_.right_shift_ == nullptr) { |
|
|
|
|
|
return RET_ERROR; |
|
|
|
|
|
} |
|
|
|
|
|
quant_.quant_multiplier_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t))); |
|
|
|
|
|
if (quant_.quant_multiplier_ == nullptr) { |
|
|
|
|
|
return RET_ERROR; |
|
|
|
|
|
} |
|
|
|
|
|
return RET_OK; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
int FullconnectionInt8CPUKernel::Init() { |
|
|
int FullconnectionInt8CPUKernel::Init() { |
|
|
|
|
|
auto ret = MallocQuantParam(); |
|
|
|
|
|
if (ret != RET_OK) { |
|
|
|
|
|
FreeQuantParam(); |
|
|
|
|
|
return ret; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
auto in_quant_params = in_tensors_[0]->quant_params(); |
|
|
|
|
|
quant_.input_.zp_ = in_quant_params.front().zeroPoint; |
|
|
|
|
|
quant_.input_.scale_ = in_quant_params.front().scale; |
|
|
|
|
|
|
|
|
|
|
|
auto out_quant_params = out_tensors_[0]->quant_params(); |
|
|
|
|
|
quant_.output_.zp_ = out_quant_params.front().zeroPoint; |
|
|
|
|
|
quant_.output_.scale_ = out_quant_params.front().scale; |
|
|
|
|
|
|
|
|
|
|
|
auto weight_tensor = in_tensors_[1]; |
|
|
|
|
|
fc_param_->b_const_ = (weight_tensor->data_c() != nullptr); |
|
|
|
|
|
int weight_quant_num = filter_per_channel_ ? weight_tensor->shape().front() : 1; |
|
|
|
|
|
auto weight_quant_params = weight_tensor->quant_params(); |
|
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < weight_quant_num; i++) { |
|
|
|
|
|
quant_.filter_zp_[i] = weight_quant_params[i].zeroPoint; |
|
|
|
|
|
quant_.filter_scale_[i] = weight_quant_params[i].scale; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < weight_quant_num; ++i) { |
|
|
|
|
|
const double in_scale = static_cast<double>(quant_.input_.scale_ * quant_.filter_scale_[i]); |
|
|
|
|
|
double real_multiplier = in_scale / static_cast<double>(quant_.output_.scale_); |
|
|
|
|
|
QuantizeRoundParameter(real_multiplier, &quant_.quant_multiplier_[i], &quant_.left_shift_[i], |
|
|
|
|
|
&quant_.right_shift_[i]); |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
CalculateActivationRangeQuantized(fc_param_->act_type_ == ActType_Relu, fc_param_->act_type_ == ActType_Relu6, |
|
|
|
|
|
quant_.output_.zp_, quant_.output_.scale_, &quant_.out_act_min_, |
|
|
|
|
|
&quant_.out_act_max_); |
|
|
|
|
|
|
|
|
if (!InferShapeDone()) { |
|
|
if (!InferShapeDone()) { |
|
|
return RET_OK; |
|
|
return RET_OK; |
|
|
} |
|
|
} |
|
|
return ReSize(); |
|
|
return ReSize(); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
int FullconnectionInt8CPUKernel::ReSize() { |
|
|
|
|
|
FreeTmpBuffer(); |
|
|
|
|
|
|
|
|
void FullconnectionInt8CPUKernel::InitParam() { |
|
|
int row = 1; |
|
|
int row = 1; |
|
|
for (size_t i = 0; i < out_tensors_[0]->shape().size() - 1; ++i) row *= (out_tensors_[0]->shape())[i]; |
|
|
|
|
|
|
|
|
for (size_t i = 0; i < out_tensors_[0]->shape().size() - 1; ++i) { |
|
|
|
|
|
row *= (out_tensors_[0]->shape())[i]; |
|
|
|
|
|
} |
|
|
fc_param_->row_ = row; |
|
|
fc_param_->row_ = row; |
|
|
fc_param_->col_ = out_tensors_[0]->shape().back(); |
|
|
fc_param_->col_ = out_tensors_[0]->shape().back(); |
|
|
fc_param_->deep_ = (in_tensors_[1]->shape())[1]; |
|
|
fc_param_->deep_ = (in_tensors_[1]->shape())[1]; |
|
|
fc_param_->row_8_ = UP_ROUND(fc_param_->row_, 8); |
|
|
|
|
|
fc_param_->col_8_ = UP_ROUND(fc_param_->col_, 8); |
|
|
|
|
|
|
|
|
|
|
|
r4_ = UP_ROUND(fc_param_->row_, 4); |
|
|
|
|
|
c4_ = UP_ROUND(fc_param_->col_, 4); |
|
|
|
|
|
d16_ = UP_ROUND(fc_param_->deep_, 16); |
|
|
|
|
|
thread_count_ = MSMIN(thread_count_, UP_DIV(c4_, 4)); |
|
|
|
|
|
thread_stride_ = UP_DIV(UP_DIV(c4_, 4), thread_count_); |
|
|
|
|
|
|
|
|
|
|
|
a_r4x16_ptr_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(r4_ * d16_ * sizeof(int8_t))); |
|
|
|
|
|
b_c16x4_ptr_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(c4_ * d16_ * sizeof(int8_t))); |
|
|
|
|
|
input_sums_ = reinterpret_cast<int *>(ctx_->allocator->Malloc(r4_ * sizeof(int))); |
|
|
|
|
|
weight_bias_sums_ = reinterpret_cast<int *>(ctx_->allocator->Malloc(c4_ * sizeof(int))); |
|
|
|
|
|
if (a_r4x16_ptr_ == nullptr || b_c16x4_ptr_ == nullptr || input_sums_ == nullptr || weight_bias_sums_ == nullptr) { |
|
|
|
|
|
MS_LOG(ERROR) << "Memory allocation failed"; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
fc_param_->row_4_ = UP_ROUND(fc_param_->row_, C4NUM); |
|
|
|
|
|
fc_param_->row_8_ = UP_ROUND(fc_param_->row_, C8NUM); |
|
|
|
|
|
fc_param_->col_2_ = UP_ROUND(fc_param_->col_, C2NUM); |
|
|
|
|
|
fc_param_->col_4_ = UP_ROUND(fc_param_->col_, C4NUM); |
|
|
|
|
|
fc_param_->col_8_ = UP_ROUND(fc_param_->col_, C8NUM); |
|
|
|
|
|
fc_param_->col_16_ = UP_ROUND(fc_param_->col_, C16NUM); |
|
|
|
|
|
fc_param_->deep_4_ = UP_ROUND(fc_param_->deep_, C4NUM); |
|
|
|
|
|
fc_param_->deep_16_ = UP_ROUND(fc_param_->deep_, C16NUM); |
|
|
|
|
|
|
|
|
|
|
|
thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(fc_param_->col_4_, C4NUM)); |
|
|
|
|
|
thread_stride_ = UP_DIV(UP_DIV(fc_param_->col_4_, C4NUM), thread_count_); |
|
|
|
|
|
return; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
int FullconnectionInt8CPUKernel::ReSize() { |
|
|
|
|
|
FreeTmpBuffer(); |
|
|
|
|
|
|
|
|
|
|
|
InitParam(); |
|
|
|
|
|
|
|
|
|
|
|
pack_a_ptr_ = reinterpret_cast<int8_t *>(malloc(fc_param_->row_4_ * fc_param_->deep_16_ * sizeof(int8_t))); |
|
|
|
|
|
if (pack_a_ptr_ == nullptr) { |
|
|
|
|
|
FreeTmpBuffer(); |
|
|
|
|
|
return RET_ERROR; |
|
|
|
|
|
} |
|
|
|
|
|
pack_b_ptr_ = reinterpret_cast<int8_t *>(malloc(fc_param_->col_4_ * fc_param_->deep_16_ * sizeof(int8_t))); |
|
|
|
|
|
if (pack_b_ptr_ == nullptr) { |
|
|
|
|
|
FreeTmpBuffer(); |
|
|
|
|
|
return RET_ERROR; |
|
|
|
|
|
} |
|
|
|
|
|
input_sums_ = reinterpret_cast<int *>(malloc(fc_param_->row_4_ * sizeof(int))); |
|
|
|
|
|
if (input_sums_ == nullptr) { |
|
|
FreeTmpBuffer(); |
|
|
FreeTmpBuffer(); |
|
|
return RET_MEMORY_FAILED; |
|
|
|
|
|
|
|
|
return RET_ERROR; |
|
|
|
|
|
} |
|
|
|
|
|
weight_bias_sums_ = reinterpret_cast<int *>(malloc(fc_param_->col_4_ * sizeof(int))); |
|
|
|
|
|
if (weight_bias_sums_ == nullptr) { |
|
|
|
|
|
FreeTmpBuffer(); |
|
|
|
|
|
return RET_ERROR; |
|
|
} |
|
|
} |
|
|
memset(a_r4x16_ptr_, 0, r4_ * d16_ * sizeof(int8_t)); |
|
|
|
|
|
memset(b_c16x4_ptr_, 0, c4_ * d16_ * sizeof(int8_t)); |
|
|
|
|
|
memset(input_sums_, 0, r4_ * sizeof(int)); |
|
|
|
|
|
memset(weight_bias_sums_, 0, c4_ * sizeof(int)); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
memset(pack_a_ptr_, 0, fc_param_->row_4_ * fc_param_->deep_16_ * sizeof(int8_t)); |
|
|
|
|
|
memset(pack_b_ptr_, 0, fc_param_->col_4_ * fc_param_->deep_16_ * sizeof(int8_t)); |
|
|
|
|
|
memset(input_sums_, 0, fc_param_->row_4_ * sizeof(int)); |
|
|
|
|
|
memset(weight_bias_sums_, 0, fc_param_->col_4_ * sizeof(int)); |
|
|
|
|
|
|
|
|
if (in_tensors_.size() == 3) { |
|
|
if (in_tensors_.size() == 3) { |
|
|
auto bias_len = fc_param_->col_8_ * sizeof(int); |
|
|
|
|
|
bias_ptr_ = reinterpret_cast<int *>(ctx_->allocator->Malloc(bias_len)); |
|
|
|
|
|
|
|
|
bias_ptr_ = reinterpret_cast<int *>(malloc(fc_param_->col_4_ * sizeof(int))); |
|
|
if (bias_ptr_ == nullptr) { |
|
|
if (bias_ptr_ == nullptr) { |
|
|
MS_LOG(ERROR) << "Memory allocation failed"; |
|
|
MS_LOG(ERROR) << "Memory allocation failed"; |
|
|
FreeTmpBuffer(); |
|
|
FreeTmpBuffer(); |
|
|
return RET_MEMORY_FAILED; |
|
|
return RET_MEMORY_FAILED; |
|
|
} |
|
|
} |
|
|
memcpy(bias_ptr_, in_tensors_[2]->data_c(), bias_len); |
|
|
|
|
|
|
|
|
memcpy(bias_ptr_, in_tensors_[2]->data_c(), fc_param_->col_ * sizeof(int)); |
|
|
} else { |
|
|
} else { |
|
|
bias_ptr_ = nullptr; |
|
|
bias_ptr_ = nullptr; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
auto input_tensor = in_tensors_[0]; |
|
|
|
|
|
auto params = input_tensor->quant_params(); |
|
|
|
|
|
MS_ASSERT(params.size() == 1); |
|
|
|
|
|
quant_params_.input.zp_ = params.front().zeroPoint; |
|
|
|
|
|
quant_params_.input.scale_ = params.front().scale; |
|
|
|
|
|
auto weight_tensor = in_tensors_[1]; |
|
|
|
|
|
params = weight_tensor->quant_params(); |
|
|
|
|
|
MS_ASSERT(params.size() == 1); |
|
|
|
|
|
quant_params_.weight.zp_ = params.front().zeroPoint; |
|
|
|
|
|
quant_params_.weight.scale_ = params.front().scale; |
|
|
|
|
|
auto output_tensor = out_tensors_[0]; |
|
|
|
|
|
params = output_tensor->quant_params(); |
|
|
|
|
|
MS_ASSERT(params.size() == 1); |
|
|
|
|
|
quant_params_.output.zp_ = params.front().zeroPoint; |
|
|
|
|
|
quant_params_.output.scale_ = params.front().scale; |
|
|
|
|
|
|
|
|
|
|
|
double real_multiplier = quant_params_.input.scale_ * quant_params_.weight.scale_ / quant_params_.output.scale_; |
|
|
|
|
|
QuantizeRoundParameter(real_multiplier, &quant_params_.quant_multiplier, &quant_params_.left_shift, |
|
|
|
|
|
&quant_params_.right_shift); |
|
|
|
|
|
CalculateActivationRangeQuantized(fc_param_->act_type_ == ActType_Relu, fc_param_->act_type_ == ActType_Relu6, |
|
|
|
|
|
quant_params_.output.zp_, quant_params_.output.scale_, &quant_params_.out_act_min, |
|
|
|
|
|
&quant_params_.out_act_max); |
|
|
|
|
|
fc_param_->b_const_ = (in_tensors_[1]->data_c() != nullptr); |
|
|
|
|
|
if (fc_param_->b_const_) { |
|
|
if (fc_param_->b_const_) { |
|
|
auto weight_data = reinterpret_cast<int8_t *>(in_tensors_[1]->data_c()); |
|
|
auto weight_data = reinterpret_cast<int8_t *>(in_tensors_[1]->data_c()); |
|
|
RowMajor2Row16x4MajorInt8(weight_data, b_c16x4_ptr_, fc_param_->col_, fc_param_->deep_); |
|
|
|
|
|
CalcWeightBiasSums(weight_data, fc_param_->deep_, fc_param_->col_, quant_params_.input.zp_, |
|
|
|
|
|
quant_params_.weight.zp_, bias_ptr_, weight_bias_sums_, ColMajor); |
|
|
|
|
|
|
|
|
RowMajor2Row16x4MajorInt8(weight_data, pack_b_ptr_, fc_param_->col_, fc_param_->deep_); |
|
|
|
|
|
CalcWeightBiasSums(weight_data, fc_param_->deep_, fc_param_->col_, quant_.input_.zp_, quant_.filter_zp_, bias_ptr_, |
|
|
|
|
|
weight_bias_sums_, ColMajor, filter_per_channel_); |
|
|
} |
|
|
} |
|
|
return RET_OK; |
|
|
return RET_OK; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
int FullconnectionInt8CPUKernel::RunImpl(int task_id) { |
|
|
int FullconnectionInt8CPUKernel::RunImpl(int task_id) { |
|
|
int cur_oc = MSMIN(thread_stride_, UP_DIV(c4_, 4) - task_id * thread_stride_); |
|
|
|
|
|
|
|
|
int stride = thread_stride_ * C4NUM; |
|
|
|
|
|
int cur_stride = task_id * stride; |
|
|
|
|
|
int res_stride = fc_param_->col_ - cur_stride; |
|
|
|
|
|
int cur_oc = MSMIN(stride, res_stride); |
|
|
if (cur_oc <= 0) { |
|
|
if (cur_oc <= 0) { |
|
|
return RET_OK; |
|
|
return RET_OK; |
|
|
} |
|
|
} |
|
|
int cur_oc_res = MSMIN(thread_stride_ * C4NUM, fc_param_->col_ - task_id * thread_stride_ * C4NUM); |
|
|
|
|
|
auto &q = quant_params_; |
|
|
|
|
|
auto &p = fc_param_; |
|
|
|
|
|
auto cur_b = b_c16x4_ptr_ + task_id * thread_stride_ * C4NUM * d16_; |
|
|
|
|
|
auto cur_bias = weight_bias_sums_ + task_id * thread_stride_ * C4NUM; |
|
|
|
|
|
auto output_ptr = reinterpret_cast<int8_t *>(out_tensors_[0]->data_c()); |
|
|
|
|
|
auto cur_c = output_ptr + task_id * thread_stride_ * C4NUM; |
|
|
|
|
|
#ifdef ENABLE_ARM64 |
|
|
|
|
|
MatmulInt8Neon64(a_r4x16_ptr_, cur_b, cur_c, r4_, cur_oc * C4NUM, d16_, input_sums_, cur_bias, q.out_act_min, |
|
|
|
|
|
q.out_act_max, q.output.zp_, &q.quant_multiplier, &q.left_shift, &q.right_shift, p->row_, cur_oc_res, |
|
|
|
|
|
p->col_ * sizeof(int8_t), 0); |
|
|
|
|
|
#else |
|
|
|
|
|
MatMulInt8_16x4_r(a_r4x16_ptr_, cur_b, cur_c, p->row_, cur_oc_res, d16_, p->col_, input_sums_, cur_bias, |
|
|
|
|
|
&q.left_shift, &q.right_shift, &q.quant_multiplier, q.output.zp_, INT8_MIN, INT8_MAX, false); |
|
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int32_t *cur_left = filter_per_channel_ ? quant_.left_shift_ + cur_stride : quant_.left_shift_; |
|
|
|
|
|
int32_t *cur_right = filter_per_channel_ ? quant_.right_shift_ + cur_stride : quant_.right_shift_; |
|
|
|
|
|
int32_t *cur_mul = filter_per_channel_ ? quant_.quant_multiplier_ + cur_stride : quant_.quant_multiplier_; |
|
|
|
|
|
int32_t *cur_zp = filter_per_channel_ ? quant_.filter_zp_ + cur_stride : quant_.filter_zp_; |
|
|
|
|
|
|
|
|
|
|
|
MatmulInt8Opt(pack_a_ptr_, pack_b_ptr_ + cur_stride * fc_param_->deep_16_, c_ptr_ + cur_stride, fc_param_->row_, |
|
|
|
|
|
cur_oc, fc_param_->deep_16_, input_sums_, weight_bias_sums_ + cur_stride, quant_.out_act_min_, |
|
|
|
|
|
quant_.out_act_max_, quant_.output_.zp_, cur_mul, cur_left, cur_right, fc_param_->col_, |
|
|
|
|
|
filter_per_channel_, cur_zp); |
|
|
|
|
|
|
|
|
return RET_OK; |
|
|
return RET_OK; |
|
|
} |
|
|
} |
|
|
@@ -148,14 +255,19 @@ int FcInt8Run(void *cdata, int task_id) { |
|
|
|
|
|
|
|
|
int FullconnectionInt8CPUKernel::Run() { |
|
|
int FullconnectionInt8CPUKernel::Run() { |
|
|
auto input_ptr = reinterpret_cast<int8_t *>(in_tensors_[0]->data_c()); |
|
|
auto input_ptr = reinterpret_cast<int8_t *>(in_tensors_[0]->data_c()); |
|
|
RowMajor2Row16x4MajorInt8(input_ptr, a_r4x16_ptr_, fc_param_->row_, fc_param_->deep_); |
|
|
|
|
|
CalcInputSums(input_ptr, fc_param_->row_, fc_param_->deep_, quant_params_.weight.zp_, input_sums_, RowMajor); |
|
|
|
|
|
|
|
|
RowMajor2Row16x4MajorInt8(input_ptr, pack_a_ptr_, fc_param_->row_, fc_param_->deep_); |
|
|
|
|
|
|
|
|
|
|
|
int32_t tmp_weight_zp = filter_per_channel_ ? 1 : quant_.filter_zp_[0]; |
|
|
|
|
|
CalcInputSums(input_ptr, fc_param_->row_, fc_param_->deep_, tmp_weight_zp, input_sums_, RowMajor); |
|
|
|
|
|
|
|
|
if (!fc_param_->b_const_) { |
|
|
if (!fc_param_->b_const_) { |
|
|
auto weight_data = reinterpret_cast<int8_t *>(in_tensors_[1]->data_c()); |
|
|
auto weight_data = reinterpret_cast<int8_t *>(in_tensors_[1]->data_c()); |
|
|
RowMajor2Row16x4MajorInt8(weight_data, b_c16x4_ptr_, fc_param_->col_, fc_param_->deep_); |
|
|
|
|
|
CalcWeightBiasSums(weight_data, fc_param_->deep_, fc_param_->col_, quant_params_.input.zp_, |
|
|
|
|
|
quant_params_.weight.zp_, bias_ptr_, weight_bias_sums_, ColMajor); |
|
|
|
|
|
|
|
|
RowMajor2Row16x4MajorInt8(weight_data, pack_b_ptr_, fc_param_->col_, fc_param_->deep_); |
|
|
|
|
|
CalcWeightBiasSums(weight_data, fc_param_->deep_, fc_param_->col_, quant_.input_.zp_, quant_.filter_zp_, bias_ptr_, |
|
|
|
|
|
weight_bias_sums_, ColMajor, filter_per_channel_); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
c_ptr_ = reinterpret_cast<int8_t *>(out_tensors_[0]->data_c()); |
|
|
auto ret = ParallelLaunch(this->context_->thread_pool_, FcInt8Run, this, thread_count_); |
|
|
auto ret = ParallelLaunch(this->context_->thread_pool_, FcInt8Run, this, thread_count_); |
|
|
if (ret != RET_OK) { |
|
|
if (ret != RET_OK) { |
|
|
MS_LOG(ERROR) << "ParallelLaunch failed"; |
|
|
MS_LOG(ERROR) << "ParallelLaunch failed"; |
|
|
@@ -163,6 +275,7 @@ int FullconnectionInt8CPUKernel::Run() { |
|
|
} |
|
|
} |
|
|
return RET_OK; |
|
|
return RET_OK; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
kernel::LiteKernel *CpuFullConnectionInt8KernelCreator(const std::vector<lite::Tensor *> &inputs, |
|
|
kernel::LiteKernel *CpuFullConnectionInt8KernelCreator(const std::vector<lite::Tensor *> &inputs, |
|
|
const std::vector<lite::Tensor *> &outputs, |
|
|
const std::vector<lite::Tensor *> &outputs, |
|
|
OpParameter *opParameter, const lite::InnerContext *ctx, |
|
|
OpParameter *opParameter, const lite::InnerContext *ctx, |
|
|
@@ -185,5 +298,4 @@ kernel::LiteKernel *CpuFullConnectionInt8KernelCreator(const std::vector<lite::T |
|
|
return kernel; |
|
|
return kernel; |
|
|
} |
|
|
} |
|
|
REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_FullConnection, CpuFullConnectionInt8KernelCreator) |
|
|
REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_FullConnection, CpuFullConnectionInt8KernelCreator) |
|
|
|
|
|
|
|
|
} // namespace mindspore::kernel |
|
|
} // namespace mindspore::kernel |