Browse Source

1. Change the int8 convolution that does not support the sdot scheme to use the matmul implementation.

2. Free redundant memory in fp16 kernels.
tags/v1.1.0
fuzhiye 5 years ago
parent
commit
b43cdc5df9
9 changed files with 129 additions and 608 deletions
  1. +46
    -263
      mindspore/lite/nnacl/int8/conv_int8.c
  2. +1
    -18
      mindspore/lite/nnacl/int8/conv_int8.h
  3. +23
    -89
      mindspore/lite/nnacl/pack.c
  4. +1
    -4
      mindspore/lite/nnacl/pack.h
  5. +4
    -0
      mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
  6. +4
    -0
      mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
  7. +50
    -170
      mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc
  8. +0
    -2
      mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h
  9. +0
    -62
      mindspore/lite/test/ut/src/runtime/kernel/arm/common/pack_tests.cc

+ 46
- 263
mindspore/lite/nnacl/int8/conv_int8.c View File

@@ -19,193 +19,6 @@
#include "nnacl/winograd_transform.h"
#include "nnacl/int8/common_func.h"

// Indirect GEMM for the common int8 convolution path.
//   dst        : final int8 output, tile-major (tile_num rows of output_channel).
//   tmp_dst    : int32 accumulator scratch used by the portable fallback
//                (zeroed by the caller; see ConvInt8).
//   src        : im2col-packed input tiles, C4-blocked over channels and over the
//                kernel plane (stride math below encodes the layout).
//   weight     : packed filter, C4-blocked over oc / kernel-plane / ic.
//   bias       : per-output-channel int32 bias.
//   input_sum  : per-tile sums of the packed input (per-tile-per-oc when filter
//                quantization is per-channel); subtracted to cancel the filter
//                zero-point term when the filter is asymmetric.
// On ARM the whole GEMM plus requantization runs in an assembly kernel.
void IndirectGemmInt8(int8_t *dst, int32_t *tmp_dst, const int8_t *src, const int8_t *weight, const int32_t *bias,
int ic4, size_t kernel_plane, size_t output_channel, const int32_t *input_sum,
ConvParameter *conv_param) {
// Requantization parameters: arrays indexed per output channel when
// FILTER_PER_CHANNEL is set, otherwise only element [0] is used.
int32_t *shift_before = conv_param->conv_quant_arg_.left_shift_;
int32_t *shift_after = conv_param->conv_quant_arg_.right_shift_;
int32_t *out_multiplier = conv_param->conv_quant_arg_.quant_multiplier_;
int32_t out_zp = conv_param->conv_quant_arg_.output_quant_args_[0].zp_;
int32_t act_min = conv_param->conv_quant_arg_.out_act_min_[0];
int32_t act_max = conv_param->conv_quant_arg_.out_act_max_[0];
int oc4 = UP_DIV(output_channel, C4NUM);

// Assembly fast paths: the kernel folds accumulation, zero-point correction,
// requantization and activation clamping in one pass.
#ifdef ENABLE_ARM64
size_t asymmetric = conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC;
size_t per_channel = conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL;
IndirectGemmInt8_4x4(dst, src, weight, bias, UP_DIV(kernel_plane, C4NUM), ic4, output_channel,
output_channel * sizeof(int8_t), input_sum, act_min, act_max, out_zp, out_multiplier,
shift_before, shift_after, asymmetric, per_channel, oc4 * C4NUM * sizeof(int32_t));
#elif ENABLE_ARM32
size_t asymmetric = conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC;
size_t per_channel = conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL;
IndirectGemmInt8_2x4(dst, src, weight, bias, UP_DIV(kernel_plane, C4NUM), ic4, output_channel,
output_channel * sizeof(int8_t), input_sum, act_min, act_max, out_zp, out_multiplier,
shift_before, shift_after, asymmetric, per_channel, oc4 * C4NUM * sizeof(int32_t));
#else
// Portable C fallback: accumulate each (tile, oc) dot product into int32,
// then requantize that single element.
int tile_num = conv_param->tile_num_;
int plane_c4 = UP_DIV(kernel_plane, C4NUM);
for (int oc = 0; oc < output_channel; oc++) {
int oc4_block = oc / C4NUM;
int oc4_res = oc % C4NUM;
int weight_oc4_offset = oc4_block * C4NUM * plane_c4 * C4NUM * ic4 * C4NUM + oc4_res * C4NUM * C4NUM;
int dst_oc_offset = oc;
for (int n = 0; n < tile_num; n++) {
int src_tile_offset = n * C4NUM * C4NUM;
int dst_tile_offset = dst_oc_offset + n * output_channel;

// Walk the kernel plane in C4 blocks (this path blocks the plane by C4,
// unlike IndirectGemmInt8Opt which iterates the plane linearly).
for (int b = 0; b < kernel_plane; b++) {
int plane_c4_block = b / C4NUM;
int plane_c4_res = b % C4NUM;
int src_plane_offset = src_tile_offset + plane_c4_block * tile_num * C4NUM * ic4 * C4NUM + plane_c4_res * C4NUM;
int weight_plane_offset =
weight_oc4_offset + plane_c4_block * C4NUM * C4NUM * ic4 * C4NUM + plane_c4_res * C4NUM;
for (int i = 0; i < ic4; i++) {
int src_ic4_offset = src_plane_offset + i * tile_num * C4NUM * C4NUM;
int weight_ic4_offset = weight_plane_offset + i * C4NUM * C4NUM * C4NUM;
for (int j = 0; j < C4NUM; j++) {
int weight_ic_offset = weight_ic4_offset + j;
tmp_dst[dst_tile_offset] += weight[weight_ic_offset] * src[src_ic4_offset + j];
} // in c4num loop
} // ic4 loop
} // kernel_plane loop
// Requantize: gemmlowp-style fixed-point scaling — left shift, saturating
// doubling high multiply, rounding right shift — then add the output
// zero-point and clamp to the activation range. Four branches cover
// {symmetric, asymmetric filter} x {per-layer, per-channel} quantization;
// the asymmetric branches first subtract the input_sum correction.
if (!(conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
(conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
int result = tmp_dst[dst_tile_offset] + bias[oc];
result = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(result * (1 << (unsigned int)shift_before[oc]), out_multiplier[oc]),
-shift_after[oc]);
result += out_zp;
result = result > act_min ? result : act_min;
result = result < act_max ? result : act_max;
dst[dst_tile_offset] = (int8_t)result;
} else if (!(conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
!(conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
int result = tmp_dst[dst_tile_offset] + bias[oc];
result = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(result * (1 << (unsigned int)shift_before[0]), out_multiplier[0]),
-shift_after[0]);
result += out_zp;
result = result > act_min ? result : act_min;
result = result < act_max ? result : act_max;
dst[dst_tile_offset] = (int8_t)result;
} else if ((conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
!(conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
// Asymmetric + per-layer: one input-sum correction per tile.
tmp_dst[dst_tile_offset] -= input_sum[n];
int result = tmp_dst[dst_tile_offset] + bias[oc];
result = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(result * (1 << (unsigned int)shift_before[0]), out_multiplier[0]),
-shift_after[0]);
result += out_zp;
result = result > act_min ? result : act_min;
result = result < act_max ? result : act_max;
dst[dst_tile_offset] = (int8_t)result;
} else if ((conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
(conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
// Asymmetric + per-channel: input_sum is laid out [tile][oc4*C4NUM].
tmp_dst[dst_tile_offset] -= input_sum[n * oc4 * C4NUM + oc];
int result = tmp_dst[dst_tile_offset] + bias[oc];
result = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(result * (1 << (unsigned int)shift_before[oc]), out_multiplier[oc]),
-shift_after[oc]);
result += out_zp;
result = result > act_min ? result : act_min;
result = result < act_max ? result : act_max;
dst[dst_tile_offset] = (int8_t)result;
}
} // tile_num loop
} // output_channel loop
#endif
}

// Optimized-path variant of IndirectGemmInt8: when an assembly kernel was
// resolved at runtime it is invoked through gemm_func; otherwise a portable
// C loop is used. Parameters mirror IndirectGemmInt8, with the same
// {symmetric, asymmetric} x {per-layer, per-channel} requantization handling.
// Note the packing layout differs from IndirectGemmInt8's fallback: the
// kernel plane is iterated linearly here (no C4 blocking of the plane).
void IndirectGemmInt8Opt(int8_t *dst, int32_t *tmp_dst, const int8_t *src, const int8_t *weight, const int32_t *bias,
int ic4, size_t kernel_plane, size_t output_channel, const int32_t *input_sum,
ConvParameter *conv_param, GEMM_FUNC gemm_func) {
int32_t *shift_before = conv_param->conv_quant_arg_.left_shift_;
int32_t *shift_after = conv_param->conv_quant_arg_.right_shift_;
int32_t *out_multiplier = conv_param->conv_quant_arg_.quant_multiplier_;
int32_t out_zp = conv_param->conv_quant_arg_.output_quant_args_[0].zp_;
int32_t act_min = conv_param->conv_quant_arg_.out_act_min_[0];
int32_t act_max = conv_param->conv_quant_arg_.out_act_max_[0];
int oc4 = UP_DIV(output_channel, C4NUM);
if (gemm_func != NULL) {
// NOTE(review): the call is compiled only on aarch64 — on other targets a
// non-NULL gemm_func makes this branch a silent no-op; callers apparently
// pass NULL there. Confirm against the call sites.
#ifdef __aarch64__
size_t asymmetric = conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC;
size_t per_channel = conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL;
gemm_func(dst, src, weight, bias, kernel_plane, ic4, output_channel, output_channel * sizeof(int8_t), input_sum,
act_min, act_max, out_zp, out_multiplier, shift_before, shift_after, asymmetric, per_channel,
oc4 * C4NUM * sizeof(int32_t));
#endif
} else {
// Portable C fallback: int32 accumulation followed by per-element requantization.
int tile_num = conv_param->tile_num_;
for (int oc = 0; oc < output_channel; oc++) {
int oc4_block = oc / C4NUM;
int oc4_res = oc % C4NUM;
int weight_oc4_offset = oc4_block * C4NUM * kernel_plane * ic4 * C4NUM + oc4_res * C4NUM;
int dst_oc_offset = oc;
for (int n = 0; n < tile_num; n++) {
int src_tile_offset = n * C4NUM;
int dst_tile_offset = dst_oc_offset + n * output_channel;

for (int b = 0; b < kernel_plane; b++) {
int src_plane_offset = src_tile_offset + b * tile_num * ic4 * C4NUM;
int weight_plane_offset = weight_oc4_offset + b * C4NUM * ic4 * C4NUM;
for (int i = 0; i < ic4; i++) {
int src_ic4_offset = src_plane_offset + i * tile_num * C4NUM;
int weight_ic4_offset = weight_plane_offset + i * C4NUM * C4NUM;
for (int j = 0; j < C4NUM; j++) {
int weight_ic_offset = weight_ic4_offset + j;
tmp_dst[dst_tile_offset] += weight[weight_ic_offset] * src[src_ic4_offset + j];
} // in c4num loop
} // ic4 loop
} // kernel_plane loop
// Requantize (gemmlowp-style fixed point): same four-way branch as
// IndirectGemmInt8 — see that function for the scheme.
if (!(conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
(conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
int result = tmp_dst[dst_tile_offset] + bias[oc];
result = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(result * (1 << (unsigned int)shift_before[oc]), out_multiplier[oc]),
-shift_after[oc]);
result += out_zp;
result = result > act_min ? result : act_min;
result = result < act_max ? result : act_max;
dst[dst_tile_offset] = (int8_t)result;
} else if (!(conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
!(conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
int result = tmp_dst[dst_tile_offset] + bias[oc];
result = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(result * (1 << (unsigned int)shift_before[0]), out_multiplier[0]),
-shift_after[0]);
result += out_zp;
result = result > act_min ? result : act_min;
result = result < act_max ? result : act_max;
dst[dst_tile_offset] = (int8_t)result;
} else if ((conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
!(conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
// Asymmetric + per-layer: one input-sum correction per tile.
tmp_dst[dst_tile_offset] -= input_sum[n];
int result = tmp_dst[dst_tile_offset] + bias[oc];
result = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(result * (1 << (unsigned int)shift_before[0]), out_multiplier[0]),
-shift_after[0]);
result += out_zp;
result = result > act_min ? result : act_min;
result = result < act_max ? result : act_max;
dst[dst_tile_offset] = (int8_t)result;
} else if ((conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
(conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
// Asymmetric + per-channel: input_sum is laid out [tile][oc4*C4NUM].
tmp_dst[dst_tile_offset] -= input_sum[n * oc4 * C4NUM + oc];
int result = tmp_dst[dst_tile_offset] + bias[oc];
result = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(result * (1 << (unsigned int)shift_before[oc]), out_multiplier[oc]),
-shift_after[oc]);
result += out_zp;
result = result > act_min ? result : act_min;
result = result < act_max ? result : act_max;
dst[dst_tile_offset] = (int8_t)result;
}
} // tile_num loop
} // output_channel loop
}
}

void Conv3x3Int8Gemm(int32_t *dst, const int16_t *src, const int16_t *weight, int oc, int ic8, size_t real_cal_num) {
int oc4 = UP_DIV(oc, C4NUM);
#ifdef ENABLE_ARM
@@ -249,73 +62,9 @@ void Conv3x3Int8Gemm(int32_t *dst, const int16_t *src, const int16_t *weight, in
#endif
}

// Driver for common int8 convolution: splits the output plane into tiles of
// tile_num_ pixels, and for each tile does im2col packing followed by the
// indirect GEMM. Tiles are distributed across threads by striding thread_id
// from task_id in steps of thread_num_; each thread works in its own slice of
// packed_input / tmp_dst / tmp_out / input_sum, indexed by task_id.
void ConvInt8(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight, const int32_t *bias_data,
int32_t *tmp_dst, int8_t *tmp_out, int8_t *output_data, int32_t *input_sum, int task_id,
ConvParameter *conv_param) {
int kernel_h = conv_param->kernel_h_;
int kernel_w = conv_param->kernel_w_;
int in_batch = conv_param->input_batch_;
int in_channel = conv_param->input_channel_;
int in_h = conv_param->input_h_;
int in_w = conv_param->input_w_;
int out_h = conv_param->output_h_;
int out_w = conv_param->output_w_;
int out_channel = conv_param->output_channel_;
int oc4 = UP_DIV(out_channel, C4NUM);
int32_t input_zp = conv_param->conv_quant_arg_.input_quant_args_[0].zp_;

int tile_n = conv_param->tile_num_;
int thread_count = conv_param->thread_num_;
int output_count = out_h * out_w;
int output_tile_count = UP_DIV(output_count, tile_n);
int ic4 = UP_DIV(in_channel, C4NUM);
int kernel_plane = kernel_h * kernel_w;
int plane_block = UP_DIV(kernel_plane, C4NUM);
// Bytes of packed input per output pixel (kernel plane and channels both
// rounded up to C4 blocks).
int unit_size = plane_block * C4NUM * ic4 * C4NUM;
// Per-channel filter quantization needs one input-sum slot per (tile, oc);
// per-layer needs only one per tile.
int input_sum_offset;
if (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL) {
input_sum_offset = tile_n * oc4 * C4NUM;
} else {
input_sum_offset = tile_n;
}

for (int b = 0; b < in_batch; b++) {
// NOTE(review): batch stride uses the C4-padded channel count while
// Im2ColPackUnitInt8 strides planes by the raw in_channel — confirm the
// expected input layout (NHWC vs NHWC4) at the call sites.
int in_batch_offset = b * ic4 * C4NUM * in_h * in_w;
int out_batch_offset = b * out_channel * out_h * out_w;
for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) {
int start_index = thread_id * tile_n;
// Last tile may be partial.
int real_cal_num = (output_count - start_index) < tile_n ? (output_count - start_index) : tile_n;
int32_t *tmp_input_sum = input_sum + task_id * input_sum_offset;
int8_t *gemm_input = packed_input + task_id * unit_size * tile_n;
// clear tmp buffer before compute
// (filled with the input zero-point so padded/unwritten positions
// contribute exactly the zp term that input_sum later cancels)
memset(gemm_input, (int8_t)input_zp, unit_size * tile_n);
int out_offset = thread_id * tile_n * out_channel + out_batch_offset;

size_t tmp_dst_size = tile_n * conv_param->output_channel_ * sizeof(int32_t);
int tmp_dst_offset = task_id * tile_n * conv_param->output_channel_;
memset(tmp_dst + tmp_dst_offset, 0, tmp_dst_size);

Im2ColPackUnitInt8(input_data + in_batch_offset, gemm_input, real_cal_num, start_index, tmp_input_sum,
conv_param);
if (real_cal_num == tile_n) {
// Full tile: GEMM writes straight into the output buffer.
int8_t *gemm_output = output_data + out_offset;
IndirectGemmInt8(gemm_output, tmp_dst + tmp_dst_offset, gemm_input, packed_weight, bias_data, ic4, kernel_plane,
out_channel, tmp_input_sum, conv_param);
} else {
// res part
// Partial tail tile: compute into scratch, then copy only the valid rows
// so the GEMM never writes past the end of output_data.
int8_t *tmp_out_ptr = tmp_out + task_id * tile_n * out_channel;
IndirectGemmInt8(tmp_out_ptr, tmp_dst + tmp_dst_offset, gemm_input, packed_weight, bias_data, ic4, kernel_plane,
out_channel, tmp_input_sum, conv_param);
memcpy(output_data + out_offset, tmp_out_ptr, real_cal_num * out_channel);
}
}
}
}

void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *matmul_input, int8_t *packed_weight,
const int32_t *bias_data, int8_t *output_data, int32_t *filter_zp, int32_t *input_sum, int task_id,
ConvParameter *conv_param, MATMUL_OPT_R_FUNC matmul_func) {
ConvParameter *conv_param, MATMUL_OPT_R_FUNC matmul_func, bool is_optimize) {
int kernel_h = conv_param->kernel_h_;
int kernel_w = conv_param->kernel_w_;
int in_batch = conv_param->input_batch_;
@@ -325,18 +74,29 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *matmul_input,
int out_h = conv_param->output_h_;
int out_w = conv_param->output_w_;
int out_channel = conv_param->output_channel_;
int oc8 = UP_DIV(out_channel, C8NUM);
int tile_n = conv_param->tile_num_;
int thread_count = conv_param->thread_num_;
int output_count = out_h * out_w;
int output_tile_count = UP_DIV(output_count, tile_n);
int ic4 = UP_DIV(in_channel, C4NUM);
int kernel_plane = kernel_h * kernel_w;
int unit_size = UP_ROUND(kernel_plane * in_channel, C4NUM);
int unit_size;
int input_sum_offset;
int up_round_oc;
#ifdef ENABLE_ARM32
up_round_oc = UP_ROUND(out_channel, C2NUM);
unit_size = UP_ROUND(kernel_plane * in_channel, C16NUM);
#else
if (is_optimize) {
up_round_oc = UP_ROUND(out_channel, C8NUM);
unit_size = UP_ROUND(kernel_plane * in_channel, C4NUM);
} else {
up_round_oc = UP_ROUND(out_channel, C4NUM);
unit_size = UP_ROUND(kernel_plane * in_channel, C16NUM);
}
#endif
bool per_channel;
if (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL) {
input_sum_offset = tile_n * oc8 * C8NUM;
input_sum_offset = tile_n * up_round_oc;
per_channel = true;
} else {
input_sum_offset = tile_n;
@@ -344,7 +104,7 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *matmul_input,
}

for (int b = 0; b < in_batch; b++) {
int in_batch_offset = b * ic4 * C4NUM * in_h * in_w;
int in_batch_offset = b * in_channel * in_h * in_w;
int out_batch_offset = b * out_channel * out_h * out_w;
for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) {
int start_index = thread_id * tile_n;
@@ -354,15 +114,38 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *matmul_input,
int8_t *matmul = matmul_input + task_id * kernel_plane * in_channel * tile_n;
memset(matmul, conv_param->conv_quant_arg_.input_quant_args_[0].zp_, kernel_plane * in_channel * tile_n);
Im2ColPackUnitInt8Opt(input_data + in_batch_offset, gemm_input, matmul, real_cal_num, start_index, filter_zp,
tmp_input_sum, conv_param, per_channel);
tmp_input_sum, conv_param, per_channel, is_optimize);

int out_offset = thread_id * tile_n * out_channel + out_batch_offset;
int8_t *gemm_output = output_data + out_offset;
matmul_func(gemm_input, packed_weight, gemm_output, real_cal_num, out_channel, unit_size, out_channel,
tmp_input_sum, bias_data, conv_param->conv_quant_arg_.left_shift_,
conv_param->conv_quant_arg_.right_shift_, conv_param->conv_quant_arg_.quant_multiplier_,
conv_param->conv_quant_arg_.output_quant_args_[0].zp_, conv_param->conv_quant_arg_.out_act_min_[0],
conv_param->conv_quant_arg_.out_act_max_[0], per_channel);
#ifdef ENABLE_ARM32
MatmulInt8Neon32(
gemm_input, packed_weight, gemm_output, real_cal_num, out_channel, unit_size, tmp_input_sum, bias_data,
conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0],
conv_param->conv_quant_arg_.output_quant_args_[0].zp_, conv_param->conv_quant_arg_.quant_multiplier_,
conv_param->conv_quant_arg_.left_shift_, conv_param->conv_quant_arg_.right_shift_, out_channel, per_channel);
#elif ENABLE_ARM64
if (is_optimize) {
matmul_func(gemm_input, packed_weight, gemm_output, real_cal_num, out_channel, unit_size, out_channel,
tmp_input_sum, bias_data, conv_param->conv_quant_arg_.left_shift_,
conv_param->conv_quant_arg_.right_shift_, conv_param->conv_quant_arg_.quant_multiplier_,
conv_param->conv_quant_arg_.output_quant_args_[0].zp_, conv_param->conv_quant_arg_.out_act_min_[0],
conv_param->conv_quant_arg_.out_act_max_[0], per_channel);
} else {
MatmulInt8Neon64(gemm_input, packed_weight, gemm_output, UP_ROUND(real_cal_num, C4NUM),
UP_ROUND(out_channel, C4NUM), unit_size, tmp_input_sum, bias_data,
conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0],
conv_param->conv_quant_arg_.output_quant_args_[0].zp_,
conv_param->conv_quant_arg_.quant_multiplier_, conv_param->conv_quant_arg_.left_shift_,
conv_param->conv_quant_arg_.right_shift_, real_cal_num, out_channel, out_channel, per_channel);
}
#else
MatMulInt8_8x8_r(
gemm_input, packed_weight, gemm_output, real_cal_num, out_channel, unit_size, out_channel, tmp_input_sum,
bias_data, conv_param->conv_quant_arg_.left_shift_, conv_param->conv_quant_arg_.right_shift_,
conv_param->conv_quant_arg_.quant_multiplier_, conv_param->conv_quant_arg_.output_quant_args_[0].zp_,
conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0], per_channel);
#endif
}
}
}


+ 1
- 18
mindspore/lite/nnacl/int8/conv_int8.h View File

@@ -28,30 +28,13 @@
#include "nnacl/matmul_parameter.h"
#include "nnacl/int8/matmul_int8.h"

typedef void (*GEMM_FUNC)(int8_t *dst, const int8_t *src, const int8_t *weight, const int32_t *bias, size_t ksize,
size_t ic4, size_t output_channel, size_t offset, const int32_t *input_sum, size_t act_min,
size_t act_max, size_t out_zp, int32_t *out_multiplier, int32_t *shift_before,
int32_t *shift_after, size_t asymmetric, size_t per_channel, size_t per_channel_offset);

#ifdef __cplusplus
extern "C" {
#endif
void IndirectGemmInt8(int8_t *dst, int32_t *tmp_dst, const int8_t *src, const int8_t *weight, const int32_t *bias,
int ic4, size_t kernel_plane, size_t output_channel, const int32_t *input_sum,
ConvParameter *conv_param);

void IndirectGemmInt8Opt(int8_t *dst, int32_t *tmp_dst, const int8_t *src, const int8_t *weight, const int32_t *bias,
int ic4, size_t kernel_plane, size_t output_channel, const int32_t *input_sum,
ConvParameter *conv_param, GEMM_FUNC gemm_func);

// int8 conv common
void ConvInt8(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight, const int32_t *bias_data,
int32_t *tmp_dst, int8_t *tmp_out, int8_t *output_data, int32_t *input_sum, int task_id,
ConvParameter *conv_param);

void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *matmul_input, int8_t *packed_weight,
const int32_t *bias_data, int8_t *output_data, int32_t *filter_zp, int32_t *input_sum, int task_id,
ConvParameter *conv_param, MATMUL_OPT_R_FUNC matmul_func);
ConvParameter *conv_param, MATMUL_OPT_R_FUNC matmul_func, bool is_optimize);

// int8 convolution 1x1
void Conv1x1PreOptPeroc(const int8_t *src_input, int8_t *packed_input, int32_t *input_sum, size_t input_channel,


+ 23
- 89
mindspore/lite/nnacl/pack.c View File

@@ -260,93 +260,9 @@ void Im2ColPackUnitFp32(const float *input_data, ConvParameter *conv_param, floa
} // tile num loop
}

// Packs one tile of real_cal_num output pixels (starting at flat output index
// block_index) from NHWC int8 input into the C4-blocked im2col layout consumed
// by IndirectGemmInt8. While packing it also accumulates the sum of every
// value the GEMM will read for each pixel — out-of-image taps contribute the
// input zero-point instead of a copied value (the caller pre-fills the packed
// buffer with that zero-point) — and turns that sum into the input_sum
// correction term when the filter quantization is asymmetric.
void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real_cal_num, int block_index,
int32_t *input_sum, ConvParameter *conv_param) {
// input format : nhwc
int tile_num = conv_param->tile_num_;
QuantArg *filter_arg = conv_param->conv_quant_arg_.filter_quant_args_;
int kernel_h = conv_param->kernel_h_;
int kernel_w = conv_param->kernel_w_;
int stride_h = conv_param->stride_h_;
int stride_w = conv_param->stride_w_;
int pad_h = conv_param->pad_u_;
int pad_w = conv_param->pad_l_;
int dilation_h = conv_param->dilation_h_;
int dilation_w = conv_param->dilation_w_;
int in_channel = conv_param->input_channel_;
int in_h = conv_param->input_h_;
int in_w = conv_param->input_w_;
int ic4_minus = in_channel / C4NUM;
int ic4 = UP_DIV(in_channel, C4NUM);
int oc4 = UP_DIV(conv_param->output_channel_, C4NUM);
int out_w = conv_param->output_w_;

for (int i = 0; i < real_cal_num; i++) {
// Top-left input coordinate of this output pixel's receptive field.
int block_start = block_index + i;
int input_h = block_start / out_w * stride_h - pad_h;
int input_w = block_start % out_w * stride_w - pad_w;
int input_cal_num_offset = i * C4NUM * C4NUM;
int32_t input_accumulator = 0;
for (int j = 0; j < kernel_h; j++) {
int input_y = input_h + j * dilation_h;
if (input_y < 0 || input_y >= in_h) {
// Whole kernel row is padding: count zp for every padded element
// (the packed buffer already holds zp there), skip the copies.
input_accumulator += ic4 * C4NUM * conv_param->conv_quant_arg_.input_quant_args_[0].zp_ * kernel_w;
continue;
}
int input_y_stride = input_y * in_w * in_channel;
for (int n = 0; n < kernel_w; n++) {
int input_x = input_w + n * dilation_w;
if (input_x < 0 || input_x >= in_w) {
// Single padded tap: count zp for the padded channels.
input_accumulator += ic4 * C4NUM * conv_param->conv_quant_arg_.input_quant_args_[0].zp_;
continue;
}
int input_x_stride = input_y_stride + input_x * in_channel;
// Destination offset: kernel-plane position C4-blocked, interleaved
// with the tile and ic4 blocking of the GEMM's input layout.
int plane_c4_block = (j * kernel_w + n) / C4NUM;
int plane_c4_res = (j * kernel_w + n) % C4NUM;
int input_plane_offset =
plane_c4_block * tile_num * C4NUM * C4NUM * ic4 + plane_c4_res * C4NUM + input_cal_num_offset;
for (int m = 0; m < ic4_minus; m++) {
int channel_block_stride = input_x_stride + m * C4NUM;
int channel_block_offset = input_plane_offset + m * tile_num * C4NUM * C4NUM;
(packed_input + channel_block_offset)[0] = (input_data + channel_block_stride)[0];
(packed_input + channel_block_offset)[1] = (input_data + channel_block_stride)[1];
(packed_input + channel_block_offset)[2] = (input_data + channel_block_stride)[2];
(packed_input + channel_block_offset)[3] = (input_data + channel_block_stride)[3];
input_accumulator += (packed_input + channel_block_offset)[0];
input_accumulator += (packed_input + channel_block_offset)[1];
input_accumulator += (packed_input + channel_block_offset)[2];
input_accumulator += (packed_input + channel_block_offset)[3];
} // channel_block loop
// Remainder channels (in_channel not a multiple of C4NUM).
int ic_res = conv_param->input_channel_ - ic4_minus * C4NUM;
for (int l = 0; l < ic_res; ++l) {
int channel_block_stride = input_x_stride + ic4_minus * C4NUM + l;
// NOTE(review): this stride (ic4_minus * tile_num * C4NUM) differs from
// the full-block stride above (m * tile_num * C4NUM * C4NUM) — confirm
// it is the intended offset for the tail channel block.
int channel_block_offset = input_plane_offset + ic4_minus * tile_num * C4NUM + l;
packed_input[channel_block_offset] = input_data[channel_block_stride];
input_accumulator += (packed_input + channel_block_offset)[0];
}
// The padded lanes of the last C4 block hold zp; account for them too.
for (int l = 0; l < (C4NUM - ic_res); l++) {
input_accumulator += conv_param->conv_quant_arg_.input_quant_args_[0].zp_;
}
} // kernel_w loop
} // kernel_h loop
// Convert the raw sum into the filter-zero-point correction consumed by the
// GEMM. Symmetric filters (zp == 0) need no correction at all.
if (!(conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC)) {
continue;
} else if ((conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
(conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
// Per-channel: one entry per output channel, scaled by that channel's zp.
int cal_num_offset = i * oc4 * C4NUM;
for (int l = 0; l < conv_param->output_channel_; ++l) {
input_sum[cal_num_offset + l] = input_accumulator * filter_arg[l].zp_;
}
} else if ((conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
!(conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
// Per-layer: single entry per pixel.
input_sum[i] = input_accumulator * filter_arg[0].zp_;
}
} // tile num loop
}

void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int8_t *matmul_input, int real_cal_num,
int block_index, int32_t *filter_zp, int32_t *input_sum, ConvParameter *conv_param,
bool per_channel) {
bool per_channel, bool is_optimize) {
// input format : nhwc
int kernel_h = conv_param->kernel_h_;
int kernel_w = conv_param->kernel_w_;
@@ -389,11 +305,29 @@ void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int8_
} // kernel_h loop
}
} // tile num loop
if (per_channel) {
Conv1x1PreOptPeroc(matmul_input, packed_input, input_sum, kernel_plane * in_channel, conv_param->output_channel_,
real_cal_num, filter_zp, C8NUM * C8NUM);
int deep = kernel_plane * in_channel;
if (is_optimize) {
if (per_channel) {
Conv1x1PreOptPeroc(matmul_input, packed_input, input_sum, deep, conv_param->output_channel_, real_cal_num,
filter_zp, C8NUM * C8NUM);
} else {
Conv1x1PreOptPert(matmul_input, packed_input, input_sum, deep, real_cal_num, conv_param);
}
} else {
Conv1x1PreOptPert(matmul_input, packed_input, input_sum, kernel_plane * in_channel, real_cal_num, conv_param);
RowMajor2Row16x4MajorInt8(matmul_input, packed_input, real_cal_num, deep);
if (per_channel) {
#ifdef ENABLE_ARM32
PackInputSum16x4PerChannelArm32(packed_input, input_sum, filter_zp, real_cal_num, deep,
conv_param->output_channel_);
#else
PackInputSum16x4PerChannel(packed_input, input_sum, filter_zp, real_cal_num, deep, conv_param->output_channel_);
#endif
} else {
size_t hw4 = UP_ROUND(real_cal_num, C4NUM);
size_t ic16 = UP_ROUND(deep, C16NUM);
PackInputSum16x4PerLayer(packed_input, input_sum, conv_param->conv_quant_arg_.filter_quant_args_[0].zp_, hw4,
ic16);
}
}
}



+ 1
- 4
mindspore/lite/nnacl/pack.h View File

@@ -32,12 +32,9 @@ void Im2ColPackUnitFp32(const float *input_data, ConvParameter *conv_param, floa

void PackHWCToWHC(const float *src, float *dst, int height, int width, int channel);

void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real_cal_num, int block_index,
int32_t *input_sum, ConvParameter *conv_param);

void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int8_t *matmul_input, int real_cal_num,
int block_index, int32_t *filter_zp, int32_t *input_sum, ConvParameter *conv_param,
bool per_channel);
bool per_channel, bool is_optimize);

void PackInputSum16x4PerLayer(const int8_t *src, int32_t *dst, int32_t filter_zp, size_t row4, size_t col16);



+ 4
- 0
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc View File

@@ -53,6 +53,10 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
}
PackNCHWToNHWCFp16(fp16_weight_, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(),
weight_tensor->Batch());
if (fp16_weight_ != nullptr) {
free(fp16_weight_);
fp16_weight_ = nullptr;
}

bias_data_ = reinterpret_cast<float16_t *>(malloc(channel * sizeof(float16_t)));
if (bias_data_ == nullptr) {


+ 4
- 0
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc View File

@@ -78,6 +78,10 @@ int ConvolutionWinogradFP16CPUKernel::InitWeightBias() {
MS_LOG(ERROR) << "winograd filter transfrom failed.";
return ret;
}
if (fp16_weight_ != nullptr) {
free(fp16_weight_);
fp16_weight_ = nullptr;
}

// init bias
bias_data_ = malloc(oc_block_num * oc_block * sizeof(float16_t));


+ 50
- 170
mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc View File

@@ -33,9 +33,8 @@ using mindspore::schema::PrimitiveType_Conv2D;
namespace mindspore::kernel {
void ConvolutionInt8CPUKernel::CheckSupportOptimize() {
tile_num_ = 8;
matmul_func_ = MatMulInt8_8x8_r;
#ifdef ENABLE_ARM32
tile_num_ = 2;
tile_num_ = 4;
support_optimize_ = false;
#endif

@@ -48,138 +47,43 @@ void ConvolutionInt8CPUKernel::CheckSupportOptimize() {
if (dlopen_error != nullptr) {
MS_LOG(ERROR) << "load matmul func failed! " << dlopen_error << ".";
support_optimize_ = false;
matmul_func_ = nullptr;
tile_num_ = 4;
} else {
support_optimize_ = true;
}
} else {
tile_num_ = 4;
support_optimize_ = false;
matmul_func_ = nullptr;
}
#endif
conv_param_->tile_num_ = tile_num_;
}

int ConvolutionInt8CPUKernel::InitWeightBias() {
int ConvolutionInt8CPUKernel::InitWeightBiasOpt() {
auto filter_tensor = in_tensors_.at(kWeightIndex);
auto input_channel = filter_tensor->Channel();
auto output_channel = filter_tensor->Batch();
int kernel_h = filter_tensor->Height();
int kernel_w = filter_tensor->Width();
int kernel_plane = kernel_h * kernel_w;
conv_param_->input_channel_ = input_channel;
conv_param_->output_channel_ = output_channel;
int ic4 = UP_DIV(input_channel, C4NUM);
int oc4 = UP_DIV(output_channel, C4NUM);
int kernel_plane = kernel_h * kernel_w;
int plane_c4 = UP_DIV(kernel_plane, C4NUM);
int pack_weight_size = oc4 * ic4 * C4NUM * C4NUM * plane_c4 * C4NUM;
auto filter_arg = conv_param_->conv_quant_arg_.filter_quant_args_;
int32_t input_zp = conv_param_->conv_quant_arg_.input_quant_args_[0].zp_;

// init weight
auto origin_weight = reinterpret_cast<int8_t *>(in_tensors_.at(kWeightIndex)->MutableData());
packed_weight_ = reinterpret_cast<int8_t *>(malloc(pack_weight_size));
if (packed_weight_ == nullptr) {
MS_LOG(ERROR) << "malloc packed_weight_ failed.";
return RET_ERROR;
}
memset(packed_weight_, 0, pack_weight_size);
auto *weight_sum = reinterpret_cast<int32_t *>(malloc(sizeof(int32_t) * output_channel));
if (weight_sum == nullptr) {
MS_LOG(ERROR) << "malloc weight_sum failed.";
return RET_ERROR;
}
for (int i = 0; i < output_channel; i++) {
if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) {
weight_sum[i] = ic4 * C4NUM * kernel_plane * filter_arg[i].zp_;
} else {
weight_sum[i] = ic4 * C4NUM * kernel_plane * filter_arg[0].zp_;
}
}
PackWeightInt8(origin_weight, conv_param_, packed_weight_, weight_sum);

// init bias
bias_data_ = reinterpret_cast<int32_t *>(malloc(oc4 * C4NUM * sizeof(int32_t)));
if (bias_data_ == nullptr) {
MS_LOG(ERROR) << "malloc bias_data_ failed.";
return RET_ERROR;
}
memset(bias_data_, 0, oc4 * C4NUM * sizeof(int32_t));
if (in_tensors_.size() == kInputSize2) {
auto ori_bias = reinterpret_cast<int32_t *>(in_tensors_.at(kBiasIndex)->MutableData());
memcpy(bias_data_, ori_bias, output_channel * sizeof(int32_t));
} else {
MS_ASSERT(in_tensors_.size() == kInputSize1);
}
auto *bias_data = reinterpret_cast<int32_t *>(bias_data_);
int c4_kernel_plane_size = kernel_plane * ic4 * C4NUM;
if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) {
for (int i = 0; i < output_channel; i++) {
bias_data[i] += filter_arg[i].zp_ * input_zp * c4_kernel_plane_size - weight_sum[i] * input_zp;
}
int up_round_deep;
int up_round_oc;
#ifdef ENABLE_ARM32
up_round_oc = UP_ROUND(output_channel, C2NUM);
up_round_deep = UP_ROUND(kernel_plane * input_channel, C16NUM);
#else
if (support_optimize_) {
up_round_oc = UP_ROUND(output_channel, C8NUM);
up_round_deep = UP_ROUND(kernel_plane * input_channel, C4NUM);
} else {
for (int i = 0; i < output_channel; i++) {
bias_data[i] += filter_arg[0].zp_ * input_zp * c4_kernel_plane_size - weight_sum[i] * input_zp;
}
up_round_oc = UP_ROUND(output_channel, C4NUM);
up_round_deep = UP_ROUND(kernel_plane * input_channel, C16NUM);
}
free(weight_sum);

size_t input_sum_size;
if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) {
input_sum_size = oc4 * C4NUM * tile_num_ * thread_count_ * sizeof(int32_t);
} else {
input_sum_size = tile_num_ * thread_count_ * sizeof(int32_t);
}
input_sum_ = reinterpret_cast<int32_t *>(malloc(input_sum_size));
if (input_sum_ == nullptr) {
MS_LOG(ERROR) << "malloc input_sum_ failed.";
return RET_ERROR;
}
memset(input_sum_, 0, input_sum_size);
return RET_OK;
}

int ConvolutionInt8CPUKernel::InitTmpBuffer() {
MS_ASSERT(ctx_->allocator != nullptr);
int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
int kernel_plane = conv_param_->kernel_h_ * conv_param_->kernel_w_;
int plane_c4 = UP_DIV(kernel_plane, C4NUM);
int unit_size = plane_c4 * C4NUM * ic4 * C4NUM;
packed_input_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(unit_size * thread_count_ * tile_num_));
if (packed_input_ == nullptr) {
MS_LOG(ERROR) << "malloc packed_input_ failed.";
return RET_ERROR;
}

size_t tmp_dst_size = thread_count_ * tile_num_ * conv_param_->output_channel_ * sizeof(int32_t);
tmp_dst_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(tmp_dst_size));
if (tmp_dst_ == nullptr) {
MS_LOG(ERROR) << "malloc tmp_dst_ failed.";
return RET_ERROR;
}

tmp_out_ =
reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(thread_count_ * tile_num_ * conv_param_->output_channel_));
if (tmp_out_ == nullptr) {
MS_LOG(ERROR) << "malloc tmp_out_ failed.";
return RET_ERROR;
}
return RET_OK;
}

int ConvolutionInt8CPUKernel::InitWeightBiasOpt() {
auto filter_tensor = in_tensors_.at(kWeightIndex);
auto input_channel = filter_tensor->Channel();
auto output_channel = filter_tensor->Batch();
int kernel_h = filter_tensor->Height();
int kernel_w = filter_tensor->Width();
conv_param_->input_channel_ = input_channel;
conv_param_->output_channel_ = output_channel;
int oc8 = UP_DIV(output_channel, C8NUM);
int kernel_plane = kernel_h * kernel_w;
int up_round_deep = UP_ROUND(kernel_plane * input_channel, C4NUM);
int pack_weight_size = oc8 * C8NUM * up_round_deep;
#endif
int pack_weight_size = up_round_oc * up_round_deep;
int bias_size = up_round_oc * sizeof(int32_t);
int32_t input_zp = conv_param_->conv_quant_arg_.input_quant_args_[0].zp_;

// init weight
@@ -190,15 +94,23 @@ int ConvolutionInt8CPUKernel::InitWeightBiasOpt() {
return RET_ERROR;
}
memset(packed_weight_, 0, pack_weight_size);
RowMajor2Row8x4MajorInt8(origin_weight, packed_weight_, output_channel, input_channel * kernel_plane);
#ifdef ENABLE_ARM32
RowMajor2Row2x16MajorInt8(origin_weight, packed_weight_, output_channel, input_channel * kernel_plane);
#else
if (support_optimize_) {
RowMajor2Row8x4MajorInt8(origin_weight, packed_weight_, output_channel, input_channel * kernel_plane);
} else {
RowMajor2Row16x4MajorInt8(origin_weight, packed_weight_, output_channel, input_channel * kernel_plane);
}
#endif

// init bias
bias_data_ = reinterpret_cast<int32_t *>(malloc(oc8 * C8NUM * sizeof(int32_t)));
bias_data_ = reinterpret_cast<int32_t *>(malloc(bias_size));
if (bias_data_ == nullptr) {
MS_LOG(ERROR) << "malloc bias_data_ failed.";
return RET_ERROR;
}
memset(bias_data_, 0, oc8 * C8NUM * sizeof(int32_t));
memset(bias_data_, 0, bias_size);
if (in_tensors_.size() == kInputSize2) {
auto ori_bias = reinterpret_cast<int32_t *>(in_tensors_.at(kBiasIndex)->MutableData());
memcpy(bias_data_, ori_bias, output_channel * sizeof(int32_t));
@@ -225,7 +137,7 @@ int ConvolutionInt8CPUKernel::InitWeightBiasOpt() {

size_t input_sum_size;
if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) {
input_sum_size = oc8 * C8NUM * tile_num_ * thread_count_ * sizeof(int32_t);
input_sum_size = up_round_oc * tile_num_ * thread_count_ * sizeof(int32_t);
} else {
input_sum_size = tile_num_ * thread_count_ * sizeof(int32_t);
}
@@ -241,14 +153,19 @@ int ConvolutionInt8CPUKernel::InitWeightBiasOpt() {
int ConvolutionInt8CPUKernel::InitTmpBufferOpt() {
MS_ASSERT(ctx_->allocator != nullptr);
int kernel_plane = conv_param_->kernel_h_ * conv_param_->kernel_w_;
int tmp_unit = UP_ROUND(kernel_plane * conv_param_->input_channel_, C4NUM);
int tmp_size;
if (support_optimize_) {
tmp_size = UP_ROUND(kernel_plane * conv_param_->input_channel_, C4NUM);
} else {
tmp_size = UP_ROUND(kernel_plane * conv_param_->input_channel_, C16NUM);
}
matmul_packed_input_ = reinterpret_cast<int8_t *>(
ctx_->allocator->Malloc(thread_count_ * tile_num_ * kernel_plane * conv_param_->input_channel_));
if (matmul_packed_input_ == nullptr) {
MS_LOG(ERROR) << "malloc matmul_packed_input_ failed.";
return RET_ERROR;
}
packed_input_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(tmp_unit * thread_count_ * tile_num_));
packed_input_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(tmp_size * thread_count_ * tile_num_));
if (packed_input_ == nullptr) {
MS_LOG(ERROR) << "malloc packed_input_ failed.";
return RET_ERROR;
@@ -263,26 +180,13 @@ int ConvolutionInt8CPUKernel::Init() {
MS_LOG(ERROR) << "Set quant param failed.";
return ret;
}
// init for opt
if (support_optimize_) {
ret = InitWeightBiasOpt();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Initialization for optimized int8 conv failed.";
return RET_ERROR;
}
} else {
ret = SetIfAsymmetric();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Set if per asymmetric failed.";
return ret;
}
  // init for the case where the sdot instruction is not supported
ret = InitWeightBias();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Init weight bias failed.";
return RET_ERROR;
}

ret = InitWeightBiasOpt();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Initialization for optimized int8 conv failed.";
return RET_ERROR;
}

if (!InferShapeDone()) {
return RET_OK;
}
@@ -308,14 +212,9 @@ int ConvolutionInt8CPUKernel::RunImpl(int task_id) {
auto input_tensor = in_tensors_.at(kInputIndex);
auto ori_input_data = reinterpret_cast<int8_t *>(input_tensor->MutableData());
auto output_addr = reinterpret_cast<int8_t *>(out_tensors_.at(kOutputIndex)->MutableData());
if (support_optimize_) {
ConvInt8Opt(ori_input_data, packed_input_, matmul_packed_input_, packed_weight_,
reinterpret_cast<int32_t *>(bias_data_), output_addr, filter_zp_ptr_, input_sum_, task_id, conv_param_,
matmul_func_);
} else {
ConvInt8(ori_input_data, packed_input_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_), tmp_dst_, tmp_out_,
output_addr, input_sum_, task_id, conv_param_);
}
ConvInt8Opt(ori_input_data, packed_input_, matmul_packed_input_, packed_weight_,
reinterpret_cast<int32_t *>(bias_data_), output_addr, filter_zp_ptr_, input_sum_, task_id, conv_param_,
matmul_func_, support_optimize_);
return RET_OK;
}

@@ -330,18 +229,10 @@ int ConvolutionInt8Impl(void *cdata, int task_id) {
}

int ConvolutionInt8CPUKernel::Run() {
if (support_optimize_) {
auto ret = InitTmpBufferOpt();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Init tmp buffer failed.";
return RET_ERROR;
}
} else {
auto ret = InitTmpBuffer();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Init tmp buffer failed.";
return RET_ERROR;
}
auto ret = InitTmpBufferOpt();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Init tmp buffer failed.";
return RET_ERROR;
}

int error_code = ParallelLaunch(this->context_->thread_pool_, ConvolutionInt8Impl, this, thread_count_);
@@ -369,18 +260,7 @@ kernel::LiteKernel *CpuConvInt8KernelCreator(const std::vector<lite::Tensor *> &
int dilation_w = conv_param->dilation_w_;
kernel::LiteKernel *kernel;
if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) {
#ifdef ENABLE_ARM64
void *optimize_op_handler = OptimizeModule::GetInstance()->optimized_op_handler_;
if (optimize_op_handler != nullptr) {
kernel = new (std::nothrow) kernel::ConvolutionInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
} else {
kernel = new (std::nothrow) kernel::Convolution3x3Int8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
}
#elif ENABLE_ARM32
kernel = new (std::nothrow) kernel::Convolution3x3Int8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
#else
kernel = new (std::nothrow) kernel::ConvolutionInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
#endif
} else if (kernel_h == 1 && kernel_w == 1) {
kernel = new (std::nothrow) kernel::Convolution1x1Int8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
} else {


+ 0
- 2
mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h View File

@@ -53,8 +53,6 @@ class ConvolutionInt8CPUKernel : public ConvolutionBaseCPUKernel {
void CheckSupportOptimize();
int InitWeightBiasOpt();
int InitTmpBufferOpt();
int InitWeightBias();
int InitTmpBuffer();

private:
void FreeTmpBuffer() {


+ 0
- 62
mindspore/lite/test/ut/src/runtime/kernel/arm/common/pack_tests.cc View File

@@ -169,68 +169,6 @@ TEST_F(TestPack, PackInputFp16) {
}
#endif

TEST_F(TestPack, PackInputUint8) {
  // Exercises Im2ColPackUnitInt8: loads a uint8 input blob, shifts it into the
  // signed int8 domain, packs it tile by tile, then dumps the first packed
  // values for manual inspection.
  auto conv_param = new ConvParameter;
  InitConvParamPack(conv_param);
  int kernel_h = conv_param->kernel_h_;
  int kernel_w = conv_param->kernel_w_;
  int in_batch = conv_param->input_batch_;
  int in_channel = conv_param->input_channel_;
  int in_h = conv_param->input_h_;
  int in_w = conv_param->input_w_;
  int out_h = conv_param->output_h_;
  int out_w = conv_param->output_w_;

  int thread_count = 1;
  int tile_n = 8;
  int output_count = out_h * out_w;
  int output_tile_count = UP_DIV(output_count, tile_n);

  int inchannel_block = 4;
  int channel_block = UP_DIV(in_channel, inchannel_block);
  int kernel_plane = kernel_h * kernel_w;
  int unit_size = kernel_plane * channel_block * inchannel_block;
  int packed_input_size = output_tile_count * tile_n * unit_size;

  // input
  size_t input_size;
  std::string input_path = "./test_data/conv/convuint8_input_1_28_28_3.bin";
  auto input_data = reinterpret_cast<uint8_t *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
  ASSERT_NE(input_data, nullptr);
  auto int8_input = reinterpret_cast<int8_t *>(malloc(input_size));
  ASSERT_NE(int8_input, nullptr);
  // Shift the unsigned samples into the signed int8 range.
  for (size_t i = 0; i < input_size; i++) {
    int8_input[i] = static_cast<int8_t>(input_data[i] - 128);
  }
  auto packed_input = reinterpret_cast<int8_t *>(malloc(in_batch * packed_input_size));
  ASSERT_NE(packed_input, nullptr);
  memset(packed_input, 0, in_batch * packed_input_size);
  int32_t *input_sum = reinterpret_cast<int32_t *>(malloc(tile_n * thread_count * sizeof(int32_t)));
  ASSERT_NE(input_sum, nullptr);

  for (int b = 0; b < in_batch; b++) {
    int in_batch_offset = b * in_channel * in_h * in_w;
    int gemm_in_batch_offset = b * packed_input_size;
    for (int thread_id = 0; thread_id < output_tile_count; thread_id += thread_count) {
      int start_index = thread_id * tile_n;
      // The last tile may hold fewer than tile_n outputs; clamp to the
      // remaining count (previously this wrongly used output_count - tile_n,
      // which is not the remainder and can even go negative).
      int real_cal_num = (output_count - start_index) < tile_n ? (output_count - start_index) : tile_n;
      int8_t *gemm_input =
        reinterpret_cast<int8_t *>(packed_input) + thread_id * unit_size * tile_n + gemm_in_batch_offset;
      memset(input_sum, 0, tile_n * thread_count * sizeof(int32_t));
      Im2ColPackUnitInt8(int8_input + in_batch_offset, gemm_input, real_cal_num, start_index, input_sum, conv_param);
    }
  }

  printf("==================output data=================\n");
  for (int i = 0; i < 20; i++) {
    std::cout << static_cast<int>(packed_input[i]) << " ,";
  }
  std::cout << std::endl;

  // NOTE(review): assumes ReadFile allocates the buffer with new[] — delete[]
  // is the matching release; plain delete on an array allocation is UB.
  delete[] input_data;
  delete conv_param;
  free(int8_input);
  free(packed_input);
  free(input_sum);
  MS_LOG(INFO) << "TestPackInputUint8 passed";
}

TEST_F(TestPack, PackWeightUint8) {
auto conv_param = new ConvParameter;
InitConvParamPack(conv_param);


Loading…
Cancel
Save