Merge pull request !23653 from yangruoqi713/codextags/v1.5.0-rc1
| @@ -68,7 +68,7 @@ void LayerNormGammaAndBeta(float *dst, const float *src, const float *gamma_data | |||
| } | |||
| int LayerNorm(const float *src_data, const float *gamma_data, const float *beta_data, float *dst_data, float *out_mean, | |||
| float *out_deno, LayerNormParameter *param, size_t task_id) { | |||
| float *out_deno, const LayerNormParameter *param, size_t task_id) { | |||
| if (src_data == NULL || dst_data == NULL || gamma_data == NULL || beta_data == NULL) { | |||
| return NNACL_NULL_PTR; | |||
| } | |||
| @@ -24,7 +24,7 @@ extern "C" { | |||
| #endif | |||
| int LayerNorm(const float *src_data, const float *gamma_data, const float *beta_data, float *dst_data, float *out_mean, | |||
| float *out_deno, LayerNormParameter *param, size_t task_id); | |||
| float *out_deno, const LayerNormParameter *param, size_t task_id); | |||
| #ifdef __cplusplus | |||
| } | |||
| #endif | |||
| @@ -154,20 +154,23 @@ void UpdataOutput(const float *cell_state, const float *output_gate, float *hidd | |||
| void UpdateLstmGate(float *gate_buffer, const float *input, const float *weight, const float *bias, int row, int deep, | |||
| int col, int col_align, bool is_vec, float *packed_ptr) { | |||
| const float *weight_i = weight; | |||
| const float *bias_i = bias; | |||
| float *gate_i = gate_buffer; | |||
| for (int i = 0; i < 4; i++) { | |||
| const float *weight_i; | |||
| LstmMatMul(gate_i, input, weight_i, bias_i, row, deep, col, col_align, is_vec, packed_ptr); | |||
| #ifdef ENABLE_AVX | |||
| if (is_vec) { | |||
| weight_i = weight + deep * col_align * i; | |||
| weight_i += deep * col_align; | |||
| } else { | |||
| weight_i = weight + deep * col * i; | |||
| weight_i += deep * col; | |||
| } | |||
| #else | |||
| weight_i = weight + deep * col * i; | |||
| weight_i += deep * col; | |||
| #endif | |||
| const float *bias_i = bias + col_align * i; | |||
| float *gate = gate_buffer + row * col * i; | |||
| LstmMatMul(gate, input, weight_i, bias_i, row, deep, col, col_align, is_vec, packed_ptr); | |||
| bias_i += col_align; | |||
| gate_i += row * col; | |||
| } | |||
| } | |||
| @@ -228,7 +231,7 @@ void LstmStepUnit(float *output, float *input_gate, float *forget_gate, float *c | |||
| void LstmUnidirectional(float *output, const float *packed_input, const float *weight_i, const float *weight_h, | |||
| const float *input_bias, const float *state_bias, float *hidden_state, float *cell_state, | |||
| float *buffer[6], const LstmParameter *lstm_param, bool is_backward) { | |||
| float *buffer[7], const LstmParameter *lstm_param, bool is_backward) { | |||
| float *gate = buffer[1]; | |||
| for (int i = 0; i < 4; i++) { | |||
| const float *weight_loop = weight_i + lstm_param->input_size_ * lstm_param->input_col_align_ * i; | |||
| @@ -256,7 +259,7 @@ void LstmUnidirectional(float *output, const float *packed_input, const float *w | |||
| } | |||
| void Lstm(float *output, const float *input, const float *weight_i, const float *weight_h, const float *input_bias, | |||
| const float *state_bias, float *hidden_state, float *cell_state, float *buffer[6], | |||
| const float *state_bias, float *hidden_state, float *cell_state, float *buffer[7], | |||
| const LstmParameter *lstm_param) { | |||
| // forward | |||
| float *packed_input = buffer[0]; | |||
| @@ -36,10 +36,10 @@ int ElementOptMulAcc(const float *input0, const float input1, float *output, con | |||
| void LstmStepUnit(float *output, float *input_gate, float *forget_gate, float *cell_gate, float *output_gate, | |||
| const float *state_weight, const float *state_bias, float *hidden_state, float *cell_state, | |||
| float *buffer[6], const LstmParameter *lstm_param); | |||
| float *buffer[7], const LstmParameter *lstm_param); | |||
| void Lstm(float *output, const float *input, const float *weight_i, const float *weight_h, const float *input_bias, | |||
| const float *state_bias, float *hidden_state, float *cell_state, float *buffer[6], | |||
| const float *state_bias, float *hidden_state, float *cell_state, float *buffer[7], | |||
| const LstmParameter *lstm_param); | |||
| #ifdef __cplusplus | |||
| } | |||
| @@ -34,17 +34,17 @@ void PostFuncInt8C4(const int32_t *in, const int32_t *bias, int8_t *out, size_t | |||
| void ConvDwInt8Row(int32_t *output_ptr, const int8_t *input_ptr, const int16_t *weight_ptr, int num_pixels, | |||
| int output_channel, int input_step, int8_t input_zp); | |||
| void ConvDwInt8PostAlign4PerChannel(int8_t *dst, int32_t *buffer, int channel4, int32_t output_zp, | |||
| int32_t *out_multiplier, int32_t *left_shift, int32_t *right_shift, int32_t acc_min, | |||
| int32_t acc_max); | |||
| const int32_t *out_multiplier, const int32_t *left_shift, | |||
| const int32_t *right_shift, int32_t acc_min, int32_t acc_max); | |||
| void ConvDwInt8PostAlign4(int8_t *dst, int32_t *buffer, int num_pixels, int32_t output_zp, int32_t out_multiplier, | |||
| int32_t left_shift, int32_t right_shift, int32_t acc_min, int32_t acc_max); | |||
| void IndirectGemmInt16to32_8x4(int32_t *dst, const int16_t *src, const int16_t *weight, size_t ksize, size_t ic8, | |||
| size_t oc4, size_t offset); | |||
| void ConvDwInt8Center(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, size_t height, | |||
| size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, | |||
| size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, int8_t *in_zp, | |||
| int32_t *out_zp, int32_t *out_multiplier, int32_t *left_shift, int32_t *right_shift, | |||
| int32_t *acc_min, int32_t *acc_max); | |||
| size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, const int8_t *in_zp, | |||
| const int32_t *out_zp, const int32_t *out_multiplier, const int32_t *left_shift, | |||
| const int32_t *right_shift, const int32_t *acc_min, const int32_t *acc_max); | |||
| void DeconvDwInt8Center(int32_t *dst, const int16_t *src, const int16_t *weight, size_t height, size_t width, | |||
| size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, | |||
| size_t in_sw_step, size_t in_kh_step, size_t in_kw_step); | |||
| @@ -59,8 +59,8 @@ int32x4_t ClacScaledInput(int32x4_t input, int32x4_t left_shift_result_vec, int3 | |||
| #ifdef ENABLE_ARM32 | |||
| void ConvDw3x3Int8BorderPixel(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, int height, | |||
| int width, int in_kh_step, int in_kw_step, int channel, int8_t in_zp, int32_t out_zp, | |||
| int32_t *out_multiplier, int32_t *left_shift, int32_t *right_shift, int32_t acc_min, | |||
| int32_t acc_max, size_t per_channel); | |||
| const int32_t *out_multiplier, const int32_t *left_shift, const int32_t *right_shift, | |||
| int32_t acc_min, int32_t acc_max, size_t per_channel); | |||
| #endif | |||
| #ifdef ENABLE_ARM64 | |||
| @@ -69,23 +69,24 @@ void PostFuncInt8C4Neon64(const int32_t *in, const int32_t *bias, int8_t *out, s | |||
| int32_t zp, int32_t mini, int32_t maxi); | |||
| void ConvDw3x3Int8Neon64(int8_t *output, const int8_t *input, const int16_t *weight, const int32_t *bias, | |||
| int input_col_size, int input_row_size, int channel, int output_h, int output_w, int8_t in_zp, | |||
| int32_t out_zp, int32_t *out_multiplier, int32_t *left_shift, int32_t *right_shift, | |||
| int32_t acc_min, int32_t acc_max, size_t per_channel); | |||
| int32_t out_zp, const int32_t *out_multiplier, const int32_t *left_shift, | |||
| const int32_t *right_shift, int32_t acc_min, int32_t acc_max, size_t per_channel); | |||
| void ConvDw3x3Int8Stride2(int8_t *output, const int8_t *input, const int16_t *weight, const int32_t *bias, | |||
| int input_col_size, int input_row_size, int channel, int output_h, int output_w, int8_t in_zp, | |||
| int32_t out_zp, int32_t *out_multiplier, int32_t *left_shift, int32_t *right_shift, | |||
| int32_t acc_min, int32_t acc_max, size_t per_channel); | |||
| int32_t out_zp, const int32_t *out_multiplier, const int32_t *left_shift, | |||
| const int32_t *right_shift, int32_t acc_min, int32_t acc_max, size_t per_channel); | |||
| void ConvDw3x3Int8Corner(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, size_t in_kh_step, | |||
| size_t in_kw_step, size_t channel, size_t in_zp, size_t out_zp, int32_t *out_multiplier, | |||
| int32_t *left_shift, int32_t *right_shift, size_t acc_min, size_t acc_max, size_t per_channel); | |||
| size_t in_kw_step, size_t channel, size_t in_zp, size_t out_zp, const int32_t *out_multiplier, | |||
| const int32_t *left_shift, const int32_t *right_shift, size_t acc_min, size_t acc_max, | |||
| size_t per_channel); | |||
| void ConvDw3x3Int8Vertical(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, | |||
| size_t in_kh_step, size_t in_kw_step, size_t channel, size_t in_zp, size_t out_zp, | |||
| int32_t *out_multiplier, int32_t *left_shift, int32_t *right_shift, size_t acc_min, | |||
| size_t acc_max, size_t per_channel); | |||
| const int32_t *out_multiplier, const int32_t *left_shift, const int32_t *right_shift, | |||
| size_t acc_min, size_t acc_max, size_t per_channel); | |||
| void ConvDw3x3Int8Horizontal(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, | |||
| size_t in_kh_step, size_t in_kw_step, size_t channel, size_t in_zp, size_t out_zp, | |||
| int32_t *out_multiplier, int32_t *left_shift, int32_t *right_shift, size_t acc_min, | |||
| size_t acc_max, size_t per_channel); | |||
| const int32_t *out_multiplier, const int32_t *left_shift, const int32_t *right_shift, | |||
| size_t acc_min, size_t acc_max, size_t per_channel); | |||
| #endif | |||
| #ifdef __cplusplus | |||
| } | |||
| @@ -33,8 +33,9 @@ void ConvDwInt8Row(int32_t *output_ptr, const int8_t *input_ptr, const int16_t * | |||
| } | |||
| #endif | |||
| void ConvDwInt8Post(int8_t *dst, int32_t *buffer, int output_w, int channel, int32_t output_zp, int32_t *out_multiplier, | |||
| int32_t *left_shift, int32_t *right_shift, int32_t acc_min, int32_t acc_max, bool per_channel) { | |||
| void ConvDwInt8Post(int8_t *dst, int32_t *buffer, int output_w, int channel, int32_t output_zp, | |||
| const int32_t *out_multiplier, const int32_t *left_shift, const int32_t *right_shift, | |||
| int32_t acc_min, int32_t acc_max, bool per_channel) { | |||
| if (per_channel) { | |||
| // support perchannel | |||
| for (int w = 0; w < output_w; w++) { | |||
| @@ -207,8 +208,8 @@ void ConvDw3x3Int8Window(int8_t *output, const int8_t *buffer, const int16_t *we | |||
| void ConvDw3x3Int8Block(int8_t *output, const int8_t *buffer, const int16_t *weight, const int32_t *bias, int start_c, | |||
| int end_c, int col_size, int row_size, int channel, int output_h, int output_w, int8_t in_zp, | |||
| int32_t out_zp, int32_t *out_multiplier, int32_t *left_shift, int32_t *right_shift, | |||
| int32_t acc_min, int32_t acc_max, int stride, bool per_channel) { | |||
| int32_t out_zp, const int32_t *out_multiplier, const int32_t *left_shift, | |||
| const int32_t *right_shift, int32_t acc_min, int32_t acc_max, int stride, bool per_channel) { | |||
| for (; start_c <= end_c - 8; start_c += 8) { | |||
| #ifdef ENABLE_ARM64 | |||
| if (stride == 1) { | |||
| @@ -330,8 +331,8 @@ void ConvDw3x3Int8(int8_t *output_data, int8_t *buffer, const int8_t *input_data | |||
| #ifndef ENABLE_ARM32 | |||
| void ConvDw3x3Int8BorderPixel(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, int height, | |||
| int width, int in_kh_step, int in_kw_step, int channel, int8_t in_zp, int32_t out_zp, | |||
| const int *out_multiplier, const int *left_shift, const int *right_shift, int32_t acc_min, | |||
| int32_t acc_max, bool per_channel) { | |||
| const int *out_multiplier, const int *left_shift, const int *right_shift, | |||
| const int32_t acc_min, const int32_t acc_max, bool per_channel) { | |||
| for (int c = 0; c < channel; c += 8) { | |||
| int tmp_buffer[8]; | |||
| for (int i = 0; i < 8; i++) { | |||
| @@ -385,22 +386,25 @@ void ConvDw3x3Int8BorderPixel(int8_t *dst, const int8_t *src, const int16_t *wei | |||
| #ifndef ENABLE_ARM64 | |||
| void ConvDw3x3Int8Corner(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, int in_kh_step, | |||
| int in_kw_step, int channel, int8_t in_zp, int32_t out_zp, int *out_multiplier, | |||
| int *left_shift, int *right_shift, int32_t acc_min, int32_t acc_max, bool per_channel) { | |||
| int in_kw_step, int channel, int8_t in_zp, int32_t out_zp, const int *out_multiplier, | |||
| const int *left_shift, const int *right_shift, int32_t acc_min, int32_t acc_max, | |||
| bool per_channel) { | |||
| ConvDw3x3Int8BorderPixel(dst, src, weight, bias, 2, 2, in_kh_step, in_kw_step, channel, in_zp, out_zp, out_multiplier, | |||
| left_shift, right_shift, acc_min, acc_max, per_channel); | |||
| } | |||
| void ConvDw3x3Int8Vertical(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, int in_kh_step, | |||
| int in_kw_step, int channel, int8_t in_zp, int32_t out_zp, int *out_multiplier, | |||
| int *left_shift, int *right_shift, int32_t acc_min, int32_t acc_max, bool per_channel) { | |||
| int in_kw_step, int channel, int8_t in_zp, int32_t out_zp, const int *out_multiplier, | |||
| const int *left_shift, const int *right_shift, int32_t acc_min, int32_t acc_max, | |||
| bool per_channel) { | |||
| ConvDw3x3Int8BorderPixel(dst, src, weight, bias, 2, 3, in_kh_step, in_kw_step, channel, in_zp, out_zp, out_multiplier, | |||
| left_shift, right_shift, acc_min, acc_max, per_channel); | |||
| } | |||
| void ConvDw3x3Int8Horizontal(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, int in_kh_step, | |||
| int in_kw_step, int channel, int8_t in_zp, int32_t out_zp, int *out_multiplier, | |||
| int *left_shift, int *right_shift, int32_t acc_min, int32_t acc_max, bool per_channel) { | |||
| int in_kw_step, int channel, int8_t in_zp, int32_t out_zp, const int *out_multiplier, | |||
| const int *left_shift, const int *right_shift, int32_t acc_min, int32_t acc_max, | |||
| bool per_channel) { | |||
| ConvDw3x3Int8BorderPixel(dst, src, weight, bias, 3, 2, in_kh_step, in_kw_step, channel, in_zp, out_zp, out_multiplier, | |||
| left_shift, right_shift, acc_min, acc_max, per_channel); | |||
| } | |||
| @@ -494,9 +498,9 @@ void ConvDw3x3Int8Pad(int8_t *output_data, const int8_t *input_data, const int16 | |||
| /*conv depthwise sliding window perchannel int8 begin*/ | |||
| void ConvDwInt8BorderPixel(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, int height, | |||
| int width, int in_kh_step, int in_kw_step, int kernel_w, int8_t *input_zp, int32_t *out_zp, | |||
| const int *out_multiplier, const int *left_shift, const int *right_shift, int32_t *acc_min, | |||
| int32_t *acc_max) { | |||
| int width, int in_kh_step, int in_kw_step, int kernel_w, const int8_t *input_zp, | |||
| const int32_t *out_zp, const int *out_multiplier, const int *left_shift, | |||
| const int *right_shift, int32_t *acc_min, int32_t *acc_max) { | |||
| int tmp_buffer[C8NUM]; | |||
| for (int i = 0; i < C8NUM; i++) { | |||
| tmp_buffer[i] = 0; | |||
| @@ -531,7 +535,7 @@ void ConvDwInt8BorderPixel(int8_t *dst, const int8_t *src, const int16_t *weight | |||
| void ConvDwInt8Border(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, int top, int bottom, | |||
| int left, int right, const ConvParameter *conv_param, const SlidingWindowParam *sliding, | |||
| int8_t *in_zp, int32_t *out_zp, const int *out_multiplier, const int *left_shift, | |||
| const int8_t *in_zp, const int32_t *out_zp, const int *out_multiplier, const int *left_shift, | |||
| const int *right_shift, int32_t *acc_min, int32_t *acc_max) { | |||
| int8_t *dst_h = dst + top * sliding->out_h_step_; | |||
| for (int oh = top; oh < bottom; oh++) { | |||
| @@ -613,7 +617,7 @@ void ConvDwInt8Center(int8_t *dst, const int8_t *src, const int16_t *weight, con | |||
| #endif | |||
| void ConvDwInt8SW(int8_t *output_data, const int8_t *input_data, const int16_t *weight_data, const int32_t *bias_data, | |||
| int8_t *input_zp, int32_t *output_zp, const ConvParameter *conv_param, | |||
| const int8_t *input_zp, const int32_t *output_zp, const ConvParameter *conv_param, | |||
| const SlidingWindowParam *sliding, int task_id) { | |||
| NNACL_CHECK_ZERO_RETURN(conv_param->dilation_h_); | |||
| NNACL_CHECK_ZERO_RETURN(conv_param->dilation_w_); | |||
| @@ -631,8 +635,8 @@ void ConvDwInt8SW(int8_t *output_data, const int8_t *input_data, const int16_t * | |||
| int *right_shift = conv_param->conv_quant_arg_.right_shift_ + oc * C8NUM; | |||
| int *acc_min = conv_param->conv_quant_arg_.out_act_min_ + oc * C8NUM; | |||
| int *acc_max = conv_param->conv_quant_arg_.out_act_max_ + oc * C8NUM; | |||
| int8_t *in_zp = input_zp + oc * C8NUM; | |||
| int32_t *out_zp = output_zp + oc * C8NUM; | |||
| const int8_t *in_zp = input_zp + oc * C8NUM; | |||
| const int32_t *out_zp = output_zp + oc * C8NUM; | |||
| ConvDwInt8Border(dst_data, src_data, weight, bias, 0, sliding->top_, 0, conv_param->output_w_, conv_param, | |||
| sliding, in_zp, out_zp, out_multiplier, left_shift, right_shift, acc_min, acc_max); | |||
| @@ -35,7 +35,7 @@ void ConvDw3x3Int8(int8_t *output_data, int8_t *buffer, const int8_t *input_data | |||
| int task_id); | |||
| void ConvDwInt8SW(int8_t *output_data, const int8_t *input_data, const int16_t *weight_data, const int32_t *bias_data, | |||
| int8_t *input_zp, int32_t *output_zp, const ConvParameter *conv_param, | |||
| const int8_t *input_zp, const int32_t *output_zp, const ConvParameter *conv_param, | |||
| const SlidingWindowParam *sliding, int task_id); | |||
| void DeconvDwInt8(int8_t *output_data, int32_t *output_buffer, const int16_t *input_data, const int16_t *weight_data, | |||
| @@ -115,7 +115,7 @@ void FastMul(const int8_t *input0_data, const int8_t *input1_data, int8_t *outpu | |||
| } | |||
| #ifdef ENABLE_ARM | |||
| int32x4_t output_multiplier_vec = vdupq_n_s32(quant_arg->output_multiplier_); | |||
| int32x4_t left_shift_out_vec = vdupq_n_s32(1 << quant_arg->shift_left_); | |||
| int32x4_t left_shift_out_vec = vdupq_n_s32(1 << (size_t)quant_arg->shift_left_); | |||
| int32x4_t right_shift_out_vec = vdupq_n_s32(-quant_arg->shift_right_); | |||
| int16x8_t out_zp_vec = vdupq_n_s16(quant_arg->out_quant_arg_.zp_); | |||
| int8x16_t out_min_vec = vdupq_n_s8(quant_arg->output_activation_min_); | |||
| @@ -199,10 +199,10 @@ void FastMul(const int8_t *input0_data, const int8_t *input1_data, int8_t *outpu | |||
| for (; j < depth; ++j) { | |||
| const int32_t input0_val = zp1 + input0_data[j]; | |||
| const int32_t input1_val = zp2 + input1_data[0]; | |||
| int32_t mul_result = | |||
| RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(input0_val * input1_val * (1 << quant_arg->shift_left_), | |||
| quant_arg->output_multiplier_), | |||
| quant_arg->shift_right_); | |||
| int32_t mul_result = RoundingDivideByPOT( | |||
| SaturatingRoundingDoublingHighMul(input0_val * input1_val * (1 << (size_t)quant_arg->shift_left_), | |||
| quant_arg->output_multiplier_), | |||
| quant_arg->shift_right_); | |||
| mul_result += quant_arg->out_quant_arg_.zp_; | |||
| mul_result = mul_result < quant_arg->output_activation_max_ ? mul_result : quant_arg->output_activation_max_; | |||
| @@ -224,10 +224,10 @@ void Mul(const int8_t *input0_data, const int8_t *input1_data, int8_t *output_da | |||
| for (; index < real_dst_count; ++index) { | |||
| const int32_t input0_val = quant_arg->in_quant_args_[0].zp_ + input0_data[index]; | |||
| const int32_t input1_val = quant_arg->in_quant_args_[1].zp_ + input1_data[index]; | |||
| int32_t mul_result = | |||
| RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(input0_val * input1_val * (1 << quant_arg->shift_left_), | |||
| quant_arg->output_multiplier_), | |||
| quant_arg->shift_right_); | |||
| int32_t mul_result = RoundingDivideByPOT( | |||
| SaturatingRoundingDoublingHighMul(input0_val * input1_val * (1 << (size_t)quant_arg->shift_left_), | |||
| quant_arg->output_multiplier_), | |||
| quant_arg->shift_right_); | |||
| mul_result += quant_arg->out_quant_arg_.zp_; | |||
| mul_result = mul_result < quant_arg->output_activation_max_ ? mul_result : quant_arg->output_activation_max_; | |||
| @@ -145,6 +145,7 @@ int LstmFP32Coder::MallocRunBuffer(CoderContext *const context) { | |||
| kNumberTypeFloat32, lstm_param_->batch_ * lstm_param_->hidden_size_ * sizeof(float), kWorkspace)); | |||
| MS_CHECK_PTR(buffer_[5]); | |||
| } | |||
| buffer_[6] = nullptr; | |||
| return RET_OK; | |||
| } | |||
| @@ -44,7 +44,7 @@ class LstmFP32Coder final : public OperatorCoder { | |||
| float *weight_h_ptr_{nullptr}; | |||
| float *input_bias_{nullptr}; | |||
| float *state_bias_{nullptr}; | |||
| float *buffer_[6]; | |||
| float *buffer_[7]; | |||
| int row_tile_{0}; | |||
| int col_tile_{0}; | |||
| int weight_batch_{0}; | |||
| @@ -160,7 +160,7 @@ int NPUSubGraph::BuildNPUInputOp() { | |||
| auto in_tensor = op->inputs()[i]; | |||
| if (IsSubGraphInputTensor(in_tensor)) { | |||
| auto tensor_name = "Input_" + std::to_string(count++) + '_' + op->name(); | |||
| hiai::op::Data *data; | |||
| hiai::op::Data *data = nullptr; | |||
| data = ConverterToNPUData(in_tensor, tensor_name); | |||
| subgraph_input_ops_.push_back(*data); | |||
| input_ops.push_back(data); | |||
| @@ -98,13 +98,11 @@ inline void Transpose8X8Fp32Arm64(const float *src_ptr, float *dst_ptr, int src_ | |||
| void PackNHWCToNCHWFp32(const void *src, void *dst, int batches, int plane, int channel) { | |||
| int hw8 = plane / C8NUM * C8NUM; | |||
| int task_start = 0; | |||
| int task_end = plane; | |||
| int batch = plane * channel; | |||
| for (int n = 0; n < batches; n++) { | |||
| const float *src_batch = (const float *)src + n * batch; | |||
| float *dst_batch = reinterpret_cast<float *>(dst) + n * batch; | |||
| int hw = task_start; | |||
| int hw = 0; | |||
| for (; hw < hw8; hw += C8NUM) { | |||
| int c = 0; | |||
| #ifdef ENABLE_ARM64 | |||
| @@ -122,7 +120,7 @@ void PackNHWCToNCHWFp32(const void *src, void *dst, int batches, int plane, int | |||
| } | |||
| } | |||
| } | |||
| for (; hw < task_end; hw++) { | |||
| for (; hw < plane; hw++) { | |||
| const float *src_ptr = src_batch + hw * channel; | |||
| float *dst_ptr = dst_batch + hw; | |||
| for (size_t i = 0; i < channel; i++) { | |||