| @@ -29,7 +29,7 @@ mov x6, x1 | |||||
| mov x7, x2 | mov x7, x2 | ||||
| mov x8, x4 | mov x8, x4 | ||||
| LoopInputDepth16In: | |||||
| LoopDepth16In: | |||||
| cmp x8, #16 | cmp x8, #16 | ||||
| blt L4 | blt L4 | ||||
| sub x8, x8, #16 | sub x8, x8, #16 | ||||
| @@ -39,8 +39,8 @@ mov x8, x4 | |||||
| ld1 {v16.4s, v17.4s}, [x0], #32 | ld1 {v16.4s, v17.4s}, [x0], #32 | ||||
| cmp x8, #16 | cmp x8, #16 | ||||
| blt LoopInputDepth16Out | |||||
| LoopInputDepth16: | |||||
| blt LoopDepth16Out | |||||
| LoopDepth16: | |||||
| fmla v16.4s, v0.4s, v2.4s | fmla v16.4s, v0.4s, v2.4s | ||||
| fmla v17.4s, v1.4s, v3.4s | fmla v17.4s, v1.4s, v3.4s | ||||
| @@ -61,9 +61,9 @@ mov x8, x4 | |||||
| sub x8, x8, #16 | sub x8, x8, #16 | ||||
| cmp x8, #16 | cmp x8, #16 | ||||
| bge LoopInputDepth16 | |||||
| bge LoopDepth16 | |||||
| LoopInputDepth16Out: | |||||
| LoopDepth16Out: | |||||
| fmla v16.4s, v0.4s, v2.4s | fmla v16.4s, v0.4s, v2.4s | ||||
| fmla v17.4s, v1.4s, v3.4s | fmla v17.4s, v1.4s, v3.4s | ||||
| st1 {v16.4s, v17.4s}, [x9], #32 | st1 {v16.4s, v17.4s}, [x9], #32 | ||||
| @@ -81,7 +81,7 @@ mov x8, x4 | |||||
| cmp x8, #4 | cmp x8, #4 | ||||
| blt L0 | blt L0 | ||||
| LoopInputDepth4: | |||||
| LoopDepth4: | |||||
| ld1 {v0.4s}, [x6], #16 | ld1 {v0.4s}, [x6], #16 | ||||
| ld1 {v2.4s}, [x7], #16 | ld1 {v2.4s}, [x7], #16 | ||||
| ld1 {v16.4s}, [x0], #16 | ld1 {v16.4s}, [x0], #16 | ||||
| @@ -89,13 +89,13 @@ mov x8, x4 | |||||
| st1 {v16.4s}, [x9], #16 | st1 {v16.4s}, [x9], #16 | ||||
| sub x8, x8, #4 | sub x8, x8, #4 | ||||
| cmp x8, #4 | cmp x8, #4 | ||||
| bge LoopInputDepth4 | |||||
| bge LoopDepth4 | |||||
| L0: | L0: | ||||
| cmp x8, #0 | cmp x8, #0 | ||||
| beq Loop16LineEnd | beq Loop16LineEnd | ||||
| LoopInputDepth0: | |||||
| LoopDepth0: | |||||
| ldr s0, [x6], #4 | ldr s0, [x6], #4 | ||||
| ldr s1, [x7], #4 | ldr s1, [x7], #4 | ||||
| ldr s2, [x0], #4 | ldr s2, [x0], #4 | ||||
| @@ -103,7 +103,7 @@ mov x8, x4 | |||||
| fadd s2, s2, s0 | fadd s2, s2, s0 | ||||
| str s2, [x9], #4 | str s2, [x9], #4 | ||||
| subs x8, x8, #1 | subs x8, x8, #1 | ||||
| bne LoopInputDepth0 | |||||
| bne LoopDepth0 | |||||
| Loop16LineEnd: | Loop16LineEnd: | ||||
| @@ -0,0 +1,169 @@ | |||||
#ifdef __aarch64__
    .text
    .align 5
    .global ConvDwInt8PostAlign4
#ifndef __APPLE__
    .type ConvDwInt8PostAlign4, %function
#endif

// void ConvDwInt8PostAlign4(int8_t *dst, int32_t *buffer, int num_pixels, int32_t output_zp, int32_t out_multiplier,
//                           int32_t left_shift, int32_t right_shift, int32_t acc_min, int32_t acc_max);
// Requantizes int32 accumulators (buffer) to int8 (dst): saturating left shift,
// fixed-point multiply, rounding right shift, add zero point, clamp, narrow.
// num_pixels must be a multiple of 4 (caller aligns it).
// x0: dst, x1: buffer, x2: num_pixels, x3: output_zp, x4: out_multiplier,
// x5: left_shift, x6: right_shift, x7: acc_min, x8: acc_max
ConvDwInt8PostAlign4:
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
    // whereas our coding style do not permit such amount of parameters
    ldr x8, [sp]                  // 9th argument (acc_max) is passed on the stack

    dup v26.4s, w5                // left_shift
    dup v27.4s, w4                // out_multiplier
    dup v28.4s, w6                // right_shift (negative shift used by srshl)
    dup v29.4s, w3                // output_zp
    dup v30.4s, w7                // acc_min
    dup v31.4s, w8                // acc_max

    cmp x2, 16
    blt LoopDepth8

    LoopDepth16:
        ld1 {v0.4s}, [x1], #16
        ld1 {v1.4s}, [x1], #16
        ld1 {v2.4s}, [x1], #16
        ld1 {v3.4s}, [x1], #16

        sqshl v0.4s, v0.4s, v26.4s
        sqshl v1.4s, v1.4s, v26.4s
        sqshl v2.4s, v2.4s, v26.4s
        sqshl v3.4s, v3.4s, v26.4s

        sqrdmulh v0.4s, v0.4s, v27.4s
        sqrdmulh v1.4s, v1.4s, v27.4s
        sqrdmulh v2.4s, v2.4s, v27.4s
        sqrdmulh v3.4s, v3.4s, v27.4s

        // round-to-nearest correction for the arithmetic right shift
        and v16.16b, v28.16b, v0.16b
        sshr v16.4s, v16.4s, #31
        sqadd v0.4s, v0.4s, v16.4s
        srshl v0.4s, v0.4s, v28.4s
        and v17.16b, v28.16b, v1.16b
        sshr v17.4s, v17.4s, #31
        sqadd v1.4s, v1.4s, v17.4s
        srshl v1.4s, v1.4s, v28.4s
        and v18.16b, v28.16b, v2.16b
        sshr v18.4s, v18.4s, #31
        sqadd v2.4s, v2.4s, v18.4s
        srshl v2.4s, v2.4s, v28.4s
        and v19.16b, v28.16b, v3.16b
        sshr v19.4s, v19.4s, #31
        sqadd v3.4s, v3.4s, v19.4s
        srshl v3.4s, v3.4s, v28.4s

        add v0.4s, v0.4s, v29.4s
        add v1.4s, v1.4s, v29.4s
        add v2.4s, v2.4s, v29.4s
        add v3.4s, v3.4s, v29.4s

        smax v0.4s, v0.4s, v30.4s
        smax v1.4s, v1.4s, v30.4s
        smax v2.4s, v2.4s, v30.4s
        smax v3.4s, v3.4s, v30.4s

        smin v0.4s, v0.4s, v31.4s
        smin v1.4s, v1.4s, v31.4s
        smin v2.4s, v2.4s, v31.4s
        smin v3.4s, v3.4s, v31.4s

        // narrow int32 -> int16 -> int8 with saturation
        sqxtn v0.4h, v0.4s
        sqxtn v1.4h, v1.4s
        sqxtn v2.4h, v2.4s
        sqxtn v3.4h, v3.4s
        sqxtn v0.8b, v0.8h
        sqxtn v1.8b, v1.8h
        sqxtn v2.8b, v2.8h
        sqxtn v3.8b, v3.8h

        st1 {v0.s}[0], [x0], #4
        st1 {v1.s}[0], [x0], #4
        st1 {v2.s}[0], [x0], #4
        st1 {v3.s}[0], [x0], #4

        sub x2, x2, #16
        cmp x2, #16
        bge LoopDepth16

    LoopDepth8:
        cmp x2, #8
        blt LoopDepth4
        ld1 {v0.4s}, [x1], #16
        ld1 {v1.4s}, [x1], #16

        sqshl v0.4s, v0.4s, v26.4s
        sqshl v1.4s, v1.4s, v26.4s
        sqrdmulh v0.4s, v0.4s, v27.4s
        sqrdmulh v1.4s, v1.4s, v27.4s

        and v16.16b, v28.16b, v0.16b
        sshr v16.4s, v16.4s, #31
        sqadd v0.4s, v0.4s, v16.4s
        srshl v0.4s, v0.4s, v28.4s
        and v17.16b, v28.16b, v1.16b
        sshr v17.4s, v17.4s, #31
        sqadd v1.4s, v1.4s, v17.4s
        srshl v1.4s, v1.4s, v28.4s

        add v0.4s, v0.4s, v29.4s
        add v1.4s, v1.4s, v29.4s
        smax v0.4s, v0.4s, v30.4s
        smax v1.4s, v1.4s, v30.4s
        smin v0.4s, v0.4s, v31.4s
        smin v1.4s, v1.4s, v31.4s

        sqxtn v0.4h, v0.4s
        sqxtn v1.4h, v1.4s
        sqxtn v0.8b, v0.8h
        sqxtn v1.8b, v1.8h

        st1 {v0.s}[0], [x0], #4
        st1 {v1.s}[0], [x0], #4

        sub x2, x2, #8
        cmp x2, #8
        bge LoopDepth8

    LoopDepth4:
        cmp x2, #4
        blt End
        ld1 {v0.4s}, [x1], #16

        sqshl v0.4s, v0.4s, v26.4s
        sqrdmulh v0.4s, v0.4s, v27.4s

        and v16.16b, v28.16b, v0.16b
        sshr v16.4s, v16.4s, #31
        sqadd v0.4s, v0.4s, v16.4s
        srshl v0.4s, v0.4s, v28.4s

        add v0.4s, v0.4s, v29.4s
        smax v0.4s, v0.4s, v30.4s
        smin v0.4s, v0.4s, v31.4s

        sqxtn v0.4h, v0.4s
        sqxtn v0.8b, v0.8h

        st1 {v0.s}[0], [x0], #4

        sub x2, x2, #4
        cmp x2, #4              // fix: 'sub' does not set flags; refresh NZCV before the
        bge LoopDepth4          // conditional branch (matches the other loop epilogues)

    End:
    ret
#endif
| @@ -0,0 +1,122 @@ | |||||
#ifdef __aarch64__
    .text
    .align 5
    .global ConvDwInt8Row
#ifndef __APPLE__
    .type ConvDwInt8Row, %function
#endif

// void ConvDwInt8Row(int32_t *output_ptr, const int8_t *input_ptr, const int16_t *weight_ptr, int num_pixels,
//                    int output_channel, int input_step, int8_t input_zp)
// Accumulates (input - input_zp) * weight into the int32 row buffer, one output
// pixel per outer iteration.
// x0: output_ptr, x1: input_ptr, x2: weight_ptr, x3: num_pixels,
// x4: output_channel, x5: input_step, x6: input_zp
ConvDwInt8Row:
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
    // whereas our coding style do not permit such amount of parameters
    cmp x3, #0
    beq End

    mov x10, x0                   // x10 writes back, x0 reads ahead (software pipeline)
    dup v31.8b, w6                // broadcast input zero point

    LoopOutPixel:
        mov x7, x1                // input cursor for this pixel
        mov x8, x2                // weight cursor
        mov x9, x4                // channels remaining

        LoopDepth16In:
            cmp x9, #16
            blt L8
            sub x9, x9, #16

            // prologue: load first 16 channels and start the low half
            ld1 {v0.8b, v1.8b}, [x7], #16
            ld1 {v2.8h, v3.8h}, [x8], #32
            ld1 {v16.4s, v17.4s}, [x0], #32

            ssubl v20.8h, v0.8b, v31.8b
            smlal v16.4s, v20.4h, v2.4h
            smlal2 v17.4s, v20.8h, v2.8h

            cmp x9, #16
            blt LoopDepth16Out
            LoopDepth16:
                // finish previous high half while the next block loads
                st1 {v16.4s, v17.4s}, [x10], #32
                ld1 {v18.4s, v19.4s}, [x0], #32
                ssubl v21.8h, v1.8b, v31.8b
                smlal v18.4s, v21.4h, v3.4h
                smlal2 v19.4s, v21.8h, v3.8h
                st1 {v18.4s, v19.4s}, [x10], #32

                ld1 {v0.8b, v1.8b}, [x7], #16
                ld1 {v2.8h, v3.8h}, [x8], #32
                ld1 {v16.4s, v17.4s}, [x0], #32
                ssubl v20.8h, v0.8b, v31.8b
                smlal v16.4s, v20.4h, v2.4h
                smlal2 v17.4s, v20.8h, v2.8h

                sub x9, x9, #16
                cmp x9, #16
                bge LoopDepth16

            LoopDepth16Out:
                // epilogue: drain the in-flight high half
                st1 {v16.4s, v17.4s}, [x10], #32
                ld1 {v18.4s, v19.4s}, [x0], #32
                ssubl v21.8h, v1.8b, v31.8b
                smlal v18.4s, v21.4h, v3.4h
                smlal2 v19.4s, v21.8h, v3.8h
                st1 {v18.4s, v19.4s}, [x10], #32

        L8:
            cmp x9, #8
            blt L0

            LoopDepth8:
                ld1 {v0.8b}, [x7], #8
                ld1 {v2.8h}, [x8], #16
                ld1 {v16.4s, v17.4s}, [x0], #32
                ssubl v20.8h, v0.8b, v31.8b
                smlal v16.4s, v20.4h, v2.4h
                smlal2 v17.4s, v20.8h, v2.8h
                st1 {v16.4s, v17.4s}, [x10], #32
                sub x9, x9, #8
                cmp x9, #8
                bge LoopDepth8

        L0:
            cmp x9, #0
            beq Loop16LineEnd

            LoopDepth0:
                ldrsb w14, [x7], #1
                ldrsh w15, [x8], #2
                ldr w16, [x0], #4
                sub w14, w14, w6      // fix: subtract input_zp ('add' disagreed with the
                                      // ssubl used by the vector paths and the C reference)
                sxth w14, w14
                madd w14, w14, w15, w16
                str w14, [x10], #4
                subs x9, x9, #1
                bne LoopDepth0

        Loop16LineEnd:
            subs x3, x3, #1
            add x1, x1, x5            // step to the next input pixel
            bne LoopOutPixel

End:
    ret
#endif
| @@ -49,6 +49,10 @@ void ConvDwInt8Center(int8_t *dst, const int16_t *src, const int16_t *weight, co | |||||
| size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, | size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, | ||||
| size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, int out_multiplier, | size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, int out_multiplier, | ||||
| int left_shift, int right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max); | int left_shift, int right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max); | ||||
| void ConvDwInt8Row(int32_t *output_ptr, const int8_t *input_ptr, const int16_t *weight_ptr, int num_pixels, | |||||
| int output_channel, int input_step, int8_t input_zp); | |||||
| void ConvDwInt8PostAlign4(int8_t *dst, int32_t *buffer, int num_pixels, int32_t output_zp, int32_t out_multiplier, | |||||
| int32_t left_shift, int32_t right_shift, int32_t acc_min, int32_t acc_max); | |||||
| #endif | #endif | ||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||
| @@ -20,6 +20,99 @@ | |||||
| #include "nnacl/int8/common_func.h" | #include "nnacl/int8/common_func.h" | ||||
| /*conv depthwise int8 begin*/ | /*conv depthwise int8 begin*/ | ||||
// only support perlayer
#ifndef ENABLE_ARM64
// C reference for the aarch64 ConvDwInt8Row kernel: for each of num_pixels
// output pixels, accumulate (input - input_zp) * weight into output_ptr for
// every channel. input_step is the element stride between consecutive pixels.
void ConvDwInt8Row(int32_t *output_ptr, const int8_t *input_ptr, const int16_t *weight_ptr, int num_pixels,
                   int output_channel, int input_step, int8_t input_zp) {
  for (int pixel = 0; pixel < num_pixels; pixel++) {
    for (int ch = 0; ch < output_channel; ch++) {
      int16_t adjusted = (int16_t)(input_ptr[ch] - input_zp);
      *output_ptr += adjusted * weight_ptr[ch];
      output_ptr++;
    }
    input_ptr += input_step;
  }
}
#endif
// Requantize a row of int32 accumulators to int8: fixed-point multiply plus
// rounding shifts (gemmlowp scheme), add the output zero point and clamp to
// [acc_min, acc_max]. buffer is updated in place as well as copied to dst.
void ConvDwInt8Post(int8_t *dst, int32_t *buffer, int num_pixels, int32_t output_zp, int32_t out_multiplier,
                    int32_t left_shift, int32_t right_shift, int32_t acc_min, int32_t acc_max) {
  int align_num = 0;
#ifdef ENABLE_ARM64
  // The assembly kernel processes groups of 4; the scalar loop finishes the tail.
  align_num = num_pixels / 4 * 4;
  ConvDwInt8PostAlign4(dst, buffer, align_num, output_zp, out_multiplier, left_shift, right_shift, acc_min, acc_max);
#endif
  for (int i = align_num; i < num_pixels; i++) {
    int32_t scaled = RoundingDivideByPOT(
      SaturatingRoundingDoublingHighMul(buffer[i] * (1 << (unsigned int)left_shift), out_multiplier), -right_shift);
    scaled += output_zp;
    scaled = MSMAX(scaled, acc_min);
    scaled = MSMIN(scaled, acc_max);
    buffer[i] = scaled;
    dst[i] = (int8_t)scaled;
  }
}
| void ConvDwInt8(int8_t *output_data, int32_t *row_buffer, const int8_t *input_data, const int16_t *weight_data, | |||||
| const int32_t *bias_data, const ConvParameter *conv_param, int task_id) { | |||||
| int h_step = UP_DIV(conv_param->output_h_, conv_param->thread_num_); | |||||
| int h_start = h_step * task_id; | |||||
| int h_end = MSMIN(h_start + h_step, conv_param->output_h_); | |||||
| int out_multiplier = conv_param->conv_quant_arg_.quant_multiplier_[0]; | |||||
| int left_shift = conv_param->conv_quant_arg_.left_shift_[0]; | |||||
| int right_shift = conv_param->conv_quant_arg_.right_shift_[0]; | |||||
| int intput_zp = conv_param->conv_quant_arg_.input_quant_args_[0].zp_; | |||||
| int output_zp = conv_param->conv_quant_arg_.output_quant_args_[0].zp_; | |||||
| int acc_min = conv_param->conv_quant_arg_.out_act_min_[0]; | |||||
| int acc_max = conv_param->conv_quant_arg_.out_act_max_[0]; | |||||
| for (int b = 0; b < conv_param->output_batch_; b++) { | |||||
| const int8_t *src = input_data + b * conv_param->input_h_ * conv_param->input_w_ * conv_param->input_channel_; | |||||
| int8_t *dst = output_data + b * conv_param->output_h_ * conv_param->output_w_ * conv_param->output_channel_; | |||||
| for (int oh = h_start; oh < h_end; oh++) { | |||||
| int8_t *dst_data = dst + oh * conv_param->output_w_ * conv_param->output_channel_; | |||||
| int ih_origin = oh * conv_param->stride_h_ - conv_param->pad_u_; | |||||
| int start_kh = MSMAX(0, UP_DIV(-ih_origin, conv_param->dilation_h_)); | |||||
| int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih_origin, conv_param->dilation_h_)); | |||||
| // init acc | |||||
| for (int ow = 0; ow < conv_param->output_w_; ow++) { | |||||
| memcpy(row_buffer + ow * conv_param->output_channel_, bias_data, conv_param->output_channel_ * sizeof(int32_t)); | |||||
| } | |||||
| for (int kh = start_kh; kh < end_kh; kh++) { | |||||
| int ih = ih_origin + conv_param->dilation_w_ * kh; | |||||
| const int8_t *src_kh = src + ih * conv_param->input_w_ * conv_param->input_channel_; | |||||
| const int16_t *weight_kh = weight_data + kh * conv_param->kernel_w_ * conv_param->output_channel_; | |||||
| int in_sw_step = conv_param->stride_w_ * conv_param->input_channel_; | |||||
| for (int kw = 0; kw < conv_param->kernel_w_; kw++) { | |||||
| int out_w_start = MSMAX( | |||||
| 0, (conv_param->pad_l_ - conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) / conv_param->stride_w_); | |||||
| int out_w_end = MSMIN(conv_param->output_w_, (conv_param->input_w_ + conv_param->pad_l_ - | |||||
| conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) / | |||||
| conv_param->stride_w_); | |||||
| int32_t *acc_w = row_buffer + out_w_start * conv_param->output_channel_; | |||||
| int iw_origin = (out_w_start * conv_param->stride_w_) - conv_param->pad_l_ + conv_param->dilation_w_ * kw; | |||||
| const int8_t *src_kw = src_kh + iw_origin * conv_param->input_channel_; | |||||
| int num_pixels = out_w_end - out_w_start; | |||||
| ConvDwInt8Row(acc_w, src_kw, weight_kh, num_pixels, conv_param->output_channel_, in_sw_step, intput_zp); | |||||
| weight_kh += conv_param->output_channel_; | |||||
| } | |||||
| } | |||||
| // post func, acc int32 -> dst int8 | |||||
| ConvDwInt8Post(dst_data, row_buffer, conv_param->output_w_ * conv_param->output_channel_, output_zp, | |||||
| out_multiplier, left_shift, right_shift, acc_min, acc_max); | |||||
| } | |||||
| } | |||||
| } | |||||
| /*conv depthwise int8 end*/ | |||||
| /*conv depthwise sliding window int8 begin*/ | |||||
| void DepthwiseBorderPixelInt8(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, int height, | void DepthwiseBorderPixelInt8(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, int height, | ||||
| int width, int in_kh_step, int in_kw_step, int kernel_w, int *out_multiplier, | int width, int in_kh_step, int in_kw_step, int kernel_w, int *out_multiplier, | ||||
| int *left_shift, int *right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max, | int *left_shift, int *right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max, | ||||
| @@ -153,8 +246,8 @@ void DepthwiseCenterInt8(int8_t *dst, const int16_t *src, const int16_t *weight, | |||||
| } | } | ||||
| #endif | #endif | ||||
| void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data, | |||||
| const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id) { | |||||
| void ConvDwSWInt8(int8_t *output_data, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data, | |||||
| const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id) { | |||||
| const int16_t *src = input_data; | const int16_t *src = input_data; | ||||
| int8_t *dst = output_data; | int8_t *dst = output_data; | ||||
| bool per_channel = conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL; | bool per_channel = conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL; | ||||
| @@ -215,7 +308,7 @@ void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *w | |||||
| } // batch loop | } // batch loop | ||||
| // output nhwc4 | // output nhwc4 | ||||
| } | } | ||||
| /*conv depthwise int8 end*/ | |||||
| /*conv depthwise sliding window int8 end*/ | |||||
| /*deconv depthwise int8 begin*/ | /*deconv depthwise int8 begin*/ | ||||
| void DeconvDepthwiseBorderPixelInt8(int32_t *dst, const int16_t *src, const int16_t *weight, int height, int width, | void DeconvDepthwiseBorderPixelInt8(int32_t *dst, const int16_t *src, const int16_t *weight, int height, int width, | ||||
| @@ -23,8 +23,12 @@ | |||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||
| extern "C" { | extern "C" { | ||||
| #endif | #endif | ||||
| void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data, | |||||
| const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id); | |||||
| void ConvDwInt8(int8_t *output_data, int32_t *output_row, const int8_t *input_data, const int16_t *weight_data, | |||||
| const int32_t *bias_data, const ConvParameter *conv_param, int task_id); | |||||
| void ConvDwSWInt8(int8_t *output_data, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data, | |||||
| const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id); | |||||
| void DeconvDwInt8(int8_t *output_data, int32_t *output_buffer, const int16_t *input_data, const int16_t *weight_data, | void DeconvDwInt8(int8_t *output_data, int32_t *output_buffer, const int16_t *input_data, const int16_t *weight_data, | ||||
| const int32_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding, | const int32_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding, | ||||
| @@ -15,6 +15,7 @@ | |||||
| */ | */ | ||||
| #include "src/runtime/kernel/arm/int8/convolution_depthwise_int8.h" | #include "src/runtime/kernel/arm/int8/convolution_depthwise_int8.h" | ||||
| #include "src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.h" | |||||
| #include "schema/model_generated.h" | #include "schema/model_generated.h" | ||||
| #include "src/kernel_registry.h" | #include "src/kernel_registry.h" | ||||
| #include "include/errorcode.h" | #include "include/errorcode.h" | ||||
| @@ -29,10 +30,6 @@ using mindspore::schema::PrimitiveType_DepthwiseConv2D; | |||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
| ConvolutionDepthwiseInt8CPUKernel::~ConvolutionDepthwiseInt8CPUKernel() { | ConvolutionDepthwiseInt8CPUKernel::~ConvolutionDepthwiseInt8CPUKernel() { | ||||
| if (sliding != nullptr) { | |||||
| delete sliding; | |||||
| sliding = nullptr; | |||||
| } | |||||
| if (packed_weight_ != nullptr) { | if (packed_weight_ != nullptr) { | ||||
| free(packed_weight_); | free(packed_weight_); | ||||
| packed_weight_ = nullptr; | packed_weight_ = nullptr; | ||||
| @@ -42,63 +39,44 @@ ConvolutionDepthwiseInt8CPUKernel::~ConvolutionDepthwiseInt8CPUKernel() { | |||||
| int ConvolutionDepthwiseInt8CPUKernel::InitWeightBias() { | int ConvolutionDepthwiseInt8CPUKernel::InitWeightBias() { | ||||
| // init weight, int8 -> int16 | // init weight, int8 -> int16 | ||||
| // o, h, w, i -> o/8, h, w, i, 8; o == group, i == 1 | |||||
| auto weight_tensor = in_tensors_[kWeightIndex]; | auto weight_tensor = in_tensors_[kWeightIndex]; | ||||
| auto origin_weight = reinterpret_cast<int8_t *>(weight_tensor->Data()); | auto origin_weight = reinterpret_cast<int8_t *>(weight_tensor->Data()); | ||||
| int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM); | |||||
| int pack_weight_size = C4NUM * OC4 * weight_tensor->Height() * weight_tensor->Width(); | |||||
| int channel = weight_tensor->Batch(); | |||||
| int pack_weight_size = channel * weight_tensor->Height() * weight_tensor->Width(); | |||||
| auto tmp_weight = reinterpret_cast<int8_t *>(malloc(pack_weight_size * sizeof(int8_t))); | |||||
| if (tmp_weight == nullptr) { | |||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| PackNCHWToNHWCInt8(origin_weight, tmp_weight, 1, weight_tensor->Height() * weight_tensor->Width(), | |||||
| weight_tensor->Batch()); | |||||
| int weight_zp = conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_; | |||||
| packed_weight_ = reinterpret_cast<int16_t *>(malloc(pack_weight_size * sizeof(int16_t))); | packed_weight_ = reinterpret_cast<int16_t *>(malloc(pack_weight_size * sizeof(int16_t))); | ||||
| if (packed_weight_ == nullptr) { | if (packed_weight_ == nullptr) { | ||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | MS_LOG(ERROR) << "Malloc buffer failed."; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| PackDepthwiseInt8Weight(origin_weight, packed_weight_, weight_tensor->Height() * weight_tensor->Width(), | |||||
| weight_tensor->Batch(), &(conv_param_->conv_quant_arg_)); | |||||
| for (int i = 0; i < weight_tensor->ElementsNum(); i++) { | |||||
| packed_weight_[i] = (int16_t)(tmp_weight[i] - weight_zp); | |||||
| } | |||||
| bias_data_ = reinterpret_cast<int32_t *>(malloc(C4NUM * OC4 * sizeof(int32_t))); | |||||
| bias_data_ = reinterpret_cast<int32_t *>(malloc(channel * sizeof(int32_t))); | |||||
| if (bias_data_ == nullptr) { | if (bias_data_ == nullptr) { | ||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | MS_LOG(ERROR) << "Malloc buffer failed."; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| memset(bias_data_, 0, C4NUM * OC4 * sizeof(int32_t)); | |||||
| memset(bias_data_, 0, channel * sizeof(int32_t)); | |||||
| if (in_tensors_.size() == kInputSize2) { | if (in_tensors_.size() == kInputSize2) { | ||||
| auto bias_tensor = in_tensors_.at(kBiasIndex); | auto bias_tensor = in_tensors_.at(kBiasIndex); | ||||
| auto ori_bias = reinterpret_cast<int32_t *>(bias_tensor->Data()); | auto ori_bias = reinterpret_cast<int32_t *>(bias_tensor->Data()); | ||||
| memcpy(bias_data_, ori_bias, bias_tensor->ElementsNum() * sizeof(int32_t)); | memcpy(bias_data_, ori_bias, bias_tensor->ElementsNum() * sizeof(int32_t)); | ||||
| } | } | ||||
| conv_param_->thread_num_ = MSMIN(thread_count_, OC4); | |||||
| return RET_OK; | |||||
| } | |||||
| int ConvolutionDepthwiseInt8CPUKernel::InitBuffer() { | |||||
| int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * | |||||
| UP_DIV(conv_param_->input_channel_, 4); | |||||
| packed_input_ = reinterpret_cast<int16_t *>(context_->allocator->Malloc(pack_input_size * sizeof(int16_t))); | |||||
| if (packed_input_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| if (conv_param_->input_channel_ % C4NUM != 0) { | |||||
| need_align_ = true; | |||||
| int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * | |||||
| UP_DIV(conv_param_->output_channel_, C4NUM); | |||||
| packed_output_ = reinterpret_cast<int8_t *>(context_->allocator->Malloc(pack_output_size * sizeof(int8_t))); | |||||
| if (packed_input_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| } | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| int ConvolutionDepthwiseInt8CPUKernel::Init() { | int ConvolutionDepthwiseInt8CPUKernel::Init() { | ||||
| sliding = new (std::nothrow) SlidingWindowParam; | |||||
| if (sliding == nullptr) { | |||||
| MS_LOG(ERROR) << "new sliding window param."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| if (!InferShapeDone()) { | if (!InferShapeDone()) { | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -107,13 +85,12 @@ int ConvolutionDepthwiseInt8CPUKernel::Init() { | |||||
| int ConvolutionDepthwiseInt8CPUKernel::ReSize() { | int ConvolutionDepthwiseInt8CPUKernel::ReSize() { | ||||
| ConvolutionBaseCPUKernel::Init(); | ConvolutionBaseCPUKernel::Init(); | ||||
| InitSlidingParamConvDw(sliding, conv_param_, C4NUM); | |||||
| auto ret = ConvolutionBaseCPUKernel::SetQuantParam(); | auto ret = ConvolutionBaseCPUKernel::SetQuantParam(); | ||||
| if (ret != RET_OK) { | if (ret != RET_OK) { | ||||
| MS_LOG(ERROR) << "Set quant param failed."; | MS_LOG(ERROR) << "Set quant param failed."; | ||||
| return ret; | return ret; | ||||
| } | } | ||||
| conv_param_->thread_num_ = MSMIN(thread_count_, conv_param_->output_h_); | |||||
| ret = InitWeightBias(); | ret = InitWeightBias(); | ||||
| if (ret != RET_OK) { | if (ret != RET_OK) { | ||||
| MS_LOG(ERROR) << "Depthwise int8 InitWeightBias error!"; | MS_LOG(ERROR) << "Depthwise int8 InitWeightBias error!"; | ||||
| @@ -123,8 +100,9 @@ int ConvolutionDepthwiseInt8CPUKernel::ReSize() { | |||||
| } | } | ||||
| int ConvolutionDepthwiseInt8CPUKernel::Execute(int task_id) { | int ConvolutionDepthwiseInt8CPUKernel::Execute(int task_id) { | ||||
| ConvDwInt8(packed_output_, packed_input_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_), conv_param_, | |||||
| sliding, task_id); | |||||
| auto buffer = row_buffer_ + conv_param_->output_w_ * conv_param_->output_channel_ * task_id; | |||||
| ConvDwInt8(output_ptr_, buffer, input_ptr_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_), conv_param_, | |||||
| task_id); | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -138,6 +116,16 @@ int ConvDwInt8Run(void *cdata, int task_id) { | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| int ConvolutionDepthwiseInt8CPUKernel::InitBuffer() { | |||||
| int output_row_size = conv_param_->thread_num_ * conv_param_->output_w_ * conv_param_->output_channel_; | |||||
| row_buffer_ = reinterpret_cast<int32_t *>(context_->allocator->Malloc(output_row_size * sizeof(float))); | |||||
| if (row_buffer_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| int ConvolutionDepthwiseInt8CPUKernel::Run() { | int ConvolutionDepthwiseInt8CPUKernel::Run() { | ||||
| if (conv_param_->input_channel_ != conv_param_->output_channel_) { | if (conv_param_->input_channel_ != conv_param_->output_channel_) { | ||||
| MS_LOG(ERROR) << "Only support input channel equals output channel."; | MS_LOG(ERROR) << "Only support input channel equals output channel."; | ||||
| @@ -156,13 +144,10 @@ int ConvolutionDepthwiseInt8CPUKernel::Run() { | |||||
| } | } | ||||
| auto input_tensor = in_tensors_.at(kInputIndex); | auto input_tensor = in_tensors_.at(kInputIndex); | ||||
| auto input_addr = reinterpret_cast<int8_t *>(input_tensor->Data()); | |||||
| PackDepthwiseInt8Input(input_addr, packed_input_, conv_param_); | |||||
| input_ptr_ = reinterpret_cast<int8_t *>(input_tensor->Data()); | |||||
| auto output_addr = reinterpret_cast<int8_t *>(out_tensors_.at(kOutputIndex)->Data()); | |||||
| if (!need_align_) { | |||||
| packed_output_ = output_addr; | |||||
| } | |||||
| auto output_tensor = out_tensors_.at(kOutputIndex); | |||||
| output_ptr_ = reinterpret_cast<int8_t *>(output_tensor->Data()); | |||||
| ret = ParallelLaunch(THREAD_POOL_DEFAULT, ConvDwInt8Run, this, conv_param_->thread_num_); | ret = ParallelLaunch(THREAD_POOL_DEFAULT, ConvDwInt8Run, this, conv_param_->thread_num_); | ||||
| if (ret != RET_OK) { | if (ret != RET_OK) { | ||||
| @@ -170,12 +155,7 @@ int ConvolutionDepthwiseInt8CPUKernel::Run() { | |||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| if (need_align_) { | |||||
| PackNHWC4ToNHWCInt8(packed_output_, output_addr, conv_param_->output_batch_, | |||||
| conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); | |||||
| context_->allocator->Free(packed_output_); | |||||
| } | |||||
| context_->allocator->Free(packed_input_); | |||||
| context_->allocator->Free(row_buffer_); | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -186,8 +166,14 @@ kernel::LiteKernel *CpuConvDwInt8KernelCreator(const std::vector<lite::tensor::T | |||||
| const mindspore::lite::PrimitiveC *primitive) { | const mindspore::lite::PrimitiveC *primitive) { | ||||
| MS_ASSERT(opParameter != nullptr); | MS_ASSERT(opParameter != nullptr); | ||||
| MS_ASSERT(desc.type == schema::PrimitiveType_DepthwiseConv2D); | MS_ASSERT(desc.type == schema::PrimitiveType_DepthwiseConv2D); | ||||
| auto kernel = | |||||
| new (std::nothrow) kernel::ConvolutionDepthwiseInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||||
| kernel::LiteKernel *kernel; | |||||
| auto filter_quant_size = inputs[kWeightIndex]->GetQuantParams().size(); | |||||
| if (filter_quant_size == 1) { // per tensor | |||||
| kernel = new (std::nothrow) kernel::ConvolutionDepthwiseInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||||
| } else { // per channel | |||||
| kernel = | |||||
| new (std::nothrow) kernel::ConvolutionDepthwiseSWInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||||
| } | |||||
| if (kernel == nullptr) { | if (kernel == nullptr) { | ||||
| MS_LOG(ERROR) << "kernel is nullptr."; | MS_LOG(ERROR) << "kernel is nullptr."; | ||||
| return nullptr; | return nullptr; | ||||
| @@ -36,15 +36,14 @@ class ConvolutionDepthwiseInt8CPUKernel : public ConvolutionBaseCPUKernel { | |||||
| int Run() override; | int Run() override; | ||||
| int InitWeightBias(); | int InitWeightBias(); | ||||
| int InitBuffer(); | |||||
| int Execute(int task_id); | int Execute(int task_id); | ||||
| private: | private: | ||||
| SlidingWindowParam *sliding = nullptr; | |||||
| int InitBuffer(); | |||||
| int16_t *packed_weight_ = nullptr; | int16_t *packed_weight_ = nullptr; | ||||
| int16_t *packed_input_ = nullptr; | |||||
| int8_t *packed_output_ = nullptr; | |||||
| bool need_align_ = false; | |||||
| int8_t *input_ptr_ = nullptr; | |||||
| int8_t *output_ptr_ = nullptr; | |||||
| int32_t *row_buffer_ = nullptr; | |||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -0,0 +1,182 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include "src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.h" | |||||
| #include "schema/model_generated.h" | |||||
| #include "src/kernel_registry.h" | |||||
| #include "include/errorcode.h" | |||||
| #include "nnacl/int8/conv_depthwise_int8.h" | |||||
| #include "src/runtime/runtime_api.h" | |||||
| using mindspore::kernel::KERNEL_ARCH::kCPU; | |||||
| using mindspore::lite::KernelRegistrar; | |||||
| using mindspore::lite::RET_ERROR; | |||||
| using mindspore::lite::RET_OK; | |||||
| using mindspore::schema::PrimitiveType_DepthwiseConv2D; | |||||
| namespace mindspore::kernel { | |||||
| ConvolutionDepthwiseSWInt8CPUKernel::~ConvolutionDepthwiseSWInt8CPUKernel() { | |||||
| if (sliding != nullptr) { | |||||
| delete sliding; | |||||
| sliding = nullptr; | |||||
| } | |||||
| if (packed_weight_ != nullptr) { | |||||
| free(packed_weight_); | |||||
| packed_weight_ = nullptr; | |||||
| } | |||||
| FreeQuantParam(); | |||||
| } | |||||
| int ConvolutionDepthwiseSWInt8CPUKernel::InitWeightBias() { | |||||
| // init weight, int8 -> int16 | |||||
| // o, h, w, i -> o/8, h, w, i, 8; o == group, i == 1 | |||||
| auto weight_tensor = in_tensors_[kWeightIndex]; | |||||
| auto origin_weight = reinterpret_cast<int8_t *>(weight_tensor->Data()); | |||||
| int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM); | |||||
| int pack_weight_size = C4NUM * OC4 * weight_tensor->Height() * weight_tensor->Width(); | |||||
| packed_weight_ = reinterpret_cast<int16_t *>(malloc(pack_weight_size * sizeof(int16_t))); | |||||
| if (packed_weight_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| PackDepthwiseInt8Weight(origin_weight, packed_weight_, weight_tensor->Height() * weight_tensor->Width(), | |||||
| weight_tensor->Batch(), &(conv_param_->conv_quant_arg_)); | |||||
| bias_data_ = reinterpret_cast<int32_t *>(malloc(C4NUM * OC4 * sizeof(int32_t))); | |||||
| if (bias_data_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| memset(bias_data_, 0, C4NUM * OC4 * sizeof(int32_t)); | |||||
| if (in_tensors_.size() == kInputSize2) { | |||||
| auto bias_tensor = in_tensors_.at(kBiasIndex); | |||||
| auto ori_bias = reinterpret_cast<int32_t *>(bias_tensor->Data()); | |||||
| memcpy(bias_data_, ori_bias, bias_tensor->ElementsNum() * sizeof(int32_t)); | |||||
| } | |||||
| conv_param_->thread_num_ = MSMIN(thread_count_, OC4); | |||||
| return RET_OK; | |||||
| } | |||||
| int ConvolutionDepthwiseSWInt8CPUKernel::InitBuffer() { | |||||
| int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * | |||||
| UP_DIV(conv_param_->input_channel_, 4); | |||||
| packed_input_ = reinterpret_cast<int16_t *>(context_->allocator->Malloc(pack_input_size * sizeof(int16_t))); | |||||
| if (packed_input_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| if (conv_param_->input_channel_ % C4NUM != 0) { | |||||
| need_align_ = true; | |||||
| int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * | |||||
| UP_DIV(conv_param_->output_channel_, C4NUM); | |||||
| packed_output_ = reinterpret_cast<int8_t *>(context_->allocator->Malloc(pack_output_size * sizeof(int8_t))); | |||||
| if (packed_input_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| int ConvolutionDepthwiseSWInt8CPUKernel::Init() { | |||||
| sliding = new (std::nothrow) SlidingWindowParam; | |||||
| if (sliding == nullptr) { | |||||
| MS_LOG(ERROR) << "new sliding window param."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| if (!InferShapeDone()) { | |||||
| return RET_OK; | |||||
| } | |||||
| return ReSize(); | |||||
| } | |||||
| int ConvolutionDepthwiseSWInt8CPUKernel::ReSize() { | |||||
| ConvolutionBaseCPUKernel::Init(); | |||||
| InitSlidingParamConvDw(sliding, conv_param_, C4NUM); | |||||
| auto ret = ConvolutionBaseCPUKernel::SetQuantParam(); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "Set quant param failed."; | |||||
| return ret; | |||||
| } | |||||
| ret = InitWeightBias(); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "Depthwise int8 InitWeightBias error!"; | |||||
| return ret; | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| int ConvolutionDepthwiseSWInt8CPUKernel::Execute(int task_id) { | |||||
| ConvDwSWInt8(packed_output_, packed_input_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_), conv_param_, | |||||
| sliding, task_id); | |||||
| return RET_OK; | |||||
| } | |||||
| int ConvDwSWInt8Run(void *cdata, int task_id) { | |||||
| auto conv_dw_int8 = reinterpret_cast<ConvolutionDepthwiseSWInt8CPUKernel *>(cdata); | |||||
| auto ret = conv_dw_int8->Execute(task_id); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "ConvolutionDepthwiseSWInt8Run error task_id[" << task_id << "] error_code[" << ret << "]"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| int ConvolutionDepthwiseSWInt8CPUKernel::Run() { | |||||
| if (conv_param_->input_channel_ != conv_param_->output_channel_) { | |||||
| MS_LOG(ERROR) << "Only support input channel equals output channel."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| auto ret = Prepare(); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "Prepare failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| ret = InitBuffer(); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "Depthwise int8 ReSize error!"; | |||||
| return ret; | |||||
| } | |||||
| auto input_tensor = in_tensors_.at(kInputIndex); | |||||
| auto input_addr = reinterpret_cast<int8_t *>(input_tensor->Data()); | |||||
| PackDepthwiseInt8Input(input_addr, packed_input_, conv_param_); | |||||
| auto output_addr = reinterpret_cast<int8_t *>(out_tensors_.at(kOutputIndex)->Data()); | |||||
| if (!need_align_) { | |||||
| packed_output_ = output_addr; | |||||
| } | |||||
| ret = ParallelLaunch(THREAD_POOL_DEFAULT, ConvDwSWInt8Run, this, conv_param_->thread_num_); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "ConvDwSWInt8Run error: error_code[" << ret << "]"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| if (need_align_) { | |||||
| PackNHWC4ToNHWCInt8(packed_output_, output_addr, conv_param_->output_batch_, | |||||
| conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); | |||||
| context_->allocator->Free(packed_output_); | |||||
| } | |||||
| context_->allocator->Free(packed_input_); | |||||
| return RET_OK; | |||||
| } | |||||
| } // namespace mindspore::kernel | |||||
| @@ -0,0 +1,51 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_DEPTHWISE_SLIDEWINDOW_INT8_H_ | |||||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_DEPTHWISE_SLIDEWINDOW_INT8_H_ | |||||
| #include <vector> | |||||
| #include "src/lite_kernel.h" | |||||
| #include "src/runtime/kernel/arm/base/convolution_base.h" | |||||
| #include "nnacl/fp32/conv_depthwise.h" | |||||
| namespace mindspore::kernel { | |||||
// Int8 depthwise convolution kernel built on the C4 sliding-window
// implementation (ConvDwSWInt8). The matching creator picks this class over
// ConvolutionDepthwiseInt8CPUKernel for per-channel quantized filters.
class ConvolutionDepthwiseSWInt8CPUKernel : public ConvolutionBaseCPUKernel {
 public:
  ConvolutionDepthwiseSWInt8CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                                      const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
                                      const mindspore::lite::PrimitiveC *primitive)
      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
  ~ConvolutionDepthwiseSWInt8CPUKernel() override;

  int Init() override;
  int ReSize() override;
  int Run() override;
  // Packs int8 weights to the int16 C4 layout and prepares the bias buffer.
  int InitWeightBias();
  // Allocates per-run packed input/output scratch from the context allocator.
  int InitBuffer();
  // Per-thread body invoked via ParallelLaunch.
  int Execute(int task_id);

 private:
  SlidingWindowParam *sliding = nullptr;  // owned; created in Init(), deleted in dtor
  int16_t *packed_weight_ = nullptr;      // owned; malloc'ed in InitWeightBias(), freed in dtor
  int16_t *packed_input_ = nullptr;       // allocator scratch; allocated/freed per Run()
  int8_t *packed_output_ = nullptr;       // scratch when need_align_, else aliases the output tensor
  bool need_align_ = false;               // true when input_channel_ % C4NUM != 0
};
| } // namespace mindspore::kernel | |||||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_DEPTHWISE_SLIDEWINDOW_INT8_H_ | |||||