| @@ -3,19 +3,19 @@ | |||
| .text | |||
| .align 5 | |||
| .global ConvDw3x3BorderPixelInt8 | |||
| .global ConvDw3x3Int8BorderPixel | |||
| #ifndef __APPLE__ | |||
| .type ConvDw3x3BorderPixelInt8, %function | |||
| .type ConvDw3x3Int8BorderPixel, %function | |||
| #endif | |||
| // void ConvDw3x3BorderPixelInt8(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, size_t height, | |||
| // void ConvDw3x3Int8BorderPixel(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, size_t height, | |||
| // size_t width, size_t in_kh_step, size_t in_kw_step, size_t channel, size_t in_zp, size_t out_zp, | |||
| // size_t out_multiplier, size_t left_shift, size_t right_shift, size_t acc_min, size_t acc_max) { | |||
| // r0: dst, r1: src, r2: weight, r3: bias, r4: height, r5: width, r6: in_kh_step, r7: in_kw_step, | |||
| // r8: channel, r9: in_zp, r10: out_zp, r11: out_multiplier, r12: left_shift, r13: right_shift | |||
| // r14: acc_min, r15: acc_max | |||
| ConvDw3x3BorderPixelInt8: | |||
| ConvDw3x3Int8BorderPixel: | |||
| // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr" | |||
| // according to https://stackoverflow.com/questions/53625807 | |||
| // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway | |||
| @@ -0,0 +1,74 @@ | |||
| #ifdef __arm__ | |||
| #ifndef __aarch64__ | |||
| .text | |||
| .align 5 | |||
| .global DeconvDwInt8Post | |||
| #ifndef __APPLE__ | |||
| .type DeconvDwInt8Post, %function | |||
| #endif | |||
| // void DeconvDwInt8Post(int8_t *dst, int32_t *output_buffer, const int32_t *bias, int block_channel, int pixel_nums, | |||
| // int out_multiplier, int left_shift, int right_shift, int32_t out_zp, int32_t acc_min, | |||
| // int32_t acc_max) | |||
| // | |||
| // For each of pixel_nums pixels: loads four int32 accumulators from output_buffer, adds bias[0..3], shifts left by | |||
| // left_shift, applies a saturating rounding doubling high multiply by out_multiplier, applies a rounding shift by | |||
| // right_shift (VRSHL shifts right when the count is negative), adds out_zp, clamps to [acc_min, acc_max], narrows | |||
| // to int8 and stores 4 bytes at dst. dst then advances by block_channel bytes, output_buffer by 16 bytes. | |||
| // | |||
| // Argument locations at entry (AAPCS32: only the first four arguments are passed in registers): | |||
| // r0: dst, r1: output_buffer, r2: bias, r3: block_channel | |||
| // [sp]: pixel_nums, [sp, #4]: out_multiplier, [sp, #8]: left_shift, [sp, #12]: right_shift, | |||
| // [sp, #16]: out_zp, [sp, #20]: acc_min, [sp, #24]: acc_max | |||
| // Clobbers: q0, q1, q9-q15 (all caller-saved) and flags. | |||
| DeconvDwInt8Post: | |||
| // r4-r8 are callee-saved under AAPCS32 (https://static.docs.arm.com/ihi0042/i/aapcs32.pdf), so they are pushed. | |||
| // sp is left pointing at the saved registers: data below sp may be overwritten asynchronously, so the stack | |||
| // arguments are addressed at +20 (5 pushed words) instead of moving sp back above the save area. | |||
| // q4-q7 are callee-saved as well; this routine avoids them so that no vpush/vpop is needed. | |||
| push {r4-r8} | |||
| ldr r4, [sp, #20] // pixel_nums | |||
| cmp r4, #1 | |||
| blt End // nothing to do for pixel_nums <= 0 (matches the C loop "k < pixel_nums") | |||
| vld1.32 {q9}, [r2] // bias[0..3] | |||
| ldr r5, [sp, #24] | |||
| vdup.32 q14, r5 // out_multiplier | |||
| ldr r6, [sp, #28] | |||
| vdup.32 q13, r6 // left_shift | |||
| ldr r5, [sp, #32] | |||
| vdup.32 q12, r5 // right_shift | |||
| ldr r6, [sp, #36] | |||
| vdup.32 q15, r6 // output_zp | |||
| ldr r7, [sp, #40] | |||
| vdup.32 q11, r7 // acc_min | |||
| ldr r8, [sp, #44] | |||
| vdup.32 q10, r8 // acc_max | |||
| LoopCount: | |||
| mov r8, r0 // r8: byte cursor inside the current pixel | |||
| vld1.32 {q0}, [r1]! | |||
| vadd.i32 q0, q0, q9 // accumulator + bias | |||
| vshl.s32 q0, q0, q13 | |||
| vqrdmulh.s32 q0, q0, q14 | |||
| // rounding fix-up before the rounding shift; q1 is a caller-saved scratch register | |||
| vand q1, q0, q12 | |||
| vshr.s32 q1, q1, #31 | |||
| vqadd.s32 q0, q0, q1 | |||
| vrshl.s32 q0, q0, q12 | |||
| vadd.i32 q0, q0, q15 | |||
| vmax.s32 q0, q0, q11 | |||
| vmin.s32 q0, q0, q10 | |||
| // int32 -> int16 -> int8; after the second narrowing the four results are the low 4 bytes of d0 | |||
| vqmovn.s32 d0, q0 | |||
| vqmovn.s16 d0, q0 | |||
| vst1.8 {d0[0]}, [r8]! | |||
| vst1.8 {d0[1]}, [r8]! | |||
| vst1.8 {d0[2]}, [r8]! | |||
| vst1.8 {d0[3]}, [r8]! | |||
| add r0, r0, r3 // next pixel: dst += block_channel | |||
| subs r4, r4, #1 | |||
| bgt LoopCount | |||
| End: | |||
| pop {r4-r8} | |||
| bx lr | |||
| #endif | |||
| #endif | |||
| @@ -0,0 +1,58 @@ | |||
| #ifdef __aarch64__ | |||
| .text | |||
| .align 5 | |||
| .global DeconvDwInt8Post | |||
| #ifndef __APPLE__ | |||
| .type DeconvDwInt8Post, %function | |||
| #endif | |||
| // void DeconvDwInt8Post(int8_t *dst, int32_t *output_buffer, const int32_t *bias, int block_channel, int pixel_nums, | |||
| // int out_multiplier, int left_shift, int right_shift, int32_t out_zp, int32_t acc_min, | |||
| // int32_t acc_max) | |||
| // | |||
| // For each of pixel_nums pixels: loads four int32 accumulators from output_buffer, adds bias[0..3], applies a | |||
| // saturating left shift by left_shift, a saturating rounding doubling high multiply by out_multiplier and a | |||
| // rounding shift by right_shift (SRSHL shifts right when the count is negative), adds out_zp, clamps to | |||
| // [acc_min, acc_max], narrows to int8 and stores 4 bytes at dst. dst then advances by block_channel bytes, | |||
| // output_buffer by 16 bytes. | |||
| // | |||
| // Argument locations at entry (AAPCS64: the first eight arguments are passed in registers): | |||
| // x0: dst, x1: output_buffer, x2: bias, w3: block_channel, w4: pixel_nums, w5: out_multiplier, | |||
| // w6: left_shift, w7: right_shift, [sp]: out_zp, [sp, #8]: acc_min, [sp, #16]: acc_max | |||
| // NOTE(review): the 8-byte stack slots above follow AAPCS64; Apple's arm64 ABI packs 32-bit stack arguments, | |||
| // so these offsets would need to be confirmed before building for that platform. | |||
| // Clobbers: x3, x4, x8-x10, v0, v16, v25-v31 (all caller-saved) and flags. | |||
| DeconvDwInt8Post: | |||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | |||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | |||
| // x19 ~ x29 should also be preserved; none of them are used here, so nothing is saved. | |||
| cmp w4, #0 | |||
| ble End // nothing to do for pixel_nums <= 0 (matches the C loop "k < pixel_nums") | |||
| // block_channel is a 32-bit int: the upper half of x3 is unspecified on entry, so extend it before it is | |||
| // used as a 64-bit post-index below | |||
| sxtw x3, w3 | |||
| ld1 {v25.4s}, [x2] // bias[0..3] | |||
| dup v26.4s, w6 // left_shift | |||
| dup v27.4s, w5 // out_multiplier | |||
| dup v28.4s, w7 // right_shift | |||
| ldr w8, [sp] | |||
| dup v29.4s, w8 // out_zp | |||
| ldr w9, [sp, #8] | |||
| dup v30.4s, w9 // acc_min | |||
| ldr w10, [sp, #16] | |||
| dup v31.4s, w10 // acc_max | |||
| LoopCount: | |||
| ld1 {v0.4s}, [x1], #16 | |||
| add v0.4s, v0.4s, v25.4s // accumulator + bias | |||
| sqshl v0.4s, v0.4s, v26.4s | |||
| sqrdmulh v0.4s, v0.4s, v27.4s | |||
| // rounding fix-up before the rounding shift | |||
| and v16.16b, v28.16b, v0.16b | |||
| sshr v16.4s, v16.4s, #31 | |||
| sqadd v0.4s, v0.4s, v16.4s | |||
| srshl v0.4s, v0.4s, v28.4s | |||
| add v0.4s, v0.4s, v29.4s | |||
| smax v0.4s, v0.4s, v30.4s | |||
| smin v0.4s, v0.4s, v31.4s | |||
| // int32 -> int16 -> int8; the four results end up in the low 32 bits of v0 | |||
| sqxtn v0.4h, v0.4s | |||
| sqxtn v0.8b, v0.8h | |||
| st1 {v0.s}[0], [x0], x3 // store 4 bytes, then dst += block_channel | |||
| subs w4, w4, #1 // pixel_nums is a 32-bit int: count in w4, not x4 | |||
| bgt LoopCount | |||
| End: | |||
| ret | |||
| #endif | |||
| @@ -47,6 +47,9 @@ void ConvDwInt8Center(int8_t *dst, const int8_t *src, const int16_t *weight, con | |||
| size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, int8_t *in_zp, | |||
| int32_t *out_zp, int32_t *out_multiplier, int32_t *left_shift, int32_t *right_shift, | |||
| int32_t *acc_min, int32_t *acc_max); | |||
| // Post-processing step of the int8 deconvolution depthwise kernel. For each of pixel_nums pixels it takes four | |||
| // int32 accumulators from output_buffer, adds bias[0..3] (as the C fallback does), requantizes the sums with | |||
| // out_multiplier / left_shift / right_shift, adds out_zp, clamps to [acc_min, acc_max] and writes four int8 | |||
| // values to dst. Between pixels dst advances by block_channel bytes and output_buffer by four int32 values. | |||
| // NOTE(review): presumably resolved by the hand-written assembly on ARM builds; the C definition is compiled | |||
| // only when ENABLE_ARM is not defined. | |||
| void DeconvDwInt8Post(int8_t *dst, int32_t *output_buffer, const int32_t *bias, int block_channel, int pixel_nums, | |||
| int out_multiplier, int left_shift, int right_shift, int32_t out_zp, int32_t acc_min, | |||
| int32_t acc_max); | |||
| #endif | |||
| #ifdef ENABLE_ARM32 | |||
| @@ -54,6 +57,9 @@ void IndirectGemmInt8_2x4(int8_t *output, const int8_t *input, const int8_t *wei | |||
| size_t ic4, size_t oc, size_t offset, const int32_t *input_sum, size_t act_min, | |||
| size_t act_max, size_t out_zp, int32_t *out_multiplier, int32_t *shift_before, | |||
| int32_t *shift_after, size_t asymmetric, size_t per_channel, size_t per_channel_offset); | |||
| // Prototype of ConvDw3x3Int8BorderPixel. The C definition of this function is compiled only when ENABLE_ARM32 is | |||
| // not defined. NOTE(review): with ENABLE_ARM32 the call presumably binds to the assembly symbol of the same | |||
| // name; by its name it handles one border pixel of the 3x3 int8 depthwise convolution - confirm against the | |||
| // implementation. | |||
| void ConvDw3x3Int8BorderPixel(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, int height, | |||
| int width, int in_kh_step, int in_kw_step, int channel, int8_t in_zp, int32_t out_zp, | |||
| int out_multiplier, int left_shift, int right_shift, int32_t acc_min, int32_t acc_max); | |||
| #endif | |||
| #ifdef ENABLE_ARM64 | |||
| @@ -302,6 +302,7 @@ void ConvDw3x3Int8(int8_t *output_data, int8_t *buffer, const int8_t *input_data | |||
| } | |||
| } | |||
| #ifndef ENABLE_ARM32 | |||
| void ConvDw3x3Int8BorderPixel(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, int height, | |||
| int width, int in_kh_step, int in_kw_step, int channel, int8_t in_zp, int32_t out_zp, | |||
| int out_multiplier, int left_shift, int right_shift, int32_t acc_min, int32_t acc_max) { | |||
| @@ -338,6 +339,7 @@ void ConvDw3x3Int8BorderPixel(int8_t *dst, const int8_t *src, const int16_t *wei | |||
| } | |||
| } | |||
| } | |||
| #endif | |||
| #ifndef ENABLE_ARM64 | |||
| void ConvDw3x3Int8Corner(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, int in_kh_step, | |||
| @@ -730,12 +732,13 @@ void DeconvDepthwiseCenterInt8(int32_t *dst, const int16_t *src, const int16_t * | |||
| } | |||
| #endif | |||
| void DeconvDepthwisePostFuncInt8(int8_t *dst, int32_t *output_buffer, const int32_t *bias, int block_channel, | |||
| const ConvParameter *conv_param, int out_multiplier, int left_shift, int right_shift, | |||
| int32_t out_zp, int32_t acc_min, int32_t acc_max) { | |||
| #ifndef ENABLE_ARM | |||
| void DeconvDwInt8Post(int8_t *dst, int32_t *output_buffer, const int32_t *bias, int block_channel, int pixel_nums, | |||
| int out_multiplier, int left_shift, int right_shift, int32_t out_zp, int32_t acc_min, | |||
| int32_t acc_max) { | |||
| int8_t *dst_k = dst; | |||
| int32_t *buffer_k = output_buffer; | |||
| for (int k = 0; k < conv_param->output_h_ * conv_param->output_w_; k++) { | |||
| for (int k = 0; k < pixel_nums; k++) { | |||
| for (int c = 0; c < C4NUM; c++) { | |||
| buffer_k[c] += bias[c]; | |||
| buffer_k[c] = RoundingDivideByPOT( | |||
| @@ -749,6 +752,7 @@ void DeconvDepthwisePostFuncInt8(int8_t *dst, int32_t *output_buffer, const int3 | |||
| buffer_k += C4NUM; | |||
| } | |||
| } | |||
| #endif | |||
| void DeconvDwInt8(int8_t *output_data, int32_t *output_buffer, const int16_t *input_data, const int16_t *weight_data, | |||
| const int32_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding, | |||
| @@ -791,11 +795,11 @@ void DeconvDwInt8(int8_t *output_data, int32_t *output_buffer, const int16_t *in | |||
| sliding->in_sw_step_, sliding->in_kh_step_, sliding->in_kw_step_); | |||
| #endif | |||
| } | |||
| DeconvDepthwisePostFuncInt8( | |||
| dst_data, output_buffer, bias, sliding->block_channel_, conv_param, | |||
| conv_param->conv_quant_arg_.quant_multiplier_[0], conv_param->conv_quant_arg_.left_shift_[0], | |||
| conv_param->conv_quant_arg_.right_shift_[0], conv_param->conv_quant_arg_.output_quant_args_[0].zp_, | |||
| conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0]); | |||
| DeconvDwInt8Post(dst_data, output_buffer, bias, sliding->block_channel_, | |||
| conv_param->output_h_ * conv_param->output_w_, conv_param->conv_quant_arg_.quant_multiplier_[0], | |||
| conv_param->conv_quant_arg_.left_shift_[0], conv_param->conv_quant_arg_.right_shift_[0], | |||
| conv_param->conv_quant_arg_.output_quant_args_[0].zp_, | |||
| conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0]); | |||
| } // output C4 loop | |||
| src += sliding->in_step_; | |||
| dst += sliding->out_step_; | |||