optimization for fp32 winograd on arm32

5 years ago · 7f3582d0f5
--- a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S
+++ b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S
@@ -381,6 +381,8 @@ LoopRow:
        ldr lr, [sp, #20]
        cmp lr, #0
        beq C8DstStep
        cmp lr, #2
        beq WinoDstStep
        mov lr, #4
        ldr r7, [sp, #12] // reload rhs col
        mul lr, lr, r7
@@ -391,6 +393,10 @@ LoopRow:
        ldr lr, [sp, #-40]
        add r2, lr, #128
        str r2, [sp, #-40]
        b NoDstStep
    WinoDstStep:
        add r2, r2, r10
        str r2, [sp, #-40]
    NoDstStep:
        cmp r6, #4
        ble LoopRowEnd
--- a/mindspore/lite/nnacl/assembly/arm64/bias_add.S
+++ b/mindspore/lite/nnacl/assembly/arm64/bias_add.S
@@ -1,82 +0,0 @@

 #ifdef __aarch64__
    .text
    .align 5
    //.p2align 5,,15
    .global BiasAdd
 #ifndef __APPLE__
    .type BiasAdd, %function
 #endif

 // void BiasAdd(const float* bias, float* data, size_t oc4, size_t plan_size)
 //
 // In-place bias add. For each of the oc4 groups, one 4-float bias vector is
 // read from `bias` and added to the next plan_size consecutive 4-float
 // vectors of `data`; the sums are written back over the inputs.
 //
 // In:    x0 = bias, x1 = data (also the store cursor), x2 = oc4, x3 = plan_size
 // Uses:  x5 = load cursor, x6 = vectors left in the current group
 //        v0 = bias, v1-v4 = loaded data, v5/v6 and v16/v17 = sums
 // Only caller-saved registers are written. The second result pair lives in
 // v16/v17 rather than v7/v8 because the low half of v8 is callee-saved
 // under AAPCS64 and this routine has no save/restore sequence.

 BiasAdd:
 cmp x2, #0
 beq BiasAddEnd

 cmp x3, #0
 beq BiasAddEnd

 LoopOc4:
 ld1 {v0.4s}, [x0], #16            // bias for this group
 mov x6, x3                        // x6 = plan_size vectors to process
 mov x5, x1                        // loads run ahead of the stores through x1

 Loop16LineIn:
 cmp x6, #4
 blt L4                            // fewer than 4 vectors: single-vector loop only
 sub x6, x6, #4

 // Prime the pipeline: four vectors loaded, first two already summed.
 ld1 {v1.4s, v2.4s}, [x5], #32

 fadd v5.4s, v0.4s, v1.4s
 fadd v6.4s, v0.4s, v2.4s
 ld1 {v3.4s, v4.4s}, [x5], #32

 cmp x6, #4
 blt Loop16LineOut

 // Steady state: each pass stores four sums and loads the next four vectors.
 Loop16:
 st1 {v5.4s, v6.4s}, [x1], #32
 fadd v16.4s, v0.4s, v3.4s
 fadd v17.4s, v0.4s, v4.4s
 ld1 {v1.4s, v2.4s}, [x5], #32

 st1 {v16.4s, v17.4s}, [x1], #32
 fadd v5.4s, v0.4s, v1.4s
 fadd v6.4s, v0.4s, v2.4s
 ld1 {v3.4s, v4.4s}, [x5], #32

 sub x6, x6, #4
 cmp x6, #4
 bge Loop16

 // Drain: finish the four vectors that are already loaded.
 Loop16LineOut:
 st1 {v5.4s, v6.4s}, [x1], #32
 fadd v16.4s, v0.4s, v3.4s
 fadd v17.4s, v0.4s, v4.4s

 st1 {v16.4s, v17.4s}, [x1], #32

 // Remainder: 0..3 vectors, one per iteration.
 L4:
 cmp x6, #0
 beq Loop16LineEnd
 Loop4:
 ld1 {v1.4s}, [x5], #16
 fadd v2.4s, v1.4s, v0.4s
 subs x6, x6, #1
 st1 {v2.4s}, [x1], #16            // st1 leaves the flags from subs intact
 bne Loop4

 Loop16LineEnd:
 subs x2, x2, #1
 bne LoopOc4

 BiasAddEnd:

 ret
 #endif
--- a/mindspore/lite/nnacl/assembly/arm64/bias_add_relu.S
+++ b/mindspore/lite/nnacl/assembly/arm64/bias_add_relu.S
@@ -1,94 +0,0 @@

 #ifdef __aarch64__
    .text
    .align 5
    //.p2align 5,,15
    .global BiasAddRelu
 #ifndef __APPLE__
    .type BiasAddRelu, %function
 #endif

 // void BiasAddRelu(const float* bias, float* data, size_t oc4, size_t plan_size)
 //
 // In-place max(data + bias, 0). For each of the oc4 groups, one 4-float bias
 // vector is read from `bias` and applied to the next plan_size consecutive
 // 4-float vectors of `data`.
 //
 // In:    x0 = bias, x1 = data (also the store cursor), x2 = oc4, x3 = plan_size
 // Uses:  x5 = load cursor, x6 = vectors left in the current group
 //        v0 = bias, v16 = zeros, v1-v4 = loaded data,
 //        v17/v18 = first result pair, v19/v20 = second result pair

 BiasAddRelu:
 cbz x2, BiasReluDone
 cbz x3, BiasReluDone

 movi v16.4s, #0                   // lower clamp bound

 BiasReluGroup:
 ld1 {v0.4s}, [x0], #16            // bias for this group
 mov x6, x3
 mov x5, x1                        // loads run ahead of the stores through x1

 cmp x6, #4
 blt BiasReluTail                  // fewer than 4 vectors: single-vector loop only
 sub x6, x6, #4

 // Prime the pipeline: four vectors loaded, first two fully processed.
 ld1 {v1.4s, v2.4s}, [x5], #32

 fadd v17.4s, v0.4s, v1.4s
 fadd v18.4s, v0.4s, v2.4s
 ld1 {v3.4s, v4.4s}, [x5], #32

 fmax v17.4s, v17.4s, v16.4s
 fmax v18.4s, v18.4s, v16.4s

 cmp x6, #4
 blt BiasReluDrain

 // Steady state: each pass stores four results and loads the next four vectors.
 BiasReluMain:
 st1 {v17.4s, v18.4s}, [x1], #32
 fadd v19.4s, v0.4s, v3.4s
 fadd v20.4s, v0.4s, v4.4s
 ld1 {v1.4s, v2.4s}, [x5], #32

 fmax v19.4s, v19.4s, v16.4s
 fmax v20.4s, v20.4s, v16.4s
 fadd v17.4s, v0.4s, v1.4s
 fadd v18.4s, v0.4s, v2.4s

 st1 {v19.4s, v20.4s}, [x1], #32
 ld1 {v3.4s, v4.4s}, [x5], #32
 fmax v17.4s, v17.4s, v16.4s
 fmax v18.4s, v18.4s, v16.4s
 sub x6, x6, #4
 cmp x6, #4
 bge BiasReluMain

 // Drain: finish the four vectors that are already loaded.
 BiasReluDrain:
 st1 {v17.4s, v18.4s}, [x1], #32
 fadd v19.4s, v0.4s, v3.4s
 fadd v20.4s, v0.4s, v4.4s

 fmax v19.4s, v19.4s, v16.4s
 fmax v20.4s, v20.4s, v16.4s
 st1 {v19.4s, v20.4s}, [x1], #32

 // Remainder: 0..3 vectors, one per iteration.
 BiasReluTail:
 cbz x6, BiasReluGroupDone
 BiasReluTailLoop:
 ld1 {v1.4s}, [x5], #16
 fadd v1.4s, v1.4s, v0.4s
 fmax v1.4s, v1.4s, v16.4s
 st1 {v1.4s}, [x1], #16
 subs x6, x6, #1
 bne BiasReluTailLoop

 BiasReluGroupDone:
 sub x2, x2, #1
 cbnz x2, BiasReluGroup

 BiasReluDone:
 ret
 #endif
--- a/mindspore/lite/nnacl/assembly/arm64/bias_add_relu6.S
+++ b/mindspore/lite/nnacl/assembly/arm64/bias_add_relu6.S
@@ -1,113 +0,0 @@

 #ifdef __aarch64__
    .text
    .align 5
    //.p2align 5,,15
    .global BiasAddRelu6
 #ifndef __APPLE__
    .type BiasAddRelu6, %function
 #endif

 // void BiasAddRelu6(const float* bias, float* data, size_t oc4, size_t plan_size)
 //
 // In-place min(max(data + bias, 0), 6). For each of the oc4 groups, one
 // 4-float bias vector is read from `bias` and applied to the next plan_size
 // consecutive 4-float vectors of `data`.
 //
 // In:    x0 = bias, x1 = data (also the store cursor), x2 = oc4, x3 = plan_size
 // Uses:  x5 = load cursor, x6 = vectors left in the current group
 //        v0 = bias, v16 = zeros, v17 = 6.0 in every lane, v1-v4 = loaded data,
 //        v18/v19 = first result pair, v20/v21 = second result pair

 BiasAddRelu6:
 cbz x2, BiasRelu6Done
 cbz x3, BiasRelu6Done

 movi v16.4s, #0                   // lower clamp bound
 fmov v17.4s, #6.0                 // upper clamp bound

 BiasRelu6Group:
 ld1 {v0.4s}, [x0], #16            // bias for this group
 mov x6, x3
 mov x5, x1                        // loads run ahead of the stores through x1

 cmp x6, #4
 blt BiasRelu6Tail                 // fewer than 4 vectors: single-vector loop only
 sub x6, x6, #4

 // Prime the pipeline: four vectors loaded, first two summed and floored at 0.
 ld1 {v1.4s, v2.4s}, [x5], #32

 fadd v18.4s, v0.4s, v1.4s
 fadd v19.4s, v0.4s, v2.4s
 ld1 {v3.4s, v4.4s}, [x5], #32

 fmax v18.4s, v18.4s, v16.4s
 fmax v19.4s, v19.4s, v16.4s

 cmp x6, #4
 blt BiasRelu6Drain

 // Steady state: each pass stores four results and loads the next four vectors.
 BiasRelu6Main:
 fmin v18.4s, v18.4s, v17.4s
 fmin v19.4s, v19.4s, v17.4s
 fadd v20.4s, v0.4s, v3.4s
 fadd v21.4s, v0.4s, v4.4s
 ld1 {v1.4s, v2.4s}, [x5], #32

 st1 {v18.4s, v19.4s}, [x1], #32
 fmax v20.4s, v20.4s, v16.4s
 fmax v21.4s, v21.4s, v16.4s
 fadd v18.4s, v0.4s, v1.4s
 fadd v19.4s, v0.4s, v2.4s

 fmin v20.4s, v20.4s, v17.4s
 fmin v21.4s, v21.4s, v17.4s
 fmax v18.4s, v18.4s, v16.4s
 fmax v19.4s, v19.4s, v16.4s
 ld1 {v3.4s, v4.4s}, [x5], #32

 st1 {v20.4s, v21.4s}, [x1], #32

 sub x6, x6, #4
 cmp x6, #4
 bge BiasRelu6Main

 // Drain: finish the four vectors that are already loaded.
 BiasRelu6Drain:
 fmin v18.4s, v18.4s, v17.4s
 fmin v19.4s, v19.4s, v17.4s
 fadd v20.4s, v0.4s, v3.4s
 fadd v21.4s, v0.4s, v4.4s

 fmax v20.4s, v20.4s, v16.4s
 fmax v21.4s, v21.4s, v16.4s
 st1 {v18.4s, v19.4s}, [x1], #32

 fmin v20.4s, v20.4s, v17.4s
 fmin v21.4s, v21.4s, v17.4s

 st1 {v20.4s, v21.4s}, [x1], #32

 // Remainder: 0..3 vectors, one per iteration.
 BiasRelu6Tail:
 cbz x6, BiasRelu6GroupDone
 BiasRelu6TailLoop:
 ld1 {v1.4s}, [x5], #16
 fadd v1.4s, v1.4s, v0.4s
 fmax v1.4s, v1.4s, v16.4s
 fmin v1.4s, v1.4s, v17.4s
 st1 {v1.4s}, [x1], #16
 subs x6, x6, #1
 bne BiasRelu6TailLoop

 BiasRelu6GroupDone:
 sub x2, x2, #1
 cbnz x2, BiasRelu6Group

 BiasRelu6Done:
 ret
 #endif
--- a/mindspore/lite/nnacl/assembly/arm64/relu.S
+++ b/mindspore/lite/nnacl/assembly/arm64/relu.S
@@ -1,73 +0,0 @@

 #ifdef __aarch64__
    .text
    .align 5
    //.p2align 5,,15
    .global Relu
 #ifndef __APPLE__
    .type Relu, %function
 #endif

 // void Relu(float* data, size_t element4)
 //
 // In-place max(data, 0) over element4 consecutive 4-float vectors.
 //
 // In:    x0 = data (also the store cursor), x1 = element4
 // Uses:  x5 = load cursor, v16 = zeros, v1-v4 = loaded data,
 //        v5/v6 and v17/v18 = results
 // Only caller-saved registers are written. The second result pair lives in
 // v17/v18 rather than v7/v8 because the low half of v8 is callee-saved
 // under AAPCS64 and this routine has no save/restore sequence.

 Relu:
 cmp x1, #0
 beq ReluEnd

 dup v16.4s, wzr                   // lower clamp bound

 mov x5, x0                        // loads run ahead of the stores through x0

 Loop16LineIn:
 cmp x1, #4
 blt L4                            // fewer than 4 vectors: single-vector loop only
 sub x1, x1, #4

 // Prime the pipeline: four vectors loaded, first two already clamped.
 ld1 {v1.4s, v2.4s}, [x5], #32

 fmax v5.4s, v16.4s, v1.4s
 fmax v6.4s, v16.4s, v2.4s
 ld1 {v3.4s, v4.4s}, [x5], #32

 cmp x1, #4
 blt Loop16LineOut

 // Steady state: each pass stores four results and loads the next four vectors.
 Loop16:
 st1 {v5.4s, v6.4s}, [x0], #32
 fmax v17.4s, v16.4s, v3.4s
 fmax v18.4s, v16.4s, v4.4s
 ld1 {v1.4s, v2.4s}, [x5], #32

 st1 {v17.4s, v18.4s}, [x0], #32
 fmax v5.4s, v16.4s, v1.4s
 fmax v6.4s, v16.4s, v2.4s
 ld1 {v3.4s, v4.4s}, [x5], #32

 sub x1, x1, #4
 cmp x1, #4
 bge Loop16

 // Drain: finish the four vectors that are already loaded.
 Loop16LineOut:
 st1 {v5.4s, v6.4s}, [x0], #32
 fmax v17.4s, v16.4s, v3.4s
 fmax v18.4s, v16.4s, v4.4s

 st1 {v17.4s, v18.4s}, [x0], #32

 // Remainder: 0..3 vectors, one per iteration.
 L4:
 cmp x1, #0
 beq ReluEnd
 Loop4:
 ld1 {v1.4s}, [x5], #16
 fmax v2.4s, v16.4s, v1.4s         // clamp the vector just loaded (was v0, never loaded)
 subs x1, x1, #1
 st1 {v2.4s}, [x0], #16            // st1 leaves the flags from subs intact
 bne Loop4

 ReluEnd:

 ret
 #endif
--- a/mindspore/lite/nnacl/assembly/arm64/relu6.S
+++ b/mindspore/lite/nnacl/assembly/arm64/relu6.S
@@ -1,89 +0,0 @@

 #ifdef __aarch64__
    .text
    .align 5
    //.p2align 5,,15
    .global Relu6
 #ifndef __APPLE__
    .type Relu6, %function
 #endif

 // void Relu6(float* data, size_t element4)
 //
 // In-place min(max(data, 0), 6) over element4 consecutive 4-float vectors.
 //
 // In:    x0 = data (also the store cursor), x1 = element4
 // Uses:  x5 = load cursor, v16 = zeros, v17 = 6.0 in every lane,
 //        v1-v4 = data, clamped in place before being stored

 Relu6:
 cbz x1, Relu6Done

 movi v16.4s, #0                   // lower clamp bound
 fmov v17.4s, #6.0                 // upper clamp bound

 mov x5, x0                        // loads run ahead of the stores through x0

 cmp x1, #4
 blt Relu6Tail                     // fewer than 4 vectors: single-vector loop only
 sub x1, x1, #4

 // Prime the pipeline: four vectors loaded, first two fully clamped.
 ld1 {v1.4s, v2.4s}, [x5], #32

 fmax v1.4s, v1.4s, v16.4s
 fmax v2.4s, v2.4s, v16.4s
 ld1 {v3.4s, v4.4s}, [x5], #32

 fmin v1.4s, v1.4s, v17.4s
 fmin v2.4s, v2.4s, v17.4s

 cmp x1, #4
 blt Relu6Drain

 // Steady state: each register pair is stored before it is reloaded.
 Relu6Main:
 st1 {v1.4s, v2.4s}, [x0], #32
 fmax v3.4s, v3.4s, v16.4s
 fmax v4.4s, v4.4s, v16.4s
 ld1 {v1.4s, v2.4s}, [x5], #32

 fmin v3.4s, v3.4s, v17.4s
 fmin v4.4s, v4.4s, v17.4s
 fmax v1.4s, v1.4s, v16.4s
 fmax v2.4s, v2.4s, v16.4s

 st1 {v3.4s, v4.4s}, [x0], #32
 ld1 {v3.4s, v4.4s}, [x5], #32
 fmin v1.4s, v1.4s, v17.4s
 fmin v2.4s, v2.4s, v17.4s

 sub x1, x1, #4
 cmp x1, #4
 bge Relu6Main

 // Drain: finish the four vectors that are already loaded.
 Relu6Drain:
 st1 {v1.4s, v2.4s}, [x0], #32
 fmax v3.4s, v3.4s, v16.4s
 fmax v4.4s, v4.4s, v16.4s

 fmin v3.4s, v3.4s, v17.4s
 fmin v4.4s, v4.4s, v17.4s
 st1 {v3.4s, v4.4s}, [x0], #32

 // Remainder: 0..3 vectors, one per iteration.
 Relu6Tail:
 cbz x1, Relu6Done
 Relu6TailLoop:
 ld1 {v1.4s}, [x5], #16
 fmax v1.4s, v1.4s, v16.4s
 fmin v1.4s, v1.4s, v17.4s
 st1 {v1.4s}, [x0], #16
 subs x1, x1, #1
 bne Relu6TailLoop

 Relu6Done:
 ret
 #endif
--- a/mindspore/lite/nnacl/fp32/conv.c
+++ b/mindspore/lite/nnacl/fp32/conv.c
@@ -81,11 +81,7 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_
  int out_w_block = UP_DIV(conv_param->output_w_, out_unit);
  int out_h_block = UP_DIV(conv_param->output_h_, out_unit);
  int output_count = out_w_block * out_h_block;
 #ifdef ENABLE_ARM32
  const int tile_num = 4;
 #else
  const int tile_num = 12;
 #endif
  const int tile_num = C12NUM;
  int output_tile_count = UP_DIV(output_count, tile_num);
  int out_channel = conv_param->output_channel_;
  int oc8 = UP_DIV(out_channel, C8NUM);
@@ -117,7 +113,7 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_
      float *tmp_col_ptr = col_buffer + task_id * col_buffer_offset;
      for (int i = 0; i < input_unit_square; ++i) {
 #ifdef ENABLE_ARM32
        RowMajor2Col4Major(src_ptr + i * C4NUM * in_channel, tmp_col_ptr, C4NUM, in_channel);
        RowMajor2Col4Major(src_ptr + i * C12NUM * in_channel, tmp_col_ptr, C12NUM, in_channel);
 #else
        RowMajor2Col12Major(src_ptr + i * C12NUM * in_channel, tmp_col_ptr, C12NUM, in_channel);
 #endif
--- a/mindspore/lite/nnacl/winograd_transform.c
+++ b/mindspore/lite/nnacl/winograd_transform.c
@@ -85,11 +85,7 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float *
        }    // interval y loop
      }
      // input transform
 #ifdef ENABLE_ARM32
      const int tile_num = 4;
 #else
      const int tile_num = 12;
 #endif
      const int tile_num = C12NUM;
      int dst_ic4_offset = dst_plane_offset + ic * C4NUM;
      size_t dst_step = tile_num * in_channel;
      float *trans_input_ptr = trans_input + dst_ic4_offset;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
@@ -184,11 +184,7 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
 int ConvolutionWinogradCPUKernel::InitTmpBuffer() {
  int channel_out = conv_param_->output_channel_;
  int oc8 = UP_DIV(channel_out, C8NUM);
 #ifdef ENABLE_ARM32
  int tile_num = 4;
 #else
  int tile_num = 12;
 #endif
  int tile_num = C12NUM;
  MS_ASSERT(ctx_->allocator != nullptr);

  size_t tile_buffer_size =