!9918 [MS][LITE][Develop]avx fp32 matmul support for winograd

From: @lx0095 Reviewed-by: @zhang_xue_tong,@zhanghaibo5 Signed-off-by: @zhang_xue_tong
5 years ago · 74dc7d069d
--- a/mindspore/lite/nnacl/assembly/avx/MatmulAvx.S
+++ b/mindspore/lite/nnacl/assembly/avx/MatmulAvx.S
@@ -53,7 +53,7 @@ NoC8Steps:
    movq $4, %r12
    imul %r10, %r12
    imul %rbx, %r12
    movq $48, %r13
    movq $32, %r13
    imul %r10, %r13
 NoWinoSteps:
    movq $4, %rax
@@ -827,32 +827,35 @@ LoopRow:
            jmp WriteEnd
        WriteWino:
            movq %rdx, %rax
            addq %r13, %rdx
            movq %rdx, %r15
            addq %r13, %rdx
            movq %rdx, -80(%rsp)
            vmovups %ymm4, (%rax)
            vmovups %ymm5, (%r15)
            addq %r12, %rax
            addq %r12, %r15
            vmovups %ymm6, (%rax)
            vmovups %ymm7, (%r15)
            addq %r12, %rax
            addq %r12, %r15
            vmovups %ymm8, (%rax)
            vmovups %ymm9, (%r15)
            addq %r12, %rax
            addq %r12, %r15
            vmovups %ymm10, (%rax)
            vmovups %ymm11, (%r15)
            addq %r12, %rax
            addq %r12, %r15
            vmovups %ymm12, (%rax)
            vmovups %ymm13, (%r15)
            addq %r12, %rax
            addq %r12, %r15
            vmovups %ymm14, (%rax)
            vmovups %ymm15, (%r15)
            addq %r13, %rax
            movq %rax, -80(%rsp)
            vmovups %ymm4, (%rdx)
            addq %r12, %rdx
            vmovups %ymm6, (%rdx)
            addq %r12, %rdx
            vmovups %ymm8, (%rdx)
            addq %r12, %rdx
            vmovups %ymm10, (%rdx)
            addq %r12, %rdx
            vmovups %ymm12, (%rdx)
            addq %r12, %rdx
            vmovups %ymm14, (%rdx)
            cmpq $-8, %rbx
            je WriteEnd
            movq %rax, %rdx
            addq %r13, %rax
            movq %rax, -80(%rsp)
            vmovups %ymm5, (%rdx)
            addq %r12, %rdx
            vmovups %ymm7, (%rdx)
            addq %r12, %rdx
            vmovups %ymm9, (%rdx)
            addq %r12, %rdx
            vmovups %ymm11, (%rdx)
            addq %r12, %rdx
            vmovups %ymm13, (%rdx)
            addq %r12, %rdx
            vmovups %ymm15, (%rdx)
            jmp WriteEnd
        Write16:
            movq %rdx, %rax
--- a/mindspore/lite/nnacl/fp32/conv_fp32.c
+++ b/mindspore/lite/nnacl/fp32/conv_fp32.c
@@ -73,6 +73,12 @@ void ConvWinogardFp32(const float *input_data, const float *trans_weight, const
  int output_count = out_w_block * out_h_block;
  const int tile_num = C12NUM;
  int output_tile_count = UP_DIV(output_count, tile_num);
 #ifdef ENABLE_AVX
  const int col_tile = C16NUM;
 #else
  const int col_tile = C8NUM;
 #endif
  int oc_tile = UP_DIV(conv_param->output_channel_, col_tile);
  int oc8 = UP_DIV(conv_param->output_channel_, C8NUM);
  int input_unit_square = conv_param->input_unit_ * conv_param->input_unit_;

@@ -101,13 +107,15 @@ void ConvWinogardFp32(const float *input_data, const float *trans_weight, const
      float *dst_ptr = gemm_out + task_id * gemm_out_offset;
      float *tmp_col_ptr = col_buffer + task_id * col_buffer_offset;
      for (int i = 0; i < input_unit_square; ++i) {
 #if defined(ENABLE_ARM32) || defined(ENABLE_SSE)
 #ifdef ENABLE_AVX
        RowMajor2Col6Major(src_ptr + i * C12NUM * in_channel, tmp_col_ptr, C12NUM, in_channel);
 #elif defined(ENABLE_ARM32) || defined(ENABLE_SSE)
        RowMajor2Col4Major(src_ptr + i * C12NUM * in_channel, tmp_col_ptr, C12NUM, in_channel);
 #else
        RowMajor2Col12Major(src_ptr + i * C12NUM * in_channel, tmp_col_ptr, C12NUM, in_channel);
 #endif
        MatMulOpt(tmp_col_ptr, trans_weight + i * in_channel * oc8 * C8NUM, dst_ptr + i * C8NUM, NULL, 0, in_channel,
                  cal_num, oc8 * C8NUM, input_unit_square, 2);
        MatMulOpt(tmp_col_ptr, trans_weight + i * in_channel * oc_tile * col_tile, dst_ptr + i * C8NUM, NULL, 0,
                  in_channel, cal_num, oc8 * C8NUM, input_unit_square, 2);
      }

      // step 4 : output transform
--- a/mindspore/lite/nnacl/fp32/matmul_fp32.c
+++ b/mindspore/lite/nnacl/fp32/matmul_fp32.c
@@ -793,8 +793,6 @@ void MatMul12x8(const float *a, const float *b, float *dst, const float *bias, A
  return;
 }

 #ifdef ENABLE_AVX
 #ifdef WIN32
 void MatMul6x16(const float *a, const float *b, float *dst, const float *bias, ActType act_type, int deep, int row,
                int col, int stride, int out_type) {
  if (out_type == OutType_Nhwc) {
@@ -815,11 +813,29 @@ void MatMul6x16(const float *a, const float *b, float *dst, const float *bias, A
        dst[ci] = value;
      }
    }
  } else {
    for (int i = 0; i < row; ++i) {
      int dst_r_offset = i * col * stride;
      int r6div = i / C6NUM, r6mod = i % C6NUM;
      for (int j = 0; j < col; ++j) {
        int b16div = j / C16NUM, b16mod = j % C16NUM;
        int c8div = j / C8NUM, c8mod = j % C8NUM;
        size_t ci = dst_r_offset + c8div * C8NUM * stride + c8mod;
        float value = 0;
        for (int d = 0; d < deep; ++d) {
          size_t ai = r6div * deep * C6NUM + d * C6NUM + r6mod;
          size_t bi = b16div * deep * C16NUM + d * C16NUM + b16mod;
          value = value + a[ai] * b[bi];
        }
        if (bias != NULL) value += bias[j];
        if (act_type == ActType_Relu6) value = MSMIN(6.0f, value);
        if (act_type != ActType_No) value = MSMAX(0.0f, value);
        dst[ci] = value;
      }
    }
  }
  return;
 }
 #endif
 #endif

 void MatMul4x8(const float *a, const float *b, float *dst, const float *bias, ActType act_type, int deep, int row,
               int col, int stride, int out_type) {
@@ -862,16 +878,14 @@ void MatMulOpt(const float *a, const float *b, float *c, const float *bias, ActT
    MatmulFloatNeon32Opt(a, b, c, bias, (int)act_type, deep, row, col, stride, (int)(out_type));
  }
 #elif ENABLE_AVX
  if (out_type == OutType_Nhwc) {
  if (out_type == OutType_C8) {
    MatmulFloatSse64(a, b, c, bias, (int)act_type, deep, row, col, stride, 0, 0);
  } else {
 #ifdef WIN32
    MatMul6x16(a, b, c, bias, act_type, deep, row, col, stride, out_type);
 #else
    MatmulFloatAvxOpt(a, b, c, bias, (int)act_type, deep, row, col, stride, (int)(out_type));
 #endif
  } else if (out_type == OutType_C8) {
    MatmulFloatSse64(a, b, c, bias, (int)act_type, deep, row, col, stride, 0, 0);
  } else {
    MatmulFloatSse64Opt(a, b, c, bias, (int)act_type, deep, row, col, stride, (int)(out_type));
  }
 #elif ENABLE_SSE
  if (out_type == OutType_C8) {
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd_fp32.cc
@@ -49,8 +49,12 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
  conv_param_->output_channel_ = out_channel;

  int oc4 = UP_DIV(out_channel, C4NUM);
 #ifdef ENABLE_AVX
  const int oc_block = C16NUM;
 #else
  const int oc_block = C8NUM;
  int oc_block_num = UP_DIV(out_channel, C8NUM);
 #endif
  int oc_block_num = UP_DIV(out_channel, oc_block);

  // set data
  auto trans_matrix_data_size = input_unit_ * input_unit_ * in_channel * oc_block_num * oc_block * sizeof(float);