zzy34407230
/
mindspore2022

 
			
			   
				 
					
						
						
							
							/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "nnacl/fp32/conv_common_fp32.h"
#include <string.h>
#include "nnacl/fp32/common_func_fp32.h"
#include "nnacl/fp32/matmul_fp32.h"

// fp32 conv common
void ConvFp32(const float *input_data, float *packed_input, const float *packed_weight, const float *bias_data,
              float *col_major_input, float *output_data, int task_id, const ConvParameter *conv_param) {
  int out_channel = conv_param->output_channel_;
  int deep = conv_param->kernel_h_ * conv_param->kernel_w_ * conv_param->input_channel_;
  int output_count = conv_param->output_h_ * conv_param->output_w_;
#ifdef ENABLE_AVX
  const int cal_num = C6NUM;
#elif defined(ENABLE_SSE)
  const int cal_num = C4NUM;
#else
  const int cal_num = C12NUM;
#endif
  int output_tile_count = UP_DIV(output_count, cal_num);

  for (int b = 0; b < conv_param->input_batch_; b++) {
    int in_batch_offset = b * conv_param->input_channel_ * conv_param->input_h_ * conv_param->input_w_;
    int out_batch_offset = b * out_channel * output_count;
    for (int thread_id = task_id; thread_id < output_tile_count; thread_id += conv_param->thread_num_) {
      int start_index = thread_id * cal_num;
      int real_cal_num = (output_count - start_index) < cal_num ? (output_count - start_index) : cal_num;
      if (real_cal_num <= 0) {
        return;
      }
      float *gemm_input = packed_input + task_id * deep * cal_num;
      float *col_major_gemm_input = col_major_input + task_id * deep * cal_num;
      size_t packed_input_size = deep * cal_num * sizeof(float);
      memset(gemm_input, 0, packed_input_size);
      memset(col_major_gemm_input, 0, packed_input_size);
      Im2ColPackUnitFp32(input_data + in_batch_offset, conv_param, gemm_input, real_cal_num, start_index);

      int out_offset = thread_id * cal_num * out_channel + out_batch_offset;
      float *gemm_output = output_data + out_offset;
#ifdef ENABLE_AVX
      RowMajor2Col6Major(gemm_input, col_major_gemm_input, cal_num, deep);
#elif defined(ENABLE_SSE)
      RowMajor2Col4Major(gemm_input, col_major_gemm_input, cal_num, deep);
#else
      RowMajor2Col12Major(gemm_input, col_major_gemm_input, cal_num, deep);
#endif
      MatMulOpt(col_major_gemm_input, packed_weight, gemm_output, bias_data, conv_param->act_type_, deep, real_cal_num,
                out_channel, out_channel, OutType_Nhwc);
    }
  }
}