|
|
|
@@ -1,5 +1,5 @@ |
|
|
|
/** |
|
|
|
* Copyright 2020 Huawei Technologies Co., Ltd |
|
|
|
* Copyright 2021 Huawei Technologies Co., Ltd |
|
|
|
* |
|
|
|
* Licensed under the Apache License, Version 2.0 (the "License"); |
|
|
|
* you may not use this file except in compliance with the License. |
|
|
|
@@ -14,55 +14,12 @@ |
|
|
|
* limitations under the License. |
|
|
|
*/ |
|
|
|
|
|
|
|
#include "nnacl/fp32/conv_fp32.h" |
|
|
|
#include "nnacl/fp32/conv_winograd_fp32.h" |
|
|
|
#include <string.h> |
|
|
|
#include "nnacl/fp32/common_func_fp32.h" |
|
|
|
#include "nnacl/winograd_transform.h" |
|
|
|
#include "nnacl/fp32/matmul_fp32.h" |
|
|
|
|
|
|
|
// fp32 conv common |
|
|
|
void ConvFp32(const float *input_data, float *packed_input, const float *packed_weight, const float *bias_data, |
|
|
|
float *col_major_input, float *output_data, int task_id, const ConvParameter *conv_param) { |
|
|
|
int out_channel = conv_param->output_channel_; |
|
|
|
int deep = conv_param->kernel_h_ * conv_param->kernel_w_ * conv_param->input_channel_; |
|
|
|
int output_count = conv_param->output_h_ * conv_param->output_w_; |
|
|
|
#ifdef ENABLE_AVX |
|
|
|
const int cal_num = C6NUM; |
|
|
|
#elif defined(ENABLE_SSE) |
|
|
|
const int cal_num = C4NUM; |
|
|
|
#else |
|
|
|
const int cal_num = C12NUM; |
|
|
|
#endif |
|
|
|
int output_tile_count = UP_DIV(output_count, cal_num); |
|
|
|
|
|
|
|
for (int b = 0; b < conv_param->input_batch_; b++) { |
|
|
|
int in_batch_offset = b * conv_param->input_channel_ * conv_param->input_h_ * conv_param->input_w_; |
|
|
|
int out_batch_offset = b * out_channel * output_count; |
|
|
|
for (int thread_id = task_id; thread_id < output_tile_count; thread_id += conv_param->thread_num_) { |
|
|
|
int start_index = thread_id * cal_num; |
|
|
|
int real_cal_num = (output_count - start_index) < cal_num ? (output_count - start_index) : cal_num; |
|
|
|
float *gemm_input = packed_input + task_id * deep * cal_num; |
|
|
|
float *col_major_gemm_input = col_major_input + task_id * deep * cal_num; |
|
|
|
size_t packed_input_size = deep * cal_num * sizeof(float); |
|
|
|
memset(gemm_input, 0, packed_input_size); |
|
|
|
memset(col_major_gemm_input, 0, packed_input_size); |
|
|
|
Im2ColPackUnitFp32(input_data + in_batch_offset, conv_param, gemm_input, real_cal_num, start_index); |
|
|
|
|
|
|
|
int out_offset = thread_id * cal_num * out_channel + out_batch_offset; |
|
|
|
float *gemm_output = output_data + out_offset; |
|
|
|
#ifdef ENABLE_AVX |
|
|
|
RowMajor2Col6Major(gemm_input, col_major_gemm_input, cal_num, deep); |
|
|
|
#elif defined(ENABLE_SSE) |
|
|
|
RowMajor2Col4Major(gemm_input, col_major_gemm_input, cal_num, deep); |
|
|
|
#else |
|
|
|
RowMajor2Col12Major(gemm_input, col_major_gemm_input, cal_num, deep); |
|
|
|
#endif |
|
|
|
MatMulOpt(col_major_gemm_input, packed_weight, gemm_output, bias_data, conv_param->act_type_, deep, real_cal_num, |
|
|
|
out_channel, out_channel, OutType_Nhwc); |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
// fp32 conv winograd |
|
|
|
void ConvWinogardFp32(const float *input_data, const float *trans_weight, const float *bias_data, float *output_data, |
|
|
|
TmpBufferAddress *buffer_list, int task_id, const ConvParameter *conv_param, |