|
|
|
@@ -334,7 +334,7 @@ void ConvFp16(float16_t *input_data, float16_t *packed_input, float16_t *packed_ |
|
|
|
bool relu6 = conv_param->is_relu6_; |
|
|
|
// todo |
|
|
|
int thread_count = conv_param->thread_num_; |
|
|
|
int tile_n = 16; |
|
|
|
const int tile_n = 16; |
|
|
|
int output_count = out_h * out_w; |
|
|
|
int output_tile_count = UP_DIV(output_count, tile_n); |
|
|
|
|
|
|
|
@@ -379,7 +379,7 @@ void Conv3x3Fp16(float16_t *input_data, float16_t *transed_weight, const float16 |
|
|
|
float16_t *tile_buffer, float16_t *block_unit_buffer, float16_t *tmp_dst_buffer, float16_t *tmp_out, |
|
|
|
int task_id, ConvParameter *conv_param) { |
|
|
|
int thread_count = conv_param->thread_num_; |
|
|
|
int tile_num = 16; |
|
|
|
const int tile_num = 16; |
|
|
|
const int output_unit = 4; |
|
|
|
const int k_plane = 36; |
|
|
|
int ic4 = UP_DIV(conv_param->input_channel_, C4NUM); |
|
|
|
@@ -427,7 +427,7 @@ void UnPack3x3OutputFp16(const float16_t *src, float16_t *dst, int batch, int he |
|
|
|
float16_t *batch_out = dst + ro_batch_size; |
|
|
|
for (int h = 0; h < height; h++) { |
|
|
|
int src_h_offset = h * out_w_block * C4NUM * C8NUM; |
|
|
|
int dst_h_offset = h * width * channel; |
|
|
|
const int dst_h_offset = h * width * channel; |
|
|
|
for (int w = 0; w < width; w++) { |
|
|
|
int src_w_offset = src_h_offset + w * C8NUM; |
|
|
|
int dst_w_offset = dst_h_offset + w * channel; |
|
|
|
@@ -462,7 +462,7 @@ void UnPack3x3ReluOutputFp16(const float16_t *src, float16_t *dst, int batch, in |
|
|
|
float16_t *batch_out = dst + ro_batch_size; |
|
|
|
for (int h = 0; h < height; h++) { |
|
|
|
int src_h_offset = h * out_w_block * C4NUM * C8NUM; |
|
|
|
int dst_h_offset = h * width * channel; |
|
|
|
const int dst_h_offset = h * width * channel; |
|
|
|
for (int w = 0; w < width; w++) { |
|
|
|
int src_w_offset = src_h_offset + w * C8NUM; |
|
|
|
int dst_w_offset = dst_h_offset + w * channel; |
|
|
|
@@ -502,7 +502,7 @@ void UnPack3x3Relu6OutputFp16(const float16_t *src, float16_t *dst, int batch, i |
|
|
|
float16_t *batch_out = dst + ro_batch_size; |
|
|
|
for (int h = 0; h < height; h++) { |
|
|
|
int src_h_offset = h * out_w_block * C4NUM * C8NUM; |
|
|
|
int dst_h_offset = h * width * channel; |
|
|
|
const int dst_h_offset = h * width * channel; |
|
|
|
for (int w = 0; w < width; w++) { |
|
|
|
int src_w_offset = src_h_offset + w * C8NUM; |
|
|
|
int dst_w_offset = dst_h_offset + w * channel; |
|
|
|
@@ -545,7 +545,7 @@ void ConvWinogardFp16(float16_t *input_data, float16_t *trans_weight, const floa |
|
|
|
int out_unit = conv_param->output_unit_; |
|
|
|
int out_w_block = UP_DIV(conv_param->output_w_, out_unit); |
|
|
|
int out_h_block = UP_DIV(conv_param->output_h_, out_unit); |
|
|
|
int tile_num = 16; |
|
|
|
const int tile_num = 16; |
|
|
|
int output_count = out_w_block * out_h_block; |
|
|
|
int output_tile_count = UP_DIV(output_count, tile_num); |
|
|
|
int out_channel = conv_param->output_channel_; |
|
|
|
@@ -594,7 +594,7 @@ void UnPackWinogradOutputFp16(const float16_t *src, float16_t *dst, int batch, i |
|
|
|
int dst_batch_offset = b * height * width * channel; |
|
|
|
for (int h = 0; h < height; h++) { |
|
|
|
int src_h_offset = src_batch_offset + C8NUM * (h * out_w_block_num * output_unit); |
|
|
|
int dst_h_offset = dst_batch_offset + h * width * channel; |
|
|
|
const int dst_h_offset = dst_batch_offset + h * width * channel; |
|
|
|
for (int w = 0; w < width; w++) { |
|
|
|
int src_w_offset = src_h_offset + w * C8NUM; |
|
|
|
int dst_w_offset = dst_h_offset + w * channel; |
|
|
|
@@ -633,7 +633,7 @@ void UnPackWinogradReluOutputFp16(const float16_t *src, float16_t *dst, int batc |
|
|
|
int dst_batch_offset = b * height * width * channel; |
|
|
|
for (int h = 0; h < height; h++) { |
|
|
|
int src_h_offset = src_batch_offset + C8NUM * (h * out_w_block_num * output_unit); |
|
|
|
int dst_h_offset = dst_batch_offset + h * width * channel; |
|
|
|
const int dst_h_offset = dst_batch_offset + h * width * channel; |
|
|
|
for (int w = 0; w < width; w++) { |
|
|
|
int src_w_offset = src_h_offset + w * C8NUM; |
|
|
|
int dst_w_offset = dst_h_offset + w * channel; |
|
|
|
@@ -679,7 +679,7 @@ void UnPackWinogradRelu6OutputFp16(const float16_t *src, float16_t *dst, int bat |
|
|
|
int dst_batch_offset = b * height * width * channel; |
|
|
|
for (int h = 0; h < height; h++) { |
|
|
|
int src_h_offset = src_batch_offset + C8NUM * (h * out_w_block_num * output_unit); |
|
|
|
int dst_h_offset = dst_batch_offset + h * width * channel; |
|
|
|
const int dst_h_offset = dst_batch_offset + h * width * channel; |
|
|
|
for (int w = 0; w < width; w++) { |
|
|
|
int src_w_offset = src_h_offset + w * C8NUM; |
|
|
|
int dst_w_offset = dst_h_offset + w * channel; |
|
|
|
|