Merge pull request !7059 from lixian/master (tags/v1.1.0)
@@ -1195,8 +1195,6 @@ LoopRow:
     st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x19], #64
     st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x19], #64
     st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x19], #64
-    st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x19], #64
-    st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x19], #64
     add x11, x11, x16
     b WriteEnd
 WriteWino:
@@ -1217,14 +1215,6 @@ LoopRow:
     st1 {v29.8h}, [x11], x15
     st1 {v30.8h}, [x11], x15
     st1 {v31.8h}, [x11], x15
-    st1 {v24.8h}, [x11], x15
-    st1 {v25.8h}, [x11], x15
-    st1 {v26.8h}, [x11], x15
-    st1 {v27.8h}, [x11], x15
-    st1 {v28.8h}, [x11], x15
-    st1 {v29.8h}, [x11], x15
-    st1 {v30.8h}, [x11], x15
-    st1 {v31.8h}, [x11], x15
     b WriteEnd
 Write8:
     add x2, x2, #16
@@ -205,8 +205,8 @@ void ConvWinogardFp16(float16_t *input_data, float16_t *trans_weight, const floa
     float16_t *tmp_col_ptr = col_buffer + task_id * col_buffer_offset;
     for (int i = 0; i < input_unit_square; ++i) {
       RowMajor2Col16MajorFp16Opt(src_ptr + i * tile_num * in_channel, tmp_col_ptr, tile_num, in_channel);
-      MatMul16x8(tmp_col_ptr, trans_weight + i * in_channel * oc8 * C8NUM, dst_ptr + i * C8NUM, NULL, 0, in_channel,
-                 cal_num, oc8 * C8NUM, input_unit_square, false);
+      MatMulFp16(tmp_col_ptr, trans_weight + i * in_channel * oc8 * C8NUM, dst_ptr + i * C8NUM, NULL, 0, in_channel,
+                 cal_num, oc8 * C8NUM, input_unit_square, OutType_TileC8);
     }
     // step 4 : output transform
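For context, the Winograd GEMM step now routes through the unified MatMulFp16 entry point with OutType_TileC8 instead of the dedicated MatMul16x8 helper. A minimal sketch of the per-unit loop after this change, reusing names from the hunk above (src_ptr, tmp_col_ptr, dst_ptr, trans_weight, tile_num, in_channel, cal_num, oc8 are assumed to be set up earlier in ConvWinogardFp16):

```c
/* Sketch of the Winograd GEMM step after this change; all variable names are
 * taken from the hunk above and are assumed to be initialized by the
 * surrounding ConvWinogardFp16 code. */
for (int i = 0; i < input_unit_square; ++i) {
  /* pack one transformed input unit into 16-row-major blocks */
  RowMajor2Col16MajorFp16Opt(src_ptr + i * tile_num * in_channel, tmp_col_ptr, tile_num, in_channel);
  /* unified matmul entry point; OutType_TileC8 keeps the tiled C8 output
   * layout that the former MatMul16x8 path produced */
  MatMulFp16(tmp_col_ptr, trans_weight + i * in_channel * oc8 * C8NUM, dst_ptr + i * C8NUM, NULL, 0,
             in_channel, cal_num, oc8 * C8NUM, input_unit_square, OutType_TileC8);
}
```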
@@ -104,11 +104,11 @@ void MatMul16x8(const float16_t *a, const float16_t *b, float16_t *dst, const fl
 }

 void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type,
-                int depth, int row, int col, int stride, bool write_nhwc) {
-  if (!write_nhwc) {
-    MatmulFp16Neon64(a, b, c, bias, (int)act_type, depth, row, col, stride, write_nhwc);
+                int depth, int row, int col, int stride, int out_type) {
+  if (out_type == OutType_C8) {
+    MatmulFp16Neon64(a, b, c, bias, (int)act_type, depth, row, col, stride, false);
   } else {
-    MatmulFp16Neon64Opt(a, b, c, bias, (int)act_type, depth, row, col, stride, 1);
+    MatmulFp16Neon64Opt(a, b, c, bias, (int)act_type, depth, row, col, stride, out_type);
   }
   return;
 }
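The hunk above replaces the old write_nhwc flag with an output-layout selector that picks between the two NEON kernels. The patch uses three values: OutType_C8, OutType_Nhwc, and OutType_TileC8. A sketch of the assumed enum follows; only the three names appear in this patch, so the numeric values and the header they live in are assumptions for illustration:

```c
/* Assumed output-layout enum; only the three names below appear in this patch,
 * the numeric values and header location are illustrative assumptions. */
typedef enum OutType {
  OutType_C8 = 0,     /* channel-8-blocked output, routed to MatmulFp16Neon64        */
  OutType_Nhwc = 1,   /* plain NHWC output, routed to MatmulFp16Neon64Opt            */
  OutType_TileC8 = 2  /* per-tile C8 output used by the Winograd GEMM, also Opt path */
} OutType;
```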
@@ -33,7 +33,7 @@ void MatMul16x8(const float16_t *a, const float16_t *b, float16_t *dst, const fl
                 int deep, int row, int col, int stride, bool write_nhwc);

 void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type,
-                int depth, int row, int col, int stride, bool write_nhwc);
+                int depth, int row, int col, int stride, int out_type);

 void ColMajor2Row8MajorFp16(void *src_ptr, float16_t *dst_ptr, size_t row, size_t col, bool src_float16);
@@ -43,7 +43,7 @@ void MatmulFp16Neon64(const float16_t *a, const float16_t *b, float16_t *c, cons
                       size_t depth, size_t row, size_t col, size_t stride, bool write_nhwc);

 void MatmulFp16Neon64Opt(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type,
-                         size_t depth, size_t row, size_t col, size_t stride, int write_nhwc);
+                         size_t depth, size_t row, size_t col, size_t stride, size_t write_nhwc);

 void RowMajor2Col16MajorFp16(void *src, float16_t *dst, int row, int col, bool is_fp32_src);
@@ -171,7 +171,7 @@ int Convolution1x1FP16CPUKernel::RunOc(int task_id) {
   MatMulFp16(pack_input_, weight_ptr_ + task_id * thread_stride_ * matmul_param_->deep_,
              output_ptr_ + task_id * thread_stride_, bias, matmul_param_->act_type_, matmul_param_->deep_,
-             matmul_param_->row_, cur_oc, matmul_param_->col_, true);
+             matmul_param_->row_, cur_oc, matmul_param_->col_, OutType_Nhwc);

   return RET_OK;
 }
@@ -189,7 +189,8 @@ int Convolution1x1FP16CPUKernel::RunHw(int task_id) {
   float16_t *thread_output_ptr = output_ptr_ + task_id * thread_stride_ * matmul_param_->col_;
   MatMulFp16(thread_pack_input, weight_ptr_, thread_output_ptr, reinterpret_cast<float16_t *>(bias_data_),
-             matmul_param_->act_type_, matmul_param_->deep_, cur_hw_, matmul_param_->col_, matmul_param_->col_, true);
+             matmul_param_->act_type_, matmul_param_->deep_, cur_hw_, matmul_param_->col_, matmul_param_->col_,
+             OutType_Nhwc);

   return RET_OK;
 }
@@ -156,7 +156,7 @@ int DeConvolutionFp16CPUKernel::DoDeconv(int task_id) {
   auto tmp_buf = tmp_buffer_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->row_16_;
   MatMulFp16(pack_input_, execute_weight_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->deep_,
              tmp_buf, nullptr, ActType_No, matmul_param_->deep_, matmul_param_->row_, oc * C8NUM * kernel_plane_, 0,
-             false);
+             OutType_C8);

   DeConvPostFp16(tmp_buf, pack_output_ + task_id * thread_stride_ * C8NUM * output_plane_,
                  reinterpret_cast<float16_t *>(bias_data_) + task_id * thread_stride_ * C8NUM,
                  execute_output_ + task_id * thread_stride_ * C8NUM, oc_res, conv_param_);
@@ -137,7 +137,7 @@ int FullconnectionFP16CPUKernel::RunImpl(int task_id) {
   auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id;
   auto c = output_ptr_ + task_id * thread_stride_;
   MatMulFp16(a_pack_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, fc_param_->row_, cur_oc, fc_param_->col_,
-             true);
+             OutType_Nhwc);
   return RET_OK;
 }
@@ -193,7 +193,7 @@ int MatmulFP16CPUKernel::RunImpl(int task_id) {
   auto b = current_b_ + task_id * thread_stride_ * params_->deep_;
   auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id;
   auto c = current_c_ + task_id * thread_stride_;
-  MatMulFp16(current_a_, b, c, bias, ActType_No, params_->deep_, params_->row_, cur_oc, params_->col_, true);
+  MatMulFp16(current_a_, b, c, bias, ActType_No, params_->deep_, params_->row_, cur_oc, params_->col_, OutType_Nhwc);
   return RET_OK;
 }
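Across all of the call sites above, the migration is mechanical: the old boolean write_nhwc argument maps to an explicit OutType value, with true becoming OutType_Nhwc, false becoming OutType_C8, and the Winograd GEMM using OutType_TileC8. A hedged example of an updated caller, with hypothetical variable and function names and an assumed header path:

```c
#include "nnacl/fp16/matmul_fp16.h"  /* assumed header providing MatMulFp16, ActType, OutType */

/* Hypothetical fully-connected tile: the final argument switches from the old
 * boolean write_nhwc flag to an explicit output-layout enum value. */
void RunFcTile(const float16_t *a_pack, const float16_t *b_pack, float16_t *c, const float16_t *bias,
               int depth, int row, int col, int stride) {
  /* before this patch: MatMulFp16(a_pack, b_pack, c, bias, ActType_No, depth, row, col, stride, true); */
  MatMulFp16(a_pack, b_pack, c, bias, ActType_No, depth, row, col, stride, OutType_Nhwc);
}
```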