Merge pull request !4525 from fuzhiye/tmptags/v0.7.0-beta
| @@ -400,16 +400,15 @@ int ConvolutionWinogradFP16CPUKernel::Run() { | |||||
| } | } | ||||
| // get real output | // get real output | ||||
| UnPackWinogradOutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_, | |||||
| conv_param_->output_w_, conv_param_->output_channel_, output_unit_); | |||||
| int output_num = | |||||
| conv_param_->output_channel_ * conv_param_->output_h_ * conv_param_->output_w_ * conv_param_->output_batch_; | |||||
| if (conv_param_->is_relu_) { | if (conv_param_->is_relu_) { | ||||
| ReluFp16(execute_output_, execute_output_, output_num); | |||||
| UnPackWinogradReluOutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_, | |||||
| conv_param_->output_w_, conv_param_->output_channel_, output_unit_); | |||||
| } else if (conv_param_->is_relu6_) { | } else if (conv_param_->is_relu6_) { | ||||
| Relu6Fp16(execute_output_, execute_output_, output_num); | |||||
| UnPackWinogradRelu6OutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_, | |||||
| conv_param_->output_w_, conv_param_->output_channel_, output_unit_); | |||||
| } else { | } else { | ||||
| // do nothing | |||||
| UnPackWinogradOutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_, | |||||
| conv_param_->output_w_, conv_param_->output_channel_, output_unit_); | |||||
| } | } | ||||
| ConvolutionBaseFP16CPUKernel::IfCastOutput(); | ConvolutionBaseFP16CPUKernel::IfCastOutput(); | ||||
| return RET_OK; | return RET_OK; | ||||
| @@ -263,16 +263,15 @@ int Convolution3x3CPUKernel::Run() { | |||||
| auto is_relu = conv_param_->is_relu_; | auto is_relu = conv_param_->is_relu_; | ||||
| auto is_relu6 = conv_param_->is_relu6_; | auto is_relu6 = conv_param_->is_relu6_; | ||||
| auto output_addr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->Data()); | auto output_addr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->Data()); | ||||
| PackNC4HW4ToNHWCFp32(nc4hw4_out_, output_addr, conv_param_->output_batch_, | |||||
| conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); | |||||
| int output_num = | |||||
| conv_param_->output_channel_ * conv_param_->output_h_ * conv_param_->output_w_ * conv_param_->output_batch_; | |||||
| if (is_relu) { | if (is_relu) { | ||||
| ReluFp32(output_addr, output_addr, output_num); | |||||
| PackNC4HW4ToNHWCReluFp32(nc4hw4_out_, output_addr, conv_param_->output_batch_, | |||||
| conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); | |||||
| } else if (is_relu6) { | } else if (is_relu6) { | ||||
| Relu6Fp32(output_addr, output_addr, output_num); | |||||
| PackNC4HW4ToNHWCRelu6Fp32(nc4hw4_out_, output_addr, conv_param_->output_batch_, | |||||
| conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); | |||||
| } else { | } else { | ||||
| // do nothing | |||||
| PackNC4HW4ToNHWCFp32(nc4hw4_out_, output_addr, conv_param_->output_batch_, | |||||
| conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); | |||||
| } | } | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -368,18 +368,16 @@ int ConvolutionWinogradCPUKernel::Run() { | |||||
| // get real output | // get real output | ||||
| auto out_tensor = out_tensors_.front(); | auto out_tensor = out_tensors_.front(); | ||||
| auto out_data = reinterpret_cast<float *>(out_tensor->Data()); | auto out_data = reinterpret_cast<float *>(out_tensor->Data()); | ||||
| UnPackWinogradOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_, | |||||
| conv_param_->output_w_, conv_param_->output_channel_, output_unit_); | |||||
| int output_num = | |||||
| conv_param_->output_channel_ * conv_param_->output_h_ * conv_param_->output_w_ * conv_param_->output_batch_; | |||||
| if (conv_param_->is_relu_) { | if (conv_param_->is_relu_) { | ||||
| ReluFp32(out_data, out_data, output_num); | |||||
| UnPackWinogradReluOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_, | |||||
| conv_param_->output_w_, conv_param_->output_channel_, output_unit_); | |||||
| } else if (conv_param_->is_relu6_) { | } else if (conv_param_->is_relu6_) { | ||||
| Relu6Fp32(out_data, out_data, output_num); | |||||
| UnPackWinogradRelu6Output(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_, | |||||
| conv_param_->output_w_, conv_param_->output_channel_, output_unit_); | |||||
| } else { | } else { | ||||
| // do nothing | |||||
| UnPackWinogradOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_, | |||||
| conv_param_->output_w_, conv_param_->output_channel_, output_unit_); | |||||
| } | } | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -470,8 +470,9 @@ void UnPackWinogradOutputFp16(const float16_t *src, float16_t *dst, int batch, i | |||||
| int out_h_block_num = UP_DIV(height, output_unit); | int out_h_block_num = UP_DIV(height, output_unit); | ||||
| int out_w_block_num = UP_DIV(width, output_unit); | int out_w_block_num = UP_DIV(width, output_unit); | ||||
| int c8 = UP_DIV(channel, C8NUM); | int c8 = UP_DIV(channel, C8NUM); | ||||
| int c8_block = C8NUM * out_h_block_num * output_unit * out_w_block_num * output_unit; | |||||
| for (int b = 0; b < batch; b++) { | for (int b = 0; b < batch; b++) { | ||||
| int src_batch_offset = b * c8 * C8NUM * out_h_block_num * output_unit * out_w_block_num * output_unit; | |||||
| int src_batch_offset = b * c8 * c8_block; | |||||
| int dst_batch_offset = b * height * width * channel; | int dst_batch_offset = b * height * width * channel; | ||||
| for (int h = 0; h < height; h++) { | for (int h = 0; h < height; h++) { | ||||
| int src_h_offset = src_batch_offset + C8NUM * (h * out_w_block_num * output_unit); | int src_h_offset = src_batch_offset + C8NUM * (h * out_w_block_num * output_unit); | ||||
| @@ -480,7 +481,7 @@ void UnPackWinogradOutputFp16(const float16_t *src, float16_t *dst, int batch, i | |||||
| int src_w_offset = src_h_offset + w * C8NUM; | int src_w_offset = src_h_offset + w * C8NUM; | ||||
| int dst_w_offset = dst_h_offset + w * channel; | int dst_w_offset = dst_h_offset + w * channel; | ||||
| for (int c = 0; c < c8 - 1; c++) { | for (int c = 0; c < c8 - 1; c++) { | ||||
| int src_c8_offset = src_w_offset + c * C8NUM * out_w_block_num * out_h_block_num * output_unit * output_unit; | |||||
| int src_c8_offset = src_w_offset + c * c8_block; | |||||
| int dst_c8_offset = dst_w_offset + c * C8NUM; | int dst_c8_offset = dst_w_offset + c * C8NUM; | ||||
| #ifdef ENABLE_NEON | #ifdef ENABLE_NEON | ||||
| vst1q_f16(dst + dst_c8_offset, vld1q_f16(src + src_c8_offset)); | vst1q_f16(dst + dst_c8_offset, vld1q_f16(src + src_c8_offset)); | ||||
| @@ -491,7 +492,7 @@ void UnPackWinogradOutputFp16(const float16_t *src, float16_t *dst, int batch, i | |||||
| #endif | #endif | ||||
| } | } | ||||
| int c_res = channel - (c8 - 1) * C8NUM; | int c_res = channel - (c8 - 1) * C8NUM; | ||||
| int src_c_res_offset = (c8 - 1) * C8NUM * out_w_block_num * out_h_block_num * output_unit * output_unit; | |||||
| int src_c_res_offset = (c8 - 1) * c8_block; | |||||
| int dst_c_res_offset = (c8 - 1) * C8NUM; | int dst_c_res_offset = (c8 - 1) * C8NUM; | ||||
| for (int c = 0; c < c_res; c++) { | for (int c = 0; c < c_res; c++) { | ||||
| int src_c8_res_offset = src_w_offset + src_c_res_offset + c; | int src_c8_res_offset = src_w_offset + src_c_res_offset + c; | ||||
| @@ -502,3 +503,99 @@ void UnPackWinogradOutputFp16(const float16_t *src, float16_t *dst, int batch, i | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| void UnPackWinogradReluOutputFp16(const float16_t *src, float16_t *dst, int batch, int height, int width, int channel, | |||||
| int output_unit) { | |||||
| int out_h_block_num = UP_DIV(height, output_unit); | |||||
| int out_w_block_num = UP_DIV(width, output_unit); | |||||
| int c8 = UP_DIV(channel, C8NUM); | |||||
| int c8_block = C8NUM * out_h_block_num * output_unit * out_w_block_num * output_unit; | |||||
| for (int b = 0; b < batch; b++) { | |||||
| int src_batch_offset = b * c8 * c8_block; | |||||
| int dst_batch_offset = b * height * width * channel; | |||||
| for (int h = 0; h < height; h++) { | |||||
| int src_h_offset = src_batch_offset + C8NUM * (h * out_w_block_num * output_unit); | |||||
| int dst_h_offset = dst_batch_offset + h * width * channel; | |||||
| for (int w = 0; w < width; w++) { | |||||
| int src_w_offset = src_h_offset + w * C8NUM; | |||||
| int dst_w_offset = dst_h_offset + w * channel; | |||||
| for (int c = 0; c < c8 - 1; c++) { | |||||
| int src_c8_offset = src_w_offset + c * c8_block; | |||||
| int dst_c8_offset = dst_w_offset + c * C8NUM; | |||||
| #ifdef ENABLE_NEON | |||||
| float16x8_t input_ptr = vld1q_f16(src + src_c8_offset); | |||||
| float16x8_t zero = vdupq_n_f16(0); | |||||
| input_ptr = vmaxq_f16(zero, input_ptr); | |||||
| vst1q_f16(dst + dst_c8_offset, input_ptr); | |||||
| #else | |||||
| for (int i = 0; i < C8NUM; ++i) { | |||||
| float16_t input_data = src[src_c8_offset + i]; | |||||
| input_data = input_data < 0 ? 0 : input_data; | |||||
| dst[dst_c8_offset + i] = input_data; | |||||
| } | |||||
| #endif | |||||
| } | |||||
| int c_res = channel - (c8 - 1) * C8NUM; | |||||
| int src_c_res_offset = (c8 - 1) * c8_block; | |||||
| int dst_c_res_offset = (c8 - 1) * C8NUM; | |||||
| for (int c = 0; c < c_res; c++) { | |||||
| int src_c8_res_offset = src_w_offset + src_c_res_offset + c; | |||||
| int dst_c8_res_offset = dst_w_offset + dst_c_res_offset + c; | |||||
| float16_t input_data = src[src_c8_res_offset]; | |||||
| input_data = input_data < 0 ? 0 : input_data; | |||||
| dst[dst_c8_res_offset] = input_data; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| void UnPackWinogradRelu6OutputFp16(const float16_t *src, float16_t *dst, int batch, int height, int width, int channel, | |||||
| int output_unit) { | |||||
| int out_h_block_num = UP_DIV(height, output_unit); | |||||
| int out_w_block_num = UP_DIV(width, output_unit); | |||||
| int c8 = UP_DIV(channel, C8NUM); | |||||
| int c8_block = C8NUM * out_h_block_num * output_unit * out_w_block_num * output_unit; | |||||
| for (int b = 0; b < batch; b++) { | |||||
| int src_batch_offset = b * c8 * c8_block; | |||||
| int dst_batch_offset = b * height * width * channel; | |||||
| for (int h = 0; h < height; h++) { | |||||
| int src_h_offset = src_batch_offset + C8NUM * (h * out_w_block_num * output_unit); | |||||
| int dst_h_offset = dst_batch_offset + h * width * channel; | |||||
| for (int w = 0; w < width; w++) { | |||||
| int src_w_offset = src_h_offset + w * C8NUM; | |||||
| int dst_w_offset = dst_h_offset + w * channel; | |||||
| for (int c = 0; c < c8 - 1; c++) { | |||||
| int src_c8_offset = src_w_offset + c * c8_block; | |||||
| int dst_c8_offset = dst_w_offset + c * C8NUM; | |||||
| #ifdef ENABLE_NEON | |||||
| float16x8_t input_ptr = vld1q_f16(src + src_c8_offset); | |||||
| float16x8_t zero = vdupq_n_f16(0); | |||||
| float16x8_t six = vdupq_n_f16(6); | |||||
| input_ptr = vmaxq_f16(zero, input_ptr); | |||||
| input_ptr = vminq_f16(six, input_ptr); | |||||
| vst1q_f16(dst + dst_c8_offset, input_ptr); | |||||
| #else | |||||
| for (int i = 0; i < C8NUM; ++i) { | |||||
| float16_t input_data = src[src_c8_offset + i]; | |||||
| input_data = input_data < 0 ? 0 : input_data; | |||||
| input_data = input_data > 6 ? 6 : input_data; | |||||
| dst[dst_c8_offset + i] = input_data; | |||||
| } | |||||
| #endif | |||||
| } | |||||
| int c_res = channel - (c8 - 1) * C8NUM; | |||||
| int src_c_res_offset = (c8 - 1) * c8_block; | |||||
| int dst_c_res_offset = (c8 - 1) * C8NUM; | |||||
| for (int c = 0; c < c_res; c++) { | |||||
| int src_c8_res_offset = src_w_offset + src_c_res_offset + c; | |||||
| int dst_c8_res_offset = dst_w_offset + dst_c_res_offset + c; | |||||
| float16_t input_data = src[src_c8_res_offset]; | |||||
| input_data = input_data < 0 ? 0 : input_data; | |||||
| input_data = input_data > 6 ? 6 : input_data; | |||||
| dst[dst_c8_res_offset] = input_data; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| @@ -67,6 +67,12 @@ void ConvWinogardFp16(float16_t *input_data, float16_t *trans_weight, const floa | |||||
| void UnPackWinogradOutputFp16(const float16_t *src, float16_t *dst, int batch, int height, int width, int channel, | void UnPackWinogradOutputFp16(const float16_t *src, float16_t *dst, int batch, int height, int width, int channel, | ||||
| int output_unit); | int output_unit); | ||||
| void UnPackWinogradReluOutputFp16(const float16_t *src, float16_t *dst, int batch, int height, int width, int channel, | |||||
| int output_unit); | |||||
| void UnPackWinogradRelu6OutputFp16(const float16_t *src, float16_t *dst, int batch, int height, int width, int channel, | |||||
| int output_unit); | |||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||
| } | } | ||||
| #endif | #endif | ||||
| @@ -296,8 +296,9 @@ void UnPackWinogradOutput(const float *src, float *dst, int batch, int height, i | |||||
| int out_h_block_num = UP_DIV(height, output_unit); | int out_h_block_num = UP_DIV(height, output_unit); | ||||
| int out_w_block_num = UP_DIV(width, output_unit); | int out_w_block_num = UP_DIV(width, output_unit); | ||||
| int c4 = UP_DIV(channel, C4NUM); | int c4 = UP_DIV(channel, C4NUM); | ||||
| int c4_block = C4NUM * out_h_block_num * output_unit * out_w_block_num * output_unit; | |||||
| for (int b = 0; b < batch; b++) { | for (int b = 0; b < batch; b++) { | ||||
| int src_batch_offset = b * c4 * C4NUM * out_h_block_num * output_unit * out_w_block_num * output_unit; | |||||
| int src_batch_offset = b * c4 * c4_block; | |||||
| int dst_batch_offset = b * height * width * channel; | int dst_batch_offset = b * height * width * channel; | ||||
| for (int h = 0; h < height; h++) { | for (int h = 0; h < height; h++) { | ||||
| int src_h_offset = src_batch_offset + C4NUM * (h * out_w_block_num * output_unit); | int src_h_offset = src_batch_offset + C4NUM * (h * out_w_block_num * output_unit); | ||||
| @@ -306,19 +307,18 @@ void UnPackWinogradOutput(const float *src, float *dst, int batch, int height, i | |||||
| int src_w_offset = src_h_offset + w * C4NUM; | int src_w_offset = src_h_offset + w * C4NUM; | ||||
| int dst_w_offset = dst_h_offset + w * channel; | int dst_w_offset = dst_h_offset + w * channel; | ||||
| for (int c = 0; c < c4 - 1; c++) { | for (int c = 0; c < c4 - 1; c++) { | ||||
| int src_c4_offset = src_w_offset + c * C4NUM * out_w_block_num * out_h_block_num * output_unit * output_unit; | |||||
| int src_c4_offset = src_w_offset + c * c4_block; | |||||
| int dst_c4_offset = dst_w_offset + c * C4NUM; | int dst_c4_offset = dst_w_offset + c * C4NUM; | ||||
| #ifdef ENABLE_NEON | #ifdef ENABLE_NEON | ||||
| vst1q_f32(dst + dst_c4_offset, vld1q_f32(src + src_c4_offset)); | vst1q_f32(dst + dst_c4_offset, vld1q_f32(src + src_c4_offset)); | ||||
| #else | #else | ||||
| dst[dst_c4_offset] = src[src_c4_offset]; | |||||
| dst[dst_c4_offset + 1] = src[src_c4_offset + 1]; | |||||
| dst[dst_c4_offset + 2] = src[src_c4_offset + 2]; | |||||
| dst[dst_c4_offset + 3] = src[src_c4_offset + 3]; | |||||
| for (int i = 0; i < C4NUM; ++i) { | |||||
| dst[dst_c4_offset + i] = src[src_c4_offset + i]; | |||||
| } | |||||
| #endif | #endif | ||||
| } | } | ||||
| int c_res = channel - (c4 - 1) * C4NUM; | int c_res = channel - (c4 - 1) * C4NUM; | ||||
| int src_c_res_offset = (c4 - 1) * C4NUM * out_w_block_num * out_h_block_num * output_unit * output_unit; | |||||
| int src_c_res_offset = (c4 - 1) * c4_block; | |||||
| int dst_c_res_offset = (c4 - 1) * C4NUM; | int dst_c_res_offset = (c4 - 1) * C4NUM; | ||||
| for (int c = 0; c < c_res; c++) { | for (int c = 0; c < c_res; c++) { | ||||
| int src_c4_res_offset = src_w_offset + src_c_res_offset + c; | int src_c4_res_offset = src_w_offset + src_c_res_offset + c; | ||||
| @@ -330,6 +330,102 @@ void UnPackWinogradOutput(const float *src, float *dst, int batch, int height, i | |||||
| } | } | ||||
| } | } | ||||
| void UnPackWinogradReluOutput(const float *src, float *dst, int batch, int height, int width, int channel, | |||||
| int output_unit) { | |||||
| int out_h_block_num = UP_DIV(height, output_unit); | |||||
| int out_w_block_num = UP_DIV(width, output_unit); | |||||
| int c4 = UP_DIV(channel, C4NUM); | |||||
| int c4_block = C4NUM * out_h_block_num * output_unit * out_w_block_num * output_unit; | |||||
| for (int b = 0; b < batch; b++) { | |||||
| int src_batch_offset = b * c4 * c4_block; | |||||
| int dst_batch_offset = b * height * width * channel; | |||||
| for (int h = 0; h < height; h++) { | |||||
| int src_h_offset = src_batch_offset + C4NUM * (h * out_w_block_num * output_unit); | |||||
| int dst_h_offset = dst_batch_offset + h * width * channel; | |||||
| for (int w = 0; w < width; w++) { | |||||
| int src_w_offset = src_h_offset + w * C4NUM; | |||||
| int dst_w_offset = dst_h_offset + w * channel; | |||||
| for (int c = 0; c < c4 - 1; c++) { | |||||
| int src_c4_offset = src_w_offset + c * c4_block; | |||||
| int dst_c4_offset = dst_w_offset + c * C4NUM; | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t input_ptr = vld1q_f32(src + src_c4_offset); | |||||
| float32x4_t zero = vdupq_n_f32(0); | |||||
| input_ptr = vmaxq_f32(zero, input_ptr); | |||||
| vst1q_f32(dst + dst_c4_offset, input_ptr); | |||||
| #else | |||||
| for (int i = 0; i < C4NUM; ++i) { | |||||
| float input_data = src[src_c4_offset + i]; | |||||
| input_data = input_data < 0 ? 0 : input_data; | |||||
| dst[dst_c4_offset + i] = input_data; | |||||
| } | |||||
| #endif | |||||
| } | |||||
| int c_res = channel - (c4 - 1) * C4NUM; | |||||
| int src_c_res_offset = (c4 - 1) * c4_block; | |||||
| int dst_c_res_offset = (c4 - 1) * C4NUM; | |||||
| for (int c = 0; c < c_res; c++) { | |||||
| int src_c4_res_offset = src_w_offset + src_c_res_offset + c; | |||||
| int dst_c4_res_offset = dst_w_offset + dst_c_res_offset + c; | |||||
| float input_data = src[src_c4_res_offset]; | |||||
| input_data = input_data < 0 ? 0 : input_data; | |||||
| dst[dst_c4_res_offset] = input_data; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| void UnPackWinogradRelu6Output(const float *src, float *dst, int batch, int height, int width, int channel, | |||||
| int output_unit) { | |||||
| int out_h_block_num = UP_DIV(height, output_unit); | |||||
| int out_w_block_num = UP_DIV(width, output_unit); | |||||
| int c4 = UP_DIV(channel, C4NUM); | |||||
| int c4_block = C4NUM * out_h_block_num * output_unit * out_w_block_num * output_unit; | |||||
| for (int b = 0; b < batch; b++) { | |||||
| int src_batch_offset = b * c4 * c4_block; | |||||
| int dst_batch_offset = b * height * width * channel; | |||||
| for (int h = 0; h < height; h++) { | |||||
| int src_h_offset = src_batch_offset + C4NUM * (h * out_w_block_num * output_unit); | |||||
| int dst_h_offset = dst_batch_offset + h * width * channel; | |||||
| for (int w = 0; w < width; w++) { | |||||
| int src_w_offset = src_h_offset + w * C4NUM; | |||||
| int dst_w_offset = dst_h_offset + w * channel; | |||||
| for (int c = 0; c < c4 - 1; c++) { | |||||
| int src_c4_offset = src_w_offset + c * c4_block; | |||||
| int dst_c4_offset = dst_w_offset + c * C4NUM; | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t input_ptr = vld1q_f32(src + src_c4_offset); | |||||
| float32x4_t zero = vdupq_n_f32(0); | |||||
| float32x4_t six = vdupq_n_f32(6); | |||||
| input_ptr = vmaxq_f32(zero, input_ptr); | |||||
| input_ptr = vminq_f32(six, input_ptr); | |||||
| vst1q_f32(dst + dst_c4_offset, input_ptr); | |||||
| #else | |||||
| for (int i = 0; i < C4NUM; ++i) { | |||||
| float input_data = src[src_c4_offset + i]; | |||||
| input_data = input_data < 0 ? 0 : input_data; | |||||
| input_data = input_data > 6 ? 6 : input_data; | |||||
| dst[dst_c4_offset + i] = input_data; | |||||
| } | |||||
| #endif | |||||
| } | |||||
| int c_res = channel - (c4 - 1) * C4NUM; | |||||
| int src_c_res_offset = (c4 - 1) * c4_block; | |||||
| int dst_c_res_offset = (c4 - 1) * C4NUM; | |||||
| for (int c = 0; c < c_res; c++) { | |||||
| int src_c4_res_offset = src_w_offset + src_c_res_offset + c; | |||||
| int dst_c4_res_offset = dst_w_offset + dst_c_res_offset + c; | |||||
| float input_data = src[src_c4_res_offset]; | |||||
| input_data = input_data < 0 ? 0 : input_data; | |||||
| input_data = input_data > 6 ? 6 : input_data; | |||||
| dst[dst_c4_res_offset] = input_data; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| // fp32 conv3x3 | // fp32 conv3x3 | ||||
| void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, float *output_data, | void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, float *output_data, | ||||
| TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func) { | TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func) { | ||||
| @@ -63,6 +63,12 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_ | |||||
| void UnPackWinogradOutput(const float *src, float *dst, int batch, int height, int width, int channel, int output_unit); | void UnPackWinogradOutput(const float *src, float *dst, int batch, int height, int width, int channel, int output_unit); | ||||
| void UnPackWinogradReluOutput(const float *src, float *dst, int batch, int height, int width, int channel, | |||||
| int output_unit); | |||||
| void UnPackWinogradRelu6Output(const float *src, float *dst, int batch, int height, int width, int channel, | |||||
| int output_unit); | |||||
| // fp32 conv3x3 | // fp32 conv3x3 | ||||
| void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, float *output_data, | void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, float *output_data, | ||||
| TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func); | TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func); | ||||
| @@ -582,6 +582,84 @@ void PackNC4HW4ToNHWCFp32(const void *src, void *dst, int batch, int plane, int | |||||
| } | } | ||||
| } | } | ||||
| void PackNC4HW4ToNHWCReluFp32(const void *src, void *dst, int batch, int plane, int channel) { | |||||
| int c4 = UP_DIV(channel, C4NUM); | |||||
| for (int b = 0; b < batch; b++) { | |||||
| int src_offset = b * plane * c4 * C4NUM; | |||||
| int dst_offset = b * plane * channel; | |||||
| for (int k = 0; k < plane; k++) { | |||||
| int src_kernel_offset = src_offset + k * C4NUM; | |||||
| int dst_kernel_offset = dst_offset + k * channel; | |||||
| for (int c = 0; c < c4 - 1; c++) { | |||||
| int src_c_offset = src_kernel_offset + c * plane * C4NUM; | |||||
| int dst_c_offset = dst_kernel_offset + c * C4NUM; | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t input_ptr = vld1q_f32((float *)src + src_c_offset); | |||||
| float32x4_t zero = vdupq_n_f32(0); | |||||
| input_ptr = vmaxq_f32(zero, input_ptr); | |||||
| vst1q_f32((float *)dst + dst_c_offset, input_ptr); | |||||
| #else | |||||
| for (int i = 0; i < C4NUM; ++i) { | |||||
| float input_data = ((float *)src + src_c_offset)[i]; | |||||
| input_data = input_data < 0 ? 0 : input_data; | |||||
| ((float *)dst + dst_c_offset)[i] = input_data; | |||||
| } | |||||
| #endif | |||||
| } | |||||
| // res part | |||||
| int res_c = channel - (c4 - 1) * C4NUM; | |||||
| for (int i = 0; i < res_c; i++) { | |||||
| int src_res_c_offset = src_kernel_offset + (c4 - 1) * C4NUM * plane + i; | |||||
| int dst_res_c_offset = dst_kernel_offset + (c4 - 1) * C4NUM + i; | |||||
| float input_data = ((float *)src + src_res_c_offset)[0]; | |||||
| input_data = input_data < 0 ? 0 : input_data; | |||||
| ((float *)dst + dst_res_c_offset)[0] = input_data; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| void PackNC4HW4ToNHWCRelu6Fp32(const void *src, void *dst, int batch, int plane, int channel) { | |||||
| int c4 = UP_DIV(channel, C4NUM); | |||||
| for (int b = 0; b < batch; b++) { | |||||
| int src_offset = b * plane * c4 * C4NUM; | |||||
| int dst_offset = b * plane * channel; | |||||
| for (int k = 0; k < plane; k++) { | |||||
| int src_kernel_offset = src_offset + k * C4NUM; | |||||
| int dst_kernel_offset = dst_offset + k * channel; | |||||
| for (int c = 0; c < c4 - 1; c++) { | |||||
| int src_c_offset = src_kernel_offset + c * plane * C4NUM; | |||||
| int dst_c_offset = dst_kernel_offset + c * C4NUM; | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t input_ptr = vld1q_f32((float *)src + src_c_offset); | |||||
| float32x4_t zero = vdupq_n_f32(0); | |||||
| float32x4_t six = vdupq_n_f32(6); | |||||
| input_ptr = vmaxq_f32(zero, input_ptr); | |||||
| input_ptr = vminq_f32(six, input_ptr); | |||||
| vst1q_f32((float *)dst + dst_c_offset, input_ptr); | |||||
| #else | |||||
| for (int i = 0; i < C4NUM; ++i) { | |||||
| float input_data = ((float *)src + src_c_offset)[i]; | |||||
| input_data = input_data < 0 ? 0 : input_data; | |||||
| input_data = input_data > 6 ? 6 : input_data; | |||||
| ((float *)dst + dst_c_offset)[i] = input_data; | |||||
| } | |||||
| #endif | |||||
| } | |||||
| // res part | |||||
| int res_c = channel - (c4 - 1) * C4NUM; | |||||
| for (int i = 0; i < res_c; i++) { | |||||
| int src_res_c_offset = src_kernel_offset + (c4 - 1) * C4NUM * plane + i; | |||||
| int dst_res_c_offset = dst_kernel_offset + (c4 - 1) * C4NUM + i; | |||||
| float input_data = ((float *)src + src_res_c_offset)[0]; | |||||
| input_data = input_data < 0 ? 0 : input_data; | |||||
| input_data = input_data > 6 ? 6 : input_data; | |||||
| ((float *)dst + dst_res_c_offset)[0] = input_data; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| void PackNC4HW4ToNCHWFp32(const void *src, void *dst, int batch, int plane, int channel) { | void PackNC4HW4ToNCHWFp32(const void *src, void *dst, int batch, int plane, int channel) { | ||||
| int c4 = UP_DIV(channel, C4NUM); | int c4 = UP_DIV(channel, C4NUM); | ||||
| for (int b = 0; b < batch; b++) { | for (int b = 0; b < batch; b++) { | ||||
| @@ -70,6 +70,10 @@ void PackNC4HW4ToNHWC4Fp32(const void *src, void *dst, int batch, int plane, int | |||||
| void PackNC4HW4ToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel); | void PackNC4HW4ToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel); | ||||
| void PackNC4HW4ToNHWCReluFp32(const void *src, void *dst, int batch, int plane, int channel); | |||||
| void PackNC4HW4ToNHWCRelu6Fp32(const void *src, void *dst, int batch, int plane, int channel); | |||||
| void PackNC4HW4ToNCHWFp32(const void *src, void *dst, int batch, int plane, int channel); | void PackNC4HW4ToNCHWFp32(const void *src, void *dst, int batch, int plane, int channel); | ||||
| void PackNHWCToC8HWN8Fp32(const void *src, void *dst, int batch, int plane, int channel); | void PackNHWCToC8HWN8Fp32(const void *src, void *dst, int batch, int plane, int channel); | ||||