Merge pull request !5086 from fuzhiye/tmptags/v1.0.0
| @@ -47,7 +47,7 @@ | |||||
| ///////////////////////////////////////////////////////////////////////////////// | ///////////////////////////////////////////////////////////////////////////////// | ||||
| // | // | ||||
| // void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth | // void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth | ||||
| // int row, int col, size_t stride, size_t writeNhwc, size_t writeC4) | |||||
| // int row, int col, size_t stride, size_t writeNhwc, size_t WriteWino) | |||||
| // x0: a | // x0: a | ||||
| // x1: b | // x1: b | ||||
| // x2: c | // x2: c | ||||
| @@ -64,11 +64,20 @@ MatmulFloatNeon64Opt: | |||||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| ldr x9, [sp, #8] | |||||
| ldr x14, [sp, #16] | |||||
| mov w18, #32 // sizeof(float) * 8 | mov w18, #32 // sizeof(float) * 8 | ||||
| mul w15, w5, w18 // block stride of lhs/rhs: sizeof(float) * 8 * depth | mul w15, w5, w18 // block stride of lhs/rhs: sizeof(float) * 8 * depth | ||||
| mov x11, x3 // bias flag | |||||
| mov x18, #4 | mov x18, #4 | ||||
| ldr x17, [sp] | ldr x17, [sp] | ||||
| cbz x14, NoWinoSteps | |||||
| mul x8, x7, x17 | |||||
| mov x11, #8 | |||||
| mul x11, x11, x17 | |||||
| mul x8, x8, x18 | |||||
| mul x11, x11, x18 | |||||
| NoWinoSteps: | |||||
| mul x17, x17, x18 | mul x17, x17, x18 | ||||
| L1: | L1: | ||||
| @@ -79,7 +88,6 @@ L1: | |||||
| L2: | L2: | ||||
| mov x16, x1 // reload rhs ptr | mov x16, x1 // reload rhs ptr | ||||
| mov w13, w5 // reload depth | mov w13, w5 // reload depth | ||||
| mov x14, x3 // reload bias ptr | |||||
| dup v8.4s, wzr | dup v8.4s, wzr | ||||
| dup v9.4s, wzr | dup v9.4s, wzr | ||||
| dup v10.4s, wzr | dup v10.4s, wzr | ||||
| @@ -150,7 +158,7 @@ Loop: | |||||
| fmla v11.4s, v4.4s, v0.s[1] | fmla v11.4s, v4.4s, v0.s[1] | ||||
| fmla v13.4s, v4.4s, v0.s[2] | fmla v13.4s, v4.4s, v0.s[2] | ||||
| fmla v15.4s, v4.4s, v0.s[3] | fmla v15.4s, v4.4s, v0.s[3] | ||||
| subs w13, w13, #1 | subs w13, w13, #1 | ||||
| bgt Loop | bgt Loop | ||||
| @@ -173,9 +181,10 @@ LoopEnd: | |||||
| fmla v31.4s, v4.4s, v2.s[3] | fmla v31.4s, v4.4s, v2.s[3] | ||||
| Bias: | Bias: | ||||
| cbz x11, Activation | |||||
| ld1 {v0.4s}, [x14], #16 | |||||
| ld1 {v1.4s}, [x14], #16 | |||||
| cbz x3, Activation | |||||
| ld1 {v0.4s}, [x3], #16 | |||||
| ld1 {v1.4s}, [x3] | |||||
| sub x3, x3, #16 | |||||
| fadd v8.4s, v8.4s, v0.4s | fadd v8.4s, v8.4s, v0.4s | ||||
| fadd v9.4s, v9.4s, v1.4s | fadd v9.4s, v9.4s, v1.4s | ||||
| fadd v10.4s, v10.4s, v0.4s | fadd v10.4s, v10.4s, v0.4s | ||||
| @@ -265,10 +274,8 @@ Relu: | |||||
| fmax v31.4s, v31.4s, v3.4s | fmax v31.4s, v31.4s, v3.4s | ||||
| Write: | Write: | ||||
| ldr w8, [sp, #8] | |||||
| cbz w8, WriteC8 | |||||
| ldr w8, [sp, #16] | |||||
| cbnz w8, WriteC4 | |||||
| cbnz x14, WriteWino | |||||
| cbz x9, WriteC8 | |||||
| cmp w7, #1 | cmp w7, #1 | ||||
| beq Write1 | beq Write1 | ||||
| cmp w7, #2 | cmp w7, #2 | ||||
| @@ -721,39 +728,26 @@ Write7: | |||||
| st1 {v31.s}[2], [x16], x17 | st1 {v31.s}[2], [x16], x17 | ||||
| b WriteEnd | b WriteEnd | ||||
| WriteC8: | WriteC8: | ||||
| st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x2], #64 | |||||
| st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x2], #64 | |||||
| st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64 | |||||
| st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64 | |||||
| st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x2], #64 | |||||
| st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x2], #64 | |||||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x2], #64 | |||||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x2], #64 | |||||
| st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x2], #64 | |||||
| st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x2], #64 | |||||
| st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x2], #64 | |||||
| st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x2], #64 | |||||
| b WriteEnd | b WriteEnd | ||||
| WriteC4: | |||||
| st1 {v8.8h}, [x2], #16 | |||||
| st1 {v10.8h}, [x2], #16 | |||||
| st1 {v12.8h}, [x2], #16 | |||||
| st1 {v14.8h}, [x2], #16 | |||||
| st1 {v16.8h}, [x2], #16 | |||||
| st1 {v18.8h}, [x2], #16 | |||||
| st1 {v20.8h}, [x2], #16 | |||||
| st1 {v22.8h}, [x2], #16 | |||||
| st1 {v24.8h}, [x2], #16 | |||||
| st1 {v26.8h}, [x2], #16 | |||||
| st1 {v28.8h}, [x2], #16 | |||||
| st1 {v30.8h}, [x2], #16 | |||||
| add x18, x2, x17 | |||||
| st1 {v9.8h}, [x18], #16 | |||||
| st1 {v11.8h}, [x18], #16 | |||||
| st1 {v13.8h}, [x18], #16 | |||||
| st1 {v15.8h}, [x18], #16 | |||||
| st1 {v17.8h}, [x18], #16 | |||||
| st1 {v19.8h}, [x18], #16 | |||||
| st1 {v21.8h}, [x18], #16 | |||||
| st1 {v23.8h}, [x18], #16 | |||||
| st1 {v25.8h}, [x18], #16 | |||||
| st1 {v27.8h}, [x18], #16 | |||||
| st1 {v29.8h}, [x18], #16 | |||||
| st1 {v31.8h}, [x18], #16 | |||||
| WriteWino: | |||||
| st1 {v8.4s, v9.4s}, [x18], x8 | |||||
| st1 {v10.4s, v11.4s}, [x18], x8 | |||||
| st1 {v12.4s, v13.4s}, [x18], x8 | |||||
| st1 {v14.4s, v15.4s}, [x18], x8 | |||||
| st1 {v16.4s, v17.4s}, [x18], x8 | |||||
| st1 {v18.4s, v19.4s}, [x18], x8 | |||||
| st1 {v20.4s, v21.4s}, [x18], x8 | |||||
| st1 {v22.4s, v23.4s}, [x18], x8 | |||||
| st1 {v24.4s, v25.4s}, [x18], x8 | |||||
| st1 {v26.4s, v27.4s}, [x18], x8 | |||||
| st1 {v28.4s, v29.4s}, [x18], x8 | |||||
| st1 {v30.4s, v31.4s}, [x18], x8 | |||||
| b WriteEnd | b WriteEnd | ||||
| Write8: | Write8: | ||||
| st1 {v8.4s, v9.4s}, [x18], x17 | st1 {v8.4s, v9.4s}, [x18], x17 | ||||
| @@ -798,15 +792,15 @@ WriteEnd: | |||||
| End2: | End2: | ||||
| subs w7, w7, #8 // rhs col - 8 | subs w7, w7, #8 // rhs col - 8 | ||||
| add x1, x1, x15 // rhs ptr + stride | add x1, x1, x15 // rhs ptr + stride | ||||
| cbz x3, NoBiasStep | |||||
| add x3, x3, #32 // bias ptr + stride | add x3, x3, #32 // bias ptr + stride | ||||
| ldr w8, [sp, #8] | |||||
| cbz w8, NoDstStep | |||||
| ldr w8, [sp, #16] | |||||
| cbnz w8, C4DstStep | |||||
| NoBiasStep: | |||||
| cbnz x14, WinoDstStep | |||||
| cbz x9, NoDstStep | |||||
| add x2, x2, #32 // dst ptr + stride | add x2, x2, #32 // dst ptr + stride | ||||
| b NoDstStep | b NoDstStep | ||||
| C4DstStep: | |||||
| add x2, x2, x17 | |||||
| WinoDstStep: | |||||
| add x2, x2, x11 | |||||
| NoDstStep: | NoDstStep: | ||||
| bgt L1 | bgt L1 | ||||
| @@ -32,8 +32,6 @@ typedef struct ConvParameter { | |||||
| int stride_w_; | int stride_w_; | ||||
| int dilation_h_; | int dilation_h_; | ||||
| int dilation_w_; | int dilation_w_; | ||||
| int pad_h_; | |||||
| int pad_w_; | |||||
| int pad_u_; | int pad_u_; | ||||
| int pad_d_; | int pad_d_; | ||||
| int pad_l_; | int pad_l_; | ||||
| @@ -51,8 +49,7 @@ typedef struct ConvParameter { | |||||
| int thread_num_; | int thread_num_; | ||||
| int input_unit_; | int input_unit_; | ||||
| int output_unit_; | int output_unit_; | ||||
| bool is_relu_; | |||||
| bool is_relu6_; | |||||
| ActType act_type_; | |||||
| } ConvParameter; | } ConvParameter; | ||||
| typedef struct SlidingWindowParam { | typedef struct SlidingWindowParam { | ||||
| @@ -53,16 +53,18 @@ void DepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float1 | |||||
| void DepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, int top, | void DepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, int top, | ||||
| int bottom, int left, int right, const ConvParameter *conv_param, | int bottom, int left, int right, const ConvParameter *conv_param, | ||||
| const SlidingWindowParam *sliding) { | const SlidingWindowParam *sliding) { | ||||
| bool relu = conv_param->act_type_ == ActType_Relu; | |||||
| bool relu6 = conv_param->act_type_ == ActType_Relu6; | |||||
| float16_t *dst_h = dst + top * sliding->out_h_step_; | float16_t *dst_h = dst + top * sliding->out_h_step_; | ||||
| for (int oh = top; oh < bottom; oh++) { | for (int oh = top; oh < bottom; oh++) { | ||||
| int ih = oh * conv_param->stride_h_ - conv_param->pad_h_; | |||||
| int ih = oh * conv_param->stride_h_ - conv_param->pad_u_; | |||||
| int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_)); | int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_)); | ||||
| int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_)); | int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_)); | ||||
| const float16_t *src_h = src + ih * sliding->in_h_step_; | const float16_t *src_h = src + ih * sliding->in_h_step_; | ||||
| float16_t *dst_kernel = dst_h + left * sliding->block_channel_; | float16_t *dst_kernel = dst_h + left * sliding->block_channel_; | ||||
| for (int ow = left; ow < right; ow++) { | for (int ow = left; ow < right; ow++) { | ||||
| int iw = ow * conv_param->stride_w_ - conv_param->pad_w_; | |||||
| int iw = ow * conv_param->stride_w_ - conv_param->pad_l_; | |||||
| int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_)); | int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_)); | ||||
| int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_)); | int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_)); | ||||
| const float16_t *src_w = src_h + iw * sliding->block_channel_; | const float16_t *src_w = src_h + iw * sliding->block_channel_; | ||||
| @@ -72,11 +74,10 @@ void DepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float16_t * | |||||
| #ifdef ENABLE_ARM64 | #ifdef ENABLE_ARM64 | ||||
| ConvDwFp16Border(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw, | ConvDwFp16Border(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw, | ||||
| sliding->in_kh_step_ * sizeof(float16_t), sliding->in_kw_step_ * sizeof(float16_t), | sliding->in_kh_step_ * sizeof(float16_t), sliding->in_kw_step_ * sizeof(float16_t), | ||||
| conv_param->kernel_w_ * C8NUM * sizeof(float16_t), conv_param->is_relu_, conv_param->is_relu6_); | |||||
| conv_param->kernel_w_ * C8NUM * sizeof(float16_t), relu, relu6); | |||||
| #else | #else | ||||
| DepthwiseBorderPixelFp16(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw, | DepthwiseBorderPixelFp16(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw, | ||||
| sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C8NUM, | |||||
| conv_param->is_relu_, conv_param->is_relu6_); | |||||
| sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C8NUM, relu, relu6); | |||||
| #endif | #endif | ||||
| dst_kernel += sliding->block_channel_; | dst_kernel += sliding->block_channel_; | ||||
| } // width loop | } // width loop | ||||
| @@ -139,6 +140,8 @@ void DepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t * | |||||
| void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data, | void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data, | ||||
| const float16_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding, | const float16_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding, | ||||
| int task_id) { | int task_id) { | ||||
| bool relu = conv_param->act_type_ == ActType_Relu; | |||||
| bool relu6 = conv_param->act_type_ == ActType_Relu6; | |||||
| const float16_t *src = input_data; | const float16_t *src = input_data; | ||||
| float16_t *dst = output_data; | float16_t *dst = output_data; | ||||
| for (int b = 0; b < conv_param->output_batch_; b++) { | for (int b = 0; b < conv_param->output_batch_; b++) { | ||||
| @@ -157,8 +160,8 @@ void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const flo | |||||
| conv_param->output_w_, conv_param, sliding); | conv_param->output_w_, conv_param, sliding); | ||||
| if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) { | if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) { | ||||
| int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_; | |||||
| int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_; | |||||
| int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_u_; | |||||
| int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_; | |||||
| const float16_t *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_; | const float16_t *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_; | ||||
| float16_t *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; | float16_t *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; | ||||
| #ifdef ENABLE_ARM64 | #ifdef ENABLE_ARM64 | ||||
| @@ -166,12 +169,12 @@ void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const flo | |||||
| conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_ * sizeof(float16_t), | conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_ * sizeof(float16_t), | ||||
| sliding->block_channel_ * sizeof(float16_t), sliding->in_sh_step_ * sizeof(float16_t), | sliding->block_channel_ * sizeof(float16_t), sliding->in_sh_step_ * sizeof(float16_t), | ||||
| sliding->in_sw_step_ * sizeof(float16_t), sliding->in_kh_step_ * sizeof(float16_t), | sliding->in_sw_step_ * sizeof(float16_t), sliding->in_kh_step_ * sizeof(float16_t), | ||||
| sliding->in_kw_step_ * sizeof(float16_t), conv_param->is_relu_, conv_param->is_relu6_); | |||||
| sliding->in_kw_step_ * sizeof(float16_t), relu, relu6); | |||||
| #else | #else | ||||
| DepthwiseCenterFp16(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, | DepthwiseCenterFp16(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, | ||||
| sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_, | sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_, | ||||
| sliding->out_h_step_, sliding->block_channel_, sliding->in_sh_step_, sliding->in_sw_step_, | sliding->out_h_step_, sliding->block_channel_, sliding->in_sh_step_, sliding->in_sw_step_, | ||||
| sliding->in_kh_step_, sliding->in_kw_step_, conv_param->is_relu_, conv_param->is_relu6_); | |||||
| sliding->in_kh_step_, sliding->in_kw_step_, relu, relu6); | |||||
| #endif | #endif | ||||
| } | } | ||||
| } // output C8 loop | } // output C8 loop | ||||
| @@ -210,14 +213,14 @@ void DeconvDepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float | |||||
| const SlidingWindowParam *sliding) { | const SlidingWindowParam *sliding) { | ||||
| const float16_t *src_h = src + top * sliding->out_h_step_; | const float16_t *src_h = src + top * sliding->out_h_step_; | ||||
| for (int ih = top; ih < bottom; ih++) { | for (int ih = top; ih < bottom; ih++) { | ||||
| int oh = ih * conv_param->stride_h_ - conv_param->pad_h_; | |||||
| int oh = ih * conv_param->stride_h_ - conv_param->pad_u_; | |||||
| int start_kh = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_)); | int start_kh = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_)); | ||||
| int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_)); | int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_)); | ||||
| float16_t *dst_h = dst + oh * sliding->in_h_step_; | float16_t *dst_h = dst + oh * sliding->in_h_step_; | ||||
| const float16_t *src_kernel = src_h + left * sliding->block_channel_; | const float16_t *src_kernel = src_h + left * sliding->block_channel_; | ||||
| for (int iw = left; iw < right; iw++) { | for (int iw = left; iw < right; iw++) { | ||||
| int ow = iw * conv_param->stride_w_ - conv_param->pad_w_; | |||||
| int ow = iw * conv_param->stride_w_ - conv_param->pad_l_; | |||||
| int start_kw = MSMAX(0, UP_DIV(-ow, conv_param->dilation_w_)); | int start_kw = MSMAX(0, UP_DIV(-ow, conv_param->dilation_w_)); | ||||
| int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->output_w_ - ow, conv_param->dilation_w_)); | int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->output_w_ - ow, conv_param->dilation_w_)); | ||||
| float16_t *dst_w = dst_h + ow * sliding->block_channel_; | float16_t *dst_w = dst_h + ow * sliding->block_channel_; | ||||
| @@ -282,12 +285,14 @@ void DeconvDepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float | |||||
| void DeconvDepthwisePostFuncFp16(float16_t *dst, const float16_t *bias, int block_channel, | void DeconvDepthwisePostFuncFp16(float16_t *dst, const float16_t *bias, int block_channel, | ||||
| const ConvParameter *conv_param) { | const ConvParameter *conv_param) { | ||||
| bool relu = conv_param->act_type_ == ActType_Relu; | |||||
| bool relu6 = conv_param->act_type_ == ActType_Relu6; | |||||
| float16_t *dst_k = dst; | float16_t *dst_k = dst; | ||||
| for (int k = 0; k < conv_param->output_h_ * conv_param->output_w_; k++) { | for (int k = 0; k < conv_param->output_h_ * conv_param->output_w_; k++) { | ||||
| for (int c = 0; c < C8NUM; c++) { | for (int c = 0; c < C8NUM; c++) { | ||||
| dst_k[c] += bias[c]; | dst_k[c] += bias[c]; | ||||
| dst_k[c] = (conv_param->is_relu_) ? (MSMAX(0, dst_k[c])) : (dst_k[c]); | |||||
| dst_k[c] = (conv_param->is_relu6_) ? (MSMIN(6, MSMAX(0, dst_k[c]))) : (dst_k[c]); | |||||
| dst_k[c] = (relu) ? (MSMAX(0, dst_k[c])) : (dst_k[c]); | |||||
| dst_k[c] = (relu6) ? (MSMIN(6, MSMAX(0, dst_k[c]))) : (dst_k[c]); | |||||
| } | } | ||||
| dst_k += block_channel; | dst_k += block_channel; | ||||
| } | } | ||||
| @@ -315,8 +320,8 @@ void DeconvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const f | |||||
| conv_param->input_w_, conv_param, sliding); | conv_param->input_w_, conv_param, sliding); | ||||
| if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) { | if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) { | ||||
| int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_; | |||||
| int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_; | |||||
| int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_u_; | |||||
| int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_; | |||||
| float16_t *out_t = dst_data + oh_h_start * sliding->in_h_step_ + oh_w_start * sliding->block_channel_; | float16_t *out_t = dst_data + oh_h_start * sliding->in_h_step_ + oh_w_start * sliding->block_channel_; | ||||
| const float16_t *in_t = | const float16_t *in_t = | ||||
| src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; | src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; | ||||
| @@ -173,16 +173,18 @@ void SWBorderPixel(float16_t *dst, const float16_t *src, const float16_t *weight | |||||
| void SWBorderFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, int top, | void SWBorderFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, int top, | ||||
| int bottom, int left, int right, const ConvParameter *conv_param, const SlidingWindowParam *sliding) { | int bottom, int left, int right, const ConvParameter *conv_param, const SlidingWindowParam *sliding) { | ||||
| bool relu = conv_param->act_type_ == ActType_Relu; | |||||
| bool relu6 = conv_param->act_type_ == ActType_Relu6; | |||||
| float16_t *dst_h = dst + top * sliding->out_h_step_; | float16_t *dst_h = dst + top * sliding->out_h_step_; | ||||
| for (int oh = top; oh < bottom; oh++) { | for (int oh = top; oh < bottom; oh++) { | ||||
| int ih = oh * conv_param->stride_h_ - conv_param->pad_h_; | |||||
| int ih = oh * conv_param->stride_h_ - conv_param->pad_u_; | |||||
| int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_)); | int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_)); | ||||
| int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_)); | int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_)); | ||||
| const float16_t *src_h = src + ih * sliding->in_h_step_; | const float16_t *src_h = src + ih * sliding->in_h_step_; | ||||
| float16_t *dst_kernel = dst_h + left * sliding->block_channel_; | float16_t *dst_kernel = dst_h + left * sliding->block_channel_; | ||||
| for (int ow = left; ow < right; ow++) { | for (int ow = left; ow < right; ow++) { | ||||
| int iw = ow * conv_param->stride_w_ - conv_param->pad_w_; | |||||
| int iw = ow * conv_param->stride_w_ - conv_param->pad_l_; | |||||
| int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_)); | int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_)); | ||||
| int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_)); | int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_)); | ||||
| const float16_t *src_w = src_h + iw * sliding->ic4_channel_; | const float16_t *src_w = src_h + iw * sliding->ic4_channel_; | ||||
| @@ -192,7 +194,7 @@ void SWBorderFp16(float16_t *dst, const float16_t *src, const float16_t *weight, | |||||
| SWBorderPixel(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw, | SWBorderPixel(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw, | ||||
| sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_h_, conv_param->kernel_w_, | sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_h_, conv_param->kernel_w_, | ||||
| sliding->ic4_channel_, conv_param->is_relu_, conv_param->is_relu6_); | |||||
| sliding->ic4_channel_, relu, relu6); | |||||
| dst_kernel += sliding->block_channel_; | dst_kernel += sliding->block_channel_; | ||||
| } // width loop | } // width loop | ||||
| @@ -273,6 +275,8 @@ void SWCenterFp16(float16_t *dst, const float16_t *src, const float16_t *weight, | |||||
| void ConvSWFp16(const float16_t *input_data, const float16_t *packed_weight, const float16_t *bias_data, | void ConvSWFp16(const float16_t *input_data, const float16_t *packed_weight, const float16_t *bias_data, | ||||
| float16_t *tmp_out_block, float16_t *output_data, int task_id, ConvParameter *conv_param, | float16_t *tmp_out_block, float16_t *output_data, int task_id, ConvParameter *conv_param, | ||||
| SlidingWindowParam *slidingWindow_param) { | SlidingWindowParam *slidingWindow_param) { | ||||
| bool relu = conv_param->act_type_ == ActType_Relu; | |||||
| bool relu6 = conv_param->act_type_ == ActType_Relu6; | |||||
| int oc4_res = conv_param->output_channel_ % C4NUM; | int oc4_res = conv_param->output_channel_ % C4NUM; | ||||
| const float16_t *src = input_data; | const float16_t *src = input_data; | ||||
| float16_t *dst; | float16_t *dst; | ||||
| @@ -299,8 +303,8 @@ void ConvSWFp16(const float16_t *input_data, const float16_t *packed_weight, con | |||||
| if (slidingWindow_param->right_ > slidingWindow_param->left_ && | if (slidingWindow_param->right_ > slidingWindow_param->left_ && | ||||
| slidingWindow_param->bottom_ > slidingWindow_param->top_) { | slidingWindow_param->bottom_ > slidingWindow_param->top_) { | ||||
| int in_h_start = slidingWindow_param->top_ * conv_param->stride_h_ - conv_param->pad_h_; | |||||
| int in_w_start = slidingWindow_param->left_ * conv_param->stride_w_ - conv_param->pad_w_; | |||||
| int in_h_start = slidingWindow_param->top_ * conv_param->stride_h_ - conv_param->pad_u_; | |||||
| int in_w_start = slidingWindow_param->left_ * conv_param->stride_w_ - conv_param->pad_l_; | |||||
| const float16_t *in_t = | const float16_t *in_t = | ||||
| src_data + in_h_start * slidingWindow_param->in_h_step_ + in_w_start * slidingWindow_param->ic4_channel_; | src_data + in_h_start * slidingWindow_param->in_h_step_ + in_w_start * slidingWindow_param->ic4_channel_; | ||||
| float16_t *out_t = dst_data + slidingWindow_param->top_ * slidingWindow_param->out_h_step_ + | float16_t *out_t = dst_data + slidingWindow_param->top_ * slidingWindow_param->out_h_step_ + | ||||
| @@ -310,7 +314,7 @@ void ConvSWFp16(const float16_t *input_data, const float16_t *packed_weight, con | |||||
| conv_param->kernel_w_, slidingWindow_param->out_h_step_, slidingWindow_param->block_channel_, | conv_param->kernel_w_, slidingWindow_param->out_h_step_, slidingWindow_param->block_channel_, | ||||
| slidingWindow_param->ic4_channel_, slidingWindow_param->in_sh_step_, | slidingWindow_param->ic4_channel_, slidingWindow_param->in_sh_step_, | ||||
| slidingWindow_param->in_sw_step_, slidingWindow_param->in_kh_step_, | slidingWindow_param->in_sw_step_, slidingWindow_param->in_kh_step_, | ||||
| slidingWindow_param->in_kw_step_, conv_param->is_relu_, conv_param->is_relu6_); | |||||
| slidingWindow_param->in_kw_step_, relu, relu6); | |||||
| } | } | ||||
| } // output C4 loop | } // output C4 loop | ||||
| src += slidingWindow_param->in_step_; | src += slidingWindow_param->in_step_; | ||||
| @@ -330,8 +334,8 @@ void ConvFp16(float16_t *input_data, float16_t *packed_input, float16_t *packed_ | |||||
| int out_h = conv_param->output_h_; | int out_h = conv_param->output_h_; | ||||
| int out_w = conv_param->output_w_; | int out_w = conv_param->output_w_; | ||||
| int out_channel = conv_param->output_channel_; | int out_channel = conv_param->output_channel_; | ||||
| bool relu = conv_param->is_relu_; | |||||
| bool relu6 = conv_param->is_relu6_; | |||||
| bool relu = conv_param->act_type_ == ActType_Relu; | |||||
| bool relu6 = conv_param->act_type_ == ActType_Relu6; | |||||
| int thread_count = conv_param->thread_num_; | int thread_count = conv_param->thread_num_; | ||||
| const int tile_n = 16; | const int tile_n = 16; | ||||
| int output_count = out_h * out_w; | int output_count = out_h * out_w; | ||||
| @@ -73,8 +73,8 @@ int DeConvPostFp16(const float16_t *src, float16_t *tmp, const float16_t *bias, | |||||
| for (int ih = 0; ih < conv_param->input_h_; ih++) { | for (int ih = 0; ih < conv_param->input_h_; ih++) { | ||||
| for (int iw = 0; iw < conv_param->input_w_; iw++) { | for (int iw = 0; iw < conv_param->input_w_; iw++) { | ||||
| int oh = ih * conv_param->stride_h_ - conv_param->pad_h_; | |||||
| int ow = iw * conv_param->stride_w_ - conv_param->pad_w_; | |||||
| int oh = ih * conv_param->stride_h_ - conv_param->pad_u_; | |||||
| int ow = iw * conv_param->stride_w_ - conv_param->pad_l_; | |||||
| int kh_start = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_)); | int kh_start = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_)); | ||||
| int kh_end = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_)); | int kh_end = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_)); | ||||
| @@ -112,7 +112,7 @@ int DeConvPostFp16(const float16_t *src, float16_t *tmp, const float16_t *bias, | |||||
| } /*ih*/ | } /*ih*/ | ||||
| } /*oc8*/ | } /*oc8*/ | ||||
| PostConvFuncFp16C8(tmp, dst, bias, output_channel, output_plane, conv_param->output_channel_, conv_param->is_relu_, | |||||
| conv_param->is_relu6_); | |||||
| PostConvFuncFp16C8(tmp, dst, bias, output_channel, output_plane, conv_param->output_channel_, | |||||
| conv_param->act_type_ == ActType_Relu, conv_param->act_type_ == ActType_Relu6); | |||||
| return NNACL_OK; | return NNACL_OK; | ||||
| } | } | ||||
| @@ -21,14 +21,14 @@ | |||||
| void Conv1x1InputPackFp16(const float16_t *src, float16_t *dst, ConvParameter *conv_param) { | void Conv1x1InputPackFp16(const float16_t *src, float16_t *dst, ConvParameter *conv_param) { | ||||
| /* support nhwc */ | /* support nhwc */ | ||||
| for (int dst_h = 0; dst_h < conv_param->output_h_; dst_h++) { | for (int dst_h = 0; dst_h < conv_param->output_h_; dst_h++) { | ||||
| int src_h = dst_h * conv_param->stride_h_ - conv_param->pad_h_; | |||||
| int src_h = dst_h * conv_param->stride_h_ - conv_param->pad_u_; | |||||
| if (src_h < 0 || src_h >= conv_param->input_h_) { | if (src_h < 0 || src_h >= conv_param->input_h_) { | ||||
| continue; | continue; | ||||
| } | } | ||||
| const float16_t *src_h_ptr = src + src_h * conv_param->input_w_ * conv_param->input_channel_; | const float16_t *src_h_ptr = src + src_h * conv_param->input_w_ * conv_param->input_channel_; | ||||
| float16_t *dst_h_ptr = dst + dst_h * conv_param->output_w_ * conv_param->input_channel_; | float16_t *dst_h_ptr = dst + dst_h * conv_param->output_w_ * conv_param->input_channel_; | ||||
| for (int dst_w = 0; dst_w < conv_param->output_w_; dst_w++) { | for (int dst_w = 0; dst_w < conv_param->output_w_; dst_w++) { | ||||
| int src_w = dst_w * conv_param->stride_w_ - conv_param->pad_w_; | |||||
| int src_w = dst_w * conv_param->stride_w_ - conv_param->pad_l_; | |||||
| if (src_w < 0 || src_w >= conv_param->input_w_) { | if (src_w < 0 || src_w >= conv_param->input_w_) { | ||||
| continue; | continue; | ||||
| } | } | ||||
| @@ -46,8 +46,8 @@ void Im2ColPackUnitFp16(float16_t *input_data, ConvParameter *conv_param, float1 | |||||
| int kernel_w = conv_param->kernel_w_; | int kernel_w = conv_param->kernel_w_; | ||||
| int stride_h = conv_param->stride_h_; | int stride_h = conv_param->stride_h_; | ||||
| int stride_w = conv_param->stride_w_; | int stride_w = conv_param->stride_w_; | ||||
| int pad_h = conv_param->pad_h_; | |||||
| int pad_w = conv_param->pad_w_; | |||||
| int pad_h = conv_param->pad_u_; | |||||
| int pad_w = conv_param->pad_l_; | |||||
| int dilation_h = conv_param->dilation_h_; | int dilation_h = conv_param->dilation_h_; | ||||
| int dilation_w = conv_param->dilation_w_; | int dilation_w = conv_param->dilation_w_; | ||||
| int in_channel = conv_param->input_channel_; | int in_channel = conv_param->input_channel_; | ||||
| @@ -230,8 +230,8 @@ void Conv3x3Fp16InputTransform(const float16_t *input_data, float16_t *trans_inp | |||||
| int input_channel = conv_param->input_channel_; | int input_channel = conv_param->input_channel_; | ||||
| int input_width = conv_param->input_w_; | int input_width = conv_param->input_w_; | ||||
| int input_height = conv_param->input_h_; | int input_height = conv_param->input_h_; | ||||
| int pad_w = conv_param->pad_w_; | |||||
| int pad_h = conv_param->pad_h_; | |||||
| int pad_w = conv_param->pad_l_; | |||||
| int pad_h = conv_param->pad_u_; | |||||
| int ic8 = UP_DIV(input_channel, C8NUM); | int ic8 = UP_DIV(input_channel, C8NUM); | ||||
| if (out_w_block == 0) { | if (out_w_block == 0) { | ||||
| return; | return; | ||||
| @@ -576,8 +576,8 @@ void WinogradInputTransformFp16(const float16_t *input_data, float16_t *trans_in | |||||
| int output_unit = conv_param->output_unit_; | int output_unit = conv_param->output_unit_; | ||||
| int in_channel = conv_param->input_channel_; | int in_channel = conv_param->input_channel_; | ||||
| int ic8 = UP_DIV(in_channel, C8NUM); | int ic8 = UP_DIV(in_channel, C8NUM); | ||||
| int pad_h = conv_param->pad_h_; | |||||
| int pad_w = conv_param->pad_w_; | |||||
| int pad_h = conv_param->pad_u_; | |||||
| int pad_w = conv_param->pad_l_; | |||||
| int input_h = conv_param->input_h_; | int input_h = conv_param->input_h_; | ||||
| int input_w = conv_param->input_w_; | int input_w = conv_param->input_w_; | ||||
| if (out_w_block_num == 0) { | if (out_w_block_num == 0) { | ||||
| @@ -18,6 +18,7 @@ | |||||
| #include <string.h> | #include <string.h> | ||||
| #include "nnacl/fp32/common_func.h" | #include "nnacl/fp32/common_func.h" | ||||
| #include "nnacl/winograd_transform.h" | #include "nnacl/winograd_transform.h" | ||||
| #include "nnacl/fp32/matmul.h" | |||||
| void SWBorderPixel(float *dst, const float *src, const float *weight, const float *bias, int height, int width, | void SWBorderPixel(float *dst, const float *src, const float *weight, const float *bias, int height, int width, | ||||
| int in_kh_step, int in_kw_step, int kernel_h, int kernel_w, int ic4, bool is_relu, bool is_relu6) { | int in_kh_step, int in_kw_step, int kernel_h, int kernel_w, int ic4, bool is_relu, bool is_relu6) { | ||||
| @@ -57,16 +58,18 @@ void SWBorderPixel(float *dst, const float *src, const float *weight, const floa | |||||
| void SWBorder(float *dst, const float *src, const float *weight, const float *bias, int top, int bottom, int left, | void SWBorder(float *dst, const float *src, const float *weight, const float *bias, int top, int bottom, int left, | ||||
| int right, const ConvParameter *conv_param, const SlidingWindowParam *sliding) { | int right, const ConvParameter *conv_param, const SlidingWindowParam *sliding) { | ||||
| int ic4 = sliding->ic4_channel_ / C4NUM; | int ic4 = sliding->ic4_channel_ / C4NUM; | ||||
| bool relu = conv_param->act_type_ == ActType_Relu; | |||||
| bool relu6 = conv_param->act_type_ == ActType_Relu6; | |||||
| float *dst_h = dst + top * sliding->out_h_step_; | float *dst_h = dst + top * sliding->out_h_step_; | ||||
| for (int oh = top; oh < bottom; oh++) { | for (int oh = top; oh < bottom; oh++) { | ||||
| int ih = oh * conv_param->stride_h_ - conv_param->pad_h_; | |||||
| int ih = oh * conv_param->stride_h_ - conv_param->pad_u_; | |||||
| int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_)); | int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_)); | ||||
| int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_)); | int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_)); | ||||
| const float *src_h = src + ih * sliding->in_h_step_; | const float *src_h = src + ih * sliding->in_h_step_; | ||||
| float *dst_kernel = dst_h + left * sliding->block_channel_; | float *dst_kernel = dst_h + left * sliding->block_channel_; | ||||
| for (int ow = left; ow < right; ow++) { | for (int ow = left; ow < right; ow++) { | ||||
| int iw = ow * conv_param->stride_w_ - conv_param->pad_w_; | |||||
| int iw = ow * conv_param->stride_w_ - conv_param->pad_l_; | |||||
| int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_)); | int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_)); | ||||
| int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_)); | int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_)); | ||||
| const float *src_w = src_h + iw * sliding->ic4_channel_; | const float *src_w = src_h + iw * sliding->ic4_channel_; | ||||
| @@ -75,8 +78,8 @@ void SWBorder(float *dst, const float *src, const float *weight, const float *bi | |||||
| const float *weight_kernel = weight + (start_kh * conv_param->kernel_w_ + start_kw) * sliding->ic4_channel_; | const float *weight_kernel = weight + (start_kh * conv_param->kernel_w_ + start_kw) * sliding->ic4_channel_; | ||||
| SWBorderPixel(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw, | SWBorderPixel(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw, | ||||
| sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_h_, conv_param->kernel_w_, ic4, | |||||
| conv_param->is_relu_, conv_param->is_relu6_); | |||||
| sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_h_, conv_param->kernel_w_, ic4, relu, | |||||
| relu6); | |||||
| dst_kernel += sliding->block_channel_; | dst_kernel += sliding->block_channel_; | ||||
| } // width loop | } // width loop | ||||
| @@ -144,6 +147,8 @@ void ConvSWFp32(const float *input_data, const float *packed_weight, const float | |||||
| float *output_data, int task_id, ConvParameter *conv_param, SlidingWindowParam *slidingWindow_param) { | float *output_data, int task_id, ConvParameter *conv_param, SlidingWindowParam *slidingWindow_param) { | ||||
| int ic4 = slidingWindow_param->ic4_channel_ / C4NUM; | int ic4 = slidingWindow_param->ic4_channel_ / C4NUM; | ||||
| int oc4_res = conv_param->output_channel_ % C4NUM; | int oc4_res = conv_param->output_channel_ % C4NUM; | ||||
| bool relu = conv_param->act_type_ == ActType_Relu; | |||||
| bool relu6 = conv_param->act_type_ == ActType_Relu6; | |||||
| const float *src = input_data; | const float *src = input_data; | ||||
| float *dst = NULL; | float *dst = NULL; | ||||
| if (oc4_res == 0) { | if (oc4_res == 0) { | ||||
| @@ -169,28 +174,26 @@ void ConvSWFp32(const float *input_data, const float *packed_weight, const float | |||||
| if (slidingWindow_param->right_ > slidingWindow_param->left_ && | if (slidingWindow_param->right_ > slidingWindow_param->left_ && | ||||
| slidingWindow_param->bottom_ > slidingWindow_param->top_) { | slidingWindow_param->bottom_ > slidingWindow_param->top_) { | ||||
| int in_h_start = slidingWindow_param->top_ * conv_param->stride_h_ - conv_param->pad_h_; | |||||
| int in_w_start = slidingWindow_param->left_ * conv_param->stride_w_ - conv_param->pad_w_; | |||||
| int in_h_start = slidingWindow_param->top_ * conv_param->stride_h_ - conv_param->pad_u_; | |||||
| int in_w_start = slidingWindow_param->left_ * conv_param->stride_w_ - conv_param->pad_l_; | |||||
| const float *in_t = | const float *in_t = | ||||
| src_data + in_h_start * slidingWindow_param->in_h_step_ + in_w_start * slidingWindow_param->ic4_channel_; | src_data + in_h_start * slidingWindow_param->in_h_step_ + in_w_start * slidingWindow_param->ic4_channel_; | ||||
| float *out_t = dst_data + slidingWindow_param->top_ * slidingWindow_param->out_h_step_ + | float *out_t = dst_data + slidingWindow_param->top_ * slidingWindow_param->out_h_step_ + | ||||
| slidingWindow_param->left_ * slidingWindow_param->block_channel_; | slidingWindow_param->left_ * slidingWindow_param->block_channel_; | ||||
| #ifdef ENABLE_ARM64 | #ifdef ENABLE_ARM64 | ||||
| ConvSwFp32Center(out_t, in_t, weight, bias, slidingWindow_param->bottom_ - slidingWindow_param->top_, | |||||
| slidingWindow_param->right_ - slidingWindow_param->left_, conv_param->kernel_h_, | |||||
| conv_param->kernel_w_, slidingWindow_param->out_h_step_ * sizeof(float), | |||||
| slidingWindow_param->block_channel_ * sizeof(float), ic4, | |||||
| slidingWindow_param->in_sh_step_ * sizeof(float), | |||||
| slidingWindow_param->in_sw_step_ * sizeof(float), | |||||
| slidingWindow_param->in_kh_step_ * sizeof(float), | |||||
| slidingWindow_param->in_kw_step_ * sizeof(float), | |||||
| conv_param->is_relu_, conv_param->is_relu6_); | |||||
| ConvSwFp32Center( | |||||
| out_t, in_t, weight, bias, slidingWindow_param->bottom_ - slidingWindow_param->top_, | |||||
| slidingWindow_param->right_ - slidingWindow_param->left_, conv_param->kernel_h_, conv_param->kernel_w_, | |||||
| slidingWindow_param->out_h_step_ * sizeof(float), slidingWindow_param->block_channel_ * sizeof(float), ic4, | |||||
| slidingWindow_param->in_sh_step_ * sizeof(float), slidingWindow_param->in_sw_step_ * sizeof(float), | |||||
| slidingWindow_param->in_kh_step_ * sizeof(float), slidingWindow_param->in_kw_step_ * sizeof(float), relu, | |||||
| relu6); | |||||
| #else | #else | ||||
| SWCenter(out_t, in_t, weight, bias, slidingWindow_param->bottom_ - slidingWindow_param->top_, | SWCenter(out_t, in_t, weight, bias, slidingWindow_param->bottom_ - slidingWindow_param->top_, | ||||
| slidingWindow_param->right_ - slidingWindow_param->left_, conv_param->kernel_h_, | |||||
| conv_param->kernel_w_, slidingWindow_param->out_h_step_, slidingWindow_param->block_channel_, ic4, | |||||
| slidingWindow_param->right_ - slidingWindow_param->left_, conv_param->kernel_h_, conv_param->kernel_w_, | |||||
| slidingWindow_param->out_h_step_, slidingWindow_param->block_channel_, ic4, | |||||
| slidingWindow_param->in_sh_step_, slidingWindow_param->in_sw_step_, slidingWindow_param->in_kh_step_, | slidingWindow_param->in_sh_step_, slidingWindow_param->in_sw_step_, slidingWindow_param->in_kh_step_, | ||||
| slidingWindow_param->in_kw_step_, conv_param->is_relu_, conv_param->is_relu6_); | |||||
| slidingWindow_param->in_kw_step_, relu, relu6); | |||||
| #endif | #endif | ||||
| } | } | ||||
| } // output C4 loop | } // output C4 loop | ||||
| @@ -219,6 +222,8 @@ void ConvFp32(float *input_data, float *packed_input, float *packed_weight, cons | |||||
| int kernel_plane = kernel_h * kernel_w; | int kernel_plane = kernel_h * kernel_w; | ||||
| int unit_size = kernel_plane * ic4 * C4NUM; | int unit_size = kernel_plane * ic4 * C4NUM; | ||||
| int packed_input_size = output_tile_count * TILE_NUM * unit_size; | int packed_input_size = output_tile_count * TILE_NUM * unit_size; | ||||
| bool relu = conv_param->act_type_ == ActType_Relu; | |||||
| bool relu6 = conv_param->act_type_ == ActType_Relu6; | |||||
| // we accumulate 4 channels per time for input blocks | // we accumulate 4 channels per time for input blocks | ||||
| int conv_depth = kernel_h * kernel_w; | int conv_depth = kernel_h * kernel_w; | ||||
| @@ -240,11 +245,11 @@ void ConvFp32(float *input_data, float *packed_input, float *packed_weight, cons | |||||
| if (real_cal_num == TILE_NUM) { | if (real_cal_num == TILE_NUM) { | ||||
| float *gemm_output = output_data + out_offset; | float *gemm_output = output_data + out_offset; | ||||
| gemm_func(gemm_output, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel, output_offset, 0, 0, | gemm_func(gemm_output, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel, output_offset, 0, 0, | ||||
| conv_param->is_relu_, conv_param->is_relu6_); | |||||
| relu, relu6); | |||||
| } else { | } else { | ||||
| // res part | // res part | ||||
| gemm_func(tmp_out_block, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel, output_offset, 0, | gemm_func(tmp_out_block, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel, output_offset, 0, | ||||
| 0, conv_param->is_relu_, conv_param->is_relu6_); | |||||
| 0, relu, relu6); | |||||
| memcpy(output_data + out_offset, tmp_out_block, real_cal_num * out_channel * sizeof(float)); | memcpy(output_data + out_offset, tmp_out_block, real_cal_num * out_channel * sizeof(float)); | ||||
| } | } | ||||
| } | } | ||||
| @@ -264,34 +269,42 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_ | |||||
| int out_w_block = UP_DIV(conv_param->output_w_, out_unit); | int out_w_block = UP_DIV(conv_param->output_w_, out_unit); | ||||
| int out_h_block = UP_DIV(conv_param->output_h_, out_unit); | int out_h_block = UP_DIV(conv_param->output_h_, out_unit); | ||||
| int output_count = out_w_block * out_h_block; | int output_count = out_w_block * out_h_block; | ||||
| int output_tile_count = UP_DIV(output_count, TILE_NUM); | |||||
| int output_tile_count = UP_DIV(output_count, C12NUM); | |||||
| int out_channel = conv_param->output_channel_; | int out_channel = conv_param->output_channel_; | ||||
| int oc4 = UP_DIV(out_channel, C4NUM); | int oc4 = UP_DIV(out_channel, C4NUM); | ||||
| int oc8 = UP_DIV(out_channel, C8NUM); | |||||
| int input_unit_square = input_unit * input_unit; | int input_unit_square = input_unit * input_unit; | ||||
| size_t output_offset = oc4 * C4NUM * input_unit_square * sizeof(float); | |||||
| float *trans_input = buffer_list[0]; | float *trans_input = buffer_list[0]; | ||||
| float *gemm_out = buffer_list[1]; | float *gemm_out = buffer_list[1]; | ||||
| float *tmp_out_data = buffer_list[2]; | float *tmp_out_data = buffer_list[2]; | ||||
| float *tmp_data = buffer_list[3]; | float *tmp_data = buffer_list[3]; | ||||
| int trans_input_offset = TILE_NUM * input_unit_square * ic4 * C4NUM; | |||||
| int gemm_out_offset = TILE_NUM * input_unit_square * oc4 * C4NUM; | |||||
| float *col_buffer = buffer_list[4]; | |||||
| int trans_input_offset = C12NUM * input_unit_square * ic4 * C4NUM; | |||||
| int gemm_out_offset = C12NUM * input_unit_square * oc8 * C8NUM; | |||||
| int tmp_data_offset = input_unit_square * C4NUM; | int tmp_data_offset = input_unit_square * C4NUM; | ||||
| int col_buffer_offset = C12NUM * ic4 * C4NUM; | |||||
| // step 1 : filter transform (pre-processed offline) | // step 1 : filter transform (pre-processed offline) | ||||
| // step 2 : input transform (online) | // step 2 : input transform (online) | ||||
| for (int b = 0; b < in_batch; b++) { | for (int b = 0; b < in_batch; b++) { | ||||
| int in_batch_offset = b * ic4 * C4NUM * conv_param->input_h_ * conv_param->input_w_; | int in_batch_offset = b * ic4 * C4NUM * conv_param->input_h_ * conv_param->input_w_; | ||||
| int tmp_out_batch_offset = b * out_w_block * out_h_block * out_unit * out_unit * oc4 * C4NUM; | int tmp_out_batch_offset = b * out_w_block * out_h_block * out_unit * out_unit * oc4 * C4NUM; | ||||
| for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_num) { | for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_num) { | ||||
| int out_tile_index = thread_id * TILE_NUM; | |||||
| int cal_num = output_count - thread_id * TILE_NUM; | |||||
| cal_num = cal_num > TILE_NUM ? TILE_NUM : cal_num; | |||||
| int out_tile_index = thread_id * C12NUM; | |||||
| int cal_num = output_count - thread_id * C12NUM; | |||||
| cal_num = cal_num > C12NUM ? C12NUM : cal_num; | |||||
| WinogradInputTransform(input_data + in_batch_offset, trans_input + task_id * trans_input_offset, | WinogradInputTransform(input_data + in_batch_offset, trans_input + task_id * trans_input_offset, | ||||
| tmp_data + task_id * tmp_data_offset, cal_num, out_tile_index, out_w_block, conv_param, | tmp_data + task_id * tmp_data_offset, cal_num, out_tile_index, out_w_block, conv_param, | ||||
| input_trans_func); | input_trans_func); | ||||
| // step 3 : gemm | // step 3 : gemm | ||||
| gemm_func(gemm_out + task_id * gemm_out_offset, trans_input + task_id * trans_input_offset, trans_weight, NULL, | |||||
| input_unit_square, ic4, oc4 * C4NUM, output_offset, 1, 1, 0, 0); | |||||
| float *src_ptr = trans_input + task_id * trans_input_offset; | |||||
| float *dst_ptr = gemm_out + task_id * gemm_out_offset; | |||||
| float *tmp_col_ptr = col_buffer + task_id * col_buffer_offset; | |||||
| for (int i = 0; i < input_unit_square; ++i) { | |||||
| RowMajor2Col12Major(src_ptr + i * C12NUM * ic4 * C4NUM, tmp_col_ptr, C12NUM, ic4 * C4NUM); | |||||
| MatMulOpt(tmp_col_ptr, trans_weight + i * ic4 * C4NUM * oc8 * C8NUM, dst_ptr + i * C8NUM, NULL, 0, ic4 * C4NUM, | |||||
| C12NUM, oc8 * C8NUM, input_unit_square, 2); | |||||
| } | |||||
| // step 4 : output transform | // step 4 : output transform | ||||
| WinogradOutputTransform(gemm_out + task_id * gemm_out_offset, tmp_out_data + tmp_out_batch_offset, bias_data, | WinogradOutputTransform(gemm_out + task_id * gemm_out_offset, tmp_out_data + tmp_out_batch_offset, bias_data, | ||||
| @@ -442,18 +455,21 @@ void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_dat | |||||
| int ic4 = UP_DIV(conv_param->input_channel_, C4NUM); | int ic4 = UP_DIV(conv_param->input_channel_, C4NUM); | ||||
| int output_channel = conv_param->output_channel_; | int output_channel = conv_param->output_channel_; | ||||
| int oc4 = UP_DIV(output_channel, C4NUM); | int oc4 = UP_DIV(output_channel, C4NUM); | ||||
| int oc8 = UP_DIV(output_channel, C8NUM); | |||||
| int out_w_block = UP_DIV(conv_param->output_w_, OUPUT_UNIT); | int out_w_block = UP_DIV(conv_param->output_w_, OUPUT_UNIT); | ||||
| int out_h_block = UP_DIV(conv_param->output_h_, OUPUT_UNIT); | int out_h_block = UP_DIV(conv_param->output_h_, OUPUT_UNIT); | ||||
| int output_count = out_w_block * out_h_block; | int output_count = out_w_block * out_h_block; | ||||
| int output_tile_count = UP_DIV(output_count, TILE_NUM); | |||||
| int output_tile_count = UP_DIV(output_count, C12NUM); | |||||
| const int input_unit_square = 4 * 4; | const int input_unit_square = 4 * 4; | ||||
| float *tile_buffer = buffer_list[0]; | float *tile_buffer = buffer_list[0]; | ||||
| float *block_unit_buffer = buffer_list[1]; | float *block_unit_buffer = buffer_list[1]; | ||||
| float *tmp_dst_buffer = buffer_list[2]; | float *tmp_dst_buffer = buffer_list[2]; | ||||
| float *nc4hw4_out = buffer_list[3]; | float *nc4hw4_out = buffer_list[3]; | ||||
| int tile_buffer_offset = TILE_NUM * input_unit_square * ic4 * C4NUM; | |||||
| float *col_buffer = buffer_list[4]; | |||||
| int tile_buffer_offset = C12NUM * input_unit_square * ic4 * C4NUM; | |||||
| int block_unit_buffer_offset = input_unit_square * C4NUM; | int block_unit_buffer_offset = input_unit_square * C4NUM; | ||||
| int tmp_dst_buffer_offset = TILE_NUM * input_unit_square * oc4 * C4NUM; | |||||
| int tmp_dst_buffer_offset = C12NUM * input_unit_square * oc8 * C8NUM; | |||||
| int col_buffer_offset = C12NUM * ic4 * C4NUM; | |||||
| int input_batch = conv_param->input_batch_; | int input_batch = conv_param->input_batch_; | ||||
| for (int batch = 0; batch < input_batch; batch++) { | for (int batch = 0; batch < input_batch; batch++) { | ||||
| @@ -461,15 +477,20 @@ void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_dat | |||||
| int nc4hw4_buffer_offset = batch * oc4 * C4NUM * conv_param->output_h_ * conv_param->output_w_; | int nc4hw4_buffer_offset = batch * oc4 * C4NUM * conv_param->output_h_ * conv_param->output_w_; | ||||
| for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) { | for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) { | ||||
| int start_index = thread_id * TILE_NUM; | |||||
| int real_cal_num = (output_count - start_index) < TILE_NUM ? (output_count - start_index) : TILE_NUM; | |||||
| int start_index = thread_id * C12NUM; | |||||
| int real_cal_num = (output_count - start_index) < C12NUM ? (output_count - start_index) : C12NUM; | |||||
| Conv3x3Fp32InputTransform(input_data + in_batch_offset, tile_buffer + task_id * tile_buffer_offset, | Conv3x3Fp32InputTransform(input_data + in_batch_offset, tile_buffer + task_id * tile_buffer_offset, | ||||
| block_unit_buffer + task_id * block_unit_buffer_offset, start_index, real_cal_num, | block_unit_buffer + task_id * block_unit_buffer_offset, start_index, real_cal_num, | ||||
| out_w_block, conv_param); | out_w_block, conv_param); | ||||
| gemm_func(tmp_dst_buffer + task_id * tmp_dst_buffer_offset, tile_buffer + task_id * tile_buffer_offset, | |||||
| transed_weight, NULL, input_unit_square, ic4, oc4 * C4NUM, | |||||
| oc4 * C4NUM * input_unit_square * sizeof(float), 1, 1, 0, 0); | |||||
| float *src_ptr = tile_buffer + task_id * tile_buffer_offset; | |||||
| float *tmp_col_ptr = col_buffer + task_id * col_buffer_offset; | |||||
| float *dst_ptr = tmp_dst_buffer + task_id * tmp_dst_buffer_offset; | |||||
| for (int i = 0; i < input_unit_square; ++i) { | |||||
| RowMajor2Col12Major(src_ptr + i * C12NUM * ic4 * C4NUM, tmp_col_ptr, C12NUM, ic4 * C4NUM); | |||||
| MatMulOpt(tmp_col_ptr, transed_weight + i * ic4 * C4NUM * oc8 * C8NUM, dst_ptr + i * C8NUM, NULL, 0, | |||||
| ic4 * C4NUM, C12NUM, oc8 * C8NUM, input_unit_square, 2); | |||||
| } | |||||
| Conv3x3Fp32OutputTransform(tmp_dst_buffer + task_id * tmp_dst_buffer_offset, nc4hw4_out + nc4hw4_buffer_offset, | Conv3x3Fp32OutputTransform(tmp_dst_buffer + task_id * tmp_dst_buffer_offset, nc4hw4_out + nc4hw4_buffer_offset, | ||||
| bias_data, start_index, real_cal_num, out_w_block, conv_param); | bias_data, start_index, real_cal_num, out_w_block, conv_param); | ||||
| @@ -38,13 +38,15 @@ void ConvDw(float *output_data, const float *input_data, const float *weight_dat | |||||
| int h_step = UP_DIV(conv_param->output_h_, conv_param->thread_num_); | int h_step = UP_DIV(conv_param->output_h_, conv_param->thread_num_); | ||||
| int h_start = h_step * task_id; | int h_start = h_step * task_id; | ||||
| int h_end = MSMIN(h_start + h_step, conv_param->output_h_); | int h_end = MSMIN(h_start + h_step, conv_param->output_h_); | ||||
| bool relu = conv_param->act_type_ == ActType_Relu; | |||||
| bool relu6 = conv_param->act_type_ == ActType_Relu6; | |||||
| for (int b = 0; b < conv_param->output_batch_; b++) { | for (int b = 0; b < conv_param->output_batch_; b++) { | ||||
| const float *src = input_data + b * conv_param->input_h_ * conv_param->input_w_ * conv_param->input_channel_; | const float *src = input_data + b * conv_param->input_h_ * conv_param->input_w_ * conv_param->input_channel_; | ||||
| float *dst = output_data + b * conv_param->output_h_ * conv_param->output_w_ * conv_param->output_channel_; | float *dst = output_data + b * conv_param->output_h_ * conv_param->output_w_ * conv_param->output_channel_; | ||||
| for (int oh = h_start; oh < h_end; oh++) { | for (int oh = h_start; oh < h_end; oh++) { | ||||
| float *dst_data = dst + oh * conv_param->output_w_ * conv_param->output_channel_; | float *dst_data = dst + oh * conv_param->output_w_ * conv_param->output_channel_; | ||||
| int ih_origin = oh * conv_param->stride_h_ - conv_param->pad_h_; | |||||
| int ih_origin = oh * conv_param->stride_h_ - conv_param->pad_u_; | |||||
| int start_kh = MSMAX(0, UP_DIV(-ih_origin, conv_param->dilation_h_)); | int start_kh = MSMAX(0, UP_DIV(-ih_origin, conv_param->dilation_h_)); | ||||
| int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih_origin, conv_param->dilation_h_)); | int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih_origin, conv_param->dilation_h_)); | ||||
| @@ -60,13 +62,13 @@ void ConvDw(float *output_data, const float *input_data, const float *weight_dat | |||||
| int in_sw_step = conv_param->stride_w_ * conv_param->input_channel_; | int in_sw_step = conv_param->stride_w_ * conv_param->input_channel_; | ||||
| for (int kw = 0; kw < conv_param->kernel_w_; kw++) { | for (int kw = 0; kw < conv_param->kernel_w_; kw++) { | ||||
| int out_w_start = MSMAX( | int out_w_start = MSMAX( | ||||
| 0, (conv_param->pad_w_ - conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) / conv_param->stride_w_); | |||||
| int out_w_end = MSMIN(conv_param->output_w_, (conv_param->input_w_ + conv_param->pad_w_ - | |||||
| 0, (conv_param->pad_l_ - conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) / conv_param->stride_w_); | |||||
| int out_w_end = MSMIN(conv_param->output_w_, (conv_param->input_w_ + conv_param->pad_l_ - | |||||
| conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) / | conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) / | ||||
| conv_param->stride_w_); | conv_param->stride_w_); | ||||
| float *dst_w = dst_data + out_w_start * conv_param->output_channel_; | float *dst_w = dst_data + out_w_start * conv_param->output_channel_; | ||||
| int iw_origin = (out_w_start * conv_param->stride_w_) - conv_param->pad_w_ + conv_param->dilation_w_ * kw; | |||||
| int iw_origin = (out_w_start * conv_param->stride_w_) - conv_param->pad_l_ + conv_param->dilation_w_ * kw; | |||||
| const float *src_kw = src_kh + iw_origin * conv_param->input_channel_; | const float *src_kw = src_kh + iw_origin * conv_param->input_channel_; | ||||
| int num_pixels = out_w_end - out_w_start; | int num_pixels = out_w_end - out_w_start; | ||||
| @@ -75,10 +77,10 @@ void ConvDw(float *output_data, const float *input_data, const float *weight_dat | |||||
| weight_kh += conv_param->output_channel_; | weight_kh += conv_param->output_channel_; | ||||
| } | } | ||||
| } | } | ||||
| if (conv_param->is_relu_) { | |||||
| if (relu) { | |||||
| ReluFp32(dst_data, dst_data, conv_param->output_w_ * conv_param->output_channel_); | ReluFp32(dst_data, dst_data, conv_param->output_w_ * conv_param->output_channel_); | ||||
| } | } | ||||
| if (conv_param->is_relu6_) { | |||||
| if (relu6) { | |||||
| Relu6Fp32(dst_data, dst_data, conv_param->output_w_ * conv_param->output_channel_); | Relu6Fp32(dst_data, dst_data, conv_param->output_w_ * conv_param->output_channel_); | ||||
| } | } | ||||
| } | } | ||||
| @@ -91,16 +93,16 @@ void InitSlidingParam(SlidingWindowParam *sliding, const ConvParameter *conv_par | |||||
| int top = 0; | int top = 0; | ||||
| int bottom = conv_param->output_h_; | int bottom = conv_param->output_h_; | ||||
| for (; left * conv_param->stride_w_ < conv_param->pad_w_; left++) { | |||||
| for (; left * conv_param->stride_w_ < conv_param->pad_l_; left++) { | |||||
| } | } | ||||
| for (; (right - 1) * conv_param->stride_w_ - conv_param->pad_w_ + conv_param->kernel_w_ * conv_param->dilation_w_ > | |||||
| for (; (right - 1) * conv_param->stride_w_ - conv_param->pad_l_ + conv_param->kernel_w_ * conv_param->dilation_w_ > | |||||
| conv_param->input_w_ && | conv_param->input_w_ && | ||||
| right > left; | right > left; | ||||
| right--) { | right--) { | ||||
| } | } | ||||
| for (; top * conv_param->stride_h_ < conv_param->pad_h_; top++) { | |||||
| for (; top * conv_param->stride_h_ < conv_param->pad_u_; top++) { | |||||
| } | } | ||||
| for (; (bottom - 1) * conv_param->stride_h_ - conv_param->pad_h_ + conv_param->kernel_h_ * conv_param->dilation_h_ > | |||||
| for (; (bottom - 1) * conv_param->stride_h_ - conv_param->pad_u_ + conv_param->kernel_h_ * conv_param->dilation_h_ > | |||||
| conv_param->input_h_ && | conv_param->input_h_ && | ||||
| bottom > top; | bottom > top; | ||||
| bottom--) { | bottom--) { | ||||
| @@ -181,16 +183,18 @@ void DepthwiseBorderPixel(float *dst, const float *src, const float *weight, con | |||||
| void DepthwiseBorder(float *dst, const float *src, const float *weight, const float *bias, int top, int bottom, | void DepthwiseBorder(float *dst, const float *src, const float *weight, const float *bias, int top, int bottom, | ||||
| int left, int right, const ConvParameter *conv_param, const SlidingWindowParam *sliding) { | int left, int right, const ConvParameter *conv_param, const SlidingWindowParam *sliding) { | ||||
| bool relu = conv_param->act_type_ == ActType_Relu; | |||||
| bool relu6 = conv_param->act_type_ == ActType_Relu6; | |||||
| float *dst_h = dst + top * sliding->out_h_step_; | float *dst_h = dst + top * sliding->out_h_step_; | ||||
| for (int oh = top; oh < bottom; oh++) { | for (int oh = top; oh < bottom; oh++) { | ||||
| int ih = oh * conv_param->stride_h_ - conv_param->pad_h_; | |||||
| int ih = oh * conv_param->stride_h_ - conv_param->pad_u_; | |||||
| int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_)); | int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_)); | ||||
| int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_)); | int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_)); | ||||
| const float *src_h = src + ih * sliding->in_h_step_; | const float *src_h = src + ih * sliding->in_h_step_; | ||||
| float *dst_kernel = dst_h + left * sliding->block_channel_; | float *dst_kernel = dst_h + left * sliding->block_channel_; | ||||
| for (int ow = left; ow < right; ow++) { | for (int ow = left; ow < right; ow++) { | ||||
| int iw = ow * conv_param->stride_w_ - conv_param->pad_w_; | |||||
| int iw = ow * conv_param->stride_w_ - conv_param->pad_l_; | |||||
| int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_)); | int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_)); | ||||
| int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_)); | int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_)); | ||||
| const float *src_w = src_h + iw * sliding->block_channel_; | const float *src_w = src_h + iw * sliding->block_channel_; | ||||
| @@ -201,11 +205,10 @@ void DepthwiseBorder(float *dst, const float *src, const float *weight, const fl | |||||
| #ifdef ENABLE_ARM64 | #ifdef ENABLE_ARM64 | ||||
| ConvDwFp32Border(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw, | ConvDwFp32Border(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw, | ||||
| sliding->in_kh_step_ * sizeof(float), sliding->in_kw_step_ * sizeof(float), | sliding->in_kh_step_ * sizeof(float), sliding->in_kw_step_ * sizeof(float), | ||||
| conv_param->kernel_w_ * C4NUM * sizeof(float), conv_param->is_relu_, conv_param->is_relu6_); | |||||
| conv_param->kernel_w_ * C4NUM * sizeof(float), relu, relu6); | |||||
| #else | #else | ||||
| DepthwiseBorderPixel(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw, | DepthwiseBorderPixel(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw, | ||||
| sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C4NUM, | |||||
| conv_param->is_relu_, conv_param->is_relu6_); | |||||
| sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C4NUM, relu, relu6); | |||||
| #endif | #endif | ||||
| dst_kernel += sliding->block_channel_; | dst_kernel += sliding->block_channel_; | ||||
| } // width loop | } // width loop | ||||
| @@ -259,6 +262,8 @@ void DepthwiseCenter(float *dst, const float *src, const float *weight, const fl | |||||
| // conv depthwise fp32: sliding window | // conv depthwise fp32: sliding window | ||||
| void ConvDwC4Fp32(float *output_data, const float *input_data, const float *weight_data, const float *bias_data, | void ConvDwC4Fp32(float *output_data, const float *input_data, const float *weight_data, const float *bias_data, | ||||
| const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id) { | const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id) { | ||||
| bool relu = conv_param->act_type_ == ActType_Relu; | |||||
| bool relu6 = conv_param->act_type_ == ActType_Relu6; | |||||
| const float *src = input_data; | const float *src = input_data; | ||||
| float *dst = output_data; | float *dst = output_data; | ||||
| for (int b = 0; b < conv_param->output_batch_; b++) { | for (int b = 0; b < conv_param->output_batch_; b++) { | ||||
| @@ -277,8 +282,8 @@ void ConvDwC4Fp32(float *output_data, const float *input_data, const float *weig | |||||
| conv_param->output_w_, conv_param, sliding); | conv_param->output_w_, conv_param, sliding); | ||||
| if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) { | if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) { | ||||
| int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_; | |||||
| int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_; | |||||
| int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_u_; | |||||
| int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_; | |||||
| const float *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_; | const float *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_; | ||||
| float *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; | float *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; | ||||
| #ifdef ENABLE_ARM64 | #ifdef ENABLE_ARM64 | ||||
| @@ -286,12 +291,12 @@ void ConvDwC4Fp32(float *output_data, const float *input_data, const float *weig | |||||
| conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_ * sizeof(float), | conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_ * sizeof(float), | ||||
| sliding->block_channel_ * sizeof(float), sliding->in_sh_step_ * sizeof(float), | sliding->block_channel_ * sizeof(float), sliding->in_sh_step_ * sizeof(float), | ||||
| sliding->in_sw_step_ * sizeof(float), sliding->in_kh_step_ * sizeof(float), | sliding->in_sw_step_ * sizeof(float), sliding->in_kh_step_ * sizeof(float), | ||||
| sliding->in_kw_step_ * sizeof(float), conv_param->is_relu_, conv_param->is_relu6_); | |||||
| sliding->in_kw_step_ * sizeof(float), relu, relu6); | |||||
| #else | #else | ||||
| DepthwiseCenter(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, sliding->right_ - sliding->left_, | DepthwiseCenter(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, sliding->right_ - sliding->left_, | ||||
| conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_, sliding->block_channel_, | conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_, sliding->block_channel_, | ||||
| sliding->in_sh_step_, sliding->in_sw_step_, sliding->in_kh_step_, sliding->in_kw_step_, | |||||
| conv_param->is_relu_, conv_param->is_relu6_); | |||||
| sliding->in_sh_step_, sliding->in_sw_step_, sliding->in_kh_step_, sliding->in_kw_step_, relu, | |||||
| relu6); | |||||
| #endif | #endif | ||||
| } | } | ||||
| } // output C4 loop | } // output C4 loop | ||||
| @@ -454,11 +459,11 @@ void ConvDw3x3Fp32InputTrans(const float *input_data, float *trans_input, float | |||||
| memset(trans_input, 0, out_h_block * out_h_block * 16 * C4NUM * sizeof(float)); | memset(trans_input, 0, out_h_block * out_h_block * 16 * C4NUM * sizeof(float)); | ||||
| for (int oh = 0; oh < out_h_block; oh++) { | for (int oh = 0; oh < out_h_block; oh++) { | ||||
| int ih = oh * 2 - conv_param->pad_h_; | |||||
| int ih = oh * 2 - conv_param->pad_u_; | |||||
| int real_h_start = ih > 0 ? 0 : -ih; | int real_h_start = ih > 0 ? 0 : -ih; | ||||
| int real_h_end = (ih + input_unit) < conv_param->input_h_ ? input_unit : (conv_param->input_h_ - ih); | int real_h_end = (ih + input_unit) < conv_param->input_h_ ? input_unit : (conv_param->input_h_ - ih); | ||||
| for (int ow = 0; ow < out_w_block; ow++) { | for (int ow = 0; ow < out_w_block; ow++) { | ||||
| int iw = ow * 2 - conv_param->pad_w_; | |||||
| int iw = ow * 2 - conv_param->pad_l_; | |||||
| int real_w_start = iw > 0 ? 0 : -iw; | int real_w_start = iw > 0 ? 0 : -iw; | ||||
| int real_w_end = (iw + input_unit) < conv_param->input_w_ ? input_unit : (conv_param->input_w_ - iw); | int real_w_end = (iw + input_unit) < conv_param->input_w_ ? input_unit : (conv_param->input_w_ - iw); | ||||
| @@ -642,6 +647,8 @@ void ConvDw3x3Fp32OutputUnit(float *src_buf, float *dst_output, const float *bia | |||||
| void ConvDw3x3Fp32OutputTrans(float *trans_buffer, float *output_data, const float *bias, int out_h_block, | void ConvDw3x3Fp32OutputTrans(float *trans_buffer, float *output_data, const float *bias, int out_h_block, | ||||
| int out_w_block, const ConvParameter *conv_param) { | int out_w_block, const ConvParameter *conv_param) { | ||||
| bool relu = conv_param->act_type_ == ActType_Relu; | |||||
| bool relu6 = conv_param->act_type_ == ActType_Relu6; | |||||
| int oc4 = UP_DIV(conv_param->output_channel_, C4NUM); | int oc4 = UP_DIV(conv_param->output_channel_, C4NUM); | ||||
| bool h_in_range = true; | bool h_in_range = true; | ||||
| for (int oh = 0; oh < out_h_block; oh++) { | for (int oh = 0; oh < out_h_block; oh++) { | ||||
| @@ -661,8 +668,8 @@ void ConvDw3x3Fp32OutputTrans(float *trans_buffer, float *output_data, const flo | |||||
| float *buf_ow = buf_oh + ow * 16 * C4NUM; | float *buf_ow = buf_oh + ow * 16 * C4NUM; | ||||
| float *output_ow = output_oh + real_ow * oc4 * C4NUM; | float *output_ow = output_oh + real_ow * oc4 * C4NUM; | ||||
| ConvDw3x3Fp32OutputUnit(buf_ow, output_ow, bias, oc4 * C4NUM, conv_param->output_w_, h_in_range, w_in_range, | |||||
| conv_param->is_relu_, conv_param->is_relu6_); | |||||
| ConvDw3x3Fp32OutputUnit(buf_ow, output_ow, bias, oc4 * C4NUM, conv_param->output_w_, h_in_range, w_in_range, relu, | |||||
| relu6); | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| @@ -727,14 +734,14 @@ void DeconvDepthwiseBorder(float *dst, const float *src, const float *weight, in | |||||
| const ConvParameter *conv_param, const SlidingWindowParam *sliding) { | const ConvParameter *conv_param, const SlidingWindowParam *sliding) { | ||||
| const float *src_h = src + top * sliding->out_h_step_; | const float *src_h = src + top * sliding->out_h_step_; | ||||
| for (int ih = top; ih < bottom; ih++) { | for (int ih = top; ih < bottom; ih++) { | ||||
| int oh = ih * conv_param->stride_h_ - conv_param->pad_h_; | |||||
| int oh = ih * conv_param->stride_h_ - conv_param->pad_u_; | |||||
| int start_kh = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_)); | int start_kh = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_)); | ||||
| int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_)); | int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_)); | ||||
| float *dst_h = dst + oh * sliding->in_h_step_; | float *dst_h = dst + oh * sliding->in_h_step_; | ||||
| const float *src_kernel = src_h + left * sliding->block_channel_; | const float *src_kernel = src_h + left * sliding->block_channel_; | ||||
| for (int iw = left; iw < right; iw++) { | for (int iw = left; iw < right; iw++) { | ||||
| int ow = iw * conv_param->stride_w_ - conv_param->pad_w_; | |||||
| int ow = iw * conv_param->stride_w_ - conv_param->pad_l_; | |||||
| int start_kw = MSMAX(0, UP_DIV(-ow, conv_param->dilation_w_)); | int start_kw = MSMAX(0, UP_DIV(-ow, conv_param->dilation_w_)); | ||||
| int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->output_w_ - ow, conv_param->dilation_w_)); | int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->output_w_ - ow, conv_param->dilation_w_)); | ||||
| float *dst_w = dst_h + ow * sliding->block_channel_; | float *dst_w = dst_h + ow * sliding->block_channel_; | ||||
| @@ -790,12 +797,14 @@ void DeconvDepthwiseCenter(float *dst, const float *src, const float *weight, in | |||||
| #endif | #endif | ||||
| void DeconvDepthwisePostFunc(float *dst, const float *bias, int block_channel, const ConvParameter *conv_param) { | void DeconvDepthwisePostFunc(float *dst, const float *bias, int block_channel, const ConvParameter *conv_param) { | ||||
| bool relu = conv_param->act_type_ == ActType_Relu; | |||||
| bool relu6 = conv_param->act_type_ == ActType_Relu6; | |||||
| float *dst_k = dst; | float *dst_k = dst; | ||||
| for (int k = 0; k < conv_param->output_h_ * conv_param->output_w_; k++) { | for (int k = 0; k < conv_param->output_h_ * conv_param->output_w_; k++) { | ||||
| for (int c = 0; c < C4NUM; c++) { | for (int c = 0; c < C4NUM; c++) { | ||||
| dst_k[c] += bias[c]; | dst_k[c] += bias[c]; | ||||
| dst_k[c] = (conv_param->is_relu_) ? (MSMAX(0, dst_k[c])) : (dst_k[c]); | |||||
| dst_k[c] = (conv_param->is_relu6_) ? (MSMIN(6, MSMAX(0, dst_k[c]))) : (dst_k[c]); | |||||
| dst_k[c] = (relu) ? (MSMAX(0, dst_k[c])) : (dst_k[c]); | |||||
| dst_k[c] = (relu6) ? (MSMIN(6, MSMAX(0, dst_k[c]))) : (dst_k[c]); | |||||
| } | } | ||||
| dst_k += block_channel; | dst_k += block_channel; | ||||
| } | } | ||||
| @@ -821,8 +830,8 @@ void DeconvDwC4Fp32(float *output_data, const float *input_data, const float *we | |||||
| conv_param->input_w_, conv_param, sliding); | conv_param->input_w_, conv_param, sliding); | ||||
| if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) { | if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) { | ||||
| int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_; | |||||
| int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_; | |||||
| int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_u_; | |||||
| int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_; | |||||
| float *out_t = dst_data + oh_h_start * sliding->in_h_step_ + oh_w_start * sliding->block_channel_; | float *out_t = dst_data + oh_h_start * sliding->in_h_step_ + oh_w_start * sliding->block_channel_; | ||||
| const float *in_t = src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; | const float *in_t = src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; | ||||
| @@ -57,8 +57,8 @@ int DeConvPostFp32C12x8(const float *src, float *tmp, const float *bias, float * | |||||
| for (int ih = 0; ih < conv_param->input_h_; ih++) { | for (int ih = 0; ih < conv_param->input_h_; ih++) { | ||||
| for (int iw = 0; iw < conv_param->input_w_; iw++) { | for (int iw = 0; iw < conv_param->input_w_; iw++) { | ||||
| int oh = ih * conv_param->stride_h_ - conv_param->pad_h_; | |||||
| int ow = iw * conv_param->stride_w_ - conv_param->pad_w_; | |||||
| int oh = ih * conv_param->stride_h_ - conv_param->pad_u_; | |||||
| int ow = iw * conv_param->stride_w_ - conv_param->pad_l_; | |||||
| int kh_start = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_)); | int kh_start = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_)); | ||||
| int kh_end = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_)); | int kh_end = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_)); | ||||
| @@ -97,7 +97,7 @@ int DeConvPostFp32C12x8(const float *src, float *tmp, const float *bias, float * | |||||
| } /*ih*/ | } /*ih*/ | ||||
| } /*oc8*/ | } /*oc8*/ | ||||
| PostConvFuncFp32C8(tmp, dst, bias, output_channel, output_plane, conv_param->output_channel_, conv_param->is_relu_, | |||||
| conv_param->is_relu6_); | |||||
| PostConvFuncFp32C8(tmp, dst, bias, output_channel, output_plane, conv_param->output_channel_, | |||||
| conv_param->act_type_ == ActType_Relu, conv_param->act_type_ == ActType_Relu6); | |||||
| return NNACL_OK; | return NNACL_OK; | ||||
| } | } | ||||
| @@ -356,7 +356,7 @@ void MatMul12x8(const float *a, const float *b, float *dst, const float *bias, A | |||||
| dst[ci] = value; | dst[ci] = value; | ||||
| } | } | ||||
| } | } | ||||
| } else { | |||||
| } else if (out_type == OutType_C8) { | |||||
| /* col8-major * row8-major => col12x8-major */ | /* col8-major * row8-major => col12x8-major */ | ||||
| int col_8 = UP_ROUND(col, C8NUM); | int col_8 = UP_ROUND(col, C8NUM); | ||||
| int row_12 = UP_ROUND(row, C12NUM); | int row_12 = UP_ROUND(row, C12NUM); | ||||
| @@ -364,9 +364,7 @@ void MatMul12x8(const float *a, const float *b, float *dst, const float *bias, A | |||||
| for (int c = 0; c < col_8; c++) { | for (int c = 0; c < col_8; c++) { | ||||
| int r12div = r / C12NUM, r12mod = r % C12NUM; | int r12div = r / C12NUM, r12mod = r % C12NUM; | ||||
| int c8div = c / C8NUM, c8mod = c % C8NUM; | int c8div = c / C8NUM, c8mod = c % C8NUM; | ||||
| int c4div = c / C4NUM, c4mod = c % C4NUM; | |||||
| size_t ci = (out_type == OutType_C4) ? (c4div * C4NUM * row_12 + r * C4NUM + c4mod) | |||||
| : (c8div * C8NUM * row_12 + r * C8NUM + c8mod); | |||||
| size_t ci = (c8div * C8NUM * row_12 + r * C8NUM + c8mod); | |||||
| float value = 0; | float value = 0; | ||||
| for (int d = 0; d < deep; d++) { | for (int d = 0; d < deep; d++) { | ||||
| size_t ai = r12div * deep * C12NUM + d * C12NUM + r12mod; | size_t ai = r12div * deep * C12NUM + d * C12NUM + r12mod; | ||||
| @@ -379,6 +377,25 @@ void MatMul12x8(const float *a, const float *b, float *dst, const float *bias, A | |||||
| dst[ci] = value; | dst[ci] = value; | ||||
| } | } | ||||
| } | } | ||||
| } else { | |||||
| for (int i = 0; i < row; ++i) { | |||||
| int src_r_offset = i; | |||||
| int dst_r_offset = i * col * stride; | |||||
| for (int j = 0; j < col; ++j) { | |||||
| int c8div = j / 8, c8mod = j % 8; | |||||
| size_t ci = dst_r_offset + c8div * 8 * stride + c8mod; | |||||
| float value = 0; | |||||
| for (int d = 0; d < deep; ++d) { | |||||
| size_t ai = src_r_offset + d * row; | |||||
| size_t bi = c8div * deep * 8 + d * 8 + c8mod; | |||||
| value = value + a[ai] * b[bi]; | |||||
| } | |||||
| if (bias != NULL) value += bias[j]; | |||||
| if (act_type == ActType_Relu6) value = MSMIN(6.0f, value); | |||||
| if (act_type != ActType_No) value = MSMAX(0.0f, value); | |||||
| dst[ci] = value; | |||||
| } | |||||
| } | |||||
| } | } | ||||
| return; | return; | ||||
| } | } | ||||
| @@ -387,7 +404,7 @@ void MatMulOpt(const float *a, const float *b, float *c, const float *bias, ActT | |||||
| int col, size_t stride, int out_type) { | int col, size_t stride, int out_type) { | ||||
| #ifdef ENABLE_ARM64 | #ifdef ENABLE_ARM64 | ||||
| MatmulFloatNeon64Opt(a, b, c, bias, (int)act_type, deep, row, col, stride, (int)(out_type == OutType_Nhwc), | MatmulFloatNeon64Opt(a, b, c, bias, (int)act_type, deep, row, col, stride, (int)(out_type == OutType_Nhwc), | ||||
| (int)(out_type == OutType_C4)); | |||||
| (int)(out_type == OutType_TileC8)); | |||||
| #else | #else | ||||
| MatMul12x8(a, b, c, bias, act_type, deep, row, col, stride, out_type); | MatMul12x8(a, b, c, bias, act_type, deep, row, col, stride, out_type); | ||||
| #endif | #endif | ||||
| @@ -20,9 +20,9 @@ | |||||
| static int is_a_ge_zero_and_a_lt_b(int a, int b) { return (unsigned)(a) < (unsigned)(b); } | static int is_a_ge_zero_and_a_lt_b(int a, int b) { return (unsigned)(a) < (unsigned)(b); } | ||||
| void im2col_hwc(const float *in_data, float *data_col, ConvParameter *conv_param) { | void im2col_hwc(const float *in_data, float *data_col, ConvParameter *conv_param) { | ||||
| const int pad_left = /*conv_param->pad_l_*/ conv_param->pad_w_; | |||||
| const int pad_left = /*conv_param->pad_l_*/ conv_param->pad_l_; | |||||
| // const int pad_right = /*conv_param->pad_r_*/conv_param->pad_w_; | // const int pad_right = /*conv_param->pad_r_*/conv_param->pad_w_; | ||||
| const int pad_up = /*conv_param->pad_u_*/ conv_param->pad_h_; | |||||
| const int pad_up = /*conv_param->pad_u_*/ conv_param->pad_u_; | |||||
| // const int pad_down = /*conv_param->pad_d/*/conv_param->pad_h_; | // const int pad_down = /*conv_param->pad_d/*/conv_param->pad_h_; | ||||
| const int stride_h = conv_param->stride_h_; | const int stride_h = conv_param->stride_h_; | ||||
| @@ -72,9 +72,9 @@ void im2col_hwc(const float *in_data, float *data_col, ConvParameter *conv_param | |||||
| // output matrix is (kernel_h*kernel_w*channels)X(output_h*output_w) | // output matrix is (kernel_h*kernel_w*channels)X(output_h*output_w) | ||||
| void im2row_hwc(const float *in_data, float *data_row, ConvParameter *conv_param) { | void im2row_hwc(const float *in_data, float *data_row, ConvParameter *conv_param) { | ||||
| const int pad_left = /*conv_param->pad_l_*/ conv_param->pad_w_; | |||||
| const int pad_left = /*conv_param->pad_l_*/ conv_param->pad_l_; | |||||
| // const int pad_right = /*conv_param->pad_r_*/conv_param->pad_w_; | // const int pad_right = /*conv_param->pad_r_*/conv_param->pad_w_; | ||||
| const int pad_up = /*conv_param->pad_u_*/ conv_param->pad_h_; | |||||
| const int pad_up = /*conv_param->pad_u_*/ conv_param->pad_u_; | |||||
| // const int pad_down = /*conv_param->pad_d/*/conv_param->pad_h_; | // const int pad_down = /*conv_param->pad_d/*/conv_param->pad_h_; | ||||
| const int stride_h = conv_param->stride_h_; | const int stride_h = conv_param->stride_h_; | ||||
| @@ -68,14 +68,14 @@ void DepthwiseBorderInt8(int8_t *dst, const int16_t *src, const int16_t *weight, | |||||
| bool per_channel) { | bool per_channel) { | ||||
| int8_t *dst_h = dst + top * sliding->out_h_step_; | int8_t *dst_h = dst + top * sliding->out_h_step_; | ||||
| for (int oh = top; oh < bottom; oh++) { | for (int oh = top; oh < bottom; oh++) { | ||||
| int ih = oh * conv_param->stride_h_ - conv_param->pad_h_; | |||||
| int ih = oh * conv_param->stride_h_ - conv_param->pad_u_; | |||||
| int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_)); | int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_)); | ||||
| int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_)); | int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_)); | ||||
| const int16_t *src_h = src + ih * sliding->in_h_step_; | const int16_t *src_h = src + ih * sliding->in_h_step_; | ||||
| int8_t *dst_kernel = dst_h + left * sliding->block_channel_; | int8_t *dst_kernel = dst_h + left * sliding->block_channel_; | ||||
| for (int ow = left; ow < right; ow++) { | for (int ow = left; ow < right; ow++) { | ||||
| int iw = ow * conv_param->stride_w_ - conv_param->pad_w_; | |||||
| int iw = ow * conv_param->stride_w_ - conv_param->pad_l_; | |||||
| int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_)); | int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_)); | ||||
| int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_)); | int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_)); | ||||
| const int16_t *src_w = src_h + iw * sliding->block_channel_; | const int16_t *src_w = src_h + iw * sliding->block_channel_; | ||||
| @@ -186,8 +186,8 @@ void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *w | |||||
| per_channel); | per_channel); | ||||
| if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) { | if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) { | ||||
| int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_; | |||||
| int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_; | |||||
| int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_u_; | |||||
| int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_; | |||||
| const int16_t *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_; | const int16_t *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_; | ||||
| int8_t *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; | int8_t *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; | ||||
| #ifdef ENABLE_ARM64 | #ifdef ENABLE_ARM64 | ||||
| @@ -241,14 +241,14 @@ void DeconvDepthwiseBorderInt8(int32_t *dst, const int16_t *src, const int16_t * | |||||
| int right, const ConvParameter *conv_param, const SlidingWindowParam *sliding) { | int right, const ConvParameter *conv_param, const SlidingWindowParam *sliding) { | ||||
| const int16_t *src_h = src + top * sliding->out_h_step_; | const int16_t *src_h = src + top * sliding->out_h_step_; | ||||
| for (int ih = top; ih < bottom; ih++) { | for (int ih = top; ih < bottom; ih++) { | ||||
| int oh = ih * conv_param->stride_h_ - conv_param->pad_h_; | |||||
| int oh = ih * conv_param->stride_h_ - conv_param->pad_u_; | |||||
| int start_kh = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_)); | int start_kh = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_)); | ||||
| int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_)); | int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_)); | ||||
| int32_t *dst_h = dst + oh * sliding->in_h_step_; | int32_t *dst_h = dst + oh * sliding->in_h_step_; | ||||
| const int16_t *src_kernel = src_h + left * sliding->block_channel_; | const int16_t *src_kernel = src_h + left * sliding->block_channel_; | ||||
| for (int iw = left; iw < right; iw++) { | for (int iw = left; iw < right; iw++) { | ||||
| int ow = iw * conv_param->stride_w_ - conv_param->pad_w_; | |||||
| int ow = iw * conv_param->stride_w_ - conv_param->pad_l_; | |||||
| int start_kw = MSMAX(0, UP_DIV(-ow, conv_param->dilation_w_)); | int start_kw = MSMAX(0, UP_DIV(-ow, conv_param->dilation_w_)); | ||||
| int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->output_w_ - ow, conv_param->dilation_w_)); | int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->output_w_ - ow, conv_param->dilation_w_)); | ||||
| int32_t *dst_w = dst_h + ow * C4NUM; | int32_t *dst_w = dst_h + ow * C4NUM; | ||||
| @@ -341,8 +341,8 @@ void DeconvDwInt8(int8_t *output_data, int32_t *output_buffer, const int16_t *in | |||||
| conv_param->input_w_, conv_param, sliding); | conv_param->input_w_, conv_param, sliding); | ||||
| if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) { | if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) { | ||||
| int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_; | |||||
| int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_; | |||||
| int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_u_; | |||||
| int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_; | |||||
| int32_t *out_t = output_buffer + oh_h_start * sliding->in_h_step_ + oh_w_start * sliding->block_channel_; | int32_t *out_t = output_buffer + oh_h_start * sliding->in_h_step_ + oh_w_start * sliding->block_channel_; | ||||
| const int16_t *in_t = | const int16_t *in_t = | ||||
| src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; | src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; | ||||
| @@ -33,8 +33,8 @@ int DeConvPostInt8C8(const int32_t *src, const int32_t *bias, int32_t *tmp, int8 | |||||
| for (int ih = 0; ih < conv_param->input_h_; ih++) { | for (int ih = 0; ih < conv_param->input_h_; ih++) { | ||||
| for (int iw = 0; iw < conv_param->input_w_; iw++) { | for (int iw = 0; iw < conv_param->input_w_; iw++) { | ||||
| int oh = ih * conv_param->stride_h_ - conv_param->pad_h_; | |||||
| int ow = iw * conv_param->stride_w_ - conv_param->pad_w_; | |||||
| int oh = ih * conv_param->stride_h_ - conv_param->pad_u_; | |||||
| int ow = iw * conv_param->stride_w_ - conv_param->pad_l_; | |||||
| int kh_start = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_)); | int kh_start = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_)); | ||||
| int kh_end = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_)); | int kh_end = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_)); | ||||
| @@ -88,8 +88,8 @@ int DeConvPostInt8C4(const int32_t *src, const int32_t *bias, int32_t *tmp, int8 | |||||
| for (int ih = 0; ih < conv_param->input_h_; ih++) { | for (int ih = 0; ih < conv_param->input_h_; ih++) { | ||||
| for (int iw = 0; iw < conv_param->input_w_; iw++) { | for (int iw = 0; iw < conv_param->input_w_; iw++) { | ||||
| int oh = ih * conv_param->stride_h_ - conv_param->pad_h_; | |||||
| int ow = iw * conv_param->stride_w_ - conv_param->pad_w_; | |||||
| int oh = ih * conv_param->stride_h_ - conv_param->pad_u_; | |||||
| int ow = iw * conv_param->stride_w_ - conv_param->pad_l_; | |||||
| int kh_start = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_)); | int kh_start = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_)); | ||||
| int kh_end = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_)); | int kh_end = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_)); | ||||
| @@ -29,9 +29,7 @@ typedef void (*MATMUL_OPT_R_FUNC)(const int8_t *a, const int8_t *b, int8_t *dst, | |||||
| typedef void (*MAT_TRANS_FUNC)(void *dst, void *a, int row, int col); | typedef void (*MAT_TRANS_FUNC)(void *dst, void *a, int row, int col); | ||||
| typedef enum ActType { ActType_No, ActType_Relu, ActType_Relu6 } ActType; | |||||
| typedef enum OutType { OutType_C8 = 0, OutType_Nhwc = 1, OutType_C4 = 2 } OutType; | |||||
| typedef enum OutType { OutType_C8 = 0, OutType_Nhwc = 1, OutType_TileC8 = 2 } OutType; | |||||
| typedef struct MatMulParameter { | typedef struct MatMulParameter { | ||||
| OpParameter op_parameter_; | OpParameter op_parameter_; | ||||
| @@ -25,7 +25,6 @@ | |||||
| #define C8NUM 8 | #define C8NUM 8 | ||||
| #define C12NUM 12 | #define C12NUM 12 | ||||
| #define C16NUM 16 | #define C16NUM 16 | ||||
| #define BLOCK 4 | |||||
| #define TILE_NUM 8 | #define TILE_NUM 8 | ||||
| #define MSMIN(x, y) ((x) < (y) ? (x) : (y)) | #define MSMIN(x, y) ((x) < (y) ? (x) : (y)) | ||||
| @@ -62,4 +61,6 @@ typedef struct OpParameter { | |||||
| int thread_num_; | int thread_num_; | ||||
| } OpParameter; | } OpParameter; | ||||
| typedef enum ActType { ActType_No, ActType_Relu, ActType_Relu6 } ActType; | |||||
| #endif // MINDSPORE_LITE_NNACL_OP_BASE_H_ | #endif // MINDSPORE_LITE_NNACL_OP_BASE_H_ | ||||
| @@ -158,14 +158,14 @@ void Conv1x1InputPack(const void *src_ptr, void *dst_ptr, ConvParameter *conv_pa | |||||
| char *src = (char *)src_ptr; | char *src = (char *)src_ptr; | ||||
| char *dst = (char *)dst_ptr; | char *dst = (char *)dst_ptr; | ||||
| for (int dst_h = 0; dst_h < conv_param->output_h_; dst_h++) { | for (int dst_h = 0; dst_h < conv_param->output_h_; dst_h++) { | ||||
| int src_h = dst_h * conv_param->stride_h_ - conv_param->pad_h_; | |||||
| int src_h = dst_h * conv_param->stride_h_ - conv_param->pad_u_; | |||||
| if (src_h < 0 || src_h >= conv_param->input_h_) { | if (src_h < 0 || src_h >= conv_param->input_h_) { | ||||
| continue; | continue; | ||||
| } | } | ||||
| const char *src_h_ptr = src + src_h * conv_param->input_w_ * conv_param->input_channel_ * data_size; | const char *src_h_ptr = src + src_h * conv_param->input_w_ * conv_param->input_channel_ * data_size; | ||||
| char *dst_h_ptr = dst + dst_h * conv_param->output_w_ * conv_param->input_channel_ * data_size; | char *dst_h_ptr = dst + dst_h * conv_param->output_w_ * conv_param->input_channel_ * data_size; | ||||
| for (int dst_w = 0; dst_w < conv_param->output_w_; dst_w++) { | for (int dst_w = 0; dst_w < conv_param->output_w_; dst_w++) { | ||||
| int src_w = dst_w * conv_param->stride_w_ - conv_param->pad_w_; | |||||
| int src_w = dst_w * conv_param->stride_w_ - conv_param->pad_l_; | |||||
| if (src_w < 0 || src_w >= conv_param->input_w_) { | if (src_w < 0 || src_w >= conv_param->input_w_) { | ||||
| continue; | continue; | ||||
| } | } | ||||
| @@ -296,8 +296,8 @@ void Im2ColPackUnitFp32(const float *input_data, ConvParameter *conv_param, floa | |||||
| int kernel_w = conv_param->kernel_w_; | int kernel_w = conv_param->kernel_w_; | ||||
| int stride_h = conv_param->stride_h_; | int stride_h = conv_param->stride_h_; | ||||
| int stride_w = conv_param->stride_w_; | int stride_w = conv_param->stride_w_; | ||||
| int pad_h = conv_param->pad_h_; | |||||
| int pad_w = conv_param->pad_w_; | |||||
| int pad_h = conv_param->pad_u_; | |||||
| int pad_w = conv_param->pad_l_; | |||||
| int dilation_h = conv_param->dilation_h_; | int dilation_h = conv_param->dilation_h_; | ||||
| int dilation_w = conv_param->dilation_w_; | int dilation_w = conv_param->dilation_w_; | ||||
| int in_channel = conv_param->input_channel_; | int in_channel = conv_param->input_channel_; | ||||
| @@ -348,8 +348,8 @@ void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real | |||||
| int kernel_w = conv_param->kernel_w_; | int kernel_w = conv_param->kernel_w_; | ||||
| int stride_h = conv_param->stride_h_; | int stride_h = conv_param->stride_h_; | ||||
| int stride_w = conv_param->stride_w_; | int stride_w = conv_param->stride_w_; | ||||
| int pad_h = conv_param->pad_h_; | |||||
| int pad_w = conv_param->pad_w_; | |||||
| int pad_h = conv_param->pad_u_; | |||||
| int pad_w = conv_param->pad_l_; | |||||
| int dilation_h = conv_param->dilation_h_; | int dilation_h = conv_param->dilation_h_; | ||||
| int dilation_w = conv_param->dilation_w_; | int dilation_w = conv_param->dilation_w_; | ||||
| int in_channel = conv_param->input_channel_; | int in_channel = conv_param->input_channel_; | ||||
| @@ -419,8 +419,8 @@ void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int r | |||||
| int kernel_w = conv_param->kernel_w_; | int kernel_w = conv_param->kernel_w_; | ||||
| int stride_h = conv_param->stride_h_; | int stride_h = conv_param->stride_h_; | ||||
| int stride_w = conv_param->stride_w_; | int stride_w = conv_param->stride_w_; | ||||
| int pad_h = conv_param->pad_h_; | |||||
| int pad_w = conv_param->pad_w_; | |||||
| int pad_h = conv_param->pad_u_; | |||||
| int pad_w = conv_param->pad_l_; | |||||
| int dilation_h = conv_param->dilation_h_; | int dilation_h = conv_param->dilation_h_; | ||||
| int dilation_w = conv_param->dilation_w_; | int dilation_w = conv_param->dilation_w_; | ||||
| int in_channel = conv_param->input_channel_; | int in_channel = conv_param->input_channel_; | ||||
| @@ -24,8 +24,8 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float * | |||||
| int output_unit = conv_param->output_unit_; | int output_unit = conv_param->output_unit_; | ||||
| int in_channel = conv_param->input_channel_; | int in_channel = conv_param->input_channel_; | ||||
| int ic4 = UP_DIV(in_channel, C4NUM); | int ic4 = UP_DIV(in_channel, C4NUM); | ||||
| int pad_h = conv_param->pad_h_; | |||||
| int pad_w = conv_param->pad_w_; | |||||
| int pad_h = conv_param->pad_u_; | |||||
| int pad_w = conv_param->pad_l_; | |||||
| int input_h = conv_param->input_h_; | int input_h = conv_param->input_h_; | ||||
| int input_w = conv_param->input_w_; | int input_w = conv_param->input_w_; | ||||
| if (out_w_block_num == 0) { | if (out_w_block_num == 0) { | ||||
| @@ -42,7 +42,7 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float * | |||||
| int interval_y_e = src_y_e < input_h ? input_unit : (input_h - src_y_s); | int interval_y_e = src_y_e < input_h ? input_unit : (input_h - src_y_s); | ||||
| int src_plane_offset = ic4 * C4NUM * (src_y_s * input_w + src_x_s); | int src_plane_offset = ic4 * C4NUM * (src_y_s * input_w + src_x_s); | ||||
| int dst_plane_offset = c * C4NUM; | |||||
| int dst_plane_offset = c * C4NUM * ic4; | |||||
| for (int ic = 0; ic < ic4; ic++) { | for (int ic = 0; ic < ic4; ic++) { | ||||
| // clear tmp buffer | // clear tmp buffer | ||||
| memset(tmp_data, 0, input_unit * input_unit * C4NUM * sizeof(float)); | memset(tmp_data, 0, input_unit * input_unit * C4NUM * sizeof(float)); | ||||
| @@ -67,8 +67,8 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float * | |||||
| } | } | ||||
| } | } | ||||
| // input transform | // input transform | ||||
| int dst_ic4_offset = dst_plane_offset + ic * TILE_NUM * C4NUM; | |||||
| size_t dst_step = ic4 * C4NUM * TILE_NUM; | |||||
| int dst_ic4_offset = dst_plane_offset + ic * C4NUM; | |||||
| size_t dst_step = C12NUM * ic4 * C4NUM; | |||||
| float *trans_input_ptr = trans_input + dst_ic4_offset; | float *trans_input_ptr = trans_input + dst_ic4_offset; | ||||
| input_trans_func(tmp_data, trans_input_ptr, C4NUM, dst_step); | input_trans_func(tmp_data, trans_input_ptr, C4NUM, dst_step); | ||||
| } | } | ||||
| @@ -86,6 +86,7 @@ void WinogradOutputTransform(const float *gemm_out, float *tmp_out_data, const f | |||||
| int output_h_unit_block = UP_DIV(output_h, output_unit); | int output_h_unit_block = UP_DIV(output_h, output_unit); | ||||
| int output_channel = conv_param->output_channel_; | int output_channel = conv_param->output_channel_; | ||||
| int oc4 = UP_DIV(output_channel, C4NUM); | int oc4 = UP_DIV(output_channel, C4NUM); | ||||
| int oc8 = UP_DIV(output_channel, C8NUM); | |||||
| int input_unit = conv_param->input_unit_; | int input_unit = conv_param->input_unit_; | ||||
| if (output_unit_num == 0) { | if (output_unit_num == 0) { | ||||
| return; | return; | ||||
| @@ -93,17 +94,19 @@ void WinogradOutputTransform(const float *gemm_out, float *tmp_out_data, const f | |||||
| for (int i = 0; i < cal_num; i++) { | for (int i = 0; i < cal_num; i++) { | ||||
| int dst_x_s = out_tile_index % output_unit_num; | int dst_x_s = out_tile_index % output_unit_num; | ||||
| int dst_y_s = out_tile_index / output_unit_num; | int dst_y_s = out_tile_index / output_unit_num; | ||||
| int src_tile_offset = i * oc4 * C4NUM * input_unit * input_unit; | |||||
| int src_tile_offset = i * oc8 * C8NUM * input_unit * input_unit; | |||||
| int dst_tile_offset = C4NUM * output_unit * (dst_x_s + dst_y_s * output_w_unit_block * output_unit); | int dst_tile_offset = C4NUM * output_unit * (dst_x_s + dst_y_s * output_w_unit_block * output_unit); | ||||
| for (int j = 0; j < oc4; j++) { | for (int j = 0; j < oc4; j++) { | ||||
| int src_oc4_offset = src_tile_offset + j * input_unit * input_unit * C4NUM; | |||||
| int c8_block = j / 2; | |||||
| int c8_res = j % 2; | |||||
| int src_oc4_offset = src_tile_offset + c8_block * input_unit * input_unit * C8NUM + c8_res * C4NUM; | |||||
| int dst_oc4_offset = | int dst_oc4_offset = | ||||
| dst_tile_offset + j * C4NUM * output_h_unit_block * output_w_unit_block * output_unit * output_unit; | dst_tile_offset + j * C4NUM * output_h_unit_block * output_w_unit_block * output_unit * output_unit; | ||||
| const float *src_ptr = gemm_out + src_oc4_offset; | const float *src_ptr = gemm_out + src_oc4_offset; | ||||
| const float *bias_ptr = bias_data + j * C4NUM; | const float *bias_ptr = bias_data + j * C4NUM; | ||||
| float *dst_ptr = tmp_out_data + dst_oc4_offset; | float *dst_ptr = tmp_out_data + dst_oc4_offset; | ||||
| output_trans_func(src_ptr, dst_ptr, bias_ptr, C4NUM, output_w_unit_block * output_unit); | |||||
| output_trans_func(src_ptr, dst_ptr, bias_ptr, C8NUM, output_w_unit_block * output_unit); | |||||
| } | } | ||||
| out_tile_index++; | out_tile_index++; | ||||
| } | } | ||||
| @@ -283,8 +286,8 @@ void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, floa | |||||
| int input_channel = conv_param->input_channel_; | int input_channel = conv_param->input_channel_; | ||||
| int input_width = conv_param->input_w_; | int input_width = conv_param->input_w_; | ||||
| int input_height = conv_param->input_h_; | int input_height = conv_param->input_h_; | ||||
| int pad_w = conv_param->pad_w_; | |||||
| int pad_h = conv_param->pad_h_; | |||||
| int pad_w = conv_param->pad_l_; | |||||
| int pad_h = conv_param->pad_u_; | |||||
| int ic4 = UP_DIV(input_channel, C4NUM); | int ic4 = UP_DIV(input_channel, C4NUM); | ||||
| const int input_unit = 4; | const int input_unit = 4; | ||||
| if (out_w_block == 0) { | if (out_w_block == 0) { | ||||
| @@ -300,7 +303,7 @@ void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, floa | |||||
| int real_y_end = (origin_y + input_unit) < input_height ? input_unit : (input_height - origin_y); | int real_y_end = (origin_y + input_unit) < input_height ? input_unit : (input_height - origin_y); | ||||
| int src_plane_offset = ic4 * C4NUM * (origin_y * input_width + origin_x); | int src_plane_offset = ic4 * C4NUM * (origin_y * input_width + origin_x); | ||||
| int dst_plane_offset = cal_id * C4NUM; | |||||
| int dst_plane_offset = cal_id * C4NUM * ic4; | |||||
| for (int ic = 0; ic < ic4; ic++) { | for (int ic = 0; ic < ic4; ic++) { | ||||
| // clear tmp buffer | // clear tmp buffer | ||||
| memset(tmp_data, 0, input_unit * input_unit * C4NUM * sizeof(float)); | memset(tmp_data, 0, input_unit * input_unit * C4NUM * sizeof(float)); | ||||
| @@ -326,8 +329,8 @@ void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, floa | |||||
| } | } | ||||
| // input transform | // input transform | ||||
| int dst_ic4_offset = dst_plane_offset + ic * TILE_NUM * C4NUM; | |||||
| size_t dst_step = ic4 * C4NUM * TILE_NUM; | |||||
| int dst_ic4_offset = dst_plane_offset + ic * C4NUM; | |||||
| size_t dst_step = C12NUM * ic4 * C4NUM; | |||||
| float *trans_input_ptr = trans_input + dst_ic4_offset; | float *trans_input_ptr = trans_input + dst_ic4_offset; | ||||
| Conv3x3Fp32InputUnit(tmp_data, trans_input_ptr, dst_step); | Conv3x3Fp32InputUnit(tmp_data, trans_input_ptr, dst_step); | ||||
| } | } | ||||
| @@ -336,8 +339,8 @@ void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, floa | |||||
| void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4, int output_channel, int kernel_plane, | void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4, int output_channel, int kernel_plane, | ||||
| int oc_block) { | int oc_block) { | ||||
| const int input_unit = 4; | |||||
| int dst_step = iC4 * C4NUM * oc_block; | |||||
| int oc_plane_block = UP_DIV(output_channel, oc_block); | |||||
| int dst_step = iC4 * C4NUM * oc_block * oc_plane_block; | |||||
| if (oc_block == 0) { | if (oc_block == 0) { | ||||
| return; | return; | ||||
| } | } | ||||
| @@ -345,7 +348,7 @@ void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4 | |||||
| int oc_block_num = o / oc_block; | int oc_block_num = o / oc_block; | ||||
| int oc_block_rem = o % oc_block; | int oc_block_rem = o % oc_block; | ||||
| int src_oc_offset = o * iC4 * C4NUM * kernel_plane; | int src_oc_offset = o * iC4 * C4NUM * kernel_plane; | ||||
| int dst_oc_offset = oc_block_num * oc_block * iC4 * C4NUM * input_unit * input_unit + oc_block_rem; | |||||
| int dst_oc_offset = oc_block_num * oc_block * iC4 * C4NUM + oc_block_rem; | |||||
| for (int i = 0; i < iC4; i++) { | for (int i = 0; i < iC4; i++) { | ||||
| float *src_ic4_ptr = weight_data + src_oc_offset + i * kernel_plane * C4NUM; | float *src_ic4_ptr = weight_data + src_oc_offset + i * kernel_plane * C4NUM; | ||||
| float *dst_ic4_ptr = trans_weight + dst_oc_offset + i * oc_block * C4NUM; | float *dst_ic4_ptr = trans_weight + dst_oc_offset + i * oc_block * C4NUM; | ||||
| @@ -559,24 +562,24 @@ void Conv3x3Fp32OutputUnit(const float *gemm_out, const float *bias_data, float | |||||
| float32x4_t bias_ptr = vld1q_f32(bias_data); | float32x4_t bias_ptr = vld1q_f32(bias_data); | ||||
| float32x4_t s00 = vld1q_f32(gemm_out); | float32x4_t s00 = vld1q_f32(gemm_out); | ||||
| float32x4_t s01 = vld1q_f32(gemm_out + 4); | |||||
| float32x4_t s02 = vld1q_f32(gemm_out + 8); | |||||
| float32x4_t s03 = vld1q_f32(gemm_out + 12); | |||||
| float32x4_t s01 = vld1q_f32(gemm_out + 8); | |||||
| float32x4_t s02 = vld1q_f32(gemm_out + 16); | |||||
| float32x4_t s03 = vld1q_f32(gemm_out + 24); | |||||
| float32x4_t s10 = vld1q_f32(gemm_out + 16); | |||||
| float32x4_t s11 = vld1q_f32(gemm_out + 20); | |||||
| float32x4_t s12 = vld1q_f32(gemm_out + 24); | |||||
| float32x4_t s13 = vld1q_f32(gemm_out + 28); | |||||
| float32x4_t s10 = vld1q_f32(gemm_out + 32); | |||||
| float32x4_t s11 = vld1q_f32(gemm_out + 40); | |||||
| float32x4_t s12 = vld1q_f32(gemm_out + 48); | |||||
| float32x4_t s13 = vld1q_f32(gemm_out + 56); | |||||
| float32x4_t s20 = vld1q_f32(gemm_out + 32); | |||||
| float32x4_t s21 = vld1q_f32(gemm_out + 36); | |||||
| float32x4_t s22 = vld1q_f32(gemm_out + 40); | |||||
| float32x4_t s23 = vld1q_f32(gemm_out + 44); | |||||
| float32x4_t s20 = vld1q_f32(gemm_out + 64); | |||||
| float32x4_t s21 = vld1q_f32(gemm_out + 72); | |||||
| float32x4_t s22 = vld1q_f32(gemm_out + 80); | |||||
| float32x4_t s23 = vld1q_f32(gemm_out + 88); | |||||
| float32x4_t s30 = vld1q_f32(gemm_out + 48); | |||||
| float32x4_t s31 = vld1q_f32(gemm_out + 52); | |||||
| float32x4_t s32 = vld1q_f32(gemm_out + 56); | |||||
| float32x4_t s33 = vld1q_f32(gemm_out + 60); | |||||
| float32x4_t s30 = vld1q_f32(gemm_out + 96); | |||||
| float32x4_t s31 = vld1q_f32(gemm_out + 104); | |||||
| float32x4_t s32 = vld1q_f32(gemm_out + 112); | |||||
| float32x4_t s33 = vld1q_f32(gemm_out + 120); | |||||
| float32x4_t t00 = vaddq_f32(vaddq_f32(s00, s10), s20); | float32x4_t t00 = vaddq_f32(vaddq_f32(s00, s10), s20); | ||||
| float32x4_t t01 = vaddq_f32(vaddq_f32(s01, s11), s21); | float32x4_t t01 = vaddq_f32(vaddq_f32(s01, s11), s21); | ||||
| @@ -609,24 +612,24 @@ void Conv3x3Fp32OutputUnit(const float *gemm_out, const float *bias_data, float | |||||
| const float *bias_ptr = bias_data + i; | const float *bias_ptr = bias_data + i; | ||||
| float s00 = local_ptr[0]; | float s00 = local_ptr[0]; | ||||
| float s01 = (local_ptr + 4)[0]; | |||||
| float s02 = (local_ptr + 8)[0]; | |||||
| float s03 = (local_ptr + 12)[0]; | |||||
| float s01 = (local_ptr + 8)[0]; | |||||
| float s02 = (local_ptr + 16)[0]; | |||||
| float s03 = (local_ptr + 24)[0]; | |||||
| float s10 = (local_ptr + 16)[0]; | |||||
| float s11 = (local_ptr + 20)[0]; | |||||
| float s12 = (local_ptr + 24)[0]; | |||||
| float s13 = (local_ptr + 28)[0]; | |||||
| float s10 = (local_ptr + 32)[0]; | |||||
| float s11 = (local_ptr + 40)[0]; | |||||
| float s12 = (local_ptr + 48)[0]; | |||||
| float s13 = (local_ptr + 56)[0]; | |||||
| float s20 = (local_ptr + 32)[0]; | |||||
| float s21 = (local_ptr + 36)[0]; | |||||
| float s22 = (local_ptr + 40)[0]; | |||||
| float s23 = (local_ptr + 44)[0]; | |||||
| float s20 = (local_ptr + 64)[0]; | |||||
| float s21 = (local_ptr + 72)[0]; | |||||
| float s22 = (local_ptr + 80)[0]; | |||||
| float s23 = (local_ptr + 88)[0]; | |||||
| float s30 = (local_ptr + 48)[0]; | |||||
| float s31 = (local_ptr + 52)[0]; | |||||
| float s32 = (local_ptr + 56)[0]; | |||||
| float s33 = (local_ptr + 60)[0]; | |||||
| float s30 = (local_ptr + 96)[0]; | |||||
| float s31 = (local_ptr + 104)[0]; | |||||
| float s32 = (local_ptr + 112)[0]; | |||||
| float s33 = (local_ptr + 120)[0]; | |||||
| float t00 = s00 + s10 + s20; | float t00 = s00 + s10 + s20; | ||||
| float t01 = s01 + s11 + s21; | float t01 = s01 + s11 + s21; | ||||
| @@ -663,6 +666,7 @@ void Conv3x3Fp32OutputTransform(const float *gemm_out, float *out_data, const fl | |||||
| int output_w = conv_param->output_w_; | int output_w = conv_param->output_w_; | ||||
| int output_h = conv_param->output_h_; | int output_h = conv_param->output_h_; | ||||
| int oc4 = UP_DIV(output_channel, C4NUM); | int oc4 = UP_DIV(output_channel, C4NUM); | ||||
| int oc8 = UP_DIV(output_channel, C8NUM); | |||||
| const int input_unit = 4; | const int input_unit = 4; | ||||
| if (out_w_block == 0) { | if (out_w_block == 0) { | ||||
| return; | return; | ||||
| @@ -670,11 +674,13 @@ void Conv3x3Fp32OutputTransform(const float *gemm_out, float *out_data, const fl | |||||
| for (int i = 0; i < real_cal_num; i++) { | for (int i = 0; i < real_cal_num; i++) { | ||||
| int out_w_index = (start_index + i) % out_w_block; | int out_w_index = (start_index + i) % out_w_block; | ||||
| int out_h_index = (start_index + i) / out_w_block; | int out_h_index = (start_index + i) / out_w_block; | ||||
| int src_tile_offset = i * oc4 * C4NUM * input_unit * input_unit; | |||||
| int src_tile_offset = i * oc8 * C8NUM * input_unit * input_unit; | |||||
| int dst_tile_offset = C4NUM * (out_w_index * OUPUT_UNIT + out_h_index * OUPUT_UNIT * output_w); | int dst_tile_offset = C4NUM * (out_w_index * OUPUT_UNIT + out_h_index * OUPUT_UNIT * output_w); | ||||
| for (int j = 0; j < oc4; j++) { | for (int j = 0; j < oc4; j++) { | ||||
| int src_oc4_offset = src_tile_offset + j * input_unit * input_unit * C4NUM; | |||||
| int c8_block = j / 2; | |||||
| int c8_res = j % 2; | |||||
| int src_oc4_offset = src_tile_offset + c8_block * input_unit * input_unit * C8NUM + c8_res * C4NUM; | |||||
| int dst_oc4_offset = dst_tile_offset + j * C4NUM * output_h * output_w; | int dst_oc4_offset = dst_tile_offset + j * C4NUM * output_h * output_w; | ||||
| const float *src_ptr = gemm_out + src_oc4_offset; | const float *src_ptr = gemm_out + src_oc4_offset; | ||||
| const float *bias_ptr = bias_data + j * C4NUM; | const float *bias_ptr = bias_data + j * C4NUM; | ||||
| @@ -864,8 +870,8 @@ void Conv3x3Uint8InputTransform(const int16_t *input_data, int16_t *trans_input, | |||||
| int input_channel = conv_param->input_channel_; | int input_channel = conv_param->input_channel_; | ||||
| int input_width = conv_param->input_w_; | int input_width = conv_param->input_w_; | ||||
| int input_height = conv_param->input_h_; | int input_height = conv_param->input_h_; | ||||
| int pad_w = conv_param->pad_w_; | |||||
| int pad_h = conv_param->pad_h_; | |||||
| int pad_w = conv_param->pad_l_; | |||||
| int pad_h = conv_param->pad_u_; | |||||
| ConvQuantArg quant_arg = conv_param->conv_quant_arg_; | ConvQuantArg quant_arg = conv_param->conv_quant_arg_; | ||||
| int input_zp = quant_arg.input_quant_args_[0].zp_; | int input_zp = quant_arg.input_quant_args_[0].zp_; | ||||
| const int ic8 = UP_DIV(input_channel, C8NUM); | const int ic8 = UP_DIV(input_channel, C8NUM); | ||||
| @@ -1221,9 +1227,9 @@ void Conv3x3Uint8OutputUnit(const int32_t *gemm_out, const int32_t *bias_data, i | |||||
| int32x4_t ls; | int32x4_t ls; | ||||
| int32x4_t rs; | int32x4_t rs; | ||||
| if ((conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) { | if ((conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) { | ||||
| out_multiplier = vld1q_s32(quant_multiplier); | |||||
| ls = vld1q_s32(left_shift); | |||||
| rs = vld1q_s32(right_shift); | |||||
| out_multiplier = vld1q_s32(quant_multiplier + oc_start); | |||||
| ls = vld1q_s32(left_shift + oc_start); | |||||
| rs = vld1q_s32(right_shift + oc_start); | |||||
| } else { | } else { | ||||
| out_multiplier = vdupq_n_s32(quant_multiplier[0]); | out_multiplier = vdupq_n_s32(quant_multiplier[0]); | ||||
| ls = vdupq_n_s32(left_shift[0]); | ls = vdupq_n_s32(left_shift[0]); | ||||
| @@ -4649,43 +4649,41 @@ void OutputTransform8x7Unit(const float *src_data, float *dst_data, const float | |||||
| // Utilize cost model to compute performance gain. | // Utilize cost model to compute performance gain. | ||||
| // If the gain is greater than got from Im2col, winograd algorithm will be chosen. | // If the gain is greater than got from Im2col, winograd algorithm will be chosen. | ||||
| int SelectOutputUnit(ConvParameter *conv_param) { | int SelectOutputUnit(ConvParameter *conv_param) { | ||||
| int input_batch = conv_param->input_batch_; | |||||
| int kernel_h = conv_param->kernel_h_; | int kernel_h = conv_param->kernel_h_; | ||||
| int kernel_w = conv_param->kernel_w_; | int kernel_w = conv_param->kernel_w_; | ||||
| int in_channel = conv_param->input_channel_; | |||||
| int out_h = conv_param->output_h_; | |||||
| int in_c = conv_param->input_channel_; | |||||
| int out_w = conv_param->output_w_; | int out_w = conv_param->output_w_; | ||||
| int out_channel = conv_param->output_channel_; | |||||
| int out_plane = out_h * out_w; | |||||
| int max_unit = sqrt((float)(out_plane)); | |||||
| max_unit = max_unit > MIN_UNIT ? max_unit : MIN_UNIT; | |||||
| max_unit = max_unit < MAX_UNIT ? max_unit : MAX_UNIT; | |||||
| int output_unit = 1; | |||||
| float ratio = 0.0f; | |||||
| // cost of conventional convolution multiplications | |||||
| float ori_cost = out_plane * out_channel * in_channel * kernel_h * kernel_w; | |||||
| for (int u = MIN_UNIT; u < max_unit; u++) { | |||||
| int input_unit = u + kernel_h - 1; | |||||
| if (input_unit != 4 && input_unit != 8) { | |||||
| int out_h = conv_param->output_h_; | |||||
| int out_c = conv_param->output_channel_; | |||||
| int unit2 = UP_DIV(out_w * out_h, C12NUM * conv_param->op_parameter_.thread_num_); | |||||
| int max_out_unit = (int)(sqrtf((float)unit2)); | |||||
| max_out_unit = max_out_unit < MAX_UNIT ? MAX_UNIT : max_out_unit; | |||||
| max_out_unit = max_out_unit > MIN_UNIT ? max_out_unit : MIN_UNIT; | |||||
| int unit = 0; | |||||
| float max_rate = 0.0f; | |||||
| float common_cost = (float)out_h * out_w * in_c * out_c * kernel_h * kernel_w; | |||||
| for (int i = MIN_UNIT; i <= max_out_unit; ++i) { | |||||
| int input_unit = i + kernel_w - 1; | |||||
| OutputTransformUnitFunc output_trans_func = GetOutputTransFunc(input_unit, i); | |||||
| if (output_trans_func == NULL) { | |||||
| continue; | continue; | ||||
| } | } | ||||
| // don't count filter transform cost, because it can be processed once offline. | |||||
| const float input_trans_unit_cost = 2 * input_unit * input_unit * input_unit * in_channel; | |||||
| float gemm_unit_cost = input_unit * input_unit * in_channel * out_channel; | |||||
| float output_trans_unit_cost = input_unit * u * (u + input_unit) * out_channel; | |||||
| // equation (23) in papar | |||||
| float winograd_cost = (input_trans_unit_cost + gemm_unit_cost + output_trans_unit_cost) * | |||||
| (UP_DIV(out_w, u) * (UP_DIV(out_h, u))) * input_batch; | |||||
| float reduce_rate = ori_cost / winograd_cost; | |||||
| if (reduce_rate > ratio && reduce_rate > 1) { | |||||
| ratio = reduce_rate; | |||||
| output_unit = u; | |||||
| float penalty = ((float)input_unit * input_unit) / ((float)kernel_h * kernel_w) * 0.12f; | |||||
| float wino_cost = ((2 + out_c) * (float)input_unit * input_unit * in_c + ((float)input_unit + i) * i * out_c) * | |||||
| UP_DIV(out_w, i) * UP_DIV(out_h, i); | |||||
| float reduce_rate = common_cost / wino_cost - penalty; | |||||
| if (reduce_rate > max_rate) { | |||||
| max_rate = reduce_rate; | |||||
| unit = i; | |||||
| } | } | ||||
| } | } | ||||
| if (max_rate < 1.0f) { | |||||
| return 1; | |||||
| } | |||||
| // If output_unit is 1, then it is conventional convolution | // If output_unit is 1, then it is conventional convolution | ||||
| return output_unit; | |||||
| return unit; | |||||
| } | } | ||||
| InputTransformUnitFunc GetInputTransFunc(int input_unit) { | InputTransformUnitFunc GetInputTransFunc(int input_unit) { | ||||
| @@ -4719,17 +4717,6 @@ void CheckIfUseWinograd(bool *use_winograd, int *output_unit, ConvParameter *con | |||||
| *output_unit = SelectOutputUnit(conv_param); | *output_unit = SelectOutputUnit(conv_param); | ||||
| if (*output_unit > 1) { | if (*output_unit > 1) { | ||||
| *use_winograd = true; | *use_winograd = true; | ||||
| int input_unit = conv_param->kernel_h_ + *output_unit - 1; | |||||
| input_trans_func = GetInputTransFunc(input_unit); | |||||
| if (input_trans_func == NULL) { | |||||
| *use_winograd = false; | |||||
| } | |||||
| output_trans_func = GetOutputTransFunc(input_unit, *output_unit); | |||||
| if (output_trans_func == NULL) { | |||||
| *use_winograd = false; | |||||
| } | |||||
| } else { | |||||
| *use_winograd = false; | |||||
| } | } | ||||
| } else { | } else { | ||||
| *use_winograd = false; | *use_winograd = false; | ||||
| @@ -376,10 +376,18 @@ void Conv2D::ConvInferShape(int input_h, int input_w, int *output_h, int *output | |||||
| *output_h = std::ceil(static_cast<float>(input_h) / static_cast<float>(stride_h)); | *output_h = std::ceil(static_cast<float>(input_h) / static_cast<float>(stride_h)); | ||||
| auto pad_h_all = ((*output_h - 1) * stride_h + (kernel_h - 1) * dilate_h + 1 - input_h); | auto pad_h_all = ((*output_h - 1) * stride_h + (kernel_h - 1) * dilate_h + 1 - input_h); | ||||
| auto pad_w_all = ((*output_w - 1) * stride_w + (kernel_w - 1) * dilate_w + 1 - input_w); | auto pad_w_all = ((*output_w - 1) * stride_w + (kernel_w - 1) * dilate_w + 1 - input_w); | ||||
| pad_u_ = pad_h_all / 2; | |||||
| pad_d_ = pad_h_all - pad_u_; | |||||
| pad_l_ = pad_w_all / 2; | |||||
| pad_r_ = pad_w_all - pad_l_; | |||||
| if (pad_h_all < 0) { | |||||
| pad_u_ = pad_d_ = 0; | |||||
| } else { | |||||
| pad_u_ = pad_h_all / 2; | |||||
| pad_d_ = pad_h_all - pad_u_; | |||||
| } | |||||
| if (pad_w_all < 0) { | |||||
| pad_l_ = pad_r_ = 0; | |||||
| } else { | |||||
| pad_l_ = pad_w_all / 2; | |||||
| pad_r_ = pad_w_all - pad_l_; | |||||
| } | |||||
| } else { | } else { | ||||
| *output_w = std::ceil((static_cast<float>(input_w) + pad_l_ + pad_r_ - | *output_w = std::ceil((static_cast<float>(input_w) + pad_l_ + pad_r_ - | ||||
| (static_cast<float>(kernel_w) - 1) * static_cast<float>(dilate_w)) / | (static_cast<float>(kernel_w) - 1) * static_cast<float>(dilate_w)) / | ||||
| @@ -126,14 +126,12 @@ int DeConv2D::InferShape(std::vector<lite::tensor::Tensor *> inputs_, std::vecto | |||||
| output->set_shape(out_shape); | output->set_shape(out_shape); | ||||
| if (pad_mode == schema::PadMode_SAME) { | if (pad_mode == schema::PadMode_SAME) { | ||||
| pad_h_ = ((input_h - 1) * stride_h + (kernel_h - 1) * dilate_h + 1 - output_h) / 2; | |||||
| pad_w_ = ((input_w - 1) * stride_w + (kernel_w - 1) * dilate_w + 1 - output_w) / 2; | |||||
| pad_u_ = ((input_h - 1) * stride_h + (kernel_h - 1) * dilate_h + 1 - output_h) / 2; | |||||
| pad_l_ = ((input_w - 1) * stride_w + (kernel_w - 1) * dilate_w + 1 - output_w) / 2; | |||||
| } else if (pad_mode == schema::PadMode_VALID) { | } else if (pad_mode == schema::PadMode_VALID) { | ||||
| pad_h_ = 0; | |||||
| pad_w_ = 0; | |||||
| pad_u_ = 0; | |||||
| pad_l_ = 0; | |||||
| } else if (pad_mode == schema::PadMode_CAFFE) { | } else if (pad_mode == schema::PadMode_CAFFE) { | ||||
| pad_h_ = pad_u_; | |||||
| pad_w_ = pad_l_; | |||||
| } else { | } else { | ||||
| MS_LOG(ERROR) << "unsupported pad mode for deconv"; | MS_LOG(ERROR) << "unsupported pad mode for deconv"; | ||||
| } | } | ||||
| @@ -74,16 +74,12 @@ class DeConv2D : public PrimitiveC { | |||||
| int PadDown() const { return this->pad_d_; } | int PadDown() const { return this->pad_d_; } | ||||
| int PadLeft() const { return this->pad_l_; } | int PadLeft() const { return this->pad_l_; } | ||||
| int PadRight() const { return this->pad_r_; } | int PadRight() const { return this->pad_r_; } | ||||
| int PadH() const { return this->pad_h_; } | |||||
| int PadW() const { return this->pad_w_; } | |||||
| protected: | protected: | ||||
| int pad_u_ = 0; | int pad_u_ = 0; | ||||
| int pad_d_ = 0; | int pad_d_ = 0; | ||||
| int pad_l_ = 0; | int pad_l_ = 0; | ||||
| int pad_r_ = 0; | int pad_r_ = 0; | ||||
| int pad_h_ = 0; | |||||
| int pad_w_ = 0; | |||||
| }; | }; | ||||
| } // namespace lite | } // namespace lite | ||||
| } // namespace mindspore | } // namespace mindspore | ||||
| @@ -170,10 +170,18 @@ int Pooling::InferShape(std::vector<tensor::Tensor *> inputs_, std::vector<tenso | |||||
| output_h = std::ceil(static_cast<float>(input_h) / static_cast<float>(GetStrideH())); | output_h = std::ceil(static_cast<float>(input_h) / static_cast<float>(GetStrideH())); | ||||
| auto pad_h_all = ((output_h - 1) * GetStrideH() + (window_h - 1) + 1 - input_h); | auto pad_h_all = ((output_h - 1) * GetStrideH() + (window_h - 1) + 1 - input_h); | ||||
| auto pad_w_all = ((output_w - 1) * GetStrideW() + (window_w - 1) + 1 - input_w); | auto pad_w_all = ((output_w - 1) * GetStrideW() + (window_w - 1) + 1 - input_w); | ||||
| pad_u_ = pad_h_all / 2; | |||||
| pad_d_ = pad_h_all - pad_u_; | |||||
| pad_l_ = pad_w_all / 2; | |||||
| pad_r_ = pad_w_all - pad_l_; | |||||
| if (pad_h_all < 0) { | |||||
| pad_u_ = pad_d_ = 0; | |||||
| } else { | |||||
| pad_u_ = pad_h_all / 2; | |||||
| pad_d_ = pad_h_all - pad_u_; | |||||
| } | |||||
| if (pad_w_all < 0) { | |||||
| pad_l_ = pad_r_ = 0; | |||||
| } else { | |||||
| pad_l_ = pad_w_all / 2; | |||||
| pad_r_ = pad_w_all - pad_l_; | |||||
| } | |||||
| } else { | } else { | ||||
| auto round_mode = (schema::RoundMode)GetRoundMode(); | auto round_mode = (schema::RoundMode)GetRoundMode(); | ||||
| if (round_mode == schema::RoundMode_FLOOR) { | if (round_mode == schema::RoundMode_FLOOR) { | ||||
| @@ -376,8 +376,6 @@ OpParameter *PopulateConvParameter(const mindspore::lite::PrimitiveC *primitive) | |||||
| conv_param->pad_d_ = conv2d_lite_primitive->PadDown(); | conv_param->pad_d_ = conv2d_lite_primitive->PadDown(); | ||||
| conv_param->pad_l_ = conv2d_lite_primitive->PadLeft(); | conv_param->pad_l_ = conv2d_lite_primitive->PadLeft(); | ||||
| conv_param->pad_r_ = conv2d_lite_primitive->PadRight(); | conv_param->pad_r_ = conv2d_lite_primitive->PadRight(); | ||||
| conv_param->pad_h_ = conv2d_lite_primitive->PadUp(); | |||||
| conv_param->pad_w_ = conv2d_lite_primitive->PadLeft(); | |||||
| conv_param->dilation_h_ = conv_primitive->GetDilateH(); | conv_param->dilation_h_ = conv_primitive->GetDilateH(); | ||||
| conv_param->dilation_w_ = conv_primitive->GetDilateW(); | conv_param->dilation_w_ = conv_primitive->GetDilateW(); | ||||
| conv_param->input_channel_ = conv_primitive->GetChannelIn(); | conv_param->input_channel_ = conv_primitive->GetChannelIn(); | ||||
| @@ -386,16 +384,13 @@ OpParameter *PopulateConvParameter(const mindspore::lite::PrimitiveC *primitive) | |||||
| auto act_type = conv_primitive->GetActivationType(); | auto act_type = conv_primitive->GetActivationType(); | ||||
| switch (act_type) { | switch (act_type) { | ||||
| case schema::ActivationType_RELU: | case schema::ActivationType_RELU: | ||||
| conv_param->is_relu_ = true; | |||||
| conv_param->is_relu6_ = false; | |||||
| conv_param->act_type_ = ActType_Relu; | |||||
| break; | break; | ||||
| case schema::ActivationType_RELU6: | case schema::ActivationType_RELU6: | ||||
| conv_param->is_relu_ = false; | |||||
| conv_param->is_relu6_ = true; | |||||
| conv_param->act_type_ = ActType_Relu6; | |||||
| break; | break; | ||||
| default: | default: | ||||
| conv_param->is_relu_ = false; | |||||
| conv_param->is_relu6_ = false; | |||||
| conv_param->act_type_ = ActType_No; | |||||
| break; | break; | ||||
| } | } | ||||
| return reinterpret_cast<OpParameter *>(conv_param); | return reinterpret_cast<OpParameter *>(conv_param); | ||||
| @@ -422,23 +417,18 @@ OpParameter *PopulateConvDwParameter(const mindspore::lite::PrimitiveC *primitiv | |||||
| conv_param->pad_d_ = convdw_lite_primitive->PadDown(); | conv_param->pad_d_ = convdw_lite_primitive->PadDown(); | ||||
| conv_param->pad_l_ = convdw_lite_primitive->PadLeft(); | conv_param->pad_l_ = convdw_lite_primitive->PadLeft(); | ||||
| conv_param->pad_r_ = convdw_lite_primitive->PadRight(); | conv_param->pad_r_ = convdw_lite_primitive->PadRight(); | ||||
| conv_param->pad_h_ = convdw_lite_primitive->PadUp(); | |||||
| conv_param->pad_w_ = convdw_lite_primitive->PadLeft(); | |||||
| conv_param->dilation_h_ = conv_primitive->GetDilateH(); | conv_param->dilation_h_ = conv_primitive->GetDilateH(); | ||||
| conv_param->dilation_w_ = conv_primitive->GetDilateW(); | conv_param->dilation_w_ = conv_primitive->GetDilateW(); | ||||
| auto act_type = conv_primitive->GetActivationType(); | auto act_type = conv_primitive->GetActivationType(); | ||||
| switch (act_type) { | switch (act_type) { | ||||
| case schema::ActivationType_RELU: | case schema::ActivationType_RELU: | ||||
| conv_param->is_relu_ = true; | |||||
| conv_param->is_relu6_ = false; | |||||
| conv_param->act_type_ = ActType_Relu; | |||||
| break; | break; | ||||
| case schema::ActivationType_RELU6: | case schema::ActivationType_RELU6: | ||||
| conv_param->is_relu_ = false; | |||||
| conv_param->is_relu6_ = true; | |||||
| conv_param->act_type_ = ActType_Relu6; | |||||
| break; | break; | ||||
| default: | default: | ||||
| conv_param->is_relu_ = false; | |||||
| conv_param->is_relu6_ = false; | |||||
| conv_param->act_type_ = ActType_No; | |||||
| break; | break; | ||||
| } | } | ||||
| return reinterpret_cast<OpParameter *>(conv_param); | return reinterpret_cast<OpParameter *>(conv_param); | ||||
| @@ -464,23 +454,18 @@ OpParameter *PopulateDeconvDwParameter(const mindspore::lite::PrimitiveC *primit | |||||
| conv_param->pad_d_ = deconvdw_lite_primitive->PadDown(); | conv_param->pad_d_ = deconvdw_lite_primitive->PadDown(); | ||||
| conv_param->pad_l_ = deconvdw_lite_primitive->PadLeft(); | conv_param->pad_l_ = deconvdw_lite_primitive->PadLeft(); | ||||
| conv_param->pad_r_ = deconvdw_lite_primitive->PadRight(); | conv_param->pad_r_ = deconvdw_lite_primitive->PadRight(); | ||||
| conv_param->pad_h_ = deconvdw_lite_primitive->PadUp(); | |||||
| conv_param->pad_w_ = deconvdw_lite_primitive->PadLeft(); | |||||
| conv_param->dilation_h_ = conv_primitive->GetDilateH(); | conv_param->dilation_h_ = conv_primitive->GetDilateH(); | ||||
| conv_param->dilation_w_ = conv_primitive->GetDilateW(); | conv_param->dilation_w_ = conv_primitive->GetDilateW(); | ||||
| auto act_type = conv_primitive->GetActivationType(); | auto act_type = conv_primitive->GetActivationType(); | ||||
| switch (act_type) { | switch (act_type) { | ||||
| case schema::ActivationType_RELU: | case schema::ActivationType_RELU: | ||||
| conv_param->is_relu_ = true; | |||||
| conv_param->is_relu6_ = false; | |||||
| conv_param->act_type_ = ActType_Relu; | |||||
| break; | break; | ||||
| case schema::ActivationType_RELU6: | case schema::ActivationType_RELU6: | ||||
| conv_param->is_relu_ = false; | |||||
| conv_param->is_relu6_ = true; | |||||
| conv_param->act_type_ = ActType_Relu6; | |||||
| break; | break; | ||||
| default: | default: | ||||
| conv_param->is_relu_ = false; | |||||
| conv_param->is_relu6_ = false; | |||||
| conv_param->act_type_ = ActType_No; | |||||
| break; | break; | ||||
| } | } | ||||
| return reinterpret_cast<OpParameter *>(conv_param); | return reinterpret_cast<OpParameter *>(conv_param); | ||||
| @@ -506,23 +491,18 @@ OpParameter *PopulateDeconvParameter(const mindspore::lite::PrimitiveC *primitiv | |||||
| conv_param->pad_d_ = deconv_lite_primitive->PadDown(); | conv_param->pad_d_ = deconv_lite_primitive->PadDown(); | ||||
| conv_param->pad_l_ = deconv_lite_primitive->PadLeft(); | conv_param->pad_l_ = deconv_lite_primitive->PadLeft(); | ||||
| conv_param->pad_r_ = deconv_lite_primitive->PadRight(); | conv_param->pad_r_ = deconv_lite_primitive->PadRight(); | ||||
| conv_param->pad_h_ = deconv_lite_primitive->PadH(); | |||||
| conv_param->pad_w_ = deconv_lite_primitive->PadW(); | |||||
| conv_param->dilation_h_ = conv_primitive->GetDilateH(); | conv_param->dilation_h_ = conv_primitive->GetDilateH(); | ||||
| conv_param->dilation_w_ = conv_primitive->GetDilateW(); | conv_param->dilation_w_ = conv_primitive->GetDilateW(); | ||||
| auto act_type = conv_primitive->GetActivationType(); | auto act_type = conv_primitive->GetActivationType(); | ||||
| switch (act_type) { | switch (act_type) { | ||||
| case schema::ActivationType_RELU: | case schema::ActivationType_RELU: | ||||
| conv_param->is_relu_ = true; | |||||
| conv_param->is_relu6_ = false; | |||||
| conv_param->act_type_ = ActType_Relu; | |||||
| break; | break; | ||||
| case schema::ActivationType_RELU6: | case schema::ActivationType_RELU6: | ||||
| conv_param->is_relu_ = false; | |||||
| conv_param->is_relu6_ = true; | |||||
| conv_param->act_type_ = ActType_Relu6; | |||||
| break; | break; | ||||
| default: | default: | ||||
| conv_param->is_relu_ = false; | |||||
| conv_param->is_relu6_ = false; | |||||
| conv_param->act_type_ = ActType_No; | |||||
| break; | break; | ||||
| } | } | ||||
| return reinterpret_cast<OpParameter *>(conv_param); | return reinterpret_cast<OpParameter *>(conv_param); | ||||
| @@ -322,10 +322,12 @@ int ConvolutionBaseCPUKernel::SetQuantParam() { | |||||
| return ret; | return ret; | ||||
| } | } | ||||
| // now only consider per tensor for output | // now only consider per tensor for output | ||||
| CalculateActivationRangeQuantized( | |||||
| conv_param_->is_relu_, conv_param_->is_relu6_, conv_param_->conv_quant_arg_.output_quant_args_[0].zp_, | |||||
| conv_param_->conv_quant_arg_.output_quant_args_[0].scale_, &conv_param_->conv_quant_arg_.out_act_min_[0], | |||||
| &conv_param_->conv_quant_arg_.out_act_max_[0]); | |||||
| bool relu = conv_param_->act_type_ == ActType_Relu; | |||||
| bool relu6 = conv_param_->act_type_ == ActType_Relu6; | |||||
| CalculateActivationRangeQuantized(relu, relu6, conv_param_->conv_quant_arg_.output_quant_args_[0].zp_, | |||||
| conv_param_->conv_quant_arg_.output_quant_args_[0].scale_, | |||||
| &conv_param_->conv_quant_arg_.out_act_min_[0], | |||||
| &conv_param_->conv_quant_arg_.out_act_max_[0]); | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -38,8 +38,7 @@ int Convolution1x1FP16CPUKernel::InitMatmulParam() { | |||||
| matmul_param_->deep_ = conv_param_->input_channel_; | matmul_param_->deep_ = conv_param_->input_channel_; | ||||
| matmul_param_->row_16_ = UP_ROUND(matmul_param_->row_, C16NUM); | matmul_param_->row_16_ = UP_ROUND(matmul_param_->row_, C16NUM); | ||||
| matmul_param_->col_8_ = UP_ROUND(matmul_param_->col_, C8NUM); | matmul_param_->col_8_ = UP_ROUND(matmul_param_->col_, C8NUM); | ||||
| matmul_param_->act_type_ = (conv_param_->is_relu6_) ? ActType_Relu6 : ActType_No; | |||||
| matmul_param_->act_type_ = (conv_param_->is_relu_) ? ActType_Relu : matmul_param_->act_type_; | |||||
| matmul_param_->act_type_ = conv_param_->act_type_; | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -57,7 +56,7 @@ Convolution1x1FP16CPUKernel::~Convolution1x1FP16CPUKernel() { | |||||
| } | } | ||||
| int Convolution1x1FP16CPUKernel::InitConv1x1Param() { | int Convolution1x1FP16CPUKernel::InitConv1x1Param() { | ||||
| pre_trans_input_ = (conv_param_->pad_h_ != 0 || conv_param_->pad_w_ != 0 || conv_param_->stride_h_ != 1 || | |||||
| pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 || | |||||
| conv_param_->stride_w_ != 1); | conv_param_->stride_w_ != 1); | ||||
| thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, C8NUM)); | thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, C8NUM)); | ||||
| @@ -237,8 +237,8 @@ int Convolution3x3FP16CPUKernel::Run() { | |||||
| } | } | ||||
| // get real output | // get real output | ||||
| bool relu = conv_param_->is_relu_; | |||||
| bool relu6 = conv_param_->is_relu6_; | |||||
| bool relu = conv_param_->act_type_ == ActType_Relu; | |||||
| bool relu6 = conv_param_->act_type_ == ActType_Relu6; | |||||
| if (relu) { | if (relu) { | ||||
| UnPack3x3ReluOutputFp16(tmp_out_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_, | UnPack3x3ReluOutputFp16(tmp_out_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_, | ||||
| conv_param_->output_w_, conv_param_->output_channel_); | conv_param_->output_w_, conv_param_->output_channel_); | ||||
| @@ -391,10 +391,10 @@ int ConvolutionWinogradFP16CPUKernel::Run() { | |||||
| } | } | ||||
| // get real output | // get real output | ||||
| if (conv_param_->is_relu_) { | |||||
| if (conv_param_->act_type_ == ActType_Relu) { | |||||
| UnPackWinogradReluOutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_, | UnPackWinogradReluOutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_, | ||||
| conv_param_->output_w_, conv_param_->output_channel_, output_unit_); | conv_param_->output_w_, conv_param_->output_channel_, output_unit_); | ||||
| } else if (conv_param_->is_relu6_) { | |||||
| } else if (conv_param_->act_type_ == ActType_Relu6) { | |||||
| UnPackWinogradRelu6OutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_, | UnPackWinogradRelu6OutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_, | ||||
| conv_param_->output_w_, conv_param_->output_channel_, output_unit_); | conv_param_->output_w_, conv_param_->output_channel_, output_unit_); | ||||
| } else { | } else { | ||||
| @@ -232,34 +232,31 @@ kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector<lite::tensor::Ten | |||||
| auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter); | auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter); | ||||
| int kernel_h = conv_param->kernel_h_; | int kernel_h = conv_param->kernel_h_; | ||||
| int kernel_w = conv_param->kernel_w_; | int kernel_w = conv_param->kernel_w_; | ||||
| int stride_h = conv_param->stride_h_; | |||||
| int stride_w = conv_param->stride_w_; | |||||
| int dilation_h = conv_param->dilation_h_; | |||||
| int dilation_w = conv_param->dilation_w_; | |||||
| conv_param->input_h_ = inputs.front()->Height(); | conv_param->input_h_ = inputs.front()->Height(); | ||||
| conv_param->input_w_ = inputs.front()->Width(); | conv_param->input_w_ = inputs.front()->Width(); | ||||
| conv_param->input_channel_ = inputs.front()->Channel(); | |||||
| conv_param->output_h_ = outputs.front()->Height(); | conv_param->output_h_ = outputs.front()->Height(); | ||||
| conv_param->output_w_ = outputs.front()->Width(); | conv_param->output_w_ = outputs.front()->Width(); | ||||
| conv_param->output_channel_ = outputs.front()->Channel(); | |||||
| conv_param->op_parameter_.thread_num_ = ctx->thread_num_; | |||||
| bool use_winograd = false; | bool use_winograd = false; | ||||
| bool use_sw = false; | |||||
| int out_unit; | int out_unit; | ||||
| InputTransformUnitFunc input_trans_func = nullptr; | InputTransformUnitFunc input_trans_func = nullptr; | ||||
| OutputTransformUnitFunc output_trans_func = nullptr; | OutputTransformUnitFunc output_trans_func = nullptr; | ||||
| if (primitive != nullptr && primitive->GetInferFlag()) { | if (primitive != nullptr && primitive->GetInferFlag()) { | ||||
| CheckIfUseWinograd(&use_winograd, &out_unit, conv_param, input_trans_func, output_trans_func); | CheckIfUseWinograd(&use_winograd, &out_unit, conv_param, input_trans_func, output_trans_func); | ||||
| use_sw = CheckIfUseSlideWindow(conv_param); | |||||
| } | } | ||||
| kernel::LiteKernel *kernel; | kernel::LiteKernel *kernel; | ||||
| if (kernel_h == 1 && kernel_w == 1) { | if (kernel_h == 1 && kernel_w == 1) { | ||||
| kernel = new (std::nothrow) kernel::Convolution1x1CPUKernel(op_parameter, inputs, outputs, ctx, primitive); | kernel = new (std::nothrow) kernel::Convolution1x1CPUKernel(op_parameter, inputs, outputs, ctx, primitive); | ||||
| } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { | |||||
| kernel = new (std::nothrow) kernel::Convolution3x3CPUKernel(op_parameter, inputs, outputs, ctx, primitive); | |||||
| } else if (use_winograd) { | } else if (use_winograd) { | ||||
| kernel = | |||||
| new (std::nothrow) kernel::ConvolutionWinogradCPUKernel(op_parameter, inputs, outputs, ctx, primitive, out_unit); | |||||
| } else if (use_sw) { | |||||
| kernel = new (std::nothrow) kernel::ConvolutionSWCPUKernel(op_parameter, inputs, outputs, ctx, primitive); | |||||
| if (kernel_h == 3 && kernel_w == 3 && out_unit == 2) { | |||||
| kernel = new (std::nothrow) kernel::Convolution3x3CPUKernel(op_parameter, inputs, outputs, ctx, primitive); | |||||
| } else { | |||||
| kernel = new (std::nothrow) | |||||
| kernel::ConvolutionWinogradCPUKernel(op_parameter, inputs, outputs, ctx, primitive, out_unit); | |||||
| } | |||||
| } else { | } else { | ||||
| kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive); | kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive); | ||||
| } | } | ||||
| @@ -65,8 +65,7 @@ void Convolution1x1CPUKernel::InitConv1x1MatmulParam() { | |||||
| matmul_param_->deep_ = conv_param_->input_channel_; | matmul_param_->deep_ = conv_param_->input_channel_; | ||||
| matmul_param_->row_12_ = UP_ROUND(matmul_param_->row_, C12NUM); | matmul_param_->row_12_ = UP_ROUND(matmul_param_->row_, C12NUM); | ||||
| matmul_param_->col_8_ = UP_ROUND(matmul_param_->col_, C8NUM); | matmul_param_->col_8_ = UP_ROUND(matmul_param_->col_, C8NUM); | ||||
| matmul_param_->act_type_ = (conv_param_->is_relu6_) ? ActType_Relu6 : ActType_No; | |||||
| matmul_param_->act_type_ = (conv_param_->is_relu_) ? ActType_Relu : matmul_param_->act_type_; | |||||
| matmul_param_->act_type_ = conv_param_->act_type_; | |||||
| return; | return; | ||||
| } | } | ||||
| @@ -98,7 +97,7 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() { | |||||
| } | } | ||||
| int Convolution1x1CPUKernel::InitConv1x1Param() { | int Convolution1x1CPUKernel::InitConv1x1Param() { | ||||
| pre_trans_input_ = (conv_param_->pad_h_ != 0 || conv_param_->pad_w_ != 0 || conv_param_->stride_h_ != 1 || | |||||
| pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 || | |||||
| conv_param_->stride_w_ != 1); | conv_param_->stride_w_ != 1); | ||||
| thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, C8NUM)); | thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, C8NUM)); | ||||
| @@ -94,7 +94,9 @@ int Convolution3x3CPUKernel::InitWeightBias() { | |||||
| } | } | ||||
| int Convolution3x3CPUKernel::InitTmpBuffer() { | int Convolution3x3CPUKernel::InitTmpBuffer() { | ||||
| int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM); | |||||
| int oC4 = UP_DIV(conv_param_->output_channel_, C4NUM); | int oC4 = UP_DIV(conv_param_->output_channel_, C4NUM); | ||||
| int oC8 = UP_DIV(conv_param_->output_channel_, C8NUM); | |||||
| const int k_plane = 16; | const int k_plane = 16; | ||||
| MS_ASSERT(ctx_->allocator != nullptr); | MS_ASSERT(ctx_->allocator != nullptr); | ||||
| @@ -105,13 +107,20 @@ int Convolution3x3CPUKernel::InitTmpBuffer() { | |||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| size_t tmp_dst_buffer_size = thread_count_ * TILE_NUM * k_plane * oC4 * C4NUM * sizeof(float); | |||||
| size_t tmp_dst_buffer_size = thread_count_ * C12NUM * k_plane * oC8 * C8NUM * sizeof(float); | |||||
| tmp_dst_buffer_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(tmp_dst_buffer_size)); | tmp_dst_buffer_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(tmp_dst_buffer_size)); | ||||
| if (tmp_dst_buffer_ == nullptr) { | if (tmp_dst_buffer_ == nullptr) { | ||||
| MS_LOG(ERROR) << "malloc tmp_dst_buffer_ failed."; | MS_LOG(ERROR) << "malloc tmp_dst_buffer_ failed."; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| size_t col_buffer_size = thread_count_ * C12NUM * C4NUM * ic4 * sizeof(float); | |||||
| col_buffer_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(col_buffer_size)); | |||||
| if (col_buffer_ == nullptr) { | |||||
| MS_LOG(ERROR) << "malloc col_buffer_ failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| size_t nc4hw4_out_size = | size_t nc4hw4_out_size = | ||||
| oC4 * C4NUM * conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * sizeof(float); | oC4 * C4NUM * conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * sizeof(float); | ||||
| nc4hw4_out_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(nc4hw4_out_size)); | nc4hw4_out_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(nc4hw4_out_size)); | ||||
| @@ -124,6 +133,7 @@ int Convolution3x3CPUKernel::InitTmpBuffer() { | |||||
| tmp_buffer_address_list_[1] = block_unit_buffer_; | tmp_buffer_address_list_[1] = block_unit_buffer_; | ||||
| tmp_buffer_address_list_[2] = tmp_dst_buffer_; | tmp_buffer_address_list_[2] = tmp_dst_buffer_; | ||||
| tmp_buffer_address_list_[3] = nc4hw4_out_; | tmp_buffer_address_list_[3] = nc4hw4_out_; | ||||
| tmp_buffer_address_list_[4] = col_buffer_; | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -182,7 +192,7 @@ int Convolution3x3CPUKernel::ReSize() { | |||||
| } | } | ||||
| memset(nhwc4_input_, 0, nhwc4_input_size); | memset(nhwc4_input_, 0, nhwc4_input_size); | ||||
| size_t tile_buffer_size = thread_count_ * TILE_NUM * C16NUM * iC4 * C4NUM * sizeof(float); | |||||
| size_t tile_buffer_size = thread_count_ * C12NUM * C16NUM * iC4 * C4NUM * sizeof(float); | |||||
| tile_buffer_ = reinterpret_cast<float *>(malloc(tile_buffer_size)); | tile_buffer_ = reinterpret_cast<float *>(malloc(tile_buffer_size)); | ||||
| if (tile_buffer_ == nullptr) { | if (tile_buffer_ == nullptr) { | ||||
| MS_LOG(ERROR) << "malloc tile buffer failed."; | MS_LOG(ERROR) << "malloc tile buffer failed."; | ||||
| @@ -237,8 +247,8 @@ int Convolution3x3CPUKernel::Run() { | |||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| auto is_relu = conv_param_->is_relu_; | |||||
| auto is_relu6 = conv_param_->is_relu6_; | |||||
| auto is_relu = conv_param_->act_type_ == ActType_Relu; | |||||
| auto is_relu6 = conv_param_->act_type_ == ActType_Relu6; | |||||
| auto output_addr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->Data()); | auto output_addr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->Data()); | ||||
| if (is_relu) { | if (is_relu) { | ||||
| PackNC4HW4ToNHWCReluFp32(nc4hw4_out_, output_addr, conv_param_->output_batch_, | PackNC4HW4ToNHWCReluFp32(nc4hw4_out_, output_addr, conv_param_->output_batch_, | ||||
| @@ -60,14 +60,19 @@ class Convolution3x3CPUKernel : public ConvolutionBaseCPUKernel { | |||||
| ctx_->allocator->Free(nc4hw4_out_); | ctx_->allocator->Free(nc4hw4_out_); | ||||
| nc4hw4_out_ = nullptr; | nc4hw4_out_ = nullptr; | ||||
| } | } | ||||
| if (col_buffer_ != nullptr) { | |||||
| ctx_->allocator->Free(col_buffer_); | |||||
| col_buffer_ = nullptr; | |||||
| } | |||||
| } | } | ||||
| float *transformed_filter_addr_ = nullptr; | float *transformed_filter_addr_ = nullptr; | ||||
| float *tile_buffer_ = nullptr; | float *tile_buffer_ = nullptr; | ||||
| float *block_unit_buffer_ = nullptr; | float *block_unit_buffer_ = nullptr; | ||||
| float *tmp_dst_buffer_ = nullptr; | float *tmp_dst_buffer_ = nullptr; | ||||
| float *col_buffer_ = nullptr; | |||||
| float *nc4hw4_out_ = nullptr; | float *nc4hw4_out_ = nullptr; | ||||
| TmpBufferAddress tmp_buffer_address_list_[4]; | |||||
| TmpBufferAddress tmp_buffer_address_list_[5]; | |||||
| GEMM_FUNC_FP32 gemm_func_ = nullptr; | GEMM_FUNC_FP32 gemm_func_ = nullptr; | ||||
| }; | }; | ||||
| void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_param, int oc_block, int oc_block_num); | void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_param, int oc_block, int oc_block_num); | ||||
| @@ -76,7 +76,7 @@ int WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int | |||||
| int out_c_block = i / oc_block; | int out_c_block = i / oc_block; | ||||
| int out_c_res = i % oc_block; | int out_c_res = i % oc_block; | ||||
| int input_oz_offset = i * kernel_unit * kernel_unit * channel_in; | int input_oz_offset = i * kernel_unit * kernel_unit * channel_in; | ||||
| int output_oz_offset = out_c_block * strides[1] * input_unit * input_unit + out_c_res; | |||||
| int output_oz_offset = out_c_block * strides[1] + out_c_res; | |||||
| for (int j = 0; j < channel_in; j++) { | for (int j = 0; j < channel_in; j++) { | ||||
| int ic4_block = j / C4NUM; | int ic4_block = j / C4NUM; | ||||
| int ic4_res = j % C4NUM; | int ic4_res = j % C4NUM; | ||||
| @@ -93,7 +93,7 @@ int WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int | |||||
| MatrixMultiply(tmp_data, matrix_gt_data, trans_out_data, input_unit, kernel_unit, input_unit, row); | MatrixMultiply(tmp_data, matrix_gt_data, trans_out_data, input_unit, kernel_unit, input_unit, row); | ||||
| for (int z = 0; z < input_unit_square; z++) { | for (int z = 0; z < input_unit_square; z++) { | ||||
| int output_xy_offset = output_iz_offset + z * strides[1]; | |||||
| int output_xy_offset = output_iz_offset + z * strides[0]; | |||||
| *(trans_weight_data + output_xy_offset) = trans_out_data[z]; | *(trans_weight_data + output_xy_offset) = trans_out_data[z]; | ||||
| } | } | ||||
| } | } | ||||
| @@ -151,7 +151,7 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() { | |||||
| int ConvolutionWinogradCPUKernel::MallocFilterMatrix(int oc_block, int oc_block_num) { | int ConvolutionWinogradCPUKernel::MallocFilterMatrix(int oc_block, int oc_block_num) { | ||||
| int channel_in = conv_param_->input_channel_; | int channel_in = conv_param_->input_channel_; | ||||
| int ic4 = UP_DIV(channel_in, BLOCK); | |||||
| int ic4 = UP_DIV(channel_in, C4NUM); | |||||
| // set data | // set data | ||||
| auto trans_matrix_data_size = input_unit_ * input_unit_ * ic4 * C4NUM * oc_block_num * oc_block * sizeof(float); | auto trans_matrix_data_size = input_unit_ * input_unit_ * ic4 * C4NUM * oc_block_num * oc_block * sizeof(float); | ||||
| @@ -196,10 +196,12 @@ int ConvolutionWinogradCPUKernel::InitTmpBuffer() { | |||||
| int output_h = conv_param_->output_h_; | int output_h = conv_param_->output_h_; | ||||
| int output_w = conv_param_->output_w_; | int output_w = conv_param_->output_w_; | ||||
| int oc4 = UP_DIV(channel_out, C4NUM); | int oc4 = UP_DIV(channel_out, C4NUM); | ||||
| int oc8 = UP_DIV(channel_out, C8NUM); | |||||
| int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM); | |||||
| MS_ASSERT(ctx_->allocator != nullptr); | MS_ASSERT(ctx_->allocator != nullptr); | ||||
| gemm_out_ = reinterpret_cast<float *>( | gemm_out_ = reinterpret_cast<float *>( | ||||
| ctx_->allocator->Malloc(thread_count_ * TILE_NUM * input_unit_ * input_unit_ * oc4 * C4NUM * sizeof(float))); | |||||
| ctx_->allocator->Malloc(thread_count_ * C12NUM * input_unit_ * input_unit_ * oc8 * C8NUM * sizeof(float))); | |||||
| if (gemm_out_ == nullptr) { | if (gemm_out_ == nullptr) { | ||||
| MS_LOG(ERROR) << "malloc gemm_out_ failed."; | MS_LOG(ERROR) << "malloc gemm_out_ failed."; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| @@ -222,10 +224,18 @@ int ConvolutionWinogradCPUKernel::InitTmpBuffer() { | |||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| col_buffer_ = | |||||
| reinterpret_cast<float *>(ctx_->allocator->Malloc(thread_count_ * C12NUM * ic4 * C4NUM * sizeof(float))); | |||||
| if (col_buffer_ == nullptr) { | |||||
| MS_LOG(ERROR) << "malloc col_buffer_ failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| tmp_buffer_address_list_[0] = trans_input_; | tmp_buffer_address_list_[0] = trans_input_; | ||||
| tmp_buffer_address_list_[1] = gemm_out_; | tmp_buffer_address_list_[1] = gemm_out_; | ||||
| tmp_buffer_address_list_[2] = tmp_out_data_; | tmp_buffer_address_list_[2] = tmp_out_data_; | ||||
| tmp_buffer_address_list_[3] = tmp_data_; | tmp_buffer_address_list_[3] = tmp_data_; | ||||
| tmp_buffer_address_list_[4] = col_buffer_; | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -306,7 +316,7 @@ int ConvolutionWinogradCPUKernel::ReSize() { | |||||
| } | } | ||||
| memset(nhwc4_input_, 0, nhwc4_input_size); | memset(nhwc4_input_, 0, nhwc4_input_size); | ||||
| size_t tile_buffer_size = thread_count_ * TILE_NUM * input_unit_ * input_unit_ * ic4 * C4NUM * sizeof(float); | |||||
| size_t tile_buffer_size = thread_count_ * C12NUM * input_unit_ * input_unit_ * ic4 * C4NUM * sizeof(float); | |||||
| trans_input_ = reinterpret_cast<float *>(malloc(tile_buffer_size)); | trans_input_ = reinterpret_cast<float *>(malloc(tile_buffer_size)); | ||||
| if (trans_input_ == nullptr) { | if (trans_input_ == nullptr) { | ||||
| MS_LOG(ERROR) << "malloc trans_input_ failed."; | MS_LOG(ERROR) << "malloc trans_input_ failed."; | ||||
| @@ -370,10 +380,10 @@ int ConvolutionWinogradCPUKernel::Run() { | |||||
| // get real output | // get real output | ||||
| auto out_tensor = out_tensors_.front(); | auto out_tensor = out_tensors_.front(); | ||||
| auto out_data = reinterpret_cast<float *>(out_tensor->Data()); | auto out_data = reinterpret_cast<float *>(out_tensor->Data()); | ||||
| if (conv_param_->is_relu_) { | |||||
| if (conv_param_->act_type_ == ActType_Relu) { | |||||
| UnPackWinogradReluOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_, | UnPackWinogradReluOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_, | ||||
| conv_param_->output_w_, conv_param_->output_channel_, output_unit_); | conv_param_->output_w_, conv_param_->output_channel_, output_unit_); | ||||
| } else if (conv_param_->is_relu6_) { | |||||
| } else if (conv_param_->act_type_ == ActType_Relu6) { | |||||
| UnPackWinogradRelu6Output(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_, | UnPackWinogradRelu6Output(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_, | ||||
| conv_param_->output_w_, conv_param_->output_channel_, output_unit_); | conv_param_->output_w_, conv_param_->output_channel_, output_unit_); | ||||
| } else { | } else { | ||||
| @@ -66,6 +66,10 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel { | |||||
| ctx_->allocator->Free(tmp_out_data_); | ctx_->allocator->Free(tmp_out_data_); | ||||
| tmp_out_data_ = nullptr; | tmp_out_data_ = nullptr; | ||||
| } | } | ||||
| if (col_buffer_ != nullptr) { | |||||
| ctx_->allocator->Free(col_buffer_); | |||||
| col_buffer_ = nullptr; | |||||
| } | |||||
| } | } | ||||
| int kernel_unit_; | int kernel_unit_; | ||||
| int input_unit_; | int input_unit_; | ||||
| @@ -74,6 +78,7 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel { | |||||
| float *trans_input_ = nullptr; | float *trans_input_ = nullptr; | ||||
| float *gemm_out_ = nullptr; | float *gemm_out_ = nullptr; | ||||
| float *tmp_out_data_ = nullptr; | float *tmp_out_data_ = nullptr; | ||||
| float *col_buffer_ = nullptr; | |||||
| Matrix *trans_weight_ = nullptr; | Matrix *trans_weight_ = nullptr; | ||||
| InputTransformUnitFunc input_trans_func_; | InputTransformUnitFunc input_trans_func_; | ||||
| OutputTransformUnitFunc output_trans_func_; | OutputTransformUnitFunc output_trans_func_; | ||||
| @@ -146,7 +146,7 @@ int Convolution1x1Int8CPUKernel::Init() { | |||||
| } | } | ||||
| int Convolution1x1Int8CPUKernel::InitParam() { | int Convolution1x1Int8CPUKernel::InitParam() { | ||||
| pre_trans_input_ = (conv_param_->pad_h_ != 0 || conv_param_->pad_w_ != 0 || conv_param_->stride_h_ != 1 || | |||||
| pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 || | |||||
| conv_param_->stride_w_ != 1); | conv_param_->stride_w_ != 1); | ||||
| matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_; | matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_; | ||||
| @@ -36,7 +36,7 @@ int Conv2dTransposeOpenCLKernel::Init() { | |||||
| MS_LOG(ERROR) << "only support kh=kw=2 and stride_h=stride_w=2."; | MS_LOG(ERROR) << "only support kh=kw=2 and stride_h=stride_w=2."; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| if (param->pad_h_ != 0 || param->pad_w_ != 0) { | |||||
| if (param->pad_u_ != 0 || param->pad_l_ != 0) { | |||||
| MS_LOG(ERROR) << "only support pad =0."; | MS_LOG(ERROR) << "only support pad =0."; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| @@ -170,7 +170,7 @@ int Conv2dTransposeOpenCLKernel::Run() { | |||||
| int co = out_tensors_[0]->Channel(); | int co = out_tensors_[0]->Channel(); | ||||
| int kh = param->kernel_h_; | int kh = param->kernel_h_; | ||||
| int kw = param->kernel_w_; | int kw = param->kernel_w_; | ||||
| int pad = param->pad_h_; | |||||
| int pad = param->pad_u_; | |||||
| int oh = out_tensors_[0]->Height(); | int oh = out_tensors_[0]->Height(); | ||||
| int ow = out_tensors_[0]->Width(); | int ow = out_tensors_[0]->Width(); | ||||
| int h = in_tensors_[0]->Height(); | int h = in_tensors_[0]->Height(); | ||||
| @@ -382,9 +382,9 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolution() { | |||||
| " }\n\n"; | " }\n\n"; | ||||
| code += " FLT4 out0_c4_bias = out0_c4 + bias[co_slice];\n"; | code += " FLT4 out0_c4_bias = out0_c4 + bias[co_slice];\n"; | ||||
| if (param->is_relu_) { | |||||
| if (param->act_type_ == ActType_Relu) { | |||||
| code += " out0_c4_bias = max(out0_c4_bias, (FLT4)(0.0f));\n"; | code += " out0_c4_bias = max(out0_c4_bias, (FLT4)(0.0f));\n"; | ||||
| } else if (param->is_relu6_) { | |||||
| } else if (param->act_type_ == ActType_Relu6) { | |||||
| code += " out0_c4_bias = clamp(out0_c4_bias, (FLT4)(0.0f), (FLT4)(6.0f));\n"; | code += " out0_c4_bias = clamp(out0_c4_bias, (FLT4)(0.0f), (FLT4)(6.0f));\n"; | ||||
| } | } | ||||
| @@ -609,9 +609,9 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() { | |||||
| " acc += bias[slice];\n"; | " acc += bias[slice];\n"; | ||||
| auto param = reinterpret_cast<ConvParameter *>(op_parameter_); | auto param = reinterpret_cast<ConvParameter *>(op_parameter_); | ||||
| if (param->is_relu_) { | |||||
| if (param->act_type_ == ActType_Relu) { | |||||
| code += " acc = max(acc, (float4)(0.0f));\n"; | code += " acc = max(acc, (float4)(0.0f));\n"; | ||||
| } else if (param->is_relu6_) { | |||||
| } else if (param->act_type_ == ActType_Relu6) { | |||||
| code += " acc = clamp(acc, (float4)(0.0f), (float4)(6.0f));\n"; | code += " acc = clamp(acc, (float4)(0.0f), (float4)(6.0f));\n"; | ||||
| } | } | ||||
| @@ -163,7 +163,7 @@ int DepthwiseConv2dOpenCLKernel::Run() { | |||||
| float relu_clip1 = 6.0; | float relu_clip1 = 6.0; | ||||
| cl_int2 kernel_size = {parameter->kernel_h_, parameter->kernel_w_}; | cl_int2 kernel_size = {parameter->kernel_h_, parameter->kernel_w_}; | ||||
| cl_int2 stride = {parameter->stride_h_, parameter->stride_w_}; | cl_int2 stride = {parameter->stride_h_, parameter->stride_w_}; | ||||
| cl_int2 padding = {-parameter->pad_h_, -parameter->pad_w_}; | |||||
| cl_int2 padding = {-parameter->pad_u_, -parameter->pad_l_}; | |||||
| cl_int2 dilation = {parameter->dilation_h_, parameter->dilation_w_}; | cl_int2 dilation = {parameter->dilation_h_, parameter->dilation_w_}; | ||||
| cl_int4 src_size = {in_tensors_[0]->Width(), in_tensors_[0]->Height(), (cl_int)CI4, in_tensors_[0]->Batch()}; | cl_int4 src_size = {in_tensors_[0]->Width(), in_tensors_[0]->Height(), (cl_int)CI4, in_tensors_[0]->Batch()}; | ||||
| cl_int4 dst_size = {(cl_int)out_tensors_[0]->Width(), (cl_int)out_tensors_[0]->Height(), (cl_int)CO4, | cl_int4 dst_size = {(cl_int)out_tensors_[0]->Width(), (cl_int)out_tensors_[0]->Height(), (cl_int)CO4, | ||||
| @@ -47,8 +47,8 @@ void InitConvParamPack(ConvParameter *conv_param) { | |||||
| conv_param->dilation_h_ = 1; | conv_param->dilation_h_ = 1; | ||||
| conv_param->dilation_w_ = 1; | conv_param->dilation_w_ = 1; | ||||
| conv_param->pad_h_ = 1; | |||||
| conv_param->pad_w_ = 1; | |||||
| conv_param->pad_u_ = 1; | |||||
| conv_param->pad_l_ = 1; | |||||
| } | } | ||||
| TEST_F(TestPack, PackInputFp32) { | TEST_F(TestPack, PackInputFp32) { | ||||
| @@ -50,8 +50,8 @@ void InitConvParamGroup1Fp16(ConvParameter *conv_param) { | |||||
| conv_param->dilation_h_ = 1; | conv_param->dilation_h_ = 1; | ||||
| conv_param->dilation_w_ = 1; | conv_param->dilation_w_ = 1; | ||||
| conv_param->pad_h_ = 1; | |||||
| conv_param->pad_w_ = 1; | |||||
| conv_param->pad_u_ = 1; | |||||
| conv_param->pad_l_ = 1; | |||||
| conv_param->thread_num_ = 1; | conv_param->thread_num_ = 1; | ||||
| } | } | ||||
| @@ -75,8 +75,8 @@ void InitConvParamGroup2Fp16(ConvParameter *conv_param) { | |||||
| conv_param->dilation_h_ = 1; | conv_param->dilation_h_ = 1; | ||||
| conv_param->dilation_w_ = 1; | conv_param->dilation_w_ = 1; | ||||
| conv_param->pad_h_ = 1; | |||||
| conv_param->pad_w_ = 1; | |||||
| conv_param->pad_u_ = 1; | |||||
| conv_param->pad_l_ = 1; | |||||
| conv_param->thread_num_ = 1; | conv_param->thread_num_ = 1; | ||||
| } | } | ||||
| @@ -50,7 +50,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack1) { | |||||
| conv_param->output_h_ = 4; | conv_param->output_h_ = 4; | ||||
| conv_param->output_w_ = 5; | conv_param->output_w_ = 5; | ||||
| conv_param->stride_h_ = conv_param->stride_w_ = 4; | conv_param->stride_h_ = conv_param->stride_w_ = 4; | ||||
| conv_param->pad_h_ = conv_param->pad_w_ = 2; | |||||
| conv_param->pad_u_ = conv_param->pad_l_ = 2; | |||||
| float out[20] = {0}; | float out[20] = {0}; | ||||
| Conv1x1InputPack(in, out, conv_param, sizeof(float)); | Conv1x1InputPack(in, out, conv_param, sizeof(float)); | ||||
| @@ -91,7 +91,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack2) { | |||||
| conv_param->output_h_ = 7; | conv_param->output_h_ = 7; | ||||
| conv_param->output_w_ = 4; | conv_param->output_w_ = 4; | ||||
| conv_param->stride_h_ = conv_param->stride_w_ = 3; | conv_param->stride_h_ = conv_param->stride_w_ = 3; | ||||
| conv_param->pad_h_ = conv_param->pad_w_ = 0; | |||||
| conv_param->pad_u_ = conv_param->pad_l_ = 0; | |||||
| float out[28] = {0}; | float out[28] = {0}; | ||||
| Conv1x1InputPack(in, out, conv_param, sizeof(float)); | Conv1x1InputPack(in, out, conv_param, sizeof(float)); | ||||
| @@ -105,7 +105,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack3) { | |||||
| conv_param->input_h_ = conv_param->input_w_ = 3; | conv_param->input_h_ = conv_param->input_w_ = 3; | ||||
| conv_param->output_h_ = conv_param->output_w_ = 3; | conv_param->output_h_ = conv_param->output_w_ = 3; | ||||
| conv_param->stride_h_ = conv_param->stride_w_ = 2; | conv_param->stride_h_ = conv_param->stride_w_ = 2; | ||||
| conv_param->pad_h_ = conv_param->pad_w_ = 1; | |||||
| conv_param->pad_u_ = conv_param->pad_l_ = 1; | |||||
| float in[] = {1.6767339, 12.25904, 19.018835, 3.0790641, -9.252135, -8.685675, 3.6115494, 3.2282279, 17.025112, | float in[] = {1.6767339, 12.25904, 19.018835, 3.0790641, -9.252135, -8.685675, 3.6115494, 3.2282279, 17.025112, | ||||
| -5.052577, 12.750252, 12.701241, -8.9477215, -9.080522, 19.03931, -6.501229, -4.122992, 9.540845}; | -5.052577, 12.750252, 12.701241, -8.9477215, -9.080522, 19.03931, -6.501229, -4.122992, 9.540845}; | ||||
| @@ -124,7 +124,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack4) { | |||||
| conv_param->input_h_ = conv_param->input_w_ = 3; | conv_param->input_h_ = conv_param->input_w_ = 3; | ||||
| conv_param->output_h_ = conv_param->output_w_ = 3; | conv_param->output_h_ = conv_param->output_w_ = 3; | ||||
| conv_param->stride_h_ = conv_param->stride_w_ = 2; | conv_param->stride_h_ = conv_param->stride_w_ = 2; | ||||
| conv_param->pad_h_ = conv_param->pad_w_ = 1; | |||||
| conv_param->pad_u_ = conv_param->pad_l_ = 1; | |||||
| float in[] = {4.1795, 13.142, -3.593, 16.505, 19.899, 8.5562, 19.969, -6.235, -2.380, -9.027, 9.5542, | float in[] = {4.1795, 13.142, -3.593, 16.505, 19.899, 8.5562, 19.969, -6.235, -2.380, -9.027, 9.5542, | ||||
| 18.974, 23.622, 8.3608, 47.325, -14.36, 15.370, 4.3049, -0.784, 37.925, -0.081, 6.1298, | 18.974, 23.622, 8.3608, 47.325, -14.36, 15.370, 4.3049, -0.784, 37.925, -0.081, 6.1298, | ||||
| 0.6721, -1.517, 37.998, 13.719, 11.029, 1.7127, -1.770, 41.903, 9.0560, 14.988, 3.1866, | 0.6721, -1.517, 37.998, 13.719, 11.029, 1.7127, -1.770, 41.903, 9.0560, 14.988, 3.1866, | ||||
| @@ -281,8 +281,8 @@ int Conv1x1TestInit1(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<l | |||||
| conv_param->kernel_h_ = conv_param->kernel_w_ = 1; | conv_param->kernel_h_ = conv_param->kernel_w_ = 1; | ||||
| conv_param->stride_h_ = conv_param->stride_w_ = 2; | conv_param->stride_h_ = conv_param->stride_w_ = 2; | ||||
| conv_param->dilation_h_ = conv_param->dilation_w_ = 1; | conv_param->dilation_h_ = conv_param->dilation_w_ = 1; | ||||
| conv_param->pad_h_ = conv_param->pad_w_ = 1; | |||||
| conv_param->is_relu_ = conv_param->is_relu6_ = false; | |||||
| conv_param->pad_u_ = conv_param->pad_l_ = 1; | |||||
| conv_param->act_type_ = ActType_No; | |||||
| return out_t->ElementsNum(); | return out_t->ElementsNum(); | ||||
| } | } | ||||
| @@ -348,9 +348,8 @@ int Conv1x1TestInit2(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<l | |||||
| conv_param->kernel_h_ = conv_param->kernel_w_ = 1; | conv_param->kernel_h_ = conv_param->kernel_w_ = 1; | ||||
| conv_param->stride_h_ = conv_param->stride_w_ = 1; | conv_param->stride_h_ = conv_param->stride_w_ = 1; | ||||
| conv_param->dilation_h_ = conv_param->dilation_w_ = 1; | conv_param->dilation_h_ = conv_param->dilation_w_ = 1; | ||||
| conv_param->pad_h_ = conv_param->pad_w_ = 0; | |||||
| conv_param->is_relu_ = false; | |||||
| conv_param->is_relu6_ = false; | |||||
| conv_param->pad_u_ = conv_param->pad_l_ = 0; | |||||
| conv_param->act_type_ = ActType_No; | |||||
| return out_t->ElementsNum(); | return out_t->ElementsNum(); | ||||
| } | } | ||||
| @@ -47,8 +47,8 @@ void InitConvDwParam(ConvParameter *conv_param) { | |||||
| conv_param->dilation_h_ = 1; | conv_param->dilation_h_ = 1; | ||||
| conv_param->dilation_w_ = 1; | conv_param->dilation_w_ = 1; | ||||
| conv_param->pad_h_ = 1; | |||||
| conv_param->pad_w_ = 1; | |||||
| conv_param->pad_u_ = 1; | |||||
| conv_param->pad_l_ = 1; | |||||
| } | } | ||||
| void InitConvDwCreator(std::vector<lite::tensor::Tensor *> *inputs, std::vector<lite::tensor::Tensor *> *outputs, | void InitConvDwCreator(std::vector<lite::tensor::Tensor *> *inputs, std::vector<lite::tensor::Tensor *> *outputs, | ||||
| @@ -468,7 +468,7 @@ int DeConvTestInit1(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<li | |||||
| conv_param->kernel_h_ = conv_param->kernel_w_ = 3; | conv_param->kernel_h_ = conv_param->kernel_w_ = 3; | ||||
| conv_param->stride_h_ = conv_param->stride_w_ = 2; | conv_param->stride_h_ = conv_param->stride_w_ = 2; | ||||
| conv_param->dilation_h_ = conv_param->dilation_w_ = 1; | conv_param->dilation_h_ = conv_param->dilation_w_ = 1; | ||||
| conv_param->pad_h_ = conv_param->pad_w_ = 1; | |||||
| conv_param->pad_u_ = conv_param->pad_l_ = 1; | |||||
| return out_t->ElementsNum(); | return out_t->ElementsNum(); | ||||
| } | } | ||||
| @@ -537,7 +537,7 @@ int DeConvTestInit2(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<li | |||||
| conv_param->kernel_h_ = conv_param->kernel_w_ = 3; | conv_param->kernel_h_ = conv_param->kernel_w_ = 3; | ||||
| conv_param->stride_h_ = conv_param->stride_w_ = 2; | conv_param->stride_h_ = conv_param->stride_w_ = 2; | ||||
| conv_param->dilation_h_ = conv_param->dilation_w_ = 1; | conv_param->dilation_h_ = conv_param->dilation_w_ = 1; | ||||
| conv_param->pad_h_ = conv_param->pad_w_ = 1; | |||||
| conv_param->pad_u_ = conv_param->pad_l_ = 1; | |||||
| return out_t->ElementsNum(); | return out_t->ElementsNum(); | ||||
| } | } | ||||
| @@ -616,7 +616,7 @@ int DeConvTestInit3(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<li | |||||
| conv_param->kernel_h_ = conv_param->kernel_w_ = 2; | conv_param->kernel_h_ = conv_param->kernel_w_ = 2; | ||||
| conv_param->stride_h_ = conv_param->stride_w_ = 3; | conv_param->stride_h_ = conv_param->stride_w_ = 3; | ||||
| conv_param->dilation_h_ = conv_param->dilation_w_ = 2; | conv_param->dilation_h_ = conv_param->dilation_w_ = 2; | ||||
| conv_param->pad_h_ = conv_param->pad_w_ = 0; | |||||
| conv_param->pad_u_ = conv_param->pad_l_ = 0; | |||||
| return out_t->ElementsNum(); | return out_t->ElementsNum(); | ||||
| } | } | ||||
| @@ -685,8 +685,8 @@ int DeConvTestInit4(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<li | |||||
| conv_param->kernel_h_ = conv_param->kernel_w_ = 3; | conv_param->kernel_h_ = conv_param->kernel_w_ = 3; | ||||
| conv_param->stride_h_ = conv_param->stride_w_ = 1; | conv_param->stride_h_ = conv_param->stride_w_ = 1; | ||||
| conv_param->dilation_h_ = conv_param->dilation_w_ = 1; | conv_param->dilation_h_ = conv_param->dilation_w_ = 1; | ||||
| conv_param->pad_h_ = conv_param->pad_w_ = 0; | |||||
| conv_param->is_relu_ = conv_param->is_relu6_ = false; | |||||
| conv_param->pad_u_ = conv_param->pad_l_ = 0; | |||||
| conv_param->act_type_ = ActType_No; | |||||
| return out_t->ElementsNum(); | return out_t->ElementsNum(); | ||||
| } | } | ||||
| @@ -52,12 +52,11 @@ void InitConvParamGroup1FP32(ConvParameter *conv_param) { | |||||
| conv_param->dilation_h_ = 1; | conv_param->dilation_h_ = 1; | ||||
| conv_param->dilation_w_ = 1; | conv_param->dilation_w_ = 1; | ||||
| conv_param->pad_h_ = 1; | |||||
| conv_param->pad_w_ = 1; | |||||
| conv_param->pad_u_ = 1; | |||||
| conv_param->pad_l_ = 1; | |||||
| conv_param->group_ = 1; | conv_param->group_ = 1; | ||||
| conv_param->is_relu_ = false; | |||||
| conv_param->is_relu6_ = false; | |||||
| conv_param->act_type_ = ActType_No; | |||||
| conv_param->thread_num_ = 1; | conv_param->thread_num_ = 1; | ||||
| } | } | ||||
| @@ -34,7 +34,7 @@ TEST_F(TestConv1x1Int8, Input1x1PrePack1) { | |||||
| conv_param->input_h_ = conv_param->input_w_ = 3; | conv_param->input_h_ = conv_param->input_w_ = 3; | ||||
| conv_param->output_h_ = conv_param->output_w_ = 3; | conv_param->output_h_ = conv_param->output_w_ = 3; | ||||
| conv_param->stride_h_ = conv_param->stride_w_ = 2; | conv_param->stride_h_ = conv_param->stride_w_ = 2; | ||||
| conv_param->pad_h_ = conv_param->pad_w_ = 1; | |||||
| conv_param->pad_u_ = conv_param->pad_l_ = 1; | |||||
| int8_t in[] = {4, 13, -3, 16, 19, 8, 19, -6, -2, -9, 9, 18, 23, 8, 47, -14, 15, 4, | int8_t in[] = {4, 13, -3, 16, 19, 8, 19, -6, -2, -9, 9, 18, 23, 8, 47, -14, 15, 4, | ||||
| -0, 37, -0, 6, 0, -1, 37, 13, 11, 1, -1, 41, 9, 14, 3, 0, 8, 9, | -0, 37, -0, 6, 0, -1, 37, 13, 11, 1, -1, 41, 9, 14, 3, 0, 8, 9, | ||||
| 14, -14, -8, -8, -8, 7, 19, 17, 13, 3, 9, 18, -1, -0, 18, 0, 4, -2}; | 14, -14, -8, -8, -8, 7, 19, 17, 13, 3, 9, 18, -1, -0, 18, 0, 4, -2}; | ||||
| @@ -61,7 +61,7 @@ TEST_F(TestConv1x1Int8, Input1x1PrePack2) { | |||||
| conv_param->output_h_ = 4; | conv_param->output_h_ = 4; | ||||
| conv_param->output_w_ = 5; | conv_param->output_w_ = 5; | ||||
| conv_param->stride_h_ = conv_param->stride_w_ = 4; | conv_param->stride_h_ = conv_param->stride_w_ = 4; | ||||
| conv_param->pad_h_ = conv_param->pad_w_ = 2; | |||||
| conv_param->pad_u_ = conv_param->pad_l_ = 2; | |||||
| int8_t out[20] = {0}; | int8_t out[20] = {0}; | ||||
| Conv1x1InputPack(in, out, conv_param, sizeof(int8_t)); | Conv1x1InputPack(in, out, conv_param, sizeof(int8_t)); | ||||
| @@ -111,8 +111,8 @@ int Conv1x1Int8TestInit1_perchannel(std::vector<lite::tensor::Tensor *> *inputs_ | |||||
| conv_param->kernel_h_ = conv_param->kernel_w_ = 1; | conv_param->kernel_h_ = conv_param->kernel_w_ = 1; | ||||
| conv_param->stride_h_ = conv_param->stride_w_ = 1; | conv_param->stride_h_ = conv_param->stride_w_ = 1; | ||||
| conv_param->dilation_h_ = conv_param->dilation_w_ = 1; | conv_param->dilation_h_ = conv_param->dilation_w_ = 1; | ||||
| conv_param->pad_h_ = conv_param->pad_w_ = 0; | |||||
| conv_param->is_relu_ = conv_param->is_relu6_ = false; | |||||
| conv_param->pad_u_ = conv_param->pad_l_ = 0; | |||||
| conv_param->act_type_ = ActType_No; | |||||
| return out_t->ElementsNum(); | return out_t->ElementsNum(); | ||||
| } | } | ||||
| @@ -178,8 +178,8 @@ int Conv1x1Int8TestInit1(std::vector<lite::tensor::Tensor *> *inputs_, std::vect | |||||
| conv_param->kernel_h_ = conv_param->kernel_w_ = 1; | conv_param->kernel_h_ = conv_param->kernel_w_ = 1; | ||||
| conv_param->stride_h_ = conv_param->stride_w_ = 1; | conv_param->stride_h_ = conv_param->stride_w_ = 1; | ||||
| conv_param->dilation_h_ = conv_param->dilation_w_ = 1; | conv_param->dilation_h_ = conv_param->dilation_w_ = 1; | ||||
| conv_param->pad_h_ = conv_param->pad_w_ = 0; | |||||
| conv_param->is_relu_ = conv_param->is_relu6_ = false; | |||||
| conv_param->pad_u_ = conv_param->pad_l_ = 0; | |||||
| conv_param->act_type_ = ActType_No; | |||||
| return out_t->ElementsNum(); | return out_t->ElementsNum(); | ||||
| } | } | ||||
| @@ -253,8 +253,8 @@ int Conv1x1Int8TestInit2(std::vector<lite::tensor::Tensor *> *inputs_, std::vect | |||||
| conv_param->kernel_h_ = conv_param->kernel_w_ = 1; | conv_param->kernel_h_ = conv_param->kernel_w_ = 1; | ||||
| conv_param->stride_h_ = conv_param->stride_w_ = 1; | conv_param->stride_h_ = conv_param->stride_w_ = 1; | ||||
| conv_param->dilation_h_ = conv_param->dilation_w_ = 1; | conv_param->dilation_h_ = conv_param->dilation_w_ = 1; | ||||
| conv_param->pad_h_ = conv_param->pad_w_ = 0; | |||||
| conv_param->is_relu_ = conv_param->is_relu6_ = false; | |||||
| conv_param->pad_u_ = conv_param->pad_l_ = 0; | |||||
| conv_param->act_type_ = ActType_No; | |||||
| return out_t->ElementsNum(); | return out_t->ElementsNum(); | ||||
| } | } | ||||
| @@ -343,7 +343,7 @@ int DeConvInt8TestInit1(std::vector<lite::tensor::Tensor *> *inputs_, std::vecto | |||||
| PackNCHWToNHWCInt8(co_nchw, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel()); | PackNCHWToNHWCInt8(co_nchw, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel()); | ||||
| conv_param->kernel_h_ = conv_param->kernel_w_ = 3; | conv_param->kernel_h_ = conv_param->kernel_w_ = 3; | ||||
| conv_param->pad_h_ = conv_param->pad_w_ = 1; | |||||
| conv_param->pad_u_ = conv_param->pad_l_ = 1; | |||||
| conv_param->stride_h_ = conv_param->stride_w_ = 2; | conv_param->stride_h_ = conv_param->stride_w_ = 2; | ||||
| conv_param->dilation_h_ = conv_param->dilation_w_ = 1; | conv_param->dilation_h_ = conv_param->dilation_w_ = 1; | ||||
| return out_t->ElementsNum(); | return out_t->ElementsNum(); | ||||
| @@ -119,8 +119,8 @@ void RunTestCase(const std::vector<int> shape, const std::vector<std::string> fi | |||||
| opParameter->kernel_w_ = kw; | opParameter->kernel_w_ = kw; | ||||
| opParameter->stride_h_ = 2; | opParameter->stride_h_ = 2; | ||||
| opParameter->stride_w_ = 2; | opParameter->stride_w_ = 2; | ||||
| opParameter->pad_h_ = pad; | |||||
| opParameter->pad_w_ = pad; | |||||
| opParameter->pad_u_ = pad; | |||||
| opParameter->pad_l_ = pad; | |||||
| opParameter->input_channel_ = ci; | opParameter->input_channel_ = ci; | ||||
| opParameter->output_channel_ = co; | opParameter->output_channel_ = co; | ||||
| auto op_kernel_ptr = std::make_unique<kernel::Conv2dTransposeOpenCLKernel>( | auto op_kernel_ptr = std::make_unique<kernel::Conv2dTransposeOpenCLKernel>( | ||||
| @@ -169,8 +169,8 @@ TEST_F(TestConvolutionDwOpenCL, NoPadNC4HW4Fp32) { | |||||
| conv_param->stride_w_ = 1; | conv_param->stride_w_ = 1; | ||||
| conv_param->dilation_h_ = 1; | conv_param->dilation_h_ = 1; | ||||
| conv_param->dilation_w_ = 1; | conv_param->dilation_w_ = 1; | ||||
| conv_param->pad_h_ = 0; | |||||
| conv_param->pad_w_ = 0; | |||||
| conv_param->pad_u_ = 0; | |||||
| conv_param->pad_l_ = 0; | |||||
| } | } | ||||
| // nhwc | // nhwc | ||||
| @@ -214,8 +214,8 @@ TEST_F(TestConvolutionDwOpenCL, PadNC4HW4Fp32) { | |||||
| conv_param->stride_w_ = 1; | conv_param->stride_w_ = 1; | ||||
| conv_param->dilation_h_ = 1; | conv_param->dilation_h_ = 1; | ||||
| conv_param->dilation_w_ = 1; | conv_param->dilation_w_ = 1; | ||||
| conv_param->pad_h_ = 1; | |||||
| conv_param->pad_w_ = 1; | |||||
| conv_param->pad_u_ = 1; | |||||
| conv_param->pad_l_ = 1; | |||||
| } | } | ||||
| // nhwc | // nhwc | ||||
| @@ -286,8 +286,8 @@ TEST_F(TestConvolutionDwOpenCL, NoPadNHWC4Fp32) { | |||||
| conv_param->stride_w_ = 1; | conv_param->stride_w_ = 1; | ||||
| conv_param->dilation_h_ = 1; | conv_param->dilation_h_ = 1; | ||||
| conv_param->dilation_w_ = 1; | conv_param->dilation_w_ = 1; | ||||
| conv_param->pad_h_ = 0; | |||||
| conv_param->pad_w_ = 0; | |||||
| conv_param->pad_u_ = 0; | |||||
| conv_param->pad_l_ = 0; | |||||
| } | } | ||||
| // nhwc | // nhwc | ||||
| @@ -331,8 +331,8 @@ TEST_F(TestConvolutionDwOpenCL, PadNHWC4Fp32) { | |||||
| conv_param->stride_w_ = 1; | conv_param->stride_w_ = 1; | ||||
| conv_param->dilation_h_ = 1; | conv_param->dilation_h_ = 1; | ||||
| conv_param->dilation_w_ = 1; | conv_param->dilation_w_ = 1; | ||||
| conv_param->pad_h_ = 1; | |||||
| conv_param->pad_w_ = 1; | |||||
| conv_param->pad_u_ = 1; | |||||
| conv_param->pad_l_ = 1; | |||||
| } | } | ||||
| // nhwc | // nhwc | ||||
| @@ -405,8 +405,8 @@ TEST_F(TestConvolutionDwOpenCL, ConvDwNoPadFp32) { | |||||
| conv_param->stride_w_ = 1; | conv_param->stride_w_ = 1; | ||||
| conv_param->dilation_h_ = 1; | conv_param->dilation_h_ = 1; | ||||
| conv_param->dilation_w_ = 1; | conv_param->dilation_w_ = 1; | ||||
| conv_param->pad_h_ = 0; | |||||
| conv_param->pad_w_ = 0; | |||||
| conv_param->pad_u_ = 0; | |||||
| conv_param->pad_l_ = 0; | |||||
| } | } | ||||
| // nhwc | // nhwc | ||||
| @@ -529,8 +529,8 @@ TEST_F(TestConvolutionDwOpenCL, ConvDwPadFp32) { | |||||
| conv_param->stride_w_ = 1; | conv_param->stride_w_ = 1; | ||||
| conv_param->dilation_h_ = 1; | conv_param->dilation_h_ = 1; | ||||
| conv_param->dilation_w_ = 1; | conv_param->dilation_w_ = 1; | ||||
| conv_param->pad_h_ = 1; | |||||
| conv_param->pad_w_ = 1; | |||||
| conv_param->pad_u_ = 1; | |||||
| conv_param->pad_l_ = 1; | |||||
| } | } | ||||
| // nhwc | // nhwc | ||||
| @@ -724,8 +724,8 @@ TEST_F(TestConvolutionDwOpenCL, ProfilingMobilenetv2) { | |||||
| conv_param->kernel_w_ = filter_shape[i][2]; | conv_param->kernel_w_ = filter_shape[i][2]; | ||||
| conv_param->stride_h_ = conv_param->output_h_ / conv_param->input_h_; | conv_param->stride_h_ = conv_param->output_h_ / conv_param->input_h_; | ||||
| conv_param->stride_w_ = conv_param->output_w_ / conv_param->input_w_; | conv_param->stride_w_ = conv_param->output_w_ / conv_param->input_w_; | ||||
| conv_param->pad_h_ = (conv_param->kernel_h_ - 1) / 2; | |||||
| conv_param->pad_w_ = (conv_param->kernel_w_ - 1) / 2; | |||||
| conv_param->pad_u_ = (conv_param->kernel_h_ - 1) / 2; | |||||
| conv_param->pad_l_ = (conv_param->kernel_w_ - 1) / 2; | |||||
| conv_param->dilation_h_ = 1; | conv_param->dilation_h_ = 1; | ||||
| conv_param->dilation_w_ = 1; | conv_param->dilation_w_ = 1; | ||||
| } | } | ||||
| @@ -774,8 +774,8 @@ TEST_F(TestConvolutionDwOpenCL, Buffer2Image) { | |||||
| conv_param->kernel_w_ = filter_shape[2]; | conv_param->kernel_w_ = filter_shape[2]; | ||||
| conv_param->stride_h_ = conv_param->output_h_ / conv_param->input_h_; | conv_param->stride_h_ = conv_param->output_h_ / conv_param->input_h_; | ||||
| conv_param->stride_w_ = conv_param->output_w_ / conv_param->input_w_; | conv_param->stride_w_ = conv_param->output_w_ / conv_param->input_w_; | ||||
| conv_param->pad_h_ = (conv_param->kernel_h_ - 1) / 2; | |||||
| conv_param->pad_w_ = (conv_param->kernel_w_ - 1) / 2; | |||||
| conv_param->pad_u_ = (conv_param->kernel_h_ - 1) / 2; | |||||
| conv_param->pad_l_ = (conv_param->kernel_w_ - 1) / 2; | |||||
| conv_param->dilation_h_ = 1; | conv_param->dilation_h_ = 1; | ||||
| conv_param->dilation_w_ = 1; | conv_param->dilation_w_ = 1; | ||||
| } | } | ||||