diff --git a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S index a6622a729a..20cfa58a8c 100644 --- a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S +++ b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S @@ -381,6 +381,8 @@ LoopRow: ldr lr, [sp, #20] cmp lr, #0 beq C8DstStep + cmp lr, #2 + beq WinoDstStep mov lr, #4 ldr r7, [sp, #12] // reload rhs col mul lr, lr, r7 @@ -391,6 +393,10 @@ LoopRow: ldr lr, [sp, #-40] add r2, lr, #128 str r2, [sp, #-40] + b NoDstStep + WinoDstStep: + add r2, r2, r10 + str r2, [sp, #-40] NoDstStep: cmp r6, #4 ble LoopRowEnd diff --git a/mindspore/lite/nnacl/assembly/arm64/bias_add.S b/mindspore/lite/nnacl/assembly/arm64/bias_add.S deleted file mode 100644 index 181de0de72..0000000000 --- a/mindspore/lite/nnacl/assembly/arm64/bias_add.S +++ /dev/null @@ -1,82 +0,0 @@ - -#ifdef __aarch64__ - .text - .align 5 - //.p2align 5,,15 - .global BiasAdd -#ifndef __APPLE__ - .type BiasAdd, %function -#endif - - - -//void BiasAdd(const float* bias, float* data, size_t oc4, size_t plan_size) - -//Auto: x0:bias, x1: data, x2:oc4,x3: plan_size, - -BiasAdd: -cmp x2, #0 -beq BiasAddEnd - -cmp x3, #0 -beq BiasAddEnd - -LoopOc4: -ld1 {v0.4s}, [x0], #16 -mov x6, x3 -mov x5, x1 - -Loop16LineIn: -cmp x6, #4 -blt L4 -sub x6, x6, #4 - -ld1 {v1.4s, v2.4s}, [x5], #32 - -fadd v5.4s, v0.4s, v1.4s -fadd v6.4s, v0.4s, v2.4s -ld1 {v3.4s, v4.4s}, [x5], #32 - -cmp x6, #4 -blt Loop16LineOut - -Loop16: -st1 {v5.4s, v6.4s}, [x1], #32 -fadd v7.4s, v0.4s, v3.4s -fadd v8.4s, v0.4s, v4.4s -ld1 {v1.4s, v2.4s}, [x5], #32 - -st1 {v7.4s, v8.4s}, [x1], #32 -fadd v5.4s, v0.4s, v1.4s -fadd v6.4s, v0.4s, v2.4s -ld1 {v3.4s, v4.4s}, [x5], #32 - -sub x6, x6, #4 -cmp x6, #4 -bge Loop16 - -Loop16LineOut: -st1 {v5.4s, v6.4s}, [x1], #32 -fadd v7.4s, v0.4s, v3.4s -fadd v8.4s, v0.4s, v4.4s - -st1 {v7.4s, v8.4s}, [x1], #32 - -L4: -cmp x6, #0 -beq Loop16LineEnd -Loop4: -ld1 {v1.4s}, [x5], #16 -fadd v2.4s, v1.4s, v0.4s -subs x6, x6, #1 -st1 {v2.4s}, [x1], #16 -bne Loop4 - -Loop16LineEnd: -subs x2, x2, #1 -bne LoopOc4 - -BiasAddEnd: - -ret -#endif diff --git a/mindspore/lite/nnacl/assembly/arm64/bias_add_relu.S b/mindspore/lite/nnacl/assembly/arm64/bias_add_relu.S deleted file mode 100644 index f9e4eccc69..0000000000 --- a/mindspore/lite/nnacl/assembly/arm64/bias_add_relu.S +++ /dev/null @@ -1,94 +0,0 @@ - -#ifdef __aarch64__ - .text - .align 5 - //.p2align 5,,15 - .global BiasAddRelu -#ifndef __APPLE__ - .type BiasAddRelu, %function -#endif - - -//void BiasAddRelu(const float* bias, float* data, size_t oc4, size_t plan_size) - -//Auto: x0:bias, x1: data, x2:oc4,x3: plan_size, - -BiasAddRelu: -cmp x2, #0 -beq BiasAddEnd - -cmp x3, #0 -beq BiasAddEnd - -dup v16.4s, wzr - -LoopOc4: -ld1 {v0.4s}, [x0], #16 -mov x6, x3 -mov x5, x1 - -Loop16LineIn: -cmp x6, #4 -blt L4 -sub x6, x6, #4 - -ld1 {v1.4s, v2.4s}, [x5], #32 - -fadd v21.4s, v0.4s, v1.4s -fadd v22.4s, v0.4s, v2.4s -ld1 {v3.4s, v4.4s}, [x5], #32 - -fmax v23.4s, v21.4s, v16.4s -fmax v24.4s, v22.4s, v16.4s - -cmp x6, #4 -blt Loop16LineOut - -Loop16: -st1 {v23.4s, v24.4s}, [x1], #32 -fadd v25.4s, v0.4s, v3.4s -fadd v26.4s, v0.4s, v4.4s -ld1 {v1.4s, v2.4s}, [x5], #32 - -fmax v27.4s, v25.4s, v16.4s -fmax v28.4s, v26.4s, v16.4s -fadd v21.4s, v0.4s, v1.4s -fadd v22.4s, v0.4s, v2.4s - -st1 {v27.4s, v28.4s}, [x1], #32 -ld1 {v3.4s, v4.4s}, [x5], #32 -fmax v23.4s, v21.4s, v16.4s -fmax v24.4s, v22.4s, v16.4s -sub x6, x6, #4 -cmp x6, #4 -bge Loop16 - -Loop16LineOut: -st1 {v23.4s, v24.4s}, [x1], #32 -fadd v25.4s, v0.4s, v3.4s -fadd v26.4s, v0.4s, v4.4s - -fmax v27.4s, v25.4s, v16.4s -fmax v28.4s, v26.4s, v16.4s -st1 {v27.4s, v28.4s}, [x1], #32 - -L4: -cmp x6, #0 -beq Loop16LineEnd -Loop4: -ld1 {v1.4s}, [x5], #16 -fadd v1.4s, v1.4s, v0.4s -fmax v1.4s, v1.4s, v16.4s - -subs x6, x6, #1 -st1 {v1.4s}, [x1], #16 -bne Loop4 - -Loop16LineEnd: -subs x2, x2, #1 -bne LoopOc4 - -BiasAddEnd: - -ret -#endif diff --git a/mindspore/lite/nnacl/assembly/arm64/bias_add_relu6.S b/mindspore/lite/nnacl/assembly/arm64/bias_add_relu6.S deleted file mode 100644 index 77c563a812..0000000000 --- a/mindspore/lite/nnacl/assembly/arm64/bias_add_relu6.S +++ /dev/null @@ -1,113 +0,0 @@ - -#ifdef __aarch64__ - .text - .align 5 - //.p2align 5,,15 - .global BiasAddRelu6 -#ifndef __APPLE__ - .type BiasAddRelu6, %function -#endif - - - -//void BiasAddRelu6(const float* bias, float* data, size_t oc4, size_t plan_size) - -//Auto: x0:bias, x1: data, x2:oc4,x3: plan_size, - -BiasAddRelu6: -cmp x2, #0 -beq BiasAddEnd - -cmp x3, #0 -beq BiasAddEnd - -dup v16.4s, wzr -movi v17.4s, #6 -scvtf v17.4s, v17.4s - -LoopOc4: -ld1 {v0.4s}, [x0], #16 -mov x6, x3 -mov x5, x1 - -Loop16LineIn: -cmp x6, #4 -blt L4 -sub x6, x6, #4 - -ld1 {v1.4s, v2.4s}, [x5], #32 - -fadd v21.4s, v0.4s, v1.4s -fadd v22.4s, v0.4s, v2.4s -ld1 {v3.4s, v4.4s}, [x5], #32 - -fmax v23.4s, v21.4s, v16.4s -fmax v24.4s, v22.4s, v16.4s - - - -cmp x6, #4 -blt Loop16LineOut - -Loop16: -fmin v23.4s, v23.4s, v17.4s -fmin v24.4s, v24.4s, v17.4s -fadd v25.4s, v0.4s, v3.4s -fadd v26.4s, v0.4s, v4.4s -ld1 {v1.4s, v2.4s}, [x5], #32 - -st1 {v23.4s, v24.4s}, [x1], #32 -fmax v27.4s, v25.4s, v16.4s -fmax v28.4s, v26.4s, v16.4s -fadd v21.4s, v0.4s, v1.4s -fadd v22.4s, v0.4s, v2.4s - -fmin v27.4s, v27.4s, v17.4s -fmin v28.4s, v28.4s, v17.4s -fmax v23.4s, v21.4s, v16.4s -fmax v24.4s, v22.4s, v16.4s -ld1 {v3.4s, v4.4s}, [x5], #32 - -st1 {v27.4s, v28.4s}, [x1], #32 - - -sub x6, x6, #4 -cmp x6, #4 -bge Loop16 - -Loop16LineOut: -fmin v23.4s, v23.4s, v17.4s -fmin v24.4s, v24.4s, v17.4s -fadd v25.4s, v0.4s, v3.4s -fadd v26.4s, v0.4s, v4.4s - -fmax v27.4s, v25.4s, v16.4s -fmax v28.4s, v26.4s, v16.4s -st1 {v23.4s, v24.4s}, [x1], #32 - -fmin v27.4s, v27.4s, v17.4s -fmin v28.4s, v28.4s, v17.4s - -st1 {v27.4s, v28.4s}, [x1], #32 - -L4: -cmp x6, #0 -beq Loop16LineEnd -Loop4: -ld1 {v1.4s}, [x5], #16 -fadd v1.4s, v1.4s, v0.4s -fmax v1.4s, v1.4s, v16.4s -fmin v1.4s, v1.4s, v17.4s - -subs x6, x6, #1 -st1 {v1.4s}, [x1], #16 -bne Loop4 - -Loop16LineEnd: -subs x2, x2, #1 -bne LoopOc4 - -BiasAddEnd: - -ret -#endif diff --git a/mindspore/lite/nnacl/assembly/arm64/relu.S b/mindspore/lite/nnacl/assembly/arm64/relu.S deleted file mode 100644 index 74c40a135b..0000000000 --- a/mindspore/lite/nnacl/assembly/arm64/relu.S +++ /dev/null @@ -1,73 +0,0 @@ - -#ifdef __aarch64__ - .text - .align 5 - //.p2align 5,,15 - .global Relu -#ifndef __APPLE__ - .type Relu, %function -#endif - - -//void Relu(float* data, size_t element4) - -//Auto: x0:data, x1: element4 - -Relu: -cmp x1, #0 -beq ReluEnd - -dup v16.4s, wzr - -mov x5, x0 - -Loop16LineIn: -cmp x1, #4 -blt L4 -sub x1, x1, #4 - -ld1 {v1.4s, v2.4s}, [x5], #32 - -fmax v5.4s, v16.4s, v1.4s -fmax v6.4s, v16.4s, v2.4s -ld1 {v3.4s, v4.4s}, [x5], #32 - -cmp x1, #4 -blt Loop16LineOut - -Loop16: -st1 {v5.4s, v6.4s}, [x0], #32 -fmax v7.4s, v16.4s, v3.4s -fmax v8.4s, v16.4s, v4.4s -ld1 {v1.4s, v2.4s}, [x5], #32 - -st1 {v7.4s, v8.4s}, [x0], #32 -fmax v5.4s, v16.4s, v1.4s -fmax v6.4s, v16.4s, v2.4s -ld1 {v3.4s, v4.4s}, [x5], #32 - -sub x1, x1, #4 -cmp x1, #4 -bge Loop16 - -Loop16LineOut: -st1 {v5.4s, v6.4s}, [x0], #32 -fmax v7.4s, v16.4s, v3.4s -fmax v8.4s, v16.4s, v4.4s - -st1 {v7.4s, v8.4s}, [x0], #32 - -L4: -cmp x1, #0 -beq ReluEnd -Loop4: -ld1 {v1.4s}, [x5], #16 -fmax v2.4s, v16.4s, v0.4s -subs x1, x1, #1 -st1 {v2.4s}, [x0], #16 -bne Loop4 - -ReluEnd: - -ret -#endif diff --git a/mindspore/lite/nnacl/assembly/arm64/relu6.S b/mindspore/lite/nnacl/assembly/arm64/relu6.S deleted file mode 100644 index c1789845ee..0000000000 --- a/mindspore/lite/nnacl/assembly/arm64/relu6.S +++ /dev/null @@ -1,89 +0,0 @@ - -#ifdef __aarch64__ - .text - .align 5 - //.p2align 5,,15 - .global Relu6 -#ifndef __APPLE__ - .type Relu6, %function -#endif - - -//void Relu6(float* data, size_t element4) - -//Auto: x0:data, x1: element4 - -Relu6: -cmp x1, #0 -beq Relu6End - -dup v16.4s, wzr -movi v17.4s, #6 -scvtf v17.4s, v17.4s - -mov x5, x0 - -Loop16LineIn: -cmp x1, #4 -blt L4 -sub x1, x1, #4 - -ld1 {v1.4s, v2.4s}, [x5], #32 - -fmax v21.4s, v1.4s, v16.4s -fmax v22.4s, v2.4s, v16.4s -ld1 {v3.4s, v4.4s}, [x5], #32 - -fmin v23.4s, v21.4s, v17.4s -fmin v24.4s, v22.4s, v17.4s - - -cmp x1, #4 -blt Loop16LineOut - -Loop16: -st1 {v23.4s, v24.4s}, [x0], #32 -fmax v25.4s, v3.4s, v16.4s -fmax v26.4s, v4.4s, v16.4s -ld1 {v1.4s, v2.4s}, [x5], #32 - -fmin v27.4s, v25.4s, v17.4s -fmin v28.4s, v26.4s, v17.4s -fmax v21.4s, v1.4s, v16.4s -fmax v22.4s, v2.4s, v16.4s - -st1 {v27.4s, v28.4s}, [x0], #32 -ld1 {v3.4s, v4.4s}, [x5], #32 -fmin v23.4s, v21.4s, v17.4s -fmin v24.4s, v22.4s, v17.4s - -sub x1, x1, #4 -cmp x1, #4 -bge Loop16 - -Loop16LineOut: -st1 {v23.4s, v24.4s}, [x0], #32 -fmax v25.4s, v3.4s, v16.4s -fmax v26.4s, v4.4s, v16.4s - -fmin v27.4s, v25.4s, v17.4s -fmin v28.4s, v26.4s, v17.4s -st1 {v27.4s, v28.4s}, [x0], #32 - -L4: -cmp x1, #0 -beq Relu6End -Loop4: -ld1 {v1.4s}, [x5], #16 -fmax v1.4s, v1.4s, v16.4s - -fmin v1.4s, v1.4s, v17.4s - -subs x1, x1, #1 -st1 {v1.4s}, [x0], #16 -bne Loop4 - -Relu6End: - -ret -#endif diff --git a/mindspore/lite/nnacl/fp32/conv.c b/mindspore/lite/nnacl/fp32/conv.c index 92643dddb8..3c89acc73d 100644 --- a/mindspore/lite/nnacl/fp32/conv.c +++ b/mindspore/lite/nnacl/fp32/conv.c @@ -81,11 +81,7 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_ int out_w_block = UP_DIV(conv_param->output_w_, out_unit); int out_h_block = UP_DIV(conv_param->output_h_, out_unit); int output_count = out_w_block * out_h_block; -#ifdef ENABLE_ARM32 - const int tile_num = 4; -#else - const int tile_num = 12; -#endif + const int tile_num = C12NUM; int output_tile_count = UP_DIV(output_count, tile_num); int out_channel = conv_param->output_channel_; int oc8 = UP_DIV(out_channel, C8NUM); @@ -117,7 +113,7 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_ float *tmp_col_ptr = col_buffer + task_id * col_buffer_offset; for (int i = 0; i < input_unit_square; ++i) { #ifdef ENABLE_ARM32 - RowMajor2Col4Major(src_ptr + i * C4NUM * in_channel, tmp_col_ptr, C4NUM, in_channel); + RowMajor2Col4Major(src_ptr + i * C12NUM * in_channel, tmp_col_ptr, C12NUM, in_channel); #else RowMajor2Col12Major(src_ptr + i * C12NUM * in_channel, tmp_col_ptr, C12NUM, in_channel); #endif diff --git a/mindspore/lite/nnacl/winograd_transform.c b/mindspore/lite/nnacl/winograd_transform.c index 42967768f9..ed1f8ebea4 100644 --- a/mindspore/lite/nnacl/winograd_transform.c +++ b/mindspore/lite/nnacl/winograd_transform.c @@ -85,11 +85,7 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float * } // interval y loop } // input transform -#ifdef ENABLE_ARM32 - const int tile_num = 4; -#else - const int tile_num = 12; -#endif + const int tile_num = C12NUM; int dst_ic4_offset = dst_plane_offset + ic * C4NUM; size_t dst_step = tile_num * in_channel; float *trans_input_ptr = trans_input + dst_ic4_offset; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc index 925577c64d..47107834e5 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc @@ -184,11 +184,7 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() { int ConvolutionWinogradCPUKernel::InitTmpBuffer() { int channel_out = conv_param_->output_channel_; int oc8 = UP_DIV(channel_out, C8NUM); -#ifdef ENABLE_ARM32 - int tile_num = 4; -#else - int tile_num = 12; -#endif + int tile_num = C12NUM; MS_ASSERT(ctx_->allocator != nullptr); size_t tile_buffer_size =