diff --git a/mindspore/lite/nnacl/assembly/arm32/MatVecMulFp32.S b/mindspore/lite/nnacl/assembly/arm32/MatVecMulFp32.S
new file mode 100644
index 0000000000..c06301c941
--- /dev/null
+++ b/mindspore/lite/nnacl/assembly/arm32/MatVecMulFp32.S
@@ -0,0 +1,185 @@
+#ifdef __arm__
+#ifndef __aarch64__
+
+.text
+.align 5
+.global MatVecMulFp32
+#ifndef __APPLE__
+.type MatVecMulFp32, %function
+#endif
+
+// void MatVecMulFp32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col)
+// r0: a
+// r1: b
+// r2: c
+// r3: bias
+// r4: act_type
+// r5: depth
+// r6: col
+
+MatVecMulFp32:
+    // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
+    push {r0-r8, r10, r11, lr}
+    add sp, sp, #48
+
+    ldr r4, [sp]
+    ldr r5, [sp, #4]
+    ldr r6, [sp, #8]
+
+    mov r10, #4
+    mul r10, r10, r5    // stride = depth * sizeof(float)
+    mov r11, #4
+    mul r11, r11, r10   // stride x 4
+
+    cmp r6, #4
+    blt Col1Loop
+
+Col4Loop:
+    mov r7, r0    // reload a(vector) ptr
+    mov r9, r1    // reload b(matrix) ptr
+    mov r8, r5    // reload depth value
+
+    veor q9, q9, q9
+    veor q10, q10, q10
+    veor q11, q11, q11
+    veor q12, q12, q12
+    veor q15, q15, q15
+
+    cmp r8, #4
+    blt Col4Depth1
+
+    Col4Depth4:
+        vld1.f32 {q8}, [r7]!
+        add lr, r9, r10
+        vld1.f32 {q0}, [r9]!
+        vld1.f32 {q1}, [lr], r10
+        vld1.f32 {q2}, [lr], r10
+        vld1.f32 {q3}, [lr]
+
+        vmla.f32 q9, q8, q0
+        vmla.f32 q10, q8, q1
+        vmla.f32 q11, q8, q2
+        vmla.f32 q12, q8, q3
+        sub r8, r8, #4
+        cmp r8, #4
+        bge Col4Depth4
+
+    vpadd.f32 d26, d18, d20
+    vpadd.f32 d27, d19, d21
+    vpadd.f32 d28, d22, d24
+    vpadd.f32 d29, d23, d25
+    vadd.f32 d30, d26, d27
+    vadd.f32 d31, d28, d29
+    cmp r8, #0
+    beq Col4End
+
+    Col4Depth1:
+        vld1.f32 {d0[0]}, [r7]!
+        add lr, r9, r10
+        vld1.f32 {d2[0]}, [r9]!
+        vld1.f32 {d2[1]}, [lr], r10
+        vld1.f32 {d3[0]}, [lr], r10
+        vld1.f32 {d3[1]}, [lr]
+
+        vmla.f32 q15, q1, d0[0]
+        subs r8, r8, #1
+        bne Col4Depth1
+
+    Col4End:
+        cmp r3, #0
+        beq Col4Activation
+        vld1.f32 {q13}, [r3]!
+        vadd.f32 q15, q15, q13
+
+    Col4Activation:
+        cmp r4, #3
+        beq Col4Relu6
+        cmp r4, #1
+        beq Col4Relu
+        b Col4Write
+
+    Col4Relu6:
+        vmov.i32 q12, #6
+        vcvt.f32.s32 q12, q12
+        vmin.f32 q15, q15, q12
+
+    Col4Relu:
+        veor q13, q13, q13
+        vmax.f32 q15, q15, q13
+
+    Col4Write:
+        vst1.f32 {q15}, [r2]!
+        subs r6, r6, #4
+        beq End
+        add r1, r1, r11
+        cmp r6, #4
+        bge Col4Loop
+
+Col1Loop:
+    mov r7, r0    // reload a(vector) ptr
+    mov r9, r1    // reload b(matrix) ptr
+    mov r8, r5    // reload depth value
+    veor q10, q10, q10
+    veor q13, q13, q13
+    veor q15, q15, q15
+
+    cmp r8, #4
+    blt Col1Depth1
+
+    Col1Depth4:
+        vld1.f32 {q0}, [r7]!
+        vld1.f32 {q1}, [r9]!
+
+        vmla.f32 q10, q1, q0
+        sub r8, r8, #4
+        cmp r8, #4
+        bge Col1Depth4
+
+    vpadd.f32 d24, d20, d22
+    vpadd.f32 d25, d21, d23
+    vadd.f32 d30, d24, d25
+    cmp r8, #0
+    beq Col1End
+
+    Col1Depth1:
+        vld1.f32 {d0[0]}, [r7]!
+        vld1.f32 {d2[0]}, [r9]!
+
+        vmla.f32 d30, d2, d0[0]
+        subs r8, r8, #1
+        bne Col1Depth1
+
+    Col1End:
+        cmp r3, #0
+        beq Col1Activation
+        vld1.f32 {d28[0]}, [r3]!
+        vadd.f32 d30, d30, d28
+
+    Col1Activation:
+        cmp r4, #3
+        beq Col1Relu6
+        cmp r4, #1
+        beq Col1Relu
+        b Col1Write
+
+    Col1Relu6:
+        vmov.i32 d26, #6
+        vcvt.f32.s32 d26, d26
+        vmin.f32 d30, d30, d26
+
+    Col1Relu:
+        veor d24, d24, d24
+        vmax.f32 d30, d30, d24
+
+    Col1Write:
+        vst1.f32 {d30[0]}, [r2]!
+        subs r6, r6, #1
+        beq End
+        add r1, r1, r10
+        b Col1Loop
+
+End:
+    sub sp, sp, #48
+    pop {r0-r8, r10, r11, pc}
+#endif
+#endif
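For readers of the new kernel, here is a plain-C model of the computation it implements (an illustrative sketch only, not part of the patch; the helper name MatVecMulFp32Ref is hypothetical). Each of the col outputs is the dot product of the depth-long input vector with one matrix row, plus an optional bias, followed by the activation selected by act_type (1 = ReLU, 3 = ReLU6, matching the cmp r4 checks above; note Col4Relu6 falls through into Col4Relu, so ReLU6 clamps on both sides):

#include <math.h>
#include <stddef.h>

// Scalar model of MatVecMulFp32 (illustrative). b is assumed to hold `col`
// rows of `depth` floats each, i.e. the depth * sizeof(float) row stride
// computed into r10 in the assembly above.
static void MatVecMulFp32Ref(const float *a, const float *b, float *c, const float *bias,
                             int act_type, int depth, int col) {
  for (int ci = 0; ci < col; ++ci) {
    float sum = (bias != NULL) ? bias[ci] : 0.0f;  // bias is skipped when r3 == 0
    for (int di = 0; di < depth; ++di) {
      sum += a[di] * b[ci * depth + di];
    }
    if (act_type == 3) {
      sum = fminf(sum, 6.0f);  // ReLU6: clamp above at six, then fall through to ReLU
    }
    if (act_type == 1 || act_type == 3) {
      sum = fmaxf(sum, 0.0f);  // ReLU: clamp below at zero
    }
    c[ci] = sum;
  }
}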
diff --git a/mindspore/lite/nnacl/assembly/arm64/MatVecMulFp32.S b/mindspore/lite/nnacl/assembly/arm64/MatVecMulFp32.S
index 09de97596b..228dc50245 100644
--- a/mindspore/lite/nnacl/assembly/arm64/MatVecMulFp32.S
+++ b/mindspore/lite/nnacl/assembly/arm64/MatVecMulFp32.S
@@ -1,12 +1,12 @@
 #ifdef __aarch64__
     .text
     .align 5
-    .global MatVecMulFp32Neon64
+    .global MatVecMulFp32
 #ifndef __APPLE__
-    .type MatVecMulFp32Neon64, %function
+    .type MatVecMulFp32, %function
 #endif
 
-// void MatVecMulFp32Neon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col)
+// void MatVecMulFp32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col)
 // x0: a
 // x1: b
 // x2: c
@@ -15,7 +15,7 @@
 // w5: depth
 // w6: col
 
-MatVecMulFp32Neon64:
+MatVecMulFp32:
     sub sp, sp, #128
     st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
     st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
diff --git a/mindspore/lite/nnacl/fp32/matmul_fp32.c b/mindspore/lite/nnacl/fp32/matmul_fp32.c
index 66846b9d82..760e71b113 100644
--- a/mindspore/lite/nnacl/fp32/matmul_fp32.c
+++ b/mindspore/lite/nnacl/fp32/matmul_fp32.c
@@ -682,8 +682,8 @@ void MatMulOpt(const float *a, const float *b, float *c, const float *bias, ActT
 }
 
 void MatVecMul(const float *a, const float *b, float *c, const float *bias, ActType act_type, int depth, int col) {
-#ifdef ENABLE_ARM64
-  MatVecMulFp32Neon64(a, b, c, bias, (int)act_type, depth, col);
+#ifdef ENABLE_ARM
+  MatVecMulFp32(a, b, c, bias, (int)act_type, depth, col);
 #endif
 }
 
diff --git a/mindspore/lite/nnacl/fp32/matmul_fp32.h b/mindspore/lite/nnacl/fp32/matmul_fp32.h
index ec78ed56b0..b9c5dba47c 100644
--- a/mindspore/lite/nnacl/fp32/matmul_fp32.h
+++ b/mindspore/lite/nnacl/fp32/matmul_fp32.h
@@ -36,12 +36,14 @@ void RowMajor2Row12Major(const float *src_ptr, float *dst_ptr, int row, int col)
 void RowMajor2Col4Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col);
 void RowMajor2Col8Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col);
 void RowMajor2Col12Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col);
+#ifdef ENABLE_ARM
+void MatVecMulFp32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col);
+#endif
 #ifdef ENABLE_ARM64
 void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row,
                        int col, size_t stride, size_t writeNhwc, size_t WriteWino);
 void MatmulFloatNeon64Opt(const float *a, const float *b, float *c, const float *bias, int act_type, int depth,
                           int row, int col, size_t stride, size_t write_mode);
-void MatVecMulFp32Neon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col);
 #elif ENABLE_ARM32
 void MatmulFloatNeon32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row,
                        int col, int stride, size_t writeNhwc, size_t WriteWino);
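With the declaration unified under ENABLE_ARM, both 32-bit and 64-bit builds reach the assembly through the same MatVecMul wrapper. A minimal call-site sketch follows (shapes and values are illustrative; ActType_Relu is the existing nnacl activation enum, whose value 1 matches the kernel's act_type check):

#include "nnacl/fp32/matmul_fp32.h"

void Example(void) {
  float a[16] = {0};      // input vector, depth = 16
  float b[8 * 16] = {0};  // weight matrix: col = 8 rows of depth floats each
  float bias[8] = {0};
  float c[8] = {0};
  // Dispatches to the arm32 or arm64 MatVecMulFp32 kernel on ARM builds.
  MatVecMul(a, b, c, bias, ActType_Relu, 16, 8);
}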
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection_fp32.cc
index 04d02b1f1f..abf5149080 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection_fp32.cc
@@ -58,7 +58,7 @@ int FullconnectionCPUKernel::ReSize() {
   thread_count_ = MSMIN(thread_count_, UP_DIV(fc_param_->col_8_, 8));
   thread_stride_ = UP_DIV(UP_DIV(fc_param_->col_8_, 8), thread_count_);
 
-#ifdef ENABLE_ARM64
+#ifdef ENABLE_ARM
   if (fc_param_->row_ == 1) {
     is_vector_input_ = true;
   } else {
@@ -76,19 +76,15 @@ int FullconnectionCPUKernel::ReSize() {
   }
 
 #if defined(ENABLE_ARM32) || defined(ENABLE_X86_64_SSE)
-  a_pack_ptr_ = reinterpret_cast<float *>(malloc(fc_param_->row_4_ * fc_param_->deep_ * sizeof(float)));
-  if (a_pack_ptr_ == nullptr) {
-    return RET_MEMORY_FAILED;
-  }
-  memset(a_pack_ptr_, 0, fc_param_->row_4_ * fc_param_->deep_ * sizeof(float));
+  int row_tmp = is_vector_input_ ? 1 : fc_param_->row_4_;
 #else
   int row_tmp = is_vector_input_ ? 1 : fc_param_->row_12_;
+#endif
   a_pack_ptr_ = reinterpret_cast<float *>(malloc(row_tmp * fc_param_->deep_ * sizeof(float)));
   if (a_pack_ptr_ == nullptr) {
    return RET_MEMORY_FAILED;
   }
   memset(a_pack_ptr_, 0, row_tmp * fc_param_->deep_ * sizeof(float));
-#endif
 
   int col_tmp = is_vector_input_ ? fc_param_->col_ : fc_param_->col_8_;
   b_pack_ptr_ = reinterpret_cast<float *>(malloc(col_tmp * fc_param_->deep_ * sizeof(float)));
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.cc
index 8934414480..f4637b402f 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.cc
@@ -66,9 +66,11 @@ int MatmulCPUKernel::MallocMatrixABuffer() {
   }
   params_->batch = batch;
   params_->row_ = params_->a_transpose_ ? a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2];
-#ifdef ENABLE_ARM64
+#ifdef ENABLE_ARM
   if (params_->a_init_shape_ && params_->row_ == 1) {
     is_vector_a_ = true;
+  } else {
+    is_vector_a_ = false;
   }
 #endif
   params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1];
@@ -76,18 +78,10 @@
   params_->row_12_ = UP_ROUND(params_->row_, C12NUM);
 #if defined(ENABLE_ARM32) || defined(ENABLE_X86_64_SSE)
-  if (params_->a_const_) {
-    a_pack_ptr_ = reinterpret_cast<float *>(malloc(params_->batch * params_->row_4_ * params_->deep_ * sizeof(float)));
-  } else {
-    a_pack_ptr_ = reinterpret_cast<float *>(
-      context_->allocator->Malloc(params_->batch * params_->row_4_ * params_->deep_ * sizeof(float)));
-  }
-  if (a_pack_ptr_ == nullptr) {
-    FreeTmpBuffer();
-    return RET_MEMORY_FAILED;
-  }
+  int row_tmp = is_vector_a_ ? 1 : params_->row_4_;
 #else
   int row_tmp = is_vector_a_ ? 1 : params_->row_12_;
+#endif
   if (params_->a_const_) {
     a_pack_ptr_ = reinterpret_cast<float *>(malloc(params_->batch * row_tmp * params_->deep_ * sizeof(float)));
   } else {
     a_pack_ptr_ = reinterpret_cast<float *>(
@@ -98,7 +92,7 @@
     FreeTmpBuffer();
     return RET_MEMORY_FAILED;
   }
-#endif
+
   return RET_OK;
 }
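Both kernel files now size the A-pack buffer by the same rule: when the left-hand matrix degenerates to a single row, only one row of depth floats is packed, since the matrix-vector path consumes the input directly. A sketch of that rule (a hypothetical helper for illustration, assuming the row_4_/row_12_ tiling shown above):

#include <stddef.h>

// Hypothetical helper mirroring the row_tmp logic above: a 1 x depth input
// skips row tiling entirely; otherwise the platform tile count applies
// (row_4_ on ENABLE_ARM32/ENABLE_X86_64_SSE builds, row_12_ elsewhere).
static size_t PackedASize(int is_vector, int row_tiled, int depth) {
  int row_tmp = is_vector ? 1 : row_tiled;
  return (size_t)row_tmp * (size_t)depth * sizeof(float);
}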