From: @zhanyuan1
Reviewed-by: @zhang_xue_tong, @zhanghaibo5
Signed-off-by: @zhang_xue_tong
Tag: v1.1.0
@@ -0,0 +1,185 @@
+#ifdef __arm__
+#ifndef __aarch64__
+.text
+.align 5
+.global MatVecMulFp32
+#ifndef __APPLE__
+.type MatVecMulFp32, %function
+#endif
+// void MatVecMulFp32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col)
+// r0: a
+// r1: b
+// r2: c
+// r3: bias
+// r4: act_type
+// r5: depth
+// r6: col
+MatVecMulFp32:
+    // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
+    push {r0-r8, r10, r11, lr}
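+    // 12 registers x 4 bytes = 48: step sp past the save area so the stacked
+    // arguments act_type, depth and col sit at [sp], [sp, #4] and [sp, #8]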
+    add sp, sp, #48
+    ldr r4, [sp]
+    ldr r5, [sp, #4]
+    ldr r6, [sp, #8]
+    mov r10, #4
+    mul r10, r10, r5    // stride = depth * sizeof(float)
+    mov r11, #4
+    mul r11, r11, r10   // stride x 4
+    cmp r6, #4
+    blt Col1Loop
+Col4Loop:
+    mov r7, r0          // reload a (vector) ptr
+    mov r9, r1          // reload b (matrix) ptr
+    mov r8, r5          // reload depth value
+    veor q9, q9, q9
+    veor q10, q10, q10
+    veor q11, q11, q11
+    veor q12, q12, q12
+    veor q15, q15, q15
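+    // q9-q12 accumulate elementwise products for the four current columns;
+    // q15 collects the reduced sums, one lane per output column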
+    cmp r8, #4
+    blt Col4Depth1
+Col4Depth4:
+    vld1.f32 {q8}, [r7]!
+    add lr, r9, r10
+    vld1.f32 {q0}, [r9]!
+    vld1.f32 {q1}, [lr], r10
+    vld1.f32 {q2}, [lr], r10
+    vld1.f32 {q3}, [lr]
+    vmla.f32 q9, q8, q0
+    vmla.f32 q10, q8, q1
+    vmla.f32 q11, q8, q2
+    vmla.f32 q12, q8, q3
+    sub r8, r8, #4
+    cmp r8, #4
+    bge Col4Depth4
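+    // horizontal reduction: pairwise adds fold each accumulator so that
+    // lane i of q15 holds the partial sum of column i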
+    vpadd.f32 d26, d18, d20
+    vpadd.f32 d27, d19, d21
+    vpadd.f32 d28, d22, d24
+    vpadd.f32 d29, d23, d25
+    vadd.f32 d30, d26, d27
+    vadd.f32 d31, d28, d29
+    cmp r8, #0
+    beq Col4End
+Col4Depth1:
+    vld1.f32 {d0[0]}, [r7]!
+    add lr, r9, r10
+    vld1.f32 {d2[0]}, [r9]!
+    vld1.f32 {d2[1]}, [lr], r10
+    vld1.f32 {d3[0]}, [lr], r10
+    vld1.f32 {d3[1]}, [lr]
+    vmla.f32 q15, q1, d0[0]
+    subs r8, r8, #1
+    bne Col4Depth1
+Col4End:
+    cmp r3, #0
+    beq Col4Activation
+    vld1.f32 {q13}, [r3]!
+    vadd.f32 q15, q15, q13
+Col4Activation:
+    cmp r4, #3
+    beq Col4Relu6
+    cmp r4, #1
+    beq Col4Relu
+    b Col4Write
+Col4Relu6:
+    vmov.i32 q12, #6
+    vcvt.f32.s32 q12, q12
+    vmin.f32 q15, q15, q12
+Col4Relu:
+    veor q13, q13, q13
+    vmax.f32 q15, q15, q13
+Col4Write:
+    vst1.f32 {q15}, [r2]!
+    subs r6, r6, #4
+    beq End
+    add r1, r1, r11
+    cmp r6, #4
+    bge Col4Loop
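+    // fewer than four columns remain: fall through into the one-column loop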
+Col1Loop:
+    mov r7, r0          // reload a (vector) ptr
+    mov r9, r1          // reload b (matrix) ptr
+    mov r8, r5          // reload depth value
+    veor q10, q10, q10
+    veor q13, q13, q13
+    veor q15, q15, q15
+    cmp r8, #4
+    blt Col1Depth1
+Col1Depth4:
+    vld1.f32 {q0}, [r7]!
+    vld1.f32 {q1}, [r9]!
+    vmla.f32 q10, q1, q0
+    sub r8, r8, #4
+    cmp r8, #4
+    bge Col1Depth4
+    vpadd.f32 d24, d20, d22
+    vpadd.f32 d25, d21, d23
+    vadd.f32 d30, d24, d25
+    cmp r8, #0
+    beq Col1End
+Col1Depth1:
+    vld1.f32 {d0[0]}, [r7]!
+    vld1.f32 {d2[0]}, [r9]!
+    vmla.f32 d30, d2, d0[0]
+    subs r8, r8, #1
+    bne Col1Depth1
+Col1End:
+    cmp r3, #0
+    beq Col1Activation
+    vld1.f32 {d28[0]}, [r3]!
+    vadd.f32 d30, d30, d28
+Col1Activation:
+    cmp r4, #3
+    beq Col1Relu6
+    cmp r4, #1
+    beq Col1Relu
+    b Col1Write
+Col1Relu6:
+    vmov.i32 d26, #6
+    vcvt.f32.s32 d26, d26
+    vmin.f32 d30, d30, d26
+Col1Relu:
+    veor d24, d24, d24
+    vmax.f32 d30, d30, d24
+Col1Write:
+    vst1.f32 {d30[0]}, [r2]!
+    subs r6, r6, #1
+    beq End
+    add r1, r1, r10
+    b Col1Loop
+End:
+    sub sp, sp, #48
+    pop {r0-r8, r10, r11, pc}
+#endif
+#endif
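For reviewers less fluent in NEON, a scalar C++ sketch of what the kernel above computes follows. The helper name MatVecMulFp32Ref is hypothetical, and the packed layout of b (output column i stored depth-contiguously at b[i * depth]) is inferred from the stride arithmetic above rather than quoted from the repository.

```cpp
#include <algorithm>

// Scalar reference for MatVecMulFp32 (a sketch, not the shipped kernel).
// Assumes b is packed so that output column ci occupies
// b[ci * depth .. ci * depth + depth), matching stride = depth * sizeof(float).
static void MatVecMulFp32Ref(const float *a, const float *b, float *c, const float *bias,
                             int act_type, int depth, int col) {
  for (int ci = 0; ci < col; ++ci) {
    float acc = 0.0f;
    for (int d = 0; d < depth; ++d) {
      acc += a[d] * b[ci * depth + d];  // dot product along the depth axis
    }
    if (bias != nullptr) {
      acc += bias[ci];  // bias is optional: the assembly tests r3 against 0
    }
    if (act_type == 3) {
      acc = std::min(acc, 6.0f);  // Relu6 clamps at 6, then falls through to the Relu max
    }
    if (act_type == 1 || act_type == 3) {
      acc = std::max(acc, 0.0f);  // both Relu and Relu6 floor at 0
    }
    c[ci] = acc;
  }
}
```

For example, with depth = 3, col = 2, a = {1, 2, 3}, column 0 of b = {0.5, 0.5, 0.5}, column 1 = {-1, 0, 1} and bias = {0.1, 0.2}, act_type 1 (Relu) yields c = {3.1, 2.2}.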
@@ -1,12 +1,12 @@
 #ifdef __aarch64__
 .text
 .align 5
-.global MatVecMulFp32Neon64
+.global MatVecMulFp32
 #ifndef __APPLE__
-.type MatVecMulFp32Neon64, %function
+.type MatVecMulFp32, %function
 #endif
-// void MatVecMulFp32Neon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col)
+// void MatVecMulFp32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col)
 // x0: a
 // x1: b
 // x2: c
@@ -15,7 +15,7 @@
 // w5: depth
 // w6: col
-MatVecMulFp32Neon64:
+MatVecMulFp32:
     sub sp, sp, #128
     st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
     st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
@@ -682,8 +682,8 @@ void MatMulOpt(const float *a, const float *b, float *c, const float *bias, ActT
 }
 void MatVecMul(const float *a, const float *b, float *c, const float *bias, ActType act_type, int depth, int col) {
-#ifdef ENABLE_ARM64
-  MatVecMulFp32Neon64(a, b, c, bias, (int)act_type, depth, col);
+#ifdef ENABLE_ARM
+  MatVecMulFp32(a, b, c, bias, (int)act_type, depth, col);
 #endif
 }
@@ -36,12 +36,14 @@ void RowMajor2Row12Major(const float *src_ptr, float *dst_ptr, int row, int col)
 void RowMajor2Col4Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col);
 void RowMajor2Col8Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col);
 void RowMajor2Col12Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col);
+#ifdef ENABLE_ARM
+void MatVecMulFp32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col);
+#endif
 #ifdef ENABLE_ARM64
 void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row,
                        int col, size_t stride, size_t writeNhwc, size_t WriteWino);
 void MatmulFloatNeon64Opt(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row,
                           int col, size_t stride, size_t write_mode);
-void MatVecMulFp32Neon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col);
 #elif ENABLE_ARM32
 void MatmulFloatNeon32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row,
                        int col, int stride, size_t writeNhwc, size_t WriteWino);
@@ -58,7 +58,7 @@ int FullconnectionCPUKernel::ReSize() {
   thread_count_ = MSMIN(thread_count_, UP_DIV(fc_param_->col_8_, 8));
   thread_stride_ = UP_DIV(UP_DIV(fc_param_->col_8_, 8), thread_count_);
-#ifdef ENABLE_ARM64
+#ifdef ENABLE_ARM
   if (fc_param_->row_ == 1) {
     is_vector_input_ = true;
   } else {
@@ -76,19 +76,15 @@ int FullconnectionCPUKernel::ReSize() {
   }
 #if defined(ENABLE_ARM32) || defined(ENABLE_X86_64_SSE)
-  a_pack_ptr_ = reinterpret_cast<float *>(malloc(fc_param_->row_4_ * fc_param_->deep_ * sizeof(float)));
-  if (a_pack_ptr_ == nullptr) {
-    return RET_MEMORY_FAILED;
-  }
-  memset(a_pack_ptr_, 0, fc_param_->row_4_ * fc_param_->deep_ * sizeof(float));
+  int row_tmp = is_vector_input_ ? 1 : fc_param_->row_4_;
 #else
   int row_tmp = is_vector_input_ ? 1 : fc_param_->row_12_;
+#endif
   a_pack_ptr_ = reinterpret_cast<float *>(malloc(row_tmp * fc_param_->deep_ * sizeof(float)));
   if (a_pack_ptr_ == nullptr) {
     return RET_MEMORY_FAILED;
   }
   memset(a_pack_ptr_, 0, row_tmp * fc_param_->deep_ * sizeof(float));
-#endif
   int col_tmp = is_vector_input_ ? fc_param_->col_ : fc_param_->col_8_;
   b_pack_ptr_ = reinterpret_cast<float *>(malloc(col_tmp * fc_param_->deep_ * sizeof(float)));
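Reassembled from the context and added lines above, the post-patch allocation path in FullconnectionCPUKernel::ReSize() reads as one straight-line sequence (a review aid only; surrounding code elided):

```cpp
#if defined(ENABLE_ARM32) || defined(ENABLE_X86_64_SSE)
  int row_tmp = is_vector_input_ ? 1 : fc_param_->row_4_;
#else
  int row_tmp = is_vector_input_ ? 1 : fc_param_->row_12_;
#endif
  // A single allocation now serves both branches; a 1-row pack buffer
  // suffices when the input degenerates to a vector.
  a_pack_ptr_ = reinterpret_cast<float *>(malloc(row_tmp * fc_param_->deep_ * sizeof(float)));
  if (a_pack_ptr_ == nullptr) {
    return RET_MEMORY_FAILED;
  }
  memset(a_pack_ptr_, 0, row_tmp * fc_param_->deep_ * sizeof(float));
```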
@@ -66,9 +66,11 @@ int MatmulCPUKernel::MallocMatrixABuffer() {
   }
   params_->batch = batch;
   params_->row_ = params_->a_transpose_ ? a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2];
-#ifdef ENABLE_ARM64
+#ifdef ENABLE_ARM
   if (params_->a_init_shape_ && params_->row_ == 1) {
     is_vector_a_ = true;
+  } else {
+    is_vector_a_ = false;
   }
 #endif
   params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1];
@@ -76,18 +78,10 @@ int MatmulCPUKernel::MallocMatrixABuffer() {
   params_->row_12_ = UP_ROUND(params_->row_, C12NUM);
 #if defined(ENABLE_ARM32) || defined(ENABLE_X86_64_SSE)
-  if (params_->a_const_) {
-    a_pack_ptr_ = reinterpret_cast<float *>(malloc(params_->batch * params_->row_4_ * params_->deep_ * sizeof(float)));
-  } else {
-    a_pack_ptr_ = reinterpret_cast<float *>(
-      context_->allocator->Malloc(params_->batch * params_->row_4_ * params_->deep_ * sizeof(float)));
-  }
-  if (a_pack_ptr_ == nullptr) {
-    FreeTmpBuffer();
-    return RET_MEMORY_FAILED;
-  }
+  int row_tmp = is_vector_a_ ? 1 : params_->row_4_;
 #else
   int row_tmp = is_vector_a_ ? 1 : params_->row_12_;
+#endif
   if (params_->a_const_) {
     a_pack_ptr_ = reinterpret_cast<float *>(malloc(params_->batch * row_tmp * params_->deep_ * sizeof(float)));
   } else {
@@ -98,7 +92,7 @@ int MatmulCPUKernel::MallocMatrixABuffer() {
     FreeTmpBuffer();
     return RET_MEMORY_FAILED;
   }
-#endif
   return RET_OK;
 }