| @@ -265,7 +265,7 @@ void ArithmeticFP32Coder::ComputeInOutStrides() { | |||||
| } | } | ||||
| } | } | ||||
| void ArithmeticFP32Coder::CollectFilesForFnc(CoderContext *const context) { | |||||
| void ArithmeticFP32Coder::CollectFilesForFunc(CoderContext *const context) { | |||||
| /** | /** | ||||
| * nnacl's operators combine all arithmetic into nnacl/arithmetic.c; | * nnacl's operators combine all arithmetic into nnacl/arithmetic.c; | ||||
| * this solution is not suitable for micro because of the package size. | * this solution is not suitable for micro because of the package size. | ||||
| @@ -332,6 +332,7 @@ int ArithmeticFP32Coder::DoCode(CoderContext *const context) { | |||||
| int count = MSMIN(stride, element_num - stride * kDefaultTaskId); | int count = MSMIN(stride, element_num - stride * kDefaultTaskId); | ||||
| MS_CHECK_TRUE(!arithmetic_run_.empty(), "arithmetic_run function is nullptr!"); | MS_CHECK_TRUE(!arithmetic_run_.empty(), "arithmetic_run function is nullptr!"); | ||||
| NNaclFp32Serializer code; | NNaclFp32Serializer code; | ||||
| CollectFilesForFunc(context); | |||||
| if (arithmetic_parameter_->broadcasting_) { | if (arithmetic_parameter_->broadcasting_) { | ||||
| stride = UP_DIV(outside_, thread_num_); | stride = UP_DIV(outside_, thread_num_); | ||||
| out_count_ = MSMIN(stride, outside_ - stride * kDefaultTaskId); | out_count_ = MSMIN(stride, outside_ - stride * kDefaultTaskId); | ||||
| @@ -85,7 +85,7 @@ class ArithmeticFP32Coder final : public OperatorCoder { | |||||
| int BroadcastRun(const std::string &input0, const std::string &input1, const std::string &output, int dim, | int BroadcastRun(const std::string &input0, const std::string &input1, const std::string &output, int dim, | ||||
| int out_count, int out_thread_stride, NNaclFp32Serializer *const code); | int out_count, int out_thread_stride, NNaclFp32Serializer *const code); | ||||
| void CollectFilesForFnc(CoderContext *const context); | |||||
| void CollectFilesForFunc(CoderContext *const context); | |||||
| int break_pos_{0}; | int break_pos_{0}; | ||||
| @@ -170,7 +170,12 @@ int LstmFP32Coder::DoCode(CoderContext *context) { | |||||
| "lstm_fp32.c", | "lstm_fp32.c", | ||||
| "mul_fp32.c", | "mul_fp32.c", | ||||
| }); | }); | ||||
| if (target_ == kARM32A || target_ == kARM64) { | |||||
| Collect(context, {}, {}, | |||||
| { | |||||
| "MatVecMulFp32.S", | |||||
| }); | |||||
| } | |||||
| Tensor *hidden_state = input_tensors_.at(kFifthIndex); | Tensor *hidden_state = input_tensors_.at(kFifthIndex); | ||||
| MS_CHECK_PTR(hidden_state); | MS_CHECK_PTR(hidden_state); | ||||
| Tensor *cell_state = input_tensors_.at(kSixthIndex); | Tensor *cell_state = input_tensors_.at(kSixthIndex); | ||||
| @@ -1,7 +1,7 @@ | |||||
| project(nnacl) | project(nnacl) | ||||
| set(NNACL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) | set(NNACL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) | ||||
| include_directories(NNACL_DIR) | |||||
| include_directories(${NNACL_DIR}/..) | |||||
| if(PLATFORM_ARM32 OR PLATFORM_ARM64) | if(PLATFORM_ARM32 OR PLATFORM_ARM64) | ||||
| if("${CMAKE_BUILD_TYPE}" STREQUAL "Release" AND DEFINED ARCHS) | if("${CMAKE_BUILD_TYPE}" STREQUAL "Release" AND DEFINED ARCHS) | ||||
| @@ -152,20 +152,20 @@ asm_function IndirectGemmInt8_2x4 | |||||
| cmp lr, #0 | cmp lr, #0 | ||||
| beq SymSum | beq SymSum | ||||
| ldr lr, [sp, #52] | ldr lr, [sp, #52] | ||||
| vld1.32 q0, [r10] | |||||
| vld1.32 {d0, d1}, [r10] | |||||
| add r10, r10, lr | add r10, r10, lr | ||||
| vld1.32 q1, [r10] | |||||
| vld1.32 {d2, d3}, [r10] | |||||
| b AddSum | b AddSum | ||||
| SymSum: | SymSum: | ||||
| vld1.32 q0[], [r10]! | |||||
| vld1.32 q1[], [r10]! | |||||
| vld1.32 {d0[], d1[]}, [r10]! | |||||
| vld1.32 {d2[], d3[]}, [r10]! | |||||
| AddSum: | AddSum: | ||||
| vsub.i32 q8, q8, q0 | vsub.i32 q8, q8, q0 | ||||
| vsub.i32 q12, q12, q1 | vsub.i32 q12, q12, q1 | ||||
| NoSum: | NoSum: | ||||
| cmp r3, #0 | cmp r3, #0 | ||||
| beq NoBias | beq NoBias | ||||
| vld1.32 q2, [r3] | |||||
| vld1.32 {d4, d5}, [r3] | |||||
| vadd.i32 q8, q8, q2 | vadd.i32 q8, q8, q2 | ||||
| vadd.i32 q12, q12, q2 | vadd.i32 q12, q12, q2 | ||||
| @@ -174,19 +174,19 @@ asm_function IndirectGemmInt8_2x4 | |||||
| cmp lr, #0 | cmp lr, #0 | ||||
| bne PerChannel | bne PerChannel | ||||
| ldr lr, [sp, #36] | ldr lr, [sp, #36] | ||||
| vld1.32 q3[], [lr] | |||||
| vld1.32 {d6[], d7[]}, [lr] | |||||
| ldr lr, [sp, #32] | ldr lr, [sp, #32] | ||||
| vld1.32 q4[], [lr] | |||||
| vld1.32 {d8[], d9[]}, [lr] | |||||
| ldr lr, [sp, #40] | ldr lr, [sp, #40] | ||||
| vld1.32 q5[], [lr] | |||||
| vld1.32 {d10[], d11[]}, [lr] | |||||
| b QuantizeStart | b QuantizeStart | ||||
| PerChannel: | PerChannel: | ||||
| ldr lr, [sp, #36] | ldr lr, [sp, #36] | ||||
| vld1.32 q3, [lr] | |||||
| vld1.32 {d6, d7}, [lr] | |||||
| ldr lr, [sp, #32] | ldr lr, [sp, #32] | ||||
| vld1.32 q4, [lr] | |||||
| vld1.32 {d8, d9}, [lr] | |||||
| ldr lr, [sp, #40] | ldr lr, [sp, #40] | ||||
| vld1.32 q5, [lr] | |||||
| vld1.32 {d10, d11}, [lr] | |||||
| QuantizeStart: | QuantizeStart: | ||||
| vshl.s32 q8, q8, q3 | vshl.s32 q8, q8, q3 | ||||
| vshl.s32 q12, q12, q3 | vshl.s32 q12, q12, q3 | ||||
| @@ -201,73 +201,73 @@ LoopCol: | |||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| b WriteEnd | b WriteEnd | ||||
| Write4: | Write4: | ||||
| vst1.32 q8, [r2] | |||||
| vst1.32 {d16, d17}, [r2] | |||||
| cmp r6, #1 | cmp r6, #1 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| vst1.32 q10, [r2] | |||||
| vst1.32 {d20, d21}, [r2] | |||||
| cmp r6, #2 | cmp r6, #2 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| vst1.32 q12, [r2] | |||||
| vst1.32 {d24, d25}, [r2] | |||||
| cmp r6, #3 | cmp r6, #3 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| vst1.32 q14, [r2] | |||||
| vst1.32 {d28, d29}, [r2] | |||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| b WriteEnd | b WriteEnd | ||||
| Write5: | Write5: | ||||
| add r4, r2, #16 | add r4, r2, #16 | ||||
| vst1.32 q8, [r2] | |||||
| vst1.32 {d16, d17}, [r2] | |||||
| vst1.32 d18[0], [r4] | vst1.32 d18[0], [r4] | ||||
| cmp r6, #1 | cmp r6, #1 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| add r4, r4, r8 | add r4, r4, r8 | ||||
| vst1.32 q10, [r2] | |||||
| vst1.32 {d20, d21}, [r2] | |||||
| vst1.32 d22[0], [r4] | vst1.32 d22[0], [r4] | ||||
| cmp r6, #2 | cmp r6, #2 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| add r4, r4, r8 | add r4, r4, r8 | ||||
| vst1.32 q12, [r2] | |||||
| vst1.32 {d24, d25}, [r2] | |||||
| vst1.32 d26[0], [r4] | vst1.32 d26[0], [r4] | ||||
| cmp r6, #3 | cmp r6, #3 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| add r4, r4, r8 | add r4, r4, r8 | ||||
| vst1.32 q14, [r2] | |||||
| vst1.32 {d28, d29}, [r2] | |||||
| vst1.32 d30[0], [r4] | vst1.32 d30[0], [r4] | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| b WriteEnd | b WriteEnd | ||||
| Write6: | Write6: | ||||
| add r4, r2, #16 | add r4, r2, #16 | ||||
| vst1.32 q8, [r2] | |||||
| vst1.32 {d16, d17}, [r2] | |||||
| vst1.32 d18, [r4] | vst1.32 d18, [r4] | ||||
| cmp r6, #1 | cmp r6, #1 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| add r4, r4, r8 | add r4, r4, r8 | ||||
| vst1.32 q10, [r2] | |||||
| vst1.32 {d20, d21}, [r2] | |||||
| vst1.32 d22, [r4] | vst1.32 d22, [r4] | ||||
| cmp r6, #2 | cmp r6, #2 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| add r4, r4, r8 | add r4, r4, r8 | ||||
| vst1.32 q12, [r2] | |||||
| vst1.32 {d24, d25}, [r2] | |||||
| vst1.32 d26, [r4] | vst1.32 d26, [r4] | ||||
| cmp r6, #3 | cmp r6, #3 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| add r4, r4, r8 | add r4, r4, r8 | ||||
| vst1.32 q14, [r2] | |||||
| vst1.32 {d28, d29}, [r2] | |||||
| vst1.32 d30, [r4] | vst1.32 d30, [r4] | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| b WriteEnd | b WriteEnd | ||||
| Write7: | Write7: | ||||
| add lr, r2, #24 | add lr, r2, #24 | ||||
| add r4, r2, #16 | add r4, r2, #16 | ||||
| vst1.32 q8, [r2] | |||||
| vst1.32 {d16, d17}, [r2] | |||||
| vst1.32 d18, [r4] | vst1.32 d18, [r4] | ||||
| vst1.32 d19[0], [lr] | vst1.32 d19[0], [lr] | ||||
| cmp r6, #1 | cmp r6, #1 | ||||
| @@ -275,7 +275,7 @@ LoopCol: | |||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| add r4, r4, r8 | add r4, r4, r8 | ||||
| add lr, lr, r8 | add lr, lr, r8 | ||||
| vst1.32 q10, [r2] | |||||
| vst1.32 {d20, d21}, [r2] | |||||
| vst1.32 d22, [r4] | vst1.32 d22, [r4] | ||||
| vst1.32 d23[0], [lr] | vst1.32 d23[0], [lr] | ||||
| cmp r6, #2 | cmp r6, #2 | ||||
| @@ -283,7 +283,7 @@ LoopCol: | |||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| add r4, r4, r8 | add r4, r4, r8 | ||||
| add lr, lr, r8 | add lr, lr, r8 | ||||
| vst1.32 q12, [r2] | |||||
| vst1.32 {d24, d25}, [r2] | |||||
| vst1.32 d26, [r4] | vst1.32 d26, [r4] | ||||
| vst1.32 d27[0], [lr] | vst1.32 d27[0], [lr] | ||||
| cmp r6, #3 | cmp r6, #3 | ||||
| @@ -291,7 +291,7 @@ LoopCol: | |||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| add r4, r4, r8 | add r4, r4, r8 | ||||
| add lr, lr, r8 | add lr, lr, r8 | ||||
| vst1.32 q14, [r2] | |||||
| vst1.32 {d28, d29}, [r2] | |||||
| vst1.32 d30, [r4] | vst1.32 d30, [r4] | ||||
| vst1.32 d31[0], [lr] | vst1.32 d31[0], [lr] | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| @@ -226,19 +226,19 @@ LoopRow: | |||||
| Write4: | Write4: | ||||
| add lr, r2, #16 | add lr, r2, #16 | ||||
| str lr, [sp, #-40] | str lr, [sp, #-40] | ||||
| vst1.32 q8, [r2] | |||||
| vst1.32 {d16, d17}, [r2] | |||||
| cmp r6, #1 | cmp r6, #1 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| vst1.32 q10, [r2] | |||||
| vst1.32 {d20, d21}, [r2] | |||||
| cmp r6, #2 | cmp r6, #2 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| vst1.32 q12, [r2] | |||||
| vst1.32 {d24, d25}, [r2] | |||||
| cmp r6, #3 | cmp r6, #3 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| vst1.32 q14, [r2] | |||||
| vst1.32 {d28, d29}, [r2] | |||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| add r2, r2, #16 | add r2, r2, #16 | ||||
| b WriteEnd | b WriteEnd | ||||
| @@ -246,25 +246,25 @@ LoopRow: | |||||
| add lr, r2, #20 | add lr, r2, #20 | ||||
| str lr, [sp, #-40] | str lr, [sp, #-40] | ||||
| add r4, r2, #16 | add r4, r2, #16 | ||||
| vst1.32 q8, [r2] | |||||
| vst1.32 {d16, d17}, [r2] | |||||
| vst1.32 d18[0], [r4] | vst1.32 d18[0], [r4] | ||||
| cmp r6, #1 | cmp r6, #1 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| add r4, r4, r8 | add r4, r4, r8 | ||||
| vst1.32 q10, [r2] | |||||
| vst1.32 {d20, d21}, [r2] | |||||
| vst1.32 d22[0], [r4] | vst1.32 d22[0], [r4] | ||||
| cmp r6, #2 | cmp r6, #2 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| add r4, r4, r8 | add r4, r4, r8 | ||||
| vst1.32 q12, [r2] | |||||
| vst1.32 {d24, d25}, [r2] | |||||
| vst1.32 d26[0], [r4] | vst1.32 d26[0], [r4] | ||||
| cmp r6, #3 | cmp r6, #3 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| add r4, r4, r8 | add r4, r4, r8 | ||||
| vst1.32 q14, [r2] | |||||
| vst1.32 {d28, d29}, [r2] | |||||
| vst1.32 d30[0], [r4] | vst1.32 d30[0], [r4] | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| add r2, r2, #20 | add r2, r2, #20 | ||||
| @@ -273,25 +273,25 @@ LoopRow: | |||||
| add lr, r2, #24 | add lr, r2, #24 | ||||
| str lr, [sp, #-40] | str lr, [sp, #-40] | ||||
| add r4, r2, #16 | add r4, r2, #16 | ||||
| vst1.32 q8, [r2] | |||||
| vst1.32 {d16, d17}, [r2] | |||||
| vst1.32 d18, [r4] | vst1.32 d18, [r4] | ||||
| cmp r6, #1 | cmp r6, #1 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| add r4, r4, r8 | add r4, r4, r8 | ||||
| vst1.32 q10, [r2] | |||||
| vst1.32 {d20, d21}, [r2] | |||||
| vst1.32 d22, [r4] | vst1.32 d22, [r4] | ||||
| cmp r6, #2 | cmp r6, #2 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| add r4, r4, r8 | add r4, r4, r8 | ||||
| vst1.32 q12, [r2] | |||||
| vst1.32 {d24, d25}, [r2] | |||||
| vst1.32 d26, [r4] | vst1.32 d26, [r4] | ||||
| cmp r6, #3 | cmp r6, #3 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| add r4, r4, r8 | add r4, r4, r8 | ||||
| vst1.32 q14, [r2] | |||||
| vst1.32 {d28, d29}, [r2] | |||||
| vst1.32 d30, [r4] | vst1.32 d30, [r4] | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| add r2, r2, #24 | add r2, r2, #24 | ||||
| @@ -301,7 +301,7 @@ LoopRow: | |||||
| str lr, [sp, #-40] | str lr, [sp, #-40] | ||||
| add lr, r2, #24 | add lr, r2, #24 | ||||
| add r4, r2, #16 | add r4, r2, #16 | ||||
| vst1.32 q8, [r2] | |||||
| vst1.32 {d16, d17}, [r2] | |||||
| vst1.32 d18, [r4] | vst1.32 d18, [r4] | ||||
| vst1.32 d19[0], [lr] | vst1.32 d19[0], [lr] | ||||
| cmp r6, #1 | cmp r6, #1 | ||||
| @@ -309,7 +309,7 @@ LoopRow: | |||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| add r4, r4, r8 | add r4, r4, r8 | ||||
| add lr, lr, r8 | add lr, lr, r8 | ||||
| vst1.32 q10, [r2] | |||||
| vst1.32 {d20, d21}, [r2] | |||||
| vst1.32 d22, [r4] | vst1.32 d22, [r4] | ||||
| vst1.32 d23[0], [lr] | vst1.32 d23[0], [lr] | ||||
| cmp r6, #2 | cmp r6, #2 | ||||
| @@ -317,7 +317,7 @@ LoopRow: | |||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| add r4, r4, r8 | add r4, r4, r8 | ||||
| add lr, lr, r8 | add lr, lr, r8 | ||||
| vst1.32 q12, [r2] | |||||
| vst1.32 {d24, d25}, [r2] | |||||
| vst1.32 d26, [r4] | vst1.32 d26, [r4] | ||||
| vst1.32 d27[0], [lr] | vst1.32 d27[0], [lr] | ||||
| cmp r6, #3 | cmp r6, #3 | ||||
| @@ -325,7 +325,7 @@ LoopRow: | |||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| add r4, r4, r8 | add r4, r4, r8 | ||||
| add lr, lr, r8 | add lr, lr, r8 | ||||
| vst1.32 q14, [r2] | |||||
| vst1.32 {d28, d29}, [r2] | |||||
| vst1.32 d30, [r4] | vst1.32 d30, [r4] | ||||
| vst1.32 d31[0], [lr] | vst1.32 d31[0], [lr] | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| @@ -491,51 +491,51 @@ LoopRow4: | |||||
| Write4: | Write4: | ||||
| add lr, r2, #16 | add lr, r2, #16 | ||||
| str lr, [sp, #-40] | str lr, [sp, #-40] | ||||
| vst1.32 q4, [r2] | |||||
| vst1.32 {d8, d9}, [r2] | |||||
| cmp r6, #1 | cmp r6, #1 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| vst1.32 q5, [r2] | |||||
| vst1.32 {d10, d11}, [r2] | |||||
| cmp r6, #2 | cmp r6, #2 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| vst1.32 q6, [r2] | |||||
| vst1.32 {d12, d13}, [r2] | |||||
| cmp r6, #3 | cmp r6, #3 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| vst1.32 q7, [r2] | |||||
| vst1.32 {d14, d15}, [r2] | |||||
| cmp r6, #4 | cmp r6, #4 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| vst1.32 q8, [r2] | |||||
| vst1.32 {d16, d17}, [r2] | |||||
| cmp r6, #5 | cmp r6, #5 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| vst1.32 q9, [r2] | |||||
| vst1.32 {d18, d19}, [r2] | |||||
| cmp r6, #6 | cmp r6, #6 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| vst1.32 q10, [r2] | |||||
| vst1.32 {d20, d21}, [r2] | |||||
| cmp r6, #7 | cmp r6, #7 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| vst1.32 q11, [r2] | |||||
| vst1.32 {d22, d23}, [r2] | |||||
| cmp r6, #8 | cmp r6, #8 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| vst1.32 q12, [r2] | |||||
| vst1.32 {d24, d25}, [r2] | |||||
| cmp r6, #9 | cmp r6, #9 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| vst1.32 q13, [r2] | |||||
| vst1.32 {d26, d27}, [r2] | |||||
| cmp r6, #10 | cmp r6, #10 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| vst1.32 q14, [r2] | |||||
| vst1.32 {d28, d29}, [r2] | |||||
| cmp r6, #11 | cmp r6, #11 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| vst1.32 q15, [r2] | |||||
| vst1.32 {d30, d31}, [r2] | |||||
| add r2, r2, r8 | add r2, r2, r8 | ||||
| add r2, r2, #16 | add r2, r2, #16 | ||||
| b WriteEnd | b WriteEnd | ||||
| @@ -135,17 +135,17 @@ LoopRow: | |||||
| vsub.s32 d31, d31, d23 | vsub.s32 d31, d31, d23 | ||||
| vmov.32 lr, d4[1] | vmov.32 lr, d4[1] | ||||
| vld1.32 {q9[]}, [lr] | |||||
| vld1.32 {d18[], d19[]}, [lr] | |||||
| vshl.s32 q14, q14, q9 | vshl.s32 q14, q14, q9 | ||||
| vshl.s32 q15, q15, q9 | vshl.s32 q15, q15, q9 | ||||
| vmov.32 lr, d5[0] | vmov.32 lr, d5[0] | ||||
| vld1.32 {q8[]}, [lr] | |||||
| vld1.32 {d16[], d17[]}, [lr] | |||||
| vqrdmulh.s32 q14, q14, q8 | vqrdmulh.s32 q14, q14, q8 | ||||
| vqrdmulh.s32 q15, q15, q8 | vqrdmulh.s32 q15, q15, q8 | ||||
| vmov.32 lr, d5[1] | vmov.32 lr, d5[1] | ||||
| vld1.32 {q7[]}, [lr] | |||||
| vld1.32 {d14[], d15[]}, [lr] | |||||
| vand q6, q7, q14 | vand q6, q7, q14 | ||||
| vshr.s32 q6, q6, #31 | vshr.s32 q6, q6, #31 | ||||
| vqadd.s32 q14, q14, q6 | vqadd.s32 q14, q14, q6 | ||||
| @@ -143,7 +143,7 @@ asm_function MatrixMultiplyWinograd | |||||
| mov r0, r8 // mat_b1 | mov r0, r8 // mat_b1 | ||||
| ldr r12, [sp] // k | ldr r12, [sp] // k | ||||
| LoopK: | LoopK: | ||||
| vld1.32 {s0}, [r9], r5 | |||||
| vld1.32 d0[0], [r9], r5 | |||||
| vld1.32 d2[0], [r0], r4 | vld1.32 d2[0], [r0], r4 | ||||
| vmla.f32 s8, s0, s4 | vmla.f32 s8, s0, s4 | ||||
| subs r12, r12, #1 | subs r12, r12, #1 | ||||
| @@ -73,7 +73,7 @@ CalLoop: | |||||
| Write: | Write: | ||||
| vmul.i32 q13, q13, q10 | vmul.i32 q13, q13, q10 | ||||
| vst1.32 q13, [r1], r7 | |||||
| vst1.32 {d26, d27}, [r1], r7 | |||||
| beq RowLoop | beq RowLoop | ||||
| End: | End: | ||||