- /***************************************************************************
- Copyright (c) 2021, The OpenBLAS Project
- All rights reserved.
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are
- met:
- 1. Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- 2. Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
- 3. Neither the name of the OpenBLAS project nor the names of
- its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
- USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *****************************************************************************/
-
- #include "common.h"
- #include <arm_neon.h>
-
- #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
- #define FMLA_RI "fmla "
- #define FMLA_IR "fmla "
- #define FMLA_II "fmls "
- #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
- #define FMLA_RI "fmls "
- #define FMLA_IR "fmla "
- #define FMLA_II "fmla "
- #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
- #define FMLA_RI "fmla "
- #define FMLA_IR "fmls "
- #define FMLA_II "fmla "
- #else
- #define FMLA_RI "fmls "
- #define FMLA_IR "fmls "
- #define FMLA_II "fmls "
- #endif
- #define FMLA_RR "fmla "
-
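- /* Scale the accumulated real parts (c1r/c2r) and imaginary parts (c1i/c2i) of
-  * 8 consecutive complex elements of one column of C by alpha and add them to C.
-  * vld2q/vst2q de-interleave the real and imaginary lanes, so the update is
-  *   C_re += alphar*cr - alphai*ci,  C_im += alphai*cr + alphar*ci. */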
- static inline void store_m8n1_contracted(float *C,
- float32x4_t c1r, float32x4_t c1i, float32x4_t c2r, float32x4_t c2i,
- float alphar, float alphai) {
-
- float32x4x2_t ld1 = vld2q_f32(C), ld2 = vld2q_f32(C + 8);
- ld1.val[0] = vfmaq_n_f32(ld1.val[0], c1r, alphar);
- ld2.val[0] = vfmaq_n_f32(ld2.val[0], c2r, alphar);
- ld1.val[1] = vfmaq_n_f32(ld1.val[1], c1r, alphai);
- ld2.val[1] = vfmaq_n_f32(ld2.val[1], c2r, alphai);
- ld1.val[0] = vfmsq_n_f32(ld1.val[0], c1i, alphai);
- ld2.val[0] = vfmsq_n_f32(ld2.val[0], c2i, alphai);
- ld1.val[1] = vfmaq_n_f32(ld1.val[1], c1i, alphar);
- ld2.val[1] = vfmaq_n_f32(ld2.val[1], c2i, alphar);
- vst2q_f32(C, ld1);
- vst2q_f32(C + 8, ld2);
- }
-
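- /* 8x4 micro-kernel, written in inline assembly. Each K step consumes 8 complex
-  * elements of packed A and 4 of packed B. Real and imaginary products are kept
-  * in separate accumulators (cNr/cNi, one pair per row of A, lane j = column j);
-  * B is assembled through GP registers so that v0/v10 hold the real parts and
-  * v1/v11 the imaginary parts of the 4 B elements. The main loop is unrolled by
-  * 2 in K; the zip sequence at label 4 transposes the accumulators so that each
-  * cNr/cNi pair covers 4 rows of one column, as expected by
-  * store_m8n1_contracted(). */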
- static inline void kernel_8x4(const float *sa, const float *sb, float *C,
- float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
-
- const float *c_pref = C;
- float32x4_t c1r, c1i, c2r, c2i, c3r, c3i, c4r, c4i;
- float32x4_t c5r, c5i, c6r, c6i, c7r, c7i, c8r, c8i;
-
 - /** x0 is used for filling A; x1-x6 for filling B (x5/x6 assemble the real parts, x2/x4 the imaginary parts) */
 - /** v0-v1 and v10-v11 hold B, v2-v9 hold A */
- __asm__ __volatile__(
- "cmp %[K],#0; mov %[c_pref],%[C]\n\t"
- "movi %[c1r].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t"
- "movi %[c1i].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t"
- "movi %[c2r].16b,#0; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t"
- "movi %[c2i].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t"
- "movi %[c3r].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t"
- "movi %[c3i].16b,#0; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t"
- "movi %[c4r].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t"
- "movi %[c4i].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t"
- "movi %[c5r].16b,#0; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t"
- "movi %[c5i].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t"
- "movi %[c6r].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t"
- "movi %[c6i].16b,#0\n\t"
- "movi %[c7r].16b,#0; movi %[c7i].16b,#0\n\t"
- "movi %[c8r].16b,#0; movi %[c8i].16b,#0\n\t"
- "beq 4f\n\t"
- "cmp %[K],#2\n\t"
- "ldp x1,x2,[%[sb]],#16; ldr q2,[%[sa]],#64\n\t"
- "ldp x3,x4,[%[sb]],#16; ldr d3,[%[sa],#-48]\n\t"
- "mov w5,w1; mov w6,w3; ldr x0,[%[sa],#-40]\n\t"
- "bfi x5,x2,#32,#32; bfi x6,x4,#32,#32; fmov d0,x5\n\t"
- "bfxil x2,x1,#32,#32; bfxil x4,x3,#32,#32; fmov v0.d[1],x6\n\t"
-
- "blt 3f; beq 2f\n\t"
- "1:\n\t"
- "fmov v3.d[1],x0; ldr d4,[%[sa],#-32]\n\t"
- FMLA_RR "%[c1r].4s,v0.4s,v2.s[0]; ldr x0,[%[sa],#-24]\n\t"
- FMLA_IR "%[c1i].4s,v0.4s,v2.s[1]; ldr x1,[%[sb]],#64\n\t"
- FMLA_RR "%[c2r].4s,v0.4s,v2.s[2]\n\t"
- "fmov v4.d[1],x0; ldr d5,[%[sa],#-16]\n\t"
- FMLA_IR "%[c2i].4s,v0.4s,v2.s[3]; ldr x0,[%[sa],#-8]\n\t"
- FMLA_RR "%[c3r].4s,v0.4s,v3.s[0]; mov w5,w1\n\t"
- FMLA_IR "%[c3i].4s,v0.4s,v3.s[1]\n\t"
- "fmov v5.d[1],x0; fmov d1,x2\n\t"
- FMLA_RR "%[c4r].4s,v0.4s,v3.s[2]; ldr x2,[%[sb],#-56]\n\t"
- FMLA_IR "%[c4i].4s,v0.4s,v3.s[3]; ldr x3,[%[sb],#-48]\n\t"
- FMLA_RR "%[c5r].4s,v0.4s,v4.s[0]\n\t"
- "fmov v1.d[1],x4; ldr d6,[%[sa]]\n\t"
- FMLA_IR "%[c5i].4s,v0.4s,v4.s[1]; ldr x0,[%[sa],#8]\n\t"
- FMLA_RR "%[c6r].4s,v0.4s,v4.s[2]; ldr x4,[%[sb],#-40]\n\t"
- FMLA_IR "%[c6i].4s,v0.4s,v4.s[3]; bfi x5,x2,#32,#32\n\t"
- "fmov v6.d[1],x0; ldr d7,[%[sa],#16]\n\t"
- FMLA_RR "%[c7r].4s,v0.4s,v5.s[0]; ldr x0,[%[sa],#24]\n\t"
- FMLA_IR "%[c7i].4s,v0.4s,v5.s[1]; mov w6,w3\n\t"
- FMLA_RR "%[c8r].4s,v0.4s,v5.s[2]; bfxil x2,x1,#32,#32\n\t"
- "fmov v7.d[1],x0; fmov d10,x5\n\t"
- FMLA_IR "%[c8i].4s,v0.4s,v5.s[3]; bfi x6,x4,#32,#32\n\t"
- FMLA_II "%[c1r].4s,v1.4s,v2.s[1]; ldr x1,[%[sb],#-32]\n\t"
- FMLA_RI "%[c1i].4s,v1.4s,v2.s[0]; bfxil x4,x3,#32,#32\n\t"
- "fmov v10.d[1],x6; fmov d11,x2\n\t"
- FMLA_II "%[c2r].4s,v1.4s,v2.s[3]; ldr x2,[%[sb],#-24]\n\t"
- FMLA_RI "%[c2i].4s,v1.4s,v2.s[2]; ldr x3,[%[sb],#-16]\n\t"
- FMLA_II "%[c3r].4s,v1.4s,v3.s[1]; mov w5,w1\n\t"
- "fmov v11.d[1],x4; ldr d8,[%[sa],#32]\n\t"
- FMLA_RI "%[c3i].4s,v1.4s,v3.s[0]; ldr x0,[%[sa],#40]\n\t"
- FMLA_II "%[c4r].4s,v1.4s,v3.s[3]; ldr x4,[%[sb],#-8]\n\t"
- FMLA_RI "%[c4i].4s,v1.4s,v3.s[2]; bfi x5,x2,#32,#32\n\t"
- "fmov v8.d[1],x0; ldr d9,[%[sa],#48]\n\t"
- FMLA_II "%[c5r].4s,v1.4s,v4.s[1]; ldr x0,[%[sa],#56]\n\t"
- FMLA_RI "%[c5i].4s,v1.4s,v4.s[0]; mov w6,w3\n\t"
- FMLA_II "%[c6r].4s,v1.4s,v4.s[3]\n\t"
- "fmov v9.d[1],x0; fmov d0,x5\n\t"
- FMLA_RI "%[c6i].4s,v1.4s,v4.s[2]; bfi x6,x4,#32,#32\n\t"
- FMLA_II "%[c7r].4s,v1.4s,v5.s[1]\n\t"
- FMLA_RI "%[c7i].4s,v1.4s,v5.s[0]\n\t"
- "fmov v0.d[1],x6; ldr d2,[%[sa],#64]\n\t"
- FMLA_II "%[c8r].4s,v1.4s,v5.s[3]; ldr x0,[%[sa],#72]\n\t"
- FMLA_RI "%[c8i].4s,v1.4s,v5.s[2]\n\t"
- FMLA_RR "%[c1r].4s,v10.4s,v6.s[0]\n\t"
- "fmov v2.d[1],x0; ldr d3,[%[sa],#80]\n\t"
- FMLA_IR "%[c1i].4s,v10.4s,v6.s[1]\n\t"
- FMLA_RR "%[c2r].4s,v10.4s,v6.s[2]; ldr x0,[%[sa],#88]\n\t"
- FMLA_IR "%[c2i].4s,v10.4s,v6.s[3]; bfxil x2,x1,#32,#32\n\t"
- FMLA_RR "%[c3r].4s,v10.4s,v7.s[0]; bfxil x4,x3,#32,#32\n\t"
- FMLA_IR "%[c3i].4s,v10.4s,v7.s[1]; add %[sa],%[sa],#128\n\t"
- FMLA_RR "%[c4r].4s,v10.4s,v7.s[2]; prfm pldl1keep,[%[sb],#128]\n\t"
- FMLA_IR "%[c4i].4s,v10.4s,v7.s[3]; sub %[K],%[K],#2\n\t"
- FMLA_RR "%[c5r].4s,v10.4s,v8.s[0]; prfm pldl1keep,[%[sa],#128]\n\t"
- FMLA_IR "%[c5i].4s,v10.4s,v8.s[1]; prfm pldl1keep,[%[sa],#192]\n\t"
- FMLA_RR "%[c6r].4s,v10.4s,v8.s[2]; cmp %[K],#2\n\t"
- FMLA_IR "%[c6i].4s,v10.4s,v8.s[3]\n\t"
- FMLA_RR "%[c7r].4s,v10.4s,v9.s[0]\n\t" FMLA_IR "%[c7i].4s,v10.4s,v9.s[1]\n\t"
- FMLA_RR "%[c8r].4s,v10.4s,v9.s[2]\n\t" FMLA_IR "%[c8i].4s,v10.4s,v9.s[3]\n\t"
- FMLA_II "%[c1r].4s,v11.4s,v6.s[1]\n\t" FMLA_RI "%[c1i].4s,v11.4s,v6.s[0]\n\t"
- FMLA_II "%[c2r].4s,v11.4s,v6.s[3]\n\t" FMLA_RI "%[c2i].4s,v11.4s,v6.s[2]\n\t"
- FMLA_II "%[c3r].4s,v11.4s,v7.s[1]\n\t" FMLA_RI "%[c3i].4s,v11.4s,v7.s[0]\n\t"
- FMLA_II "%[c4r].4s,v11.4s,v7.s[3]\n\t" FMLA_RI "%[c4i].4s,v11.4s,v7.s[2]\n\t"
- FMLA_II "%[c5r].4s,v11.4s,v8.s[1]\n\t" FMLA_RI "%[c5i].4s,v11.4s,v8.s[0]\n\t"
- FMLA_II "%[c6r].4s,v11.4s,v8.s[3]\n\t" FMLA_RI "%[c6i].4s,v11.4s,v8.s[2]\n\t"
- FMLA_II "%[c7r].4s,v11.4s,v9.s[1]\n\t" FMLA_RI "%[c7i].4s,v11.4s,v9.s[0]\n\t"
- FMLA_II "%[c8r].4s,v11.4s,v9.s[3]\n\t" FMLA_RI "%[c8i].4s,v11.4s,v9.s[2]\n\t"
- "bgt 1b; blt 3f\n\t"
- "2:\n\t"
- "fmov v3.d[1],x0; ldr d4,[%[sa],#-32]\n\t"
- FMLA_RR "%[c1r].4s,v0.4s,v2.s[0]; ldr x0,[%[sa],#-24]\n\t"
- FMLA_IR "%[c1i].4s,v0.4s,v2.s[1]; ldr x1,[%[sb]],#32\n\t"
- FMLA_RR "%[c2r].4s,v0.4s,v2.s[2]\n\t"
- "fmov v4.d[1],x0; ldr d5,[%[sa],#-16]\n\t"
- FMLA_IR "%[c2i].4s,v0.4s,v2.s[3]; ldr x0,[%[sa],#-8]\n\t"
- FMLA_RR "%[c3r].4s,v0.4s,v3.s[0]; mov w5,w1\n\t"
- FMLA_IR "%[c3i].4s,v0.4s,v3.s[1]\n\t"
- "fmov v5.d[1],x0; fmov d1,x2\n\t"
- FMLA_RR "%[c4r].4s,v0.4s,v3.s[2]; ldr x2,[%[sb],#-24]\n\t"
- FMLA_IR "%[c4i].4s,v0.4s,v3.s[3]; ldr x3,[%[sb],#-16]\n\t"
- FMLA_RR "%[c5r].4s,v0.4s,v4.s[0]\n\t"
- "fmov v1.d[1],x4; ldr d6,[%[sa]]\n\t"
- FMLA_IR "%[c5i].4s,v0.4s,v4.s[1]; ldr x0,[%[sa],#8]\n\t"
- FMLA_RR "%[c6r].4s,v0.4s,v4.s[2]; ldr x4,[%[sb],#-8]\n\t"
- FMLA_IR "%[c6i].4s,v0.4s,v4.s[3]; bfi x5,x2,#32,#32\n\t"
- "fmov v6.d[1],x0; ldr d7,[%[sa],#16]\n\t"
- FMLA_RR "%[c7r].4s,v0.4s,v5.s[0]; ldr x0,[%[sa],#24]\n\t"
- FMLA_IR "%[c7i].4s,v0.4s,v5.s[1]; mov w6,w3\n\t"
- FMLA_RR "%[c8r].4s,v0.4s,v5.s[2]; bfxil x2,x1,#32,#32\n\t"
- "fmov v7.d[1],x0; fmov d10,x5\n\t"
- FMLA_IR "%[c8i].4s,v0.4s,v5.s[3]; bfi x6,x4,#32,#32\n\t"
- FMLA_II "%[c1r].4s,v1.4s,v2.s[1]\n\t"
- FMLA_RI "%[c1i].4s,v1.4s,v2.s[0]; bfxil x4,x3,#32,#32\n\t"
- "fmov v10.d[1],x6; fmov d11,x2\n\t"
- FMLA_II "%[c2r].4s,v1.4s,v2.s[3]\n\t"
- FMLA_RI "%[c2i].4s,v1.4s,v2.s[2]\n\t"
- FMLA_II "%[c3r].4s,v1.4s,v3.s[1]\n\t"
- "fmov v11.d[1],x4; ldr d8,[%[sa],#32]\n\t"
- FMLA_RI "%[c3i].4s,v1.4s,v3.s[0]; ldr x0,[%[sa],#40]\n\t"
- FMLA_II "%[c4r].4s,v1.4s,v3.s[3]; sub %[K],%[K],#2\n\t"
- FMLA_RI "%[c4i].4s,v1.4s,v3.s[2]\n\t"
- "fmov v8.d[1],x0; ldr d9,[%[sa],#48]\n\t"
- FMLA_II "%[c5r].4s,v1.4s,v4.s[1]; ldr x0,[%[sa],#56]\n\t"
- FMLA_RI "%[c5i].4s,v1.4s,v4.s[0]; add %[sa],%[sa],#64\n\t"
- FMLA_II "%[c6r].4s,v1.4s,v4.s[3]\n\t"
- "fmov v9.d[1],x0\n\t"
- FMLA_RI "%[c6i].4s,v1.4s,v4.s[2]\n\t"
- FMLA_II "%[c7r].4s,v1.4s,v5.s[1]\n\t" FMLA_RI "%[c7i].4s,v1.4s,v5.s[0]\n\t"
- FMLA_II "%[c8r].4s,v1.4s,v5.s[3]\n\t" FMLA_RI "%[c8i].4s,v1.4s,v5.s[2]\n\t"
- FMLA_RR "%[c1r].4s,v10.4s,v6.s[0]\n\t" FMLA_IR "%[c1i].4s,v10.4s,v6.s[1]\n\t"
- FMLA_RR "%[c2r].4s,v10.4s,v6.s[2]\n\t" FMLA_IR "%[c2i].4s,v10.4s,v6.s[3]\n\t"
- FMLA_RR "%[c3r].4s,v10.4s,v7.s[0]\n\t" FMLA_IR "%[c3i].4s,v10.4s,v7.s[1]\n\t"
- FMLA_RR "%[c4r].4s,v10.4s,v7.s[2]\n\t" FMLA_IR "%[c4i].4s,v10.4s,v7.s[3]\n\t"
- FMLA_RR "%[c5r].4s,v10.4s,v8.s[0]\n\t" FMLA_IR "%[c5i].4s,v10.4s,v8.s[1]\n\t"
- FMLA_RR "%[c6r].4s,v10.4s,v8.s[2]\n\t" FMLA_IR "%[c6i].4s,v10.4s,v8.s[3]\n\t"
- FMLA_RR "%[c7r].4s,v10.4s,v9.s[0]\n\t" FMLA_IR "%[c7i].4s,v10.4s,v9.s[1]\n\t"
- FMLA_RR "%[c8r].4s,v10.4s,v9.s[2]\n\t" FMLA_IR "%[c8i].4s,v10.4s,v9.s[3]\n\t"
- FMLA_II "%[c1r].4s,v11.4s,v6.s[1]\n\t" FMLA_RI "%[c1i].4s,v11.4s,v6.s[0]\n\t"
- FMLA_II "%[c2r].4s,v11.4s,v6.s[3]\n\t" FMLA_RI "%[c2i].4s,v11.4s,v6.s[2]\n\t"
- FMLA_II "%[c3r].4s,v11.4s,v7.s[1]\n\t" FMLA_RI "%[c3i].4s,v11.4s,v7.s[0]\n\t"
- FMLA_II "%[c4r].4s,v11.4s,v7.s[3]\n\t" FMLA_RI "%[c4i].4s,v11.4s,v7.s[2]\n\t"
- FMLA_II "%[c5r].4s,v11.4s,v8.s[1]\n\t" FMLA_RI "%[c5i].4s,v11.4s,v8.s[0]\n\t"
- FMLA_II "%[c6r].4s,v11.4s,v8.s[3]\n\t" FMLA_RI "%[c6i].4s,v11.4s,v8.s[2]\n\t"
- FMLA_II "%[c7r].4s,v11.4s,v9.s[1]\n\t" FMLA_RI "%[c7i].4s,v11.4s,v9.s[0]\n\t"
- FMLA_II "%[c8r].4s,v11.4s,v9.s[3]\n\t" FMLA_RI "%[c8i].4s,v11.4s,v9.s[2]\n\t"
- "b 4f\n\t"
- "3:\n\t"
- "fmov v3.d[1],x0; ldr d4,[%[sa],#-32]\n\t"
- FMLA_RR "%[c1r].4s,v0.4s,v2.s[0]; ldr x0,[%[sa],#-24]\n\t"
- FMLA_IR "%[c1i].4s,v0.4s,v2.s[1]\n\t"
- FMLA_RR "%[c2r].4s,v0.4s,v2.s[2]\n\t"
- "fmov v4.d[1],x0; ldr d5,[%[sa],#-16]\n\t"
- FMLA_IR "%[c2i].4s,v0.4s,v2.s[3]; ldr x0,[%[sa],#-8]\n\t"
- FMLA_RR "%[c3r].4s,v0.4s,v3.s[0]\n\t"
- FMLA_IR "%[c3i].4s,v0.4s,v3.s[1]\n\t"
- "fmov v5.d[1],x0; fmov d1,x2\n\t"
- FMLA_RR "%[c4r].4s,v0.4s,v3.s[2]\n\t"
- FMLA_IR "%[c4i].4s,v0.4s,v3.s[3]\n\t"
- FMLA_RR "%[c5r].4s,v0.4s,v4.s[0]\n\t"
- "fmov v1.d[1],x4\n\t"
- FMLA_IR "%[c5i].4s,v0.4s,v4.s[1]; sub %[K],%[K],#1\n\t"
- FMLA_RR "%[c6r].4s,v0.4s,v4.s[2]\n\t" FMLA_IR "%[c6i].4s,v0.4s,v4.s[3]\n\t"
- FMLA_RR "%[c7r].4s,v0.4s,v5.s[0]\n\t" FMLA_IR "%[c7i].4s,v0.4s,v5.s[1]\n\t"
- FMLA_RR "%[c8r].4s,v0.4s,v5.s[2]\n\t" FMLA_IR "%[c8i].4s,v0.4s,v5.s[3]\n\t"
- FMLA_II "%[c1r].4s,v1.4s,v2.s[1]\n\t" FMLA_RI "%[c1i].4s,v1.4s,v2.s[0]\n\t"
- FMLA_II "%[c2r].4s,v1.4s,v2.s[3]\n\t" FMLA_RI "%[c2i].4s,v1.4s,v2.s[2]\n\t"
- FMLA_II "%[c3r].4s,v1.4s,v3.s[1]\n\t" FMLA_RI "%[c3i].4s,v1.4s,v3.s[0]\n\t"
- FMLA_II "%[c4r].4s,v1.4s,v3.s[3]\n\t" FMLA_RI "%[c4i].4s,v1.4s,v3.s[2]\n\t"
- FMLA_II "%[c5r].4s,v1.4s,v4.s[1]\n\t" FMLA_RI "%[c5i].4s,v1.4s,v4.s[0]\n\t"
- FMLA_II "%[c6r].4s,v1.4s,v4.s[3]\n\t" FMLA_RI "%[c6i].4s,v1.4s,v4.s[2]\n\t"
- FMLA_II "%[c7r].4s,v1.4s,v5.s[1]\n\t" FMLA_RI "%[c7i].4s,v1.4s,v5.s[0]\n\t"
- FMLA_II "%[c8r].4s,v1.4s,v5.s[3]\n\t" FMLA_RI "%[c8i].4s,v1.4s,v5.s[2]\n\t"
- "4:\n\t"
- "mov %[c_pref],%[C]\n\t"
- "zip1 v0.4s,%[c1r].4s,%[c2r].4s; prfm pstl1keep,[%[c_pref]]\n\t"
- "zip1 v4.4s,%[c1i].4s,%[c2i].4s; prfm pstl1keep,[%[c_pref],#64]\n\t"
- "zip1 v1.4s,%[c3r].4s,%[c4r].4s; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t"
- "zip1 v5.4s,%[c3i].4s,%[c4i].4s; prfm pstl1keep,[%[c_pref]]\n\t"
- "zip2 v2.4s,%[c1r].4s,%[c2r].4s; prfm pstl1keep,[%[c_pref],#64]\n\t"
- "zip2 v6.4s,%[c1i].4s,%[c2i].4s; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t"
- "zip2 v3.4s,%[c3r].4s,%[c4r].4s; prfm pstl1keep,[%[c_pref]]\n\t"
- "zip2 v7.4s,%[c3i].4s,%[c4i].4s; prfm pstl1keep,[%[c_pref],#64]\n\t"
- "zip1 %[c1r].2d,v0.2d,v1.2d; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t"
- "zip1 %[c1i].2d,v4.2d,v5.2d; prfm pstl1keep,[%[c_pref]]\n\t"
- "zip2 %[c2r].2d,v0.2d,v1.2d; prfm pstl1keep,[%[c_pref],#64]\n\t"
- "zip2 %[c2i].2d,v4.2d,v5.2d\n\t"
- "zip1 %[c3r].2d,v2.2d,v3.2d; zip1 %[c3i].2d,v6.2d,v7.2d\n\t"
- "zip2 %[c4r].2d,v2.2d,v3.2d; zip2 %[c4i].2d,v6.2d,v7.2d\n\t"
- "zip1 v0.4s,%[c5r].4s,%[c6r].4s; zip1 v4.4s,%[c5i].4s,%[c6i].4s\n\t"
- "zip1 v1.4s,%[c7r].4s,%[c8r].4s; zip1 v5.4s,%[c7i].4s,%[c8i].4s\n\t"
- "zip2 v2.4s,%[c5r].4s,%[c6r].4s; zip2 v6.4s,%[c5i].4s,%[c6i].4s\n\t"
- "zip2 v3.4s,%[c7r].4s,%[c8r].4s; zip2 v7.4s,%[c7i].4s,%[c8i].4s\n\t"
- "zip1 %[c5r].2d,v0.2d,v1.2d; zip1 %[c5i].2d,v4.2d,v5.2d\n\t"
- "zip2 %[c6r].2d,v0.2d,v1.2d; zip2 %[c6i].2d,v4.2d,v5.2d\n\t"
- "zip1 %[c7r].2d,v2.2d,v3.2d; zip1 %[c7i].2d,v6.2d,v7.2d\n\t"
- "zip2 %[c8r].2d,v2.2d,v3.2d; zip2 %[c8i].2d,v6.2d,v7.2d\n\t"
- :[c1r]"=w"(c1r), [c1i]"=w"(c1i), [c2r]"=w"(c2r), [c2i]"=w"(c2i),
- [c3r]"=w"(c3r), [c3i]"=w"(c3i), [c4r]"=w"(c4r), [c4i]"=w"(c4i),
- [c5r]"=w"(c5r), [c5i]"=w"(c5i), [c6r]"=w"(c6r), [c6i]"=w"(c6i),
- [c7r]"=w"(c7r), [c7i]"=w"(c7i), [c8r]"=w"(c8r), [c8i]"=w"(c8i),
- [K]"+r"(K), [sa]"+r"(sa), [sb]"+r"(sb), [c_pref]"+r"(c_pref)
- :[C]"r"(C), [LDC]"r"(LDC)
- :"cc","memory","x0","x1","x2","x3","x4","x5","x6",
- "v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11");
-
- store_m8n1_contracted(C, c1r, c1i, c5r, c5i, alphar, alphai); C += LDC * 2;
- store_m8n1_contracted(C, c2r, c2i, c6r, c6i, alphar, alphai); C += LDC * 2;
- store_m8n1_contracted(C, c3r, c3i, c7r, c7i, alphar, alphai); C += LDC * 2;
- store_m8n1_contracted(C, c4r, c4i, c8r, c8i, alphar, alphai);
- }
-
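- /* "Expanded" 2x2 accumulation: a holds 2 complex elements of A, b holds 2
-  * complex elements of B. acc.val[0]/val[1] accumulate a scaled by the real and
-  * imaginary part of b[0], acc.val[2]/val[3] likewise for b[1]; the complex
-  * contraction (and any conjugation) is deferred to store time. */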
- static inline float32x4x4_t acc_expanded_m2n2(float32x4x4_t acc,
- float32x4_t a, float32x4_t b) {
-
- acc.val[0] = vfmaq_laneq_f32(acc.val[0], a, b, 0);
- acc.val[1] = vfmaq_laneq_f32(acc.val[1], a, b, 1);
- acc.val[2] = vfmaq_laneq_f32(acc.val[2], a, b, 2);
- acc.val[3] = vfmaq_laneq_f32(acc.val[3], a, b, 3);
- return acc;
- }
-
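- /* Build the four alpha vectors used by store_expanded_m2n2() to contract the
-  * expanded products. The sign patterns differ per conjugation variant of the
-  * kernel (no conjugation, conjugated B, conjugated A, or both). */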
- static inline float32x4x4_t expand_alpha(float alphar, float alphai) {
- float32x4x4_t ret;
- const float maskp[] = { -1, 1, -1, 1 };
- const float maskn[] = { 1, -1, 1, -1 };
- const float32x4_t vrevp = vld1q_f32(maskp);
- const float32x4_t vrevn = vld1q_f32(maskn);
- #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
- ret.val[0] = vdupq_n_f32(alphar);
- ret.val[1] = vdupq_n_f32(-alphai);
- ret.val[2] = vmulq_f32(ret.val[1], vrevn);
- ret.val[3] = vmulq_f32(ret.val[0], vrevp);
- #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
- ret.val[0] = vdupq_n_f32(alphar);
- ret.val[1] = vdupq_n_f32(alphai);
- ret.val[2] = vmulq_f32(ret.val[1], vrevp);
- ret.val[3] = vmulq_f32(ret.val[0], vrevn);
- #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
- ret.val[2] = vdupq_n_f32(alphai);
- ret.val[3] = vdupq_n_f32(alphar);
- ret.val[0] = vmulq_f32(ret.val[3], vrevn);
- ret.val[1] = vmulq_f32(ret.val[2], vrevp);
- #else
- ret.val[2] = vdupq_n_f32(alphai);
- ret.val[3] = vdupq_n_f32(-alphar);
- ret.val[0] = vmulq_f32(ret.val[3], vrevp);
- ret.val[1] = vmulq_f32(ret.val[2], vrevn);
- #endif
- return ret;
- }
-
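- /* Contract a 2x2 expanded accumulator into two columns of C (at C and
-  * C + LDC*2), applying alpha. vrev64q swaps each real lane with its
-  * neighbouring imaginary lane to form the cross terms. */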
- static inline void store_expanded_m2n2(float *C, BLASLONG LDC,
- float32x4x4_t acc, float32x4x4_t expanded_alpha) {
-
- float32x4_t ld1 = vld1q_f32(C), ld2 = vld1q_f32(C + LDC * 2);
- ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[0]);
- ld2 = vfmaq_f32(ld2, acc.val[2], expanded_alpha.val[0]);
- acc.val[0] = vrev64q_f32(acc.val[0]);
- acc.val[2] = vrev64q_f32(acc.val[2]);
- ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[1]);
- ld2 = vfmaq_f32(ld2, acc.val[3], expanded_alpha.val[1]);
- acc.val[1] = vrev64q_f32(acc.val[1]);
- acc.val[3] = vrev64q_f32(acc.val[3]);
- ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[2]);
- ld2 = vfmaq_f32(ld2, acc.val[2], expanded_alpha.val[2]);
- ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[3]);
- ld2 = vfmaq_f32(ld2, acc.val[3], expanded_alpha.val[3]);
- vst1q_f32(C, ld1);
- vst1q_f32(C + LDC * 2, ld2);
- }
-
- static inline float32x4x4_t init_expanded_m2n2() {
- float32x4x4_t ret = {{ vdupq_n_f32(0), vdupq_n_f32(0),
- vdupq_n_f32(0), vdupq_n_f32(0) }};
- return ret;
- }
-
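- /* 4x4 kernel built from 2x2 expanded tiles: c1..c4 cover rows {0-1,2-3} x
-  * columns {0-1,2-3} of the block, with K unrolled by 2. */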
- static inline void kernel_4x4(const float *sa, const float *sb, float *C,
- float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
-
- float32x4x4_t c1, c2, c3, c4;
- c1 = c2 = c3 = c4 = init_expanded_m2n2();
-
- for (; K > 1; K -= 2) {
- float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4),
- a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16;
- float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4),
- b3 = vld1q_f32(sb + 8), b4 = vld1q_f32(sb + 12); sb += 16;
- c1 = acc_expanded_m2n2(c1, a1, b1);
- c2 = acc_expanded_m2n2(c2, a2, b1);
- c3 = acc_expanded_m2n2(c3, a1, b2);
- c4 = acc_expanded_m2n2(c4, a2, b2);
- c1 = acc_expanded_m2n2(c1, a3, b3);
- c2 = acc_expanded_m2n2(c2, a4, b3);
- c3 = acc_expanded_m2n2(c3, a3, b4);
- c4 = acc_expanded_m2n2(c4, a4, b4);
- }
- if (K) {
- float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4);
- float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4);
- c1 = acc_expanded_m2n2(c1, a1, b1);
- c2 = acc_expanded_m2n2(c2, a2, b1);
- c3 = acc_expanded_m2n2(c3, a1, b2);
- c4 = acc_expanded_m2n2(c4, a2, b2);
- }
-
- float32x4x4_t e_alpha = expand_alpha(alphar, alphai);
- store_expanded_m2n2(C, LDC, c1, e_alpha);
- store_expanded_m2n2(C + 4, LDC, c2, e_alpha);
- C += LDC * 4;
- store_expanded_m2n2(C, LDC, c3, e_alpha);
- store_expanded_m2n2(C + 4, LDC, c4, e_alpha);
- }
-
- static inline void kernel_8x2(const float *sa, const float *sb, float *C,
- float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
-
- float32x4x4_t c1, c2, c3, c4;
- c1 = c2 = c3 = c4 = init_expanded_m2n2();
-
- for (; K > 1; K -= 2) {
- float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4);
- float32x4_t a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12);
- float32x4_t a5 = vld1q_f32(sa + 16), a6 = vld1q_f32(sa + 20);
- float32x4_t a7 = vld1q_f32(sa + 24), a8 = vld1q_f32(sa + 28); sa += 32;
- float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); sb += 8;
- c1 = acc_expanded_m2n2(c1, a1, b1);
- c2 = acc_expanded_m2n2(c2, a2, b1);
- c3 = acc_expanded_m2n2(c3, a3, b1);
- c4 = acc_expanded_m2n2(c4, a4, b1);
- c1 = acc_expanded_m2n2(c1, a5, b2);
- c2 = acc_expanded_m2n2(c2, a6, b2);
- c3 = acc_expanded_m2n2(c3, a7, b2);
- c4 = acc_expanded_m2n2(c4, a8, b2);
- }
- if (K) {
- float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4);
- float32x4_t a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12);
- float32x4_t b1 = vld1q_f32(sb);
- c1 = acc_expanded_m2n2(c1, a1, b1);
- c2 = acc_expanded_m2n2(c2, a2, b1);
- c3 = acc_expanded_m2n2(c3, a3, b1);
- c4 = acc_expanded_m2n2(c4, a4, b1);
- }
-
- float32x4x4_t e_alpha = expand_alpha(alphar, alphai);
- store_expanded_m2n2(C, LDC, c1, e_alpha);
- store_expanded_m2n2(C + 4, LDC, c2, e_alpha);
- store_expanded_m2n2(C + 8, LDC, c3, e_alpha);
- store_expanded_m2n2(C + 12, LDC, c4, e_alpha);
- }
-
- static inline void kernel_4x2(const float *sa, const float *sb, float *C,
- float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
-
- float32x4x4_t c1, c2;
- c1 = c2 = init_expanded_m2n2();
-
- for (; K > 1; K -= 2) {
- float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4);
- float32x4_t a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16;
- float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); sb += 8;
- c1 = acc_expanded_m2n2(c1, a1, b1);
- c2 = acc_expanded_m2n2(c2, a2, b1);
- c1 = acc_expanded_m2n2(c1, a3, b2);
- c2 = acc_expanded_m2n2(c2, a4, b2);
- }
- if (K) {
- float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4);
- float32x4_t b1 = vld1q_f32(sb);
- c1 = acc_expanded_m2n2(c1, a1, b1);
- c2 = acc_expanded_m2n2(c2, a2, b1);
- }
-
- float32x4x4_t e_alpha = expand_alpha(alphar, alphai);
- store_expanded_m2n2(C, LDC, c1, e_alpha);
- store_expanded_m2n2(C + 4, LDC, c2, e_alpha);
- }
-
- static inline void kernel_2x4(const float *sa, const float *sb, float *C,
- float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
-
- float32x4x4_t c1, c2;
- c1 = c2 = init_expanded_m2n2();
-
- for (; K > 1; K -= 2) {
- float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); sa += 8;
- float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4);
- float32x4_t b3 = vld1q_f32(sb + 8), b4 = vld1q_f32(sb + 12); sb += 16;
- c1 = acc_expanded_m2n2(c1, a1, b1);
- c2 = acc_expanded_m2n2(c2, a1, b2);
- c1 = acc_expanded_m2n2(c1, a2, b3);
- c2 = acc_expanded_m2n2(c2, a2, b4);
- }
- if (K) {
- float32x4_t a1 = vld1q_f32(sa);
- float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4);
- c1 = acc_expanded_m2n2(c1, a1, b1);
- c2 = acc_expanded_m2n2(c2, a1, b2);
- }
-
- float32x4x4_t e_alpha = expand_alpha(alphar, alphai);
- store_expanded_m2n2(C, LDC, c1, e_alpha);
- store_expanded_m2n2(C + LDC * 4, LDC, c2, e_alpha);
- }
-
- static inline void kernel_2x2(const float *sa, const float *sb, float *C,
- float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
-
- float32x4x4_t c1, c2;
- c1 = c2 = init_expanded_m2n2();
-
- for (; K > 1; K -= 2) {
- float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); sa += 8;
- float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); sb += 8;
- c1 = acc_expanded_m2n2(c1, a1, b1);
- c2 = acc_expanded_m2n2(c2, a2, b2);
- }
- c1.val[0] = vaddq_f32(c1.val[0], c2.val[0]);
- c1.val[1] = vaddq_f32(c1.val[1], c2.val[1]);
- c1.val[2] = vaddq_f32(c1.val[2], c2.val[2]);
- c1.val[3] = vaddq_f32(c1.val[3], c2.val[3]);
- if (K) {
- float32x4_t a1 = vld1q_f32(sa);
- float32x4_t b1 = vld1q_f32(sb);
- c1 = acc_expanded_m2n2(c1, a1, b1);
- }
-
- store_expanded_m2n2(C, LDC, c1, expand_alpha(alphar, alphai));
- }
-
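- /* Expanded accumulation of 2 complex elements of A against a single complex
-  * element of B; contraction happens in store_expanded_m2n1(). */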
- static inline float32x4x2_t acc_expanded_m2n1(float32x4x2_t acc,
- float32x4_t a, float32x2_t b) {
-
- acc.val[0] = vfmaq_lane_f32(acc.val[0], a, b, 0);
- acc.val[1] = vfmaq_lane_f32(acc.val[1], a, b, 1);
- return acc;
- }
-
- static inline void store_expanded_m2n1(float *C,
- float32x4x2_t acc, float32x4x4_t expanded_alpha) {
-
- float32x4_t ld1 = vld1q_f32(C);
- ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[0]);
- acc.val[0] = vrev64q_f32(acc.val[0]);
- ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[1]);
- acc.val[1] = vrev64q_f32(acc.val[1]);
- ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[2]);
- ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[3]);
- vst1q_f32(C, ld1);
- }
-
- static inline float32x4x2_t init_expanded_m2n1() {
- float32x4x2_t ret = {{ vdupq_n_f32(0), vdupq_n_f32(0) }};
- return ret;
- }
-
- static inline void kernel_8x1(const float *sa, const float *sb, float *C,
- float alphar, float alphai, BLASLONG K) {
-
- float32x4x2_t c1, c2, c3, c4;
- c1 = c2 = c3 = c4 = init_expanded_m2n1();
-
- for (; K > 1; K -= 2) {
- float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4),
- a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12),
- a5 = vld1q_f32(sa + 16), a6 = vld1q_f32(sa + 20),
- a7 = vld1q_f32(sa + 24), a8 = vld1q_f32(sa + 28); sa += 32;
- float32x2_t b1 = vld1_f32(sb), b2 = vld1_f32(sb + 2); sb += 4;
- c1 = acc_expanded_m2n1(c1, a1, b1);
- c2 = acc_expanded_m2n1(c2, a2, b1);
- c3 = acc_expanded_m2n1(c3, a3, b1);
- c4 = acc_expanded_m2n1(c4, a4, b1);
- c1 = acc_expanded_m2n1(c1, a5, b2);
- c2 = acc_expanded_m2n1(c2, a6, b2);
- c3 = acc_expanded_m2n1(c3, a7, b2);
- c4 = acc_expanded_m2n1(c4, a8, b2);
- }
- if (K) {
- float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4),
- a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12);
- float32x2_t b1 = vld1_f32(sb);
- c1 = acc_expanded_m2n1(c1, a1, b1);
- c2 = acc_expanded_m2n1(c2, a2, b1);
- c3 = acc_expanded_m2n1(c3, a3, b1);
- c4 = acc_expanded_m2n1(c4, a4, b1);
- }
-
- float32x4x4_t expanded_alpha = expand_alpha(alphar, alphai);
- store_expanded_m2n1(C, c1, expanded_alpha);
- store_expanded_m2n1(C + 4, c2, expanded_alpha);
- store_expanded_m2n1(C + 8, c3, expanded_alpha);
- store_expanded_m2n1(C + 12, c4, expanded_alpha);
- }
-
- static inline void kernel_4x1(const float *sa, const float *sb, float *C,
- float alphar, float alphai, BLASLONG K) {
-
- float32x4x2_t c1, c2, c3, c4;
- c1 = c2 = c3 = c4 = init_expanded_m2n1();
-
- for (; K > 1; K -= 2) {
- float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4),
- a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16;
- float32x2_t b1 = vld1_f32(sb), b2 = vld1_f32(sb + 2); sb += 4;
- c1 = acc_expanded_m2n1(c1, a1, b1);
- c2 = acc_expanded_m2n1(c2, a2, b1);
- c3 = acc_expanded_m2n1(c3, a3, b2);
- c4 = acc_expanded_m2n1(c4, a4, b2);
- }
- c1.val[0] = vaddq_f32(c1.val[0], c3.val[0]);
- c1.val[1] = vaddq_f32(c1.val[1], c3.val[1]);
- c2.val[0] = vaddq_f32(c2.val[0], c4.val[0]);
- c2.val[1] = vaddq_f32(c2.val[1], c4.val[1]);
- if (K) {
- float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4);
- float32x2_t b1 = vld1_f32(sb);
- c1 = acc_expanded_m2n1(c1, a1, b1);
- c2 = acc_expanded_m2n1(c2, a2, b1);
- }
-
- float32x4x4_t expanded_alpha = expand_alpha(alphar, alphai);
- store_expanded_m2n1(C, c1, expanded_alpha);
- store_expanded_m2n1(C + 4, c2, expanded_alpha);
- }
-
- static inline void kernel_2x1(const float *sa, const float *sb, float *C,
- float alphar, float alphai, BLASLONG K) {
-
- float32x4x2_t c1, c2, c3, c4;
- c1 = c2 = c3 = c4 = init_expanded_m2n1();
-
- for (; K > 3; K -= 4) {
- float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4),
- a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16;
- float32x2_t b1 = vld1_f32(sb), b2 = vld1_f32(sb + 2),
- b3 = vld1_f32(sb + 4), b4 = vld1_f32(sb + 6); sb += 8;
- c1 = acc_expanded_m2n1(c1, a1, b1);
- c2 = acc_expanded_m2n1(c2, a2, b2);
- c3 = acc_expanded_m2n1(c3, a3, b3);
- c4 = acc_expanded_m2n1(c4, a4, b4);
- }
- c1.val[0] = vaddq_f32(c1.val[0], c3.val[0]);
- c1.val[1] = vaddq_f32(c1.val[1], c3.val[1]);
- c2.val[0] = vaddq_f32(c2.val[0], c4.val[0]);
- c2.val[1] = vaddq_f32(c2.val[1], c4.val[1]);
- c1.val[0] = vaddq_f32(c1.val[0], c2.val[0]);
- c1.val[1] = vaddq_f32(c1.val[1], c2.val[1]);
- for (; K; K--) {
- float32x4_t a1 = vld1q_f32(sa); sa += 4;
- float32x2_t b1 = vld1_f32(sb); sb += 2;
- c1 = acc_expanded_m2n1(c1, a1, b1);
- }
-
- float32x4x4_t expanded_alpha = expand_alpha(alphar, alphai);
- store_expanded_m2n1(C, c1, expanded_alpha);
- }
-
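- /* 64-bit (2-float) variant of expand_alpha(), used by store_expanded_m1n1(). */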
- static inline float32x2x4_t expand_alpha_d(float alphar, float alphai) {
- float32x2x4_t ret;
- const float maskp[] = { -1, 1 };
- const float maskn[] = { 1, -1 };
- const float32x2_t vrevp = vld1_f32(maskp);
- const float32x2_t vrevn = vld1_f32(maskn);
- #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
- ret.val[0] = vdup_n_f32(alphar);
- ret.val[1] = vdup_n_f32(-alphai);
- ret.val[2] = vmul_f32(ret.val[1], vrevn);
- ret.val[3] = vmul_f32(ret.val[0], vrevp);
- #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
- ret.val[0] = vdup_n_f32(alphar);
- ret.val[1] = vdup_n_f32(alphai);
- ret.val[2] = vmul_f32(ret.val[1], vrevp);
- ret.val[3] = vmul_f32(ret.val[0], vrevn);
- #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
- ret.val[2] = vdup_n_f32(alphai);
- ret.val[3] = vdup_n_f32(alphar);
- ret.val[0] = vmul_f32(ret.val[3], vrevn);
- ret.val[1] = vmul_f32(ret.val[2], vrevp);
- #else
- ret.val[2] = vdup_n_f32(alphai);
- ret.val[3] = vdup_n_f32(-alphar);
- ret.val[0] = vmul_f32(ret.val[3], vrevp);
- ret.val[1] = vmul_f32(ret.val[2], vrevn);
- #endif
- return ret;
- }
-
- static inline float32x2x2_t acc_expanded_m1n1(float32x2x2_t acc,
- float32x2_t a, float32x2_t b) {
-
- acc.val[0] = vfma_lane_f32(acc.val[0], a, b, 0);
- acc.val[1] = vfma_lane_f32(acc.val[1], a, b, 1);
- return acc;
- }
-
- static inline void store_expanded_m1n1(float *C,
- float32x2x2_t acc, float32x2x4_t expanded_alpha) {
-
- float32x2_t ld1 = vld1_f32(C);
- ld1 = vfma_f32(ld1, acc.val[0], expanded_alpha.val[0]);
- acc.val[0] = vrev64_f32(acc.val[0]);
- ld1 = vfma_f32(ld1, acc.val[1], expanded_alpha.val[1]);
- acc.val[1] = vrev64_f32(acc.val[1]);
- ld1 = vfma_f32(ld1, acc.val[0], expanded_alpha.val[2]);
- ld1 = vfma_f32(ld1, acc.val[1], expanded_alpha.val[3]);
- vst1_f32(C, ld1);
- }
-
- static inline float32x2x2_t init_expanded_m1n1() {
- float32x2x2_t ret = {{ vdup_n_f32(0), vdup_n_f32(0) }};
- return ret;
- }
-
- static inline void kernel_1x4(const float *sa, const float *sb, float *C,
- float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
-
- float32x2x2_t c1, c2, c3, c4;
- c1 = c2 = c3 = c4 = init_expanded_m1n1();
-
- for (; K; K--) {
- float32x2_t a1 = vld1_f32(sa); sa += 2;
- c1 = acc_expanded_m1n1(c1, a1, vld1_f32(sb));
- c2 = acc_expanded_m1n1(c2, a1, vld1_f32(sb + 2));
- c3 = acc_expanded_m1n1(c3, a1, vld1_f32(sb + 4));
- c4 = acc_expanded_m1n1(c4, a1, vld1_f32(sb + 6));
- sb += 8;
- }
-
- float32x2x4_t expanded_alpha = expand_alpha_d(alphar, alphai);
- store_expanded_m1n1(C, c1, expanded_alpha); C += LDC * 2;
- store_expanded_m1n1(C, c2, expanded_alpha); C += LDC * 2;
- store_expanded_m1n1(C, c3, expanded_alpha); C += LDC * 2;
- store_expanded_m1n1(C, c4, expanded_alpha);
- }
-
- static inline void kernel_1x2(const float *sa, const float *sb, float *C,
- float alphar, float alphai, BLASLONG K, BLASLONG LDC) {
-
- float32x2x2_t c1, c2, c3, c4;
- c1 = c2 = c3 = c4 = init_expanded_m1n1();
-
- for (; K > 1; K -= 2) {
- float32x2_t a1 = vld1_f32(sa), a2 = vld1_f32(sa + 2); sa += 4;
- c1 = acc_expanded_m1n1(c1, a1, vld1_f32(sb));
- c2 = acc_expanded_m1n1(c2, a1, vld1_f32(sb + 2));
- c3 = acc_expanded_m1n1(c3, a2, vld1_f32(sb + 4));
- c4 = acc_expanded_m1n1(c4, a2, vld1_f32(sb + 6));
- sb += 8;
- }
- c1.val[0] = vadd_f32(c1.val[0], c3.val[0]);
- c1.val[1] = vadd_f32(c1.val[1], c3.val[1]);
- c2.val[0] = vadd_f32(c2.val[0], c4.val[0]);
- c2.val[1] = vadd_f32(c2.val[1], c4.val[1]);
- if (K) {
- float32x2_t a1 = vld1_f32(sa);
- c1 = acc_expanded_m1n1(c1, a1, vld1_f32(sb));
- c2 = acc_expanded_m1n1(c2, a1, vld1_f32(sb + 2));
- }
-
- float32x2x4_t expanded_alpha = expand_alpha_d(alphar, alphai);
- store_expanded_m1n1(C, c1, expanded_alpha); C += LDC * 2;
- store_expanded_m1n1(C, c2, expanded_alpha);
- }
-
- static inline void kernel_1x1(const float *sa, const float *sb, float *C,
- float alphar, float alphai, BLASLONG K) {
-
- float32x2x2_t c1, c2, c3, c4;
- c1 = c2 = c3 = c4 = init_expanded_m1n1();
-
- for (; K > 3; K -= 4) {
- c1 = acc_expanded_m1n1(c1, vld1_f32(sa), vld1_f32(sb));
- c2 = acc_expanded_m1n1(c2, vld1_f32(sa + 2), vld1_f32(sb + 2));
- c3 = acc_expanded_m1n1(c3, vld1_f32(sa + 4), vld1_f32(sb + 4));
- c4 = acc_expanded_m1n1(c4, vld1_f32(sa + 6), vld1_f32(sb + 6));
- sa += 8; sb += 8;
- }
- c1.val[0] = vadd_f32(c1.val[0], c3.val[0]);
- c1.val[1] = vadd_f32(c1.val[1], c3.val[1]);
- c2.val[0] = vadd_f32(c2.val[0], c4.val[0]);
- c2.val[1] = vadd_f32(c2.val[1], c4.val[1]);
- c1.val[0] = vadd_f32(c1.val[0], c2.val[0]);
- c1.val[1] = vadd_f32(c1.val[1], c2.val[1]);
- for (; K; K--) {
- c1 = acc_expanded_m1n1(c1, vld1_f32(sa), vld1_f32(sb));
- sa += 2; sb += 2;
- }
-
- store_expanded_m1n1(C, c1, expand_alpha_d(alphar, alphai));
- }
-
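- /* Driver: tile the output into column blocks of 8 (handled as two adjacent
-  * 4-column panels of packed B), then 4, 2 and 1, and row blocks of 8/4/2/1,
-  * dispatching to the matching micro-kernel. sa and sb point to packed A and B,
-  * two floats per complex element. */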
- int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai,
- FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG LDC) {
-
- BLASLONG n_left = N;
- for (; n_left >= 8; n_left -= 8) {
- const FLOAT *a_ = sa;
- FLOAT *c1_ = C;
- FLOAT *c2_ = C + LDC * 8;
- const FLOAT *b1_ = sb;
- const FLOAT *b2_ = sb + K * 8;
- BLASLONG m_left = M;
- for (; m_left >= 8; m_left -= 8) {
- kernel_8x4(a_, b1_, c1_, alphar, alphai, K, LDC);
- kernel_8x4(a_, b2_, c2_, alphar, alphai, K, LDC);
- a_ += 16 * K;
- c1_ += 16;
- c2_ += 16;
- }
- if (m_left >= 4) {
- m_left -= 4;
- kernel_4x4(a_, b1_, c1_, alphar, alphai, K, LDC);
- kernel_4x4(a_, b2_, c2_, alphar, alphai, K, LDC);
- a_ += 8 * K;
- c1_ += 8;
- c2_ += 8;
- }
- if (m_left >= 2) {
- m_left -= 2;
- kernel_2x4(a_, b1_, c1_, alphar, alphai, K, LDC);
- kernel_2x4(a_, b2_, c2_, alphar, alphai, K, LDC);
- a_ += 4 * K;
- c1_ += 4;
- c2_ += 4;
- }
- if (m_left) {
- kernel_1x4(a_, b1_, c1_, alphar, alphai, K, LDC);
- kernel_1x4(a_, b2_, c2_, alphar, alphai, K, LDC);
- }
- C += 16 * LDC;
- sb += 16 * K;
- }
-
- if (n_left >= 4) {
- n_left -= 4;
- const FLOAT *a_ = sa;
- FLOAT *c_ = C;
- BLASLONG m_left = M;
- for (; m_left >= 8; m_left -= 8) {
- kernel_8x4(a_, sb, c_, alphar, alphai, K, LDC);
- a_ += 16 * K;
- c_ += 16;
- }
- if (m_left >= 4) {
- m_left -= 4;
- kernel_4x4(a_, sb, c_, alphar, alphai, K, LDC);
- a_ += 8 * K;
- c_ += 8;
- }
- if (m_left >= 2) {
- m_left -= 2;
- kernel_2x4(a_, sb, c_, alphar, alphai, K, LDC);
- a_ += 4 * K;
- c_ += 4;
- }
- if (m_left) {
- kernel_1x4(a_, sb, c_, alphar, alphai, K, LDC);
- }
- C += 8 * LDC;
- sb += 8 * K;
- }
-
- if (n_left >= 2) {
- n_left -= 2;
- const FLOAT *a_ = sa;
- FLOAT *c_ = C;
- BLASLONG m_left = M;
- for (; m_left >= 8; m_left -= 8) {
- kernel_8x2(a_, sb, c_, alphar, alphai, K, LDC);
- a_ += 16 * K;
- c_ += 16;
- }
- if (m_left >= 4) {
- m_left -= 4;
- kernel_4x2(a_, sb, c_, alphar, alphai, K, LDC);
- a_ += 8 * K;
- c_ += 8;
- }
- if (m_left >= 2) {
- m_left -= 2;
- kernel_2x2(a_, sb, c_, alphar, alphai, K, LDC);
- a_ += 4 * K;
- c_ += 4;
- }
- if (m_left) {
- kernel_1x2(a_, sb, c_, alphar, alphai, K, LDC);
- }
- C += 4 * LDC;
- sb += 4 * K;
- }
-
- if (n_left) {
- BLASLONG m_left = M;
- for (; m_left >= 8; m_left -= 8) {
- kernel_8x1(sa, sb, C, alphar, alphai, K);
- sa += 16 * K;
- C += 16;
- }
- if (m_left >= 4) {
- m_left -= 4;
- kernel_4x1(sa, sb, C, alphar, alphai, K);
- sa += 8 * K;
- C += 8;
- }
- if (m_left >= 2) {
- m_left -= 2;
- kernel_2x1(sa, sb, C, alphar, alphai, K);
- sa += 4 * K;
- C += 4;
- }
- if (m_left) {
- kernel_1x1(sa, sb, C, alphar, alphai, K);
- }
- }
- return 0;
- }
-