
strsm_kernel_LN_8x8_msa.c 45 kB

/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#include "common.h"
#include "macros_msa.h"
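
/* Solver kernels for the LN variant of single-precision TRSM on MIPS MSA.
   Each ssolve_MxN_ln_msa() routine first subtracts the products accumulated
   over the remaining bk columns (the GEMM-style update), then back-substitutes
   against the packed MxM diagonal block of A, working from the last row of the
   block up to the first.  The code multiplies by the diagonal entries instead
   of dividing, which matches the usual OpenBLAS convention of storing the
   reciprocals of the diagonal in the packed TRSM blocks.  The solved values
   are written back both to the packed B panel and to C. */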
static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    BLASLONG k;
    FLOAT *aa = a, *bb = b;
    v4f32 src_b, src_b0, src_b1, src_b2, src_b3, src_a1;
    v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
    v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
    v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
    v4f32 res_c8, res_c9, res_c10, res_c11, res_c12, res_c13, res_c14, res_c15;
    v4f32 src_a, src_a0, src_a8, src_a9, src_a16, src_a17, src_a18, src_a24;
    v4f32 src_a25, src_a26, src_a27, src_a32, src_a33, src_a34, src_a35, src_a36;
    v4f32 src_a40, src_a41, src_a42, src_a43, src_a44, src_a45;
    v4f32 src_a48, src_a49, src_a50, src_a51, src_a52, src_a53, src_a54;
    v4f32 src_a56, src_a57, src_a58, src_a59, src_a60, src_a61, src_a62, src_a63;
    FLOAT *c_nxt1line = c + ldc;
    FLOAT *c_nxt2line = c + 2 * ldc;
    FLOAT *c_nxt3line = c + 3 * ldc;
    FLOAT *c_nxt4line = c + 4 * ldc;
    FLOAT *c_nxt5line = c + 5 * ldc;
    FLOAT *c_nxt6line = c + 6 * ldc;
    FLOAT *c_nxt7line = c + 7 * ldc;

    LD_SP2(c, 4, src_c0, src_c1);
    LD_SP2(c_nxt1line, 4, src_c2, src_c3);
    LD_SP2(c_nxt2line, 4, src_c4, src_c5);
    LD_SP2(c_nxt3line, 4, src_c6, src_c7);
    LD_SP2(c_nxt4line, 4, src_c8, src_c9);
    LD_SP2(c_nxt5line, 4, src_c10, src_c11);
    LD_SP2(c_nxt6line, 4, src_c12, src_c13);
    LD_SP2(c_nxt7line, 4, src_c14, src_c15);

    for (k = 0; k < bk; k++)
    {
        LD_SP2(aa, 4, src_a0, src_a1);

        src_b = LD_SP(bb + 0);
        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
        src_c0 -= src_a0 * src_b0;
        src_c1 -= src_a1 * src_b0;
        src_c2 -= src_a0 * src_b1;
        src_c3 -= src_a1 * src_b1;
        src_c4 -= src_a0 * src_b2;
        src_c5 -= src_a1 * src_b2;
        src_c6 -= src_a0 * src_b3;
        src_c7 -= src_a1 * src_b3;

        src_b = LD_SP(bb + 4);
        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
        src_c8 -= src_a0 * src_b0;
        src_c9 -= src_a1 * src_b0;
        src_c10 -= src_a0 * src_b1;
        src_c11 -= src_a1 * src_b1;
        src_c12 -= src_a0 * src_b2;
        src_c13 -= src_a1 * src_b2;
        src_c14 -= src_a0 * src_b3;
        src_c15 -= src_a1 * src_b3;

        aa += 8;
        bb += 8;
    }

    a -= 64;
    b -= 64;

    TRANSPOSE4x4_SP_SP(src_c1, src_c3, src_c5, src_c7,
                       res_c4, res_c5, res_c6, res_c7);
    TRANSPOSE4x4_SP_SP(src_c9, src_c11, src_c13, src_c15,
                       res_c12, res_c13, res_c14, res_c15);
    TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6,
                       res_c0, res_c1, res_c2, res_c3);
    TRANSPOSE4x4_SP_SP(src_c8, src_c10, src_c12, src_c14,
                       res_c8, res_c9, res_c10, res_c11);

    src_a = LD_SP(a + 60);
    SPLATI_W4_SP(src_a, src_a60, src_a61, src_a62, src_a63);
    src_a = LD_SP(a + 56);
    SPLATI_W4_SP(src_a, src_a56, src_a57, src_a58, src_a59);

    res_c7 *= src_a63;
    res_c15 *= src_a63;
    res_c6 -= res_c7 * src_a62;
    res_c14 -= res_c15 * src_a62;
    res_c5 -= res_c7 * src_a61;
    res_c13 -= res_c15 * src_a61;
    res_c4 -= res_c7 * src_a60;
    res_c12 -= res_c15 * src_a60;
    res_c3 -= res_c7 * src_a59;
    res_c11 -= res_c15 * src_a59;
    res_c2 -= res_c7 * src_a58;
    res_c10 -= res_c15 * src_a58;
    res_c1 -= res_c7 * src_a57;
    res_c9 -= res_c15 * src_a57;
    res_c0 -= res_c7 * src_a56;
    res_c8 -= res_c15 * src_a56;

    src_a = LD_SP(a + 48);
    SPLATI_W4_SP(src_a, src_a48, src_a49, src_a50, src_a51);
    src_a52 = LD_SP(a + 52);
    src_a54 = (v4f32) __msa_splati_w((v4i32) src_a52, 2);
    src_a53 = (v4f32) __msa_splati_w((v4i32) src_a52, 1);
    src_a52 = (v4f32) __msa_splati_w((v4i32) src_a52, 0);

    res_c6 *= src_a54;
    res_c14 *= src_a54;
    res_c5 -= res_c6 * src_a53;
    res_c13 -= res_c14 * src_a53;
    res_c4 -= res_c6 * src_a52;
    res_c12 -= res_c14 * src_a52;
    res_c3 -= res_c6 * src_a51;
    res_c11 -= res_c14 * src_a51;
    res_c2 -= res_c6 * src_a50;
    res_c10 -= res_c14 * src_a50;
    res_c1 -= res_c6 * src_a49;
    res_c9 -= res_c14 * src_a49;
    res_c0 -= res_c6 * src_a48;
    res_c8 -= res_c14 * src_a48;

    src_a = LD_SP(a + 40);
    SPLATI_W4_SP(src_a, src_a40, src_a41, src_a42, src_a43);
    src_a44 = LD_SP(a + 44);
    src_a45 = (v4f32) __msa_splati_w((v4i32) src_a44, 1);
    src_a44 = (v4f32) __msa_splati_w((v4i32) src_a44, 0);

    res_c5 *= src_a45;
    res_c13 *= src_a45;
    res_c4 -= res_c5 * src_a44;
    res_c12 -= res_c13 * src_a44;
    res_c3 -= res_c5 * src_a43;
    res_c11 -= res_c13 * src_a43;
    res_c2 -= res_c5 * src_a42;
    res_c10 -= res_c13 * src_a42;
    res_c1 -= res_c5 * src_a41;
    res_c9 -= res_c13 * src_a41;
    res_c0 -= res_c5 * src_a40;
    res_c8 -= res_c13 * src_a40;

    src_a = LD_SP(a + 32);
    SPLATI_W4_SP(src_a, src_a32, src_a33, src_a34, src_a35);
    COPY_FLOAT_TO_VECTOR(*(a + 36), src_a36);

    res_c4 *= src_a36;
    res_c12 *= src_a36;
    res_c3 -= res_c4 * src_a35;
    res_c11 -= res_c12 * src_a35;
    res_c2 -= res_c4 * src_a34;
    res_c10 -= res_c12 * src_a34;
    res_c1 -= res_c4 * src_a33;
    res_c9 -= res_c12 * src_a33;
    res_c0 -= res_c4 * src_a32;
    res_c8 -= res_c12 * src_a32;

    ST_SP4(res_c4, res_c12, res_c5, res_c13, b + 32, 4);
    ST_SP4(res_c6, res_c14, res_c7, res_c15, b + 48, 4);

    TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7,
                       src_c1, src_c3, src_c5, src_c7);
    TRANSPOSE4x4_SP_SP(res_c12, res_c13, res_c14, res_c15,
                       src_c9, src_c11, src_c13, src_c15);

    ST_SP(src_c1, c + 4);
    ST_SP(src_c3, c_nxt1line + 4);
    ST_SP(src_c5, c_nxt2line + 4);
    ST_SP(src_c7, c_nxt3line + 4);
    ST_SP(src_c9, c_nxt4line + 4);
    ST_SP(src_c11, c_nxt5line + 4);
    ST_SP(src_c13, c_nxt6line + 4);
    ST_SP(src_c15, c_nxt7line + 4);

    src_a = LD_SP(a + 24);
    SPLATI_W4_SP(src_a, src_a24, src_a25, src_a26, src_a27);

    res_c3 *= src_a27;
    res_c11 *= src_a27;
    res_c2 -= res_c3 * src_a26;
    res_c10 -= res_c11 * src_a26;
    res_c1 -= res_c3 * src_a25;
    res_c9 -= res_c11 * src_a25;
    res_c0 -= res_c3 * src_a24;
    res_c8 -= res_c11 * src_a24;

    src_a16 = LD_SP(a + 16);
    src_a18 = (v4f32) __msa_splati_w((v4i32) src_a16, 2);
    src_a17 = (v4f32) __msa_splati_w((v4i32) src_a16, 1);
    src_a16 = (v4f32) __msa_splati_w((v4i32) src_a16, 0);

    res_c2 *= src_a18;
    res_c10 *= src_a18;
    res_c1 -= res_c2 * src_a17;
    res_c9 -= res_c10 * src_a17;
    res_c0 -= res_c2 * src_a16;
    res_c8 -= res_c10 * src_a16;

    COPY_FLOAT_TO_VECTOR(*(a + 9), src_a9);
    COPY_FLOAT_TO_VECTOR(*(a + 8), src_a8);
    COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0);

    res_c1 *= src_a9;
    res_c9 *= src_a9;
    res_c0 -= res_c1 * src_a8;
    res_c8 -= res_c9 * src_a8;
    res_c0 *= src_a0;
    res_c8 *= src_a0;

    ST_SP4(res_c0, res_c8, res_c1, res_c9, b, 4);
    ST_SP4(res_c2, res_c10, res_c3, res_c11, b + 16, 4);

    TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3,
                       src_c0, src_c2, src_c4, src_c6);
    TRANSPOSE4x4_SP_SP(res_c8, res_c9, res_c10, res_c11,
                       src_c8, src_c10, src_c12, src_c14);

    ST_SP(src_c0, c);
    ST_SP(src_c2, c_nxt1line);
    ST_SP(src_c4, c_nxt2line);
    ST_SP(src_c6, c_nxt3line);
    ST_SP(src_c8, c_nxt4line);
    ST_SP(src_c10, c_nxt5line);
    ST_SP(src_c12, c_nxt6line);
    ST_SP(src_c14, c_nxt7line);
}
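
/* The 8x4 kernel below follows the same update-then-solve pattern as the
   8x8 kernel, but unrolls the update loop by two iterations of bk and
   handles an odd trip count in the (bk & 1) tail. */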
static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    BLASLONG k;
    FLOAT *aa = a, *bb = b;
    v4f32 src_b, src_b0, src_b1, src_b2, src_b3, src_a1;
    v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
    v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
    v4f32 src_a, src_a0, src_a8, src_a9, src_a16, src_a17, src_a18, src_a24;
    v4f32 src_a25, src_a26, src_a27, src_a32, src_a33, src_a34, src_a35;
    v4f32 src_a36, src_a40, src_a41, src_a42, src_a43, src_a44, src_a45;
    v4f32 src_a48, src_a49, src_a50, src_a51, src_a52, src_a53, src_a54;
    v4f32 src_a56, src_a57, src_a58, src_a59, src_a60, src_a61, src_a62, src_a63;
    FLOAT *c_nxt1line = c + ldc;
    FLOAT *c_nxt2line = c + 2 * ldc;
    FLOAT *c_nxt3line = c + 3 * ldc;

    LD_SP2(c, 4, src_c0, src_c1);
    LD_SP2(c_nxt1line, 4, src_c2, src_c3);
    LD_SP2(c_nxt2line, 4, src_c4, src_c5);
    LD_SP2(c_nxt3line, 4, src_c6, src_c7);

    for (k = 0; k < (bk >> 1); k++)
    {
        LD_SP2(aa, 4, src_a0, src_a1);

        src_b = LD_SP(bb + 0);
        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
        src_c0 -= src_a0 * src_b0;
        src_c1 -= src_a1 * src_b0;
        src_c2 -= src_a0 * src_b1;
        src_c3 -= src_a1 * src_b1;
        src_c4 -= src_a0 * src_b2;
        src_c5 -= src_a1 * src_b2;
        src_c6 -= src_a0 * src_b3;
        src_c7 -= src_a1 * src_b3;

        aa += 8;
        bb += 4;

        LD_SP2(aa, 4, src_a0, src_a1);

        src_b = LD_SP(bb + 0);
        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
        src_c0 -= src_a0 * src_b0;
        src_c1 -= src_a1 * src_b0;
        src_c2 -= src_a0 * src_b1;
        src_c3 -= src_a1 * src_b1;
        src_c4 -= src_a0 * src_b2;
        src_c5 -= src_a1 * src_b2;
        src_c6 -= src_a0 * src_b3;
        src_c7 -= src_a1 * src_b3;

        aa += 8;
        bb += 4;
    }

    if (bk & 1)
    {
        LD_SP2(aa, 4, src_a0, src_a1);

        src_b = LD_SP(bb + 0);
        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
        src_c0 -= src_a0 * src_b0;
        src_c1 -= src_a1 * src_b0;
        src_c2 -= src_a0 * src_b1;
        src_c3 -= src_a1 * src_b1;
        src_c4 -= src_a0 * src_b2;
        src_c5 -= src_a1 * src_b2;
        src_c6 -= src_a0 * src_b3;
        src_c7 -= src_a1 * src_b3;
    }

    a -= 64;
    b -= 32;

    TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6,
                       res_c0, res_c1, res_c2, res_c3);
    TRANSPOSE4x4_SP_SP(src_c1, src_c3, src_c5, src_c7,
                       res_c4, res_c5, res_c6, res_c7);

    src_a = LD_SP(a + 60);
    SPLATI_W4_SP(src_a, src_a60, src_a61, src_a62, src_a63);
    src_a = LD_SP(a + 56);
    SPLATI_W4_SP(src_a, src_a56, src_a57, src_a58, src_a59);
    src_a = LD_SP(a + 48);
    SPLATI_W4_SP(src_a, src_a48, src_a49, src_a50, src_a51);
    src_a52 = LD_SP(a + 52);
    src_a54 = (v4f32) __msa_splati_w((v4i32) src_a52, 2);
    src_a53 = (v4f32) __msa_splati_w((v4i32) src_a52, 1);
    src_a52 = (v4f32) __msa_splati_w((v4i32) src_a52, 0);

    res_c7 *= src_a63;
    res_c6 -= res_c7 * src_a62;
    res_c5 -= res_c7 * src_a61;
    res_c4 -= res_c7 * src_a60;
    res_c3 -= res_c7 * src_a59;
    res_c2 -= res_c7 * src_a58;
    res_c1 -= res_c7 * src_a57;
    res_c0 -= res_c7 * src_a56;

    res_c6 *= src_a54;
    res_c5 -= res_c6 * src_a53;
    res_c4 -= res_c6 * src_a52;
    res_c3 -= res_c6 * src_a51;
    res_c2 -= res_c6 * src_a50;
    res_c1 -= res_c6 * src_a49;
    res_c0 -= res_c6 * src_a48;

    src_a = LD_SP(a + 40);
    SPLATI_W4_SP(src_a, src_a40, src_a41, src_a42, src_a43);
    src_a44 = LD_SP(a + 44);
    src_a45 = (v4f32) __msa_splati_w((v4i32) src_a44, 1);
    src_a44 = (v4f32) __msa_splati_w((v4i32) src_a44, 0);

    res_c5 *= src_a45;
    res_c4 -= res_c5 * src_a44;
    res_c3 -= res_c5 * src_a43;
    res_c2 -= res_c5 * src_a42;
    res_c1 -= res_c5 * src_a41;
    res_c0 -= res_c5 * src_a40;

    src_a = LD_SP(a + 32);
    SPLATI_W4_SP(src_a, src_a32, src_a33, src_a34, src_a35);
    COPY_FLOAT_TO_VECTOR(*(a + 36), src_a36);

    res_c4 *= src_a36;
    res_c3 -= res_c4 * src_a35;
    res_c2 -= res_c4 * src_a34;
    res_c1 -= res_c4 * src_a33;
    res_c0 -= res_c4 * src_a32;

    src_a = LD_SP(a + 24);
    SPLATI_W4_SP(src_a, src_a24, src_a25, src_a26, src_a27);

    res_c3 *= src_a27;
    res_c2 -= res_c3 * src_a26;
    res_c1 -= res_c3 * src_a25;
    res_c0 -= res_c3 * src_a24;

    src_a16 = LD_SP(a + 16);
    src_a18 = (v4f32) __msa_splati_w((v4i32) src_a16, 2);
    src_a17 = (v4f32) __msa_splati_w((v4i32) src_a16, 1);
    src_a16 = (v4f32) __msa_splati_w((v4i32) src_a16, 0);

    res_c2 *= src_a18;
    res_c1 -= res_c2 * src_a17;
    res_c0 -= res_c2 * src_a16;

    COPY_FLOAT_TO_VECTOR(*(a + 9), src_a9);
    COPY_FLOAT_TO_VECTOR(*(a + 8), src_a8);
    COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0);

    res_c1 *= src_a9;
    res_c0 -= res_c1 * src_a8;
    res_c0 *= src_a0;

    ST_SP4(res_c0, res_c1, res_c2, res_c3, b, 4);
    ST_SP4(res_c4, res_c5, res_c6, res_c7, b + 16, 4);

    TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3,
                       src_c0, src_c2, src_c4, src_c6);
    TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7,
                       src_c1, src_c3, src_c5, src_c7);

    ST_SP2(src_c0, src_c1, c, 4);
    ST_SP2(src_c2, src_c3, c_nxt1line, 4);
    ST_SP2(src_c4, src_c5, c_nxt2line, 4);
    ST_SP2(src_c6, src_c7, c_nxt3line, 4);
}
static void ssolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    BLASLONG k;
    FLOAT *aa = a, *bb = b;
    FLOAT a0, a8, a9, a16, a17, a18, a24, a25, a26, a27, a32, a33, a34, a35;
    FLOAT a36, a40, a41, a42, a43, a44, a45, a48, a49, a50, a51, a52, a53;
    FLOAT a54, a56, a57, a58, a59, a60, a61, a62, a63;
    FLOAT c0, c1, c2, c3, c4, c5, c6, c7;
    FLOAT c0_nxt, c1_nxt, c2_nxt, c3_nxt, c4_nxt, c5_nxt, c6_nxt, c7_nxt;

    c0 = *(c + 0);
    c1 = *(c + 1);
    c2 = *(c + 2);
    c3 = *(c + 3);
    c4 = *(c + 4);
    c5 = *(c + 5);
    c6 = *(c + 6);
    c7 = *(c + 7);
    c0_nxt = *(c + 0 + ldc);
    c1_nxt = *(c + 1 + ldc);
    c2_nxt = *(c + 2 + ldc);
    c3_nxt = *(c + 3 + ldc);
    c4_nxt = *(c + 4 + ldc);
    c5_nxt = *(c + 5 + ldc);
    c6_nxt = *(c + 6 + ldc);
    c7_nxt = *(c + 7 + ldc);

    for (k = 0; k < bk; k++)
    {
        c0 -= aa[0] * bb[0];
        c1 -= aa[1] * bb[0];
        c2 -= aa[2] * bb[0];
        c3 -= aa[3] * bb[0];
        c4 -= aa[4] * bb[0];
        c5 -= aa[5] * bb[0];
        c6 -= aa[6] * bb[0];
        c7 -= aa[7] * bb[0];

        c0_nxt -= aa[0] * bb[1];
        c1_nxt -= aa[1] * bb[1];
        c2_nxt -= aa[2] * bb[1];
        c3_nxt -= aa[3] * bb[1];
        c4_nxt -= aa[4] * bb[1];
        c5_nxt -= aa[5] * bb[1];
        c6_nxt -= aa[6] * bb[1];
        c7_nxt -= aa[7] * bb[1];

        aa += 8;
        bb += 2;
    }

    a -= 64;
    b -= 16;

    a0 = *(a + 0);
    a8 = *(a + 8);
    a9 = *(a + 9);
    a16 = *(a + 16);
    a17 = *(a + 17);
    a18 = *(a + 18);
    a24 = *(a + 24);
    a25 = *(a + 25);
    a26 = *(a + 26);
    a27 = *(a + 27);
    a32 = *(a + 32);
    a33 = *(a + 33);
    a34 = *(a + 34);
    a35 = *(a + 35);
    a36 = *(a + 36);
    a40 = *(a + 40);
    a41 = *(a + 41);
    a42 = *(a + 42);
    a43 = *(a + 43);
    a44 = *(a + 44);
    a45 = *(a + 45);
    a48 = *(a + 48);
    a49 = *(a + 49);
    a50 = *(a + 50);
    a51 = *(a + 51);
    a52 = *(a + 52);
    a53 = *(a + 53);
    a54 = *(a + 54);
    a56 = *(a + 56);
    a57 = *(a + 57);
    a58 = *(a + 58);
    a59 = *(a + 59);
    a60 = *(a + 60);
    a61 = *(a + 61);
    a62 = *(a + 62);
    a63 = *(a + 63);

    c7 *= a63;
    c7_nxt *= a63;
    c6 -= c7 * a62;
    c6_nxt -= c7_nxt * a62;
    c5 -= c7 * a61;
    c5_nxt -= c7_nxt * a61;
    c4 -= c7 * a60;
    c4_nxt -= c7_nxt * a60;
    c3 -= c7 * a59;
    c3_nxt -= c7_nxt * a59;
    c2 -= c7 * a58;
    c2_nxt -= c7_nxt * a58;
    c1 -= c7 * a57;
    c1_nxt -= c7_nxt * a57;
    c0 -= c7 * a56;
    c0_nxt -= c7_nxt * a56;

    c6 *= a54;
    c6_nxt *= a54;
    c5 -= c6 * a53;
    c5_nxt -= c6_nxt * a53;
    c4 -= c6 * a52;
    c4_nxt -= c6_nxt * a52;
    c3 -= c6 * a51;
    c3_nxt -= c6_nxt * a51;
    c2 -= c6 * a50;
    c2_nxt -= c6_nxt * a50;
    c1 -= c6 * a49;
    c1_nxt -= c6_nxt * a49;
    c0 -= c6 * a48;
    c0_nxt -= c6_nxt * a48;

    c5 *= a45;
    c5_nxt *= a45;
    c4 -= c5 * a44;
    c4_nxt -= c5_nxt * a44;
    c3 -= c5 * a43;
    c3_nxt -= c5_nxt * a43;
    c2 -= c5 * a42;
    c2_nxt -= c5_nxt * a42;
    c1 -= c5 * a41;
    c1_nxt -= c5_nxt * a41;
    c0 -= c5 * a40;
    c0_nxt -= c5_nxt * a40;

    c4 *= a36;
    c4_nxt *= a36;
    c3 -= c4 * a35;
    c3_nxt -= c4_nxt * a35;
    c2 -= c4 * a34;
    c2_nxt -= c4_nxt * a34;
    c1 -= c4 * a33;
    c1_nxt -= c4_nxt * a33;
    c0 -= c4 * a32;
    c0_nxt -= c4_nxt * a32;

    c3 *= a27;
    c3_nxt *= a27;
    c2 -= c3 * a26;
    c2_nxt -= c3_nxt * a26;
    c1 -= c3 * a25;
    c1_nxt -= c3_nxt * a25;
    c0 -= c3 * a24;
    c0_nxt -= c3_nxt * a24;

    c2 *= a18;
    c2_nxt *= a18;
    c1 -= c2 * a17;
    c1_nxt -= c2_nxt * a17;
    c0 -= c2 * a16;
    c0_nxt -= c2_nxt * a16;

    c1 *= a9;
    c1_nxt *= a9;
    c0 -= c1 * a8;
    c0_nxt -= c1_nxt * a8;

    c0 *= a0;
    c0_nxt *= a0;

    *(b + 0) = c0;
    *(b + 1) = c0_nxt;
    *(b + 2) = c1;
    *(b + 3) = c1_nxt;
    *(b + 4) = c2;
    *(b + 5) = c2_nxt;
    *(b + 6) = c3;
    *(b + 7) = c3_nxt;
    *(b + 8) = c4;
    *(b + 9) = c4_nxt;
    *(b + 10) = c5;
    *(b + 11) = c5_nxt;
    *(b + 12) = c6;
    *(b + 13) = c6_nxt;
    *(b + 14) = c7;
    *(b + 15) = c7_nxt;

    *(c + 0) = c0;
    *(c + 1) = c1;
    *(c + 2) = c2;
    *(c + 3) = c3;
    *(c + 4) = c4;
    *(c + 5) = c5;
    *(c + 6) = c6;
    *(c + 7) = c7;
    *(c + 0 + ldc) = c0_nxt;
    *(c + 1 + ldc) = c1_nxt;
    *(c + 2 + ldc) = c2_nxt;
    *(c + 3 + ldc) = c3_nxt;
    *(c + 4 + ldc) = c4_nxt;
    *(c + 5 + ldc) = c5_nxt;
    *(c + 6 + ldc) = c6_nxt;
    *(c + 7 + ldc) = c7_nxt;
}
static void ssolve_8x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
{
    BLASLONG k;
    FLOAT *aa = a, *bb = b;
    FLOAT a0, a8, a9, a16, a17, a18, a24, a25, a26, a27, a32, a33, a34, a35;
    FLOAT a36, a40, a41, a42, a43, a44, a45, a48, a49, a50, a51, a52, a53;
    FLOAT a54, a56, a57, a58, a59, a60, a61, a62, a63;
    FLOAT c0, c1, c2, c3, c4, c5, c6, c7;

    c0 = *(c + 0);
    c1 = *(c + 1);
    c2 = *(c + 2);
    c3 = *(c + 3);
    c4 = *(c + 4);
    c5 = *(c + 5);
    c6 = *(c + 6);
    c7 = *(c + 7);

    for (k = 0; k < bk; k++)
    {
        c0 -= aa[0] * bb[0];
        c1 -= aa[1] * bb[0];
        c2 -= aa[2] * bb[0];
        c3 -= aa[3] * bb[0];
        c4 -= aa[4] * bb[0];
        c5 -= aa[5] * bb[0];
        c6 -= aa[6] * bb[0];
        c7 -= aa[7] * bb[0];

        aa += 8;
        bb += 1;
    }

    a -= 64;
    b -= 8;

    a0 = *(a + 0);
    a8 = *(a + 8);
    a9 = *(a + 9);
    a16 = *(a + 16);
    a17 = *(a + 17);
    a18 = *(a + 18);
    a24 = *(a + 24);
    a25 = *(a + 25);
    a26 = *(a + 26);
    a27 = *(a + 27);
    a32 = *(a + 32);
    a33 = *(a + 33);
    a34 = *(a + 34);
    a35 = *(a + 35);
    a36 = *(a + 36);
    a40 = *(a + 40);
    a41 = *(a + 41);
    a42 = *(a + 42);
    a43 = *(a + 43);
    a44 = *(a + 44);
    a45 = *(a + 45);
    a48 = *(a + 48);
    a49 = *(a + 49);
    a50 = *(a + 50);
    a51 = *(a + 51);
    a52 = *(a + 52);
    a53 = *(a + 53);
    a54 = *(a + 54);
    a56 = *(a + 56);
    a57 = *(a + 57);
    a58 = *(a + 58);
    a59 = *(a + 59);
    a60 = *(a + 60);
    a61 = *(a + 61);
    a62 = *(a + 62);
    a63 = *(a + 63);

    c7 *= a63;

    c6 -= c7 * a62;
    c6 *= a54;

    c5 -= c7 * a61;
    c5 -= c6 * a53;
    c5 *= a45;

    c4 -= c7 * a60;
    c4 -= c6 * a52;
    c4 -= c5 * a44;
    c4 *= a36;

    c3 -= c7 * a59;
    c3 -= c6 * a51;
    c3 -= c5 * a43;
    c3 -= c4 * a35;
    c3 *= a27;

    c2 -= c7 * a58;
    c2 -= c6 * a50;
    c2 -= c5 * a42;
    c2 -= c4 * a34;
    c2 -= c3 * a26;
    c2 *= a18;

    c1 -= c7 * a57;
    c1 -= c6 * a49;
    c1 -= c5 * a41;
    c1 -= c4 * a33;
    c1 -= c3 * a25;
    c1 -= c2 * a17;
    c1 *= a9;

    c0 -= c7 * a56;
    c0 -= c6 * a48;
    c0 -= c5 * a40;
    c0 -= c4 * a32;
    c0 -= c3 * a24;
    c0 -= c2 * a16;
    c0 -= c1 * a8;
    c0 *= a0;

    *(b + 0) = c0;
    *(b + 1) = c1;
    *(b + 2) = c2;
    *(b + 3) = c3;
    *(b + 4) = c4;
    *(b + 5) = c5;
    *(b + 6) = c6;
    *(b + 7) = c7;

    *(c + 0) = c0;
    *(c + 1) = c1;
    *(c + 2) = c2;
    *(c + 3) = c3;
    *(c + 4) = c4;
    *(c + 5) = c5;
    *(c + 6) = c6;
    *(c + 7) = c7;
}
static void ssolve_4x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    BLASLONG k;
    FLOAT *aa = a, *bb = b;
    v4f32 src_b, src_b0, src_b1, src_b2, src_b3;
    v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
    v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
    v4f32 src_a, src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12;
    v4f32 src_a13, src_a14, src_a15;
    FLOAT *c_nxt1line = c + ldc;
    FLOAT *c_nxt2line = c + 2 * ldc;
    FLOAT *c_nxt3line = c + 3 * ldc;
    FLOAT *c_nxt4line = c + 4 * ldc;
    FLOAT *c_nxt5line = c + 5 * ldc;
    FLOAT *c_nxt6line = c + 6 * ldc;
    FLOAT *c_nxt7line = c + 7 * ldc;

    src_c0 = LD_SP(c);
    src_c1 = LD_SP(c_nxt1line);
    src_c2 = LD_SP(c_nxt2line);
    src_c3 = LD_SP(c_nxt3line);
    src_c4 = LD_SP(c_nxt4line);
    src_c5 = LD_SP(c_nxt5line);
    src_c6 = LD_SP(c_nxt6line);
    src_c7 = LD_SP(c_nxt7line);

    for (k = 0; k < bk; k++)
    {
        src_a0 = LD_SP(aa);

        src_b = LD_SP(bb);
        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
        src_c0 -= src_a0 * src_b0;
        src_c1 -= src_a0 * src_b1;
        src_c2 -= src_a0 * src_b2;
        src_c3 -= src_a0 * src_b3;

        src_b = LD_SP(bb + 4);
        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
        src_c4 -= src_a0 * src_b0;
        src_c5 -= src_a0 * src_b1;
        src_c6 -= src_a0 * src_b2;
        src_c7 -= src_a0 * src_b3;

        aa += 4;
        bb += 8;
    }

    a -= 16;
    b -= 32;

    TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3,
                       res_c0, res_c1, res_c2, res_c3);
    TRANSPOSE4x4_SP_SP(src_c4, src_c5, src_c6, src_c7,
                       res_c4, res_c5, res_c6, res_c7);

    src_a = LD_SP(a + 12);
    SPLATI_W4_SP(src_a, src_a12, src_a13, src_a14, src_a15);
    src_a8 = LD_SP(a + 8);
    src_a10 = (v4f32) __msa_splati_w((v4i32) src_a8, 2);
    src_a9 = (v4f32) __msa_splati_w((v4i32) src_a8, 1);
    src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0);
    COPY_FLOAT_TO_VECTOR(*(a + 5), src_a5);
    COPY_FLOAT_TO_VECTOR(*(a + 4), src_a4);
    COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0);

    res_c3 *= src_a15;
    res_c7 *= src_a15;
    res_c2 -= res_c3 * src_a14;
    res_c6 -= res_c7 * src_a14;
    res_c1 -= res_c3 * src_a13;
    res_c5 -= res_c7 * src_a13;
    res_c0 -= res_c3 * src_a12;
    res_c4 -= res_c7 * src_a12;

    res_c2 *= src_a10;
    res_c6 *= src_a10;
    res_c1 -= res_c2 * src_a9;
    res_c5 -= res_c6 * src_a9;
    res_c0 -= res_c2 * src_a8;
    res_c4 -= res_c6 * src_a8;

    res_c1 *= src_a5;
    res_c5 *= src_a5;
    res_c0 -= res_c1 * src_a4;
    res_c4 -= res_c5 * src_a4;

    res_c0 *= src_a0;
    res_c4 *= src_a0;

    ST_SP4(res_c0, res_c4, res_c1, res_c5, b, 4);
    ST_SP4(res_c2, res_c6, res_c3, res_c7, b + 16, 4);

    TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3,
                       src_c0, src_c1, src_c2, src_c3);
    TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7,
                       src_c4, src_c5, src_c6, src_c7);

    ST_SP(src_c0, c);
    ST_SP(src_c1, c_nxt1line);
    ST_SP(src_c2, c_nxt2line);
    ST_SP(src_c3, c_nxt3line);
    ST_SP(src_c4, c_nxt4line);
    ST_SP(src_c5, c_nxt5line);
    ST_SP(src_c6, c_nxt6line);
    ST_SP(src_c7, c_nxt7line);
}
static void ssolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    BLASLONG k;
    FLOAT *aa = a, *bb = b;
    v4f32 src_b, src_b0, src_b1, src_b2, src_b3;
    v4f32 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3;
    v4f32 src_a, src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12;
    v4f32 src_a13, src_a14, src_a15;
    FLOAT *c_nxt1line = c + ldc;
    FLOAT *c_nxt2line = c + 2 * ldc;
    FLOAT *c_nxt3line = c + 3 * ldc;

    src_c0 = LD_SP(c);
    src_c1 = LD_SP(c_nxt1line);
    src_c2 = LD_SP(c_nxt2line);
    src_c3 = LD_SP(c_nxt3line);

    for (k = 0; k < (bk >> 1); k++)
    {
        src_a0 = LD_SP(aa);

        src_b = LD_SP(bb);
        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
        src_c0 -= src_a0 * src_b0;
        src_c1 -= src_a0 * src_b1;
        src_c2 -= src_a0 * src_b2;
        src_c3 -= src_a0 * src_b3;

        aa += 4;
        bb += 4;

        src_a0 = LD_SP(aa);

        src_b = LD_SP(bb);
        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
        src_c0 -= src_a0 * src_b0;
        src_c1 -= src_a0 * src_b1;
        src_c2 -= src_a0 * src_b2;
        src_c3 -= src_a0 * src_b3;

        aa += 4;
        bb += 4;
    }

    if (bk & 1)
    {
        src_a0 = LD_SP(aa);

        src_b = LD_SP(bb);
        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
        src_c0 -= src_a0 * src_b0;
        src_c1 -= src_a0 * src_b1;
        src_c2 -= src_a0 * src_b2;
        src_c3 -= src_a0 * src_b3;
    }

    a -= 16;
    b -= 16;

    TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3,
                       res_c0, res_c1, res_c2, res_c3);

    src_a = LD_SP(a + 12);
    SPLATI_W4_SP(src_a, src_a12, src_a13, src_a14, src_a15);
    src_a8 = LD_SP(a + 8);
    src_a10 = (v4f32) __msa_splati_w((v4i32) src_a8, 2);
    src_a9 = (v4f32) __msa_splati_w((v4i32) src_a8, 1);
    src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0);
    COPY_FLOAT_TO_VECTOR(*(a + 5), src_a5);
    COPY_FLOAT_TO_VECTOR(*(a + 4), src_a4);
    COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0);

    res_c3 *= src_a15;
    res_c2 -= res_c3 * src_a14;
    res_c1 -= res_c3 * src_a13;
    res_c0 -= res_c3 * src_a12;

    res_c2 *= src_a10;
    res_c1 -= res_c2 * src_a9;
    res_c0 -= res_c2 * src_a8;

    res_c1 *= src_a5;
    res_c0 -= res_c1 * src_a4;

    res_c0 *= src_a0;

    ST_SP4(res_c0, res_c1, res_c2, res_c3, b, 4);

    TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3,
                       src_c0, src_c1, src_c2, src_c3);

    ST_SP(src_c0, c);
    ST_SP(src_c1, c_nxt1line);
    ST_SP(src_c2, c_nxt2line);
    ST_SP(src_c3, c_nxt3line);
}
static void ssolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    BLASLONG k;
    FLOAT *aa = a, *bb = b;
    FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15;
    FLOAT c0, c1, c2, c3, c0_nxt, c1_nxt, c2_nxt, c3_nxt;

    c0 = *(c + 0);
    c1 = *(c + 1);
    c2 = *(c + 2);
    c3 = *(c + 3);
    c0_nxt = *(c + 0 + ldc);
    c1_nxt = *(c + 1 + ldc);
    c2_nxt = *(c + 2 + ldc);
    c3_nxt = *(c + 3 + ldc);

    for (k = 0; k < bk; k++)
    {
        c0 -= aa[0] * bb[0];
        c1 -= aa[1] * bb[0];
        c2 -= aa[2] * bb[0];
        c3 -= aa[3] * bb[0];

        c0_nxt -= aa[0] * bb[1];
        c1_nxt -= aa[1] * bb[1];
        c2_nxt -= aa[2] * bb[1];
        c3_nxt -= aa[3] * bb[1];

        aa += 4;
        bb += 2;
    }

    a -= 16;
    b -= 8;

    a0 = *(a + 0);
    a4 = *(a + 4);
    a5 = *(a + 5);
    a8 = *(a + 8);
    a9 = *(a + 9);
    a10 = *(a + 10);
    a12 = *(a + 12);
    a13 = *(a + 13);
    a14 = *(a + 14);
    a15 = *(a + 15);

    c3 *= a15;
    c3_nxt *= a15;

    c2 -= c3 * a14;
    c2_nxt -= c3_nxt * a14;
    c2 *= a10;
    c2_nxt *= a10;

    c1 -= c3 * a13;
    c1_nxt -= c3_nxt * a13;
    c1 -= c2 * a9;
    c1_nxt -= c2_nxt * a9;
    c1 *= a5;
    c1_nxt *= a5;

    c0 -= c3 * a12;
    c0_nxt -= c3_nxt * a12;
    c0 -= c2 * a8;
    c0_nxt -= c2_nxt * a8;
    c0 -= c1 * a4;
    c0_nxt -= c1_nxt * a4;
    c0 *= a0;
    c0_nxt *= a0;

    *(b + 0) = c0;
    *(b + 1) = c0_nxt;
    *(b + 2) = c1;
    *(b + 3) = c1_nxt;
    *(b + 4) = c2;
    *(b + 5) = c2_nxt;
    *(b + 6) = c3;
    *(b + 7) = c3_nxt;

    *(c + 0) = c0;
    *(c + 1) = c1;
    *(c + 2) = c2;
    *(c + 3) = c3;
    *(c + 0 + ldc) = c0_nxt;
    *(c + 1 + ldc) = c1_nxt;
    *(c + 2 + ldc) = c2_nxt;
    *(c + 3 + ldc) = c3_nxt;
}
static void ssolve_4x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
{
    BLASLONG k;
    FLOAT *aa = a, *bb = b;
    FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15, c0, c1, c2, c3;

    c0 = *(c + 0);
    c1 = *(c + 1);
    c2 = *(c + 2);
    c3 = *(c + 3);

    for (k = 0; k < bk; k++)
    {
        c0 -= aa[0] * bb[0];
        c1 -= aa[1] * bb[0];
        c2 -= aa[2] * bb[0];
        c3 -= aa[3] * bb[0];

        aa += 4;
        bb += 1;
    }

    a -= 16;
    b -= 4;

    a0 = *(a + 0);
    a4 = *(a + 4);
    a5 = *(a + 5);
    a8 = *(a + 8);
    a9 = *(a + 9);
    a10 = *(a + 10);
    a12 = *(a + 12);
    a13 = *(a + 13);
    a14 = *(a + 14);
    a15 = *(a + 15);

    c3 *= a15;

    c2 -= c3 * a14;
    c2 *= a10;

    c1 -= c3 * a13;
    c1 -= c2 * a9;
    c1 *= a5;

    c0 -= c3 * a12;
    c0 -= c2 * a8;
    c0 -= c1 * a4;
    c0 *= a0;

    *(b + 0) = c0;
    *(b + 1) = c1;
    *(b + 2) = c2;
    *(b + 3) = c3;

    *(c + 0) = c0;
    *(c + 1) = c1;
    *(c + 2) = c2;
    *(c + 3) = c3;
}
static void ssolve_2x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    BLASLONG k;
    FLOAT *aa = a, *bb = b;
    FLOAT a0, a2, a3, c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3;
    FLOAT c1_nxt3, c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5, c0_nxt6, c1_nxt6;
    FLOAT c0_nxt7, c1_nxt7;

    c0 = *(c + 0);
    c1 = *(c + 1);
    c0_nxt1 = *(c + 0 + 1 * ldc);
    c1_nxt1 = *(c + 1 + 1 * ldc);
    c0_nxt2 = *(c + 0 + 2 * ldc);
    c1_nxt2 = *(c + 1 + 2 * ldc);
    c0_nxt3 = *(c + 0 + 3 * ldc);
    c1_nxt3 = *(c + 1 + 3 * ldc);
    c0_nxt4 = *(c + 0 + 4 * ldc);
    c1_nxt4 = *(c + 1 + 4 * ldc);
    c0_nxt5 = *(c + 0 + 5 * ldc);
    c1_nxt5 = *(c + 1 + 5 * ldc);
    c0_nxt6 = *(c + 0 + 6 * ldc);
    c1_nxt6 = *(c + 1 + 6 * ldc);
    c0_nxt7 = *(c + 0 + 7 * ldc);
    c1_nxt7 = *(c + 1 + 7 * ldc);

    for (k = 0; k < bk; k++)
    {
        c0 -= aa[0] * bb[0];
        c1 -= aa[1] * bb[0];
        c0_nxt1 -= aa[0] * bb[1];
        c1_nxt1 -= aa[1] * bb[1];
        c0_nxt2 -= aa[0] * bb[2];
        c1_nxt2 -= aa[1] * bb[2];
        c0_nxt3 -= aa[0] * bb[3];
        c1_nxt3 -= aa[1] * bb[3];
        c0_nxt4 -= aa[0] * bb[4];
        c1_nxt4 -= aa[1] * bb[4];
        c0_nxt5 -= aa[0] * bb[5];
        c1_nxt5 -= aa[1] * bb[5];
        c0_nxt6 -= aa[0] * bb[6];
        c1_nxt6 -= aa[1] * bb[6];
        c0_nxt7 -= aa[0] * bb[7];
        c1_nxt7 -= aa[1] * bb[7];

        aa += 2;
        bb += 8;
    }

    a -= 4;
    b -= 16;

    a0 = *(a + 0);
    a2 = *(a + 2);
    a3 = *(a + 3);

    c1 *= a3;
    c1_nxt1 *= a3;
    c1_nxt2 *= a3;
    c1_nxt3 *= a3;
    c1_nxt4 *= a3;
    c1_nxt5 *= a3;
    c1_nxt6 *= a3;
    c1_nxt7 *= a3;

    c0 -= c1 * a2;
    c0_nxt1 -= c1_nxt1 * a2;
    c0_nxt2 -= c1_nxt2 * a2;
    c0_nxt3 -= c1_nxt3 * a2;
    c0_nxt4 -= c1_nxt4 * a2;
    c0_nxt5 -= c1_nxt5 * a2;
    c0_nxt6 -= c1_nxt6 * a2;
    c0_nxt7 -= c1_nxt7 * a2;

    c0 *= a0;
    c0_nxt1 *= a0;
    c0_nxt2 *= a0;
    c0_nxt3 *= a0;
    c0_nxt4 *= a0;
    c0_nxt5 *= a0;
    c0_nxt6 *= a0;
    c0_nxt7 *= a0;

    *(b + 0) = c0;
    *(b + 1) = c0_nxt1;
    *(b + 2) = c0_nxt2;
    *(b + 3) = c0_nxt3;
    *(b + 4) = c0_nxt4;
    *(b + 5) = c0_nxt5;
    *(b + 6) = c0_nxt6;
    *(b + 7) = c0_nxt7;
    *(b + 8) = c1;
    *(b + 9) = c1_nxt1;
    *(b + 10) = c1_nxt2;
    *(b + 11) = c1_nxt3;
    *(b + 12) = c1_nxt4;
    *(b + 13) = c1_nxt5;
    *(b + 14) = c1_nxt6;
    *(b + 15) = c1_nxt7;

    *(c + 0) = c0;
    *(c + 1) = c1;
    *(c + 0 + 1 * ldc) = c0_nxt1;
    *(c + 1 + 1 * ldc) = c1_nxt1;
    *(c + 0 + 2 * ldc) = c0_nxt2;
    *(c + 1 + 2 * ldc) = c1_nxt2;
    *(c + 0 + 3 * ldc) = c0_nxt3;
    *(c + 1 + 3 * ldc) = c1_nxt3;
    *(c + 0 + 4 * ldc) = c0_nxt4;
    *(c + 1 + 4 * ldc) = c1_nxt4;
    *(c + 0 + 5 * ldc) = c0_nxt5;
    *(c + 1 + 5 * ldc) = c1_nxt5;
    *(c + 0 + 6 * ldc) = c0_nxt6;
    *(c + 1 + 6 * ldc) = c1_nxt6;
    *(c + 0 + 7 * ldc) = c0_nxt7;
    *(c + 1 + 7 * ldc) = c1_nxt7;
}
static void ssolve_2x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    BLASLONG k;
    FLOAT *aa = a, *bb = b;
    FLOAT a0, a2, a3, c0, c1, c0_nxt1, c1_nxt1;
    FLOAT c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3;

    c0 = *(c + 0);
    c1 = *(c + 1);
    c0_nxt1 = *(c + 0 + ldc);
    c1_nxt1 = *(c + 1 + ldc);
    c0_nxt2 = *(c + 0 + 2 * ldc);
    c1_nxt2 = *(c + 1 + 2 * ldc);
    c0_nxt3 = *(c + 0 + 3 * ldc);
    c1_nxt3 = *(c + 1 + 3 * ldc);

    for (k = 0; k < bk; k++)
    {
        c0 -= aa[0] * bb[0];
        c1 -= aa[1] * bb[0];
        c0_nxt1 -= aa[0] * bb[1];
        c1_nxt1 -= aa[1] * bb[1];
        c0_nxt2 -= aa[0] * bb[2];
        c1_nxt2 -= aa[1] * bb[2];
        c0_nxt3 -= aa[0] * bb[3];
        c1_nxt3 -= aa[1] * bb[3];

        aa += 2;
        bb += 4;
    }

    a -= 4;
    b -= 8;

    a0 = *(a + 0);
    a2 = *(a + 2);
    a3 = *(a + 3);

    c1 *= a3;
    c1_nxt1 *= a3;
    c1_nxt2 *= a3;
    c1_nxt3 *= a3;

    c0 -= c1 * a2;
    c0_nxt1 -= c1_nxt1 * a2;
    c0_nxt2 -= c1_nxt2 * a2;
    c0_nxt3 -= c1_nxt3 * a2;

    c0 *= a0;
    c0_nxt1 *= a0;
    c0_nxt2 *= a0;
    c0_nxt3 *= a0;

    *(b + 0) = c0;
    *(b + 1) = c0_nxt1;
    *(b + 2) = c0_nxt2;
    *(b + 3) = c0_nxt3;
    *(b + 4) = c1;
    *(b + 5) = c1_nxt1;
    *(b + 6) = c1_nxt2;
    *(b + 7) = c1_nxt3;

    *(c + 0) = c0;
    *(c + 1) = c1;
    *(c + 0 + ldc) = c0_nxt1;
    *(c + 1 + ldc) = c1_nxt1;
    *(c + 0 + 2 * ldc) = c0_nxt2;
    *(c + 1 + 2 * ldc) = c1_nxt2;
    *(c + 0 + 3 * ldc) = c0_nxt3;
    *(c + 1 + 3 * ldc) = c1_nxt3;
}
static void ssolve_2x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    BLASLONG k;
    FLOAT *aa = a, *bb = b;
    FLOAT a0, a2, a3, c0, c1, c0_nxt, c1_nxt;

    c0 = *(c + 0);
    c1 = *(c + 1);
    c0_nxt = *(c + 0 + ldc);
    c1_nxt = *(c + 1 + ldc);

    for (k = 0; k < bk; k++)
    {
        c0 -= aa[0] * bb[0];
        c1 -= aa[1] * bb[0];
        c0_nxt -= aa[0] * bb[1];
        c1_nxt -= aa[1] * bb[1];

        aa += 2;
        bb += 2;
    }

    a -= 4;
    b -= 4;

    a0 = *(a + 0);
    a2 = *(a + 2);
    a3 = *(a + 3);

    c1 *= a3;
    c1_nxt *= a3;

    c0 -= c1 * a2;
    c0_nxt -= c1_nxt * a2;

    c0 *= a0;
    c0_nxt *= a0;

    *(b + 0) = c0;
    *(b + 1) = c0_nxt;
    *(b + 2) = c1;
    *(b + 3) = c1_nxt;

    *(c + 0) = c0;
    *(c + 1) = c1;
    *(c + 0 + ldc) = c0_nxt;
    *(c + 1 + ldc) = c1_nxt;
}
static void ssolve_2x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
{
    BLASLONG k;
    FLOAT *aa = a, *bb = b;
    FLOAT a0, a2, a3, c0, c1;

    c0 = *(c + 0);
    c1 = *(c + 1);

    for (k = 0; k < bk; k++)
    {
        c0 -= aa[0] * bb[0];
        c1 -= aa[1] * bb[0];

        aa += 2;
        bb += 1;
    }

    a -= 4;
    b -= 2;

    a0 = *(a + 0);
    a2 = *(a + 2);
    a3 = *(a + 3);

    c1 *= a3;
    c0 -= c1 * a2;
    c0 *= a0;

    *(b + 0) = c0;
    *(b + 1) = c1;

    *(c + 0) = c0;
    *(c + 1) = c1;
}
static void ssolve_1x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    BLASLONG k;
    FLOAT *aa = a, *bb = b;
    FLOAT a0, c0, c1, c2, c3, c4, c5, c6, c7;

    c0 = *(c + 0);
    c1 = *(c + 1 * ldc);
    c2 = *(c + 2 * ldc);
    c3 = *(c + 3 * ldc);
    c4 = *(c + 4 * ldc);
    c5 = *(c + 5 * ldc);
    c6 = *(c + 6 * ldc);
    c7 = *(c + 7 * ldc);

    for (k = 0; k < bk; k++)
    {
        c0 -= aa[0] * bb[0];
        c1 -= aa[0] * bb[1];
        c2 -= aa[0] * bb[2];
        c3 -= aa[0] * bb[3];
        c4 -= aa[0] * bb[4];
        c5 -= aa[0] * bb[5];
        c6 -= aa[0] * bb[6];
        c7 -= aa[0] * bb[7];

        aa += 1;
        bb += 8;
    }

    a0 = *(a - 1);

    c0 *= a0;
    c1 *= a0;
    c2 *= a0;
    c3 *= a0;
    c4 *= a0;
    c5 *= a0;
    c6 *= a0;
    c7 *= a0;

    *(b - 8) = c0;
    *(b - 7) = c1;
    *(b - 6) = c2;
    *(b - 5) = c3;
    *(b - 4) = c4;
    *(b - 3) = c5;
    *(b - 2) = c6;
    *(b - 1) = c7;

    *(c + 0 * ldc) = c0;
    *(c + 1 * ldc) = c1;
    *(c + 2 * ldc) = c2;
    *(c + 3 * ldc) = c3;
    *(c + 4 * ldc) = c4;
    *(c + 5 * ldc) = c5;
    *(c + 6 * ldc) = c6;
    *(c + 7 * ldc) = c7;
}
static void ssolve_1x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    BLASLONG k;
    FLOAT *aa = a, *bb = b;
    FLOAT a0, c0, c1, c2, c3;

    c0 = *(c + 0 * ldc);
    c1 = *(c + 1 * ldc);
    c2 = *(c + 2 * ldc);
    c3 = *(c + 3 * ldc);

    for (k = 0; k < bk; k++)
    {
        c0 -= aa[0] * bb[0];
        c1 -= aa[0] * bb[1];
        c2 -= aa[0] * bb[2];
        c3 -= aa[0] * bb[3];

        aa += 1;
        bb += 4;
    }

    a0 = *(a - 1);

    c0 *= a0;
    c1 *= a0;
    c2 *= a0;
    c3 *= a0;

    *(b - 4) = c0;
    *(b - 3) = c1;
    *(b - 2) = c2;
    *(b - 1) = c3;

    *(c + 0 * ldc) = c0;
    *(c + 1 * ldc) = c1;
    *(c + 2 * ldc) = c2;
    *(c + 3 * ldc) = c3;
}
static void ssolve_1x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    BLASLONG k;
    FLOAT *aa = a, *bb = b;
    FLOAT a0, c0, c1;

    c0 = *c;
    c1 = *(c + ldc);

    for (k = 0; k < bk; k++)
    {
        c0 -= aa[0] * bb[0];
        c1 -= aa[0] * bb[1];

        aa += 1;
        bb += 2;
    }

    a0 = *(a - 1);

    c0 *= a0;
    c1 *= a0;

    *(b - 2) = c0;
    *(b - 1) = c1;

    *(c + 0 * ldc) = c0;
    *(c + 1 * ldc) = c1;
}
static void ssolve_1x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
{
    BLASLONG k;

    for (k = 0; k < bk; k++)
    {
        *c -= a[k] * b[k];
    }

    *c *= *(a - 1);
    *(b - 1) = *c;
}
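
/* CNAME() is the driver: it walks C in column panels of width 8, then
   handles any remaining 4, 2 and 1 columns of n.  Within each panel the
   row blocks are solved from the bottom of the matrix upwards -- the
   m & 1 / m & 2 / m & 4 edge blocks first, then the full 8-row blocks --
   with kk tracking how many columns of the update remain for each block. */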
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
          FLOAT *c, BLASLONG ldc, BLASLONG offset)
{
    FLOAT *aa, *cc;
    BLASLONG i, j, kk;

    for (j = (n >> 3); j--;)
    {
        kk = m + offset;

        if (m & 7)
        {
            if (m & 1)
            {
                aa = a + (m - 1) * k + kk;
                cc = c + (m - 1);

                ssolve_1x8_ln_msa(aa, b + 8 * kk, cc, ldc, (k - kk));

                kk -= 1;
            }

            if (m & 2)
            {
                aa = a + ((m & ~1) - 2) * k + 2 * kk;
                cc = c + ((m & ~1) - 2);

                ssolve_2x8_ln_msa(aa, b + 8 * kk, cc, ldc, (k - kk));

                kk -= 2;
            }

            if (m & 4)
            {
                aa = a + ((m & ~3) - 4) * k + 4 * kk;
                cc = c + ((m & ~3) - 4);

                ssolve_4x8_ln_msa(aa, b + 8 * kk, cc, ldc, (k - kk));

                kk -= 4;
            }
        }

        i = (m >> 3);
        if (i > 0)
        {
            aa = a + ((m & ~7) - 8) * k;
            cc = c + ((m & ~7) - 8);

            do
            {
                ssolve_8x8_ln_msa(aa + 8 * kk, b + 8 * kk, cc, ldc, (k - kk));

                aa -= 8 * k;
                cc -= 8;
                kk -= 8;
                i--;
            } while (i > 0);
        }

        b += 8 * k;
        c += 8 * ldc;
    }

    if (n & 7)
    {
        if (n & 4)
        {
            kk = m + offset;

            if (m & 7)
            {
                if (m & 1)
                {
                    aa = a + (m - 1) * k + kk;
                    cc = c + (m - 1);

                    ssolve_1x4_ln_msa(aa, b + 4 * kk, cc, ldc, (k - kk));

                    kk -= 1;
                }

                if (m & 2)
                {
                    aa = a + ((m & ~1) - 2) * k + 2 * kk;
                    cc = c + ((m & ~1) - 2);

                    ssolve_2x4_ln_msa(aa, b + 4 * kk, cc, ldc, (k - kk));

                    kk -= 2;
                }

                if (m & 4)
                {
                    aa = a + ((m & ~3) - 4) * k + 4 * kk;
                    cc = c + ((m & ~3) - 4);

                    ssolve_4x4_ln_msa(aa, b + 4 * kk, cc, ldc, (k - kk));

                    kk -= 4;
                }
            }

            i = (m >> 3);
            if (i > 0)
            {
                aa = a + ((m & ~7) - 8) * k;
                cc = c + ((m & ~7) - 8);

                do
                {
                    ssolve_8x4_ln_msa(aa + 8 * kk, b + 4 * kk, cc, ldc, (k - kk));

                    aa -= 8 * k;
                    cc -= 8;
                    kk -= 8;
                    i--;
                } while (i > 0);
            }

            b += 4 * k;
            c += 4 * ldc;
        }

        if (n & 2)
        {
            kk = m + offset;

            if (m & 7)
            {
                if (m & 1)
                {
                    aa = a + (m - 1) * k + kk;
                    cc = c + (m - 1);

                    ssolve_1x2_ln_msa(aa, b + 2 * kk, cc, ldc, (k - kk));

                    kk -= 1;
                }

                if (m & 2)
                {
                    aa = a + ((m & ~1) - 2) * k + 2 * kk;
                    cc = c + ((m & ~1) - 2);

                    ssolve_2x2_ln_msa(aa, b + 2 * kk, cc, ldc, (k - kk));

                    kk -= 2;
                }

                if (m & 4)
                {
                    aa = a + ((m & ~3) - 4) * k + 4 * kk;
                    cc = c + ((m & ~3) - 4);

                    ssolve_4x2_ln_msa(aa, b + 2 * kk, cc, ldc, (k - kk));

                    kk -= 4;
                }
            }

            i = (m >> 3);
            if (i > 0)
            {
                aa = a + ((m & ~7) - 8) * k;
                cc = c + ((m & ~7) - 8);

                do
                {
                    ssolve_8x2_ln_msa(aa + 8 * kk, b + 2 * kk, cc, ldc, (k - kk));

                    aa -= 8 * k;
                    cc -= 8;
                    kk -= 8;
                    i--;
                } while (i > 0);
            }

            b += 2 * k;
            c += 2 * ldc;
        }

        if (n & 1)
        {
            kk = m + offset;

            if (m & 7)
            {
                if (m & 1)
                {
                    aa = a + (m - 1) * k + kk;
                    cc = c + (m - 1);

                    ssolve_1x1_ln_msa(aa, b + kk, cc, (k - kk));

                    kk -= 1;
                }

                if (m & 2)
                {
                    aa = a + ((m & ~1) - 2) * k + 2 * kk;
                    cc = c + ((m & ~1) - 2);

                    ssolve_2x1_ln_msa(aa, b + kk, cc, (k - kk));

                    kk -= 2;
                }

                if (m & 4)
                {
                    aa = a + ((m & ~3) - 4) * k + 4 * kk;
                    cc = c + ((m & ~3) - 4);

                    ssolve_4x1_ln_msa(aa, b + kk, cc, (k - kk));

                    kk -= 4;
                }
            }

            i = (m >> 3);
            if (i > 0)
            {
                aa = a + ((m & ~7) - 8) * k;
                cc = c + ((m & ~7) - 8);

                do
                {
                    ssolve_8x1_ln_msa(aa + 8 * kk, b + kk, cc, (k - kk));

                    aa -= 8 * k;
                    cc -= 8;
                    kk -= 8;
                    i--;
                } while (i > 0);
            }

            b += k;
            c += ldc;
        }
    }

    return 0;
}