You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

dgemm_kernel_8x4_msa.c 44 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566
  1. /*******************************************************************************
  2. Copyright (c) 2016, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #include "common.h"
  28. #include "macros_msa.h"
  29. int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
  30. FLOAT *C, BLASLONG ldc
  31. #ifdef TRMMKERNEL
  32. , BLASLONG offset
  33. #endif
  34. )
  35. {
  36. BLASLONG i, j, l, temp;
  37. #if defined(TRMMKERNEL)
  38. BLASLONG off;
  39. #endif
  40. FLOAT *pc0, *pc1, *pc2, *pc3, *pa0, *pb0;
  41. FLOAT tmp0, tmp1, tmp2, tmp3;
  42. FLOAT a0, b0, b1, b2, b3;
  43. v2f64 v_alpha = {alpha, alpha};
  44. v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0, src_b1;
  45. v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  46. v2f64 res0, res1, res2, res3, res4, res5, res6, res7;
  47. v2f64 res8, res9, res10, res11, res12, res13, res14, res15;
  48. #if defined(TRMMKERNEL) && !defined(LEFT)
  49. off = -offset;
  50. #endif
  51. for (j = (n >> 2); j--;)
  52. {
  53. pc0 = C;
  54. pc1 = pc0 + ldc;
  55. pc2 = pc1 + ldc;
  56. pc3 = pc2 + ldc;
  57. pa0 = A;
  58. #if defined(TRMMKERNEL) && defined(LEFT)
  59. off = offset;
  60. #endif
  61. for (i = (m >> 3); i--;)
  62. {
  63. #if defined(TRMMKERNEL)
  64. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  65. pb0 = B;
  66. #else
  67. pa0 += off * 8;
  68. pb0 = B + off * 4;
  69. #endif
  70. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  71. temp = k - off;
  72. #elif defined(LEFT)
  73. temp = off + 8; // number of values in A
  74. #else
  75. temp = off + 4; // number of values in B
  76. #endif
  77. #else
  78. pb0 = B;
  79. temp = k;
  80. #endif
  81. LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
  82. LD_DP2_INC(pb0, 2, src_b0, src_b1);
  83. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  84. res0 = src_a0 * src_b;
  85. res1 = src_a1 * src_b;
  86. res2 = src_a2 * src_b;
  87. res3 = src_a3 * src_b;
  88. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  89. res4 = src_a0 * src_b;
  90. res5 = src_a1 * src_b;
  91. res6 = src_a2 * src_b;
  92. res7 = src_a3 * src_b;
  93. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
  94. res8 = src_a0 * src_b;
  95. res9 = src_a1 * src_b;
  96. res10 = src_a2 * src_b;
  97. res11 = src_a3 * src_b;
  98. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
  99. res12 = src_a0 * src_b;
  100. res13 = src_a1 * src_b;
  101. res14 = src_a2 * src_b;
  102. res15 = src_a3 * src_b;
  103. for (l = ((temp - 1) >> 1); l--;)
  104. {
  105. LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
  106. LD_DP2_INC(pb0, 2, src_b0, src_b1);
  107. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  108. res0 += src_a0 * src_b;
  109. res1 += src_a1 * src_b;
  110. res2 += src_a2 * src_b;
  111. res3 += src_a3 * src_b;
  112. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  113. res4 += src_a0 * src_b;
  114. res5 += src_a1 * src_b;
  115. res6 += src_a2 * src_b;
  116. res7 += src_a3 * src_b;
  117. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
  118. res8 += src_a0 * src_b;
  119. res9 += src_a1 * src_b;
  120. res10 += src_a2 * src_b;
  121. res11 += src_a3 * src_b;
  122. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
  123. res12 += src_a0 * src_b;
  124. res13 += src_a1 * src_b;
  125. res14 += src_a2 * src_b;
  126. res15 += src_a3 * src_b;
  127. LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
  128. LD_DP2_INC(pb0, 2, src_b0, src_b1);
  129. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  130. res0 += src_a0 * src_b;
  131. res1 += src_a1 * src_b;
  132. res2 += src_a2 * src_b;
  133. res3 += src_a3 * src_b;
  134. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  135. res4 += src_a0 * src_b;
  136. res5 += src_a1 * src_b;
  137. res6 += src_a2 * src_b;
  138. res7 += src_a3 * src_b;
  139. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
  140. res8 += src_a0 * src_b;
  141. res9 += src_a1 * src_b;
  142. res10 += src_a2 * src_b;
  143. res11 += src_a3 * src_b;
  144. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
  145. res12 += src_a0 * src_b;
  146. res13 += src_a1 * src_b;
  147. res14 += src_a2 * src_b;
  148. res15 += src_a3 * src_b;
  149. }
  150. if ((temp - 1) & 1)
  151. {
  152. LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
  153. LD_DP2_INC(pb0, 2, src_b0, src_b1);
  154. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  155. res0 += src_a0 * src_b;
  156. res1 += src_a1 * src_b;
  157. res2 += src_a2 * src_b;
  158. res3 += src_a3 * src_b;
  159. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  160. res4 += src_a0 * src_b;
  161. res5 += src_a1 * src_b;
  162. res6 += src_a2 * src_b;
  163. res7 += src_a3 * src_b;
  164. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
  165. res8 += src_a0 * src_b;
  166. res9 += src_a1 * src_b;
  167. res10 += src_a2 * src_b;
  168. res11 += src_a3 * src_b;
  169. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
  170. res12 += src_a0 * src_b;
  171. res13 += src_a1 * src_b;
  172. res14 += src_a2 * src_b;
  173. res15 += src_a3 * src_b;
  174. }
  175. #if defined(TRMMKERNEL)
  176. dst0 = res0 * v_alpha;
  177. dst1 = res1 * v_alpha;
  178. dst2 = res2 * v_alpha;
  179. dst3 = res3 * v_alpha;
  180. dst4 = res4 * v_alpha;
  181. dst5 = res5 * v_alpha;
  182. dst6 = res6 * v_alpha;
  183. dst7 = res7 * v_alpha;
  184. #else
  185. LD_DP4(pc0, 2, dst0, dst1, dst2, dst3);
  186. LD_DP4(pc1, 2, dst4, dst5, dst6, dst7);
  187. dst0 += res0 * v_alpha;
  188. dst1 += res1 * v_alpha;
  189. dst2 += res2 * v_alpha;
  190. dst3 += res3 * v_alpha;
  191. dst4 += res4 * v_alpha;
  192. dst5 += res5 * v_alpha;
  193. dst6 += res6 * v_alpha;
  194. dst7 += res7 * v_alpha;
  195. #endif
  196. ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2);
  197. ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2);
  198. #if defined(TRMMKERNEL)
  199. dst0 = res8 * v_alpha;
  200. dst1 = res9 * v_alpha;
  201. dst2 = res10 * v_alpha;
  202. dst3 = res11 * v_alpha;
  203. dst4 = res12 * v_alpha;
  204. dst5 = res13 * v_alpha;
  205. dst6 = res14 * v_alpha;
  206. dst7 = res15 * v_alpha;
  207. #else
  208. LD_DP4(pc2, 2, dst0, dst1, dst2, dst3);
  209. LD_DP4(pc3, 2, dst4, dst5, dst6, dst7);
  210. dst0 += res8 * v_alpha;
  211. dst1 += res9 * v_alpha;
  212. dst2 += res10 * v_alpha;
  213. dst3 += res11 * v_alpha;
  214. dst4 += res12 * v_alpha;
  215. dst5 += res13 * v_alpha;
  216. dst6 += res14 * v_alpha;
  217. dst7 += res15 * v_alpha;
  218. #endif
  219. ST_DP4_INC(dst0, dst1, dst2, dst3, pc2, 2);
  220. ST_DP4_INC(dst4, dst5, dst6, dst7, pc3, 2);
  221. #if defined(TRMMKERNEL)
  222. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  223. temp = k - off;
  224. #ifdef LEFT
  225. temp -= 8; // number of values in A
  226. #else
  227. temp -= 4; // number of values in B
  228. #endif
  229. pa0 += temp * 8;
  230. pb0 += temp * 4;
  231. #endif
  232. #ifdef LEFT
  233. off += 8; // number of values in A
  234. #endif
  235. #endif
  236. }
  237. if (m & 4)
  238. {
  239. #if defined(TRMMKERNEL)
  240. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  241. pb0 = B;
  242. #else
  243. pa0 += off * 4;
  244. pb0 = B + off * 4;
  245. #endif
  246. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  247. temp = k - off;
  248. #elif defined(LEFT)
  249. temp = off + 4; // number of values in A
  250. #else
  251. temp = off + 4; // number of values in B
  252. #endif
  253. #else
  254. pb0 = B;
  255. temp = k;
  256. #endif
  257. LD_DP2_INC(pa0, 2, src_a0, src_a1);
  258. LD_DP2_INC(pb0, 2, src_b0, src_b1);
  259. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  260. res0 = src_a0 * src_b;
  261. res1 = src_a1 * src_b;
  262. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  263. res2 = src_a0 * src_b;
  264. res3 = src_a1 * src_b;
  265. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
  266. res4 = src_a0 * src_b;
  267. res5 = src_a1 * src_b;
  268. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
  269. res6 = src_a0 * src_b;
  270. res7 = src_a1 * src_b;
  271. for (l = ((temp - 1) >> 1); l--;)
  272. {
  273. LD_DP2_INC(pa0, 2, src_a0, src_a1);
  274. LD_DP2_INC(pb0, 2, src_b0, src_b1);
  275. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  276. res0 += src_a0 * src_b;
  277. res1 += src_a1 * src_b;
  278. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  279. res2 += src_a0 * src_b;
  280. res3 += src_a1 * src_b;
  281. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
  282. res4 += src_a0 * src_b;
  283. res5 += src_a1 * src_b;
  284. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
  285. res6 += src_a0 * src_b;
  286. res7 += src_a1 * src_b;
  287. LD_DP2_INC(pa0, 2, src_a0, src_a1);
  288. LD_DP2_INC(pb0, 2, src_b0, src_b1);
  289. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  290. res0 += src_a0 * src_b;
  291. res1 += src_a1 * src_b;
  292. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  293. res2 += src_a0 * src_b;
  294. res3 += src_a1 * src_b;
  295. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
  296. res4 += src_a0 * src_b;
  297. res5 += src_a1 * src_b;
  298. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
  299. res6 += src_a0 * src_b;
  300. res7 += src_a1 * src_b;
  301. }
  302. if ((temp - 1) & 1)
  303. {
  304. LD_DP2_INC(pa0, 2, src_a0, src_a1);
  305. LD_DP2_INC(pb0, 2, src_b0, src_b1);
  306. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  307. res0 += src_a0 * src_b;
  308. res1 += src_a1 * src_b;
  309. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  310. res2 += src_a0 * src_b;
  311. res3 += src_a1 * src_b;
  312. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
  313. res4 += src_a0 * src_b;
  314. res5 += src_a1 * src_b;
  315. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
  316. res6 += src_a0 * src_b;
  317. res7 += src_a1 * src_b;
  318. }
  319. #if defined(TRMMKERNEL)
  320. dst0 = res0 * v_alpha;
  321. dst1 = res1 * v_alpha;
  322. dst2 = res2 * v_alpha;
  323. dst3 = res3 * v_alpha;
  324. dst4 = res4 * v_alpha;
  325. dst5 = res5 * v_alpha;
  326. dst6 = res6 * v_alpha;
  327. dst7 = res7 * v_alpha;
  328. #else
  329. LD_DP2(pc0, 2, dst0, dst1);
  330. LD_DP2(pc1, 2, dst2, dst3);
  331. LD_DP2(pc2, 2, dst4, dst5);
  332. LD_DP2(pc3, 2, dst6, dst7);
  333. dst0 += res0 * v_alpha;
  334. dst1 += res1 * v_alpha;
  335. dst2 += res2 * v_alpha;
  336. dst3 += res3 * v_alpha;
  337. dst4 += res4 * v_alpha;
  338. dst5 += res5 * v_alpha;
  339. dst6 += res6 * v_alpha;
  340. dst7 += res7 * v_alpha;
  341. #endif
  342. ST_DP2_INC(dst0, dst1, pc0, 2);
  343. ST_DP2_INC(dst2, dst3, pc1, 2);
  344. ST_DP2_INC(dst4, dst5, pc2, 2);
  345. ST_DP2_INC(dst6, dst7, pc3, 2);
  346. #if defined(TRMMKERNEL)
  347. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  348. temp = k - off;
  349. #ifdef LEFT
  350. temp -= 4; // number of values in A
  351. #else
  352. temp -= 4; // number of values in B
  353. #endif
  354. pa0 += temp * 4;
  355. pb0 += temp * 4;
  356. #endif
  357. #ifdef LEFT
  358. off += 4; // number of values in A
  359. #endif
  360. #endif
  361. }
  362. if (m & 2)
  363. {
  364. #if defined(TRMMKERNEL)
  365. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  366. pb0 = B;
  367. #else
  368. pa0 += off * 2;
  369. pb0 = B + off * 4;
  370. #endif
  371. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  372. temp = k - off;
  373. #elif defined(LEFT)
  374. temp = off + 2; // number of values in A
  375. #else
  376. temp = off + 4; // number of values in B
  377. #endif
  378. #else
  379. pb0 = B;
  380. temp = k;
  381. #endif
  382. src_a0 = LD_DP(pa0);
  383. pa0 += 2;
  384. LD_DP2_INC(pb0, 2, src_b0, src_b1);
  385. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  386. res0 = src_a0 * src_b;
  387. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  388. res1 = src_a0 * src_b;
  389. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
  390. res2 = src_a0 * src_b;
  391. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
  392. res3 = src_a0 * src_b;
  393. for (l = ((temp - 1) >> 1); l--;)
  394. {
  395. src_a0 = LD_DP(pa0);
  396. pa0 += 2;
  397. LD_DP2_INC(pb0, 2, src_b0, src_b1);
  398. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  399. res0 += src_a0 * src_b;
  400. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  401. res1 += src_a0 * src_b;
  402. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
  403. res2 += src_a0 * src_b;
  404. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
  405. res3 += src_a0 * src_b;
  406. src_a0 = LD_DP(pa0);
  407. pa0 += 2;
  408. LD_DP2_INC(pb0, 2, src_b0, src_b1);
  409. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  410. res0 += src_a0 * src_b;
  411. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  412. res1 += src_a0 * src_b;
  413. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
  414. res2 += src_a0 * src_b;
  415. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
  416. res3 += src_a0 * src_b;
  417. }
  418. if ((temp - 1) & 1)
  419. {
  420. src_a0 = LD_DP(pa0);
  421. pa0 += 2;
  422. LD_DP2_INC(pb0, 2, src_b0, src_b1);
  423. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  424. res0 += src_a0 * src_b;
  425. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  426. res1 += src_a0 * src_b;
  427. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
  428. res2 += src_a0 * src_b;
  429. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
  430. res3 += src_a0 * src_b;
  431. }
  432. #if defined(TRMMKERNEL)
  433. dst0 = res0 * v_alpha;
  434. dst1 = res1 * v_alpha;
  435. dst2 = res2 * v_alpha;
  436. dst3 = res3 * v_alpha;
  437. #else
  438. dst0 = LD_DP(pc0);
  439. dst1 = LD_DP(pc1);
  440. dst2 = LD_DP(pc2);
  441. dst3 = LD_DP(pc3);
  442. dst0 += res0 * v_alpha;
  443. dst1 += res1 * v_alpha;
  444. dst2 += res2 * v_alpha;
  445. dst3 += res3 * v_alpha;
  446. #endif
  447. ST_DP(dst0, pc0);
  448. ST_DP(dst1, pc1);
  449. ST_DP(dst2, pc2);
  450. ST_DP(dst3, pc3);
  451. #if defined(TRMMKERNEL)
  452. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  453. temp = k - off;
  454. #ifdef LEFT
  455. temp -= 2; // number of values in A
  456. #else
  457. temp -= 4; // number of values in B
  458. #endif
  459. pa0 += temp * 2;
  460. pb0 += temp * 4;
  461. #endif
  462. #ifdef LEFT
  463. off += 2; // number of values in A
  464. #endif
  465. #endif
  466. pc0 += 2;
  467. pc1 += 2;
  468. pc2 += 2;
  469. pc3 += 2;
  470. }
  471. if (m & 1)
  472. {
  473. #if defined(TRMMKERNEL)
  474. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  475. pb0 = B;
  476. #else
  477. pa0 += off * 1;
  478. pb0 = B + off * 4;
  479. #endif
  480. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  481. temp = k - off;
  482. #elif defined(LEFT)
  483. temp = off + 1; // number of values in A
  484. #else
  485. temp = off + 4; // number of values in B
  486. #endif
  487. #else
  488. pb0 = B;
  489. temp = k;
  490. #endif
  491. a0 = pa0[0];
  492. b0 = pb0[0];
  493. tmp0 = a0 * b0;
  494. b1 = pb0[1];
  495. tmp1 = a0 * b1;
  496. b2 = pb0[2];
  497. tmp2 = a0 * b2;
  498. b3 = pb0[3];
  499. tmp3 = a0 * b3;
  500. pa0 += 1;
  501. pb0 += 4;
  502. for (l = ((temp - 1) >> 1); l--;)
  503. {
  504. a0 = pa0[0];
  505. b0 = pb0[0];
  506. tmp0 += a0 * b0;
  507. b1 = pb0[1];
  508. tmp1 += a0 * b1;
  509. b2 = pb0[2];
  510. tmp2 += a0 * b2;
  511. b3 = pb0[3];
  512. tmp3 += a0 * b3;
  513. pa0 += 1;
  514. pb0 += 4;
  515. a0 = pa0[0];
  516. b0 = pb0[0];
  517. tmp0 += a0 * b0;
  518. b1 = pb0[1];
  519. tmp1 += a0 * b1;
  520. b2 = pb0[2];
  521. tmp2 += a0 * b2;
  522. b3 = pb0[3];
  523. tmp3 += a0 * b3;
  524. pa0 += 1;
  525. pb0 += 4;
  526. }
  527. if ((temp - 1) & 1)
  528. {
  529. a0 = pa0[0];
  530. b0 = pb0[0];
  531. tmp0 += a0 * b0;
  532. b1 = pb0[1];
  533. tmp1 += a0 * b1;
  534. b2 = pb0[2];
  535. tmp2 += a0 * b2;
  536. b3 = pb0[3];
  537. tmp3 += a0 * b3;
  538. pa0 += 1;
  539. pb0 += 4;
  540. }
  541. tmp0 = alpha * tmp0;
  542. tmp1 = alpha * tmp1;
  543. tmp2 = alpha * tmp2;
  544. tmp3 = alpha * tmp3;
  545. #if defined(TRMMKERNEL)
  546. pc0[0] = tmp0;
  547. pc1[0] = tmp1;
  548. pc2[0] = tmp2;
  549. pc3[0] = tmp3;
  550. #else
  551. pc0[0] += tmp0;
  552. pc1[0] += tmp1;
  553. pc2[0] += tmp2;
  554. pc3[0] += tmp3;
  555. #endif
  556. #if defined(TRMMKERNEL)
  557. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  558. temp = k - off;
  559. #ifdef LEFT
  560. temp -= 1; // number of values in A
  561. #else
  562. temp -= 4; // number of values in B
  563. #endif
  564. pa0 += temp * 1;
  565. pb0 += temp * 4;
  566. #endif
  567. #ifdef LEFT
  568. off += 1; // number of values in A
  569. #endif
  570. #endif
  571. pc0 += 1;
  572. pc1 += 1;
  573. pc2 += 1;
  574. pc3 += 1;
  575. }
  576. #if defined(TRMMKERNEL) && !defined(LEFT)
  577. off += 4; // number of values in A
  578. #endif
  579. l = (k << 2);
  580. B = B + l;
  581. i = (ldc << 2);
  582. C = C + i;
  583. }
  584. if (n & 2)
  585. {
  586. pc0 = C;
  587. pc1 = pc0 + ldc;
  588. pa0 = A;
  589. #if defined(TRMMKERNEL) && defined(LEFT)
  590. off = offset;
  591. #endif
  592. for (i = (m >> 3); i--;)
  593. {
  594. #if defined(TRMMKERNEL)
  595. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  596. pb0 = B;
  597. #else
  598. pa0 += off * 8;
  599. pb0 = B + off * 2;
  600. #endif
  601. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  602. temp = k - off;
  603. #elif defined(LEFT)
  604. temp = off + 8; // number of values in A
  605. #else
  606. temp = off + 2; // number of values in B
  607. #endif
  608. #else
  609. pb0 = B;
  610. temp = k;
  611. #endif
  612. LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
  613. src_b0 = LD_DP(pb0);
  614. pb0 += 2;
  615. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  616. res0 = src_a0 * src_b;
  617. res1 = src_a1 * src_b;
  618. res2 = src_a2 * src_b;
  619. res3 = src_a3 * src_b;
  620. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  621. res4 = src_a0 * src_b;
  622. res5 = src_a1 * src_b;
  623. res6 = src_a2 * src_b;
  624. res7 = src_a3 * src_b;
  625. for (l = ((temp - 1) >> 1); l--;)
  626. {
  627. LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
  628. src_b0 = LD_DP(pb0);
  629. pb0 += 2;
  630. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  631. res0 += src_a0 * src_b;
  632. res1 += src_a1 * src_b;
  633. res2 += src_a2 * src_b;
  634. res3 += src_a3 * src_b;
  635. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  636. res4 += src_a0 * src_b;
  637. res5 += src_a1 * src_b;
  638. res6 += src_a2 * src_b;
  639. res7 += src_a3 * src_b;
  640. LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
  641. src_b0 = LD_DP(pb0);
  642. pb0 += 2;
  643. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  644. res0 += src_a0 * src_b;
  645. res1 += src_a1 * src_b;
  646. res2 += src_a2 * src_b;
  647. res3 += src_a3 * src_b;
  648. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  649. res4 += src_a0 * src_b;
  650. res5 += src_a1 * src_b;
  651. res6 += src_a2 * src_b;
  652. res7 += src_a3 * src_b;
  653. }
  654. if ((temp - 1) & 1)
  655. {
  656. LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
  657. src_b0 = LD_DP(pb0);
  658. pb0 += 2;
  659. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  660. res0 += src_a0 * src_b;
  661. res1 += src_a1 * src_b;
  662. res2 += src_a2 * src_b;
  663. res3 += src_a3 * src_b;
  664. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  665. res4 += src_a0 * src_b;
  666. res5 += src_a1 * src_b;
  667. res6 += src_a2 * src_b;
  668. res7 += src_a3 * src_b;
  669. }
  670. #if defined(TRMMKERNEL)
  671. dst0 = res0 * v_alpha;
  672. dst1 = res1 * v_alpha;
  673. dst2 = res2 * v_alpha;
  674. dst3 = res3 * v_alpha;
  675. dst4 = res4 * v_alpha;
  676. dst5 = res5 * v_alpha;
  677. dst6 = res6 * v_alpha;
  678. dst7 = res7 * v_alpha;
  679. #else
  680. LD_DP4(pc0, 2, dst0, dst1, dst2, dst3);
  681. LD_DP4(pc1, 2, dst4, dst5, dst6, dst7);
  682. dst0 += res0 * v_alpha;
  683. dst1 += res1 * v_alpha;
  684. dst2 += res2 * v_alpha;
  685. dst3 += res3 * v_alpha;
  686. dst4 += res4 * v_alpha;
  687. dst5 += res5 * v_alpha;
  688. dst6 += res6 * v_alpha;
  689. dst7 += res7 * v_alpha;
  690. #endif
  691. ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2);
  692. ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2);
  693. #if defined(TRMMKERNEL)
  694. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  695. temp = k - off;
  696. #ifdef LEFT
  697. temp -= 8; // number of values in A
  698. #else
  699. temp -= 2; // number of values in B
  700. #endif
  701. pa0 += temp * 8;
  702. pb0 += temp * 2;
  703. #endif
  704. #ifdef LEFT
  705. off += 8; // number of values in A
  706. #endif
  707. #endif
  708. }
  709. if (m & 4)
  710. {
  711. #if defined(TRMMKERNEL)
  712. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  713. pb0 = B;
  714. #else
  715. pa0 += off * 4;
  716. pb0 = B + off * 2;
  717. #endif
  718. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  719. temp = k - off;
  720. #elif defined(LEFT)
  721. temp = off + 4; // number of values in A
  722. #else
  723. temp = off + 2; // number of values in B
  724. #endif
  725. #else
  726. pb0 = B;
  727. temp = k;
  728. #endif
  729. LD_DP2_INC(pa0, 2, src_a0, src_a1);
  730. src_b0 = LD_DP(pb0);
  731. pb0 += 2;
  732. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  733. res0 = src_a0 * src_b;
  734. res1 = src_a1 * src_b;
  735. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  736. res2 = src_a0 * src_b;
  737. res3 = src_a1 * src_b;
  738. for (l = ((temp - 1) >> 1); l--;)
  739. {
  740. LD_DP2_INC(pa0, 2, src_a0, src_a1);
  741. src_b0 = LD_DP(pb0);
  742. pb0 += 2;
  743. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  744. res0 += src_a0 * src_b;
  745. res1 += src_a1 * src_b;
  746. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  747. res2 += src_a0 * src_b;
  748. res3 += src_a1 * src_b;
  749. LD_DP2_INC(pa0, 2, src_a0, src_a1);
  750. src_b0 = LD_DP(pb0);
  751. pb0 += 2;
  752. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  753. res0 += src_a0 * src_b;
  754. res1 += src_a1 * src_b;
  755. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  756. res2 += src_a0 * src_b;
  757. res3 += src_a1 * src_b;
  758. }
  759. if ((temp - 1) & 1)
  760. {
  761. LD_DP2_INC(pa0, 2, src_a0, src_a1);
  762. src_b0 = LD_DP(pb0);
  763. pb0 += 2;
  764. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  765. res0 += src_a0 * src_b;
  766. res1 += src_a1 * src_b;
  767. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  768. res2 += src_a0 * src_b;
  769. res3 += src_a1 * src_b;
  770. }
  771. #if defined(TRMMKERNEL)
  772. dst0 = res0 * v_alpha;
  773. dst1 = res1 * v_alpha;
  774. dst2 = res2 * v_alpha;
  775. dst3 = res3 * v_alpha;
  776. #else
  777. LD_DP2(pc0, 2, dst0, dst1);
  778. LD_DP2(pc1, 2, dst2, dst3);
  779. dst0 += res0 * v_alpha;
  780. dst1 += res1 * v_alpha;
  781. dst2 += res2 * v_alpha;
  782. dst3 += res3 * v_alpha;
  783. #endif
  784. ST_DP2_INC(dst0, dst1, pc0, 2);
  785. ST_DP2_INC(dst2, dst3, pc1, 2);
  786. #if defined(TRMMKERNEL)
  787. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  788. temp = k - off;
  789. #ifdef LEFT
  790. temp -= 4; // number of values in A
  791. #else
  792. temp -= 2; // number of values in B
  793. #endif
  794. pa0 += temp * 4;
  795. pb0 += temp * 2;
  796. #endif
  797. #ifdef LEFT
  798. off += 4; // number of values in A
  799. #endif
  800. #endif
  801. }
  802. if (m & 2)
  803. {
  804. #if defined(TRMMKERNEL)
  805. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  806. pb0 = B;
  807. #else
  808. pa0 += off * 2;
  809. pb0 = B + off * 2;
  810. #endif
  811. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  812. temp = k - off;
  813. #elif defined(LEFT)
  814. temp = off + 2; // number of values in A
  815. #else
  816. temp = off + 2; // number of values in B
  817. #endif
  818. #else
  819. pb0 = B;
  820. temp = k;
  821. #endif
  822. src_a0 = LD_DP(pa0);
  823. pa0 += 2;
  824. src_b0 = LD_DP(pb0);
  825. pb0 += 2;
  826. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  827. res0 = src_a0 * src_b;
  828. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  829. res1 = src_a0 * src_b;
  830. for (l = ((temp - 1) >> 1); l--;)
  831. {
  832. src_a0 = LD_DP(pa0);
  833. pa0 += 2;
  834. src_b0 = LD_DP(pb0);
  835. pb0 += 2;
  836. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  837. res0 += src_a0 * src_b;
  838. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  839. res1 += src_a0 * src_b;
  840. src_a0 = LD_DP(pa0);
  841. pa0 += 2;
  842. src_b0 = LD_DP(pb0);
  843. pb0 += 2;
  844. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  845. res0 += src_a0 * src_b;
  846. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  847. res1 += src_a0 * src_b;
  848. }
  849. if ((temp - 1) & 1)
  850. {
  851. src_a0 = LD_DP(pa0);
  852. pa0 += 2;
  853. src_b0 = LD_DP(pb0);
  854. pb0 += 2;
  855. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  856. res0 += src_a0 * src_b;
  857. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  858. res1 += src_a0 * src_b;
  859. }
  860. #if defined(TRMMKERNEL)
  861. dst0 = res0 * v_alpha;
  862. dst1 = res1 * v_alpha;
  863. #else
  864. dst0 = LD_DP(pc0);
  865. dst1 = LD_DP(pc1);
  866. dst0 += res0 * v_alpha;
  867. dst1 += res1 * v_alpha;
  868. #endif
  869. ST_DP(dst0, pc0);
  870. ST_DP(dst1, pc1);
  871. #if defined(TRMMKERNEL)
  872. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  873. temp = k - off;
  874. #ifdef LEFT
  875. temp -= 2; // number of values in A
  876. #else
  877. temp -= 2; // number of values in B
  878. #endif
  879. pa0 += temp * 2;
  880. pb0 += temp * 2;
  881. #endif
  882. #ifdef LEFT
  883. off += 2; // number of values in A
  884. #endif
  885. #endif
  886. pc0 += 2;
  887. pc1 += 2;
  888. }
  889. if (m & 1)
  890. {
  891. #if defined(TRMMKERNEL)
  892. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  893. pb0 = B;
  894. #else
  895. pa0 += off * 1;
  896. pb0 = B + off * 2;
  897. #endif
  898. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  899. temp = k - off;
  900. #elif defined(LEFT)
  901. temp = off + 1; // number of values in A
  902. #else
  903. temp = off + 2; // number of values in B
  904. #endif
  905. #else
  906. pb0 = B;
  907. temp = k;
  908. #endif
  909. a0 = pa0[0];
  910. b0 = pb0[0];
  911. tmp0 = a0 * b0;
  912. b1 = pb0[1];
  913. tmp1 = a0 * b1;
  914. pa0 += 1;
  915. pb0 += 2;
  916. for (l = ((temp - 1) >> 1); l--;)
  917. {
  918. a0 = pa0[0];
  919. b0 = pb0[0];
  920. tmp0 += a0 * b0;
  921. b1 = pb0[1];
  922. tmp1 += a0 * b1;
  923. pa0 += 1;
  924. pb0 += 2;
  925. a0 = pa0[0];
  926. b0 = pb0[0];
  927. tmp0 += a0 * b0;
  928. b1 = pb0[1];
  929. tmp1 += a0 * b1;
  930. pa0 += 1;
  931. pb0 += 2;
  932. }
  933. if ((temp - 1) & 1)
  934. {
  935. a0 = pa0[0];
  936. b0 = pb0[0];
  937. tmp0 += a0 * b0;
  938. b1 = pb0[1];
  939. tmp1 += a0 * b1;
  940. pa0 += 1;
  941. pb0 += 2;
  942. }
  943. tmp0 = alpha * tmp0;
  944. tmp1 = alpha * tmp1;
  945. #if defined(TRMMKERNEL)
  946. pc0[0] = tmp0;
  947. pc1[0] = tmp1;
  948. #else
  949. pc0[0] += tmp0;
  950. pc1[0] += tmp1;
  951. #endif
  952. #if defined(TRMMKERNEL)
  953. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  954. temp = k - off;
  955. #ifdef LEFT
  956. temp -= 1; // number of values in A
  957. #else
  958. temp -= 2; // number of values in B
  959. #endif
  960. pa0 += temp * 1;
  961. pb0 += temp * 2;
  962. #endif
  963. #ifdef LEFT
  964. off += 1; // number of values in A
  965. #endif
  966. #endif
  967. pc0 += 1;
  968. pc1 += 1;
  969. }
  970. #if defined(TRMMKERNEL) && !defined(LEFT)
  971. off += 2; // number of values in A
  972. #endif
  973. l = (k << 1);
  974. B = B + l;
  975. i = (ldc << 1);
  976. C = C + i;
  977. }
  978. if (n & 1)
  979. {
  980. pc0 = C;
  981. pa0 = A;
  982. #if defined(TRMMKERNEL) && defined(LEFT)
  983. off = offset;
  984. #endif
  985. for (i = (m >> 3); i--;)
  986. {
  987. #if defined(TRMMKERNEL)
  988. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  989. pb0 = B;
  990. #else
  991. pa0 += off * 8;
  992. pb0 = B + off * 1;
  993. #endif
  994. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  995. temp = k - off;
  996. #elif defined(LEFT)
  997. temp = off + 8; // number of values in A
  998. #else
  999. temp = off + 1; // number of values in B
  1000. #endif
  1001. #else
  1002. pb0 = B;
  1003. temp = k;
  1004. #endif
  1005. LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
  1006. src_b[0] = pb0[0];
  1007. src_b[1] = pb0[0];
  1008. res0 = src_a0 * src_b;
  1009. res1 = src_a1 * src_b;
  1010. res2 = src_a2 * src_b;
  1011. res3 = src_a3 * src_b;
  1012. pb0 += 1;
  1013. for (l = ((temp - 1) >> 1); l--;)
  1014. {
  1015. LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
  1016. src_b[0] = pb0[0];
  1017. src_b[1] = pb0[0];
  1018. res0 += src_a0 * src_b;
  1019. res1 += src_a1 * src_b;
  1020. res2 += src_a2 * src_b;
  1021. res3 += src_a3 * src_b;
  1022. pb0 += 1;
  1023. LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
  1024. src_b[0] = pb0[0];
  1025. src_b[1] = pb0[0];
  1026. res0 += src_a0 * src_b;
  1027. res1 += src_a1 * src_b;
  1028. res2 += src_a2 * src_b;
  1029. res3 += src_a3 * src_b;
  1030. pb0 += 1;
  1031. }
  1032. if ((temp - 1) & 1)
  1033. {
  1034. LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
  1035. src_b[0] = pb0[0];
  1036. src_b[1] = pb0[0];
  1037. res0 += src_a0 * src_b;
  1038. res1 += src_a1 * src_b;
  1039. res2 += src_a2 * src_b;
  1040. res3 += src_a3 * src_b;
  1041. pb0 += 1;
  1042. }
  1043. #if defined(TRMMKERNEL)
  1044. dst0 = res0 * v_alpha;
  1045. dst1 = res1 * v_alpha;
  1046. dst2 = res2 * v_alpha;
  1047. dst3 = res3 * v_alpha;
  1048. #else
  1049. LD_DP4(pc0, 2, dst0, dst1, dst2, dst3);
  1050. dst0 += res0 * v_alpha;
  1051. dst1 += res1 * v_alpha;
  1052. dst2 += res2 * v_alpha;
  1053. dst3 += res3 * v_alpha;
  1054. #endif
  1055. ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2);
  1056. #if defined(TRMMKERNEL)
  1057. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1058. temp = k - off;
  1059. #ifdef LEFT
  1060. temp -= 8; // number of values in A
  1061. #else
  1062. temp -= 1; // number of values in B
  1063. #endif
  1064. pa0 += temp * 8;
  1065. pb0 += temp * 1;
  1066. #endif
  1067. #ifdef LEFT
  1068. off += 8; // number of values in A
  1069. #endif
  1070. #endif
  1071. }
  1072. if (m & 4)
  1073. {
  1074. #if defined(TRMMKERNEL)
  1075. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1076. pb0 = B;
  1077. #else
  1078. pa0 += off * 4;
  1079. pb0 = B + off * 1;
  1080. #endif
  1081. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1082. temp = k - off;
  1083. #elif defined(LEFT)
  1084. temp = off + 4; // number of values in A
  1085. #else
  1086. temp = off + 1; // number of values in B
  1087. #endif
  1088. #else
  1089. pb0 = B;
  1090. temp = k;
  1091. #endif
  1092. LD_DP2_INC(pa0, 2, src_a0, src_a1);
  1093. src_b[0] = pb0[0];
  1094. src_b[1] = pb0[0];
  1095. res0 = src_a0 * src_b;
  1096. res1 = src_a1 * src_b;
  1097. pb0 += 1;
  1098. for (l = ((temp - 1) >> 1); l--;)
  1099. {
  1100. LD_DP2_INC(pa0, 2, src_a0, src_a1);
  1101. src_b[0] = pb0[0];
  1102. src_b[1] = pb0[0];
  1103. res0 += src_a0 * src_b;
  1104. res1 += src_a1 * src_b;
  1105. pb0 += 1;
  1106. LD_DP2_INC(pa0, 2, src_a0, src_a1);
  1107. src_b[0] = pb0[0];
  1108. src_b[1] = pb0[0];
  1109. res0 += src_a0 * src_b;
  1110. res1 += src_a1 * src_b;
  1111. pb0 += 1;
  1112. }
  1113. if ((temp - 1) & 1)
  1114. {
  1115. LD_DP2_INC(pa0, 2, src_a0, src_a1);
  1116. src_b[0] = pb0[0];
  1117. src_b[1] = pb0[0];
  1118. res0 += src_a0 * src_b;
  1119. res1 += src_a1 * src_b;
  1120. pb0 += 1;
  1121. }
  1122. #if defined(TRMMKERNEL)
  1123. dst0 = res0 * v_alpha;
  1124. dst1 = res1 * v_alpha;
  1125. #else
  1126. LD_DP2(pc0, 2, dst0, dst1);
  1127. dst0 += res0 * v_alpha;
  1128. dst1 += res1 * v_alpha;
  1129. #endif
  1130. ST_DP2_INC(dst0, dst1, pc0, 2);
  1131. #if defined(TRMMKERNEL)
  1132. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1133. temp = k - off;
  1134. #ifdef LEFT
  1135. temp -= 4; // number of values in A
  1136. #else
  1137. temp -= 1; // number of values in B
  1138. #endif
  1139. pa0 += temp * 4;
  1140. pb0 += temp * 1;
  1141. #endif
  1142. #ifdef LEFT
  1143. off += 4; // number of values in A
  1144. #endif
  1145. #endif
  1146. }
  1147. if (m & 2)
  1148. {
  1149. #if defined(TRMMKERNEL)
  1150. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1151. pb0 = B;
  1152. #else
  1153. pa0 += off * 2;
  1154. pb0 = B + off * 1;
  1155. #endif
  1156. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1157. temp = k - off;
  1158. #elif defined(LEFT)
  1159. temp = off + 2; // number of values in A
  1160. #else
  1161. temp = off + 1; // number of values in B
  1162. #endif
  1163. #else
  1164. pb0 = B;
  1165. temp = k;
  1166. #endif
  1167. src_a0 = LD_DP(pa0);
  1168. src_b[0] = pb0[0];
  1169. src_b[1] = pb0[0];
  1170. res0 = src_a0 * src_b;
  1171. pa0 += 2;
  1172. pb0 += 1;
  1173. for (l = ((temp - 1) >> 1); l--;)
  1174. {
  1175. src_a0 = LD_DP(pa0);
  1176. src_b[0] = pb0[0];
  1177. src_b[1] = pb0[0];
  1178. res0 += src_a0 * src_b;
  1179. pa0 += 2;
  1180. pb0 += 1;
  1181. src_a0 = LD_DP(pa0);
  1182. src_b[0] = pb0[0];
  1183. src_b[1] = pb0[0];
  1184. res0 += src_a0 * src_b;
  1185. pa0 += 2;
  1186. pb0 += 1;
  1187. }
  1188. if ((temp - 1) & 1)
  1189. {
  1190. src_a0 = LD_DP(pa0);
  1191. src_b[0] = pb0[0];
  1192. src_b[1] = pb0[0];
  1193. res0 += src_a0 * src_b;
  1194. pa0 += 2;
  1195. pb0 += 1;
  1196. }
  1197. #if defined(TRMMKERNEL)
  1198. dst0 = res0 * v_alpha;
  1199. #else
  1200. dst0 = LD_DP(pc0);
  1201. dst0 += res0 * v_alpha;
  1202. #endif
  1203. ST_DP(dst0, pc0);
  1204. #if defined(TRMMKERNEL)
  1205. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1206. temp = k - off;
  1207. #ifdef LEFT
  1208. temp -= 2; // number of values in A
  1209. #else
  1210. temp -= 1; // number of values in B
  1211. #endif
  1212. pa0 += temp * 2;
  1213. pb0 += temp * 1;
  1214. #endif
  1215. #ifdef LEFT
  1216. off += 2; // number of values in A
  1217. #endif
  1218. #endif
  1219. pc0 += 2;
  1220. }
  1221. if (m & 1)
  1222. {
  1223. #if defined(TRMMKERNEL)
  1224. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1225. pb0 = B;
  1226. #else
  1227. pa0 += off * 1;
  1228. pb0 = B + off * 1;
  1229. #endif
  1230. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1231. temp = k - off;
  1232. #elif defined(LEFT)
  1233. temp = off + 1; // number of values in A
  1234. #else
  1235. temp = off + 1; // number of values in B
  1236. #endif
  1237. #else
  1238. pb0 = B;
  1239. temp = k;
  1240. #endif
  1241. a0 = pa0[0];
  1242. b0 = pb0[0];
  1243. tmp0 = a0 * b0;
  1244. pa0 += 1;
  1245. pb0 += 1;
  1246. for (l = ((temp - 1) >> 1); l--;)
  1247. {
  1248. a0 = pa0[0];
  1249. b0 = pb0[0];
  1250. tmp0 += a0 * b0;
  1251. pa0 += 1;
  1252. pb0 += 1;
  1253. a0 = pa0[0];
  1254. b0 = pb0[0];
  1255. tmp0 += a0 * b0;
  1256. pa0 += 1;
  1257. pb0 += 1;
  1258. }
  1259. if ((temp - 1) & 1)
  1260. {
  1261. a0 = pa0[0];
  1262. b0 = pb0[0];
  1263. tmp0 += a0 * b0;
  1264. pa0 += 1;
  1265. pb0 += 1;
  1266. }
  1267. #if defined(TRMMKERNEL)
  1268. pc0[0] = alpha * tmp0;
  1269. #else
  1270. pc0[0] += alpha * tmp0;
  1271. #endif
  1272. #if defined(TRMMKERNEL)
  1273. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1274. temp = k - off;
  1275. #ifdef LEFT
  1276. temp -= 1; // number of values in A
  1277. #else
  1278. temp -= 1; // number of values in B
  1279. #endif
  1280. pa0 += temp * 1;
  1281. pb0 += temp * 1;
  1282. #endif
  1283. #ifdef LEFT
  1284. off += 1; // number of values in A
  1285. #endif
  1286. #endif
  1287. pc0 += 1;
  1288. }
  1289. #if defined(TRMMKERNEL) && !defined(LEFT)
  1290. off += 1; // number of values in A
  1291. #endif
  1292. l = (k << 0);
  1293. B = B + l;
  1294. i = (ldc << 0);
  1295. C = C + i;
  1296. }
  1297. return 0;
  1298. }