You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

sgemm_kernel_power10.c 36 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386
  1. /*********************************************************************************
  2. Copyright (c) 2020, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. **********************************************************************************/
  27. #include "common.h"
  28. #include <altivec.h>
  29. typedef __vector unsigned char vec_t;
  30. typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
  31. typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
  /*
   * Accumulator store macros.
   *
   * Each SAVE* macro unpacks one MMA accumulator into `result` with
   * __builtin_mma_disassemble_acc and then writes alpha-scaled elements of
   * `result` to rows of the C tile addressed through `CO`.
   *
   * The macros are statement lists, not expressions.  They rely on the
   * following names being in scope where they are expanded:
   *   result - array receiving the disassembled accumulator
   *   rowC   - scratch pointer of the vector type used for the store
   *   CO     - base pointer of the current C tile
   *   ldc    - distance between consecutive C rows addressed here
   *   alpha  - scale factor
   *
   * With TRMMKERNEL defined the destination is overwritten (`=`); otherwise
   * the scaled value is added to what is already stored (`+=`).
   */
  #if defined(TRMMKERNEL)
  #define ACC_STORE_OP =
  #else
  #define ACC_STORE_OP +=
  #endif

  /* Store result[IDX] * alpha to row ROW, column offset J, as one VTYPE.  */
  #define ACC_STORE_ROW(VTYPE, ROW, J, IDX) \
    rowC = (VTYPE *) &CO[ROW * ldc + J]; \
    rowC[0] ACC_STORE_OP result[IDX] * alpha;

  /* Rows 0..3 of the tile, one v4sf_t per row, from result[0..3].  */
  #define SAVE_ACC(ACC, J) \
    __builtin_mma_disassemble_acc ((void *) result, ACC); \
    ACC_STORE_ROW (v4sf_t, 0, J, 0) \
    ACC_STORE_ROW (v4sf_t, 1, J, 1) \
    ACC_STORE_ROW (v4sf_t, 2, J, 2) \
    ACC_STORE_ROW (v4sf_t, 3, J, 3)

  /* Rows 4..7 of the tile, one v4sf_t per row, from result[0..3].  */
  #define SAVE_ACC1(ACC, J) \
    __builtin_mma_disassemble_acc ((void *) result, ACC); \
    ACC_STORE_ROW (v4sf_t, 4, J, 0) \
    ACC_STORE_ROW (v4sf_t, 5, J, 1) \
    ACC_STORE_ROW (v4sf_t, 6, J, 2) \
    ACC_STORE_ROW (v4sf_t, 7, J, 3)

  /* Rows 0..3, one v2sf_t per row, taken from every second element of
     `result` (indices 0, 2, 4, 6).  */
  #define SAVE4x2_ACC(ACC, J) \
    __builtin_mma_disassemble_acc ((void *) result, ACC); \
    ACC_STORE_ROW (v2sf_t, 0, J, 0) \
    ACC_STORE_ROW (v2sf_t, 1, J, 2) \
    ACC_STORE_ROW (v2sf_t, 2, J, 4) \
    ACC_STORE_ROW (v2sf_t, 3, J, 6)

  /* Rows 4..7, one v2sf_t per row, taken from every second element of
     `result` (indices 0, 2, 4, 6).  */
  #define SAVE4x2_ACC1(ACC, J) \
    __builtin_mma_disassemble_acc ((void *) result, ACC); \
    ACC_STORE_ROW (v2sf_t, 4, J, 0) \
    ACC_STORE_ROW (v2sf_t, 5, J, 2) \
    ACC_STORE_ROW (v2sf_t, 6, J, 4) \
    ACC_STORE_ROW (v2sf_t, 7, J, 6)

  /* Rows 0..1 only, one v4sf_t per row, from result[0..1].  */
  #define SAVE2x4_ACC(ACC, J) \
    __builtin_mma_disassemble_acc ((void *) result, ACC); \
    ACC_STORE_ROW (v4sf_t, 0, J, 0) \
    ACC_STORE_ROW (v4sf_t, 1, J, 1)
  /*
   * KERNEL(i, j): issue eight __builtin_mma_xvf32gerpp updates, one per
   * accumulator acc0..acc7.  Even-numbered accumulators pair rowB[i] and
   * odd-numbered ones pair rowB[i+1] with rowA[j], rowA[j+1], rowA[j+2] and
   * rowA[j+3] in turn.
   *
   * Expects `rowA`, `rowB` and acc0..acc7 to be in scope at the expansion
   * site.  The expansion is a statement list ending in ';'.
   */
  #define KERNEL_GER_PAIR(ACC_EVEN, ACC_ODD, i, a) \
    __builtin_mma_xvf32gerpp (&ACC_EVEN, rowB[i], rowA[a]); \
    __builtin_mma_xvf32gerpp (&ACC_ODD, rowB[i+1], rowA[a]);
  #define KERNEL(i, j) \
    KERNEL_GER_PAIR (acc0, acc1, i, j) \
    KERNEL_GER_PAIR (acc2, acc3, i, j+1) \
    KERNEL_GER_PAIR (acc4, acc5, i, j+2) \
    KERNEL_GER_PAIR (acc6, acc7, i, j+3)
  /*
   * PREFETCH1(x, y): issue a `dcbt` data-cache-touch hint for the address
   * x + y.
   *
   * dcbt RA,RB forms its effective address as (RA|0) + (RB): if r0 is
   * encoded in the RA slot it is read as the constant zero, not as the
   * register's contents.  The operand placed in RA (x) therefore needs the
   * "b" (base register, r0 excluded) constraint, while the RB operand (y) may
   * live in any general register.
   *
   * The trailing ';' is kept so that existing uses written with or without
   * their own semicolon keep compiling.
   */
  #define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "b" (x), "r" (y) : "memory");
  /*
   * REFRESH_TEMP_BK(x, y): set `temp` from `off` (and `k`) for the current
   * LEFT/TRANSA configuration.
   *
   *   LEFT xor TRANSA     : temp = k - off
   *   LEFT (with TRANSA)  : temp = off + x
   *   otherwise           : temp = off + y
   *
   * Expects `temp`, `off` and `k` in scope.  Arguments are parenthesised so
   * that an expression argument such as `1 << 3` is added as a whole.
   * The expansion is a complete statement (it ends in ';').
   */
  #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  #define REFRESH_TEMP_BK(x, y) \
    temp = k - off;
  #elif defined(LEFT)
  #define REFRESH_TEMP_BK(x, y) \
    temp = off + (x);
  #else
  #define REFRESH_TEMP_BK(x, y) \
    temp = off + (y);
  #endif
  /*
   * REFRESH_POINTERS(x, y): position the panel pointers at the start of a
   * tile and then recompute `temp` through REFRESH_TEMP_BK(x, y).
   *
   *   (LEFT && TRANSA) or (!LEFT && !TRANSA): BO restarts at B.
   *   otherwise: AO is advanced by off * x and BO is set to B + off * y.
   *
   * Expects `AO`, `BO`, `B`, `off` (plus whatever REFRESH_TEMP_BK needs) in
   * scope.  x and y are parenthesised inside the products so an expression
   * argument is multiplied as a whole.  The expansion is a statement list
   * ending in ';'.
   */
  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  #define REFRESH_POINTERS(x, y) \
    BO = B; \
    REFRESH_TEMP_BK(x, y)
  #else
  #define REFRESH_POINTERS(x, y) \
    AO += off * (x); \
    BO = B + off * (y); \
    REFRESH_TEMP_BK(x, y)
  #endif
  /*
   * REFRESH_OFF(x): when LEFT is defined, add x to `off` (expects `off` in
   * scope; the expansion ends in ';').  Without LEFT it expands to nothing.
   */
  #if !defined(LEFT)
  #define REFRESH_OFF(x)
  #else
  #define REFRESH_OFF(x) off += x;
  #endif
  /*
   * UPDATE_TEMP(x, y): subtract one of the two arguments from `temp`:
   * x when LEFT is defined, y otherwise.  Expects `temp` in scope; the
   * expansion is a complete statement (it ends in ';').
   */
  #if !defined(LEFT)
  #define UPDATE_TEMP(x, y) temp -= y;
  #else
  #define UPDATE_TEMP(x, y) temp -= x;
  #endif
  /*
   * REFRESH_TMP_AFTER_SAVE(x, y): pointer fix-up performed after a tile has
   * been stored.
   *
   *   (LEFT && TRANSA) or (!LEFT && !TRANSA):
   *       temp = k - off, reduced by UPDATE_TEMP(x, y); then AO advances by
   *       temp * x and BO by temp * y.
   *   otherwise: expands to nothing.
   *
   * Expects `temp`, `k`, `off`, `AO` and `BO` in scope.  x and y are
   * parenthesised inside the products so an expression argument is
   * multiplied as a whole.  The expansion is a statement list ending in ';'.
   */
  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  #define REFRESH_TMP_AFTER_SAVE(x, y) \
    temp = k - off; \
    UPDATE_TEMP(x, y) \
    AO += temp * (x); \
    BO += temp * (y);
  #else
  #define REFRESH_TMP_AFTER_SAVE(x, y)
  #endif
  /*
   * REFRESH_AFTER_SAVE(x, y): bookkeeping run after a tile has been stored:
   * first REFRESH_TMP_AFTER_SAVE(x, y), then REFRESH_OFF(x).  Both pieces
   * supply their own terminating ';' (or expand to nothing), so callers may
   * invoke this macro without a trailing semicolon.
   */
  #define REFRESH_AFTER_SAVE(x,y) REFRESH_TMP_AFTER_SAVE(x, y) REFRESH_OFF(x)
  182. /*************************************************************************************
  183. * GEMM Kernel
  184. *************************************************************************************/
  185. int
  186. CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
  187. FLOAT * C, BLASLONG ldc
  188. #ifdef TRMMKERNEL
  189. , BLASLONG offset
  190. #endif
  191. )
  192. {
  193. BLASLONG N = n;
  194. BLASLONG i1;
  195. #if defined(TRMMKERNEL)
  196. BLASLONG off;
  197. #endif
  198. #if defined(TRMMKERNEL) && !defined(LEFT)
  199. off = -offset;
  200. #endif
  201. v4sf_t valpha = { alpha, alpha, alpha, alpha };
  202. N = n >> 3;
  203. for (i1 = 0; i1 < N; i1++)
  204. {
  205. BLASLONG i, j, temp;
  206. FLOAT *CO;
  207. FLOAT *AO;
  208. #if defined(TRMMKERNEL) && defined(LEFT)
  209. off = offset;
  210. #endif
  211. CO = C;
  212. C += ldc << 3;
  213. AO = A;
  214. PREFETCH1 (A, 128);
  215. PREFETCH1 (A, 256);
  216. i = m >> 4;
  217. for (j = 0; j < i; j++)
  218. {
  219. FLOAT *BO;
  220. #if defined(TRMMKERNEL)
  221. REFRESH_POINTERS (16, 8);
  222. #else
  223. BO = B;
  224. temp = k;
  225. #endif
  226. v4sf_t *rowC;
  227. v4sf_t result[4];
  228. __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
  229. BLASLONG l = 0;
  230. vec_t *rowA1 = (vec_t *) & AO[0];
  231. vec_t *rowB1 = (vec_t *) & BO[0];
  232. __builtin_mma_xvf32ger (&acc0, rowB1[0], rowA1[0]);
  233. __builtin_mma_xvf32ger (&acc1, rowB1[1], rowA1[0]);
  234. __builtin_mma_xvf32ger (&acc2, rowB1[0], rowA1[1]);
  235. __builtin_mma_xvf32ger (&acc3, rowB1[1], rowA1[1]);
  236. __builtin_mma_xvf32ger (&acc4, rowB1[0], rowA1[2]);
  237. __builtin_mma_xvf32ger (&acc5, rowB1[1], rowA1[2]);
  238. __builtin_mma_xvf32ger (&acc6, rowB1[0], rowA1[3]);
  239. __builtin_mma_xvf32ger (&acc7, rowB1[1], rowA1[3]);
  240. AO += 16;
  241. BO += 8;
  242. temp--;
  243. BLASLONG K = temp / 64;
  244. for (l = 0; l < K; l++)
  245. {
  246. vec_t *rowA = (vec_t *) & AO[0];
  247. vec_t *rowB = (vec_t *) & BO[0];
  248. KERNEL (0, 0);
  249. KERNEL (2, 4);
  250. KERNEL (4, 8);
  251. KERNEL (6, 12);
  252. KERNEL (8, 16);
  253. KERNEL (10, 20);
  254. KERNEL (12, 24);
  255. KERNEL (14, 28);
  256. KERNEL (16, 32);
  257. KERNEL (18, 36);
  258. KERNEL (20, 40);
  259. KERNEL (22, 44);
  260. KERNEL (24, 48);
  261. KERNEL (26, 52);
  262. KERNEL (28, 56);
  263. KERNEL (30, 60);
  264. KERNEL (32, 64);
  265. KERNEL (34, 68);
  266. KERNEL (36, 72);
  267. KERNEL (38, 76);
  268. KERNEL (40, 80);
  269. KERNEL (42, 84);
  270. KERNEL (44, 88);
  271. KERNEL (46, 92);
  272. KERNEL (48, 96);
  273. KERNEL (50, 100);
  274. KERNEL (52, 104);
  275. KERNEL (54, 108);
  276. KERNEL (56, 112);
  277. KERNEL (58, 116);
  278. KERNEL (60, 120);
  279. KERNEL (62, 124);
  280. KERNEL (64, 128);
  281. KERNEL (66, 132);
  282. KERNEL (68, 136);
  283. KERNEL (70, 140);
  284. KERNEL (72, 144);
  285. KERNEL (74, 148);
  286. KERNEL (76, 152);
  287. KERNEL (78, 156);
  288. KERNEL (80, 160);
  289. KERNEL (82, 164);
  290. KERNEL (84, 168);
  291. KERNEL (86, 172);
  292. KERNEL (88, 176);
  293. KERNEL (90, 180);
  294. KERNEL (92, 184);
  295. KERNEL (94, 188);
  296. KERNEL (96, 192);
  297. KERNEL (98, 196);
  298. KERNEL (100, 200);
  299. KERNEL (102, 204);
  300. KERNEL (104, 208);
  301. KERNEL (106, 212);
  302. KERNEL (108, 216);
  303. KERNEL (110, 220);
  304. KERNEL (112, 224);
  305. KERNEL (114, 228);
  306. KERNEL (116, 232);
  307. KERNEL (118, 236);
  308. KERNEL (120, 240);
  309. KERNEL (122, 244);
  310. KERNEL (124, 248);
  311. KERNEL (126, 252);
  312. AO += 1024;
  313. BO += 512;
  314. }
  315. if ((temp & 63) >> 5)
  316. {
  317. vec_t *rowA = (vec_t *) & AO[0];
  318. vec_t *rowB = (vec_t *) & BO[0];
  319. KERNEL (0, 0);
  320. KERNEL (2, 4);
  321. KERNEL (4, 8);
  322. KERNEL (6, 12);
  323. KERNEL (8, 16);
  324. KERNEL (10, 20);
  325. KERNEL (12, 24);
  326. KERNEL (14, 28);
  327. KERNEL (16, 32);
  328. KERNEL (18, 36);
  329. KERNEL (20, 40);
  330. KERNEL (22, 44);
  331. KERNEL (24, 48);
  332. KERNEL (26, 52);
  333. KERNEL (28, 56);
  334. KERNEL (30, 60);
  335. KERNEL (32, 64);
  336. KERNEL (34, 68);
  337. KERNEL (36, 72);
  338. KERNEL (38, 76);
  339. KERNEL (40, 80);
  340. KERNEL (42, 84);
  341. KERNEL (44, 88);
  342. KERNEL (46, 92);
  343. KERNEL (48, 96);
  344. KERNEL (50, 100);
  345. KERNEL (52, 104);
  346. KERNEL (54, 108);
  347. KERNEL (56, 112);
  348. KERNEL (58, 116);
  349. KERNEL (60, 120);
  350. KERNEL (62, 124);
  351. AO += 512;
  352. BO += 256;
  353. }
  354. if ((temp & 31) >> 4)
  355. {
  356. vec_t *rowA = (vec_t *) & AO[0];
  357. vec_t *rowB = (vec_t *) & BO[0];
  358. KERNEL (0, 0);
  359. KERNEL (2, 4);
  360. KERNEL (4, 8);
  361. KERNEL (6, 12);
  362. KERNEL (8, 16);
  363. KERNEL (10, 20);
  364. KERNEL (12, 24);
  365. KERNEL (14, 28);
  366. KERNEL (16, 32);
  367. KERNEL (18, 36);
  368. KERNEL (20, 40);
  369. KERNEL (22, 44);
  370. KERNEL (24, 48);
  371. KERNEL (26, 52);
  372. KERNEL (28, 56);
  373. KERNEL (30, 60);
  374. AO += 256;
  375. BO += 128;
  376. }
  377. if ((temp & 15) >> 3)
  378. {
  379. vec_t *rowA = (vec_t *) & AO[0];
  380. vec_t *rowB = (vec_t *) & BO[0];
  381. KERNEL (0, 0);
  382. KERNEL (2, 4);
  383. KERNEL (4, 8);
  384. KERNEL (6, 12);
  385. KERNEL (8, 16);
  386. KERNEL (10, 20);
  387. KERNEL (12, 24);
  388. KERNEL (14, 28);
  389. AO += 128;
  390. BO += 64;
  391. }
  392. if ((temp & 7) >> 2)
  393. {
  394. vec_t *rowA = (vec_t *) & AO[0];
  395. vec_t *rowB = (vec_t *) & BO[0];
  396. KERNEL (0, 0);
  397. KERNEL (2, 4);
  398. KERNEL (4, 8);
  399. KERNEL (6, 12);
  400. AO += 64;
  401. BO += 32;
  402. }
  403. if ((temp & 3) >> 1)
  404. {
  405. vec_t *rowA = (vec_t *) & AO[0];
  406. vec_t *rowB = (vec_t *) & BO[0];
  407. KERNEL (0, 0);
  408. KERNEL (2, 4);
  409. AO += 32;
  410. BO += 16;
  411. }
  412. if ((temp & 1) >> 0)
  413. {
  414. vec_t *rowA = (vec_t *) & AO[0];
  415. vec_t *rowB = (vec_t *) & BO[0];
  416. KERNEL (0, 0);
  417. AO += 16;
  418. BO += 8;
  419. }
  420. SAVE_ACC (&acc0, 0);
  421. SAVE_ACC (&acc2, 4);
  422. SAVE_ACC1 (&acc1, 0);
  423. SAVE_ACC1 (&acc3, 4);
  424. SAVE_ACC (&acc4, 8);
  425. SAVE_ACC (&acc6, 12);
  426. SAVE_ACC1 (&acc5, 8);
  427. SAVE_ACC1 (&acc7, 12);
  428. #if defined(TRMMKERNEL)
  429. REFRESH_AFTER_SAVE (16, 8)
  430. #endif
  431. CO += 16;
  432. }
  433. i = (m & 15) >> 3;
  434. for (j = 0; j < i; j++)
  435. {
  436. FLOAT *BO;
  437. #if defined(TRMMKERNEL)
  438. REFRESH_POINTERS (8, 8);
  439. #else
  440. BO = B;
  441. temp = k;
  442. #endif
  443. v4sf_t *rowC;
  444. v4sf_t result[4];
  445. __vector_quad acc0, acc1, acc2, acc3;
  446. BLASLONG l = 0;
  447. vec_t *rowA = (vec_t *) & AO[0];
  448. vec_t *rowB = (vec_t *) & BO[0];
  449. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  450. __builtin_mma_xvf32ger (&acc1, rowB[1], rowA[0]);
  451. __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[1]);
  452. __builtin_mma_xvf32ger (&acc3, rowB[1], rowA[1]);
  453. for (l = 1; l < temp; l++)
  454. {
  455. rowA = (vec_t *) & AO[l << 3];
  456. rowB = (vec_t *) & BO[l << 3];
  457. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  458. __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]);
  459. __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[1]);
  460. __builtin_mma_xvf32gerpp (&acc3, rowB[1], rowA[1]);
  461. }
  462. SAVE_ACC (&acc0, 0);
  463. SAVE_ACC (&acc2, 4);
  464. SAVE_ACC1 (&acc1, 0);
  465. SAVE_ACC1 (&acc3, 4);
  466. AO += (temp << 3);
  467. BO += (temp << 3);
  468. CO += 8;
  469. #if defined(TRMMKERNEL)
  470. REFRESH_AFTER_SAVE (8, 8)
  471. #endif
  472. }
  473. i = (m & 7) >> 2;
  474. for (j = 0; j < i; j++)
  475. {
  476. FLOAT *BO;
  477. #if defined(TRMMKERNEL)
  478. REFRESH_POINTERS (4, 8);
  479. #else
  480. BO = B;
  481. temp = k;
  482. #endif
  483. v4sf_t *rowC;
  484. v4sf_t result[4];
  485. __vector_quad acc0, acc1;
  486. BLASLONG l = 0;
  487. vec_t *rowA = (vec_t *) & AO[0];
  488. vec_t *rowB = (vec_t *) & BO[0];
  489. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  490. __builtin_mma_xvf32ger (&acc1, rowB[1], rowA[0]);
  491. for (l = 1; l < temp; l++)
  492. {
  493. rowA = (vec_t *) & AO[l << 2];
  494. rowB = (vec_t *) & BO[l << 3];
  495. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  496. __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]);
  497. }
  498. SAVE_ACC (&acc0, 0);
  499. SAVE_ACC1 (&acc1, 0);
  500. CO += 4;
  501. AO += (temp << 2);
  502. BO += (temp << 3);
  503. #if defined(TRMMKERNEL)
  504. REFRESH_AFTER_SAVE (4, 8)
  505. #endif
  506. }
  507. i = (m & 3) >> 1;
  508. for (j = 0; j < i; j++)
  509. {
  510. FLOAT *BO;
  511. #if defined(TRMMKERNEL)
  512. REFRESH_POINTERS (2, 8);
  513. #else
  514. BO = B;
  515. temp = k;
  516. #endif
  517. v2sf_t *rowC;
  518. v2sf_t result[8];
  519. __vector_quad acc0, acc1;
  520. BLASLONG l = 0;
  521. FLOAT t[4] = { 0 };
  522. t[0] = AO[0], t[1] = AO[1];
  523. vec_t *rowA = (vec_t *) & t[0];
  524. vec_t *rowB = (vec_t *) & BO[0];
  525. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  526. __builtin_mma_xvf32ger (&acc1, rowB[1], rowA[0]);
  527. for (l = 1; l < temp; l++)
  528. {
  529. t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1];
  530. rowA = (vec_t *) & t[0];
  531. rowB = (vec_t *) & BO[l << 3];
  532. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  533. __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]);
  534. }
  535. SAVE4x2_ACC (&acc0, 0);
  536. SAVE4x2_ACC1 (&acc1, 0);
  537. CO += 2;
  538. AO += (temp << 1);
  539. BO += (temp << 3);
  540. #if defined(TRMMKERNEL)
  541. REFRESH_AFTER_SAVE (2, 8)
  542. #endif
  543. }
  544. i = (m & 1) >> 0;
  545. for (j = 0; j < i; j++)
  546. {
  547. FLOAT *BO;
  548. #if defined(TRMMKERNEL)
  549. REFRESH_POINTERS (1, 8);
  550. #else
  551. BO = B;
  552. temp = k;
  553. #endif
  554. BLASLONG l = 0;
  555. v4sf_t t = { 0, 0, 0, 0 };
  556. v4sf_t t1 = { 0, 0, 0, 0 };
  557. for (l = 0; l < temp; l++)
  558. {
  559. v4sf_t rowA = { AO[l], AO[l], AO[l], AO[l] };
  560. v4sf_t rowB = { BO[l << 3], BO[(l << 3) + 1], BO[(l << 3) + 2],
  561. BO[(l << 3) + 3]
  562. };
  563. v4sf_t rowB1 =
  564. { BO[(l << 3) + 4], BO[(l << 3) + 5], BO[(l << 3) + 6],
  565. BO[(l << 3) + 7]
  566. };
  567. t += rowA * rowB;
  568. t1 += rowA * rowB1;
  569. }
  570. t = t * valpha;
  571. t1 = t1 * valpha;
  572. #if defined(TRMMKERNEL)
  573. CO[0 * ldc] = t[0];
  574. CO[1 * ldc] = t[1];
  575. CO[2 * ldc] = t[2];
  576. CO[3 * ldc] = t[3];
  577. CO[4 * ldc] = t1[0];
  578. CO[5 * ldc] = t1[1];
  579. CO[6 * ldc] = t1[2];
  580. CO[7 * ldc] = t1[3];
  581. #else
  582. CO[0 * ldc] += t[0];
  583. CO[1 * ldc] += t[1];
  584. CO[2 * ldc] += t[2];
  585. CO[3 * ldc] += t[3];
  586. CO[4 * ldc] += t1[0];
  587. CO[5 * ldc] += t1[1];
  588. CO[6 * ldc] += t1[2];
  589. CO[7 * ldc] += t1[3];
  590. #endif
  591. CO += 1;
  592. AO += temp;
  593. BO += (temp << 3);
  594. #if defined(TRMMKERNEL)
  595. REFRESH_AFTER_SAVE (1, 8)
  596. #endif
  597. }
  598. #if defined(TRMMKERNEL) && !defined(LEFT)
  599. off += 8; // number of values in A
  600. #endif
  601. B += k << 3;
  602. }
  603. N = (n & 7) >> 2;
  604. for (i1 = 0; i1 < N; i1++)
  605. {
  606. BLASLONG i, j, temp;
  607. #if defined(TRMMKERNEL) && defined(LEFT)
  608. off = offset;
  609. #endif
  610. FLOAT *CO;
  611. FLOAT *AO;
  612. CO = C;
  613. C += ldc << 2;
  614. AO = A;
  615. #if !defined(TRMMKERNEL)
  616. i = m >> 5;
  617. for (j = 0; j < i; j++)
  618. {
  619. FLOAT *BO = B;
  620. v4sf_t *rowC;
  621. v4sf_t result[4];
  622. FLOAT *A1;
  623. A1 = AO + (16 * k);
  624. __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
  625. BLASLONG l = 0;
  626. vec_t *rowA = (vec_t *) & AO[0];
  627. vec_t *rowA1 = (vec_t *) & A1[0];
  628. vec_t *rowB = (vec_t *) & BO[0];
  629. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  630. __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]);
  631. __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]);
  632. __builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]);
  633. __builtin_mma_xvf32ger (&acc4, rowB[0], rowA1[0]);
  634. __builtin_mma_xvf32ger (&acc5, rowB[0], rowA1[1]);
  635. __builtin_mma_xvf32ger (&acc6, rowB[0], rowA1[2]);
  636. __builtin_mma_xvf32ger (&acc7, rowB[0], rowA1[3]);
  637. for (l = 1; l < k; l++)
  638. {
  639. rowA = (vec_t *) & AO[l << 4];
  640. rowA1 = (vec_t *) & A1[l << 4];
  641. rowB = (vec_t *) & BO[l << 2];
  642. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  643. __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
  644. __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
  645. __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]);
  646. __builtin_mma_xvf32gerpp (&acc4, rowB[0], rowA1[0]);
  647. __builtin_mma_xvf32gerpp (&acc5, rowB[0], rowA1[1]);
  648. __builtin_mma_xvf32gerpp (&acc6, rowB[0], rowA1[2]);
  649. __builtin_mma_xvf32gerpp (&acc7, rowB[0], rowA1[3]);
  650. }
  651. SAVE_ACC (&acc0, 0);
  652. SAVE_ACC (&acc1, 4);
  653. CO += 8;
  654. SAVE_ACC (&acc2, 0);
  655. SAVE_ACC (&acc3, 4);
  656. CO += 8;
  657. SAVE_ACC (&acc4, 0);
  658. SAVE_ACC (&acc5, 4);
  659. CO += 8;
  660. SAVE_ACC (&acc6, 0);
  661. SAVE_ACC (&acc7, 4);
  662. CO += 8;
  663. AO += k << 5;
  664. BO += k << 2;
  665. }
  666. i = (m & 31) >> 4;
  667. #else
  668. i = m >> 4;
  669. #endif
  670. for (j = 0; j < i; j++)
  671. {
  672. FLOAT *BO;
  673. #if defined(TRMMKERNEL)
  674. REFRESH_POINTERS (16, 4);
  675. #else
  676. BO = B;
  677. temp = k;
  678. #endif
  679. v4sf_t *rowC;
  680. v4sf_t result[4];
  681. __vector_quad acc0, acc1, acc2, acc3;
  682. BLASLONG l = 0;
  683. vec_t *rowA = (vec_t *) & AO[0];
  684. vec_t *rowB = (vec_t *) & BO[0];
  685. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  686. __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]);
  687. __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]);
  688. __builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]);
  689. for (l = 1; l < temp; l++)
  690. {
  691. rowA = (vec_t *) & AO[l << 4];
  692. rowB = (vec_t *) & BO[l << 2];
  693. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  694. __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
  695. __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
  696. __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]);
  697. }
  698. SAVE_ACC (&acc0, 0);
  699. SAVE_ACC (&acc1, 4);
  700. CO += 8;
  701. SAVE_ACC (&acc2, 0);
  702. SAVE_ACC (&acc3, 4);
  703. CO += 8;
  704. AO += temp << 4;
  705. BO += temp << 2;
  706. #if defined(TRMMKERNEL)
  707. REFRESH_AFTER_SAVE (16, 4)
  708. #endif
  709. }
  710. i = (m & 15) >> 3;
  711. for (j = 0; j < i; j++)
  712. {
  713. FLOAT *BO;
  714. #if defined(TRMMKERNEL)
  715. REFRESH_POINTERS (8, 4);
  716. #else
  717. BO = B;
  718. temp = k;
  719. #endif
  720. v4sf_t *rowC;
  721. v4sf_t result[4];
  722. __vector_quad acc0, acc1;
  723. BLASLONG l = 0;
  724. vec_t *rowA = (vec_t *) & AO[0];
  725. vec_t *rowB = (vec_t *) & BO[0];
  726. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  727. __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]);
  728. for (l = 1; l < temp; l++)
  729. {
  730. rowA = (vec_t *) & AO[l << 3];
  731. rowB = (vec_t *) & BO[l << 2];
  732. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  733. __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
  734. }
  735. SAVE_ACC (&acc0, 0);
  736. SAVE_ACC (&acc1, 4);
  737. CO += 8;
  738. AO += temp << 3;
  739. BO += temp << 2;
  740. #if defined(TRMMKERNEL)
  741. REFRESH_AFTER_SAVE (8, 4)
  742. #endif
  743. }
  744. i = (m & 7) >> 2;
  745. for (j = 0; j < i; j++)
  746. {
  747. FLOAT *BO;
  748. #if defined(TRMMKERNEL)
  749. REFRESH_POINTERS (4, 4);
  750. #else
  751. BO = B;
  752. temp = k;
  753. #endif
  754. v4sf_t *rowC;
  755. __vector_quad acc0;
  756. v4sf_t result[4];
  757. BLASLONG l = 0;
  758. vec_t *rowA = (vec_t *) & AO[0];
  759. vec_t *rowB = (vec_t *) & BO[0];
  760. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  761. for (l = 1; l < temp; l++)
  762. {
  763. rowA = (vec_t *) & AO[l << 2];
  764. rowB = (vec_t *) & BO[l << 2];
  765. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  766. }
  767. SAVE_ACC (&acc0, 0);
  768. CO += 4;
  769. AO += temp << 2;
  770. BO += temp << 2;
  771. #if defined(TRMMKERNEL)
  772. REFRESH_AFTER_SAVE (4, 4)
  773. #endif
  774. }
  775. i = (m & 3) >> 1;
  776. for (j = 0; j < i; j++)
  777. {
  778. FLOAT *BO;
  779. #if defined(TRMMKERNEL)
  780. REFRESH_POINTERS (2, 4);
  781. #else
  782. BO = B;
  783. temp = k;
  784. #endif
  785. v2sf_t *rowC;
  786. v2sf_t result[8];
  787. __vector_quad acc0;
  788. BLASLONG l = 0;
  789. FLOAT t[4] = { 0 };
  790. t[0] = AO[0], t[1] = AO[1];
  791. vec_t *rowA = (vec_t *) & t[0];
  792. vec_t *rowB = (vec_t *) & BO[0];
  793. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  794. for (l = 1; l < temp; l++)
  795. {
  796. t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1];
  797. rowA = (vec_t *) & t[0];
  798. rowB = (vec_t *) & BO[l << 2];
  799. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  800. }
  801. SAVE4x2_ACC (&acc0, 0);
  802. CO += 2;
  803. AO += temp << 1;
  804. BO += temp << 2;
  805. #if defined(TRMMKERNEL)
  806. REFRESH_AFTER_SAVE (2, 4)
  807. #endif
  808. }
  809. i = (m & 1) >> 0;
  810. for (j = 0; j < i; j++)
  811. {
  812. FLOAT *BO;
  813. #if defined(TRMMKERNEL)
  814. REFRESH_POINTERS (1, 4)
  815. #else
  816. BO = B;
  817. temp = k;
  818. #endif
  819. BLASLONG l = 0;
  820. v4sf_t t = { 0, 0, 0, 0 };
  821. for (l = 0; l < temp; l++)
  822. {
  823. v4sf_t rowA = { AO[l], AO[l], AO[l], AO[l] };
  824. v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1], BO[(l << 2) + 2],
  825. BO[(l << 2) + 3]
  826. };
  827. t += rowA * rowB;
  828. }
  829. t = t * valpha;
  830. #if defined(TRMMKERNEL)
  831. CO[0 * ldc] = t[0];
  832. CO[1 * ldc] = t[1];
  833. CO[2 * ldc] = t[2];
  834. CO[3 * ldc] = t[3];
  835. #else
  836. CO[0 * ldc] += t[0];
  837. CO[1 * ldc] += t[1];
  838. CO[2 * ldc] += t[2];
  839. CO[3 * ldc] += t[3];
  840. #endif
  841. CO += 1;
  842. AO += temp;
  843. BO += temp << 2;
  844. #if defined(TRMMKERNEL)
  845. REFRESH_AFTER_SAVE (1, 4)
  846. #endif
  847. }
  848. #if defined(TRMMKERNEL) && !defined(LEFT)
  849. off += 4; // number of values in A
  850. #endif
  851. B += k << 2;
  852. }
  853. N = (n & 3) >> 1;
  854. for (i1 = 0; i1 < N; i1++)
  855. {
  856. BLASLONG i, j, temp;
  857. #if defined(TRMMKERNEL) && defined(LEFT)
  858. off = offset;
  859. #endif
  860. FLOAT *CO;
  861. FLOAT *AO;
  862. CO = C;
  863. C += ldc << 1;
  864. AO = A;
  865. #if !defined(TRMMKERNEL)
  866. i = m >> 5;
  867. for (j = 0; j < i; j++)
  868. {
  869. FLOAT *BO = B;
  870. v4sf_t *rowC;
  871. v4sf_t result[4];
  872. FLOAT *A1;
  873. A1 = AO + (16 * k);
  874. __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
  875. BLASLONG l = 0;
  876. FLOAT t[4] = { 0 };
  877. t[0] = BO[0], t[1] = BO[1];
  878. vec_t *rowB = (vec_t *) & t[0];
  879. vec_t *rowA = (vec_t *) & AO[0];
  880. vec_t *rowA1 = (vec_t *) & A1[0];
  881. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  882. __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]);
  883. __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]);
  884. __builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]);
  885. __builtin_mma_xvf32ger (&acc4, rowB[0], rowA1[0]);
  886. __builtin_mma_xvf32ger (&acc5, rowB[0], rowA1[1]);
  887. __builtin_mma_xvf32ger (&acc6, rowB[0], rowA1[2]);
  888. __builtin_mma_xvf32ger (&acc7, rowB[0], rowA1[3]);
  889. for (l = 1; l < k; l++)
  890. {
  891. t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
  892. rowB = (vec_t *) & t[0];
  893. rowA = (vec_t *) & AO[l << 4];
  894. rowA1 = (vec_t *) & A1[l << 4];
  895. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  896. __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
  897. __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
  898. __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]);
  899. __builtin_mma_xvf32gerpp (&acc4, rowB[0], rowA1[0]);
  900. __builtin_mma_xvf32gerpp (&acc5, rowB[0], rowA1[1]);
  901. __builtin_mma_xvf32gerpp (&acc6, rowB[0], rowA1[2]);
  902. __builtin_mma_xvf32gerpp (&acc7, rowB[0], rowA1[3]);
  903. }
  904. SAVE2x4_ACC (&acc0, 0);
  905. SAVE2x4_ACC (&acc1, 4);
  906. SAVE2x4_ACC (&acc2, 8);
  907. SAVE2x4_ACC (&acc3, 12);
  908. CO += 16;
  909. SAVE2x4_ACC (&acc4, 0);
  910. SAVE2x4_ACC (&acc5, 4);
  911. SAVE2x4_ACC (&acc6, 8);
  912. SAVE2x4_ACC (&acc7, 12);
  913. CO += 16;
  914. AO += k << 5;
  915. BO += k << 1;
  916. }
  917. i = (m & 31) >> 4;
  918. #else
  919. i = m >> 4;
  920. #endif
  921. for (j = 0; j < i; j++)
  922. {
  923. FLOAT *BO;
  924. v4sf_t *rowC;
  925. v4sf_t result[4];
  926. __vector_quad acc0, acc1, acc2, acc3;
  927. BLASLONG l = 0;
  928. #if defined(TRMMKERNEL)
  929. REFRESH_POINTERS (16, 2)
  930. #else
  931. BO = B;
  932. temp = k;
  933. #endif
  934. FLOAT t[4] = { 0 };
  935. t[0] = BO[0], t[1] = BO[1];
  936. vec_t *rowB = (vec_t *) & t[0];
  937. vec_t *rowA = (vec_t *) & AO[0];
  938. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  939. __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]);
  940. __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]);
  941. __builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]);
  942. for (l = 1; l < temp; l++)
  943. {
  944. t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
  945. rowB = (vec_t *) & t[0];
  946. rowA = (vec_t *) & AO[l << 4];
  947. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  948. __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
  949. __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
  950. __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]);
  951. }
  952. SAVE2x4_ACC (&acc0, 0);
  953. SAVE2x4_ACC (&acc1, 4);
  954. SAVE2x4_ACC (&acc2, 8);
  955. SAVE2x4_ACC (&acc3, 12);
  956. CO += 16;
  957. AO += temp << 4;
  958. BO += temp << 1;
  959. #if defined(TRMMKERNEL)
  960. REFRESH_AFTER_SAVE (16, 2)
  961. #endif
  962. }
  963. i = (m & 15) >> 3;
  964. for (j = 0; j < i; j++)
  965. {
  966. FLOAT *BO;
  967. v4sf_t *rowC;
  968. v4sf_t result[4];
  969. __vector_quad acc0, acc1;
  970. #if defined(TRMMKERNEL)
  971. REFRESH_POINTERS (8, 2)
  972. #else
  973. BO = B;
  974. temp = k;
  975. #endif
  976. BLASLONG l = 0;
  977. FLOAT t[4] = { 0 };
  978. t[0] = BO[0], t[1] = BO[1];
  979. vec_t *rowB = (vec_t *) & t[0];
  980. vec_t *rowA = (vec_t *) & AO[0];
  981. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  982. __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]);
  983. for (l = 1; l < temp; l++)
  984. {
  985. t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
  986. rowB = (vec_t *) & t[0];
  987. rowA = (vec_t *) & AO[l << 3];
  988. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  989. __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
  990. }
  991. SAVE2x4_ACC (&acc0, 0);
  992. SAVE2x4_ACC (&acc1, 4);
  993. CO += 8;
  994. AO += temp << 3;
  995. BO += temp << 1;
  996. #if defined(TRMMKERNEL)
  997. REFRESH_AFTER_SAVE (8, 2)
  998. #endif
  999. }
  1000. i = (m & 7) >> 2;
  1001. for (j = 0; j < i; j++)
  1002. {
  1003. FLOAT *BO;
  1004. v4sf_t *rowC;
  1005. v4sf_t result[4];
  1006. __vector_quad acc0;
  1007. #if defined(TRMMKERNEL)
  1008. REFRESH_POINTERS (4, 2)
  1009. #else
  1010. BO = B;
  1011. temp = k;
  1012. #endif
  1013. BLASLONG l = 0;
  1014. FLOAT t[4] = { 0 };
  1015. t[0] = BO[0], t[1] = BO[1];
  1016. vec_t *rowB = (vec_t *) & t[0];
  1017. vec_t *rowA = (vec_t *) & AO[0];
  1018. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  1019. for (l = 1; l < temp; l++)
  1020. {
  1021. t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
  1022. rowB = (vec_t *) & t[0];
  1023. rowA = (vec_t *) & AO[l << 2];
  1024. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  1025. }
  1026. SAVE2x4_ACC (&acc0, 0);
  1027. CO += 4;
  1028. AO += temp << 2;
  1029. BO += temp << 1;
  1030. #if defined(TRMMKERNEL)
  1031. REFRESH_AFTER_SAVE (4, 2)
  1032. #endif
  1033. }
  1034. i = (m & 3) >> 1;
  1035. for (j = 0; j < i; j++)
  1036. {
  1037. FLOAT *BO;
  1038. BLASLONG l = 0;
  1039. #if defined(TRMMKERNEL)
  1040. REFRESH_POINTERS (2, 2)
  1041. #else
  1042. BO = B;
  1043. temp = k;
  1044. #endif
  1045. v4sf_t t = { 0, 0, 0, 0 };
  1046. for (l = 0; l < (temp << 1); l += 2)
  1047. {
  1048. v4sf_t rowA = { AO[l], AO[l], AO[l + 1], AO[l + 1] };
  1049. v4sf_t rowB = { BO[l], BO[l + 1], BO[l], BO[l + 1] };
  1050. t += rowA * rowB;
  1051. }
  1052. t = t * valpha;
  1053. #if defined(TRMMKERNEL)
  1054. CO[0 * ldc] = t[0];
  1055. CO[1 * ldc] = t[1];
  1056. CO[0 * ldc + 1] = t[2];
  1057. CO[1 * ldc + 1] = t[3];
  1058. #else
  1059. CO[0 * ldc] += t[0];
  1060. CO[1 * ldc] += t[1];
  1061. CO[0 * ldc + 1] += t[2];
  1062. CO[1 * ldc + 1] += t[3];
  1063. #endif
  1064. CO += 2;
  1065. AO += temp << 1;
  1066. BO += temp << 1;
  1067. #if defined(TRMMKERNEL)
  1068. REFRESH_AFTER_SAVE (2, 2)
  1069. #endif
  1070. }
  1071. i = (m & 1) >> 0;
  1072. for (j = 0; j < i; j++)
  1073. {
  1074. FLOAT *BO;
  1075. BLASLONG l = 0;
  1076. #if defined(TRMMKERNEL)
  1077. REFRESH_POINTERS (1, 2)
  1078. #else
  1079. BO = B;
  1080. temp = k;
  1081. #endif
  1082. v4sf_t t = { 0, 0, 0, 0 };
  1083. for (l = 0; l < temp; l++)
  1084. {
  1085. v4sf_t rowA = { AO[l], AO[l], 0, 0 };
  1086. v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1], 0, 0 };
  1087. t += rowA * rowB;
  1088. }
  1089. t = t * valpha;
  1090. #if defined(TRMMKERNEL)
  1091. CO[0 * ldc] = t[0];
  1092. CO[1 * ldc] = t[1];
  1093. #else
  1094. CO[0 * ldc] += t[0];
  1095. CO[1 * ldc] += t[1];
  1096. #endif
  1097. CO += 1;
  1098. AO += temp;
  1099. BO += temp << 1;
  1100. #if defined(TRMMKERNEL)
  1101. REFRESH_AFTER_SAVE (1, 2)
  1102. #endif
  1103. }
  1104. #if defined(TRMMKERNEL) && !defined(LEFT)
  1105. off += 2; // number of values in A
  1106. #endif
  1107. B += k << 1;
  1108. }
  1109. N = (n & 1) >> 0;
  1110. for (i1 = 0; i1 < N; i1++)
  1111. {
  1112. BLASLONG i, temp;
  1113. #if defined(TRMMKERNEL) && defined(LEFT)
  1114. off = offset;
  1115. #endif
  1116. FLOAT *CO;
  1117. FLOAT *AO;
  1118. CO = C;
  1119. C += ldc;
  1120. AO = A;
  1121. i = m;
  1122. while (i >= 16)
  1123. {
  1124. FLOAT *BO;
  1125. BLASLONG l = 0;
  1126. #if defined(TRMMKERNEL)
  1127. REFRESH_POINTERS (16, 1)
  1128. #else
  1129. BO = B;
  1130. temp = k;
  1131. #endif
  1132. v4sf_t t = { 0, 0, 0, 0 };
  1133. v4sf_t t1 = { 0, 0, 0, 0 };
  1134. v4sf_t t2 = { 0, 0, 0, 0 };
  1135. v4sf_t t3 = { 0, 0, 0, 0 };
  1136. for (l = 0; l < temp; l++)
  1137. {
  1138. v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] };
  1139. v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1], AO[(l << 4) + 2],
  1140. AO[(l << 4) + 3]
  1141. };
  1142. v4sf_t rowA1 =
  1143. { AO[(l << 4) + 4], AO[(l << 4) + 5], AO[(l << 4) + 6],
  1144. AO[(l << 4) + 7]
  1145. };
  1146. v4sf_t rowA2 =
  1147. { AO[(l << 4) + 8], AO[(l << 4) + 9], AO[(l << 4) + 10],
  1148. AO[(l << 4) + 11]
  1149. };
  1150. v4sf_t rowA3 =
  1151. { AO[(l << 4) + 12], AO[(l << 4) + 13], AO[(l << 4) + 14],
  1152. AO[(l << 4) + 15]
  1153. };
  1154. t += rowA * rowB;
  1155. t1 += rowA1 * rowB;
  1156. t2 += rowA2 * rowB;
  1157. t3 += rowA3 * rowB;
  1158. }
  1159. t = t * valpha;
  1160. t1 = t1 * valpha;
  1161. t2 = t2 * valpha;
  1162. t3 = t3 * valpha;
  1163. #if defined(TRMMKERNEL)
  1164. CO[0] = t[0];
  1165. CO[1] = t[1];
  1166. CO[2] = t[2];
  1167. CO[3] = t[3];
  1168. CO[4] = t1[0];
  1169. CO[5] = t1[1];
  1170. CO[6] = t1[2];
  1171. CO[7] = t1[3];
  1172. CO[8] = t2[0];
  1173. CO[9] = t2[1];
  1174. CO[10] = t2[2];
  1175. CO[11] = t2[3];
  1176. CO[12] = t3[0];
  1177. CO[13] = t3[1];
  1178. CO[14] = t3[2];
  1179. CO[15] = t3[3];
  1180. #else
  1181. CO[0] += t[0];
  1182. CO[1] += t[1];
  1183. CO[2] += t[2];
  1184. CO[3] += t[3];
  1185. CO[4] += t1[0];
  1186. CO[5] += t1[1];
  1187. CO[6] += t1[2];
  1188. CO[7] += t1[3];
  1189. CO[8] += t2[0];
  1190. CO[9] += t2[1];
  1191. CO[10] += t2[2];
  1192. CO[11] += t2[3];
  1193. CO[12] += t3[0];
  1194. CO[13] += t3[1];
  1195. CO[14] += t3[2];
  1196. CO[15] += t3[3];
  1197. #endif
  1198. AO += temp << 4;
  1199. BO += temp;
  1200. CO += 16;
  1201. i -= 16;
  1202. #if defined(TRMMKERNEL)
  1203. REFRESH_AFTER_SAVE (16, 1)
  1204. #endif
  1205. }
  1206. while (i >= 8)
  1207. {
  1208. FLOAT *BO;
  1209. BLASLONG l = 0;
  1210. v4sf_t t = { 0, 0, 0, 0 };
  1211. v4sf_t t1 = { 0, 0, 0, 0 };
  1212. #if defined(TRMMKERNEL)
  1213. REFRESH_POINTERS (8, 1)
  1214. #else
  1215. BO = B;
  1216. temp = k;
  1217. #endif
  1218. for (l = 0; l < temp; l++)
  1219. {
  1220. v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] };
  1221. v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1], AO[(l << 3) + 2],
  1222. AO[(l << 3) + 3]
  1223. };
  1224. v4sf_t rowA1 =
  1225. { AO[(l << 3) + 4], AO[(l << 3) + 5], AO[(l << 3) + 6],
  1226. AO[(l << 3) + 7]
  1227. };
  1228. t += rowA * rowB;
  1229. t1 += rowA1 * rowB;
  1230. }
  1231. t = t * valpha;
  1232. t1 = t1 * valpha;
  1233. #if defined(TRMMKERNEL)
  1234. CO[0] = t[0];
  1235. CO[1] = t[1];
  1236. CO[2] = t[2];
  1237. CO[3] = t[3];
  1238. CO[4] = t1[0];
  1239. CO[5] = t1[1];
  1240. CO[6] = t1[2];
  1241. CO[7] = t1[3];
  1242. #else
  1243. CO[0] += t[0];
  1244. CO[1] += t[1];
  1245. CO[2] += t[2];
  1246. CO[3] += t[3];
  1247. CO[4] += t1[0];
  1248. CO[5] += t1[1];
  1249. CO[6] += t1[2];
  1250. CO[7] += t1[3];
  1251. #endif
  1252. AO += temp << 3;
  1253. BO += temp;
  1254. CO += 8;
  1255. i -= 8;
  1256. #if defined(TRMMKERNEL)
  1257. REFRESH_AFTER_SAVE (8, 1)
  1258. #endif
  1259. }
  1260. while (i >= 4)
  1261. {
  1262. FLOAT *BO;
  1263. BLASLONG l = 0;
  1264. v4sf_t t = { 0, 0, 0, 0 };
  1265. #if defined(TRMMKERNEL)
  1266. REFRESH_POINTERS (4, 1)
  1267. #else
  1268. BO = B;
  1269. temp = k;
  1270. #endif
  1271. for (l = 0; l < temp; l++)
  1272. {
  1273. v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] };
  1274. v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1], AO[(l << 2) + 2],
  1275. AO[(l << 2) + 3]
  1276. };
  1277. t += rowA * rowB;
  1278. }
  1279. t = t * valpha;
  1280. #if defined(TRMMKERNEL)
  1281. CO[0] = t[0];
  1282. CO[1] = t[1];
  1283. CO[2] = t[2];
  1284. CO[3] = t[3];
  1285. #else
  1286. CO[0] += t[0];
  1287. CO[1] += t[1];
  1288. CO[2] += t[2];
  1289. CO[3] += t[3];
  1290. #endif
  1291. AO += temp << 2;
  1292. BO += temp;
  1293. CO += 4;
  1294. i -= 4;
  1295. #if defined(TRMMKERNEL)
  1296. REFRESH_AFTER_SAVE (4, 1)
  1297. #endif
  1298. }
  1299. while (i >= 2)
  1300. {
  1301. FLOAT *BO;
  1302. BLASLONG l = 0;
  1303. #if defined(TRMMKERNEL)
  1304. REFRESH_POINTERS (2, 1)
  1305. #else
  1306. BO = B;
  1307. temp = k;
  1308. #endif
  1309. v4sf_t t = { 0, 0, 0, 0 };
  1310. for (l = 0; l < temp; l++)
  1311. {
  1312. v4sf_t rowB = { BO[l], BO[l], 0, 0 };
  1313. v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1], 0, 0 };
  1314. t += rowA * rowB;
  1315. }
  1316. t = t * valpha;
  1317. #if defined(TRMMKERNEL)
  1318. CO[0] = t[0];
  1319. CO[1] = t[1];
  1320. #else
  1321. CO[0] += t[0];
  1322. CO[1] += t[1];
  1323. #endif
  1324. AO += temp << 1;
  1325. BO += temp;
  1326. CO += 2;
  1327. i -= 2;
  1328. #if defined(TRMMKERNEL)
  1329. REFRESH_AFTER_SAVE (2, 1)
  1330. #endif
  1331. }
  1332. while (i >= 1)
  1333. {
  1334. FLOAT *BO;
  1335. #if defined(TRMMKERNEL)
  1336. REFRESH_POINTERS (1, 1)
  1337. #else
  1338. BO = B;
  1339. temp = k;
  1340. #endif
  1341. BLASLONG l = 0;
  1342. FLOAT t = 0;
  1343. for (l = 0; l < temp; l++)
  1344. {
  1345. t += AO[l] * BO[l];
  1346. }
  1347. AO += temp;
  1348. BO += temp;
  1349. #if defined(TRMMKERNEL)
  1350. CO[0] = t * alpha;
  1351. #else
  1352. CO[0] += t * alpha;
  1353. #endif
  1354. CO += 1;
  1355. i -= 1;
  1356. #if defined(TRMMKERNEL)
  1357. REFRESH_AFTER_SAVE (1, 1)
  1358. #endif
  1359. }
  1360. #if defined(TRMMKERNEL) && !defined(LEFT)
  1361. off += 1; // number of values in A
  1362. #endif
  1363. B += k;
  1364. }
  1365. return 0;
  1366. }