You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

dgemm_kernel_power10.c 24 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878
  1. /*********************************************************************************
  2. Copyright (c) 2020, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. **********************************************************************************/
  27. #include "common.h"
  28. #include <altivec.h>
  29. typedef __vector unsigned char vec_t;
  30. typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
  31. #if !__has_builtin(__builtin_vsx_assemble_pair)
  32. #define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
  33. #endif
  34. #if !__has_builtin(__builtin_vsx_disassemble_pair)
  35. #define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair
  36. #endif
  /*
   * Accumulator write-back macros.
   *
   * All of them expand in a scope that must provide:
   *   result - v4sf_t[4] scratch that receives the disassembled accumulator,
   *   rowC   - v4sf_t * scratch pointer,
   *   CO     - FLOAT * to the current tile of C,
   *   ldc    - stride between consecutive lines of C,
   *   alpha  - scalar applied to every value before it is stored.
   *
   * With TRMMKERNEL defined the scaled value overwrites C; otherwise it is
   * added to what C already holds.
   *
   * Every macro is a single do { } while (0) statement, so it must be
   * followed by ';' and is safe under an unbraced if/else.  Macro
   * parameters are parenthesised so expression arguments keep their meaning.
   */

  /* Store one 16-byte vector VEC, scaled by alpha, at CO[ROW * ldc + J]. */
  #ifdef TRMMKERNEL
  #define STORE_ACC_ROW(ROW, J, VEC) \
    do { \
      rowC = (v4sf_t *) &CO[(ROW) * ldc + (J)]; \
      rowC[0] = (VEC) * alpha; \
    } while (0)
  #else
  #define STORE_ACC_ROW(ROW, J, VEC) \
    do { \
      rowC = (v4sf_t *) &CO[(ROW) * ldc + (J)]; \
      rowC[0] += (VEC) * alpha; \
    } while (0)
  #endif

  /* Write accumulator ACC to lines 0..3 of C at offset J. */
  #define SAVE_ACC(ACC, J) \
    do { \
      __builtin_mma_disassemble_acc ((void *) result, (ACC)); \
      STORE_ACC_ROW (0, J, result[0]); \
      STORE_ACC_ROW (1, J, result[1]); \
      STORE_ACC_ROW (2, J, result[2]); \
      STORE_ACC_ROW (3, J, result[3]); \
    } while (0)

  /* Write accumulator ACC to lines 4..7 of C at offset J. */
  #define SAVE_ACC1(ACC, J) \
    do { \
      __builtin_mma_disassemble_acc ((void *) result, (ACC)); \
      STORE_ACC_ROW (4, J, result[0]); \
      STORE_ACC_ROW (5, J, result[1]); \
      STORE_ACC_ROW (6, J, result[2]); \
      STORE_ACC_ROW (7, J, result[3]); \
    } while (0)

  /* Write only the first two rows of accumulator ACC to lines 0..1 of C at
     offset J (used where only two lines of C are being produced). */
  #define SAVE2x4_ACC(ACC, J) \
    do { \
      __builtin_mma_disassemble_acc ((void *) result, (ACC)); \
      STORE_ACC_ROW (0, J, result[0]); \
      STORE_ACC_ROW (1, J, result[1]); \
    } while (0)
  /*
   * Issue a data-cache-block-touch (dcbt) hint for the address x + y.
   *
   * dcbt RA,RB forms its effective address as (RA|0) + RB: an RA field of 0
   * means the constant zero, not the contents of r0.  The first operand must
   * therefore use the "b" (base register, never r0) constraint; the second
   * operand may live in any general register.
   *
   * The expansion keeps its trailing ';' so existing call sites are
   * unaffected.
   */
  #define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "b" (x), "r" (y) : "memory");
  /*
   * TRMM bookkeeping, part 1: set up the per-tile pointers and trip count.
   *
   * These macros expand to plain statements (terminated by ';') and rely on
   * the following names being in scope at the call site: temp, off, k, AO,
   * BO, B.  x and y are the two tile dimensions passed by the kernel (x is
   * the A-side width, y the B-side width).  Parameters are parenthesised so
   * that expression arguments are evaluated as a unit.
   */

  /* REFRESH_TEMP_BK: compute temp, the number of k steps for this tile. */
  #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  #define REFRESH_TEMP_BK(x, y) \
    temp = k - off;
  #elif defined(LEFT)
  #define REFRESH_TEMP_BK(x, y) \
    temp = off + (x);
  #else
  #define REFRESH_TEMP_BK(x, y) \
    temp = off + (y);
  #endif

  /* REFRESH_POINTERS: position BO (and, in the second variant, AO) for the
     tile, then compute temp via REFRESH_TEMP_BK. */
  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  #define REFRESH_POINTERS(x, y) \
    BO = B; \
    REFRESH_TEMP_BK(x, y)
  #else
  #define REFRESH_POINTERS(x, y) \
    AO += off * (x); \
    BO = B + off * (y); \
    REFRESH_TEMP_BK(x, y)
  #endif
  /*
   * TRMM bookkeeping, part 2: advance state after a tile has been stored.
   *
   * These macros expand to plain statements (terminated by ';') or to
   * nothing, and rely on temp, off, k, AO and BO being in scope at the call
   * site.  x and y are the tile dimensions passed by the kernel.  Parameters
   * are parenthesised so that expression arguments are evaluated as a unit.
   */

  /* REFRESH_OFF: with LEFT, move the running offset past the tile. */
  #ifdef LEFT
  #define REFRESH_OFF(x) \
    off += (x);
  #else
  #define REFRESH_OFF(x)
  #endif

  /* UPDATE_TEMP: subtract the tile dimension on the active side from temp. */
  #ifdef LEFT
  #define UPDATE_TEMP(x, y) \
    temp -= (x);
  #else
  #define UPDATE_TEMP(x, y) \
    temp -= (y);
  #endif

  /* REFRESH_TMP_AFTER_SAVE: in the two configurations below, recompute temp
     from k and off and step AO/BO forward by that many k steps; otherwise
     expand to nothing. */
  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  #define REFRESH_TMP_AFTER_SAVE(x, y) \
    temp = k - off; \
    UPDATE_TEMP(x, y) \
    AO += temp * (x); \
    BO += temp * (y);
  #else
  #define REFRESH_TMP_AFTER_SAVE(x, y)
  #endif

  /* REFRESH_AFTER_SAVE: the full post-store update used by the kernel. */
  #define REFRESH_AFTER_SAVE(x,y) \
    REFRESH_TMP_AFTER_SAVE(x, y) \
    REFRESH_OFF(x)
  138. /*************************************************************************************
  139. * GEMM Kernel
  140. *************************************************************************************/
  /*
   * CNAME: GEMM micro-kernel (TRMM micro-kernel when TRMMKERNEL is defined)
   * written with the POWER10 MMA builtins.
   *
   * Structure: n is consumed in chunks of 8, then 4, 2 and 1; inside each
   * chunk m is consumed in tiles of 8, then 4, 2 and 1.  For a tile of
   * width x (A side) and y (B side) every k step reads x consecutive
   * elements from AO and y consecutive elements from BO.
   *
   * Parameters:
   *   m, n, k - extents of the update; k is the length of the reduction.
   *   alpha   - scalar multiplied into every accumulated value before it is
   *             stored.
   *   A, B    - input panels, read through AO/BO as described above.  B is
   *             advanced by k * y after each n chunk.
   *   C, ldc  - output; elements are addressed as CO[r * ldc + j].
   *   offset  - TRMMKERNEL only; seeds the running offset `off` consumed by
   *             the REFRESH_* macros.
   *
   * Without TRMMKERNEL the scaled result is added to C; with TRMMKERNEL it
   * overwrites C.  Always returns 0.
   *
   * NOTE(review): FLOAT is presumably double here (xvf64 builtins, two
   * lanes per 16-byte vector) - confirm against common.h.
   * NOTE(review): in every MMA tile the l = 0 step is issued before the
   * k loop, so AO[0]/BO[0] are read even if temp is 0; temp >= 1 appears
   * to be assumed - confirm against the callers.
   */
  141. int
  142. CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
  143. FLOAT * C, BLASLONG ldc
  144. #ifdef TRMMKERNEL
  145. , BLASLONG offset
  146. #endif
  147. )
  148. {
  149. BLASLONG i1;
  150. #if defined(TRMMKERNEL)
  151. BLASLONG off;
  152. #endif
  153. #if defined(TRMMKERNEL) && !defined(LEFT)
  154. off = -offset;
  155. #endif
  /* alpha replicated into both lanes; used by the non-MMA tail paths. */
  156. v4sf_t valpha = { alpha, alpha };
  /* ---- n in chunks of 8: C advances by 8 * ldc, B by 8 * k per chunk ---- */
  157. for (i1 = 0; i1 < (n >> 3); i1++)
  158. {
  159. BLASLONG j, temp;
  160. FLOAT *CO;
  161. FLOAT *AO;
  162. #if defined(TRMMKERNEL) && defined(LEFT)
  163. off = offset;
  164. #endif
  165. CO = C;
  166. C += ldc << 3;
  167. AO = A;
  168. PREFETCH1 (A, 128);
  169. PREFETCH1 (A, 256);
  /* 8 x 8 tile: eight accumulators, one per (A vector, B pair) combination. */
  170. for (j = 0; j < (m >> 3); j++)
  171. {
  172. FLOAT *BO;
  173. #if defined(TRMMKERNEL)
  174. REFRESH_POINTERS (8, 8);
  175. #else
  176. BO = B;
  177. temp = k;
  178. #endif
  179. v4sf_t *rowC;
  180. v4sf_t result[4];
  181. __vector_quad acc0, acc1, acc2, acc3, acc4,acc5,acc6,acc7;
  182. BLASLONG l = 0;
  183. vec_t *rowA = (vec_t *) & AO[0];
  184. vec_t *rb = (vec_t *) & BO[0];
  185. __vector_pair rowB, rowB1;
  /* l = 0 step issued with xvf64ger; the loop below continues from l = 1
     with xvf64gerpp on the same accumulators. */
  186. __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
  187. __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
  188. __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
  189. __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
  190. __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
  191. __builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]);
  192. __builtin_mma_xvf64ger (&acc4, rowB, rowA[2]);
  193. __builtin_mma_xvf64ger (&acc5, rowB1, rowA[2]);
  194. __builtin_mma_xvf64ger (&acc6, rowB, rowA[3]);
  195. __builtin_mma_xvf64ger (&acc7, rowB1, rowA[3]);
  196. for (l = 1; l < temp; l++)
  197. {
  198. rowA = (vec_t *) & AO[l << 3];
  199. rb = (vec_t *) & BO[l << 3];
  200. __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
  201. __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
  202. __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
  203. __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
  204. __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
  205. __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]);
  206. __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[2]);
  207. __builtin_mma_xvf64gerpp (&acc5, rowB1, rowA[2]);
  208. __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[3]);
  209. __builtin_mma_xvf64gerpp (&acc7, rowB1, rowA[3]);
  210. }
  /* Accumulators built from rowB go to C lines 0..3 (SAVE_ACC), those built
     from rowB1 to lines 4..7 (SAVE_ACC1); the second argument is the
     offset within the line (2 per A vector). */
  211. SAVE_ACC (&acc0, 0);
  212. SAVE_ACC1 (&acc1, 0);
  213. SAVE_ACC (&acc2, 2);
  214. SAVE_ACC1 (&acc3, 2);
  215. SAVE_ACC (&acc4, 4);
  216. SAVE_ACC1 (&acc5, 4);
  217. SAVE_ACC (&acc6, 6);
  218. SAVE_ACC1 (&acc7, 6);
  219. CO += 8;
  220. AO += temp << 3;
  221. BO += temp << 3;
  222. #if defined(TRMMKERNEL)
  223. REFRESH_AFTER_SAVE (8, 8)
  224. #endif
  225. }
  /* 4 x 8 tile (m remainder bit 2): four accumulators. */
  226. if (m & 4)
  227. {
  228. FLOAT *BO;
  229. #if defined(TRMMKERNEL)
  230. REFRESH_POINTERS (4, 8);
  231. #else
  232. BO = B;
  233. temp = k;
  234. #endif
  235. v4sf_t *rowC;
  236. v4sf_t result[4];
  237. __vector_quad acc0, acc1, acc2, acc3;
  238. BLASLONG l = 0;
  239. vec_t *rowA = (vec_t *) & AO[0];
  240. __vector_pair rowB, rowB1;
  241. vec_t *rb = (vec_t *) & BO[0];
  242. __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
  243. __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
  244. __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
  245. __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
  246. __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
  247. __builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]);
  248. for (l = 1; l < temp; l++)
  249. {
  250. rowA = (vec_t *) & AO[l << 2];
  251. rb = (vec_t *) & BO[l << 3];
  252. __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
  253. __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
  254. __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
  255. __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
  256. __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
  257. __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]);
  258. }
  259. SAVE_ACC (&acc0, 0);
  260. SAVE_ACC1 (&acc1, 0);
  261. SAVE_ACC (&acc2, 2);
  262. SAVE_ACC1 (&acc3, 2);
  263. CO += 4;
  264. AO += temp << 2;
  265. BO += temp << 3;
  266. #if defined(TRMMKERNEL)
  267. REFRESH_AFTER_SAVE (4, 8)
  268. #endif
  269. }
  /* 2 x 8 tile (m remainder bit 1): two accumulators. */
  270. if (m & 2)
  271. {
  272. FLOAT *BO;
  273. #if defined(TRMMKERNEL)
  274. REFRESH_POINTERS (2, 8);
  275. #else
  276. BO = B;
  277. temp = k;
  278. #endif
  279. v4sf_t *rowC;
  280. v4sf_t result[4];
  281. __vector_quad acc0, acc1;
  282. BLASLONG l = 0;
  283. vec_t *rowA = (vec_t *) & AO[0];
  284. __vector_pair rowB, rowB1;
  285. vec_t *rb = (vec_t *) & BO[0];
  286. __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
  287. __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
  288. __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
  289. __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
  290. for (l = 1; l < temp; l++)
  291. {
  292. rowA = (vec_t *) & AO[l << 1];
  293. rb = (vec_t *) & BO[l << 3];
  294. __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
  295. __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
  296. __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
  297. __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
  298. }
  299. SAVE_ACC (&acc0, 0);
  300. SAVE_ACC1 (&acc1, 0);
  301. CO += 2;
  302. AO += temp << 1;
  303. BO += temp << 3;
  304. #if defined(TRMMKERNEL)
  305. REFRESH_AFTER_SAVE (2, 8)
  306. #endif
  307. }
  /* 1 x 8 tile (m remainder bit 0): no MMA; the single A element is
     replicated into both lanes and multiplied against four B vectors.
     The k loop starts at l = 0 here, so temp == 0 leaves the sums at zero. */
  308. if (m & 1)
  309. {
  310. FLOAT *BO;
  311. #if defined(TRMMKERNEL)
  312. REFRESH_POINTERS (1, 8);
  313. #else
  314. BO = B;
  315. temp = k;
  316. #endif
  317. BLASLONG l = 0;
  318. v4sf_t t = { 0, 0 };
  319. v4sf_t t1 = { 0, 0 };
  320. v4sf_t t2 = { 0, 0 };
  321. v4sf_t t3 = { 0, 0 };
  322. for (l = 0; l < temp; l++)
  323. {
  324. v4sf_t rowA = { AO[l], AO[l] };
  325. v4sf_t rowB = { BO[l << 3], BO[(l << 3) + 1] };
  326. v4sf_t rowB1 = { BO[(l << 3) + 2], BO[(l << 3) + 3] };
  327. v4sf_t rowB2 = { BO[(l << 3) + 4], BO[(l << 3) + 5] };
  328. v4sf_t rowB3 = { BO[(l << 3) + 6], BO[(l << 3) + 7] };
  329. t += rowA * rowB;
  330. t1 += rowA * rowB1;
  331. t2 += rowA * rowB2;
  332. t3 += rowA * rowB3;
  333. }
  334. t = t * valpha;
  335. t1 = t1 * valpha;
  336. t2 = t2 * valpha;
  337. t3 = t3 * valpha;
  338. #if defined(TRMMKERNEL)
  339. CO[0 * ldc] = t[0];
  340. CO[1 * ldc] = t[1];
  341. CO[2 * ldc] = t1[0];
  342. CO[3 * ldc] = t1[1];
  343. CO[4 * ldc] = t2[0];
  344. CO[5 * ldc] = t2[1];
  345. CO[6 * ldc] = t3[0];
  346. CO[7 * ldc] = t3[1];
  347. #else
  348. CO[0 * ldc] += t[0];
  349. CO[1 * ldc] += t[1];
  350. CO[2 * ldc] += t1[0];
  351. CO[3 * ldc] += t1[1];
  352. CO[4 * ldc] += t2[0];
  353. CO[5 * ldc] += t2[1];
  354. CO[6 * ldc] += t3[0];
  355. CO[7 * ldc] += t3[1];
  356. #endif
  357. CO += 1;
  358. AO += temp;
  359. BO += temp << 3;
  360. #if defined(TRMMKERNEL)
  361. REFRESH_AFTER_SAVE (1, 8)
  362. #endif
  363. }
  364. #if defined(TRMMKERNEL) && !defined(LEFT)
  365. off += 8; // number of values in A
  366. #endif
  367. B += k << 3;
  368. }
  /* ---- n remainder of 4: one B pair per k step, C advances by 4 * ldc ---- */
  369. if (n & 4)
  370. {
  371. BLASLONG j, temp;
  372. FLOAT *CO;
  373. FLOAT *AO;
  374. #if defined(TRMMKERNEL) && defined(LEFT)
  375. off = offset;
  376. #endif
  377. CO = C;
  378. C += ldc << 2;
  379. AO = A;
  380. PREFETCH1 (A, 128);
  381. PREFETCH1 (A, 256);
  /* 8 x 4 tile: four accumulators, one per A vector. */
  382. for (j = 0; j < (m >> 3); j++)
  383. {
  384. FLOAT *BO;
  385. #if defined(TRMMKERNEL)
  386. REFRESH_POINTERS (8, 4);
  387. #else
  388. BO = B;
  389. temp = k;
  390. #endif
  391. v4sf_t *rowC;
  392. v4sf_t result[4];
  393. __vector_quad acc0, acc1, acc2, acc3;
  394. BLASLONG l = 0;
  395. vec_t *rowA = (vec_t *) & AO[0];
  396. __vector_pair rowB;
  397. vec_t *rb = (vec_t *) & BO[0];
  398. __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
  399. __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
  400. __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
  401. __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
  402. __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
  403. for (l = 1; l < temp; l++)
  404. {
  405. rowA = (vec_t *) & AO[l << 3];
  406. rb = (vec_t *) & BO[l << 2];
  407. __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
  408. __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
  409. __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
  410. __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
  411. __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
  412. }
  /* acc2 (offset 4) is stored before acc1 (offset 2); each accumulator is
     paired with the offset matching its rowA index. */
  413. SAVE_ACC (&acc0, 0);
  414. SAVE_ACC (&acc2, 4);
  415. SAVE_ACC (&acc1, 2);
  416. SAVE_ACC (&acc3, 6);
  417. CO += 8;
  418. AO += temp << 3;
  419. BO += temp << 2;
  420. #if defined(TRMMKERNEL)
  421. REFRESH_AFTER_SAVE (8, 4)
  422. #endif
  423. }
  /* 4 x 4 tile. */
  424. if (m & 4)
  425. {
  426. FLOAT *BO;
  427. #if defined(TRMMKERNEL)
  428. REFRESH_POINTERS (4, 4);
  429. #else
  430. BO = B;
  431. temp = k;
  432. #endif
  433. v4sf_t *rowC;
  434. v4sf_t result[4];
  435. __vector_quad acc0, acc1;
  436. BLASLONG l = 0;
  437. vec_t *rowA = (vec_t *) & AO[0];
  438. __vector_pair rowB;
  439. vec_t *rb = (vec_t *) & BO[0];
  440. __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
  441. __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
  442. __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
  443. for (l = 1; l < temp; l++)
  444. {
  445. rowA = (vec_t *) & AO[l << 2];
  446. rb = (vec_t *) & BO[l << 2];
  447. __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
  448. __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
  449. __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
  450. }
  451. SAVE_ACC (&acc0, 0);
  452. SAVE_ACC (&acc1, 2);
  453. CO += 4;
  454. AO += temp << 2;
  455. BO += temp << 2;
  456. #if defined(TRMMKERNEL)
  457. REFRESH_AFTER_SAVE (4, 4)
  458. #endif
  459. }
  /* 2 x 4 tile. */
  460. if (m & 2)
  461. {
  462. FLOAT *BO;
  463. #if defined(TRMMKERNEL)
  464. REFRESH_POINTERS (2, 4);
  465. #else
  466. BO = B;
  467. temp = k;
  468. #endif
  469. v4sf_t *rowC;
  470. v4sf_t result[4];
  471. __vector_quad acc0;
  472. BLASLONG l = 0;
  473. vec_t *rowA = (vec_t *) & AO[0];
  474. __vector_pair rowB;
  475. vec_t *rb = (vec_t *) & BO[0];
  476. __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
  477. __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
  478. for (l = 1; l < temp; l++)
  479. {
  480. rowA = (vec_t *) & AO[l << 1];
  481. rb = (vec_t *) & BO[l << 2];
  482. __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
  483. __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
  484. }
  485. SAVE_ACC (&acc0, 0);
  486. CO += 2;
  487. AO += temp << 1;
  488. BO += temp << 2;
  489. #if defined(TRMMKERNEL)
  490. REFRESH_AFTER_SAVE (2, 4)
  491. #endif
  492. }
  /* 1 x 4 tile: plain vector arithmetic, no MMA. */
  493. if (m & 1)
  494. {
  495. FLOAT *BO;
  496. #if defined(TRMMKERNEL)
  497. REFRESH_POINTERS (1, 4);
  498. #else
  499. BO = B;
  500. temp = k;
  501. #endif
  502. BLASLONG l = 0;
  503. v4sf_t t = { 0, 0 };
  504. v4sf_t t1 = { 0, 0 };
  505. for (l = 0; l < temp; l++)
  506. {
  507. v4sf_t rowA = { AO[l], AO[l] };
  508. v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1] };
  509. v4sf_t rowB1 = { BO[(l << 2) + 2], BO[(l << 2) + 3] };
  510. t += rowA * rowB;
  511. t1 += rowA * rowB1;
  512. }
  513. t = t * valpha;
  514. t1 = t1 * valpha;
  515. #if defined(TRMMKERNEL)
  516. CO[0 * ldc] = t[0];
  517. CO[1 * ldc] = t[1];
  518. CO[2 * ldc] = t1[0];
  519. CO[3 * ldc] = t1[1];
  520. #else
  521. CO[0 * ldc] += t[0];
  522. CO[1 * ldc] += t[1];
  523. CO[2 * ldc] += t1[0];
  524. CO[3 * ldc] += t1[1];
  525. #endif
  526. CO += 1;
  527. AO += temp;
  528. BO += temp << 2;
  529. #if defined(TRMMKERNEL)
  530. REFRESH_AFTER_SAVE (1, 4)
  531. #endif
  532. }
  533. #if defined(TRMMKERNEL) && !defined(LEFT)
  534. off += 4; // number of values in A
  535. #endif
  536. B += k << 2;
  537. }
  /* ---- n remainder of 2: C advances by 2 * ldc.  Each k step has only one
     16-byte vector of B, so the pair is assembled from rb[0] twice and
     SAVE2x4_ACC stores only the first two accumulator rows. ---- */
  538. if (n & 2)
  539. {
  540. BLASLONG j, temp;
  541. #if defined(TRMMKERNEL) && defined(LEFT)
  542. off = offset;
  543. #endif
  544. FLOAT *CO;
  545. FLOAT *AO;
  546. CO = C;
  547. C += ldc << 1;
  548. AO = A;
  /* 8 x 2 tile. */
  549. for (j = 0; j < (m >> 3); j++)
  550. {
  551. FLOAT *BO;
  552. #if defined(TRMMKERNEL)
  553. REFRESH_POINTERS (8, 2);
  554. #else
  555. BO = B;
  556. temp = k;
  557. #endif
  558. v4sf_t *rowC;
  559. v4sf_t result[4];
  560. __vector_quad acc0, acc1, acc2, acc3;
  561. BLASLONG l = 0;
  562. __vector_pair rowB;
  563. vec_t *rb = (vec_t *) & BO[0];
  564. __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
  565. vec_t *rowA = (vec_t *) & AO[0];
  566. __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
  567. __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
  568. __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
  569. __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
  570. for (l = 1; l < temp; l++)
  571. {
  572. rb = (vec_t *) & BO[l << 1];
  573. __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
  574. rowA = (vec_t *) & AO[l << 3];
  575. __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
  576. __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
  577. __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
  578. __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
  579. }
  580. SAVE2x4_ACC (&acc0, 0);
  581. SAVE2x4_ACC (&acc1, 2);
  582. SAVE2x4_ACC (&acc2, 4);
  583. SAVE2x4_ACC (&acc3, 6);
  584. CO += 8;
  585. AO += temp << 3;
  586. BO += temp << 1;
  587. #if defined(TRMMKERNEL)
  588. REFRESH_AFTER_SAVE (8, 2)
  589. #endif
  590. }
  /* 4 x 2 tile. */
  591. if (m & 4)
  592. {
  593. FLOAT *BO;
  594. #if defined(TRMMKERNEL)
  595. REFRESH_POINTERS (4, 2);
  596. #else
  597. BO = B;
  598. temp = k;
  599. #endif
  600. v4sf_t *rowC;
  601. v4sf_t result[4];
  602. __vector_quad acc0, acc1;
  603. BLASLONG l = 0;
  604. __vector_pair rowB;
  605. vec_t *rb = (vec_t *) & BO[0];
  606. __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
  607. vec_t *rowA = (vec_t *) & AO[0];
  608. __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
  609. __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
  610. for (l = 1; l < temp; l++)
  611. {
  612. rb = (vec_t *) & BO[l << 1];
  613. __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
  614. rowA = (vec_t *) & AO[l << 2];
  615. __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
  616. __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
  617. }
  618. SAVE2x4_ACC (&acc0, 0);
  619. SAVE2x4_ACC (&acc1, 2);
  620. CO += 4;
  621. AO += temp << 2;
  622. BO += temp << 1;
  623. #if defined(TRMMKERNEL)
  624. REFRESH_AFTER_SAVE (4, 2)
  625. #endif
  626. }
  /* 2 x 2 tile. */
  627. if (m & 2)
  628. {
  629. FLOAT *BO;
  630. #if defined(TRMMKERNEL)
  631. REFRESH_POINTERS (2, 2);
  632. #else
  633. BO = B;
  634. temp = k;
  635. #endif
  636. v4sf_t *rowC;
  637. v4sf_t result[4];
  638. __vector_quad acc0;
  639. BLASLONG l = 0;
  640. __vector_pair rowB;
  641. vec_t *rb = (vec_t *) & BO[0];
  642. __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
  643. vec_t *rowA = (vec_t *) & AO[0];
  644. __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
  645. for (l = 1; l < temp; l++)
  646. {
  647. rb = (vec_t *) & BO[l << 1];
  648. __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
  649. rowA = (vec_t *) & AO[l << 1];
  650. __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
  651. }
  652. SAVE2x4_ACC (&acc0, 0);
  653. CO += 2;
  654. AO += temp << 1;
  655. BO += temp << 1;
  656. #if defined(TRMMKERNEL)
  657. REFRESH_AFTER_SAVE (2, 2)
  658. #endif
  659. }
  /* 1 x 2 tile: plain vector arithmetic, no MMA. */
  660. if (m & 1)
  661. {
  662. FLOAT *BO;
  663. #if defined(TRMMKERNEL)
  664. REFRESH_POINTERS (1, 2);
  665. #else
  666. BO = B;
  667. temp = k;
  668. #endif
  669. BLASLONG l = 0;
  670. v4sf_t t = { 0, 0 };
  671. for (l = 0; l < temp; l++)
  672. {
  673. v4sf_t rowA = { AO[l], AO[l] };
  674. v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1] };
  675. t += rowA * rowB;
  676. }
  677. t = t * valpha;
  678. #if defined(TRMMKERNEL)
  679. CO[0 * ldc] = t[0];
  680. CO[1 * ldc] = t[1];
  681. #else
  682. CO[0 * ldc] += t[0];
  683. CO[1 * ldc] += t[1];
  684. #endif
  685. CO += 1;
  686. AO += temp;
  687. BO += temp << 1;
  688. #if defined(TRMMKERNEL)
  689. REFRESH_AFTER_SAVE (1, 2)
  690. #endif
  691. }
  692. #if defined(TRMMKERNEL) && !defined(LEFT)
  693. off += 2; // number of values in A
  694. #endif
  695. B += k << 1;
  696. }
  /* ---- n remainder of 1: no MMA anywhere; the single B element per k step
     is replicated into both lanes and results land in consecutive CO[]
     entries. ---- */
  697. if (n & 1)
  698. {
  699. BLASLONG i, temp;
  700. #if defined(TRMMKERNEL) && defined(LEFT)
  701. off = offset;
  702. #endif
  703. FLOAT *CO;
  704. FLOAT *AO;
  705. CO = C;
  706. C += ldc;
  707. AO = A;
  /* 8 x 1 tile. */
  708. for (i = 0; i < (m >> 3); i++)
  709. {
  710. FLOAT *BO;
  711. #if defined(TRMMKERNEL)
  712. REFRESH_POINTERS (8, 1)
  713. #else
  714. BO = B;
  715. temp = k;
  716. #endif
  717. BLASLONG l = 0;
  718. v4sf_t t = { 0, 0 };
  719. v4sf_t t1 = { 0, 0 };
  720. v4sf_t t2 = { 0, 0 };
  721. v4sf_t t3 = { 0, 0 };
  722. for (l = 0; l < temp; l++)
  723. {
  724. v4sf_t rowB = { BO[l], BO[l] };
  725. v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1] };
  726. v4sf_t rowA1 = { AO[(l << 3) + 2], AO[(l << 3) + 3] };
  727. v4sf_t rowA2 = { AO[(l << 3) + 4], AO[(l << 3) + 5] };
  728. v4sf_t rowA3 = { AO[(l << 3) + 6], AO[(l << 3) + 7] };
  729. t += rowA * rowB;
  730. t1 += rowA1 * rowB;
  731. t2 += rowA2 * rowB;
  732. t3 += rowA3 * rowB;
  733. }
  734. t = t * valpha;
  735. t1 = t1 * valpha;
  736. t2 = t2 * valpha;
  737. t3 = t3 * valpha;
  738. #if defined(TRMMKERNEL)
  739. CO[0] = t[0];
  740. CO[1] = t[1];
  741. CO[2] = t1[0];
  742. CO[3] = t1[1];
  743. CO[4] = t2[0];
  744. CO[5] = t2[1];
  745. CO[6] = t3[0];
  746. CO[7] = t3[1];
  747. #else
  748. CO[0] += t[0];
  749. CO[1] += t[1];
  750. CO[2] += t1[0];
  751. CO[3] += t1[1];
  752. CO[4] += t2[0];
  753. CO[5] += t2[1];
  754. CO[6] += t3[0];
  755. CO[7] += t3[1];
  756. #endif
  757. AO += temp << 3;
  758. BO += temp;
  759. CO += 8;
  760. #if defined(TRMMKERNEL)
  761. REFRESH_AFTER_SAVE (8, 1)
  762. #endif
  763. }
  /* 4 x 1 tile. */
  764. if (m & 4)
  765. {
  766. FLOAT *BO;
  767. #if defined(TRMMKERNEL)
  768. REFRESH_POINTERS (4, 1)
  769. #else
  770. BO = B;
  771. temp = k;
  772. #endif
  773. BLASLONG l = 0;
  774. v4sf_t t = { 0, 0 };
  775. v4sf_t t1 = { 0, 0 };
  776. for (l = 0; l < temp; l++)
  777. {
  778. v4sf_t rowB = { BO[l], BO[l] };
  779. v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1] };
  780. v4sf_t rowA1 = { AO[(l << 2) + 2], AO[(l << 2) + 3] };
  781. t += rowA * rowB;
  782. t1 += rowA1 * rowB;
  783. }
  784. t = t * valpha;
  785. t1 = t1 * valpha;
  786. #if defined(TRMMKERNEL)
  787. CO[0] = t[0];
  788. CO[1] = t[1];
  789. CO[2] = t1[0];
  790. CO[3] = t1[1];
  791. #else
  792. CO[0] += t[0];
  793. CO[1] += t[1];
  794. CO[2] += t1[0];
  795. CO[3] += t1[1];
  796. #endif
  797. AO += temp << 2;
  798. BO += temp;
  799. CO += 4;
  800. #if defined(TRMMKERNEL)
  801. REFRESH_AFTER_SAVE (4, 1)
  802. #endif
  803. }
  /* 2 x 1 tile. */
  804. if (m & 2)
  805. {
  806. FLOAT *BO;
  807. #if defined(TRMMKERNEL)
  808. REFRESH_POINTERS (2, 1)
  809. #else
  810. BO = B;
  811. temp = k;
  812. #endif
  813. BLASLONG l = 0;
  814. v4sf_t t = { 0, 0 };
  815. for (l = 0; l < temp; l++)
  816. {
  817. v4sf_t rowB = { BO[l], BO[l] };
  818. v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1] };
  819. t += rowA * rowB;
  820. }
  821. t = t * valpha;
  822. #if defined(TRMMKERNEL)
  823. CO[0] = t[0];
  824. CO[1] = t[1];
  825. #else
  826. CO[0] += t[0];
  827. CO[1] += t[1];
  828. #endif
  829. AO += temp << 1;
  830. BO += temp;
  831. CO += 2;
  832. #if defined(TRMMKERNEL)
  833. REFRESH_AFTER_SAVE (2, 1)
  834. #endif
  835. }
  /* 1 x 1 tile: scalar dot product over temp steps, scaled by alpha. */
  836. if (m & 1)
  837. {
  838. FLOAT *BO;
  839. #if defined(TRMMKERNEL)
  840. REFRESH_POINTERS (1, 1)
  841. #else
  842. BO = B;
  843. temp = k;
  844. #endif
  845. BLASLONG l = 0;
  846. FLOAT t = 0;
  847. for (l = 0; l < temp; l++)
  848. {
  849. t += AO[l] * BO[l];
  850. }
  851. AO += temp;
  852. BO += temp;
  853. #if defined(TRMMKERNEL)
  854. CO[0] = t * alpha;
  855. #else
  856. CO[0] += t * alpha;
  857. #endif
  858. CO += 1;
  859. #if defined(TRMMKERNEL)
  860. REFRESH_AFTER_SAVE (1, 1)
  861. #endif
  862. }
  863. #if defined(TRMMKERNEL) && !defined(LEFT)
  864. off += 1; // number of values in A
  865. #endif
  866. B += k;
  867. }
  868. return 0;
  869. }