You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

dtrmm_kernel_4x8_haswell.c 27 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546
  1. #include "common.h"
  2. #include <stdbool.h>
  3. static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOAT *C0, FLOAT *C1, FLOAT *C2,FLOAT *C3, FLOAT *C4, FLOAT *C5,FLOAT *C6, FLOAT *C7) __attribute__ ((noinline));
  4. static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOAT *C0, FLOAT *C1, FLOAT *C2,FLOAT *C3, FLOAT *C4, FLOAT *C5,FLOAT *C6, FLOAT *C7)
  5. {
  6. BLASLONG i = 0;
  7. BLASLONG temp1 = n * 8;
  8. __asm__ __volatile__
  9. (
  10. " vxorpd %%ymm4 , %%ymm4 , %%ymm4 \n\t"
  11. " vxorpd %%ymm5 , %%ymm5 , %%ymm5 \n\t"
  12. " vxorpd %%ymm6 , %%ymm6 , %%ymm6 \n\t"
  13. " vxorpd %%ymm7 , %%ymm7 , %%ymm7 \n\t"
  14. " vxorpd %%ymm8 , %%ymm8 , %%ymm8 \n\t"
  15. " vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t"
  16. " vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t"
  17. " vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t"
  18. " cmp $0, %1 \n\t"
  19. " jz 2f \n\t"
  20. " .p2align 4 \n\t"
  21. "1: \n\t"
  22. " vmovups (%2,%0,4) , %%ymm0 \n\t"
  23. " vmovups (%3,%0,8) , %%ymm1 \n\t"
  24. " vmovups 32(%3,%0,8) , %%ymm2 \n\t"
  25. " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm4 \n\t"
  26. " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm8 \n\t"
  27. " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t"
  28. " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm5 \n\t"
  29. " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm9 \n\t"
  30. " vpermpd $0x1b , %%ymm0 , %%ymm0 \n\t"
  31. " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm6 \n\t"
  32. " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm10 \n\t"
  33. " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t"
  34. " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm7 \n\t"
  35. " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm11 \n\t"
  36. " addq $8 , %0 \n\t"
  37. " cmp %0 , %1 \n\t"
  38. " jne 1b \n\t"
  39. "2: \n\t"
  40. " vbroadcastsd (%4), %%ymm0 \n\t"
  41. " vmulpd %%ymm0 , %%ymm4 , %%ymm4 \n\t"
  42. " vmulpd %%ymm0 , %%ymm5 , %%ymm5 \n\t"
  43. " vmulpd %%ymm0 , %%ymm6 , %%ymm6 \n\t"
  44. " vmulpd %%ymm0 , %%ymm7 , %%ymm7 \n\t"
  45. " vmulpd %%ymm0 , %%ymm8 , %%ymm8 \n\t"
  46. " vmulpd %%ymm0 , %%ymm9 , %%ymm9 \n\t"
  47. " vmulpd %%ymm0 , %%ymm10, %%ymm10 \n\t"
  48. " vmulpd %%ymm0 , %%ymm11, %%ymm11 \n\t"
  49. " vpermilpd $0x05 , %%ymm5 , %%ymm5 \n\t"
  50. " vpermilpd $0x05 , %%ymm7 , %%ymm7 \n\t"
  51. " vblendpd $0x0a , %%ymm5 , %%ymm4 , %%ymm0 \n\t"
  52. " vblendpd $0x05 , %%ymm5 , %%ymm4 , %%ymm1 \n\t"
  53. " vblendpd $0x0a , %%ymm7 , %%ymm6 , %%ymm2 \n\t"
  54. " vblendpd $0x05 , %%ymm7 , %%ymm6 , %%ymm3 \n\t"
  55. " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t"
  56. " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t"
  57. " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm4 \n\t"
  58. " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm5 \n\t"
  59. " vblendpd $0x03 , %%ymm2 , %%ymm0 , %%ymm6 \n\t"
  60. " vblendpd $0x03 , %%ymm3 , %%ymm1 , %%ymm7 \n\t"
  61. " vmovups %%ymm4 , (%5) \n\t"
  62. " vmovups %%ymm5 , (%6) \n\t"
  63. " vmovups %%ymm6 , (%7) \n\t"
  64. " vmovups %%ymm7 , (%8) \n\t"
  65. " vpermilpd $0x05 , %%ymm9 , %%ymm9 \n\t"
  66. " vpermilpd $0x05 , %%ymm11, %%ymm11 \n\t"
  67. " vblendpd $0x0a , %%ymm9 , %%ymm8 , %%ymm0 \n\t"
  68. " vblendpd $0x05 , %%ymm9 , %%ymm8 , %%ymm1 \n\t"
  69. " vblendpd $0x0a , %%ymm11, %%ymm10, %%ymm2 \n\t"
  70. " vblendpd $0x05 , %%ymm11, %%ymm10, %%ymm3 \n\t"
  71. " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t"
  72. " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t"
  73. " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm4 \n\t"
  74. " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm5 \n\t"
  75. " vblendpd $0x03 , %%ymm2 , %%ymm0 , %%ymm6 \n\t"
  76. " vblendpd $0x03 , %%ymm3 , %%ymm1 , %%ymm7 \n\t"
  77. " vmovups %%ymm4 , (%9) \n\t"
  78. " vmovups %%ymm5 , (%10) \n\t"
  79. " vmovups %%ymm6 , (%11) \n\t"
  80. " vmovups %%ymm7 , (%12) \n\t"
  81. :
  82. :
  83. "a" (i), // 0
  84. "r" (temp1), // 1
  85. "S" (a), // 2
  86. "D" (b), // 3
  87. "r" (alpha), // 4
  88. "r" (C0), // 5
  89. "r" (C1), // 6
  90. "r" (C2), // 7
  91. "r" (C3), // 8
  92. "r" (C4), // 9
  93. "r" (C5), // 10
  94. "r" (C6), // 11
  95. "r" (C7) // 12
  96. : "cc",
  97. "%xmm0", "%xmm1", "%xmm2", "%xmm3",
  98. "%xmm4", "%xmm5", "%xmm6", "%xmm7",
  99. "%xmm8", "%xmm9", "%xmm10", "%xmm11",
  100. "%xmm12", "%xmm13", "%xmm14", "%xmm15",
  101. "memory"
  102. );
  103. }
  104. int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
  105. {
  106. BLASLONG i,j,k;
  107. FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb;
  108. FLOAT res0_0;
  109. FLOAT res0_1;
  110. FLOAT res0_2;
  111. FLOAT res0_3;
  112. FLOAT res1_0;
  113. FLOAT res1_1;
  114. FLOAT res1_2;
  115. FLOAT res1_3;
  116. FLOAT res2_0;
  117. FLOAT res2_1;
  118. FLOAT res2_2;
  119. FLOAT res2_3;
  120. FLOAT res3_0;
  121. FLOAT res3_1;
  122. FLOAT res3_2;
  123. FLOAT res3_3;
  124. FLOAT res4_0;
  125. FLOAT res4_1;
  126. /*
  127. FLOAT res4_2;
  128. FLOAT res4_3;
  129. */
  130. FLOAT res5_0;
  131. FLOAT res5_1;
  132. /*
  133. FLOAT res5_2;
  134. FLOAT res5_3;
  135. */
  136. FLOAT res6_0;
  137. FLOAT res6_1;
  138. /*
  139. FLOAT res6_2;
  140. FLOAT res6_3;
  141. */
  142. FLOAT res7_0;
  143. FLOAT res7_1;
  144. /*
  145. FLOAT res7_2;
  146. FLOAT res7_3;
  147. */
  148. FLOAT a0;
  149. FLOAT a1;
  150. FLOAT b0;
  151. FLOAT b1;
  152. FLOAT b2;
  153. FLOAT b3;
  154. FLOAT b4;
  155. FLOAT b5;
  156. FLOAT b6;
  157. FLOAT b7;
  158. BLASLONG off, temp ;
  159. bool left;
  160. bool transposed;
  161. bool backwards;
  162. #ifdef LEFT
  163. left = true;
  164. #else
  165. left = false;
  166. #endif
  167. #ifdef TRANSA
  168. transposed = true;
  169. #else
  170. transposed = false;
  171. #endif
  172. backwards = left != transposed;
  173. if (!left) {
  174. off = -offset;
  175. }
  176. for (j=0; j<bn/8; j+=1) // do blocks of the Mx8 loops
  177. {
  178. C0 = C;
  179. C1 = C0+ldc;
  180. C2 = C1+ldc;
  181. C3 = C2+ldc;
  182. C4 = C3+ldc;
  183. C5 = C4+ldc;
  184. C6 = C5+ldc;
  185. C7 = C6+ldc;
  186. if (left) {
  187. off = offset;
  188. }
  189. ptrba = ba;
  190. for (i=0; i<bm/4; i+=1) // do blocks of 4x4
  191. {
  192. ptrbb = bb;
  193. if (backwards)
  194. {
  195. ptrba += off*4; // number of values in A
  196. ptrbb += off*8; // number of values in B
  197. }
  198. /*
  199. res0_0 = 0;
  200. res0_1 = 0;
  201. res0_2 = 0;
  202. res0_3 = 0;
  203. res1_0 = 0;
  204. res1_1 = 0;
  205. res1_2 = 0;
  206. res1_3 = 0;
  207. res2_0 = 0;
  208. res2_1 = 0;
  209. res2_2 = 0;
  210. res2_3 = 0;
  211. res3_0 = 0;
  212. res3_1 = 0;
  213. res3_2 = 0;
  214. res3_3 = 0;
  215. res4_0 = 0;
  216. res4_1 = 0;
  217. res4_2 = 0;
  218. res4_3 = 0;
  219. res5_0 = 0;
  220. res5_1 = 0;
  221. res5_2 = 0;
  222. res5_3 = 0;
  223. res6_0 = 0;
  224. res6_1 = 0;
  225. res6_2 = 0;
  226. res6_3 = 0;
  227. res7_0 = 0;
  228. res7_1 = 0;
  229. res7_2 = 0;
  230. res7_3 = 0;
  231. */
  232. temp = backwards ? bk-off :
  233. left ? off + 4 : // number of values in A
  234. off + 8; // number of values in B
  235. dtrmm_kernel_4x8( temp, &alpha , ptrba, ptrbb, C0, C1, C2, C3, C4, C5, C6, C7);
  236. ptrba = ptrba + temp * 4;
  237. ptrbb = ptrbb + temp * 8;
  238. /*
  239. for (k=0; k<temp; k++)
  240. {
  241. b0 = ptrbb[0];
  242. b1 = ptrbb[1];
  243. b2 = ptrbb[2];
  244. b3 = ptrbb[3];
  245. b4 = ptrbb[4];
  246. b5 = ptrbb[5];
  247. b6 = ptrbb[6];
  248. b7 = ptrbb[7];
  249. a0 = ptrba[0];
  250. res0_0 += a0*b0;
  251. res1_0 += a0*b1;
  252. res2_0 += a0*b2;
  253. res3_0 += a0*b3;
  254. res4_0 += a0*b4;
  255. res5_0 += a0*b5;
  256. res6_0 += a0*b6;
  257. res7_0 += a0*b7;
  258. a1 = ptrba[1];
  259. res0_1 += a1*b0;
  260. res1_1 += a1*b1;
  261. res2_1 += a1*b2;
  262. res3_1 += a1*b3;
  263. res4_1 += a1*b4;
  264. res5_1 += a1*b5;
  265. res6_1 += a1*b6;
  266. res7_1 += a1*b7;
  267. a0 = ptrba[2];
  268. res0_2 += a0*b0;
  269. res1_2 += a0*b1;
  270. res2_2 += a0*b2;
  271. res3_2 += a0*b3;
  272. res4_2 += a0*b4;
  273. res5_2 += a0*b5;
  274. res6_2 += a0*b6;
  275. res7_2 += a0*b7;
  276. a1 = ptrba[3];
  277. res0_3 += a1*b0;
  278. res1_3 += a1*b1;
  279. res2_3 += a1*b2;
  280. res3_3 += a1*b3;
  281. res4_3 += a1*b4;
  282. res5_3 += a1*b5;
  283. res6_3 += a1*b6;
  284. res7_3 += a1*b7;
  285. ptrba = ptrba+4;
  286. ptrbb = ptrbb+8;
  287. }
  288. res0_0 *= alpha;
  289. res0_1 *= alpha;
  290. res0_2 *= alpha;
  291. res0_3 *= alpha;
  292. res1_0 *= alpha;
  293. res1_1 *= alpha;
  294. res1_2 *= alpha;
  295. res1_3 *= alpha;
  296. res2_0 *= alpha;
  297. res2_1 *= alpha;
  298. res2_2 *= alpha;
  299. res2_3 *= alpha;
  300. res3_0 *= alpha;
  301. res3_1 *= alpha;
  302. res3_2 *= alpha;
  303. res3_3 *= alpha;
  304. res4_0 *= alpha;
  305. res4_1 *= alpha;
  306. res4_2 *= alpha;
  307. res4_3 *= alpha;
  308. res5_0 *= alpha;
  309. res5_1 *= alpha;
  310. res5_2 *= alpha;
  311. res5_3 *= alpha;
  312. res6_0 *= alpha;
  313. res6_1 *= alpha;
  314. res6_2 *= alpha;
  315. res6_3 *= alpha;
  316. res7_0 *= alpha;
  317. res7_1 *= alpha;
  318. res7_2 *= alpha;
  319. res7_3 *= alpha;
  320. C0[0] = res0_0;
  321. C0[1] = res0_1;
  322. C0[2] = res0_2;
  323. C0[3] = res0_3;
  324. C1[0] = res1_0;
  325. C1[1] = res1_1;
  326. C1[2] = res1_2;
  327. C1[3] = res1_3;
  328. C2[0] = res2_0;
  329. C2[1] = res2_1;
  330. C2[2] = res2_2;
  331. C2[3] = res2_3;
  332. C3[0] = res3_0;
  333. C3[1] = res3_1;
  334. C3[2] = res3_2;
  335. C3[3] = res3_3;
  336. C4[0] = res4_0;
  337. C4[1] = res4_1;
  338. C4[2] = res4_2;
  339. C4[3] = res4_3;
  340. C5[0] = res5_0;
  341. C5[1] = res5_1;
  342. C5[2] = res5_2;
  343. C5[3] = res5_3;
  344. C6[0] = res6_0;
  345. C6[1] = res6_1;
  346. C6[2] = res6_2;
  347. C6[3] = res6_3;
  348. C7[0] = res7_0;
  349. C7[1] = res7_1;
  350. C7[2] = res7_2;
  351. C7[3] = res7_3;
  352. */
  353. if (!backwards) {
  354. temp = bk-off;
  355. temp = left ? temp - 4 : // number of values in A
  356. temp - 8; // number of values in B
  357. ptrba += temp*4; // number of values in A
  358. ptrbb += temp*8; // number of values in B
  359. }
  360. #ifdef LEFT
  361. off += 4; // number of values in A
  362. #endif
  363. C0 = C0+4;
  364. C1 = C1+4;
  365. C2 = C2+4;
  366. C3 = C3+4;
  367. C4 = C4+4;
  368. C5 = C5+4;
  369. C6 = C6+4;
  370. C7 = C7+4;
  371. }
  372. if ( bm & 2 ) // do any 2x4 loop
  373. {
  374. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  375. ptrbb = bb;
  376. #else
  377. ptrba += off*2;
  378. ptrbb = bb + off*8;
  379. #endif
  380. res0_0 = 0;
  381. res0_1 = 0;
  382. res1_0 = 0;
  383. res1_1 = 0;
  384. res2_0 = 0;
  385. res2_1 = 0;
  386. res3_0 = 0;
  387. res3_1 = 0;
  388. res4_0 = 0;
  389. res4_1 = 0;
  390. res5_0 = 0;
  391. res5_1 = 0;
  392. res6_0 = 0;
  393. res6_1 = 0;
  394. res7_0 = 0;
  395. res7_1 = 0;
  396. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  397. temp = bk-off;
  398. #elif defined(LEFT)
  399. temp = off+2; // number of values in A
  400. #else
  401. temp = off+8; // number of values in B
  402. #endif
  403. for (k=0; k<temp; k++)
  404. {
  405. b0 = ptrbb[0];
  406. b1 = ptrbb[1];
  407. b2 = ptrbb[2];
  408. b3 = ptrbb[3];
  409. b4 = ptrbb[4];
  410. b5 = ptrbb[5];
  411. b6 = ptrbb[6];
  412. b7 = ptrbb[7];
  413. a0 = ptrba[0];
  414. res0_0 += a0*b0;
  415. res1_0 += a0*b1;
  416. res2_0 += a0*b2;
  417. res3_0 += a0*b3;
  418. res4_0 += a0*b4;
  419. res5_0 += a0*b5;
  420. res6_0 += a0*b6;
  421. res7_0 += a0*b7;
  422. a1 = ptrba[1];
  423. res0_1 += a1*b0;
  424. res1_1 += a1*b1;
  425. res2_1 += a1*b2;
  426. res3_1 += a1*b3;
  427. res4_1 += a1*b4;
  428. res5_1 += a1*b5;
  429. res6_1 += a1*b6;
  430. res7_1 += a1*b7;
  431. ptrba = ptrba+2;
  432. ptrbb = ptrbb+8;
  433. }
  434. res0_0 *= alpha;
  435. res0_1 *= alpha;
  436. res1_0 *= alpha;
  437. res1_1 *= alpha;
  438. res2_0 *= alpha;
  439. res2_1 *= alpha;
  440. res3_0 *= alpha;
  441. res3_1 *= alpha;
  442. res4_0 *= alpha;
  443. res4_1 *= alpha;
  444. res5_0 *= alpha;
  445. res5_1 *= alpha;
  446. res6_0 *= alpha;
  447. res6_1 *= alpha;
  448. res7_0 *= alpha;
  449. res7_1 *= alpha;
  450. C0[0] = res0_0;
  451. C0[1] = res0_1;
  452. C1[0] = res1_0;
  453. C1[1] = res1_1;
  454. C2[0] = res2_0;
  455. C2[1] = res2_1;
  456. C3[0] = res3_0;
  457. C3[1] = res3_1;
  458. C4[0] = res4_0;
  459. C4[1] = res4_1;
  460. C5[0] = res5_0;
  461. C5[1] = res5_1;
  462. C6[0] = res6_0;
  463. C6[1] = res6_1;
  464. C7[0] = res7_0;
  465. C7[1] = res7_1;
  466. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  467. temp = bk - off;
  468. #ifdef LEFT
  469. temp -= 2; // number of values in A
  470. #else
  471. temp -= 8; // number of values in B
  472. #endif
  473. ptrba += temp*2;
  474. ptrbb += temp*8;
  475. #endif
  476. #ifdef LEFT
  477. off += 2; // number of values in A
  478. #endif
  479. C0 = C0+2;
  480. C1 = C1+2;
  481. C2 = C2+2;
  482. C3 = C3+2;
  483. C4 = C4+2;
  484. C5 = C5+2;
  485. C6 = C6+2;
  486. C7 = C7+2;
  487. }
  488. if ( bm & 1 ) // do any 1x4 loop
  489. {
  490. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  491. ptrbb = bb;
  492. #else
  493. ptrba += off*1;
  494. ptrbb = bb + off*8;
  495. #endif
  496. res0_0 = 0;
  497. res1_0 = 0;
  498. res2_0 = 0;
  499. res3_0 = 0;
  500. res4_0 = 0;
  501. res5_0 = 0;
  502. res6_0 = 0;
  503. res7_0 = 0;
  504. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  505. temp = bk-off;
  506. #elif defined(LEFT)
  507. temp = off+1; // number of values in A
  508. #else
  509. temp = off+8; // number of values in B
  510. #endif
  511. for (k=0; k<temp; k++)
  512. {
  513. b0 = ptrbb[0];
  514. b1 = ptrbb[1];
  515. b2 = ptrbb[2];
  516. b3 = ptrbb[3];
  517. b4 = ptrbb[4];
  518. b5 = ptrbb[5];
  519. b6 = ptrbb[6];
  520. b7 = ptrbb[7];
  521. a0 = ptrba[0];
  522. res0_0 += a0*b0;
  523. res1_0 += a0*b1;
  524. res2_0 += a0*b2;
  525. res3_0 += a0*b3;
  526. res4_0 += a0*b4;
  527. res5_0 += a0*b5;
  528. res6_0 += a0*b6;
  529. res7_0 += a0*b7;
  530. ptrba = ptrba+1;
  531. ptrbb = ptrbb+8;
  532. }
  533. res0_0 *= alpha;
  534. res1_0 *= alpha;
  535. res2_0 *= alpha;
  536. res3_0 *= alpha;
  537. res4_0 *= alpha;
  538. res5_0 *= alpha;
  539. res6_0 *= alpha;
  540. res7_0 *= alpha;
  541. C0[0] = res0_0;
  542. C1[0] = res1_0;
  543. C2[0] = res2_0;
  544. C3[0] = res3_0;
  545. C4[0] = res4_0;
  546. C5[0] = res5_0;
  547. C6[0] = res6_0;
  548. C7[0] = res7_0;
  549. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  550. temp = bk - off;
  551. #ifdef LEFT
  552. temp -= 1; // number of values in A
  553. #else
  554. temp -= 8; // number of values in B
  555. #endif
  556. ptrba += temp*1;
  557. ptrbb += temp*8;
  558. #endif
  559. #ifdef LEFT
  560. off += 1; // number of values in A
  561. #endif
  562. C0 = C0+1;
  563. C1 = C1+1;
  564. C2 = C2+1;
  565. C3 = C3+1;
  566. C4 = C4+1;
  567. C5 = C5+1;
  568. C6 = C6+1;
  569. C7 = C7+1;
  570. }
  571. #if defined(TRMMKERNEL) && !defined(LEFT)
  572. off += 8;
  573. #endif
  574. k = (bk<<3);
  575. bb = bb+k;
  576. i = (ldc<<3);
  577. C = C+i;
  578. }
  579. for (j=0; j<(bn&4); j+=4) // do blocks of the Mx4 loops
  580. {
  581. C0 = C;
  582. C1 = C0+ldc;
  583. C2 = C1+ldc;
  584. C3 = C2+ldc;
  585. if (left) {
  586. off = offset;
  587. }
  588. ptrba = ba;
  589. for (i=0; i<bm/4; i+=1) // do blocks of 4x4
  590. {
  591. ptrbb = bb;
  592. if (backwards)
  593. {
  594. ptrba += off*4; // number of values in A
  595. ptrbb += off*4; // number of values in B
  596. }
  597. res0_0 = 0;
  598. res0_1 = 0;
  599. res0_2 = 0;
  600. res0_3 = 0;
  601. res1_0 = 0;
  602. res1_1 = 0;
  603. res1_2 = 0;
  604. res1_3 = 0;
  605. res2_0 = 0;
  606. res2_1 = 0;
  607. res2_2 = 0;
  608. res2_3 = 0;
  609. res3_0 = 0;
  610. res3_1 = 0;
  611. res3_2 = 0;
  612. res3_3 = 0;
  613. temp = backwards ? bk-off : off + 4;
  614. /* left ? off + 4 : // number of values in A
  615. off + 4; // number of values in B */
  616. for (k=0; k<temp; k++)
  617. {
  618. b0 = ptrbb[0];
  619. b1 = ptrbb[1];
  620. b2 = ptrbb[2];
  621. b3 = ptrbb[3];
  622. a0 = ptrba[0];
  623. res0_0 += a0*b0;
  624. res1_0 += a0*b1;
  625. res2_0 += a0*b2;
  626. res3_0 += a0*b3;
  627. a1 = ptrba[1];
  628. res0_1 += a1*b0;
  629. res1_1 += a1*b1;
  630. res2_1 += a1*b2;
  631. res3_1 += a1*b3;
  632. a0 = ptrba[2];
  633. res0_2 += a0*b0;
  634. res1_2 += a0*b1;
  635. res2_2 += a0*b2;
  636. res3_2 += a0*b3;
  637. a1 = ptrba[3];
  638. res0_3 += a1*b0;
  639. res1_3 += a1*b1;
  640. res2_3 += a1*b2;
  641. res3_3 += a1*b3;
  642. ptrba = ptrba+4;
  643. ptrbb = ptrbb+4;
  644. }
  645. res0_0 *= alpha;
  646. res0_1 *= alpha;
  647. res0_2 *= alpha;
  648. res0_3 *= alpha;
  649. res1_0 *= alpha;
  650. res1_1 *= alpha;
  651. res1_2 *= alpha;
  652. res1_3 *= alpha;
  653. res2_0 *= alpha;
  654. res2_1 *= alpha;
  655. res2_2 *= alpha;
  656. res2_3 *= alpha;
  657. res3_0 *= alpha;
  658. res3_1 *= alpha;
  659. res3_2 *= alpha;
  660. res3_3 *= alpha;
  661. C0[0] = res0_0;
  662. C0[1] = res0_1;
  663. C0[2] = res0_2;
  664. C0[3] = res0_3;
  665. C1[0] = res1_0;
  666. C1[1] = res1_1;
  667. C1[2] = res1_2;
  668. C1[3] = res1_3;
  669. C2[0] = res2_0;
  670. C2[1] = res2_1;
  671. C2[2] = res2_2;
  672. C2[3] = res2_3;
  673. C3[0] = res3_0;
  674. C3[1] = res3_1;
  675. C3[2] = res3_2;
  676. C3[3] = res3_3;
  677. if (!backwards) {
  678. temp = bk-off - 4;
  679. /* temp = left ? temp - 4 : // number of values in A
  680. temp - 4; // number of values in B */
  681. ptrba += temp*4; // number of values in A
  682. ptrbb += temp*4; // number of values in B
  683. }
  684. #ifdef LEFT
  685. off += 4; // number of values in A
  686. #endif
  687. C0 = C0+4;
  688. C1 = C1+4;
  689. C2 = C2+4;
  690. C3 = C3+4;
  691. }
  692. if ( bm & 2 ) // do any 2x4 loop
  693. {
  694. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  695. ptrbb = bb;
  696. #else
  697. ptrba += off*2;
  698. ptrbb = bb + off*4;
  699. #endif
  700. res0_0 = 0;
  701. res0_1 = 0;
  702. res1_0 = 0;
  703. res1_1 = 0;
  704. res2_0 = 0;
  705. res2_1 = 0;
  706. res3_0 = 0;
  707. res3_1 = 0;
  708. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  709. temp = bk-off;
  710. #elif defined(LEFT)
  711. temp = off+2; // number of values in A
  712. #else
  713. temp = off+4; // number of values in B
  714. #endif
  715. for (k=0; k<temp; k++)
  716. {
  717. b0 = ptrbb[0];
  718. b1 = ptrbb[1];
  719. b2 = ptrbb[2];
  720. b3 = ptrbb[3];
  721. a0 = ptrba[0];
  722. res0_0 += a0*b0;
  723. res1_0 += a0*b1;
  724. res2_0 += a0*b2;
  725. res3_0 += a0*b3;
  726. a1 = ptrba[1];
  727. res0_1 += a1*b0;
  728. res1_1 += a1*b1;
  729. res2_1 += a1*b2;
  730. res3_1 += a1*b3;
  731. ptrba = ptrba+2;
  732. ptrbb = ptrbb+4;
  733. }
  734. res0_0 *= alpha;
  735. res0_1 *= alpha;
  736. res1_0 *= alpha;
  737. res1_1 *= alpha;
  738. res2_0 *= alpha;
  739. res2_1 *= alpha;
  740. res3_0 *= alpha;
  741. res3_1 *= alpha;
  742. C0[0] = res0_0;
  743. C0[1] = res0_1;
  744. C1[0] = res1_0;
  745. C1[1] = res1_1;
  746. C2[0] = res2_0;
  747. C2[1] = res2_1;
  748. C3[0] = res3_0;
  749. C3[1] = res3_1;
  750. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  751. temp = bk - off;
  752. #ifdef LEFT
  753. temp -= 2; // number of values in A
  754. #else
  755. temp -= 4; // number of values in B
  756. #endif
  757. ptrba += temp*2;
  758. ptrbb += temp*4;
  759. #endif
  760. #ifdef LEFT
  761. off += 2; // number of values in A
  762. #endif
  763. C0 = C0+2;
  764. C1 = C1+2;
  765. C2 = C2+2;
  766. C3 = C3+2;
  767. }
  768. if ( bm & 1 ) // do any 1x4 loop
  769. {
  770. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  771. ptrbb = bb;
  772. #else
  773. ptrba += off*1;
  774. ptrbb = bb + off*4;
  775. #endif
  776. res0_0 = 0;
  777. res1_0 = 0;
  778. res2_0 = 0;
  779. res3_0 = 0;
  780. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  781. temp = bk-off;
  782. #elif defined(LEFT)
  783. temp = off+1; // number of values in A
  784. #else
  785. temp = off+4; // number of values in B
  786. #endif
  787. for (k=0; k<temp; k++)
  788. {
  789. b0 = ptrbb[0];
  790. b1 = ptrbb[1];
  791. b2 = ptrbb[2];
  792. b3 = ptrbb[3];
  793. a0 = ptrba[0];
  794. res0_0 += a0*b0;
  795. res1_0 += a0*b1;
  796. res2_0 += a0*b2;
  797. res3_0 += a0*b3;
  798. ptrba = ptrba+1;
  799. ptrbb = ptrbb+4;
  800. }
  801. res0_0 *= alpha;
  802. res1_0 *= alpha;
  803. res2_0 *= alpha;
  804. res3_0 *= alpha;
  805. C0[0] = res0_0;
  806. C1[0] = res1_0;
  807. C2[0] = res2_0;
  808. C3[0] = res3_0;
  809. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  810. temp = bk - off;
  811. #ifdef LEFT
  812. temp -= 1; // number of values in A
  813. #else
  814. temp -= 4; // number of values in B
  815. #endif
  816. ptrba += temp*1;
  817. ptrbb += temp*4;
  818. #endif
  819. #ifdef LEFT
  820. off += 1; // number of values in A
  821. #endif
  822. C0 = C0+1;
  823. C1 = C1+1;
  824. C2 = C2+1;
  825. C3 = C3+1;
  826. }
  827. #if defined(TRMMKERNEL) && !defined(LEFT)
  828. off += 4;
  829. #endif
  830. k = (bk<<2);
  831. bb = bb+k;
  832. i = (ldc<<2);
  833. C = C+i;
  834. }
  835. for (j=0; j<(bn&2); j+=2) // do the Mx2 loops
  836. {
  837. C0 = C;
  838. C1 = C0+ldc;
  839. #if defined(TRMMKERNEL) && defined(LEFT)
  840. off = offset;
  841. #endif
  842. ptrba = ba;
  843. for (i=0; i<bm/4; i+=1) // do blocks of 4x2
  844. {
  845. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  846. ptrbb = bb;
  847. #else
  848. ptrba += off*4;
  849. ptrbb = bb + off*2;
  850. #endif
  851. res0_0 = 0;
  852. res0_1 = 0;
  853. res0_2 = 0;
  854. res0_3 = 0;
  855. res1_0 = 0;
  856. res1_1 = 0;
  857. res1_2 = 0;
  858. res1_3 = 0;
  859. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  860. temp = bk-off;
  861. #elif defined(LEFT)
  862. temp = off+4; // number of values in A
  863. #else
  864. temp = off+2; // number of values in B
  865. #endif
  866. for (k=0; k<temp; k++)
  867. {
  868. b0 = ptrbb[0];
  869. b1 = ptrbb[1];
  870. a0 = ptrba[0];
  871. res0_0 += a0*b0;
  872. res1_0 += a0*b1;
  873. a1 = ptrba[1];
  874. res0_1 += a1*b0;
  875. res1_1 += a1*b1;
  876. a0 = ptrba[2];
  877. res0_2 += a0*b0;
  878. res1_2 += a0*b1;
  879. a1 = ptrba[3];
  880. res0_3 += a1*b0;
  881. res1_3 += a1*b1;
  882. ptrba = ptrba+4;
  883. ptrbb = ptrbb+2;
  884. }
  885. res0_0 *= alpha;
  886. res0_1 *= alpha;
  887. res0_2 *= alpha;
  888. res0_3 *= alpha;
  889. res1_0 *= alpha;
  890. res1_1 *= alpha;
  891. res1_2 *= alpha;
  892. res1_3 *= alpha;
  893. C0[0] = res0_0;
  894. C0[1] = res0_1;
  895. C0[2] = res0_2;
  896. C0[3] = res0_3;
  897. C1[0] = res1_0;
  898. C1[1] = res1_1;
  899. C1[2] = res1_2;
  900. C1[3] = res1_3;
  901. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  902. temp = bk - off;
  903. #ifdef LEFT
  904. temp -= 4; // number of values in A
  905. #else
  906. temp -= 2; // number of values in B
  907. #endif
  908. ptrba += temp*4;
  909. ptrbb += temp*2;
  910. #endif
  911. #ifdef LEFT
  912. off += 4; // number of values in A
  913. #endif
  914. C0 = C0+4;
  915. C1 = C1+4;
  916. }
  917. if ( bm & 2 ) // do any 2x2 loop
  918. {
  919. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  920. ptrbb = bb;
  921. #else
  922. ptrba += off*2;
  923. ptrbb = bb + off*2;
  924. #endif
  925. res0_0 = 0;
  926. res0_1 = 0;
  927. res1_0 = 0;
  928. res1_1 = 0;
  929. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  930. temp = bk-off;
  931. #elif defined(LEFT)
  932. temp = off+2; // number of values in A
  933. #else
  934. temp = off+2; // number of values in B
  935. #endif
  936. for (k=0; k<temp; k++)
  937. {
  938. b0 = ptrbb[0];
  939. b1 = ptrbb[1];
  940. a0 = ptrba[0];
  941. res0_0 += a0*b0;
  942. res1_0 += a0*b1;
  943. a1 = ptrba[1];
  944. res0_1 += a1*b0;
  945. res1_1 += a1*b1;
  946. ptrba = ptrba+2;
  947. ptrbb = ptrbb+2;
  948. }
  949. res0_0 *= alpha;
  950. res0_1 *= alpha;
  951. res1_0 *= alpha;
  952. res1_1 *= alpha;
  953. C0[0] = res0_0;
  954. C0[1] = res0_1;
  955. C1[0] = res1_0;
  956. C1[1] = res1_1;
  957. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  958. temp = bk - off;
  959. #ifdef LEFT
  960. temp -= 2; // number of values in A
  961. #else
  962. temp -= 2; // number of values in B
  963. #endif
  964. ptrba += temp*2;
  965. ptrbb += temp*2;
  966. #endif
  967. #ifdef LEFT
  968. off += 2; // number of values in A
  969. #endif
  970. C0 = C0+2;
  971. C1 = C1+2;
  972. }
  973. if ( bm & 1 ) // do any 1x2 loop
  974. {
  975. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  976. ptrbb = bb;
  977. #else
  978. ptrba += off*1;
  979. ptrbb = bb + off*2;
  980. #endif
  981. res0_0 = 0;
  982. res1_0 = 0;
  983. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  984. temp = bk-off;
  985. #elif defined(LEFT)
  986. temp = off+1; // number of values in A
  987. #else
  988. temp = off+2; // number of values in B
  989. #endif
  990. for (k=0; k<temp; k++)
  991. {
  992. b0 = ptrbb[0];
  993. b1 = ptrbb[1];
  994. a0 = ptrba[0];
  995. res0_0 += a0*b0;
  996. res1_0 += a0*b1;
  997. ptrba = ptrba+1;
  998. ptrbb = ptrbb+2;
  999. }
  1000. res0_0 *= alpha;
  1001. res1_0 *= alpha;
  1002. C0[0] = res0_0;
  1003. C1[0] = res1_0;
  1004. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1005. temp = bk - off;
  1006. #ifdef LEFT
  1007. temp -= 1; // number of values in A
  1008. #else
  1009. temp -= 2; // number of values in B
  1010. #endif
  1011. ptrba += temp*1;
  1012. ptrbb += temp*2;
  1013. #endif
  1014. #ifdef LEFT
  1015. off += 1; // number of values in A
  1016. #endif
  1017. C0 = C0+1;
  1018. C1 = C1+1;
  1019. }
  1020. #if defined(TRMMKERNEL) && !defined(LEFT)
  1021. off += 2;
  1022. #endif
  1023. k = (bk<<1);
  1024. bb = bb+k;
  1025. i = (ldc<<1);
  1026. C = C+i;
  1027. }
  1028. for (j=0; j<(bn&1); j+=1) // do the Mx1 loops
  1029. {
  1030. C0 = C;
  1031. #if defined(TRMMKERNEL) && defined(LEFT)
  1032. off = offset;
  1033. #endif
  1034. ptrba = ba;
  1035. for (i=0; i<bm/4; i+=1) // do blocks of 4x1 loops
  1036. {
  1037. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1038. ptrbb = bb;
  1039. #else
  1040. ptrba += off*4;
  1041. ptrbb = bb + off*1;
  1042. #endif
  1043. res0_0 = 0;
  1044. res0_1 = 0;
  1045. res0_2 = 0;
  1046. res0_3 = 0;
  1047. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1048. temp = bk-off;
  1049. #elif defined(LEFT)
  1050. temp = off+4; // number of values in A
  1051. #else
  1052. temp = off+1; // number of values in B
  1053. #endif
  1054. for (k=0; k<temp; k++)
  1055. {
  1056. b0 = ptrbb[0];
  1057. a0 = ptrba[0];
  1058. res0_0 += a0*b0;
  1059. a1 = ptrba[1];
  1060. res0_1 += a1*b0;
  1061. a0 = ptrba[2];
  1062. res0_2 += a0*b0;
  1063. a1 = ptrba[3];
  1064. res0_3 += a1*b0;
  1065. ptrba = ptrba+4;
  1066. ptrbb = ptrbb+1;
  1067. }
  1068. res0_0 *= alpha;
  1069. res0_1 *= alpha;
  1070. res0_2 *= alpha;
  1071. res0_3 *= alpha;
  1072. C0[0] = res0_0;
  1073. C0[1] = res0_1;
  1074. C0[2] = res0_2;
  1075. C0[3] = res0_3;
  1076. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1077. temp = bk - off;
  1078. #ifdef LEFT
  1079. temp -= 4; // number of values in A
  1080. #else
  1081. temp -= 1; // number of values in B
  1082. #endif
  1083. ptrba += temp*4;
  1084. ptrbb += temp*1;
  1085. #endif
  1086. #ifdef LEFT
  1087. off += 4; // number of values in A
  1088. #endif
  1089. C0 = C0+4;
  1090. }
  1091. if ( bm & 2 ) // do any 2x1 loop
  1092. {
  1093. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1094. ptrbb = bb;
  1095. #else
  1096. ptrba += off*2;
  1097. ptrbb = bb + off*1;
  1098. #endif
  1099. res0_0 = 0;
  1100. res0_1 = 0;
  1101. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1102. temp = bk-off;
  1103. #elif defined(LEFT)
  1104. temp = off+2; // number of values in A
  1105. #else
  1106. temp = off+1; // number of values in B
  1107. #endif
  1108. for (k=0; k<temp; k++)
  1109. {
  1110. b0 = ptrbb[0];
  1111. a0 = ptrba[0];
  1112. res0_0 += a0*b0;
  1113. a1 = ptrba[1];
  1114. res0_1 += a1*b0;
  1115. ptrba = ptrba+2;
  1116. ptrbb = ptrbb+1;
  1117. }
  1118. res0_0 *= alpha;
  1119. res0_1 *= alpha;
  1120. C0[0] = res0_0;
  1121. C0[1] = res0_1;
  1122. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1123. temp = bk - off;
  1124. #ifdef LEFT
  1125. temp -= 2; // number of values in A
  1126. #else
  1127. temp -= 1; // number of values in B
  1128. #endif
  1129. ptrba += temp*2;
  1130. ptrbb += temp*1;
  1131. #endif
  1132. #ifdef LEFT
  1133. off += 2; // number of values in A
  1134. #endif
  1135. C0 = C0+2;
  1136. }
  1137. if ( bm & 1 ) // do any 1x1 loop
  1138. {
  1139. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1140. ptrbb = bb;
  1141. #else
  1142. ptrba += off*1;
  1143. ptrbb = bb + off*1;
  1144. #endif
  1145. res0_0 = 0;
  1146. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1147. temp = bk-off;
  1148. #elif defined(LEFT)
  1149. temp = off+1; // number of values in A
  1150. #else
  1151. temp = off+1; // number of values in B
  1152. #endif
  1153. for (k=0; k<temp; k++)
  1154. {
  1155. b0 = ptrbb[0];
  1156. a0 = ptrba[0];
  1157. res0_0 += a0*b0;
  1158. ptrba = ptrba+1;
  1159. ptrbb = ptrbb+1;
  1160. }
  1161. res0_0 *= alpha;
  1162. C0[0] = res0_0;
  1163. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1164. temp = bk - off;
  1165. #ifdef LEFT
  1166. temp -= 1; // number of values in A
  1167. #else
  1168. temp -= 1; // number of values in B
  1169. #endif
  1170. ptrba += temp*1;
  1171. ptrbb += temp*1;
  1172. #endif
  1173. #ifdef LEFT
  1174. off += 1; // number of values in A
  1175. #endif
  1176. C0 = C0+1;
  1177. }
  1178. #if defined(TRMMKERNEL) && !defined(LEFT)
  1179. off += 1;
  1180. #endif
  1181. k = (bk<<0);
  1182. bb = bb+k;
  1183. C = C+ldc;
  1184. }
  1185. return 0;
  1186. }