You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

zgemv_t.S 31 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Blocking limit and stack-frame size used by this kernel. */
  40. #define P 2048 /* cap applied to MIN_N (= M - IS) on every pass of LL(ISLoop); also subtracted from LDA * N when PLDA_M is set up */
  41. #ifndef __64BIT__
  42. #define STACKSIZE 224 /* bytes taken off SP in the prologue, 32-bit build: f14-f31 saves, r14-r25 saves, FZERO/ALPHA_R/ALPHA_I slots */
  43. #else
  44. #define STACKSIZE 304 /* bytes taken off SP in the prologue, 64-bit build: same contents with 8-byte GPR saves */
  45. #endif
  /* Names for the registers that hold the routine's arguments.  The mapping */
  /* is chosen by the linux / _AIX / __APPLE__, __64BIT__ and DOUBLE build   */
  /* macros.  Names marked "FRAMESLOT(n)" are not live on entry: the         */
  /* prologue loads them from FRAMESLOT(n) + STACKSIZE(SP) into the register */
  /* named here.  BUFFER (defined separately) is always loaded that way.     */
  46. #ifdef linux
  47. #ifndef __64BIT__
  48. #define M r3
  49. #define N r4
  50. #define A r6
  51. #define LDA r7
  52. #define X r8
  53. #define INCX r9
  54. #define Y r10
  55. #define INCY r5 /* FRAMESLOT(0) */
  56. #else
  57. #define M r3
  58. #define N r4
  59. #define A r8
  60. #define LDA r9
  61. #define X r10
  62. #define INCX r5 /* FRAMESLOT(0) */
  63. #define Y r6 /* FRAMESLOT(1) */
  64. #define INCY r7 /* FRAMESLOT(2) */
  65. #endif
  66. #endif
  67. #if defined(_AIX) || defined(__APPLE__)
  68. #if !defined(__64BIT__) && defined(DOUBLE)
  69. #define M r3
  70. #define N r4
  71. #define A r10
  72. #define LDA r5 /* FRAMESLOT(0) */
  73. #define X r6 /* FRAMESLOT(1) */
  74. #define INCX r7 /* FRAMESLOT(2) */
  75. #define Y r8 /* FRAMESLOT(3) */
  76. #define INCY r9 /* FRAMESLOT(4) */
  77. #else
  78. #define M r3
  79. #define N r4
  80. #define A r8
  81. #define LDA r9
  82. #define X r10
  83. #define INCX r5 /* FRAMESLOT(0) */
  84. #define Y r6 /* FRAMESLOT(1) */
  85. #define INCY r7 /* FRAMESLOT(2) */
  86. #endif
  87. #endif
  /* Working registers.  r14-r25 are saved to the stack frame in the prologue. */
  88. #define BUFFER r11 /* loaded from the caller's frame; X is packed here when INCX != 2 * SIZE */
  89. #define XP r12 /* X data read by the kernels: X + (IS << ZBASE_SHIFT) or BUFFER, then biased by -SIZE at LL(Main) */
  90. #define MIN_N r14 /* min(M - IS, P) for the current LL(ISLoop) pass */
  91. #define J r15 /* column counter: N >> 2 groups of four, then N & 3 leftovers */
  92. #define CO r16 /* store cursor while packing X, afterwards the cursor into Y */
  93. #define BO r17 /* read cursor over the X data inside the kernels; store cursor into Y in LL(FinishN1) */
  94. #define PLDA_M r18 /* (LDA * N - P) << ZBASE_SHIFT, computed in the prologue */
  95. #define AO1 r19 /* AO1..AO4: four columns of A, each LDA apart, walked together */
  96. #define AO2 r20
  97. #define AO3 r21
  98. #define AO4 r22
  99. #define IS r23 /* offset of the current block, initialised to 0 */
  100. #define PREA r24 /* PREFETCHSIZE_A * SIZE, index operand of the DCBT on AO1..AO4 */
  101. #define PREC r25 /* PREFETCHSIZE_C * SIZE, index operand of the dcbtst on CO */
  102. #define Y1 r23 /* dummy; should be same as gemv_n.S */
  103. #define Y2 r24 /* dummy; should be same as gemv_n.S */
  /* Per-CPU prefetch distances.  Each value is multiplied by SIZE and loaded */
  /* into PREA / PREC in the prologue: PREA is the index operand of the DCBT  */
  /* issued on AO1..AO4 in LL(MainKernel), PREC that of the dcbtst on CO.     */
  /* NOTE(review): no fallback is defined in this block for a target that     */
  /* matches none of the macros below - confirm the build always selects one. */
  104. #if defined(PPCG4)
  105. #define PREFETCHSIZE_A 34
  106. #define PREFETCHSIZE_C 16
  107. #endif
  108. #if defined(PPC440) || defined(PPC440FP2)
  109. #define PREFETCHSIZE_A 34
  110. #define PREFETCHSIZE_C 16
  111. #endif
  112. #ifdef PPC970
  113. #define PREFETCHSIZE_A 56
  114. #define PREFETCHSIZE_C 16
  115. #endif
  116. #ifdef CELL
  117. #define PREFETCHSIZE_A 56
  118. #define PREFETCHSIZE_C 16
  119. #endif
  120. #ifdef POWER4
  121. #define PREFETCHSIZE_A 34
  122. #define PREFETCHSIZE_C 16
  123. #endif
  124. #ifdef POWER5
  125. #define PREFETCHSIZE_A 40
  126. #define PREFETCHSIZE_C 8
  127. #endif
  128. #ifdef POWER6
  129. #define PREFETCHSIZE_A 24
  130. #define PREFETCHSIZE_C 8
  131. #endif
  132. #ifdef POWER8
  133. #define PREFETCHSIZE_A 24
  134. #define PREFETCHSIZE_C 8
  135. #endif
  /* FMADDR / FMSUBR are the fused multiply-add forms used when alpha is     */
  /* applied to the accumulated sums in LL(MainFinish) and LL(FinishN1):     */
  /* FMADDR combines ALPHA_R with the odd (1 * SIZE) results, FMSUBR         */
  /* combines ALPHA_I with them.  The two mappings are exchanged only when   */
  /* both CONJ and XCONJ are defined.                                        */
  136. #if !(defined(CONJ) && defined(XCONJ))
  137. #define FMADDR FMADD
  138. #define FMSUBR FNMSUB
  139. #else
  140. #define FMADDR FNMSUB
  141. #define FMSUBR FMADD
  142. #endif
  143. #ifndef NEEDPARAM
  /* Scratch slots inside the frame, placed after the FPR and GPR save areas. */
  144. #ifndef __64BIT__
  145. #define FZERO 200(SP) /* 8 bytes written as zero from r0; reloaded with lfd to clear the accumulators */
  146. #define ALPHA_R 208(SP) /* f1 as received on entry; reloaded into f30 in LL(MainFinish) */
  147. #define ALPHA_I 216(SP) /* f2 as received on entry; reloaded into f31 in LL(MainFinish) */
  148. #else
  149. #define FZERO 256(SP) /* 8 bytes written as zero from r0; reloaded with lfd to clear the accumulators */
  150. #define ALPHA_R 264(SP) /* f1 as received on entry; reloaded into f30 in LL(MainFinish) */
  151. #define ALPHA_I 272(SP) /* f2 as received on entry; reloaded into f31 in LL(MainFinish) */
  152. #endif
  153. PROLOGUE
  154. PROFCODE
  155. addi SP, SP, -STACKSIZE
  156. li r0, 0
  157. stfd f14, 0(SP)
  158. stfd f15, 8(SP)
  159. stfd f16, 16(SP)
  160. stfd f17, 24(SP)
  161. stfd f18, 32(SP)
  162. stfd f19, 40(SP)
  163. stfd f20, 48(SP)
  164. stfd f21, 56(SP)
  165. stfd f22, 64(SP)
  166. stfd f23, 72(SP)
  167. stfd f24, 80(SP)
  168. stfd f25, 88(SP)
  169. stfd f26, 96(SP)
  170. stfd f27, 104(SP)
  171. stfd f28, 112(SP)
  172. stfd f29, 120(SP)
  173. stfd f30, 128(SP)
  174. stfd f31, 136(SP)
  175. #ifdef __64BIT__
  176. std r14, 144(SP)
  177. std r15, 152(SP)
  178. std r16, 160(SP)
  179. std r17, 168(SP)
  180. std r18, 176(SP)
  181. std r19, 184(SP)
  182. std r20, 192(SP)
  183. std r21, 200(SP)
  184. std r22, 208(SP)
  185. std r23, 216(SP)
  186. std r24, 224(SP)
  187. std r25, 232(SP)
  188. std r0, FZERO
  189. #else
  190. stw r14, 144(SP)
  191. stw r15, 148(SP)
  192. stw r16, 152(SP)
  193. stw r17, 156(SP)
  194. stw r18, 160(SP)
  195. stw r19, 164(SP)
  196. stw r20, 168(SP)
  197. stw r21, 172(SP)
  198. stw r22, 176(SP)
  199. stw r23, 180(SP)
  200. stw r24, 184(SP)
  201. stw r25, 188(SP)
  202. stw r0, FZERO
  203. stw r0, 4 + FZERO
  204. #endif
  205. #ifdef linux
  206. #ifndef __64BIT__
  207. lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
  208. lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
  209. #else
  210. ld INCX, FRAMESLOT(0) + STACKSIZE(SP)
  211. ld Y, FRAMESLOT(1) + STACKSIZE(SP)
  212. ld INCY, FRAMESLOT(2) + STACKSIZE(SP)
  213. ld BUFFER, FRAMESLOT(3) + STACKSIZE(SP)
  214. #endif
  215. #endif
  216. #if defined(_AIX) || defined(__APPLE__)
  217. #ifndef __64BIT__
  218. #ifdef DOUBLE
  219. lwz LDA, FRAMESLOT(0) + STACKSIZE(SP)
  220. lwz X, FRAMESLOT(1) + STACKSIZE(SP)
  221. lwz INCX, FRAMESLOT(2) + STACKSIZE(SP)
  222. lwz Y, FRAMESLOT(3) + STACKSIZE(SP)
  223. lwz INCY, FRAMESLOT(4) + STACKSIZE(SP)
  224. lwz BUFFER, FRAMESLOT(5) + STACKSIZE(SP)
  225. #else
  226. lwz INCX, FRAMESLOT(0) + STACKSIZE(SP)
  227. lwz Y, FRAMESLOT(1) + STACKSIZE(SP)
  228. lwz INCY, FRAMESLOT(2) + STACKSIZE(SP)
  229. lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP)
  230. #endif
  231. #else
  232. ld INCX, FRAMESLOT(0) + STACKSIZE(SP)
  233. ld Y, FRAMESLOT(1) + STACKSIZE(SP)
  234. ld INCY, FRAMESLOT(2) + STACKSIZE(SP)
  235. ld BUFFER, FRAMESLOT(3) + STACKSIZE(SP)
  236. #endif
  237. #endif
  238. stfd f1, ALPHA_R
  239. stfd f2, ALPHA_I
  240. mullw PLDA_M, LDA, N
  241. li XP, P
  242. subf PLDA_M, XP, PLDA_M
  243. slwi PLDA_M, PLDA_M, ZBASE_SHIFT
  244. slwi LDA, LDA, ZBASE_SHIFT
  245. slwi INCX, INCX, ZBASE_SHIFT
  246. slwi INCY, INCY, ZBASE_SHIFT
  247. li IS, 0
  248. li PREA, PREFETCHSIZE_A * SIZE
  249. li PREC, PREFETCHSIZE_C * SIZE
  250. cmpwi cr0, M, 0
  251. ble LL(End)
  252. cmpwi cr0, N, 0
  253. ble LL(End)
  254. .align 4
  255. LL(ISLoop):
  256. subf MIN_N, IS, M
  257. slwi r0, IS, ZBASE_SHIFT
  258. cmpi cr0, 0, MIN_N, P
  259. ble+ LL(min_nP)
  260. li MIN_N, P
  261. LL(min_nP):
  262. add XP, X, r0
  263. cmpwi cr0, INCX, 2 * SIZE
  264. beq LL(Main)
  265. mr XP, BUFFER
  266. addi CO, BUFFER, -SIZE
  267. srawi. r0, MIN_N, 2
  268. mtspr CTR, r0
  269. ble LL(CopyRemain)
  270. .align 4
  271. LL(CopyKernel):
  272. LFD f0, 0 * SIZE(X)
  273. LFD f1, 1 * SIZE(X)
  274. add X, X, INCX
  275. LFD f2, 0 * SIZE(X)
  276. LFD f3, 1 * SIZE(X)
  277. add X, X, INCX
  278. LFD f4, 0 * SIZE(X)
  279. LFD f5, 1 * SIZE(X)
  280. add X, X, INCX
  281. LFD f6, 0 * SIZE(X)
  282. LFD f7, 1 * SIZE(X)
  283. add X, X, INCX
  284. STFD f0, 1 * SIZE(CO)
  285. STFD f1, 2 * SIZE(CO)
  286. STFD f2, 3 * SIZE(CO)
  287. STFD f3, 4 * SIZE(CO)
  288. STFD f4, 5 * SIZE(CO)
  289. STFD f5, 6 * SIZE(CO)
  290. STFD f6, 7 * SIZE(CO)
  291. STFDU f7, 8 * SIZE(CO)
  292. bdnz LL(CopyKernel)
  293. .align 4
  294. LL(CopyRemain):
  295. andi. r0, MIN_N, 3
  296. mtspr CTR, r0
  297. ble LL(Main)
  298. .align 4
  299. LL(CopySub):
  300. LFD f0, 0 * SIZE(X)
  301. LFD f1, 1 * SIZE(X)
  302. add X, X, INCX
  303. STFD f0, 1 * SIZE(CO)
  304. STFDU f1, 2 * SIZE(CO)
  305. bdnz LL(CopySub)
  306. .align 4
  307. LL(Main):
  308. mr CO, Y
  309. addi XP, XP, -SIZE
  310. srawi. J, N, 2
  311. ble LL(Remain)
  312. .align 4
  313. LL(MainHead):
  314. mr AO1, A
  315. add AO2, A, LDA
  316. add AO3, AO2, LDA
  317. add AO4, AO3, LDA
  318. add A, AO4, LDA
  319. mr BO, XP
  320. lfd f0, FZERO
  321. fmr f1, f0
  322. fmr f2, f0
  323. fmr f3, f0
  324. fmr f4, f0
  325. fmr f5, f0
  326. fmr f6, f0
  327. fmr f7, f0
  328. fmr f8, f0
  329. fmr f9, f0
  330. fmr f10, f0
  331. fmr f11, f0
  332. fmr f12, f0
  333. fmr f13, f0
  334. fmr f14, f0
  335. fmr f15, f0
  336. dcbtst PREC, CO
  337. srawi. r0, MIN_N, 3
  338. mtspr CTR, r0
  339. ble LL(MainN3)
  340. LFD f16, 0 * SIZE(AO1)
  341. LFD f17, 1 * SIZE(AO1)
  342. LFD f18, 0 * SIZE(AO2)
  343. LFD f19, 1 * SIZE(AO2)
  344. LFD f20, 0 * SIZE(AO3)
  345. LFD f21, 1 * SIZE(AO3)
  346. LFD f22, 0 * SIZE(AO4)
  347. LFD f23, 1 * SIZE(AO4)
  348. LFD f24, 1 * SIZE(BO)
  349. LFD f25, 2 * SIZE(BO)
  350. LFD f26, 3 * SIZE(BO)
  351. LFD f27, 4 * SIZE(BO)
  352. LFD f28, 5 * SIZE(BO)
  353. LFD f29, 6 * SIZE(BO)
  354. LFD f30, 7 * SIZE(BO)
  355. LFD f31, 8 * SIZE(BO)
  356. bdz LL(MainKernelSkip)
  357. .align 5
  358. LL(MainKernel):
  359. FMADD f0, f16, f24, f0
  360. FMADD f1, f16, f25, f1
  361. FMADD f2, f17, f24, f2
  362. FMADD f3, f17, f25, f3
  363. FMADD f4, f18, f24, f4
  364. FMADD f5, f18, f25, f5
  365. FMADD f6, f19, f24, f6
  366. FMADD f7, f19, f25, f7
  367. LFD f16, 2 * SIZE(AO1)
  368. LFD f17, 3 * SIZE(AO1)
  369. LFD f18, 2 * SIZE(AO2)
  370. LFD f19, 3 * SIZE(AO2)
  371. FMADD f8, f20, f24, f8
  372. FMADD f9, f20, f25, f9
  373. FMADD f10, f21, f24, f10
  374. FMADD f11, f21, f25, f11
  375. FMADD f12, f22, f24, f12
  376. FMADD f13, f22, f25, f13
  377. FMADD f14, f23, f24, f14
  378. FMADD f15, f23, f25, f15
  379. LFD f20, 2 * SIZE(AO3)
  380. LFD f21, 3 * SIZE(AO3)
  381. LFD f22, 2 * SIZE(AO4)
  382. LFD f23, 3 * SIZE(AO4)
  383. FMADD f0, f16, f26, f0
  384. FMADD f1, f16, f27, f1
  385. FMADD f2, f17, f26, f2
  386. FMADD f3, f17, f27, f3
  387. FMADD f4, f18, f26, f4
  388. FMADD f5, f18, f27, f5
  389. FMADD f6, f19, f26, f6
  390. FMADD f7, f19, f27, f7
  391. LFD f16, 4 * SIZE(AO1)
  392. LFD f17, 5 * SIZE(AO1)
  393. LFD f18, 4 * SIZE(AO2)
  394. LFD f19, 5 * SIZE(AO2)
  395. FMADD f8, f20, f26, f8
  396. FMADD f9, f20, f27, f9
  397. FMADD f10, f21, f26, f10
  398. FMADD f11, f21, f27, f11
  399. FMADD f12, f22, f26, f12
  400. FMADD f13, f22, f27, f13
  401. FMADD f14, f23, f26, f14
  402. FMADD f15, f23, f27, f15
  403. LFD f20, 4 * SIZE(AO3)
  404. LFD f21, 5 * SIZE(AO3)
  405. LFD f22, 4 * SIZE(AO4)
  406. LFD f23, 5 * SIZE(AO4)
  407. LFD f24, 9 * SIZE(BO)
  408. LFD f25, 10 * SIZE(BO)
  409. LFD f26, 11 * SIZE(BO)
  410. LFD f27, 12 * SIZE(BO)
  411. FMADD f0, f16, f28, f0
  412. FMADD f1, f16, f29, f1
  413. FMADD f2, f17, f28, f2
  414. FMADD f3, f17, f29, f3
  415. FMADD f4, f18, f28, f4
  416. FMADD f5, f18, f29, f5
  417. FMADD f6, f19, f28, f6
  418. FMADD f7, f19, f29, f7
  419. LFD f16, 6 * SIZE(AO1)
  420. LFD f17, 7 * SIZE(AO1)
  421. LFD f18, 6 * SIZE(AO2)
  422. LFD f19, 7 * SIZE(AO2)
  423. FMADD f8, f20, f28, f8
  424. FMADD f9, f20, f29, f9
  425. FMADD f10, f21, f28, f10
  426. FMADD f11, f21, f29, f11
  427. FMADD f12, f22, f28, f12
  428. FMADD f13, f22, f29, f13
  429. FMADD f14, f23, f28, f14
  430. FMADD f15, f23, f29, f15
  431. LFD f20, 6 * SIZE(AO3)
  432. LFD f21, 7 * SIZE(AO3)
  433. LFD f22, 6 * SIZE(AO4)
  434. LFD f23, 7 * SIZE(AO4)
  435. FMADD f0, f16, f30, f0
  436. FMADD f1, f16, f31, f1
  437. FMADD f2, f17, f30, f2
  438. FMADD f3, f17, f31, f3
  439. FMADD f4, f18, f30, f4
  440. FMADD f5, f18, f31, f5
  441. FMADD f6, f19, f30, f6
  442. FMADD f7, f19, f31, f7
  443. LFD f16, 8 * SIZE(AO1)
  444. LFD f17, 9 * SIZE(AO1)
  445. LFD f18, 8 * SIZE(AO2)
  446. LFD f19, 9 * SIZE(AO2)
  447. FMADD f8, f20, f30, f8
  448. FMADD f9, f20, f31, f9
  449. FMADD f10, f21, f30, f10
  450. FMADD f11, f21, f31, f11
  451. FMADD f12, f22, f30, f12
  452. FMADD f13, f22, f31, f13
  453. FMADD f14, f23, f30, f14
  454. FMADD f15, f23, f31, f15
  455. LFD f20, 8 * SIZE(AO3)
  456. LFD f21, 9 * SIZE(AO3)
  457. LFD f22, 8 * SIZE(AO4)
  458. LFD f23, 9 * SIZE(AO4)
  459. LFD f28, 13 * SIZE(BO)
  460. LFD f29, 14 * SIZE(BO)
  461. LFD f30, 15 * SIZE(BO)
  462. LFD f31, 16 * SIZE(BO)
  463. FMADD f0, f16, f24, f0
  464. FMADD f1, f16, f25, f1
  465. FMADD f2, f17, f24, f2
  466. FMADD f3, f17, f25, f3
  467. FMADD f4, f18, f24, f4
  468. FMADD f5, f18, f25, f5
  469. FMADD f6, f19, f24, f6
  470. FMADD f7, f19, f25, f7
  471. LFD f16, 10 * SIZE(AO1)
  472. LFD f17, 11 * SIZE(AO1)
  473. LFD f18, 10 * SIZE(AO2)
  474. LFD f19, 11 * SIZE(AO2)
  475. FMADD f8, f20, f24, f8
  476. FMADD f9, f20, f25, f9
  477. FMADD f10, f21, f24, f10
  478. FMADD f11, f21, f25, f11
  479. FMADD f12, f22, f24, f12
  480. FMADD f13, f22, f25, f13
  481. FMADD f14, f23, f24, f14
  482. FMADD f15, f23, f25, f15
  483. LFD f20, 10 * SIZE(AO3)
  484. LFD f21, 11 * SIZE(AO3)
  485. LFD f22, 10 * SIZE(AO4)
  486. LFD f23, 11 * SIZE(AO4)
  487. FMADD f0, f16, f26, f0
  488. FMADD f1, f16, f27, f1
  489. FMADD f2, f17, f26, f2
  490. FMADD f3, f17, f27, f3
  491. FMADD f4, f18, f26, f4
  492. FMADD f5, f18, f27, f5
  493. FMADD f6, f19, f26, f6
  494. FMADD f7, f19, f27, f7
  495. LFD f16, 12 * SIZE(AO1)
  496. LFD f17, 13 * SIZE(AO1)
  497. LFD f18, 12 * SIZE(AO2)
  498. LFD f19, 13 * SIZE(AO2)
  499. FMADD f8, f20, f26, f8
  500. FMADD f9, f20, f27, f9
  501. FMADD f10, f21, f26, f10
  502. FMADD f11, f21, f27, f11
  503. FMADD f12, f22, f26, f12
  504. FMADD f13, f22, f27, f13
  505. FMADD f14, f23, f26, f14
  506. FMADD f15, f23, f27, f15
  507. LFD f20, 12 * SIZE(AO3)
  508. LFD f21, 13 * SIZE(AO3)
  509. LFD f22, 12 * SIZE(AO4)
  510. LFD f23, 13 * SIZE(AO4)
  511. LFD f24, 17 * SIZE(BO)
  512. LFD f25, 18 * SIZE(BO)
  513. LFD f26, 19 * SIZE(BO)
  514. LFD f27, 20 * SIZE(BO)
  515. FMADD f0, f16, f28, f0
  516. FMADD f1, f16, f29, f1
  517. FMADD f2, f17, f28, f2
  518. FMADD f3, f17, f29, f3
  519. FMADD f4, f18, f28, f4
  520. FMADD f5, f18, f29, f5
  521. FMADD f6, f19, f28, f6
  522. FMADD f7, f19, f29, f7
  523. LFD f16, 14 * SIZE(AO1)
  524. LFD f17, 15 * SIZE(AO1)
  525. LFD f18, 14 * SIZE(AO2)
  526. LFD f19, 15 * SIZE(AO2)
  527. FMADD f8, f20, f28, f8
  528. FMADD f9, f20, f29, f9
  529. FMADD f10, f21, f28, f10
  530. FMADD f11, f21, f29, f11
  531. FMADD f12, f22, f28, f12
  532. FMADD f13, f22, f29, f13
  533. FMADD f14, f23, f28, f14
  534. FMADD f15, f23, f29, f15
  535. LFD f20, 14 * SIZE(AO3)
  536. LFD f21, 15 * SIZE(AO3)
  537. LFD f22, 14 * SIZE(AO4)
  538. LFD f23, 15 * SIZE(AO4)
  539. FMADD f0, f16, f30, f0
  540. FMADD f1, f16, f31, f1
  541. FMADD f2, f17, f30, f2
  542. FMADD f3, f17, f31, f3
  543. FMADD f4, f18, f30, f4
  544. FMADD f5, f18, f31, f5
  545. FMADD f6, f19, f30, f6
  546. FMADD f7, f19, f31, f7
  547. LFD f16, 16 * SIZE(AO1)
  548. LFD f17, 17 * SIZE(AO1)
  549. LFD f18, 16 * SIZE(AO2)
  550. LFD f19, 17 * SIZE(AO2)
  551. addi AO1, AO1, 16 * SIZE
  552. addi AO2, AO2, 16 * SIZE
  553. DCBT(AO1, PREA)
  554. DCBT(AO2, PREA)
  555. FMADD f8, f20, f30, f8
  556. FMADD f9, f20, f31, f9
  557. FMADD f10, f21, f30, f10
  558. FMADD f11, f21, f31, f11
  559. FMADD f12, f22, f30, f12
  560. FMADD f13, f22, f31, f13
  561. FMADD f14, f23, f30, f14
  562. FMADD f15, f23, f31, f15
  563. LFD f20, 16 * SIZE(AO3)
  564. LFD f21, 17 * SIZE(AO3)
  565. LFD f22, 16 * SIZE(AO4)
  566. LFD f23, 17 * SIZE(AO4)
  567. LFD f28, 21 * SIZE(BO)
  568. LFD f29, 22 * SIZE(BO)
  569. LFD f30, 23 * SIZE(BO)
  570. LFD f31, 24 * SIZE(BO)
  571. addi AO3, AO3, 16 * SIZE
  572. addi AO4, AO4, 16 * SIZE
  573. DCBT(AO3, PREA)
  574. DCBT(AO4, PREA)
  575. addi BO, BO, 16 * SIZE
  576. bdnz LL(MainKernel)
  577. .align 4
  578. LL(MainKernelSkip):
  579. FMADD f0, f16, f24, f0
  580. FMADD f1, f16, f25, f1
  581. FMADD f2, f17, f24, f2
  582. FMADD f3, f17, f25, f3
  583. FMADD f4, f18, f24, f4
  584. FMADD f5, f18, f25, f5
  585. FMADD f6, f19, f24, f6
  586. FMADD f7, f19, f25, f7
  587. LFD f16, 2 * SIZE(AO1)
  588. LFD f17, 3 * SIZE(AO1)
  589. LFD f18, 2 * SIZE(AO2)
  590. LFD f19, 3 * SIZE(AO2)
  591. FMADD f8, f20, f24, f8
  592. FMADD f9, f20, f25, f9
  593. FMADD f10, f21, f24, f10
  594. FMADD f11, f21, f25, f11
  595. FMADD f12, f22, f24, f12
  596. FMADD f13, f22, f25, f13
  597. FMADD f14, f23, f24, f14
  598. FMADD f15, f23, f25, f15
  599. LFD f20, 2 * SIZE(AO3)
  600. LFD f21, 3 * SIZE(AO3)
  601. LFD f22, 2 * SIZE(AO4)
  602. LFD f23, 3 * SIZE(AO4)
  603. FMADD f0, f16, f26, f0
  604. FMADD f1, f16, f27, f1
  605. FMADD f2, f17, f26, f2
  606. FMADD f3, f17, f27, f3
  607. FMADD f4, f18, f26, f4
  608. FMADD f5, f18, f27, f5
  609. FMADD f6, f19, f26, f6
  610. FMADD f7, f19, f27, f7
  611. LFD f16, 4 * SIZE(AO1)
  612. LFD f17, 5 * SIZE(AO1)
  613. LFD f18, 4 * SIZE(AO2)
  614. LFD f19, 5 * SIZE(AO2)
  615. FMADD f8, f20, f26, f8
  616. FMADD f9, f20, f27, f9
  617. FMADD f10, f21, f26, f10
  618. FMADD f11, f21, f27, f11
  619. FMADD f12, f22, f26, f12
  620. FMADD f13, f22, f27, f13
  621. FMADD f14, f23, f26, f14
  622. FMADD f15, f23, f27, f15
  623. LFD f20, 4 * SIZE(AO3)
  624. LFD f21, 5 * SIZE(AO3)
  625. LFD f22, 4 * SIZE(AO4)
  626. LFD f23, 5 * SIZE(AO4)
  627. FMADD f0, f16, f28, f0
  628. FMADD f1, f16, f29, f1
  629. FMADD f2, f17, f28, f2
  630. FMADD f3, f17, f29, f3
  631. FMADD f4, f18, f28, f4
  632. FMADD f5, f18, f29, f5
  633. FMADD f6, f19, f28, f6
  634. FMADD f7, f19, f29, f7
  635. LFD f16, 6 * SIZE(AO1)
  636. LFD f17, 7 * SIZE(AO1)
  637. LFD f18, 6 * SIZE(AO2)
  638. LFD f19, 7 * SIZE(AO2)
  639. FMADD f8, f20, f28, f8
  640. FMADD f9, f20, f29, f9
  641. FMADD f10, f21, f28, f10
  642. FMADD f11, f21, f29, f11
  643. FMADD f12, f22, f28, f12
  644. FMADD f13, f22, f29, f13
  645. FMADD f14, f23, f28, f14
  646. FMADD f15, f23, f29, f15
  647. LFD f20, 6 * SIZE(AO3)
  648. LFD f21, 7 * SIZE(AO3)
  649. LFD f22, 6 * SIZE(AO4)
  650. LFD f23, 7 * SIZE(AO4)
  651. FMADD f0, f16, f30, f0
  652. FMADD f1, f16, f31, f1
  653. FMADD f2, f17, f30, f2
  654. FMADD f3, f17, f31, f3
  655. FMADD f4, f18, f30, f4
  656. FMADD f5, f18, f31, f5
  657. FMADD f6, f19, f30, f6
  658. FMADD f7, f19, f31, f7
  659. LFD f16, 8 * SIZE(AO1)
  660. LFD f17, 9 * SIZE(AO1)
  661. LFD f18, 8 * SIZE(AO2)
  662. LFD f19, 9 * SIZE(AO2)
  663. FMADD f8, f20, f30, f8
  664. FMADD f9, f20, f31, f9
  665. FMADD f10, f21, f30, f10
  666. FMADD f11, f21, f31, f11
  667. FMADD f12, f22, f30, f12
  668. FMADD f13, f22, f31, f13
  669. FMADD f14, f23, f30, f14
  670. FMADD f15, f23, f31, f15
  671. LFD f20, 8 * SIZE(AO3)
  672. LFD f21, 9 * SIZE(AO3)
  673. LFD f22, 8 * SIZE(AO4)
  674. LFD f23, 9 * SIZE(AO4)
  675. LFD f24, 9 * SIZE(BO)
  676. LFD f25, 10 * SIZE(BO)
  677. LFD f26, 11 * SIZE(BO)
  678. LFD f27, 12 * SIZE(BO)
  679. LFD f28, 13 * SIZE(BO)
  680. LFD f29, 14 * SIZE(BO)
  681. LFD f30, 15 * SIZE(BO)
  682. LFDU f31, 16 * SIZE(BO)
  683. FMADD f0, f16, f24, f0
  684. FMADD f1, f16, f25, f1
  685. FMADD f2, f17, f24, f2
  686. FMADD f3, f17, f25, f3
  687. FMADD f4, f18, f24, f4
  688. FMADD f5, f18, f25, f5
  689. FMADD f6, f19, f24, f6
  690. FMADD f7, f19, f25, f7
  691. LFD f16, 10 * SIZE(AO1)
  692. LFD f17, 11 * SIZE(AO1)
  693. LFD f18, 10 * SIZE(AO2)
  694. LFD f19, 11 * SIZE(AO2)
  695. FMADD f8, f20, f24, f8
  696. FMADD f9, f20, f25, f9
  697. FMADD f10, f21, f24, f10
  698. FMADD f11, f21, f25, f11
  699. FMADD f12, f22, f24, f12
  700. FMADD f13, f22, f25, f13
  701. FMADD f14, f23, f24, f14
  702. FMADD f15, f23, f25, f15
  703. LFD f20, 10 * SIZE(AO3)
  704. LFD f21, 11 * SIZE(AO3)
  705. LFD f22, 10 * SIZE(AO4)
  706. LFD f23, 11 * SIZE(AO4)
  707. FMADD f0, f16, f26, f0
  708. FMADD f1, f16, f27, f1
  709. FMADD f2, f17, f26, f2
  710. FMADD f3, f17, f27, f3
  711. FMADD f4, f18, f26, f4
  712. FMADD f5, f18, f27, f5
  713. FMADD f6, f19, f26, f6
  714. FMADD f7, f19, f27, f7
  715. LFD f16, 12 * SIZE(AO1)
  716. LFD f17, 13 * SIZE(AO1)
  717. LFD f18, 12 * SIZE(AO2)
  718. LFD f19, 13 * SIZE(AO2)
  719. FMADD f8, f20, f26, f8
  720. FMADD f9, f20, f27, f9
  721. FMADD f10, f21, f26, f10
  722. FMADD f11, f21, f27, f11
  723. FMADD f12, f22, f26, f12
  724. FMADD f13, f22, f27, f13
  725. FMADD f14, f23, f26, f14
  726. FMADD f15, f23, f27, f15
  727. LFD f20, 12 * SIZE(AO3)
  728. LFD f21, 13 * SIZE(AO3)
  729. LFD f22, 12 * SIZE(AO4)
  730. LFD f23, 13 * SIZE(AO4)
  731. FMADD f0, f16, f28, f0
  732. FMADD f1, f16, f29, f1
  733. FMADD f2, f17, f28, f2
  734. FMADD f3, f17, f29, f3
  735. FMADD f4, f18, f28, f4
  736. FMADD f5, f18, f29, f5
  737. FMADD f6, f19, f28, f6
  738. FMADD f7, f19, f29, f7
  739. LFD f16, 14 * SIZE(AO1)
  740. LFD f17, 15 * SIZE(AO1)
  741. LFD f18, 14 * SIZE(AO2)
  742. LFD f19, 15 * SIZE(AO2)
  743. FMADD f8, f20, f28, f8
  744. FMADD f9, f20, f29, f9
  745. FMADD f10, f21, f28, f10
  746. FMADD f11, f21, f29, f11
  747. FMADD f12, f22, f28, f12
  748. FMADD f13, f22, f29, f13
  749. FMADD f14, f23, f28, f14
  750. FMADD f15, f23, f29, f15
  751. LFD f20, 14 * SIZE(AO3)
  752. LFD f21, 15 * SIZE(AO3)
  753. LFD f22, 14 * SIZE(AO4)
  754. LFD f23, 15 * SIZE(AO4)
  755. addi AO1, AO1, 16 * SIZE
  756. addi AO2, AO2, 16 * SIZE
  757. addi AO3, AO3, 16 * SIZE
  758. addi AO4, AO4, 16 * SIZE
  759. FMADD f0, f16, f30, f0
  760. FMADD f1, f16, f31, f1
  761. FMADD f2, f17, f30, f2
  762. FMADD f3, f17, f31, f3
  763. FMADD f4, f18, f30, f4
  764. FMADD f5, f18, f31, f5
  765. FMADD f6, f19, f30, f6
  766. FMADD f7, f19, f31, f7
  767. FMADD f8, f20, f30, f8
  768. FMADD f9, f20, f31, f9
  769. FMADD f10, f21, f30, f10
  770. FMADD f11, f21, f31, f11
  771. FMADD f12, f22, f30, f12
  772. FMADD f13, f22, f31, f13
  773. FMADD f14, f23, f30, f14
  774. FMADD f15, f23, f31, f15
  775. .align 4
  776. LL(MainN3):
  777. andi. r0, MIN_N, 7
  778. mtspr CTR, r0
  779. ble LL(MainFinish)
  780. .align 4
  781. LFD f16, 0 * SIZE(AO1)
  782. LFD f17, 1 * SIZE(AO1)
  783. LFD f18, 0 * SIZE(AO2)
  784. LFD f19, 1 * SIZE(AO2)
  785. LFD f20, 0 * SIZE(AO3)
  786. LFD f21, 1 * SIZE(AO3)
  787. LFD f22, 0 * SIZE(AO4)
  788. LFD f23, 1 * SIZE(AO4)
  789. LFD f24, 1 * SIZE(BO)
  790. LFDU f25, 2 * SIZE(BO)
  791. addi AO1, AO1, 2 * SIZE
  792. addi AO2, AO2, 2 * SIZE
  793. addi AO3, AO3, 2 * SIZE
  794. addi AO4, AO4, 2 * SIZE
  795. bdz LL(MainN3KernelSkip)
  796. .align 4
  797. LL(MainN3Kernel):
  798. FMADD f0, f16, f24, f0
  799. FMADD f1, f16, f25, f1
  800. FMADD f2, f17, f24, f2
  801. FMADD f3, f17, f25, f3
  802. FMADD f4, f18, f24, f4
  803. FMADD f5, f18, f25, f5
  804. FMADD f6, f19, f24, f6
  805. FMADD f7, f19, f25, f7
  806. LFD f16, 0 * SIZE(AO1)
  807. LFD f17, 1 * SIZE(AO1)
  808. LFD f18, 0 * SIZE(AO2)
  809. LFD f19, 1 * SIZE(AO2)
  810. FMADD f8, f20, f24, f8
  811. FMADD f9, f20, f25, f9
  812. FMADD f10, f21, f24, f10
  813. FMADD f11, f21, f25, f11
  814. FMADD f12, f22, f24, f12
  815. FMADD f13, f22, f25, f13
  816. FMADD f14, f23, f24, f14
  817. FMADD f15, f23, f25, f15
  818. LFD f20, 0 * SIZE(AO3)
  819. LFD f21, 1 * SIZE(AO3)
  820. LFD f22, 0 * SIZE(AO4)
  821. LFD f23, 1 * SIZE(AO4)
  822. LFD f24, 1 * SIZE(BO)
  823. LFDU f25, 2 * SIZE(BO)
  824. addi AO1, AO1, 2 * SIZE
  825. addi AO2, AO2, 2 * SIZE
  826. addi AO3, AO3, 2 * SIZE
  827. addi AO4, AO4, 2 * SIZE
  828. bdnz LL(MainN3Kernel)
  829. .align 4
  830. LL(MainN3KernelSkip):
  831. FMADD f0, f16, f24, f0
  832. FMADD f1, f16, f25, f1
  833. FMADD f2, f17, f24, f2
  834. FMADD f3, f17, f25, f3
  835. FMADD f4, f18, f24, f4
  836. FMADD f5, f18, f25, f5
  837. FMADD f6, f19, f24, f6
  838. FMADD f7, f19, f25, f7
  839. FMADD f8, f20, f24, f8
  840. FMADD f9, f20, f25, f9
  841. FMADD f10, f21, f24, f10
  842. FMADD f11, f21, f25, f11
  843. FMADD f12, f22, f24, f12
  844. FMADD f13, f22, f25, f13
  845. FMADD f14, f23, f24, f14
  846. FMADD f15, f23, f25, f15
  847. .align 4
  /* MainFinish: fold the sixteen partial sums f0-f15 into four value pairs, scale them by alpha and add them into four output elements at CO. */
  848. LL(MainFinish):
  /* f30 / f31 receive the two alpha components stored at ALPHA_R / ALPHA_I. */
  849. lfd f30, ALPHA_R
  850. lfd f31, ALPHA_I
  /* Each group of four accumulators collapses into a pair (f0/f1, f4/f5, f8/f9, f12/f13); CONJ and XCONJ select the signs. */
  851. #ifndef XCONJ
  852. #ifndef CONJ
  /* Neither XCONJ nor CONJ: first = f0 - f3, second = f1 + f2 (same pattern for the other three groups). */
  853. FSUB f0, f0, f3
  854. FADD f1, f1, f2
  855. FSUB f4, f4, f7
  856. FADD f5, f5, f6
  857. FSUB f8, f8, f11
  858. FADD f9, f9, f10
  859. FSUB f12, f12, f15
  860. FADD f13, f13, f14
  861. #else
  /* CONJ only: first = f0 + f3, second = f1 - f2. */
  862. FADD f0, f0, f3
  863. FSUB f1, f1, f2
  864. FADD f4, f4, f7
  865. FSUB f5, f5, f6
  866. FADD f8, f8, f11
  867. FSUB f9, f9, f10
  868. FADD f12, f12, f15
  869. FSUB f13, f13, f14
  870. #endif
  871. #else
  872. #ifndef CONJ
  /* XCONJ only: first = f0 + f3, second = f2 - f1 (note the swapped operands). */
  873. FADD f0, f0, f3
  874. FSUB f1, f2, f1
  875. FADD f4, f4, f7
  876. FSUB f5, f6, f5
  877. FADD f8, f8, f11
  878. FSUB f9, f10, f9
  879. FADD f12, f12, f15
  880. FSUB f13, f14, f13
  881. #else
  /* XCONJ and CONJ: same signs as the plain case (first = f0 - f3, second = f1 + f2). */
  882. FSUB f0, f0, f3
  883. FADD f1, f1, f2
  884. FSUB f4, f4, f7
  885. FADD f5, f5, f6
  886. FSUB f8, f8, f11
  887. FADD f9, f9, f10
  888. FSUB f12, f12, f15
  889. FADD f13, f13, f14
  890. #endif
  891. #endif
  /* Keep a copy of CO in BO; the strided path at FinishN1 stores through BO. */
  892. mr BO, CO
  /* When INCY equals 2 * SIZE the four output pairs are adjacent and are handled below with fixed offsets; otherwise use FinishN1. */
  893. cmpwi cr0, INCY, 2 * SIZE
  894. bne LL(FinishN1)
  /* Load the eight current output values (four pairs) from CO. */
  895. LFD f16, 0 * SIZE(CO)
  896. LFD f17, 1 * SIZE(CO)
  897. LFD f18, 2 * SIZE(CO)
  898. LFD f19, 3 * SIZE(CO)
  899. LFD f20, 4 * SIZE(CO)
  900. LFD f21, 5 * SIZE(CO)
  901. LFD f22, 6 * SIZE(CO)
  902. LFD f23, 7 * SIZE(CO)
  /* Accumulate the f30 (ALPHA_R) products. FMADD / FMADDR / FMSUBR are macros defined outside this chunk; NOTE(review): the R variants presumably flip sign per conjugation mode - confirm. */
  903. FMADD f16, f30, f0, f16
  904. FMADDR f17, f30, f1, f17
  905. FMADD f18, f30, f4, f18
  906. FMADDR f19, f30, f5, f19
  907. FMADD f20, f30, f8, f20
  908. FMADDR f21, f30, f9, f21
  909. FMADD f22, f30, f12, f22
  910. FMADDR f23, f30, f13, f23
  /* Accumulate the f31 (ALPHA_I) cross products: the first slot of each pair uses the second sum and vice versa. */
  911. FMSUBR f16, f31, f1, f16
  912. FMADD f17, f31, f0, f17
  913. FMSUBR f18, f31, f5, f18
  914. FMADD f19, f31, f4, f19
  915. FMSUBR f20, f31, f9, f20
  916. FMADD f21, f31, f8, f21
  917. FMSUBR f22, f31, f13, f22
  918. FMADD f23, f31, f12, f23
  /* Write the eight updated values back to the same locations. */
  919. STFD f16, 0 * SIZE(CO)
  920. STFD f17, 1 * SIZE(CO)
  921. STFD f18, 2 * SIZE(CO)
  922. STFD f19, 3 * SIZE(CO)
  923. STFD f20, 4 * SIZE(CO)
  924. STFD f21, 5 * SIZE(CO)
  925. STFD f22, 6 * SIZE(CO)
  926. STFD f23, 7 * SIZE(CO)
  /* Step CO past the four pairs, count down J and repeat from MainHead while J > 0; otherwise continue at Remain. */
  927. addi CO, CO, 8 * SIZE
  928. addi J, J, -1
  929. cmpwi cr0, J, 0
  930. bgt LL(MainHead)
  931. b LL(Remain)
  932. .align 4
  /* FinishN1: strided variant of the output update, reached from MainFinish when INCY differs from 2 * SIZE. */
  933. LL(FinishN1):
  /* Load four output pairs, advancing CO by INCY after each pair; CO ends up four strides ahead. */
  934. LFD f16, 0 * SIZE(CO)
  935. LFD f17, 1 * SIZE(CO)
  936. add CO, CO, INCY
  937. LFD f18, 0 * SIZE(CO)
  938. LFD f19, 1 * SIZE(CO)
  939. add CO, CO, INCY
  940. LFD f20, 0 * SIZE(CO)
  941. LFD f21, 1 * SIZE(CO)
  942. add CO, CO, INCY
  943. LFD f22, 0 * SIZE(CO)
  944. LFD f23, 1 * SIZE(CO)
  945. add CO, CO, INCY
  /* Same alpha update as the contiguous path: f30 products first ... */
  946. FMADD f16, f30, f0, f16
  947. FMADDR f17, f30, f1, f17
  948. FMADD f18, f30, f4, f18
  949. FMADDR f19, f30, f5, f19
  950. FMADD f20, f30, f8, f20
  951. FMADDR f21, f30, f9, f21
  952. FMADD f22, f30, f12, f22
  953. FMADDR f23, f30, f13, f23
  /* ... then the f31 cross products. */
  954. FMSUBR f16, f31, f1, f16
  955. FMADD f17, f31, f0, f17
  956. FMSUBR f18, f31, f5, f18
  957. FMADD f19, f31, f4, f19
  958. FMSUBR f20, f31, f9, f20
  959. FMADD f21, f31, f8, f21
  960. FMSUBR f22, f31, f13, f22
  961. FMADD f23, f31, f12, f23
  /* Store through BO, which still holds the CO value from before the loads, stepping by INCY between pairs. */
  962. STFD f16, 0 * SIZE(BO)
  963. STFD f17, 1 * SIZE(BO)
  964. add BO, BO, INCY
  965. STFD f18, 0 * SIZE(BO)
  966. STFD f19, 1 * SIZE(BO)
  967. add BO, BO, INCY
  968. STFD f20, 0 * SIZE(BO)
  969. STFD f21, 1 * SIZE(BO)
  970. add BO, BO, INCY
  971. STFD f22, 0 * SIZE(BO)
  972. STFD f23, 1 * SIZE(BO)
  /* Count down J and repeat from MainHead while J > 0; otherwise fall through to Remain. */
  973. addi J, J, -1
  974. cmpwi cr0, J, 0
  975. bgt LL(MainHead)
  976. .align 4
  /* Remain: J = N & 3, the outputs left over after the groups of four; the record form sets cr0, so a zero result skips straight to ISEnd. */
  977. LL(Remain):
  978. andi. J, N, 3
  979. ble LL(ISEnd)
  980. .align 4
  /* RemainHead: set up one leftover output: pointers, zeroed accumulators, and the first batch of operands for the unrolled loop. */
  981. LL(RemainHead):
  /* AO1 walks the current A position; A itself moves on by LDA for the next pass. BO restarts from XP. */
  982. mr AO1, A
  983. add A, A, LDA
  984. mr BO, XP
  /* Clear all sixteen accumulators f0-f15 from the value stored at FZERO. */
  985. lfd f0, FZERO
  986. fmr f1, f0
  987. fmr f2, f0
  988. fmr f3, f0
  989. fmr f4, f0
  990. fmr f5, f0
  991. fmr f6, f0
  992. fmr f7, f0
  993. fmr f8, f0
  994. fmr f9, f0
  995. fmr f10, f0
  996. fmr f11, f0
  997. fmr f12, f0
  998. fmr f13, f0
  999. fmr f14, f0
  1000. fmr f15, f0
  /* CTR = MIN_N >> 3 unrolled passes; when that is not positive go directly to the one-at-a-time loop at RemainN3. */
  1001. srawi. r0 , MIN_N, 3
  1002. mtspr CTR, r0
  1003. ble LL(RemainN3)
  /* Preload eight values from AO1 (offsets 0..7). */
  1004. LFD f16, 0 * SIZE(AO1)
  1005. LFD f17, 1 * SIZE(AO1)
  1006. LFD f18, 2 * SIZE(AO1)
  1007. LFD f19, 3 * SIZE(AO1)
  1008. LFD f20, 4 * SIZE(AO1)
  1009. LFD f21, 5 * SIZE(AO1)
  1010. LFD f22, 6 * SIZE(AO1)
  1011. LFD f23, 7 * SIZE(AO1)
  /* Preload eight values from BO (offsets 1..8). NOTE(review): the offsets start at 1 * SIZE, so XP presumably points one SIZE before the first element - confirm where XP is set. */
  1012. LFD f24, 1 * SIZE(BO)
  1013. LFD f25, 2 * SIZE(BO)
  1014. LFD f26, 3 * SIZE(BO)
  1015. LFD f27, 4 * SIZE(BO)
  1016. LFD f28, 5 * SIZE(BO)
  1017. LFD f29, 6 * SIZE(BO)
  1018. LFD f30, 7 * SIZE(BO)
  1019. LFD f31, 8 * SIZE(BO)
  /* bdz decrements CTR and branches when it hits zero: with a single pass, skip the loop body and run only the tail. */
  1020. bdz LL(RemainKernelSkip)
  1021. .align 4
  /* RemainKernel: unrolled loop body. Each pass consumes 16 values from AO1 and 16 from BO in two halves of eight, loading the following operands between the multiply-add groups. */
  1022. LL(RemainKernel):
  /* First half, values 0..3: every AO1 value is multiplied by both members of the matching BO pair (f0-f7). */
  1023. FMADD f0, f16, f24, f0
  1024. FMADD f1, f16, f25, f1
  1025. FMADD f2, f17, f24, f2
  1026. FMADD f3, f17, f25, f3
  1027. FMADD f4, f18, f26, f4
  1028. FMADD f5, f18, f27, f5
  1029. FMADD f6, f19, f26, f6
  1030. FMADD f7, f19, f27, f7
  /* Refill the registers just consumed with values 8..11 (BO offsets are one higher than the AO1 offsets). */
  1031. LFD f16, 8 * SIZE(AO1)
  1032. LFD f17, 9 * SIZE(AO1)
  1033. LFD f18, 10 * SIZE(AO1)
  1034. LFD f19, 11 * SIZE(AO1)
  1035. LFD f24, 9 * SIZE(BO)
  1036. LFD f25, 10 * SIZE(BO)
  1037. LFD f26, 11 * SIZE(BO)
  1038. LFD f27, 12 * SIZE(BO)
  /* First half, values 4..7 (f8-f15). */
  1039. FMADD f8, f20, f28, f8
  1040. FMADD f9, f20, f29, f9
  1041. FMADD f10, f21, f28, f10
  1042. FMADD f11, f21, f29, f11
  1043. FMADD f12, f22, f30, f12
  1044. FMADD f13, f22, f31, f13
  1045. FMADD f14, f23, f30, f14
  1046. FMADD f15, f23, f31, f15
  /* Refill with values 12..15. */
  1047. LFD f20, 12 * SIZE(AO1)
  1048. LFD f21, 13 * SIZE(AO1)
  1049. LFD f22, 14 * SIZE(AO1)
  1050. LFD f23, 15 * SIZE(AO1)
  1051. LFD f28, 13 * SIZE(BO)
  1052. LFD f29, 14 * SIZE(BO)
  1053. LFD f30, 15 * SIZE(BO)
  1054. LFD f31, 16 * SIZE(BO)
  /* Second half, values 8..11. */
  1055. FMADD f0, f16, f24, f0
  1056. FMADD f1, f16, f25, f1
  1057. FMADD f2, f17, f24, f2
  1058. FMADD f3, f17, f25, f3
  1059. FMADD f4, f18, f26, f4
  1060. FMADD f5, f18, f27, f5
  1061. FMADD f6, f19, f26, f6
  1062. FMADD f7, f19, f27, f7
  /* Load values 16..19, i.e. the first operands of the next pass. */
  1063. LFD f16, 16 * SIZE(AO1)
  1064. LFD f17, 17 * SIZE(AO1)
  1065. LFD f18, 18 * SIZE(AO1)
  1066. LFD f19, 19 * SIZE(AO1)
  1067. LFD f24, 17 * SIZE(BO)
  1068. LFD f25, 18 * SIZE(BO)
  1069. LFD f26, 19 * SIZE(BO)
  1070. LFD f27, 20 * SIZE(BO)
  /* Second half, values 12..15. */
  1071. FMADD f8, f20, f28, f8
  1072. FMADD f9, f20, f29, f9
  1073. FMADD f10, f21, f28, f10
  1074. FMADD f11, f21, f29, f11
  1075. FMADD f12, f22, f30, f12
  1076. FMADD f13, f22, f31, f13
  1077. FMADD f14, f23, f30, f14
  1078. FMADD f15, f23, f31, f15
  /* Load values 20..23 for the next pass. */
  1079. LFD f20, 20 * SIZE(AO1)
  1080. LFD f21, 21 * SIZE(AO1)
  1081. LFD f22, 22 * SIZE(AO1)
  1082. LFD f23, 23 * SIZE(AO1)
  1083. LFD f28, 21 * SIZE(BO)
  1084. LFD f29, 22 * SIZE(BO)
  1085. LFD f30, 23 * SIZE(BO)
  1086. LFD f31, 24 * SIZE(BO)
  /* Advance both pointers by the 16 values consumed, so the preloaded operands sit at offsets 0..7 / 1..8 again. */
  1087. addi AO1, AO1, 16 * SIZE
  1088. addi BO, BO, 16 * SIZE
  /* DCBT and PREA are defined outside this chunk; presumably a cache-touch hint ahead of AO1 - confirm. */
  1089. DCBT(AO1, PREA)
  /* bdnz decrements CTR and loops while it is non-zero; the final pass falls through to RemainKernelSkip. */
  1090. bdnz LL(RemainKernel)
  1091. .align 4
  /* RemainKernelSkip: tail of the unrolled loop. Processes the last 16 values with the same arithmetic but loads nothing past offset 15 of AO1 / 16 of BO. */
  1092. LL(RemainKernelSkip):
  /* Values 0..3 (already in registers). */
  1093. FMADD f0, f16, f24, f0
  1094. FMADD f1, f16, f25, f1
  1095. FMADD f2, f17, f24, f2
  1096. FMADD f3, f17, f25, f3
  1097. FMADD f4, f18, f26, f4
  1098. FMADD f5, f18, f27, f5
  1099. FMADD f6, f19, f26, f6
  1100. FMADD f7, f19, f27, f7
  /* Load values 8..11. */
  1101. LFD f16, 8 * SIZE(AO1)
  1102. LFD f17, 9 * SIZE(AO1)
  1103. LFD f18, 10 * SIZE(AO1)
  1104. LFD f19, 11 * SIZE(AO1)
  1105. LFD f24, 9 * SIZE(BO)
  1106. LFD f25, 10 * SIZE(BO)
  1107. LFD f26, 11 * SIZE(BO)
  1108. LFD f27, 12 * SIZE(BO)
  /* Values 4..7. */
  1109. FMADD f8, f20, f28, f8
  1110. FMADD f9, f20, f29, f9
  1111. FMADD f10, f21, f28, f10
  1112. FMADD f11, f21, f29, f11
  1113. FMADD f12, f22, f30, f12
  1114. FMADD f13, f22, f31, f13
  1115. FMADD f14, f23, f30, f14
  1116. FMADD f15, f23, f31, f15
  /* Load values 12..15. */
  1117. LFD f20, 12 * SIZE(AO1)
  1118. LFD f21, 13 * SIZE(AO1)
  1119. LFD f22, 14 * SIZE(AO1)
  1120. LFD f23, 15 * SIZE(AO1)
  1121. LFD f28, 13 * SIZE(BO)
  1122. LFD f29, 14 * SIZE(BO)
  1123. LFD f30, 15 * SIZE(BO)
  /* The update-form load also advances BO by 16 * SIZE, leaving it positioned for RemainN3. */
  1124. LFDU f31, 16 * SIZE(BO)
  /* Values 8..15. */
  1125. FMADD f0, f16, f24, f0
  1126. FMADD f1, f16, f25, f1
  1127. FMADD f2, f17, f24, f2
  1128. FMADD f3, f17, f25, f3
  1129. FMADD f4, f18, f26, f4
  1130. FMADD f5, f18, f27, f5
  1131. FMADD f6, f19, f26, f6
  1132. FMADD f7, f19, f27, f7
  1133. FMADD f8, f20, f28, f8
  1134. FMADD f9, f20, f29, f9
  1135. FMADD f10, f21, f28, f10
  1136. FMADD f11, f21, f29, f11
  1137. FMADD f12, f22, f30, f12
  1138. FMADD f13, f22, f31, f13
  1139. FMADD f14, f23, f30, f14
  1140. FMADD f15, f23, f31, f15
  /* AO1 catches up with BO: both are now 16 values further on. */
  1141. addi AO1, AO1, 16 * SIZE
  1142. .align 4
  /* RemainN3: handle the MIN_N & 7 pairs left after the unrolled passes, one pair per iteration. */
  1143. LL(RemainN3):
  /* CTR = MIN_N & 7; the record form sets cr0, so zero leftovers go straight to RemainFinish. */
  1144. andi. r0, MIN_N, 7
  1145. mtspr CTR, r0
  1146. ble LL(RemainFinish)
  1147. .align 4
  /* Preload one pair from AO1 and one from BO; LFDU moves BO on by 2 * SIZE and AO1 is bumped to match. */
  1148. LFD f16, 0 * SIZE(AO1)
  1149. LFD f17, 1 * SIZE(AO1)
  1150. LFD f24, 1 * SIZE(BO)
  1151. LFDU f25, 2 * SIZE(BO)
  1152. addi AO1, AO1, 2 * SIZE
  /* bdz decrements CTR and branches at zero: with a single pair, skip the loop and run only its tail. */
  1153. bdz LL(RemainN3KernelSkip)
  1154. .align 4
  /* RemainN3Kernel: accumulate the four products of the current pair into f0-f3, then load the next pair and advance both pointers by 2 * SIZE. */
  1155. LL(RemainN3Kernel):
  1156. FMADD f0, f16, f24, f0
  1157. FMADD f1, f16, f25, f1
  1158. FMADD f2, f17, f24, f2
  1159. FMADD f3, f17, f25, f3
  1160. LFD f16, 0 * SIZE(AO1)
  1161. LFD f17, 1 * SIZE(AO1)
  1162. LFD f24, 1 * SIZE(BO)
  1163. LFDU f25, 2 * SIZE(BO)
  1164. addi AO1, AO1, 2 * SIZE
  /* Loop while CTR, decremented by bdnz, is non-zero; the last pair is finished in RemainN3KernelSkip. */
  1165. bdnz LL(RemainN3Kernel)
  1166. .align 4
  /* RemainN3KernelSkip: accumulate the final, already loaded pair without loading anything further. */
  1167. LL(RemainN3KernelSkip):
  1168. FMADD f0, f16, f24, f0
  1169. FMADD f1, f16, f25, f1
  1170. FMADD f2, f17, f24, f2
  1171. FMADD f3, f17, f25, f3
  1172. .align 4
  /* RemainFinish: reduce the sixteen accumulators to one value pair, scale by alpha and add into the single output element at CO. */
  1173. LL(RemainFinish):
  /* f30 / f31 were used as operand registers in the loops above, so alpha is reloaded here. */
  1174. lfd f30, ALPHA_R
  1175. lfd f31, ALPHA_I
  /* Current output pair. */
  1176. LFD f16, 0 * SIZE(CO)
  1177. LFD f17, 1 * SIZE(CO)
  /* Reduction: f0-f3 += f4-f7, f8-f11 += f12-f15, then f0-f3 += f8-f11. */
  1178. FADD f0, f0, f4
  1179. FADD f1, f1, f5
  1180. FADD f2, f2, f6
  1181. FADD f3, f3, f7
  1182. FADD f8, f8, f12
  1183. FADD f9, f9, f13
  1184. FADD f10, f10, f14
  1185. FADD f11, f11, f15
  1186. FADD f0, f0, f8
  1187. FADD f1, f1, f9
  1188. FADD f2, f2, f10
  1189. FADD f3, f3, f11
  /* Collapse f0-f3 into the pair f0/f1 using the same CONJ / XCONJ sign table as MainFinish. */
  1190. #ifndef XCONJ
  1191. #ifndef CONJ
  1192. FSUB f0, f0, f3
  1193. FADD f1, f1, f2
  1194. #else
  1195. FADD f0, f0, f3
  1196. FSUB f1, f1, f2
  1197. #endif
  1198. #else
  1199. #ifndef CONJ
  1200. FADD f0, f0, f3
  1201. FSUB f1, f2, f1
  1202. #else
  1203. FSUB f0, f0, f3
  1204. FADD f1, f1, f2
  1205. #endif
  1206. #endif
  /* Alpha update: f30 products, then f31 cross products. FMADD / FMADDR / FMSUBR are macros defined outside this chunk. */
  1207. FMADD f16, f30, f0, f16
  1208. FMADDR f17, f30, f1, f17
  1209. FMSUBR f16, f31, f1, f16
  1210. FMADD f17, f31, f0, f17
  /* Store the pair, step CO by INCY, count down J and repeat from RemainHead while J > 0. */
  1211. STFD f16, 0 * SIZE(CO)
  1212. STFD f17, 1 * SIZE(CO)
  1213. add CO, CO, INCY
  1214. addi J, J, -1
  1215. cmpi cr0, 0, J, 0
  1216. bgt LL(RemainHead)
  1217. .align 4
  /* ISEnd: end of one pass of the outer IS loop. */
  1218. LL(ISEnd):
  /* subf computes A - PLDA_M. NOTE(review): presumably rewinds the per-output LDA steps and moves A to the next strip - confirm where PLDA_M is computed. */
  1219. subf A, PLDA_M, A
  /* Advance IS by the constant P and loop back to ISLoop while IS < M. */
  1220. addi IS, IS, P
  1221. cmp cr0, 0, IS, M
  1222. blt LL(ISLoop)
  1223. .align 4
  /* End: function exit. Returns 0 in r3, reloads f14-f31 and r14-r25 from the stack frame, releases the frame and returns. */
  1224. LL(End):
  1225. li r3, 0
  /* Floating-point registers f14-f31 occupy eight-byte slots at SP + 0 .. SP + 136; presumably saved by the prologue, which is outside this chunk. */
  1226. lfd f14, 0(SP)
  1227. lfd f15, 8(SP)
  1228. lfd f16, 16(SP)
  1229. lfd f17, 24(SP)
  1230. lfd f18, 32(SP)
  1231. lfd f19, 40(SP)
  1232. lfd f20, 48(SP)
  1233. lfd f21, 56(SP)
  1234. lfd f22, 64(SP)
  1235. lfd f23, 72(SP)
  1236. lfd f24, 80(SP)
  1237. lfd f25, 88(SP)
  1238. lfd f26, 96(SP)
  1239. lfd f27, 104(SP)
  1240. lfd f28, 112(SP)
  1241. lfd f29, 120(SP)
  1242. lfd f30, 128(SP)
  1243. lfd f31, 136(SP)
  /* General registers r14-r25 follow from SP + 144: eight-byte slots (ld) in 64-bit builds, four-byte slots (lwz) otherwise. */
  1244. #ifdef __64BIT__
  1245. ld r14, 144(SP)
  1246. ld r15, 152(SP)
  1247. ld r16, 160(SP)
  1248. ld r17, 168(SP)
  1249. ld r18, 176(SP)
  1250. ld r19, 184(SP)
  1251. ld r20, 192(SP)
  1252. ld r21, 200(SP)
  1253. ld r22, 208(SP)
  1254. ld r23, 216(SP)
  1255. ld r24, 224(SP)
  1256. ld r25, 232(SP)
  1257. #else
  1258. lwz r14, 144(SP)
  1259. lwz r15, 148(SP)
  1260. lwz r16, 152(SP)
  1261. lwz r17, 156(SP)
  1262. lwz r18, 160(SP)
  1263. lwz r19, 164(SP)
  1264. lwz r20, 168(SP)
  1265. lwz r21, 172(SP)
  1266. lwz r22, 176(SP)
  1267. lwz r23, 180(SP)
  1268. lwz r24, 184(SP)
  1269. lwz r25, 188(SP)
  1270. #endif
  /* Pop the STACKSIZE-byte frame and return to the caller. */
  1271. addi SP, SP, STACKSIZE
  1272. blr
  1273. EPILOGUE
  1274. #endif