You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemv_hummer_n.S 32 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* ASSEMBLER is defined before common.h is included (presumably to select */
  /* the assembler-side definitions in that header -- TODO confirm).        */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* ---- Integer register roles ------------------------------------------ */
  /* The entry code scales LDA, INCX and INCY by BASE_SHIFT and returns     */
  /* early (branch to .L999) when M <= 0 or N <= 0.                         */
  40. #define M r3 /* element count per column: inner loops run M >> 3 times, then handle M & 4, M & 2, M & 1 */
  41. #define N r4 /* column count: outer loops take N >> 2 groups of four columns, then N & 2, N & 1 */
  42. #define A r6 /* current column base of the matrix; advanced by LDA per column consumed */
  43. #define LDA r7 /* column stride of A; shifted left by BASE_SHIFT at entry */
  44. #define X r8 /* X pointer; pre-decremented by INCX so update-form loads can walk it */
  45. #define INCX r9 /* X stride; shifted left by BASE_SHIFT at entry */
  46. #define Y r10 /* Y base pointer; biased once per path so update-form accesses start at the first element */
  47. #define INCY r5 /* Y stride; loaded from 8(SP) at entry, then shifted left by BASE_SHIFT */
  48. #define I r11 /* not referenced in the portion of the file visible to this review */
  49. #define J r12 /* outer-loop counter (N >> 2) and scratch for the N & 2 / N & 1 tests */
  50. #define INCY2 r24 /* INCY + INCY: step used for paired Y loads and stores */
  51. #define A1 r25 /* column pointers: A1 = A, A2 = A1 + LDA, A3 = A2 + LDA, A4 = A3 + LDA */
  52. #define A2 r26
  53. #define A3 r27
  54. #define A4 r28
  55. #define YL r29 /* read cursor over Y (feeds the yl* registers) */
  56. #define YS r30 /* write cursor over Y (drains the ys* registers) */
  57. #define INC2 r31 /* constant 2 * SIZE: step of the paired loads from A */
  /* ---- Floating-point register roles ----------------------------------- */
  /* yl* receive values loaded from Y; note that f1 is skipped here because */
  /* it is reserved for alpha below.                                        */
  58. #define yl1 f0
  59. #define yl2 f2
  60. #define yl3 f3
  61. #define yl4 f4
  /* ys* are the multiply-add destinations that get stored back to Y.       */
  62. #define ys1 f5
  63. #define ys2 f6
  64. #define ys3 f7
  65. #define ys4 f8
  /* Fifth load/store slots; in the visible code they are used only by the  */
  /* unaligned-Y paths (.L40 onward), which shift values between registers. */
  66. #define yl5 f27
  67. #define ys5 f28
  /* alpha1/alpha2 are loaded from X and then multiplied by alpha; they are */
  /* the scalar operands of the fxcpmadd/fxcsmadd instructions.             */
  68. #define alpha1 f9
  69. #define alpha2 f10
  /* a1..a16 hold data loaded from A. In the four-column loops:             */
  /*   a1,a5,a9,a13  <- A1     a2,a6,a10,a14 <- A2                          */
  /*   a3,a7,a11,a15 <- A3     a4,a8,a12,a16 <- A4                          */
  /* f14 and above are stored to the stack by the entry code before use.    */
  70. #define a1 f11
  71. #define a2 f12
  72. #define a3 f13
  73. #define a4 f14
  74. #define a5 f15
  75. #define a6 f16
  76. #define a7 f17
  77. #define a8 f18
  78. #define a9 f19
  79. #define a10 f20
  80. #define a11 f21
  81. #define a12 f22
  82. #define a13 f23
  83. #define a14 f24
  84. #define a15 f25
  85. #define a16 f26
  /* Scaling factor applied to every X element; the entry code runs         */
  /* `fsmfp alpha, alpha` on it before the first multiply.                  */
  86. #define alpha f1
  87. PROLOGUE
  88. PROFCODE
  89. li r0, -16
  90. lwz INCY, 8(SP)
  91. stfpdux f14, SP, r0
  92. stfpdux f15, SP, r0
  93. stfpdux f16, SP, r0
  94. stfpdux f17, SP, r0
  95. stfpdux f18, SP, r0
  96. stfpdux f19, SP, r0
  97. stfpdux f20, SP, r0
  98. stfpdux f21, SP, r0
  99. stfpdux f22, SP, r0
  100. stfpdux f23, SP, r0
  101. stfpdux f24, SP, r0
  102. stfpdux f25, SP, r0
  103. stfpdux f26, SP, r0
  104. stfpdux f27, SP, r0
  105. stfpdux f28, SP, r0
  106. stfpdux f29, SP, r0
  107. stfpdux f30, SP, r0
  108. stfpdux f31, SP, r0
  109. stwu r31, -4(SP)
  110. stwu r30, -4(SP)
  111. stwu r29, -4(SP)
  112. stwu r28, -4(SP)
  113. stwu r27, -4(SP)
  114. stwu r26, -4(SP)
  115. stwu r25, -4(SP)
  116. stwu r24, -4(SP)
  117. stwu r23, -4(SP)
  118. stwu r22, -4(SP)
  119. stwu r21, -4(SP)
  120. stwu r20, -4(SP)
  121. stwu r19, -4(SP)
  122. stwu r18, -4(SP)
  123. stwu r17, -4(SP)
  124. stwu r16, -4(SP)
  125. slwi LDA, LDA, BASE_SHIFT
  126. slwi INCX, INCX, BASE_SHIFT
  127. slwi INCY, INCY, BASE_SHIFT
  128. fsmfp alpha, alpha
  129. cmpwi cr0, M, 0
  130. ble- .L999
  131. cmpwi cr0, N, 0
  132. ble- .L999
  133. add INCY2, INCY, INCY
  134. li INC2, 2 * SIZE
  135. sub X, X, INCX
  136. andi. r0, A, 2 * SIZE - 1
  137. # bne .L100
  138. # All cases for aligned A, even LDA
  139. cmpwi cr0, INCY, SIZE
  140. bne .L70
  141. andi. r0, Y, 2 * SIZE - 1
  142. bne .L40
  143. # A : aligned LDA : even Y : Unit Aligned
  144. sub A, A, INC2
  145. sub Y, Y, INCY2
  146. srawi. J, N, 2
  147. ble .L20
  148. .align 4
  149. .L11:
  150. LFDUX alpha1, X, INCX
  151. mr A1, A
  152. add A2, A, LDA
  153. add A3, A2, LDA
  154. LFSDUX alpha1, X, INCX
  155. LFDUX alpha2, X, INCX
  156. add A4, A3, LDA
  157. add A, A4, LDA
  158. mr YL, Y
  159. LFSDUX alpha2, X, INCX
  160. fpmul alpha1, alpha, alpha1
  161. mr YS, Y
  162. srawi. r0, M, 3
  163. mtspr CTR, r0
  164. fpmul alpha2, alpha, alpha2
  165. ble .L15
  166. LFPDUX yl1, YL, INCY2
  167. LFPDUX yl2, YL, INCY2
  168. LFPDUX yl3, YL, INCY2
  169. LFPDUX yl4, YL, INCY2
  170. LFPDUX a1, A1, INC2
  171. LFPDUX a5, A1, INC2
  172. LFPDUX a9, A1, INC2
  173. LFPDUX a13, A1, INC2
  174. LFPDUX a2, A2, INC2
  175. LFPDUX a6, A2, INC2
  176. LFPDUX a10, A2, INC2
  177. LFPDUX a14, A2, INC2
  178. LFPDUX a3, A3, INC2
  179. LFPDUX a7, A3, INC2
  180. LFPDUX a11, A3, INC2
  181. LFPDUX a15, A3, INC2
  182. LFPDUX a4, A4, INC2
  183. fxcpmadd ys1, alpha1, a1, yl1
  184. LFPDUX a8, A4, INC2
  185. fxcpmadd ys2, alpha1, a5, yl2
  186. LFPDUX a12, A4, INC2
  187. fxcpmadd ys3, alpha1, a9, yl3
  188. LFPDUX a16, A4, INC2
  189. fxcpmadd ys4, alpha1, a13, yl4
  190. bdz .L13
  191. .align 4
  192. .L12:
  193. LFPDUX yl1, YL, INCY2
  194. fxcsmadd ys1, alpha1, a2, ys1
  195. LFPDUX a1, A1, INC2
  196. fxcsmadd ys2, alpha1, a6, ys2
  197. LFPDUX a5, A1, INC2
  198. fxcsmadd ys3, alpha1, a10, ys3
  199. LFPDUX a9, A1, INC2
  200. fxcsmadd ys4, alpha1, a14, ys4
  201. LFPDUX a13, A1, INC2
  202. LFPDUX yl2, YL, INCY2
  203. fxcpmadd ys1, alpha2, a3, ys1
  204. LFPDUX a2, A2, INC2
  205. fxcpmadd ys2, alpha2, a7, ys2
  206. LFPDUX a6, A2, INC2
  207. fxcpmadd ys3, alpha2, a11, ys3
  208. LFPDUX a10, A2, INC2
  209. fxcpmadd ys4, alpha2, a15, ys4
  210. LFPDUX a14, A2, INC2
  211. LFPDUX yl3, YL, INCY2
  212. fxcsmadd ys1, alpha2, a4, ys1
  213. LFPDUX a3, A3, INC2
  214. fxcsmadd ys2, alpha2, a8, ys2
  215. LFPDUX a7, A3, INC2
  216. fxcsmadd ys3, alpha2, a12, ys3
  217. LFPDUX a11, A3, INC2
  218. fxcsmadd ys4, alpha2, a16, ys4
  219. LFPDUX a15, A3, INC2
  220. LFPDUX yl4, YL, INCY2
  221. STFPDUX ys1, YS, INCY2
  222. STFPDUX ys2, YS, INCY2
  223. STFPDUX ys3, YS, INCY2
  224. STFPDUX ys4, YS, INCY2
  225. LFPDUX a4, A4, INC2
  226. fxcpmadd ys1, alpha1, a1, yl1
  227. LFPDUX a8, A4, INC2
  228. fxcpmadd ys2, alpha1, a5, yl2
  229. LFPDUX a12, A4, INC2
  230. fxcpmadd ys3, alpha1, a9, yl3
  231. LFPDUX a16, A4, INC2
  232. fxcpmadd ys4, alpha1, a13, yl4
  233. bdnz .L12
  234. .align 4
  235. .L13:
  236. fxcsmadd ys1, alpha1, a2, ys1
  237. fxcsmadd ys2, alpha1, a6, ys2
  238. fxcsmadd ys3, alpha1, a10, ys3
  239. fxcsmadd ys4, alpha1, a14, ys4
  240. fxcpmadd ys1, alpha2, a3, ys1
  241. fxcpmadd ys2, alpha2, a7, ys2
  242. fxcpmadd ys3, alpha2, a11, ys3
  243. fxcpmadd ys4, alpha2, a15, ys4
  244. fxcsmadd ys1, alpha2, a4, ys1
  245. fxcsmadd ys2, alpha2, a8, ys2
  246. fxcsmadd ys3, alpha2, a12, ys3
  247. fxcsmadd ys4, alpha2, a16, ys4
  248. STFPDUX ys1, YS, INCY2
  249. STFPDUX ys2, YS, INCY2
  250. STFPDUX ys3, YS, INCY2
  251. STFPDUX ys4, YS, INCY2
  252. .align 4
  253. .L15:
  254. andi. r0, M, 7
  255. ble .L19
  256. andi. r0, M, 4
  257. ble .L17
  258. LFPDUX yl1, YL, INCY2
  259. LFPDUX a1, A1, INC2
  260. LFPDUX yl2, YL, INCY2
  261. LFPDUX a5, A1, INC2
  262. LFPDUX a2, A2, INC2
  263. LFPDUX a6, A2, INC2
  264. LFPDUX a3, A3, INC2
  265. LFPDUX a7, A3, INC2
  266. LFPDUX a4, A4, INC2
  267. LFPDUX a8, A4, INC2
  268. fxcpmadd ys1, alpha1, a1, yl1
  269. fxcpmadd ys2, alpha1, a5, yl2
  270. fxcsmadd ys1, alpha1, a2, ys1
  271. fxcsmadd ys2, alpha1, a6, ys2
  272. fxcpmadd ys1, alpha2, a3, ys1
  273. fxcpmadd ys2, alpha2, a7, ys2
  274. fxcsmadd ys1, alpha2, a4, ys1
  275. fxcsmadd ys2, alpha2, a8, ys2
  276. STFPDUX ys1, YS, INCY2
  277. STFPDUX ys2, YS, INCY2
  278. .align 4
  279. .L17:
  280. andi. r0, M, 2
  281. ble .L18
  282. LFPDUX yl1, YL, INCY2
  283. LFPDUX a1, A1, INC2
  284. LFPDUX a2, A2, INC2
  285. LFPDUX a3, A3, INC2
  286. LFPDUX a4, A4, INC2
  287. fxcpmadd ys1, alpha1, a1, yl1
  288. fxcsmadd ys1, alpha1, a2, ys1
  289. fxcpmadd ys1, alpha2, a3, ys1
  290. fxcsmadd ys1, alpha2, a4, ys1
  291. STFPDUX ys1, YS, INCY2
  292. .align 4
  293. .L18:
  294. andi. r0, M, 1
  295. ble .L19
  296. LFDUX yl1, YL, INCY2
  297. LFDUX a1, A1, INC2
  298. LFDUX a2, A2, INC2
  299. LFDUX a3, A3, INC2
  300. LFDUX a4, A4, INC2
  301. fxcpmadd ys1, alpha1, a1, yl1
  302. fxcsmadd ys1, alpha1, a2, ys1
  303. fxcpmadd ys1, alpha2, a3, ys1
  304. fxcsmadd ys1, alpha2, a4, ys1
  305. STFDUX ys1, YS, INCY2
  306. .align 4
  307. .L19:
  308. addi J, J, -1
  309. cmpi cr0, 0, J, 0
  310. bgt .L11
  311. .align 4
  312. .L20:
  313. andi. J, N, 2
  314. ble .L30
  315. LFDUX alpha1, X, INCX
  316. mr A1, A
  317. add A2, A, LDA
  318. add A, A2, LDA
  319. LFSDUX alpha1, X, INCX
  320. mr YL, Y
  321. mr YS, Y
  322. fpmul alpha1, alpha, alpha1
  323. srawi. r0, M, 3
  324. mtspr CTR, r0
  325. ble .L25
  326. LFPDUX yl1, YL, INCY2
  327. LFPDUX a1, A1, INC2
  328. LFPDUX yl2, YL, INCY2
  329. LFPDUX a5, A1, INC2
  330. LFPDUX yl3, YL, INCY2
  331. LFPDUX a9, A1, INC2
  332. LFPDUX yl4, YL, INCY2
  333. LFPDUX a13, A1, INC2
  334. LFPDUX a2, A2, INC2
  335. LFPDUX a6, A2, INC2
  336. LFPDUX a10, A2, INC2
  337. LFPDUX a14, A2, INC2
  338. bdz .L23
  339. .align 4
  340. .L22:
  341. fxcpmadd ys1, alpha1, a1, yl1
  342. LFPDUX a1, A1, INC2
  343. LFPDUX yl1, YL, INCY2
  344. fxcpmadd ys2, alpha1, a5, yl2
  345. LFPDUX a5, A1, INC2
  346. LFPDUX yl2, YL, INCY2
  347. fxcpmadd ys3, alpha1, a9, yl3
  348. LFPDUX a9, A1, INC2
  349. LFPDUX yl3, YL, INCY2
  350. fxcpmadd ys4, alpha1, a13, yl4
  351. LFPDUX a13, A1, INC2
  352. LFPDUX yl4, YL, INCY2
  353. fxcsmadd ys1, alpha1, a2, ys1
  354. LFPDUX a2, A2, INC2
  355. fxcsmadd ys2, alpha1, a6, ys2
  356. LFPDUX a6, A2, INC2
  357. fxcsmadd ys3, alpha1, a10, ys3
  358. LFPDUX a10, A2, INC2
  359. fxcsmadd ys4, alpha1, a14, ys4
  360. LFPDUX a14, A2, INC2
  361. STFPDUX ys1, YS, INCY2
  362. STFPDUX ys2, YS, INCY2
  363. STFPDUX ys3, YS, INCY2
  364. STFPDUX ys4, YS, INCY2
  365. bdnz .L22
  366. .align 4
  367. .L23:
  368. fxcpmadd ys1, alpha1, a1, yl1
  369. fxcpmadd ys2, alpha1, a5, yl2
  370. fxcpmadd ys3, alpha1, a9, yl3
  371. fxcpmadd ys4, alpha1, a13, yl4
  372. fxcsmadd ys1, alpha1, a2, ys1
  373. fxcsmadd ys2, alpha1, a6, ys2
  374. fxcsmadd ys3, alpha1, a10, ys3
  375. fxcsmadd ys4, alpha1, a14, ys4
  376. STFPDUX ys1, YS, INCY2
  377. STFPDUX ys2, YS, INCY2
  378. STFPDUX ys3, YS, INCY2
  379. STFPDUX ys4, YS, INCY2
  380. .align 4
  381. .L25:
  382. andi. r0, M, 7
  383. ble .L30
  384. andi. r0, M, 4
  385. ble .L27
  386. LFPDUX yl1, YL, INCY2
  387. LFPDUX a1, A1, INC2
  388. LFPDUX a2, A2, INC2
  389. LFPDUX yl2, YL, INCY2
  390. LFPDUX a5, A1, INC2
  391. LFPDUX a6, A2, INC2
  392. fxcpmadd ys1, alpha1, a1, yl1
  393. fxcsmadd ys1, alpha1, a2, ys1
  394. fxcpmadd ys2, alpha1, a5, yl2
  395. fxcsmadd ys2, alpha1, a6, ys2
  396. STFPDUX ys1, YS, INCY2
  397. STFPDUX ys2, YS, INCY2
  398. .align 4
  399. .L27:
  400. andi. r0, M, 2
  401. ble .L28
  402. LFPDUX yl1, YL, INCY2
  403. LFPDUX a1, A1, INC2
  404. LFPDUX a2, A2, INC2
  405. fxcpmadd ys1, alpha1, a1, yl1
  406. fxcsmadd ys1, alpha1, a2, ys1
  407. STFPDUX ys1, YS, INCY2
  408. .align 4
  409. .L28:
  410. andi. r0, M, 1
  411. ble .L30
  412. LFDUX yl1, YL, INCY2
  413. LFDUX a1, A1, INC2
  414. LFDUX a2, A2, INC2
  415. fxcpmadd ys1, alpha1, a1, yl1
  416. fxcsmadd ys1, alpha1, a2, ys1
  417. STFDUX ys1, YS, INCY2
  418. .align 4
  419. .L30:
  420. andi. J, N, 1
  421. ble .L999
  422. LFDUX alpha1, X, INCX
  423. mr A1, A
  424. mr YL, Y
  425. mr YS, Y
  426. fmul alpha1, alpha, alpha1
  427. srawi. r0, M, 3
  428. mtspr CTR, r0
  429. ble .L35
  430. LFPDUX yl1, YL, INCY2
  431. LFPDUX a1, A1, INC2
  432. LFPDUX yl2, YL, INCY2
  433. LFPDUX a5, A1, INC2
  434. LFPDUX yl3, YL, INCY2
  435. LFPDUX a9, A1, INC2
  436. LFPDUX yl4, YL, INCY2
  437. LFPDUX a13, A1, INC2
  438. bdz .L33
  439. .align 4
  440. .L32:
  441. fxcpmadd ys1, alpha1, a1, yl1
  442. LFPDUX yl1, YL, INCY2
  443. LFPDUX a1, A1, INC2
  444. fxcpmadd ys2, alpha1, a5, yl2
  445. LFPDUX yl2, YL, INCY2
  446. LFPDUX a5, A1, INC2
  447. fxcpmadd ys3, alpha1, a9, yl3
  448. LFPDUX yl3, YL, INCY2
  449. LFPDUX a9, A1, INC2
  450. fxcpmadd ys4, alpha1, a13, yl4
  451. LFPDUX yl4, YL, INCY2
  452. LFPDUX a13, A1, INC2
  453. STFPDUX ys1, YS, INCY2
  454. STFPDUX ys2, YS, INCY2
  455. STFPDUX ys3, YS, INCY2
  456. STFPDUX ys4, YS, INCY2
  457. bdnz .L32
  458. .align 4
  459. .L33:
  460. fxcpmadd ys1, alpha1, a1, yl1
  461. fxcpmadd ys2, alpha1, a5, yl2
  462. fxcpmadd ys3, alpha1, a9, yl3
  463. fxcpmadd ys4, alpha1, a13, yl4
  464. STFPDUX ys1, YS, INCY2
  465. STFPDUX ys2, YS, INCY2
  466. STFPDUX ys3, YS, INCY2
  467. STFPDUX ys4, YS, INCY2
  468. .align 4
  469. .L35:
  470. andi. r0, M, 7
  471. ble .L999
  472. andi. r0, M, 4
  473. ble .L37
  474. LFPDUX yl1, YL, INCY2
  475. LFPDUX a1, A1, INC2
  476. LFPDUX yl2, YL, INCY2
  477. LFPDUX a5, A1, INC2
  478. fxcpmadd ys1, alpha1, a1, yl1
  479. fxcpmadd ys2, alpha1, a5, yl2
  480. STFPDUX ys1, YS, INCY2
  481. STFPDUX ys2, YS, INCY2
  482. .align 4
  483. .L37:
  484. andi. r0, M, 2
  485. ble .L38
  486. LFPDUX yl1, YL, INCY2
  487. LFPDUX a1, A1, INC2
  488. fxcpmadd ys1, alpha1, a1, yl1
  489. STFPDUX ys1, YS, INCY2
  490. .align 4
  491. .L38:
  492. andi. r0, M, 1
  493. ble .L999
  494. LFDUX yl1, YL, INCY2
  495. LFDUX a1, A1, INC2
  496. fxcpmadd ys1, alpha1, a1, yl1
  497. STFDUX ys1, YS, INCY2
  498. b .L999
  499. .align 4
  500. .L40:
  501. # A : aligned LDA : even Y : Unaligned
  502. sub A, A, INC2
  503. sub Y, Y, INCY
  504. srawi. J, N, 2
  505. ble .L50
  506. .align 4
  507. .L41:
  508. LFDUX alpha1, X, INCX
  509. LFSDUX alpha1, X, INCX
  510. LFDUX alpha2, X, INCX
  511. LFSDUX alpha2, X, INCX
  512. fpmul alpha1, alpha, alpha1
  513. fpmul alpha2, alpha, alpha2
  514. mr A1, A
  515. add A2, A, LDA
  516. add A3, A2, LDA
  517. add A4, A3, LDA
  518. add A, A4, LDA
  519. mr YL, Y
  520. sub YS, Y, INCY2
  521. LFSDX ys1, YS, INCY2
  522. LFDX yl1, YL, INCY
  523. srawi. r0, M, 3
  524. mtspr CTR, r0
  525. ble .L45
  526. LFPDUX a1, A1, INC2
  527. LFPDUX a5, A1, INC2
  528. LFPDUX a9, A1, INC2
  529. LFPDUX a13, A1, INC2
  530. LFXDUX yl2, YL, INCY2
  531. LFXDUX yl3, YL, INCY2
  532. LFXDUX yl4, YL, INCY2
  533. LFXDUX yl5, YL, INCY2
  534. LFPDUX a2, A2, INC2
  535. LFPDUX a6, A2, INC2
  536. LFPDUX a10, A2, INC2
  537. LFPDUX a14, A2, INC2
  538. LFPDUX a3, A3, INC2
  539. LFPDUX a7, A3, INC2
  540. LFPDUX a11, A3, INC2
  541. LFPDUX a15, A3, INC2
  542. LFPDUX a4, A4, INC2
  543. fsmr yl1, yl2
  544. LFPDUX a8, A4, INC2
  545. fsmr yl2, yl3
  546. LFPDUX a12, A4, INC2
  547. fsmr yl3, yl4
  548. LFPDUX a16, A4, INC2
  549. fsmr yl4, yl5
  550. bdz .L43
  551. .align 4
  552. .L42:
  553. fxcpmadd ys2, alpha1, a1, yl1
  554. LFPDUX a1, A1, INC2
  555. fxcpmadd ys3, alpha1, a5, yl2
  556. LFPDUX a5, A1, INC2
  557. fxcpmadd ys4, alpha1, a9, yl3
  558. LFPDUX a9, A1, INC2
  559. fxcpmadd ys5, alpha1, a13, yl4
  560. LFPDUX a13, A1, INC2
  561. fxcsmadd ys2, alpha1, a2, ys2
  562. LFPDUX a2, A2, INC2
  563. fxcsmadd ys3, alpha1, a6, ys3
  564. LFPDUX a6, A2, INC2
  565. fxcsmadd ys4, alpha1, a10, ys4
  566. LFPDUX a10, A2, INC2
  567. fxcsmadd ys5, alpha1, a14, ys5
  568. LFPDUX a14, A2, INC2
  569. fxcpmadd ys2, alpha2, a3, ys2
  570. LFPDUX a3, A3, INC2
  571. fxcpmadd ys3, alpha2, a7, ys3
  572. LFPDUX a7, A3, INC2
  573. fxcpmadd ys4, alpha2, a11, ys4
  574. LFPDUX a11, A3, INC2
  575. fxcpmadd ys5, alpha2, a15, ys5
  576. LFPDUX a15, A3, INC2
  577. fxcsmadd ys2, alpha2, a4, ys2
  578. LFPDUX a4, A4, INC2
  579. fxcsmadd ys3, alpha2, a8, ys3
  580. LFPDUX a8, A4, INC2
  581. fxcsmadd ys4, alpha2, a12, ys4
  582. LFPDUX a12, A4, INC2
  583. fxcsmadd ys5, alpha2, a16, ys5
  584. LFPDUX a16, A4, INC2
  585. fmr yl1, yl5
  586. LFXDUX yl2, YL, INCY2
  587. fmr ys1, ys2
  588. LFXDUX yl3, YL, INCY2
  589. fmr ys2, ys3
  590. LFXDUX yl4, YL, INCY2
  591. fmr ys3, ys4
  592. LFXDUX yl5, YL, INCY2
  593. fmr ys4, ys5
  594. STFXDUX ys1, YS, INCY2
  595. fsmr ys1, ys5
  596. STFXDUX ys2, YS, INCY2
  597. fsmr yl1, yl2
  598. STFXDUX ys3, YS, INCY2
  599. fsmr yl2, yl3
  600. STFXDUX ys4, YS, INCY2
  601. fsmr yl3, yl4
  602. fsmr yl4, yl5
  603. bdnz .L42
  604. .align 4
  605. .L43:
  606. fxcpmadd ys2, alpha1, a1, yl1
  607. fxcpmadd ys3, alpha1, a5, yl2
  608. fxcpmadd ys4, alpha1, a9, yl3
  609. fxcpmadd ys5, alpha1, a13, yl4
  610. fxcsmadd ys2, alpha1, a2, ys2
  611. fxcsmadd ys3, alpha1, a6, ys3
  612. fxcsmadd ys4, alpha1, a10, ys4
  613. fxcsmadd ys5, alpha1, a14, ys5
  614. fxcpmadd ys2, alpha2, a3, ys2
  615. fxcpmadd ys3, alpha2, a7, ys3
  616. fxcpmadd ys4, alpha2, a11, ys4
  617. fxcpmadd ys5, alpha2, a15, ys5
  618. fxcsmadd ys2, alpha2, a4, ys2
  619. fxcsmadd ys3, alpha2, a8, ys3
  620. fxcsmadd ys4, alpha2, a12, ys4
  621. fxcsmadd ys5, alpha2, a16, ys5
  622. fmr ys1, ys2
  623. fmr ys2, ys3
  624. fmr ys3, ys4
  625. fmr ys4, ys5
  626. fmr yl1, yl5
  627. STFXDUX ys1, YS, INCY2
  628. fsmr ys1, ys5
  629. STFXDUX ys2, YS, INCY2
  630. STFXDUX ys3, YS, INCY2
  631. STFXDUX ys4, YS, INCY2
  632. .align 4
  633. .L45:
  634. andi. r0, M, 7
  635. ble .L48
  636. andi. r0, M, 4
  637. ble .L46
  638. LFXDUX yl2, YL, INCY2
  639. LFXDUX yl3, YL, INCY2
  640. LFPDUX a1, A1, INC2
  641. LFPDUX a5, A1, INC2
  642. LFPDUX a2, A2, INC2
  643. LFPDUX a6, A2, INC2
  644. LFPDUX a3, A3, INC2
  645. LFPDUX a7, A3, INC2
  646. LFPDUX a4, A4, INC2
  647. fsmr yl1, yl2
  648. LFPDUX a8, A4, INC2
  649. fsmr yl2, yl3
  650. fxcpmadd ys2, alpha1, a1, yl1
  651. fxcpmadd ys3, alpha1, a5, yl2
  652. fxcsmadd ys2, alpha1, a2, ys2
  653. fxcsmadd ys3, alpha1, a6, ys3
  654. fxcpmadd ys2, alpha2, a3, ys2
  655. fxcpmadd ys3, alpha2, a7, ys3
  656. fxcsmadd ys2, alpha2, a4, ys2
  657. fxcsmadd ys3, alpha2, a8, ys3
  658. fmr yl1, yl3
  659. fmr ys1, ys2
  660. fmr ys2, ys3
  661. STFXDUX ys1, YS, INCY2
  662. fsmr ys1, ys3
  663. STFXDUX ys2, YS, INCY2
  664. .align 4
  665. .L46:
  666. andi. r0, M, 2
  667. ble .L47
  668. LFXDUX yl2, YL, INCY2
  669. LFPDUX a1, A1, INC2
  670. LFPDUX a2, A2, INC2
  671. LFPDUX a3, A3, INC2
  672. LFPDUX a4, A4, INC2
  673. fsmr yl1, yl2
  674. fxcpmadd ys2, alpha1, a1, yl1
  675. fxcsmadd ys2, alpha1, a2, ys2
  676. fxcpmadd ys2, alpha2, a3, ys2
  677. fxcsmadd ys2, alpha2, a4, ys2
  678. fmr yl1, yl2
  679. fmr ys1, ys2
  680. STFXDUX ys1, YS, INCY2
  681. fsmr ys1, ys2
  682. .align 4
  683. .L47:
  684. andi. r0, M, 1
  685. ble .L48
  686. LFDUX a1, A1, INC2
  687. LFDUX a2, A2, INC2
  688. LFDUX a3, A3, INC2
  689. LFDUX a4, A4, INC2
  690. fxcpmadd ys2, alpha1, a1, yl1
  691. fxcsmadd ys2, alpha1, a2, ys2
  692. fxcpmadd ys2, alpha2, a3, ys2
  693. fxcsmadd ys2, alpha2, a4, ys2
  694. STFSDX ys1, YS, INCY2
  695. add YS, YS, INCY
  696. STFDX ys2, YS, INCY2
  697. b .L49
  698. .align 4
  699. .L48:
  700. STFSDUX ys1, YS, INCY2
  701. .align 4
  702. .L49:
  703. addi J, J, -1
  704. cmpi cr0, 0, J, 0
  705. bgt .L41
  706. .align 4
  707. .L50:
  708. andi. J, N, 2
  709. ble .L60
  710. LFDUX alpha1, X, INCX
  711. mr A1, A
  712. add A2, A, LDA
  713. add A, A2, LDA
  714. LFSDUX alpha1, X, INCX
  715. mr YL, Y
  716. sub YS, Y, INCY2
  717. fpmul alpha1, alpha, alpha1
  718. LFSDX ys1, YS, INCY2
  719. LFDX yl1, YL, INCY
  720. srawi. r0, M, 3
  721. mtspr CTR, r0
  722. ble .L55
  723. LFPDUX a1, A1, INC2
  724. LFPDUX a5, A1, INC2
  725. LFPDUX a9, A1, INC2
  726. LFPDUX a13, A1, INC2
  727. LFXDUX yl2, YL, INCY2
  728. LFXDUX yl3, YL, INCY2
  729. LFXDUX yl4, YL, INCY2
  730. LFXDUX yl5, YL, INCY2
  731. LFPDUX a2, A2, INC2
  732. fsmr yl1, yl2
  733. LFPDUX a6, A2, INC2
  734. fsmr yl2, yl3
  735. LFPDUX a10, A2, INC2
  736. fsmr yl3, yl4
  737. LFPDUX a14, A2, INC2
  738. fsmr yl4, yl5
  739. bdz .L53
  740. .align 4
  741. .L52:
  742. fxcpmadd ys2, alpha1, a1, yl1
  743. LFPDUX a1, A1, INC2
  744. fxcpmadd ys3, alpha1, a5, yl2
  745. LFPDUX a5, A1, INC2
  746. fxcpmadd ys4, alpha1, a9, yl3
  747. LFPDUX a9, A1, INC2
  748. fxcpmadd ys5, alpha1, a13, yl4
  749. LFPDUX a13, A1, INC2
  750. fxcsmadd ys2, alpha1, a2, ys2
  751. LFPDUX a2, A2, INC2
  752. fxcsmadd ys3, alpha1, a6, ys3
  753. LFPDUX a6, A2, INC2
  754. fxcsmadd ys4, alpha1, a10, ys4
  755. LFPDUX a10, A2, INC2
  756. fxcsmadd ys5, alpha1, a14, ys5
  757. LFPDUX a14, A2, INC2
  758. fmr yl1, yl5
  759. LFXDUX yl2, YL, INCY2
  760. fmr ys1, ys2
  761. LFXDUX yl3, YL, INCY2
  762. fmr ys2, ys3
  763. LFXDUX yl4, YL, INCY2
  764. fmr ys3, ys4
  765. LFXDUX yl5, YL, INCY2
  766. fmr ys4, ys5
  767. STFXDUX ys1, YS, INCY2
  768. fsmr ys1, ys5
  769. STFXDUX ys2, YS, INCY2
  770. fsmr yl1, yl2
  771. STFXDUX ys3, YS, INCY2
  772. fsmr yl2, yl3
  773. STFXDUX ys4, YS, INCY2
  774. fsmr yl3, yl4
  775. fsmr yl4, yl5
  776. bdnz .L52
  777. .align 4
  778. .L53:
  779. fxcpmadd ys2, alpha1, a1, yl1
  780. fxcpmadd ys3, alpha1, a5, yl2
  781. fxcpmadd ys4, alpha1, a9, yl3
  782. fxcpmadd ys5, alpha1, a13, yl4
  783. fxcsmadd ys2, alpha1, a2, ys2
  784. fxcsmadd ys3, alpha1, a6, ys3
  785. fxcsmadd ys4, alpha1, a10, ys4
  786. fxcsmadd ys5, alpha1, a14, ys5
  787. fmr yl1, yl5
  788. fmr ys1, ys2
  789. fmr ys2, ys3
  790. fmr ys3, ys4
  791. fmr ys4, ys5
  792. STFXDUX ys1, YS, INCY2
  793. fsmr ys1, ys5
  794. STFXDUX ys2, YS, INCY2
  795. STFXDUX ys3, YS, INCY2
  796. STFXDUX ys4, YS, INCY2
  797. .align 4
  798. .L55:
  799. andi. r0, M, 7
  800. ble .L59
  801. andi. r0, M, 4
  802. ble .L57
  803. LFXDUX yl2, YL, INCY2
  804. LFXDUX yl3, YL, INCY2
  805. LFPDUX a1, A1, INC2
  806. LFPDUX a2, A2, INC2
  807. LFPDUX a5, A1, INC2
  808. LFPDUX a6, A2, INC2
  809. fsmr yl1, yl2
  810. fsmr yl2, yl3
  811. fxcpmadd ys2, alpha1, a1, yl1
  812. fxcsmadd ys2, alpha1, a2, ys2
  813. fxcpmadd ys3, alpha1, a5, yl2
  814. fxcsmadd ys3, alpha1, a6, ys3
  815. fmr yl1, yl3
  816. fmr ys1, ys2
  817. fmr ys2, ys3
  818. STFXDUX ys1, YS, INCY2
  819. STFXDUX ys2, YS, INCY2
  820. fsmr ys1, ys3
  821. .align 4
  822. .L57:
  823. andi. r0, M, 2
  824. ble .L58
  825. LFXDUX yl2, YL, INCY2
  826. LFPDUX a1, A1, INC2
  827. LFPDUX a2, A2, INC2
  828. fsmr yl1, yl2
  829. fxcpmadd ys2, alpha1, a1, yl1
  830. fxcsmadd ys2, alpha1, a2, ys2
  831. fmr yl1, yl2
  832. fmr ys1, ys2
  833. STFXDUX ys1, YS, INCY2
  834. fsmr ys1, ys2
  835. .align 4
  836. .L58:
  837. andi. r0, M, 1
  838. ble .L59
  839. LFDUX a1, A1, INC2
  840. LFDUX a2, A2, INC2
  841. fxmr alpha2, alpha1
  842. fmadd ys1, alpha1, a1, yl1
  843. fmadd ys1, alpha2, a2, ys1
  844. STFXDUX ys1, YS, INCY2
  845. b .L60
  846. .align 4
  847. .L59:
  848. STFSDUX ys1, YS, INCY2
  849. .align 4
  850. .L60:
  851. andi. J, N, 1
  852. ble .L999
  853. LFDUX alpha1, X, INCX
  854. mr A1, A
  855. mr YL, Y
  856. sub YS, Y, INCY2
  857. fmul alpha1, alpha, alpha1
  858. LFSDX ys1, YS, INCY2
  859. LFDX yl1, YL, INCY
  860. srawi. r0, M, 3
  861. mtspr CTR, r0
  /* Tail of a code path whose entry and setup lie above this excerpt; */
  /* the initial contents of yl1, ys1, alpha1 and CTR are set up there. */
  /* NOTE(review): the comments assume the upper-case load/store mnemonics */
  /* are macros for the like-named FP2 (double hummer) instructions, and */
  /* that M, A1, YL, YS, INC2, INCY2 are register aliases defined earlier */
  /* in the file - confirm. */
  /* Skip the unrolled loop, based on cr0 as left by an instruction above this excerpt. */
  862. ble .L65
  /* Prime the loop: four loads through YL, four through A1. */
  863. LFXDUX yl2, YL, INCY2
  864. LFXDUX yl3, YL, INCY2
  865. LFXDUX yl4, YL, INCY2
  866. LFXDUX yl5, YL, INCY2
  867. LFPDUX a1, A1, INC2
  868. LFPDUX a5, A1, INC2
  869. LFPDUX a9, A1, INC2
  870. LFPDUX a13, A1, INC2
  /* Splice neighbouring y registers: assuming fsmr copies only the secondary */
  /* half, each ylN takes the secondary half of the register loaded after it. */
  871. fsmr yl1, yl2
  872. fsmr yl2, yl3
  873. fsmr yl3, yl4
  874. fsmr yl4, yl5
  /* bdz decrements CTR; when it reaches zero only the drain at .L63 runs. */
  875. bdz .L63
  876. .align 4
  /* Steady state: multiply-adds on the data loaded in the previous pass, */
  /* interleaved with the loads for the next pass and four stores through YS. */
  877. .L62:
  878. fxcpmadd ys2, alpha1, a1, yl1
  879. LFPDUX a1, A1, INC2
  880. fxcpmadd ys3, alpha1, a5, yl2
  881. LFXDUX yl2, YL, INCY2
  882. fxcpmadd ys4, alpha1, a9, yl3
  883. LFXDUX yl3, YL, INCY2
  884. fxcpmadd ys5, alpha1, a13, yl4
  885. LFXDUX yl4, YL, INCY2
  /* Carry the primary half of yl5 into yl1 before yl5 is reloaded. */
  886. fmr yl1, yl5
  887. LFXDUX yl5, YL, INCY2
  /* fmr shifts primary halves down one register (ys2->ys1 ... ys5->ys4); */
  /* the secondary halves stay in place, so each stored register combines */
  /* halves of two adjacent results (assuming fmr moves the primary half only). */
  888. fmr ys1, ys2
  889. LFPDUX a5, A1, INC2
  890. fmr ys2, ys3
  891. LFPDUX a9, A1, INC2
  892. fmr ys3, ys4
  893. LFPDUX a13, A1, INC2
  894. fmr ys4, ys5
  895. STFXDUX ys1, YS, INCY2
  /* Keep the secondary half of ys5 in ys1 for the next store through ys1. */
  896. fsmr ys1, ys5
  897. STFXDUX ys2, YS, INCY2
  /* Re-splice the freshly loaded y registers for the next pass. */
  898. fsmr yl1, yl2
  899. STFXDUX ys3, YS, INCY2
  900. fsmr yl2, yl3
  901. STFXDUX ys4, YS, INCY2
  902. fsmr yl3, yl4
  903. fsmr yl4, yl5
  /* Repeat while CTR is non-zero after the decrement. */
  904. bdnz .L62
  905. .align 4
  /* Drain: same multiply-adds, fmr shuffles and four stores as .L62, */
  /* with no further loads from A1 or YL. */
  906. .L63:
  907. fxcpmadd ys2, alpha1, a1, yl1
  908. fxcpmadd ys3, alpha1, a5, yl2
  909. fxcpmadd ys4, alpha1, a9, yl3
  910. fxcpmadd ys5, alpha1, a13, yl4
  911. fmr yl1, yl5
  912. fmr ys1, ys2
  913. fmr ys2, ys3
  914. fmr ys3, ys4
  915. fmr ys4, ys5
  916. STFXDUX ys1, YS, INCY2
  917. fsmr ys1, ys5
  918. STFXDUX ys2, YS, INCY2
  919. STFXDUX ys3, YS, INCY2
  920. STFXDUX ys4, YS, INCY2
  921. .align 4
  /* Remainder handling on the low three bits of M. andi. yields a */
  /* non-negative value, so each ble below is taken only when the tested */
  /* bits are all zero. */
  922. .L65:
  923. andi. r0, M, 7
  924. ble .L69
  /* Bit 2 of M set: two loads from each stream, two multiply-adds, two stores. */
  925. andi. r0, M, 4
  926. ble .L67
  927. LFXDUX yl2, YL, INCY2
  928. LFXDUX yl3, YL, INCY2
  929. LFPDUX a1, A1, INC2
  930. LFPDUX a5, A1, INC2
  931. fsmr yl1, yl2
  932. fsmr yl2, yl3
  933. fxcpmadd ys2, alpha1, a1, yl1
  934. fxcpmadd ys3, alpha1, a5, yl2
  /* Carry the primary half of yl3 forward in yl1 for the steps below. */
  935. fmr yl1, yl3
  936. fmr ys1, ys2
  937. fmr ys2, ys3
  938. STFXDUX ys1, YS, INCY2
  /* Carry the secondary half of ys3 forward in ys1. */
  939. fsmr ys1, ys3
  940. STFXDUX ys2, YS, INCY2
  941. .align 4
  /* Bit 1 of M set: one load from each stream, one multiply-add, one store. */
  942. .L67:
  943. andi. r0, M, 2
  944. ble .L68
  945. LFPDUX a1, A1, INC2
  946. LFXDUX yl2, YL, INCY2
  947. fsmr yl1, yl2
  948. fxcpmadd ys2, alpha1, a1, yl1
  949. fmr yl1, yl2
  950. fmr ys1, ys2
  951. STFXDUX ys1, YS, INCY2
  952. fsmr ys1, ys2
  953. .align 4
  /* Bit 0 of M set: one LFDUX from A1, a scalar fmadd into ys1 using the */
  /* value carried in yl1, one STFXDUX of ys1, then jump to the exit at .L999. */
  954. .L68:
  955. andi. r0, M, 1
  956. ble .L69
  957. LFDUX a1, A1, INC2
  958. fmadd ys1, alpha1, a1, yl1
  959. STFXDUX ys1, YS, INCY2
  960. b .L999
  961. .align 4
  /* Reached when (M & 7) == 0 or bit 0 of M is clear: one STFSDUX of ys1 */
  /* (presumably flushing the secondary half carried in ys1 - confirm), */
  /* then jump to the exit. */
  962. .L69:
  963. STFSDUX ys1, YS, INCY2
  964. b .L999
  /* Path entered at .L70 (the branch that selects it is above this excerpt). */
  /* It walks A in groups of four pointers spaced LDA apart (A1..A4) and */
  /* accumulates into Y with stride INCY. */
  /* NOTE(review): the comments assume the upper-case load/store mnemonics */
  /* are macros for the like-named FP2 (double hummer) instructions, and */
  /* that M, N, J, A, A1..A4, LDA, X, Y, YL, YS, INCX, INCY, INC2 are */
  /* register aliases defined earlier in the file - confirm. */
  965. .align 4
  966. .L70:
  /* Bias A and Y back by one stride; presumably so the first update-form */
  /* access lands on the first element - confirm. */
  967. sub A, A, INC2
  968. sub Y, Y, INCY
  /* J = N >> 2 (arithmetic shift); with no full group of four go to .L80. */
  969. srawi. J, N, 2
  970. ble .L80
  971. .align 4
  /* One pass per group of four: load four values from X (stride INCX) into */
  /* the two halves of alpha1 and alpha2, set A1..A4 = A, A+LDA, A+2*LDA, */
  /* A+3*LDA, advance A by 4*LDA, and restart YL and YS from Y. */
  972. .L71:
  973. LFDUX alpha1, X, INCX
  974. mr A1, A
  975. add A2, A, LDA
  976. add A3, A2, LDA
  977. LFSDUX alpha1, X, INCX
  978. LFDUX alpha2, X, INCX
  979. add A4, A3, LDA
  980. add A, A4, LDA
  981. mr YL, Y
  982. LFSDUX alpha2, X, INCX
  /* Scale the loaded X values by alpha. */
  983. fpmul alpha1, alpha, alpha1
  984. mr YS, Y
  /* CTR = M >> 3 passes of the unrolled loop; none -> remainder code at .L75. */
  985. srawi. r0, M, 3
  986. mtspr CTR, r0
  987. fpmul alpha2, alpha, alpha2
  988. ble .L75
  /* Prime the loop: eight Y values gathered into yl1..yl4 (LFDUX + LFSDUX */
  /* per register) and four loads from each of A1..A4. */
  989. LFDUX yl1, YL, INCY
  990. LFPDUX a1, A1, INC2
  991. LFPDUX a5, A1, INC2
  992. LFPDUX a9, A1, INC2
  993. LFPDUX a13, A1, INC2
  994. LFSDUX yl1, YL, INCY
  995. LFDUX yl2, YL, INCY
  996. LFPDUX a2, A2, INC2
  997. LFPDUX a6, A2, INC2
  998. LFPDUX a10, A2, INC2
  999. LFPDUX a14, A2, INC2
  1000. LFSDUX yl2, YL, INCY
  1001. LFDUX yl3, YL, INCY
  1002. LFPDUX a3, A3, INC2
  1003. LFPDUX a7, A3, INC2
  1004. LFPDUX a11, A3, INC2
  1005. LFPDUX a15, A3, INC2
  1006. LFSDUX yl3, YL, INCY
  1007. LFDUX yl4, YL, INCY
  1008. LFPDUX a4, A4, INC2
  1009. LFPDUX a8, A4, INC2
  1010. LFPDUX a12, A4, INC2
  1011. LFPDUX a16, A4, INC2
  1012. LFSDUX yl4, YL, INCY
  /* bdz decrements CTR; when it reaches zero only the drain at .L73 runs. */
  1013. bdz .L73
  1014. .align 4
  /* Steady state. Each ysN accumulates ylN plus four products: alpha1 with */
  /* the A1 and A2 data, alpha2 with the A3 and A4 data (assuming fxcpmadd / */
  /* fxcsmadd use the primary / secondary half of their first source as the */
  /* multiplier). Loads for the next pass are interleaved with the arithmetic. */
  1015. .L72:
  1016. fxcpmadd ys1, alpha1, a1, yl1
  1017. LFPDUX a1, A1, INC2
  1018. LFDUX yl1, YL, INCY
  1019. fxcpmadd ys2, alpha1, a5, yl2
  1020. LFPDUX a5, A1, INC2
  1021. fxcpmadd ys3, alpha1, a9, yl3
  1022. LFPDUX a9, A1, INC2
  1023. fxcpmadd ys4, alpha1, a13, yl4
  1024. LFPDUX a13, A1, INC2
  1025. LFSDUX yl1, YL, INCY
  1026. fxcsmadd ys1, alpha1, a2, ys1
  1027. LFPDUX a2, A2, INC2
  1028. LFDUX yl2, YL, INCY
  1029. fxcsmadd ys2, alpha1, a6, ys2
  1030. LFPDUX a6, A2, INC2
  1031. fxcsmadd ys3, alpha1, a10, ys3
  1032. LFPDUX a10, A2, INC2
  1033. fxcsmadd ys4, alpha1, a14, ys4
  1034. LFPDUX a14, A2, INC2
  1035. LFSDUX yl2, YL, INCY
  1036. fxcpmadd ys1, alpha2, a3, ys1
  1037. LFPDUX a3, A3, INC2
  1038. LFDUX yl3, YL, INCY
  1039. fxcpmadd ys2, alpha2, a7, ys2
  1040. LFPDUX a7, A3, INC2
  1041. fxcpmadd ys3, alpha2, a11, ys3
  1042. LFPDUX a11, A3, INC2
  1043. fxcpmadd ys4, alpha2, a15, ys4
  1044. LFPDUX a15, A3, INC2
  1045. LFSDUX yl3, YL, INCY
  1046. fxcsmadd ys1, alpha2, a4, ys1
  1047. LFPDUX a4, A4, INC2
  1048. LFDUX yl4, YL, INCY
  1049. fxcsmadd ys2, alpha2, a8, ys2
  1050. LFPDUX a8, A4, INC2
  1051. fxcsmadd ys3, alpha2, a12, ys3
  1052. LFPDUX a12, A4, INC2
  1053. fxcsmadd ys4, alpha2, a16, ys4
  1054. LFPDUX a16, A4, INC2
  1055. LFSDUX yl4, YL, INCY
  /* Write the eight results through YS, two stores (STFDUX then STFSDUX) per register. */
  1056. STFDUX ys1, YS, INCY
  1057. STFSDUX ys1, YS, INCY
  1058. STFDUX ys2, YS, INCY
  1059. STFSDUX ys2, YS, INCY
  1060. STFDUX ys3, YS, INCY
  1061. STFSDUX ys3, YS, INCY
  1062. STFDUX ys4, YS, INCY
  1063. STFSDUX ys4, YS, INCY
  /* Repeat while CTR is non-zero after the decrement. */
  1064. bdnz .L72
  1065. .align 4
  /* Drain: the same sixteen multiply-adds and eight stores, no further loads. */
  1066. .L73:
  1067. fxcpmadd ys1, alpha1, a1, yl1
  1068. fxcpmadd ys2, alpha1, a5, yl2
  1069. fxcpmadd ys3, alpha1, a9, yl3
  1070. fxcpmadd ys4, alpha1, a13, yl4
  1071. fxcsmadd ys1, alpha1, a2, ys1
  1072. fxcsmadd ys2, alpha1, a6, ys2
  1073. fxcsmadd ys3, alpha1, a10, ys3
  1074. fxcsmadd ys4, alpha1, a14, ys4
  1075. fxcpmadd ys1, alpha2, a3, ys1
  1076. fxcpmadd ys2, alpha2, a7, ys2
  1077. fxcpmadd ys3, alpha2, a11, ys3
  1078. fxcpmadd ys4, alpha2, a15, ys4
  1079. fxcsmadd ys1, alpha2, a4, ys1
  1080. fxcsmadd ys2, alpha2, a8, ys2
  1081. fxcsmadd ys3, alpha2, a12, ys3
  1082. fxcsmadd ys4, alpha2, a16, ys4
  1083. STFDUX ys1, YS, INCY
  1084. STFSDUX ys1, YS, INCY
  1085. STFDUX ys2, YS, INCY
  1086. STFSDUX ys2, YS, INCY
  1087. STFDUX ys3, YS, INCY
  1088. STFSDUX ys3, YS, INCY
  1089. STFDUX ys4, YS, INCY
  1090. STFSDUX ys4, YS, INCY
  1091. .align 4
  /* Remainder handling on the low three bits of M. andi. yields a */
  /* non-negative value, so each ble below is taken only on zero. */
  1092. .L75:
  1093. andi. r0, M, 7
  1094. ble .L79
  /* Bit 2 of M set: four Y values, two loads from each of A1..A4, four stores. */
  1095. andi. r0, M, 4
  1096. ble .L77
  1097. LFDUX yl1, YL, INCY
  1098. LFPDUX a1, A1, INC2
  1099. LFPDUX a5, A1, INC2
  1100. LFSDUX yl1, YL, INCY
  1101. LFPDUX a2, A2, INC2
  1102. LFPDUX a6, A2, INC2
  1103. LFDUX yl2, YL, INCY
  1104. LFPDUX a3, A3, INC2
  1105. LFPDUX a7, A3, INC2
  1106. LFSDUX yl2, YL, INCY
  1107. LFPDUX a4, A4, INC2
  1108. LFPDUX a8, A4, INC2
  1109. fxcpmadd ys1, alpha1, a1, yl1
  1110. fxcpmadd ys2, alpha1, a5, yl2
  1111. fxcsmadd ys1, alpha1, a2, ys1
  1112. fxcsmadd ys2, alpha1, a6, ys2
  1113. fxcpmadd ys1, alpha2, a3, ys1
  1114. fxcpmadd ys2, alpha2, a7, ys2
  1115. fxcsmadd ys1, alpha2, a4, ys1
  1116. fxcsmadd ys2, alpha2, a8, ys2
  1117. STFDUX ys1, YS, INCY
  1118. STFSDUX ys1, YS, INCY
  1119. STFDUX ys2, YS, INCY
  1120. STFSDUX ys2, YS, INCY
  1121. .align 4
  /* Bit 1 of M set: two Y values, one load from each of A1..A4, two stores. */
  1122. .L77:
  1123. andi. r0, M, 2
  1124. ble .L78
  1125. LFDUX yl1, YL, INCY
  1126. LFPDUX a1, A1, INC2
  1127. LFPDUX a2, A2, INC2
  1128. LFSDUX yl1, YL, INCY
  1129. LFPDUX a3, A3, INC2
  1130. LFPDUX a4, A4, INC2
  1131. fxcpmadd ys1, alpha1, a1, yl1
  1132. fxcsmadd ys1, alpha1, a2, ys1
  1133. fxcpmadd ys1, alpha2, a3, ys1
  1134. fxcsmadd ys1, alpha2, a4, ys1
  1135. STFDUX ys1, YS, INCY
  1136. STFSDUX ys1, YS, INCY
  1137. .align 4
  /* Bit 0 of M set: every load is a plain LFDUX and only one STFDUX is */
  /* issued, so a single Y element is updated. */
  1138. .L78:
  1139. andi. r0, M, 1
  1140. ble .L79
  1141. LFDUX yl1, YL, INCY
  1142. LFDUX a1, A1, INC2
  1143. LFDUX a2, A2, INC2
  1144. LFDUX a3, A3, INC2
  1145. LFDUX a4, A4, INC2
  1146. fxcpmadd ys1, alpha1, a1, yl1
  1147. fxcsmadd ys1, alpha1, a2, ys1
  1148. fxcpmadd ys1, alpha2, a3, ys1
  1149. fxcsmadd ys1, alpha2, a4, ys1
  1150. STFDUX ys1, YS, INCY
  1151. .align 4
  /* Close the outer loop: J -= 1, repeat from .L71 while J > 0. */
  1152. .L79:
  1153. addi J, J, -1
  1154. cmpi cr0, 0, J, 0
  1155. bgt .L71
  /* Two-pointer step: runs once when bit 1 of N is set, using A1 = A and */
  /* A2 = A + LDA, then advances A by 2*LDA. */
  /* NOTE(review): the comments assume the upper-case load/store mnemonics */
  /* are macros for the like-named FP2 (double hummer) instructions, and */
  /* that M, N, J, A, A1, A2, LDA, X, Y, YL, YS, INCX, INCY, INC2 are */
  /* register aliases defined earlier in the file - confirm. */
  1156. .align 4
  1157. .L80:
  /* andi. yields a non-negative value, so ble is taken only when bit 1 of N is clear. */
  1158. andi. J, N, 2
  1159. ble .L90
  /* Two values from X (stride INCX) go into the two halves of alpha1. */
  1160. LFDUX alpha1, X, INCX
  1161. mr A1, A
  1162. add A2, A, LDA
  1163. add A, A2, LDA
  1164. LFSDUX alpha1, X, INCX
  1165. mr YL, Y
  1166. mr YS, Y
  /* Scale the loaded X values by alpha. */
  1167. fpmul alpha1, alpha, alpha1
  /* CTR = M >> 3 passes of the unrolled loop; none -> remainder code at .L85. */
  1168. srawi. r0, M, 3
  1169. mtspr CTR, r0
  1170. ble .L85
  /* Prime the loop. Y values are loaded alternately into ylN and a9..a12, */
  /* all with LFDUX; they are merged by fsmfp at the top of the loop. */
  /* A1 data lands in a1, a5, a3, a7 and A2 data in a2, a6, a4, a8. */
  1171. LFDUX yl1, YL, INCY
  1172. LFDUX a9, YL, INCY
  1173. LFDUX yl2, YL, INCY
  1174. LFDUX a10, YL, INCY
  1175. LFPDUX a1, A1, INC2
  1176. LFPDUX a5, A1, INC2
  1177. LFPDUX a3, A1, INC2
  1178. LFPDUX a7, A1, INC2
  1179. LFDUX yl3, YL, INCY
  1180. LFDUX a11, YL, INCY
  1181. LFDUX yl4, YL, INCY
  1182. LFDUX a12, YL, INCY
  1183. LFPDUX a2, A2, INC2
  1184. LFPDUX a6, A2, INC2
  1185. LFPDUX a4, A2, INC2
  1186. LFPDUX a8, A2, INC2
  /* bdz decrements CTR; when it reaches zero only the drain at .L83 runs. */
  1187. bdz .L83
  1188. .align 4
  1189. .L82:
  /* Merge each Y pair: assuming fsmfp copies the primary half of its source */
  /* into the secondary half of its destination, ylN now holds two Y values. */
  1190. fsmfp yl1, a9
  1191. fsmfp yl2, a10
  1192. fsmfp yl3, a11
  1193. fsmfp yl4, a12
  /* ysN = ylN + (alpha1 with A1 data), with loads for the next pass interleaved. */
  1194. fxcpmadd ys1, alpha1, a1, yl1
  1195. LFDUX yl1, YL, INCY
  1196. LFDUX a9, YL, INCY
  1197. LFPDUX a1, A1, INC2
  1198. fxcpmadd ys2, alpha1, a5, yl2
  1199. LFDUX yl2, YL, INCY
  1200. LFDUX a10, YL, INCY
  1201. LFPDUX a5, A1, INC2
  1202. fxcpmadd ys3, alpha1, a3, yl3
  1203. LFDUX yl3, YL, INCY
  1204. LFDUX a11, YL, INCY
  1205. LFPDUX a3, A1, INC2
  1206. fxcpmadd ys4, alpha1, a7, yl4
  1207. LFDUX yl4, YL, INCY
  1208. LFDUX a12, YL, INCY
  1209. LFPDUX a7, A1, INC2
  /* ysN += (alpha1 with A2 data); fxcsmadd presumably uses the secondary half of alpha1. */
  1210. fxcsmadd ys1, alpha1, a2, ys1
  1211. LFPDUX a2, A2, INC2
  1212. fxcsmadd ys2, alpha1, a6, ys2
  1213. LFPDUX a6, A2, INC2
  1214. fxcsmadd ys3, alpha1, a4, ys3
  1215. LFPDUX a4, A2, INC2
  1216. fxcsmadd ys4, alpha1, a8, ys4
  1217. LFPDUX a8, A2, INC2
  /* Write the eight results through YS, two stores per register. */
  1218. STFDUX ys1, YS, INCY
  1219. STFSDUX ys1, YS, INCY
  1220. STFDUX ys2, YS, INCY
  1221. STFSDUX ys2, YS, INCY
  1222. STFDUX ys3, YS, INCY
  1223. STFSDUX ys3, YS, INCY
  1224. STFDUX ys4, YS, INCY
  1225. STFSDUX ys4, YS, INCY
  /* Repeat while CTR is non-zero after the decrement. */
  1226. bdnz .L82
  1227. .align 4
  /* Drain: same merges, eight multiply-adds and eight stores, no further loads. */
  1228. .L83:
  1229. fsmfp yl1, a9
  1230. fsmfp yl2, a10
  1231. fsmfp yl3, a11
  1232. fsmfp yl4, a12
  1233. fxcpmadd ys1, alpha1, a1, yl1
  1234. fxcpmadd ys2, alpha1, a5, yl2
  1235. fxcpmadd ys3, alpha1, a3, yl3
  1236. fxcpmadd ys4, alpha1, a7, yl4
  1237. fxcsmadd ys1, alpha1, a2, ys1
  1238. fxcsmadd ys2, alpha1, a6, ys2
  1239. fxcsmadd ys3, alpha1, a4, ys3
  1240. fxcsmadd ys4, alpha1, a8, ys4
  1241. STFDUX ys1, YS, INCY
  1242. STFSDUX ys1, YS, INCY
  1243. STFDUX ys2, YS, INCY
  1244. STFSDUX ys2, YS, INCY
  1245. STFDUX ys3, YS, INCY
  1246. STFSDUX ys3, YS, INCY
  1247. STFDUX ys4, YS, INCY
  1248. STFSDUX ys4, YS, INCY
  1249. .align 4
  /* Remainder handling on the low three bits of M. andi. yields a */
  /* non-negative value, so each ble below is taken only on zero. */
  1250. .L85:
  1251. andi. r0, M, 7
  1252. ble .L90
  /* Bit 2 of M set: four Y values (LFDUX + LFSDUX per register), two loads */
  /* from each of A1 and A2, four stores. */
  1253. andi. r0, M, 4
  1254. ble .L87
  1255. LFDUX yl1, YL, INCY
  1256. LFPDUX a1, A1, INC2
  1257. LFPDUX a2, A2, INC2
  1258. LFSDUX yl1, YL, INCY
  1259. LFDUX yl2, YL, INCY
  1260. LFPDUX a5, A1, INC2
  1261. LFPDUX a6, A2, INC2
  1262. LFSDUX yl2, YL, INCY
  1263. fxcpmadd ys1, alpha1, a1, yl1
  1264. fxcpmadd ys2, alpha1, a5, yl2
  1265. fxcsmadd ys1, alpha1, a2, ys1
  1266. fxcsmadd ys2, alpha1, a6, ys2
  1267. STFDUX ys1, YS, INCY
  1268. STFSDUX ys1, YS, INCY
  1269. STFDUX ys2, YS, INCY
  1270. STFSDUX ys2, YS, INCY
  1271. .align 4
  /* Bit 1 of M set: two Y values, one load from each of A1 and A2, two stores. */
  1272. .L87:
  1273. andi. r0, M, 2
  1274. ble .L88
  1275. LFDUX yl1, YL, INCY
  1276. LFPDUX a1, A1, INC2
  1277. LFPDUX a2, A2, INC2
  1278. LFSDUX yl1, YL, INCY
  1279. fxcpmadd ys1, alpha1, a1, yl1
  1280. fxcsmadd ys1, alpha1, a2, ys1
  1281. STFDUX ys1, YS, INCY
  1282. STFSDUX ys1, YS, INCY
  1283. .align 4
  /* Bit 0 of M set: every load is a plain LFDUX and only one STFDUX is */
  /* issued, so a single Y element is updated. */
  1284. .L88:
  1285. andi. r0, M, 1
  1286. ble .L90
  1287. LFDUX yl1, YL, INCY
  1288. LFDUX a1, A1, INC2
  1289. LFDUX a2, A2, INC2
  1290. fxcpmadd ys1, alpha1, a1, yl1
  1291. fxcsmadd ys1, alpha1, a2, ys1
  1292. STFDUX ys1, YS, INCY
  /* Single-pointer step: runs once when bit 0 of N is set, using A1 = A only. */
  /* NOTE(review): the comments assume the upper-case load/store mnemonics */
  /* are macros for the like-named FP2 (double hummer) instructions, and */
  /* that M, N, J, A, A1, X, Y, YL, YS, INCX, INCY, INC2 are register */
  /* aliases defined earlier in the file - confirm. */
  1293. .align 4
  1294. .L90:
  /* andi. yields a non-negative value, so ble is taken only when bit 0 of N is clear. */
  1295. andi. J, N, 1
  1296. ble .L999
  /* One value from X, scaled by alpha with a scalar fmul. */
  1297. LFDUX alpha1, X, INCX
  1298. mr A1, A
  1299. mr YL, Y
  1300. mr YS, Y
  1301. fmul alpha1, alpha, alpha1
  /* CTR = M >> 3 passes of the unrolled loop; none -> remainder code at .L95. */
  1302. srawi. r0, M, 3
  1303. mtspr CTR, r0
  1304. ble .L95
  /* Prime the loop. Each Y pair is split across two registers: LFDUX into */
  /* ylN and LFSDUX into a2/a4/a6/a8; they are merged by fmr in the loop. */
  1305. LFDUX yl1, YL, INCY
  1306. LFSDUX a2, YL, INCY
  1307. LFDUX yl2, YL, INCY
  1308. LFSDUX a4, YL, INCY
  1309. LFDUX yl3, YL, INCY
  1310. LFSDUX a6, YL, INCY
  1311. LFDUX yl4, YL, INCY
  1312. LFSDUX a8, YL, INCY
  1313. LFPDUX a1, A1, INC2
  1314. LFPDUX a5, A1, INC2
  1315. LFPDUX a9, A1, INC2
  1316. LFPDUX a13, A1, INC2
  /* bdz decrements CTR; when it reaches zero only the drain at .L93 runs. */
  1317. bdz .L93
  1318. .align 4
  1319. .L92:
  /* Merge each Y pair: assuming fmr moves only the primary half, a2/a4/a6/a8 */
  /* keep the half loaded by LFSDUX and take the other half from ylN. */
  1320. fmr a2, yl1
  1321. fmr a4, yl2
  1322. fmr a6, yl3
  1323. fmr a8, yl4
  /* ysN = merged Y pair + (alpha1 with A1 data); loads for the next pass follow each use. */
  1324. fxcpmadd ys1, alpha1, a1, a2
  1325. LFDUX yl1, YL, INCY
  1326. LFSDUX a2, YL, INCY
  1327. fxcpmadd ys2, alpha1, a5, a4
  1328. LFDUX yl2, YL, INCY
  1329. LFSDUX a4, YL, INCY
  1330. fxcpmadd ys3, alpha1, a9, a6
  1331. LFDUX yl3, YL, INCY
  1332. LFSDUX a6, YL, INCY
  1333. fxcpmadd ys4, alpha1, a13, a8
  1334. LFDUX yl4, YL, INCY
  1335. LFSDUX a8, YL, INCY
  1336. LFPDUX a1, A1, INC2
  1337. LFPDUX a5, A1, INC2
  1338. LFPDUX a9, A1, INC2
  1339. LFPDUX a13, A1, INC2
  /* Write the eight results through YS, two stores per register. */
  1340. STFDUX ys1, YS, INCY
  1341. STFSDUX ys1, YS, INCY
  1342. STFDUX ys2, YS, INCY
  1343. STFSDUX ys2, YS, INCY
  1344. STFDUX ys3, YS, INCY
  1345. STFSDUX ys3, YS, INCY
  1346. STFDUX ys4, YS, INCY
  1347. STFSDUX ys4, YS, INCY
  /* Repeat while CTR is non-zero after the decrement. */
  1348. bdnz .L92
  1349. .align 4
  /* Drain: same merges, four multiply-adds and eight stores, no further loads. */
  1350. .L93:
  1351. fmr a2, yl1
  1352. fmr a4, yl2
  1353. fmr a6, yl3
  1354. fmr a8, yl4
  1355. fxcpmadd ys1, alpha1, a1, a2
  1356. fxcpmadd ys2, alpha1, a5, a4
  1357. fxcpmadd ys3, alpha1, a9, a6
  1358. fxcpmadd ys4, alpha1, a13, a8
  1359. STFDUX ys1, YS, INCY
  1360. STFSDUX ys1, YS, INCY
  1361. STFDUX ys2, YS, INCY
  1362. STFSDUX ys2, YS, INCY
  1363. STFDUX ys3, YS, INCY
  1364. STFSDUX ys3, YS, INCY
  1365. STFDUX ys4, YS, INCY
  1366. STFSDUX ys4, YS, INCY
  1367. .align 4
  /* Remainder handling on the low three bits of M. andi. yields a */
  /* non-negative value, so each ble below is taken only on zero. */
  1368. .L95:
  1369. andi. r0, M, 7
  1370. ble .L999
  /* Bit 2 of M set: four Y values, each loaded with LFDUX into its own register. */
  1371. andi. r0, M, 4
  1372. ble .L97
  1373. LFPDUX a1, A1, INC2
  1374. LFDUX yl1, YL, INCY
  1375. LFDUX yl2, YL, INCY
  1376. LFPDUX a2, A1, INC2
  1377. LFDUX yl3, YL, INCY
  1378. LFDUX yl4, YL, INCY
  /* Operand order differs from the loop: the A register comes first and */
  /* alpha1 second. Presumably fxcpmadd / fxcsmadd then pick the primary / */
  /* secondary half of the A pair as multiplier, leaving each result in the */
  /* half that STFDUX stores - confirm against the FP2 definition. */
  1379. fxcpmadd ys1, a1, alpha1, yl1
  1380. fxcsmadd ys2, a1, alpha1, yl2
  1381. fxcpmadd ys3, a2, alpha1, yl3
  1382. fxcsmadd ys4, a2, alpha1, yl4
  1383. STFDUX ys1, YS, INCY
  1384. STFDUX ys2, YS, INCY
  1385. STFDUX ys3, YS, INCY
  1386. STFDUX ys4, YS, INCY
  1387. .align 4
  /* Bit 1 of M set: same scheme for two Y values from one A load. */
  1388. .L97:
  1389. andi. r0, M, 2
  1390. ble .L98
  1391. LFPDUX a1, A1, INC2
  1392. LFDUX yl1, YL, INCY
  1393. LFDUX yl2, YL, INCY
  1394. fxcpmadd ys1, a1, alpha1, yl1
  1395. fxcsmadd ys2, a1, alpha1, yl2
  1396. STFDUX ys1, YS, INCY
  1397. STFDUX ys2, YS, INCY
  1398. .align 4
  /* Bit 0 of M set: one Y value, one A value, one store. */
  1399. .L98:
  1400. andi. r0, M, 1
  1401. ble .L999
  1402. LFDUX yl1, YL, INCY
  1403. LFDUX a1, A1, INC2
  1404. fxcpmadd ys1, alpha1, a1, yl1
  1405. STFDUX ys1, YS, INCY
  /* The target .L999 is the label that follows this block after an .align. */
  1406. b .L999
  /* Common exit: restore r16..r31 and f14..f31 from the stack, then return. */
  /* The matching save sequence is above this excerpt; this code presumably */
  /* mirrors it in reverse - confirm. SP is assumed to be the stack-pointer */
  /* alias defined earlier in the file. */
  1407. .align 4
  1408. .L999:
  /* Bias SP down by 4 so the first lwzu (which adds 4 before loading) reads the word at the current SP. */
  1409. addi SP, SP, -4
  /* Sixteen 4-byte words, r16 first; each lwzu advances SP by 4. */
  1410. lwzu r16, 4(SP)
  1411. lwzu r17, 4(SP)
  1412. lwzu r18, 4(SP)
  1413. lwzu r19, 4(SP)
  1414. lwzu r20, 4(SP)
  1415. lwzu r21, 4(SP)
  1416. lwzu r22, 4(SP)
  1417. lwzu r23, 4(SP)
  1418. lwzu r24, 4(SP)
  1419. lwzu r25, 4(SP)
  1420. lwzu r26, 4(SP)
  1421. lwzu r27, 4(SP)
  1422. lwzu r28, 4(SP)
  1423. lwzu r29, 4(SP)
  1424. lwzu r30, 4(SP)
  1425. lwzu r31, 4(SP)
  /* SP now points at the last word restored. Subtracting 12 leaves it 16 */
  /* bytes below the first byte past the integer area, so the first */
  /* lfpdux (update by r0 = 16) reads from just past that area. */
  1426. subi SP, SP, 12
  1427. li r0, 16
  /* Eighteen 16-byte loads, f31 first down to f14; each advances SP by 16. */
  1428. lfpdux f31, SP, r0
  1429. lfpdux f30, SP, r0
  1430. lfpdux f29, SP, r0
  1431. lfpdux f28, SP, r0
  1432. lfpdux f27, SP, r0
  1433. lfpdux f26, SP, r0
  1434. lfpdux f25, SP, r0
  1435. lfpdux f24, SP, r0
  1436. lfpdux f23, SP, r0
  1437. lfpdux f22, SP, r0
  1438. lfpdux f21, SP, r0
  1439. lfpdux f20, SP, r0
  1440. lfpdux f19, SP, r0
  1441. lfpdux f18, SP, r0
  1442. lfpdux f17, SP, r0
  1443. lfpdux f16, SP, r0
  1444. lfpdux f15, SP, r0
  1445. lfpdux f14, SP, r0
  /* Step past the last 16-byte slot, then return to the caller. */
  1446. addi SP, SP, 16
  1447. blr
  /* EPILOGUE is a macro defined outside this excerpt. */
  1448. EPILOGUE