You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

gemv_t.S 57 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
7277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960296129622963296429652966296729682969
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #ifdef linux /* Linux builds: names for the GPRs that carry the arguments (32-bit map first, 64-bit map after the else) */
  41. #ifndef __64BIT__
  42. #define M r3 /* element count walked along x; processed in blocks of at most P */
  43. #define N r4 /* column count; the kernel takes N >> 3 groups of eight columns */
  44. #define A r6 /* matrix base pointer (biased by -SIZE in the prologue) */
  45. #define LDA r7 /* column stride of A; shifted left by BASE_SHIFT in the prologue */
  46. #define X r8 /* x vector pointer */
  47. #define INCX r9 /* x stride; shifted left by BASE_SHIFT, then compared with SIZE to pick the copy path */
  48. #define Y r10 /* y vector pointer */
  49. #define INCY r5 /* y stride; r5 is only the destination, the value is loaded from the caller frame in the prologue */
  50. #else
  51. #define M r3 /* element count walked along x; processed in blocks of at most P */
  52. #define N r4 /* column count; the kernel takes N >> 3 groups of eight columns */
  53. #define A r7 /* matrix base pointer (biased by -SIZE in the prologue) */
  54. #define LDA r8 /* column stride of A; shifted left by BASE_SHIFT in the prologue */
  55. #define X r9 /* x vector pointer */
  56. #define INCX r10 /* x stride; shifted left by BASE_SHIFT, then compared with SIZE to pick the copy path */
  57. #define Y r5 /* y vector pointer; loaded from caller frame slot 0 in the prologue */
  58. #define INCY r6 /* y stride; loaded from caller frame slot 1 in the prologue */
  59. #endif /* __64BIT__ */
  60. #endif /* linux */
  61. #if defined(_AIX) || defined(__APPLE__) /* AIX and Darwin builds: names for the GPRs that carry the arguments */
  62. #if !defined(__64BIT__) && defined(DOUBLE) /* 32-bit double precision: pointer arguments start one register later than in the map below (presumably because the double alpha uses extra argument slots; TODO confirm) */
  63. #define M r3 /* element count walked along x; processed in blocks of at most P */
  64. #define N r4 /* column count; the kernel takes N >> 3 groups of eight columns */
  65. #define A r8 /* matrix base pointer (biased by -SIZE in the prologue) */
  66. #define LDA r9 /* column stride of A; shifted left by BASE_SHIFT in the prologue */
  67. #define X r10 /* x vector pointer */
  68. #define INCX r5 /* x stride; loaded from caller frame slot 0 in the prologue */
  69. #define Y r6 /* y vector pointer; loaded from caller frame slot 1 in the prologue */
  70. #define INCY r7 /* y stride; loaded from caller frame slot 2 in the prologue */
  71. #else /* 64-bit, or 32-bit single precision */
  72. #define M r3 /* element count walked along x; processed in blocks of at most P */
  73. #define N r4 /* column count; the kernel takes N >> 3 groups of eight columns */
  74. #define A r7 /* matrix base pointer (biased by -SIZE in the prologue) */
  75. #define LDA r8 /* column stride of A; shifted left by BASE_SHIFT in the prologue */
  76. #define X r9 /* x vector pointer */
  77. #define INCX r10 /* x stride; shifted left by BASE_SHIFT, then compared with SIZE to pick the copy path */
  78. #define Y r5 /* y vector pointer; loaded from caller frame slot 0 in the prologue */
  79. #define INCY r6 /* y stride; loaded from caller frame slot 1 in the prologue */
  80. #endif /* !__64BIT__ && DOUBLE */
  81. #endif /* _AIX || __APPLE__ */
  82. #define BUFFER r11 /* scratch buffer pointer, loaded from the caller frame; receives a packed copy of x when INCX != SIZE */
  83. #define XP r12 /* briefly holds the constant P in the prologue, then points at the current x block (X + IS elements, or BUFFER) */
  84. #define AO1 r14 /* AO1..AO8: pointers to eight consecutive columns of A, each LDA apart; r14-r29 are saved on the stack in the prologue */
  85. #define AO2 r15
  86. #define AO3 r16
  87. #define AO4 r17
  88. #define AO5 r18
  89. #define AO6 r19
  90. #define AO7 r20
  91. #define AO8 r21
  92. #define MIN_N r22 /* size of the current block: M - IS, capped at P */
  93. #define J r23 /* set to N >> 3 before the eight-column loop */
  94. #define CO r24 /* store cursor: copy destination inside BUFFER, later reset to Y */
  95. #define PREA r25 /* byte offset PREFETCHSIZE_A * SIZE used by DCBT on the column pointers */
  96. #define PREC r26 /* byte offset PREFETCHSIZE_C * SIZE used by DCBT on Y1 */
  97. #define BO r27 /* read cursor over the x block inside the inner loop (starts at XP) */
  98. #define PLDA_M r28 /* (LDA * N - P) shifted left by BASE_SHIFT; its use lies outside this excerpt */
  99. #define IS r29 /* element offset of the current block; initialised to 0 */
  100. #define Y1 CO /* second name for CO, used when prefetching the output */
  101. #if defined(PPCG4) /* Per-CPU prefetch distances, counted in elements: the prologue multiplies each by SIZE to form the DCBT offsets PREA and PREC */
  102. #define PREFETCHSIZE_A 42 /* distance ahead of the A column pointers */
  103. #define PREFETCHSIZE_C 16 /* distance ahead of the output pointer Y1 */
  104. #endif
  105. #if defined(PPC440) || defined(PPC440FP2)
  106. #define PREFETCHSIZE_A 42
  107. #define PREFETCHSIZE_C 16
  108. #endif
  109. #ifdef PPC970
  110. #define PREFETCHSIZE_A 42
  111. #define PREFETCHSIZE_C 16
  112. #endif
  113. #ifdef CELL
  114. #define PREFETCHSIZE_A 42
  115. #define PREFETCHSIZE_C 16
  116. #endif
  117. #ifdef POWER4
  118. #define PREFETCHSIZE_A 48
  119. #define PREFETCHSIZE_C 16
  120. #endif
  121. #ifdef POWER5
  122. #define PREFETCHSIZE_A 40
  123. #define PREFETCHSIZE_C 8
  124. #endif
  125. #ifdef POWER6
  126. #define PREFETCHSIZE_A 96
  127. #define PREFETCHSIZE_C 8
  128. #endif
  129. #ifdef POWER8
  130. #define PREFETCHSIZE_A 96
  131. #define PREFETCHSIZE_C 8
  132. #endif /* NOTE(review): no fallback is visible here; if none of the CPU macros above is defined, PREFETCHSIZE_A and PREFETCHSIZE_C stay undefined unless common.h supplies them */
  133. #define y01 f0 /* y01..y16: accumulators, cleared from FZERO before each eight-column group; y01..y08 and y09..y16 are two partial sums for the same eight columns (the first set is fed by b1, b3, b5, b7, the second by b2, b4, b6, b8) */
  134. #define y02 f1
  135. #define y03 f2
  136. #define y04 f3
  137. #define y05 f4
  138. #define y06 f5
  139. #define y07 f6
  140. #define y08 f7
  141. #define y09 f8
  142. #define y10 f9
  143. #define y11 f10
  144. #define y12 f11
  145. #define y13 f12
  146. #define y14 f13
  147. #define y15 f14 /* f14 and above are saved on the stack in the prologue */
  148. #define y16 f15
  149. #define a1 f16 /* a1..a8: the current element from each of the eight columns AO1..AO8 */
  150. #define a2 f17
  151. #define a3 f18
  152. #define a4 f19
  153. #define a5 f20
  154. #define a6 f21
  155. #define a7 f22
  156. #define a8 f23
  157. #define b1 f24 /* b1..b8: eight consecutive elements of the x block, loaded through BO */
  158. #define b2 f25
  159. #define b3 f26
  160. #define b4 f27
  161. #define b5 f28
  162. #define b6 f29
  163. #define b7 f30
  164. #define b8 f31
  165. #define alpha f31 /* same register as b8, so the two cannot be live together; the code that uses alpha lies outside this excerpt */
  166. #ifndef NEEDPARAM
  167. #define P 2048 /* maximum number of x elements handled per outer block (MIN_N is capped at P) */
  168. #ifndef __64BIT__
  169. #define STACKSIZE 224 /* 32-bit frame: 144 bytes of FPR saves, FZERO, ALPHA, then 16 GPRs of 4 bytes at 160..223 */
  170. #else
  171. #define STACKSIZE 288 /* 64-bit frame: 144 bytes of FPR saves, FZERO, ALPHA, then 16 GPRs of 8 bytes at 160..287 */
  172. #endif
  173. #define FZERO 144(SP) /* 8-byte slot filled with zero bits in the prologue; loaded with lfd to clear the accumulators */
  174. #define ALPHA 152(SP) /* slot where the prologue stores the incoming f1 */
  175. PROLOGUE
  176. PROFCODE
  177. addi SP, SP, -STACKSIZE
  178. li r0, 0
  179. stfd f14, 0(SP)
  180. stfd f15, 8(SP)
  181. stfd f16, 16(SP)
  182. stfd f17, 24(SP)
  183. stfd f18, 32(SP)
  184. stfd f19, 40(SP)
  185. stfd f20, 48(SP)
  186. stfd f21, 56(SP)
  187. stfd f22, 64(SP)
  188. stfd f23, 72(SP)
  189. stfd f24, 80(SP)
  190. stfd f25, 88(SP)
  191. stfd f26, 96(SP)
  192. stfd f27, 104(SP)
  193. stfd f28, 112(SP)
  194. stfd f29, 120(SP)
  195. stfd f30, 128(SP)
  196. stfd f31, 136(SP)
  197. #ifdef __64BIT__
  198. std r0, FZERO
  199. stfd f1, ALPHA
  200. std r14, 160(SP)
  201. std r15, 168(SP)
  202. std r16, 176(SP)
  203. std r17, 184(SP)
  204. std r18, 192(SP)
  205. std r19, 200(SP)
  206. std r20, 208(SP)
  207. std r21, 216(SP)
  208. std r22, 224(SP)
  209. std r23, 232(SP)
  210. std r24, 240(SP)
  211. std r25, 248(SP)
  212. std r26, 256(SP)
  213. std r27, 264(SP)
  214. std r28, 272(SP)
  215. std r29, 280(SP)
  216. #else
  217. stw r0, 0 + FZERO
  218. stw r0, 4 + FZERO
  219. stfd f1, ALPHA
  220. stw r14, 160(SP)
  221. stw r15, 164(SP)
  222. stw r16, 168(SP)
  223. stw r17, 172(SP)
  224. stw r18, 176(SP)
  225. stw r19, 180(SP)
  226. stw r20, 184(SP)
  227. stw r21, 188(SP)
  228. stw r22, 192(SP)
  229. stw r23, 196(SP)
  230. stw r24, 200(SP)
  231. stw r25, 204(SP)
  232. stw r26, 208(SP)
  233. stw r27, 212(SP)
  234. stw r28, 216(SP)
  235. stw r29, 220(SP)
  236. #endif
  237. #ifdef linux
  238. #ifndef __64BIT__
  239. lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
  240. lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
  241. #else
  242. ld Y, FRAMESLOT(0) + STACKSIZE(SP)
  243. ld INCY, FRAMESLOT(1) + STACKSIZE(SP)
  244. ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
  245. #endif
  246. #endif
  247. #if defined(_AIX) || defined(__APPLE__)
  248. #ifndef __64BIT__
  249. #ifdef DOUBLE
  250. lwz INCX, FRAMESLOT(0) + STACKSIZE(SP)
  251. lwz Y, FRAMESLOT(1) + STACKSIZE(SP)
  252. lwz INCY, FRAMESLOT(2) + STACKSIZE(SP)
  253. lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP)
  254. #else
  255. lwz Y, FRAMESLOT(0) + STACKSIZE(SP)
  256. lwz INCY, FRAMESLOT(1) + STACKSIZE(SP)
  257. lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
  258. #endif
  259. #else
  260. ld Y, FRAMESLOT(0) + STACKSIZE(SP)
  261. ld INCY, FRAMESLOT(1) + STACKSIZE(SP)
  262. ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
  263. #endif
  264. #endif
  265. mullw PLDA_M, LDA, N
  266. li XP, P
  267. subf PLDA_M, XP, PLDA_M
  268. slwi PLDA_M, PLDA_M, BASE_SHIFT
  269. slwi LDA, LDA, BASE_SHIFT
  270. slwi INCX, INCX, BASE_SHIFT
  271. slwi INCY, INCY, BASE_SHIFT
  272. subf Y, INCY, Y
  273. li IS, 0
  274. addi A, A, -SIZE
  275. li PREA, PREFETCHSIZE_A * SIZE
  276. li PREC, PREFETCHSIZE_C * SIZE
  277. cmpi cr0, 0, M, 0
  278. ble LL(999)
  279. cmpi cr0, 0, N, 0
  280. ble LL(999)
  281. .align 4
  282. LL(ISLoop):
  283. subf MIN_N, IS, M
  284. slwi r0, IS, BASE_SHIFT
  285. cmpi cr0, 0, MIN_N, P
  286. ble+ LL(min_nP)
  287. li MIN_N, P
  288. LL(min_nP):
  289. add XP, X, r0
  290. cmpi cr0, 0, INCX, SIZE
  291. beq LL(10)
  292. mr XP, BUFFER
  293. addi CO, BUFFER, -SIZE
  294. srawi. r0, MIN_N, 3
  295. mtspr CTR, r0
  296. ble LL(CopyRemain)
  297. .align 4
  298. LL(CopyKernel):
  299. LFD f0, 0 * SIZE(X)
  300. add X, X, INCX
  301. LFD f1, 0 * SIZE(X)
  302. add X, X, INCX
  303. LFD f2, 0 * SIZE(X)
  304. add X, X, INCX
  305. LFD f3, 0 * SIZE(X)
  306. add X, X, INCX
  307. LFD f4, 0 * SIZE(X)
  308. add X, X, INCX
  309. LFD f5, 0 * SIZE(X)
  310. add X, X, INCX
  311. LFD f6, 0 * SIZE(X)
  312. add X, X, INCX
  313. LFD f7, 0 * SIZE(X)
  314. add X, X, INCX
  315. STFD f0, 1 * SIZE(CO)
  316. STFD f1, 2 * SIZE(CO)
  317. STFD f2, 3 * SIZE(CO)
  318. STFD f3, 4 * SIZE(CO)
  319. STFD f4, 5 * SIZE(CO)
  320. STFD f5, 6 * SIZE(CO)
  321. STFD f6, 7 * SIZE(CO)
  322. STFDU f7, 8 * SIZE(CO)
  323. bdnz LL(CopyKernel)
  324. .align 4
  325. LL(CopyRemain):
  326. andi. r0, MIN_N, 7
  327. mtspr CTR, r0
  328. ble LL(10)
  329. .align 4
  330. LL(CopySub):
  331. LFD f0, 0 * SIZE(X)
  332. add X, X, INCX
  333. STFDU f0, 1 * SIZE(CO)
  334. bdnz LL(CopySub)
  335. .align 4
  336. LL(10):
  337. mr CO, Y
  338. addi XP, XP, -SIZE
  339. srawi. J, N, 3
  340. ble LL(20)
  341. .align 4
  342. LL(11):
  343. mr AO1, A
  344. add AO2, A, LDA
  345. add AO3, AO2, LDA
  346. add AO4, AO3, LDA
  347. add AO5, AO4, LDA
  348. add AO6, AO5, LDA
  349. add AO7, AO6, LDA
  350. add AO8, AO7, LDA
  351. add A, AO8, LDA
  352. mr BO, XP
  353. lfd y01, FZERO
  354. fmr y02, y01
  355. fmr y03, y01
  356. fmr y04, y01
  357. fmr y05, y01
  358. fmr y06, y01
  359. fmr y07, y01
  360. fmr y08, y01
  361. fmr y09, y01
  362. fmr y10, y01
  363. fmr y11, y01
  364. fmr y12, y01
  365. fmr y13, y01
  366. fmr y14, y01
  367. fmr y15, y01
  368. fmr y16, y01
  369. DCBT(Y1, PREC)
  370. srawi. r0, MIN_N, 4
  371. mtspr CTR, r0
  372. ble LL(14)
  373. LFD a1, 1 * SIZE(AO1)
  374. LFD a2, 1 * SIZE(AO2)
  375. LFD a3, 1 * SIZE(AO3)
  376. LFD a4, 1 * SIZE(AO4)
  377. LFD a5, 1 * SIZE(AO5)
  378. LFD a6, 1 * SIZE(AO6)
  379. LFD a7, 1 * SIZE(AO7)
  380. LFD a8, 1 * SIZE(AO8)
  381. LFD b1, 1 * SIZE(BO)
  382. LFD b2, 2 * SIZE(BO)
  383. LFD b3, 3 * SIZE(BO)
  384. LFD b4, 4 * SIZE(BO)
  385. LFD b5, 5 * SIZE(BO)
  386. LFD b6, 6 * SIZE(BO)
  387. LFD b7, 7 * SIZE(BO)
  388. LFD b8, 8 * SIZE(BO)
  389. bdz LL(13)
  390. .align 4
  391. LL(12):
  392. FMADD y01, a1, b1, y01
  393. LFD a1, 2 * SIZE(AO1)
  394. FMADD y02, a2, b1, y02
  395. LFD a2, 2 * SIZE(AO2)
  396. FMADD y03, a3, b1, y03
  397. LFD a3, 2 * SIZE(AO3)
  398. FMADD y04, a4, b1, y04
  399. LFD a4, 2 * SIZE(AO4)
  400. FMADD y05, a5, b1, y05
  401. LFD a5, 2 * SIZE(AO5)
  402. FMADD y06, a6, b1, y06
  403. LFD a6, 2 * SIZE(AO6)
  404. FMADD y07, a7, b1, y07
  405. LFD a7, 2 * SIZE(AO7)
  406. FMADD y08, a8, b1, y08
  407. LFD a8, 2 * SIZE(AO8)
  408. FMADD y09, a1, b2, y09
  409. LFD a1, 3 * SIZE(AO1)
  410. FMADD y10, a2, b2, y10
  411. LFD a2, 3 * SIZE(AO2)
  412. FMADD y11, a3, b2, y11
  413. LFD a3, 3 * SIZE(AO3)
  414. FMADD y12, a4, b2, y12
  415. LFD a4, 3 * SIZE(AO4)
  416. FMADD y13, a5, b2, y13
  417. LFD a5, 3 * SIZE(AO5)
  418. FMADD y14, a6, b2, y14
  419. LFD a6, 3 * SIZE(AO6)
  420. FMADD y15, a7, b2, y15
  421. LFD a7, 3 * SIZE(AO7)
  422. FMADD y16, a8, b2, y16
  423. LFD a8, 3 * SIZE(AO8)
  424. FMADD y01, a1, b3, y01
  425. LFD a1, 4 * SIZE(AO1)
  426. FMADD y02, a2, b3, y02
  427. LFD a2, 4 * SIZE(AO2)
  428. FMADD y03, a3, b3, y03
  429. LFD a3, 4 * SIZE(AO3)
  430. FMADD y04, a4, b3, y04
  431. LFD a4, 4 * SIZE(AO4)
  432. FMADD y05, a5, b3, y05
  433. LFD a5, 4 * SIZE(AO5)
  434. FMADD y06, a6, b3, y06
  435. LFD a6, 4 * SIZE(AO6)
  436. FMADD y07, a7, b3, y07
  437. LFD a7, 4 * SIZE(AO7)
  438. FMADD y08, a8, b3, y08
  439. LFD a8, 4 * SIZE(AO8)
  440. FMADD y09, a1, b4, y09
  441. LFD a1, 5 * SIZE(AO1)
  442. FMADD y10, a2, b4, y10
  443. LFD a2, 5 * SIZE(AO2)
  444. FMADD y11, a3, b4, y11
  445. LFD a3, 5 * SIZE(AO3)
  446. FMADD y12, a4, b4, y12
  447. LFD a4, 5 * SIZE(AO4)
  448. FMADD y13, a5, b4, y13
  449. LFD a5, 5 * SIZE(AO5)
  450. FMADD y14, a6, b4, y14
  451. LFD a6, 5 * SIZE(AO6)
  452. FMADD y15, a7, b4, y15
  453. LFD a7, 5 * SIZE(AO7)
  454. FMADD y16, a8, b4, y16
  455. LFD a8, 5 * SIZE(AO8)
  456. LFD b1, 9 * SIZE(BO)
  457. LFD b2, 10 * SIZE(BO)
  458. LFD b3, 11 * SIZE(BO)
  459. LFD b4, 12 * SIZE(BO)
  460. FMADD y01, a1, b5, y01
  461. LFD a1, 6 * SIZE(AO1)
  462. FMADD y02, a2, b5, y02
  463. LFD a2, 6 * SIZE(AO2)
  464. FMADD y03, a3, b5, y03
  465. LFD a3, 6 * SIZE(AO3)
  466. FMADD y04, a4, b5, y04
  467. LFD a4, 6 * SIZE(AO4)
  468. FMADD y05, a5, b5, y05
  469. LFD a5, 6 * SIZE(AO5)
  470. FMADD y06, a6, b5, y06
  471. LFD a6, 6 * SIZE(AO6)
  472. FMADD y07, a7, b5, y07
  473. LFD a7, 6 * SIZE(AO7)
  474. FMADD y08, a8, b5, y08
  475. LFD a8, 6 * SIZE(AO8)
  476. FMADD y09, a1, b6, y09
  477. LFD a1, 7 * SIZE(AO1)
  478. FMADD y10, a2, b6, y10
  479. LFD a2, 7 * SIZE(AO2)
  480. FMADD y11, a3, b6, y11
  481. LFD a3, 7 * SIZE(AO3)
  482. FMADD y12, a4, b6, y12
  483. LFD a4, 7 * SIZE(AO4)
  484. FMADD y13, a5, b6, y13
  485. LFD a5, 7 * SIZE(AO5)
  486. FMADD y14, a6, b6, y14
  487. LFD a6, 7 * SIZE(AO6)
  488. FMADD y15, a7, b6, y15
  489. LFD a7, 7 * SIZE(AO7)
  490. FMADD y16, a8, b6, y16
  491. LFD a8, 7 * SIZE(AO8)
  492. FMADD y01, a1, b7, y01
  493. LFD a1, 8 * SIZE(AO1)
  494. FMADD y02, a2, b7, y02
  495. LFD a2, 8 * SIZE(AO2)
  496. FMADD y03, a3, b7, y03
  497. LFD a3, 8 * SIZE(AO3)
  498. FMADD y04, a4, b7, y04
  499. LFD a4, 8 * SIZE(AO4)
  500. FMADD y05, a5, b7, y05
  501. LFD a5, 8 * SIZE(AO5)
  502. FMADD y06, a6, b7, y06
  503. LFD a6, 8 * SIZE(AO6)
  504. FMADD y07, a7, b7, y07
  505. LFD a7, 8 * SIZE(AO7)
  506. FMADD y08, a8, b7, y08
  507. LFD a8, 8 * SIZE(AO8)
  508. FMADD y09, a1, b8, y09
  509. LFD a1, 9 * SIZE(AO1)
  510. FMADD y10, a2, b8, y10
  511. LFD a2, 9 * SIZE(AO2)
  512. FMADD y11, a3, b8, y11
  513. LFD a3, 9 * SIZE(AO3)
  514. FMADD y12, a4, b8, y12
  515. LFD a4, 9 * SIZE(AO4)
  516. FMADD y13, a5, b8, y13
  517. LFD a5, 9 * SIZE(AO5)
  518. FMADD y14, a6, b8, y14
  519. LFD a6, 9 * SIZE(AO6)
  520. FMADD y15, a7, b8, y15
  521. LFD a7, 9 * SIZE(AO7)
  522. FMADD y16, a8, b8, y16
  523. LFD a8, 9 * SIZE(AO8)
  524. LFD b5, 13 * SIZE(BO)
  525. LFD b6, 14 * SIZE(BO)
  526. LFD b7, 15 * SIZE(BO)
  527. LFD b8, 16 * SIZE(BO)
  528. DCBT(AO1, PREA)
  529. DCBT(AO2, PREA)
  530. DCBT(AO3, PREA)
  531. DCBT(AO4, PREA)
  532. FMADD y01, a1, b1, y01
  533. LFD a1, 10 * SIZE(AO1)
  534. FMADD y02, a2, b1, y02
  535. LFD a2, 10 * SIZE(AO2)
  536. FMADD y03, a3, b1, y03
  537. LFD a3, 10 * SIZE(AO3)
  538. FMADD y04, a4, b1, y04
  539. LFD a4, 10 * SIZE(AO4)
  540. FMADD y05, a5, b1, y05
  541. LFD a5, 10 * SIZE(AO5)
  542. FMADD y06, a6, b1, y06
  543. LFD a6, 10 * SIZE(AO6)
  544. FMADD y07, a7, b1, y07
  545. LFD a7, 10 * SIZE(AO7)
  546. FMADD y08, a8, b1, y08
  547. LFD a8, 10 * SIZE(AO8)
  548. FMADD y09, a1, b2, y09
  549. LFD a1, 11 * SIZE(AO1)
  550. FMADD y10, a2, b2, y10
  551. LFD a2, 11 * SIZE(AO2)
  552. FMADD y11, a3, b2, y11
  553. LFD a3, 11 * SIZE(AO3)
  554. FMADD y12, a4, b2, y12
  555. LFD a4, 11 * SIZE(AO4)
  556. FMADD y13, a5, b2, y13
  557. LFD a5, 11 * SIZE(AO5)
  558. FMADD y14, a6, b2, y14
  559. LFD a6, 11 * SIZE(AO6)
  560. FMADD y15, a7, b2, y15
  561. LFD a7, 11 * SIZE(AO7)
  562. FMADD y16, a8, b2, y16
  563. LFD a8, 11 * SIZE(AO8)
  564. FMADD y01, a1, b3, y01
  565. LFD a1, 12 * SIZE(AO1)
  566. FMADD y02, a2, b3, y02
  567. LFD a2, 12 * SIZE(AO2)
  568. FMADD y03, a3, b3, y03
  569. LFD a3, 12 * SIZE(AO3)
  570. FMADD y04, a4, b3, y04
  571. LFD a4, 12 * SIZE(AO4)
  572. FMADD y05, a5, b3, y05
  573. LFD a5, 12 * SIZE(AO5)
  574. FMADD y06, a6, b3, y06
  575. LFD a6, 12 * SIZE(AO6)
  576. FMADD y07, a7, b3, y07
  577. LFD a7, 12 * SIZE(AO7)
  578. FMADD y08, a8, b3, y08
  579. LFD a8, 12 * SIZE(AO8)
  580. FMADD y09, a1, b4, y09
  581. LFD a1, 13 * SIZE(AO1)
  582. FMADD y10, a2, b4, y10
  583. LFD a2, 13 * SIZE(AO2)
  584. FMADD y11, a3, b4, y11
  585. LFD a3, 13 * SIZE(AO3)
  586. FMADD y12, a4, b4, y12
  587. LFD a4, 13 * SIZE(AO4)
  588. FMADD y13, a5, b4, y13
  589. LFD a5, 13 * SIZE(AO5)
  590. FMADD y14, a6, b4, y14
  591. LFD a6, 13 * SIZE(AO6)
  592. FMADD y15, a7, b4, y15
  593. LFD a7, 13 * SIZE(AO7)
  594. FMADD y16, a8, b4, y16
  595. LFD a8, 13 * SIZE(AO8)
  596. LFD b1, 17 * SIZE(BO)
  597. LFD b2, 18 * SIZE(BO)
  598. LFD b3, 19 * SIZE(BO)
  599. LFD b4, 20 * SIZE(BO)
  600. FMADD y01, a1, b5, y01
  601. LFD a1, 14 * SIZE(AO1)
  602. FMADD y02, a2, b5, y02
  603. LFD a2, 14 * SIZE(AO2)
  604. FMADD y03, a3, b5, y03
  605. LFD a3, 14 * SIZE(AO3)
  606. FMADD y04, a4, b5, y04
  607. LFD a4, 14 * SIZE(AO4)
  608. FMADD y05, a5, b5, y05
  609. LFD a5, 14 * SIZE(AO5)
  610. FMADD y06, a6, b5, y06
  611. LFD a6, 14 * SIZE(AO6)
  612. FMADD y07, a7, b5, y07
  613. LFD a7, 14 * SIZE(AO7)
  614. FMADD y08, a8, b5, y08
  615. LFD a8, 14 * SIZE(AO8)
  616. FMADD y09, a1, b6, y09
  617. LFD a1, 15 * SIZE(AO1)
  618. FMADD y10, a2, b6, y10
  619. LFD a2, 15 * SIZE(AO2)
  620. FMADD y11, a3, b6, y11
  621. LFD a3, 15 * SIZE(AO3)
  622. FMADD y12, a4, b6, y12
  623. LFD a4, 15 * SIZE(AO4)
  624. FMADD y13, a5, b6, y13
  625. LFD a5, 15 * SIZE(AO5)
  626. FMADD y14, a6, b6, y14
  627. LFD a6, 15 * SIZE(AO6)
  628. FMADD y15, a7, b6, y15
  629. LFD a7, 15 * SIZE(AO7)
  630. FMADD y16, a8, b6, y16
  631. LFD a8, 15 * SIZE(AO8)
  632. FMADD y01, a1, b7, y01
  633. LFD a1, 16 * SIZE(AO1)
  634. FMADD y02, a2, b7, y02
  635. LFD a2, 16 * SIZE(AO2)
  636. FMADD y03, a3, b7, y03
  637. LFD a3, 16 * SIZE(AO3)
  638. FMADD y04, a4, b7, y04
  639. LFD a4, 16 * SIZE(AO4)
  640. FMADD y05, a5, b7, y05
  641. LFD a5, 16 * SIZE(AO5)
  642. FMADD y06, a6, b7, y06
  643. LFD a6, 16 * SIZE(AO6)
  644. FMADD y07, a7, b7, y07
  645. LFD a7, 16 * SIZE(AO7)
  646. FMADD y08, a8, b7, y08
  647. LFD a8, 16 * SIZE(AO8)
  648. FMADD y09, a1, b8, y09
  649. LFD a1, 17 * SIZE(AO1)
  650. FMADD y10, a2, b8, y10
  651. LFD a2, 17 * SIZE(AO2)
  652. FMADD y11, a3, b8, y11
  653. LFD a3, 17 * SIZE(AO3)
  654. FMADD y12, a4, b8, y12
  655. LFD a4, 17 * SIZE(AO4)
  656. addi AO1, AO1, 16 * SIZE
  657. addi AO2, AO2, 16 * SIZE
  658. addi AO3, AO3, 16 * SIZE
  659. addi AO4, AO4, 16 * SIZE
  660. FMADD y13, a5, b8, y13
  661. LFD a5, 17 * SIZE(AO5)
  662. FMADD y14, a6, b8, y14
  663. LFD a6, 17 * SIZE(AO6)
  664. FMADD y15, a7, b8, y15
  665. LFD a7, 17 * SIZE(AO7)
  666. FMADD y16, a8, b8, y16
  667. LFD a8, 17 * SIZE(AO8)
  668. LFD b5, 21 * SIZE(BO)
  669. LFD b6, 22 * SIZE(BO)
  670. LFD b7, 23 * SIZE(BO)
  671. LFD b8, 24 * SIZE(BO)
  672. addi AO5, AO5, 16 * SIZE
  673. addi AO6, AO6, 16 * SIZE
  674. DCBT(AO5, PREA)
  675. DCBT(AO6, PREA)
  676. addi AO7, AO7, 16 * SIZE
  677. addi AO8, AO8, 16 * SIZE
  678. DCBT(AO7, PREA)
  679. DCBT(AO8, PREA)
  680. addi BO, BO, 16 * SIZE
  681. bdnz LL(12)
  682. .align 4
  /*
   * LL(13): last 16-element chunk of the 8-column pass (columns AO1..AO8).
   * Entered by fall-through when the bdnz loop on LL(12) above expires.
   *
   * The first FMADDs consume a1..a8 and b1..b8 before this block loads
   * them, so they must already hold the data at 1 * SIZE(AOn) and
   * 1..8 * SIZE(BO) on entry.
   *
   * FMADD d, x, y, z is used as d = x * y + z (presumably the PowerPC
   * fmadd; the macro itself is defined outside this chunk).  Elements at
   * odd offsets (1, 3, ..., 15) accumulate into y01..y08, elements at even
   * offsets (2, 4, ..., 16) into y09..y16; LL(18) adds the two halves.
   * Each FMADD is paired with the LFD that fetches the same column one
   * element ahead.
   *
   * Nothing beyond 16 * SIZE is loaded here.  On exit AO1..AO8 and BO
   * have each advanced by 16 * SIZE.
   */
  683. LL(13):
  684. FMADD y01, a1, b1, y01
  685. LFD a1, 2 * SIZE(AO1)
  686. FMADD y02, a2, b1, y02
  687. LFD a2, 2 * SIZE(AO2)
  688. FMADD y03, a3, b1, y03
  689. LFD a3, 2 * SIZE(AO3)
  690. FMADD y04, a4, b1, y04
  691. LFD a4, 2 * SIZE(AO4)
  692. FMADD y05, a5, b1, y05
  693. LFD a5, 2 * SIZE(AO5)
  694. FMADD y06, a6, b1, y06
  695. LFD a6, 2 * SIZE(AO6)
  696. FMADD y07, a7, b1, y07
  697. LFD a7, 2 * SIZE(AO7)
  698. FMADD y08, a8, b1, y08
  699. LFD a8, 2 * SIZE(AO8)
  700. FMADD y09, a1, b2, y09
  701. LFD a1, 3 * SIZE(AO1)
  702. FMADD y10, a2, b2, y10
  703. LFD a2, 3 * SIZE(AO2)
  704. FMADD y11, a3, b2, y11
  705. LFD a3, 3 * SIZE(AO3)
  706. FMADD y12, a4, b2, y12
  707. LFD a4, 3 * SIZE(AO4)
  708. FMADD y13, a5, b2, y13
  709. LFD a5, 3 * SIZE(AO5)
  710. FMADD y14, a6, b2, y14
  711. LFD a6, 3 * SIZE(AO6)
  712. FMADD y15, a7, b2, y15
  713. LFD a7, 3 * SIZE(AO7)
  714. FMADD y16, a8, b2, y16
  715. LFD a8, 3 * SIZE(AO8)
  716. FMADD y01, a1, b3, y01
  717. LFD a1, 4 * SIZE(AO1)
  718. FMADD y02, a2, b3, y02
  719. LFD a2, 4 * SIZE(AO2)
  720. FMADD y03, a3, b3, y03
  721. LFD a3, 4 * SIZE(AO3)
  722. FMADD y04, a4, b3, y04
  723. LFD a4, 4 * SIZE(AO4)
  724. FMADD y05, a5, b3, y05
  725. LFD a5, 4 * SIZE(AO5)
  726. FMADD y06, a6, b3, y06
  727. LFD a6, 4 * SIZE(AO6)
  728. FMADD y07, a7, b3, y07
  729. LFD a7, 4 * SIZE(AO7)
  730. FMADD y08, a8, b3, y08
  731. LFD a8, 4 * SIZE(AO8)
  732. FMADD y09, a1, b4, y09
  733. LFD a1, 5 * SIZE(AO1)
  734. FMADD y10, a2, b4, y10
  735. LFD a2, 5 * SIZE(AO2)
  736. FMADD y11, a3, b4, y11
  737. LFD a3, 5 * SIZE(AO3)
  738. FMADD y12, a4, b4, y12
  739. LFD a4, 5 * SIZE(AO4)
  740. FMADD y13, a5, b4, y13
  741. LFD a5, 5 * SIZE(AO5)
  742. FMADD y14, a6, b4, y14
  743. LFD a6, 5 * SIZE(AO6)
  744. FMADD y15, a7, b4, y15
  745. LFD a7, 5 * SIZE(AO7)
  746. FMADD y16, a8, b4, y16
  747. LFD a8, 5 * SIZE(AO8)
  /* b1..b4 have been fully consumed: refill them with elements 9..12 */
  748. LFD b1, 9 * SIZE(BO)
  749. LFD b2, 10 * SIZE(BO)
  750. LFD b3, 11 * SIZE(BO)
  751. LFD b4, 12 * SIZE(BO)
  752. FMADD y01, a1, b5, y01
  753. LFD a1, 6 * SIZE(AO1)
  754. FMADD y02, a2, b5, y02
  755. LFD a2, 6 * SIZE(AO2)
  756. FMADD y03, a3, b5, y03
  757. LFD a3, 6 * SIZE(AO3)
  758. FMADD y04, a4, b5, y04
  759. LFD a4, 6 * SIZE(AO4)
  760. FMADD y05, a5, b5, y05
  761. LFD a5, 6 * SIZE(AO5)
  762. FMADD y06, a6, b5, y06
  763. LFD a6, 6 * SIZE(AO6)
  764. FMADD y07, a7, b5, y07
  765. LFD a7, 6 * SIZE(AO7)
  766. FMADD y08, a8, b5, y08
  767. LFD a8, 6 * SIZE(AO8)
  768. FMADD y09, a1, b6, y09
  769. LFD a1, 7 * SIZE(AO1)
  770. FMADD y10, a2, b6, y10
  771. LFD a2, 7 * SIZE(AO2)
  772. FMADD y11, a3, b6, y11
  773. LFD a3, 7 * SIZE(AO3)
  774. FMADD y12, a4, b6, y12
  775. LFD a4, 7 * SIZE(AO4)
  776. FMADD y13, a5, b6, y13
  777. LFD a5, 7 * SIZE(AO5)
  778. FMADD y14, a6, b6, y14
  779. LFD a6, 7 * SIZE(AO6)
  780. FMADD y15, a7, b6, y15
  781. LFD a7, 7 * SIZE(AO7)
  782. FMADD y16, a8, b6, y16
  783. LFD a8, 7 * SIZE(AO8)
  784. FMADD y01, a1, b7, y01
  785. LFD a1, 8 * SIZE(AO1)
  786. FMADD y02, a2, b7, y02
  787. LFD a2, 8 * SIZE(AO2)
  788. FMADD y03, a3, b7, y03
  789. LFD a3, 8 * SIZE(AO3)
  790. FMADD y04, a4, b7, y04
  791. LFD a4, 8 * SIZE(AO4)
  792. FMADD y05, a5, b7, y05
  793. LFD a5, 8 * SIZE(AO5)
  794. FMADD y06, a6, b7, y06
  795. LFD a6, 8 * SIZE(AO6)
  796. FMADD y07, a7, b7, y07
  797. LFD a7, 8 * SIZE(AO7)
  798. FMADD y08, a8, b7, y08
  799. LFD a8, 8 * SIZE(AO8)
  800. FMADD y09, a1, b8, y09
  801. LFD a1, 9 * SIZE(AO1)
  802. FMADD y10, a2, b8, y10
  803. LFD a2, 9 * SIZE(AO2)
  804. FMADD y11, a3, b8, y11
  805. LFD a3, 9 * SIZE(AO3)
  806. FMADD y12, a4, b8, y12
  807. LFD a4, 9 * SIZE(AO4)
  808. FMADD y13, a5, b8, y13
  809. LFD a5, 9 * SIZE(AO5)
  810. FMADD y14, a6, b8, y14
  811. LFD a6, 9 * SIZE(AO6)
  812. FMADD y15, a7, b8, y15
  813. LFD a7, 9 * SIZE(AO7)
  814. FMADD y16, a8, b8, y16
  815. LFD a8, 9 * SIZE(AO8)
  /* b5..b8 have been fully consumed: refill them with elements 13..16 */
  816. LFD b5, 13 * SIZE(BO)
  817. LFD b6, 14 * SIZE(BO)
  818. LFD b7, 15 * SIZE(BO)
  819. LFD b8, 16 * SIZE(BO)
  820. FMADD y01, a1, b1, y01
  821. LFD a1, 10 * SIZE(AO1)
  822. FMADD y02, a2, b1, y02
  823. LFD a2, 10 * SIZE(AO2)
  824. FMADD y03, a3, b1, y03
  825. LFD a3, 10 * SIZE(AO3)
  826. FMADD y04, a4, b1, y04
  827. LFD a4, 10 * SIZE(AO4)
  828. FMADD y05, a5, b1, y05
  829. LFD a5, 10 * SIZE(AO5)
  830. FMADD y06, a6, b1, y06
  831. LFD a6, 10 * SIZE(AO6)
  832. FMADD y07, a7, b1, y07
  833. LFD a7, 10 * SIZE(AO7)
  834. FMADD y08, a8, b1, y08
  835. LFD a8, 10 * SIZE(AO8)
  836. FMADD y09, a1, b2, y09
  837. LFD a1, 11 * SIZE(AO1)
  838. FMADD y10, a2, b2, y10
  839. LFD a2, 11 * SIZE(AO2)
  840. FMADD y11, a3, b2, y11
  841. LFD a3, 11 * SIZE(AO3)
  842. FMADD y12, a4, b2, y12
  843. LFD a4, 11 * SIZE(AO4)
  844. FMADD y13, a5, b2, y13
  845. LFD a5, 11 * SIZE(AO5)
  846. FMADD y14, a6, b2, y14
  847. LFD a6, 11 * SIZE(AO6)
  848. FMADD y15, a7, b2, y15
  849. LFD a7, 11 * SIZE(AO7)
  850. FMADD y16, a8, b2, y16
  851. LFD a8, 11 * SIZE(AO8)
  852. FMADD y01, a1, b3, y01
  853. LFD a1, 12 * SIZE(AO1)
  854. FMADD y02, a2, b3, y02
  855. LFD a2, 12 * SIZE(AO2)
  856. FMADD y03, a3, b3, y03
  857. LFD a3, 12 * SIZE(AO3)
  858. FMADD y04, a4, b3, y04
  859. LFD a4, 12 * SIZE(AO4)
  860. FMADD y05, a5, b3, y05
  861. LFD a5, 12 * SIZE(AO5)
  862. FMADD y06, a6, b3, y06
  863. LFD a6, 12 * SIZE(AO6)
  864. FMADD y07, a7, b3, y07
  865. LFD a7, 12 * SIZE(AO7)
  866. FMADD y08, a8, b3, y08
  867. LFD a8, 12 * SIZE(AO8)
  868. FMADD y09, a1, b4, y09
  869. LFD a1, 13 * SIZE(AO1)
  870. FMADD y10, a2, b4, y10
  871. LFD a2, 13 * SIZE(AO2)
  872. FMADD y11, a3, b4, y11
  873. LFD a3, 13 * SIZE(AO3)
  874. FMADD y12, a4, b4, y12
  875. LFD a4, 13 * SIZE(AO4)
  876. FMADD y13, a5, b4, y13
  877. LFD a5, 13 * SIZE(AO5)
  878. FMADD y14, a6, b4, y14
  879. LFD a6, 13 * SIZE(AO6)
  880. FMADD y15, a7, b4, y15
  881. LFD a7, 13 * SIZE(AO7)
  882. FMADD y16, a8, b4, y16
  883. LFD a8, 13 * SIZE(AO8)
  884. FMADD y01, a1, b5, y01
  885. LFD a1, 14 * SIZE(AO1)
  886. FMADD y02, a2, b5, y02
  887. LFD a2, 14 * SIZE(AO2)
  888. FMADD y03, a3, b5, y03
  889. LFD a3, 14 * SIZE(AO3)
  890. FMADD y04, a4, b5, y04
  891. LFD a4, 14 * SIZE(AO4)
  892. FMADD y05, a5, b5, y05
  893. LFD a5, 14 * SIZE(AO5)
  894. FMADD y06, a6, b5, y06
  895. LFD a6, 14 * SIZE(AO6)
  896. FMADD y07, a7, b5, y07
  897. LFD a7, 14 * SIZE(AO7)
  898. FMADD y08, a8, b5, y08
  899. LFD a8, 14 * SIZE(AO8)
  900. FMADD y09, a1, b6, y09
  901. LFD a1, 15 * SIZE(AO1)
  902. FMADD y10, a2, b6, y10
  903. LFD a2, 15 * SIZE(AO2)
  904. FMADD y11, a3, b6, y11
  905. LFD a3, 15 * SIZE(AO3)
  906. FMADD y12, a4, b6, y12
  907. LFD a4, 15 * SIZE(AO4)
  908. FMADD y13, a5, b6, y13
  909. LFD a5, 15 * SIZE(AO5)
  910. FMADD y14, a6, b6, y14
  911. LFD a6, 15 * SIZE(AO6)
  912. FMADD y15, a7, b6, y15
  913. LFD a7, 15 * SIZE(AO7)
  914. FMADD y16, a8, b6, y16
  915. LFD a8, 15 * SIZE(AO8)
  916. FMADD y01, a1, b7, y01
  917. LFD a1, 16 * SIZE(AO1)
  918. FMADD y02, a2, b7, y02
  919. LFD a2, 16 * SIZE(AO2)
  920. FMADD y03, a3, b7, y03
  921. LFD a3, 16 * SIZE(AO3)
  922. FMADD y04, a4, b7, y04
  923. LFD a4, 16 * SIZE(AO4)
  924. FMADD y05, a5, b7, y05
  925. LFD a5, 16 * SIZE(AO5)
  926. FMADD y06, a6, b7, y06
  927. LFD a6, 16 * SIZE(AO6)
  928. FMADD y07, a7, b7, y07
  929. LFD a7, 16 * SIZE(AO7)
  930. FMADD y08, a8, b7, y08
  931. LFD a8, 16 * SIZE(AO8)
  /* element 16: no further loads; pointer bumps fill the gaps instead */
  932. FMADD y09, a1, b8, y09
  933. FMADD y10, a2, b8, y10
  934. FMADD y11, a3, b8, y11
  935. FMADD y12, a4, b8, y12
  936. addi AO1, AO1, 16 * SIZE
  937. addi AO2, AO2, 16 * SIZE
  938. addi AO3, AO3, 16 * SIZE
  939. addi AO4, AO4, 16 * SIZE
  940. FMADD y13, a5, b8, y13
  941. FMADD y14, a6, b8, y14
  942. FMADD y15, a7, b8, y15
  943. FMADD y16, a8, b8, y16
  944. addi AO5, AO5, 16 * SIZE
  945. addi AO6, AO6, 16 * SIZE
  946. addi AO7, AO7, 16 * SIZE
  947. addi AO8, AO8, 16 * SIZE
  948. addi BO, BO, 16 * SIZE
  949. .align 4
  /*
   * LL(14): start of the remainder handling for the 8-column pass.
   * If the low four bits of MIN_N are all clear there is nothing left and
   * control goes straight to LL(18).  Otherwise, when bit value 8 is set,
   * eight more elements of every column are accumulated here before
   * falling into LL(15).
   *
   * andi. sets CR0 from the masked value, which cannot be negative, so
   * each ble below is taken exactly when the tested bits are zero.
   *
   * Odd offsets (1, 3, 5, 7) go to y01..y08 and even offsets (2, 4, 6, 8)
   * to y09..y16.  AO1..AO8 and BO advance by 8 * SIZE.
   */
  950. LL(14):
  951. andi. r0, MIN_N, 15
  952. ble LL(18)
  953. andi. r0, MIN_N, 8
  954. ble LL(15)
  955. LFD a1, 1 * SIZE(AO1)
  956. LFD b1, 1 * SIZE(BO)
  957. LFD a2, 1 * SIZE(AO2)
  958. LFD a3, 1 * SIZE(AO3)
  959. LFD a4, 1 * SIZE(AO4)
  960. LFD a5, 1 * SIZE(AO5)
  961. LFD a6, 1 * SIZE(AO6)
  962. LFD a7, 1 * SIZE(AO7)
  963. LFD a8, 1 * SIZE(AO8)
  964. LFD b2, 2 * SIZE(BO)
  965. LFD b3, 3 * SIZE(BO)
  966. LFD b4, 4 * SIZE(BO)
  967. FMADD y01, a1, b1, y01
  968. LFD a1, 2 * SIZE(AO1)
  969. FMADD y02, a2, b1, y02
  970. LFD a2, 2 * SIZE(AO2)
  971. FMADD y03, a3, b1, y03
  972. LFD a3, 2 * SIZE(AO3)
  973. FMADD y04, a4, b1, y04
  974. LFD a4, 2 * SIZE(AO4)
  975. FMADD y05, a5, b1, y05
  976. LFD a5, 2 * SIZE(AO5)
  977. FMADD y06, a6, b1, y06
  978. LFD a6, 2 * SIZE(AO6)
  979. FMADD y07, a7, b1, y07
  980. LFD a7, 2 * SIZE(AO7)
  981. FMADD y08, a8, b1, y08
  982. LFD a8, 2 * SIZE(AO8)
  983. FMADD y09, a1, b2, y09
  984. LFD a1, 3 * SIZE(AO1)
  985. FMADD y10, a2, b2, y10
  986. LFD a2, 3 * SIZE(AO2)
  987. FMADD y11, a3, b2, y11
  988. LFD a3, 3 * SIZE(AO3)
  989. FMADD y12, a4, b2, y12
  990. LFD a4, 3 * SIZE(AO4)
  991. FMADD y13, a5, b2, y13
  992. LFD a5, 3 * SIZE(AO5)
  993. FMADD y14, a6, b2, y14
  994. LFD a6, 3 * SIZE(AO6)
  995. FMADD y15, a7, b2, y15
  996. LFD a7, 3 * SIZE(AO7)
  997. FMADD y16, a8, b2, y16
  998. LFD a8, 3 * SIZE(AO8)
  /* second half of the multiplier values for this 8-element step */
  999. LFD b5, 5 * SIZE(BO)
  1000. LFD b6, 6 * SIZE(BO)
  1001. LFD b7, 7 * SIZE(BO)
  1002. LFD b8, 8 * SIZE(BO)
  1003. FMADD y01, a1, b3, y01
  1004. LFD a1, 4 * SIZE(AO1)
  1005. FMADD y02, a2, b3, y02
  1006. LFD a2, 4 * SIZE(AO2)
  1007. FMADD y03, a3, b3, y03
  1008. LFD a3, 4 * SIZE(AO3)
  1009. FMADD y04, a4, b3, y04
  1010. LFD a4, 4 * SIZE(AO4)
  1011. FMADD y05, a5, b3, y05
  1012. LFD a5, 4 * SIZE(AO5)
  1013. FMADD y06, a6, b3, y06
  1014. LFD a6, 4 * SIZE(AO6)
  1015. FMADD y07, a7, b3, y07
  1016. LFD a7, 4 * SIZE(AO7)
  1017. FMADD y08, a8, b3, y08
  1018. LFD a8, 4 * SIZE(AO8)
  1019. FMADD y09, a1, b4, y09
  1020. LFD a1, 5 * SIZE(AO1)
  1021. FMADD y10, a2, b4, y10
  1022. LFD a2, 5 * SIZE(AO2)
  1023. FMADD y11, a3, b4, y11
  1024. LFD a3, 5 * SIZE(AO3)
  1025. FMADD y12, a4, b4, y12
  1026. LFD a4, 5 * SIZE(AO4)
  1027. FMADD y13, a5, b4, y13
  1028. LFD a5, 5 * SIZE(AO5)
  1029. FMADD y14, a6, b4, y14
  1030. LFD a6, 5 * SIZE(AO6)
  1031. FMADD y15, a7, b4, y15
  1032. LFD a7, 5 * SIZE(AO7)
  1033. FMADD y16, a8, b4, y16
  1034. LFD a8, 5 * SIZE(AO8)
  1035. FMADD y01, a1, b5, y01
  1036. LFD a1, 6 * SIZE(AO1)
  1037. FMADD y02, a2, b5, y02
  1038. LFD a2, 6 * SIZE(AO2)
  1039. FMADD y03, a3, b5, y03
  1040. LFD a3, 6 * SIZE(AO3)
  1041. FMADD y04, a4, b5, y04
  1042. LFD a4, 6 * SIZE(AO4)
  1043. FMADD y05, a5, b5, y05
  1044. LFD a5, 6 * SIZE(AO5)
  1045. FMADD y06, a6, b5, y06
  1046. LFD a6, 6 * SIZE(AO6)
  1047. FMADD y07, a7, b5, y07
  1048. LFD a7, 6 * SIZE(AO7)
  1049. FMADD y08, a8, b5, y08
  1050. LFD a8, 6 * SIZE(AO8)
  1051. FMADD y09, a1, b6, y09
  1052. LFD a1, 7 * SIZE(AO1)
  1053. FMADD y10, a2, b6, y10
  1054. LFD a2, 7 * SIZE(AO2)
  1055. FMADD y11, a3, b6, y11
  1056. LFD a3, 7 * SIZE(AO3)
  1057. FMADD y12, a4, b6, y12
  1058. LFD a4, 7 * SIZE(AO4)
  1059. FMADD y13, a5, b6, y13
  1060. LFD a5, 7 * SIZE(AO5)
  1061. FMADD y14, a6, b6, y14
  1062. LFD a6, 7 * SIZE(AO6)
  1063. FMADD y15, a7, b6, y15
  1064. LFD a7, 7 * SIZE(AO7)
  1065. FMADD y16, a8, b6, y16
  1066. LFD a8, 7 * SIZE(AO8)
  1067. FMADD y01, a1, b7, y01
  1068. LFD a1, 8 * SIZE(AO1)
  1069. FMADD y02, a2, b7, y02
  1070. LFD a2, 8 * SIZE(AO2)
  1071. FMADD y03, a3, b7, y03
  1072. LFD a3, 8 * SIZE(AO3)
  1073. FMADD y04, a4, b7, y04
  1074. LFD a4, 8 * SIZE(AO4)
  1075. FMADD y05, a5, b7, y05
  1076. LFD a5, 8 * SIZE(AO5)
  1077. FMADD y06, a6, b7, y06
  1078. LFD a6, 8 * SIZE(AO6)
  1079. FMADD y07, a7, b7, y07
  1080. LFD a7, 8 * SIZE(AO7)
  1081. FMADD y08, a8, b7, y08
  1082. LFD a8, 8 * SIZE(AO8)
  /* element 8: pointer bumps take the slots the loads used above */
  1083. FMADD y09, a1, b8, y09
  1084. addi AO1, AO1, 8 * SIZE
  1085. FMADD y10, a2, b8, y10
  1086. addi AO2, AO2, 8 * SIZE
  1087. FMADD y11, a3, b8, y11
  1088. addi AO3, AO3, 8 * SIZE
  1089. FMADD y12, a4, b8, y12
  1090. addi AO4, AO4, 8 * SIZE
  1091. FMADD y13, a5, b8, y13
  1092. addi AO5, AO5, 8 * SIZE
  1093. FMADD y14, a6, b8, y14
  1094. addi AO6, AO6, 8 * SIZE
  1095. FMADD y15, a7, b8, y15
  1096. addi AO7, AO7, 8 * SIZE
  1097. FMADD y16, a8, b8, y16
  1098. addi AO8, AO8, 8 * SIZE
  1099. addi BO, BO, 8 * SIZE
  1100. .align 4
  /*
   * LL(15): 8-column remainder, four elements.
   * Skipped (branch to LL(16)) when bit value 4 of MIN_N is clear.
   * Offsets 1 and 3 accumulate into y01..y08, offsets 2 and 4 into
   * y09..y16.  AO1..AO8 and BO advance by 4 * SIZE.
   */
  1101. LL(15):
  1102. andi. r0, MIN_N, 4
  1103. ble LL(16)
  1104. LFD a1, 1 * SIZE(AO1)
  1105. LFD b1, 1 * SIZE(BO)
  1106. LFD a2, 1 * SIZE(AO2)
  1107. LFD a3, 1 * SIZE(AO3)
  1108. LFD a4, 1 * SIZE(AO4)
  1109. LFD a5, 1 * SIZE(AO5)
  1110. LFD a6, 1 * SIZE(AO6)
  1111. LFD a7, 1 * SIZE(AO7)
  1112. LFD a8, 1 * SIZE(AO8)
  1113. LFD b2, 2 * SIZE(BO)
  1114. LFD b3, 3 * SIZE(BO)
  1115. LFD b4, 4 * SIZE(BO)
  1116. FMADD y01, a1, b1, y01
  1117. LFD a1, 2 * SIZE(AO1)
  1118. FMADD y02, a2, b1, y02
  1119. LFD a2, 2 * SIZE(AO2)
  1120. FMADD y03, a3, b1, y03
  1121. LFD a3, 2 * SIZE(AO3)
  1122. FMADD y04, a4, b1, y04
  1123. LFD a4, 2 * SIZE(AO4)
  1124. FMADD y05, a5, b1, y05
  1125. LFD a5, 2 * SIZE(AO5)
  1126. FMADD y06, a6, b1, y06
  1127. LFD a6, 2 * SIZE(AO6)
  1128. FMADD y07, a7, b1, y07
  1129. LFD a7, 2 * SIZE(AO7)
  1130. FMADD y08, a8, b1, y08
  1131. LFD a8, 2 * SIZE(AO8)
  1132. FMADD y09, a1, b2, y09
  1133. LFD a1, 3 * SIZE(AO1)
  1134. FMADD y10, a2, b2, y10
  1135. LFD a2, 3 * SIZE(AO2)
  1136. FMADD y11, a3, b2, y11
  1137. LFD a3, 3 * SIZE(AO3)
  1138. FMADD y12, a4, b2, y12
  1139. LFD a4, 3 * SIZE(AO4)
  1140. FMADD y13, a5, b2, y13
  1141. LFD a5, 3 * SIZE(AO5)
  1142. FMADD y14, a6, b2, y14
  1143. LFD a6, 3 * SIZE(AO6)
  1144. FMADD y15, a7, b2, y15
  1145. LFD a7, 3 * SIZE(AO7)
  1146. FMADD y16, a8, b2, y16
  1147. LFD a8, 3 * SIZE(AO8)
  1148. FMADD y01, a1, b3, y01
  1149. LFD a1, 4 * SIZE(AO1)
  1150. FMADD y02, a2, b3, y02
  1151. LFD a2, 4 * SIZE(AO2)
  1152. FMADD y03, a3, b3, y03
  1153. LFD a3, 4 * SIZE(AO3)
  1154. FMADD y04, a4, b3, y04
  1155. LFD a4, 4 * SIZE(AO4)
  1156. FMADD y05, a5, b3, y05
  1157. LFD a5, 4 * SIZE(AO5)
  1158. FMADD y06, a6, b3, y06
  1159. LFD a6, 4 * SIZE(AO6)
  1160. FMADD y07, a7, b3, y07
  1161. LFD a7, 4 * SIZE(AO7)
  1162. FMADD y08, a8, b3, y08
  1163. LFD a8, 4 * SIZE(AO8)
  /* element 4: interleave the pointer bumps with the last FMADDs */
  1164. FMADD y09, a1, b4, y09
  1165. addi AO1, AO1, 4 * SIZE
  1166. FMADD y10, a2, b4, y10
  1167. addi AO2, AO2, 4 * SIZE
  1168. FMADD y11, a3, b4, y11
  1169. addi AO3, AO3, 4 * SIZE
  1170. FMADD y12, a4, b4, y12
  1171. addi AO4, AO4, 4 * SIZE
  1172. FMADD y13, a5, b4, y13
  1173. addi AO5, AO5, 4 * SIZE
  1174. FMADD y14, a6, b4, y14
  1175. addi AO6, AO6, 4 * SIZE
  1176. FMADD y15, a7, b4, y15
  1177. addi AO7, AO7, 4 * SIZE
  1178. FMADD y16, a8, b4, y16
  1179. addi AO8, AO8, 4 * SIZE
  1180. addi BO, BO, 4 * SIZE
  1181. .align 4
  /*
   * LL(16): 8-column remainder, two elements.
   * Skipped (branch to LL(17)) when bit value 2 of MIN_N is clear.
   * Offset 1 accumulates into y01..y08 and offset 2 into y09..y16.
   * AO1..AO8 and BO advance by 2 * SIZE.
   */
  1182. LL(16):
  1183. andi. r0, MIN_N, 2
  1184. ble LL(17)
  1185. LFD a1, 1 * SIZE(AO1)
  1186. LFD b1, 1 * SIZE(BO)
  1187. LFD a2, 1 * SIZE(AO2)
  1188. LFD a3, 1 * SIZE(AO3)
  1189. LFD a4, 1 * SIZE(AO4)
  1190. LFD a5, 1 * SIZE(AO5)
  1191. LFD a6, 1 * SIZE(AO6)
  1192. LFD a7, 1 * SIZE(AO7)
  1193. LFD a8, 1 * SIZE(AO8)
  1194. LFD b2, 2 * SIZE(BO)
  1195. FMADD y01, a1, b1, y01
  1196. LFD a1, 2 * SIZE(AO1)
  1197. FMADD y02, a2, b1, y02
  1198. LFD a2, 2 * SIZE(AO2)
  1199. FMADD y03, a3, b1, y03
  1200. LFD a3, 2 * SIZE(AO3)
  1201. FMADD y04, a4, b1, y04
  1202. LFD a4, 2 * SIZE(AO4)
  1203. FMADD y05, a5, b1, y05
  1204. LFD a5, 2 * SIZE(AO5)
  1205. FMADD y06, a6, b1, y06
  1206. LFD a6, 2 * SIZE(AO6)
  1207. FMADD y07, a7, b1, y07
  1208. LFD a7, 2 * SIZE(AO7)
  1209. FMADD y08, a8, b1, y08
  1210. LFD a8, 2 * SIZE(AO8)
  /* element 2: every column value is already in a register */
  1211. FMADD y09, a1, b2, y09
  1212. addi AO1, AO1, 2 * SIZE
  1213. addi AO2, AO2, 2 * SIZE
  1214. FMADD y10, a2, b2, y10
  1215. addi AO3, AO3, 2 * SIZE
  1216. addi AO4, AO4, 2 * SIZE
  1217. FMADD y11, a3, b2, y11
  1218. FMADD y12, a4, b2, y12
  1219. addi AO5, AO5, 2 * SIZE
  1220. addi AO6, AO6, 2 * SIZE
  1221. FMADD y13, a5, b2, y13
  1222. FMADD y14, a6, b2, y14
  1223. addi AO7, AO7, 2 * SIZE
  1224. addi AO8, AO8, 2 * SIZE
  1225. FMADD y15, a7, b2, y15
  1226. FMADD y16, a8, b2, y16
  1227. addi BO, BO, 2 * SIZE
  1228. .align 4
  /*
   * LL(17): 8-column remainder, final single element.
   * Skipped (branch to LL(18)) when bit value 1 of MIN_N is clear.
   * The one remaining element of each column is accumulated into
   * y01..y08.  The pointers are not advanced here; this block contains
   * no addi.
   */
  1229. LL(17):
  1230. andi. r0, MIN_N, 1
  1231. ble LL(18)
  1232. LFD a1, 1 * SIZE(AO1)
  1233. LFD b1, 1 * SIZE(BO)
  1234. LFD a2, 1 * SIZE(AO2)
  1235. LFD a3, 1 * SIZE(AO3)
  1236. LFD a4, 1 * SIZE(AO4)
  1237. LFD a5, 1 * SIZE(AO5)
  1238. LFD a6, 1 * SIZE(AO6)
  1239. LFD a7, 1 * SIZE(AO7)
  1240. LFD a8, 1 * SIZE(AO8)
  1241. FMADD y01, a1, b1, y01
  1242. FMADD y02, a2, b1, y02
  1243. FMADD y03, a3, b1, y03
  1244. FMADD y04, a4, b1, y04
  1245. FMADD y05, a5, b1, y05
  1246. FMADD y06, a6, b1, y06
  1247. FMADD y07, a7, b1, y07
  1248. FMADD y08, a8, b1, y08
  1249. .align 4
  /*
   * LL(18): write back the eight results of the 8-column pass.
   *
   * BO receives a copy of CO because the strided path at LL(19) loads
   * through CO and stores through BO.  alpha is reloaded from ALPHA.
   * When INCY differs from SIZE control goes to LL(19); otherwise the
   * contiguous path below runs:
   *   - read the eight current output values from 1..8 * SIZE(CO),
   *   - fold each partial sum y09..y16 into its partner y01..y08,
   *   - compute value + alpha * sum and store it back in place.
   * CO then advances by 8 * SIZE and J is decremented; the pass repeats
   * from LL(11) while J > 0, otherwise control moves on to LL(20).
   */
  1250. LL(18):
  1251. mr BO, CO
  1252. lfd alpha, ALPHA
  1253. cmpi cr0, 0, INCY, SIZE
  1254. bne LL(19)
  1255. LFD a1, 1 * SIZE(CO)
  1256. LFD a2, 2 * SIZE(CO)
  1257. LFD a3, 3 * SIZE(CO)
  1258. LFD a4, 4 * SIZE(CO)
  1259. LFD a5, 5 * SIZE(CO)
  1260. LFD a6, 6 * SIZE(CO)
  1261. LFD a7, 7 * SIZE(CO)
  1262. LFD a8, 8 * SIZE(CO)
  /* merge the odd-offset and even-offset partial sums */
  1263. FADD y01, y09, y01
  1264. FADD y02, y10, y02
  1265. FADD y03, y11, y03
  1266. FADD y04, y12, y04
  1267. FADD y05, y13, y05
  1268. FADD y06, y14, y06
  1269. FADD y07, y15, y07
  1270. FADD y08, y16, y08
  /* aN = alpha * yNN + aN */
  1271. FMADD a1, alpha, y01, a1
  1272. FMADD a2, alpha, y02, a2
  1273. FMADD a3, alpha, y03, a3
  1274. FMADD a4, alpha, y04, a4
  1275. FMADD a5, alpha, y05, a5
  1276. FMADD a6, alpha, y06, a6
  1277. FMADD a7, alpha, y07, a7
  1278. FMADD a8, alpha, y08, a8
  1279. STFD a1, 1 * SIZE(CO)
  1280. STFD a2, 2 * SIZE(CO)
  1281. STFD a3, 3 * SIZE(CO)
  1282. STFD a4, 4 * SIZE(CO)
  1283. STFD a5, 5 * SIZE(CO)
  1284. STFD a6, 6 * SIZE(CO)
  1285. STFD a7, 7 * SIZE(CO)
  1286. STFD a8, 8 * SIZE(CO)
  1287. addi J, J, -1
  1288. addi CO, CO, 8 * SIZE
  1289. cmpi cr0, 0, J, 0
  1290. bgt LL(11)
  1291. b LL(20)
  1292. .align 4
  /*
   * LL(19): strided write-back of the eight results of the 8-column pass
   * (taken from LL(18) when INCY differs from SIZE).
   *
   * LFDUX advances CO by INCY before each load, so the eight current
   * output values are gathered while CO walks forward.  BO, which LL(18)
   * set to the old CO, trails behind and is advanced the same way by the
   * STFDUX stores, so every result lands where its input was read.
   * CO is therefore already positioned for the next pass and needs no
   * explicit addi.  J is decremented and the pass repeats from LL(11)
   * while J > 0; otherwise execution falls through to LL(20).
   *
   * The scaling FMADDs name the accumulators y01..y08 that the FADDs just
   * above write, matching the contiguous path in LL(18), instead of
   * spelling out raw f0..f7 and silently relying on the register
   * assignment of y01..y08 made elsewhere in the file.
   */
  1293. LL(19):
  1294. LFDUX a1, CO, INCY
  1295. LFDUX a2, CO, INCY
  1296. LFDUX a3, CO, INCY
  1297. LFDUX a4, CO, INCY
  1298. LFDUX a5, CO, INCY
  1299. LFDUX a6, CO, INCY
  1300. LFDUX a7, CO, INCY
  1301. LFDUX a8, CO, INCY
  /* merge the odd-offset and even-offset partial sums */
  1302. FADD y01, y09, y01
  1303. FADD y02, y10, y02
  1304. FADD y03, y11, y03
  1305. FADD y04, y12, y04
  1306. FADD y05, y13, y05
  1307. FADD y06, y14, y06
  1308. FADD y07, y15, y07
  1309. FADD y08, y16, y08
  /* aN = alpha * yNN + aN */
  1310. FMADD a1, alpha, y01, a1
  1311. FMADD a2, alpha, y02, a2
  1312. FMADD a3, alpha, y03, a3
  1313. FMADD a4, alpha, y04, a4
  1314. FMADD a5, alpha, y05, a5
  1315. FMADD a6, alpha, y06, a6
  1316. FMADD a7, alpha, y07, a7
  1317. FMADD a8, alpha, y08, a8
  1318. STFDUX a1, BO, INCY
  1319. STFDUX a2, BO, INCY
  1320. STFDUX a3, BO, INCY
  1321. STFDUX a4, BO, INCY
  1322. STFDUX a5, BO, INCY
  1323. STFDUX a6, BO, INCY
  1324. STFDUX a7, BO, INCY
  1325. STFDUX a8, BO, INCY
  1326. addi J, J, -1
  1327. cmpi cr0, 0, J, 0
  1328. bgt LL(11)
  1329. .align 4
  /*
   * LL(20): column remainder after the 8-column passes.
   * N & 7 == 0 means no columns are left: branch to LL(99).
   * N & 4 == 0 means no group of four: branch to LL(30).
   * Otherwise a 4-column pass is set up here:
   *   - AO1..AO4 point at four consecutive columns LDA apart, and A is
   *     moved past them;
   *   - BO restarts at XP;
   *   - y01..y04 and y09..y12 are cleared from FZERO;
   *   - CTR = MIN_N >> 4, the number of 16-element chunks.  With no full
   *     chunk control goes to LL(24).
   * a1..a4 are preloaded with offset 1 and a5..a8 with offset 2 of the
   * four columns, b1..b8 with offsets 1..8 of BO.  bdz decrements CTR and
   * jumps to LL(23) when only a single chunk exists; otherwise execution
   * falls into the loop at LL(22).
   */
  1330. LL(20):
  1331. andi. J, N, 7
  1332. ble LL(99)
  1333. andi. J, N, 4
  1334. ble LL(30)
  1335. mr AO1, A
  1336. add AO2, A, LDA
  1337. add AO3, AO2, LDA
  1338. add AO4, AO3, LDA
  1339. add A, AO4, LDA
  1340. mr BO, XP
  1341. lfd y01, FZERO
  1342. fmr y02, y01
  1343. fmr y03, y01
  1344. fmr y04, y01
  1345. fmr y09, y01
  1346. fmr y10, y01
  1347. fmr y11, y01
  1348. fmr y12, y01
  /* DCBT is a macro defined outside this chunk; presumably a cache-touch hint */
  1349. DCBT(Y1, PREC)
  /* the ble below still tests CR0 as set by srawi.; mtspr does not change it */
  1350. srawi. r0, MIN_N, 4
  1351. mtspr CTR, r0
  1352. ble LL(24)
  1353. LFD a1, 1 * SIZE(AO1)
  1354. LFD a2, 1 * SIZE(AO2)
  1355. LFD a3, 1 * SIZE(AO3)
  1356. LFD a4, 1 * SIZE(AO4)
  1357. LFD a5, 2 * SIZE(AO1)
  1358. LFD a6, 2 * SIZE(AO2)
  1359. LFD a7, 2 * SIZE(AO3)
  1360. LFD a8, 2 * SIZE(AO4)
  1361. LFD b1, 1 * SIZE(BO)
  1362. LFD b2, 2 * SIZE(BO)
  1363. LFD b3, 3 * SIZE(BO)
  1364. LFD b4, 4 * SIZE(BO)
  1365. LFD b5, 5 * SIZE(BO)
  1366. LFD b6, 6 * SIZE(BO)
  1367. LFD b7, 7 * SIZE(BO)
  1368. LFD b8, 8 * SIZE(BO)
  1369. bdz LL(23)
  1370. .align 4
  /*
   * LL(22): main loop of the 4-column pass, 16 elements per iteration.
   *
   * Register roles: a1..a4 hold an odd-offset element of columns
   * AO1..AO4 and feed y01..y04; a5..a8 hold the following even-offset
   * element and feed y09..y12.  Every FMADD is followed by the LFD that
   * reloads the register it just consumed with the element two offsets
   * further on.  b1..b8 cycle through the BO values in groups of four.
   *
   * The tail of the body loads offsets 17 and 18 of the columns and
   * 17..24 of BO, i.e. offsets 1, 2 and 1..8 once the pointers have been
   * advanced by 16 * SIZE, so the next iteration (or LL(23)) starts with
   * its operands already in registers.  DCBT is issued on each advanced
   * column pointer with PREA.  bdnz repeats while CTR is non-zero.
   */
  1371. LL(22):
  1372. FMADD y01, a1, b1, y01
  1373. LFD a1, 3 * SIZE(AO1)
  1374. FMADD y02, a2, b1, y02
  1375. LFD a2, 3 * SIZE(AO2)
  1376. FMADD y03, a3, b1, y03
  1377. LFD a3, 3 * SIZE(AO3)
  1378. FMADD y04, a4, b1, y04
  1379. LFD a4, 3 * SIZE(AO4)
  1380. FMADD y09, a5, b2, y09
  1381. LFD a5, 4 * SIZE(AO1)
  1382. FMADD y10, a6, b2, y10
  1383. LFD a6, 4 * SIZE(AO2)
  1384. FMADD y11, a7, b2, y11
  1385. LFD a7, 4 * SIZE(AO3)
  1386. FMADD y12, a8, b2, y12
  1387. LFD a8, 4 * SIZE(AO4)
  1388. FMADD y01, a1, b3, y01
  1389. LFD a1, 5 * SIZE(AO1)
  1390. FMADD y02, a2, b3, y02
  1391. LFD a2, 5 * SIZE(AO2)
  1392. FMADD y03, a3, b3, y03
  1393. LFD a3, 5 * SIZE(AO3)
  1394. FMADD y04, a4, b3, y04
  1395. LFD a4, 5 * SIZE(AO4)
  1396. FMADD y09, a5, b4, y09
  1397. LFD a5, 6 * SIZE(AO1)
  1398. FMADD y10, a6, b4, y10
  1399. LFD a6, 6 * SIZE(AO2)
  1400. FMADD y11, a7, b4, y11
  1401. LFD a7, 6 * SIZE(AO3)
  1402. FMADD y12, a8, b4, y12
  1403. LFD a8, 6 * SIZE(AO4)
  /* b1..b4 consumed: refill with offsets 9..12 */
  1404. LFD b1, 9 * SIZE(BO)
  1405. LFD b2, 10 * SIZE(BO)
  1406. LFD b3, 11 * SIZE(BO)
  1407. LFD b4, 12 * SIZE(BO)
  1408. FMADD y01, a1, b5, y01
  1409. LFD a1, 7 * SIZE(AO1)
  1410. FMADD y02, a2, b5, y02
  1411. LFD a2, 7 * SIZE(AO2)
  1412. FMADD y03, a3, b5, y03
  1413. LFD a3, 7 * SIZE(AO3)
  1414. FMADD y04, a4, b5, y04
  1415. LFD a4, 7 * SIZE(AO4)
  1416. FMADD y09, a5, b6, y09
  1417. LFD a5, 8 * SIZE(AO1)
  1418. FMADD y10, a6, b6, y10
  1419. LFD a6, 8 * SIZE(AO2)
  1420. FMADD y11, a7, b6, y11
  1421. LFD a7, 8 * SIZE(AO3)
  1422. FMADD y12, a8, b6, y12
  1423. LFD a8, 8 * SIZE(AO4)
  1424. FMADD y01, a1, b7, y01
  1425. LFD a1, 9 * SIZE(AO1)
  1426. FMADD y02, a2, b7, y02
  1427. LFD a2, 9 * SIZE(AO2)
  1428. FMADD y03, a3, b7, y03
  1429. LFD a3, 9 * SIZE(AO3)
  1430. FMADD y04, a4, b7, y04
  1431. LFD a4, 9 * SIZE(AO4)
  1432. FMADD y09, a5, b8, y09
  1433. LFD a5, 10 * SIZE(AO1)
  1434. FMADD y10, a6, b8, y10
  1435. LFD a6, 10 * SIZE(AO2)
  1436. FMADD y11, a7, b8, y11
  1437. LFD a7, 10 * SIZE(AO3)
  1438. FMADD y12, a8, b8, y12
  1439. LFD a8, 10 * SIZE(AO4)
  /* b5..b8 consumed: refill with offsets 13..16 */
  1440. LFD b5, 13 * SIZE(BO)
  1441. LFD b6, 14 * SIZE(BO)
  1442. LFD b7, 15 * SIZE(BO)
  1443. LFD b8, 16 * SIZE(BO)
  1444. FMADD y01, a1, b1, y01
  1445. LFD a1, 11 * SIZE(AO1)
  1446. FMADD y02, a2, b1, y02
  1447. LFD a2, 11 * SIZE(AO2)
  1448. FMADD y03, a3, b1, y03
  1449. LFD a3, 11 * SIZE(AO3)
  1450. FMADD y04, a4, b1, y04
  1451. LFD a4, 11 * SIZE(AO4)
  1452. FMADD y09, a5, b2, y09
  1453. LFD a5, 12 * SIZE(AO1)
  1454. FMADD y10, a6, b2, y10
  1455. LFD a6, 12 * SIZE(AO2)
  1456. FMADD y11, a7, b2, y11
  1457. LFD a7, 12 * SIZE(AO3)
  1458. FMADD y12, a8, b2, y12
  1459. LFD a8, 12 * SIZE(AO4)
  1460. FMADD y01, a1, b3, y01
  1461. LFD a1, 13 * SIZE(AO1)
  1462. FMADD y02, a2, b3, y02
  1463. LFD a2, 13 * SIZE(AO2)
  1464. FMADD y03, a3, b3, y03
  1465. LFD a3, 13 * SIZE(AO3)
  1466. FMADD y04, a4, b3, y04
  1467. LFD a4, 13 * SIZE(AO4)
  1468. FMADD y09, a5, b4, y09
  1469. LFD a5, 14 * SIZE(AO1)
  1470. FMADD y10, a6, b4, y10
  1471. LFD a6, 14 * SIZE(AO2)
  1472. FMADD y11, a7, b4, y11
  1473. LFD a7, 14 * SIZE(AO3)
  1474. FMADD y12, a8, b4, y12
  1475. LFD a8, 14 * SIZE(AO4)
  /* look-ahead: offsets 17..20 become 1..4 after BO advances below */
  1476. LFD b1, 17 * SIZE(BO)
  1477. LFD b2, 18 * SIZE(BO)
  1478. LFD b3, 19 * SIZE(BO)
  1479. LFD b4, 20 * SIZE(BO)
  1480. FMADD y01, a1, b5, y01
  1481. LFD a1, 15 * SIZE(AO1)
  1482. FMADD y02, a2, b5, y02
  1483. LFD a2, 15 * SIZE(AO2)
  1484. FMADD y03, a3, b5, y03
  1485. LFD a3, 15 * SIZE(AO3)
  1486. FMADD y04, a4, b5, y04
  1487. LFD a4, 15 * SIZE(AO4)
  1488. FMADD y09, a5, b6, y09
  1489. LFD a5, 16 * SIZE(AO1)
  1490. FMADD y10, a6, b6, y10
  1491. LFD a6, 16 * SIZE(AO2)
  1492. FMADD y11, a7, b6, y11
  1493. LFD a7, 16 * SIZE(AO3)
  1494. FMADD y12, a8, b6, y12
  1495. LFD a8, 16 * SIZE(AO4)
  /* the loads from here on fetch column data for the next chunk */
  1496. FMADD y01, a1, b7, y01
  1497. LFD a1, 17 * SIZE(AO1)
  1498. FMADD y02, a2, b7, y02
  1499. LFD a2, 17 * SIZE(AO2)
  1500. FMADD y03, a3, b7, y03
  1501. LFD a3, 17 * SIZE(AO3)
  1502. FMADD y04, a4, b7, y04
  1503. LFD a4, 17 * SIZE(AO4)
  1504. FMADD y09, a5, b8, y09
  1505. LFD a5, 18 * SIZE(AO1)
  1506. FMADD y10, a6, b8, y10
  1507. LFD a6, 18 * SIZE(AO2)
  1508. FMADD y11, a7, b8, y11
  1509. LFD a7, 18 * SIZE(AO3)
  1510. FMADD y12, a8, b8, y12
  1511. LFD a8, 18 * SIZE(AO4)
  /* look-ahead: offsets 21..24 become 5..8 after BO advances below */
  1512. LFD b5, 21 * SIZE(BO)
  1513. LFD b6, 22 * SIZE(BO)
  1514. LFD b7, 23 * SIZE(BO)
  1515. LFD b8, 24 * SIZE(BO)
  1516. addi AO1, AO1, 16 * SIZE
  1517. addi AO2, AO2, 16 * SIZE
  1518. DCBT(AO1, PREA)
  1519. DCBT(AO2, PREA)
  1520. addi AO3, AO3, 16 * SIZE
  1521. addi AO4, AO4, 16 * SIZE
  1522. DCBT(AO3, PREA)
  1523. DCBT(AO4, PREA)
  1524. addi BO, BO, 16 * SIZE
  1525. bdnz LL(22)
  1526. .align 4
  /*
   * LL(23): last 16-element chunk of the 4-column pass.
   * Reached from the bdz in LL(20) or by falling out of the LL(22) loop;
   * either way a1..a8 and b1..b8 already hold offsets 1, 2 and 1..8.
   *
   * Same schedule as LL(22) but without the look-ahead: nothing beyond
   * 16 * SIZE is loaded, and no DCBT is issued.  Odd offsets accumulate
   * into y01..y04 and even offsets into y09..y12.  AO1..AO4 and BO
   * advance by 16 * SIZE at the end.
   */
  1527. LL(23):
  1528. FMADD y01, a1, b1, y01
  1529. LFD a1, 3 * SIZE(AO1)
  1530. FMADD y02, a2, b1, y02
  1531. LFD a2, 3 * SIZE(AO2)
  1532. FMADD y03, a3, b1, y03
  1533. LFD a3, 3 * SIZE(AO3)
  1534. FMADD y04, a4, b1, y04
  1535. LFD a4, 3 * SIZE(AO4)
  1536. FMADD y09, a5, b2, y09
  1537. LFD a5, 4 * SIZE(AO1)
  1538. FMADD y10, a6, b2, y10
  1539. LFD a6, 4 * SIZE(AO2)
  1540. FMADD y11, a7, b2, y11
  1541. LFD a7, 4 * SIZE(AO3)
  1542. FMADD y12, a8, b2, y12
  1543. LFD a8, 4 * SIZE(AO4)
  1544. FMADD y01, a1, b3, y01
  1545. LFD a1, 5 * SIZE(AO1)
  1546. FMADD y02, a2, b3, y02
  1547. LFD a2, 5 * SIZE(AO2)
  1548. FMADD y03, a3, b3, y03
  1549. LFD a3, 5 * SIZE(AO3)
  1550. FMADD y04, a4, b3, y04
  1551. LFD a4, 5 * SIZE(AO4)
  1552. FMADD y09, a5, b4, y09
  1553. LFD a5, 6 * SIZE(AO1)
  1554. FMADD y10, a6, b4, y10
  1555. LFD a6, 6 * SIZE(AO2)
  1556. FMADD y11, a7, b4, y11
  1557. LFD a7, 6 * SIZE(AO3)
  1558. FMADD y12, a8, b4, y12
  1559. LFD a8, 6 * SIZE(AO4)
  /* b1..b4 consumed: refill with offsets 9..12 */
  1560. LFD b1, 9 * SIZE(BO)
  1561. LFD b2, 10 * SIZE(BO)
  1562. LFD b3, 11 * SIZE(BO)
  1563. LFD b4, 12 * SIZE(BO)
  1564. FMADD y01, a1, b5, y01
  1565. LFD a1, 7 * SIZE(AO1)
  1566. FMADD y02, a2, b5, y02
  1567. LFD a2, 7 * SIZE(AO2)
  1568. FMADD y03, a3, b5, y03
  1569. LFD a3, 7 * SIZE(AO3)
  1570. FMADD y04, a4, b5, y04
  1571. LFD a4, 7 * SIZE(AO4)
  1572. FMADD y09, a5, b6, y09
  1573. LFD a5, 8 * SIZE(AO1)
  1574. FMADD y10, a6, b6, y10
  1575. LFD a6, 8 * SIZE(AO2)
  1576. FMADD y11, a7, b6, y11
  1577. LFD a7, 8 * SIZE(AO3)
  1578. FMADD y12, a8, b6, y12
  1579. LFD a8, 8 * SIZE(AO4)
  1580. FMADD y01, a1, b7, y01
  1581. LFD a1, 9 * SIZE(AO1)
  1582. FMADD y02, a2, b7, y02
  1583. LFD a2, 9 * SIZE(AO2)
  1584. FMADD y03, a3, b7, y03
  1585. LFD a3, 9 * SIZE(AO3)
  1586. FMADD y04, a4, b7, y04
  1587. LFD a4, 9 * SIZE(AO4)
  1588. FMADD y09, a5, b8, y09
  1589. LFD a5, 10 * SIZE(AO1)
  1590. FMADD y10, a6, b8, y10
  1591. LFD a6, 10 * SIZE(AO2)
  1592. FMADD y11, a7, b8, y11
  1593. LFD a7, 10 * SIZE(AO3)
  1594. FMADD y12, a8, b8, y12
  1595. LFD a8, 10 * SIZE(AO4)
  /* b5..b8 consumed: refill with offsets 13..16 */
  1596. LFD b5, 13 * SIZE(BO)
  1597. LFD b6, 14 * SIZE(BO)
  1598. LFD b7, 15 * SIZE(BO)
  1599. LFD b8, 16 * SIZE(BO)
  1600. FMADD y01, a1, b1, y01
  1601. LFD a1, 11 * SIZE(AO1)
  1602. FMADD y02, a2, b1, y02
  1603. LFD a2, 11 * SIZE(AO2)
  1604. FMADD y03, a3, b1, y03
  1605. LFD a3, 11 * SIZE(AO3)
  1606. FMADD y04, a4, b1, y04
  1607. LFD a4, 11 * SIZE(AO4)
  1608. FMADD y09, a5, b2, y09
  1609. LFD a5, 12 * SIZE(AO1)
  1610. FMADD y10, a6, b2, y10
  1611. LFD a6, 12 * SIZE(AO2)
  1612. FMADD y11, a7, b2, y11
  1613. LFD a7, 12 * SIZE(AO3)
  1614. FMADD y12, a8, b2, y12
  1615. LFD a8, 12 * SIZE(AO4)
  1616. FMADD y01, a1, b3, y01
  1617. LFD a1, 13 * SIZE(AO1)
  1618. FMADD y02, a2, b3, y02
  1619. LFD a2, 13 * SIZE(AO2)
  1620. FMADD y03, a3, b3, y03
  1621. LFD a3, 13 * SIZE(AO3)
  1622. FMADD y04, a4, b3, y04
  1623. LFD a4, 13 * SIZE(AO4)
  1624. FMADD y09, a5, b4, y09
  1625. LFD a5, 14 * SIZE(AO1)
  1626. FMADD y10, a6, b4, y10
  1627. LFD a6, 14 * SIZE(AO2)
  1628. FMADD y11, a7, b4, y11
  1629. LFD a7, 14 * SIZE(AO3)
  1630. FMADD y12, a8, b4, y12
  1631. LFD a8, 14 * SIZE(AO4)
  1632. FMADD y01, a1, b5, y01
  1633. LFD a1, 15 * SIZE(AO1)
  1634. FMADD y02, a2, b5, y02
  1635. LFD a2, 15 * SIZE(AO2)
  1636. FMADD y03, a3, b5, y03
  1637. LFD a3, 15 * SIZE(AO3)
  1638. FMADD y04, a4, b5, y04
  1639. LFD a4, 15 * SIZE(AO4)
  1640. FMADD y09, a5, b6, y09
  1641. LFD a5, 16 * SIZE(AO1)
  1642. FMADD y10, a6, b6, y10
  1643. LFD a6, 16 * SIZE(AO2)
  1644. FMADD y11, a7, b6, y11
  1645. LFD a7, 16 * SIZE(AO3)
  1646. FMADD y12, a8, b6, y12
  1647. LFD a8, 16 * SIZE(AO4)
  /* offsets 15 and 16: all operands are in registers, no more loads */
  1648. FMADD y01, a1, b7, y01
  1649. FMADD y02, a2, b7, y02
  1650. FMADD y03, a3, b7, y03
  1651. FMADD y04, a4, b7, y04
  1652. FMADD y09, a5, b8, y09
  1653. FMADD y10, a6, b8, y10
  1654. FMADD y11, a7, b8, y11
  1655. FMADD y12, a8, b8, y12
  1656. addi AO1, AO1, 16 * SIZE
  1657. addi AO2, AO2, 16 * SIZE
  1658. addi AO3, AO3, 16 * SIZE
  1659. addi AO4, AO4, 16 * SIZE
  1660. addi BO, BO, 16 * SIZE
  1661. .align 4
  /*
   * LL(24): start of the remainder handling for the 4-column pass.
   * If the low four bits of MIN_N are all clear control goes straight to
   * LL(28).  Otherwise, when bit value 8 is set, eight more elements of
   * each of the four columns are accumulated here before falling into
   * LL(25).
   *
   * Only b1..b4 are used: they first hold offsets 1..4 of BO and are then
   * reloaded with offsets 5..8.  Odd offsets accumulate into y01..y04 and
   * even offsets into y09..y12.  AO1..AO4 and BO advance by 8 * SIZE.
   */
  1662. LL(24):
  1663. andi. r0, MIN_N, 15
  1664. ble LL(28)
  1665. andi. r0, MIN_N, 8
  1666. ble LL(25)
  1667. LFD a1, 1 * SIZE(AO1)
  1668. LFD a2, 1 * SIZE(AO2)
  1669. LFD a3, 1 * SIZE(AO3)
  1670. LFD a4, 1 * SIZE(AO4)
  1671. LFD b1, 1 * SIZE(BO)
  1672. LFD b2, 2 * SIZE(BO)
  1673. LFD b3, 3 * SIZE(BO)
  1674. LFD b4, 4 * SIZE(BO)
  1675. LFD a5, 2 * SIZE(AO1)
  1676. LFD a6, 2 * SIZE(AO2)
  1677. LFD a7, 2 * SIZE(AO3)
  1678. LFD a8, 2 * SIZE(AO4)
  1679. FMADD y01, a1, b1, y01
  1680. LFD a1, 3 * SIZE(AO1)
  1681. FMADD y02, a2, b1, y02
  1682. LFD a2, 3 * SIZE(AO2)
  1683. FMADD y03, a3, b1, y03
  1684. LFD a3, 3 * SIZE(AO3)
  1685. FMADD y04, a4, b1, y04
  1686. LFD a4, 3 * SIZE(AO4)
  1687. FMADD y09, a5, b2, y09
  1688. LFD a5, 4 * SIZE(AO1)
  1689. FMADD y10, a6, b2, y10
  1690. LFD a6, 4 * SIZE(AO2)
  1691. FMADD y11, a7, b2, y11
  1692. LFD a7, 4 * SIZE(AO3)
  1693. FMADD y12, a8, b2, y12
  1694. LFD a8, 4 * SIZE(AO4)
  1695. FMADD y01, a1, b3, y01
  1696. LFD a1, 5 * SIZE(AO1)
  1697. FMADD y02, a2, b3, y02
  1698. LFD a2, 5 * SIZE(AO2)
  1699. FMADD y03, a3, b3, y03
  1700. LFD a3, 5 * SIZE(AO3)
  1701. FMADD y04, a4, b3, y04
  1702. LFD a4, 5 * SIZE(AO4)
  1703. FMADD y09, a5, b4, y09
  1704. LFD a5, 6 * SIZE(AO1)
  1705. FMADD y10, a6, b4, y10
  1706. LFD a6, 6 * SIZE(AO2)
  1707. FMADD y11, a7, b4, y11
  1708. LFD a7, 6 * SIZE(AO3)
  1709. FMADD y12, a8, b4, y12
  1710. LFD a8, 6 * SIZE(AO4)
  /* reuse b1..b4 for offsets 5..8 */
  1711. LFD b1, 5 * SIZE(BO)
  1712. LFD b2, 6 * SIZE(BO)
  1713. LFD b3, 7 * SIZE(BO)
  1714. LFD b4, 8 * SIZE(BO)
  1715. FMADD y01, a1, b1, y01
  1716. LFD a1, 7 * SIZE(AO1)
  1717. FMADD y02, a2, b1, y02
  1718. LFD a2, 7 * SIZE(AO2)
  1719. FMADD y03, a3, b1, y03
  1720. LFD a3, 7 * SIZE(AO3)
  1721. FMADD y04, a4, b1, y04
  1722. LFD a4, 7 * SIZE(AO4)
  1723. FMADD y09, a5, b2, y09
  1724. LFD a5, 8 * SIZE(AO1)
  1725. FMADD y10, a6, b2, y10
  1726. LFD a6, 8 * SIZE(AO2)
  1727. FMADD y11, a7, b2, y11
  1728. LFD a7, 8 * SIZE(AO3)
  1729. FMADD y12, a8, b2, y12
  1730. LFD a8, 8 * SIZE(AO4)
  1731. FMADD y01, a1, b3, y01
  1732. FMADD y02, a2, b3, y02
  1733. FMADD y03, a3, b3, y03
  1734. FMADD y04, a4, b3, y04
  1735. FMADD y09, a5, b4, y09
  1736. addi AO1, AO1, 8 * SIZE
  1737. FMADD y10, a6, b4, y10
  1738. addi AO2, AO2, 8 * SIZE
  1739. FMADD y11, a7, b4, y11
  1740. addi AO3, AO3, 8 * SIZE
  1741. FMADD y12, a8, b4, y12
  1742. addi AO4, AO4, 8 * SIZE
  1743. addi BO, BO, 8 * SIZE
  1744. .align 4
  /*
   * LL(25): 4-column remainder, four elements.
   * Skipped (branch to LL(26)) when bit value 4 of MIN_N is clear.
   * Offsets 1 and 3 accumulate into y01..y04, offsets 2 and 4 into
   * y09..y12.  AO1..AO4 and BO advance by 4 * SIZE.
   */
  1745. LL(25):
  1746. andi. r0, MIN_N, 4
  1747. ble LL(26)
  1748. LFD a1, 1 * SIZE(AO1)
  1749. LFD a2, 1 * SIZE(AO2)
  1750. LFD a3, 1 * SIZE(AO3)
  1751. LFD a4, 1 * SIZE(AO4)
  1752. LFD b1, 1 * SIZE(BO)
  1753. LFD b2, 2 * SIZE(BO)
  1754. LFD b3, 3 * SIZE(BO)
  1755. LFD b4, 4 * SIZE(BO)
  1756. LFD a5, 2 * SIZE(AO1)
  1757. LFD a6, 2 * SIZE(AO2)
  1758. LFD a7, 2 * SIZE(AO3)
  1759. LFD a8, 2 * SIZE(AO4)
  1760. FMADD y01, a1, b1, y01
  1761. LFD a1, 3 * SIZE(AO1)
  1762. FMADD y02, a2, b1, y02
  1763. LFD a2, 3 * SIZE(AO2)
  1764. FMADD y03, a3, b1, y03
  1765. LFD a3, 3 * SIZE(AO3)
  1766. FMADD y04, a4, b1, y04
  1767. LFD a4, 3 * SIZE(AO4)
  1768. FMADD y09, a5, b2, y09
  1769. LFD a5, 4 * SIZE(AO1)
  1770. FMADD y10, a6, b2, y10
  1771. LFD a6, 4 * SIZE(AO2)
  1772. FMADD y11, a7, b2, y11
  1773. LFD a7, 4 * SIZE(AO3)
  1774. FMADD y12, a8, b2, y12
  1775. LFD a8, 4 * SIZE(AO4)
  1776. FMADD y01, a1, b3, y01
  1777. FMADD y02, a2, b3, y02
  1778. FMADD y03, a3, b3, y03
  1779. FMADD y04, a4, b3, y04
  1780. FMADD y09, a5, b4, y09
  1781. addi AO1, AO1, 4 * SIZE
  1782. FMADD y10, a6, b4, y10
  1783. addi AO2, AO2, 4 * SIZE
  1784. FMADD y11, a7, b4, y11
  1785. addi AO3, AO3, 4 * SIZE
  1786. FMADD y12, a8, b4, y12
  1787. addi AO4, AO4, 4 * SIZE
  1788. addi BO, BO, 4 * SIZE
  1789. .align 4
  /*
   * LL(26): 4-column remainder, two elements.
   * Skipped (branch to LL(27)) when bit value 2 of MIN_N is clear.
   * Both elements of every column are loaded up front: offset 1 (a1..a4)
   * accumulates into y01..y04, offset 2 (a5..a8) into y09..y12.
   * AO1..AO4 and BO advance by 2 * SIZE.
   */
  1790. LL(26):
  1791. andi. r0, MIN_N, 2
  1792. ble LL(27)
  1793. LFD a1, 1 * SIZE(AO1)
  1794. LFD a2, 1 * SIZE(AO2)
  1795. LFD b1, 1 * SIZE(BO)
  1796. LFD b2, 2 * SIZE(BO)
  1797. LFD a3, 1 * SIZE(AO3)
  1798. LFD a4, 1 * SIZE(AO4)
  1799. LFD a5, 2 * SIZE(AO1)
  1800. LFD a6, 2 * SIZE(AO2)
  1801. LFD a7, 2 * SIZE(AO3)
  1802. LFD a8, 2 * SIZE(AO4)
  1803. FMADD y01, a1, b1, y01
  1804. FMADD y02, a2, b1, y02
  1805. FMADD y03, a3, b1, y03
  1806. FMADD y04, a4, b1, y04
  1807. FMADD y09, a5, b2, y09
  1808. addi AO1, AO1, 2 * SIZE
  1809. FMADD y10, a6, b2, y10
  1810. addi AO2, AO2, 2 * SIZE
  1811. FMADD y11, a7, b2, y11
  1812. addi AO3, AO3, 2 * SIZE
  1813. FMADD y12, a8, b2, y12
  1814. addi AO4, AO4, 2 * SIZE
  1815. addi BO, BO, 2 * SIZE
  1816. .align 4
  1817. LL(27):
  1818. andi. r0, MIN_N, 1
  1819. ble LL(28)
  1820. LFD a1, 1 * SIZE(AO1)
  1821. LFD b1, 1 * SIZE(BO)
  1822. LFD a2, 1 * SIZE(AO2)
  1823. LFD a3, 1 * SIZE(AO3)
  1824. LFD a4, 1 * SIZE(AO4)
  1825. FMADD y01, a1, b1, y01
  1826. FMADD y02, a2, b1, y02
  1827. FMADD y03, a3, b1, y03
  1828. FMADD y04, a4, b1, y04
  /*
   * LL(28): write-back of the four-stream pass.
   * BO is repurposed as a copy of CO (it serves as the store pointer on the
   * strided path at LL(29)) and alpha is loaded from ALPHA.
   * When INCY == SIZE the contiguous path below is used: four output
   * elements are loaded from CO, the partial sums y09..y12 are folded into
   * y01..y04, each output becomes out + alpha * sum, the results are stored
   * back, CO advances by four elements and control jumps to LL(30).
   * Otherwise control goes to LL(29).
   */
  1829. .align 4
  1830. LL(28):
  1831. mr BO, CO
  1832. lfd alpha, ALPHA
  1833. cmpi cr0, 0, INCY, SIZE
  1834. bne LL(29)
  1835. LFD a1, 1 * SIZE(CO)
  1836. LFD a2, 2 * SIZE(CO)
  1837. LFD a3, 3 * SIZE(CO)
  1838. LFD a4, 4 * SIZE(CO)
  /* combine the two partial accumulators kept per stream */
  1839. FADD y01, y09, y01
  1840. FADD y02, y10, y02
  1841. FADD y03, y11, y03
  1842. FADD y04, y12, y04
  /* out = out + alpha * sum */
  1843. FMADD a1, alpha, y01, a1
  1844. FMADD a2, alpha, y02, a2
  1845. FMADD a3, alpha, y03, a3
  1846. FMADD a4, alpha, y04, a4
  1847. STFD a1, 1 * SIZE(CO)
  1848. STFD a2, 2 * SIZE(CO)
  1849. STFD a3, 3 * SIZE(CO)
  1850. STFD a4, 4 * SIZE(CO)
  1851. addi CO, CO, 4 * SIZE
  1852. b LL(30)
  /*
   * LL(29): strided write-back of the four-stream pass (INCY != SIZE).
   * Four output elements are fetched through CO with LFDUX, the partial
   * sums y09..y12 are folded into y01..y04, each output becomes
   * out + alpha * sum, and the results are written through BO with STFDUX.
   * BO was set to the pre-load value of CO at LL(28), so the stores walk
   * the same addresses as the loads.
   * NOTE(review): LFDUX/STFDUX are macros defined outside this view;
   * presumably indexed load/store with base-register update - confirm.
   *
   * The scaling FMADDs name the accumulators y01..y04 that the FADDs just
   * produced, rather than hard-coded f0..f3, so this path stays correct
   * regardless of how the y-aliases are mapped onto FPRs.
   */
  1853. .align 4
  1854. LL(29):
  1855. LFDUX a1, CO, INCY
  1856. LFDUX a2, CO, INCY
  1857. LFDUX a3, CO, INCY
  1858. LFDUX a4, CO, INCY
  /* combine the two partial accumulators kept per stream */
  1859. FADD y01, y09, y01
  1860. FADD y02, y10, y02
  1861. FADD y03, y11, y03
  1862. FADD y04, y12, y04
  /* out = out + alpha * sum */
  1863. FMADD a1, alpha, y01, a1
  1864. FMADD a2, alpha, y02, a2
  1865. FMADD a3, alpha, y03, a3
  1866. FMADD a4, alpha, y04, a4
  1867. STFDUX a1, BO, INCY
  1868. STFDUX a2, BO, INCY
  1869. STFDUX a3, BO, INCY
  1870. STFDUX a4, BO, INCY
  /*
   * LL(30): entry of the two-stream pass, taken when bit 1 of N is set
   * (otherwise skip to LL(40)).
   * Setup: AO1 = A, AO2 = A + LDA, A advances by 2 * LDA, BO = XP, and the
   * eight accumulators y01..y04 / y09..y12 are cleared from FZERO.
   * CTR = MIN_N >> 4 is the number of 16-element steps; when it is not
   * positive the remainder code at LL(34) is used directly.
   * Otherwise the pipeline is primed with elements 1..4 of both streams
   * (a1..a8) and elements 1..8 of BO (b1..b8).
   */
  1871. .align 4
  1872. LL(30):
  1873. andi. J, N, 2
  1874. ble LL(40)
  1875. mr AO1, A
  1876. add AO2, A, LDA
  1877. add A, AO2, LDA
  1878. mr BO, XP
  1879. lfd y01, FZERO
  1880. fmr y02, y01
  1881. fmr y03, y01
  1882. fmr y04, y01
  1883. fmr y09, y01
  1884. fmr y10, y01
  1885. fmr y11, y01
  1886. fmr y12, y01
  1887. DCBT(Y1, PREC)
  /* CR0 is set by srawi.; the mtspr in between does not touch it */
  1888. srawi. r0, MIN_N, 4
  1889. mtspr CTR, r0
  1890. ble LL(34)
  /* a-registers interleave the two streams: odd = AO1, even = AO2 */
  1891. LFD a1, 1 * SIZE(AO1)
  1892. LFD a2, 1 * SIZE(AO2)
  1893. LFD a3, 2 * SIZE(AO1)
  1894. LFD a4, 2 * SIZE(AO2)
  1895. LFD a5, 3 * SIZE(AO1)
  1896. LFD a6, 3 * SIZE(AO2)
  1897. LFD a7, 4 * SIZE(AO1)
  1898. LFD a8, 4 * SIZE(AO2)
  1899. LFD b1, 1 * SIZE(BO)
  1900. LFD b2, 2 * SIZE(BO)
  1901. LFD b3, 3 * SIZE(BO)
  1902. LFD b4, 4 * SIZE(BO)
  1903. LFD b5, 5 * SIZE(BO)
  1904. LFD b6, 6 * SIZE(BO)
  1905. LFD b7, 7 * SIZE(BO)
  1906. LFD b8, 8 * SIZE(BO)
  /* CTR is decremented here; a single step goes straight to the drain block */
  1907. bdz LL(33)
  /*
   * LL(32): steady-state loop of the two-stream pass. One trip consumes 16
   * elements from each of AO1, AO2 and BO; bdnz repeats it while CTR != 0.
   * Accumulators: AO1 products go to y01/y03/y09/y11, AO2 products to
   * y02/y04/y10/y12.
   * The loop is software-pipelined: every a-register is reloaded right
   * after the FMADD that consumed it, and b1..b4 / b5..b8 are refilled
   * alternately one group ahead. At the bottom of a trip the registers
   * already hold offsets 17..20 (a1..a8) and 17..24 (b1..b8), which become
   * offsets 1..4 / 1..8 once the pointers are bumped by 16 elements.
   */
  1908. .align 4
  1909. LL(32):
  /* elements 1..4 (b1..b4); refetch a with elements 5..8 */
  1910. FMADD y01, a1, b1, y01
  1911. LFD a1, 5 * SIZE(AO1)
  1912. FMADD y02, a2, b1, y02
  1913. LFD a2, 5 * SIZE(AO2)
  1914. FMADD y03, a3, b2, y03
  1915. LFD a3, 6 * SIZE(AO1)
  1916. FMADD y04, a4, b2, y04
  1917. LFD a4, 6 * SIZE(AO2)
  1918. FMADD y09, a5, b3, y09
  1919. LFD a5, 7 * SIZE(AO1)
  1920. FMADD y10, a6, b3, y10
  1921. LFD a6, 7 * SIZE(AO2)
  1922. FMADD y11, a7, b4, y11
  1923. LFD a7, 8 * SIZE(AO1)
  1924. FMADD y12, a8, b4, y12
  1925. LFD a8, 8 * SIZE(AO2)
  1926. LFD b1, 9 * SIZE(BO)
  1927. LFD b2, 10 * SIZE(BO)
  1928. LFD b3, 11 * SIZE(BO)
  1929. LFD b4, 12 * SIZE(BO)
  /* elements 5..8 (b5..b8); refetch a with elements 9..12 */
  1930. FMADD y01, a1, b5, y01
  1931. LFD a1, 9 * SIZE(AO1)
  1932. FMADD y02, a2, b5, y02
  1933. LFD a2, 9 * SIZE(AO2)
  1934. FMADD y03, a3, b6, y03
  1935. LFD a3, 10 * SIZE(AO1)
  1936. FMADD y04, a4, b6, y04
  1937. LFD a4, 10 * SIZE(AO2)
  1938. FMADD y09, a5, b7, y09
  1939. LFD a5, 11 * SIZE(AO1)
  1940. FMADD y10, a6, b7, y10
  1941. LFD a6, 11 * SIZE(AO2)
  1942. FMADD y11, a7, b8, y11
  1943. LFD a7, 12 * SIZE(AO1)
  1944. FMADD y12, a8, b8, y12
  1945. LFD a8, 12 * SIZE(AO2)
  1946. LFD b5, 13 * SIZE(BO)
  1947. LFD b6, 14 * SIZE(BO)
  1948. LFD b7, 15 * SIZE(BO)
  1949. LFD b8, 16 * SIZE(BO)
  /* elements 9..12 (b1..b4); refetch a with elements 13..16 */
  1950. FMADD y01, a1, b1, y01
  1951. LFD a1, 13 * SIZE(AO1)
  1952. FMADD y02, a2, b1, y02
  1953. LFD a2, 13 * SIZE(AO2)
  1954. FMADD y03, a3, b2, y03
  1955. LFD a3, 14 * SIZE(AO1)
  1956. FMADD y04, a4, b2, y04
  1957. LFD a4, 14 * SIZE(AO2)
  1958. FMADD y09, a5, b3, y09
  1959. LFD a5, 15 * SIZE(AO1)
  1960. FMADD y10, a6, b3, y10
  1961. LFD a6, 15 * SIZE(AO2)
  1962. FMADD y11, a7, b4, y11
  1963. LFD a7, 16 * SIZE(AO1)
  1964. FMADD y12, a8, b4, y12
  1965. LFD a8, 16 * SIZE(AO2)
  1966. LFD b1, 17 * SIZE(BO)
  1967. LFD b2, 18 * SIZE(BO)
  1968. LFD b3, 19 * SIZE(BO)
  1969. LFD b4, 20 * SIZE(BO)
  /* elements 13..16 (b5..b8); prefetch a/b for the next trip */
  1970. FMADD y01, a1, b5, y01
  1971. LFD a1, 17 * SIZE(AO1)
  1972. FMADD y02, a2, b5, y02
  1973. LFD a2, 17 * SIZE(AO2)
  1974. FMADD y03, a3, b6, y03
  1975. LFD a3, 18 * SIZE(AO1)
  1976. FMADD y04, a4, b6, y04
  1977. LFD a4, 18 * SIZE(AO2)
  1978. FMADD y09, a5, b7, y09
  1979. LFD a5, 19 * SIZE(AO1)
  1980. FMADD y10, a6, b7, y10
  1981. LFD a6, 19 * SIZE(AO2)
  1982. FMADD y11, a7, b8, y11
  1983. LFD a7, 20 * SIZE(AO1)
  1984. FMADD y12, a8, b8, y12
  1985. LFD a8, 20 * SIZE(AO2)
  1986. LFD b5, 21 * SIZE(BO)
  1987. LFD b6, 22 * SIZE(BO)
  1988. LFD b7, 23 * SIZE(BO)
  1989. LFD b8, 24 * SIZE(BO)
  /* step all three pointers by 16 elements and issue DCBT on both streams */
  1990. addi AO1, AO1, 16 * SIZE
  1991. addi AO2, AO2, 16 * SIZE
  1992. DCBT(AO1, PREA)
  1993. DCBT(AO2, PREA)
  1994. addi BO, BO, 16 * SIZE
  1995. bdnz LL(32)
  /*
   * LL(33): drain block of the two-stream pass - the last 16-element step.
   * Same accumulate pattern as the loop body at LL(32), but no operand past
   * offset 16 * SIZE is loaded, so the pipeline ends without reading beyond
   * the current 16-element group. Entered either by falling out of LL(32)
   * or directly from the preload when only one step is required.
   */
  1996. .align 4
  1997. LL(33):
  /* elements 1..4 (b1..b4); refetch a with elements 5..8 */
  1998. FMADD y01, a1, b1, y01
  1999. LFD a1, 5 * SIZE(AO1)
  2000. FMADD y02, a2, b1, y02
  2001. LFD a2, 5 * SIZE(AO2)
  2002. FMADD y03, a3, b2, y03
  2003. LFD a3, 6 * SIZE(AO1)
  2004. FMADD y04, a4, b2, y04
  2005. LFD a4, 6 * SIZE(AO2)
  2006. FMADD y09, a5, b3, y09
  2007. LFD a5, 7 * SIZE(AO1)
  2008. FMADD y10, a6, b3, y10
  2009. LFD a6, 7 * SIZE(AO2)
  2010. FMADD y11, a7, b4, y11
  2011. LFD a7, 8 * SIZE(AO1)
  2012. FMADD y12, a8, b4, y12
  2013. LFD a8, 8 * SIZE(AO2)
  2014. LFD b1, 9 * SIZE(BO)
  2015. LFD b2, 10 * SIZE(BO)
  2016. LFD b3, 11 * SIZE(BO)
  2017. LFD b4, 12 * SIZE(BO)
  /* elements 5..8 (b5..b8); refetch a with elements 9..12 */
  2018. FMADD y01, a1, b5, y01
  2019. LFD a1, 9 * SIZE(AO1)
  2020. FMADD y02, a2, b5, y02
  2021. LFD a2, 9 * SIZE(AO2)
  2022. FMADD y03, a3, b6, y03
  2023. LFD a3, 10 * SIZE(AO1)
  2024. FMADD y04, a4, b6, y04
  2025. LFD a4, 10 * SIZE(AO2)
  2026. FMADD y09, a5, b7, y09
  2027. LFD a5, 11 * SIZE(AO1)
  2028. FMADD y10, a6, b7, y10
  2029. LFD a6, 11 * SIZE(AO2)
  2030. FMADD y11, a7, b8, y11
  2031. LFD a7, 12 * SIZE(AO1)
  2032. FMADD y12, a8, b8, y12
  2033. LFD a8, 12 * SIZE(AO2)
  2034. LFD b5, 13 * SIZE(BO)
  2035. LFD b6, 14 * SIZE(BO)
  2036. LFD b7, 15 * SIZE(BO)
  2037. LFD b8, 16 * SIZE(BO)
  /* elements 9..12 (b1..b4); refetch a with elements 13..16 */
  2038. FMADD y01, a1, b1, y01
  2039. LFD a1, 13 * SIZE(AO1)
  2040. FMADD y02, a2, b1, y02
  2041. LFD a2, 13 * SIZE(AO2)
  2042. FMADD y03, a3, b2, y03
  2043. LFD a3, 14 * SIZE(AO1)
  2044. FMADD y04, a4, b2, y04
  2045. LFD a4, 14 * SIZE(AO2)
  2046. FMADD y09, a5, b3, y09
  2047. LFD a5, 15 * SIZE(AO1)
  2048. FMADD y10, a6, b3, y10
  2049. LFD a6, 15 * SIZE(AO2)
  2050. FMADD y11, a7, b4, y11
  2051. LFD a7, 16 * SIZE(AO1)
  2052. FMADD y12, a8, b4, y12
  2053. LFD a8, 16 * SIZE(AO2)
  /* elements 13..16 (b5..b8); nothing further is fetched */
  2054. FMADD y01, a1, b5, y01
  2055. FMADD y02, a2, b5, y02
  2056. FMADD y03, a3, b6, y03
  2057. FMADD y04, a4, b6, y04
  2058. FMADD y09, a5, b7, y09
  2059. FMADD y10, a6, b7, y10
  2060. FMADD y11, a7, b8, y11
  2061. FMADD y12, a8, b8, y12
  2062. addi AO1, AO1, 16 * SIZE
  2063. addi AO2, AO2, 16 * SIZE
  2064. addi BO, BO, 16 * SIZE
  /*
   * LL(34): remainder handling of the two-stream pass.
   * If the low four bits of MIN_N are all clear there is nothing left and
   * control goes to the write-back at LL(38). Otherwise, when bit 3 is set,
   * eight elements per stream are processed here. Only four accumulators
   * are used in the remainder code: y01/y09 for AO1 and y02/y10 for AO2.
   */
  2065. .align 4
  2066. LL(34):
  2067. andi. r0, MIN_N, 15
  2068. ble LL(38)
  2069. andi. r0, MIN_N, 8
  2070. ble LL(35)
  2071. LFD a1, 1 * SIZE(AO1)
  2072. LFD a2, 1 * SIZE(AO2)
  2073. LFD a3, 2 * SIZE(AO1)
  2074. LFD a4, 2 * SIZE(AO2)
  2075. LFD b1, 1 * SIZE(BO)
  2076. LFD b2, 2 * SIZE(BO)
  2077. LFD b3, 3 * SIZE(BO)
  2078. LFD b4, 4 * SIZE(BO)
  2079. LFD a5, 3 * SIZE(AO1)
  2080. LFD a6, 3 * SIZE(AO2)
  2081. LFD a7, 4 * SIZE(AO1)
  2082. LFD a8, 4 * SIZE(AO2)
  2083. LFD b5, 5 * SIZE(BO)
  2084. LFD b6, 6 * SIZE(BO)
  2085. LFD b7, 7 * SIZE(BO)
  2086. LFD b8, 8 * SIZE(BO)
  /* elements 1..4; each a-register is refilled with elements 5..8 after use */
  2087. FMADD y01, a1, b1, y01
  2088. LFD a1, 5 * SIZE(AO1)
  2089. FMADD y02, a2, b1, y02
  2090. LFD a2, 5 * SIZE(AO2)
  2091. FMADD y09, a3, b2, y09
  2092. LFD a3, 6 * SIZE(AO1)
  2093. FMADD y10, a4, b2, y10
  2094. LFD a4, 6 * SIZE(AO2)
  2095. FMADD y01, a5, b3, y01
  2096. LFD a5, 7 * SIZE(AO1)
  2097. FMADD y02, a6, b3, y02
  2098. LFD a6, 7 * SIZE(AO2)
  2099. FMADD y09, a7, b4, y09
  2100. LFD a7, 8 * SIZE(AO1)
  2101. FMADD y10, a8, b4, y10
  2102. LFD a8, 8 * SIZE(AO2)
  /* elements 5..8, with the pointer bumps interleaved */
  2103. FMADD y01, a1, b5, y01
  2104. FMADD y02, a2, b5, y02
  2105. FMADD y09, a3, b6, y09
  2106. FMADD y10, a4, b6, y10
  2107. FMADD y01, a5, b7, y01
  2108. addi AO1, AO1, 8 * SIZE
  2109. FMADD y02, a6, b7, y02
  2110. addi AO2, AO2, 8 * SIZE
  2111. FMADD y09, a7, b8, y09
  2112. addi BO, BO, 8 * SIZE
  2113. FMADD y10, a8, b8, y10
  2114. nop
  /*
   * LL(35): four-element remainder of the two-stream pass, run when bit 2
   * of MIN_N is set. AO1 products accumulate into y01/y09 and AO2 products
   * into y02/y10; AO1, AO2 and BO then advance by four elements.
   */
  2115. .align 4
  2116. LL(35):
  2117. andi. r0, MIN_N, 4
  2118. ble LL(36)
  2119. LFD a1, 1 * SIZE(AO1)
  2120. LFD a2, 1 * SIZE(AO2)
  2121. LFD a3, 2 * SIZE(AO1)
  2122. LFD a4, 2 * SIZE(AO2)
  2123. LFD a5, 3 * SIZE(AO1)
  2124. LFD a6, 3 * SIZE(AO2)
  2125. LFD a7, 4 * SIZE(AO1)
  2126. LFD a8, 4 * SIZE(AO2)
  2127. LFD b1, 1 * SIZE(BO)
  2128. LFD b2, 2 * SIZE(BO)
  2129. LFD b3, 3 * SIZE(BO)
  2130. LFD b4, 4 * SIZE(BO)
  2131. FMADD y01, a1, b1, y01
  2132. FMADD y02, a2, b1, y02
  2133. FMADD y09, a3, b2, y09
  2134. FMADD y10, a4, b2, y10
  /* pointer bumps are interleaved with the last FMADDs */
  2135. FMADD y01, a5, b3, y01
  2136. addi AO1, AO1, 4 * SIZE
  2137. FMADD y02, a6, b3, y02
  2138. addi AO2, AO2, 4 * SIZE
  2139. FMADD y09, a7, b4, y09
  2140. addi BO, BO, 4 * SIZE
  2141. FMADD y10, a8, b4, y10
  /*
   * LL(36): two-element remainder of the two-stream pass, run when bit 1 of
   * MIN_N is set. First element -> y01 (AO1) / y02 (AO2), second element ->
   * y09 (AO1) / y10 (AO2); AO1, AO2 and BO then advance by two elements.
   */
  2142. .align 4
  2143. LL(36):
  2144. andi. r0, MIN_N, 2
  2145. ble LL(37)
  2146. LFD a1, 1 * SIZE(AO1)
  2147. LFD a2, 1 * SIZE(AO2)
  2148. LFD b1, 1 * SIZE(BO)
  2149. LFD b2, 2 * SIZE(BO)
  2150. LFD a3, 2 * SIZE(AO1)
  2151. LFD a4, 2 * SIZE(AO2)
  2152. FMADD y01, a1, b1, y01
  2153. FMADD y02, a2, b1, y02
  2154. FMADD y09, a3, b2, y09
  2155. FMADD y10, a4, b2, y10
  2156. addi AO1, AO1, 2 * SIZE
  2157. addi AO2, AO2, 2 * SIZE
  2158. addi BO, BO, 2 * SIZE
  /*
   * LL(37): final single element of the two-stream pass, run when bit 0 of
   * MIN_N is set. One element from AO1 and one from AO2 are multiplied by
   * the same b1 and added into y01 and y02; no pointer is advanced.
   */
  2159. .align 4
  2160. LL(37):
  2161. andi. r0, MIN_N, 1
  2162. ble LL(38)
  2163. LFD a1, 1 * SIZE(AO1)
  2164. LFD b1, 1 * SIZE(BO)
  2165. LFD a2, 1 * SIZE(AO2)
  2166. FMADD y01, a1, b1, y01
  2167. FMADD y02, a2, b1, y02
  /*
   * LL(38): write-back of the two-stream pass.
   * BO becomes a copy of CO (store pointer for the strided path at LL(39))
   * and alpha is loaded from ALPHA. When INCY == SIZE the contiguous path
   * below runs: the four partial sums of each stream are reduced
   * (AO1: y01+y03+y09+y11 -> y01, AO2: y02+y04+y10+y12 -> y02), each of the
   * two outputs becomes out + alpha * sum, CO advances by two elements and
   * control jumps to LL(40). Otherwise control goes to LL(39).
   */
  2168. .align 4
  2169. LL(38):
  2170. mr BO, CO
  2171. lfd alpha, ALPHA
  2172. cmpi cr0, 0, INCY, SIZE
  2173. bne LL(39)
  2174. LFD a1, 1 * SIZE(CO)
  2175. LFD a2, 2 * SIZE(CO)
  /* pairwise reduction of the four accumulators per stream */
  2176. FADD y01, y03, y01
  2177. FADD y02, y04, y02
  2178. FADD y09, y11, y09
  2179. FADD y10, y12, y10
  2180. FADD y01, y09, y01
  2181. FADD y02, y10, y02
  /* out = out + alpha * sum */
  2182. FMADD a1, alpha, y01, a1
  2183. FMADD a2, alpha, y02, a2
  2184. STFD a1, 1 * SIZE(CO)
  2185. STFD a2, 2 * SIZE(CO)
  2186. addi CO, CO, 2 * SIZE
  2187. b LL(40)
  /*
   * LL(39): strided write-back of the two-stream pass (INCY != SIZE).
   * Two output elements are fetched through CO with LFDUX, the four partial
   * sums of each stream are reduced into y01 (AO1) and y02 (AO2), each
   * output becomes out + alpha * sum, and the results are written through
   * BO with STFDUX. BO was set to the pre-load value of CO at LL(38), so
   * the stores walk the same addresses as the loads.
   * NOTE(review): LFDUX/STFDUX are macros defined outside this view;
   * presumably indexed load/store with base-register update - confirm.
   *
   * The scaling FMADDs name the accumulators y01/y02 that the FADDs just
   * produced, rather than hard-coded f0/f1, so this path stays correct
   * regardless of how the y-aliases are mapped onto FPRs.
   */
  2188. .align 4
  2189. LL(39):
  2190. LFDUX a1, CO, INCY
  2191. LFDUX a2, CO, INCY
  /* pairwise reduction of the four accumulators per stream */
  2192. FADD y01, y03, y01
  2193. FADD y02, y04, y02
  2194. FADD y09, y11, y09
  2195. FADD y10, y12, y10
  2196. FADD y01, y09, y01
  2197. FADD y02, y10, y02
  /* out = out + alpha * sum */
  2198. FMADD a1, alpha, y01, a1
  2199. FMADD a2, alpha, y02, a2
  2200. STFDUX a1, BO, INCY
  2201. STFDUX a2, BO, INCY
  /*
   * LL(40): entry of the single-stream pass, taken when bit 0 of N is set
   * (otherwise skip to LL(99)).
   * Setup: AO1 = A, A advances by LDA, BO = XP, and y01..y04 / y09..y12 are
   * cleared from FZERO (only y01..y04 are accumulated into by the code of
   * this pass). CTR = MIN_N >> 4 is the number of 16-element steps; when it
   * is not positive the remainder code at LL(44) is used directly.
   * Otherwise elements 1..8 of AO1 and BO are preloaded into a1..a8/b1..b8.
   */
  2202. .align 4
  2203. LL(40):
  2204. andi. J, N, 1
  2205. ble LL(99)
  2206. mr AO1, A
  2207. add A, A, LDA
  2208. mr BO, XP
  2209. lfd y01, FZERO
  2210. fmr y02, y01
  2211. fmr y03, y01
  2212. fmr y04, y01
  2213. fmr y09, y01
  2214. fmr y10, y01
  2215. fmr y11, y01
  2216. fmr y12, y01
  2217. DCBT(Y1, PREC)
  /* CR0 is set by srawi.; the mtspr in between does not touch it */
  2218. srawi. r0, MIN_N, 4
  2219. mtspr CTR, r0
  2220. ble LL(44)
  2221. LFD a1, 1 * SIZE(AO1)
  2222. LFD a2, 2 * SIZE(AO1)
  2223. LFD a3, 3 * SIZE(AO1)
  2224. LFD a4, 4 * SIZE(AO1)
  2225. LFD a5, 5 * SIZE(AO1)
  2226. LFD a6, 6 * SIZE(AO1)
  2227. LFD a7, 7 * SIZE(AO1)
  2228. LFD a8, 8 * SIZE(AO1)
  2229. LFD b1, 1 * SIZE(BO)
  2230. LFD b2, 2 * SIZE(BO)
  2231. LFD b3, 3 * SIZE(BO)
  2232. LFD b4, 4 * SIZE(BO)
  2233. LFD b5, 5 * SIZE(BO)
  2234. LFD b6, 6 * SIZE(BO)
  2235. LFD b7, 7 * SIZE(BO)
  2236. LFD b8, 8 * SIZE(BO)
  /* CTR is decremented here; a single step goes straight to the drain block */
  2237. bdz LL(43)
  /*
   * LL(42): steady-state loop of the single-stream pass. One trip consumes
   * 16 elements from AO1 and BO; bdnz repeats it while CTR != 0.
   * Products rotate through the four accumulators y01..y04. Each FMADD is
   * followed by a nop and by reloads of the a/b pair it just consumed with
   * the element eight positions further on, so at the bottom of a trip
   * a1..a8 / b1..b8 hold offsets 17..24, i.e. offsets 1..8 after the
   * pointers are bumped by 16 elements.
   * NOTE(review): the nops look like issue-slot padding - confirm.
   */
  2238. .align 4
  2239. LL(42):
  /* elements 1..8; refetch elements 9..16 */
  2240. FMADD y01, a1, b1, y01
  2241. nop
  2242. LFD a1, 9 * SIZE(AO1)
  2243. LFD b1, 9 * SIZE(BO)
  2244. FMADD y02, a2, b2, y02
  2245. nop
  2246. LFD a2, 10 * SIZE(AO1)
  2247. LFD b2, 10 * SIZE(BO)
  2248. FMADD y03, a3, b3, y03
  2249. nop
  2250. LFD a3, 11 * SIZE(AO1)
  2251. LFD b3, 11 * SIZE(BO)
  2252. FMADD y04, a4, b4, y04
  2253. nop
  2254. LFD a4, 12 * SIZE(AO1)
  2255. LFD b4, 12 * SIZE(BO)
  2256. FMADD y01, a5, b5, y01
  2257. nop
  2258. LFD a5, 13 * SIZE(AO1)
  2259. LFD b5, 13 * SIZE(BO)
  2260. FMADD y02, a6, b6, y02
  2261. nop
  2262. LFD a6, 14 * SIZE(AO1)
  2263. LFD b6, 14 * SIZE(BO)
  2264. FMADD y03, a7, b7, y03
  2265. nop
  2266. LFD a7, 15 * SIZE(AO1)
  2267. LFD b7, 15 * SIZE(BO)
  2268. FMADD y04, a8, b8, y04
  2269. nop
  2270. LFD a8, 16 * SIZE(AO1)
  2271. LFD b8, 16 * SIZE(BO)
  /* elements 9..16; prefetch elements 17..24 for the next trip */
  2272. FMADD y01, a1, b1, y01
  2273. nop
  2274. LFD a1, 17 * SIZE(AO1)
  2275. LFD b1, 17 * SIZE(BO)
  2276. FMADD y02, a2, b2, y02
  2277. nop
  2278. LFD a2, 18 * SIZE(AO1)
  2279. LFD b2, 18 * SIZE(BO)
  2280. FMADD y03, a3, b3, y03
  2281. nop
  2282. LFD a3, 19 * SIZE(AO1)
  2283. LFD b3, 19 * SIZE(BO)
  2284. FMADD y04, a4, b4, y04
  2285. nop
  2286. LFD a4, 20 * SIZE(AO1)
  2287. LFD b4, 20 * SIZE(BO)
  2288. FMADD y01, a5, b5, y01
  2289. nop
  2290. LFD a5, 21 * SIZE(AO1)
  2291. LFD b5, 21 * SIZE(BO)
  2292. FMADD y02, a6, b6, y02
  2293. nop
  2294. LFD a6, 22 * SIZE(AO1)
  2295. LFD b6, 22 * SIZE(BO)
  2296. FMADD y03, a7, b7, y03
  2297. nop
  2298. LFD a7, 23 * SIZE(AO1)
  2299. LFD b7, 23 * SIZE(BO)
  2300. FMADD y04, a8, b8, y04
  2301. nop
  2302. LFD a8, 24 * SIZE(AO1)
  2303. LFD b8, 24 * SIZE(BO)
  /* step both pointers by 16 elements and issue DCBT on the A stream */
  2304. addi AO1, AO1, 16 * SIZE
  2305. addi BO, BO, 16 * SIZE
  2306. DCBT(AO1, PREA)
  2307. bdnz LL(42)
  /*
   * LL(43): drain block of the single-stream pass - the last 16-element
   * step. Same accumulate pattern as the loop body at LL(42), but nothing
   * past offset 16 * SIZE is loaded. Entered either by falling out of
   * LL(42) or directly from the preload when only one step is required.
   */
  2308. .align 4
  2309. LL(43):
  /* elements 1..8; refetch elements 9..16 */
  2310. FMADD y01, a1, b1, y01
  2311. nop
  2312. LFD a1, 9 * SIZE(AO1)
  2313. LFD b1, 9 * SIZE(BO)
  2314. FMADD y02, a2, b2, y02
  2315. nop
  2316. LFD a2, 10 * SIZE(AO1)
  2317. LFD b2, 10 * SIZE(BO)
  2318. FMADD y03, a3, b3, y03
  2319. nop
  2320. LFD a3, 11 * SIZE(AO1)
  2321. LFD b3, 11 * SIZE(BO)
  2322. FMADD y04, a4, b4, y04
  2323. nop
  2324. LFD a4, 12 * SIZE(AO1)
  2325. LFD b4, 12 * SIZE(BO)
  2326. FMADD y01, a5, b5, y01
  2327. nop
  2328. LFD a5, 13 * SIZE(AO1)
  2329. LFD b5, 13 * SIZE(BO)
  2330. FMADD y02, a6, b6, y02
  2331. nop
  2332. LFD a6, 14 * SIZE(AO1)
  2333. LFD b6, 14 * SIZE(BO)
  2334. FMADD y03, a7, b7, y03
  2335. nop
  2336. LFD a7, 15 * SIZE(AO1)
  2337. LFD b7, 15 * SIZE(BO)
  2338. FMADD y04, a8, b8, y04
  2339. nop
  2340. LFD a8, 16 * SIZE(AO1)
  2341. LFD b8, 16 * SIZE(BO)
  /* elements 9..16; nothing further is fetched, pointer bumps interleaved */
  2342. FMADD y01, a1, b1, y01
  2343. FMADD y02, a2, b2, y02
  2344. FMADD y03, a3, b3, y03
  2345. FMADD y04, a4, b4, y04
  2346. FMADD y01, a5, b5, y01
  2347. addi AO1, AO1, 16 * SIZE
  2348. FMADD y02, a6, b6, y02
  2349. addi BO, BO, 16 * SIZE
  2350. FMADD y03, a7, b7, y03
  2351. nop
  2352. FMADD y04, a8, b8, y04
  2353. nop
  /*
   * LL(44): remainder handling of the single-stream pass.
   * If the low four bits of MIN_N are all clear there is nothing left and
   * control goes to the write-back at LL(48). Otherwise, when bit 3 is set,
   * eight elements are processed here: element k of AO1 times element k of
   * BO, rotating through the accumulators y01..y04.
   */
  2354. .align 4
  2355. LL(44):
  2356. andi. r0, MIN_N, 15
  2357. ble LL(48)
  2358. andi. r0, MIN_N, 8
  2359. ble LL(45)
  2360. LFD a1, 1 * SIZE(AO1)
  2361. LFD a2, 2 * SIZE(AO1)
  2362. LFD a3, 3 * SIZE(AO1)
  2363. LFD a4, 4 * SIZE(AO1)
  2364. LFD b1, 1 * SIZE(BO)
  2365. LFD b2, 2 * SIZE(BO)
  2366. LFD b3, 3 * SIZE(BO)
  2367. LFD b4, 4 * SIZE(BO)
  2368. LFD a5, 5 * SIZE(AO1)
  2369. LFD a6, 6 * SIZE(AO1)
  2370. LFD a7, 7 * SIZE(AO1)
  2371. LFD a8, 8 * SIZE(AO1)
  2372. LFD b5, 5 * SIZE(BO)
  2373. LFD b6, 6 * SIZE(BO)
  2374. LFD b7, 7 * SIZE(BO)
  2375. LFD b8, 8 * SIZE(BO)
  2376. FMADD y01, a1, b1, y01
  2377. FMADD y02, a2, b2, y02
  2378. FMADD y03, a3, b3, y03
  2379. FMADD y04, a4, b4, y04
  /* pointer bumps are interleaved with the second group of FMADDs */
  2380. FMADD y01, a5, b5, y01
  2381. addi AO1, AO1, 8 * SIZE
  2382. FMADD y02, a6, b6, y02
  2383. addi BO, BO, 8 * SIZE
  2384. FMADD y03, a7, b7, y03
  2385. nop
  2386. FMADD y04, a8, b8, y04
  2387. nop
  /*
   * LL(45): four-element remainder of the single-stream pass, run when
   * bit 2 of MIN_N is set. Element k of AO1 times element k of BO is added
   * into y0k (k = 1..4); AO1 and BO then advance by four elements.
   *
   * Only AO1 and BO are live on this path (LL(40) initialises AO1 alone),
   * so the slot that used to bump AO2 is now a plain nop, matching the
   * filler used elsewhere in this pass.
   * NOTE(review): assumes AO2 is reloaded before its next use once control
   * returns to LL(ISLoop), which is outside this view - confirm.
   */
  2388. .align 4
  2389. LL(45):
  2390. andi. r0, MIN_N, 4
  2391. ble LL(46)
  2392. LFD a1, 1 * SIZE(AO1)
  2393. LFD b1, 1 * SIZE(BO)
  2394. LFD a2, 2 * SIZE(AO1)
  2395. LFD b2, 2 * SIZE(BO)
  2396. LFD a3, 3 * SIZE(AO1)
  2397. LFD b3, 3 * SIZE(BO)
  2398. LFD a4, 4 * SIZE(AO1)
  2399. LFD b4, 4 * SIZE(BO)
  2400. FMADD y01, a1, b1, y01
  2401. addi AO1, AO1, 4 * SIZE
  2402. FMADD y02, a2, b2, y02
  2403. nop
  2404. FMADD y03, a3, b3, y03
  2405. addi BO, BO, 4 * SIZE
  2406. FMADD y04, a4, b4, y04
  2407. nop
  /*
   * LL(46): two-element remainder of the single-stream pass, run when bit 1
   * of MIN_N is set. The two products go to y01 and y02; AO1 and BO then
   * advance by two elements.
   */
  2408. .align 4
  2409. LL(46):
  2410. andi. r0, MIN_N, 2
  2411. ble LL(47)
  2412. LFD a1, 1 * SIZE(AO1)
  2413. LFD b1, 1 * SIZE(BO)
  2414. LFD a2, 2 * SIZE(AO1)
  2415. LFD b2, 2 * SIZE(BO)
  2416. FMADD y01, a1, b1, y01
  2417. addi AO1, AO1, 2 * SIZE
  2418. FMADD y02, a2, b2, y02
  2419. addi BO, BO, 2 * SIZE
  /*
   * LL(47): final single element of the single-stream pass, run when bit 0
   * of MIN_N is set. One product is added into y01; no pointer is advanced.
   */
  2420. .align 4
  2421. LL(47):
  2422. andi. r0, MIN_N, 1
  2423. ble LL(48)
  2424. LFD a1, 1 * SIZE(AO1)
  2425. LFD b1, 1 * SIZE(BO)
  2426. FMADD y01, a1, b1, y01
  /*
   * LL(48): write-back of the single-stream pass.
   * BO becomes a copy of CO (store pointer for the strided path at LL(49))
   * and alpha is loaded from ALPHA. When INCY == SIZE the contiguous path
   * below runs: y01..y04 are reduced into y01, the output element becomes
   * out + alpha * sum and is stored back, then control jumps to LL(99).
   * CO is not advanced on this path. Otherwise control goes to LL(49).
   */
  2427. .align 4
  2428. LL(48):
  2429. mr BO, CO
  2430. lfd alpha, ALPHA
  2431. cmpi cr0, 0, INCY, SIZE
  2432. bne LL(49)
  2433. LFD a1, 1 * SIZE(CO)
  /* (y01 + y02) + (y03 + y04) -> y01 */
  2434. FADD y01, y02, y01
  2435. FADD y03, y04, y03
  2436. FADD y01, y03, y01
  2437. FMADD a1, alpha, y01, a1
  2438. STFD a1, 1 * SIZE(CO)
  2439. b LL(99)
  /*
   * LL(49): strided write-back of the single-stream pass (INCY != SIZE).
   * One output element is fetched through CO with LFDUX, y01..y04 are
   * reduced into y01, the output becomes out + alpha * sum and is written
   * through BO with STFDUX. BO was set to the pre-load value of CO at
   * LL(48), so the store hits the address that was loaded.
   * NOTE(review): LFDUX/STFDUX are macros defined outside this view;
   * presumably indexed load/store with base-register update - confirm.
   *
   * The scaling FMADD names the accumulator y01 that the FADDs just
   * produced, rather than hard-coded f0, so this path stays correct
   * regardless of how the y-aliases are mapped onto FPRs.
   */
  2440. .align 4
  2441. LL(49):
  2442. LFDUX a1, CO, INCY
  /* (y01 + y02) + (y03 + y04) -> y01 */
  2443. FADD y01, y02, y01
  2444. FADD y03, y04, y03
  2445. FADD y01, y03, y01
  2446. FMADD a1, alpha, y01, a1
  2447. STFDUX a1, BO, INCY
  /*
   * LL(99): bottom of the outer loop. A is moved back by PLDA_M
   * (A = A - PLDA_M), the index IS is advanced by P, and the loop at
   * LL(ISLoop) is re-entered while IS < M (signed compare with L = 0).
   * NOTE(review): LL(ISLoop), P and PLDA_M are defined outside this view.
   */
  2448. .align 4
  2449. LL(99):
  2450. subf A, PLDA_M, A
  2451. addi IS, IS, P
  2452. cmp cr0, 0, IS, M
  2453. blt LL(ISLoop)
  /*
   * LL(999): function exit. Sets the return value r3 to 0, reloads
   * f14..f31 from 0..136(SP) and r14..r29 from the frame starting at
   * 160(SP) (8-byte slots under __64BIT__, 4-byte slots otherwise),
   * releases the STACKSIZE-byte frame and returns.
   * NOTE(review): the matching saves are presumably in the prologue, which
   * is outside this view - keep the two lists in sync.
   */
  2454. .align 4
  2455. LL(999):
  2456. li r3, 0
  /* floating-point registers, one 8-byte slot each */
  2457. lfd f14, 0(SP)
  2458. lfd f15, 8(SP)
  2459. lfd f16, 16(SP)
  2460. lfd f17, 24(SP)
  2461. lfd f18, 32(SP)
  2462. lfd f19, 40(SP)
  2463. lfd f20, 48(SP)
  2464. lfd f21, 56(SP)
  2465. lfd f22, 64(SP)
  2466. lfd f23, 72(SP)
  2467. lfd f24, 80(SP)
  2468. lfd f25, 88(SP)
  2469. lfd f26, 96(SP)
  2470. lfd f27, 104(SP)
  2471. lfd f28, 112(SP)
  2472. lfd f29, 120(SP)
  2473. lfd f30, 128(SP)
  2474. lfd f31, 136(SP)
  /* general-purpose registers; slot width depends on the word size */
  2475. #ifdef __64BIT__
  2476. ld r14, 160(SP)
  2477. ld r15, 168(SP)
  2478. ld r16, 176(SP)
  2479. ld r17, 184(SP)
  2480. ld r18, 192(SP)
  2481. ld r19, 200(SP)
  2482. ld r20, 208(SP)
  2483. ld r21, 216(SP)
  2484. ld r22, 224(SP)
  2485. ld r23, 232(SP)
  2486. ld r24, 240(SP)
  2487. ld r25, 248(SP)
  2488. ld r26, 256(SP)
  2489. ld r27, 264(SP)
  2490. ld r28, 272(SP)
  2491. ld r29, 280(SP)
  2492. #else
  2493. lwz r14, 160(SP)
  2494. lwz r15, 164(SP)
  2495. lwz r16, 168(SP)
  2496. lwz r17, 172(SP)
  2497. lwz r18, 176(SP)
  2498. lwz r19, 180(SP)
  2499. lwz r20, 184(SP)
  2500. lwz r21, 188(SP)
  2501. lwz r22, 192(SP)
  2502. lwz r23, 196(SP)
  2503. lwz r24, 200(SP)
  2504. lwz r25, 204(SP)
  2505. lwz r26, 208(SP)
  2506. lwz r27, 212(SP)
  2507. lwz r28, 216(SP)
  2508. lwz r29, 220(SP)
  2509. #endif
  /* pop the frame and return to the caller */
  2510. addi SP, SP, STACKSIZE
  2511. blr
  2512. EPILOGUE
  2513. #endif