You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ztrsm_kernel_ppc440_LN.S 39 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* ASSEMBLER is defined before common.h is pulled in; presumably it selects */
  /* that header's assembler-side definitions (PROLOGUE, LFD, SIZE, ...).     */
  /* NOTE(review): confirm against common.h.                                  */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* LOAD: integer load mnemonic chosen by word size                          */
  /* (lwz in 32-bit builds, ld when __64BIT__ is defined).                    */
  40. #ifndef __64BIT__
  41. #define LOAD lwz
  42. #else
  43. #define LOAD ld
  44. #endif
  /* Stack frame layout. STACKSIZE is the amount the prologue subtracts from  */
  /* SP. FZERO is the frame slot the prologue fills with an integer zero and  */
  /* that the kernels read back with lfs to clear their accumulators.         */
  /* ALPHA_R / ALPHA_I are two further 8-byte-spaced slots just below FZERO.  */
  45. #ifdef __64BIT__
  46. #define STACKSIZE 320
  47. #define ALPHA_R 296(SP)
  48. #define ALPHA_I 304(SP)
  49. #define FZERO 312(SP)
  50. #else
  51. #define STACKSIZE 256
  52. #define ALPHA_R 224(SP)
  53. #define ALPHA_I 232(SP)
  54. #define FZERO 240(SP)
  55. #endif
  /* Integer arguments held in r3..r5. The prologue branches to .L999 when    */
  /* any of them is <= 0.                                                     */
  56. #define M r3
  57. #define N r4
  58. #define K r5
  /* Registers for A, B, C, LDC and OFFSET depend on the OS and word size.    */
  /* Where the mapping differs from the incoming argument registers, the      */
  /* prologue reloads the affected values from the caller's frame through     */
  /* FRAMESLOT(n) + STACKSIZE(SP).                                            */
  /* OFFSET seeds the KK counter: KK = M + OFFSET (LN), OFFSET (LT),          */
  /* -OFFSET (RN), N - OFFSET (RT).                                           */
  59. #if defined(linux) || defined(__FreeBSD__)
  60. #ifndef __64BIT__
  61. #define A r6
  62. #define B r7
  63. #define C r8
  64. #define LDC r9
  65. #define OFFSET r10
  66. #else
  /* 64-bit Linux/FreeBSD: the prologue loads LDC from FRAMESLOT(0) and       */
  /* OFFSET from FRAMESLOT(1).                                                */
  67. #define A r8
  68. #define B r9
  69. #define C r10
  70. #define LDC r6
  71. #define OFFSET r7
  72. #endif
  73. #endif
  74. #if defined(_AIX) || defined(__APPLE__)
  75. #if !defined(__64BIT__) && defined(DOUBLE)
  /* 32-bit AIX/Apple with DOUBLE: the prologue loads B, C, LDC and OFFSET    */
  /* from FRAMESLOT(0)..FRAMESLOT(3).                                         */
  76. #define A r10
  77. #define B r6
  78. #define C r7
  79. #define LDC r8
  80. #define OFFSET r9
  81. #else
  /* Other AIX/Apple builds: the prologue loads LDC from FRAMESLOT(0) and     */
  /* OFFSET from FRAMESLOT(1).                                                */
  82. #define A r8
  83. #define B r9
  84. #define C r10
  85. #define LDC r6
  86. #define OFFSET r7
  87. #endif
  88. #endif
  /* Working integer registers. r21..r31 are spilled to the frame by the      */
  /* prologue before any of these aliases is written.                         */
  /*   AORIG   base of the current A panel in the LN/RT paths                 */
  /*   TEMP    scratch (byte offsets, K - KK)                                 */
  /*   KK      running offset counter derived from OFFSET                     */
  /*   I, J    loop counters derived from M and N                             */
  /*   AO, BO  current read pointers into A and B                             */
  /*   CO1/CO2 output pointers for two C columns (CO2 = CO1 + LDC)            */
  89. #define AORIG r21
  90. #define TEMP r22
  91. #define KK r23
  92. #define I r24
  93. #define J r25
  94. #define AO r26
  95. #define BO r27
  96. #define CO1 r28
  97. #define CO2 r29
  /* Floating-point operand aliases used by the 2x2 inner loops (.L12/.L16):  */
  /* A1..A6 hold values loaded through AO, B1..B10 values loaded through BO.  */
  /* f14..f31 are saved to the frame by the prologue.                         */
  98. #define A1 f16
  99. #define A2 f17
  100. #define A3 f18
  101. #define A4 f19
  102. #define A5 f20
  103. #define A6 f21
  104. #define B1 f22
  105. #define B2 f23
  106. #define B3 f24
  107. #define B4 f25
  108. #define B5 f26
  109. #define B6 f27
  110. #define B7 f28
  111. #define B8 f29
  112. #define B9 f30
  113. #define B10 f31
  114. PROLOGUE
  115. PROFCODE
  116. addi SP, SP, -STACKSIZE
  117. li r0, 0
  118. stfd f14, 0(SP)
  119. stfd f15, 8(SP)
  120. stfd f16, 16(SP)
  121. stfd f17, 24(SP)
  122. stfd f18, 32(SP)
  123. stfd f19, 40(SP)
  124. stfd f20, 48(SP)
  125. stfd f21, 56(SP)
  126. stfd f22, 64(SP)
  127. stfd f23, 72(SP)
  128. stfd f24, 80(SP)
  129. stfd f25, 88(SP)
  130. stfd f26, 96(SP)
  131. stfd f27, 104(SP)
  132. stfd f28, 112(SP)
  133. stfd f29, 120(SP)
  134. stfd f30, 128(SP)
  135. stfd f31, 136(SP)
  136. #ifdef __64BIT__
  137. std r31, 144(SP)
  138. std r30, 152(SP)
  139. std r29, 160(SP)
  140. std r28, 168(SP)
  141. std r27, 176(SP)
  142. std r26, 184(SP)
  143. std r25, 192(SP)
  144. std r24, 200(SP)
  145. std r23, 208(SP)
  146. std r22, 216(SP)
  147. std r21, 224(SP)
  148. #else
  149. stw r31, 144(SP)
  150. stw r30, 148(SP)
  151. stw r29, 152(SP)
  152. stw r28, 156(SP)
  153. stw r27, 160(SP)
  154. stw r26, 164(SP)
  155. stw r25, 168(SP)
  156. stw r24, 172(SP)
  157. stw r23, 176(SP)
  158. stw r22, 180(SP)
  159. stw r21, 184(SP)
  160. #endif
  161. stw r0, FZERO
  162. #if defined(linux) || defined(__FreeBSD__)
  163. #ifdef __64BIT__
  164. ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
  165. #endif
  166. #endif
  167. #if defined(_AIX) || defined(__APPLE__)
  168. #ifdef __64BIT__
  169. ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
  170. #else
  171. #ifdef DOUBLE
  172. lwz B, FRAMESLOT(0) + STACKSIZE(SP)
  173. lwz C, FRAMESLOT(1) + STACKSIZE(SP)
  174. lwz LDC, FRAMESLOT(2) + STACKSIZE(SP)
  175. #else
  176. lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
  177. #endif
  178. #endif
  179. #endif
  180. #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
  181. ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  182. #endif
  183. #if defined(_AIX) || defined(__APPLE__)
  184. #ifdef __64BIT__
  185. ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  186. #else
  187. #ifdef DOUBLE
  188. lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP)
  189. #else
  190. lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  191. #endif
  192. #endif
  193. #endif
  194. slwi LDC, LDC, ZBASE_SHIFT
  195. #ifdef LN
  196. mullw r0, M, K
  197. slwi r0, r0, ZBASE_SHIFT
  198. add A, A, r0
  199. slwi r0, M, ZBASE_SHIFT
  200. add C, C, r0
  201. #endif
  202. #ifdef RN
  203. neg KK, OFFSET
  204. #endif
  205. #ifdef RT
  206. mullw r0, N, K
  207. slwi r0, r0, ZBASE_SHIFT
  208. add B, B, r0
  209. mullw r0, N, LDC
  210. add C, C, r0
  211. sub KK, N, OFFSET
  212. #endif
  213. cmpwi cr0, M, 0
  214. ble .L999
  215. cmpwi cr0, N, 0
  216. ble .L999
  217. cmpwi cr0, K, 0
  218. ble .L999
  219. srawi. J, N, 1
  220. ble .L30
  221. .align 4
  222. .L10:
  223. #ifdef RT
  224. slwi r0, K, 1 + ZBASE_SHIFT
  225. sub B, B, r0
  226. slwi r0, LDC, 1
  227. sub C, C, r0
  228. #endif
  229. mr CO1, C
  230. add CO2, C, LDC
  231. #ifdef LN
  232. add KK, M, OFFSET
  233. #endif
  234. #ifdef LT
  235. mr KK, OFFSET
  236. #endif
  237. #if defined(LN) || defined(RT)
  238. mr AORIG, A
  239. #else
  240. mr AO, A
  241. #endif
  242. #ifndef RT
  243. add C, CO2, LDC
  244. #endif
  245. .L20:
  246. andi. I, M, 1
  247. ble .L09
  248. #if defined(LT) || defined(RN)
  249. LFD f16, 0 * SIZE(AO)
  250. LFD f17, 1 * SIZE(AO)
  251. LFD f18, 2 * SIZE(AO)
  252. LFD f19, 3 * SIZE(AO)
  253. LFD f20, 0 * SIZE(B)
  254. LFD f21, 1 * SIZE(B)
  255. LFD f22, 2 * SIZE(B)
  256. LFD f23, 3 * SIZE(B)
  257. LFD f24, 4 * SIZE(B)
  258. LFD f25, 5 * SIZE(B)
  259. LFD f26, 6 * SIZE(B)
  260. LFD f27, 7 * SIZE(B)
  261. lfs f0, FZERO
  262. fmr f1, f0
  263. fmr f2, f0
  264. fmr f3, f0
  265. fmr f4, f0
  266. fmr f5, f0
  267. fmr f6, f0
  268. fmr f7, f0
  269. srawi. r0, KK, 2
  270. mr BO, B
  271. mtspr CTR, r0
  272. #else
  273. #ifdef LN
  274. slwi r0, K, 0 + ZBASE_SHIFT
  275. sub AORIG, AORIG, r0
  276. #endif
  277. slwi r0, KK, 0 + ZBASE_SHIFT
  278. slwi TEMP, KK, 1 + ZBASE_SHIFT
  279. add AO, AORIG, r0
  280. add BO, B, TEMP
  281. sub TEMP, K, KK
  282. LFD f16, 0 * SIZE(AO)
  283. LFD f17, 1 * SIZE(AO)
  284. LFD f18, 2 * SIZE(AO)
  285. LFD f19, 3 * SIZE(AO)
  286. LFD f20, 0 * SIZE(BO)
  287. LFD f21, 1 * SIZE(BO)
  288. LFD f22, 2 * SIZE(BO)
  289. LFD f23, 3 * SIZE(BO)
  290. LFD f24, 4 * SIZE(BO)
  291. LFD f25, 5 * SIZE(BO)
  292. LFD f26, 6 * SIZE(BO)
  293. LFD f27, 7 * SIZE(BO)
  294. lfs f0, FZERO
  295. fmr f1, f0
  296. fmr f2, f0
  297. fmr f3, f0
  298. fmr f4, f0
  299. fmr f5, f0
  300. fmr f6, f0
  301. fmr f7, f0
  302. srawi. r0, TEMP, 2
  303. mtspr CTR, r0
  304. #endif
  305. ble .L25
  306. .align 4
  307. .L22:
  308. fmadd f0, f16, f20, f0
  309. LFD f19, 3 * SIZE(AO)
  310. fmadd f1, f16, f21, f1
  311. nop
  312. fmadd f2, f16, f22, f2
  313. nop
  314. fmadd f3, f16, f23, f3
  315. LFD f16, 4 * SIZE(AO)
  316. fmadd f4, f17, f20, f4
  317. LFD f20, 8 * SIZE(BO)
  318. fmadd f5, f17, f21, f5
  319. LFD f21, 9 * SIZE(BO)
  320. fmadd f6, f17, f22, f6
  321. LFD f22, 10 * SIZE(BO)
  322. fmadd f7, f17, f23, f7
  323. LFD f23, 11 * SIZE(BO)
  324. fmadd f0, f18, f24, f0
  325. LFD f17, 5 * SIZE(AO)
  326. fmadd f1, f18, f25, f1
  327. nop
  328. fmadd f2, f18, f26, f2
  329. nop
  330. fmadd f3, f18, f27, f3
  331. LFD f18, 6 * SIZE(AO)
  332. fmadd f4, f19, f24, f4
  333. LFD f24, 12 * SIZE(BO)
  334. fmadd f5, f19, f25, f5
  335. LFD f25, 13 * SIZE(BO)
  336. fmadd f6, f19, f26, f6
  337. LFD f26, 14 * SIZE(BO)
  338. fmadd f7, f19, f27, f7
  339. LFD f27, 15 * SIZE(BO)
  340. fmadd f0, f16, f20, f0
  341. LFD f19, 7 * SIZE(AO)
  342. fmadd f1, f16, f21, f1
  343. nop
  344. fmadd f2, f16, f22, f2
  345. nop
  346. fmadd f3, f16, f23, f3
  347. LFDU f16, 8 * SIZE(AO)
  348. fmadd f4, f17, f20, f4
  349. LFDU f20, 16 * SIZE(BO)
  350. fmadd f5, f17, f21, f5
  351. LFD f21, 1 * SIZE(BO)
  352. fmadd f6, f17, f22, f6
  353. LFD f22, 2 * SIZE(BO)
  354. fmadd f7, f17, f23, f7
  355. LFD f23, 3 * SIZE(BO)
  356. fmadd f0, f18, f24, f0
  357. LFD f17, 1 * SIZE(AO)
  358. fmadd f1, f18, f25, f1
  359. nop
  360. fmadd f2, f18, f26, f2
  361. nop
  362. fmadd f3, f18, f27, f3
  363. LFD f18, 2 * SIZE(AO)
  364. fmadd f4, f19, f24, f4
  365. LFD f24, 4 * SIZE(BO)
  366. fmadd f5, f19, f25, f5
  367. LFD f25, 5 * SIZE(BO)
  368. fmadd f6, f19, f26, f6
  369. LFD f26, 6 * SIZE(BO)
  370. fmadd f7, f19, f27, f7
  371. LFD f27, 7 * SIZE(BO)
  372. bdnz .L22
  373. .align 4
  374. .L25:
  375. #if defined(LT) || defined(RN)
  376. andi. r0, KK, 3
  377. #else
  378. andi. r0, TEMP, 3
  379. #endif
  380. mtspr CTR, r0
  381. ble .L27
  382. .align 4
  383. .L26:
  384. fmadd f0, f16, f20, f0
  385. LFD f17, 1 * SIZE(AO)
  386. fmadd f1, f16, f21, f1
  387. nop
  388. fmadd f2, f16, f22, f2
  389. nop
  390. fmadd f3, f16, f23, f3
  391. LFDU f16, 2 * SIZE(AO)
  392. fmadd f4, f17, f20, f4
  393. LFDU f20, 4 * SIZE(BO)
  394. fmadd f5, f17, f21, f5
  395. LFD f21, 1 * SIZE(BO)
  396. fmadd f6, f17, f22, f6
  397. LFD f22, 2 * SIZE(BO)
  398. fmadd f7, f17, f23, f7
  399. LFD f23, 3 * SIZE(BO)
  400. bdnz .L26
  401. .align 4
  402. .L27:
  403. #ifndef CONJ
  404. FSUB f0, f0, f5
  405. FADD f1, f1, f4
  406. FSUB f2, f2, f7
  407. FADD f3, f3, f6
  408. #else
  409. FADD f0, f0, f5
  410. FSUB f1, f4, f1
  411. FADD f2, f2, f7
  412. FSUB f3, f6, f3
  413. #endif
  414. #if defined(LN) || defined(RT)
  415. #ifdef LN
  416. subi r0, KK, 1
  417. #else
  418. subi r0, KK, 2
  419. #endif
  420. slwi TEMP, r0, 0 + ZBASE_SHIFT
  421. slwi r0, r0, 1 + ZBASE_SHIFT
  422. add AO, AORIG, TEMP
  423. add BO, B, r0
  424. #endif
  425. #if defined(LN) || defined(LT)
  426. LFD f16, 0 * SIZE(BO)
  427. LFD f17, 1 * SIZE(BO)
  428. LFD f18, 2 * SIZE(BO)
  429. LFD f19, 3 * SIZE(BO)
  430. FSUB f0, f16, f0
  431. FSUB f1, f17, f1
  432. FSUB f2, f18, f2
  433. FSUB f3, f19, f3
  434. #else
  435. LFD f16, 0 * SIZE(AO)
  436. LFD f17, 1 * SIZE(AO)
  437. LFD f20, 2 * SIZE(AO)
  438. LFD f21, 3 * SIZE(AO)
  439. #ifndef CONJ
  440. FSUB f0, f16, f0
  441. FSUB f1, f17, f1
  442. FSUB f2, f20, f2
  443. FSUB f3, f21, f3
  444. #else
  445. FSUB f0, f16, f0
  446. FADD f1, f17, f1
  447. FSUB f2, f20, f2
  448. FADD f3, f21, f3
  449. #endif
  450. #endif
  451. #ifdef LN
  452. LFD f20, 0 * SIZE(AO)
  453. LFD f21, 1 * SIZE(AO)
  454. FMUL f4, f21, f1
  455. FMUL f5, f21, f0
  456. FMUL f12, f21, f3
  457. FMUL f13, f21, f2
  458. #ifndef CONJ
  459. FMSUB f0, f20, f0, f4
  460. FMADD f1, f20, f1, f5
  461. FMSUB f2, f20, f2, f12
  462. FMADD f3, f20, f3, f13
  463. #else
  464. FMADD f0, f20, f0, f4
  465. FMSUB f1, f20, f1, f5
  466. FMADD f2, f20, f2, f12
  467. FMSUB f3, f20, f3, f13
  468. #endif
  469. #endif
  470. #ifdef LT
  471. LFD f16, 0 * SIZE(AO)
  472. LFD f17, 1 * SIZE(AO)
  473. FMUL f4, f17, f1
  474. FMUL f5, f17, f0
  475. FMUL f12, f17, f3
  476. FMUL f13, f17, f2
  477. #ifndef CONJ
  478. FMSUB f0, f16, f0, f4
  479. FMADD f1, f16, f1, f5
  480. FMSUB f2, f16, f2, f12
  481. FMADD f3, f16, f3, f13
  482. #else
  483. FMADD f0, f16, f0, f4
  484. FMSUB f1, f16, f1, f5
  485. FMADD f2, f16, f2, f12
  486. FMSUB f3, f16, f3, f13
  487. #endif
  488. #endif
  489. #ifdef RN
  490. LFD f16, 0 * SIZE(BO)
  491. LFD f17, 1 * SIZE(BO)
  492. LFD f18, 2 * SIZE(BO)
  493. LFD f19, 3 * SIZE(BO)
  494. LFD f20, 6 * SIZE(BO)
  495. LFD f21, 7 * SIZE(BO)
  496. FMUL f4, f17, f1
  497. FMUL f5, f17, f0
  498. #ifndef CONJ
  499. FMSUB f0, f16, f0, f4
  500. FMADD f1, f16, f1, f5
  501. FMADD f2, f19, f1, f2
  502. FNMSUB f3, f19, f0, f3
  503. FNMSUB f2, f18, f0, f2
  504. FNMSUB f3, f18, f1, f3
  505. FMUL f4, f21, f3
  506. FMUL f5, f21, f2
  507. FMSUB f2, f20, f2, f4
  508. FMADD f3, f20, f3, f5
  509. #else
  510. FMADD f0, f16, f0, f4
  511. FMSUB f1, f16, f1, f5
  512. FMSUB f2, f19, f1, f2
  513. FNMADD f3, f19, f0, f3
  514. FNMADD f2, f18, f0, f2
  515. FNMADD f3, f18, f1, f3
  516. FMUL f4, f21, f3
  517. FMUL f5, f21, f2
  518. FMADD f2, f20, f2, f4
  519. FMSUB f3, f20, f3, f5
  520. #endif
  521. #endif
  /* RT solve step for the 1x2 tile (M & 1 path).                             */
  /* On entry the tile holds two complex values: (f0, f1) for the column      */
  /* stored through CO1 and (f2, f3) for the column stored through CO2.       */
  /* Sequence:                                                                */
  /*   1. (f2, f3) is multiplied by the complex value at BO[6..7];            */
  /*   2. its product with BO[4..5] is subtracted from (f0, f1);              */
  /*   3. (f0, f1) is multiplied by the complex value at BO[0..1].            */
  /* The CONJ variant flips the signs of the imaginary cross terms.           */
  /* Only f0..f7 are zeroed and accumulated on this path, so the cross terms  */
  /* for step 1 must be taken from f3/f2 (the value being scaled). f8/f9      */
  /* belong to the 2x2 tile and hold unrelated data here.                     */
  522. #ifdef RT
  523. LFD f16, 6 * SIZE(BO)
  524. LFD f17, 7 * SIZE(BO)
  525. LFD f18, 4 * SIZE(BO)
  526. LFD f19, 5 * SIZE(BO)
  527. LFD f20, 0 * SIZE(BO)
  528. LFD f21, 1 * SIZE(BO)
  /* Cross terms of step 1: f12 = im(b) * im(c), f13 = im(b) * re(c).         */
  529. FMUL f12, f17, f3
  530. FMUL f13, f17, f2
  531. #ifndef CONJ
  532. FMSUB f2, f16, f2, f12
  533. FMADD f3, f16, f3, f13
  /* Step 2: (f0, f1) -= (f2, f3) * (f18 + i*f19).                            */
  534. FMADD f0, f19, f3, f0
  535. FNMSUB f1, f19, f2, f1
  536. FNMSUB f0, f18, f2, f0
  537. FNMSUB f1, f18, f3, f1
  /* Step 3: scale (f0, f1) by (f20 + i*f21).                                 */
  538. FMUL f4, f21, f1
  539. FMUL f5, f21, f0
  540. FMSUB f0, f20, f0, f4
  541. FMADD f1, f20, f1, f5
  542. #else
  543. FMADD f2, f16, f2, f12
  544. FMSUB f3, f16, f3, f13
  545. FMSUB f0, f19, f3, f0
  546. FNMADD f1, f19, f2, f1
  547. FNMADD f0, f18, f2, f0
  548. FNMADD f1, f18, f3, f1
  549. FMUL f4, f21, f1
  550. FMUL f5, f21, f0
  551. FMADD f0, f20, f0, f4
  552. FMSUB f1, f20, f1, f5
  553. #endif
  554. #endif
  555. #ifdef LN
  556. subi CO1, CO1, 2 * SIZE
  557. subi CO2, CO2, 2 * SIZE
  558. #endif
  559. #if defined(LN) || defined(LT)
  560. STFD f0, 0 * SIZE(BO)
  561. STFD f1, 1 * SIZE(BO)
  562. STFD f2, 2 * SIZE(BO)
  563. STFD f3, 3 * SIZE(BO)
  564. #else
  565. STFD f0, 0 * SIZE(AO)
  566. STFD f1, 1 * SIZE(AO)
  567. STFD f2, 2 * SIZE(AO)
  568. STFD f3, 3 * SIZE(AO)
  569. #endif
  570. STFD f0, 0 * SIZE(CO1)
  571. STFD f1, 1 * SIZE(CO1)
  572. STFD f2, 0 * SIZE(CO2)
  573. STFD f3, 1 * SIZE(CO2)
  574. #ifndef LN
  575. addi CO1, CO1, 2 * SIZE
  576. addi CO2, CO2, 2 * SIZE
  577. #endif
  578. #ifdef RT
  579. slwi r0, K, 0 + ZBASE_SHIFT
  580. add AORIG, AORIG, r0
  581. #endif
  582. #if defined(LT) || defined(RN)
  583. sub TEMP, K, KK
  584. slwi r0, TEMP, 0 + ZBASE_SHIFT
  585. slwi TEMP, TEMP, 1 + ZBASE_SHIFT
  586. add AO, AO, r0
  587. add BO, BO, TEMP
  588. #endif
  589. #ifdef LT
  590. addi KK, KK, 1
  591. #endif
  592. #ifdef LN
  593. subi KK, KK, 1
  594. #endif
  595. .align 4
  596. .L09:
  597. srawi. I, M, 1
  598. ble .L29
  599. .align 4
  600. .L11:
  601. #if defined(LT) || defined(RN)
  602. LFD A1, 0 * SIZE(AO)
  603. LFD A2, 1 * SIZE(AO)
  604. LFD A4, 4 * SIZE(AO)
  605. LFD A5, 8 * SIZE(AO)
  606. LFD B1, 0 * SIZE(B)
  607. LFD B2, 1 * SIZE(B)
  608. LFD B3, 2 * SIZE(B)
  609. LFD B4, 3 * SIZE(B)
  610. LFD B5, 4 * SIZE(B)
  611. LFD B6, 8 * SIZE(B)
  612. LFD B7, 12 * SIZE(B)
  613. lfs f0, FZERO
  614. fmr f1, f0
  615. fmr f2, f0
  616. fmr f3, f0
  617. fmr f4, f0
  618. fmr f5, f0
  619. fmr f6, f0
  620. fmr f7, f0
  621. fmr f8, f0
  622. fmr f9, f0
  623. fmr f10, f0
  624. fmr f11, f0
  625. fmr f12, f0
  626. fmr f13, f0
  627. fmr f14, f0
  628. fmr f15, f0
  629. srawi. r0, KK, 2
  630. mtspr CTR, r0
  631. mr BO, B
  632. #else
  633. #ifdef LN
  634. slwi r0, K, 1 + ZBASE_SHIFT
  635. sub AORIG, AORIG, r0
  636. #endif
  637. slwi TEMP, KK, 1 + ZBASE_SHIFT
  638. add AO, AORIG, TEMP
  639. add BO, B, TEMP
  640. sub TEMP, K, KK
  641. LFD A1, 0 * SIZE(AO)
  642. LFD A2, 1 * SIZE(AO)
  643. LFD A4, 4 * SIZE(AO)
  644. LFD A5, 8 * SIZE(AO)
  645. LFD B1, 0 * SIZE(BO)
  646. LFD B2, 1 * SIZE(BO)
  647. LFD B3, 2 * SIZE(BO)
  648. LFD B4, 3 * SIZE(BO)
  649. LFD B5, 4 * SIZE(BO)
  650. LFD B6, 8 * SIZE(BO)
  651. LFD B7, 12 * SIZE(BO)
  652. lfs f0, FZERO
  653. fmr f1, f0
  654. fmr f2, f0
  655. fmr f3, f0
  656. fmr f4, f0
  657. fmr f5, f0
  658. fmr f6, f0
  659. fmr f7, f0
  660. fmr f8, f0
  661. fmr f9, f0
  662. fmr f10, f0
  663. fmr f11, f0
  664. fmr f12, f0
  665. fmr f13, f0
  666. fmr f14, f0
  667. fmr f15, f0
  668. srawi. r0, TEMP, 2
  669. mtspr CTR, r0
  670. #endif
  671. ble .L15
  672. .align 4
  673. .L12:
  674. FMADD f0, A1, B1, f0
  675. LFD A3, 2 * SIZE(AO)
  676. FMADD f4, A1, B2, f4
  677. LFD A6, 12 * SIZE(AO)
  678. FMADD f8, A1, B3, f8
  679. nop
  680. FMADD f12, A1, B4, f12
  681. nop
  682. FMADD f1, A2, B1, f1
  683. LFD A1, 3 * SIZE(AO)
  684. FMADD f5, A2, B2, f5
  685. nop
  686. FMADD f9, A2, B3, f9
  687. nop
  688. FMADD f13, A2, B4, f13
  689. nop
  690. FMADD f2, A3, B1, f2
  691. nop
  692. FMADD f6, A3, B2, f6
  693. LFD B8, 5 * SIZE(BO)
  694. FMADD f10, A3, B3, f10
  695. LFD B9, 6 * SIZE(BO)
  696. FMADD f14, A3, B4, f14
  697. LFD B10, 7 * SIZE(BO)
  698. FMADD f3, A1, B1, f3
  699. LFD A2, 5 * SIZE(AO)
  700. FMADD f7, A1, B2, f7
  701. LFD B1, 16 * SIZE(BO)
  702. FMADD f11, A1, B3, f11
  703. nop
  704. FMADD f15, A1, B4, f15
  705. nop
  706. FMADD f0, A4, B5, f0
  707. LFD A3, 6 * SIZE(AO)
  708. FMADD f4, A4, B8, f4
  709. LFD A1, 16 * SIZE(AO)
  710. FMADD f8, A4, B9, f8
  711. nop
  712. FMADD f12, A4, B10, f12
  713. nop
  714. FMADD f1, A2, B5, f1
  715. LFD A4, 7 * SIZE(AO)
  716. FMADD f5, A2, B8, f5
  717. nop
  718. FMADD f9, A2, B9, f9
  719. nop
  720. FMADD f13, A2, B10, f13
  721. nop
  722. FMADD f2, A3, B5, f2
  723. nop
  724. FMADD f6, A3, B8, f6
  725. LFD B2, 9 * SIZE(BO)
  726. FMADD f10, A3, B9, f10
  727. LFD B3, 10 * SIZE(BO)
  728. FMADD f14, A3, B10, f14
  729. LFD B4, 11 * SIZE(BO)
  730. FMADD f3, A4, B5, f3
  731. LFD A2, 9 * SIZE(AO)
  732. FMADD f7, A4, B8, f7
  733. LFD B5, 20 * SIZE(BO)
  734. FMADD f11, A4, B9, f11
  735. nop
  736. FMADD f15, A4, B10, f15
  737. nop
  738. FMADD f0, A5, B6, f0
  739. LFD A3, 10 * SIZE(AO)
  740. FMADD f4, A5, B2, f4
  741. LFD A4, 20 * SIZE(AO)
  742. FMADD f8, A5, B3, f8
  743. nop
  744. FMADD f12, A5, B4, f12
  745. nop
  746. FMADD f1, A2, B6, f1
  747. LFD A5, 11 * SIZE(AO)
  748. FMADD f5, A2, B2, f5
  749. nop
  750. FMADD f9, A2, B3, f9
  751. nop
  752. FMADD f13, A2, B4, f13
  753. nop
  754. FMADD f2, A3, B6, f2
  755. nop
  756. FMADD f6, A3, B2, f6
  757. LFD B8, 13 * SIZE(BO)
  758. FMADD f10, A3, B3, f10
  759. LFD B9, 14 * SIZE(BO)
  760. FMADD f14, A3, B4, f14
  761. LFD B10,15 * SIZE(BO)
  762. FMADD f3, A5, B6, f3
  763. LFD A2, 13 * SIZE(AO)
  764. FMADD f7, A5, B2, f7
  765. LFD B6, 24 * SIZE(BO)
  766. FMADD f11, A5, B3, f11
  767. nop
  768. FMADD f15, A5, B4, f15
  769. nop
  770. FMADD f0, A6, B7, f0
  771. LFD A3, 14 * SIZE(AO)
  772. FMADD f4, A6, B8, f4
  773. LFD A5, 24 * SIZE(AO)
  774. FMADD f8, A6, B9, f8
  775. nop
  776. FMADD f12, A6, B10, f12
  777. nop
  778. FMADD f1, A2, B7, f1
  779. LFD A6, 15 * SIZE(AO)
  780. FMADD f5, A2, B8, f5
  781. nop
  782. FMADD f9, A2, B9, f9
  783. nop
  784. FMADD f13, A2, B10, f13
  785. nop
  786. FMADD f2, A3, B7, f2
  787. addi AO, AO, 16 * SIZE
  788. FMADD f6, A3, B8, f6
  789. LFD B2, 17 * SIZE(BO)
  790. FMADD f10, A3, B9, f10
  791. LFD B3, 18 * SIZE(BO)
  792. FMADD f14, A3, B10, f14
  793. LFD B4, 19 * SIZE(BO)
  794. FMADD f3, A6, B7, f3
  795. LFD A2, 1 * SIZE(AO)
  796. FMADD f7, A6, B8, f7
  797. LFD B7, 28 * SIZE(BO)
  798. FMADD f11, A6, B9, f11
  799. addi BO, BO, 16 * SIZE
  800. FMADD f15, A6, B10, f15
  801. bdnz .L12
  802. .align 4
  803. .L15:
  804. #if defined(LT) || defined(RN)
  805. andi. r0, KK, 3
  806. #else
  807. andi. r0, TEMP, 3
  808. #endif
  809. mtspr CTR, r0
  810. ble .LKERNEL_MainFinish
  811. .align 4
  812. .L16:
  813. FMADD f0, A1, B1, f0
  814. LFD A3, 2 * SIZE(AO)
  815. FMADD f4, A1, B2, f4
  816. FMADD f8, A1, B3, f8
  817. FMADD f12, A1, B4, f12
  818. LFD A4, 3 * SIZE(AO)
  819. FMADD f1, A2, B1, f1
  820. FMADD f5, A2, B2, f5
  821. FMADD f9, A2, B3, f9
  822. FMADD f13, A2, B4, f13
  823. LFDU A1, 4 * SIZE(AO)
  824. FMADD f2, A3, B1, f2
  825. FMADD f6, A3, B2, f6
  826. FMADD f10, A3, B3, f10
  827. FMADD f14, A3, B4, f14
  828. LFD A2, 1 * SIZE(AO)
  829. FMADD f3, A4, B1, f3
  830. LFDU B1, 4 * SIZE(BO)
  831. FMADD f7, A4, B2, f7
  832. LFD B2, 1 * SIZE(BO)
  833. FMADD f11, A4, B3, f11
  834. LFD B3, 2 * SIZE(BO)
  835. FMADD f15, A4, B4, f15
  836. LFD B4, 3 * SIZE(BO)
  837. bdnz .L16
  838. .align 4
  839. .LKERNEL_MainFinish:
  840. #ifndef CONJ
  841. FSUB f0, f0, f5
  842. FADD f1, f1, f4
  843. FSUB f2, f2, f7
  844. FADD f3, f3, f6
  845. FSUB f8, f8, f13
  846. FADD f9, f9, f12
  847. FSUB f10, f10, f15
  848. FADD f11, f11, f14
  849. #else
  850. FADD f0, f0, f5
  851. FSUB f1, f4, f1
  852. FADD f2, f2, f7
  853. FSUB f3, f6, f3
  854. FADD f8, f8, f13
  855. FSUB f9, f12, f9
  856. FADD f10, f10, f15
  857. FSUB f11, f14, f11
  858. #endif
  859. #if defined(LN) || defined(RT)
  860. subi r0, KK, 2
  861. slwi r0, r0, 1 + ZBASE_SHIFT
  862. add AO, AORIG, r0
  863. add BO, B, r0
  864. #endif
  865. #if defined(LN) || defined(LT)
  866. LFD f16, 0 * SIZE(BO)
  867. LFD f17, 1 * SIZE(BO)
  868. LFD f18, 2 * SIZE(BO)
  869. LFD f19, 3 * SIZE(BO)
  870. LFD f20, 4 * SIZE(BO)
  871. LFD f21, 5 * SIZE(BO)
  872. LFD f22, 6 * SIZE(BO)
  873. LFD f23, 7 * SIZE(BO)
  874. FSUB f0, f16, f0
  875. FSUB f1, f17, f1
  876. FSUB f8, f18, f8
  877. FSUB f9, f19, f9
  878. FSUB f2, f20, f2
  879. FSUB f3, f21, f3
  880. FSUB f10, f22, f10
  881. FSUB f11, f23, f11
  882. #else
  883. LFD f16, 0 * SIZE(AO)
  884. LFD f17, 1 * SIZE(AO)
  885. LFD f18, 2 * SIZE(AO)
  886. LFD f19, 3 * SIZE(AO)
  887. LFD f20, 4 * SIZE(AO)
  888. LFD f21, 5 * SIZE(AO)
  889. LFD f22, 6 * SIZE(AO)
  890. LFD f23, 7 * SIZE(AO)
  891. #ifndef CONJ
  892. FSUB f0, f16, f0
  893. FSUB f1, f17, f1
  894. FSUB f2, f18, f2
  895. FSUB f3, f19, f3
  896. FSUB f8, f20, f8
  897. FSUB f9, f21, f9
  898. FSUB f10, f22, f10
  899. FSUB f11, f23, f11
  900. #else
  901. FSUB f0, f16, f0
  902. FADD f1, f17, f1
  903. FSUB f2, f18, f2
  904. FADD f3, f19, f3
  905. FSUB f8, f20, f8
  906. FADD f9, f21, f9
  907. FSUB f10, f22, f10
  908. FADD f11, f23, f11
  909. #endif
  910. #endif
  911. #ifdef LN
  912. LFD f16, 6 * SIZE(AO)
  913. LFD f17, 7 * SIZE(AO)
  914. LFD f18, 4 * SIZE(AO)
  915. LFD f19, 5 * SIZE(AO)
  916. LFD f20, 0 * SIZE(AO)
  917. LFD f21, 1 * SIZE(AO)
  918. FMUL f6, f17, f3
  919. FMUL f7, f17, f2
  920. FMUL f14, f17, f11
  921. FMUL f15, f17, f10
  922. #ifndef CONJ
  923. FMSUB f2, f16, f2, f6
  924. FMADD f3, f16, f3, f7
  925. FMSUB f10, f16, f10, f14
  926. FMADD f11, f16, f11, f15
  927. FMADD f0, f19, f3, f0
  928. FNMSUB f1, f19, f2, f1
  929. FMADD f8, f19, f11, f8
  930. FNMSUB f9, f19, f10, f9
  931. FNMSUB f0, f18, f2, f0
  932. FNMSUB f1, f18, f3, f1
  933. FNMSUB f8, f18, f10, f8
  934. FNMSUB f9, f18, f11, f9
  935. FMUL f4, f21, f1
  936. FMUL f5, f21, f0
  937. FMUL f12, f21, f9
  938. FMUL f13, f21, f8
  939. FMSUB f0, f20, f0, f4
  940. FMADD f1, f20, f1, f5
  941. FMSUB f8, f20, f8, f12
  942. FMADD f9, f20, f9, f13
  943. #else
  944. FMADD f2, f16, f2, f6
  945. FMSUB f3, f16, f3, f7
  946. FMADD f10, f16, f10, f14
  947. FMSUB f11, f16, f11, f15
  948. FMSUB f0, f19, f3, f0
  949. FNMADD f1, f19, f2, f1
  950. FMSUB f8, f19, f11, f8
  951. FNMADD f9, f19, f10, f9
  952. FNMADD f0, f18, f2, f0
  953. FNMADD f1, f18, f3, f1
  954. FNMADD f8, f18, f10, f8
  955. FNMADD f9, f18, f11, f9
  956. FMUL f4, f21, f1
  957. FMUL f5, f21, f0
  958. FMUL f12, f21, f9
  959. FMUL f13, f21, f8
  960. FMADD f0, f20, f0, f4
  961. FMSUB f1, f20, f1, f5
  962. FMADD f8, f20, f8, f12
  963. FMSUB f9, f20, f9, f13
  964. #endif
  965. #endif
  966. #ifdef LT
  967. LFD f16, 0 * SIZE(AO)
  968. LFD f17, 1 * SIZE(AO)
  969. LFD f18, 2 * SIZE(AO)
  970. LFD f19, 3 * SIZE(AO)
  971. LFD f20, 6 * SIZE(AO)
  972. LFD f21, 7 * SIZE(AO)
  973. FMUL f4, f17, f1
  974. FMUL f5, f17, f0
  975. FMUL f12, f17, f9
  976. FMUL f13, f17, f8
  977. #ifndef CONJ
  978. FMSUB f0, f16, f0, f4
  979. FMADD f1, f16, f1, f5
  980. FMSUB f8, f16, f8, f12
  981. FMADD f9, f16, f9, f13
  982. FMADD f2, f19, f1, f2
  983. FNMSUB f3, f19, f0, f3
  984. FMADD f10, f19, f9, f10
  985. FNMSUB f11, f19, f8, f11
  986. FNMSUB f2, f18, f0, f2
  987. FNMSUB f3, f18, f1, f3
  988. FNMSUB f10, f18, f8, f10
  989. FNMSUB f11, f18, f9, f11
  990. FMUL f4, f21, f3
  991. FMUL f5, f21, f2
  992. FMUL f12, f21, f11
  993. FMUL f13, f21, f10
  994. FMSUB f2, f20, f2, f4
  995. FMADD f3, f20, f3, f5
  996. FMSUB f10, f20, f10, f12
  997. FMADD f11, f20, f11, f13
  998. #else
  999. FMADD f0, f16, f0, f4
  1000. FMSUB f1, f16, f1, f5
  1001. FMADD f8, f16, f8, f12
  1002. FMSUB f9, f16, f9, f13
  1003. FMSUB f2, f19, f1, f2
  1004. FNMADD f3, f19, f0, f3
  1005. FMSUB f10, f19, f9, f10
  1006. FNMADD f11, f19, f8, f11
  1007. FNMADD f2, f18, f0, f2
  1008. FNMADD f3, f18, f1, f3
  1009. FNMADD f10, f18, f8, f10
  1010. FNMADD f11, f18, f9, f11
  1011. FMUL f4, f21, f3
  1012. FMUL f5, f21, f2
  1013. FMUL f12, f21, f11
  1014. FMUL f13, f21, f10
  1015. FMADD f2, f20, f2, f4
  1016. FMSUB f3, f20, f3, f5
  1017. FMADD f10, f20, f10, f12
  1018. FMSUB f11, f20, f11, f13
  1019. #endif
  1020. #endif
  1021. #ifdef RN
  1022. LFD f16, 0 * SIZE(BO)
  1023. LFD f17, 1 * SIZE(BO)
  1024. LFD f18, 2 * SIZE(BO)
  1025. LFD f19, 3 * SIZE(BO)
  1026. LFD f20, 6 * SIZE(BO)
  1027. LFD f21, 7 * SIZE(BO)
  1028. FMUL f4, f17, f1
  1029. FMUL f5, f17, f0
  1030. FMUL f6, f17, f3
  1031. FMUL f7, f17, f2
  1032. #ifndef CONJ
  1033. FMSUB f0, f16, f0, f4
  1034. FMADD f1, f16, f1, f5
  1035. FMSUB f2, f16, f2, f6
  1036. FMADD f3, f16, f3, f7
  1037. FMADD f8, f19, f1, f8
  1038. FNMSUB f9, f19, f0, f9
  1039. FMADD f10, f19, f3, f10
  1040. FNMSUB f11, f19, f2, f11
  1041. FNMSUB f8, f18, f0, f8
  1042. FNMSUB f9, f18, f1, f9
  1043. FNMSUB f10, f18, f2, f10
  1044. FNMSUB f11, f18, f3, f11
  1045. FMUL f4, f21, f9
  1046. FMUL f5, f21, f8
  1047. FMUL f6, f21, f11
  1048. FMUL f7, f21, f10
  1049. FMSUB f8, f20, f8, f4
  1050. FMADD f9, f20, f9, f5
  1051. FMSUB f10, f20, f10, f6
  1052. FMADD f11, f20, f11, f7
  1053. #else
  1054. FMADD f0, f16, f0, f4
  1055. FMSUB f1, f16, f1, f5
  1056. FMADD f2, f16, f2, f6
  1057. FMSUB f3, f16, f3, f7
  1058. FMSUB f8, f19, f1, f8
  1059. FNMADD f9, f19, f0, f9
  1060. FMSUB f10, f19, f3, f10
  1061. FNMADD f11, f19, f2, f11
  1062. FNMADD f8, f18, f0, f8
  1063. FNMADD f9, f18, f1, f9
  1064. FNMADD f10, f18, f2, f10
  1065. FNMADD f11, f18, f3, f11
  1066. FMUL f4, f21, f9
  1067. FMUL f5, f21, f8
  1068. FMUL f6, f21, f11
  1069. FMUL f7, f21, f10
  1070. FMADD f8, f20, f8, f4
  1071. FMSUB f9, f20, f9, f5
  1072. FMADD f10, f20, f10, f6
  1073. FMSUB f11, f20, f11, f7
  1074. #endif
  1075. #endif
  1076. #ifdef RT
  1077. LFD f16, 6 * SIZE(BO)
  1078. LFD f17, 7 * SIZE(BO)
  1079. LFD f18, 4 * SIZE(BO)
  1080. LFD f19, 5 * SIZE(BO)
  1081. LFD f20, 0 * SIZE(BO)
  1082. LFD f21, 1 * SIZE(BO)
  1083. FMUL f12, f17, f9
  1084. FMUL f13, f17, f8
  1085. FMUL f14, f17, f11
  1086. FMUL f15, f17, f10
  1087. #ifndef CONJ
  1088. FMSUB f8, f16, f8, f12
  1089. FMADD f9, f16, f9, f13
  1090. FMSUB f10, f16, f10, f14
  1091. FMADD f11, f16, f11, f15
  1092. FMADD f0, f19, f9, f0
  1093. FNMSUB f1, f19, f8, f1
  1094. FMADD f2, f19, f11, f2
  1095. FNMSUB f3, f19, f10, f3
  1096. FNMSUB f0, f18, f8, f0
  1097. FNMSUB f1, f18, f9, f1
  1098. FNMSUB f2, f18, f10, f2
  1099. FNMSUB f3, f18, f11, f3
  1100. FMUL f4, f21, f1
  1101. FMUL f5, f21, f0
  1102. FMUL f6, f21, f3
  1103. FMUL f7, f21, f2
  1104. FMSUB f0, f20, f0, f4
  1105. FMADD f1, f20, f1, f5
  1106. FMSUB f2, f20, f2, f6
  1107. FMADD f3, f20, f3, f7
  1108. #else
  1109. FMADD f8, f16, f8, f12
  1110. FMSUB f9, f16, f9, f13
  1111. FMADD f10, f16, f10, f14
  1112. FMSUB f11, f16, f11, f15
  1113. FMSUB f0, f19, f9, f0
  1114. FNMADD f1, f19, f8, f1
  1115. FMSUB f2, f19, f11, f2
  1116. FNMADD f3, f19, f10, f3
  1117. FNMADD f0, f18, f8, f0
  1118. FNMADD f1, f18, f9, f1
  1119. FNMADD f2, f18, f10, f2
  1120. FNMADD f3, f18, f11, f3
  1121. FMUL f4, f21, f1
  1122. FMUL f5, f21, f0
  1123. FMUL f6, f21, f3
  1124. FMUL f7, f21, f2
  1125. FMADD f0, f20, f0, f4
  1126. FMSUB f1, f20, f1, f5
  1127. FMADD f2, f20, f2, f6
  1128. FMSUB f3, f20, f3, f7
  1129. #endif
  1130. #endif
  1131. #ifdef LN
  1132. subi CO1, CO1, 4 * SIZE
  1133. subi CO2, CO2, 4 * SIZE
  1134. #endif
  1135. #if defined(LN) || defined(LT)
  1136. STFD f0, 0 * SIZE(BO)
  1137. STFD f1, 1 * SIZE(BO)
  1138. STFD f8, 2 * SIZE(BO)
  1139. STFD f9, 3 * SIZE(BO)
  1140. STFD f2, 4 * SIZE(BO)
  1141. STFD f3, 5 * SIZE(BO)
  1142. STFD f10, 6 * SIZE(BO)
  1143. STFD f11, 7 * SIZE(BO)
  1144. #else
  1145. STFD f0, 0 * SIZE(AO)
  1146. STFD f1, 1 * SIZE(AO)
  1147. STFD f2, 2 * SIZE(AO)
  1148. STFD f3, 3 * SIZE(AO)
  1149. STFD f8, 4 * SIZE(AO)
  1150. STFD f9, 5 * SIZE(AO)
  1151. STFD f10, 6 * SIZE(AO)
  1152. STFD f11, 7 * SIZE(AO)
  1153. #endif
  1154. STFD f0, 0 * SIZE(CO1)
  1155. STFD f1, 1 * SIZE(CO1)
  1156. STFD f2, 2 * SIZE(CO1)
  1157. STFD f3, 3 * SIZE(CO1)
  1158. STFD f8, 0 * SIZE(CO2)
  1159. STFD f9, 1 * SIZE(CO2)
  1160. STFD f10, 2 * SIZE(CO2)
  1161. STFD f11, 3 * SIZE(CO2)
  1162. #ifndef LN
  1163. addi CO1, CO1, 4 * SIZE
  1164. addi CO2, CO2, 4 * SIZE
  1165. #endif
  1166. #ifdef RT
  1167. slwi r0, K, 1 + ZBASE_SHIFT
  1168. add AORIG, AORIG, r0
  1169. #endif
  1170. #if defined(LT) || defined(RN)
  1171. sub TEMP, K, KK
  1172. slwi TEMP, TEMP, 1 + ZBASE_SHIFT
  1173. add AO, AO, TEMP
  1174. add BO, BO, TEMP
  1175. #endif
  1176. #ifdef LT
  1177. addi KK, KK, 2
  1178. #endif
  1179. #ifdef LN
  1180. subi KK, KK, 2
  1181. #endif
  1182. addic. I, I, -1
  1183. bgt .L11
  1184. .align 4
  /*
   * .L29: bookkeeping executed once the I loop above (bgt .L11) is done.
   * Adjusts B and KK for the active solve variant, then decrements J and
   * branches back to .L10 while J is still positive.
   */
  1185. .L29:
  1186. #ifdef LN
  /* LN: B += K << (1 + ZBASE_SHIFT). */
  1187. slwi r0, K, 1 + ZBASE_SHIFT
  1188. add B, B, r0
  1189. #endif
  1190. #if defined(LT) || defined(RN)
  /* LT/RN: B continues from wherever BO was left. */
  1191. mr B, BO
  1192. #endif
  1193. #ifdef RN
  1194. addi KK, KK, 2
  1195. #endif
  1196. #ifdef RT
  1197. subi KK, KK, 2
  1198. #endif
  /* addic. sets CR0 from the new J; bgt loops while J > 0. */
  1199. addic. J, J, -1
  1200. bgt .L10
  1201. .align 4
  /*
   * .L30: entered when the J loop falls through. Handles the case where
   * bit 0 of N is set; otherwise jumps straight to the epilogue (.L999).
   * Initialises CO1, KK and the A pointer (AORIG or AO) for this pass.
   */
  1202. .L30:
  1203. andi. J, N, 1
  1204. ble .L999
  1205. #ifdef RT
  /* RT: step B back by K << ZBASE_SHIFT and C back by LDC first. */
  1206. slwi r0, K, 0 + ZBASE_SHIFT
  1207. sub B, B, r0
  1208. sub C, C, LDC
  1209. #endif
  /* CO1 is the output pointer used by the STFD stores below. */
  1210. mr CO1, C
  1211. #ifdef LN
  /* LN: KK starts at M + OFFSET and is decremented per tile. */
  1212. add KK, M, OFFSET
  1213. #endif
  1214. #ifdef LT
  /* LT: KK starts at OFFSET and is incremented per tile. */
  1215. mr KK, OFFSET
  1216. #endif
  1217. #if defined(LN) || defined(RT)
  /* LN/RT derive AO from AORIG per tile; LT/RN walk AO directly. */
  1218. mr AORIG, A
  1219. #else
  1220. mr AO, A
  1221. #endif
  1222. #ifndef RT
  1223. add C, C, LDC
  1224. #endif
  /*
   * Setup for the tile handled when bit 0 of M is set (skipped via .L40
   * otherwise). Both variants preload f16-f19 from AO and f20-f23 from
   * B/BO, clear the accumulators f0-f7 from FZERO, and load CTR with the
   * inner-loop trip count divided by four.
   */
  1225. andi. I, M, 1
  1226. ble .L40
  1227. #if defined(LT) || defined(RN)
  1228. LFD f16, 0 * SIZE(AO)
  1229. LFD f17, 1 * SIZE(AO)
  1230. LFD f18, 2 * SIZE(AO)
  1231. LFD f19, 3 * SIZE(AO)
  1232. LFD f20, 0 * SIZE(B)
  1233. LFD f21, 1 * SIZE(B)
  1234. LFD f22, 2 * SIZE(B)
  1235. LFD f23, 3 * SIZE(B)
  1236. lfs f0, FZERO
  1237. fmr f1, f0
  1238. fmr f2, f0
  1239. fmr f3, f0
  1240. fmr f4, f0
  1241. fmr f5, f0
  1242. fmr f6, f0
  1243. fmr f7, f0
  /* LT/RN: trip count is KK; mr/mtspr leave CR0 intact for the ble below. */
  1244. srawi. r0, KK, 2
  1245. mr BO, B
  1246. mtspr CTR, r0
  1247. #else
  1248. #ifdef LN
  /* LN: step AORIG back by K << ZBASE_SHIFT before addressing the tile. */
  1249. slwi r0, K, 0 + ZBASE_SHIFT
  1250. sub AORIG, AORIG, r0
  1251. #endif
  /* LN/RT: AO and BO both start KK << ZBASE_SHIFT bytes into their panels. */
  1252. slwi r0, KK, 0 + ZBASE_SHIFT
  1253. add AO, AORIG, r0
  1254. add BO, B, r0
  /* TEMP = K - KK is the trip count for LN/RT (reused at .L45). */
  1255. sub TEMP, K, KK
  1256. LFD f16, 0 * SIZE(AO)
  1257. LFD f17, 1 * SIZE(AO)
  1258. LFD f18, 2 * SIZE(AO)
  1259. LFD f19, 3 * SIZE(AO)
  1260. LFD f20, 0 * SIZE(BO)
  1261. LFD f21, 1 * SIZE(BO)
  1262. LFD f22, 2 * SIZE(BO)
  1263. LFD f23, 3 * SIZE(BO)
  1264. lfs f0, FZERO
  1265. fmr f1, f0
  1266. fmr f2, f0
  1267. fmr f3, f0
  1268. fmr f4, f0
  1269. fmr f5, f0
  1270. fmr f6, f0
  1271. fmr f7, f0
  1272. srawi. r0, TEMP, 2
  1273. mtspr CTR, r0
  1274. #endif
  /* Skip the unrolled loop when the count/4 computed by srawi. is <= 0. */
  1275. ble .L45
  1276. .align 4
  /*
   * .L42 / .L45 / .L46: inner-product loops of the 1x1 complex tile.
   *
   * On entry (set up by the code just above):
   *   f16,f17 = A(k).re,   A(k).im      f18,f19 = A(k+1).re, A(k+1).im
   *   f20,f21 = B(k).re,   B(k).im      f22,f23 = B(k+1).re, B(k+1).im
   *   f0-f7   = 0, CTR = count / 4
   *
   * Accumulator layout expected by .L47:
   *   f0 = sum A.re*B.re    f1 = sum A.im*B.im
   *   f2 = sum A.im*B.re    f3 = sum A.re*B.im
   * The unrolled loop keeps a second set (f4..f7, same layout) for the odd
   * k-steps; .L45 folds it into f0..f3.
   *
   * Each k-step consumes exactly one complex element of A and one of B
   * (2 * SIZE each), matching the "0 + ZBASE_SHIFT" pointer arithmetic used
   * everywhere else in this tile. The previous loop body paired A(k+1) with
   * B(k), advanced AO by 4 * SIZE per k-step, and kept accumulating into
   * f4..f7 in the remainder loop after .L45 had already folded them.
   */
  .L42:
  /* k + 0 -> f0..f3 */
  FMADD f0, f16, f20, f0
  FMADD f3, f16, f21, f3
  LFD f16, 4 * SIZE(AO)
  FMADD f2, f17, f20, f2
  LFD f20, 4 * SIZE(BO)
  FMADD f1, f17, f21, f1
  LFD f17, 5 * SIZE(AO)
  LFD f21, 5 * SIZE(BO)
  /* k + 1 -> f4..f7 */
  FMADD f4, f18, f22, f4
  FMADD f7, f18, f23, f7
  LFD f18, 6 * SIZE(AO)
  FMADD f6, f19, f22, f6
  LFD f22, 6 * SIZE(BO)
  FMADD f5, f19, f23, f5
  LFD f19, 7 * SIZE(AO)
  LFD f23, 7 * SIZE(BO)
  /* k + 2 -> f0..f3; LFDU advances AO and BO by four complex elements */
  FMADD f0, f16, f20, f0
  FMADD f3, f16, f21, f3
  LFDU f16, 8 * SIZE(AO)
  FMADD f2, f17, f20, f2
  LFDU f20, 8 * SIZE(BO)
  FMADD f1, f17, f21, f1
  LFD f17, 1 * SIZE(AO)
  LFD f21, 1 * SIZE(BO)
  /* k + 3 -> f4..f7, then refill the k+1 registers for the next pass */
  FMADD f4, f18, f22, f4
  FMADD f7, f18, f23, f7
  LFD f18, 2 * SIZE(AO)
  FMADD f6, f19, f22, f6
  LFD f22, 2 * SIZE(BO)
  FMADD f5, f19, f23, f5
  LFD f19, 3 * SIZE(AO)
  LFD f23, 3 * SIZE(BO)
  bdnz .L42
  .align 4
  .L45:
  /* Fold the odd-step accumulators into the primary set. */
  fadd f0, f0, f4
  fadd f1, f1, f5
  fadd f2, f2, f6
  fadd f3, f3, f7
  /* Remainder: count & 3 single k-steps. */
  #if defined(LT) || defined(RN)
  andi. r0, KK, 3
  #else
  andi. r0, TEMP, 3
  #endif
  mtspr CTR, r0
  ble .L47
  .align 4
  .L46:
  /* One k-step; only f0..f3 are touched because f4..f7 are already folded. */
  FMADD f0, f16, f20, f0
  FMADD f3, f16, f21, f3
  LFDU f16, 2 * SIZE(AO)
  FMADD f2, f17, f20, f2
  LFDU f20, 2 * SIZE(BO)
  FMADD f1, f17, f21, f1
  LFD f17, 1 * SIZE(AO)
  LFD f21, 1 * SIZE(BO)
  bdnz .L46
  .align 4
  /*
   * .L47: finish the tile selected by bit 0 of M.
   *  1. Combine the partial sums f0-f3 into one complex value (f0, f1).
   *  2. Subtract it from the two doubles loaded from BO (LN/LT) or AO (RN/RT).
   *  3. Multiply by the complex factor loaded from AO (LN/LT) or BO (RN/RT);
   *     presumably the pre-inverted diagonal element -- TODO confirm.
   *  4. Store the result back to the packed buffer and to CO1, then update
   *     AORIG / AO / BO / KK for the next tile.
   */
  1376. .L47:
  1377. #ifndef CONJ
  1378. FSUB f0, f0, f1
  1379. FADD f1, f2, f3
  1380. #else
  1381. FADD f0, f0, f1
  1382. FSUB f1, f3, f2
  1383. #endif
  1384. #if defined(LN) || defined(RT)
  /* LN/RT: AO and BO point (KK - 1) << ZBASE_SHIFT bytes into their panels. */
  1385. subi r0, KK, 1
  1386. slwi r0, r0, 0 + ZBASE_SHIFT
  1387. add AO, AORIG, r0
  1388. add BO, B, r0
  1389. #endif
  1390. #if defined(LN) || defined(LT)
  1391. LFD f16, 0 * SIZE(BO)
  1392. LFD f17, 1 * SIZE(BO)
  1393. FSUB f0, f16, f0
  1394. FSUB f1, f17, f1
  1395. #else
  1396. LFD f16, 0 * SIZE(AO)
  1397. LFD f17, 1 * SIZE(AO)
  1398. #ifndef CONJ
  1399. FSUB f0, f16, f0
  1400. FSUB f1, f17, f1
  1401. #else
  /* RN/RT with CONJ: the second component is added instead of subtracted. */
  1402. FSUB f0, f16, f0
  1403. FADD f1, f17, f1
  1404. #endif
  1405. #endif
  /*
   * The four variant blocks below perform the same complex multiply
   * (f0, f1) *= (c, d): f4 = d*f1, f5 = d*f0, then
   *   !CONJ: f0 = c*f0 - f4, f1 = c*f1 + f5
   *    CONJ: f0 = c*f0 + f4, f1 = c*f1 - f5
   * They differ only in where (c, d) is loaded from.
   */
  1406. #ifdef LN
  1407. LFD f20, 0 * SIZE(AO)
  1408. LFD f21, 1 * SIZE(AO)
  1409. FMUL f4, f21, f1
  1410. FMUL f5, f21, f0
  1411. #ifndef CONJ
  1412. FMSUB f0, f20, f0, f4
  1413. FMADD f1, f20, f1, f5
  1414. #else
  1415. FMADD f0, f20, f0, f4
  1416. FMSUB f1, f20, f1, f5
  1417. #endif
  1418. #endif
  1419. #ifdef LT
  1420. LFD f16, 0 * SIZE(AO)
  1421. LFD f17, 1 * SIZE(AO)
  1422. FMUL f4, f17, f1
  1423. FMUL f5, f17, f0
  1424. #ifndef CONJ
  1425. FMSUB f0, f16, f0, f4
  1426. FMADD f1, f16, f1, f5
  1427. #else
  1428. FMADD f0, f16, f0, f4
  1429. FMSUB f1, f16, f1, f5
  1430. #endif
  1431. #endif
  1432. #ifdef RN
  1433. LFD f16, 0 * SIZE(BO)
  1434. LFD f17, 1 * SIZE(BO)
  1435. FMUL f4, f17, f1
  1436. FMUL f5, f17, f0
  1437. #ifndef CONJ
  1438. FMSUB f0, f16, f0, f4
  1439. FMADD f1, f16, f1, f5
  1440. #else
  1441. FMADD f0, f16, f0, f4
  1442. FMSUB f1, f16, f1, f5
  1443. #endif
  1444. #endif
  1445. #ifdef RT
  1446. LFD f20, 0 * SIZE(BO)
  1447. LFD f21, 1 * SIZE(BO)
  1448. FMUL f4, f21, f1
  1449. FMUL f5, f21, f0
  1450. #ifndef CONJ
  1451. FMSUB f0, f20, f0, f4
  1452. FMADD f1, f20, f1, f5
  1453. #else
  1454. FMADD f0, f20, f0, f4
  1455. FMSUB f1, f20, f1, f5
  1456. #endif
  1457. #endif
  1458. #ifdef LN
  /* LN: step CO1 back by 2*SIZE before storing; other variants advance it after. */
  1459. subi CO1, CO1, 2 * SIZE
  1460. #endif
  /* Write the solved value back into the packed buffer it was read from. */
  1461. #if defined(LN) || defined(LT)
  1462. STFD f0, 0 * SIZE(BO)
  1463. STFD f1, 1 * SIZE(BO)
  1464. #else
  1465. STFD f0, 0 * SIZE(AO)
  1466. STFD f1, 1 * SIZE(AO)
  1467. #endif
  /* ...and into the output at CO1. */
  1468. STFD f0, 0 * SIZE(CO1)
  1469. STFD f1, 1 * SIZE(CO1)
  1470. #ifndef LN
  1471. addi CO1, CO1, 2 * SIZE
  1472. #endif
  1473. #ifdef RT
  1474. slwi r0, K, 0 + ZBASE_SHIFT
  1475. add AORIG, AORIG, r0
  1476. #endif
  1477. #if defined(LT) || defined(RN)
  /* LT/RN: skip the remaining (K - KK) << ZBASE_SHIFT bytes of both panels. */
  1478. sub TEMP, K, KK
  1479. slwi TEMP, TEMP, 0 + ZBASE_SHIFT
  1480. add AO, AO, TEMP
  1481. add BO, BO, TEMP
  1482. #endif
  1483. #ifdef LT
  1484. addi KK, KK, 1
  1485. #endif
  1486. #ifdef LN
  1487. subi KK, KK, 1
  1488. #endif
  1489. .align 4
  /*
   * .L40 / .L31: setup for the tiles counted by I = M >> 1 (skipped via
   * .L49 when that is <= 0). .L31 is the per-tile loop head (target of
   * "bgt .L31" further down). Each pass preloads f20-f27 from AO and
   * f16-f19 from B/BO, clears f0-f7, and loads CTR with the inner-loop
   * trip count divided by four.
   */
  1490. .L40:
  1491. srawi. I, M, 1
  1492. ble .L49
  1493. .align 4
  1494. .L31:
  1495. #if defined(LT) || defined(RN)
  1496. LFD f20, 0 * SIZE(AO)
  1497. LFD f21, 1 * SIZE(AO)
  1498. LFD f22, 2 * SIZE(AO)
  1499. LFD f23, 3 * SIZE(AO)
  1500. LFD f24, 4 * SIZE(AO)
  1501. LFD f25, 5 * SIZE(AO)
  1502. LFD f26, 6 * SIZE(AO)
  1503. LFD f27, 7 * SIZE(AO)
  1504. LFD f16, 0 * SIZE(B)
  1505. LFD f17, 1 * SIZE(B)
  1506. LFD f18, 2 * SIZE(B)
  1507. LFD f19, 3 * SIZE(B)
  1508. lfs f0, FZERO
  1509. fmr f1, f0
  1510. fmr f2, f0
  1511. fmr f3, f0
  1512. fmr f4, f0
  1513. fmr f5, f0
  1514. fmr f6, f0
  1515. fmr f7, f0
  /* LT/RN: trip count is KK; mr/mtspr leave CR0 intact for the ble below. */
  1516. srawi. r0, KK, 2
  1517. mr BO, B
  1518. mtspr CTR, r0
  1519. #else
  1520. #ifdef LN
  /* LN: step AORIG back by K << (1 + ZBASE_SHIFT) before addressing the tile. */
  1521. slwi r0, K, 1 + ZBASE_SHIFT
  1522. sub AORIG, AORIG, r0
  1523. #endif
  /* LN/RT: AO is offset by KK << (1 + ZBASE_SHIFT), BO by KK << ZBASE_SHIFT. */
  1524. slwi r0, KK, 1 + ZBASE_SHIFT
  1525. slwi TEMP, KK, 0 + ZBASE_SHIFT
  1526. add AO, AORIG, r0
  1527. add BO, B, TEMP
  /* TEMP = K - KK is the trip count for LN/RT (reused at .L35). */
  1528. sub TEMP, K, KK
  1529. LFD f20, 0 * SIZE(AO)
  1530. LFD f21, 1 * SIZE(AO)
  1531. LFD f22, 2 * SIZE(AO)
  1532. LFD f23, 3 * SIZE(AO)
  1533. LFD f24, 4 * SIZE(AO)
  1534. LFD f25, 5 * SIZE(AO)
  1535. LFD f26, 6 * SIZE(AO)
  1536. LFD f27, 7 * SIZE(AO)
  1537. LFD f16, 0 * SIZE(BO)
  1538. LFD f17, 1 * SIZE(BO)
  1539. LFD f18, 2 * SIZE(BO)
  1540. LFD f19, 3 * SIZE(BO)
  1541. lfs f0, FZERO
  1542. fmr f1, f0
  1543. fmr f2, f0
  1544. fmr f3, f0
  1545. fmr f4, f0
  1546. fmr f5, f0
  1547. fmr f6, f0
  1548. fmr f7, f0
  1549. srawi. r0, TEMP, 2
  1550. mtspr CTR, r0
  1551. #endif
  /* Skip the unrolled loop when the count/4 computed by srawi. is <= 0. */
  1552. ble .L35
  1553. .align 4
  /*
   * .L32: inner loop, unrolled four times. Each k-step multiplies two
   * doubles from BO (f16/f17 or f18/f19) with four doubles from AO
   * (f20-f23 or f24-f27):
   *   f0..f3 += BO[0] * AO[0..3]      f4..f7 += BO[1] * AO[0..3]
   * One pass advances BO by 8 * SIZE and AO by 16 * SIZE via the LFDU
   * loads. Loads are interleaved with the multiply-adds; the nop lines are
   * kept as in the original schedule (presumably for issue pairing on the
   * PPC440 -- TODO confirm).
   */
  1554. .L32:
  1555. fmadd f0, f16, f20, f0
  1556. LFD f19, 3 * SIZE(BO)
  1557. fmadd f1, f16, f21, f1
  1558. nop
  1559. fmadd f2, f16, f22, f2
  1560. nop
  1561. fmadd f3, f16, f23, f3
  1562. LFD f16, 4 * SIZE(BO)
  1563. fmadd f4, f17, f20, f4
  1564. LFD f20, 8 * SIZE(AO)
  1565. fmadd f5, f17, f21, f5
  1566. LFD f21, 9 * SIZE(AO)
  1567. fmadd f6, f17, f22, f6
  1568. LFD f22, 10 * SIZE(AO)
  1569. fmadd f7, f17, f23, f7
  1570. LFD f23, 11 * SIZE(AO)
  1571. fmadd f0, f18, f24, f0
  1572. LFD f17, 5 * SIZE(BO)
  1573. fmadd f1, f18, f25, f1
  1574. nop
  1575. fmadd f2, f18, f26, f2
  1576. nop
  1577. fmadd f3, f18, f27, f3
  1578. LFD f18, 6 * SIZE(BO)
  1579. fmadd f4, f19, f24, f4
  1580. LFD f24, 12 * SIZE(AO)
  1581. fmadd f5, f19, f25, f5
  1582. LFD f25, 13 * SIZE(AO)
  1583. fmadd f6, f19, f26, f6
  1584. LFD f26, 14 * SIZE(AO)
  1585. fmadd f7, f19, f27, f7
  1586. LFD f27, 15 * SIZE(AO)
  1587. fmadd f0, f16, f20, f0
  1588. LFD f19, 7 * SIZE(BO)
  1589. fmadd f1, f16, f21, f1
  1590. nop
  1591. fmadd f2, f16, f22, f2
  1592. nop
  1593. fmadd f3, f16, f23, f3
  /* LFDU: BO advances by 8 * SIZE here; later BO offsets are relative to the new BO. */
  1594. LFDU f16, 8 * SIZE(BO)
  1595. fmadd f4, f17, f20, f4
  /* LFDU: AO advances by 16 * SIZE here; later AO offsets are relative to the new AO. */
  1596. LFDU f20, 16 * SIZE(AO)
  1597. fmadd f5, f17, f21, f5
  1598. LFD f21, 1 * SIZE(AO)
  1599. fmadd f6, f17, f22, f6
  1600. LFD f22, 2 * SIZE(AO)
  1601. fmadd f7, f17, f23, f7
  1602. LFD f23, 3 * SIZE(AO)
  1603. fmadd f0, f18, f24, f0
  1604. LFD f17, 1 * SIZE(BO)
  1605. fmadd f1, f18, f25, f1
  1606. nop
  1607. fmadd f2, f18, f26, f2
  1608. nop
  1609. fmadd f3, f18, f27, f3
  1610. LFD f18, 2 * SIZE(BO)
  1611. fmadd f4, f19, f24, f4
  1612. LFD f24, 4 * SIZE(AO)
  1613. fmadd f5, f19, f25, f5
  1614. LFD f25, 5 * SIZE(AO)
  1615. fmadd f6, f19, f26, f6
  1616. LFD f26, 6 * SIZE(AO)
  1617. fmadd f7, f19, f27, f7
  1618. LFD f27, 7 * SIZE(AO)
  1619. bdnz .L32
  1620. .align 4
  /* .L35: remainder count = (KK or TEMP) & 3; skip to .L37 when it is zero. */
  1621. .L35:
  1622. #if defined(LT) || defined(RN)
  1623. andi. r0, KK, 3
  1624. #else
  1625. andi. r0, TEMP, 3
  1626. #endif
  1627. mtspr CTR, r0
  1628. ble .L37
  1629. .align 4
  /* .L36: one k-step per pass; BO advances by 2 * SIZE, AO by 4 * SIZE. */
  1630. .L36:
  1631. fmadd f0, f16, f20, f0
  1632. LFD f17, 1 * SIZE(BO)
  1633. fmadd f1, f16, f21, f1
  1634. nop
  1635. fmadd f2, f16, f22, f2
  1636. nop
  1637. fmadd f3, f16, f23, f3
  1638. LFDU f16, 2 * SIZE(BO)
  1639. fmadd f4, f17, f20, f4
  1640. LFDU f20, 4 * SIZE(AO)
  1641. fmadd f5, f17, f21, f5
  1642. LFD f21, 1 * SIZE(AO)
  1643. fmadd f6, f17, f22, f6
  1644. LFD f22, 2 * SIZE(AO)
  1645. fmadd f7, f17, f23, f7
  1646. LFD f23, 3 * SIZE(AO)
  1647. bdnz .L36
  1648. .align 4
  /*
   * .L37: finish one tile of the I = M >> 1 loop.
   *  1. Combine the eight partial sums into two complex values,
   *     (f0, f1) and (f2, f3).
   *  2. Subtract them from the four doubles loaded from BO (LN/LT) or
   *     AO (RN/RT).
   *  3. Apply the variant-specific solve: LN/LT use six doubles from AO
   *     (three complex coefficients) and couple the two values; RN/RT
   *     scale both values by one complex coefficient from BO. The
   *     coefficients are used as multipliers, so the diagonal is
   *     presumably stored pre-inverted -- TODO confirm.
   *  4. Store to the packed buffer and to CO1, update pointers and KK,
   *     and loop back to .L31 while I > 0.
   */
  1649. .L37:
  1650. #ifndef CONJ
  1651. FSUB f0, f0, f5
  1652. FADD f1, f1, f4
  1653. FSUB f2, f2, f7
  1654. FADD f3, f3, f6
  1655. #else
  1656. FADD f0, f0, f5
  1657. FSUB f1, f4, f1
  1658. FADD f2, f2, f7
  1659. FSUB f3, f6, f3
  1660. #endif
  1661. #if defined(LN) || defined(RT)
  /* LN rewinds by 2 elements of KK, RT by 1, then rebuilds AO and BO. */
  1662. #ifdef LN
  1663. subi r0, KK, 2
  1664. #else
  1665. subi r0, KK, 1
  1666. #endif
  1667. slwi TEMP, r0, 1 + ZBASE_SHIFT
  1668. slwi r0, r0, 0 + ZBASE_SHIFT
  1669. add AO, AORIG, TEMP
  1670. add BO, B, r0
  1671. #endif
  1672. #if defined(LN) || defined(LT)
  1673. LFD f16, 0 * SIZE(BO)
  1674. LFD f17, 1 * SIZE(BO)
  1675. LFD f18, 2 * SIZE(BO)
  1676. LFD f19, 3 * SIZE(BO)
  1677. FSUB f0, f16, f0
  1678. FSUB f1, f17, f1
  1679. FSUB f2, f18, f2
  1680. FSUB f3, f19, f3
  1681. #else
  1682. LFD f16, 0 * SIZE(AO)
  1683. LFD f17, 1 * SIZE(AO)
  1684. LFD f18, 2 * SIZE(AO)
  1685. LFD f19, 3 * SIZE(AO)
  1686. #ifndef CONJ
  1687. FSUB f0, f16, f0
  1688. FSUB f1, f17, f1
  1689. FSUB f2, f18, f2
  1690. FSUB f3, f19, f3
  1691. #else
  /* RN/RT with CONJ: the second component of each value is added instead. */
  1692. FSUB f0, f16, f0
  1693. FADD f1, f17, f1
  1694. FSUB f2, f18, f2
  1695. FADD f3, f19, f3
  1696. #endif
  1697. #endif
  1698. #ifdef LN
  /*
   * LN: scale (f2, f3) by AO[6..7] first, eliminate it from (f0, f1)
   * using AO[4..5], then scale (f0, f1) by AO[0..1].
   */
  1699. LFD f16, 6 * SIZE(AO)
  1700. LFD f17, 7 * SIZE(AO)
  1701. LFD f18, 4 * SIZE(AO)
  1702. LFD f19, 5 * SIZE(AO)
  1703. LFD f20, 0 * SIZE(AO)
  1704. LFD f21, 1 * SIZE(AO)
  1705. FMUL f6, f17, f3
  1706. FMUL f7, f17, f2
  1707. #ifndef CONJ
  1708. FMSUB f2, f16, f2, f6
  1709. FMADD f3, f16, f3, f7
  1710. FMADD f0, f19, f3, f0
  1711. FNMSUB f1, f19, f2, f1
  1712. FNMSUB f0, f18, f2, f0
  1713. FNMSUB f1, f18, f3, f1
  1714. FMUL f4, f21, f1
  1715. FMUL f5, f21, f0
  1716. FMSUB f0, f20, f0, f4
  1717. FMADD f1, f20, f1, f5
  1718. #else
  1719. FMADD f2, f16, f2, f6
  1720. FMSUB f3, f16, f3, f7
  1721. FMSUB f0, f19, f3, f0
  1722. FNMADD f1, f19, f2, f1
  1723. FNMADD f0, f18, f2, f0
  1724. FNMADD f1, f18, f3, f1
  1725. FMUL f4, f21, f1
  1726. FMUL f5, f21, f0
  1727. FMADD f0, f20, f0, f4
  1728. FMSUB f1, f20, f1, f5
  1729. #endif
  1730. #endif
  1731. #ifdef LT
  /*
   * LT: scale (f0, f1) by AO[0..1] first, eliminate it from (f2, f3)
   * using AO[2..3], then scale (f2, f3) by AO[6..7].
   */
  1732. LFD f16, 0 * SIZE(AO)
  1733. LFD f17, 1 * SIZE(AO)
  1734. LFD f18, 2 * SIZE(AO)
  1735. LFD f19, 3 * SIZE(AO)
  1736. LFD f20, 6 * SIZE(AO)
  1737. LFD f21, 7 * SIZE(AO)
  1738. FMUL f4, f17, f1
  1739. FMUL f5, f17, f0
  1740. #ifndef CONJ
  1741. FMSUB f0, f16, f0, f4
  1742. FMADD f1, f16, f1, f5
  1743. FMADD f2, f19, f1, f2
  1744. FNMSUB f3, f19, f0, f3
  1745. FNMSUB f2, f18, f0, f2
  1746. FNMSUB f3, f18, f1, f3
  1747. FMUL f4, f21, f3
  1748. FMUL f5, f21, f2
  1749. FMSUB f2, f20, f2, f4
  1750. FMADD f3, f20, f3, f5
  1751. #else
  1752. FMADD f0, f16, f0, f4
  1753. FMSUB f1, f16, f1, f5
  1754. FMSUB f2, f19, f1, f2
  1755. FNMADD f3, f19, f0, f3
  1756. FNMADD f2, f18, f0, f2
  1757. FNMADD f3, f18, f1, f3
  1758. FMUL f4, f21, f3
  1759. FMUL f5, f21, f2
  1760. FMADD f2, f20, f2, f4
  1761. FMSUB f3, f20, f3, f5
  1762. #endif
  1763. #endif
  1764. #ifdef RN
  /* RN: both values are scaled by the single complex coefficient BO[0..1]. */
  1765. LFD f16, 0 * SIZE(BO)
  1766. LFD f17, 1 * SIZE(BO)
  1767. FMUL f4, f17, f1
  1768. FMUL f5, f17, f0
  1769. FMUL f6, f17, f3
  1770. FMUL f7, f17, f2
  1771. #ifndef CONJ
  1772. FMSUB f0, f16, f0, f4
  1773. FMADD f1, f16, f1, f5
  1774. FMSUB f2, f16, f2, f6
  1775. FMADD f3, f16, f3, f7
  1776. #else
  1777. FMADD f0, f16, f0, f4
  1778. FMSUB f1, f16, f1, f5
  1779. FMADD f2, f16, f2, f6
  1780. FMSUB f3, f16, f3, f7
  1781. #endif
  1782. #endif
  1783. #ifdef RT
  /* RT: same scaling as RN, with the coefficient held in f20/f21. */
  1784. LFD f20, 0 * SIZE(BO)
  1785. LFD f21, 1 * SIZE(BO)
  1786. FMUL f4, f21, f1
  1787. FMUL f5, f21, f0
  1788. FMUL f6, f21, f3
  1789. FMUL f7, f21, f2
  1790. #ifndef CONJ
  1791. FMSUB f0, f20, f0, f4
  1792. FMADD f1, f20, f1, f5
  1793. FMSUB f2, f20, f2, f6
  1794. FMADD f3, f20, f3, f7
  1795. #else
  1796. FMADD f0, f20, f0, f4
  1797. FMSUB f1, f20, f1, f5
  1798. FMADD f2, f20, f2, f6
  1799. FMSUB f3, f20, f3, f7
  1800. #endif
  1801. #endif
  1802. #ifdef LN
  /* LN: step CO1 back by 4*SIZE before storing; other variants advance it after. */
  1803. subi CO1, CO1, 4 * SIZE
  1804. #endif
  /* Write the solved values back into the packed buffer they were read from. */
  1805. #if defined(LN) || defined(LT)
  1806. STFD f0, 0 * SIZE(BO)
  1807. STFD f1, 1 * SIZE(BO)
  1808. STFD f2, 2 * SIZE(BO)
  1809. STFD f3, 3 * SIZE(BO)
  1810. #else
  1811. STFD f0, 0 * SIZE(AO)
  1812. STFD f1, 1 * SIZE(AO)
  1813. STFD f2, 2 * SIZE(AO)
  1814. STFD f3, 3 * SIZE(AO)
  1815. #endif
  /* ...and into the output at CO1. */
  1816. STFD f0, 0 * SIZE(CO1)
  1817. STFD f1, 1 * SIZE(CO1)
  1818. STFD f2, 2 * SIZE(CO1)
  1819. STFD f3, 3 * SIZE(CO1)
  1820. #ifndef LN
  1821. addi CO1, CO1, 4 * SIZE
  1822. #endif
  1823. #ifdef RT
  1824. slwi r0, K, 1 + ZBASE_SHIFT
  1825. add AORIG, AORIG, r0
  1826. #endif
  1827. #if defined(LT) || defined(RN)
  /* LT/RN: skip the remaining K - KK steps: AO by << (1 + ZBASE_SHIFT), BO by << ZBASE_SHIFT. */
  1828. sub TEMP, K, KK
  1829. slwi r0, TEMP, 1 + ZBASE_SHIFT
  1830. slwi TEMP, TEMP, 0 + ZBASE_SHIFT
  1831. add AO, AO, r0
  1832. add BO, BO, TEMP
  1833. #endif
  1834. #ifdef LT
  1835. addi KK, KK, 2
  1836. #endif
  1837. #ifdef LN
  1838. subi KK, KK, 2
  1839. #endif
  /* Next tile while I > 0. */
  1840. addic. I, I, -1
  1841. bgt .L31
  1842. .align 4
  /*
   * .L49: bookkeeping after the last tile of the pass started at .L30.
   * Updates B and KK for the active solve variant, then falls through to
   * the epilogue at .L999.
   */
  1843. .L49:
  1844. #ifdef LN
  /* LN: B += K << ZBASE_SHIFT. */
  1845. slwi r0, K, 0 + ZBASE_SHIFT
  1846. add B, B, r0
  1847. #endif
  1848. #if defined(LT) || defined(RN)
  /* LT/RN: B continues from wherever BO was left. */
  1849. mr B, BO
  1850. #endif
  1851. #ifdef RN
  1852. addi KK, KK, 1
  1853. #endif
  1854. #ifdef RT
  1855. subi KK, KK, 1
  1856. #endif
  1857. .align 4
  /*
   * .L999: common exit. Sets the return value in r3 to 0, reloads f14-f31
   * and r21-r31 from the stack frame at SP, releases the frame
   * (SP += STACKSIZE) and returns to the caller.
   */
  1858. .L999:
  /* addi with RA = 0 uses the literal 0, so this is r3 = 0. */
  1859. addi r3, 0, 0
  /* Floating-point registers: 8-byte slots at SP + 0 .. SP + 136. */
  1860. lfd f14, 0(SP)
  1861. lfd f15, 8(SP)
  1862. lfd f16, 16(SP)
  1863. lfd f17, 24(SP)
  1864. lfd f18, 32(SP)
  1865. lfd f19, 40(SP)
  1866. lfd f20, 48(SP)
  1867. lfd f21, 56(SP)
  1868. lfd f22, 64(SP)
  1869. lfd f23, 72(SP)
  1870. lfd f24, 80(SP)
  1871. lfd f25, 88(SP)
  1872. lfd f26, 96(SP)
  1873. lfd f27, 104(SP)
  1874. lfd f28, 112(SP)
  1875. lfd f29, 120(SP)
  1876. lfd f30, 128(SP)
  1877. lfd f31, 136(SP)
  /* General-purpose registers start at SP + 144: 8-byte slots on 64-bit, 4-byte otherwise. */
  1878. #ifdef __64BIT__
  1879. ld r31, 144(SP)
  1880. ld r30, 152(SP)
  1881. ld r29, 160(SP)
  1882. ld r28, 168(SP)
  1883. ld r27, 176(SP)
  1884. ld r26, 184(SP)
  1885. ld r25, 192(SP)
  1886. ld r24, 200(SP)
  1887. ld r23, 208(SP)
  1888. ld r22, 216(SP)
  1889. ld r21, 224(SP)
  1890. #else
  1891. lwz r31, 144(SP)
  1892. lwz r30, 148(SP)
  1893. lwz r29, 152(SP)
  1894. lwz r28, 156(SP)
  1895. lwz r27, 160(SP)
  1896. lwz r26, 164(SP)
  1897. lwz r25, 168(SP)
  1898. lwz r24, 172(SP)
  1899. lwz r23, 176(SP)
  1900. lwz r22, 180(SP)
  1901. lwz r21, 184(SP)
  1902. #endif
  /* Release the stack frame and return. */
  1903. addi SP, SP, STACKSIZE
  1904. blr
  1905. EPILOGUE