You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

trsm_kernel_LN.S 59 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
7327832793280328132823283328432853286328732883289329032913292329332943295329632973298329933003301330233033304330533063307330833093310331133123313331433153316331733183319332033213322332333243325332633273328332933303331333233333334333533363337333833393340334133423343334433453346334733483349335033513352335333543355335633573358335933603361336233633364336533663367336833693370337133723373337433753376337733783379338033813382338333843385338633873388338933903391339233933394339533963397339833993400340134023403340434053406340734083409341034113412341334143415341634173418341934203421342234233424342534263427342834293430343134323433343434353436343734383439344034413442344334443445344634473448344934503451345234533454345534563457345834593460346134623463346434653466346734683469347034713472347334743475347634773478347934803481348234833484348534863487348834893490349134923493349434953496349734983499350035013502350335043505350635073508350935103511351235133514351535163517351835193520352135223523352435253526352735283529353035313532353335343535353635373538353935403541354235433544
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Register aliases for this kernel. Integer registers are named by */
  /* role; floating-point registers hold loaded operands (a*, b*) and */
  /* accumulators (c*). */
  /* */
  /* Size operands. Under LN, M * K is scaled by BASE_SHIFT and added to A; */
  /* under RT, N * K is scaled and added to B. N >> 3 is the outer-loop */
  /* count, and M & 1 / M >> 1 are the row-loop counts. */
  /* NOTE(review): presumably the incoming integer arguments -- confirm */
  /* the register assignment against the calling convention in use. */
  40. #define M $4
  41. #define N $5
  42. #define K $6
  /* Base addresses of the two input panels (A, B) and of the output (C). */
  43. #define A $8
  44. #define B $9
  45. #define C $10
  /* Shifted left by BASE_SHIFT on entry, then used as the step between */
  /* CO1..CO8. */
  46. #define LDC $11
  /* Working pointers: the inner loops address all of their loads off AO */
  /* and BO and advance them as they go. */
  47. #define AO $12
  48. #define BO $13
  /* Loop counters: I = row-loop count (M & 1, then M >> 1), J = outer-loop */
  /* count (N >> 3), L = inner-loop trip count (count >> 2 for the */
  /* four-step unrolled loop, count & 3 for the remainder loop). */
  49. #define I $2
  50. #define J $3
  51. #define L $7
  /* Eight output pointers: CO1 = C and each following pointer is the */
  /* previous one plus LDC. c11..c81 are stored through CO1..CO8 in order. */
  52. #define CO1 $14
  53. #define CO2 $15
  54. #define CO3 $16
  55. #define CO4 $17
  56. #define CO5 $18
  57. #define CO6 $19
  58. #define CO7 $20
  59. #define CO8 $21
  /* Loaded from 144($sp) right after the 144-byte frame is allocated. */
  60. #define OFFSET $22
  /* Running offset seeded from OFFSET: -OFFSET (RN), N - OFFSET (RT), */
  /* M + OFFSET (LN) or OFFSET (LT). */
  61. #define KK $23
  /* Scratch register for address arithmetic. */
  62. #define TEMP $24
  /* Under LN/RT: copy of A from which AO is recomputed (AORIG + scaled KK). */
  63. #define AORIG $25
  /* Operands loaded through AO. Note the numbering is not contiguous: */
  /* a3 and a4 live in $f27 and $f28. */
  64. #define a1 $f0
  65. #define a2 $f1
  66. #define a3 $f27
  67. #define a4 $f28
  /* Operands loaded through BO (or B before BO is set up). They are also */
  /* reused as temporaries by the SUB/MUL/NMSUB sequences that follow the */
  /* inner loops. */
  68. #define b1 $f2
  69. #define b2 $f3
  70. #define b3 $f4
  71. #define b4 $f5
  72. #define b5 $f6
  73. #define b6 $f7
  74. #define b7 $f8
  75. #define b8 $f9
  /* a5 is a second name for b8 ($f9); the two cannot hold different */
  /* values at the same time. */
  76. #define a5 b8
  /* Accumulators. The first digit is the column: cN1 is stored through */
  /* CO<N>. The cN2 registers are the second accumulator set used by the */
  /* two-row loop (.L12), where they are multiplied by a2. */
  /* $f15 is skipped in this sequence because it is bound to ALPHA below. */
  77. #define c11 $f10
  78. #define c12 $f11
  79. #define c21 $f12
  80. #define c22 $f13
  81. #define c31 $f14
  82. #define c32 $f16
  83. #define c41 $f17
  84. #define c42 $f18
  85. #define c51 $f19
  86. #define c52 $f20
  87. #define c61 $f21
  88. #define c62 $f22
  89. #define c71 $f23
  90. #define c72 $f24
  91. #define c81 $f25
  92. #define c82 $f26
  /* NOTE(review): not referenced in the part of the file reviewed here; */
  /* presumably the floating-point scalar argument -- TODO confirm. */
  93. #define ALPHA $f15
  94. PROLOGUE
  95. daddiu $sp, $sp, -144
  96. SDARG $16, 0($sp)
  97. SDARG $17, 8($sp)
  98. SDARG $18, 16($sp)
  99. SDARG $19, 24($sp)
  100. SDARG $20, 32($sp)
  101. SDARG $21, 40($sp)
  102. sdc1 $f24, 48($sp)
  103. sdc1 $f25, 56($sp)
  104. sdc1 $f26, 64($sp)
  105. sdc1 $f27, 72($sp)
  106. sdc1 $f28, 80($sp)
  107. SDARG $22, 88($sp)
  108. SDARG $23, 96($sp)
  109. SDARG $24, 104($sp)
  110. SDARG $25, 112($sp)
  111. #ifndef __64BIT__
  112. sdc1 $f20,112($sp)
  113. sdc1 $f21,120($sp)
  114. sdc1 $f22,128($sp)
  115. sdc1 $f23,136($sp)
  116. #endif
  117. LDARG OFFSET, 144($sp)
  118. dsll LDC, LDC, BASE_SHIFT
  119. #ifdef LN
  120. mult M, K
  121. mflo TEMP
  122. dsll TEMP, TEMP, BASE_SHIFT
  123. daddu A, A, TEMP
  124. dsll TEMP, M, BASE_SHIFT
  125. daddu C, C, TEMP
  126. #endif
  127. #ifdef RN
  128. neg KK, OFFSET
  129. #endif
  130. #ifdef RT
  131. mult N, K
  132. mflo TEMP
  133. dsll TEMP, TEMP, BASE_SHIFT
  134. daddu B, B, TEMP
  135. mult N, LDC
  136. mflo TEMP
  137. daddu C, C, TEMP
  138. dsubu KK, N, OFFSET
  139. #endif
  140. dsra J, N, 3
  141. blez J, .L30
  142. nop
  143. .L10:
  144. #ifdef RT
  145. dsll TEMP, K, 3 + BASE_SHIFT
  146. dsubu B, B, TEMP
  147. dsll TEMP, LDC, 3
  148. dsubu C, C, TEMP
  149. #endif
  150. move CO1, C
  151. MTC $0, c11
  152. daddu CO2, C, LDC
  153. daddu CO3, CO2, LDC
  154. daddiu J, J, -1
  155. daddu CO4, CO3, LDC
  156. MOV c21, c11
  157. daddu CO5, CO4, LDC
  158. MOV c31, c11
  159. daddu CO6, CO5, LDC
  160. MOV c41, c11
  161. daddu CO7, CO6, LDC
  162. MOV c51, c11
  163. daddu CO8, CO7, LDC
  164. #ifdef LN
  165. daddu KK, M, OFFSET
  166. #endif
  167. #ifdef LT
  168. move KK, OFFSET
  169. #endif
  170. #if defined(LN) || defined(RT)
  171. move AORIG, A
  172. #else
  173. move AO, A
  174. #endif
  175. #ifndef RT
  176. daddu C, CO8, LDC
  177. #endif
  178. andi I, M, 1
  179. MOV c61, c11
  180. blez I, .L20
  181. MOV c71, c11
  182. #if defined(LT) || defined(RN)
  183. LD a1, 0 * SIZE(AO)
  184. LD a2, 1 * SIZE(AO)
  185. LD a3, 2 * SIZE(AO)
  186. LD a4, 3 * SIZE(AO)
  187. LD b1, 0 * SIZE(B)
  188. LD b2, 1 * SIZE(B)
  189. LD b3, 2 * SIZE(B)
  190. LD b4, 3 * SIZE(B)
  191. LD b5, 4 * SIZE(B)
  192. LD b6, 8 * SIZE(B)
  193. LD b7, 12 * SIZE(B)
  194. dsra L, KK, 2
  195. MOV c81, c11
  196. blez L, .L25
  197. move BO, B
  198. #else
  199. #ifdef LN
  200. dsll TEMP, K, 0 + BASE_SHIFT
  201. dsubu AORIG, AORIG, TEMP
  202. #endif
  203. dsll L, KK, 0 + BASE_SHIFT
  204. dsll TEMP, KK, 3 + BASE_SHIFT
  205. daddu AO, AORIG, L
  206. daddu BO, B, TEMP
  207. dsubu TEMP, K, KK
  208. LD a1, 0 * SIZE(AO)
  209. LD a2, 1 * SIZE(AO)
  210. LD a3, 2 * SIZE(AO)
  211. LD a4, 3 * SIZE(AO)
  212. LD b1, 0 * SIZE(BO)
  213. LD b2, 1 * SIZE(BO)
  214. LD b3, 2 * SIZE(BO)
  215. LD b4, 3 * SIZE(BO)
  216. LD b5, 4 * SIZE(BO)
  217. LD b6, 8 * SIZE(BO)
  218. LD b7, 12 * SIZE(BO)
  219. dsra L, TEMP, 2
  220. MOV c81, c11
  221. blez L, .L25
  222. NOP
  223. #endif
  224. .align 3
  225. .L22:
  226. MADD c11, c11, a1, b1
  227. LD b1, 16 * SIZE(BO)
  228. MADD c21, c21, a1, b2
  229. LD b2, 5 * SIZE(BO)
  230. MADD c31, c31, a1, b3
  231. LD b3, 6 * SIZE(BO)
  232. MADD c41, c41, a1, b4
  233. LD b4, 7 * SIZE(BO)
  234. MADD c51, c51, a1, b5
  235. LD b5, 20 * SIZE(BO)
  236. MADD c61, c61, a1, b2
  237. LD b2, 9 * SIZE(BO)
  238. MADD c71, c71, a1, b3
  239. LD b3, 10 * SIZE(BO)
  240. MADD c81, c81, a1, b4
  241. LD b4, 11 * SIZE(BO)
  242. LD a1, 4 * SIZE(AO)
  243. daddiu L, L, -1
  244. MADD c11, c11, a2, b6
  245. LD b6, 24 * SIZE(BO)
  246. MADD c21, c21, a2, b2
  247. LD b2, 13 * SIZE(BO)
  248. MADD c31, c31, a2, b3
  249. LD b3, 14 * SIZE(BO)
  250. MADD c41, c41, a2, b4
  251. LD b4, 15 * SIZE(BO)
  252. MADD c51, c51, a2, b7
  253. LD b7, 28 * SIZE(BO)
  254. MADD c61, c61, a2, b2
  255. LD b2, 17 * SIZE(BO)
  256. MADD c71, c71, a2, b3
  257. LD b3, 18 * SIZE(BO)
  258. MADD c81, c81, a2, b4
  259. LD b4, 19 * SIZE(BO)
  260. LD a2, 5 * SIZE(AO)
  261. daddiu AO, AO, 4 * SIZE
  262. MADD c11, c11, a3, b1
  263. LD b1, 32 * SIZE(BO)
  264. MADD c21, c21, a3, b2
  265. LD b2, 21 * SIZE(BO)
  266. MADD c31, c31, a3, b3
  267. LD b3, 22 * SIZE(BO)
  268. MADD c41, c41, a3, b4
  269. LD b4, 23 * SIZE(BO)
  270. MADD c51, c51, a3, b5
  271. LD b5, 36 * SIZE(BO)
  272. MADD c61, c61, a3, b2
  273. LD b2, 25 * SIZE(BO)
  274. MADD c71, c71, a3, b3
  275. LD b3, 26 * SIZE(BO)
  276. MADD c81, c81, a3, b4
  277. LD b4, 27 * SIZE(BO)
  278. LD a3, 2 * SIZE(AO)
  279. daddiu BO, BO, 32 * SIZE
  280. MADD c11, c11, a4, b6
  281. LD b6, 8 * SIZE(BO)
  282. MADD c21, c21, a4, b2
  283. LD b2, -3 * SIZE(BO)
  284. MADD c31, c31, a4, b3
  285. LD b3, -2 * SIZE(BO)
  286. MADD c41, c41, a4, b4
  287. LD b4, -1 * SIZE(BO)
  288. MADD c51, c51, a4, b7
  289. LD b7, 12 * SIZE(BO)
  290. MADD c61, c61, a4, b2
  291. LD b2, 1 * SIZE(BO)
  292. MADD c71, c71, a4, b3
  293. LD b3, 2 * SIZE(BO)
  294. MADD c81, c81, a4, b4
  295. LD b4, 3 * SIZE(BO)
  296. bgtz L, .L22
  297. LD a4, 3 * SIZE(AO)
  298. .align 3
  299. .L25:
  300. #if defined(LT) || defined(RN)
  301. andi L, KK, 3
  302. #else
  303. andi L, TEMP, 3
  304. #endif
  305. NOP
  306. blez L, .L28
  307. NOP
  308. .align 3
  309. .L26:
  310. MADD c11, c11, a1, b1
  311. LD b1, 8 * SIZE(BO)
  312. MADD c21, c21, a1, b2
  313. LD b2, 5 * SIZE(BO)
  314. MADD c31, c31, a1, b3
  315. LD b3, 6 * SIZE(BO)
  316. MADD c41, c41, a1, b4
  317. LD b4, 7 * SIZE(BO)
  318. daddiu L, L, -1
  319. MOV a2, a2
  320. daddiu AO, AO, 1 * SIZE
  321. daddiu BO, BO, 8 * SIZE
  322. MADD c51, c51, a1, b5
  323. LD b5, 4 * SIZE(BO)
  324. MADD c61, c61, a1, b2
  325. LD b2, 1 * SIZE(BO)
  326. MADD c71, c71, a1, b3
  327. LD b3, 2 * SIZE(BO)
  328. MADD c81, c81, a1, b4
  329. LD a1, 0 * SIZE(AO)
  330. bgtz L, .L26
  331. LD b4, 3 * SIZE(BO)
  332. .L28:
  333. #if defined(LN) || defined(RT)
  334. #ifdef LN
  335. daddiu TEMP, KK, -1
  336. #else
  337. daddiu TEMP, KK, -8
  338. #endif
  339. dsll L, TEMP, 0 + BASE_SHIFT
  340. dsll TEMP, TEMP, 3 + BASE_SHIFT
  341. daddu AO, AORIG, L
  342. daddu BO, B, TEMP
  343. #endif
  344. #if defined(LN) || defined(LT)
  345. LD b1, 0 * SIZE(BO)
  346. LD b2, 1 * SIZE(BO)
  347. LD b3, 2 * SIZE(BO)
  348. LD b4, 3 * SIZE(BO)
  349. LD b5, 4 * SIZE(BO)
  350. LD b6, 5 * SIZE(BO)
  351. LD b7, 6 * SIZE(BO)
  352. LD b8, 7 * SIZE(BO)
  353. SUB c11, b1, c11
  354. SUB c21, b2, c21
  355. SUB c31, b3, c31
  356. SUB c41, b4, c41
  357. SUB c51, b5, c51
  358. SUB c61, b6, c61
  359. SUB c71, b7, c71
  360. SUB c81, b8, c81
  361. #else
  362. LD b1, 0 * SIZE(AO)
  363. LD b2, 1 * SIZE(AO)
  364. LD b3, 2 * SIZE(AO)
  365. LD b4, 3 * SIZE(AO)
  366. LD b5, 4 * SIZE(AO)
  367. LD b6, 5 * SIZE(AO)
  368. LD b7, 6 * SIZE(AO)
  369. LD b8, 7 * SIZE(AO)
  370. SUB c11, b1, c11
  371. SUB c21, b2, c21
  372. SUB c31, b3, c31
  373. SUB c41, b4, c41
  374. SUB c51, b5, c51
  375. SUB c61, b6, c61
  376. SUB c71, b7, c71
  377. SUB c81, b8, c81
  378. #endif
  379. #if defined(LN) || defined(LT)
  380. LD b1, 0 * SIZE(AO)
  381. MUL c11, b1, c11
  382. MUL c21, b1, c21
  383. MUL c31, b1, c31
  384. MUL c41, b1, c41
  385. MUL c51, b1, c51
  386. MUL c61, b1, c61
  387. MUL c71, b1, c71
  388. MUL c81, b1, c81
  389. #endif
  390. #ifdef RN
  391. LD b1, 0 * SIZE(BO)
  392. LD b2, 1 * SIZE(BO)
  393. LD b3, 2 * SIZE(BO)
  394. LD b4, 3 * SIZE(BO)
  395. LD b5, 4 * SIZE(BO)
  396. LD b6, 5 * SIZE(BO)
  397. LD b7, 6 * SIZE(BO)
  398. LD b8, 7 * SIZE(BO)
  399. MUL c11, b1, c11
  400. NMSUB c21, c21, b2, c11
  401. NMSUB c31, c31, b3, c11
  402. NMSUB c41, c41, b4, c11
  403. NMSUB c51, c51, b5, c11
  404. NMSUB c61, c61, b6, c11
  405. NMSUB c71, c71, b7, c11
  406. NMSUB c81, c81, b8, c11
  407. LD b2, 9 * SIZE(BO)
  408. LD b3, 10 * SIZE(BO)
  409. LD b4, 11 * SIZE(BO)
  410. LD b5, 12 * SIZE(BO)
  411. LD b6, 13 * SIZE(BO)
  412. LD b7, 14 * SIZE(BO)
  413. LD b8, 15 * SIZE(BO)
  414. MUL c21, b2, c21
  415. NMSUB c31, c31, b3, c21
  416. NMSUB c41, c41, b4, c21
  417. NMSUB c51, c51, b5, c21
  418. NMSUB c61, c61, b6, c21
  419. NMSUB c71, c71, b7, c21
  420. NMSUB c81, c81, b8, c21
  421. LD b3, 18 * SIZE(BO)
  422. LD b4, 19 * SIZE(BO)
  423. LD b5, 20 * SIZE(BO)
  424. LD b6, 21 * SIZE(BO)
  425. LD b7, 22 * SIZE(BO)
  426. LD b8, 23 * SIZE(BO)
  427. MUL c31, b3, c31
  428. NMSUB c41, c41, b4, c31
  429. NMSUB c51, c51, b5, c31
  430. NMSUB c61, c61, b6, c31
  431. NMSUB c71, c71, b7, c31
  432. NMSUB c81, c81, b8, c31
  433. LD b4, 27 * SIZE(BO)
  434. LD b5, 28 * SIZE(BO)
  435. LD b6, 29 * SIZE(BO)
  436. LD b7, 30 * SIZE(BO)
  437. LD b8, 31 * SIZE(BO)
  438. MUL c41, b4, c41
  439. NMSUB c51, c51, b5, c41
  440. NMSUB c61, c61, b6, c41
  441. NMSUB c71, c71, b7, c41
  442. NMSUB c81, c81, b8, c41
  443. LD b5, 36 * SIZE(BO)
  444. LD b6, 37 * SIZE(BO)
  445. LD b7, 38 * SIZE(BO)
  446. LD b8, 39 * SIZE(BO)
  447. MUL c51, b5, c51
  448. NMSUB c61, c61, b6, c51
  449. NMSUB c71, c71, b7, c51
  450. NMSUB c81, c81, b8, c51
  451. LD b6, 45 * SIZE(BO)
  452. LD b7, 46 * SIZE(BO)
  453. LD b8, 47 * SIZE(BO)
  454. MUL c61, b6, c61
  455. NMSUB c71, c71, b7, c61
  456. NMSUB c81, c81, b8, c61
  457. LD b7, 54 * SIZE(BO)
  458. LD b8, 55 * SIZE(BO)
  459. MUL c71, b7, c71
  460. NMSUB c81, c81, b8, c71
  461. LD b8, 63 * SIZE(BO)
  462. MUL c81, b8, c81
  463. #endif
  464. #ifdef RT
  465. LD b1, 63 * SIZE(BO)
  466. LD b2, 62 * SIZE(BO)
  467. LD b3, 61 * SIZE(BO)
  468. LD b4, 60 * SIZE(BO)
  469. LD b5, 59 * SIZE(BO)
  470. LD b6, 58 * SIZE(BO)
  471. LD b7, 57 * SIZE(BO)
  472. LD b8, 56 * SIZE(BO)
  473. MUL c81, b1, c81
  474. NMSUB c71, c71, b2, c81
  475. NMSUB c61, c61, b3, c81
  476. NMSUB c51, c51, b4, c81
  477. NMSUB c41, c41, b5, c81
  478. NMSUB c31, c31, b6, c81
  479. NMSUB c21, c21, b7, c81
  480. NMSUB c11, c11, b8, c81
  481. LD b2, 54 * SIZE(BO)
  482. LD b3, 53 * SIZE(BO)
  483. LD b4, 52 * SIZE(BO)
  484. LD b5, 51 * SIZE(BO)
  485. LD b6, 50 * SIZE(BO)
  486. LD b7, 49 * SIZE(BO)
  487. LD b8, 48 * SIZE(BO)
  488. MUL c71, b2, c71
  489. NMSUB c61, c61, b3, c71
  490. NMSUB c51, c51, b4, c71
  491. NMSUB c41, c41, b5, c71
  492. NMSUB c31, c31, b6, c71
  493. NMSUB c21, c21, b7, c71
  494. NMSUB c11, c11, b8, c71
  495. LD b3, 45 * SIZE(BO)
  496. LD b4, 44 * SIZE(BO)
  497. LD b5, 43 * SIZE(BO)
  498. LD b6, 42 * SIZE(BO)
  499. LD b7, 41 * SIZE(BO)
  500. LD b8, 40 * SIZE(BO)
  501. MUL c61, b3, c61
  502. NMSUB c51, c51, b4, c61
  503. NMSUB c41, c41, b5, c61
  504. NMSUB c31, c31, b6, c61
  505. NMSUB c21, c21, b7, c61
  506. NMSUB c11, c11, b8, c61
  507. LD b4, 36 * SIZE(BO)
  508. LD b5, 35 * SIZE(BO)
  509. LD b6, 34 * SIZE(BO)
  510. LD b7, 33 * SIZE(BO)
  511. LD b8, 32 * SIZE(BO)
  512. MUL c51, b4, c51
  513. NMSUB c41, c41, b5, c51
  514. NMSUB c31, c31, b6, c51
  515. NMSUB c21, c21, b7, c51
  516. NMSUB c11, c11, b8, c51
  517. LD b5, 27 * SIZE(BO)
  518. LD b6, 26 * SIZE(BO)
  519. LD b7, 25 * SIZE(BO)
  520. LD b8, 24 * SIZE(BO)
  521. MUL c41, b5, c41
  522. NMSUB c31, c31, b6, c41
  523. NMSUB c21, c21, b7, c41
  524. NMSUB c11, c11, b8, c41
  525. LD b6, 18 * SIZE(BO)
  526. LD b7, 17 * SIZE(BO)
  527. LD b8, 16 * SIZE(BO)
  528. MUL c31, b6, c31
  529. NMSUB c21, c21, b7, c31
  530. NMSUB c11, c11, b8, c31
  531. LD b7, 9 * SIZE(BO)
  532. LD b8, 8 * SIZE(BO)
  533. MUL c21, b7, c21
  534. NMSUB c11, c11, b8, c21
  535. LD b8, 0 * SIZE(BO)
  536. MUL c11, b8, c11
  537. #endif
  538. #ifdef LN
  539. daddiu CO1, CO1, -1 * SIZE
  540. daddiu CO2, CO2, -1 * SIZE
  541. daddiu CO3, CO3, -1 * SIZE
  542. daddiu CO4, CO4, -1 * SIZE
  543. daddiu CO5, CO5, -1 * SIZE
  544. daddiu CO6, CO6, -1 * SIZE
  545. daddiu CO7, CO7, -1 * SIZE
  546. daddiu CO8, CO8, -1 * SIZE
  547. #endif
  548. #if defined(LN) || defined(LT)
  549. ST c11, 0 * SIZE(BO)
  550. ST c21, 1 * SIZE(BO)
  551. ST c31, 2 * SIZE(BO)
  552. ST c41, 3 * SIZE(BO)
  553. ST c51, 4 * SIZE(BO)
  554. ST c61, 5 * SIZE(BO)
  555. ST c71, 6 * SIZE(BO)
  556. ST c81, 7 * SIZE(BO)
  557. #else
  558. ST c11, 0 * SIZE(AO)
  559. ST c21, 1 * SIZE(AO)
  560. ST c31, 2 * SIZE(AO)
  561. ST c41, 3 * SIZE(AO)
  562. ST c51, 4 * SIZE(AO)
  563. ST c61, 5 * SIZE(AO)
  564. ST c71, 6 * SIZE(AO)
  565. ST c81, 7 * SIZE(AO)
  566. #endif
  567. ST c11, 0 * SIZE(CO1)
  568. ST c21, 0 * SIZE(CO2)
  569. ST c31, 0 * SIZE(CO3)
  570. ST c41, 0 * SIZE(CO4)
  571. ST c51, 0 * SIZE(CO5)
  572. ST c61, 0 * SIZE(CO6)
  573. ST c71, 0 * SIZE(CO7)
  574. ST c81, 0 * SIZE(CO8)
  575. MTC $0, c11
  576. #ifndef LN
  577. daddiu CO1, CO1, 1 * SIZE
  578. daddiu CO2, CO2, 1 * SIZE
  579. daddiu CO3, CO3, 1 * SIZE
  580. daddiu CO4, CO4, 1 * SIZE
  581. daddiu CO5, CO5, 1 * SIZE
  582. daddiu CO6, CO6, 1 * SIZE
  583. daddiu CO7, CO7, 1 * SIZE
  584. daddiu CO8, CO8, 1 * SIZE
  585. #endif
  586. MOV c21, c11
  587. #ifdef RT
  588. dsll TEMP, K, BASE_SHIFT
  589. daddu AORIG, AORIG, TEMP
  590. #endif
  591. MOV c31, c11
  592. #if defined(LT) || defined(RN)
  593. dsubu TEMP, K, KK
  594. dsll L, TEMP, 0 + BASE_SHIFT
  595. dsll TEMP, TEMP, 3 + BASE_SHIFT
  596. daddu AO, AO, L
  597. daddu BO, BO, TEMP
  598. #endif
  599. MOV c41, c11
  600. #ifdef LT
  601. daddiu KK, KK, 1
  602. #endif
  603. #ifdef LN
  604. daddiu KK, KK, -1
  605. #endif
  606. .align 3
  607. .L20:
  608. dsra I, M, 1
  609. MOV c51, c11
  610. blez I, .L29
  611. MOV c61, c11
  612. .L11:
  613. #if defined(LT) || defined(RN)
  614. LD a1, 0 * SIZE(AO)
  615. MOV c71, c11
  616. LD b1, 0 * SIZE(B)
  617. MOV c81, c11
  618. LD a3, 4 * SIZE(AO)
  619. MOV c12, c11
  620. LD b2, 1 * SIZE(B)
  621. MOV c22, c11
  622. dsra L, KK, 2
  623. MOV c32, c11
  624. LD b3, 2 * SIZE(B)
  625. MOV c42, c11
  626. LD b4, 3 * SIZE(B)
  627. MOV c52, c11
  628. LD b5, 4 * SIZE(B)
  629. MOV c62, c11
  630. LD b6, 8 * SIZE(B)
  631. MOV c72, c11
  632. LD b7, 12 * SIZE(B)
  633. MOV c82, c11
  634. blez L, .L15
  635. move BO, B
  636. #else
  637. #ifdef LN
  638. dsll TEMP, K, 1 + BASE_SHIFT
  639. dsubu AORIG, AORIG, TEMP
  640. #endif
  641. dsll L, KK, 1 + BASE_SHIFT
  642. dsll TEMP, KK, 3 + BASE_SHIFT
  643. daddu AO, AORIG, L
  644. daddu BO, B, TEMP
  645. dsubu TEMP, K, KK
  646. LD a1, 0 * SIZE(AO)
  647. MOV c71, c11
  648. LD b1, 0 * SIZE(BO)
  649. MOV c81, c11
  650. LD a3, 4 * SIZE(AO)
  651. MOV c12, c11
  652. LD b2, 1 * SIZE(BO)
  653. MOV c22, c11
  654. MOV c32, c11
  655. LD b3, 2 * SIZE(BO)
  656. MOV c42, c11
  657. LD b4, 3 * SIZE(BO)
  658. MOV c52, c11
  659. LD b5, 4 * SIZE(BO)
  660. MOV c62, c11
  661. LD b6, 8 * SIZE(BO)
  662. MOV c72, c11
  663. LD b7, 12 * SIZE(BO)
  664. MOV c82, c11
  665. dsra L, TEMP, 2
  666. blez L, .L15
  667. NOP
  668. #endif
  669. MADD c11, c11, a1, b1
  670. LD a2, 1 * SIZE(AO)
  671. MADD c21, c21, a1, b2
  672. daddiu L, L, -1
  673. MADD c31, c31, a1, b3
  674. blez L, .L13
  675. MADD c41, c41, a1, b4
  676. NOP
  /*
   * .L12: main k loop of the 2x8 tile, four k-steps per pass.
   * Each pass advances AO by 8 * SIZE (two A values per k-step) and BO by
   * 32 * SIZE (eight B values per k-step) and accumulates into c11..c82.
   * In the accumulator names the leading digit selects the B value of the
   * k-step and the trailing digit selects the first (1) or second (2) A value.
   * Loads are interleaved with the MADDs ahead of their use; BO offsets of
   * 32 and above fetch operands of the next pass before BO is advanced.
   * The c11..c41 products with a1 for the current first k-step were already
   * issued by the code that jumps or falls into this label.
   */
  677. .align 3
  678. .L12:
  /* k-step 0: finish with the second A value (a2), then B values 4..7. */
  679. MADD c12, c12, a2, b1
  680. LD b1, 16 * SIZE(BO)
  681. MADD c22, c22, a2, b2
  682. LD b2, 5 * SIZE(BO)
  683. MADD c32, c32, a2, b3
  684. LD b3, 6 * SIZE(BO)
  685. MADD c42, c42, a2, b4
  686. LD b4, 7 * SIZE(BO)
  687. MADD c51, c51, a1, b5
  688. NOP
  689. MADD c61, c61, a1, b2
  690. LD a4, 2 * SIZE(AO)
  691. MADD c71, c71, a1, b3
  692. NOP
  693. MADD c81, c81, a1, b4
  694. LD a1, 8 * SIZE(AO)
  695. MADD c52, c52, a2, b5
  696. LD b5, 20 * SIZE(BO)
  697. MADD c62, c62, a2, b2
  698. LD b2, 9 * SIZE(BO)
  699. MADD c72, c72, a2, b3
  700. LD b3, 10 * SIZE(BO)
  701. MADD c82, c82, a2, b4
  702. LD b4, 11 * SIZE(BO)
  /* k-step 1: A values a4 = AO[2], a2 = AO[3]; B values BO[8..15]. */
  703. MADD c11, c11, a4, b6
  704. LD a2, 3 * SIZE(AO)
  705. MADD c21, c21, a4, b2
  706. NOP
  707. MADD c31, c31, a4, b3
  708. NOP
  709. MADD c41, c41, a4, b4
  710. NOP
  711. MADD c12, c12, a2, b6
  712. LD b6, 24 * SIZE(BO)
  713. MADD c22, c22, a2, b2
  714. LD b2, 13 * SIZE(BO)
  715. MADD c32, c32, a2, b3
  716. LD b3, 14 * SIZE(BO)
  717. MADD c42, c42, a2, b4
  718. LD b4, 15 * SIZE(BO)
  719. MADD c51, c51, a4, b7
  720. NOP
  721. MADD c61, c61, a4, b2
  722. NOP
  723. MADD c71, c71, a4, b3
  724. NOP
  725. MADD c81, c81, a4, b4
  726. NOP
  727. MADD c52, c52, a2, b7
  728. LD b7, 28 * SIZE(BO)
  729. MADD c62, c62, a2, b2
  730. LD b2, 17 * SIZE(BO)
  731. MADD c72, c72, a2, b3
  732. LD b3, 18 * SIZE(BO)
  733. MADD c82, c82, a2, b4
  734. LD b4, 19 * SIZE(BO)
  /* k-step 2: A values a3 = AO[4] (preloaded), a2 = AO[5]; B values
     BO[16..23]. */
  735. MADD c11, c11, a3, b1
  736. LD a2, 5 * SIZE(AO)
  737. MADD c21, c21, a3, b2
  738. NOP
  739. MADD c31, c31, a3, b3
  740. NOP
  741. MADD c41, c41, a3, b4
  742. NOP
  743. MADD c12, c12, a2, b1
  744. LD b1, 32 * SIZE(BO)
  745. MADD c22, c22, a2, b2
  746. LD b2, 21 * SIZE(BO)
  747. MADD c32, c32, a2, b3
  748. LD b3, 22 * SIZE(BO)
  749. MADD c42, c42, a2, b4
  750. LD b4, 23 * SIZE(BO)
  751. MADD c51, c51, a3, b5
  752. NOP
  753. MADD c61, c61, a3, b2
  754. LD a4, 6 * SIZE(AO)
  755. MADD c71, c71, a3, b3
  756. NOP
  757. MADD c81, c81, a3, b4
  758. LD a3, 12 * SIZE(AO)
  759. MADD c52, c52, a2, b5
  760. LD b5, 36 * SIZE(BO)
  761. MADD c62, c62, a2, b2
  762. LD b2, 25 * SIZE(BO)
  763. MADD c72, c72, a2, b3
  764. LD b3, 26 * SIZE(BO)
  765. MADD c82, c82, a2, b4
  766. LD b4, 27 * SIZE(BO)
  /* k-step 3: A values a4 = AO[6], a2 = AO[7]; B values BO[24..31].
     The loop counter and both panel pointers are updated inside this step. */
  767. MADD c11, c11, a4, b6
  768. LD a2, 7 * SIZE(AO)
  769. MADD c21, c21, a4, b2
  770. NOP
  771. MADD c31, c31, a4, b3
  772. NOP
  773. MADD c41, c41, a4, b4
  774. daddiu L, L, -1
  775. MADD c12, c12, a2, b6
  776. LD b6, 40 * SIZE(BO)
  777. MADD c22, c22, a2, b2
  778. LD b2, 29 * SIZE(BO)
  779. MADD c32, c32, a2, b3
  780. LD b3, 30 * SIZE(BO)
  781. MADD c42, c42, a2, b4
  782. LD b4, 31 * SIZE(BO)
  783. MADD c51, c51, a4, b7
  784. daddiu BO, BO, 32 * SIZE
  785. MADD c61, c61, a4, b2
  786. daddiu AO, AO, 8 * SIZE
  787. MADD c71, c71, a4, b3
  788. NOP
  789. MADD c81, c81, a4, b4
  790. NOP
  /* From here on BO/AO already point at the next pass, so the offsets are
     relative to the new base. */
  791. MADD c52, c52, a2, b7
  792. LD b7, 12 * SIZE(BO)
  793. MADD c62, c62, a2, b2
  794. LD b2, 1 * SIZE(BO)
  795. MADD c72, c72, a2, b3
  796. LD b3, 2 * SIZE(BO)
  797. MADD c82, c82, a2, b4
  798. LD b4, 3 * SIZE(BO)
  /* Issue the a1 products of the next pass' first k-step, then loop while
     L > 0. The MADD after the branch is part of that group on both paths,
     so it relies on branch-delay-slot execution. */
  799. MADD c11, c11, a1, b1
  800. LD a2, 1 * SIZE(AO)
  801. MADD c21, c21, a1, b2
  802. NOP
  803. MADD c31, c31, a1, b3
  804. bgtz L, .L12
  805. MADD c41, c41, a1, b4
  806. NOP
  /*
   * .L13: final 4x-unrolled pass of the 2x8 tile.
   * Same instruction sequence as the looping copy at .L12, but without the
   * L decrement, without the a1 products for a following pass, and without
   * a branch: execution falls through to .L15. On exit AO/BO have been
   * advanced by 8 * SIZE / 32 * SIZE and a1, b1..b7 hold the operands the
   * remainder loop at .L16 expects (offsets 0 for a1 and 0,1,2,3,4,8,12 for
   * b1..b7 relative to the new AO/BO).
   */
  807. .align 3
  808. .L13:
  /* k-step 0: second A value (a2), then B values 4..7. */
  809. MADD c12, c12, a2, b1
  810. LD b1, 16 * SIZE(BO)
  811. MADD c22, c22, a2, b2
  812. LD b2, 5 * SIZE(BO)
  813. MADD c32, c32, a2, b3
  814. LD b3, 6 * SIZE(BO)
  815. MADD c42, c42, a2, b4
  816. LD b4, 7 * SIZE(BO)
  817. MADD c51, c51, a1, b5
  818. NOP
  819. MADD c61, c61, a1, b2
  820. LD a4, 2 * SIZE(AO)
  821. MADD c71, c71, a1, b3
  822. NOP
  823. MADD c81, c81, a1, b4
  824. LD a1, 8 * SIZE(AO)
  825. MADD c52, c52, a2, b5
  826. LD b5, 20 * SIZE(BO)
  827. MADD c62, c62, a2, b2
  828. LD b2, 9 * SIZE(BO)
  829. MADD c72, c72, a2, b3
  830. LD b3, 10 * SIZE(BO)
  831. MADD c82, c82, a2, b4
  832. LD b4, 11 * SIZE(BO)
  /* k-step 1: A values a4 = AO[2], a2 = AO[3]; B values BO[8..15]. */
  833. MADD c11, c11, a4, b6
  834. LD a2, 3 * SIZE(AO)
  835. MADD c21, c21, a4, b2
  836. NOP
  837. MADD c31, c31, a4, b3
  838. NOP
  839. MADD c41, c41, a4, b4
  840. NOP
  841. MADD c12, c12, a2, b6
  842. LD b6, 24 * SIZE(BO)
  843. MADD c22, c22, a2, b2
  844. LD b2, 13 * SIZE(BO)
  845. MADD c32, c32, a2, b3
  846. LD b3, 14 * SIZE(BO)
  847. MADD c42, c42, a2, b4
  848. LD b4, 15 * SIZE(BO)
  849. MADD c51, c51, a4, b7
  850. NOP
  851. MADD c61, c61, a4, b2
  852. NOP
  853. MADD c71, c71, a4, b3
  854. NOP
  855. MADD c81, c81, a4, b4
  856. NOP
  857. MADD c52, c52, a2, b7
  858. LD b7, 28 * SIZE(BO)
  859. MADD c62, c62, a2, b2
  860. LD b2, 17 * SIZE(BO)
  861. MADD c72, c72, a2, b3
  862. LD b3, 18 * SIZE(BO)
  863. MADD c82, c82, a2, b4
  864. LD b4, 19 * SIZE(BO)
  /* k-step 2: A values a3 = AO[4] (preloaded), a2 = AO[5]; B values
     BO[16..23]. */
  865. MADD c11, c11, a3, b1
  866. LD a2, 5 * SIZE(AO)
  867. MADD c21, c21, a3, b2
  868. NOP
  869. MADD c31, c31, a3, b3
  870. NOP
  871. MADD c41, c41, a3, b4
  872. NOP
  873. MADD c12, c12, a2, b1
  874. LD b1, 32 * SIZE(BO)
  875. MADD c22, c22, a2, b2
  876. LD b2, 21 * SIZE(BO)
  877. MADD c32, c32, a2, b3
  878. LD b3, 22 * SIZE(BO)
  879. MADD c42, c42, a2, b4
  880. LD b4, 23 * SIZE(BO)
  881. MADD c51, c51, a3, b5
  882. NOP
  883. MADD c61, c61, a3, b2
  884. LD a4, 6 * SIZE(AO)
  885. MADD c71, c71, a3, b3
  886. NOP
  887. MADD c81, c81, a3, b4
  888. LD a3, 12 * SIZE(AO)
  889. MADD c52, c52, a2, b5
  890. LD b5, 36 * SIZE(BO)
  891. MADD c62, c62, a2, b2
  892. LD b2, 25 * SIZE(BO)
  893. MADD c72, c72, a2, b3
  894. LD b3, 26 * SIZE(BO)
  895. MADD c82, c82, a2, b4
  896. LD b4, 27 * SIZE(BO)
  /* k-step 3: A values a4 = AO[6], a2 = AO[7]; B values BO[24..31]. */
  897. MADD c11, c11, a4, b6
  898. LD a2, 7 * SIZE(AO)
  899. MADD c21, c21, a4, b2
  900. NOP
  901. MADD c31, c31, a4, b3
  902. NOP
  903. MADD c41, c41, a4, b4
  904. NOP
  905. MADD c12, c12, a2, b6
  906. LD b6, 40 * SIZE(BO)
  907. MADD c22, c22, a2, b2
  908. LD b2, 29 * SIZE(BO)
  909. MADD c32, c32, a2, b3
  910. LD b3, 30 * SIZE(BO)
  911. MADD c42, c42, a2, b4
  912. LD b4, 31 * SIZE(BO)
  913. MADD c51, c51, a4, b7
  914. daddiu BO, BO, 32 * SIZE
  915. MADD c61, c61, a4, b2
  916. daddiu AO, AO, 8 * SIZE
  917. MADD c71, c71, a4, b3
  918. NOP
  919. MADD c81, c81, a4, b4
  920. NOP
  /* BO/AO now point past the consumed data; reload b7 and b2..b4 relative
     to the new BO for the remainder loop. */
  921. MADD c52, c52, a2, b7
  922. LD b7, 12 * SIZE(BO)
  923. MADD c62, c62, a2, b2
  924. LD b2, 1 * SIZE(BO)
  925. MADD c72, c72, a2, b3
  926. LD b3, 2 * SIZE(BO)
  927. MADD c82, c82, a2, b4
  928. LD b4, 3 * SIZE(BO)
  /*
   * .L15/.L16: remainder of the k loop for the 2x8 tile.
   * L = (k count) & 3, where the k count is KK for LT/RN and TEMP (= K - KK,
   * set during tile setup) otherwise. Each pass of .L16 handles one k-step:
   * two A values times eight B values, advancing AO by 2 * SIZE and BO by
   * 8 * SIZE. On entry a1 and b1..b5 must already hold AO[0] and BO[0..4].
   */
  929. .align 3
  930. .L15:
  931. #if defined(LT) || defined(RN)
  932. andi L, KK, 3
  933. #else
  934. andi L, TEMP, 3
  935. #endif
  936. blez L, .L18
  937. NOP
  938. .align 3
  939. .L16:
  /* First A value (a1) times B values 0..3. */
  940. MADD c11, c11, a1, b1
  941. LD a2, 1 * SIZE(AO)
  942. MADD c21, c21, a1, b2
  943. NOP
  944. MADD c31, c31, a1, b3
  945. NOP
  946. MADD c41, c41, a1, b4
  947. NOP
  /* Second A value (a2) times B values 0..3; b1 is reloaded for the next
     k-step and b2..b4 are reloaded with B values 5..7 of this one. */
  948. MADD c12, c12, a2, b1
  949. LD b1, 8 * SIZE(BO)
  950. MADD c22, c22, a2, b2
  951. LD b2, 5 * SIZE(BO)
  952. MADD c32, c32, a2, b3
  953. LD b3, 6 * SIZE(BO)
  954. MADD c42, c42, a2, b4
  955. LD b4, 7 * SIZE(BO)
  /* B values 4..7 with a1; counter and pointers are bumped in the gaps. */
  956. MADD c51, c51, a1, b5
  957. daddiu L, L, -1
  958. MADD c61, c61, a1, b2
  959. daddiu AO, AO, 2 * SIZE
  960. MADD c71, c71, a1, b3
  961. daddiu BO, BO, 8 * SIZE
  962. MADD c81, c81, a1, b4
  /* AO/BO already advanced: these loads fetch the next k-step's operands. */
  963. LD a1, 0 * SIZE(AO)
  964. MADD c52, c52, a2, b5
  965. LD b5, 4 * SIZE(BO)
  966. MADD c62, c62, a2, b2
  967. LD b2, 1 * SIZE(BO)
  968. MADD c72, c72, a2, b3
  969. LD b3, 2 * SIZE(BO)
  970. MADD c82, c82, a2, b4
  971. bgtz L, .L16
  /* Completes the b1..b4 reload for the next pass; sits directly after the
     branch, i.e. in its delay slot. */
  972. LD b4, 3 * SIZE(BO)
  /*
   * .L18: the k accumulation for the 2x8 tile is complete. This part
   * repositions AO/BO (LN/RT only) and replaces every accumulator by
   * (stored right-hand-side value) - (accumulated product).
   * NOTE(review): this reading assumes SUB d, s, t computes d = s - t, as
   * the MIPS sub.fmt instruction does - confirm the macro definition.
   */
  973. .L18:
  974. #if defined(LN) || defined(RT)
  /* LN/RT: rewind to the start of the current diagonal block: k index
     KK - 2 (LN, two rows) or KK - 8 (RT, eight columns) in both panels. */
  975. #ifdef LN
  976. daddiu TEMP, KK, -2
  977. #else
  978. daddiu TEMP, KK, -8
  979. #endif
  980. dsll L, TEMP, 1 + BASE_SHIFT
  981. dsll TEMP, TEMP, 3 + BASE_SHIFT
  982. daddu AO, AORIG, L
  983. daddu BO, B, TEMP
  984. #endif
  985. #if defined(LN) || defined(LT)
  /* LN/LT: the 16 right-hand-side values are read from BO in the order
     c11, c21, ..., c81, c12, c22, ..., c82. */
  986. LD b1, 0 * SIZE(BO)
  987. LD b2, 1 * SIZE(BO)
  988. LD b3, 2 * SIZE(BO)
  989. LD b4, 3 * SIZE(BO)
  990. SUB c11, b1, c11
  991. LD b5, 4 * SIZE(BO)
  992. SUB c21, b2, c21
  993. LD b6, 5 * SIZE(BO)
  994. SUB c31, b3, c31
  995. LD b7, 6 * SIZE(BO)
  996. SUB c41, b4, c41
  997. LD b8, 7 * SIZE(BO)
  998. SUB c51, b5, c51
  999. LD b1, 8 * SIZE(BO)
  1000. SUB c61, b6, c61
  1001. LD b2, 9 * SIZE(BO)
  1002. SUB c71, b7, c71
  1003. LD b3, 10 * SIZE(BO)
  1004. SUB c81, b8, c81
  1005. LD b4, 11 * SIZE(BO)
  1006. SUB c12, b1, c12
  1007. LD b5, 12 * SIZE(BO)
  1008. SUB c22, b2, c22
  1009. LD b6, 13 * SIZE(BO)
  1010. SUB c32, b3, c32
  1011. LD b7, 14 * SIZE(BO)
  1012. SUB c42, b4, c42
  1013. LD b8, 15 * SIZE(BO)
  1014. SUB c52, b5, c52
  /* Preload into b1 the first triangular factor used by the solve that
     follows: AO[3] for LN, AO[0] for LT. */
  1015. #ifdef LN
  1016. LD b1, 3 * SIZE(AO)
  1017. #else
  1018. LD b1, 0 * SIZE(AO)
  1019. #endif
  1020. SUB c62, b6, c62
  1021. SUB c72, b7, c72
  1022. SUB c82, b8, c82
  1023. #else
  /* RN/RT: the 16 right-hand-side values are read from AO in the order
     c11, c12, c21, c22, ..., c81, c82. */
  1024. LD b1, 0 * SIZE(AO)
  1025. LD b2, 1 * SIZE(AO)
  1026. LD b3, 2 * SIZE(AO)
  1027. LD b4, 3 * SIZE(AO)
  1028. SUB c11, b1, c11
  1029. LD b5, 4 * SIZE(AO)
  1030. SUB c12, b2, c12
  1031. LD b6, 5 * SIZE(AO)
  1032. SUB c21, b3, c21
  1033. LD b7, 6 * SIZE(AO)
  1034. SUB c22, b4, c22
  1035. LD b8, 7 * SIZE(AO)
  1036. SUB c31, b5, c31
  1037. LD b1, 8 * SIZE(AO)
  1038. SUB c32, b6, c32
  1039. LD b2, 9 * SIZE(AO)
  1040. SUB c41, b7, c41
  1041. LD b3, 10 * SIZE(AO)
  1042. SUB c42, b8, c42
  1043. LD b4, 11 * SIZE(AO)
  1044. LD b5, 12 * SIZE(AO)
  1045. SUB c51, b1, c51
  1046. LD b6, 13 * SIZE(AO)
  1047. SUB c52, b2, c52
  1048. LD b7, 14 * SIZE(AO)
  1049. SUB c61, b3, c61
  1050. LD b8, 15 * SIZE(AO)
  1051. SUB c62, b4, c62
  1052. SUB c71, b5, c71
  1053. SUB c72, b6, c72
  1054. SUB c81, b7, c81
  1055. SUB c82, b8, c82
  1056. #endif
  /*
   * 2x2 triangular solve against A for the 2x8 tile (LN and LT variants).
   * b1 is expected to hold AO[3] (LN) or AO[0] (LT) on entry.
   * NOTE(review): the diagonal factors are multiplied in rather than divided
   * by, so the packed A block presumably stores reciprocal diagonals; and
   * NMSUB d, r, s, t is read as d = r - s * t (MIPS nmsub.fmt) - confirm both.
   */
  1057. #ifdef LN
  /* LN: solve the second row first (scale by AO[3]), eliminate it from the
     first row with AO[2], then scale the first row by AO[0]. */
  1058. MUL c12, b1, c12
  1059. LD b2, 2 * SIZE(AO)
  1060. MUL c22, b1, c22
  1061. MUL c32, b1, c32
  1062. MUL c42, b1, c42
  1063. MUL c52, b1, c52
  1064. MUL c62, b1, c62
  1065. MUL c72, b1, c72
  1066. MUL c82, b1, c82
  1067. NMSUB c11, c11, b2, c12
  1068. LD b3, 0 * SIZE(AO)
  1069. NMSUB c21, c21, b2, c22
  1070. NMSUB c31, c31, b2, c32
  1071. NMSUB c41, c41, b2, c42
  1072. NMSUB c51, c51, b2, c52
  1073. NMSUB c61, c61, b2, c62
  1074. NMSUB c71, c71, b2, c72
  1075. NMSUB c81, c81, b2, c82
  /* LN walks the rows backwards: the eight C column pointers are moved back
     by two elements before the tile is stored. */
  1076. MUL c11, b3, c11
  1077. daddiu CO1, CO1, -2 * SIZE
  1078. MUL c21, b3, c21
  1079. daddiu CO2, CO2, -2 * SIZE
  1080. MUL c31, b3, c31
  1081. daddiu CO3, CO3, -2 * SIZE
  1082. MUL c41, b3, c41
  1083. daddiu CO4, CO4, -2 * SIZE
  1084. MUL c51, b3, c51
  1085. daddiu CO5, CO5, -2 * SIZE
  1086. MUL c61, b3, c61
  1087. daddiu CO6, CO6, -2 * SIZE
  1088. MUL c71, b3, c71
  1089. daddiu CO7, CO7, -2 * SIZE
  1090. MUL c81, b3, c81
  1091. daddiu CO8, CO8, -2 * SIZE
  1092. #endif
  1093. #ifdef LT
  /* LT: solve the first row first (scale by AO[0]), eliminate it from the
     second row with AO[1], then scale the second row by AO[3]. */
  1094. MUL c11, b1, c11
  1095. LD b2, 1 * SIZE(AO)
  1096. MUL c21, b1, c21
  1097. MUL c31, b1, c31
  1098. MUL c41, b1, c41
  1099. MUL c51, b1, c51
  1100. MUL c61, b1, c61
  1101. MUL c71, b1, c71
  1102. MUL c81, b1, c81
  1103. NMSUB c12, c12, b2, c11
  1104. LD b3, 3 * SIZE(AO)
  1105. NMSUB c22, c22, b2, c21
  1106. NMSUB c32, c32, b2, c31
  1107. NMSUB c42, c42, b2, c41
  1108. NMSUB c52, c52, b2, c51
  1109. NMSUB c62, c62, b2, c61
  1110. NMSUB c72, c72, b2, c71
  1111. NMSUB c82, c82, b2, c81
  1112. MUL c12, b3, c12
  1113. MUL c22, b3, c22
  1114. MUL c32, b3, c32
  1115. MUL c42, b3, c42
  1116. MUL c52, b3, c52
  1117. MUL c62, b3, c62
  1118. MUL c72, b3, c72
  1119. MUL c82, b3, c82
  1120. #endif
  /*
   * RN: forward substitution across the eight columns of the 2x8 tile.
   * The 8x8 triangular block is read from BO at offset 8 * i + j; for step i
   * only entries j >= i are loaded (0..7, 9..15, 18..23, 27..31, 36..39,
   * 45..47, 54..55, 63). Each step scales column i of both rows by the
   * diagonal entry, then subtracts its multiples from the later columns.
   * NOTE(review): diagonals are multiplied in, so they are presumably stored
   * as reciprocals; NMSUB d, r, s, t is read as d = r - s * t - confirm.
   */
  1121. #ifdef RN
  /* Column 1: scale, then eliminate from columns 2..8. */
  1122. LD b1, 0 * SIZE(BO)
  1123. LD b2, 1 * SIZE(BO)
  1124. LD b3, 2 * SIZE(BO)
  1125. LD b4, 3 * SIZE(BO)
  1126. MUL c11, b1, c11
  1127. MUL c12, b1, c12
  1128. LD b5, 4 * SIZE(BO)
  1129. NMSUB c21, c21, b2, c11
  1130. NMSUB c22, c22, b2, c12
  1131. LD b6, 5 * SIZE(BO)
  1132. NMSUB c31, c31, b3, c11
  1133. NMSUB c32, c32, b3, c12
  1134. LD b7, 6 * SIZE(BO)
  1135. NMSUB c41, c41, b4, c11
  1136. NMSUB c42, c42, b4, c12
  1137. LD b8, 7 * SIZE(BO)
  1138. NMSUB c51, c51, b5, c11
  1139. NMSUB c52, c52, b5, c12
  1140. LD b2, 9 * SIZE(BO)
  1141. NMSUB c61, c61, b6, c11
  1142. NMSUB c62, c62, b6, c12
  1143. LD b3, 10 * SIZE(BO)
  1144. NMSUB c71, c71, b7, c11
  1145. NMSUB c72, c72, b7, c12
  1146. LD b4, 11 * SIZE(BO)
  1147. NMSUB c81, c81, b8, c11
  1148. NMSUB c82, c82, b8, c12
  1149. LD b5, 12 * SIZE(BO)
  /* Column 2: scale, then eliminate from columns 3..8. */
  1150. MUL c21, b2, c21
  1151. MUL c22, b2, c22
  1152. LD b6, 13 * SIZE(BO)
  1153. NMSUB c31, c31, b3, c21
  1154. NMSUB c32, c32, b3, c22
  1155. LD b7, 14 * SIZE(BO)
  1156. NMSUB c41, c41, b4, c21
  1157. NMSUB c42, c42, b4, c22
  1158. LD b8, 15 * SIZE(BO)
  1159. NMSUB c51, c51, b5, c21
  1160. NMSUB c52, c52, b5, c22
  1161. LD b3, 18 * SIZE(BO)
  1162. NMSUB c61, c61, b6, c21
  1163. NMSUB c62, c62, b6, c22
  1164. LD b4, 19 * SIZE(BO)
  1165. NMSUB c71, c71, b7, c21
  1166. NMSUB c72, c72, b7, c22
  1167. LD b5, 20 * SIZE(BO)
  1168. NMSUB c81, c81, b8, c21
  1169. NMSUB c82, c82, b8, c22
  1170. LD b6, 21 * SIZE(BO)
  /* Column 3: scale, then eliminate from columns 4..8. */
  1171. MUL c31, b3, c31
  1172. MUL c32, b3, c32
  1173. LD b7, 22 * SIZE(BO)
  1174. NMSUB c41, c41, b4, c31
  1175. NMSUB c42, c42, b4, c32
  1176. LD b8, 23 * SIZE(BO)
  1177. NMSUB c51, c51, b5, c31
  1178. NMSUB c52, c52, b5, c32
  1179. LD b4, 27 * SIZE(BO)
  1180. NMSUB c61, c61, b6, c31
  1181. NMSUB c62, c62, b6, c32
  1182. LD b5, 28 * SIZE(BO)
  1183. NMSUB c71, c71, b7, c31
  1184. NMSUB c72, c72, b7, c32
  1185. LD b6, 29 * SIZE(BO)
  1186. NMSUB c81, c81, b8, c31
  1187. NMSUB c82, c82, b8, c32
  1188. LD b7, 30 * SIZE(BO)
  /* Column 4: scale, then eliminate from columns 5..8. */
  1189. MUL c41, b4, c41
  1190. MUL c42, b4, c42
  1191. LD b8, 31 * SIZE(BO)
  1192. NMSUB c51, c51, b5, c41
  1193. NMSUB c52, c52, b5, c42
  1194. LD b5, 36 * SIZE(BO)
  1195. NMSUB c61, c61, b6, c41
  1196. NMSUB c62, c62, b6, c42
  1197. LD b6, 37 * SIZE(BO)
  1198. NMSUB c71, c71, b7, c41
  1199. NMSUB c72, c72, b7, c42
  1200. LD b7, 38 * SIZE(BO)
  1201. NMSUB c81, c81, b8, c41
  1202. NMSUB c82, c82, b8, c42
  1203. LD b8, 39 * SIZE(BO)
  /* Column 5: scale, then eliminate from columns 6..8. */
  1204. MUL c51, b5, c51
  1205. MUL c52, b5, c52
  1206. NMSUB c61, c61, b6, c51
  1207. NMSUB c62, c62, b6, c52
  1208. LD b6, 45 * SIZE(BO)
  1209. NMSUB c71, c71, b7, c51
  1210. NMSUB c72, c72, b7, c52
  1211. LD b7, 46 * SIZE(BO)
  1212. NMSUB c81, c81, b8, c51
  1213. NMSUB c82, c82, b8, c52
  1214. LD b8, 47 * SIZE(BO)
  /* Column 6: scale, then eliminate from columns 7..8. */
  1215. MUL c61, b6, c61
  1216. MUL c62, b6, c62
  1217. NMSUB c71, c71, b7, c61
  1218. NMSUB c72, c72, b7, c62
  1219. LD b7, 54 * SIZE(BO)
  1220. NMSUB c81, c81, b8, c61
  1221. NMSUB c82, c82, b8, c62
  1222. LD b8, 55 * SIZE(BO)
  /* Column 7: scale, then eliminate from column 8. */
  1223. MUL c71, b7, c71
  1224. MUL c72, b7, c72
  1225. NMSUB c81, c81, b8, c71
  1226. NMSUB c82, c82, b8, c72
  1227. LD b8, 63 * SIZE(BO)
  /* Column 8: scale only. */
  1228. MUL c81, b8, c81
  1229. MUL c82, b8, c82
  1230. #endif
  /*
   * RT: backward substitution across the eight columns of the 2x8 tile.
   * The 8x8 triangular block is read from BO at offset 8 * i + j, starting
   * at the last diagonal entry (63) and walking each row towards column 0;
   * for step i only entries j <= i are loaded (63..56, 54..48, 45..40,
   * 36..32, 27..24, 18..16, 9..8, 0). Each step scales column i of both
   * rows by the diagonal entry, then subtracts its multiples from the
   * earlier columns.
   * NOTE(review): diagonals are multiplied in, so they are presumably stored
   * as reciprocals; NMSUB d, r, s, t is read as d = r - s * t - confirm.
   */
  1231. #ifdef RT
  /* Column 8: scale, then eliminate from columns 7..1. */
  1232. LD b1, 63 * SIZE(BO)
  1233. LD b2, 62 * SIZE(BO)
  1234. LD b3, 61 * SIZE(BO)
  1235. LD b4, 60 * SIZE(BO)
  1236. MUL c81, b1, c81
  1237. MUL c82, b1, c82
  1238. LD b5, 59 * SIZE(BO)
  1239. NMSUB c71, c71, b2, c81
  1240. NMSUB c72, c72, b2, c82
  1241. LD b6, 58 * SIZE(BO)
  1242. NMSUB c61, c61, b3, c81
  1243. NMSUB c62, c62, b3, c82
  1244. LD b7, 57 * SIZE(BO)
  1245. NMSUB c51, c51, b4, c81
  1246. NMSUB c52, c52, b4, c82
  1247. LD b8, 56 * SIZE(BO)
  1248. NMSUB c41, c41, b5, c81
  1249. NMSUB c42, c42, b5, c82
  1250. LD b2, 54 * SIZE(BO)
  1251. NMSUB c31, c31, b6, c81
  1252. NMSUB c32, c32, b6, c82
  1253. LD b3, 53 * SIZE(BO)
  1254. NMSUB c21, c21, b7, c81
  1255. NMSUB c22, c22, b7, c82
  1256. LD b4, 52 * SIZE(BO)
  1257. NMSUB c11, c11, b8, c81
  1258. NMSUB c12, c12, b8, c82
  1259. LD b5, 51 * SIZE(BO)
  /* Column 7: scale, then eliminate from columns 6..1. */
  1260. MUL c71, b2, c71
  1261. MUL c72, b2, c72
  1262. LD b6, 50 * SIZE(BO)
  1263. NMSUB c61, c61, b3, c71
  1264. NMSUB c62, c62, b3, c72
  1265. LD b7, 49 * SIZE(BO)
  1266. NMSUB c51, c51, b4, c71
  1267. NMSUB c52, c52, b4, c72
  1268. LD b8, 48 * SIZE(BO)
  1269. NMSUB c41, c41, b5, c71
  1270. NMSUB c42, c42, b5, c72
  1271. LD b3, 45 * SIZE(BO)
  1272. NMSUB c31, c31, b6, c71
  1273. NMSUB c32, c32, b6, c72
  1274. LD b4, 44 * SIZE(BO)
  1275. NMSUB c21, c21, b7, c71
  1276. NMSUB c22, c22, b7, c72
  1277. LD b5, 43 * SIZE(BO)
  1278. NMSUB c11, c11, b8, c71
  1279. NMSUB c12, c12, b8, c72
  1280. LD b6, 42 * SIZE(BO)
  /* Column 6: scale, then eliminate from columns 5..1. */
  1281. MUL c61, b3, c61
  1282. MUL c62, b3, c62
  1283. LD b7, 41 * SIZE(BO)
  1284. NMSUB c51, c51, b4, c61
  1285. NMSUB c52, c52, b4, c62
  1286. LD b8, 40 * SIZE(BO)
  1287. NMSUB c41, c41, b5, c61
  1288. NMSUB c42, c42, b5, c62
  1289. LD b4, 36 * SIZE(BO)
  1290. NMSUB c31, c31, b6, c61
  1291. NMSUB c32, c32, b6, c62
  1292. LD b5, 35 * SIZE(BO)
  1293. NMSUB c21, c21, b7, c61
  1294. NMSUB c22, c22, b7, c62
  1295. LD b6, 34 * SIZE(BO)
  1296. NMSUB c11, c11, b8, c61
  1297. NMSUB c12, c12, b8, c62
  1298. LD b7, 33 * SIZE(BO)
  /* Column 5: scale, then eliminate from columns 4..1. */
  1299. MUL c51, b4, c51
  1300. MUL c52, b4, c52
  1301. LD b8, 32 * SIZE(BO)
  1302. NMSUB c41, c41, b5, c51
  1303. NMSUB c42, c42, b5, c52
  1304. LD b5, 27 * SIZE(BO)
  1305. NMSUB c31, c31, b6, c51
  1306. NMSUB c32, c32, b6, c52
  1307. LD b6, 26 * SIZE(BO)
  1308. NMSUB c21, c21, b7, c51
  1309. NMSUB c22, c22, b7, c52
  1310. LD b7, 25 * SIZE(BO)
  1311. NMSUB c11, c11, b8, c51
  1312. NMSUB c12, c12, b8, c52
  1313. LD b8, 24 * SIZE(BO)
  /* Column 4: scale, then eliminate from columns 3..1. */
  1314. MUL c41, b5, c41
  1315. MUL c42, b5, c42
  1316. NMSUB c31, c31, b6, c41
  1317. NMSUB c32, c32, b6, c42
  1318. LD b6, 18 * SIZE(BO)
  1319. NMSUB c21, c21, b7, c41
  1320. NMSUB c22, c22, b7, c42
  1321. LD b7, 17 * SIZE(BO)
  1322. NMSUB c11, c11, b8, c41
  1323. NMSUB c12, c12, b8, c42
  1324. LD b8, 16 * SIZE(BO)
  /* Column 3: scale, then eliminate from columns 2..1. */
  1325. MUL c31, b6, c31
  1326. MUL c32, b6, c32
  1327. NMSUB c21, c21, b7, c31
  1328. NMSUB c22, c22, b7, c32
  1329. LD b7, 9 * SIZE(BO)
  1330. NMSUB c11, c11, b8, c31
  1331. NMSUB c12, c12, b8, c32
  1332. LD b8, 8 * SIZE(BO)
  /* Column 2: scale, then eliminate from column 1. */
  1333. MUL c21, b7, c21
  1334. MUL c22, b7, c22
  1335. NMSUB c11, c11, b8, c21
  1336. NMSUB c12, c12, b8, c22
  1337. LD b8, 0 * SIZE(BO)
  /* Column 1: scale only. */
  1338. MUL c11, b8, c11
  1339. MUL c12, b8, c12
  1340. #endif
  /*
   * Write-back for the solved 2x8 tile, then advance to the next tile.
   * The 16 results are stored twice: into the packed panel they were read
   * from (BO for LN/LT, AO for RN/RT, same element order as the load) and
   * into C through the eight column pointers CO1..CO8.
   */
  1341. #if defined(LN) || defined(LT)
  1342. ST c11, 0 * SIZE(BO)
  1343. ST c21, 1 * SIZE(BO)
  1344. ST c31, 2 * SIZE(BO)
  1345. ST c41, 3 * SIZE(BO)
  1346. ST c51, 4 * SIZE(BO)
  1347. ST c61, 5 * SIZE(BO)
  1348. ST c71, 6 * SIZE(BO)
  1349. ST c81, 7 * SIZE(BO)
  1350. ST c12, 8 * SIZE(BO)
  1351. ST c22, 9 * SIZE(BO)
  1352. ST c32, 10 * SIZE(BO)
  1353. ST c42, 11 * SIZE(BO)
  1354. ST c52, 12 * SIZE(BO)
  1355. ST c62, 13 * SIZE(BO)
  1356. ST c72, 14 * SIZE(BO)
  1357. ST c82, 15 * SIZE(BO)
  1358. #else
  1359. ST c11, 0 * SIZE(AO)
  1360. ST c12, 1 * SIZE(AO)
  1361. ST c21, 2 * SIZE(AO)
  1362. ST c22, 3 * SIZE(AO)
  1363. ST c31, 4 * SIZE(AO)
  1364. ST c32, 5 * SIZE(AO)
  1365. ST c41, 6 * SIZE(AO)
  1366. ST c42, 7 * SIZE(AO)
  1367. ST c51, 8 * SIZE(AO)
  1368. ST c52, 9 * SIZE(AO)
  1369. ST c61, 10 * SIZE(AO)
  1370. ST c62, 11 * SIZE(AO)
  1371. ST c71, 12 * SIZE(AO)
  1372. ST c72, 13 * SIZE(AO)
  1373. ST c81, 14 * SIZE(AO)
  1374. ST c82, 15 * SIZE(AO)
  1375. #endif
  /* C tile: cN1 / cN2 go to elements 0 / 1 of column pointer CON. */
  1376. ST c11, 0 * SIZE(CO1)
  1377. ST c12, 1 * SIZE(CO1)
  1378. ST c21, 0 * SIZE(CO2)
  1379. ST c22, 1 * SIZE(CO2)
  1380. ST c31, 0 * SIZE(CO3)
  1381. ST c32, 1 * SIZE(CO3)
  1382. ST c41, 0 * SIZE(CO4)
  1383. ST c42, 1 * SIZE(CO4)
  1384. ST c51, 0 * SIZE(CO5)
  1385. ST c52, 1 * SIZE(CO5)
  1386. ST c61, 0 * SIZE(CO6)
  1387. ST c62, 1 * SIZE(CO6)
  1388. ST c71, 0 * SIZE(CO7)
  1389. ST c72, 1 * SIZE(CO7)
  1390. ST c81, 0 * SIZE(CO8)
  1391. ST c82, 1 * SIZE(CO8)
  /* a1 = 0 (from integer register $0); used below to re-zero c11..c61 for
     the next tile. The remaining accumulators are cleared at .L11. */
  1392. MTC $0, a1
  1393. #ifndef LN
  /* All variants except LN step the C pointers forward by two elements
     (LN moved them backwards before the store instead). */
  1394. daddiu CO1, CO1, 2 * SIZE
  1395. daddiu CO2, CO2, 2 * SIZE
  1396. daddiu CO3, CO3, 2 * SIZE
  1397. daddiu CO4, CO4, 2 * SIZE
  1398. daddiu CO5, CO5, 2 * SIZE
  1399. daddiu CO6, CO6, 2 * SIZE
  1400. daddiu CO7, CO7, 2 * SIZE
  1401. daddiu CO8, CO8, 2 * SIZE
  1402. #endif
  1403. MOV c11, a1
  1404. MOV c21, a1
  1405. #ifdef RT
  /* RT: advance AORIG by K << (1 + BASE_SHIFT) bytes. */
  1406. dsll TEMP, K, 1 + BASE_SHIFT
  1407. daddu AORIG, AORIG, TEMP
  1408. #endif
  1409. MOV c31, a1
  1410. MOV c41, a1
  1411. #if defined(LT) || defined(RN)
  /* LT/RN: skip the K - KK k-steps of both panels that were not consumed. */
  1412. dsubu TEMP, K, KK
  1413. dsll L, TEMP, 1 + BASE_SHIFT
  1414. dsll TEMP, TEMP, 3 + BASE_SHIFT
  1415. daddu AO, AO, L
  1416. daddu BO, BO, TEMP
  1417. #endif
  /* KK follows the row position: +2 for LT, -2 for LN. */
  1418. #ifdef LT
  1419. daddiu KK, KK, 2
  1420. #endif
  1421. #ifdef LN
  1422. daddiu KK, KK, -2
  1423. #endif
  /* Next 2-row tile while I > 0; the MOV after the branch finishes the
     accumulator reset and sits in the branch delay slot. */
  1424. daddiu I, I, -1
  1425. MOV c51, a1
  1426. bgtz I, .L11
  1427. MOV c61, a1
  /*
   * .L29: end of one 8-column panel. Moves B to the next panel (LN adds
   * K << (3 + BASE_SHIFT) bytes; LT/RN take over BO, which the tile code
   * has already advanced), adjusts KK by 8 columns for RN/RT, and loops to
   * .L10 while J > 0. J is not modified in this excerpt, so its update
   * presumably happens at .L10 - confirm.
   */
  1428. .align 3
  1429. .L29:
  1430. #ifdef LN
  1431. dsll TEMP, K, 3 + BASE_SHIFT
  1432. daddu B, B, TEMP
  1433. #endif
  1434. #if defined(LT) || defined(RN)
  1435. move B, BO
  1436. #endif
  1437. #ifdef RN
  1438. daddiu KK, KK, 8
  1439. #endif
  1440. #ifdef RT
  1441. daddiu KK, KK, -8
  1442. #endif
  1443. bgtz J, .L10
  1444. NOP
  /*
   * .L30: 4-column panel, taken when bit 2 of N is set (otherwise skip to
   * .L50). Sets up the four C column pointers and KK/AORIG/AO for the
   * variant, then handles the odd row (M & 1) as a 1x4 tile before the
   * 2-row tiles at .L40.
   */
  1445. .align 3
  1446. .L30:
  1447. andi J, N, 4
  1448. blez J, .L50
  /* Placed directly after the branch (delay slot): AO = A on both paths. */
  1449. move AO, A
  1450. #ifdef RT
  /* RT walks the panels backwards: step B back by K << (2 + BASE_SHIFT)
     bytes and C back by LDC << 2.
     NOTE(review): presumably four packed columns of B and four columns of
     C, with LDC already scaled to bytes - confirm. */
  1451. dsll TEMP, K, 2 + BASE_SHIFT
  1452. dsubu B, B, TEMP
  1453. dsll TEMP, LDC, 2
  1454. dsubu C, C, TEMP
  1455. #endif
  /* CO1..CO4 = C, C + LDC, C + 2 * LDC, C + 3 * LDC; c11, c21, c31 are
     zeroed along the way. */
  1456. move CO1, C
  1457. MTC $0, c11
  1458. daddu CO2, C, LDC
  1459. daddu CO3, CO2, LDC
  1460. MOV c21, c11
  1461. daddu CO4, CO3, LDC
  1462. MOV c31, c11
  1463. #ifdef LN
  1464. daddu KK, M, OFFSET
  1465. #endif
  1466. #ifdef LT
  1467. move KK, OFFSET
  1468. #endif
  1469. #if defined(LN) || defined(RT)
  1470. move AORIG, A
  1471. #else
  1472. move AO, A
  1473. #endif
  1474. #ifndef RT
  /* All variants except RT advance C past the four columns now. */
  1475. daddu C, CO4, LDC
  1476. #endif
  /* Odd row: I = M & 1; nothing to do here when it is zero. */
  1477. andi I, M, 1
  1478. blez I, .L40
  1479. MOV c41, c11
  1480. #if defined(LT) || defined(RN)
  /* 1x4 tile, LT/RN: preload two A values and B values 0..4, 8, 12 from the
     start of the B panel; L = KK >> 2 unrolled passes. */
  1481. LD a1, 0 * SIZE(AO)
  1482. MOV c71, c11
  1483. LD a2, 1 * SIZE(AO)
  1484. MOV c81, c11
  1485. LD b1, 0 * SIZE(B)
  1486. LD b2, 1 * SIZE(B)
  1487. LD b3, 2 * SIZE(B)
  1488. LD b4, 3 * SIZE(B)
  1489. LD b5, 4 * SIZE(B)
  1490. LD b6, 8 * SIZE(B)
  1491. LD b7, 12 * SIZE(B)
  1492. dsra L, KK, 2
  1493. blez L, .L45
  /* BO is read at .L46 even when the branch is taken, so this move relies
     on delay-slot execution. */
  1494. move BO, B
  1495. #else
  1496. #ifdef LN
  /* LN: step AORIG back by K << BASE_SHIFT bytes (one row of the panel). */
  1497. dsll TEMP, K, BASE_SHIFT
  1498. dsubu AORIG, AORIG, TEMP
  1499. #endif
  /* LN/RT: start at k index KK in both panels; TEMP = K - KK steps remain
     and L = TEMP >> 2 unrolled passes. */
  1500. dsll L, KK, 0 + BASE_SHIFT
  1501. dsll TEMP, KK, 2 + BASE_SHIFT
  1502. daddu AO, AORIG, L
  1503. daddu BO, B, TEMP
  1504. dsubu TEMP, K, KK
  1505. LD a1, 0 * SIZE(AO)
  1506. MOV c71, c11
  1507. LD a2, 1 * SIZE(AO)
  1508. MOV c81, c11
  1509. LD b1, 0 * SIZE(BO)
  1510. LD b2, 1 * SIZE(BO)
  1511. LD b3, 2 * SIZE(BO)
  1512. LD b4, 3 * SIZE(BO)
  1513. LD b5, 4 * SIZE(BO)
  1514. LD b6, 8 * SIZE(BO)
  1515. LD b7, 12 * SIZE(BO)
  1516. dsra L, TEMP, 2
  1517. blez L, .L45
  1518. NOP
  1519. #endif
  /*
   * .L42: k loop of the 1x4 tile, four k-steps per pass.
   * Each pass advances AO by 4 * SIZE (one A value per k-step) and BO by
   * 16 * SIZE (four B values per k-step), accumulating into c11..c41.
   * On entry a1 = AO[0], a2 = AO[1] and b1..b7 hold BO[0..4], BO[8], BO[12].
   */
  1520. .align 3
  1521. .L42:
  /* k-step 0: a1 (AO[0]) times BO[0..3]. */
  1522. MADD c11, c11, a1, b1
  1523. LD b1, 16 * SIZE(BO)
  1524. MADD c21, c21, a1, b2
  1525. LD b2, 5 * SIZE(BO)
  1526. MADD c31, c31, a1, b3
  1527. LD b3, 6 * SIZE(BO)
  1528. MADD c41, c41, a1, b4
  1529. LD b4, 7 * SIZE(BO)
  /* a1 is reloaded for the next pass (AO[4] of this one). */
  1530. LD a1, 4 * SIZE(AO)
  1531. daddiu L, L, -1
  /* k-step 1: a2 (AO[1]) times BO[4..7]. */
  1532. MADD c11, c11, a2, b5
  1533. LD b5, 20 * SIZE(BO)
  1534. MADD c21, c21, a2, b2
  1535. LD b2, 9 * SIZE(BO)
  1536. MADD c31, c31, a2, b3
  1537. LD b3, 10 * SIZE(BO)
  1538. MADD c41, c41, a2, b4
  1539. LD b4, 11 * SIZE(BO)
  1540. LD a2, 2 * SIZE(AO)
  1541. daddiu AO, AO, 4 * SIZE
  /* k-step 2: a2 (old AO[2]) times BO[8..11]. */
  1542. MADD c11, c11, a2, b6
  1543. LD b6, 24 * SIZE(BO)
  1544. MADD c21, c21, a2, b2
  1545. LD b2, 13 * SIZE(BO)
  1546. MADD c31, c31, a2, b3
  1547. LD b3, 14 * SIZE(BO)
  1548. MADD c41, c41, a2, b4
  1549. LD b4, 15 * SIZE(BO)
  /* AO was already advanced by 4, so offset -1 is the old AO[3]. */
  1550. LD a2, -1 * SIZE(AO)
  1551. daddiu BO, BO, 16 * SIZE
  /* k-step 3: a2 (old AO[3]) times old BO[12..15]; the b loads now fetch
     the next pass' operands relative to the advanced BO. */
  1552. MADD c11, c11, a2, b7
  1553. LD b7, 12 * SIZE(BO)
  1554. MADD c21, c21, a2, b2
  1555. LD b2, 1 * SIZE(BO)
  1556. MADD c31, c31, a2, b3
  1557. LD b3, 2 * SIZE(BO)
  1558. MADD c41, c41, a2, b4
  1559. LD b4, 3 * SIZE(BO)
  1560. bgtz L, .L42
  /* Reload a2 for the next pass; placed after the branch (delay slot). */
  1561. LD a2, 1 * SIZE(AO)
  /*
   * .L45/.L46: remainder of the k loop for the 1x4 tile.
   * L = (k count) & 3, where the k count is KK for LT/RN and TEMP (= K - KK)
   * otherwise. Each pass of .L46 handles one k-step: a1 times b1..b4,
   * advancing AO by 1 * SIZE and BO by 4 * SIZE.
   */
  1562. .align 3
  1563. .L45:
  1564. #if defined(LT) || defined(RN)
  1565. andi L, KK, 3
  1566. #else
  1567. andi L, TEMP, 3
  1568. #endif
  1569. NOP
  1570. blez L, .L48
  1571. NOP
  1572. .align 3
  1573. .L46:
  1574. MADD c11, c11, a1, b1
  1575. LD b1, 4 * SIZE(BO)
  1576. MADD c21, c21, a1, b2
  1577. LD b2, 5 * SIZE(BO)
  1578. MADD c31, c31, a1, b3
  1579. LD b3, 6 * SIZE(BO)
  1580. MADD c41, c41, a1, b4
  /* Fetch the next k-step's A value and last B value before the pointers
     are bumped. */
  1581. LD a1, 1 * SIZE(AO)
  1582. LD b4, 7 * SIZE(BO)
  1583. daddiu L, L, -1
  1584. daddiu AO, AO, 1 * SIZE
  /* Self-move: source and destination are the same register, so a2's value
     is unchanged. NOTE(review): presumably scheduling filler - confirm. */
  1585. MOV a2, a2
  1586. bgtz L, .L46
  /* BO advance sits after the branch (delay slot), so it runs every pass. */
  1587. daddiu BO, BO, 4 * SIZE
  /*
   * .L48: solve, store and advance for the 1x4 tile.
   * Steps: (LN/RT) rewind AO/BO to the diagonal block; replace c11..c41 by
   * (stored right-hand side) - (accumulated product); apply the triangular
   * factor(s); write the results back to the packed panel and to C; reset
   * the accumulators and update pointers/KK.
   * NOTE(review): SUB d, s, t is read as d = s - t and NMSUB d, r, s, t as
   * d = r - s * t (MIPS sub.fmt / nmsub.fmt); diagonals are multiplied in,
   * so they are presumably stored as reciprocals - confirm.
   */
  1588. .L48:
  1589. #if defined(LN) || defined(RT)
  /* k index of the diagonal block: KK - 1 (LN, one row) or KK - 4 (RT,
     four columns). */
  1590. #ifdef LN
  1591. daddiu TEMP, KK, -1
  1592. #else
  1593. daddiu TEMP, KK, -4
  1594. #endif
  1595. dsll L, TEMP, 0 + BASE_SHIFT
  1596. dsll TEMP, TEMP, 2 + BASE_SHIFT
  1597. daddu AO, AORIG, L
  1598. daddu BO, B, TEMP
  1599. #endif
  /* Right-hand side comes from BO for LN/LT and from AO for RN/RT. */
  1600. #if defined(LN) || defined(LT)
  1601. LD b1, 0 * SIZE(BO)
  1602. LD b2, 1 * SIZE(BO)
  1603. LD b3, 2 * SIZE(BO)
  1604. LD b4, 3 * SIZE(BO)
  1605. SUB c11, b1, c11
  1606. SUB c21, b2, c21
  1607. SUB c31, b3, c31
  1608. SUB c41, b4, c41
  1609. #else
  1610. LD b1, 0 * SIZE(AO)
  1611. LD b2, 1 * SIZE(AO)
  1612. LD b3, 2 * SIZE(AO)
  1613. LD b4, 3 * SIZE(AO)
  1614. SUB c11, b1, c11
  1615. SUB c21, b2, c21
  1616. SUB c31, b3, c31
  1617. SUB c41, b4, c41
  1618. #endif
  1619. #if defined(LN) || defined(LT)
  /* LN/LT with a single row: one factor AO[0] scales all four values. */
  1620. LD b1, 0 * SIZE(AO)
  1621. MUL c11, b1, c11
  1622. MUL c21, b1, c21
  1623. MUL c31, b1, c31
  1624. MUL c41, b1, c41
  1625. #endif
  1626. #ifdef RN
  /* RN: forward substitution over the 4x4 block at BO (offset 4 * i + j,
     entries j >= i: 0..3, 5..7, 10..11, 15). */
  1627. LD b1, 0 * SIZE(BO)
  1628. LD b2, 1 * SIZE(BO)
  1629. LD b3, 2 * SIZE(BO)
  1630. LD b4, 3 * SIZE(BO)
  1631. MUL c11, b1, c11
  1632. NMSUB c21, c21, b2, c11
  1633. NMSUB c31, c31, b3, c11
  1634. NMSUB c41, c41, b4, c11
  1635. LD b2, 5 * SIZE(BO)
  1636. LD b3, 6 * SIZE(BO)
  1637. LD b4, 7 * SIZE(BO)
  1638. MUL c21, b2, c21
  1639. NMSUB c31, c31, b3, c21
  1640. NMSUB c41, c41, b4, c21
  1641. LD b3, 10 * SIZE(BO)
  1642. LD b4, 11 * SIZE(BO)
  1643. MUL c31, b3, c31
  1644. NMSUB c41, c41, b4, c31
  1645. LD b4, 15 * SIZE(BO)
  1646. MUL c41, b4, c41
  1647. #endif
  1648. #ifdef RT
  /* RT: backward substitution over the same 4x4 layout, starting from the
     last diagonal entry (15..12, 10..8, 5..4, 0). */
  1649. LD b5, 15 * SIZE(BO)
  1650. LD b6, 14 * SIZE(BO)
  1651. LD b7, 13 * SIZE(BO)
  1652. LD b8, 12 * SIZE(BO)
  1653. MUL c41, b5, c41
  1654. NMSUB c31, c31, b6, c41
  1655. NMSUB c21, c21, b7, c41
  1656. NMSUB c11, c11, b8, c41
  1657. LD b6, 10 * SIZE(BO)
  1658. LD b7, 9 * SIZE(BO)
  1659. LD b8, 8 * SIZE(BO)
  1660. MUL c31, b6, c31
  1661. NMSUB c21, c21, b7, c31
  1662. NMSUB c11, c11, b8, c31
  1663. LD b7, 5 * SIZE(BO)
  1664. LD b8, 4 * SIZE(BO)
  1665. MUL c21, b7, c21
  1666. NMSUB c11, c11, b8, c21
  1667. LD b8, 0 * SIZE(BO)
  1668. MUL c11, b8, c11
  1669. #endif
  1670. #ifdef LN
  /* LN walks rows backwards: move the C pointers back one element before
     storing. */
  1671. daddiu CO1, CO1, -1 * SIZE
  1672. daddiu CO2, CO2, -1 * SIZE
  1673. daddiu CO3, CO3, -1 * SIZE
  1674. daddiu CO4, CO4, -1 * SIZE
  1675. #endif
  /* Write the results back to the packed panel they were read from ... */
  1676. #if defined(LN) || defined(LT)
  1677. ST c11, 0 * SIZE(BO)
  1678. ST c21, 1 * SIZE(BO)
  1679. ST c31, 2 * SIZE(BO)
  1680. ST c41, 3 * SIZE(BO)
  1681. #else
  1682. ST c11, 0 * SIZE(AO)
  1683. ST c21, 1 * SIZE(AO)
  1684. ST c31, 2 * SIZE(AO)
  1685. ST c41, 3 * SIZE(AO)
  1686. #endif
  /* ... and to C, one element per column pointer. */
  1687. ST c11, 0 * SIZE(CO1)
  1688. ST c21, 0 * SIZE(CO2)
  1689. ST c31, 0 * SIZE(CO3)
  1690. ST c41, 0 * SIZE(CO4)
  /* Re-zero c11 (c21, c31 follow below) for the next tile. */
  1691. MTC $0, c11
  1692. #ifndef LN
  1693. daddiu CO1, CO1, 1 * SIZE
  1694. daddiu CO2, CO2, 1 * SIZE
  1695. daddiu CO3, CO3, 1 * SIZE
  1696. daddiu CO4, CO4, 1 * SIZE
  1697. #endif
  1698. MOV c21, c11
  1699. #ifdef RT
  /* RT: advance AORIG by K << BASE_SHIFT bytes. */
  1700. dsll TEMP, K, BASE_SHIFT
  1701. daddu AORIG, AORIG, TEMP
  1702. #endif
  1703. #if defined(LT) || defined(RN)
  /* LT/RN: skip the K - KK k-steps of both panels that were not consumed. */
  1704. dsubu TEMP, K, KK
  1705. dsll L, TEMP, 0 + BASE_SHIFT
  1706. dsll TEMP, TEMP, 2 + BASE_SHIFT
  1707. daddu AO, AO, L
  1708. daddu BO, BO, TEMP
  1709. #endif
  1710. MOV c31, c11
  /* KK follows the row position: +1 for LT, -1 for LN. */
  1711. #ifdef LT
  1712. daddiu KK, KK, 1
  1713. #endif
  1714. #ifdef LN
  1715. daddiu KK, KK, -1
  1716. #endif
  /*
   * .L40: 2-row tiles of the 4-column panel. I = M >> 1 tiles; with none,
   * skip to .L49. c61 and c41 are cleared from c11 on the way in.
   * .L31: per-tile setup (2x4): clear c12..c42 from c11, preload the first
   * A/B operands and set L to the number of 4x-unrolled k passes.
   * Whether other code branches back to .L31 is not visible in this excerpt.
   */
  1717. .align 3
  1718. .L40:
  1719. dsra I, M, 1
  1720. MOV c61, c11
  1721. blez I, .L49
  1722. MOV c41, c11
  1723. .L31:
  1724. #if defined(LT) || defined(RN)
  /* LT/RN: accumulate over the first KK k-steps, B read from the start of
     the panel; L = KK >> 2. */
  1725. LD a1, 0 * SIZE(AO)
  1726. LD a3, 4 * SIZE(AO)
  1727. LD b1, 0 * SIZE(B)
  1728. MOV c12, c11
  1729. LD b2, 1 * SIZE(B)
  1730. MOV c22, c11
  1731. LD b3, 2 * SIZE(B)
  1732. MOV c32, c11
  1733. LD b4, 3 * SIZE(B)
  1734. MOV c42, c11
  1735. LD b5, 4 * SIZE(B)
  1736. dsra L, KK, 2
  1737. LD b6, 8 * SIZE(B)
  1738. LD b7, 12 * SIZE(B)
  1739. blez L, .L35
  /* BO is read at .L36 even when the branch is taken, so this move relies
     on delay-slot execution. */
  1740. move BO, B
  1741. #else
  1742. #ifdef LN
  /* LN: step AORIG back by K << (1 + BASE_SHIFT) bytes. */
  1743. dsll TEMP, K, 1 + BASE_SHIFT
  1744. dsubu AORIG, AORIG, TEMP
  1745. #endif
  /* LN/RT: start at k index KK in both panels; TEMP = K - KK steps remain
     and L = TEMP >> 2. */
  1746. dsll L, KK, 1 + BASE_SHIFT
  1747. dsll TEMP, KK, 2 + BASE_SHIFT
  1748. daddu AO, AORIG, L
  1749. daddu BO, B, TEMP
  1750. dsubu TEMP, K, KK
  1751. LD a1, 0 * SIZE(AO)
  1752. LD a3, 4 * SIZE(AO)
  1753. LD b1, 0 * SIZE(BO)
  1754. MOV c12, c11
  1755. LD b2, 1 * SIZE(BO)
  1756. MOV c22, c11
  1757. LD b3, 2 * SIZE(BO)
  1758. MOV c32, c11
  1759. LD b4, 3 * SIZE(BO)
  1760. MOV c42, c11
  1761. LD b5, 4 * SIZE(BO)
  1762. dsra L, TEMP, 2
  1763. LD b6, 8 * SIZE(BO)
  1764. LD b7, 12 * SIZE(BO)
  1765. blez L, .L35
  1766. NOP
  1767. #endif
  /*
   * .L32: k loop of the 2x4 tile, four k-steps per pass.
   * Each pass advances AO by 8 * SIZE (two A values per k-step) and BO by
   * 16 * SIZE (four B values per k-step), accumulating into c11..c41 (first
   * A value) and c12..c42 (second A value).
   * On entry a1 = AO[0], a3 = AO[4] and b1..b7 hold BO[0..4], BO[8], BO[12].
   */
  1768. .align 3
  1769. .L32:
  /* k-step 0: a1 = AO[0], a2 = AO[1]; B values BO[0..3]. */
  1770. MADD c11, c11, a1, b1
  1771. LD a2, 1 * SIZE(AO)
  1772. MADD c21, c21, a1, b2
  1773. daddiu L, L, -1
  1774. MADD c31, c31, a1, b3
  1775. NOP
  1776. MADD c41, c41, a1, b4
  1777. LD a1, 2 * SIZE(AO)
  1778. MADD c12, c12, a2, b1
  1779. LD b1, 16 * SIZE(BO)
  1780. MADD c22, c22, a2, b2
  1781. LD b2, 5 * SIZE(BO)
  1782. MADD c32, c32, a2, b3
  1783. LD b3, 6 * SIZE(BO)
  1784. MADD c42, c42, a2, b4
  1785. LD b4, 7 * SIZE(BO)
  /* k-step 1: a1 = AO[2], a2 = AO[3]; B values BO[4..7]. */
  1786. MADD c11, c11, a1, b5
  1787. LD a2, 3 * SIZE(AO)
  1788. MADD c21, c21, a1, b2
  1789. NOP
  1790. MADD c31, c31, a1, b3
  1791. NOP
  1792. MADD c41, c41, a1, b4
  1793. LD a1, 8 * SIZE(AO)
  1794. MADD c12, c12, a2, b5
  1795. LD b5, 20 * SIZE(BO)
  1796. MADD c22, c22, a2, b2
  1797. LD b2, 9 * SIZE(BO)
  1798. MADD c32, c32, a2, b3
  1799. LD b3, 10 * SIZE(BO)
  1800. MADD c42, c42, a2, b4
  1801. LD b4, 11 * SIZE(BO)
  /* k-step 2: a3 = AO[4] (preloaded), a2 = AO[5]; B values BO[8..11]. */
  1802. MADD c11, c11, a3, b6
  1803. LD a2, 5 * SIZE(AO)
  1804. MADD c21, c21, a3, b2
  1805. NOP
  1806. MADD c31, c31, a3, b3
  1807. NOP
  1808. MADD c41, c41, a3, b4
  1809. LD a3, 6 * SIZE(AO)
  1810. MADD c12, c12, a2, b6
  1811. LD b6, 24 * SIZE(BO)
  1812. MADD c22, c22, a2, b2
  1813. LD b2, 13 * SIZE(BO)
  1814. MADD c32, c32, a2, b3
  1815. LD b3, 14 * SIZE(BO)
  1816. MADD c42, c42, a2, b4
  1817. LD b4, 15 * SIZE(BO)
  /* k-step 3: a3 = AO[6], a2 = AO[7]; B values BO[12..15]. AO and BO are
     advanced inside this step, so later offsets are relative to the new
     base and fetch the next pass' operands. */
  1818. MADD c11, c11, a3, b7
  1819. LD a2, 7 * SIZE(AO)
  1820. MADD c21, c21, a3, b2
  1821. daddiu AO, AO, 8 * SIZE
  1822. MADD c31, c31, a3, b3
  1823. daddiu BO, BO, 16 * SIZE
  1824. MADD c41, c41, a3, b4
  1825. LD a3, 4 * SIZE(AO)
  1826. MADD c12, c12, a2, b7
  1827. LD b7, 12 * SIZE(BO)
  1828. MADD c22, c22, a2, b2
  1829. LD b2, 1 * SIZE(BO)
  1830. MADD c32, c32, a2, b3
  1831. LD b3, 2 * SIZE(BO)
  1832. MADD c42, c42, a2, b4
  1833. NOP
  1834. bgtz L, .L32
  /* Completes the b1..b4 reload; placed after the branch (delay slot). */
  1835. LD b4, 3 * SIZE(BO)
  /*
   * .L35/.L36: remainder of the k loop for the 2x4 tile.
   * L = (k count) & 3, where the k count is KK for LT/RN and TEMP (= K - KK)
   * otherwise. Each pass of .L36 handles one k-step: two A values times
   * four B values, advancing AO by 2 * SIZE and BO by 4 * SIZE.
   */
  1836. .align 3
  1837. .L35:
  1838. #if defined(LT) || defined(RN)
  1839. andi L, KK, 3
  1840. #else
  1841. andi L, TEMP, 3
  1842. #endif
  1843. NOP
  1844. blez L, .L38
  1845. NOP
  1846. .align 3
  1847. .L36:
  1848. MADD c11, c11, a1, b1
  1849. LD a2, 1 * SIZE(AO)
  1850. MADD c21, c21, a1, b2
  1851. daddiu L, L, -1
  1852. MADD c31, c31, a1, b3
  1853. daddiu AO, AO, 2 * SIZE
  1854. MADD c41, c41, a1, b4
  /* AO already advanced: this is the next k-step's first A value. */
  1855. LD a1, 0 * SIZE(AO)
  1856. MADD c12, c12, a2, b1
  1857. LD b1, 4 * SIZE(BO)
  1858. MADD c22, c22, a2, b2
  1859. LD b2, 5 * SIZE(BO)
  1860. MADD c32, c32, a2, b3
  1861. LD b3, 6 * SIZE(BO)
  1862. MADD c42, c42, a2, b4
  1863. LD b4, 7 * SIZE(BO)
  1864. bgtz L, .L36
  /* BO advance sits after the branch (delay slot), so it runs every pass. */
  1865. daddiu BO, BO, 4 * SIZE
  /*
   * .L38: finish one tile of the four-column panel (eight accumulators
   * c11..c42).  Steps, each selected by LN / LT / RN / RT:
   *   1. LN/RT: recompute AO and BO from AORIG, B and KK.
   *   2. Load eight values from the packed buffer (BO for LN/LT, AO for
   *      RN/RT) and subtract the accumulators from them.
   *   3. Apply the small triangular solve using coefficients from AO
   *      (LN/LT, 2x2) or BO (RN/RT, 4x4).
   *   4. Store the results back to the packed buffer and to CO1..CO4.
   *   5. Advance pointers and KK, then loop to .L31 while I > 0.
   * SUB d, a, b is presumably d = a - b, NMSUB d, c, a, b presumably
   * d = c - a * b, MUL d, a, b presumably d = a * b (macros defined
   * outside this excerpt).
   */
  1866. .L38:
  1867. #if defined(LN) || defined(RT)
  /* TEMP = KK - 2 (LN) or KK - 4 (RT); AO = AORIG + TEMP * 2 elements,
     BO = B + TEMP * 4 elements (BASE_SHIFT presumably scales to bytes). */
  1868. #ifdef LN
  1869. daddiu TEMP, KK, -2
  1870. #else
  1871. daddiu TEMP, KK, -4
  1872. #endif
  1873. dsll L, TEMP, 1 + BASE_SHIFT
  1874. dsll TEMP, TEMP, 2 + BASE_SHIFT
  1875. daddu AO, AORIG, L
  1876. daddu BO, B, TEMP
  1877. #endif
  1878. #if defined(LN) || defined(LT)
  /* Buffer order at BO: c11, c21, c31, c41, then c12, c22, c32, c42. */
  1879. LD b1, 0 * SIZE(BO)
  1880. LD b2, 1 * SIZE(BO)
  1881. LD b3, 2 * SIZE(BO)
  1882. LD b4, 3 * SIZE(BO)
  1883. LD b5, 4 * SIZE(BO)
  1884. LD b6, 5 * SIZE(BO)
  1885. LD b7, 6 * SIZE(BO)
  1886. LD b8, 7 * SIZE(BO)
  1887. SUB c11, b1, c11
  1888. SUB c21, b2, c21
  1889. SUB c31, b3, c31
  1890. SUB c41, b4, c41
  1891. SUB c12, b5, c12
  1892. SUB c22, b6, c22
  1893. SUB c32, b7, c32
  1894. SUB c42, b8, c42
  1895. #else
  /* Buffer order at AO: c11, c12, c21, c22, c31, c32, c41, c42. */
  1896. LD b1, 0 * SIZE(AO)
  1897. LD b2, 1 * SIZE(AO)
  1898. LD b3, 2 * SIZE(AO)
  1899. LD b4, 3 * SIZE(AO)
  1900. LD b5, 4 * SIZE(AO)
  1901. LD b6, 5 * SIZE(AO)
  1902. LD b7, 6 * SIZE(AO)
  1903. LD b8, 7 * SIZE(AO)
  1904. SUB c11, b1, c11
  1905. SUB c12, b2, c12
  1906. SUB c21, b3, c21
  1907. SUB c22, b4, c22
  1908. SUB c31, b5, c31
  1909. SUB c32, b6, c32
  1910. SUB c41, b7, c41
  1911. SUB c42, b8, c42
  1912. #endif
  1913. #ifdef LN
  /* LN: second row first (AO[3]), eliminate with AO[2], then first row (AO[0]).
     NOTE(review): diagonal entries are multiplied, not divided -- presumably
     the packed triangle stores reciprocals; confirm against the pack routine. */
  1914. LD b1, 3 * SIZE(AO)
  1915. LD b2, 2 * SIZE(AO)
  1916. LD b3, 0 * SIZE(AO)
  1917. MUL c12, b1, c12
  1918. MUL c22, b1, c22
  1919. MUL c32, b1, c32
  1920. MUL c42, b1, c42
  1921. NMSUB c11, c11, b2, c12
  1922. NMSUB c21, c21, b2, c22
  1923. NMSUB c31, c31, b2, c32
  1924. NMSUB c41, c41, b2, c42
  1925. MUL c11, b3, c11
  1926. MUL c21, b3, c21
  1927. MUL c31, b3, c31
  1928. MUL c41, b3, c41
  1929. #endif
  1930. #ifdef LT
  /* LT: first row first (AO[0]), eliminate with AO[1], then second row (AO[3]). */
  1931. LD b1, 0 * SIZE(AO)
  1932. LD b2, 1 * SIZE(AO)
  1933. LD b3, 3 * SIZE(AO)
  1934. MUL c11, b1, c11
  1935. MUL c21, b1, c21
  1936. MUL c31, b1, c31
  1937. MUL c41, b1, c41
  1938. NMSUB c12, c12, b2, c11
  1939. NMSUB c22, c22, b2, c21
  1940. NMSUB c32, c32, b2, c31
  1941. NMSUB c42, c42, b2, c41
  1942. MUL c12, b3, c12
  1943. MUL c22, b3, c22
  1944. MUL c32, b3, c32
  1945. MUL c42, b3, c42
  1946. #endif
  1947. #ifdef RN
  /* RN: forward sweep over the four columns; coefficients are read from
     BO[0..3], BO[5..7], BO[10..11] and BO[15]. */
  1948. LD b1, 0 * SIZE(BO)
  1949. LD b2, 1 * SIZE(BO)
  1950. LD b3, 2 * SIZE(BO)
  1951. LD b4, 3 * SIZE(BO)
  1952. MUL c11, b1, c11
  1953. MUL c12, b1, c12
  1954. NMSUB c21, c21, b2, c11
  1955. NMSUB c22, c22, b2, c12
  1956. NMSUB c31, c31, b3, c11
  1957. NMSUB c32, c32, b3, c12
  1958. NMSUB c41, c41, b4, c11
  1959. NMSUB c42, c42, b4, c12
  1960. LD b2, 5 * SIZE(BO)
  1961. LD b3, 6 * SIZE(BO)
  1962. LD b4, 7 * SIZE(BO)
  1963. MUL c21, b2, c21
  1964. MUL c22, b2, c22
  1965. NMSUB c31, c31, b3, c21
  1966. NMSUB c32, c32, b3, c22
  1967. NMSUB c41, c41, b4, c21
  1968. NMSUB c42, c42, b4, c22
  1969. LD b3, 10 * SIZE(BO)
  1970. LD b4, 11 * SIZE(BO)
  1971. MUL c31, b3, c31
  1972. MUL c32, b3, c32
  1973. NMSUB c41, c41, b4, c31
  1974. NMSUB c42, c42, b4, c32
  1975. LD b4, 15 * SIZE(BO)
  1976. MUL c41, b4, c41
  1977. MUL c42, b4, c42
  1978. #endif
  1979. #ifdef RT
  /* RT: backward sweep, starting from the fourth column (BO[15]) and
     ending with the first (BO[0]). */
  1980. LD b5, 15 * SIZE(BO)
  1981. LD b6, 14 * SIZE(BO)
  1982. LD b7, 13 * SIZE(BO)
  1983. LD b8, 12 * SIZE(BO)
  1984. MUL c41, b5, c41
  1985. MUL c42, b5, c42
  1986. NMSUB c31, c31, b6, c41
  1987. NMSUB c32, c32, b6, c42
  1988. NMSUB c21, c21, b7, c41
  1989. NMSUB c22, c22, b7, c42
  1990. NMSUB c11, c11, b8, c41
  1991. NMSUB c12, c12, b8, c42
  1992. LD b6, 10 * SIZE(BO)
  1993. LD b7, 9 * SIZE(BO)
  1994. LD b8, 8 * SIZE(BO)
  1995. MUL c31, b6, c31
  1996. MUL c32, b6, c32
  1997. NMSUB c21, c21, b7, c31
  1998. NMSUB c22, c22, b7, c32
  1999. NMSUB c11, c11, b8, c31
  2000. NMSUB c12, c12, b8, c32
  2001. LD b7, 5 * SIZE(BO)
  2002. LD b8, 4 * SIZE(BO)
  2003. MUL c21, b7, c21
  2004. MUL c22, b7, c22
  2005. NMSUB c11, c11, b8, c21
  2006. NMSUB c12, c12, b8, c22
  2007. LD b8, 0 * SIZE(BO)
  2008. MUL c11, b8, c11
  2009. MUL c12, b8, c12
  2010. #endif
  2011. #ifdef LN
  /* LN: step the output pointers back by two elements before storing. */
  2012. daddiu CO1, CO1, -2 * SIZE
  2013. daddiu CO2, CO2, -2 * SIZE
  2014. daddiu CO3, CO3, -2 * SIZE
  2015. daddiu CO4, CO4, -2 * SIZE
  2016. #endif
  /* Write the solved values back to the packed buffer they were read from. */
  2017. #if defined(LN) || defined(LT)
  2018. ST c11, 0 * SIZE(BO)
  2019. ST c21, 1 * SIZE(BO)
  2020. ST c31, 2 * SIZE(BO)
  2021. ST c41, 3 * SIZE(BO)
  2022. ST c12, 4 * SIZE(BO)
  2023. ST c22, 5 * SIZE(BO)
  2024. ST c32, 6 * SIZE(BO)
  2025. ST c42, 7 * SIZE(BO)
  2026. #else
  2027. ST c11, 0 * SIZE(AO)
  2028. ST c12, 1 * SIZE(AO)
  2029. ST c21, 2 * SIZE(AO)
  2030. ST c22, 3 * SIZE(AO)
  2031. ST c31, 4 * SIZE(AO)
  2032. ST c32, 5 * SIZE(AO)
  2033. ST c41, 6 * SIZE(AO)
  2034. ST c42, 7 * SIZE(AO)
  2035. #endif
  /* Store two consecutive elements through each of CO1..CO4. */
  2036. ST c11, 0 * SIZE(CO1)
  2037. ST c12, 1 * SIZE(CO1)
  2038. ST c21, 0 * SIZE(CO2)
  2039. ST c22, 1 * SIZE(CO2)
  2040. ST c31, 0 * SIZE(CO3)
  2041. ST c32, 1 * SIZE(CO3)
  2042. ST c41, 0 * SIZE(CO4)
  2043. ST c42, 1 * SIZE(CO4)
  2044. #ifndef LN
  2045. daddiu CO1, CO1, 2 * SIZE
  2046. daddiu CO2, CO2, 2 * SIZE
  2047. daddiu CO3, CO3, 2 * SIZE
  2048. daddiu CO4, CO4, 2 * SIZE
  2049. #endif
  2050. #ifdef RT
  /* RT: AORIG += K * 2 elements. */
  2051. dsll TEMP, K, 1 + BASE_SHIFT
  2052. daddu AORIG, AORIG, TEMP
  2053. #endif
  2054. #if defined(LT) || defined(RN)
  /* LT/RN: skip the remaining K - KK steps: AO += (K-KK)*2, BO += (K-KK)*4 elements. */
  2055. dsubu TEMP, K, KK
  2056. dsll L, TEMP, 1 + BASE_SHIFT
  2057. dsll TEMP, TEMP, 2 + BASE_SHIFT
  2058. daddu AO, AO, L
  2059. daddu BO, BO, TEMP
  2060. #endif
  2061. #ifdef LT
  2062. daddiu KK, KK, 2
  2063. #endif
  2064. #ifdef LN
  2065. daddiu KK, KK, -2
  2066. #endif
  /* Clear c11, c21, c31, c41 via a1 (MTC $0 presumably moves integer zero
     into the FP register); a1 itself is overwritten, so .L31 presumably
     reloads it -- .L31 is outside this excerpt. */
  2067. MTC $0, a1
  2068. MOV c11, a1
  2069. MOV c21, a1
  2070. MOV c31, a1
  2071. daddiu I, I, -1
  2072. bgtz I, .L31
  /* Follows the branch (presumably its delay slot). */
  2073. MOV c41, c11
  2074. .align 3
  /*
   * .L49: bookkeeping after the four-column panel.
   *   LN    : B += K * 4 elements.
   *   LT/RN : B takes the current BO.
   *   RN/RT : KK moves by +4 / -4.
   */
  2075. .L49:
  2076. #ifdef LN
  2077. dsll TEMP, K, 2 + BASE_SHIFT
  2078. daddu B, B, TEMP
  2079. #endif
  2080. #if defined(LT) || defined(RN)
  2081. move B, BO
  2082. #endif
  2083. #ifdef RN
  2084. daddiu KK, KK, 4
  2085. #endif
  2086. #ifdef RT
  2087. daddiu KK, KK, -4
  2088. #endif
  2089. .align 3
  /*
   * .L50: two-column panel, executed only when bit 1 of N is set.
   * Sets up CO1/CO2, KK and AO/AORIG for the panel, then, if M is odd,
   * initialises the single-element tile (accumulators c11, c21, c31, c41)
   * and enters the unrolled loop at .L62 or the remainder code at .L65.
   */
  2090. .L50:
  2091. andi J, N, 2
  2092. blez J, .L70
  /* NOTE(review): no NOP follows the branch above; the next instruction
     (dsll TEMP under RT, otherwise move AO, A) presumably executes in its
     delay slot.  Both only write registers that are rewritten after .L70
     in the code visible here. */
  2093. #ifdef RT
  /* RT: B -= K * 2 elements, C -= 2 * LDC. */
  2094. dsll TEMP, K, 1 + BASE_SHIFT
  2095. dsubu B, B, TEMP
  2096. dsll TEMP, LDC, 1
  2097. dsubu C, C, TEMP
  2098. #endif
  2099. move AO, A
  2100. move CO1, C
  2101. daddu CO2, C, LDC
  2102. #ifdef LN
  2103. daddu KK, M, OFFSET
  2104. #endif
  2105. #ifdef LT
  2106. move KK, OFFSET
  2107. #endif
  2108. #if defined(LN) || defined(RT)
  2109. move AORIG, A
  2110. #else
  /* Repeats the move AO, A already done a few lines earlier. */
  2111. move AO, A
  2112. #endif
  2113. #ifndef RT
  /* C moves past the two columns just assigned to CO1/CO2. */
  2114. daddu C, CO2, LDC
  2115. #endif
  /* Odd M: handle a single-element tile first. */
  2116. andi I, M, 1
  2117. blez I, .L60
  2118. NOP
  2119. #if defined(LT) || defined(RN)
  /* L = KK / 4 iterations of the four-way unrolled loop. */
  2120. dsra L, KK, 2
  2121. LD a1, 0 * SIZE(AO)
  2122. MTC $0, c11
  2123. LD a2, 1 * SIZE(AO)
  2124. MOV c21, c11
  2125. LD a3, 2 * SIZE(AO)
  2126. MOV c31, c11
  2127. LD a4, 3 * SIZE(AO)
  2128. MOV c41, c11
  2129. LD b1, 0 * SIZE(B)
  2130. LD b2, 1 * SIZE(B)
  2131. LD b3, 2 * SIZE(B)
  2132. LD b4, 3 * SIZE(B)
  /* b5, b6, b7 are loaded here but not read by .L62/.L66/.L68. */
  2133. LD b5, 4 * SIZE(B)
  2134. LD b6, 8 * SIZE(B)
  2135. LD b7, 12 * SIZE(B)
  2136. blez L, .L65
  /* Follows the branch (presumably its delay slot): BO starts at B. */
  2137. move BO, B
  2138. #else
  2139. #ifdef LN
  /* LN: AORIG -= K elements. */
  2140. dsll TEMP, K, BASE_SHIFT
  2141. dsubu AORIG, AORIG, TEMP
  2142. #endif
  /* AO = AORIG + KK elements, BO = B + KK * 2 elements; TEMP = K - KK. */
  2143. dsll L, KK, 0 + BASE_SHIFT
  2144. dsll TEMP, KK, 1 + BASE_SHIFT
  2145. daddu AO, AORIG, L
  2146. daddu BO, B, TEMP
  2147. dsubu TEMP, K, KK
  2148. dsra L, TEMP, 2
  2149. LD a1, 0 * SIZE(AO)
  2150. MTC $0, c11
  2151. LD a2, 1 * SIZE(AO)
  2152. MOV c21, c11
  2153. LD a3, 2 * SIZE(AO)
  2154. MOV c31, c11
  2155. LD a4, 3 * SIZE(AO)
  2156. MOV c41, c11
  2157. LD b1, 0 * SIZE(BO)
  2158. LD b2, 1 * SIZE(BO)
  2159. LD b3, 2 * SIZE(BO)
  2160. LD b4, 3 * SIZE(BO)
  2161. LD b5, 4 * SIZE(BO)
  2162. LD b6, 8 * SIZE(BO)
  2163. LD b7, 12 * SIZE(BO)
  2164. blez L, .L65
  2165. NOP
  2166. #endif
  2167. .align 3
  /*
   * .L62: four-way unrolled accumulation loop for the single-element,
   * two-column tile.  Each pass consumes four values from AO (a1..a4) and
   * eight from BO.  Steps using a1/a3 accumulate into c11/c21 and steps
   * using a2/a4 into c31/c41, giving two independent accumulator pairs.
   * MADD d, c, a, b is presumably d = c + a * b (macro defined elsewhere).
   */
  2168. .L62:
  2169. MADD c11, c11, a1, b1
  2170. LD b1, 4 * SIZE(BO)
  2171. MADD c21, c21, a1, b2
  2172. LD b2, 5 * SIZE(BO)
  2173. MADD c31, c31, a2, b3
  2174. LD b3, 6 * SIZE(BO)
  2175. MADD c41, c41, a2, b4
  2176. LD b4, 7 * SIZE(BO)
  /* a1/a2 are free now: preload them for the next pass. */
  2177. LD a1, 4 * SIZE(AO)
  2178. LD a2, 5 * SIZE(AO)
  2179. MADD c11, c11, a3, b1
  2180. LD b1, 8 * SIZE(BO)
  2181. MADD c21, c21, a3, b2
  2182. LD b2, 9 * SIZE(BO)
  2183. MADD c31, c31, a4, b3
  2184. LD b3, 10 * SIZE(BO)
  2185. MADD c41, c41, a4, b4
  2186. LD b4, 11 * SIZE(BO)
  2187. LD a3, 6 * SIZE(AO)
  2188. LD a4, 7 * SIZE(AO)
  2189. daddiu L, L, -1
  2190. daddiu AO, AO, 4 * SIZE
  2191. bgtz L, .L62
  /* BO += 8 elements; follows the branch (presumably its delay slot). */
  2192. daddiu BO, BO, 8 * SIZE
  2193. .align 3
  /*
   * .L65 / .L66: remainder loop for the single-element, two-column tile.
   * Trip count is the low two bits of KK (LT/RN) or of TEMP (LN/RT).
   * Each step multiplies one value from AO (a1) by two from BO (b1, b2).
   */
  2194. .L65:
  2195. #if defined(LT) || defined(RN)
  2196. andi L, KK, 3
  2197. #else
  2198. andi L, TEMP, 3
  2199. #endif
  2200. NOP
  2201. blez L, .L68
  2202. NOP
  2203. .align 3
  2204. .L66:
  2205. MADD c11, c11, a1, b1
  2206. LD b1, 2 * SIZE(BO)
  2207. MADD c21, c21, a1, b2
  2208. LD b2, 3 * SIZE(BO)
  2209. LD a1, 1 * SIZE(AO)
  2210. daddiu L, L, -1
  2211. daddiu AO, AO, 1 * SIZE
  2212. bgtz L, .L66
  /* BO += 2 elements; follows the branch (presumably its delay slot). */
  2213. daddiu BO, BO, 2 * SIZE
  /*
   * .L68: finish the single-element, two-column tile.  Folds the second
   * accumulator pair into the first, subtracts from the packed right-hand
   * side, applies the solve for the selected variant, stores to the packed
   * buffer and to CO1/CO2, then advances pointers and KK.
   * SUB / MUL / NMSUB are macros defined outside this excerpt (presumably
   * d = a - b, d = a * b, d = c - a * b).
   */
  2214. .L68:
  /* Merge the partial sums kept separately by the unrolled loop. */
  2215. ADD c11, c11, c31
  2216. ADD c21, c21, c41
  2217. #if defined(LN) || defined(RT)
  /* TEMP = KK - 1 (LN) or KK - 2 (RT); AO = AORIG + TEMP elements,
     BO = B + TEMP * 2 elements. */
  2218. #ifdef LN
  2219. daddiu TEMP, KK, -1
  2220. #else
  2221. daddiu TEMP, KK, -2
  2222. #endif
  2223. dsll L, TEMP, 0 + BASE_SHIFT
  2224. dsll TEMP, TEMP, 1 + BASE_SHIFT
  2225. daddu AO, AORIG, L
  2226. daddu BO, B, TEMP
  2227. #endif
  2228. #if defined(LN) || defined(LT)
  2229. LD b1, 0 * SIZE(BO)
  2230. LD b2, 1 * SIZE(BO)
  2231. SUB c11, b1, c11
  2232. SUB c21, b2, c21
  2233. #else
  2234. LD b1, 0 * SIZE(AO)
  2235. LD b2, 1 * SIZE(AO)
  2236. SUB c11, b1, c11
  2237. SUB c21, b2, c21
  2238. #endif
  2239. #if defined(LN) || defined(LT)
  /* Single coefficient from AO scales both values.
     NOTE(review): multiply rather than divide -- presumably the packed
     diagonal holds reciprocals; confirm against the pack routine. */
  2240. LD b3, 0 * SIZE(AO)
  2241. MUL c11, b3, c11
  2242. MUL c21, b3, c21
  2243. #endif
  2244. #ifdef RN
  /* RN: first column (BO[0]), eliminate with BO[1], second column (BO[3]). */
  2245. LD b1, 0 * SIZE(BO)
  2246. LD b2, 1 * SIZE(BO)
  2247. LD b3, 3 * SIZE(BO)
  2248. MUL c11, b1, c11
  2249. NMSUB c21, c21, b2, c11
  2250. MUL c21, b3, c21
  2251. #endif
  2252. #ifdef RT
  /* RT: second column (BO[3]), eliminate with BO[2], first column (BO[0]). */
  2253. LD b1, 3 * SIZE(BO)
  2254. LD b2, 2 * SIZE(BO)
  2255. LD b3, 0 * SIZE(BO)
  2256. MUL c21, b1, c21
  2257. NMSUB c11, c11, b2, c21
  2258. MUL c11, b3, c11
  2259. #endif
  2260. #ifdef LN
  /* LN: step the output pointers back by one element before storing. */
  2261. daddiu CO1, CO1, -1 * SIZE
  2262. daddiu CO2, CO2, -1 * SIZE
  2263. #endif
  2264. #if defined(LN) || defined(LT)
  2265. ST c11, 0 * SIZE(BO)
  2266. ST c21, 1 * SIZE(BO)
  2267. #else
  2268. ST c11, 0 * SIZE(AO)
  2269. ST c21, 1 * SIZE(AO)
  2270. #endif
  2271. ST c11, 0 * SIZE(CO1)
  2272. ST c21, 0 * SIZE(CO2)
  2273. #ifndef LN
  2274. daddiu CO1, CO1, 1 * SIZE
  2275. daddiu CO2, CO2, 1 * SIZE
  2276. #endif
  2277. #ifdef RT
  /* RT: AORIG += K elements. */
  2278. dsll TEMP, K, 0 + BASE_SHIFT
  2279. daddu AORIG, AORIG, TEMP
  2280. #endif
  2281. #if defined(LT) || defined(RN)
  /* LT/RN: AO += (K - KK) elements, BO += (K - KK) * 2 elements. */
  2282. dsubu TEMP, K, KK
  2283. dsll L, TEMP, 0 + BASE_SHIFT
  2284. dsll TEMP, TEMP, 1 + BASE_SHIFT
  2285. daddu AO, AO, L
  2286. daddu BO, BO, TEMP
  2287. #endif
  2288. #ifdef LT
  2289. daddiu KK, KK, 1
  2290. #endif
  2291. #ifdef LN
  2292. daddiu KK, KK, -1
  2293. #endif
  2294. .align 3
  /*
   * .L60 / .L51: two-element tiles of the two-column panel.
   * I = M / 2 tiles; for each, clear c11, c21, c12, c22, preload the first
   * operands and enter the unrolled loop (.L52) or the remainder (.L55).
   */
  2295. .L60:
  2296. dsra I, M, 1
  2297. blez I, .L69
  2298. NOP
  2299. .L51:
  2300. #if defined(LT) || defined(RN)
  2301. LD a1, 0 * SIZE(AO)
  2302. MTC $0, c11
  2303. LD a2, 1 * SIZE(AO)
  2304. MOV c21, c11
  /* a5 / b5 are preloaded for the third unrolled step of .L52. */
  2305. LD a5, 4 * SIZE(AO)
  2306. LD b1, 0 * SIZE(B)
  2307. MOV c12, c11
  2308. LD b2, 1 * SIZE(B)
  2309. MOV c22, c11
  2310. LD b3, 2 * SIZE(B)
  2311. LD b5, 4 * SIZE(B)
  /* L = KK / 4 unrolled iterations. */
  2312. dsra L, KK, 2
  /* b6, b7 are loaded here but not read by .L52/.L56/.L58. */
  2313. LD b6, 8 * SIZE(B)
  2314. LD b7, 12 * SIZE(B)
  2315. blez L, .L55
  /* Follows the branch (presumably its delay slot): BO starts at B. */
  2316. move BO, B
  2317. #else
  2318. #ifdef LN
  /* LN: AORIG -= K * 2 elements. */
  2319. dsll TEMP, K, 1 + BASE_SHIFT
  2320. dsubu AORIG, AORIG, TEMP
  2321. #endif
  /* AO = AORIG + KK * 2 elements, BO = B + KK * 2 elements; TEMP = K - KK. */
  2322. dsll L, KK, 1 + BASE_SHIFT
  2323. dsll TEMP, KK, 1 + BASE_SHIFT
  2324. daddu AO, AORIG, L
  2325. daddu BO, B, TEMP
  2326. dsubu TEMP, K, KK
  2327. LD a1, 0 * SIZE(AO)
  2328. MTC $0, c11
  2329. LD a2, 1 * SIZE(AO)
  2330. MOV c21, c11
  2331. LD a5, 4 * SIZE(AO)
  2332. LD b1, 0 * SIZE(BO)
  2333. MOV c12, c11
  2334. LD b2, 1 * SIZE(BO)
  2335. MOV c22, c11
  2336. LD b3, 2 * SIZE(BO)
  2337. LD b5, 4 * SIZE(BO)
  2338. dsra L, TEMP, 2
  2339. LD b6, 8 * SIZE(BO)
  2340. LD b7, 12 * SIZE(BO)
  2341. blez L, .L55
  2342. NOP
  2343. #endif
  2344. .align 3
  /*
   * .L52: four-way unrolled accumulation loop for the two-element,
   * two-column tile.  Each pass consumes eight values from AO and eight
   * from BO and updates c11, c21 (first AO value of a step) and c12, c22
   * (second AO value of a step).  Loads are interleaved with the
   * multiply-adds; the values needed at the top of the next pass
   * (a1, a2, a5, b1, b2, b3, b5) are loaded at offsets 8..12, i.e. relative
   * to the pointers before they advance by 8 at the bottom.
   * MADD d, c, a, b is presumably d = c + a * b (macro defined elsewhere).
   */
  2345. .L52:
  /* Step 0: AO[0], AO[1] with BO[0], BO[1]. */
  2346. MADD c11, c11, a1, b1
  2347. LD a3, 2 * SIZE(AO)
  2348. MADD c21, c21, a1, b2
  2349. LD b4, 3 * SIZE(BO)
  2350. MADD c12, c12, a2, b1
  2351. LD a4, 3 * SIZE(AO)
  2352. MADD c22, c22, a2, b2
  2353. LD b1, 8 * SIZE(BO)
  /* Step 1: AO[2], AO[3] with BO[2], BO[3]. */
  2354. MADD c11, c11, a3, b3
  2355. LD a1, 8 * SIZE(AO)
  2356. MADD c21, c21, a3, b4
  2357. LD b2, 5 * SIZE(BO)
  2358. MADD c12, c12, a4, b3
  2359. LD a2, 5 * SIZE(AO)
  2360. MADD c22, c22, a4, b4
  2361. LD b3, 6 * SIZE(BO)
  /* Step 2: AO[4] (a5), AO[5] with BO[4] (b5), BO[5]. */
  2362. MADD c11, c11, a5, b5
  2363. LD a3, 6 * SIZE(AO)
  2364. MADD c21, c21, a5, b2
  2365. LD b4, 7 * SIZE(BO)
  2366. MADD c12, c12, a2, b5
  2367. LD a4, 7 * SIZE(AO)
  2368. MADD c22, c22, a2, b2
  2369. LD b5, 12 * SIZE(BO)
  /* Step 3: AO[6], AO[7] with BO[6], BO[7]. */
  2370. MADD c11, c11, a3, b3
  2371. LD a5, 12 * SIZE(AO)
  2372. MADD c21, c21, a3, b4
  2373. LD b2, 9 * SIZE(BO)
  2374. MADD c12, c12, a4, b3
  2375. LD a2, 9 * SIZE(AO)
  2376. MADD c22, c22, a4, b4
  2377. LD b3, 10 * SIZE(BO)
  2378. daddiu AO, AO, 8 * SIZE
  2379. daddiu L, L, -1
  2380. bgtz L, .L52
  /* BO += 8 elements; follows the branch (presumably its delay slot). */
  2381. daddiu BO, BO, 8 * SIZE
  2382. .align 3
  /*
   * .L55 / .L56: remainder loop for the two-element, two-column tile.
   * Trip count is the low two bits of KK (LT/RN) or of TEMP (LN/RT).
   * Each step multiplies two values from AO (a1, a2) by two from BO (b1, b2).
   */
  2383. .L55:
  2384. #if defined(LT) || defined(RN)
  2385. andi L, KK, 3
  2386. #else
  2387. andi L, TEMP, 3
  2388. #endif
  2389. NOP
  2390. blez L, .L58
  2391. NOP
  2392. .align 3
  2393. .L56:
  2394. MADD c11, c11, a1, b1
  2395. LD a2, 1 * SIZE(AO)
  2396. MADD c21, c21, a1, b2
  /* a1 for the next step is at offset 2 because AO has not advanced yet. */
  2397. LD a1, 2 * SIZE(AO)
  2398. MADD c12, c12, a2, b1
  2399. LD b1, 2 * SIZE(BO)
  2400. MADD c22, c22, a2, b2
  2401. LD b2, 3 * SIZE(BO)
  2402. daddiu L, L, -1
  2403. daddiu AO, AO, 2 * SIZE
  2404. bgtz L, .L56
  /* BO += 2 elements; follows the branch (presumably its delay slot). */
  2405. daddiu BO, BO, 2 * SIZE
  /*
   * .L58: finish one two-element, two-column tile (c11, c21, c12, c22):
   * subtract from the packed right-hand side, apply the 2x2 solve for the
   * selected variant, store to the packed buffer and to CO1/CO2, advance
   * pointers and KK, and loop to .L51 while I > 0.
   * SUB / MUL / NMSUB are macros defined outside this excerpt (presumably
   * d = a - b, d = a * b, d = c - a * b).
   */
  2406. .L58:
  2407. #if defined(LN) || defined(RT)
  /* Both arms compute KK - 2 here, so the inner #ifdef makes no difference. */
  2408. #ifdef LN
  2409. daddiu TEMP, KK, -2
  2410. #else
  2411. daddiu TEMP, KK, -2
  2412. #endif
  /* AO = AORIG + TEMP * 2 elements, BO = B + TEMP * 2 elements. */
  2413. dsll L, TEMP, 1 + BASE_SHIFT
  2414. dsll TEMP, TEMP, 1 + BASE_SHIFT
  2415. daddu AO, AORIG, L
  2416. daddu BO, B, TEMP
  2417. #endif
  2418. #if defined(LN) || defined(LT)
  /* Buffer order at BO: c11, c21, c12, c22. */
  2419. LD b1, 0 * SIZE(BO)
  2420. LD b2, 1 * SIZE(BO)
  2421. LD b3, 2 * SIZE(BO)
  2422. LD b4, 3 * SIZE(BO)
  2423. SUB c11, b1, c11
  2424. SUB c21, b2, c21
  2425. SUB c12, b3, c12
  2426. SUB c22, b4, c22
  2427. #else
  /* Buffer order at AO: c11, c12, c21, c22. */
  2428. LD b1, 0 * SIZE(AO)
  2429. LD b2, 1 * SIZE(AO)
  2430. LD b3, 2 * SIZE(AO)
  2431. LD b4, 3 * SIZE(AO)
  2432. SUB c11, b1, c11
  2433. SUB c12, b2, c12
  2434. SUB c21, b3, c21
  2435. SUB c22, b4, c22
  2436. #endif
  2437. #ifdef LN
  /* LN: second row (AO[3]), eliminate with AO[2], first row (AO[0]).
     NOTE(review): multiply rather than divide -- presumably the packed
     diagonal holds reciprocals; confirm against the pack routine. */
  2438. LD b1, 3 * SIZE(AO)
  2439. LD b2, 2 * SIZE(AO)
  2440. LD b3, 0 * SIZE(AO)
  2441. MUL c12, b1, c12
  2442. MUL c22, b1, c22
  2443. NMSUB c11, c11, b2, c12
  2444. NMSUB c21, c21, b2, c22
  2445. MUL c11, b3, c11
  2446. MUL c21, b3, c21
  2447. #endif
  2448. #ifdef LT
  /* LT: first row (AO[0]), eliminate with AO[1], second row (AO[3]). */
  2449. LD b1, 0 * SIZE(AO)
  2450. LD b2, 1 * SIZE(AO)
  2451. LD b3, 3 * SIZE(AO)
  2452. MUL c11, b1, c11
  2453. MUL c21, b1, c21
  2454. NMSUB c12, c12, b2, c11
  2455. NMSUB c22, c22, b2, c21
  2456. MUL c12, b3, c12
  2457. MUL c22, b3, c22
  2458. #endif
  2459. #ifdef RN
  /* RN: first column (BO[0]), eliminate with BO[1], second column (BO[3]). */
  2460. LD b1, 0 * SIZE(BO)
  2461. LD b2, 1 * SIZE(BO)
  2462. LD b3, 3 * SIZE(BO)
  2463. MUL c11, b1, c11
  2464. MUL c12, b1, c12
  2465. NMSUB c21, c21, b2, c11
  2466. NMSUB c22, c22, b2, c12
  2467. MUL c21, b3, c21
  2468. MUL c22, b3, c22
  2469. #endif
  2470. #ifdef RT
  /* RT: second column (BO[3]), eliminate with BO[2], first column (BO[0]). */
  2471. LD b1, 3 * SIZE(BO)
  2472. LD b2, 2 * SIZE(BO)
  2473. LD b3, 0 * SIZE(BO)
  2474. MUL c21, b1, c21
  2475. MUL c22, b1, c22
  2476. NMSUB c11, c11, b2, c21
  2477. NMSUB c12, c12, b2, c22
  2478. MUL c11, b3, c11
  2479. MUL c12, b3, c12
  2480. #endif
  2481. #ifdef LN
  /* LN: step the output pointers back by two elements before storing. */
  2482. daddiu CO1, CO1, -2 * SIZE
  2483. daddiu CO2, CO2, -2 * SIZE
  2484. #endif
  2485. #if defined(LN) || defined(LT)
  2486. ST c11, 0 * SIZE(BO)
  2487. ST c21, 1 * SIZE(BO)
  2488. ST c12, 2 * SIZE(BO)
  2489. ST c22, 3 * SIZE(BO)
  2490. #else
  2491. ST c11, 0 * SIZE(AO)
  2492. ST c12, 1 * SIZE(AO)
  2493. ST c21, 2 * SIZE(AO)
  2494. ST c22, 3 * SIZE(AO)
  2495. #endif
  2496. ST c11, 0 * SIZE(CO1)
  2497. ST c12, 1 * SIZE(CO1)
  2498. ST c21, 0 * SIZE(CO2)
  2499. ST c22, 1 * SIZE(CO2)
  2500. #ifndef LN
  2501. daddiu CO1, CO1, 2 * SIZE
  2502. daddiu CO2, CO2, 2 * SIZE
  2503. #endif
  2504. #ifdef RT
  /* RT: AORIG += K * 2 elements. */
  2505. dsll TEMP, K, 1 + BASE_SHIFT
  2506. daddu AORIG, AORIG, TEMP
  2507. #endif
  2508. #if defined(LT) || defined(RN)
  /* LT/RN: AO and BO both advance by (K - KK) * 2 elements. */
  2509. dsubu TEMP, K, KK
  2510. dsll TEMP, TEMP, 1 + BASE_SHIFT
  2511. daddu AO, AO, TEMP
  2512. daddu BO, BO, TEMP
  2513. #endif
  2514. #ifdef LT
  2515. daddiu KK, KK, 2
  2516. #endif
  2517. #ifdef LN
  2518. daddiu KK, KK, -2
  2519. #endif
  /* Clears c11, c21, c31, c41 via a1.  .L51 re-clears c11/c21/c12/c22 and
     reloads a1 on entry, so for the loop-back path these writes look
     redundant -- NOTE(review): confirm before removing. */
  2520. MTC $0, a1
  2521. MOV c11, a1
  2522. MOV c21, a1
  2523. MOV c31, a1
  2524. daddiu I, I, -1
  2525. bgtz I, .L51
  /* Follows the branch (presumably its delay slot). */
  2526. MOV c41, c11
  2527. .align 3
  /*
   * .L69: bookkeeping after the two-column panel.
   *   LN    : B += K * 2 elements.
   *   LT/RN : B takes the current BO.
   *   RN/RT : KK moves by +2 / -2.
   */
  2528. .L69:
  2529. #ifdef LN
  2530. dsll TEMP, K, 1 + BASE_SHIFT
  2531. daddu B, B, TEMP
  2532. #endif
  2533. #if defined(LT) || defined(RN)
  2534. move B, BO
  2535. #endif
  2536. #ifdef RN
  2537. daddiu KK, KK, 2
  2538. #endif
  2539. #ifdef RT
  2540. daddiu KK, KK, -2
  2541. #endif
  2542. .align 3
  /*
   * .L70: one-column panel, executed only when bit 0 of N is set; otherwise
   * control goes straight to the epilogue at .L999.  Sets up CO1, KK and
   * AO/AORIG, then, if M is odd, initialises the single-element tile
   * (accumulators c11, c21) and enters .L82 or .L85.
   */
  2543. .L70:
  2544. andi J, N, 1
  2545. blez J, .L999
  2546. NOP
  2547. #ifdef RT
  /* RT: B -= K elements, C -= LDC. */
  2548. dsll TEMP, K, BASE_SHIFT
  2549. dsubu B, B, TEMP
  2550. dsubu C, C, LDC
  2551. #endif
  2552. move AO, A
  2553. move CO1, C
  2554. #ifdef LN
  2555. daddu KK, M, OFFSET
  2556. #endif
  2557. #ifdef LT
  2558. move KK, OFFSET
  2559. #endif
  2560. #if defined(LN) || defined(RT)
  2561. move AORIG, A
  2562. #else
  /* Repeats the move AO, A already done a few lines earlier. */
  2563. move AO, A
  2564. #endif
  2565. #ifndef RT
  2566. daddu C, CO1, LDC
  2567. #endif
  /* Odd M: handle a single-element tile first. */
  2568. andi I, M, 1
  2569. blez I, .L80
  2570. NOP
  2571. #if defined(LT) || defined(RN)
  /* These preloads are overwritten or unused: .L82/.L86 reload a1 and b1
     from memory at the start of every step and read no other a/b register. */
  2572. LD a1, 0 * SIZE(AO)
  2573. MTC $0, c11
  2574. LD a2, 1 * SIZE(AO)
  2575. MOV c21, c11
  2576. LD a3, 2 * SIZE(AO)
  2577. LD a4, 3 * SIZE(AO)
  2578. LD b1, 0 * SIZE(B)
  2579. LD b2, 1 * SIZE(B)
  2580. LD b3, 2 * SIZE(B)
  2581. LD b4, 3 * SIZE(B)
  2582. LD b5, 4 * SIZE(B)
  2583. LD b6, 8 * SIZE(B)
  2584. LD b7, 12 * SIZE(B)
  /* L = KK / 4 unrolled iterations. */
  2585. dsra L, KK, 2
  2586. blez L, .L85
  /* Follows the branch (presumably its delay slot): BO starts at B. */
  2587. move BO, B
  2588. #else
  2589. #ifdef LN
  /* LN: AORIG -= K elements. */
  2590. dsll TEMP, K, BASE_SHIFT
  2591. dsubu AORIG, AORIG, TEMP
  2592. #endif
  /* AO = AORIG + KK elements, BO = B + KK elements; TEMP = K - KK. */
  2593. dsll TEMP, KK, BASE_SHIFT
  2594. daddu AO, AORIG, TEMP
  2595. daddu BO, B, TEMP
  2596. dsubu TEMP, K, KK
  2597. LD a1, 0 * SIZE(AO)
  2598. MTC $0, c11
  2599. LD a2, 1 * SIZE(AO)
  2600. MOV c21, c11
  2601. LD a3, 2 * SIZE(AO)
  2602. LD a4, 3 * SIZE(AO)
  2603. LD b1, 0 * SIZE(BO)
  2604. LD b2, 1 * SIZE(BO)
  2605. LD b3, 2 * SIZE(BO)
  2606. LD b4, 3 * SIZE(BO)
  2607. LD b5, 4 * SIZE(BO)
  2608. LD b6, 8 * SIZE(BO)
  2609. LD b7, 12 * SIZE(BO)
  2610. dsra L, TEMP, 2
  2611. blez L, .L85
  2612. NOP
  2613. #endif
  2614. .align 3
  /*
   * .L82: four-way unrolled dot-product loop for the single-element,
   * one-column tile.  Products alternate between c11 and c21, giving two
   * independent accumulators.  Unlike the wider tiles, a1 and b1 are
   * loaded immediately before each multiply-add rather than pipelined.
   * MADD d, c, a, b is presumably d = c + a * b (macro defined elsewhere).
   */
  2615. .L82:
  2616. LD a1, 0 * SIZE(AO)
  2617. LD b1, 0 * SIZE(BO)
  2618. MADD c11, c11, a1, b1
  2619. LD a1, 1 * SIZE(AO)
  2620. LD b1, 1 * SIZE(BO)
  2621. MADD c21, c21, a1, b1
  2622. LD a1, 2 * SIZE(AO)
  2623. LD b1, 2 * SIZE(BO)
  2624. MADD c11, c11, a1, b1
  2625. LD a1, 3 * SIZE(AO)
  2626. LD b1, 3 * SIZE(BO)
  2627. MADD c21, c21, a1, b1
  2628. daddiu L, L, -1
  2629. daddiu AO, AO, 4 * SIZE
  2630. bgtz L, .L82
  /* BO += 4 elements; follows the branch (presumably its delay slot). */
  2631. daddiu BO, BO, 4 * SIZE
  2632. .align 3
  /*
   * .L85 / .L86: remainder loop for the single-element, one-column tile.
   * Trip count is the low two bits of KK (LT/RN) or of TEMP (LN/RT);
   * each step adds one product AO[0] * BO[0] into c11.
   */
  2633. .L85:
  2634. #if defined(LT) || defined(RN)
  2635. andi L, KK, 3
  2636. #else
  2637. andi L, TEMP, 3
  2638. #endif
  2639. NOP
  2640. blez L, .L88
  2641. NOP
  2642. .align 3
  2643. .L86:
  2644. LD a1, 0 * SIZE(AO)
  2645. LD b1, 0 * SIZE(BO)
  2646. MADD c11, c11, a1, b1
  2647. daddiu L, L, -1
  2648. daddiu AO, AO, 1 * SIZE
  2649. bgtz L, .L86
  /* BO += 1 element; follows the branch (presumably its delay slot). */
  2650. daddiu BO, BO, 1 * SIZE
  /*
   * .L88: finish the single-element, one-column tile: merge the two
   * accumulators, subtract from the packed right-hand side, scale by the
   * single coefficient, store to the packed buffer and to CO1, then
   * advance pointers and KK.
   * SUB / MUL are macros defined outside this excerpt (presumably
   * d = a - b and d = a * b).
   */
  2651. .L88:
  2652. ADD c11, c11, c21
  2653. #if defined(LN) || defined(RT)
  /* Both arms compute KK - 1 here, so the inner #ifdef makes no difference. */
  2654. #ifdef LN
  2655. daddiu TEMP, KK, -1
  2656. #else
  2657. daddiu TEMP, KK, -1
  2658. #endif
  /* AO = AORIG + TEMP elements, BO = B + TEMP elements. */
  2659. dsll TEMP, TEMP, 0 + BASE_SHIFT
  2660. daddu AO, AORIG, TEMP
  2661. daddu BO, B, TEMP
  2662. #endif
  2663. #if defined(LN) || defined(LT)
  2664. LD b1, 0 * SIZE(BO)
  2665. SUB c11, b1, c11
  2666. #else
  2667. LD b1, 0 * SIZE(AO)
  2668. SUB c11, b1, c11
  2669. #endif
  /* NOTE(review): the coefficient is multiplied, not divided -- presumably
     the packed diagonal holds reciprocals; confirm against the pack routine. */
  2670. #if defined(LN) || defined(LT)
  2671. LD b1, 0 * SIZE(AO)
  2672. MUL c11, b1, c11
  2673. #endif
  2674. #if defined(RN) || defined(RT)
  2675. LD b1, 0 * SIZE(BO)
  2676. MUL c11, b1, c11
  2677. #endif
  2678. #ifdef LN
  /* LN: step the output pointer back by one element before storing. */
  2679. daddiu CO1, CO1, -1 * SIZE
  2680. #endif
  2681. #if defined(LN) || defined(LT)
  2682. ST c11, 0 * SIZE(BO)
  2683. #else
  2684. ST c11, 0 * SIZE(AO)
  2685. #endif
  2686. ST c11, 0 * SIZE(CO1)
  2687. #ifndef LN
  2688. daddiu CO1, CO1, 1 * SIZE
  2689. #endif
  2690. #ifdef RT
  /* RT: AORIG += K elements. */
  2691. dsll TEMP, K, BASE_SHIFT
  2692. daddu AORIG, AORIG, TEMP
  2693. #endif
  2694. #if defined(LT) || defined(RN)
  /* LT/RN: AO and BO both advance by (K - KK) elements. */
  2695. dsubu TEMP, K, KK
  2696. dsll TEMP, TEMP, 0 + BASE_SHIFT
  2697. daddu AO, AO, TEMP
  2698. daddu BO, BO, TEMP
  2699. #endif
  2700. #ifdef LT
  2701. daddiu KK, KK, 1
  2702. #endif
  2703. #ifdef LN
  2704. daddiu KK, KK, -1
  2705. #endif
  2706. .align 3
  /*
   * .L80 / .L71: two-element tiles of the one-column panel.
   * I = M / 2 tiles; for each, clear c11, c21, c12, c22, compute the loop
   * count and enter the unrolled loop (.L72) or the remainder (.L75).
   */
  2707. .L80:
  2708. dsra I, M, 1
  2709. blez I, .L89
  2710. NOP
  2711. .L71:
  2712. #if defined(LT) || defined(RN)
  /* The a/b preloads below are not consumed: .L72/.L76 reload a1, a2, b1
     from memory at the start of every step. */
  2713. LD a1, 0 * SIZE(AO)
  2714. MTC $0, c11
  2715. LD a2, 1 * SIZE(AO)
  2716. MOV c21, c11
  2717. LD a5, 4 * SIZE(AO)
  2718. LD b1, 0 * SIZE(B)
  2719. MOV c12, c11
  2720. LD b2, 1 * SIZE(B)
  2721. MOV c22, c11
  2722. LD b3, 2 * SIZE(B)
  2723. LD b5, 4 * SIZE(B)
  /* L = KK / 4 unrolled iterations. */
  2724. dsra L, KK, 2
  2725. LD b6, 8 * SIZE(B)
  2726. LD b7, 12 * SIZE(B)
  2727. blez L, .L75
  /* Follows the branch (presumably its delay slot): BO starts at B. */
  2728. move BO, B
  2729. #else
  2730. #ifdef LN
  /* LN: AORIG -= K * 2 elements. */
  2731. dsll TEMP, K, 1 + BASE_SHIFT
  2732. dsubu AORIG, AORIG, TEMP
  2733. #endif
  /* AO = AORIG + KK * 2 elements, BO = B + KK elements; TEMP = K - KK. */
  2734. dsll L, KK, 1 + BASE_SHIFT
  2735. dsll TEMP, KK, 0 + BASE_SHIFT
  2736. daddu AO, AORIG, L
  2737. daddu BO, B, TEMP
  2738. dsubu TEMP, K, KK
  2739. LD a1, 0 * SIZE(AO)
  2740. MTC $0, c11
  2741. LD a2, 1 * SIZE(AO)
  2742. MOV c21, c11
  2743. LD a5, 4 * SIZE(AO)
  2744. LD b1, 0 * SIZE(BO)
  2745. MOV c12, c11
  2746. LD b2, 1 * SIZE(BO)
  2747. MOV c22, c11
  2748. LD b3, 2 * SIZE(BO)
  2749. LD b5, 4 * SIZE(BO)
  2750. dsra L, TEMP, 2
  2751. LD b6, 8 * SIZE(BO)
  2752. LD b7, 12 * SIZE(BO)
  2753. blez L, .L75
  2754. NOP
  2755. #endif
  2756. .align 3
  /*
   * .L72: four-way unrolled accumulation loop for the two-element,
   * one-column tile.  Each step loads two values from AO (a1, a2) and one
   * from BO (b1) and updates c11 and c12.  Operands are loaded right
   * before use; nothing is carried between steps.
   * MADD d, c, a, b is presumably d = c + a * b (macro defined elsewhere).
   */
  2757. .L72:
  2758. LD a1, 0 * SIZE(AO)
  2759. LD a2, 1 * SIZE(AO)
  2760. LD b1, 0 * SIZE(BO)
  2761. MADD c11, c11, a1, b1
  2762. MADD c12, c12, a2, b1
  2763. LD a1, 2 * SIZE(AO)
  2764. LD a2, 3 * SIZE(AO)
  2765. LD b1, 1 * SIZE(BO)
  2766. MADD c11, c11, a1, b1
  2767. MADD c12, c12, a2, b1
  2768. LD a1, 4 * SIZE(AO)
  2769. LD a2, 5 * SIZE(AO)
  2770. LD b1, 2 * SIZE(BO)
  2771. MADD c11, c11, a1, b1
  2772. MADD c12, c12, a2, b1
  2773. LD a1, 6 * SIZE(AO)
  2774. LD a2, 7 * SIZE(AO)
  2775. LD b1, 3 * SIZE(BO)
  2776. MADD c11, c11, a1, b1
  2777. MADD c12, c12, a2, b1
  2778. daddiu L, L, -1
  2779. daddiu AO, AO, 8 * SIZE
  2780. bgtz L, .L72
  /* BO += 4 elements; follows the branch (presumably its delay slot). */
  2781. daddiu BO, BO, 4 * SIZE
  2782. .align 3
  /*
   * .L75 / .L76: remainder loop for the two-element, one-column tile.
   * Trip count is the low two bits of KK (LT/RN) or of TEMP (LN/RT);
   * each step multiplies AO[0] and AO[1] by BO[0] into c11 and c12.
   */
  2783. .L75:
  2784. #if defined(LT) || defined(RN)
  2785. andi L, KK, 3
  2786. #else
  2787. andi L, TEMP, 3
  2788. #endif
  2789. NOP
  2790. blez L, .L78
  2791. NOP
  2792. .align 3
  2793. .L76:
  2794. LD a1, 0 * SIZE(AO)
  2795. LD a2, 1 * SIZE(AO)
  2796. LD b1, 0 * SIZE(BO)
  2797. MADD c11, c11, a1, b1
  2798. MADD c12, c12, a2, b1
  2799. daddiu L, L, -1
  2800. daddiu AO, AO, 2 * SIZE
  2801. bgtz L, .L76
  /* BO += 1 element; follows the branch (presumably its delay slot). */
  2802. daddiu BO, BO, 1 * SIZE
  /*
   * .L78: finish one two-element, one-column tile (c11, c12): subtract from
   * the packed right-hand side, apply the solve for the selected variant,
   * store to the packed buffer and to CO1, advance pointers and KK, and
   * loop to .L71 while I > 0.
   * SUB / MUL / NMSUB are macros defined outside this excerpt (presumably
   * d = a - b, d = a * b, d = c - a * b).
   */
  2803. .L78:
  /* c21 and c22 are cleared at .L71 and not written by .L72/.L76, so these
     two adds contribute zero in the code visible here. */
  2804. ADD c11, c11, c21
  2805. ADD c12, c12, c22
  2806. #if defined(LN) || defined(RT)
  /* TEMP = KK - 2 (LN) or KK - 1 (RT); AO = AORIG + TEMP * 2 elements,
     BO = B + TEMP elements. */
  2807. #ifdef LN
  2808. daddiu TEMP, KK, -2
  2809. #else
  2810. daddiu TEMP, KK, -1
  2811. #endif
  2812. dsll L, TEMP, 1 + BASE_SHIFT
  2813. dsll TEMP, TEMP, 0 + BASE_SHIFT
  2814. daddu AO, AORIG, L
  2815. daddu BO, B, TEMP
  2816. #endif
  2817. #if defined(LN) || defined(LT)
  2818. LD b1, 0 * SIZE(BO)
  2819. LD b2, 1 * SIZE(BO)
  2820. SUB c11, b1, c11
  2821. SUB c12, b2, c12
  2822. #else
  2823. LD b1, 0 * SIZE(AO)
  2824. LD b2, 1 * SIZE(AO)
  2825. SUB c11, b1, c11
  2826. SUB c12, b2, c12
  2827. #endif
  2828. #ifdef LN
  /* LN: second row (AO[3]), eliminate with AO[2], first row (AO[0]).
     NOTE(review): multiply rather than divide -- presumably the packed
     diagonal holds reciprocals; confirm against the pack routine. */
  2829. LD b1, 3 * SIZE(AO)
  2830. LD b2, 2 * SIZE(AO)
  2831. LD b3, 0 * SIZE(AO)
  2832. MUL c12, b1, c12
  2833. NMSUB c11, c11, b2, c12
  2834. MUL c11, b3, c11
  2835. #endif
  2836. #ifdef LT
  /* LT: first row (AO[0]), eliminate with AO[1], second row (AO[3]). */
  2837. LD b1, 0 * SIZE(AO)
  2838. LD b2, 1 * SIZE(AO)
  2839. LD b3, 3 * SIZE(AO)
  2840. MUL c11, b1, c11
  2841. NMSUB c12, c12, b2, c11
  2842. MUL c12, b3, c12
  2843. #endif
  2844. #if defined(RN) || defined(RT)
  /* RN/RT: one coefficient from BO scales both values. */
  2845. LD b1, 0 * SIZE(BO)
  2846. MUL c11, b1, c11
  2847. MUL c12, b1, c12
  2848. #endif
  2849. #ifdef LN
  /* LN: step the output pointer back by two elements before storing. */
  2850. daddiu CO1, CO1, -2 * SIZE
  2851. #endif
  2852. #if defined(LN) || defined(LT)
  2853. ST c11, 0 * SIZE(BO)
  2854. ST c12, 1 * SIZE(BO)
  2855. #else
  2856. ST c11, 0 * SIZE(AO)
  2857. ST c12, 1 * SIZE(AO)
  2858. #endif
  2859. ST c11, 0 * SIZE(CO1)
  2860. ST c12, 1 * SIZE(CO1)
  2861. #ifndef LN
  2862. daddiu CO1, CO1, 2 * SIZE
  2863. #endif
  2864. #ifdef RT
  /* RT: AORIG += K * 2 elements. */
  2865. dsll TEMP, K, 1 + BASE_SHIFT
  2866. daddu AORIG, AORIG, TEMP
  2867. #endif
  2868. #if defined(LT) || defined(RN)
  /* LT/RN: AO += (K - KK) * 2 elements, BO += (K - KK) elements. */
  2869. dsubu TEMP, K, KK
  2870. dsll L, TEMP, 1 + BASE_SHIFT
  2871. dsll TEMP, TEMP, 0 + BASE_SHIFT
  2872. daddu AO, AO, L
  2873. daddu BO, BO, TEMP
  2874. #endif
  2875. #ifdef LT
  2876. daddiu KK, KK, 2
  2877. #endif
  2878. #ifdef LN
  2879. daddiu KK, KK, -2
  2880. #endif
  2881. daddiu I, I, -1
  2882. bgtz I, .L71
  2883. NOP
  2884. .align 3
  /*
   * .L89: bookkeeping after the one-column panel.
   *   LN    : B += K elements.
   *   LT/RN : B takes the current BO.
   *   RN/RT : KK moves by +1 / -1.
   * Control then falls through to the epilogue at .L999.
   */
  2885. .L89:
  2886. #ifdef LN
  2887. dsll TEMP, K, BASE_SHIFT
  2888. daddu B, B, TEMP
  2889. #endif
  2890. #if defined(LT) || defined(RN)
  2891. move B, BO
  2892. #endif
  2893. #ifdef RN
  2894. daddiu KK, KK, 1
  2895. #endif
  2896. #ifdef RT
  2897. daddiu KK, KK, -1
  2898. #endif
  2899. .align 3
  /*
   * .L999: common exit.  Reloads integer registers $16..$25 and FP
   * registers $f24..$f28 (plus $f20..$f23 when __64BIT__ is not defined)
   * from the stack frame, releases the 144-byte frame and returns via $31.
   * The matching stores are in the prologue, outside this excerpt.
   */
  2900. .L999:
  2901. LDARG $16, 0($sp)
  2902. LDARG $17, 8($sp)
  2903. LDARG $18, 16($sp)
  2904. LDARG $19, 24($sp)
  2905. LDARG $20, 32($sp)
  2906. LDARG $21, 40($sp)
  2907. ldc1 $f24, 48($sp)
  2908. ldc1 $f25, 56($sp)
  2909. ldc1 $f26, 64($sp)
  2910. ldc1 $f27, 72($sp)
  2911. ldc1 $f28, 80($sp)
  2912. LDARG $22, 88($sp)
  2913. LDARG $23, 96($sp)
  2914. LDARG $24, 104($sp)
  2915. LDARG $25, 112($sp)
  /* NOTE(review): offset 112 is read into $25 just before this point and
     into $f20 below when __64BIT__ is undefined.  If the prologue stores
     both to the same slot, one saved value overwrites the other -- check
     the prologue and whether $25 needs to be preserved at all. */
  2916. #ifndef __64BIT__
  2917. ldc1 $f20,112($sp)
  2918. ldc1 $f21,120($sp)
  2919. ldc1 $f22,128($sp)
  2920. ldc1 $f23,136($sp)
  2921. #endif
  2922. j $31
  /* Frame release follows the jump (presumably in its delay slot). */
  2923. daddiu $sp, $sp, 144
  2924. EPILOGUE