You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trsm_kernel_4x4_LT.S 59 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #include "version.h"
  /* Build guard: at least one of EV4, EV5 or EV6 must be defined, */
  /* otherwise preprocessing stops with an error. */
  41. #if !defined(EV4) && !defined(EV5) && !defined(EV6)
  42. #error "Architecture is not specified."
  43. #endif
  /* Per-target tuning. */
  /* PREFETCHSIZE is an element offset (multiplied by SIZE) used by the */
  /* loads into $31 inside the $L12 loop; those loads are assembled only */
  /* when EV4 is not defined, which is why EV4 gets no PREFETCHSIZE. */
  /* They are presumably prefetch hints - confirm against the Alpha manual. */
  /* UNOP expands to a real unop on EV6 and to nothing on EV5 and EV4. */
  44. #ifdef EV6
  45. #define PREFETCHSIZE 56
  46. #define UNOP unop
  47. #endif
  48. #ifdef EV5
  49. #define PREFETCHSIZE 56
  50. #define UNOP
  51. #endif
  52. #ifdef EV4
  53. #define UNOP
  54. #endif
  /* Bytes reserved on the stack by the prologue. The visible prologue */
  /* stores $f2..$f9 at offsets 0..56 and reads three stack-passed values */
  /* at STACKSIZE + 0, + 8 and + 16. */
  55. #define STACKSIZE 80
  /* Integer register aliases. */
  /* M, N, K, A and B are used by the code without being loaded first, so */
  /* they are presumably incoming register arguments - confirm with caller. */
  /* C and LDC are loaded from the stack in the prologue. */
  56. #define M $16
  57. #define N $17
  58. #define K $18
  59. #define A $20
  60. #define B $21
  61. #define C $22
  62. #define LDC $23
  /* C1..C4: four output pointers set up at $L01 as C, C1 + LDC, */
  /* C2 + LDC and C3 + LDC. */
  63. #define C1 $19
  64. #define C2 $24
  65. #define C3 $25
  66. #define C4 $27
  /* AO / BO: moving pointers into the A and B panels. */
  /* NOTE(review): AO is $at, the assembler temporary; confirm the build */
  /* prevents the assembler from using it implicitly. */
  67. #define AO $at
  68. #define BO $5
  /* Loop counters: J starts as N >> 2, I as M >> 2, and L is the */
  /* inner-loop counter that is decremented by 2 per pass of $L12. */
  69. #define I $6
  70. #define J $7
  71. #define L $8
  /* Floating-point operands: a1..a4 are loaded through AO and b1..b4 */
  /* through BO (or B) before and inside the multiply loop. */
  72. #define a1 $f16
  73. #define a2 $f17
  74. #define a3 $f18
  75. #define a4 $f19
  76. #define b1 $f20
  77. #define b2 $f21
  78. #define b3 $f22
  79. #define b4 $f23
  /* t1..t4 receive the MUL results that the following ADDs fold into */
  /* the c01..c16 accumulators. */
  80. #define t1 $f24
  81. #define t2 $f25
  82. #define t3 $f26
  83. #define t4 $f27
  /* Extra operand registers used by the unrolled $L12 loop. */
  /* a6 and alpha are both $f30; alpha is not referenced in the part of */
  /* the file visible here, so the overlap looks harmless - TODO confirm */
  /* against the rest of the file. */
  84. #define a5 $f28
  85. #define a6 $f30
  86. #define b5 $f29
  87. #define alpha $f30
  /* c01..c16: the sixteen accumulators, mapped onto $f0..$f15. */
  /* $f2..$f9 (c03..c10) are the registers the prologue saves on the stack. */
  88. #define c01 $f0
  89. #define c02 $f1
  90. #define c03 $f2
  91. #define c04 $f3
  92. #define c05 $f4
  93. #define c06 $f5
  94. #define c07 $f6
  95. #define c08 $f7
  96. #define c09 $f8
  97. #define c10 $f9
  98. #define c11 $f10
  99. #define c12 $f11
  100. #define c13 $f12
  101. #define c14 $f13
  102. #define c15 $f14
  103. #define c16 $f15
  /* Integer scratch and bookkeeping registers. */
  /* TMP1 / TMP2 hold short-lived address and count computations. */
  /* KK is initialised from OFFSET (negated, copied, or combined with M or */
  /* N, depending on LN / LT / RN / RT). AORIG keeps a copy of A in the */
  /* LN and RT variants. OFFSET is loaded from the stack in the prologue. */
  104. #define TMP1 $0
  105. #define TMP2 $1
  106. #define KK $2
  107. #define AORIG $3
  108. #define OFFSET $4
  109. PROLOGUE
  110. PROFCODE
  111. .frame $sp, STACKSIZE, $26, 0
  112. lda $sp, -STACKSIZE($sp)
  113. ldq C, 0 + STACKSIZE($sp)
  114. ldq LDC, 8 + STACKSIZE($sp)
  115. ldq OFFSET, 16 + STACKSIZE($sp)
  116. SXADDQ LDC, 0, LDC
  117. stt $f2, 0($sp)
  118. stt $f3, 8($sp)
  119. stt $f4, 16($sp)
  120. stt $f5, 24($sp)
  121. stt $f6, 32($sp)
  122. stt $f7, 40($sp)
  123. stt $f8, 48($sp)
  124. stt $f9, 56($sp)
  125. cmple M, 0, $0
  126. cmple N, 0, $1
  127. cmple K, 0, $2
  128. or $0, $1, $0
  129. or $0, $2, $0
  130. bne $0, $L999
  131. #ifdef LN
  132. mulq M, K, TMP1
  133. SXADDQ TMP1, A, A
  134. SXADDQ M, C, C
  135. #endif
  136. #ifdef RN
  137. negq OFFSET, KK
  138. #endif
  139. #ifdef RT
  140. mulq N, K, TMP1
  141. SXADDQ TMP1, B, B
  142. mulq N, LDC, TMP1
  143. addq TMP1, C, C
  144. subq N, OFFSET, KK
  145. #endif
  146. sra N, 2, J
  147. ble J, $L40
  148. .align 4
  149. $L01:
  150. #ifdef RT
  151. sll K, 2 + BASE_SHIFT, TMP1
  152. subq B, TMP1, B
  153. s4addq LDC, 0, TMP1
  154. subq C, TMP1, C
  155. #endif
  156. mov C, C1
  157. addq C, LDC, C2
  158. addq C2, LDC, C3
  159. #ifndef RT
  160. s4addq LDC, C, C
  161. #endif
  162. fclr t1
  163. addq C3, LDC, C4
  164. fclr t2
  165. #ifdef LN
  166. addq M, OFFSET, KK
  167. #endif
  168. #ifdef LT
  169. mov OFFSET, KK
  170. #endif
  171. #if defined(LN) || defined(RT)
  172. mov A, AORIG
  173. #else
  174. mov A, AO
  175. #endif
  176. sra M, 2, I
  177. fclr t3
  178. fclr t4
  179. ble I, $L20
  180. .align 4
  181. $L11:
  182. #if defined(LT) || defined(RN)
  183. LD a1, 0 * SIZE(AO)
  184. fclr c11
  185. LD a2, 1 * SIZE(AO)
  186. fclr c12
  187. LD a3, 2 * SIZE(AO)
  188. fclr c16
  189. LD a4, 3 * SIZE(AO)
  190. fclr c15
  191. LD b1, 0 * SIZE(B)
  192. fclr c01
  193. LD b2, 1 * SIZE(B)
  194. fclr c02
  195. LD b3, 2 * SIZE(B)
  196. fclr c06
  197. LD b4, 3 * SIZE(B)
  198. fclr c05
  199. lds $f31, 4 * SIZE(C1)
  200. fclr c03
  201. lda L, -2(KK)
  202. fclr c04
  203. lds $f31, 7 * SIZE(C2)
  204. fclr c08
  205. lda BO, 4 * SIZE(B)
  206. fclr c13
  207. lds $f31, 4 * SIZE(C3)
  208. fclr c09
  209. lda AO, 4 * SIZE(AO)
  210. fclr c10
  211. lds $f31, 7 * SIZE(C4)
  212. fclr c14
  213. fclr c07
  214. ble KK, $L18
  215. #else
  216. #ifdef LN
  217. sll K, BASE_SHIFT + 2, TMP1
  218. subq AORIG, TMP1, AORIG
  219. #endif
  220. sll KK, BASE_SHIFT + 2, TMP1
  221. addq AORIG, TMP1, AO
  222. addq B, TMP1, BO
  223. subq K, KK, TMP1
  224. LD a1, 0 * SIZE(AO)
  225. fclr c11
  226. LD a2, 1 * SIZE(AO)
  227. fclr c12
  228. LD a3, 2 * SIZE(AO)
  229. fclr c16
  230. LD a4, 3 * SIZE(AO)
  231. fclr c15
  232. LD b1, 0 * SIZE(BO)
  233. fclr c01
  234. LD b2, 1 * SIZE(BO)
  235. fclr c02
  236. LD b3, 2 * SIZE(BO)
  237. fclr c06
  238. LD b4, 3 * SIZE(BO)
  239. fclr c05
  240. lds $f31, 4 * SIZE(C1)
  241. fclr c03
  242. lda L, -2(TMP1)
  243. fclr c04
  244. lds $f31, 7 * SIZE(C2)
  245. fclr c08
  246. lda BO, 4 * SIZE(BO)
  247. fclr c13
  248. lds $f31, 4 * SIZE(C3)
  249. fclr c09
  250. lda AO, 4 * SIZE(AO)
  251. fclr c10
  252. lds $f31, 7 * SIZE(C4)
  253. fclr c14
  254. fclr c07
  255. ble TMP1, $L18
  256. #endif
  257. ble L, $L15
  258. .align 5
  259. $L12:
  260. /* 1 */
  261. ADD c11, t1, c11
  262. #ifndef EV4
  263. ldq $31, PREFETCHSIZE * SIZE(AO)
  264. #else
  265. unop
  266. #endif
  267. MUL b1, a1, t1
  268. #ifndef EV4
  269. ldl $31, PREFETCHSIZE * SIZE(BO)
  270. #else
  271. unop
  272. #endif
  273. ADD c12, t2, c12
  274. unop
  275. MUL b1, a2, t2
  276. unop
  277. ADD c16, t3, c16
  278. unop
  279. MUL b2, a2, t3
  280. LD a5, 0 * SIZE(AO)
  281. ADD c15, t4, c15
  282. unop
  283. MUL b2, a1, t4
  284. LD b5, 0 * SIZE(BO)
  285. /* 2 */
  286. ADD c01, t1, c01
  287. UNOP
  288. MUL b1, a3, t1
  289. UNOP
  290. ADD c02, t2, c02
  291. UNOP
  292. MUL b1, a4, t2
  293. UNOP
  294. ADD c06, t3, c06
  295. unop
  296. MUL b2, a4, t3
  297. unop
  298. ADD c05, t4, c05
  299. unop
  300. MUL b4, a1, t4
  301. unop
  302. /* 3 */
  303. ADD c03, t1, c03
  304. unop
  305. MUL b3, a1, t1
  306. unop
  307. ADD c04, t2, c04
  308. unop
  309. MUL b3, a2, t2
  310. unop
  311. ADD c08, t3, c08
  312. unop
  313. MUL b4, a2, t3
  314. LD a2, 1 * SIZE(AO)
  315. ADD c13, t4, c13
  316. unop
  317. MUL b2, a3, t4
  318. LD b2, 1 * SIZE(BO)
  319. /* 4 */
  320. ADD c09, t1, c09
  321. unop
  322. MUL b3, a3, t1
  323. LD a6, 2 * SIZE(AO)
  324. ADD c10, t2, c10
  325. unop
  326. MUL b3, a4, t2
  327. LD b3, 2 * SIZE(BO)
  328. ADD c14, t3, c14
  329. unop
  330. MUL b4, a4, t3
  331. LD a4, 3 * SIZE(AO)
  332. ADD c07, t4, c07
  333. unop
  334. MUL b4, a3, t4
  335. LD b4, 3 * SIZE(BO)
  336. /* 5 */
  337. ADD c11, t1, c11
  338. unop
  339. MUL b5, a5, t1
  340. LD a1, 4 * SIZE(AO)
  341. ADD c12, t2, c12
  342. lda L, -2(L)
  343. MUL b5, a2, t2
  344. LD b1, 4 * SIZE(BO)
  345. ADD c16, t3, c16
  346. unop
  347. MUL b2, a2, t3
  348. unop
  349. ADD c15, t4, c15
  350. unop
  351. MUL b2, a5, t4
  352. unop
  353. /* 6 */
  354. ADD c01, t1, c01
  355. unop
  356. MUL b5, a6, t1
  357. unop
  358. ADD c02, t2, c02
  359. unop
  360. MUL b5, a4, t2
  361. unop
  362. ADD c06, t3, c06
  363. unop
  364. MUL b2, a4, t3
  365. unop
  366. ADD c05, t4, c05
  367. unop
  368. MUL b4, a5, t4
  369. unop
  370. /* 7 */
  371. ADD c03, t1, c03
  372. lda AO, 8 * SIZE(AO)
  373. MUL b3, a5, t1
  374. unop
  375. ADD c04, t2, c04
  376. lda BO, 8 * SIZE(BO)
  377. MUL b3, a2, t2
  378. unop
  379. ADD c08, t3, c08
  380. unop
  381. MUL b4, a2, t3
  382. LD a2, -3 * SIZE(AO)
  383. ADD c13, t4, c13
  384. unop
  385. MUL b2, a6, t4
  386. LD b2, -3 * SIZE(BO)
  387. /* 8 */
  388. ADD c09, t1, c09
  389. unop
  390. MUL b3, a6, t1
  391. LD a3, -2 * SIZE(AO)
  392. ADD c10, t2, c10
  393. unop
  394. MUL b3, a4, t2
  395. LD b3, -2 * SIZE(BO)
  396. ADD c14, t3, c14
  397. unop
  398. MUL b4, a4, t3
  399. LD a4, -1 * SIZE(AO)
  400. ADD c07, t4, c07
  401. MUL b4, a6, t4
  402. LD b4, -1 * SIZE(BO)
  403. bgt L, $L12
  404. .align 4
  405. $L15:
  406. ADD c11, t1, c11
  407. MUL b1, a1, t1
  408. #if defined(LT) || defined(RN)
  409. blbs KK, $L17
  410. #else
  411. blbs TMP1, $L17
  412. #endif
  413. .align 4
  414. ADD c12, t2, c12
  415. MUL b1, a2, t2
  416. ADD c16, t3, c16
  417. MUL b2, a2, t3
  418. ADD c15, t4, c15
  419. MUL b2, a1, t4
  420. ADD c01, t1, c01
  421. MUL b1, a3, t1
  422. ADD c02, t2, c02
  423. unop
  424. MUL b1, a4, t2
  425. LD b1, 0 * SIZE(BO)
  426. ADD c06, t3, c06
  427. MUL b2, a4, t3
  428. ADD c05, t4, c05
  429. MUL b4, a1, t4
  430. ADD c03, t1, c03
  431. unop
  432. MUL b3, a1, t1
  433. LD a1, 0 * SIZE(AO)
  434. ADD c04, t2, c04
  435. unop
  436. MUL b3, a2, t2
  437. unop
  438. ADD c08, t3, c08
  439. unop
  440. MUL b4, a2, t3
  441. LD a2, 1 * SIZE(AO)
  442. ADD c13, t4, c13
  443. unop
  444. MUL b2, a3, t4
  445. LD b2, 1 * SIZE(BO)
  446. ADD c09, t1, c09
  447. unop
  448. MUL b3, a3, t1
  449. lda AO, 4 * SIZE(AO)
  450. ADD c10, t2, c10
  451. unop
  452. MUL b3, a4, t2
  453. LD b3, 2 * SIZE(BO)
  454. ADD c14, t3, c14
  455. unop
  456. MUL b4, a4, t3
  457. LD a4, -1 * SIZE(AO)
  458. ADD c07, t4, c07
  459. unop
  460. MUL b4, a3, t4
  461. LD a3, -2 * SIZE(AO)
  462. ADD c11, t1, c11
  463. LD b4, 3 * SIZE(BO)
  464. MUL b1, a1, t1
  465. lda BO, 4 * SIZE(BO)
  466. .align 4
  467. $L17:
  468. ADD c12, t2, c12
  469. MUL b1, a2, t2
  470. ADD c16, t3, c16
  471. MUL b2, a2, t3
  472. ADD c15, t4, c15
  473. MUL b2, a1, t4
  474. ADD c01, t1, c01
  475. MUL b1, a3, t1
  476. ADD c02, t2, c02
  477. MUL b1, a4, t2
  478. ADD c06, t3, c06
  479. MUL b2, a4, t3
  480. ADD c05, t4, c05
  481. MUL b4, a1, t4
  482. ADD c03, t1, c03
  483. MUL b3, a1, t1
  484. ADD c04, t2, c04
  485. MUL b3, a2, t2
  486. ADD c08, t3, c08
  487. MUL b4, a2, t3
  488. ADD c13, t4, c13
  489. MUL b2, a3, t4
  490. ADD c09, t1, c09
  491. MUL b3, a3, t1
  492. ADD c10, t2, c10
  493. MUL b3, a4, t2
  494. ADD c14, t3, c14
  495. MUL b4, a4, t3
  496. ADD c07, t4, c07
  497. lda AO, 4 * SIZE(AO)
  498. MUL b4, a3, t4
  499. lda BO, 4 * SIZE(BO)
  500. ADD c11, t1, c11
  501. ADD c12, t2, c12
  502. ADD c16, t3, c16
  503. ADD c15, t4, c15
  504. .align 4
  505. $L18:
  506. #if defined(LN) || defined(RT)
  507. #ifdef LN
  508. subq KK, 4, TMP1
  509. #else
  510. subq KK, 4, TMP1
  511. #endif
  512. sll TMP1, BASE_SHIFT + 2, TMP2
  513. addq AORIG, TMP2, AO
  514. sll TMP1, BASE_SHIFT + 2, TMP2
  515. addq B, TMP2, BO
  516. #else
  517. lda AO, -4 * SIZE(AO)
  518. lda BO, -4 * SIZE(BO)
  519. #endif
  520. #if defined(LN) || defined(LT)
  521. LD a1, 0 * SIZE(BO)
  522. LD a2, 1 * SIZE(BO)
  523. LD a3, 2 * SIZE(BO)
  524. LD a4, 3 * SIZE(BO)
  525. LD b1, 4 * SIZE(BO)
  526. LD b2, 5 * SIZE(BO)
  527. LD b3, 6 * SIZE(BO)
  528. LD b4, 7 * SIZE(BO)
  529. SUB a1, c01, c01
  530. SUB a2, c05, c05
  531. SUB a3, c09, c09
  532. SUB a4, c13, c13
  533. SUB b1, c02, c02
  534. SUB b2, c06, c06
  535. SUB b3, c10, c10
  536. SUB b4, c14, c14
  537. LD a1, 8 * SIZE(BO)
  538. LD a2, 9 * SIZE(BO)
  539. LD a3, 10 * SIZE(BO)
  540. LD a4, 11 * SIZE(BO)
  541. LD b1, 12 * SIZE(BO)
  542. LD b2, 13 * SIZE(BO)
  543. LD b3, 14 * SIZE(BO)
  544. LD b4, 15 * SIZE(BO)
  545. SUB a1, c03, c03
  546. SUB a2, c07, c07
  547. SUB a3, c11, c11
  548. SUB a4, c15, c15
  549. SUB b1, c04, c04
  550. SUB b2, c08, c08
  551. SUB b3, c12, c12
  552. SUB b4, c16, c16
  553. #else
  554. LD a1, 0 * SIZE(AO)
  555. LD a2, 1 * SIZE(AO)
  556. LD a3, 2 * SIZE(AO)
  557. LD a4, 3 * SIZE(AO)
  558. LD b1, 4 * SIZE(AO)
  559. LD b2, 5 * SIZE(AO)
  560. LD b3, 6 * SIZE(AO)
  561. LD b4, 7 * SIZE(AO)
  562. SUB a1, c01, c01
  563. SUB a2, c02, c02
  564. SUB a3, c03, c03
  565. SUB a4, c04, c04
  566. SUB b1, c05, c05
  567. SUB b2, c06, c06
  568. SUB b3, c07, c07
  569. SUB b4, c08, c08
  570. LD a1, 8 * SIZE(AO)
  571. LD a2, 9 * SIZE(AO)
  572. LD a3, 10 * SIZE(AO)
  573. LD a4, 11 * SIZE(AO)
  574. LD b1, 12 * SIZE(AO)
  575. LD b2, 13 * SIZE(AO)
  576. LD b3, 14 * SIZE(AO)
  577. LD b4, 15 * SIZE(AO)
  578. SUB a1, c09, c09
  579. SUB a2, c10, c10
  580. SUB a3, c11, c11
  581. SUB a4, c12, c12
  582. SUB b1, c13, c13
  583. SUB b2, c14, c14
  584. SUB b3, c15, c15
  585. SUB b4, c16, c16
  586. #endif
  587. #ifdef LN
  588. LD a1, 15 * SIZE(AO)
  589. LD a2, 14 * SIZE(AO)
  590. LD a3, 13 * SIZE(AO)
  591. LD a4, 12 * SIZE(AO)
  592. MUL a1, c04, c04
  593. MUL a1, c08, c08
  594. MUL a1, c12, c12
  595. MUL a1, c16, c16
  596. MUL a2, c04, t1
  597. MUL a2, c08, t2
  598. MUL a2, c12, t3
  599. MUL a2, c16, t4
  600. SUB c03, t1, c03
  601. SUB c07, t2, c07
  602. SUB c11, t3, c11
  603. SUB c15, t4, c15
  604. MUL a3, c04, t1
  605. MUL a3, c08, t2
  606. MUL a3, c12, t3
  607. MUL a3, c16, t4
  608. SUB c02, t1, c02
  609. SUB c06, t2, c06
  610. SUB c10, t3, c10
  611. SUB c14, t4, c14
  612. MUL a4, c04, t1
  613. MUL a4, c08, t2
  614. MUL a4, c12, t3
  615. MUL a4, c16, t4
  616. SUB c01, t1, c01
  617. SUB c05, t2, c05
  618. SUB c09, t3, c09
  619. SUB c13, t4, c13
  620. LD b1, 10 * SIZE(AO)
  621. LD b2, 9 * SIZE(AO)
  622. LD b3, 8 * SIZE(AO)
  623. MUL b1, c03, c03
  624. MUL b1, c07, c07
  625. MUL b1, c11, c11
  626. MUL b1, c15, c15
  627. MUL b2, c03, t1
  628. MUL b2, c07, t2
  629. MUL b2, c11, t3
  630. MUL b2, c15, t4
  631. SUB c02, t1, c02
  632. SUB c06, t2, c06
  633. SUB c10, t3, c10
  634. SUB c14, t4, c14
  635. MUL b3, c03, t1
  636. MUL b3, c07, t2
  637. MUL b3, c11, t3
  638. MUL b3, c15, t4
  639. SUB c01, t1, c01
  640. SUB c05, t2, c05
  641. SUB c09, t3, c09
  642. SUB c13, t4, c13
  643. LD a1, 5 * SIZE(AO)
  644. LD a2, 4 * SIZE(AO)
  645. LD a3, 0 * SIZE(AO)
  646. MUL a1, c02, c02
  647. MUL a1, c06, c06
  648. MUL a1, c10, c10
  649. MUL a1, c14, c14
  650. MUL a2, c02, t1
  651. MUL a2, c06, t2
  652. MUL a2, c10, t3
  653. MUL a2, c14, t4
  654. SUB c01, t1, c01
  655. SUB c05, t2, c05
  656. SUB c09, t3, c09
  657. SUB c13, t4, c13
  658. MUL a3, c01, c01
  659. MUL a3, c05, c05
  660. MUL a3, c09, c09
  661. MUL a3, c13, c13
  662. #endif
  663. #ifdef LT
  664. LD a1, 0 * SIZE(AO)
  665. LD a2, 1 * SIZE(AO)
  666. LD a3, 2 * SIZE(AO)
  667. LD a4, 3 * SIZE(AO)
  668. MUL a1, c01, c01
  669. MUL a1, c05, c05
  670. MUL a1, c09, c09
  671. MUL a1, c13, c13
  672. MUL a2, c01, t1
  673. MUL a2, c05, t2
  674. MUL a2, c09, t3
  675. MUL a2, c13, t4
  676. SUB c02, t1, c02
  677. SUB c06, t2, c06
  678. SUB c10, t3, c10
  679. SUB c14, t4, c14
  680. MUL a3, c01, t1
  681. MUL a3, c05, t2
  682. MUL a3, c09, t3
  683. MUL a3, c13, t4
  684. SUB c03, t1, c03
  685. SUB c07, t2, c07
  686. SUB c11, t3, c11
  687. SUB c15, t4, c15
  688. MUL a4, c01, t1
  689. MUL a4, c05, t2
  690. MUL a4, c09, t3
  691. MUL a4, c13, t4
  692. SUB c04, t1, c04
  693. SUB c08, t2, c08
  694. SUB c12, t3, c12
  695. SUB c16, t4, c16
  696. LD b1, 5 * SIZE(AO)
  697. LD b2, 6 * SIZE(AO)
  698. LD b3, 7 * SIZE(AO)
  699. MUL b1, c02, c02
  700. MUL b1, c06, c06
  701. MUL b1, c10, c10
  702. MUL b1, c14, c14
  703. MUL b2, c02, t1
  704. MUL b2, c06, t2
  705. MUL b2, c10, t3
  706. MUL b2, c14, t4
  707. SUB c03, t1, c03
  708. SUB c07, t2, c07
  709. SUB c11, t3, c11
  710. SUB c15, t4, c15
  711. MUL b3, c02, t1
  712. MUL b3, c06, t2
  713. MUL b3, c10, t3
  714. MUL b3, c14, t4
  715. SUB c04, t1, c04
  716. SUB c08, t2, c08
  717. SUB c12, t3, c12
  718. SUB c16, t4, c16
  719. LD a1, 10 * SIZE(AO)
  720. LD a2, 11 * SIZE(AO)
  721. LD a3, 15 * SIZE(AO)
  722. MUL a1, c03, c03
  723. MUL a1, c07, c07
  724. MUL a1, c11, c11
  725. MUL a1, c15, c15
  726. MUL a2, c03, t1
  727. MUL a2, c07, t2
  728. MUL a2, c11, t3
  729. MUL a2, c15, t4
  730. SUB c04, t1, c04
  731. SUB c08, t2, c08
  732. SUB c12, t3, c12
  733. SUB c16, t4, c16
  734. MUL a3, c04, c04
  735. MUL a3, c08, c08
  736. MUL a3, c12, c12
  737. MUL a3, c16, c16
  738. #endif
  739. #ifdef RN
  740. LD a1, 0 * SIZE(BO)
  741. LD a2, 1 * SIZE(BO)
  742. LD a3, 2 * SIZE(BO)
  743. LD a4, 3 * SIZE(BO)
  744. MUL a1, c01, c01
  745. MUL a1, c02, c02
  746. MUL a1, c03, c03
  747. MUL a1, c04, c04
  748. MUL a2, c01, t1
  749. MUL a2, c02, t2
  750. MUL a2, c03, t3
  751. MUL a2, c04, t4
  752. SUB c05, t1, c05
  753. SUB c06, t2, c06
  754. SUB c07, t3, c07
  755. SUB c08, t4, c08
  756. MUL a3, c01, t1
  757. MUL a3, c02, t2
  758. MUL a3, c03, t3
  759. MUL a3, c04, t4
  760. SUB c09, t1, c09
  761. SUB c10, t2, c10
  762. SUB c11, t3, c11
  763. SUB c12, t4, c12
  764. MUL a4, c01, t1
  765. MUL a4, c02, t2
  766. MUL a4, c03, t3
  767. MUL a4, c04, t4
  768. SUB c13, t1, c13
  769. SUB c14, t2, c14
  770. SUB c15, t3, c15
  771. SUB c16, t4, c16
  772. LD b1, 5 * SIZE(BO)
  773. LD b2, 6 * SIZE(BO)
  774. LD b3, 7 * SIZE(BO)
  775. MUL b1, c05, c05
  776. MUL b1, c06, c06
  777. MUL b1, c07, c07
  778. MUL b1, c08, c08
  779. MUL b2, c05, t1
  780. MUL b2, c06, t2
  781. MUL b2, c07, t3
  782. MUL b2, c08, t4
  783. SUB c09, t1, c09
  784. SUB c10, t2, c10
  785. SUB c11, t3, c11
  786. SUB c12, t4, c12
  787. MUL b3, c05, t1
  788. MUL b3, c06, t2
  789. MUL b3, c07, t3
  790. MUL b3, c08, t4
  791. SUB c13, t1, c13
  792. SUB c14, t2, c14
  793. SUB c15, t3, c15
  794. SUB c16, t4, c16
  795. LD a1, 10 * SIZE(BO)
  796. LD a2, 11 * SIZE(BO)
  797. LD a3, 15 * SIZE(BO)
  798. MUL a1, c09, c09
  799. MUL a1, c10, c10
  800. MUL a1, c11, c11
  801. MUL a1, c12, c12
  802. MUL a2, c09, t1
  803. MUL a2, c10, t2
  804. MUL a2, c11, t3
  805. MUL a2, c12, t4
  806. SUB c13, t1, c13
  807. SUB c14, t2, c14
  808. SUB c15, t3, c15
  809. SUB c16, t4, c16
  810. MUL a3, c13, c13
  811. MUL a3, c14, c14
  812. MUL a3, c15, c15
  813. MUL a3, c16, c16
  814. #endif
  815. #ifdef RT
  816. LD a1, 15 * SIZE(BO)
  817. LD a2, 14 * SIZE(BO)
  818. LD a3, 13 * SIZE(BO)
  819. LD a4, 12 * SIZE(BO)
  820. MUL a1, c13, c13
  821. MUL a1, c14, c14
  822. MUL a1, c15, c15
  823. MUL a1, c16, c16
  824. MUL a2, c13, t1
  825. MUL a2, c14, t2
  826. MUL a2, c15, t3
  827. MUL a2, c16, t4
  828. SUB c09, t1, c09
  829. SUB c10, t2, c10
  830. SUB c11, t3, c11
  831. SUB c12, t4, c12
  832. MUL a3, c13, t1
  833. MUL a3, c14, t2
  834. MUL a3, c15, t3
  835. MUL a3, c16, t4
  836. SUB c05, t1, c05
  837. SUB c06, t2, c06
  838. SUB c07, t3, c07
  839. SUB c08, t4, c08
  840. MUL a4, c13, t1
  841. MUL a4, c14, t2
  842. MUL a4, c15, t3
  843. MUL a4, c16, t4
  844. SUB c01, t1, c01
  845. SUB c02, t2, c02
  846. SUB c03, t3, c03
  847. SUB c04, t4, c04
  848. LD b1, 10 * SIZE(BO)
  849. LD b2, 9 * SIZE(BO)
  850. LD b3, 8 * SIZE(BO)
  851. MUL b1, c09, c09
  852. MUL b1, c10, c10
  853. MUL b1, c11, c11
  854. MUL b1, c12, c12
  855. MUL b2, c09, t1
  856. MUL b2, c10, t2
  857. MUL b2, c11, t3
  858. MUL b2, c12, t4
  859. SUB c05, t1, c05
  860. SUB c06, t2, c06
  861. SUB c07, t3, c07
  862. SUB c08, t4, c08
  863. MUL b3, c09, t1
  864. MUL b3, c10, t2
  865. MUL b3, c11, t3
  866. MUL b3, c12, t4
  867. SUB c01, t1, c01
  868. SUB c02, t2, c02
  869. SUB c03, t3, c03
  870. SUB c04, t4, c04
  871. LD a1, 5 * SIZE(BO)
  872. LD a2, 4 * SIZE(BO)
  873. LD a3, 0 * SIZE(BO)
  874. MUL a1, c05, c05
  875. MUL a1, c06, c06
  876. MUL a1, c07, c07
  877. MUL a1, c08, c08
  878. MUL a2, c05, t1
  879. MUL a2, c06, t2
  880. MUL a2, c07, t3
  881. MUL a2, c08, t4
  882. SUB c01, t1, c01
  883. SUB c02, t2, c02
  884. SUB c03, t3, c03
  885. SUB c04, t4, c04
  886. MUL a3, c01, c01
  887. MUL a3, c02, c02
  888. MUL a3, c03, c03
  889. MUL a3, c04, c04
  890. #endif
  891. #if defined(LN) || defined(LT)
  892. ST c01, 0 * SIZE(BO)
  893. ST c05, 1 * SIZE(BO)
  894. ST c09, 2 * SIZE(BO)
  895. ST c13, 3 * SIZE(BO)
  896. ST c02, 4 * SIZE(BO)
  897. ST c06, 5 * SIZE(BO)
  898. ST c10, 6 * SIZE(BO)
  899. ST c14, 7 * SIZE(BO)
  900. ST c03, 8 * SIZE(BO)
  901. ST c07, 9 * SIZE(BO)
  902. ST c11, 10 * SIZE(BO)
  903. ST c15, 11 * SIZE(BO)
  904. ST c04, 12 * SIZE(BO)
  905. ST c08, 13 * SIZE(BO)
  906. ST c12, 14 * SIZE(BO)
  907. ST c16, 15 * SIZE(BO)
  908. #else
  909. ST c01, 0 * SIZE(AO)
  910. ST c02, 1 * SIZE(AO)
  911. ST c03, 2 * SIZE(AO)
  912. ST c04, 3 * SIZE(AO)
  913. ST c05, 4 * SIZE(AO)
  914. ST c06, 5 * SIZE(AO)
  915. ST c07, 6 * SIZE(AO)
  916. ST c08, 7 * SIZE(AO)
  917. ST c09, 8 * SIZE(AO)
  918. ST c10, 9 * SIZE(AO)
  919. ST c11, 10 * SIZE(AO)
  920. ST c12, 11 * SIZE(AO)
  921. ST c13, 12 * SIZE(AO)
  922. ST c14, 13 * SIZE(AO)
  923. ST c15, 14 * SIZE(AO)
  924. ST c16, 15 * SIZE(AO)
  925. #endif
  926. #ifdef LN
  927. lda C1, -4 * SIZE(C1)
  928. lda C2, -4 * SIZE(C2)
  929. lda C3, -4 * SIZE(C3)
  930. lda C4, -4 * SIZE(C4)
  931. #endif
  932. ST c01, 0 * SIZE(C1)
  933. ST c02, 1 * SIZE(C1)
  934. ST c03, 2 * SIZE(C1)
  935. ST c04, 3 * SIZE(C1)
  936. ST c05, 0 * SIZE(C2)
  937. ST c06, 1 * SIZE(C2)
  938. ST c07, 2 * SIZE(C2)
  939. ST c08, 3 * SIZE(C2)
  940. ST c09, 0 * SIZE(C3)
  941. ST c10, 1 * SIZE(C3)
  942. ST c11, 2 * SIZE(C3)
  943. ST c12, 3 * SIZE(C3)
  944. ST c13, 0 * SIZE(C4)
  945. ST c14, 1 * SIZE(C4)
  946. ST c15, 2 * SIZE(C4)
  947. ST c16, 3 * SIZE(C4)
  948. #ifndef LN
  949. lda C1, 4 * SIZE(C1)
  950. lda C2, 4 * SIZE(C2)
  951. lda C3, 4 * SIZE(C3)
  952. lda C4, 4 * SIZE(C4)
  953. #endif
  954. fclr t1
  955. fclr t2
  956. fclr t3
  957. fclr t4
  958. #ifdef RT
  959. sll K, 2 + BASE_SHIFT, TMP1
  960. addq AORIG, TMP1, AORIG
  961. #endif
  962. #if defined(LT) || defined(RN)
  963. subq K, KK, TMP1
  964. sll TMP1, BASE_SHIFT + 2, TMP1
  965. addq AO, TMP1, AO
  966. addq BO, TMP1, BO
  967. #endif
  968. #ifdef LT
  969. addq KK, 4, KK
  970. #endif
  971. #ifdef LN
  972. subq KK, 4, KK
  973. #endif
  974. lda I, -1(I)
  975. bgt I, $L11
  976. .align 4
  /* $L20: set-up for a tile that is two rows of A by four columns of B.
     Entered only when bit 1 of M is set; otherwise control skips to $L30.
     Preloads a1-a4 and b1-b4, zeroes the eight accumulators
     c01/c02/c05/c06/c09/c10/c13/c14 and sets L to (product length - 2).
     A product length <= 0 jumps straight to the solve step at $L28;
     L <= 0 (length 1 or 2) bypasses the unrolled loop and enters $L25. */
  977. $L20:
  978. and M, 2, I
  979. ble I, $L30
  980. #if defined(LT) || defined(RN)
  /* LT/RN: product length is KK; A is read at the current AO, B from the
     start of the panel (B).  AO is left 2 elements and BO 4 elements past
     their starting positions. */
  981. LD a1, 0 * SIZE(AO)
  982. fclr c09
  983. LD a2, 1 * SIZE(AO)
  984. fclr c13
  985. LD a3, 2 * SIZE(AO)
  986. fclr c10
  987. LD a4, 3 * SIZE(AO)
  988. fclr c14
  989. LD b1, 0 * SIZE(B)
  990. lda L, -2(KK)
  991. LD b2, 1 * SIZE(B)
  992. lda AO, 2 * SIZE(AO)
  993. LD b3, 2 * SIZE(B)
  994. fclr c01
  995. LD b4, 3 * SIZE(B)
  996. fclr c05
  997. lda BO, 4 * SIZE(B)
  998. fclr c02
  999. fclr c06
  1000. ble KK, $L28
  1001. ble L, $L25
  1002. #else
  /* LN/RT: product length is TMP1 = K - KK.  AO = AORIG + (KK << (BASE_SHIFT + 1)),
     BO = B + (KK << (BASE_SHIFT + 2)).  LN first moves AORIG back by
     K << (BASE_SHIFT + 1). */
  1003. #ifdef LN
  1004. sll K, BASE_SHIFT + 1, TMP1
  1005. subq AORIG, TMP1, AORIG
  1006. #endif
  1007. sll KK, BASE_SHIFT + 1, TMP1
  1008. addq AORIG, TMP1, AO
  1009. sll KK, BASE_SHIFT + 2, TMP2
  1010. addq B, TMP2, BO
  1011. subq K, KK, TMP1
  1012. LD a1, 0 * SIZE(AO)
  1013. fclr c09
  1014. LD a2, 1 * SIZE(AO)
  1015. fclr c13
  1016. LD a3, 2 * SIZE(AO)
  1017. fclr c10
  1018. LD a4, 3 * SIZE(AO)
  1019. fclr c14
  1020. LD b1, 0 * SIZE(BO)
  1021. lda L, -2(TMP1)
  1022. LD b2, 1 * SIZE(BO)
  1023. lda AO, 2 * SIZE(AO)
  1024. LD b3, 2 * SIZE(BO)
  1025. fclr c01
  1026. LD b4, 3 * SIZE(BO)
  1027. fclr c05
  1028. lda BO, 4 * SIZE(BO)
  1029. fclr c02
  1030. fclr c06
  1031. ble TMP1, $L28
  1032. ble L, $L25
  1033. #endif
  1034. .align 4
  /* $L22: unrolled inner-product loop for the 2x4 tile, two k-steps per pass.
     Per pass: L decreases by 2, AO advances 4 elements, BO advances 8 elements.
     a1/a2 feed the first k-step and a3/a4 the second.  b5 holds the fourth
     B value of the second step so that b4 can be reloaded for the next pass.
     Each ADD folds in a t-register written by an earlier MUL, so accumulation
     trails the multiplies; t1-t4 are read before being written on the first
     pass and therefore must already be zero on entry.
     NOTE(review): ADD/MUL/LD are macros defined outside this view; the
     comments assume the (src, src, dst) operand order - confirm.
     The unop lines are no-ops, presumably scheduling padding. */
  1035. $L22:
  1036. ADD c09, t1, c09
  1037. unop
  1038. MUL a1, b1, t1
  1039. unop
  1040. ADD c10, t2, c10
  1041. unop
  1042. MUL a2, b1, t2
  1043. LD b1, 0 * SIZE(BO)
  1044. ADD c13, t3, c13
  1045. unop
  1046. MUL a1, b2, t3
  1047. lda BO, 8 * SIZE(BO)
  1048. ADD c14, t4, c14
  1049. unop
  1050. MUL a2, b2, t4
  1051. LD b2, -7 * SIZE(BO)
  1052. ADD c01, t1, c01
  1053. unop
  1054. MUL a1, b3, t1
  1055. unop
  1056. ADD c02, t2, c02
  1057. unop
  1058. MUL a2, b3, t2
  1059. LD b3, -6 * SIZE(BO)
  1060. ADD c05, t3, c05
  1061. unop
  1062. MUL a1, b4, t3
  1063. LD a1, 2 * SIZE(AO)
  1064. ADD c06, t4, c06
  1065. MUL a2, b4, t4
  1066. LD b5, -5 * SIZE(BO)
  /* second k-step of this pass: uses a3/a4 */
  1067. ADD c09, t1, c09
  1068. unop
  1069. MUL a3, b1, t1
  1070. LD a2, 3 * SIZE(AO)
  1071. ADD c10, t2, c10
  1072. unop
  1073. MUL a4, b1, t2
  1074. LD b1, -4 * SIZE(BO)
  1075. ADD c13, t3, c13
  1076. unop
  1077. MUL a3, b2, t3
  1078. lda AO, 4 * SIZE(AO)
  1079. ADD c14, t4, c14
  1080. MUL a4, b2, t4
  1081. LD b2, -3 * SIZE(BO)
  1082. ADD c01, t1, c01
  1083. lda L, -2(L)
  1084. MUL a3, b3, t1
  1085. LD b4, -1 * SIZE(BO)
  1086. ADD c02, t2, c02
  1087. unop
  1088. MUL a4, b3, t2
  1089. LD b3, -2 * SIZE(BO)
  1090. ADD c05, t3, c05
  1091. unop
  1092. MUL a3, b5, t3
  1093. LD a3, 0 * SIZE(AO)
  1094. ADD c06, t4, c06
  1095. MUL a4, b5, t4
  1096. LD a4, 1 * SIZE(AO)
  1097. bgt L, $L22
  1098. .align 4
  /* $L25: tail of the 2x4 inner-product loop.
     Folds the pending t1 into c09 and starts the next a1*b1 product, then
     tests the low bit of the product length (KK for LT/RN, TMP1 otherwise).
     Odd length: exactly one k-step is left, so branch to $L27.
     Even length: fall through and perform one k-step here (AO += 2 elements,
     BO += 4 elements) before the last step is finished at $L27. */
  1099. $L25:
  1100. ADD c09, t1, c09
  1101. MUL a1, b1, t1
  1102. #if defined(LT) || defined(RN)
  1103. blbs KK, $L27
  1104. #else
  1105. blbs TMP1, $L27
  1106. #endif
  1107. ADD c10, t2, c10
  1108. unop
  1109. MUL a2, b1, t2
  1110. LD b1, 0 * SIZE(BO)
  1111. ADD c13, t3, c13
  1112. unop
  1113. MUL a1, b2, t3
  1114. unop
  1115. ADD c14, t4, c14
  1116. unop
  1117. MUL a2, b2, t4
  1118. LD b2, 1 * SIZE(BO)
  1119. ADD c01, t1, c01
  1120. unop
  1121. MUL a1, b3, t1
  1122. lda AO, 2 * SIZE(AO)
  1123. ADD c02, t2, c02
  1124. unop
  1125. MUL a2, b3, t2
  1126. LD b3, 2 * SIZE(BO)
  1127. ADD c05, t3, c05
  1128. unop
  1129. MUL a1, b4, t3
  1130. LD a1, -2 * SIZE(AO)
  1131. ADD c06, t4, c06
  1132. unop
  1133. MUL a2, b4, t4
  1134. LD a2, -1 * SIZE(AO)
  1135. ADD c09, t1, c09
  1136. LD b4, 3 * SIZE(BO)
  1137. MUL a1, b1, t1
  1138. lda BO, 4 * SIZE(BO)
  1139. .align 4
  /* $L27: last k-step of the 2x4 tile and pipeline drain.
     Finishes the remaining products of a1/a2 with b1-b4, advances AO by 2
     and BO by 4 elements, then adds the outstanding t1-t4 into
     c09/c10/c13/c14 so that every accumulator is complete at $L28. */
  1140. $L27:
  1141. ADD c10, t2, c10
  1142. MUL a2, b1, t2
  1143. ADD c13, t3, c13
  1144. MUL a1, b2, t3
  1145. ADD c14, t4, c14
  1146. MUL a2, b2, t4
  1147. ADD c01, t1, c01
  1148. MUL a1, b3, t1
  1149. ADD c02, t2, c02
  1150. MUL a2, b3, t2
  1151. ADD c05, t3, c05
  1152. MUL a1, b4, t3
  1153. ADD c06, t4, c06
  1154. lda AO, 2 * SIZE(AO)
  1155. MUL a2, b4, t4
  1156. lda BO, 4 * SIZE(BO)
  /* drain: the last four products have not been accumulated yet */
  1157. ADD c09, t1, c09
  1158. ADD c10, t2, c10
  1159. ADD c13, t3, c13
  1160. ADD c14, t4, c14
  1161. .align 4
  /* $L28: solve and write-back for the 2x4 tile.
     Steps, in order:
       1. Re-point AO/BO at the data for this tile.
       2. Load the eight stored right-hand-side values and replace each
          accumulator by (stored value - accumulator).
       3. Apply the triangular factor selected by LN / LT / RN / RT.
       4. Store the results back into the packed buffer (BO for LN/LT,
          AO for RN/RT) and into the C rows addressed by C1-C4.
       5. Clear t1-t4 and update AORIG / AO / BO / KK for the next tile.
     NOTE(review): diagonal factors are applied with MUL, never a divide;
     presumably the packing step stored reciprocals - confirm. */
  1162. $L28:
  1163. #if defined(LN) || defined(RT)
  /* LN/RT: TMP1 = KK - 2 (LN) or KK - 4 (RT); AO and BO are recomputed
     from AORIG and B using that index. */
  1164. #ifdef LN
  1165. subq KK, 2, TMP1
  1166. #else
  1167. subq KK, 4, TMP1
  1168. #endif
  1169. sll TMP1, BASE_SHIFT + 1, TMP2
  1170. addq AORIG, TMP2, AO
  1171. sll TMP1, BASE_SHIFT + 2, TMP2
  1172. addq B, TMP2, BO
  1173. #else
  /* LT/RN: step AO/BO back by the 2 and 4 elements added by the last k-step */
  1174. lda AO, -2 * SIZE(AO)
  1175. lda BO, -4 * SIZE(BO)
  1176. #endif
  1177. #if defined(LN) || defined(LT)
  /* LN/LT: right-hand side comes from BO, ordered c01,c05,c09,c13,c02,c06,c10,c14 */
  1178. LD a1, 0 * SIZE(BO)
  1179. LD a2, 1 * SIZE(BO)
  1180. LD a3, 2 * SIZE(BO)
  1181. LD a4, 3 * SIZE(BO)
  1182. LD b1, 4 * SIZE(BO)
  1183. LD b2, 5 * SIZE(BO)
  1184. LD b3, 6 * SIZE(BO)
  1185. LD b4, 7 * SIZE(BO)
  1186. SUB a1, c01, c01
  1187. SUB a2, c05, c05
  1188. SUB a3, c09, c09
  1189. SUB a4, c13, c13
  1190. SUB b1, c02, c02
  1191. SUB b2, c06, c06
  1192. SUB b3, c10, c10
  1193. SUB b4, c14, c14
  1194. #else
  /* RN/RT: right-hand side comes from AO, ordered c01,c02,c05,c06,c09,c10,c13,c14 */
  1195. LD a1, 0 * SIZE(AO)
  1196. LD a2, 1 * SIZE(AO)
  1197. LD a3, 2 * SIZE(AO)
  1198. LD a4, 3 * SIZE(AO)
  1199. LD b1, 4 * SIZE(AO)
  1200. LD b2, 5 * SIZE(AO)
  1201. LD b3, 6 * SIZE(AO)
  1202. LD b4, 7 * SIZE(AO)
  1203. SUB a1, c01, c01
  1204. SUB a2, c02, c02
  1205. SUB a3, c05, c05
  1206. SUB a4, c06, c06
  1207. SUB b1, c09, c09
  1208. SUB b2, c10, c10
  1209. SUB b3, c13, c13
  1210. SUB b4, c14, c14
  1211. #endif
  1212. #ifdef LN
  /* LN: 2x2 factor read from AO[3], AO[2], AO[0]; second row is solved
     first, then eliminated from the first row. */
  1213. LD a1, 3 * SIZE(AO)
  1214. LD a2, 2 * SIZE(AO)
  1215. LD a3, 0 * SIZE(AO)
  1216. MUL a1, c02, c02
  1217. MUL a1, c06, c06
  1218. MUL a1, c10, c10
  1219. MUL a1, c14, c14
  1220. MUL a2, c02, t1
  1221. MUL a2, c06, t2
  1222. MUL a2, c10, t3
  1223. MUL a2, c14, t4
  1224. SUB c01, t1, c01
  1225. SUB c05, t2, c05
  1226. SUB c09, t3, c09
  1227. SUB c13, t4, c13
  1228. MUL a3, c01, c01
  1229. MUL a3, c05, c05
  1230. MUL a3, c09, c09
  1231. MUL a3, c13, c13
  1232. #endif
  1233. #ifdef LT
  /* LT: 2x2 factor read from AO[0], AO[1], AO[3]; first row is solved
     first, then eliminated from the second row. */
  1234. LD a1, 0 * SIZE(AO)
  1235. LD a2, 1 * SIZE(AO)
  1236. LD a3, 3 * SIZE(AO)
  1237. MUL a1, c01, c01
  1238. MUL a1, c05, c05
  1239. MUL a1, c09, c09
  1240. MUL a1, c13, c13
  1241. MUL a2, c01, t1
  1242. MUL a2, c05, t2
  1243. MUL a2, c09, t3
  1244. MUL a2, c13, t4
  1245. SUB c02, t1, c02
  1246. SUB c06, t2, c06
  1247. SUB c10, t3, c10
  1248. SUB c14, t4, c14
  1249. MUL a3, c02, c02
  1250. MUL a3, c06, c06
  1251. MUL a3, c10, c10
  1252. MUL a3, c14, c14
  1253. #endif
  1254. #ifdef RN
  /* RN: 4x4 factor read from BO[0..3], BO[5..7], BO[10..11], BO[15];
     columns are solved in the order c01/c02, c05/c06, c09/c10, c13/c14. */
  1255. LD a1, 0 * SIZE(BO)
  1256. LD a2, 1 * SIZE(BO)
  1257. LD a3, 2 * SIZE(BO)
  1258. LD a4, 3 * SIZE(BO)
  1259. MUL a1, c01, c01
  1260. MUL a1, c02, c02
  1261. MUL a2, c01, t1
  1262. MUL a2, c02, t2
  1263. SUB c05, t1, c05
  1264. SUB c06, t2, c06
  1265. MUL a3, c01, t1
  1266. MUL a3, c02, t2
  1267. SUB c09, t1, c09
  1268. SUB c10, t2, c10
  1269. MUL a4, c01, t1
  1270. MUL a4, c02, t2
  1271. SUB c13, t1, c13
  1272. SUB c14, t2, c14
  1273. LD b1, 5 * SIZE(BO)
  1274. LD b2, 6 * SIZE(BO)
  1275. LD b3, 7 * SIZE(BO)
  1276. MUL b1, c05, c05
  1277. MUL b1, c06, c06
  1278. MUL b2, c05, t1
  1279. MUL b2, c06, t2
  1280. SUB c09, t1, c09
  1281. SUB c10, t2, c10
  1282. MUL b3, c05, t1
  1283. MUL b3, c06, t2
  1284. SUB c13, t1, c13
  1285. SUB c14, t2, c14
  1286. LD a1, 10 * SIZE(BO)
  1287. LD a2, 11 * SIZE(BO)
  1288. LD a3, 15 * SIZE(BO)
  1289. MUL a1, c09, c09
  1290. MUL a1, c10, c10
  1291. MUL a2, c09, t1
  1292. MUL a2, c10, t2
  1293. SUB c13, t1, c13
  1294. SUB c14, t2, c14
  1295. MUL a3, c13, c13
  1296. MUL a3, c14, c14
  1297. #endif
  1298. #ifdef RT
  /* RT: same 4x4 factor walked backwards, from BO[15] down to BO[0];
     columns are solved in the order c13/c14, c09/c10, c05/c06, c01/c02. */
  1299. LD a1, 15 * SIZE(BO)
  1300. LD a2, 14 * SIZE(BO)
  1301. LD a3, 13 * SIZE(BO)
  1302. LD a4, 12 * SIZE(BO)
  1303. MUL a1, c13, c13
  1304. MUL a1, c14, c14
  1305. MUL a2, c13, t1
  1306. MUL a2, c14, t2
  1307. SUB c09, t1, c09
  1308. SUB c10, t2, c10
  1309. MUL a3, c13, t1
  1310. MUL a3, c14, t2
  1311. SUB c05, t1, c05
  1312. SUB c06, t2, c06
  1313. MUL a4, c13, t1
  1314. MUL a4, c14, t2
  1315. SUB c01, t1, c01
  1316. SUB c02, t2, c02
  1317. LD b1, 10 * SIZE(BO)
  1318. LD b2, 9 * SIZE(BO)
  1319. LD b3, 8 * SIZE(BO)
  1320. MUL b1, c09, c09
  1321. MUL b1, c10, c10
  1322. MUL b2, c09, t1
  1323. MUL b2, c10, t2
  1324. SUB c05, t1, c05
  1325. SUB c06, t2, c06
  1326. MUL b3, c09, t1
  1327. MUL b3, c10, t2
  1328. SUB c01, t1, c01
  1329. SUB c02, t2, c02
  1330. LD a1, 5 * SIZE(BO)
  1331. LD a2, 4 * SIZE(BO)
  1332. LD a3, 0 * SIZE(BO)
  1333. MUL a1, c05, c05
  1334. MUL a1, c06, c06
  1335. MUL a2, c05, t1
  1336. MUL a2, c06, t2
  1337. SUB c01, t1, c01
  1338. SUB c02, t2, c02
  1339. MUL a3, c01, c01
  1340. MUL a3, c02, c02
  1341. #endif
  /* write the solved values back to the packed buffer, using the same
     ordering as the loads at the top of this section */
  1342. #if defined(LN) || defined(LT)
  1343. ST c01, 0 * SIZE(BO)
  1344. ST c05, 1 * SIZE(BO)
  1345. ST c09, 2 * SIZE(BO)
  1346. ST c13, 3 * SIZE(BO)
  1347. ST c02, 4 * SIZE(BO)
  1348. ST c06, 5 * SIZE(BO)
  1349. ST c10, 6 * SIZE(BO)
  1350. ST c14, 7 * SIZE(BO)
  1351. #else
  1352. ST c01, 0 * SIZE(AO)
  1353. ST c02, 1 * SIZE(AO)
  1354. ST c05, 2 * SIZE(AO)
  1355. ST c06, 3 * SIZE(AO)
  1356. ST c09, 4 * SIZE(AO)
  1357. ST c10, 5 * SIZE(AO)
  1358. ST c13, 6 * SIZE(AO)
  1359. ST c14, 7 * SIZE(AO)
  1360. #endif
  /* LN moves C1-C4 back by two elements before storing; every other
     variant stores first and then advances them by two. */
  1361. #ifdef LN
  1362. lda C1, -2 * SIZE(C1)
  1363. lda C2, -2 * SIZE(C2)
  1364. lda C3, -2 * SIZE(C3)
  1365. lda C4, -2 * SIZE(C4)
  1366. #endif
  1367. ST c01, 0 * SIZE(C1)
  1368. ST c02, 1 * SIZE(C1)
  1369. ST c05, 0 * SIZE(C2)
  1370. ST c06, 1 * SIZE(C2)
  1371. ST c09, 0 * SIZE(C3)
  1372. ST c10, 1 * SIZE(C3)
  1373. ST c13, 0 * SIZE(C4)
  1374. ST c14, 1 * SIZE(C4)
  1375. #ifndef LN
  1376. lda C1, 2 * SIZE(C1)
  1377. lda C2, 2 * SIZE(C2)
  1378. lda C3, 2 * SIZE(C3)
  1379. lda C4, 2 * SIZE(C4)
  1380. #endif
  /* t1-t4 must be zero before the next tile starts accumulating */
  1381. fclr t1
  1382. fclr t2
  1383. fclr t3
  1384. fclr t4
  1385. #ifdef RT
  1386. sll K, 1 + BASE_SHIFT, TMP1
  1387. addq AORIG, TMP1, AORIG
  1388. #endif
  1389. #if defined(LT) || defined(RN)
  /* skip the remaining (K - KK) k-steps: 2 elements of A and 4 of B per step */
  1390. subq K, KK, TMP1
  1391. sll TMP1, BASE_SHIFT + 1, TMP2
  1392. addq AO, TMP2, AO
  1393. sll TMP1, BASE_SHIFT + 2, TMP2
  1394. addq BO, TMP2, BO
  1395. #endif
  1396. #ifdef LT
  1397. addq KK, 2, KK
  1398. #endif
  1399. #ifdef LN
  1400. subq KK, 2, KK
  1401. #endif
  1402. .align 4
  /* $L30: set-up for a tile that is one row of A by four columns of B.
     Entered only when bit 0 of M is set; otherwise control skips to $L39.
     Preloads a1/a2 (two consecutive A values) and b1-b4, zeroes the four
     accumulators c01/c05/c09/c13 and sets L to (product length - 2).
     A product length <= 0 jumps to the solve step at $L38; L <= 0 bypasses
     the unrolled loop and enters the tail at $L35. */
  1403. $L30:
  1404. and M, 1, I
  1405. ble I, $L39
  1406. #if defined(LT) || defined(RN)
  /* LT/RN: product length is KK; B is read from the start of the panel */
  1407. LD a1, 0 * SIZE(AO)
  1408. fclr c01
  1409. LD a2, 1 * SIZE(AO)
  1410. fclr c05
  1411. LD b1, 0 * SIZE(B)
  1412. lda L, -2(KK)
  1413. LD b2, 1 * SIZE(B)
  1414. lda AO, 1 * SIZE(AO)
  1415. LD b3, 2 * SIZE(B)
  1416. fclr c09
  1417. LD b4, 3 * SIZE(B)
  1418. fclr c13
  1419. lda BO, 4 * SIZE(B)
  1420. ble KK, $L38
  1421. ble L, $L35
  1422. #else
  /* LN/RT: product length is TMP1 = K - KK.  AO = AORIG + (KK << BASE_SHIFT),
     BO = B + (KK << (BASE_SHIFT + 2)).  LN first moves AORIG back by
     K << BASE_SHIFT. */
  1423. #ifdef LN
  1424. sll K, BASE_SHIFT + 0, TMP1
  1425. subq AORIG, TMP1, AORIG
  1426. #endif
  1427. sll KK, BASE_SHIFT + 0, TMP1
  1428. addq AORIG, TMP1, AO
  1429. sll KK, BASE_SHIFT + 2, TMP2
  1430. addq B, TMP2, BO
  1431. subq K, KK, TMP1
  1432. LD a1, 0 * SIZE(AO)
  1433. fclr c01
  1434. LD a2, 1 * SIZE(AO)
  1435. fclr c05
  1436. LD b1, 0 * SIZE(BO)
  1437. lda L, -2(TMP1)
  1438. LD b2, 1 * SIZE(BO)
  1439. lda AO, 1 * SIZE(AO)
  1440. LD b3, 2 * SIZE(BO)
  1441. fclr c09
  1442. LD b4, 3 * SIZE(BO)
  1443. fclr c13
  1444. lda BO, 4 * SIZE(BO)
  1445. ble TMP1, $L38
  1446. ble L, $L35
  1447. #endif
  1448. .align 4
  /* $L32: unrolled inner-product loop for the 1x4 tile, two k-steps per pass.
     Per pass: L decreases by 2, AO advances 2 elements, BO advances 8 elements.
     a1 feeds the first k-step and a2 the second; b5 holds the fourth B value
     of the second step so that b4 can be reloaded for the next pass.
     Each ADD folds in a t-register written by an earlier MUL; t1-t4 are read
     before being written on the first pass, so they must be zero on entry. */
  1449. $L32:
  1450. ADD c01, t1, c01
  1451. lda L, -2(L)
  1452. MUL a1, b1, t1
  1453. LD b1, 0 * SIZE(BO)
  1454. ADD c05, t2, c05
  1455. lda AO, 2 * SIZE(AO)
  1456. MUL a1, b2, t2
  1457. LD b2, 1 * SIZE(BO)
  1458. ADD c09, t3, c09
  1459. LD b5, 3 * SIZE(BO)
  1460. MUL a1, b3, t3
  1461. LD b3, 2 * SIZE(BO)
  1462. ADD c13, t4, c13
  1463. MUL a1, b4, t4
  1464. LD a1, -1 * SIZE(AO)
  /* second k-step of this pass: uses a2 */
  1465. ADD c01, t1, c01
  1466. MUL a2, b1, t1
  1467. LD b1, 4 * SIZE(BO)
  1468. lda BO, 8 * SIZE(BO)
  1469. ADD c05, t2, c05
  1470. MUL a2, b2, t2
  1471. LD b2, -3 * SIZE(BO)
  1472. ADD c09, t3, c09
  1473. LD b4, -1 * SIZE(BO)
  1474. MUL a2, b3, t3
  1475. LD b3, -2 * SIZE(BO)
  1476. ADD c13, t4, c13
  1477. MUL a2, b5, t4
  1478. LD a2, 0 * SIZE(AO)
  1479. bgt L, $L32
  1480. .align 4
  /* $L35: tail of the 1x4 inner-product loop.
     Folds the pending t1 into c01 and starts the next a1*b1 product, then
     tests the low bit of the product length (KK for LT/RN, TMP1 otherwise).
     Odd length: one k-step is left, so branch to $L37.
     Even length: fall through and perform one k-step here (AO += 1 element,
     BO += 4 elements) before the last step is finished at $L37. */
  1481. $L35:
  1482. ADD c01, t1, c01
  1483. MUL a1, b1, t1
  1484. #if defined(LT) || defined(RN)
  1485. blbs KK, $L37
  1486. #else
  1487. blbs TMP1, $L37
  1488. #endif
  1489. .align 4
  1490. ADD c05, t2, c05
  1491. LD b1, 0 * SIZE(BO)
  1492. MUL a1, b2, t2
  1493. LD b2, 1 * SIZE(BO)
  1494. ADD c09, t3, c09
  1495. MUL a1, b3, t3
  1496. LD b3, 2 * SIZE(BO)
  1497. ADD c13, t4, c13
  1498. MUL a1, b4, t4
  1499. LD a1, 0 * SIZE(AO)
  1500. lda AO, 1 * SIZE(AO)
  1501. ADD c01, t1, c01
  1502. LD b4, 3 * SIZE(BO)
  1503. MUL a1, b1, t1
  1504. lda BO, 4 * SIZE(BO)
  1505. .align 4
  /* $L37: last k-step of the 1x4 tile and pipeline drain.
     Finishes the products of a1 with b2-b4, advances AO by 1 and BO by 4
     elements, then adds the outstanding t1-t4 into c01/c05/c09/c13. */
  1506. $L37:
  1507. ADD c05, t2, c05
  1508. MUL a1, b2, t2
  1509. ADD c09, t3, c09
  1510. MUL a1, b3, t3
  1511. ADD c13, t4, c13
  1512. lda AO, 1 * SIZE(AO)
  1513. MUL a1, b4, t4
  1514. lda BO, 4 * SIZE(BO)
  /* drain: the last four products have not been accumulated yet */
  1515. ADD c01, t1, c01
  1516. ADD c05, t2, c05
  1517. ADD c09, t3, c09
  1518. ADD c13, t4, c13
  /* $L38: solve and write-back for the 1x4 tile.
     Re-points AO/BO at this tile, replaces each accumulator by
     (stored value - accumulator), applies the factor selected by
     LN / LT / RN / RT, stores the four results to the packed buffer and to
     C1-C4, then updates AORIG / AO / BO / KK for the next tile.
     Unlike the wider tiles, this section does not clear t1-t4 on exit.
     NOTE(review): diagonal factors are applied with MUL, never a divide;
     presumably the packing step stored reciprocals - confirm. */
  1519. $L38:
  1520. #if defined(LN) || defined(RT)
  /* LN/RT: TMP1 = KK - 1 (LN) or KK - 4 (RT); AO and BO are recomputed
     from AORIG and B using that index. */
  1521. #ifdef LN
  1522. subq KK, 1, TMP1
  1523. #else
  1524. subq KK, 4, TMP1
  1525. #endif
  1526. sll TMP1, BASE_SHIFT + 0, TMP2
  1527. addq AORIG, TMP2, AO
  1528. sll TMP1, BASE_SHIFT + 2, TMP2
  1529. addq B, TMP2, BO
  1530. #else
  /* LT/RN: step AO/BO back by the 1 and 4 elements added by the last k-step */
  1531. lda AO, -1 * SIZE(AO)
  1532. lda BO, -4 * SIZE(BO)
  1533. #endif
  /* right-hand side: read from BO for LN/LT, from AO for RN/RT */
  1534. #if defined(LN) || defined(LT)
  1535. LD a1, 0 * SIZE(BO)
  1536. LD a2, 1 * SIZE(BO)
  1537. LD a3, 2 * SIZE(BO)
  1538. LD a4, 3 * SIZE(BO)
  1539. SUB a1, c01, c01
  1540. SUB a2, c05, c05
  1541. SUB a3, c09, c09
  1542. SUB a4, c13, c13
  1543. #else
  1544. LD a1, 0 * SIZE(AO)
  1545. LD a2, 1 * SIZE(AO)
  1546. LD a3, 2 * SIZE(AO)
  1547. LD a4, 3 * SIZE(AO)
  1548. SUB a1, c01, c01
  1549. SUB a2, c05, c05
  1550. SUB a3, c09, c09
  1551. SUB a4, c13, c13
  1552. #endif
  1553. #if defined(LN) || defined(LT)
  /* LN/LT: the factor is a single value at AO[0], applied to all four results */
  1554. LD a1, 0 * SIZE(AO)
  1555. MUL a1, c01, c01
  1556. MUL a1, c05, c05
  1557. MUL a1, c09, c09
  1558. MUL a1, c13, c13
  1559. #endif
  1560. #ifdef RN
  /* RN: 4x4 factor read from BO[0..3], BO[5..7], BO[10..11], BO[15];
     solved in the order c01, c05, c09, c13. */
  1561. LD a1, 0 * SIZE(BO)
  1562. LD a2, 1 * SIZE(BO)
  1563. LD a3, 2 * SIZE(BO)
  1564. LD a4, 3 * SIZE(BO)
  1565. MUL a1, c01, c01
  1566. MUL a2, c01, t1
  1567. SUB c05, t1, c05
  1568. MUL a3, c01, t1
  1569. SUB c09, t1, c09
  1570. MUL a4, c01, t1
  1571. SUB c13, t1, c13
  1572. LD b1, 5 * SIZE(BO)
  1573. LD b2, 6 * SIZE(BO)
  1574. LD b3, 7 * SIZE(BO)
  1575. MUL b1, c05, c05
  1576. MUL b2, c05, t1
  1577. SUB c09, t1, c09
  1578. MUL b3, c05, t1
  1579. SUB c13, t1, c13
  1580. LD a1, 10 * SIZE(BO)
  1581. LD a2, 11 * SIZE(BO)
  1582. LD a3, 15 * SIZE(BO)
  1583. MUL a1, c09, c09
  1584. MUL a2, c09, t1
  1585. SUB c13, t1, c13
  1586. MUL a3, c13, c13
  1587. #endif
  1588. #ifdef RT
  /* RT: same 4x4 factor walked backwards, from BO[15] down to BO[0];
     solved in the order c13, c09, c05, c01. */
  1589. LD a1, 15 * SIZE(BO)
  1590. LD a2, 14 * SIZE(BO)
  1591. LD a3, 13 * SIZE(BO)
  1592. LD a4, 12 * SIZE(BO)
  1593. MUL a1, c13, c13
  1594. MUL a2, c13, t1
  1595. SUB c09, t1, c09
  1596. MUL a3, c13, t1
  1597. SUB c05, t1, c05
  1598. MUL a4, c13, t1
  1599. SUB c01, t1, c01
  1600. LD b1, 10 * SIZE(BO)
  1601. LD b2, 9 * SIZE(BO)
  1602. LD b3, 8 * SIZE(BO)
  1603. MUL b1, c09, c09
  1604. MUL b2, c09, t1
  1605. SUB c05, t1, c05
  1606. MUL b3, c09, t1
  1607. SUB c01, t1, c01
  1608. LD a1, 5 * SIZE(BO)
  1609. LD a2, 4 * SIZE(BO)
  1610. LD a3, 0 * SIZE(BO)
  1611. MUL a1, c05, c05
  1612. MUL a2, c05, t1
  1613. SUB c01, t1, c01
  1614. MUL a3, c01, c01
  1615. #endif
  /* write the solved values back to the packed buffer */
  1616. #if defined(LN) || defined(LT)
  1617. ST c01, 0 * SIZE(BO)
  1618. ST c05, 1 * SIZE(BO)
  1619. ST c09, 2 * SIZE(BO)
  1620. ST c13, 3 * SIZE(BO)
  1621. #else
  1622. ST c01, 0 * SIZE(AO)
  1623. ST c05, 1 * SIZE(AO)
  1624. ST c09, 2 * SIZE(AO)
  1625. ST c13, 3 * SIZE(AO)
  1626. #endif
  /* LN moves C1-C4 back by one element before storing; the other variants
     do not adjust C1-C4 in this section. */
  1627. #ifdef LN
  1628. lda C1, -1 * SIZE(C1)
  1629. lda C2, -1 * SIZE(C2)
  1630. lda C3, -1 * SIZE(C3)
  1631. lda C4, -1 * SIZE(C4)
  1632. #endif
  1633. ST c01, 0 * SIZE(C1)
  1634. ST c05, 0 * SIZE(C2)
  1635. ST c09, 0 * SIZE(C3)
  1636. ST c13, 0 * SIZE(C4)
  1637. #ifdef RT
  1638. sll K, 0 + BASE_SHIFT, TMP1
  1639. addq AORIG, TMP1, AORIG
  1640. #endif
  1641. #if defined(LT) || defined(RN)
  /* skip the remaining (K - KK) k-steps: 1 element of A and 4 of B per step */
  1642. subq K, KK, TMP1
  1643. sll TMP1, BASE_SHIFT + 0, TMP2
  1644. addq AO, TMP2, AO
  1645. sll TMP1, BASE_SHIFT + 2, TMP2
  1646. addq BO, TMP2, BO
  1647. #endif
  1648. #ifdef LT
  1649. addq KK, 1, KK
  1650. #endif
  1651. #ifdef LN
  1652. subq KK, 1, KK
  1653. #endif
  1654. .align 4
  /* $L39: end of one pass over a four-column panel.
     LN advances B by K << (2 + BASE_SHIFT); LT/RN set B to the current BO.
     RN adds 4 to KK and RT subtracts 4.  J is then decremented and control
     returns to $L01 while J is still positive. */
  1655. $L39:
  1656. #ifdef LN
  1657. sll K, 2 + BASE_SHIFT, TMP1
  1658. addq B, TMP1, B
  1659. #endif
  1660. #if defined(LT) || defined(RN)
  1661. mov BO, B
  1662. #endif
  1663. #ifdef RN
  1664. addq KK, 4, KK
  1665. #endif
  1666. #ifdef RT
  1667. subq KK, 4, KK
  1668. #endif
  1669. lda J, -1(J)
  1670. bgt J, $L01
  1671. .align 4
  /* $L40: set-up for a two-column panel.
     Entered only when bit 1 of N is set; otherwise control skips to $L80.
     RT first moves B back by K << (1 + BASE_SHIFT) and C back by 2 * LDC.
     C1 = C and C2 = C + LDC address the two output columns; unless RT is
     defined, C is advanced to C2 + LDC.
     KK starts at M + OFFSET for LN and at OFFSET for LT.
     LN/RT keep the A base in AORIG, the other variants in AO.
     I = M >> 2 counts the 4-row tiles; with none, control skips to $L60.
     t1-t4 are cleared here for the loops that follow. */
  1672. $L40:
  1673. and N, 2, J
  1674. ble J, $L80
  1675. #ifdef RT
  1676. sll K, 1 + BASE_SHIFT, TMP1
  1677. subq B, TMP1, B
  1678. addq LDC, LDC, TMP1
  1679. subq C, TMP1, C
  1680. #endif
  1681. mov C, C1
  1682. addq C, LDC, C2
  1683. fclr t1
  1684. #ifndef RT
  1685. addq C2, LDC, C
  1686. #endif
  1687. fclr t2
  1688. #ifdef LN
  1689. addq M, OFFSET, KK
  1690. #endif
  1691. #ifdef LT
  1692. mov OFFSET, KK
  1693. #endif
  1694. #if defined(LN) || defined(RT)
  1695. mov A, AORIG
  1696. #else
  1697. mov A, AO
  1698. #endif
  1699. sra M, 2, I
  1700. fclr t3
  1701. fclr t4
  1702. ble I, $L60
  1703. .align 4
  /* $L51: set-up for a tile that is four rows of A by two columns of B.
     Preloads a1-a4 and four consecutive B values b1-b4 (BO is advanced by
     only two elements, so b3/b4 belong to the following k-step), zeroes the
     accumulators c01-c08 and sets L to (product length - 2).
     A product length <= 0 jumps to the solve step at $L58; L <= 0 bypasses
     the unrolled loop and enters the tail at $L55. */
  1704. $L51:
  1705. #if defined(LT) || defined(RN)
  /* LT/RN: product length is KK; B is read from the start of the panel */
  1706. LD a1, 0 * SIZE(AO)
  1707. fclr c03
  1708. LD a2, 1 * SIZE(AO)
  1709. fclr c07
  1710. LD a3, 2 * SIZE(AO)
  1711. fclr c04
  1712. LD a4, 3 * SIZE(AO)
  1713. fclr c08
  1714. LD b1, 0 * SIZE(B)
  1715. fclr c01
  1716. LD b2, 1 * SIZE(B)
  1717. fclr c05
  1718. LD b3, 2 * SIZE(B)
  1719. fclr c02
  1720. LD b4, 3 * SIZE(B)
  1721. fclr c06
  1722. lda L, -2(KK)
  1723. lda BO, 2 * SIZE(B)
  1724. lda AO, 4 * SIZE(AO)
  1725. ble KK, $L58
  1726. ble L, $L55
  1727. #else
  /* LN/RT: product length is TMP1 = K - KK.  AO = AORIG + (KK << (BASE_SHIFT + 2)),
     BO = B + (KK << (BASE_SHIFT + 1)).  LN first moves AORIG back by
     K << (BASE_SHIFT + 2). */
  1728. #ifdef LN
  1729. sll K, BASE_SHIFT + 2, TMP1
  1730. subq AORIG, TMP1, AORIG
  1731. #endif
  1732. sll KK, BASE_SHIFT + 2, TMP1
  1733. addq AORIG, TMP1, AO
  1734. sll KK, BASE_SHIFT + 1, TMP1
  1735. addq B, TMP1, BO
  1736. subq K, KK, TMP1
  1737. LD a1, 0 * SIZE(AO)
  1738. fclr c03
  1739. LD a2, 1 * SIZE(AO)
  1740. fclr c07
  1741. LD a3, 2 * SIZE(AO)
  1742. fclr c04
  1743. LD a4, 3 * SIZE(AO)
  1744. fclr c08
  1745. LD b1, 0 * SIZE(BO)
  1746. fclr c01
  1747. LD b2, 1 * SIZE(BO)
  1748. fclr c05
  1749. LD b3, 2 * SIZE(BO)
  1750. fclr c02
  1751. LD b4, 3 * SIZE(BO)
  1752. fclr c06
  1753. lda L, -2(TMP1)
  1754. lda BO, 2 * SIZE(BO)
  1755. lda AO, 4 * SIZE(AO)
  1756. ble TMP1, $L58
  1757. ble L, $L55
  1758. #endif
  1759. .align 4
  /* $L52: unrolled inner-product loop for the 4x2 tile, two k-steps per pass.
     Per pass: L decreases by 2, BO advances 4 elements, AO advances 8 elements.
     b1/b2 feed the first k-step and b3/b4 the second.  a5 holds the fourth
     A value of the second step so that a4 can be reloaded for the next pass.
     Each ADD folds in a t-register written by an earlier MUL; t1-t4 are read
     before being written on the first pass, so they must be zero on entry.
     The unop lines are no-ops, presumably scheduling padding. */
  1760. $L52:
  1761. ADD c05, t1, c05
  1762. unop
  1763. MUL a1, b1, t1
  1764. unop
  1765. ADD c06, t2, c06
  1766. lda L, -2(L)
  1767. MUL a2, b1, t2
  1768. unop
  1769. ADD c07, t3, c07
  1770. unop
  1771. MUL a3, b1, t3
  1772. unop
  1773. ADD c08, t4, c08
  1774. unop
  1775. MUL a4, b1, t4
  1776. LD b1, 2 * SIZE(BO)
  1777. ADD c01, t1, c01
  1778. unop
  1779. MUL a1, b2, t1
  1780. LD a1, 0 * SIZE(AO)
  1781. ADD c02, t2, c02
  1782. lda BO, 4 * SIZE(BO)
  1783. MUL a2, b2, t2
  1784. LD a2, 1 * SIZE(AO)
  1785. ADD c03, t3, c03
  1786. unop
  1787. MUL a3, b2, t3
  1788. LD a3, 2 * SIZE(AO)
  1789. ADD c04, t4, c04
  1790. unop
  1791. MUL a4, b2, t4
  1792. LD a5, 3 * SIZE(AO)
  /* second k-step of this pass: uses b3/b4 with the freshly loaded a1-a3, a5 */
  1793. ADD c05, t1, c05
  1794. unop
  1795. MUL a1, b3, t1
  1796. LD b2, -1 * SIZE(BO)
  1797. ADD c06, t2, c06
  1798. unop
  1799. MUL a2, b3, t2
  1800. unop
  1801. ADD c07, t3, c07
  1802. unop
  1803. MUL a3, b3, t3
  1804. lda AO, 8 * SIZE(AO)
  1805. ADD c08, t4, c08
  1806. unop
  1807. MUL a5, b3, t4
  1808. LD b3, 0 * SIZE(BO)
  1809. ADD c01, t1, c01
  1810. unop
  1811. MUL a1, b4, t1
  1812. LD a1, -4 * SIZE(AO)
  1813. ADD c02, t2, c02
  1814. unop
  1815. MUL a2, b4, t2
  1816. LD a2, -3 * SIZE(AO)
  1817. ADD c03, t3, c03
  1818. LD a4, -1 * SIZE(AO)
  1819. MUL a3, b4, t3
  1820. LD a3, -2 * SIZE(AO)
  1821. ADD c04, t4, c04
  1822. MUL a5, b4, t4
  1823. LD b4, 1 * SIZE(BO)
  1824. bgt L, $L52
  1825. .align 4
  /* $L55: tail of the 4x2 inner-product loop.
     Folds the pending t1 into c05 and starts the next a1*b1 product, then
     tests the low bit of the product length (KK for LT/RN, TMP1 otherwise).
     Odd length: one k-step is left, so branch to $L57.
     Even length: fall through and perform one k-step here (AO += 4 elements,
     BO += 2 elements) before the last step is finished at $L57. */
  1826. $L55:
  1827. ADD c05, t1, c05
  1828. MUL a1, b1, t1
  1829. #if defined(LT) || defined(RN)
  1830. blbs KK, $L57
  1831. #else
  1832. blbs TMP1, $L57
  1833. #endif
  1834. .align 4
  1835. ADD c06, t2, c06
  1836. MUL a2, b1, t2
  1837. ADD c07, t3, c07
  1838. MUL a3, b1, t3
  1839. ADD c08, t4, c08
  1840. unop
  1841. MUL a4, b1, t4
  1842. LD b1, 0 * SIZE(BO)
  1843. ADD c01, t1, c01
  1844. unop
  1845. MUL a1, b2, t1
  1846. LD a1, 0 * SIZE(AO)
  1847. ADD c02, t2, c02
  1848. unop
  1849. MUL a2, b2, t2
  1850. LD a2, 1 * SIZE(AO)
  1851. ADD c03, t3, c03
  1852. unop
  1853. MUL a3, b2, t3
  1854. LD a3, 2 * SIZE(AO)
  1855. ADD c04, t4, c04
  1856. MUL a4, b2, t4
  1857. LD a4, 3 * SIZE(AO)
  1858. lda AO, 4 * SIZE(AO)
  1859. ADD c05, t1, c05
  1860. LD b2, 1 * SIZE(BO)
  1861. MUL a1, b1, t1
  1862. lda BO, 2 * SIZE(BO)
  1863. .align 4
  /* $L57: last k-step of the 4x2 tile and pipeline drain.
     Finishes the remaining products of a1-a4 with b1/b2, advances AO by 4
     and BO by 2 elements, then adds the outstanding t1-t4 into
     c05/c06/c07/c08 so that every accumulator is complete at $L58. */
  1864. $L57:
  1865. ADD c06, t2, c06
  1866. MUL a2, b1, t2
  1867. ADD c07, t3, c07
  1868. MUL a3, b1, t3
  1869. ADD c08, t4, c08
  1870. MUL a4, b1, t4
  1871. ADD c01, t1, c01
  1872. MUL a1, b2, t1
  1873. ADD c02, t2, c02
  1874. MUL a2, b2, t2
  1875. ADD c03, t3, c03
  1876. MUL a3, b2, t3
  1877. ADD c04, t4, c04
  1878. lda AO, 4 * SIZE(AO)
  1879. MUL a4, b2, t4
  1880. lda BO, 2 * SIZE(BO)
  /* drain: the last four products have not been accumulated yet */
  1881. ADD c05, t1, c05
  1882. ADD c06, t2, c06
  1883. ADD c07, t3, c07
  1884. ADD c08, t4, c08
  1885. .align 4
  1886. $L58:
  1887. #if defined(LN) || defined(RT)
  1888. #ifdef LN
  1889. subq KK, 4, TMP1
  1890. #else
  1891. subq KK, 2, TMP1
  1892. #endif
  1893. sll TMP1, BASE_SHIFT + 2, TMP2
  1894. addq AORIG, TMP2, AO
  1895. sll TMP1, BASE_SHIFT + 1, TMP2
  1896. addq B, TMP2, BO
  1897. #else
  1898. lda AO, -4 * SIZE(AO)
  1899. lda BO, -2 * SIZE(BO)
  1900. #endif
  1901. #if defined(LN) || defined(LT)
  1902. LD a1, 0 * SIZE(BO)
  1903. LD a2, 1 * SIZE(BO)
  1904. LD a3, 2 * SIZE(BO)
  1905. LD a4, 3 * SIZE(BO)
  1906. LD b1, 4 * SIZE(BO)
  1907. LD b2, 5 * SIZE(BO)
  1908. LD b3, 6 * SIZE(BO)
  1909. LD b4, 7 * SIZE(BO)
  1910. SUB a1, c01, c01
  1911. SUB a2, c05, c05
  1912. SUB a3, c02, c02
  1913. SUB a4, c06, c06
  1914. SUB b1, c03, c03
  1915. SUB b2, c07, c07
  1916. SUB b3, c04, c04
  1917. SUB b4, c08, c08
  1918. #else
  1919. LD a1, 0 * SIZE(AO)
  1920. LD a2, 1 * SIZE(AO)
  1921. LD a3, 2 * SIZE(AO)
  1922. LD a4, 3 * SIZE(AO)
  1923. LD b1, 4 * SIZE(AO)
  1924. LD b2, 5 * SIZE(AO)
  1925. LD b3, 6 * SIZE(AO)
  1926. LD b4, 7 * SIZE(AO)
  1927. SUB a1, c01, c01
  1928. SUB a2, c02, c02
  1929. SUB a3, c03, c03
  1930. SUB a4, c04, c04
  1931. SUB b1, c05, c05
  1932. SUB b2, c06, c06
  1933. SUB b3, c07, c07
  1934. SUB b4, c08, c08
  1935. #endif
  1936. #ifdef LN
  1937. LD a1, 15 * SIZE(AO)
  1938. LD a2, 14 * SIZE(AO)
  1939. LD a3, 13 * SIZE(AO)
  1940. LD a4, 12 * SIZE(AO)
  1941. MUL a1, c04, c04
  1942. MUL a1, c08, c08
  1943. MUL a2, c04, t1
  1944. MUL a2, c08, t2
  1945. SUB c03, t1, c03
  1946. SUB c07, t2, c07
  1947. MUL a3, c04, t1
  1948. MUL a3, c08, t2
  1949. SUB c02, t1, c02
  1950. SUB c06, t2, c06
  1951. MUL a4, c04, t1
  1952. MUL a4, c08, t2
  1953. SUB c01, t1, c01
  1954. SUB c05, t2, c05
  1955. LD b1, 10 * SIZE(AO)
  1956. LD b2, 9 * SIZE(AO)
  1957. LD b3, 8 * SIZE(AO)
  1958. MUL b1, c03, c03
  1959. MUL b1, c07, c07
  1960. MUL b2, c03, t1
  1961. MUL b2, c07, t2
  1962. SUB c02, t1, c02
  1963. SUB c06, t2, c06
  1964. MUL b3, c03, t1
  1965. MUL b3, c07, t2
  1966. SUB c01, t1, c01
  1967. SUB c05, t2, c05
  1968. LD a1, 5 * SIZE(AO)
  1969. LD a2, 4 * SIZE(AO)
  1970. LD a3, 0 * SIZE(AO)
  1971. MUL a1, c02, c02
  1972. MUL a1, c06, c06
  1973. MUL a2, c02, t1
  1974. MUL a2, c06, t2
  1975. SUB c01, t1, c01
  1976. SUB c05, t2, c05
  1977. MUL a3, c01, c01
  1978. MUL a3, c05, c05
  1979. #endif
  1980. #ifdef LT
  1981. LD a1, 0 * SIZE(AO)
  1982. LD a2, 1 * SIZE(AO)
  1983. LD a3, 2 * SIZE(AO)
  1984. LD a4, 3 * SIZE(AO)
  1985. MUL a1, c01, c01
  1986. MUL a1, c05, c05
  1987. MUL a2, c01, t1
  1988. MUL a2, c05, t2
  1989. SUB c02, t1, c02
  1990. SUB c06, t2, c06
  1991. MUL a3, c01, t1
  1992. MUL a3, c05, t2
  1993. SUB c03, t1, c03
  1994. SUB c07, t2, c07
  1995. MUL a4, c01, t1
  1996. MUL a4, c05, t2
  1997. SUB c04, t1, c04
  1998. SUB c08, t2, c08
  1999. LD b1, 5 * SIZE(AO)
  2000. LD b2, 6 * SIZE(AO)
  2001. LD b3, 7 * SIZE(AO)
  2002. MUL b1, c02, c02
  2003. MUL b1, c06, c06
  2004. MUL b2, c02, t1
  2005. MUL b2, c06, t2
  2006. SUB c03, t1, c03
  2007. SUB c07, t2, c07
  2008. MUL b3, c02, t1
  2009. MUL b3, c06, t2
  2010. SUB c04, t1, c04
  2011. SUB c08, t2, c08
  2012. LD a1, 10 * SIZE(AO)
  2013. LD a2, 11 * SIZE(AO)
  2014. LD a3, 15 * SIZE(AO)
  2015. MUL a1, c03, c03
  2016. MUL a1, c07, c07
  2017. MUL a2, c03, t1
  2018. MUL a2, c07, t2
  2019. SUB c04, t1, c04
  2020. SUB c08, t2, c08
  2021. MUL a3, c04, c04
  2022. MUL a3, c08, c08
  2023. #endif
  2024. #ifdef RN
  2025. LD a1, 0 * SIZE(BO)
  2026. LD a2, 1 * SIZE(BO)
  2027. LD a3, 3 * SIZE(BO)
  2028. MUL a1, c01, c01
  2029. MUL a1, c02, c02
  2030. MUL a1, c03, c03
  2031. MUL a1, c04, c04
  2032. MUL a2, c01, t1
  2033. MUL a2, c02, t2
  2034. MUL a2, c03, t3
  2035. MUL a2, c04, t4
  2036. SUB c05, t1, c05
  2037. SUB c06, t2, c06
  2038. SUB c07, t3, c07
  2039. SUB c08, t4, c08
  2040. MUL a3, c05, c05
  2041. MUL a3, c06, c06
  2042. MUL a3, c07, c07
  2043. MUL a3, c08, c08
  2044. #endif
  2045. #ifdef RT
  2046. LD a1, 3 * SIZE(BO)
  2047. LD a2, 2 * SIZE(BO)
  2048. LD a3, 0 * SIZE(BO)
  2049. MUL a1, c05, c05
  2050. MUL a1, c06, c06
  2051. MUL a1, c07, c07
  2052. MUL a1, c08, c08
  2053. MUL a2, c05, t1
  2054. MUL a2, c06, t2
  2055. MUL a2, c07, t3
  2056. MUL a2, c08, t4
  2057. SUB c01, t1, c01
  2058. SUB c02, t2, c02
  2059. SUB c03, t3, c03
  2060. SUB c04, t4, c04
  2061. MUL a3, c01, c01
  2062. MUL a3, c02, c02
  2063. MUL a3, c03, c03
  2064. MUL a3, c04, c04
  2065. #endif
  2066. #if defined(LN) || defined(LT)
  2067. ST c01, 0 * SIZE(BO)
  2068. ST c05, 1 * SIZE(BO)
  2069. ST c02, 2 * SIZE(BO)
  2070. ST c06, 3 * SIZE(BO)
  2071. ST c03, 4 * SIZE(BO)
  2072. ST c07, 5 * SIZE(BO)
  2073. ST c04, 6 * SIZE(BO)
  2074. ST c08, 7 * SIZE(BO)
  2075. #else
  2076. ST c01, 0 * SIZE(AO)
  2077. ST c02, 1 * SIZE(AO)
  2078. ST c03, 2 * SIZE(AO)
  2079. ST c04, 3 * SIZE(AO)
  2080. ST c05, 4 * SIZE(AO)
  2081. ST c06, 5 * SIZE(AO)
  2082. ST c07, 6 * SIZE(AO)
  2083. ST c08, 7 * SIZE(AO)
  2084. #endif
  2085. #ifdef LN
  2086. lda C1, -4 * SIZE(C1)
  2087. lda C2, -4 * SIZE(C2)
  2088. #endif
  2089. ST c01, 0 * SIZE(C1)
  2090. ST c02, 1 * SIZE(C1)
  2091. ST c03, 2 * SIZE(C1)
  2092. ST c04, 3 * SIZE(C1)
  2093. ST c05, 0 * SIZE(C2)
  2094. ST c06, 1 * SIZE(C2)
  2095. ST c07, 2 * SIZE(C2)
  2096. ST c08, 3 * SIZE(C2)
  2097. #ifndef LN
  2098. lda C1, 4 * SIZE(C1)
  2099. lda C2, 4 * SIZE(C2)
  2100. #endif
  2101. fclr t1
  2102. fclr t2
  2103. fclr t3
  2104. fclr t4
  2105. #ifdef RT
  2106. sll K, 2 + BASE_SHIFT, TMP1
  2107. addq AORIG, TMP1, AORIG
  2108. #endif
  2109. #if defined(LT) || defined(RN)
  2110. subq K, KK, TMP1
  2111. sll TMP1, BASE_SHIFT + 2, TMP2
  2112. addq AO, TMP2, AO
  2113. sll TMP1, BASE_SHIFT + 1, TMP2
  2114. addq BO, TMP2, BO
  2115. #endif
  2116. #ifdef LT
  2117. addq KK, 4, KK
  2118. #endif
  2119. #ifdef LN
  2120. subq KK, 4, KK
  2121. #endif
  2122. lda I, -1(I)
  2123. bgt I, $L51
  2124. .align 4
  /* $L60: two-row (M & 2) remainder tile of the current two-column panel. */
  /* Results are written to C1 and C2; the whole section is skipped when bit 1 of M is clear. */
  /* c01/c02 accumulate the C1 column and c05/c06 the C2 column; t1..t4 carry the in-flight products. */
  2125. $L60:
  2126. and M, 2, I
  2127. ble I, $L70
  2128. #if defined(LT) || defined(RN)
  /* LT/RN: the dot-product length is KK; operands are read from the running AO pointer and from B. */
  2129. LD a1, 0 * SIZE(AO)
  2130. fclr c01
  2131. LD a2, 1 * SIZE(AO)
  2132. fclr c05
  2133. LD a3, 2 * SIZE(AO)
  2134. fclr c02
  2135. LD a4, 3 * SIZE(AO)
  2136. fclr c06
  2137. LD b1, 0 * SIZE(B)
  2138. lda L, -2(KK)
  2139. LD b2, 1 * SIZE(B)
  2140. lda AO, 2 * SIZE(AO)
  2141. LD b3, 2 * SIZE(B)
  2142. LD b4, 3 * SIZE(B)
  2143. lda BO, 2 * SIZE(B)
  /* KK <= 0: nothing to accumulate, go straight to the solve; L = KK - 2 <= 0: only the tail at $L65 is needed. */
  2144. ble KK, $L68
  2145. ble L, $L65
  2146. #else
  /* LN/RT: the dot-product length is K - KK (kept in TMP1); AO and BO are rebuilt from AORIG and B offset by KK. */
  2147. #ifdef LN
  2148. sll K, BASE_SHIFT + 1, TMP1
  2149. subq AORIG, TMP1, AORIG
  2150. #endif
  2151. sll KK, BASE_SHIFT + 1, TMP1
  2152. addq AORIG, TMP1, AO
  2153. sll KK, BASE_SHIFT + 1, TMP1
  2154. addq B, TMP1, BO
  2155. subq K, KK, TMP1
  2156. LD a1, 0 * SIZE(AO)
  2157. fclr c01
  2158. LD a2, 1 * SIZE(AO)
  2159. fclr c05
  2160. LD a3, 2 * SIZE(AO)
  2161. fclr c02
  2162. LD a4, 3 * SIZE(AO)
  2163. fclr c06
  2164. LD b1, 0 * SIZE(BO)
  2165. lda L, -2(TMP1)
  2166. LD b2, 1 * SIZE(BO)
  2167. lda AO, 2 * SIZE(AO)
  2168. LD b3, 2 * SIZE(BO)
  2169. LD b4, 3 * SIZE(BO)
  2170. lda BO, 2 * SIZE(BO)
  2171. ble TMP1, $L68
  2172. ble L, $L65
  2173. #endif
  2174. .align 4
  /* $L62: main accumulation loop, two k-steps per iteration (L drops by 2, AO and BO each advance 4 elements). */
  /* Each ADD folds the product issued earlier into its accumulator while the next MUL and loads are issued. */
  2175. $L62:
  2176. ADD c01, t1, c01
  2177. unop
  2178. MUL a1, b1, t1
  2179. unop
  2180. ADD c02, t2, c02
  2181. lda AO, 4 * SIZE(AO)
  2182. MUL a2, b1, t2
  2183. LD b1, 2 * SIZE(BO)
  2184. ADD c05, t3, c05
  2185. lda L, -2(L)
  2186. MUL a1, b2, t3
  2187. LD a1, -2 * SIZE(AO)
  2188. ADD c06, t4, c06
  2189. unop
  2190. MUL a2, b2, t4
  2191. LD a2, -1 * SIZE(AO)
  2192. ADD c01, t1, c01
  2193. LD b2, 3 * SIZE(BO)
  2194. MUL a3, b3, t1
  2195. lda BO, 4 * SIZE(BO)
  2196. ADD c02, t2, c02
  2197. unop
  2198. MUL a4, b3, t2
  2199. LD b3, 0 * SIZE(BO)
  2200. ADD c05, t3, c05
  2201. unop
  2202. MUL a3, b4, t3
  2203. LD a3, 0 * SIZE(AO)
  2204. ADD c06, t4, c06
  2205. MUL a4, b4, t4
  2206. LD b4, 1 * SIZE(BO)
  2207. unop
  2208. LD a4, 1 * SIZE(AO)
  2209. unop
  2210. unop
  2211. bgt L, $L62
  2212. .align 4
  /* $L65: loop tail. An odd count (low bit set) jumps to $L67 for a single final step; */
  /* an even count falls through and performs one extra step before reaching $L67. */
  2213. $L65:
  2214. ADD c01, t1, c01
  2215. MUL a1, b1, t1
  2216. #if defined(LT) || defined(RN)
  2217. blbs KK, $L67
  2218. #else
  2219. blbs TMP1, $L67
  2220. #endif
  2221. .align 4
  2222. ADD c02, t2, c02
  2223. unop
  2224. MUL a2, b1, t2
  2225. LD b1, 0 * SIZE(BO)
  2226. ADD c05, t3, c05
  2227. lda BO, 2 * SIZE(BO)
  2228. MUL a1, b2, t3
  2229. LD a1, 0 * SIZE(AO)
  2230. ADD c06, t4, c06
  2231. unop
  2232. MUL a2, b2, t4
  2233. LD a2, 1 * SIZE(AO)
  2234. ADD c01, t1, c01
  2235. LD b2, -1 * SIZE(BO)
  2236. MUL a1, b1, t1
  2237. lda AO, 2 * SIZE(AO)
  2238. .align 4
  /* $L67: last k-step; the four trailing ADDs drain the products still held in t1..t4. */
  2239. $L67:
  2240. ADD c02, t2, c02
  2241. MUL a2, b1, t2
  2242. ADD c05, t3, c05
  2243. MUL a1, b2, t3
  2244. ADD c06, t4, c06
  2245. lda AO, 2 * SIZE(AO)
  2246. MUL a2, b2, t4
  2247. lda BO, 2 * SIZE(BO)
  2248. ADD c01, t1, c01
  2249. ADD c02, t2, c02
  2250. ADD c05, t3, c05
  2251. ADD c06, t4, c06
  2252. .align 4
  /* $L68: solve step for the 2x2 tile. */
  2253. $L68:
  2254. #if defined(LN) || defined(RT)
  /* LN/RT: point AO/BO at offset KK - 2 from AORIG/B (both branches of the inner #ifdef subtract 2 here). */
  2255. #ifdef LN
  2256. subq KK, 2, TMP1
  2257. #else
  2258. subq KK, 2, TMP1
  2259. #endif
  2260. sll TMP1, BASE_SHIFT + 1, TMP2
  2261. addq AORIG, TMP2, AO
  2262. sll TMP1, BASE_SHIFT + 1, TMP2
  2263. addq B, TMP2, BO
  2264. #else
  /* LT/RN: step AO and BO back by two elements each. */
  2265. lda AO, -2 * SIZE(AO)
  2266. lda BO, -2 * SIZE(BO)
  2267. #endif
  /* Combine the four values loaded from the packed buffer (BO for LN/LT, AO for RN/RT) with the accumulated products via SUB. */
  /* NOTE(review): SUB is a macro defined outside this chunk; presumably c = loaded - c. Confirm. */
  2268. #if defined(LN) || defined(LT)
  2269. LD a1, 0 * SIZE(BO)
  2270. LD a2, 1 * SIZE(BO)
  2271. LD a3, 2 * SIZE(BO)
  2272. LD a4, 3 * SIZE(BO)
  2273. SUB a1, c01, c01
  2274. SUB a2, c05, c05
  2275. SUB a3, c02, c02
  2276. SUB a4, c06, c06
  2277. #else
  2278. LD a1, 0 * SIZE(AO)
  2279. LD a2, 1 * SIZE(AO)
  2280. LD a3, 2 * SIZE(AO)
  2281. LD a4, 3 * SIZE(AO)
  2282. SUB a1, c01, c01
  2283. SUB a2, c02, c02
  2284. SUB a3, c05, c05
  2285. SUB a4, c06, c06
  2286. #endif
  /* LN: substitution using elements 3, 2, 0 of the 2x2 block at AO, processing row 2 (c02/c06) before row 1 (c01/c05). */
  /* NOTE(review): the diagonal entries are multiplied, not divided, so they are presumably stored pre-inverted. Confirm against the packing routine. */
  2287. #ifdef LN
  2288. LD a1, 3 * SIZE(AO)
  2289. LD a2, 2 * SIZE(AO)
  2290. LD a3, 0 * SIZE(AO)
  2291. MUL a1, c02, c02
  2292. MUL a1, c06, c06
  2293. MUL a2, c02, t1
  2294. MUL a2, c06, t2
  2295. SUB c01, t1, c01
  2296. SUB c05, t2, c05
  2297. MUL a3, c01, c01
  2298. MUL a3, c05, c05
  2299. #endif
  /* LT: substitution using elements 0, 1, 3 of the 2x2 block at AO, processing row 1 before row 2. */
  2300. #ifdef LT
  2301. LD a1, 0 * SIZE(AO)
  2302. LD a2, 1 * SIZE(AO)
  2303. LD a3, 3 * SIZE(AO)
  2304. MUL a1, c01, c01
  2305. MUL a1, c05, c05
  2306. MUL a2, c01, t1
  2307. MUL a2, c05, t2
  2308. SUB c02, t1, c02
  2309. SUB c06, t2, c06
  2310. MUL a3, c02, c02
  2311. MUL a3, c06, c06
  2312. #endif
  /* RN: substitution using elements 0, 1, 3 of the 2x2 block at BO, processing column 1 (c01/c02) before column 2 (c05/c06). */
  2313. #ifdef RN
  2314. LD a1, 0 * SIZE(BO)
  2315. LD a2, 1 * SIZE(BO)
  2316. LD a3, 3 * SIZE(BO)
  2317. MUL a1, c01, c01
  2318. MUL a1, c02, c02
  2319. MUL a2, c01, t1
  2320. MUL a2, c02, t2
  2321. SUB c05, t1, c05
  2322. SUB c06, t2, c06
  2323. MUL a3, c05, c05
  2324. MUL a3, c06, c06
  2325. #endif
  /* RT: substitution using elements 3, 2, 0 of the 2x2 block at BO, processing column 2 before column 1. */
  2326. #ifdef RT
  2327. LD a1, 3 * SIZE(BO)
  2328. LD a2, 2 * SIZE(BO)
  2329. LD a3, 0 * SIZE(BO)
  2330. MUL a1, c05, c05
  2331. MUL a1, c06, c06
  2332. MUL a2, c05, t1
  2333. MUL a2, c06, t2
  2334. SUB c01, t1, c01
  2335. SUB c02, t2, c02
  2336. MUL a3, c01, c01
  2337. MUL a3, c02, c02
  2338. #endif
  /* Write the solved values back to the packed buffer they were read from, in the same element order as the loads above. */
  2339. #if defined(LN) || defined(LT)
  2340. ST c01, 0 * SIZE(BO)
  2341. ST c05, 1 * SIZE(BO)
  2342. ST c02, 2 * SIZE(BO)
  2343. ST c06, 3 * SIZE(BO)
  2344. #else
  2345. ST c01, 0 * SIZE(AO)
  2346. ST c02, 1 * SIZE(AO)
  2347. ST c05, 2 * SIZE(AO)
  2348. ST c06, 3 * SIZE(AO)
  2349. #endif
  /* Store the tile to C. LN moves C1/C2 back by two elements before the store; every other variant advances them after it. */
  2350. #ifdef LN
  2351. lda C1, -2 * SIZE(C1)
  2352. lda C2, -2 * SIZE(C2)
  2353. #endif
  2354. ST c01, 0 * SIZE(C1)
  2355. ST c02, 1 * SIZE(C1)
  2356. ST c05, 0 * SIZE(C2)
  2357. ST c06, 1 * SIZE(C2)
  2358. #ifndef LN
  2359. lda C1, 2 * SIZE(C1)
  2360. lda C2, 2 * SIZE(C2)
  2361. #endif
  /* Clear the product temporaries; the following section's first ADDs read t1..t4 before any MUL has written them. */
  2362. fclr t1
  2363. fclr t2
  2364. fclr t3
  2365. fclr t4
  /* Pointer and KK bookkeeping for the next tile. */
  2366. #ifdef RT
  2367. sll K, 1 + BASE_SHIFT, TMP1
  2368. addq AORIG, TMP1, AORIG
  2369. #endif
  2370. #if defined(LT) || defined(RN)
  /* Skip the remaining K - KK steps of this tile in both packed buffers (two elements per step in each). */
  2371. subq K, KK, TMP1
  2372. sll TMP1, BASE_SHIFT + 1, TMP2
  2373. addq AO, TMP2, AO
  2374. sll TMP1, BASE_SHIFT + 1, TMP2
  2375. addq BO, TMP2, BO
  2376. #endif
  2377. #ifdef LT
  2378. addq KK, 2, KK
  2379. #endif
  2380. #ifdef LN
  2381. subq KK, 2, KK
  2382. #endif
  2383. .align 4
  /* $L70: single-row (M & 1) remainder tile of the current two-column panel. */
  /* One value goes to C1 (c01) and one to C2 (c05); skipped when bit 0 of M is clear. */
  /* c02 and c06 are second partial sums for the same two outputs and are folded in at $L77. */
  2384. $L70:
  2385. and M, 1, I
  2386. ble I, $L79
  2387. #if defined(LT) || defined(RN)
  /* LT/RN: dot-product length is KK; A is read from the running AO pointer, B from the panel start. */
  2388. LD a1, 0 * SIZE(AO)
  2389. fclr c01
  2390. LD a2, 1 * SIZE(AO)
  2391. fclr c05
  2392. LD b1, 0 * SIZE(B)
  2393. fclr c02
  2394. LD b2, 1 * SIZE(B)
  2395. fclr c06
  2396. lda L, -2(KK)
  2397. LD b3, 2 * SIZE(B)
  2398. lda AO, 1 * SIZE(AO)
  2399. LD b4, 3 * SIZE(B)
  2400. lda BO, 2 * SIZE(B)
  2401. ble KK, $L78
  2402. ble L, $L75
  2403. #else
  /* LN/RT: dot-product length is K - KK (TMP1). A is offset by KK elements and B by 2 * KK elements. */
  2404. #ifdef LN
  2405. sll K, BASE_SHIFT + 0, TMP1
  2406. subq AORIG, TMP1, AORIG
  2407. #endif
  2408. sll KK, BASE_SHIFT + 0, TMP1
  2409. addq AORIG, TMP1, AO
  2410. sll KK, BASE_SHIFT + 1, TMP1
  2411. addq B, TMP1, BO
  2412. subq K, KK, TMP1
  2413. LD a1, 0 * SIZE(AO)
  2414. fclr c01
  2415. LD a2, 1 * SIZE(AO)
  2416. fclr c05
  2417. LD b1, 0 * SIZE(BO)
  2418. fclr c02
  2419. LD b2, 1 * SIZE(BO)
  2420. fclr c06
  2421. lda L, -2(TMP1)
  2422. LD b3, 2 * SIZE(BO)
  2423. lda AO, 1 * SIZE(AO)
  2424. LD b4, 3 * SIZE(BO)
  2425. lda BO, 2 * SIZE(BO)
  2426. ble TMP1, $L78
  2427. ble L, $L75
  2428. #endif
  2429. .align 4
  /* $L72: main loop, two k-steps per iteration (L drops by 2, AO advances 2 elements, BO advances 4). */
  /* The first step accumulates into c01/c05 and the second into c02/c06. */
  2430. $L72:
  2431. ADD c01, t1, c01
  2432. lda L, -2(L)
  2433. MUL a1, b1, t1
  2434. LD b1, 2 * SIZE(BO)
  2435. ADD c05, t2, c05
  2436. MUL a1, b2, t2
  2437. LD a1, 1 * SIZE(AO)
  2438. LD b2, 3 * SIZE(BO)
  2439. ADD c02, t3, c02
  2440. lda AO, 2 * SIZE(AO)
  2441. MUL a2, b3, t3
  2442. LD b3, 4 * SIZE(BO)
  2443. ADD c06, t4, c06
  2444. MUL a2, b4, t4
  2445. LD a2, 0 * SIZE(AO)
  2446. LD b4, 5 * SIZE(BO)
  2447. lda BO, 4 * SIZE(BO)
  2448. unop
  2449. unop
  2450. bgt L, $L72
  2451. .align 4
  /* $L75: loop tail. An odd count jumps to $L77; an even count performs one extra step first. */
  2452. $L75:
  2453. ADD c01, t1, c01
  2454. MUL a1, b1, t1
  2455. #if defined(LT) || defined(RN)
  2456. blbs KK, $L77
  2457. #else
  2458. blbs TMP1, $L77
  2459. #endif
  2460. .align 4
  2461. ADD c05, t2, c05
  2462. MUL a1, b2, t2
  2463. LD a1, 0 * SIZE(AO)
  2464. LD b1, 0 * SIZE(BO)
  2465. ADD c01, t1, c01
  2466. LD b2, 1 * SIZE(BO)
  2467. lda AO, 1 * SIZE(AO)
  2468. MUL a1, b1, t1
  2469. lda BO, 2 * SIZE(BO)
  2470. .align 4
  /* $L77: final step; drain t1..t4 and merge the partial sums (c01 += c02, c05 += c06). */
  2471. $L77:
  2472. ADD c05, t2, c05
  2473. MUL a1, b2, t2
  2474. ADD c02, t3, c02
  2475. ADD c06, t4, c06
  2476. ADD c01, c02, c01
  2477. lda AO, 1 * SIZE(AO)
  2478. ADD c05, c06, c05
  2479. lda BO, 2 * SIZE(BO)
  2480. ADD c01, t1, c01
  2481. ADD c05, t2, c05
  2482. .align 4
  /* $L78: solve step for the 1x2 tile. */
  2483. $L78:
  2484. #if defined(LN) || defined(RT)
  /* The step back from KK differs per variant: 1 under LN, 2 under RT. */
  2485. #ifdef LN
  2486. subq KK, 1, TMP1
  2487. #else
  2488. subq KK, 2, TMP1
  2489. #endif
  2490. sll TMP1, BASE_SHIFT + 0, TMP2
  2491. addq AORIG, TMP2, AO
  2492. sll TMP1, BASE_SHIFT + 1, TMP2
  2493. addq B, TMP2, BO
  2494. #else
  2495. lda AO, -1 * SIZE(AO)
  2496. lda BO, -2 * SIZE(BO)
  2497. #endif
  /* Combine the two values loaded from the packed buffer with the accumulated products via SUB. */
  2498. #if defined(LN) || defined(LT)
  2499. LD a1, 0 * SIZE(BO)
  2500. LD a2, 1 * SIZE(BO)
  2501. SUB a1, c01, c01
  2502. SUB a2, c05, c05
  2503. #else
  2504. LD a1, 0 * SIZE(AO)
  2505. LD a2, 1 * SIZE(AO)
  2506. SUB a1, c01, c01
  2507. SUB a2, c05, c05
  2508. #endif
  /* LN/LT: the A block is a single element, so both outputs are just scaled by it. */
  /* NOTE(review): multiplication implies the diagonal is stored pre-inverted. Confirm against the packing routine. */
  2509. #if defined(LN) || defined(LT)
  2510. LD a1, 0 * SIZE(AO)
  2511. MUL a1, c01, c01
  2512. MUL a1, c05, c05
  2513. #endif
  /* RN: substitution over the 2x2 block at BO using elements 0, 1, 3 (c01 first, then c05). */
  2514. #ifdef RN
  2515. LD a1, 0 * SIZE(BO)
  2516. LD a2, 1 * SIZE(BO)
  2517. LD a3, 3 * SIZE(BO)
  2518. MUL a1, c01, c01
  2519. MUL a2, c01, t1
  2520. SUB c05, t1, c05
  2521. MUL a3, c05, c05
  2522. #endif
  /* RT: substitution over the 2x2 block at BO using elements 3, 2, 0 (c05 first, then c01). */
  2523. #ifdef RT
  2524. LD a1, 3 * SIZE(BO)
  2525. LD a2, 2 * SIZE(BO)
  2526. LD a3, 0 * SIZE(BO)
  2527. MUL a1, c05, c05
  2528. MUL a2, c05, t1
  2529. SUB c01, t1, c01
  2530. MUL a3, c01, c01
  2531. #endif
  /* Write the solved pair back to the packed buffer it was read from. */
  2532. #if defined(LN) || defined(LT)
  2533. ST c01, 0 * SIZE(BO)
  2534. ST c05, 1 * SIZE(BO)
  2535. #else
  2536. ST c01, 0 * SIZE(AO)
  2537. ST c05, 1 * SIZE(AO)
  2538. #endif
  /* Store to C. Only the LN variant adjusts C1/C2 here; there is no post-increment in this section. */
  2539. #ifdef LN
  2540. lda C1, -1 * SIZE(C1)
  2541. lda C2, -1 * SIZE(C2)
  2542. #endif
  2543. ST c01, 0 * SIZE(C1)
  2544. ST c05, 0 * SIZE(C2)
  2545. fclr t1
  2546. fclr t2
  2547. fclr t3
  2548. fclr t4
  /* Pointer and KK bookkeeping for whatever follows this tile. */
  2549. #ifdef RT
  2550. sll K, 0 + BASE_SHIFT, TMP1
  2551. addq AORIG, TMP1, AORIG
  2552. #endif
  2553. #if defined(LT) || defined(RN)
  /* Skip the remaining K - KK steps: one element per step in AO, two per step in BO. */
  2554. subq K, KK, TMP1
  2555. sll TMP1, BASE_SHIFT + 0, TMP2
  2556. addq AO, TMP2, AO
  2557. sll TMP1, BASE_SHIFT + 1, TMP2
  2558. addq BO, TMP2, BO
  2559. #endif
  2560. #ifdef LT
  2561. addq KK, 1, KK
  2562. #endif
  2563. #ifdef LN
  2564. subq KK, 1, KK
  2565. #endif
  2566. .align 4
  /* $L79: end of the two-column panel; update B and KK per variant before the single-column remainder. */
  2567. $L79:
  2568. #ifdef LN
  /* LN: advance B by K << (1 + BASE_SHIFT) bytes, i.e. 2 * K elements. */
  2569. sll K, 1 + BASE_SHIFT, TMP1
  2570. addq B, TMP1, B
  2571. #endif
  2572. #if defined(LT) || defined(RN)
  /* LT/RN: B takes over the current value of the running BO pointer. */
  2573. mov BO, B
  2574. #endif
  /* RN/RT: KK moves by the panel width (2), forward for RN and backward for RT. */
  2575. #ifdef RN
  2576. addq KK, 2, KK
  2577. #endif
  2578. #ifdef RT
  2579. subq KK, 2, KK
  2580. #endif
  2581. .align 4
  /* $L80: set-up for the single remaining column (N & 1); jumps to the epilogue when bit 0 of N is clear. */
  2582. $L80:
  2583. and N, 1, J
  2584. ble J, $L999
  2585. #ifdef RT
  /* RT: step B back by K elements and C back by LDC before use. */
  2586. sll K, BASE_SHIFT, TMP1
  2587. subq B, TMP1, B
  2588. subq C, LDC, C
  2589. #endif
  /* C1 is the output pointer for this column; non-RT variants then advance C by LDC. */
  2590. mov C, C1
  2591. #ifndef RT
  2592. addq C, LDC, C
  2593. #endif
  /* Initial KK: M + OFFSET for LN, OFFSET for LT (RN/RT leave KK as it is). */
  2594. #ifdef LN
  2595. addq M, OFFSET, KK
  2596. #endif
  2597. #ifdef LT
  2598. mov OFFSET, KK
  2599. #endif
  /* LN/RT address A through AORIG; LT/RN use the running AO pointer directly. */
  2600. #if defined(LN) || defined(RT)
  2601. mov A, AORIG
  2602. #else
  2603. mov A, AO
  2604. #endif
  /* I = M / 4 is the number of four-row tiles; with none, go to the M & 2 remainder at $L100. */
  2605. sra M, 2, I
  2606. ble I, $L100
  2607. .align 4
  /* $L91: one four-row tile of the single-column panel; repeated I = M / 4 times. */
  /* c01..c04 accumulate the four outputs (stored to C1[0..3]); t1..t4 carry the in-flight products. */
  2608. $L91:
  2609. #if defined(LT) || defined(RN)
  /* LT/RN: dot-product length is KK, split into KK >> 2 unrolled iterations plus a KK & 3 tail. */
  2610. LD a1, 0 * SIZE(AO)
  2611. fclr t1
  2612. LD a2, 1 * SIZE(AO)
  2613. fclr t2
  2614. LD a3, 2 * SIZE(AO)
  2615. fclr t3
  2616. LD a4, 3 * SIZE(AO)
  2617. fclr t4
  2618. LD b1, 0 * SIZE(B)
  2619. fclr c01
  2620. LD b2, 1 * SIZE(B)
  2621. fclr c02
  2622. LD b3, 2 * SIZE(B)
  2623. fclr c03
  2624. LD b4, 3 * SIZE(B)
  2625. fclr c04
  2626. sra KK, 2, L
  2627. mov B, BO
  2628. ble L, $L95
  2629. #else
  /* LN/RT: dot-product length is K - KK (TMP1). A is offset by 4 * KK elements and B by KK elements. */
  2630. #ifdef LN
  2631. sll K, BASE_SHIFT + 2, TMP1
  2632. subq AORIG, TMP1, AORIG
  2633. #endif
  2634. sll KK, BASE_SHIFT + 2, TMP1
  2635. addq AORIG, TMP1, AO
  2636. sll KK, BASE_SHIFT + 0, TMP1
  2637. addq B, TMP1, BO
  2638. subq K, KK, TMP1
  2639. LD a1, 0 * SIZE(AO)
  2640. fclr t1
  2641. LD a2, 1 * SIZE(AO)
  2642. fclr t2
  2643. LD a3, 2 * SIZE(AO)
  2644. fclr t3
  2645. LD a4, 3 * SIZE(AO)
  2646. fclr t4
  2647. LD b1, 0 * SIZE(BO)
  2648. fclr c01
  2649. LD b2, 1 * SIZE(BO)
  2650. fclr c02
  2651. LD b3, 2 * SIZE(BO)
  2652. fclr c03
  2653. LD b4, 3 * SIZE(BO)
  2654. fclr c04
  2655. sra TMP1, 2, L
  2656. unop
  2657. ble L, $L95
  2658. #endif
  2659. .align 5
  /* $L92: main loop, four k-steps per iteration (AO advances 16 elements, BO advances 4). */
  /* b1..b4 hold the B values of the four steps; a1..a4 are reloaded with the next step's A values as soon as each is consumed. */
  2660. $L92:
  2661. ADD c01, t1, c01
  2662. unop
  2663. MUL a1, b1, t1
  2664. LD a1, 4 * SIZE(AO)
  2665. ADD c02, t2, c02
  2666. lda L, -1(L)
  2667. MUL a2, b1, t2
  2668. LD a2, 5 * SIZE(AO)
  2669. ADD c03, t3, c03
  2670. unop
  2671. MUL a3, b1, t3
  2672. LD a3, 6 * SIZE(AO)
  2673. ADD c04, t4, c04
  2674. MUL a4, b1, t4
  2675. LD a4, 7 * SIZE(AO)
  2676. LD b1, 4 * SIZE(BO)
  2677. ADD c01, t1, c01
  2678. unop
  2679. MUL a1, b2, t1
  2680. LD a1, 8 * SIZE(AO)
  2681. ADD c02, t2, c02
  2682. unop
  2683. MUL a2, b2, t2
  2684. LD a2, 9 * SIZE(AO)
  2685. ADD c03, t3, c03
  2686. unop
  2687. MUL a3, b2, t3
  2688. LD a3, 10 * SIZE(AO)
  2689. ADD c04, t4, c04
  2690. MUL a4, b2, t4
  2691. LD a4, 11 * SIZE(AO)
  2692. LD b2, 5 * SIZE(BO)
  2693. ADD c01, t1, c01
  2694. unop
  2695. MUL a1, b3, t1
  2696. LD a1, 12 * SIZE(AO)
  2697. ADD c02, t2, c02
  2698. unop
  2699. MUL a2, b3, t2
  2700. LD a2, 13 * SIZE(AO)
  2701. ADD c03, t3, c03
  2702. unop
  2703. MUL a3, b3, t3
  2704. LD a3, 14 * SIZE(AO)
  2705. ADD c04, t4, c04
  2706. MUL a4, b3, t4
  /* The fourth step's last A value goes into a5, because a4 is reloaded for the next iteration before the final MUL below consumes it. */
  2707. LD a5, 15 * SIZE(AO)
  2708. LD b3, 6 * SIZE(BO)
  2709. ADD c01, t1, c01
  2710. MUL a1, b4, t1
  2711. LD a1, 16 * SIZE(AO)
  2712. lda AO, 16 * SIZE(AO)
  2713. ADD c02, t2, c02
  2714. lda BO, 4 * SIZE(BO)
  2715. MUL a2, b4, t2
  2716. LD a2, 1 * SIZE(AO)
  2717. ADD c03, t3, c03
  2718. LD a4, 3 * SIZE(AO)
  2719. MUL a3, b4, t3
  2720. LD a3, 2 * SIZE(AO)
  2721. ADD c04, t4, c04
  2722. MUL a5, b4, t4
  2723. LD b4, 3 * SIZE(BO)
  2724. bgt L, $L92
  2725. .align 4
  /* $L95: the remaining (count & 3) k-steps are handled one at a time by $L96. */
  2726. $L95:
  2727. #if defined(LT) || defined(RN)
  2728. and KK, 3, L
  2729. #else
  2730. and TMP1, 3, L
  2731. #endif
  2732. unop
  2733. ble L, $L98
  2734. .align 4
  /* $L96: single k-step (AO advances 4 elements, BO advances 1). */
  2735. $L96:
  2736. ADD c01, t1, c01
  2737. lda L, -1(L)
  2738. MUL a1, b1, t1
  2739. LD a1, 4 * SIZE(AO)
  2740. ADD c02, t2, c02
  2741. lda BO, 1 * SIZE(BO)
  2742. MUL a2, b1, t2
  2743. LD a2, 5 * SIZE(AO)
  2744. ADD c03, t3, c03
  2745. unop
  2746. MUL a3, b1, t3
  2747. LD a3, 6 * SIZE(AO)
  2748. ADD c04, t4, c04
  2749. MUL a4, b1, t4
  2750. LD a4, 7 * SIZE(AO)
  2751. LD b1, 0 * SIZE(BO)
  2752. lda AO, 4 * SIZE(AO)
  2753. bgt L, $L96
  2754. .align 4
  /* $L98: drain the last products, then run the solve step for the 4x1 tile. */
  2755. $L98:
  2756. ADD c01, t1, c01
  2757. ADD c02, t2, c02
  2758. ADD c03, t3, c03
  2759. ADD c04, t4, c04
  2760. #if defined(LN) || defined(RT)
  /* The step back from KK differs per variant: 4 under LN, 1 under RT. */
  2761. #ifdef LN
  2762. subq KK, 4, TMP1
  2763. #else
  2764. subq KK, 1, TMP1
  2765. #endif
  2766. sll TMP1, BASE_SHIFT + 2, TMP2
  2767. addq AORIG, TMP2, AO
  2768. sll TMP1, BASE_SHIFT + 0, TMP2
  2769. addq B, TMP2, BO
  2770. #endif
  /* Combine the four values loaded from the packed buffer (BO for LN/LT, AO for RN/RT) with the accumulated products via SUB. */
  2771. #if defined(LN) || defined(LT)
  2772. LD a1, 0 * SIZE(BO)
  2773. LD a2, 1 * SIZE(BO)
  2774. LD a3, 2 * SIZE(BO)
  2775. LD a4, 3 * SIZE(BO)
  2776. SUB a1, c01, c01
  2777. SUB a2, c02, c02
  2778. SUB a3, c03, c03
  2779. SUB a4, c04, c04
  2780. #else
  2781. LD a1, 0 * SIZE(AO)
  2782. LD a2, 1 * SIZE(AO)
  2783. LD a3, 2 * SIZE(AO)
  2784. LD a4, 3 * SIZE(AO)
  2785. SUB a1, c01, c01
  2786. SUB a2, c02, c02
  2787. SUB a3, c03, c03
  2788. SUB a4, c04, c04
  2789. #endif
  /* LN: substitution over the 4x4 block at AO from the last row upwards (c04, c03, c02, c01). */
  /* Diagonal entries are read from elements 15, 10, 5, 0. */
  /* NOTE(review): diagonals are multiplied, so presumably stored pre-inverted. Confirm against the packing routine. */
  2790. #ifdef LN
  2791. LD a1, 15 * SIZE(AO)
  2792. LD a2, 14 * SIZE(AO)
  2793. LD a3, 13 * SIZE(AO)
  2794. LD a4, 12 * SIZE(AO)
  2795. MUL a1, c04, c04
  2796. MUL a2, c04, t1
  2797. SUB c03, t1, c03
  2798. MUL a3, c04, t1
  2799. SUB c02, t1, c02
  2800. MUL a4, c04, t1
  2801. SUB c01, t1, c01
  2802. LD b1, 10 * SIZE(AO)
  2803. LD b2, 9 * SIZE(AO)
  2804. LD b3, 8 * SIZE(AO)
  2805. MUL b1, c03, c03
  2806. MUL b2, c03, t1
  2807. SUB c02, t1, c02
  2808. MUL b3, c03, t1
  2809. SUB c01, t1, c01
  2810. LD a1, 5 * SIZE(AO)
  2811. LD a2, 4 * SIZE(AO)
  2812. LD a3, 0 * SIZE(AO)
  2813. MUL a1, c02, c02
  2814. MUL a2, c02, t1
  2815. SUB c01, t1, c01
  2816. MUL a3, c01, c01
  2817. #endif
  /* LT: substitution over the 4x4 block at AO from the first row downwards (c01, c02, c03, c04). */
  /* Diagonal entries are read from elements 0, 5, 10, 15. */
  2818. #ifdef LT
  2819. LD a1, 0 * SIZE(AO)
  2820. LD a2, 1 * SIZE(AO)
  2821. LD a3, 2 * SIZE(AO)
  2822. LD a4, 3 * SIZE(AO)
  2823. MUL a1, c01, c01
  2824. MUL a2, c01, t1
  2825. SUB c02, t1, c02
  2826. MUL a3, c01, t1
  2827. SUB c03, t1, c03
  2828. MUL a4, c01, t1
  2829. SUB c04, t1, c04
  2830. LD b1, 5 * SIZE(AO)
  2831. LD b2, 6 * SIZE(AO)
  2832. LD b3, 7 * SIZE(AO)
  2833. MUL b1, c02, c02
  2834. MUL b2, c02, t1
  2835. SUB c03, t1, c03
  2836. MUL b3, c02, t1
  2837. SUB c04, t1, c04
  2838. LD a1, 10 * SIZE(AO)
  2839. LD a2, 11 * SIZE(AO)
  2840. LD a3, 15 * SIZE(AO)
  2841. MUL a1, c03, c03
  2842. MUL a2, c03, t1
  2843. SUB c04, t1, c04
  2844. MUL a3, c04, c04
  2845. #endif
  /* RN/RT: the B block is a single element, so all four outputs are scaled by it. */
  2846. #if defined(RN) || defined(RT)
  2847. LD a1, 0 * SIZE(BO)
  2848. MUL a1, c01, c01
  2849. MUL a1, c02, c02
  2850. MUL a1, c03, c03
  2851. MUL a1, c04, c04
  2852. #endif
  /* Write the solved values back to the packed buffer they were read from. */
  2853. #if defined(LN) || defined(LT)
  2854. ST c01, 0 * SIZE(BO)
  2855. ST c02, 1 * SIZE(BO)
  2856. ST c03, 2 * SIZE(BO)
  2857. ST c04, 3 * SIZE(BO)
  2858. #else
  2859. ST c01, 0 * SIZE(AO)
  2860. ST c02, 1 * SIZE(AO)
  2861. ST c03, 2 * SIZE(AO)
  2862. ST c04, 3 * SIZE(AO)
  2863. #endif
  /* Store the tile to C. LN moves C1 back by four elements before the store; every other variant advances it after. */
  2864. #ifdef LN
  2865. lda C1, -4 * SIZE(C1)
  2866. #endif
  2867. ST c01, 0 * SIZE(C1)
  2868. ST c02, 1 * SIZE(C1)
  2869. ST c03, 2 * SIZE(C1)
  2870. ST c04, 3 * SIZE(C1)
  2871. #ifndef LN
  2872. lda C1, 4 * SIZE(C1)
  2873. #endif
  2874. fclr t1
  2875. fclr t2
  2876. fclr t3
  2877. fclr t4
  /* Pointer and KK bookkeeping for the next four-row tile. */
  2878. #ifdef RT
  2879. sll K, 2 + BASE_SHIFT, TMP1
  2880. addq AORIG, TMP1, AORIG
  2881. #endif
  2882. #if defined(LT) || defined(RN)
  /* Skip the remaining K - KK steps: four elements per step in AO, one per step in BO. */
  2883. subq K, KK, TMP1
  2884. sll TMP1, BASE_SHIFT + 2, TMP2
  2885. addq AO, TMP2, AO
  2886. sll TMP1, BASE_SHIFT + 0, TMP2
  2887. addq BO, TMP2, BO
  2888. #endif
  2889. #ifdef LT
  2890. addq KK, 4, KK
  2891. #endif
  2892. #ifdef LN
  2893. subq KK, 4, KK
  2894. #endif
  /* Next four-row tile, if any remain. */
  2895. lda I, -1(I)
  2896. bgt I, $L91
  2897. .align 4
  /* $L100: two-row (M & 2) remainder tile of the single-column panel; skipped when bit 1 of M is clear. */
  /* c01/c02 and c03/c04 are two independent partial sums of the same two outputs, merged at $L108. */
  2898. $L100:
  2899. and M, 2, I
  2900. ble I, $L110
  2901. #if defined(LT) || defined(RN)
  /* LT/RN: dot-product length is KK, split into KK >> 2 unrolled iterations plus a KK & 3 tail. */
  2902. LD a1, 0 * SIZE(AO)
  2903. fclr t1
  2904. LD a2, 1 * SIZE(AO)
  2905. fclr t2
  2906. LD a3, 2 * SIZE(AO)
  2907. fclr t3
  2908. LD a4, 3 * SIZE(AO)
  2909. fclr t4
  2910. LD b1, 0 * SIZE(B)
  2911. fclr c01
  2912. LD b2, 1 * SIZE(B)
  2913. fclr c02
  2914. LD b3, 2 * SIZE(B)
  2915. fclr c03
  2916. LD b4, 3 * SIZE(B)
  2917. fclr c04
  2918. sra KK, 2, L
  2919. mov B, BO
  2920. ble L, $L105
  2921. #else
  /* LN/RT: dot-product length is K - KK (TMP1). A is offset by 2 * KK elements and B by KK elements. */
  2922. #ifdef LN
  2923. sll K, BASE_SHIFT + 1, TMP1
  2924. subq AORIG, TMP1, AORIG
  2925. #endif
  2926. sll KK, BASE_SHIFT + 1, TMP1
  2927. addq AORIG, TMP1, AO
  2928. sll KK, BASE_SHIFT + 0, TMP1
  2929. addq B, TMP1, BO
  2930. subq K, KK, TMP1
  2931. LD a1, 0 * SIZE(AO)
  2932. fclr t1
  2933. LD a2, 1 * SIZE(AO)
  2934. fclr t2
  2935. LD a3, 2 * SIZE(AO)
  2936. fclr t3
  2937. LD a4, 3 * SIZE(AO)
  2938. fclr t4
  2939. LD b1, 0 * SIZE(BO)
  2940. fclr c01
  2941. LD b2, 1 * SIZE(BO)
  2942. fclr c02
  2943. LD b3, 2 * SIZE(BO)
  2944. fclr c03
  2945. LD b4, 3 * SIZE(BO)
  2946. fclr c04
  2947. sra TMP1, 2, L
  2948. ble L, $L105
  2949. #endif
  2950. .align 5
  /* $L102: main loop, four k-steps per iteration (AO advances 8 elements, BO advances 4). */
  /* Steps using b1/b3 feed c01/c02; steps using b2/b4 feed c03/c04. */
  2951. $L102:
  2952. ADD c01, t1, c01
  2953. lda L, -1(L)
  2954. MUL a1, b1, t1
  2955. LD a1, 4 * SIZE(AO)
  2956. ADD c02, t2, c02
  2957. MUL a2, b1, t2
  2958. LD a2, 5 * SIZE(AO)
  2959. LD b1, 4 * SIZE(BO)
  2960. ADD c03, t3, c03
  2961. lda BO, 4 * SIZE(BO)
  2962. MUL a3, b2, t3
  2963. LD a3, 6 * SIZE(AO)
  2964. ADD c04, t4, c04
  2965. MUL a4, b2, t4
  /* The fourth step's second A value goes into a5, because a4 is reloaded for the next iteration before the final MUL consumes it. */
  2966. LD a5, 7 * SIZE(AO)
  2967. LD b2, 1 * SIZE(BO)
  2968. ADD c01, t1, c01
  2969. MUL a1, b3, t1
  2970. LD a1, 8 * SIZE(AO)
  2971. lda AO, 8 * SIZE(AO)
  2972. ADD c02, t2, c02
  2973. MUL a2, b3, t2
  2974. LD b3, 2 * SIZE(BO)
  2975. LD a2, 1 * SIZE(AO)
  2976. ADD c03, t3, c03
  2977. LD a4, 3 * SIZE(AO)
  2978. MUL a3, b4, t3
  2979. LD a3, 2 * SIZE(AO)
  2980. ADD c04, t4, c04
  2981. MUL a5, b4, t4
  2982. LD b4, 3 * SIZE(BO)
  2983. bgt L, $L102
  2984. .align 4
  /* $L105: the remaining (count & 3) k-steps are handled one at a time by $L106. */
  2985. $L105:
  2986. #if defined(LT) || defined(RN)
  2987. and KK, 3, L
  2988. #else
  2989. and TMP1, 3, L
  2990. #endif
  2991. ble L, $L108
  2992. .align 4
  /* $L106: single k-step using only a1/a2/b1 (AO advances 2 elements, BO advances 1). */
  2993. $L106:
  2994. ADD c01, t1, c01
  2995. lda L, -1(L)
  2996. MUL a1, b1, t1
  2997. LD a1, 2 * SIZE(AO)
  2998. ADD c02, t2, c02
  2999. MUL a2, b1, t2
  3000. LD a2, 3 * SIZE(AO)
  3001. LD b1, 1 * SIZE(BO)
  3002. lda AO, 2 * SIZE(AO)
  3003. unop
  3004. lda BO, 1 * SIZE(BO)
  3005. bgt L, $L106
  3006. .align 4
  /* $L108: drain t1..t4, merge the partial sums (c01 += c03, c02 += c04), then solve the 2x1 tile. */
  3007. $L108:
  3008. ADD c01, t1, c01
  3009. ADD c02, t2, c02
  3010. ADD c03, t3, c03
  3011. ADD c04, t4, c04
  3012. ADD c01, c03, c01
  3013. ADD c02, c04, c02
  3014. #if defined(LN) || defined(RT)
  /* The step back from KK differs per variant: 2 under LN, 1 under RT. */
  3015. #ifdef LN
  3016. subq KK, 2, TMP1
  3017. #else
  3018. subq KK, 1, TMP1
  3019. #endif
  3020. sll TMP1, BASE_SHIFT + 1, TMP2
  3021. addq AORIG, TMP2, AO
  3022. sll TMP1, BASE_SHIFT + 0, TMP2
  3023. addq B, TMP2, BO
  3024. #endif
  /* Combine the two values loaded from the packed buffer with the accumulated products via SUB. */
  3025. #if defined(LN) || defined(LT)
  3026. LD a1, 0 * SIZE(BO)
  3027. LD a2, 1 * SIZE(BO)
  3028. SUB a1, c01, c01
  3029. SUB a2, c02, c02
  3030. #else
  3031. LD a1, 0 * SIZE(AO)
  3032. LD a2, 1 * SIZE(AO)
  3033. SUB a1, c01, c01
  3034. SUB a2, c02, c02
  3035. #endif
  /* LN: substitution over the 2x2 block at AO using elements 3, 2, 0 (c02 first, then c01). */
  /* NOTE(review): diagonals are multiplied, so presumably stored pre-inverted. Confirm against the packing routine. */
  3036. #ifdef LN
  3037. LD a1, 3 * SIZE(AO)
  3038. LD a2, 2 * SIZE(AO)
  3039. LD a3, 0 * SIZE(AO)
  3040. MUL a1, c02, c02
  3041. MUL a2, c02, t1
  3042. SUB c01, t1, c01
  3043. MUL a3, c01, c01
  3044. #endif
  /* LT: substitution over the 2x2 block at AO using elements 0, 1, 3 (c01 first, then c02). */
  3045. #ifdef LT
  3046. LD a1, 0 * SIZE(AO)
  3047. LD a2, 1 * SIZE(AO)
  3048. LD a3, 3 * SIZE(AO)
  3049. MUL a1, c01, c01
  3050. MUL a2, c01, t1
  3051. SUB c02, t1, c02
  3052. MUL a3, c02, c02
  3053. #endif
  /* RN/RT: the B block is a single element, so both outputs are scaled by it. */
  3054. #if defined(RN) || defined(RT)
  3055. LD a1, 0 * SIZE(BO)
  3056. MUL a1, c01, c01
  3057. MUL a1, c02, c02
  3058. #endif
  /* Write the solved pair back to the packed buffer it was read from. */
  3059. #if defined(LN) || defined(LT)
  3060. ST c01, 0 * SIZE(BO)
  3061. ST c02, 1 * SIZE(BO)
  3062. #else
  3063. ST c01, 0 * SIZE(AO)
  3064. ST c02, 1 * SIZE(AO)
  3065. #endif
  /* Store to C. LN moves C1 back by two elements before the store; every other variant advances it after. */
  3066. #ifdef LN
  3067. lda C1, -2 * SIZE(C1)
  3068. #endif
  3069. ST c01, 0 * SIZE(C1)
  3070. ST c02, 1 * SIZE(C1)
  3071. #ifndef LN
  3072. lda C1, 2 * SIZE(C1)
  3073. #endif
  3074. fclr t1
  3075. fclr t2
  3076. fclr t3
  3077. fclr t4
  /* Pointer and KK bookkeeping for the next tile. */
  3078. #ifdef RT
  3079. sll K, 1 + BASE_SHIFT, TMP1
  3080. addq AORIG, TMP1, AORIG
  3081. #endif
  3082. #if defined(LT) || defined(RN)
  /* Skip the remaining K - KK steps: two elements per step in AO, one per step in BO. */
  3083. subq K, KK, TMP1
  3084. sll TMP1, BASE_SHIFT + 1, TMP2
  3085. addq AO, TMP2, AO
  3086. sll TMP1, BASE_SHIFT + 0, TMP2
  3087. addq BO, TMP2, BO
  3088. #endif
  3089. #ifdef LT
  3090. addq KK, 2, KK
  3091. #endif
  3092. #ifdef LN
  3093. subq KK, 2, KK
  3094. #endif
  3095. .align 4
  /* $L110: single-row (M & 1) remainder of the single-column panel, i.e. one output element. */
  /* Skipped when bit 0 of M is clear. c01..c04 are four independent partial sums merged at $L118. */
  3096. $L110:
  3097. and M, 1, I
  3098. ble I, $L119
  3099. #if defined(LT) || defined(RN)
  /* LT/RN: dot-product length is KK, split into KK >> 2 unrolled iterations plus a KK & 3 tail. */
  3100. LD a1, 0 * SIZE(AO)
  3101. fclr t1
  3102. LD a2, 1 * SIZE(AO)
  3103. fclr t2
  3104. LD a3, 2 * SIZE(AO)
  3105. fclr t3
  3106. LD a4, 3 * SIZE(AO)
  3107. fclr t4
  3108. LD b1, 0 * SIZE(B)
  3109. fclr c01
  3110. LD b2, 1 * SIZE(B)
  3111. fclr c02
  3112. LD b3, 2 * SIZE(B)
  3113. fclr c03
  3114. LD b4, 3 * SIZE(B)
  3115. fclr c04
  3116. sra KK, 2, L
  3117. mov B, BO
  3118. unop
  3119. ble L, $L115
  3120. #else
  /* LN/RT: dot-product length is K - KK (TMP1); A and B are both offset by KK elements. */
  3121. #ifdef LN
  3122. sll K, BASE_SHIFT + 0, TMP1
  3123. subq AORIG, TMP1, AORIG
  3124. #endif
  3125. sll KK, BASE_SHIFT + 0, TMP1
  3126. addq AORIG, TMP1, AO
  3127. sll KK, BASE_SHIFT + 0, TMP1
  3128. addq B, TMP1, BO
  3129. subq K, KK, TMP1
  3130. LD a1, 0 * SIZE(AO)
  3131. fclr t1
  3132. LD a2, 1 * SIZE(AO)
  3133. fclr t2
  3134. LD a3, 2 * SIZE(AO)
  3135. fclr t3
  3136. LD a4, 3 * SIZE(AO)
  3137. fclr t4
  3138. LD b1, 0 * SIZE(BO)
  3139. fclr c01
  3140. LD b2, 1 * SIZE(BO)
  3141. fclr c02
  3142. LD b3, 2 * SIZE(BO)
  3143. fclr c03
  3144. LD b4, 3 * SIZE(BO)
  3145. fclr c04
  3146. sra TMP1, 2, L
  3147. unop
  3148. ble L, $L115
  3149. #endif
  3150. .align 4
  /* $L112: main loop, four k-steps per iteration; step i multiplies a_i by b_i into its own accumulator c0i. */
  /* AO and BO each advance 4 elements per iteration. */
  3151. $L112:
  3152. ADD c01, t1, c01
  3153. MUL a1, b1, t1
  3154. LD a1, 4 * SIZE(AO)
  3155. LD b1, 4 * SIZE(BO)
  3156. ADD c02, t2, c02
  3157. MUL a2, b2, t2
  3158. LD a2, 5 * SIZE(AO)
  3159. LD b2, 5 * SIZE(BO)
  3160. ADD c03, t3, c03
  3161. MUL a3, b3, t3
  3162. LD a3, 6 * SIZE(AO)
  3163. LD b3, 6 * SIZE(BO)
  3164. ADD c04, t4, c04
  3165. MUL a4, b4, t4
  3166. LD a4, 7 * SIZE(AO)
  3167. LD b4, 7 * SIZE(BO)
  3168. lda L, -1(L)
  3169. lda AO, 4 * SIZE(AO)
  3170. lda BO, 4 * SIZE(BO)
  3171. bgt L, $L112
  3172. .align 4
  /* $L115: the remaining (count & 3) k-steps are handled one at a time by $L116. */
  3173. $L115:
  3174. #if defined(LT) || defined(RN)
  3175. and KK, 3, L
  3176. #else
  3177. and TMP1, 3, L
  3178. #endif
  3179. ble L, $L118
  3180. .align 4
  /* $L116: single k-step using only a1/b1 (AO and BO each advance 1 element). */
  3181. $L116:
  3182. ADD c01, t1, c01
  3183. MUL a1, b1, t1
  3184. LD a1, 1 * SIZE(AO)
  3185. LD b1, 1 * SIZE(BO)
  3186. lda L, -1(L)
  3187. lda AO, 1 * SIZE(AO)
  3188. lda BO, 1 * SIZE(BO)
  3189. bgt L, $L116
  3190. .align 4
  /* $L118: drain t1..t4, reduce the four partial sums into c01, then solve the 1x1 tile. */
  3191. $L118:
  3192. ADD c01, t1, c01
  3193. ADD c02, t2, c02
  3194. ADD c03, t3, c03
  3195. ADD c04, t4, c04
  3196. ADD c01, c02, c01
  3197. ADD c03, c04, c03
  3198. ADD c01, c03, c01
  3199. #if defined(LN) || defined(RT)
  /* LN/RT: point AO and BO at element KK - 1 of AORIG and B (same byte offset for both). */
  3200. subq KK, 1, TMP1
  3201. sll TMP1, BASE_SHIFT + 0, TMP2
  3202. addq AORIG, TMP2, AO
  3203. addq B, TMP2, BO
  3204. #endif
  /* Combine the value loaded from the packed buffer with the accumulated product via SUB. */
  3205. #if defined(LN) || defined(LT)
  3206. LD a1, 0 * SIZE(BO)
  3207. SUB a1, c01, c01
  3208. #else
  3209. LD a1, 0 * SIZE(AO)
  3210. SUB a1, c01, c01
  3211. #endif
  /* Scale by the single triangular-block element: from AO for LN/LT, from BO for RN/RT. */
  /* NOTE(review): multiplication implies the diagonal is stored pre-inverted. Confirm against the packing routine. */
  3212. #if defined(LN) || defined(LT)
  3213. LD a1, 0 * SIZE(AO)
  3214. MUL a1, c01, c01
  3215. #endif
  3216. #if defined(RN) || defined(RT)
  3217. LD a1, 0 * SIZE(BO)
  3218. MUL a1, c01, c01
  3219. #endif
  /* Write the solved value back to the packed buffer it was read from, then to C. */
  3220. #if defined(LN) || defined(LT)
  3221. ST c01, 0 * SIZE(BO)
  3222. #else
  3223. ST c01, 0 * SIZE(AO)
  3224. #endif
  3225. #ifdef LN
  3226. lda C1, -1 * SIZE(C1)
  3227. #endif
  3228. ST c01, 0 * SIZE(C1)
  3229. #ifndef LN
  3230. lda C1, 1 * SIZE(C1)
  3231. #endif
  /* Pointer and KK bookkeeping. */
  /* NOTE(review): SXADDQ is a macro defined outside this chunk; presumably a scaled add of K elements. Confirm. */
  3232. #ifdef RT
  3233. SXADDQ K, AORIG, AORIG
  3234. #endif
  3235. #if defined(LT) || defined(RN)
  /* Skip the remaining K - KK steps, one element per step in both AO and BO. */
  3236. subq K, KK, TMP1
  3237. sll TMP1, BASE_SHIFT + 0, TMP2
  3238. addq AO, TMP2, AO
  3239. addq BO, TMP2, BO
  3240. #endif
  3241. #ifdef LT
  3242. addq KK, 1, KK
  3243. #endif
  3244. #ifdef LN
  3245. subq KK, 1, KK
  3246. #endif
  3247. .align 4
  /* $L119: end of the single-column panel; final B and KK updates per variant. */
  3248. $L119:
  3249. #ifdef LN
  /* NOTE(review): SXADDQ is defined outside this chunk; presumably advances B by K elements. Confirm. */
  3250. SXADDQ K, B, B
  3251. #endif
  3252. #if defined(LT) || defined(RN)
  /* LT/RN: B takes over the current value of the running BO pointer. */
  3253. mov BO, B
  3254. #endif
  /* RN/RT: KK moves by the panel width (1), forward for RN and backward for RT. */
  3255. #ifdef RN
  3256. addq KK, 1, KK
  3257. #endif
  3258. #ifdef RT
  3259. subq KK, 1, KK
  3260. #endif
  3261. .align 4
  /* $L999: common exit. Reloads $f2..$f9 from the eight 8-byte slots at the bottom of the stack frame */
  /* (presumably the values the prologue saved there; the prologue is outside this chunk), */
  /* zeroes integer register $0, releases STACKSIZE bytes of stack and returns. */
  3262. $L999:
  3263. ldt $f2, 0($sp)
  3264. ldt $f3, 8($sp)
  3265. ldt $f4, 16($sp)
  3266. ldt $f5, 24($sp)
  3267. ldt $f6, 32($sp)
  3268. ldt $f7, 40($sp)
  3269. ldt $f8, 48($sp)
  3270. ldt $f9, 56($sp)
  /* $0 = 0; NOTE(review): $0 is the integer return-value register under the Alpha calling standard, so this looks like "return 0". Confirm. */
  3271. clr $0
  3272. lda $sp, STACKSIZE($sp)
  3273. ret
  3274. EPILOGUE