You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trsm_kernel_4x4_LN.S 59 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
7377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855385638573858385938603861386238633864386538663867386838693870387138723873387438753876387738783879388038813882388338843885388638873888388938903891389238933894389538963897389838993900390139023903390439053906390739083909391039113912391339143915391639173918391939203921392239233924392539263927392839293930393139323933393439353936393739383939394039413942394339443945394639473948394939503951395239533954395539563957395839593960396139623963396439653966396739683969397039713972397339743975397639773978397939803981398239833984398539863987398839893990399139923993399439953996399739983999400040014002400340044005400640074008400940104011401240134014401540164017401840194020402140224023402440254026402740284029403040314032403340344035403640374038403940404041404240434044404540464047404840494050405140524053405440554056405740584059406040614062406340644065406640674068
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #include "version.h"
  /* Build guard: exactly which Alpha core is targeted must be chosen by
     defining EV4, EV5 or EV6; otherwise preprocessing stops with #error. */
  41. #if !defined(EV4) && !defined(EV5) && !defined(EV6)
  42. #error "Architecture is not specified."
  43. #endif
  /* Per-core tuning macros.  EV6 maps UNOP to the `unop` instruction; EV5 and
     EV4 define UNOP as empty.
     NOTE(review): PREFETCHSIZE is defined for EV6 and EV5 only, not for EV4.
     Neither PREFETCHSIZE nor UNOP is referenced in the portion of the kernel
     visible here (the loops use a literal `unop`) - confirm against the rest
     of the file before relying on either. */
  44. #ifdef EV6
  45. #define PREFETCHSIZE 56
  46. #define UNOP unop
  47. #endif
  48. #ifdef EV5
  49. #define PREFETCHSIZE 56
  50. #define UNOP
  51. #endif
  52. #ifdef EV4
  53. #define UNOP
  54. #endif
  /* Bytes subtracted from $sp by the prologue.  The prologue stores $f2..$f9
     at offsets 0..56 of this frame and reads C, LDC and OFFSET from
     0/8/16 + STACKSIZE($sp), i.e. just above the frame. */
  55. #define STACKSIZE 80
  /* Problem dimensions; the prologue branches to $L999 if any of them is <= 0. */
  56. #define M $16
  57. #define N $17
  58. #define K $18
  /* Base pointers of the A and B operands (presumably passed in registers by
     the caller - TODO confirm against the calling convention in common.h). */
  59. #define A $20
  60. #define B $21
  /* C pointer and its leading dimension; both are loaded from the caller's
     stack area in the prologue, and LDC is then rescaled with SXADDQ. */
  61. #define C $22
  62. #define LDC $23
  /* Column pointers into C for the current group of four columns:
     C1 = C, C2 = C1 + LDC, C3 = C2 + LDC, C4 = C3 + LDC. */
  63. #define C1 $19
  64. #define C2 $24
  65. #define C3 $25
  66. #define C4 $27
  /* Working (advancing) pointers into the A and B data.
     NOTE(review): AO uses $at, the assembler temporary; presumably
     PROLOGUE/common.h arranges for this to be legal - confirm. */
  67. #define AO $at
  68. #define BO $5
  /* Loop counters: I holds M & 1 / M & 2 when testing the row remainders,
     J = N >> 2 counts four-column groups, L is the inner-loop counter that is
     decremented by 2 per unrolled iteration. */
  69. #define I $6
  70. #define J $7
  71. #define L $8
  /* Floating-point operand registers loaded from AO (a*) and BO (b*). */
  72. #define a1 $f16
  73. #define a2 $f17
  74. #define a3 $f18
  75. #define a4 $f19
  76. #define b1 $f20
  77. #define b2 $f21
  78. #define b3 $f22
  79. #define b4 $f23
  /* Product temporaries: each MUL result lands in t1..t4 and is folded into
     an accumulator by a later ADD. */
  80. #define t1 $f24
  81. #define t2 $f25
  82. #define t3 $f26
  83. #define t4 $f27
  /* Extra operand registers (b5 is used as a second b4 slot in the unrolled
     loops).
     NOTE(review): a6 and alpha are both mapped to $f30; neither name is
     referenced in the visible portion of the kernel - verify they are never
     live at the same time in the rest of the file. */
  84. #define a5 $f28
  85. #define a6 $f30
  86. #define b5 $f29
  87. #define alpha $f30
  /* Accumulators c01..c16 occupy $f0..$f15.  In the visible code c01/c02 belong
     to the first column, c05/c06 to the second, c09/c10 to the third and
     c13/c14 to the fourth (see the stores to C1..C4). */
  88. #define c01 $f0
  89. #define c02 $f1
  90. #define c03 $f2
  91. #define c04 $f3
  92. #define c05 $f4
  93. #define c06 $f5
  94. #define c07 $f6
  95. #define c08 $f7
  96. #define c09 $f8
  97. #define c10 $f9
  98. #define c11 $f10
  99. #define c12 $f11
  100. #define c13 $f12
  101. #define c14 $f13
  102. #define c15 $f14
  103. #define c16 $f15
  /* Integer scratch registers used for address arithmetic. */
  104. #define TMP1 $0
  105. #define TMP2 $1
  /* KK: running offset derived from OFFSET (initialised differently for
     LN/LT/RN/RT); AORIG: saved base of the A data for the LN/RT variants;
     OFFSET: third stack argument loaded in the prologue. */
  106. #define KK $2
  107. #define AORIG $3
  108. #define OFFSET $4
  109. PROLOGUE
  110. PROFCODE
  111. .frame $sp, STACKSIZE, $26, 0
  112. lda $sp, -STACKSIZE($sp)
  113. ldq C, 0 + STACKSIZE($sp)
  114. ldq LDC, 8 + STACKSIZE($sp)
  115. ldq OFFSET, 16 + STACKSIZE($sp)
  116. SXADDQ LDC, 0, LDC
  117. stt $f2, 0($sp)
  118. stt $f3, 8($sp)
  119. stt $f4, 16($sp)
  120. stt $f5, 24($sp)
  121. stt $f6, 32($sp)
  122. stt $f7, 40($sp)
  123. stt $f8, 48($sp)
  124. stt $f9, 56($sp)
  125. cmple M, 0, $0
  126. cmple N, 0, $1
  127. cmple K, 0, $2
  128. or $0, $1, $0
  129. or $0, $2, $0
  130. bne $0, $L999
  131. #ifdef LN
  132. mulq M, K, TMP1
  133. SXADDQ TMP1, A, A
  134. SXADDQ M, C, C
  135. #endif
  136. #ifdef RN
  137. negq OFFSET, KK
  138. #endif
  139. #ifdef RT
  140. mulq N, K, TMP1
  141. SXADDQ TMP1, B, B
  142. mulq N, LDC, TMP1
  143. addq TMP1, C, C
  144. subq N, OFFSET, KK
  145. #endif
  146. sra N, 2, J
  147. ble J, $L40
  148. .align 4
  149. $L01:
  150. #ifdef RT
  151. sll K, 2 + BASE_SHIFT, TMP1
  152. subq B, TMP1, B
  153. s4addq LDC, 0, TMP1
  154. subq C, TMP1, C
  155. #endif
  156. mov C, C1
  157. addq C, LDC, C2
  158. addq C2, LDC, C3
  159. #ifndef RT
  160. s4addq LDC, C, C
  161. #endif
  162. fclr t1
  163. addq C3, LDC, C4
  164. fclr t2
  165. #ifdef LN
  166. addq M, OFFSET, KK
  167. #endif
  168. #ifdef LT
  169. mov OFFSET, KK
  170. #endif
  171. #if defined(LN) || defined(RT)
  172. mov A, AORIG
  173. #else
  174. mov A, AO
  175. #endif
  176. fclr t3
  177. fclr t4
  178. and M, 1, I
  179. ble I, $L20
  180. #if defined(LT) || defined(RN)
  181. LD a1, 0 * SIZE(AO)
  182. fclr c01
  183. LD a2, 1 * SIZE(AO)
  184. fclr c05
  185. LD b1, 0 * SIZE(B)
  186. lda L, -2(KK)
  187. LD b2, 1 * SIZE(B)
  188. lda AO, 1 * SIZE(AO)
  189. LD b3, 2 * SIZE(B)
  190. fclr c09
  191. LD b4, 3 * SIZE(B)
  192. fclr c13
  193. lda BO, 4 * SIZE(B)
  194. ble KK, $L38
  195. ble L, $L35
  196. #else
  197. #ifdef LN
  198. sll K, BASE_SHIFT + 0, TMP1
  199. subq AORIG, TMP1, AORIG
  200. #endif
  201. sll KK, BASE_SHIFT + 0, TMP1
  202. addq AORIG, TMP1, AO
  203. sll KK, BASE_SHIFT + 2, TMP2
  204. addq B, TMP2, BO
  205. subq K, KK, TMP1
  206. LD a1, 0 * SIZE(AO)
  207. fclr c01
  208. LD a2, 1 * SIZE(AO)
  209. fclr c05
  210. LD b1, 0 * SIZE(BO)
  211. lda L, -2(TMP1)
  212. LD b2, 1 * SIZE(BO)
  213. lda AO, 1 * SIZE(AO)
  214. LD b3, 2 * SIZE(BO)
  215. fclr c09
  216. LD b4, 3 * SIZE(BO)
  217. fclr c13
  218. lda BO, 4 * SIZE(BO)
  219. ble TMP1, $L38
  220. ble L, $L35
  221. #endif
  222. .align 4
  223. $L32:
  224. ADD c01, t1, c01
  225. lda L, -2(L)
  226. MUL a1, b1, t1
  227. LD b1, 0 * SIZE(BO)
  228. ADD c05, t2, c05
  229. lda AO, 2 * SIZE(AO)
  230. MUL a1, b2, t2
  231. LD b2, 1 * SIZE(BO)
  232. ADD c09, t3, c09
  233. LD b5, 3 * SIZE(BO)
  234. MUL a1, b3, t3
  235. LD b3, 2 * SIZE(BO)
  236. ADD c13, t4, c13
  237. MUL a1, b4, t4
  238. LD a1, -1 * SIZE(AO)
  239. ADD c01, t1, c01
  240. MUL a2, b1, t1
  241. LD b1, 4 * SIZE(BO)
  242. lda BO, 8 * SIZE(BO)
  243. ADD c05, t2, c05
  244. MUL a2, b2, t2
  245. LD b2, -3 * SIZE(BO)
  246. ADD c09, t3, c09
  247. LD b4, -1 * SIZE(BO)
  248. MUL a2, b3, t3
  249. LD b3, -2 * SIZE(BO)
  250. ADD c13, t4, c13
  251. MUL a2, b5, t4
  252. LD a2, 0 * SIZE(AO)
  253. bgt L, $L32
  254. .align 4
  255. $L35:
  256. ADD c01, t1, c01
  257. MUL a1, b1, t1
  258. #if defined(LT) || defined(RN)
  259. blbs KK, $L37
  260. #else
  261. blbs TMP1, $L37
  262. #endif
  263. .align 4
  264. ADD c05, t2, c05
  265. LD b1, 0 * SIZE(BO)
  266. MUL a1, b2, t2
  267. LD b2, 1 * SIZE(BO)
  268. ADD c09, t3, c09
  269. MUL a1, b3, t3
  270. LD b3, 2 * SIZE(BO)
  271. ADD c13, t4, c13
  272. MUL a1, b4, t4
  273. LD a1, 0 * SIZE(AO)
  274. lda AO, 1 * SIZE(AO)
  275. ADD c01, t1, c01
  276. LD b4, 3 * SIZE(BO)
  277. MUL a1, b1, t1
  278. lda BO, 4 * SIZE(BO)
  279. .align 4
  280. $L37:
  281. ADD c05, t2, c05
  282. MUL a1, b2, t2
  283. ADD c09, t3, c09
  284. MUL a1, b3, t3
  285. ADD c13, t4, c13
  286. lda AO, 1 * SIZE(AO)
  287. MUL a1, b4, t4
  288. lda BO, 4 * SIZE(BO)
  289. ADD c01, t1, c01
  290. ADD c05, t2, c05
  291. ADD c09, t3, c09
  292. ADD c13, t4, c13
  293. $L38:
  294. #if defined(LN) || defined(RT)
  295. #ifdef LN
  296. subq KK, 1, TMP1
  297. #else
  298. subq KK, 4, TMP1
  299. #endif
  300. sll TMP1, BASE_SHIFT + 0, TMP2
  301. addq AORIG, TMP2, AO
  302. sll TMP1, BASE_SHIFT + 2, TMP2
  303. addq B, TMP2, BO
  304. #else
  305. lda AO, -1 * SIZE(AO)
  306. lda BO, -4 * SIZE(BO)
  307. #endif
  308. #if defined(LN) || defined(LT)
  309. LD a1, 0 * SIZE(BO)
  310. LD a2, 1 * SIZE(BO)
  311. LD a3, 2 * SIZE(BO)
  312. LD a4, 3 * SIZE(BO)
  313. SUB a1, c01, c01
  314. SUB a2, c05, c05
  315. SUB a3, c09, c09
  316. SUB a4, c13, c13
  317. #else
  318. LD a1, 0 * SIZE(AO)
  319. LD a2, 1 * SIZE(AO)
  320. LD a3, 2 * SIZE(AO)
  321. LD a4, 3 * SIZE(AO)
  322. SUB a1, c01, c01
  323. SUB a2, c05, c05
  324. SUB a3, c09, c09
  325. SUB a4, c13, c13
  326. #endif
  327. #if defined(LN) || defined(LT)
  328. LD a1, 0 * SIZE(AO)
  329. MUL a1, c01, c01
  330. MUL a1, c05, c05
  331. MUL a1, c09, c09
  332. MUL a1, c13, c13
  333. #endif
  334. #ifdef RN
  335. LD a1, 0 * SIZE(BO)
  336. LD a2, 1 * SIZE(BO)
  337. LD a3, 2 * SIZE(BO)
  338. LD a4, 3 * SIZE(BO)
  339. MUL a1, c01, c01
  340. MUL a2, c01, t1
  341. SUB c05, t1, c05
  342. MUL a3, c01, t1
  343. SUB c09, t1, c09
  344. MUL a4, c01, t1
  345. SUB c13, t1, c13
  346. LD b1, 5 * SIZE(BO)
  347. LD b2, 6 * SIZE(BO)
  348. LD b3, 7 * SIZE(BO)
  349. MUL b1, c05, c05
  350. MUL b2, c05, t1
  351. SUB c09, t1, c09
  352. MUL b3, c05, t1
  353. SUB c13, t1, c13
  354. LD a1, 10 * SIZE(BO)
  355. LD a2, 11 * SIZE(BO)
  356. LD a3, 15 * SIZE(BO)
  357. MUL a1, c09, c09
  358. MUL a2, c09, t1
  359. SUB c13, t1, c13
  360. MUL a3, c13, c13
  361. #endif
  362. #ifdef RT
  363. LD a1, 15 * SIZE(BO)
  364. LD a2, 14 * SIZE(BO)
  365. LD a3, 13 * SIZE(BO)
  366. LD a4, 12 * SIZE(BO)
  367. MUL a1, c13, c13
  368. MUL a2, c13, t1
  369. SUB c09, t1, c09
  370. MUL a3, c13, t1
  371. SUB c05, t1, c05
  372. MUL a4, c13, t1
  373. SUB c01, t1, c01
  374. LD b1, 10 * SIZE(BO)
  375. LD b2, 9 * SIZE(BO)
  376. LD b3, 8 * SIZE(BO)
  377. MUL b1, c09, c09
  378. MUL b2, c09, t1
  379. SUB c05, t1, c05
  380. MUL b3, c09, t1
  381. SUB c01, t1, c01
  382. LD a1, 5 * SIZE(BO)
  383. LD a2, 4 * SIZE(BO)
  384. LD a3, 0 * SIZE(BO)
  385. MUL a1, c05, c05
  386. MUL a2, c05, t1
  387. SUB c01, t1, c01
  388. MUL a3, c01, c01
  389. #endif
  390. #if defined(LN) || defined(LT)
  391. ST c01, 0 * SIZE(BO)
  392. ST c05, 1 * SIZE(BO)
  393. ST c09, 2 * SIZE(BO)
  394. ST c13, 3 * SIZE(BO)
  395. #else
  396. ST c01, 0 * SIZE(AO)
  397. ST c05, 1 * SIZE(AO)
  398. ST c09, 2 * SIZE(AO)
  399. ST c13, 3 * SIZE(AO)
  400. #endif
  401. #ifdef LN
  402. lda C1, -1 * SIZE(C1)
  403. lda C2, -1 * SIZE(C2)
  404. lda C3, -1 * SIZE(C3)
  405. lda C4, -1 * SIZE(C4)
  406. #endif
  407. ST c01, 0 * SIZE(C1)
  408. ST c05, 0 * SIZE(C2)
  409. ST c09, 0 * SIZE(C3)
  410. ST c13, 0 * SIZE(C4)
  411. #ifdef RT
  412. sll K, 0 + BASE_SHIFT, TMP1
  413. addq AORIG, TMP1, AORIG
  414. #endif
  415. #if defined(LT) || defined(RN)
  416. subq K, KK, TMP1
  417. sll TMP1, BASE_SHIFT + 0, TMP2
  418. addq AO, TMP2, AO
  419. sll TMP1, BASE_SHIFT + 2, TMP2
  420. addq BO, TMP2, BO
  421. #endif
  422. #ifdef LT
  423. addq KK, 1, KK
  424. #endif
  425. #ifdef LN
  426. subq KK, 1, KK
  427. #endif
  428. .align 4
  429. $L20:
  430. and M, 2, I
  431. ble I, $L30
  432. #if defined(LT) || defined(RN)
  433. LD a1, 0 * SIZE(AO)
  434. fclr c09
  435. LD a2, 1 * SIZE(AO)
  436. fclr c13
  437. LD a3, 2 * SIZE(AO)
  438. fclr c10
  439. LD a4, 3 * SIZE(AO)
  440. fclr c14
  441. LD b1, 0 * SIZE(B)
  442. lda L, -2(KK)
  443. LD b2, 1 * SIZE(B)
  444. lda AO, 2 * SIZE(AO)
  445. LD b3, 2 * SIZE(B)
  446. fclr c01
  447. LD b4, 3 * SIZE(B)
  448. fclr c05
  449. lda BO, 4 * SIZE(B)
  450. fclr c02
  451. fclr c06
  452. ble KK, $L28
  453. ble L, $L25
  454. #else
  455. #ifdef LN
  456. sll K, BASE_SHIFT + 1, TMP1
  457. subq AORIG, TMP1, AORIG
  458. #endif
  459. sll KK, BASE_SHIFT + 1, TMP1
  460. addq AORIG, TMP1, AO
  461. sll KK, BASE_SHIFT + 2, TMP2
  462. addq B, TMP2, BO
  463. subq K, KK, TMP1
  464. LD a1, 0 * SIZE(AO)
  465. fclr c09
  466. LD a2, 1 * SIZE(AO)
  467. fclr c13
  468. LD a3, 2 * SIZE(AO)
  469. fclr c10
  470. LD a4, 3 * SIZE(AO)
  471. fclr c14
  472. LD b1, 0 * SIZE(BO)
  473. lda L, -2(TMP1)
  474. LD b2, 1 * SIZE(BO)
  475. lda AO, 2 * SIZE(AO)
  476. LD b3, 2 * SIZE(BO)
  477. fclr c01
  478. LD b4, 3 * SIZE(BO)
  479. fclr c05
  480. lda BO, 4 * SIZE(BO)
  481. fclr c02
  482. fclr c06
  483. ble TMP1, $L28
  484. ble L, $L25
  485. #endif
  486. .align 4
  487. $L22:
  488. ADD c09, t1, c09
  489. unop
  490. MUL a1, b1, t1
  491. unop
  492. ADD c10, t2, c10
  493. unop
  494. MUL a2, b1, t2
  495. LD b1, 0 * SIZE(BO)
  496. ADD c13, t3, c13
  497. unop
  498. MUL a1, b2, t3
  499. lda BO, 8 * SIZE(BO)
  500. ADD c14, t4, c14
  501. unop
  502. MUL a2, b2, t4
  503. LD b2, -7 * SIZE(BO)
  504. ADD c01, t1, c01
  505. unop
  506. MUL a1, b3, t1
  507. unop
  508. ADD c02, t2, c02
  509. unop
  510. MUL a2, b3, t2
  511. LD b3, -6 * SIZE(BO)
  512. ADD c05, t3, c05
  513. unop
  514. MUL a1, b4, t3
  515. LD a1, 2 * SIZE(AO)
  516. ADD c06, t4, c06
  517. MUL a2, b4, t4
  518. LD b5, -5 * SIZE(BO)
  519. ADD c09, t1, c09
  520. unop
  521. MUL a3, b1, t1
  522. LD a2, 3 * SIZE(AO)
  523. ADD c10, t2, c10
  524. unop
  525. MUL a4, b1, t2
  526. LD b1, -4 * SIZE(BO)
  527. ADD c13, t3, c13
  528. unop
  529. MUL a3, b2, t3
  530. lda AO, 4 * SIZE(AO)
  531. ADD c14, t4, c14
  532. MUL a4, b2, t4
  533. LD b2, -3 * SIZE(BO)
  534. ADD c01, t1, c01
  535. lda L, -2(L)
  536. MUL a3, b3, t1
  537. LD b4, -1 * SIZE(BO)
  538. ADD c02, t2, c02
  539. unop
  540. MUL a4, b3, t2
  541. LD b3, -2 * SIZE(BO)
  542. ADD c05, t3, c05
  543. unop
  544. MUL a3, b5, t3
  545. LD a3, 0 * SIZE(AO)
  546. ADD c06, t4, c06
  547. MUL a4, b5, t4
  548. LD a4, 1 * SIZE(AO)
  549. bgt L, $L22
  550. .align 4
  551. $L25:
  552. ADD c09, t1, c09
  553. MUL a1, b1, t1
  554. #if defined(LT) || defined(RN)
  555. blbs KK, $L27
  556. #else
  557. blbs TMP1, $L27
  558. #endif
  559. ADD c10, t2, c10
  560. unop
  561. MUL a2, b1, t2
  562. LD b1, 0 * SIZE(BO)
  563. ADD c13, t3, c13
  564. unop
  565. MUL a1, b2, t3
  566. unop
  567. ADD c14, t4, c14
  568. unop
  569. MUL a2, b2, t4
  570. LD b2, 1 * SIZE(BO)
  571. ADD c01, t1, c01
  572. unop
  573. MUL a1, b3, t1
  574. lda AO, 2 * SIZE(AO)
  575. ADD c02, t2, c02
  576. unop
  577. MUL a2, b3, t2
  578. LD b3, 2 * SIZE(BO)
  579. ADD c05, t3, c05
  580. unop
  581. MUL a1, b4, t3
  582. LD a1, -2 * SIZE(AO)
  583. ADD c06, t4, c06
  584. unop
  585. MUL a2, b4, t4
  586. LD a2, -1 * SIZE(AO)
  587. ADD c09, t1, c09
  588. LD b4, 3 * SIZE(BO)
  589. MUL a1, b1, t1
  590. lda BO, 4 * SIZE(BO)
  591. .align 4
  592. $L27:
  593. ADD c10, t2, c10
  594. MUL a2, b1, t2
  595. ADD c13, t3, c13
  596. MUL a1, b2, t3
  597. ADD c14, t4, c14
  598. MUL a2, b2, t4
  599. ADD c01, t1, c01
  600. MUL a1, b3, t1
  601. ADD c02, t2, c02
  602. MUL a2, b3, t2
  603. ADD c05, t3, c05
  604. MUL a1, b4, t3
  605. ADD c06, t4, c06
  606. lda AO, 2 * SIZE(AO)
  607. MUL a2, b4, t4
  608. lda BO, 4 * SIZE(BO)
  609. ADD c09, t1, c09
  610. ADD c10, t2, c10
  611. ADD c13, t3, c13
  612. ADD c14, t4, c14
  613. .align 4
  614. $L28:
  /* Solve and store step for the 2-row by 4-column tile held in
     c01/c02 (C1), c05/c06 (C2), c09/c10 (C3), c13/c14 (C4).
     Exactly one of the LN / LT / RN / RT solve sections below is assembled.
     NOTE(review): diagonal factors are applied with MUL rather than a
     divide, so the packed panels presumably hold reciprocals of the
     diagonal - confirm against the packing routine. */
  615. #if defined(LN) || defined(RT)
  /* Recompute AO and BO from AORIG and B: TMP1 = KK - 2 (LN) or KK - 4 (RT),
     scaled by 2 elements for AO and 4 elements for BO
     (assumes 1 << BASE_SHIFT is the element size - TODO confirm). */
  616. #ifdef LN
  617. subq KK, 2, TMP1
  618. #else
  619. subq KK, 4, TMP1
  620. #endif
  621. sll TMP1, BASE_SHIFT + 1, TMP2
  622. addq AORIG, TMP2, AO
  623. sll TMP1, BASE_SHIFT + 2, TMP2
  624. addq B, TMP2, BO
  625. #else
  /* LT / RN: step AO and BO back by one tile step (2 and 4 elements). */
  626. lda AO, -2 * SIZE(AO)
  627. lda BO, -4 * SIZE(BO)
  628. #endif
  629. #if defined(LN) || defined(LT)
  /* Load the 8 stored values from BO and replace each accumulator with
     (stored value - accumulator), assuming SUB x, y, z yields z = x - y.
     BO order is c01, c05, c09, c13, c02, c06, c10, c14. */
  630. LD a1, 0 * SIZE(BO)
  631. LD a2, 1 * SIZE(BO)
  632. LD a3, 2 * SIZE(BO)
  633. LD a4, 3 * SIZE(BO)
  634. LD b1, 4 * SIZE(BO)
  635. LD b2, 5 * SIZE(BO)
  636. LD b3, 6 * SIZE(BO)
  637. LD b4, 7 * SIZE(BO)
  638. SUB a1, c01, c01
  639. SUB a2, c05, c05
  640. SUB a3, c09, c09
  641. SUB a4, c13, c13
  642. SUB b1, c02, c02
  643. SUB b2, c06, c06
  644. SUB b3, c10, c10
  645. SUB b4, c14, c14
  646. #else
  /* RN / RT: same update, but the 8 stored values come from AO in the
     order c01, c02, c05, c06, c09, c10, c13, c14. */
  647. LD a1, 0 * SIZE(AO)
  648. LD a2, 1 * SIZE(AO)
  649. LD a3, 2 * SIZE(AO)
  650. LD a4, 3 * SIZE(AO)
  651. LD b1, 4 * SIZE(AO)
  652. LD b2, 5 * SIZE(AO)
  653. LD b3, 6 * SIZE(AO)
  654. LD b4, 7 * SIZE(AO)
  655. SUB a1, c01, c01
  656. SUB a2, c02, c02
  657. SUB a3, c05, c05
  658. SUB a4, c06, c06
  659. SUB b1, c09, c09
  660. SUB b2, c10, c10
  661. SUB b3, c13, c13
  662. SUB b4, c14, c14
  663. #endif
  664. #ifdef LN
  /* LN: 2x2 factor read from AO[3], AO[2], AO[0]; second row is scaled
     first, then eliminated from the first row, which is scaled last. */
  665. LD a1, 3 * SIZE(AO)
  666. LD a2, 2 * SIZE(AO)
  667. LD a3, 0 * SIZE(AO)
  668. MUL a1, c02, c02
  669. MUL a1, c06, c06
  670. MUL a1, c10, c10
  671. MUL a1, c14, c14
  672. MUL a2, c02, t1
  673. MUL a2, c06, t2
  674. MUL a2, c10, t3
  675. MUL a2, c14, t4
  676. SUB c01, t1, c01
  677. SUB c05, t2, c05
  678. SUB c09, t3, c09
  679. SUB c13, t4, c13
  680. MUL a3, c01, c01
  681. MUL a3, c05, c05
  682. MUL a3, c09, c09
  683. MUL a3, c13, c13
  684. #endif
  685. #ifdef LT
  /* LT: 2x2 factor read from AO[0], AO[1], AO[3]; first row is scaled
     first, then eliminated from the second row, which is scaled last. */
  686. LD a1, 0 * SIZE(AO)
  687. LD a2, 1 * SIZE(AO)
  688. LD a3, 3 * SIZE(AO)
  689. MUL a1, c01, c01
  690. MUL a1, c05, c05
  691. MUL a1, c09, c09
  692. MUL a1, c13, c13
  693. MUL a2, c01, t1
  694. MUL a2, c05, t2
  695. MUL a2, c09, t3
  696. MUL a2, c13, t4
  697. SUB c02, t1, c02
  698. SUB c06, t2, c06
  699. SUB c10, t3, c10
  700. SUB c14, t4, c14
  701. MUL a3, c02, c02
  702. MUL a3, c06, c06
  703. MUL a3, c10, c10
  704. MUL a3, c14, c14
  705. #endif
  706. #ifdef RN
  /* RN: 4x4 factor read from BO; columns are processed in the order
     C1, C2, C3, C4, each scaled and then eliminated from later columns. */
  707. LD a1, 0 * SIZE(BO)
  708. LD a2, 1 * SIZE(BO)
  709. LD a3, 2 * SIZE(BO)
  710. LD a4, 3 * SIZE(BO)
  711. MUL a1, c01, c01
  712. MUL a1, c02, c02
  713. MUL a2, c01, t1
  714. MUL a2, c02, t2
  715. SUB c05, t1, c05
  716. SUB c06, t2, c06
  717. MUL a3, c01, t1
  718. MUL a3, c02, t2
  719. SUB c09, t1, c09
  720. SUB c10, t2, c10
  721. MUL a4, c01, t1
  722. MUL a4, c02, t2
  723. SUB c13, t1, c13
  724. SUB c14, t2, c14
  725. LD b1, 5 * SIZE(BO)
  726. LD b2, 6 * SIZE(BO)
  727. LD b3, 7 * SIZE(BO)
  728. MUL b1, c05, c05
  729. MUL b1, c06, c06
  730. MUL b2, c05, t1
  731. MUL b2, c06, t2
  732. SUB c09, t1, c09
  733. SUB c10, t2, c10
  734. MUL b3, c05, t1
  735. MUL b3, c06, t2
  736. SUB c13, t1, c13
  737. SUB c14, t2, c14
  738. LD a1, 10 * SIZE(BO)
  739. LD a2, 11 * SIZE(BO)
  740. LD a3, 15 * SIZE(BO)
  741. MUL a1, c09, c09
  742. MUL a1, c10, c10
  743. MUL a2, c09, t1
  744. MUL a2, c10, t2
  745. SUB c13, t1, c13
  746. SUB c14, t2, c14
  747. MUL a3, c13, c13
  748. MUL a3, c14, c14
  749. #endif
  750. #ifdef RT
  /* RT: 4x4 factor read from BO in reverse; columns are processed in the
     order C4, C3, C2, C1, each scaled and then eliminated from earlier
     columns. */
  751. LD a1, 15 * SIZE(BO)
  752. LD a2, 14 * SIZE(BO)
  753. LD a3, 13 * SIZE(BO)
  754. LD a4, 12 * SIZE(BO)
  755. MUL a1, c13, c13
  756. MUL a1, c14, c14
  757. MUL a2, c13, t1
  758. MUL a2, c14, t2
  759. SUB c09, t1, c09
  760. SUB c10, t2, c10
  761. MUL a3, c13, t1
  762. MUL a3, c14, t2
  763. SUB c05, t1, c05
  764. SUB c06, t2, c06
  765. MUL a4, c13, t1
  766. MUL a4, c14, t2
  767. SUB c01, t1, c01
  768. SUB c02, t2, c02
  769. LD b1, 10 * SIZE(BO)
  770. LD b2, 9 * SIZE(BO)
  771. LD b3, 8 * SIZE(BO)
  772. MUL b1, c09, c09
  773. MUL b1, c10, c10
  774. MUL b2, c09, t1
  775. MUL b2, c10, t2
  776. SUB c05, t1, c05
  777. SUB c06, t2, c06
  778. MUL b3, c09, t1
  779. MUL b3, c10, t2
  780. SUB c01, t1, c01
  781. SUB c02, t2, c02
  782. LD a1, 5 * SIZE(BO)
  783. LD a2, 4 * SIZE(BO)
  784. LD a3, 0 * SIZE(BO)
  785. MUL a1, c05, c05
  786. MUL a1, c06, c06
  787. MUL a2, c05, t1
  788. MUL a2, c06, t2
  789. SUB c01, t1, c01
  790. SUB c02, t2, c02
  791. MUL a3, c01, c01
  792. MUL a3, c02, c02
  793. #endif
  /* Write the solved tile back to the packed buffer it was read from
     (BO for LN / LT, AO for RN / RT), using the same element order. */
  794. #if defined(LN) || defined(LT)
  795. ST c01, 0 * SIZE(BO)
  796. ST c05, 1 * SIZE(BO)
  797. ST c09, 2 * SIZE(BO)
  798. ST c13, 3 * SIZE(BO)
  799. ST c02, 4 * SIZE(BO)
  800. ST c06, 5 * SIZE(BO)
  801. ST c10, 6 * SIZE(BO)
  802. ST c14, 7 * SIZE(BO)
  803. #else
  804. ST c01, 0 * SIZE(AO)
  805. ST c02, 1 * SIZE(AO)
  806. ST c05, 2 * SIZE(AO)
  807. ST c06, 3 * SIZE(AO)
  808. ST c09, 4 * SIZE(AO)
  809. ST c10, 5 * SIZE(AO)
  810. ST c13, 6 * SIZE(AO)
  811. ST c14, 7 * SIZE(AO)
  812. #endif
  /* LN moves the C column pointers back by 2 elements before storing;
     every other variant moves them forward by 2 elements after storing. */
  813. #ifdef LN
  814. lda C1, -2 * SIZE(C1)
  815. lda C2, -2 * SIZE(C2)
  816. lda C3, -2 * SIZE(C3)
  817. lda C4, -2 * SIZE(C4)
  818. #endif
  819. ST c01, 0 * SIZE(C1)
  820. ST c02, 1 * SIZE(C1)
  821. ST c05, 0 * SIZE(C2)
  822. ST c06, 1 * SIZE(C2)
  823. ST c09, 0 * SIZE(C3)
  824. ST c10, 1 * SIZE(C3)
  825. ST c13, 0 * SIZE(C4)
  826. ST c14, 1 * SIZE(C4)
  827. #ifndef LN
  828. lda C1, 2 * SIZE(C1)
  829. lda C2, 2 * SIZE(C2)
  830. lda C3, 2 * SIZE(C3)
  831. lda C4, 2 * SIZE(C4)
  832. #endif
  /* Clear the product temporaries: the next tile's first ADDs consume
     t1..t4 before any MUL has written them. */
  833. fclr t1
  834. fclr t2
  835. fclr t3
  836. fclr t4
  837. #ifdef RT
  /* RT: AORIG advances by K * 2 elements. */
  838. sll K, 1 + BASE_SHIFT, TMP1
  839. addq AORIG, TMP1, AORIG
  840. #endif
  841. #if defined(LT) || defined(RN)
  /* LT / RN: skip the remaining (K - KK) steps of both panels,
     2 elements per step in AO and 4 per step in BO. */
  842. subq K, KK, TMP1
  843. sll TMP1, BASE_SHIFT + 1, TMP2
  844. addq AO, TMP2, AO
  845. sll TMP1, BASE_SHIFT + 2, TMP2
  846. addq BO, TMP2, BO
  847. #endif
  848. #ifdef LT
  849. addq KK, 2, KK
  850. #endif
  851. #ifdef LN
  852. subq KK, 2, KK
  853. #endif
  854. .align 4
  855. $L30:
  /* 4-row tiles: I = M >> 2 is the tile count; skip to $L39 when it is
     not positive. */
  856. sra M, 2, I
  857. ble I, $L39
  858. .align 4
  859. $L11:
  /* Per-tile setup for a 4x4 tile: preload a1..a4 and b1..b4, clear the
     sixteen accumulators c01..c16, and set the loop counter L.
     The fclr instructions are interleaved with the loads. */
  860. #if defined(LT) || defined(RN)
  /* LT / RN: A operands come from AO, B operands from B;
     the step count is KK and L = KK - 2. */
  861. LD a1, 0 * SIZE(AO)
  862. fclr c11
  863. LD a2, 1 * SIZE(AO)
  864. fclr c12
  865. LD a3, 2 * SIZE(AO)
  866. fclr c16
  867. LD a4, 3 * SIZE(AO)
  868. fclr c15
  869. LD b1, 0 * SIZE(B)
  870. fclr c01
  871. LD b2, 1 * SIZE(B)
  872. fclr c02
  873. LD b3, 2 * SIZE(B)
  874. fclr c06
  875. LD b4, 3 * SIZE(B)
  876. fclr c05
  /* Loads into $f31 discard their result; on Alpha, $f31 reads as zero
     and such loads are conventionally used as prefetch hints for the
     C rows about to be written. */
  877. lds $f31, 4 * SIZE(C1)
  878. fclr c03
  879. lda L, -2(KK)
  880. fclr c04
  881. lds $f31, 7 * SIZE(C2)
  882. fclr c08
  883. lda BO, 4 * SIZE(B)
  884. fclr c13
  885. lds $f31, 4 * SIZE(C3)
  886. fclr c09
  887. lda AO, 4 * SIZE(AO)
  888. fclr c10
  889. lds $f31, 7 * SIZE(C4)
  890. fclr c14
  891. fclr c07
  /* No accumulation steps at all: go straight to the solve at $L18. */
  892. ble KK, $L18
  893. #else
  /* LN / RT: for LN, AORIG first moves back by K * 4 elements.
     AO = AORIG + KK * 4 elements, BO = B + KK * 4 elements,
     and the step count is TMP1 = K - KK with L = TMP1 - 2. */
  894. #ifdef LN
  895. sll K, BASE_SHIFT + 2, TMP1
  896. subq AORIG, TMP1, AORIG
  897. #endif
  898. sll KK, BASE_SHIFT + 2, TMP1
  899. addq AORIG, TMP1, AO
  900. addq B, TMP1, BO
  901. subq K, KK, TMP1
  902. LD a1, 0 * SIZE(AO)
  903. fclr c11
  904. LD a2, 1 * SIZE(AO)
  905. fclr c12
  906. LD a3, 2 * SIZE(AO)
  907. fclr c16
  908. LD a4, 3 * SIZE(AO)
  909. fclr c15
  910. LD b1, 0 * SIZE(BO)
  911. fclr c01
  912. LD b2, 1 * SIZE(BO)
  913. fclr c02
  914. LD b3, 2 * SIZE(BO)
  915. fclr c06
  916. LD b4, 3 * SIZE(BO)
  917. fclr c05
  918. lds $f31, 4 * SIZE(C1)
  919. fclr c03
  920. lda L, -2(TMP1)
  921. fclr c04
  922. lds $f31, 7 * SIZE(C2)
  923. fclr c08
  924. lda BO, 4 * SIZE(BO)
  925. fclr c13
  926. lds $f31, 4 * SIZE(C3)
  927. fclr c09
  928. lda AO, 4 * SIZE(AO)
  929. fclr c10
  930. lds $f31, 7 * SIZE(C4)
  931. fclr c14
  932. fclr c07
  933. ble TMP1, $L18
  934. #endif
  /* Step count of 1 or 2 (L <= 0): skip the unrolled loop. */
  935. ble L, $L15
  936. .align 5
  937. $L12:
  /* Unrolled accumulation loop for the 4x4 tile.
     One pass covers two steps: L drops by 2 and AO / BO each advance by
     8 * SIZE. Groups 1-4 use a1..a4 / b1..b4; groups 5-8 use the values
     loaded during groups 1-4 (a5, a6, b5 plus reloaded a2, a4, b2..b4).
     Each ADD consumes the product a previous MUL left in t1..t4, so the
     statement order is part of the computation and must not be changed.
     unop / UNOP lines are padding between the arithmetic instructions. */
  938. /* 1 */
  939. ADD c11, t1, c11
  /* Loads into $31 discard their result; used here as prefetch hints
     for AO and BO unless EV4 is defined. */
  940. #ifndef EV4
  941. ldq $31, PREFETCHSIZE * SIZE(AO)
  942. #else
  943. unop
  944. #endif
  945. MUL b1, a1, t1
  946. #ifndef EV4
  947. ldl $31, PREFETCHSIZE * SIZE(BO)
  948. #else
  949. unop
  950. #endif
  951. ADD c12, t2, c12
  952. unop
  953. MUL b1, a2, t2
  954. unop
  955. ADD c16, t3, c16
  956. unop
  957. MUL b2, a2, t3
  958. LD a5, 0 * SIZE(AO)
  959. ADD c15, t4, c15
  960. unop
  961. MUL b2, a1, t4
  962. LD b5, 0 * SIZE(BO)
  963. /* 2 */
  964. ADD c01, t1, c01
  965. UNOP
  966. MUL b1, a3, t1
  967. UNOP
  968. ADD c02, t2, c02
  969. UNOP
  970. MUL b1, a4, t2
  971. UNOP
  972. ADD c06, t3, c06
  973. unop
  974. MUL b2, a4, t3
  975. unop
  976. ADD c05, t4, c05
  977. unop
  978. MUL b4, a1, t4
  979. unop
  980. /* 3 */
  981. ADD c03, t1, c03
  982. unop
  983. MUL b3, a1, t1
  984. unop
  985. ADD c04, t2, c04
  986. unop
  987. MUL b3, a2, t2
  988. unop
  989. ADD c08, t3, c08
  990. unop
  991. MUL b4, a2, t3
  992. LD a2, 1 * SIZE(AO)
  993. ADD c13, t4, c13
  994. unop
  995. MUL b2, a3, t4
  996. LD b2, 1 * SIZE(BO)
  997. /* 4 */
  998. ADD c09, t1, c09
  999. unop
  1000. MUL b3, a3, t1
  1001. LD a6, 2 * SIZE(AO)
  1002. ADD c10, t2, c10
  1003. unop
  1004. MUL b3, a4, t2
  1005. LD b3, 2 * SIZE(BO)
  1006. ADD c14, t3, c14
  1007. unop
  1008. MUL b4, a4, t3
  1009. LD a4, 3 * SIZE(AO)
  1010. ADD c07, t4, c07
  1011. unop
  1012. MUL b4, a3, t4
  1013. LD b4, 3 * SIZE(BO)
  1014. /* 5 */
  1015. ADD c11, t1, c11
  1016. unop
  1017. MUL b5, a5, t1
  1018. LD a1, 4 * SIZE(AO)
  1019. ADD c12, t2, c12
  1020. lda L, -2(L)
  1021. MUL b5, a2, t2
  1022. LD b1, 4 * SIZE(BO)
  1023. ADD c16, t3, c16
  1024. unop
  1025. MUL b2, a2, t3
  1026. unop
  1027. ADD c15, t4, c15
  1028. unop
  1029. MUL b2, a5, t4
  1030. unop
  1031. /* 6 */
  1032. ADD c01, t1, c01
  1033. unop
  1034. MUL b5, a6, t1
  1035. unop
  1036. ADD c02, t2, c02
  1037. unop
  1038. MUL b5, a4, t2
  1039. unop
  1040. ADD c06, t3, c06
  1041. unop
  1042. MUL b2, a4, t3
  1043. unop
  1044. ADD c05, t4, c05
  1045. unop
  1046. MUL b4, a5, t4
  1047. unop
  1048. /* 7 */
  1049. ADD c03, t1, c03
  /* AO and BO are bumped here, so the remaining loads in this pass use
     negative offsets. */
  1050. lda AO, 8 * SIZE(AO)
  1051. MUL b3, a5, t1
  1052. unop
  1053. ADD c04, t2, c04
  1054. lda BO, 8 * SIZE(BO)
  1055. MUL b3, a2, t2
  1056. unop
  1057. ADD c08, t3, c08
  1058. unop
  1059. MUL b4, a2, t3
  1060. LD a2, -3 * SIZE(AO)
  1061. ADD c13, t4, c13
  1062. unop
  1063. MUL b2, a6, t4
  1064. LD b2, -3 * SIZE(BO)
  1065. /* 8 */
  1066. ADD c09, t1, c09
  1067. unop
  1068. MUL b3, a6, t1
  1069. LD a3, -2 * SIZE(AO)
  1070. ADD c10, t2, c10
  1071. unop
  1072. MUL b3, a4, t2
  1073. LD b3, -2 * SIZE(BO)
  1074. ADD c14, t3, c14
  1075. unop
  1076. MUL b4, a4, t3
  1077. LD a4, -1 * SIZE(AO)
  1078. ADD c07, t4, c07
  1079. MUL b4, a6, t4
  1080. LD b4, -1 * SIZE(BO)
  /* Repeat while L is still positive. */
  1081. bgt L, $L12
  1082. .align 4
  1083. $L15:
  /* Loop remainder for the 4x4 tile.
     blbs branches when the low bit of the step count is set (KK for
     LT / RN, TMP1 for LN / RT): an odd count goes straight to the final
     step at $L17. An even count falls through and runs one more full
     step here, reloading a1..a4 / b1..b4 and advancing AO and BO by
     4 * SIZE each. */
  1084. ADD c11, t1, c11
  1085. MUL b1, a1, t1
  1086. #if defined(LT) || defined(RN)
  1087. blbs KK, $L17
  1088. #else
  1089. blbs TMP1, $L17
  1090. #endif
  1091. .align 4
  1092. ADD c12, t2, c12
  1093. MUL b1, a2, t2
  1094. ADD c16, t3, c16
  1095. MUL b2, a2, t3
  1096. ADD c15, t4, c15
  1097. MUL b2, a1, t4
  1098. ADD c01, t1, c01
  1099. MUL b1, a3, t1
  1100. ADD c02, t2, c02
  1101. unop
  1102. MUL b1, a4, t2
  1103. LD b1, 0 * SIZE(BO)
  1104. ADD c06, t3, c06
  1105. MUL b2, a4, t3
  1106. ADD c05, t4, c05
  1107. MUL b4, a1, t4
  1108. ADD c03, t1, c03
  1109. unop
  1110. MUL b3, a1, t1
  1111. LD a1, 0 * SIZE(AO)
  1112. ADD c04, t2, c04
  1113. unop
  1114. MUL b3, a2, t2
  1115. unop
  1116. ADD c08, t3, c08
  1117. unop
  1118. MUL b4, a2, t3
  1119. LD a2, 1 * SIZE(AO)
  1120. ADD c13, t4, c13
  1121. unop
  1122. MUL b2, a3, t4
  1123. LD b2, 1 * SIZE(BO)
  1124. ADD c09, t1, c09
  1125. unop
  1126. MUL b3, a3, t1
  /* AO is bumped before a3 / a4 are reloaded, hence the negative
     offsets on those two loads. */
  1127. lda AO, 4 * SIZE(AO)
  1128. ADD c10, t2, c10
  1129. unop
  1130. MUL b3, a4, t2
  1131. LD b3, 2 * SIZE(BO)
  1132. ADD c14, t3, c14
  1133. unop
  1134. MUL b4, a4, t3
  1135. LD a4, -1 * SIZE(AO)
  1136. ADD c07, t4, c07
  1137. unop
  1138. MUL b4, a3, t4
  1139. LD a3, -2 * SIZE(AO)
  1140. ADD c11, t1, c11
  1141. LD b4, 3 * SIZE(BO)
  1142. MUL b1, a1, t1
  1143. lda BO, 4 * SIZE(BO)
  1144. .align 4
  1145. $L17:
  /* Final step of the 4x4 accumulation: issue the remaining products
     with the operands already in a1..a4 / b1..b4 (no further loads),
     advance AO and BO by 4 * SIZE, then drain t1..t4 into
     c11, c12, c16 and c15. */
  1146. ADD c12, t2, c12
  1147. MUL b1, a2, t2
  1148. ADD c16, t3, c16
  1149. MUL b2, a2, t3
  1150. ADD c15, t4, c15
  1151. MUL b2, a1, t4
  1152. ADD c01, t1, c01
  1153. MUL b1, a3, t1
  1154. ADD c02, t2, c02
  1155. MUL b1, a4, t2
  1156. ADD c06, t3, c06
  1157. MUL b2, a4, t3
  1158. ADD c05, t4, c05
  1159. MUL b4, a1, t4
  1160. ADD c03, t1, c03
  1161. MUL b3, a1, t1
  1162. ADD c04, t2, c04
  1163. MUL b3, a2, t2
  1164. ADD c08, t3, c08
  1165. MUL b4, a2, t3
  1166. ADD c13, t4, c13
  1167. MUL b2, a3, t4
  1168. ADD c09, t1, c09
  1169. MUL b3, a3, t1
  1170. ADD c10, t2, c10
  1171. MUL b3, a4, t2
  1172. ADD c14, t3, c14
  1173. MUL b4, a4, t3
  1174. ADD c07, t4, c07
  1175. lda AO, 4 * SIZE(AO)
  1176. MUL b4, a3, t4
  1177. lda BO, 4 * SIZE(BO)
  1178. ADD c11, t1, c11
  1179. ADD c12, t2, c12
  1180. ADD c16, t3, c16
  1181. ADD c15, t4, c15
  1182. .align 4
  1183. $L18:
  /* Solve and store step for the 4x4 tile.
     Register layout, as fixed by the stores to C below:
       c01..c04 -> C1[0..3], c05..c08 -> C2[0..3],
       c09..c12 -> C3[0..3], c13..c16 -> C4[0..3].
     Exactly one of the LN / LT / RN / RT solve sections is assembled.
     NOTE(review): diagonal factors are applied with MUL rather than a
     divide, so the packed panels presumably hold reciprocals of the
     diagonal - confirm against the packing routine. */
  1184. #if defined(LN) || defined(RT)
  /* Recompute AO and BO from AORIG and B at offset (KK - 4) * 4 elements
     (assumes 1 << BASE_SHIFT is the element size - TODO confirm).
     Both arms of the LN conditional emit the same instruction here. */
  1185. #ifdef LN
  1186. subq KK, 4, TMP1
  1187. #else
  1188. subq KK, 4, TMP1
  1189. #endif
  1190. sll TMP1, BASE_SHIFT + 2, TMP2
  1191. addq AORIG, TMP2, AO
  1192. sll TMP1, BASE_SHIFT + 2, TMP2
  1193. addq B, TMP2, BO
  1194. #else
  /* LT / RN: step AO and BO back by one tile step (4 elements each). */
  1195. lda AO, -4 * SIZE(AO)
  1196. lda BO, -4 * SIZE(BO)
  1197. #endif
  1198. #if defined(LN) || defined(LT)
  /* Load the 16 stored values from BO and replace each accumulator with
     (stored value - accumulator), assuming SUB x, y, z yields z = x - y.
     BO holds the tile as c01, c05, c09, c13, c02, c06, ... c16. */
  1199. LD a1, 0 * SIZE(BO)
  1200. LD a2, 1 * SIZE(BO)
  1201. LD a3, 2 * SIZE(BO)
  1202. LD a4, 3 * SIZE(BO)
  1203. LD b1, 4 * SIZE(BO)
  1204. LD b2, 5 * SIZE(BO)
  1205. LD b3, 6 * SIZE(BO)
  1206. LD b4, 7 * SIZE(BO)
  1207. SUB a1, c01, c01
  1208. SUB a2, c05, c05
  1209. SUB a3, c09, c09
  1210. SUB a4, c13, c13
  1211. SUB b1, c02, c02
  1212. SUB b2, c06, c06
  1213. SUB b3, c10, c10
  1214. SUB b4, c14, c14
  1215. LD a1, 8 * SIZE(BO)
  1216. LD a2, 9 * SIZE(BO)
  1217. LD a3, 10 * SIZE(BO)
  1218. LD a4, 11 * SIZE(BO)
  1219. LD b1, 12 * SIZE(BO)
  1220. LD b2, 13 * SIZE(BO)
  1221. LD b3, 14 * SIZE(BO)
  1222. LD b4, 15 * SIZE(BO)
  1223. SUB a1, c03, c03
  1224. SUB a2, c07, c07
  1225. SUB a3, c11, c11
  1226. SUB a4, c15, c15
  1227. SUB b1, c04, c04
  1228. SUB b2, c08, c08
  1229. SUB b3, c12, c12
  1230. SUB b4, c16, c16
  1231. #else
  /* RN / RT: same update, but the 16 stored values come from AO in the
     order c01, c02, ... c16. */
  1232. LD a1, 0 * SIZE(AO)
  1233. LD a2, 1 * SIZE(AO)
  1234. LD a3, 2 * SIZE(AO)
  1235. LD a4, 3 * SIZE(AO)
  1236. LD b1, 4 * SIZE(AO)
  1237. LD b2, 5 * SIZE(AO)
  1238. LD b3, 6 * SIZE(AO)
  1239. LD b4, 7 * SIZE(AO)
  1240. SUB a1, c01, c01
  1241. SUB a2, c02, c02
  1242. SUB a3, c03, c03
  1243. SUB a4, c04, c04
  1244. SUB b1, c05, c05
  1245. SUB b2, c06, c06
  1246. SUB b3, c07, c07
  1247. SUB b4, c08, c08
  1248. LD a1, 8 * SIZE(AO)
  1249. LD a2, 9 * SIZE(AO)
  1250. LD a3, 10 * SIZE(AO)
  1251. LD a4, 11 * SIZE(AO)
  1252. LD b1, 12 * SIZE(AO)
  1253. LD b2, 13 * SIZE(AO)
  1254. LD b3, 14 * SIZE(AO)
  1255. LD b4, 15 * SIZE(AO)
  1256. SUB a1, c09, c09
  1257. SUB a2, c10, c10
  1258. SUB a3, c11, c11
  1259. SUB a4, c12, c12
  1260. SUB b1, c13, c13
  1261. SUB b2, c14, c14
  1262. SUB b3, c15, c15
  1263. SUB b4, c16, c16
  1264. #endif
  1265. #ifdef LN
  /* LN: 4x4 factor read from AO in reverse. Rows are processed in the
     order 4, 3, 2, 1 (c04/c08/c12/c16 first): each row is scaled and
     then eliminated from the rows above it. */
  1266. LD a1, 15 * SIZE(AO)
  1267. LD a2, 14 * SIZE(AO)
  1268. LD a3, 13 * SIZE(AO)
  1269. LD a4, 12 * SIZE(AO)
  1270. MUL a1, c04, c04
  1271. MUL a1, c08, c08
  1272. MUL a1, c12, c12
  1273. MUL a1, c16, c16
  1274. MUL a2, c04, t1
  1275. MUL a2, c08, t2
  1276. MUL a2, c12, t3
  1277. MUL a2, c16, t4
  1278. SUB c03, t1, c03
  1279. SUB c07, t2, c07
  1280. SUB c11, t3, c11
  1281. SUB c15, t4, c15
  1282. MUL a3, c04, t1
  1283. MUL a3, c08, t2
  1284. MUL a3, c12, t3
  1285. MUL a3, c16, t4
  1286. SUB c02, t1, c02
  1287. SUB c06, t2, c06
  1288. SUB c10, t3, c10
  1289. SUB c14, t4, c14
  1290. MUL a4, c04, t1
  1291. MUL a4, c08, t2
  1292. MUL a4, c12, t3
  1293. MUL a4, c16, t4
  1294. SUB c01, t1, c01
  1295. SUB c05, t2, c05
  1296. SUB c09, t3, c09
  1297. SUB c13, t4, c13
  1298. LD b1, 10 * SIZE(AO)
  1299. LD b2, 9 * SIZE(AO)
  1300. LD b3, 8 * SIZE(AO)
  1301. MUL b1, c03, c03
  1302. MUL b1, c07, c07
  1303. MUL b1, c11, c11
  1304. MUL b1, c15, c15
  1305. MUL b2, c03, t1
  1306. MUL b2, c07, t2
  1307. MUL b2, c11, t3
  1308. MUL b2, c15, t4
  1309. SUB c02, t1, c02
  1310. SUB c06, t2, c06
  1311. SUB c10, t3, c10
  1312. SUB c14, t4, c14
  1313. MUL b3, c03, t1
  1314. MUL b3, c07, t2
  1315. MUL b3, c11, t3
  1316. MUL b3, c15, t4
  1317. SUB c01, t1, c01
  1318. SUB c05, t2, c05
  1319. SUB c09, t3, c09
  1320. SUB c13, t4, c13
  1321. LD a1, 5 * SIZE(AO)
  1322. LD a2, 4 * SIZE(AO)
  1323. LD a3, 0 * SIZE(AO)
  1324. MUL a1, c02, c02
  1325. MUL a1, c06, c06
  1326. MUL a1, c10, c10
  1327. MUL a1, c14, c14
  1328. MUL a2, c02, t1
  1329. MUL a2, c06, t2
  1330. MUL a2, c10, t3
  1331. MUL a2, c14, t4
  1332. SUB c01, t1, c01
  1333. SUB c05, t2, c05
  1334. SUB c09, t3, c09
  1335. SUB c13, t4, c13
  1336. MUL a3, c01, c01
  1337. MUL a3, c05, c05
  1338. MUL a3, c09, c09
  1339. MUL a3, c13, c13
  1340. #endif
  1341. #ifdef LT
  /* LT: 4x4 factor read from AO in forward order. Rows are processed in
     the order 1, 2, 3, 4 (c01/c05/c09/c13 first): each row is scaled and
     then eliminated from the rows below it. */
  1342. LD a1, 0 * SIZE(AO)
  1343. LD a2, 1 * SIZE(AO)
  1344. LD a3, 2 * SIZE(AO)
  1345. LD a4, 3 * SIZE(AO)
  1346. MUL a1, c01, c01
  1347. MUL a1, c05, c05
  1348. MUL a1, c09, c09
  1349. MUL a1, c13, c13
  1350. MUL a2, c01, t1
  1351. MUL a2, c05, t2
  1352. MUL a2, c09, t3
  1353. MUL a2, c13, t4
  1354. SUB c02, t1, c02
  1355. SUB c06, t2, c06
  1356. SUB c10, t3, c10
  1357. SUB c14, t4, c14
  1358. MUL a3, c01, t1
  1359. MUL a3, c05, t2
  1360. MUL a3, c09, t3
  1361. MUL a3, c13, t4
  1362. SUB c03, t1, c03
  1363. SUB c07, t2, c07
  1364. SUB c11, t3, c11
  1365. SUB c15, t4, c15
  1366. MUL a4, c01, t1
  1367. MUL a4, c05, t2
  1368. MUL a4, c09, t3
  1369. MUL a4, c13, t4
  1370. SUB c04, t1, c04
  1371. SUB c08, t2, c08
  1372. SUB c12, t3, c12
  1373. SUB c16, t4, c16
  1374. LD b1, 5 * SIZE(AO)
  1375. LD b2, 6 * SIZE(AO)
  1376. LD b3, 7 * SIZE(AO)
  1377. MUL b1, c02, c02
  1378. MUL b1, c06, c06
  1379. MUL b1, c10, c10
  1380. MUL b1, c14, c14
  1381. MUL b2, c02, t1
  1382. MUL b2, c06, t2
  1383. MUL b2, c10, t3
  1384. MUL b2, c14, t4
  1385. SUB c03, t1, c03
  1386. SUB c07, t2, c07
  1387. SUB c11, t3, c11
  1388. SUB c15, t4, c15
  1389. MUL b3, c02, t1
  1390. MUL b3, c06, t2
  1391. MUL b3, c10, t3
  1392. MUL b3, c14, t4
  1393. SUB c04, t1, c04
  1394. SUB c08, t2, c08
  1395. SUB c12, t3, c12
  1396. SUB c16, t4, c16
  1397. LD a1, 10 * SIZE(AO)
  1398. LD a2, 11 * SIZE(AO)
  1399. LD a3, 15 * SIZE(AO)
  1400. MUL a1, c03, c03
  1401. MUL a1, c07, c07
  1402. MUL a1, c11, c11
  1403. MUL a1, c15, c15
  1404. MUL a2, c03, t1
  1405. MUL a2, c07, t2
  1406. MUL a2, c11, t3
  1407. MUL a2, c15, t4
  1408. SUB c04, t1, c04
  1409. SUB c08, t2, c08
  1410. SUB c12, t3, c12
  1411. SUB c16, t4, c16
  1412. MUL a3, c04, c04
  1413. MUL a3, c08, c08
  1414. MUL a3, c12, c12
  1415. MUL a3, c16, c16
  1416. #endif
  1417. #ifdef RN
  /* RN: 4x4 factor read from BO in forward order. Columns are processed
     in the order C1, C2, C3, C4: each column is scaled and then
     eliminated from the later columns. */
  1418. LD a1, 0 * SIZE(BO)
  1419. LD a2, 1 * SIZE(BO)
  1420. LD a3, 2 * SIZE(BO)
  1421. LD a4, 3 * SIZE(BO)
  1422. MUL a1, c01, c01
  1423. MUL a1, c02, c02
  1424. MUL a1, c03, c03
  1425. MUL a1, c04, c04
  1426. MUL a2, c01, t1
  1427. MUL a2, c02, t2
  1428. MUL a2, c03, t3
  1429. MUL a2, c04, t4
  1430. SUB c05, t1, c05
  1431. SUB c06, t2, c06
  1432. SUB c07, t3, c07
  1433. SUB c08, t4, c08
  1434. MUL a3, c01, t1
  1435. MUL a3, c02, t2
  1436. MUL a3, c03, t3
  1437. MUL a3, c04, t4
  1438. SUB c09, t1, c09
  1439. SUB c10, t2, c10
  1440. SUB c11, t3, c11
  1441. SUB c12, t4, c12
  1442. MUL a4, c01, t1
  1443. MUL a4, c02, t2
  1444. MUL a4, c03, t3
  1445. MUL a4, c04, t4
  1446. SUB c13, t1, c13
  1447. SUB c14, t2, c14
  1448. SUB c15, t3, c15
  1449. SUB c16, t4, c16
  1450. LD b1, 5 * SIZE(BO)
  1451. LD b2, 6 * SIZE(BO)
  1452. LD b3, 7 * SIZE(BO)
  1453. MUL b1, c05, c05
  1454. MUL b1, c06, c06
  1455. MUL b1, c07, c07
  1456. MUL b1, c08, c08
  1457. MUL b2, c05, t1
  1458. MUL b2, c06, t2
  1459. MUL b2, c07, t3
  1460. MUL b2, c08, t4
  1461. SUB c09, t1, c09
  1462. SUB c10, t2, c10
  1463. SUB c11, t3, c11
  1464. SUB c12, t4, c12
  1465. MUL b3, c05, t1
  1466. MUL b3, c06, t2
  1467. MUL b3, c07, t3
  1468. MUL b3, c08, t4
  1469. SUB c13, t1, c13
  1470. SUB c14, t2, c14
  1471. SUB c15, t3, c15
  1472. SUB c16, t4, c16
  1473. LD a1, 10 * SIZE(BO)
  1474. LD a2, 11 * SIZE(BO)
  1475. LD a3, 15 * SIZE(BO)
  1476. MUL a1, c09, c09
  1477. MUL a1, c10, c10
  1478. MUL a1, c11, c11
  1479. MUL a1, c12, c12
  1480. MUL a2, c09, t1
  1481. MUL a2, c10, t2
  1482. MUL a2, c11, t3
  1483. MUL a2, c12, t4
  1484. SUB c13, t1, c13
  1485. SUB c14, t2, c14
  1486. SUB c15, t3, c15
  1487. SUB c16, t4, c16
  1488. MUL a3, c13, c13
  1489. MUL a3, c14, c14
  1490. MUL a3, c15, c15
  1491. MUL a3, c16, c16
  1492. #endif
  1493. #ifdef RT
  /* RT: 4x4 factor read from BO in reverse. Columns are processed in the
     order C4, C3, C2, C1: each column is scaled and then eliminated from
     the earlier columns. */
  1494. LD a1, 15 * SIZE(BO)
  1495. LD a2, 14 * SIZE(BO)
  1496. LD a3, 13 * SIZE(BO)
  1497. LD a4, 12 * SIZE(BO)
  1498. MUL a1, c13, c13
  1499. MUL a1, c14, c14
  1500. MUL a1, c15, c15
  1501. MUL a1, c16, c16
  1502. MUL a2, c13, t1
  1503. MUL a2, c14, t2
  1504. MUL a2, c15, t3
  1505. MUL a2, c16, t4
  1506. SUB c09, t1, c09
  1507. SUB c10, t2, c10
  1508. SUB c11, t3, c11
  1509. SUB c12, t4, c12
  1510. MUL a3, c13, t1
  1511. MUL a3, c14, t2
  1512. MUL a3, c15, t3
  1513. MUL a3, c16, t4
  1514. SUB c05, t1, c05
  1515. SUB c06, t2, c06
  1516. SUB c07, t3, c07
  1517. SUB c08, t4, c08
  1518. MUL a4, c13, t1
  1519. MUL a4, c14, t2
  1520. MUL a4, c15, t3
  1521. MUL a4, c16, t4
  1522. SUB c01, t1, c01
  1523. SUB c02, t2, c02
  1524. SUB c03, t3, c03
  1525. SUB c04, t4, c04
  1526. LD b1, 10 * SIZE(BO)
  1527. LD b2, 9 * SIZE(BO)
  1528. LD b3, 8 * SIZE(BO)
  1529. MUL b1, c09, c09
  1530. MUL b1, c10, c10
  1531. MUL b1, c11, c11
  1532. MUL b1, c12, c12
  1533. MUL b2, c09, t1
  1534. MUL b2, c10, t2
  1535. MUL b2, c11, t3
  1536. MUL b2, c12, t4
  1537. SUB c05, t1, c05
  1538. SUB c06, t2, c06
  1539. SUB c07, t3, c07
  1540. SUB c08, t4, c08
  1541. MUL b3, c09, t1
  1542. MUL b3, c10, t2
  1543. MUL b3, c11, t3
  1544. MUL b3, c12, t4
  1545. SUB c01, t1, c01
  1546. SUB c02, t2, c02
  1547. SUB c03, t3, c03
  1548. SUB c04, t4, c04
  1549. LD a1, 5 * SIZE(BO)
  1550. LD a2, 4 * SIZE(BO)
  1551. LD a3, 0 * SIZE(BO)
  1552. MUL a1, c05, c05
  1553. MUL a1, c06, c06
  1554. MUL a1, c07, c07
  1555. MUL a1, c08, c08
  1556. MUL a2, c05, t1
  1557. MUL a2, c06, t2
  1558. MUL a2, c07, t3
  1559. MUL a2, c08, t4
  1560. SUB c01, t1, c01
  1561. SUB c02, t2, c02
  1562. SUB c03, t3, c03
  1563. SUB c04, t4, c04
  1564. MUL a3, c01, c01
  1565. MUL a3, c02, c02
  1566. MUL a3, c03, c03
  1567. MUL a3, c04, c04
  1568. #endif
  /* Write the solved tile back to the packed buffer it was read from
     (BO for LN / LT, AO for RN / RT), using the same element order. */
  1569. #if defined(LN) || defined(LT)
  1570. ST c01, 0 * SIZE(BO)
  1571. ST c05, 1 * SIZE(BO)
  1572. ST c09, 2 * SIZE(BO)
  1573. ST c13, 3 * SIZE(BO)
  1574. ST c02, 4 * SIZE(BO)
  1575. ST c06, 5 * SIZE(BO)
  1576. ST c10, 6 * SIZE(BO)
  1577. ST c14, 7 * SIZE(BO)
  1578. ST c03, 8 * SIZE(BO)
  1579. ST c07, 9 * SIZE(BO)
  1580. ST c11, 10 * SIZE(BO)
  1581. ST c15, 11 * SIZE(BO)
  1582. ST c04, 12 * SIZE(BO)
  1583. ST c08, 13 * SIZE(BO)
  1584. ST c12, 14 * SIZE(BO)
  1585. ST c16, 15 * SIZE(BO)
  1586. #else
  1587. ST c01, 0 * SIZE(AO)
  1588. ST c02, 1 * SIZE(AO)
  1589. ST c03, 2 * SIZE(AO)
  1590. ST c04, 3 * SIZE(AO)
  1591. ST c05, 4 * SIZE(AO)
  1592. ST c06, 5 * SIZE(AO)
  1593. ST c07, 6 * SIZE(AO)
  1594. ST c08, 7 * SIZE(AO)
  1595. ST c09, 8 * SIZE(AO)
  1596. ST c10, 9 * SIZE(AO)
  1597. ST c11, 10 * SIZE(AO)
  1598. ST c12, 11 * SIZE(AO)
  1599. ST c13, 12 * SIZE(AO)
  1600. ST c14, 13 * SIZE(AO)
  1601. ST c15, 14 * SIZE(AO)
  1602. ST c16, 15 * SIZE(AO)
  1603. #endif
  /* LN moves the C column pointers back by 4 elements before storing;
     every other variant moves them forward by 4 elements after storing. */
  1604. #ifdef LN
  1605. lda C1, -4 * SIZE(C1)
  1606. lda C2, -4 * SIZE(C2)
  1607. lda C3, -4 * SIZE(C3)
  1608. lda C4, -4 * SIZE(C4)
  1609. #endif
  1610. ST c01, 0 * SIZE(C1)
  1611. ST c02, 1 * SIZE(C1)
  1612. ST c03, 2 * SIZE(C1)
  1613. ST c04, 3 * SIZE(C1)
  1614. ST c05, 0 * SIZE(C2)
  1615. ST c06, 1 * SIZE(C2)
  1616. ST c07, 2 * SIZE(C2)
  1617. ST c08, 3 * SIZE(C2)
  1618. ST c09, 0 * SIZE(C3)
  1619. ST c10, 1 * SIZE(C3)
  1620. ST c11, 2 * SIZE(C3)
  1621. ST c12, 3 * SIZE(C3)
  1622. ST c13, 0 * SIZE(C4)
  1623. ST c14, 1 * SIZE(C4)
  1624. ST c15, 2 * SIZE(C4)
  1625. ST c16, 3 * SIZE(C4)
  1626. #ifndef LN
  1627. lda C1, 4 * SIZE(C1)
  1628. lda C2, 4 * SIZE(C2)
  1629. lda C3, 4 * SIZE(C3)
  1630. lda C4, 4 * SIZE(C4)
  1631. #endif
  /* Clear the product temporaries: the next tile's first ADDs consume
     t1..t4 before any MUL has written them. */
  1632. fclr t1
  1633. fclr t2
  1634. fclr t3
  1635. fclr t4
  1636. #ifdef RT
  /* RT: AORIG advances by K * 4 elements. */
  1637. sll K, 2 + BASE_SHIFT, TMP1
  1638. addq AORIG, TMP1, AORIG
  1639. #endif
  1640. #if defined(LT) || defined(RN)
  /* LT / RN: skip the remaining (K - KK) steps of both panels,
     4 elements per step in each. */
  1641. subq K, KK, TMP1
  1642. sll TMP1, BASE_SHIFT + 2, TMP1
  1643. addq AO, TMP1, AO
  1644. addq BO, TMP1, BO
  1645. #endif
  1646. #ifdef LT
  1647. addq KK, 4, KK
  1648. #endif
  1649. #ifdef LN
  1650. subq KK, 4, KK
  1651. #endif
  /* Next 4-row tile: decrement I and loop while it is positive. */
  1652. lda I, -1(I)
  1653. bgt I, $L11
  1654. .align 4
  1655. $L39:
  /* End of one 4-column panel.
     LN:      B advances by K * 4 elements.
     LT / RN: B takes the current value of BO.
     RN / RT: KK moves by +4 / -4.
     Then J is decremented and control returns to $L01 (defined above
     this excerpt) while J is positive. */
  1656. #ifdef LN
  1657. sll K, 2 + BASE_SHIFT, TMP1
  1658. addq B, TMP1, B
  1659. #endif
  1660. #if defined(LT) || defined(RN)
  1661. mov BO, B
  1662. #endif
  1663. #ifdef RN
  1664. addq KK, 4, KK
  1665. #endif
  1666. #ifdef RT
  1667. subq KK, 4, KK
  1668. #endif
  1669. lda J, -1(J)
  1670. bgt J, $L01
  1671. .align 4
  1672. $L40:
  /* Two-column remainder: runs only when bit 1 of N is set; otherwise
     jumps to $L80 (defined below this excerpt). */
  1673. and N, 2, J
  1674. ble J, $L80
  1675. #ifdef RT
  /* RT: B moves back by K * 2 elements and C by 2 * LDC. */
  1676. sll K, 1 + BASE_SHIFT, TMP1
  1677. subq B, TMP1, B
  1678. addq LDC, LDC, TMP1
  1679. subq C, TMP1, C
  1680. #endif
  /* C1 and C2 address the two C columns of this panel; for non-RT
     variants C then advances past them. */
  1681. mov C, C1
  1682. addq C, LDC, C2
  1683. fclr t1
  1684. #ifndef RT
  1685. addq C2, LDC, C
  1686. #endif
  1687. fclr t2
  /* KK start value: M + OFFSET for LN, OFFSET for LT. */
  1688. #ifdef LN
  1689. addq M, OFFSET, KK
  1690. #endif
  1691. #ifdef LT
  1692. mov OFFSET, KK
  1693. #endif
  1694. #if defined(LN) || defined(RT)
  1695. mov A, AORIG
  1696. #else
  1697. mov A, AO
  1698. #endif
  1699. fclr t3
  1700. fclr t4
  /* Single-row remainder (bit 0 of M) is handled first; skip to $L60 when
     it is clear. */
  1701. and M, 1, I
  1702. ble I, $L60
  /* Setup for the 1-row by 2-column tile: preload a1, a2 and b1..b4,
     clear c01, c05, c02, c06, and set L to (step count - 2). */
  1703. #if defined(LT) || defined(RN)
  1704. LD a1, 0 * SIZE(AO)
  1705. fclr c01
  1706. LD a2, 1 * SIZE(AO)
  1707. fclr c05
  1708. LD b1, 0 * SIZE(B)
  1709. fclr c02
  1710. LD b2, 1 * SIZE(B)
  1711. fclr c06
  1712. lda L, -2(KK)
  1713. LD b3, 2 * SIZE(B)
  1714. lda AO, 1 * SIZE(AO)
  1715. LD b4, 3 * SIZE(B)
  1716. lda BO, 2 * SIZE(B)
  /* No steps: go to the solve at $L78. One or two steps: skip the loop. */
  1717. ble KK, $L78
  1718. ble L, $L75
  1719. #else
  /* LN / RT: for LN, AORIG first moves back by K elements.
     AO = AORIG + KK elements, BO = B + KK * 2 elements,
     step count TMP1 = K - KK. */
  1720. #ifdef LN
  1721. sll K, BASE_SHIFT + 0, TMP1
  1722. subq AORIG, TMP1, AORIG
  1723. #endif
  1724. sll KK, BASE_SHIFT + 0, TMP1
  1725. addq AORIG, TMP1, AO
  1726. sll KK, BASE_SHIFT + 1, TMP1
  1727. addq B, TMP1, BO
  1728. subq K, KK, TMP1
  1729. LD a1, 0 * SIZE(AO)
  1730. fclr c01
  1731. LD a2, 1 * SIZE(AO)
  1732. fclr c05
  1733. LD b1, 0 * SIZE(BO)
  1734. fclr c02
  1735. LD b2, 1 * SIZE(BO)
  1736. fclr c06
  1737. lda L, -2(TMP1)
  1738. LD b3, 2 * SIZE(BO)
  1739. lda AO, 1 * SIZE(AO)
  1740. LD b4, 3 * SIZE(BO)
  1741. lda BO, 2 * SIZE(BO)
  1742. ble TMP1, $L78
  1743. ble L, $L75
  1744. #endif
  1745. .align 4
  1746. $L72:
  /* Unrolled accumulation loop for the 1x2 tile: two steps per pass.
     The first step (a1 with b1, b2) accumulates into c01 / c05 and the
     second (a2 with b3, b4) into c02 / c06; the two pairs are summed
     later at $L77. Per pass L drops by 2, AO advances by 2 * SIZE and
     BO by 4 * SIZE. */
  1747. ADD c01, t1, c01
  1748. lda L, -2(L)
  1749. MUL a1, b1, t1
  1750. LD b1, 2 * SIZE(BO)
  1751. ADD c05, t2, c05
  1752. MUL a1, b2, t2
  1753. LD a1, 1 * SIZE(AO)
  1754. LD b2, 3 * SIZE(BO)
  1755. ADD c02, t3, c02
  1756. lda AO, 2 * SIZE(AO)
  1757. MUL a2, b3, t3
  1758. LD b3, 4 * SIZE(BO)
  1759. ADD c06, t4, c06
  1760. MUL a2, b4, t4
  1761. LD a2, 0 * SIZE(AO)
  1762. LD b4, 5 * SIZE(BO)
  1763. lda BO, 4 * SIZE(BO)
  1764. unop
  1765. unop
  1766. bgt L, $L72
  1767. .align 4
  1768. $L75:
  /* Loop remainder for the 1x2 tile.
     blbs branches when the low bit of the step count is set (KK for
     LT / RN, TMP1 for LN / RT): an odd count goes straight to $L77.
     An even count falls through and runs one more step here, reloading
     a1, b1, b2 and advancing AO by 1 * SIZE and BO by 2 * SIZE. */
  1769. ADD c01, t1, c01
  1770. MUL a1, b1, t1
  1771. #if defined(LT) || defined(RN)
  1772. blbs KK, $L77
  1773. #else
  1774. blbs TMP1, $L77
  1775. #endif
  1776. .align 4
  1777. ADD c05, t2, c05
  1778. MUL a1, b2, t2
  1779. LD a1, 0 * SIZE(AO)
  1780. LD b1, 0 * SIZE(BO)
  1781. ADD c01, t1, c01
  1782. LD b2, 1 * SIZE(BO)
  1783. lda AO, 1 * SIZE(AO)
  1784. MUL a1, b1, t1
  1785. lda BO, 2 * SIZE(BO)
  1786. .align 4
  1787. $L77:
  /* Final step of the 1x2 accumulation: fold the pending products into
     c05, c02 and c06, merge the second partial sums into the first
     (c01 += c02, c05 += c06), advance AO by 1 * SIZE and BO by 2 * SIZE,
     then add the last two products from t1 and t2. */
  1788. ADD c05, t2, c05
  1789. MUL a1, b2, t2
  1790. ADD c02, t3, c02
  1791. ADD c06, t4, c06
  1792. ADD c01, c02, c01
  1793. lda AO, 1 * SIZE(AO)
  1794. ADD c05, c06, c05
  1795. lda BO, 2 * SIZE(BO)
  1796. ADD c01, t1, c01
  1797. ADD c05, t2, c05
  1798. .align 4
  1799. $L78:
  /* Solve and store step for the 1-row by 2-column tile: c01 belongs to
     column C1 and c05 to column C2.
     NOTE(review): diagonal factors are applied with MUL rather than a
     divide, so the packed panels presumably hold reciprocals of the
     diagonal - confirm against the packing routine. */
  1800. #if defined(LN) || defined(RT)
  /* Recompute AO and BO from AORIG and B: TMP1 = KK - 1 (LN) or KK - 2 (RT),
     scaled by 1 element for AO and 2 elements for BO
     (assumes 1 << BASE_SHIFT is the element size - TODO confirm). */
  1801. #ifdef LN
  1802. subq KK, 1, TMP1
  1803. #else
  1804. subq KK, 2, TMP1
  1805. #endif
  1806. sll TMP1, BASE_SHIFT + 0, TMP2
  1807. addq AORIG, TMP2, AO
  1808. sll TMP1, BASE_SHIFT + 1, TMP2
  1809. addq B, TMP2, BO
  1810. #else
  /* LT / RN: step AO back by 1 element and BO by 2 elements. */
  1811. lda AO, -1 * SIZE(AO)
  1812. lda BO, -2 * SIZE(BO)
  1813. #endif
  /* Replace each accumulator with (stored value - accumulator), assuming
     SUB x, y, z yields z = x - y; the stored values come from BO for
     LN / LT and from AO for RN / RT. */
  1814. #if defined(LN) || defined(LT)
  1815. LD a1, 0 * SIZE(BO)
  1816. LD a2, 1 * SIZE(BO)
  1817. SUB a1, c01, c01
  1818. SUB a2, c05, c05
  1819. #else
  1820. LD a1, 0 * SIZE(AO)
  1821. LD a2, 1 * SIZE(AO)
  1822. SUB a1, c01, c01
  1823. SUB a2, c05, c05
  1824. #endif
  1825. #if defined(LN) || defined(LT)
  /* LN / LT: one row, so a single factor AO[0] scales both values. */
  1826. LD a1, 0 * SIZE(AO)
  1827. MUL a1, c01, c01
  1828. MUL a1, c05, c05
  1829. #endif
  1830. #ifdef RN
  /* RN: 2x2 factor from BO[0], BO[1], BO[3]; c01 is solved first. */
  1831. LD a1, 0 * SIZE(BO)
  1832. LD a2, 1 * SIZE(BO)
  1833. LD a3, 3 * SIZE(BO)
  1834. MUL a1, c01, c01
  1835. MUL a2, c01, t1
  1836. SUB c05, t1, c05
  1837. MUL a3, c05, c05
  1838. #endif
  1839. #ifdef RT
  /* RT: 2x2 factor from BO[3], BO[2], BO[0]; c05 is solved first. */
  1840. LD a1, 3 * SIZE(BO)
  1841. LD a2, 2 * SIZE(BO)
  1842. LD a3, 0 * SIZE(BO)
  1843. MUL a1, c05, c05
  1844. MUL a2, c05, t1
  1845. SUB c01, t1, c01
  1846. MUL a3, c01, c01
  1847. #endif
  /* Write the solved pair back to the packed buffer it was read from. */
  1848. #if defined(LN) || defined(LT)
  1849. ST c01, 0 * SIZE(BO)
  1850. ST c05, 1 * SIZE(BO)
  1851. #else
  1852. ST c01, 0 * SIZE(AO)
  1853. ST c05, 1 * SIZE(AO)
  1854. #endif
  /* LN moves C1 / C2 back by one element before storing.
     NOTE(review): unlike the wider tiles in this file, there is no
     post-store advance of C1 / C2 for the non-LN variants here -
     confirm whether that is intended. */
  1855. #ifdef LN
  1856. lda C1, -1 * SIZE(C1)
  1857. lda C2, -1 * SIZE(C2)
  1858. #endif
  1859. ST c01, 0 * SIZE(C1)
  1860. ST c05, 0 * SIZE(C2)
  /* Clear the product temporaries for the next tile. */
  1861. fclr t1
  1862. fclr t2
  1863. fclr t3
  1864. fclr t4
  1865. #ifdef RT
  /* RT: AORIG advances by K elements. */
  1866. sll K, 0 + BASE_SHIFT, TMP1
  1867. addq AORIG, TMP1, AORIG
  1868. #endif
  1869. #if defined(LT) || defined(RN)
  /* LT / RN: skip the remaining (K - KK) steps, 1 element per step in AO
     and 2 per step in BO. */
  1870. subq K, KK, TMP1
  1871. sll TMP1, BASE_SHIFT + 0, TMP2
  1872. addq AO, TMP2, AO
  1873. sll TMP1, BASE_SHIFT + 1, TMP2
  1874. addq BO, TMP2, BO
  1875. #endif
  1876. #ifdef LT
  1877. addq KK, 1, KK
  1878. #endif
  1879. #ifdef LN
  1880. subq KK, 1, KK
  1881. #endif
  1882. .align 4
  1883. $L60:
  /* Two-row remainder within the two-column panel: runs only when bit 1
     of M is set; otherwise jumps to $L70 (defined below this excerpt).
     Setup preloads a1..a4 and b1..b4, clears c01, c05, c02, c06 and sets
     L to (step count - 2). */
  1884. and M, 2, I
  1885. ble I, $L70
  1886. #if defined(LT) || defined(RN)
  /* LT / RN: A operands from AO, B operands from B, step count KK. */
  1887. LD a1, 0 * SIZE(AO)
  1888. fclr c01
  1889. LD a2, 1 * SIZE(AO)
  1890. fclr c05
  1891. LD a3, 2 * SIZE(AO)
  1892. fclr c02
  1893. LD a4, 3 * SIZE(AO)
  1894. fclr c06
  1895. LD b1, 0 * SIZE(B)
  1896. lda L, -2(KK)
  1897. LD b2, 1 * SIZE(B)
  1898. lda AO, 2 * SIZE(AO)
  1899. LD b3, 2 * SIZE(B)
  1900. LD b4, 3 * SIZE(B)
  1901. lda BO, 2 * SIZE(B)
  /* No steps: go to the solve at $L68. One or two steps: skip the loop. */
  1902. ble KK, $L68
  1903. ble L, $L65
  1904. #else
  /* LN / RT: for LN, AORIG first moves back by K * 2 elements.
     AO = AORIG + KK * 2 elements, BO = B + KK * 2 elements,
     step count TMP1 = K - KK. */
  1905. #ifdef LN
  1906. sll K, BASE_SHIFT + 1, TMP1
  1907. subq AORIG, TMP1, AORIG
  1908. #endif
  1909. sll KK, BASE_SHIFT + 1, TMP1
  1910. addq AORIG, TMP1, AO
  1911. sll KK, BASE_SHIFT + 1, TMP1
  1912. addq B, TMP1, BO
  1913. subq K, KK, TMP1
  1914. LD a1, 0 * SIZE(AO)
  1915. fclr c01
  1916. LD a2, 1 * SIZE(AO)
  1917. fclr c05
  1918. LD a3, 2 * SIZE(AO)
  1919. fclr c02
  1920. LD a4, 3 * SIZE(AO)
  1921. fclr c06
  1922. LD b1, 0 * SIZE(BO)
  1923. lda L, -2(TMP1)
  1924. LD b2, 1 * SIZE(BO)
  1925. lda AO, 2 * SIZE(AO)
  1926. LD b3, 2 * SIZE(BO)
  1927. LD b4, 3 * SIZE(BO)
  1928. lda BO, 2 * SIZE(BO)
  1929. ble TMP1, $L68
  1930. ble L, $L65
  1931. #endif
  1932. .align 4
  1933. $L62:
  /* Unrolled accumulation loop for the 2x2 tile: two steps per pass.
     The first step uses a1, a2 with b1, b2; the second uses a3, a4 with
     b3, b4. Both accumulate into c01, c02, c05, c06 through t1..t4.
     Per pass L drops by 2 and AO / BO each advance by 4 * SIZE; operands
     for the next pass are reloaded as the current ones are consumed. */
  1934. ADD c01, t1, c01
  1935. unop
  1936. MUL a1, b1, t1
  1937. unop
  1938. ADD c02, t2, c02
  1939. lda AO, 4 * SIZE(AO)
  1940. MUL a2, b1, t2
  1941. LD b1, 2 * SIZE(BO)
  1942. ADD c05, t3, c05
  1943. lda L, -2(L)
  1944. MUL a1, b2, t3
  1945. LD a1, -2 * SIZE(AO)
  1946. ADD c06, t4, c06
  1947. unop
  1948. MUL a2, b2, t4
  1949. LD a2, -1 * SIZE(AO)
  1950. ADD c01, t1, c01
  1951. LD b2, 3 * SIZE(BO)
  1952. MUL a3, b3, t1
  1953. lda BO, 4 * SIZE(BO)
  1954. ADD c02, t2, c02
  1955. unop
  1956. MUL a4, b3, t2
  1957. LD b3, 0 * SIZE(BO)
  1958. ADD c05, t3, c05
  1959. unop
  1960. MUL a3, b4, t3
  1961. LD a3, 0 * SIZE(AO)
  1962. ADD c06, t4, c06
  1963. MUL a4, b4, t4
  1964. LD b4, 1 * SIZE(BO)
  1965. unop
  1966. LD a4, 1 * SIZE(AO)
  1967. unop
  1968. unop
  1969. bgt L, $L62
  1970. .align 4
  1971. $L65:
  /* Loop remainder for the 2x2 tile.
     blbs branches when the low bit of the step count is set (KK for
     LT / RN, TMP1 for LN / RT): an odd count goes straight to $L67.
     An even count falls through and runs one more step here, reloading
     a1, a2, b1, b2 and advancing AO and BO by 2 * SIZE each. */
  1972. ADD c01, t1, c01
  1973. MUL a1, b1, t1
  1974. #if defined(LT) || defined(RN)
  1975. blbs KK, $L67
  1976. #else
  1977. blbs TMP1, $L67
  1978. #endif
  1979. .align 4
  1980. ADD c02, t2, c02
  1981. unop
  1982. MUL a2, b1, t2
  1983. LD b1, 0 * SIZE(BO)
  1984. ADD c05, t3, c05
  /* BO is bumped before b2 is reloaded, hence the -1 offset on that
     load below. */
  1985. lda BO, 2 * SIZE(BO)
  1986. MUL a1, b2, t3
  1987. LD a1, 0 * SIZE(AO)
  1988. ADD c06, t4, c06
  1989. unop
  1990. MUL a2, b2, t4
  1991. LD a2, 1 * SIZE(AO)
  1992. ADD c01, t1, c01
  1993. LD b2, -1 * SIZE(BO)
  1994. MUL a1, b1, t1
  1995. lda AO, 2 * SIZE(AO)
  1996. .align 4
  1997. $L67:
  /* Final step of the 2x2 accumulation: issue the remaining products
     with the operands already in a1, a2, b1, b2, advance AO and BO by
     2 * SIZE each, then drain t1..t4 into c01, c02, c05, c06. */
  1998. ADD c02, t2, c02
  1999. MUL a2, b1, t2
  2000. ADD c05, t3, c05
  2001. MUL a1, b2, t3
  2002. ADD c06, t4, c06
  2003. lda AO, 2 * SIZE(AO)
  2004. MUL a2, b2, t4
  2005. lda BO, 2 * SIZE(BO)
  2006. ADD c01, t1, c01
  2007. ADD c02, t2, c02
  2008. ADD c05, t3, c05
  2009. ADD c06, t4, c06
  2010. .align 4
  2011. $L68:
  2012. #if defined(LN) || defined(RT)
  2013. #ifdef LN
  2014. subq KK, 2, TMP1
  2015. #else
  2016. subq KK, 2, TMP1
  2017. #endif
  2018. sll TMP1, BASE_SHIFT + 1, TMP2
  2019. addq AORIG, TMP2, AO
  2020. sll TMP1, BASE_SHIFT + 1, TMP2
  2021. addq B, TMP2, BO
  2022. #else
  2023. lda AO, -2 * SIZE(AO)
  2024. lda BO, -2 * SIZE(BO)
  2025. #endif
  2026. #if defined(LN) || defined(LT)
  2027. LD a1, 0 * SIZE(BO)
  2028. LD a2, 1 * SIZE(BO)
  2029. LD a3, 2 * SIZE(BO)
  2030. LD a4, 3 * SIZE(BO)
  2031. SUB a1, c01, c01
  2032. SUB a2, c05, c05
  2033. SUB a3, c02, c02
  2034. SUB a4, c06, c06
  2035. #else
  2036. LD a1, 0 * SIZE(AO)
  2037. LD a2, 1 * SIZE(AO)
  2038. LD a3, 2 * SIZE(AO)
  2039. LD a4, 3 * SIZE(AO)
  2040. SUB a1, c01, c01
  2041. SUB a2, c02, c02
  2042. SUB a3, c05, c05
  2043. SUB a4, c06, c06
  2044. #endif
  2045. #ifdef LN
  2046. LD a1, 3 * SIZE(AO)
  2047. LD a2, 2 * SIZE(AO)
  2048. LD a3, 0 * SIZE(AO)
  2049. MUL a1, c02, c02
  2050. MUL a1, c06, c06
  2051. MUL a2, c02, t1
  2052. MUL a2, c06, t2
  2053. SUB c01, t1, c01
  2054. SUB c05, t2, c05
  2055. MUL a3, c01, c01
  2056. MUL a3, c05, c05
  2057. #endif
  2058. #ifdef LT
  2059. LD a1, 0 * SIZE(AO)
  2060. LD a2, 1 * SIZE(AO)
  2061. LD a3, 3 * SIZE(AO)
  2062. MUL a1, c01, c01
  2063. MUL a1, c05, c05
  2064. MUL a2, c01, t1
  2065. MUL a2, c05, t2
  2066. SUB c02, t1, c02
  2067. SUB c06, t2, c06
  2068. MUL a3, c02, c02
  2069. MUL a3, c06, c06
  2070. #endif
  2071. #ifdef RN
  2072. LD a1, 0 * SIZE(BO)
  2073. LD a2, 1 * SIZE(BO)
  2074. LD a3, 3 * SIZE(BO)
  2075. MUL a1, c01, c01
  2076. MUL a1, c02, c02
  2077. MUL a2, c01, t1
  2078. MUL a2, c02, t2
  2079. SUB c05, t1, c05
  2080. SUB c06, t2, c06
  2081. MUL a3, c05, c05
  2082. MUL a3, c06, c06
  2083. #endif
  2084. #ifdef RT
  2085. LD a1, 3 * SIZE(BO)
  2086. LD a2, 2 * SIZE(BO)
  2087. LD a3, 0 * SIZE(BO)
  2088. MUL a1, c05, c05
  2089. MUL a1, c06, c06
  2090. MUL a2, c05, t1
  2091. MUL a2, c06, t2
  2092. SUB c01, t1, c01
  2093. SUB c02, t2, c02
  2094. MUL a3, c01, c01
  2095. MUL a3, c02, c02
  2096. #endif
  2097. #if defined(LN) || defined(LT)
  2098. ST c01, 0 * SIZE(BO)
  2099. ST c05, 1 * SIZE(BO)
  2100. ST c02, 2 * SIZE(BO)
  2101. ST c06, 3 * SIZE(BO)
  2102. #else
  2103. ST c01, 0 * SIZE(AO)
  2104. ST c02, 1 * SIZE(AO)
  2105. ST c05, 2 * SIZE(AO)
  2106. ST c06, 3 * SIZE(AO)
  2107. #endif
  2108. #ifdef LN
  2109. lda C1, -2 * SIZE(C1)
  2110. lda C2, -2 * SIZE(C2)
  2111. #endif
  2112. ST c01, 0 * SIZE(C1)
  2113. ST c02, 1 * SIZE(C1)
  2114. ST c05, 0 * SIZE(C2)
  2115. ST c06, 1 * SIZE(C2)
  2116. #ifndef LN
  2117. lda C1, 2 * SIZE(C1)
  2118. lda C2, 2 * SIZE(C2)
  2119. #endif
  2120. fclr t1
  2121. fclr t2
  2122. fclr t3
  2123. fclr t4
  2124. #ifdef RT
  2125. sll K, 1 + BASE_SHIFT, TMP1
  2126. addq AORIG, TMP1, AORIG
  2127. #endif
  2128. #if defined(LT) || defined(RN)
  2129. subq K, KK, TMP1
  2130. sll TMP1, BASE_SHIFT + 1, TMP2
  2131. addq AO, TMP2, AO
  2132. sll TMP1, BASE_SHIFT + 1, TMP2
  2133. addq BO, TMP2, BO
  2134. #endif
  2135. #ifdef LT
  2136. addq KK, 2, KK
  2137. #endif
  2138. #ifdef LN
  2139. subq KK, 2, KK
  2140. #endif
  2141. .align 4
  /*
   * $L70 / $L51: set-up for one 4-row tile of the two-column panel.
   * I = M >> 2 tiles are processed; with none to do, control goes to $L79.
   * Every pass clears the accumulators c01..c08 and preloads a1..a4 and
   * b1..b4, then sets L = (inner count) - 2 for the loop at $L52.
   *   LT / RN : inner count is KK; B is read from its start, BO = B + 2
   *             elements and AO is advanced by 4 elements.
   *   LN / RT : AO = AORIG + (KK << (BASE_SHIFT + 2)),
   *             BO = B + (KK << (BASE_SHIFT + 1)), inner count TMP1 = K - KK;
   *             LN first moves AORIG back by K << (BASE_SHIFT + 2).
   * An inner count <= 0 skips straight to the solve at $L58; L <= 0 skips
   * the main loop and enters the tail at $L55.
   * NOTE(review): BASE_SHIFT is presumably log2(SIZE) -- confirm in the
   * header that defines it.
   */
  2142. $L70:
  2143. sra M, 2, I
  2144. ble I, $L79
  2145. .align 4
  2146. $L51:
  2147. #if defined(LT) || defined(RN)
  2148. LD a1, 0 * SIZE(AO)
  2149. fclr c03
  2150. LD a2, 1 * SIZE(AO)
  2151. fclr c07
  2152. LD a3, 2 * SIZE(AO)
  2153. fclr c04
  2154. LD a4, 3 * SIZE(AO)
  2155. fclr c08
  2156. LD b1, 0 * SIZE(B)
  2157. fclr c01
  2158. LD b2, 1 * SIZE(B)
  2159. fclr c05
  2160. LD b3, 2 * SIZE(B)
  2161. fclr c02
  2162. LD b4, 3 * SIZE(B)
  2163. fclr c06
  2164. lda L, -2(KK)
  2165. lda BO, 2 * SIZE(B)
  2166. lda AO, 4 * SIZE(AO)
  2167. ble KK, $L58
  2168. ble L, $L55
  2169. #else
  2170. #ifdef LN
  2171. sll K, BASE_SHIFT + 2, TMP1
  2172. subq AORIG, TMP1, AORIG
  2173. #endif
  2174. sll KK, BASE_SHIFT + 2, TMP1
  2175. addq AORIG, TMP1, AO
  2176. sll KK, BASE_SHIFT + 1, TMP1
  2177. addq B, TMP1, BO
  2178. subq K, KK, TMP1
  2179. LD a1, 0 * SIZE(AO)
  2180. fclr c03
  2181. LD a2, 1 * SIZE(AO)
  2182. fclr c07
  2183. LD a3, 2 * SIZE(AO)
  2184. fclr c04
  2185. LD a4, 3 * SIZE(AO)
  2186. fclr c08
  2187. LD b1, 0 * SIZE(BO)
  2188. fclr c01
  2189. LD b2, 1 * SIZE(BO)
  2190. fclr c05
  2191. LD b3, 2 * SIZE(BO)
  2192. fclr c02
  2193. LD b4, 3 * SIZE(BO)
  2194. fclr c06
  2195. lda L, -2(TMP1)
  2196. lda BO, 2 * SIZE(BO)
  2197. lda AO, 4 * SIZE(AO)
  2198. ble TMP1, $L58
  2199. ble L, $L55
  2200. #endif
  2201. .align 4
  /*
   * $L52: main accumulation loop of the 4x2 tile, two k-steps per pass
   * (L is decremented by 2).
   * Products are formed into t1..t4 and added to their accumulator by an
   * ADD issued one group later, so the ADDs at the top of the loop consume
   * the products left pending by the previous pass (or by the set-up):
   *   a1..a4 * b1 (and b3) -> c01..c04
   *   a1..a4 * b2 (and b4) -> c05..c08
   * a5 carries the fourth A value of the second k-step, which lets a4 be
   * reloaded for the next pass before the last multiply is issued.
   * AO advances by 8 elements and BO by 4 elements per pass.
   * NOTE(review): the unop entries look like issue-slot padding -- confirm.
   */
  2202. $L52:
  2203. ADD c05, t1, c05
  2204. unop
  2205. MUL a1, b1, t1
  2206. unop
  2207. ADD c06, t2, c06
  2208. lda L, -2(L)
  2209. MUL a2, b1, t2
  2210. unop
  2211. ADD c07, t3, c07
  2212. unop
  2213. MUL a3, b1, t3
  2214. unop
  2215. ADD c08, t4, c08
  2216. unop
  2217. MUL a4, b1, t4
  2218. LD b1, 2 * SIZE(BO)
  2219. ADD c01, t1, c01
  2220. unop
  2221. MUL a1, b2, t1
  2222. LD a1, 0 * SIZE(AO)
  2223. ADD c02, t2, c02
  2224. lda BO, 4 * SIZE(BO)
  2225. MUL a2, b2, t2
  2226. LD a2, 1 * SIZE(AO)
  2227. ADD c03, t3, c03
  2228. unop
  2229. MUL a3, b2, t3
  2230. LD a3, 2 * SIZE(AO)
  2231. ADD c04, t4, c04
  2232. unop
  2233. MUL a4, b2, t4
  2234. LD a5, 3 * SIZE(AO)
  2235. ADD c05, t1, c05
  2236. unop
  2237. MUL a1, b3, t1
  2238. LD b2, -1 * SIZE(BO)
  2239. ADD c06, t2, c06
  2240. unop
  2241. MUL a2, b3, t2
  2242. unop
  2243. ADD c07, t3, c07
  2244. unop
  2245. MUL a3, b3, t3
  2246. lda AO, 8 * SIZE(AO)
  2247. ADD c08, t4, c08
  2248. unop
  2249. MUL a5, b3, t4
  2250. LD b3, 0 * SIZE(BO)
  2251. ADD c01, t1, c01
  2252. unop
  2253. MUL a1, b4, t1
  2254. LD a1, -4 * SIZE(AO)
  2255. ADD c02, t2, c02
  2256. unop
  2257. MUL a2, b4, t2
  2258. LD a2, -3 * SIZE(AO)
  2259. ADD c03, t3, c03
  2260. LD a4, -1 * SIZE(AO)
  2261. MUL a3, b4, t3
  2262. LD a3, -2 * SIZE(AO)
  2263. ADD c04, t4, c04
  2264. MUL a5, b4, t4
  2265. LD b4, 1 * SIZE(BO)
  2266. bgt L, $L52
  2267. .align 4
  /*
   * $L55 / $L57: tail of the 4x2 accumulation.
   * blbs tests the low bit of the inner count (KK for LT/RN, TMP1
   * otherwise). When the count is odd, control jumps to $L57 and a single
   * k-step is performed; when it is even, the fall-through code performs
   * one k-step (reloading a1..a4, b1, b2) and then runs into $L57 for the
   * second one.
   * $L57 issues the last eight multiplies, advances AO by 4 and BO by 2
   * elements, and drains the products still pending in t1..t4 into
   * c01..c08.
   */
  2268. $L55:
  2269. ADD c05, t1, c05
  2270. MUL a1, b1, t1
  2271. #if defined(LT) || defined(RN)
  2272. blbs KK, $L57
  2273. #else
  2274. blbs TMP1, $L57
  2275. #endif
  2276. .align 4
  2277. ADD c06, t2, c06
  2278. MUL a2, b1, t2
  2279. ADD c07, t3, c07
  2280. MUL a3, b1, t3
  2281. ADD c08, t4, c08
  2282. unop
  2283. MUL a4, b1, t4
  2284. LD b1, 0 * SIZE(BO)
  2285. ADD c01, t1, c01
  2286. unop
  2287. MUL a1, b2, t1
  2288. LD a1, 0 * SIZE(AO)
  2289. ADD c02, t2, c02
  2290. unop
  2291. MUL a2, b2, t2
  2292. LD a2, 1 * SIZE(AO)
  2293. ADD c03, t3, c03
  2294. unop
  2295. MUL a3, b2, t3
  2296. LD a3, 2 * SIZE(AO)
  2297. ADD c04, t4, c04
  2298. MUL a4, b2, t4
  2299. LD a4, 3 * SIZE(AO)
  2300. lda AO, 4 * SIZE(AO)
  2301. ADD c05, t1, c05
  2302. LD b2, 1 * SIZE(BO)
  2303. MUL a1, b1, t1
  2304. lda BO, 2 * SIZE(BO)
  2305. .align 4
  2306. $L57:
  2307. ADD c06, t2, c06
  2308. MUL a2, b1, t2
  2309. ADD c07, t3, c07
  2310. MUL a3, b1, t3
  2311. ADD c08, t4, c08
  2312. MUL a4, b1, t4
  2313. ADD c01, t1, c01
  2314. MUL a1, b2, t1
  2315. ADD c02, t2, c02
  2316. MUL a2, b2, t2
  2317. ADD c03, t3, c03
  2318. MUL a3, b2, t3
  2319. ADD c04, t4, c04
  2320. lda AO, 4 * SIZE(AO)
  2321. MUL a4, b2, t4
  2322. lda BO, 2 * SIZE(BO)
  2323. ADD c05, t1, c05
  2324. ADD c06, t2, c06
  2325. ADD c07, t3, c07
  2326. ADD c08, t4, c08
  2327. .align 4
  /*
   * $L58: solve, store and advance for the 4x2 tile.
   * c01..c04 belong to the C1 column and c05..c08 to the C2 column (see the
   * stores to C1 / C2 below).
   *
   * Step 1 - point AO / BO at the operands of the solve.
   *   LN / RT : TMP1 = KK - 4 (LN) or KK - 2 (RT);
   *             AO = AORIG + (TMP1 << (BASE_SHIFT + 2)),
   *             BO = B     + (TMP1 << (BASE_SHIFT + 1)).
   *   LT / RN : AO and BO are stepped back by 4 and 2 elements.
   */
  2328. $L58:
  2329. #if defined(LN) || defined(RT)
  2330. #ifdef LN
  2331. subq KK, 4, TMP1
  2332. #else
  2333. subq KK, 2, TMP1
  2334. #endif
  2335. sll TMP1, BASE_SHIFT + 2, TMP2
  2336. addq AORIG, TMP2, AO
  2337. sll TMP1, BASE_SHIFT + 1, TMP2
  2338. addq B, TMP2, BO
  2339. #else
  2340. lda AO, -4 * SIZE(AO)
  2341. lda BO, -2 * SIZE(BO)
  2342. #endif
  /*
   * Step 2 - load the eight right-hand-side values and replace each
   * accumulator by (loaded value - accumulator).
   * LN / LT read them from BO, interleaved by column (c01, c05, c02, ...);
   * RN / RT read them from AO, column after column (c01..c04, c05..c08).
   * NOTE(review): assumes SUB x, y, z computes z = x - y, as the Alpha
   * subtract instructions do -- confirm against the macro definition.
   */
  2343. #if defined(LN) || defined(LT)
  2344. LD a1, 0 * SIZE(BO)
  2345. LD a2, 1 * SIZE(BO)
  2346. LD a3, 2 * SIZE(BO)
  2347. LD a4, 3 * SIZE(BO)
  2348. LD b1, 4 * SIZE(BO)
  2349. LD b2, 5 * SIZE(BO)
  2350. LD b3, 6 * SIZE(BO)
  2351. LD b4, 7 * SIZE(BO)
  2352. SUB a1, c01, c01
  2353. SUB a2, c05, c05
  2354. SUB a3, c02, c02
  2355. SUB a4, c06, c06
  2356. SUB b1, c03, c03
  2357. SUB b2, c07, c07
  2358. SUB b3, c04, c04
  2359. SUB b4, c08, c08
  2360. #else
  2361. LD a1, 0 * SIZE(AO)
  2362. LD a2, 1 * SIZE(AO)
  2363. LD a3, 2 * SIZE(AO)
  2364. LD a4, 3 * SIZE(AO)
  2365. LD b1, 4 * SIZE(AO)
  2366. LD b2, 5 * SIZE(AO)
  2367. LD b3, 6 * SIZE(AO)
  2368. LD b4, 7 * SIZE(AO)
  2369. SUB a1, c01, c01
  2370. SUB a2, c02, c02
  2371. SUB a3, c03, c03
  2372. SUB a4, c04, c04
  2373. SUB b1, c05, c05
  2374. SUB b2, c06, c06
  2375. SUB b3, c07, c07
  2376. SUB b4, c08, c08
  2377. #endif
  /*
   * Step 3 (LN) - substitution from row 3 up to row 0 with the 4x4 block at
   * AO, applied to both columns. The entries at offsets 15, 10, 5 and 0 are
   * applied with MUL, the others are eliminated with MUL + SUB.
   * NOTE(review): multiplying by the diagonal implies it is stored already
   * inverted -- presumably done by the packing code; confirm.
   */
  2378. #ifdef LN
  2379. LD a1, 15 * SIZE(AO)
  2380. LD a2, 14 * SIZE(AO)
  2381. LD a3, 13 * SIZE(AO)
  2382. LD a4, 12 * SIZE(AO)
  2383. MUL a1, c04, c04
  2384. MUL a1, c08, c08
  2385. MUL a2, c04, t1
  2386. MUL a2, c08, t2
  2387. SUB c03, t1, c03
  2388. SUB c07, t2, c07
  2389. MUL a3, c04, t1
  2390. MUL a3, c08, t2
  2391. SUB c02, t1, c02
  2392. SUB c06, t2, c06
  2393. MUL a4, c04, t1
  2394. MUL a4, c08, t2
  2395. SUB c01, t1, c01
  2396. SUB c05, t2, c05
  2397. LD b1, 10 * SIZE(AO)
  2398. LD b2, 9 * SIZE(AO)
  2399. LD b3, 8 * SIZE(AO)
  2400. MUL b1, c03, c03
  2401. MUL b1, c07, c07
  2402. MUL b2, c03, t1
  2403. MUL b2, c07, t2
  2404. SUB c02, t1, c02
  2405. SUB c06, t2, c06
  2406. MUL b3, c03, t1
  2407. MUL b3, c07, t2
  2408. SUB c01, t1, c01
  2409. SUB c05, t2, c05
  2410. LD a1, 5 * SIZE(AO)
  2411. LD a2, 4 * SIZE(AO)
  2412. LD a3, 0 * SIZE(AO)
  2413. MUL a1, c02, c02
  2414. MUL a1, c06, c06
  2415. MUL a2, c02, t1
  2416. MUL a2, c06, t2
  2417. SUB c01, t1, c01
  2418. SUB c05, t2, c05
  2419. MUL a3, c01, c01
  2420. MUL a3, c05, c05
  2421. #endif
  /*
   * Step 3 (LT) - the same substitution in the opposite direction, row 0
   * down to row 3, using offsets 0..3, 5..7, 10, 11 and 15 of the block
   * at AO.
   */
  2422. #ifdef LT
  2423. LD a1, 0 * SIZE(AO)
  2424. LD a2, 1 * SIZE(AO)
  2425. LD a3, 2 * SIZE(AO)
  2426. LD a4, 3 * SIZE(AO)
  2427. MUL a1, c01, c01
  2428. MUL a1, c05, c05
  2429. MUL a2, c01, t1
  2430. MUL a2, c05, t2
  2431. SUB c02, t1, c02
  2432. SUB c06, t2, c06
  2433. MUL a3, c01, t1
  2434. MUL a3, c05, t2
  2435. SUB c03, t1, c03
  2436. SUB c07, t2, c07
  2437. MUL a4, c01, t1
  2438. MUL a4, c05, t2
  2439. SUB c04, t1, c04
  2440. SUB c08, t2, c08
  2441. LD b1, 5 * SIZE(AO)
  2442. LD b2, 6 * SIZE(AO)
  2443. LD b3, 7 * SIZE(AO)
  2444. MUL b1, c02, c02
  2445. MUL b1, c06, c06
  2446. MUL b2, c02, t1
  2447. MUL b2, c06, t2
  2448. SUB c03, t1, c03
  2449. SUB c07, t2, c07
  2450. MUL b3, c02, t1
  2451. MUL b3, c06, t2
  2452. SUB c04, t1, c04
  2453. SUB c08, t2, c08
  2454. LD a1, 10 * SIZE(AO)
  2455. LD a2, 11 * SIZE(AO)
  2456. LD a3, 15 * SIZE(AO)
  2457. MUL a1, c03, c03
  2458. MUL a1, c07, c07
  2459. MUL a2, c03, t1
  2460. MUL a2, c07, t2
  2461. SUB c04, t1, c04
  2462. SUB c08, t2, c08
  2463. MUL a3, c04, c04
  2464. MUL a3, c08, c08
  2465. #endif
  /*
   * Step 3 (RN) - 2x2 substitution across the two columns with the block
   * at BO (offsets 0, 1, 3): the first column is scaled, eliminated from
   * the second, then the second is scaled.
   */
  2466. #ifdef RN
  2467. LD a1, 0 * SIZE(BO)
  2468. LD a2, 1 * SIZE(BO)
  2469. LD a3, 3 * SIZE(BO)
  2470. MUL a1, c01, c01
  2471. MUL a1, c02, c02
  2472. MUL a1, c03, c03
  2473. MUL a1, c04, c04
  2474. MUL a2, c01, t1
  2475. MUL a2, c02, t2
  2476. MUL a2, c03, t3
  2477. MUL a2, c04, t4
  2478. SUB c05, t1, c05
  2479. SUB c06, t2, c06
  2480. SUB c07, t3, c07
  2481. SUB c08, t4, c08
  2482. MUL a3, c05, c05
  2483. MUL a3, c06, c06
  2484. MUL a3, c07, c07
  2485. MUL a3, c08, c08
  2486. #endif
  /*
   * Step 3 (RT) - as RN but starting from the second column, using
   * offsets 3, 2 and 0 of the block at BO.
   */
  2487. #ifdef RT
  2488. LD a1, 3 * SIZE(BO)
  2489. LD a2, 2 * SIZE(BO)
  2490. LD a3, 0 * SIZE(BO)
  2491. MUL a1, c05, c05
  2492. MUL a1, c06, c06
  2493. MUL a1, c07, c07
  2494. MUL a1, c08, c08
  2495. MUL a2, c05, t1
  2496. MUL a2, c06, t2
  2497. MUL a2, c07, t3
  2498. MUL a2, c08, t4
  2499. SUB c01, t1, c01
  2500. SUB c02, t2, c02
  2501. SUB c03, t3, c03
  2502. SUB c04, t4, c04
  2503. MUL a3, c01, c01
  2504. MUL a3, c02, c02
  2505. MUL a3, c03, c03
  2506. MUL a3, c04, c04
  2507. #endif
  /*
   * Step 4 - write the solved values back to the buffer they were read
   * from in step 2 (BO for LN / LT, AO otherwise), using the same layout.
   */
  2508. #if defined(LN) || defined(LT)
  2509. ST c01, 0 * SIZE(BO)
  2510. ST c05, 1 * SIZE(BO)
  2511. ST c02, 2 * SIZE(BO)
  2512. ST c06, 3 * SIZE(BO)
  2513. ST c03, 4 * SIZE(BO)
  2514. ST c07, 5 * SIZE(BO)
  2515. ST c04, 6 * SIZE(BO)
  2516. ST c08, 7 * SIZE(BO)
  2517. #else
  2518. ST c01, 0 * SIZE(AO)
  2519. ST c02, 1 * SIZE(AO)
  2520. ST c03, 2 * SIZE(AO)
  2521. ST c04, 3 * SIZE(AO)
  2522. ST c05, 4 * SIZE(AO)
  2523. ST c06, 5 * SIZE(AO)
  2524. ST c07, 6 * SIZE(AO)
  2525. ST c08, 7 * SIZE(AO)
  2526. #endif
  /*
   * Step 5 - store the tile to C1 / C2. LN moves the column pointers back
   * by 4 elements before the store; every other variant moves them forward
   * by 4 elements after it.
   */
  2527. #ifdef LN
  2528. lda C1, -4 * SIZE(C1)
  2529. lda C2, -4 * SIZE(C2)
  2530. #endif
  2531. ST c01, 0 * SIZE(C1)
  2532. ST c02, 1 * SIZE(C1)
  2533. ST c03, 2 * SIZE(C1)
  2534. ST c04, 3 * SIZE(C1)
  2535. ST c05, 0 * SIZE(C2)
  2536. ST c06, 1 * SIZE(C2)
  2537. ST c07, 2 * SIZE(C2)
  2538. ST c08, 3 * SIZE(C2)
  2539. #ifndef LN
  2540. lda C1, 4 * SIZE(C1)
  2541. lda C2, 4 * SIZE(C2)
  2542. #endif
  /*
   * Step 6 - clear t1..t4 so the first ADDs of the next tile add zero, then
   * advance the bookkeeping: RT moves AORIG forward by K << (2 + BASE_SHIFT);
   * LT / RN skip AO and BO past the remaining K - KK steps; LT adds 4 to KK
   * and LN subtracts 4. The loop repeats while I > 0.
   */
  2543. fclr t1
  2544. fclr t2
  2545. fclr t3
  2546. fclr t4
  2547. #ifdef RT
  2548. sll K, 2 + BASE_SHIFT, TMP1
  2549. addq AORIG, TMP1, AORIG
  2550. #endif
  2551. #if defined(LT) || defined(RN)
  2552. subq K, KK, TMP1
  2553. sll TMP1, BASE_SHIFT + 2, TMP2
  2554. addq AO, TMP2, AO
  2555. sll TMP1, BASE_SHIFT + 1, TMP2
  2556. addq BO, TMP2, BO
  2557. #endif
  2558. #ifdef LT
  2559. addq KK, 4, KK
  2560. #endif
  2561. #ifdef LN
  2562. subq KK, 4, KK
  2563. #endif
  2564. lda I, -1(I)
  2565. bgt I, $L51
  2566. .align 4
  /*
   * $L79: end of the two-column panel.
   *   LN      : B is moved forward by K << (1 + BASE_SHIFT).
   *   LT / RN : B takes the value BO reached while walking the panel.
   *   RN      : KK += 2;   RT : KK -= 2.
   */
  2567. $L79:
  2568. #ifdef LN
  2569. sll K, 1 + BASE_SHIFT, TMP1
  2570. addq B, TMP1, B
  2571. #endif
  2572. #if defined(LT) || defined(RN)
  2573. mov BO, B
  2574. #endif
  2575. #ifdef RN
  2576. addq KK, 2, KK
  2577. #endif
  2578. #ifdef RT
  2579. subq KK, 2, KK
  2580. #endif
  2581. .align 4
  /*
   * $L80: single-column panel, executed only when N is odd (N & 1);
   * otherwise control goes to the exit at $L999.
   * Panel set-up:
   *   RT      : B is moved back by K << BASE_SHIFT and C by LDC first.
   *   C1 = C; every variant except RT then moves C forward by LDC.
   *   LN      : KK = M + OFFSET;   LT : KK = OFFSET.
   *   LN / RT : AORIG = A;   LT / RN : AO = A.
   */
  2582. $L80:
  2583. and N, 1, J
  2584. ble J, $L999
  2585. #ifdef RT
  2586. sll K, BASE_SHIFT, TMP1
  2587. subq B, TMP1, B
  2588. subq C, LDC, C
  2589. #endif
  2590. mov C, C1
  2591. #ifndef RT
  2592. addq C, LDC, C
  2593. #endif
  2594. #ifdef LN
  2595. addq M, OFFSET, KK
  2596. #endif
  2597. #ifdef LT
  2598. mov OFFSET, KK
  2599. #endif
  2600. #if defined(LN) || defined(RT)
  2601. mov A, AORIG
  2602. #else
  2603. mov A, AO
  2604. #endif
  /*
   * 1x1 tile, handled only when M is odd (M & 1); otherwise skip to $L100.
   * Clears t1..t4 and c01..c04, preloads four A and four B values, and sets
   * L = (inner count) >> 2 for the four-step loop at $L112.
   *   LT / RN : inner count is KK, BO = B.
   *   LN / RT : AO = AORIG + (KK << BASE_SHIFT), BO = B + (KK << BASE_SHIFT),
   *             inner count TMP1 = K - KK; LN first moves AORIG back by
   *             K << BASE_SHIFT.
   * With L <= 0 the main loop is skipped and control goes to $L115.
   */
  2605. and M, 1, I
  2606. ble I, $L100
  2607. #if defined(LT) || defined(RN)
  2608. LD a1, 0 * SIZE(AO)
  2609. fclr t1
  2610. LD a2, 1 * SIZE(AO)
  2611. fclr t2
  2612. LD a3, 2 * SIZE(AO)
  2613. fclr t3
  2614. LD a4, 3 * SIZE(AO)
  2615. fclr t4
  2616. LD b1, 0 * SIZE(B)
  2617. fclr c01
  2618. LD b2, 1 * SIZE(B)
  2619. fclr c02
  2620. LD b3, 2 * SIZE(B)
  2621. fclr c03
  2622. LD b4, 3 * SIZE(B)
  2623. fclr c04
  2624. sra KK, 2, L
  2625. mov B, BO
  2626. unop
  2627. ble L, $L115
  2628. #else
  2629. #ifdef LN
  2630. sll K, BASE_SHIFT + 0, TMP1
  2631. subq AORIG, TMP1, AORIG
  2632. #endif
  2633. sll KK, BASE_SHIFT + 0, TMP1
  2634. addq AORIG, TMP1, AO
  2635. sll KK, BASE_SHIFT + 0, TMP1
  2636. addq B, TMP1, BO
  2637. subq K, KK, TMP1
  2638. LD a1, 0 * SIZE(AO)
  2639. fclr t1
  2640. LD a2, 1 * SIZE(AO)
  2641. fclr t2
  2642. LD a3, 2 * SIZE(AO)
  2643. fclr t3
  2644. LD a4, 3 * SIZE(AO)
  2645. fclr t4
  2646. LD b1, 0 * SIZE(BO)
  2647. fclr c01
  2648. LD b2, 1 * SIZE(BO)
  2649. fclr c02
  2650. LD b3, 2 * SIZE(BO)
  2651. fclr c03
  2652. LD b4, 3 * SIZE(BO)
  2653. fclr c04
  2654. sra TMP1, 2, L
  2655. unop
  2656. ble L, $L115
  2657. #endif
  2658. .align 4
  /*
   * $L112: dot-product loop of the 1x1 tile, four k-steps per pass.
   * Each step feeds its own partial sum (c01..c04 via t1..t4); the four
   * sums are combined at $L118. AO and BO advance by 4 elements per pass.
   */
  2659. $L112:
  2660. ADD c01, t1, c01
  2661. MUL a1, b1, t1
  2662. LD a1, 4 * SIZE(AO)
  2663. LD b1, 4 * SIZE(BO)
  2664. ADD c02, t2, c02
  2665. MUL a2, b2, t2
  2666. LD a2, 5 * SIZE(AO)
  2667. LD b2, 5 * SIZE(BO)
  2668. ADD c03, t3, c03
  2669. MUL a3, b3, t3
  2670. LD a3, 6 * SIZE(AO)
  2671. LD b3, 6 * SIZE(BO)
  2672. ADD c04, t4, c04
  2673. MUL a4, b4, t4
  2674. LD a4, 7 * SIZE(AO)
  2675. LD b4, 7 * SIZE(BO)
  2676. lda L, -1(L)
  2677. lda AO, 4 * SIZE(AO)
  2678. lda BO, 4 * SIZE(BO)
  2679. bgt L, $L112
  2680. .align 4
  /*
   * $L115 / $L116: remainder loop for the (inner count & 3) leftover
   * k-steps, one step per pass, accumulated into c01 only.
   */
  2681. $L115:
  2682. #if defined(LT) || defined(RN)
  2683. and KK, 3, L
  2684. #else
  2685. and TMP1, 3, L
  2686. #endif
  2687. ble L, $L118
  2688. .align 4
  2689. $L116:
  2690. ADD c01, t1, c01
  2691. MUL a1, b1, t1
  2692. LD a1, 1 * SIZE(AO)
  2693. LD b1, 1 * SIZE(BO)
  2694. lda L, -1(L)
  2695. lda AO, 1 * SIZE(AO)
  2696. lda BO, 1 * SIZE(BO)
  2697. bgt L, $L116
  2698. .align 4
  /*
   * $L118: finish the 1x1 tile.
   *  - drain the pending products t1..t4 and fold c02..c04 into c01;
   *  - LN / RT: AO = AORIG + ((KK - 1) << BASE_SHIFT) and
   *             BO = B     + ((KK - 1) << BASE_SHIFT);
   *  - c01 = (value loaded from BO for LN / LT, from AO otherwise) - c01,
   *    then c01 is multiplied by the entry at AO (LN / LT) or BO (RN / RT);
   *  - the result is written back to the buffer it was loaded from and
   *    to C1 (LN steps C1 back by one element first, the others step it
   *    forward afterwards);
   *  - RT advances AORIG via SXADDQ K; LT / RN skip AO and BO past the
   *    remaining K - KK steps; LT adds 1 to KK, LN subtracts 1.
   * NOTE(review): assumes SUB x, y, z computes z = x - y, and that the
   * multiplier is a pre-inverted diagonal entry -- confirm both.
   */
  2699. $L118:
  2700. ADD c01, t1, c01
  2701. ADD c02, t2, c02
  2702. ADD c03, t3, c03
  2703. ADD c04, t4, c04
  2704. ADD c01, c02, c01
  2705. ADD c03, c04, c03
  2706. ADD c01, c03, c01
  2707. #if defined(LN) || defined(RT)
  2708. subq KK, 1, TMP1
  2709. sll TMP1, BASE_SHIFT + 0, TMP2
  2710. addq AORIG, TMP2, AO
  2711. addq B, TMP2, BO
  2712. #endif
  2713. #if defined(LN) || defined(LT)
  2714. LD a1, 0 * SIZE(BO)
  2715. SUB a1, c01, c01
  2716. #else
  2717. LD a1, 0 * SIZE(AO)
  2718. SUB a1, c01, c01
  2719. #endif
  2720. #if defined(LN) || defined(LT)
  2721. LD a1, 0 * SIZE(AO)
  2722. MUL a1, c01, c01
  2723. #endif
  2724. #if defined(RN) || defined(RT)
  2725. LD a1, 0 * SIZE(BO)
  2726. MUL a1, c01, c01
  2727. #endif
  2728. #if defined(LN) || defined(LT)
  2729. ST c01, 0 * SIZE(BO)
  2730. #else
  2731. ST c01, 0 * SIZE(AO)
  2732. #endif
  2733. #ifdef LN
  2734. lda C1, -1 * SIZE(C1)
  2735. #endif
  2736. ST c01, 0 * SIZE(C1)
  2737. #ifndef LN
  2738. lda C1, 1 * SIZE(C1)
  2739. #endif
  2740. #ifdef RT
  2741. SXADDQ K, AORIG, AORIG
  2742. #endif
  2743. #if defined(LT) || defined(RN)
  2744. subq K, KK, TMP1
  2745. sll TMP1, BASE_SHIFT + 0, TMP2
  2746. addq AO, TMP2, AO
  2747. addq BO, TMP2, BO
  2748. #endif
  2749. #ifdef LT
  2750. addq KK, 1, KK
  2751. #endif
  2752. #ifdef LN
  2753. subq KK, 1, KK
  2754. #endif
  2755. .align 4
  /*
   * $L100: set-up for the 2x1 tile, handled only when bit 1 of M is set
   * (M & 2); otherwise skip to $L110.
   * Clears t1..t4 and c01..c04, preloads a1..a4 and b1..b4, and sets
   * L = (inner count) >> 2 for the four-step loop at $L102.
   *   LT / RN : inner count is KK, BO = B.
   *   LN / RT : AO = AORIG + (KK << (BASE_SHIFT + 1)),
   *             BO = B + (KK << BASE_SHIFT), inner count TMP1 = K - KK;
   *             LN first moves AORIG back by K << (BASE_SHIFT + 1).
   * With L <= 0 the main loop is skipped and control goes to $L105.
   */
  2756. $L100:
  2757. and M, 2, I
  2758. ble I, $L110
  2759. #if defined(LT) || defined(RN)
  2760. LD a1, 0 * SIZE(AO)
  2761. fclr t1
  2762. LD a2, 1 * SIZE(AO)
  2763. fclr t2
  2764. LD a3, 2 * SIZE(AO)
  2765. fclr t3
  2766. LD a4, 3 * SIZE(AO)
  2767. fclr t4
  2768. LD b1, 0 * SIZE(B)
  2769. fclr c01
  2770. LD b2, 1 * SIZE(B)
  2771. fclr c02
  2772. LD b3, 2 * SIZE(B)
  2773. fclr c03
  2774. LD b4, 3 * SIZE(B)
  2775. fclr c04
  2776. sra KK, 2, L
  2777. mov B, BO
  2778. ble L, $L105
  2779. #else
  2780. #ifdef LN
  2781. sll K, BASE_SHIFT + 1, TMP1
  2782. subq AORIG, TMP1, AORIG
  2783. #endif
  2784. sll KK, BASE_SHIFT + 1, TMP1
  2785. addq AORIG, TMP1, AO
  2786. sll KK, BASE_SHIFT + 0, TMP1
  2787. addq B, TMP1, BO
  2788. subq K, KK, TMP1
  2789. LD a1, 0 * SIZE(AO)
  2790. fclr t1
  2791. LD a2, 1 * SIZE(AO)
  2792. fclr t2
  2793. LD a3, 2 * SIZE(AO)
  2794. fclr t3
  2795. LD a4, 3 * SIZE(AO)
  2796. fclr t4
  2797. LD b1, 0 * SIZE(BO)
  2798. fclr c01
  2799. LD b2, 1 * SIZE(BO)
  2800. fclr c02
  2801. LD b3, 2 * SIZE(BO)
  2802. fclr c03
  2803. LD b4, 3 * SIZE(BO)
  2804. fclr c04
  2805. sra TMP1, 2, L
  2806. ble L, $L105
  2807. #endif
  2808. .align 5
  /*
   * $L102: main loop of the 2x1 tile, four k-steps per pass.
   * Steps alternate between two pairs of partial sums: (a1, a2) products
   * go to c01 / c02 and (a3, a4 or a5) products go to c03 / c04; the pairs
   * are merged at $L108. a5 holds the second A value of the pair so that
   * a4 can be reloaded for the next pass before the final multiply.
   * AO advances by 8 elements and BO by 4 elements per pass.
   */
  2809. $L102:
  2810. ADD c01, t1, c01
  2811. lda L, -1(L)
  2812. MUL a1, b1, t1
  2813. LD a1, 4 * SIZE(AO)
  2814. ADD c02, t2, c02
  2815. MUL a2, b1, t2
  2816. LD a2, 5 * SIZE(AO)
  2817. LD b1, 4 * SIZE(BO)
  2818. ADD c03, t3, c03
  2819. lda BO, 4 * SIZE(BO)
  2820. MUL a3, b2, t3
  2821. LD a3, 6 * SIZE(AO)
  2822. ADD c04, t4, c04
  2823. MUL a4, b2, t4
  2824. LD a5, 7 * SIZE(AO)
  2825. LD b2, 1 * SIZE(BO)
  2826. ADD c01, t1, c01
  2827. MUL a1, b3, t1
  2828. LD a1, 8 * SIZE(AO)
  2829. lda AO, 8 * SIZE(AO)
  2830. ADD c02, t2, c02
  2831. MUL a2, b3, t2
  2832. LD b3, 2 * SIZE(BO)
  2833. LD a2, 1 * SIZE(AO)
  2834. ADD c03, t3, c03
  2835. LD a4, 3 * SIZE(AO)
  2836. MUL a3, b4, t3
  2837. LD a3, 2 * SIZE(AO)
  2838. ADD c04, t4, c04
  2839. MUL a5, b4, t4
  2840. LD b4, 3 * SIZE(BO)
  2841. bgt L, $L102
  2842. .align 4
  /*
   * $L105 / $L106: remainder loop of the 2x1 tile for the
   * (inner count & 3) leftover k-steps, one step per pass, accumulating
   * a1 * b1 into c01 and a2 * b1 into c02. AO advances by 2 elements and
   * BO by 1 element per pass.
   */
  2843. $L105:
  2844. #if defined(LT) || defined(RN)
  2845. and KK, 3, L
  2846. #else
  2847. and TMP1, 3, L
  2848. #endif
  2849. ble L, $L108
  2850. .align 4
  2851. $L106:
  2852. ADD c01, t1, c01
  2853. lda L, -1(L)
  2854. MUL a1, b1, t1
  2855. LD a1, 2 * SIZE(AO)
  2856. ADD c02, t2, c02
  2857. MUL a2, b1, t2
  2858. LD a2, 3 * SIZE(AO)
  2859. LD b1, 1 * SIZE(BO)
  2860. lda AO, 2 * SIZE(AO)
  2861. unop
  2862. lda BO, 1 * SIZE(BO)
  2863. bgt L, $L106
  2864. .align 4
  /*
   * $L108: finish the 2x1 tile.
   *  - drain t1..t4 and merge the partial sums: c01 += c03, c02 += c04;
   *  - LN / RT: TMP1 = KK - 2 (LN) or KK - 1 (RT),
   *             AO = AORIG + (TMP1 << (BASE_SHIFT + 1)),
   *             BO = B     + (TMP1 << BASE_SHIFT);
   *  - replace c01 / c02 by (loaded value - accumulator), loading from BO
   *    for LN / LT and from AO otherwise;
   *  - LN: 2x2 substitution with AO offsets 3, 2, 0 (row 1 first);
   *    LT: the same with offsets 0, 1, 3 (row 0 first);
   *    RN / RT: both values are multiplied by the single entry at BO;
   *  - write the results back to the source buffer and to C1, then advance
   *    AORIG / AO / BO / KK exactly as the other tiles do, with a step of 2.
   * NOTE(review): assumes SUB x, y, z computes z = x - y and that the
   * entries applied with MUL are pre-inverted diagonals -- confirm both.
   */
  2865. $L108:
  2866. ADD c01, t1, c01
  2867. ADD c02, t2, c02
  2868. ADD c03, t3, c03
  2869. ADD c04, t4, c04
  2870. ADD c01, c03, c01
  2871. ADD c02, c04, c02
  2872. #if defined(LN) || defined(RT)
  2873. #ifdef LN
  2874. subq KK, 2, TMP1
  2875. #else
  2876. subq KK, 1, TMP1
  2877. #endif
  2878. sll TMP1, BASE_SHIFT + 1, TMP2
  2879. addq AORIG, TMP2, AO
  2880. sll TMP1, BASE_SHIFT + 0, TMP2
  2881. addq B, TMP2, BO
  2882. #endif
  2883. #if defined(LN) || defined(LT)
  2884. LD a1, 0 * SIZE(BO)
  2885. LD a2, 1 * SIZE(BO)
  2886. SUB a1, c01, c01
  2887. SUB a2, c02, c02
  2888. #else
  2889. LD a1, 0 * SIZE(AO)
  2890. LD a2, 1 * SIZE(AO)
  2891. SUB a1, c01, c01
  2892. SUB a2, c02, c02
  2893. #endif
  2894. #ifdef LN
  2895. LD a1, 3 * SIZE(AO)
  2896. LD a2, 2 * SIZE(AO)
  2897. LD a3, 0 * SIZE(AO)
  2898. MUL a1, c02, c02
  2899. MUL a2, c02, t1
  2900. SUB c01, t1, c01
  2901. MUL a3, c01, c01
  2902. #endif
  2903. #ifdef LT
  2904. LD a1, 0 * SIZE(AO)
  2905. LD a2, 1 * SIZE(AO)
  2906. LD a3, 3 * SIZE(AO)
  2907. MUL a1, c01, c01
  2908. MUL a2, c01, t1
  2909. SUB c02, t1, c02
  2910. MUL a3, c02, c02
  2911. #endif
  2912. #if defined(RN) || defined(RT)
  2913. LD a1, 0 * SIZE(BO)
  2914. MUL a1, c01, c01
  2915. MUL a1, c02, c02
  2916. #endif
  2917. #if defined(LN) || defined(LT)
  2918. ST c01, 0 * SIZE(BO)
  2919. ST c02, 1 * SIZE(BO)
  2920. #else
  2921. ST c01, 0 * SIZE(AO)
  2922. ST c02, 1 * SIZE(AO)
  2923. #endif
  2924. #ifdef LN
  2925. lda C1, -2 * SIZE(C1)
  2926. #endif
  2927. ST c01, 0 * SIZE(C1)
  2928. ST c02, 1 * SIZE(C1)
  2929. #ifndef LN
  2930. lda C1, 2 * SIZE(C1)
  2931. #endif
  2932. fclr t1
  2933. fclr t2
  2934. fclr t3
  2935. fclr t4
  2936. #ifdef RT
  2937. sll K, 1 + BASE_SHIFT, TMP1
  2938. addq AORIG, TMP1, AORIG
  2939. #endif
  2940. #if defined(LT) || defined(RN)
  2941. subq K, KK, TMP1
  2942. sll TMP1, BASE_SHIFT + 1, TMP2
  2943. addq AO, TMP2, AO
  2944. sll TMP1, BASE_SHIFT + 0, TMP2
  2945. addq BO, TMP2, BO
  2946. #endif
  2947. #ifdef LT
  2948. addq KK, 2, KK
  2949. #endif
  2950. #ifdef LN
  2951. subq KK, 2, KK
  2952. #endif
  2953. .align 4
  /*
   * $L110 / $L91: set-up for one 4x1 tile of the single-column panel.
   * I = M >> 2 tiles are processed; with none to do, control goes to $L119.
   * Every pass clears t1..t4 and c01..c04, preloads a1..a4 and b1..b4, and
   * sets L = (inner count) >> 2 for the four-step loop at $L92.
   *   LT / RN : inner count is KK, BO = B.
   *   LN / RT : AO = AORIG + (KK << (BASE_SHIFT + 2)),
   *             BO = B + (KK << BASE_SHIFT), inner count TMP1 = K - KK;
   *             LN first moves AORIG back by K << (BASE_SHIFT + 2).
   * With L <= 0 the main loop is skipped and control goes to $L95.
   */
  2954. $L110:
  2955. sra M, 2, I
  2956. ble I, $L119
  2957. .align 4
  2958. $L91:
  2959. #if defined(LT) || defined(RN)
  2960. LD a1, 0 * SIZE(AO)
  2961. fclr t1
  2962. LD a2, 1 * SIZE(AO)
  2963. fclr t2
  2964. LD a3, 2 * SIZE(AO)
  2965. fclr t3
  2966. LD a4, 3 * SIZE(AO)
  2967. fclr t4
  2968. LD b1, 0 * SIZE(B)
  2969. fclr c01
  2970. LD b2, 1 * SIZE(B)
  2971. fclr c02
  2972. LD b3, 2 * SIZE(B)
  2973. fclr c03
  2974. LD b4, 3 * SIZE(B)
  2975. fclr c04
  2976. sra KK, 2, L
  2977. mov B, BO
  2978. ble L, $L95
  2979. #else
  2980. #ifdef LN
  2981. sll K, BASE_SHIFT + 2, TMP1
  2982. subq AORIG, TMP1, AORIG
  2983. #endif
  2984. sll KK, BASE_SHIFT + 2, TMP1
  2985. addq AORIG, TMP1, AO
  2986. sll KK, BASE_SHIFT + 0, TMP1
  2987. addq B, TMP1, BO
  2988. subq K, KK, TMP1
  2989. LD a1, 0 * SIZE(AO)
  2990. fclr t1
  2991. LD a2, 1 * SIZE(AO)
  2992. fclr t2
  2993. LD a3, 2 * SIZE(AO)
  2994. fclr t3
  2995. LD a4, 3 * SIZE(AO)
  2996. fclr t4
  2997. LD b1, 0 * SIZE(BO)
  2998. fclr c01
  2999. LD b2, 1 * SIZE(BO)
  3000. fclr c02
  3001. LD b3, 2 * SIZE(BO)
  3002. fclr c03
  3003. LD b4, 3 * SIZE(BO)
  3004. fclr c04
  3005. sra TMP1, 2, L
  3006. unop
  3007. ble L, $L95
  3008. #endif
  3009. .align 5
  /*
   * $L92: main loop of the 4x1 tile, four k-steps per pass.
   * Step n multiplies the four A values of that step by b<n> and
   * accumulates into c01..c04 through t1..t4; each ADD consumes the product
   * issued one step earlier. In the fourth step a5 holds the fourth A value
   * so that a4 can be reloaded for the next pass before the last multiply.
   * AO advances by 16 elements and BO by 4 elements per pass.
   * NOTE(review): the unop entries look like issue-slot padding -- confirm.
   */
  3010. $L92:
  3011. ADD c01, t1, c01
  3012. unop
  3013. MUL a1, b1, t1
  3014. LD a1, 4 * SIZE(AO)
  3015. ADD c02, t2, c02
  3016. lda L, -1(L)
  3017. MUL a2, b1, t2
  3018. LD a2, 5 * SIZE(AO)
  3019. ADD c03, t3, c03
  3020. unop
  3021. MUL a3, b1, t3
  3022. LD a3, 6 * SIZE(AO)
  3023. ADD c04, t4, c04
  3024. MUL a4, b1, t4
  3025. LD a4, 7 * SIZE(AO)
  3026. LD b1, 4 * SIZE(BO)
  3027. ADD c01, t1, c01
  3028. unop
  3029. MUL a1, b2, t1
  3030. LD a1, 8 * SIZE(AO)
  3031. ADD c02, t2, c02
  3032. unop
  3033. MUL a2, b2, t2
  3034. LD a2, 9 * SIZE(AO)
  3035. ADD c03, t3, c03
  3036. unop
  3037. MUL a3, b2, t3
  3038. LD a3, 10 * SIZE(AO)
  3039. ADD c04, t4, c04
  3040. MUL a4, b2, t4
  3041. LD a4, 11 * SIZE(AO)
  3042. LD b2, 5 * SIZE(BO)
  3043. ADD c01, t1, c01
  3044. unop
  3045. MUL a1, b3, t1
  3046. LD a1, 12 * SIZE(AO)
  3047. ADD c02, t2, c02
  3048. unop
  3049. MUL a2, b3, t2
  3050. LD a2, 13 * SIZE(AO)
  3051. ADD c03, t3, c03
  3052. unop
  3053. MUL a3, b3, t3
  3054. LD a3, 14 * SIZE(AO)
  3055. ADD c04, t4, c04
  3056. MUL a4, b3, t4
  3057. LD a5, 15 * SIZE(AO)
  3058. LD b3, 6 * SIZE(BO)
  3059. ADD c01, t1, c01
  3060. MUL a1, b4, t1
  3061. LD a1, 16 * SIZE(AO)
  3062. lda AO, 16 * SIZE(AO)
  3063. ADD c02, t2, c02
  3064. lda BO, 4 * SIZE(BO)
  3065. MUL a2, b4, t2
  3066. LD a2, 1 * SIZE(AO)
  3067. ADD c03, t3, c03
  3068. LD a4, 3 * SIZE(AO)
  3069. MUL a3, b4, t3
  3070. LD a3, 2 * SIZE(AO)
  3071. ADD c04, t4, c04
  3072. MUL a5, b4, t4
  3073. LD b4, 3 * SIZE(BO)
  3074. bgt L, $L92
  3075. .align 4
  /*
   * $L95 / $L96: remainder loop of the 4x1 tile for the
   * (inner count & 3) leftover k-steps, one step per pass: a1..a4 times b1
   * accumulated into c01..c04. AO advances by 4 elements and BO by 1
   * element per pass.
   */
  3076. $L95:
  3077. #if defined(LT) || defined(RN)
  3078. and KK, 3, L
  3079. #else
  3080. and TMP1, 3, L
  3081. #endif
  3082. unop
  3083. ble L, $L98
  3084. .align 4
  3085. $L96:
  3086. ADD c01, t1, c01
  3087. lda L, -1(L)
  3088. MUL a1, b1, t1
  3089. LD a1, 4 * SIZE(AO)
  3090. ADD c02, t2, c02
  3091. lda BO, 1 * SIZE(BO)
  3092. MUL a2, b1, t2
  3093. LD a2, 5 * SIZE(AO)
  3094. ADD c03, t3, c03
  3095. unop
  3096. MUL a3, b1, t3
  3097. LD a3, 6 * SIZE(AO)
  3098. ADD c04, t4, c04
  3099. MUL a4, b1, t4
  3100. LD a4, 7 * SIZE(AO)
  3101. LD b1, 0 * SIZE(BO)
  3102. lda AO, 4 * SIZE(AO)
  3103. bgt L, $L96
  3104. .align 4
  /*
   * $L98: solve, store and advance for the 4x1 tile.
   * First drain the products still pending in t1..t4, then (LN / RT only)
   * point AO / BO at the operands of the solve:
   *   TMP1 = KK - 4 (LN) or KK - 1 (RT),
   *   AO = AORIG + (TMP1 << (BASE_SHIFT + 2)), BO = B + (TMP1 << BASE_SHIFT).
   */
  3105. $L98:
  3106. ADD c01, t1, c01
  3107. ADD c02, t2, c02
  3108. ADD c03, t3, c03
  3109. ADD c04, t4, c04
  3110. #if defined(LN) || defined(RT)
  3111. #ifdef LN
  3112. subq KK, 4, TMP1
  3113. #else
  3114. subq KK, 1, TMP1
  3115. #endif
  3116. sll TMP1, BASE_SHIFT + 2, TMP2
  3117. addq AORIG, TMP2, AO
  3118. sll TMP1, BASE_SHIFT + 0, TMP2
  3119. addq B, TMP2, BO
  3120. #endif
  /*
   * Load the four right-hand-side values (from BO for LN / LT, from AO
   * otherwise) and replace each accumulator by (loaded value - accumulator).
   * NOTE(review): assumes SUB x, y, z computes z = x - y -- confirm against
   * the macro definition.
   */
  3121. #if defined(LN) || defined(LT)
  3122. LD a1, 0 * SIZE(BO)
  3123. LD a2, 1 * SIZE(BO)
  3124. LD a3, 2 * SIZE(BO)
  3125. LD a4, 3 * SIZE(BO)
  3126. SUB a1, c01, c01
  3127. SUB a2, c02, c02
  3128. SUB a3, c03, c03
  3129. SUB a4, c04, c04
  3130. #else
  3131. LD a1, 0 * SIZE(AO)
  3132. LD a2, 1 * SIZE(AO)
  3133. LD a3, 2 * SIZE(AO)
  3134. LD a4, 3 * SIZE(AO)
  3135. SUB a1, c01, c01
  3136. SUB a2, c02, c02
  3137. SUB a3, c03, c03
  3138. SUB a4, c04, c04
  3139. #endif
  /*
   * LN: substitution from row 3 up to row 0 with the 4x4 block at AO.
   * Offsets 15, 10, 5 and 0 are applied with MUL; the others are eliminated
   * with MUL + SUB.
   * NOTE(review): multiplying by the diagonal implies it is stored already
   * inverted -- confirm with the packing code.
   */
  3140. #ifdef LN
  3141. LD a1, 15 * SIZE(AO)
  3142. LD a2, 14 * SIZE(AO)
  3143. LD a3, 13 * SIZE(AO)
  3144. LD a4, 12 * SIZE(AO)
  3145. MUL a1, c04, c04
  3146. MUL a2, c04, t1
  3147. SUB c03, t1, c03
  3148. MUL a3, c04, t1
  3149. SUB c02, t1, c02
  3150. MUL a4, c04, t1
  3151. SUB c01, t1, c01
  3152. LD b1, 10 * SIZE(AO)
  3153. LD b2, 9 * SIZE(AO)
  3154. LD b3, 8 * SIZE(AO)
  3155. MUL b1, c03, c03
  3156. MUL b2, c03, t1
  3157. SUB c02, t1, c02
  3158. MUL b3, c03, t1
  3159. SUB c01, t1, c01
  3160. LD a1, 5 * SIZE(AO)
  3161. LD a2, 4 * SIZE(AO)
  3162. LD a3, 0 * SIZE(AO)
  3163. MUL a1, c02, c02
  3164. MUL a2, c02, t1
  3165. SUB c01, t1, c01
  3166. MUL a3, c01, c01
  3167. #endif
  /*
   * LT: the same substitution from row 0 down to row 3, using offsets
   * 0..3, 5..7, 10, 11 and 15 of the block at AO.
   */
  3168. #ifdef LT
  3169. LD a1, 0 * SIZE(AO)
  3170. LD a2, 1 * SIZE(AO)
  3171. LD a3, 2 * SIZE(AO)
  3172. LD a4, 3 * SIZE(AO)
  3173. MUL a1, c01, c01
  3174. MUL a2, c01, t1
  3175. SUB c02, t1, c02
  3176. MUL a3, c01, t1
  3177. SUB c03, t1, c03
  3178. MUL a4, c01, t1
  3179. SUB c04, t1, c04
  3180. LD b1, 5 * SIZE(AO)
  3181. LD b2, 6 * SIZE(AO)
  3182. LD b3, 7 * SIZE(AO)
  3183. MUL b1, c02, c02
  3184. MUL b2, c02, t1
  3185. SUB c03, t1, c03
  3186. MUL b3, c02, t1
  3187. SUB c04, t1, c04
  3188. LD a1, 10 * SIZE(AO)
  3189. LD a2, 11 * SIZE(AO)
  3190. LD a3, 15 * SIZE(AO)
  3191. MUL a1, c03, c03
  3192. MUL a2, c03, t1
  3193. SUB c04, t1, c04
  3194. MUL a3, c04, c04
  3195. #endif
  /* RN / RT: all four values are multiplied by the single entry at BO. */
  3196. #if defined(RN) || defined(RT)
  3197. LD a1, 0 * SIZE(BO)
  3198. MUL a1, c01, c01
  3199. MUL a1, c02, c02
  3200. MUL a1, c03, c03
  3201. MUL a1, c04, c04
  3202. #endif
  /*
   * Write the solved values back to the buffer they were loaded from and to
   * C1 (LN steps C1 back by 4 elements first; the others step it forward by
   * 4 afterwards).
   */
  3203. #if defined(LN) || defined(LT)
  3204. ST c01, 0 * SIZE(BO)
  3205. ST c02, 1 * SIZE(BO)
  3206. ST c03, 2 * SIZE(BO)
  3207. ST c04, 3 * SIZE(BO)
  3208. #else
  3209. ST c01, 0 * SIZE(AO)
  3210. ST c02, 1 * SIZE(AO)
  3211. ST c03, 2 * SIZE(AO)
  3212. ST c04, 3 * SIZE(AO)
  3213. #endif
  3214. #ifdef LN
  3215. lda C1, -4 * SIZE(C1)
  3216. #endif
  3217. ST c01, 0 * SIZE(C1)
  3218. ST c02, 1 * SIZE(C1)
  3219. ST c03, 2 * SIZE(C1)
  3220. ST c04, 3 * SIZE(C1)
  3221. #ifndef LN
  3222. lda C1, 4 * SIZE(C1)
  3223. #endif
  /*
   * Clear t1..t4 for the next tile and advance the bookkeeping: RT moves
   * AORIG forward by K << (2 + BASE_SHIFT); LT / RN skip AO and BO past the
   * remaining K - KK steps; LT adds 4 to KK and LN subtracts 4. The loop
   * repeats while I > 0.
   */
  3224. fclr t1
  3225. fclr t2
  3226. fclr t3
  3227. fclr t4
  3228. #ifdef RT
  3229. sll K, 2 + BASE_SHIFT, TMP1
  3230. addq AORIG, TMP1, AORIG
  3231. #endif
  3232. #if defined(LT) || defined(RN)
  3233. subq K, KK, TMP1
  3234. sll TMP1, BASE_SHIFT + 2, TMP2
  3235. addq AO, TMP2, AO
  3236. sll TMP1, BASE_SHIFT + 0, TMP2
  3237. addq BO, TMP2, BO
  3238. #endif
  3239. #ifdef LT
  3240. addq KK, 4, KK
  3241. #endif
  3242. #ifdef LN
  3243. subq KK, 4, KK
  3244. #endif
  3245. lda I, -1(I)
  3246. bgt I, $L91
  3247. .align 4
  /*
   * $L119: end of the single-column panel.
   *   LN      : B is advanced through SXADDQ K, B, B.
   *   LT / RN : B takes the value BO reached while walking the panel.
   *   RN      : KK += 1;   RT : KK -= 1.
   * NOTE(review): SXADDQ is a macro defined outside this excerpt;
   * presumably it adds K scaled by the element size -- confirm.
   */
  3248. $L119:
  3249. #ifdef LN
  3250. SXADDQ K, B, B
  3251. #endif
  3252. #if defined(LT) || defined(RN)
  3253. mov BO, B
  3254. #endif
  3255. #ifdef RN
  3256. addq KK, 1, KK
  3257. #endif
  3258. #ifdef RT
  3259. subq KK, 1, KK
  3260. #endif
  3261. .align 4
  /*
   * $L999: common exit. Reloads $f2..$f9 from the first 64 bytes of the
   * stack frame, sets $0 to zero, releases STACKSIZE bytes of stack and
   * returns.
   * NOTE(review): the matching stores of $f2..$f9 are presumably in the
   * prologue, which is outside this excerpt; $0 is presumably the routine's
   * integer return value -- confirm against the Alpha calling standard.
   */
  3262. $L999:
  3263. ldt $f2, 0($sp)
  3264. ldt $f3, 8($sp)
  3265. ldt $f4, 16($sp)
  3266. ldt $f5, 24($sp)
  3267. ldt $f6, 32($sp)
  3268. ldt $f7, 40($sp)
  3269. ldt $f8, 48($sp)
  3270. ldt $f9, 56($sp)
  3271. clr $0
  3272. lda $sp, STACKSIZE($sp)
  3273. ret
  3274. EPILOGUE