
gemm_kernel_8x4_sse3.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
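
/* Symbolic names for the registers used below: the System V argument
   registers carry M, N, K and the A, B, C pointers; r10-r15 and rbp
   hold the leading dimension of C, the loop counters, the running
   A/B panel pointers and the C column pointers. */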
#define M %rdi
#define N %rsi
#define K %rdx
#define A %rcx
#define B %r8
#define C %r9
#define LDC %r10
#define I %r11
#define AO %r12
#define BO %r13
#define CO1 %r14
#define CO2 %r15
#define BB %rbp

#ifndef WINDOWS_ABI
#define STACKSIZE 64
#define OLD_LDC 8 + STACKSIZE(%rsp)
#define OLD_OFFSET 16 + STACKSIZE(%rsp)
#else
#define STACKSIZE 256
#define OLD_A 40 + STACKSIZE(%rsp)
#define OLD_B 48 + STACKSIZE(%rsp)
#define OLD_C 56 + STACKSIZE(%rsp)
#define OLD_LDC 64 + STACKSIZE(%rsp)
#define OLD_OFFSET 72 + STACKSIZE(%rsp)
#endif

#define ALPHA 0(%rsp)
#define J 16(%rsp)
#define OFFSET 24(%rsp)
#define KK 32(%rsp)
#define KKK 40(%rsp)
#define BUFFER 128(%rsp)

#define PREFETCH prefetcht0
#define PREFETCHSIZE 320
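
/* KERNEL1-KERNEL16 unroll eight k-iterations of the 8x4 micro-kernel.
   Each odd/even pair covers one k step: the odd macro accumulates the
   product of four A values (xmm8/xmm10/xmm12/xmm14) with the four
   broadcast B values into xmm0-xmm3, and the even macro accumulates
   the next four A values into xmm4-xmm7.  movsldup/movshdup -- the
   SSE3 instructions this kernel is named after -- broadcast the even
   and odd copies of the duplicated B pairs laid down in BUFFER. */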
#define KERNEL1(address) \
mulps %xmm8, %xmm9; \
PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AO); \
addps %xmm9, %xmm0; \
movshdup 0 * SIZE + (address) * SIZE(BO), %xmm9; \
mulps %xmm8, %xmm9; \
addps %xmm9, %xmm1; \
movsldup 4 * SIZE + (address) * SIZE(BO), %xmm9; \
mulps %xmm8, %xmm9; \
addps %xmm9, %xmm2; \
movshdup 4 * SIZE + (address) * SIZE(BO), %xmm9; \
mulps %xmm8, %xmm9; \
movaps 4 * SIZE + (address) * SIZE(AO), %xmm8; \
addps %xmm9, %xmm3; \
movsldup 0 * SIZE + (address) * SIZE(BO), %xmm9

#define KERNEL2(address) \
mulps %xmm8, %xmm9; \
addps %xmm9, %xmm4; \
movshdup 0 * SIZE + (address) * SIZE(BO), %xmm9; \
mulps %xmm8, %xmm9; \
addps %xmm9, %xmm5; \
movsldup 4 * SIZE + (address) * SIZE(BO), %xmm9; \
mulps %xmm8, %xmm9; \
addps %xmm9, %xmm6; \
movshdup 4 * SIZE + (address) * SIZE(BO), %xmm9; \
mulps %xmm8, %xmm9; \
movaps 8 * SIZE + (address) * SIZE(AO), %xmm8; \
addps %xmm9, %xmm7; \
movsldup 8 * SIZE + (address) * SIZE(BO), %xmm9

#define KERNEL3(address) \
mulps %xmm8, %xmm9; \
addps %xmm9, %xmm0; \
movshdup 8 * SIZE + (address) * SIZE(BO), %xmm9; \
mulps %xmm8, %xmm9; \
addps %xmm9, %xmm1; \
movsldup 12 * SIZE + (address) * SIZE(BO), %xmm9; \
mulps %xmm8, %xmm9; \
addps %xmm9, %xmm2; \
movshdup 12 * SIZE + (address) * SIZE(BO), %xmm9; \
mulps %xmm8, %xmm9; \
movaps 12 * SIZE + (address) * SIZE(AO), %xmm8; \
addps %xmm9, %xmm3; \
movsldup 8 * SIZE + (address) * SIZE(BO), %xmm9

#define KERNEL4(address) \
mulps %xmm8, %xmm9; \
addps %xmm9, %xmm4; \
movshdup 8 * SIZE + (address) * SIZE(BO), %xmm9; \
mulps %xmm8, %xmm9; \
addps %xmm9, %xmm5; \
movsldup 12 * SIZE + (address) * SIZE(BO), %xmm9; \
mulps %xmm8, %xmm9; \
addps %xmm9, %xmm6; \
movshdup 12 * SIZE + (address) * SIZE(BO), %xmm9; \
mulps %xmm8, %xmm9; \
movaps 64 * SIZE + (address) * SIZE(AO), %xmm8; \
addps %xmm9, %xmm7; \
movsldup 64 * SIZE + (address) * SIZE(BO), %xmm9

#define KERNEL5(address) \
mulps %xmm10, %xmm11; \
addps %xmm11, %xmm0; \
movshdup 16 * SIZE + (address) * SIZE(BO), %xmm11; \
mulps %xmm10, %xmm11; \
addps %xmm11, %xmm1; \
movsldup 20 * SIZE + (address) * SIZE(BO), %xmm11; \
mulps %xmm10, %xmm11; \
addps %xmm11, %xmm2; \
movshdup 20 * SIZE + (address) * SIZE(BO), %xmm11; \
mulps %xmm10, %xmm11; \
movaps 20 * SIZE + (address) * SIZE(AO), %xmm10; \
addps %xmm11, %xmm3; \
movsldup 16 * SIZE + (address) * SIZE(BO), %xmm11

#define KERNEL6(address) \
mulps %xmm10, %xmm11; \
addps %xmm11, %xmm4; \
movshdup 16 * SIZE + (address) * SIZE(BO), %xmm11; \
mulps %xmm10, %xmm11; \
addps %xmm11, %xmm5; \
movsldup 20 * SIZE + (address) * SIZE(BO), %xmm11; \
mulps %xmm10, %xmm11; \
addps %xmm11, %xmm6; \
movshdup 20 * SIZE + (address) * SIZE(BO), %xmm11; \
mulps %xmm10, %xmm11; \
movaps 24 * SIZE + (address) * SIZE(AO), %xmm10; \
addps %xmm11, %xmm7; \
movsldup 24 * SIZE + (address) * SIZE(BO), %xmm11

#define KERNEL7(address) \
mulps %xmm10, %xmm11; \
addps %xmm11, %xmm0; \
movshdup 24 * SIZE + (address) * SIZE(BO), %xmm11; \
mulps %xmm10, %xmm11; \
addps %xmm11, %xmm1; \
movsldup 28 * SIZE + (address) * SIZE(BO), %xmm11; \
mulps %xmm10, %xmm11; \
addps %xmm11, %xmm2; \
movshdup 28 * SIZE + (address) * SIZE(BO), %xmm11; \
mulps %xmm10, %xmm11; \
movaps 28 * SIZE + (address) * SIZE(AO), %xmm10; \
addps %xmm11, %xmm3; \
movsldup 24 * SIZE + (address) * SIZE(BO), %xmm11

#define KERNEL8(address) \
mulps %xmm10, %xmm11; \
addps %xmm11, %xmm4; \
movshdup 24 * SIZE + (address) * SIZE(BO), %xmm11; \
mulps %xmm10, %xmm11; \
addps %xmm11, %xmm5; \
movsldup 28 * SIZE + (address) * SIZE(BO), %xmm11; \
mulps %xmm10, %xmm11; \
addps %xmm11, %xmm6; \
movshdup 28 * SIZE + (address) * SIZE(BO), %xmm11; \
mulps %xmm10, %xmm11; \
movaps 80 * SIZE + (address) * SIZE(AO), %xmm10; \
addps %xmm11, %xmm7; \
movsldup 80 * SIZE + (address) * SIZE(BO), %xmm11

#define KERNEL9(address) \
mulps %xmm12, %xmm13; \
PREFETCH (PREFETCHSIZE + 32) * SIZE + (address) * SIZE(AO); \
addps %xmm13, %xmm0; \
movshdup 32 * SIZE + (address) * SIZE(BO), %xmm13; \
mulps %xmm12, %xmm13; \
addps %xmm13, %xmm1; \
movsldup 36 * SIZE + (address) * SIZE(BO), %xmm13; \
mulps %xmm12, %xmm13; \
addps %xmm13, %xmm2; \
movshdup 36 * SIZE + (address) * SIZE(BO), %xmm13; \
mulps %xmm12, %xmm13; \
movaps 36 * SIZE + (address) * SIZE(AO), %xmm12; \
addps %xmm13, %xmm3; \
movsldup 32 * SIZE + (address) * SIZE(BO), %xmm13

#define KERNEL10(address) \
mulps %xmm12, %xmm13; \
addps %xmm13, %xmm4; \
movshdup 32 * SIZE + (address) * SIZE(BO), %xmm13; \
mulps %xmm12, %xmm13; \
addps %xmm13, %xmm5; \
movsldup 36 * SIZE + (address) * SIZE(BO), %xmm13; \
mulps %xmm12, %xmm13; \
addps %xmm13, %xmm6; \
movshdup 36 * SIZE + (address) * SIZE(BO), %xmm13; \
mulps %xmm12, %xmm13; \
movaps 40 * SIZE + (address) * SIZE(AO), %xmm12; \
addps %xmm13, %xmm7; \
movsldup 40 * SIZE + (address) * SIZE(BO), %xmm13

#define KERNEL11(address) \
mulps %xmm12, %xmm13; \
addps %xmm13, %xmm0; \
movshdup 40 * SIZE + (address) * SIZE(BO), %xmm13; \
mulps %xmm12, %xmm13; \
addps %xmm13, %xmm1; \
movsldup 44 * SIZE + (address) * SIZE(BO), %xmm13; \
mulps %xmm12, %xmm13; \
addps %xmm13, %xmm2; \
movshdup 44 * SIZE + (address) * SIZE(BO), %xmm13; \
mulps %xmm12, %xmm13; \
movaps 44 * SIZE + (address) * SIZE(AO), %xmm12; \
addps %xmm13, %xmm3; \
movsldup 40 * SIZE + (address) * SIZE(BO), %xmm13

#define KERNEL12(address) \
mulps %xmm12, %xmm13; \
addps %xmm13, %xmm4; \
movshdup 40 * SIZE + (address) * SIZE(BO), %xmm13; \
mulps %xmm12, %xmm13; \
addps %xmm13, %xmm5; \
movsldup 44 * SIZE + (address) * SIZE(BO), %xmm13; \
mulps %xmm12, %xmm13; \
addps %xmm13, %xmm6; \
movshdup 44 * SIZE + (address) * SIZE(BO), %xmm13; \
mulps %xmm12, %xmm13; \
movaps 96 * SIZE + (address) * SIZE(AO), %xmm12; \
addps %xmm13, %xmm7; \
movsldup 96 * SIZE + (address) * SIZE(BO), %xmm13

#define KERNEL13(address) \
mulps %xmm14, %xmm15; \
addps %xmm15, %xmm0; \
movshdup 48 * SIZE + (address) * SIZE(BO), %xmm15; \
mulps %xmm14, %xmm15; \
addps %xmm15, %xmm1; \
movsldup 52 * SIZE + (address) * SIZE(BO), %xmm15; \
mulps %xmm14, %xmm15; \
addps %xmm15, %xmm2; \
movshdup 52 * SIZE + (address) * SIZE(BO), %xmm15; \
mulps %xmm14, %xmm15; \
movaps 52 * SIZE + (address) * SIZE(AO), %xmm14; \
addps %xmm15, %xmm3; \
movsldup 48 * SIZE + (address) * SIZE(BO), %xmm15

#define KERNEL14(address) \
mulps %xmm14, %xmm15; \
addps %xmm15, %xmm4; \
movshdup 48 * SIZE + (address) * SIZE(BO), %xmm15; \
mulps %xmm14, %xmm15; \
addps %xmm15, %xmm5; \
movsldup 52 * SIZE + (address) * SIZE(BO), %xmm15; \
mulps %xmm14, %xmm15; \
addps %xmm15, %xmm6; \
movshdup 52 * SIZE + (address) * SIZE(BO), %xmm15; \
mulps %xmm14, %xmm15; \
movaps 56 * SIZE + (address) * SIZE(AO), %xmm14; \
addps %xmm15, %xmm7; \
movsldup 56 * SIZE + (address) * SIZE(BO), %xmm15

#define KERNEL15(address) \
mulps %xmm14, %xmm15; \
addps %xmm15, %xmm0; \
movshdup 56 * SIZE + (address) * SIZE(BO), %xmm15; \
mulps %xmm14, %xmm15; \
addps %xmm15, %xmm1; \
movsldup 60 * SIZE + (address) * SIZE(BO), %xmm15; \
mulps %xmm14, %xmm15; \
addps %xmm15, %xmm2; \
movshdup 60 * SIZE + (address) * SIZE(BO), %xmm15; \
mulps %xmm14, %xmm15; \
movaps 60 * SIZE + (address) * SIZE(AO), %xmm14; \
addps %xmm15, %xmm3; \
movsldup 56 * SIZE + (address) * SIZE(BO), %xmm15

#define KERNEL16(address) \
mulps %xmm14, %xmm15; \
addps %xmm15, %xmm4; \
movshdup 56 * SIZE + (address) * SIZE(BO), %xmm15; \
mulps %xmm14, %xmm15; \
addps %xmm15, %xmm5; \
movsldup 60 * SIZE + (address) * SIZE(BO), %xmm15; \
mulps %xmm14, %xmm15; \
addps %xmm15, %xmm6; \
movshdup 60 * SIZE + (address) * SIZE(BO), %xmm15; \
mulps %xmm14, %xmm15; \
movaps 112 * SIZE + (address) * SIZE(AO), %xmm14; \
addps %xmm15, %xmm7; \
movsldup 112 * SIZE + (address) * SIZE(BO), %xmm15
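
/* Function prologue: reserve the fixed stack frame, spill the
   callee-saved integer registers and, under WINDOWS_ABI, xmm6-xmm15
   plus rdi/rsi, then pick up the arguments that arrive on the stack
   in that ABI.  The local BUFFER area is carved out below the frame
   and the stack pointer is aligned to a 4 KB boundary. */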
PROLOGUE
PROFCODE

subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
movq %r13, 24(%rsp)
movq %r14, 32(%rsp)
movq %r15, 40(%rsp)

#ifdef WINDOWS_ABI
movq %rdi, 48(%rsp)
movq %rsi, 56(%rsp)
movups %xmm6, 64(%rsp)
movups %xmm7, 80(%rsp)
movups %xmm8, 96(%rsp)
movups %xmm9, 112(%rsp)
movups %xmm10, 128(%rsp)
movups %xmm11, 144(%rsp)
movups %xmm12, 160(%rsp)
movups %xmm13, 176(%rsp)
movups %xmm14, 192(%rsp)
movups %xmm15, 208(%rsp)

movq ARG1, M
movq ARG2, N
movq ARG3, K
movq OLD_A, A
movq OLD_B, B
movq OLD_C, C
movq OLD_LDC, LDC
#ifdef TRMMKERNEL
movsd OLD_OFFSET, %xmm4
#endif
movaps %xmm3, %xmm0
#else
movq OLD_LDC, LDC
#ifdef TRMMKERNEL
movsd OLD_OFFSET, %xmm4
#endif
#endif

movq %rsp, %rbx # save old stack
subq $128 + LOCAL_BUFFER_SIZE, %rsp
andq $-4096, %rsp # align stack

STACK_TOUCHING

shufps $0, %xmm0, %xmm0
movaps %xmm0, ALPHA

#ifdef TRMMKERNEL
movsd %xmm4, OFFSET
movsd %xmm4, KK
#ifndef LEFT
negq KK
#endif
#endif

leaq (, LDC, SIZE), LDC

movq N, J
sarq $2, J # j = (n >> 2)
jle .L50
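
/* Outer loop over 4-column panels of B (J = N >> 2); panels narrower
   than four columns are handled from .L50 onward. */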
.L01:
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
#endif

/* Copying to Sub Buffer */
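/* Each pair of B values is stored twice via movddup, so BUFFER holds
   (b0,b1,b0,b1)(b2,b3,b2,b3)...; the kernel's movsldup/movshdup loads
   then yield full four-wide broadcasts of each b without any extra
   shuffles in the inner loop. */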
leaq BUFFER, BO

movq K, %rax
sarq $2, %rax
jle .L03
ALIGN_4

.L02:
movddup 0 * SIZE(B), %xmm0
movddup 2 * SIZE(B), %xmm1
movddup 4 * SIZE(B), %xmm2
movddup 6 * SIZE(B), %xmm3
movddup 8 * SIZE(B), %xmm4
movddup 10 * SIZE(B), %xmm5
movddup 12 * SIZE(B), %xmm6
movddup 14 * SIZE(B), %xmm7

movaps %xmm0, 0 * SIZE(BO)
movaps %xmm1, 4 * SIZE(BO)
movaps %xmm2, 8 * SIZE(BO)
movaps %xmm3, 12 * SIZE(BO)
movaps %xmm4, 16 * SIZE(BO)
movaps %xmm5, 20 * SIZE(BO)
movaps %xmm6, 24 * SIZE(BO)
movaps %xmm7, 28 * SIZE(BO)

prefetcht1 128 * SIZE(BO)
prefetcht0 112 * SIZE(B)

addq $16 * SIZE, B
addq $32 * SIZE, BO
decq %rax
jne .L02
ALIGN_4

.L03:
movq K, %rax
andq $3, %rax
BRANCH
jle .L10
ALIGN_4

.L04:
movddup 0 * SIZE(B), %xmm0
movddup 2 * SIZE(B), %xmm1
movaps %xmm0, 0 * SIZE(BO)
movaps %xmm1, 4 * SIZE(BO)
addq $4 * SIZE, B
addq $8 * SIZE, BO
decq %rax
jne .L04
ALIGN_4

.L10:
movq C, CO1 # coffset1 = c
leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
movq A, AO # aoffset = a

leaq 112 * SIZE(B), BB

movq M, I
sarq $3, I # i = (m >> 3)
jle .L20
ALIGN_4
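
/* 8x4 blocks: prefetch the next B panel, preload A/B, zero the eight
   accumulators and run the unrolled KERNEL sequence over K.  The
   "#if 1" path below is a jump-threaded, fully unrolled K loop; the
   disabled "#else" path is a plain eightfold-unrolled loop kept for
   reference. */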
.L11:
prefetcht0 0 * SIZE(BB)
prefetcht0 8 * SIZE(BB)
subq $-16 * SIZE, BB

#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER, BO
#else
leaq BUFFER, BO
movq KK, %rax
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 4), BO
#endif

movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
movaps 32 * SIZE(AO), %xmm12
movaps 48 * SIZE(AO), %xmm14

movsldup 0 * SIZE(BO), %xmm9
movsldup 16 * SIZE(BO), %xmm11
movsldup 32 * SIZE(BO), %xmm13
movsldup 48 * SIZE(BO), %xmm15

pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
pxor %xmm2, %xmm2
pxor %xmm3, %xmm3

prefetchnta 8 * SIZE(CO1)
pxor %xmm4, %xmm4
prefetchnta 8 * SIZE(CO2)
pxor %xmm5, %xmm5
prefetchnta 8 * SIZE(CO1, LDC, 2)
pxor %xmm6, %xmm6
prefetchnta 8 * SIZE(CO2, LDC, 2)
pxor %xmm7, %xmm7

#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $8, %rax
#else
addq $4, %rax
#endif
movq %rax, KKK
#endif

#if 1
andq $-8, %rax
salq $4, %rax
je .L15
.L1X:
KERNEL1 (64 * 0)
KERNEL2 (64 * 0)
KERNEL3 (64 * 0)
KERNEL4 (64 * 0)
KERNEL5 (64 * 0)
KERNEL6 (64 * 0)
KERNEL7 (64 * 0)
KERNEL8 (64 * 0)
KERNEL9 (64 * 0)
KERNEL10(64 * 0)
KERNEL11(64 * 0)
KERNEL12(64 * 0)
KERNEL13(64 * 0)
KERNEL14(64 * 0)
KERNEL15(64 * 0)
KERNEL16(64 * 0)
cmpq $128 * 1, %rax
NOBRANCH
jle .L12
KERNEL1 (64 * 1)
KERNEL2 (64 * 1)
KERNEL3 (64 * 1)
KERNEL4 (64 * 1)
KERNEL5 (64 * 1)
KERNEL6 (64 * 1)
KERNEL7 (64 * 1)
KERNEL8 (64 * 1)
KERNEL9 (64 * 1)
KERNEL10(64 * 1)
KERNEL11(64 * 1)
KERNEL12(64 * 1)
KERNEL13(64 * 1)
KERNEL14(64 * 1)
KERNEL15(64 * 1)
KERNEL16(64 * 1)
cmpq $128 * 2, %rax
NOBRANCH
jle .L12
KERNEL1 (64 * 2)
KERNEL2 (64 * 2)
KERNEL3 (64 * 2)
KERNEL4 (64 * 2)
KERNEL5 (64 * 2)
KERNEL6 (64 * 2)
KERNEL7 (64 * 2)
KERNEL8 (64 * 2)
KERNEL9 (64 * 2)
KERNEL10(64 * 2)
KERNEL11(64 * 2)
KERNEL12(64 * 2)
KERNEL13(64 * 2)
KERNEL14(64 * 2)
KERNEL15(64 * 2)
KERNEL16(64 * 2)
cmpq $128 * 3, %rax
NOBRANCH
jle .L12
KERNEL1 (64 * 3)
KERNEL2 (64 * 3)
KERNEL3 (64 * 3)
KERNEL4 (64 * 3)
KERNEL5 (64 * 3)
KERNEL6 (64 * 3)
KERNEL7 (64 * 3)
KERNEL8 (64 * 3)
KERNEL9 (64 * 3)
KERNEL10(64 * 3)
KERNEL11(64 * 3)
KERNEL12(64 * 3)
KERNEL13(64 * 3)
KERNEL14(64 * 3)
KERNEL15(64 * 3)
KERNEL16(64 * 3)
cmpq $128 * 4, %rax
NOBRANCH
jle .L12
KERNEL1 (64 * 4)
KERNEL2 (64 * 4)
KERNEL3 (64 * 4)
KERNEL4 (64 * 4)
KERNEL5 (64 * 4)
KERNEL6 (64 * 4)
KERNEL7 (64 * 4)
KERNEL8 (64 * 4)
KERNEL9 (64 * 4)
KERNEL10(64 * 4)
KERNEL11(64 * 4)
KERNEL12(64 * 4)
KERNEL13(64 * 4)
KERNEL14(64 * 4)
KERNEL15(64 * 4)
KERNEL16(64 * 4)
cmpq $128 * 5, %rax
NOBRANCH
jle .L12
KERNEL1 (64 * 5)
KERNEL2 (64 * 5)
KERNEL3 (64 * 5)
KERNEL4 (64 * 5)
KERNEL5 (64 * 5)
KERNEL6 (64 * 5)
KERNEL7 (64 * 5)
KERNEL8 (64 * 5)
KERNEL9 (64 * 5)
KERNEL10(64 * 5)
KERNEL11(64 * 5)
KERNEL12(64 * 5)
KERNEL13(64 * 5)
KERNEL14(64 * 5)
KERNEL15(64 * 5)
KERNEL16(64 * 5)
cmpq $128 * 6, %rax
NOBRANCH
jle .L12
KERNEL1 (64 * 6)
KERNEL2 (64 * 6)
KERNEL3 (64 * 6)
KERNEL4 (64 * 6)
KERNEL5 (64 * 6)
KERNEL6 (64 * 6)
KERNEL7 (64 * 6)
KERNEL8 (64 * 6)
KERNEL9 (64 * 6)
KERNEL10(64 * 6)
KERNEL11(64 * 6)
KERNEL12(64 * 6)
KERNEL13(64 * 6)
KERNEL14(64 * 6)
KERNEL15(64 * 6)
KERNEL16(64 * 6)
cmpq $128 * 7, %rax
NOBRANCH
jle .L12
KERNEL1 (64 * 7)
KERNEL2 (64 * 7)
KERNEL3 (64 * 7)
KERNEL4 (64 * 7)
KERNEL5 (64 * 7)
KERNEL6 (64 * 7)
KERNEL7 (64 * 7)
KERNEL8 (64 * 7)
KERNEL9 (64 * 7)
KERNEL10(64 * 7)
KERNEL11(64 * 7)
KERNEL12(64 * 7)
KERNEL13(64 * 7)
KERNEL14(64 * 7)
KERNEL15(64 * 7)
KERNEL16(64 * 7)
addq $64 * 8 * SIZE, AO
addq $64 * 8 * SIZE, BO
subq $128 * 8, %rax
jg .L1X

.L12:
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
#else
sarq $3, %rax
je .L15
ALIGN_4

.L12:
KERNEL1 (64 * 0)
KERNEL2 (64 * 0)
KERNEL3 (64 * 0)
KERNEL4 (64 * 0)
KERNEL5 (64 * 0)
KERNEL6 (64 * 0)
KERNEL7 (64 * 0)
KERNEL8 (64 * 0)
KERNEL9 (64 * 0)
KERNEL10(64 * 0)
KERNEL11(64 * 0)
KERNEL12(64 * 0)
KERNEL13(64 * 0)
KERNEL14(64 * 0)
KERNEL15(64 * 0)
KERNEL16(64 * 0)
addq $64 * SIZE, AO
addq $64 * SIZE, BO
decq %rax
jne .L12
#endif
ALIGN_4
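
/* Tail of the K loop: the k % 8 leftover iterations are handled one
   at a time. */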
.L15:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
movaps ALPHA, %xmm15
andq $7, %rax # k remainder (k & 7)
BRANCH
je .L18
ALIGN_4

.L16:
mulps %xmm8, %xmm9
addps %xmm9, %xmm0
movshdup 0 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
addps %xmm9, %xmm1
movsldup 4 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
addps %xmm9, %xmm2
movshdup 4 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
movaps 4 * SIZE(AO), %xmm8
addps %xmm9, %xmm3
movsldup 0 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
addps %xmm9, %xmm4
movshdup 0 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
addps %xmm9, %xmm5
movsldup 4 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
addps %xmm9, %xmm6
movshdup 4 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
movaps 8 * SIZE(AO), %xmm8
addps %xmm9, %xmm7
movsldup 8 * SIZE(BO), %xmm9

addq $8 * SIZE, AO
addq $8 * SIZE, BO
decq %rax
jg .L16
ALIGN_4
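
/* Write-back for the 8x4 block: scale the accumulators by alpha and,
   unless this is a TRMM kernel or a BETAZERO build, add the existing
   contents of C before storing. */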
.L18:
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
movsd 0 * SIZE(CO1), %xmm8
mulps %xmm15, %xmm0
movhps 2 * SIZE(CO1), %xmm8
mulps %xmm15, %xmm1
movsd 4 * SIZE(CO1), %xmm9
mulps %xmm15, %xmm2
movhps 6 * SIZE(CO1), %xmm9
mulps %xmm15, %xmm3
movsd 0 * SIZE(CO2), %xmm10
mulps %xmm15, %xmm4
movhps 2 * SIZE(CO2), %xmm10
mulps %xmm15, %xmm5
movsd 4 * SIZE(CO2), %xmm11
mulps %xmm15, %xmm6
movhps 6 * SIZE(CO2), %xmm11
mulps %xmm15, %xmm7
movsd 0 * SIZE(CO1, LDC, 2), %xmm12
movhps 2 * SIZE(CO1, LDC, 2), %xmm12
movsd 4 * SIZE(CO1, LDC, 2), %xmm13
movhps 6 * SIZE(CO1, LDC, 2), %xmm13
movsd 0 * SIZE(CO2, LDC, 2), %xmm14
movhps 2 * SIZE(CO2, LDC, 2), %xmm14
movsd 4 * SIZE(CO2, LDC, 2), %xmm15
movhps 6 * SIZE(CO2, LDC, 2), %xmm15

addps %xmm8, %xmm0
addps %xmm9, %xmm4
addps %xmm10, %xmm1
addps %xmm11, %xmm5
addps %xmm12, %xmm2
movsd %xmm0, 0 * SIZE(CO1)
movhps %xmm0, 2 * SIZE(CO1)
addps %xmm13, %xmm6
movsd %xmm4, 4 * SIZE(CO1)
movhps %xmm4, 6 * SIZE(CO1)
addps %xmm14, %xmm3
movsd %xmm1, 0 * SIZE(CO2)
movhps %xmm1, 2 * SIZE(CO2)
addps %xmm15, %xmm7
movsd %xmm5, 4 * SIZE(CO2)
movhps %xmm5, 6 * SIZE(CO2)
#else
mulps %xmm15, %xmm0
mulps %xmm15, %xmm1
mulps %xmm15, %xmm2
mulps %xmm15, %xmm3
mulps %xmm15, %xmm4
mulps %xmm15, %xmm5
mulps %xmm15, %xmm6
mulps %xmm15, %xmm7

movsd %xmm0, 0 * SIZE(CO1)
movhps %xmm0, 2 * SIZE(CO1)
movsd %xmm4, 4 * SIZE(CO1)
movhps %xmm4, 6 * SIZE(CO1)
movsd %xmm1, 0 * SIZE(CO2)
movhps %xmm1, 2 * SIZE(CO2)
movsd %xmm5, 4 * SIZE(CO2)
movhps %xmm5, 6 * SIZE(CO2)
#endif

movsd %xmm2, 0 * SIZE(CO1, LDC, 2)
movhps %xmm2, 2 * SIZE(CO1, LDC, 2)
movsd %xmm6, 4 * SIZE(CO1, LDC, 2)
movhps %xmm6, 6 * SIZE(CO1, LDC, 2)
movsd %xmm3, 0 * SIZE(CO2, LDC, 2)
movhps %xmm3, 2 * SIZE(CO2, LDC, 2)
movsd %xmm7, 4 * SIZE(CO2, LDC, 2)
movhps %xmm7, 6 * SIZE(CO2, LDC, 2)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
leaq (,%rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
addq $8, KK
#endif

addq $8 * SIZE, CO1 # coffset += 8
addq $8 * SIZE, CO2 # coffset += 8
decq I # i --
jg .L11
ALIGN_4
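
/* Remainder in M: same computation for a 4-row strip (M & 4). */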
.L20:
testq $4, M
je .L30

#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER, BO
#else
leaq BUFFER, BO
movq KK, %rax
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
#endif

movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10

movsldup 0 * SIZE(BO), %xmm9
movsldup 16 * SIZE(BO), %xmm11
movsldup 32 * SIZE(BO), %xmm13
movsldup 48 * SIZE(BO), %xmm15

pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
pxor %xmm2, %xmm2
pxor %xmm3, %xmm3

#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $4, %rax
#else
addq $4, %rax
#endif
movq %rax, KKK
#endif
sarq $3, %rax
je .L25
ALIGN_4

.L22:
mulps %xmm8, %xmm9
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm9, %xmm0
movshdup 0 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
addps %xmm9, %xmm1
movsldup 4 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
addps %xmm9, %xmm2
movshdup 4 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
movaps 4 * SIZE(AO), %xmm8
addps %xmm9, %xmm3
movsldup 8 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
addps %xmm9, %xmm0
movshdup 8 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
addps %xmm9, %xmm1
movsldup 12 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
addps %xmm9, %xmm2
movshdup 12 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
movaps 8 * SIZE(AO), %xmm8
addps %xmm9, %xmm3
movsldup 64 * SIZE(BO), %xmm9
mulps %xmm8, %xmm11
addps %xmm11, %xmm0
movshdup 16 * SIZE(BO), %xmm11
mulps %xmm8, %xmm11
addps %xmm11, %xmm1
movsldup 20 * SIZE(BO), %xmm11
mulps %xmm8, %xmm11
addps %xmm11, %xmm2
movshdup 20 * SIZE(BO), %xmm11
mulps %xmm8, %xmm11
movaps 12 * SIZE(AO), %xmm8
addps %xmm11, %xmm3
movsldup 24 * SIZE(BO), %xmm11
mulps %xmm8, %xmm11
addps %xmm11, %xmm0
movshdup 24 * SIZE(BO), %xmm11
mulps %xmm8, %xmm11
addps %xmm11, %xmm1
movsldup 28 * SIZE(BO), %xmm11
mulps %xmm8, %xmm11
addps %xmm11, %xmm2
movshdup 28 * SIZE(BO), %xmm11
mulps %xmm8, %xmm11
movaps 32 * SIZE(AO), %xmm8
addps %xmm11, %xmm3
movsldup 80 * SIZE(BO), %xmm11
mulps %xmm10, %xmm13
addps %xmm13, %xmm0
movshdup 32 * SIZE(BO), %xmm13
mulps %xmm10, %xmm13
addps %xmm13, %xmm1
movsldup 36 * SIZE(BO), %xmm13
mulps %xmm10, %xmm13
addps %xmm13, %xmm2
movshdup 36 * SIZE(BO), %xmm13
mulps %xmm10, %xmm13
movaps 20 * SIZE(AO), %xmm10
addps %xmm13, %xmm3
movsldup 40 * SIZE(BO), %xmm13
mulps %xmm10, %xmm13
addps %xmm13, %xmm0
movshdup 40 * SIZE(BO), %xmm13
mulps %xmm10, %xmm13
addps %xmm13, %xmm1
movsldup 44 * SIZE(BO), %xmm13
mulps %xmm10, %xmm13
addps %xmm13, %xmm2
movshdup 44 * SIZE(BO), %xmm13
mulps %xmm10, %xmm13
movaps 24 * SIZE(AO), %xmm10
addps %xmm13, %xmm3
movsldup 96 * SIZE(BO), %xmm13
mulps %xmm10, %xmm15
addps %xmm15, %xmm0
movshdup 48 * SIZE(BO), %xmm15
mulps %xmm10, %xmm15
addps %xmm15, %xmm1
movsldup 52 * SIZE(BO), %xmm15
mulps %xmm10, %xmm15
addps %xmm15, %xmm2
movshdup 52 * SIZE(BO), %xmm15
mulps %xmm10, %xmm15
movaps 28 * SIZE(AO), %xmm10
addps %xmm15, %xmm3
movsldup 56 * SIZE(BO), %xmm15
mulps %xmm10, %xmm15
addps %xmm15, %xmm0
movshdup 56 * SIZE(BO), %xmm15
mulps %xmm10, %xmm15
addps %xmm15, %xmm1
movsldup 60 * SIZE(BO), %xmm15
mulps %xmm10, %xmm15
addps %xmm15, %xmm2
movshdup 60 * SIZE(BO), %xmm15
mulps %xmm10, %xmm15
movaps 48 * SIZE(AO), %xmm10
addps %xmm15, %xmm3
movsldup 112 * SIZE(BO), %xmm15

addq $32 * SIZE, AO
addq $64 * SIZE, BO
decq %rax
jne .L22
ALIGN_4

.L25:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
movaps ALPHA, %xmm15
andq $7, %rax # k remainder (k & 7)
BRANCH
je .L28
ALIGN_4

.L26:
mulps %xmm8, %xmm9
addps %xmm9, %xmm0
movshdup 0 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
addps %xmm9, %xmm1
movsldup 4 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
addps %xmm9, %xmm2
movshdup 4 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
movaps 4 * SIZE(AO), %xmm8
addps %xmm9, %xmm3
movsldup 8 * SIZE(BO), %xmm9

addq $4 * SIZE, AO
addq $8 * SIZE, BO
decq %rax
jg .L26
ALIGN_4

.L28:
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
movsd 0 * SIZE(CO1), %xmm8
movhps 2 * SIZE(CO1), %xmm8
movsd 0 * SIZE(CO2), %xmm10
movhps 2 * SIZE(CO2), %xmm10

mulps %xmm15, %xmm0
mulps %xmm15, %xmm1
mulps %xmm15, %xmm2
mulps %xmm15, %xmm3

movsd 0 * SIZE(CO1, LDC, 2), %xmm12
movhps 2 * SIZE(CO1, LDC, 2), %xmm12
movsd 0 * SIZE(CO2, LDC, 2), %xmm14
movhps 2 * SIZE(CO2, LDC, 2), %xmm14

addps %xmm8, %xmm0
addps %xmm10, %xmm1
addps %xmm12, %xmm2
addps %xmm14, %xmm3
#else
mulps %xmm15, %xmm0
mulps %xmm15, %xmm1
mulps %xmm15, %xmm2
mulps %xmm15, %xmm3
#endif

movsd %xmm0, 0 * SIZE(CO1)
movhps %xmm0, 2 * SIZE(CO1)
movsd %xmm1, 0 * SIZE(CO2)
movhps %xmm1, 2 * SIZE(CO2)
movsd %xmm2, 0 * SIZE(CO1, LDC, 2)
movhps %xmm2, 2 * SIZE(CO1, LDC, 2)
movsd %xmm3, 0 * SIZE(CO2, LDC, 2)
movhps %xmm3, 2 * SIZE(CO2, LDC, 2)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
leaq (,%rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
addq $4, KK
#endif

addq $4 * SIZE, CO1 # coffset += 4
addq $4 * SIZE, CO2 # coffset += 4
ALIGN_4
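
/* Remainder in M: 2-row strip (M & 2).  Here B is broadcast with
   movsd plus shufps $0x50 (giving b0,b0,b1,b1) against movddup'd
   pairs of A, instead of the movsldup/movshdup scheme above. */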
.L30:
testq $2, M
je .L40

#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER, BO
#else
leaq BUFFER, BO
movq KK, %rax
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
#endif

movddup 0 * SIZE(AO), %xmm8
movddup 8 * SIZE(AO), %xmm10
movsd 0 * SIZE(BO), %xmm9
movsd 32 * SIZE(BO), %xmm11

pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
pxor %xmm2, %xmm2
pxor %xmm3, %xmm3

#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $2, %rax
#else
addq $4, %rax
#endif
movq %rax, KKK
#endif
sarq $3, %rax
je .L35
ALIGN_4

.L32:
shufps $0x50, %xmm9, %xmm9
mulps %xmm8, %xmm9
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm9, %xmm0
movsd 4 * SIZE(BO), %xmm9
shufps $0x50, %xmm9, %xmm9
mulps %xmm8, %xmm9
movddup 2 * SIZE(AO), %xmm8
addps %xmm9, %xmm1
movsd 8 * SIZE(BO), %xmm9
shufps $0x50, %xmm9, %xmm9
mulps %xmm8, %xmm9
addps %xmm9, %xmm2
movsd 12 * SIZE(BO), %xmm9
shufps $0x50, %xmm9, %xmm9
mulps %xmm8, %xmm9
movddup 4 * SIZE(AO), %xmm8
addps %xmm9, %xmm3
movsd 16 * SIZE(BO), %xmm9
shufps $0x50, %xmm9, %xmm9
mulps %xmm8, %xmm9
addps %xmm9, %xmm0
movsd 20 * SIZE(BO), %xmm9
shufps $0x50, %xmm9, %xmm9
mulps %xmm8, %xmm9
movddup 6 * SIZE(AO), %xmm8
addps %xmm9, %xmm1
movsd 24 * SIZE(BO), %xmm9
shufps $0x50, %xmm9, %xmm9
mulps %xmm8, %xmm9
addps %xmm9, %xmm2
movsd 28 * SIZE(BO), %xmm9
shufps $0x50, %xmm9, %xmm9
mulps %xmm8, %xmm9
movddup 16 * SIZE(AO), %xmm8
addps %xmm9, %xmm3
shufps $0x50, %xmm11, %xmm11
mulps %xmm10, %xmm11
movsd 64 * SIZE(BO), %xmm9
addps %xmm11, %xmm0
movsd 36 * SIZE(BO), %xmm11
shufps $0x50, %xmm11, %xmm11
mulps %xmm10, %xmm11
movddup 10 * SIZE(AO), %xmm10
addps %xmm11, %xmm1
movsd 40 * SIZE(BO), %xmm11
shufps $0x50, %xmm11, %xmm11
mulps %xmm10, %xmm11
addps %xmm11, %xmm2
movsd 44 * SIZE(BO), %xmm11
shufps $0x50, %xmm11, %xmm11
mulps %xmm10, %xmm11
movddup 12 * SIZE(AO), %xmm10
addps %xmm11, %xmm3
movsd 48 * SIZE(BO), %xmm11
shufps $0x50, %xmm11, %xmm11
mulps %xmm10, %xmm11
addps %xmm11, %xmm0
movsd 52 * SIZE(BO), %xmm11
shufps $0x50, %xmm11, %xmm11
mulps %xmm10, %xmm11
movddup 14 * SIZE(AO), %xmm10
addps %xmm11, %xmm1
movsd 56 * SIZE(BO), %xmm11
shufps $0x50, %xmm11, %xmm11
mulps %xmm10, %xmm11
addps %xmm11, %xmm2
movsd 60 * SIZE(BO), %xmm11
shufps $0x50, %xmm11, %xmm11
mulps %xmm10, %xmm11
movddup 24 * SIZE(AO), %xmm10
addps %xmm11, %xmm3
movsd 96 * SIZE(BO), %xmm11

addq $16 * SIZE, AO
addq $64 * SIZE, BO
decq %rax
jne .L32
ALIGN_4

.L35:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
movaps ALPHA, %xmm15
andq $7, %rax # k remainder (k & 7)
BRANCH
je .L38
ALIGN_4

.L36:
shufps $0x50, %xmm9, %xmm9
mulps %xmm8, %xmm9
addps %xmm9, %xmm0
movsd 4 * SIZE(BO), %xmm9
shufps $0x50, %xmm9, %xmm9
mulps %xmm8, %xmm9
movddup 2 * SIZE(AO), %xmm8
addps %xmm9, %xmm1
movsd 8 * SIZE(BO), %xmm9

addq $2 * SIZE, AO
addq $8 * SIZE, BO
decq %rax
jg .L36
ALIGN_4

.L38:
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
movsd 0 * SIZE(CO1), %xmm8
movhps 0 * SIZE(CO2), %xmm8
movsd 0 * SIZE(CO1, LDC, 2), %xmm9
movhps 0 * SIZE(CO2, LDC, 2), %xmm9
#endif

addps %xmm2, %xmm0
addps %xmm3, %xmm1

mulps %xmm15, %xmm0
mulps %xmm15, %xmm1

#if !defined(TRMMKERNEL) && !defined(BETAZERO)
addps %xmm8, %xmm0
addps %xmm9, %xmm1
#endif

movsd %xmm0, 0 * SIZE(CO1)
movhps %xmm0, 0 * SIZE(CO2)
movsd %xmm1, 0 * SIZE(CO1, LDC, 2)
movhps %xmm1, 0 * SIZE(CO2, LDC, 2)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
leaq (,%rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
addq $2, KK
#endif

addq $2 * SIZE, CO1 # coffset += 2
addq $2 * SIZE, CO2 # coffset += 2
ALIGN_4
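
/* Remainder in M: final single row (M & 1); the two accumulators are
   combined and spread across the four C columns at .L48. */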
.L40:
testq $1, M
je .L49

#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER, BO
#else
leaq BUFFER, BO
movq KK, %rax
leaq (, %rax, 4), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
#endif

movss 0 * SIZE(AO), %xmm8
movss 4 * SIZE(AO), %xmm10
movsd 0 * SIZE(BO), %xmm9
movsd 32 * SIZE(BO), %xmm11

pxor %xmm0, %xmm0
pxor %xmm1, %xmm1

#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $1, %rax
#else
addq $4, %rax
#endif
movq %rax, KKK
#endif
sarq $3, %rax
je .L45
ALIGN_4

.L42:
shufps $0, %xmm8, %xmm8
movhps 4 * SIZE(BO), %xmm9
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
mulps %xmm8, %xmm9
movss 1 * SIZE(AO), %xmm8
addps %xmm9, %xmm0
movsd 8 * SIZE(BO), %xmm9
shufps $0, %xmm8, %xmm8
movhps 12 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
movss 2 * SIZE(AO), %xmm8
addps %xmm9, %xmm1
movsd 16 * SIZE(BO), %xmm9
shufps $0, %xmm8, %xmm8
movhps 20 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
movss 3 * SIZE(AO), %xmm8
addps %xmm9, %xmm0
movsd 24 * SIZE(BO), %xmm9
shufps $0, %xmm8, %xmm8
movhps 28 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
movss 8 * SIZE(AO), %xmm8
addps %xmm9, %xmm1
movsd 64 * SIZE(BO), %xmm9
shufps $0, %xmm10, %xmm10
movhps 36 * SIZE(BO), %xmm11
mulps %xmm10, %xmm11
movss 5 * SIZE(AO), %xmm10
addps %xmm11, %xmm0
movsd 40 * SIZE(BO), %xmm11
shufps $0, %xmm10, %xmm10
movhps 44 * SIZE(BO), %xmm11
mulps %xmm10, %xmm11
movss 6 * SIZE(AO), %xmm10
addps %xmm11, %xmm1
movsd 48 * SIZE(BO), %xmm11
shufps $0, %xmm10, %xmm10
movhps 52 * SIZE(BO), %xmm11
mulps %xmm10, %xmm11
movss 7 * SIZE(AO), %xmm10
addps %xmm11, %xmm0
movsd 56 * SIZE(BO), %xmm11
shufps $0, %xmm10, %xmm10
movhps 60 * SIZE(BO), %xmm11
mulps %xmm10, %xmm11
movss 12 * SIZE(AO), %xmm10
addps %xmm11, %xmm1
movsd 96 * SIZE(BO), %xmm11

addq $ 8 * SIZE, AO
addq $64 * SIZE, BO
decq %rax
jne .L42
ALIGN_4

.L45:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
movaps ALPHA, %xmm15
andq $7, %rax # k remainder (k & 7)
BRANCH
je .L48
ALIGN_4

.L46:
shufps $0, %xmm8, %xmm8
movhps 4 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
movss 1 * SIZE(AO), %xmm8
addps %xmm9, %xmm0
movsd 8 * SIZE(BO), %xmm9

addq $1 * SIZE, AO # aoffset += 1
addq $8 * SIZE, BO # boffset1 += 8
decq %rax
jg .L46
ALIGN_4

.L48:
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
movss 0 * SIZE(CO1), %xmm8
movss 0 * SIZE(CO2), %xmm9
movss 0 * SIZE(CO1, LDC, 2), %xmm10
movss 0 * SIZE(CO2, LDC, 2), %xmm11
#endif

addps %xmm1, %xmm0
mulps %xmm15, %xmm0
movhlps %xmm0, %xmm1

#if !defined(TRMMKERNEL) && !defined(BETAZERO)
addss %xmm0, %xmm8
psrlq $32, %xmm0
addss %xmm0, %xmm9
addss %xmm1, %xmm10
psrlq $32, %xmm1
addss %xmm1, %xmm11

movss %xmm8, 0 * SIZE(CO1)
movss %xmm9, 0 * SIZE(CO2)
movss %xmm10, 0 * SIZE(CO1, LDC, 2)
movss %xmm11, 0 * SIZE(CO2, LDC, 2)
#else
movss %xmm0, 0 * SIZE(CO1)
psrlq $32, %xmm0
movss %xmm0, 0 * SIZE(CO2)
movss %xmm1, 0 * SIZE(CO1, LDC, 2)
psrlq $32, %xmm1
movss %xmm1, 0 * SIZE(CO2, LDC, 2)
#endif

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
leaq (,%rax, 4), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
ALIGN_4

.L49:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $4, KK
#endif

leaq (C, LDC, 4), C # c += 4 * ldc
decq J # j --
jg .L01
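
/* Remainder in N: 2-column panel (N & 2), repeating the same packing
   and micro-kernel structure with two accumulator pairs per strip. */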
  1351. .L50:
  1352. testq $2, N
  1353. je .L100
  1354. .L51:
  1355. #if defined(TRMMKERNEL) && defined(LEFT)
  1356. movq OFFSET, %rax
  1357. movq %rax, KK
  1358. #endif
  1359. /* Copying to Sub Buffer */
  1360. leaq BUFFER, BO
  1361. movq K, %rax
  1362. sarq $3, %rax
  1363. jle .L53
  1364. ALIGN_4
  1365. .L52:
  1366. movddup 0 * SIZE(B), %xmm0
  1367. movddup 2 * SIZE(B), %xmm1
  1368. movddup 4 * SIZE(B), %xmm2
  1369. movddup 6 * SIZE(B), %xmm3
  1370. movddup 8 * SIZE(B), %xmm4
  1371. movddup 10 * SIZE(B), %xmm5
  1372. movddup 12 * SIZE(B), %xmm6
  1373. movddup 14 * SIZE(B), %xmm7
  1374. movaps %xmm0, 0 * SIZE(BO)
  1375. movaps %xmm1, 4 * SIZE(BO)
  1376. movaps %xmm2, 8 * SIZE(BO)
  1377. movaps %xmm3, 12 * SIZE(BO)
  1378. movaps %xmm4, 16 * SIZE(BO)
  1379. movaps %xmm5, 20 * SIZE(BO)
  1380. movaps %xmm6, 24 * SIZE(BO)
  1381. movaps %xmm7, 28 * SIZE(BO)
  1382. prefetcht1 128 * SIZE(BO)
  1383. prefetcht0 112 * SIZE(B)
  1384. addq $16 * SIZE, B
  1385. addq $32 * SIZE, BO
  1386. decq %rax
  1387. jne .L52
  1388. ALIGN_4
  1389. .L53:
  1390. movq K, %rax
  1391. andq $7, %rax
  1392. BRANCH
  1393. jle .L60
  1394. ALIGN_4
  1395. .L54:
  1396. movddup 0 * SIZE(B), %xmm0
  1397. movaps %xmm0, 0 * SIZE(BO)
  1398. addq $ 2 * SIZE, B
  1399. addq $ 4 * SIZE, BO
  1400. decq %rax
  1401. jne .L54
  1402. ALIGN_4
  1403. .L60:
  1404. movq C, CO1 # coffset1 = c
  1405. leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
  1406. movq A, AO # aoffset = a
  1407. movq M, I
  1408. sarq $3, I # i = (m >> 3)
  1409. jle .L70
  1410. ALIGN_4
  1411. .L61:
  1412. #if !defined(TRMMKERNEL) || \
  1413. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1414. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1415. leaq BUFFER, BO
  1416. #else
  1417. leaq BUFFER, BO
  1418. movq KK, %rax
  1419. leaq (, %rax, 8), %rax
  1420. leaq (AO, %rax, 4), AO
  1421. leaq (BO, %rax, 2), BO
  1422. #endif
  1423. movaps 0 * SIZE(AO), %xmm8
  1424. movaps 16 * SIZE(AO), %xmm10
  1425. movaps 32 * SIZE(AO), %xmm12
  1426. movaps 48 * SIZE(AO), %xmm14
  1427. movsldup 0 * SIZE(BO), %xmm9
  1428. movsldup 16 * SIZE(BO), %xmm11
  1429. pxor %xmm0, %xmm0
  1430. pxor %xmm1, %xmm1
  1431. prefetcht2 4 * SIZE(CO1)
  1432. pxor %xmm4, %xmm4
  1433. prefetcht2 4 * SIZE(CO2)
  1434. pxor %xmm5, %xmm5
  1435. #ifndef TRMMKERNEL
  1436. movq K, %rax
  1437. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1438. movq K, %rax
  1439. subq KK, %rax
  1440. movq %rax, KKK
  1441. #else
  1442. movq KK, %rax
  1443. #ifdef LEFT
  1444. addq $8, %rax
  1445. #else
  1446. addq $2, %rax
  1447. #endif
  1448. movq %rax, KKK
  1449. #endif
  1450. sarq $3, %rax
  1451. je .L65
  1452. ALIGN_4
  1453. .L62:
  1454. mulps %xmm8, %xmm9
  1455. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1456. addps %xmm9, %xmm0
  1457. movshdup 0 * SIZE(BO), %xmm9
  1458. mulps %xmm8, %xmm9
  1459. movaps 4 * SIZE(AO), %xmm8
  1460. addps %xmm9, %xmm1
  1461. movsldup 0 * SIZE(BO), %xmm9
  1462. mulps %xmm8, %xmm9
  1463. addps %xmm9, %xmm4
  1464. movshdup 0 * SIZE(BO), %xmm9
  1465. mulps %xmm8, %xmm9
  1466. movaps 8 * SIZE(AO), %xmm8
  1467. addps %xmm9, %xmm5
  1468. movsldup 4 * SIZE(BO), %xmm9
  1469. mulps %xmm8, %xmm9
  1470. addps %xmm9, %xmm0
  1471. movshdup 4 * SIZE(BO), %xmm9
  1472. mulps %xmm8, %xmm9
  1473. movaps 12 * SIZE(AO), %xmm8
  1474. addps %xmm9, %xmm1
  1475. movsldup 4 * SIZE(BO), %xmm9
  1476. mulps %xmm8, %xmm9
  1477. addps %xmm9, %xmm4
  1478. movshdup 4 * SIZE(BO), %xmm9
  1479. mulps %xmm8, %xmm9
  1480. movaps 64 * SIZE(AO), %xmm8
  1481. addps %xmm9, %xmm5
  1482. movsldup 8 * SIZE(BO), %xmm9
  1483. mulps %xmm10, %xmm9
  1484. addps %xmm9, %xmm0
  1485. movshdup 8 * SIZE(BO), %xmm9
  1486. mulps %xmm10, %xmm9
  1487. movaps 20 * SIZE(AO), %xmm10
  1488. addps %xmm9, %xmm1
  1489. movsldup 8 * SIZE(BO), %xmm9
  1490. mulps %xmm10, %xmm9
  1491. addps %xmm9, %xmm4
  1492. movshdup 8 * SIZE(BO), %xmm9
  1493. mulps %xmm10, %xmm9
  1494. movaps 24 * SIZE(AO), %xmm10
  1495. addps %xmm9, %xmm5
  1496. movsldup 12 * SIZE(BO), %xmm9
  1497. mulps %xmm10, %xmm9
  1498. addps %xmm9, %xmm0
  1499. movshdup 12 * SIZE(BO), %xmm9
  1500. mulps %xmm10, %xmm9
  1501. movaps 28 * SIZE(AO), %xmm10
  1502. addps %xmm9, %xmm1
  1503. movsldup 12 * SIZE(BO), %xmm9
  1504. mulps %xmm10, %xmm9
  1505. addps %xmm9, %xmm4
  1506. movshdup 12 * SIZE(BO), %xmm9
  1507. mulps %xmm10, %xmm9
  1508. movaps 80 * SIZE(AO), %xmm10
  1509. addps %xmm9, %xmm5
  1510. movsldup 32 * SIZE(BO), %xmm9
  1511. mulps %xmm12, %xmm11
  1512. PREFETCH (PREFETCHSIZE + 32) * SIZE(AO)
  1513. addps %xmm11, %xmm0
  1514. movshdup 16 * SIZE(BO), %xmm11
  1515. mulps %xmm12, %xmm11
  1516. movaps 36 * SIZE(AO), %xmm12
  1517. addps %xmm11, %xmm1
  1518. movsldup 16 * SIZE(BO), %xmm11
  1519. mulps %xmm12, %xmm11
  1520. addps %xmm11, %xmm4
  1521. movshdup 16 * SIZE(BO), %xmm11
  1522. mulps %xmm12, %xmm11
  1523. movaps 40 * SIZE(AO), %xmm12
  1524. addps %xmm11, %xmm5
  1525. movsldup 20 * SIZE(BO), %xmm11
  1526. mulps %xmm12, %xmm11
  1527. addps %xmm11, %xmm0
  1528. movshdup 20 * SIZE(BO), %xmm11
  1529. mulps %xmm12, %xmm11
  1530. movaps 44 * SIZE(AO), %xmm12
  1531. addps %xmm11, %xmm1
  1532. movsldup 20 * SIZE(BO), %xmm11
  1533. mulps %xmm12, %xmm11
  1534. addps %xmm11, %xmm4
  1535. movshdup 20 * SIZE(BO), %xmm11
  1536. mulps %xmm12, %xmm11
  1537. movaps 96 * SIZE(AO), %xmm12
  1538. addps %xmm11, %xmm5
  1539. movsldup 24 * SIZE(BO), %xmm11
  1540. mulps %xmm14, %xmm11
  1541. addps %xmm11, %xmm0
  1542. movshdup 24 * SIZE(BO), %xmm11
  1543. mulps %xmm14, %xmm11
  1544. movaps 52 * SIZE(AO), %xmm14
  1545. addps %xmm11, %xmm1
  1546. movsldup 24 * SIZE(BO), %xmm11
  1547. mulps %xmm14, %xmm11
  1548. addps %xmm11, %xmm4
  1549. movshdup 24 * SIZE(BO), %xmm11
  1550. mulps %xmm14, %xmm11
  1551. movaps 56 * SIZE(AO), %xmm14
  1552. addps %xmm11, %xmm5
  1553. movsldup 28 * SIZE(BO), %xmm11
  1554. mulps %xmm14, %xmm11
  1555. addps %xmm11, %xmm0
  1556. movshdup 28 * SIZE(BO), %xmm11
  1557. mulps %xmm14, %xmm11
  1558. movaps 60 * SIZE(AO), %xmm14
  1559. addps %xmm11, %xmm1
  1560. movsldup 28 * SIZE(BO), %xmm11
  1561. mulps %xmm14, %xmm11
  1562. addps %xmm11, %xmm4
  1563. movshdup 28 * SIZE(BO), %xmm11
  1564. mulps %xmm14, %xmm11
  1565. movaps 112 * SIZE(AO), %xmm14
  1566. addps %xmm11, %xmm5
  1567. movsldup 48 * SIZE(BO), %xmm11
  1568. addq $64 * SIZE, AO
  1569. addq $32 * SIZE, BO
  1570. decq %rax
  1571. jne .L62
  1572. ALIGN_4
.L65:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
movaps ALPHA, %xmm15
andq $7, %rax # k & 7 remainder
BRANCH
je .L68
ALIGN_4
.L66:
mulps %xmm8, %xmm9
addps %xmm9, %xmm0
movshdup 0 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
movaps 4 * SIZE(AO), %xmm8
addps %xmm9, %xmm1
movsldup 0 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
addps %xmm9, %xmm4
movshdup 0 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
movaps 8 * SIZE(AO), %xmm8
addps %xmm9, %xmm5
movsldup 4 * SIZE(BO), %xmm9
addq $8 * SIZE, AO
addq $4 * SIZE, BO
decq %rax
jg .L66
ALIGN_4
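/* .L68: writeback of the 8x2 tile.  Unless TRMMKERNEL or BETAZERO is
   set, the old C values are loaded first, giving C = alpha * acc + C;
   otherwise plain C = alpha * acc.  The movsd/movhps pairs handle C
   rows that are not 16-byte aligned. */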
.L68:
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
movsd 0 * SIZE(CO1), %xmm8
movhps 2 * SIZE(CO1), %xmm8
movsd 4 * SIZE(CO1), %xmm9
movhps 6 * SIZE(CO1), %xmm9
movsd 0 * SIZE(CO2), %xmm10
movhps 2 * SIZE(CO2), %xmm10
movsd 4 * SIZE(CO2), %xmm11
movhps 6 * SIZE(CO2), %xmm11
#endif
mulps %xmm15, %xmm0
mulps %xmm15, %xmm4
mulps %xmm15, %xmm1
mulps %xmm15, %xmm5
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
addps %xmm8, %xmm0
addps %xmm9, %xmm4
addps %xmm10, %xmm1
addps %xmm11, %xmm5
#endif
movsd %xmm0, 0 * SIZE(CO1)
movhps %xmm0, 2 * SIZE(CO1)
movsd %xmm4, 4 * SIZE(CO1)
movhps %xmm4, 6 * SIZE(CO1)
movsd %xmm1, 0 * SIZE(CO2)
movhps %xmm1, 2 * SIZE(CO2)
movsd %xmm5, 4 * SIZE(CO2)
movhps %xmm5, 6 * SIZE(CO2)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
leaq (,%rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 2), BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $8, KK
#endif
addq $8 * SIZE, CO1 # coffset1 += 8
addq $8 * SIZE, CO2 # coffset2 += 8
decq I # i --
jg .L61
ALIGN_4
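/* The m & 4, m & 2 and m & 1 leftovers below repeat the same
   two-column update with progressively smaller accumulator sets. */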
.L70:
testq $4, M
je .L80
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER, BO
#else
leaq BUFFER, BO
movq KK, %rax
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
#endif
movaps 0 * SIZE(AO), %xmm8
movsldup 0 * SIZE(BO), %xmm9
movaps 16 * SIZE(AO), %xmm10
movsldup 16 * SIZE(BO), %xmm11
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
pxor %xmm2, %xmm2
pxor %xmm3, %xmm3
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $4, %rax
#else
addq $2, %rax
#endif
movq %rax, KKK
#endif
sarq $3, %rax
je .L75
ALIGN_4
.L72:
mulps %xmm8, %xmm9
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm9, %xmm0
movshdup 0 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
movaps 4 * SIZE(AO), %xmm8
addps %xmm9, %xmm1
movsldup 4 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
addps %xmm9, %xmm2
movshdup 4 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
movaps 8 * SIZE(AO), %xmm8
addps %xmm9, %xmm3
movsldup 8 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
addps %xmm9, %xmm0
movshdup 8 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
movaps 12 * SIZE(AO), %xmm8
addps %xmm9, %xmm1
movsldup 12 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
addps %xmm9, %xmm2
movshdup 12 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
movaps 32 * SIZE(AO), %xmm8
addps %xmm9, %xmm3
movsldup 32 * SIZE(BO), %xmm9
mulps %xmm10, %xmm11
addps %xmm11, %xmm0
movshdup 16 * SIZE(BO), %xmm11
mulps %xmm10, %xmm11
movaps 20 * SIZE(AO), %xmm10
addps %xmm11, %xmm1
movsldup 20 * SIZE(BO), %xmm11
mulps %xmm10, %xmm11
addps %xmm11, %xmm2
movshdup 20 * SIZE(BO), %xmm11
mulps %xmm10, %xmm11
movaps 24 * SIZE(AO), %xmm10
addps %xmm11, %xmm3
movsldup 24 * SIZE(BO), %xmm11
mulps %xmm10, %xmm11
addps %xmm11, %xmm0
movshdup 24 * SIZE(BO), %xmm11
mulps %xmm10, %xmm11
movaps 28 * SIZE(AO), %xmm10
addps %xmm11, %xmm1
movsldup 28 * SIZE(BO), %xmm11
mulps %xmm10, %xmm11
addps %xmm11, %xmm2
movshdup 28 * SIZE(BO), %xmm11
mulps %xmm10, %xmm11
movaps 48 * SIZE(AO), %xmm10
addps %xmm11, %xmm3
movsldup 48 * SIZE(BO), %xmm11
addq $32 * SIZE, AO
addq $32 * SIZE, BO
decq %rax
jne .L72
ALIGN_4
.L75:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
movaps ALPHA, %xmm15
andq $7, %rax # k & 7 remainder
BRANCH
je .L78
ALIGN_4
.L76:
mulps %xmm8, %xmm9
addps %xmm9, %xmm0
movshdup 0 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
movaps 4 * SIZE(AO), %xmm8
addps %xmm9, %xmm1
movsldup 4 * SIZE(BO), %xmm9
addq $4 * SIZE, AO
addq $4 * SIZE, BO
decq %rax
jg .L76
ALIGN_4
.L78:
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
movsd 0 * SIZE(CO1), %xmm8
movhps 2 * SIZE(CO1), %xmm8
movsd 0 * SIZE(CO2), %xmm10
movhps 2 * SIZE(CO2), %xmm10
#endif
addps %xmm2, %xmm0
addps %xmm3, %xmm1
mulps %xmm15, %xmm0
mulps %xmm15, %xmm1
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
addps %xmm8, %xmm0
addps %xmm10, %xmm1
#endif
movsd %xmm0, 0 * SIZE(CO1)
movhps %xmm0, 2 * SIZE(CO1)
movsd %xmm1, 0 * SIZE(CO2)
movhps %xmm1, 2 * SIZE(CO2)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
leaq (,%rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $4, KK
#endif
addq $4 * SIZE, CO1 # coffset1 += 4
addq $4 * SIZE, CO2 # coffset2 += 4
ALIGN_4
.L80:
testq $2, M
je .L90
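/* m & 2 case: movddup loads the A pair as (a0, a1, a0, a1) and
   shufps $0x50 expands the packed (b0, b1) pair to (b0, b0, b1, b1),
   so one mulps yields both columns for both rows; the low half of
   the result goes to CO1 and the high half to CO2. */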
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER, BO
#else
leaq BUFFER, BO
movq KK, %rax
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
#endif
movddup 0 * SIZE(AO), %xmm8
movddup 8 * SIZE(AO), %xmm10
movsd 0 * SIZE(BO), %xmm9
movsd 16 * SIZE(BO), %xmm11
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $2, %rax
#else
addq $2, %rax
#endif
movq %rax, KKK
#endif
sarq $3, %rax
je .L85
ALIGN_4
.L82:
shufps $0x50, %xmm9, %xmm9
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
mulps %xmm8, %xmm9
movddup 2 * SIZE(AO), %xmm8
addps %xmm9, %xmm0
movsd 4 * SIZE(BO), %xmm9
shufps $0x50, %xmm9, %xmm9
mulps %xmm8, %xmm9
movddup 4 * SIZE(AO), %xmm8
addps %xmm9, %xmm1
movsd 8 * SIZE(BO), %xmm9
shufps $0x50, %xmm9, %xmm9
mulps %xmm8, %xmm9
movddup 6 * SIZE(AO), %xmm8
addps %xmm9, %xmm0
movsd 12 * SIZE(BO), %xmm9
shufps $0x50, %xmm9, %xmm9
mulps %xmm8, %xmm9
movddup 16 * SIZE(AO), %xmm8
addps %xmm9, %xmm1
movsd 32 * SIZE(BO), %xmm9
shufps $0x50, %xmm11, %xmm11
mulps %xmm10, %xmm11
movddup 10 * SIZE(AO), %xmm10
addps %xmm11, %xmm0
movsd 20 * SIZE(BO), %xmm11
shufps $0x50, %xmm11, %xmm11
mulps %xmm10, %xmm11
movddup 12 * SIZE(AO), %xmm10
addps %xmm11, %xmm1
movsd 24 * SIZE(BO), %xmm11
shufps $0x50, %xmm11, %xmm11
mulps %xmm10, %xmm11
movddup 14 * SIZE(AO), %xmm10
addps %xmm11, %xmm0
movsd 28 * SIZE(BO), %xmm11
shufps $0x50, %xmm11, %xmm11
mulps %xmm10, %xmm11
movddup 24 * SIZE(AO), %xmm10
addps %xmm11, %xmm1
movsd 48 * SIZE(BO), %xmm11
addq $16 * SIZE, AO
addq $32 * SIZE, BO
decq %rax
jne .L82
ALIGN_4
.L85:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
movaps ALPHA, %xmm15
andq $7, %rax # k & 7 remainder
BRANCH
je .L88
ALIGN_4
.L86:
shufps $0x50, %xmm9, %xmm9
mulps %xmm8, %xmm9
movddup 2 * SIZE(AO), %xmm8
addps %xmm9, %xmm0
movsd 4 * SIZE(BO), %xmm9
addq $2 * SIZE, AO
addq $4 * SIZE, BO
decq %rax
jg .L86
ALIGN_4
.L88:
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
movsd 0 * SIZE(CO1), %xmm8
movhps 0 * SIZE(CO2), %xmm8
#endif
addps %xmm1, %xmm0
mulps %xmm15, %xmm0
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
addps %xmm8, %xmm0
#endif
movsd %xmm0, 0 * SIZE(CO1)
movhps %xmm0, 0 * SIZE(CO2)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
leaq (,%rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $2, KK
#endif
addq $2 * SIZE, CO1 # coffset1 += 2
addq $2 * SIZE, CO2 # coffset2 += 2
ALIGN_4
.L90:
testq $1, M
je .L99
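/* m & 1 case: shufps $0 broadcasts the single A value, which is
   multiplied against the packed (b0, b1) pair; the two column
   results end up in lanes 0 and 1 of xmm0 and are separated with
   psrlq $32 at the store. */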
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER, BO
#else
leaq BUFFER, BO
movq KK, %rax
leaq (, %rax, 4), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
#endif
movss 0 * SIZE(AO), %xmm8
movss 4 * SIZE(AO), %xmm10
movsd 0 * SIZE(BO), %xmm9
movsd 16 * SIZE(BO), %xmm11
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $1, %rax
#else
addq $2, %rax
#endif
movq %rax, KKK
#endif
sarq $3, %rax
je .L95
ALIGN_4
.L92:
shufps $0, %xmm8, %xmm8
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
mulps %xmm8, %xmm9
movss 1 * SIZE(AO), %xmm8
addps %xmm9, %xmm0
movsd 4 * SIZE(BO), %xmm9
shufps $0, %xmm8, %xmm8
mulps %xmm8, %xmm9
movss 2 * SIZE(AO), %xmm8
addps %xmm9, %xmm1
movsd 8 * SIZE(BO), %xmm9
shufps $0, %xmm8, %xmm8
mulps %xmm8, %xmm9
movss 3 * SIZE(AO), %xmm8
addps %xmm9, %xmm0
movsd 12 * SIZE(BO), %xmm9
shufps $0, %xmm8, %xmm8
mulps %xmm8, %xmm9
movss 8 * SIZE(AO), %xmm8
addps %xmm9, %xmm1
movsd 32 * SIZE(BO), %xmm9
shufps $0, %xmm10, %xmm10
mulps %xmm10, %xmm11
movss 5 * SIZE(AO), %xmm10
addps %xmm11, %xmm0
movsd 20 * SIZE(BO), %xmm11
shufps $0, %xmm10, %xmm10
mulps %xmm10, %xmm11
movss 6 * SIZE(AO), %xmm10
addps %xmm11, %xmm1
movsd 24 * SIZE(BO), %xmm11
shufps $0, %xmm10, %xmm10
mulps %xmm10, %xmm11
movss 7 * SIZE(AO), %xmm10
addps %xmm11, %xmm0
movsd 28 * SIZE(BO), %xmm11
shufps $0, %xmm10, %xmm10
mulps %xmm10, %xmm11
movss 12 * SIZE(AO), %xmm10
addps %xmm11, %xmm1
movsd 48 * SIZE(BO), %xmm11
addq $ 8 * SIZE, AO
addq $32 * SIZE, BO
decq %rax
jne .L92
ALIGN_4
.L95:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
movaps ALPHA, %xmm15
andq $7, %rax # k & 7 remainder
BRANCH
je .L98
ALIGN_4
.L96:
shufps $0, %xmm8, %xmm8
mulps %xmm8, %xmm9
movss 1 * SIZE(AO), %xmm8
addps %xmm9, %xmm0
movsd 4 * SIZE(BO), %xmm9
addq $1 * SIZE, AO
addq $4 * SIZE, BO
decq %rax
jg .L96
ALIGN_4
.L98:
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
movss 0 * SIZE(CO1), %xmm8
movss 0 * SIZE(CO2), %xmm9
addps %xmm1, %xmm0
mulps %xmm15, %xmm0
addss %xmm0, %xmm8
psrlq $32, %xmm0
addss %xmm0, %xmm9
movss %xmm8, 0 * SIZE(CO1)
movss %xmm9, 0 * SIZE(CO2)
#else
addps %xmm1, %xmm0
mulps %xmm15, %xmm0
movss %xmm0, 0 * SIZE(CO1)
psrlq $32, %xmm0
movss %xmm0, 0 * SIZE(CO2)
#endif
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
leaq (,%rax, 4), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
ALIGN_4
.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $2, KK
#endif
leaq (C, LDC, 2), C # c += 2 * ldc
ALIGN_4
.L100:
testq $1, N
je .L999
.L101:
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
movq K, %rax
sarq $3, %rax
jle .L103
ALIGN_4
.L102:
movss 0 * SIZE(B), %xmm0
movss 1 * SIZE(B), %xmm1
movss 2 * SIZE(B), %xmm2
movss 3 * SIZE(B), %xmm3
movss 4 * SIZE(B), %xmm4
movss 5 * SIZE(B), %xmm5
movss 6 * SIZE(B), %xmm6
movss 7 * SIZE(B), %xmm7
movss %xmm0, 0 * SIZE(BO)
movss %xmm0, 1 * SIZE(BO)
movss %xmm1, 2 * SIZE(BO)
movss %xmm1, 3 * SIZE(BO)
movss %xmm2, 4 * SIZE(BO)
movss %xmm2, 5 * SIZE(BO)
movss %xmm3, 6 * SIZE(BO)
movss %xmm3, 7 * SIZE(BO)
movss %xmm4, 8 * SIZE(BO)
movss %xmm4, 9 * SIZE(BO)
movss %xmm5, 10 * SIZE(BO)
movss %xmm5, 11 * SIZE(BO)
movss %xmm6, 12 * SIZE(BO)
movss %xmm6, 13 * SIZE(BO)
movss %xmm7, 14 * SIZE(BO)
movss %xmm7, 15 * SIZE(BO)
addq $ 8 * SIZE, B
addq $16 * SIZE, BO
decq %rax
jne .L102
ALIGN_4
.L103:
movq K, %rax
andq $7, %rax
BRANCH
jle .L110
ALIGN_4
.L104:
movss 0 * SIZE(B), %xmm0
movss %xmm0, 0 * SIZE(BO)
movss %xmm0, 1 * SIZE(BO)
addq $ 1 * SIZE, B
addq $ 2 * SIZE, BO
decq %rax
jne .L104
ALIGN_4
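/* .L102/.L104 packed the last single column of B into BUFFER with
   every scalar stored twice, so the n = 1 kernels can fetch it as a
   (b, b) pair via movsd/movddup.  A rough C equivalent -- a sketch
   only, with b/bo standing for the B and BUFFER cursors:

       for (k = 0; k < K; k++) {
           bo[2 * k + 0] = b[k];
           bo[2 * k + 1] = b[k];
       }
*/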
.L110:
movq C, CO1 # coffset1 = c
movq A, AO # aoffset = a
movq M, I
sarq $3, I # i = (m >> 3)
jle .L120
ALIGN_4
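/* .L111: 8x1 micro-kernel.  movddup on BUFFER now broadcasts b(k)
   across all four lanes; even and odd k steps accumulate into
   separate registers (xmm0/xmm4 and xmm1/xmm5) that are summed just
   before the writeback at .L118. */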
.L111:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER, BO
#else
leaq BUFFER, BO
movq KK, %rax
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 1), BO
#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
movaps 32 * SIZE(AO), %xmm12
movaps 48 * SIZE(AO), %xmm14
movddup 0 * SIZE(BO), %xmm9
movddup 8 * SIZE(BO), %xmm11
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
prefetchnta 8 * SIZE(CO1)
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $8, %rax
#else
addq $1, %rax
#endif
movq %rax, KKK
#endif
sarq $3, %rax
je .L115
ALIGN_4
.L112:
mulps %xmm8, %xmm9
movaps 4 * SIZE(AO), %xmm8
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm9, %xmm0
movddup 0 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
movaps 8 * SIZE(AO), %xmm8
addps %xmm9, %xmm4
movddup 2 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
movaps 12 * SIZE(AO), %xmm8
addps %xmm9, %xmm1
movddup 2 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
movaps 64 * SIZE(AO), %xmm8
addps %xmm9, %xmm5
movddup 4 * SIZE(BO), %xmm9
mulps %xmm10, %xmm9
movaps 20 * SIZE(AO), %xmm10
addps %xmm9, %xmm0
movddup 4 * SIZE(BO), %xmm9
mulps %xmm10, %xmm9
movaps 24 * SIZE(AO), %xmm10
addps %xmm9, %xmm4
movddup 6 * SIZE(BO), %xmm9
mulps %xmm10, %xmm9
movaps 28 * SIZE(AO), %xmm10
addps %xmm9, %xmm1
movddup 6 * SIZE(BO), %xmm9
mulps %xmm10, %xmm9
movaps 80 * SIZE(AO), %xmm10
addps %xmm9, %xmm5
PREFETCH (PREFETCHSIZE + 32) * SIZE(AO)
movddup 8 * SIZE(BO), %xmm9
mulps %xmm12, %xmm9
movaps 36 * SIZE(AO), %xmm12
addps %xmm9, %xmm0
movddup 16 * SIZE(BO), %xmm9
mulps %xmm12, %xmm11
movaps 40 * SIZE(AO), %xmm12
addps %xmm11, %xmm4
movddup 10 * SIZE(BO), %xmm11
mulps %xmm12, %xmm11
movaps 44 * SIZE(AO), %xmm12
addps %xmm11, %xmm1
movddup 10 * SIZE(BO), %xmm11
mulps %xmm12, %xmm11
movaps 96 * SIZE(AO), %xmm12
addps %xmm11, %xmm5
movddup 12 * SIZE(BO), %xmm11
mulps %xmm14, %xmm11
movaps 52 * SIZE(AO), %xmm14
addps %xmm11, %xmm0
movddup 12 * SIZE(BO), %xmm11
mulps %xmm14, %xmm11
movaps 56 * SIZE(AO), %xmm14
addps %xmm11, %xmm4
movddup 14 * SIZE(BO), %xmm11
mulps %xmm14, %xmm11
movaps 60 * SIZE(AO), %xmm14
addps %xmm11, %xmm1
movddup 14 * SIZE(BO), %xmm11
mulps %xmm14, %xmm11
movaps 112 * SIZE(AO), %xmm14
addps %xmm11, %xmm5
movddup 24 * SIZE(BO), %xmm11
addq $64 * SIZE, AO
addq $16 * SIZE, BO
decq %rax
jne .L112
ALIGN_4
.L115:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
movaps ALPHA, %xmm15
andq $7, %rax # k & 7 remainder
BRANCH
je .L118
ALIGN_4
.L116:
mulps %xmm8, %xmm9
movaps 4 * SIZE(AO), %xmm8
addps %xmm9, %xmm0
movddup 0 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
movaps 8 * SIZE(AO), %xmm8
addps %xmm9, %xmm4
movddup 2 * SIZE(BO), %xmm9
addq $8 * SIZE, AO
addq $2 * SIZE, BO
decq %rax
jg .L116
ALIGN_4
.L118:
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
movsd 0 * SIZE(CO1), %xmm8
movhps 2 * SIZE(CO1), %xmm8
movsd 4 * SIZE(CO1), %xmm9
movhps 6 * SIZE(CO1), %xmm9
#endif
addps %xmm1, %xmm0
addps %xmm5, %xmm4
mulps %xmm15, %xmm0
mulps %xmm15, %xmm4
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
addps %xmm8, %xmm0
addps %xmm9, %xmm4
#endif
movsd %xmm0, 0 * SIZE(CO1)
movhps %xmm0, 2 * SIZE(CO1)
movsd %xmm4, 4 * SIZE(CO1)
movhps %xmm4, 6 * SIZE(CO1)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
leaq (,%rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 1), BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $8, KK
#endif
addq $8 * SIZE, CO1 # coffset1 += 8
decq I # i --
jg .L111
ALIGN_4
.L120:
testq $4, M
je .L130
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER, BO
#else
leaq BUFFER, BO
movq KK, %rax
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 1), BO
#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
movddup 0 * SIZE(BO), %xmm9
movddup 8 * SIZE(BO), %xmm11
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $4, %rax
#else
addq $1, %rax
#endif
movq %rax, KKK
#endif
sarq $3, %rax
je .L125
ALIGN_4
.L122:
mulps %xmm8, %xmm9
movaps 4 * SIZE(AO), %xmm8
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm9, %xmm0
movddup 2 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
movaps 8 * SIZE(AO), %xmm8
addps %xmm9, %xmm1
movddup 4 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
movaps 12 * SIZE(AO), %xmm8
addps %xmm9, %xmm0
movddup 6 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
movaps 32 * SIZE(AO), %xmm8
addps %xmm9, %xmm1
movddup 16 * SIZE(BO), %xmm9
mulps %xmm10, %xmm11
movaps 20 * SIZE(AO), %xmm10
addps %xmm11, %xmm0
movddup 10 * SIZE(BO), %xmm11
mulps %xmm10, %xmm11
movaps 24 * SIZE(AO), %xmm10
addps %xmm11, %xmm1
movddup 12 * SIZE(BO), %xmm11
mulps %xmm10, %xmm11
movaps 28 * SIZE(AO), %xmm10
addps %xmm11, %xmm0
movddup 14 * SIZE(BO), %xmm11
mulps %xmm10, %xmm11
movaps 48 * SIZE(AO), %xmm10
addps %xmm11, %xmm1
movddup 24 * SIZE(BO), %xmm11
addq $32 * SIZE, AO
addq $16 * SIZE, BO
decq %rax
jne .L122
ALIGN_4
.L125:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
movaps ALPHA, %xmm15
andq $7, %rax # k & 7 remainder
BRANCH
je .L128
ALIGN_4
.L126:
mulps %xmm8, %xmm9
movaps 4 * SIZE(AO), %xmm8
addps %xmm9, %xmm0
movddup 2 * SIZE(BO), %xmm9
addq $4 * SIZE, AO
addq $2 * SIZE, BO
decq %rax
jg .L126
ALIGN_4
.L128:
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
movsd 0 * SIZE(CO1), %xmm8
movhps 2 * SIZE(CO1), %xmm8
#endif
addps %xmm1, %xmm0
mulps %xmm15, %xmm0
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
addps %xmm8, %xmm0
#endif
movsd %xmm0, 0 * SIZE(CO1)
movhps %xmm0, 2 * SIZE(CO1)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
leaq (,%rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 1), BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $4, KK
#endif
addq $4 * SIZE, CO1 # coffset1 += 4
ALIGN_4
.L130:
testq $2, M
je .L140
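/* m & 2, n = 1 case: the two A values per k line up directly with
   the duplicated B pair, so the k loop below multiplies straight
   from memory and is unrolled by 16 (sarq $4 here, andq $15 in the
   tail); movhlps folds the two partial sums at .L138. */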
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER, BO
#else
leaq BUFFER, BO
movq KK, %rax
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 1), BO
#endif
movaps 0 * SIZE(AO), %xmm8
movaps 0 * SIZE(BO), %xmm9
movaps 16 * SIZE(AO), %xmm10
movaps 16 * SIZE(BO), %xmm11
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
pxor %xmm2, %xmm2
pxor %xmm3, %xmm3
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $2, %rax
#else
addq $1, %rax
#endif
movq %rax, KKK
#endif
sarq $4, %rax
je .L135
ALIGN_4
.L132:
mulps %xmm8, %xmm9
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
movaps 4 * SIZE(AO), %xmm8
addps %xmm9, %xmm0
mulps 4 * SIZE(BO), %xmm8
addps %xmm8, %xmm1
movaps 8 * SIZE(AO), %xmm8
mulps 8 * SIZE(BO), %xmm8
addps %xmm8, %xmm2
movaps 12 * SIZE(AO), %xmm8
mulps 12 * SIZE(BO), %xmm8
addps %xmm8, %xmm3
movaps 32 * SIZE(AO), %xmm8
movaps 32 * SIZE(BO), %xmm9
mulps %xmm10, %xmm11
movaps 20 * SIZE(AO), %xmm10
addps %xmm11, %xmm0
movaps 48 * SIZE(BO), %xmm11
mulps 20 * SIZE(BO), %xmm10
addps %xmm10, %xmm1
movaps 24 * SIZE(AO), %xmm10
mulps 24 * SIZE(BO), %xmm10
addps %xmm10, %xmm2
movaps 28 * SIZE(AO), %xmm10
mulps 28 * SIZE(BO), %xmm10
addps %xmm10, %xmm3
movaps 48 * SIZE(AO), %xmm10
addq $32 * SIZE, AO
addq $32 * SIZE, BO
decq %rax
jne .L132
ALIGN_4
.L135:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
movaps ALPHA, %xmm15
andq $15, %rax # k & 15 remainder
BRANCH
je .L138
ALIGN_4
.L136:
movsd 0 * SIZE(AO), %xmm8
movsd 0 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
addps %xmm9, %xmm0
addq $2 * SIZE, AO
addq $2 * SIZE, BO
decq %rax
jg .L136
ALIGN_4
.L138:
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
movsd 0 * SIZE(CO1), %xmm8
#endif
addps %xmm1, %xmm0
addps %xmm3, %xmm2
addps %xmm2, %xmm0
movhlps %xmm0, %xmm1
addps %xmm1, %xmm0
mulps %xmm15, %xmm0
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
addps %xmm8, %xmm0
#endif
movsd %xmm0, 0 * SIZE(CO1)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
leaq (,%rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 1), BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $2, KK
#endif
addq $2 * SIZE, CO1 # coffset1 += 2
ALIGN_4
.L140:
testq $1, M
je .L999
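/* m & 1, n = 1 case: a plain dot product.  Four independent
   mulss/addss chains (xmm0-xmm3) hide latency; they are folded into
   a single sum at .L148. */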
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER, BO
#else
leaq BUFFER, BO
movq KK, %rax
leaq (, %rax, 4), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
#endif
movss 0 * SIZE(AO), %xmm8
movss 4 * SIZE(AO), %xmm10
movss 0 * SIZE(BO), %xmm9
movss 8 * SIZE(BO), %xmm11
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
pxor %xmm2, %xmm2
pxor %xmm3, %xmm3
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $1, %rax
#else
addq $1, %rax
#endif
movq %rax, KKK
#endif
sarq $3, %rax
je .L145
ALIGN_4
.L142:
mulss %xmm8, %xmm9
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
movss 1 * SIZE(AO), %xmm8
mulss 2 * SIZE(BO), %xmm8
addss %xmm9, %xmm0
movss 16 * SIZE(BO), %xmm9
addss %xmm8, %xmm1
movss 2 * SIZE(AO), %xmm8
mulss 4 * SIZE(BO), %xmm8
addss %xmm8, %xmm2
movss 3 * SIZE(AO), %xmm8
mulss 6 * SIZE(BO), %xmm8
addss %xmm8, %xmm3
movss 8 * SIZE(AO), %xmm8
mulss %xmm10, %xmm11
movss 5 * SIZE(AO), %xmm10
mulss 10 * SIZE(BO), %xmm10
addss %xmm11, %xmm0
movss 24 * SIZE(BO), %xmm11
addss %xmm10, %xmm1
movss 6 * SIZE(AO), %xmm10
mulss 12 * SIZE(BO), %xmm10
addss %xmm10, %xmm2
movss 7 * SIZE(AO), %xmm10
mulss 14 * SIZE(BO), %xmm10
addss %xmm10, %xmm3
movss 12 * SIZE(AO), %xmm10
addq $ 8 * SIZE, AO
addq $16 * SIZE, BO
decq %rax
jne .L142
ALIGN_4
.L145:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
movss ALPHA, %xmm15
andq $7, %rax # k & 7 remainder
BRANCH
je .L148
ALIGN_4
.L146:
movss 0 * SIZE(AO), %xmm8
movss 0 * SIZE(BO), %xmm9
mulps %xmm8, %xmm9
addps %xmm9, %xmm0
addq $1 * SIZE, AO
addq $2 * SIZE, BO
decq %rax
jg .L146
ALIGN_4
.L148:
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
movss 0 * SIZE(CO1), %xmm8
#endif
addss %xmm1, %xmm0
addss %xmm3, %xmm2
addss %xmm2, %xmm0
mulss %xmm15, %xmm0
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
addss %xmm8, %xmm0
#endif
movss %xmm0, 0 * SIZE(CO1)
ALIGN_4
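/* .L999: common exit.  %rbx still holds the frame pointer saved by
   the prologue (not shown here); restore the callee-saved GPRs (plus
   %rdi/%rsi and xmm6-xmm15 under WINDOWS_ABI), pop the frame and
   return. */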
.L999:
movq %rbx, %rsp
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12
movq 24(%rsp), %r13
movq 32(%rsp), %r14
movq 40(%rsp), %r15
#ifdef WINDOWS_ABI
movq 48(%rsp), %rdi
movq 56(%rsp), %rsi
movups 64(%rsp), %xmm6
movups 80(%rsp), %xmm7
movups 96(%rsp), %xmm8
movups 112(%rsp), %xmm9
movups 128(%rsp), %xmm10
movups 144(%rsp), %xmm11
movups 160(%rsp), %xmm12
movups 176(%rsp), %xmm13
movups 192(%rsp), %xmm14
movups 208(%rsp), %xmm15
#endif
addq $STACKSIZE, %rsp
ret
EPILOGUE