/* gemm_kernel_4x4_sse2.S */

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
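/* Operand names: the M/N/K dimensions, the A/B/C matrix pointers and the
   leading dimension LDC of C; AO/BO/CO1/CO2 are the moving pointers used
   inside the loops, BB a prefetch pointer into the packed B buffer. */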
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
#define N %r14
#define K %rdx
#define A %rcx
#define B %r8
#define C %r9
#define LDC %r10
#define I %r11
#define AO %rdi
#define BO %rsi
#define CO1 %r15
#define CO2 %rbp
#define BB %r12
#ifndef WINDOWS_ABI
#define STACKSIZE 64
#define OLD_LDC 8 + STACKSIZE(%rsp)
#define OLD_OFFSET 16 + STACKSIZE(%rsp)
#else
#define STACKSIZE 256
#define OLD_A 40 + STACKSIZE(%rsp)
#define OLD_B 48 + STACKSIZE(%rsp)
#define OLD_C 56 + STACKSIZE(%rsp)
#define OLD_LDC 64 + STACKSIZE(%rsp)
#define OLD_OFFSET 72 + STACKSIZE(%rsp)
#endif
#define ALPHA 0(%rsp)
#define J 16(%rsp)
#define OFFSET 24(%rsp)
#define KK 32(%rsp)
#define KKK 40(%rsp)
#define BUFFER 256(%rsp)
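/* ALPHA, J, OFFSET, KK and KKK are stack-resident locals; BUFFER, at
   256(%rsp), holds the packed, lane-duplicated copy of B built below. */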
#ifdef OPTERON
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (8 * 9 + 4)
#define movsd movlps
#define movapd movaps
#endif
#ifdef GENERIC
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (8 * 13 + 4)
#define movapd movaps
#endif
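/* KERNEL1..KERNEL8 below form the software-pipelined body of the 4x4
   micro-kernel. The sixteen partial sums of a 4x4 block of C are kept in
   xmm8-xmm15, two doubles per register; KERNEL1/KERNEL2 together advance
   one k iteration (rows 0-1, then rows 2-3), so one KERNEL1..8 pass covers
   four k iterations. As a rough C sketch (an illustration only, not part
   of the original source; a[] and b[] stand for the packed A panel and the
   lane-duplicated BUFFER copy of B):

       for (k = 0; k < K; k++)
           for (j = 0; j < 4; j++)
               for (i = 0; i < 4; i++)
                   c[i + j * ldc] += a[4 * k + i] * b[4 * k + j];

   The alpha scaling is applied once at write-back (see .L19). */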
#ifndef GENERIC
#define KERNEL1(xx) \
mulpd %xmm0, %xmm1 ;\
addpd %xmm1, %xmm8 ;\
movaps -16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
mulpd %xmm0, %xmm3 ;\
addpd %xmm3, %xmm9 ;\
movapd -14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
mulpd %xmm0, %xmm5 ;\
PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\
mulpd -10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\
addpd %xmm5, %xmm10 ;\
movapd -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
addpd %xmm0, %xmm11 ;\
movapd -8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0
#define KERNEL2(xx) \
mulpd %xmm2, %xmm1 ;\
addpd %xmm1, %xmm12 ;\
movapd 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
mulpd %xmm2, %xmm3 ;\
addpd %xmm3, %xmm13 ;\
movapd -6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
mulpd %xmm2, %xmm5 ;\
mulpd -10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\
addpd %xmm5, %xmm14 ;\
movapd -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
addpd %xmm2, %xmm15 ;\
movapd -6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2
#define KERNEL3(xx) \
mulpd %xmm4, %xmm7 ;\
addpd %xmm7, %xmm8 ;\
movapd -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
mulpd %xmm4, %xmm3 ;\
addpd %xmm3, %xmm9 ;\
movapd -6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
mulpd %xmm4, %xmm5 ;\
mulpd -2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\
addpd %xmm5, %xmm10 ;\
movapd -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
addpd %xmm4, %xmm11 ;\
movapd -4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4
#define KERNEL4(xx) \
mulpd %xmm6, %xmm7 ;\
addpd %xmm7, %xmm12 ;\
movapd 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
mulpd %xmm6, %xmm3 ;\
addpd %xmm3, %xmm13 ;\
movapd 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
mulpd %xmm6, %xmm5 ;\
mulpd -2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\
addpd %xmm5, %xmm14 ;\
movapd 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\
addpd %xmm6, %xmm15 ;\
movapd -2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6
#define KERNEL5(xx) \
mulpd %xmm0, %xmm1 ;\
addpd %xmm1, %xmm8 ;\
movapd 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
mulpd %xmm0, %xmm3 ;\
addpd %xmm3, %xmm9 ;\
movapd 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
mulpd %xmm0, %xmm5 ;\
mulpd 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\
addpd %xmm5, %xmm10 ;\
movapd 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
addpd %xmm0, %xmm11 ;\
movapd 0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0
#define KERNEL6(xx) \
mulpd %xmm2, %xmm1 ;\
addpd %xmm1, %xmm12 ;\
movapd 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
mulpd %xmm2, %xmm3 ;\
addpd %xmm3, %xmm13 ;\
movapd 10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
mulpd %xmm2, %xmm5 ;\
mulpd 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\
addpd %xmm5, %xmm14 ;\
movapd 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
addpd %xmm2, %xmm15 ;\
movapd 2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2
#define KERNEL7(xx) \
mulpd %xmm4, %xmm7 ;\
addpd %xmm7, %xmm8 ;\
movapd 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
mulpd %xmm4, %xmm3 ;\
addpd %xmm3, %xmm9 ;\
movapd 10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
mulpd %xmm4, %xmm5 ;\
mulpd 14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\
addpd %xmm5, %xmm10 ;\
movapd 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
addpd %xmm4, %xmm11 ;\
movapd 4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4
#define KERNEL8(xx) \
mulpd %xmm6, %xmm7 ;\
addpd %xmm7, %xmm12 ;\
movapd 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
mulpd %xmm6, %xmm3 ;\
addpd %xmm3, %xmm13 ;\
movapd 18 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
mulpd %xmm6, %xmm5 ;\
mulpd 14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\
addpd %xmm5, %xmm14 ;\
movapd 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
addpd %xmm6, %xmm15 ;\
movapd 6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6
#else
#define KERNEL1(xx) \
mulpd %xmm0, %xmm1 ;\
addpd %xmm1, %xmm8 ;\
movapd -16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
mulpd %xmm0, %xmm3 ;\
addpd %xmm3, %xmm9 ;\
movapd -14 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
mulpd %xmm0, %xmm5 ;\
PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\
mulpd -10 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\
addpd %xmm5, %xmm10 ;\
movapd -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
addpd %xmm0, %xmm11 ;\
movapd -8 * SIZE + 1 * (xx) * SIZE(AO), %xmm0
#define KERNEL2(xx) \
mulpd %xmm2, %xmm1 ;\
addpd %xmm1, %xmm12 ;\
movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
mulpd %xmm2, %xmm3 ;\
addpd %xmm3, %xmm13 ;\
movapd -6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
mulpd %xmm2, %xmm5 ;\
mulpd -10 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\
addpd %xmm5, %xmm14 ;\
movapd -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
addpd %xmm2, %xmm15 ;\
movapd -6 * SIZE + 1 * (xx) * SIZE(AO), %xmm2
#define KERNEL3(xx) \
mulpd %xmm4, %xmm7 ;\
addpd %xmm7, %xmm8 ;\
movapd -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
mulpd %xmm4, %xmm3 ;\
addpd %xmm3, %xmm9 ;\
movapd -6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
mulpd %xmm4, %xmm5 ;\
mulpd -2 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\
addpd %xmm5, %xmm10 ;\
movapd -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
addpd %xmm4, %xmm11 ;\
movapd -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4
#define KERNEL4(xx) \
mulpd %xmm6, %xmm7 ;\
addpd %xmm7, %xmm12 ;\
movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
mulpd %xmm6, %xmm3 ;\
addpd %xmm3, %xmm13 ;\
movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
mulpd %xmm6, %xmm5 ;\
mulpd -2 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\
addpd %xmm5, %xmm14 ;\
movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\
addpd %xmm6, %xmm15 ;\
movapd -2 * SIZE + 1 * (xx) * SIZE(AO), %xmm6
#define KERNEL5(xx) \
mulpd %xmm0, %xmm1 ;\
addpd %xmm1, %xmm8 ;\
movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
mulpd %xmm0, %xmm3 ;\
addpd %xmm3, %xmm9 ;\
movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
mulpd %xmm0, %xmm5 ;\
mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\
addpd %xmm5, %xmm10 ;\
movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
addpd %xmm0, %xmm11 ;\
movapd 0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0
#define KERNEL6(xx) \
mulpd %xmm2, %xmm1 ;\
addpd %xmm1, %xmm12 ;\
movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
mulpd %xmm2, %xmm3 ;\
addpd %xmm3, %xmm13 ;\
movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
mulpd %xmm2, %xmm5 ;\
mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\
addpd %xmm5, %xmm14 ;\
movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
addpd %xmm2, %xmm15 ;\
movapd 2 * SIZE + 1 * (xx) * SIZE(AO), %xmm2
#define KERNEL7(xx) \
mulpd %xmm4, %xmm7 ;\
addpd %xmm7, %xmm8 ;\
movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
mulpd %xmm4, %xmm3 ;\
addpd %xmm3, %xmm9 ;\
movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
mulpd %xmm4, %xmm5 ;\
mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\
addpd %xmm5, %xmm10 ;\
movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
addpd %xmm4, %xmm11 ;\
movapd 4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4
#define KERNEL8(xx) \
mulpd %xmm6, %xmm7 ;\
addpd %xmm7, %xmm12 ;\
movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
mulpd %xmm6, %xmm3 ;\
addpd %xmm3, %xmm13 ;\
movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
mulpd %xmm6, %xmm5 ;\
mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\
addpd %xmm5, %xmm14 ;\
movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
addpd %xmm6, %xmm15 ;\
movapd 6 * SIZE + 1 * (xx) * SIZE(AO), %xmm6
#endif
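/* Prologue: save the callee-saved registers (plus rdi/rsi and xmm6-xmm15
   under WINDOWS_ABI), fetch the arguments, carve out a page-aligned local
   buffer on the stack, and duplicate alpha into both lanes of xmm0 before
   spilling it to ALPHA. */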
PROLOGUE
PROFCODE
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
movq %r13, 24(%rsp)
movq %r14, 32(%rsp)
movq %r15, 40(%rsp)
#ifdef WINDOWS_ABI
movq %rdi, 48(%rsp)
movq %rsi, 56(%rsp)
movups %xmm6, 64(%rsp)
movups %xmm7, 80(%rsp)
movups %xmm8, 96(%rsp)
movups %xmm9, 112(%rsp)
movups %xmm10, 128(%rsp)
movups %xmm11, 144(%rsp)
movups %xmm12, 160(%rsp)
movups %xmm13, 176(%rsp)
movups %xmm14, 192(%rsp)
movups %xmm15, 208(%rsp)
movq ARG1, OLD_M
movq ARG2, OLD_N
movq ARG3, K
movq OLD_A, A
movq OLD_B, B
movq OLD_C, C
movq OLD_LDC, LDC
#ifdef TRMMKERNEL
movsd OLD_OFFSET, %xmm12
#endif
movaps %xmm3, %xmm0
#else
movq OLD_LDC, LDC
#ifdef TRMMKERNEL
movsd OLD_OFFSET, %xmm12
#endif
#endif
EMMS
movq %rsp, %rbx # save old stack
subq $256 + LOCAL_BUFFER_SIZE, %rsp
andq $-4096, %rsp # align stack
STACK_TOUCHING
movq OLD_M, M
movq OLD_N, N
subq $-16 * SIZE, A
unpcklpd %xmm0, %xmm0
movapd %xmm0, ALPHA
leaq (, LDC, SIZE), LDC
#ifdef TRMMKERNEL
movsd %xmm12, OFFSET
movsd %xmm12, KK
#ifndef LEFT
negq KK
#endif
#endif
movq N, J
sarq $2, J # j = (n >> 2)
jle .L40
ALIGN_3
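/* j loop: one pass per 4-column panel of C (n >> 2 passes). Each pass
   first expands the B panel into BUFFER, then runs the micro-kernels. */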
.L01:
/* Copying to Sub Buffer */
leaq 16 * SIZE + BUFFER, BO
movq C, CO1 # coffset1 = c
leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
#endif
movq K, %rax
sarq $2, %rax
jle .L03
ALIGN_3
#define RPREFETCHSIZE (8 * 7 + 4)
#define WPREFETCHSIZE (8 * 8 + 4)
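/* Copy loop: each double of B is stored twice, to adjacent slots of BO,
   so a single movapd later yields that value in both lanes. */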
.L02:
PREFETCH (RPREFETCHSIZE + 0) * SIZE(B)
movq 0 * SIZE(B), %mm0
movq %mm0, -16 * SIZE(BO)
movq %mm0, -15 * SIZE(BO)
movq 1 * SIZE(B), %mm1
movq %mm1, -14 * SIZE(BO)
movq %mm1, -13 * SIZE(BO)
movq 2 * SIZE(B), %mm2
movq %mm2, -12 * SIZE(BO)
movq %mm2, -11 * SIZE(BO)
movq 3 * SIZE(B), %mm3
movq %mm3, -10 * SIZE(BO)
movq %mm3, -9 * SIZE(BO)
PREFETCHW (WPREFETCHSIZE + 0) * SIZE(BO)
movq 4 * SIZE(B), %mm4
movq %mm4, -8 * SIZE(BO)
movq %mm4, -7 * SIZE(BO)
movq 5 * SIZE(B), %mm5
movq %mm5, -6 * SIZE(BO)
movq %mm5, -5 * SIZE(BO)
PREFETCHW (WPREFETCHSIZE + 8) * SIZE(BO)
movq 6 * SIZE(B), %mm6
movq %mm6, -4 * SIZE(BO)
movq %mm6, -3 * SIZE(BO)
movq 7 * SIZE(B), %mm7
movq %mm7, -2 * SIZE(BO)
movq %mm7, -1 * SIZE(BO)
PREFETCH (RPREFETCHSIZE + 8) * SIZE(B)
movq 8 * SIZE(B), %mm0
movq %mm0, 0 * SIZE(BO)
movq %mm0, 1 * SIZE(BO)
movq 9 * SIZE(B), %mm1
movq %mm1, 2 * SIZE(BO)
movq %mm1, 3 * SIZE(BO)
movq 10 * SIZE(B), %mm2
movq %mm2, 4 * SIZE(BO)
movq %mm2, 5 * SIZE(BO)
movq 11 * SIZE(B), %mm3
movq %mm3, 6 * SIZE(BO)
movq %mm3, 7 * SIZE(BO)
PREFETCHW (WPREFETCHSIZE + 16) * SIZE(BO)
movq 12 * SIZE(B), %mm4
movq %mm4, 8 * SIZE(BO)
movq %mm4, 9 * SIZE(BO)
movq 13 * SIZE(B), %mm5
movq %mm5, 10 * SIZE(BO)
movq %mm5, 11 * SIZE(BO)
PREFETCHW (WPREFETCHSIZE + 24) * SIZE(BO)
movq 14 * SIZE(B), %mm6
movq %mm6, 12 * SIZE(BO)
movq %mm6, 13 * SIZE(BO)
movq 15 * SIZE(B), %mm7
movq %mm7, 14 * SIZE(BO)
movq %mm7, 15 * SIZE(BO)
addq $ 32 * SIZE, BO
subq $-16 * SIZE, B
subq $1, %rax
jne .L02
ALIGN_3
.L03:
movq K, %rax
andq $3, %rax
BRANCH
jle .L10
ALIGN_3
.L04:
movq 0 * SIZE(B), %mm0
movq %mm0, -16 * SIZE(BO)
movq %mm0, -15 * SIZE(BO)
movq 1 * SIZE(B), %mm1
movq %mm1, -14 * SIZE(BO)
movq %mm1, -13 * SIZE(BO)
movq 2 * SIZE(B), %mm2
movq %mm2, -12 * SIZE(BO)
movq %mm2, -11 * SIZE(BO)
movq 3 * SIZE(B), %mm3
movq %mm3, -10 * SIZE(BO)
movq %mm3, -9 * SIZE(BO)
addq $4 * SIZE, B
addq $8 * SIZE, BO
subq $1, %rax
jne .L04
ALIGN_3
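/* i loop: 4x4 blocks of C first (m >> 2); the m remainders follow at .L20. */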
.L10:
movq A, AO # aoffset = a
leaq (RPREFETCHSIZE + 0) * SIZE(B), BB
movq M, I
sarq $2, I # i = (m >> 2)
jle .L20
ALIGN_3
.L11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq 16 * SIZE + BUFFER, BO
#else
leaq 16 * SIZE + BUFFER, BO
movq KK, %rax
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 8), BO
#endif
movapd -16 * SIZE(AO), %xmm0
movapd -16 * SIZE(BO), %xmm1
pxor %xmm8, %xmm8
movapd -14 * SIZE(AO), %xmm2
movapd -14 * SIZE(BO), %xmm3
pxor %xmm9, %xmm9
movapd -12 * SIZE(AO), %xmm4
movapd -12 * SIZE(BO), %xmm5
pxor %xmm10, %xmm10
movapd -10 * SIZE(AO), %xmm6
movapd -8 * SIZE(BO), %xmm7
pxor %xmm11, %xmm11
PREFETCHW 3 * SIZE(CO1)
pxor %xmm12, %xmm12
PREFETCHW 7 * SIZE(CO2)
pxor %xmm13, %xmm13
PREFETCHW 3 * SIZE(CO1, LDC, 2)
pxor %xmm14, %xmm14
PREFETCHW 7 * SIZE(CO2, LDC, 2)
pxor %xmm15, %xmm15
PREFETCH 0 * SIZE(BB)
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $4, %rax
#else
addq $4, %rax
#endif
movq %rax, KKK
#endif
#ifndef GENERIC
andq $-8, %rax
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 8), BO
negq %rax
NOBRANCH
je .L15
ALIGN_3
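/* Unrolled k loop: each KERNEL1..KERNEL8 pass advances four k iterations,
   and two passes (xx = 16*0 and 16*1) sit between branch tests, so every
   test covers eight iterations. */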
.L12:
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
KERNEL4(16 * 0)
KERNEL5(16 * 0)
KERNEL6(16 * 0)
KERNEL7(16 * 0)
KERNEL8(16 * 0)
KERNEL1(16 * 1)
KERNEL2(16 * 1)
KERNEL3(16 * 1)
KERNEL4(16 * 1)
KERNEL5(16 * 1)
KERNEL6(16 * 1)
KERNEL7(16 * 1)
KERNEL8(16 * 1)
addq $8 * SIZE, %rax
NOBRANCH
je .L15
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
KERNEL4(16 * 0)
KERNEL5(16 * 0)
KERNEL6(16 * 0)
KERNEL7(16 * 0)
KERNEL8(16 * 0)
KERNEL1(16 * 1)
KERNEL2(16 * 1)
KERNEL3(16 * 1)
KERNEL4(16 * 1)
KERNEL5(16 * 1)
KERNEL6(16 * 1)
KERNEL7(16 * 1)
KERNEL8(16 * 1)
addq $8 * SIZE, %rax
NOBRANCH
je .L15
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
KERNEL4(16 * 0)
KERNEL5(16 * 0)
KERNEL6(16 * 0)
KERNEL7(16 * 0)
KERNEL8(16 * 0)
KERNEL1(16 * 1)
KERNEL2(16 * 1)
KERNEL3(16 * 1)
KERNEL4(16 * 1)
KERNEL5(16 * 1)
KERNEL6(16 * 1)
KERNEL7(16 * 1)
KERNEL8(16 * 1)
addq $8 * SIZE, %rax
NOBRANCH
je .L15
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
KERNEL4(16 * 0)
KERNEL5(16 * 0)
KERNEL6(16 * 0)
KERNEL7(16 * 0)
KERNEL8(16 * 0)
KERNEL1(16 * 1)
KERNEL2(16 * 1)
KERNEL3(16 * 1)
KERNEL4(16 * 1)
KERNEL5(16 * 1)
KERNEL6(16 * 1)
KERNEL7(16 * 1)
KERNEL8(16 * 1)
addq $8 * SIZE, %rax
NOBRANCH
je .L15
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
KERNEL4(16 * 0)
KERNEL5(16 * 0)
KERNEL6(16 * 0)
KERNEL7(16 * 0)
KERNEL8(16 * 0)
KERNEL1(16 * 1)
KERNEL2(16 * 1)
KERNEL3(16 * 1)
KERNEL4(16 * 1)
KERNEL5(16 * 1)
KERNEL6(16 * 1)
KERNEL7(16 * 1)
KERNEL8(16 * 1)
addq $8 * SIZE, %rax
NOBRANCH
je .L15
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
KERNEL4(16 * 0)
KERNEL5(16 * 0)
KERNEL6(16 * 0)
KERNEL7(16 * 0)
KERNEL8(16 * 0)
KERNEL1(16 * 1)
KERNEL2(16 * 1)
KERNEL3(16 * 1)
KERNEL4(16 * 1)
KERNEL5(16 * 1)
KERNEL6(16 * 1)
KERNEL7(16 * 1)
KERNEL8(16 * 1)
addq $8 * SIZE, %rax
NOBRANCH
je .L15
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
KERNEL4(16 * 0)
KERNEL5(16 * 0)
KERNEL6(16 * 0)
KERNEL7(16 * 0)
KERNEL8(16 * 0)
KERNEL1(16 * 1)
KERNEL2(16 * 1)
KERNEL3(16 * 1)
KERNEL4(16 * 1)
KERNEL5(16 * 1)
KERNEL6(16 * 1)
KERNEL7(16 * 1)
KERNEL8(16 * 1)
addq $8 * SIZE, %rax
NOBRANCH
je .L15
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
KERNEL4(16 * 0)
KERNEL5(16 * 0)
KERNEL6(16 * 0)
KERNEL7(16 * 0)
KERNEL8(16 * 0)
KERNEL1(16 * 1)
KERNEL2(16 * 1)
KERNEL3(16 * 1)
KERNEL4(16 * 1)
KERNEL5(16 * 1)
KERNEL6(16 * 1)
KERNEL7(16 * 1)
KERNEL8(16 * 1)
addq $8 * SIZE, %rax
BRANCH
jl .L12
ALIGN_3
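/* k remainder (non-GENERIC path): a (k & 4) step here, then the (k & 3)
   tail at .L16/.L17. */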
.L15:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
testq $4, %rax
je .L16
xorq %rax, %rax
ALIGN_3
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
KERNEL4(16 * 0)
KERNEL5(16 * 0)
KERNEL6(16 * 0)
KERNEL7(16 * 0)
KERNEL8(16 * 0)
addq $32 * SIZE, BO
addq $16 * SIZE, AO
ALIGN_3
#else
sarq $2, %rax
NOBRANCH
jle .L16
ALIGN_3
.L12:
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
KERNEL4(16 * 0)
KERNEL5(16 * 0)
KERNEL6(16 * 0)
KERNEL7(16 * 0)
KERNEL8(16 * 0)
addq $ 32 * SIZE, BO
subq $-16 * SIZE, AO
decq %rax
BRANCH
jg .L12
#endif
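/* Load alpha and run the scalar (k & 3) tail before write-back. */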
.L16:
movapd ALPHA, %xmm7
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $3, %rax # if (k & 3)
je .L19
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 8), BO
negq %rax
ALIGN_3
.L17:
mulpd %xmm0, %xmm1
addpd %xmm1, %xmm8
movapd -14 * SIZE(BO, %rax, 8), %xmm1
mulpd %xmm0, %xmm1
addpd %xmm1, %xmm9
movapd -12 * SIZE(BO, %rax, 8), %xmm1
mulpd %xmm0, %xmm1
mulpd -10 * SIZE(BO, %rax, 8), %xmm0
addpd %xmm1, %xmm10
movapd -16 * SIZE(BO, %rax, 8), %xmm1
addpd %xmm0, %xmm11
movapd -12 * SIZE(AO, %rax, 4), %xmm0
mulpd %xmm2, %xmm1
addpd %xmm1, %xmm12
movapd -14 * SIZE(BO, %rax, 8), %xmm1
mulpd %xmm2, %xmm1
addpd %xmm1, %xmm13
movapd -12 * SIZE(BO, %rax, 8), %xmm1
mulpd %xmm2, %xmm1
mulpd -10 * SIZE(BO, %rax, 8), %xmm2
addpd %xmm1, %xmm14
movapd -8 * SIZE(BO, %rax, 8), %xmm1
addpd %xmm2, %xmm15
movapd -10 * SIZE(AO, %rax, 4), %xmm2
addq $SIZE, %rax
jl .L17
ALIGN_3
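/* .L19: write the 4x4 block back to C: the accumulators are scaled by
   alpha and, except in the TRMM kernel, added to the existing C values. */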
.L19:
PREFETCH 8 * SIZE(BB)
subq $-12 * SIZE, BB
#ifndef TRMMKERNEL
movsd 0 * SIZE(CO1), %xmm0
movhpd 1 * SIZE(CO1), %xmm0
movsd 2 * SIZE(CO1), %xmm1
movhpd 3 * SIZE(CO1), %xmm1
movsd 0 * SIZE(CO2), %xmm2
movhpd 1 * SIZE(CO2), %xmm2
movsd 2 * SIZE(CO2), %xmm3
movhpd 3 * SIZE(CO2), %xmm3
#endif
mulpd %xmm7, %xmm8
mulpd %xmm7, %xmm9
mulpd %xmm7, %xmm10
mulpd %xmm7, %xmm11
mulpd %xmm7, %xmm12
mulpd %xmm7, %xmm13
mulpd %xmm7, %xmm14
mulpd %xmm7, %xmm15
#ifndef TRMMKERNEL
movlpd 0 * SIZE(CO1, LDC, 2), %xmm4
movhpd 1 * SIZE(CO1, LDC, 2), %xmm4
movlpd 2 * SIZE(CO1, LDC, 2), %xmm5
movhpd 3 * SIZE(CO1, LDC, 2), %xmm5
movlpd 0 * SIZE(CO2, LDC, 2), %xmm6
movhpd 1 * SIZE(CO2, LDC, 2), %xmm6
movlpd 2 * SIZE(CO2, LDC, 2), %xmm7
movhpd 3 * SIZE(CO2, LDC, 2), %xmm7
addpd %xmm0, %xmm8
addpd %xmm1, %xmm12
addpd %xmm2, %xmm9
addpd %xmm3, %xmm13
#endif
movlpd %xmm8, 0 * SIZE(CO1)
movhpd %xmm8, 1 * SIZE(CO1)
movlpd %xmm12, 2 * SIZE(CO1)
movhpd %xmm12, 3 * SIZE(CO1)
movlpd %xmm9, 0 * SIZE(CO2)
movhpd %xmm9, 1 * SIZE(CO2)
movlpd %xmm13, 2 * SIZE(CO2)
movhpd %xmm13, 3 * SIZE(CO2)
#ifndef TRMMKERNEL
addpd %xmm4, %xmm10
addpd %xmm5, %xmm14
addpd %xmm6, %xmm11
addpd %xmm7, %xmm15
#endif
movlpd %xmm10, 0 * SIZE(CO1, LDC, 2)
movhpd %xmm10, 1 * SIZE(CO1, LDC, 2)
movlpd %xmm14, 2 * SIZE(CO1, LDC, 2)
movhpd %xmm14, 3 * SIZE(CO1, LDC, 2)
movlpd %xmm11, 0 * SIZE(CO2, LDC, 2)
movhpd %xmm11, 1 * SIZE(CO2, LDC, 2)
movlpd %xmm15, 2 * SIZE(CO2, LDC, 2)
movhpd %xmm15, 3 * SIZE(CO2, LDC, 2)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
leaq (,%rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 8), BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $4, KK
#endif
addq $4 * SIZE, CO1 # coffset += 4
addq $4 * SIZE, CO2 # coffset += 4
decq I # i --
BRANCH
jg .L11
ALIGN_3
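/* M remainder inside the 4-column panel: a 2-row case (.L21) and a
   1-row case (.L31). */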
.L20:
testq $3, M
je .L39
testq $2, M
je .L30
ALIGN_3
.L21:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER, BO
#else
leaq BUFFER, BO
movq KK, %rax
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 8), BO
#endif
movapd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
movapd 0 * SIZE(BO), %xmm1
pxor %xmm9, %xmm9
movapd -8 * SIZE(AO), %xmm2
pxor %xmm10, %xmm10
movapd 8 * SIZE(BO), %xmm3
pxor %xmm11, %xmm11
movapd 16 * SIZE(BO), %xmm5
movapd 24 * SIZE(BO), %xmm7
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $2, %rax
#else
addq $4, %rax
#endif
movq %rax, KKK
#endif
sarq $3, %rax
je .L25
ALIGN_3
.L22:
mulpd %xmm0, %xmm1
addpd %xmm1, %xmm8
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
movapd 2 * SIZE(BO), %xmm1
mulpd %xmm0, %xmm1
addpd %xmm1, %xmm9
movapd 4 * SIZE(BO), %xmm1
mulpd %xmm0, %xmm1
mulpd 6 * SIZE(BO), %xmm0
addpd %xmm1, %xmm10
movapd 32 * SIZE(BO), %xmm1
addpd %xmm0, %xmm11
movapd -14 * SIZE(AO), %xmm0
mulpd %xmm0, %xmm3
addpd %xmm3, %xmm8
movapd 10 * SIZE(BO), %xmm3
mulpd %xmm0, %xmm3
addpd %xmm3, %xmm9
movapd 12 * SIZE(BO), %xmm3
mulpd %xmm0, %xmm3
mulpd 14 * SIZE(BO), %xmm0
addpd %xmm3, %xmm10
movapd 40 * SIZE(BO), %xmm3
addpd %xmm0, %xmm11
movapd -12 * SIZE(AO), %xmm0
mulpd %xmm0, %xmm5
addpd %xmm5, %xmm8
movapd 18 * SIZE(BO), %xmm5
mulpd %xmm0, %xmm5
addpd %xmm5, %xmm9
movapd 20 * SIZE(BO), %xmm5
mulpd %xmm0, %xmm5
mulpd 22 * SIZE(BO), %xmm0
addpd %xmm5, %xmm10
movapd 48 * SIZE(BO), %xmm5
addpd %xmm0, %xmm11
movapd -10 * SIZE(AO), %xmm0
mulpd %xmm0, %xmm7
addpd %xmm7, %xmm8
movapd 26 * SIZE(BO), %xmm7
mulpd %xmm0, %xmm7
addpd %xmm7, %xmm9
movapd 28 * SIZE(BO), %xmm7
mulpd %xmm0, %xmm7
mulpd 30 * SIZE(BO), %xmm0
addpd %xmm7, %xmm10
movapd 56 * SIZE(BO), %xmm7
addpd %xmm0, %xmm11
movapd 0 * SIZE(AO), %xmm0
PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
mulpd %xmm2, %xmm1
addpd %xmm1, %xmm8
movapd 34 * SIZE(BO), %xmm1
mulpd %xmm2, %xmm1
addpd %xmm1, %xmm9
movapd 36 * SIZE(BO), %xmm1
mulpd %xmm2, %xmm1
mulpd 38 * SIZE(BO), %xmm2
addpd %xmm1, %xmm10
movapd 64 * SIZE(BO), %xmm1
addpd %xmm2, %xmm11
movapd -6 * SIZE(AO), %xmm2
mulpd %xmm2, %xmm3
addpd %xmm3, %xmm8
movapd 42 * SIZE(BO), %xmm3
mulpd %xmm2, %xmm3
addpd %xmm3, %xmm9
movapd 44 * SIZE(BO), %xmm3
mulpd %xmm2, %xmm3
mulpd 46 * SIZE(BO), %xmm2
addpd %xmm3, %xmm10
movapd 72 * SIZE(BO), %xmm3
addpd %xmm2, %xmm11
movapd -4 * SIZE(AO), %xmm2
mulpd %xmm2, %xmm5
addpd %xmm5, %xmm8
movapd 50 * SIZE(BO), %xmm5
mulpd %xmm2, %xmm5
addpd %xmm5, %xmm9
movapd 52 * SIZE(BO), %xmm5
mulpd %xmm2, %xmm5
mulpd 54 * SIZE(BO), %xmm2
addpd %xmm5, %xmm10
movapd 80 * SIZE(BO), %xmm5
addpd %xmm2, %xmm11
movapd -2 * SIZE(AO), %xmm2
mulpd %xmm2, %xmm7
addpd %xmm7, %xmm8
movapd 58 * SIZE(BO), %xmm7
mulpd %xmm2, %xmm7
addpd %xmm7, %xmm9
movapd 60 * SIZE(BO), %xmm7
mulpd %xmm2, %xmm7
mulpd 62 * SIZE(BO), %xmm2
addpd %xmm7, %xmm10
movapd 88 * SIZE(BO), %xmm7
addpd %xmm2, %xmm11
movapd 8 * SIZE(AO), %xmm2
addq $16 * SIZE, AO
addq $64 * SIZE, BO
decq %rax
jne .L22
ALIGN_3
.L25:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
movapd ALPHA, %xmm7
andq $7, %rax # if (k & 7)
BRANCH
je .L29
ALIGN_3
.L26:
mulpd %xmm0, %xmm1
addpd %xmm1, %xmm8
movapd 2 * SIZE(BO), %xmm1
mulpd %xmm0, %xmm1
addpd %xmm1, %xmm9
movapd 4 * SIZE(BO), %xmm1
mulpd %xmm0, %xmm1
mulpd 6 * SIZE(BO), %xmm0
addpd %xmm1, %xmm10
movapd 8 * SIZE(BO), %xmm1
addpd %xmm0, %xmm11
movapd -14 * SIZE(AO), %xmm0
addq $2 * SIZE, AO # aoffset += 2
addq $8 * SIZE, BO # boffset1 += 8
decq %rax
jg .L26
ALIGN_3
.L29:
#ifndef TRMMKERNEL
movlpd 0 * SIZE(CO1), %xmm0
movhpd 1 * SIZE(CO1), %xmm0
movlpd 0 * SIZE(CO2), %xmm2
movhpd 1 * SIZE(CO2), %xmm2
movlpd 0 * SIZE(CO1, LDC, 2), %xmm4
movhpd 1 * SIZE(CO1, LDC, 2), %xmm4
movlpd 0 * SIZE(CO2, LDC, 2), %xmm6
movhpd 1 * SIZE(CO2, LDC, 2), %xmm6
#endif
mulpd %xmm7, %xmm8
mulpd %xmm7, %xmm9
mulpd %xmm7, %xmm10
mulpd %xmm7, %xmm11
#ifndef TRMMKERNEL
addpd %xmm0, %xmm8
addpd %xmm2, %xmm9
addpd %xmm4, %xmm10
addpd %xmm6, %xmm11
#endif
movlpd %xmm8, 0 * SIZE(CO1)
movhpd %xmm8, 1 * SIZE(CO1)
movlpd %xmm9, 0 * SIZE(CO2)
movhpd %xmm9, 1 * SIZE(CO2)
movlpd %xmm10, 0 * SIZE(CO1, LDC, 2)
movhpd %xmm10, 1 * SIZE(CO1, LDC, 2)
movlpd %xmm11, 0 * SIZE(CO2, LDC, 2)
movhpd %xmm11, 1 * SIZE(CO2, LDC, 2)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
leaq (,%rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 8), BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $2, KK
#endif
addq $2 * SIZE, CO1 # coffset += 2
addq $2 * SIZE, CO2 # coffset += 2
ALIGN_3
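/* m & 1: a single row against the 4-column panel, done with scalar
   movsd/mulsd/addsd. */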
.L30:
testq $1, M
je .L39
ALIGN_3
.L31:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER, BO
#else
leaq BUFFER, BO
movq KK, %rax
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
#endif
movsd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
movsd 0 * SIZE(BO), %xmm1
pxor %xmm9, %xmm9
movsd -8 * SIZE(AO), %xmm2
pxor %xmm10, %xmm10
movsd 8 * SIZE(BO), %xmm3
pxor %xmm11, %xmm11
movsd 16 * SIZE(BO), %xmm5
movsd 24 * SIZE(BO), %xmm7
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $1, %rax
#else
addq $4, %rax
#endif
movq %rax, KKK
#endif
sarq $3, %rax
je .L35
ALIGN_3
.L32:
mulsd %xmm0, %xmm1
addsd %xmm1, %xmm8
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
movsd 2 * SIZE(BO), %xmm1
mulsd %xmm0, %xmm1
addsd %xmm1, %xmm9
movsd 4 * SIZE(BO), %xmm1
mulsd %xmm0, %xmm1
mulsd 6 * SIZE(BO), %xmm0
addsd %xmm1, %xmm10
movsd 32 * SIZE(BO), %xmm1
addsd %xmm0, %xmm11
movsd -15 * SIZE(AO), %xmm0
mulsd %xmm0, %xmm3
addsd %xmm3, %xmm8
movsd 10 * SIZE(BO), %xmm3
mulsd %xmm0, %xmm3
addsd %xmm3, %xmm9
movsd 12 * SIZE(BO), %xmm3
mulsd %xmm0, %xmm3
mulsd 14 * SIZE(BO), %xmm0
addsd %xmm3, %xmm10
movsd 40 * SIZE(BO), %xmm3
addsd %xmm0, %xmm11
movsd -14 * SIZE(AO), %xmm0
mulsd %xmm0, %xmm5
addsd %xmm5, %xmm8
movsd 18 * SIZE(BO), %xmm5
mulsd %xmm0, %xmm5
addsd %xmm5, %xmm9
movsd 20 * SIZE(BO), %xmm5
mulsd %xmm0, %xmm5
mulsd 22 * SIZE(BO), %xmm0
addsd %xmm5, %xmm10
movsd 48 * SIZE(BO), %xmm5
addsd %xmm0, %xmm11
movsd -13 * SIZE(AO), %xmm0
mulsd %xmm0, %xmm7
addsd %xmm7, %xmm8
movsd 26 * SIZE(BO), %xmm7
mulsd %xmm0, %xmm7
addsd %xmm7, %xmm9
movsd 28 * SIZE(BO), %xmm7
mulsd %xmm0, %xmm7
mulsd 30 * SIZE(BO), %xmm0
addsd %xmm7, %xmm10
movsd 56 * SIZE(BO), %xmm7
addsd %xmm0, %xmm11
movsd -12 * SIZE(AO), %xmm0
mulsd %xmm0, %xmm1
addsd %xmm1, %xmm8
movsd 34 * SIZE(BO), %xmm1
mulsd %xmm0, %xmm1
addsd %xmm1, %xmm9
movsd 36 * SIZE(BO), %xmm1
mulsd %xmm0, %xmm1
mulsd 38 * SIZE(BO), %xmm0
addsd %xmm1, %xmm10
movsd 64 * SIZE(BO), %xmm1
addsd %xmm0, %xmm11
movsd -11 * SIZE(AO), %xmm0
mulsd %xmm0, %xmm3
addsd %xmm3, %xmm8
movsd 42 * SIZE(BO), %xmm3
mulsd %xmm0, %xmm3
addsd %xmm3, %xmm9
movsd 44 * SIZE(BO), %xmm3
mulsd %xmm0, %xmm3
mulsd 46 * SIZE(BO), %xmm0
addsd %xmm3, %xmm10
movsd 72 * SIZE(BO), %xmm3
addsd %xmm0, %xmm11
movsd -10 * SIZE(AO), %xmm0
mulsd %xmm0, %xmm5
addsd %xmm5, %xmm8
movsd 50 * SIZE(BO), %xmm5
mulsd %xmm0, %xmm5
addsd %xmm5, %xmm9
movsd 52 * SIZE(BO), %xmm5
mulsd %xmm0, %xmm5
mulsd 54 * SIZE(BO), %xmm0
addsd %xmm5, %xmm10
movsd 80 * SIZE(BO), %xmm5
addsd %xmm0, %xmm11
movsd -9 * SIZE(AO), %xmm0
mulsd %xmm0, %xmm7
addsd %xmm7, %xmm8
movsd 58 * SIZE(BO), %xmm7
mulsd %xmm0, %xmm7
addsd %xmm7, %xmm9
movsd 60 * SIZE(BO), %xmm7
mulsd %xmm0, %xmm7
mulsd 62 * SIZE(BO), %xmm0
addsd %xmm7, %xmm10
movsd 88 * SIZE(BO), %xmm7
addsd %xmm0, %xmm11
movsd -8 * SIZE(AO), %xmm0
addq $ 8 * SIZE, AO
addq $64 * SIZE, BO
decq %rax
jne .L32
ALIGN_3
.L35:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
movsd ALPHA, %xmm7
andq $7, %rax # if (k & 7)
BRANCH
je .L38
ALIGN_3
.L36:
mulsd %xmm0, %xmm1
addsd %xmm1, %xmm8
movsd 2 * SIZE(BO), %xmm1
mulsd %xmm0, %xmm1
addsd %xmm1, %xmm9
movsd 4 * SIZE(BO), %xmm1
mulsd %xmm0, %xmm1
mulsd 6 * SIZE(BO), %xmm0
addsd %xmm1, %xmm10
movsd 8 * SIZE(BO), %xmm1
addsd %xmm0, %xmm11
movsd -15 * SIZE(AO), %xmm0
addq $1 * SIZE, AO # aoffset += 1
addq $8 * SIZE, BO # boffset1 += 8
decq %rax
jg .L36
ALIGN_3
.L38:
#ifndef TRMMKERNEL
movsd 0 * SIZE(CO1), %xmm0
movsd 0 * SIZE(CO2), %xmm2
movsd 0 * SIZE(CO1, LDC, 2), %xmm4
movsd 0 * SIZE(CO2, LDC, 2), %xmm6
#endif
mulsd %xmm7, %xmm8
mulsd %xmm7, %xmm9
mulsd %xmm7, %xmm10
mulsd %xmm7, %xmm11
#ifndef TRMMKERNEL
addsd %xmm0, %xmm8
addsd %xmm2, %xmm9
addsd %xmm4, %xmm10
addsd %xmm6, %xmm11
#endif
movsd %xmm8, 0 * SIZE(CO1)
movsd %xmm9, 0 * SIZE(CO2)
movsd %xmm10, 0 * SIZE(CO1, LDC, 2)
movsd %xmm11, 0 * SIZE(CO2, LDC, 2)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
leaq (,%rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
ALIGN_3
.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $4, KK
#endif
leaq (C, LDC, 4), C # c += 4 * ldc
decq J # j --
jg .L01
ALIGN_3
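/* N remainder: the 2-column panel (n & 2) is handled from .L41 on; the
   final single column, if any, is handled from .L80 on. */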
.L40:
testq $3, N
je .L999
testq $2, N
je .L80
ALIGN_4
.L41:
/* Copying to Sub Buffer */
leaq BUFFER, BO
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
#endif
movq K, %rax
sarq $2, %rax
jle .L43
ALIGN_3
.L42:
PREFETCH 56 * SIZE(B)
movq 0 * SIZE(B), %mm0
movq 1 * SIZE(B), %mm1
movq 2 * SIZE(B), %mm2
movq 3 * SIZE(B), %mm3
movq 4 * SIZE(B), %mm4
movq 5 * SIZE(B), %mm5
movq 6 * SIZE(B), %mm6
movq 7 * SIZE(B), %mm7
addq $ 8 * SIZE, B
addq $16 * SIZE, BO
movq %mm0, -16 * SIZE(BO)
movq %mm0, -15 * SIZE(BO)
movq %mm1, -14 * SIZE(BO)
movq %mm1, -13 * SIZE(BO)
movq %mm2, -12 * SIZE(BO)
movq %mm2, -11 * SIZE(BO)
movq %mm3, -10 * SIZE(BO)
movq %mm3, -9 * SIZE(BO)
movq %mm4, -8 * SIZE(BO)
movq %mm4, -7 * SIZE(BO)
movq %mm5, -6 * SIZE(BO)
movq %mm5, -5 * SIZE(BO)
movq %mm6, -4 * SIZE(BO)
movq %mm6, -3 * SIZE(BO)
movq %mm7, -2 * SIZE(BO)
movq %mm7, -1 * SIZE(BO)
decq %rax
jne .L42
ALIGN_3
.L43:
movq K, %rax
andq $3, %rax
BRANCH
jle .L50
ALIGN_3
.L44:
movq 0 * SIZE(B), %mm0
movq 1 * SIZE(B), %mm1
movq %mm0, 0 * SIZE(BO)
movq %mm0, 1 * SIZE(BO)
movq %mm1, 2 * SIZE(BO)
movq %mm1, 3 * SIZE(BO)
addq $2 * SIZE, B
addq $4 * SIZE, BO
decq %rax
jne .L44
ALIGN_3
.L50:
movq C, CO1 # coffset1 = c
leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
movq A, AO # aoffset = a
movq M, I
sarq $2, I # i = (m >> 2)
jle .L60
ALIGN_3
.L51:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER, BO
#else
leaq BUFFER, BO
movq KK, %rax
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 4), BO
#endif
movapd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
movapd 0 * SIZE(BO), %xmm1
pxor %xmm9, %xmm9
movapd -8 * SIZE(AO), %xmm2
pxor %xmm12, %xmm12
movapd 8 * SIZE(BO), %xmm3
pxor %xmm13, %xmm13
movapd 0 * SIZE(AO), %xmm4
movapd 16 * SIZE(BO), %xmm5
movapd 8 * SIZE(AO), %xmm6
movapd 24 * SIZE(BO), %xmm7
PREFETCHW 4 * SIZE(CO1)
PREFETCHW 4 * SIZE(CO2)
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $4, %rax
#else
addq $2, %rax
#endif
movq %rax, KKK
#endif
sarq $3, %rax
je .L55
ALIGN_3
.L52:
mulpd %xmm0, %xmm1
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
mulpd 2 * SIZE(BO), %xmm0
addpd %xmm1, %xmm8
movapd 0 * SIZE(BO), %xmm1
addpd %xmm0, %xmm9
movapd -14 * SIZE(AO), %xmm0
mulpd %xmm0, %xmm1
mulpd 2 * SIZE(BO), %xmm0
addpd %xmm1, %xmm12
movapd 4 * SIZE(BO), %xmm1
addpd %xmm0, %xmm13
movapd -12 * SIZE(AO), %xmm0
mulpd %xmm0, %xmm1
mulpd 6 * SIZE(BO), %xmm0
addpd %xmm1, %xmm8
movapd 4 * SIZE(BO), %xmm1
addpd %xmm0, %xmm9
movapd -10 * SIZE(AO), %xmm0
mulpd %xmm0, %xmm1
mulpd 6 * SIZE(BO), %xmm0
addpd %xmm1, %xmm12
movapd 32 * SIZE(BO), %xmm1
addpd %xmm0, %xmm13
movapd 16 * SIZE(AO), %xmm0
PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
mulpd %xmm2, %xmm3
mulpd 10 * SIZE(BO), %xmm2
addpd %xmm3, %xmm8
movapd 8 * SIZE(BO), %xmm3
addpd %xmm2, %xmm9
movapd -6 * SIZE(AO), %xmm2
mulpd %xmm2, %xmm3
mulpd 10 * SIZE(BO), %xmm2
addpd %xmm3, %xmm12
movapd 12 * SIZE(BO), %xmm3
addpd %xmm2, %xmm13
movapd -4 * SIZE(AO), %xmm2
mulpd %xmm2, %xmm3
mulpd 14 * SIZE(BO), %xmm2
addpd %xmm3, %xmm8
movapd 12 * SIZE(BO), %xmm3
addpd %xmm2, %xmm9
movapd -2 * SIZE(AO), %xmm2
mulpd %xmm2, %xmm3
mulpd 14 * SIZE(BO), %xmm2
addpd %xmm3, %xmm12
movapd 40 * SIZE(BO), %xmm3
addpd %xmm2, %xmm13
movapd 24 * SIZE(AO), %xmm2
PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
mulpd %xmm4, %xmm5
mulpd 18 * SIZE(BO), %xmm4
addpd %xmm5, %xmm8
movapd 16 * SIZE(BO), %xmm5
addpd %xmm4, %xmm9
movapd 2 * SIZE(AO), %xmm4
mulpd %xmm4, %xmm5
mulpd 18 * SIZE(BO), %xmm4
addpd %xmm5, %xmm12
movapd 20 * SIZE(BO), %xmm5
addpd %xmm4, %xmm13
movapd 4 * SIZE(AO), %xmm4
mulpd %xmm4, %xmm5
mulpd 22 * SIZE(BO), %xmm4
addpd %xmm5, %xmm8
movapd 20 * SIZE(BO), %xmm5
addpd %xmm4, %xmm9
movapd 6 * SIZE(AO), %xmm4
mulpd %xmm4, %xmm5
mulpd 22 * SIZE(BO), %xmm4
addpd %xmm5, %xmm12
movapd 48 * SIZE(BO), %xmm5
addpd %xmm4, %xmm13
movapd 32 * SIZE(AO), %xmm4
PREFETCH (PREFETCHSIZE + 24) * SIZE(AO)
mulpd %xmm6, %xmm7
mulpd 26 * SIZE(BO), %xmm6
addpd %xmm7, %xmm8
movapd 24 * SIZE(BO), %xmm7
addpd %xmm6, %xmm9
movapd 10 * SIZE(AO), %xmm6
mulpd %xmm6, %xmm7
mulpd 26 * SIZE(BO), %xmm6
addpd %xmm7, %xmm12
movapd 28 * SIZE(BO), %xmm7
addpd %xmm6, %xmm13
movapd 12 * SIZE(AO), %xmm6
mulpd %xmm6, %xmm7
mulpd 30 * SIZE(BO), %xmm6
addpd %xmm7, %xmm8
movapd 28 * SIZE(BO), %xmm7
addpd %xmm6, %xmm9
movapd 14 * SIZE(AO), %xmm6
mulpd %xmm6, %xmm7
mulpd 30 * SIZE(BO), %xmm6
addpd %xmm7, %xmm12
movapd 56 * SIZE(BO), %xmm7
addpd %xmm6, %xmm13
movapd 40 * SIZE(AO), %xmm6
addq $32 * SIZE, AO
addq $32 * SIZE, BO
decq %rax
jne .L52
ALIGN_3
.L55:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
movapd ALPHA, %xmm7
andq $7, %rax # if (k & 7)
BRANCH
je .L59
ALIGN_3
.L56:
movapd 0 * SIZE(BO), %xmm1
mulpd %xmm0, %xmm1
addpd %xmm1, %xmm8
mulpd 2 * SIZE(BO), %xmm0
addpd %xmm0, %xmm9
movapd -14 * SIZE(AO), %xmm0
movapd 0 * SIZE(BO), %xmm1
mulpd %xmm0, %xmm1
addpd %xmm1, %xmm12
mulpd 2 * SIZE(BO), %xmm0
addpd %xmm0, %xmm13
movapd -12 * SIZE(AO), %xmm0
addq $4 * SIZE, AO # aoffset += 4
addq $4 * SIZE, BO # boffset1 += 4
decq %rax
jg .L56
ALIGN_3
.L59:
  1529. #ifndef TRMMKERNEL
  1530. movsd 0 * SIZE(CO1), %xmm0
  1531. movhpd 1 * SIZE(CO1), %xmm0
  1532. movsd 2 * SIZE(CO1), %xmm1
  1533. movhpd 3 * SIZE(CO1), %xmm1
  1534. movsd 0 * SIZE(CO2), %xmm2
  1535. movhpd 1 * SIZE(CO2), %xmm2
  1536. movsd 2 * SIZE(CO2), %xmm3
  1537. movhpd 3 * SIZE(CO2), %xmm3
  1538. #endif
  1539. mulpd %xmm7, %xmm8
  1540. mulpd %xmm7, %xmm9
  1541. mulpd %xmm7, %xmm12
  1542. mulpd %xmm7, %xmm13
  1543. #ifndef TRMMKERNEL
  1544. addpd %xmm0, %xmm8
  1545. addpd %xmm1, %xmm12
  1546. addpd %xmm2, %xmm9
  1547. addpd %xmm3, %xmm13
  1548. #endif
  1549. movsd %xmm8, 0 * SIZE(CO1)
  1550. movhpd %xmm8, 1 * SIZE(CO1)
  1551. movsd %xmm12, 2 * SIZE(CO1)
  1552. movhpd %xmm12, 3 * SIZE(CO1)
  1553. movsd %xmm9, 0 * SIZE(CO2)
  1554. movhpd %xmm9, 1 * SIZE(CO2)
  1555. movsd %xmm13, 2 * SIZE(CO2)
  1556. movhpd %xmm13, 3 * SIZE(CO2)
  1557. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1558. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1559. movq K, %rax
  1560. subq KKK, %rax
  1561. leaq (,%rax, SIZE), %rax
  1562. leaq (AO, %rax, 4), AO
  1563. leaq (BO, %rax, 4), BO
  1564. #endif
  1565. #if defined(TRMMKERNEL) && defined(LEFT)
  1566. addq $4, KK
  1567. #endif
  1568. addq $4 * SIZE, CO1 # coffset += 4
  1569. addq $4 * SIZE, CO2 # coffset += 4
  1570. decq I # i --
  1571. jg .L51
  1572. ALIGN_3
.L60:
	testq	$2, M
	je	.L70
	ALIGN_3
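/* .L61: M % 4 == 2 remainder for this pair of columns.  A 2x2 tile is
   accumulated in xmm8/xmm9 plus xmm10/xmm11 (folded together at .L69),
   using the same 8-way unrolled k loop followed by a k & 7 tail. */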
.L61:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leaq	BUFFER, BO
#else
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 4), BO
#endif
	movapd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movapd	0 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movapd	-8 * SIZE(AO), %xmm2
	pxor	%xmm10, %xmm10
	movapd	8 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11
	movapd	16 * SIZE(BO), %xmm5
	movapd	24 * SIZE(BO), %xmm7
#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax
#else
	addq	$2, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L65
	ALIGN_3
.L62:
	mulpd	%xmm0, %xmm1
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
	mulpd	2 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	4 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	6 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm10
	movapd	32 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-12 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm3
	mulpd	10 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm8
	movapd	12 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm9
	movapd	-10 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm3
	mulpd	14 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm10
	movapd	40 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm11
	movapd	0 * SIZE(AO), %xmm0
	PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
	mulpd	%xmm2, %xmm5
	mulpd	18 * SIZE(BO), %xmm2
	addpd	%xmm5, %xmm8
	movapd	20 * SIZE(BO), %xmm5
	addpd	%xmm2, %xmm9
	movapd	-6 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm5
	mulpd	22 * SIZE(BO), %xmm2
	addpd	%xmm5, %xmm10
	movapd	48 * SIZE(BO), %xmm5
	addpd	%xmm2, %xmm11
	movapd	-4 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm7
	mulpd	26 * SIZE(BO), %xmm2
	addpd	%xmm7, %xmm8
	movapd	28 * SIZE(BO), %xmm7
	addpd	%xmm2, %xmm9
	movapd	-2 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm7
	mulpd	30 * SIZE(BO), %xmm2
	addpd	%xmm7, %xmm10
	movapd	56 * SIZE(BO), %xmm7
	addpd	%xmm2, %xmm11
	movapd	8 * SIZE(AO), %xmm2
	addq	$16 * SIZE, AO
	addq	$32 * SIZE, BO
	decq	%rax
	jne	.L62
	ALIGN_3
.L65:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movapd	ALPHA, %xmm7
	andq	$7, %rax		# k & 7
	BRANCH
	je	.L69
	ALIGN_3
.L66:
	mulpd	%xmm0, %xmm1
	mulpd	2 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	4 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0
	addq	$2 * SIZE, AO		# aoffset  += 2
	addq	$4 * SIZE, BO		# boffset1 += 4
	decq	%rax
	jg	.L66
	ALIGN_3
.L69:
#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0
	movsd	0 * SIZE(CO2), %xmm2
	movhpd	1 * SIZE(CO2), %xmm2
#endif
	addpd	%xmm10, %xmm8
	addpd	%xmm11, %xmm9
	mulpd	%xmm7, %xmm8
	mulpd	%xmm7, %xmm9
#ifndef TRMMKERNEL
	addpd	%xmm0, %xmm8
	addpd	%xmm2, %xmm9
#endif
	movsd	%xmm8, 0 * SIZE(CO1)
	movhpd	%xmm8, 1 * SIZE(CO1)
	movsd	%xmm9, 0 * SIZE(CO2)
	movhpd	%xmm9, 1 * SIZE(CO2)
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 4), BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK
#endif
	addq	$2 * SIZE, CO1		# coffset += 2
	addq	$2 * SIZE, CO2		# coffset += 2
	ALIGN_3
.L70:
	testq	$1, M
	je	.L79
	ALIGN_3
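/* .L71: M % 2 == 1 remainder for this pair of columns: a single row of
   A against two columns of B, accumulated as scalars in xmm8/xmm9
   (with xmm10/xmm11 folded in at .L78). */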
.L71:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leaq	BUFFER, BO
#else
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 4), BO
#endif
	movsd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movsd	0 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movsd	-12 * SIZE(AO), %xmm2
	pxor	%xmm10, %xmm10
	movsd	8 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11
	movsd	16 * SIZE(BO), %xmm5
	movsd	24 * SIZE(BO), %xmm7
#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax
#else
	addq	$2, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L75
	ALIGN_3
.L72:
	mulsd	%xmm0, %xmm1
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
	mulsd	2 * SIZE(BO), %xmm0
	addsd	%xmm1, %xmm8
	movsd	4 * SIZE(BO), %xmm1
	addsd	%xmm0, %xmm9
	movsd	-15 * SIZE(AO), %xmm0
	mulsd	%xmm0, %xmm1
	mulsd	6 * SIZE(BO), %xmm0
	addsd	%xmm1, %xmm10
	movsd	32 * SIZE(BO), %xmm1
	addsd	%xmm0, %xmm11
	movsd	-14 * SIZE(AO), %xmm0
	mulsd	%xmm0, %xmm3
	mulsd	10 * SIZE(BO), %xmm0
	addsd	%xmm3, %xmm8
	movsd	12 * SIZE(BO), %xmm3
	addsd	%xmm0, %xmm9
	movsd	-13 * SIZE(AO), %xmm0
	mulsd	%xmm0, %xmm3
	mulsd	14 * SIZE(BO), %xmm0
	addsd	%xmm3, %xmm10
	movsd	40 * SIZE(BO), %xmm3
	addsd	%xmm0, %xmm11
	movsd	-8 * SIZE(AO), %xmm0
	mulsd	%xmm2, %xmm5
	mulsd	18 * SIZE(BO), %xmm2
	addsd	%xmm5, %xmm8
	movsd	20 * SIZE(BO), %xmm5
	addsd	%xmm2, %xmm9
	movsd	-11 * SIZE(AO), %xmm2
	mulsd	%xmm2, %xmm5
	mulsd	22 * SIZE(BO), %xmm2
	addsd	%xmm5, %xmm10
	movsd	48 * SIZE(BO), %xmm5
	addsd	%xmm2, %xmm11
	movsd	-10 * SIZE(AO), %xmm2
	mulsd	%xmm2, %xmm7
	mulsd	26 * SIZE(BO), %xmm2
	addsd	%xmm7, %xmm8
	movsd	28 * SIZE(BO), %xmm7
	addsd	%xmm2, %xmm9
	movsd	-9 * SIZE(AO), %xmm2
	mulsd	%xmm2, %xmm7
	mulsd	30 * SIZE(BO), %xmm2
	addsd	%xmm7, %xmm10
	movsd	56 * SIZE(BO), %xmm7
	addsd	%xmm2, %xmm11
	movsd	-4 * SIZE(AO), %xmm2
	addq	$8 * SIZE, AO
	addq	$32 * SIZE, BO
	decq	%rax
	jne	.L72
	ALIGN_3
.L75:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movsd	ALPHA, %xmm7
	andq	$7, %rax		# k & 7
	BRANCH
	je	.L78
	ALIGN_3
.L76:
	mulsd	%xmm0, %xmm1
	mulsd	2 * SIZE(BO), %xmm0
	addsd	%xmm1, %xmm8
	addsd	%xmm0, %xmm9
	movsd	-15 * SIZE(AO), %xmm0
	movsd	4 * SIZE(BO), %xmm1
	addq	$1 * SIZE, AO		# aoffset  += 1
	addq	$4 * SIZE, BO		# boffset1 += 4
	decq	%rax
	jg	.L76
	ALIGN_3
.L78:
#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0
	movsd	0 * SIZE(CO2), %xmm2
#endif
	addsd	%xmm10, %xmm8
	addsd	%xmm11, %xmm9
	mulsd	%xmm7, %xmm8
	mulsd	%xmm7, %xmm9
#ifndef TRMMKERNEL
	addsd	%xmm0, %xmm8
	addsd	%xmm2, %xmm9
#endif
	movsd	%xmm8, 0 * SIZE(CO1)
	movsd	%xmm9, 0 * SIZE(CO2)
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 4), BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$1, KK
#endif
	ALIGN_3
.L79:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addq	$2, KK
#endif
	leaq	(C, LDC, 2), C
	ALIGN_3
.L80:
	testq	$1, N
	je	.L999
	ALIGN_4
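/* .L81: last column (N % 2 == 1).  B is first repacked into BUFFER with
   every element duplicated, so the packed stream can be consumed with
   aligned 2-wide loads against pairs of A elements.  The copy uses MMX
   movq as a 64-bit move (hence the EMMS before returning).  Roughly
   (a C sketch):

       for (i = 0; i < k; i++) {
           bo[2 * i + 0] = b[i];
           bo[2 * i + 1] = b[i];
       }
*/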
.L81:
/* Copying to Sub Buffer */
	leaq	BUFFER, BO
#if defined(TRMMKERNEL) && defined(LEFT)
	movq	OFFSET, %rax
	movq	%rax, KK
#endif
	movq	K, %rax
	sarq	$3, %rax
	jle	.L83
	ALIGN_3
.L82:
	PREFETCH 56 * SIZE(B)
	movq	0 * SIZE(B), %mm0
	movq	1 * SIZE(B), %mm1
	movq	2 * SIZE(B), %mm2
	movq	3 * SIZE(B), %mm3
	movq	4 * SIZE(B), %mm4
	movq	5 * SIZE(B), %mm5
	movq	6 * SIZE(B), %mm6
	movq	7 * SIZE(B), %mm7
	addq	$8 * SIZE, B
	addq	$16 * SIZE, BO
	movq	%mm0, -16 * SIZE(BO)
	movq	%mm0, -15 * SIZE(BO)
	movq	%mm1, -14 * SIZE(BO)
	movq	%mm1, -13 * SIZE(BO)
	movq	%mm2, -12 * SIZE(BO)
	movq	%mm2, -11 * SIZE(BO)
	movq	%mm3, -10 * SIZE(BO)
	movq	%mm3,  -9 * SIZE(BO)
	movq	%mm4,  -8 * SIZE(BO)
	movq	%mm4,  -7 * SIZE(BO)
	movq	%mm5,  -6 * SIZE(BO)
	movq	%mm5,  -5 * SIZE(BO)
	movq	%mm6,  -4 * SIZE(BO)
	movq	%mm6,  -3 * SIZE(BO)
	movq	%mm7,  -2 * SIZE(BO)
	movq	%mm7,  -1 * SIZE(BO)
	decq	%rax
	jne	.L82
	ALIGN_3
.L83:
	movq	K, %rax
	andq	$7, %rax
	BRANCH
	jle	.L90
	ALIGN_3
.L84:
	movq	0 * SIZE(B), %mm0
	movq	%mm0, 0 * SIZE(BO)
	movq	%mm0, 1 * SIZE(BO)
	addq	$1 * SIZE, B
	addq	$2 * SIZE, BO
	decq	%rax
	jne	.L84
	ALIGN_3
.L90:
	movq	C, CO1			# coffset1 = c
	movq	A, AO			# aoffset = a
	movq	M, I
	sarq	$2, I			# i = (m >> 2)
	jle	.L100
	ALIGN_3
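/* .L91: 4x1 blocks of the last column.  Four column-0 partial sums
   accumulate in xmm8-xmm11 and are folded together at .L99.  Note the
   operand roles are swapped relative to the 4x2 kernel: the 2-wide B
   value is kept in a register and multiplied against A loaded from
   memory. */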
.L91:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leaq	BUFFER, BO
#else
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 2), BO
#endif
	movapd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movapd	0 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movapd	-8 * SIZE(AO), %xmm2
	pxor	%xmm10, %xmm10
	movapd	8 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11
	movapd	0 * SIZE(AO), %xmm4
	movapd	8 * SIZE(AO), %xmm6
	PREFETCHW 4 * SIZE(CO1)
#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$4, %rax
#else
	addq	$1, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L95
	ALIGN_3
.L92:
	mulpd	%xmm1, %xmm0
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
	mulpd	-14 * SIZE(AO), %xmm1
	addpd	%xmm0, %xmm8
	movapd	-12 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm9
	movapd	2 * SIZE(BO), %xmm1
	mulpd	%xmm1, %xmm0
	mulpd	-10 * SIZE(AO), %xmm1
	addpd	%xmm0, %xmm10
	movapd	16 * SIZE(AO), %xmm0
	PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
	addpd	%xmm1, %xmm11
	movapd	4 * SIZE(BO), %xmm1
	mulpd	%xmm1, %xmm2
	mulpd	-6 * SIZE(AO), %xmm1
	addpd	%xmm2, %xmm8
	movapd	-4 * SIZE(AO), %xmm2
	addpd	%xmm1, %xmm9
	movapd	6 * SIZE(BO), %xmm1
	mulpd	%xmm1, %xmm2
	mulpd	-2 * SIZE(AO), %xmm1
	addpd	%xmm2, %xmm10
	movapd	24 * SIZE(AO), %xmm2
	PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
	addpd	%xmm1, %xmm11
	movapd	16 * SIZE(BO), %xmm1
	mulpd	%xmm3, %xmm4
	mulpd	2 * SIZE(AO), %xmm3
	addpd	%xmm4, %xmm8
	movapd	4 * SIZE(AO), %xmm4
	addpd	%xmm3, %xmm9
	movapd	10 * SIZE(BO), %xmm3
	mulpd	%xmm3, %xmm4
	mulpd	6 * SIZE(AO), %xmm3
	addpd	%xmm4, %xmm10
	movapd	32 * SIZE(AO), %xmm4
	PREFETCH (PREFETCHSIZE + 24) * SIZE(AO)
	addpd	%xmm3, %xmm11
	movapd	12 * SIZE(BO), %xmm3
	mulpd	%xmm3, %xmm6
	mulpd	10 * SIZE(AO), %xmm3
	addpd	%xmm6, %xmm8
	movapd	12 * SIZE(AO), %xmm6
	addpd	%xmm3, %xmm9
	movapd	14 * SIZE(BO), %xmm3
	mulpd	%xmm3, %xmm6
	mulpd	14 * SIZE(AO), %xmm3
	addpd	%xmm6, %xmm10
	movapd	40 * SIZE(AO), %xmm6
	addpd	%xmm3, %xmm11
	movapd	24 * SIZE(BO), %xmm3
	addq	$32 * SIZE, AO
	addq	$16 * SIZE, BO
	decq	%rax
	jne	.L92
	ALIGN_3
.L95:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movapd	ALPHA, %xmm7
	andq	$7, %rax		# k & 7
	BRANCH
	je	.L99
	ALIGN_3
.L96:
	mulpd	%xmm1, %xmm0
	mulpd	-14 * SIZE(AO), %xmm1
	addpd	%xmm0, %xmm8
	movapd	-12 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm9
	movapd	2 * SIZE(BO), %xmm1
	addq	$4 * SIZE, AO		# aoffset  += 4
	addq	$2 * SIZE, BO		# boffset1 += 2
	decq	%rax
	jg	.L96
	ALIGN_3
.L99:
#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0
	movsd	2 * SIZE(CO1), %xmm1
	movhpd	3 * SIZE(CO1), %xmm1
#endif
	addpd	%xmm10, %xmm8
	addpd	%xmm11, %xmm9
	mulpd	%xmm7, %xmm8
	mulpd	%xmm7, %xmm9
#ifndef TRMMKERNEL
	addpd	%xmm0, %xmm8
	addpd	%xmm1, %xmm9
#endif
	movsd	%xmm8, 0 * SIZE(CO1)
	movhpd	%xmm8, 1 * SIZE(CO1)
	movsd	%xmm9, 2 * SIZE(CO1)
	movhpd	%xmm9, 3 * SIZE(CO1)
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 2), BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$4, KK
#endif
	addq	$4 * SIZE, CO1		# coffset += 4
	decq	I			# i --
	jg	.L91
	ALIGN_3
.L100:
	testq	$2, M
	je	.L110
	ALIGN_3
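/* .L101: 2x1 remainder blocks of the last column; the four partial
   sums in xmm8-xmm11 are reduced into xmm8 at .L109. */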
.L101:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leaq	BUFFER, BO
#else
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 2), BO
#endif
	movapd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movapd	0 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movapd	-8 * SIZE(AO), %xmm2
	pxor	%xmm10, %xmm10
	movapd	8 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11
#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax
#else
	addq	$1, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L105
	ALIGN_3
.L102:
	mulpd	%xmm0, %xmm1
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
	movapd	-14 * SIZE(AO), %xmm0
	mulpd	2 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	16 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-12 * SIZE(AO), %xmm0
	mulpd	4 * SIZE(BO), %xmm0
	addpd	%xmm0, %xmm10
	movapd	-10 * SIZE(AO), %xmm0
	mulpd	6 * SIZE(BO), %xmm0
	addpd	%xmm0, %xmm11
	movapd	0 * SIZE(AO), %xmm0
	PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
	mulpd	%xmm2, %xmm3
	movapd	-6 * SIZE(AO), %xmm2
	mulpd	10 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm8
	movapd	24 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm9
	movapd	-4 * SIZE(AO), %xmm2
	mulpd	12 * SIZE(BO), %xmm2
	addpd	%xmm2, %xmm10
	movapd	-2 * SIZE(AO), %xmm2
	mulpd	14 * SIZE(BO), %xmm2
	addpd	%xmm2, %xmm11
	movapd	8 * SIZE(AO), %xmm2
	addq	$16 * SIZE, AO
	addq	$16 * SIZE, BO
	decq	%rax
	jne	.L102
	ALIGN_3
.L105:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movapd	ALPHA, %xmm7
	andq	$7, %rax		# k & 7
	BRANCH
	je	.L109
	ALIGN_3
.L106:
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movapd	-14 * SIZE(AO), %xmm0
	movapd	2 * SIZE(BO), %xmm1
	addq	$2 * SIZE, AO		# aoffset  += 2
	addq	$2 * SIZE, BO		# boffset1 += 2
	decq	%rax
	jg	.L106
	ALIGN_3
.L109:
	addpd	%xmm9, %xmm8
	addpd	%xmm11, %xmm10
	addpd	%xmm10, %xmm8
	mulpd	%xmm7, %xmm8
#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0
	addpd	%xmm0, %xmm8
#endif
	movsd	%xmm8, 0 * SIZE(CO1)
	movhpd	%xmm8, 1 * SIZE(CO1)
	addq	$2 * SIZE, CO1		# coffset += 2
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 2), BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK
#endif
	ALIGN_3
.L110:
	testq	$1, M
	je	.L999
	ALIGN_3
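/* .L111: final 1x1 corner: one row of A against the last column of B,
   a plain scalar dot product accumulated across xmm8-xmm11. */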
.L111:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leaq	BUFFER, BO
#else
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 2), BO
#endif
	movsd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movsd	0 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movsd	-12 * SIZE(AO), %xmm2
	pxor	%xmm10, %xmm10
	movsd	8 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11
#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax
#else
	addq	$1, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L115
	ALIGN_3
.L112:
	mulsd	%xmm0, %xmm1
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
	movsd	-15 * SIZE(AO), %xmm0
	addsd	%xmm1, %xmm8
	movsd	16 * SIZE(BO), %xmm1
	mulsd	2 * SIZE(BO), %xmm0
	addsd	%xmm0, %xmm9
	movsd	-14 * SIZE(AO), %xmm0
	mulsd	4 * SIZE(BO), %xmm0
	addsd	%xmm0, %xmm10
	movsd	-13 * SIZE(AO), %xmm0
	mulsd	6 * SIZE(BO), %xmm0
	addsd	%xmm0, %xmm11
	movsd	-8 * SIZE(AO), %xmm0
	mulsd	%xmm2, %xmm3
	movsd	-11 * SIZE(AO), %xmm2
	addsd	%xmm3, %xmm8
	movsd	24 * SIZE(BO), %xmm3
	mulsd	10 * SIZE(BO), %xmm2
	addsd	%xmm2, %xmm9
	movsd	-10 * SIZE(AO), %xmm2
	mulsd	12 * SIZE(BO), %xmm2
	addsd	%xmm2, %xmm10
	movsd	-9 * SIZE(AO), %xmm2
	mulsd	14 * SIZE(BO), %xmm2
	addsd	%xmm2, %xmm11
	movsd	-4 * SIZE(AO), %xmm2
	addq	$8 * SIZE, AO
	addq	$16 * SIZE, BO
	decq	%rax
	jne	.L112
	ALIGN_3
.L115:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movsd	ALPHA, %xmm7
	andq	$7, %rax		# k & 7
	BRANCH
	je	.L118
	ALIGN_3
.L116:
	mulsd	%xmm0, %xmm1
	movsd	-15 * SIZE(AO), %xmm0
	addsd	%xmm1, %xmm8
	movsd	2 * SIZE(BO), %xmm1
	addq	$1 * SIZE, AO		# aoffset  += 1
	addq	$2 * SIZE, BO		# boffset1 += 2
	decq	%rax
	jg	.L116
	ALIGN_3
.L118:
	addsd	%xmm10, %xmm8
	addsd	%xmm11, %xmm9
	addsd	%xmm9, %xmm8
	mulsd	%xmm7, %xmm8
#ifndef TRMMKERNEL
	addsd	0 * SIZE(CO1), %xmm8
#endif
	movsd	%xmm8, 0 * SIZE(CO1)
	ALIGN_3
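/* .L999: epilogue.  Clear the MMX state (EMMS) after the movq copies
   above, restore the callee-saved registers from the stack (plus
   rdi/rsi/xmm6-xmm15 under the Windows ABI), and return. */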
.L999:
	movq	%rbx, %rsp
	EMMS
	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15
#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif
	addq	$STACKSIZE, %rsp
	ret
	EPILOGUE