
zgemm_kernel_4x2_sse.S
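Hand-written x86-64 SSE assembly microkernel for complex GEMM with 4x2 register blocking, from the GotoBLAS2/OpenBLAS code base (The University of Texas at Austin).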

  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
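/* Register roles: M, N, K hold the matrix dimensions; A, B, C the base */
/* pointers; LDC the leading dimension of C; AO, BO, CO1, CO2 the cursors */
/* advanced by the inner loops. */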
  40. #define OLD_M %rdi
  41. #define OLD_N %rsi
  42. #define M %r13
  43. #define N %r14
  44. #define K %rdx
  45. #define A %rcx
  46. #define B %r8
  47. #define C %r9
  48. #define LDC %r10
  49. #define I %r11
  50. #define AO %rdi
  51. #define BO %rsi
  52. #define CO1 %r15
  53. #define CO2 %rbp
  54. #define BB %r12
  55. #ifndef WINDOWS_ABI
  56. #define STACKSIZE 64
  57. #else
  58. #define STACKSIZE 256
  59. #define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
  60. #define OLD_A 48 + STACKSIZE(%rsp)
  61. #define OLD_B 56 + STACKSIZE(%rsp)
  62. #define OLD_C 64 + STACKSIZE(%rsp)
  63. #define OLD_LDC 72 + STACKSIZE(%rsp)
  64. #define OLD_OFFSET 80 + STACKSIZE(%rsp)
  65. #endif
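/* Scratch slots in the local stack frame: POSINV is the sign mask used */
/* to flip imaginary parts, ALPHA_R/ALPHA_I hold the broadcast scaling */
/* factors, and BUFFER is the aligned area that B is packed into. */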
  66. #define POSINV 0(%rsp)
  67. #define ALPHA_R 16(%rsp)
  68. #define ALPHA_I 32(%rsp)
  69. #define J 48(%rsp)
  70. #define OFFSET 56(%rsp)
  71. #define KK 64(%rsp)
  72. #define KKK 72(%rsp)
  73. #define BUFFER 256(%rsp)
  74. #ifdef OPTERON
  75. #define movsd movlps
  76. #endif
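/* Prefetch instruction and lookahead distance are tuned per CPU family. */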
  77. #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
  78. #define PREFETCH prefetch
  79. #define PREFETCHW prefetchw
  80. #define PREFETCHSIZE (16 * 5 + 8)
  81. #endif
  82. #if defined(PENTIUM4) || defined(GENERIC)
  83. #define PREFETCH prefetcht0
  84. #define PREFETCHW prefetcht0
  85. #define PREFETCHSIZE 160
  86. #endif
  87. #define RPREFETCHSIZE (8 * 7 + 4)
  88. #define WPREFETCHSIZE (8 * 8 + 4)
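/* KERNEL1..KERNEL8 unroll the inner product loop. Each macro issues four */
/* packed multiply-accumulate groups (mulps/addps) between a slice of the */
/* packed A panel and broadcast B values, accumulating into xmm8-xmm15. */
/* The GENERIC variants below use plain addressing instead of the */
/* rax-scaled form. */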
  89. #ifndef GENERIC
  90. #define KERNEL1(xx) \
  91. mulps %xmm0, %xmm1 ;\
  92. addps %xmm1, %xmm8 ;\
  93. movaps -32 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
  94. mulps %xmm0, %xmm3 ;\
  95. addps %xmm3, %xmm9 ;\
  96. movaps -28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
  97. mulps %xmm0, %xmm5 ;\
  98. PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\
  99. mulps -20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\
  100. addps %xmm5, %xmm10 ;\
  101. movaps -24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
  102. addps %xmm0, %xmm11 ;\
  103. movaps -16 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0
  104. #define KERNEL2(xx) \
  105. mulps %xmm2, %xmm1 ;\
  106. addps %xmm1, %xmm12 ;\
  107. movaps 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
  108. mulps %xmm2, %xmm3 ;\
  109. addps %xmm3, %xmm13 ;\
  110. movaps -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
  111. mulps %xmm2, %xmm5 ;\
  112. mulps -20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\
  113. addps %xmm5, %xmm14 ;\
  114. movaps -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
  115. addps %xmm2, %xmm15 ;\
  116. movaps -12 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2
  117. #define KERNEL3(xx) \
  118. mulps %xmm4, %xmm7 ;\
  119. addps %xmm7, %xmm8 ;\
  120. movaps -16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
  121. mulps %xmm4, %xmm3 ;\
  122. addps %xmm3, %xmm9 ;\
  123. movaps -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
  124. mulps %xmm4, %xmm5 ;\
  125. mulps -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\
  126. addps %xmm5, %xmm10 ;\
  127. movaps -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
  128. addps %xmm4, %xmm11 ;\
  129. movaps -8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4
  130. #define KERNEL4(xx) \
  131. mulps %xmm6, %xmm7 ;\
  132. addps %xmm7, %xmm12 ;\
  133. movaps 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
  134. mulps %xmm6, %xmm3 ;\
  135. addps %xmm3, %xmm13 ;\
  136. movaps 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
  137. mulps %xmm6, %xmm5 ;\
  138. mulps -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\
  139. addps %xmm5, %xmm14 ;\
  140. movaps 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
  141. PREFETCH (PREFETCHSIZE + 16) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\
  142. addps %xmm6, %xmm15 ;\
  143. movaps -4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6
  144. #define KERNEL5(xx) \
  145. mulps %xmm0, %xmm1 ;\
  146. addps %xmm1, %xmm8 ;\
  147. movaps 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
  148. mulps %xmm0, %xmm3 ;\
  149. addps %xmm3, %xmm9 ;\
  150. movaps 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
  151. mulps %xmm0, %xmm5 ;\
  152. mulps 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\
  153. addps %xmm5, %xmm10 ;\
  154. movaps 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
  155. addps %xmm0, %xmm11 ;\
  156. movaps 0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0
  157. #define KERNEL6(xx) \
  158. mulps %xmm2, %xmm1 ;\
  159. addps %xmm1, %xmm12 ;\
  160. movaps 32 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
  161. mulps %xmm2, %xmm3 ;\
  162. addps %xmm3, %xmm13 ;\
  163. movaps 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
  164. mulps %xmm2, %xmm5 ;\
  165. mulps 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\
  166. addps %xmm5, %xmm14 ;\
  167. movaps 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
  168. addps %xmm2, %xmm15 ;\
  169. movaps 4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2
  170. #define KERNEL7(xx) \
  171. mulps %xmm4, %xmm7 ;\
  172. addps %xmm7, %xmm8 ;\
  173. movaps 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
  174. mulps %xmm4, %xmm3 ;\
  175. addps %xmm3, %xmm9 ;\
  176. movaps 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
  177. mulps %xmm4, %xmm5 ;\
  178. mulps 28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\
  179. addps %xmm5, %xmm10 ;\
  180. movaps 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
  181. addps %xmm4, %xmm11 ;\
  182. movaps 8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4
  183. #define KERNEL8(xx) \
  184. mulps %xmm6, %xmm7 ;\
  185. addps %xmm7, %xmm12 ;\
  186. movaps 48 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
  187. mulps %xmm6, %xmm3 ;\
  188. addps %xmm3, %xmm13 ;\
  189. movaps 36 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
  190. mulps %xmm6, %xmm5 ;\
  191. mulps 28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\
  192. addps %xmm5, %xmm14 ;\
  193. movaps 40 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
  194. addps %xmm6, %xmm15 ;\
  195. movaps 12 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6
  196. #else
  197. #define KERNEL1(xx) \
  198. mulps %xmm0, %xmm1 ;\
  199. addps %xmm1, %xmm8 ;\
  200. movaps -32 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
  201. mulps %xmm0, %xmm3 ;\
  202. addps %xmm3, %xmm9 ;\
  203. movaps -28 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
  204. mulps %xmm0, %xmm5 ;\
  205. PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\
  206. mulps -20 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\
  207. addps %xmm5, %xmm10 ;\
  208. movaps -24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
  209. addps %xmm0, %xmm11 ;\
  210. movaps -16 * SIZE + 1 * (xx) * SIZE(AO), %xmm0
  211. #define KERNEL2(xx) \
  212. mulps %xmm2, %xmm1 ;\
  213. addps %xmm1, %xmm12 ;\
  214. movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
  215. mulps %xmm2, %xmm3 ;\
  216. addps %xmm3, %xmm13 ;\
  217. movaps -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
  218. mulps %xmm2, %xmm5 ;\
  219. mulps -20 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\
  220. addps %xmm5, %xmm14 ;\
  221. movaps -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
  222. addps %xmm2, %xmm15 ;\
  223. movaps -12 * SIZE + 1 * (xx) * SIZE(AO), %xmm2
  224. #define KERNEL3(xx) \
  225. mulps %xmm4, %xmm7 ;\
  226. addps %xmm7, %xmm8 ;\
  227. movaps -16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
  228. mulps %xmm4, %xmm3 ;\
  229. addps %xmm3, %xmm9 ;\
  230. movaps -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
  231. mulps %xmm4, %xmm5 ;\
  232. mulps -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\
  233. addps %xmm5, %xmm10 ;\
  234. movaps -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
  235. addps %xmm4, %xmm11 ;\
  236. movaps -8 * SIZE + 1 * (xx) * SIZE(AO), %xmm4
  237. #define KERNEL4(xx) \
  238. mulps %xmm6, %xmm7 ;\
  239. addps %xmm7, %xmm12 ;\
  240. movaps 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
  241. mulps %xmm6, %xmm3 ;\
  242. addps %xmm3, %xmm13 ;\
  243. movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
  244. mulps %xmm6, %xmm5 ;\
  245. mulps -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\
  246. addps %xmm5, %xmm14 ;\
  247. movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
  248. addps %xmm6, %xmm15 ;\
  249. movaps -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm6
  250. #define KERNEL5(xx) \
  251. mulps %xmm0, %xmm1 ;\
  252. PREFETCH (PREFETCHSIZE + 16) * SIZE + 1 * (xx) * SIZE(AO) ;\
  253. addps %xmm1, %xmm8 ;\
  254. movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
  255. mulps %xmm0, %xmm3 ;\
  256. addps %xmm3, %xmm9 ;\
  257. movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
  258. mulps %xmm0, %xmm5 ;\
  259. mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\
  260. addps %xmm5, %xmm10 ;\
  261. movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
  262. addps %xmm0, %xmm11 ;\
  263. movaps 0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0
  264. #define KERNEL6(xx) \
  265. mulps %xmm2, %xmm1 ;\
  266. addps %xmm1, %xmm12 ;\
  267. movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
  268. mulps %xmm2, %xmm3 ;\
  269. addps %xmm3, %xmm13 ;\
  270. movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
  271. mulps %xmm2, %xmm5 ;\
  272. mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\
  273. addps %xmm5, %xmm14 ;\
  274. movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
  275. addps %xmm2, %xmm15 ;\
  276. movaps 4 * SIZE + 1 * (xx) * SIZE(AO), %xmm2
  277. #define KERNEL7(xx) \
  278. mulps %xmm4, %xmm7 ;\
  279. addps %xmm7, %xmm8 ;\
  280. movaps 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
  281. mulps %xmm4, %xmm3 ;\
  282. addps %xmm3, %xmm9 ;\
  283. movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
  284. mulps %xmm4, %xmm5 ;\
  285. mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\
  286. addps %xmm5, %xmm10 ;\
  287. movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
  288. addps %xmm4, %xmm11 ;\
  289. movaps 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm4
  290. #define KERNEL8(xx) \
  291. mulps %xmm6, %xmm7 ;\
  292. addps %xmm7, %xmm12 ;\
  293. movaps 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
  294. mulps %xmm6, %xmm3 ;\
  295. addps %xmm3, %xmm13 ;\
  296. movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
  297. mulps %xmm6, %xmm5 ;\
  298. mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\
  299. addps %xmm5, %xmm14 ;\
  300. movaps 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
  301. addps %xmm6, %xmm15 ;\
  302. movaps 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm6
  303. #endif
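/* Prologue: save callee-saved GPRs (plus rdi/rsi and xmm6-xmm15 under */
/* the Windows ABI), fetch the arguments, and carve out the local frame. */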
  304. PROLOGUE
  305. PROFCODE
  306. subq $STACKSIZE, %rsp
  307. movq %rbx, 0(%rsp)
  308. movq %rbp, 8(%rsp)
  309. movq %r12, 16(%rsp)
  310. movq %r13, 24(%rsp)
  311. movq %r14, 32(%rsp)
  312. movq %r15, 40(%rsp)
  313. #ifdef WINDOWS_ABI
  314. movq %rdi, 48(%rsp)
  315. movq %rsi, 56(%rsp)
  316. movups %xmm6, 64(%rsp)
  317. movups %xmm7, 80(%rsp)
  318. movups %xmm8, 96(%rsp)
  319. movups %xmm9, 112(%rsp)
  320. movups %xmm10, 128(%rsp)
  321. movups %xmm11, 144(%rsp)
  322. movups %xmm12, 160(%rsp)
  323. movups %xmm13, 176(%rsp)
  324. movups %xmm14, 192(%rsp)
  325. movups %xmm15, 208(%rsp)
  326. movq ARG1, OLD_M
  327. movq ARG2, OLD_N
  328. movq ARG3, K
  329. movq OLD_A, A
  330. movq OLD_B, B
  331. movq OLD_C, C
  332. movq OLD_LDC, LDC
  333. #ifdef TRMMKERNEL
  334. movsd OLD_OFFSET, %xmm12
  335. #endif
  336. movaps %xmm3, %xmm0
  337. movsd OLD_ALPHA_I, %xmm1
  338. #else
  339. movq 72(%rsp), LDC
  340. #ifdef TRMMKERNEL
  341. movsd 80(%rsp), %xmm12
  342. #endif
  343. #endif
  344. movq %rsp, %rbx # save old stack
  345. subq $256 + LOCAL_BUFFER_SIZE, %rsp
  346. andq $-4096, %rsp # align stack
  347. STACK_TOUCHING
  348. movq OLD_M, M
  349. movq OLD_N, N
  350. pxor %xmm7, %xmm7
  351. cmpeqps %xmm7, %xmm7
  352. pslld $31, %xmm7 # Generate mask
  353. pxor %xmm10, %xmm10
  354. shufps $0, %xmm0, %xmm0
  355. movaps %xmm0, 0 + ALPHA_R
  356. movss %xmm1, 4 + ALPHA_I
  357. movss %xmm1, 12 + ALPHA_I
  358. xorps %xmm7, %xmm1
  359. movss %xmm1, 0 + ALPHA_I
  360. movss %xmm1, 8 + ALPHA_I
  361. #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
  362. defined(TN) || defined(TT) || defined(TR) || defined(TC)
  363. movss %xmm7, 0 + POSINV
  364. movss %xmm10, 4 + POSINV
  365. movss %xmm7, 8 + POSINV
  366. movss %xmm10,12 + POSINV
  367. #else
  368. movss %xmm10, 0 + POSINV
  369. movss %xmm7, 4 + POSINV
  370. movss %xmm10, 8 + POSINV
  371. movss %xmm7, 12 + POSINV
  372. #endif
  373. addq $32 * SIZE, A
  374. #ifdef TRMMKERNEL
  375. movsd %xmm12, OFFSET
  376. movsd %xmm12, KK
  377. #ifndef LEFT
  378. negq KK
  379. #endif
  380. #endif
  381. salq $ZBASE_SHIFT, LDC
  382. movq N, J
  383. sarq $1, J # j = (n >> 1)
  384. jle .L40
  385. ALIGN_4
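/* Outer loop: process C two columns at a time (J = N >> 1 iterations). */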
  386. .L01:
  387. #if defined(TRMMKERNEL) && defined(LEFT)
  388. movq OFFSET, %rax
  389. movq %rax, KK
  390. #endif
  391. /* Copying to Sub Buffer */
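/* Packing B: each scalar is broadcast to a full vector, and either the */
/* real or the imaginary broadcasts are sign-flipped via POSINV, */
/* depending on the conjugation variant selected at compile time. */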
  392. leaq BUFFER, BO
  393. movaps POSINV, %xmm7
  394. movq K, %rax
  395. sarq $2, %rax
  396. jle .L03
  397. addq %rax, %rax
  398. ALIGN_4
  399. .L02:
  400. PREFETCH (RPREFETCHSIZE + 0) * SIZE(B)
  401. movss 0 * SIZE(B), %xmm8
  402. movss 1 * SIZE(B), %xmm9
  403. movss 2 * SIZE(B), %xmm10
  404. movss 3 * SIZE(B), %xmm11
  405. movss 4 * SIZE(B), %xmm12
  406. movss 5 * SIZE(B), %xmm13
  407. movss 6 * SIZE(B), %xmm14
  408. movss 7 * SIZE(B), %xmm15
  409. PREFETCHW (WPREFETCHSIZE + 0) * SIZE(BO)
  410. shufps $0, %xmm8, %xmm8
  411. shufps $0, %xmm9, %xmm9
  412. shufps $0, %xmm10, %xmm10
  413. shufps $0, %xmm11, %xmm11
  414. shufps $0, %xmm12, %xmm12
  415. shufps $0, %xmm13, %xmm13
  416. shufps $0, %xmm14, %xmm14
  417. shufps $0, %xmm15, %xmm15
  418. PREFETCHW (WPREFETCHSIZE + 16) * SIZE(BO)
  419. #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
  420. defined(TN) || defined(TT) || defined(TR) || defined(TC)
  421. xorps %xmm7, %xmm9
  422. xorps %xmm7, %xmm11
  423. xorps %xmm7, %xmm13
  424. xorps %xmm7, %xmm15
  425. #else
  426. xorps %xmm7, %xmm8
  427. xorps %xmm7, %xmm10
  428. xorps %xmm7, %xmm12
  429. xorps %xmm7, %xmm14
  430. #endif
  431. movaps %xmm8, 0 * SIZE(BO)
  432. movaps %xmm9, 4 * SIZE(BO)
  433. movaps %xmm10, 8 * SIZE(BO)
  434. movaps %xmm11, 12 * SIZE(BO)
  435. movaps %xmm12, 16 * SIZE(BO)
  436. movaps %xmm13, 20 * SIZE(BO)
  437. movaps %xmm14, 24 * SIZE(BO)
  438. movaps %xmm15, 28 * SIZE(BO)
  439. addq $32 * SIZE, BO
  440. addq $ 8 * SIZE, B
  441. decq %rax
  442. jne .L02
  443. ALIGN_4
  444. .L03:
  445. movq K, %rax
  446. andq $3, %rax
  447. BRANCH
  448. jle .L10
  449. ALIGN_4
  450. .L04:
  451. movss 0 * SIZE(B), %xmm8
  452. movss 1 * SIZE(B), %xmm9
  453. movss 2 * SIZE(B), %xmm10
  454. movss 3 * SIZE(B), %xmm11
  455. shufps $0, %xmm8, %xmm8
  456. shufps $0, %xmm9, %xmm9
  457. shufps $0, %xmm10, %xmm10
  458. shufps $0, %xmm11, %xmm11
  459. #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
  460. defined(TN) || defined(TT) || defined(TR) || defined(TC)
  461. xorps %xmm7, %xmm9
  462. xorps %xmm7, %xmm11
  463. #else
  464. xorps %xmm7, %xmm8
  465. xorps %xmm7, %xmm10
  466. #endif
  467. movaps %xmm8, 0 * SIZE(BO)
  468. movaps %xmm9, 4 * SIZE(BO)
  469. movaps %xmm10, 8 * SIZE(BO)
  470. movaps %xmm11, 12 * SIZE(BO)
  471. addq $ 4 * SIZE, B
  472. addq $16 * SIZE, BO
  473. decq %rax
  474. jne .L04
  475. ALIGN_4
  476. .L10:
  477. movq C, CO1 # coffset1 = c
  478. leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
  479. movq A, AO # aoffset = a
  480. leaq (RPREFETCHSIZE + 0) * SIZE(B), BB
  481. movq M, I
  482. sarq $2, I # i = (m >> 2)
  483. jle .L20
  484. ALIGN_4
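/* 4x2 micro-tile: xmm8-xmm15 serve as accumulators; the loads below */
/* prime the software pipeline before entering the unrolled loop. */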
  485. .L11:
  486. #if !defined(TRMMKERNEL) || \
  487. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  488. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  489. leaq 32 * SIZE + BUFFER, BO
  490. #else
  491. leaq 32 * SIZE + BUFFER, BO
  492. movq KK, %rax
  493. leaq (, %rax, 8), %rax
  494. leaq (AO, %rax, 4), AO
  495. leaq (BO, %rax, 8), BO
  496. #endif
  497. movaps -32 * SIZE(AO), %xmm0
  498. movaps -32 * SIZE(BO), %xmm1
  499. pxor %xmm8, %xmm8
  500. movaps -28 * SIZE(AO), %xmm2
  501. movaps -28 * SIZE(BO), %xmm3
  502. pxor %xmm9, %xmm9
  503. movaps -24 * SIZE(AO), %xmm4
  504. movaps -24 * SIZE(BO), %xmm5
  505. pxor %xmm10, %xmm10
  506. movaps -20 * SIZE(AO), %xmm6
  507. movaps -16 * SIZE(BO), %xmm7
  508. pxor %xmm11, %xmm11
  509. PREFETCHW 7 * SIZE(CO1)
  510. pxor %xmm12, %xmm12
  511. PREFETCHW 7 * SIZE(CO2)
  512. pxor %xmm13, %xmm13
  513. PREFETCH -32 * SIZE(BB)
  514. pxor %xmm14, %xmm14
  515. PREFETCH -16 * SIZE(BB)
  516. pxor %xmm15, %xmm15
  517. subq $-16 * SIZE, BB
  518. #ifndef TRMMKERNEL
  519. movq K, %rax
  520. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  521. movq K, %rax
  522. subq KK, %rax
  523. movq %rax, KKK
  524. #else
  525. movq KK, %rax
  526. #ifdef LEFT
  527. addq $4, %rax
  528. #else
  529. addq $2, %rax
  530. #endif
  531. movq %rax, KKK
  532. #endif
  533. #ifndef GENERIC
  534. andq $-8, %rax
  535. leaq (, %rax, 8), %rax
  536. leaq (AO, %rax, 4), AO
  537. leaq (BO, %rax, 8), BO
  538. negq %rax
  539. NOBRANCH
  540. je .L15
  541. ALIGN_3
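/* Main unrolled loop: the eight KERNEL macros are replicated, and rax */
/* is re-tested after each chunk (NOBRANCH/je) so the loop can fall */
/* through into .L15 without a mispredicted back-edge. */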
  542. .L12:
  543. KERNEL1(16 * 0)
  544. KERNEL2(16 * 0)
  545. KERNEL3(16 * 0)
  546. KERNEL4(16 * 0)
  547. KERNEL5(16 * 0)
  548. KERNEL6(16 * 0)
  549. KERNEL7(16 * 0)
  550. KERNEL8(16 * 0)
  551. KERNEL1(16 * 2)
  552. KERNEL2(16 * 2)
  553. KERNEL3(16 * 2)
  554. KERNEL4(16 * 2)
  555. KERNEL5(16 * 2)
  556. KERNEL6(16 * 2)
  557. KERNEL7(16 * 2)
  558. KERNEL8(16 * 2)
  559. addq $16 * SIZE, %rax
  560. NOBRANCH
  561. je .L15
  562. KERNEL1(16 * 0)
  563. KERNEL2(16 * 0)
  564. KERNEL3(16 * 0)
  565. KERNEL4(16 * 0)
  566. KERNEL5(16 * 0)
  567. KERNEL6(16 * 0)
  568. KERNEL7(16 * 0)
  569. KERNEL8(16 * 0)
  570. KERNEL1(16 * 2)
  571. KERNEL2(16 * 2)
  572. KERNEL3(16 * 2)
  573. KERNEL4(16 * 2)
  574. KERNEL5(16 * 2)
  575. KERNEL6(16 * 2)
  576. KERNEL7(16 * 2)
  577. KERNEL8(16 * 2)
  578. addq $16 * SIZE, %rax
  579. NOBRANCH
  580. je .L15
  581. KERNEL1(16 * 0)
  582. KERNEL2(16 * 0)
  583. KERNEL3(16 * 0)
  584. KERNEL4(16 * 0)
  585. KERNEL5(16 * 0)
  586. KERNEL6(16 * 0)
  587. KERNEL7(16 * 0)
  588. KERNEL8(16 * 0)
  589. KERNEL1(16 * 2)
  590. KERNEL2(16 * 2)
  591. KERNEL3(16 * 2)
  592. KERNEL4(16 * 2)
  593. KERNEL5(16 * 2)
  594. KERNEL6(16 * 2)
  595. KERNEL7(16 * 2)
  596. KERNEL8(16 * 2)
  597. addq $16 * SIZE, %rax
  598. NOBRANCH
  599. je .L15
  600. KERNEL1(16 * 0)
  601. KERNEL2(16 * 0)
  602. KERNEL3(16 * 0)
  603. KERNEL4(16 * 0)
  604. KERNEL5(16 * 0)
  605. KERNEL6(16 * 0)
  606. KERNEL7(16 * 0)
  607. KERNEL8(16 * 0)
  608. KERNEL1(16 * 2)
  609. KERNEL2(16 * 2)
  610. KERNEL3(16 * 2)
  611. KERNEL4(16 * 2)
  612. KERNEL5(16 * 2)
  613. KERNEL6(16 * 2)
  614. KERNEL7(16 * 2)
  615. KERNEL8(16 * 2)
  616. addq $16 * SIZE, %rax
  617. NOBRANCH
  618. je .L15
  619. KERNEL1(16 * 0)
  620. KERNEL2(16 * 0)
  621. KERNEL3(16 * 0)
  622. KERNEL4(16 * 0)
  623. KERNEL5(16 * 0)
  624. KERNEL6(16 * 0)
  625. KERNEL7(16 * 0)
  626. KERNEL8(16 * 0)
  627. KERNEL1(16 * 2)
  628. KERNEL2(16 * 2)
  629. KERNEL3(16 * 2)
  630. KERNEL4(16 * 2)
  631. KERNEL5(16 * 2)
  632. KERNEL6(16 * 2)
  633. KERNEL7(16 * 2)
  634. KERNEL8(16 * 2)
  635. addq $16 * SIZE, %rax
  636. NOBRANCH
  637. je .L15
  638. KERNEL1(16 * 0)
  639. KERNEL2(16 * 0)
  640. KERNEL3(16 * 0)
  641. KERNEL4(16 * 0)
  642. KERNEL5(16 * 0)
  643. KERNEL6(16 * 0)
  644. KERNEL7(16 * 0)
  645. KERNEL8(16 * 0)
  646. KERNEL1(16 * 2)
  647. KERNEL2(16 * 2)
  648. KERNEL3(16 * 2)
  649. KERNEL4(16 * 2)
  650. KERNEL5(16 * 2)
  651. KERNEL6(16 * 2)
  652. KERNEL7(16 * 2)
  653. KERNEL8(16 * 2)
  654. addq $16 * SIZE, %rax
  655. NOBRANCH
  656. je .L15
  657. KERNEL1(16 * 0)
  658. KERNEL2(16 * 0)
  659. KERNEL3(16 * 0)
  660. KERNEL4(16 * 0)
  661. KERNEL5(16 * 0)
  662. KERNEL6(16 * 0)
  663. KERNEL7(16 * 0)
  664. KERNEL8(16 * 0)
  665. KERNEL1(16 * 2)
  666. KERNEL2(16 * 2)
  667. KERNEL3(16 * 2)
  668. KERNEL4(16 * 2)
  669. KERNEL5(16 * 2)
  670. KERNEL6(16 * 2)
  671. KERNEL7(16 * 2)
  672. KERNEL8(16 * 2)
  673. addq $16 * SIZE, %rax
  674. NOBRANCH
  675. je .L15
  676. KERNEL1(16 * 0)
  677. KERNEL2(16 * 0)
  678. KERNEL3(16 * 0)
  679. KERNEL4(16 * 0)
  680. KERNEL5(16 * 0)
  681. KERNEL6(16 * 0)
  682. KERNEL7(16 * 0)
  683. KERNEL8(16 * 0)
  684. KERNEL1(16 * 2)
  685. KERNEL2(16 * 2)
  686. KERNEL3(16 * 2)
  687. KERNEL4(16 * 2)
  688. KERNEL5(16 * 2)
  689. KERNEL6(16 * 2)
  690. KERNEL7(16 * 2)
  691. KERNEL8(16 * 2)
  692. addq $16 * SIZE, %rax
  693. BRANCH
  694. jl .L12
  695. ALIGN_3
  696. .L15:
  697. #ifndef TRMMKERNEL
  698. movq K, %rax
  699. #else
  700. movq KKK, %rax
  701. #endif
  702. testq $4, %rax
  703. je .L16
  704. xorq %rax, %rax
  705. ALIGN_3
  706. KERNEL1(16 * 0)
  707. KERNEL2(16 * 0)
  708. KERNEL3(16 * 0)
  709. KERNEL4(16 * 0)
  710. KERNEL5(16 * 0)
  711. KERNEL6(16 * 0)
  712. KERNEL7(16 * 0)
  713. KERNEL8(16 * 0)
  714. addq $64 * SIZE, BO
  715. addq $32 * SIZE, AO
  716. ALIGN_3
  717. #else
  718. sarq $2, %rax
  719. NOBRANCH
  720. jle .L16
  721. ALIGN_3
  722. .L12:
  723. KERNEL1(16 * 0)
  724. KERNEL2(16 * 0)
  725. KERNEL3(16 * 0)
  726. KERNEL4(16 * 0)
  727. KERNEL5(16 * 0)
  728. KERNEL6(16 * 0)
  729. KERNEL7(16 * 0)
  730. KERNEL8(16 * 0)
  731. addq $ 64 * SIZE, BO
  732. subq $-32 * SIZE, AO
  733. decq %rax
  734. BRANCH
  735. jg .L12
  736. #endif
  737. .L16:
  738. #ifndef TRMMKERNEL
  739. movq K, %rax
  740. #else
  741. movq KKK, %rax
  742. #endif
  743. movaps ALPHA_R, %xmm6
  744. movaps ALPHA_I, %xmm7
  745. andq $3, %rax # if (k & 3)
  746. BRANCH
  747. je .L18
  748. leaq (, %rax, 8), %rax
  749. leaq (AO, %rax, 4), AO
  750. leaq (BO, %rax, 8), BO
  751. negq %rax
  752. ALIGN_4
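/* Remainder of K (k & 3), handled one step at a time. */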
  753. .L17:
  754. mulps %xmm0, %xmm1
  755. addps %xmm1, %xmm8
  756. movaps -28 * SIZE(BO, %rax, 8), %xmm1
  757. mulps %xmm0, %xmm1
  758. addps %xmm1, %xmm9
  759. movaps -24 * SIZE(BO, %rax, 8), %xmm1
  760. mulps %xmm0, %xmm1
  761. mulps -20 * SIZE(BO, %rax, 8), %xmm0
  762. addps %xmm1, %xmm10
  763. movaps -32 * SIZE(BO, %rax, 8), %xmm1
  764. addps %xmm0, %xmm11
  765. movaps -24 * SIZE(AO, %rax, 4), %xmm0
  766. mulps %xmm2, %xmm1
  767. addps %xmm1, %xmm12
  768. movaps -28 * SIZE(BO, %rax, 8), %xmm1
  769. mulps %xmm2, %xmm1
  770. addps %xmm1, %xmm13
  771. movaps -24 * SIZE(BO, %rax, 8), %xmm1
  772. mulps %xmm2, %xmm1
  773. mulps -20 * SIZE(BO, %rax, 8), %xmm2
  774. addps %xmm1, %xmm14
  775. movaps -16 * SIZE(BO, %rax, 8), %xmm1
  776. addps %xmm2, %xmm15
  777. movaps -20 * SIZE(AO, %rax, 4), %xmm2
  778. addq $SIZE * 2, %rax
  779. jl .L17
  780. ALIGN_4
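/* Write-back: shufps $0xb1 swaps real/imaginary within each complex */
/* pair, implementing the cross terms of complex multiplication (first */
/* to combine the partial products, then to apply alpha) before the */
/* result is accumulated into C. */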
  781. .L18:
  782. #ifndef TRMMKERNEL
  783. movsd 0 * SIZE(CO1), %xmm0
  784. movhps 2 * SIZE(CO1), %xmm0
  785. movsd 4 * SIZE(CO1), %xmm2
  786. movhps 6 * SIZE(CO1), %xmm2
  787. movsd 0 * SIZE(CO2), %xmm1
  788. movhps 2 * SIZE(CO2), %xmm1
  789. movsd 4 * SIZE(CO2), %xmm3
  790. movhps 6 * SIZE(CO2), %xmm3
  791. #endif
  792. shufps $0xb1, %xmm9, %xmm9
  793. shufps $0xb1, %xmm11, %xmm11
  794. shufps $0xb1, %xmm13, %xmm13
  795. shufps $0xb1, %xmm15, %xmm15
  796. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  797. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  798. subps %xmm9, %xmm8
  799. subps %xmm11, %xmm10
  800. subps %xmm13, %xmm12
  801. subps %xmm15, %xmm14
  802. #else
  803. addps %xmm9, %xmm8
  804. addps %xmm11, %xmm10
  805. addps %xmm13, %xmm12
  806. addps %xmm15, %xmm14
  807. #endif
  808. movaps %xmm8, %xmm9
  809. movaps %xmm10, %xmm11
  810. movaps %xmm12, %xmm13
  811. movaps %xmm14, %xmm15
  812. shufps $0xb1, %xmm8, %xmm8
  813. shufps $0xb1, %xmm10, %xmm10
  814. shufps $0xb1, %xmm12, %xmm12
  815. shufps $0xb1, %xmm14, %xmm14
  816. mulps %xmm6, %xmm9
  817. mulps %xmm7, %xmm8
  818. mulps %xmm6, %xmm11
  819. mulps %xmm7, %xmm10
  820. mulps %xmm6, %xmm13
  821. mulps %xmm7, %xmm12
  822. mulps %xmm6, %xmm15
  823. mulps %xmm7, %xmm14
  824. addps %xmm9, %xmm8
  825. addps %xmm11, %xmm10
  826. addps %xmm13, %xmm12
  827. addps %xmm15, %xmm14
  828. #ifndef TRMMKERNEL
  829. addps %xmm0, %xmm8
  830. addps %xmm1, %xmm10
  831. addps %xmm2, %xmm12
  832. addps %xmm3, %xmm14
  833. #endif
  834. movsd %xmm8, 0 * SIZE(CO1)
  835. movhps %xmm8, 2 * SIZE(CO1)
  836. movsd %xmm12, 4 * SIZE(CO1)
  837. movhps %xmm12, 6 * SIZE(CO1)
  838. movsd %xmm10, 0 * SIZE(CO2)
  839. movhps %xmm10, 2 * SIZE(CO2)
  840. movsd %xmm14, 4 * SIZE(CO2)
  841. movhps %xmm14, 6 * SIZE(CO2)
  842. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  843. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  844. movq K, %rax
  845. subq KKK, %rax
  846. leaq (,%rax, 8), %rax
  847. leaq (AO, %rax, 4), AO
  848. leaq (BO, %rax, 8), BO
  849. #endif
  850. #if defined(TRMMKERNEL) && defined(LEFT)
  851. addq $4, KK
  852. #endif
  853. addq $8 * SIZE, CO1 # coffset += 4
  854. addq $8 * SIZE, CO2 # coffset += 4
  855. decq I # i --
  856. jg .L11
  857. ALIGN_4
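/* Tail for M & 2: same scheme on a 2x2 block of C. */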
  858. .L20:
  859. testq $2, M
  860. je .L30
  861. #if !defined(TRMMKERNEL) || \
  862. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  863. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  864. leaq 32 * SIZE + BUFFER, BO
  865. #else
  866. leaq 32 * SIZE + BUFFER, BO
  867. movq KK, %rax
  868. leaq (, %rax, 8), %rax
  869. leaq (AO, %rax, 2), AO
  870. leaq (BO, %rax, 8), BO
  871. #endif
  872. movaps -32 * SIZE(AO), %xmm0
  873. movaps -16 * SIZE(AO), %xmm2
  874. movaps 0 * SIZE(AO), %xmm4
  875. movaps 16 * SIZE(AO), %xmm6
  876. movaps -32 * SIZE(BO), %xmm1
  877. movaps -16 * SIZE(BO), %xmm3
  878. movaps 0 * SIZE(BO), %xmm5
  879. movaps 16 * SIZE(BO), %xmm7
  880. pxor %xmm8, %xmm8
  881. pxor %xmm9, %xmm9
  882. pxor %xmm10, %xmm10
  883. pxor %xmm11, %xmm11
  884. #ifndef TRMMKERNEL
  885. movq K, %rax
  886. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  887. movq K, %rax
  888. subq KK, %rax
  889. movq %rax, KKK
  890. #else
  891. movq KK, %rax
  892. #ifdef LEFT
  893. addq $2, %rax
  894. #else
  895. addq $2, %rax
  896. #endif
  897. movq %rax, KKK
  898. #endif
  899. sarq $3, %rax
  900. je .L25
  901. ALIGN_4
  902. .L22:
  903. mulps %xmm0, %xmm1
  904. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  905. addps %xmm1, %xmm8
  906. movaps -28 * SIZE(BO), %xmm1
  907. mulps %xmm0, %xmm1
  908. addps %xmm1, %xmm9
  909. movaps -24 * SIZE(BO), %xmm1
  910. mulps %xmm0, %xmm1
  911. mulps -20 * SIZE(BO), %xmm0
  912. addps %xmm1, %xmm10
  913. movaps 32 * SIZE(BO), %xmm1
  914. addps %xmm0, %xmm11
  915. movaps -28 * SIZE(AO), %xmm0
  916. mulps %xmm0, %xmm3
  917. addps %xmm3, %xmm8
  918. movaps -12 * SIZE(BO), %xmm3
  919. mulps %xmm0, %xmm3
  920. addps %xmm3, %xmm9
  921. movaps -8 * SIZE(BO), %xmm3
  922. mulps %xmm0, %xmm3
  923. mulps -4 * SIZE(BO), %xmm0
  924. addps %xmm3, %xmm10
  925. movaps 48 * SIZE(BO), %xmm3
  926. addps %xmm0, %xmm11
  927. movaps -24 * SIZE(AO), %xmm0
  928. mulps %xmm0, %xmm5
  929. addps %xmm5, %xmm8
  930. movaps 4 * SIZE(BO), %xmm5
  931. mulps %xmm0, %xmm5
  932. addps %xmm5, %xmm9
  933. movaps 8 * SIZE(BO), %xmm5
  934. mulps %xmm0, %xmm5
  935. mulps 12 * SIZE(BO), %xmm0
  936. addps %xmm5, %xmm10
  937. movaps 64 * SIZE(BO), %xmm5
  938. addps %xmm0, %xmm11
  939. movaps -20 * SIZE(AO), %xmm0
  940. mulps %xmm0, %xmm7
  941. addps %xmm7, %xmm8
  942. movaps 20 * SIZE(BO), %xmm7
  943. mulps %xmm0, %xmm7
  944. addps %xmm7, %xmm9
  945. movaps 24 * SIZE(BO), %xmm7
  946. mulps %xmm0, %xmm7
  947. mulps 28 * SIZE(BO), %xmm0
  948. addps %xmm7, %xmm10
  949. movaps 80 * SIZE(BO), %xmm7
  950. addps %xmm0, %xmm11
  951. movaps 0 * SIZE(AO), %xmm0
  952. mulps %xmm2, %xmm1
  953. PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
  954. addps %xmm1, %xmm8
  955. movaps 36 * SIZE(BO), %xmm1
  956. mulps %xmm2, %xmm1
  957. addps %xmm1, %xmm9
  958. movaps 40 * SIZE(BO), %xmm1
  959. mulps %xmm2, %xmm1
  960. mulps 44 * SIZE(BO), %xmm2
  961. addps %xmm1, %xmm10
  962. movaps 96 * SIZE(BO), %xmm1
  963. addps %xmm2, %xmm11
  964. movaps -12 * SIZE(AO), %xmm2
  965. mulps %xmm2, %xmm3
  966. addps %xmm3, %xmm8
  967. movaps 52 * SIZE(BO), %xmm3
  968. mulps %xmm2, %xmm3
  969. addps %xmm3, %xmm9
  970. movaps 56 * SIZE(BO), %xmm3
  971. mulps %xmm2, %xmm3
  972. mulps 60 * SIZE(BO), %xmm2
  973. addps %xmm3, %xmm10
  974. movaps 112 * SIZE(BO), %xmm3
  975. addps %xmm2, %xmm11
  976. movaps -8 * SIZE(AO), %xmm2
  977. mulps %xmm2, %xmm5
  978. addps %xmm5, %xmm8
  979. movaps 68 * SIZE(BO), %xmm5
  980. mulps %xmm2, %xmm5
  981. addps %xmm5, %xmm9
  982. movaps 72 * SIZE(BO), %xmm5
  983. mulps %xmm2, %xmm5
  984. mulps 76 * SIZE(BO), %xmm2
  985. addps %xmm5, %xmm10
  986. movaps 128 * SIZE(BO), %xmm5
  987. addps %xmm2, %xmm11
  988. movaps -4 * SIZE(AO), %xmm2
  989. mulps %xmm2, %xmm7
  990. addps %xmm7, %xmm8
  991. movaps 84 * SIZE(BO), %xmm7
  992. mulps %xmm2, %xmm7
  993. addps %xmm7, %xmm9
  994. movaps 88 * SIZE(BO), %xmm7
  995. mulps %xmm2, %xmm7
  996. mulps 92 * SIZE(BO), %xmm2
  997. addps %xmm7, %xmm10
  998. movaps 144 * SIZE(BO), %xmm7
  999. addps %xmm2, %xmm11
  1000. movaps 16 * SIZE(AO), %xmm2
  1001. subq $ -32 * SIZE, AO
  1002. subq $-128 * SIZE, BO
  1003. decq %rax
  1004. jne .L22
  1005. ALIGN_4
  1006. .L25:
  1007. #ifndef TRMMKERNEL
  1008. movq K, %rax
  1009. #else
  1010. movq KKK, %rax
  1011. #endif
  1012. movaps ALPHA_R, %xmm6
  1013. movaps ALPHA_I, %xmm7
  1014. andq $7, %rax # if (k & 7)
  1015. BRANCH
  1016. je .L28
  1017. ALIGN_4
  1018. .L26:
  1019. mulps %xmm0, %xmm1
  1020. addps %xmm1, %xmm8
  1021. movaps -28 * SIZE(BO), %xmm1
  1022. mulps %xmm0, %xmm1
  1023. addps %xmm1, %xmm9
  1024. movaps -24 * SIZE(BO), %xmm1
  1025. mulps %xmm0, %xmm1
  1026. mulps -20 * SIZE(BO), %xmm0
  1027. addps %xmm1, %xmm10
  1028. movaps -16 * SIZE(BO), %xmm1
  1029. addps %xmm0, %xmm11
  1030. movaps -28 * SIZE(AO), %xmm0
  1031. subq $- 4 * SIZE, AO
  1032. subq $-16 * SIZE, BO
  1033. decq %rax
  1034. jg .L26
  1035. ALIGN_4
  1036. .L28:
  1037. #ifndef TRMMKERNEL
  1038. movsd 0 * SIZE(CO1), %xmm0
  1039. movhps 2 * SIZE(CO1), %xmm0
  1040. movsd 0 * SIZE(CO2), %xmm1
  1041. movhps 2 * SIZE(CO2), %xmm1
  1042. #endif
  1043. shufps $0xb1, %xmm9, %xmm9
  1044. shufps $0xb1, %xmm11, %xmm11
  1045. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1046. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1047. subps %xmm9, %xmm8
  1048. subps %xmm11, %xmm10
  1049. #else
  1050. addps %xmm9, %xmm8
  1051. addps %xmm11, %xmm10
  1052. #endif
  1053. movaps %xmm8, %xmm9
  1054. movaps %xmm10, %xmm11
  1055. shufps $0xb1, %xmm8, %xmm8
  1056. shufps $0xb1, %xmm10, %xmm10
  1057. mulps %xmm6, %xmm9
  1058. mulps %xmm7, %xmm8
  1059. mulps %xmm6, %xmm11
  1060. mulps %xmm7, %xmm10
  1061. addps %xmm9, %xmm8
  1062. addps %xmm11, %xmm10
  1063. #ifndef TRMMKERNEL
  1064. addps %xmm0, %xmm8
  1065. addps %xmm1, %xmm10
  1066. #endif
  1067. movsd %xmm8, 0 * SIZE(CO1)
  1068. movhps %xmm8, 2 * SIZE(CO1)
  1069. movsd %xmm10, 0 * SIZE(CO2)
  1070. movhps %xmm10, 2 * SIZE(CO2)
  1071. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1072. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1073. movq K, %rax
  1074. subq KKK, %rax
  1075. leaq (,%rax, 8), %rax
  1076. leaq (AO, %rax, 2), AO
  1077. leaq (BO, %rax, 8), BO
  1078. #endif
  1079. #if defined(TRMMKERNEL) && defined(LEFT)
  1080. addq $2, KK
  1081. #endif
  1082. addq $4 * SIZE, CO1 # coffset += 2
  1083. addq $4 * SIZE, CO2 # coffset += 2
  1084. ALIGN_4
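/* Tail for M & 1: a single complex element per C column. */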
  1085. .L30:
  1086. testq $1, M
  1087. je .L39
  1088. #if !defined(TRMMKERNEL) || \
  1089. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1090. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1091. leaq 32 * SIZE + BUFFER, BO
  1092. #else
  1093. leaq 32 * SIZE + BUFFER, BO
  1094. movq KK, %rax
  1095. leaq (, %rax, 8), %rax
  1096. leaq (AO, %rax, 1), AO
  1097. leaq (BO, %rax, 8), BO
  1098. #endif
  1099. movaps -32 * SIZE(AO), %xmm0
  1100. movaps -24 * SIZE(AO), %xmm2
  1101. movaps -32 * SIZE(BO), %xmm1
  1102. movaps -16 * SIZE(BO), %xmm3
  1103. movaps 0 * SIZE(BO), %xmm5
  1104. movaps 16 * SIZE(BO), %xmm7
  1105. pxor %xmm8, %xmm8
  1106. pxor %xmm9, %xmm9
  1107. pxor %xmm10, %xmm10
  1108. pxor %xmm11, %xmm11
  1109. #ifndef TRMMKERNEL
  1110. movq K, %rax
  1111. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1112. movq K, %rax
  1113. subq KK, %rax
  1114. movq %rax, KKK
  1115. #else
  1116. movq KK, %rax
  1117. #ifdef LEFT
  1118. addq $1, %rax
  1119. #else
  1120. addq $2, %rax
  1121. #endif
  1122. movq %rax, KKK
  1123. #endif
  1124. sarq $3, %rax
  1125. je .L35
  1126. ALIGN_4
  1127. .L32:
  1128. mulps %xmm0, %xmm1
  1129. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1130. addps %xmm1, %xmm8
  1131. movaps -28 * SIZE(BO), %xmm1
  1132. mulps %xmm0, %xmm1
  1133. addps %xmm1, %xmm9
  1134. movaps -24 * SIZE(BO), %xmm1
  1135. mulps %xmm0, %xmm1
  1136. addps %xmm1, %xmm10
  1137. movaps -20 * SIZE(BO), %xmm1
  1138. mulps %xmm0, %xmm1
  1139. movsd -30 * SIZE(AO), %xmm0
  1140. addps %xmm1, %xmm11
  1141. movaps 32 * SIZE(BO), %xmm1
  1142. mulps %xmm0, %xmm3
  1143. addps %xmm3, %xmm8
  1144. movaps -12 * SIZE(BO), %xmm3
  1145. mulps %xmm0, %xmm3
  1146. addps %xmm3, %xmm9
  1147. movaps -8 * SIZE(BO), %xmm3
  1148. mulps %xmm0, %xmm3
  1149. addps %xmm3, %xmm10
  1150. movaps -4 * SIZE(BO), %xmm3
  1151. mulps %xmm0, %xmm3
  1152. movsd -28 * SIZE(AO), %xmm0
  1153. addps %xmm3, %xmm11
  1154. movaps 48 * SIZE(BO), %xmm3
  1155. mulps %xmm0, %xmm5
  1156. addps %xmm5, %xmm8
  1157. movaps 4 * SIZE(BO), %xmm5
  1158. mulps %xmm0, %xmm5
  1159. addps %xmm5, %xmm9
  1160. movaps 8 * SIZE(BO), %xmm5
  1161. mulps %xmm0, %xmm5
  1162. addps %xmm5, %xmm10
  1163. movaps 12 * SIZE(BO), %xmm5
  1164. mulps %xmm0, %xmm5
  1165. movsd -26 * SIZE(AO), %xmm0
  1166. addps %xmm5, %xmm11
  1167. movaps 64 * SIZE(BO), %xmm5
  1168. mulps %xmm0, %xmm7
  1169. addps %xmm7, %xmm8
  1170. movaps 20 * SIZE(BO), %xmm7
  1171. mulps %xmm0, %xmm7
  1172. addps %xmm7, %xmm9
  1173. movaps 24 * SIZE(BO), %xmm7
  1174. mulps %xmm0, %xmm7
  1175. addps %xmm7, %xmm10
  1176. movaps 28 * SIZE(BO), %xmm7
  1177. mulps %xmm0, %xmm7
  1178. movsd -16 * SIZE(AO), %xmm0
  1179. addps %xmm7, %xmm11
  1180. movaps 80 * SIZE(BO), %xmm7
  1181. mulps %xmm2, %xmm1
  1182. addps %xmm1, %xmm8
  1183. movaps 36 * SIZE(BO), %xmm1
  1184. mulps %xmm2, %xmm1
  1185. addps %xmm1, %xmm9
  1186. movaps 40 * SIZE(BO), %xmm1
  1187. mulps %xmm2, %xmm1
  1188. addps %xmm1, %xmm10
  1189. movaps 44 * SIZE(BO), %xmm1
  1190. mulps %xmm2, %xmm1
  1191. movsd -22 * SIZE(AO), %xmm2
  1192. addps %xmm1, %xmm11
  1193. movaps 96 * SIZE(BO), %xmm1
  1194. mulps %xmm2, %xmm3
  1195. addps %xmm3, %xmm8
  1196. movaps 52 * SIZE(BO), %xmm3
  1197. mulps %xmm2, %xmm3
  1198. addps %xmm3, %xmm9
  1199. movaps 56 * SIZE(BO), %xmm3
  1200. mulps %xmm2, %xmm3
  1201. addps %xmm3, %xmm10
  1202. movaps 60 * SIZE(BO), %xmm3
  1203. mulps %xmm2, %xmm3
  1204. movsd -20 * SIZE(AO), %xmm2
  1205. addps %xmm3, %xmm11
  1206. movaps 112 * SIZE(BO), %xmm3
  1207. mulps %xmm2, %xmm5
  1208. addps %xmm5, %xmm8
  1209. movaps 68 * SIZE(BO), %xmm5
  1210. mulps %xmm2, %xmm5
  1211. addps %xmm5, %xmm9
  1212. movaps 72 * SIZE(BO), %xmm5
  1213. mulps %xmm2, %xmm5
  1214. addps %xmm5, %xmm10
  1215. movaps 76 * SIZE(BO), %xmm5
  1216. mulps %xmm2, %xmm5
  1217. movsd -18 * SIZE(AO), %xmm2
  1218. addps %xmm5, %xmm11
  1219. movaps 128 * SIZE(BO), %xmm5
  1220. mulps %xmm2, %xmm7
  1221. addps %xmm7, %xmm8
  1222. movaps 84 * SIZE(BO), %xmm7
  1223. mulps %xmm2, %xmm7
  1224. addps %xmm7, %xmm9
  1225. movaps 88 * SIZE(BO), %xmm7
  1226. mulps %xmm2, %xmm7
  1227. addps %xmm7, %xmm10
  1228. movaps 92 * SIZE(BO), %xmm7
  1229. mulps %xmm2, %xmm7
  1230. movsd -8 * SIZE(AO), %xmm2
  1231. addps %xmm7, %xmm11
  1232. movaps 144 * SIZE(BO), %xmm7
  1233. subq $ -16 * SIZE, AO
  1234. subq $-128 * SIZE, BO
  1235. decq %rax
  1236. jne .L32
  1237. ALIGN_4
  1238. .L35:
  1239. #ifndef TRMMKERNEL
  1240. movq K, %rax
  1241. #else
  1242. movq KKK, %rax
  1243. #endif
  1244. movaps ALPHA_R, %xmm6
  1245. movaps ALPHA_I, %xmm7
  1246. andq $7, %rax # if (k & 7)
  1247. BRANCH
  1248. je .L38
  1249. ALIGN_4
  1250. .L36:
  1251. mulps %xmm0, %xmm1
  1252. addps %xmm1, %xmm8
  1253. movaps -28 * SIZE(BO), %xmm1
  1254. mulps %xmm0, %xmm1
  1255. addps %xmm1, %xmm9
  1256. movaps -24 * SIZE(BO), %xmm1
  1257. mulps %xmm0, %xmm1
  1258. addps %xmm1, %xmm10
  1259. movaps -20 * SIZE(BO), %xmm1
  1260. mulps %xmm0, %xmm1
  1261. movsd -30 * SIZE(AO), %xmm0
  1262. addps %xmm1, %xmm11
  1263. movaps -16 * SIZE(BO), %xmm1
  1264. subq $ -2 * SIZE, AO
  1265. subq $-16 * SIZE, BO
  1266. decq %rax
  1267. jg .L36
  1268. ALIGN_4
  1269. .L38:
  1270. #ifndef TRMMKERNEL
  1271. #ifdef movsd
  1272. xorps %xmm0, %xmm0
  1273. #endif
  1274. movsd 0 * SIZE(CO1), %xmm0
  1275. #ifdef movsd
  1276. xorps %xmm1, %xmm1
  1277. #endif
  1278. movsd 0 * SIZE(CO2), %xmm1
  1279. #endif
  1280. shufps $0xb1, %xmm9, %xmm9
  1281. shufps $0xb1, %xmm11, %xmm11
  1282. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1283. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1284. subps %xmm9, %xmm8
  1285. subps %xmm11, %xmm10
  1286. #else
  1287. addps %xmm9, %xmm8
  1288. addps %xmm11, %xmm10
  1289. #endif
  1290. movaps %xmm8, %xmm9
  1291. movaps %xmm10, %xmm11
  1292. shufps $0xb1, %xmm8, %xmm8
  1293. shufps $0xb1, %xmm10, %xmm10
  1294. mulps %xmm6, %xmm9
  1295. mulps %xmm7, %xmm8
  1296. mulps %xmm6, %xmm11
  1297. mulps %xmm7, %xmm10
  1298. addps %xmm9, %xmm8
  1299. addps %xmm11, %xmm10
  1300. #ifndef TRMMKERNEL
  1301. addps %xmm0, %xmm8
  1302. addps %xmm1, %xmm10
  1303. #endif
  1304. movlps %xmm8, 0 * SIZE(CO1)
  1305. movlps %xmm10, 0 * SIZE(CO2)
  1306. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1307. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1308. movq K, %rax
  1309. subq KKK, %rax
  1310. leaq (,%rax, 8), %rax
  1311. leaq (AO, %rax, 1), AO
  1312. leaq (BO, %rax, 8), BO
  1313. #endif
  1314. #if defined(TRMMKERNEL) && defined(LEFT)
  1315. addq $1, KK
  1316. #endif
  1317. ALIGN_4
  1318. .L39:
  1319. #if defined(TRMMKERNEL) && !defined(LEFT)
  1320. addq $2, KK
  1321. #endif
  1322. leaq (C, LDC, 2), C # c += 2 * ldc
  1323. decq J # j --
  1324. jg .L01
  1325. ALIGN_4
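/* Tail for N & 1: the last single column of C, with B packed the same way. */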
  1326. .L40:
  1327. testq $1, N
  1328. je .L999
  1329. ALIGN_4
  1330. .L41:
  1331. #if defined(TRMMKERNEL) && defined(LEFT)
  1332. movq OFFSET, %rax
  1333. movq %rax, KK
  1334. #endif
  1335. /* Copying to Sub Buffer */
  1336. leaq BUFFER, BO
  1337. movaps POSINV, %xmm7
  1338. movq K, %rax
  1339. sarq $2, %rax
  1340. jle .L43
  1341. ALIGN_4
  1342. .L42:
  1343. movss 0 * SIZE(B), %xmm8
  1344. movss 1 * SIZE(B), %xmm9
  1345. movss 2 * SIZE(B), %xmm10
  1346. movss 3 * SIZE(B), %xmm11
  1347. movss 4 * SIZE(B), %xmm12
  1348. movss 5 * SIZE(B), %xmm13
  1349. movss 6 * SIZE(B), %xmm14
  1350. movss 7 * SIZE(B), %xmm15
  1351. shufps $0, %xmm8, %xmm8
  1352. shufps $0, %xmm9, %xmm9
  1353. shufps $0, %xmm10, %xmm10
  1354. shufps $0, %xmm11, %xmm11
  1355. shufps $0, %xmm12, %xmm12
  1356. shufps $0, %xmm13, %xmm13
  1357. shufps $0, %xmm14, %xmm14
  1358. shufps $0, %xmm15, %xmm15
  1359. #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
  1360. defined(TN) || defined(TT) || defined(TR) || defined(TC)
  1361. xorps %xmm7, %xmm9
  1362. xorps %xmm7, %xmm11
  1363. xorps %xmm7, %xmm13
  1364. xorps %xmm7, %xmm15
  1365. #else
  1366. xorps %xmm7, %xmm8
  1367. xorps %xmm7, %xmm10
  1368. xorps %xmm7, %xmm12
  1369. xorps %xmm7, %xmm14
  1370. #endif
  1371. #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
  1372. prefetchnta 56 * SIZE(B)
  1373. #endif
  1374. movaps %xmm8, 0 * SIZE(BO)
  1375. movaps %xmm9, 4 * SIZE(BO)
  1376. movaps %xmm10, 8 * SIZE(BO)
  1377. movaps %xmm11, 12 * SIZE(BO)
  1378. movaps %xmm12, 16 * SIZE(BO)
  1379. movaps %xmm13, 20 * SIZE(BO)
  1380. movaps %xmm14, 24 * SIZE(BO)
  1381. movaps %xmm15, 28 * SIZE(BO)
  1382. #if defined(PENTIUM4) || defined(GENERIC)
  1383. PREFETCHW 128 * SIZE(BO)
  1384. PREFETCH 112 * SIZE(B)
  1385. #endif
  1386. addq $32 * SIZE, BO
  1387. addq $ 8 * SIZE, B
  1388. decq %rax
  1389. jne .L42
  1390. ALIGN_4
  1391. .L43:
  1392. movq K, %rax
  1393. andq $3, %rax
  1394. BRANCH
  1395. jle .L50
  1396. ALIGN_4
  1397. .L44:
  1398. movss 0 * SIZE(B), %xmm8
  1399. movss 1 * SIZE(B), %xmm9
  1400. shufps $0, %xmm8, %xmm8
  1401. shufps $0, %xmm9, %xmm9
  1402. #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
  1403. defined(TN) || defined(TT) || defined(TR) || defined(TC)
  1404. xorps %xmm7, %xmm9
  1405. #else
  1406. xorps %xmm7, %xmm8
  1407. #endif
  1408. movaps %xmm8, 0 * SIZE(BO)
  1409. movaps %xmm9, 4 * SIZE(BO)
  1410. addq $2 * SIZE, B
  1411. addq $8 * SIZE, BO
  1412. decq %rax
  1413. jne .L44
  1414. ALIGN_4
  1415. .L50:
  1416. movq C, CO1 # coffset1 = c
  1417. movq A, AO # aoffset = a
  1418. movq M, I
  1419. sarq $2, I # i = (m >> 2)
  1420. jle .L60
  1421. ALIGN_4
  1422. .L51:
  1423. #if !defined(TRMMKERNEL) || \
  1424. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1425. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1426. leaq 32 * SIZE + BUFFER, BO
  1427. #else
  1428. leaq 32 * SIZE + BUFFER, BO
  1429. movq KK, %rax
  1430. leaq (, %rax, 8), %rax
  1431. leaq (AO, %rax, 4), AO
  1432. leaq (BO, %rax, 4), BO
  1433. #endif
  1434. movaps -32 * SIZE(AO), %xmm0
  1435. pxor %xmm8, %xmm8
  1436. movaps -16 * SIZE(AO), %xmm2
  1437. pxor %xmm9, %xmm9
  1438. movaps 0 * SIZE(AO), %xmm4
  1439. pxor %xmm10, %xmm10
  1440. movaps 16 * SIZE(AO), %xmm6
  1441. pxor %xmm11, %xmm11
  1442. movaps -32 * SIZE(BO), %xmm1
  1443. pxor %xmm12, %xmm12
  1444. movaps -16 * SIZE(BO), %xmm3
  1445. pxor %xmm13, %xmm13
  1446. movaps 0 * SIZE(BO), %xmm5
  1447. pxor %xmm14, %xmm14
  1448. movaps 16 * SIZE(BO), %xmm7
  1449. pxor %xmm15, %xmm15
  1450. PREFETCHW 7 * SIZE(CO1)
  1451. #ifndef TRMMKERNEL
  1452. movq K, %rax
  1453. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1454. movq K, %rax
  1455. subq KK, %rax
  1456. movq %rax, KKK
  1457. #else
  1458. movq KK, %rax
  1459. #ifdef LEFT
  1460. addq $4, %rax
  1461. #else
  1462. addq $1, %rax
  1463. #endif
  1464. movq %rax, KKK
  1465. #endif
  1466. sarq $3, %rax
  1467. je .L55
  1468. ALIGN_4
  1469. .L52:
  1470. mulps %xmm0, %xmm1
  1471. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1472. mulps -28 * SIZE(BO), %xmm0
  1473. addps %xmm1, %xmm8
  1474. movaps -32 * SIZE(BO), %xmm1
  1475. addps %xmm0, %xmm9
  1476. movaps -28 * SIZE(AO), %xmm0
  1477. mulps %xmm0, %xmm1
  1478. mulps -28 * SIZE(BO), %xmm0
  1479. addps %xmm1, %xmm12
  1480. movaps -24 * SIZE(BO), %xmm1
  1481. addps %xmm0, %xmm13
  1482. movaps -24 * SIZE(AO), %xmm0
  1483. mulps %xmm0, %xmm1
  1484. mulps -20 * SIZE(BO), %xmm0
  1485. addps %xmm1, %xmm8
  1486. movaps -24 * SIZE(BO), %xmm1
  1487. addps %xmm0, %xmm9
  1488. movaps -20 * SIZE(AO), %xmm0
  1489. mulps %xmm0, %xmm1
  1490. mulps -20 * SIZE(BO), %xmm0
  1491. addps %xmm1, %xmm12
  1492. movaps 32 * SIZE(BO), %xmm1
  1493. addps %xmm0, %xmm13
  1494. movaps 32 * SIZE(AO), %xmm0
  1495. PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
  1496. mulps %xmm2, %xmm3
  1497. mulps -12 * SIZE(BO), %xmm2
  1498. addps %xmm3, %xmm8
  1499. movaps -16 * SIZE(BO), %xmm3
  1500. addps %xmm2, %xmm9
  1501. movaps -12 * SIZE(AO), %xmm2
  1502. mulps %xmm2, %xmm3
  1503. mulps -12 * SIZE(BO), %xmm2
  1504. addps %xmm3, %xmm12
  1505. movaps -8 * SIZE(BO), %xmm3
  1506. addps %xmm2, %xmm13
  1507. movaps -8 * SIZE(AO), %xmm2
  1508. mulps %xmm2, %xmm3
  1509. mulps -4 * SIZE(BO), %xmm2
  1510. addps %xmm3, %xmm8
  1511. movaps -8 * SIZE(BO), %xmm3
  1512. addps %xmm2, %xmm9
  1513. movaps -4 * SIZE(AO), %xmm2
  1514. mulps %xmm2, %xmm3
  1515. mulps -4 * SIZE(BO), %xmm2
  1516. addps %xmm3, %xmm12
  1517. movaps 48 * SIZE(BO), %xmm3
  1518. addps %xmm2, %xmm13
  1519. movaps 48 * SIZE(AO), %xmm2
  1520. PREFETCH (PREFETCHSIZE + 32) * SIZE(AO)
  1521. mulps %xmm4, %xmm5
  1522. mulps 4 * SIZE(BO), %xmm4
  1523. addps %xmm5, %xmm8
  1524. movaps 0 * SIZE(BO), %xmm5
  1525. addps %xmm4, %xmm9
  1526. movaps 4 * SIZE(AO), %xmm4
  1527. mulps %xmm4, %xmm5
  1528. mulps 4 * SIZE(BO), %xmm4
  1529. addps %xmm5, %xmm12
  1530. movaps 8 * SIZE(BO), %xmm5
  1531. addps %xmm4, %xmm13
  1532. movaps 8 * SIZE(AO), %xmm4
  1533. mulps %xmm4, %xmm5
  1534. mulps 12 * SIZE(BO), %xmm4
  1535. addps %xmm5, %xmm8
  1536. movaps 8 * SIZE(BO), %xmm5
  1537. addps %xmm4, %xmm9
  1538. movaps 12 * SIZE(AO), %xmm4
  1539. mulps %xmm4, %xmm5
  1540. mulps 12 * SIZE(BO), %xmm4
  1541. addps %xmm5, %xmm12
  1542. movaps 64 * SIZE(BO), %xmm5
  1543. addps %xmm4, %xmm13
  1544. movaps 64 * SIZE(AO), %xmm4
  1545. PREFETCH (PREFETCHSIZE + 48) * SIZE(AO)
  1546. mulps %xmm6, %xmm7
  1547. mulps 20 * SIZE(BO), %xmm6
  1548. addps %xmm7, %xmm8
  1549. movaps 16 * SIZE(BO), %xmm7
  1550. addps %xmm6, %xmm9
  1551. movaps 20 * SIZE(AO), %xmm6
  1552. mulps %xmm6, %xmm7
  1553. mulps 20 * SIZE(BO), %xmm6
  1554. addps %xmm7, %xmm12
  1555. movaps 24 * SIZE(BO), %xmm7
  1556. addps %xmm6, %xmm13
  1557. movaps 24 * SIZE(AO), %xmm6
  1558. mulps %xmm6, %xmm7
  1559. mulps 28 * SIZE(BO), %xmm6
  1560. addps %xmm7, %xmm8
  1561. movaps 24 * SIZE(BO), %xmm7
  1562. addps %xmm6, %xmm9
  1563. movaps 28 * SIZE(AO), %xmm6
  1564. mulps %xmm6, %xmm7
  1565. mulps 28 * SIZE(BO), %xmm6
  1566. addps %xmm7, %xmm12
  1567. movaps 80 * SIZE(BO), %xmm7
  1568. addps %xmm6, %xmm13
  1569. movaps 80 * SIZE(AO), %xmm6
  1570. subq $-64 * SIZE, AO
  1571. subq $-64 * SIZE, BO
  1572. decq %rax
  1573. jne .L52
  1574. ALIGN_4
  1575. .L55:
  1576. #ifndef TRMMKERNEL
  1577. movq K, %rax
  1578. #else
  1579. movq KKK, %rax
  1580. #endif
  1581. movaps ALPHA_R, %xmm6
  1582. movaps ALPHA_I, %xmm7
  1583. andq $7, %rax # if (k & 7)
  1584. BRANCH
  1585. je .L58
  1586. ALIGN_4
  1587. .L56:
  1588. mulps %xmm0, %xmm1
  1589. mulps -28 * SIZE(BO), %xmm0
  1590. addps %xmm1, %xmm8
  1591. movaps -32 * SIZE(BO), %xmm1
  1592. addps %xmm0, %xmm9
  1593. movaps -28 * SIZE(AO), %xmm0
  1594. mulps %xmm0, %xmm1
  1595. mulps -28 * SIZE(BO), %xmm0
  1596. addps %xmm1, %xmm12
  1597. movaps -24 * SIZE(BO), %xmm1
  1598. addps %xmm0, %xmm13
  1599. movaps -24 * SIZE(AO), %xmm0
  1600. addq $ 8 * SIZE, AO
  1601. addq $ 8 * SIZE, BO
  1602. decq %rax
  1603. jg .L56
  1604. ALIGN_4
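/* .L58: fold the partial products into complex results.  shufps $0xb1
   swaps the real/imaginary halves within each complex pair; the swapped
   term is subtracted or added depending on the conjugation variant.
   The result is then multiplied by the complex ALPHA, added to the
   existing C values unless this is a TRMM kernel, and four complex
   elements are stored back through CO1. */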
.L58:
#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0
	movhps	2 * SIZE(CO1), %xmm0
	movsd	4 * SIZE(CO1), %xmm2
	movhps	6 * SIZE(CO1), %xmm2
#endif

	shufps	$0xb1, %xmm9, %xmm9
	shufps	$0xb1, %xmm13, %xmm13

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
	defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subps	%xmm9, %xmm8
	subps	%xmm13, %xmm12
#else
	addps	%xmm9, %xmm8
	addps	%xmm13, %xmm12
#endif

	movaps	%xmm8, %xmm9
	movaps	%xmm12, %xmm13

	shufps	$0xb1, %xmm8, %xmm8
	shufps	$0xb1, %xmm12, %xmm12

	mulps	%xmm6, %xmm9
	mulps	%xmm7, %xmm8
	mulps	%xmm6, %xmm13
	mulps	%xmm7, %xmm12

	addps	%xmm9, %xmm8
	addps	%xmm13, %xmm12

#ifndef TRMMKERNEL
	addps	%xmm0, %xmm8
	addps	%xmm2, %xmm12
#endif

	movlps	%xmm8, 0 * SIZE(CO1)
	movhps	%xmm8, 2 * SIZE(CO1)
	movlps	%xmm12, 4 * SIZE(CO1)
	movhps	%xmm12, 6 * SIZE(CO1)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, 8), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$4, KK
#endif

	addq	$8 * SIZE, CO1		# coffset += 4 complex elements
	decq	I			# i --
	jg	.L51
	ALIGN_4
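/* .L60: M & 2 tail -- same accumulate/scale/store pattern as the 4x1
   block above, but with only two complex elements of A and C per
   k-iteration, so AO is indexed with stride 2 and xmm10/xmm11 serve
   as the second pair of accumulators. */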
.L60:
	testq	$2, M
	je	.L70

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leaq	32 * SIZE + BUFFER, BO
#else
	leaq	32 * SIZE + BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, 8), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 4), BO
#endif

	movaps	-32 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movaps	-16 * SIZE(AO), %xmm2
	pxor	%xmm9, %xmm9
	movaps	-32 * SIZE(BO), %xmm1
	pxor	%xmm10, %xmm10
	movaps	-16 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11
	movaps	  0 * SIZE(BO), %xmm5
	movaps	 16 * SIZE(BO), %xmm7

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax
#else
	addq	$1, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L65
	ALIGN_4
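/* .L62: inner loop for the 2x1 block, unrolled 8x over k; AO advances
   32 * SIZE and BO 64 * SIZE per pass. */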
.L62:
	mulps	%xmm0, %xmm1
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
	mulps	-28 * SIZE(BO), %xmm0
	addps	%xmm1, %xmm8
	movaps	-24 * SIZE(BO), %xmm1
	addps	%xmm0, %xmm9
	movaps	-28 * SIZE(AO), %xmm0

	mulps	%xmm0, %xmm1
	mulps	-20 * SIZE(BO), %xmm0
	addps	%xmm1, %xmm10
	movaps	 32 * SIZE(BO), %xmm1
	addps	%xmm0, %xmm11
	movaps	-24 * SIZE(AO), %xmm0

	mulps	%xmm0, %xmm3
	mulps	-12 * SIZE(BO), %xmm0
	addps	%xmm3, %xmm8
	movaps	 -8 * SIZE(BO), %xmm3
	addps	%xmm0, %xmm9
	movaps	-20 * SIZE(AO), %xmm0

	mulps	%xmm0, %xmm3
	mulps	 -4 * SIZE(BO), %xmm0
	addps	%xmm3, %xmm10
	movaps	 48 * SIZE(BO), %xmm3
	addps	%xmm0, %xmm11
	movaps	  0 * SIZE(AO), %xmm0

	PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)

	mulps	%xmm2, %xmm5
	mulps	  4 * SIZE(BO), %xmm2
	addps	%xmm5, %xmm8
	movaps	  8 * SIZE(BO), %xmm5
	addps	%xmm2, %xmm9
	movaps	-12 * SIZE(AO), %xmm2

	mulps	%xmm2, %xmm5
	mulps	 12 * SIZE(BO), %xmm2
	addps	%xmm5, %xmm10
	movaps	 64 * SIZE(BO), %xmm5
	addps	%xmm2, %xmm11
	movaps	 -8 * SIZE(AO), %xmm2

	mulps	%xmm2, %xmm7
	mulps	 20 * SIZE(BO), %xmm2
	addps	%xmm7, %xmm8
	movaps	 24 * SIZE(BO), %xmm7
	addps	%xmm2, %xmm9
	movaps	 -4 * SIZE(AO), %xmm2

	mulps	%xmm2, %xmm7
	mulps	 28 * SIZE(BO), %xmm2
	addps	%xmm7, %xmm10
	movaps	 80 * SIZE(BO), %xmm7
	addps	%xmm2, %xmm11
	movaps	 16 * SIZE(AO), %xmm2

	subq	$-32 * SIZE, AO
	subq	$-64 * SIZE, BO
	decq	%rax
	jne	.L62
	ALIGN_4
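/* .L65/.L66: scalar tail (K & 7) for the 2x1 block, one k-iteration
   per pass. */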
.L65:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movaps	ALPHA_R, %xmm6
	movaps	ALPHA_I, %xmm7
	andq	$7, %rax		# k & 7 remainder
	BRANCH
	je	.L68
	ALIGN_4

.L66:
	mulps	%xmm0, %xmm1
	mulps	-28 * SIZE(BO), %xmm0
	addps	%xmm1, %xmm8
	movaps	-24 * SIZE(BO), %xmm1
	addps	%xmm0, %xmm9
	movaps	-28 * SIZE(AO), %xmm0

	addq	$4 * SIZE, AO		# aoffset  += 4
	addq	$8 * SIZE, BO		# boffset1 += 8
	decq	%rax
	jg	.L66
	ALIGN_4
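/* .L68: fold the two accumulator pairs (xmm10 -> xmm8, xmm11 -> xmm9),
   apply the conjugation-dependent combine and the ALPHA scaling,
   optionally add C, and store two complex elements. */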
.L68:
#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0
	movhps	2 * SIZE(CO1), %xmm0
#endif

	addps	%xmm10, %xmm8
	addps	%xmm11, %xmm9

	shufps	$0xb1, %xmm9, %xmm9

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
	defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subps	%xmm9, %xmm8
#else
	addps	%xmm9, %xmm8
#endif

	movaps	%xmm8, %xmm9
	shufps	$0xb1, %xmm8, %xmm8
	mulps	%xmm6, %xmm9
	mulps	%xmm7, %xmm8
	addps	%xmm9, %xmm8

#ifndef TRMMKERNEL
	addps	%xmm0, %xmm8
#endif

	movsd	%xmm8, 0 * SIZE(CO1)
	movhps	%xmm8, 2 * SIZE(CO1)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, 8), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK
#endif

	addq	$4 * SIZE, CO1		# coffset += 2 complex elements
	ALIGN_4
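/* .L70: M & 1 tail -- a single complex element of A per k-iteration,
   loaded with movsd into the low half of the register. */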
.L70:
	testq	$1, M
	je	.L999

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leaq	32 * SIZE + BUFFER, BO
#else
	leaq	32 * SIZE + BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, 8), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 4), BO
#endif

	movaps	-32 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movaps	-24 * SIZE(AO), %xmm2
	pxor	%xmm9, %xmm9
	movaps	-32 * SIZE(BO), %xmm1
	pxor	%xmm10, %xmm10
	movaps	-16 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11
	movaps	  0 * SIZE(BO), %xmm5
	movaps	 16 * SIZE(BO), %xmm7

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax		# MR == 1
#else
	addq	$1, %rax		# NR == 1, so the offset is the same
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L75
	ALIGN_4
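/* .L72: inner loop for the 1x1 block, unrolled 8x over k, spreading
   the products over four accumulators (xmm8..xmm11) that are folded
   together at .L78. */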
.L72:
	mulps	%xmm0, %xmm1
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
	addps	%xmm1, %xmm8
	movaps	-28 * SIZE(BO), %xmm1
	mulps	%xmm0, %xmm1
	movsd	-30 * SIZE(AO), %xmm0
	addps	%xmm1, %xmm9
	movaps	-24 * SIZE(BO), %xmm1

	mulps	%xmm0, %xmm1
	addps	%xmm1, %xmm10
	movaps	-20 * SIZE(BO), %xmm1
	mulps	%xmm0, %xmm1
	movsd	-28 * SIZE(AO), %xmm0
	addps	%xmm1, %xmm11
	movaps	 32 * SIZE(BO), %xmm1

	mulps	%xmm0, %xmm3
	addps	%xmm3, %xmm8
	movaps	-12 * SIZE(BO), %xmm3
	mulps	%xmm0, %xmm3
	movsd	-26 * SIZE(AO), %xmm0
	addps	%xmm3, %xmm9
	movaps	 -8 * SIZE(BO), %xmm3

	mulps	%xmm0, %xmm3
	addps	%xmm3, %xmm10
	movaps	 -4 * SIZE(BO), %xmm3
	mulps	%xmm0, %xmm3
	movsd	-16 * SIZE(AO), %xmm0
	addps	%xmm3, %xmm11
	movaps	 48 * SIZE(BO), %xmm3

	mulps	%xmm2, %xmm5
	addps	%xmm5, %xmm8
	movaps	  4 * SIZE(BO), %xmm5
	mulps	%xmm2, %xmm5
	movsd	-22 * SIZE(AO), %xmm2
	addps	%xmm5, %xmm9
	movaps	  8 * SIZE(BO), %xmm5

	mulps	%xmm2, %xmm5
	addps	%xmm5, %xmm10
	movaps	 12 * SIZE(BO), %xmm5
	mulps	%xmm2, %xmm5
	movsd	-20 * SIZE(AO), %xmm2
	addps	%xmm5, %xmm11
	movaps	 64 * SIZE(BO), %xmm5

	mulps	%xmm2, %xmm7
	addps	%xmm7, %xmm8
	movaps	 20 * SIZE(BO), %xmm7
	mulps	%xmm2, %xmm7
	movsd	-18 * SIZE(AO), %xmm2
	addps	%xmm7, %xmm9
	movaps	 24 * SIZE(BO), %xmm7

	mulps	%xmm2, %xmm7
	addps	%xmm7, %xmm10
	movaps	 28 * SIZE(BO), %xmm7
	mulps	%xmm2, %xmm7
	movsd	 -8 * SIZE(AO), %xmm2
	addps	%xmm7, %xmm11
	movaps	 80 * SIZE(BO), %xmm7

	subq	$-16 * SIZE, AO
	subq	$-64 * SIZE, BO
	decq	%rax
	jne	.L72
	ALIGN_4
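/* .L75/.L76: scalar tail (K & 7) for the 1x1 block. */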
.L75:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movaps	ALPHA_R, %xmm6
	movaps	ALPHA_I, %xmm7
	andq	$7, %rax		# k & 7 remainder
	BRANCH
	je	.L78
	ALIGN_4

.L76:
	mulps	%xmm0, %xmm1
	addps	%xmm1, %xmm8
	movaps	-28 * SIZE(BO), %xmm1
	mulps	%xmm0, %xmm1
	movsd	-30 * SIZE(AO), %xmm0
	addps	%xmm1, %xmm9
	movaps	-24 * SIZE(BO), %xmm1

	addq	$2 * SIZE, AO
	addq	$8 * SIZE, BO
	decq	%rax
	jg	.L76
	ALIGN_4
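/* .L78: fold the four accumulators, apply ALPHA, optionally add C,
   and store the single complex result through CO1. */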
.L78:
#ifndef TRMMKERNEL
#ifdef movsd
	xorps	%xmm0, %xmm0
#endif
	movsd	0 * SIZE(CO1), %xmm0
#endif

	addps	%xmm10, %xmm8
	addps	%xmm11, %xmm9

	shufps	$0xb1, %xmm9, %xmm9

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
	defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subps	%xmm9, %xmm8
#else
	addps	%xmm9, %xmm8
#endif

	movaps	%xmm8, %xmm9
	shufps	$0xb1, %xmm8, %xmm8
	mulps	%xmm6, %xmm9
	mulps	%xmm7, %xmm8
	addps	%xmm9, %xmm8

#ifndef TRMMKERNEL
	addps	%xmm0, %xmm8
#endif

	movlps	%xmm8, 0 * SIZE(CO1)
	ALIGN_4
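/* .L999: restore the callee-saved registers (plus rdi/rsi and
   xmm6-xmm15 under the Windows ABI), release the stack frame and
   return. */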
.L999:
	movq	%rbx, %rsp
	movq	 0(%rsp), %rbx
	movq	 8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	48(%rsp), %rdi
	movq	56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE