You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemm_kernel_4x2_sse3.S 47 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /*
   * Register aliases used by the kernel body (AT&T operand order: src, dst).
   *
   *   M, N, K  - problem dimensions; the body derives its loop counts from
   *              them (I = M >> 2, J = N >> 1, K drives the inner loop).
   *   A        - base of the A operand; copied into AO for each column block.
   *   B        - source pointer that is read (and advanced) while filling
   *              BUFFER.
   *   C, LDC   - output base and the stride between CO1 and CO2; LDC is
   *              scaled by ZBASE_SHIFT in the prologue.
   *   I        - counter for the loop over M.
   *   AO, BO   - running pointers into A and into the buffered copy of B;
   *              every KERNELn macro addresses its operands through them.
   *   CO1, CO2 - the two output pointers (CO2 = CO1 + LDC).
   *   BB       - prefetch cursor, initialised from B + 112 * SIZE.
   *
   * %r12-%r15 and %rbp are saved by the prologue before being used here.
   */
  40. #define M %rdi
  41. #define N %rsi
  42. #define K %rdx
  43. #define A %rcx
  44. #define B %r8
  45. #define C %r9
  46. #define LDC %r10
  47. #define I %r11
  48. #define AO %r12
  49. #define BO %r13
  50. #define CO1 %r14
  51. #define CO2 %r15
  52. #define BB %rbp
  /*
   * STACKSIZE is the number of bytes the prologue subtracts from %rsp to
   * make room for the saved registers: 64 bytes normally, 256 bytes under
   * WINDOWS_ABI, where %rdi, %rsi and %xmm6-%xmm15 are saved as well.
   *
   * The OLD_* names (WINDOWS_ABI only) address arguments that arrive on the
   * stack. They are written relative to %rsp *after* the STACKSIZE
   * adjustment, hence the "+ STACKSIZE" in every displacement.
   */
  53. #ifndef WINDOWS_ABI
  54. #define STACKSIZE 64
  55. #else
  56. #define STACKSIZE 256
  57. #define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
  58. #define OLD_A 48 + STACKSIZE(%rsp)
  59. #define OLD_B 56 + STACKSIZE(%rsp)
  60. #define OLD_C 64 + STACKSIZE(%rsp)
  61. #define OLD_LDC 72 + STACKSIZE(%rsp)
  62. #define OLD_OFFSET 80 + STACKSIZE(%rsp)
  63. #endif
  /*
   * Local scratch slots. These are relative to the *realigned* %rsp that the
   * prologue sets up (old %rsp is kept in %rbx, then %rsp is lowered by
   * 128 + LOCAL_BUFFER_SIZE and masked with -4096), so they do not overlap
   * the register save area even though the displacements look similar.
   *
   *   ALPHA_R  - 16-byte slot written with one movaps (alpha broadcast).
   *   ALPHA_I  - 16-byte slot written as four floats at +0/+4/+8/+12; the
   *              values at +0 and +8 have their sign bit flipped first.
   *   J        - counter for the loop over N (initialised to N >> 1).
   *   OFFSET, KK, KKK - only touched under TRMMKERNEL.
   *   BUFFER   - start of the area into which B is copied (with each pair
   *              of floats duplicated by movddup) before the inner loops.
   */
  64. #define ALPHA_R 0(%rsp)
  65. #define ALPHA_I 16(%rsp)
  66. #define J 32(%rsp)
  67. #define OFFSET 40(%rsp)
  68. #define KK 48(%rsp)
  69. #define KKK 56(%rsp)
  70. #define BUFFER 128(%rsp)
  /*
   * PREFETCH / PREFETCHSIZE: instruction and look-ahead distance used by the
   * KERNELn macros when prefetching from AO. PREFETCHSIZE is counted in
   * elements; the macros multiply it by SIZE.
   *
   * ADDSUB: the instruction used to fold the "movshdup" products into the
   * odd accumulators (%xmm1/3/5/7). It is addps for the NN/NT/TN/TT/RN/RT/
   * CN/CT build variants and subps for every other variant.
   * NOTE(review): presumably this encodes the sign change needed when one
   * operand is conjugated - confirm against the variant definitions.
   */
  71. #define PREFETCH prefetcht0
  72. #define PREFETCHSIZE 320
  73. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  74. defined(RN) || defined(RT) || defined(CN) || defined(CT)
  75. #define ADDSUB addps
  76. #else
  77. #define ADDSUB subps
  78. #endif
  /*
   * KERNEL1(address): step 1 of the 16-step unrolled inner loop.
   * All displacements are in units of SIZE and are added to
   * (address) * 2 * SIZE.
   *
   * On entry: %xmm8 = A vector at AO+0, %xmm9 = movsldup of B at BO+0
   *           (both must already be loaded; this macro does not load them).
   * Updates:  %xmm0 += %xmm8 * movsldup(BO+0)   (even elements duplicated)
   *           %xmm1 ADDSUB= %xmm8 * movshdup(BO+0)   (odd elements duplicated)
   *           %xmm2 += %xmm8 * movsldup(BO+4)
   *           %xmm3 ADDSUB= %xmm8 * movshdup(BO+4)
   * Also issues PREFETCH for AO at a distance of PREFETCHSIZE elements.
   * On exit:  %xmm8 = A vector at AO+4, %xmm9 = movsldup of BO+0, which is
   *           exactly what KERNEL2 consumes.
   * The A loads use movaps, so the AO addresses must be 16-byte aligned.
   */
  79. #define KERNEL1(address) \
  80. mulps %xmm8, %xmm9; \
  81. PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO); \
  82. addps %xmm9, %xmm0; \
  83. movshdup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
  84. mulps %xmm8, %xmm9; \
  85. ADDSUB %xmm9, %xmm1; \
  86. movsldup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
  87. mulps %xmm8, %xmm9; \
  88. addps %xmm9, %xmm2; \
  89. movshdup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
  90. mulps %xmm8, %xmm9; \
  91. movaps 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8; \
  92. ADDSUB %xmm9, %xmm3; \
  93. movsldup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9
  /*
   * KERNEL2(address): step 2 of the 16-step unrolled inner loop.
   * Displacements are in units of SIZE, added to (address) * 2 * SIZE.
   *
   * On entry: %xmm8 = A vector at AO+4, %xmm9 = movsldup of B at BO+0.
   * Updates:  %xmm4 += %xmm8 * movsldup(BO+0)
   *           %xmm5 ADDSUB= %xmm8 * movshdup(BO+0)
   *           %xmm6 += %xmm8 * movsldup(BO+4)
   *           %xmm7 ADDSUB= %xmm8 * movshdup(BO+4)
   * On exit:  %xmm8 = A vector at AO+8, %xmm9 = movsldup of BO+8
   *           (the inputs KERNEL3 expects).
   */
  94. #define KERNEL2(address) \
  95. mulps %xmm8, %xmm9; \
  96. addps %xmm9, %xmm4; \
  97. movshdup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
  98. mulps %xmm8, %xmm9; \
  99. ADDSUB %xmm9, %xmm5; \
  100. movsldup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
  101. mulps %xmm8, %xmm9; \
  102. addps %xmm9, %xmm6; \
  103. movshdup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
  104. mulps %xmm8, %xmm9; \
  105. movaps 8 * SIZE + (address) * 2 * SIZE(AO), %xmm8; \
  106. ADDSUB %xmm9, %xmm7; \
  107. movsldup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm9
  /*
   * KERNEL3(address): step 3 of the 16-step unrolled inner loop.
   * Displacements are in units of SIZE, added to (address) * 2 * SIZE.
   *
   * On entry: %xmm8 = A vector at AO+8, %xmm9 = movsldup of B at BO+8.
   * Updates:  %xmm0 += %xmm8 * movsldup(BO+8)
   *           %xmm1 ADDSUB= %xmm8 * movshdup(BO+8)
   *           %xmm2 += %xmm8 * movsldup(BO+12)
   *           %xmm3 ADDSUB= %xmm8 * movshdup(BO+12)
   * On exit:  %xmm8 = A vector at AO+12, %xmm9 = movsldup of BO+8
   *           (the inputs KERNEL4 expects).
   */
  108. #define KERNEL3(address) \
  109. mulps %xmm8, %xmm9; \
  110. addps %xmm9, %xmm0; \
  111. movshdup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
  112. mulps %xmm8, %xmm9; \
  113. ADDSUB %xmm9, %xmm1; \
  114. movsldup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
  115. mulps %xmm8, %xmm9; \
  116. addps %xmm9, %xmm2; \
  117. movshdup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
  118. mulps %xmm8, %xmm9; \
  119. movaps 12 * SIZE + (address) * 2 * SIZE(AO), %xmm8; \
  120. ADDSUB %xmm9, %xmm3; \
  121. movsldup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm9
  /*
   * KERNEL4(address): step 4 of the 16-step unrolled inner loop; last user
   * of the %xmm8/%xmm9 register pair within one 16-step group.
   * Displacements are in units of SIZE, added to (address) * 2 * SIZE.
   *
   * On entry: %xmm8 = A vector at AO+12, %xmm9 = movsldup of B at BO+8.
   * Updates:  %xmm4 += %xmm8 * movsldup(BO+8)
   *           %xmm5 ADDSUB= %xmm8 * movshdup(BO+8)
   *           %xmm6 += %xmm8 * movsldup(BO+12)
   *           %xmm7 ADDSUB= %xmm8 * movshdup(BO+12)
   * On exit:  %xmm8 = A vector at AO+64, %xmm9 = movsldup of BO+64.
   *           The body invokes the macros with address = 32 * n, and
   *           32 * 2 = 64, so these are the operands at displacement 0 of
   *           the following 16-step group.
   */
  122. #define KERNEL4(address) \
  123. mulps %xmm8, %xmm9; \
  124. addps %xmm9, %xmm4; \
  125. movshdup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
  126. mulps %xmm8, %xmm9; \
  127. ADDSUB %xmm9, %xmm5; \
  128. movsldup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
  129. mulps %xmm8, %xmm9; \
  130. addps %xmm9, %xmm6; \
  131. movshdup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
  132. mulps %xmm8, %xmm9; \
  133. movaps 64 * SIZE + (address) * 2 * SIZE(AO), %xmm8; \
  134. ADDSUB %xmm9, %xmm7; \
  135. movsldup 64 * SIZE + (address) * 2 * SIZE(BO), %xmm9
  /*
   * KERNEL5(address): step 5 of the 16-step unrolled inner loop; first user
   * of the %xmm10/%xmm11 register pair.
   * Displacements are in units of SIZE, added to (address) * 2 * SIZE.
   *
   * On entry: %xmm10 = A vector at AO+16, %xmm11 = movsldup of B at BO+16
   *           (both must already be loaded; this macro does not load them).
   * Updates:  %xmm0 += %xmm10 * movsldup(BO+16)
   *           %xmm1 ADDSUB= %xmm10 * movshdup(BO+16)
   *           %xmm2 += %xmm10 * movsldup(BO+20)
   *           %xmm3 ADDSUB= %xmm10 * movshdup(BO+20)
   * On exit:  %xmm10 = A vector at AO+20, %xmm11 = movsldup of BO+16
   *           (the inputs KERNEL6 expects).
   */
  136. #define KERNEL5(address) \
  137. mulps %xmm10, %xmm11; \
  138. addps %xmm11, %xmm0; \
  139. movshdup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
  140. mulps %xmm10, %xmm11; \
  141. ADDSUB %xmm11, %xmm1; \
  142. movsldup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
  143. mulps %xmm10, %xmm11; \
  144. addps %xmm11, %xmm2; \
  145. movshdup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
  146. mulps %xmm10, %xmm11; \
  147. movaps 20 * SIZE + (address) * 2 * SIZE(AO), %xmm10; \
  148. ADDSUB %xmm11, %xmm3; \
  149. movsldup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm11
  /*
   * KERNEL6(address): step 6 of the 16-step unrolled inner loop.
   * Displacements are in units of SIZE, added to (address) * 2 * SIZE.
   *
   * On entry: %xmm10 = A vector at AO+20, %xmm11 = movsldup of B at BO+16.
   * Updates:  %xmm4 += %xmm10 * movsldup(BO+16)
   *           %xmm5 ADDSUB= %xmm10 * movshdup(BO+16)
   *           %xmm6 += %xmm10 * movsldup(BO+20)
   *           %xmm7 ADDSUB= %xmm10 * movshdup(BO+20)
   * On exit:  %xmm10 = A vector at AO+24, %xmm11 = movsldup of BO+24
   *           (the inputs KERNEL7 expects).
   */
  150. #define KERNEL6(address) \
  151. mulps %xmm10, %xmm11; \
  152. addps %xmm11, %xmm4; \
  153. movshdup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
  154. mulps %xmm10, %xmm11; \
  155. ADDSUB %xmm11, %xmm5; \
  156. movsldup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
  157. mulps %xmm10, %xmm11; \
  158. addps %xmm11, %xmm6; \
  159. movshdup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
  160. mulps %xmm10, %xmm11; \
  161. movaps 24 * SIZE + (address) * 2 * SIZE(AO), %xmm10; \
  162. ADDSUB %xmm11, %xmm7; \
  163. movsldup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm11
  /*
   * KERNEL7(address): step 7 of the 16-step unrolled inner loop.
   * Displacements are in units of SIZE, added to (address) * 2 * SIZE.
   *
   * On entry: %xmm10 = A vector at AO+24, %xmm11 = movsldup of B at BO+24.
   * Updates:  %xmm0 += %xmm10 * movsldup(BO+24)
   *           %xmm1 ADDSUB= %xmm10 * movshdup(BO+24)
   *           %xmm2 += %xmm10 * movsldup(BO+28)
   *           %xmm3 ADDSUB= %xmm10 * movshdup(BO+28)
   * On exit:  %xmm10 = A vector at AO+28, %xmm11 = movsldup of BO+24
   *           (the inputs KERNEL8 expects).
   */
  164. #define KERNEL7(address) \
  165. mulps %xmm10, %xmm11; \
  166. addps %xmm11, %xmm0; \
  167. movshdup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
  168. mulps %xmm10, %xmm11; \
  169. ADDSUB %xmm11, %xmm1; \
  170. movsldup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
  171. mulps %xmm10, %xmm11; \
  172. addps %xmm11, %xmm2; \
  173. movshdup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
  174. mulps %xmm10, %xmm11; \
  175. movaps 28 * SIZE + (address) * 2 * SIZE(AO), %xmm10; \
  176. ADDSUB %xmm11, %xmm3; \
  177. movsldup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm11
  /*
   * KERNEL8(address): step 8 of the 16-step unrolled inner loop; last user
   * of the %xmm10/%xmm11 register pair within one 16-step group.
   * Displacements are in units of SIZE, added to (address) * 2 * SIZE.
   *
   * On entry: %xmm10 = A vector at AO+28, %xmm11 = movsldup of B at BO+24.
   * Updates:  %xmm4 += %xmm10 * movsldup(BO+24)
   *           %xmm5 ADDSUB= %xmm10 * movshdup(BO+24)
   *           %xmm6 += %xmm10 * movsldup(BO+28)
   *           %xmm7 ADDSUB= %xmm10 * movshdup(BO+28)
   * On exit:  %xmm10 = A vector at AO+80, %xmm11 = movsldup of BO+80.
   *           80 = 16 + 64, i.e. the operands at displacement 16 of the
   *           following 16-step group (address + 32).
   */
  178. #define KERNEL8(address) \
  179. mulps %xmm10, %xmm11; \
  180. addps %xmm11, %xmm4; \
  181. movshdup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
  182. mulps %xmm10, %xmm11; \
  183. ADDSUB %xmm11, %xmm5; \
  184. movsldup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
  185. mulps %xmm10, %xmm11; \
  186. addps %xmm11, %xmm6; \
  187. movshdup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
  188. mulps %xmm10, %xmm11; \
  189. movaps 80 * SIZE + (address) * 2 * SIZE(AO), %xmm10; \
  190. ADDSUB %xmm11, %xmm7; \
  191. movsldup 80 * SIZE + (address) * 2 * SIZE(BO), %xmm11
  /*
   * KERNEL9(address): step 9 of the 16-step unrolled inner loop; first user
   * of the %xmm12/%xmm13 register pair.
   * Displacements are in units of SIZE, added to (address) * 2 * SIZE.
   *
   * On entry: %xmm12 = A vector at AO+32, %xmm13 = movsldup of B at BO+32
   *           (both must already be loaded; this macro does not load them).
   * Updates:  %xmm0 += %xmm12 * movsldup(BO+32)
   *           %xmm1 ADDSUB= %xmm12 * movshdup(BO+32)
   *           %xmm2 += %xmm12 * movsldup(BO+36)
   *           %xmm3 ADDSUB= %xmm12 * movshdup(BO+36)
   * Also issues PREFETCH for AO at a distance of PREFETCHSIZE + 32 elements.
   * On exit:  %xmm12 = A vector at AO+36, %xmm13 = movsldup of BO+32
   *           (the inputs KERNEL10 expects).
   */
  192. #define KERNEL9(address) \
  193. mulps %xmm12, %xmm13; \
  194. PREFETCH (PREFETCHSIZE + 32) * SIZE + (address) * 2 * SIZE(AO); \
  195. addps %xmm13, %xmm0; \
  196. movshdup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
  197. mulps %xmm12, %xmm13; \
  198. ADDSUB %xmm13, %xmm1; \
  199. movsldup 36 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
  200. mulps %xmm12, %xmm13; \
  201. addps %xmm13, %xmm2; \
  202. movshdup 36 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
  203. mulps %xmm12, %xmm13; \
  204. movaps 36 * SIZE + (address) * 2 * SIZE(AO), %xmm12; \
  205. ADDSUB %xmm13, %xmm3; \
  206. movsldup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm13
  /*
   * KERNEL10(address): step 10 of the 16-step unrolled inner loop.
   * Displacements are in units of SIZE, added to (address) * 2 * SIZE.
   *
   * On entry: %xmm12 = A vector at AO+36, %xmm13 = movsldup of B at BO+32.
   * Updates:  %xmm4 += %xmm12 * movsldup(BO+32)
   *           %xmm5 ADDSUB= %xmm12 * movshdup(BO+32)
   *           %xmm6 += %xmm12 * movsldup(BO+36)
   *           %xmm7 ADDSUB= %xmm12 * movshdup(BO+36)
   * On exit:  %xmm12 = A vector at AO+40, %xmm13 = movsldup of BO+40
   *           (the inputs KERNEL11 expects).
   */
  207. #define KERNEL10(address) \
  208. mulps %xmm12, %xmm13; \
  209. addps %xmm13, %xmm4; \
  210. movshdup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
  211. mulps %xmm12, %xmm13; \
  212. ADDSUB %xmm13, %xmm5; \
  213. movsldup 36 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
  214. mulps %xmm12, %xmm13; \
  215. addps %xmm13, %xmm6; \
  216. movshdup 36 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
  217. mulps %xmm12, %xmm13; \
  218. movaps 40 * SIZE + (address) * 2 * SIZE(AO), %xmm12; \
  219. ADDSUB %xmm13, %xmm7; \
  220. movsldup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm13
  /*
   * KERNEL11(address): step 11 of the 16-step unrolled inner loop.
   * Displacements are in units of SIZE, added to (address) * 2 * SIZE.
   *
   * On entry: %xmm12 = A vector at AO+40, %xmm13 = movsldup of B at BO+40.
   * Updates:  %xmm0 += %xmm12 * movsldup(BO+40)
   *           %xmm1 ADDSUB= %xmm12 * movshdup(BO+40)
   *           %xmm2 += %xmm12 * movsldup(BO+44)
   *           %xmm3 ADDSUB= %xmm12 * movshdup(BO+44)
   * On exit:  %xmm12 = A vector at AO+44, %xmm13 = movsldup of BO+40
   *           (the inputs KERNEL12 expects).
   */
  221. #define KERNEL11(address) \
  222. mulps %xmm12, %xmm13; \
  223. addps %xmm13, %xmm0; \
  224. movshdup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
  225. mulps %xmm12, %xmm13; \
  226. ADDSUB %xmm13, %xmm1; \
  227. movsldup 44 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
  228. mulps %xmm12, %xmm13; \
  229. addps %xmm13, %xmm2; \
  230. movshdup 44 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
  231. mulps %xmm12, %xmm13; \
  232. movaps 44 * SIZE + (address) * 2 * SIZE(AO), %xmm12; \
  233. ADDSUB %xmm13, %xmm3; \
  234. movsldup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm13
  /*
   * KERNEL12(address): step 12 of the 16-step unrolled inner loop; last user
   * of the %xmm12/%xmm13 register pair within one 16-step group.
   * Displacements are in units of SIZE, added to (address) * 2 * SIZE.
   *
   * On entry: %xmm12 = A vector at AO+44, %xmm13 = movsldup of B at BO+40.
   * Updates:  %xmm4 += %xmm12 * movsldup(BO+40)
   *           %xmm5 ADDSUB= %xmm12 * movshdup(BO+40)
   *           %xmm6 += %xmm12 * movsldup(BO+44)
   *           %xmm7 ADDSUB= %xmm12 * movshdup(BO+44)
   * On exit:  %xmm12 = A vector at AO+96, %xmm13 = movsldup of BO+96.
   *           96 = 32 + 64, i.e. the operands at displacement 32 of the
   *           following 16-step group (address + 32).
   */
  235. #define KERNEL12(address) \
  236. mulps %xmm12, %xmm13; \
  237. addps %xmm13, %xmm4; \
  238. movshdup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
  239. mulps %xmm12, %xmm13; \
  240. ADDSUB %xmm13, %xmm5; \
  241. movsldup 44 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
  242. mulps %xmm12, %xmm13; \
  243. addps %xmm13, %xmm6; \
  244. movshdup 44 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
  245. mulps %xmm12, %xmm13; \
  246. movaps 96 * SIZE + (address) * 2 * SIZE(AO), %xmm12; \
  247. ADDSUB %xmm13, %xmm7; \
  248. movsldup 96 * SIZE + (address) * 2 * SIZE(BO), %xmm13
  /*
   * KERNEL13(address): step 13 of the 16-step unrolled inner loop; first
   * user of the %xmm14/%xmm15 register pair.
   * Displacements are in units of SIZE, added to (address) * 2 * SIZE.
   *
   * On entry: %xmm14 = A vector at AO+48, %xmm15 = movsldup of B at BO+48
   *           (both must already be loaded; this macro does not load them).
   * Updates:  %xmm0 += %xmm14 * movsldup(BO+48)
   *           %xmm1 ADDSUB= %xmm14 * movshdup(BO+48)
   *           %xmm2 += %xmm14 * movsldup(BO+52)
   *           %xmm3 ADDSUB= %xmm14 * movshdup(BO+52)
   * On exit:  %xmm14 = A vector at AO+52, %xmm15 = movsldup of BO+48
   *           (the inputs KERNEL14 expects).
   */
  249. #define KERNEL13(address) \
  250. mulps %xmm14, %xmm15; \
  251. addps %xmm15, %xmm0; \
  252. movshdup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
  253. mulps %xmm14, %xmm15; \
  254. ADDSUB %xmm15, %xmm1; \
  255. movsldup 52 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
  256. mulps %xmm14, %xmm15; \
  257. addps %xmm15, %xmm2; \
  258. movshdup 52 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
  259. mulps %xmm14, %xmm15; \
  260. movaps 52 * SIZE + (address) * 2 * SIZE(AO), %xmm14; \
  261. ADDSUB %xmm15, %xmm3; \
  262. movsldup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm15
  /*
   * KERNEL14(address): step 14 of the 16-step unrolled inner loop.
   * Displacements are in units of SIZE, added to (address) * 2 * SIZE.
   *
   * On entry: %xmm14 = A vector at AO+52, %xmm15 = movsldup of B at BO+48.
   * Updates:  %xmm4 += %xmm14 * movsldup(BO+48)
   *           %xmm5 ADDSUB= %xmm14 * movshdup(BO+48)
   *           %xmm6 += %xmm14 * movsldup(BO+52)
   *           %xmm7 ADDSUB= %xmm14 * movshdup(BO+52)
   * On exit:  %xmm14 = A vector at AO+56, %xmm15 = movsldup of BO+56
   *           (the inputs KERNEL15 expects).
   */
  263. #define KERNEL14(address) \
  264. mulps %xmm14, %xmm15; \
  265. addps %xmm15, %xmm4; \
  266. movshdup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
  267. mulps %xmm14, %xmm15; \
  268. ADDSUB %xmm15, %xmm5; \
  269. movsldup 52 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
  270. mulps %xmm14, %xmm15; \
  271. addps %xmm15, %xmm6; \
  272. movshdup 52 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
  273. mulps %xmm14, %xmm15; \
  274. movaps 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14; \
  275. ADDSUB %xmm15, %xmm7; \
  276. movsldup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15
  /*
   * KERNEL15(address): step 15 of the 16-step unrolled inner loop.
   * Displacements are in units of SIZE, added to (address) * 2 * SIZE.
   *
   * On entry: %xmm14 = A vector at AO+56, %xmm15 = movsldup of B at BO+56.
   * Updates:  %xmm0 += %xmm14 * movsldup(BO+56)
   *           %xmm1 ADDSUB= %xmm14 * movshdup(BO+56)
   *           %xmm2 += %xmm14 * movsldup(BO+60)
   *           %xmm3 ADDSUB= %xmm14 * movshdup(BO+60)
   * On exit:  %xmm14 = A vector at AO+60, %xmm15 = movsldup of BO+56
   *           (the inputs KERNEL16 expects).
   */
  277. #define KERNEL15(address) \
  278. mulps %xmm14, %xmm15; \
  279. addps %xmm15, %xmm0; \
  280. movshdup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
  281. mulps %xmm14, %xmm15; \
  282. ADDSUB %xmm15, %xmm1; \
  283. movsldup 60 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
  284. mulps %xmm14, %xmm15; \
  285. addps %xmm15, %xmm2; \
  286. movshdup 60 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
  287. mulps %xmm14, %xmm15; \
  288. movaps 60 * SIZE + (address) * 2 * SIZE(AO), %xmm14; \
  289. ADDSUB %xmm15, %xmm3; \
  290. movsldup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15
  /*
   * KERNEL16(address): step 16 (last) of the 16-step unrolled inner loop;
   * last user of the %xmm14/%xmm15 register pair within one group.
   * Displacements are in units of SIZE, added to (address) * 2 * SIZE.
   *
   * On entry: %xmm14 = A vector at AO+60, %xmm15 = movsldup of B at BO+56.
   * Updates:  %xmm4 += %xmm14 * movsldup(BO+56)
   *           %xmm5 ADDSUB= %xmm14 * movshdup(BO+56)
   *           %xmm6 += %xmm14 * movsldup(BO+60)
   *           %xmm7 ADDSUB= %xmm14 * movshdup(BO+60)
   * On exit:  %xmm14 = A vector at AO+112, %xmm15 = movsldup of BO+112.
   *           112 = 48 + 64, i.e. the operands at displacement 48 of the
   *           following 16-step group (address + 32).
   */
  291. #define KERNEL16(address) \
  292. mulps %xmm14, %xmm15; \
  293. addps %xmm15, %xmm4; \
  294. movshdup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
  295. mulps %xmm14, %xmm15; \
  296. ADDSUB %xmm15, %xmm5; \
  297. movsldup 60 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
  298. mulps %xmm14, %xmm15; \
  299. addps %xmm15, %xmm6; \
  300. movshdup 60 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
  301. mulps %xmm14, %xmm15; \
  302. movaps 112 * SIZE + (address) * 2 * SIZE(AO), %xmm14; \
  303. ADDSUB %xmm15, %xmm7; \
  304. movsldup 112 * SIZE + (address) * 2 * SIZE(BO), %xmm15
  305. PROLOGUE
  306. PROFCODE
  307. subq $STACKSIZE, %rsp
  308. movq %rbx, 0(%rsp)
  309. movq %rbp, 8(%rsp)
  310. movq %r12, 16(%rsp)
  311. movq %r13, 24(%rsp)
  312. movq %r14, 32(%rsp)
  313. movq %r15, 40(%rsp)
  314. #ifdef WINDOWS_ABI
  315. movq %rdi, 48(%rsp)
  316. movq %rsi, 56(%rsp)
  317. movups %xmm6, 64(%rsp)
  318. movups %xmm7, 80(%rsp)
  319. movups %xmm8, 96(%rsp)
  320. movups %xmm9, 112(%rsp)
  321. movups %xmm10, 128(%rsp)
  322. movups %xmm11, 144(%rsp)
  323. movups %xmm12, 160(%rsp)
  324. movups %xmm13, 176(%rsp)
  325. movups %xmm14, 192(%rsp)
  326. movups %xmm15, 208(%rsp)
  327. movq ARG1, M
  328. movq ARG2, N
  329. movq ARG3, K
  330. movq OLD_A, A
  331. movq OLD_B, B
  332. movq OLD_C, C
  333. movq OLD_LDC, LDC
  334. #ifdef TRMMKERNEL
  335. movsd OLD_OFFSET, %xmm4
  336. #endif
  337. movaps %xmm3, %xmm0
  338. movsd OLD_ALPHA_I, %xmm1
  339. #else
  340. movq 72(%rsp), LDC
  341. #ifdef TRMMKERNEL
  342. movsd 80(%rsp), %xmm4
  343. #endif
  344. #endif
  345. movq %rsp, %rbx # save old stack
  346. subq $128 + LOCAL_BUFFER_SIZE, %rsp
  347. andq $-4096, %rsp # align stack
  348. STACK_TOUCHING
  349. pxor %xmm15, %xmm15
  350. cmpeqps %xmm15, %xmm15
  351. pslld $31, %xmm15 # Generate mask
  352. pxor %xmm2, %xmm2
  353. shufps $0, %xmm0, %xmm0
  354. movaps %xmm0, 0 + ALPHA_R
  355. movss %xmm1, 4 + ALPHA_I
  356. movss %xmm1, 12 + ALPHA_I
  357. xorps %xmm15, %xmm1
  358. movss %xmm1, 0 + ALPHA_I
  359. movss %xmm1, 8 + ALPHA_I
  360. #ifdef TRMMKERNEL
  361. movsd %xmm4, OFFSET
  362. movsd %xmm4, KK
  363. #ifndef LEFT
  364. negq KK
  365. #endif
  366. #endif
  367. salq $ZBASE_SHIFT, LDC
  368. movq N, J
  369. sarq $1, J # j = (n >> 2)
  370. jle .L40
  371. ALIGN_4
  372. .L01:
  373. #if defined(TRMMKERNEL) && defined(LEFT)
  374. movq OFFSET, %rax
  375. movq %rax, KK
  376. #endif
  377. /* Copying to Sub Buffer */
  378. leaq BUFFER, BO
  379. movq K, %rax
  380. sarq $2, %rax
  381. jle .L03
  382. ALIGN_4
  383. .L02:
  384. movddup 0 * SIZE(B), %xmm0
  385. movddup 2 * SIZE(B), %xmm1
  386. movddup 4 * SIZE(B), %xmm2
  387. movddup 6 * SIZE(B), %xmm3
  388. movddup 8 * SIZE(B), %xmm4
  389. movddup 10 * SIZE(B), %xmm5
  390. movddup 12 * SIZE(B), %xmm6
  391. movddup 14 * SIZE(B), %xmm7
  392. movaps %xmm0, 0 * SIZE(BO)
  393. movaps %xmm1, 4 * SIZE(BO)
  394. movaps %xmm2, 8 * SIZE(BO)
  395. movaps %xmm3, 12 * SIZE(BO)
  396. movaps %xmm4, 16 * SIZE(BO)
  397. movaps %xmm5, 20 * SIZE(BO)
  398. movaps %xmm6, 24 * SIZE(BO)
  399. movaps %xmm7, 28 * SIZE(BO)
  400. prefetcht1 128 * SIZE(BO)
  401. prefetcht0 112 * SIZE(B)
  402. addq $16 * SIZE, B
  403. addq $32 * SIZE, BO
  404. decq %rax
  405. jne .L02
  406. ALIGN_4
  407. .L03:
  408. movq K, %rax
  409. andq $3, %rax
  410. BRANCH
  411. jle .L10
  412. ALIGN_4
  413. .L04:
  414. movddup 0 * SIZE(B), %xmm0
  415. movddup 2 * SIZE(B), %xmm1
  416. movaps %xmm0, 0 * SIZE(BO)
  417. movaps %xmm1, 4 * SIZE(BO)
  418. addq $4 * SIZE, B
  419. addq $8 * SIZE, BO
  420. decq %rax
  421. jne .L04
  422. ALIGN_4
  423. .L10:
  424. movq C, CO1 # coffset1 = c
  425. leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
  426. movq A, AO # aoffset = a
  427. leaq 112 * SIZE(B), BB
  428. movq M, I
  429. sarq $2, I # i = (m >> 2)
  430. jle .L20
  431. ALIGN_4
  432. .L11:
  433. prefetcht0 0 * SIZE(BB)
  434. subq $-8 * SIZE, BB
  435. #if !defined(TRMMKERNEL) || \
  436. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  437. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  438. leaq BUFFER, BO
  439. #else
  440. leaq BUFFER, BO
  441. movq KK, %rax
  442. leaq (, %rax, 8), %rax
  443. leaq (AO, %rax, 4), AO
  444. leaq (BO, %rax, 4), BO
  445. #endif
  446. movaps 0 * SIZE(AO), %xmm8
  447. pxor %xmm0, %xmm0
  448. movaps 16 * SIZE(AO), %xmm10
  449. pxor %xmm1, %xmm1
  450. movaps 32 * SIZE(AO), %xmm12
  451. pxor %xmm2, %xmm2
  452. movaps 48 * SIZE(AO), %xmm14
  453. pxor %xmm3, %xmm3
  454. movsldup 0 * SIZE(BO), %xmm9
  455. pxor %xmm4, %xmm4
  456. movsldup 16 * SIZE(BO), %xmm11
  457. pxor %xmm5, %xmm5
  458. movsldup 32 * SIZE(BO), %xmm13
  459. pxor %xmm6, %xmm6
  460. movsldup 48 * SIZE(BO), %xmm15
  461. pxor %xmm7, %xmm7
  462. prefetchnta 8 * SIZE(CO1)
  463. prefetchnta 8 * SIZE(CO2)
  464. #ifndef TRMMKERNEL
  465. movq K, %rax
  466. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  467. movq K, %rax
  468. subq KK, %rax
  469. movq %rax, KKK
  470. #else
  471. movq KK, %rax
  472. #ifdef LEFT
  473. addq $4, %rax
  474. #else
  475. addq $2, %rax
  476. #endif
  477. movq %rax, KKK
  478. #endif
  479. #if 1
  480. andq $-8, %rax
  481. salq $4, %rax
  482. je .L15
  483. .L1X:
  484. KERNEL1 (32 * 0)
  485. KERNEL2 (32 * 0)
  486. KERNEL3 (32 * 0)
  487. KERNEL4 (32 * 0)
  488. KERNEL5 (32 * 0)
  489. KERNEL6 (32 * 0)
  490. KERNEL7 (32 * 0)
  491. KERNEL8 (32 * 0)
  492. KERNEL9 (32 * 0)
  493. KERNEL10(32 * 0)
  494. KERNEL11(32 * 0)
  495. KERNEL12(32 * 0)
  496. KERNEL13(32 * 0)
  497. KERNEL14(32 * 0)
  498. KERNEL15(32 * 0)
  499. KERNEL16(32 * 0)
  500. cmpq $128 * 1, %rax
  501. jle .L12
  502. KERNEL1 (32 * 1)
  503. KERNEL2 (32 * 1)
  504. KERNEL3 (32 * 1)
  505. KERNEL4 (32 * 1)
  506. KERNEL5 (32 * 1)
  507. KERNEL6 (32 * 1)
  508. KERNEL7 (32 * 1)
  509. KERNEL8 (32 * 1)
  510. KERNEL9 (32 * 1)
  511. KERNEL10(32 * 1)
  512. KERNEL11(32 * 1)
  513. KERNEL12(32 * 1)
  514. KERNEL13(32 * 1)
  515. KERNEL14(32 * 1)
  516. KERNEL15(32 * 1)
  517. KERNEL16(32 * 1)
  518. cmpq $128 * 2, %rax
  519. jle .L12
  520. KERNEL1 (32 * 2)
  521. KERNEL2 (32 * 2)
  522. KERNEL3 (32 * 2)
  523. KERNEL4 (32 * 2)
  524. KERNEL5 (32 * 2)
  525. KERNEL6 (32 * 2)
  526. KERNEL7 (32 * 2)
  527. KERNEL8 (32 * 2)
  528. KERNEL9 (32 * 2)
  529. KERNEL10(32 * 2)
  530. KERNEL11(32 * 2)
  531. KERNEL12(32 * 2)
  532. KERNEL13(32 * 2)
  533. KERNEL14(32 * 2)
  534. KERNEL15(32 * 2)
  535. KERNEL16(32 * 2)
  536. cmpq $128 * 3, %rax
  537. jle .L12
  538. KERNEL1 (32 * 3)
  539. KERNEL2 (32 * 3)
  540. KERNEL3 (32 * 3)
  541. KERNEL4 (32 * 3)
  542. KERNEL5 (32 * 3)
  543. KERNEL6 (32 * 3)
  544. KERNEL7 (32 * 3)
  545. KERNEL8 (32 * 3)
  546. KERNEL9 (32 * 3)
  547. KERNEL10(32 * 3)
  548. KERNEL11(32 * 3)
  549. KERNEL12(32 * 3)
  550. KERNEL13(32 * 3)
  551. KERNEL14(32 * 3)
  552. KERNEL15(32 * 3)
  553. KERNEL16(32 * 3)
  554. cmpq $128 * 4, %rax
  555. jle .L12
  556. KERNEL1 (32 * 4)
  557. KERNEL2 (32 * 4)
  558. KERNEL3 (32 * 4)
  559. KERNEL4 (32 * 4)
  560. KERNEL5 (32 * 4)
  561. KERNEL6 (32 * 4)
  562. KERNEL7 (32 * 4)
  563. KERNEL8 (32 * 4)
  564. KERNEL9 (32 * 4)
  565. KERNEL10(32 * 4)
  566. KERNEL11(32 * 4)
  567. KERNEL12(32 * 4)
  568. KERNEL13(32 * 4)
  569. KERNEL14(32 * 4)
  570. KERNEL15(32 * 4)
  571. KERNEL16(32 * 4)
  572. cmpq $128 * 5, %rax
  573. jle .L12
  574. KERNEL1 (32 * 5)
  575. KERNEL2 (32 * 5)
  576. KERNEL3 (32 * 5)
  577. KERNEL4 (32 * 5)
  578. KERNEL5 (32 * 5)
  579. KERNEL6 (32 * 5)
  580. KERNEL7 (32 * 5)
  581. KERNEL8 (32 * 5)
  582. KERNEL9 (32 * 5)
  583. KERNEL10(32 * 5)
  584. KERNEL11(32 * 5)
  585. KERNEL12(32 * 5)
  586. KERNEL13(32 * 5)
  587. KERNEL14(32 * 5)
  588. KERNEL15(32 * 5)
  589. KERNEL16(32 * 5)
  590. cmpq $128 * 6, %rax
  591. jle .L12
  592. KERNEL1 (32 * 6)
  593. KERNEL2 (32 * 6)
  594. KERNEL3 (32 * 6)
  595. KERNEL4 (32 * 6)
  596. KERNEL5 (32 * 6)
  597. KERNEL6 (32 * 6)
  598. KERNEL7 (32 * 6)
  599. KERNEL8 (32 * 6)
  600. KERNEL9 (32 * 6)
  601. KERNEL10(32 * 6)
  602. KERNEL11(32 * 6)
  603. KERNEL12(32 * 6)
  604. KERNEL13(32 * 6)
  605. KERNEL14(32 * 6)
  606. KERNEL15(32 * 6)
  607. KERNEL16(32 * 6)
  608. cmpq $128 * 7, %rax
  609. jle .L12
  610. KERNEL1 (32 * 7)
  611. KERNEL2 (32 * 7)
  612. KERNEL3 (32 * 7)
  613. KERNEL4 (32 * 7)
  614. KERNEL5 (32 * 7)
  615. KERNEL6 (32 * 7)
  616. KERNEL7 (32 * 7)
  617. KERNEL8 (32 * 7)
  618. KERNEL9 (32 * 7)
  619. KERNEL10(32 * 7)
  620. KERNEL11(32 * 7)
  621. KERNEL12(32 * 7)
  622. KERNEL13(32 * 7)
  623. KERNEL14(32 * 7)
  624. KERNEL15(32 * 7)
  625. KERNEL16(32 * 7)
  626. addq $64 * 8 * SIZE, AO
  627. addq $64 * 8 * SIZE, BO
  628. subq $128 * 8, %rax
  629. jg .L1X
  630. .L12:
  631. leaq (AO, %rax, 2), AO # * 16
  632. leaq (BO, %rax, 2), BO # * 64
  633. #else
  634. sarq $3, %rax
  635. je .L15
  636. ALIGN_4
  637. .L12:
  638. KERNEL1 (32 * 0)
  639. KERNEL2 (32 * 0)
  640. KERNEL3 (32 * 0)
  641. KERNEL4 (32 * 0)
  642. KERNEL5 (32 * 0)
  643. KERNEL6 (32 * 0)
  644. KERNEL7 (32 * 0)
  645. KERNEL8 (32 * 0)
  646. KERNEL9 (32 * 0)
  647. KERNEL10(32 * 0)
  648. KERNEL11(32 * 0)
  649. KERNEL12(32 * 0)
  650. KERNEL13(32 * 0)
  651. KERNEL14(32 * 0)
  652. KERNEL15(32 * 0)
  653. KERNEL16(32 * 0)
  654. addq $64 * SIZE, AO
  655. addq $64 * SIZE, BO
  656. decq %rax
  657. jne .L12
  658. #endif
  659. ALIGN_4
  660. .L15:
  661. #ifndef TRMMKERNEL
  662. movq K, %rax
  663. #else
  664. movq KKK, %rax
  665. #endif
  666. movaps ALPHA_R, %xmm14
  667. movaps ALPHA_I, %xmm15
  668. andq $7, %rax # if (k & 1)
  669. BRANCH
  670. je .L18
  671. ALIGN_4
  672. .L16:
  673. mulps %xmm8, %xmm9
  674. addps %xmm9, %xmm0
  675. movshdup 0 * SIZE(BO), %xmm9
  676. mulps %xmm8, %xmm9
  677. ADDSUB %xmm9, %xmm1
  678. movsldup 4 * SIZE(BO), %xmm9
  679. mulps %xmm8, %xmm9
  680. addps %xmm9, %xmm2
  681. movshdup 4 * SIZE(BO), %xmm9
  682. mulps %xmm8, %xmm9
  683. movaps 4 * SIZE(AO), %xmm8
  684. ADDSUB %xmm9, %xmm3
  685. movsldup 0 * SIZE(BO), %xmm9
  686. mulps %xmm8, %xmm9
  687. addps %xmm9, %xmm4
  688. movshdup 0 * SIZE(BO), %xmm9
  689. mulps %xmm8, %xmm9
  690. ADDSUB %xmm9, %xmm5
  691. movsldup 4 * SIZE(BO), %xmm9
  692. mulps %xmm8, %xmm9
  693. addps %xmm9, %xmm6
  694. movshdup 4 * SIZE(BO), %xmm9
  695. mulps %xmm8, %xmm9
  696. movaps 8 * SIZE(AO), %xmm8
  697. ADDSUB %xmm9, %xmm7
  698. movsldup 8 * SIZE(BO), %xmm9
  699. addq $8 * SIZE, AO
  700. addq $8 * SIZE, BO
  701. decq %rax
  702. jg .L16
  703. ALIGN_4
  /*
   * .L18: write-back for one tile of the two-column pass.
   * Combines the accumulator pairs (xmm0,xmm1) (xmm2,xmm3) (xmm4,xmm5)
   * (xmm6,xmm7), scales them by xmm14/xmm15, optionally adds the current
   * contents of C, stores 8 * SIZE to each of CO1 and CO2, then loops back
   * to .L11 while I > 0.
   * NOTE(review): xmm14/xmm15 are loaded above this chunk; the sibling
   * paths (.L25, .L35, ...) load ALPHA_R/ALPHA_I into them -- confirm the
   * same holds for this path.
   */
  704. .L18:
  /* The combine order depends on the transpose/conjugate variant.
     shufps $0xb1 swaps the two 32-bit lanes inside each 64-bit half;
     addsubps subtracts in even lanes and adds in odd lanes. */
  705. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  706. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  707. shufps $0xb1, %xmm1, %xmm1
  708. shufps $0xb1, %xmm3, %xmm3
  709. shufps $0xb1, %xmm5, %xmm5
  710. shufps $0xb1, %xmm7, %xmm7
  711. addsubps %xmm1, %xmm0
  712. addsubps %xmm3, %xmm2
  713. addsubps %xmm5, %xmm4
  714. addsubps %xmm7, %xmm6
  /* Keep the combined value in the odd register and a lane-swapped copy
     in the even register for the scaling step below. */
  715. movaps %xmm0, %xmm1
  716. movaps %xmm2, %xmm3
  717. movaps %xmm4, %xmm5
  718. movaps %xmm6, %xmm7
  719. shufps $0xb1, %xmm0, %xmm0
  720. shufps $0xb1, %xmm2, %xmm2
  721. shufps $0xb1, %xmm4, %xmm4
  722. shufps $0xb1, %xmm6, %xmm6
  723. #else
  /* Remaining variants: same steps with the roles of the even and odd
     registers exchanged. */
  724. shufps $0xb1, %xmm0, %xmm0
  725. shufps $0xb1, %xmm2, %xmm2
  726. shufps $0xb1, %xmm4, %xmm4
  727. shufps $0xb1, %xmm6, %xmm6
  728. addsubps %xmm0, %xmm1
  729. addsubps %xmm2, %xmm3
  730. addsubps %xmm4, %xmm5
  731. addsubps %xmm6, %xmm7
  732. movaps %xmm1, %xmm0
  733. movaps %xmm3, %xmm2
  734. movaps %xmm5, %xmm4
  735. movaps %xmm7, %xmm6
  736. shufps $0xb1, %xmm1, %xmm1
  737. shufps $0xb1, %xmm3, %xmm3
  738. shufps $0xb1, %xmm5, %xmm5
  739. shufps $0xb1, %xmm7, %xmm7
  740. #endif
  /* Scale: odd registers *= xmm14, even registers *= xmm15, then sum each
     pair into the even register. */
  741. mulps %xmm14, %xmm1
  742. mulps %xmm15, %xmm0
  743. mulps %xmm14, %xmm3
  744. mulps %xmm15, %xmm2
  745. mulps %xmm14, %xmm5
  746. mulps %xmm15, %xmm4
  747. mulps %xmm14, %xmm7
  748. mulps %xmm15, %xmm6
  749. addps %xmm1, %xmm0
  750. addps %xmm3, %xmm2
  751. addps %xmm5, %xmm4
  752. addps %xmm7, %xmm6
  /* Non-TRMM build without BETAZERO: read the current C values with
     64-bit movsd/movhps pairs and add them to the results.
     shufps $0xe4 is the identity permutation, so these four shuffles leave
     the register values unchanged. */
  753. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  754. shufps $0xe4, %xmm8, %xmm8
  755. shufps $0xe4, %xmm9, %xmm9
  756. shufps $0xe4, %xmm10, %xmm10
  757. shufps $0xe4, %xmm11, %xmm11
  758. movsd 0 * SIZE(CO1), %xmm8
  759. movhps 2 * SIZE(CO1), %xmm8
  760. movsd 4 * SIZE(CO1), %xmm10
  761. movhps 6 * SIZE(CO1), %xmm10
  762. movsd 0 * SIZE(CO2), %xmm9
  763. movhps 2 * SIZE(CO2), %xmm9
  764. movsd 4 * SIZE(CO2), %xmm11
  765. movhps 6 * SIZE(CO2), %xmm11
  766. addps %xmm8, %xmm0
  767. addps %xmm9, %xmm2
  768. addps %xmm10, %xmm4
  769. addps %xmm11, %xmm6
  770. #endif
  /* Store the tile: xmm0/xmm4 go to CO1, xmm2/xmm6 go to CO2. */
  771. movsd %xmm0, 0 * SIZE(CO1)
  772. movhps %xmm0, 2 * SIZE(CO1)
  773. movsd %xmm4, 4 * SIZE(CO1)
  774. movhps %xmm4, 6 * SIZE(CO1)
  775. movsd %xmm2, 0 * SIZE(CO2)
  776. movhps %xmm2, 2 * SIZE(CO2)
  777. movsd %xmm6, 4 * SIZE(CO2)
  778. movhps %xmm6, 6 * SIZE(CO2)
  /* TRMM variants that ran only KKK iterations: advance AO and BO past the
     remaining K - KKK iterations, (K - KKK) * 32 bytes each. */
  779. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  780. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  781. movq K, %rax
  782. subq KKK, %rax
  783. leaq (,%rax, 8), %rax
  784. leaq (AO, %rax, 4), AO
  785. leaq (BO, %rax, 4), BO
  786. #endif
  /* LEFT variant: KK += 4 before the next tile. */
  787. #if defined(TRMMKERNEL) && defined(LEFT)
  788. addq $4, KK
  789. #endif
  790. addq $8 * SIZE, CO1 # coffset += 4
  791. addq $8 * SIZE, CO2 # coffset += 4
  792. decq I # i --
  793. jg .L11
  794. ALIGN_4
  /*
   * .L20: two-column pass, path taken when bit 1 of M is set (otherwise
   * jump to .L30).  Sets up BO/AO, zeroes the accumulators xmm0-xmm3,
   * preloads the first operands and computes the iteration count.
   */
  795. .L20:
  796. testq $2, M
  797. je .L30
  /* BO always restarts at BUFFER.  In the #else TRMM variants the first KK
     iterations are skipped: AO += KK * 16 bytes, BO += KK * 32 bytes. */
  798. #if !defined(TRMMKERNEL) || \
  799. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  800. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  801. leaq BUFFER, BO
  802. #else
  803. leaq BUFFER, BO
  804. movq KK, %rax
  805. leaq (, %rax, 8), %rax
  806. leaq (AO, %rax, 2), AO
  807. leaq (BO, %rax, 4), BO
  808. #endif
  /* Preload A into xmm8/xmm10 and the even-lane-duplicated B values into
     xmm9/xmm11/xmm13/xmm15 for the unrolled loop at .L22. */
  809. movaps 0 * SIZE(AO), %xmm8
  810. pxor %xmm0, %xmm0
  811. movaps 16 * SIZE(AO), %xmm10
  812. pxor %xmm1, %xmm1
  813. movsldup 0 * SIZE(BO), %xmm9
  814. pxor %xmm2, %xmm2
  815. movsldup 16 * SIZE(BO), %xmm11
  816. pxor %xmm3, %xmm3
  817. movsldup 32 * SIZE(BO), %xmm13
  818. movsldup 48 * SIZE(BO), %xmm15
  /* Iteration count in rax: K for plain GEMM; K - KK or KK + 2 for the
     TRMM variants, also saved in KKK.  Both LEFT branches add 2 here. */
  819. #ifndef TRMMKERNEL
  820. movq K, %rax
  821. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  822. movq K, %rax
  823. subq KK, %rax
  824. movq %rax, KKK
  825. #else
  826. movq KK, %rax
  827. #ifdef LEFT
  828. addq $2, %rax
  829. #else
  830. addq $2, %rax
  831. #endif
  832. movq %rax, KKK
  833. #endif
  /* rax = count / 8 = trips through the 8x-unrolled loop; skip it if 0. */
  834. sarq $3, %rax
  835. je .L25
  836. ALIGN_4
  /*
   * .L22: main loop of the .L20 path, unrolled over eight iterations.
   * Each step multiplies one A vector (xmm8 or xmm10) by the movsldup
   * (even lanes duplicated) and movshdup (odd lanes duplicated) forms of
   * two 4 * SIZE groups of BO.  movsldup products are added into
   * xmm0/xmm2; movshdup products go through ADDSUB into xmm1/xmm3.
   * ADDSUB is a macro defined outside this chunk.
   * The loads from 32/48 * SIZE(AO) and 64/80/96/112 * SIZE(BO) fetch the
   * operands of the next trip before AO/BO advance by 32/64 * SIZE.
   * NOTE(review): loads and arithmetic are interleaved, presumably for
   * scheduling; keep the instruction order as is.
   */
  837. .L22:
  /* steps 0-1: A in xmm8, B through xmm9 */
  838. mulps %xmm8, %xmm9
  839. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  840. addps %xmm9, %xmm0
  841. movshdup 0 * SIZE(BO), %xmm9
  842. mulps %xmm8, %xmm9
  843. ADDSUB %xmm9, %xmm1
  844. movsldup 4 * SIZE(BO), %xmm9
  845. mulps %xmm8, %xmm9
  846. addps %xmm9, %xmm2
  847. movshdup 4 * SIZE(BO), %xmm9
  848. mulps %xmm8, %xmm9
  849. movaps 4 * SIZE(AO), %xmm8
  850. ADDSUB %xmm9, %xmm3
  851. movsldup 8 * SIZE(BO), %xmm9
  852. mulps %xmm8, %xmm9
  853. addps %xmm9, %xmm0
  854. movshdup 8 * SIZE(BO), %xmm9
  855. mulps %xmm8, %xmm9
  856. ADDSUB %xmm9, %xmm1
  857. movsldup 12 * SIZE(BO), %xmm9
  858. mulps %xmm8, %xmm9
  859. addps %xmm9, %xmm2
  860. movshdup 12 * SIZE(BO), %xmm9
  861. mulps %xmm8, %xmm9
  862. movaps 8 * SIZE(AO), %xmm8
  863. ADDSUB %xmm9, %xmm3
  864. movsldup 64 * SIZE(BO), %xmm9
  /* steps 2-3: A in xmm8, B through xmm11 */
  865. mulps %xmm8, %xmm11
  866. addps %xmm11, %xmm0
  867. movshdup 16 * SIZE(BO), %xmm11
  868. mulps %xmm8, %xmm11
  869. ADDSUB %xmm11, %xmm1
  870. movsldup 20 * SIZE(BO), %xmm11
  871. mulps %xmm8, %xmm11
  872. addps %xmm11, %xmm2
  873. movshdup 20 * SIZE(BO), %xmm11
  874. mulps %xmm8, %xmm11
  875. movaps 12 * SIZE(AO), %xmm8
  876. ADDSUB %xmm11, %xmm3
  877. movsldup 24 * SIZE(BO), %xmm11
  878. mulps %xmm8, %xmm11
  879. addps %xmm11, %xmm0
  880. movshdup 24 * SIZE(BO), %xmm11
  881. mulps %xmm8, %xmm11
  882. ADDSUB %xmm11, %xmm1
  883. movsldup 28 * SIZE(BO), %xmm11
  884. mulps %xmm8, %xmm11
  885. addps %xmm11, %xmm2
  886. movshdup 28 * SIZE(BO), %xmm11
  887. mulps %xmm8, %xmm11
  888. movaps 32 * SIZE(AO), %xmm8
  889. ADDSUB %xmm11, %xmm3
  890. movsldup 80 * SIZE(BO), %xmm11
  /* steps 4-5: A in xmm10, B through xmm13 */
  891. mulps %xmm10, %xmm13
  892. addps %xmm13, %xmm0
  893. movshdup 32 * SIZE(BO), %xmm13
  894. mulps %xmm10, %xmm13
  895. ADDSUB %xmm13, %xmm1
  896. movsldup 36 * SIZE(BO), %xmm13
  897. mulps %xmm10, %xmm13
  898. addps %xmm13, %xmm2
  899. movshdup 36 * SIZE(BO), %xmm13
  900. mulps %xmm10, %xmm13
  901. movaps 20 * SIZE(AO), %xmm10
  902. ADDSUB %xmm13, %xmm3
  903. movsldup 40 * SIZE(BO), %xmm13
  904. mulps %xmm10, %xmm13
  905. addps %xmm13, %xmm0
  906. movshdup 40 * SIZE(BO), %xmm13
  907. mulps %xmm10, %xmm13
  908. ADDSUB %xmm13, %xmm1
  909. movsldup 44 * SIZE(BO), %xmm13
  910. mulps %xmm10, %xmm13
  911. addps %xmm13, %xmm2
  912. movshdup 44 * SIZE(BO), %xmm13
  913. mulps %xmm10, %xmm13
  914. movaps 24 * SIZE(AO), %xmm10
  915. ADDSUB %xmm13, %xmm3
  916. movsldup 96 * SIZE(BO), %xmm13
  /* steps 6-7: A in xmm10, B through xmm15 */
  917. mulps %xmm10, %xmm15
  918. addps %xmm15, %xmm0
  919. movshdup 48 * SIZE(BO), %xmm15
  920. mulps %xmm10, %xmm15
  921. ADDSUB %xmm15, %xmm1
  922. movsldup 52 * SIZE(BO), %xmm15
  923. mulps %xmm10, %xmm15
  924. addps %xmm15, %xmm2
  925. movshdup 52 * SIZE(BO), %xmm15
  926. mulps %xmm10, %xmm15
  927. movaps 28 * SIZE(AO), %xmm10
  928. ADDSUB %xmm15, %xmm3
  929. movsldup 56 * SIZE(BO), %xmm15
  930. mulps %xmm10, %xmm15
  931. addps %xmm15, %xmm0
  932. movshdup 56 * SIZE(BO), %xmm15
  933. mulps %xmm10, %xmm15
  934. ADDSUB %xmm15, %xmm1
  935. movsldup 60 * SIZE(BO), %xmm15
  936. mulps %xmm10, %xmm15
  937. addps %xmm15, %xmm2
  938. movshdup 60 * SIZE(BO), %xmm15
  939. mulps %xmm10, %xmm15
  940. movaps 48 * SIZE(AO), %xmm10
  941. ADDSUB %xmm15, %xmm3
  942. movsldup 112 * SIZE(BO), %xmm15
  /* Advance past the eight consumed iterations and repeat while rax != 0. */
  943. addq $32 * SIZE, AO
  944. addq $64 * SIZE, BO
  945. decq %rax
  946. jne .L22
  947. ALIGN_4
  /*
   * .L25: remainder handling for the .L20 path.  Reloads the iteration
   * count (K, or KKK for TRMM), loads ALPHA_R/ALPHA_I into xmm14/xmm15 for
   * the write-back at .L28, and runs .L26 once per leftover iteration
   * (count & 7).
   */
  948. .L25:
  949. #ifndef TRMMKERNEL
  950. movq K, %rax
  951. #else
  952. movq KKK, %rax
  953. #endif
  954. movaps ALPHA_R, %xmm14
  955. movaps ALPHA_I, %xmm15
  956. andq $7, %rax # rax = count & 7 (iterations left after the unrolled loop)
  957. BRANCH
  958. je .L28
  959. ALIGN_4
  /* .L26: one iteration per trip, same multiply/accumulate pattern as a
     single step of .L22; AO advances 4 * SIZE, BO advances 8 * SIZE. */
  960. .L26:
  961. mulps %xmm8, %xmm9
  962. addps %xmm9, %xmm0
  963. movshdup 0 * SIZE(BO), %xmm9
  964. mulps %xmm8, %xmm9
  965. ADDSUB %xmm9, %xmm1
  966. movsldup 4 * SIZE(BO), %xmm9
  967. mulps %xmm8, %xmm9
  968. addps %xmm9, %xmm2
  969. movshdup 4 * SIZE(BO), %xmm9
  970. mulps %xmm8, %xmm9
  /* Fetch the operands of the next iteration before the pointers move. */
  971. movaps 4 * SIZE(AO), %xmm8
  972. ADDSUB %xmm9, %xmm3
  973. movsldup 8 * SIZE(BO), %xmm9
  974. addq $ 4 * SIZE, AO
  975. addq $ 8 * SIZE, BO
  976. decq %rax
  977. jg .L26
  978. ALIGN_4
  /*
   * .L28: write-back for the .L20 path.  Combines (xmm0,xmm1) and
   * (xmm2,xmm3), scales by xmm14 (ALPHA_R) and xmm15 (ALPHA_I), optionally
   * adds the current C values, and stores 4 * SIZE to each of CO1 and CO2.
   */
  979. .L28:
  /* Variant-dependent combine: shufps $0xb1 swaps the 32-bit lanes within
     each 64-bit half; addsubps subtracts in even lanes, adds in odd lanes. */
  980. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  981. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  982. shufps $0xb1, %xmm1, %xmm1
  983. shufps $0xb1, %xmm3, %xmm3
  984. addsubps %xmm1, %xmm0
  985. addsubps %xmm3, %xmm2
  986. movaps %xmm0, %xmm1
  987. movaps %xmm2, %xmm3
  988. shufps $0xb1, %xmm0, %xmm0
  989. shufps $0xb1, %xmm2, %xmm2
  990. #else
  991. shufps $0xb1, %xmm0, %xmm0
  992. shufps $0xb1, %xmm2, %xmm2
  993. addsubps %xmm0, %xmm1
  994. addsubps %xmm2, %xmm3
  995. movaps %xmm1, %xmm0
  996. movaps %xmm3, %xmm2
  997. shufps $0xb1, %xmm1, %xmm1
  998. shufps $0xb1, %xmm3, %xmm3
  999. #endif
  /* Scale by alpha and sum each pair into xmm0 / xmm2. */
  1000. mulps %xmm14, %xmm1
  1001. mulps %xmm15, %xmm0
  1002. mulps %xmm14, %xmm3
  1003. mulps %xmm15, %xmm2
  1004. addps %xmm1, %xmm0
  1005. addps %xmm3, %xmm2
  /* Non-TRMM build without BETAZERO: accumulate into the existing C values.
     shufps $0xe4 is the identity permutation (values unchanged). */
  1006. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1007. shufps $0xe4, %xmm8, %xmm8
  1008. shufps $0xe4, %xmm10, %xmm10
  1009. movsd 0 * SIZE(CO1), %xmm8
  1010. movhps 2 * SIZE(CO1), %xmm8
  1011. movsd 0 * SIZE(CO2), %xmm10
  1012. movhps 2 * SIZE(CO2), %xmm10
  1013. addps %xmm8, %xmm0
  1014. addps %xmm10, %xmm2
  1015. #endif
  1016. movsd %xmm0, 0 * SIZE(CO1)
  1017. movhps %xmm0, 2 * SIZE(CO1)
  1018. movsd %xmm2, 0 * SIZE(CO2)
  1019. movhps %xmm2, 2 * SIZE(CO2)
  /* TRMM variants that ran only KKK iterations: skip the remaining
     K - KKK iterations (AO += 16 bytes each, BO += 32 bytes each). */
  1020. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1021. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1022. movq K, %rax
  1023. subq KKK, %rax
  1024. leaq (,%rax, 8), %rax
  1025. leaq (AO, %rax, 2), AO
  1026. leaq (BO, %rax, 4), BO
  1027. #endif
  1028. #if defined(TRMMKERNEL) && defined(LEFT)
  1029. addq $2, KK
  1030. #endif
  1031. addq $4 * SIZE, CO1 # coffset += 4 * SIZE
  1032. addq $4 * SIZE, CO2 # coffset += 4 * SIZE
  1033. ALIGN_4
  /*
   * .L30: two-column pass, path taken when bit 0 of M is set (otherwise
   * jump to .L39).  Sets up BO/AO, zeroes xmm0-xmm3, preloads operands and
   * computes the iteration count.
   */
  1034. .L30:
  1035. testq $1, M
  1036. je .L39
  /* BO restarts at BUFFER.  In the #else TRMM variants the first KK
     iterations are skipped: AO += KK * 8 bytes, BO += KK * 32 bytes. */
  1037. #if !defined(TRMMKERNEL) || \
  1038. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1039. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1040. leaq BUFFER, BO
  1041. #else
  1042. leaq BUFFER, BO
  1043. movq KK, %rax
  1044. leaq (, %rax, 8), %rax
  1045. leaq (AO, %rax, 1), AO
  1046. leaq (BO, %rax, 4), BO
  1047. #endif
  /* movddup loads 64 bits of A and copies them into both register halves;
     movsd loads 64 bits of BO into the low half. */
  1048. movddup 0 * SIZE(AO), %xmm8
  1049. pxor %xmm0, %xmm0
  1050. movddup 8 * SIZE(AO), %xmm10
  1051. pxor %xmm1, %xmm1
  1052. movsd 0 * SIZE(BO), %xmm9
  1053. pxor %xmm2, %xmm2
  1054. movsd 16 * SIZE(BO), %xmm11
  1055. pxor %xmm3, %xmm3
  1056. movsd 32 * SIZE(BO), %xmm13
  1057. movsd 48 * SIZE(BO), %xmm15
  /* Iteration count in rax: K for plain GEMM; K - KK, or KK + 1 (LEFT) /
     KK + 2 (not LEFT) for the TRMM variants, also saved in KKK. */
  1058. #ifndef TRMMKERNEL
  1059. movq K, %rax
  1060. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1061. movq K, %rax
  1062. subq KK, %rax
  1063. movq %rax, KKK
  1064. #else
  1065. movq KK, %rax
  1066. #ifdef LEFT
  1067. addq $1, %rax
  1068. #else
  1069. addq $2, %rax
  1070. #endif
  1071. movq %rax, KKK
  1072. #endif
  /* rax = count / 8 trips through the unrolled loop; skip it if zero. */
  1073. sarq $3, %rax
  1074. je .L35
  1075. ALIGN_4
  /*
   * .L32: main loop of the .L30 path, unrolled over eight iterations.
   * shufps $0x50 expands the two low lanes (b0, b1) of a B register to
   * (b0, b0, b1, b1); the result is multiplied by the duplicated A pair in
   * xmm8/xmm10.  Products of the BO groups at offsets 0, 8, 16, ... are
   * added into xmm0 and those at offsets 4, 12, 20, ... into xmm1.
   * The loads from 16/24 * SIZE(AO) and 64/80/96/112 * SIZE(BO) fetch the
   * operands of the next trip before AO/BO advance by 16/64 * SIZE.
   */
  1076. .L32:
  /* iterations 0-1: A in xmm8, B through xmm9 */
  1077. shufps $0x50, %xmm9, %xmm9
  1078. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1079. mulps %xmm8, %xmm9
  1080. addps %xmm9, %xmm0
  1081. movsd 4 * SIZE(BO), %xmm9
  1082. shufps $0x50, %xmm9, %xmm9
  1083. mulps %xmm8, %xmm9
  1084. movddup 2 * SIZE(AO), %xmm8
  1085. addps %xmm9, %xmm1
  1086. movsd 8 * SIZE(BO), %xmm9
  1087. shufps $0x50, %xmm9, %xmm9
  1088. mulps %xmm8, %xmm9
  1089. addps %xmm9, %xmm0
  1090. movsd 12 * SIZE(BO), %xmm9
  1091. shufps $0x50, %xmm9, %xmm9
  1092. mulps %xmm8, %xmm9
  1093. movddup 4 * SIZE(AO), %xmm8
  1094. addps %xmm9, %xmm1
  1095. movsd 64 * SIZE(BO), %xmm9
  /* iterations 2-3: A in xmm8, B through xmm11 */
  1096. shufps $0x50, %xmm11, %xmm11
  1097. mulps %xmm8, %xmm11
  1098. addps %xmm11, %xmm0
  1099. movsd 20 * SIZE(BO), %xmm11
  1100. shufps $0x50, %xmm11, %xmm11
  1101. mulps %xmm8, %xmm11
  1102. movddup 6 * SIZE(AO), %xmm8
  1103. addps %xmm11, %xmm1
  1104. movsd 24 * SIZE(BO), %xmm11
  1105. shufps $0x50, %xmm11, %xmm11
  1106. mulps %xmm8, %xmm11
  1107. addps %xmm11, %xmm0
  1108. movsd 28 * SIZE(BO), %xmm11
  1109. shufps $0x50, %xmm11, %xmm11
  1110. mulps %xmm8, %xmm11
  1111. movddup 16 * SIZE(AO), %xmm8
  1112. addps %xmm11, %xmm1
  1113. movsd 80 * SIZE(BO), %xmm11
  /* iterations 4-5: A in xmm10, B through xmm13 */
  1114. shufps $0x50, %xmm13, %xmm13
  1115. mulps %xmm10, %xmm13
  1116. addps %xmm13, %xmm0
  1117. movsd 36 * SIZE(BO), %xmm13
  1118. shufps $0x50, %xmm13, %xmm13
  1119. mulps %xmm10, %xmm13
  1120. movddup 10 * SIZE(AO), %xmm10
  1121. addps %xmm13, %xmm1
  1122. movsd 40 * SIZE(BO), %xmm13
  1123. shufps $0x50, %xmm13, %xmm13
  1124. mulps %xmm10, %xmm13
  1125. addps %xmm13, %xmm0
  1126. movsd 44 * SIZE(BO), %xmm13
  1127. shufps $0x50, %xmm13, %xmm13
  1128. mulps %xmm10, %xmm13
  1129. movddup 12 * SIZE(AO), %xmm10
  1130. addps %xmm13, %xmm1
  1131. movsd 96 * SIZE(BO), %xmm13
  /* iterations 6-7: A in xmm10, B through xmm15 */
  1132. shufps $0x50, %xmm15, %xmm15
  1133. mulps %xmm10, %xmm15
  1134. addps %xmm15, %xmm0
  1135. movsd 52 * SIZE(BO), %xmm15
  1136. shufps $0x50, %xmm15, %xmm15
  1137. mulps %xmm10, %xmm15
  1138. movddup 14 * SIZE(AO), %xmm10
  1139. addps %xmm15, %xmm1
  1140. movsd 56 * SIZE(BO), %xmm15
  1141. shufps $0x50, %xmm15, %xmm15
  1142. mulps %xmm10, %xmm15
  1143. addps %xmm15, %xmm0
  1144. movsd 60 * SIZE(BO), %xmm15
  1145. shufps $0x50, %xmm15, %xmm15
  1146. mulps %xmm10, %xmm15
  1147. movddup 24 * SIZE(AO), %xmm10
  1148. addps %xmm15, %xmm1
  1149. movsd 112 * SIZE(BO), %xmm15
  /* Advance past the eight consumed iterations and repeat while rax != 0. */
  1150. addq $16 * SIZE, AO
  1151. addq $64 * SIZE, BO
  1152. decq %rax
  1153. jne .L32
  1154. ALIGN_4
  /*
   * .L35: remainder handling for the .L30 path.  Reloads the iteration
   * count (K, or KKK for TRMM), loads ALPHA_R/ALPHA_I into xmm14/xmm15 for
   * the write-back at .L38, and runs .L36 once per leftover iteration
   * (count & 7).
   */
  1155. .L35:
  1156. #ifndef TRMMKERNEL
  1157. movq K, %rax
  1158. #else
  1159. movq KKK, %rax
  1160. #endif
  1161. movaps ALPHA_R, %xmm14
  1162. movaps ALPHA_I, %xmm15
  1163. andq $7, %rax # rax = count & 7 (iterations left after the unrolled loop)
  1164. BRANCH
  1165. je .L38
  1166. ALIGN_4
  /* .L36: one iteration per trip, same pattern as a single step of .L32;
     AO advances 2 * SIZE, BO advances 8 * SIZE. */
  1167. .L36:
  1168. shufps $0x50, %xmm9, %xmm9
  1169. mulps %xmm8, %xmm9
  1170. addps %xmm9, %xmm0
  1171. movsd 4 * SIZE(BO), %xmm9
  1172. shufps $0x50, %xmm9, %xmm9
  1173. mulps %xmm8, %xmm9
  /* Fetch the operands of the next iteration before the pointers move. */
  1174. movddup 2 * SIZE(AO), %xmm8
  1175. addps %xmm9, %xmm1
  1176. movsd 8 * SIZE(BO), %xmm9
  1177. addq $2 * SIZE, AO
  1178. addq $8 * SIZE, BO
  1179. decq %rax
  1180. jg .L36
  1181. ALIGN_4
  /*
   * .L38: write-back for the .L30 path.  Regroups the two accumulators,
   * applies the variant-dependent sign flip and combine, scales by xmm14
   * (ALPHA_R) and xmm15 (ALPHA_I), optionally adds the current C values and
   * stores 2 * SIZE to CO1 (low half) and 2 * SIZE to CO2 (high half).
   */
  1182. .L38:
  /* Regroup: xmm0 = (low xmm0, low xmm1), xmm1 = (high xmm0, high xmm1). */
  1183. movaps %xmm0, %xmm6
  1184. movlhps %xmm1, %xmm0
  1185. movhlps %xmm6, %xmm1
  /* Variants listed below negate xmm1 by XOR with a per-lane sign-bit mask.
     The mask is built with pcmpeqd (integer compare, always all-ones) and a
     31-bit left shift.  xmm7 is not initialised on this path, so a
     floating-point self-compare (cmpeqps) would leave a zero mask lane
     wherever the stale contents happen to be NaN. */
  1186. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  1187. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1188. pcmpeqd %xmm7, %xmm7
  1189. pslld $31, %xmm7
  1190. xorps %xmm7, %xmm1
  1191. #endif
  /* Variant-dependent combine: shufps $0xb1 swaps the 32-bit lanes within
     each 64-bit half; addsubps subtracts in even lanes, adds in odd lanes. */
  1192. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1193. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1194. shufps $0xb1, %xmm1, %xmm1
  1195. addsubps %xmm1, %xmm0
  1196. movaps %xmm0, %xmm1
  1197. shufps $0xb1, %xmm0, %xmm0
  1198. #else
  1199. shufps $0xb1, %xmm0, %xmm0
  1200. addsubps %xmm0, %xmm1
  1201. movaps %xmm1, %xmm0
  1202. shufps $0xb1, %xmm1, %xmm1
  1203. #endif
  /* Scale by alpha and sum into xmm0. */
  1204. mulps %xmm14, %xmm1
  1205. mulps %xmm15, %xmm0
  1206. addps %xmm1, %xmm0
  /* Non-TRMM build without BETAZERO: accumulate into the existing C values
     (low half from CO1, high half from CO2). */
  1207. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1208. movsd 0 * SIZE(CO1), %xmm8
  1209. movhps 0 * SIZE(CO2), %xmm8
  1210. addps %xmm8, %xmm0
  1211. #endif
  1212. movsd %xmm0, 0 * SIZE(CO1)
  1213. movhps %xmm0, 0 * SIZE(CO2)
  /* TRMM variants that ran only KKK iterations: skip the remaining
     K - KKK iterations (AO += 8 bytes each, BO += 32 bytes each). */
  1214. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1215. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1216. movq K, %rax
  1217. subq KKK, %rax
  1218. leaq (,%rax, 8), %rax
  1219. leaq (AO, %rax, 1), AO
  1220. leaq (BO, %rax, 4), BO
  1221. #endif
  1222. #if defined(TRMMKERNEL) && defined(LEFT)
  1223. addq $1, KK
  1224. #endif
  1225. ALIGN_4
  /*
   * .L39: end of one two-column pass.  For the non-LEFT TRMM variant KK is
   * advanced by 2; C moves forward by 2 * LDC and the pass repeats from
   * .L01 while J > 0.
   * KK is updated with a 64-bit add, matching every other access to it in
   * this file (movq/addq); a 32-bit addl would leave the upper half of the
   * slot untouched and drop any carry out of bit 31.
   */
  1226. .L39:
  1227. #if defined(TRMMKERNEL) && !defined(LEFT)
  1228. addq $2, KK
  1229. #endif
  1230. leaq (C, LDC, 2), C # c += 2 * ldc
  1231. decq J # j --
  1232. jg .L01
  1233. ALIGN_4
  /*
   * .L40: single-column pass, run only when bit 0 of N is set (otherwise
   * jump straight to the exit at .L999).
   */
  1234. .L40:
  1235. testq $1, N
  1236. je .L999
  1237. ALIGN_4
  1238. .L41:
  /* LEFT TRMM variant: restart KK from OFFSET for this pass. */
  1239. #if defined(TRMMKERNEL) && defined(LEFT)
  1240. movq OFFSET, %rax
  1241. movq %rax, KK
  1242. #endif
  1243. /* Copying to Sub Buffer */
  /* BO = BUFFER; rax = K / 8 trips of the 8-entry copy loop at .L42
     (skipped when the quotient is <= 0). */
  1244. leaq BUFFER, BO
  1245. movq K, %rax
  1246. sarq $3, %rax
  1247. jle .L43
  1248. ALIGN_4
  /*
   * .L42: copy eight B entries per trip into the buffer.  Each movddup
   * reads 64 bits (2 * SIZE) from B and duplicates them into both register
   * halves, so every entry occupies 4 * SIZE in BO.  B advances 16 * SIZE
   * and BO 32 * SIZE per trip.
   */
  1249. .L42:
  1250. movddup 0 * SIZE(B), %xmm0
  1251. movddup 2 * SIZE(B), %xmm1
  1252. movddup 4 * SIZE(B), %xmm2
  1253. movddup 6 * SIZE(B), %xmm3
  1254. movddup 8 * SIZE(B), %xmm4
  1255. movddup 10 * SIZE(B), %xmm5
  1256. movddup 12 * SIZE(B), %xmm6
  1257. movddup 14 * SIZE(B), %xmm7
  1258. movaps %xmm0, 0 * SIZE(BO)
  1259. movaps %xmm1, 4 * SIZE(BO)
  1260. movaps %xmm2, 8 * SIZE(BO)
  1261. movaps %xmm3, 12 * SIZE(BO)
  1262. movaps %xmm4, 16 * SIZE(BO)
  1263. movaps %xmm5, 20 * SIZE(BO)
  1264. movaps %xmm6, 24 * SIZE(BO)
  1265. movaps %xmm7, 28 * SIZE(BO)
  /* Prefetch ahead in both the destination buffer and the source. */
  1266. prefetcht1 128 * SIZE(BO)
  1267. prefetcht0 112 * SIZE(B)
  1268. addq $16 * SIZE, B
  1269. addq $32 * SIZE, BO
  1270. decq %rax
  1271. jne .L42
  1272. ALIGN_4
  /*
   * .L43/.L44: copy the remaining K & 7 entries of B into the buffer, one
   * entry (2 * SIZE in, 4 * SIZE out) per trip.
   */
  1273. .L43:
  1274. movq K, %rax
  1275. andq $7, %rax
  1276. BRANCH
  1277. jle .L50
  1278. ALIGN_4
  1279. .L44:
  1280. movddup 0 * SIZE(B), %xmm0
  1281. movaps %xmm0, 0 * SIZE(BO)
  1282. addq $2 * SIZE, B
  1283. addq $4 * SIZE, BO
  1284. decq %rax
  1285. jne .L44
  1286. ALIGN_4
  /*
   * .L50: start of the compute phase for the single-column pass.
   * CO1 = C, AO = A, I = M >> 2; when I <= 0 the loop at .L51 is skipped
   * and control goes to .L60.
   */
  1287. .L50:
  1288. movq C, CO1 # coffset1 = c
  1289. movq A, AO # aoffset = a
  1290. movq M, I
  1291. sarq $2, I # i = (m >> 2)
  1292. jle .L60
  1293. ALIGN_4
  /*
   * .L51: set-up for one trip of the I loop of the single-column pass.
   * Positions BO/AO, zeroes the accumulators xmm0/xmm1/xmm4/xmm5, preloads
   * operands and computes the iteration count.
   */
  1294. .L51:
  /* BO restarts at BUFFER.  In the #else TRMM variants the first KK
     iterations are skipped: AO += KK * 32 bytes, BO += KK * 16 bytes. */
  1295. #if !defined(TRMMKERNEL) || \
  1296. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1297. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1298. leaq BUFFER, BO
  1299. #else
  1300. leaq BUFFER, BO
  1301. movq KK, %rax
  1302. leaq (, %rax, 8), %rax
  1303. leaq (AO, %rax, 4), AO
  1304. leaq (BO, %rax, 2), BO
  1305. #endif
  /* Preload four A vectors and two even-lane-duplicated B vectors for the
     unrolled loop at .L52; prefetch the destination in C. */
  1306. movaps 0 * SIZE(AO), %xmm8
  1307. pxor %xmm0, %xmm0
  1308. movaps 16 * SIZE(AO), %xmm10
  1309. pxor %xmm1, %xmm1
  1310. movaps 32 * SIZE(AO), %xmm12
  1311. pxor %xmm4, %xmm4
  1312. movaps 48 * SIZE(AO), %xmm14
  1313. pxor %xmm5, %xmm5
  1314. movsldup 0 * SIZE(BO), %xmm9
  1315. movsldup 16 * SIZE(BO), %xmm11
  1316. prefetchnta 4 * SIZE(CO1)
  /* Iteration count in rax: K for plain GEMM; K - KK, or KK + 4 (LEFT) /
     KK + 1 (not LEFT) for the TRMM variants, also saved in KKK. */
  1317. #ifndef TRMMKERNEL
  1318. movq K, %rax
  1319. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1320. movq K, %rax
  1321. subq KK, %rax
  1322. movq %rax, KKK
  1323. #else
  1324. movq KK, %rax
  1325. #ifdef LEFT
  1326. addq $4, %rax
  1327. #else
  1328. addq $1, %rax
  1329. #endif
  1330. movq %rax, KKK
  1331. #endif
  /* rax = count / 8 trips through the unrolled loop; skip it if zero. */
  1332. sarq $3, %rax
  1333. je .L55
  1334. ALIGN_4
  /*
   * .L52: main loop of the .L51 path, unrolled over eight iterations.
   * Each iteration uses one 4 * SIZE group of BO and two A vectors: the
   * first A vector accumulates into xmm0 (movsldup product, addps) and
   * xmm1 (movshdup product, ADDSUB), the second into xmm4 and xmm5.
   * ADDSUB is a macro defined outside this chunk.
   * The loads from 64/80/96/112 * SIZE(AO) and 32/48 * SIZE(BO) fetch the
   * operands of the next trip before AO/BO advance by 64/32 * SIZE.
   * Note that xmm14 is used as an A operand here and is reloaded with
   * ALPHA_R at .L55.
   */
  1335. .L52:
  /* iterations 0-1: A through xmm8, B through xmm9 */
  1336. mulps %xmm8, %xmm9
  1337. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1338. addps %xmm9, %xmm0
  1339. movshdup 0 * SIZE(BO), %xmm9
  1340. mulps %xmm8, %xmm9
  1341. movaps 4 * SIZE(AO), %xmm8
  1342. ADDSUB %xmm9, %xmm1
  1343. movsldup 0 * SIZE(BO), %xmm9
  1344. mulps %xmm8, %xmm9
  1345. addps %xmm9, %xmm4
  1346. movshdup 0 * SIZE(BO), %xmm9
  1347. mulps %xmm8, %xmm9
  1348. movaps 8 * SIZE(AO), %xmm8
  1349. ADDSUB %xmm9, %xmm5
  1350. movsldup 4 * SIZE(BO), %xmm9
  1351. mulps %xmm8, %xmm9
  1352. addps %xmm9, %xmm0
  1353. movshdup 4 * SIZE(BO), %xmm9
  1354. mulps %xmm8, %xmm9
  1355. movaps 12 * SIZE(AO), %xmm8
  1356. ADDSUB %xmm9, %xmm1
  1357. movsldup 4 * SIZE(BO), %xmm9
  1358. mulps %xmm8, %xmm9
  1359. addps %xmm9, %xmm4
  1360. movshdup 4 * SIZE(BO), %xmm9
  1361. mulps %xmm8, %xmm9
  1362. movaps 64 * SIZE(AO), %xmm8
  1363. ADDSUB %xmm9, %xmm5
  /* iterations 2-3: A through xmm10, B through xmm9 */
  1364. movsldup 8 * SIZE(BO), %xmm9
  1365. mulps %xmm10, %xmm9
  1366. addps %xmm9, %xmm0
  1367. movshdup 8 * SIZE(BO), %xmm9
  1368. mulps %xmm10, %xmm9
  1369. movaps 20 * SIZE(AO), %xmm10
  1370. ADDSUB %xmm9, %xmm1
  1371. movsldup 8 * SIZE(BO), %xmm9
  1372. mulps %xmm10, %xmm9
  1373. addps %xmm9, %xmm4
  1374. movshdup 8 * SIZE(BO), %xmm9
  1375. mulps %xmm10, %xmm9
  1376. movaps 24 * SIZE(AO), %xmm10
  1377. ADDSUB %xmm9, %xmm5
  1378. movsldup 12 * SIZE(BO), %xmm9
  1379. mulps %xmm10, %xmm9
  1380. addps %xmm9, %xmm0
  1381. movshdup 12 * SIZE(BO), %xmm9
  1382. mulps %xmm10, %xmm9
  1383. movaps 28 * SIZE(AO), %xmm10
  1384. ADDSUB %xmm9, %xmm1
  1385. movsldup 12 * SIZE(BO), %xmm9
  1386. mulps %xmm10, %xmm9
  1387. addps %xmm9, %xmm4
  1388. movshdup 12 * SIZE(BO), %xmm9
  1389. mulps %xmm10, %xmm9
  1390. movaps 80 * SIZE(AO), %xmm10
  1391. ADDSUB %xmm9, %xmm5
  1392. movsldup 32 * SIZE(BO), %xmm9
  /* iterations 4-5: A through xmm12, B through xmm11 */
  1393. mulps %xmm12, %xmm11
  1394. PREFETCH (PREFETCHSIZE + 32) * SIZE(AO)
  1395. addps %xmm11, %xmm0
  1396. movshdup 16 * SIZE(BO), %xmm11
  1397. mulps %xmm12, %xmm11
  1398. movaps 36 * SIZE(AO), %xmm12
  1399. ADDSUB %xmm11, %xmm1
  1400. movsldup 16 * SIZE(BO), %xmm11
  1401. mulps %xmm12, %xmm11
  1402. addps %xmm11, %xmm4
  1403. movshdup 16 * SIZE(BO), %xmm11
  1404. mulps %xmm12, %xmm11
  1405. movaps 40 * SIZE(AO), %xmm12
  1406. ADDSUB %xmm11, %xmm5
  1407. movsldup 20 * SIZE(BO), %xmm11
  1408. mulps %xmm12, %xmm11
  1409. addps %xmm11, %xmm0
  1410. movshdup 20 * SIZE(BO), %xmm11
  1411. mulps %xmm12, %xmm11
  1412. movaps 44 * SIZE(AO), %xmm12
  1413. ADDSUB %xmm11, %xmm1
  1414. movsldup 20 * SIZE(BO), %xmm11
  1415. mulps %xmm12, %xmm11
  1416. addps %xmm11, %xmm4
  1417. movshdup 20 * SIZE(BO), %xmm11
  1418. mulps %xmm12, %xmm11
  1419. movaps 96 * SIZE(AO), %xmm12
  1420. ADDSUB %xmm11, %xmm5
  /* iterations 6-7: A through xmm14, B through xmm11 */
  1421. movsldup 24 * SIZE(BO), %xmm11
  1422. mulps %xmm14, %xmm11
  1423. addps %xmm11, %xmm0
  1424. movshdup 24 * SIZE(BO), %xmm11
  1425. mulps %xmm14, %xmm11
  1426. movaps 52 * SIZE(AO), %xmm14
  1427. ADDSUB %xmm11, %xmm1
  1428. movsldup 24 * SIZE(BO), %xmm11
  1429. mulps %xmm14, %xmm11
  1430. addps %xmm11, %xmm4
  1431. movshdup 24 * SIZE(BO), %xmm11
  1432. mulps %xmm14, %xmm11
  1433. movaps 56 * SIZE(AO), %xmm14
  1434. ADDSUB %xmm11, %xmm5
  1435. movsldup 28 * SIZE(BO), %xmm11
  1436. mulps %xmm14, %xmm11
  1437. addps %xmm11, %xmm0
  1438. movshdup 28 * SIZE(BO), %xmm11
  1439. mulps %xmm14, %xmm11
  1440. movaps 60 * SIZE(AO), %xmm14
  1441. ADDSUB %xmm11, %xmm1
  1442. movsldup 28 * SIZE(BO), %xmm11
  1443. mulps %xmm14, %xmm11
  1444. addps %xmm11, %xmm4
  1445. movshdup 28 * SIZE(BO), %xmm11
  1446. mulps %xmm14, %xmm11
  1447. movaps 112 * SIZE(AO), %xmm14
  1448. ADDSUB %xmm11, %xmm5
  1449. movsldup 48 * SIZE(BO), %xmm11
  /* Advance past the eight consumed iterations and repeat while rax != 0. */
  1450. addq $64 * SIZE, AO
  1451. addq $32 * SIZE, BO
  1452. decq %rax
  1453. jne .L52
  1454. ALIGN_4
  /*
   * .L55: remainder handling for the .L51 path.  Reloads the iteration
   * count (K, or KKK for TRMM), loads ALPHA_R/ALPHA_I into xmm14/xmm15 for
   * the write-back at .L58, and runs .L56 once per leftover iteration
   * (count & 7).
   */
  1455. .L55:
  1456. #ifndef TRMMKERNEL
  1457. movq K, %rax
  1458. #else
  1459. movq KKK, %rax
  1460. #endif
  1461. movaps ALPHA_R, %xmm14
  1462. movaps ALPHA_I, %xmm15
  1463. andq $7, %rax # rax = count & 7 (iterations left after the unrolled loop)
  1464. BRANCH
  1465. je .L58
  1466. ALIGN_4
  /* .L56: one iteration per trip, same pattern as a single step of .L52;
     AO advances 8 * SIZE, BO advances 4 * SIZE. */
  1467. .L56:
  1468. mulps %xmm8, %xmm9
  1469. addps %xmm9, %xmm0
  1470. movshdup 0 * SIZE(BO), %xmm9
  1471. mulps %xmm8, %xmm9
  1472. movaps 4 * SIZE(AO), %xmm8
  1473. ADDSUB %xmm9, %xmm1
  1474. movsldup 0 * SIZE(BO), %xmm9
  1475. mulps %xmm8, %xmm9
  1476. addps %xmm9, %xmm4
  1477. movshdup 0 * SIZE(BO), %xmm9
  1478. mulps %xmm8, %xmm9
  /* Fetch the operands of the next iteration before the pointers move. */
  1479. movaps 8 * SIZE(AO), %xmm8
  1480. ADDSUB %xmm9, %xmm5
  1481. movsldup 4 * SIZE(BO), %xmm9
  1482. addq $ 8 * SIZE, AO
  1483. addq $ 4 * SIZE, BO
  1484. decq %rax
  1485. jg .L56
  1486. ALIGN_4
  /*
   * .L58: write-back for the .L51 path.  Combines (xmm0,xmm1) and
   * (xmm4,xmm5), scales by xmm14 (ALPHA_R) and xmm15 (ALPHA_I), optionally
   * adds the current C values, stores 8 * SIZE to CO1 and loops back to
   * .L51 while I > 0.
   */
  1487. .L58:
  /* Variant-dependent combine: shufps $0xb1 swaps the 32-bit lanes within
     each 64-bit half; addsubps subtracts in even lanes, adds in odd lanes. */
  1488. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1489. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1490. shufps $0xb1, %xmm1, %xmm1
  1491. shufps $0xb1, %xmm5, %xmm5
  1492. addsubps %xmm1, %xmm0
  1493. addsubps %xmm5, %xmm4
  1494. movaps %xmm0, %xmm1
  1495. movaps %xmm4, %xmm5
  1496. shufps $0xb1, %xmm0, %xmm0
  1497. shufps $0xb1, %xmm4, %xmm4
  1498. #else
  1499. shufps $0xb1, %xmm0, %xmm0
  1500. shufps $0xb1, %xmm4, %xmm4
  1501. addsubps %xmm0, %xmm1
  1502. addsubps %xmm4, %xmm5
  1503. movaps %xmm1, %xmm0
  1504. movaps %xmm5, %xmm4
  1505. shufps $0xb1, %xmm1, %xmm1
  1506. shufps $0xb1, %xmm5, %xmm5
  1507. #endif
  /* Scale by alpha and sum each pair into xmm0 / xmm4. */
  1508. mulps %xmm14, %xmm1
  1509. mulps %xmm15, %xmm0
  1510. mulps %xmm14, %xmm5
  1511. mulps %xmm15, %xmm4
  1512. addps %xmm1, %xmm0
  1513. addps %xmm5, %xmm4
  /* Non-TRMM build without BETAZERO: accumulate into the existing C values. */
  1514. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1515. movsd 0 * SIZE(CO1), %xmm8
  1516. movhps 2 * SIZE(CO1), %xmm8
  1517. movsd 4 * SIZE(CO1), %xmm9
  1518. movhps 6 * SIZE(CO1), %xmm9
  1519. addps %xmm8, %xmm0
  1520. addps %xmm9, %xmm4
  1521. #endif
  1522. movsd %xmm0, 0 * SIZE(CO1)
  1523. movhps %xmm0, 2 * SIZE(CO1)
  1524. movsd %xmm4, 4 * SIZE(CO1)
  1525. movhps %xmm4, 6 * SIZE(CO1)
  /* TRMM variants that ran only KKK iterations: skip the remaining
     K - KKK iterations (AO += 32 bytes each, BO += 16 bytes each). */
  1526. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1527. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1528. movq K, %rax
  1529. subq KKK, %rax
  1530. leaq (,%rax, 8), %rax
  1531. leaq (AO, %rax, 4), AO
  1532. leaq (BO, %rax, 2), BO
  1533. #endif
  1534. #if defined(TRMMKERNEL) && defined(LEFT)
  1535. addq $4, KK
  1536. #endif
  1537. addq $8 * SIZE, CO1 # coffset += 4
  1538. decq I # i --
  1539. jg .L51
  1540. ALIGN_4
  /*
   * .L60: single-column pass, path taken when bit 1 of M is set (otherwise
   * jump to .L70).  Positions BO/AO, zeroes xmm0/xmm1, preloads operands
   * and computes the iteration count.
   */
  1541. .L60:
  1542. testq $2, M
  1543. je .L70
  /* BO restarts at BUFFER.  In the #else TRMM variants the first KK
     iterations are skipped: AO += KK * 16 bytes, BO += KK * 16 bytes. */
  1544. #if !defined(TRMMKERNEL) || \
  1545. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1546. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1547. leaq BUFFER, BO
  1548. #else
  1549. leaq BUFFER, BO
  1550. movq KK, %rax
  1551. leaq (, %rax, 8), %rax
  1552. leaq (AO, %rax, 2), AO
  1553. leaq (BO, %rax, 2), BO
  1554. #endif
  1555. movaps 0 * SIZE(AO), %xmm8
  1556. pxor %xmm0, %xmm0
  1557. movsldup 0 * SIZE(BO), %xmm9
  1558. pxor %xmm1, %xmm1
  1559. movaps 16 * SIZE(AO), %xmm10
  1560. movsldup 16 * SIZE(BO), %xmm11
  /* Iteration count in rax: K for plain GEMM; K - KK, or KK + 2 (LEFT) /
     KK + 1 (not LEFT) for the TRMM variants, also saved in KKK. */
  1561. #ifndef TRMMKERNEL
  1562. movq K, %rax
  1563. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1564. movq K, %rax
  1565. subq KK, %rax
  1566. movq %rax, KKK
  1567. #else
  1568. movq KK, %rax
  1569. #ifdef LEFT
  1570. addq $2, %rax
  1571. #else
  1572. addq $1, %rax
  1573. #endif
  1574. movq %rax, KKK
  1575. #endif
  /* rax = count / 8 trips through the unrolled loop; skip it if zero. */
  1576. sarq $3, %rax
  1577. je .L65
  1578. ALIGN_4
  /*
   * .L62: main loop of the .L60 path, unrolled over eight iterations.
   * Each iteration multiplies one A vector by the movsldup and movshdup
   * forms of one 4 * SIZE group of BO; the movsldup product is added into
   * xmm0 and the movshdup product goes through ADDSUB (macro defined
   * outside this chunk) into xmm1.
   * The loads from 32/48 * SIZE of AO and BO fetch the operands of the
   * next trip before both pointers advance by 32 * SIZE.
   */
  1579. .L62:
  /* iterations 0-3: A through xmm8, B through xmm9 */
  1580. mulps %xmm8, %xmm9
  1581. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1582. addps %xmm9, %xmm0
  1583. movshdup 0 * SIZE(BO), %xmm9
  1584. mulps %xmm8, %xmm9
  1585. movaps 4 * SIZE(AO), %xmm8
  1586. ADDSUB %xmm9, %xmm1
  1587. movsldup 4 * SIZE(BO), %xmm9
  1588. mulps %xmm8, %xmm9
  1589. addps %xmm9, %xmm0
  1590. movshdup 4 * SIZE(BO), %xmm9
  1591. mulps %xmm8, %xmm9
  1592. movaps 8 * SIZE(AO), %xmm8
  1593. ADDSUB %xmm9, %xmm1
  1594. movsldup 8 * SIZE(BO), %xmm9
  1595. mulps %xmm8, %xmm9
  1596. addps %xmm9, %xmm0
  1597. movshdup 8 * SIZE(BO), %xmm9
  1598. mulps %xmm8, %xmm9
  1599. movaps 12 * SIZE(AO), %xmm8
  1600. ADDSUB %xmm9, %xmm1
  1601. movsldup 12 * SIZE(BO), %xmm9
  1602. mulps %xmm8, %xmm9
  1603. addps %xmm9, %xmm0
  1604. movshdup 12 * SIZE(BO), %xmm9
  1605. mulps %xmm8, %xmm9
  1606. movaps 32 * SIZE(AO), %xmm8
  1607. ADDSUB %xmm9, %xmm1
  1608. movsldup 32 * SIZE(BO), %xmm9
  /* iterations 4-7: A through xmm10, B through xmm11 */
  1609. mulps %xmm10, %xmm11
  1610. addps %xmm11, %xmm0
  1611. movshdup 16 * SIZE(BO), %xmm11
  1612. mulps %xmm10, %xmm11
  1613. movaps 20 * SIZE(AO), %xmm10
  1614. ADDSUB %xmm11, %xmm1
  1615. movsldup 20 * SIZE(BO), %xmm11
  1616. mulps %xmm10, %xmm11
  1617. addps %xmm11, %xmm0
  1618. movshdup 20 * SIZE(BO), %xmm11
  1619. mulps %xmm10, %xmm11
  1620. movaps 24 * SIZE(AO), %xmm10
  1621. ADDSUB %xmm11, %xmm1
  1622. movsldup 24 * SIZE(BO), %xmm11
  1623. mulps %xmm10, %xmm11
  1624. addps %xmm11, %xmm0
  1625. movshdup 24 * SIZE(BO), %xmm11
  1626. mulps %xmm10, %xmm11
  1627. movaps 28 * SIZE(AO), %xmm10
  1628. ADDSUB %xmm11, %xmm1
  1629. movsldup 28 * SIZE(BO), %xmm11
  1630. mulps %xmm10, %xmm11
  1631. addps %xmm11, %xmm0
  1632. movshdup 28 * SIZE(BO), %xmm11
  1633. mulps %xmm10, %xmm11
  1634. movaps 48 * SIZE(AO), %xmm10
  1635. ADDSUB %xmm11, %xmm1
  1636. movsldup 48 * SIZE(BO), %xmm11
  /* Advance past the eight consumed iterations and repeat while rax != 0. */
  1637. addq $32 * SIZE, AO
  1638. addq $32 * SIZE, BO
  1639. decq %rax
  1640. jne .L62
  1641. ALIGN_4
  /*
   * .L65: remainder handling for the .L60 path.  Reloads the iteration
   * count (K, or KKK for TRMM), loads ALPHA_R/ALPHA_I into xmm14/xmm15 for
   * the write-back at .L68, and runs .L66 once per leftover iteration
   * (count & 7).
   */
  1642. .L65:
  1643. #ifndef TRMMKERNEL
  1644. movq K, %rax
  1645. #else
  1646. movq KKK, %rax
  1647. #endif
  1648. movaps ALPHA_R, %xmm14
  1649. movaps ALPHA_I, %xmm15
  1650. andq $7, %rax # rax = count & 7 (iterations left after the unrolled loop)
  1651. BRANCH
  1652. je .L68
  1653. ALIGN_4
  /* .L66: one iteration per trip, same pattern as a single step of .L62. */
  1654. .L66:
  1655. mulps %xmm8, %xmm9
  1656. addps %xmm9, %xmm0
  1657. movshdup 0 * SIZE(BO), %xmm9
  1658. mulps %xmm8, %xmm9
  /* Fetch the operands of the next iteration before the pointers move. */
  1659. movaps 4 * SIZE(AO), %xmm8
  1660. ADDSUB %xmm9, %xmm1
  1661. movsldup 4 * SIZE(BO), %xmm9
  1662. addq $4 * SIZE, AO # aoffset += 4
  1663. addq $4 * SIZE, BO # boffset1 += 4
  1664. decq %rax
  1665. jg .L66
  1666. ALIGN_4
  /*
   * .L68: write-back for the .L60 path.  Combines (xmm0,xmm1), scales by
   * xmm14 (ALPHA_R) and xmm15 (ALPHA_I), optionally adds the current C
   * values and stores 4 * SIZE to CO1.
   */
  1667. .L68:
  /* Variant-dependent combine: shufps $0xb1 swaps the 32-bit lanes within
     each 64-bit half; addsubps subtracts in even lanes, adds in odd lanes. */
  1668. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1669. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1670. shufps $0xb1, %xmm1, %xmm1
  1671. addsubps %xmm1, %xmm0
  1672. movaps %xmm0, %xmm1
  1673. shufps $0xb1, %xmm0, %xmm0
  1674. #else
  1675. shufps $0xb1, %xmm0, %xmm0
  1676. addsubps %xmm0, %xmm1
  1677. movaps %xmm1, %xmm0
  1678. shufps $0xb1, %xmm1, %xmm1
  1679. #endif
  /* Scale by alpha and sum into xmm0. */
  1680. mulps %xmm14, %xmm1
  1681. mulps %xmm15, %xmm0
  1682. addps %xmm1, %xmm0
  /* Non-TRMM build without BETAZERO: accumulate into the existing C values. */
  1683. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1684. movsd 0 * SIZE(CO1), %xmm8
  1685. movhps 2 * SIZE(CO1), %xmm8
  1686. addps %xmm8, %xmm0
  1687. #endif
  1688. movsd %xmm0, 0 * SIZE(CO1)
  1689. movhps %xmm0, 2 * SIZE(CO1)
  /* TRMM variants that ran only KKK iterations: skip the remaining
     K - KKK iterations (AO += 16 bytes each, BO += 16 bytes each). */
  1690. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1691. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1692. movq K, %rax
  1693. subq KKK, %rax
  1694. leaq (,%rax, 8), %rax
  1695. leaq (AO, %rax, 2), AO
  1696. leaq (BO, %rax, 2), BO
  1697. #endif
  1698. #if defined(TRMMKERNEL) && defined(LEFT)
  1699. addq $2, KK
  1700. #endif
  1701. addq $4 * SIZE, CO1 # coffset += 4
  1702. ALIGN_4
  /*
   * .L70: single-column pass, path taken when bit 0 of M is set (otherwise
   * jump to the exit at .L999).  Positions BO/AO, zeroes xmm0/xmm1,
   * preloads operands and computes the iteration count.
   */
  1703. .L70:
  1704. testq $1, M
  1705. je .L999
  /* BO restarts at BUFFER.  In the #else TRMM variants the first KK
     iterations are skipped: AO += KK * 8 bytes, BO += KK * 16 bytes. */
  1706. #if !defined(TRMMKERNEL) || \
  1707. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1708. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1709. leaq BUFFER, BO
  1710. #else
  1711. leaq BUFFER, BO
  1712. movq KK, %rax
  1713. leaq (, %rax, 8), %rax
  1714. leaq (AO, %rax, 1), AO
  1715. leaq (BO, %rax, 2), BO
  1716. #endif
  /* movddup loads 64 bits of A and copies them into both register halves;
     movsd loads 64 bits of BO into the low half. */
  1717. movddup 0 * SIZE(AO), %xmm8
  1718. pxor %xmm0, %xmm0
  1719. movsd 0 * SIZE(BO), %xmm9
  1720. pxor %xmm1, %xmm1
  1721. movddup 8 * SIZE(AO), %xmm10
  1722. movsd 16 * SIZE(BO), %xmm11
  /* Iteration count in rax: K for plain GEMM; K - KK or KK + 1 for the
     TRMM variants, also saved in KKK.  Both LEFT branches add 1 here. */
  1723. #ifndef TRMMKERNEL
  1724. movq K, %rax
  1725. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1726. movq K, %rax
  1727. subq KK, %rax
  1728. movq %rax, KKK
  1729. #else
  1730. movq KK, %rax
  1731. #ifdef LEFT
  1732. addq $1, %rax
  1733. #else
  1734. addq $1, %rax
  1735. #endif
  1736. movq %rax, KKK
  1737. #endif
  /* rax = count / 8 trips through the unrolled loop; skip it if zero. */
  1738. sarq $3, %rax
  1739. je .L75
  1740. ALIGN_4
  /*
   * .L72: main loop of the .L70 path, unrolled over eight iterations.
   * shufps $0x50 expands the two low lanes (b0, b1) of a B register to
   * (b0, b0, b1, b1); the result is multiplied by the duplicated A pair in
   * xmm8/xmm10.  Successive iterations alternate between the accumulators
   * xmm0 and xmm1, which are added together at .L78.
   * The loads from 16/24 * SIZE(AO) and 32/48 * SIZE(BO) fetch the
   * operands of the next trip before AO/BO advance by 16/32 * SIZE.
   */
  1741. .L72:
  /* iterations 0-3: A through xmm8, B through xmm9 */
  1742. shufps $0x50, %xmm9, %xmm9
  1743. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1744. mulps %xmm8, %xmm9
  1745. movddup 2 * SIZE(AO), %xmm8
  1746. addps %xmm9, %xmm0
  1747. movsd 4 * SIZE(BO), %xmm9
  1748. shufps $0x50, %xmm9, %xmm9
  1749. mulps %xmm8, %xmm9
  1750. movddup 4 * SIZE(AO), %xmm8
  1751. addps %xmm9, %xmm1
  1752. movsd 8 * SIZE(BO), %xmm9
  1753. shufps $0x50, %xmm9, %xmm9
  1754. mulps %xmm8, %xmm9
  1755. movddup 6 * SIZE(AO), %xmm8
  1756. addps %xmm9, %xmm0
  1757. movsd 12 * SIZE(BO), %xmm9
  1758. shufps $0x50, %xmm9, %xmm9
  1759. mulps %xmm8, %xmm9
  1760. movddup 16 * SIZE(AO), %xmm8
  1761. addps %xmm9, %xmm1
  1762. movsd 32 * SIZE(BO), %xmm9
  /* iterations 4-7: A through xmm10, B through xmm11 */
  1763. shufps $0x50, %xmm11, %xmm11
  1764. mulps %xmm10, %xmm11
  1765. movddup 10 * SIZE(AO), %xmm10
  1766. addps %xmm11, %xmm0
  1767. movsd 20 * SIZE(BO), %xmm11
  1768. shufps $0x50, %xmm11, %xmm11
  1769. mulps %xmm10, %xmm11
  1770. movddup 12 * SIZE(AO), %xmm10
  1771. addps %xmm11, %xmm1
  1772. movsd 24 * SIZE(BO), %xmm11
  1773. shufps $0x50, %xmm11, %xmm11
  1774. mulps %xmm10, %xmm11
  1775. movddup 14 * SIZE(AO), %xmm10
  1776. addps %xmm11, %xmm0
  1777. movsd 28 * SIZE(BO), %xmm11
  1778. shufps $0x50, %xmm11, %xmm11
  1779. mulps %xmm10, %xmm11
  1780. movddup 24 * SIZE(AO), %xmm10
  1781. addps %xmm11, %xmm1
  1782. movsd 48 * SIZE(BO), %xmm11
  /* Advance past the eight consumed iterations and repeat while rax != 0. */
  1783. addq $16 * SIZE, AO
  1784. addq $32 * SIZE, BO
  1785. decq %rax
  1786. jne .L72
  1787. ALIGN_4
  /*
   * .L75: remainder handling for the .L70 path.  Reloads the iteration
   * count (K, or KKK for TRMM), loads ALPHA_R/ALPHA_I into xmm14/xmm15 for
   * the write-back at .L78, and runs .L76 once per leftover iteration
   * (count & 7).
   */
  1788. .L75:
  1789. #ifndef TRMMKERNEL
  1790. movq K, %rax
  1791. #else
  1792. movq KKK, %rax
  1793. #endif
  1794. movaps ALPHA_R, %xmm14
  1795. movaps ALPHA_I, %xmm15
  1796. andq $7, %rax # rax = count & 7 (iterations left after the unrolled loop)
  1797. BRANCH
  1798. je .L78
  1799. ALIGN_4
  /* .L76: one iteration per trip, accumulating into xmm0 only;
     AO advances 2 * SIZE, BO advances 4 * SIZE. */
  1800. .L76:
  1801. shufps $0x50, %xmm9, %xmm9
  1802. mulps %xmm8, %xmm9
  /* Fetch the operands of the next iteration before the pointers move. */
  1803. movddup 2 * SIZE(AO), %xmm8
  1804. addps %xmm9, %xmm0
  1805. movsd 4 * SIZE(BO), %xmm9
  1806. addq $2 * SIZE, AO
  1807. addq $4 * SIZE, BO
  1808. decq %rax
  1809. jg .L76
  1810. ALIGN_4
  /*
   * .L78: write-back for the .L70 path.  Sums the two accumulators, splits
   * the sum into its low half (xmm0) and high half (low lanes of xmm1),
   * applies the variant-dependent sign flip and combine, scales by xmm14
   * (ALPHA_R) and xmm15 (ALPHA_I), optionally adds the current C value and
   * stores the low 2 * SIZE of xmm0 to CO1.
   */
  1811. .L78:
  1812. addps %xmm1, %xmm0
  1813. movhlps %xmm0, %xmm1
  /* Variants listed below negate xmm1 by XOR with a per-lane sign-bit mask.
     The mask is built with pcmpeqd (integer compare, always all-ones) and a
     31-bit left shift.  xmm7 is not initialised on this path, so a
     floating-point self-compare (cmpeqps) would leave a zero mask lane
     wherever the stale contents happen to be NaN. */
  1814. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  1815. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1816. pcmpeqd %xmm7, %xmm7
  1817. pslld $31, %xmm7
  1818. xorps %xmm7, %xmm1
  1819. #endif
  /* Variant-dependent combine: shufps $0xb1 swaps the 32-bit lanes within
     each 64-bit half; addsubps subtracts in even lanes, adds in odd lanes. */
  1820. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1821. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1822. shufps $0xb1, %xmm1, %xmm1
  1823. addsubps %xmm1, %xmm0
  1824. movaps %xmm0, %xmm1
  1825. shufps $0xb1, %xmm0, %xmm0
  1826. #else
  1827. shufps $0xb1, %xmm0, %xmm0
  1828. addsubps %xmm0, %xmm1
  1829. movaps %xmm1, %xmm0
  1830. shufps $0xb1, %xmm1, %xmm1
  1831. #endif
  /* Scale by alpha and sum into xmm0. */
  1832. mulps %xmm14, %xmm1
  1833. mulps %xmm15, %xmm0
  1834. addps %xmm1, %xmm0
  /* Non-TRMM build without BETAZERO: accumulate into the existing C value. */
  1835. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1836. movsd 0 * SIZE(CO1), %xmm8
  1837. addps %xmm8, %xmm0
  1838. #endif
  1839. movsd %xmm0, 0 * SIZE(CO1)
  1840. ALIGN_4
  /*
   * .L999: common exit.  Restores rsp from rbx, reloads the saved
   * registers from the stack frame, releases STACKSIZE bytes and returns.
   * NOTE(review): rbx is presumably set to the original stack pointer by
   * the prologue, which is outside this chunk -- confirm.
   */
  1841. .L999:
  1842. movq %rbx, %rsp
  /* Integer registers saved at offsets 0..40 of the frame. */
  1843. movq 0(%rsp), %rbx
  1844. movq 8(%rsp), %rbp
  1845. movq 16(%rsp), %r12
  1846. movq 24(%rsp), %r13
  1847. movq 32(%rsp), %r14
  1848. movq 40(%rsp), %r15
  /* Windows ABI builds additionally restore rdi, rsi and xmm6-xmm15
     (unaligned loads from offsets 48..208). */
  1849. #ifdef WINDOWS_ABI
  1850. movq 48(%rsp), %rdi
  1851. movq 56(%rsp), %rsi
  1852. movups 64(%rsp), %xmm6
  1853. movups 80(%rsp), %xmm7
  1854. movups 96(%rsp), %xmm8
  1855. movups 112(%rsp), %xmm9
  1856. movups 128(%rsp), %xmm10
  1857. movups 144(%rsp), %xmm11
  1858. movups 160(%rsp), %xmm12
  1859. movups 176(%rsp), %xmm13
  1860. movups 192(%rsp), %xmm14
  1861. movups 208(%rsp), %xmm15
  1862. #endif
  1863. addq $STACKSIZE, %rsp
  1864. ret
  /* EPILOGUE is a macro defined outside this chunk. */
  1865. EPILOGUE