You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

cgemm_kernel_4x2_bulldozer.S 53 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define OLD_M %rdi
  41. #define OLD_N %rsi
  42. #define M %r13
  43. #define J %r14
  44. #define OLD_K %rdx
  45. #define A %rcx
  46. #define B %r8
  47. #define C %r9
  48. #define LDC %r10
  49. #define I %r11
  50. #define AO %rdi
  51. #define BO %rsi
  52. #define CO1 %r15
  53. #define K %r12
  54. #define BI %rbp
  55. #define SP %rbx
  56. #define BO1 %rdi
  57. #define BO2 %r15
  58. #ifndef WINDOWS_ABI
  59. #define STACKSIZE 96
  60. #else
  61. #define STACKSIZE 320
  62. #define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
  63. #define OLD_A 48 + STACKSIZE(%rsp)
  64. #define OLD_B 56 + STACKSIZE(%rsp)
  65. #define OLD_C 64 + STACKSIZE(%rsp)
  66. #define OLD_LDC 72 + STACKSIZE(%rsp)
  67. #define OLD_OFFSET 80 + STACKSIZE(%rsp)
  68. #endif
  69. #define L_BUFFER_SIZE 8192
  70. #define Ndiv6 24(%rsp)
  71. #define Nmod6 32(%rsp)
  72. #define N 40(%rsp)
  73. #define ALPHA_R 48(%rsp)
  74. #define ALPHA_I 56(%rsp)
  75. #define OFFSET 64(%rsp)
  76. #define KK 72(%rsp)
  77. #define KKK 80(%rsp)
  78. #define BUFFER1 128(%rsp)
  79. #define BUFFER2 LB2_OFFSET+128(%rsp)
  80. #if defined(OS_WINDOWS)
  81. #if L_BUFFER_SIZE > 16384
  82. #define STACK_TOUCH \
  83. movl $0, 4096 * 4(%rsp);\
  84. movl $0, 4096 * 3(%rsp);\
  85. movl $0, 4096 * 2(%rsp);\
  86. movl $0, 4096 * 1(%rsp);
  87. #elif L_BUFFER_SIZE > 12288
  88. #define STACK_TOUCH \
  89. movl $0, 4096 * 3(%rsp);\
  90. movl $0, 4096 * 2(%rsp);\
  91. movl $0, 4096 * 1(%rsp);
  92. #elif L_BUFFER_SIZE > 8192
  93. #define STACK_TOUCH \
  94. movl $0, 4096 * 2(%rsp);\
  95. movl $0, 4096 * 1(%rsp);
  96. #elif L_BUFFER_SIZE > 4096
  97. #define STACK_TOUCH \
  98. movl $0, 4096 * 1(%rsp);
  99. #else
  100. #define STACK_TOUCH
  101. #endif
  102. #else
  103. #define STACK_TOUCH
  104. #endif
  105. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  106. #define VFMADD_R vfmaddps
  107. #define VFMADD_I vfmaddps
  108. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  109. #define VFMADD_R vfnmaddps
  110. #define VFMADD_I vfmaddps
  111. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  112. #define VFMADD_R vfmaddps
  113. #define VFMADD_I vfnmaddps
  114. #else
  115. #define VFMADD_R vfnmaddps
  116. #define VFMADD_I vfnmaddps
  117. #endif
  118. #define A_PR1 384
  119. #define B_PR1 192
  120. #define KERNEL4x2_1(xx) \
  121. prefetcht0 A_PR1(AO,%rax,SIZE) ;\
  122. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  123. vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
  124. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  125. vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\
  126. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  127. vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
  128. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  129. VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
  130. vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
  131. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  132. VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\
  133. vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
  134. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  135. VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\
  136. #define KERNEL4x2_2(xx) \
  137. vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  138. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  139. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  140. vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\
  141. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  142. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  143. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  144. VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
  145. vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\
  146. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  147. VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\
  148. vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\
  149. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  150. VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\
  151. #define KERNEL4x2_3(xx) \
  152. prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\
  153. vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  154. vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\
  155. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  156. vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\
  157. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  158. vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\
  159. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  160. VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
  161. vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\
  162. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  163. VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\
  164. vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\
  165. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  166. VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\
  167. #define KERNEL4x2_4(xx) \
  168. vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  169. vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  170. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  171. vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\
  172. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  173. vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\
  174. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  175. VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
  176. vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\
  177. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  178. VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\
  179. vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\
  180. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  181. VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\
  182. addq $16, BI ;\
  183. addq $32, %rax ;\
  184. #define KERNEL4x2_SUB(xx) \
  185. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  186. vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
  187. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  188. vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\
  189. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  190. vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
  191. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  192. VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
  193. vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
  194. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  195. VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\
  196. vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
  197. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  198. VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\
  199. addq $4, BI ;\
  200. addq $8, %rax ;\
  201. /************************************************************************************************/
  202. #define KERNEL2x2_1(xx) \
  203. prefetcht0 A_PR1(AO,%rax,SIZE) ;\
  204. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  205. vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
  206. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  207. vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
  208. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  209. vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
  210. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  211. vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
  212. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  213. #define KERNEL2x2_2(xx) \
  214. vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  215. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  216. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  217. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  218. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  219. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  220. vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\
  221. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  222. vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\
  223. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  224. #define KERNEL2x2_3(xx) \
  225. vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  226. vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\
  227. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  228. vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\
  229. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  230. vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\
  231. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  232. vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\
  233. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  234. #define KERNEL2x2_4(xx) \
  235. vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  236. vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  237. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  238. vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\
  239. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  240. vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\
  241. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  242. vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\
  243. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  244. addq $16, BI ;\
  245. addq $16, %rax ;\
  246. #define KERNEL2x2_SUB(xx) \
  247. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  248. vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
  249. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  250. vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
  251. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  252. vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
  253. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  254. vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
  255. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  256. addq $4, BI ;\
  257. addq $4, %rax ;\
  258. /************************************************************************************************/
  259. #define KERNEL1x2_1(xx) \
  260. vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  261. vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
  262. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  263. vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
  264. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  265. vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
  266. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  267. vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
  268. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  269. #define KERNEL1x2_2(xx) \
  270. vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  271. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  272. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  273. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  274. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  275. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  276. vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\
  277. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  278. vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\
  279. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  280. #define KERNEL1x2_3(xx) \
  281. vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  282. vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\
  283. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  284. vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\
  285. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  286. vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\
  287. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  288. vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\
  289. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  290. #define KERNEL1x2_4(xx) \
  291. vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  292. vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  293. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  294. vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\
  295. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  296. vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\
  297. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  298. vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\
  299. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  300. addq $16, BI ;\
  301. addq $8, %rax ;\
  302. #define KERNEL1x2_SUB(xx) \
  303. vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  304. vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
  305. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  306. vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
  307. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  308. vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
  309. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  310. vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
  311. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  312. addq $4, BI ;\
  313. addq $2, %rax ;\
  314. /************************************************************************************************/
  315. #define KERNEL4x1_1(xx) \
  316. prefetcht0 A_PR1(AO,%rax,SIZE) ;\
  317. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  318. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  319. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  320. vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\
  321. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  322. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  323. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  324. VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
  325. #define KERNEL4x1_2(xx) \
  326. vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  327. vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\
  328. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  329. vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\
  330. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  331. vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\
  332. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  333. VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
  334. #define KERNEL4x1_3(xx) \
  335. prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\
  336. vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  337. vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\
  338. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  339. vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\
  340. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  341. vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\
  342. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  343. VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
  344. #define KERNEL4x1_4(xx) \
  345. vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  346. vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\
  347. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  348. vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\
  349. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  350. vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  351. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  352. VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
  353. addq $8, BI ;\
  354. addq $32, %rax ;\
  355. #define KERNEL4x1_SUB(xx) \
  356. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  357. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  358. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  359. vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\
  360. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  361. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  362. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  363. VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
  364. addq $2, BI ;\
  365. addq $8, %rax ;\
  366. /************************************************************************************************/
  367. #define KERNEL2x1_1(xx) \
  368. prefetcht0 A_PR1(AO,%rax,SIZE) ;\
  369. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  370. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  371. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  372. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  373. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  374. #define KERNEL2x1_2(xx) \
  375. vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  376. vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\
  377. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  378. vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\
  379. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  380. #define KERNEL2x1_3(xx) \
  381. vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  382. vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\
  383. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  384. vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\
  385. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  386. #define KERNEL2x1_4(xx) \
  387. vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  388. vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\
  389. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  390. vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  391. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  392. addq $8, BI ;\
  393. addq $16, %rax ;\
  394. #define KERNEL2x1_SUB(xx) \
  395. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  396. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  397. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  398. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  399. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  400. addq $2, BI ;\
  401. addq $4, %rax ;\
  402. /************************************************************************************************/
  403. #define KERNEL1x1_1(xx) \
  404. vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  405. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  406. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  407. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  408. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  409. #define KERNEL1x1_2(xx) \
  410. vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  411. vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\
  412. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  413. vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\
  414. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  415. #define KERNEL1x1_3(xx) \
  416. vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  417. vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\
  418. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  419. vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\
  420. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  421. #define KERNEL1x1_4(xx) \
  422. vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  423. vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\
  424. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  425. vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  426. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  427. addq $8, BI ;\
  428. addq $8, %rax ;\
  429. #define KERNEL1x1_SUB(xx) \
  430. vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  431. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  432. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  433. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  434. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  435. addq $2, BI ;\
  436. addq $2, %rax ;\
  437. /************************************************************************************************/
  438. PROLOGUE
  439. PROFCODE
  440. subq $STACKSIZE, %rsp
  441. movq %rbx, (%rsp)
  442. movq %rbp, 8(%rsp)
  443. movq %r12, 16(%rsp)
  444. movq %r13, 24(%rsp)
  445. movq %r14, 32(%rsp)
  446. movq %r15, 40(%rsp)
  447. vzeroupper
  448. #ifdef WINDOWS_ABI
  449. movq %rdi, 48(%rsp)
  450. movq %rsi, 56(%rsp)
  451. vmovups %xmm6, 64(%rsp)
  452. vmovups %xmm7, 80(%rsp)
  453. vmovups %xmm8, 96(%rsp)
  454. vmovups %xmm9, 112(%rsp)
  455. vmovups %xmm10, 128(%rsp)
  456. vmovups %xmm11, 144(%rsp)
  457. vmovups %xmm12, 160(%rsp)
  458. vmovups %xmm13, 176(%rsp)
  459. vmovups %xmm14, 192(%rsp)
  460. vmovups %xmm15, 208(%rsp)
  461. movq ARG1, OLD_M
  462. movq ARG2, OLD_N
  463. movq ARG3, OLD_K
  464. movq OLD_A, A
  465. movq OLD_B, B
  466. movq OLD_C, C
  467. movq OLD_LDC, LDC
  468. #ifdef TRMMKERNEL
  469. vmovsd OLD_OFFSET, %xmm12
  470. #endif
  471. vmovaps %xmm3, %xmm0
  472. vmovsd OLD_ALPHA_I, %xmm1
  473. #else
  474. movq STACKSIZE + 8(%rsp), LDC
  475. #ifdef TRMMKERNEL
  476. vmovsd STACKSIZE + 16(%rsp), %xmm12
  477. #endif
  478. #endif
  479. movq %rsp, SP # save old stack
  480. subq $128 + L_BUFFER_SIZE, %rsp
  481. andq $-4096, %rsp # align stack
  482. STACK_TOUCH
  483. cmpq $0, OLD_M
  484. je .L999
  485. cmpq $0, OLD_N
  486. je .L999
  487. cmpq $0, OLD_K
  488. je .L999
  489. movq OLD_M, M
  490. movq OLD_N, N
  491. movq OLD_K, K
  492. vmovss %xmm0, ALPHA_R
  493. vmovss %xmm1, ALPHA_I
  494. salq $ZBASE_SHIFT, LDC
  495. movq N, %rax
  496. xorq %rdx, %rdx
  497. movq $2, %rdi
  498. divq %rdi // N / 2
  499. movq %rax, Ndiv6 // N / 2
  500. movq %rdx, Nmod6 // N % 2
  501. #ifdef TRMMKERNEL
  502. vmovsd %xmm12, OFFSET
  503. vmovsd %xmm12, KK
  504. #ifndef LEFT
  505. negq KK
  506. #endif
  507. #endif
  508. .L2_0:
  509. movq Ndiv6, J
  510. cmpq $0, J
  511. je .L1_0
  512. ALIGN_4
  513. .L2_01:
  514. // copy to sub buffer
  515. movq B, BO1
  516. leaq BUFFER1, BO // first buffer to BO
  517. movq K, %rax
  518. ALIGN_4
  519. .L2_02b:
  520. vmovups (BO1), %xmm0
  521. vmovups %xmm0, (BO)
  522. addq $4*SIZE,BO1
  523. addq $4*SIZE,BO
  524. decq %rax
  525. jnz .L2_02b
  526. .L2_02c:
  527. movq BO1, B // next offset of B
  528. .L2_10:
  529. movq C, CO1
  530. leaq (C, LDC, 2), C // c += 2 * ldc
  531. #if defined(TRMMKERNEL) && defined(LEFT)
  532. movq OFFSET, %rax
  533. movq %rax, KK
  534. #endif
  535. movq A, AO // aoffset = a
  536. addq $16 * SIZE, AO
  537. movq M, I
  538. sarq $2, I // i = (m >> 2)
  539. je .L2_20
  540. ALIGN_4
  541. .L2_11:
  542. #if !defined(TRMMKERNEL) || \
  543. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  544. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  545. leaq BUFFER1, BO // first buffer to BO
  546. addq $8 * SIZE, BO
  547. #else
  548. movq KK, %rax
  549. leaq BUFFER1, BO // first buffer to BO
  550. addq $8 * SIZE, BO
  551. movq %rax, BI // Index for BO
  552. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  553. leaq (BO, BI, SIZE), BO
  554. salq $3, %rax // rax = rax * 8 ; number of values
  555. leaq (AO, %rax, SIZE), AO
  556. #endif
  557. vzeroall
  558. #ifndef TRMMKERNEL
  559. movq K, %rax
  560. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  561. movq K, %rax
  562. subq KK, %rax
  563. movq %rax, KKK
  564. #else
  565. movq KK, %rax
  566. #ifdef LEFT
  567. addq $4, %rax // number of values in AO
  568. #else
  569. addq $2, %rax // number of values in BO
  570. #endif
  571. movq %rax, KKK
  572. #endif
  573. andq $-8, %rax // K = K - ( K % 8 )
  574. je .L2_16
  575. movq %rax, BI // Index for BO
  576. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  577. salq $3, %rax // rax = rax * 8 ; number of values
  578. leaq (AO, %rax, SIZE), AO
  579. leaq (BO, BI, SIZE), BO
  580. negq BI
  581. negq %rax
  582. ALIGN_4
  583. .L2_12:
  584. prefetcht0 B_PR1(BO,BI,SIZE)
  585. KERNEL4x2_1(xxx)
  586. KERNEL4x2_2(xxx)
  587. KERNEL4x2_3(xxx)
  588. KERNEL4x2_4(xxx)
  589. prefetcht0 B_PR1(BO,BI,SIZE)
  590. KERNEL4x2_1(xxx)
  591. KERNEL4x2_2(xxx)
  592. KERNEL4x2_3(xxx)
  593. KERNEL4x2_4(xxx)
  594. je .L2_16
  595. prefetcht0 B_PR1(BO,BI,SIZE)
  596. KERNEL4x2_1(xxx)
  597. KERNEL4x2_2(xxx)
  598. KERNEL4x2_3(xxx)
  599. KERNEL4x2_4(xxx)
  600. prefetcht0 B_PR1(BO,BI,SIZE)
  601. KERNEL4x2_1(xxx)
  602. KERNEL4x2_2(xxx)
  603. KERNEL4x2_3(xxx)
  604. KERNEL4x2_4(xxx)
  605. je .L2_16
  606. jmp .L2_12
  607. ALIGN_4
  608. .L2_16:
  609. #ifndef TRMMKERNEL
  610. movq K, %rax
  611. #else
  612. movq KKK, %rax
  613. #endif
  614. andq $7, %rax # if (k & 1)
  615. je .L2_19
  616. movq %rax, BI // Index for BO
  617. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  618. salq $3, %rax // rax = rax * 8 ; number of values
  619. leaq (AO, %rax, SIZE), AO
  620. leaq (BO, BI, SIZE), BO
  621. negq BI
  622. negq %rax
  623. ALIGN_4
  624. .L2_17:
  625. KERNEL4x2_SUB(xxx)
  626. jl .L2_17
  627. ALIGN_4
  628. .L2_19:
  // 4x2 tile write-back: combine the accumulator pairs (xmm8/9, xmm10/11,
  // xmm12/13, xmm14/15), scale by alpha, and store two vectors to each of
  // the rows at CO1 and CO1+LDC.
  // Broadcast ALPHA_R / ALPHA_I into every lane of xmm0 / xmm1.
  629. vbroadcastss ALPHA_R, %xmm0
  630. vbroadcastss ALPHA_I, %xmm1
  631. // swap the two 32-bit floats inside each 64-bit pair (lane order 1,0,3,2)
  632. vshufps $0xb1, %xmm9 , %xmm9, %xmm9
  633. vshufps $0xb1, %xmm11, %xmm11, %xmm11
  634. vshufps $0xb1, %xmm13, %xmm13, %xmm13
  635. vshufps $0xb1, %xmm15, %xmm15, %xmm15
  636. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  637. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  // even-numbered accumulator -/+ shuffled odd one (subtract in even lanes,
  // add in odd lanes); result stays in the even-numbered register
  638. vaddsubps %xmm9, %xmm8 , %xmm8
  639. vaddsubps %xmm11,%xmm10, %xmm10
  640. vaddsubps %xmm13,%xmm12, %xmm12
  641. vaddsubps %xmm15,%xmm14, %xmm14
  // odd-numbered register = pair-swapped copy of the combined result
  642. vshufps $0xb1, %xmm8 , %xmm8, %xmm9
  643. vshufps $0xb1, %xmm10, %xmm10, %xmm11
  644. vshufps $0xb1, %xmm12, %xmm12, %xmm13
  645. vshufps $0xb1, %xmm14, %xmm14, %xmm15
  646. #else
  // other variants: operand roles reversed (shuffled odd -/+ even), result
  // built in the odd-numbered register and then copied to the even one
  647. vaddsubps %xmm8, %xmm9 ,%xmm9
  648. vaddsubps %xmm10, %xmm11,%xmm11
  649. vaddsubps %xmm12, %xmm13,%xmm13
  650. vaddsubps %xmm14, %xmm15,%xmm15
  651. vmovaps %xmm9, %xmm8
  652. vmovaps %xmm11, %xmm10
  653. vmovaps %xmm13, %xmm12
  654. vmovaps %xmm15, %xmm14
  655. // swap the two 32-bit floats inside each 64-bit pair (lane order 1,0,3,2)
  656. vshufps $0xb1, %xmm9 , %xmm9, %xmm9
  657. vshufps $0xb1, %xmm11, %xmm11, %xmm11
  658. vshufps $0xb1, %xmm13, %xmm13, %xmm13
  659. vshufps $0xb1, %xmm15, %xmm15, %xmm15
  660. #endif
  // Here each even register holds the combined value v and the following odd
  // register holds v with each pair swapped. Reading each pair as (re, im),
  // the next steps form (re*ar - im*ai, im*ar + re*ai), i.e. alpha * v.
  661. // multiply with ALPHA_R
  662. vmulps %xmm8 , %xmm0, %xmm8
  663. vmulps %xmm10, %xmm0, %xmm10
  664. vmulps %xmm12, %xmm0, %xmm12
  665. vmulps %xmm14, %xmm0, %xmm14
  666. // multiply with ALPHA_I
  667. vmulps %xmm9 , %xmm1, %xmm9
  668. vmulps %xmm11, %xmm1, %xmm11
  669. vmulps %xmm13, %xmm1, %xmm13
  670. vmulps %xmm15, %xmm1, %xmm15
  671. vaddsubps %xmm9, %xmm8 , %xmm8
  672. vaddsubps %xmm11,%xmm10, %xmm10
  673. vaddsubps %xmm13,%xmm12, %xmm12
  674. vaddsubps %xmm15,%xmm14, %xmm14
  // Non-TRMM builds add the values already stored at the destination.
  675. #ifndef TRMMKERNEL
  676. vaddps (CO1), %xmm8 , %xmm8
  677. vaddps 4 * SIZE(CO1), %xmm12, %xmm12
  678. vaddps (CO1, LDC), %xmm10, %xmm10
  679. vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14
  680. #endif
  681. vmovups %xmm8 , (CO1)
  682. vmovups %xmm12 , 4 * SIZE(CO1)
  683. vmovups %xmm10 , (CO1, LDC)
  684. vmovups %xmm14 , 4 * SIZE(CO1, LDC)
  685. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  686. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  // TRMM: step AO/BO over the K - KKK k-iterations this tile did not process
  // (4 values per iteration in BO, 8 in AO).
  687. movq K, %rax
  688. subq KKK, %rax
  689. movq %rax, BI // Index for BO
  690. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  691. leaq (BO, BI, SIZE), BO
  692. salq $3, %rax // rax = rax * 8 ; number of values
  693. leaq (AO, %rax, SIZE), AO
  694. #endif
  695. #if defined(TRMMKERNEL) && defined(LEFT)
  696. addq $4, KK
  697. #endif
  698. addq $8 * SIZE, CO1 # coffset += 8
  699. decq I # i --
  700. jg .L2_11 // more 4-row tiles remain (label is outside this chunk)
  701. ALIGN_4
  702. /**************************************************************************
  703. * Rest of M
  704. ***************************************************************************/
  705. .L2_20:
  // Leftover rows for this pair of columns: M & 3 selects a 2-row tile
  // (bit 1) and/or a 1-row tile (bit 0).
  706. testq $3, M
  707. jz .L2_60 // to next 2 lines of N
  708. testq $2, M
  709. jz .L2_40 // bit 1 clear: only the single-row case can remain
  710. ALIGN_4
  711. .L2_21:
  // 2x2 tile setup: point BO at the packed B buffer, clear the accumulators
  // and work out how many k iterations to run.
  // NOTE(review): BO is set 8 values past BUFFER1; presumably the KERNEL
  // macros use a matching negative displacement -- confirm against them.
  712. #if !defined(TRMMKERNEL) || \
  713. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  714. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  715. leaq BUFFER1, BO // first buffer to BO
  716. addq $8 * SIZE, BO
  717. #else
  // remaining TRMM variants: additionally skip the first KK k-iterations
  // (KK*4 values in BO, KK*4 values in AO)
  718. movq KK, %rax
  719. leaq BUFFER1, BO // first buffer to BO
  720. addq $8 * SIZE, BO
  721. movq %rax, BI // Index for BO
  722. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  723. leaq (BO, BI, SIZE), BO
  724. salq $2, %rax // rax = rax * 4 ; number of values
  725. leaq (AO, %rax, SIZE), AO
  726. #endif
  727. vzeroall // zero every vector register, including the accumulators
  // k-iteration count -> rax: K for GEMM; for TRMM either K - KK or
  // KK + tile size, also saved in KKK for the later remainder/fix-up code.
  728. #ifndef TRMMKERNEL
  729. movq K, %rax
  730. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  731. movq K, %rax
  732. subq KK, %rax
  733. movq %rax, KKK
  734. #else
  735. movq KK, %rax
  736. #ifdef LEFT
  737. addq $2, %rax // number of values in AO
  738. #else
  739. addq $2, %rax // number of values in BO
  740. #endif
  741. movq %rax, KKK
  742. #endif
  743. andq $-8, %rax // K = K - ( K % 8 )
  744. je .L2_26 // fewer than 8 iterations: remainder loop only
  745. movq %rax, BI // Index for BO
  746. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  747. salq $2, %rax // rax = rax * 4 ; number of values
  // advance AO/BO past the unrolled part and negate the indices so they
  // start out negative
  748. leaq (AO, %rax, SIZE), AO
  749. leaq (BO, BI, SIZE), BO
  750. negq BI
  751. negq %rax
  752. ALIGN_4
  753. .L2_22:
  // 2x2 tile, unrolled k loop. The KERNEL2x2_* macros are defined outside
  // this chunk; presumably each does one k step and the last one in a group
  // updates BI/%rax, setting the flags that je tests -- TODO confirm.
  754. prefetcht0 B_PR1(BO,BI,SIZE)
  755. KERNEL2x2_1(xxx)
  756. KERNEL2x2_2(xxx)
  757. KERNEL2x2_3(xxx)
  758. KERNEL2x2_4(xxx)
  759. prefetcht0 B_PR1(BO,BI,SIZE)
  760. KERNEL2x2_1(xxx)
  761. KERNEL2x2_2(xxx)
  762. KERNEL2x2_3(xxx)
  763. KERNEL2x2_4(xxx)
  764. je .L2_26 // zero flag from the macros: unrolled part finished
  765. prefetcht0 B_PR1(BO,BI,SIZE)
  766. KERNEL2x2_1(xxx)
  767. KERNEL2x2_2(xxx)
  768. KERNEL2x2_3(xxx)
  769. KERNEL2x2_4(xxx)
  770. prefetcht0 B_PR1(BO,BI,SIZE)
  771. KERNEL2x2_1(xxx)
  772. KERNEL2x2_2(xxx)
  773. KERNEL2x2_3(xxx)
  774. KERNEL2x2_4(xxx)
  775. je .L2_26
  776. jmp .L2_22
  777. ALIGN_4
  778. .L2_26:
  // k remainder for the 2x2 tile: count is K, or KKK in TRMM builds.
  779. #ifndef TRMMKERNEL
  780. movq K, %rax
  781. #else
  782. movq KKK, %rax
  783. #endif
  784. andq $7, %rax # rax = count % 8 (leftover k iterations)
  785. je .L2_29 // no leftover iterations: go to the write-back
  786. movq %rax, BI // Index for BO
  787. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  788. salq $2, %rax // rax = rax * 4 ; number of values
  789. leaq (AO, %rax, SIZE), AO
  790. leaq (BO, BI, SIZE), BO
  791. negq BI
  792. negq %rax
  793. ALIGN_4
  794. .L2_27:
  // one leftover k step per pass; jl relies on flags left by the macro
  795. KERNEL2x2_SUB(xxx)
  796. jl .L2_27
  797. ALIGN_4
  798. .L2_29:
  // 2x2 tile write-back: combine xmm8/9 and xmm10/11, scale by alpha and
  // store one vector to each of the rows at CO1 and CO1+LDC.
  799. vbroadcastss ALPHA_R, %xmm0
  800. vbroadcastss ALPHA_I, %xmm1
  801. // swap the two 32-bit floats inside each 64-bit pair (lane order 1,0,3,2)
  802. vshufps $0xb1, %xmm9 , %xmm9, %xmm9
  803. vshufps $0xb1, %xmm11, %xmm11, %xmm11
  804. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  805. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  // even register -/+ shuffled odd register (subtract in even lanes, add in
  // odd lanes), then keep a pair-swapped copy in the odd register
  806. vaddsubps %xmm9, %xmm8 , %xmm8
  807. vaddsubps %xmm11,%xmm10, %xmm10
  808. vshufps $0xb1, %xmm8 , %xmm8, %xmm9
  809. vshufps $0xb1, %xmm10, %xmm10, %xmm11
  810. #else
  // other variants: operand roles reversed, result copied to the even register
  811. vaddsubps %xmm8, %xmm9 ,%xmm9
  812. vaddsubps %xmm10, %xmm11,%xmm11
  813. vmovaps %xmm9, %xmm8
  814. vmovaps %xmm11, %xmm10
  815. // swap the two 32-bit floats inside each 64-bit pair (lane order 1,0,3,2)
  816. vshufps $0xb1, %xmm9 , %xmm9, %xmm9
  817. vshufps $0xb1, %xmm11, %xmm11, %xmm11
  818. #endif
  // Reading each pair as (re, im): result = (re*ar - im*ai, im*ar + re*ai).
  819. // multiply with ALPHA_R
  820. vmulps %xmm8 , %xmm0, %xmm8
  821. vmulps %xmm10, %xmm0, %xmm10
  822. // multiply with ALPHA_I
  823. vmulps %xmm9 , %xmm1, %xmm9
  824. vmulps %xmm11, %xmm1, %xmm11
  825. vaddsubps %xmm9, %xmm8 , %xmm8
  826. vaddsubps %xmm11,%xmm10, %xmm10
  // Non-TRMM builds add the values already stored at the destination.
  827. #ifndef TRMMKERNEL
  828. vaddps (CO1), %xmm8 , %xmm8
  829. vaddps (CO1, LDC), %xmm10, %xmm10
  830. #endif
  831. vmovups %xmm8 , (CO1)
  832. vmovups %xmm10 , (CO1, LDC)
  833. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  834. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  // TRMM: step AO/BO over the K - KKK k-iterations this tile did not process
  835. movq K, %rax
  836. subq KKK, %rax
  837. movq %rax, BI // Index for BO
  838. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  839. leaq (BO, BI, SIZE), BO
  840. salq $2, %rax // rax = rax * 4 ; number of values
  841. leaq (AO, %rax, SIZE), AO
  842. #endif
  843. #if defined(TRMMKERNEL) && defined(LEFT)
  844. addq $2, KK
  845. #endif
  846. addq $4 * SIZE, CO1 # coffset += 4
  847. ALIGN_4
  848. /**************************************************************************/
  849. .L2_40:
  // Last odd row (M bit 0) for this pair of columns.
  850. testq $1, M
  851. jz .L2_60 // to next 2 lines of N
  852. ALIGN_4
  853. .L2_41:
  // 1x2 tile setup: point BO at the packed B buffer, clear the accumulators
  // and work out how many k iterations to run.
  // NOTE(review): BO is set 8 values past BUFFER1; presumably the KERNEL
  // macros use a matching negative displacement -- confirm against them.
  854. #if !defined(TRMMKERNEL) || \
  855. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  856. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  857. leaq BUFFER1, BO // first buffer to BO
  858. addq $8 * SIZE, BO
  859. #else
  // remaining TRMM variants: additionally skip the first KK k-iterations
  // (KK*4 values in BO, KK*2 values in AO)
  860. movq KK, %rax
  861. leaq BUFFER1, BO // first buffer to BO
  862. addq $8 * SIZE, BO
  863. movq %rax, BI // Index for BO
  864. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  865. leaq (BO, BI, SIZE), BO
  866. salq $1, %rax // rax = rax * 2 ; number of values
  867. leaq (AO, %rax, SIZE), AO
  868. #endif
  869. vzeroall // zero every vector register, including the accumulators
  // k-iteration count -> rax (saved in KKK for TRMM builds)
  870. #ifndef TRMMKERNEL
  871. movq K, %rax
  872. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  873. movq K, %rax
  874. subq KK, %rax
  875. movq %rax, KKK
  876. #else
  877. movq KK, %rax
  878. #ifdef LEFT
  879. addq $1, %rax // number of values in AO
  880. #else
  881. addq $2, %rax // number of values in BO
  882. #endif
  883. movq %rax, KKK
  884. #endif
  885. andq $-8, %rax // K = K - ( K % 8 )
  886. je .L2_46 // fewer than 8 iterations: remainder loop only
  887. movq %rax, BI // Index for BO
  888. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  889. salq $1, %rax // rax = rax * 2 ; number of values
  // advance AO/BO past the unrolled part and negate the indices so they
  // start out negative
  890. leaq (AO, %rax, SIZE), AO
  891. leaq (BO, BI, SIZE), BO
  892. negq BI
  893. negq %rax
  894. ALIGN_4
  895. .L2_42:
  // 1x2 tile, unrolled k loop. The KERNEL1x2_* macros are defined outside
  // this chunk; presumably each does one k step and the last one in a group
  // updates BI/%rax, setting the flags that je tests -- TODO confirm.
  896. prefetcht0 B_PR1(BO,BI,SIZE)
  897. KERNEL1x2_1(xxx)
  898. KERNEL1x2_2(xxx)
  899. KERNEL1x2_3(xxx)
  900. KERNEL1x2_4(xxx)
  901. prefetcht0 B_PR1(BO,BI,SIZE)
  902. KERNEL1x2_1(xxx)
  903. KERNEL1x2_2(xxx)
  904. KERNEL1x2_3(xxx)
  905. KERNEL1x2_4(xxx)
  906. je .L2_46 // zero flag from the macros: unrolled part finished
  907. prefetcht0 B_PR1(BO,BI,SIZE)
  908. KERNEL1x2_1(xxx)
  909. KERNEL1x2_2(xxx)
  910. KERNEL1x2_3(xxx)
  911. KERNEL1x2_4(xxx)
  912. prefetcht0 B_PR1(BO,BI,SIZE)
  913. KERNEL1x2_1(xxx)
  914. KERNEL1x2_2(xxx)
  915. KERNEL1x2_3(xxx)
  916. KERNEL1x2_4(xxx)
  917. je .L2_46
  918. jmp .L2_42
  919. ALIGN_4
  920. .L2_46:
  // k remainder for the 1x2 tile: count is K, or KKK in TRMM builds.
  921. #ifndef TRMMKERNEL
  922. movq K, %rax
  923. #else
  924. movq KKK, %rax
  925. #endif
  926. andq $7, %rax # rax = count % 8 (leftover k iterations)
  927. je .L2_49 // no leftover iterations: go to the write-back
  928. movq %rax, BI // Index for BO
  929. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  930. salq $1, %rax // rax = rax * 2 ; number of values
  931. leaq (AO, %rax, SIZE), AO
  932. leaq (BO, BI, SIZE), BO
  933. negq BI
  934. negq %rax
  935. ALIGN_4
  936. .L2_47:
  // one leftover k step per pass; jl relies on flags left by the macro
  937. KERNEL1x2_SUB(xxx)
  938. jl .L2_47
  939. ALIGN_4
  940. .L2_49:
  // 1x2 tile write-back: combine xmm8/9 and xmm10/11, scale by alpha and
  // store the low 64 bits of each result to CO1 and CO1+LDC.
  941. vbroadcastss ALPHA_R, %xmm0
  942. vbroadcastss ALPHA_I, %xmm1
  943. // swap the two 32-bit floats inside each 64-bit pair (lane order 1,0,3,2)
  944. vshufps $0xb1, %xmm9 , %xmm9, %xmm9
  945. vshufps $0xb1, %xmm11, %xmm11, %xmm11
  946. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  947. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  // even register -/+ shuffled odd register (subtract in even lanes, add in
  // odd lanes), then keep a pair-swapped copy in the odd register
  948. vaddsubps %xmm9, %xmm8 , %xmm8
  949. vaddsubps %xmm11,%xmm10, %xmm10
  950. vshufps $0xb1, %xmm8 , %xmm8, %xmm9
  951. vshufps $0xb1, %xmm10, %xmm10, %xmm11
  952. #else
  // other variants: operand roles reversed, result copied to the even register
  953. vaddsubps %xmm8, %xmm9 ,%xmm9
  954. vaddsubps %xmm10, %xmm11,%xmm11
  955. vmovaps %xmm9, %xmm8
  956. vmovaps %xmm11, %xmm10
  957. // swap the two 32-bit floats inside each 64-bit pair (lane order 1,0,3,2)
  958. vshufps $0xb1, %xmm9 , %xmm9, %xmm9
  959. vshufps $0xb1, %xmm11, %xmm11, %xmm11
  960. #endif
  // Reading each pair as (re, im): result = (re*ar - im*ai, im*ar + re*ai).
  961. // multiply with ALPHA_R
  962. vmulps %xmm8 , %xmm0, %xmm8
  963. vmulps %xmm10, %xmm0, %xmm10
  964. // multiply with ALPHA_I
  965. vmulps %xmm9 , %xmm1, %xmm9
  966. vmulps %xmm11, %xmm1, %xmm11
  967. vaddsubps %xmm9, %xmm8 , %xmm8
  968. vaddsubps %xmm11,%xmm10, %xmm10
  // Non-TRMM builds add the existing destination values; vmovsd moves only
  // 64 bits, so just one pair of floats per row is read and written.
  969. #ifndef TRMMKERNEL
  970. vmovsd (CO1), %xmm14
  971. vaddps %xmm14, %xmm8 , %xmm8
  972. vmovsd (CO1, LDC), %xmm15
  973. vaddps %xmm15, %xmm10, %xmm10
  974. #endif
  975. vmovsd %xmm8 , (CO1)
  976. vmovsd %xmm10 , (CO1, LDC)
  977. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  978. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  // TRMM: step AO/BO over the K - KKK k-iterations this tile did not process
  979. movq K, %rax
  980. subq KKK, %rax
  981. movq %rax, BI // Index for BO
  982. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  983. leaq (BO, BI, SIZE), BO
  984. salq $1, %rax // rax = rax * 2 ; number of values
  985. leaq (AO, %rax, SIZE), AO
  986. #endif
  987. #if defined(TRMMKERNEL) && defined(LEFT)
  988. addq $1, KK
  989. #endif
  990. addq $2 * SIZE, CO1 # coffset += 2
  991. ALIGN_4
  992. .L2_60:
  // End of one pass over two columns: for right-side TRMM advance KK by the
  // two columns just handled, then loop while J is still positive.
  993. #if defined(TRMMKERNEL) && !defined(LEFT)
  994. addq $2, KK
  995. #endif
  996. decq J // j --
  997. jg .L2_01 // next 2 lines of N
  998. .L1_0:
  999. /************************************************************************************************
  1000. * Loop for Nmod6 % 2 > 0
  1001. *************************************************************************************************/
  // Handle the single leftover column, if Nmod6 is odd; otherwise exit.
  1002. movq Nmod6, J
  1003. andq $1, J // j % 2
  1004. je .L999 // no odd column: nothing left to do
  1005. ALIGN_4
  1006. .L1_01:
  1007. // copy to sub buffer
  // Copies K entries of 2*SIZE bytes each (one 64-bit vmovsd per entry) from
  // B into BUFFER1.
  1008. movq B, BO1
  1009. leaq BUFFER1, BO // first buffer to BO
  1010. movq K, %rax
  1011. ALIGN_4
  1012. .L1_02b:
  // NOTE(review): the counter is decremented before it is tested, so this
  // loop assumes K > 0 on entry.
  1013. vmovsd (BO1), %xmm0
  1014. vmovsd %xmm0, (BO)
  1015. addq $2*SIZE,BO1
  1016. addq $2*SIZE,BO
  1017. decq %rax
  1018. jnz .L1_02b
  1019. .L1_02c:
  1020. movq BO1, B // next offset of B
  1021. .L1_10:
  // Per-column setup for the single-column pass: CO1 = current C column,
  // C moves on by one LDC stride, AO restarts at A, I = M / 4 row tiles.
  1022. movq C, CO1
  1023. leaq (C, LDC, 1), C // c += 1 * ldc
  1024. #if defined(TRMMKERNEL) && defined(LEFT)
  // left-side TRMM: reload KK from OFFSET for this column
  1025. movq OFFSET, %rax
  1026. movq %rax, KK
  1027. #endif
  1028. movq A, AO // aoffset = a
  // NOTE(review): AO is set 16 values past A; presumably the KERNEL macros
  // use a matching negative displacement -- confirm against them.
  1029. addq $16 * SIZE, AO
  1030. movq M, I
  1031. sarq $2, I // i = (m >> 2)
  1032. je .L1_20 // fewer than 4 rows: go to the leftover-row code
  1033. ALIGN_4
  1034. .L1_11:
  // 4x1 tile setup: point BO at the packed B buffer, clear the accumulators
  // and work out how many k iterations to run.
  // NOTE(review): BO is set 4 values past BUFFER1; presumably the KERNEL
  // macros use a matching negative displacement -- confirm against them.
  1035. #if !defined(TRMMKERNEL) || \
  1036. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1037. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1038. leaq BUFFER1, BO // first buffer to BO
  1039. addq $4 * SIZE, BO
  1040. #else
  // remaining TRMM variants: additionally skip the first KK k-iterations
  // (KK*2 values in BO, KK*8 values in AO)
  1041. movq KK, %rax
  1042. leaq BUFFER1, BO // first buffer to BO
  1043. addq $4 * SIZE, BO
  1044. movq %rax, BI // Index for BO
  1045. leaq (,BI,2), BI // BI = BI * 2 ; number of values
  1046. leaq (BO, BI, SIZE), BO
  1047. salq $3, %rax // rax = rax * 8 ; number of values
  1048. leaq (AO, %rax, SIZE), AO
  1049. #endif
  1050. vzeroall // zero every vector register, including the accumulators
  // k-iteration count -> rax (saved in KKK for TRMM builds)
  1051. #ifndef TRMMKERNEL
  1052. movq K, %rax
  1053. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1054. movq K, %rax
  1055. subq KK, %rax
  1056. movq %rax, KKK
  1057. #else
  1058. movq KK, %rax
  1059. #ifdef LEFT
  1060. addq $4, %rax // number of values in AO
  1061. #else
  1062. addq $1, %rax // number of values in BO
  1063. #endif
  1064. movq %rax, KKK
  1065. #endif
  1066. andq $-8, %rax // K = K - ( K % 8 )
  1067. je .L1_16 // fewer than 8 iterations: remainder loop only
  1068. movq %rax, BI // Index for BO
  1069. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  1070. salq $3, %rax // rax = rax * 8 ; number of values
  // advance AO/BO past the unrolled part and negate the indices so they
  // start out negative
  1071. leaq (AO, %rax, SIZE), AO
  1072. leaq (BO, BI, SIZE), BO
  1073. negq BI
  1074. negq %rax
  1075. ALIGN_4
  1076. .L1_12:
  // 4x1 tile, unrolled k loop. The KERNEL4x1_* macros are defined outside
  // this chunk; presumably each does one k step and the last one in a group
  // updates BI/%rax, setting the flags that je tests -- TODO confirm.
  1077. prefetcht0 B_PR1(BO,BI,SIZE)
  1078. KERNEL4x1_1(xxx)
  1079. KERNEL4x1_2(xxx)
  1080. KERNEL4x1_3(xxx)
  1081. KERNEL4x1_4(xxx)
  1082. KERNEL4x1_1(xxx)
  1083. KERNEL4x1_2(xxx)
  1084. KERNEL4x1_3(xxx)
  1085. KERNEL4x1_4(xxx)
  1086. je .L1_16 // zero flag from the macros: unrolled part finished
  1087. prefetcht0 B_PR1(BO,BI,SIZE)
  1088. KERNEL4x1_1(xxx)
  1089. KERNEL4x1_2(xxx)
  1090. KERNEL4x1_3(xxx)
  1091. KERNEL4x1_4(xxx)
  1092. KERNEL4x1_1(xxx)
  1093. KERNEL4x1_2(xxx)
  1094. KERNEL4x1_3(xxx)
  1095. KERNEL4x1_4(xxx)
  1096. je .L1_16
  1097. jmp .L1_12
  1098. ALIGN_4
  1099. .L1_16:
  // k remainder for the 4x1 tile: count is K, or KKK in TRMM builds.
  1100. #ifndef TRMMKERNEL
  1101. movq K, %rax
  1102. #else
  1103. movq KKK, %rax
  1104. #endif
  1105. andq $7, %rax # rax = count % 8 (leftover k iterations)
  1106. je .L1_19 // no leftover iterations: go to the write-back
  1107. movq %rax, BI // Index for BO
  1108. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  1109. salq $3, %rax // rax = rax * 8 ; number of values
  1110. leaq (AO, %rax, SIZE), AO
  1111. leaq (BO, BI, SIZE), BO
  1112. negq BI
  1113. negq %rax
  1114. ALIGN_4
  1115. .L1_17:
  // one leftover k step per pass; jl relies on flags left by the macro
  1116. KERNEL4x1_SUB(xxx)
  1117. jl .L1_17
  1118. ALIGN_4
  1119. .L1_19:
  // 4x1 tile write-back: combine xmm8/9 and xmm12/13, scale by alpha and
  // store two vectors to the row at CO1.
  1120. vbroadcastss ALPHA_R, %xmm0
  1121. vbroadcastss ALPHA_I, %xmm1
  1122. // swap the two 32-bit floats inside each 64-bit pair (lane order 1,0,3,2)
  1123. vshufps $0xb1, %xmm9 , %xmm9, %xmm9
  1124. vshufps $0xb1, %xmm13, %xmm13, %xmm13
  1125. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1126. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  // even register -/+ shuffled odd register (subtract in even lanes, add in
  // odd lanes), then keep a pair-swapped copy in the odd register
  1127. vaddsubps %xmm9, %xmm8 , %xmm8
  1128. vaddsubps %xmm13,%xmm12, %xmm12
  1129. vshufps $0xb1, %xmm8 , %xmm8, %xmm9
  1130. vshufps $0xb1, %xmm12, %xmm12, %xmm13
  1131. #else
  // other variants: operand roles reversed, result copied to the even register
  1132. vaddsubps %xmm8, %xmm9 ,%xmm9
  1133. vaddsubps %xmm12, %xmm13,%xmm13
  1134. vmovaps %xmm9, %xmm8
  1135. vmovaps %xmm13, %xmm12
  1136. // swap the two 32-bit floats inside each 64-bit pair (lane order 1,0,3,2)
  1137. vshufps $0xb1, %xmm9 , %xmm9, %xmm9
  1138. vshufps $0xb1, %xmm13, %xmm13, %xmm13
  1139. #endif
  // Reading each pair as (re, im): result = (re*ar - im*ai, im*ar + re*ai).
  1140. // multiply with ALPHA_R
  1141. vmulps %xmm8 , %xmm0, %xmm8
  1142. vmulps %xmm12, %xmm0, %xmm12
  1143. // multiply with ALPHA_I
  1144. vmulps %xmm9 , %xmm1, %xmm9
  1145. vmulps %xmm13, %xmm1, %xmm13
  1146. vaddsubps %xmm9, %xmm8 , %xmm8
  1147. vaddsubps %xmm13,%xmm12, %xmm12
  // Non-TRMM builds add the values already stored at the destination.
  1148. #ifndef TRMMKERNEL
  1149. vaddps (CO1), %xmm8 , %xmm8
  1150. vaddps 4 * SIZE(CO1), %xmm12, %xmm12
  1151. #endif
  1152. vmovups %xmm8 , (CO1)
  1153. vmovups %xmm12 , 4 * SIZE(CO1)
  1154. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1155. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  // TRMM: step AO/BO over the K - KKK k-iterations this tile did not process
  1156. movq K, %rax
  1157. subq KKK, %rax
  1158. movq %rax, BI // Index for BO
  1159. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  1160. leaq (BO, BI, SIZE), BO
  1161. salq $3, %rax // rax = rax * 8 ; number of values
  1162. leaq (AO, %rax, SIZE), AO
  1163. #endif
  1164. #if defined(TRMMKERNEL) && defined(LEFT)
  1165. addq $4, KK
  1166. #endif
  1167. addq $8 * SIZE, CO1 # coffset += 8
  1168. decq I # i --
  1169. jg .L1_11 // more 4-row tiles remain
  1170. ALIGN_4
  1171. /**************************************************************************
  1172. * Rest of M
  1173. ***************************************************************************/
  1174. .L1_20:
  // Leftover rows for the single column: M & 3 selects a 2-row tile (bit 1)
  // and/or a 1-row tile (bit 0).
  1175. testq $3, M
  1176. jz .L999 // no leftover rows: done
  1177. testq $2, M
  1178. jz .L1_40 // bit 1 clear: only the single-row case can remain
  1179. ALIGN_4
  1180. .L1_21:
  // 2x1 tile setup: point BO at the packed B buffer, clear the accumulators
  // and work out how many k iterations to run.
  // NOTE(review): BO is set 4 values past BUFFER1; presumably the KERNEL
  // macros use a matching negative displacement -- confirm against them.
  1181. #if !defined(TRMMKERNEL) || \
  1182. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1183. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1184. leaq BUFFER1, BO // first buffer to BO
  1185. addq $4 * SIZE, BO
  1186. #else
  // remaining TRMM variants: additionally skip the first KK k-iterations
  // (KK*2 values in BO, KK*4 values in AO)
  1187. movq KK, %rax
  1188. leaq BUFFER1, BO // first buffer to BO
  1189. addq $4 * SIZE, BO
  1190. movq %rax, BI // Index for BO
  1191. leaq (,BI,2), BI // BI = BI * 2 ; number of values
  1192. leaq (BO, BI, SIZE), BO
  1193. salq $2, %rax // rax = rax * 4 ; number of values
  1194. leaq (AO, %rax, SIZE), AO
  1195. #endif
  1196. vzeroall // zero every vector register, including the accumulators
  // k-iteration count -> rax (saved in KKK for TRMM builds)
  1197. #ifndef TRMMKERNEL
  1198. movq K, %rax
  1199. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1200. movq K, %rax
  1201. subq KK, %rax
  1202. movq %rax, KKK
  1203. #else
  1204. movq KK, %rax
  1205. #ifdef LEFT
  1206. addq $2, %rax // number of values in AO
  1207. #else
  1208. addq $1, %rax // number of values in BO
  1209. #endif
  1210. movq %rax, KKK
  1211. #endif
  1212. andq $-8, %rax // K = K - ( K % 8 )
  1213. je .L1_26 // fewer than 8 iterations: remainder loop only
  1214. movq %rax, BI // Index for BO
  1215. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  1216. salq $2, %rax // rax = rax * 4 ; number of values
  // advance AO/BO past the unrolled part and negate the indices so they
  // start out negative
  1217. leaq (AO, %rax, SIZE), AO
  1218. leaq (BO, BI, SIZE), BO
  1219. negq BI
  1220. negq %rax
  1221. ALIGN_4
  1222. .L1_22:
  // 2x1 tile, unrolled k loop. The KERNEL2x1_* macros are defined outside
  // this chunk; presumably each does one k step and the last one in a group
  // updates BI/%rax, setting the flags that je tests -- TODO confirm.
  1223. prefetcht0 B_PR1(BO,BI,SIZE)
  1224. KERNEL2x1_1(xxx)
  1225. KERNEL2x1_2(xxx)
  1226. KERNEL2x1_3(xxx)
  1227. KERNEL2x1_4(xxx)
  1228. KERNEL2x1_1(xxx)
  1229. KERNEL2x1_2(xxx)
  1230. KERNEL2x1_3(xxx)
  1231. KERNEL2x1_4(xxx)
  1232. je .L1_26 // zero flag from the macros: unrolled part finished
  1233. prefetcht0 B_PR1(BO,BI,SIZE)
  1234. KERNEL2x1_1(xxx)
  1235. KERNEL2x1_2(xxx)
  1236. KERNEL2x1_3(xxx)
  1237. KERNEL2x1_4(xxx)
  1238. KERNEL2x1_1(xxx)
  1239. KERNEL2x1_2(xxx)
  1240. KERNEL2x1_3(xxx)
  1241. KERNEL2x1_4(xxx)
  1242. je .L1_26
  1243. jmp .L1_22
  1244. ALIGN_4
  1245. .L1_26:
  // k remainder for the 2x1 tile: count is K, or KKK in TRMM builds.
  1246. #ifndef TRMMKERNEL
  1247. movq K, %rax
  1248. #else
  1249. movq KKK, %rax
  1250. #endif
  1251. andq $7, %rax # rax = count % 8 (leftover k iterations)
  1252. je .L1_29 // no leftover iterations: go to the write-back
  1253. movq %rax, BI // Index for BO
  1254. leaq ( ,BI,2), BI // BI = BI * 2; number of values
  1255. salq $2, %rax // rax = rax * 4 ; number of values
  1256. leaq (AO, %rax, SIZE), AO
  1257. leaq (BO, BI, SIZE), BO
  1258. negq BI
  1259. negq %rax
  1260. ALIGN_4
  1261. .L1_27:
  // one leftover k step per pass; jl relies on flags left by the macro
  1262. KERNEL2x1_SUB(xxx)
  1263. jl .L1_27
  1264. ALIGN_4
  1265. .L1_29:
  // 2x1 tile write-back: combine xmm8/9, scale by alpha and store one
  // vector to the row at CO1.
  1266. vbroadcastss ALPHA_R, %xmm0
  1267. vbroadcastss ALPHA_I, %xmm1
  1268. // swap the two 32-bit floats inside each 64-bit pair (lane order 1,0,3,2)
  1269. vshufps $0xb1, %xmm9 , %xmm9, %xmm9
  1270. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1271. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  // xmm8 -/+ shuffled xmm9 (subtract in even lanes, add in odd lanes), then
  // keep a pair-swapped copy in xmm9
  1272. vaddsubps %xmm9, %xmm8 , %xmm8
  1273. vshufps $0xb1, %xmm8 , %xmm8, %xmm9
  1274. #else
  // other variants: operand roles reversed, result copied to xmm8
  1275. vaddsubps %xmm8, %xmm9 ,%xmm9
  1276. vmovaps %xmm9, %xmm8
  1277. // swap the two 32-bit floats inside each 64-bit pair (lane order 1,0,3,2)
  1278. vshufps $0xb1, %xmm9 , %xmm9, %xmm9
  1279. #endif
  // Reading each pair as (re, im): result = (re*ar - im*ai, im*ar + re*ai).
  1280. // multiply with ALPHA_R
  1281. vmulps %xmm8 , %xmm0, %xmm8
  1282. // multiply with ALPHA_I
  1283. vmulps %xmm9 , %xmm1, %xmm9
  1284. vaddsubps %xmm9, %xmm8 , %xmm8
  // Non-TRMM builds add the values already stored at the destination.
  1285. #ifndef TRMMKERNEL
  1286. vaddps (CO1), %xmm8 , %xmm8
  1287. #endif
  1288. vmovups %xmm8 , (CO1)
  1289. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1290. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  // TRMM: step AO/BO over the K - KKK k-iterations this tile did not process
  1291. movq K, %rax
  1292. subq KKK, %rax
  1293. movq %rax, BI // Index for BO
  1294. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  1295. leaq (BO, BI, SIZE), BO
  1296. salq $2, %rax // rax = rax * 4 ; number of values
  1297. leaq (AO, %rax, SIZE), AO
  1298. #endif
  1299. #if defined(TRMMKERNEL) && defined(LEFT)
  1300. addq $2, KK
  1301. #endif
  1302. addq $4 * SIZE, CO1 # coffset += 4
  1303. ALIGN_4
  1304. /**************************************************************************/
  1305. .L1_40:
  // Last odd row (M bit 0) for the single column.
  1306. testq $1, M
  1307. jz .L999 // no odd row left: done
  1308. ALIGN_4
  1309. .L1_41:
  // 1x1 tile setup: point BO at the packed B buffer, clear the accumulators
  // and work out how many k iterations to run.
  // NOTE(review): BO is set 4 values past BUFFER1; presumably the KERNEL
  // macros use a matching negative displacement -- confirm against them.
  1310. #if !defined(TRMMKERNEL) || \
  1311. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1312. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1313. leaq BUFFER1, BO // first buffer to BO
  1314. addq $4 * SIZE, BO
  1315. #else
  // remaining TRMM variants: additionally skip the first KK k-iterations
  // (KK*2 values in BO, KK*2 values in AO)
  1316. movq KK, %rax
  1317. leaq BUFFER1, BO // first buffer to BO
  1318. addq $4 * SIZE, BO
  1319. movq %rax, BI // Index for BO
  1320. leaq (,BI,2), BI // BI = BI * 2 ; number of values
  1321. leaq (BO, BI, SIZE), BO
  1322. salq $1, %rax // rax = rax * 2 ; number of values
  1323. leaq (AO, %rax, SIZE), AO
  1324. #endif
  1325. vzeroall // zero every vector register, including the accumulators
  // k-iteration count -> rax (saved in KKK for TRMM builds)
  1326. #ifndef TRMMKERNEL
  1327. movq K, %rax
  1328. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1329. movq K, %rax
  1330. subq KK, %rax
  1331. movq %rax, KKK
  1332. #else
  1333. movq KK, %rax
  1334. #ifdef LEFT
  1335. addq $1, %rax // number of values in AO
  1336. #else
  1337. addq $1, %rax // number of values in BO
  1338. #endif
  1339. movq %rax, KKK
  1340. #endif
  1341. andq $-8, %rax // K = K - ( K % 8 )
  1342. je .L1_46 // fewer than 8 iterations: remainder loop only
  1343. movq %rax, BI // Index for BO
  1344. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  1345. salq $1, %rax // rax = rax * 2 ; number of values
  // advance AO/BO past the unrolled part and negate the indices so they
  // start out negative
  1346. leaq (AO, %rax, SIZE), AO
  1347. leaq (BO, BI, SIZE), BO
  1348. negq BI
  1349. negq %rax
  1350. ALIGN_4
  1351. .L1_42:
  // 1x1 tile, unrolled k loop. The KERNEL1x1_* macros are defined outside
  // this chunk; presumably each does one k step and the last one in a group
  // updates BI/%rax, setting the flags that je tests -- TODO confirm.
  1352. prefetcht0 B_PR1(BO,BI,SIZE)
  1353. KERNEL1x1_1(xxx)
  1354. KERNEL1x1_2(xxx)
  1355. KERNEL1x1_3(xxx)
  1356. KERNEL1x1_4(xxx)
  1357. KERNEL1x1_1(xxx)
  1358. KERNEL1x1_2(xxx)
  1359. KERNEL1x1_3(xxx)
  1360. KERNEL1x1_4(xxx)
  1361. je .L1_46 // zero flag from the macros: unrolled part finished
  1362. prefetcht0 B_PR1(BO,BI,SIZE)
  1363. KERNEL1x1_1(xxx)
  1364. KERNEL1x1_2(xxx)
  1365. KERNEL1x1_3(xxx)
  1366. KERNEL1x1_4(xxx)
  1367. KERNEL1x1_1(xxx)
  1368. KERNEL1x1_2(xxx)
  1369. KERNEL1x1_3(xxx)
  1370. KERNEL1x1_4(xxx)
  1371. je .L1_46
  1372. jmp .L1_42
  1373. ALIGN_4
  1374. .L1_46:
  // k remainder for the 1x1 tile: count is K, or KKK in TRMM builds.
  1375. #ifndef TRMMKERNEL
  1376. movq K, %rax
  1377. #else
  1378. movq KKK, %rax
  1379. #endif
  1380. andq $7, %rax # rax = count % 8 (leftover k iterations)
  1381. je .L1_49 // no leftover iterations: go to the write-back
  1382. movq %rax, BI // Index for BO
  1383. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  1384. salq $1, %rax // rax = rax * 2 ; number of values
  1385. leaq (AO, %rax, SIZE), AO
  1386. leaq (BO, BI, SIZE), BO
  1387. negq BI
  1388. negq %rax
  1389. ALIGN_4
  1390. .L1_47:
  // one leftover k step per pass; jl relies on flags left by the macro
  1391. KERNEL1x1_SUB(xxx)
  1392. jl .L1_47
  1393. ALIGN_4
  1394. .L1_49:
  // 1x1 tile write-back: combine xmm8/9, scale by alpha and store the low
  // 64 bits of the result to CO1.
  1395. vbroadcastss ALPHA_R, %xmm0
  1396. vbroadcastss ALPHA_I, %xmm1
  1397. // swap the two 32-bit floats inside each 64-bit pair (lane order 1,0,3,2)
  1398. vshufps $0xb1, %xmm9 , %xmm9, %xmm9
  1399. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1400. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  // xmm8 -/+ shuffled xmm9 (subtract in even lanes, add in odd lanes), then
  // keep a pair-swapped copy in xmm9
  1401. vaddsubps %xmm9, %xmm8 , %xmm8
  1402. vshufps $0xb1, %xmm8 , %xmm8, %xmm9
  1403. #else
  // other variants: operand roles reversed, result copied to xmm8
  1404. vaddsubps %xmm8, %xmm9 ,%xmm9
  1405. vmovaps %xmm9, %xmm8
  1406. // swap the two 32-bit floats inside each 64-bit pair (lane order 1,0,3,2)
  1407. vshufps $0xb1, %xmm9 , %xmm9, %xmm9
  1408. #endif
  // Reading each pair as (re, im): result = (re*ar - im*ai, im*ar + re*ai).
  1409. // multiply with ALPHA_R
  1410. vmulps %xmm8 , %xmm0, %xmm8
  1411. // multiply with ALPHA_I
  1412. vmulps %xmm9 , %xmm1, %xmm9
  1413. vaddsubps %xmm9, %xmm8 , %xmm8
  // Non-TRMM builds add the existing destination value; vmovsd moves only
  // 64 bits, so just one pair of floats is read and written.
  1414. #ifndef TRMMKERNEL
  1415. vmovsd (CO1), %xmm14
  1416. vaddps %xmm14, %xmm8 , %xmm8
  1417. #endif
  1418. vmovsd %xmm8 , (CO1)
  1419. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1420. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  // TRMM: step AO/BO over the K - KKK k-iterations this tile did not process
  1421. movq K, %rax
  1422. subq KKK, %rax
  1423. movq %rax, BI // Index for BO
  1424. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  1425. leaq (BO, BI, SIZE), BO
  1426. salq $1, %rax // rax = rax * 2 ; number of values
  1427. leaq (AO, %rax, SIZE), AO
  1428. #endif
  1429. #if defined(TRMMKERNEL) && defined(LEFT)
  1430. addq $1, KK
  1431. #endif
  1432. addq $2 * SIZE, CO1 # coffset += 2
  1433. ALIGN_4
  1434. .L999:
  // Common exit path: restore saved registers from the stack frame and return.
  1435. vzeroupper // clear the upper halves of the YMM registers before returning
  // SP is defined outside this chunk; presumably it holds the stack pointer
  // saved by the prologue -- TODO confirm.
  1436. movq SP, %rsp
  // reload rbx, rbp and r12-r15 from offsets 0..40 of the frame
  1437. movq (%rsp), %rbx
  1438. movq 8(%rsp), %rbp
  1439. movq 16(%rsp), %r12
  1440. movq 24(%rsp), %r13
  1441. movq 32(%rsp), %r14
  1442. movq 40(%rsp), %r15
  1443. #ifdef WINDOWS_ABI
  // Windows builds also reload rdi, rsi and xmm6-xmm15 (offsets 48..208)
  1444. movq 48(%rsp), %rdi
  1445. movq 56(%rsp), %rsi
  1446. vmovups 64(%rsp), %xmm6
  1447. vmovups 80(%rsp), %xmm7
  1448. vmovups 96(%rsp), %xmm8
  1449. vmovups 112(%rsp), %xmm9
  1450. vmovups 128(%rsp), %xmm10
  1451. vmovups 144(%rsp), %xmm11
  1452. vmovups 160(%rsp), %xmm12
  1453. vmovups 176(%rsp), %xmm13
  1454. vmovups 192(%rsp), %xmm14
  1455. vmovups 208(%rsp), %xmm15
  1456. #endif
  1457. addq $STACKSIZE, %rsp // release the STACKSIZE-byte frame
  1458. ret
  1459. EPILOGUE