You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

zgemm_kernel_2x2_bulldozer.S 40 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /*
   * Register and stack-slot aliases used by this kernel.
   *
   * Several aliases deliberately share one physical register; they are
   * live at different times:
   *   %rdi : OLD_M (on entry), AO, BO1
   *   %rsi : OLD_N (on entry), BO
   *   %r15 : CO1, BO2
   */

  /* Values read by the entry code before being copied elsewhere. */
  #define OLD_M   %rdi
  #define OLD_N   %rsi
  #define OLD_K   %rdx

  /* Matrix base pointers and leading dimension of C. */
  #define A       %rcx
  #define B       %r8
  #define C       %r9
  #define LDC     %r10

  /* Working copies of the dimensions and the loop counters. */
  #define M       %r13
  #define K       %r12
  #define J       %r14
  #define I       %r11

  /* Moving pointers / indices used inside the loops. */
  #define AO      %rdi
  #define BO      %rsi
  #define BO1     %rdi
  #define BO2     %r15
  #define CO1     %r15
  #define BI      %rbp

  /* Holds the stack pointer as it was before the work area is carved out. */
  #define SP      %rbx

  /*
   * Size of the register save area reserved by the entry code.  The
   * Windows variant is larger (the entry code also stores %rdi, %rsi and
   * %xmm6-%xmm15 there) and fetches the remaining arguments from the
   * stack of the caller.
   */
  #ifdef WINDOWS_ABI
  #define STACKSIZE   320
  #define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
  #define OLD_A       48 + STACKSIZE(%rsp)
  #define OLD_B       56 + STACKSIZE(%rsp)
  #define OLD_C       64 + STACKSIZE(%rsp)
  #define OLD_LDC     72 + STACKSIZE(%rsp)
  #define OLD_OFFSET  80 + STACKSIZE(%rsp)
  #else
  #define STACKSIZE   96
  #endif

  /* Bytes reserved (in addition to 128) for the work area below SP. */
  #define L_BUFFER_SIZE 8192

  /*
   * Scratch slots in the aligned work area.
   * NOTE: despite their names, the entry code stores N / 2 in Ndiv6 and
   * N % 2 in Nmod6.
   */
  #define Ndiv6   24(%rsp)
  #define Nmod6   32(%rsp)
  #define N       40(%rsp)
  #define ALPHA_R 48(%rsp)
  #define ALPHA_I 56(%rsp)
  #define OFFSET  64(%rsp)
  #define KK      72(%rsp)
  #define KKK     80(%rsp)

  /* Start of the buffer that receives the copied panel of B. */
  #define BUFFER1 128(%rsp)
  /*
   * STACK_TOUCH: writes a zero dword at 4096-byte steps above %rsp, from
   * the highest offset down to 4096(%rsp).  The number of writes grows
   * with L_BUFFER_SIZE; the macro is empty unless OS_WINDOWS is defined
   * and L_BUFFER_SIZE exceeds 4096.
   * NOTE(review): presumably this commits freshly reserved stack pages
   * on Windows - confirm.
   */
  #if !defined(OS_WINDOWS) || L_BUFFER_SIZE <= 4096
  #define STACK_TOUCH
  #elif L_BUFFER_SIZE <= 8192
  #define STACK_TOUCH \
      movl $0, 4096 * 1(%rsp);
  #elif L_BUFFER_SIZE <= 12288
  #define STACK_TOUCH \
      movl $0, 4096 * 2(%rsp);\
      movl $0, 4096 * 1(%rsp);
  #elif L_BUFFER_SIZE <= 16384
  #define STACK_TOUCH \
      movl $0, 4096 * 3(%rsp);\
      movl $0, 4096 * 2(%rsp);\
      movl $0, 4096 * 1(%rsp);
  #else
  #define STACK_TOUCH \
      movl $0, 4096 * 4(%rsp);\
      movl $0, 4096 * 3(%rsp);\
      movl $0, 4096 * 2(%rsp);\
      movl $0, 4096 * 1(%rsp);
  #endif
  /*
   * Build-variant selection of the fused multiply instructions used by
   * the KERNEL* macros.  vfmaddpd adds the product to the accumulator,
   * vfnmaddpd subtracts it.
   *
   *   variant group       VFMADD_R    VFMADD_I
   *   NN NT TN TT         vfmaddpd    vfmaddpd
   *   RN RT CN CT         vfnmaddpd   vfmaddpd
   *   NR NC TR TC         vfmaddpd    vfnmaddpd
   *   anything else       vfnmaddpd   vfnmaddpd
   *
   * NOTE(review): the two-letter names presumably encode the
   * transpose/conjugate mode of A and B - confirm against the build.
   */
  #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  #define VFMADD_R vfmaddpd
  #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  #define VFMADD_R vfnmaddpd
  #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  #define VFMADD_R vfmaddpd
  #else
  #define VFMADD_R vfnmaddpd
  #endif

  #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
      defined(RN) || defined(RT) || defined(CN) || defined(CT)
  #define VFMADD_I vfmaddpd
  #else
  #define VFMADD_I vfnmaddpd
  #endif

  /* Byte displacements used by the prefetcht0 instructions on A and B. */
  #define A_PR1 384
  #define B_PR1 192
  /*
   * 2x2 micro-kernel steps.
   *
   * One step loads two 16-byte vectors from A (xmm0, xmm1) and four
   * duplicated doubles from B (xmm4..xmm7), then updates the eight
   * accumulators:
   *
   *   xmm8  (+/-)= xmm4*xmm0     xmm12 (+/-)= xmm4*xmm1     VFMADD_R
   *   xmm9  (+/-)= xmm5*xmm0     xmm13 (+/-)= xmm5*xmm1     VFMADD_I
   *   xmm10 (+/-)= xmm6*xmm0     xmm14 (+/-)= xmm6*xmm1     VFMADD_R
   *   xmm11 (+/-)= xmm7*xmm0     xmm15 (+/-)= xmm7*xmm1     VFMADD_I
   *
   * The sign depends on what VFMADD_R / VFMADD_I expand to.
   *
   * KERNEL2x2_1.._4 are four consecutive steps at fixed displacements
   * from (AO,%rax) and (BO,BI); only _4 advances both indices (by 16
   * values each).  KERNEL2x2_SUB is a single step that advances both
   * indices by 4.  _1 and _3 additionally prefetch ahead in A.
   *
   * The addq on %rax must stay the last instruction of _4 and _SUB: the
   * loops branch on its flags directly after the macro.
   *
   * The argument xx is not used.
   *
   * Each macro ends on a line without a continuation backslash so that
   * the line following it is never spliced into the macro body.
   */
  #define KERNEL2x2_1(xx) \
      prefetcht0  A_PR1(AO,%rax,SIZE) ;\
      vmovups     -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
      vmovddup    -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
      VFMADD_R    %xmm8,%xmm4,%xmm0,%xmm8 ;\
      vmovups     -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\
      VFMADD_R    %xmm12,%xmm4,%xmm1,%xmm12 ;\
      vmovddup    -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
      VFMADD_I    %xmm9,%xmm5,%xmm0,%xmm9 ;\
      VFMADD_I    %xmm13,%xmm5,%xmm1,%xmm13 ;\
      vmovddup    -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
      VFMADD_R    %xmm10,%xmm6,%xmm0,%xmm10 ;\
      VFMADD_R    %xmm14,%xmm6,%xmm1,%xmm14 ;\
      vmovddup    -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
      VFMADD_I    %xmm11,%xmm7,%xmm0,%xmm11 ;\
      VFMADD_I    %xmm15,%xmm7,%xmm1,%xmm15 ;

  #define KERNEL2x2_2(xx) \
      vmovups     -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
      vmovddup    -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
      VFMADD_R    %xmm8,%xmm4,%xmm0,%xmm8 ;\
      vmovups     -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\
      VFMADD_R    %xmm12,%xmm4,%xmm1,%xmm12 ;\
      vmovddup    -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
      VFMADD_I    %xmm9,%xmm5,%xmm0,%xmm9 ;\
      VFMADD_I    %xmm13,%xmm5,%xmm1,%xmm13 ;\
      vmovddup    -2 * SIZE(BO, BI, SIZE), %xmm6 ;\
      VFMADD_R    %xmm10,%xmm6,%xmm0,%xmm10 ;\
      VFMADD_R    %xmm14,%xmm6,%xmm1,%xmm14 ;\
      vmovddup    -1 * SIZE(BO, BI, SIZE), %xmm7 ;\
      VFMADD_I    %xmm11,%xmm7,%xmm0,%xmm11 ;\
      VFMADD_I    %xmm15,%xmm7,%xmm1,%xmm15 ;

  #define KERNEL2x2_3(xx) \
      prefetcht0  A_PR1+64(AO,%rax,SIZE) ;\
      vmovups     0 * SIZE(AO, %rax, SIZE), %xmm0 ;\
      vmovddup    0 * SIZE(BO, BI, SIZE), %xmm4 ;\
      VFMADD_R    %xmm8,%xmm4,%xmm0,%xmm8 ;\
      vmovups     2 * SIZE(AO, %rax, SIZE), %xmm1 ;\
      VFMADD_R    %xmm12,%xmm4,%xmm1,%xmm12 ;\
      vmovddup    1 * SIZE(BO, BI, SIZE), %xmm5 ;\
      VFMADD_I    %xmm9,%xmm5,%xmm0,%xmm9 ;\
      VFMADD_I    %xmm13,%xmm5,%xmm1,%xmm13 ;\
      vmovddup    2 * SIZE(BO, BI, SIZE), %xmm6 ;\
      VFMADD_R    %xmm10,%xmm6,%xmm0,%xmm10 ;\
      VFMADD_R    %xmm14,%xmm6,%xmm1,%xmm14 ;\
      vmovddup    3 * SIZE(BO, BI, SIZE), %xmm7 ;\
      VFMADD_I    %xmm11,%xmm7,%xmm0,%xmm11 ;\
      VFMADD_I    %xmm15,%xmm7,%xmm1,%xmm15 ;

  #define KERNEL2x2_4(xx) \
      vmovups     4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
      vmovddup    4 * SIZE(BO, BI, SIZE), %xmm4 ;\
      VFMADD_R    %xmm8,%xmm4,%xmm0,%xmm8 ;\
      vmovups     6 * SIZE(AO, %rax, SIZE), %xmm1 ;\
      VFMADD_R    %xmm12,%xmm4,%xmm1,%xmm12 ;\
      vmovddup    5 * SIZE(BO, BI, SIZE), %xmm5 ;\
      VFMADD_I    %xmm9,%xmm5,%xmm0,%xmm9 ;\
      VFMADD_I    %xmm13,%xmm5,%xmm1,%xmm13 ;\
      vmovddup    6 * SIZE(BO, BI, SIZE), %xmm6 ;\
      VFMADD_R    %xmm10,%xmm6,%xmm0,%xmm10 ;\
      VFMADD_R    %xmm14,%xmm6,%xmm1,%xmm14 ;\
      vmovddup    7 * SIZE(BO, BI, SIZE), %xmm7 ;\
      VFMADD_I    %xmm11,%xmm7,%xmm0,%xmm11 ;\
      VFMADD_I    %xmm15,%xmm7,%xmm1,%xmm15 ;\
      addq        $16, BI ;\
      addq        $16, %rax ;

  #define KERNEL2x2_SUB(xx) \
      vmovups     -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
      vmovddup    -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
      VFMADD_R    %xmm8,%xmm4,%xmm0,%xmm8 ;\
      vmovups     -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\
      VFMADD_R    %xmm12,%xmm4,%xmm1,%xmm12 ;\
      vmovddup    -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
      VFMADD_I    %xmm9,%xmm5,%xmm0,%xmm9 ;\
      VFMADD_I    %xmm13,%xmm5,%xmm1,%xmm13 ;\
      vmovddup    -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
      VFMADD_R    %xmm10,%xmm6,%xmm0,%xmm10 ;\
      VFMADD_R    %xmm14,%xmm6,%xmm1,%xmm14 ;\
      vmovddup    -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
      VFMADD_I    %xmm11,%xmm7,%xmm0,%xmm11 ;\
      VFMADD_I    %xmm15,%xmm7,%xmm1,%xmm15 ;\
      addq        $4, BI ;\
      addq        $4, %rax ;

  200. /************************************************************************************************/
  /*
   * 1x2 micro-kernel steps.
   *
   * One step loads one 16-byte vector from A (xmm0) and four duplicated
   * doubles from B (xmm4..xmm7), then updates four accumulators:
   *
   *   xmm8  (+/-)= xmm4*xmm0     VFMADD_R
   *   xmm9  (+/-)= xmm5*xmm0     VFMADD_I
   *   xmm10 (+/-)= xmm6*xmm0     VFMADD_R
   *   xmm11 (+/-)= xmm7*xmm0     VFMADD_I
   *
   * The sign depends on what VFMADD_R / VFMADD_I expand to.
   *
   * KERNEL1x2_1.._4 are four consecutive steps at fixed displacements;
   * only _4 advances the indices (BI by 16, %rax by 8).  KERNEL1x2_SUB
   * is a single step that advances BI by 4 and %rax by 2.
   *
   * The addq on %rax must stay the last instruction of _4 and _SUB: the
   * loops branch on its flags directly after the macro.
   *
   * The argument xx is not used.
   *
   * KERNEL1x2_2 used to contain an extra
   *     VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12
   * None of the 1x2 macros loads xmm1 and the 1x2 store sequence does
   * not read xmm12, so that instruction only cost a multiply per
   * iteration; it has been dropped.
   * NOTE(review): confirm that no user of KERNEL1x2_2 elsewhere in the
   * file relies on xmm12 being written.
   *
   * Each macro ends on a line without a continuation backslash so that
   * the line following it is never spliced into the macro body.
   */
  #define KERNEL1x2_1(xx) \
      prefetcht0  A_PR1(AO,%rax,SIZE) ;\
      vmovups     -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
      vmovddup    -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
      VFMADD_R    %xmm8,%xmm4,%xmm0,%xmm8 ;\
      vmovddup    -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
      VFMADD_I    %xmm9,%xmm5,%xmm0,%xmm9 ;\
      vmovddup    -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
      VFMADD_R    %xmm10,%xmm6,%xmm0,%xmm10 ;\
      vmovddup    -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
      VFMADD_I    %xmm11,%xmm7,%xmm0,%xmm11 ;

  #define KERNEL1x2_2(xx) \
      vmovups     -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\
      vmovddup    -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
      VFMADD_R    %xmm8,%xmm4,%xmm0,%xmm8 ;\
      vmovddup    -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
      VFMADD_I    %xmm9,%xmm5,%xmm0,%xmm9 ;\
      vmovddup    -2 * SIZE(BO, BI, SIZE), %xmm6 ;\
      VFMADD_R    %xmm10,%xmm6,%xmm0,%xmm10 ;\
      vmovddup    -1 * SIZE(BO, BI, SIZE), %xmm7 ;\
      VFMADD_I    %xmm11,%xmm7,%xmm0,%xmm11 ;

  #define KERNEL1x2_3(xx) \
      vmovups     -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
      vmovddup    0 * SIZE(BO, BI, SIZE), %xmm4 ;\
      VFMADD_R    %xmm8,%xmm4,%xmm0,%xmm8 ;\
      vmovddup    1 * SIZE(BO, BI, SIZE), %xmm5 ;\
      VFMADD_I    %xmm9,%xmm5,%xmm0,%xmm9 ;\
      vmovddup    2 * SIZE(BO, BI, SIZE), %xmm6 ;\
      VFMADD_R    %xmm10,%xmm6,%xmm0,%xmm10 ;\
      vmovddup    3 * SIZE(BO, BI, SIZE), %xmm7 ;\
      VFMADD_I    %xmm11,%xmm7,%xmm0,%xmm11 ;

  #define KERNEL1x2_4(xx) \
      vmovups     -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\
      vmovddup    4 * SIZE(BO, BI, SIZE), %xmm4 ;\
      VFMADD_R    %xmm8,%xmm4,%xmm0,%xmm8 ;\
      vmovddup    5 * SIZE(BO, BI, SIZE), %xmm5 ;\
      VFMADD_I    %xmm9,%xmm5,%xmm0,%xmm9 ;\
      vmovddup    6 * SIZE(BO, BI, SIZE), %xmm6 ;\
      VFMADD_R    %xmm10,%xmm6,%xmm0,%xmm10 ;\
      vmovddup    7 * SIZE(BO, BI, SIZE), %xmm7 ;\
      VFMADD_I    %xmm11,%xmm7,%xmm0,%xmm11 ;\
      addq        $16, BI ;\
      addq        $8 , %rax ;

  #define KERNEL1x2_SUB(xx) \
      vmovups     -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
      vmovddup    -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
      VFMADD_R    %xmm8,%xmm4,%xmm0,%xmm8 ;\
      vmovddup    -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
      VFMADD_I    %xmm9,%xmm5,%xmm0,%xmm9 ;\
      vmovddup    -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
      VFMADD_R    %xmm10,%xmm6,%xmm0,%xmm10 ;\
      vmovddup    -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
      VFMADD_I    %xmm11,%xmm7,%xmm0,%xmm11 ;\
      addq        $4, BI ;\
      addq        $2, %rax ;

  257. /************************************************************************************************/
  /*
   * 2x1 micro-kernel steps.
   *
   * One step loads two 16-byte vectors from A (xmm0, xmm1) and two
   * duplicated doubles from B (xmm4, xmm5), then updates four
   * accumulators:
   *
   *   xmm8  (+/-)= xmm4*xmm0     xmm12 (+/-)= xmm4*xmm1     VFMADD_R
   *   xmm9  (+/-)= xmm5*xmm0     xmm13 (+/-)= xmm5*xmm1     VFMADD_I
   *
   * The sign depends on what VFMADD_R / VFMADD_I expand to.
   *
   * KERNEL2x1_1.._4 are four consecutive steps at fixed displacements;
   * only _4 advances the indices (BI by 8, %rax by 16).  KERNEL2x1_SUB
   * is a single step that advances BI by 2 and %rax by 4.  _1 and _3
   * additionally prefetch ahead in A.
   *
   * The addq on %rax is kept as the last instruction of _4 and _SUB so
   * that its flags are the ones visible after the macro.
   *
   * The argument xx is not used.
   *
   * Each macro ends on a line without a continuation backslash so that
   * the line following it is never spliced into the macro body.
   */
  #define KERNEL2x1_1(xx) \
      prefetcht0  A_PR1(AO,%rax,SIZE) ;\
      vmovups     -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
      vmovddup    -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
      VFMADD_R    %xmm8,%xmm4,%xmm0,%xmm8 ;\
      vmovups     -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\
      VFMADD_R    %xmm12,%xmm4,%xmm1,%xmm12 ;\
      vmovddup    -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
      VFMADD_I    %xmm9,%xmm5,%xmm0,%xmm9 ;\
      VFMADD_I    %xmm13,%xmm5,%xmm1,%xmm13 ;

  #define KERNEL2x1_2(xx) \
      vmovups     -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
      vmovddup    -2 * SIZE(BO, BI, SIZE), %xmm4 ;\
      VFMADD_R    %xmm8,%xmm4,%xmm0,%xmm8 ;\
      vmovups     -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\
      VFMADD_R    %xmm12,%xmm4,%xmm1,%xmm12 ;\
      vmovddup    -1 * SIZE(BO, BI, SIZE), %xmm5 ;\
      VFMADD_I    %xmm9,%xmm5,%xmm0,%xmm9 ;\
      VFMADD_I    %xmm13,%xmm5,%xmm1,%xmm13 ;

  #define KERNEL2x1_3(xx) \
      prefetcht0  A_PR1+64(AO,%rax,SIZE) ;\
      vmovups     0 * SIZE(AO, %rax, SIZE), %xmm0 ;\
      vmovddup    0 * SIZE(BO, BI, SIZE), %xmm4 ;\
      VFMADD_R    %xmm8,%xmm4,%xmm0,%xmm8 ;\
      vmovups     2 * SIZE(AO, %rax, SIZE), %xmm1 ;\
      VFMADD_R    %xmm12,%xmm4,%xmm1,%xmm12 ;\
      vmovddup    1 * SIZE(BO, BI, SIZE), %xmm5 ;\
      VFMADD_I    %xmm9,%xmm5,%xmm0,%xmm9 ;\
      VFMADD_I    %xmm13,%xmm5,%xmm1,%xmm13 ;

  #define KERNEL2x1_4(xx) \
      vmovups     4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
      vmovddup    2 * SIZE(BO, BI, SIZE), %xmm4 ;\
      VFMADD_R    %xmm8,%xmm4,%xmm0,%xmm8 ;\
      vmovups     6 * SIZE(AO, %rax, SIZE), %xmm1 ;\
      VFMADD_R    %xmm12,%xmm4,%xmm1,%xmm12 ;\
      vmovddup    3 * SIZE(BO, BI, SIZE), %xmm5 ;\
      VFMADD_I    %xmm9,%xmm5,%xmm0,%xmm9 ;\
      VFMADD_I    %xmm13,%xmm5,%xmm1,%xmm13 ;\
      addq        $8, BI ;\
      addq        $16, %rax ;

  #define KERNEL2x1_SUB(xx) \
      vmovups     -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
      vmovddup    -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
      VFMADD_R    %xmm8,%xmm4,%xmm0,%xmm8 ;\
      vmovups     -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\
      VFMADD_R    %xmm12,%xmm4,%xmm1,%xmm12 ;\
      vmovddup    -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
      VFMADD_I    %xmm9,%xmm5,%xmm0,%xmm9 ;\
      VFMADD_I    %xmm13,%xmm5,%xmm1,%xmm13 ;\
      addq        $2, BI ;\
      addq        $4, %rax ;

  309. /************************************************************************************************/
  /*
   * 1x1 micro-kernel steps.
   *
   * One step loads one 16-byte vector from A (xmm0) and two duplicated
   * doubles from B (xmm4, xmm5), then updates two accumulators:
   *
   *   xmm8 (+/-)= xmm4*xmm0     VFMADD_R
   *   xmm9 (+/-)= xmm5*xmm0     VFMADD_I
   *
   * The sign depends on what VFMADD_R / VFMADD_I expand to.
   *
   * KERNEL1x1_1.._4 are four consecutive steps at fixed displacements;
   * only _4 advances the indices (BI and %rax by 8 each).
   * KERNEL1x1_SUB is a single step that advances both indices by 2.
   *
   * The addq on %rax is kept as the last instruction of _4 and _SUB so
   * that its flags are the ones visible after the macro.
   *
   * The argument xx is not used.
   *
   * Each macro ends on a line without a continuation backslash so that
   * the line following it is never spliced into the macro body.
   */
  #define KERNEL1x1_1(xx) \
      prefetcht0  A_PR1(AO,%rax,SIZE) ;\
      vmovups     -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
      vmovddup    -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
      VFMADD_R    %xmm8,%xmm4,%xmm0,%xmm8 ;\
      vmovddup    -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
      VFMADD_I    %xmm9,%xmm5,%xmm0,%xmm9 ;

  #define KERNEL1x1_2(xx) \
      vmovups     -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\
      vmovddup    -2 * SIZE(BO, BI, SIZE), %xmm4 ;\
      VFMADD_R    %xmm8,%xmm4,%xmm0,%xmm8 ;\
      vmovddup    -1 * SIZE(BO, BI, SIZE), %xmm5 ;\
      VFMADD_I    %xmm9,%xmm5,%xmm0,%xmm9 ;

  #define KERNEL1x1_3(xx) \
      vmovups     -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
      vmovddup    0 * SIZE(BO, BI, SIZE), %xmm4 ;\
      VFMADD_R    %xmm8,%xmm4,%xmm0,%xmm8 ;\
      vmovddup    1 * SIZE(BO, BI, SIZE), %xmm5 ;\
      VFMADD_I    %xmm9,%xmm5,%xmm0,%xmm9 ;

  #define KERNEL1x1_4(xx) \
      vmovups     -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\
      vmovddup    2 * SIZE(BO, BI, SIZE), %xmm4 ;\
      VFMADD_R    %xmm8,%xmm4,%xmm0,%xmm8 ;\
      vmovddup    3 * SIZE(BO, BI, SIZE), %xmm5 ;\
      VFMADD_I    %xmm9,%xmm5,%xmm0,%xmm9 ;\
      addq        $8, BI ;\
      addq        $8, %rax ;

  #define KERNEL1x1_SUB(xx) \
      vmovups     -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
      vmovddup    -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
      VFMADD_R    %xmm8,%xmm4,%xmm0,%xmm8 ;\
      vmovddup    -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
      VFMADD_I    %xmm9,%xmm5,%xmm0,%xmm9 ;\
      addq        $2, BI ;\
      addq        $2, %rax ;

  345. /************************************************************************************************/
  346. PROLOGUE
  347. PROFCODE
  348. subq $STACKSIZE, %rsp
  349. movq %rbx, (%rsp)
  350. movq %rbp, 8(%rsp)
  351. movq %r12, 16(%rsp)
  352. movq %r13, 24(%rsp)
  353. movq %r14, 32(%rsp)
  354. movq %r15, 40(%rsp)
  355. vzeroupper
  356. #ifdef WINDOWS_ABI
  357. movq %rdi, 48(%rsp)
  358. movq %rsi, 56(%rsp)
  359. vmovups %xmm6, 64(%rsp)
  360. vmovups %xmm7, 80(%rsp)
  361. vmovups %xmm8, 96(%rsp)
  362. vmovups %xmm9, 112(%rsp)
  363. vmovups %xmm10, 128(%rsp)
  364. vmovups %xmm11, 144(%rsp)
  365. vmovups %xmm12, 160(%rsp)
  366. vmovups %xmm13, 176(%rsp)
  367. vmovups %xmm14, 192(%rsp)
  368. vmovups %xmm15, 208(%rsp)
  369. movq ARG1, OLD_M
  370. movq ARG2, OLD_N
  371. movq ARG3, OLD_K
  372. movq OLD_A, A
  373. movq OLD_B, B
  374. movq OLD_C, C
  375. movq OLD_LDC, LDC
  376. #ifdef TRMMKERNEL
  377. vmovsd OLD_OFFSET, %xmm12
  378. #endif
  379. vmovaps %xmm3, %xmm0
  380. vmovsd OLD_ALPHA_I, %xmm1
  381. #else
  382. movq STACKSIZE + 8(%rsp), LDC
  383. #ifdef TRMMKERNEL
  384. vmovsd STACKSIZE + 16(%rsp), %xmm12
  385. #endif
  386. #endif
  387. movq %rsp, SP # save old stack
  388. subq $128 + L_BUFFER_SIZE, %rsp
  389. andq $-4096, %rsp # align stack
  390. STACK_TOUCH
  391. cmpq $0, OLD_M
  392. je .L999
  393. cmpq $0, OLD_N
  394. je .L999
  395. cmpq $0, OLD_K
  396. je .L999
  397. movq OLD_M, M
  398. movq OLD_N, N
  399. movq OLD_K, K
  400. vmovsd %xmm0, ALPHA_R
  401. vmovsd %xmm1, ALPHA_I
  402. salq $ZBASE_SHIFT, LDC
  403. movq N, %rax
  404. xorq %rdx, %rdx
  405. movq $2, %rdi
  406. divq %rdi // N / 2
  407. movq %rax, Ndiv6 // N / 2
  408. movq %rdx, Nmod6 // N % 2
  409. #ifdef TRMMKERNEL
  410. vmovsd %xmm12, OFFSET
  411. vmovsd %xmm12, KK
  412. #ifndef LEFT
  413. negq KK
  414. #endif
  415. #endif
  416. .L2_0:
  417. movq Ndiv6, J
  418. cmpq $0, J
  419. je .L1_0
  420. ALIGN_4
  421. .L2_01:
  422. // copy to sub buffer
  423. movq B, BO1
  424. leaq BUFFER1, BO // first buffer to BO
  425. movq K, %rax
  426. ALIGN_4
  427. .L2_02b:
  428. vmovups (BO1), %xmm0
  429. vmovups 2 * SIZE(BO1), %xmm1
  430. vmovups %xmm0, (BO)
  431. vmovups %xmm1, 2 * SIZE(BO)
  432. addq $4*SIZE,BO1
  433. addq $4*SIZE,BO
  434. decq %rax
  435. jnz .L2_02b
  436. .L2_02c:
  437. movq BO1, B // next offset of B
  438. .L2_10:
  439. movq C, CO1
  440. leaq (C, LDC, 2), C // c += 2 * ldc
  441. #if defined(TRMMKERNEL) && defined(LEFT)
  442. movq OFFSET, %rax
  443. movq %rax, KK
  444. #endif
  445. movq A, AO // aoffset = a
  446. addq $8 * SIZE, AO
  447. movq M, I
  448. sarq $1, I // i = (m >> 1)
  449. je .L2_40
  450. ALIGN_4
  451. .L2_11:
  452. #if !defined(TRMMKERNEL) || \
  453. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  454. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  455. leaq BUFFER1, BO // first buffer to BO
  456. addq $8 * SIZE, BO
  457. #else
  458. movq KK, %rax
  459. leaq BUFFER1, BO // first buffer to BO
  460. addq $8 * SIZE, BO
  461. movq %rax, BI // Index for BO
  462. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  463. leaq (BO, BI, SIZE), BO
  464. salq $2, %rax // rax = rax * 4 ; number of values
  465. leaq (AO, %rax, SIZE), AO
  466. #endif
  467. vzeroall
  468. #ifndef TRMMKERNEL
  469. movq K, %rax
  470. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  471. movq K, %rax
  472. subq KK, %rax
  473. movq %rax, KKK
  474. #else
  475. movq KK, %rax
  476. #ifdef LEFT
  477. addq $2, %rax // number of values in AO
  478. #else
  479. addq $2, %rax // number of values in BO
  480. #endif
  481. movq %rax, KKK
  482. #endif
  483. andq $-8, %rax // K = K - ( K % 8 )
  484. je .L2_16
  485. movq %rax, BI // Index for BO
  486. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  487. salq $2, %rax // rax = rax * 4 ; number of values
  488. leaq (AO, %rax, SIZE), AO
  489. leaq (BO, BI, SIZE), BO
  490. negq BI
  491. negq %rax
  492. ALIGN_4
  493. .L2_12:
  494. prefetcht0 B_PR1(BO,BI,SIZE)
  495. KERNEL2x2_1(xxx)
  496. KERNEL2x2_2(xxx)
  497. prefetcht0 B_PR1+64(BO,BI,SIZE)
  498. KERNEL2x2_3(xxx)
  499. KERNEL2x2_4(xxx)
  500. prefetcht0 B_PR1(BO,BI,SIZE)
  501. KERNEL2x2_1(xxx)
  502. KERNEL2x2_2(xxx)
  503. prefetcht0 B_PR1+64(BO,BI,SIZE)
  504. KERNEL2x2_3(xxx)
  505. KERNEL2x2_4(xxx)
  506. je .L2_16
  507. prefetcht0 B_PR1(BO,BI,SIZE)
  508. KERNEL2x2_1(xxx)
  509. KERNEL2x2_2(xxx)
  510. prefetcht0 B_PR1+64(BO,BI,SIZE)
  511. KERNEL2x2_3(xxx)
  512. KERNEL2x2_4(xxx)
  513. prefetcht0 B_PR1(BO,BI,SIZE)
  514. KERNEL2x2_1(xxx)
  515. KERNEL2x2_2(xxx)
  516. prefetcht0 B_PR1+64(BO,BI,SIZE)
  517. KERNEL2x2_3(xxx)
  518. KERNEL2x2_4(xxx)
  519. je .L2_16
  520. jmp .L2_12
  521. ALIGN_4
  522. .L2_16:
  523. #ifndef TRMMKERNEL
  524. movq K, %rax
  525. #else
  526. movq KKK, %rax
  527. #endif
  528. andq $7, %rax # k & 7: k-steps left over after the multiple-of-8 main loop
  529. je .L2_19
  530. movq %rax, BI // Index for BO
  531. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  532. salq $2, %rax // rax = rax * 4 ; number of values
  533. leaq (AO, %rax, SIZE), AO // move AO past the leftover values
  534. leaq (BO, BI, SIZE), BO // move BO past the leftover values
  535. negq BI // negative indices count up towards zero
  536. negq %rax
  537. ALIGN_4
  538. .L2_17:
  539. KERNEL2x2_SUB(xxx)
  540. jl .L2_17 // flags come from the final addq on %rax inside the macro
  541. ALIGN_4
  542. .L2_19:
  543. vmovddup ALPHA_R, %xmm0
  544. vmovddup ALPHA_I, %xmm1
  545. // swap high and low 64 bits
  546. vshufpd $0x01, %xmm9 , %xmm9, %xmm9
  547. vshufpd $0x01, %xmm11, %xmm11, %xmm11
  548. vshufpd $0x01, %xmm13, %xmm13, %xmm13
  549. vshufpd $0x01, %xmm15, %xmm15, %xmm15
  550. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  551. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  552. vaddsubpd %xmm9, %xmm8 , %xmm8
  553. vaddsubpd %xmm11,%xmm10, %xmm10
  554. vaddsubpd %xmm13,%xmm12, %xmm12
  555. vaddsubpd %xmm15,%xmm14, %xmm14
  556. vshufpd $0x01, %xmm8 , %xmm8, %xmm9
  557. vshufpd $0x01, %xmm10, %xmm10, %xmm11
  558. vshufpd $0x01, %xmm12, %xmm12, %xmm13
  559. vshufpd $0x01, %xmm14, %xmm14, %xmm15
  560. #else
  561. vaddsubpd %xmm8, %xmm9 ,%xmm9
  562. vaddsubpd %xmm10, %xmm11,%xmm11
  563. vaddsubpd %xmm12, %xmm13,%xmm13
  564. vaddsubpd %xmm14, %xmm15,%xmm15
  565. vmovapd %xmm9, %xmm8
  566. vmovapd %xmm11, %xmm10
  567. vmovapd %xmm13, %xmm12
  568. vmovapd %xmm15, %xmm14
  569. // swap high and low 64 bits
  570. vshufpd $0x01, %xmm9 , %xmm9, %xmm9
  571. vshufpd $0x01, %xmm11, %xmm11, %xmm11
  572. vshufpd $0x01, %xmm13, %xmm13, %xmm13
  573. vshufpd $0x01, %xmm15, %xmm15, %xmm15
  574. #endif
  575. // multiply with ALPHA_R
  576. vmulpd %xmm8 , %xmm0, %xmm8
  577. vmulpd %xmm10, %xmm0, %xmm10
  578. vmulpd %xmm12, %xmm0, %xmm12
  579. vmulpd %xmm14, %xmm0, %xmm14
  580. // multiply with ALPHA_I
  581. vmulpd %xmm9 , %xmm1, %xmm9
  582. vmulpd %xmm11, %xmm1, %xmm11
  583. vmulpd %xmm13, %xmm1, %xmm13
  584. vmulpd %xmm15, %xmm1, %xmm15
  585. vaddsubpd %xmm9, %xmm8 , %xmm8
  586. vaddsubpd %xmm11,%xmm10, %xmm10
  587. vaddsubpd %xmm13,%xmm12, %xmm12
  588. vaddsubpd %xmm15,%xmm14, %xmm14
  589. #ifndef TRMMKERNEL
  590. vaddpd (CO1), %xmm8 , %xmm8
  591. vaddpd 2 * SIZE(CO1), %xmm12, %xmm12
  592. vaddpd (CO1, LDC), %xmm10, %xmm10
  593. vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14
  594. #endif
  595. vmovups %xmm8 , (CO1)
  596. vmovups %xmm12 , 2 * SIZE(CO1)
  597. vmovups %xmm10 , (CO1, LDC)
  598. vmovups %xmm14 , 2 * SIZE(CO1, LDC)
  599. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  600. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  601. movq K, %rax
  602. subq KKK, %rax
  603. movq %rax, BI // Index for BO
  604. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  605. leaq (BO, BI, SIZE), BO
  606. salq $2, %rax // rax = rax * 4 ; number of values
  607. leaq (AO, %rax, SIZE), AO
  608. #endif
  609. #if defined(TRMMKERNEL) && defined(LEFT)
  610. addq $2, KK
  611. #endif
  612. addq $4 * SIZE, CO1 # coffset += 4
  613. decq I # i --
  614. jg .L2_11
  615. ALIGN_4
  616. /**************************************************************************
  617. * Rest of M
  618. ***************************************************************************/
  619. .L2_40:
  620. testq $1, M
  621. jz .L2_60 // to next 2 lines of N
  622. ALIGN_4
  623. .L2_41:
  624. #if !defined(TRMMKERNEL) || \
  625. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  626. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  627. leaq BUFFER1, BO // first buffer to BO
  628. addq $8 * SIZE, BO
  629. #else
  630. movq KK, %rax
  631. leaq BUFFER1, BO // first buffer to BO
  632. addq $8 * SIZE, BO
  633. movq %rax, BI // Index for BO
  634. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  635. leaq (BO, BI, SIZE), BO
  636. salq $1, %rax // rax = rax * 2 ; number of values
  637. leaq (AO, %rax, SIZE), AO
  638. #endif
  639. vzeroall
  640. #ifndef TRMMKERNEL
  641. movq K, %rax
  642. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  643. movq K, %rax
  644. subq KK, %rax
  645. movq %rax, KKK
  646. #else
  647. movq KK, %rax
  648. #ifdef LEFT
  649. addq $1, %rax // number of values in AO
  650. #else
  651. addq $2, %rax // number of values in BO
  652. #endif
  653. movq %rax, KKK
  654. #endif
  655. andq $-8, %rax // K = K - ( K % 8 )
  656. je .L2_46
  657. movq %rax, BI // Index for BO
  658. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  659. salq $1, %rax // rax = rax * 2 ; number of values
  660. leaq (AO, %rax, SIZE), AO
  661. leaq (BO, BI, SIZE), BO
  662. negq BI
  663. negq %rax
  664. ALIGN_4
  665. .L2_42:
  666. prefetcht0 B_PR1(BO,BI,SIZE)
  667. KERNEL1x2_1(xxx)
  668. KERNEL1x2_2(xxx)
  669. prefetcht0 B_PR1+64(BO,BI,SIZE)
  670. KERNEL1x2_3(xxx)
  671. KERNEL1x2_4(xxx)
  672. prefetcht0 B_PR1(BO,BI,SIZE)
  673. KERNEL1x2_1(xxx)
  674. KERNEL1x2_2(xxx)
  675. prefetcht0 B_PR1+64(BO,BI,SIZE)
  676. KERNEL1x2_3(xxx)
  677. KERNEL1x2_4(xxx)
  678. je .L2_46
  679. prefetcht0 B_PR1(BO,BI,SIZE)
  680. KERNEL1x2_1(xxx)
  681. KERNEL1x2_2(xxx)
  682. prefetcht0 B_PR1+64(BO,BI,SIZE)
  683. KERNEL1x2_3(xxx)
  684. KERNEL1x2_4(xxx)
  685. prefetcht0 B_PR1(BO,BI,SIZE)
  686. KERNEL1x2_1(xxx)
  687. KERNEL1x2_2(xxx)
  688. prefetcht0 B_PR1+64(BO,BI,SIZE)
  689. KERNEL1x2_3(xxx)
  690. KERNEL1x2_4(xxx)
  691. je .L2_46
  692. jmp .L2_42
  693. ALIGN_4
  694. .L2_46:
  695. #ifndef TRMMKERNEL
  696. movq K, %rax
  697. #else
  698. movq KKK, %rax
  699. #endif
  700. andq $7, %rax # k & 7: k-steps left over after the multiple-of-8 main loop
  701. je .L2_49
  702. movq %rax, BI // Index for BO
  703. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  704. salq $1, %rax // rax = rax * 2 ; number of values
  705. leaq (AO, %rax, SIZE), AO // move AO past the leftover values
  706. leaq (BO, BI, SIZE), BO // move BO past the leftover values
  707. negq BI // negative indices count up towards zero
  708. negq %rax
  709. ALIGN_4
  710. .L2_47:
  711. KERNEL1x2_SUB(xxx)
  712. jl .L2_47 // flags come from the final addq on %rax inside the macro
  713. ALIGN_4
  714. .L2_49:
  715. vmovddup ALPHA_R, %xmm0
  716. vmovddup ALPHA_I, %xmm1
  717. // swap high and low 64 bits
  718. vshufpd $0x01, %xmm9 , %xmm9, %xmm9
  719. vshufpd $0x01, %xmm11, %xmm11, %xmm11
  720. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  721. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  722. vaddsubpd %xmm9, %xmm8 , %xmm8
  723. vaddsubpd %xmm11,%xmm10, %xmm10
  724. vshufpd $0x01, %xmm8 , %xmm8, %xmm9
  725. vshufpd $0x01, %xmm10, %xmm10, %xmm11
  726. #else
  727. vaddsubpd %xmm8, %xmm9, %xmm9
  728. vaddsubpd %xmm10,%xmm11, %xmm11
  729. vmovapd %xmm9, %xmm8
  730. vmovapd %xmm11, %xmm10
  731. // swap high and low 64 bits
  732. vshufpd $0x01, %xmm9 , %xmm9, %xmm9
  733. vshufpd $0x01, %xmm11, %xmm11, %xmm11
  734. #endif
  735. // multiply with ALPHA_R
  736. vmulpd %xmm8 , %xmm0, %xmm8
  737. vmulpd %xmm10, %xmm0, %xmm10
  738. // multiply with ALPHA_I
  739. vmulpd %xmm9 , %xmm1, %xmm9
  740. vmulpd %xmm11, %xmm1, %xmm11
  741. vaddsubpd %xmm9, %xmm8 , %xmm8
  742. vaddsubpd %xmm11,%xmm10, %xmm10
  743. #ifndef TRMMKERNEL
  744. vaddpd (CO1), %xmm8 , %xmm8
  745. vaddpd (CO1, LDC), %xmm10, %xmm10
  746. #endif
  747. vmovups %xmm8 , (CO1)
  748. vmovups %xmm10 , (CO1, LDC)
  749. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  750. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  751. movq K, %rax
  752. subq KKK, %rax
  753. movq %rax, BI // Index for BO
  754. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  755. leaq (BO, BI, SIZE), BO
  756. salq $1, %rax // rax = rax * 2 ; number of values
  757. leaq (AO, %rax, SIZE), AO
  758. #endif
  759. #if defined(TRMMKERNEL) && defined(LEFT)
  760. addq $1, KK
  761. #endif
  762. addq $2 * SIZE, CO1 # coffset += 2
  763. ALIGN_4
  764. .L2_60: // bottom of the J loop; its head .L2_01 is outside this view
  765. #if defined(TRMMKERNEL) && !defined(LEFT)
  766. addq $2, KK // TRMM without LEFT: KK += 2
  767. #endif
  768. decq J // j --
  769. jg .L2_01 // next 2 lines of N
  770. .L1_0:
  771. /************************************************************************************************
  772. * Tail section: runs only when Nmod6 is odd (Nmod6 % 2 > 0)
  773. *************************************************************************************************/
  774. movq Nmod6, J
  775. andq $1, J // j % 2
  776. je .L999 // Nmod6 is even: nothing left to do, go to the exit code
  777. ALIGN_4
  778. .L1_01:
  779. // copy to sub buffer: K chunks of one xmm register (16 bytes) each, from B into BUFFER1
  780. movq B, BO1
  781. leaq BUFFER1, BO // first buffer to BO
  782. movq K, %rax // rax = loop counter; NOTE(review): K == 0 would wrap to -1 below and keep copying - confirm callers guarantee K > 0
  783. ALIGN_4
  784. .L1_02b:
  785. vmovups (BO1), %xmm0 // load 16 bytes from the source
  786. vmovups %xmm0, (BO) // store them to the buffer
  787. addq $2*SIZE,BO1 // advance source by 2 values (equals the 16 bytes copied only if SIZE is 8 - confirm)
  788. addq $2*SIZE,BO // advance destination by the same amount
  789. decq %rax
  790. jnz .L1_02b
  791. .L1_02c:
  792. movq BO1, B // next offset of B
  793. .L1_10: // set up the C, A and loop-counter state before the M loops
  794. movq C, CO1 // CO1 = current output pointer
  795. leaq (C, LDC, 1), C // c += 1 * ldc
  796. #if defined(TRMMKERNEL) && defined(LEFT)
  797. movq OFFSET, %rax
  798. movq %rax, KK // KK = OFFSET
  799. #endif
  800. movq A, AO // aoffset = a
  801. addq $8 * SIZE, AO // bias AO by 8 values; presumably the KERNEL macros compensate with a negative displacement - confirm
  802. movq M, I
  803. sarq $1, I // i = (m >> 1)
  804. je .L1_40 // m >> 1 == 0: skip the loop at .L1_11
  805. ALIGN_4
  806. .L1_11: // top of the loop that runs I times; body uses the KERNEL2x1 macros
  807. #if !defined(TRMMKERNEL) || \
  808. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  809. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  810. leaq BUFFER1, BO // first buffer to BO
  811. addq $4 * SIZE, BO // bias BO by 4 values
  812. #else
  813. movq KK, %rax
  814. leaq BUFFER1, BO // first buffer to BO
  815. addq $4 * SIZE, BO // bias BO by 4 values
  816. movq %rax, BI // Index for BO
  817. leaq (,BI,2), BI // BI = BI * 2 ; number of values
  818. leaq (BO, BI, SIZE), BO // skip KK * 2 values in BO
  819. salq $2, %rax // rax = rax * 4 ; number of values
  820. leaq (AO, %rax, SIZE), AO // skip KK * 4 values in AO
  821. #endif
  822. vzeroall // zero every vector register before accumulating
  823. #ifndef TRMMKERNEL
  824. movq K, %rax
  825. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  826. movq K, %rax
  827. subq KK, %rax
  828. movq %rax, KKK // KKK = K - KK
  829. #else
  830. movq KK, %rax
  831. #ifdef LEFT
  832. addq $2, %rax // number of values in AO
  833. #else
  834. addq $1, %rax // number of values in BO
  835. #endif
  836. movq %rax, KKK // KKK = KK + 2 (LEFT) or KK + 1
  837. #endif
  838. andq $-8, %rax // K = K - ( K % 8 )
  839. je .L1_16 // fewer than 8 steps: go straight to the remainder code
  840. movq %rax, BI // Index for BO
  841. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  842. salq $2, %rax // rax = rax * 4 ; number of values
  843. leaq (AO, %rax, SIZE), AO // point AO past the values this loop will cover
  844. leaq (BO, BI, SIZE), BO // point BO past the values this loop will cover
  845. negq BI // negative index into BO; presumably counted up toward zero by the KERNEL2x1 macros - confirm
  846. negq %rax // negative index into AO, same scheme
  847. ALIGN_4
  848. .L1_12: // unrolled main loop: eight KERNEL2x1 macros between exit checks
  849. prefetcht0 B_PR1(BO,BI,SIZE) // prefetch from the B buffer at displacement B_PR1
  850. KERNEL2x1_1(xxx)
  851. KERNEL2x1_2(xxx)
  852. KERNEL2x1_3(xxx)
  853. KERNEL2x1_4(xxx)
  854. prefetcht0 B_PR1(BO,BI,SIZE)
  855. KERNEL2x1_1(xxx)
  856. KERNEL2x1_2(xxx)
  857. KERNEL2x1_3(xxx)
  858. KERNEL2x1_4(xxx)
  859. je .L1_16 // exit on ZF left by the last KERNEL2x1_4 (macro not visible; presumably the index reached zero)
  860. prefetcht0 B_PR1(BO,BI,SIZE)
  861. KERNEL2x1_1(xxx)
  862. KERNEL2x1_2(xxx)
  863. KERNEL2x1_3(xxx)
  864. KERNEL2x1_4(xxx)
  865. prefetcht0 B_PR1(BO,BI,SIZE)
  866. KERNEL2x1_1(xxx)
  867. KERNEL2x1_2(xxx)
  868. KERNEL2x1_3(xxx)
  869. KERNEL2x1_4(xxx)
  870. je .L1_16 // same exit check for the second group of eight
  871. jmp .L1_12
  872. ALIGN_4
  873. .L1_16: // remainder: the (count & 7) steps left over by the loop at .L1_12
  874. #ifndef TRMMKERNEL
  875. movq K, %rax
  876. #else
  877. movq KKK, %rax
  878. #endif
  879. andq $7, %rax # rax = k & 7 (steps left after the multiple-of-8 part)
  880. je .L1_19 // no remainder: go to the write-back
  881. movq %rax, BI // Index for BO
  882. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  883. salq $2, %rax // rax = rax * 4 ; number of values
  884. leaq (AO, %rax, SIZE), AO // point AO past the remaining values
  885. leaq (BO, BI, SIZE), BO // point BO past the remaining values
  886. negq BI // negative index into BO
  887. negq %rax // negative index into AO
  888. ALIGN_4
  889. .L1_17:
  890. KERNEL2x1_SUB(xxx)
  891. jl .L1_17 // repeat while the flags left by KERNEL2x1_SUB say "less" (macro not visible; presumably index still negative)
  892. ALIGN_4
  893. .L1_19: // write-back: combine xmm8/9 and xmm12/13, scale by alpha, store 4 values at CO1
  894. vmovddup ALPHA_R, %xmm0 // xmm0 = ALPHA_R duplicated into both 64-bit lanes
  895. vmovddup ALPHA_I, %xmm1 // xmm1 = ALPHA_I duplicated into both 64-bit lanes
  896. // swap high and low 64 bits
  897. vshufpd $0x01, %xmm9 , %xmm9, %xmm9
  898. vshufpd $0x01, %xmm13, %xmm13, %xmm13
  899. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  900. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  901. vaddsubpd %xmm9, %xmm8 , %xmm8 // xmm8 = (xmm8.lo - xmm9.lo, xmm8.hi + xmm9.hi)
  902. vaddsubpd %xmm13,%xmm12 , %xmm12 // same for the xmm12/xmm13 pair
  903. vshufpd $0x01, %xmm8 , %xmm8, %xmm9 // xmm9 = xmm8 with its halves swapped
  904. vshufpd $0x01, %xmm12, %xmm12, %xmm13 // xmm13 = xmm12 with its halves swapped
  905. #else
  906. vaddsubpd %xmm8, %xmm9 , %xmm9 // xmm9 = (xmm9.lo - xmm8.lo, xmm9.hi + xmm8.hi)
  907. vaddsubpd %xmm12,%xmm13, %xmm13 // same for the xmm12/xmm13 pair
  908. vmovapd %xmm9, %xmm8
  909. vmovapd %xmm13, %xmm12
  910. // swap high and low 64 bits
  911. vshufpd $0x01, %xmm9 , %xmm9, %xmm9
  912. vshufpd $0x01, %xmm13, %xmm13, %xmm13
  913. #endif
  914. // multiply with ALPHA_R
  915. vmulpd %xmm8 , %xmm0, %xmm8
  916. vmulpd %xmm12, %xmm0, %xmm12
  917. // multiply with ALPHA_I
  918. vmulpd %xmm9 , %xmm1, %xmm9
  919. vmulpd %xmm13, %xmm1, %xmm13
  920. vaddsubpd %xmm9, %xmm8 , %xmm8 // (lo*ar - hi*ai, hi*ar + lo*ai): complex product of (lo, hi) with alpha
  921. vaddsubpd %xmm13, %xmm12, %xmm12 // same for the second result
  922. #ifndef TRMMKERNEL
  923. vaddpd (CO1), %xmm8 , %xmm8 // non-TRMM build: add the values already stored in C
  924. vaddpd 2 * SIZE(CO1), %xmm12, %xmm12
  925. #endif
  926. vmovups %xmm8 , (CO1)
  927. vmovups %xmm12 , 2 * SIZE(CO1)
  928. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  929. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  930. movq K, %rax
  931. subq KKK, %rax // rax = K - KKK
  932. movq %rax, BI // Index for BO
  933. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  934. leaq (BO, BI, SIZE), BO // advance BO by (K - KKK) * 2 values
  935. salq $2, %rax // rax = rax * 4 ; number of values
  936. leaq (AO, %rax, SIZE), AO // advance AO by (K - KKK) * 4 values
  937. #endif
  938. #if defined(TRMMKERNEL) && defined(LEFT)
  939. addq $2, KK
  940. #endif
  941. addq $4 * SIZE, CO1 # coffset += 4
  942. decq I # i --
  943. jg .L1_11
  944. ALIGN_4
  945. /**************************************************************************
  946. * Rest of M: runs only when M is odd (bit 0 of M set)
  947. ***************************************************************************/
  948. .L1_40:
  949. testq $1, M
  950. jz .L999 // M is even: nothing left to do, go to the exit code
  951. ALIGN_4
  952. .L1_41: // setup for the M-remainder section; body uses the KERNEL1x1 macros
  953. #if !defined(TRMMKERNEL) || \
  954. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  955. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  956. leaq BUFFER1, BO // first buffer to BO
  957. addq $4 * SIZE, BO // bias BO by 4 values
  958. #else
  959. movq KK, %rax
  960. leaq BUFFER1, BO // first buffer to BO
  961. addq $4 * SIZE, BO // bias BO by 4 values
  962. movq %rax, BI // Index for BO
  963. leaq (,BI,2), BI // BI = BI * 2 ; number of values
  964. leaq (BO, BI, SIZE), BO // skip KK * 2 values in BO
  965. salq $1, %rax // rax = rax * 2 ; number of values
  966. leaq (AO, %rax, SIZE), AO // skip KK * 2 values in AO
  967. #endif
  968. vzeroall // zero every vector register before accumulating
  969. #ifndef TRMMKERNEL
  970. movq K, %rax
  971. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  972. movq K, %rax
  973. subq KK, %rax
  974. movq %rax, KKK // KKK = K - KK
  975. #else
  976. movq KK, %rax
  977. #ifdef LEFT
  978. addq $1, %rax // number of values in AO
  979. #else
  980. addq $1, %rax // number of values in BO
  981. #endif
  982. movq %rax, KKK // KKK = KK + 1 (both branches above add 1)
  983. #endif
  984. andq $-8, %rax // K = K - ( K % 8 )
  985. je .L1_46 // fewer than 8 steps: go straight to the remainder code
  986. movq %rax, BI // Index for BO
  987. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  988. salq $1, %rax // rax = rax * 2 ; number of values
  989. leaq (AO, %rax, SIZE), AO // point AO past the values this loop will cover
  990. leaq (BO, BI, SIZE), BO // point BO past the values this loop will cover
  991. negq BI // negative index into BO; presumably counted up toward zero by the KERNEL1x1 macros - confirm
  992. negq %rax // negative index into AO, same scheme
  993. ALIGN_4
  994. .L1_42: // unrolled main loop: eight KERNEL1x1 macros between exit checks
  995. prefetcht0 B_PR1(BO,BI,SIZE) // prefetch from the B buffer at displacement B_PR1
  996. KERNEL1x1_1(xxx)
  997. KERNEL1x1_2(xxx)
  998. KERNEL1x1_3(xxx)
  999. KERNEL1x1_4(xxx)
  1000. prefetcht0 B_PR1(BO,BI,SIZE)
  1001. KERNEL1x1_1(xxx)
  1002. KERNEL1x1_2(xxx)
  1003. KERNEL1x1_3(xxx)
  1004. KERNEL1x1_4(xxx)
  1005. je .L1_46 // exit on ZF left by the last KERNEL1x1_4 (macro not visible; presumably the index reached zero)
  1006. prefetcht0 B_PR1(BO,BI,SIZE)
  1007. KERNEL1x1_1(xxx)
  1008. KERNEL1x1_2(xxx)
  1009. KERNEL1x1_3(xxx)
  1010. KERNEL1x1_4(xxx)
  1011. prefetcht0 B_PR1(BO,BI,SIZE)
  1012. KERNEL1x1_1(xxx)
  1013. KERNEL1x1_2(xxx)
  1014. KERNEL1x1_3(xxx)
  1015. KERNEL1x1_4(xxx)
  1016. je .L1_46 // same exit check for the second group of eight
  1017. jmp .L1_42
  1018. ALIGN_4
  1019. .L1_46: // remainder: the (count & 7) steps left over by the loop at .L1_42
  1020. #ifndef TRMMKERNEL
  1021. movq K, %rax
  1022. #else
  1023. movq KKK, %rax
  1024. #endif
  1025. andq $7, %rax # rax = k & 7 (steps left after the multiple-of-8 part)
  1026. je .L1_49 // no remainder: go to the write-back
  1027. movq %rax, BI // Index for BO
  1028. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  1029. salq $1, %rax // rax = rax * 2 ; number of values
  1030. leaq (AO, %rax, SIZE), AO // point AO past the remaining values
  1031. leaq (BO, BI, SIZE), BO // point BO past the remaining values
  1032. negq BI // negative index into BO
  1033. negq %rax // negative index into AO
  1034. ALIGN_4
  1035. .L1_47:
  1036. KERNEL1x1_SUB(xxx)
  1037. jl .L1_47 // repeat while the flags left by KERNEL1x1_SUB say "less" (macro not visible; presumably index still negative)
  1038. ALIGN_4
  1039. .L1_49: // write-back: combine xmm8/xmm9, scale by alpha, store 2 values at CO1
  1040. vmovddup ALPHA_R, %xmm0 // xmm0 = ALPHA_R duplicated into both 64-bit lanes
  1041. vmovddup ALPHA_I, %xmm1 // xmm1 = ALPHA_I duplicated into both 64-bit lanes
  1042. // swap high and low 64 bits
  1043. vshufpd $0x01, %xmm9 , %xmm9, %xmm9
  1044. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1045. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1046. vaddsubpd %xmm9, %xmm8, %xmm8 // xmm8 = (xmm8.lo - xmm9.lo, xmm8.hi + xmm9.hi)
  1047. vshufpd $0x01, %xmm8 , %xmm8, %xmm9 // xmm9 = xmm8 with its halves swapped
  1048. #else
  1049. vaddsubpd %xmm8, %xmm9, %xmm9 // xmm9 = (xmm9.lo - xmm8.lo, xmm9.hi + xmm8.hi)
  1050. vmovapd %xmm9, %xmm8
  1051. // swap high and low 64 bits
  1052. vshufpd $0x01, %xmm9 , %xmm9, %xmm9
  1053. #endif
  1054. // multiply with ALPHA_R
  1055. vmulpd %xmm8 , %xmm0, %xmm8
  1056. // multiply with ALPHA_I
  1057. vmulpd %xmm9 , %xmm1, %xmm9
  1058. vaddsubpd %xmm9 ,%xmm8, %xmm8 // (lo*ar - hi*ai, hi*ar + lo*ai): complex product of (lo, hi) with alpha
  1059. #ifndef TRMMKERNEL
  1060. vaddpd (CO1), %xmm8 , %xmm8 // non-TRMM build: add the values already stored in C
  1061. #endif
  1062. vmovups %xmm8 , (CO1)
  1063. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1064. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1065. movq K, %rax
  1066. subq KKK, %rax // rax = K - KKK
  1067. movq %rax, BI // Index for BO
  1068. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  1069. leaq (BO, BI, SIZE), BO // advance BO by (K - KKK) * 2 values
  1070. salq $1, %rax // rax = rax * 2 ; number of values
  1071. leaq (AO, %rax, SIZE), AO // advance AO by (K - KKK) * 2 values
  1072. #endif
  1073. #if defined(TRMMKERNEL) && defined(LEFT)
  1074. addq $1, KK
  1075. #endif
  1076. addq $2 * SIZE, CO1 # coffset += 2
  1077. ALIGN_4
  1078. .L999: // common exit path: reload saved registers from the stack and return
  1079. vzeroupper // clear the upper halves of the ymm registers before returning
  1080. movq SP, %rsp // reload rsp from SP (how SP was set is outside this view)
  1081. movq (%rsp), %rbx // reload rbx, rbp, r12-r15 from the save area (presumably stored by the prologue)
  1082. movq 8(%rsp), %rbp
  1083. movq 16(%rsp), %r12
  1084. movq 24(%rsp), %r13
  1085. movq 32(%rsp), %r14
  1086. movq 40(%rsp), %r15
  1087. #ifdef WINDOWS_ABI
  1088. movq 48(%rsp), %rdi // Windows x64 ABI also treats rdi, rsi and xmm6-xmm15 as callee-saved
  1089. movq 56(%rsp), %rsi
  1090. vmovups 64(%rsp), %xmm6
  1091. vmovups 80(%rsp), %xmm7
  1092. vmovups 96(%rsp), %xmm8
  1093. vmovups 112(%rsp), %xmm9
  1094. vmovups 128(%rsp), %xmm10
  1095. vmovups 144(%rsp), %xmm11
  1096. vmovups 160(%rsp), %xmm12
  1097. vmovups 176(%rsp), %xmm13
  1098. vmovups 192(%rsp), %xmm14
  1099. vmovups 208(%rsp), %xmm15
  1100. #endif
  1101. addq $STACKSIZE, %rsp // release the STACKSIZE-byte save area
  1102. ret
  1103. EPILOGUE