
zgemm_kernel_2x2_piledriver.S

  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /*********************************************************************
  28. *
  29. * 2014/06/28 Saar
  30. * BLASTEST : OK
  31. * CTEST : OK
  32. * TEST : OK
  33. *
  34. *
  35. * 2013/10/30 Saar
  36. *
  37. * Parameter:
  38. * UNROLL_M 2
  39. * UNROLL_N 2
  40. * ZGEMM_P 384
  41. * ZGEMM_Q 168
  42. * A_PR1 512
  43. * B_PR1 256
  44. *
  45. * Performance at m x n on AMD 8320 (ACML-Version: 5.3.1):
  46. *
  47. * 3456x3456 82.4 GFLOPS with 8 threads on 4 modules (ACML: 76.3 ) (BULLDOZER: 81.0 )
  48. * 3456x3456 79.9 GFLOPS with 4 threads on 4 modules (ACML: 69.9 ) (BULLDOZER: 74.6 )
  49. * 3456x3456 40.4 GFLOPS with 2 threads on 2 modules (ACML: 35.8 ) (BULLDOZER: 37.9 )
  50. * 3456x3456 20.3 GFLOPS with 1 thread on 1 module (ACML: 18.1 ) (BULLDOZER: 19.2 )
  51. *
  52. * Performance at m x n on AMD 6380 (ACML-Version: 5.3.1):
  53. *
  54. * 6912x6912 227.5 GFLOPS with 32 threads on 16 modules (ACML: 166.3 ) (BULLDOZER: 228.5 )
  55. * 6912x6912 211.6 GFLOPS with 16 threads on 16 modules (ACML: 169.5 ) (BULLDOZER: 204.3 )
  56. * 6912x6912 123.5 GFLOPS with 8 threads on 8 modules (ACML: 92.7 ) (BULLDOZER: 117.0 )
  57. * 3456x3456 64.1 GFLOPS with 4 threads on 4 modules (ACML: 49.1 ) (BULLDOZER: 61.7 )
  58. * 3456x3456 33.4 GFLOPS with 2 threads on 2 modules (ACML: 28.1 ) (BULLDOZER: 30.9 )
  59. * 3456x3456 17.0 GFLOPS with 1 thread on 1 module (ACML: 15.2 ) (BULLDOZER: 15.7 )
  60. *
  61. *********************************************************************/
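/*********************************************************************
* Reference sketch (illustrative only, not part of the build): a
* minimal C view of what one pass of this 2x2 micro-kernel computes,
* i.e. one 2x2 tile of C updated with alpha * A * B in double complex
* arithmetic. The function and variable names below are made up for
* the sketch; the assembly works on packed, offset-biased buffers and
* keeps separate real-part and imaginary-part partial sums instead.
*
*   #include <complex.h>
*
*   static void zgemm_tile_2x2_ref(long k, double complex alpha,
*                                  const double complex *a,  // 2 x k, packed, 2 values per k step
*                                  const double complex *b,  // k x 2, packed, 2 values per k step
*                                  double complex *c, long ldc)
*   {
*       double complex c00 = 0, c10 = 0, c01 = 0, c11 = 0;
*       for (long p = 0; p < k; p++) {
*           c00 += a[2*p + 0] * b[2*p + 0];
*           c10 += a[2*p + 1] * b[2*p + 0];
*           c01 += a[2*p + 0] * b[2*p + 1];
*           c11 += a[2*p + 1] * b[2*p + 1];
*       }
*       c[0]       += alpha * c00;   c[1]       += alpha * c10;
*       c[ldc]     += alpha * c01;   c[ldc + 1] += alpha * c11;
*   }
*********************************************************************/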
  62. #define ASSEMBLER
  63. #include "common.h"
  64. #define OLD_M %rdi
  65. #define OLD_N %rsi
  66. #define M %r13
  67. #define J %r14
  68. #define OLD_K %rdx
  69. #define A %rcx
  70. #define B %r8
  71. #define C %r9
  72. #define LDC %r10
  73. #define I %r11
  74. #define AO %rdi
  75. #define BO %rsi
  76. #define CO1 %r15
  77. #define K %r12
  78. #define BI %rbp
  79. #define SP %rbx
  80. #define BO1 %rdi
  81. #define BO2 %r15
  82. #ifndef WINDOWS_ABI
  83. #define STACKSIZE 96
  84. #else
  85. #define STACKSIZE 320
  86. #define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
  87. #define OLD_A 48 + STACKSIZE(%rsp)
  88. #define OLD_B 56 + STACKSIZE(%rsp)
  89. #define OLD_C 64 + STACKSIZE(%rsp)
  90. #define OLD_LDC 72 + STACKSIZE(%rsp)
  91. #define OLD_OFFSET 80 + STACKSIZE(%rsp)
  92. #endif
  93. #define L_BUFFER_SIZE 256*8*4
  94. #define Ndiv6 24(%rsp)
  95. #define Nmod6 32(%rsp)
  96. #define N 40(%rsp)
  97. #define ALPHA_R 48(%rsp)
  98. #define ALPHA_I 56(%rsp)
  99. #define OFFSET 64(%rsp)
  100. #define KK 72(%rsp)
  101. #define KKK 80(%rsp)
  102. #define BUFFER1 128(%rsp)
  103. #if defined(OS_WINDOWS)
  104. #if L_BUFFER_SIZE > 16384
  105. #define STACK_TOUCH \
  106. movl $0, 4096 * 4(%rsp);\
  107. movl $0, 4096 * 3(%rsp);\
  108. movl $0, 4096 * 2(%rsp);\
  109. movl $0, 4096 * 1(%rsp);
  110. #elif L_BUFFER_SIZE > 12288
  111. #define STACK_TOUCH \
  112. movl $0, 4096 * 3(%rsp);\
  113. movl $0, 4096 * 2(%rsp);\
  114. movl $0, 4096 * 1(%rsp);
  115. #elif L_BUFFER_SIZE > 8192
  116. #define STACK_TOUCH \
  117. movl $0, 4096 * 2(%rsp);\
  118. movl $0, 4096 * 1(%rsp);
  119. #elif L_BUFFER_SIZE > 4096
  120. #define STACK_TOUCH \
  121. movl $0, 4096 * 1(%rsp);
  122. #else
  123. #define STACK_TOUCH
  124. #endif
  125. #else
  126. #define STACK_TOUCH
  127. #endif
  128. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  129. #define VFMADD_R vfmaddpd
  130. #define VFMADD_I vfmaddpd
  131. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  132. #define VFMADD_R vfnmaddpd
  133. #define VFMADD_I vfmaddpd
  134. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  135. #define VFMADD_R vfmaddpd
  136. #define VFMADD_I vfnmaddpd
  137. #else
  138. #define VFMADD_R vfnmaddpd
  139. #define VFMADD_I vfnmaddpd
  140. #endif
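/*
* Sign selection above (hedged summary): for x = xr + i*xi and
* y = yr + i*yi the four conjugation variants expand as
*     x * y             = (xr*yr - xi*yi) + i*(xr*yi + xi*yr)
*     conj(x) * y       = (xr*yr + xi*yi) + i*(xr*yi - xi*yr)
*     x * conj(y)       = (xr*yr + xi*yi) - i*(xr*yi - xi*yr)
*     conj(x) * conj(y) = (xr*yr - xi*yi) - i*(xr*yi + xi*yr)
* As the vmovddup offsets in the kernel macros suggest, VFMADD_R
* accumulates the products taken with the duplicated real part of B and
* VFMADD_I those taken with the imaginary part, so the NN/NT/TN/TT,
* RN/RT/CN/CT, NR/NC/TR/TC and remaining (else) groups differ only in
* whether vfmaddpd or vfnmaddpd supplies the required sign.
*/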
  141. #define A_PR1 512
  142. #define B_PR1 256
  143. #define KERNEL2x2_1(xx) \
  144. prefetcht0 A_PR1(AO,%rax,SIZE) ;\
  145. vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  146. vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
  147. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  148. vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\
  149. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  150. vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
  151. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  152. VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
  153. vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
  154. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  155. VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\
  156. vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
  157. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  158. VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\
  159. #define KERNEL2x2_2(xx) \
  160. vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  161. vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  162. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  163. vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\
  164. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  165. vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  166. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  167. VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
  168. vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\
  169. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  170. VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\
  171. vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\
  172. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  173. VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\
  174. #define KERNEL2x2_3(xx) \
  175. prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\
  176. vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  177. vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\
  178. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  179. vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\
  180. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  181. vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\
  182. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  183. VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
  184. vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\
  185. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  186. VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\
  187. vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\
  188. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  189. VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\
  190. #define KERNEL2x2_4(xx) \
  191. vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  192. vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  193. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  194. vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\
  195. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  196. vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\
  197. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  198. VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
  199. vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\
  200. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  201. VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\
  202. vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\
  203. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  204. VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\
  205. addq $16, BI ;\
  206. addq $16, %rax ;\
  207. #define KERNEL2x2_SUB(xx) \
  208. vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  209. vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
  210. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  211. vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\
  212. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  213. vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
  214. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  215. VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
  216. vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
  217. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  218. VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\
  219. vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
  220. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  221. VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\
  222. addq $4, BI ;\
  223. addq $4, %rax ;\
  224. /************************************************************************************************/
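/*
* Register roles in the KERNEL2x2_* macros above (as the loads and FMAs
* suggest): xmm0/xmm1 carry two complex elements of A, xmm4-xmm7 the
* broadcast real and imaginary parts of the two B values, and
* xmm8-xmm15 are the eight running partial sums: two accumulators per
* entry of the 2x2 C tile, one built from the real part of B and one
* from the imaginary part. They are folded into complex results and
* scaled by alpha in the .L2_19 block further down.
*/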
  225. #define KERNEL1x2_1(xx) \
  226. prefetcht0 A_PR1(AO,%rax,SIZE) ;\
  227. vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  228. vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
  229. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  230. vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
  231. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  232. vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
  233. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  234. vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
  235. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  236. #define KERNEL1x2_2(xx) \
  237. vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  238. vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  239. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  240. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  241. vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  242. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  243. vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\
  244. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  245. vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\
  246. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  247. #define KERNEL1x2_3(xx) \
  248. vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  249. vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\
  250. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  251. vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\
  252. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  253. vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\
  254. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  255. vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\
  256. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  257. #define KERNEL1x2_4(xx) \
  258. vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  259. vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  260. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  261. vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\
  262. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  263. vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\
  264. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  265. vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\
  266. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  267. addq $16, BI ;\
  268. addq $8 , %rax ;\
  269. #define KERNEL1x2_SUB(xx) \
  270. vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  271. vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
  272. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  273. vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
  274. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  275. vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
  276. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  277. vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
  278. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  279. addq $4, BI ;\
  280. addq $2, %rax ;\
  281. /************************************************************************************************/
  282. #define KERNEL2x1_1(xx) \
  283. prefetcht0 A_PR1(AO,%rax,SIZE) ;\
  284. vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  285. vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  286. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  287. vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\
  288. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  289. vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  290. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  291. VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
  292. #define KERNEL2x1_2(xx) \
  293. vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  294. vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\
  295. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  296. vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\
  297. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  298. vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\
  299. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  300. VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
  301. #define KERNEL2x1_3(xx) \
  302. prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\
  303. vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  304. vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\
  305. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  306. vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\
  307. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  308. vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\
  309. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  310. VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
  311. #define KERNEL2x1_4(xx) \
  312. vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  313. vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\
  314. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  315. vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\
  316. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  317. vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  318. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  319. VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
  320. addq $8, BI ;\
  321. addq $16, %rax ;\
  322. #define KERNEL2x1_SUB(xx) \
  323. vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  324. vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  325. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  326. vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\
  327. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  328. vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  329. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  330. VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
  331. addq $2, BI ;\
  332. addq $4, %rax ;\
  333. /************************************************************************************************/
  334. #define KERNEL1x1_1(xx) \
  335. prefetcht0 A_PR1(AO,%rax,SIZE) ;\
  336. vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  337. vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  338. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  339. vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  340. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  341. #define KERNEL1x1_2(xx) \
  342. vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  343. vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\
  344. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  345. vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\
  346. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  347. #define KERNEL1x1_3(xx) \
  348. vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  349. vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\
  350. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  351. vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\
  352. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  353. #define KERNEL1x1_4(xx) \
  354. vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  355. vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\
  356. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  357. vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  358. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  359. addq $8, BI ;\
  360. addq $8, %rax ;\
  361. #define KERNEL1x1_SUB(xx) \
  362. vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  363. vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  364. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  365. vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  366. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  367. addq $2, BI ;\
  368. addq $2, %rax ;\
  369. /************************************************************************************************/
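/*
* Overall flow of the routine below (descriptive summary):
*   - save callee-saved registers, set up the on-stack work area and
*     copy the current slice of B into BUFFER1 (.L2_02b / .L1_02b),
*   - loop over N two columns at a time (.L2_xx) with a single-column
*     tail (.L1_xx), and over M two rows at a time with a one-row tail,
*   - run the inner K loop unrolled by 8 through the KERNELmxn_1..4
*     macros, then handle K % 8 with the KERNELmxn_SUB macros,
*   - scale by alpha, add into C (skipped for TRMMKERNEL) and restore
*     the saved registers at .L999.
*/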
  370. PROLOGUE
  371. PROFCODE
  372. subq $STACKSIZE, %rsp
  373. movq %rbx, (%rsp)
  374. movq %rbp, 8(%rsp)
  375. movq %r12, 16(%rsp)
  376. movq %r13, 24(%rsp)
  377. movq %r14, 32(%rsp)
  378. movq %r15, 40(%rsp)
  379. vzeroupper
  380. #ifdef WINDOWS_ABI
  381. movq %rdi, 48(%rsp)
  382. movq %rsi, 56(%rsp)
  383. vmovups %xmm6, 64(%rsp)
  384. vmovups %xmm7, 80(%rsp)
  385. vmovups %xmm8, 96(%rsp)
  386. vmovups %xmm9, 112(%rsp)
  387. vmovups %xmm10, 128(%rsp)
  388. vmovups %xmm11, 144(%rsp)
  389. vmovups %xmm12, 160(%rsp)
  390. vmovups %xmm13, 176(%rsp)
  391. vmovups %xmm14, 192(%rsp)
  392. vmovups %xmm15, 208(%rsp)
  393. movq ARG1, OLD_M
  394. movq ARG2, OLD_N
  395. movq ARG3, OLD_K
  396. movq OLD_A, A
  397. movq OLD_B, B
  398. movq OLD_C, C
  399. movq OLD_LDC, LDC
  400. #ifdef TRMMKERNEL
  401. vmovsd OLD_OFFSET, %xmm12
  402. #endif
  403. vmovaps %xmm3, %xmm0
  404. vmovsd OLD_ALPHA_I, %xmm1
  405. #else
  406. movq STACKSIZE + 8(%rsp), LDC
  407. #ifdef TRMMKERNEL
  408. vmovsd STACKSIZE + 16(%rsp), %xmm12
  409. #endif
  410. #endif
  411. movq %rsp, SP # save old stack
  412. subq $128 + L_BUFFER_SIZE, %rsp
  413. andq $-4096, %rsp # align stack
  414. STACK_TOUCH
  415. cmpq $0, OLD_M
  416. je .L999
  417. cmpq $0, OLD_N
  418. je .L999
  419. cmpq $0, OLD_K
  420. je .L999
  421. movq OLD_M, M
  422. movq OLD_N, N
  423. movq OLD_K, K
  424. vmovsd %xmm0, ALPHA_R
  425. vmovsd %xmm1, ALPHA_I
  426. salq $ZBASE_SHIFT, LDC
  427. movq N, %rax
  428. xorq %rdx, %rdx
  429. movq $2, %rdi
  430. divq %rdi // N / 2
  431. movq %rax, Ndiv6 // N / 2
  432. movq %rdx, Nmod6 // N % 2
  433. #ifdef TRMMKERNEL
  434. vmovsd %xmm12, OFFSET
  435. vmovsd %xmm12, KK
  436. #ifndef LEFT
  437. negq KK
  438. #endif
  439. #endif
  440. .L2_0:
  441. movq Ndiv6, J
  442. cmpq $0, J
  443. je .L1_0
  444. ALIGN_4
  445. .L2_01:
  446. // copy to sub buffer
  447. movq B, BO1
  448. leaq BUFFER1, BO // first buffer to BO
  449. movq K, %rax
  450. ALIGN_4
  451. .L2_02b:
  452. vmovups (BO1), %xmm0
  453. vmovups 2 * SIZE(BO1), %xmm1
  454. vmovups %xmm0, (BO)
  455. vmovups %xmm1, 2 * SIZE(BO)
  456. addq $4*SIZE,BO1
  457. addq $4*SIZE,BO
  458. decq %rax
  459. jnz .L2_02b
  460. .L2_02c:
  461. movq BO1, B // next offset of B
  462. .L2_10:
  463. movq C, CO1
  464. leaq (C, LDC, 2), C // c += 2 * ldc
  465. #if defined(TRMMKERNEL) && defined(LEFT)
  466. movq OFFSET, %rax
  467. movq %rax, KK
  468. #endif
  469. movq A, AO // aoffset = a
  470. addq $8 * SIZE, AO
  471. movq M, I
  472. sarq $1, I // i = (m >> 1)
  473. je .L2_40
  474. ALIGN_4
  475. .L2_11:
  476. #if !defined(TRMMKERNEL) || \
  477. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  478. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  479. leaq BUFFER1, BO // first buffer to BO
  480. addq $8 * SIZE, BO
  481. #else
  482. movq KK, %rax
  483. leaq BUFFER1, BO // first buffer to BO
  484. addq $8 * SIZE, BO
  485. movq %rax, BI // Index for BO
  486. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  487. leaq (BO, BI, SIZE), BO
  488. salq $2, %rax // rax = rax * 4 ; number of values
  489. leaq (AO, %rax, SIZE), AO
  490. #endif
  491. vzeroall
  492. #ifndef TRMMKERNEL
  493. movq K, %rax
  494. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  495. movq K, %rax
  496. subq KK, %rax
  497. movq %rax, KKK
  498. #else
  499. movq KK, %rax
  500. #ifdef LEFT
  501. addq $2, %rax // number of values in AO
  502. #else
  503. addq $2, %rax // number of values in BO
  504. #endif
  505. movq %rax, KKK
  506. #endif
  507. andq $-8, %rax // K = K - ( K % 8 )
  508. je .L2_16
  509. movq %rax, BI // Index for BO
  510. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  511. salq $2, %rax // rax = rax * 4 ; number of values
  512. leaq (AO, %rax, SIZE), AO
  513. leaq (BO, BI, SIZE), BO
  514. negq BI
  515. negq %rax
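/*
* Indexing note (this and the other unrolled K loops below use the same
* setup): AO and BO are first advanced past the block, then BI and %rax
* are negated, so the kernel macros address their operands with
* displacements such as -8 * SIZE(AO, %rax, SIZE) while BI/%rax count
* up towards zero; each `je` relies on the flags of the addq at the end
* of the preceding KERNEL..._4 macro and leaves the loop once the index
* has reached zero.
*/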
  516. ALIGN_4
  517. .L2_12:
  518. prefetcht0 B_PR1(BO,BI,SIZE)
  519. KERNEL2x2_1(xxx)
  520. KERNEL2x2_2(xxx)
  521. prefetcht0 B_PR1+64(BO,BI,SIZE)
  522. KERNEL2x2_3(xxx)
  523. KERNEL2x2_4(xxx)
  524. prefetcht0 B_PR1(BO,BI,SIZE)
  525. KERNEL2x2_1(xxx)
  526. KERNEL2x2_2(xxx)
  527. prefetcht0 B_PR1+64(BO,BI,SIZE)
  528. KERNEL2x2_3(xxx)
  529. KERNEL2x2_4(xxx)
  530. je .L2_16
  531. prefetcht0 B_PR1(BO,BI,SIZE)
  532. KERNEL2x2_1(xxx)
  533. KERNEL2x2_2(xxx)
  534. prefetcht0 B_PR1+64(BO,BI,SIZE)
  535. KERNEL2x2_3(xxx)
  536. KERNEL2x2_4(xxx)
  537. prefetcht0 B_PR1(BO,BI,SIZE)
  538. KERNEL2x2_1(xxx)
  539. KERNEL2x2_2(xxx)
  540. prefetcht0 B_PR1+64(BO,BI,SIZE)
  541. KERNEL2x2_3(xxx)
  542. KERNEL2x2_4(xxx)
  543. je .L2_16
  544. jmp .L2_12
  545. ALIGN_4
  546. .L2_16:
  547. #ifndef TRMMKERNEL
  548. movq K, %rax
  549. #else
  550. movq KKK, %rax
  551. #endif
  552. andq $7, %rax # if (k & 7)
  553. je .L2_19
  554. movq %rax, BI // Index for BO
  555. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  556. salq $2, %rax // rax = rax * 4 ; number of values
  557. leaq (AO, %rax, SIZE), AO
  558. leaq (BO, BI, SIZE), BO
  559. negq BI
  560. negq %rax
  561. ALIGN_4
  562. .L2_17:
  563. KERNEL2x2_SUB(xxx)
  564. jl .L2_17
  565. ALIGN_4
  566. .L2_19:
  567. vmovddup ALPHA_R, %xmm0
  568. vmovddup ALPHA_I, %xmm1
  569. // swap high and low 64-bit halves
  570. vshufpd $0x01, %xmm9 , %xmm9, %xmm9
  571. vshufpd $0x01, %xmm11, %xmm11, %xmm11
  572. vshufpd $0x01, %xmm13, %xmm13, %xmm13
  573. vshufpd $0x01, %xmm15, %xmm15, %xmm15
  574. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  575. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  576. vaddsubpd %xmm9, %xmm8 , %xmm8
  577. vaddsubpd %xmm11,%xmm10, %xmm10
  578. vaddsubpd %xmm13,%xmm12, %xmm12
  579. vaddsubpd %xmm15,%xmm14, %xmm14
  580. vshufpd $0x01, %xmm8 , %xmm8, %xmm9
  581. vshufpd $0x01, %xmm10, %xmm10, %xmm11
  582. vshufpd $0x01, %xmm12, %xmm12, %xmm13
  583. vshufpd $0x01, %xmm14, %xmm14, %xmm15
  584. #else
  585. vaddsubpd %xmm8, %xmm9 ,%xmm9
  586. vaddsubpd %xmm10, %xmm11,%xmm11
  587. vaddsubpd %xmm12, %xmm13,%xmm13
  588. vaddsubpd %xmm14, %xmm15,%xmm15
  589. vmovapd %xmm9, %xmm8
  590. vmovapd %xmm11, %xmm10
  591. vmovapd %xmm13, %xmm12
  592. vmovapd %xmm15, %xmm14
  593. // swap high and low 64-bit halves
  594. vshufpd $0x01, %xmm9 , %xmm9, %xmm9
  595. vshufpd $0x01, %xmm11, %xmm11, %xmm11
  596. vshufpd $0x01, %xmm13, %xmm13, %xmm13
  597. vshufpd $0x01, %xmm15, %xmm15, %xmm15
  598. #endif
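/*
* Alpha scaling (hedged reading of the code below): after the combine
* above, the even accumulators (xmm8/xmm10/xmm12/xmm14) hold the complex
* sums as (re, im) pairs and the odd ones (xmm9/xmm11/xmm13/xmm15) the
* same pairs with the two doubles swapped. With alpha = ar + i*ai,
*     (re + i*im) * alpha = (re*ar - im*ai) + i*(im*ar + re*ai)
* which is what the vmulpd by the duplicated ALPHA_R / ALPHA_I and the
* final vaddsubpd sequence produce.
*/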
  599. // multiply with ALPHA_R
  600. vmulpd %xmm8 , %xmm0, %xmm8
  601. vmulpd %xmm10, %xmm0, %xmm10
  602. vmulpd %xmm12, %xmm0, %xmm12
  603. vmulpd %xmm14, %xmm0, %xmm14
  604. // multiply with ALPHA_I
  605. vmulpd %xmm9 , %xmm1, %xmm9
  606. vmulpd %xmm11, %xmm1, %xmm11
  607. vmulpd %xmm13, %xmm1, %xmm13
  608. vmulpd %xmm15, %xmm1, %xmm15
  609. vaddsubpd %xmm9, %xmm8 , %xmm8
  610. vaddsubpd %xmm11,%xmm10, %xmm10
  611. vaddsubpd %xmm13,%xmm12, %xmm12
  612. vaddsubpd %xmm15,%xmm14, %xmm14
  613. #ifndef TRMMKERNEL
  614. vaddpd (CO1), %xmm8 , %xmm8
  615. vaddpd 2 * SIZE(CO1), %xmm12, %xmm12
  616. vaddpd (CO1, LDC), %xmm10, %xmm10
  617. vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14
  618. #endif
  619. vmovups %xmm8 , (CO1)
  620. vmovups %xmm12 , 2 * SIZE(CO1)
  621. vmovups %xmm10 , (CO1, LDC)
  622. vmovups %xmm14 , 2 * SIZE(CO1, LDC)
  623. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  624. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  625. movq K, %rax
  626. subq KKK, %rax
  627. movq %rax, BI // Index for BO
  628. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  629. leaq (BO, BI, SIZE), BO
  630. salq $2, %rax // rax = rax * 4 ; number of values
  631. leaq (AO, %rax, SIZE), AO
  632. #endif
  633. #if defined(TRMMKERNEL) && defined(LEFT)
  634. addq $2, KK
  635. #endif
  636. addq $4 * SIZE, CO1 # coffset += 4
  637. decq I # i --
  638. jg .L2_11
  639. ALIGN_4
  640. /**************************************************************************
  641. * Rest of M
  642. ***************************************************************************/
  643. .L2_40:
  644. testq $1, M
  645. jz .L2_60 // to next 2 lines of N
  646. ALIGN_4
  647. .L2_41:
  648. #if !defined(TRMMKERNEL) || \
  649. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  650. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  651. leaq BUFFER1, BO // first buffer to BO
  652. addq $8 * SIZE, BO
  653. #else
  654. movq KK, %rax
  655. leaq BUFFER1, BO // first buffer to BO
  656. addq $8 * SIZE, BO
  657. movq %rax, BI // Index for BO
  658. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  659. leaq (BO, BI, SIZE), BO
  660. salq $1, %rax // rax = rax * 2 ; number of values
  661. leaq (AO, %rax, SIZE), AO
  662. #endif
  663. vzeroall
  664. #ifndef TRMMKERNEL
  665. movq K, %rax
  666. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  667. movq K, %rax
  668. subq KK, %rax
  669. movq %rax, KKK
  670. #else
  671. movq KK, %rax
  672. #ifdef LEFT
  673. addq $1, %rax // number of values in AO
  674. #else
  675. addq $2, %rax // number of values in BO
  676. #endif
  677. movq %rax, KKK
  678. #endif
  679. andq $-8, %rax // K = K - ( K % 8 )
  680. je .L2_46
  681. movq %rax, BI // Index for BO
  682. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  683. salq $1, %rax // rax = rax * 2 ; number of values
  684. leaq (AO, %rax, SIZE), AO
  685. leaq (BO, BI, SIZE), BO
  686. negq BI
  687. negq %rax
  688. ALIGN_4
  689. .L2_42:
  690. prefetcht0 B_PR1(BO,BI,SIZE)
  691. KERNEL1x2_1(xxx)
  692. KERNEL1x2_2(xxx)
  693. prefetcht0 B_PR1+64(BO,BI,SIZE)
  694. KERNEL1x2_3(xxx)
  695. KERNEL1x2_4(xxx)
  696. prefetcht0 B_PR1(BO,BI,SIZE)
  697. KERNEL1x2_1(xxx)
  698. KERNEL1x2_2(xxx)
  699. prefetcht0 B_PR1+64(BO,BI,SIZE)
  700. KERNEL1x2_3(xxx)
  701. KERNEL1x2_4(xxx)
  702. je .L2_46
  703. prefetcht0 B_PR1(BO,BI,SIZE)
  704. KERNEL1x2_1(xxx)
  705. KERNEL1x2_2(xxx)
  706. prefetcht0 B_PR1+64(BO,BI,SIZE)
  707. KERNEL1x2_3(xxx)
  708. KERNEL1x2_4(xxx)
  709. prefetcht0 B_PR1(BO,BI,SIZE)
  710. KERNEL1x2_1(xxx)
  711. KERNEL1x2_2(xxx)
  712. prefetcht0 B_PR1+64(BO,BI,SIZE)
  713. KERNEL1x2_3(xxx)
  714. KERNEL1x2_4(xxx)
  715. je .L2_46
  716. jmp .L2_42
  717. ALIGN_4
  718. .L2_46:
  719. #ifndef TRMMKERNEL
  720. movq K, %rax
  721. #else
  722. movq KKK, %rax
  723. #endif
  724. andq $7, %rax # if (k & 7)
  725. je .L2_49
  726. movq %rax, BI // Index for BO
  727. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  728. salq $1, %rax // rax = rax * 2 ; number of values
  729. leaq (AO, %rax, SIZE), AO
  730. leaq (BO, BI, SIZE), BO
  731. negq BI
  732. negq %rax
  733. ALIGN_4
  734. .L2_47:
  735. KERNEL1x2_SUB(xxx)
  736. jl .L2_47
  737. ALIGN_4
  738. .L2_49:
  739. vmovddup ALPHA_R, %xmm0
  740. vmovddup ALPHA_I, %xmm1
  741. // swap high and low 64-bit halves
  742. vshufpd $0x01, %xmm9 , %xmm9, %xmm9
  743. vshufpd $0x01, %xmm11, %xmm11, %xmm11
  744. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  745. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  746. vaddsubpd %xmm9, %xmm8 , %xmm8
  747. vaddsubpd %xmm11,%xmm10, %xmm10
  748. vshufpd $0x01, %xmm8 , %xmm8, %xmm9
  749. vshufpd $0x01, %xmm10, %xmm10, %xmm11
  750. #else
  751. vaddsubpd %xmm8, %xmm9, %xmm9
  752. vaddsubpd %xmm10,%xmm11, %xmm11
  753. vmovapd %xmm9, %xmm8
  754. vmovapd %xmm11, %xmm10
  755. // swap high and low 64-bit halves
  756. vshufpd $0x01, %xmm9 , %xmm9, %xmm9
  757. vshufpd $0x01, %xmm11, %xmm11, %xmm11
  758. #endif
  759. // multiply with ALPHA_R
  760. vmulpd %xmm8 , %xmm0, %xmm8
  761. vmulpd %xmm10, %xmm0, %xmm10
  762. // multiply with ALPHA_I
  763. vmulpd %xmm9 , %xmm1, %xmm9
  764. vmulpd %xmm11, %xmm1, %xmm11
  765. vaddsubpd %xmm9, %xmm8 , %xmm8
  766. vaddsubpd %xmm11,%xmm10, %xmm10
  767. #ifndef TRMMKERNEL
  768. vaddpd (CO1), %xmm8 , %xmm8
  769. vaddpd (CO1, LDC), %xmm10, %xmm10
  770. #endif
  771. vmovups %xmm8 , (CO1)
  772. vmovups %xmm10 , (CO1, LDC)
  773. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  774. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  775. movq K, %rax
  776. subq KKK, %rax
  777. movq %rax, BI // Index for BO
  778. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  779. leaq (BO, BI, SIZE), BO
  780. salq $1, %rax // rax = rax * 2 ; number of values
  781. leaq (AO, %rax, SIZE), AO
  782. #endif
  783. #if defined(TRMMKERNEL) && defined(LEFT)
  784. addq $1, KK
  785. #endif
  786. addq $2 * SIZE, CO1 # coffset += 2
  787. ALIGN_4
  788. .L2_60:
  789. #if defined(TRMMKERNEL) && !defined(LEFT)
  790. addq $2, KK
  791. #endif
  792. decq J // j --
  793. jg .L2_01 // next 2 lines of N
  794. .L1_0:
  795. /************************************************************************************************
  796. * Loop for the remaining column when N % 2 > 0 (stored in Nmod6)
  797. *************************************************************************************************/
  798. movq Nmod6, J
  799. andq $1, J // j % 2
  800. je .L999
  801. ALIGN_4
  802. .L1_01:
  803. // copy to sub buffer
  804. movq B, BO1
  805. leaq BUFFER1, BO // first buffer to BO
  806. movq K, %rax
  807. ALIGN_4
  808. .L1_02b:
  809. vmovups (BO1), %xmm0
  810. vmovups %xmm0, (BO)
  811. addq $2*SIZE,BO1
  812. addq $2*SIZE,BO
  813. decq %rax
  814. jnz .L1_02b
  815. .L1_02c:
  816. movq BO1, B // next offset of B
  817. .L1_10:
  818. movq C, CO1
  819. leaq (C, LDC, 1), C // c += 1 * ldc
  820. #if defined(TRMMKERNEL) && defined(LEFT)
  821. movq OFFSET, %rax
  822. movq %rax, KK
  823. #endif
  824. movq A, AO // aoffset = a
  825. addq $8 * SIZE, AO
  826. movq M, I
  827. sarq $1, I // i = (m >> 1)
  828. je .L1_40
  829. ALIGN_4
  830. .L1_11:
  831. #if !defined(TRMMKERNEL) || \
  832. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  833. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  834. leaq BUFFER1, BO // first buffer to BO
  835. addq $4 * SIZE, BO
  836. #else
  837. movq KK, %rax
  838. leaq BUFFER1, BO // first buffer to BO
  839. addq $4 * SIZE, BO
  840. movq %rax, BI // Index for BO
  841. leaq (,BI,2), BI // BI = BI * 2 ; number of values
  842. leaq (BO, BI, SIZE), BO
  843. salq $2, %rax // rax = rax * 4 ; number of values
  844. leaq (AO, %rax, SIZE), AO
  845. #endif
  846. vzeroall
  847. #ifndef TRMMKERNEL
  848. movq K, %rax
  849. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  850. movq K, %rax
  851. subq KK, %rax
  852. movq %rax, KKK
  853. #else
  854. movq KK, %rax
  855. #ifdef LEFT
  856. addq $2, %rax // number of values in AO
  857. #else
  858. addq $1, %rax // number of values in BO
  859. #endif
  860. movq %rax, KKK
  861. #endif
  862. andq $-8, %rax // K = K - ( K % 8 )
  863. je .L1_16
  864. movq %rax, BI // Index for BO
  865. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  866. salq $2, %rax // rax = rax * 4 ; number of values
  867. leaq (AO, %rax, SIZE), AO
  868. leaq (BO, BI, SIZE), BO
  869. negq BI
  870. negq %rax
  871. ALIGN_4
  872. .L1_12:
  873. prefetcht0 B_PR1(BO,BI,SIZE)
  874. KERNEL2x1_1(xxx)
  875. KERNEL2x1_2(xxx)
  876. KERNEL2x1_3(xxx)
  877. KERNEL2x1_4(xxx)
  878. prefetcht0 B_PR1(BO,BI,SIZE)
  879. KERNEL2x1_1(xxx)
  880. KERNEL2x1_2(xxx)
  881. KERNEL2x1_3(xxx)
  882. KERNEL2x1_4(xxx)
  883. je .L1_16
  884. prefetcht0 B_PR1(BO,BI,SIZE)
  885. KERNEL2x1_1(xxx)
  886. KERNEL2x1_2(xxx)
  887. KERNEL2x1_3(xxx)
  888. KERNEL2x1_4(xxx)
  889. prefetcht0 B_PR1(BO,BI,SIZE)
  890. KERNEL2x1_1(xxx)
  891. KERNEL2x1_2(xxx)
  892. KERNEL2x1_3(xxx)
  893. KERNEL2x1_4(xxx)
  894. je .L1_16
  895. jmp .L1_12
  896. ALIGN_4
  897. .L1_16:
  898. #ifndef TRMMKERNEL
  899. movq K, %rax
  900. #else
  901. movq KKK, %rax
  902. #endif
  903. andq $7, %rax # if (k & 7)
  904. je .L1_19
  905. movq %rax, BI // Index for BO
  906. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  907. salq $2, %rax // rax = rax * 4 ; number of values
  908. leaq (AO, %rax, SIZE), AO
  909. leaq (BO, BI, SIZE), BO
  910. negq BI
  911. negq %rax
  912. ALIGN_4
  913. .L1_17:
  914. KERNEL2x1_SUB(xxx)
  915. jl .L1_17
  916. ALIGN_4
  917. .L1_19:
  918. vmovddup ALPHA_R, %xmm0
  919. vmovddup ALPHA_I, %xmm1
  920. // swap high and low 64-bit halves
  921. vshufpd $0x01, %xmm9 , %xmm9, %xmm9
  922. vshufpd $0x01, %xmm13, %xmm13, %xmm13
  923. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  924. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  925. vaddsubpd %xmm9, %xmm8 , %xmm8
  926. vaddsubpd %xmm13,%xmm12 , %xmm12
  927. vshufpd $0x01, %xmm8 , %xmm8, %xmm9
  928. vshufpd $0x01, %xmm12, %xmm12, %xmm13
  929. #else
  930. vaddsubpd %xmm8, %xmm9 , %xmm9
  931. vaddsubpd %xmm12,%xmm13, %xmm13
  932. vmovapd %xmm9, %xmm8
  933. vmovapd %xmm13, %xmm12
  934. // swap high and low 64-bit halves
  935. vshufpd $0x01, %xmm9 , %xmm9, %xmm9
  936. vshufpd $0x01, %xmm13, %xmm13, %xmm13
  937. #endif
  938. // multiply with ALPHA_R
  939. vmulpd %xmm8 , %xmm0, %xmm8
  940. vmulpd %xmm12, %xmm0, %xmm12
  941. // multiply with ALPHA_I
  942. vmulpd %xmm9 , %xmm1, %xmm9
  943. vmulpd %xmm13, %xmm1, %xmm13
  944. vaddsubpd %xmm9, %xmm8 , %xmm8
  945. vaddsubpd %xmm13, %xmm12, %xmm12
  946. #ifndef TRMMKERNEL
  947. vaddpd (CO1), %xmm8 , %xmm8
  948. vaddpd 2 * SIZE(CO1), %xmm12, %xmm12
  949. #endif
  950. vmovups %xmm8 , (CO1)
  951. vmovups %xmm12 , 2 * SIZE(CO1)
  952. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  953. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  954. movq K, %rax
  955. subq KKK, %rax
  956. movq %rax, BI // Index for BO
  957. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  958. leaq (BO, BI, SIZE), BO
  959. salq $2, %rax // rax = rax * 4 ; number of values
  960. leaq (AO, %rax, SIZE), AO
  961. #endif
  962. #if defined(TRMMKERNEL) && defined(LEFT)
  963. addq $2, KK
  964. #endif
  965. addq $4 * SIZE, CO1 # coffset += 4
  966. decq I # i --
  967. jg .L1_11
  968. ALIGN_4
  969. /**************************************************************************
  970. * Rest of M
  971. ***************************************************************************/
  972. .L1_40:
  973. testq $1, M
  974. jz .L999
  975. ALIGN_4
  976. .L1_41:
  977. #if !defined(TRMMKERNEL) || \
  978. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  979. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  980. leaq BUFFER1, BO // first buffer to BO
  981. addq $4 * SIZE, BO
  982. #else
  983. movq KK, %rax
  984. leaq BUFFER1, BO // first buffer to BO
  985. addq $4 * SIZE, BO
  986. movq %rax, BI // Index for BO
  987. leaq (,BI,2), BI // BI = BI * 2 ; number of values
  988. leaq (BO, BI, SIZE), BO
  989. salq $1, %rax // rax = rax * 2 ; number of values
  990. leaq (AO, %rax, SIZE), AO
  991. #endif
  992. vzeroall
  993. #ifndef TRMMKERNEL
  994. movq K, %rax
  995. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  996. movq K, %rax
  997. subq KK, %rax
  998. movq %rax, KKK
  999. #else
  1000. movq KK, %rax
  1001. #ifdef LEFT
  1002. addq $1, %rax // number of values in AO
  1003. #else
  1004. addq $1, %rax // number of values in BO
  1005. #endif
  1006. movq %rax, KKK
  1007. #endif
  1008. andq $-8, %rax // K = K - ( K % 8 )
  1009. je .L1_46
  1010. movq %rax, BI // Index for BO
  1011. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  1012. salq $1, %rax // rax = rax * 2 ; number of values
  1013. leaq (AO, %rax, SIZE), AO
  1014. leaq (BO, BI, SIZE), BO
  1015. negq BI
  1016. negq %rax
  1017. ALIGN_4
  1018. .L1_42:
  1019. prefetcht0 B_PR1(BO,BI,SIZE)
  1020. KERNEL1x1_1(xxx)
  1021. KERNEL1x1_2(xxx)
  1022. KERNEL1x1_3(xxx)
  1023. KERNEL1x1_4(xxx)
  1024. prefetcht0 B_PR1(BO,BI,SIZE)
  1025. KERNEL1x1_1(xxx)
  1026. KERNEL1x1_2(xxx)
  1027. KERNEL1x1_3(xxx)
  1028. KERNEL1x1_4(xxx)
  1029. je .L1_46
  1030. prefetcht0 B_PR1(BO,BI,SIZE)
  1031. KERNEL1x1_1(xxx)
  1032. KERNEL1x1_2(xxx)
  1033. KERNEL1x1_3(xxx)
  1034. KERNEL1x1_4(xxx)
  1035. prefetcht0 B_PR1(BO,BI,SIZE)
  1036. KERNEL1x1_1(xxx)
  1037. KERNEL1x1_2(xxx)
  1038. KERNEL1x1_3(xxx)
  1039. KERNEL1x1_4(xxx)
  1040. je .L1_46
  1041. jmp .L1_42
  1042. ALIGN_4
  1043. .L1_46:
  1044. #ifndef TRMMKERNEL
  1045. movq K, %rax
  1046. #else
  1047. movq KKK, %rax
  1048. #endif
  1049. andq $7, %rax # if (k & 7)
  1050. je .L1_49
  1051. movq %rax, BI // Index for BO
  1052. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  1053. salq $1, %rax // rax = rax * 2 ; number of values
  1054. leaq (AO, %rax, SIZE), AO
  1055. leaq (BO, BI, SIZE), BO
  1056. negq BI
  1057. negq %rax
  1058. ALIGN_4
  1059. .L1_47:
  1060. KERNEL1x1_SUB(xxx)
  1061. jl .L1_47
  1062. ALIGN_4
  1063. .L1_49:
  1064. vmovddup ALPHA_R, %xmm0
  1065. vmovddup ALPHA_I, %xmm1
  1066. // swap high and low 64-bit halves
  1067. vshufpd $0x01, %xmm9 , %xmm9, %xmm9
  1068. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1069. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1070. vaddsubpd %xmm9, %xmm8, %xmm8
  1071. vshufpd $0x01, %xmm8 , %xmm8, %xmm9
  1072. #else
  1073. vaddsubpd %xmm8, %xmm9, %xmm9
  1074. vmovapd %xmm9, %xmm8
  1075. // swap high and low 64-bit halves
  1076. vshufpd $0x01, %xmm9 , %xmm9, %xmm9
  1077. #endif
  1078. // multiply with ALPHA_R
  1079. vmulpd %xmm8 , %xmm0, %xmm8
  1080. // multiply with ALPHA_I
  1081. vmulpd %xmm9 , %xmm1, %xmm9
  1082. vaddsubpd %xmm9 ,%xmm8, %xmm8
  1083. #ifndef TRMMKERNEL
  1084. vaddpd (CO1), %xmm8 , %xmm8
  1085. #endif
  1086. vmovups %xmm8 , (CO1)
  1087. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1088. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1089. movq K, %rax
  1090. subq KKK, %rax
  1091. movq %rax, BI // Index for BO
  1092. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  1093. leaq (BO, BI, SIZE), BO
  1094. salq $1, %rax // rax = rax * 2 ; number of values
  1095. leaq (AO, %rax, SIZE), AO
  1096. #endif
  1097. #if defined(TRMMKERNEL) && defined(LEFT)
  1098. addq $1, KK
  1099. #endif
  1100. addq $2 * SIZE, CO1 # coffset += 2
  1101. ALIGN_4
  1102. .L999:
  1103. vzeroupper
  1104. movq SP, %rsp
  1105. movq (%rsp), %rbx
  1106. movq 8(%rsp), %rbp
  1107. movq 16(%rsp), %r12
  1108. movq 24(%rsp), %r13
  1109. movq 32(%rsp), %r14
  1110. movq 40(%rsp), %r15
  1111. #ifdef WINDOWS_ABI
  1112. movq 48(%rsp), %rdi
  1113. movq 56(%rsp), %rsi
  1114. vmovups 64(%rsp), %xmm6
  1115. vmovups 80(%rsp), %xmm7
  1116. vmovups 96(%rsp), %xmm8
  1117. vmovups 112(%rsp), %xmm9
  1118. vmovups 128(%rsp), %xmm10
  1119. vmovups 144(%rsp), %xmm11
  1120. vmovups 160(%rsp), %xmm12
  1121. vmovups 176(%rsp), %xmm13
  1122. vmovups 192(%rsp), %xmm14
  1123. vmovups 208(%rsp), %xmm15
  1124. #endif
  1125. addq $STACKSIZE, %rsp
  1126. ret
  1127. EPILOGUE