
cgemm_kernel_8x2_sandy.S 56 kB

  1. /*********************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. **********************************************************************************/
  27. /*********************************************************************
  28. * 2014/07/29 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. * 2013/10/28 Saar
  34. * Parameter:
  35. * CGEMM_DEFAULT_UNROLL_N 2
  36. * CGEMM_DEFAULT_UNROLL_M 8
  37. * CGEMM_DEFAULT_P 768
  38. * CGEMM_DEFAULT_Q 512
  39. * A_PR1 512
  40. * B_PR1 512
  41. *
  42. * 2014/07/29 Saar
  43. * Performance at 6192x6192x6192:
  44. * 1 thread: 49 GFLOPS (MKL: 52)
  45. * 2 threads: 99 GFLOPS (MKL: 102)
  46. * 3 threads: 148 GFLOPS (MKL: 150)
  47. * 4 threads: 195 GFLOPS (MKL: 194)
  48. * 8 threads: 354 GFLOPS (MKL: 317)
  49. *
  50. *
  51. *********************************************************************/
  52. #define ASSEMBLER
  53. #include "common.h"
  54. #define OLD_M %rdi
  55. #define OLD_N %rsi
  56. #define M %r13
  57. #define J %r14
  58. #define OLD_K %rdx
  59. #define A %rcx
  60. #define B %r8
  61. #define C %r9
  62. #define LDC %r10
  63. #define I %r11
  64. #define AO %rdi
  65. #define BO %rsi
  66. #define CO1 %r15
  67. #define K %r12
  68. #define BI %rbp
  69. #define SP %rbx
  70. #define BO1 %rdi
  71. #define BO2 %r15
  72. #ifndef WINDOWS_ABI
  73. #define STACKSIZE 96
  74. #else
  75. #define STACKSIZE 320
  76. #define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
  77. #define OLD_A 48 + STACKSIZE(%rsp)
  78. #define OLD_B 56 + STACKSIZE(%rsp)
  79. #define OLD_C 64 + STACKSIZE(%rsp)
  80. #define OLD_LDC 72 + STACKSIZE(%rsp)
  81. #define OLD_OFFSET 80 + STACKSIZE(%rsp)
  82. #endif
  83. #define L_BUFFER_SIZE 8192
  84. #define Ndiv6 24(%rsp)
  85. #define Nmod6 32(%rsp)
  86. #define N 40(%rsp)
  87. #define ALPHA_R 48(%rsp)
  88. #define ALPHA_I 56(%rsp)
  89. #define OFFSET 64(%rsp)
  90. #define KK 72(%rsp)
  91. #define KKK 80(%rsp)
  92. #define BUFFER1 128(%rsp)
  93. #if defined(OS_WINDOWS)
  94. #if L_BUFFER_SIZE > 16384
  95. #define STACK_TOUCH \
  96. movl $ 0, 4096 * 4(%rsp);\
  97. movl $ 0, 4096 * 3(%rsp);\
  98. movl $ 0, 4096 * 2(%rsp);\
  99. movl $ 0, 4096 * 1(%rsp);
  100. #elif L_BUFFER_SIZE > 12288
  101. #define STACK_TOUCH \
  102. movl $ 0, 4096 * 3(%rsp);\
  103. movl $ 0, 4096 * 2(%rsp);\
  104. movl $ 0, 4096 * 1(%rsp);
  105. #elif L_BUFFER_SIZE > 8192
  106. #define STACK_TOUCH \
  107. movl $ 0, 4096 * 2(%rsp);\
  108. movl $ 0, 4096 * 1(%rsp);
  109. #elif L_BUFFER_SIZE > 4096
  110. #define STACK_TOUCH \
  111. movl $ 0, 4096 * 1(%rsp);
  112. #else
  113. #define STACK_TOUCH
  114. #endif
  115. #else
  116. #define STACK_TOUCH
  117. #endif
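// Sandy Bridge has no FMA instructions, so the VFMADDPS_* macros below emulate a
// fused multiply-add with a vmulps followed by vaddps/vsubps (using ymm2/ymm3,
// resp. xmm2/xmm3, as scratch). The four #if branches select the add/subtract
// signs required by the different conjugation variants of the complex kernel.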
  118. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  119. #define VFMADDPS_YR( y0,y1,y2 ) \
  120. vmulps y1,y2,%ymm2;\
  121. vaddps y0,%ymm2,y0
  122. #define VFMADDPS_YI( y0,y1,y2 ) \
  123. vmulps y1,y2,%ymm3;\
  124. vaddps y0,%ymm3,y0
  125. #define VFMADDPS_R( y0,y1,y2 ) \
  126. vmulps y1,y2,%xmm2;\
  127. vaddps y0,%xmm2,y0
  128. #define VFMADDPS_I( y0,y1,y2 ) \
  129. vmulps y1,y2,%xmm3;\
  130. vaddps y0,%xmm3,y0
  131. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  132. #define VFMADDPS_YR( y0,y1,y2 ) \
  133. vmulps y1,y2,%ymm2;\
  134. vsubps %ymm2,y0,y0
  135. #define VFMADDPS_YI( y0,y1,y2 ) \
  136. vmulps y1,y2,%ymm3;\
  137. vaddps y0,%ymm3,y0
  138. #define VFMADDPS_R( y0,y1,y2 ) \
  139. vmulps y1,y2,%xmm2;\
  140. vsubps %xmm2,y0,y0
  141. #define VFMADDPS_I( y0,y1,y2 ) \
  142. vmulps y1,y2,%xmm3;\
  143. vaddps y0,%xmm3,y0
  144. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  145. #define VFMADDPS_YR( y0,y1,y2 ) \
  146. vmulps y1,y2,%ymm2;\
  147. vaddps y0,%ymm2,y0
  148. #define VFMADDPS_YI( y0,y1,y2 ) \
  149. vmulps y1,y2,%ymm3;\
  150. vsubps %ymm3,y0,y0
  151. #define VFMADDPS_R( y0,y1,y2 ) \
  152. vmulps y1,y2,%xmm2;\
  153. vaddps y0,%xmm2,y0
  154. #define VFMADDPS_I( y0,y1,y2 ) \
  155. vmulps y1,y2,%xmm3;\
  156. vsubps %xmm3,y0,y0
  157. #else
  158. #define VFMADDPS_YR( y0,y1,y2 ) \
  159. vmulps y1,y2,%ymm2;\
  160. vsubps %ymm2,y0,y0
  161. #define VFMADDPS_YI( y0,y1,y2 ) \
  162. vmulps y1,y2,%ymm3;\
  163. vsubps %ymm3,y0,y0
  164. #define VFMADDPS_R( y0,y1,y2 ) \
  165. vmulps y1,y2,%xmm2;\
  166. vsubps %xmm2,y0,y0
  167. #define VFMADDPS_I( y0,y1,y2 ) \
  168. vmulps y1,y2,%xmm3;\
  169. vsubps %xmm3,y0,y0
  170. #endif
  171. #define A_PR1 512
  172. #define B_PR1 512
  173. /***************************************************************************************************************************/
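// KERNEL8x2_1: unrolled main-loop body covering four k-iterations of the 8x2
// micro-tile. Each iteration loads 8 complex elements of A (ymm0/ymm1), broadcasts
// the real and imaginary parts of one complex element from each of the two B
// columns, and accumulates the partial products in ymm8-ymm15. Per invocation BI
// advances by 16 and the A index (%rax) by 64 floats.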
  174. .macro KERNEL8x2_1
  175. vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
  176. vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4
  177. vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1
  178. vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5
  179. prefetcht0 A_PR1(AO, %rax, SIZE)
  180. VFMADDPS_YR( %ymm8,%ymm4,%ymm0 )
  181. vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6
  182. VFMADDPS_YI( %ymm9,%ymm5,%ymm0 )
  183. vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7
  184. VFMADDPS_YR( %ymm12,%ymm4,%ymm1 )
  185. VFMADDPS_YI( %ymm13,%ymm5,%ymm1 )
  186. VFMADDPS_YR( %ymm10,%ymm6,%ymm0 )
  187. vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4
  188. VFMADDPS_YI( %ymm11,%ymm7,%ymm0 )
  189. vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5
  190. VFMADDPS_YR( %ymm14,%ymm6,%ymm1 )
  191. vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0
  192. VFMADDPS_YI( %ymm15,%ymm7,%ymm1 )
  193. vmovups 8 * SIZE(AO, %rax, SIZE), %ymm1
  194. prefetcht0 A_PR1+64(AO, %rax, SIZE)
  195. VFMADDPS_YR( %ymm8,%ymm4,%ymm0 )
  196. vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm6
  197. VFMADDPS_YI( %ymm9,%ymm5,%ymm0 )
  198. vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm7
  199. VFMADDPS_YR( %ymm12,%ymm4,%ymm1 )
  200. VFMADDPS_YI( %ymm13,%ymm5,%ymm1 )
  201. VFMADDPS_YR( %ymm10,%ymm6,%ymm0 )
  202. vbroadcastss 0 * SIZE(BO, BI, SIZE), %ymm4
  203. VFMADDPS_YI( %ymm11,%ymm7,%ymm0 )
  204. vbroadcastss 1 * SIZE(BO, BI, SIZE), %ymm5
  205. VFMADDPS_YR( %ymm14,%ymm6,%ymm1 )
  206. vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0
  207. VFMADDPS_YI( %ymm15,%ymm7,%ymm1 )
  208. vmovups 24 * SIZE(AO, %rax, SIZE), %ymm1
  209. prefetcht0 A_PR1+128(AO, %rax, SIZE)
  210. VFMADDPS_YR( %ymm8,%ymm4,%ymm0 )
  211. vbroadcastss 2 * SIZE(BO, BI, SIZE), %ymm6
  212. VFMADDPS_YI( %ymm9,%ymm5,%ymm0 )
  213. vbroadcastss 3 * SIZE(BO, BI, SIZE), %ymm7
  214. VFMADDPS_YR( %ymm12,%ymm4,%ymm1 )
  215. VFMADDPS_YI( %ymm13,%ymm5,%ymm1 )
  216. VFMADDPS_YR( %ymm10,%ymm6,%ymm0 )
  217. vbroadcastss 4 * SIZE(BO, BI, SIZE), %ymm4
  218. VFMADDPS_YI( %ymm11,%ymm7,%ymm0 )
  219. vbroadcastss 5 * SIZE(BO, BI, SIZE), %ymm5
  220. VFMADDPS_YR( %ymm14,%ymm6,%ymm1 )
  221. vmovups 32 * SIZE(AO, %rax, SIZE), %ymm0
  222. VFMADDPS_YI( %ymm15,%ymm7,%ymm1 )
  223. vmovups 40 * SIZE(AO, %rax, SIZE), %ymm1
  224. prefetcht0 A_PR1+192(AO, %rax, SIZE)
  225. VFMADDPS_YR( %ymm8,%ymm4,%ymm0 )
  226. vbroadcastss 6 * SIZE(BO, BI, SIZE), %ymm6
  227. VFMADDPS_YI( %ymm9,%ymm5,%ymm0 )
  228. vbroadcastss 7 * SIZE(BO, BI, SIZE), %ymm7
  229. VFMADDPS_YR( %ymm12,%ymm4,%ymm1 )
  230. VFMADDPS_YI( %ymm13,%ymm5,%ymm1 )
  231. VFMADDPS_YR( %ymm10,%ymm6,%ymm0 )
  232. VFMADDPS_YI( %ymm11,%ymm7,%ymm0 )
  233. addq $ 16, BI
  234. VFMADDPS_YR( %ymm14,%ymm6,%ymm1 )
  235. VFMADDPS_YI( %ymm15,%ymm7,%ymm1 )
  236. addq $ 64, %rax
  237. .endm
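// KERNEL8x2_SUB: single k-iteration of the same 8x2 update, used for the K % 8
// remainder loop.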
  238. .macro KERNEL8x2_SUB
  239. vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
  240. vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1
  241. vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4
  242. vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5
  243. VFMADDPS_YR( %ymm8,%ymm4,%ymm0 )
  244. vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6
  245. VFMADDPS_YI( %ymm9,%ymm5,%ymm0 )
  246. vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7
  247. VFMADDPS_YR( %ymm12,%ymm4,%ymm1 )
  248. VFMADDPS_YI( %ymm13,%ymm5,%ymm1 )
  249. VFMADDPS_YR( %ymm10,%ymm6,%ymm0 )
  250. VFMADDPS_YI( %ymm11,%ymm7,%ymm0 )
  251. VFMADDPS_YR( %ymm14,%ymm6,%ymm1 )
  252. VFMADDPS_YI( %ymm15,%ymm7,%ymm1 )
  253. addq $ 4 , BI
  254. addq $ 16, %rax
  255. .endm
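// SAVE8x2: merge the real/imaginary partial sums with vaddsubps, scale by
// ALPHA_R/ALPHA_I, add the existing 8x2 tile of C unless building a TRMM kernel,
// and store the result.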
  256. .macro SAVE8x2
  257. vbroadcastss ALPHA_R, %ymm0
  258. vbroadcastss ALPHA_I, %ymm1
  259. // swap real and imaginary parts
  260. vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9
  261. vshufps $ 0xb1, %ymm11, %ymm11, %ymm11
  262. vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
  263. vshufps $ 0xb1, %ymm15, %ymm15, %ymm15
  264. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  265. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  266. vaddsubps %ymm9, %ymm8 , %ymm8
  267. vaddsubps %ymm11,%ymm10, %ymm10
  268. vaddsubps %ymm13,%ymm12, %ymm12
  269. vaddsubps %ymm15,%ymm14, %ymm14
  270. vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9
  271. vshufps $ 0xb1, %ymm10, %ymm10, %ymm11
  272. vshufps $ 0xb1, %ymm12, %ymm12, %ymm13
  273. vshufps $ 0xb1, %ymm14, %ymm14, %ymm15
  274. #else
  275. vaddsubps %ymm8, %ymm9 ,%ymm9
  276. vaddsubps %ymm10, %ymm11,%ymm11
  277. vaddsubps %ymm12, %ymm13,%ymm13
  278. vaddsubps %ymm14, %ymm15,%ymm15
  279. vmovaps %ymm9, %ymm8
  280. vmovaps %ymm11, %ymm10
  281. vmovaps %ymm13, %ymm12
  282. vmovaps %ymm15, %ymm14
  283. // swap real and imaginary parts
  284. vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9
  285. vshufps $ 0xb1, %ymm11, %ymm11, %ymm11
  286. vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
  287. vshufps $ 0xb1, %ymm15, %ymm15, %ymm15
  288. #endif
  289. // multiply with ALPHA_R
  290. vmulps %ymm8 , %ymm0, %ymm8
  291. vmulps %ymm10, %ymm0, %ymm10
  292. vmulps %ymm12, %ymm0, %ymm12
  293. vmulps %ymm14, %ymm0, %ymm14
  294. // multiply with ALPHA_I
  295. vmulps %ymm9 , %ymm1, %ymm9
  296. vmulps %ymm11, %ymm1, %ymm11
  297. vmulps %ymm13, %ymm1, %ymm13
  298. vmulps %ymm15, %ymm1, %ymm15
  299. vaddsubps %ymm9, %ymm8 , %ymm8
  300. vaddsubps %ymm11,%ymm10, %ymm10
  301. vaddsubps %ymm13,%ymm12, %ymm12
  302. vaddsubps %ymm15,%ymm14, %ymm14
  303. #ifndef TRMMKERNEL
  304. vaddps (CO1), %ymm8 , %ymm8
  305. vaddps 8 * SIZE(CO1), %ymm12, %ymm12
  306. vaddps (CO1, LDC), %ymm10, %ymm10
  307. vaddps 8 * SIZE(CO1, LDC), %ymm14, %ymm14
  308. #endif
  309. vmovups %ymm8 , (CO1)
  310. vmovups %ymm12 , 8 * SIZE(CO1)
  311. vmovups %ymm10 , (CO1, LDC)
  312. vmovups %ymm14 , 8 * SIZE(CO1, LDC)
  313. prefetcht0 64(CO1)
  314. prefetcht0 64(CO1, LDC)
  315. .endm
  316. /***************************************************************************************************************************/
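// The 4x2, 2x2 and 1x2 kernels below handle the M remainder (m % 8) against two
// columns of B, using xmm registers instead of ymm.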
  317. .macro KERNEL4x2_SUB
  318. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
  319. vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4
  320. VFMADDPS_R( %xmm8,%xmm4,%xmm0 )
  321. vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1
  322. VFMADDPS_R( %xmm12,%xmm4,%xmm1 )
  323. vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5
  324. VFMADDPS_I( %xmm9,%xmm5,%xmm0 )
  325. VFMADDPS_I( %xmm13,%xmm5,%xmm1 )
  326. vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6
  327. VFMADDPS_R( %xmm10,%xmm6,%xmm0 )
  328. VFMADDPS_R( %xmm14,%xmm6,%xmm1 )
  329. vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7
  330. VFMADDPS_I( %xmm11,%xmm7,%xmm0 )
  331. VFMADDPS_I( %xmm15,%xmm7,%xmm1 )
  332. addq $ 4, BI
  333. addq $ 8, %rax
  334. .endm
  335. .macro SAVE4x2
  336. vbroadcastss ALPHA_R, %xmm0
  337. vbroadcastss ALPHA_I, %xmm1
  338. // swap real and imaginary parts
  339. vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
  340. vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
  341. vshufps $ 0xb1, %xmm13, %xmm13, %xmm13
  342. vshufps $ 0xb1, %xmm15, %xmm15, %xmm15
  343. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  344. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  345. vaddsubps %xmm9, %xmm8 , %xmm8
  346. vaddsubps %xmm11,%xmm10, %xmm10
  347. vaddsubps %xmm13,%xmm12, %xmm12
  348. vaddsubps %xmm15,%xmm14, %xmm14
  349. vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9
  350. vshufps $ 0xb1, %xmm10, %xmm10, %xmm11
  351. vshufps $ 0xb1, %xmm12, %xmm12, %xmm13
  352. vshufps $ 0xb1, %xmm14, %xmm14, %xmm15
  353. #else
  354. vaddsubps %xmm8, %xmm9 ,%xmm9
  355. vaddsubps %xmm10, %xmm11,%xmm11
  356. vaddsubps %xmm12, %xmm13,%xmm13
  357. vaddsubps %xmm14, %xmm15,%xmm15
  358. vmovaps %xmm9, %xmm8
  359. vmovaps %xmm11, %xmm10
  360. vmovaps %xmm13, %xmm12
  361. vmovaps %xmm15, %xmm14
  362. // swap real and imaginary parts
  363. vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
  364. vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
  365. vshufps $ 0xb1, %xmm13, %xmm13, %xmm13
  366. vshufps $ 0xb1, %xmm15, %xmm15, %xmm15
  367. #endif
  368. // multiply with ALPHA_R
  369. vmulps %xmm8 , %xmm0, %xmm8
  370. vmulps %xmm10, %xmm0, %xmm10
  371. vmulps %xmm12, %xmm0, %xmm12
  372. vmulps %xmm14, %xmm0, %xmm14
  373. // multiply with ALPHA_I
  374. vmulps %xmm9 , %xmm1, %xmm9
  375. vmulps %xmm11, %xmm1, %xmm11
  376. vmulps %xmm13, %xmm1, %xmm13
  377. vmulps %xmm15, %xmm1, %xmm15
  378. vaddsubps %xmm9, %xmm8 , %xmm8
  379. vaddsubps %xmm11,%xmm10, %xmm10
  380. vaddsubps %xmm13,%xmm12, %xmm12
  381. vaddsubps %xmm15,%xmm14, %xmm14
  382. #ifndef TRMMKERNEL
  383. vaddps (CO1), %xmm8 , %xmm8
  384. vaddps 4 * SIZE(CO1), %xmm12, %xmm12
  385. vaddps (CO1, LDC), %xmm10, %xmm10
  386. vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14
  387. #endif
  388. vmovups %xmm8 , (CO1)
  389. vmovups %xmm12 , 4 * SIZE(CO1)
  390. vmovups %xmm10 , (CO1, LDC)
  391. vmovups %xmm14 , 4 * SIZE(CO1, LDC)
  392. .endm
  393. /************************************************************************************************/
  394. .macro KERNEL2x2_SUB
  395. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
  396. vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4
  397. VFMADDPS_R( %xmm8,%xmm4,%xmm0 )
  398. vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5
  399. VFMADDPS_I( %xmm9,%xmm5,%xmm0 )
  400. vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6
  401. VFMADDPS_R( %xmm10,%xmm6,%xmm0 )
  402. vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7
  403. VFMADDPS_I( %xmm11,%xmm7,%xmm0 )
  404. addq $ 4, BI
  405. addq $ 4, %rax
  406. .endm
  407. .macro SAVE2x2
  408. vbroadcastss ALPHA_R, %xmm0
  409. vbroadcastss ALPHA_I, %xmm1
  410. // swap real and imaginary parts
  411. vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
  412. vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
  413. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  414. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  415. vaddsubps %xmm9, %xmm8 , %xmm8
  416. vaddsubps %xmm11,%xmm10, %xmm10
  417. vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9
  418. vshufps $ 0xb1, %xmm10, %xmm10, %xmm11
  419. #else
  420. vaddsubps %xmm8, %xmm9 ,%xmm9
  421. vaddsubps %xmm10, %xmm11,%xmm11
  422. vmovaps %xmm9, %xmm8
  423. vmovaps %xmm11, %xmm10
  424. // swap real and imaginary parts
  425. vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
  426. vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
  427. #endif
  428. // multiply with ALPHA_R
  429. vmulps %xmm8 , %xmm0, %xmm8
  430. vmulps %xmm10, %xmm0, %xmm10
  431. // multiply with ALPHA_I
  432. vmulps %xmm9 , %xmm1, %xmm9
  433. vmulps %xmm11, %xmm1, %xmm11
  434. vaddsubps %xmm9, %xmm8 , %xmm8
  435. vaddsubps %xmm11,%xmm10, %xmm10
  436. #ifndef TRMMKERNEL
  437. vaddps (CO1), %xmm8 , %xmm8
  438. vaddps (CO1, LDC), %xmm10, %xmm10
  439. #endif
  440. vmovups %xmm8 , (CO1)
  441. vmovups %xmm10 , (CO1, LDC)
  442. .endm
  443. /************************************************************************************************/
  444. .macro KERNEL1x2_SUB
  445. vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0
  446. vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4
  447. VFMADDPS_R( %xmm8,%xmm4,%xmm0 )
  448. vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5
  449. VFMADDPS_I( %xmm9,%xmm5,%xmm0 )
  450. vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6
  451. VFMADDPS_R( %xmm10,%xmm6,%xmm0 )
  452. vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7
  453. VFMADDPS_I( %xmm11,%xmm7,%xmm0 )
  454. addq $ 4, BI
  455. addq $ 2, %rax
  456. .endm
  457. .macro SAVE1x2
  458. vbroadcastss ALPHA_R, %xmm0
  459. vbroadcastss ALPHA_I, %xmm1
  460. // swap real and imaginary parts
  461. vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
  462. vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
  463. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  464. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  465. vaddsubps %xmm9, %xmm8 , %xmm8
  466. vaddsubps %xmm11,%xmm10, %xmm10
  467. vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9
  468. vshufps $ 0xb1, %xmm10, %xmm10, %xmm11
  469. #else
  470. vaddsubps %xmm8, %xmm9 ,%xmm9
  471. vaddsubps %xmm10, %xmm11,%xmm11
  472. vmovaps %xmm9, %xmm8
  473. vmovaps %xmm11, %xmm10
  474. // swap real and imaginary parts
  475. vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
  476. vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
  477. #endif
  478. // multiply with ALPHA_R
  479. vmulps %xmm8 , %xmm0, %xmm8
  480. vmulps %xmm10, %xmm0, %xmm10
  481. // multiply with ALPHA_I
  482. vmulps %xmm9 , %xmm1, %xmm9
  483. vmulps %xmm11, %xmm1, %xmm11
  484. vaddsubps %xmm9, %xmm8 , %xmm8
  485. vaddsubps %xmm11,%xmm10, %xmm10
  486. #ifndef TRMMKERNEL
  487. vmovsd (CO1), %xmm14
  488. vaddps %xmm14, %xmm8 , %xmm8
  489. vmovsd (CO1, LDC), %xmm15
  490. vaddps %xmm15, %xmm10, %xmm10
  491. #endif
  492. vmovsd %xmm8 , (CO1)
  493. vmovsd %xmm10 , (CO1, LDC)
  494. .endm
  495. /************************************************************************************************/
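// The 8x1, 4x1, 2x1 and 1x1 kernels below handle the N remainder, i.e. a single
// remaining column of B.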
  496. .macro KERNEL8x1_SUB
  497. vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
  498. vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1
  499. vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4
  500. VFMADDPS_YR( %ymm8,%ymm4,%ymm0 )
  501. VFMADDPS_YR( %ymm12,%ymm4,%ymm1 )
  502. vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5
  503. VFMADDPS_YI( %ymm9,%ymm5,%ymm0 )
  504. VFMADDPS_YI( %ymm13,%ymm5,%ymm1 )
  505. addq $ 2 , BI
  506. addq $ 16, %rax
  507. .endm
  508. .macro SAVE8x1
  509. vbroadcastss ALPHA_R, %ymm0
  510. vbroadcastss ALPHA_I, %ymm1
  511. // swap real and imaginary parts
  512. vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9
  513. vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
  514. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  515. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  516. vaddsubps %ymm9, %ymm8 , %ymm8
  517. vaddsubps %ymm13,%ymm12, %ymm12
  518. vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9
  519. vshufps $ 0xb1, %ymm12, %ymm12, %ymm13
  520. #else
  521. vaddsubps %ymm8, %ymm9 ,%ymm9
  522. vaddsubps %ymm12, %ymm13,%ymm13
  523. vmovaps %ymm9, %ymm8
  524. vmovaps %ymm13, %ymm12
  525. // swap real and imaginary parts
  526. vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9
  527. vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
  528. #endif
  529. // multiply with ALPHA_R
  530. vmulps %ymm8 , %ymm0, %ymm8
  531. vmulps %ymm12, %ymm0, %ymm12
  532. // multiply with ALPHA_I
  533. vmulps %ymm9 , %ymm1, %ymm9
  534. vmulps %ymm13, %ymm1, %ymm13
  535. vaddsubps %ymm9, %ymm8 , %ymm8
  536. vaddsubps %ymm13,%ymm12, %ymm12
  537. #ifndef TRMMKERNEL
  538. vaddps (CO1), %ymm8 , %ymm8
  539. vaddps 8 * SIZE(CO1), %ymm12, %ymm12
  540. #endif
  541. vmovups %ymm8 , (CO1)
  542. vmovups %ymm12 , 8 * SIZE(CO1)
  543. .endm
  544. /************************************************************************************************/
  545. .macro KERNEL4x1_SUB
  546. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
  547. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4
  548. VFMADDPS_R( %xmm8,%xmm4,%xmm0 )
  549. vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1
  550. VFMADDPS_R( %xmm12,%xmm4,%xmm1 )
  551. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5
  552. VFMADDPS_I( %xmm9,%xmm5,%xmm0 )
  553. VFMADDPS_I( %xmm13,%xmm5,%xmm1 )
  554. addq $ 2, BI
  555. addq $ 8, %rax
  556. .endm
  557. .macro SAVE4x1
  558. vbroadcastss ALPHA_R, %xmm0
  559. vbroadcastss ALPHA_I, %xmm1
  560. // swap real and imaginary parts
  561. vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
  562. vshufps $ 0xb1, %xmm13, %xmm13, %xmm13
  563. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  564. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  565. vaddsubps %xmm9, %xmm8 , %xmm8
  566. vaddsubps %xmm13,%xmm12, %xmm12
  567. vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9
  568. vshufps $ 0xb1, %xmm12, %xmm12, %xmm13
  569. #else
  570. vaddsubps %xmm8, %xmm9 ,%xmm9
  571. vaddsubps %xmm12, %xmm13,%xmm13
  572. vmovaps %xmm9, %xmm8
  573. vmovaps %xmm13, %xmm12
  574. // swap real and imaginary parts
  575. vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
  576. vshufps $ 0xb1, %xmm13, %xmm13, %xmm13
  577. #endif
  578. // multiply with ALPHA_R
  579. vmulps %xmm8 , %xmm0, %xmm8
  580. vmulps %xmm12, %xmm0, %xmm12
  581. // multiply with ALPHA_I
  582. vmulps %xmm9 , %xmm1, %xmm9
  583. vmulps %xmm13, %xmm1, %xmm13
  584. vaddsubps %xmm9, %xmm8 , %xmm8
  585. vaddsubps %xmm13,%xmm12, %xmm12
  586. #ifndef TRMMKERNEL
  587. vaddps (CO1), %xmm8 , %xmm8
  588. vaddps 4 * SIZE(CO1), %xmm12, %xmm12
  589. #endif
  590. vmovups %xmm8 , (CO1)
  591. vmovups %xmm12 , 4 * SIZE(CO1)
  592. .endm
  593. /************************************************************************************************/
  594. .macro KERNEL2x1_SUB
  595. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
  596. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4
  597. VFMADDPS_R( %xmm8,%xmm4,%xmm0 )
  598. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5
  599. VFMADDPS_I( %xmm9,%xmm5,%xmm0 )
  600. addq $ 2, BI
  601. addq $ 4, %rax
  602. .endm
  603. .macro SAVE2x1
  604. vbroadcastss ALPHA_R, %xmm0
  605. vbroadcastss ALPHA_I, %xmm1
  606. // swap real and imaginary parts
  607. vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
  608. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  609. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  610. vaddsubps %xmm9, %xmm8 , %xmm8
  611. vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9
  612. #else
  613. vaddsubps %xmm8, %xmm9 ,%xmm9
  614. vmovaps %xmm9, %xmm8
  615. // swap real and imaginary parts
  616. vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
  617. #endif
  618. // multiply with ALPHA_R
  619. vmulps %xmm8 , %xmm0, %xmm8
  620. // multiply with ALPHA_I
  621. vmulps %xmm9 , %xmm1, %xmm9
  622. vaddsubps %xmm9, %xmm8 , %xmm8
  623. #ifndef TRMMKERNEL
  624. vaddps (CO1), %xmm8 , %xmm8
  625. #endif
  626. vmovups %xmm8 , (CO1)
  627. .endm
  628. /************************************************************************************************/
  629. .macro KERNEL1x1_SUB
  630. vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0
  631. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4
  632. VFMADDPS_R( %xmm8,%xmm4,%xmm0 )
  633. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5
  634. VFMADDPS_I( %xmm9,%xmm5,%xmm0 )
  635. addq $ 2, BI
  636. addq $ 2, %rax
  637. .endm
  638. .macro SAVE1x1
  639. vbroadcastss ALPHA_R, %xmm0
  640. vbroadcastss ALPHA_I, %xmm1
  641. // swap real and imaginary parts
  642. vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
  643. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  644. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  645. vaddsubps %xmm9, %xmm8 , %xmm8
  646. vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9
  647. #else
  648. vaddsubps %xmm8, %xmm9 ,%xmm9
  649. vmovaps %xmm9, %xmm8
  650. // swap real and imaginary parts
  651. vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
  652. #endif
  653. // multiply with ALPHA_R
  654. vmulps %xmm8 , %xmm0, %xmm8
  655. // multiply with ALPHA_I
  656. vmulps %xmm9 , %xmm1, %xmm9
  657. vaddsubps %xmm9, %xmm8 , %xmm8
  658. #ifndef TRMMKERNEL
  659. vmovsd (CO1), %xmm14
  660. vaddps %xmm14, %xmm8 , %xmm8
  661. #endif
  662. vmovsd %xmm8 , (CO1)
  663. .endm
  664. /************************************************************************************************/
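// Function entry: save the callee-saved registers (plus %rdi, %rsi and xmm6-xmm15
// under the Windows ABI), store alpha on the stack, carve out an aligned local
// buffer for packing B, and split N into pairs of columns (Ndiv6 = N / 2) plus the
// remainder (Nmod6 = N % 2).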
  665. PROLOGUE
  666. PROFCODE
  667. subq $ STACKSIZE, %rsp
  668. movq %rbx, (%rsp)
  669. movq %rbp, 8(%rsp)
  670. movq %r12, 16(%rsp)
  671. movq %r13, 24(%rsp)
  672. movq %r14, 32(%rsp)
  673. movq %r15, 40(%rsp)
  674. vzeroupper
  675. #ifdef WINDOWS_ABI
  676. movq %rdi, 48(%rsp)
  677. movq %rsi, 56(%rsp)
  678. vmovups %xmm6, 64(%rsp)
  679. vmovups %xmm7, 80(%rsp)
  680. vmovups %xmm8, 96(%rsp)
  681. vmovups %xmm9, 112(%rsp)
  682. vmovups %xmm10, 128(%rsp)
  683. vmovups %xmm11, 144(%rsp)
  684. vmovups %xmm12, 160(%rsp)
  685. vmovups %xmm13, 176(%rsp)
  686. vmovups %xmm14, 192(%rsp)
  687. vmovups %xmm15, 208(%rsp)
  688. movq ARG1, OLD_M
  689. movq ARG2, OLD_N
  690. movq ARG3, OLD_K
  691. movq OLD_A, A
  692. movq OLD_B, B
  693. movq OLD_C, C
  694. movq OLD_LDC, LDC
  695. #ifdef TRMMKERNEL
  696. movsd OLD_OFFSET, %xmm12
  697. #endif
  698. vmovaps %xmm3, %xmm0
  699. vmovsd OLD_ALPHA_I, %xmm1
  700. #else
  701. movq STACKSIZE + 8(%rsp), LDC
  702. #ifdef TRMMKERNEL
  703. movsd STACKSIZE + 16(%rsp), %xmm12
  704. #endif
  705. #endif
  706. movq %rsp, SP # save old stack
  707. subq $ 128 + L_BUFFER_SIZE, %rsp
  708. andq $ -4096, %rsp # align stack
  709. STACK_TOUCH
  710. cmpq $ 0, OLD_M
  711. je .L999
  712. cmpq $ 0, OLD_N
  713. je .L999
  714. cmpq $ 0, OLD_K
  715. je .L999
  716. movq OLD_M, M
  717. movq OLD_N, N
  718. movq OLD_K, K
  719. vmovss %xmm0, ALPHA_R
  720. vmovss %xmm1, ALPHA_I
  721. salq $ ZBASE_SHIFT, LDC
  722. movq N, %rax
  723. xorq %rdx, %rdx
  724. movq $ 2, %rdi
  725. divq %rdi // N / 2
  726. movq %rax, Ndiv6 // N / 2
  727. movq %rdx, Nmod6 // N % 2
  728. #ifdef TRMMKERNEL
  729. vmovsd %xmm12, OFFSET
  730. vmovsd %xmm12, KK
  731. #ifndef LEFT
  732. negq KK
  733. #endif
  734. #endif
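// Outer loop over pairs of B columns (J = Ndiv6). Each pass packs two columns of B
// into BUFFER1, then walks M in blocks of 8, 4, 2 and 1 complex rows.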
  735. .L2_0:
  736. movq Ndiv6, J
  737. cmpq $ 0, J
  738. je .L1_0
  739. ALIGN_4
  740. .L2_01:
  741. // copy to sub buffer
  742. movq B, BO1
  743. leaq BUFFER1, BO // first buffer to BO
  744. movq K, %rax
  745. ALIGN_4
  746. .L2_02b:
  747. vmovups (BO1), %xmm0
  748. vmovups %xmm0, (BO)
  749. addq $ 4*SIZE,BO1
  750. addq $ 4*SIZE,BO
  751. decq %rax
  752. jnz .L2_02b
  753. .L2_02c:
  754. movq BO1, B // next offset of B
  755. .L2_10:
  756. movq C, CO1
  757. leaq (C, LDC, 2), C // c += 2 * ldc
  758. #if defined(TRMMKERNEL) && defined(LEFT)
  759. movq OFFSET, %rax
  760. movq %rax, KK
  761. #endif
  762. movq A, AO // aoffset = a
  763. addq $ 16 * SIZE, AO
  764. movq M, I
  765. sarq $ 3, I // i = (m >> 3)
  766. je .L2_4_10
  767. ALIGN_4
  768. /**********************************************************************************************************/
  769. .L2_8_11:
  770. #if !defined(TRMMKERNEL) || \
  771. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  772. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  773. leaq BUFFER1, BO // first buffer to BO
  774. addq $ 8 * SIZE, BO
  775. #else
  776. movq KK, %rax
  777. leaq BUFFER1, BO // first buffer to BO
  778. addq $ 8 * SIZE, BO
  779. movq %rax, BI // Index for BO
  780. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  781. leaq (BO, BI, SIZE), BO
  782. salq $ 4, %rax // rax = rax *16 ; number of values
  783. leaq (AO, %rax, SIZE), AO
  784. #endif
  785. vzeroall
  786. #ifndef TRMMKERNEL
  787. movq K, %rax
  788. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  789. movq K, %rax
  790. subq KK, %rax
  791. movq %rax, KKK
  792. #else
  793. movq KK, %rax
  794. #ifdef LEFT
  795. addq $ 8, %rax // number of values in AO
  796. #else
  797. addq $ 2, %rax // number of values in BO
  798. #endif
  799. movq %rax, KKK
  800. #endif
  801. andq $ -8, %rax // K = K - ( K % 8 )
  802. je .L2_8_16
  803. movq %rax, BI // Index for BO
  804. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  805. salq $ 4, %rax // rax = rax *16 ; number of values
  806. leaq (AO, %rax, SIZE), AO
  807. leaq (BO, BI, SIZE), BO
  808. negq BI
  809. negq %rax
  810. ALIGN_4
  811. .L2_8_12:
  812. prefetcht0 B_PR1(BO,BI,SIZE)
  813. KERNEL8x2_1
  814. prefetcht0 B_PR1(BO,BI,SIZE)
  815. KERNEL8x2_1
  816. je .L2_8_16
  817. prefetcht0 B_PR1(BO,BI,SIZE)
  818. KERNEL8x2_1
  819. prefetcht0 B_PR1(BO,BI,SIZE)
  820. KERNEL8x2_1
  821. je .L2_8_16
  822. jmp .L2_8_12
  823. ALIGN_4
  824. .L2_8_16:
  825. #ifndef TRMMKERNEL
  826. movq K, %rax
  827. #else
  828. movq KKK, %rax
  829. #endif
  830. andq $ 7, %rax # k % 8 remainder
  831. je .L2_8_19
  832. movq %rax, BI // Index for BO
  833. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  834. salq $ 4, %rax // rax = rax *16 ; number of values
  835. leaq (AO, %rax, SIZE), AO
  836. leaq (BO, BI, SIZE), BO
  837. negq BI
  838. negq %rax
  839. ALIGN_4
  840. .L2_8_17:
  841. KERNEL8x2_SUB
  842. jl .L2_8_17
  843. ALIGN_4
  844. .L2_8_19:
  845. SAVE8x2
  846. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  847. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  848. movq K, %rax
  849. subq KKK, %rax
  850. movq %rax, BI // Index for BO
  851. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  852. leaq (BO, BI, SIZE), BO
  853. salq $ 4, %rax // rax = rax *16 ; number of values
  854. leaq (AO, %rax, SIZE), AO
  855. #endif
  856. #if defined(TRMMKERNEL) && defined(LEFT)
  857. addq $ 8, KK
  858. #endif
  859. addq $ 16 * SIZE, CO1 # coffset += 16
  860. decq I # i --
  861. jg .L2_8_11
  862. ALIGN_4
  863. /**********************************************************************************************************/
  864. .L2_4_10:
  865. testq $ 7, M
  866. jz .L2_4_60 // to next 2 lines of N
  867. testq $ 4, M
  868. jz .L2_4_20
  869. ALIGN_4
  870. .L2_4_11:
  871. #if !defined(TRMMKERNEL) || \
  872. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  873. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  874. leaq BUFFER1, BO // first buffer to BO
  875. addq $ 8 * SIZE, BO
  876. #else
  877. movq KK, %rax
  878. leaq BUFFER1, BO // first buffer to BO
  879. addq $ 8 * SIZE, BO
  880. movq %rax, BI // Index for BO
  881. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  882. leaq (BO, BI, SIZE), BO
  883. salq $ 3, %rax // rax = rax * 8 ; number of values
  884. leaq (AO, %rax, SIZE), AO
  885. #endif
  886. vzeroall
  887. #ifndef TRMMKERNEL
  888. movq K, %rax
  889. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  890. movq K, %rax
  891. subq KK, %rax
  892. movq %rax, KKK
  893. #else
  894. movq KK, %rax
  895. #ifdef LEFT
  896. addq $ 4, %rax // number of values in AO
  897. #else
  898. addq $ 2, %rax // number of values in BO
  899. #endif
  900. movq %rax, KKK
  901. #endif
  902. andq $ -8, %rax // K = K - ( K % 8 )
  903. je .L2_4_16
  904. movq %rax, BI // Index for BO
  905. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  906. salq $ 3, %rax // rax = rax * 8 ; number of values
  907. leaq (AO, %rax, SIZE), AO
  908. leaq (BO, BI, SIZE), BO
  909. negq BI
  910. negq %rax
  911. ALIGN_4
  912. .L2_4_12:
  913. prefetcht0 A_PR1(AO,%rax,SIZE)
  914. KERNEL4x2_SUB
  915. KERNEL4x2_SUB
  916. prefetcht0 A_PR1(AO,%rax,SIZE)
  917. KERNEL4x2_SUB
  918. KERNEL4x2_SUB
  919. prefetcht0 A_PR1(AO,%rax,SIZE)
  920. KERNEL4x2_SUB
  921. KERNEL4x2_SUB
  922. prefetcht0 A_PR1(AO,%rax,SIZE)
  923. KERNEL4x2_SUB
  924. KERNEL4x2_SUB
  925. je .L2_4_16
  926. prefetcht0 A_PR1(AO,%rax,SIZE)
  927. KERNEL4x2_SUB
  928. KERNEL4x2_SUB
  929. prefetcht0 A_PR1(AO,%rax,SIZE)
  930. KERNEL4x2_SUB
  931. KERNEL4x2_SUB
  932. prefetcht0 A_PR1(AO,%rax,SIZE)
  933. KERNEL4x2_SUB
  934. KERNEL4x2_SUB
  935. prefetcht0 A_PR1(AO,%rax,SIZE)
  936. KERNEL4x2_SUB
  937. KERNEL4x2_SUB
  938. je .L2_4_16
  939. jmp .L2_4_12
  940. ALIGN_4
  941. .L2_4_16:
  942. #ifndef TRMMKERNEL
  943. movq K, %rax
  944. #else
  945. movq KKK, %rax
  946. #endif
  947. andq $ 7, %rax # k % 8 remainder
  948. je .L2_4_19
  949. movq %rax, BI // Index for BO
  950. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  951. salq $ 3, %rax // rax = rax * 8 ; number of values
  952. leaq (AO, %rax, SIZE), AO
  953. leaq (BO, BI, SIZE), BO
  954. negq BI
  955. negq %rax
  956. ALIGN_4
  957. .L2_4_17:
  958. KERNEL4x2_SUB
  959. jl .L2_4_17
  960. ALIGN_4
  961. .L2_4_19:
  962. SAVE4x2
  963. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  964. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  965. movq K, %rax
  966. subq KKK, %rax
  967. movq %rax, BI // Index for BO
  968. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  969. leaq (BO, BI, SIZE), BO
  970. salq $ 3, %rax // rax = rax * 8 ; number of values
  971. leaq (AO, %rax, SIZE), AO
  972. #endif
  973. #if defined(TRMMKERNEL) && defined(LEFT)
  974. addq $ 4, KK
  975. #endif
  976. addq $ 8 * SIZE, CO1 # coffset += 8
  977. ALIGN_4
  978. /**************************************************************************
  979. * Rest of M
  980. ***************************************************************************/
  981. .L2_4_20:
  982. testq $ 2, M
  983. jz .L2_4_40
  984. ALIGN_4
  985. .L2_4_21:
  986. #if !defined(TRMMKERNEL) || \
  987. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  988. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  989. leaq BUFFER1, BO // first buffer to BO
  990. addq $ 8 * SIZE, BO
  991. #else
  992. movq KK, %rax
  993. leaq BUFFER1, BO // first buffer to BO
  994. addq $ 8 * SIZE, BO
  995. movq %rax, BI // Index for BO
  996. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  997. leaq (BO, BI, SIZE), BO
  998. salq $ 2, %rax // rax = rax * 4 ; number of values
  999. leaq (AO, %rax, SIZE), AO
  1000. #endif
  1001. vzeroall
  1002. #ifndef TRMMKERNEL
  1003. movq K, %rax
  1004. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1005. movq K, %rax
  1006. subq KK, %rax
  1007. movq %rax, KKK
  1008. #else
  1009. movq KK, %rax
  1010. #ifdef LEFT
  1011. addq $ 2, %rax // number of values in AO
  1012. #else
  1013. addq $ 2, %rax // number of values in BO
  1014. #endif
  1015. movq %rax, KKK
  1016. #endif
  1017. andq $ -8, %rax // K = K - ( K % 8 )
  1018. je .L2_4_26
  1019. movq %rax, BI // Index for BO
  1020. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  1021. salq $ 2, %rax // rax = rax * 4 ; number of values
  1022. leaq (AO, %rax, SIZE), AO
  1023. leaq (BO, BI, SIZE), BO
  1024. negq BI
  1025. negq %rax
  1026. ALIGN_4
  1027. .L2_4_22:
  1028. prefetcht0 A_PR1(AO,%rax,SIZE)
  1029. KERNEL2x2_SUB
  1030. KERNEL2x2_SUB
  1031. KERNEL2x2_SUB
  1032. KERNEL2x2_SUB
  1033. prefetcht0 A_PR1(AO,%rax,SIZE)
  1034. KERNEL2x2_SUB
  1035. KERNEL2x2_SUB
  1036. KERNEL2x2_SUB
  1037. KERNEL2x2_SUB
  1038. je .L2_4_26
  1039. prefetcht0 A_PR1(AO,%rax,SIZE)
  1040. KERNEL2x2_SUB
  1041. KERNEL2x2_SUB
  1042. KERNEL2x2_SUB
  1043. KERNEL2x2_SUB
  1044. prefetcht0 A_PR1(AO,%rax,SIZE)
  1045. KERNEL2x2_SUB
  1046. KERNEL2x2_SUB
  1047. KERNEL2x2_SUB
  1048. KERNEL2x2_SUB
  1049. je .L2_4_26
  1050. jmp .L2_4_22
  1051. ALIGN_4
  1052. .L2_4_26:
  1053. #ifndef TRMMKERNEL
  1054. movq K, %rax
  1055. #else
  1056. movq KKK, %rax
  1057. #endif
  1058. andq $ 7, %rax # k % 8 remainder
  1059. je .L2_4_29
  1060. movq %rax, BI // Index for BO
  1061. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  1062. salq $ 2, %rax // rax = rax * 4 ; number of values
  1063. leaq (AO, %rax, SIZE), AO
  1064. leaq (BO, BI, SIZE), BO
  1065. negq BI
  1066. negq %rax
  1067. ALIGN_4
  1068. .L2_4_27:
  1069. KERNEL2x2_SUB
  1070. jl .L2_4_27
  1071. ALIGN_4
  1072. .L2_4_29:
  1073. vbroadcastss ALPHA_R, %xmm0
  1074. vbroadcastss ALPHA_I, %xmm1
  1075. // swap real and imaginary parts
  1076. vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
  1077. vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
  1078. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1079. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1080. vaddsubps %xmm9, %xmm8 , %xmm8
  1081. vaddsubps %xmm11,%xmm10, %xmm10
  1082. vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9
  1083. vshufps $ 0xb1, %xmm10, %xmm10, %xmm11
  1084. #else
  1085. vaddsubps %xmm8, %xmm9 ,%xmm9
  1086. vaddsubps %xmm10, %xmm11,%xmm11
  1087. vmovaps %xmm9, %xmm8
  1088. vmovaps %xmm11, %xmm10
  1089. // swap real and imaginary parts
  1090. vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
  1091. vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
  1092. #endif
  1093. // multiply with ALPHA_R
  1094. vmulps %xmm8 , %xmm0, %xmm8
  1095. vmulps %xmm10, %xmm0, %xmm10
  1096. // multiply with ALPHA_I
  1097. vmulps %xmm9 , %xmm1, %xmm9
  1098. vmulps %xmm11, %xmm1, %xmm11
  1099. vaddsubps %xmm9, %xmm8 , %xmm8
  1100. vaddsubps %xmm11,%xmm10, %xmm10
  1101. #ifndef TRMMKERNEL
  1102. vaddps (CO1), %xmm8 , %xmm8
  1103. vaddps (CO1, LDC), %xmm10, %xmm10
  1104. #endif
  1105. vmovups %xmm8 , (CO1)
  1106. vmovups %xmm10 , (CO1, LDC)
  1107. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1108. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1109. movq K, %rax
  1110. subq KKK, %rax
  1111. movq %rax, BI // Index for BO
  1112. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  1113. leaq (BO, BI, SIZE), BO
  1114. salq $ 2, %rax // rax = rax * 4 ; number of values
  1115. leaq (AO, %rax, SIZE), AO
  1116. #endif
  1117. #if defined(TRMMKERNEL) && defined(LEFT)
  1118. addq $ 2, KK
  1119. #endif
  1120. addq $ 4 * SIZE, CO1 # coffset += 4
  1121. decq I # i --
  1122. jg .L2_4_21
  1123. ALIGN_4
  1124. /**************************************************************************/
  1125. .L2_4_40:
  1126. testq $ 1, M
  1127. jz .L2_4_60 // to next 2 lines of N
  1128. ALIGN_4
  1129. .L2_4_41:
  1130. #if !defined(TRMMKERNEL) || \
  1131. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1132. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1133. leaq BUFFER1, BO // first buffer to BO
  1134. addq $ 8 * SIZE, BO
  1135. #else
  1136. movq KK, %rax
  1137. leaq BUFFER1, BO // first buffer to BO
  1138. addq $ 8 * SIZE, BO
  1139. movq %rax, BI // Index for BO
  1140. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  1141. leaq (BO, BI, SIZE), BO
  1142. salq $ 1, %rax // rax = rax * 2 ; number of values
  1143. leaq (AO, %rax, SIZE), AO
  1144. #endif
  1145. vzeroall
  1146. #ifndef TRMMKERNEL
  1147. movq K, %rax
  1148. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1149. movq K, %rax
  1150. subq KK, %rax
  1151. movq %rax, KKK
  1152. #else
  1153. movq KK, %rax
  1154. #ifdef LEFT
  1155. addq $ 1, %rax // number of values in AO
  1156. #else
  1157. addq $ 2, %rax // number of values in BO
  1158. #endif
  1159. movq %rax, KKK
  1160. #endif
  1161. andq $ -8, %rax // K = K - ( K % 8 )
  1162. je .L2_4_46
  1163. movq %rax, BI // Index for BO
  1164. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  1165. salq $ 1, %rax // rax = rax * 2 ; number of values
  1166. leaq (AO, %rax, SIZE), AO
  1167. leaq (BO, BI, SIZE), BO
  1168. negq BI
  1169. negq %rax
  1170. ALIGN_4
  1171. .L2_4_42:
  1172. prefetcht0 A_PR1(AO,%rax,SIZE)
  1173. KERNEL1x2_SUB
  1174. KERNEL1x2_SUB
  1175. KERNEL1x2_SUB
  1176. KERNEL1x2_SUB
  1177. KERNEL1x2_SUB
  1178. KERNEL1x2_SUB
  1179. KERNEL1x2_SUB
  1180. KERNEL1x2_SUB
  1181. je .L2_4_46
  1182. prefetcht0 A_PR1(AO,%rax,SIZE)
  1183. KERNEL1x2_SUB
  1184. KERNEL1x2_SUB
  1185. KERNEL1x2_SUB
  1186. KERNEL1x2_SUB
  1187. KERNEL1x2_SUB
  1188. KERNEL1x2_SUB
  1189. KERNEL1x2_SUB
  1190. KERNEL1x2_SUB
  1191. je .L2_4_46
  1192. jmp .L2_4_42
  1193. ALIGN_4
  1194. .L2_4_46:
  1195. #ifndef TRMMKERNEL
  1196. movq K, %rax
  1197. #else
  1198. movq KKK, %rax
  1199. #endif
  1200. andq $ 7, %rax # k % 8 remainder
  1201. je .L2_4_49
  1202. movq %rax, BI // Index for BO
  1203. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  1204. salq $ 1, %rax // rax = rax * 2 ; number of values
  1205. leaq (AO, %rax, SIZE), AO
  1206. leaq (BO, BI, SIZE), BO
  1207. negq BI
  1208. negq %rax
  1209. ALIGN_4
  1210. .L2_4_47:
  1211. KERNEL1x2_SUB
  1212. jl .L2_4_47
  1213. ALIGN_4
  1214. .L2_4_49:
  1215. SAVE1x2
  1216. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1217. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1218. movq K, %rax
  1219. subq KKK, %rax
  1220. movq %rax, BI // Index for BO
  1221. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  1222. leaq (BO, BI, SIZE), BO
  1223. salq $ 1, %rax // rax = rax * 2 ; number of values
  1224. leaq (AO, %rax, SIZE), AO
  1225. #endif
  1226. #if defined(TRMMKERNEL) && defined(LEFT)
  1227. addq $ 1, KK
  1228. #endif
  1229. addq $ 2 * SIZE, CO1 # coffset += 2
  1230. decq I # i --
  1231. jg .L2_4_41
  1232. ALIGN_4
  1233. .L2_4_60:
  1234. #if defined(TRMMKERNEL) && !defined(LEFT)
  1235. addq $ 2, KK
  1236. #endif
  1237. decq J // j --
  1238. jg .L2_01 // next 2 lines of N
  1239. .L1_0:
  1240. /************************************************************************************************
  1241. * Loop for Nmod6 % 2 > 0
  1242. *************************************************************************************************/
  1243. movq Nmod6, J
  1244. andq $ 1, J // j % 2
  1245. je .L999
  1246. ALIGN_4
  1247. .L1_01:
  1248. // copy to sub buffer
  1249. movq B, BO1
  1250. leaq BUFFER1, BO // first buffer to BO
  1251. movq K, %rax
  1252. ALIGN_4
  1253. .L1_02b:
  1254. vmovsd (BO1), %xmm0
  1255. vmovsd %xmm0, (BO)
  1256. addq $ 2*SIZE,BO1
  1257. addq $ 2*SIZE,BO
  1258. decq %rax
  1259. jnz .L1_02b
  1260. .L1_02c:
  1261. movq BO1, B // next offset of B
  1262. .L1_10:
  1263. movq C, CO1
  1264. leaq (C, LDC, 1), C // c += 1 * ldc
  1265. #if defined(TRMMKERNEL) && defined(LEFT)
  1266. movq OFFSET, %rax
  1267. movq %rax, KK
  1268. #endif
  1269. movq A, AO // aoffset = a
  1270. addq $ 16 * SIZE, AO
  1271. movq M, I
  1272. sarq $ 3, I // i = (m >> 3)
  1273. je .L1_4_10
  1274. ALIGN_4
  1275. /**************************************************************************************************/
  1276. .L1_8_11:
  1277. #if !defined(TRMMKERNEL) || \
  1278. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1279. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1280. leaq BUFFER1, BO // first buffer to BO
  1281. addq $ 4 * SIZE, BO
  1282. #else
  1283. movq KK, %rax
  1284. leaq BUFFER1, BO // first buffer to BO
  1285. addq $ 4 * SIZE, BO
  1286. movq %rax, BI // Index for BO
  1287. leaq (,BI,2), BI // BI = BI * 2 ; number of values
  1288. leaq (BO, BI, SIZE), BO
  1289. salq $ 4, %rax // rax = rax *16 ; number of values
  1290. leaq (AO, %rax, SIZE), AO
  1291. #endif
  1292. vzeroall
  1293. #ifndef TRMMKERNEL
  1294. movq K, %rax
  1295. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1296. movq K, %rax
  1297. subq KK, %rax
  1298. movq %rax, KKK
  1299. #else
  1300. movq KK, %rax
  1301. #ifdef LEFT
  1302. addq $ 8, %rax // number of values in AO
  1303. #else
  1304. addq $ 1, %rax // number of values in BO
  1305. #endif
  1306. movq %rax, KKK
  1307. #endif
  1308. andq $ -8, %rax // K = K - ( K % 8 )
  1309. je .L1_8_16
  1310. movq %rax, BI // Index for BO
  1311. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  1312. salq $ 4, %rax // rax = rax *16 ; number of values
  1313. leaq (AO, %rax, SIZE), AO
  1314. leaq (BO, BI, SIZE), BO
  1315. negq BI
  1316. negq %rax
  1317. ALIGN_4
  1318. .L1_8_12:
  1319. prefetcht0 A_PR1(AO,%rax,SIZE)
  1320. prefetcht0 B_PR1(BO,BI,SIZE)
  1321. KERNEL8x1_SUB
  1322. prefetcht0 A_PR1(AO,%rax,SIZE)
  1323. KERNEL8x1_SUB
  1324. prefetcht0 A_PR1(AO,%rax,SIZE)
  1325. KERNEL8x1_SUB
  1326. prefetcht0 A_PR1(AO,%rax,SIZE)
  1327. KERNEL8x1_SUB
  1328. prefetcht0 A_PR1(AO,%rax,SIZE)
  1329. KERNEL8x1_SUB
  1330. prefetcht0 A_PR1(AO,%rax,SIZE)
  1331. KERNEL8x1_SUB
  1332. prefetcht0 A_PR1(AO,%rax,SIZE)
  1333. KERNEL8x1_SUB
  1334. prefetcht0 A_PR1(AO,%rax,SIZE)
  1335. KERNEL8x1_SUB
  1336. je .L1_8_16
  1337. prefetcht0 A_PR1(AO,%rax,SIZE)
  1338. prefetcht0 B_PR1(BO,BI,SIZE)
  1339. KERNEL8x1_SUB
  1340. prefetcht0 A_PR1(AO,%rax,SIZE)
  1341. KERNEL8x1_SUB
  1342. prefetcht0 A_PR1(AO,%rax,SIZE)
  1343. KERNEL8x1_SUB
  1344. prefetcht0 A_PR1(AO,%rax,SIZE)
  1345. KERNEL8x1_SUB
  1346. prefetcht0 A_PR1(AO,%rax,SIZE)
  1347. KERNEL8x1_SUB
  1348. prefetcht0 A_PR1(AO,%rax,SIZE)
  1349. KERNEL8x1_SUB
  1350. prefetcht0 A_PR1(AO,%rax,SIZE)
  1351. KERNEL8x1_SUB
  1352. prefetcht0 A_PR1(AO,%rax,SIZE)
  1353. KERNEL8x1_SUB
  1354. je .L1_8_16
  1355. jmp .L1_8_12
  1356. ALIGN_4
  1357. .L1_8_16:
  1358. #ifndef TRMMKERNEL
  1359. movq K, %rax
  1360. #else
  1361. movq KKK, %rax
  1362. #endif
  1363. andq $ 7, %rax # k % 8 remainder
  1364. je .L1_8_19
  1365. movq %rax, BI // Index for BO
  1366. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  1367. salq $ 4, %rax // rax = rax *16 ; number of values
  1368. leaq (AO, %rax, SIZE), AO
  1369. leaq (BO, BI, SIZE), BO
  1370. negq BI
  1371. negq %rax
  1372. ALIGN_4
  1373. .L1_8_17:
  1374. KERNEL8x1_SUB
  1375. jl .L1_8_17
  1376. ALIGN_4
  1377. .L1_8_19:
  1378. SAVE8x1
  1379. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1380. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1381. movq K, %rax
  1382. subq KKK, %rax
  1383. movq %rax, BI // Index for BO
  1384. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  1385. leaq (BO, BI, SIZE), BO
  1386. salq $ 4, %rax // rax = rax *16 ; number of values
  1387. leaq (AO, %rax, SIZE), AO
  1388. #endif
  1389. #if defined(TRMMKERNEL) && defined(LEFT)
  1390. addq $ 8, KK
  1391. #endif
  1392. addq $ 16 * SIZE, CO1 # coffset += 16
  1393. decq I # i --
  1394. jg .L1_8_11
  1395. ALIGN_4
  1396. /**************************************************************************************************/
.L1_4_10:
testq $ 7, M
jz .L999

testq $ 4, M
jz .L1_4_20

.L1_4_11:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif

vzeroall

#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 4, %rax // number of values in AO
#else
addq $ 1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif

andq $ -8, %rax // K = K - ( K % 8 )
je .L1_4_16

movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4

.L1_4_12:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB

je .L1_4_16

prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB

je .L1_4_16
jmp .L1_4_12
ALIGN_4
.L1_4_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif

andq $ 7, %rax # if (k & 7)
je .L1_4_19

movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4

.L1_4_17:
KERNEL4x1_SUB
jl .L1_4_17
ALIGN_4
.L1_4_19:

SAVE4x1

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 4, KK
#endif

addq $ 8 * SIZE, CO1 # coffset += 8
ALIGN_4

/**************************************************************************
* Rest of M
***************************************************************************/
.L1_4_20:
testq $ 2, M
jz .L1_4_40
ALIGN_4

.L1_4_21:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif

vzeroall

#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 2, %rax // number of values in AO
#else
addq $ 1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif

andq $ -8, %rax // K = K - ( K % 8 )
je .L1_4_26

movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4

.L1_4_22:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB

je .L1_4_26

prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB

je .L1_4_26
jmp .L1_4_22
ALIGN_4
.L1_4_26:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif

andq $ 7, %rax # if (k & 7)
je .L1_4_29

movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4

.L1_4_27:
KERNEL2x1_SUB
jl .L1_4_27
ALIGN_4
.L1_4_29:

SAVE2x1

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 2, KK
#endif

addq $ 4 * SIZE, CO1 # coffset += 4
ALIGN_4

/**************************************************************************/
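// Final single row (M & 1) of the last column: 1x1 micro-kernel.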
.L1_4_40:
testq $ 1, M
jz .L999 // nothing left, go to the epilogue
ALIGN_4
.L1_4_41:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif

vzeroall

#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 1, %rax // number of values in AO
#else
addq $ 1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif

andq $ -8, %rax // K = K - ( K % 8 )
je .L1_4_46

movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4

.L1_4_42:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB

je .L1_4_46

prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB

je .L1_4_46
jmp .L1_4_42
ALIGN_4
.L1_4_46:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif

andq $ 7, %rax # if (k & 7)
je .L1_4_49

movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4

.L1_4_47:
KERNEL1x1_SUB
jl .L1_4_47
ALIGN_4
.L1_4_49:

SAVE1x1

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 1, KK
#endif

addq $ 2 * SIZE, CO1 # coffset += 2
ALIGN_4
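// Epilogue: restore the saved stack pointer and callee-saved registers
// (plus rdi/rsi and xmm6-xmm15 under the Windows ABI), release the stack
// frame, and return to the caller.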
.L999:
vzeroupper

movq SP, %rsp
movq (%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12
movq 24(%rsp), %r13
movq 32(%rsp), %r14
movq 40(%rsp), %r15

#ifdef WINDOWS_ABI
movq 48(%rsp), %rdi
movq 56(%rsp), %rsi
vmovups 64(%rsp), %xmm6
vmovups 80(%rsp), %xmm7
vmovups 96(%rsp), %xmm8
vmovups 112(%rsp), %xmm9
vmovups 128(%rsp), %xmm10
vmovups 144(%rsp), %xmm11
vmovups 160(%rsp), %xmm12
vmovups 176(%rsp), %xmm13
vmovups 192(%rsp), %xmm14
vmovups 208(%rsp), %xmm15
#endif

addq $ STACKSIZE, %rsp
ret

EPILOGUE