You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemm_kernel_2x4_nehalem.S 31 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define OLD_M %rdi /* incoming M; copied into M by the prologue (filled from ARG1 first under WINDOWS_ABI) */
  41. #define OLD_N %rsi /* incoming N; copied into N by the prologue (filled from ARG2 first under WINDOWS_ABI) */
  42. #define OLD_K %rdx /* incoming K; copied into K by the prologue (filled from ARG3 first under WINDOWS_ABI) */
  43. #define M %r13 /* M dimension: M >> 1 drives the two-wide loops, M & 1 selects the one-wide tails */
  44. #define N %r14 /* N dimension: N >> 2 passes of the .L01 loop, then the N & 2 case at .L30 */
  45. #define K %r15 /* K dimension: trip count of the inner product loops (unrolled by 4, remainder K & 3) */
  46. #define A %rcx /* base of the A operand; the prologue adds 32 * SIZE so loads use -32 * SIZE offsets */
  47. #define B %r8 /* base of the B operand, same 32 * SIZE bias; replaced by BO at the end of each .L01 pass */
  48. #define C %r9 /* base of the output; advanced by 4 * LDC at the end of each .L01 pass */
  49. #define LDC %rbp /* output stride, shifted left by ZBASE_SHIFT in the prologue (presumably elements -> bytes; confirm in common.h) */
  50. #define I %r11 /* counter for the M loops; %r11 also carries OLD_OFFSET briefly in the prologue when TRMMKERNEL is defined */
  51. #define AO %rdi /* running pointer into A; shares %rdi with OLD_M, which is copied to M before AO is first written */
  52. #define BO %rsi /* running pointer into B; shares %rsi with OLD_N, which is copied to N before BO is first written */
  53. #define CO1 %rbx /* first output pointer, starts at C */
  54. #define CO2 %rdx /* second output pointer (C + 2 * LDC in the .L01 loop, C + LDC at .L30); shares %rdx with OLD_K */
  55. #define BB %r12 /* prefetcht2 cursor, starts at B + (K << (ZBASE_SHIFT + 2)) and advances 16 * SIZE per .L11 pass */
  56. #define PREA %r10 /* not referenced in the visible part of this file */
  57. #ifndef WINDOWS_ABI /* frame layout used when WINDOWS_ABI is not defined */
  58. #define STACKSIZE 128 /* bytes the prologue reserves with subq $STACKSIZE, %rsp */
  59. #define OLD_LDC 8 + STACKSIZE(%rsp) /* caller-side stack slot, read after the frame is reserved (hence + STACKSIZE) */
  60. #define OLD_OFFSET 16 + STACKSIZE(%rsp) /* next caller-side slot; only read when TRMMKERNEL is defined */
  61. #define ALPHA_R 48(%rsp) /* locals start at 48: 0..40(%rsp) hold the saved rbx, rbp, r12-r15. Low float of xmm0 stored twice */
  62. #define ALPHA_I 56(%rsp) /* low float of xmm1 stored twice (unpcklps + movlps in the prologue) */
  63. #define J 64(%rsp) /* outer loop counter, initialised to N >> 2 */
  64. #define OFFSET 72(%rsp) /* TRMMKERNEL only: copy of OLD_OFFSET */
  65. #define KK 80(%rsp) /* TRMMKERNEL only: running offset, seeded from OFFSET (negated when LEFT is not defined) */
  66. #define KKK 88(%rsp) /* TRMMKERNEL only: per-tile trip count saved before the unrolled loop, reloaded for the k & 3 remainder */
  67. #else /* frame layout used when WINDOWS_ABI is defined */
  68. #define STACKSIZE 512 /* bytes the prologue reserves with subq $STACKSIZE, %rsp */
  69. #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) /* caller-side stack slot, loaded into xmm1 with movss */
  70. #define OLD_A 48 + STACKSIZE(%rsp) /* caller-side stack slot loaded into A */
  71. #define OLD_B 56 + STACKSIZE(%rsp) /* caller-side stack slot loaded into B */
  72. #define OLD_C 64 + STACKSIZE(%rsp) /* caller-side stack slot loaded into C */
  73. #define OLD_LDC 72 + STACKSIZE(%rsp) /* caller-side stack slot loaded into LDC */
  74. #define OLD_OFFSET 80 + STACKSIZE(%rsp) /* caller-side stack slot; only read when TRMMKERNEL is defined */
  75. #define ALPHA_R 224(%rsp) /* locals start at 224: 0..56(%rsp) hold saved GPRs (incl. rdi, rsi), 64..223 hold xmm6-xmm15 */
  76. #define ALPHA_I 232(%rsp) /* low float of xmm1 stored twice (unpcklps + movlps in the prologue) */
  77. #define J 240(%rsp) /* outer loop counter, initialised to N >> 2 */
  78. #define OFFSET 248(%rsp) /* TRMMKERNEL only: copy of OLD_OFFSET */
  79. #define KK 256(%rsp) /* TRMMKERNEL only: running offset, seeded from OFFSET (negated when LEFT is not defined) */
  80. #define KKK 264(%rsp) /* TRMMKERNEL only: per-tile trip count saved before the unrolled loop, reloaded for the k & 3 remainder */
  81. #endif
  82. #define PREFETCHSIZE 8 /* offset, in SIZE units, used as PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) in the unrolled loops */
  83. #define PREFETCH prefetcht0 /* instruction issued for that A-side prefetch */
  84. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) /* ADD1 / ADD2 are applied alternately to the partial products in every kernel */
  85. #define ADD1 addps
  86. #define ADD2 addps
  87. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) /* expands exactly like the first branch */
  88. #define ADD1 addps
  89. #define ADD2 addps
  90. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) /* expands exactly like the first branch */
  91. #define ADD1 addps
  92. #define ADD2 addps
  93. #else /* every remaining variant (presumably RR/RC/CR/CC - TODO confirm): ADD2 subtracts instead of adding */
  94. #define ADD1 addps
  95. #define ADD2 subps
  96. #endif
  97. PROLOGUE
  98. PROFCODE
  99. subq $STACKSIZE, %rsp
  100. movq %rbx, 0(%rsp)
  101. movq %rbp, 8(%rsp)
  102. movq %r12, 16(%rsp)
  103. movq %r13, 24(%rsp)
  104. movq %r14, 32(%rsp)
  105. movq %r15, 40(%rsp)
  106. #ifdef WINDOWS_ABI
  107. movq %rdi, 48(%rsp)
  108. movq %rsi, 56(%rsp)
  109. movups %xmm6, 64(%rsp)
  110. movups %xmm7, 80(%rsp)
  111. movups %xmm8, 96(%rsp)
  112. movups %xmm9, 112(%rsp)
  113. movups %xmm10, 128(%rsp)
  114. movups %xmm11, 144(%rsp)
  115. movups %xmm12, 160(%rsp)
  116. movups %xmm13, 176(%rsp)
  117. movups %xmm14, 192(%rsp)
  118. movups %xmm15, 208(%rsp)
  119. movq ARG1, OLD_M
  120. movq ARG2, OLD_N
  121. movq ARG3, OLD_K
  122. movq OLD_A, A
  123. movq OLD_B, B
  124. movq OLD_C, C
  125. movq OLD_LDC, LDC
  126. #ifdef TRMMKERNEL
  127. movq OLD_OFFSET, %r11
  128. #endif
  129. movaps %xmm3, %xmm0
  130. movss OLD_ALPHA_I, %xmm1
  131. #else
  132. movq OLD_LDC, LDC
  133. #ifdef TRMMKERNEL
  134. movq OLD_OFFSET, %r11
  135. #endif
  136. #endif
  137. unpcklps %xmm0, %xmm0
  138. unpcklps %xmm1, %xmm1
  139. movlps %xmm0, ALPHA_R
  140. movlps %xmm1, ALPHA_I
  141. subq $-32 * SIZE, A
  142. subq $-32 * SIZE, B
  143. movq OLD_M, M
  144. movq OLD_N, N
  145. movq OLD_K, K
  146. salq $ZBASE_SHIFT, LDC
  147. #ifdef TRMMKERNEL
  148. movq %r11, OFFSET
  149. #ifndef LEFT
  150. negq %r11
  151. #endif
  152. movq %r11, KK
  153. #endif
  154. movq N, J
  155. sarq $2, J
  156. NOBRANCH
  157. jle .L30
  158. ALIGN_4
  159. .L01:
  160. #if defined(TRMMKERNEL) && defined(LEFT)
  161. movq OFFSET, %rax
  162. movq %rax, KK
  163. #endif
  164. movq C, CO1
  165. leaq (C, LDC, 2), CO2
  166. movq A, AO
  167. movq K, %rax
  168. salq $ZBASE_SHIFT + 2, %rax
  169. leaq (B, %rax), BB
  170. movq M, I
  171. sarq $1, I
  172. NOBRANCH
  173. jle .L20
  174. ALIGN_4
  175. .L11:
  176. prefetcht2 -32 * SIZE(BB)
  177. subq $-16 * SIZE, BB
  178. #if !defined(TRMMKERNEL) || \
  179. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  180. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  181. movq B, BO
  182. #else
  183. movq B, BO
  184. movq KK, %rax
  185. salq $ZBASE_SHIFT, %rax
  186. leaq (AO, %rax, 2), AO
  187. leaq (BO, %rax, 4), BO
  188. #endif
  189. xorps %xmm1, %xmm1
  190. xorps %xmm2, %xmm2
  191. xorps %xmm3, %xmm3
  192. xorps %xmm4, %xmm4
  193. xorps %xmm8, %xmm8
  194. prefetcht0 1 * SIZE(CO1)
  195. xorps %xmm9, %xmm9
  196. prefetcht0 3 * SIZE(CO1, LDC, 1)
  197. xorps %xmm10, %xmm10
  198. xorps %xmm11, %xmm11
  199. movaps -32 * SIZE(AO), %xmm0
  200. xorps %xmm12, %xmm12
  201. prefetcht0 1 * SIZE(CO2)
  202. xorps %xmm13, %xmm13
  203. prefetcht0 3 * SIZE(CO2, LDC, 1)
  204. xorps %xmm14, %xmm14
  205. xorps %xmm15, %xmm15
  206. #ifndef TRMMKERNEL
  207. movq K, %rax
  208. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  209. movq K, %rax
  210. subq KK, %rax
  211. movq %rax, KKK
  212. #else
  213. movq KK, %rax
  214. #ifdef LEFT
  215. addq $2, %rax
  216. #else
  217. addq $4, %rax
  218. #endif
  219. movq %rax, KKK
  220. #endif
  221. sarq $2, %rax
  222. NOBRANCH
  223. jle .L15
  224. ALIGN_3
  225. .L12:
  226. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  227. ADD1 %xmm1, %xmm12
  228. movaps -32 * SIZE(BO), %xmm1
  229. ADD2 %xmm2, %xmm13
  230. pshufd $0xb1, %xmm1, %xmm2
  231. mulps %xmm0, %xmm1
  232. pshufd $0x1b, %xmm2, %xmm5
  233. mulps %xmm0, %xmm2
  234. ADD1 %xmm3, %xmm14
  235. ADD2 %xmm4, %xmm15
  236. pshufd $0xb1, %xmm5, %xmm6
  237. mulps %xmm0, %xmm5
  238. mulps %xmm0, %xmm6
  239. ADD1 %xmm1, %xmm8
  240. movaps -28 * SIZE(BO), %xmm1
  241. ADD2 %xmm2, %xmm9
  242. pshufd $0xb1, %xmm1, %xmm2
  243. mulps %xmm0, %xmm1
  244. pshufd $0x1b, %xmm2, %xmm3
  245. mulps %xmm0, %xmm2
  246. ADD1 %xmm5, %xmm10
  247. ADD2 %xmm6, %xmm11
  248. pshufd $0xb1, %xmm3, %xmm4
  249. movaps -28 * SIZE(AO), %xmm7
  250. mulps %xmm0, %xmm3
  251. mulps %xmm0, %xmm4
  252. ADD1 %xmm1, %xmm12
  253. movaps -24 * SIZE(BO), %xmm1
  254. ADD2 %xmm2, %xmm13
  255. pshufd $0xb1, %xmm1, %xmm2
  256. mulps %xmm7, %xmm1
  257. pshufd $0x1b, %xmm2, %xmm5
  258. mulps %xmm7, %xmm2
  259. ADD1 %xmm3, %xmm14
  260. ADD2 %xmm4, %xmm15
  261. pshufd $0xb1, %xmm5, %xmm6
  262. mulps %xmm7, %xmm5
  263. mulps %xmm7, %xmm6
  264. ADD1 %xmm1, %xmm8
  265. movaps -20 * SIZE(BO), %xmm1
  266. ADD2 %xmm2, %xmm9
  267. pshufd $0xb1, %xmm1, %xmm2
  268. mulps %xmm7, %xmm1
  269. pshufd $0x1b, %xmm2, %xmm3
  270. mulps %xmm7, %xmm2
  271. ADD1 %xmm5, %xmm10
  272. ADD2 %xmm6, %xmm11
  273. pshufd $0xb1, %xmm3, %xmm4
  274. movaps -24 * SIZE(AO), %xmm0
  275. mulps %xmm7, %xmm3
  276. mulps %xmm7, %xmm4
  277. ADD1 %xmm1, %xmm12
  278. movaps -16 * SIZE(BO), %xmm1
  279. ADD2 %xmm2, %xmm13
  280. pshufd $0xb1, %xmm1, %xmm2
  281. mulps %xmm0, %xmm1
  282. pshufd $0x1b, %xmm2, %xmm5
  283. mulps %xmm0, %xmm2
  284. ADD1 %xmm3, %xmm14
  285. ADD2 %xmm4, %xmm15
  286. pshufd $0xb1, %xmm5, %xmm6
  287. mulps %xmm0, %xmm5
  288. mulps %xmm0, %xmm6
  289. ADD1 %xmm1, %xmm8
  290. movaps -12 * SIZE(BO), %xmm1
  291. ADD2 %xmm2, %xmm9
  292. pshufd $0xb1, %xmm1, %xmm2
  293. mulps %xmm0, %xmm1
  294. pshufd $0x1b, %xmm2, %xmm3
  295. mulps %xmm0, %xmm2
  296. ADD1 %xmm5, %xmm10
  297. ADD2 %xmm6, %xmm11
  298. pshufd $0xb1, %xmm3, %xmm4
  299. movaps -20 * SIZE(AO), %xmm7
  300. mulps %xmm0, %xmm3
  301. mulps %xmm0, %xmm4
  302. ADD1 %xmm1, %xmm12
  303. movaps -8 * SIZE(BO), %xmm1
  304. ADD2 %xmm2, %xmm13
  305. pshufd $0xb1, %xmm1, %xmm2
  306. mulps %xmm7, %xmm1
  307. pshufd $0x1b, %xmm2, %xmm5
  308. mulps %xmm7, %xmm2
  309. ADD1 %xmm3, %xmm14
  310. ADD2 %xmm4, %xmm15
  311. pshufd $0xb1, %xmm5, %xmm6
  312. mulps %xmm7, %xmm5
  313. mulps %xmm7, %xmm6
  314. ADD1 %xmm1, %xmm8
  315. movaps -4 * SIZE(BO), %xmm1
  316. ADD2 %xmm2, %xmm9
  317. subq $-32 * SIZE, BO
  318. pshufd $0xb1, %xmm1, %xmm2
  319. mulps %xmm7, %xmm1
  320. pshufd $0x1b, %xmm2, %xmm3
  321. mulps %xmm7, %xmm2
  322. ADD1 %xmm5, %xmm10
  323. ADD2 %xmm6, %xmm11
  324. pshufd $0xb1, %xmm3, %xmm4
  325. mulps %xmm7, %xmm3
  326. movaps -16 * SIZE(AO), %xmm0
  327. mulps %xmm7, %xmm4
  328. subq $-16 * SIZE, AO
  329. subq $1, %rax
  330. BRANCH
  331. jg .L12
  332. ALIGN_3
  333. .L15:
  334. #ifndef TRMMKERNEL
  335. movq K, %rax
  336. #else
  337. movq KKK, %rax
  338. #endif
  339. andq $3, %rax # rax = k & 3: k-steps left over after the 4x-unrolled loop
  340. BRANCH
  341. je .L18 # no remainder: skip the single-step tail loop
  342. ALIGN_3
  343. .L16:
  344. ADD1 %xmm1, %xmm12
  345. movaps -32 * SIZE(BO), %xmm1
  346. ADD2 %xmm2, %xmm13
  347. pshufd $0xb1, %xmm1, %xmm2
  348. mulps %xmm0, %xmm1
  349. pshufd $0x1b, %xmm2, %xmm5
  350. mulps %xmm0, %xmm2
  351. ADD1 %xmm3, %xmm14
  352. ADD2 %xmm4, %xmm15
  353. pshufd $0xb1, %xmm5, %xmm6
  354. mulps %xmm0, %xmm5
  355. mulps %xmm0, %xmm6
  356. ADD1 %xmm1, %xmm8
  357. movaps -28 * SIZE(BO), %xmm1
  358. ADD2 %xmm2, %xmm9
  359. pshufd $0xb1, %xmm1, %xmm2
  360. mulps %xmm0, %xmm1
  361. pshufd $0x1b, %xmm2, %xmm3
  362. mulps %xmm0, %xmm2
  363. ADD1 %xmm5, %xmm10
  364. ADD2 %xmm6, %xmm11
  365. pshufd $0xb1, %xmm3, %xmm4
  366. mulps %xmm0, %xmm3
  367. mulps %xmm0, %xmm4
  368. movaps -28 * SIZE(AO), %xmm0
  369. addq $4 * SIZE, AO
  370. addq $8 * SIZE, BO
  371. subq $1, %rax
  372. BRANCH
  373. jg .L16
  374. ALIGN_3
  375. .L18:
  376. ADD1 %xmm1, %xmm12
  377. ADD2 %xmm2, %xmm13
  378. ADD1 %xmm3, %xmm14
  379. ADD2 %xmm4, %xmm15
  380. movddup ALPHA_R, %xmm2
  381. movddup ALPHA_I, %xmm3
  382. pcmpeqb %xmm0, %xmm0
  383. psllq $63, %xmm0
  384. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  385. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  386. pxor %xmm0, %xmm8
  387. pxor %xmm0, %xmm10
  388. pxor %xmm0, %xmm12
  389. pxor %xmm0, %xmm14
  390. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  391. pshufd $0xb1, %xmm0, %xmm0
  392. pxor %xmm0, %xmm9
  393. pxor %xmm0, %xmm11
  394. pxor %xmm0, %xmm13
  395. pxor %xmm0, %xmm15
  396. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  397. pxor %xmm0, %xmm9
  398. pxor %xmm0, %xmm11
  399. pxor %xmm0, %xmm13
  400. pxor %xmm0, %xmm15
  401. #endif
  402. haddps %xmm9, %xmm8
  403. haddps %xmm11, %xmm10
  404. haddps %xmm13, %xmm12
  405. haddps %xmm15, %xmm14
  406. shufps $0xd8, %xmm8, %xmm8
  407. shufps $0xd8, %xmm10, %xmm10
  408. shufps $0xd8, %xmm12, %xmm12
  409. shufps $0xd8, %xmm14, %xmm14
  410. movaps %xmm8, %xmm9
  411. shufps $0xe4, %xmm10, %xmm8
  412. shufps $0xe4, %xmm9, %xmm10
  413. movaps %xmm12, %xmm13
  414. shufps $0xe4, %xmm14, %xmm12
  415. shufps $0xe4, %xmm13, %xmm14
  416. pshufd $0xb1, %xmm8, %xmm9
  417. pshufd $0xb1, %xmm10, %xmm11
  418. pshufd $0xb1, %xmm12, %xmm13
  419. pshufd $0xb1, %xmm14, %xmm15
  420. mulps %xmm2, %xmm8
  421. mulps %xmm3, %xmm9
  422. mulps %xmm2, %xmm12
  423. mulps %xmm3, %xmm13
  424. mulps %xmm2, %xmm10
  425. mulps %xmm3, %xmm11
  426. mulps %xmm2, %xmm14
  427. mulps %xmm3, %xmm15
  428. addsubps %xmm9, %xmm8
  429. addsubps %xmm11, %xmm10
  430. addsubps %xmm13, %xmm12
  431. addsubps %xmm15, %xmm14
  432. #ifndef TRMMKERNEL
  433. movups 0 * SIZE(CO1), %xmm0
  434. movups 0 * SIZE(CO1, LDC), %xmm1
  435. movups 0 * SIZE(CO2), %xmm2
  436. movups 0 * SIZE(CO2, LDC), %xmm3
  437. addps %xmm0, %xmm8
  438. addps %xmm1, %xmm10
  439. addps %xmm2, %xmm12
  440. addps %xmm3, %xmm14
  441. #endif
  442. movups %xmm8, 0 * SIZE(CO1)
  443. movups %xmm10, 0 * SIZE(CO1, LDC)
  444. movups %xmm12, 0 * SIZE(CO2)
  445. movups %xmm14, 0 * SIZE(CO2, LDC)
  446. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  447. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  448. movq K, %rax
  449. subq KKK, %rax
  450. salq $ZBASE_SHIFT, %rax
  451. leaq (AO, %rax, 2), AO
  452. leaq (BO, %rax, 4), BO
  453. #endif
  454. #if defined(TRMMKERNEL) && defined(LEFT)
  455. addq $2, KK
  456. #endif
  457. addq $4 * SIZE, CO1
  458. addq $4 * SIZE, CO2
  459. decq I # i --
  460. BRANCH
  461. jg .L11
  462. ALIGN_4
  463. .L20:
  464. testq $1, M
  465. BRANCH
  466. jle .L29
  467. #if !defined(TRMMKERNEL) || \
  468. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  469. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  470. movq B, BO
  471. #else
  472. movq B, BO
  473. movq KK, %rax
  474. salq $ZBASE_SHIFT, %rax
  475. leaq (AO, %rax, 1), AO
  476. leaq (BO, %rax, 4), BO
  477. #endif
  478. xorps %xmm1, %xmm1
  479. movddup -32 * SIZE(AO), %xmm0
  480. xorps %xmm2, %xmm2
  481. movaps -32 * SIZE(BO), %xmm5
  482. xorps %xmm3, %xmm3
  483. xorps %xmm4, %xmm4
  484. xorps %xmm8, %xmm8
  485. xorps %xmm9, %xmm9
  486. xorps %xmm10, %xmm10
  487. xorps %xmm11, %xmm11
  488. #ifndef TRMMKERNEL
  489. movq K, %rax
  490. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  491. movq K, %rax
  492. subq KK, %rax
  493. movq %rax, KKK
  494. #else
  495. movq KK, %rax
  496. #ifdef LEFT
  497. addq $1, %rax
  498. #else
  499. addq $4, %rax
  500. #endif
  501. movq %rax, KKK
  502. #endif
  503. sarq $2, %rax
  504. NOBRANCH
  505. jle .L25
  506. ALIGN_3
  507. .L22:
  508. ADD1 %xmm1, %xmm8
  509. pshufd $0xa0, %xmm5, %xmm1
  510. mulps %xmm0, %xmm1
  511. ADD2 %xmm2, %xmm9
  512. pshufd $0xf5, %xmm5, %xmm2
  513. movaps -28 * SIZE(BO), %xmm5
  514. mulps %xmm0, %xmm2
  515. ADD1 %xmm3, %xmm10
  516. pshufd $0xa0, %xmm5, %xmm3
  517. mulps %xmm0, %xmm3
  518. ADD2 %xmm4, %xmm11
  519. pshufd $0xf5, %xmm5, %xmm4
  520. movaps -24 * SIZE(BO), %xmm5
  521. mulps %xmm0, %xmm4
  522. movddup -30 * SIZE(AO), %xmm0
  523. ADD1 %xmm1, %xmm8
  524. pshufd $0xa0, %xmm5, %xmm1
  525. mulps %xmm0, %xmm1
  526. ADD2 %xmm2, %xmm9
  527. pshufd $0xf5, %xmm5, %xmm2
  528. movaps -20 * SIZE(BO), %xmm5
  529. mulps %xmm0, %xmm2
  530. ADD1 %xmm3, %xmm10
  531. pshufd $0xa0, %xmm5, %xmm3
  532. mulps %xmm0, %xmm3
  533. ADD2 %xmm4, %xmm11
  534. pshufd $0xf5, %xmm5, %xmm4
  535. movaps -16 * SIZE(BO), %xmm5
  536. mulps %xmm0, %xmm4
  537. movddup -28 * SIZE(AO), %xmm0
  538. ADD1 %xmm1, %xmm8
  539. pshufd $0xa0, %xmm5, %xmm1
  540. mulps %xmm0, %xmm1
  541. ADD2 %xmm2, %xmm9
  542. pshufd $0xf5, %xmm5, %xmm2
  543. movaps -12 * SIZE(BO), %xmm5
  544. mulps %xmm0, %xmm2
  545. ADD1 %xmm3, %xmm10
  546. pshufd $0xa0, %xmm5, %xmm3
  547. mulps %xmm0, %xmm3
  548. ADD2 %xmm4, %xmm11
  549. pshufd $0xf5, %xmm5, %xmm4
  550. movaps -8 * SIZE(BO), %xmm5
  551. mulps %xmm0, %xmm4
  552. movddup -26 * SIZE(AO), %xmm0
  553. ADD1 %xmm1, %xmm8
  554. pshufd $0xa0, %xmm5, %xmm1
  555. mulps %xmm0, %xmm1
  556. ADD2 %xmm2, %xmm9
  557. pshufd $0xf5, %xmm5, %xmm2
  558. movaps -4 * SIZE(BO), %xmm5
  559. mulps %xmm0, %xmm2
  560. ADD1 %xmm3, %xmm10
  561. pshufd $0xa0, %xmm5, %xmm3
  562. mulps %xmm0, %xmm3
  563. ADD2 %xmm4, %xmm11
  564. pshufd $0xf5, %xmm5, %xmm4
  565. movaps 0 * SIZE(BO), %xmm5
  566. mulps %xmm0, %xmm4
  567. movddup -24 * SIZE(AO), %xmm0
  568. subq $-32 * SIZE, BO
  569. subq $ -8 * SIZE, AO
  570. subq $1, %rax
  571. BRANCH
  572. jg .L22
  573. ALIGN_3
  574. .L25:
  575. #ifndef TRMMKERNEL
  576. movq K, %rax
  577. #else
  578. movq KKK, %rax
  579. #endif
  580. andq $3, %rax # rax = k & 3: k-steps left over after the 4x-unrolled loop
  581. BRANCH
  582. je .L28 # no remainder: skip the single-step tail loop
  583. ALIGN_3
  584. .L26:
  585. ADD1 %xmm1, %xmm8
  586. pshufd $0xa0, %xmm5, %xmm1
  587. mulps %xmm0, %xmm1
  588. ADD2 %xmm2, %xmm9
  589. pshufd $0xf5, %xmm5, %xmm2
  590. movaps -28 * SIZE(BO), %xmm5
  591. mulps %xmm0, %xmm2
  592. ADD1 %xmm3, %xmm10
  593. pshufd $0xa0, %xmm5, %xmm3
  594. mulps %xmm0, %xmm3
  595. ADD2 %xmm4, %xmm11
  596. pshufd $0xf5, %xmm5, %xmm4
  597. movaps -24 * SIZE(BO), %xmm5
  598. mulps %xmm0, %xmm4
  599. movddup -30 * SIZE(AO), %xmm0
  600. addq $2 * SIZE, AO
  601. addq $8 * SIZE, BO
  602. subq $1, %rax
  603. BRANCH
  604. jg .L26
  605. ALIGN_3
  606. .L28:
  607. ADD1 %xmm1, %xmm8
  608. ADD2 %xmm2, %xmm9
  609. ADD1 %xmm3, %xmm10
  610. ADD2 %xmm4, %xmm11
  611. pcmpeqb %xmm0, %xmm0
  612. psllq $63, %xmm0
  613. movddup ALPHA_R, %xmm2
  614. movddup ALPHA_I, %xmm3
  615. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  616. pxor %xmm0, %xmm9
  617. pxor %xmm0, %xmm11
  618. shufps $0xb1, %xmm9, %xmm9
  619. shufps $0xb1, %xmm11, %xmm11
  620. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  621. shufps $0xb1, %xmm9, %xmm9
  622. shufps $0xb1, %xmm11, %xmm11
  623. pxor %xmm0, %xmm9
  624. pxor %xmm0, %xmm11
  625. #else
  626. pxor %xmm0, %xmm8
  627. pxor %xmm0, %xmm10
  628. shufps $0xb1, %xmm9, %xmm9
  629. shufps $0xb1, %xmm11, %xmm11
  630. #endif
  631. addps %xmm9, %xmm8
  632. addps %xmm11, %xmm10
  633. pshufd $0xb1, %xmm8, %xmm9
  634. pshufd $0xb1, %xmm10, %xmm11
  635. mulps %xmm2, %xmm8
  636. mulps %xmm3, %xmm9
  637. mulps %xmm2, %xmm10
  638. mulps %xmm3, %xmm11
  639. addsubps %xmm9, %xmm8
  640. addsubps %xmm11, %xmm10
  641. #ifndef TRMMKERNEL
  642. movsd (CO1), %xmm0
  643. movhps (CO1, LDC), %xmm0
  644. movsd (CO2), %xmm1
  645. movhps (CO2, LDC), %xmm1
  646. addps %xmm0, %xmm8
  647. addps %xmm1, %xmm10
  648. #endif
  649. movsd %xmm8, (CO1)
  650. movhps %xmm8, (CO1, LDC)
  651. movsd %xmm10, (CO2)
  652. movhps %xmm10, (CO2, LDC)
  653. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  654. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  655. movq K, %rax
  656. subq KKK, %rax
  657. salq $ZBASE_SHIFT, %rax
  658. leaq (AO, %rax, 1), AO
  659. leaq (BO, %rax, 4), BO
  660. #endif
  661. #if defined(TRMMKERNEL) && defined(LEFT)
  662. addq $1, KK
  663. #endif
  664. ALIGN_4
  665. .L29:
  666. #if defined(TRMMKERNEL) && !defined(LEFT)
  667. addq $4, KK
  668. #endif
  669. leaq (C, LDC, 4), C
  670. movq BO, B
  671. subq $1, J
  672. BRANCH
  673. jg .L01
  674. ALIGN_4
  675. .L30:
  676. testq $2, N
  677. BRANCH
  678. jle .L50
  679. #if defined(TRMMKERNEL) && defined(LEFT)
  680. movq OFFSET, %rax
  681. movq %rax, KK
  682. #endif
  683. movq C, CO1
  684. leaq (C, LDC), CO2
  685. movq A, AO
  686. movq M, I
  687. sarq $1, I
  688. NOBRANCH
  689. jle .L40
  690. ALIGN_4
  691. .L31:
  692. #if !defined(TRMMKERNEL) || \
  693. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  694. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  695. movq B, BO
  696. #else
  697. movq B, BO
  698. movq KK, %rax
  699. salq $ZBASE_SHIFT, %rax
  700. leaq (AO, %rax, 2), AO
  701. leaq (BO, %rax, 2), BO
  702. #endif
  703. xorps %xmm1, %xmm1
  704. movaps -32 * SIZE(AO), %xmm0
  705. xorps %xmm2, %xmm2
  706. xorps %xmm3, %xmm3
  707. xorps %xmm4, %xmm4
  708. xorps %xmm8, %xmm8
  709. prefetcht2 4 * SIZE(CO1)
  710. xorps %xmm9, %xmm9
  711. prefetcht2 4 * SIZE(CO2)
  712. xorps %xmm10, %xmm10
  713. xorps %xmm11, %xmm11
  714. #ifndef TRMMKERNEL
  715. movq K, %rax
  716. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  717. movq K, %rax
  718. subq KK, %rax
  719. movq %rax, KKK
  720. #else
  721. movq KK, %rax
  722. #ifdef LEFT
  723. addq $2, %rax
  724. #else
  725. addq $2, %rax
  726. #endif
  727. movq %rax, KKK
  728. #endif
  729. sarq $2, %rax
  730. NOBRANCH
  731. jle .L35
  732. ALIGN_3
  733. .L32:
  734. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  735. ADD1 %xmm1, %xmm8
  736. movaps -32 * SIZE(BO), %xmm1
  737. ADD2 %xmm2, %xmm9
  738. pshufd $0xb1, %xmm1, %xmm2
  739. mulps %xmm0, %xmm1
  740. ADD1 %xmm3, %xmm10
  741. pshufd $0x1b, %xmm2, %xmm3
  742. mulps %xmm0, %xmm2
  743. ADD2 %xmm4, %xmm11
  744. pshufd $0xb1, %xmm3, %xmm4
  745. mulps %xmm0, %xmm3
  746. mulps %xmm0, %xmm4
  747. movaps -28 * SIZE(AO), %xmm0
  748. ADD1 %xmm1, %xmm8
  749. movaps -28 * SIZE(BO), %xmm1
  750. ADD2 %xmm2, %xmm9
  751. pshufd $0xb1, %xmm1, %xmm2
  752. mulps %xmm0, %xmm1
  753. ADD1 %xmm3, %xmm10
  754. pshufd $0x1b, %xmm2, %xmm3
  755. mulps %xmm0, %xmm2
  756. ADD2 %xmm4, %xmm11
  757. pshufd $0xb1, %xmm3, %xmm4
  758. mulps %xmm0, %xmm3
  759. mulps %xmm0, %xmm4
  760. movaps -24 * SIZE(AO), %xmm0
  761. ADD1 %xmm1, %xmm8
  762. movaps -24 * SIZE(BO), %xmm1
  763. ADD2 %xmm2, %xmm9
  764. pshufd $0xb1, %xmm1, %xmm2
  765. mulps %xmm0, %xmm1
  766. ADD1 %xmm3, %xmm10
  767. pshufd $0x1b, %xmm2, %xmm3
  768. mulps %xmm0, %xmm2
  769. ADD2 %xmm4, %xmm11
  770. pshufd $0xb1, %xmm3, %xmm4
  771. mulps %xmm0, %xmm3
  772. mulps %xmm0, %xmm4
  773. movaps -20 * SIZE(AO), %xmm0
  774. ADD1 %xmm1, %xmm8
  775. movaps -20 * SIZE(BO), %xmm1
  776. ADD2 %xmm2, %xmm9
  777. pshufd $0xb1, %xmm1, %xmm2
  778. mulps %xmm0, %xmm1
  779. ADD1 %xmm3, %xmm10
  780. pshufd $0x1b, %xmm2, %xmm3
  781. mulps %xmm0, %xmm2
  782. ADD2 %xmm4, %xmm11
  783. pshufd $0xb1, %xmm3, %xmm4
  784. mulps %xmm0, %xmm3
  785. mulps %xmm0, %xmm4
  786. movaps -16 * SIZE(AO), %xmm0
  787. subq $-16 * SIZE, BO
  788. subq $-16 * SIZE, AO
  789. subq $1, %rax
  790. BRANCH
  791. jg .L32
  792. ALIGN_3
  793. .L35:
  794. #ifndef TRMMKERNEL
  795. movq K, %rax
  796. #else
  797. movq KKK, %rax
  798. #endif
  799. andq $3, %rax # rax = k & 3: k-steps left over after the 4x-unrolled loop
  800. BRANCH
  801. je .L38 # no remainder: skip the single-step tail loop
  802. ALIGN_3
  803. .L36:
  804. ADD1 %xmm1, %xmm8
  805. movaps -32 * SIZE(BO), %xmm1
  806. ADD2 %xmm2, %xmm9
  807. pshufd $0xb1, %xmm1, %xmm2
  808. mulps %xmm0, %xmm1
  809. ADD1 %xmm3, %xmm10
  810. pshufd $0x1b, %xmm2, %xmm3
  811. mulps %xmm0, %xmm2
  812. ADD2 %xmm4, %xmm11
  813. pshufd $0xb1, %xmm3, %xmm4
  814. mulps %xmm0, %xmm3
  815. mulps %xmm0, %xmm4
  816. movaps -28 * SIZE(AO), %xmm0
  817. addq $4 * SIZE, AO
  818. addq $4 * SIZE, BO
  819. subq $1, %rax
  820. BRANCH
  821. jg .L36
  822. ALIGN_3
  823. .L38:
  824. ADD1 %xmm1, %xmm8
  825. ADD2 %xmm2, %xmm9
  826. ADD1 %xmm3, %xmm10
  827. ADD2 %xmm4, %xmm11
  828. pcmpeqb %xmm0, %xmm0
  829. psllq $63, %xmm0
  830. movddup ALPHA_R, %xmm2
  831. movddup ALPHA_I, %xmm3
  832. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  833. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  834. pxor %xmm0, %xmm8
  835. pxor %xmm0, %xmm10
  836. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  837. pshufd $0xb1, %xmm0, %xmm0
  838. pxor %xmm0, %xmm9
  839. pxor %xmm0, %xmm11
  840. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  841. pxor %xmm0, %xmm9
  842. pxor %xmm0, %xmm11
  843. #endif
  844. haddps %xmm9, %xmm8
  845. haddps %xmm11, %xmm10
  846. shufps $0xd8, %xmm8, %xmm8
  847. shufps $0xd8, %xmm10, %xmm10
  848. movaps %xmm8, %xmm9
  849. shufps $0xe4, %xmm10, %xmm8
  850. shufps $0xe4, %xmm9, %xmm10
  851. pshufd $0xb1, %xmm8, %xmm9
  852. pshufd $0xb1, %xmm10, %xmm11
  853. mulps %xmm2, %xmm8
  854. mulps %xmm3, %xmm9
  855. mulps %xmm2, %xmm10
  856. mulps %xmm3, %xmm11
  857. addsubps %xmm9, %xmm8
  858. addsubps %xmm11, %xmm10
  859. #ifndef TRMMKERNEL
  860. movsd 0 * SIZE(CO1), %xmm0
  861. movhps 2 * SIZE(CO1), %xmm0
  862. movsd 0 * SIZE(CO2), %xmm1
  863. movhps 2 * SIZE(CO2), %xmm1
  864. addps %xmm0, %xmm8
  865. addps %xmm1, %xmm10
  866. #endif
  867. movsd %xmm8, 0 * SIZE(CO1)
  868. movhps %xmm8, 2 * SIZE(CO1)
  869. movsd %xmm10, 0 * SIZE(CO2)
  870. movhps %xmm10, 2 * SIZE(CO2)
  871. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  872. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  873. movq K, %rax
  874. subq KKK, %rax
  875. salq $ZBASE_SHIFT, %rax
  876. leaq (AO, %rax, 2), AO
  877. leaq (BO, %rax, 2), BO
  878. #endif
  879. #if defined(TRMMKERNEL) && defined(LEFT)
  880. addq $2, KK
  881. #endif
  882. addq $4 * SIZE, CO1
  883. addq $4 * SIZE, CO2
  884. decq I # i --
  885. BRANCH
  886. jg .L31
  887. ALIGN_4
  /*
   * .L40 - set-up for one leftover row of A (bit 0 of M set) against two
   * columns of B.  Clears the accumulators, preloads the first A and B
   * operands, and leaves in rax the number of 4-step groups for the
   * loop at .L42.
   * SIZE, ZBASE_SHIFT, AO, BO, B, M, K, KK, KKK, BRANCH, NOBRANCH and
   * ALIGN_3 are defined outside this chunk.
   */
  888. .L40:
  889. testq $1, M
  890. BRANCH
  891. jle .L49 /* taken when M & 1 == 0: no leftover row */
  892. #if !defined(TRMMKERNEL) || \
  893. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  894. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  895. movq B, BO
  896. #else
  /* Remaining TRMM variants: start (KK << ZBASE_SHIFT) bytes into A and
     twice that many bytes into B. */
  897. movq B, BO
  898. movq KK, %rax
  899. salq $ZBASE_SHIFT, %rax
  900. leaq (AO, %rax, 1), AO
  901. leaq (BO, %rax, 2), BO
  902. #endif
  /* xmm0 = 8 bytes of A copied to both halves, xmm5 = 16 bytes of B
     (aligned load); xmm1-xmm4 and xmm8-xmm11 are zeroed. */
  903. xorps %xmm1, %xmm1
  904. movddup -32 * SIZE(AO), %xmm0
  905. xorps %xmm2, %xmm2
  906. movaps -32 * SIZE(BO), %xmm5
  907. xorps %xmm3, %xmm3
  908. xorps %xmm4, %xmm4
  909. xorps %xmm8, %xmm8
  910. xorps %xmm9, %xmm9
  911. xorps %xmm10, %xmm10
  912. xorps %xmm11, %xmm11
  /* rax = k-loop length: K without TRMMKERNEL; otherwise K - KK, or
     KK + 1 (LEFT) / KK + 2 (not LEFT).  The TRMM variants also store the
     length in KKK. */
  913. #ifndef TRMMKERNEL
  914. movq K, %rax
  915. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  916. movq K, %rax
  917. subq KK, %rax
  918. movq %rax, KKK
  919. #else
  920. movq KK, %rax
  921. #ifdef LEFT
  922. addq $1, %rax
  923. #else
  924. addq $2, %rax
  925. #endif
  926. movq %rax, KKK
  927. #endif
  928. sarq $2, %rax /* rax = length / 4 (arithmetic shift) */
  929. NOBRANCH
  930. jle .L45 /* no complete group of four: go to the leftover steps */
  931. ALIGN_3
  /*
   * .L42 - k loop, unrolled four times; rax counts the iterations.
   * Each step first folds the products left by the previous step into
   * xmm8 / xmm9 through ADD1 / ADD2 (macros defined outside this chunk;
   * presumably add/subtract variants chosen by the conjugation mode -
   * TODO confirm), then forms two new products:
   *   xmm1 = xmm0 * lanes {0,0,2,2} of xmm5   (pshufd $0xa0)
   *   xmm2 = xmm0 * lanes {1,1,3,3} of xmm5   (pshufd $0xf5)
   * and reloads xmm5 (16 bytes of B) and xmm0 (8 bytes of A, copied to
   * both halves) for the next step.  The last pair of products is still
   * pending in xmm1 / xmm2 when the loop exits.
   */
  932. .L42:
  /* step 1 */
  933. ADD1 %xmm1, %xmm8
  934. pshufd $0xa0, %xmm5, %xmm1
  935. mulps %xmm0, %xmm1
  936. ADD2 %xmm2, %xmm9
  937. pshufd $0xf5, %xmm5, %xmm2
  938. movaps -28 * SIZE(BO), %xmm5
  939. mulps %xmm0, %xmm2
  940. movddup -30 * SIZE(AO), %xmm0
  /* step 2 */
  941. ADD1 %xmm1, %xmm8
  942. pshufd $0xa0, %xmm5, %xmm1
  943. mulps %xmm0, %xmm1
  944. ADD2 %xmm2, %xmm9
  945. pshufd $0xf5, %xmm5, %xmm2
  946. movaps -24 * SIZE(BO), %xmm5
  947. mulps %xmm0, %xmm2
  948. movddup -28 * SIZE(AO), %xmm0
  /* step 3 */
  949. ADD1 %xmm1, %xmm8
  950. pshufd $0xa0, %xmm5, %xmm1
  951. mulps %xmm0, %xmm1
  952. ADD2 %xmm2, %xmm9
  953. pshufd $0xf5, %xmm5, %xmm2
  954. movaps -20 * SIZE(BO), %xmm5
  955. mulps %xmm0, %xmm2
  956. movddup -26 * SIZE(AO), %xmm0
  /* step 4 */
  957. ADD1 %xmm1, %xmm8
  958. pshufd $0xa0, %xmm5, %xmm1
  959. mulps %xmm0, %xmm1
  960. ADD2 %xmm2, %xmm9
  961. pshufd $0xf5, %xmm5, %xmm2
  962. movaps -16 * SIZE(BO), %xmm5
  963. mulps %xmm0, %xmm2
  964. movddup -24 * SIZE(AO), %xmm0
  /* Subtracting a negative immediate advances the pointers:
     BO += 16 * SIZE, AO += 8 * SIZE. */
  965. subq $-16 * SIZE, BO
  966. subq $ -8 * SIZE, AO
  967. subq $1, %rax
  968. BRANCH
  969. jg .L42
  970. ALIGN_3
  /*
   * .L45 / .L46 - the k steps left over after the unrolled loop.
   * rax = (K, or KKK for TRMMKERNEL) & 3; when that is zero control goes
   * straight to .L48.  .L46 performs one step per iteration with the same
   * accumulate / shuffle / multiply pattern as the unrolled loop, moving
   * AO by 2 * SIZE and BO by 4 * SIZE.
   */
  971. .L45:
  972. #ifndef TRMMKERNEL
  973. movq K, %rax
  974. #else
  975. movq KKK, %rax
  976. #endif
  977. andq $3, %rax # rax = k & 3, the number of leftover k steps
  978. BRANCH
  979. je .L48
  980. ALIGN_3
  981. .L46:
  982. ADD1 %xmm1, %xmm8
  983. pshufd $0xa0, %xmm5, %xmm1
  984. mulps %xmm0, %xmm1
  985. ADD2 %xmm2, %xmm9
  986. pshufd $0xf5, %xmm5, %xmm2
  987. movaps -28 * SIZE(BO), %xmm5
  988. mulps %xmm0, %xmm2
  989. movddup -30 * SIZE(AO), %xmm0
  990. addq $2 * SIZE, AO
  991. addq $4 * SIZE, BO
  992. subq $1, %rax
  993. BRANCH
  994. jg .L46
  995. ALIGN_3
  /*
   * .L48 - finish the tile started at .L40 and write it out.
   *  1. Fold the last pending products into xmm8 / xmm9.
   *  2. Build a mask in xmm0 with only bit 63 of each 64-bit half set,
   *     i.e. the sign bit of 32-bit lanes 1 and 3.
   *  3. Depending on the NN..CC variant macro, flip those signs in xmm9
   *     or xmm8 and swap adjacent lanes of xmm9 (shuffle $0xb1), then add
   *     xmm9 into xmm8.
   *  4. Scale: xmm8 = xmm8 * xmm2 -/+ swapped(xmm8) * xmm3, where addsubps
   *     subtracts in even lanes and adds in odd lanes.  Presumably this is
   *     the complex multiplication by alpha; it depends on how ALPHA_R and
   *     ALPHA_I were stored, which is outside this chunk - TODO confirm.
   *  5. Without TRMMKERNEL, add the existing 8 bytes at (CO1) and (CO2);
   *     then store the low half to (CO1) and the high half to (CO2).
   */
  996. .L48:
  997. ADD1 %xmm1, %xmm8
  998. ADD2 %xmm2, %xmm9
  999. pcmpeqb %xmm0, %xmm0 /* xmm0 = all ones */
  1000. psllq $63, %xmm0 /* keep only bit 63 of each 64-bit half */
  /* xmm2 / xmm3 = the 64 bits at ALPHA_R / ALPHA_I copied to both halves */
  1001. movddup ALPHA_R, %xmm2
  1002. movddup ALPHA_I, %xmm3
  1003. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  /* negate lanes 1 and 3 of xmm9, then swap adjacent lanes */
  1004. pxor %xmm0, %xmm9
  1005. shufps $0xb1, %xmm9, %xmm9
  1006. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  /* swap adjacent lanes of xmm9, then negate lanes 1 and 3 */
  1007. shufps $0xb1, %xmm9, %xmm9
  1008. pxor %xmm0, %xmm9
  1009. #else
  /* negate lanes 1 and 3 of xmm8; swap adjacent lanes of xmm9 */
  1010. pxor %xmm0, %xmm8
  1011. shufps $0xb1, %xmm9, %xmm9
  1012. #endif
  1013. addps %xmm9, %xmm8
  1014. pshufd $0xb1, %xmm8, %xmm9 /* xmm9 = xmm8 with adjacent lanes swapped */
  1015. mulps %xmm2, %xmm8
  1016. mulps %xmm3, %xmm9
  1017. addsubps %xmm9, %xmm8
  1018. #ifndef TRMMKERNEL
  1019. movsd (CO1), %xmm0
  1020. movhps (CO2), %xmm0
  1021. addps %xmm0, %xmm8
  1022. #endif
  1023. movsd %xmm8, (CO1)
  1024. movhps %xmm8, (CO2)
  1025. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1026. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  /* Step over the part of the operands the shortened k loop did not read:
     rax = (K - KKK) << ZBASE_SHIFT; AO += rax, BO += 2 * rax. */
  1027. movq K, %rax
  1028. subq KKK, %rax
  1029. salq $ZBASE_SHIFT, %rax
  1030. leaq (AO, %rax, 1), AO
  1031. leaq (BO, %rax, 2), BO
  1032. #endif
  1033. #if defined(TRMMKERNEL) && defined(LEFT)
  1034. addq $1, KK
  1035. #endif
  1036. ALIGN_4
  /*
   * .L49 - bookkeeping once the rows above are done: KK grows by 2 for
   * TRMMKERNEL without LEFT, C moves forward by 2 * LDC, and B takes over
   * the current value of BO.
   */
  1037. .L49:
  1038. #if defined(TRMMKERNEL) && !defined(LEFT)
  1039. addq $2, KK
  1040. #endif
  1041. leaq (C, LDC, 2), C /* C = C + 2 * LDC */
  1042. movq BO, B
  1043. ALIGN_4
  /*
   * .L50 - entered for the case where bit 0 of N is set; otherwise control
   * jumps to the exit at .L999.  Reloads KK from OFFSET for TRMMKERNEL with
   * LEFT, points CO1 at C and AO at A, and sets I = M >> 1, the trip count
   * of the loop that starts at .L51.  When I is not positive the loop is
   * skipped and control goes to .L60.
   */
  1044. .L50:
  1045. testq $1, N
  1046. BRANCH
  1047. jle .L999 /* taken when N & 1 == 0 */
  1048. #if defined(TRMMKERNEL) && defined(LEFT)
  1049. movq OFFSET, %rax
  1050. movq %rax, KK
  1051. #endif
  1052. movq C, CO1
  1053. movq A, AO
  1054. movq M, I
  1055. sarq $1, I /* I = M / 2 (arithmetic shift) */
  1056. NOBRANCH
  1057. jle .L60
  1058. ALIGN_4
  /*
   * .L51 - top of the loop over I: set-up for a tile that reads 16 bytes
   * of A and 8 bytes of B per k step and writes 16 bytes at CO1.
   * Clears the accumulators, preloads xmm0 with the first 16 bytes of A,
   * prefetches the C destination, and leaves in rax the number of 4-step
   * groups for the loop at .L52.
   */
  1059. .L51:
  1060. #if !defined(TRMMKERNEL) || \
  1061. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1062. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1063. movq B, BO
  1064. #else
  /* Remaining TRMM variants: start 2 * (KK << ZBASE_SHIFT) bytes into A
     and (KK << ZBASE_SHIFT) bytes into B. */
  1065. movq B, BO
  1066. movq KK, %rax
  1067. salq $ZBASE_SHIFT, %rax
  1068. leaq (AO, %rax, 2), AO
  1069. leaq (BO, %rax, 1), BO
  1070. #endif
  1071. xorps %xmm1, %xmm1
  1072. movaps -32 * SIZE(AO), %xmm0
  1073. xorps %xmm2, %xmm2
  1074. xorps %xmm3, %xmm3
  1075. xorps %xmm4, %xmm4
  1076. xorps %xmm8, %xmm8
  1077. prefetcht2 4 * SIZE(CO1)
  1078. xorps %xmm9, %xmm9
  1079. xorps %xmm10, %xmm10
  1080. xorps %xmm11, %xmm11
  /* rax = k-loop length: K without TRMMKERNEL; otherwise K - KK, or
     KK + 2 (LEFT) / KK + 1 (not LEFT).  The TRMM variants also store the
     length in KKK. */
  1081. #ifndef TRMMKERNEL
  1082. movq K, %rax
  1083. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1084. movq K, %rax
  1085. subq KK, %rax
  1086. movq %rax, KKK
  1087. #else
  1088. movq KK, %rax
  1089. #ifdef LEFT
  1090. addq $2, %rax
  1091. #else
  1092. addq $1, %rax
  1093. #endif
  1094. movq %rax, KKK
  1095. #endif
  1096. sarq $2, %rax /* rax = length / 4 (arithmetic shift) */
  1097. NOBRANCH
  1098. jle .L55 /* no complete group of four: go to the leftover steps */
  1099. ALIGN_3
  /*
   * .L52 - k loop, unrolled four times; rax counts the iterations.
   * Each step first folds the products left by the previous step into
   * xmm8 / xmm9 through ADD1 / ADD2 (macros defined outside this chunk),
   * then loads 8 bytes of B copied to both halves of xmm1, makes a copy
   * with adjacent lanes swapped in xmm2 (pshufd $0xb1), multiplies both by
   * the 16 bytes of A held in xmm0, and reloads xmm0 for the next step.
   * The last pair of products is still pending in xmm1 / xmm2 when the
   * loop exits.
   */
  1100. .L52:
  1101. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  /* step 1 */
  1102. ADD1 %xmm1, %xmm8
  1103. movddup -32 * SIZE(BO), %xmm1
  1104. ADD2 %xmm2, %xmm9
  1105. pshufd $0xb1, %xmm1, %xmm2
  1106. mulps %xmm0, %xmm1
  1107. mulps %xmm0, %xmm2
  1108. movaps -28 * SIZE(AO), %xmm0
  /* step 2 */
  1109. ADD1 %xmm1, %xmm8
  1110. movddup -30 * SIZE(BO), %xmm1
  1111. ADD2 %xmm2, %xmm9
  1112. pshufd $0xb1, %xmm1, %xmm2
  1113. mulps %xmm0, %xmm1
  1114. mulps %xmm0, %xmm2
  1115. movaps -24 * SIZE(AO), %xmm0
  /* step 3 */
  1116. ADD1 %xmm1, %xmm8
  1117. movddup -28 * SIZE(BO), %xmm1
  1118. ADD2 %xmm2, %xmm9
  1119. pshufd $0xb1, %xmm1, %xmm2
  1120. mulps %xmm0, %xmm1
  1121. mulps %xmm0, %xmm2
  1122. movaps -20 * SIZE(AO), %xmm0
  /* step 4 */
  1123. ADD1 %xmm1, %xmm8
  1124. movddup -26 * SIZE(BO), %xmm1
  1125. ADD2 %xmm2, %xmm9
  1126. pshufd $0xb1, %xmm1, %xmm2
  1127. mulps %xmm0, %xmm1
  1128. mulps %xmm0, %xmm2
  1129. movaps -16 * SIZE(AO), %xmm0
  /* Subtracting a negative immediate advances the pointers:
     BO += 8 * SIZE, AO += 16 * SIZE. */
  1130. subq $ -8 * SIZE, BO
  1131. subq $-16 * SIZE, AO
  1132. subq $1, %rax
  1133. BRANCH
  1134. jg .L52
  1135. ALIGN_3
  /*
   * .L55 / .L56 - the k steps left over after the unrolled loop at .L52.
   * rax = (K, or KKK for TRMMKERNEL) & 3; when that is zero control goes
   * straight to .L58.  .L56 performs one step per iteration, moving AO by
   * 4 * SIZE and BO by 2 * SIZE.
   */
  1136. .L55:
  1137. #ifndef TRMMKERNEL
  1138. movq K, %rax
  1139. #else
  1140. movq KKK, %rax
  1141. #endif
  1142. andq $3, %rax # rax = k & 3, the number of leftover k steps
  1143. BRANCH
  1144. je .L58
  1145. ALIGN_3
  1146. .L56:
  1147. ADD1 %xmm1, %xmm8
  1148. movddup -32 * SIZE(BO), %xmm1
  1149. ADD2 %xmm2, %xmm9
  1150. pshufd $0xb1, %xmm1, %xmm2
  1151. mulps %xmm0, %xmm1
  1152. mulps %xmm0, %xmm2
  1153. movaps -28 * SIZE(AO), %xmm0
  1154. addq $4 * SIZE, AO
  1155. addq $2 * SIZE, BO
  1156. subq $1, %rax
  1157. BRANCH
  1158. jg .L56
  1159. ALIGN_3
  /*
   * .L58 - finish the tile started at .L51, write it out and loop.
   *  1. Fold the last pending products into xmm8 / xmm9.
   *  2. Build a mask in xmm0 with only bit 63 of each 64-bit half set,
   *     i.e. the sign bit of 32-bit lanes 1 and 3.
   *  3. Depending on the NN..CC variant macro, negate lanes 1 and 3 of
   *     xmm8, lanes 0 and 2 of xmm9, or lanes 1 and 3 of xmm9.
   *  4. haddps sums adjacent lane pairs of xmm8 (low half of the result)
   *     and of xmm9 (high half); shufps $0xd8 reorders the result lanes
   *     to {0, 2, 1, 3}, interleaving the two sets of sums.
   *  5. Scale: xmm8 = xmm8 * xmm2 -/+ swapped(xmm8) * xmm3, where addsubps
   *     subtracts in even lanes and adds in odd lanes.  Presumably this is
   *     the complex multiplication by alpha; it depends on how ALPHA_R and
   *     ALPHA_I were stored, which is outside this chunk - TODO confirm.
   *  6. Without TRMMKERNEL, add the existing 16 bytes at CO1; then store
   *     both halves back to CO1.
   */
  1160. .L58:
  1161. ADD1 %xmm1, %xmm8
  1162. ADD2 %xmm2, %xmm9
  1163. pcmpeqb %xmm0, %xmm0 /* xmm0 = all ones */
  1164. psllq $63, %xmm0 /* keep only bit 63 of each 64-bit half */
  /* xmm2 / xmm3 = the 64 bits at ALPHA_R / ALPHA_I copied to both halves */
  1165. movddup ALPHA_R, %xmm2
  1166. movddup ALPHA_I, %xmm3
  1167. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1168. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  /* negate lanes 1 and 3 of xmm8 */
  1169. pxor %xmm0, %xmm8
  1170. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  /* move the sign bits to lanes 0 and 2, then negate those lanes of xmm9 */
  1171. pshufd $0xb1, %xmm0, %xmm0
  1172. pxor %xmm0, %xmm9
  1173. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  /* negate lanes 1 and 3 of xmm9 */
  1174. pxor %xmm0, %xmm9
  1175. #endif
  1176. haddps %xmm9, %xmm8
  1177. shufps $0xd8, %xmm8, %xmm8
  1178. pshufd $0xb1, %xmm8, %xmm9 /* xmm9 = xmm8 with adjacent lanes swapped */
  1179. mulps %xmm2, %xmm8
  1180. mulps %xmm3, %xmm9
  1181. addsubps %xmm9, %xmm8
  1182. #ifndef TRMMKERNEL
  1183. movsd 0 * SIZE(CO1), %xmm0
  1184. movhps 2 * SIZE(CO1), %xmm0
  1185. addps %xmm0, %xmm8
  1186. #endif
  1187. movsd %xmm8, 0 * SIZE(CO1)
  1188. movhps %xmm8, 2 * SIZE(CO1)
  1189. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1190. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  /* Step over the part of the operands the shortened k loop did not read:
     rax = (K - KKK) << ZBASE_SHIFT; AO += 2 * rax, BO += rax. */
  1191. movq K, %rax
  1192. subq KKK, %rax
  1193. salq $ZBASE_SHIFT, %rax
  1194. leaq (AO, %rax, 2), AO
  1195. leaq (BO, %rax, 1), BO
  1196. #endif
  1197. #if defined(TRMMKERNEL) && defined(LEFT)
  1198. addq $2, KK
  1199. #endif
  1200. addq $4 * SIZE, CO1
  1201. decq I # i --
  1202. BRANCH
  1203. jg .L51 /* repeat while I > 0 */
  1204. ALIGN_4
  /*
   * .L60 - set-up for the last tile of the odd-N path: one leftover row
   * of A (bit 0 of M set), reading 8 bytes of A and 8 bytes of B per k
   * step.  When bit 0 of M is clear control jumps to the exit at .L999.
   * Clears the accumulators, preloads the first operands, and leaves in
   * rax the number of 4-step groups for the loop at .L62.
   */
  1205. .L60:
  1206. testq $1, M
  1207. BRANCH
  1208. jle .L999 /* taken when M & 1 == 0 */
  1209. #if !defined(TRMMKERNEL) || \
  1210. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1211. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1212. movq B, BO
  1213. #else
  /* Remaining TRMM variants: start (KK << ZBASE_SHIFT) bytes into both
     A and B. */
  1214. movq B, BO
  1215. movq KK, %rax
  1216. salq $ZBASE_SHIFT, %rax
  1217. leaq (AO, %rax, 1), AO
  1218. leaq (BO, %rax, 1), BO
  1219. #endif
  /* xmm0 = 8 bytes of A copied to both halves; xmm5 = 8 bytes of B in the
     low half (movsd from memory zeroes the high half). */
  1220. xorps %xmm1, %xmm1
  1221. movddup -32 * SIZE(AO), %xmm0
  1222. xorps %xmm2, %xmm2
  1223. movsd -32 * SIZE(BO), %xmm5
  1224. xorps %xmm8, %xmm8
  1225. xorps %xmm9, %xmm9
  /* rax = k-loop length: K without TRMMKERNEL; otherwise K - KK or KK + 1.
     Both branches of the inner #ifdef LEFT add 1 here.  The TRMM variants
     also store the length in KKK. */
  1226. #ifndef TRMMKERNEL
  1227. movq K, %rax
  1228. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1229. movq K, %rax
  1230. subq KK, %rax
  1231. movq %rax, KKK
  1232. #else
  1233. movq KK, %rax
  1234. #ifdef LEFT
  1235. addq $1, %rax
  1236. #else
  1237. addq $1, %rax
  1238. #endif
  1239. movq %rax, KKK
  1240. #endif
  1241. sarq $2, %rax /* rax = length / 4 (arithmetic shift) */
  1242. NOBRANCH
  1243. jle .L65 /* no complete group of four: go to the leftover steps */
  1244. ALIGN_3
  /*
   * .L62 - k loop, unrolled four times; rax counts the iterations.
   * Each step first folds the products left by the previous step into
   * xmm8 / xmm9 through ADD1 / ADD2 (macros defined outside this chunk),
   * then forms two new products:
   *   xmm1 = xmm0 * lanes {0,0,2,2} of xmm5   (pshufd $0xa0)
   *   xmm2 = xmm0 * lanes {1,1,3,3} of xmm5   (pshufd $0xf5)
   * and reloads xmm5 (8 bytes of B, high half zero) and xmm0 (8 bytes of
   * A, copied to both halves) for the next step.  The last pair of
   * products is still pending in xmm1 / xmm2 when the loop exits.
   */
  1245. .L62:
  /* step 1 */
  1246. ADD1 %xmm1, %xmm8
  1247. pshufd $0xa0, %xmm5, %xmm1
  1248. mulps %xmm0, %xmm1
  1249. ADD2 %xmm2, %xmm9
  1250. pshufd $0xf5, %xmm5, %xmm2
  1251. movsd -30 * SIZE(BO), %xmm5
  1252. mulps %xmm0, %xmm2
  1253. movddup -30 * SIZE(AO), %xmm0
  /* step 2 */
  1254. ADD1 %xmm1, %xmm8
  1255. pshufd $0xa0, %xmm5, %xmm1
  1256. mulps %xmm0, %xmm1
  1257. ADD2 %xmm2, %xmm9
  1258. pshufd $0xf5, %xmm5, %xmm2
  1259. movsd -28 * SIZE(BO), %xmm5
  1260. mulps %xmm0, %xmm2
  1261. movddup -28 * SIZE(AO), %xmm0
  /* step 3 */
  1262. ADD1 %xmm1, %xmm8
  1263. pshufd $0xa0, %xmm5, %xmm1
  1264. mulps %xmm0, %xmm1
  1265. ADD2 %xmm2, %xmm9
  1266. pshufd $0xf5, %xmm5, %xmm2
  1267. movsd -26 * SIZE(BO), %xmm5
  1268. mulps %xmm0, %xmm2
  1269. movddup -26 * SIZE(AO), %xmm0
  /* step 4 */
  1270. ADD1 %xmm1, %xmm8
  1271. pshufd $0xa0, %xmm5, %xmm1
  1272. mulps %xmm0, %xmm1
  1273. ADD2 %xmm2, %xmm9
  1274. pshufd $0xf5, %xmm5, %xmm2
  1275. movsd -24 * SIZE(BO), %xmm5
  1276. mulps %xmm0, %xmm2
  1277. movddup -24 * SIZE(AO), %xmm0
  /* Subtracting a negative immediate advances the pointers:
     BO += 8 * SIZE, AO += 8 * SIZE. */
  1278. subq $-8 * SIZE, BO
  1279. subq $-8 * SIZE, AO
  1280. subq $1, %rax
  1281. BRANCH
  1282. jg .L62
  1283. ALIGN_3
  /*
   * .L65 / .L66 - the k steps left over after the unrolled loop at .L62.
   * rax = (K, or KKK for TRMMKERNEL) & 3; when that is zero control goes
   * straight to .L68.  .L66 performs one step per iteration, moving both
   * AO and BO by 2 * SIZE.
   */
  1284. .L65:
  1285. #ifndef TRMMKERNEL
  1286. movq K, %rax
  1287. #else
  1288. movq KKK, %rax
  1289. #endif
  1290. andq $3, %rax # rax = k & 3, the number of leftover k steps
  1291. BRANCH
  1292. je .L68
  1293. ALIGN_3
  1294. .L66:
  1295. ADD1 %xmm1, %xmm8
  1296. pshufd $0xa0, %xmm5, %xmm1
  1297. mulps %xmm0, %xmm1
  1298. ADD2 %xmm2, %xmm9
  1299. pshufd $0xf5, %xmm5, %xmm2
  1300. movsd -30 * SIZE(BO), %xmm5
  1301. mulps %xmm0, %xmm2
  1302. movddup -30 * SIZE(AO), %xmm0
  1303. addq $2 * SIZE, AO
  1304. addq $2 * SIZE, BO
  1305. subq $1, %rax
  1306. BRANCH
  1307. jg .L66
  1308. ALIGN_3
  /*
   * .L68 - finish the tile started at .L60 and write it out.
   * Folds the last pending products into xmm8 / xmm9, builds a mask with
   * the sign bit of 32-bit lanes 1 and 3, applies the sign flip and
   * adjacent-lane swap selected by the NN..CC variant macro, and adds xmm9
   * into xmm8.  The result is then scaled as
   * xmm8 * xmm2 -/+ swapped(xmm8) * xmm3 (addsubps subtracts in even lanes
   * and adds in odd lanes); presumably the complex multiplication by
   * alpha - TODO confirm against the code that fills ALPHA_R / ALPHA_I.
   * Only the low 8 bytes are read from and written to (CO1).
   */
  1309. .L68:
  1310. ADD1 %xmm1, %xmm8
  1311. ADD2 %xmm2, %xmm9
  1312. pcmpeqb %xmm0, %xmm0 /* xmm0 = all ones */
  1313. psllq $63, %xmm0 /* keep only bit 63 of each 64-bit half */
  /* xmm2 / xmm3 = the 64 bits at ALPHA_R / ALPHA_I copied to both halves */
  1314. movddup ALPHA_R, %xmm2
  1315. movddup ALPHA_I, %xmm3
  1316. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  /* negate lanes 1 and 3 of xmm9, then swap adjacent lanes */
  1317. pxor %xmm0, %xmm9
  1318. shufps $0xb1, %xmm9, %xmm9
  1319. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  /* swap adjacent lanes of xmm9, then negate lanes 1 and 3 */
  1320. shufps $0xb1, %xmm9, %xmm9
  1321. pxor %xmm0, %xmm9
  1322. #else
  /* negate lanes 1 and 3 of xmm8; swap adjacent lanes of xmm9 */
  1323. pxor %xmm0, %xmm8
  1324. shufps $0xb1, %xmm9, %xmm9
  1325. #endif
  1326. addps %xmm9, %xmm8
  1327. pshufd $0xb1, %xmm8, %xmm9 /* xmm9 = xmm8 with adjacent lanes swapped */
  1328. mulps %xmm2, %xmm8
  1329. mulps %xmm3, %xmm9
  1330. addsubps %xmm9, %xmm8
  1331. #ifndef TRMMKERNEL
  1332. movsd (CO1), %xmm0
  1333. addps %xmm0, %xmm8
  1334. #endif
  1335. movsd %xmm8, (CO1)
  1336. ALIGN_4
  /*
   * .L999 - common exit.  Reloads rbx, rbp and r12-r15 from the first six
   * 8-byte stack slots; under WINDOWS_ABI it also reloads rdi, rsi and
   * xmm6-xmm15.  Then it releases STACKSIZE bytes of stack and returns.
   * The matching saves and the definitions of STACKSIZE and EPILOGUE are
   * outside this chunk.
   */
  1337. .L999:
  1338. movq 0(%rsp), %rbx
  1339. movq 8(%rsp), %rbp
  1340. movq 16(%rsp), %r12
  1341. movq 24(%rsp), %r13
  1342. movq 32(%rsp), %r14
  1343. movq 40(%rsp), %r15
  1344. #ifdef WINDOWS_ABI
  1345. movq 48(%rsp), %rdi
  1346. movq 56(%rsp), %rsi
  /* unaligned 16-byte loads, one slot per register */
  1347. movups 64(%rsp), %xmm6
  1348. movups 80(%rsp), %xmm7
  1349. movups 96(%rsp), %xmm8
  1350. movups 112(%rsp), %xmm9
  1351. movups 128(%rsp), %xmm10
  1352. movups 144(%rsp), %xmm11
  1353. movups 160(%rsp), %xmm12
  1354. movups 176(%rsp), %xmm13
  1355. movups 192(%rsp), %xmm14
  1356. movups 208(%rsp), %xmm15
  1357. #endif
  1358. addq $STACKSIZE, %rsp
  1359. ret
  1360. EPILOGUE