You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemm_kernel_2x4_barcelona.S 26 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Frame bookkeeping: the prologue does subl $ARGS, %esp and then pushes four
     4-byte registers (STACK bytes in total). */
  40. #define STACK 16
  41. #define ARGS 16
  /* Caller-supplied stack arguments, addressed past the saved registers and the
     ARGS-byte local area (the leading 4 presumably skips the return address).
     ALPHA spans 8 bytes: the next argument is at offset 24. */
  42. #define M 4 + STACK + ARGS(%esp)
  43. #define N 8 + STACK + ARGS(%esp)
  44. #define K 12 + STACK + ARGS(%esp)
  45. #define ALPHA 16 + STACK + ARGS(%esp)
  46. #define A 24 + STACK + ARGS(%esp)
  47. #define OLD_B 28 + STACK + ARGS(%esp)
  48. #define C 32 + STACK + ARGS(%esp)
  49. #define OLD_LDC 36 + STACK + ARGS(%esp)
  50. #define OFFSET 40 + STACK + ARGS(%esp)
  /* Local variables kept in the ARGS-byte area reserved by the prologue:
     J = column-panel counter, BX = prefetch pointer, KK / KKK = TRMM k
     offset and effective k count. */
  51. #define J 0 + STACK(%esp)
  52. #define BX 4 + STACK(%esp)
  53. #define KK 8 + STACK(%esp)
  54. #define KKK 12 + STACK(%esp)
  /* Register roles: B = base of the current B panel, LDC = column stride of C
     (multiplied by SIZE in the prologue), AO / BO = running pointers into A / B,
     CO = current position in C, I = row-block loop counter. */
  55. #define B %edi
  56. #define LDC %ebp
  57. #define AO %edx
  58. #define BO %ecx
  59. #define CO %esi
  60. #define I %ebx
  /* Instruction remapping: from here on these double-precision mnemonics
     assemble as movlps / movups / movhps. In particular every movapd in this
     file becomes an unaligned movups. */
  61. #define movsd movlps
  62. #define movapd movups
  63. #define movlpd movlps
  64. #define movhpd movhps
  /* Prefetch instruction and look-ahead distance (in units of SIZE) for A. */
  65. #define PREFETCH prefetch
  66. #define PREFETCHSIZE (8 * 7 + 0)
  /* KERNEL1..KERNEL8: eight consecutive k-steps of the 2x4 micro-kernel used by
     the unrolled loop at .L12. AO and BO are addressed relative to the byte
     index in %eax (scaled by 2 for A, by 4 for B). Each step multiplies two
     broadcast elements of A (a0, a1) by four packed elements of B:
       xmm4 += a0 * B[0..1]     xmm5 += a0 * B[2..3]
       xmm6 += a1 * B[0..1]     xmm7 += a1 * B[2..3]
     On entry to KERNEL1 the caller must have xmm0 = a0 (broadcast),
     xmm1 = xmm2 = B[0..1] and xmm3 = the broadcast A element at -8 * SIZE.
     The xmm6 update is deferred: each macro leaves a1 * B[0..1] in xmm2 and the
     next macro (or the tail of KERNEL8) adds it into xmm6.
     KERNEL1-4 take A from xmm0, KERNEL5-8 take A from xmm3. KERNEL4 and
     KERNEL8 end by preloading xmm0 / xmm3 for the following group of eight.
     The address parameter is not referenced by any of the macro bodies. */
  /* k-step 0: B at -16 (preloaded) and -14, A at -16 (preloaded) and -15. */
  67. #define KERNEL1(address) \
  68. mulpd %xmm0, %xmm1; \
  69. mulpd -14 * SIZE(BO, %eax, 4), %xmm0; \
  70. addpd %xmm1, %xmm4; \
  71. movapd -12 * SIZE(BO, %eax, 4), %xmm1; \
  72. addpd %xmm0, %xmm5; \
  73. movddup -15 * SIZE(AO, %eax, 2), %xmm0; \
  74. mulpd %xmm0, %xmm2; \
  75. mulpd -14 * SIZE(BO, %eax, 4), %xmm0; \
  76. addpd %xmm0, %xmm7; \
  77. movddup -14 * SIZE(AO, %eax, 2), %xmm0
  /* k-step 1: B at -12 / -10, A at -14 / -13. */
  78. #define KERNEL2(address) \
  79. addpd %xmm2, %xmm6; \
  80. movapd %xmm1, %xmm2; \
  81. mulpd %xmm0, %xmm1; \
  82. mulpd -10 * SIZE(BO, %eax, 4), %xmm0; \
  83. addpd %xmm1, %xmm4; \
  84. movapd -8 * SIZE(BO, %eax, 4), %xmm1; \
  85. addpd %xmm0, %xmm5; \
  86. movddup -13 * SIZE(AO, %eax, 2), %xmm0; \
  87. mulpd %xmm0, %xmm2; \
  88. mulpd -10 * SIZE(BO, %eax, 4), %xmm0; \
  89. addpd %xmm0, %xmm7; \
  90. movddup -12 * SIZE(AO, %eax, 2), %xmm0
  /* k-step 2: B at -8 / -6, A at -12 / -11. */
  91. #define KERNEL3(address) \
  92. addpd %xmm2, %xmm6; \
  93. movapd %xmm1, %xmm2; \
  94. mulpd %xmm0, %xmm1; \
  95. mulpd -6 * SIZE(BO, %eax, 4), %xmm0; \
  96. addpd %xmm1, %xmm4; \
  97. movapd -4 * SIZE(BO, %eax, 4), %xmm1; \
  98. addpd %xmm0, %xmm5; \
  99. movddup -11 * SIZE(AO, %eax, 2), %xmm0; \
  100. mulpd %xmm0, %xmm2; \
  101. mulpd -6 * SIZE(BO, %eax, 4), %xmm0; \
  102. addpd %xmm0, %xmm7; \
  103. movddup -10 * SIZE(AO, %eax, 2), %xmm0
  /* k-step 3: B at -4 / -2, A at -10 / -9. The final movddup loads the A
     element at offset 0, which is a0 of k-step 0 in the next group of eight. */
  104. #define KERNEL4(address) \
  105. addpd %xmm2, %xmm6; \
  106. movapd %xmm1, %xmm2; \
  107. mulpd %xmm0, %xmm1; \
  108. mulpd -2 * SIZE(BO, %eax, 4), %xmm0; \
  109. addpd %xmm1, %xmm4; \
  110. movapd (BO, %eax, 4), %xmm1; \
  111. addpd %xmm0, %xmm5; \
  112. movddup -9 * SIZE(AO, %eax, 2), %xmm0; \
  113. mulpd %xmm0, %xmm2; \
  114. mulpd -2 * SIZE(BO, %eax, 4), %xmm0; \
  115. addpd %xmm0, %xmm7; \
  116. movddup (AO, %eax, 2), %xmm0
  /* k-step 4: B at 0 / 2, A from xmm3 (preloaded from -8) and -7. */
  117. #define KERNEL5(address) \
  118. addpd %xmm2, %xmm6; \
  119. movapd %xmm1, %xmm2; \
  120. mulpd %xmm3, %xmm1; \
  121. mulpd 2 * SIZE(BO, %eax, 4), %xmm3; \
  122. addpd %xmm1, %xmm4; \
  123. movapd 4 * SIZE(BO, %eax, 4), %xmm1; \
  124. addpd %xmm3, %xmm5; \
  125. movddup -7 * SIZE(AO, %eax, 2), %xmm3; \
  126. mulpd %xmm3, %xmm2; \
  127. mulpd 2 * SIZE(BO, %eax, 4), %xmm3; \
  128. addpd %xmm3, %xmm7; \
  129. movddup -6 * SIZE(AO, %eax, 2), %xmm3
  /* k-step 5: B at 4 / 6, A at -6 / -5. */
  130. #define KERNEL6(address) \
  131. addpd %xmm2, %xmm6; \
  132. movapd %xmm1, %xmm2; \
  133. mulpd %xmm3, %xmm1; \
  134. mulpd 6 * SIZE(BO, %eax, 4), %xmm3; \
  135. addpd %xmm1, %xmm4; \
  136. movapd 8 * SIZE(BO, %eax, 4), %xmm1; \
  137. addpd %xmm3, %xmm5; \
  138. movddup -5 * SIZE(AO, %eax, 2), %xmm3; \
  139. mulpd %xmm3, %xmm2; \
  140. mulpd 6 * SIZE(BO, %eax, 4), %xmm3; \
  141. addpd %xmm3, %xmm7; \
  142. movddup -4 * SIZE(AO, %eax, 2), %xmm3
  /* k-step 6: B at 8 / 10, A at -4 / -3. */
  143. #define KERNEL7(address) \
  144. addpd %xmm2, %xmm6; \
  145. movapd %xmm1, %xmm2; \
  146. mulpd %xmm3, %xmm1; \
  147. mulpd 10 * SIZE(BO, %eax, 4), %xmm3; \
  148. addpd %xmm1, %xmm4; \
  149. movapd 12 * SIZE(BO, %eax, 4), %xmm1; \
  150. addpd %xmm3, %xmm5; \
  151. movddup -3 * SIZE(AO, %eax, 2), %xmm3; \
  152. mulpd %xmm3, %xmm2; \
  153. mulpd 10 * SIZE(BO, %eax, 4), %xmm3; \
  154. addpd %xmm3, %xmm7; \
  155. movddup -2 * SIZE(AO, %eax, 2), %xmm3
  /* k-step 7: B at 12 / 14, A at -2 / -1. Also loads B at 16 and A at 8 for
     the next group, flushes the deferred xmm6 update and re-copies xmm1 into
     xmm2 so the register state matches what KERNEL1 expects. */
  156. #define KERNEL8(address) \
  157. addpd %xmm2, %xmm6; \
  158. movapd %xmm1, %xmm2; \
  159. mulpd %xmm3, %xmm1; \
  160. mulpd 14 * SIZE(BO, %eax, 4), %xmm3; \
  161. addpd %xmm1, %xmm4; \
  162. movapd 16 * SIZE(BO, %eax, 4), %xmm1; \
  163. addpd %xmm3, %xmm5; \
  164. movddup -1 * SIZE(AO, %eax, 2), %xmm3; \
  165. mulpd %xmm3, %xmm2; \
  166. mulpd 14 * SIZE(BO, %eax, 4), %xmm3; \
  167. addpd %xmm3, %xmm7; \
  168. movddup 8 * SIZE(AO, %eax, 2), %xmm3; \
  169. addpd %xmm2, %xmm6; \
  170. movapd %xmm1, %xmm2
  /* Function entry. PROLOGUE and PROFCODE are macros from common.h (not shown
     here); presumably PROLOGUE emits the function symbol. The code below
     reserves the local area, saves the four registers restored at .L999,
     loads B and LDC into registers and starts the loop over 4-column panels. */
  171. PROLOGUE
  172. subl $ARGS, %esp # reserve the J / BX / KK / KKK slots
  173. pushl %ebp
  174. pushl %edi
  175. pushl %esi
  176. pushl %ebx
  177. PROFCODE
  178. movl OLD_B, B
  179. movl OLD_LDC, LDC
  180. #ifdef TRMMKERNEL
  /* KK starts as OFFSET, negated when LEFT is not defined. */
  181. movl OFFSET, %eax
  182. #ifndef LEFT
  183. negl %eax
  184. #endif
  185. movl %eax, KK
  186. #endif
  /* Bias A (in its stack slot) and B by +16 * SIZE; the loads below use
     displacements starting at -16 * SIZE to compensate. */
  187. subl $-16 * SIZE, A
  188. subl $-16 * SIZE, B
  189. leal (, LDC, SIZE), LDC # LDC *= SIZE
  190. movl N, %eax
  191. sarl $2, %eax # j = n >> 2 (number of 4-column panels)
  192. movl %eax, J
  193. jle .L30 # flags still come from sarl: no 4-column panel
  194. ALIGN_2
  /* .L01: top of the loop over 4-column panels (J iterations). */
  195. .L01:
  196. #if defined(TRMMKERNEL) && defined(LEFT)
  197. movl OFFSET, %eax
  198. movl %eax, KK
  199. #endif
  /* BX = B plus a fixed distance; .L11 prefetches from it and advances it. */
  200. leal GEMM_DEFAULT_Q * GEMM_DEFAULT_UNROLL_N * SIZE(B), %eax
  201. movl %eax, BX
  202. movl C, CO # coffset = c
  203. movl A, AO # aoffset = a
  204. movl M, I
  205. sarl $1, I # i = (m >> 1): number of 2-row blocks
  206. jle .L20
  207. ALIGN_4
  /* .L11: set up one 2x4 tile of C (two rows, four columns): choose BO, clear
     the accumulators xmm4-xmm7, preload the registers the KERNEL macros expect
     and turn the k count into a negative byte index in %eax. */
  208. .L11:
  209. #if !defined(TRMMKERNEL) || \
  210. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  211. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  212. movl B, BO
  213. #else
  /* Skip the first KK k-steps: 2 elements of A and 4 elements of B per step. */
  214. movl KK, %eax
  215. leal (, %eax, SIZE), %eax
  216. leal (AO, %eax, 2), AO
  217. leal (B, %eax, 4), BO
  218. #endif
  219. movddup -16 * SIZE(AO), %xmm0 # xmm0 = first A element, broadcast
  220. movapd -16 * SIZE(BO), %xmm1 # xmm1 = first two B elements
  221. pxor %xmm4, %xmm4
  222. movddup -8 * SIZE(AO), %xmm3 # xmm3 = A element consumed by KERNEL5
  223. leal (LDC, LDC, 2), %eax # eax = 3 * LDC
  224. prefetchw 1 * SIZE(CO)
  225. pxor %xmm5, %xmm5
  226. prefetchw 3 * SIZE(CO, LDC)
  227. pxor %xmm6, %xmm6
  228. prefetchw 1 * SIZE(CO, LDC, 2)
  229. pxor %xmm7, %xmm7
  230. prefetchw 3 * SIZE(CO, %eax)
  231. movapd %xmm1, %xmm2 # second copy of B[0..1] for the a1 product
  /* Prefetch from BX and advance it by 8 * SIZE for the next tile. */
  232. movl BX, %eax
  233. prefetch -16 * SIZE(%eax)
  234. addl $8 * SIZE, %eax
  235. movl %eax, BX
  /* eax = number of k-steps for this tile. In the TRMM variants it is either
     K - KK or KK + (2 with LEFT, 4 otherwise) and is also saved in KKK. */
  236. #ifndef TRMMKERNEL
  237. movl K, %eax
  238. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  239. movl K, %eax
  240. subl KK, %eax
  241. movl %eax, KKK
  242. #else
  243. movl KK, %eax
  244. #ifdef LEFT
  245. addl $2, %eax
  246. #else
  247. addl $4, %eax
  248. #endif
  249. movl %eax, KKK
  250. #endif
  /* Round the count down to a multiple of 8, move AO / BO past that many
     k-steps and index backwards from there with a negative %eax. */
  251. andl $-8, %eax
  252. leal (, %eax, SIZE), %eax
  253. leal (AO, %eax, 2), AO
  254. leal (BO, %eax, 4), BO
  255. negl %eax
  256. NOBRANCH
  257. je .L15
  258. ALIGN_3
  /* .L12: main 2x4 loop. The body is eight copies of the KERNEL1..KERNEL8
     sequence; each copy handles 8 k-steps and then advances %eax by 8 * SIZE.
     %eax is a negative byte index that counts up to zero, so every copy can
     leave early through je .L15 and the last one loops back while %eax < 0. */
  259. .L12:
  260. KERNEL1(16 * 0)
  261. KERNEL2(16 * 0)
  262. KERNEL3(16 * 0)
  263. KERNEL4(16 * 0)
  264. KERNEL5(16 * 0)
  265. KERNEL6(16 * 0)
  266. KERNEL7(16 * 0)
  267. KERNEL8(16 * 0)
  268. addl $8 * SIZE, %eax
  269. NOBRANCH
  270. je .L15
  271. KERNEL1(16 * 0)
  272. KERNEL2(16 * 0)
  273. KERNEL3(16 * 0)
  274. KERNEL4(16 * 0)
  275. KERNEL5(16 * 0)
  276. KERNEL6(16 * 0)
  277. KERNEL7(16 * 0)
  278. KERNEL8(16 * 0)
  279. addl $8 * SIZE, %eax
  280. NOBRANCH
  281. je .L15
  282. KERNEL1(16 * 0)
  283. KERNEL2(16 * 0)
  284. KERNEL3(16 * 0)
  285. KERNEL4(16 * 0)
  286. KERNEL5(16 * 0)
  287. KERNEL6(16 * 0)
  288. KERNEL7(16 * 0)
  289. KERNEL8(16 * 0)
  290. addl $8 * SIZE, %eax
  291. NOBRANCH
  292. je .L15
  293. KERNEL1(16 * 0)
  294. KERNEL2(16 * 0)
  295. KERNEL3(16 * 0)
  296. KERNEL4(16 * 0)
  297. KERNEL5(16 * 0)
  298. KERNEL6(16 * 0)
  299. KERNEL7(16 * 0)
  300. KERNEL8(16 * 0)
  301. addl $8 * SIZE, %eax
  302. NOBRANCH
  303. je .L15
  304. KERNEL1(16 * 0)
  305. KERNEL2(16 * 0)
  306. KERNEL3(16 * 0)
  307. KERNEL4(16 * 0)
  308. KERNEL5(16 * 0)
  309. KERNEL6(16 * 0)
  310. KERNEL7(16 * 0)
  311. KERNEL8(16 * 0)
  312. addl $8 * SIZE, %eax
  313. NOBRANCH
  314. je .L15
  315. KERNEL1(16 * 0)
  316. KERNEL2(16 * 0)
  317. KERNEL3(16 * 0)
  318. KERNEL4(16 * 0)
  319. KERNEL5(16 * 0)
  320. KERNEL6(16 * 0)
  321. KERNEL7(16 * 0)
  322. KERNEL8(16 * 0)
  323. addl $8 * SIZE, %eax
  324. NOBRANCH
  325. je .L15
  326. KERNEL1(16 * 0)
  327. KERNEL2(16 * 0)
  328. KERNEL3(16 * 0)
  329. KERNEL4(16 * 0)
  330. KERNEL5(16 * 0)
  331. KERNEL6(16 * 0)
  332. KERNEL7(16 * 0)
  333. KERNEL8(16 * 0)
  334. addl $8 * SIZE, %eax
  335. NOBRANCH
  336. je .L15
  337. KERNEL1(16 * 0)
  338. KERNEL2(16 * 0)
  339. KERNEL3(16 * 0)
  340. KERNEL4(16 * 0)
  341. KERNEL5(16 * 0)
  342. KERNEL6(16 * 0)
  343. KERNEL7(16 * 0)
  344. KERNEL8(16 * 0)
  345. addl $8 * SIZE, %eax
  346. BRANCH
  347. jl .L12
  348. ALIGN_3
  /* .L15: broadcast alpha into xmm3, then handle the k-steps left over after
     the groups of eight (count & 7). */
  349. .L15:
  350. movddup ALPHA, %xmm3
  351. #ifndef TRMMKERNEL
  352. movl K, %eax
  353. #else
  354. movl KKK, %eax
  355. #endif
  356. andl $7, %eax # eax = k count & 7 (leftover k-steps)
  357. je .L18
  /* Same negative-index scheme as the main loop, one k-step per iteration. */
  358. leal (, %eax, SIZE), %eax
  359. leal (AO, %eax, 2), AO
  360. leal (BO, %eax, 4), BO
  361. negl %eax
  362. ALIGN_3
  /* .L17: one 2x4 k-step; same arithmetic as KERNEL1 followed by the deferred
     xmm6 update and the xmm1 -> xmm2 copy. */
  363. .L17:
  364. mulpd %xmm0, %xmm1
  365. mulpd -14 * SIZE(BO, %eax, 4), %xmm0
  366. addpd %xmm1, %xmm4
  367. movapd -12 * SIZE(BO, %eax, 4), %xmm1
  368. addpd %xmm0, %xmm5
  369. movddup -15 * SIZE(AO, %eax, 2), %xmm0
  370. mulpd %xmm0, %xmm2
  371. mulpd -14 * SIZE(BO, %eax, 4), %xmm0
  372. addpd %xmm0, %xmm7
  373. movddup -14 * SIZE(AO, %eax, 2), %xmm0
  374. addpd %xmm2, %xmm6
  375. movapd %xmm1, %xmm2
  376. addl $SIZE, %eax
  377. jl .L17
  378. ALIGN_4
  /* .L18: scale the 2x4 tile by alpha and write it to C.
     Accumulator layout (low, high half): xmm4 = row 0 of columns 0, 1;
     xmm6 = row 1 of columns 0, 1; xmm5 = row 0 of columns 2, 3;
     xmm7 = row 1 of columns 2, 3. Without TRMMKERNEL the existing C values
     are loaded and added first; with TRMMKERNEL C is overwritten. */
  379. .L18:
  380. leal (CO, LDC, 2), %eax # eax = address of column 2
  381. mulpd %xmm3, %xmm4
  382. mulpd %xmm3, %xmm5
  383. mulpd %xmm3, %xmm6
  384. mulpd %xmm3, %xmm7
  385. #ifndef TRMMKERNEL
  /* xmm0-xmm3 are reused as temporaries for the old C values (alpha in xmm3
     is no longer needed at this point). */
  386. movsd 0 * SIZE(CO ), %xmm0
  387. movhpd 0 * SIZE(CO, LDC), %xmm0
  388. movsd 0 * SIZE(%eax ), %xmm1
  389. movhpd 0 * SIZE(%eax, LDC), %xmm1
  390. movsd 1 * SIZE(CO ), %xmm2
  391. movhpd 1 * SIZE(CO, LDC), %xmm2
  392. movsd 1 * SIZE(%eax ), %xmm3
  393. movhpd 1 * SIZE(%eax, LDC), %xmm3
  394. addpd %xmm0, %xmm4
  395. addpd %xmm1, %xmm5
  396. addpd %xmm2, %xmm6
  397. addpd %xmm3, %xmm7
  398. #endif
  399. movsd %xmm4, 0 * SIZE(CO)
  400. movsd %xmm6, 1 * SIZE(CO)
  401. movhpd %xmm4, 0 * SIZE(CO, LDC)
  402. movhpd %xmm6, 1 * SIZE(CO, LDC)
  403. movsd %xmm5, 0 * SIZE(%eax)
  404. movsd %xmm7, 1 * SIZE(%eax)
  405. movhpd %xmm5, 0 * SIZE(%eax, LDC)
  406. movhpd %xmm7, 1 * SIZE(%eax, LDC)
  407. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  408. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  /* Advance AO / BO over the K - KKK k-steps that were not processed. */
  409. movl K, %eax
  410. subl KKK, %eax
  411. leal (,%eax, SIZE), %eax
  412. leal (AO, %eax, 2), AO
  413. leal (BO, %eax, 4), BO
  414. #endif
  415. #if defined(TRMMKERNEL) && defined(LEFT)
  416. addl $2, KK
  417. #endif
  418. addl $2 * SIZE, CO # coffset += 2
  419. decl I # i --
  420. jg .L11
  421. ALIGN_4
  /* .L20: leftover single row (m odd) of the 4-column panel: a 1x4 tile. */
  422. .L20:
  423. movl M, I
  424. testl $1, I # test (m & 1); I itself is unchanged
  425. jle .L29 # taken when (m & 1) == 0
  426. #if !defined(TRMMKERNEL) || \
  427. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  428. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  429. movl B, BO
  430. #else
  /* Skip the first KK k-steps: 1 element of A and 4 elements of B per step. */
  431. movl KK, %eax
  432. leal (, %eax, SIZE), %eax
  433. leal (AO, %eax, 1), AO
  434. leal (B, %eax, 4), BO
  435. #endif
  436. movddup -16 * SIZE(AO), %xmm0
  437. movapd -16 * SIZE(BO), %xmm1
  438. movddup -8 * SIZE(AO), %xmm3
  439. pxor %xmm4, %xmm4
  440. pxor %xmm5, %xmm5
  441. pxor %xmm6, %xmm6
  442. pxor %xmm7, %xmm7
  /* eax = number of k-steps; the TRMM variants also store it in KKK. */
  443. #ifndef TRMMKERNEL
  444. movl K, %eax
  445. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  446. movl K, %eax
  447. subl KK, %eax
  448. movl %eax, KKK
  449. #else
  450. movl KK, %eax
  451. #ifdef LEFT
  452. addl $1, %eax
  453. #else
  454. addl $4, %eax
  455. #endif
  456. movl %eax, KKK
  457. #endif
  458. sarl $3, %eax # eax = count / 8 (iterations of the unrolled loop)
  459. je .L25
  460. ALIGN_4
  /* .L22: 1x4 loop unrolled over 8 k-steps. Each step broadcasts one A
     element and multiplies it by four B elements. Even steps accumulate into
     xmm4 / xmm5, odd steps into xmm6 / xmm7; the pairs are summed at .L28.
     Per iteration AO advances by 8 * SIZE and BO by 32 * SIZE. */
  461. .L22:
  462. mulpd %xmm0, %xmm1
  463. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  464. mulpd -14 * SIZE(BO), %xmm0
  465. addpd %xmm1, %xmm4
  466. movapd -12 * SIZE(BO), %xmm1
  467. addpd %xmm0, %xmm5
  468. movddup -15 * SIZE(AO), %xmm0
  469. mulpd %xmm0, %xmm1
  470. mulpd -10 * SIZE(BO), %xmm0
  471. addpd %xmm1, %xmm6
  472. movapd -8 * SIZE(BO), %xmm1
  473. addpd %xmm0, %xmm7
  474. movddup -14 * SIZE(AO), %xmm0
  475. mulpd %xmm0, %xmm1
  476. mulpd -6 * SIZE(BO), %xmm0
  477. addpd %xmm1, %xmm4
  478. movapd -4 * SIZE(BO), %xmm1
  479. addpd %xmm0, %xmm5
  480. movddup -13 * SIZE(AO), %xmm0
  481. mulpd %xmm0, %xmm1
  482. mulpd -2 * SIZE(BO), %xmm0
  483. addpd %xmm1, %xmm6
  484. movapd (BO), %xmm1
  485. addpd %xmm0, %xmm7
  486. movddup -12 * SIZE(AO), %xmm0
  487. mulpd %xmm0, %xmm1
  488. mulpd 2 * SIZE(BO), %xmm0
  489. addpd %xmm1, %xmm4
  490. movapd 4 * SIZE(BO), %xmm1
  491. addpd %xmm0, %xmm5
  492. movddup -11 * SIZE(AO), %xmm0
  493. mulpd %xmm0, %xmm1
  494. mulpd 6 * SIZE(BO), %xmm0
  495. addpd %xmm1, %xmm6
  496. movapd 8 * SIZE(BO), %xmm1
  497. addpd %xmm0, %xmm7
  498. movddup -10 * SIZE(AO), %xmm0
  499. mulpd %xmm0, %xmm1
  500. mulpd 10 * SIZE(BO), %xmm0
  501. addpd %xmm1, %xmm4
  502. movapd 12 * SIZE(BO), %xmm1
  503. addpd %xmm0, %xmm5
  504. movddup -9 * SIZE(AO), %xmm0
  505. mulpd %xmm0, %xmm1
  506. mulpd 14 * SIZE(BO), %xmm0
  507. addpd %xmm1, %xmm6
  508. movapd 16 * SIZE(BO), %xmm1
  509. addpd %xmm0, %xmm7
  510. movddup -8 * SIZE(AO), %xmm0
  511. subl $ -8 * SIZE, AO
  512. subl $-32 * SIZE, BO
  513. decl %eax
  514. jne .L22
  515. ALIGN_4
  /* .L25: broadcast alpha into xmm3 and run the leftover k-steps (count & 7)
     of the 1x4 tile one at a time. */
  516. .L25:
  517. movddup ALPHA, %xmm3
  518. #ifndef TRMMKERNEL
  519. movl K, %eax
  520. #else
  521. movl KKK, %eax
  522. #endif
  523. andl $7, %eax # eax = k count & 7 (leftover k-steps)
  524. BRANCH
  525. je .L28
  /* .L26: one 1x4 k-step, accumulating into xmm4 / xmm5 only. */
  526. .L26:
  527. mulpd %xmm0, %xmm1
  528. mulpd -14 * SIZE(BO), %xmm0
  529. addpd %xmm1, %xmm4
  530. movapd -12 * SIZE(BO), %xmm1
  531. addpd %xmm0, %xmm5
  532. movddup -15 * SIZE(AO), %xmm0
  533. addl $1 * SIZE, AO
  534. addl $4 * SIZE, BO
  535. decl %eax
  536. jg .L26
  537. ALIGN_4
  /* .L28: combine the even / odd partial sums of the 1x4 tile, scale by alpha
     and write one element to each of the four columns of C. Without
     TRMMKERNEL the old C values are added first. */
  538. .L28:
  539. leal (CO, LDC, 2), %eax # eax = address of column 2
  540. addpd %xmm6, %xmm4
  541. addpd %xmm7, %xmm5
  542. mulpd %xmm3, %xmm4
  543. mulpd %xmm3, %xmm5
  544. #ifndef TRMMKERNEL
  545. movsd 0 * SIZE(CO ), %xmm0
  546. movhpd 0 * SIZE(CO, LDC), %xmm0
  547. movsd 0 * SIZE(%eax ), %xmm1
  548. movhpd 0 * SIZE(%eax, LDC), %xmm1
  549. addpd %xmm0, %xmm4
  550. addpd %xmm1, %xmm5
  551. #endif
  552. movsd %xmm4, 0 * SIZE(CO )
  553. movhpd %xmm4, 0 * SIZE(CO, LDC)
  554. movsd %xmm5, 0 * SIZE(%eax )
  555. movhpd %xmm5, 0 * SIZE(%eax, LDC)
  556. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  557. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  /* Advance AO / BO over the K - KKK k-steps that were not processed. */
  558. movl K, %eax
  559. subl KKK, %eax
  560. leal (,%eax, SIZE), %eax
  561. leal (AO, %eax, 1), AO
  562. leal (BO, %eax, 4), BO
  563. #endif
  564. #if defined(TRMMKERNEL) && defined(LEFT)
  565. addl $1, KK
  566. #endif
  567. ALIGN_4
  /* .L29: end of one 4-column panel: B continues from where BO stopped, C
     moves four columns on, and the panel loop repeats while J > 0. */
  568. .L29:
  569. #if defined(TRMMKERNEL) && !defined(LEFT)
  570. addl $4, KK
  571. #endif
  572. movl BO, B
  573. leal (, LDC, 4), %eax
  574. addl %eax, C # c += 4 * ldc
  575. decl J # j --
  576. jg .L01
  577. ALIGN_4
  /* .L30: after all 4-column panels, handle a 2-column panel if bit 1 of N
     is set. */
  578. .L30:
  579. testl $2, N
  580. je .L60
  581. ALIGN_2
  582. .L31:
  583. #if defined(TRMMKERNEL) && defined(LEFT)
  584. movl OFFSET, %eax
  585. movl %eax, KK
  586. #endif
  587. movl C, CO # coffset = c
  588. movl A, AO # aoffset = a
  589. movl M, I
  590. sarl $1, I # i = (m >> 1): number of 2-row blocks
  591. jle .L50
  592. ALIGN_4
  /* .L41: set up one 2x2 tile of C (two rows, two columns). */
  593. .L41:
  594. #if !defined(TRMMKERNEL) || \
  595. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  596. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  597. movl B, BO
  598. #else
  /* Skip the first KK k-steps: 2 elements of A and 2 elements of B per step. */
  599. movl KK, %eax
  600. leal (, %eax, SIZE), %eax
  601. leal (AO, %eax, 2), AO
  602. leal (B, %eax, 2), BO
  603. #endif
  604. movddup -16 * SIZE(AO), %xmm0
  605. pxor %xmm4, %xmm4
  606. prefetchw 1 * SIZE(CO)
  607. pxor %xmm5, %xmm5
  608. prefetchw 1 * SIZE(CO, LDC)
  609. pxor %xmm6, %xmm6
  610. pxor %xmm7, %xmm7
  /* eax = number of k-steps; the TRMM variants also store it in KKK. Both
     arms of the inner LEFT test add 2 here (2 rows and 2 columns). */
  611. #ifndef TRMMKERNEL
  612. movl K, %eax
  613. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  614. movl K, %eax
  615. subl KK, %eax
  616. movl %eax, KKK
  617. #else
  618. movl KK, %eax
  619. #ifdef LEFT
  620. addl $2, %eax
  621. #else
  622. addl $2, %eax
  623. #endif
  624. movl %eax, KKK
  625. #endif
  626. sarl $3, %eax # eax = count / 8 (iterations of the unrolled loop)
  627. je .L45
  628. ALIGN_4
  /* .L42: 2x2 loop unrolled over 8 k-steps. Each step broadcasts a0 and a1
     and multiplies each by the same pair of B elements. Even steps accumulate
     into xmm4 (row 0) / xmm5 (row 1), odd steps into xmm6 / xmm7; the pairs
     are summed at .L48. AO and BO both advance by 16 * SIZE per iteration. */
  629. .L42:
  630. prefetcht0 (PREFETCHSIZE + 0) * SIZE(AO)
  631. mulpd -16 * SIZE(BO), %xmm0
  632. movddup -15 * SIZE(AO), %xmm1
  633. addpd %xmm0, %xmm4
  634. mulpd -16 * SIZE(BO), %xmm1
  635. movddup -14 * SIZE(AO), %xmm0
  636. addpd %xmm1, %xmm5
  637. mulpd -14 * SIZE(BO), %xmm0
  638. movddup -13 * SIZE(AO), %xmm1
  639. addpd %xmm0, %xmm6
  640. mulpd -14 * SIZE(BO), %xmm1
  641. movddup -12 * SIZE(AO), %xmm0
  642. addpd %xmm1, %xmm7
  643. mulpd -12 * SIZE(BO), %xmm0
  644. movddup -11 * SIZE(AO), %xmm1
  645. addpd %xmm0, %xmm4
  646. mulpd -12 * SIZE(BO), %xmm1
  647. movddup -10 * SIZE(AO), %xmm0
  648. addpd %xmm1, %xmm5
  649. mulpd -10 * SIZE(BO), %xmm0
  650. movddup -9 * SIZE(AO), %xmm1
  651. addpd %xmm0, %xmm6
  652. mulpd -10 * SIZE(BO), %xmm1
  653. movddup -8 * SIZE(AO), %xmm0
  654. addpd %xmm1, %xmm7
  655. prefetcht0 (PREFETCHSIZE + 8) * SIZE(AO)
  656. mulpd -8 * SIZE(BO), %xmm0
  657. movddup -7 * SIZE(AO), %xmm1
  658. addpd %xmm0, %xmm4
  659. mulpd -8 * SIZE(BO), %xmm1
  660. movddup -6 * SIZE(AO), %xmm0
  661. addpd %xmm1, %xmm5
  662. mulpd -6 * SIZE(BO), %xmm0
  663. movddup -5 * SIZE(AO), %xmm1
  664. addpd %xmm0, %xmm6
  665. mulpd -6 * SIZE(BO), %xmm1
  666. movddup -4 * SIZE(AO), %xmm0
  667. addpd %xmm1, %xmm7
  668. mulpd -4 * SIZE(BO), %xmm0
  669. movddup -3 * SIZE(AO), %xmm1
  670. addpd %xmm0, %xmm4
  671. mulpd -4 * SIZE(BO), %xmm1
  672. movddup -2 * SIZE(AO), %xmm0
  673. addpd %xmm1, %xmm5
  674. mulpd -2 * SIZE(BO), %xmm0
  675. movddup -1 * SIZE(AO), %xmm1
  676. addpd %xmm0, %xmm6
  677. mulpd -2 * SIZE(BO), %xmm1
  678. movddup 0 * SIZE(AO), %xmm0
  679. addpd %xmm1, %xmm7
  680. subl $-16 * SIZE, AO
  681. subl $-16 * SIZE, BO
  682. decl %eax
  683. jne .L42
  684. ALIGN_4
  /* .L45: broadcast alpha into xmm3 and run the leftover k-steps (count & 7)
     of the 2x2 tile one at a time. */
  685. .L45:
  686. #ifndef TRMMKERNEL
  687. movl K, %eax
  688. #else
  689. movl KKK, %eax
  690. #endif
  691. movddup ALPHA, %xmm3
  692. andl $7, %eax # eax = k count & 7 (leftover k-steps)
  693. BRANCH
  694. je .L48
  695. ALIGN_3
  /* .L46: one 2x2 k-step, accumulating into xmm4 / xmm5 only. */
  696. .L46:
  697. mulpd -16 * SIZE(BO), %xmm0
  698. movddup -15 * SIZE(AO), %xmm1
  699. addpd %xmm0, %xmm4
  700. mulpd -16 * SIZE(BO), %xmm1
  701. movddup -14 * SIZE(AO), %xmm0
  702. addpd %xmm1, %xmm5
  703. addl $2 * SIZE, AO
  704. addl $2 * SIZE, BO
  705. decl %eax
  706. jg .L46
  707. ALIGN_4
  /* .L48: combine the partial sums of the 2x2 tile, scale by alpha and write
     it to C. xmm4 holds row 0 of columns 0, 1 and xmm5 holds row 1 of
     columns 0, 1. Without TRMMKERNEL the old C values (loaded up front into
     xmm0 / xmm1) are added before the store. */
  708. .L48:
  709. #ifndef TRMMKERNEL
  710. movsd 0 * SIZE(CO), %xmm0
  711. movhpd 0 * SIZE(CO, LDC), %xmm0
  712. movsd 1 * SIZE(CO), %xmm1
  713. movhpd 1 * SIZE(CO, LDC), %xmm1
  714. #endif
  715. addpd %xmm6, %xmm4
  716. addpd %xmm7, %xmm5
  717. mulpd %xmm3, %xmm4
  718. mulpd %xmm3, %xmm5
  719. #ifndef TRMMKERNEL
  720. addpd %xmm0, %xmm4
  721. addpd %xmm1, %xmm5
  722. #endif
  723. movlpd %xmm4, 0 * SIZE(CO)
  724. movlpd %xmm5, 1 * SIZE(CO)
  725. movhpd %xmm4, 0 * SIZE(CO, LDC)
  726. movhpd %xmm5, 1 * SIZE(CO, LDC)
  727. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  728. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  /* Advance AO / BO over the K - KKK k-steps that were not processed. */
  729. movl K, %eax
  730. subl KKK, %eax
  731. leal (,%eax, SIZE), %eax
  732. leal (AO, %eax, 2), AO
  733. leal (BO, %eax, 2), BO
  734. #endif
  735. #if defined(TRMMKERNEL) && defined(LEFT)
  736. addl $2, KK
  737. #endif
  738. addl $2 * SIZE, CO # coffset += 2
  739. decl I # i --
  740. jg .L41
  741. ALIGN_4
  /* .L50: leftover single row (m odd) of the 2-column panel: a 1x2 tile. */
  742. .L50:
  743. movl M, I
  744. testl $1, I # test (m & 1); I itself is unchanged
  745. jle .L59 # taken when (m & 1) == 0
  746. #if !defined(TRMMKERNEL) || \
  747. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  748. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  749. movl B, BO
  750. #else
  /* Skip the first KK k-steps: 1 element of A and 2 elements of B per step. */
  751. movl KK, %eax
  752. leal (, %eax, SIZE), %eax
  753. leal (AO, %eax, 1), AO
  754. leal (B, %eax, 2), BO
  755. #endif
  756. movddup -16 * SIZE(AO), %xmm0
  757. pxor %xmm4, %xmm4
  758. pxor %xmm5, %xmm5
  759. pxor %xmm6, %xmm6
  760. pxor %xmm7, %xmm7
  /* eax = number of k-steps; the TRMM variants also store it in KKK. */
  761. #ifndef TRMMKERNEL
  762. movl K, %eax
  763. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  764. movl K, %eax
  765. subl KK, %eax
  766. movl %eax, KKK
  767. #else
  768. movl KK, %eax
  769. #ifdef LEFT
  770. addl $1, %eax
  771. #else
  772. addl $2, %eax
  773. #endif
  774. movl %eax, KKK
  775. #endif
  776. sarl $3, %eax # eax = count / 8 (iterations of the unrolled loop)
  777. je .L55
  778. ALIGN_4
  /* .L52: 1x2 loop unrolled over 8 k-steps. Each step multiplies one
     broadcast A element by a pair of B elements and adds the result to xmm4.
     Per iteration AO advances by 8 * SIZE and BO by 16 * SIZE. */
  779. .L52:
  780. mulpd -16 * SIZE(BO), %xmm0
  781. addpd %xmm0, %xmm4
  782. movddup -15 * SIZE(AO), %xmm0
  783. mulpd -14 * SIZE(BO), %xmm0
  784. addpd %xmm0, %xmm4
  785. movddup -14 * SIZE(AO), %xmm0
  786. mulpd -12 * SIZE(BO), %xmm0
  787. addpd %xmm0, %xmm4
  788. movddup -13 * SIZE(AO), %xmm0
  789. mulpd -10 * SIZE(BO), %xmm0
  790. addpd %xmm0, %xmm4
  791. movddup -12 * SIZE(AO), %xmm0
  792. mulpd -8 * SIZE(BO), %xmm0
  793. addpd %xmm0, %xmm4
  794. movddup -11 * SIZE(AO), %xmm0
  795. mulpd -6 * SIZE(BO), %xmm0
  796. addpd %xmm0, %xmm4
  797. movddup -10 * SIZE(AO), %xmm0
  798. mulpd -4 * SIZE(BO), %xmm0
  799. addpd %xmm0, %xmm4
  800. movddup -9 * SIZE(AO), %xmm0
  801. mulpd -2 * SIZE(BO), %xmm0
  802. addpd %xmm0, %xmm4
  803. movddup -8 * SIZE(AO), %xmm0
  804. subl $ -8 * SIZE, AO
  805. subl $-16 * SIZE, BO
  806. decl %eax
  807. jne .L52
  808. ALIGN_4
  /* .L55: broadcast alpha into xmm3 and run the leftover k-steps (count & 7)
     of the 1x2 tile one at a time. */
  809. .L55:
  810. movddup ALPHA, %xmm3
  811. #ifndef TRMMKERNEL
  812. movl K, %eax
  813. #else
  814. movl KKK, %eax
  815. #endif
  816. andl $7, %eax # eax = k count & 7 (leftover k-steps)
  817. BRANCH
  818. je .L58
  /* .L56: one 1x2 k-step. */
  819. .L56:
  820. mulpd -16 * SIZE(BO), %xmm0
  821. addpd %xmm0, %xmm4
  822. movddup -15 * SIZE(AO), %xmm0
  823. subl $-1 * SIZE, AO
  824. subl $-2 * SIZE, BO
  825. decl %eax
  826. jg .L56
  827. ALIGN_4
  /* .L58: finish the 1x2 tile: scale by alpha and write one element to each
     of the two columns. The loops above accumulate only into xmm4; xmm5-xmm7
     were cleared at .L50 and are not written in between, so the three addpd
     instructions below add zeros. */
  828. .L58:
  829. addpd %xmm6, %xmm4
  830. addpd %xmm7, %xmm5
  831. addpd %xmm5, %xmm4
  832. mulpd %xmm3, %xmm4
  833. #ifndef TRMMKERNEL
  834. movsd 0 * SIZE(CO), %xmm0
  835. movhpd 0 * SIZE(CO, LDC), %xmm0
  836. addpd %xmm0, %xmm4
  837. #endif
  838. movlpd %xmm4, 0 * SIZE(CO)
  839. movhpd %xmm4, 0 * SIZE(CO, LDC, 1)
  840. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  841. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  /* Advance AO / BO over the K - KKK k-steps that were not processed. */
  842. movl K, %eax
  843. subl KKK, %eax
  844. leal (,%eax, SIZE), %eax
  845. leal (AO, %eax, 1), AO
  846. leal (BO, %eax, 2), BO
  847. #endif
  848. #if defined(TRMMKERNEL) && defined(LEFT)
  849. addl $1, KK
  850. #endif
  851. ALIGN_4
  /* .L59: end of the 2-column panel: B continues from where BO stopped and C
     moves two columns on. */
  852. .L59:
  853. #if defined(TRMMKERNEL) && !defined(LEFT)
  854. addl $2, KK
  855. #endif
  856. movl BO, B
  857. leal (, LDC, 2), %eax
  858. addl %eax, C # c += 2 * ldc
  859. ALIGN_4
  /* .L60: handle a final single column if bit 0 of N is set. */
  860. .L60:
  861. testl $1, N
  862. je .L999
  863. #if defined(TRMMKERNEL) && defined(LEFT)
  864. movl OFFSET, %eax
  865. movl %eax, KK
  866. #endif
  867. movl C, CO # coffset = c
  868. movl A, AO # aoffset = a
  869. movl M, I
  870. sarl $1, I # i = (m >> 1): number of 2-row blocks
  871. jle .L80
  872. ALIGN_4
  /* .L71: set up one 2x1 tile of C (two rows, one column). Here the B element
     is the one that gets broadcast and A supplies the packed pair. */
  873. .L71:
  874. #if !defined(TRMMKERNEL) || \
  875. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  876. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  877. movl B, BO
  878. #else
  /* Skip the first KK k-steps: 2 elements of A and 1 element of B per step. */
  879. movl KK, %eax
  880. leal (, %eax, SIZE), %eax
  881. leal (AO, %eax, 2), AO
  882. leal (B, %eax, 1), BO
  883. #endif
  884. movddup -16 * SIZE(BO), %xmm0
  885. pxor %xmm4, %xmm4
  886. pxor %xmm5, %xmm5
  887. pxor %xmm6, %xmm6
  888. pxor %xmm7, %xmm7
  889. prefetchw 1 * SIZE(CO)
  /* eax = number of k-steps; the TRMM variants also store it in KKK. */
  890. #ifndef TRMMKERNEL
  891. movl K, %eax
  892. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  893. movl K, %eax
  894. subl KK, %eax
  895. movl %eax, KKK
  896. #else
  897. movl KK, %eax
  898. #ifdef LEFT
  899. addl $2, %eax
  900. #else
  901. addl $1, %eax
  902. #endif
  903. movl %eax, KKK
  904. #endif
  905. sarl $3, %eax # eax = count / 8 (iterations of the unrolled loop)
  906. je .L75
  907. ALIGN_4
  /* .L72: 2x1 loop unrolled over 8 k-steps. Each step multiplies one
     broadcast B element by a pair of A elements and adds the result to xmm4.
     Per iteration AO advances by 16 * SIZE and BO by 8 * SIZE. */
  908. .L72:
  909. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  910. mulpd -16 * SIZE(AO), %xmm0
  911. addpd %xmm0, %xmm4
  912. movddup -15 * SIZE(BO), %xmm0
  913. mulpd -14 * SIZE(AO), %xmm0
  914. addpd %xmm0, %xmm4
  915. movddup -14 * SIZE(BO), %xmm0
  916. mulpd -12 * SIZE(AO), %xmm0
  917. addpd %xmm0, %xmm4
  918. movddup -13 * SIZE(BO), %xmm0
  919. mulpd -10 * SIZE(AO), %xmm0
  920. addpd %xmm0, %xmm4
  921. movddup -12 * SIZE(BO), %xmm0
  922. PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
  923. mulpd -8 * SIZE(AO), %xmm0
  924. addpd %xmm0, %xmm4
  925. movddup -11 * SIZE(BO), %xmm0
  926. mulpd -6 * SIZE(AO), %xmm0
  927. addpd %xmm0, %xmm4
  928. movddup -10 * SIZE(BO), %xmm0
  929. mulpd -4 * SIZE(AO), %xmm0
  930. addpd %xmm0, %xmm4
  931. movddup -9 * SIZE(BO), %xmm0
  932. mulpd -2 * SIZE(AO), %xmm0
  933. addpd %xmm0, %xmm4
  934. movddup -8 * SIZE(BO), %xmm0
  935. subl $-16 * SIZE, AO
  936. subl $ -8 * SIZE, BO
  937. decl %eax
  938. jne .L72
  939. ALIGN_4
  /* .L75: broadcast alpha into xmm3 and run the leftover k-steps (count & 7)
     of the 2x1 tile one at a time. */
  940. .L75:
  941. movddup ALPHA, %xmm3
  942. #ifndef TRMMKERNEL
  943. movl K, %eax
  944. #else
  945. movl KKK, %eax
  946. #endif
  947. andl $7, %eax # eax = k count & 7 (leftover k-steps)
  948. BRANCH
  949. je .L78
  950. ALIGN_3
  /* .L76: one 2x1 k-step. */
  951. .L76:
  952. mulpd -16 * SIZE(AO), %xmm0
  953. addpd %xmm0, %xmm4
  954. movddup -15 * SIZE(BO), %xmm0
  955. addl $2 * SIZE, AO
  956. addl $1 * SIZE, BO
  957. decl %eax
  958. jg .L76
  959. ALIGN_4
  /* .L78: scale the 2x1 tile by alpha and write the two consecutive rows of
     the single column. Only xmm4 carries results here. Without TRMMKERNEL the
     old C values are added first. */
  960. .L78:
  961. mulpd %xmm3, %xmm4
  962. #ifndef TRMMKERNEL
  963. movsd 0 * SIZE(CO), %xmm0
  964. movhpd 1 * SIZE(CO), %xmm0
  965. addpd %xmm0, %xmm4
  966. #endif
  967. movsd %xmm4, 0 * SIZE(CO)
  968. movhpd %xmm4, 1 * SIZE(CO)
  969. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  970. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  /* Advance AO / BO over the K - KKK k-steps that were not processed. */
  971. movl K, %eax
  972. subl KKK, %eax
  973. leal (,%eax, SIZE), %eax
  974. leal (AO, %eax, 2), AO
  975. leal (BO, %eax, 1), BO
  976. #endif
  977. #if defined(TRMMKERNEL) && defined(LEFT)
  978. addl $2, KK
  979. #endif
  980. addl $2 * SIZE, CO # coffset += 2
  981. decl I # i --
  982. jg .L71
  983. ALIGN_4
  /* .L80: leftover single row (m odd) of the single column: a 1x1 tile, i.e.
     a dot product over k. */
  984. .L80:
  985. movl M, I
  986. testl $1, I # test (m & 1); I itself is unchanged
  987. jle .L999 # taken when (m & 1) == 0
  988. #if !defined(TRMMKERNEL) || \
  989. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  990. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  991. movl B, BO
  992. #else
  /* Skip the first KK k-steps: 1 element of A and 1 element of B per step. */
  993. movl KK, %eax
  994. leal (, %eax, SIZE), %eax
  995. leal (AO, %eax, 1), AO
  996. leal (B, %eax, 1), BO
  997. #endif
  /* NOTE(review): this is the only packed load written as movaps. Every other
     one uses movapd, which is #defined to the unaligned movups at the top of
     the file. movaps requires a 16-byte aligned address, and the TRMM branch
     above can move AO by a single element. Confirm AO is always aligned here
     or consider movapd for consistency. */
  998. movaps -16 * SIZE(AO), %xmm0
  999. pxor %xmm4, %xmm4
  1000. pxor %xmm5, %xmm5
  1001. pxor %xmm6, %xmm6
  1002. pxor %xmm7, %xmm7
  /* eax = number of k-steps; the TRMM variants also store it in KKK. Both
     arms of the inner LEFT test add 1 here (1 row and 1 column). */
  1003. #ifndef TRMMKERNEL
  1004. movl K, %eax
  1005. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1006. movl K, %eax
  1007. subl KK, %eax
  1008. movl %eax, KKK
  1009. #else
  1010. movl KK, %eax
  1011. #ifdef LEFT
  1012. addl $1, %eax
  1013. #else
  1014. addl $1, %eax
  1015. #endif
  1016. movl %eax, KKK
  1017. #endif
  1018. sarl $3, %eax # eax = count / 8 (iterations of the unrolled loop)
  1019. je .L85
  1020. ALIGN_4
  /* .L82: 1x1 loop over 8 k-steps per iteration. Each packed multiply covers
     two k-steps (two A elements times two B elements); the four products go
     to separate accumulators xmm4-xmm7, which are summed at .L88. AO and BO
     both advance by 8 * SIZE per iteration. */
  1021. .L82:
  1022. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1023. mulpd -16 * SIZE(BO), %xmm0
  1024. addpd %xmm0, %xmm4
  1025. movapd -14 * SIZE(AO), %xmm0
  1026. mulpd -14 * SIZE(BO), %xmm0
  1027. addpd %xmm0, %xmm5
  1028. movapd -12 * SIZE(AO), %xmm0
  1029. mulpd -12 * SIZE(BO), %xmm0
  1030. addpd %xmm0, %xmm6
  1031. movapd -10 * SIZE(AO), %xmm0
  1032. mulpd -10 * SIZE(BO), %xmm0
  1033. addpd %xmm0, %xmm7
  1034. movapd -8 * SIZE(AO), %xmm0
  1035. subl $-8 * SIZE, AO
  1036. subl $-8 * SIZE, BO
  1037. decl %eax
  1038. jne .L82
  1039. ALIGN_4
  /* .L85: broadcast alpha into xmm3 and run the leftover k-steps (count & 7)
     of the 1x1 tile with scalar arithmetic on the low half of xmm0 / xmm4. */
  1040. .L85:
  1041. movddup ALPHA, %xmm3
  1042. #ifndef TRMMKERNEL
  1043. movl K, %eax
  1044. #else
  1045. movl KKK, %eax
  1046. #endif
  1047. andl $7, %eax # eax = k count & 7 (leftover k-steps)
  1048. BRANCH
  1049. je .L88
  /* .L86: one scalar k-step. movsd is remapped to movlps in this file, so the
     reload of xmm0 replaces only its low 64 bits. */
  1050. .L86:
  1051. mulsd -16 * SIZE(BO), %xmm0
  1052. addsd %xmm0, %xmm4
  1053. movsd -15 * SIZE(AO), %xmm0
  1054. addl $1 * SIZE, AO
  1055. addl $1 * SIZE, BO
  1056. decl %eax
  1057. jg .L86
  1058. ALIGN_4
  /* .L88: reduce the four packed accumulators to one value (vertical adds,
     then a horizontal add of the two halves), scale by alpha and write the
     single element of C. Without TRMMKERNEL the old C value is added first. */
  1059. .L88:
  1060. addpd %xmm5, %xmm4
  1061. addpd %xmm7, %xmm6
  1062. addpd %xmm6, %xmm4
  1063. haddpd %xmm4, %xmm4
  1064. mulsd %xmm3, %xmm4
  1065. #ifndef TRMMKERNEL
  1066. movsd 0 * SIZE(CO), %xmm0
  1067. addsd %xmm0, %xmm4
  1068. #endif
  1069. movsd %xmm4, 0 * SIZE(CO)
  1070. ALIGN_4
  /* .L999: common exit: restore the registers saved by the prologue (in
     reverse push order), release the ARGS-byte local area and return. */
  1071. .L999:
  1072. popl %ebx
  1073. popl %esi
  1074. popl %edi
  1075. popl %ebp
  1076. addl $ARGS, %esp
  1077. ret
  1078. EPILOGUE