You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ztrsm_kernel_LN_2x1_atom.S 18 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Register aliases used throughout the kernel. */
  /* M, N, K: problem dimensions; A, B, C: base pointers of the three operands. */
  40. #define M %rdi
  41. #define N %rsi
  42. #define K %rdx
  43. #define A %rcx
  44. #define B %r8
  45. #define C %r9
  /* LDC: stride between columns of C; the prologue loads it from the stack and scales it by 2^ZBASE_SHIFT. */
  46. #define LDC %r10
  /* I counts remaining 2-row tiles, J counts remaining columns. */
  47. #define I %r11
  48. #define J %r12
  /* AO / BO: moving read pointers into A and B; CO1: write pointer into the current column of C. */
  49. #define AO %r13
  50. #define BO %r14
  51. #define CO1 %r15
  /* BB: pointer used only as a prefetch address; KK: running offset counter. */
  52. #define BB %rbx
  53. #define KK %rbp
  /* Stack frame layout. The prologue subtracts STACKSIZE from %rsp, so the OLD_* operands */
  /* address caller-provided stack arguments above the frame, while OFFSET, KKK and AORIG */
  /* are local slots inside it. KKK is not referenced by the code visible in this file. */
  54. #ifndef WINDOWS_ABI
  55. #define STACKSIZE 128
  56. #define OLD_LDC 8 + STACKSIZE(%rsp)
  57. #define OLD_OFFSET 16 + STACKSIZE(%rsp)
  58. #define OFFSET 48(%rsp)
  59. #define KKK 56(%rsp)
  60. #define AORIG 64(%rsp)
  61. #else
  /* Windows variant: a larger frame because %rdi, %rsi and %xmm6-%xmm15 are saved as well, */
  /* and A, B, C arrive on the stack. */
  62. #define STACKSIZE 256
  63. #define OLD_A 48 + STACKSIZE(%rsp)
  64. #define OLD_B 56 + STACKSIZE(%rsp)
  65. #define OLD_C 64 + STACKSIZE(%rsp)
  66. #define OLD_LDC 72 + STACKSIZE(%rsp)
  67. #define OLD_OFFSET 80 + STACKSIZE(%rsp)
  68. #define OFFSET 224(%rsp)
  69. #define KKK 232(%rsp)
  70. #define AORIG 240(%rsp)
  71. #endif
  /* Prefetch instruction and look-ahead distance (in units of SIZE) used by the inner loops. */
  72. #define PREFETCH prefetcht0
  73. #define PREFETCHSIZE (8 * 8 + 3)
  /* ADDSD1..ADDSD4 choose add or subtract for the four scalar partial products that the */
  /* inner loops accumulate per element: */
  /*   ADDSD1: AO[0]*BO[0]   ADDSD2: AO[0]*BO[1]   ADDSD3: AO[1]*BO[0]   ADDSD4: AO[1]*BO[1] */
  /* After the loops the ADDSD1 and ADDSD4 sums are added together, as are the ADDSD2 and */
  /* ADDSD3 sums. The sign pattern matches a complex multiply with optional conjugation; */
  /* presumably AO/BO hold interleaved (real, imaginary) pairs - confirm against the packing code. */
  74. #ifndef CONJ
  75. #define ADDSD1 addsd
  76. #define ADDSD2 addsd
  77. #define ADDSD3 addsd
  78. #define ADDSD4 subsd
  79. #elif defined(LN) || defined(LT)
  80. #define ADDSD1 addsd
  81. #define ADDSD2 addsd
  82. #define ADDSD3 subsd
  83. #define ADDSD4 addsd
  84. #else
  85. #define ADDSD1 addsd
  86. #define ADDSD2 subsd
  87. #define ADDSD3 addsd
  88. #define ADDSD4 addsd
  89. #endif
  /* Kernel entry. Builds the stack frame, saves the callee-saved registers the kernel */
  /* uses, loads the stack-passed arguments and biases A/B/C/KK for the selected variant */
  /* (exactly one of LN, LT, RN, RT is expected to be defined by the build). */
  90. PROLOGUE
  91. PROFCODE
  92. subq $STACKSIZE, %rsp
  /* Save %rbx (BB), %rbp (KK), %r12 (J), %r13 (AO), %r14 (BO), %r15 (CO1). */
  93. movq %rbx, 0(%rsp)
  94. movq %rbp, 8(%rsp)
  95. movq %r12, 16(%rsp)
  96. movq %r13, 24(%rsp)
  97. movq %r14, 32(%rsp)
  98. movq %r15, 40(%rsp)
  99. #ifdef WINDOWS_ABI
  /* Windows: also save %rdi, %rsi and %xmm6-%xmm15, all of which the kernel overwrites. */
  100. movq %rdi, 48(%rsp)
  101. movq %rsi, 56(%rsp)
  102. movups %xmm6, 64(%rsp)
  103. movups %xmm7, 80(%rsp)
  104. movups %xmm8, 96(%rsp)
  105. movups %xmm9, 112(%rsp)
  106. movups %xmm10, 128(%rsp)
  107. movups %xmm11, 144(%rsp)
  108. movups %xmm12, 160(%rsp)
  109. movups %xmm13, 176(%rsp)
  110. movups %xmm14, 192(%rsp)
  111. movups %xmm15, 208(%rsp)
  /* Move the register arguments (ARG1..ARG3, defined outside this file) into M, N, K, */
  /* then fetch A, B, C and LDC from the caller's stack area. */
  112. movq ARG1, M
  113. movq ARG2, N
  114. movq ARG3, K
  115. movq OLD_A, A
  116. movq OLD_B, B
  117. movq OLD_C, C
  118. movq OLD_LDC, LDC
  119. #endif
  /* Load LDC and the offset argument from the stack (on WINDOWS_ABI the LDC load repeats */
  /* the one just above), and keep a copy of the offset in the OFFSET slot. */
  120. movq OLD_LDC, LDC
  121. movq OLD_OFFSET, KK
  122. movq KK, OFFSET
  /* LDC <<= ZBASE_SHIFT. ZBASE_SHIFT is defined outside this file; presumably this */
  /* converts the stride from elements to bytes. */
  123. salq $ZBASE_SHIFT, LDC
  124. #ifdef LN
  /* LN: C += M << ZBASE_SHIFT and A += (M << ZBASE_SHIFT) * K; the tile code then steps */
  /* backwards from these end positions. */
  125. movq M, %rax
  126. salq $ZBASE_SHIFT, %rax
  127. addq %rax, C
  128. imulq K, %rax
  129. addq %rax, A
  130. #endif
  131. #ifdef RT
  /* RT: B += (N << ZBASE_SHIFT) * K and C += N * LDC; the column loop steps both back */
  /* by one column per iteration. */
  132. movq N, %rax
  133. salq $ZBASE_SHIFT, %rax
  134. imulq K, %rax
  135. addq %rax, B
  136. movq N, %rax
  137. imulq LDC, %rax
  138. addq %rax, C
  139. #endif
  140. #ifdef RN
  /* RN: KK = -offset. */
  141. negq KK
  142. #endif
  143. #ifdef RT
  /* RT: KK = N - offset. */
  144. movq N, KK
  145. subq OFFSET, KK
  146. #endif
  /* J = N; skip straight to the epilogue when N <= 0. */
  147. movq N, J
  148. testq N, N
  149. jle .L999
  150. ALIGN_4
  /* .L01: head of the loop over columns (one column of C per iteration, J counts down). */
  /* Sets up the per-column pointers, then, if M is odd, prepares the single-row tile. */
  151. .L01:
  /* LT/RN walk A directly through AO; LN/RT keep the base in AORIG and derive AO per tile. */
  152. #if defined(LT) || defined(RN)
  153. movq A, AO
  154. #else
  155. movq A, AORIG
  156. #endif
  157. #ifdef RT
  /* RT: step B back by K << ZBASE_SHIFT and C back by one column. */
  158. movq K, %rax
  159. salq $ZBASE_SHIFT, %rax
  160. subq %rax, B
  161. subq LDC, C
  162. #endif
  /* CO1 = current column of C; every variant except RT then advances C to the next column. */
  163. movq C, CO1
  164. #ifndef RT
  165. addq LDC, C
  166. #endif
  167. #ifdef LN
  /* LN: KK = offset + M (decremented as tiles are processed). */
  168. movq OFFSET, KK
  169. addq M, KK
  170. #endif
  171. #ifdef LT
  /* LT: KK = offset (incremented as tiles are processed). */
  172. movq OFFSET, KK
  173. #endif
  /* BB = B + (K << ZBASE_SHIFT); only used as a prefetch address in the 2-row tile setup. */
  174. movq K, %rax
  175. salq $ZBASE_SHIFT, %rax
  176. leaq (B, %rax), BB
  /* testq clears SF and OF, so jle is taken exactly when ZF is set, i.e. when M is even: */
  /* no single-row tile to process. */
  177. testq $1, M
  178. jle .L20
  /* ---- single-row tile (M odd) ---- */
  179. #ifdef LN
  /* LN: step AORIG back by K << ZBASE_SHIFT. */
  180. movq K, %rax
  181. salq $0 + ZBASE_SHIFT, %rax
  182. subq %rax, AORIG
  183. #endif
  184. #if defined(LN) || defined(RT)
  /* LN/RT: AO = AORIG + KK * SIZE * 2, BO = B + KK * SIZE * 2. */
  185. movq KK, %rax
  186. leaq (, %rax, SIZE), %rax
  187. movq AORIG, AO
  188. leaq (AO, %rax, 2), AO
  189. leaq (B, %rax, 2), BO
  190. #else
  191. movq B, BO
  192. #endif
  /* Preload the first operands (AO[0..3], BO[0..1]) and clear the accumulators */
  /* xmm8-xmm11 plus the pending-product registers xmm2 and xmm6. The loads happen */
  /* before the trip count is checked. */
  193. movsd 0 * SIZE(AO), %xmm0
  194. xorps %xmm2, %xmm2
  195. movsd 1 * SIZE(AO), %xmm4
  196. xorps %xmm5, %xmm5
  197. movsd 2 * SIZE(AO), %xmm5
  198. xorps %xmm6, %xmm6
  199. movsd 3 * SIZE(AO), %xmm7
  200. movsd 0 * SIZE(BO), %xmm1
  201. xorps %xmm8, %xmm8
  202. xorps %xmm9, %xmm9
  203. movsd 1 * SIZE(BO), %xmm3
  204. xorps %xmm10, %xmm10
  205. xorps %xmm11, %xmm11
  /* Trip count: KK for LT/RN, K - KK for LN/RT. */
  206. #if defined(LT) || defined(RN)
  207. movq KK, %rax
  208. #else
  209. movq K, %rax
  210. subq KK, %rax
  211. #endif
  /* rax = count / 4 passes of the unrolled loop; go to the remainder code if that is zero. */
  212. sarq $2, %rax
  213. je .L25
  214. ALIGN_4
  /* .L22: single-row tile, main accumulation loop, unrolled four steps per pass. */
  /* Each step multiplies one (AO[0], AO[1]) pair by one (BO[0], BO[1]) pair and folds */
  /* the four products into xmm8..xmm11 through ADDSD1..ADDSD4. The ADDSD2/ADDSD4 */
  /* products are computed one step late: they sit in xmm2 / xmm6 and are added at the */
  /* start of the following step (or after the loop). Operands for the next step are */
  /* loaded ahead of use. rax holds the number of passes remaining. */
  215. .L22:
  216. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  /* step 1: operands preloaded in xmm0/xmm4 (AO) and xmm1/xmm3 (BO) */
  217. ADDSD2 %xmm2, %xmm9
  218. movaps %xmm0, %xmm2
  219. mulsd %xmm1, %xmm0
  220. ADDSD4 %xmm6, %xmm11
  221. movaps %xmm4, %xmm6
  222. mulsd %xmm1, %xmm4
  223. movsd 2 * SIZE(BO), %xmm1
  224. ADDSD1 %xmm0, %xmm8
  225. movsd 4 * SIZE(AO), %xmm0
  226. mulsd %xmm3, %xmm2
  227. ADDSD3 %xmm4, %xmm10
  228. movsd 5 * SIZE(AO), %xmm4
  229. mulsd %xmm3, %xmm6
  230. movsd 3 * SIZE(BO), %xmm3
  /* step 2: AO operands in xmm5/xmm7 */
  231. ADDSD2 %xmm2, %xmm9
  232. movaps %xmm5, %xmm2
  233. mulsd %xmm1, %xmm5
  234. ADDSD4 %xmm6, %xmm11
  235. movaps %xmm7, %xmm6
  236. mulsd %xmm1, %xmm7
  237. movsd 4 * SIZE(BO), %xmm1
  238. ADDSD1 %xmm5, %xmm8
  239. movsd 6 * SIZE(AO), %xmm5
  240. mulsd %xmm3, %xmm2
  241. ADDSD3 %xmm7, %xmm10
  242. movsd 7 * SIZE(AO), %xmm7
  243. mulsd %xmm3, %xmm6
  244. movsd 5 * SIZE(BO), %xmm3
  /* step 3: AO operands in xmm0/xmm4 */
  245. ADDSD2 %xmm2, %xmm9
  246. movaps %xmm0, %xmm2
  247. mulsd %xmm1, %xmm0
  248. ADDSD4 %xmm6, %xmm11
  249. movaps %xmm4, %xmm6
  250. mulsd %xmm1, %xmm4
  251. movsd 6 * SIZE(BO), %xmm1
  252. ADDSD1 %xmm0, %xmm8
  253. movsd 8 * SIZE(AO), %xmm0
  254. mulsd %xmm3, %xmm2
  255. ADDSD3 %xmm4, %xmm10
  256. movsd 9 * SIZE(AO), %xmm4
  257. mulsd %xmm3, %xmm6
  258. movsd 7 * SIZE(BO), %xmm3
  /* step 4: AO operands in xmm5/xmm7 */
  259. ADDSD2 %xmm2, %xmm9
  260. movaps %xmm5, %xmm2
  261. mulsd %xmm1, %xmm5
  262. ADDSD4 %xmm6, %xmm11
  263. movaps %xmm7, %xmm6
  264. mulsd %xmm1, %xmm7
  265. movsd 8 * SIZE(BO), %xmm1
  266. ADDSD1 %xmm5, %xmm8
  267. movsd 10 * SIZE(AO), %xmm5
  268. mulsd %xmm3, %xmm2
  269. ADDSD3 %xmm7, %xmm10
  270. movsd 11 * SIZE(AO), %xmm7
  271. mulsd %xmm3, %xmm6
  272. movsd 9 * SIZE(BO), %xmm3
  /* Both pointers advance by 8 * SIZE per pass (4 steps x 2 values each). */
  273. addq $8 * SIZE, AO
  274. addq $8 * SIZE, BO
  275. decq %rax
  276. jne .L22
  277. ALIGN_4
  /* .L25: single-row tile, remainder handling. Recomputes the trip count (KK for */
  /* LT/RN, K - KK for LN/RT) and keeps only count & 3, the steps left over by the */
  /* 4x-unrolled loop. */
  278. .L25:
  279. #if defined(LT) || defined(RN)
  280. movq KK, %rax
  281. #else
  282. movq K, %rax
  283. subq KK, %rax
  284. #endif
  285. andq $3, %rax
  /* BRANCH is a macro defined outside this file. */
  286. BRANCH
  287. BRANCH
  /* No leftover steps: go fold the accumulators. */
  288. je .L29
  289. ALIGN_4
  /* .L26: one accumulation step per pass, same product/sign scheme as the unrolled */
  /* loop; AO and BO each advance by 2 * SIZE. */
  290. .L26:
  291. ADDSD2 %xmm2, %xmm9
  292. movaps %xmm0, %xmm2
  293. mulsd %xmm1, %xmm0
  294. ADDSD4 %xmm6, %xmm11
  295. movaps %xmm4, %xmm6
  296. mulsd %xmm1, %xmm4
  297. movsd 2 * SIZE(BO), %xmm1
  298. mulsd %xmm3, %xmm2
  299. ADDSD1 %xmm0, %xmm8
  300. movsd 2 * SIZE(AO), %xmm0
  301. mulsd %xmm3, %xmm6
  302. movsd 3 * SIZE(BO), %xmm3
  303. ADDSD3 %xmm4, %xmm10
  304. movsd 3 * SIZE(AO), %xmm4
  305. addq $2 * SIZE, AO
  306. addq $2 * SIZE, BO
  307. decq %rax
  308. BRANCH
  309. jg .L26
  310. ALIGN_4
  /* .L29: single-row tile, solve and store. Folds the accumulated products into one */
  /* two-component value, subtracts it from the right-hand side, multiplies by the */
  /* two-component factor taken from the triangular operand, and writes the result both */
  /* to C and back into the packed buffer it was read from. */
  311. .L29:
  /* Add the two products still pending from the last step, then combine: */
  /* xmm8 = (ADDSD1 sum) + (ADDSD4 sum), xmm10 = (ADDSD3 sum) + (ADDSD2 sum). */
  312. ADDSD2 %xmm2, %xmm9
  313. ADDSD4 %xmm6, %xmm11
  314. addsd %xmm11, %xmm8
  315. addsd %xmm9, %xmm10
  316. #if defined(LN) || defined(RT)
  /* LN/RT: reposition AO and BO at index KK - 1 (both preprocessor branches below */
  /* subtract the same constant for this tile size). */
  317. movq KK, %rax
  318. #ifdef LN
  319. subq $1, %rax
  320. #else
  321. subq $1, %rax
  322. #endif
  323. leaq (, %rax, SIZE), %rax
  324. movq AORIG, AO
  325. leaq (AO, %rax, 2), AO
  326. leaq (B, %rax, 2), BO
  327. #endif
  /* Load the right-hand side: from BO for LN/LT, from AO for RN/RT. */
  328. #if defined(LN) || defined(LT)
  329. movsd 0 * SIZE(BO), %xmm0
  330. movsd 1 * SIZE(BO), %xmm1
  331. #else
  332. movsd 0 * SIZE(AO), %xmm0
  333. movsd 1 * SIZE(AO), %xmm1
  334. #endif
  /* rhs -= accumulated sum. */
  335. subsd %xmm8, %xmm0
  336. subsd %xmm10, %xmm1
  337. #if defined(LN) || defined(LT)
  /* Multiply (xmm0, xmm1) by the pair (AO[0], AO[1]) using four scalar products. */
  /* NOTE(review): the factor is multiplied rather than divided, so the diagonal entry */
  /* is presumably stored pre-inverted by the packing routine - confirm. */
  338. movsd 0 * SIZE(AO), %xmm6
  339. movaps %xmm0, %xmm5
  340. movsd 1 * SIZE(AO), %xmm7
  341. movaps %xmm1, %xmm4
  342. mulsd %xmm6, %xmm0
  343. mulsd %xmm6, %xmm1
  344. mulsd %xmm7, %xmm5
  345. mulsd %xmm7, %xmm4
  346. ADDSD4 %xmm4, %xmm0
  347. ADDSD3 %xmm5, %xmm1
  348. #endif
  349. #if defined(RN) || defined(RT)
  /* Same multiplication for the right-side variants, with the factor taken from */
  /* (BO[0], BO[1]). */
  350. movsd 0 * SIZE(BO), %xmm8
  351. movaps %xmm0, %xmm5
  352. movsd 1 * SIZE(BO), %xmm9
  353. movaps %xmm1, %xmm4
  354. mulsd %xmm8, %xmm0
  355. mulsd %xmm8, %xmm1
  356. mulsd %xmm9, %xmm5
  357. mulsd %xmm9, %xmm4
  358. ADDSD4 %xmm4, %xmm0
  359. ADDSD2 %xmm5, %xmm1
  360. #endif
  /* LN writes C backwards: step CO1 back before the store instead of forward after it. */
  361. #ifdef LN
  362. subq $2 * SIZE, CO1
  363. #endif
  364. movsd %xmm0, 0 * SIZE(CO1)
  365. movsd %xmm1, 1 * SIZE(CO1)
  /* Write the solved value back to the buffer the right-hand side came from. */
  366. #if defined(LN) || defined(LT)
  367. movsd %xmm0, 0 * SIZE(BO)
  368. movsd %xmm1, 1 * SIZE(BO)
  369. #else
  370. movsd %xmm0, 0 * SIZE(AO)
  371. movsd %xmm1, 1 * SIZE(AO)
  372. #endif
  373. #ifndef LN
  374. addq $2 * SIZE, CO1
  375. #endif
  376. #if defined(LT) || defined(RN)
  /* LT/RN: skip the remaining K - KK steps so AO and BO land past this tile's panel. */
  377. movq K, %rax
  378. subq KK, %rax
  379. leaq (,%rax, SIZE), %rax
  380. leaq (AO, %rax, 2), AO
  381. leaq (BO, %rax, 2), BO
  382. #endif
  /* One row consumed: adjust KK for the left-side variants. */
  383. #ifdef LN
  384. subq $1, KK
  385. #endif
  386. #ifdef LT
  387. addq $1, KK
  388. #endif
  389. #ifdef RT
  /* RT: advance AORIG by K << ZBASE_SHIFT to the next tile's panel. */
  390. movq K, %rax
  391. salq $0 + ZBASE_SHIFT, %rax
  392. addq %rax, AORIG
  393. #endif
  394. ALIGN_4
  /* .L20: start of the 2-row tiles. I = M >> 1 tiles; skip to the column tail if none. */
  395. .L20:
  396. movq M, I
  397. sarq $1, I
  398. jle .L99
  399. ALIGN_4
  /* .L10: per-tile setup for a 2-row tile. */
  400. .L10:
  401. #ifdef LN
  /* LN: step AORIG back by K << (1 + ZBASE_SHIFT), i.e. twice the single-row amount. */
  402. movq K, %rax
  403. salq $1 + ZBASE_SHIFT, %rax
  404. subq %rax, AORIG
  405. #endif
  406. #if defined(LN) || defined(RT)
  /* LN/RT: AO = AORIG + KK * SIZE * 4, BO = B + KK * SIZE * 2. */
  407. movq KK, %rax
  408. leaq (, %rax, SIZE), %rax
  409. movq AORIG, AO
  410. leaq (AO, %rax, 4), AO
  411. leaq (B, %rax, 2), BO
  412. #else
  413. movq B, BO
  414. #endif
  /* Prefetch through BB and advance it by 8 * SIZE (subtracting a negative constant). */
  415. prefetcht0 0 * SIZE(BB)
  416. subq $-8 * SIZE, BB
  /* Preload AO[0..2] and BO[0..1]; clear the eight accumulators xmm8-xmm15 and the */
  /* pending-product registers xmm2, xmm6, xmm7. */
  417. movsd 0 * SIZE(AO), %xmm0
  418. xorps %xmm2, %xmm2
  419. movsd 1 * SIZE(AO), %xmm4
  420. xorps %xmm5, %xmm5
  421. movsd 2 * SIZE(AO), %xmm5
  422. xorps %xmm6, %xmm6
  423. xorps %xmm7, %xmm7
  424. movsd 0 * SIZE(BO), %xmm1
  425. xorps %xmm8, %xmm8
  426. xorps %xmm9, %xmm9
  427. movsd 1 * SIZE(BO), %xmm3
  428. xorps %xmm10, %xmm10
  429. xorps %xmm11, %xmm11
  /* Prefetch the destination in C ahead of the store. */
  430. prefetcht0 3 * SIZE(CO1)
  431. xorps %xmm12, %xmm12
  432. xorps %xmm13, %xmm13
  433. xorps %xmm14, %xmm14
  434. xorps %xmm15, %xmm15
  /* Trip count: KK for LT/RN, K - KK for LN/RT. */
  435. #if defined(LT) || defined(RN)
  436. movq KK, %rax
  437. #else
  438. movq K, %rax
  439. subq KK, %rax
  440. #endif
  /* rax = count / 4 passes of the unrolled loop; go to the remainder code if that is zero. */
  441. sarq $2, %rax
  442. je .L15
  443. ALIGN_4
  /* .L12: 2-row tile, main accumulation loop, unrolled four steps per pass. */
  /* Each step reads four values from AO (two pairs) and two from BO. The first pair's */
  /* products accumulate into xmm8..xmm11 and the second pair's into xmm12..xmm15, each */
  /* group using ADDSD1..ADDSD4 in that order. The loop is software-pipelined: the */
  /* products left in xmm2, xmm7 and xmm6 at the end of a step are added at the start */
  /* of the next one (or after the loop), and operands are loaded ahead of use. */
  /* Per pass AO advances by 16 * SIZE and BO by 8 * SIZE; rax counts passes remaining. */
  444. .L12:
  /* step 1 */
  445. ADDSD2 %xmm2, %xmm13
  446. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  447. movaps %xmm0, %xmm2
  448. mulsd %xmm1, %xmm0
  449. ADDSD3 %xmm7, %xmm14
  450. movsd 3 * SIZE(AO), %xmm7
  451. mulsd %xmm3, %xmm2
  452. ADDSD4 %xmm6, %xmm15
  453. PREFETCH ((PREFETCHSIZE) >> 1 + 0) * SIZE(BO)
  454. movaps %xmm4, %xmm6
  455. mulsd %xmm1, %xmm4
  456. ADDSD1 %xmm0, %xmm8
  457. movsd 4 * SIZE(AO), %xmm0
  458. mulsd %xmm3, %xmm6
  459. ADDSD2 %xmm2, %xmm9
  460. movaps %xmm5, %xmm2
  461. mulsd %xmm1, %xmm5
  462. ADDSD3 %xmm4, %xmm10
  463. movsd 5 * SIZE(AO), %xmm4
  464. mulsd %xmm3, %xmm2
  465. ADDSD4 %xmm6, %xmm11
  466. movaps %xmm7, %xmm6
  467. mulsd %xmm1, %xmm7
  468. movsd 2 * SIZE(BO), %xmm1
  469. ADDSD1 %xmm5, %xmm12
  470. movsd 6 * SIZE(AO), %xmm5
  471. mulsd %xmm3, %xmm6
  472. movsd 3 * SIZE(BO), %xmm3
  /* step 2 */
  473. ADDSD2 %xmm2, %xmm13
  474. movaps %xmm0, %xmm2
  475. mulsd %xmm1, %xmm0
  476. ADDSD3 %xmm7, %xmm14
  477. movsd 7 * SIZE(AO), %xmm7
  478. mulsd %xmm3, %xmm2
  479. ADDSD4 %xmm6, %xmm15
  480. movaps %xmm4, %xmm6
  481. mulsd %xmm1, %xmm4
  482. ADDSD1 %xmm0, %xmm8
  483. movsd 8 * SIZE(AO), %xmm0
  484. mulsd %xmm3, %xmm6
  485. ADDSD2 %xmm2, %xmm9
  486. movaps %xmm5, %xmm2
  487. mulsd %xmm1, %xmm5
  488. ADDSD3 %xmm4, %xmm10
  489. movsd 9 * SIZE(AO), %xmm4
  490. mulsd %xmm3, %xmm2
  491. ADDSD4 %xmm6, %xmm11
  492. movaps %xmm7, %xmm6
  493. mulsd %xmm1, %xmm7
  494. movsd 4 * SIZE(BO), %xmm1
  495. ADDSD1 %xmm5, %xmm12
  496. movsd 10 * SIZE(AO), %xmm5
  497. mulsd %xmm3, %xmm6
  498. movsd 5 * SIZE(BO), %xmm3
  /* step 3 */
  499. ADDSD2 %xmm2, %xmm13
  500. PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
  501. movaps %xmm0, %xmm2
  502. mulsd %xmm1, %xmm0
  503. ADDSD3 %xmm7, %xmm14
  504. movsd 11 * SIZE(AO), %xmm7
  505. mulsd %xmm3, %xmm2
  506. ADDSD4 %xmm6, %xmm15
  507. movaps %xmm4, %xmm6
  508. mulsd %xmm1, %xmm4
  509. ADDSD1 %xmm0, %xmm8
  510. movsd 12 * SIZE(AO), %xmm0
  511. mulsd %xmm3, %xmm6
  512. ADDSD2 %xmm2, %xmm9
  513. movaps %xmm5, %xmm2
  514. mulsd %xmm1, %xmm5
  515. ADDSD3 %xmm4, %xmm10
  516. movsd 13 * SIZE(AO), %xmm4
  517. mulsd %xmm3, %xmm2
  518. ADDSD4 %xmm6, %xmm11
  519. movaps %xmm7, %xmm6
  520. mulsd %xmm1, %xmm7
  521. movsd 6 * SIZE(BO), %xmm1
  522. ADDSD1 %xmm5, %xmm12
  523. movsd 14 * SIZE(AO), %xmm5
  524. mulsd %xmm3, %xmm6
  525. movsd 7 * SIZE(BO), %xmm3
  /* step 4; the pointer updates and the counter decrement are interleaved with it */
  526. ADDSD2 %xmm2, %xmm13
  527. movaps %xmm0, %xmm2
  528. mulsd %xmm1, %xmm0
  529. ADDSD3 %xmm7, %xmm14
  530. movsd 15 * SIZE(AO), %xmm7
  531. mulsd %xmm3, %xmm2
  /* AO += 16 * SIZE; the AO loads below already address the next pass's data. */
  532. subq $-16 * SIZE, AO
  533. ADDSD4 %xmm6, %xmm15
  534. movaps %xmm4, %xmm6
  535. mulsd %xmm1, %xmm4
  536. ADDSD1 %xmm0, %xmm8
  537. movsd 0 * SIZE(AO), %xmm0
  538. mulsd %xmm3, %xmm6
  539. ADDSD2 %xmm2, %xmm9
  540. movaps %xmm5, %xmm2
  541. mulsd %xmm1, %xmm5
  /* BO += 8 * SIZE; the BO loads below likewise address the next pass's data. */
  542. addq $ 8 * SIZE, BO
  543. ADDSD3 %xmm4, %xmm10
  544. movsd 1 * SIZE(AO), %xmm4
  545. mulsd %xmm3, %xmm2
  /* decq sets ZF here; the SSE moves and arithmetic that follow do not modify EFLAGS, */
  /* so the jne at the bottom still tests this decrement. */
  546. decq %rax
  547. ADDSD4 %xmm6, %xmm11
  548. movaps %xmm7, %xmm6
  549. mulsd %xmm1, %xmm7
  550. movsd 0 * SIZE(BO), %xmm1
  551. ADDSD1 %xmm5, %xmm12
  552. movsd 2 * SIZE(AO), %xmm5
  553. mulsd %xmm3, %xmm6
  554. movsd 1 * SIZE(BO), %xmm3
  555. jne .L12
  556. ALIGN_4
  /* .L15: 2-row tile, remainder handling. Recomputes the trip count (KK for LT/RN, */
  /* K - KK for LN/RT) and keeps only count & 3, the steps left over by the */
  /* 4x-unrolled loop. */
  557. .L15:
  558. #if defined(LT) || defined(RN)
  559. movq KK, %rax
  560. #else
  561. movq K, %rax
  562. subq KK, %rax
  563. #endif
  564. andq $3, %rax
  /* BRANCH is a macro defined outside this file. */
  565. BRANCH
  566. BRANCH
  /* No leftover steps: go fold the accumulators. */
  567. je .L18
  568. ALIGN_4
  /* .L16: one accumulation step per pass, same scheme as one step of the unrolled */
  /* loop; AO advances by 4 * SIZE and BO by 2 * SIZE. */
  569. .L16:
  570. ADDSD2 %xmm2, %xmm13
  571. movaps %xmm0, %xmm2
  572. mulsd %xmm1, %xmm0
  573. ADDSD3 %xmm7, %xmm14
  574. movsd 3 * SIZE(AO), %xmm7
  575. mulsd %xmm3, %xmm2
  576. ADDSD4 %xmm6, %xmm15
  577. movaps %xmm4, %xmm6
  578. mulsd %xmm1, %xmm4
  579. ADDSD1 %xmm0, %xmm8
  580. movsd 4 * SIZE(AO), %xmm0
  581. mulsd %xmm3, %xmm6
  582. ADDSD2 %xmm2, %xmm9
  583. movaps %xmm5, %xmm2
  584. mulsd %xmm1, %xmm5
  585. ADDSD3 %xmm4, %xmm10
  586. movsd 5 * SIZE(AO), %xmm4
  587. mulsd %xmm3, %xmm2
  588. ADDSD4 %xmm6, %xmm11
  589. movaps %xmm7, %xmm6
  590. mulsd %xmm1, %xmm7
  591. movsd 2 * SIZE(BO), %xmm1
  592. ADDSD1 %xmm5, %xmm12
  593. movsd 6 * SIZE(AO), %xmm5
  594. mulsd %xmm3, %xmm6
  595. movsd 3 * SIZE(BO), %xmm3
  596. addq $4 * SIZE, AO
  597. addq $2 * SIZE, BO
  598. decq %rax
  599. BRANCH
  600. jg .L16
  601. ALIGN_4
  /* .L18: 2-row tile, solve and store. Folds the accumulators into two two-component */
  /* values, subtracts them from the right-hand side, applies the 2x2 triangular solve */
  /* (LN/LT) or a single scaling (RN/RT), and writes the results to C and back into the */
  /* packed buffer. Ends with the per-tile pointer/KK bookkeeping and the loop on I. */
  602. .L18:
  /* Add the products still pending from the last step, then combine per element: */
  /* first element in (xmm8, xmm10), second element in (xmm12, xmm14). */
  603. ADDSD2 %xmm2, %xmm13
  604. ADDSD3 %xmm7, %xmm14
  605. ADDSD4 %xmm6, %xmm15
  606. addsd %xmm11, %xmm8
  607. addsd %xmm9, %xmm10
  608. addsd %xmm15, %xmm12
  609. addsd %xmm13, %xmm14
  610. #if defined(LN) || defined(RT)
  /* LN/RT: reposition AO and BO; LN backs up by the tile height (2), RT by 1. */
  611. movq KK, %rax
  612. #ifdef LN
  613. subq $2, %rax
  614. #else
  615. subq $1, %rax
  616. #endif
  617. leaq (, %rax, SIZE), %rax
  618. movq AORIG, AO
  619. leaq (AO, %rax, 4), AO
  620. leaq (B, %rax, 2), BO
  621. #endif
  /* Load the four right-hand-side values: from BO for LN/LT, from AO for RN/RT. */
  622. #if defined(LN) || defined(LT)
  623. movsd 0 * SIZE(BO), %xmm0
  624. movsd 1 * SIZE(BO), %xmm1
  625. movsd 2 * SIZE(BO), %xmm2
  626. movsd 3 * SIZE(BO), %xmm3
  627. #else
  628. movsd 0 * SIZE(AO), %xmm0
  629. movsd 1 * SIZE(AO), %xmm1
  630. movsd 2 * SIZE(AO), %xmm2
  631. movsd 3 * SIZE(AO), %xmm3
  632. #endif
  /* rhs -= accumulated sums. */
  633. subsd %xmm8, %xmm0
  634. subsd %xmm10, %xmm1
  635. subsd %xmm12, %xmm2
  636. subsd %xmm14, %xmm3
  637. #ifdef LN
  /* LN: solve from the second element upwards. */
  /* 1) second element (xmm2, xmm3) *= (AO[6], AO[7]) */
  /* NOTE(review): factors are multiplied, not divided, so the diagonal entries are */
  /* presumably stored pre-inverted - confirm against the packing routine. */
  638. movsd 6 * SIZE(AO), %xmm6
  639. movsd 7 * SIZE(AO), %xmm7
  640. movaps %xmm2, %xmm5
  641. movaps %xmm3, %xmm4
  642. mulsd %xmm6, %xmm2
  643. mulsd %xmm6, %xmm3
  644. movsd 4 * SIZE(AO), %xmm6
  645. mulsd %xmm7, %xmm5
  646. mulsd %xmm7, %xmm4
  647. movsd 5 * SIZE(AO), %xmm7
  648. ADDSD4 %xmm4, %xmm2
  649. ADDSD3 %xmm5, %xmm3
  /* 2) first element (xmm0, xmm1) -= (AO[4], AO[5]) * second element */
  650. movaps %xmm2, %xmm4
  651. movaps %xmm3, %xmm5
  652. mulsd %xmm6, %xmm4
  653. mulsd %xmm7, %xmm5
  654. mulsd %xmm3, %xmm6
  655. mulsd %xmm2, %xmm7
  656. subsd %xmm4, %xmm0
  657. subsd %xmm6, %xmm1
  658. movsd 0 * SIZE(AO), %xmm6
  659. ADDSD3 %xmm5, %xmm0
  660. ADDSD4 %xmm7, %xmm1
  661. movsd 1 * SIZE(AO), %xmm7
  /* 3) first element *= (AO[0], AO[1]) */
  662. movaps %xmm0, %xmm5
  663. movaps %xmm1, %xmm4
  664. mulsd %xmm6, %xmm0
  665. mulsd %xmm6, %xmm1
  666. mulsd %xmm7, %xmm5
  667. mulsd %xmm7, %xmm4
  668. ADDSD4 %xmm4, %xmm0
  669. ADDSD3 %xmm5, %xmm1
  670. #endif
  671. #ifdef LT
  /* LT: solve from the first element downwards. */
  /* 1) first element (xmm0, xmm1) *= (AO[0], AO[1]) */
  672. movsd 0 * SIZE(AO), %xmm6
  673. movsd 1 * SIZE(AO), %xmm7
  674. movaps %xmm0, %xmm5
  675. movaps %xmm1, %xmm4
  676. mulsd %xmm6, %xmm0
  677. mulsd %xmm6, %xmm1
  678. movsd 2 * SIZE(AO), %xmm6
  679. mulsd %xmm7, %xmm5
  680. mulsd %xmm7, %xmm4
  681. movsd 3 * SIZE(AO), %xmm7
  682. ADDSD4 %xmm4, %xmm0
  683. ADDSD3 %xmm5, %xmm1
  /* 2) second element (xmm2, xmm3) -= (AO[2], AO[3]) * first element */
  684. movaps %xmm0, %xmm4
  685. movaps %xmm1, %xmm5
  686. mulsd %xmm6, %xmm4
  687. mulsd %xmm7, %xmm5
  688. mulsd %xmm1, %xmm6
  689. mulsd %xmm0, %xmm7
  690. subsd %xmm4, %xmm2
  691. subsd %xmm6, %xmm3
  692. movsd 6 * SIZE(AO), %xmm6
  693. ADDSD3 %xmm5, %xmm2
  694. ADDSD4 %xmm7, %xmm3
  695. movsd 7 * SIZE(AO), %xmm7
  /* 3) second element *= (AO[6], AO[7]) */
  696. movaps %xmm2, %xmm5
  697. movaps %xmm3, %xmm4
  698. mulsd %xmm6, %xmm2
  699. mulsd %xmm6, %xmm3
  700. mulsd %xmm7, %xmm5
  701. mulsd %xmm7, %xmm4
  702. ADDSD4 %xmm4, %xmm2
  703. ADDSD3 %xmm5, %xmm3
  704. #endif
  705. #if defined(RN) || defined(RT)
  /* RN/RT: the two elements are independent here; both are multiplied by the same */
  /* factor (BO[0], BO[1]). */
  706. movsd 0 * SIZE(BO), %xmm8
  707. movaps %xmm0, %xmm5
  708. movsd 1 * SIZE(BO), %xmm9
  709. movaps %xmm1, %xmm4
  710. movaps %xmm2, %xmm7
  711. movaps %xmm3, %xmm6
  712. mulsd %xmm8, %xmm0
  713. mulsd %xmm8, %xmm1
  714. mulsd %xmm9, %xmm5
  715. mulsd %xmm9, %xmm4
  716. ADDSD4 %xmm4, %xmm0
  717. mulsd %xmm8, %xmm2
  718. ADDSD2 %xmm5, %xmm1
  719. mulsd %xmm8, %xmm3
  720. mulsd %xmm9, %xmm7
  721. mulsd %xmm9, %xmm6
  722. ADDSD4 %xmm6, %xmm2
  723. ADDSD2 %xmm7, %xmm3
  724. #endif
  /* LN writes C backwards: step CO1 back before the store instead of forward after it. */
  725. #ifdef LN
  726. subq $4 * SIZE, CO1
  727. #endif
  728. movsd %xmm0, 0 * SIZE(CO1)
  729. movsd %xmm1, 1 * SIZE(CO1)
  730. movsd %xmm2, 2 * SIZE(CO1)
  731. movsd %xmm3, 3 * SIZE(CO1)
  /* Write the solved values back to the buffer the right-hand side came from. */
  732. #if defined(LN) || defined(LT)
  733. movsd %xmm0, 0 * SIZE(BO)
  734. movsd %xmm1, 1 * SIZE(BO)
  735. movsd %xmm2, 2 * SIZE(BO)
  736. movsd %xmm3, 3 * SIZE(BO)
  737. #else
  738. movsd %xmm0, 0 * SIZE(AO)
  739. movsd %xmm1, 1 * SIZE(AO)
  740. movsd %xmm2, 2 * SIZE(AO)
  741. movsd %xmm3, 3 * SIZE(AO)
  742. #endif
  743. #ifndef LN
  744. addq $4 * SIZE, CO1
  745. #endif
  746. #if defined(LT) || defined(RN)
  /* LT/RN: skip the remaining K - KK steps (4 values per step in AO, 2 in BO). */
  747. movq K, %rax
  748. subq KK, %rax
  749. leaq (,%rax, SIZE), %rax
  750. leaq (AO, %rax, 4), AO
  751. leaq (BO, %rax, 2), BO
  752. #endif
  /* Two rows consumed: adjust KK for the left-side variants. */
  753. #ifdef LN
  754. subq $2, KK
  755. #endif
  756. #ifdef LT
  757. addq $2, KK
  758. #endif
  759. #ifdef RT
  /* RT: advance AORIG by K << (1 + ZBASE_SHIFT) to the next tile's panel. */
  760. movq K, %rax
  761. salq $1 + ZBASE_SHIFT, %rax
  762. addq %rax, AORIG
  763. #endif
  /* Next 2-row tile, if any remain. */
  764. decq I # i --
  765. jg .L10
  766. ALIGN_4
  /* .L99: end of one column. Moves B to the next column's panel, adjusts KK for the */
  /* right-side variants, and loops while columns remain. */
  767. .L99:
  768. #ifdef LN
  /* LN: B += K * SIZE * 2. */
  769. leaq (, K, SIZE), %rax
  770. leaq (B, %rax, 2), B
  771. #endif
  /* LT/RN: BO was advanced past the panel by the tile code, so it becomes the new B. */
  772. #if defined(LT) || defined(RN)
  773. movq BO, B
  774. #endif
  /* (RT already stepped B back at the top of the column loop.) */
  775. #ifdef RN
  776. addq $1, KK
  777. #endif
  778. #ifdef RT
  779. subq $1, KK
  780. #endif
  781. decq J # j --
  782. jg .L01
  783. ALIGN_4
  /* .L999: common exit. Restores every register saved by the prologue from the same */
  /* frame slots, releases the frame and returns. Also reached directly when N <= 0. */
  784. .L999:
  785. movq 0(%rsp), %rbx
  786. movq 8(%rsp), %rbp
  787. movq 16(%rsp), %r12
  788. movq 24(%rsp), %r13
  789. movq 32(%rsp), %r14
  790. movq 40(%rsp), %r15
  791. #ifdef WINDOWS_ABI
  /* Windows: %rdi, %rsi and %xmm6-%xmm15 were saved too and must be restored. */
  792. movq 48(%rsp), %rdi
  793. movq 56(%rsp), %rsi
  794. movups 64(%rsp), %xmm6
  795. movups 80(%rsp), %xmm7
  796. movups 96(%rsp), %xmm8
  797. movups 112(%rsp), %xmm9
  798. movups 128(%rsp), %xmm10
  799. movups 144(%rsp), %xmm11
  800. movups 160(%rsp), %xmm12
  801. movups 176(%rsp), %xmm13
  802. movups 192(%rsp), %xmm14
  803. movups 208(%rsp), %xmm15
  804. #endif
  805. addq $STACKSIZE, %rsp
  806. ret
  807. EPILOGUE