You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

dgemm_tcopy_8.S 16 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /*
   * Per-target tuning.  NEHALEM and SANDYBRIDGE use identical settings, so
   * they share one guard; split them again if their tuning ever diverges.
   * Targets that define none of these get no prefetch instructions and the
   * two-instruction load fallback below.
   */
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCHSIZE    16              /* look-ahead; multiplied by SIZE at the use sites */
#define PREFETCH        prefetcht0      /* hint issued on the source rows */
#define PREFETCHW       prefetcht0      /* hint issued on the destination buffer */
#define MOVUPS_A        movups          /* single unaligned 16-byte load */
#endif

/*
 * 16-byte loads from A into an xmm register.
 *   MOVUPS_A1: address is OFF(ADDR)
 *   MOVUPS_A2: address is OFF(ADDR, BASE, SCALE)
 * With MOVUPS_A defined this is one instruction; otherwise the low and
 * high 8 bytes are loaded separately with movsd + movhps.
 */
#ifdef MOVUPS_A
#define MOVUPS_A1(OFF, ADDR, REGS) MOVUPS_A OFF(ADDR), REGS
#define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) MOVUPS_A OFF(ADDR, BASE, SCALE), REGS
#else
#define MOVUPS_A1(OFF, ADDR, REGS) movsd OFF(ADDR), REGS; movhps OFF + 8(ADDR), REGS
#define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) movsd OFF(ADDR, BASE, SCALE), REGS; movhps OFF + 8(ADDR, BASE, SCALE), REGS
#endif

/*
 * Argument and scratch register aliases.
 *
 * NOTE(review): ARG1..ARG5 are defined in common.h, which is not visible
 * here.  The earlier annotations read "N = rsi, M = rdi" (and "N = rdx,
 * M = rcx" for Windows); in both calling conventions those are the second
 * and first argument registers respectively, so the two notes looked
 * swapped.  Presumably ARG1 is rdi / rcx and ARG2 is rsi / rdx -- confirm
 * against common.h.
 */
#ifndef WINDOWS_ABI

#define N       ARG1            /* 1st argument: number of LDA-strided rows of A */
#define M       ARG2            /* 2nd argument: elements copied from each row   */
#define A       ARG3            /* rdx: source pointer                           */
#define LDA     ARG4            /* rcx: row stride (scaled to bytes on entry)    */
#define B       ARG5            /* r8 : destination pointer                      */

#define AO1     %r9             /* first source row of the current tile  */
#define AO2     %r10            /* source row half-way through the tile  */
#define LDA3    %r11            /* 3 * LDA (bytes)                       */
#define M8      %r12            /* N * SIZE, taken before N is consumed  */

#else

#define N       ARG1            /* 1st argument: number of LDA-strided rows of A */
#define M       ARG2            /* 2nd argument: elements copied from each row   */
#define A       ARG3            /* r8 : source pointer                           */
#define LDA     ARG4            /* r9 : row stride (scaled to bytes on entry)    */

/* Fifth argument lives on the stack: 56 bytes for the seven registers   */
/* pushed in the prologue, plus 40 bytes for the return address and the  */
/* 32-byte shadow area.                                                  */
#define OLD_B           40 + 56(%rsp)

#define B       %r12            /* destination pointer, loaded from OLD_B */

#define AO1     %rsi            /* first source row of the current tile  */
#define AO2     %rdi            /* source row half-way through the tile  */
#define LDA3    %r10            /* 3 * LDA (bytes)                       */
#define M8      %r11            /* N * SIZE, taken before N is consumed  */
#endif

#define I       %rax            /* loop counter: M / 8                        */

#define B0      %rbp            /* write cursor for full 8-element groups     */
#define B1      %r13            /* write cursor for the (M & 4) leftovers     */
#define B2      %r14            /* write cursor for the (M & 2) leftovers     */
#define B3      %r15            /* write cursor for the (M & 1) leftover      */
  /*
   * Packing kernel body.  The symbol name and entry/exit boilerplate come
   * from PROLOGUE / PROFCODE / EPILOGUE (presumably defined via common.h).
   *
   * Inputs (register aliases are defined earlier in this file):
   *   N    number of rows of A; consecutive rows are LDA elements apart
   *   M    number of contiguous elements copied from each row
   *   A    source; read with MOVUPS_A1/MOVUPS_A2 and movsd
   *   LDA  row stride in elements, converted to bytes on entry
   *   B    destination; written with movaps, so every store address has
   *        to be 16-byte aligned
   *
   * Layout produced:
   *   Rows are consumed in tiles of 8, followed by at most one tile each
   *   of 4, 2 and 1 rows.  Inside a tile, M is walked 8 elements at a
   *   time: each step stores rows*8 elements at B0 and then advances B0
   *   by 8 * N elements (N as passed in, kept in M8 as a byte count).
   *   The M & 4, M & 2 and M & 1 leftovers of every tile are appended to
   *   three separate regions that start at
   *       B + (M & -8) * N,  B + (M & -4) * N,  B + (M & -2) * N
   *   (element units) and are tracked by B1, B2 and B3.
   *
   *   B, and therefore B0..B3, carry a +16 * SIZE bias; the stores
   *   compensate with displacements that start at -16 * SIZE.
   *
   * Flag usage:
   *   - After "sarq $3, I" the OF flag is architecturally undefined
   *     (shift count > 1), so an explicit "testq I, I" precedes the jle.
   *   - "testq $imm, M" with a positive mask leaves SF = OF = 0, so the
   *     skip condition is simply "je".
   *
   * Only xmm0-xmm3 are used as vector scratch.
   */
        PROLOGUE
        PROFCODE

#ifdef WINDOWS_ABI
        pushq   %rdi                    /* used as AO2 on Windows */
        pushq   %rsi                    /* used as AO1 on Windows */
#endif

        pushq   %r15
        pushq   %r14
        pushq   %r13
        pushq   %r12
        pushq   %rbp

#ifdef WINDOWS_ABI
        movq    OLD_B, B                /* fifth argument comes from the stack */
#endif

        subq    $-16 * SIZE, B          /* B += 16 * SIZE (bias, see header) */

        /* B1/B2/B3 = B + (M rounded down to 8/4/2) * N elements */
        movq    M, B1
        movq    M, B2
        movq    M, B3

        andq    $-8, B1
        andq    $-4, B2
        andq    $-2, B3

        imulq   N, B1
        imulq   N, B2
        imulq   N, B3

        leaq    (B, B1, SIZE), B1
        leaq    (B, B2, SIZE), B2
        leaq    (B, B3, SIZE), B3

        leaq    (,LDA, SIZE), LDA       /* LDA  = LDA * SIZE (bytes) */
        leaq    (LDA, LDA, 2), LDA3     /* LDA3 = 3 * LDA            */

        leaq    (, N, SIZE), M8         /* M8 = N * SIZE; N is modified below */

        cmpq    $8, N
        jl      .L20
        ALIGN_4

/* ------------------------------------------------------------------ */
/* Tiles of 8 rows: AO1 = rows 0..3, AO2 = rows 4..7                   */
/* ------------------------------------------------------------------ */
.L11:
        subq    $8, N

        movq    A, AO1
        leaq    (A, LDA, 4), AO2
        leaq    (A, LDA, 8), A          /* A now points at the next tile */

        movq    B, B0
        addq    $64 * SIZE, B           /* one 8x8 group per tile in this panel */

        movq    M, I
        sarq    $3, I
        testq   I, I                    /* define OF before the signed branch */
        jle     .L14
        ALIGN_4

.L13:   /* copy 8 elements from each of the 8 rows */
#ifdef PREFETCH
        PREFETCH        PREFETCHSIZE * SIZE(AO1)
#endif

        MOVUPS_A1(0 * SIZE, AO1, %xmm0)
        MOVUPS_A1(2 * SIZE, AO1, %xmm1)
        MOVUPS_A1(4 * SIZE, AO1, %xmm2)
        MOVUPS_A1(6 * SIZE, AO1, %xmm3)

#ifdef PREFETCHW
        PREFETCHW       48 * SIZE(B0)
#endif

        movaps  %xmm0, -16 * SIZE(B0)
        movaps  %xmm1, -14 * SIZE(B0)
        movaps  %xmm2, -12 * SIZE(B0)
        movaps  %xmm3, -10 * SIZE(B0)

#ifdef PREFETCH
        PREFETCH        PREFETCHSIZE * SIZE(AO1, LDA)
#endif

        MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm0)
        MOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm1)
        MOVUPS_A2(4 * SIZE, AO1, LDA, 1, %xmm2)
        MOVUPS_A2(6 * SIZE, AO1, LDA, 1, %xmm3)

#ifdef PREFETCHW
        PREFETCHW       56 * SIZE(B0)
#endif

        movaps  %xmm0,  -8 * SIZE(B0)
        movaps  %xmm1,  -6 * SIZE(B0)
        movaps  %xmm2,  -4 * SIZE(B0)
        movaps  %xmm3,  -2 * SIZE(B0)

#ifdef PREFETCH
        PREFETCH        PREFETCHSIZE * SIZE(AO1, LDA, 2)
#endif

        MOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm0)
        MOVUPS_A2(2 * SIZE, AO1, LDA, 2, %xmm1)
        MOVUPS_A2(4 * SIZE, AO1, LDA, 2, %xmm2)
        MOVUPS_A2(6 * SIZE, AO1, LDA, 2, %xmm3)

#ifdef PREFETCHW
        PREFETCHW       64 * SIZE(B0)
#endif

        movaps  %xmm0,   0 * SIZE(B0)
        movaps  %xmm1,   2 * SIZE(B0)
        movaps  %xmm2,   4 * SIZE(B0)
        movaps  %xmm3,   6 * SIZE(B0)

#ifdef PREFETCH
        PREFETCH        PREFETCHSIZE * SIZE(AO1, LDA3)
#endif

        MOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm0)
        MOVUPS_A2(2 * SIZE, AO1, LDA3, 1, %xmm1)
        MOVUPS_A2(4 * SIZE, AO1, LDA3, 1, %xmm2)
        MOVUPS_A2(6 * SIZE, AO1, LDA3, 1, %xmm3)

#ifdef PREFETCHW
        PREFETCHW       72 * SIZE(B0)
#endif

        movaps  %xmm0,   8 * SIZE(B0)
        movaps  %xmm1,  10 * SIZE(B0)
        movaps  %xmm2,  12 * SIZE(B0)
        movaps  %xmm3,  14 * SIZE(B0)

#ifdef PREFETCH
        PREFETCH        PREFETCHSIZE * SIZE(AO2)
#endif

        MOVUPS_A1(0 * SIZE, AO2, %xmm0)
        MOVUPS_A1(2 * SIZE, AO2, %xmm1)
        MOVUPS_A1(4 * SIZE, AO2, %xmm2)
        MOVUPS_A1(6 * SIZE, AO2, %xmm3)

#ifdef PREFETCHW
        PREFETCHW       80 * SIZE(B0)
#endif

        movaps  %xmm0,  16 * SIZE(B0)
        movaps  %xmm1,  18 * SIZE(B0)
        movaps  %xmm2,  20 * SIZE(B0)
        movaps  %xmm3,  22 * SIZE(B0)

#ifdef PREFETCH
        PREFETCH        PREFETCHSIZE * SIZE(AO2, LDA)
#endif

        MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm0)
        MOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm1)
        MOVUPS_A2(4 * SIZE, AO2, LDA, 1, %xmm2)
        MOVUPS_A2(6 * SIZE, AO2, LDA, 1, %xmm3)

#ifdef PREFETCHW
        PREFETCHW       88 * SIZE(B0)
#endif

        movaps  %xmm0,  24 * SIZE(B0)
        movaps  %xmm1,  26 * SIZE(B0)
        movaps  %xmm2,  28 * SIZE(B0)
        movaps  %xmm3,  30 * SIZE(B0)

#ifdef PREFETCH
        PREFETCH        PREFETCHSIZE * SIZE(AO2, LDA, 2)
#endif

        MOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm0)
        MOVUPS_A2(2 * SIZE, AO2, LDA, 2, %xmm1)
        MOVUPS_A2(4 * SIZE, AO2, LDA, 2, %xmm2)
        MOVUPS_A2(6 * SIZE, AO2, LDA, 2, %xmm3)

#ifdef PREFETCHW
        PREFETCHW       96 * SIZE(B0)
#endif

        movaps  %xmm0,  32 * SIZE(B0)
        movaps  %xmm1,  34 * SIZE(B0)
        movaps  %xmm2,  36 * SIZE(B0)
        movaps  %xmm3,  38 * SIZE(B0)

#ifdef PREFETCH
        PREFETCH        PREFETCHSIZE * SIZE(AO2, LDA3)
#endif

        MOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm0)
        MOVUPS_A2(2 * SIZE, AO2, LDA3, 1, %xmm1)
        MOVUPS_A2(4 * SIZE, AO2, LDA3, 1, %xmm2)
        MOVUPS_A2(6 * SIZE, AO2, LDA3, 1, %xmm3)

#ifdef PREFETCHW
        PREFETCHW       104 * SIZE(B0)
#endif

        movaps  %xmm0,  40 * SIZE(B0)
        movaps  %xmm1,  42 * SIZE(B0)
        movaps  %xmm2,  44 * SIZE(B0)
        movaps  %xmm3,  46 * SIZE(B0)

        addq    $8 * SIZE, AO1
        addq    $8 * SIZE, AO2
        leaq    (B0, M8, 8), B0         /* B0 += 8 * N elements */

        decq    I
        jg      .L13
        ALIGN_4

.L14:   /* 8 rows x 4 leftover elements -> B1 */
        testq   $4, M
        je      .L16

        MOVUPS_A1(0 * SIZE, AO1, %xmm0)
        MOVUPS_A1(2 * SIZE, AO1, %xmm1)
        MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm2)
        MOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm3)

        movaps  %xmm0, -16 * SIZE(B1)
        movaps  %xmm1, -14 * SIZE(B1)
        movaps  %xmm2, -12 * SIZE(B1)
        movaps  %xmm3, -10 * SIZE(B1)

        MOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm0)
        MOVUPS_A2(2 * SIZE, AO1, LDA, 2, %xmm1)
        MOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm2)
        MOVUPS_A2(2 * SIZE, AO1, LDA3, 1, %xmm3)

        movaps  %xmm0,  -8 * SIZE(B1)
        movaps  %xmm1,  -6 * SIZE(B1)
        movaps  %xmm2,  -4 * SIZE(B1)
        movaps  %xmm3,  -2 * SIZE(B1)

        MOVUPS_A1(0 * SIZE, AO2, %xmm0)
        MOVUPS_A1(2 * SIZE, AO2, %xmm1)
        MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm2)
        MOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm3)

        movaps  %xmm0,   0 * SIZE(B1)
        movaps  %xmm1,   2 * SIZE(B1)
        movaps  %xmm2,   4 * SIZE(B1)
        movaps  %xmm3,   6 * SIZE(B1)

        MOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm0)
        MOVUPS_A2(2 * SIZE, AO2, LDA, 2, %xmm1)
        MOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm2)
        MOVUPS_A2(2 * SIZE, AO2, LDA3, 1, %xmm3)

        movaps  %xmm0,   8 * SIZE(B1)
        movaps  %xmm1,  10 * SIZE(B1)
        movaps  %xmm2,  12 * SIZE(B1)
        movaps  %xmm3,  14 * SIZE(B1)

        addq    $4 * SIZE, AO1
        addq    $4 * SIZE, AO2
        subq    $-32 * SIZE, B1         /* B1 += 32 * SIZE */
        ALIGN_4

.L16:   /* 8 rows x 2 leftover elements -> B2 */
        testq   $2, M
        je      .L18

        MOVUPS_A1(0 * SIZE, AO1, %xmm0)
        MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm1)
        MOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm2)
        MOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm3)

        movaps  %xmm0, -16 * SIZE(B2)
        movaps  %xmm1, -14 * SIZE(B2)
        movaps  %xmm2, -12 * SIZE(B2)
        movaps  %xmm3, -10 * SIZE(B2)

        MOVUPS_A1(0 * SIZE, AO2, %xmm0)
        MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm1)
        MOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm2)
        MOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm3)

        movaps  %xmm0,  -8 * SIZE(B2)
        movaps  %xmm1,  -6 * SIZE(B2)
        movaps  %xmm2,  -4 * SIZE(B2)
        movaps  %xmm3,  -2 * SIZE(B2)

        addq    $2 * SIZE, AO1
        addq    $2 * SIZE, AO2
        subq    $-16 * SIZE, B2         /* B2 += 16 * SIZE */
        ALIGN_4

.L18:   /* 8 rows x 1 leftover element -> B3, two rows packed per xmm */
        testq   $1, M
        je      .L19

        movsd   0 * SIZE(AO1),         %xmm0
        movsd   0 * SIZE(AO1, LDA),    %xmm1
        movsd   0 * SIZE(AO1, LDA, 2), %xmm2
        movsd   0 * SIZE(AO1, LDA3),   %xmm3

        unpcklpd %xmm1, %xmm0
        unpcklpd %xmm3, %xmm2

        movaps  %xmm0, -16 * SIZE(B3)
        movaps  %xmm2, -14 * SIZE(B3)

        movsd   0 * SIZE(AO2),         %xmm0
        movsd   0 * SIZE(AO2, LDA),    %xmm1
        movsd   0 * SIZE(AO2, LDA, 2), %xmm2
        movsd   0 * SIZE(AO2, LDA3),   %xmm3

        unpcklpd %xmm1, %xmm0
        unpcklpd %xmm3, %xmm2

        movaps  %xmm0, -12 * SIZE(B3)
        movaps  %xmm2, -10 * SIZE(B3)

        subq    $-8 * SIZE, B3          /* B3 += 8 * SIZE */
        ALIGN_4

.L19:
        cmpq    $8, N
        jge     .L11
        ALIGN_4

/* ------------------------------------------------------------------ */
/* At most one tile of 4 rows: AO1 = rows 0..1, AO2 = rows 2..3        */
/* ------------------------------------------------------------------ */
.L20:
        cmpq    $4, N
        jl      .L30

        subq    $4, N

        movq    A, AO1
        leaq    (A, LDA, 2), AO2
        leaq    (A, LDA, 4), A

        movq    B, B0
        addq    $32 * SIZE, B

        movq    M, I
        sarq    $3, I
        testq   I, I                    /* define OF before the signed branch */
        jle     .L24
        ALIGN_4

.L23:   /* copy 8 elements from each of the 4 rows */
#ifdef PREFETCH
        PREFETCH        PREFETCHSIZE * SIZE(AO1)
#endif

        MOVUPS_A1(0 * SIZE, AO1, %xmm0)
        MOVUPS_A1(2 * SIZE, AO1, %xmm1)
        MOVUPS_A1(4 * SIZE, AO1, %xmm2)
        MOVUPS_A1(6 * SIZE, AO1, %xmm3)

#ifdef PREFETCHW
        PREFETCHW       16 * SIZE(B0)
#endif

        movaps  %xmm0, -16 * SIZE(B0)
        movaps  %xmm1, -14 * SIZE(B0)
        movaps  %xmm2, -12 * SIZE(B0)
        movaps  %xmm3, -10 * SIZE(B0)

#ifdef PREFETCH
        PREFETCH        PREFETCHSIZE * SIZE(AO1, LDA)
#endif

        MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm0)
        MOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm1)
        MOVUPS_A2(4 * SIZE, AO1, LDA, 1, %xmm2)
        MOVUPS_A2(6 * SIZE, AO1, LDA, 1, %xmm3)

#ifdef PREFETCHW
        PREFETCHW       24 * SIZE(B0)
#endif

        movaps  %xmm0,  -8 * SIZE(B0)
        movaps  %xmm1,  -6 * SIZE(B0)
        movaps  %xmm2,  -4 * SIZE(B0)
        movaps  %xmm3,  -2 * SIZE(B0)

#ifdef PREFETCH
        /* AO1 + 2 * LDA is the same address as AO2 in this tile */
        PREFETCH        PREFETCHSIZE * SIZE(AO1, LDA, 2)
#endif

        MOVUPS_A1(0 * SIZE, AO2, %xmm0)
        MOVUPS_A1(2 * SIZE, AO2, %xmm1)
        MOVUPS_A1(4 * SIZE, AO2, %xmm2)
        MOVUPS_A1(6 * SIZE, AO2, %xmm3)

#ifdef PREFETCHW
        PREFETCHW       32 * SIZE(B0)
#endif

        movaps  %xmm0,   0 * SIZE(B0)
        movaps  %xmm1,   2 * SIZE(B0)
        movaps  %xmm2,   4 * SIZE(B0)
        movaps  %xmm3,   6 * SIZE(B0)

#ifdef PREFETCH
        /* AO1 + 3 * LDA is the same address as AO2 + LDA in this tile */
        PREFETCH        PREFETCHSIZE * SIZE(AO1, LDA3)
#endif

        MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm0)
        MOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm1)
        MOVUPS_A2(4 * SIZE, AO2, LDA, 1, %xmm2)
        MOVUPS_A2(6 * SIZE, AO2, LDA, 1, %xmm3)

#ifdef PREFETCHW
        PREFETCHW       40 * SIZE(B0)
#endif

        movaps  %xmm0,   8 * SIZE(B0)
        movaps  %xmm1,  10 * SIZE(B0)
        movaps  %xmm2,  12 * SIZE(B0)
        movaps  %xmm3,  14 * SIZE(B0)

        addq    $8 * SIZE, AO1
        addq    $8 * SIZE, AO2
        leaq    (B0, M8, 8), B0         /* B0 += 8 * N elements */

        decq    I
        jg      .L23
        ALIGN_4

.L24:   /* 4 rows x 4 leftover elements -> B1 */
        testq   $4, M
        je      .L26

        MOVUPS_A1(0 * SIZE, AO1, %xmm0)
        MOVUPS_A1(2 * SIZE, AO1, %xmm1)
        MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm2)
        MOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm3)

        movaps  %xmm0, -16 * SIZE(B1)
        movaps  %xmm1, -14 * SIZE(B1)
        movaps  %xmm2, -12 * SIZE(B1)
        movaps  %xmm3, -10 * SIZE(B1)

        MOVUPS_A1(0 * SIZE, AO2, %xmm0)
        MOVUPS_A1(2 * SIZE, AO2, %xmm1)
        MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm2)
        MOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm3)

        movaps  %xmm0,  -8 * SIZE(B1)
        movaps  %xmm1,  -6 * SIZE(B1)
        movaps  %xmm2,  -4 * SIZE(B1)
        movaps  %xmm3,  -2 * SIZE(B1)

        addq    $4 * SIZE, AO1
        addq    $4 * SIZE, AO2
        subq    $-16 * SIZE, B1         /* B1 += 16 * SIZE */
        ALIGN_4

.L26:   /* 4 rows x 2 leftover elements -> B2 */
        testq   $2, M
        je      .L28

        MOVUPS_A1(0 * SIZE, AO1, %xmm0)
        MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm1)
        MOVUPS_A1(0 * SIZE, AO2, %xmm2)
        MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm3)

        movaps  %xmm0, -16 * SIZE(B2)
        movaps  %xmm1, -14 * SIZE(B2)
        movaps  %xmm2, -12 * SIZE(B2)
        movaps  %xmm3, -10 * SIZE(B2)

        addq    $2 * SIZE, AO1
        addq    $2 * SIZE, AO2
        subq    $-8 * SIZE, B2          /* B2 += 8 * SIZE */
        ALIGN_4

.L28:   /* 4 rows x 1 leftover element -> B3 */
        testq   $1, M
        je      .L30

        movsd   0 * SIZE(AO1),      %xmm0
        movsd   0 * SIZE(AO1, LDA), %xmm1
        movsd   0 * SIZE(AO2),      %xmm2
        movsd   0 * SIZE(AO2, LDA), %xmm3

        unpcklpd %xmm1, %xmm0
        unpcklpd %xmm3, %xmm2

        movaps  %xmm0, -16 * SIZE(B3)
        movaps  %xmm2, -14 * SIZE(B3)

        subq    $-4 * SIZE, B3          /* B3 += 4 * SIZE */
        ALIGN_4

/* ------------------------------------------------------------------ */
/* At most one tile of 2 rows: AO1 = row 0, AO2 = row 1                */
/* ------------------------------------------------------------------ */
.L30:
        cmpq    $2, N
        jl      .L40

        subq    $2, N

        movq    A, AO1
        leaq    (A, LDA), AO2
        leaq    (A, LDA, 2), A

        movq    B, B0
        addq    $16 * SIZE, B

        movq    M, I
        sarq    $3, I
        testq   I, I                    /* define OF before the signed branch */
        jle     .L34
        ALIGN_4

.L33:   /* copy 8 elements from each of the 2 rows */
#ifdef PREFETCH
        PREFETCH        PREFETCHSIZE * SIZE(AO1)
#endif

        MOVUPS_A1(0 * SIZE, AO1, %xmm0)
        MOVUPS_A1(2 * SIZE, AO1, %xmm1)
        MOVUPS_A1(4 * SIZE, AO1, %xmm2)
        MOVUPS_A1(6 * SIZE, AO1, %xmm3)

#ifdef PREFETCHW
        PREFETCHW       0 * SIZE(B0)
#endif

        movaps  %xmm0, -16 * SIZE(B0)
        movaps  %xmm1, -14 * SIZE(B0)
        movaps  %xmm2, -12 * SIZE(B0)
        movaps  %xmm3, -10 * SIZE(B0)

#ifdef PREFETCH
        PREFETCH        PREFETCHSIZE * SIZE(AO2)
#endif

        MOVUPS_A1(0 * SIZE, AO2, %xmm0)
        MOVUPS_A1(2 * SIZE, AO2, %xmm1)
        MOVUPS_A1(4 * SIZE, AO2, %xmm2)
        MOVUPS_A1(6 * SIZE, AO2, %xmm3)

#ifdef PREFETCHW
        PREFETCHW       8 * SIZE(B0)
#endif

        movaps  %xmm0,  -8 * SIZE(B0)
        movaps  %xmm1,  -6 * SIZE(B0)
        movaps  %xmm2,  -4 * SIZE(B0)
        movaps  %xmm3,  -2 * SIZE(B0)

        addq    $8 * SIZE, AO1
        addq    $8 * SIZE, AO2
        leaq    (B0, M8, 8), B0         /* B0 += 8 * N elements */

        decq    I
        jg      .L33
        ALIGN_4

.L34:   /* 2 rows x 4 leftover elements -> B1 */
        testq   $4, M
        je      .L36

        MOVUPS_A1(0 * SIZE, AO1, %xmm0)
        MOVUPS_A1(2 * SIZE, AO1, %xmm1)
        MOVUPS_A1(0 * SIZE, AO2, %xmm2)
        MOVUPS_A1(2 * SIZE, AO2, %xmm3)

        movaps  %xmm0, -16 * SIZE(B1)
        movaps  %xmm1, -14 * SIZE(B1)
        movaps  %xmm2, -12 * SIZE(B1)
        movaps  %xmm3, -10 * SIZE(B1)

        addq    $4 * SIZE, AO1
        addq    $4 * SIZE, AO2
        subq    $-8 * SIZE, B1          /* B1 += 8 * SIZE */
        ALIGN_4

.L36:   /* 2 rows x 2 leftover elements -> B2 */
        testq   $2, M
        je      .L38

        MOVUPS_A1(0 * SIZE, AO1, %xmm0)
        MOVUPS_A1(0 * SIZE, AO2, %xmm1)

        movaps  %xmm0, -16 * SIZE(B2)
        movaps  %xmm1, -14 * SIZE(B2)

        addq    $2 * SIZE, AO1
        addq    $2 * SIZE, AO2
        subq    $-4 * SIZE, B2          /* B2 += 4 * SIZE */
        ALIGN_4

.L38:   /* 2 rows x 1 leftover element -> B3 */
        testq   $1, M
        je      .L40

        movsd   0 * SIZE(AO1), %xmm0
        movsd   0 * SIZE(AO2), %xmm1

        unpcklpd %xmm1, %xmm0

        movaps  %xmm0, -16 * SIZE(B3)
        subq    $-2 * SIZE, B3          /* B3 += 2 * SIZE */
        ALIGN_4

/* ------------------------------------------------------------------ */
/* Final single row, if any                                            */
/* ------------------------------------------------------------------ */
.L40:
        cmpq    $1, N
        jl      .L999

        movq    A, AO1

        movq    B, B0

        movq    M, I
        sarq    $3, I
        testq   I, I                    /* define OF before the signed branch */
        jle     .L44
        ALIGN_4

.L43:   /* copy 8 elements from the row */
#ifdef PREFETCH
        /* note: look-ahead here is 8x the one used in the wider tiles */
        PREFETCH        PREFETCHSIZE * 8 * SIZE(AO1)
#endif

        MOVUPS_A1(0 * SIZE, AO1, %xmm0)
        MOVUPS_A1(2 * SIZE, AO1, %xmm1)
        MOVUPS_A1(4 * SIZE, AO1, %xmm2)
        MOVUPS_A1(6 * SIZE, AO1, %xmm3)

#ifdef PREFETCHW
        PREFETCHW       -8 * SIZE(B0)
#endif

        movaps  %xmm0, -16 * SIZE(B0)
        movaps  %xmm1, -14 * SIZE(B0)
        movaps  %xmm2, -12 * SIZE(B0)
        movaps  %xmm3, -10 * SIZE(B0)

        addq    $8 * SIZE, AO1
        leaq    (B0, M8, 8), B0         /* B0 += 8 * N elements */

        decq    I
        jg      .L43
        ALIGN_4

.L44:   /* 1 row x 4 leftover elements -> B1 */
        testq   $4, M
        je      .L45

        MOVUPS_A1(0 * SIZE, AO1, %xmm0)
        MOVUPS_A1(2 * SIZE, AO1, %xmm1)

        movaps  %xmm0, -16 * SIZE(B1)
        movaps  %xmm1, -14 * SIZE(B1)

        addq    $4 * SIZE, AO1
        subq    $-4 * SIZE, B1          /* B1 += 4 * SIZE */
        ALIGN_4

.L45:   /* 1 row x 2 leftover elements -> B2 */
        testq   $2, M
        je      .L46

        MOVUPS_A1(0 * SIZE, AO1, %xmm0)

        movaps  %xmm0, -16 * SIZE(B2)

        addq    $2 * SIZE, AO1
        subq    $-2 * SIZE, B2          /* B2 += 2 * SIZE */
        ALIGN_4

.L46:   /* 1 row x 1 leftover element -> B3 (8-byte store) */
        testq   $1, M
        je      .L999

        movsd   0 * SIZE(AO1), %xmm0

        movlpd  %xmm0, -16 * SIZE(B3)
        /* falls through to the epilogue */
        ALIGN_4

/* ------------------------------------------------------------------ */
/* Epilogue: restore registers in reverse push order                   */
/* ------------------------------------------------------------------ */
.L999:
        popq    %rbp
        popq    %r12
        popq    %r13
        popq    %r14
        popq    %r15

#ifdef WINDOWS_ABI
        popq    %rsi
        popq    %rdi
#endif
        ret

        EPILOGUE