You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Symbolic register names and tuning constants for the copy kernel.  */
  /* PROLOGUE, EPILOGUE, PROFCODE, SIZE, BASE_SHIFT, LDFD, STFD and     */
  /* LDFPD are not defined in this file; presumably they come from      */
  /* common.h -- TODO confirm.                                          */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* r32-r36: presumably the incoming arguments (n, x, incx, y, incy) -- */
  /* TODO confirm against the C prototype used by the callers.           */
  40. #define N r32
  41. #define X1 r33
  42. #define INCX r34
  43. #define Y1 r35
  44. #define INCY r36
  /* Addresses fed to lfetch for the source (PREA) and destination (PREB). */
  45. #define PREA r2
  46. #define PREB r3
  /* I = N >> 4 (16-element blocks, later decremented to form ar.lc); */
  /* J = N & 15 (elements left over after the blocks).                */
  47. #define I r14
  48. #define J r15
  /* X2/Y2 are initialised to X1/Y1 plus four strides. */
  49. #define X2 r16
  50. #define Y2 r17
  /* INCX3/INCY3 are not referenced anywhere in the visible code. */
  51. #define INCX3 r18
  52. #define INCY3 r19
  /* 5x and 16x multiples of the scaled strides. */
  53. #define INCX5 r20
  54. #define INCY5 r21
  55. #define INCX16 r22
  56. #define INCY16 r23
  /* XX/YY: extra load/store pointers used by the peeled element and the tails. */
  57. #define XX r24
  58. #define YY r25
  /* XA: (Y1 - X1) offset used to choose between the two unit-stride loops. */
  /* YA is not referenced anywhere in the visible code.                     */
  59. #define XA r26
  60. #define YA r27
  /* Saved copies of the predicate registers and of ar.lc. */
  61. #define PR r30
  62. #define ARLC r31
  /* Prefetch distance; it is multiplied by SIZE where it is used. */
  63. #ifdef DOUBLE
  64. #define PREFETCH_SIZE (4 * 32)
  65. #else
  66. #define PREFETCH_SIZE (4 * 64)
  67. #endif
  /* Routine entry: copies N elements read from X1 (stride INCX) to Y1  */
  /* (stride INCY). This part validates N, optionally peels one element, */
  /* derives the block/remainder counts and dispatches to one of three   */
  /* pipelined loops (.L22, .L32 for INCX == SIZE, .L110 otherwise).     */
  68. PROLOGUE
  69. .prologue
  70. PROFCODE
  /* Scale both strides by 2^BASE_SHIFT (presumably elements -> bytes)   */
  /* and keep the caller's ar.lc so it can be restored before returning. */
  71. { .mmi
  72. shladd INCX = INCX, BASE_SHIFT, r0
  73. shladd INCY = INCY, BASE_SHIFT, r0
  74. .save ar.lc, ARLC
  75. mov ARLC = ar.lc
  76. }
  /* p6 = !(0 < N): nothing to copy, return immediately.            */
  /* p7 = bit BASE_SHIFT of the X1 address is set; when it is, one  */
  /* element is peeled below before the main loops run.             */
  77. { .mib
  78. cmp.lt p0, p6 = r0, N
  79. tbit.z p0, p7 = X1, BASE_SHIFT
  80. (p6) br.ret.sptk.many b0
  81. }
  82. ;;
  83. .body
  /* XA = Y1 - X1 (taken before any pointer is advanced). With p7, load */
  /* the peeled element into f32 and advance X1. PR saves all predicates. */
  84. { .mmi
  85. sub XA = Y1, X1
  86. (p7) LDFD f32 = [X1], INCX
  87. mov PR = pr
  88. }
  /* YY remembers where the peeled element must be stored; with p7 the */
  /* count drops by one and Y1 moves to the next destination element.  */
  89. { .mmi
  90. mov YY = Y1
  91. (p7) adds N = -1, N
  92. (p7) add Y1 = Y1, INCY
  93. }
  94. ;;
  /* INCX5 = 5 * INCX, INCY5 = 5 * INCY; clear the rotating predicates. */
  95. { .mmi
  96. shladd INCX5 = INCX, 2, INCX
  97. shladd INCY5 = INCY, 2, INCY
  98. mov pr.rot = 0
  99. }
  /* XX starts at the (possibly advanced) X1; XA becomes an element offset. */
  100. { .mmi
  101. mov XX = X1
  102. nop.m 0
  103. shr.u XA = XA, BASE_SHIFT
  104. }
  105. ;;
  /* J = N & 15, I = N >> 4; p16 = 1 arms the first pipeline stage. */
  106. { .mmi
  107. and J = 15, N
  108. cmp.eq p16, p0 = r0, r0
  109. shr I = N, 4
  110. }
  /* p6 = (INCX != SIZE): take the general strided path at .L100. */
  111. { .mmb
  112. cmp.ne p6, p0 = SIZE, INCX
  113. #ifdef DOUBLE
  114. adds XA = 2, XA
  115. #else
  116. nop.m 0
  117. #endif
  118. (p6) br.cond.dpnt .L100
  119. }
  120. ;;
  121. /* INCX == 1 */
  /* 16x strides for the prefetch pointers; p12 = bit 3 of N is set. */
  122. { .mmi
  123. shladd INCX16 = INCX, 4, r0
  124. shladd INCY16 = INCY, 4, r0
  125. tbit.z p0, p12 = N, 3
  126. }
  /* Reduce XA modulo 32 (DOUBLE) or 64; I - 1 becomes the loop count; */
  /* p13 = bit 2 of N is set.                                          */
  127. { .mmi
  128. #ifdef DOUBLE
  129. and XA = 31, XA
  130. #else
  131. and XA = 63, XA
  132. #endif
  133. adds I = -1, I
  134. tbit.z p0, p13 = N, 2
  135. }
  136. ;;
  /* X2/Y2 run four elements ahead of X1/Y1. */
  137. { .mmi
  138. shladd X2 = INCX, 2, X1
  139. shladd Y2 = INCY, 2, Y1
  140. mov ar.lc = I
  141. }
  /* p8 = (threshold > XA): use the .L30/.L32 variant instead of .L22.   */
  /* NOTE(review): presumably this picks the schedule that suits the     */
  /* relative placement of x and y in the cache -- confirm.              */
  /* p9 = (J == 0): no remainder after the 16-element blocks.            */
  141. { .mib
  143. #ifdef DOUBLE
  144. cmp.gt p8, p0 = 15, XA
  145. #else
  146. cmp.gt p8, p0 = 30, XA
  147. #endif
  148. cmp.eq p9, p0 = r0, J
  149. (p8)br.cond.dpnt .L30
  150. }
  151. ;;
  /* Store the peeled element (p7). p8 = (I < 0): no full block, go to  */
  /* the tail at .L25. ar.ec = 5 matches the p16..p20 stages of .L22.   */
  152. { .mmi
  153. (p7) STFD [YY] = f32
  154. cmp.gt p8 ,p0 = r0, I
  155. mov ar.ec = 5
  156. }
  /* Initial prefetch addresses ahead of the source and destination. */
  157. { .mmb
  158. adds PREA = PREFETCH_SIZE * SIZE + 32, X1
  159. #ifdef DOUBLE
  160. adds PREB = PREFETCH_SIZE * SIZE + 32, Y1
  161. #else
  162. adds PREB = PREFETCH_SIZE * SIZE - 40, Y1
  163. #endif
  164. (p8) br.cond.dpnt .L25
  165. }
  166. ;;
  /* .L22: unit-stride main loop, 16 elements per iteration, driven by    */
  /* br.ctop with rotating registers. Stage p16 issues eight pair loads   */
  /* (LDFPD) from X1; stage p20 stores the values loaded four rotations   */
  /* earlier, so a value loaded into f32 is stored from f36, f37 from     */
  /* f41, and so on. Y1 writes elements 0-3 and 8-11 of a block, Y2       */
  /* writes 4-7 and 12-15; the INCY5 steps skip the four elements owned   */
  /* by the other pointer.                                                */
  167. .align 32
  168. .L22:
  169. { .mmi
  170. (p20) STFD [Y1] = f36
  171. (p20) STFD [Y2] = f56
  172. (p20) add Y1 = INCY, Y1
  173. }
  /* Prefetch the source and step the prefetch address by 16 strides. */
  174. { .mmi
  175. (p16) lfetch.nt1 [PREA], INCX16
  176. (p16) LDFPD f32, f37 = [X1], 2 * SIZE
  177. (p20) add Y2 = INCY, Y2
  178. }
  179. ;;
  180. { .mmi
  181. (p20) STFD [Y1] = f41
  182. (p20) STFD [Y2] = f61
  183. (p20) add Y1 = INCY, Y1
  184. }
  /* Prefetch the destination with the exclusive hint. */
  185. { .mmi
  186. (p16) lfetch.excl.nt1 [PREB], INCY16
  187. (p16) LDFPD f42, f47 = [X1], 2 * SIZE
  188. (p20) add Y2 = INCY, Y2
  189. }
  190. ;;
  191. { .mmi
  192. (p20) STFD [Y1] = f46
  193. (p20) STFD [Y2] = f66
  194. (p20) add Y1 = INCY, Y1
  195. }
  196. { .mmi
  197. (p16) LDFPD f52, f57 = [X1], 2 * SIZE
  198. nop.m 0
  199. (p20) add Y2 = INCY, Y2
  200. }
  201. ;;
  /* Fourth store of each group: jump five strides to elements 8 / 12. */
  202. { .mmi
  203. (p20) STFD [Y1] = f51
  204. (p20) STFD [Y2] = f71
  205. (p20) add Y1 = INCY5, Y1
  206. }
  207. { .mmi
  208. (p16) LDFPD f62, f67 = [X1], 2 * SIZE
  209. nop.m 0
  210. (p20) add Y2 = INCY5, Y2
  211. }
  212. ;;
  /* XX = X1 + 8 * SIZE; it is read again by the tail code after the loop. */
  213. { .mmi
  214. (p20) STFD [Y1] = f76
  215. (p20) STFD [Y2] = f96
  216. (p16) adds XX = 8 * SIZE, X1
  217. }
  218. { .mmi
  219. (p16) LDFPD f72, f77 = [X1], 2 * SIZE
  220. (p20) add Y1 = INCY, Y1
  221. (p20) add Y2 = INCY, Y2
  222. }
  223. ;;
  224. { .mmi
  225. (p20) STFD [Y1] = f81
  226. (p20) STFD [Y2] = f101
  227. (p20) add Y1 = INCY, Y1
  228. }
  229. { .mmi
  230. (p16) LDFPD f82, f87 = [X1], 2 * SIZE
  231. nop.m 0
  232. (p20) add Y2 = INCY, Y2
  233. }
  234. ;;
  /* X2 = XX + 4 * INCX, kept four elements ahead of XX for the tail. */
  235. { .mmi
  236. (p20) STFD [Y1] = f86
  237. (p20) STFD [Y2] = f106
  238. (p16) shladd X2 = INCX, 2, XX
  239. }
  240. { .mmi
  241. (p16) LDFPD f92, f97 = [X1], 2 * SIZE
  242. (p20) add Y1 = INCY, Y1
  243. (p20) add Y2 = INCY, Y2
  244. }
  245. ;;
  /* Last stores of the block; both pointers move on to the next block. */
  246. { .mmi
  247. (p20) STFD [Y1] = f91
  248. (p20) STFD [Y2] = f111
  249. (p20) add Y1 = INCY5, Y1
  250. }
  251. { .mmb
  252. (p16) LDFPD f102, f107 = [X1], 2 * SIZE
  253. (p20) add Y2 = INCY5, Y2
  254. br.ctop.sptk.few .L22
  255. }
  256. ;;
  /* .L25: tail for the .L22 path. The low bits of N select how many    */
  /* elements remain: p12 (bit 3) copies 8, p13 (bit 2) copies 4,       */
  /* p14 (bit 1) copies 2 and p15 (bit 0) copies 1. All loads are       */
  /* issued first, then the stores. Also restores ar.lc and the saved   */
  /* predicates before returning.                                       */
  257. .align 32
  258. .L25:
  /* p12: first pairs of elements 0-1 (X1) and 4-5 (X2). */
  259. { .mmi
  260. (p12) LDFPD f48, f49 = [X1], 2 * SIZE
  261. (p12) LDFPD f52, f53 = [X2], 2 * SIZE
  262. mov ar.lc = ARLC
  263. }
  264. { .mmi
  265. (p12) adds XX = 8 * SIZE, XX
  266. nop.m 0
  267. tbit.z p0, p14 = N, 1
  268. }
  269. ;;
  /* The mask -65474 (0x...FFFF003E) restores only p1-p5 and p16-p63,  */
  /* so p6-p15, which steer this tail, keep their current values.      */
  270. { .mmi
  271. (p12) LDFPD f50, f51 = [X1]
  272. (p12) LDFPD f54, f55 = [X2]
  273. mov pr = PR, -65474
  274. }
  /* Move X1 and XX past the p12 / p13 groups; p9 (J == 0): all done. */
  275. { .mmb
  276. (p12) adds X1 = 6 * SIZE, X1
  277. (p13) adds XX = 4 * SIZE, XX
  278. (p9) br.ret.sptk.many b0
  279. }
  280. ;;
  281. { .mmi
  282. (p13) LDFPD f56, f57 = [X1], 2 * SIZE
  283. (p14) LDFPD f60, f61 = [XX], 2 * SIZE
  284. tbit.z p0, p15 = N, 0
  285. }
  286. ;;
  287. { .mmi
  288. (p13) LDFPD f58, f59 = [X1], 2 * SIZE
  289. (p15) LDFD f62 = [XX]
  290. nop.i 0
  291. }
  292. ;;
  /* Stores. YY starts at Y1 and is advanced past the p12 and p13 groups */
  /* so that it addresses the p14 / p15 elements.                        */
  293. { .mmi
  294. (p12) STFD [Y1] = f48
  295. (p12) STFD [Y2] = f52
  296. mov YY = Y1
  297. }
  298. { .mmi
  299. (p12) add Y1 = INCY, Y1
  300. (p12) add Y2 = INCY, Y2
  301. nop.i 0
  302. }
  303. ;;
  304. { .mmi
  305. (p12) STFD [Y1] = f49
  306. (p12) STFD [Y2] = f53
  307. (p12) add Y1 = INCY, Y1
  308. }
  309. { .mmi
  310. (p12) add Y2 = INCY, Y2
  311. (p12) shladd YY = INCY, 3, YY
  312. nop.i 0
  313. }
  314. ;;
  315. { .mmi
  316. (p12) STFD [Y1] = f50
  317. (p12) STFD [Y2] = f54
  318. (p12) add Y1 = INCY, Y1
  319. }
  320. { .mmi
  321. (p12) add Y2 = INCY, Y2
  322. (p13) shladd YY = INCY, 2, YY
  323. nop.i 0
  324. }
  325. ;;
  /* After the eighth element Y1 skips the four written through Y2. */
  326. { .mmi
  327. (p12) STFD [Y1] = f51
  328. (p12) STFD [Y2] = f55
  329. (p12) add Y1 = INCY5, Y1
  330. }
  331. { .mmi
  332. (p12) add Y2 = INCY5, Y2
  333. nop.m 0
  334. nop.i 0
  335. }
  336. ;;
  /* p13 group goes through Y1; p14 / p15 elements go through YY. */
  337. { .mmi
  338. (p13) STFD [Y1] = f56
  339. (p14) STFD [YY] = f60
  340. (p13) add Y1 = INCY, Y1
  341. }
  342. { .mmi
  343. (p14) add YY = INCY, YY
  344. nop.m 0
  345. nop.i 0
  346. }
  347. ;;
  348. { .mmi
  349. (p13) STFD [Y1] = f57
  350. (p14) STFD [YY] = f61
  351. (p13) add Y1 = INCY, Y1
  352. }
  353. { .mmi
  354. (p14) add YY = INCY, YY
  355. nop.m 0
  356. nop.i 0
  357. }
  358. ;;
  359. { .mmi
  360. (p13) STFD [Y1] = f58
  361. (p15) STFD [YY] = f62
  362. (p13) add Y1 = INCY, Y1
  363. }
  364. ;;
  365. { .mmb
  366. (p13) STFD [Y1] = f59
  367. nop.m 0
  368. br.ret.sptk.many b0
  369. }
  /* .L30 / .L32: alternative unit-stride path, entered when the reduced */
  /* Y1 - X1 offset (XA) is below the threshold tested at entry. It does */
  /* the same work as .L22 but with ar.ec = 4: stores run in stage p19,  */
  /* three rotations after the loads (f32 is stored from f35, f37 from   */
  /* f40, ...), and the prefetch offsets differ.                         */
  370. .align 32
  371. ;;
  372. .L30:
  /* Store the peeled element (p7); p8 = (I < 0): no full block, go to .L35. */
  373. { .mmi
  374. (p7) STFD [YY] = f32
  375. cmp.gt p8 ,p0 = r0, I
  376. mov ar.ec = 4
  377. }
  378. { .mmb
  379. adds PREA = PREFETCH_SIZE * SIZE + 24, X1
  380. #ifdef DOUBLE
  381. adds PREB = PREFETCH_SIZE * SIZE + 64, Y1
  382. #else
  383. adds PREB = PREFETCH_SIZE * SIZE + 72, Y1
  384. #endif
  385. (p8) br.cond.dpnt .L35
  386. }
  387. ;;
  388. .align 32
  /* Loop body: 16 elements per iteration; Y1 writes elements 0-3 and */
  /* 8-11 of a block, Y2 writes 4-7 and 12-15.                        */
  389. .L32:
  390. { .mmi
  391. (p19) STFD [Y1] = f35
  392. (p19) STFD [Y2] = f55
  393. (p19) add Y1 = INCY, Y1
  394. }
  395. { .mmi
  396. (p16) lfetch.nt1 [PREA], INCX16
  397. (p16) LDFPD f32, f37 = [X1], 2 * SIZE
  398. (p19) add Y2 = INCY, Y2
  399. }
  400. ;;
  401. { .mmi
  402. (p19) STFD [Y1] = f40
  403. (p19) STFD [Y2] = f60
  404. (p19) add Y1 = INCY, Y1
  405. }
  406. { .mmi
  407. (p16) lfetch.excl.nt1 [PREB], INCY16
  408. (p16) LDFPD f42, f47 = [X1], 2 * SIZE
  409. (p19) add Y2 = INCY, Y2
  410. }
  411. ;;
  412. { .mmi
  413. (p19) STFD [Y1] = f45
  414. (p19) STFD [Y2] = f65
  415. (p19) add Y1 = INCY, Y1
  416. }
  417. { .mmi
  418. (p16) LDFPD f52, f57 = [X1], 2 * SIZE
  419. nop.m 0
  420. (p19) add Y2 = INCY, Y2
  421. }
  422. ;;
  /* Fourth store of each group: jump five strides to elements 8 / 12. */
  423. { .mmi
  424. (p19) STFD [Y1] = f50
  425. (p19) STFD [Y2] = f70
  426. (p19) add Y1 = INCY5, Y1
  427. }
  428. { .mmi
  429. (p16) LDFPD f62, f67 = [X1], 2 * SIZE
  430. nop.m 0
  431. (p19) add Y2 = INCY5, Y2
  432. }
  433. ;;
  /* XX = X1 + 8 * SIZE; it is read again by the tail code after the loop. */
  434. { .mmi
  435. (p19) STFD [Y1] = f75
  436. (p19) STFD [Y2] = f95
  437. (p16) adds XX = 8 * SIZE, X1
  438. }
  439. { .mmi
  440. (p16) LDFPD f72, f77 = [X1], 2 * SIZE
  441. (p19) add Y1 = INCY, Y1
  442. (p19) add Y2 = INCY, Y2
  443. }
  444. ;;
  445. { .mmi
  446. (p19) STFD [Y1] = f80
  447. (p19) STFD [Y2] = f100
  448. (p19) add Y1 = INCY, Y1
  449. }
  450. { .mmi
  451. (p16) LDFPD f82, f87 = [X1], 2 * SIZE
  452. nop.m 0
  453. (p19) add Y2 = INCY, Y2
  454. }
  455. ;;
  /* X2 = XX + 4 * INCX, kept four elements ahead of XX for the tail. */
  456. { .mmi
  457. (p19) STFD [Y1] = f85
  458. (p19) STFD [Y2] = f105
  459. (p16) shladd X2 = INCX, 2, XX
  460. }
  461. { .mmi
  462. (p16) LDFPD f92, f97 = [X1], 2 * SIZE
  463. (p19) add Y1 = INCY, Y1
  464. (p19) add Y2 = INCY, Y2
  465. }
  466. ;;
  /* Last stores of the block; both pointers move on to the next block. */
  467. { .mmi
  468. (p19) STFD [Y1] = f90
  469. (p19) STFD [Y2] = f110
  470. (p19) add Y1 = INCY5, Y1
  471. }
  472. { .mmb
  473. (p16) LDFPD f102, f107 = [X1], 2 * SIZE
  474. (p19) add Y2 = INCY5, Y2
  475. br.ctop.sptk.few .L32
  476. }
  477. ;;
  /* .L35: tail for the .L32 path. The low bits of N select how many    */
  /* elements remain: p12 (bit 3) copies 8, p13 (bit 2) copies 4,       */
  /* p14 (bit 1) copies 2 and p15 (bit 0) copies 1. Loads come first,   */
  /* then the stores; ar.lc and the saved predicates are restored on    */
  /* the way.                                                           */
  478. .align 32
  479. .L35:
  /* p12: first pairs of elements 0-1 (X1) and 4-5 (X2). */
  480. { .mmi
  481. (p12) LDFPD f48, f49 = [X1], 2 * SIZE
  482. (p12) LDFPD f52, f53 = [X2], 2 * SIZE
  483. mov ar.lc = ARLC
  484. }
  485. { .mmi
  486. (p12) adds XX = 8 * SIZE, XX
  487. nop.m 0
  488. tbit.z p0, p14 = N, 1
  489. }
  490. ;;
  /* The mask -65474 (0x...FFFF003E) restores only p1-p5 and p16-p63,  */
  /* so p6-p15, which steer this tail, keep their current values.      */
  491. { .mmi
  492. (p12) LDFPD f50, f51 = [X1]
  493. (p12) LDFPD f54, f55 = [X2]
  494. mov pr = PR, -65474
  495. }
  496. { .mmi
  497. (p12) adds X1 = 6 * SIZE, X1
  498. (p12) adds X2 = 6 * SIZE, X2
  499. (p13) adds XX = 4 * SIZE, XX
  500. }
  501. ;;
  502. { .mmi
  503. (p13) LDFPD f56, f57 = [X1], 2 * SIZE
  504. (p14) LDFPD f60, f61 = [XX], 2 * SIZE
  505. tbit.z p0, p15 = N, 0
  506. }
  507. ;;
  /* p9 (J == 0): no remainder, return before any tail store is issued. */
  508. { .mmb
  509. (p13) LDFPD f58, f59 = [X1], 2 * SIZE
  510. (p15) LDFD f62 = [XX]
  511. (p9) br.ret.sptk.many b0
  512. }
  513. ;;
  /* Stores. YY starts at Y1 and is advanced past the p12 and p13 groups */
  /* so that it addresses the p14 / p15 elements.                        */
  514. { .mmi
  515. (p12) STFD [Y1] = f48
  516. (p12) STFD [Y2] = f52
  517. mov YY = Y1
  518. }
  519. { .mmi
  520. (p12) add Y1 = INCY, Y1
  521. (p12) add Y2 = INCY, Y2
  522. nop.i 0
  523. }
  524. ;;
  525. { .mmi
  526. (p12) STFD [Y1] = f49
  527. (p12) STFD [Y2] = f53
  528. (p12) add Y1 = INCY, Y1
  529. }
  530. { .mmi
  531. (p12) add Y2 = INCY, Y2
  532. (p12) shladd YY = INCY, 3, YY
  533. nop.i 0
  534. }
  535. ;;
  536. { .mmi
  537. (p12) STFD [Y1] = f50
  538. (p12) STFD [Y2] = f54
  539. (p12) add Y1 = INCY, Y1
  540. }
  541. { .mmi
  542. (p12) add Y2 = INCY, Y2
  543. (p13) shladd YY = INCY, 2, YY
  544. nop.i 0
  545. }
  546. ;;
  /* After the eighth element Y1 skips the four written through Y2. */
  547. { .mmi
  548. (p12) STFD [Y1] = f51
  549. (p12) STFD [Y2] = f55
  550. nop.i 0
  551. }
  552. { .mmi
  553. (p12) add Y1 = INCY5, Y1
  554. (p12) add Y2 = INCY5, Y2
  555. nop.i 0
  556. }
  557. ;;
  /* p13 group goes through Y1; p14 / p15 elements go through YY. */
  558. { .mmi
  559. (p13) STFD [Y1] = f56
  560. (p14) STFD [YY] = f60
  561. nop.i 0
  562. }
  563. { .mmi
  564. (p13) add Y1 = INCY, Y1
  565. (p14) add YY = INCY, YY
  566. nop.i 0
  567. }
  568. ;;
  569. { .mmi
  570. (p13) STFD [Y1] = f57
  571. (p14) STFD [YY] = f61
  572. nop.i 0
  573. }
  574. { .mmi
  575. (p13) add Y1 = INCY, Y1
  576. (p14) add YY = INCY, YY
  577. nop.i 0
  578. }
  579. ;;
  580. { .mmi
  581. (p13) STFD [Y1] = f58
  582. (p15) STFD [YY] = f62
  583. (p13) add Y1 = INCY, Y1
  584. }
  585. ;;
  586. { .mib
  587. (p13) STFD [Y1] = f59
  588. nop.i 0
  589. br.ret.sptk.many b0
  590. }
  /* .L100 / .L110: general path taken when INCX != SIZE. Elements are  */
  /* loaded one at a time (LDFD) with stride INCX through X1 and X2 and */
  /* stored with stride INCY through Y1 and Y2, 16 per iteration.       */
  /* ar.ec = 6: loads run in stage p16 and stores in stage p21, five    */
  /* rotations later (f32 is stored from f37, f38 from f43, ...).       */
  591. .align 32
  592. ;;
  593. /* INCX != 1 */
  594. .L100:
  /* 16x strides for the prefetch pointers; p12 = bit 3 of N is set. */
  595. { .mmi
  596. shladd INCX16 = INCX, 4, r0
  597. shladd INCY16 = INCY, 4, r0
  598. tbit.z p0, p12 = N, 3
  599. }
  600. { .mmi
  601. nop.m 0
  602. nop.m 0
  603. nop.i 0
  604. }
  605. ;;
  606. { .mmi
  607. adds PREA = PREFETCH_SIZE * SIZE, X1
  608. adds PREB = PREFETCH_SIZE * SIZE, Y1
  609. mov ar.ec = 6
  610. }
  /* p8 = (I == 0): no full 16-element block; p9 = (J == 0): no      */
  /* remainder. I - 1 then becomes the loop count for ar.lc.         */
  611. { .mmi
  612. cmp.eq p8 ,p0 = r0, I
  613. cmp.eq p9, p0 = r0, J
  614. adds I = -1, I
  615. }
  616. ;;
  /* Store the element peeled at entry (p7); X2/Y2 run four strides ahead. */
  617. { .mmi
  618. (p7) STFD [YY] = f32
  619. shladd X2 = INCX, 2, X1
  620. mov ar.lc = I
  621. }
  /* p16 = 1 arms the first pipeline stage; with p8 skip straight to the tail. */
  622. { .mib
  623. shladd Y2 = INCY, 2, Y1
  624. cmp.eq p16, p0 = r0, r0
  625. (p8) br.cond.dpnt .L120
  626. }
  627. ;;
  628. .align 32
  /* Loop body. X1/Y1 handle elements 0-3 and 8-11 of a block, X2/Y2   */
  /* handle 4-7 and 12-15; the INCX5 / INCY5 steps skip the four       */
  /* elements owned by the other pointer.                              */
  629. .L110:
  630. { .mmi
  631. (p21) STFD [Y1] = f37
  632. (p21) STFD [Y2] = f61
  633. (p21) add Y1 = INCY, Y1
  634. }
  /* Prefetch source and destination (exclusive hint), 16 strides per step. */
  635. { .mmi
  636. (p16) lfetch.nt1 [PREA], INCX16
  637. (p16) lfetch.excl.nt1 [PREB], INCY16
  638. (p21) add Y2 = INCY, Y2
  639. }
  640. ;;
  641. { .mmi
  642. (p21) STFD [Y1] = f43
  643. (p21) STFD [Y2] = f67
  644. (p21) add Y1 = INCY, Y1
  645. }
  646. { .mmi
  647. (p16) LDFD f56 = [X2], INCX
  648. (p16) LDFD f32 = [X1], INCX
  649. (p21) add Y2 = INCY, Y2
  650. }
  651. ;;
  652. { .mmi
  653. (p21) STFD [Y1] = f49
  654. (p21) STFD [Y2] = f73
  655. (p21) add Y1 = INCY, Y1
  656. }
  657. { .mmi
  658. (p16) LDFD f38 = [X1], INCX
  659. (p16) LDFD f62 = [X2], INCX
  660. (p21) add Y2 = INCY, Y2
  661. }
  662. ;;
  663. { .mmi
  664. (p21) STFD [Y1] = f55
  665. (p21) STFD [Y2] = f79
  666. (p21) add Y1 = INCY5, Y1
  667. }
  668. { .mmi
  669. (p16) LDFD f44 = [X1], INCX
  670. (p16) LDFD f68 = [X2], INCX
  671. (p21) add Y2 = INCY5, Y2
  672. }
  673. ;;
  674. { .mmi
  675. (p21) STFD [Y1] = f85
  676. (p21) STFD [Y2] = f109
  677. (p21) add Y1 = INCY, Y1
  678. }
  /* Fourth load of each group: step five strides to elements 8 / 12. */
  679. { .mmi
  680. (p16) LDFD f50 = [X1], INCX5
  681. (p16) LDFD f74 = [X2], INCX5
  682. (p21) add Y2 = INCY, Y2
  683. }
  684. ;;
  685. { .mmi
  686. (p21) STFD [Y1] = f91
  687. (p21) STFD [Y2] = f115
  688. (p21) add Y1 = INCY, Y1
  689. }
  690. { .mmi
  691. (p16) LDFD f80 = [X1], INCX
  692. (p16) LDFD f104 = [X2], INCX
  693. (p21) add Y2 = INCY, Y2
  694. }
  695. ;;
  696. { .mmi
  697. (p21) STFD [Y1] = f97
  698. (p21) STFD [Y2] = f121
  699. (p21) add Y1 = INCY, Y1
  700. }
  701. { .mmi
  702. (p16) LDFD f86 = [X1], INCX
  703. (p16) LDFD f110 = [X2], INCX
  704. (p21) add Y2 = INCY, Y2
  705. }
  706. ;;
  /* Last stores of the block; both pointers move on to the next block. */
  707. { .mmi
  708. (p21) STFD [Y1] = f103
  709. (p21) STFD [Y2] = f127
  710. (p21) add Y1 = INCY5, Y1
  711. }
  712. { .mmi
  713. (p16) LDFD f92 = [X1], INCX
  714. (p16) LDFD f116 = [X2], INCX
  715. (p21) add Y2 = INCY5, Y2
  716. }
  717. ;;
  /* XX = X1 + 5 * INCX; it is read again by the tail code after the loop. */
  718. { .mmi
  719. nop.m 0
  720. (p16) add XX = INCX5, X1
  721. nop.i 0
  722. }
  723. { .mmb
  724. (p16) LDFD f98 = [X1], INCX5
  725. (p16) LDFD f122 = [X2], INCX5
  726. br.ctop.sptk.few .L110
  727. }
  728. ;;
  /* .L120: tail for the strided path. The low bits of N select how    */
  /* many elements remain: p12 (bit 3) copies 8, p13 (bit 2) copies 4, */
  /* p14 (bit 1) copies 2 and p15 (bit 0) copies 1. Loads step by INCX */
  /* and stores by INCY; ar.lc and the saved predicates are restored   */
  /* before returning.                                                 */
  729. .align 32
  730. .L120:
  /* p12: elements 0-3 through X1 and 4-7 through X2. */
  731. { .mmi
  732. (p12) LDFD f48 = [X1], INCX
  733. (p12) LDFD f52 = [X2], INCX
  734. mov ar.lc = ARLC
  735. }
  736. ;;
  /* The mask -65474 (0x...FFFF003E) restores only p1-p5 and p16-p63,  */
  /* so p6-p15, which steer this tail, keep their current values.      */
  737. { .mmi
  738. (p12) LDFD f49 = [X1], INCX
  739. (p12) LDFD f53 = [X2], INCX
  740. mov pr = PR, -65474
  741. }
  742. ;;
  743. { .mmi
  744. (p12) LDFD f50 = [X1], INCX
  745. (p12) LDFD f54 = [X2], INCX
  746. tbit.z p0, p13 = N, 2
  747. }
  /* p9 (J == 0): no remainder, return. */
  748. { .mmb
  749. nop.m 0
  750. nop.m 0
  751. (p9) br.ret.sptk.many b0
  752. }
  753. ;;
  /* Finish the p12 group; X1 skips the four elements read through X2 */
  /* and XX moves eight strides forward.                              */
  754. { .mmi
  755. (p12) LDFD f51 = [X1], INCX5
  756. (p12) LDFD f55 = [X2], INCX5
  757. (p12) shladd XX = INCX, 3, XX
  758. }
  759. ;;
  /* p13 group is read through X1; XX moves past it for p14 / p15. */
  760. { .mmi
  761. (p13) LDFD f56 = [X1], INCX
  762. (p13) shladd XX = INCX, 2, XX
  763. tbit.z p0, p14 = N, 1
  764. }
  765. ;;
  766. { .mmi
  767. (p13) LDFD f57 = [X1], INCX
  768. (p14) LDFD f60 = [XX], INCX
  769. }
  770. ;;
  771. { .mmi
  772. (p13) LDFD f58 = [X1], INCX
  773. (p14) LDFD f61 = [XX], INCX
  774. tbit.z p0, p15 = N, 0
  775. }
  776. ;;
  /* YY starts at Y1 and is advanced past the p12 and p13 groups below. */
  777. { .mmi
  778. (p13) LDFD f59 = [X1], INCX
  779. (p15) LDFD f62 = [XX]
  780. mov YY = Y1
  781. }
  782. ;;
  /* Stores for the p12 group, alternating between Y1 and Y2. */
  783. { .mmi
  784. (p12) STFD [Y1] = f48
  785. (p12) STFD [Y2] = f52
  786. nop.i 0
  787. }
  788. { .mmi
  789. (p12) add Y1 = INCY, Y1
  790. (p12) add Y2 = INCY, Y2
  791. nop.i 0
  792. }
  793. ;;
  794. { .mmi
  795. (p12) STFD [Y1] = f49
  796. (p12) STFD [Y2] = f53
  797. nop.i 0
  798. }
  799. { .mmi
  800. (p12) add Y1 = INCY, Y1
  801. (p12) add Y2 = INCY, Y2
  802. nop.i 0
  803. }
  804. ;;
  805. { .mmi
  806. (p12) STFD [Y1] = f50
  807. (p12) STFD [Y2] = f54
  808. nop.i 0
  809. }
  810. { .mmi
  811. (p12) add Y1 = INCY, Y1
  812. (p12) add Y2 = INCY, Y2
  813. nop.i 0
  814. }
  815. ;;
  /* After the eighth element Y1 skips the four written through Y2. */
  816. { .mmi
  817. (p12) STFD [Y1] = f51
  818. (p12) STFD [Y2] = f55
  819. (p12) add Y1 = INCY5, Y1
  820. }
  821. { .mmi
  822. (p12) add Y2 = INCY5, Y2
  823. (p12) shladd YY = INCY, 3, YY
  824. nop.i 0
  825. }
  826. ;;
  /* p13 group goes through Y1; p14 / p15 elements go through YY. */
  827. { .mmi
  828. (p13) STFD [Y1] = f56
  829. (p13) add Y1 = INCY, Y1
  830. (p13) shladd YY =INCY, 2, YY
  831. }
  832. ;;
  833. { .mmi
  834. (p13) STFD [Y1] = f57
  835. (p14) STFD [YY] = f60
  836. nop.i 0
  837. }
  838. { .mmi
  839. (p13) add Y1 = INCY, Y1
  840. (p14) add YY = INCY, YY
  841. nop.i 0
  842. }
  843. ;;
  844. { .mmi
  845. (p13) STFD [Y1] = f58
  846. (p14) STFD [YY] = f61
  847. nop.i 0
  848. }
  849. { .mmi
  850. (p13) add Y1 = INCY, Y1
  851. (p14) add YY = INCY, YY
  852. nop.i 0
  853. }
  854. ;;
  855. { .mmb
  856. (p13) STFD [Y1] = f59
  857. (p15) STFD [YY] = f62
  858. br.ret.sptk.many b0
  859. }
  860. ;;
  861. EPILOGUE