You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

symv_U.S 10 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* ASSEMBLER is defined before common.h is included; presumably it */
  /* selects the assembler-side definitions there (SIZE, BASE_SHIFT, */
  /* LDFD, FMA, PROLOGUE, ... are not defined in this file) - confirm. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Stack pointer. */
  40. #define SP r12
  /* Incoming arguments, held in r32-r39 (the routine's alloc declares */
  /* 8 input registers). BUFFER aliases r33 but is overwritten with a */
  /* value loaded from the stack at routine entry. */
  41. #define M r32
  42. #define A r34
  43. #define LDA r35
  44. #define X r36
  45. #define INCX r37
  46. #define Y r38
  47. #define INCY r39
  48. #define BUFFER r33
  /* I: loop trip counter; IS: index of the first row/column of the */
  /* current group of four columns. */
  49. #define I r14
  50. #define IS r15
  /* Running pointers into the four columns of the current group. */
  51. #define A1 r16
  52. #define A2 r17
  53. #define A3 r18
  54. #define A4 r19
  /* Base addresses of the x and y vectors as used by the main loop. */
  55. #define NEW_X r20
  56. #define NEW_Y r21
  /* XX / YY: load cursors for x and y; YYS: store cursor for y; */
  /* TEMP: scratch address / compare value. */
  57. #define XX r22
  58. #define YY r23
  59. #define TEMP r24
  60. #define YYS r25
  /* Prefetch cursors for the four columns, kept in local stacked */
  /* registers. */
  61. #define PREA1 loc0
  62. #define PREA2 loc1
  63. #define PREA3 loc2
  64. #define PREA4 loc3
  /* A11..A41 are defined but not referenced anywhere in this listing. */
  65. #define A11 loc4
  66. #define A21 loc5
  67. #define A31 loc6
  68. #define A41 loc7
  /* Prefetch cursors for x and y. They share r8/r9 with the spill */
  /* pointers used at entry and with the return value / fill pointer */
  /* used at exit. */
  69. #define PREX r8
  70. #define PREY r9
  /* Saved copies of ar.lc, the predicate registers and ar.pfs. */
  71. #define ARLC r29
  72. #define PR r30
  73. #define ARPFS r31
  /* Prefetch distance in elements (multiplied by SIZE where it is */
  /* used): 52 when DOUBLE is defined, 64 otherwise. */
  74. #ifdef DOUBLE
  75. #define RPREFETCH (16 * 3 + 4)
  76. #else
  77. #define RPREFETCH (16 * 3 + 16)
  78. #endif
  /* Line-prefetch mnemonics; the .excl form is the one applied to the */
  /* y vector, which the loop also stores to. */
  79. #define PREFETCH lfetch.nt1
  80. #define PREFETCHW lfetch.excl.nt1
  /* alpha: scalar multiplier, read from f8. */
  81. #define alpha f8
  /* atemp1..4: alpha times the four x entries of the current group. */
  82. #define atemp1 f6
  83. #define atemp2 f7
  84. #define atemp3 f10
  85. #define atemp4 f11
  /* xsum1..4: per-column dot-product accumulators for the current group. */
  86. #define xsum1 f12
  87. #define xsum2 f13
  88. #define xsum3 f14
  89. #define xsum4 f15
  /* Symmetric matrix-vector update kernel; only entries on or above */
  /* the diagonal of each column of A are read. */
  /* Columns are processed four at a time starting at index IS: */
  /* - atemp1..4 = alpha * x[IS..IS+3] */
  /* - for every row i above the group: y[i] += atemp_k * A(i, IS+k) */
  /* and xsum_k += x[i] * A(i, IS+k) (loop .L22) */
  /* - the 4x4 diagonal block is applied and y[IS..IS+3] += xsum (.L28) */
  /* Returns 0 in r8. */
  /* NOTE(review): both stride checks below branch to the label that */
  /* immediately follows them, so X and Y are used in place and walked */
  /* with 1 * SIZE post-increments whatever INCX / INCY are; BUFFER is */
  /* loaded but not referenced again in this listing. */
  /* NOTE(review): .L30 falls straight through to the exit, so when M */
  /* is not a multiple of 4 the trailing rows/columns are not processed */
  /* by the code shown here - confirm whether that is intended. */
  90. PROLOGUE
  91. .prologue
  92. PROFCODE
  /* Register frame: 8 inputs, 16 locals, 8 outputs; keep ar.pfs and */
  /* ar.lc so they can be restored at .L999. */
  93. { .mmi
  94. .save ar.pfs, ARPFS
  95. alloc ARPFS = ar.pfs, 8, 16, 8, 0
  96. mov ARLC = ar.lc
  97. }
  98. ;;
  /* Save the predicates; r14 = address of the stack slot at entry */
  /* SP + 16, taken before SP is lowered. */
  99. mov PR = pr
  100. adds r14 = 16, SP
  101. ;;
  /* Reserve 8 * 16 bytes below SP and spill f16-f23 into it, using */
  /* r8 / r9 as two interleaved store pointers (stride 32). */
  102. adds r8 = -8 * 16, SP
  103. adds r9 = -7 * 16, SP
  104. adds SP = -8 * 16, SP
  105. ;;
  106. stf.spill [r8] = f16, 32
  107. stf.spill [r9] = f17, 32
  108. ;;
  109. stf.spill [r8] = f18, 32
  110. stf.spill [r9] = f19, 32
  111. ;;
  112. stf.spill [r8] = f20, 32
  113. stf.spill [r9] = f21, 32
  114. ;;
  115. stf.spill [r8] = f22
  116. stf.spill [r9] = f23
  117. .body
  118. ;;
  /* BUFFER comes from the stack slot addressed above. */
  119. ld8 BUFFER = [r14]
  120. ;;
  /* Scale LDA, INCX and INCY by shifting left by BASE_SHIFT */
  /* (presumably elements -> bytes; BASE_SHIFT is defined elsewhere). */
  121. shladd LDA = LDA, BASE_SHIFT, r0
  122. shladd INCX = INCX, BASE_SHIFT, r0
  123. shladd INCY = INCY, BASE_SHIFT, r0
  124. ;;
  /* Nothing to do when M <= 0. */
  125. cmp.ge p7, p0 = 0, M
  126. ;;
  127. (p7) br.cond.dpnt .L999
  128. ;;
  /* x is used in place; the branch target is the next instruction, */
  /* so both outcomes of the stride test continue at .L10. */
  129. mov NEW_X = X
  130. cmp.eq p10, p0 = SIZE, INCX
  131. (p10) br.cond.dptk .L10
  132. ;;
  133. .L10:
  /* Same pattern for y: both outcomes continue at .L20. */
  134. mov NEW_Y = Y
  135. cmp.eq p10, p0 = SIZE, INCY
  136. (p10) br.cond.dptk .L20
  137. ;;
  138. .L20:
  /* Start at column 0; skip the main loop entirely when M < 4. */
  139. mov IS = 0
  140. cmp.gt p10, p0 = 4, M
  141. (p10) br.cond.dpnt .L30
  142. ;;
  143. .L21:
  /* Column pointers for the current group: A1..A4 = A + {0,1,2,3} * LDA, */
  /* then advance A by 4 * LDA for the next group. */
  144. mov A1 = A
  145. add A2 = LDA, A
  146. ;;
  147. shladd A3 = LDA, 1, A
  148. shladd A4 = LDA, 1, A2
  149. shladd A = LDA, 2, A
  150. ;;
  151. ;;
  /* Prefetch cursors start RPREFETCH elements ahead of each stream. */
  152. adds PREX = RPREFETCH * SIZE, NEW_X
  153. adds PREY = RPREFETCH * SIZE, NEW_Y
  154. adds PREA1 = RPREFETCH * SIZE, A1
  155. adds PREA2 = RPREFETCH * SIZE, A2
  156. adds PREA3 = RPREFETCH * SIZE, A3
  157. adds PREA4 = RPREFETCH * SIZE, A4
  158. ;;
  /* TEMP = &x[IS]; load x[IS..IS+3] and pre-multiply by alpha. */
  159. shladd TEMP = IS, BASE_SHIFT, NEW_X
  160. ;;
  161. LDFD atemp1 = [TEMP], 1 * SIZE
  162. ;;
  163. LDFD atemp2 = [TEMP], 1 * SIZE
  164. ;;
  165. LDFD atemp3 = [TEMP], 1 * SIZE
  166. ;;
  167. LDFD atemp4 = [TEMP], 1 * SIZE
  168. ;;
  169. FMPY atemp1 = alpha, atemp1
  170. FMPY atemp2 = alpha, atemp2
  171. FMPY atemp3 = alpha, atemp3
  172. FMPY atemp4 = alpha, atemp4
  173. ;;
  /* Clear the four column accumulators (f0 reads as 0.0). */
  174. mov xsum1 = f0
  175. mov xsum2 = f0
  176. mov xsum3 = f0
  177. mov xsum4 = f0
  178. ;;
  /* Rewind the x / y cursors to the start of the vectors. */
  179. mov XX = NEW_X
  180. mov YY = NEW_Y
  181. mov YYS = NEW_Y
  182. ;;
  /* I = IS / 4 = number of 4-row groups above the diagonal block. */
  /* Clear the rotating predicates, then set p16 to start stage 1. */
  183. shr I = IS, 2
  184. mov pr.rot = 0
  185. ;;
  186. mov ar.ec = 3
  187. cmp.eq p16, p0 = r0, r0
  188. ;;
  /* No rows above the block (first group): go straight to .L28. */
  /* Otherwise ar.lc = I - 1. */
  189. cmp.eq p6, p0 = 0, I
  190. adds I = -1, I
  191. ;;
  192. mov ar.lc = I
  193. (p6) br.cond.dpnt .L28
  194. ;;
  /* Software-pipelined loop, one 4-row group per iteration. */
  /* Stages: p16 loads A, y and x[0..2]; p17 loads x[3]; p18 does the */
  /* multiply-adds; p19 stores y. Because the FP registers rotate, a */
  /* value loaded into fN under p16 is read as fN+2 under p18 and the */
  /* updated y value is stored as fN+3 under p19 (f92 -> f94 -> f95). */
  /* p18 operands: x[0..3] = f82, f85, f88, f91; */
  /* column 1 rows 0..3 = f34, f37, f40, f43; column 2 = f46, f49, f52, f55; */
  /* column 3 = f58, f61, f64, f67; column 4 = f70, f73, f76, f79; */
  /* y[0..3] accumulate in f94, f98, f102, f106. */
  195. .align 16
  196. .L22:
  197. { .mmf
  198. (p16) LDFPD f32, f35 = [A1], 2 * SIZE
  199. (p19) STFD [YYS] = f95, 1 * SIZE
  200. (p18) FMA xsum1 = f82, f34, xsum1
  201. }
  202. { .mmf
  203. (p18) FMA f94 = atemp1, f34, f94
  204. }
  205. ;;
  /* x[3] of the group whose x[0..2] were loaded one rotation earlier. */
  206. { .mmf
  207. (p17) LDFD f90 = [XX], 1 * SIZE
  208. (p18) FMA xsum2 = f82, f46, xsum2
  209. }
  210. { .mmf
  211. (p18) FMA f98 = atemp1, f37, f98
  212. }
  213. ;;
  214. { .mmf
  215. (p16) LDFPD f44, f47 = [A2], 2 * SIZE
  216. (p19) STFD [YYS] = f99, 1 * SIZE
  217. (p18) FMA xsum3 = f82, f58, xsum3
  218. }
  219. { .mmf
  220. (p18) FMA f102 = atemp1, f40, f102
  221. }
  222. ;;
  223. { .mmf
  224. (p16) PREFETCHW [PREY], 4 * SIZE
  225. (p16) LDFD f92 = [YY], 1 * SIZE
  226. (p18) FMA xsum4 = f82, f70, xsum4
  227. }
  228. { .mmf
  229. (p18) FMA f106 = atemp1, f43, f106
  230. }
  231. ;;
  232. { .mmf
  233. (p16) LDFPD f56, f59 = [A3], 2 * SIZE
  234. (p19) STFD [YYS] = f103, 1 * SIZE
  235. (p18) FMA xsum1 = f85, f37, xsum1
  236. }
  237. { .mmf
  238. (p18) FMA f94 = atemp2, f46, f94
  239. }
  240. ;;
  241. { .mmf
  242. (p16) LDFD f96 = [YY], 1 * SIZE
  243. (p18) FMA xsum2 = f85, f49, xsum2
  244. }
  245. { .mmf
  246. (p18) FMA f98 = atemp2, f49, f98
  247. }
  248. ;;
  249. { .mmf
  250. (p16) LDFPD f68, f71 = [A4], 2 * SIZE
  251. (p19) STFD [YYS] = f107, 1 * SIZE
  252. (p18) FMA xsum3 = f85, f61, xsum3
  253. }
  254. { .mmf
  255. (p18) FMA f102 = atemp2, f52, f102
  256. }
  257. ;;
  258. { .mmf
  259. (p16) LDFD f100 = [YY], 1 * SIZE
  260. (p18) FMA xsum4 = f85, f73, xsum4
  261. }
  262. { .mmf
  263. (p18) FMA f106 = atemp2, f55, f106
  264. }
  265. ;;
  /* Second pair of rows (2 and 3) of each column. */
  266. { .mmf
  267. (p16) PREFETCH [PREA1], 4 * SIZE
  268. (p16) LDFPD f38, f41 = [A1], 2 * SIZE
  269. (p18) FMA xsum1 = f88, f40, xsum1
  270. }
  271. { .mmf
  272. (p18) FMA f94 = atemp3, f58, f94
  273. }
  274. ;;
  275. { .mmf
  276. (p16) LDFD f104 = [YY], 1 * SIZE
  277. (p18) FMA xsum2 = f88, f52, xsum2
  278. }
  279. { .mmf
  280. (p18) FMA f98 = atemp3, f61, f98
  281. }
  282. ;;
  283. { .mmf
  284. (p16) PREFETCH [PREA2], 4 * SIZE
  285. (p16) LDFPD f50, f53 = [A2], 2 * SIZE
  286. (p18) FMA xsum3 = f88, f64, xsum3
  287. }
  288. { .mmf
  289. (p18) FMA f102 = atemp3, f64, f102
  290. }
  291. ;;
  292. { .mmf
  293. (p16) PREFETCH [PREX], 4 * SIZE
  294. (p16) LDFD f80 = [XX], 1 * SIZE
  295. (p18) FMA xsum4 = f88, f76, xsum4
  296. }
  297. { .mmf
  298. (p18) FMA f106 = atemp3, f67, f106
  299. }
  300. ;;
  301. { .mmf
  302. (p16) PREFETCH [PREA3], 4 * SIZE
  303. (p16) LDFPD f62, f65 = [A3], 2 * SIZE
  304. (p18) FMA xsum1 = f91, f43, xsum1
  305. }
  306. { .mmf
  307. (p18) FMA f94 = atemp4, f70, f94
  308. }
  309. ;;
  310. { .mmf
  311. (p16) LDFD f83 = [XX], 1 * SIZE
  312. (p18) FMA xsum2 = f91, f55, xsum2
  313. }
  314. { .mmf
  315. (p18) FMA f98 = atemp4, f73, f98
  316. }
  317. ;;
  318. { .mmf
  319. (p16) PREFETCH [PREA4], 4 * SIZE
  320. (p16) LDFPD f74, f77 = [A4], 2 * SIZE
  321. (p18) FMA xsum3 = f91, f67, xsum3
  322. }
  323. { .mmf
  324. (p18) FMA f102 = atemp4, f76, f102
  325. }
  326. ;;
  327. { .mmf
  328. (p16) LDFD f86 = [XX], 1 * SIZE
  329. (p18) FMA xsum4 = f91, f79, xsum4
  330. }
  331. { .mfb
  332. (p18) FMA f106 = atemp4, f79, f106
  333. br.ctop.sptk.few .L22
  334. }
  335. ;;
  /* ar.ec = 3 drains the loop only through the p18 stage, so the */
  /* p19 stores of the last 4-row group are issued here. */
  336. (p19) STFD [YYS] = f95, 1 * SIZE
  337. ;;
  338. (p19) STFD [YYS] = f99, 1 * SIZE
  339. ;;
  340. (p19) STFD [YYS] = f103, 1 * SIZE
  341. ;;
  342. (p19) STFD [YYS] = f107, 1 * SIZE
  343. ;;
  344. ;;
  345. .align 16
  346. .L28:
  /* Diagonal 4x4 block. The sums gathered above it are scaled by */
  /* alpha first; the block terms added below use the pre-scaled atemp. */
  347. FMPY xsum1 = alpha, xsum1
  348. FMPY xsum2 = alpha, xsum2
  349. FMPY xsum3 = alpha, xsum3
  350. FMPY xsum4 = alpha, xsum4
  351. ;;
  /* Load four rows of each column: row r of columns 1..4 lands in */
  /* f(64+4r) .. f(67+4r). */
  352. LDFD f64 = [A1], 1 * SIZE
  353. LDFD f65 = [A2], 1 * SIZE
  354. LDFD f66 = [A3], 1 * SIZE
  355. LDFD f67 = [A4], 1 * SIZE
  356. ;;
  357. LDFD f68 = [A1], 1 * SIZE
  358. LDFD f69 = [A2], 1 * SIZE
  359. LDFD f70 = [A3], 1 * SIZE
  360. LDFD f71 = [A4], 1 * SIZE
  361. ;;
  362. LDFD f72 = [A1], 1 * SIZE
  363. LDFD f73 = [A2], 1 * SIZE
  364. LDFD f74 = [A3], 1 * SIZE
  365. LDFD f75 = [A4], 1 * SIZE
  366. ;;
  367. LDFD f76 = [A1], 1 * SIZE
  368. LDFD f77 = [A2], 1 * SIZE
  369. LDFD f78 = [A3], 1 * SIZE
  370. LDFD f79 = [A4], 1 * SIZE
  371. ;;
  /* Only the on/above-diagonal values (f64-f67, f69-f71, f74, f75, f79) */
  /* are used; each off-diagonal one appears in two sums, which */
  /* supplies the mirrored below-diagonal term. */
  372. FMA xsum1 = atemp1, f64, xsum1
  373. FMA xsum2 = atemp1, f65, xsum2
  374. FMA xsum3 = atemp1, f66, xsum3
  375. FMA xsum4 = atemp1, f67, xsum4
  376. ;;
  377. FMA xsum1 = atemp2, f65, xsum1
  378. FMA xsum2 = atemp2, f69, xsum2
  379. FMA xsum3 = atemp2, f70, xsum3
  380. FMA xsum4 = atemp2, f71, xsum4
  381. ;;
  382. FMA xsum1 = atemp3, f66, xsum1
  383. FMA xsum2 = atemp3, f70, xsum2
  384. FMA xsum3 = atemp3, f74, xsum3
  385. FMA xsum4 = atemp3, f75, xsum4
  386. ;;
  387. FMA xsum1 = atemp4, f67, xsum1
  388. FMA xsum2 = atemp4, f71, xsum2
  389. FMA xsum3 = atemp4, f75, xsum3
  390. FMA xsum4 = atemp4, f79, xsum4
  391. ;;
  /* y[IS..IS+3] += xsum1..4 (YY / YYS point at y[IS] here). */
  392. LDFD f36 = [YY], 1 * SIZE
  393. ;;
  394. LDFD f37 = [YY], 1 * SIZE
  395. ;;
  396. LDFD f38 = [YY], 1 * SIZE
  397. ;;
  398. LDFD f39 = [YY], 1 * SIZE
  399. ;;
  400. FADD f36 = f36, xsum1
  401. FADD f37 = f37, xsum2
  402. FADD f38 = f38, xsum3
  403. FADD f39 = f39, xsum4
  404. ;;
  405. STFD [YYS] = f36, 1 * SIZE
  406. ;;
  407. STFD [YYS] = f37, 1 * SIZE
  408. ;;
  409. STFD [YYS] = f38, 1 * SIZE
  410. ;;
  411. STFD [YYS] = f39, 1 * SIZE
  412. ;;
  /* Next group of four columns; continue while IS + 4 <= M. */
  413. adds IS = 4, IS
  414. ;;
  415. adds TEMP = 4, IS
  416. ;;
  417. cmp.le p6, p0 = TEMP, M
  418. ;;
  419. (p6) br.cond.dpnt .L21
  420. ;;
  421. .L30:
  422. .L990:
  423. .L999:
  /* Common exit: return value r8 = 0; refill f16-f23 through SP and */
  /* r9 (the four 32-byte post-increments bring SP back to its entry */
  /* value) and restore ar.lc, the predicates and ar.pfs. */
  424. mov r8 = r0
  425. adds r9 = 1 * 16, SP
  426. ;;
  427. ldf.fill f16 = [SP], 32
  428. ldf.fill f17 = [r9], 32
  429. mov ar.lc = ARLC
  430. ;;
  431. ldf.fill f18 = [SP], 32
  432. ldf.fill f19 = [r9], 32
  433. mov pr = PR, -1
  434. ;;
  435. ldf.fill f20 = [SP], 32
  436. ldf.fill f21 = [r9], 32
  437. mov ar.pfs = ARPFS
  438. ;;
  439. ldf.fill f22 = [SP], 32
  440. ldf.fill f23 = [r9]
  441. br.ret.sptk.many b0
  442. ;;
  443. EPILOGUE