You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zaxpy.S 12 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #include "version.h"
  /* PREFETCHSIZE: look-ahead distance, in units of SIZE, used by the two
     zero-register loads at the top of $MainLoop. */
  41. #define PREFETCHSIZE 40
  /* ADD1 combines the two products that form the first stored value of each
     result pair (offset 0*SIZE); ADD2 combines the two products that form the
     second stored value (offset 1*SIZE).
     Without CONJ: ADD1 = SUB, ADD2 = ADD.  With CONJ the two are swapped.
     SUB and ADD themselves are defined outside this file (presumably in
     common.h as the floating-point subtract/add for the build precision). */
  42. #ifndef CONJ
  43. #define ADD1 SUB
  44. #define ADD2 ADD
  45. #else
  46. #define ADD1 ADD
  47. #define ADD2 SUB
  48. #endif
  /*
   * Routine entry.  For every pair (x0, x1) read through $18 and (y0, y1)
   * read through $20 the code below computes and stores back through $20:
   *     y0 = ($f29*x0 ADD1 $f30*x1) + y0
   *     y1 = ($f30*x0 ADD2 $f29*x1) + y1
   *
   * Register / stack roles as used by the visible code:
   *     $16         element (pair) count; <= 0 returns immediately
   *     $f19, $f20  scalar multipliers, copied to $f29 / $f30
   *     $21         x pointer on entry, moved to $18
   *     0($sp)      x stride, loaded into $19
   *     8($sp)      y pointer, loaded into $20
   *     16($sp)     y stride, loaded into $21 (after $21 was copied to $18)
   *     $4          count / 4  (groups of four pairs)
   *     $5          count & 3  (left-over pairs)
   *
   * PROLOGUE, PROFCODE, LD and SIZE are macros defined outside this file
   * (presumably common.h); LD is assumed to be the floating-point load for
   * the build precision -- TODO confirm.
   */
  49. PROLOGUE
  50. PROFCODE
  51. .frame $sp, 16, $26, 0 /* NOTE(review): declares a 16-byte frame, but 64 bytes are allocated by the lda below -- confirm */
  52. ldl $19, 0($sp) /* x stride, read from the caller's stack area */
  53. fmov $f19, $f29 /* keep first multiplier in $f29; $f19 is reused as a temporary later */
  54. ldq $20, 8($sp) /* y pointer */
  55. fmov $f20, $f30 /* keep second multiplier in $f30; $f20 is reused as a temporary later */
  56. mov $21, $18 /* x pointer -> $18, freeing $21 */
  57. ldl $21, 16($sp) /* y stride */
  58. lda $sp, -64($sp) /* allocate 64 bytes; 56 are used for $f2-$f8 */
  59. nop
  60. stt $f2, 0($sp) /* $f2-$f8 are overwritten below and restored at $End / $SubEnd */
  61. cmpeq $19, 1, $1 /* $1 = (x stride == 1) */
  62. stt $f3, 8($sp)
  63. cmpeq $21, 1, $2 /* $2 = (y stride == 1) */
  64. stt $f4, 16($sp)
  65. and $16, 3, $5 /* $5 = count mod 4 */
  66. stt $f5, 24($sp)
  67. stt $f6, 32($sp)
  68. stt $f7, 40($sp)
  69. stt $f8, 48($sp)
  70. #ifndef PROFILE
  71. .prologue 0
  72. #else
  73. .prologue 1
  74. #endif
  75. and $1, $2, $1 /* $1 != 0 only when both strides are 1 */
  76. ble $16, $End /* nothing to do for count <= 0 */
  77. sra $16, 2, $4 /* $4 = number of four-pair groups */
  78. beq $1, $Sub /* any non-unit stride: take the strided path */
  79. ble $4, $Remain /* fewer than four pairs: only the remainder loop */
  80. subq $4, 1, $4 /* last group is handled by $MainLoopEnd, not the loop */
  /* Preload the first group: four x pairs into $f0-$f7 ... */
  81. LD $f0, 0*SIZE($18)
  82. LD $f1, 1*SIZE($18)
  83. LD $f2, 2*SIZE($18)
  84. LD $f3, 3*SIZE($18)
  85. LD $f4, 4*SIZE($18)
  86. LD $f5, 5*SIZE($18)
  87. LD $f6, 6*SIZE($18)
  88. LD $f7, 7*SIZE($18)
  /* ... and four y pairs into $f8, $f28, $f10-$f15 ($f9 is not used). */
  89. LD $f8, 0*SIZE($20)
  90. LD $f28, 1*SIZE($20)
  91. LD $f10, 2*SIZE($20)
  92. LD $f11, 3*SIZE($20)
  93. LD $f12, 4*SIZE($20)
  94. LD $f13, 5*SIZE($20)
  95. LD $f14, 6*SIZE($20)
  96. LD $f15, 7*SIZE($20)
  97. addq $18, 8*SIZE, $18 /* x cursor now points at the next group */
  98. ble $4, $MainLoopEnd /* exactly one group: skip the pipelined loop */
  99. .align 4
  /*
   * Unit-stride main loop, one group of four pairs per iteration.
   * On entry to each iteration $f0-$f7 hold the current x group and
   * $f8, $f28, $f10-$f15 the current y group.  While the current group is
   * being computed the next x group is loaded from 0..7*SIZE($18) and the
   * next y group from 0..7*SIZE($20) (after $20 has been advanced), so the
   * results of the current group are stored at negative offsets from $20.
   * $f20-$f27 hold products, $f16-$f19 hold sums.  $4 counts remaining
   * iterations.  unop/nop lines perform no work (presumably issue-slot
   * padding).
   */
  100. $MainLoop:
  101. ldt $f31, PREFETCHSIZE * SIZE($20) /* load into zero register $f31: result discarded, presumably a prefetch of y */
  102. ldl $31, PREFETCHSIZE * SIZE($18) /* load into zero register $31: result discarded, presumably a prefetch of x */
  103. MUL $f29, $f0, $f20
  104. LD $f31, 9*SIZE($18) /* another discarded load (destination $f31) */
  105. MUL $f30, $f1, $f21
  106. unop
  107. MUL $f30, $f0, $f22
  108. LD $f0, 0*SIZE($18) /* $f0 is free after its last product: reload for next group */
  109. MUL $f29, $f1, $f23
  110. LD $f1, 1*SIZE($18)
  111. MUL $f29, $f2, $f24
  112. unop
  113. MUL $f30, $f3, $f25
  114. nop
  115. MUL $f30, $f2, $f26
  116. LD $f2, 2*SIZE($18)
  117. MUL $f29, $f3, $f27
  118. LD $f3, 3*SIZE($18)
  119. ADD1 $f20, $f21, $f16 /* pair 0, first value (before adding y) */
  120. MUL $f29, $f4, $f20
  121. ADD2 $f22, $f23, $f17 /* pair 0, second value */
  122. MUL $f30, $f5, $f21
  123. ADD1 $f24, $f25, $f18 /* pair 1, first value */
  124. unop
  125. MUL $f30, $f4, $f22
  126. LD $f4, 4*SIZE($18)
  127. ADD2 $f26, $f27, $f19 /* pair 1, second value */
  128. addq $20, 8*SIZE, $20 /* advance y cursor; current group is now at -8..-1*SIZE */
  129. MUL $f29, $f5, $f23
  130. LD $f5, 5*SIZE($18)
  131. ADD $f16, $f8, $f16 /* add old y, then reload the register with the next group's y */
  132. LD $f8, 0*SIZE($20)
  133. MUL $f29, $f6, $f24
  134. unop
  135. ADD $f17, $f28, $f17
  136. LD $f28, 1*SIZE($20)
  137. MUL $f30, $f7, $f25
  138. unop
  139. ADD $f18, $f10, $f18
  140. LD $f10, 2*SIZE($20)
  141. MUL $f30, $f6, $f26
  142. LD $f6, 6*SIZE($18)
  143. ADD $f19, $f11, $f19
  144. LD $f11, 3*SIZE($20)
  145. MUL $f29, $f7, $f27
  146. LD $f7, 7*SIZE($18)
  147. ST $f16,-8*SIZE($20) /* store pairs 0 and 1 of the current group */
  148. ADD1 $f20, $f21, $f16 /* pair 2, first value */
  149. ST $f17,-7*SIZE($20)
  150. ADD2 $f22, $f23, $f17 /* pair 2, second value */
  151. ST $f18,-6*SIZE($20)
  152. ADD1 $f24, $f25, $f18 /* pair 3, first value */
  153. ST $f19,-5*SIZE($20)
  154. ADD2 $f26, $f27, $f19 /* pair 3, second value */
  155. ADD $f16, $f12, $f16
  156. LD $f12, 4*SIZE($20)
  157. ADD $f17, $f13, $f17
  158. LD $f13, 5*SIZE($20)
  159. ADD $f18, $f14, $f18
  160. LD $f14, 6*SIZE($20)
  161. ADD $f19, $f15, $f19
  162. LD $f15, 7*SIZE($20)
  163. ST $f16,-4*SIZE($20) /* store pairs 2 and 3 of the current group */
  164. addq $18, 8*SIZE, $18 /* advance x cursor past the group just loaded */
  165. ST $f17,-3*SIZE($20)
  166. subq $4, 1, $4
  167. ST $f18,-2*SIZE($20)
  168. nop
  169. ST $f19,-1*SIZE($20)
  170. bgt $4, $MainLoop
  171. .align 4
  /*
   * Drain stage of the unit-stride main loop: computes and stores the last
   * four-pair group already held in $f0-$f7 (x) and $f8, $f28, $f10-$f15 (y).
   * No further loads are issued here, so results are stored at 0..7*SIZE($20)
   * and $20 is advanced afterwards.  Falls through to $Remain when $5 > 0.
   */
  172. $MainLoopEnd:
  173. MUL $f29, $f0, $f20
  174. MUL $f30, $f1, $f21
  175. MUL $f30, $f0, $f22
  176. MUL $f29, $f1, $f23
  177. MUL $f29, $f2, $f24
  178. MUL $f30, $f3, $f25
  179. MUL $f30, $f2, $f26
  180. MUL $f29, $f3, $f27
  181. ADD1 $f20, $f21, $f16 /* pair 0, first value */
  182. MUL $f29, $f4, $f20
  183. ADD2 $f22, $f23, $f17 /* pair 0, second value */
  184. MUL $f30, $f5, $f21
  185. ADD1 $f24, $f25, $f18 /* pair 1, first value */
  186. MUL $f30, $f4, $f22
  187. ADD2 $f26, $f27, $f19 /* pair 1, second value */
  188. MUL $f29, $f5, $f23
  189. ADD $f16, $f8, $f16 /* add the old y values */
  190. MUL $f29, $f6, $f24
  191. ADD $f17, $f28, $f17
  192. MUL $f30, $f7, $f25
  193. ADD $f18, $f10, $f18
  194. MUL $f30, $f6, $f26
  195. ADD $f19, $f11, $f19
  196. MUL $f29, $f7, $f27
  197. ST $f16, 0*SIZE($20)
  198. ADD1 $f20, $f21, $f16 /* pair 2, first value */
  199. ST $f17, 1*SIZE($20)
  200. ADD2 $f22, $f23, $f17 /* pair 2, second value */
  201. ST $f18, 2*SIZE($20)
  202. ADD1 $f24, $f25, $f18 /* pair 3, first value */
  203. ST $f19, 3*SIZE($20)
  204. ADD2 $f26, $f27, $f19 /* pair 3, second value */
  205. ADD $f16, $f12, $f16
  206. ADD $f17, $f13, $f17
  207. ADD $f18, $f14, $f18
  208. ADD $f19, $f15, $f19
  209. ST $f16, 4*SIZE($20)
  210. ST $f17, 5*SIZE($20)
  211. ST $f18, 6*SIZE($20)
  212. ST $f19, 7*SIZE($20)
  213. unop
  214. addq $20, 8*SIZE, $20 /* y cursor past the group just stored */
  215. unop
  216. ble $5, $End /* no left-over pairs: done */
  217. .align 4
  /*
   * Unit-stride remainder setup: $5 (count & 3) pairs are left.
   * Reached by fall-through from $MainLoopEnd or directly from the entry
   * code when there is no full group, which is why $5 is tested again here.
   * Preloads one x pair ($f0, $f1) and one y pair ($f8, $f28).
   */
  218. $Remain:
  219. subq $5, 1, $6 /* $6 = loop iterations; the final pair is handled by $RemainLoopEnd */
  220. ble $5, $End
  221. LD $f0, 0*SIZE($18)
  222. LD $f1, 1*SIZE($18)
  223. LD $f8, 0*SIZE($20)
  224. LD $f28, 1*SIZE($20)
  225. addq $18, 2*SIZE, $18 /* x cursor points at the next pair */
  226. ble $6, $RemainLoopEnd /* only one pair left: skip the loop */
  227. .align 4
  /*
   * Unit-stride remainder loop, one pair per iteration.
   * Computes the pair held in $f0/$f1 (x) and $f8/$f28 (y) while loading the
   * next pair; $20 is advanced first, so the current result is stored at
   * -2 and -1*SIZE($20).  $6 counts remaining iterations.
   */
  228. $RemainLoop:
  229. MUL $f29, $f0, $f20
  230. subq $6, 1, $6
  231. MUL $f30, $f1, $f21
  232. addq $20, 2*SIZE, $20 /* advance y cursor to the next pair */
  233. MUL $f30, $f0, $f22
  234. LD $f0, 0*SIZE($18) /* next x pair */
  235. MUL $f29, $f1, $f23
  236. LD $f1, 1*SIZE($18)
  237. ADD1 $f20, $f21, $f16 /* first value */
  238. ADD2 $f22, $f23, $f17 /* second value */
  239. ADD $f16, $f8, $f16 /* add old y, then reload with the next y pair */
  240. LD $f8, 0*SIZE($20)
  241. ADD $f17, $f28, $f17
  242. LD $f28, 1*SIZE($20)
  243. ST $f16,-2*SIZE($20)
  244. addq $18, 2*SIZE, $18
  245. ST $f17,-1*SIZE($20)
  246. bgt $6, $RemainLoop
  247. .align 4
  /*
   * Final unit-stride pair: computes the pair already held in $f0/$f1 (x)
   * and $f8/$f28 (y) and stores it at 0 and 1*SIZE($20).  No further loads.
   * Falls through to $End.
   */
  248. $RemainLoopEnd:
  249. MUL $f29, $f0, $f20
  250. MUL $f30, $f1, $f21
  251. MUL $f30, $f0, $f22
  252. MUL $f29, $f1, $f23
  253. ADD1 $f20, $f21, $f16 /* first value */
  254. ADD2 $f22, $f23, $f17 /* second value */
  255. ADD $f16, $f8, $f16
  256. ADD $f17, $f28, $f17
  257. ST $f16, 0*SIZE($20)
  258. nop
  259. ST $f17, 1*SIZE($20)
  260. nop
  261. .align 4
  /*
   * Exit for the unit-stride path (also the early exit for count <= 0):
   * reload $f2-$f8 from the slots written at entry, release the 64-byte
   * stack area and return.
   */
  262. $End:
  263. ldt $f2, 0($sp)
  264. ldt $f3, 8($sp)
  265. ldt $f4, 16($sp)
  266. ldt $f5, 24($sp)
  267. ldt $f6, 32($sp)
  268. ldt $f7, 40($sp)
  269. ldt $f8, 48($sp)
  270. lda $sp, 64($sp) /* undo the lda $sp, -64($sp) from entry */
  271. ret
  272. .align 4
  /*
   * Strided path, taken when either stride differs from 1.
   * Both strides are doubled (each element is a pair of values) and then
   * applied with SXADDQ, which is defined outside this file; it is used
   * here as "third = second + first * SIZE" -- TODO confirm in common.h.
   * $18 is the x cursor, $20 the y store cursor and $24 a separate y load
   * cursor that runs one group ahead of $20.
   * Preloads four x pairs into $f0-$f7 and four y pairs into
   * $f8, $f28, $f10-$f15.
   */
  273. $Sub:
  274. SXSUBL $16, SIZE, $22 /* NOTE(review): $22 is not read anywhere else in the visible code */
  275. addq $22, $22, $22 # Complex
  276. .align 4
  277. addq $19, $19, $19 # Complex
  278. addq $21, $21, $21 # Complex
  279. ble $4, $SubRemain /* fewer than four pairs: only the remainder loop */
  280. LD $f0, 0*SIZE($18)
  281. LD $f1, 1*SIZE($18)
  282. SXADDQ $19, $18, $18 /* step x cursor by the x stride */
  283. LD $f2, 0*SIZE($18)
  284. LD $f3, 1*SIZE($18)
  285. SXADDQ $19, $18, $18
  286. LD $f4, 0*SIZE($18)
  287. LD $f5, 1*SIZE($18)
  288. SXADDQ $19, $18, $18
  289. LD $f6, 0*SIZE($18)
  290. LD $f7, 1*SIZE($18)
  291. SXADDQ $19, $18, $18
  292. LD $f8, 0*SIZE($20)
  293. LD $f28, 1*SIZE($20)
  294. SXADDQ $21, $20, $24 /* start the y load cursor one step ahead of $20 */
  295. LD $f10, 0*SIZE($24)
  296. LD $f11, 1*SIZE($24)
  297. SXADDQ $21, $24, $24
  298. LD $f12, 0*SIZE($24)
  299. LD $f13, 1*SIZE($24)
  300. SXADDQ $21, $24, $24
  301. LD $f14, 0*SIZE($24)
  302. LD $f15, 1*SIZE($24)
  303. SXADDQ $21, $24, $24
  304. subq $4, 1, $4 /* last group is handled by $SubMainLoopEnd */
  305. ble $4, $SubMainLoopEnd /* exactly one group: skip the pipelined loop */
  306. .align 4
  /*
   * Strided main loop, one group of four pairs per iteration.
   * Same arithmetic as the unit-stride loop: products in $f20-$f27, sums in
   * $f16-$f19, x group in $f0-$f7, y group in $f8, $f28, $f10-$f15.
   * The next x group is loaded through $18 and the next y group through $24
   * while the current results are stored through $20; each cursor is stepped
   * with SXADDQ after every pair.  $4 counts remaining iterations.
   * unop lines perform no work (presumably issue-slot padding).
   */
  307. $SubMainLoop:
  308. MUL $f29, $f0, $f20
  309. unop
  310. MUL $f30, $f1, $f21
  311. unop
  312. MUL $f30, $f0, $f22
  313. LD $f0, 0*SIZE($18) /* next group's x pair 0 */
  314. MUL $f29, $f1, $f23
  315. LD $f1, 1*SIZE($18)
  316. MUL $f29, $f2, $f24
  317. SXADDQ $19, $18, $18
  318. MUL $f30, $f3, $f25
  319. unop
  320. MUL $f30, $f2, $f26
  321. LD $f2, 0*SIZE($18) /* next group's x pair 1 */
  322. MUL $f29, $f3, $f27
  323. LD $f3, 1*SIZE($18)
  324. ADD1 $f20, $f21, $f16 /* pair 0, first value */
  325. SXADDQ $19, $18, $18
  326. MUL $f29, $f4, $f20
  327. unop
  328. ADD2 $f22, $f23, $f17 /* pair 0, second value */
  329. unop
  330. MUL $f30, $f5, $f21
  331. unop
  332. ADD1 $f24, $f25, $f18 /* pair 1, first value */
  333. unop
  334. MUL $f30, $f4, $f22
  335. LD $f4, 0*SIZE($18) /* next group's x pair 2 */
  336. ADD2 $f26, $f27, $f19 /* pair 1, second value */
  337. unop
  338. MUL $f29, $f5, $f23
  339. LD $f5, 1*SIZE($18)
  340. ADD $f16, $f8, $f16 /* add old y, then reload from the y load cursor */
  341. LD $f8, 0*SIZE($24)
  342. MUL $f29, $f6, $f24
  343. SXADDQ $19, $18, $18
  344. ADD $f17, $f28, $f17
  345. LD $f28, 1*SIZE($24)
  346. MUL $f30, $f7, $f25
  347. SXADDQ $21, $24, $24
  348. ADD $f18, $f10, $f18
  349. LD $f10, 0*SIZE($24)
  350. MUL $f30, $f6, $f26
  351. LD $f6, 0*SIZE($18) /* next group's x pair 3 */
  352. ADD $f19, $f11, $f19
  353. LD $f11, 1*SIZE($24)
  354. MUL $f29, $f7, $f27
  355. LD $f7, 1*SIZE($18)
  356. ST $f16, 0*SIZE($20) /* store pair 0 of the current group */
  357. SXADDQ $19, $18, $18
  358. ADD1 $f20, $f21, $f16 /* pair 2, first value */
  359. unop
  360. ST $f17, 1*SIZE($20)
  361. SXADDQ $21, $20, $20
  362. ADD2 $f22, $f23, $f17 /* pair 2, second value */
  363. unop
  364. ST $f18, 0*SIZE($20) /* store pair 1 */
  365. SXADDQ $21, $24, $24
  366. ADD1 $f24, $f25, $f18 /* pair 3, first value */
  367. unop
  368. ST $f19, 1*SIZE($20)
  369. unop
  370. ADD2 $f26, $f27, $f19 /* pair 3, second value */
  371. SXADDQ $21, $20, $20
  372. ADD $f16, $f12, $f16
  373. unop
  374. LD $f12, 0*SIZE($24)
  375. unop
  376. ADD $f17, $f13, $f17
  377. unop
  378. LD $f13, 1*SIZE($24)
  379. SXADDQ $21, $24, $24
  380. ADD $f18, $f14, $f18
  381. subq $4, 1, $4
  382. LD $f14, 0*SIZE($24)
  383. unop
  384. ADD $f19, $f15, $f19
  385. unop
  386. LD $f15, 1*SIZE($24)
  387. SXADDQ $21, $24, $24
  388. ST $f16, 0*SIZE($20) /* store pair 2 */
  389. ST $f17, 1*SIZE($20)
  390. SXADDQ $21, $20, $20
  391. unop
  392. ST $f18, 0*SIZE($20) /* store pair 3 */
  393. ST $f19, 1*SIZE($20)
  394. SXADDQ $21, $20, $20
  395. bgt $4, $SubMainLoop
  396. .align 4
  /*
   * Drain stage of the strided main loop: computes and stores the last
   * four-pair group already held in $f0-$f7 (x) and $f8, $f28, $f10-$f15 (y).
   * No loads are issued; $20 is stepped with SXADDQ after each stored pair.
   * Falls through to $SubRemain when $5 > 0.
   */
  397. $SubMainLoopEnd:
  398. MUL $f29, $f0, $f20
  399. MUL $f30, $f1, $f21
  400. MUL $f30, $f0, $f22
  401. MUL $f29, $f1, $f23
  402. MUL $f29, $f2, $f24
  403. MUL $f30, $f3, $f25
  404. MUL $f30, $f2, $f26
  405. MUL $f29, $f3, $f27
  406. ADD1 $f20, $f21, $f16 /* pair 0, first value */
  407. MUL $f29, $f4, $f20
  408. ADD2 $f22, $f23, $f17 /* pair 0, second value */
  409. MUL $f30, $f5, $f21
  410. ADD1 $f24, $f25, $f18 /* pair 1, first value */
  411. MUL $f30, $f4, $f22
  412. ADD2 $f26, $f27, $f19 /* pair 1, second value */
  413. MUL $f29, $f5, $f23
  414. ADD $f16, $f8, $f16 /* add the old y values */
  415. MUL $f29, $f6, $f24
  416. ADD $f17, $f28, $f17
  417. MUL $f30, $f7, $f25
  418. ADD $f18, $f10, $f18
  419. MUL $f30, $f6, $f26
  420. ADD $f19, $f11, $f19
  421. MUL $f29, $f7, $f27
  422. ST $f16, 0*SIZE($20) /* store pair 0 */
  423. ADD1 $f20, $f21, $f16 /* pair 2, first value */
  424. ST $f17, 1*SIZE($20)
  425. ADD2 $f22, $f23, $f17 /* pair 2, second value */
  426. SXADDQ $21, $20, $20
  427. nop
  428. ST $f18, 0*SIZE($20) /* store pair 1 */
  429. ADD1 $f24, $f25, $f18 /* pair 3, first value */
  430. ST $f19, 1*SIZE($20)
  431. ADD2 $f26, $f27, $f19 /* pair 3, second value */
  432. SXADDQ $21, $20, $20
  433. ADD $f16, $f12, $f16
  434. ADD $f17, $f13, $f17
  435. ADD $f18, $f14, $f18
  436. ADD $f19, $f15, $f19
  437. ST $f16, 0*SIZE($20) /* store pair 2 */
  438. ST $f17, 1*SIZE($20)
  439. SXADDQ $21, $20, $20
  440. ST $f18, 0*SIZE($20) /* store pair 3 */
  441. ST $f19, 1*SIZE($20)
  442. SXADDQ $21, $20, $20
  443. ble $5, $SubEnd /* no left-over pairs: done */
  444. .align 4
  /*
   * Strided remainder setup: $5 (count & 3) pairs are left.
   * Reached by fall-through from $SubMainLoopEnd or directly from $Sub when
   * there is no full group, which is why $5 is tested again here.
   * Preloads one x pair ($f0, $f1) and one y pair ($f8, $f28), then steps
   * the x cursor $18 and sets the y load cursor $24 one step ahead of the
   * y store cursor $20.
   */
  445. $SubRemain:
  446. subq $5, 1, $6 /* $6 = loop iterations; the final pair is handled by $SubRemainLoopEnd */
  447. ble $5, $SubEnd
  448. LD $f0, 0*SIZE($18)
  449. LD $f1, 1*SIZE($18)
  450. LD $f8, 0*SIZE($20)
  451. LD $f28, 1*SIZE($20)
  452. SXADDQ $19, $18, $18
  453. SXADDQ $21, $20, $24
  454. ble $6, $SubRemainLoopEnd /* only one pair left: skip the loop */
  455. .align 4
  /*
   * Strided remainder loop, one pair per iteration.
   * Computes the pair held in $f0/$f1 (x) and $f8/$f28 (y), loads the next
   * x pair through $18 and the next y pair through $24, and stores the
   * result through $20.  $6 counts remaining iterations.
   */
  456. $SubRemainLoop:
  457. MUL $f29, $f0, $f20
  458. MUL $f30, $f1, $f21
  459. MUL $f30, $f0, $f22
  460. LD $f0, 0*SIZE($18) /* next x pair */
  461. MUL $f29, $f1, $f23
  462. LD $f1, 1*SIZE($18)
  463. ADD1 $f20, $f21, $f16 /* first value */
  464. SXADDQ $19, $18, $18
  465. ADD2 $f22, $f23, $f17 /* second value */
  466. nop
  467. ADD $f16, $f8, $f16 /* add old y, then reload with the next y pair */
  468. LD $f8, 0*SIZE($24)
  469. ADD $f17, $f28, $f17
  470. LD $f28, 1*SIZE($24)
  471. SXADDQ $21, $24, $24
  472. subq $6, 1, $6
  473. ST $f16, 0*SIZE($20)
  474. ST $f17, 1*SIZE($20)
  475. SXADDQ $21, $20, $20
  476. bgt $6, $SubRemainLoop
  477. .align 4
  /*
   * Final strided pair: computes the pair already held in $f0/$f1 (x) and
   * $f8/$f28 (y) and stores it at 0 and 1*SIZE($20).  No further loads.
   * Falls through to $SubEnd.
   */
  478. $SubRemainLoopEnd:
  479. MUL $f29, $f0, $f20
  480. MUL $f30, $f1, $f21
  481. MUL $f30, $f0, $f22
  482. MUL $f29, $f1, $f23
  483. ADD1 $f20, $f21, $f16 /* first value */
  484. ADD2 $f22, $f23, $f17 /* second value */
  485. ADD $f16, $f8, $f16
  486. ADD $f17, $f28, $f17
  487. ST $f16, 0*SIZE($20)
  488. nop
  489. ST $f17, 1*SIZE($20)
  490. nop
  491. .align 4
  /*
   * Exit for the strided path: reload $f2-$f8 from the slots written at
   * entry, release the 64-byte stack area and return.  Same sequence as
   * the unit-stride exit at $End.  EPILOGUE is a macro defined outside this
   * file (presumably closing what PROLOGUE opened).
   */
  492. $SubEnd:
  493. ldt $f2, 0($sp)
  494. ldt $f3, 8($sp)
  495. ldt $f4, 16($sp)
  496. ldt $f5, 24($sp)
  497. ldt $f6, 32($sp)
  498. ldt $f7, 40($sp)
  499. ldt $f8, 48($sp)
  500. lda $sp, 64($sp) /* undo the lda $sp, -64($sp) from entry */
  501. ret
  502. EPILOGUE