You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

dnrm2_ppc440.S 11 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Inputs as used by the routine below: element count, vector base, stride. */
  40. #define N r3
  41. #define X r4
  42. #define INCX r5
  /* Copies of N and X taken before the first pass (mr NN, N / mr XX, X) so the */
  /* second pass can walk the vector again. r6/r7 are also used as scratch for  */
  /* constant construction before those copies are made.                        */
  43. #define NN r6
  44. #define XX r7
  /* Index register for the dcbt hints, which are only assembled under PPCG4. */
  45. #define PRE r8
  /* Stack scratch slots used to move integer bit patterns into FP registers.   */
  /* Offsets 0..143 of the frame hold the saved f14-f31.                        */
  /* NOTE(review): FMAX is read with lfd (8 bytes, 152..159) while C1 lives at  */
  /* 156, so the two slots overlap. The loaded FMAX value is overwritten before */
  /* any use in the routine below, so this looks harmless - confirm.            */
  46. #define FZERO 144(SP)
  47. #define FONE 148(SP)
  48. #define FMAX 152(SP)
  49. #define C1 156(SP)
  50. #define C2 160(SP)
  /* Total bytes subtracted from SP on entry and added back on exit. */
  51. #define STACKSIZE 168
  /*
   * Two-pass scaled Euclidean norm of a strided vector of doubles.
   *
   *   pass 1: m = max |x[i]|                     (LL(10) .. LL(99), result in f31)
   *   pass 2: s = sum ((1/m) * x[i])^2           (LL(110) .. LL(170))
   *   result: f1 = m * sqrt(s), with sqrt(s) obtained from frsqrte refined by
   *           Newton-Raphson steps.
   *
   * In:   N (r3) element count, X (r4) base address, INCX (r5) stride in
   *       elements. With F_INTERFACE defined, N and INCX are first loaded
   *       through the pointers passed in those registers.
   * Out:  f1. f1 is 0.0 when N <= 0, when the scaled INCX <= 0, or when the
   *       maximum is 0; it is |x[0]| when N == 1.
   * Uses: r0, r6-r8, r10-r12, CTR, cr0, f0-f31. f14-f31 are saved on entry
   *       and restored at LL(999).
   *
   * PROLOGUE, PROFCODE, EPILOGUE, LL(), LFDUX, LDINT, BASE_SHIFT, SIZE and SP
   * are not defined in this file (presumably they come from common.h).
   * NOTE(review): LFDUX is assumed to be a load-double-with-update-indexed,
   * i.e. it advances its base register by INCX before loading - confirm.
   */
  52. PROLOGUE
  53. PROFCODE
  /* Reserve the local frame. */
  54. addi SP, SP, -STACKSIZE
  /* Build IEEE-754 bit patterns in integer registers:                        */
  /*   r10 = 0            -> 0.0                                              */
  /*   r11 = 0x3f800000   -> 1.0f                                             */
  /*   r12 = 0x5fe00000   -> high word of the double stored at FMAX           */
  /*   r6  = 0x3f000000   -> 0.5f                                             */
  /*   r7  = 0x40400000   -> 3.0f                                             */
  55. li r10, 0
  56. lis r11, 0x3f80
  57. lis r12, 0x5fe0
  58. lis r6, 0x3f00
  59. lis r7, 0x4040
  /* Save f14-f31; every one of them is overwritten below. */
  60. stfd f14, 0(SP)
  61. stfd f15, 8(SP)
  62. stfd f16, 16(SP)
  63. stfd f17, 24(SP)
  64. stfd f18, 32(SP)
  65. stfd f19, 40(SP)
  66. stfd f20, 48(SP)
  67. stfd f21, 56(SP)
  68. stfd f22, 64(SP)
  69. stfd f23, 72(SP)
  70. stfd f24, 80(SP)
  71. stfd f25, 88(SP)
  72. stfd f26, 96(SP)
  73. stfd f27, 104(SP)
  74. stfd f28, 112(SP)
  75. stfd f29, 120(SP)
  76. stfd f30, 128(SP)
  77. stfd f31, 136(SP)
  /* Spill the constants so they can be reloaded into FP registers.           */
  /* NOTE(review): the word zeroed at 4 + FMAX (offset 156) is the C1 slot    */
  /* and is overwritten by the C1 store two lines later. The FMAX value       */
  /* loaded after LL(99) is replaced before any use, so this appears benign.  */
  78. stw r10, FZERO
  79. stw r11, FONE
  80. stw r12, FMAX
  81. stw r10, 4 + FMAX
  82. stw r6, C1
  83. stw r7, C2
  /* f1 = 0.0: the value left in f1 if one of the early exits below is taken. */
  84. lfs f1, FZERO
  85. #ifdef F_INTERFACE
  /* N and INCX arrive as pointers in this configuration; fetch the values. */
  86. LDINT N, 0(N)
  87. LDINT INCX, 0(INCX)
  88. #endif
  /* Scale the stride by 2^BASE_SHIFT (presumably elements -> bytes), then    */
  /* back X up by one stride so the first update-form load lands on x[0].     */
  89. slwi INCX, INCX, BASE_SHIFT
  90. sub X, X, INCX
  /* Offset used by the dcbt hints. */
  91. li PRE, 3 * 16 * SIZE
  /* Nothing to do for N <= 0 or a non-positive stride: exit with f1 = 0.0. */
  92. cmpwi cr0, N, 0
  93. ble- LL(999)
  94. cmpwi cr0, INCX, 0
  95. ble- LL(999)
  /* Keep the count and the (pre-decremented) base for the second pass. */
  96. mr NN, N
  97. mr XX, X
  /* ---- Pass 1: maximum absolute value ---------------------------------- */
  /* Seed all eight running maxima f0-f7 with |x[0]|. */
  98. LFDUX f1, X, INCX
  99. fabs f0, f1
  100. fabs f2, f1
  101. fabs f3, f1
  102. fabs f4, f1
  103. fabs f5, f1
  104. fabs f6, f1
  105. fabs f7, f1
  106. fabs f1, f1
  /* One element consumed; if it was the only one, exit with f1 = |x[0]|. */
  107. subi N, N, 1
  108. cmpwi cr0, N, 0
  109. ble- LL(999)
  /* CTR = number of 16-element groups among the remaining N elements;      */
  /* with none, go straight to the one-at-a-time tail at LL(50).            */
  110. srawi. r0, N, 4
  111. mtspr CTR, r0
  112. beq- LL(50)
  /* Pipeline fill: load 8 elements into f24-f31 ... */
  113. LFDUX f24, X, INCX
  114. LFDUX f25, X, INCX
  115. LFDUX f26, X, INCX
  116. LFDUX f27, X, INCX
  117. LFDUX f28, X, INCX
  118. LFDUX f29, X, INCX
  119. LFDUX f30, X, INCX
  120. LFDUX f31, X, INCX
  /* ... then move their absolute values to f8-f15 while loading the next 8. */
  121. fabs f8, f24
  122. LFDUX f24, X, INCX
  123. fabs f9, f25
  124. LFDUX f25, X, INCX
  125. fabs f10, f26
  126. LFDUX f26, X, INCX
  127. fabs f11, f27
  128. LFDUX f27, X, INCX
  129. fabs f12, f28
  130. LFDUX f28, X, INCX
  131. fabs f13, f29
  132. LFDUX f29, X, INCX
  133. fabs f14, f30
  134. LFDUX f30, X, INCX
  135. fabs f15, f31
  136. LFDUX f31, X, INCX
  /* The fill already loaded one full group of 16; if that was the only     */
  /* group, skip the steady-state loop and drain at LL(20).                 */
  137. bdz LL(20)
  138. .align 4
  /* Steady state: each iteration folds 16 already-loaded values into the   */
  /* eight maxima and loads 16 more. fsub produces cur - cand, and fsel     */
  /* keeps cur when that difference is >= 0 and takes cand otherwise.       */
  139. LL(10):
  140. fsub f16, f0, f8
  141. fsub f17, f1, f9
  142. fsub f18, f2, f10
  143. fsub f19, f3, f11
  144. fsub f20, f4, f12
  145. fsub f21, f5, f13
  146. fsub f22, f6, f14
  147. fsub f23, f7, f15
  148. fsel f0, f16, f0, f8
  149. #ifdef PPCG4
  150. dcbt X, PRE
  151. #endif
  152. fabs f8, f24
  153. LFDUX f24, X, INCX
  154. fsel f1, f17, f1, f9
  155. fabs f9, f25
  156. LFDUX f25, X, INCX
  157. fsel f2, f18, f2, f10
  158. fabs f10, f26
  159. LFDUX f26, X, INCX
  160. fsel f3, f19, f3, f11
  161. fabs f11, f27
  162. LFDUX f27, X, INCX
  163. fsel f4, f20, f4, f12
  164. #ifdef PPCG4
  165. dcbt X, PRE
  166. #endif
  167. fabs f12, f28
  168. LFDUX f28, X, INCX
  169. fsel f5, f21, f5, f13
  170. fabs f13, f29
  171. LFDUX f29, X, INCX
  172. fsel f6, f22, f6, f14
  173. fabs f14, f30
  174. LFDUX f30, X, INCX
  175. fsel f7, f23, f7, f15
  176. fabs f15, f31
  177. LFDUX f31, X, INCX
  /* Second half of the iteration: same pattern for the next 8 values. */
  178. fsub f16, f0, f8
  179. fsub f17, f1, f9
  180. fsub f18, f2, f10
  181. fsub f19, f3, f11
  182. fsub f20, f4, f12
  183. fsub f21, f5, f13
  184. fsub f22, f6, f14
  185. fsub f23, f7, f15
  186. fsel f0, f16, f0, f8
  187. #ifdef PPCG4
  188. dcbt X, PRE
  189. #endif
  190. fabs f8, f24
  191. LFDUX f24, X, INCX
  192. fsel f1, f17, f1, f9
  193. fabs f9, f25
  194. LFDUX f25, X, INCX
  195. fsel f2, f18, f2, f10
  196. fabs f10, f26
  197. LFDUX f26, X, INCX
  198. fsel f3, f19, f3, f11
  199. fabs f11, f27
  200. LFDUX f27, X, INCX
  201. fsel f4, f20, f4, f12
  202. #ifdef PPCG4
  203. dcbt X, PRE
  204. #endif
  205. fabs f12, f28
  206. LFDUX f28, X, INCX
  207. fsel f5, f21, f5, f13
  208. fabs f13, f29
  209. LFDUX f29, X, INCX
  210. fsel f6, f22, f6, f14
  211. fabs f14, f30
  212. LFDUX f30, X, INCX
  213. fsel f7, f23, f7, f15
  214. fabs f15, f31
  215. LFDUX f31, X, INCX
  216. bdnz LL(10)
  217. .align 4
  /* Pipeline drain: fold the 16 values still in flight (8 as absolute      */
  /* values in f8-f15, 8 raw in f24-f31) without issuing further loads.     */
  218. LL(20):
  219. fsub f16, f0, f8
  220. fsub f17, f1, f9
  221. fsub f18, f2, f10
  222. fsub f19, f3, f11
  223. fsub f20, f4, f12
  224. fsub f21, f5, f13
  225. fsub f22, f6, f14
  226. fsub f23, f7, f15
  227. fsel f0, f16, f0, f8
  228. fabs f8, f24
  229. fsel f1, f17, f1, f9
  230. fabs f9, f25
  231. fsel f2, f18, f2, f10
  232. fabs f10, f26
  233. fsel f3, f19, f3, f11
  234. fabs f11, f27
  235. fsel f4, f20, f4, f12
  236. fabs f12, f28
  237. fsel f5, f21, f5, f13
  238. fabs f13, f29
  239. fsel f6, f22, f6, f14
  240. fabs f14, f30
  241. fsel f7, f23, f7, f15
  242. fabs f15, f31
  243. fsub f16, f0, f8
  244. fsub f17, f1, f9
  245. fsub f18, f2, f10
  246. fsub f19, f3, f11
  247. fsub f20, f4, f12
  248. fsub f21, f5, f13
  249. fsub f22, f6, f14
  250. fsub f23, f7, f15
  251. fsel f0, f16, f0, f8
  252. fsel f1, f17, f1, f9
  253. fsel f2, f18, f2, f10
  254. fsel f3, f19, f3, f11
  255. fsel f4, f20, f4, f12
  256. fsel f5, f21, f5, f13
  257. fsel f6, f22, f6, f14
  258. fsel f7, f23, f7, f15
  259. .align 4
  /* Tail of pass 1: the remaining (N & 15) elements, one per iteration. */
  260. LL(50):
  261. andi. r0, N, 15
  262. mtspr CTR, r0
  263. beq LL(99)
  264. .align 4
  /* f1 = max(f1, |next element|). */
  265. LL(60):
  266. LFDUX f8, X, INCX
  267. fabs f8, f8
  268. fsub f16, f1, f8
  269. fsel f1, f16, f1, f8
  270. bdnz LL(60)
  271. .align 4
  /* Reduce the eight partial maxima pairwise; the overall maximum ends up  */
  /* in f31.                                                                */
  272. LL(99):
  273. fsub f8, f0, f1
  274. fsub f9, f2, f3
  275. fsub f10, f4, f5
  276. fsub f11, f6, f7
  277. fsel f0, f8, f0, f1
  278. fsel f2, f9, f2, f3
  279. fsel f4, f10, f4, f5
  280. fsel f6, f11, f6, f7
  281. fsub f8, f0, f2
  282. fsub f9, f4, f6
  283. fsel f0, f8, f0, f2
  284. fsel f4, f9, f4, f6
  285. fsub f8, f0, f4
  286. fsel f31, f8, f0, f4
  /* Reload constants: f1 = 0.0, f0 = 1.0, f2 = FMAX.                       */
  /* f2 is overwritten by the fmr below before it is read, so the FMAX      */
  /* load has no effect on the result.                                      */
  287. lfs f1, FZERO
  288. lfs f0, FONE
  289. lfd f2, FMAX
  /* A zero maximum means there is nothing to scale by: exit with f1 = 0.0  */
  /* instead of dividing by zero.                                           */
  290. fcmpu cr0, f1, f31
  291. beq- cr0, LL(999)
  /* ---- Pass 2: sum of squares of the scaled elements ------------------- */
  /* f30 = 1 / max, the scale factor applied to every element. */
  292. fdiv f30, f0, f31
  /* Clear the eight accumulators f0-f7 (f1 already holds 0.0). */
  293. fmr f0, f1
  294. fmr f2, f1
  295. fmr f3, f1
  296. fmr f4, f1
  297. fmr f5, f1
  298. fmr f6, f1
  299. fmr f7, f1
  /* CTR = number of 16-element groups in the full vector (NN elements,     */
  /* walked again from XX); with none, go to the tail at LL(150).           */
  300. srawi. r0, NN, 4
  301. mtspr CTR, r0
  302. beq- cr0, LL(150)
  /* Pipeline fill: load 8 elements, then scale them into f16-f23 while     */
  /* loading the next 8 into f8-f15.                                        */
  303. LFDUX f8, XX, INCX
  304. LFDUX f9, XX, INCX
  305. LFDUX f10, XX, INCX
  306. LFDUX f11, XX, INCX
  307. LFDUX f12, XX, INCX
  308. LFDUX f13, XX, INCX
  309. LFDUX f14, XX, INCX
  310. LFDUX f15, XX, INCX
  311. fmul f16, f30, f8
  312. LFDUX f8, XX, INCX
  313. fmul f17, f30, f9
  314. LFDUX f9, XX, INCX
  315. fmul f18, f30, f10
  316. LFDUX f10, XX, INCX
  317. fmul f19, f30, f11
  318. LFDUX f11, XX, INCX
  319. fmul f20, f30, f12
  320. LFDUX f12, XX, INCX
  321. fmul f21, f30, f13
  322. LFDUX f13, XX, INCX
  323. fmul f22, f30, f14
  324. LFDUX f14, XX, INCX
  325. fmul f23, f30, f15
  326. LFDUX f15, XX, INCX
  /* Only one group of 16: skip the steady-state loop and drain at LL(120). */
  327. bdz LL(120)
  328. .align 4
  /* Steady state: each iteration squares-and-accumulates 16 scaled values  */
  /* (acc += v * v), scales the 16 raw values already loaded, and loads 16  */
  /* more.                                                                  */
  329. LL(110):
  330. fmadd f0, f16, f16, f0
  331. #ifdef PPCG4
  332. dcbt XX, PRE
  333. #endif
  334. fmul f16, f30, f8
  335. LFDUX f8, XX, INCX
  336. fmadd f1, f17, f17, f1
  337. fmul f17, f30, f9
  338. LFDUX f9, XX, INCX
  339. fmadd f2, f18, f18, f2
  340. fmul f18, f30, f10
  341. LFDUX f10, XX, INCX
  342. fmadd f3, f19, f19, f3
  343. fmul f19, f30, f11
  344. LFDUX f11, XX, INCX
  345. fmadd f4, f20, f20, f4
  346. #ifdef PPCG4
  347. dcbt XX, PRE
  348. #endif
  349. fmul f20, f30, f12
  350. LFDUX f12, XX, INCX
  351. fmadd f5, f21, f21, f5
  352. fmul f21, f30, f13
  353. LFDUX f13, XX, INCX
  354. fmadd f6, f22, f22, f6
  355. fmul f22, f30, f14
  356. LFDUX f14, XX, INCX
  357. fmadd f7, f23, f23, f7
  358. fmul f23, f30, f15
  359. LFDUX f15, XX, INCX
  /* Second half of the iteration: same pattern for the next 8 values. */
  360. fmadd f0, f16, f16, f0
  361. #ifdef PPCG4
  362. dcbt XX, PRE
  363. #endif
  364. fmul f16, f30, f8
  365. LFDUX f8, XX, INCX
  366. fmadd f1, f17, f17, f1
  367. fmul f17, f30, f9
  368. LFDUX f9, XX, INCX
  369. fmadd f2, f18, f18, f2
  370. fmul f18, f30, f10
  371. LFDUX f10, XX, INCX
  372. fmadd f3, f19, f19, f3
  373. fmul f19, f30, f11
  374. LFDUX f11, XX, INCX
  375. fmadd f4, f20, f20, f4
  376. #ifdef PPCG4
  377. dcbt XX, PRE
  378. #endif
  379. fmul f20, f30, f12
  380. LFDUX f12, XX, INCX
  381. fmadd f5, f21, f21, f5
  382. fmul f21, f30, f13
  383. LFDUX f13, XX, INCX
  384. fmadd f6, f22, f22, f6
  385. fmul f22, f30, f14
  386. LFDUX f14, XX, INCX
  387. fmadd f7, f23, f23, f7
  388. fmul f23, f30, f15
  389. LFDUX f15, XX, INCX
  390. bdnz LL(110)
  391. .align 4
  /* Pipeline drain: accumulate the 8 scaled values in f16-f23, scale the   */
  /* 8 raw values left in f8-f15, and accumulate those too; no more loads.  */
  392. LL(120):
  393. fmadd f0, f16, f16, f0
  394. fmul f16, f30, f8
  395. fmadd f1, f17, f17, f1
  396. fmul f17, f30, f9
  397. fmadd f2, f18, f18, f2
  398. fmul f18, f30, f10
  399. fmadd f3, f19, f19, f3
  400. fmul f19, f30, f11
  401. fmadd f4, f20, f20, f4
  402. fmul f20, f30, f12
  403. fmadd f5, f21, f21, f5
  404. fmul f21, f30, f13
  405. fmadd f6, f22, f22, f6
  406. fmul f22, f30, f14
  407. fmadd f7, f23, f23, f7
  408. fmul f23, f30, f15
  409. fmadd f0, f16, f16, f0
  410. fmadd f1, f17, f17, f1
  411. fmadd f2, f18, f18, f2
  412. fmadd f3, f19, f19, f3
  413. fmadd f4, f20, f20, f4
  414. fmadd f5, f21, f21, f5
  415. fmadd f6, f22, f22, f6
  416. fmadd f7, f23, f23, f7
  417. .align 4
  /* Tail of pass 2: the remaining (NN & 15) elements, one per iteration. */
  418. LL(150):
  419. andi. r0, NN, 15
  420. mtspr CTR, r0
  421. beq- cr0, LL(170)
  422. .align 4
  /* f0 += (scale * next element)^2. */
  423. LL(160):
  424. LFDUX f8, XX, INCX
  425. fmul f16, f30, f8
  426. fmadd f0, f16, f16, f0
  427. bdnz LL(160)
  428. .align 4
  /* Combine the eight accumulators: f1 = s, the scaled sum of squares. */
  429. LL(170):
  430. fadd f0, f0, f1
  431. fadd f2, f2, f3
  432. fadd f4, f4, f5
  433. fadd f6, f6, f7
  434. fadd f0, f0, f2
  435. fadd f4, f4, f6
  436. fadd f1, f0, f4
  /* Square root without fsqrt: start from the reciprocal-sqrt estimate     */
  /* y = f0 and refine it. f8 = 0.5, f9 = 3.0, f7 = 0.5 + 0.5 = 1.0.        */
  437. frsqrte f0, f1
  438. lfs f8, C1
  439. lfs f9, C2
  440. fmul f2, f1, f0
  441. fadd f7, f8, f8
  /* Newton-Raphson step for 1/sqrt(s): y = (0.5 * y) * (3 - (s * y) * y).  */
  442. fmul f3, f0, f8
  443. fnmsub f4, f2, f0, f9
  444. fmul f0, f3, f4
  /* Second refinement step. */
  445. fmul f2, f1, f0
  446. fmul f3, f0, f8
  447. fnmsub f4, f2, f0, f9
  448. fmul f0, f3, f4
  /* Third refinement step. */
  449. fmul f2, f1, f0
  450. fmul f3, f0, f8
  451. fnmsub f4, f2, f0, f9
  452. fmul f0, f3, f4
  /* r = s * y approximates sqrt(s); apply one correction:                  */
  /* f1 = r + (0.5 * r) * (1 - r * y).                                      */
  453. fmul f5, f1, f0
  454. fmul f2, f5, f8
  455. fnmsub f3, f5, f0, f7
  456. fmadd f1, f2, f3, f5
  /* Undo the scaling: result = max * sqrt(s). */
  457. fmul f1, f31, f1
  458. .align 4
  /* Common exit: restore f14-f31, release the frame, return with the       */
  /* result in f1.                                                          */
  459. LL(999):
  460. lfd f14, 0(SP)
  461. lfd f15, 8(SP)
  462. lfd f16, 16(SP)
  463. lfd f17, 24(SP)
  464. lfd f18, 32(SP)
  465. lfd f19, 40(SP)
  466. lfd f20, 48(SP)
  467. lfd f21, 56(SP)
  468. lfd f22, 64(SP)
  469. lfd f23, 72(SP)
  470. lfd f24, 80(SP)
  471. lfd f25, 88(SP)
  472. lfd f26, 96(SP)
  473. lfd f27, 104(SP)
  474. lfd f28, 112(SP)
  475. lfd f29, 120(SP)
  476. lfd f30, 128(SP)
  477. lfd f31, 136(SP)
  478. addi SP, SP, STACKSIZE
  479. blr
  480. EPILOGUE