You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

znrm2_ppc440.S 11 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /*
   * Register and stack-slot aliases for the routine that follows.
   * r3, r4 and r5 are read on entry as N, X and INCX. When F_INTERFACE is
   * defined the routine first loads N and INCX through the addresses held
   * in r3 and r5.
   */
  40. #define N r3
  41. #define X r4
  42. #define INCX r5
  /* Copies of N and X taken before the first pass; the second pass walks */
  /* the same data again through these. */
  43. #define NN r6
  44. #define XX r7
  /* INC1 is loaded with SIZE and used as the index for the second value of */
  /* each loaded pair. PRE is the dcbt offset, used only under PPCG4. */
  45. #define INC1 r9
  46. #define PRE r10
  /*
   * Scratch words placed after the f14-f31 save area, which occupies
   * 0(SP) .. 143(SP). The routine fills them with integer stores and reads
   * FZERO, FONE, C1 and C2 back with lfs. FMAX is written but not read in
   * the visible code.
   */
  47. #define FZERO 144(SP)
  48. #define FONE 148(SP)
  49. #define FMAX 152(SP)
  50. #define C1 156(SP)
  51. #define C2 160(SP)
  /* Number of bytes the routine subtracts from SP on entry and adds back on exit. */
  52. #define STACKSIZE 168
  /*
   * Routine body (the file is listed as znrm2_ppc440.S). PROLOGUE, PROFCODE,
   * EPILOGUE, LL(), LFDUX, LFDX, LDINT, SIZE and ZBASE_SHIFT come from
   * common.h, which is not shown here.
   *
   * What the visible code computes, in two passes over the same strided data:
   *   pass 1: m = largest absolute value among all loaded values
   *   pass 2: s = sum of (value * (1/m))^2
   *   result: f1 = m * sqrt(s), sqrt built from frsqrte plus refinement steps
   * f1 is 0 on the early exits (N <= 0, scaled INCX <= 0, or m == 0).
   *
   * In:   N, X, INCX (r3, r4, r5).
   * Out:  f1.
   * Uses: r0, r6, r7, r9-r12, CTR, cr0 and f0-f13 as scratch; r3-r5 are
   *       modified; f14-f31 are saved to the frame and reloaded at LL(999).
   *
   * Each step loads two values: LFDUX at X indexed by INCX, then LFDX at X
   * indexed by INC1 (= SIZE). Presumably LFDUX is an update-form load that
   * also advances X by INCX; the loops rely on that - confirm in common.h.
   *
   * NOTE(review): fsel picks its last operand when the tested operand is NaN,
   * so a NaN input can be dropped by the pass-1 max reduction. An infinite
   * input gives 1/m = 0 and then 0 * inf in pass 2. Confirm whether callers
   * expect NaN/Inf to propagate.
   */
  53. PROLOGUE
  54. PROFCODE
  /* Allocate the frame. */
  55. addi SP, SP, -STACKSIZE
  /* Build 32-bit constants in integer registers (lis sets the upper 16 bits): */
  /* r10 = 0, r11 = 0x3f800000, r12 = 0x5fe00000, r6 = 0x3f000000, */
  /* r7 = 0x40400000. r6, r7 and r10 are reused later as NN, XX and PRE. */
  56. li r10, 0
  57. lis r11, 0x3f80
  58. lis r12, 0x5fe0
  59. lis r6, 0x3f00
  60. lis r7, 0x4040
  /* Save f14-f31; all of them are overwritten below. */
  61. stfd f14, 0(SP)
  62. stfd f15, 8(SP)
  63. stfd f16, 16(SP)
  64. stfd f17, 24(SP)
  65. stfd f18, 32(SP)
  66. stfd f19, 40(SP)
  67. stfd f20, 48(SP)
  68. stfd f21, 56(SP)
  69. stfd f22, 64(SP)
  70. stfd f23, 72(SP)
  71. stfd f24, 80(SP)
  72. stfd f25, 88(SP)
  73. stfd f26, 96(SP)
  74. stfd f27, 104(SP)
  75. stfd f28, 112(SP)
  76. stfd f29, 120(SP)
  77. stfd f30, 128(SP)
  78. stfd f31, 136(SP)
  /* Spill the constants so they can be read back as single-precision floats: */
  /* FZERO = 0.0, FONE = 1.0 (0x3f800000), C1 = 0.5 (0x3f000000), */
  /* C2 = 3.0 (0x40400000). */
  /* "4 + FMAX" expands to 4 + 152(SP), the same address as C1 (156(SP)); */
  /* that zero is overwritten by the store to C1 that follows. */
  79. stw r10, FZERO
  80. stw r11, FONE
  81. stw r12, FMAX
  82. stw r10, 4 + FMAX
  83. stw r6, C1
  84. stw r7, C2
  /* f1 = 0.0: the value left in f1 if either early exit below is taken. */
  85. lfs f1, FZERO
  86. #ifdef F_INTERFACE
  /* Arguments arrive as addresses: fetch N and INCX through them. */
  87. LDINT N, 0(N)
  88. LDINT INCX, 0(INCX)
  89. #endif
  /* Scale INCX by 2^ZBASE_SHIFT (presumably element stride to byte stride). */
  90. slwi INCX, INCX, ZBASE_SHIFT
  /* Back X up by one stride so the first LFDUX (assuming update form) */
  /* addresses the first element. */
  91. sub X, X, INCX
  92. li INC1, SIZE
  93. li PRE, 3 * 16 * SIZE
  /* Nothing to do for N <= 0 or a non-positive scaled stride: exit, f1 = 0. */
  94. cmpwi cr0, N, 0
  95. ble- LL(999)
  96. cmpwi cr0, INCX, 0
  97. ble- LL(999)
  /* Keep the original count and (pre-biased) pointer for pass 2. */
  98. mr NN, N
  99. mr XX, X
  /* ---- Pass 1: maximum absolute value ---- */
  /* Load the first pair and seed eight running maxima with it: */
  /* f0, f2, f4, f6 = |first value|, f1, f3, f5, f7 = |second value|. */
  100. LFDUX f0, X, INCX
  101. LFDX f1, X, INC1
  102. fabs f2, f0
  103. fabs f3, f1
  104. fabs f4, f0
  105. fabs f5, f1
  106. fabs f6, f0
  107. fabs f7, f1
  108. fabs f0, f0
  109. fabs f1, f1
  /* One pair is consumed; CTR = (N - 1) >> 3 blocks of eight pairs. */
  /* With no full block, go straight to the remainder loop. */
  110. subi N, N, 1
  111. srawi. r0, N, 3
  112. mtspr CTR, r0
  113. beq- LL(50)
  /* Prime the pipeline: load four pairs into f24-f31 ... */
  114. LFDUX f24, X, INCX
  115. LFDX f25, X, INC1
  116. LFDUX f26, X, INCX
  117. LFDX f27, X, INC1
  118. LFDUX f28, X, INCX
  119. LFDX f29, X, INC1
  120. LFDUX f30, X, INCX
  121. LFDX f31, X, INC1
  /* ... move their absolute values to f8-f15 while loading four more pairs. */
  122. fabs f8, f24
  123. LFDUX f24, X, INCX
  124. fabs f9, f25
  125. LFDX f25, X, INC1
  126. fabs f10, f26
  127. LFDUX f26, X, INCX
  128. fabs f11, f27
  129. LFDX f27, X, INC1
  130. fabs f12, f28
  131. LFDUX f28, X, INCX
  132. fabs f13, f29
  133. LFDX f29, X, INC1
  134. fabs f14, f30
  135. LFDUX f30, X, INCX
  136. fabs f15, f31
  137. LFDX f31, X, INC1
  /* bdz decrements CTR; if this was the only block, skip to the drain code. */
  138. bdz LL(20)
  139. .align 4
  /*
   * Main loop of pass 1: eight pairs per iteration, in two halves.
   * Each half computes fK = max(fK, f(8+K)) for K = 0..7 as
   *   d = fK - f(8+K);  fK = (d >= 0) ? fK : f(8+K)     (fsub + fsel)
   * while refilling f8-f15 with the absolute values of the pairs already
   * loaded in f24-f31 and issuing the loads for the next four pairs.
   */
  140. LL(10):
  141. fsub f16, f0, f8
  142. fsub f17, f1, f9
  143. fsub f18, f2, f10
  144. fsub f19, f3, f11
  145. fsub f20, f4, f12
  146. fsub f21, f5, f13
  147. fsub f22, f6, f14
  148. fsub f23, f7, f15
  149. fsel f0, f16, f0, f8
  150. #ifdef PPCG4
  /* Cache touch hint for the address X + PRE. */
  151. dcbt X, PRE
  152. #endif
  153. fabs f8, f24
  154. LFDUX f24, X, INCX
  155. fsel f1, f17, f1, f9
  156. fabs f9, f25
  157. LFDX f25, X, INC1
  158. fsel f2, f18, f2, f10
  159. fabs f10, f26
  160. LFDUX f26, X, INCX
  161. fsel f3, f19, f3, f11
  162. fabs f11, f27
  163. LFDX f27, X, INC1
  164. fsel f4, f20, f4, f12
  165. #ifdef PPCG4
  166. dcbt X, PRE
  167. #endif
  168. fabs f12, f28
  169. LFDUX f28, X, INCX
  170. fsel f5, f21, f5, f13
  171. fabs f13, f29
  172. LFDX f29, X, INC1
  173. fsel f6, f22, f6, f14
  174. fabs f14, f30
  175. LFDUX f30, X, INCX
  176. fsel f7, f23, f7, f15
  177. fabs f15, f31
  178. LFDX f31, X, INC1
  /* Second half of the iteration: same pattern on the next four pairs. */
  179. fsub f16, f0, f8
  180. fsub f17, f1, f9
  181. fsub f18, f2, f10
  182. fsub f19, f3, f11
  183. fsub f20, f4, f12
  184. fsub f21, f5, f13
  185. fsub f22, f6, f14
  186. fsub f23, f7, f15
  187. fsel f0, f16, f0, f8
  188. #ifdef PPCG4
  189. dcbt X, PRE
  190. #endif
  191. fabs f8, f24
  192. LFDUX f24, X, INCX
  193. fsel f1, f17, f1, f9
  194. fabs f9, f25
  195. LFDX f25, X, INC1
  196. fsel f2, f18, f2, f10
  197. fabs f10, f26
  198. LFDUX f26, X, INCX
  199. fsel f3, f19, f3, f11
  200. fabs f11, f27
  201. LFDX f27, X, INC1
  202. fsel f4, f20, f4, f12
  203. #ifdef PPCG4
  204. dcbt X, PRE
  205. #endif
  206. fabs f12, f28
  207. LFDUX f28, X, INCX
  208. fsel f5, f21, f5, f13
  209. fabs f13, f29
  210. LFDX f29, X, INC1
  211. fsel f6, f22, f6, f14
  212. fabs f14, f30
  213. LFDUX f30, X, INCX
  214. fsel f7, f23, f7, f15
  215. fabs f15, f31
  216. LFDX f31, X, INC1
  217. bdnz LL(10)
  218. .align 4
  /* Drain: fold in the eight pairs still in flight (absolute values in */
  /* f8-f15, raw values in f24-f31) without issuing any further loads. */
  219. LL(20):
  220. fsub f16, f0, f8
  221. fsub f17, f1, f9
  222. fsub f18, f2, f10
  223. fsub f19, f3, f11
  224. fsub f20, f4, f12
  225. fsub f21, f5, f13
  226. fsub f22, f6, f14
  227. fsub f23, f7, f15
  228. fsel f0, f16, f0, f8
  229. fabs f8, f24
  230. fsel f1, f17, f1, f9
  231. fabs f9, f25
  232. fsel f2, f18, f2, f10
  233. fabs f10, f26
  234. fsel f3, f19, f3, f11
  235. fabs f11, f27
  236. fsel f4, f20, f4, f12
  237. fabs f12, f28
  238. fsel f5, f21, f5, f13
  239. fabs f13, f29
  240. fsel f6, f22, f6, f14
  241. fabs f14, f30
  242. fsel f7, f23, f7, f15
  243. fabs f15, f31
  244. fsub f16, f0, f8
  245. fsub f17, f1, f9
  246. fsub f18, f2, f10
  247. fsub f19, f3, f11
  248. fsub f20, f4, f12
  249. fsub f21, f5, f13
  250. fsub f22, f6, f14
  251. fsub f23, f7, f15
  252. fsel f0, f16, f0, f8
  253. fsel f1, f17, f1, f9
  254. fsel f2, f18, f2, f10
  255. fsel f3, f19, f3, f11
  256. fsel f4, f20, f4, f12
  257. fsel f5, f21, f5, f13
  258. fsel f6, f22, f6, f14
  259. fsel f7, f23, f7, f15
  260. .align 4
  /* Remainder of pass 1: (N - 1) & 7 pairs, one per iteration, folded */
  /* into f0 and f1 only. */
  261. LL(50):
  262. andi. r0, N, 7
  263. mtspr CTR, r0
  264. beq LL(99)
  265. .align 4
  266. LL(60):
  267. LFDUX f8, X, INCX
  268. LFDX f9, X, INC1
  269. fabs f8, f8
  270. fabs f9, f9
  271. fsub f16, f0, f8
  272. fsub f17, f1, f9
  273. fsel f0, f16, f0, f8
  274. fsel f1, f17, f1, f9
  275. bdnz LL(60)
  276. .align 4
  /* Reduce the eight running maxima pairwise; the overall maximum m ends */
  /* up in f31. */
  277. LL(99):
  278. fsub f8, f0, f1
  279. fsub f9, f2, f3
  280. fsub f10, f4, f5
  281. fsub f11, f6, f7
  282. fsel f0, f8, f0, f1
  283. fsel f2, f9, f2, f3
  284. fsel f4, f10, f4, f5
  285. fsel f6, f11, f6, f7
  286. fsub f8, f0, f2
  287. fsub f9, f4, f6
  288. fsel f0, f8, f0, f2
  289. fsel f4, f9, f4, f6
  290. fsub f8, f0, f4
  291. fsel f31, f8, f0, f4
  /* f1 = 0.0, f0 = 1.0. If m == 0, exit with f1 = 0 and avoid dividing by zero. */
  292. lfs f1, FZERO
  293. lfs f0, FONE
  294. fcmpu cr0, f1, f31
  295. beq- cr0, LL(999)
  /* ---- Pass 2: sum of squares of the scaled values ---- */
  /* f30 = 1 / m is the scale factor; f31 keeps m for the final multiply. */
  296. fdiv f30, f0, f31
  /* Clear the eight partial sums f0-f7 (f1 already holds 0.0). */
  297. fmr f0, f1
  298. fmr f2, f1
  299. fmr f3, f1
  300. fmr f4, f1
  301. fmr f5, f1
  302. fmr f6, f1
  303. fmr f7, f1
  /* CTR = NN >> 3 blocks of eight pairs; NN is the full original count. */
  304. srawi. r0, NN, 3
  305. mtspr CTR, r0
  306. beq- cr0, LL(150)
  /* Prime the pipeline: load four pairs into f8-f15 ... */
  307. LFDUX f8, XX, INCX
  308. LFDX f9, XX, INC1
  309. LFDUX f10, XX, INCX
  310. LFDX f11, XX, INC1
  311. LFDUX f12, XX, INCX
  312. LFDX f13, XX, INC1
  313. LFDUX f14, XX, INCX
  314. LFDX f15, XX, INC1
  /* ... scale them into f16-f23 while loading the next four pairs. */
  315. fmul f16, f30, f8
  316. LFDUX f8, XX, INCX
  317. fmul f17, f30, f9
  318. LFDX f9, XX, INC1
  319. fmul f18, f30, f10
  320. LFDUX f10, XX, INCX
  321. fmul f19, f30, f11
  322. LFDX f11, XX, INC1
  323. fmul f20, f30, f12
  324. LFDUX f12, XX, INCX
  325. fmul f21, f30, f13
  326. LFDX f13, XX, INC1
  327. fmul f22, f30, f14
  328. LFDUX f14, XX, INCX
  329. fmul f23, f30, f15
  330. LFDX f15, XX, INC1
  331. bdz LL(120)
  332. .align 4
  /*
   * Main loop of pass 2: eight pairs per iteration, in two halves.
   * Each half does fK += f(16+K)^2 for K = 0..7 (fmadd), rescales the
   * already loaded values in f8-f15 into f16-f23 (fmul by f30), and issues
   * the loads for the next four pairs.
   */
  333. LL(110):
  334. fmadd f0, f16, f16, f0
  335. #ifdef PPCG4
  /* Cache touch hint for the address XX + PRE. */
  336. dcbt XX, PRE
  337. #endif
  338. fmul f16, f30, f8
  339. LFDUX f8, XX, INCX
  340. fmadd f1, f17, f17, f1
  341. fmul f17, f30, f9
  342. LFDX f9, XX, INC1
  343. fmadd f2, f18, f18, f2
  344. fmul f18, f30, f10
  345. LFDUX f10, XX, INCX
  346. fmadd f3, f19, f19, f3
  347. fmul f19, f30, f11
  348. LFDX f11, XX, INC1
  349. fmadd f4, f20, f20, f4
  350. #ifdef PPCG4
  351. dcbt XX, PRE
  352. #endif
  353. fmul f20, f30, f12
  354. LFDUX f12, XX, INCX
  355. fmadd f5, f21, f21, f5
  356. fmul f21, f30, f13
  357. LFDX f13, XX, INC1
  358. fmadd f6, f22, f22, f6
  359. fmul f22, f30, f14
  360. LFDUX f14, XX, INCX
  361. fmadd f7, f23, f23, f7
  362. fmul f23, f30, f15
  363. LFDX f15, XX, INC1
  /* Second half of the iteration: same pattern on the next four pairs. */
  364. fmadd f0, f16, f16, f0
  365. #ifdef PPCG4
  366. dcbt XX, PRE
  367. #endif
  368. fmul f16, f30, f8
  369. LFDUX f8, XX, INCX
  370. fmadd f1, f17, f17, f1
  371. fmul f17, f30, f9
  372. LFDX f9, XX, INC1
  373. fmadd f2, f18, f18, f2
  374. fmul f18, f30, f10
  375. LFDUX f10, XX, INCX
  376. fmadd f3, f19, f19, f3
  377. fmul f19, f30, f11
  378. LFDX f11, XX, INC1
  379. fmadd f4, f20, f20, f4
  380. #ifdef PPCG4
  381. dcbt XX, PRE
  382. #endif
  383. fmul f20, f30, f12
  384. LFDUX f12, XX, INCX
  385. fmadd f5, f21, f21, f5
  386. fmul f21, f30, f13
  387. LFDX f13, XX, INC1
  388. fmadd f6, f22, f22, f6
  389. fmul f22, f30, f14
  390. LFDUX f14, XX, INCX
  391. fmadd f7, f23, f23, f7
  392. fmul f23, f30, f15
  393. LFDX f15, XX, INC1
  394. bdnz LL(110)
  395. .align 4
  /* Drain: accumulate the eight pairs still in flight (scaled values in */
  /* f16-f23, raw values in f8-f15) without issuing any further loads. */
  396. LL(120):
  397. fmadd f0, f16, f16, f0
  398. fmul f16, f30, f8
  399. fmadd f1, f17, f17, f1
  400. fmul f17, f30, f9
  401. fmadd f2, f18, f18, f2
  402. fmul f18, f30, f10
  403. fmadd f3, f19, f19, f3
  404. fmul f19, f30, f11
  405. fmadd f4, f20, f20, f4
  406. fmul f20, f30, f12
  407. fmadd f5, f21, f21, f5
  408. fmul f21, f30, f13
  409. fmadd f6, f22, f22, f6
  410. fmul f22, f30, f14
  411. fmadd f7, f23, f23, f7
  412. fmul f23, f30, f15
  413. fmadd f0, f16, f16, f0
  414. fmadd f1, f17, f17, f1
  415. fmadd f2, f18, f18, f2
  416. fmadd f3, f19, f19, f3
  417. fmadd f4, f20, f20, f4
  418. fmadd f5, f21, f21, f5
  419. fmadd f6, f22, f22, f6
  420. fmadd f7, f23, f23, f7
  421. .align 4
  /* Remainder of pass 2: NN & 7 pairs, one per iteration, accumulated */
  /* into f0 and f1 only. */
  422. LL(150):
  423. andi. r0, NN, 7
  424. mtspr CTR, r0
  425. beq- cr0, LL(170)
  426. .align 4
  427. LL(160):
  428. LFDUX f8, XX, INCX
  429. LFDX f9, XX, INC1
  430. fmul f16, f30, f8
  431. fmul f17, f30, f9
  432. fmadd f0, f16, f16, f0
  433. fmadd f1, f17, f17, f1
  434. bdnz LL(160)
  435. .align 4
  /* Add the eight partial sums; the total s ends up in f1. */
  436. LL(170):
  437. fadd f0, f0, f1
  438. fadd f2, f2, f3
  439. fadd f4, f4, f5
  440. fadd f6, f6, f7
  441. fadd f0, f0, f2
  442. fadd f4, f4, f6
  443. fadd f1, f0, f4
  /*
   * Square root of s without a sqrt instruction.
   * y = frsqrte(s) is an estimate of 1/sqrt(s); f8 = 0.5, f9 = 3.0, and
   * f7 = f8 + f8 = 1.0. Three refinement steps follow, each computing
   *   y = (0.5 * y) * (3.0 - (s * y) * y)
   * where fnmsub a, b, c, d yields d - b * c.
   */
  444. frsqrte f0, f1
  445. lfs f8, C1
  446. lfs f9, C2
  447. fmul f2, f1, f0
  448. fadd f7, f8, f8
  449. fmul f3, f0, f8
  450. fnmsub f4, f2, f0, f9
  451. fmul f0, f3, f4
  452. fmul f2, f1, f0
  453. fmul f3, f0, f8
  454. fnmsub f4, f2, f0, f9
  455. fmul f0, f3, f4
  456. fmul f2, f1, f0
  457. fmul f3, f0, f8
  458. fnmsub f4, f2, f0, f9
  459. fmul f0, f3, f4
  /* r = s * y approximates sqrt(s); one correction step: */
  /* f1 = r + (0.5 * r) * (1.0 - r * y). */
  460. fmul f5, f1, f0
  461. fmul f2, f5, f8
  462. fnmsub f3, f5, f0, f7
  463. fmadd f1, f2, f3, f5
  /* Undo the scaling: f1 = m * sqrt(s). */
  464. fmul f1, f31, f1
  465. .align 4
  /* Common exit: reload f14-f31, release the frame and return; f1 is left */
  /* untouched by this sequence. */
  466. LL(999):
  467. lfd f14, 0(SP)
  468. lfd f15, 8(SP)
  469. lfd f16, 16(SP)
  470. lfd f17, 24(SP)
  471. lfd f18, 32(SP)
  472. lfd f19, 40(SP)
  473. lfd f20, 48(SP)
  474. lfd f21, 56(SP)
  475. lfd f22, 64(SP)
  476. lfd f23, 72(SP)
  477. lfd f24, 80(SP)
  478. lfd f25, 88(SP)
  479. lfd f26, 96(SP)
  480. lfd f27, 104(SP)
  481. lfd f28, 112(SP)
  482. lfd f29, 120(SP)
  483. lfd f30, 128(SP)
  484. lfd f31, 136(SP)
  485. addi SP, SP, STACKSIZE
  486. blr
  487. EPILOGUE