You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

nrm2.S 17 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Register aliases. N / X / INCX are the element count, the vector
     pointer and the stride (presumably the incoming arguments - confirm
     against the caller's prototype). */
  40. #define N r3
  41. #define X r4
  42. #define INCX r5
  /* NN / XX receive copies of N and X before the first pass modifies them;
     the second pass (sum of squares) runs from these copies. */
  43. #define NN r6
  44. #define XX r7
  /* PREA is loaded with L1_PREFETCHSIZE and passed to L1_PREFETCH. */
  45. #define PREA r8
  /* Stack slots above the eighteen 8-byte FPR save slots (0..143(SP)):
     FZERO and FONE are 4-byte words read with lfs, FMAX is an 8-byte
     slot written as two words. STACKSIZE is the frame size in bytes. */
  46. #define FZERO 144(SP)
  47. #define FONE 148(SP)
  48. #define FMAX 152(SP)
  49. #define STACKSIZE 160
  /* Entry: allocate a STACKSIZE-byte frame, save f14-f31, place the
     constants on the stack, and leave early with f1 = 0.0 when N <= 0 or
     INCX <= 0. PROLOGUE / PROFCODE are macros from common.h (not shown). */
  50. PROLOGUE
  51. PROFCODE
  52. addi SP, SP, -STACKSIZE
  /* r10 = 0, r11 = 0x3f800000 (bit pattern of single-precision 1.0),
     r12 = 0x5fe00000 (first word written to the FMAX slot). */
  53. li r10, 0
  54. lis r11, 0x3f80
  55. lis r12, 0x5fe0
  /* Save f14-f31; they are reloaded from the same slots at LL(9999). */
  56. stfd f14, 0(SP)
  57. stfd f15, 8(SP)
  58. stfd f16, 16(SP)
  59. stfd f17, 24(SP)
  60. stfd f18, 32(SP)
  61. stfd f19, 40(SP)
  62. stfd f20, 48(SP)
  63. stfd f21, 56(SP)
  64. stfd f22, 64(SP)
  65. stfd f23, 72(SP)
  66. stfd f24, 80(SP)
  67. stfd f25, 88(SP)
  68. stfd f26, 96(SP)
  69. stfd f27, 104(SP)
  70. stfd f28, 112(SP)
  71. stfd f29, 120(SP)
  72. stfd f30, 128(SP)
  73. stfd f31, 136(SP)
  /* Write the constants: FZERO = 0, FONE = 1.0f, FMAX = r12 followed by a
     zero word (presumably the high/low halves of a large double on a
     big-endian target - TODO confirm). */
  74. stw r10, FZERO
  75. stw r11, FONE
  76. stw r12, FMAX
  77. stw r10, 4 + FMAX
  /* f1 = 0.0: this is the value left in f1 on the early exits below. */
  78. lfs f1, FZERO
  /* With F_INTERFACE, N and INCX arrive as pointers and are dereferenced. */
  79. #ifdef F_INTERFACE
  80. LDINT N, 0(N)
  81. LDINT INCX, 0(INCX)
  82. #endif
  /* Scale the stride by BASE_SHIFT (presumably elements -> bytes). */
  83. slwi INCX, INCX, BASE_SHIFT
  84. li PREA, L1_PREFETCHSIZE
  /* Nothing to do for N <= 0 or a non-positive stride. */
  85. cmpwi cr0, N, 0
  86. ble- LL(9999)
  87. cmpwi cr0, INCX, 0
  88. ble- LL(9999)
  /* Pass 1 setup: remember N and X for pass 2, then seed all eight running
     maxima f0-f7 with |x[0]| and advance X past the first element. */
  89. mr NN, N
  90. mr XX, X
  91. LFD f1, 0 * SIZE(X)
  92. add X, X, INCX
  93. fabs f0, f1
  94. fabs f2, f1
  95. fabs f3, f1
  96. fabs f4, f1
  97. fabs f5, f1
  98. fabs f6, f1
  99. fabs f7, f1
  /* f1 is converted last because it is the source of the fabs above. */
  100. fabs f1, f1
  /* One element consumed. If it was the only one, exit with f1 = |x[0]|. */
  101. subi N, N, 1
  102. cmpwi cr0, N, 0
  103. ble- LL(9999)
  /* A stride other than SIZE bytes takes the strided path at LL(1000). */
  104. cmpwi cr0, INCX, SIZE
  105. bne- cr0, LL(1000)
  /* Unit-stride max pass: CTR = N / 16 blocks of 16 elements. With no full
     block, go straight to the remainder loop at LL(50). */
  106. srawi. r0, N, 4
  107. mtspr CTR, r0
  108. beq- cr0, LL(50)
  /* Load the first 16 elements (X is not advanced here). f8-f15 receive
     the absolute values of elements 0-7; f24-f31 are then reloaded with
     the raw elements 8-15. */
  109. LFD f24, 0 * SIZE(X)
  110. LFD f25, 1 * SIZE(X)
  111. LFD f26, 2 * SIZE(X)
  112. LFD f27, 3 * SIZE(X)
  113. LFD f28, 4 * SIZE(X)
  114. LFD f29, 5 * SIZE(X)
  115. LFD f30, 6 * SIZE(X)
  116. LFD f31, 7 * SIZE(X)
  117. fabs f8, f24
  118. fabs f9, f25
  119. fabs f10, f26
  120. fabs f11, f27
  121. LFD f24, 8 * SIZE(X)
  122. LFD f25, 9 * SIZE(X)
  123. LFD f26, 10 * SIZE(X)
  124. LFD f27, 11 * SIZE(X)
  125. fabs f12, f28
  126. fabs f13, f29
  127. fabs f14, f30
  128. fabs f15, f31
  129. LFD f28, 12 * SIZE(X)
  130. LFD f29, 13 * SIZE(X)
  131. LFD f30, 14 * SIZE(X)
  132. LFD f31, 15 * SIZE(X)
  /* Decrement CTR; if this was the only block, skip the loop and drain. */
  133. bdz LL(20)
  /* Main unit-stride max loop, 16 elements per iteration. Each half does
     fK = max(fK, f(8+K)) for K = 0..7: fsub produces the difference and
     fsel keeps fK when the difference is >= 0, otherwise f(8+K). The fabs
     and LFD instructions interleaved with the fsel prepare the values for
     the following half, so loads run 16 elements ahead of X. */
  134. .align 4
  135. LL(10):
  136. fsub f16, f0, f8
  137. fsub f17, f1, f9
  138. fsub f18, f2, f10
  139. fsub f19, f3, f11
  140. fsub f20, f4, f12
  141. fsub f21, f5, f13
  142. fsub f22, f6, f14
  143. fsub f23, f7, f15
  144. fsel f0, f16, f0, f8
  145. fabs f8, f24
  146. fsel f1, f17, f1, f9
  147. fabs f9, f25
  148. fsel f2, f18, f2, f10
  149. fabs f10, f26
  150. fsel f3, f19, f3, f11
  151. fabs f11, f27
  152. LFD f24, 16 * SIZE(X)
  153. LFD f25, 17 * SIZE(X)
  154. LFD f26, 18 * SIZE(X)
  155. LFD f27, 19 * SIZE(X)
  156. fsel f4, f20, f4, f12
  157. fabs f12, f28
  158. fsel f5, f21, f5, f13
  159. fabs f13, f29
  160. fsel f6, f22, f6, f14
  161. fabs f14, f30
  162. fsel f7, f23, f7, f15
  163. fabs f15, f31
  164. LFD f28, 20 * SIZE(X)
  165. LFD f29, 21 * SIZE(X)
  166. LFD f30, 22 * SIZE(X)
  167. LFD f31, 23 * SIZE(X)
  /* Second half of the iteration: same compare/select on the next eight. */
  168. fsub f16, f0, f8
  169. fsub f17, f1, f9
  170. fsub f18, f2, f10
  171. fsub f19, f3, f11
  172. fsub f20, f4, f12
  173. fsub f21, f5, f13
  174. fsub f22, f6, f14
  175. fsub f23, f7, f15
  176. fsel f0, f16, f0, f8
  177. fabs f8, f24
  178. fsel f1, f17, f1, f9
  179. fabs f9, f25
  180. fsel f2, f18, f2, f10
  181. fabs f10, f26
  182. fsel f3, f19, f3, f11
  183. fabs f11, f27
  184. LFD f24, 24 * SIZE(X)
  185. LFD f25, 25 * SIZE(X)
  186. LFD f26, 26 * SIZE(X)
  187. LFD f27, 27 * SIZE(X)
  188. fsel f4, f20, f4, f12
  189. fabs f12, f28
  190. fsel f5, f21, f5, f13
  191. fabs f13, f29
  192. fsel f6, f22, f6, f14
  193. fabs f14, f30
  194. fsel f7, f23, f7, f15
  195. fabs f15, f31
  196. LFD f28, 28 * SIZE(X)
  197. LFD f29, 29 * SIZE(X)
  198. LFD f30, 30 * SIZE(X)
  199. LFD f31, 31 * SIZE(X)
  /* The prefetch is issued before the pointer bump unless POWER6 is
     defined, in which case it is issued after it. */
  200. #ifndef POWER6
  201. L1_PREFETCH X, PREA
  202. #endif
  203. addi X, X, 16 * SIZE
  204. #ifdef POWER6
  205. L1_PREFETCH X, PREA
  206. #endif
  207. bdnz LL(10)
  /* Drain of the unit-stride max loop: fold in the 16 elements that are
     already loaded (f8-f15 as absolute values, f24-f31 still raw) without
     issuing further loads, then step X past them. */
  208. .align 4
  209. LL(20):
  210. fsub f16, f0, f8
  211. fsub f17, f1, f9
  212. fsub f18, f2, f10
  213. fsub f19, f3, f11
  214. fsub f20, f4, f12
  215. fsub f21, f5, f13
  216. fsub f22, f6, f14
  217. fsub f23, f7, f15
  218. fsel f0, f16, f0, f8
  219. fabs f8, f24
  220. fsel f1, f17, f1, f9
  221. fabs f9, f25
  222. fsel f2, f18, f2, f10
  223. fabs f10, f26
  224. fsel f3, f19, f3, f11
  225. fabs f11, f27
  226. fsel f4, f20, f4, f12
  227. fabs f12, f28
  228. fsel f5, f21, f5, f13
  229. fabs f13, f29
  230. fsel f6, f22, f6, f14
  231. fabs f14, f30
  232. fsel f7, f23, f7, f15
  233. fabs f15, f31
  /* Last eight values of the final block. */
  234. fsub f16, f0, f8
  235. fsub f17, f1, f9
  236. fsub f18, f2, f10
  237. fsub f19, f3, f11
  238. fsub f20, f4, f12
  239. fsub f21, f5, f13
  240. fsub f22, f6, f14
  241. fsub f23, f7, f15
  242. fsel f0, f16, f0, f8
  243. fsel f1, f17, f1, f9
  244. fsel f2, f18, f2, f10
  245. fsel f3, f19, f3, f11
  246. fsel f4, f20, f4, f12
  247. fsel f5, f21, f5, f13
  248. fsel f6, f22, f6, f14
  249. fsel f7, f23, f7, f15
  250. addi X, X, 16 * SIZE
  /* Unit-stride max remainder: the N & 15 elements left after the 16-wide
     blocks are folded one at a time into f1. */
  251. .align 4
  252. LL(50):
  253. andi. r0, N, 15
  254. mtspr CTR, r0
  255. beq LL(100)
  256. .align 4
  257. LL(60):
  258. LFD f8, 0 * SIZE(X)
  259. addi X, X, 1 * SIZE
  260. fabs f8, f8
  /* f1 = max(f1, |x|): fsel keeps f1 when f1 - |x| >= 0. */
  261. fsub f16, f1, f8
  262. fsel f1, f16, f1, f8
  263. bdnz LL(60)
  /* End of the unit-stride max pass: reduce the eight running maxima
     f0-f7 pairwise into f31, derive the scale factor, and set up pass 2. */
  264. .align 4
  265. LL(100):
  266. fsub f8, f0, f1
  267. fsub f9, f2, f3
  268. fsub f10, f4, f5
  269. fsub f11, f6, f7
  270. fsel f0, f8, f0, f1
  271. fsel f2, f9, f2, f3
  272. fsel f4, f10, f4, f5
  273. fsel f6, f11, f6, f7
  274. fsub f8, f0, f2
  275. fsub f9, f4, f6
  276. fsel f0, f8, f0, f2
  277. fsel f4, f9, f4, f6
  278. fsub f8, f0, f4
  /* f31 = largest absolute value found. */
  279. fsel f31, f8, f0, f4
  /* f1 = 0.0, f0 = 1.0. A zero maximum exits with f1 = 0.0. */
  280. lfs f1, FZERO
  281. lfs f0, FONE
  282. fcmpu cr0, f1, f31
  283. beq- cr0, LL(9999)
  /* f30 = 1 / max, the factor applied to every element in pass 2. */
  284. fdiv f30, f0, f31
  /* Clear the partial sums f0 and f2-f7 (f1 already holds 0.0). */
  285. fmr f0, f1
  286. fmr f2, f1
  287. fmr f3, f1
  288. fmr f4, f1
  289. fmr f5, f1
  290. fmr f6, f1
  291. fmr f7, f1
  /* Pass 2 covers all NN elements starting at XX: CTR = NN / 16 blocks. */
  292. srawi. r0, NN, 4
  293. mtspr CTR, r0
  294. beq- cr0, LL(250)
  /* Unit-stride sum-of-squares preload: load the first 16 elements (XX is
     not advanced). f16-f23 receive elements 0-7 multiplied by f30; f8-f15
     are then reloaded with the raw elements 8-15. */
  295. LFD f8, 0 * SIZE(XX)
  296. LFD f9, 1 * SIZE(XX)
  297. LFD f10, 2 * SIZE(XX)
  298. LFD f11, 3 * SIZE(XX)
  299. LFD f12, 4 * SIZE(XX)
  300. LFD f13, 5 * SIZE(XX)
  301. LFD f14, 6 * SIZE(XX)
  302. LFD f15, 7 * SIZE(XX)
  303. fmul f16, f30, f8
  304. fmul f17, f30, f9
  305. fmul f18, f30, f10
  306. fmul f19, f30, f11
  307. LFD f8, 8 * SIZE(XX)
  308. LFD f9, 9 * SIZE(XX)
  309. LFD f10, 10 * SIZE(XX)
  310. LFD f11, 11 * SIZE(XX)
  311. fmul f20, f30, f12
  312. fmul f21, f30, f13
  313. fmul f22, f30, f14
  314. fmul f23, f30, f15
  315. LFD f12, 12 * SIZE(XX)
  316. LFD f13, 13 * SIZE(XX)
  317. LFD f14, 14 * SIZE(XX)
  318. LFD f15, 15 * SIZE(XX)
  /* Decrement CTR; if this was the only block, skip the loop and drain. */
  319. bdz LL(220)
  /* Main unit-stride sum-of-squares loop, 16 elements per iteration. Each
     fmadd adds the square of a scaled value to one of the eight partial
     sums f0-f7; the fmul that follows scales the next raw value into the
     same register, and the LFDs fetch data 16 elements ahead of XX. */
  320. .align 4
  321. LL(210):
  322. fmadd f0, f16, f16, f0
  323. fmul f16, f30, f8
  324. fmadd f1, f17, f17, f1
  325. fmul f17, f30, f9
  326. fmadd f2, f18, f18, f2
  327. fmul f18, f30, f10
  328. fmadd f3, f19, f19, f3
  329. fmul f19, f30, f11
  330. LFD f8, 16 * SIZE(XX)
  331. LFD f9, 17 * SIZE(XX)
  332. LFD f10, 18 * SIZE(XX)
  333. LFD f11, 19 * SIZE(XX)
  334. fmadd f4, f20, f20, f4
  335. fmul f20, f30, f12
  336. fmadd f5, f21, f21, f5
  337. fmul f21, f30, f13
  338. fmadd f6, f22, f22, f6
  339. fmul f22, f30, f14
  340. fmadd f7, f23, f23, f7
  341. fmul f23, f30, f15
  342. LFD f12, 20 * SIZE(XX)
  343. LFD f13, 21 * SIZE(XX)
  344. LFD f14, 22 * SIZE(XX)
  345. LFD f15, 23 * SIZE(XX)
  /* Second half of the iteration. */
  346. fmadd f0, f16, f16, f0
  347. fmul f16, f30, f8
  348. fmadd f1, f17, f17, f1
  349. fmul f17, f30, f9
  350. fmadd f2, f18, f18, f2
  351. fmul f18, f30, f10
  352. fmadd f3, f19, f19, f3
  353. fmul f19, f30, f11
  354. LFD f8, 24 * SIZE(XX)
  355. LFD f9, 25 * SIZE(XX)
  356. LFD f10, 26 * SIZE(XX)
  357. LFD f11, 27 * SIZE(XX)
  358. fmadd f4, f20, f20, f4
  359. fmul f20, f30, f12
  360. fmadd f5, f21, f21, f5
  361. fmul f21, f30, f13
  362. fmadd f6, f22, f22, f6
  363. fmul f22, f30, f14
  364. fmadd f7, f23, f23, f7
  365. fmul f23, f30, f15
  366. LFD f12, 28 * SIZE(XX)
  367. LFD f13, 29 * SIZE(XX)
  368. LFD f14, 30 * SIZE(XX)
  369. LFD f15, 31 * SIZE(XX)
  /* The prefetch is issued before the pointer bump unless POWER6 is
     defined, in which case it is issued after it. */
  370. #ifndef POWER6
  371. L1_PREFETCH XX, PREA
  372. #endif
  373. addi XX, XX, 16 * SIZE
  374. #ifdef POWER6
  375. L1_PREFETCH XX, PREA
  376. #endif
  377. bdnz LL(210)
  /* Drain of the unit-stride sum-of-squares loop: accumulate the eight
     scaled values in f16-f23, scale and accumulate the eight raw values
     still in f8-f15, then step XX past the block. No loads are issued. */
  378. .align 4
  379. LL(220):
  380. fmadd f0, f16, f16, f0
  381. fmul f16, f30, f8
  382. fmadd f1, f17, f17, f1
  383. fmul f17, f30, f9
  384. fmadd f2, f18, f18, f2
  385. fmul f18, f30, f10
  386. fmadd f3, f19, f19, f3
  387. fmul f19, f30, f11
  388. fmadd f4, f20, f20, f4
  389. fmul f20, f30, f12
  390. fmadd f5, f21, f21, f5
  391. fmul f21, f30, f13
  392. fmadd f6, f22, f22, f6
  393. fmul f22, f30, f14
  394. fmadd f7, f23, f23, f7
  395. fmul f23, f30, f15
  396. fmadd f0, f16, f16, f0
  397. fmadd f1, f17, f17, f1
  398. fmadd f2, f18, f18, f2
  399. fmadd f3, f19, f19, f3
  400. fmadd f4, f20, f20, f4
  401. fmadd f5, f21, f21, f5
  402. fmadd f6, f22, f22, f6
  403. fmadd f7, f23, f23, f7
  404. addi XX, XX, 16 * SIZE
  /* Unit-stride sum-of-squares remainder: the NN & 15 leftover elements
     are scaled by f30 and their squares added to f0, one per iteration. */
  405. .align 4
  406. LL(250):
  407. andi. r0, NN, 15
  408. mtspr CTR, r0
  409. beq- cr0, LL(270)
  410. .align 4
  411. LL(260):
  412. LFD f8, 0 * SIZE(XX)
  413. addi XX, XX, 1 * SIZE
  414. fmul f16, f30, f8
  415. fmadd f0, f16, f16, f0
  416. bdnz LL(260)
  /* Unit-stride finish: add the eight partial sums pairwise into f0, take
     the square root, and multiply by the maximum in f31 to undo the
     scaling. The result is left in f1 before jumping to the epilogue. */
  417. .align 4
  418. LL(270):
  419. fadd f0, f0, f1
  420. fadd f2, f2, f3
  421. fadd f4, f4, f5
  422. fadd f6, f6, f7
  423. fadd f0, f0, f2
  424. fadd f4, f4, f6
  425. fadd f0, f0, f4
  426. fsqrt f0, f0
  427. fmul f1, f31, f0
  428. b LL(9999)
  /* Strided max pass. X is moved back by one stride because every access
     below goes through LFDUX with INCX (presumably a load-with-update
     form that adds INCX to X before loading - confirm in common.h). */
  429. .align 4
  430. LL(1000):
  431. sub X, X, INCX
  /* CTR = N / 16 blocks; with none, go to the remainder loop at LL(1050). */
  432. srawi. r0, N, 4
  433. mtspr CTR, r0
  434. beq- LL(1050)
  /* Load the first 16 elements: f8-f15 receive the absolute values of the
     first eight, f24-f31 are then reloaded with the next eight raw. */
  435. LFDUX f24, X, INCX
  436. LFDUX f25, X, INCX
  437. LFDUX f26, X, INCX
  438. LFDUX f27, X, INCX
  439. LFDUX f28, X, INCX
  440. LFDUX f29, X, INCX
  441. LFDUX f30, X, INCX
  442. LFDUX f31, X, INCX
  443. fabs f8, f24
  444. fabs f9, f25
  445. fabs f10, f26
  446. fabs f11, f27
  447. LFDUX f24, X, INCX
  448. LFDUX f25, X, INCX
  449. LFDUX f26, X, INCX
  450. LFDUX f27, X, INCX
  451. fabs f12, f28
  452. fabs f13, f29
  453. fabs f14, f30
  454. fabs f15, f31
  455. LFDUX f28, X, INCX
  456. LFDUX f29, X, INCX
  457. LFDUX f30, X, INCX
  458. LFDUX f31, X, INCX
  /* Decrement CTR; if this was the only block, skip the loop and drain. */
  459. bdz LL(1020)
  /* Main strided max loop, 16 elements per iteration. Same compare/select
     scheme as the unit-stride loop (fK = max(fK, f(8+K)) via fsub + fsel),
     with the next values fetched through LFDUX. No prefetch is issued on
     this path. */
  460. .align 4
  461. LL(1010):
  462. fsub f16, f0, f8
  463. fsub f17, f1, f9
  464. fsub f18, f2, f10
  465. fsub f19, f3, f11
  466. fsub f20, f4, f12
  467. fsub f21, f5, f13
  468. fsub f22, f6, f14
  469. fsub f23, f7, f15
  470. fsel f0, f16, f0, f8
  471. fabs f8, f24
  472. fsel f1, f17, f1, f9
  473. fabs f9, f25
  474. fsel f2, f18, f2, f10
  475. fabs f10, f26
  476. fsel f3, f19, f3, f11
  477. fabs f11, f27
  478. LFDUX f24, X, INCX
  479. LFDUX f25, X, INCX
  480. LFDUX f26, X, INCX
  481. LFDUX f27, X, INCX
  482. fsel f4, f20, f4, f12
  483. fabs f12, f28
  484. fsel f5, f21, f5, f13
  485. fabs f13, f29
  486. fsel f6, f22, f6, f14
  487. fabs f14, f30
  488. fsel f7, f23, f7, f15
  489. fabs f15, f31
  490. LFDUX f28, X, INCX
  491. LFDUX f29, X, INCX
  492. LFDUX f30, X, INCX
  493. LFDUX f31, X, INCX
  /* Second half of the iteration. */
  494. fsub f16, f0, f8
  495. fsub f17, f1, f9
  496. fsub f18, f2, f10
  497. fsub f19, f3, f11
  498. fsub f20, f4, f12
  499. fsub f21, f5, f13
  500. fsub f22, f6, f14
  501. fsub f23, f7, f15
  502. fsel f0, f16, f0, f8
  503. fabs f8, f24
  504. fsel f1, f17, f1, f9
  505. fabs f9, f25
  506. fsel f2, f18, f2, f10
  507. fabs f10, f26
  508. fsel f3, f19, f3, f11
  509. fabs f11, f27
  510. LFDUX f24, X, INCX
  511. LFDUX f25, X, INCX
  512. LFDUX f26, X, INCX
  513. LFDUX f27, X, INCX
  514. fsel f4, f20, f4, f12
  515. fabs f12, f28
  516. fsel f5, f21, f5, f13
  517. fabs f13, f29
  518. fsel f6, f22, f6, f14
  519. fabs f14, f30
  520. fsel f7, f23, f7, f15
  521. fabs f15, f31
  522. LFDUX f28, X, INCX
  523. LFDUX f29, X, INCX
  524. LFDUX f30, X, INCX
  525. LFDUX f31, X, INCX
  526. bdnz LL(1010)
  /* Drain of the strided max loop: fold in the 16 elements already loaded
     (f8-f15 as absolute values, f24-f31 still raw). No loads are issued
     and X is left where the last LFDUX put it. */
  527. .align 4
  528. LL(1020):
  529. fsub f16, f0, f8
  530. fsub f17, f1, f9
  531. fsub f18, f2, f10
  532. fsub f19, f3, f11
  533. fsub f20, f4, f12
  534. fsub f21, f5, f13
  535. fsub f22, f6, f14
  536. fsub f23, f7, f15
  537. fsel f0, f16, f0, f8
  538. fabs f8, f24
  539. fsel f1, f17, f1, f9
  540. fabs f9, f25
  541. fsel f2, f18, f2, f10
  542. fabs f10, f26
  543. fsel f3, f19, f3, f11
  544. fabs f11, f27
  545. fsel f4, f20, f4, f12
  546. fabs f12, f28
  547. fsel f5, f21, f5, f13
  548. fabs f13, f29
  549. fsel f6, f22, f6, f14
  550. fabs f14, f30
  551. fsel f7, f23, f7, f15
  552. fabs f15, f31
  /* Last eight values of the final block. */
  553. fsub f16, f0, f8
  554. fsub f17, f1, f9
  555. fsub f18, f2, f10
  556. fsub f19, f3, f11
  557. fsub f20, f4, f12
  558. fsub f21, f5, f13
  559. fsub f22, f6, f14
  560. fsub f23, f7, f15
  561. fsel f0, f16, f0, f8
  562. fsel f1, f17, f1, f9
  563. fsel f2, f18, f2, f10
  564. fsel f3, f19, f3, f11
  565. fsel f4, f20, f4, f12
  566. fsel f5, f21, f5, f13
  567. fsel f6, f22, f6, f14
  568. fsel f7, f23, f7, f15
  /* Strided max remainder: the N & 15 leftover elements are folded one at
     a time into f1 (f1 = max(f1, |x|) via fsub + fsel). */
  569. .align 4
  570. LL(1050):
  571. andi. r0, N, 15
  572. mtspr CTR, r0
  573. beq LL(1999)
  574. .align 4
  575. LL(1060):
  576. LFDUX f8, X, INCX
  577. fabs f8, f8
  578. fsub f16, f1, f8
  579. fsel f1, f16, f1, f8
  580. bdnz LL(1060)
  /* End of the strided max pass: reduce the eight running maxima f0-f7
     pairwise into f31, derive the scale factor, and set up pass 2. */
  581. .align 4
  582. LL(1999):
  583. fsub f8, f0, f1
  584. fsub f9, f2, f3
  585. fsub f10, f4, f5
  586. fsub f11, f6, f7
  587. fsel f0, f8, f0, f1
  588. fsel f2, f9, f2, f3
  589. fsel f4, f10, f4, f5
  590. fsel f6, f11, f6, f7
  591. fsub f8, f0, f2
  592. fsub f9, f4, f6
  593. fsel f0, f8, f0, f2
  594. fsel f4, f9, f4, f6
  595. fsub f8, f0, f4
  /* f31 = largest absolute value found. */
  596. fsel f31, f8, f0, f4
  /* f1 = 0.0, f0 = 1.0. */
  597. lfs f1, FZERO
  598. lfs f0, FONE
  /* NOTE(review): f2 is overwritten by "fmr f2, f1" below before it is
     read, so this FMAX load appears to have no effect; the unit-stride
     path at LL(100) has no equivalent load. */
  599. lfd f2, FMAX
  /* A zero maximum exits with f1 = 0.0. */
  600. fcmpu cr0, f1, f31
  601. beq- cr0, LL(9999)
  /* f30 = 1 / max, the factor applied to every element in pass 2. */
  602. fdiv f30, f0, f31
  /* Clear the partial sums f0 and f2-f7 (f1 already holds 0.0). */
  603. fmr f0, f1
  604. fmr f2, f1
  605. fmr f3, f1
  606. fmr f4, f1
  607. fmr f5, f1
  608. fmr f6, f1
  609. fmr f7, f1
  /* Move XX back one stride so the first LFDUX lands on the first element,
     then set CTR = NN / 16 blocks for pass 2. */
  610. sub XX, XX, INCX
  611. srawi. r0, NN, 4
  612. mtspr CTR, r0
  613. beq- cr0, LL(2150)
  /* Strided sum-of-squares preload: load the first 16 elements through
     LFDUX. f16-f23 receive the first eight multiplied by f30; f8-f15 are
     then reloaded with the next eight raw values. */
  614. LFDUX f8, XX, INCX
  615. LFDUX f9, XX, INCX
  616. LFDUX f10, XX, INCX
  617. LFDUX f11, XX, INCX
  618. LFDUX f12, XX, INCX
  619. LFDUX f13, XX, INCX
  620. LFDUX f14, XX, INCX
  621. LFDUX f15, XX, INCX
  622. fmul f16, f30, f8
  623. fmul f17, f30, f9
  624. fmul f18, f30, f10
  625. fmul f19, f30, f11
  626. LFDUX f8, XX, INCX
  627. LFDUX f9, XX, INCX
  628. LFDUX f10, XX, INCX
  629. LFDUX f11, XX, INCX
  630. fmul f20, f30, f12
  631. fmul f21, f30, f13
  632. fmul f22, f30, f14
  633. fmul f23, f30, f15
  634. LFDUX f12, XX, INCX
  635. LFDUX f13, XX, INCX
  636. LFDUX f14, XX, INCX
  637. LFDUX f15, XX, INCX
  /* Decrement CTR; if this was the only block, skip the loop and drain. */
  638. bdz LL(2120)
  /* Main strided sum-of-squares loop, 16 elements per iteration. Each
     fmadd adds the square of a scaled value to one of the partial sums
     f0-f7; the following fmul scales the next raw value into the same
     register, and the LFDUX groups fetch the values after that. */
  639. .align 4
  640. LL(2110):
  641. fmadd f0, f16, f16, f0
  642. fmul f16, f30, f8
  643. fmadd f1, f17, f17, f1
  644. fmul f17, f30, f9
  645. fmadd f2, f18, f18, f2
  646. fmul f18, f30, f10
  647. fmadd f3, f19, f19, f3
  648. fmul f19, f30, f11
  649. LFDUX f8, XX, INCX
  650. LFDUX f9, XX, INCX
  651. LFDUX f10, XX, INCX
  652. LFDUX f11, XX, INCX
  653. fmadd f4, f20, f20, f4
  654. fmul f20, f30, f12
  655. fmadd f5, f21, f21, f5
  656. fmul f21, f30, f13
  657. fmadd f6, f22, f22, f6
  658. fmul f22, f30, f14
  659. fmadd f7, f23, f23, f7
  660. fmul f23, f30, f15
  661. LFDUX f12, XX, INCX
  662. LFDUX f13, XX, INCX
  663. LFDUX f14, XX, INCX
  664. LFDUX f15, XX, INCX
  /* Second half of the iteration. */
  665. fmadd f0, f16, f16, f0
  666. fmul f16, f30, f8
  667. fmadd f1, f17, f17, f1
  668. fmul f17, f30, f9
  669. fmadd f2, f18, f18, f2
  670. fmul f18, f30, f10
  671. fmadd f3, f19, f19, f3
  672. fmul f19, f30, f11
  673. LFDUX f8, XX, INCX
  674. LFDUX f9, XX, INCX
  675. LFDUX f10, XX, INCX
  676. LFDUX f11, XX, INCX
  677. fmadd f4, f20, f20, f4
  678. fmul f20, f30, f12
  679. fmadd f5, f21, f21, f5
  680. fmul f21, f30, f13
  681. fmadd f6, f22, f22, f6
  682. fmul f22, f30, f14
  683. fmadd f7, f23, f23, f7
  684. fmul f23, f30, f15
  685. LFDUX f12, XX, INCX
  686. LFDUX f13, XX, INCX
  687. LFDUX f14, XX, INCX
  688. LFDUX f15, XX, INCX
  689. bdnz LL(2110)
  /* Drain of the strided sum-of-squares loop: accumulate the eight scaled
     values in f16-f23, then scale and accumulate the eight raw values
     still in f8-f15. No loads are issued. */
  690. .align 4
  691. LL(2120):
  692. fmadd f0, f16, f16, f0
  693. fmul f16, f30, f8
  694. fmadd f1, f17, f17, f1
  695. fmul f17, f30, f9
  696. fmadd f2, f18, f18, f2
  697. fmul f18, f30, f10
  698. fmadd f3, f19, f19, f3
  699. fmul f19, f30, f11
  700. fmadd f4, f20, f20, f4
  701. fmul f20, f30, f12
  702. fmadd f5, f21, f21, f5
  703. fmul f21, f30, f13
  704. fmadd f6, f22, f22, f6
  705. fmul f22, f30, f14
  706. fmadd f7, f23, f23, f7
  707. fmul f23, f30, f15
  708. fmadd f0, f16, f16, f0
  709. fmadd f1, f17, f17, f1
  710. fmadd f2, f18, f18, f2
  711. fmadd f3, f19, f19, f3
  712. fmadd f4, f20, f20, f4
  713. fmadd f5, f21, f21, f5
  714. fmadd f6, f22, f22, f6
  715. fmadd f7, f23, f23, f7
  /* Strided sum-of-squares remainder: the NN & 15 leftover elements are
     scaled by f30 and their squares added to f0, one per iteration. */
  716. .align 4
  717. LL(2150):
  718. andi. r0, NN, 15
  719. mtspr CTR, r0
  720. beq- cr0, LL(2170)
  721. .align 4
  722. LL(2160):
  723. LFDUX f8, XX, INCX
  724. fmul f16, f30, f8
  725. fmadd f0, f16, f16, f0
  726. bdnz LL(2160)
  /* Strided finish: add the eight partial sums pairwise into f0, take the
     square root, and multiply by the maximum in f31 to undo the scaling.
     The result is left in f1; control falls through to LL(9999). */
  727. .align 4
  728. LL(2170):
  729. fadd f0, f0, f1
  730. fadd f2, f2, f3
  731. fadd f4, f4, f5
  732. fadd f6, f6, f7
  733. fadd f0, f0, f2
  734. fadd f4, f4, f6
  735. fadd f0, f0, f4
  736. fsqrt f0, f0
  737. fmul f1, f31, f0
  /* Common exit. f1 holds the value computed above (0.0 on the early
     exits, |x[0]| for a single element, otherwise the scaled norm).
     Reload f14-f31 from the slots written at entry, release the frame,
     and return. */
  738. .align 4
  739. LL(9999):
  740. lfd f14, 0(SP)
  741. lfd f15, 8(SP)
  742. lfd f16, 16(SP)
  743. lfd f17, 24(SP)
  744. lfd f18, 32(SP)
  745. lfd f19, 40(SP)
  746. lfd f20, 48(SP)
  747. lfd f21, 56(SP)
  748. lfd f22, 64(SP)
  749. lfd f23, 72(SP)
  750. lfd f24, 80(SP)
  751. lfd f25, 88(SP)
  752. lfd f26, 96(SP)
  753. lfd f27, 104(SP)
  754. lfd f28, 112(SP)
  755. lfd f29, 120(SP)
  756. lfd f30, 128(SP)
  757. lfd f31, 136(SP)
  758. addi SP, SP, STACKSIZE
  759. blr
  760. EPILOGUE