You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemv_t.S 15 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #include "version.h"
  /* Stack frame size in bytes: room for the eight 8-byte stt saves of $f2-$f9. */
  41. #define STACKSIZE 64
  /* Look-ahead distance, in units of SIZE, used by the ldl-to-$31 loads (presumably prefetches). */
  42. #define PREFETCHSIZE 32
  /* Integer registers. M bounds the inner loops, N the outer column loop. */
  43. #define M $16
  44. #define N $17
  45. #define A $21
  46. #define LDA $18
  47. #define X $19
  48. #define INCX $20
  49. #define Y $22
  50. #define INCY $23
  51. #define BUFFER $24
  /* Loop counters: I counts inner iterations, J counts column pairs. */
  52. #define I $25
  53. #define J $27
  /* Working cursors: X1 walks x; Y1 is a store cursor (into BUFFER while packing x, */
  /* into y afterwards); A1 and A2 walk two adjacent columns of A. */
  54. #define X1 $3
  55. #define Y1 $4
  56. #define A1 $5
  57. #define A2 $6
  /* The two floating-point halves of the scalar applied to the sums in $L18 / $L28. */
  58. #define alpha_r $f19
  59. #define alpha_i $f20
  /* s0-s3: running sums. */
  60. #define s0 $f0
  61. #define s1 $f1
  62. #define s2 $f10
  63. #define s3 $f11
  /* t0-t3: most recent products, not yet added into s0-s3. */
  64. #define t0 $f12
  65. #define t1 $f13
  66. #define t2 $f14
  67. #define t3 $f15
  /* x0-x3: values loaded from x. */
  68. #define x0 $f16
  69. #define x1 $f17
  70. #define x2 $f18
  71. #define x3 $f21
  /* a0-a7: values loaded from A; also reused as temporaries for the x copy and the y update. */
  72. #define a0 $f22
  73. #define a1 $f23
  74. #define a2 $f24
  75. #define a3 $f25
  76. #define a4 $f26
  77. #define a5 $f27
  78. #define a6 $f28
  79. #define a7 $f29
  /* a8-a15 live in $f2-$f9, which the prologue saves and $L999 restores. */
  80. #define a8 $f2
  81. #define a9 $f3
  82. #define a10 $f4
  83. #define a11 $f5
  84. #define a12 $f6
  85. #define a13 $f7
  86. #define a14 $f8
  87. #define a15 $f9
  /*
   * ADD1..ADD4 select ADD or SUB for the four kinds of partial product,
   * depending on whether CONJ and/or XCONJ are defined.  In the loops of
   * this file they are applied to products of these forms:
   *   ADD1: x[even] * a[even]      ADD2: x[even] * a[odd]
   *   ADD3: x[odd]  * a[odd]       ADD4: x[odd]  * a[even]
   * where even/odd is the offset inside a two-element entry (presumably the
   * real and imaginary parts of interleaved complex data -- TODO confirm).
   * ADD1 is ADD in every variant; the other three change sign per variant.
   */
  88. #if !defined(CONJ) && !defined(XCONJ)
  89. #define ADD1 ADD
  90. #define ADD2 ADD
  91. #define ADD3 SUB
  92. #define ADD4 ADD
  93. #elif !defined(CONJ) && defined(XCONJ)
  94. #define ADD1 ADD
  95. #define ADD2 ADD
  96. #define ADD3 ADD
  97. #define ADD4 SUB
  98. #elif defined(CONJ) && !defined(XCONJ)
  99. #define ADD1 ADD
  100. #define ADD2 SUB
  101. #define ADD3 ADD
  102. #define ADD4 ADD
  103. #else
  /* Both CONJ and XCONJ defined. */
  104. #define ADD1 ADD
  105. #define ADD2 SUB
  106. #define ADD3 SUB
  107. #define ADD4 SUB
  108. #endif
  /* Routine entry (PROLOGUE / PROFCODE are macros defined outside this file). */
  109. PROLOGUE
  /* Allocate STACKSIZE bytes of stack. */
  110. lda $sp, -STACKSIZE($sp)
  /* Fetch the stack-passed arguments, which sit just above the new frame. */
  111. ldq LDA, 0 + STACKSIZE($sp)
  112. ldq X, 8 + STACKSIZE($sp)
  113. ldq INCX, 16 + STACKSIZE($sp)
  114. ldq Y, 24 + STACKSIZE($sp)
  115. ldq INCY, 32 + STACKSIZE($sp)
  116. ldq BUFFER, 40 + STACKSIZE($sp)
  /* Save $f2-$f9 (used below as a8-a15); $L999 reloads them before returning. */
  117. stt $f2, 0($sp)
  118. stt $f3, 8($sp)
  119. stt $f4, 16($sp)
  120. stt $f5, 24($sp)
  121. stt $f6, 32($sp)
  122. stt $f7, 40($sp)
  123. stt $f8, 48($sp)
  124. stt $f9, 56($sp)
  125. PROFCODE
  /* Leave via $L999 if M <= 0 or N <= 0.  Interleaved with that test, INCX and */
  /* INCY are shifted left by ZBASE_SHIFT (presumably entry stride -> byte stride). */
  126. cmple M, 0, $0
  127. sll INCX, ZBASE_SHIFT, INCX
  128. cmple N, 0, $1
  129. sll INCY, ZBASE_SHIFT, INCY
  130. or $0, $1, $0
  131. bne $0, $L999
  /* If the scaled INCX equals 2 * SIZE, x is used in place: skip the packing */
  /* loops and go straight to $L10.  LDA is scaled by ZBASE_SHIFT on the way. */
  132. cmpeq INCX, 2 * SIZE, $0
  133. mov X, X1
  134. sll LDA, ZBASE_SHIFT,LDA
  135. bne $0, $L10
  /* Otherwise pack x into BUFFER: I = M / 4 unrolled iterations, Y1 is the */
  /* write cursor, and X is redirected to BUFFER for the rest of the routine. */
  136. sra M, 2, I
  137. mov BUFFER, Y1
  138. mov BUFFER, X
  /* Fewer than four entries: go to the one-at-a-time copy. */
  139. ble I, $L05
  140. .align 4
  /* $L02: copy four two-element entries of x per iteration, reading at X1 */
  /* with stride INCX and writing to eight consecutive slots at Y1. */
  141. $L02:
  /* Load whose destination is $31: only the memory access matters (presumably a prefetch of x). */
  142. ldl $31, (PREFETCHSIZE + 0) * SIZE(X1)
  143. lda I, -1(I)
  /* Entries 0 and 1. */
  144. LD a0, 0 * SIZE(X1)
  145. LD a1, 1 * SIZE(X1)
  146. addq X1, INCX, X1
  147. LD a2, 0 * SIZE(X1)
  148. LD a3, 1 * SIZE(X1)
  149. addq X1, INCX, X1
  150. ST a0, 0 * SIZE(Y1)
  151. ST a1, 1 * SIZE(Y1)
  152. ST a2, 2 * SIZE(Y1)
  153. ST a3, 3 * SIZE(Y1)
  /* Entries 2 and 3. */
  154. LD a4, 0 * SIZE(X1)
  155. LD a5, 1 * SIZE(X1)
  156. addq X1, INCX, X1
  157. LD a6, 0 * SIZE(X1)
  158. LD a7, 1 * SIZE(X1)
  159. addq X1, INCX, X1
  160. ST a4, 4 * SIZE(Y1)
  161. ST a5, 5 * SIZE(Y1)
  162. ST a6, 6 * SIZE(Y1)
  163. ST a7, 7 * SIZE(Y1)
  /* Advance the write cursor past the four packed entries and loop while I > 0. */
  164. lda Y1, 8 * SIZE(Y1)
  165. bgt I, $L02
  166. .align 4
  /* $L05: I = M & 3 leftover entries; none left means packing is done. */
  167. $L05:
  168. and M, 3, I
  169. ble I, $L10
  170. .align 4
  /* $L06: copy one two-element entry per iteration. */
  171. $L06:
  172. LD a0, 0 * SIZE(X1)
  173. LD a1, 1 * SIZE(X1)
  174. addq X1, INCX, X1
  175. ST a0, 0 * SIZE(Y1)
  176. ST a1, 1 * SIZE(Y1)
  177. lda Y1, 2 * SIZE(Y1)
  178. lda I, -1(I)
  179. bgt I, $L06
  180. .align 4
  /* $L10: start of the compute phase.  Y1 becomes the store cursor for y (Y */
  /* stays the load cursor), the pending products t0-t3 are cleared, and */
  /* J = N / 2 column pairs are scheduled.  With no full pair, go to $L20. */
  181. $L10:
  182. mov Y, Y1
  183. fclr t0
  184. unop
  185. fclr t1
  186. sra N, 1, J
  187. fclr t2
  188. fclr t3
  189. ble J, $L20
  190. .align 4
  /* $L11: set up one column pair.  A1 = current column, A2 = A1 + LDA, and A */
  /* is advanced by 2 * LDA for the next pair.  Sums s0-s3 are cleared and X1 */
  /* restarts at the beginning of x. */
  191. $L11:
  192. mov A, A1
  193. fclr s0
  194. addq A, LDA, A2
  195. fclr s1
  196. addq A2, LDA, A
  197. unop
  198. mov X, X1
  /* Load whose destination is $f31: only the access to y matters (presumably a prefetch). */
  199. lds $f31, 3 * SIZE(Y)
  /* I = M / 4 blocks of four entries; with none, go to the remainder code at $L15. */
  200. sra M, 2, I
  201. fclr s2
  202. fclr s3
  203. ble I, $L15
  /* Preload the first block: four entries from each column (a0-a15, */
  /* alternating A1 / A2) and the first three values of x. */
  204. LD a0, 0 * SIZE(A1)
  205. LD a1, 1 * SIZE(A1)
  206. LD a2, 0 * SIZE(A2)
  207. LD a3, 1 * SIZE(A2)
  208. LD a4, 2 * SIZE(A1)
  209. LD a5, 3 * SIZE(A1)
  210. LD a6, 2 * SIZE(A2)
  211. LD a7, 3 * SIZE(A2)
  212. LD a8, 4 * SIZE(A1)
  213. LD a9, 5 * SIZE(A1)
  214. LD a10, 4 * SIZE(A2)
  215. LD a11, 5 * SIZE(A2)
  216. LD a12, 6 * SIZE(A1)
  217. LD a13, 7 * SIZE(A1)
  218. LD a14, 6 * SIZE(A2)
  219. LD a15, 7 * SIZE(A2)
  220. LD x0, 0 * SIZE(X1)
  221. LD x1, 1 * SIZE(X1)
  222. LD x2, 2 * SIZE(X1)
  /* The last block is handled by the drain code at $L13, so loop only I - 1 times. */
  223. lda I, -1(I)
  224. ble I, $L13
  225. .align 4
  /*
   * $L12: main loop for a column pair; one iteration consumes four entries
   * of x against the matching four entries of both columns.
   *   s0/s1 accumulate the column at A1, s2/s3 the column at A2.
   * The code is software-pipelined: each ADDn folds in the product written
   * to the same t register by an earlier MUL, and the loads interleaved
   * with the arithmetic fetch the operands of the NEXT iteration.
   * Instruction order is significant here; unop lines only pad issue slots.
   */
  226. $L12:
  /* Entry 0 of the block: x0 / x1 against a0-a3.  The first four ADDs */
  /* consume the products left pending by the previous iteration (or zeros). */
  227. ADD3 s0, t0, s0
  228. unop
  229. MUL x0, a0, t0
  230. LD x3, 3 * SIZE(X1)
  231. ADD4 s1, t1, s1
  232. ldl $31, (PREFETCHSIZE + 0) * SIZE(A1)
  233. MUL x0, a1, t1
  234. unop
  235. ADD3 s2, t2, s2
  236. unop
  237. MUL x0, a2, t2
  238. unop
  239. ADD4 s3, t3, s3
  240. unop
  241. MUL x0, a3, t3
  242. LD x0, 4 * SIZE(X1)
  243. ADD1 s0, t0, s0
  244. unop
  245. MUL x1, a1, t0
  246. LD a1, 9 * SIZE(A1)
  247. ADD2 s1, t1, s1
  248. unop
  249. MUL x1, a0, t1
  250. LD a0, 8 * SIZE(A1)
  251. ADD1 s2, t2, s2
  252. unop
  253. MUL x1, a3, t2
  254. LD a3, 9 * SIZE(A2)
  255. ADD2 s3, t3, s3
  256. unop
  257. MUL x1, a2, t3
  258. LD a2, 8 * SIZE(A2)
  /* Entry 1 of the block: x2 / x3 against a4-a7. */
  259. ADD3 s0, t0, s0
  260. unop
  261. MUL x2, a4, t0
  262. LD x1, 5 * SIZE(X1)
  263. ADD4 s1, t1, s1
  264. MUL x2, a5, t1
  265. ADD3 s2, t2, s2
  266. MUL x2, a6, t2
  267. ADD4 s3, t3, s3
  268. unop
  269. MUL x2, a7, t3
  270. LD x2, 6 * SIZE(X1)
  271. ADD1 s0, t0, s0
  272. unop
  273. MUL x3, a5, t0
  274. LD a5, 11 * SIZE(A1)
  275. ADD2 s1, t1, s1
  276. unop
  277. MUL x3, a4, t1
  278. LD a4, 10 * SIZE(A1)
  279. ADD1 s2, t2, s2
  280. unop
  281. MUL x3, a7, t2
  282. LD a7, 11 * SIZE(A2)
  283. ADD2 s3, t3, s3
  284. unop
  285. MUL x3, a6, t3
  286. LD a6, 10 * SIZE(A2)
  /* Entry 2 of the block: x0 / x1 (reloaded from offsets 4 and 5) against a8-a11. */
  287. ADD3 s0, t0, s0
  288. unop
  289. MUL x0, a8, t0
  290. LD x3, 7 * SIZE(X1)
  291. ADD4 s1, t1, s1
  292. ldl $31, (PREFETCHSIZE + 0) * SIZE(A2)
  293. MUL x0, a9, t1
  294. unop
  295. ADD3 s2, t2, s2
  /* Loop counter is decremented mid-body; it is tested by the bgt at the end. */
  296. lda I, -1(I)
  297. MUL x0, a10, t2
  298. unop
  299. ADD4 s3, t3, s3
  300. unop
  301. MUL x0, a11, t3
  302. LD x0, 8 * SIZE(X1)
  303. ADD1 s0, t0, s0
  304. unop
  305. MUL x1, a9, t0
  306. LD a9, 13 * SIZE(A1)
  307. ADD2 s1, t1, s1
  308. unop
  309. MUL x1, a8, t1
  310. LD a8, 12 * SIZE(A1)
  311. ADD1 s2, t2, s2
  /* A1 moves to the next block here, so the later A1-relative loads in this */
  /* iteration (offsets 7 and 6) already address the next block. */
  312. lda A1, 8 * SIZE(A1)
  313. MUL x1, a11, t2
  314. LD a11, 13 * SIZE(A2)
  315. ADD2 s3, t3, s3
  316. unop
  317. MUL x1, a10, t3
  318. LD a10, 12 * SIZE(A2)
  /* Entry 3 of the block: x2 / x3 (reloaded from offsets 6 and 7) against a12-a15. */
  319. ADD3 s0, t0, s0
  320. unop
  321. MUL x2, a12, t0
  322. LD x1, 9 * SIZE(X1)
  323. ADD4 s1, t1, s1
  324. ldl $31, (PREFETCHSIZE + 0) * SIZE(X1)
  325. MUL x2, a13, t1
  /* A2 moves to the next block; the same remark as for A1 applies. */
  326. lda A2, 8 * SIZE(A2)
  327. ADD3 s2, t2, s2
  328. unop
  329. MUL x2, a14, t2
  330. unop
  331. ADD4 s3, t3, s3
  332. unop
  333. MUL x2, a15, t3
  334. LD x2, 10 * SIZE(X1)
  335. ADD1 s0, t0, s0
  336. unop
  337. MUL x3, a13, t0
  338. LD a13, 7 * SIZE(A1)
  339. ADD2 s1, t1, s1
  /* X1 moves to the next block; x0-x2 for it were already loaded above. */
  340. lda X1, 8 * SIZE(X1)
  341. MUL x3, a12, t1
  342. LD a12, 6 * SIZE(A1)
  343. ADD1 s2, t2, s2
  344. unop
  345. MUL x3, a15, t2
  346. LD a15, 7 * SIZE(A2)
  347. ADD2 s3, t3, s3
  348. MUL x3, a14, t3
  349. LD a14, 6 * SIZE(A2)
  /* The last four products stay pending in t0-t3 for the next ADD3 / ADD4. */
  350. bgt I, $L12
  351. .align 4
  /*
   * $L13: drain of the two-column main loop.  It processes the last block of
   * four entries using operands that are already in registers, loading only
   * the remaining x values, and issues no loads of A for a following block.
   * A1, A2 and X1 are each advanced by 8 * SIZE so that the remainder code
   * at $L15 continues right after this block.
   */
  352. $L13:
  /* Entry 0: x0 / x1 against a0-a3. */
  353. ADD3 s0, t0, s0
  354. unop
  355. MUL x0, a0, t0
  356. LD x3, 3 * SIZE(X1)
  357. ADD4 s1, t1, s1
  358. MUL x0, a1, t1
  359. ADD3 s2, t2, s2
  360. MUL x0, a2, t2
  361. ADD4 s3, t3, s3
  362. unop
  363. MUL x0, a3, t3
  364. LD x0, 4 * SIZE(X1)
  365. ADD1 s0, t0, s0
  366. MUL x1, a1, t0
  367. ADD2 s1, t1, s1
  368. MUL x1, a0, t1
  369. ADD1 s2, t2, s2
  370. unop
  371. MUL x1, a3, t2
  372. unop
  373. ADD2 s3, t3, s3
  374. lda A1, 8 * SIZE(A1)
  375. MUL x1, a2, t3
  376. LD x1, 5 * SIZE(X1)
  /* Entry 1: x2 / x3 against a4-a7. */
  377. ADD3 s0, t0, s0
  378. MUL x2, a4, t0
  379. ADD4 s1, t1, s1
  380. MUL x2, a5, t1
  381. ADD3 s2, t2, s2
  382. unop
  383. MUL x2, a6, t2
  384. unop
  385. ADD4 s3, t3, s3
  386. lda A2, 8 * SIZE(A2)
  387. MUL x2, a7, t3
  388. LD x2, 6 * SIZE(X1)
  389. ADD1 s0, t0, s0
  390. MUL x3, a5, t0
  391. ADD2 s1, t1, s1
  392. MUL x3, a4, t1
  393. ADD1 s2, t2, s2
  394. unop
  395. MUL x3, a7, t2
  /* X1 is advanced before the last x load, hence the -1 offset two lines below */
  /* (it addresses offset 7 of the block just processed). */
  396. lda X1, 8 * SIZE(X1)
  397. ADD2 s3, t3, s3
  398. unop
  399. MUL x3, a6, t3
  400. LD x3, -1 * SIZE(X1)
  /* Entry 2: x0 / x1 against a8-a11. */
  401. ADD3 s0, t0, s0
  402. MUL x0, a8, t0
  403. ADD4 s1, t1, s1
  404. MUL x0, a9, t1
  405. ADD3 s2, t2, s2
  406. MUL x0, a10, t2
  407. ADD4 s3, t3, s3
  408. MUL x0, a11, t3
  409. ADD1 s0, t0, s0
  410. MUL x1, a9, t0
  411. ADD2 s1, t1, s1
  412. MUL x1, a8, t1
  413. ADD1 s2, t2, s2
  414. MUL x1, a11, t2
  415. ADD2 s3, t3, s3
  416. MUL x1, a10, t3
  /* Entry 3: x2 / x3 against a12-a15. */
  417. ADD3 s0, t0, s0
  418. MUL x2, a12, t0
  419. ADD4 s1, t1, s1
  420. MUL x2, a13, t1
  421. ADD3 s2, t2, s2
  422. MUL x2, a14, t2
  423. ADD4 s3, t3, s3
  424. MUL x2, a15, t3
  425. ADD1 s0, t0, s0
  426. MUL x3, a13, t0
  427. ADD2 s1, t1, s1
  428. MUL x3, a12, t1
  429. ADD1 s2, t2, s2
  430. MUL x3, a15, t2
  431. ADD2 s3, t3, s3
  432. MUL x3, a14, t3
  /* The last four products remain pending in t0-t3; the next ADD3 / ADD4 */
  /* (in $L16, $L17 or $L18) folds them into s0-s3. */
  433. .align 4
  /* $L15: remainder for a column pair, I = M & 3 entries handled one at a */
  /* time.  With none left, go straight to the y update at $L18. */
  434. $L15:
  435. and M, 3, I
  436. ble I, $L18
  /* Preload one entry from each column and the first x value. */
  437. LD a0, 0 * SIZE(A1)
  438. LD a1, 1 * SIZE(A1)
  439. LD a2, 0 * SIZE(A2)
  440. LD a3, 1 * SIZE(A2)
  441. LD x0, 0 * SIZE(X1)
  /* The final entry is handled by the drain at $L17, so loop I - 1 times. */
  442. lda I, -1(I)
  443. ble I, $L17
  444. .align 4
  /* $L16: one entry of x against one entry of each column per iteration. */
  /* Same pipelining as $L12: ADDs consume earlier products, loads fetch the */
  /* next entry, and the pointers are advanced in the middle of the body. */
  445. $L16:
  446. ADD3 s0, t0, s0
  447. lda I, -1(I)
  448. MUL x0, a0, t0
  449. LD x1, 1 * SIZE(X1)
  450. ADD4 s1, t1, s1
  451. MUL x0, a1, t1
  452. ADD3 s2, t2, s2
  453. MUL x0, a2, t2
  454. ADD4 s3, t3, s3
  455. unop
  456. MUL x0, a3, t3
  457. LD x0, 2 * SIZE(X1)
  458. ADD1 s0, t0, s0
  /* A2 advances first, so the A2 loads at the end of the body use offsets 1 and 0. */
  459. lda A2, 2 * SIZE(A2)
  460. MUL x1, a1, t0
  461. LD a1, 3 * SIZE(A1)
  462. ADD2 s1, t1, s1
  463. lda X1, 2 * SIZE(X1)
  464. MUL x1, a0, t1
  465. LD a0, 2 * SIZE(A1)
  466. ADD1 s2, t2, s2
  467. lda A1, 2 * SIZE(A1)
  468. MUL x1, a3, t2
  469. LD a3, 1 * SIZE(A2)
  470. ADD2 s3, t3, s3
  471. MUL x1, a2, t3
  472. LD a2, 0 * SIZE(A2)
  473. bgt I, $L16
  474. .align 4
  /* $L17: drain for the last remainder entry; only x1 still has to be loaded. */
  475. $L17:
  476. ADD3 s0, t0, s0
  477. unop
  478. MUL x0, a0, t0
  479. LD x1, 1 * SIZE(X1)
  480. ADD4 s1, t1, s1
  481. unop
  482. MUL x0, a1, t1
  483. unop
  484. ADD3 s2, t2, s2
  485. MUL x0, a2, t2
  486. ADD4 s3, t3, s3
  487. MUL x0, a3, t3
  488. ADD1 s0, t0, s0
  489. MUL x1, a1, t0
  490. ADD2 s1, t1, s1
  491. MUL x1, a0, t1
  492. ADD1 s2, t2, s2
  493. MUL x1, a3, t2
  494. ADD2 s3, t3, s3
  495. MUL x1, a2, t3
  /* The last four products are left pending in t0-t3 for $L18. */
  496. .align 4
  /*
   * $L18: finish one column pair and update two entries of y.
   * For each of the two sums (s0,s1) and (s2,s3), with (y_e, y_o) the
   * matching entry of y:
   *   y_e += alpha_r * s_e - alpha_i * s_o
   *   y_o += alpha_r * s_o + alpha_i * s_e
   * y is read through Y and written through Y1, both stepping by INCY.
   */
  497. $L18:
  /* Load the two y entries (a0,a1) and (a2,a3). */
  498. LD a0, 0 * SIZE(Y)
  499. unop
  500. LD a1, 1 * SIZE(Y)
  501. addq Y, INCY, Y
  502. LD a2, 0 * SIZE(Y)
  503. unop
  504. LD a3, 1 * SIZE(Y)
  505. addq Y, INCY, Y
  /* Fold the products still pending in t0-t3 into the sums. */
  506. ADD3 s0, t0, s0
  507. ADD4 s1, t1, s1
  508. ADD3 s2, t2, s2
  509. ADD4 s3, t3, s3
  /* alpha_r terms. */
  510. MUL alpha_r, s0, t0
  511. MUL alpha_r, s1, t1
  512. MUL alpha_r, s2, t2
  513. MUL alpha_r, s3, t3
  /* Add the alpha_r terms while computing the alpha_i cross terms. */
  514. ADD a0, t0, a0
  515. MUL alpha_i, s1, t0
  516. ADD a1, t1, a1
  517. MUL alpha_i, s0, t1
  518. ADD a2, t2, a2
  519. MUL alpha_i, s3, t2
  520. ADD a3, t3, a3
  521. MUL alpha_i, s2, t3
  /* Cross terms: subtracted from the even element, added to the odd one. */
  522. SUB a0, t0, a0
  523. ADD a1, t1, a1
  524. SUB a2, t2, a2
  525. ADD a3, t3, a3
  /* Store both entries and clear t0-t3 so the next column pair (or the */
  /* single-column path at $L20) starts with no pending products. */
  526. ST a0, 0 * SIZE(Y1)
  527. fclr t0
  528. ST a1, 1 * SIZE(Y1)
  529. addq Y1, INCY, Y1
  530. ST a2, 0 * SIZE(Y1)
  531. fclr t1
  532. ST a3, 1 * SIZE(Y1)
  533. addq Y1, INCY, Y1
  534. fclr t2
  /* Next column pair while J > 0. */
  535. lda J, -1(J)
  536. fclr t3
  537. bgt J, $L11
  538. .align 4
  /* $L20: handle the one remaining column when N is odd.  blbc branches to */
  /* $L999 when the low bit of N is clear, i.e. when N is even. */
  539. $L20:
  540. blbc N, $L999
  /* Single column at A1; all four sums are cleared and X1 restarts at x. */
  /* In this path s0/s1 and s2/s3 are two partial sums for the same column */
  /* and are added together at $L28. */
  541. mov A, A1
  542. fclr s0
  543. fclr s1
  544. mov X, X1
  /* I = M / 4 blocks of four entries; with none, go to the remainder at $L25. */
  545. sra M, 2, I
  546. fclr s2
  547. fclr s3
  548. ble I, $L25
  /* Preload four entries of the column (a0/a1, a4/a5, a8/a9, a12/a13) and */
  /* the first three values of x. */
  549. LD a0, 0 * SIZE(A1)
  550. LD a1, 1 * SIZE(A1)
  551. LD a4, 2 * SIZE(A1)
  552. LD a5, 3 * SIZE(A1)
  553. LD a8, 4 * SIZE(A1)
  554. LD a9, 5 * SIZE(A1)
  555. LD a12, 6 * SIZE(A1)
  556. LD a13, 7 * SIZE(A1)
  557. LD x0, 0 * SIZE(X1)
  558. LD x1, 1 * SIZE(X1)
  559. LD x2, 2 * SIZE(X1)
  /* The last block is handled by the drain at $L23, so loop I - 1 times. */
  560. lda I, -1(I)
  561. ble I, $L23
  562. .align 4
  /*
   * $L22: main loop for the single remaining column; one iteration consumes
   * four entries of x and of the column.  Only t0 / t1 carry products here.
   * ADD3 / ADD4 accumulate into s0 / s1 and ADD1 / ADD2 into s2 / s3, so the
   * two pairs are independent partial sums that $L28 adds together.
   * Pipelined like $L12: ADDs consume earlier products, and the loads fetch
   * operands for the next iteration.  Instruction order is significant.
   */
  563. $L22:
  /* Entry 0: x0 / x1 against a0 / a1. */
  564. ADD3 s0, t0, s0
  565. ldl $31, (PREFETCHSIZE + 0) * SIZE(A1)
  566. MUL x0, a0, t0
  567. LD x3, 3 * SIZE(X1)
  568. ADD4 s1, t1, s1
  569. unop
  570. MUL x0, a1, t1
  571. LD x0, 4 * SIZE(X1)
  572. ADD1 s2, t0, s2
  573. lda I, -1(I)
  574. MUL x1, a1, t0
  575. LD a1, 9 * SIZE(A1)
  576. ADD2 s3, t1, s3
  577. unop
  578. MUL x1, a0, t1
  579. LD a0, 8 * SIZE(A1)
  /* Entry 1: x2 / x3 against a4 / a5. */
  580. ADD3 s0, t0, s0
  581. unop
  582. MUL x2, a4, t0
  583. LD x1, 5 * SIZE(X1)
  584. ADD4 s1, t1, s1
  585. unop
  586. MUL x2, a5, t1
  587. LD x2, 6 * SIZE(X1)
  588. ADD1 s2, t0, s2
  589. unop
  590. MUL x3, a5, t0
  591. LD a5, 11 * SIZE(A1)
  592. ADD2 s3, t1, s3
  593. unop
  594. MUL x3, a4, t1
  595. LD a4, 10 * SIZE(A1)
  /* Entry 2: x0 / x1 against a8 / a9. */
  596. ADD3 s0, t0, s0
  597. unop
  598. MUL x0, a8, t0
  599. LD x3, 7 * SIZE(X1)
  600. ADD4 s1, t1, s1
  601. unop
  602. MUL x0, a9, t1
  603. LD x0, 8 * SIZE(X1)
  604. ADD1 s2, t0, s2
  605. unop
  606. MUL x1, a9, t0
  607. LD a9, 13 * SIZE(A1)
  608. ADD2 s3, t1, s3
  609. unop
  610. MUL x1, a8, t1
  611. LD a8, 12 * SIZE(A1)
  /* Entry 3: x2 / x3 against a12 / a13. */
  612. ADD3 s0, t0, s0
  613. unop
  614. MUL x2, a12, t0
  615. LD x1, 9 * SIZE(X1)
  616. ADD4 s1, t1, s1
  /* A1 moves to the next block, so the A1 loads below (offsets 7 and 6) address it. */
  617. lda A1, 8 * SIZE(A1)
  618. MUL x2, a13, t1
  619. LD x2, 10 * SIZE(X1)
  620. ADD1 s2, t0, s2
  /* X1 moves to the next block; x0-x2 for it were already loaded above. */
  621. lda X1, 8 * SIZE(X1)
  622. MUL x3, a13, t0
  623. LD a13, 7 * SIZE(A1)
  624. ADD2 s3, t1, s3
  625. MUL x3, a12, t1
  626. LD a12, 6 * SIZE(A1)
  /* The last two products stay pending in t0 / t1 for the next ADD3 / ADD4. */
  627. bgt I, $L22
  628. .align 4
  /*
   * $L23: drain of the single-column main loop.  It processes the last block
   * of four entries from registers, loading only the remaining x values, and
   * advances A1 and X1 by 8 * SIZE for the remainder code at $L25.
   */
  629. $L23:
  /* Entry 0: x0 / x1 against a0 / a1. */
  630. ADD3 s0, t0, s0
  631. unop
  632. MUL x0, a0, t0
  633. LD x3, 3 * SIZE(X1)
  634. ADD4 s1, t1, s1
  635. unop
  636. MUL x0, a1, t1
  637. LD x0, 4 * SIZE(X1)
  638. ADD1 s2, t0, s2
  639. unop
  640. MUL x1, a1, t0
  641. lda A1, 8 * SIZE(A1)
  642. ADD2 s3, t1, s3
  643. unop
  644. MUL x1, a0, t1
  645. LD x1, 5 * SIZE(X1)
  /* Entry 1: x2 / x3 against a4 / a5. */
  646. ADD3 s0, t0, s0
  647. unop
  648. MUL x2, a4, t0
  649. unop
  650. ADD4 s1, t1, s1
  651. unop
  652. MUL x2, a5, t1
  653. LD x2, 6 * SIZE(X1)
  654. ADD1 s2, t0, s2
  655. unop
  656. MUL x3, a5, t0
  /* X1 is advanced before the last x load, hence the -1 offset four lines below. */
  657. lda X1, 8 * SIZE(X1)
  658. ADD2 s3, t1, s3
  659. unop
  660. MUL x3, a4, t1
  661. LD x3, -1 * SIZE(X1)
  /* Entry 2: x0 / x1 against a8 / a9. */
  662. ADD3 s0, t0, s0
  663. MUL x0, a8, t0
  664. ADD4 s1, t1, s1
  665. MUL x0, a9, t1
  666. ADD1 s2, t0, s2
  667. MUL x1, a9, t0
  668. ADD2 s3, t1, s3
  669. MUL x1, a8, t1
  /* Entry 3: x2 / x3 against a12 / a13. */
  670. ADD3 s0, t0, s0
  671. MUL x2, a12, t0
  672. ADD4 s1, t1, s1
  673. MUL x2, a13, t1
  674. ADD1 s2, t0, s2
  675. MUL x3, a13, t0
  676. ADD2 s3, t1, s3
  677. MUL x3, a12, t1
  /* The last two products remain pending in t0 / t1; the next ADD3 / ADD4 */
  /* (in $L26, $L27 or $L28) folds them in. */
  678. .align 4
  /* $L25: remainder for the single column, I = M & 3 entries handled one at */
  /* a time.  With none left, go straight to the y update at $L28. */
  679. $L25:
  680. and M, 3, I
  681. ble I, $L28
  /* Preload one entry of the column and the first x value. */
  682. LD a0, 0 * SIZE(A1)
  683. LD a1, 1 * SIZE(A1)
  684. LD x0, 0 * SIZE(X1)
  /* The final entry is handled by the drain at $L27, so loop I - 1 times. */
  685. lda I, -1(I)
  686. ble I, $L27
  687. .align 4
  /* $L26: one entry per iteration.  Unlike $L22, all four ADDn accumulate */
  /* into s0 / s1 here; s2 / s3 are not touched. */
  688. $L26:
  689. ADD3 s0, t0, s0
  /* A1 advances first, so the A1 loads at the end of the body use offsets 1 and 0. */
  690. lda A1, 2 * SIZE(A1)
  691. MUL x0, a0, t0
  692. LD x1, 1 * SIZE(X1)
  693. ADD4 s1, t1, s1
  694. lda I, -1(I)
  695. MUL x0, a1, t1
  696. LD x0, 2 * SIZE(X1)
  697. ADD1 s0, t0, s0
  698. lda X1, 2 * SIZE(X1)
  699. MUL x1, a1, t0
  700. LD a1, 1 * SIZE(A1)
  701. ADD2 s1, t1, s1
  702. MUL x1, a0, t1
  703. LD a0, 0 * SIZE(A1)
  704. bgt I, $L26
  705. .align 4
  /* $L27: drain for the last remainder entry; only x1 still has to be loaded. */
  706. $L27:
  707. ADD3 s0, t0, s0
  708. unop
  709. MUL x0, a0, t0
  710. LD x1, 1 * SIZE(X1)
  711. ADD4 s1, t1, s1
  712. unop
  713. MUL x0, a1, t1
  714. unop
  715. ADD1 s0, t0, s0
  716. MUL x1, a1, t0
  717. ADD2 s1, t1, s1
  718. MUL x1, a0, t1
  /* The last two products are left pending in t0 / t1 for $L28. */
  719. .align 4
  /*
   * $L28: finish the single remaining column and update the last entry of y:
   *   y_e += alpha_r * s_e - alpha_i * s_o
   *   y_o += alpha_r * s_o + alpha_i * s_e
   * where (s_e, s_o) = (s0 + s2, s1 + s3).
   */
  720. $L28:
  721. LD a0, 0 * SIZE(Y)
  722. LD a1, 1 * SIZE(Y)
  /* Fold the pending products t0 / t1 into s0 / s1. */
  723. ADD3 s0, t0, s0
  724. ADD4 s1, t1, s1
  /* t2 / t3 are never written on the single-column path and were cleared */
  /* before it was entered, so these two operations leave s2 / s3 unchanged. */
  725. ADD3 s2, t2, s2
  726. ADD4 s3, t3, s3
  /* Merge the two partial sums. */
  727. ADD s0, s2, s0
  728. ADD s1, s3, s1
  /* alpha_r terms, then the alpha_i cross terms. */
  729. MUL alpha_r, s0, t0
  730. MUL alpha_r, s1, t1
  731. ADD a0, t0, a0
  732. MUL alpha_i, s1, t0
  733. ADD a1, t1, a1
  734. MUL alpha_i, s0, t1
  735. SUB a0, t0, a0
  736. ADD a1, t1, a1
  /* Store through Y1, the y store cursor. */
  737. ST a0, 0 * SIZE(Y1)
  738. ST a1, 1 * SIZE(Y1)
  739. .align 4
  /* $L999: common exit, also reached by the early-out for M <= 0 or N <= 0 */
  /* and when N is even.  Reload $f2-$f9 from the slots written in the */
  /* prologue, release the STACKSIZE-byte frame and return. */
  740. $L999:
  741. ldt $f2, 0($sp)
  742. ldt $f3, 8($sp)
  743. ldt $f4, 16($sp)
  744. ldt $f5, 24($sp)
  745. ldt $f6, 32($sp)
  746. ldt $f7, 40($sp)
  747. ldt $f8, 48($sp)
  748. ldt $f9, 56($sp)
  749. lda $sp, STACKSIZE($sp)
  750. ret
  751. EPILOGUE