You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemv_n.S 18 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #include "version.h"
  /* Size in bytes of the stack frame allocated in the prologue; its eight
   * 8-byte slots (offsets 0..56) receive the spilled $f2-$f9. */
  41. #define STACKSIZE 64
  /* Look-ahead distance, in units of SIZE, used by the loads into $31/$f31
   * inside the unrolled loops (results of those loads are discarded). */
  42. #define PREFETCHSIZE 32
  /* ---- Integer register roles ----
   * M, N and A are used as they arrive in registers; LDA, X, INCX, Y, INCY
   * and BUFFER are loaded from the caller's stack area in the prologue. */
  43. #define M $16 /* inner-loop extent: complex elements per column of A and in y */
  44. #define N $17 /* outer-loop extent: number of columns of A / elements of x consumed */
  45. #define A $21 /* pointer to the next unprocessed column of A */
  46. #define LDA $18 /* column stride of A; shifted left by ZBASE_SHIFT before being added to pointers */
  47. #define X $19 /* pointer to the current x element */
  48. #define INCX $20 /* stride of x; shifted left by ZBASE_SHIFT before being added to X */
  49. #define Y $22 /* base of the contiguous vector the kernel accumulates into */
  50. #define INCY $23 /* stride of y; shifted left by ZBASE_SHIFT, then compared with 2 * SIZE */
  51. #define BUFFER $24 /* work buffer pointer; exchanged with Y when y is not contiguous */
  52. #define I $25 /* inner loop counter */
  53. #define J $27 /* column-pair loop counter */
  54. #define Y1 $4 /* running pointer into the accumulation vector */
  55. #define A1 $5 /* running pointer into the first column being processed */
  56. #define A2 $6 /* running pointer into the second column being processed */
  /* ---- Floating-point register roles ----
   * alpha_r / alpha_i: real and imaginary part of the scalar multiplier. */
  57. #define alpha_r $f19
  58. #define alpha_i $f20
  /* alpha1..alpha4 first receive raw x values (re/im of one or two x elements)
   * and are then overwritten with the real/imaginary parts of the scaled x
   * element: alpha1/alpha2 for the first column, alpha3/alpha4 for the second. */
  59. #define alpha1 $f0
  60. #define alpha2 $f1
  61. #define alpha3 $f10
  62. #define alpha4 $f11
  /* y0..y7: interleaved re/im values of up to four consecutive complex
   * elements of the accumulation vector (y0..y3 also serve as scratch while
   * the scaled x elements are formed). */
  63. #define y0 $f12
  64. #define y1 $f13
  65. #define y2 $f14
  66. #define y3 $f15
  67. #define y4 $f16
  68. #define y5 $f17
  69. #define y6 $f18
  70. #define y7 $f21
  /* a0..a7: values loaded from the column(s) of A. */
  71. #define a0 $f22
  72. #define a1 $f23
  73. #define a2 $f24
  74. #define a3 $f25
  75. #define a4 $f26
  76. #define a5 $f27
  77. #define a6 $f28
  78. #define a7 $f29
  /* t0..t3: product temporaries. They live in $f2-$f5, which the prologue
   * saves to the stack and the epilogue restores. */
  79. #define t0 $f2
  80. #define t1 $f3
  81. #define t2 $f4
  82. #define t3 $f5
  /* Sign selection for the four partial products of one complex multiply.
   * With w = w_r + i*w_i the scaled x element (alpha1/alpha2 or alpha3/alpha4)
   * and a = a_r + i*a_i an element of A, the kernel applies:
   *   ADD1: y_re (+/-)= w_r * a_r      ADD2: y_im (+/-)= w_r * a_i
   *   ADD3: y_re (+/-)= w_i * a_i      ADD4: y_im (+/-)= w_i * a_r
   * The four variants below therefore accumulate
   *   neither defined : a * w
   *   CONJ only       : conj(a) * w
   *   XCONJ only      : a * conj(w)
   *   both            : conj(a) * conj(w)
   * ADD and SUB themselves come from the included headers. */
  83. #if !defined(CONJ) && !defined(XCONJ)
  84. #define ADD1 ADD
  85. #define ADD2 ADD
  86. #define ADD3 SUB
  87. #define ADD4 ADD
  88. #elif defined(CONJ) && !defined(XCONJ)
  89. #define ADD1 ADD
  90. #define ADD2 SUB
  91. #define ADD3 ADD
  92. #define ADD4 ADD
  93. #elif !defined(CONJ) && defined(XCONJ)
  94. #define ADD1 ADD
  95. #define ADD2 ADD
  96. #define ADD3 ADD
  97. #define ADD4 SUB
  98. #else
  99. #define ADD1 ADD
  100. #define ADD2 SUB
  101. #define ADD3 SUB
  102. #define ADD4 SUB
  103. #endif
  /* Complex matrix-vector kernel entry.
   * For every column j of A the routine scales x[j] by (alpha_r, alpha_i) and
   * accumulates (scaled x[j]) * A[:, j] into y, with the conjugation variant
   * chosen by ADD1..ADD4.
   * Inputs in registers: M ($16), N ($17), alpha_r ($f19), alpha_i ($f20),
   * A ($21). Inputs read from the caller's stack area: LDA, X, INCX, Y, INCY,
   * BUFFER. PROLOGUE / PROFCODE are macros supplied by the included headers. */
  104. PROLOGUE
  /* Allocate the local frame; the stack-passed arguments now sit at
   * STACKSIZE and above relative to the new $sp. */
  105. lda $sp, -STACKSIZE($sp)
  106. ldq LDA, 0 + STACKSIZE($sp)
  107. ldq X, 8 + STACKSIZE($sp)
  108. ldq INCX, 16 + STACKSIZE($sp)
  109. ldq Y, 24 + STACKSIZE($sp)
  110. ldq INCY, 32 + STACKSIZE($sp)
  111. ldq BUFFER, 40 + STACKSIZE($sp)
  /* Spill $f2-$f9 into the frame; they are reloaded at $L999.
   * NOTE(review): only $f2-$f5 (t0..t3) are referenced elsewhere in this file. */
  112. stt $f2, 0($sp)
  113. stt $f3, 8($sp)
  114. stt $f4, 16($sp)
  115. stt $f5, 24($sp)
  116. stt $f6, 32($sp)
  117. stt $f7, 40($sp)
  118. stt $f8, 48($sp)
  119. stt $f9, 56($sp)
  120. PROFCODE
  /* Nothing to do when M <= 0 or N <= 0: jump straight to the epilogue.
   * The strides are scaled by ZBASE_SHIFT in between; afterwards they are
   * used directly as address increments. */
  121. cmple M, 0, $0
  122. sll INCX, ZBASE_SHIFT, INCX
  123. cmple N, 0, $1
  124. sll INCY, ZBASE_SHIFT, INCY
  125. or $0, $1, $0
  126. bne $0, $L999
  /* If the scaled INCY equals 2 * SIZE (one complex element), y is contiguous
   * and is updated in place: skip the buffer setup. */
  127. cmpeq INCY, 2 * SIZE, $0
  128. sll LDA, ZBASE_SHIFT,LDA
  129. bne $0, $L10
  /* Strided y: exchange Y and BUFFER via Y1. From here on Y points at the
   * work buffer, BUFFER holds the caller's y pointer (used again at $L990),
   * and Y1 walks the work buffer while it is cleared. */
  130. mov BUFFER, Y1
  131. mov Y, BUFFER
  132. mov Y1, Y
  /* I = M / 4 groups of four complex elements to clear. */
  133. sra M, 2, I
  134. ble I, $L05
  135. .align 4
  /* Clear the work buffer, four complex elements (eight values) per pass.
   * The stored register is $f31, which reads as zero on Alpha. */
  136. $L02:
  137. ST $f31, 0 * SIZE(Y1)
  138. ST $f31, 1 * SIZE(Y1)
  139. ST $f31, 2 * SIZE(Y1)
  140. ST $f31, 3 * SIZE(Y1)
  141. ST $f31, 4 * SIZE(Y1)
  142. ST $f31, 5 * SIZE(Y1)
  143. ST $f31, 6 * SIZE(Y1)
  144. ST $f31, 7 * SIZE(Y1)
  145. lda Y1, 8 * SIZE(Y1)
  146. lda I, -1(I)
  147. bgt I, $L02
  148. .align 4
  /* Remainder: I = M mod 4 complex elements still to clear. */
  149. $L05:
  150. and M, 3, I
  151. ble I, $L10
  152. .align 4
  /* Clear one complex element (re, im) per pass. */
  153. $L06:
  154. ST $f31, 0 * SIZE(Y1)
  155. ST $f31, 1 * SIZE(Y1)
  156. addq Y1, 2 * SIZE, Y1
  157. lda I, -1(I)
  158. bgt I, $L06
  159. .align 4
  /* Outer loop over pairs of columns: J = N / 2. With no full pair, go to the
   * single-column path at $L20. */
  160. $L10:
  161. sra N, 1, J
  162. ble J, $L20
  163. .align 4
  /* Per column pair: fetch two x elements, scale them by alpha, set up the
   * column pointers, then prime the software pipeline of the $L12 loop. */
  164. $L11:
  /* alpha1/alpha2 = re/im of the first x element, alpha3/alpha4 = re/im of
   * the second; X advances by INCX after each. */
  165. LD alpha1, 0 * SIZE(X)
  166. LD alpha2, 1 * SIZE(X)
  167. addq X, INCX, X
  168. LD alpha3, 0 * SIZE(X)
  169. LD alpha4, 1 * SIZE(X)
  170. addq X, INCX, X
  /* Partial products of alpha * x; the pointer setup is interleaved:
   * A1 = A, A2 = A + LDA, A = A + 2 * LDA (next pair), Y1 = Y. */
  171. MUL alpha_r, alpha1, y0
  172. MUL alpha_r, alpha2, y1
  173. MUL alpha_r, alpha3, y2
  174. MUL alpha_r, alpha4, y3
  175. MUL alpha_i, alpha2, t0
  176. mov A, A1
  177. MUL alpha_i, alpha1, t1
  178. addq A, LDA, A2
  179. MUL alpha_i, alpha4, t2
  180. addq A2, LDA, A
  181. MUL alpha_i, alpha3, t3
  182. mov Y, Y1
  /* Combine the partial products. Without XCONJ:
   *   alpha1 = alpha_r*x_r - alpha_i*x_i, alpha2 = alpha_r*x_i + alpha_i*x_r
   * (same for alpha3/alpha4); with XCONJ both signs are flipped. */
  183. #ifndef XCONJ
  184. SUB y0, t0, alpha1
  185. ADD y1, t1, alpha2
  186. SUB y2, t2, alpha3
  187. ADD y3, t3, alpha4
  188. #else
  189. ADD y0, t0, alpha1
  190. SUB y1, t1, alpha2
  191. ADD y2, t2, alpha3
  192. SUB y3, t3, alpha4
  193. #endif
  /* Load into $31: the value is discarded (presumably a prefetch hint for
   * upcoming x data). */
  194. ldl $31, 4 * SIZE(X)
  /* I = M / 4 groups of four complex elements; none -> tails at $L15. */
  195. sra M, 2, I
  196. ble I, $L15
  /* Pipeline prime: first two complex elements of both columns. */
  197. LD a0, 0 * SIZE(A1)
  198. LD a1, 1 * SIZE(A1)
  199. LD a2, 2 * SIZE(A1)
  200. LD a3, 3 * SIZE(A1)
  201. LD a4, 0 * SIZE(A2)
  202. LD a5, 1 * SIZE(A2)
  203. LD a6, 2 * SIZE(A2)
  204. LD a7, 3 * SIZE(A2)
  205. MUL alpha1, a0, t0
  206. LD y0, 0 * SIZE(Y1)
  207. MUL alpha1, a1, t1
  208. LD y1, 1 * SIZE(Y1)
  209. MUL alpha1, a2, t2
  210. LD y2, 2 * SIZE(Y1)
  211. MUL alpha1, a3, t3
  212. LD y3, 3 * SIZE(Y1)
  /* From here each ADDn consumes the product issued four multiplies earlier;
   * unop is a no-op filling the otherwise empty slot. y4..y7 are loaded for
   * the second half of the group. */
  213. ADD1 y0, t0, y0
  214. unop
  215. MUL alpha3, a4, t0
  216. LD y4, 4 * SIZE(Y1)
  217. ADD2 y1, t1, y1
  218. unop
  219. MUL alpha3, a5, t1
  220. LD y5, 5 * SIZE(Y1)
  221. ADD1 y2, t2, y2
  222. unop
  223. MUL alpha3, a6, t2
  224. LD y6, 6 * SIZE(Y1)
  225. ADD2 y3, t3, y3
  226. unop
  227. MUL alpha3, a7, t3
  228. LD y7, 7 * SIZE(Y1)
  /* Cross terms with alpha2; each a0..a3 is reloaded with the second half of
   * the group (offsets 4..7 of column 1) right after its last use. */
  229. ADD1 y0, t0, y0
  230. unop
  231. MUL alpha2, a1, t0
  232. LD a1, 5 * SIZE(A1)
  233. ADD2 y1, t1, y1
  234. unop
  235. MUL alpha2, a0, t1
  236. LD a0, 4 * SIZE(A1)
  237. ADD1 y2, t2, y2
  238. unop
  239. MUL alpha2, a3, t2
  240. LD a3, 7 * SIZE(A1)
  241. ADD2 y3, t3, y3
  242. unop
  243. MUL alpha2, a2, t3
  244. LD a2, 6 * SIZE(A1)
  /* Cross terms with alpha4; a4..a7 are reloaded from offsets 4..7 of column 2. */
  245. ADD3 y0, t0, y0
  246. unop
  247. MUL alpha4, a5, t0
  248. LD a5, 5 * SIZE(A2)
  249. ADD4 y1, t1, y1
  250. unop
  251. MUL alpha4, a4, t1
  252. LD a4, 4 * SIZE(A2)
  253. ADD3 y2, t2, y2
  254. unop
  255. MUL alpha4, a7, t2
  256. LD a7, 7 * SIZE(A2)
  257. ADD4 y3, t3, y3
  258. unop
  259. MUL alpha4, a6, t3
  260. LD a6, 6 * SIZE(A2)
  /* y0..y3 become final here (stored at the top of $L12 / $L13) while the
   * alpha1 products for y4..y7 are already being issued. */
  261. ADD3 y0, t0, y0
  262. MUL alpha1, a0, t0
  263. ADD4 y1, t1, y1
  264. MUL alpha1, a1, t1
  265. ADD3 y2, t2, y2
  266. unop
  267. MUL alpha1, a2, t2
  268. unop
  269. ADD4 y3, t3, y3
  /* One group is now in flight; if it was the only one, go drain at $L13. */
  270. lda I, -1(I)
  271. MUL alpha1, a3, t3
  272. ble I, $L13
  273. .align 4
  /* Steady-state loop for a column pair. On entry y0..y3 are final but not yet
   * stored, t0..t3 hold alpha1 * (elements 4..7 of column 1), and a4..a7 hold
   * elements 4..7 of column 2. Each pass finishes y4..y7 of the current group,
   * processes y0..y3 of the next group completely, and starts its y4..y7.
   * Instruction order is hand-scheduled; do not reorder without re-checking
   * every register's last use. */
  274. $L12:
  /* Store finished y0..y3; accumulate alpha1 products into y4..y7; issue the
   * alpha3 products. I is decremented here, tested at the bottom. */
  275. ADD1 y4, t0, y4
  276. ST y0, 0 * SIZE(Y1)
  277. MUL alpha3, a4, t0
  278. ldl $31, (PREFETCHSIZE + 0) * SIZE(A1)
  279. ADD2 y5, t1, y5
  280. ST y1, 1 * SIZE(Y1)
  281. MUL alpha3, a5, t1
  282. lda I, -1(I)
  283. ADD1 y6, t2, y6
  284. ST y2, 2 * SIZE(Y1)
  285. MUL alpha3, a6, t2
  286. unop
  287. ADD2 y7, t3, y7
  288. ST y3, 3 * SIZE(Y1)
  289. MUL alpha3, a7, t3
  290. unop
  /* alpha2 cross terms; a0..a3 are reloaded with the next group's first half
   * (offsets 8..11 of column 1). */
  291. ADD1 y4, t0, y4
  292. unop
  293. MUL alpha2, a1, t0
  294. LD a1, 9 * SIZE(A1)
  295. ADD2 y5, t1, y5
  296. unop
  297. MUL alpha2, a0, t1
  298. LD a0, 8 * SIZE(A1)
  299. ADD1 y6, t2, y6
  300. unop
  301. MUL alpha2, a3, t2
  302. LD a3, 11 * SIZE(A1)
  303. ADD2 y7, t3, y7
  304. unop
  305. MUL alpha2, a2, t3
  306. LD a2, 10 * SIZE(A1)
  /* alpha4 cross terms; a4..a7 are reloaded from offsets 8..11 of column 2.
   * The lds into $f31 discards its result. */
  307. ADD3 y4, t0, y4
  308. lds $f31, (PREFETCHSIZE + 0) * SIZE(Y1)
  309. MUL alpha4, a5, t0
  310. LD a5, 9 * SIZE(A2)
  311. ADD4 y5, t1, y5
  312. unop
  313. MUL alpha4, a4, t1
  314. LD a4, 8 * SIZE(A2)
  315. ADD3 y6, t2, y6
  316. unop
  317. MUL alpha4, a7, t2
  318. LD a7, 11 * SIZE(A2)
  319. ADD4 y7, t3, y7
  320. unop
  321. MUL alpha4, a6, t3
  322. LD a6, 10 * SIZE(A2)
  /* y4..y7 become final; start the next group: load its y0..y3 (offsets
   * 8..11 of Y1) and issue the alpha1 products. */
  323. ADD3 y4, t0, y4
  324. unop
  325. MUL alpha1, a0, t0
  326. LD y0, 8 * SIZE(Y1)
  327. ADD4 y5, t1, y5
  328. unop
  329. MUL alpha1, a1, t1
  330. LD y1, 9 * SIZE(Y1)
  331. ADD3 y6, t2, y6
  332. unop
  333. MUL alpha1, a2, t2
  334. LD y2, 10 * SIZE(Y1)
  335. ADD4 y7, t3, y7
  336. unop
  337. MUL alpha1, a3, t3
  338. LD y3, 11 * SIZE(Y1)
  /* Store finished y4..y7, then advance Y1 by one group (8 * SIZE). */
  339. ADD1 y0, t0, y0
  340. ST y4, 4 * SIZE(Y1)
  341. MUL alpha3, a4, t0
  342. ldl $31, (PREFETCHSIZE + 0) * SIZE(A2)
  343. ADD2 y1, t1, y1
  344. ST y5, 5 * SIZE(Y1)
  345. MUL alpha3, a5, t1
  346. unop
  347. ADD1 y2, t2, y2
  348. ST y6, 6 * SIZE(Y1)
  349. MUL alpha3, a6, t2
  350. unop
  351. ADD2 y3, t3, y3
  352. ST y7, 7 * SIZE(Y1)
  353. MUL alpha3, a7, t3
  354. lda Y1, 8 * SIZE(Y1)
  /* alpha2 cross terms for the new group; a0..a3 reloaded from offsets 12..15
   * of column 1 (A1 has not been advanced yet). */
  355. ADD1 y0, t0, y0
  356. unop
  357. MUL alpha2, a1, t0
  358. LD a1, 13 * SIZE(A1)
  359. ADD2 y1, t1, y1
  360. unop
  361. MUL alpha2, a0, t1
  362. LD a0, 12 * SIZE(A1)
  363. ADD1 y2, t2, y2
  364. unop
  365. MUL alpha2, a3, t2
  366. LD a3, 15 * SIZE(A1)
  367. ADD2 y3, t3, y3
  368. unop
  369. MUL alpha2, a2, t3
  370. LD a2, 14 * SIZE(A1)
  /* alpha4 cross terms; a4..a7 reloaded from offsets 12..15 of column 2. */
  371. ADD3 y0, t0, y0
  372. unop
  373. MUL alpha4, a5, t0
  374. LD a5, 13 * SIZE(A2)
  375. ADD4 y1, t1, y1
  376. unop
  377. MUL alpha4, a4, t1
  378. LD a4, 12 * SIZE(A2)
  379. ADD3 y2, t2, y2
  380. unop
  381. MUL alpha4, a7, t2
  382. LD a7, 15 * SIZE(A2)
  383. ADD4 y3, t3, y3
  384. unop
  385. MUL alpha4, a6, t3
  386. LD a6, 14 * SIZE(A2)
  /* y0..y3 of the new group become final; load its y4..y7 (Y1 already
   * advanced), issue the alpha1 products for them, advance A1 and A2. */
  387. ADD3 y0, t0, y0
  388. unop
  389. MUL alpha1, a0, t0
  390. LD y4, 4 * SIZE(Y1)
  391. ADD4 y1, t1, y1
  392. lda A2, 8 * SIZE(A2)
  393. MUL alpha1, a1, t1
  394. LD y5, 5 * SIZE(Y1)
  395. ADD3 y2, t2, y2
  396. lda A1, 8 * SIZE(A1)
  397. MUL alpha1, a2, t2
  398. LD y6, 6 * SIZE(Y1)
  399. ADD4 y3, t3, y3
  400. MUL alpha1, a3, t3
  401. LD y7, 7 * SIZE(Y1)
  /* Repeat while groups remain after the one now in flight. */
  402. bgt I, $L12
  403. .align 4
  /* Pipeline drain for a column pair: store the finished y0..y3 and complete
   * y4..y7 of the last group. No further loads from A or y are issued. */
  404. $L13:
  405. ADD1 y4, t0, y4
  406. ST y0, 0 * SIZE(Y1)
  407. MUL alpha3, a4, t0
  408. unop
  409. ADD2 y5, t1, y5
  410. ST y1, 1 * SIZE(Y1)
  411. MUL alpha3, a5, t1
  412. unop
  413. ADD1 y6, t2, y6
  414. ST y2, 2 * SIZE(Y1)
  415. MUL alpha3, a6, t2
  416. unop
  417. ADD2 y7, t3, y7
  418. ST y3, 3 * SIZE(Y1)
  419. MUL alpha3, a7, t3
  420. unop
  /* Remaining terms: alpha3 products, then the alpha2 and alpha4 cross terms. */
  421. ADD1 y4, t0, y4
  422. MUL alpha2, a1, t0
  423. ADD2 y5, t1, y5
  424. MUL alpha2, a0, t1
  425. ADD1 y6, t2, y6
  426. MUL alpha2, a3, t2
  427. ADD2 y7, t3, y7
  428. MUL alpha2, a2, t3
  429. ADD3 y4, t0, y4
  430. MUL alpha4, a5, t0
  431. ADD4 y5, t1, y5
  432. MUL alpha4, a4, t1
  433. ADD3 y6, t2, y6
  434. MUL alpha4, a7, t2
  435. ADD4 y7, t3, y7
  436. MUL alpha4, a6, t3
  437. ADD3 y4, t0, y4
  438. ADD4 y5, t1, y5
  439. ADD3 y6, t2, y6
  440. ADD4 y7, t3, y7
  /* Store y4..y7 and step A1, A2 and Y1 past the group (8 * SIZE each) so the
   * tail code at $L15 / $L17 continues from the right place. */
  441. ST y4, 4 * SIZE(Y1)
  442. lda A1, 8 * SIZE(A1)
  443. ST y5, 5 * SIZE(Y1)
  444. lda A2, 8 * SIZE(A2)
  445. ST y6, 6 * SIZE(Y1)
  446. unop
  447. ST y7, 7 * SIZE(Y1)
  448. lda Y1, 8 * SIZE(Y1)
  449. .align 4
  /* Column-pair tail, part 1: if bit 1 of M is set, update two more complex
   * elements of y from both columns (no software pipelining). */
  450. $L15:
  451. and M, 2, I
  452. ble I, $L17
  453. LD a0, 0 * SIZE(A1)
  454. LD a1, 1 * SIZE(A1)
  455. LD a2, 2 * SIZE(A1)
  456. LD a3, 3 * SIZE(A1)
  457. LD a4, 0 * SIZE(A2)
  458. LD a5, 1 * SIZE(A2)
  459. LD a6, 2 * SIZE(A2)
  460. LD a7, 3 * SIZE(A2)
  461. MUL alpha1, a0, t0
  462. LD y0, 0 * SIZE(Y1)
  463. MUL alpha1, a1, t1
  464. LD y1, 1 * SIZE(Y1)
  465. MUL alpha1, a2, t2
  466. LD y2, 2 * SIZE(Y1)
  467. MUL alpha1, a3, t3
  468. LD y3, 3 * SIZE(Y1)
  /* Accumulate in the order alpha1, alpha3 (ADD1/ADD2), then alpha2, alpha4
   * (ADD3/ADD4); each ADDn uses the product issued four multiplies earlier. */
  469. ADD1 y0, t0, y0
  470. MUL alpha3, a4, t0
  471. ADD2 y1, t1, y1
  472. MUL alpha3, a5, t1
  473. ADD1 y2, t2, y2
  474. MUL alpha3, a6, t2
  475. ADD2 y3, t3, y3
  476. MUL alpha3, a7, t3
  477. ADD1 y0, t0, y0
  478. MUL alpha2, a1, t0
  479. ADD2 y1, t1, y1
  480. MUL alpha2, a0, t1
  481. ADD1 y2, t2, y2
  482. MUL alpha2, a3, t2
  483. ADD2 y3, t3, y3
  484. MUL alpha2, a2, t3
  485. ADD3 y0, t0, y0
  486. MUL alpha4, a5, t0
  487. ADD4 y1, t1, y1
  488. MUL alpha4, a4, t1
  489. ADD3 y2, t2, y2
  490. MUL alpha4, a7, t2
  491. ADD4 y3, t3, y3
  492. MUL alpha4, a6, t3
  493. ADD3 y0, t0, y0
  494. ADD4 y1, t1, y1
  495. ADD3 y2, t2, y2
  496. ADD4 y3, t3, y3
  /* Store the two elements and advance all three pointers by 4 * SIZE. */
  497. ST y0, 0 * SIZE(Y1)
  498. lda A1, 4 * SIZE(A1)
  499. ST y1, 1 * SIZE(Y1)
  500. lda A2, 4 * SIZE(A2)
  501. ST y2, 2 * SIZE(Y1)
  502. unop
  503. ST y3, 3 * SIZE(Y1)
  504. lda Y1, 4 * SIZE(Y1)
  505. .align 4
  /* Column-pair tail, part 2: if the low bit of M is set (blbc skips when it
   * is clear), update one last complex element of y from both columns.
   * Here a0/a1 come from column 1 and a2/a3 from column 2. */
  506. $L17:
  507. blbc M, $L18
  508. LD a0, 0 * SIZE(A1)
  509. LD a1, 1 * SIZE(A1)
  510. LD a2, 0 * SIZE(A2)
  511. LD a3, 1 * SIZE(A2)
  512. LD y0, 0 * SIZE(Y1)
  513. LD y1, 1 * SIZE(Y1)
  514. MUL alpha1, a0, t0
  515. MUL alpha1, a1, t1
  516. ADD1 y0, t0, y0
  517. MUL alpha3, a2, t0
  518. ADD2 y1, t1, y1
  519. MUL alpha3, a3, t1
  520. ADD1 y0, t0, y0
  521. MUL alpha2, a1, t0
  522. ADD2 y1, t1, y1
  523. MUL alpha2, a0, t1
  524. ADD3 y0, t0, y0
  525. MUL alpha4, a3, t0
  526. ADD4 y1, t1, y1
  527. MUL alpha4, a2, t1
  528. ADD3 y0, t0, y0
  529. ADD4 y1, t1, y1
  530. ST y0, 0 * SIZE(Y1)
  531. ST y1, 1 * SIZE(Y1)
  532. .align 4
  /* End of one column pair: loop back to $L11 while pairs remain. */
  533. $L18:
  534. lda J, -1(J)
  535. bgt J, $L11
  536. .align 4
  /* Single leftover column: taken only when the low bit of N is set (blbc
   * skips to $L990 otherwise). This is the last column of A, so every load
   * from A1 below must stay inside the M complex elements of that column.
   *
   * Change from the previous schedule: a0..a3 used to be fetched one
   * half-group further ahead (offsets 8..11 in the prime, 16..19 in the
   * loop), which read up to two complex elements past the end of the column
   * when M mod 4 was 0 or 1. The loads are now issued inside the loop pass
   * that consumes them, from offsets 8..15 of a group that is known to exist.
   * The arithmetic and the values stored to y are unchanged. */
  537. $L20:
  538. blbc N, $L990
  /* Scale the x element by alpha; A1 = A, Y1 = Y. X is not advanced because
   * no further column follows. */
  539. LD alpha1, 0 * SIZE(X)
  540. LD alpha2, 1 * SIZE(X)
  541. MUL alpha_r, alpha1, y0
  542. MUL alpha_r, alpha2, y1
  543. MUL alpha_i, alpha2, t0
  544. mov A, A1
  545. MUL alpha_i, alpha1, t1
  546. mov Y, Y1
  /* Without XCONJ: alpha1 = alpha_r*x_r - alpha_i*x_i,
   *                alpha2 = alpha_r*x_i + alpha_i*x_r; XCONJ flips both signs. */
  547. #ifndef XCONJ
  548. SUB y0, t0, alpha1
  549. ADD y1, t1, alpha2
  550. #else
  551. ADD y0, t0, alpha1
  552. SUB y1, t1, alpha2
  553. #endif
  /* I = M / 4 groups of four complex elements; none -> tails at $L25. */
  554. sra M, 2, I
  555. ble I, $L25
  /* Pipeline prime: the whole first group (offsets 0..7) of the column and
   * the first two elements of y. */
  556. LD a0, 0 * SIZE(A1)
  557. LD a1, 1 * SIZE(A1)
  558. LD a2, 2 * SIZE(A1)
  559. LD a3, 3 * SIZE(A1)
  560. LD y0, 0 * SIZE(Y1)
  561. LD y1, 1 * SIZE(Y1)
  562. LD y2, 2 * SIZE(Y1)
  563. LD y3, 3 * SIZE(Y1)
  564. MUL alpha1, a0, t0
  565. LD a4, 4 * SIZE(A1)
  566. MUL alpha1, a1, t1
  567. LD a5, 5 * SIZE(A1)
  568. MUL alpha1, a2, t2
  569. LD a6, 6 * SIZE(A1)
  570. MUL alpha1, a3, t3
  571. LD a7, 7 * SIZE(A1)
  /* alpha2 cross terms for y0..y3. The slots after each MUL formerly held the
   * look-ahead loads of offsets 8..11; they are now plain no-ops because a
   * second group may not exist. */
  572. ADD1 y0, t0, y0
  573. unop
  574. MUL alpha2, a1, t0
  575. unop
  576. ADD2 y1, t1, y1
  577. unop
  578. MUL alpha2, a0, t1
  579. unop
  580. ADD1 y2, t2, y2
  581. unop
  582. MUL alpha2, a3, t2
  583. unop
  584. ADD2 y3, t3, y3
  585. unop
  586. MUL alpha2, a2, t3
  587. unop
  /* y0..y3 become final; load y4..y7 and issue the alpha1 products for the
   * second half of the group. */
  588. ADD3 y0, t0, y0
  589. unop
  590. LD y4, 4 * SIZE(Y1)
  591. MUL alpha1, a4, t0
  592. ADD4 y1, t1, y1
  593. unop
  594. LD y5, 5 * SIZE(Y1)
  595. MUL alpha1, a5, t1
  596. ADD3 y2, t2, y2
  597. LD y6, 6 * SIZE(Y1)
  598. MUL alpha1, a6, t2
  599. lda I, -1(I)
  600. ADD4 y3, t3, y3
  601. LD y7, 7 * SIZE(Y1)
  602. MUL alpha1, a7, t3
  /* Only one group: drain at $L23 (which uses a4..a7 and t0..t3 only). */
  603. ble I, $L23
  604. .align 4
  /* Steady-state loop. It is entered only while at least one more group
   * follows the one in flight, so offsets 8..15 of A1 and of Y1 are inside
   * the column / vector. On entry: y0..y3 final but unstored, a4..a7 =
   * elements 4..7 of the current group, t0..t3 = alpha1 * a4..a7. */
  605. $L22:
  /* Store y0..y3, issue the alpha2 cross terms for y4..y7, and fetch the next
   * group's first half into a0..a3 (dead since their last use in the
   * previous pass / prime). */
  606. ADD1 y4, t0, y4
  607. ST y0, 0 * SIZE(Y1)
  608. MUL alpha2, a5, t0
  609. LD a0, 8 * SIZE(A1)
  610. ADD2 y5, t1, y5
  611. ST y1, 1 * SIZE(Y1)
  612. MUL alpha2, a4, t1
  613. LD a1, 9 * SIZE(A1)
  614. ADD1 y6, t2, y6
  615. ST y2, 2 * SIZE(Y1)
  616. MUL alpha2, a7, t2
  617. LD a2, 10 * SIZE(A1)
  618. ADD2 y7, t3, y7
  619. ST y3, 3 * SIZE(Y1)
  620. MUL alpha2, a6, t3
  621. LD a3, 11 * SIZE(A1)
  /* y4..y7 become final; load the next group's y0..y3 and issue its alpha1
   * products. The ldl into $31 discards its result. */
  622. ADD3 y4, t0, y4
  623. LD y0, 8 * SIZE(Y1)
  624. MUL alpha1, a0, t0
  625. ldl $31, (PREFETCHSIZE + 0) * SIZE(A1)
  626. ADD4 y5, t1, y5
  627. LD y1, 9 * SIZE(Y1)
  628. MUL alpha1, a1, t1
  629. lda I, -1(I)
  630. ADD3 y6, t2, y6
  631. LD y2, 10 * SIZE(Y1)
  632. MUL alpha1, a2, t2
  633. unop
  634. ADD4 y7, t3, y7
  635. LD y3, 11 * SIZE(Y1)
  636. MUL alpha1, a3, t3
  637. unop
  /* Store y4..y7, issue the alpha2 cross terms for the new y0..y3, and fetch
   * the next group's second half into a4..a7 (their last use was at the top
   * of this pass; A1 has not been advanced yet). */
  638. ADD1 y0, t0, y0
  639. ST y4, 4 * SIZE(Y1)
  640. MUL alpha2, a1, t0
  641. LD a4, 12 * SIZE(A1)
  642. ADD2 y1, t1, y1
  643. ST y5, 5 * SIZE(Y1)
  644. MUL alpha2, a0, t1
  645. LD a5, 13 * SIZE(A1)
  646. ADD1 y2, t2, y2
  647. ST y6, 6 * SIZE(Y1)
  648. MUL alpha2, a3, t2
  649. LD a6, 14 * SIZE(A1)
  650. ADD2 y3, t3, y3
  651. ST y7, 7 * SIZE(Y1)
  652. MUL alpha2, a2, t3
  653. LD a7, 15 * SIZE(A1)
  /* New y0..y3 become final; load the new y4..y7, issue their alpha1
   * products, and advance A1 and Y1 by one group. y7 is loaded after Y1 has
   * been advanced, hence offset 7 instead of 15. */
  654. ADD3 y0, t0, y0
  655. LD y4, 12 * SIZE(Y1)
  656. MUL alpha1, a4, t0
  657. ldl $31, (PREFETCHSIZE + 0) * SIZE(Y1)
  658. ADD4 y1, t1, y1
  659. LD y5, 13 * SIZE(Y1)
  660. MUL alpha1, a5, t1
  661. lda A1, 8 * SIZE(A1)
  662. ADD3 y2, t2, y2
  663. LD y6, 14 * SIZE(Y1)
  664. MUL alpha1, a6, t2
  665. lda Y1, 8 * SIZE(Y1)
  666. ADD4 y3, t3, y3
  667. LD y7, 7 * SIZE(Y1)
  668. MUL alpha1, a7, t3
  /* Repeat while groups remain after the one now in flight. */
  669. bgt I, $L22
  670. .align 4
  /* Pipeline drain for the single-column path: store the finished y0..y3 and
   * complete y4..y7 of the last group using a4..a7 and the alpha1 products
   * already held in t0..t3. No further loads are issued. */
  671. $L23:
  672. ADD1 y4, t0, y4
  673. ST y0, 0 * SIZE(Y1)
  674. MUL alpha2, a5, t0
  675. unop
  676. ADD2 y5, t1, y5
  677. ST y1, 1 * SIZE(Y1)
  678. MUL alpha2, a4, t1
  679. unop
  680. ADD1 y6, t2, y6
  681. ST y2, 2 * SIZE(Y1)
  682. MUL alpha2, a7, t2
  683. unop
  684. ADD2 y7, t3, y7
  685. ST y3, 3 * SIZE(Y1)
  686. MUL alpha2, a6, t3
  687. unop
  688. ADD3 y4, t0, y4
  689. ADD4 y5, t1, y5
  690. ADD3 y6, t2, y6
  691. ADD4 y7, t3, y7
  /* Store y4..y7 and step A1 and Y1 past the group for the tail code. */
  692. ST y4, 4 * SIZE(Y1)
  693. unop
  694. ST y5, 5 * SIZE(Y1)
  695. unop
  696. ST y6, 6 * SIZE(Y1)
  697. lda A1, 8 * SIZE(A1)
  698. ST y7, 7 * SIZE(Y1)
  699. lda Y1, 8 * SIZE(Y1)
  700. .align 4
  /* Single-column tail, part 1: if bit 1 of M is set, update two more complex
   * elements of y from the column at A1. */
  701. $L25:
  702. and M, 2, I
  703. ble I, $L27
  704. LD a0, 0 * SIZE(A1)
  705. LD a1, 1 * SIZE(A1)
  706. LD a2, 2 * SIZE(A1)
  707. LD a3, 3 * SIZE(A1)
  708. MUL alpha1, a0, t0
  709. LD y0, 0 * SIZE(Y1)
  710. MUL alpha1, a1, t1
  711. LD y1, 1 * SIZE(Y1)
  712. MUL alpha1, a2, t2
  713. LD y2, 2 * SIZE(Y1)
  714. MUL alpha1, a3, t3
  715. LD y3, 3 * SIZE(Y1)
  /* ADD1/ADD2 take the alpha1 products, ADD3/ADD4 the alpha2 cross terms. */
  716. ADD1 y0, t0, y0
  717. MUL alpha2, a1, t0
  718. ADD2 y1, t1, y1
  719. MUL alpha2, a0, t1
  720. ADD1 y2, t2, y2
  721. MUL alpha2, a3, t2
  722. ADD2 y3, t3, y3
  723. MUL alpha2, a2, t3
  724. ADD3 y0, t0, y0
  725. ADD4 y1, t1, y1
  726. ADD3 y2, t2, y2
  727. ADD4 y3, t3, y3
  /* Store and advance A1 and Y1 by two complex elements (4 * SIZE). */
  728. ST y0, 0 * SIZE(Y1)
  729. ST y1, 1 * SIZE(Y1)
  730. ST y2, 2 * SIZE(Y1)
  731. lda A1, 4 * SIZE(A1)
  732. ST y3, 3 * SIZE(Y1)
  733. lda Y1, 4 * SIZE(Y1)
  734. .align 4
  /* Single-column tail, part 2: if the low bit of M is set, update the last
   * complex element of y; otherwise continue at $L990. */
  735. $L27:
  736. blbc M, $L990
  737. LD a0, 0 * SIZE(A1)
  738. LD a1, 1 * SIZE(A1)
  739. MUL alpha1, a0, t0
  740. LD y0, 0 * SIZE(Y1)
  741. MUL alpha1, a1, t1
  742. LD y1, 1 * SIZE(Y1)
  743. ADD1 y0, t0, y0
  744. MUL alpha2, a1, t0
  745. ADD2 y1, t1, y1
  746. MUL alpha2, a0, t1
  747. ADD3 y0, t0, y0
  748. ADD4 y1, t1, y1
  749. ST y0, 0 * SIZE(Y1)
  750. ST y1, 1 * SIZE(Y1)
  751. .align 4
  /* Write-back for strided y. If the scaled INCY equals 2 * SIZE the result
   * was accumulated in place and the routine returns. Otherwise the prologue
   * exchanged the pointers: Y is the contiguous work buffer holding the
   * accumulated sums and BUFFER is the caller's y. Y1 becomes the store
   * cursor into the caller's y, BUFFER the load cursor. */
  752. $L990:
  753. cmpeq INCY, 2 * SIZE, $0
  754. bne $0, $L999
  755. mov BUFFER, Y1
  /* I = M / 4 groups of four complex elements. */
  756. sra M, 2, I
  757. ble I, $L995
  758. .align 4
  /* Per pass: read four complex elements of the caller's y (stride INCY) into
   * a0..a7 and four accumulated sums (contiguous) into y0..y7, add them, and
   * store the result back to the caller's y with stride INCY. */
  759. $L992:
  760. LD a0, 0 * SIZE(BUFFER)
  761. LD a1, 1 * SIZE(BUFFER)
  762. addq BUFFER, INCY, BUFFER
  763. LD a2, 0 * SIZE(BUFFER)
  764. LD a3, 1 * SIZE(BUFFER)
  765. addq BUFFER, INCY, BUFFER
  766. LD y0, 0 * SIZE(Y)
  767. LD y1, 1 * SIZE(Y)
  768. LD y2, 2 * SIZE(Y)
  769. LD y3, 3 * SIZE(Y)
  770. LD a4, 0 * SIZE(BUFFER)
  771. LD a5, 1 * SIZE(BUFFER)
  772. addq BUFFER, INCY, BUFFER
  773. LD a6, 0 * SIZE(BUFFER)
  774. LD a7, 1 * SIZE(BUFFER)
  775. addq BUFFER, INCY, BUFFER
  776. LD y4, 4 * SIZE(Y)
  777. LD y5, 5 * SIZE(Y)
  778. LD y6, 6 * SIZE(Y)
  779. LD y7, 7 * SIZE(Y)
  780. ADD a0, y0, a0
  781. ADD a1, y1, a1
  782. ADD a2, y2, a2
  783. ADD a3, y3, a3
  /* Stores are interleaved with the remaining additions. */
  784. ST a0, 0 * SIZE(Y1)
  785. ADD a4, y4, a4
  786. ST a1, 1 * SIZE(Y1)
  787. ADD a5, y5, a5
  788. addq Y1, INCY, Y1
  789. ST a2, 0 * SIZE(Y1)
  790. ADD a6, y6, a6
  791. ST a3, 1 * SIZE(Y1)
  792. ADD a7, y7, a7
  793. addq Y1, INCY, Y1
  794. ST a4, 0 * SIZE(Y1)
  795. ST a5, 1 * SIZE(Y1)
  796. addq Y1, INCY, Y1
  797. ST a6, 0 * SIZE(Y1)
  798. ST a7, 1 * SIZE(Y1)
  799. addq Y1, INCY, Y1
  /* Advance the contiguous cursor by four complex elements and loop. */
  800. lda I, -1(I)
  801. lda Y, 8 * SIZE(Y)
  802. bgt I, $L992
  803. .align 4
  /* Write-back remainder: I = M mod 4 complex elements left. */
  804. $L995:
  805. and M, 3, I
  806. ble I, $L999
  807. .align 4
  /* One complex element per pass: caller's y (via BUFFER, stride INCY) plus
   * the accumulated sum (via Y, contiguous), stored through Y1 with stride
   * INCY. */
  808. $L996:
  809. LD a0, 0 * SIZE(BUFFER)
  810. LD a1, 1 * SIZE(BUFFER)
  811. addq BUFFER, INCY, BUFFER
  812. LD y0, 0 * SIZE(Y)
  813. LD y1, 1 * SIZE(Y)
  814. lda Y, 2 * SIZE(Y)
  815. ADD a0, y0, a0
  816. ADD a1, y1, a1
  817. ST a0, 0 * SIZE(Y1)
  818. ST a1, 1 * SIZE(Y1)
  819. addq Y1, INCY, Y1
  820. lda I, -1(I)
  821. bgt I, $L996
  822. .align 4
  /* Common exit: reload $f2-$f9 from the slots written in the prologue,
   * release the STACKSIZE-byte frame and return. Also reached directly when
   * M <= 0 or N <= 0. EPILOGUE is a macro supplied by the included headers. */
  823. $L999:
  824. ldt $f2, 0($sp)
  825. ldt $f3, 8($sp)
  826. ldt $f4, 16($sp)
  827. ldt $f5, 24($sp)
  828. ldt $f6, 32($sp)
  829. ldt $f7, 40($sp)
  830. ldt $f8, 48($sp)
  831. ldt $f9, 56($sp)
  832. lda $sp, STACKSIZE($sp)
  833. ret
  834. EPILOGUE