You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemv_t.S 17 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Preprocessor setup and register aliases for the kernel below. */
  /* NOTE(review): ASSEMBLER presumably selects the assembler-side */
  /* definitions in common.h (PROLOGUE, LD, ST, SIZE, ...) - confirm. */
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #include "version.h"
  /* Bytes of stack frame; the prologue stores $f2-$f9 at offsets 0..56. */
  41. #define STACKSIZE 64
  /* Look-ahead, in elements (it is multiplied by SIZE), of the */
  /* discarded loads to $31 used in the copy and compute loops. */
  42. #define PREFETCHSIZE 32
  /* Integer registers. X, INCX, Y, INCY and BUFFER are loaded from */
  /* the caller's stack area by the prologue; M, N, A and LDA are */
  /* used as found in their registers on entry. */
  43. #define M $16
  44. #define N $17
  45. #define A $20
  46. #define LDA $21
  47. #define X $18
  48. #define INCX $19
  49. #define Y $22
  50. #define INCY $23
  51. #define BUFFER $24
  /* I counts rows inside a column group, J counts column groups. */
  52. #define I $25
  53. #define J $27
  /* Working pointers: X1 walks x, Y1 is the write pointer (into the */
  /* buffer while x is copied, into y afterwards), A1-A4 walk up to */
  /* four adjacent columns of the matrix. */
  54. #define X1 $3
  55. #define Y1 $4
  56. #define A1 $5
  57. #define A2 $6
  58. #define A3 $7
  59. #define A4 $8
  /* Scalar multiplier applied to each dot product before it is added to y. */
  60. #define alpha $f19
  /* s0-s3: running sums. t0-t3: products issued but not yet added. */
  61. #define s0 $f0
  62. #define s1 $f1
  63. #define s2 $f10
  64. #define s3 $f11
  65. #define t0 $f12
  66. #define t1 $f13
  67. #define t2 $f14
  68. #define t3 $f15
  /* x0-x3: elements of x currently in flight. */
  69. #define x0 $f16
  70. #define x1 $f17
  71. #define x2 $f18
  72. #define x3 $f21
  /* a0-a15: matrix elements in flight (also used as scratch for the */
  /* x copy, the y update and, in the one-column path, x elements). */
  73. #define a0 $f22
  74. #define a1 $f23
  75. #define a2 $f24
  76. #define a3 $f25
  77. #define a4 $f26
  78. #define a5 $f27
  79. #define a6 $f28
  80. #define a7 $f29
  /* a8-a15 live in $f2-$f9, which the prologue saves and $L999 restores. */
  81. #define a8 $f2
  82. #define a9 $f3
  83. #define a10 $f4
  84. #define a11 $f5
  85. #define a12 $f6
  86. #define a13 $f7
  87. #define a14 $f8
  88. #define a15 $f9
  /*
   * Transposed matrix-vector kernel: for every column j of A (N columns,
   * consecutive columns LDA elements apart) it forms the dot product of
   * the M-element column with x, scales it by alpha and adds it to the
   * matching element of y:
   *
   *     y[j] += alpha * sum_i A[i + j * LDA] * x[i]
   *
   * x is read with stride INCX and y is stepped with stride INCY.
   * Columns are processed in groups of 4 ($L11), then 2 ($L20), then 1
   * ($L30); rows are processed 8 at a time with a one-row remainder loop.
   *
   * Element size and the load/store/arithmetic opcodes come from the
   * SIZE, LD, ST, ADD and MUL macros, which are not defined in this file.
   * PROLOGUE, PROFCODE and EPILOGUE are likewise external macros.
   */
  89. PROLOGUE
  /* Open a STACKSIZE-byte frame, then fetch the five arguments that */
  /* sit just above it in the caller's stack area. */
  90. lda $sp, -STACKSIZE($sp)
  91. ldq X, 0 + STACKSIZE($sp)
  92. ldq INCX, 8 + STACKSIZE($sp)
  93. ldq Y, 16 + STACKSIZE($sp)
  94. ldq INCY, 24 + STACKSIZE($sp)
  95. ldq BUFFER, 32 + STACKSIZE($sp)
  /* Save $f2-$f9: they are used as a8-a15 and restored at $L999. */
  96. stt $f2, 0($sp)
  97. stt $f3, 8($sp)
  98. stt $f4, 16($sp)
  99. stt $f5, 24($sp)
  100. stt $f6, 32($sp)
  101. stt $f7, 40($sp)
  102. stt $f8, 48($sp)
  103. stt $f9, 56($sp)
  104. PROFCODE
  /* Nothing to do when M <= 0 or N <= 0. */
  /* NOTE(review): SXADDQ is an external macro; "SXADDQ r, 0, r" */
  /* presumably scales an element count to a byte count - confirm. */
  105. cmple M, 0, $0
  106. SXADDQ INCX, 0, INCX
  107. cmple N, 0, $1
  108. SXADDQ INCY, 0, INCY
  109. or $0, $1, $0
  110. bne $0, $L999
  /* If the scaled INCX equals SIZE, x is used in place ($L10). */
  /* Otherwise x is gathered into BUFFER and X is redirected to it. */
  111. cmpeq INCX, SIZE, $0
  112. mov X, X1
  113. SXADDQ LDA, 0, LDA
  114. bne $0, $L10
  115. sra M, 3, I
  116. mov BUFFER, Y1
  117. mov BUFFER, X
  118. ble I, $L05
  119. .align 4
  /* Gather loop: 8 elements per pass, read through X1 with stride */
  /* INCX, written to consecutive buffer slots through Y1. The load */
  /* to $31 discards its result and only touches memory ahead of X1. */
  120. $L02:
  121. ldl $31, (PREFETCHSIZE + 0) * SIZE(X1)
  122. lda I, -1(I)
  123. LD a0, 0 * SIZE(X1)
  124. addq X1, INCX, X1
  125. LD a1, 0 * SIZE(X1)
  126. addq X1, INCX, X1
  127. LD a2, 0 * SIZE(X1)
  128. addq X1, INCX, X1
  129. LD a3, 0 * SIZE(X1)
  130. addq X1, INCX, X1
  131. ST a0, 0 * SIZE(Y1)
  132. ST a1, 1 * SIZE(Y1)
  133. ST a2, 2 * SIZE(Y1)
  134. ST a3, 3 * SIZE(Y1)
  135. LD a4, 0 * SIZE(X1)
  136. addq X1, INCX, X1
  137. LD a5, 0 * SIZE(X1)
  138. addq X1, INCX, X1
  139. LD a6, 0 * SIZE(X1)
  140. addq X1, INCX, X1
  141. LD a7, 0 * SIZE(X1)
  142. addq X1, INCX, X1
  143. ST a4, 4 * SIZE(Y1)
  144. ST a5, 5 * SIZE(Y1)
  145. ST a6, 6 * SIZE(Y1)
  146. ST a7, 7 * SIZE(Y1)
  147. lda Y1, 8 * SIZE(Y1)
  148. bgt I, $L02
  149. .align 4
  /* Gather the remaining M mod 8 elements one at a time. */
  150. $L05:
  151. and M, 7, I
  152. ble I, $L10
  153. .align 4
  154. $L06:
  155. LD a0, 0 * SIZE(X1)
  156. addq X1, INCX, X1
  157. ST a0, 0 * SIZE(Y1)
  158. addq Y1, SIZE, Y1
  159. lda I, -1(I)
  160. bgt I, $L06
  161. .align 4
  /* Compute phase. Y stays the read pointer into y and Y1 becomes */
  /* the write pointer; both advance by INCY per column. t0-t3 are */
  /* cleared so the first "ADD s, t, s" of every pipeline adds zero. */
  /* J = N / 4 is the number of four-column groups. */
  162. $L10:
  163. mov Y, Y1
  164. fclr t0
  165. unop
  166. fclr t1
  167. sra N, 2, J
  168. fclr t2
  169. fclr t3
  170. ble J, $L20
  171. .align 4
  /* Four-column group: A1..A4 point at four adjacent columns, A is */
  /* advanced by 4 * LDA for the next group, s0-s3 start at zero and */
  /* X1 restarts at the beginning of x. The load to $f31 discards */
  /* its result and only touches y ahead of the update at $L18. */
  172. $L11:
  173. mov A, A1
  174. fclr s0
  175. addq A, LDA, A2
  176. fclr s1
  177. addq A2, LDA, A3
  178. fclr s2
  179. addq A3, LDA, A4
  180. fclr s3
  181. s4addq LDA, A, A
  182. unop
  183. mov X, X1
  184. lds $f31, 3 * SIZE(Y)
  185. sra M, 3, I
  186. ble I, $L15
  /* Prime the pipeline: x rows 0-2 and matrix rows 0-3 of all four */
  /* columns (a0-a3 row 0, a4-a7 row 1, a8-a11 row 2, a12-a15 row 3). */
  187. LD x0, 0 * SIZE(X1)
  188. LD x1, 1 * SIZE(X1)
  189. LD x2, 2 * SIZE(X1)
  190. LD a0, 0 * SIZE(A1)
  191. LD a1, 0 * SIZE(A2)
  192. LD a2, 0 * SIZE(A3)
  193. LD a3, 0 * SIZE(A4)
  194. LD a4, 1 * SIZE(A1)
  195. LD a5, 1 * SIZE(A2)
  196. LD a6, 1 * SIZE(A3)
  197. LD a7, 1 * SIZE(A4)
  198. LD a8, 2 * SIZE(A1)
  199. LD a9, 2 * SIZE(A2)
  200. LD a10, 2 * SIZE(A3)
  201. LD a11, 2 * SIZE(A4)
  202. LD a12, 3 * SIZE(A1)
  203. LD a13, 3 * SIZE(A2)
  204. LD a14, 3 * SIZE(A3)
  205. LD a15, 3 * SIZE(A4)
  206. lda I, -1(I)
  207. ble I, $L13
  208. .align 4
  /* Main loop, 8 rows x 4 columns per pass, software pipelined. */
  /* Each run of four ADD/MUL pairs handles one row: ADD folds the */
  /* product issued for the previous row into s0-s3, MUL issues this */
  /* row's product, and the paired LD refills the register just used */
  /* with the element four rows ahead. A1-A4 and X1 are advanced by */
  /* 8 elements part-way through the pass, so later offsets such as */
  /* -2 * SIZE(A1) are relative to the already advanced pointer. */
  /* Loads to $31 discard their result and only touch memory ahead. */
  209. $L12:
  210. ADD s0, t0, s0
  211. LD x3, 3 * SIZE(X1)
  212. MUL x0, a0, t0
  213. LD a0, 4 * SIZE(A1)
  214. ADD s1, t1, s1
  215. ldl $31, (PREFETCHSIZE + 0) * SIZE(A1)
  216. MUL x0, a1, t1
  217. LD a1, 4 * SIZE(A2)
  218. ADD s2, t2, s2
  219. unop
  220. MUL x0, a2, t2
  221. LD a2, 4 * SIZE(A3)
  222. ADD s3, t3, s3
  223. unop
  224. MUL x0, a3, t3
  225. LD a3, 4 * SIZE(A4)
  226. ADD s0, t0, s0
  227. LD x0, 4 * SIZE(X1)
  228. MUL x1, a4, t0
  229. LD a4, 5 * SIZE(A1)
  230. ADD s1, t1, s1
  231. lda A1, 8 * SIZE(A1)
  232. MUL x1, a5, t1
  233. LD a5, 5 * SIZE(A2)
  234. ADD s2, t2, s2
  235. unop
  236. MUL x1, a6, t2
  237. LD a6, 5 * SIZE(A3)
  238. ADD s3, t3, s3
  239. unop
  240. MUL x1, a7, t3
  241. LD a7, 5 * SIZE(A4)
  242. ADD s0, t0, s0
  243. LD x1, 5 * SIZE(X1)
  244. MUL x2, a8, t0
  245. LD a8, -2 * SIZE(A1)
  246. ADD s1, t1, s1
  247. ldl $31, (PREFETCHSIZE + 0) * SIZE(A2)
  248. MUL x2, a9, t1
  249. LD a9, 6 * SIZE(A2)
  250. ADD s2, t2, s2
  251. lda A2, 8 * SIZE(A2)
  252. MUL x2, a10, t2
  253. LD a10, 6 * SIZE(A3)
  254. ADD s3, t3, s3
  255. lda A3, 8 * SIZE(A3)
  256. MUL x2, a11, t3
  257. LD a11, 6 * SIZE(A4)
  258. ADD s0, t0, s0
  259. LD x2, 6 * SIZE(X1)
  260. MUL x3, a12, t0
  261. LD a12, -1 * SIZE(A1)
  262. ADD s1, t1, s1
  263. lda A4, 8 * SIZE(A4)
  264. MUL x3, a13, t1
  265. LD a13, -1 * SIZE(A2)
  266. ADD s2, t2, s2
  267. unop
  268. MUL x3, a14, t2
  269. LD a14, -1 * SIZE(A3)
  270. ADD s3, t3, s3
  271. unop
  272. MUL x3, a15, t3
  273. LD a15, -1 * SIZE(A4)
  /* Second half of the pass (rows 4-7); the matrix loads now fetch */
  /* rows 0-3 of the next 8-row block through the advanced A1-A4. */
  274. ADD s0, t0, s0
  275. LD x3, 7 * SIZE(X1)
  276. MUL x0, a0, t0
  277. LD a0, 0 * SIZE(A1)
  278. ADD s1, t1, s1
  279. ldl $31, (PREFETCHSIZE - 8) * SIZE(A3)
  280. MUL x0, a1, t1
  281. LD a1, 0 * SIZE(A2)
  282. ADD s2, t2, s2
  283. unop
  284. MUL x0, a2, t2
  285. LD a2, 0 * SIZE(A3)
  286. ADD s3, t3, s3
  287. unop
  288. MUL x0, a3, t3
  289. LD a3, 0 * SIZE(A4)
  290. ADD s0, t0, s0
  291. LD x0, 8 * SIZE(X1)
  292. MUL x1, a4, t0
  293. LD a4, 1 * SIZE(A1)
  294. ADD s1, t1, s1
  295. unop
  296. MUL x1, a5, t1
  297. LD a5, 1 * SIZE(A2)
  298. ADD s2, t2, s2
  299. unop
  300. MUL x1, a6, t2
  301. LD a6, 1 * SIZE(A3)
  302. ADD s3, t3, s3
  303. unop
  304. MUL x1, a7, t3
  305. LD a7, 1 * SIZE(A4)
  306. ADD s0, t0, s0
  307. LD x1, 9 * SIZE(X1)
  308. MUL x2, a8, t0
  309. LD a8, 2 * SIZE(A1)
  310. ADD s1, t1, s1
  311. ldl $31, (PREFETCHSIZE - 8) * SIZE(A4)
  312. MUL x2, a9, t1
  313. LD a9, 2 * SIZE(A2)
  314. ADD s2, t2, s2
  315. lda X1, 8 * SIZE(X1)
  316. MUL x2, a10, t2
  317. LD a10, 2 * SIZE(A3)
  318. ADD s3, t3, s3
  319. lda I, -1(I)
  320. MUL x2, a11, t3
  321. LD a11, 2 * SIZE(A4)
  322. ADD s0, t0, s0
  323. LD x2, 2 * SIZE(X1)
  324. MUL x3, a12, t0
  325. LD a12, 3 * SIZE(A1)
  326. ADD s1, t1, s1
  327. ldl $31, (PREFETCHSIZE - 8) * SIZE(X1)
  328. MUL x3, a13, t1
  329. LD a13, 3 * SIZE(A2)
  330. ADD s2, t2, s2
  331. unop
  332. MUL x3, a14, t2
  333. LD a14, 3 * SIZE(A3)
  334. ADD s3, t3, s3
  335. MUL x3, a15, t3
  336. LD a15, 3 * SIZE(A4)
  337. bgt I, $L12
  338. .align 4
  /* Drain for the last full 8-row block: same arithmetic as $L12, */
  /* but it loads only rows 4-7 of this block and nothing from a */
  /* following one. The final four products stay pending in t0-t3. */
  339. $L13:
  340. ADD s0, t0, s0
  341. LD x3, 3 * SIZE(X1)
  342. MUL x0, a0, t0
  343. LD a0, 4 * SIZE(A1)
  344. ADD s1, t1, s1
  345. unop
  346. MUL x0, a1, t1
  347. LD a1, 4 * SIZE(A2)
  348. ADD s2, t2, s2
  349. unop
  350. MUL x0, a2, t2
  351. LD a2, 4 * SIZE(A3)
  352. ADD s3, t3, s3
  353. unop
  354. MUL x0, a3, t3
  355. LD a3, 4 * SIZE(A4)
  356. ADD s0, t0, s0
  357. LD x0, 4 * SIZE(X1)
  358. MUL x1, a4, t0
  359. LD a4, 5 * SIZE(A1)
  360. ADD s1, t1, s1
  361. unop
  362. MUL x1, a5, t1
  363. LD a5, 5 * SIZE(A2)
  364. ADD s2, t2, s2
  365. unop
  366. MUL x1, a6, t2
  367. LD a6, 5 * SIZE(A3)
  368. ADD s3, t3, s3
  369. unop
  370. MUL x1, a7, t3
  371. LD a7, 5 * SIZE(A4)
  372. ADD s0, t0, s0
  373. LD x1, 5 * SIZE(X1)
  374. MUL x2, a8, t0
  375. LD a8, 6 * SIZE(A1)
  376. ADD s1, t1, s1
  377. unop
  378. MUL x2, a9, t1
  379. LD a9, 6 * SIZE(A2)
  380. ADD s2, t2, s2
  381. unop
  382. MUL x2, a10, t2
  383. LD a10, 6 * SIZE(A3)
  384. ADD s3, t3, s3
  385. unop
  386. MUL x2, a11, t3
  387. LD a11, 6 * SIZE(A4)
  388. ADD s0, t0, s0
  389. LD x2, 6 * SIZE(X1)
  390. MUL x3, a12, t0
  391. LD a12, 7 * SIZE(A1)
  392. ADD s1, t1, s1
  393. lda A1, 8 * SIZE(A1)
  394. MUL x3, a13, t1
  395. LD a13, 7 * SIZE(A2)
  396. ADD s2, t2, s2
  397. lda A2, 8 * SIZE(A2)
  398. MUL x3, a14, t2
  399. LD a14, 7 * SIZE(A3)
  400. ADD s3, t3, s3
  401. lda A3, 8 * SIZE(A3)
  402. MUL x3, a15, t3
  403. LD a15, 7 * SIZE(A4)
  404. ADD s0, t0, s0
  405. LD x3, 7 * SIZE(X1)
  406. MUL x0, a0, t0
  407. unop
  408. ADD s1, t1, s1
  409. lda X1, 8 * SIZE(X1)
  410. MUL x0, a1, t1
  411. lda A4, 8 * SIZE(A4)
  412. ADD s2, t2, s2
  413. MUL x0, a2, t2
  414. ADD s3, t3, s3
  415. MUL x0, a3, t3
  416. ADD s0, t0, s0
  417. MUL x1, a4, t0
  418. ADD s1, t1, s1
  419. MUL x1, a5, t1
  420. ADD s2, t2, s2
  421. MUL x1, a6, t2
  422. ADD s3, t3, s3
  423. MUL x1, a7, t3
  424. ADD s0, t0, s0
  425. MUL x2, a8, t0
  426. ADD s1, t1, s1
  427. MUL x2, a9, t1
  428. ADD s2, t2, s2
  429. MUL x2, a10, t2
  430. ADD s3, t3, s3
  431. MUL x2, a11, t3
  432. ADD s0, t0, s0
  433. MUL x3, a12, t0
  434. ADD s1, t1, s1
  435. MUL x3, a13, t1
  436. ADD s2, t2, s2
  437. MUL x3, a14, t2
  438. ADD s3, t3, s3
  439. MUL x3, a15, t3
  440. .align 4
  /* Remaining M mod 8 rows of the four-column group, one row per */
  /* pass, with the same fold-then-multiply pipelining. */
  441. $L15:
  442. and M, 7, I
  443. ble I, $L18
  444. LD x0, 0 * SIZE(X1)
  445. LD a0, 0 * SIZE(A1)
  446. LD a1, 0 * SIZE(A2)
  447. LD a2, 0 * SIZE(A3)
  448. LD a3, 0 * SIZE(A4)
  449. lda I, -1(I)
  450. ble I, $L17
  451. .align 4
  /* A4 is advanced first, so its next element is read at offset 0; */
  /* A1-A3 are read at offset 1 around their own increments. */
  452. $L16:
  453. ADD s0, t0, s0
  454. lda A4, 1 * SIZE(A4)
  455. MUL x0, a0, t0
  456. LD a0, 1 * SIZE(A1)
  457. ADD s1, t1, s1
  458. lda A1, 1 * SIZE(A1)
  459. MUL x0, a1, t1
  460. LD a1, 1 * SIZE(A2)
  461. ADD s2, t2, s2
  462. lda A2, 1 * SIZE(A2)
  463. MUL x0, a2, t2
  464. LD a2, 1 * SIZE(A3)
  465. ADD s3, t3, s3
  466. lda A3, 1 * SIZE(A3)
  467. MUL x0, a3, t3
  468. LD a3, 0 * SIZE(A4)
  469. LD x0, 1 * SIZE(X1)
  470. lda X1, 1 * SIZE(X1)
  471. lda I, -1(I)
  472. bgt I, $L16
  473. .align 4
  /* Last remainder row: multiply without loading anything further. */
  474. $L17:
  475. ADD s0, t0, s0
  476. MUL x0, a0, t0
  477. ADD s1, t1, s1
  478. MUL x0, a1, t1
  479. ADD s2, t2, s2
  480. MUL x0, a2, t2
  481. ADD s3, t3, s3
  482. MUL x0, a3, t3
  483. .align 4
  /* Finish the group: read four y elements through Y, fold the */
  /* still-pending products, scale the sums by alpha, add them to y */
  /* and store through Y1. t0-t3 are cleared for the next group. */
  484. $L18:
  485. LD a0, 0 * SIZE(Y)
  486. addq Y, INCY, Y
  487. LD a1, 0 * SIZE(Y)
  488. addq Y, INCY, Y
  489. LD a2, 0 * SIZE(Y)
  490. addq Y, INCY, Y
  491. LD a3, 0 * SIZE(Y)
  492. addq Y, INCY, Y
  493. ADD s0, t0, s0
  494. ADD s1, t1, s1
  495. ADD s2, t2, s2
  496. ADD s3, t3, s3
  497. MUL alpha, s0, s0
  498. MUL alpha, s1, s1
  499. MUL alpha, s2, s2
  500. MUL alpha, s3, s3
  501. ADD a0, s0, a0
  502. fclr t0
  503. ADD a1, s1, a1
  504. fclr t1
  505. ADD a2, s2, a2
  506. fclr t2
  507. ADD a3, s3, a3
  508. fclr t3
  509. ST a0, 0 * SIZE(Y1)
  510. addq Y1, INCY, Y1
  511. ST a1, 0 * SIZE(Y1)
  512. addq Y1, INCY, Y1
  513. ST a2, 0 * SIZE(Y1)
  514. addq Y1, INCY, Y1
  515. ST a3, 0 * SIZE(Y1)
  516. addq Y1, INCY, Y1
  517. lda J, -1(J)
  518. bgt J, $L11
  519. .align 4
  /* Two-column group, taken when bit 1 of N is set. A1 and A2 point */
  /* at the two columns and A is advanced past both. */
  520. $L20:
  521. and N, 2, J
  522. ble J, $L30
  523. mov A, A1
  524. addq A, LDA, A2
  525. addq A2, LDA, A
  526. fclr s0
  527. mov X, X1
  528. fclr s1
  529. sra M, 3, I
  530. fclr s2
  531. fclr s3
  532. ble I, $L25
  /* Prime the pipeline with rows 0-7 of both columns (even-numbered */
  /* aN from A1, odd-numbered aN from A2) and x rows 0-2. */
  533. LD a0, 0 * SIZE(A1)
  534. LD a1, 0 * SIZE(A2)
  535. LD a2, 1 * SIZE(A1)
  536. LD a3, 1 * SIZE(A2)
  537. LD a4, 2 * SIZE(A1)
  538. LD a5, 2 * SIZE(A2)
  539. LD a6, 3 * SIZE(A1)
  540. LD a7, 3 * SIZE(A2)
  541. LD a8, 4 * SIZE(A1)
  542. LD a9, 4 * SIZE(A2)
  543. LD a10, 5 * SIZE(A1)
  544. LD a11, 5 * SIZE(A2)
  545. LD a12, 6 * SIZE(A1)
  546. LD a13, 6 * SIZE(A2)
  547. LD a14, 7 * SIZE(A1)
  548. LD a15, 7 * SIZE(A2)
  549. LD x0, 0 * SIZE(X1)
  550. LD x1, 1 * SIZE(X1)
  551. LD x2, 2 * SIZE(X1)
  552. lda I, -1(I)
  553. ble I, $L23
  554. .align 4
  /* Main loop, 8 rows x 2 columns per pass. Only s0 and s1 */
  /* accumulate. Rows 0-3 alternate between t0/t1 and t2/t3 as the */
  /* pending-product registers; rows 4-7 use t0/t1 only. The row 3 */
  /* products therefore stay in t2/t3 until the next pass, or until */
  /* $L28 folds them in through s2/s3. Each matrix LD fetches the */
  /* element 8 rows ahead; X1, A1 and A2 are advanced mid-pass, so */
  /* later offsets are relative to the advanced pointers. */
  555. $L22:
  556. ADD s0, t0, s0
  557. LD x3, 3 * SIZE(X1)
  558. MUL x0, a0, t0
  559. LD a0, 8 * SIZE(A1)
  560. ADD s1, t1, s1
  561. ldl $31, (PREFETCHSIZE + 0) * SIZE(A1)
  562. MUL x0, a1, t1
  563. LD a1, 8 * SIZE(A2)
  564. ADD s0, t2, s0
  565. LD x0, 4 * SIZE(X1)
  566. MUL x1, a2, t2
  567. LD a2, 9 * SIZE(A1)
  568. ADD s1, t3, s1
  569. unop
  570. MUL x1, a3, t3
  571. LD a3, 9 * SIZE(A2)
  572. ADD s0, t0, s0
  573. LD x1, 5 * SIZE(X1)
  574. MUL x2, a4, t0
  575. LD a4, 10 * SIZE(A1)
  576. ADD s1, t1, s1
  577. lda I, -1(I)
  578. MUL x2, a5, t1
  579. LD a5, 10 * SIZE(A2)
  580. ADD s0, t2, s0
  581. LD x2, 6 * SIZE(X1)
  582. MUL x3, a6, t2
  583. LD a6, 11 * SIZE(A1)
  584. ADD s1, t3, s1
  585. lda X1, 8 * SIZE(X1)
  586. MUL x3, a7, t3
  587. LD a7, 11 * SIZE(A2)
  588. ADD s0, t0, s0
  589. LD x3, -1 * SIZE(X1)
  590. MUL x0, a8, t0
  591. LD a8, 12 * SIZE(A1)
  592. ADD s1, t1, s1
  593. ldl $31, (PREFETCHSIZE + 0) * SIZE(A2)
  594. MUL x0, a9, t1
  595. LD a9, 12 * SIZE(A2)
  596. ADD s0, t0, s0
  597. LD x0, 0 * SIZE(X1)
  598. MUL x1, a10, t0
  599. LD a10, 13 * SIZE(A1)
  600. ADD s1, t1, s1
  601. lda A1, 8 * SIZE(A1)
  602. MUL x1, a11, t1
  603. LD a11, 13 * SIZE(A2)
  604. ADD s0, t0, s0
  605. LD x1, 1 * SIZE(X1)
  606. MUL x2, a12, t0
  607. LD a12, 6 * SIZE(A1)
  608. ADD s1, t1, s1
  609. MUL x2, a13, t1
  610. LD a13, 14 * SIZE(A2)
  611. lda A2, 8 * SIZE(A2)
  612. ADD s0, t0, s0
  613. LD x2, 2 * SIZE(X1)
  614. MUL x3, a14, t0
  615. LD a14, 7 * SIZE(A1)
  616. ADD s1, t1, s1
  617. MUL x3, a15, t1
  618. LD a15, 7 * SIZE(A2)
  619. bgt I, $L22
  620. .align 4
  /* Drain for the last full 8-row block: all matrix elements are */
  /* already in a0-a15, so only the remaining x elements are loaded. */
  621. $L23:
  622. ADD s0, t0, s0
  623. LD x3, 3 * SIZE(X1)
  624. MUL x0, a0, t0
  625. lda A1, 8 * SIZE(A1)
  626. ADD s1, t1, s1
  627. unop
  628. MUL x0, a1, t1
  629. unop
  630. ADD s0, t2, s0
  631. LD x0, 4 * SIZE(X1)
  632. MUL x1, a2, t2
  633. lda A2, 8 * SIZE(A2)
  634. ADD s1, t3, s1
  635. unop
  636. MUL x1, a3, t3
  637. unop
  638. ADD s0, t0, s0
  639. LD x1, 5 * SIZE(X1)
  640. MUL x2, a4, t0
  641. unop
  642. ADD s1, t1, s1
  643. unop
  644. MUL x2, a5, t1
  645. unop
  646. ADD s0, t2, s0
  647. LD x2, 6 * SIZE(X1)
  648. MUL x3, a6, t2
  649. unop
  650. ADD s1, t3, s1
  651. unop
  652. MUL x3, a7, t3
  653. unop
  654. ADD s0, t0, s0
  655. LD x3, 7 * SIZE(X1)
  656. MUL x0, a8, t0
  657. lda X1, 8 * SIZE(X1)
  658. ADD s1, t1, s1
  659. unop
  660. MUL x0, a9, t1
  661. unop
  662. ADD s0, t0, s0
  663. MUL x1, a10, t0
  664. ADD s1, t1, s1
  665. MUL x1, a11, t1
  666. ADD s0, t0, s0
  667. MUL x2, a12, t0
  668. ADD s1, t1, s1
  669. MUL x2, a13, t1
  670. ADD s0, t0, s0
  671. MUL x3, a14, t0
  672. ADD s1, t1, s1
  673. MUL x3, a15, t1
  674. .align 4
  /* Remaining M mod 8 rows of the two-column group, one row per pass. */
  675. $L25:
  676. and M, 7, I
  677. ble I, $L28
  678. LD a0, 0 * SIZE(A1)
  679. LD a1, 0 * SIZE(A2)
  680. LD x0, 0 * SIZE(X1)
  681. lda I, -1(I)
  682. ble I, $L27
  683. .align 4
  /* A2 is advanced before its load (offset 0); A1 after (offset 1). */
  684. $L26:
  685. ADD s0, t0, s0
  686. lda A2, 1 * SIZE(A2)
  687. MUL x0, a0, t0
  688. LD a0, 1 * SIZE(A1)
  689. ADD s1, t1, s1
  690. lda A1, 1 * SIZE(A1)
  691. MUL x0, a1, t1
  692. LD a1, 0 * SIZE(A2)
  693. LD x0, 1 * SIZE(X1)
  694. lda X1, 1 * SIZE(X1)
  695. lda I, -1(I)
  696. bgt I, $L26
  697. .align 4
  698. $L27:
  699. ADD s0, t0, s0
  700. MUL x0, a0, t0
  701. ADD s1, t1, s1
  702. MUL x0, a1, t1
  703. .align 4
  /* Finish the two-column group: fold pending t0-t3 (t2/t3 reach */
  /* s0/s1 through s2/s3), scale by alpha, add to y and store. */
  704. $L28:
  705. LD a0, 0 * SIZE(Y)
  706. addq Y, INCY, Y
  707. LD a1, 0 * SIZE(Y)
  708. addq Y, INCY, Y
  709. ADD s0, t0, s0
  710. ADD s1, t1, s1
  711. ADD s2, t2, s2
  712. ADD s3, t3, s3
  713. ADD s0, s2, s0
  714. ADD s1, s3, s1
  715. MUL alpha, s0, s0
  716. MUL alpha, s1, s1
  717. ADD a0, s0, a0
  718. ADD a1, s1, a1
  719. ST a0, 0 * SIZE(Y1)
  720. fclr t0
  721. addq Y1, INCY, Y1
  722. fclr t1
  723. ST a1, 0 * SIZE(Y1)
  724. fclr t2
  725. addq Y1, INCY, Y1
  726. fclr t3
  727. .align 4
  /* Last single column, taken only when the low bit of N is set. */
  /* One dot product is split over four partial sums s0-s3. */
  728. $L30:
  729. blbc N, $L999
  730. mov A, A1
  731. fclr s0
  732. mov X, X1
  733. fclr s1
  734. sra M, 3, I
  735. fclr s2
  736. fclr s3
  737. ble I, $L35
  /* Here a0-a7 hold column rows 0-7 and a8-a14 hold x rows 0-6; */
  /* x row 7 is loaded into a15 at the top of $L32 / $L33. */
  738. LD a0, 0 * SIZE(A1)
  739. LD a1, 1 * SIZE(A1)
  740. LD a8, 0 * SIZE(X1)
  741. LD a9, 1 * SIZE(X1)
  742. LD a2, 2 * SIZE(A1)
  743. LD a3, 3 * SIZE(A1)
  744. LD a10, 2 * SIZE(X1)
  745. LD a11, 3 * SIZE(X1)
  746. LD a4, 4 * SIZE(A1)
  747. LD a5, 5 * SIZE(A1)
  748. LD a12, 4 * SIZE(X1)
  749. LD a13, 5 * SIZE(X1)
  750. LD a6, 6 * SIZE(A1)
  751. LD a7, 7 * SIZE(A1)
  752. LD a14, 6 * SIZE(X1)
  753. lda I, -1(I)
  754. ble I, $L33
  755. .align 4
  /* Main loop, 8 rows per pass: each MUL consumes one (column, x) */
  /* pair and the two LDs that follow refill both registers with the */
  /* elements 8 rows ahead. Pointers advance at the end of the pass. */
  756. $L32:
  757. ADD s0, t0, s0
  758. LD a15, 7 * SIZE(X1)
  759. MUL a0, a8, t0
  760. LD a0, 8 * SIZE(A1)
  761. ADD s1, t1, s1
  762. LD a8, 8 * SIZE(X1)
  763. MUL a1, a9, t1
  764. LD a1, 9 * SIZE(A1)
  765. ADD s2, t2, s2
  766. LD a9, 9 * SIZE(X1)
  767. MUL a2, a10, t2
  768. LD a2, 10 * SIZE(A1)
  769. ADD s3, t3, s3
  770. LD a10, 10 * SIZE(X1)
  771. MUL a3, a11, t3
  772. LD a3, 11 * SIZE(A1)
  773. ADD s0, t0, s0
  774. LD a11, 11 * SIZE(X1)
  775. MUL a4, a12, t0
  776. LD a4, 12 * SIZE(A1)
  777. ADD s1, t1, s1
  778. LD a12, 12 * SIZE(X1)
  779. MUL a5, a13, t1
  780. LD a5, 13 * SIZE(A1)
  781. ADD s2, t2, s2
  782. LD a13, 13 * SIZE(X1)
  783. MUL a6, a14, t2
  784. LD a6, 14 * SIZE(A1)
  785. ADD s3, t3, s3
  786. LD a14, 14 * SIZE(X1)
  787. MUL a7, a15, t3
  788. LD a7, 15 * SIZE(A1)
  789. lda A1, 8 * SIZE(A1)
  790. lda I, -1(I)
  791. lda X1, 8 * SIZE(X1)
  792. bgt I, $L32
  793. .align 4
  /* Drain for the last full 8-row block: only x row 7 is loaded. */
  794. $L33:
  795. ADD s0, t0, s0
  796. LD a15, 7 * SIZE(X1)
  797. MUL a0, a8, t0
  798. lda A1, 8 * SIZE(A1)
  799. ADD s1, t1, s1
  800. unop
  801. MUL a1, a9, t1
  802. lda X1, 8 * SIZE(X1)
  803. ADD s2, t2, s2
  804. MUL a2, a10, t2
  805. ADD s3, t3, s3
  806. MUL a3, a11, t3
  807. ADD s0, t0, s0
  808. MUL a4, a12, t0
  809. ADD s1, t1, s1
  810. MUL a5, a13, t1
  811. ADD s2, t2, s2
  812. MUL a6, a14, t2
  813. ADD s3, t3, s3
  814. MUL a7, a15, t3
  815. .align 4
  /* Remaining M mod 8 rows of the single column, one row per pass. */
  816. $L35:
  817. and M, 7, I
  818. ble I, $L38
  819. LD a0, 0 * SIZE(A1)
  820. LD x0, 0 * SIZE(X1)
  821. lda I, -1(I)
  822. ble I, $L37
  823. .align 4
  824. $L36:
  825. ADD s0, t0, s0
  826. MUL x0, a0, t0
  827. LD a0, 1 * SIZE(A1)
  828. LD x0, 1 * SIZE(X1)
  829. lda A1, 1 * SIZE(A1)
  830. lda X1, 1 * SIZE(X1)
  831. lda I, -1(I)
  832. bgt I, $L36
  833. .align 4
  834. $L37:
  835. ADD s0, t0, s0
  836. MUL x0, a0, t0
  837. .align 4
  /* Finish the single column: fold pending products, reduce s0-s3 */
  /* into s0, scale by alpha, add to the y element and store it. */
  838. $L38:
  839. LD a0, 0 * SIZE(Y)
  840. ADD s0, t0, s0
  841. ADD s1, t1, s1
  842. ADD s2, t2, s2
  843. ADD s3, t3, s3
  844. ADD s0, s2, s0
  845. ADD s1, s3, s1
  846. ADD s0, s1, s0
  847. MUL alpha, s0, s0
  848. ADD a0, s0, a0
  849. ST a0, 0 * SIZE(Y1)
  850. .align 4
  /* Common exit: restore $f2-$f9 saved by the prologue, release the */
  /* stack frame and return. */
  851. $L999:
  852. ldt $f2, 0($sp)
  853. ldt $f3, 8($sp)
  854. ldt $f4, 16($sp)
  855. ldt $f5, 24($sp)
  856. ldt $f6, 32($sp)
  857. ldt $f7, 40($sp)
  858. ldt $f8, 48($sp)
  859. ldt $f9, 56($sp)
  860. lda $sp, STACKSIZE($sp)
  861. ret
  862. EPILOGUE