You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemv_n.S 22 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #include "version.h"
  /* ------------------------------------------------------------------ */
  /* Tunables                                                           */
  /* ------------------------------------------------------------------ */

  /* Bytes reserved on the stack by the prologue: eight 8-byte slots,   */
  /* one for each of $f2..$f9.                                          */
  #define STACKSIZE 64

  /* Look-ahead distance, in elements (it is multiplied by SIZE where   */
  /* used), for the discarded loads that act as prefetch hints.         */
  #define PREFETCHSIZE 32

  /* ------------------------------------------------------------------ */
  /* Integer registers                                                  */
  /* ------------------------------------------------------------------ */

  /* Arrive in registers: M bounds the inner (y) loops, N the outer     */
  /* (x / column) loops, A is the current column base, LDA the column   */
  /* stride.                                                            */
  #define M $16
  #define N $17
  #define A $20
  #define LDA $21

  /* Loaded from the caller's stack area by the prologue. */
  #define X $18
  #define INCX $19
  #define Y $22
  #define INCY $23
  #define BUFFER $24

  /* Loop counters: I for the inner loops, J for the outer ones. */
  #define I $25
  #define J $27

  /* Running pointer into y, and pointers to up to four columns of A. */
  #define Y1 $4
  #define A1 $5
  #define A2 $6
  #define A3 $7
  #define A4 $8

  /* ------------------------------------------------------------------ */
  /* Floating-point registers                                           */
  /* ------------------------------------------------------------------ */

  /* Incoming scalar, and its product with up to four x entries. */
  #define alpha $f19
  #define alpha1 $f0
  #define alpha2 $f1
  #define alpha3 $f10
  #define alpha4 $f11

  /* Eight y values being accumulated. */
  #define y0 $f12
  #define y1 $f13
  #define y2 $f14
  #define y3 $f15
  #define y4 $f16
  #define y5 $f17
  #define y6 $f18
  #define y7 $f21

  /* Elements of A and their scaled products. */
  #define a0 $f22
  #define a1 $f23
  #define a2 $f24
  #define a3 $f25
  #define a4 $f26
  #define a5 $f27
  #define a6 $f28
  #define a7 $f29

  /* a8..a15 live in $f2..$f9, which the prologue saves and the         */
  /* epilogue restores.                                                 */
  #define a8 $f2
  #define a9 $f3
  #define a10 $f4
  #define a11 $f5
  #define a12 $f6
  #define a13 $f7
  #define a14 $f8
  #define a15 $f9
  /* ------------------------------------------------------------------ */
  /* Entry.                                                             */
  /*   - open a STACKSIZE-byte frame and spill $f2..$f9 into it         */
  /*   - fetch X, INCX, Y, INCY, BUFFER from the caller's stack area    */
  /*   - leave at once when M <= 0 or N <= 0                            */
  /*   - when INCY differs from SIZE, accumulate into the scratch       */
  /*     buffer instead of y and clear that buffer first                */
  /* PROLOGUE, PROFCODE, SXADDQ, ST and SIZE come from common.h.        */
  /* NOTE(review): SXADDQ presumably turns an element count into a byte */
  /* stride; the strides are later added straight to pointers and INCY  */
  /* is compared against SIZE, which fits that reading -- confirm.      */
  /* ------------------------------------------------------------------ */
        PROLOGUE

        lda     $sp, -STACKSIZE($sp)            /* open the frame */

        /* Stack-passed arguments sit just above the new frame. */
        ldq     X, 0 + STACKSIZE($sp)
        ldq     INCX, 8 + STACKSIZE($sp)
        ldq     Y, 16 + STACKSIZE($sp)
        ldq     INCY, 24 + STACKSIZE($sp)
        ldq     BUFFER, 32 + STACKSIZE($sp)

        /* Spill the registers aliased as a8..a15. */
        stt     $f2, 0($sp)
        stt     $f3, 8($sp)
        stt     $f4, 16($sp)
        stt     $f5, 24($sp)
        stt     $f6, 32($sp)
        stt     $f7, 40($sp)
        stt     $f8, 48($sp)
        stt     $f9, 56($sp)

        PROFCODE

        /* Nothing to do for an empty problem. */
        cmple   M, 0, $0
        SXADDQ  INCX, 0, INCX
        cmple   N, 0, $1
        SXADDQ  INCY, 0, INCY
        or      $0, $1, $0
        bne     $0, $L999

        SXADDQ  LDA, 0, LDA

        /* INCY == SIZE: work on y in place. */
        cmpeq   INCY, SIZE, $0
        bne     $0, $L10

        /* Otherwise swap roles: Y now names the scratch area, BUFFER   */
        /* keeps the caller's y pointer for the write-back at $L990.    */
        mov     BUFFER, Y1
        mov     Y, BUFFER
        mov     Y1, Y

        /* Zero the scratch area, eight elements per pass. */
        sra     M, 3, I
        ble     I, $L05
        .align 4

  $L02:
        ST      $f31, 0 * SIZE(Y1)              /* $f31 reads as zero */
        ST      $f31, 1 * SIZE(Y1)
        ST      $f31, 2 * SIZE(Y1)
        ST      $f31, 3 * SIZE(Y1)
        ST      $f31, 4 * SIZE(Y1)
        ST      $f31, 5 * SIZE(Y1)
        ST      $f31, 6 * SIZE(Y1)
        ST      $f31, 7 * SIZE(Y1)
        lda     Y1, 8 * SIZE(Y1)
        lda     I, -1(I)
        bgt     I, $L02
        .align 4

  /* Remaining M mod 8 elements, one at a time. */
  $L05:
        and     M, 7, I
        ble     I, $L10
        .align 4

  $L06:
        ST      $f31, 0 * SIZE(Y1)
        addq    Y1, SIZE, Y1
        lda     I, -1(I)
        bgt     I, $L06
        .align 4
  /* ------------------------------------------------------------------ */
  /* Four columns per outer pass: J = N / 4 passes.                     */
  /* ------------------------------------------------------------------ */
  $L10:
        sra     N, 2, J
        ble     J, $L20
        .align 4

  /* Top of one four-column pass. */
  $L11:
        /* Fetch four consecutive x entries, stepping X by INCX. */
        LD      alpha1, 0 * SIZE(X)
        addq    X, INCX, X
        LD      alpha2, 0 * SIZE(X)
        addq    X, INCX, X
        LD      alpha3, 0 * SIZE(X)
        addq    X, INCX, X
        LD      alpha4, 0 * SIZE(X)
        addq    X, INCX, X

        /* alphaK = alpha * x entry K */
        MUL     alpha, alpha1, alpha1
        MUL     alpha, alpha2, alpha2
        MUL     alpha, alpha3, alpha3
        MUL     alpha, alpha4, alpha4

        /* A1..A4 = the four columns starting at A; then A += 4 * LDA. */
        mov     A, A1
        addq    A, LDA, A2
        addq    A2, LDA, A3
        addq    A3, LDA, A4
        s4addq  LDA, A, A

        mov     Y, Y1                           /* rewind the y cursor */
        ldl     $31, 4 * SIZE(X)                /* discarded load: touch x ahead */

        /* I = number of 8-row groups; none -> straight to the tails. */
        sra     M, 3, I
        ble     I, $L15
  /* ------------------------------------------------------------------ */
  /* Four columns x eight rows, software pipelined.                     */
  /*                                                                    */
  /* Each pass adds alpha1*A1[r] + alpha2*A2[r] + alpha3*A3[r] +        */
  /* alpha4*A4[r] into y[r] for eight consecutive r.  Loads for later   */
  /* rows are issued while earlier rows are still being multiplied and  */
  /* summed, so the instruction order below is significant and is kept  */
  /* exactly as written; only the layout and comments are new.          */
  /*                                                                    */
  /* Instructions come in groups of four (ADD, LD, MUL, filler); "unop" */
  /* fills an otherwise empty slot.  Loads whose destination is $31 or  */
  /* $f31 discard their result and serve as prefetch hints.             */
  /* ------------------------------------------------------------------ */

        /* Pipeline fill: rows 0..3 of all four columns, y[0..7]. */
        LD      a0, 0 * SIZE(A1)
        LD      a1, 1 * SIZE(A1)
        LD      a2, 2 * SIZE(A1)
        LD      a3, 3 * SIZE(A1)

        LD      a4, 0 * SIZE(A2)
        LD      a5, 1 * SIZE(A2)
        LD      a6, 2 * SIZE(A2)
        LD      a7, 3 * SIZE(A2)

        LD      y0, 0 * SIZE(Y1)
        LD      y1, 1 * SIZE(Y1)
        LD      y2, 2 * SIZE(Y1)
        LD      y3, 3 * SIZE(Y1)

        LD      a8, 0 * SIZE(A3)
        LD      a9, 1 * SIZE(A3)
        LD      a10, 2 * SIZE(A3)
        LD      a11, 3 * SIZE(A3)

        LD      y4, 4 * SIZE(Y1)
        LD      y5, 5 * SIZE(Y1)
        LD      y6, 6 * SIZE(Y1)
        LD      y7, 7 * SIZE(Y1)

        MUL     alpha1, a0, a0
        LD      a12, 0 * SIZE(A4)
        MUL     alpha1, a1, a1
        LD      a13, 1 * SIZE(A4)
        MUL     alpha1, a2, a2
        LD      a14, 2 * SIZE(A4)
        MUL     alpha1, a3, a3
        LD      a15, 3 * SIZE(A4)

        ADD     y0, a0, y0
        LD      a0, 4 * SIZE(A1)
        MUL     alpha2, a4, a4
        unop
        ADD     y1, a1, y1
        LD      a1, 5 * SIZE(A1)
        MUL     alpha2, a5, a5
        unop
        ADD     y2, a2, y2
        LD      a2, 6 * SIZE(A1)
        MUL     alpha2, a6, a6
        unop
        ADD     y3, a3, y3
        LD      a3, 7 * SIZE(A1)
        MUL     alpha2, a7, a7
        unop

        ADD     y0, a4, y0
        LD      a4, 4 * SIZE(A2)
        MUL     alpha3, a8, a8
        unop
        ADD     y1, a5, y1
        LD      a5, 5 * SIZE(A2)
        MUL     alpha3, a9, a9
        lda     I, -1(I)
        ADD     y2, a6, y2
        LD      a6, 6 * SIZE(A2)
        MUL     alpha3, a10, a10
        unop
        ADD     y3, a7, y3
        LD      a7, 7 * SIZE(A2)
        MUL     alpha3, a11, a11
        unop

        ADD     y0, a8, y0
        LD      a8, 4 * SIZE(A3)
        MUL     alpha4, a12, a12
        ble     I, $L13                         /* single group: drain only */
        .align 4

  /* Steady state: one 8-row group per trip. */
  $L12:
        ADD     y1, a9, y1
        LD      a9, 5 * SIZE(A3)
        MUL     alpha4, a13, a13
        ldl     $31, (PREFETCHSIZE + 0) * SIZE(A1)
        ADD     y2, a10, y2
        LD      a10, 6 * SIZE(A3)
        MUL     alpha4, a14, a14
        unop
        ADD     y3, a11, y3
        LD      a11, 7 * SIZE(A3)
        MUL     alpha4, a15, a15
        lda     I, -1(I)

        ADD     y0, a12, y0
        LD      a12, 4 * SIZE(A4)
        MUL     alpha1, a0, a0
        lds     $f31, (PREFETCHSIZE + 0) * SIZE(Y1)
        ADD     y1, a13, y1
        LD      a13, 5 * SIZE(A4)
        MUL     alpha1, a1, a1
        unop
        ADD     y2, a14, y2
        LD      a14, 6 * SIZE(A4)
        MUL     alpha1, a2, a2
        unop
        ADD     y3, a15, y3
        LD      a15, 7 * SIZE(A4)
        MUL     alpha1, a3, a3
        ldl     $31, (PREFETCHSIZE + 0) * SIZE(A2)

        /* y[0..3] are complete: store them while rows 4..7 advance. */
        ADD     y4, a0, y4
        ST      y0, 0 * SIZE(Y1)
        MUL     alpha2, a4, a4
        LD      a0, 8 * SIZE(A1)
        ADD     y5, a1, y5
        ST      y1, 1 * SIZE(Y1)
        MUL     alpha2, a5, a5
        LD      a1, 9 * SIZE(A1)
        ADD     y6, a2, y6
        ST      y2, 2 * SIZE(Y1)
        MUL     alpha2, a6, a6
        LD      a2, 10 * SIZE(A1)
        ADD     y7, a3, y7
        ST      y3, 3 * SIZE(Y1)
        MUL     alpha2, a7, a7
        LD      a3, 11 * SIZE(A1)

        ADD     y4, a4, y4
        LD      a4, 8 * SIZE(A2)
        MUL     alpha3, a8, a8
        LD      y0, 8 * SIZE(Y1)
        ADD     y5, a5, y5
        LD      a5, 9 * SIZE(A2)
        MUL     alpha3, a9, a9
        LD      y1, 9 * SIZE(Y1)
        ADD     y6, a6, y6
        LD      a6, 10 * SIZE(A2)
        MUL     alpha3, a10, a10
        LD      y2, 10 * SIZE(Y1)
        ADD     y7, a7, y7
        LD      a7, 11 * SIZE(A2)
        MUL     alpha3, a11, a11
        LD      y3, 11 * SIZE(Y1)

        /* Pointers advance by eight elements in the filler slots. */
        ADD     y4, a8, y4
        LD      a8, 8 * SIZE(A3)
        MUL     alpha4, a12, a12
        ldl     $31, (PREFETCHSIZE + 0) * SIZE(A3)
        ADD     y5, a9, y5
        LD      a9, 9 * SIZE(A3)
        MUL     alpha4, a13, a13
        lda     A1, 8 * SIZE(A1)
        ADD     y6, a10, y6
        LD      a10, 10 * SIZE(A3)
        MUL     alpha4, a14, a14
        lda     A2, 8 * SIZE(A2)
        ADD     y7, a11, y7
        LD      a11, 11 * SIZE(A3)
        MUL     alpha4, a15, a15
        lda     Y1, 8 * SIZE(Y1)

        ADD     y4, a12, y4
        LD      a12, 8 * SIZE(A4)
        MUL     alpha1, a0, a0
        unop
        ADD     y5, a13, y5
        LD      a13, 9 * SIZE(A4)
        MUL     alpha1, a1, a1
        lda     A3, 8 * SIZE(A3)
        ADD     y6, a14, y6
        LD      a14, 10 * SIZE(A4)
        MUL     alpha1, a2, a2
        ldl     $31, (PREFETCHSIZE + 0) * SIZE(A4)
        ADD     y7, a15, y7
        LD      a15, 11 * SIZE(A4)
        MUL     alpha1, a3, a3
        lda     A4, 8 * SIZE(A4)

        /* Y1 has moved on: the finished y[4..7] sit at -4..-1. */
        ADD     y0, a0, y0
        LD      a0, 4 * SIZE(A1)
        MUL     alpha2, a4, a4
        ST      y4, -4 * SIZE(Y1)
        ADD     y1, a1, y1
        LD      a1, 5 * SIZE(A1)
        MUL     alpha2, a5, a5
        ST      y5, -3 * SIZE(Y1)
        ADD     y2, a2, y2
        LD      a2, 6 * SIZE(A1)
        MUL     alpha2, a6, a6
        ST      y6, -2 * SIZE(Y1)
        ADD     y3, a3, y3
        LD      a3, 7 * SIZE(A1)
        MUL     alpha2, a7, a7
        ST      y7, -1 * SIZE(Y1)

        ADD     y0, a4, y0
        LD      a4, 4 * SIZE(A2)
        MUL     alpha3, a8, a8
        LD      y4, 4 * SIZE(Y1)
        ADD     y1, a5, y1
        LD      a5, 5 * SIZE(A2)
        MUL     alpha3, a9, a9
        LD      y5, 5 * SIZE(Y1)
        ADD     y2, a6, y2
        LD      a6, 6 * SIZE(A2)
        MUL     alpha3, a10, a10
        LD      y6, 6 * SIZE(Y1)
        ADD     y3, a7, y3
        LD      a7, 7 * SIZE(A2)
        MUL     alpha3, a11, a11
        LD      y7, 7 * SIZE(Y1)

        ADD     y0, a8, y0
        LD      a8, 4 * SIZE(A3)
        MUL     alpha4, a12, a12
        bgt     I, $L12
        .align 4

  /* Pipeline drain: finish the last 8-row group, no further loads      */
  /* beyond rows 4..7.                                                  */
  $L13:
        ADD     y1, a9, y1
        LD      a9, 5 * SIZE(A3)
        MUL     alpha4, a13, a13
        unop
        ADD     y2, a10, y2
        LD      a10, 6 * SIZE(A3)
        MUL     alpha4, a14, a14
        unop
        ADD     y3, a11, y3
        LD      a11, 7 * SIZE(A3)
        MUL     alpha4, a15, a15
        unop

        ADD     y0, a12, y0
        LD      a12, 4 * SIZE(A4)
        MUL     alpha1, a0, a0
        unop
        ADD     y1, a13, y1
        LD      a13, 5 * SIZE(A4)
        MUL     alpha1, a1, a1
        unop
        ADD     y2, a14, y2
        LD      a14, 6 * SIZE(A4)
        MUL     alpha1, a2, a2
        unop
        ADD     y3, a15, y3
        LD      a15, 7 * SIZE(A4)
        MUL     alpha1, a3, a3
        unop

        ST      y0, 0 * SIZE(Y1)
        ADD     y4, a0, y4
        unop
        MUL     alpha2, a4, a4
        ST      y1, 1 * SIZE(Y1)
        ADD     y5, a1, y5
        unop
        MUL     alpha2, a5, a5
        ST      y2, 2 * SIZE(Y1)
        ADD     y6, a2, y6
        unop
        MUL     alpha2, a6, a6
        ST      y3, 3 * SIZE(Y1)
        ADD     y7, a3, y7
        lda     Y1, 8 * SIZE(Y1)
        MUL     alpha2, a7, a7

        ADD     y4, a4, y4
        MUL     alpha3, a8, a8
        ADD     y5, a5, y5
        MUL     alpha3, a9, a9
        ADD     y6, a6, y6
        MUL     alpha3, a10, a10
        ADD     y7, a7, y7
        MUL     alpha3, a11, a11

        ADD     y4, a8, y4
        MUL     alpha4, a12, a12
        ADD     y5, a9, y5
        MUL     alpha4, a13, a13
        ADD     y6, a10, y6
        MUL     alpha4, a14, a14
        ADD     y7, a11, y7
        MUL     alpha4, a15, a15

        ADD     y4, a12, y4
        ADD     y5, a13, y5
        ADD     y6, a14, y6
        ADD     y7, a15, y7

        /* Store y[4..7] (Y1 already advanced) and step the columns. */
        ST      y4, -4 * SIZE(Y1)
        lda     A1, 8 * SIZE(A1)
        ST      y5, -3 * SIZE(Y1)
        lda     A2, 8 * SIZE(A2)
        ST      y6, -2 * SIZE(Y1)
        lda     A3, 8 * SIZE(A3)
        ST      y7, -1 * SIZE(Y1)
        lda     A4, 8 * SIZE(A4)
        .align 4
  /* ------------------------------------------------------------------ */
  /* Four columns, four-row tail: taken when bit 2 of M is set.         */
  /* y[r] += alpha1*A1[r] + alpha2*A2[r] + alpha3*A3[r] + alpha4*A4[r]  */
  /* for r = 0..3, then every pointer moves on by four elements.        */
  /* ------------------------------------------------------------------ */
  $L15:
        and     M, 4, I
        ble     I, $L16

        LD      y0, 0 * SIZE(Y1)
        LD      y1, 1 * SIZE(Y1)
        LD      y2, 2 * SIZE(Y1)
        LD      y3, 3 * SIZE(Y1)

        LD      a0, 0 * SIZE(A1)
        LD      a1, 1 * SIZE(A1)
        LD      a2, 2 * SIZE(A1)
        LD      a3, 3 * SIZE(A1)

        LD      a4, 0 * SIZE(A2)
        LD      a5, 1 * SIZE(A2)
        LD      a6, 2 * SIZE(A2)
        LD      a7, 3 * SIZE(A2)

        LD      a8, 0 * SIZE(A3)
        LD      a9, 1 * SIZE(A3)
        LD      a10, 2 * SIZE(A3)
        LD      a11, 3 * SIZE(A3)

        /* Column 1 products, overlapped with the column 4 loads. */
        MUL     alpha1, a0, a0
        LD      a12, 0 * SIZE(A4)
        MUL     alpha1, a1, a1
        LD      a13, 1 * SIZE(A4)
        MUL     alpha1, a2, a2
        LD      a14, 2 * SIZE(A4)
        MUL     alpha1, a3, a3
        LD      a15, 3 * SIZE(A4)

        /* Accumulate column k while multiplying column k + 1. */
        ADD     y0, a0, y0
        MUL     alpha2, a4, a4
        ADD     y1, a1, y1
        MUL     alpha2, a5, a5
        ADD     y2, a2, y2
        MUL     alpha2, a6, a6
        ADD     y3, a3, y3
        MUL     alpha2, a7, a7

        ADD     y0, a4, y0
        MUL     alpha3, a8, a8
        ADD     y1, a5, y1
        MUL     alpha3, a9, a9
        ADD     y2, a6, y2
        MUL     alpha3, a10, a10
        ADD     y3, a7, y3
        MUL     alpha3, a11, a11

        ADD     y0, a8, y0
        MUL     alpha4, a12, a12
        ADD     y1, a9, y1
        MUL     alpha4, a13, a13
        ADD     y2, a10, y2
        MUL     alpha4, a14, a14
        ADD     y3, a11, y3
        MUL     alpha4, a15, a15

        ADD     y0, a12, y0
        lda     Y1, 4 * SIZE(Y1)
        ADD     y1, a13, y1
        unop
        ADD     y2, a14, y2
        unop
        ADD     y3, a15, y3
        unop

        /* Y1 already advanced, hence the negative offsets. */
        ST      y0, -4 * SIZE(Y1)
        lda     A1, 4 * SIZE(A1)
        ST      y1, -3 * SIZE(Y1)
        lda     A2, 4 * SIZE(A2)
        ST      y2, -2 * SIZE(Y1)
        lda     A3, 4 * SIZE(A3)
        ST      y3, -1 * SIZE(Y1)
        lda     A4, 4 * SIZE(A4)
        .align 4
  /* ------------------------------------------------------------------ */
  /* Four columns, two-row tail: taken when bit 1 of M is set.          */
  /* ------------------------------------------------------------------ */
  $L16:
        and     M, 2, I
        ble     I, $L17

        LD      a0, 0 * SIZE(A1)
        LD      a1, 1 * SIZE(A1)
        LD      a2, 0 * SIZE(A2)
        LD      a3, 1 * SIZE(A2)

        LD      y0, 0 * SIZE(Y1)
        LD      y1, 1 * SIZE(Y1)

        /* Columns 3 and 4 are loaded while columns 1 and 2 are scaled. */
        LD      a4, 0 * SIZE(A3)
        MUL     alpha1, a0, a0
        LD      a5, 1 * SIZE(A3)
        MUL     alpha1, a1, a1
        LD      a6, 0 * SIZE(A4)
        MUL     alpha2, a2, a2
        LD      a7, 1 * SIZE(A4)
        MUL     alpha2, a3, a3

        ADD     y0, a0, y0
        MUL     alpha3, a4, a4
        ADD     y1, a1, y1
        MUL     alpha3, a5, a5
        ADD     y0, a2, y0
        MUL     alpha4, a6, a6
        ADD     y1, a3, y1
        MUL     alpha4, a7, a7

        /* Last two columns; column pointers step by two elements. */
        ADD     y0, a4, y0
        lda     A1, 2 * SIZE(A1)
        ADD     y1, a5, y1
        lda     A2, 2 * SIZE(A2)
        ADD     y0, a6, y0
        lda     A3, 2 * SIZE(A3)
        ADD     y1, a7, y1
        lda     A4, 2 * SIZE(A4)

        ST      y0, 0 * SIZE(Y1)
        unop
        ST      y1, 1 * SIZE(Y1)
        lda     Y1, 2 * SIZE(Y1)
        .align 4

  /* ------------------------------------------------------------------ */
  /* Four columns, final single row: skipped when M is even.            */
  /* ------------------------------------------------------------------ */
  $L17:
        blbc    M, $L18

        LD      y0, 0 * SIZE(Y1)
        LD      a0, 0 * SIZE(A1)
        LD      a1, 0 * SIZE(A2)
        LD      a2, 0 * SIZE(A3)
        LD      a3, 0 * SIZE(A4)

        MUL     alpha1, a0, a0
        MUL     alpha2, a1, a1
        MUL     alpha3, a2, a2
        MUL     alpha4, a3, a3

        ADD     y0, a0, y0
        ADD     y0, a1, y0
        ADD     y0, a2, y0
        ADD     y0, a3, y0

        ST      y0, 0 * SIZE(Y1)
        .align 4

  /* Close the four-column pass: next group of four columns, if any. */
  $L18:
        lda     J, -1(J)
        bgt     J, $L11
        .align 4
  /* ------------------------------------------------------------------ */
  /* Two leftover columns: taken when bit 1 of N is set.                */
  /* ------------------------------------------------------------------ */
  $L20:
        and     N, 2, J
        ble     J, $L30

        /* Two x entries, stepping X by INCX after each. */
        LD      alpha1, 0 * SIZE(X)
        addq    X, INCX, X
        LD      alpha2, 0 * SIZE(X)
        addq    X, INCX, X

        /* A1, A2 = the two columns; A ends up two columns further on.  */
        /* The alpha scaling is interleaved with the address updates.   */
        mov     A, A1
        MUL     alpha, alpha1, alpha1
        addq    A, LDA, A2
        MUL     alpha, alpha2, alpha2
        addq    A2, LDA, A
        mov     Y, Y1                           /* rewind the y cursor */

        /* I = number of 8-row groups; none -> straight to the tails. */
        sra     M, 3, I
        ble     I, $L25
  /* ------------------------------------------------------------------ */
  /* Two columns x eight rows, software pipelined.                      */
  /*                                                                    */
  /* Each pass adds alpha1*A1[r] + alpha2*A2[r] into y[r] for eight     */
  /* consecutive r.  Loads run ahead of the arithmetic that consumes    */
  /* them, so the instruction order is significant and is kept exactly  */
  /* as written; only the layout and comments are new.                  */
  /* Loads into $31 / $f31 discard their result (prefetch hints).       */
  /* ------------------------------------------------------------------ */

        /* Pipeline fill: rows 0..3 of both columns, y[0..7]. */
        LD      a0, 0 * SIZE(A1)
        LD      a1, 1 * SIZE(A1)
        LD      a2, 2 * SIZE(A1)
        LD      a3, 3 * SIZE(A1)

        LD      a4, 0 * SIZE(A2)
        LD      a5, 1 * SIZE(A2)
        LD      a6, 2 * SIZE(A2)
        LD      a7, 3 * SIZE(A2)

        LD      y0, 0 * SIZE(Y1)
        LD      y1, 1 * SIZE(Y1)
        LD      y2, 2 * SIZE(Y1)
        LD      y3, 3 * SIZE(Y1)

        MUL     alpha1, a0, a0
        LD      y4, 4 * SIZE(Y1)
        MUL     alpha1, a1, a1
        LD      y5, 5 * SIZE(Y1)
        MUL     alpha1, a2, a2
        LD      y6, 6 * SIZE(Y1)
        MUL     alpha1, a3, a3
        LD      y7, 7 * SIZE(Y1)

        ADD     y0, a0, y0
        LD      a0, 4 * SIZE(A1)
        MUL     alpha2, a4, a4
        ADD     y1, a1, y1
        LD      a1, 5 * SIZE(A1)
        MUL     alpha2, a5, a5
        ADD     y2, a2, y2
        LD      a2, 6 * SIZE(A1)
        MUL     alpha2, a6, a6
        ADD     y3, a3, y3
        LD      a3, 7 * SIZE(A1)
        MUL     alpha2, a7, a7

        ADD     y0, a4, y0
        LD      a4, 4 * SIZE(A2)
        MUL     alpha1, a0, a0
        ADD     y1, a5, y1
        LD      a5, 5 * SIZE(A2)
        MUL     alpha1, a1, a1
        ADD     y2, a6, y2
        LD      a6, 6 * SIZE(A2)
        MUL     alpha1, a2, a2
        ADD     y3, a7, y3
        LD      a7, 7 * SIZE(A2)
        MUL     alpha1, a3, a3

        lda     I, -1(I)
        ble     I, $L23                         /* single group: drain only */
        .align 4

  /* Steady state: one 8-row group per trip. */
  $L22:
        ldl     $31, (PREFETCHSIZE + 0) * SIZE(A1)
        lda     I, -1(I)
        ldl     $31, (PREFETCHSIZE + 0) * SIZE(A2)
        lda     A2, 8 * SIZE(A2)                /* A2 steps first: its offsets below restart at 0 */

        /* y[0..3] are complete: store them while rows 4..7 advance. */
        ADD     y4, a0, y4
        ST      y0, 0 * SIZE(Y1)
        MUL     alpha2, a4, a4
        LD      a0, 8 * SIZE(A1)
        ADD     y5, a1, y5
        ST      y1, 1 * SIZE(Y1)
        MUL     alpha2, a5, a5
        LD      a1, 9 * SIZE(A1)
        ADD     y6, a2, y6
        ST      y2, 2 * SIZE(Y1)
        MUL     alpha2, a6, a6
        LD      a2, 10 * SIZE(A1)
        ADD     y7, a3, y7
        ST      y3, 3 * SIZE(Y1)
        MUL     alpha2, a7, a7
        LD      a3, 11 * SIZE(A1)

        ADD     y4, a4, y4
        LD      a4, 0 * SIZE(A2)
        MUL     alpha1, a0, a0
        LD      y0, 8 * SIZE(Y1)
        ADD     y5, a5, y5
        LD      a5, 1 * SIZE(A2)
        MUL     alpha1, a1, a1
        LD      y1, 9 * SIZE(Y1)
        ADD     y6, a6, y6
        LD      a6, 2 * SIZE(A2)
        MUL     alpha1, a2, a2
        LD      y2, 10 * SIZE(Y1)
        ADD     y7, a7, y7
        LD      a7, 3 * SIZE(A2)
        MUL     alpha1, a3, a3
        LD      y3, 11 * SIZE(Y1)

        /* y[4..7] are complete: store them while the next rows 0..3    */
        /* advance.                                                     */
        ADD     y0, a0, y0
        ST      y4, 4 * SIZE(Y1)
        MUL     alpha2, a4, a4
        LD      a0, 12 * SIZE(A1)
        ADD     y1, a1, y1
        ST      y5, 5 * SIZE(Y1)
        MUL     alpha2, a5, a5
        LD      a1, 13 * SIZE(A1)
        ADD     y2, a2, y2
        ST      y6, 6 * SIZE(Y1)
        MUL     alpha2, a6, a6
        LD      a2, 14 * SIZE(A1)
        ADD     y3, a3, y3
        ST      y7, 7 * SIZE(Y1)
        MUL     alpha2, a7, a7
        LD      a3, 15 * SIZE(A1)

        ADD     y0, a4, y0
        LD      a4, 4 * SIZE(A2)
        MUL     alpha1, a0, a0
        LD      y4, 12 * SIZE(Y1)
        ADD     y1, a5, y1
        LD      a5, 5 * SIZE(A2)
        MUL     alpha1, a1, a1
        LD      y5, 13 * SIZE(Y1)
        ADD     y2, a6, y2
        LD      a6, 6 * SIZE(A2)
        MUL     alpha1, a2, a2
        LD      y6, 14 * SIZE(Y1)
        ADD     y3, a7, y3
        LD      a7, 7 * SIZE(A2)
        MUL     alpha1, a3, a3
        LD      y7, 15 * SIZE(Y1)

        lds     $f31, (PREFETCHSIZE + 0) * SIZE(Y1)
        lda     A1, 8 * SIZE(A1)
        lda     Y1, 8 * SIZE(Y1)
        bgt     I, $L22
        .align 4

  /* Pipeline drain: finish the last 8-row group. */
  $L23:
        ADD     y4, a0, y4
        ST      y0, 0 * SIZE(Y1)
        MUL     alpha2, a4, a4
        unop
        ADD     y5, a1, y5
        ST      y1, 1 * SIZE(Y1)
        MUL     alpha2, a5, a5
        unop
        ADD     y6, a2, y6
        ST      y2, 2 * SIZE(Y1)
        MUL     alpha2, a6, a6
        unop
        ADD     y7, a3, y7
        ST      y3, 3 * SIZE(Y1)
        MUL     alpha2, a7, a7
        unop

        ADD     y4, a4, y4
        ADD     y5, a5, y5
        ADD     y6, a6, y6
        ADD     y7, a7, y7

        ST      y4, 4 * SIZE(Y1)
        lda     A1, 8 * SIZE(A1)
        ST      y5, 5 * SIZE(Y1)
        lda     A2, 8 * SIZE(A2)
        ST      y6, 6 * SIZE(Y1)
        unop
        ST      y7, 7 * SIZE(Y1)
        lda     Y1, 8 * SIZE(Y1)
        .align 4
  /* ------------------------------------------------------------------ */
  /* Two columns, four-row tail: taken when bit 2 of M is set.          */
  /* y[r] += alpha1*A1[r] + alpha2*A2[r] for r = 0..3, then Y1, A1 and  */
  /* A2 move on by four elements.                                       */
  /*                                                                    */
  /* Only A1 and A2 are live on the two-column path.  The original also */
  /* bumped A3 and A4 here, pointers this path never sets up or reads;  */
  /* those two slots now hold "unop" so the slot layout is unchanged.   */
  /* ------------------------------------------------------------------ */
  $L25:
        and     M, 4, I
        ble     I, $L26

        LD      y0, 0 * SIZE(Y1)
        LD      y1, 1 * SIZE(Y1)
        LD      y2, 2 * SIZE(Y1)
        LD      y3, 3 * SIZE(Y1)

        LD      a0, 0 * SIZE(A1)
        LD      a1, 1 * SIZE(A1)
        LD      a2, 2 * SIZE(A1)
        LD      a3, 3 * SIZE(A1)

        /* Column 1 products, overlapped with the column 2 loads. */
        MUL     alpha1, a0, a0
        LD      a4, 0 * SIZE(A2)
        MUL     alpha1, a1, a1
        LD      a5, 1 * SIZE(A2)
        MUL     alpha1, a2, a2
        LD      a6, 2 * SIZE(A2)
        MUL     alpha1, a3, a3
        LD      a7, 3 * SIZE(A2)

        ADD     y0, a0, y0
        MUL     alpha2, a4, a4
        ADD     y1, a1, y1
        MUL     alpha2, a5, a5
        ADD     y2, a2, y2
        MUL     alpha2, a6, a6
        ADD     y3, a3, y3
        MUL     alpha2, a7, a7

        ADD     y0, a4, y0
        lda     Y1, 4 * SIZE(Y1)
        ADD     y1, a5, y1
        unop
        ADD     y2, a6, y2
        unop
        ADD     y3, a7, y3
        unop

        /* Y1 already advanced, hence the negative offsets. */
        ST      y0, -4 * SIZE(Y1)
        lda     A1, 4 * SIZE(A1)
        ST      y1, -3 * SIZE(Y1)
        lda     A2, 4 * SIZE(A2)
        ST      y2, -2 * SIZE(Y1)
        unop                                    /* was: lda A3, 4 * SIZE(A3) */
        ST      y3, -1 * SIZE(Y1)
        unop                                    /* was: lda A4, 4 * SIZE(A4) */
        .align 4
  /* ------------------------------------------------------------------ */
  /* Two columns, two-row tail: taken when bit 1 of M is set.           */
  /* ------------------------------------------------------------------ */
  $L26:
        and     M, 2, I
        ble     I, $L27

        LD      a0, 0 * SIZE(A1)
        LD      a1, 1 * SIZE(A1)
        LD      a2, 0 * SIZE(A2)
        LD      a3, 1 * SIZE(A2)

        LD      y0, 0 * SIZE(Y1)
        LD      y1, 1 * SIZE(Y1)

        MUL     alpha1, a0, a0
        MUL     alpha1, a1, a1
        MUL     alpha2, a2, a2
        MUL     alpha2, a3, a3

        /* Sum both columns; A1 and A2 step by two elements. */
        ADD     y0, a0, y0
        lda     A1, 2 * SIZE(A1)
        ADD     y1, a1, y1
        lda     A2, 2 * SIZE(A2)
        ADD     y0, a2, y0
        unop
        ADD     y1, a3, y1
        unop

        ST      y0, 0 * SIZE(Y1)
        unop
        ST      y1, 1 * SIZE(Y1)
        lda     Y1, 2 * SIZE(Y1)
        .align 4

  /* ------------------------------------------------------------------ */
  /* Two columns, final single row: skipped when M is even.             */
  /* ------------------------------------------------------------------ */
  $L27:
        blbc    M, $L30

        LD      y0, 0 * SIZE(Y1)
        LD      a0, 0 * SIZE(A1)
        LD      a1, 0 * SIZE(A2)

        MUL     alpha1, a0, a0
        MUL     alpha2, a1, a1

        ADD     y0, a0, y0
        ADD     y0, a1, y0

        ST      y0, 0 * SIZE(Y1)
        .align 4
  /* ------------------------------------------------------------------ */
  /* One leftover column: skipped when N is even.                       */
  /* ------------------------------------------------------------------ */
  $L30:
        blbc    N, $L990

        LD      alpha1, 0 * SIZE(X)             /* the last x entry */
        mov     A, A1
        MUL     alpha, alpha1, alpha1           /* alpha1 = alpha * x entry */
        mov     Y, Y1                           /* rewind the y cursor */

        /* I = number of 8-row groups; none -> straight to the tails. */
        sra     M, 3, I
        ble     I, $L35
  /* ------------------------------------------------------------------ */
  /* One column x eight rows, software pipelined.                       */
  /*                                                                    */
  /* Each pass adds alpha1*A1[r] into y[r] for eight consecutive r.     */
  /* Loads run ahead of the arithmetic that consumes them, so the       */
  /* instruction order is significant and is kept exactly as written;   */
  /* only the layout and comments are new.                              */
  /* Loads into $31 / $f31 discard their result (prefetch hints).       */
  /* ------------------------------------------------------------------ */

        /* Pipeline fill: A1[0..7], y[0..7], products for rows 0..3. */
        LD      a0, 0 * SIZE(A1)
        LD      a1, 1 * SIZE(A1)
        LD      a2, 2 * SIZE(A1)
        LD      a3, 3 * SIZE(A1)

        LD      a4, 4 * SIZE(A1)
        LD      a5, 5 * SIZE(A1)
        LD      a6, 6 * SIZE(A1)
        LD      a7, 7 * SIZE(A1)

        LD      y0, 0 * SIZE(Y1)
        LD      y1, 1 * SIZE(Y1)
        LD      y2, 2 * SIZE(Y1)
        LD      y3, 3 * SIZE(Y1)

        LD      y4, 4 * SIZE(Y1)
        LD      y5, 5 * SIZE(Y1)
        LD      y6, 6 * SIZE(Y1)
        LD      y7, 7 * SIZE(Y1)

        MUL     alpha1, a0, a0
        MUL     alpha1, a1, a1
        MUL     alpha1, a2, a2
        MUL     alpha1, a3, a3

        lda     I, -1(I)
        ble     I, $L33                         /* single group: drain only */
        .align 4

  /* Steady state: one 8-row group per trip. */
  $L32:
        /* Rows 0..3: finish the sums, fetch the next group's A1[0..3]. */
        ADD     y0, a0, y0
        LD      y4, 4 * SIZE(Y1)
        MUL     alpha1, a4, a4
        LD      a0, 8 * SIZE(A1)
        ADD     y1, a1, y1
        LD      y5, 5 * SIZE(Y1)
        MUL     alpha1, a5, a5
        LD      a1, 9 * SIZE(A1)
        ADD     y2, a2, y2
        LD      y6, 6 * SIZE(Y1)
        MUL     alpha1, a6, a6
        LD      a2, 10 * SIZE(A1)
        ADD     y3, a3, y3
        LD      y7, 7 * SIZE(Y1)
        MUL     alpha1, a7, a7
        LD      a3, 11 * SIZE(A1)

        ST      y0, 0 * SIZE(Y1)
        ST      y1, 1 * SIZE(Y1)
        ST      y2, 2 * SIZE(Y1)
        ST      y3, 3 * SIZE(Y1)

        /* Rows 4..7: finish the sums, fetch the next group's A1[4..7]. */
        ADD     y4, a4, y4
        LD      y0, 8 * SIZE(Y1)
        MUL     alpha1, a0, a0
        LD      a4, 12 * SIZE(A1)
        ADD     y5, a5, y5
        LD      y1, 9 * SIZE(Y1)
        MUL     alpha1, a1, a1
        LD      a5, 13 * SIZE(A1)
        ADD     y6, a6, y6
        LD      y2, 10 * SIZE(Y1)
        MUL     alpha1, a2, a2
        LD      a6, 14 * SIZE(A1)
        ADD     y7, a7, y7
        LD      y3, 11 * SIZE(Y1)
        MUL     alpha1, a3, a3
        LD      a7, 15 * SIZE(A1)

        ST      y4, 4 * SIZE(Y1)
        lda     I, -1(I)
        ST      y5, 5 * SIZE(Y1)
        lda     A1, 8 * SIZE(A1)
        ST      y6, 6 * SIZE(Y1)
        ldl     $31, (PREFETCHSIZE + 0) * SIZE(A1)
        ST      y7, 7 * SIZE(Y1)
        lds     $f31, (PREFETCHSIZE + 0) * SIZE(Y1)

        lda     Y1, 8 * SIZE(Y1)
        bgt     I, $L32
        .align 4

  /* Pipeline drain: finish the last 8-row group. */
  $L33:
        ADD     y0, a0, y0
        LD      y4, 4 * SIZE(Y1)
        MUL     alpha1, a4, a4
        unop
        ADD     y1, a1, y1
        LD      y5, 5 * SIZE(Y1)
        MUL     alpha1, a5, a5
        unop
        ADD     y2, a2, y2
        LD      y6, 6 * SIZE(Y1)
        MUL     alpha1, a6, a6
        unop
        ADD     y3, a3, y3
        LD      y7, 7 * SIZE(Y1)
        MUL     alpha1, a7, a7
        unop

        ADD     y4, a4, y4
        ST      y0, 0 * SIZE(Y1)
        ADD     y5, a5, y5
        ST      y1, 1 * SIZE(Y1)
        ADD     y6, a6, y6
        ST      y2, 2 * SIZE(Y1)
        ADD     y7, a7, y7
        ST      y3, 3 * SIZE(Y1)

        ST      y4, 4 * SIZE(Y1)
        unop
        ST      y5, 5 * SIZE(Y1)
        unop
        ST      y6, 6 * SIZE(Y1)
        lda     A1, 8 * SIZE(A1)
        ST      y7, 7 * SIZE(Y1)
        lda     Y1, 8 * SIZE(Y1)
        .align 4
  /* ------------------------------------------------------------------ */
  /* One column, four-row tail: taken when bit 2 of M is set.           */
  /* y[r] += alpha1*A1[r] for r = 0..3, then Y1 and A1 move on by four  */
  /* elements.                                                          */
  /*                                                                    */
  /* Only A1 is live on the one-column path.  The original also bumped  */
  /* A2 here, a pointer this path never reads; that slot now holds      */
  /* "unop" so the slot layout is unchanged.                            */
  /* ------------------------------------------------------------------ */
  $L35:
        and     M, 4, I
        ble     I, $L36

        LD      a0, 0 * SIZE(A1)
        LD      a1, 1 * SIZE(A1)
        LD      a2, 2 * SIZE(A1)
        LD      a3, 3 * SIZE(A1)

        /* Scale the column while the y values are being fetched. */
        MUL     alpha1, a0, a0
        LD      y0, 0 * SIZE(Y1)
        MUL     alpha1, a1, a1
        LD      y1, 1 * SIZE(Y1)
        MUL     alpha1, a2, a2
        LD      y2, 2 * SIZE(Y1)
        MUL     alpha1, a3, a3
        LD      y3, 3 * SIZE(Y1)

        ADD     y0, a0, y0
        ADD     y1, a1, y1
        ADD     y2, a2, y2
        ADD     y3, a3, y3

        ST      y0, 0 * SIZE(Y1)
        lda     A1, 4 * SIZE(A1)
        ST      y1, 1 * SIZE(Y1)
        unop                                    /* was: lda A2, 4 * SIZE(A2) */
        ST      y2, 2 * SIZE(Y1)
        unop
        ST      y3, 3 * SIZE(Y1)
        lda     Y1, 4 * SIZE(Y1)
        .align 4
  /* ------------------------------------------------------------------ */
  /* One column, two-row tail: taken when bit 1 of M is set.            */
  /* ------------------------------------------------------------------ */
  $L36:
        and     M, 2, I
        ble     I, $L37

        LD      a0, 0 * SIZE(A1)
        LD      a1, 1 * SIZE(A1)

        LD      y0, 0 * SIZE(Y1)
        MUL     alpha1, a0, a0
        LD      y1, 1 * SIZE(Y1)
        MUL     alpha1, a1, a1

        ADD     y0, a0, y0
        ADD     y1, a1, y1

        ST      y0, 0 * SIZE(Y1)
        lda     A1, 2 * SIZE(A1)
        ST      y1, 1 * SIZE(Y1)
        lda     Y1, 2 * SIZE(Y1)
        .align 4

  /* ------------------------------------------------------------------ */
  /* One column, final single row: skipped when M is even.              */
  /* ------------------------------------------------------------------ */
  $L37:
        blbc    M, $L990

        LD      y0, 0 * SIZE(Y1)
        LD      a0, 0 * SIZE(A1)

        MUL     alpha1, a0, a0
        ADD     y0, a0, y0

        ST      y0, 0 * SIZE(Y1)
        .align 4
  /* ------------------------------------------------------------------ */
  /* Write-back for the strided case.                                   */
  /*                                                                    */
  /* When INCY == SIZE the sums were formed directly in y and there is  */
  /* nothing left to do.  Otherwise (see the swap in the entry code):   */
  /*   BUFFER  walks the caller's y with stride INCY  (read cursor)     */
  /*   Y1      walks the caller's y with stride INCY  (write cursor)    */
  /*   Y       walks the contiguous scratch results                     */
  /* and each element becomes  y_caller + scratch.                      */
  /* ------------------------------------------------------------------ */
  $L990:
        cmpeq   INCY, SIZE, $0
        bne     $0, $L999

        mov     BUFFER, Y1                      /* both cursors start at caller's y */

        sra     M, 3, I                         /* eight elements per pass */
        ble     I, $L995
        .align 4

  $L992:
        /* Gather elements 0..3 of the caller's y. */
        LD      a0, 0 * SIZE(BUFFER)
        addq    BUFFER, INCY, BUFFER
        LD      a1, 0 * SIZE(BUFFER)
        addq    BUFFER, INCY, BUFFER
        LD      a2, 0 * SIZE(BUFFER)
        addq    BUFFER, INCY, BUFFER
        LD      a3, 0 * SIZE(BUFFER)
        addq    BUFFER, INCY, BUFFER

        LD      y0, 0 * SIZE(Y)
        LD      y1, 1 * SIZE(Y)
        LD      y2, 2 * SIZE(Y)
        LD      y3, 3 * SIZE(Y)

        /* Gather elements 4..7 of the caller's y. */
        LD      a4, 0 * SIZE(BUFFER)
        addq    BUFFER, INCY, BUFFER
        LD      a5, 0 * SIZE(BUFFER)
        addq    BUFFER, INCY, BUFFER
        LD      a6, 0 * SIZE(BUFFER)
        addq    BUFFER, INCY, BUFFER
        LD      a7, 0 * SIZE(BUFFER)
        addq    BUFFER, INCY, BUFFER

        LD      y4, 4 * SIZE(Y)
        LD      y5, 5 * SIZE(Y)
        LD      y6, 6 * SIZE(Y)
        LD      y7, 7 * SIZE(Y)

        ADD     a0, y0, a0
        ADD     a1, y1, a1
        ADD     a2, y2, a2
        ADD     a3, y3, a3
        ADD     a4, y4, a4
        ADD     a5, y5, a5
        ADD     a6, y6, a6
        ADD     a7, y7, a7

        /* Scatter the eight sums back with stride INCY. */
        ST      a0, 0 * SIZE(Y1)
        addq    Y1, INCY, Y1
        ST      a1, 0 * SIZE(Y1)
        addq    Y1, INCY, Y1
        ST      a2, 0 * SIZE(Y1)
        addq    Y1, INCY, Y1
        ST      a3, 0 * SIZE(Y1)
        addq    Y1, INCY, Y1
        ST      a4, 0 * SIZE(Y1)
        addq    Y1, INCY, Y1
        ST      a5, 0 * SIZE(Y1)
        addq    Y1, INCY, Y1
        ST      a6, 0 * SIZE(Y1)
        addq    Y1, INCY, Y1
        ST      a7, 0 * SIZE(Y1)
        addq    Y1, INCY, Y1

        lda     I, -1(I)
        lda     Y, 8 * SIZE(Y)
        bgt     I, $L992
        .align 4

  /* Remaining M mod 8 elements, one at a time. */
  $L995:
        and     M, 7, I
        ble     I, $L999
        .align 4

  $L996:
        LD      a0, 0 * SIZE(BUFFER)
        addq    BUFFER, INCY, BUFFER

        LD      y0, 0 * SIZE(Y)
        lda     Y, 1 * SIZE(Y)

        ADD     a0, y0, a0

        ST      a0, 0 * SIZE(Y1)
        addq    Y1, INCY, Y1

        lda     I, -1(I)
        bgt     I, $L996
        .align 4
  /* ------------------------------------------------------------------ */
  /* Common exit: reload $f2..$f9 from the slots the prologue filled,   */
  /* release the STACKSIZE-byte frame and return.                       */
  /* EPILOGUE comes from common.h.                                      */
  /* ------------------------------------------------------------------ */
  $L999:
        ldt     $f2, 0($sp)
        ldt     $f3, 8($sp)
        ldt     $f4, 16($sp)
        ldt     $f5, 24($sp)
        ldt     $f6, 32($sp)
        ldt     $f7, 40($sp)
        ldt     $f8, 48($sp)
        ldt     $f9, 56($sp)

        lda     $sp, STACKSIZE($sp)             /* close the frame */
        ret

        EPILOGUE