You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

gemv_n.S 22 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Size in bytes of the stack frame reserved at entry. The entry code */
  /* stores eight FP registers ($f2-$f9) in it at offsets 0..56. */
  40. #define STACKSIZE 64
  /* Look-ahead distance, in elements, of the ldl/lds prefetch hints issued */
  /* inside the unrolled loops. */
  41. #define PREFETCHSIZE 32
  /* Integer registers: problem sizes, operand pointers and strides. */
  /* M counts the y elements updated per column; N counts the columns of A */
  /* (one x element is consumed per column). */
  42. #define M $16
  43. #define N $17
  /* A points at the next unprocessed column; LDA is the distance between */
  /* consecutive columns (added directly to A, so it is a byte distance */
  /* once the entry code has passed it through SXADDQ). */
  44. #define A $20
  45. #define LDA $21
  /* X/INCX and Y/INCY/BUFFER are read from the caller's stack at entry. */
  46. #define X $18
  47. #define INCX $19
  48. #define Y $22
  49. #define INCY $23
  50. #define BUFFER $24
  /* I is the row-loop counter, J the column-group counter. */
  51. #define I $25
  52. #define J $27
  /* Y1 walks the y vector; A1..A4 walk up to four adjacent columns of A. */
  53. #define Y1 $4
  54. #define A1 $5
  55. #define A2 $6
  56. #define A3 $7
  57. #define A4 $8
  /* alpha is the scalar as received in $f19; alpha1..alpha4 hold */
  /* alpha * x[j] for the columns of the group being processed. */
  58. #define alpha $f19
  59. #define alpha1 $f0
  60. #define alpha2 $f1
  61. #define alpha3 $f10
  62. #define alpha4 $f11
  /* y0..y7 hold eight consecutive y elements while they are accumulated. */
  63. #define y0 $f12
  64. #define y1 $f13
  65. #define y2 $f14
  66. #define y3 $f15
  67. #define y4 $f16
  68. #define y5 $f17
  69. #define y6 $f18
  70. #define y7 $f21
  /* a0..a15 hold elements of A and, after MUL, their scaled products. */
  71. #define a0 $f22
  72. #define a1 $f23
  73. #define a2 $f24
  74. #define a3 $f25
  75. #define a4 $f26
  76. #define a5 $f27
  77. #define a6 $f28
  78. #define a7 $f29
  /* a8..a15 live in $f2-$f9, the registers the entry code saves on the */
  /* stack and the exit code restores. */
  79. #define a8 $f2
  80. #define a9 $f3
  81. #define a10 $f4
  82. #define a11 $f5
  83. #define a12 $f6
  84. #define a13 $f7
  85. #define a14 $f8
  86. #define a15 $f9
  /* Kernel entry. The routine accumulates alpha * A * x into y, taking the */
  /* columns of A four at a time, then two, then one. */
  /* PROLOGUE, PROFCODE, SXADDQ, SIZE, LD, ST, MUL and ADD are not defined */
  /* in this file; presumably they come from common.h -- confirm there. */
  87. PROLOGUE
  /* Reserve the frame, then fetch the arguments that were passed on the */
  /* stack; they sit just above the new frame, hence the STACKSIZE bias. */
  88. lda $sp, -STACKSIZE($sp)
  89. ldq X, 0 + STACKSIZE($sp)
  90. ldq INCX, 8 + STACKSIZE($sp)
  91. ldq Y, 16 + STACKSIZE($sp)
  92. ldq INCY, 24 + STACKSIZE($sp)
  93. ldq BUFFER, 32 + STACKSIZE($sp)
  /* Save $f2-$f9 (used below as a8..a15); they are reloaded at $L999. */
  94. stt $f2, 0($sp)
  95. stt $f3, 8($sp)
  96. stt $f4, 16($sp)
  97. stt $f5, 24($sp)
  98. stt $f6, 32($sp)
  99. stt $f7, 40($sp)
  100. stt $f8, 48($sp)
  101. stt $f9, 56($sp)
  102. PROFCODE
  /* Leave through the common exit when M <= 0 or N <= 0. The stride */
  /* conversions are interleaved with the two comparisons. */
  /* NOTE(review): SXADDQ with a zero addend presumably turns an element */
  /* stride into a byte stride -- confirm against common.h. */
  103. cmple M, 0, $0
  104. SXADDQ INCX, 0, INCX
  105. cmple N, 0, $1
  106. SXADDQ INCY, 0, INCY
  107. or $0, $1, $0
  108. bne $0, $L999
  109. SXADDQ LDA, 0, LDA
  /* When the y stride equals SIZE the results go straight into y and the */
  /* buffer set-up that follows is skipped. */
  110. cmpeq INCY, SIZE, $0
  111. bne $0, $L10
  /* Strided-y set-up (reached only when INCY != SIZE). Y and BUFFER are */
  /* exchanged so the compute loops accumulate into the contiguous work */
  /* buffer while BUFFER remembers the caller's y for the write-back at */
  /* $L990. Y1 keeps pointing at the work buffer for the zero fill below. */
  112. mov BUFFER, Y1
  113. mov Y, BUFFER
  114. mov Y1, Y
  /* Clear M elements of the work buffer, eight per iteration. $f31 is the */
  /* Alpha FP register that always reads as zero. */
  115. sra M, 3, I
  116. ble I, $L05
  117. .align 4
  118. $L02:
  119. ST $f31, 0 * SIZE(Y1)
  120. ST $f31, 1 * SIZE(Y1)
  121. ST $f31, 2 * SIZE(Y1)
  122. ST $f31, 3 * SIZE(Y1)
  123. ST $f31, 4 * SIZE(Y1)
  124. ST $f31, 5 * SIZE(Y1)
  125. ST $f31, 6 * SIZE(Y1)
  126. ST $f31, 7 * SIZE(Y1)
  127. lda Y1, 8 * SIZE(Y1)
  128. lda I, -1(I)
  129. bgt I, $L02
  130. .align 4
  131. $L05:
  /* Clear the remaining M mod 8 elements one at a time. */
  132. and M, 7, I
  133. ble I, $L10
  134. .align 4
  135. $L06:
  136. ST $f31, 0 * SIZE(Y1)
  137. addq Y1, SIZE, Y1
  138. lda I, -1(I)
  139. bgt I, $L06
  140. .align 4
  /* Four-column pass: J = N / 4 groups. Within a group the rows are */
  /* handled eight at a time by a software-pipelined loop: preload, */
  /* steady state at $L12, drain at $L13. */
  141. $L10:
  142. sra N, 2, J
  143. ble J, $L20
  144. .align 4
  145. $L11:
  /* Fetch four consecutive x elements (stride INCX) and scale each by */
  /* alpha, giving the per-column multipliers alpha1..alpha4. */
  146. LD alpha1, 0 * SIZE(X)
  147. addq X, INCX, X
  148. LD alpha2, 0 * SIZE(X)
  149. addq X, INCX, X
  150. LD alpha3, 0 * SIZE(X)
  151. addq X, INCX, X
  152. LD alpha4, 0 * SIZE(X)
  153. addq X, INCX, X
  154. MUL alpha, alpha1, alpha1
  155. MUL alpha, alpha2, alpha2
  156. MUL alpha, alpha3, alpha3
  157. MUL alpha, alpha4, alpha4
  /* A1..A4 address the four columns of this group; A itself moves on by */
  /* 4 * LDA so it is ready for the next group. */
  158. mov A, A1
  159. addq A, LDA, A2
  160. addq A2, LDA, A3
  161. addq A3, LDA, A4
  162. s4addq LDA, A, A
  163. mov Y, Y1
  /* A load into $31 discards its result; it serves as a prefetch hint, */
  /* here for x. */
  164. ldl $31, 4 * SIZE(X)
  165. sra M, 3, I
  166. ble I, $L15
  /* Pipeline preload: the first four rows of every column and the first */
  /* eight y elements, with the first multiplies already started. */
  167. LD a0, 0 * SIZE(A1)
  168. LD a1, 1 * SIZE(A1)
  169. LD a2, 2 * SIZE(A1)
  170. LD a3, 3 * SIZE(A1)
  171. LD a4, 0 * SIZE(A2)
  172. LD a5, 1 * SIZE(A2)
  173. LD a6, 2 * SIZE(A2)
  174. LD a7, 3 * SIZE(A2)
  175. LD y0, 0 * SIZE(Y1)
  176. LD y1, 1 * SIZE(Y1)
  177. LD y2, 2 * SIZE(Y1)
  178. LD y3, 3 * SIZE(Y1)
  179. LD a8, 0 * SIZE(A3)
  180. LD a9, 1 * SIZE(A3)
  181. LD a10, 2 * SIZE(A3)
  182. LD a11, 3 * SIZE(A3)
  183. LD y4, 4 * SIZE(Y1)
  184. LD y5, 5 * SIZE(Y1)
  185. LD y6, 6 * SIZE(Y1)
  186. LD y7, 7 * SIZE(Y1)
  187. MUL alpha1, a0, a0
  188. LD a12, 0 * SIZE(A4)
  189. MUL alpha1, a1, a1
  190. LD a13, 1 * SIZE(A4)
  191. MUL alpha1, a2, a2
  192. LD a14, 2 * SIZE(A4)
  193. MUL alpha1, a3, a3
  194. LD a15, 3 * SIZE(A4)
  195. ADD y0, a0, y0
  196. LD a0, 4 * SIZE(A1)
  197. MUL alpha2, a4, a4
  198. unop
  199. ADD y1, a1, y1
  200. LD a1, 5 * SIZE(A1)
  201. MUL alpha2, a5, a5
  202. unop
  203. ADD y2, a2, y2
  204. LD a2, 6 * SIZE(A1)
  205. MUL alpha2, a6, a6
  206. unop
  207. ADD y3, a3, y3
  208. LD a3, 7 * SIZE(A1)
  209. MUL alpha2, a7, a7
  210. unop
  211. ADD y0, a4, y0
  212. LD a4, 4 * SIZE(A2)
  213. MUL alpha3, a8, a8
  214. unop
  215. ADD y1, a5, y1
  216. LD a5, 5 * SIZE(A2)
  217. MUL alpha3, a9, a9
  /* I is decremented here, so a single 8-row block goes straight to the */
  /* drain code at $L13 without entering $L12. */
  218. lda I, -1(I)
  219. ADD y2, a6, y2
  220. LD a6, 6 * SIZE(A2)
  221. MUL alpha3, a10, a10
  222. unop
  223. ADD y3, a7, y3
  224. LD a7, 7 * SIZE(A2)
  225. MUL alpha3, a11, a11
  226. unop
  227. ADD y0, a8, y0
  228. LD a8, 4 * SIZE(A3)
  229. MUL alpha4, a12, a12
  230. ble I, $L13
  231. .align 4
  232. $L12:
  /* Steady state: finish rows 0-3 of the current block (y0..y3) while */
  /* loading rows 4-7; the ldl $31 / lds $f31 loads are prefetch hints */
  /* PREFETCHSIZE elements ahead. */
  233. ADD y1, a9, y1
  234. LD a9, 5 * SIZE(A3)
  235. MUL alpha4, a13, a13
  236. ldl $31, (PREFETCHSIZE + 0) * SIZE(A1)
  237. ADD y2, a10, y2
  238. LD a10, 6 * SIZE(A3)
  239. MUL alpha4, a14, a14
  240. unop
  241. ADD y3, a11, y3
  242. LD a11, 7 * SIZE(A3)
  243. MUL alpha4, a15, a15
  244. lda I, -1(I)
  245. ADD y0, a12, y0
  246. LD a12, 4 * SIZE(A4)
  247. MUL alpha1, a0, a0
  248. lds $f31, (PREFETCHSIZE + 0) * SIZE(Y1)
  249. ADD y1, a13, y1
  250. LD a13, 5 * SIZE(A4)
  251. MUL alpha1, a1, a1
  252. unop
  253. ADD y2, a14, y2
  254. LD a14, 6 * SIZE(A4)
  255. MUL alpha1, a2, a2
  256. unop
  257. ADD y3, a15, y3
  258. LD a15, 7 * SIZE(A4)
  259. MUL alpha1, a3, a3
  260. ldl $31, (PREFETCHSIZE + 0) * SIZE(A2)
  /* Rows 0-3 are complete: store them, accumulate rows 4-7 into y4..y7 */
  /* and start loading rows 8-11 (the first half of the next block). */
  261. ADD y4, a0, y4
  262. ST y0, 0 * SIZE(Y1)
  263. MUL alpha2, a4, a4
  264. LD a0, 8 * SIZE(A1)
  265. ADD y5, a1, y5
  266. ST y1, 1 * SIZE(Y1)
  267. MUL alpha2, a5, a5
  268. LD a1, 9 * SIZE(A1)
  269. ADD y6, a2, y6
  270. ST y2, 2 * SIZE(Y1)
  271. MUL alpha2, a6, a6
  272. LD a2, 10 * SIZE(A1)
  273. ADD y7, a3, y7
  274. ST y3, 3 * SIZE(Y1)
  275. MUL alpha2, a7, a7
  276. LD a3, 11 * SIZE(A1)
  277. ADD y4, a4, y4
  278. LD a4, 8 * SIZE(A2)
  279. MUL alpha3, a8, a8
  280. LD y0, 8 * SIZE(Y1)
  281. ADD y5, a5, y5
  282. LD a5, 9 * SIZE(A2)
  283. MUL alpha3, a9, a9
  284. LD y1, 9 * SIZE(Y1)
  285. ADD y6, a6, y6
  286. LD a6, 10 * SIZE(A2)
  287. MUL alpha3, a10, a10
  288. LD y2, 10 * SIZE(Y1)
  289. ADD y7, a7, y7
  290. LD a7, 11 * SIZE(A2)
  291. MUL alpha3, a11, a11
  292. LD y3, 11 * SIZE(Y1)
  293. ADD y4, a8, y4
  294. LD a8, 8 * SIZE(A3)
  295. MUL alpha4, a12, a12
  296. ldl $31, (PREFETCHSIZE + 0) * SIZE(A3)
  /* The pointers are advanced one at a time between the remaining loads; */
  /* an access after a pointer's lda uses offsets relative to the new base */
  /* (e.g. the 4..7 loads from A1 and the -4..-1 stores to Y1 below). */
  297. ADD y5, a9, y5
  298. LD a9, 9 * SIZE(A3)
  299. MUL alpha4, a13, a13
  300. lda A1, 8 * SIZE(A1)
  301. ADD y6, a10, y6
  302. LD a10, 10 * SIZE(A3)
  303. MUL alpha4, a14, a14
  304. lda A2, 8 * SIZE(A2)
  305. ADD y7, a11, y7
  306. LD a11, 11 * SIZE(A3)
  307. MUL alpha4, a15, a15
  308. lda Y1, 8 * SIZE(Y1)
  309. ADD y4, a12, y4
  310. LD a12, 8 * SIZE(A4)
  311. MUL alpha1, a0, a0
  312. unop
  313. ADD y5, a13, y5
  314. LD a13, 9 * SIZE(A4)
  315. MUL alpha1, a1, a1
  316. lda A3, 8 * SIZE(A3)
  317. ADD y6, a14, y6
  318. LD a14, 10 * SIZE(A4)
  319. MUL alpha1, a2, a2
  320. ldl $31, (PREFETCHSIZE + 0) * SIZE(A4)
  321. ADD y7, a15, y7
  322. LD a15, 11 * SIZE(A4)
  323. MUL alpha1, a3, a3
  324. lda A4, 8 * SIZE(A4)
  325. ADD y0, a0, y0
  326. LD a0, 4 * SIZE(A1)
  327. MUL alpha2, a4, a4
  328. ST y4, -4 * SIZE(Y1)
  329. ADD y1, a1, y1
  330. LD a1, 5 * SIZE(A1)
  331. MUL alpha2, a5, a5
  332. ST y5, -3 * SIZE(Y1)
  333. ADD y2, a2, y2
  334. LD a2, 6 * SIZE(A1)
  335. MUL alpha2, a6, a6
  336. ST y6, -2 * SIZE(Y1)
  337. ADD y3, a3, y3
  338. LD a3, 7 * SIZE(A1)
  339. MUL alpha2, a7, a7
  340. ST y7, -1 * SIZE(Y1)
  341. ADD y0, a4, y0
  342. LD a4, 4 * SIZE(A2)
  343. MUL alpha3, a8, a8
  344. LD y4, 4 * SIZE(Y1)
  345. ADD y1, a5, y1
  346. LD a5, 5 * SIZE(A2)
  347. MUL alpha3, a9, a9
  348. LD y5, 5 * SIZE(Y1)
  349. ADD y2, a6, y2
  350. LD a6, 6 * SIZE(A2)
  351. MUL alpha3, a10, a10
  352. LD y6, 6 * SIZE(Y1)
  353. ADD y3, a7, y3
  354. LD a7, 7 * SIZE(A2)
  355. MUL alpha3, a11, a11
  356. LD y7, 7 * SIZE(Y1)
  357. ADD y0, a8, y0
  358. LD a8, 4 * SIZE(A3)
  359. MUL alpha4, a12, a12
  360. bgt I, $L12
  361. .align 4
  362. $L13:
  /* Drain: complete the final 8-row block. It loads only rows 4-7 of the */
  /* current block and nothing beyond it. */
  363. ADD y1, a9, y1
  364. LD a9, 5 * SIZE(A3)
  365. MUL alpha4, a13, a13
  366. unop
  367. ADD y2, a10, y2
  368. LD a10, 6 * SIZE(A3)
  369. MUL alpha4, a14, a14
  370. unop
  371. ADD y3, a11, y3
  372. LD a11, 7 * SIZE(A3)
  373. MUL alpha4, a15, a15
  374. unop
  375. ADD y0, a12, y0
  376. LD a12, 4 * SIZE(A4)
  377. MUL alpha1, a0, a0
  378. unop
  379. ADD y1, a13, y1
  380. LD a13, 5 * SIZE(A4)
  381. MUL alpha1, a1, a1
  382. unop
  383. ADD y2, a14, y2
  384. LD a14, 6 * SIZE(A4)
  385. MUL alpha1, a2, a2
  386. unop
  387. ADD y3, a15, y3
  388. LD a15, 7 * SIZE(A4)
  389. MUL alpha1, a3, a3
  390. unop
  391. ST y0, 0 * SIZE(Y1)
  392. ADD y4, a0, y4
  393. unop
  394. MUL alpha2, a4, a4
  395. ST y1, 1 * SIZE(Y1)
  396. ADD y5, a1, y5
  397. unop
  398. MUL alpha2, a5, a5
  399. ST y2, 2 * SIZE(Y1)
  400. ADD y6, a2, y6
  401. unop
  402. MUL alpha2, a6, a6
  403. ST y3, 3 * SIZE(Y1)
  404. ADD y7, a3, y7
  /* Y1 advances here, so the y4..y7 stores below use offsets -4..-1. */
  405. lda Y1, 8 * SIZE(Y1)
  406. MUL alpha2, a7, a7
  407. ADD y4, a4, y4
  408. MUL alpha3, a8, a8
  409. ADD y5, a5, y5
  410. MUL alpha3, a9, a9
  411. ADD y6, a6, y6
  412. MUL alpha3, a10, a10
  413. ADD y7, a7, y7
  414. MUL alpha3, a11, a11
  415. ADD y4, a8, y4
  416. MUL alpha4, a12, a12
  417. ADD y5, a9, y5
  418. MUL alpha4, a13, a13
  419. ADD y6, a10, y6
  420. MUL alpha4, a14, a14
  421. ADD y7, a11, y7
  422. MUL alpha4, a15, a15
  423. ADD y4, a12, y4
  424. ADD y5, a13, y5
  425. ADD y6, a14, y6
  426. ADD y7, a15, y7
  427. ST y4, -4 * SIZE(Y1)
  428. lda A1, 8 * SIZE(A1)
  429. ST y5, -3 * SIZE(Y1)
  430. lda A2, 8 * SIZE(A2)
  431. ST y6, -2 * SIZE(Y1)
  432. lda A3, 8 * SIZE(A3)
  433. ST y7, -1 * SIZE(Y1)
  434. lda A4, 8 * SIZE(A4)
  435. .align 4
  /* Row tails of the four-column pass: the rows left over after the */
  /* 8-row blocks are handled as a group of 4 (bit 2 of M), a group of 2 */
  /* (bit 1) and a single row (bit 0), using the same alpha1..alpha4. */
  436. $L15:
  437. and M, 4, I
  438. ble I, $L16
  439. LD y0, 0 * SIZE(Y1)
  440. LD y1, 1 * SIZE(Y1)
  441. LD y2, 2 * SIZE(Y1)
  442. LD y3, 3 * SIZE(Y1)
  443. LD a0, 0 * SIZE(A1)
  444. LD a1, 1 * SIZE(A1)
  445. LD a2, 2 * SIZE(A1)
  446. LD a3, 3 * SIZE(A1)
  447. LD a4, 0 * SIZE(A2)
  448. LD a5, 1 * SIZE(A2)
  449. LD a6, 2 * SIZE(A2)
  450. LD a7, 3 * SIZE(A2)
  451. LD a8, 0 * SIZE(A3)
  452. LD a9, 1 * SIZE(A3)
  453. LD a10, 2 * SIZE(A3)
  454. LD a11, 3 * SIZE(A3)
  455. MUL alpha1, a0, a0
  456. LD a12, 0 * SIZE(A4)
  457. MUL alpha1, a1, a1
  458. LD a13, 1 * SIZE(A4)
  459. MUL alpha1, a2, a2
  460. LD a14, 2 * SIZE(A4)
  461. MUL alpha1, a3, a3
  462. LD a15, 3 * SIZE(A4)
  463. ADD y0, a0, y0
  464. MUL alpha2, a4, a4
  465. ADD y1, a1, y1
  466. MUL alpha2, a5, a5
  467. ADD y2, a2, y2
  468. MUL alpha2, a6, a6
  469. ADD y3, a3, y3
  470. MUL alpha2, a7, a7
  471. ADD y0, a4, y0
  472. MUL alpha3, a8, a8
  473. ADD y1, a5, y1
  474. MUL alpha3, a9, a9
  475. ADD y2, a6, y2
  476. MUL alpha3, a10, a10
  477. ADD y3, a7, y3
  478. MUL alpha3, a11, a11
  479. ADD y0, a8, y0
  480. MUL alpha4, a12, a12
  481. ADD y1, a9, y1
  482. MUL alpha4, a13, a13
  483. ADD y2, a10, y2
  484. MUL alpha4, a14, a14
  485. ADD y3, a11, y3
  486. MUL alpha4, a15, a15
  487. ADD y0, a12, y0
  /* Y1 is advanced before the stores, which therefore use -4..-1. */
  488. lda Y1, 4 * SIZE(Y1)
  489. ADD y1, a13, y1
  490. unop
  491. ADD y2, a14, y2
  492. unop
  493. ADD y3, a15, y3
  494. unop
  495. ST y0, -4 * SIZE(Y1)
  496. lda A1, 4 * SIZE(A1)
  497. ST y1, -3 * SIZE(Y1)
  498. lda A2, 4 * SIZE(A2)
  499. ST y2, -2 * SIZE(Y1)
  500. lda A3, 4 * SIZE(A3)
  501. ST y3, -1 * SIZE(Y1)
  502. lda A4, 4 * SIZE(A4)
  503. .align 4
  504. $L16:
  /* Two leftover rows; a0..a7 hold two elements from each of the four */
  /* columns. */
  505. and M, 2, I
  506. ble I, $L17
  507. LD a0, 0 * SIZE(A1)
  508. LD a1, 1 * SIZE(A1)
  509. LD a2, 0 * SIZE(A2)
  510. LD a3, 1 * SIZE(A2)
  511. LD y0, 0 * SIZE(Y1)
  512. LD y1, 1 * SIZE(Y1)
  513. LD a4, 0 * SIZE(A3)
  514. MUL alpha1, a0, a0
  515. LD a5, 1 * SIZE(A3)
  516. MUL alpha1, a1, a1
  517. LD a6, 0 * SIZE(A4)
  518. MUL alpha2, a2, a2
  519. LD a7, 1 * SIZE(A4)
  520. MUL alpha2, a3, a3
  521. ADD y0, a0, y0
  522. MUL alpha3, a4, a4
  523. ADD y1, a1, y1
  524. MUL alpha3, a5, a5
  525. ADD y0, a2, y0
  526. MUL alpha4, a6, a6
  527. ADD y1, a3, y1
  528. MUL alpha4, a7, a7
  529. ADD y0, a4, y0
  530. lda A1, 2 * SIZE(A1)
  531. ADD y1, a5, y1
  532. lda A2, 2 * SIZE(A2)
  533. ADD y0, a6, y0
  534. lda A3, 2 * SIZE(A3)
  535. ADD y1, a7, y1
  536. lda A4, 2 * SIZE(A4)
  537. ST y0, 0 * SIZE(Y1)
  538. unop
  539. ST y1, 1 * SIZE(Y1)
  540. lda Y1, 2 * SIZE(Y1)
  541. .align 4
  542. $L17:
  /* blbc branches when the low bit of M is clear, i.e. no odd row left. */
  543. blbc M, $L18
  544. LD y0, 0 * SIZE(Y1)
  545. LD a0, 0 * SIZE(A1)
  546. LD a1, 0 * SIZE(A2)
  547. LD a2, 0 * SIZE(A3)
  548. LD a3, 0 * SIZE(A4)
  549. MUL alpha1, a0, a0
  550. MUL alpha2, a1, a1
  551. MUL alpha3, a2, a2
  552. MUL alpha4, a3, a3
  553. ADD y0, a0, y0
  554. ADD y0, a1, y0
  555. ADD y0, a2, y0
  556. ADD y0, a3, y0
  557. ST y0, 0 * SIZE(Y1)
  558. .align 4
  559. $L18:
  /* Next group of four columns, if any remain. */
  560. lda J, -1(J)
  561. bgt J, $L11
  562. .align 4
  /* Two-column pass, taken when bit 1 of N is set. It has the same */
  /* structure as the four-column pass but uses only A1/A2 and */
  /* alpha1/alpha2: preload, steady state at $L22, drain at $L23, then row */
  /* tails of 4, 2 and 1. */
  563. $L20:
  564. and N, 2, J
  565. ble J, $L30
  566. LD alpha1, 0 * SIZE(X)
  567. addq X, INCX, X
  568. LD alpha2, 0 * SIZE(X)
  569. addq X, INCX, X
  570. mov A, A1
  571. MUL alpha, alpha1, alpha1
  572. addq A, LDA, A2
  573. MUL alpha, alpha2, alpha2
  /* A moves past both columns (A2 + LDA). */
  574. addq A2, LDA, A
  575. mov Y, Y1
  576. sra M, 3, I
  577. ble I, $L25
  /* Pipeline preload for the first 8-row block. */
  578. LD a0, 0 * SIZE(A1)
  579. LD a1, 1 * SIZE(A1)
  580. LD a2, 2 * SIZE(A1)
  581. LD a3, 3 * SIZE(A1)
  582. LD a4, 0 * SIZE(A2)
  583. LD a5, 1 * SIZE(A2)
  584. LD a6, 2 * SIZE(A2)
  585. LD a7, 3 * SIZE(A2)
  586. LD y0, 0 * SIZE(Y1)
  587. LD y1, 1 * SIZE(Y1)
  588. LD y2, 2 * SIZE(Y1)
  589. LD y3, 3 * SIZE(Y1)
  590. MUL alpha1, a0, a0
  591. LD y4, 4 * SIZE(Y1)
  592. MUL alpha1, a1, a1
  593. LD y5, 5 * SIZE(Y1)
  594. MUL alpha1, a2, a2
  595. LD y6, 6 * SIZE(Y1)
  596. MUL alpha1, a3, a3
  597. LD y7, 7 * SIZE(Y1)
  598. ADD y0, a0, y0
  599. LD a0, 4 * SIZE(A1)
  600. MUL alpha2, a4, a4
  601. ADD y1, a1, y1
  602. LD a1, 5 * SIZE(A1)
  603. MUL alpha2, a5, a5
  604. ADD y2, a2, y2
  605. LD a2, 6 * SIZE(A1)
  606. MUL alpha2, a6, a6
  607. ADD y3, a3, y3
  608. LD a3, 7 * SIZE(A1)
  609. MUL alpha2, a7, a7
  610. ADD y0, a4, y0
  611. LD a4, 4 * SIZE(A2)
  612. MUL alpha1, a0, a0
  613. ADD y1, a5, y1
  614. LD a5, 5 * SIZE(A2)
  615. MUL alpha1, a1, a1
  616. ADD y2, a6, y2
  617. LD a6, 6 * SIZE(A2)
  618. MUL alpha1, a2, a2
  619. ADD y3, a7, y3
  620. LD a7, 7 * SIZE(A2)
  621. MUL alpha1, a3, a3
  622. lda I, -1(I)
  623. ble I, $L23
  624. .align 4
  625. $L22:
  /* Steady state. A2 is advanced at the top of the iteration, so the A2 */
  /* loads below use offsets 0..7 for the next block, while A1 and Y1 */
  /* advance at the bottom and use offsets 8..15. */
  626. ldl $31, (PREFETCHSIZE + 0) * SIZE(A1)
  627. lda I, -1(I)
  628. ldl $31, (PREFETCHSIZE + 0) * SIZE(A2)
  629. lda A2, 8 * SIZE(A2)
  630. ADD y4, a0, y4
  631. ST y0, 0 * SIZE(Y1)
  632. MUL alpha2, a4, a4
  633. LD a0, 8 * SIZE(A1)
  634. ADD y5, a1, y5
  635. ST y1, 1 * SIZE(Y1)
  636. MUL alpha2, a5, a5
  637. LD a1, 9 * SIZE(A1)
  638. ADD y6, a2, y6
  639. ST y2, 2 * SIZE(Y1)
  640. MUL alpha2, a6, a6
  641. LD a2, 10 * SIZE(A1)
  642. ADD y7, a3, y7
  643. ST y3, 3 * SIZE(Y1)
  644. MUL alpha2, a7, a7
  645. LD a3, 11 * SIZE(A1)
  646. ADD y4, a4, y4
  647. LD a4, 0 * SIZE(A2)
  648. MUL alpha1, a0, a0
  649. LD y0, 8 * SIZE(Y1)
  650. ADD y5, a5, y5
  651. LD a5, 1 * SIZE(A2)
  652. MUL alpha1, a1, a1
  653. LD y1, 9 * SIZE(Y1)
  654. ADD y6, a6, y6
  655. LD a6, 2 * SIZE(A2)
  656. MUL alpha1, a2, a2
  657. LD y2, 10 * SIZE(Y1)
  658. ADD y7, a7, y7
  659. LD a7, 3 * SIZE(A2)
  660. MUL alpha1, a3, a3
  661. LD y3, 11 * SIZE(Y1)
  662. ADD y0, a0, y0
  663. ST y4, 4 * SIZE(Y1)
  664. MUL alpha2, a4, a4
  665. LD a0, 12 * SIZE(A1)
  666. ADD y1, a1, y1
  667. ST y5, 5 * SIZE(Y1)
  668. MUL alpha2, a5, a5
  669. LD a1, 13 * SIZE(A1)
  670. ADD y2, a2, y2
  671. ST y6, 6 * SIZE(Y1)
  672. MUL alpha2, a6, a6
  673. LD a2, 14 * SIZE(A1)
  674. ADD y3, a3, y3
  675. ST y7, 7 * SIZE(Y1)
  676. MUL alpha2, a7, a7
  677. LD a3, 15 * SIZE(A1)
  678. ADD y0, a4, y0
  679. LD a4, 4 * SIZE(A2)
  680. MUL alpha1, a0, a0
  681. LD y4, 12 * SIZE(Y1)
  682. ADD y1, a5, y1
  683. LD a5, 5 * SIZE(A2)
  684. MUL alpha1, a1, a1
  685. LD y5, 13 * SIZE(Y1)
  686. ADD y2, a6, y2
  687. LD a6, 6 * SIZE(A2)
  688. MUL alpha1, a2, a2
  689. LD y6, 14 * SIZE(Y1)
  690. ADD y3, a7, y3
  691. LD a7, 7 * SIZE(A2)
  692. MUL alpha1, a3, a3
  693. LD y7, 15 * SIZE(Y1)
  694. lds $f31, (PREFETCHSIZE + 0) * SIZE(Y1)
  695. lda A1, 8 * SIZE(A1)
  696. lda Y1, 8 * SIZE(Y1)
  697. bgt I, $L22
  698. .align 4
  699. $L23:
  /* Drain: store rows 0-3, finish rows 4-7 and store them; no loads. */
  700. ADD y4, a0, y4
  701. ST y0, 0 * SIZE(Y1)
  702. MUL alpha2, a4, a4
  703. unop
  704. ADD y5, a1, y5
  705. ST y1, 1 * SIZE(Y1)
  706. MUL alpha2, a5, a5
  707. unop
  708. ADD y6, a2, y6
  709. ST y2, 2 * SIZE(Y1)
  710. MUL alpha2, a6, a6
  711. unop
  712. ADD y7, a3, y7
  713. ST y3, 3 * SIZE(Y1)
  714. MUL alpha2, a7, a7
  715. unop
  716. ADD y4, a4, y4
  717. ADD y5, a5, y5
  718. ADD y6, a6, y6
  719. ADD y7, a7, y7
  720. ST y4, 4 * SIZE(Y1)
  721. lda A1, 8 * SIZE(A1)
  722. ST y5, 5 * SIZE(Y1)
  723. lda A2, 8 * SIZE(A2)
  724. ST y6, 6 * SIZE(Y1)
  725. unop
  726. ST y7, 7 * SIZE(Y1)
  727. lda Y1, 8 * SIZE(Y1)
  728. .align 4
  729. $L25:
  /* Four leftover rows (bit 2 of M). */
  730. and M, 4, I
  731. ble I, $L26
  732. LD y0, 0 * SIZE(Y1)
  733. LD y1, 1 * SIZE(Y1)
  734. LD y2, 2 * SIZE(Y1)
  735. LD y3, 3 * SIZE(Y1)
  736. LD a0, 0 * SIZE(A1)
  737. LD a1, 1 * SIZE(A1)
  738. LD a2, 2 * SIZE(A1)
  739. LD a3, 3 * SIZE(A1)
  740. MUL alpha1, a0, a0
  741. LD a4, 0 * SIZE(A2)
  742. MUL alpha1, a1, a1
  743. LD a5, 1 * SIZE(A2)
  744. MUL alpha1, a2, a2
  745. LD a6, 2 * SIZE(A2)
  746. MUL alpha1, a3, a3
  747. LD a7, 3 * SIZE(A2)
  748. ADD y0, a0, y0
  749. MUL alpha2, a4, a4
  750. ADD y1, a1, y1
  751. MUL alpha2, a5, a5
  752. ADD y2, a2, y2
  753. MUL alpha2, a6, a6
  754. ADD y3, a3, y3
  755. MUL alpha2, a7, a7
  756. ADD y0, a4, y0
  757. lda Y1, 4 * SIZE(Y1)
  758. ADD y1, a5, y1
  759. unop
  760. ADD y2, a6, y2
  761. unop
  762. ADD y3, a7, y3
  763. unop
  764. ST y0, -4 * SIZE(Y1)
  765. lda A1, 4 * SIZE(A1)
  766. ST y1, -3 * SIZE(Y1)
  767. lda A2, 4 * SIZE(A2)
  768. ST y2, -2 * SIZE(Y1)
  /* NOTE(review): A3 and A4 are never read on the two-column or the */
  /* one-column path that follows, so these two increments do not affect */
  /* the result. */
  769. lda A3, 4 * SIZE(A3)
  770. ST y3, -1 * SIZE(Y1)
  771. lda A4, 4 * SIZE(A4)
  772. .align 4
  773. $L26:
  /* Two leftover rows (bit 1 of M). */
  774. and M, 2, I
  775. ble I, $L27
  776. LD a0, 0 * SIZE(A1)
  777. LD a1, 1 * SIZE(A1)
  778. LD a2, 0 * SIZE(A2)
  779. LD a3, 1 * SIZE(A2)
  780. LD y0, 0 * SIZE(Y1)
  781. LD y1, 1 * SIZE(Y1)
  782. MUL alpha1, a0, a0
  783. MUL alpha1, a1, a1
  784. MUL alpha2, a2, a2
  785. MUL alpha2, a3, a3
  786. ADD y0, a0, y0
  787. lda A1, 2 * SIZE(A1)
  788. ADD y1, a1, y1
  789. lda A2, 2 * SIZE(A2)
  790. ADD y0, a2, y0
  791. unop
  792. ADD y1, a3, y1
  793. unop
  794. ST y0, 0 * SIZE(Y1)
  795. unop
  796. ST y1, 1 * SIZE(Y1)
  797. lda Y1, 2 * SIZE(Y1)
  798. .align 4
  799. $L27:
  /* Final odd row, skipped when the low bit of M is clear. */
  800. blbc M, $L30
  801. LD y0, 0 * SIZE(Y1)
  802. LD a0, 0 * SIZE(A1)
  803. LD a1, 0 * SIZE(A2)
  804. MUL alpha1, a0, a0
  805. MUL alpha2, a1, a1
  806. ADD y0, a0, y0
  807. ADD y0, a1, y0
  808. ST y0, 0 * SIZE(Y1)
  809. .align 4
  /* One-column pass, taken when the low bit of N is set: */
  /* y[i] += (alpha * x) * A1[i]. Rows are handled eight at a time in a */
  /* pipelined loop ($L32, drain at $L33), followed by tails of 4, 2 and 1. */
  810. $L30:
  811. blbc N, $L990
  812. LD alpha1, 0 * SIZE(X)
  813. mov A, A1
  814. MUL alpha, alpha1, alpha1
  815. mov Y, Y1
  816. sra M, 3, I
  817. ble I, $L35
  /* Preload a full 8-row block of the column and of y. */
  818. LD a0, 0 * SIZE(A1)
  819. LD a1, 1 * SIZE(A1)
  820. LD a2, 2 * SIZE(A1)
  821. LD a3, 3 * SIZE(A1)
  822. LD a4, 4 * SIZE(A1)
  823. LD a5, 5 * SIZE(A1)
  824. LD a6, 6 * SIZE(A1)
  825. LD a7, 7 * SIZE(A1)
  826. LD y0, 0 * SIZE(Y1)
  827. LD y1, 1 * SIZE(Y1)
  828. LD y2, 2 * SIZE(Y1)
  829. LD y3, 3 * SIZE(Y1)
  /* y4..y7 are loaded again from the same addresses at the top of $L32 */
  /* and $L33 before they are used. */
  830. LD y4, 4 * SIZE(Y1)
  831. LD y5, 5 * SIZE(Y1)
  832. LD y6, 6 * SIZE(Y1)
  833. LD y7, 7 * SIZE(Y1)
  834. MUL alpha1, a0, a0
  835. MUL alpha1, a1, a1
  836. MUL alpha1, a2, a2
  837. MUL alpha1, a3, a3
  838. lda I, -1(I)
  839. ble I, $L33
  840. .align 4
  841. $L32:
  /* Steady state: finish and store rows 0-3, then rows 4-7, while loading */
  /* the column elements (offsets 8..15) and y0..y3 (offsets 8..11) of the */
  /* next block. */
  842. ADD y0, a0, y0
  843. LD y4, 4 * SIZE(Y1)
  844. MUL alpha1, a4, a4
  845. LD a0, 8 * SIZE(A1)
  846. ADD y1, a1, y1
  847. LD y5, 5 * SIZE(Y1)
  848. MUL alpha1, a5, a5
  849. LD a1, 9 * SIZE(A1)
  850. ADD y2, a2, y2
  851. LD y6, 6 * SIZE(Y1)
  852. MUL alpha1, a6, a6
  853. LD a2, 10 * SIZE(A1)
  854. ADD y3, a3, y3
  855. LD y7, 7 * SIZE(Y1)
  856. MUL alpha1, a7, a7
  857. LD a3, 11 * SIZE(A1)
  858. ST y0, 0 * SIZE(Y1)
  859. ST y1, 1 * SIZE(Y1)
  860. ST y2, 2 * SIZE(Y1)
  861. ST y3, 3 * SIZE(Y1)
  862. ADD y4, a4, y4
  863. LD y0, 8 * SIZE(Y1)
  864. MUL alpha1, a0, a0
  865. LD a4, 12 * SIZE(A1)
  866. ADD y5, a5, y5
  867. LD y1, 9 * SIZE(Y1)
  868. MUL alpha1, a1, a1
  869. LD a5, 13 * SIZE(A1)
  870. ADD y6, a6, y6
  871. LD y2, 10 * SIZE(Y1)
  872. MUL alpha1, a2, a2
  873. LD a6, 14 * SIZE(A1)
  874. ADD y7, a7, y7
  875. LD y3, 11 * SIZE(Y1)
  876. MUL alpha1, a3, a3
  877. LD a7, 15 * SIZE(A1)
  878. ST y4, 4 * SIZE(Y1)
  879. lda I, -1(I)
  880. ST y5, 5 * SIZE(Y1)
  881. lda A1, 8 * SIZE(A1)
  882. ST y6, 6 * SIZE(Y1)
  /* Loads into $31 / $f31 discard their result; they are prefetch hints. */
  883. ldl $31, (PREFETCHSIZE + 0) * SIZE(A1)
  884. ST y7, 7 * SIZE(Y1)
  885. lds $f31, (PREFETCHSIZE + 0) * SIZE(Y1)
  886. lda Y1, 8 * SIZE(Y1)
  887. bgt I, $L32
  888. .align 4
  889. $L33:
  /* Drain: complete the last 8-row block; only y4..y7 are loaded here. */
  890. ADD y0, a0, y0
  891. LD y4, 4 * SIZE(Y1)
  892. MUL alpha1, a4, a4
  893. unop
  894. ADD y1, a1, y1
  895. LD y5, 5 * SIZE(Y1)
  896. MUL alpha1, a5, a5
  897. unop
  898. ADD y2, a2, y2
  899. LD y6, 6 * SIZE(Y1)
  900. MUL alpha1, a6, a6
  901. unop
  902. ADD y3, a3, y3
  903. LD y7, 7 * SIZE(Y1)
  904. MUL alpha1, a7, a7
  905. unop
  906. ADD y4, a4, y4
  907. ST y0, 0 * SIZE(Y1)
  908. ADD y5, a5, y5
  909. ST y1, 1 * SIZE(Y1)
  910. ADD y6, a6, y6
  911. ST y2, 2 * SIZE(Y1)
  912. ADD y7, a7, y7
  913. ST y3, 3 * SIZE(Y1)
  914. ST y4, 4 * SIZE(Y1)
  915. unop
  916. ST y5, 5 * SIZE(Y1)
  917. unop
  918. ST y6, 6 * SIZE(Y1)
  919. lda A1, 8 * SIZE(A1)
  920. ST y7, 7 * SIZE(Y1)
  921. lda Y1, 8 * SIZE(Y1)
  922. .align 4
  923. $L35:
  /* Four leftover rows (bit 2 of M). */
  924. and M, 4, I
  925. ble I, $L36
  926. LD a0, 0 * SIZE(A1)
  927. LD a1, 1 * SIZE(A1)
  928. LD a2, 2 * SIZE(A1)
  929. LD a3, 3 * SIZE(A1)
  930. MUL alpha1, a0, a0
  931. LD y0, 0 * SIZE(Y1)
  932. MUL alpha1, a1, a1
  933. LD y1, 1 * SIZE(Y1)
  934. MUL alpha1, a2, a2
  935. LD y2, 2 * SIZE(Y1)
  936. MUL alpha1, a3, a3
  937. LD y3, 3 * SIZE(Y1)
  938. ADD y0, a0, y0
  939. ADD y1, a1, y1
  940. ADD y2, a2, y2
  941. ADD y3, a3, y3
  942. ST y0, 0 * SIZE(Y1)
  943. lda A1, 4 * SIZE(A1)
  944. ST y1, 1 * SIZE(Y1)
  /* NOTE(review): A2 is not read anywhere on the one-column path, so this */
  /* increment does not affect the result. */
  945. lda A2, 4 * SIZE(A2)
  946. ST y2, 2 * SIZE(Y1)
  947. unop
  948. ST y3, 3 * SIZE(Y1)
  949. lda Y1, 4 * SIZE(Y1)
  950. .align 4
  951. $L36:
  /* Two leftover rows (bit 1 of M). */
  952. and M, 2, I
  953. ble I, $L37
  954. LD a0, 0 * SIZE(A1)
  955. LD a1, 1 * SIZE(A1)
  956. LD y0, 0 * SIZE(Y1)
  957. MUL alpha1, a0, a0
  958. LD y1, 1 * SIZE(Y1)
  959. MUL alpha1, a1, a1
  960. ADD y0, a0, y0
  961. ADD y1, a1, y1
  962. ST y0, 0 * SIZE(Y1)
  963. lda A1, 2 * SIZE(A1)
  964. ST y1, 1 * SIZE(Y1)
  965. lda Y1, 2 * SIZE(Y1)
  966. .align 4
  967. $L37:
  /* Final odd row, skipped when the low bit of M is clear. */
  968. blbc M, $L990
  969. LD y0, 0 * SIZE(Y1)
  970. LD a0, 0 * SIZE(A1)
  971. MUL alpha1, a0, a0
  972. ADD y0, a0, y0
  973. ST y0, 0 * SIZE(Y1)
  974. .align 4
  /* Write-back for a strided y. When INCY == SIZE the results were */
  /* accumulated in place and there is nothing to do. Otherwise the entry */
  /* code exchanged Y and BUFFER: Y now addresses the contiguous work */
  /* buffer holding the computed sums and BUFFER addresses the caller's y. */
  /* Each caller element is read through BUFFER (stride INCY), added to */
  /* the matching work-buffer element, and stored back through Y1, which */
  /* starts at the same address and advances with the same stride. */
  975. $L990:
  976. cmpeq INCY, SIZE, $0
  977. bne $0, $L999
  978. mov BUFFER, Y1
  979. sra M, 3, I
  980. ble I, $L995
  981. .align 4
  982. $L992:
  /* Eight elements per iteration: a0..a7 receive the caller's y values, */
  /* y0..y7 the work-buffer values. */
  983. LD a0, 0 * SIZE(BUFFER)
  984. addq BUFFER, INCY, BUFFER
  985. LD a1, 0 * SIZE(BUFFER)
  986. addq BUFFER, INCY, BUFFER
  987. LD a2, 0 * SIZE(BUFFER)
  988. addq BUFFER, INCY, BUFFER
  989. LD a3, 0 * SIZE(BUFFER)
  990. addq BUFFER, INCY, BUFFER
  991. LD y0, 0 * SIZE(Y)
  992. LD y1, 1 * SIZE(Y)
  993. LD y2, 2 * SIZE(Y)
  994. LD y3, 3 * SIZE(Y)
  995. LD a4, 0 * SIZE(BUFFER)
  996. addq BUFFER, INCY, BUFFER
  997. LD a5, 0 * SIZE(BUFFER)
  998. addq BUFFER, INCY, BUFFER
  999. LD a6, 0 * SIZE(BUFFER)
  1000. addq BUFFER, INCY, BUFFER
  1001. LD a7, 0 * SIZE(BUFFER)
  1002. addq BUFFER, INCY, BUFFER
  1003. LD y4, 4 * SIZE(Y)
  1004. LD y5, 5 * SIZE(Y)
  1005. LD y6, 6 * SIZE(Y)
  1006. LD y7, 7 * SIZE(Y)
  1007. ADD a0, y0, a0
  1008. ADD a1, y1, a1
  1009. ADD a2, y2, a2
  1010. ADD a3, y3, a3
  1011. ADD a4, y4, a4
  1012. ADD a5, y5, a5
  1013. ADD a6, y6, a6
  1014. ADD a7, y7, a7
  1015. ST a0, 0 * SIZE(Y1)
  1016. addq Y1, INCY, Y1
  1017. ST a1, 0 * SIZE(Y1)
  1018. addq Y1, INCY, Y1
  1019. ST a2, 0 * SIZE(Y1)
  1020. addq Y1, INCY, Y1
  1021. ST a3, 0 * SIZE(Y1)
  1022. addq Y1, INCY, Y1
  1023. ST a4, 0 * SIZE(Y1)
  1024. addq Y1, INCY, Y1
  1025. ST a5, 0 * SIZE(Y1)
  1026. addq Y1, INCY, Y1
  1027. ST a6, 0 * SIZE(Y1)
  1028. addq Y1, INCY, Y1
  1029. ST a7, 0 * SIZE(Y1)
  1030. addq Y1, INCY, Y1
  1031. lda I, -1(I)
  1032. lda Y, 8 * SIZE(Y)
  1033. bgt I, $L992
  1034. .align 4
  1035. $L995:
  /* Remaining M mod 8 elements, one per iteration. */
  1036. and M, 7, I
  1037. ble I, $L999
  1038. .align 4
  1039. $L996:
  1040. LD a0, 0 * SIZE(BUFFER)
  1041. addq BUFFER, INCY, BUFFER
  1042. LD y0, 0 * SIZE(Y)
  1043. lda Y, 1 * SIZE(Y)
  1044. ADD a0, y0, a0
  1045. ST a0, 0 * SIZE(Y1)
  1046. addq Y1, INCY, Y1
  1047. lda I, -1(I)
  1048. bgt I, $L996
  1049. .align 4
  /* Common exit, also reached directly from the entry code when M <= 0 or */
  /* N <= 0: reload $f2-$f9 from the frame slots written at entry, release */
  /* the STACKSIZE-byte frame and return. */
  1050. $L999:
  1051. ldt $f2, 0($sp)
  1052. ldt $f3, 8($sp)
  1053. ldt $f4, 16($sp)
  1054. ldt $f5, 24($sp)
  1055. ldt $f6, 32($sp)
  1056. ldt $f7, 40($sp)
  1057. ldt $f8, 48($sp)
  1058. ldt $f9, 56($sp)
  1059. lda $sp, STACKSIZE($sp)
  1060. ret
  1061. EPILOGUE