/* daxpy_loongson3a_simd.S */


/*****************************************************************************
Copyright (c) 2011-2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
  /*
   * Register aliases, register numbers and hand-encoded instruction macros
   * used by the kernel that follows.  SIZE, BASE_SHIFT, LD, ST, MADD, NOP,
   * PREFETCHD, PROLOGUE and EPILOGUE are not defined in this file; they are
   * expected to come from common.h.
   */
#define ASSEMBLER
#include "common.h"

/* Prefetch look-ahead for the unit-stride loops.  It is multiplied by SIZE
 * at every use site, so the value is a count of elements, not bytes. */
#define PREFETCH_DISTANCE 2016

/* Integer registers holding the kernel arguments. */
#define N $4
#define X $8
#define INCX $9
#define Y $10
#define INCY $11

/* Integer scratch registers: loop counter, temporary, and the store pointer
 * that trails Y in the general-stride path. */
#define I $2
#define TEMP $3
#define YY $5

/* Scalar multiplier. */
#define ALPHA $f15

/* Floating-point registers holding sixteen X operands.  $f15 (ALPHA) and
 * $f16 are skipped, so a16 lives in $f17. */
#define a1 $f0
#define a2 $f1
#define a3 $f2
#define a4 $f3
#define a5 $f4
#define a6 $f5
#define a7 $f6
#define a8 $f7
#define a9 $f8
#define a10 $f9
#define a11 $f10
#define a12 $f11
#define a13 $f12
#define a14 $f13
#define a15 $f14
#define a16 $f17

/* Floating-point registers receiving the four in-flight results. */
#define t1 $f18
#define t2 $f19
#define t3 $f20
#define t4 $f21

/* Floating-point registers holding eight Y operands. */
#define b1 $f22
#define b2 $f23
#define b3 $f24
#define b4 $f25
#define b5 $f26
#define b6 $f27
#define b7 $f28
#define b8 $f29

/* Plain register NUMBERS matching a1..a16, t1..t4 and b1..b8 above.  The
 * hand-encoded instruction macros below need numbers, not register names. */
#define A1 0
#define A2 1
#define A3 2
#define A4 3
#define A5 4
#define A6 5
#define A7 6
#define A8 7
#define A9 8
#define A10 9
#define A11 10
#define A12 11
#define A13 12
#define A14 13
#define A15 14
#define A16 17
#define T1 18
#define T2 19
#define T3 20
#define T4 21
#define B1 22
#define B2 23
#define B3 24
#define B4 25
#define B5 26
#define B6 27
#define B7 28
#define B8 29

/* Integer register numbers of X ($8) and Y ($10), used as the base-register
 * field of the hand-encoded instructions. */
#define X_BASE 8
#define Y_BASE 10

/*
 * gsLQC1 / gsSQC1 emit one raw instruction word each instead of a mnemonic.
 * Field layout, as written in the .word expression: major opcode (0x32 for
 * the load, 0x3A for the store) at bit 26, base at bit 21, ft at bit 16,
 * bit 15 set, offset at bit 6, bit 5 set, fq in the low bits.
 *
 * From the way the kernel uses them, one call moves a pair of adjacent
 * elements between memory and two FP registers: ft pairs with the
 * lower-addressed element, fq with the higher one, and offset counts such
 * pairs (offset 1 is the pair right after offset 0).
 * NOTE(review): presumably these are the Loongson gslqc1/gssqc1 128-bit
 * load/store instructions, which need a 16-byte aligned address -- confirm
 * against the Loongson 3A manual.
 *
 * The public wrappers parenthesise every argument before forwarding it to
 * the underscore-suffixed encoders.
 */
#define gsLQC1_(base,fq,ft,offset) .word (0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
#define gsLQC1(base,fq,ft,offset) gsLQC1_((base), (fq), (ft), (offset))
#define gsSQC1_(base,fq,ft,offset) .word (0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
#define gsSQC1(base,fq,ft,offset) gsSQC1_((base), (fq), (ft), (offset))
  /*
   * AXPY kernel body.  The exported symbol is produced by the PROLOGUE and
   * EPILOGUE macros, which are not defined in this file.
   *
   * Operation: for each of N elements, y = y + ALPHA * x.
   * NOTE(review): this assumes "MADD d, b, ALPHA, a" expands to a
   * multiply-add computing d = b + ALPHA * a -- confirm in common.h.
   *
   * Inputs (see the register aliases earlier in this file):
   *   N           element count; N <= 0 returns without touching memory
   *   X, INCX     source pointer and stride in elements
   *   Y, INCY     destination pointer and stride in elements
   *   ALPHA       scalar multiplier
   *
   * Modified: N, X, Y, INCX, INCY (strides are rescaled to bytes), I, TEMP,
   * YY, a1-a16, b1-b8, t1-t4.
   * Stack: 40 bytes (non-__64BIT__ build, saves $f20/$f22/$f24/$f26/$f28) or
   * 48 bytes (__64BIT__ build, saves $f24-$f29); restored at both exits.
   *
   * Code paths:
   *   .L10-.L13   unit stride, X and Y both on a 16-byte boundary:
   *               16 elements per iteration with 128-bit loads/stores
   *   .L30-.L32   unit stride, Y on a 16-byte boundary, X 8 bytes off:
   *               one X element is peeled with a scalar load so the rest
   *               can still use 128-bit loads
   *   .L15-.L16   scalar tail of the unit-stride paths (N mod 16 elements)
   *   .L20-.L26   any other stride: 8 elements per iteration plus tail
   *   .L999       common exit
   *
   * Branch delay slots: the code is written so that the instruction after
   * a branch always executes (for example the ST after "bgtz I, .L16" is
   * what stores each tail element).  Every branch therefore carries an
   * explicit delay-slot instruction; where the next instruction must not
   * run, that instruction is a NOP instead of relying on .align padding.
   */
        PROLOGUE

        // Preserve the FP registers that are reloaded at every exit.
#ifndef __64BIT__
        daddiu  $sp, $sp, -40
        sdc1    $f20, 0($sp)
        sdc1    $f22, 8($sp)
        sdc1    $f24, 16($sp)
        sdc1    $f26, 24($sp)
        sdc1    $f28, 32($sp)
#else
        daddiu  $sp, $sp, -48
        sdc1    $f24, 0($sp)
        sdc1    $f25, 8($sp)
        sdc1    $f26, 16($sp)
        sdc1    $f27, 24($sp)
        sdc1    $f28, 32($sp)
        sdc1    $f29, 40($sp)
#endif

        li      TEMP, SIZE

        blez    N, .L999                        // nothing to do for N <= 0
        dsll    INCX, INCX, BASE_SHIFT          // delay slot: INCX in bytes

        bne     INCX, TEMP, .L20                // X not contiguous
        dsll    INCY, INCY, BASE_SHIFT          // delay slot: INCY in bytes

        bne     INCY, TEMP, .L20                // Y not contiguous
        // Is Y on a 16-byte boundary?  Only bit 3 is tested.
        andi    TEMP, Y, 8                      // delay slot (harmless on the .L20 path)

        beq     TEMP, $0, .L10
        // Y is 8 bytes off: handle one element with scalar code so that Y
        // becomes 16-byte aligned.  This first load sits in the delay slot
        // and is simply reloaded later when the branch is taken.
        LD      a1, 0 * SIZE(X)
        LD      b1, 0 * SIZE(Y)

        daddiu  X, X, SIZE
        daddiu  Y, Y, SIZE

        MADD    t1, b1, ALPHA, a1
        daddiu  N, N, -1

        ST      t1, -1 * SIZE(Y)
        blez    N, .L999
        NOP                                     // explicit delay slot
        .align 5

.L10:
        dsra    I, N, 4                         // I = number of 16-element blocks
        blez    I, .L15
        daddiu  I, I, -1                        // delay slot: last block is done at .L13/.L32

        // Y is aligned here; now test X.
        // Is X on a 16-byte boundary?
        andi    TEMP, X, 8
        bne     TEMP, $0, .L30
        NOP                                     // explicit delay slot: the 128-bit load
                                                // below must not run with X misaligned
        .align 5

.L11:
        // X and Y aligned: preload the whole first X block and the first
        // half of the first Y block.
        gsLQC1(X_BASE,A2,A1,0)
        gsLQC1(X_BASE,A4,A3,1)
        gsLQC1(X_BASE,A6,A5,2)
        gsLQC1(X_BASE,A8,A7,3)
        gsLQC1(X_BASE,A10,A9,4)
        gsLQC1(X_BASE,A12,A11,5)
        gsLQC1(X_BASE,A14,A13,6)
        gsLQC1(X_BASE,A16,A15,7)

        gsLQC1(Y_BASE,B2,B1,0)
        gsLQC1(Y_BASE,B4,B3,1)
        gsLQC1(Y_BASE,B6,B5,2)
        gsLQC1(Y_BASE,B8,B7,3)

        blez    I, .L13
        NOP
        .align 5

        // Steady state: compute and store the current block while loading
        // the Y and X operands that the following iteration will consume.
.L12:
        MADD    t1, b1, ALPHA, a1
        MADD    t2, b2, ALPHA, a2
        gsSQC1(Y_BASE, T2, T1, 0)
        gsLQC1(Y_BASE,B2,B1,4)

        MADD    t3, b3, ALPHA, a3
        MADD    t4, b4, ALPHA, a4
        gsSQC1(Y_BASE, T4, T3, 1)
        gsLQC1(Y_BASE,B4,B3,5)

        PREFETCHD(PREFETCH_DISTANCE*SIZE(Y))
        PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(Y))

        MADD    t1, b5, ALPHA, a5
        MADD    t2, b6, ALPHA, a6
        gsSQC1(Y_BASE, T2, T1, 2)
        gsLQC1(Y_BASE,B6,B5,6)

        MADD    t3, b7, ALPHA, a7
        MADD    t4, b8, ALPHA, a8
        gsSQC1(Y_BASE, T4, T3, 3)
        gsLQC1(Y_BASE,B8,B7, 7)

        PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(Y))
        PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(Y))

        MADD    t1, b1, ALPHA, a9
        MADD    t2, b2, ALPHA, a10
        gsSQC1(Y_BASE, T2, T1, 4)
        gsLQC1(Y_BASE,B2,B1,8)

        MADD    t3, b3, ALPHA, a11
        MADD    t4, b4, ALPHA, a12
        gsSQC1(Y_BASE, T4, T3, 5)
        gsLQC1(Y_BASE,B4,B3,9)

        PREFETCHD(PREFETCH_DISTANCE*SIZE(X))
        PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X))

        MADD    t1, b5, ALPHA, a13
        MADD    t2, b6, ALPHA, a14
        gsSQC1(Y_BASE, T2, T1, 6)
        gsLQC1(Y_BASE,B6,B5,10)

        MADD    t3, b7, ALPHA, a15
        MADD    t4, b8, ALPHA, a16
        gsSQC1(Y_BASE, T4, T3, 7)
        gsLQC1(Y_BASE,B8,B7,11)

        PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(X))
        PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(X))

        // Next X block (offsets 8..15 are relative to the not yet advanced X).
        gsLQC1(X_BASE,A2,A1,8)
        gsLQC1(X_BASE,A4,A3,9)
        gsLQC1(X_BASE,A6,A5,10)
        gsLQC1(X_BASE,A8,A7,11)
        gsLQC1(X_BASE,A10,A9,12)
        gsLQC1(X_BASE,A12,A11,13)
        gsLQC1(X_BASE,A14,A13,14)
        gsLQC1(X_BASE,A16,A15,15)

        daddiu  I, I, -1
        daddiu  Y, Y, 16 * SIZE
        daddiu  X, X, 16 * SIZE
        bgtz    I, .L12
        NOP                                     // explicit delay slot
        .align 5

        // Final block of the aligned path: same arithmetic as .L12, but
        // nothing beyond the current block is loaded.
.L13:
        MADD    t1, b1, ALPHA, a1
        MADD    t2, b2, ALPHA, a2
        gsSQC1(Y_BASE, T2, T1, 0)
        gsLQC1(Y_BASE,B2,B1,4)

        MADD    t3, b3, ALPHA, a3
        MADD    t4, b4, ALPHA, a4
        gsSQC1(Y_BASE, T4, T3, 1)
        gsLQC1(Y_BASE,B4,B3,5)

        MADD    t1, b5, ALPHA, a5
        MADD    t2, b6, ALPHA, a6
        gsSQC1(Y_BASE, T2, T1, 2)
        gsLQC1(Y_BASE,B6,B5,6)

        MADD    t3, b7, ALPHA, a7
        MADD    t4, b8, ALPHA, a8
        gsSQC1(Y_BASE, T4, T3, 3)
        gsLQC1(Y_BASE,B8,B7,7)

        MADD    t1, b1, ALPHA, a9
        MADD    t2, b2, ALPHA, a10
        gsSQC1(Y_BASE, T2, T1, 4)

        MADD    t3, b3, ALPHA, a11
        MADD    t4, b4, ALPHA, a12
        gsSQC1(Y_BASE, T4, T3, 5)

        MADD    t1, b5, ALPHA, a13
        MADD    t2, b6, ALPHA, a14
        gsSQC1(Y_BASE, T2, T1, 6)

        MADD    t3, b7, ALPHA, a15
        MADD    t4, b8, ALPHA, a16
        gsSQC1(Y_BASE, T4, T3, 7)

        daddiu  X, X, 16 * SIZE
        daddiu  Y, Y, 16 * SIZE
        .align 5

        // Unit-stride tail: the remaining N mod 16 elements, one at a time.
.L15:
        andi    I, N, 15
        blez    I, .L999
        NOP
        .align 5

.L16:
        LD      a1, 0 * SIZE(X)
        LD      b1, 0 * SIZE(Y)

        daddiu  X, X, SIZE
        daddiu  Y, Y, SIZE

        MADD    t1, b1, ALPHA, a1
        daddiu  I, I, -1

        bgtz    I, .L16
        ST      t1, -1 * SIZE(Y)                // delay slot: runs on every iteration

        // Exit of the unit-stride paths (same sequence as .L999).
#ifndef __64BIT__
        ldc1    $f20, 0($sp)
        ldc1    $f22, 8($sp)
        ldc1    $f24, 16($sp)
        ldc1    $f26, 24($sp)
        ldc1    $f28, 32($sp)
        daddiu  $sp, $sp, 40
#else
        ldc1    $f24, 0($sp)
        ldc1    $f25, 8($sp)
        ldc1    $f26, 16($sp)
        ldc1    $f27, 24($sp)
        ldc1    $f28, 32($sp)
        ldc1    $f29, 40($sp)
        daddiu  $sp, $sp, 48
#endif

        j       $31
        NOP
        .align 5

.L30:
        // Y aligned, X 8 bytes off a 16-byte boundary, INCX == INCY == 1.
        // Unrolled by 16.  The first and the last X element of every block
        // use scalar loads; the 14 in between use 128-bit loads from the
        // now aligned X + SIZE.
        LD      a1, 0 * SIZE(X)
        daddiu  X, X, SIZE                      // X is 16-byte aligned from here on
        gsLQC1(X_BASE,A3,A2,0)
        gsLQC1(X_BASE,A5,A4,1)
        gsLQC1(X_BASE,A7,A6,2)
        gsLQC1(X_BASE,A9,A8,3)
        gsLQC1(X_BASE,A11,A10,4)
        gsLQC1(X_BASE,A13,A12,5)
        gsLQC1(X_BASE,A15,A14,6)
        LD      a16, 14 * SIZE(X)

        gsLQC1(Y_BASE,B2,B1,0)
        gsLQC1(Y_BASE,B4,B3,1)
        gsLQC1(Y_BASE,B6,B5,2)
        gsLQC1(Y_BASE,B8,B7,3)

        blez    I, .L32
        NOP
        .align 5

        // Steady state of the X-misaligned path; mirrors .L12.
.L31:
        MADD    t1, b1, ALPHA, a1
        MADD    t2, b2, ALPHA, a2
        gsSQC1(Y_BASE, T2, T1, 0)
        gsLQC1(Y_BASE,B2,B1,4)

        MADD    t3, b3, ALPHA, a3
        MADD    t4, b4, ALPHA, a4
        gsSQC1(Y_BASE, T4, T3, 1)
        gsLQC1(Y_BASE,B4,B3,5)

        PREFETCHD(PREFETCH_DISTANCE*SIZE(Y))
        PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(Y))

        MADD    t1, b5, ALPHA, a5
        MADD    t2, b6, ALPHA, a6
        gsSQC1(Y_BASE, T2, T1, 2)
        gsLQC1(Y_BASE,B6,B5,6)

        MADD    t3, b7, ALPHA, a7
        MADD    t4, b8, ALPHA, a8
        gsSQC1(Y_BASE, T4, T3, 3)
        gsLQC1(Y_BASE,B8,B7,7)

        PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(Y))
        PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(Y))

        MADD    t1, b1, ALPHA, a9
        MADD    t2, b2, ALPHA, a10
        gsSQC1(Y_BASE, T2, T1, 4)
        gsLQC1(Y_BASE,B2,B1,8)

        MADD    t3, b3, ALPHA, a11
        MADD    t4, b4, ALPHA, a12
        gsSQC1(Y_BASE, T4, T3, 5)
        gsLQC1(Y_BASE,B4,B3,9)

        PREFETCHD(PREFETCH_DISTANCE*SIZE(X))
        PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X))

        MADD    t1, b5, ALPHA, a13
        MADD    t2, b6, ALPHA, a14
        gsSQC1(Y_BASE, T2, T1, 6)
        gsLQC1(Y_BASE,B6,B5,10)

        MADD    t3, b7, ALPHA, a15
        MADD    t4, b8, ALPHA, a16
        gsSQC1(Y_BASE, T4, T3, 7)
        gsLQC1(Y_BASE,B8,B7,11)

        PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(X))
        PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(X))

        // Next X block.  X already points one element past the block start,
        // hence the scalar offsets 15 and 30.
        LD      a1, 15 * SIZE(X)
        gsLQC1(X_BASE,A3,A2,8)
        gsLQC1(X_BASE,A5,A4,9)
        gsLQC1(X_BASE,A7,A6,10)
        gsLQC1(X_BASE,A9,A8,11)
        gsLQC1(X_BASE,A11,A10,12)
        gsLQC1(X_BASE,A13,A12,13)
        gsLQC1(X_BASE,A15,A14,14)
        LD      a16, 30 * SIZE(X)

        daddiu  I, I, -1
        daddiu  Y, Y, 16 * SIZE
        daddiu  X, X, 16 * SIZE
        bgtz    I, .L31
        NOP                                     // explicit delay slot
        .align 5

        // Final block of the X-misaligned path; mirrors .L13.
.L32:
        MADD    t1, b1, ALPHA, a1
        MADD    t2, b2, ALPHA, a2
        gsSQC1(Y_BASE, T2, T1, 0)
        gsLQC1(Y_BASE,B2,B1,4)

        MADD    t3, b3, ALPHA, a3
        MADD    t4, b4, ALPHA, a4
        gsSQC1(Y_BASE, T4, T3, 1)
        gsLQC1(Y_BASE,B4,B3,5)

        MADD    t1, b5, ALPHA, a5
        MADD    t2, b6, ALPHA, a6
        gsSQC1(Y_BASE, T2, T1, 2)
        gsLQC1(Y_BASE,B6,B5,6)

        MADD    t3, b7, ALPHA, a7
        MADD    t4, b8, ALPHA, a8
        gsSQC1(Y_BASE, T4, T3, 3)
        gsLQC1(Y_BASE,B8,B7,7)

        MADD    t1, b1, ALPHA, a9
        MADD    t2, b2, ALPHA, a10
        gsSQC1(Y_BASE, T2, T1, 4)

        MADD    t3, b3, ALPHA, a11
        MADD    t4, b4, ALPHA, a12
        gsSQC1(Y_BASE, T4, T3, 5)

        MADD    t1, b5, ALPHA, a13
        MADD    t2, b6, ALPHA, a14
        gsSQC1(Y_BASE, T2, T1, 6)

        MADD    t3, b7, ALPHA, a15
        MADD    t4, b8, ALPHA, a16
        gsSQC1(Y_BASE, T4, T3, 7)

        daddiu  X, X, 15 * SIZE                 // 15, not 16: X was advanced by one at .L30
        daddiu  Y, Y, 16 * SIZE

        // Continue with the shared unit-stride tail.
        b       .L15
        NOP                                     // explicit delay slot
        .align 5

        // General path: INCX != 1 or INCY != 1.  Eight elements per
        // iteration; Y is the load pointer and runs one block ahead of YY,
        // the store pointer.
.L20:
        dsra    I, N, 3                         // I = number of 8-element blocks
        move    YY, Y

        blez    I, .L25
        daddiu  I, I, -1                        // delay slot: last block is done at .L23

        // Preload the first block.
        LD      a1, 0 * SIZE(X)
        daddu   X, X, INCX
        LD      b1, 0 * SIZE(Y)
        daddu   Y, Y, INCY
        LD      a2, 0 * SIZE(X)
        daddu   X, X, INCX
        LD      b2, 0 * SIZE(Y)
        daddu   Y, Y, INCY
        LD      a3, 0 * SIZE(X)
        daddu   X, X, INCX
        LD      b3, 0 * SIZE(Y)
        daddu   Y, Y, INCY
        LD      a4, 0 * SIZE(X)
        daddu   X, X, INCX
        LD      b4, 0 * SIZE(Y)
        daddu   Y, Y, INCY
        LD      a5, 0 * SIZE(X)
        daddu   X, X, INCX
        LD      b5, 0 * SIZE(Y)
        daddu   Y, Y, INCY
        LD      a6, 0 * SIZE(X)
        daddu   X, X, INCX
        LD      b6, 0 * SIZE(Y)
        daddu   Y, Y, INCY
        LD      a7, 0 * SIZE(X)
        daddu   X, X, INCX
        LD      b7, 0 * SIZE(Y)
        daddu   Y, Y, INCY
        LD      a8, 0 * SIZE(X)
        daddu   X, X, INCX
        LD      b8, 0 * SIZE(Y)
        daddu   Y, Y, INCY

        blez    I, .L23
        NOP
        .align 5

        // Steady state: each operand pair is reloaded for the next block
        // right after it has been consumed; t1-t4 are each used twice.
.L22:
        MADD    t1, b1, ALPHA, a1
        LD      a1, 0 * SIZE(X)
        LD      b1, 0 * SIZE(Y)
        daddu   X, X, INCX
        daddu   Y, Y, INCY

        MADD    t2, b2, ALPHA, a2
        LD      a2, 0 * SIZE(X)
        LD      b2, 0 * SIZE(Y)
        daddu   X, X, INCX
        daddu   Y, Y, INCY

        MADD    t3, b3, ALPHA, a3
        LD      a3, 0 * SIZE(X)
        LD      b3, 0 * SIZE(Y)
        daddu   X, X, INCX
        daddu   Y, Y, INCY

        MADD    t4, b4, ALPHA, a4
        LD      a4, 0 * SIZE(X)
        LD      b4, 0 * SIZE(Y)
        daddu   X, X, INCX
        daddu   Y, Y, INCY

        ST      t1, 0 * SIZE(YY)
        daddu   YY, YY, INCY
        MADD    t1, b5, ALPHA, a5

        LD      a5, 0 * SIZE(X)
        LD      b5, 0 * SIZE(Y)
        daddu   X, X, INCX
        daddu   Y, Y, INCY

        ST      t2, 0 * SIZE(YY)
        daddu   YY, YY, INCY
        MADD    t2, b6, ALPHA, a6

        LD      a6, 0 * SIZE(X)
        LD      b6, 0 * SIZE(Y)
        daddu   X, X, INCX
        daddu   Y, Y, INCY

        ST      t3, 0 * SIZE(YY)
        daddu   YY, YY, INCY
        MADD    t3, b7, ALPHA, a7

        LD      a7, 0 * SIZE(X)
        LD      b7, 0 * SIZE(Y)
        daddu   X, X, INCX
        daddu   Y, Y, INCY

        ST      t4, 0 * SIZE(YY)
        daddu   YY, YY, INCY
        MADD    t4, b8, ALPHA, a8

        LD      a8, 0 * SIZE(X)
        daddu   X, X, INCX
        LD      b8, 0 * SIZE(Y)
        daddu   Y, Y, INCY

        ST      t1, 0 * SIZE(YY)
        daddu   YY, YY, INCY
        ST      t2, 0 * SIZE(YY)
        daddu   YY, YY, INCY
        ST      t3, 0 * SIZE(YY)
        daddu   YY, YY, INCY
        ST      t4, 0 * SIZE(YY)

        daddiu  I, I, -1

        bgtz    I, .L22
        daddu   YY, YY, INCY                    // delay slot: runs on every iteration
        .align 5

        // Final block of the general path: arithmetic and stores only.
.L23:
        MADD    t1, b1, ALPHA, a1
        MADD    t2, b2, ALPHA, a2
        MADD    t3, b3, ALPHA, a3
        MADD    t4, b4, ALPHA, a4

        ST      t1, 0 * SIZE(YY)
        daddu   YY, YY, INCY
        MADD    t1, b5, ALPHA, a5

        ST      t2, 0 * SIZE(YY)
        daddu   YY, YY, INCY
        MADD    t2, b6, ALPHA, a6

        ST      t3, 0 * SIZE(YY)
        daddu   YY, YY, INCY
        MADD    t3, b7, ALPHA, a7

        ST      t4, 0 * SIZE(YY)
        daddu   YY, YY, INCY
        MADD    t4, b8, ALPHA, a8

        ST      t1, 0 * SIZE(YY)
        daddu   YY, YY, INCY
        ST      t2, 0 * SIZE(YY)
        daddu   YY, YY, INCY
        ST      t3, 0 * SIZE(YY)
        daddu   YY, YY, INCY
        ST      t4, 0 * SIZE(YY)
        daddu   YY, YY, INCY
        .align 5

        // General-stride tail: the remaining N mod 8 elements.  Y is both
        // load and store pointer here.
.L25:
        andi    I, N, 7
        blez    I, .L999
        NOP
        .align 5

.L26:
        LD      a1, 0 * SIZE(X)
        LD      b1, 0 * SIZE(Y)

        MADD    t1, b1, ALPHA, a1
        daddu   X, X, INCX

        ST      t1, 0 * SIZE(Y)
        daddiu  I, I, -1

        bgtz    I, .L26
        daddu   Y, Y, INCY                      // delay slot: runs on every iteration
        .align 5

        // Common exit: reload the saved FP registers, release the stack
        // frame and return.
.L999:
#ifndef __64BIT__
        ldc1    $f20, 0($sp)
        ldc1    $f22, 8($sp)
        ldc1    $f24, 16($sp)
        ldc1    $f26, 24($sp)
        ldc1    $f28, 32($sp)
        daddiu  $sp, $sp, 40
#else
        ldc1    $f24, 0($sp)
        ldc1    $f25, 8($sp)
        ldc1    $f26, 16($sp)
        ldc1    $f27, 24($sp)
        ldc1    $f28, 32($sp)
        ldc1    $f29, 40($sp)
        daddiu  $sp, $sp, 48
#endif

        j       $31
        NOP

        EPILOGUE