You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

symv_L.S 13 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /*
   * Register aliases used by the routine below.  The roles described here
   * are read off from how the routine uses each register.
   */
  /* ---- integer registers ---- */
  40. #define M $4 /* element count; all loop trip counts are derived from it */
  41. #define A $6 /* pointer into the matrix; advanced by 2 * LDA + 2 * SIZE per column pair */
  42. #define LDA $7 /* stride between matrix columns; shifted left by BASE_SHIFT on entry */
  43. #define X $8 /* base of x; redirected to BUFFER when x has to be packed */
  44. #define INCX $9 /* stride of x; shifted left by BASE_SHIFT on entry */
  45. #define Y $10 /* caller's y; also the destination of the final copy-back */
  46. #define INCY $11 /* stride of y; shifted left by BASE_SHIFT on entry */
  47. #define BUFFER $5 /* scratch area pointer, loaded from the stack at entry */
  48. #define XX $12 /* running pointer into x */
  49. #define YY $13 /* running pointer into y (or its packed copy) */
  50. #define I $14 /* inner loop counter / bit-test result */
  51. #define IS $15 /* index of the current column pair; also briefly holds SIZE for stride tests */
  52. #define AO1 $16 /* running pointer into the first column of the pair (saved/restored) */
  53. #define AO2 $17 /* running pointer into the second column of the pair (saved/restored) */
  54. #define Y1 $18 /* base of the contiguous y being updated: Y itself or its copy in BUFFER */
  55. #define TEMP $19 /* general scratch (saved/restored) */
  56. #define II INCX /* rows left below the 2x2 diagonal block; reuses the INCX register, */
  /* which the routine no longer reads once x has been packed */
  /* ---- floating-point registers ---- */
  57. #define ALPHA $f13 /* scalar multiplier; only read, never written, by the routine */
  58. #define a1 $f0 /* a1..a8: matrix elements loaded through AO1 / AO2 */
  59. #define a2 $f1
  60. #define a3 $f2
  61. #define a4 $f3
  62. #define a5 $f4
  63. #define a6 $f5
  64. #define a7 $f6
  65. #define a8 $f7
  66. #define alpha1 $f8 /* x[IS], later overwritten with ALPHA * x[IS] */
  67. #define alpha2 $f9 /* x[IS + 1], later overwritten with ALPHA * x[IS + 1] */
  68. #define x1 $f10 /* x1..x4: elements of x for the rows being processed */
  69. #define x2 $f11
  70. #define x3 $f12
  71. #define x4 $f14 /* skips $f13, which is ALPHA */
  72. #define xsum1 $f15 /* accumulator for the first column of the pair */
  73. #define xsum2 $f16 /* accumulator for the second column of the pair */
  74. #define ysum1 $f17 /* ysum1..ysum4: elements of y being updated in place */
  75. #define ysum2 $f18
  76. #define ysum3 $f19
  77. #define ysum4 $f20
  /*
   * Symmetric matrix-vector update, two matrix columns per pass.
   *
   * Outline of what the code below does:
   *   1. If INCX != SIZE, x is copied into BUFFER so it can be read with unit
   *      stride.  If INCY != SIZE, y is copied into a 256-byte aligned area
   *      further on in BUFFER (Y1) and copied back to Y at the end.
   *   2. For IS = 0, 2, 4, ... the pair of columns starting at the diagonal
   *      element A is processed.  Only elements on or below the diagonal take
   *      part in the arithmetic.  For every row r below the 2x2 diagonal block
   *        y[r]  += (ALPHA * x[IS]) * col1[r] + (ALPHA * x[IS + 1]) * col2[r]
   *        xsum1 += x[r] * col1[r]
   *        xsum2 += x[r] * col2[r]
   *      and afterwards y[IS] += ALPHA * xsum1, y[IS + 1] += ALPHA * xsum2.
   *   3. If M is odd, the last 1x1 diagonal element is applied separately.
   *
   * LD, ST, MUL, MADD, LDARG, SDARG, SIZE, BASE_SHIFT, PROLOGUE and EPILOGUE
   * are not defined in this file (presumably they come from common.h).  The
   * comments assume "MADD d, c, a, b" computes d = c + a * b, "MUL d, a, b"
   * computes d = a * b, SIZE is the byte size of one element and
   * BASE_SHIFT = log2(SIZE) -- TODO confirm against common.h.
   *
   * The code is scheduled for branch delay slots: the instruction written
   * right after each branch/jump is expected to execute whether or not the
   * branch is taken (needs noreorder mode, presumably set up outside this
   * file -- confirm).  Do not reorder instructions around branches.
   *
   * NOTE(review): only $16-$19 are saved and restored.  The floating-point
   * registers used through the aliases ($f0-$f12, $f14-$f20) are not saved;
   * confirm that all of them are caller-saved under every target ABI.
   */
  78. PROLOGUE
  79. LDARG BUFFER, 0($sp) /* read BUFFER from the caller's stack area, before $sp moves */
  80. daddiu $sp, $sp, -32 /* 32-byte frame for $16-$19 */
  81. SDARG $16, 0($sp)
  82. dsll LDA, LDA, BASE_SHIFT /* LDA <<= BASE_SHIFT (presumably elements -> bytes) */
  83. SDARG $17, 8($sp)
  84. dsll INCX, INCX, BASE_SHIFT /* same scaling for INCX */
  85. SDARG $18, 16($sp)
  86. dsll INCY, INCY, BASE_SHIFT /* same scaling for INCY */
  87. SDARG $19, 24($sp)
  88. nop
  89. blez M, .L999 /* nothing to do when M <= 0 */
  90. li IS, SIZE /* delay slot: IS = SIZE, used for the stride tests below */
  91. beq IS, INCX, .L05 /* scaled INCX == SIZE: x is read in place, skip packing */
  92. move Y1, Y /* delay slot: default Y1 = Y */
  /* ---- pack x into BUFFER, four elements per iteration ---- */
  93. dsra I, M, 2 /* I = M / 4 */
  94. move XX, X /* XX walks the strided source */
  95. blez I, .L02
  96. move X, BUFFER /* delay slot (always executed): X now names the packed copy */
  97. .align 3
  98. .L01:
  99. LD a1, 0 * SIZE(XX)
  100. daddu XX, XX, INCX
  101. LD a2, 0 * SIZE(XX)
  102. daddu XX, XX, INCX
  103. LD a3, 0 * SIZE(XX)
  104. daddu XX, XX, INCX
  105. LD a4, 0 * SIZE(XX)
  106. daddu XX, XX, INCX
  107. ST a1, 0 * SIZE(BUFFER)
  108. ST a2, 1 * SIZE(BUFFER)
  109. ST a3, 2 * SIZE(BUFFER)
  110. ST a4, 3 * SIZE(BUFFER)
  111. daddiu I, I, -1
  112. bgtz I, .L01
  113. daddiu BUFFER, BUFFER, 4 * SIZE /* delay slot: advance destination every iteration */
  114. .align 3
  /* ---- pack the remaining M % 4 elements of x ---- */
  115. .L02:
  116. andi I, M, 3
  117. blez I, .L05
  118. NOP
  119. .align 3
  120. .L03:
  121. LD a1, 0 * SIZE(XX)
  122. daddu XX, XX, INCX
  123. ST a1, 0 * SIZE(BUFFER)
  124. daddiu I, I, -1
  125. bgtz I, .L03
  126. daddiu BUFFER, BUFFER, 1 * SIZE /* delay slot: advance destination */
  127. .align 3
  /* ---- pack y into BUFFER when it is not unit stride ---- */
  128. .L05:
  129. beq IS, INCY, .L10 /* IS still holds SIZE: y is updated in place */
  130. daddiu BUFFER, BUFFER, 255 /* delay slot: first half of the round-up; BUFFER is not read again after .L10 */
  131. li TEMP, -256
  132. and BUFFER, BUFFER, TEMP /* BUFFER rounded up to a multiple of 256 bytes */
  133. dsra I, M, 2 /* I = M / 4 */
  134. move Y1, BUFFER /* Y1 = start of the packed y */
  135. blez I, .L07
  136. move YY, Y /* delay slot (always executed): YY walks the strided source */
  137. .align 3
  138. .L06:
  139. LD a1, 0 * SIZE(YY)
  140. daddu YY, YY, INCY
  141. LD a2, 0 * SIZE(YY)
  142. daddu YY, YY, INCY
  143. LD a3, 0 * SIZE(YY)
  144. daddu YY, YY, INCY
  145. LD a4, 0 * SIZE(YY)
  146. daddu YY, YY, INCY
  147. ST a1, 0 * SIZE(BUFFER)
  148. ST a2, 1 * SIZE(BUFFER)
  149. ST a3, 2 * SIZE(BUFFER)
  150. ST a4, 3 * SIZE(BUFFER)
  151. daddiu I, I, -1
  152. bgtz I, .L06
  153. daddiu BUFFER, BUFFER, 4 * SIZE /* delay slot: advance destination */
  154. .align 3
  /* ---- pack the remaining M % 4 elements of y ---- */
  155. .L07:
  156. andi I, M, 3
  157. blez I, .L10
  158. NOP
  159. .align 3
  160. .L08:
  161. LD a1, 0 * SIZE(YY)
  162. daddu YY, YY, INCY
  163. ST a1, 0 * SIZE(BUFFER)
  164. daddiu I, I, -1
  165. bgtz I, .L08
  166. daddiu BUFFER, BUFFER, 1 * SIZE /* delay slot: advance destination */
  167. .align 3
  /* ---- main computation ---- */
  168. .L10:
  169. slti TEMP, M, 2 /* TEMP = (M < 2) */
  170. nop
  171. bgtz TEMP, .L20 /* fewer than two columns: only the 1x1 tail applies */
  172. li IS, 0 /* delay slot: column index starts at 0 */
  173. .align 3
  /* ---- one pass per column pair (IS, IS + 1); A points at the diagonal element ---- */
  174. .L11:
  175. dsll TEMP, IS, BASE_SHIFT /* TEMP = byte offset of element IS */
  176. nop
  177. daddu XX, X, TEMP /* XX = &x[IS] */
  178. daddu YY, Y1, TEMP /* YY = &y[IS] */
  179. LD alpha1, 0 * SIZE(XX) /* alpha1 = x[IS] */
  180. move AO1, A /* AO1 = first column, starting at the diagonal */
  181. LD alpha2, 1 * SIZE(XX) /* alpha2 = x[IS + 1] */
  182. daddiu XX, XX, 2 * SIZE
  183. LD a1, 0 * SIZE(AO1) /* a1 = diagonal element of the first column */
  184. daddu AO2, A, LDA /* AO2 = second column, same starting row */
  185. LD a2, 1 * SIZE(AO1) /* a2 = element below the diagonal in the first column */
  186. daddiu AO1, AO1, 2 * SIZE
  187. LD a3, 0 * SIZE(AO2) /* loaded but not used before a3 is reloaded */
  188. daddu A, AO2, LDA /* A += 2 * LDA ... */
  189. LD a4, 1 * SIZE(AO2) /* a4 = diagonal element of the second column */
  190. daddiu AO2, AO2, 2 * SIZE
  191. MUL xsum1, alpha1, a1
  192. daddiu A, A, 2 * SIZE /* ... + 2 * SIZE: diagonal element of the next pair */
  193. MUL xsum2, alpha1, a2
  194. dsubu II, M, IS
  195. MADD xsum1, xsum1, alpha2, a2 /* xsum1 = x[IS] * a1 + x[IS + 1] * a2 */
  196. MADD xsum2, xsum2, alpha2, a4 /* xsum2 = x[IS] * a2 + x[IS + 1] * a4 */
  197. daddiu II, II, - 2 /* II = M - IS - 2: rows below the 2x2 diagonal block */
  198. MUL alpha1, ALPHA, alpha1 /* alpha1 = ALPHA * x[IS] */
  199. daddiu YY, YY, 2 * SIZE /* YY = &y[IS + 2] */
  200. MUL alpha2, ALPHA, alpha2 /* alpha2 = ALPHA * x[IS + 1] */
  201. dsra I, II, 3 /* I = number of full 8-row blocks */
  202. blez I, .L15
  203. daddiu I, I, -1 /* delay slot: the last block is handled by .L13 */
  /* Preload the first four rows of the block for the pipelined loop. */
  204. LD x1, 0 * SIZE(XX)
  205. LD x2, 1 * SIZE(XX)
  206. LD x3, 2 * SIZE(XX)
  207. LD a1, 0 * SIZE(AO1)
  208. LD a2, 1 * SIZE(AO1)
  209. LD a5, 2 * SIZE(AO1)
  210. LD a6, 3 * SIZE(AO1)
  211. LD a3, 0 * SIZE(AO2)
  212. LD a4, 1 * SIZE(AO2)
  213. LD a7, 2 * SIZE(AO2)
  214. LD a8, 3 * SIZE(AO2)
  215. LD ysum1, 0 * SIZE(YY)
  216. LD ysum2, 1 * SIZE(YY)
  217. blez I, .L13 /* exactly one block: skip the pipelined loop */
  218. LD ysum3, 2 * SIZE(YY) /* delay slot (always executed) */
  219. .align 3
  /*
   * Software-pipelined loop: each iteration finishes 8 rows while loading
   * the operands of the following rows.  The values preloaded by the final
   * iteration are consumed by .L13.  Pointer increments are interleaved, so
   * offsets written after an increment are relative to the new pointer.
   */
  220. .L12:
  221. MADD ysum1, ysum1, alpha1, a1
  222. LD ysum4, 3 * SIZE(YY)
  223. MADD ysum2, ysum2, alpha1, a2
  224. LD x4, 3 * SIZE(XX)
  225. MADD xsum1, xsum1, x1, a1
  226. LD a1, 4 * SIZE(AO1)
  227. MADD xsum2, xsum2, x1, a3
  228. LD x1, 4 * SIZE(XX)
  229. MADD ysum1, ysum1, alpha2, a3
  230. LD a3, 4 * SIZE(AO2)
  231. MADD ysum2, ysum2, alpha2, a4
  232. daddiu I, I, -1
  233. MADD xsum1, xsum1, x2, a2
  234. LD a2, 5 * SIZE(AO1)
  235. MADD xsum2, xsum2, x2, a4
  236. LD a4, 5 * SIZE(AO2)
  237. ST ysum1, 0 * SIZE(YY)
  238. LD ysum1, 4 * SIZE(YY)
  239. ST ysum2, 1 * SIZE(YY)
  240. LD ysum2, 5 * SIZE(YY)
  241. MADD ysum3, ysum3, alpha1, a5
  242. nop
  243. MADD ysum4, ysum4, alpha1, a6
  244. LD x2, 5 * SIZE(XX)
  245. MADD xsum1, xsum1, x3, a5
  246. LD a5, 6 * SIZE(AO1)
  247. MADD xsum2, xsum2, x3, a7
  248. LD x3, 6 * SIZE(XX)
  249. MADD ysum3, ysum3, alpha2, a7
  250. LD a7, 6 * SIZE(AO2)
  251. MADD ysum4, ysum4, alpha2, a8
  252. daddiu XX, XX, 8 * SIZE /* XX now points at the next block */
  253. MADD xsum1, xsum1, x4, a6
  254. LD a6, 7 * SIZE(AO1)
  255. MADD xsum2, xsum2, x4, a8
  256. LD a8, 7 * SIZE(AO2)
  257. ST ysum3, 2 * SIZE(YY)
  258. LD ysum3, 6 * SIZE(YY)
  259. ST ysum4, 3 * SIZE(YY)
  260. LD ysum4, 7 * SIZE(YY)
  261. MADD ysum1, ysum1, alpha1, a1
  262. daddiu AO2, AO2, 8 * SIZE /* AO2 now points at the next block */
  263. MADD ysum2, ysum2, alpha1, a2
  264. LD x4,-1 * SIZE(XX) /* row 7 of the current block (XX already advanced) */
  265. MADD xsum1, xsum1, x1, a1
  266. LD a1, 8 * SIZE(AO1) /* AO1 not advanced yet: row 0 of the next block */
  267. MADD xsum2, xsum2, x1, a3
  268. LD x1, 0 * SIZE(XX)
  269. MADD ysum1, ysum1, alpha2, a3
  270. LD a3, 0 * SIZE(AO2)
  271. MADD ysum2, ysum2, alpha2, a4
  272. nop
  273. MADD xsum1, xsum1, x2, a2
  274. LD a2, 9 * SIZE(AO1)
  275. MADD xsum2, xsum2, x2, a4
  276. LD a4, 1 * SIZE(AO2)
  277. ST ysum1, 4 * SIZE(YY)
  278. LD ysum1, 8 * SIZE(YY) /* YY not advanced yet: row 0 of the next block */
  279. ST ysum2, 5 * SIZE(YY)
  280. LD ysum2, 9 * SIZE(YY)
  281. MADD ysum3, ysum3, alpha1, a5
  282. daddiu AO1, AO1, 8 * SIZE /* AO1 now points at the next block */
  283. MADD ysum4, ysum4, alpha1, a6
  284. LD x2, 1 * SIZE(XX)
  285. MADD xsum1, xsum1, x3, a5
  286. LD a5, 2 * SIZE(AO1)
  287. MADD xsum2, xsum2, x3, a7
  288. LD x3, 2 * SIZE(XX)
  289. MADD ysum3, ysum3, alpha2, a7
  290. LD a7, 2 * SIZE(AO2)
  291. MADD ysum4, ysum4, alpha2, a8
  292. daddiu YY, YY, 8 * SIZE /* YY now points at the next block */
  293. MADD xsum1, xsum1, x4, a6
  294. LD a6, 3 * SIZE(AO1)
  295. MADD xsum2, xsum2, x4, a8
  296. LD a8, 3 * SIZE(AO2)
  297. ST ysum3,-2 * SIZE(YY) /* row 6 of the block just finished */
  298. LD ysum3, 2 * SIZE(YY)
  299. bgtz I, .L12
  300. ST ysum4,-1 * SIZE(YY) /* delay slot: row 7 of the block just finished */
  301. .align 3
  /*
   * Last full 8-row block: same arithmetic as .L12, but nothing is loaded
   * beyond the block and the pointers are advanced once at the end.
   */
  302. .L13:
  303. MADD ysum1, ysum1, alpha1, a1
  304. LD ysum4, 3 * SIZE(YY)
  305. MADD ysum2, ysum2, alpha1, a2
  306. LD x4, 3 * SIZE(XX)
  307. MADD xsum1, xsum1, x1, a1
  308. LD a1, 4 * SIZE(AO1)
  309. MADD xsum2, xsum2, x1, a3
  310. LD x1, 4 * SIZE(XX)
  311. MADD ysum1, ysum1, alpha2, a3
  312. LD a3, 4 * SIZE(AO2)
  313. MADD ysum2, ysum2, alpha2, a4
  314. MADD xsum1, xsum1, x2, a2
  315. LD a2, 5 * SIZE(AO1)
  316. MADD xsum2, xsum2, x2, a4
  317. LD a4, 5 * SIZE(AO2)
  318. LD x2, 5 * SIZE(XX)
  319. ST ysum1, 0 * SIZE(YY)
  320. ST ysum2, 1 * SIZE(YY)
  321. LD ysum1, 4 * SIZE(YY)
  322. LD ysum2, 5 * SIZE(YY)
  323. MADD ysum3, ysum3, alpha1, a5
  324. MADD ysum4, ysum4, alpha1, a6
  325. MADD xsum1, xsum1, x3, a5
  326. LD a5, 6 * SIZE(AO1)
  327. MADD xsum2, xsum2, x3, a7
  328. LD x3, 6 * SIZE(XX)
  329. MADD ysum3, ysum3, alpha2, a7
  330. LD a7, 6 * SIZE(AO2)
  331. MADD ysum4, ysum4, alpha2, a8
  332. MADD xsum1, xsum1, x4, a6
  333. LD a6, 7 * SIZE(AO1)
  334. MADD xsum2, xsum2, x4, a8
  335. LD a8, 7 * SIZE(AO2)
  336. LD x4, 7 * SIZE(XX)
  337. ST ysum3, 2 * SIZE(YY)
  338. ST ysum4, 3 * SIZE(YY)
  339. LD ysum3, 6 * SIZE(YY)
  340. LD ysum4, 7 * SIZE(YY)
  341. MADD ysum1, ysum1, alpha1, a1
  342. MADD ysum2, ysum2, alpha1, a2
  343. MADD xsum1, xsum1, x1, a1
  344. MADD xsum2, xsum2, x1, a3
  345. MADD ysum1, ysum1, alpha2, a3
  346. MADD ysum2, ysum2, alpha2, a4
  347. MADD xsum1, xsum1, x2, a2
  348. MADD xsum2, xsum2, x2, a4
  349. MADD ysum3, ysum3, alpha1, a5
  350. MADD ysum4, ysum4, alpha1, a6
  351. MADD xsum1, xsum1, x3, a5
  352. MADD xsum2, xsum2, x3, a7
  353. MADD ysum3, ysum3, alpha2, a7
  354. daddiu XX, XX, 8 * SIZE
  355. MADD ysum4, ysum4, alpha2, a8
  356. daddiu AO1, AO1, 8 * SIZE
  357. MADD xsum1, xsum1, x4, a6
  358. daddiu AO2, AO2, 8 * SIZE
  359. MADD xsum2, xsum2, x4, a8
  360. ST ysum1, 4 * SIZE(YY)
  361. ST ysum2, 5 * SIZE(YY)
  362. ST ysum3, 6 * SIZE(YY)
  363. ST ysum4, 7 * SIZE(YY)
  364. daddiu YY, YY, 8 * SIZE
  365. .align 3
  /* ---- remainder: 4 rows when bit 2 of II is set ---- */
  366. .L15:
  367. andi I, II, 4
  368. NOP
  369. blez I, .L16
  370. NOP
  371. LD x1, 0 * SIZE(XX)
  372. LD x2, 1 * SIZE(XX)
  373. LD x3, 2 * SIZE(XX)
  374. LD x4, 3 * SIZE(XX)
  375. daddiu XX, XX, 4 * SIZE
  376. LD a1, 0 * SIZE(AO1)
  377. LD a2, 1 * SIZE(AO1)
  378. LD a5, 2 * SIZE(AO1)
  379. LD a6, 3 * SIZE(AO1)
  380. daddiu AO1, AO1, 4 * SIZE
  381. LD a3, 0 * SIZE(AO2)
  382. LD a4, 1 * SIZE(AO2)
  383. LD a7, 2 * SIZE(AO2)
  384. LD a8, 3 * SIZE(AO2)
  385. daddiu AO2, AO2, 4 * SIZE
  386. LD ysum1, 0 * SIZE(YY)
  387. LD ysum2, 1 * SIZE(YY)
  388. LD ysum3, 2 * SIZE(YY)
  389. LD ysum4, 3 * SIZE(YY)
  390. MADD ysum1, ysum1, alpha1, a1
  391. MADD ysum2, ysum2, alpha1, a2
  392. MADD xsum1, xsum1, x1, a1
  393. MADD xsum2, xsum2, x1, a3
  394. MADD ysum1, ysum1, alpha2, a3
  395. MADD ysum2, ysum2, alpha2, a4
  396. MADD xsum1, xsum1, x2, a2
  397. MADD xsum2, xsum2, x2, a4
  398. MADD ysum3, ysum3, alpha1, a5
  399. MADD ysum4, ysum4, alpha1, a6
  400. MADD xsum1, xsum1, x3, a5
  401. MADD xsum2, xsum2, x3, a7
  402. MADD ysum3, ysum3, alpha2, a7
  403. MADD ysum4, ysum4, alpha2, a8
  404. MADD xsum1, xsum1, x4, a6
  405. MADD xsum2, xsum2, x4, a8
  406. ST ysum1, 0 * SIZE(YY)
  407. ST ysum2, 1 * SIZE(YY)
  408. ST ysum3, 2 * SIZE(YY)
  409. ST ysum4, 3 * SIZE(YY)
  410. daddiu YY, YY, 4 * SIZE
  411. .align 3
  /* ---- remainder: 2 rows when bit 1 of II is set ---- */
  412. .L16:
  413. andi I, II, 2
  414. NOP
  415. blez I, .L17
  416. NOP
  417. LD x1, 0 * SIZE(XX)
  418. LD x2, 1 * SIZE(XX)
  419. daddiu XX, XX, 2 * SIZE
  420. LD a1, 0 * SIZE(AO1)
  421. LD a2, 1 * SIZE(AO1)
  422. daddiu AO1, AO1, 2 * SIZE
  423. LD a3, 0 * SIZE(AO2)
  424. LD a4, 1 * SIZE(AO2)
  425. daddiu AO2, AO2, 2 * SIZE
  426. LD ysum1, 0 * SIZE(YY)
  427. LD ysum2, 1 * SIZE(YY)
  428. MADD ysum1, ysum1, alpha1, a1
  429. MADD ysum2, ysum2, alpha1, a2
  430. MADD xsum1, xsum1, x1, a1
  431. MADD xsum2, xsum2, x1, a3
  432. MADD ysum1, ysum1, alpha2, a3
  433. MADD ysum2, ysum2, alpha2, a4
  434. MADD xsum1, xsum1, x2, a2
  435. MADD xsum2, xsum2, x2, a4
  436. ST ysum1, 0 * SIZE(YY)
  437. ST ysum2, 1 * SIZE(YY)
  438. daddiu YY, YY, 2 * SIZE
  439. .align 3
  /* ---- remainder: 1 row when M is odd ---- */
  /* Tests M rather than II: IS is always even and II = M - IS - 2, so both have the same low bit. */
  440. .L17:
  441. andi I, M, 1
  442. NOP
  443. blez I, .L19
  444. NOP
  445. LD x1, 0 * SIZE(XX)
  446. daddiu XX, XX, 1 * SIZE
  447. LD a1, 0 * SIZE(AO1)
  448. daddiu AO1, AO1, 1 * SIZE
  449. LD a3, 0 * SIZE(AO2)
  450. daddiu AO2, AO2, 1 * SIZE
  451. LD ysum1, 0 * SIZE(YY)
  452. MADD ysum1, ysum1, alpha1, a1
  453. MADD xsum1, xsum1, x1, a1
  454. MADD ysum1, ysum1, alpha2, a3
  455. MADD xsum2, xsum2, x1, a3
  456. ST ysum1, 0 * SIZE(YY)
  457. .align 3
  /* ---- fold the column accumulators into y[IS], y[IS + 1] and advance ---- */
  458. .L19:
  459. dsll TEMP, IS, BASE_SHIFT
  460. daddu TEMP, Y1, TEMP /* TEMP = &y[IS] */
  461. LD ysum1, 0 * SIZE(TEMP)
  462. LD ysum2, 1 * SIZE(TEMP)
  463. MADD ysum1, ysum1, ALPHA, xsum1 /* y[IS] += ALPHA * xsum1 */
  464. MADD ysum2, ysum2, ALPHA, xsum2 /* y[IS + 1] += ALPHA * xsum2 */
  465. ST ysum1, 0 * SIZE(TEMP)
  466. ST ysum2, 1 * SIZE(TEMP)
  467. daddiu TEMP, IS, 4
  468. slt TEMP, M, TEMP /* TEMP = (M < IS + 4) */
  469. beqz TEMP, .L11 /* another full column pair remains when M >= IS + 4 */
  470. daddiu IS, IS, 2 /* delay slot: IS += 2 whether or not the loop continues */
  471. .align 3
  /* ---- odd M: apply the final 1x1 diagonal element at A ---- */
  472. .L20:
  473. andi I, M, 1
  474. dsll TEMP, IS, BASE_SHIFT /* TEMP = byte offset of element IS */
  475. blez I, .L900 /* M even: no trailing column */
  476. daddu XX, X, TEMP /* delay slot: XX = &x[IS] */
  477. daddu YY, Y1, TEMP /* YY = &y[IS] */
  478. LD x1, 0 * SIZE(XX)
  479. LD ysum1, 0 * SIZE(YY)
  480. LD a1, 0 * SIZE(A)
  481. MUL xsum1, a1, x1
  482. MADD ysum1, ysum1, ALPHA, xsum1 /* y[IS] += ALPHA * a1 * x[IS] */
  483. ST ysum1, 0 * SIZE(YY)
  484. .align 3
  /* ---- copy the packed result back to the strided Y, if y was packed ---- */
  485. .L900:
  486. li IS, SIZE
  487. beq INCY, IS, .L999 /* y was updated in place: nothing to copy */
  488. NOP
  489. dsra I, M, 2 /* I = M / 4 */
  490. blez I, .L905
  491. NOP
  492. .align 3
  493. .L902:
  494. LD a1, 0 * SIZE(Y1)
  495. LD a2, 1 * SIZE(Y1)
  496. LD a3, 2 * SIZE(Y1)
  497. LD a4, 3 * SIZE(Y1)
  498. ST a1, 0 * SIZE(Y)
  499. daddu Y, Y, INCY
  500. ST a2, 0 * SIZE(Y)
  501. daddu Y, Y, INCY
  502. ST a3, 0 * SIZE(Y)
  503. daddu Y, Y, INCY
  504. ST a4, 0 * SIZE(Y)
  505. daddu Y, Y, INCY
  506. daddiu I, I, -1
  507. bgtz I, .L902
  508. daddiu Y1, Y1, 4 * SIZE /* delay slot: advance source */
  509. .align 3
  /* ---- copy back the remaining M % 4 elements ---- */
  510. .L905:
  511. andi I, M, 3
  512. blez I, .L999
  513. NOP
  514. .align 3
  515. .L906:
  516. LD a1, 0 * SIZE(Y1)
  517. daddiu Y1, Y1, 1 * SIZE
  518. ST a1, 0 * SIZE(Y)
  519. daddiu I, I, -1
  520. bgtz I, .L906
  521. daddu Y, Y, INCY /* delay slot: advance destination */
  522. .align 3
  /* ---- restore saved registers, release the frame and return ---- */
  523. .L999:
  524. LDARG $16, 0($sp)
  525. LDARG $17, 8($sp)
  526. LDARG $18, 16($sp)
  527. LDARG $19, 24($sp)
  528. j $31
  529. daddiu $sp, $sp, 32 /* delay slot: pop the 32-byte frame */
  530. EPILOGUE