You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemv_n.S 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531
  1. /***************************************************************************
  2. Copyright (c) 2021, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. /* Unused param dummy1 */
  /* Integer register aliases used by the routine below. */
  /* M: number of y elements / rows processed per column; N: number of columns. */
  30. #define M $r4
  31. #define N $r5
  /* A: pointer to the current column (pair) of the matrix; LDA: column stride,
     shifted left by BASE_SHIFT in the prologue before it is added to pointers. */
  32. #define A $r7
  33. #define LDA $r8
  /* X / INCX: x cursor and its stride (INCX is also shifted by BASE_SHIFT). */
  34. #define X $r9
  35. #define INCX $r10
  /* Y: caller's y vector; INCY: its stride, loaded from the stack at entry
     and then shifted by BASE_SHIFT. */
  36. #define Y $r11
  37. #define INCY $r6
  /* BUFFER: scratch area loaded from the stack at entry; holds a contiguous
     copy of y when INCY differs from SIZE. */
  38. #define BUFFER $r16
  /* YORIG: start of the contiguous y working area (Y itself, or BUFFER).
     It is reused as a scratch register at .L900. */
  39. #define YORIG $r18
  /* XX / YY: moving source/destination cursors for the copy and compute loops. */
  40. #define XX $r12
  41. #define YY $r13
  /* I: inner (row) loop counter; J: outer (column) loop counter. */
  42. #define I $r14
  43. #define J $r15
  /* AO1 / AO2: cursors into the first / second column being processed.
     $r23 and $r24 are stored to the stack in the prologue and reloaded at .L999. */
  44. #define AO1 $r23
  45. #define AO2 $r24
  /* Floating-point register aliases. */
  /* ALPHA: scalar multiplied into every x element before use. */
  46. #define ALPHA $f0
  /* a1-a8: matrix elements (a1-a4 come from AO1; a5-a8 come from AO2 in the
     two-column path). a1-a4 also serve as temporaries in the y copy loops. */
  47. #define a1 $f22
  48. #define a2 $f8
  49. #define a3 $f23
  50. #define a4 $f9
  51. #define a5 $f10
  52. #define a6 $f11
  53. #define a7 $f12
  54. #define a8 $f13
  /* x1 / x2: ALPHA-scaled x elements for the current column(s). */
  55. #define x1 $f14
  56. #define x2 $f15
  /* y1-y8: y elements loaded from the working area. */
  57. #define y1 $f16
  58. #define y2 $f17
  59. #define y3 $f3
  60. #define y4 $f1
  61. #define y5 $f2
  62. #define y6 $f4
  63. #define y7 $f5
  64. #define y8 $f6
  /* t1-t4: results of the unrolled 8-row loops, stored back through YY.
     t2-t4 ($f18-$f20) are saved/restored on the stack when __64BIT__ is not defined. */
  65. #define t1 $f7
  66. #define t2 $f18
  67. #define t3 $f19
  68. #define t4 $f20
  /*
   * Matrix-vector update, processed column by column.
   *
   * For every column j of the N columns reachable from A (stride LDA), the
   * routine loads x[j], scales it by ALPHA, and accumulates
   *     y[i] += (ALPHA * x[j]) * A[i, j]      for i = 0 .. M-1.
   * Columns are handled two at a time (.L11), plus one trailing column when
   * N is odd (.L21). Rows are handled in groups of 8, then 4, 2 and 1.
   *
   * If the byte stride of y (INCY << BASE_SHIFT) differs from SIZE, y is first
   * gathered into BUFFER (.L02/.L06), updated there, and scattered back to Y
   * at the end (.L900-.L906).
   *
   * LD/ST/MUL/MADD/LDARG/SDARG, SIZE and BASE_SHIFT come from common.h and are
   * not visible here. The comments below assume MADD d, a, b, c computes
   * d = a * b + c and MUL d, a, b computes d = a * b -- TODO confirm against
   * common.h.
   */
  69. PROLOGUE
  /* INCY and BUFFER are read from the stack at entry, before $sp is moved. */
  70. LDARG INCY, $sp, 0
  71. LDARG BUFFER, $sp, 8
  /* Reserve the local frame: 16 bytes, or 48 when __64BIT__ is not defined
     (the extra space holds $f18-$f20 below). */
  72. #ifdef __64BIT__
  73. addi.d $sp, $sp, -16
  74. #else
  75. addi.d $sp, $sp, -48
  76. #endif
  /* Save $r23/$r24, which are used as AO1/AO2. */
  77. SDARG $r23, $sp, 0
  78. SDARG $r24, $sp, 8
  /* Scale LDA by BASE_SHIFT so it can be added directly to pointers. */
  79. slli.d LDA, LDA, BASE_SHIFT
  80. #ifndef __64BIT__
  81. fst.d $f18, $sp, 16
  82. fst.d $f19, $sp, 24
  83. fst.d $f20, $sp, 32
  84. #endif
  85. slli.d INCX, INCX, BASE_SHIFT
  /* Nothing to do when M <= 0 or N <= 0. */
  86. bge $r0, M, .L999
  87. slli.d INCY, INCY, BASE_SHIFT
  88. bge $r0, N, .L999
  /* If the scaled INCY equals SIZE, work directly on Y (YORIG = Y). */
  89. li.d I, SIZE
  90. move YORIG, Y
  91. beq INCY, I, .L10
  /* Otherwise gather y into BUFFER and use BUFFER as the working area. */
  92. srai.d I, M, 2
  93. move YORIG, BUFFER
  94. move XX, Y
  95. move YY, BUFFER
  96. bge $r0, I, .L05
  97. .align 3
  /* Gather loop: four strided y elements per iteration, M >> 2 iterations. */
  98. .L02:
  99. LD a1, XX, 0 * SIZE
  100. add.d XX, XX, INCY
  101. LD a2, XX, 0 * SIZE
  102. add.d XX, XX, INCY
  103. LD a3, XX, 0 * SIZE
  104. add.d XX, XX, INCY
  105. LD a4, XX, 0 * SIZE
  106. add.d XX, XX, INCY
  107. ST a1, YY, 0 * SIZE
  108. ST a2, YY, 1 * SIZE
  109. ST a3, YY, 2 * SIZE
  110. ST a4, YY, 3 * SIZE
  111. addi.d I, I, -1
  112. addi.d YY, YY, 4 * SIZE
  113. blt $r0, I, .L02
  114. .align 3
  /* Gather the remaining M & 3 elements one at a time. */
  115. .L05:
  116. andi I, M, 3
  117. bge $r0, I, .L10
  118. .align 3
  119. .L06:
  120. LD a1, XX, 0 * SIZE
  121. add.d XX, XX, INCY
  122. ST a1, YY, 0 * SIZE
  123. addi.d I, I, -1
  124. addi.d YY, YY, 1 * SIZE
  125. blt $r0, I, .L06
  126. .align 3
  /* Outer loop over column pairs: J = N >> 1. */
  127. .L10:
  128. srai.d J, N, 1
  129. bge $r0, J, .L20
  130. .align 3
  /* Per column pair: fetch two x elements, set AO1 = A and AO2 = A + LDA,
     advance A by 2 * LDA, rewind YY to the start of the working area, and
     scale both x elements by ALPHA. */
  131. .L11:
  132. LD x1, X, 0 * SIZE
  133. add.d X, X, INCX
  134. LD x2, X, 0 * SIZE
  135. add.d X, X, INCX
  136. move AO1, A
  137. add.d AO2, A, LDA
  138. add.d A, AO2, LDA
  139. move YY, YORIG
  140. MUL x1, ALPHA, x1
  /* I = number of 8-row groups. */
  141. srai.d I, M, 3
  142. MUL x2, ALPHA, x2
  143. bge $r0, I, .L15
  /* Preload rows 0-3 of both columns and y[0..7] for the first group. */
  144. LD a1, AO1, 0 * SIZE
  145. LD y1, YY, 0 * SIZE
  146. LD a2, AO1, 1 * SIZE
  147. LD y2, YY, 1 * SIZE
  148. LD a3, AO1, 2 * SIZE
  149. LD y3, YY, 2 * SIZE
  150. LD a4, AO1, 3 * SIZE
  151. LD y4, YY, 3 * SIZE
  152. LD a5, AO2, 0 * SIZE
  153. LD y5, YY, 4 * SIZE
  154. LD a6, AO2, 1 * SIZE
  155. LD y6, YY, 5 * SIZE
  156. LD a7, AO2, 2 * SIZE
  157. LD y7, YY, 6 * SIZE
  158. LD a8, AO2, 3 * SIZE
  /* One group is always finished by the drain code at .L13, so the
     steady-state loop runs I - 1 times. */
  159. addi.d I, I, -1
  160. LD y8, YY, 7 * SIZE
  161. bge $r0, I, .L13
  162. .align 3
  /* Steady state: compute rows 0-3 while loading rows 4-7 and y[8..11], then
     compute rows 4-7 while loading rows 8-11 and y[12..15], which are rows 0-3
     and y[4..7] of the next group once the pointers advance by 8. */
  163. .L12:
  164. MADD t1, a1, x1, y1
  165. LD a1, AO1, 4 * SIZE
  166. MADD t2, a2, x1, y2
  167. LD a2, AO1, 5 * SIZE
  168. LD y1, YY, 8 * SIZE
  169. LD y2, YY, 9 * SIZE
  170. MADD t3, a3, x1, y3
  171. LD a3, AO1, 6 * SIZE
  172. MADD t4, a4, x1, y4
  173. LD a4, AO1, 7 * SIZE
  174. LD y3, YY, 10 * SIZE
  175. LD y4, YY, 11 * SIZE
  176. MADD t1, a5, x2, t1
  177. LD a5, AO2, 4 * SIZE
  178. MADD t2, a6, x2, t2
  179. LD a6, AO2, 5 * SIZE
  180. MADD t3, a7, x2, t3
  181. LD a7, AO2, 6 * SIZE
  182. MADD t4, a8, x2, t4
  183. LD a8, AO2, 7 * SIZE
  184. ST t1, YY, 0 * SIZE
  185. ST t2, YY, 1 * SIZE
  186. ST t3, YY, 2 * SIZE
  187. ST t4, YY, 3 * SIZE
  188. MADD t1, a1, x1, y5
  189. LD a1, AO1, 8 * SIZE
  190. MADD t2, a2, x1, y6
  191. LD a2, AO1, 9 * SIZE
  192. LD y5, YY, 12 * SIZE
  193. LD y6, YY, 13 * SIZE
  194. MADD t3, a3, x1, y7
  195. LD a3, AO1, 10 * SIZE
  196. MADD t4, a4, x1, y8
  197. LD a4, AO1, 11 * SIZE
  198. LD y7, YY, 14 * SIZE
  199. LD y8, YY, 15 * SIZE
  200. MADD t1, a5, x2, t1
  201. LD a5, AO2, 8 * SIZE
  202. MADD t2, a6, x2, t2
  203. LD a6, AO2, 9 * SIZE
  204. MADD t3, a7, x2, t3
  205. LD a7, AO2, 10 * SIZE
  206. MADD t4, a8, x2, t4
  207. LD a8, AO2, 11 * SIZE
  208. ST t1, YY, 4 * SIZE
  209. ST t2, YY, 5 * SIZE
  210. ST t3, YY, 6 * SIZE
  211. ST t4, YY, 7 * SIZE
  212. addi.d I, I, -1
  213. addi.d YY, YY, 8 * SIZE
  214. addi.d AO1, AO1, 8 * SIZE
  215. addi.d AO2, AO2, 8 * SIZE
  216. blt $r0, I, .L12
  217. .align 3
  /* Drain: finish the last 8-row group. Rows 4-7 are still loaded here, but
     nothing beyond offset 7 of the current group is read. */
  218. .L13:
  219. MADD t1, a1, x1, y1
  220. LD a1, AO1, 4 * SIZE
  221. MADD t2, a2, x1, y2
  222. LD a2, AO1, 5 * SIZE
  223. MADD t3, a3, x1, y3
  224. LD a3, AO1, 6 * SIZE
  225. MADD t4, a4, x1, y4
  226. LD a4, AO1, 7 * SIZE
  227. MADD t1, a5, x2, t1
  228. LD a5, AO2, 4 * SIZE
  229. MADD t2, a6, x2, t2
  230. LD a6, AO2, 5 * SIZE
  231. MADD t3, a7, x2, t3
  232. LD a7, AO2, 6 * SIZE
  233. MADD t4, a8, x2, t4
  234. LD a8, AO2, 7 * SIZE
  235. ST t1, YY, 0 * SIZE
  236. MADD t1, a1, x1, y5
  237. ST t2, YY, 1 * SIZE
  238. MADD t2, a2, x1, y6
  239. ST t3, YY, 2 * SIZE
  240. MADD t3, a3, x1, y7
  241. ST t4, YY, 3 * SIZE
  242. MADD t4, a4, x1, y8
  243. MADD t1, a5, x2, t1
  244. addi.d AO1, AO1, 8 * SIZE
  245. MADD t2, a6, x2, t2
  246. addi.d AO2, AO2, 8 * SIZE
  247. MADD t3, a7, x2, t3
  /* YY is advanced before the last four stores, hence the negative offsets. */
  248. addi.d YY, YY, 8 * SIZE
  249. MADD t4, a8, x2, t4
  250. ST t1, YY, -4 * SIZE
  251. ST t2, YY, -3 * SIZE
  252. ST t3, YY, -2 * SIZE
  253. ST t4, YY, -1 * SIZE
  254. .align 3
  /* Row remainder, two-column path: 4 rows if bit 2 of M is set. */
  255. .L15:
  256. andi I, M, 4
  257. bge $r0, I, .L16
  258. LD a1, AO1, 0 * SIZE
  259. LD y1, YY, 0 * SIZE
  260. LD a2, AO1, 1 * SIZE
  261. LD y2, YY, 1 * SIZE
  262. LD a3, AO1, 2 * SIZE
  263. LD y3, YY, 2 * SIZE
  264. LD a4, AO1, 3 * SIZE
  265. LD y4, YY, 3 * SIZE
  266. LD a5, AO2, 0 * SIZE
  267. MADD y1, a1, x1, y1
  268. LD a6, AO2, 1 * SIZE
  269. MADD y2, a2, x1, y2
  270. LD a7, AO2, 2 * SIZE
  271. MADD y3, a3, x1, y3
  272. LD a8, AO2, 3 * SIZE
  273. MADD y4, a4, x1, y4
  274. MADD y1, a5, x2, y1
  275. addi.d YY, YY, 4 * SIZE
  276. MADD y2, a6, x2, y2
  277. addi.d AO1, AO1, 4 * SIZE
  278. MADD y3, a7, x2, y3
  279. addi.d AO2, AO2, 4 * SIZE
  280. MADD y4, a8, x2, y4
  281. ST y1, YY, -4 * SIZE
  282. ST y2, YY, -3 * SIZE
  283. ST y3, YY, -2 * SIZE
  284. ST y4, YY, -1 * SIZE
  285. .align 3
  /* 2 rows if bit 1 of M is set. */
  286. .L16:
  287. andi I, M, 2
  288. bge $r0, I, .L17
  289. LD a1, AO1, 0 * SIZE
  290. LD y1, YY, 0 * SIZE
  291. LD a2, AO1, 1 * SIZE
  292. LD y2, YY, 1 * SIZE
  293. LD a5, AO2, 0 * SIZE
  294. LD a6, AO2, 1 * SIZE
  295. MADD y1, a1, x1, y1
  296. MADD y2, a2, x1, y2
  297. addi.d YY, YY, 2 * SIZE
  298. MADD y1, a5, x2, y1
  299. addi.d AO1, AO1, 2 * SIZE
  300. MADD y2, a6, x2, y2
  301. addi.d AO2, AO2, 2 * SIZE
  302. ST y1, YY, -2 * SIZE
  303. ST y2, YY, -1 * SIZE
  304. .align 3
  /* 1 row if bit 0 of M is set. */
  305. .L17:
  306. andi I, M, 1
  307. bge $r0, I, .L19
  308. LD y1, YY, 0 * SIZE
  309. LD a1, AO1, 0 * SIZE
  310. LD a5, AO2, 0 * SIZE
  311. MADD y1, a1, x1, y1
  312. MADD y1, a5, x2, y1
  313. ST y1, YY, 0 * SIZE
  314. .align 3
  /* Next column pair. */
  315. .L19:
  316. addi.d J, J, -1
  317. blt $r0, J, .L11
  318. .align 3
  /* Trailing single column when N is odd. */
  319. .L20:
  320. andi J, N, 1
  321. bge $r0, J, .L900
  322. .align 3
  /* Single-column path: same row structure as above, using only AO1 and x1. */
  323. .L21:
  324. LD x1, X, 0 * SIZE
  325. add.d X, X, INCX
  326. move YY, YORIG
  327. move AO1, A
  328. srai.d I, M, 3
  329. MUL x1, ALPHA, x1
  330. bge $r0, I, .L25
  /* Preload rows 0-3 of the column and y[0..7] for the first 8-row group. */
  331. LD a1, AO1, 0 * SIZE
  332. LD y1, YY, 0 * SIZE
  333. LD a2, AO1, 1 * SIZE
  334. LD y2, YY, 1 * SIZE
  335. LD a3, AO1, 2 * SIZE
  336. LD y3, YY, 2 * SIZE
  337. LD a4, AO1, 3 * SIZE
  338. LD y4, YY, 3 * SIZE
  339. LD y5, YY, 4 * SIZE
  340. LD y6, YY, 5 * SIZE
  341. LD y7, YY, 6 * SIZE
  342. addi.d I, I, -1
  343. LD y8, YY, 7 * SIZE
  344. bge $r0, I, .L23
  345. .align 3
  /* Steady state for one column: compute four rows while loading the next four. */
  346. .L22:
  347. MADD t1, a1, x1, y1
  348. LD a1, AO1, 4 * SIZE
  349. MADD t2, a2, x1, y2
  350. LD a2, AO1, 5 * SIZE
  351. LD y1, YY, 8 * SIZE
  352. LD y2, YY, 9 * SIZE
  353. MADD t3, a3, x1, y3
  354. LD a3, AO1, 6 * SIZE
  355. MADD t4, a4, x1, y4
  356. LD a4, AO1, 7 * SIZE
  357. LD y3, YY, 10 * SIZE
  358. LD y4, YY, 11 * SIZE
  359. ST t1, YY, 0 * SIZE
  360. ST t2, YY, 1 * SIZE
  361. ST t3, YY, 2 * SIZE
  362. ST t4, YY, 3 * SIZE
  363. MADD t1, a1, x1, y5
  364. LD a1, AO1, 8 * SIZE
  365. MADD t2, a2, x1, y6
  366. LD a2, AO1, 9 * SIZE
  367. LD y5, YY, 12 * SIZE
  368. LD y6, YY, 13 * SIZE
  369. MADD t3, a3, x1, y7
  370. LD a3, AO1, 10 * SIZE
  371. MADD t4, a4, x1, y8
  372. LD a4, AO1, 11 * SIZE
  373. LD y7, YY, 14 * SIZE
  374. LD y8, YY, 15 * SIZE
  375. ST t1, YY, 4 * SIZE
  376. ST t2, YY, 5 * SIZE
  377. ST t3, YY, 6 * SIZE
  378. ST t4, YY, 7 * SIZE
  379. addi.d I, I, -1
  380. addi.d YY, YY, 8 * SIZE
  381. addi.d AO1, AO1, 8 * SIZE
  382. blt $r0, I, .L22
  383. .align 3
  /* Drain: finish the last 8-row group of the single column. */
  384. .L23:
  385. MADD t1, a1, x1, y1
  386. LD a1, AO1, 4 * SIZE
  387. MADD t2, a2, x1, y2
  388. LD a2, AO1, 5 * SIZE
  389. MADD t3, a3, x1, y3
  390. LD a3, AO1, 6 * SIZE
  391. MADD t4, a4, x1, y4
  392. LD a4, AO1, 7 * SIZE
  393. ST t1, YY, 0 * SIZE
  394. MADD t1, a1, x1, y5
  395. ST t2, YY, 1 * SIZE
  396. MADD t2, a2, x1, y6
  397. ST t3, YY, 2 * SIZE
  398. MADD t3, a3, x1, y7
  399. ST t4, YY, 3 * SIZE
  400. MADD t4, a4, x1, y8
  401. ST t1, YY, 4 * SIZE
  402. ST t2, YY, 5 * SIZE
  403. ST t3, YY, 6 * SIZE
  404. ST t4, YY, 7 * SIZE
  405. addi.d AO1, AO1, 8 * SIZE
  406. addi.d YY, YY, 8 * SIZE
  407. .align 3
  /* Row remainder, single-column path: 4 rows if bit 2 of M is set. */
  408. .L25:
  409. andi I, M, 4
  410. bge $r0, I, .L26
  411. LD a1, AO1, 0 * SIZE
  412. LD y1, YY, 0 * SIZE
  413. LD a2, AO1, 1 * SIZE
  414. LD y2, YY, 1 * SIZE
  415. LD a3, AO1, 2 * SIZE
  416. LD y3, YY, 2 * SIZE
  417. LD a4, AO1, 3 * SIZE
  418. LD y4, YY, 3 * SIZE
  419. MADD y1, a1, x1, y1
  420. MADD y2, a2, x1, y2
  421. MADD y3, a3, x1, y3
  422. addi.d YY, YY, 4 * SIZE
  423. MADD y4, a4, x1, y4
  424. addi.d AO1, AO1, 4 * SIZE
  425. ST y1, YY, -4 * SIZE
  426. ST y2, YY, -3 * SIZE
  427. ST y3, YY, -2 * SIZE
  428. ST y4, YY, -1 * SIZE
  429. .align 3
  /* 2 rows if bit 1 of M is set. */
  430. .L26:
  431. andi I, M, 2
  432. bge $r0, I, .L27
  433. LD a1, AO1, 0 * SIZE
  434. LD y1, YY, 0 * SIZE
  435. LD a2, AO1, 1 * SIZE
  436. LD y2, YY, 1 * SIZE
  437. MADD y1, a1, x1, y1
  438. addi.d YY, YY, 2 * SIZE
  439. MADD y2, a2, x1, y2
  440. addi.d AO1, AO1, 2 * SIZE
  441. ST y1, YY, -2 * SIZE
  442. ST y2, YY, -1 * SIZE
  443. .align 3
  /* 1 row if bit 0 of M is set. */
  444. .L27:
  445. andi I, M, 1
  446. bge $r0, I, .L900
  447. LD y1, YY, 0 * SIZE
  448. LD a1, AO1, 0 * SIZE
  449. MADD y1, a1, x1, y1
  450. ST y1, YY, 0 * SIZE
  451. .align 3
  /* Write-back: if y was gathered into BUFFER (scaled INCY != SIZE), scatter
     the results back to Y. YORIG is no longer needed and is reused here to
     hold SIZE for the comparison. */
  452. .L900:
  453. li.d YORIG, SIZE
  454. srai.d I, M, 2
  455. beq INCY, YORIG, .L999
  456. move XX, BUFFER
  457. bge $r0, I, .L905
  458. .align 3
  /* Scatter loop: four elements per iteration, M >> 2 iterations. */
  459. .L902:
  460. LD a1, XX, 0 * SIZE
  461. LD a2, XX, 1 * SIZE
  462. LD a3, XX, 2 * SIZE
  463. LD a4, XX, 3 * SIZE
  464. ST a1, Y, 0 * SIZE
  465. add.d Y, Y, INCY
  466. ST a2, Y, 0 * SIZE
  467. add.d Y, Y, INCY
  468. ST a3, Y, 0 * SIZE
  469. add.d Y, Y, INCY
  470. ST a4, Y, 0 * SIZE
  471. add.d Y, Y, INCY
  472. addi.d I, I, -1
  473. addi.d XX, XX, 4 * SIZE
  474. blt $r0, I, .L902
  475. .align 3
  /* Scatter the remaining M & 3 elements one at a time. */
  476. .L905:
  477. andi I, M, 3
  478. bge $r0, I, .L999
  479. .align 3
  480. .L906:
  481. LD a1, XX, 0 * SIZE
  482. addi.d XX, XX, 1 * SIZE
  483. ST a1, Y, 0 * SIZE
  484. addi.d I, I, -1
  485. add.d Y, Y, INCY
  486. blt $r0, I, .L906
  487. .align 3
  /* Common exit: restore saved registers, release the frame and return. */
  488. .L999:
  489. LDARG $r23, $sp, 0
  490. LDARG $r24, $sp, 8
  491. #ifndef __64BIT__
  492. fld.d $f18, $sp, 16
  493. fld.d $f19, $sp, 24
  494. fld.d $f20, $sp, 32
  495. #endif
  496. #ifdef __64BIT__
  497. addi.d $sp, $sp, 16
  498. #else
  499. addi.d $sp, $sp, 48
  500. #endif
  /* NOTE(review): $r4 and $f0 are overwritten from $r17 and $f22 just before
     returning. $r17 is not written anywhere in the visible body, and $f22 (a1)
     only holds the last matrix/y element loaded. Presumably callers ignore the
     return value of this kernel -- confirm. */
  501. move $r4, $r17
  502. fmov.d $f0, $f22
  /* Return through the address held in $r1. */
  503. jirl $r0, $r1, 0x0
  504. EPILOGUE