You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

gemv_t.S 10 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436
  1. /***************************************************************************
  2. Copyright (c) 2021, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. /* Register aliases for the kernel that follows. The dummy1 parameter is unused and has no alias. */
  /* Operands. INCY and BUFFER are not taken from registers on entry: the kernel loads them from the stack. */
  30. #define M       $r4
  31. #define N       $r5
  32. #define A       $r7
  33. #define LDA     $r8
  34. #define X       $r9
  35. #define INCX    $r10
  36. #define Y       $r11
  37. #define INCY    $r6
  38. #define BUFFER  $r16
  /* Integer working registers: XORIG = start of the X data actually read, XX/YY = cursors, I/J = loop counters. */
  39. #define XORIG   $r18
  40. #define XX      $r12
  41. #define YY      $r13
  42. #define I       $r14
  43. #define J       $r15
  /* Pointers into the two matrix columns being processed ($r23/$r24 are saved and restored by the kernel). */
  44. #define AO1     $r23
  45. #define AO2     $r24
  /* Scalar multiplier applied when Y is updated. */
  46. #define ALPHA   $f0
  /* a1..a8: values loaded from memory (matrix elements; also X elements while copying and Y elements while updating). */
  47. #define a1      $f22
  48. #define a2      $f8
  49. #define a3      $f23
  50. #define a4      $f9
  51. #define a5      $f10
  52. #define a6      $f11
  53. #define a7      $f12
  54. #define a8      $f13
  /* y1..y4: running sums. */
  55. #define y1      $f14
  56. #define y2      $f15
  57. #define y3      $f16
  58. #define y4      $f17
  /* x1..x8: elements of the X vector. */
  59. #define x1      $f3
  60. #define x2      $f1
  61. #define x3      $f2
  62. #define x4      $f4
  63. #define x5      $f5
  64. #define x6      $f6
  65. #define x7      $f7
  66. #define x8      $f18
  /*
   * GEMV kernel, transposed form. For every column j of A (N columns, LDA
   * elements apart, M elements each) the kernel forms the dot product of that
   * column with X and then updates one element of Y:
   *     y[j] = dot * ALPHA + y[j]
   * Columns are handled two at a time (.L11), followed by a single-column
   * pass when N is odd (.L20).
   *
   * PROLOGUE, EPILOGUE, LDARG, SDARG, LD, ST, MADD, MOV, ADD, MTC, SIZE and
   * BASE_SHIFT come from common.h, which is not visible here. The notes below
   * assume LD/ST move one element of SIZE bytes, MADD d, a, b, c computes
   * a * b + c, and BASE_SHIFT is log2(SIZE) -- TODO confirm against common.h.
   */
  67. PROLOGUE
  /* INCY and BUFFER are read from the stack before $sp is moved, i.e. relative to the entry value of $sp. */
  68. LDARG INCY, $sp, 0
  69. LDARG BUFFER, $sp, 8
  /* Open a stack frame: 16 bytes when __64BIT__ is defined, 32 bytes otherwise. */
  70. #ifdef __64BIT__
  71. addi.d $sp, $sp, -16
  72. #else
  73. addi.d $sp, $sp, -32
  74. #endif
  /* y1 is initialised from the zero register $r0 (presumably yielding 0.0 -- depends on MTC). */
  75. MTC y1, $r0
  /* $r23 and $r24 (AO1, AO2) are saved here and restored at .L999. */
  76. SDARG $r23, $sp, 0
  77. SDARG $r24, $sp, 8
  /* LDA, INCX and INCY are shifted left by BASE_SHIFT (element counts to byte offsets under the assumption above). */
  78. slli.d LDA, LDA, BASE_SHIFT
  /* Builds without __64BIT__ also save $f18 (x8) in the larger frame. */
  79. #ifndef __64BIT__
  80. fst.d $f18, $sp, 16
  81. #endif
  82. slli.d INCX, INCX, BASE_SHIFT
  /* Leave immediately when M <= 0 or N <= 0. */
  83. bge $r0, M, .L999
  84. slli.d INCY, INCY, BASE_SHIFT
  85. bge $r0, N, .L999
  /* When the scaled INCX equals SIZE, X is read in place (XORIG = X) and the copy below is skipped. */
  86. li.d I, SIZE
  87. move XORIG, X
  88. beq INCX, I, .L10
  /* Otherwise X is gathered into BUFFER as a contiguous vector: XORIG = BUFFER, YY = write cursor, I = M >> 2 groups of four. */
  89. srai.d I, M, 2
  90. move XORIG, BUFFER
  91. move YY, BUFFER
  92. bge $r0, I, .L05
  93. .align 3
  /* .L02: copy four X elements per iteration, stepping X by INCX and YY by 4 * SIZE. */
  94. .L02:
  95. LD a1, X, 0 * SIZE
  96. add.d X, X, INCX
  97. LD a2, X, 0 * SIZE
  98. add.d X, X, INCX
  99. LD a3, X, 0 * SIZE
  100. add.d X, X, INCX
  101. LD a4, X, 0 * SIZE
  102. add.d X, X, INCX
  103. ST a1, YY, 0 * SIZE
  104. ST a2, YY, 1 * SIZE
  105. ST a3, YY, 2 * SIZE
  106. ST a4, YY, 3 * SIZE
  107. addi.d I, I, -1
  108. addi.d YY, YY, 4 * SIZE
  109. blt $r0, I, .L02
  110. .align 3
  /* .L05: the remaining M & 3 elements of X are copied one at a time in .L06. */
  111. .L05:
  112. andi I, M, 3
  113. bge $r0, I, .L10
  114. .align 3
  115. .L06:
  116. LD a1, X, 0 * SIZE
  117. add.d X, X, INCX
  118. ST a1, YY, 0 * SIZE
  119. addi.d I, I, -1
  120. addi.d YY, YY, 1 * SIZE
  121. blt $r0, I, .L06
  122. .align 3
  /* .L10: J = N >> 1 column pairs. From here on Y is the read cursor and YY the write cursor over the Y vector. */
  123. .L10:
  124. srai.d J, N, 1
  125. move YY, Y
  126. bge $r0, J, .L20
  127. .align 3
  /*
   * .L11: start of one column pair. AO1 = current column, AO2 = AO1 + LDA,
   * and A is advanced past both. y2..y4 are copied from y1, which was set
   * from $r0 at entry or at the end of the previous pair. I = M >> 3 blocks
   * of eight rows; XX restarts at XORIG.
   */
  128. .L11:
  129. move AO1, A
  130. MOV y2, y1
  131. add.d AO2, A, LDA
  132. MOV y3, y1
  133. add.d A, AO2, LDA
  134. MOV y4, y1
  135. srai.d I, M, 3
  136. move XX, XORIG
  137. bge $r0, I, .L15
  /* Preload rows 0..3 of both columns (a1,a3,a5,a7 from AO1; a2,a4,a6,a8 from AO2) and x1..x8 for the first block. */
  138. LD a1, AO1, 0 * SIZE
  139. LD x1, XX, 0 * SIZE
  140. LD a2, AO2, 0 * SIZE
  141. LD x2, XX, 1 * SIZE
  142. LD a3, AO1, 1 * SIZE
  143. LD x3, XX, 2 * SIZE
  144. LD a4, AO2, 1 * SIZE
  145. LD x4, XX, 3 * SIZE
  146. LD a5, AO1, 2 * SIZE
  147. LD x5, XX, 4 * SIZE
  148. LD a6, AO2, 2 * SIZE
  149. LD x6, XX, 5 * SIZE
  150. LD a7, AO1, 3 * SIZE
  151. LD x7, XX, 6 * SIZE
  152. LD a8, AO2, 3 * SIZE
  /* With only one block, skip the steady-state loop and go straight to the drain at .L13. */
  153. addi.d I, I, -1
  154. LD x8, XX, 7 * SIZE
  155. bge $r0, I, .L13
  156. .align 3
  /*
   * .L12: steady-state loop, eight rows per iteration. Each MADD consumes a
   * value loaded earlier and the register is immediately reloaded: rows 4..7
   * of the current block first, then rows 8..11 and x[8..15], which belong to
   * the next block. The loop is only entered/continued while another block
   * remains, and the last block is finished in .L13.
   * Accumulators: y1/y3 take even/odd rows of the AO1 column, y2/y4 take
   * even/odd rows of the AO2 column.
   */
  157. .L12:
  158. MADD y1, a1, x1, y1
  159. LD a1, AO1, 4 * SIZE
  160. MADD y2, a2, x1, y2
  161. LD a2, AO2, 4 * SIZE
  162. MADD y3, a3, x2, y3
  163. LD a3, AO1, 5 * SIZE
  164. MADD y4, a4, x2, y4
  165. LD a4, AO2, 5 * SIZE
  166. LD x1, XX, 8 * SIZE
  167. LD x2, XX, 9 * SIZE
  168. MADD y1, a5, x3, y1
  169. LD a5, AO1, 6 * SIZE
  170. MADD y2, a6, x3, y2
  171. LD a6, AO2, 6 * SIZE
  172. MADD y3, a7, x4, y3
  173. LD a7, AO1, 7 * SIZE
  174. MADD y4, a8, x4, y4
  175. LD a8, AO2, 7 * SIZE
  176. LD x3, XX, 10 * SIZE
  177. LD x4, XX, 11 * SIZE
  178. MADD y1, a1, x5, y1
  179. LD a1, AO1, 8 * SIZE
  180. MADD y2, a2, x5, y2
  181. LD a2, AO2, 8 * SIZE
  182. MADD y3, a3, x6, y3
  183. LD a3, AO1, 9 * SIZE
  184. MADD y4, a4, x6, y4
  185. LD a4, AO2, 9 * SIZE
  186. LD x5, XX, 12 * SIZE
  187. LD x6, XX, 13 * SIZE
  188. MADD y1, a5, x7, y1
  189. LD a5, AO1, 10 * SIZE
  190. MADD y2, a6, x7, y2
  191. LD a6, AO2, 10 * SIZE
  192. MADD y3, a7, x8, y3
  193. LD a7, AO1, 11 * SIZE
  194. MADD y4, a8, x8, y4
  195. LD a8, AO2, 11 * SIZE
  196. LD x7, XX, 14 * SIZE
  197. LD x8, XX, 15 * SIZE
  198. addi.d I, I, -1
  199. addi.d XX, XX, 8 * SIZE
  200. addi.d AO1, AO1, 8 * SIZE
  201. addi.d AO2, AO2, 8 * SIZE
  202. blt $r0, I, .L12
  203. .align 3
  /* .L13: drain. Finishes the last eight-row block: only rows 4..7 of A are still loaded (x5..x8 are already in registers), then the cursors step past the block. */
  204. .L13:
  205. MADD y1, a1, x1, y1
  206. LD a1, AO1, 4 * SIZE
  207. MADD y2, a2, x1, y2
  208. LD a2, AO2, 4 * SIZE
  209. MADD y3, a3, x2, y3
  210. LD a3, AO1, 5 * SIZE
  211. MADD y4, a4, x2, y4
  212. LD a4, AO2, 5 * SIZE
  213. MADD y1, a5, x3, y1
  214. LD a5, AO1, 6 * SIZE
  215. MADD y2, a6, x3, y2
  216. LD a6, AO2, 6 * SIZE
  217. MADD y3, a7, x4, y3
  218. LD a7, AO1, 7 * SIZE
  219. MADD y4, a8, x4, y4
  220. LD a8, AO2, 7 * SIZE
  221. MADD y1, a1, x5, y1
  222. MADD y2, a2, x5, y2
  223. MADD y3, a3, x6, y3
  224. MADD y4, a4, x6, y4
  225. MADD y1, a5, x7, y1
  226. addi.d XX, XX, 8 * SIZE
  227. MADD y2, a6, x7, y2
  228. addi.d AO1, AO1, 8 * SIZE
  229. MADD y3, a7, x8, y3
  230. addi.d AO2, AO2, 8 * SIZE
  231. MADD y4, a8, x8, y4
  232. .align 3
  /* .L15: if bit 2 of M is set, process one block of four rows for both columns (no preloading across blocks). */
  233. .L15:
  234. andi I, M, 4
  235. bge $r0, I, .L17
  236. LD a1, AO1, 0 * SIZE
  237. LD x1, XX, 0 * SIZE
  238. LD a2, AO2, 0 * SIZE
  239. LD a3, AO1, 1 * SIZE
  240. LD x2, XX, 1 * SIZE
  241. LD a4, AO2, 1 * SIZE
  242. LD a5, AO1, 2 * SIZE
  243. LD x3, XX, 2 * SIZE
  244. MADD y1, a1, x1, y1
  245. LD a6, AO2, 2 * SIZE
  246. MADD y2, a2, x1, y2
  247. LD a7, AO1, 3 * SIZE
  248. MADD y3, a3, x2, y3
  249. LD x4, XX, 3 * SIZE
  250. MADD y4, a4, x2, y4
  251. LD a8, AO2, 3 * SIZE
  252. MADD y1, a5, x3, y1
  253. MADD y2, a6, x3, y2
  254. addi.d XX, XX, 4 * SIZE
  255. MADD y3, a7, x4, y3
  256. addi.d AO1, AO1, 4 * SIZE
  257. MADD y4, a8, x4, y4
  258. addi.d AO2, AO2, 4 * SIZE
  259. .align 3
  /* .L17: fold the odd-row sums into y1 (AO1 column) and y2 (AO2 column); I = M & 3 rows are left for the scalar loop. */
  260. .L17:
  261. andi I, M, 3
  262. ADD y1, y1, y3
  263. ADD y2, y2, y4
  264. bge $r0, I, .L19
  265. .align 3
  /* .L18: one row per iteration for both columns. */
  266. .L18:
  267. LD x1, XX, 0 * SIZE
  268. LD a1, AO1, 0 * SIZE
  269. LD a2, AO2, 0 * SIZE
  270. addi.d I, I, -1
  271. addi.d XX, XX, 1 * SIZE
  272. addi.d AO1, AO1, 1 * SIZE
  273. addi.d AO2, AO2, 1 * SIZE
  274. MADD y1, a1, x1, y1
  275. MADD y2, a2, x1, y2
  276. blt $r0, I, .L18
  277. .align 3
  /*
   * .L19: update two elements of Y. They are loaded through Y, combined as
   * y1 * ALPHA + a1 and y2 * ALPHA + a2, and stored through YY; both cursors
   * step by INCY. y1 is re-initialised from $r0 for the next pair and the
   * pair counter J is decremented.
   */
  278. .L19:
  279. LD a1, Y, 0 * SIZE
  280. add.d Y, Y, INCY
  281. LD a2, Y, 0 * SIZE
  282. add.d Y, Y, INCY
  283. MADD a1, y1, ALPHA, a1
  284. addi.d J, J, -1
  285. MADD a2, y2, ALPHA, a2
  286. MTC y1, $r0
  287. ST a1, YY, 0 * SIZE
  288. add.d YY, YY, INCY
  289. ST a2, YY, 0 * SIZE
  290. add.d YY, YY, INCY
  291. blt $r0, J, .L11
  292. .align 3
  /*
   * .L20: leftover column when N is odd (J = N & 1). Same structure as the
   * pair path but with AO1 only: y1 accumulates even rows, y3 odd rows.
   * y3 is copied from y1, which was last set from $r0.
   */
  293. .L20:
  294. andi J, N, 1
  295. MOV y3, y1
  296. move AO1, A
  297. bge $r0, J, .L999
  298. srai.d I, M, 3
  299. move XX, XORIG
  300. bge $r0, I, .L25
  /* Preload rows 0..3 of the column and x1..x8 for the first eight-row block. */
  301. LD a1, AO1, 0 * SIZE
  302. LD x1, XX, 0 * SIZE
  303. LD a3, AO1, 1 * SIZE
  304. LD x2, XX, 1 * SIZE
  305. LD a5, AO1, 2 * SIZE
  306. LD x3, XX, 2 * SIZE
  307. LD a7, AO1, 3 * SIZE
  308. LD x4, XX, 3 * SIZE
  309. LD x5, XX, 4 * SIZE
  310. LD x6, XX, 5 * SIZE
  311. LD x7, XX, 6 * SIZE
  312. addi.d I, I, -1
  313. LD x8, XX, 7 * SIZE
  314. bge $r0, I, .L23
  315. .align 3
  /* .L22: steady-state loop for the single column; loads at offsets 8 and above fetch data for the next block. */
  316. .L22:
  317. MADD y1, a1, x1, y1
  318. LD a1, AO1, 4 * SIZE
  319. MADD y3, a3, x2, y3
  320. LD a3, AO1, 5 * SIZE
  321. LD x1, XX, 8 * SIZE
  322. LD x2, XX, 9 * SIZE
  323. MADD y1, a5, x3, y1
  324. LD a5, AO1, 6 * SIZE
  325. MADD y3, a7, x4, y3
  326. LD a7, AO1, 7 * SIZE
  327. LD x3, XX, 10 * SIZE
  328. LD x4, XX, 11 * SIZE
  329. MADD y1, a1, x5, y1
  330. LD a1, AO1, 8 * SIZE
  331. MADD y3, a3, x6, y3
  332. LD a3, AO1, 9 * SIZE
  333. LD x5, XX, 12 * SIZE
  334. LD x6, XX, 13 * SIZE
  335. MADD y1, a5, x7, y1
  336. LD a5, AO1, 10 * SIZE
  337. MADD y3, a7, x8, y3
  338. LD a7, AO1, 11 * SIZE
  339. LD x7, XX, 14 * SIZE
  340. LD x8, XX, 15 * SIZE
  341. addi.d I, I, -1
  342. addi.d XX, XX, 8 * SIZE
  343. addi.d AO1, AO1, 8 * SIZE
  344. blt $r0, I, .L22
  345. .align 3
  /* .L23: drain for the single column; completes the last eight-row block and steps the cursors past it. */
  346. .L23:
  347. MADD y1, a1, x1, y1
  348. LD a1, AO1, 4 * SIZE
  349. MADD y3, a3, x2, y3
  350. LD a3, AO1, 5 * SIZE
  351. MADD y1, a5, x3, y1
  352. LD a5, AO1, 6 * SIZE
  353. MADD y3, a7, x4, y3
  354. LD a7, AO1, 7 * SIZE
  355. MADD y1, a1, x5, y1
  356. MADD y3, a3, x6, y3
  357. MADD y1, a5, x7, y1
  358. MADD y3, a7, x8, y3
  359. addi.d XX, XX, 8 * SIZE
  360. addi.d AO1, AO1, 8 * SIZE
  361. .align 3
  /* .L25: if bit 2 of M is set, process one block of four rows. */
  362. .L25:
  363. andi I, M, 4
  364. bge $r0, I, .L27
  365. LD a1, AO1, 0 * SIZE
  366. LD x1, XX, 0 * SIZE
  367. LD a3, AO1, 1 * SIZE
  368. LD x2, XX, 1 * SIZE
  369. LD a5, AO1, 2 * SIZE
  370. LD x3, XX, 2 * SIZE
  371. MADD y1, a1, x1, y1
  372. LD a7, AO1, 3 * SIZE
  373. MADD y3, a3, x2, y3
  374. LD x4, XX, 3 * SIZE
  375. MADD y1, a5, x3, y1
  376. addi.d XX, XX, 4 * SIZE
  377. MADD y3, a7, x4, y3
  378. addi.d AO1, AO1, 4 * SIZE
  379. .align 3
  /* .L27: fold y3 into y1; the remaining M & 3 rows go through the scalar loop .L28. */
  380. .L27:
  381. andi I, M, 3
  382. ADD y1, y1, y3
  383. bge $r0, I, .L29
  384. .align 3
  385. .L28:
  386. LD x1, XX, 0 * SIZE
  387. LD a1, AO1, 0 * SIZE
  388. addi.d I, I, -1
  389. addi.d XX, XX, 1 * SIZE
  390. addi.d AO1, AO1, 1 * SIZE
  391. MADD y1, a1, x1, y1
  392. blt $r0, I, .L28
  393. .align 3
  /* .L29: update the last element of Y: load through Y, combine as y1 * ALPHA + a1, store through YY. */
  394. .L29:
  395. LD a1, Y, 0 * SIZE
  396. add.d Y, Y, INCY
  397. MADD a1, y1, ALPHA, a1
  398. ST a1, YY, 0 * SIZE
  399. add.d YY, YY, INCY
  400. .align 3
  /* .L999: common exit. Restore $r23/$r24 (and $f18 on builds without __64BIT__), then release the frame opened at entry. */
  401. .L999:
  402. LDARG $r23, $sp, 0
  403. LDARG $r24, $sp, 8
  404. #ifndef __64BIT__
  405. fld.d $f18, $sp, 16
  406. #endif
  407. #ifdef __64BIT__
  408. addi.d $sp, $sp, 16
  409. #else
  410. addi.d $sp, $sp, 32
  411. #endif
  /* NOTE(review): $r17 is never written in this routine, so $r4 receives whatever $r17 held on entry; $f0 receives the last value left in a1 ($f22). Presumably a shared epilogue template -- confirm that callers ignore the return value. */
  412. move $r4, $r17
  413. fmov.d $f0, $f22
  /* Jump to the address held in $r1 with the link result discarded into $r0 (function return). */
  414. jirl $r0, $r1, 0x0
  415. EPILOGUE