You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

cgemm_kernel_loongson3a_2x2.S 27 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468
  /* Build switches, helper macros and integer-register aliases used by the
     kernel below.  Roles are taken from how each alias is used in this file. */
  1. #define ASSEMBLER
  2. #include "common.h"
  3. #define FETCH ld /* used as FETCH $0, off(reg): a load whose destination is $0 (result discarded); presumably a prefetch touch */
  4. #define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) /* hand-encoded instruction word; in the visible code it is referenced only from commented-out lines */
  5. #define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) /* same encoding with major opcode 0x3A; NOTE(review): presumably the store counterpart - confirm */
  6. #define STACKSIZE 160 /* bytes subtracted from $sp in the prologue */
  7. #define M $4 /* I is derived from M (M/2 tiles, then M&1) */
  8. #define N $5 /* J is derived from N (N/2 panels, then N&1) */
  9. #define K $6 /* inner-loop length: L = K/4 unrolled passes plus K&3 remainder */
  10. #define A $9 /* base of the A operand; copied to AO */
  11. #define B $10 /* base of the B operand; copied to BO */
  12. #define C $11 /* base of the output; copied to CO1 */
  13. #define LDC $8 /* loaded from 0($sp) at entry, then shifted left by ZBASE_SHIFT */
  14. #define AO $12 /* running pointer into A */
  15. #define BO $13 /* running pointer into B */
  16. #define R12 12 /* bare register number matching AO ($12); appears as a gsLQC1 base in commented-out lines */
  17. #define R13 13 /* bare register number matching BO ($13) */
  18. #define I $2 /* tile counter along M */
  19. #define J $3 /* panel counter along N */
  20. #define L $7 /* inner-loop counter along K */
  21. #define CO1 $14 /* first output pointer (starts at C) */
  22. #define CO2 $15 /* second output pointer (starts at C + LDC) */
  23. #define PREA $16 /* address used by FETCH while streaming A; saved/clobbered $16 */
  24. #define PREB $17 /* address used by FETCH while streaming B; saved/clobbered $17 */
  25. #if defined(TRMMKERNEL) /* extra state only for the TRMM build */
  26. #define OFFSET $18 /* loaded from the caller stack in the prologue */
  27. #define KK $19 /* running offset derived from OFFSET */
  28. #define TEMP $20 /* scratch for pointer and trip-count arithmetic */
  29. #endif
  /* Floating-point register aliases.  a* hold values loaded through AO,
     b* hold values loaded through BO, c* are the accumulators that the setup
     code clears with MTC/MOV before each tile. */
  30. #define a1 $f0 /* a1-a4: first operand set loaded from AO */
  31. #define a2 $f1
  32. #define a3 $f2
  33. #define a4 $f3
  34. #define b1 $f4 /* b1-b4: first operand set loaded from BO */
  35. #define b2 $f5
  36. #define b3 $f6
  37. #define b4 $f7
  38. #define a5 $f8 /* a5-a8: second operand set, used by the unrolled loops */
  39. #define a6 $f9
  40. #define a7 $f10
  41. #define a8 $f11
  42. #define b5 $f12 /* b5-b8: second operand set, used by the unrolled loops */
  43. #define b6 $f13
  44. #define b7 $f15 /* skips $f14 (that is c11); same register as ALPHA_R */
  45. #define b8 $f16 /* same register as ALPHA_I */
  46. #define c11 $f14 /* c11-c44: sixteen accumulators; c11 is zeroed first and copied to the rest */
  47. #define c12 $f17
  48. #define c13 $f18
  49. #define c14 $f19
  50. #define c21 $f20
  51. #define c22 $f21
  52. #define c23 $f22
  53. #define c24 $f23
  54. #define c31 $f24
  55. #define c32 $f25
  56. #define c33 $f26
  57. #define c34 $f27
  58. #define c41 $f28
  59. #define c42 $f29
  60. #define c43 $f30
  61. #define c44 $f31
  62. #define F0 0 /* F0-F31: bare FP register numbers; in the visible code they appear only as gsLQC1 arguments in commented-out lines */
  63. #define F1 1
  64. #define F2 2
  65. #define F3 3
  66. #define F4 4
  67. #define F5 5
  68. #define F6 6
  69. #define F7 7
  70. #define F8 8
  71. #define F9 9
  72. #define F10 10
  73. #define F11 11
  74. #define F12 12
  75. #define F13 13
  76. #define F14 14
  77. #define F15 15
  78. #define F16 16
  79. #define F17 17
  80. #define F18 18
  81. #define F19 19
  82. #define F20 20
  83. #define F21 21
  84. #define F22 22
  85. #define F23 23
  86. #define F24 24
  87. #define F25 25
  88. #define F26 26
  89. #define F27 27
  90. #define F28 28
  91. #define F29 29
  92. #define F30 30
  93. #define F31 31
  94. #define ALPHA_R $f15 /* shares $f15 with b7: the prologue spills it to 128($sp) and the .Lx5 blocks reload it after the K loop */
  95. #define ALPHA_I $f16 /* shares $f16 with b8: spilled to 136($sp) and reloaded the same way */
  96. #################################
  97. ## MADD1 a*c -> accumulated in cX1
  98. ## MADD2 b*c -> accumulated in cX2
  99. ## MADD3 a*d -> accumulated in cX3
  100. ## MADD4 b*d -> accumulated in cX4
  101. ##################################
  /* a,b are the first/second word of an A element and c,d the first/second
     word of a B element (see the MADD1..MADD4 uses in the loops).  After the
     K loop the write-back code forms cX1 + cX4 and cX2 + cX3.
     Each group below picks which partial products are added (MADD) and which
     are subtracted (NMSUB).  NOTE(review): MADD/NMSUB come from common.h and
     are presumably multiply-add / negated multiply-subtract - confirm.
       group          cX1+cX4     cX2+cX3
       NN NT TN TT    ac - bd     bc + ad
       NR NC TR TC    ac + bd     bc - ad
       RN RT CN CT    ac + bd     ad - bc
       RR RC CR CC    ac - bd    -bc - ad                                  */
  102. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) /* only the b*d product is subtracted */
  103. #define MADD1 MADD
  104. #define MADD2 MADD
  105. #define MADD3 MADD
  106. #define MADD4 NMSUB
  107. #endif
  108. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) /* only the a*d product is subtracted */
  109. #define MADD1 MADD
  110. #define MADD2 MADD
  111. #define MADD3 NMSUB
  112. #define MADD4 MADD
  113. #endif
  114. #if defined(RN) || defined(RT) || defined(CN) || defined(CT) /* only the b*c product is subtracted */
  115. #define MADD1 MADD
  116. #define MADD2 NMSUB
  117. #define MADD3 MADD
  118. #define MADD4 MADD
  119. #endif
  120. #if defined(RR) || defined(RC) || defined(CR) || defined(CC) /* b*c, a*d and b*d are all subtracted */
  121. #define MADD1 MADD
  122. #define MADD2 NMSUB
  123. #define MADD3 NMSUB
  124. #define MADD4 NMSUB
  125. #endif
  /* Kernel entry: fetch the stack-passed arguments, allocate a STACKSIZE-byte
     frame, save the registers listed below, spill alpha, and compute J = N/2.
     NOTE(review): c43/c44 ($f30/$f31) are written by the tile setup code but
     are not saved in this prologue; the epilogue is outside this view -
     confirm against the ABI in use. */
  126. PROLOGUE
  127. LDARG LDC, 0($sp) # read LDC from the caller stack before the frame is allocated
  128. daddiu $sp, $sp, -STACKSIZE # allocate the local frame
  129. SDARG $16, 0($sp) # save $16 (used as PREA)
  130. SDARG $17, 8($sp) # save $17 (used as PREB)
  131. sdc1 $f24, 16($sp) # save $f24-$f29 (used as c31-c42)
  132. sdc1 $f25, 24($sp)
  133. sdc1 $f26, 32($sp)
  134. sdc1 $f27, 40($sp)
  135. sdc1 $f28, 48($sp)
  136. sdc1 $f29, 56($sp)
  137. #if defined(TRMMKERNEL) /* TRMM build: save $18-$20 and fetch OFFSET */
  138. SDARG $18, 64($sp) # save $18 (used as OFFSET)
  139. SDARG $19, 72($sp) # save $19 (used as KK)
  140. SDARG $20, 80($sp) # save $20 (used as TEMP)
  141. LDARG OFFSET, STACKSIZE + 8($sp) # caller stack slot at entry-$sp + 8
  142. #endif
  143. #ifndef __64BIT__ /* non-64-bit build additionally saves $f20-$f23 (c21-c24) */
  144. sdc1 $f20, 88($sp)
  145. sdc1 $f21, 96($sp)
  146. sdc1 $f22,104($sp)
  147. sdc1 $f23,112($sp)
  148. #endif
  149. dsra J, N, 1 # J=N/2
  150. ST ALPHA_R, 128($sp) # spill alpha_r only (its register doubles as b7); alpha_i is spilled below
  151. #if defined(TRMMKERNEL) && !defined(LEFT)
  152. neg KK, OFFSET # KK = -OFFSET
  153. #endif
  154. dsll LDC, LDC, ZBASE_SHIFT # LDC*SIZE*COMPSIZE
  155. blez J, .L20 # no column pairs: go to the N&1 handling at .L20
  156. ST ALPHA_I, 136($sp) # spill alpha_i (its register doubles as b8); NOTE(review): presumably a branch delay slot, so it runs on both paths - confirm noreorder
  /* .L10: head of the outer loop over pairs of output columns (J counts down
     from N/2; the loop closes with bgtz J, .L10 further below).  Sets up the
     output pointers, the tile counter I and the two prefetch offsets. */
  157. .align 5
  158. .L10:
  159. #if defined(TRMMKERNEL) && defined(LEFT)
  160. move KK, OFFSET # restart KK from OFFSET for every panel
  161. #endif
  162. daddiu J, J, -1 # one panel consumed
  163. dsra I, M, 1 # I=M/2
  164. dsll PREB, K, 1 + ZBASE_SHIFT # PREB = K << (1 + ZBASE_SHIFT); B is added later at .L11 / .L30
  165. dsll PREA, K, 1 + ZBASE_SHIFT # PREA = K << (1 + ZBASE_SHIFT)
  166. move CO1, C # CO1 = C
  167. daddu CO2, C, LDC # CO2 = C + LDC
  168. move AO, A # Reset AO
  169. blez I, .L30 # no 2-wide tiles along M: go to the M&1 handling at .L30
  170. daddu PREA, PREA, A # PREA = A + (K << (1 + ZBASE_SHIFT)); NOTE(review): presumably a branch delay slot - confirm
  /* .L11: per-tile setup for the main tile loop.  Positions AO/BO, zeroes the
     sixteen accumulators (c11 via MTC, the rest copied from c11), preloads
     a1-a4 / b1-b4, touches the output locations through CO1/CO2, finishes
     PREB, and computes L = number of 4x-unrolled K passes.  Falls into .L12,
     or branches to .L15 when L <= 0. */
  171. .L11:
  172. #if defined(TRMMKERNEL) /* TRMM build: start offset and trip count depend on KK */
  173. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  174. move BO, B # start at the beginning of the B panel
  175. #else
  176. dsll TEMP, KK, 1 + ZBASE_SHIFT # TEMP = KK << (1 + ZBASE_SHIFT)
  177. daddu AO, AO, TEMP # skip that many bytes in A
  178. daddu BO, B, TEMP # and the same amount in B
  179. #endif
  180. MTC $0, c11 # Clear results regs
  181. LD a1, 0 * SIZE(AO)
  182. MOV c12, c11
  183. LD a2, 1 * SIZE(AO)
  184. MOV c13, c11
  185. LD b1, 0 * SIZE(BO)
  186. MOV c14, c11
  187. LD b2, 1 * SIZE(BO)
  188. MOV c21, c11
  189. LD a3, 2 * SIZE(AO)
  190. MOV c22, c11
  191. LD a4, 3 * SIZE(AO)
  192. MOV c23, c11
  193. LD b3, 2 * SIZE(BO)
  194. MOV c24, c11
  195. LD b4, 3 * SIZE(BO)
  196. FETCH $0, 0 * SIZE(CO2) # touch the output locations (loads into $0, value discarded)
  197. MOV c31, c11
  198. MOV c32, c11
  199. FETCH $0, 0 * SIZE(CO1)
  200. MOV c33, c11
  201. MOV c34, c11
  202. FETCH $0, 4 * SIZE(CO2)
  203. MOV c41, c11
  204. MOV c42, c11
  205. FETCH $0, 4 * SIZE(CO1)
  206. MOV c43, c11
  207. MOV c44, c11
  208. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  209. dsubu TEMP, K, KK # TEMP = K - KK
  210. #elif defined(LEFT)
  211. daddiu TEMP, KK, 2 # TEMP = KK + 2
  212. #else
  213. daddiu TEMP, KK, 2 # TEMP = KK + 2 (same value as the LEFT case)
  214. #endif
  215. dsra L, TEMP, 2 # L = TEMP / 4 unrolled passes; .L15 handles TEMP & 3
  216. daddu PREB, PREB, B # PREB = B + (K << (1 + ZBASE_SHIFT))
  217. blez L, .L15 # fewer than 4 steps: skip the unrolled loop
  218. NOP
  219. #else /* plain GEMM build: always the full K range from the start of B */
  220. dsra L, K, 2 # Unroll K 4 times
  221. move BO, B
  222. MTC $0, c11 # Clear results regs
  223. LD a1, 0 * SIZE(AO)
  224. MOV c12, c11
  225. LD a2, 1 * SIZE(AO)
  226. MOV c13, c11
  227. LD b1, 0 * SIZE(BO)
  228. MOV c14, c11
  229. LD b2, 1 * SIZE(BO)
  230. MOV c21, c11
  231. LD a3, 2 * SIZE(AO)
  232. MOV c22, c11
  233. LD a4, 3 * SIZE(AO)
  234. MOV c23, c11
  235. LD b3, 2 * SIZE(BO)
  236. MOV c24, c11
  237. LD b4, 3 * SIZE(BO)
  238. MOV c31, c11
  239. MOV c32, c11
  240. FETCH $0, 0 * SIZE(CO2) # touch the output locations (loads into $0, value discarded)
  241. MOV c33, c11
  242. MOV c34, c11
  243. FETCH $0, 0 * SIZE(CO1)
  244. MOV c41, c11
  245. MOV c42, c11
  246. FETCH $0, 4 * SIZE(CO2)
  247. MOV c43, c11
  248. NOP
  249. FETCH $0, 4 * SIZE(CO1)
  250. daddu PREB, PREB, B # PREB = B + (K << (1 + ZBASE_SHIFT))
  251. blez L, .L15 # fewer than 4 steps: skip the unrolled loop
  252. MOV c44, c11 # last accumulator clear; NOTE(review): presumably a branch delay slot, so it runs on both paths - confirm
  253. #endif
  254. .align 5
  255. .L12:
  256. LD a5, 4 * SIZE(AO)
  257. LD a6, 5 * SIZE(AO)
  258. MADD1 c11, c11, a1, b1 # axc A1xB1
  259. MADD3 c13, c13, a1, b2 # axd
  260. LD b5, 4 * SIZE(BO)
  261. LD b6, 5 * SIZE(BO)
  262. MADD2 c12, c12, a2, b1 # bxc
  263. MADD4 c14, c14, a2, b2 # bxd
  264. LD a7, 6 * SIZE(AO)
  265. LD a8, 7 * SIZE(AO)
  266. MADD1 c21, c21, a3, b1 # A2xB1
  267. MADD3 c23, c23, a3, b2
  268. LD b7, 6 * SIZE(BO)
  269. LD b8, 7 * SIZE(BO)
  270. MADD2 c22, c22, a4, b1
  271. MADD4 c24, c24, a4, b2
  272. FETCH $0, 4 * SIZE(PREA)
  273. FETCH $0, 4 * SIZE(PREB)
  274. MADD1 c31, c31, a1, b3 # A1xB2
  275. MADD3 c33, c33, a1, b4
  276. MADD2 c32, c32, a2, b3
  277. MADD4 c34, c34, a2, b4
  278. MADD1 c41, c41, a3, b3 # A2xB2
  279. MADD3 c43, c43, a3, b4
  280. MADD2 c42, c42, a4, b3
  281. MADD4 c44, c44, a4, b4
  282. LD a1, 8 * SIZE(AO)
  283. LD a2, 9 * SIZE(AO)
  284. MADD1 c11, c11, a5, b5 # axc A1xB1
  285. MADD3 c13, c13, a5, b6 # axd
  286. LD b1, 8 * SIZE(BO)
  287. LD b2, 9 * SIZE(BO)
  288. MADD2 c12, c12, a6, b5 # bxc
  289. MADD4 c14, c14, a6, b6 # bxd
  290. LD a3, 10 * SIZE(AO)
  291. LD a4, 11 * SIZE(AO)
  292. MADD1 c21, c21, a7, b5 # A2xB1
  293. MADD3 c23, c23, a7, b6
  294. LD b3, 10 * SIZE(BO)
  295. LD b4, 11 * SIZE(BO)
  296. MADD2 c22, c22, a8, b5
  297. MADD4 c24, c24, a8, b6
  298. FETCH $0, 8 * SIZE(PREA)
  299. FETCH $0, 8 * SIZE(PREB)
  300. MADD1 c31, c31, a5, b7 # A1xB2
  301. MADD3 c33, c33, a5, b8
  302. MADD2 c32, c32, a6, b7
  303. MADD4 c34, c34, a6, b8
  304. MADD1 c41, c41, a7, b7 # A2xB2
  305. MADD3 c43, c43, a7, b8
  306. MADD2 c42, c42, a8, b7
  307. MADD4 c44, c44, a8, b8
  308. LD a5, 12 * SIZE(AO)
  309. LD a6, 13 * SIZE(AO)
  310. MADD1 c11, c11, a1, b1 # axc A1xB1
  311. MADD3 c13, c13, a1, b2 # axd
  312. LD b5, 12 * SIZE(BO)
  313. LD b6, 13 * SIZE(BO)
  314. MADD2 c12, c12, a2, b1 # bxc
  315. MADD4 c14, c14, a2, b2 # bxd
  316. LD a7, 14 * SIZE(AO)
  317. LD a8, 15 * SIZE(AO)
  318. MADD1 c21, c21, a3, b1 # A2xB1
  319. MADD3 c23, c23, a3, b2
  320. LD b7, 14 * SIZE(BO)
  321. LD b8, 15 * SIZE(BO)
  322. MADD2 c22, c22, a4, b1
  323. MADD4 c24, c24, a4, b2
  324. FETCH $0, 12 * SIZE(PREA)
  325. MADD1 c31, c31, a1, b3 # A1xB2
  326. MADD3 c33, c33, a1, b4
  327. daddiu L, L, -1
  328. FETCH $0, 12 * SIZE(PREB)
  329. MADD2 c32, c32, a2, b3
  330. MADD4 c34, c34, a2, b4
  331. daddiu AO, AO, 16 * SIZE
  332. daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx
  333. MADD1 c41, c41, a3, b3 # A2xB2
  334. MADD3 c43, c43, a3, b4
  335. daddu PREA, PREA, 16 * SIZE
  336. MADD2 c42, c42, a4, b3
  337. MADD4 c44, c44, a4, b4
  338. daddu PREB, PREB, 16 * SIZE
  339. LD a1, 0 * SIZE(AO)
  340. LD a2, 1 * SIZE(AO)
  341. MADD1 c11, c11, a5, b5 # axc A1xB1
  342. MADD3 c13, c13, a5, b6 # axd
  343. LD b1, 0 * SIZE(BO)
  344. LD b2, 1 * SIZE(BO)
  345. MADD2 c12, c12, a6, b5 # bxc
  346. MADD4 c14, c14, a6, b6 # bxd
  347. LD a3, 2 * SIZE(AO)
  348. LD a4, 3 * SIZE(AO)
  349. MADD1 c21, c21, a7, b5 # A2xB1
  350. MADD3 c23, c23, a7, b6
  351. LD b3, 2 * SIZE(BO)
  352. LD b4, 3 * SIZE(BO)
  353. MADD2 c22, c22, a8, b5
  354. MADD4 c24, c24, a8, b6
  355. FETCH $0, 0 * SIZE(PREA)
  356. FETCH $0, 0 * SIZE(PREB)
  357. MADD1 c31, c31, a5, b7 # A1xB2
  358. MADD3 c33, c33, a5, b8
  359. MADD2 c32, c32, a6, b7
  360. MADD4 c34, c34, a6, b8
  361. MADD1 c41, c41, a7, b7 # A2xB2
  362. MADD3 c43, c43, a7, b8
  363. MADD2 c42, c42, a8, b7
  364. bgtz L, .L12
  365. MADD4 c44, c44, a8, b8
  366. .align 5
  367. .L15:
  368. #ifndef TRMMKERNEL
  369. andi L, K, 3
  370. LD ALPHA_R, 128($sp)
  371. #else
  372. andi L, TEMP, 3
  373. LD ALPHA_R, 128($sp)
  374. #endif
  375. blez L, .L18
  376. LD ALPHA_I, 136($sp)
  377. .align 5
  378. .L16:
  379. daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx
  380. daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx
  381. MADD1 c11, c11, a1, b1 # axc A1xB1
  382. MADD3 c13, c13, a1, b2 # axd
  383. daddiu PREA, PREA, 4 * SIZE
  384. daddiu PREB, PREB, 4 * SIZE
  385. MADD2 c12, c12, a2, b1 # bxc
  386. MADD4 c14, c14, a2, b2 # bxd
  387. MADD1 c21, c21, a3, b1 # A2xB1
  388. MADD3 c23, c23, a3, b2
  389. MADD2 c22, c22, a4, b1
  390. MADD4 c24, c24, a4, b2
  391. FETCH $0, 0 * SIZE(PREA)
  392. MADD1 c31, c31, a1, b3 # A1xB2
  393. MADD3 c33, c33, a1, b4
  394. daddiu L, L, -1
  395. MADD2 c32, c32, a2, b3
  396. MADD4 c34, c34, a2, b4
  397. FETCH $0, 0 * SIZE(PREB)
  398. MADD1 c41, c41, a3, b3 # A2xB2
  399. MADD3 c43, c43, a3, b4
  400. MADD2 c42, c42, a4, b3
  401. MADD4 c44, c44, a4, b4
  402. LD a1, 0 * SIZE(AO)
  403. LD a2, 1 * SIZE(AO)
  404. LD b1, 0 * SIZE(BO)
  405. LD b2, 1 * SIZE(BO)
  406. LD a3, 2 * SIZE(AO)
  407. LD a4, 3 * SIZE(AO)
  408. LD b3, 2 * SIZE(BO)
  409. LD b4, 3 * SIZE(BO)
  410. bgtz L, .L16
  411. NOP
  412. .L18:
  413. #ifndef TRMMKERNEL
  414. ADD c11, c14, c11
  415. LD a1, 0 * SIZE(CO1)
  416. ADD c12, c13, c12
  417. LD a2, 1 * SIZE(CO1)
  418. ADD c21, c24, c21
  419. LD b1, 2 * SIZE(CO1)
  420. ADD c22, c23, c22
  421. LD b2, 3 * SIZE(CO1)
  422. ADD c31, c34, c31
  423. LD a3, 0 * SIZE(CO2)
  424. ADD c32, c33, c32
  425. LD a4, 1 * SIZE(CO2)
  426. ADD c41, c44, c41
  427. LD b3, 2 * SIZE(CO2)
  428. ADD c42, c43, c42
  429. LD b4, 3 * SIZE(CO2)
  430. daddiu I, I, -1
  431. MADD a1, a1, ALPHA_R, c11
  432. MADD a2, a2, ALPHA_R, c12
  433. MADD b1, b1, ALPHA_R, c21
  434. MADD b2, b2, ALPHA_R, c22
  435. NMSUB a1, a1, ALPHA_I, c12
  436. MADD a2, a2, ALPHA_I, c11
  437. NMSUB b1, b1, ALPHA_I, c22
  438. MADD b2, b2, ALPHA_I, c21
  439. MADD a3, a3, ALPHA_R, c31
  440. MADD a4, a4, ALPHA_R, c32
  441. ST a1, 0 * SIZE(CO1)
  442. MADD b3, b3, ALPHA_R, c41
  443. MADD b4, b4, ALPHA_R, c42
  444. ST a2, 1 * SIZE(CO1)
  445. NMSUB a3, a3, ALPHA_I, c32
  446. MADD a4, a4, ALPHA_I, c31
  447. ST b1, 2 * SIZE(CO1)
  448. NMSUB b3, b3, ALPHA_I, c42
  449. MADD b4, b4, ALPHA_I, c41
  450. ST b2, 3 * SIZE(CO1)
  451. ST a3, 0 * SIZE(CO2)
  452. ST a4, 1 * SIZE(CO2)
  453. ST b3, 2 * SIZE(CO2)
  454. ST b4, 3 * SIZE(CO2)
  455. #else
  456. ADD c11, c14, c11
  457. ADD c12, c13, c12
  458. ADD c21, c24, c21
  459. ADD c22, c23, c22
  460. ADD c31, c34, c31
  461. ADD c32, c33, c32
  462. ADD c41, c44, c41
  463. ADD c42, c43, c42
  464. daddiu I, I, -1
  465. MUL a1, ALPHA_R, c11
  466. MUL a2, ALPHA_R, c12
  467. MUL b1, ALPHA_R, c21
  468. MUL b2, ALPHA_R, c22
  469. NMSUB a1, a1, ALPHA_I, c12
  470. MADD a2, a2, ALPHA_I, c11
  471. NMSUB b1, b1, ALPHA_I, c22
  472. MADD b2, b2, ALPHA_I, c21
  473. MUL a3, ALPHA_R, c31
  474. MUL a4, ALPHA_R, c32
  475. MUL b3, ALPHA_R, c41
  476. MUL b4, ALPHA_R, c42
  477. NMSUB a3, a3, ALPHA_I, c32
  478. MADD a4, a4, ALPHA_I, c31
  479. NMSUB b3, b3, ALPHA_I, c42
  480. MADD b4, b4, ALPHA_I, c41
  481. ST a1, 0 * SIZE(CO1)
  482. ST a2, 1 * SIZE(CO1)
  483. ST b1, 2 * SIZE(CO1)
  484. ST b2, 3 * SIZE(CO1)
  485. ST a3, 0 * SIZE(CO2)
  486. ST a4, 1 * SIZE(CO2)
  487. ST b3, 2 * SIZE(CO2)
  488. ST b4, 3 * SIZE(CO2)
  489. #if ( defined(LEFT) && defined(TRANSA)) || \
  490. (!defined(LEFT) && !defined(TRANSA))
  491. dsubu TEMP, K, KK
  492. #ifdef LEFT
  493. daddiu TEMP, TEMP, -2
  494. #else
  495. daddiu TEMP, TEMP, -2
  496. #endif
  497. dsll TEMP, TEMP, 1 + ZBASE_SHIFT
  498. daddu AO, AO, TEMP
  499. daddu BO, BO, TEMP
  500. #endif
  501. #ifdef LEFT
  502. daddiu KK, KK, 2
  503. #endif
  504. #endif
  /* End of one tile: reset the B prefetch offset, step both output pointers
     past the four words just stored through each, and loop while tiles remain. */
  505. dsll PREB, K, 1 + ZBASE_SHIFT # reset PREB = K << (1 + ZBASE_SHIFT); B is added again at .L11 or .L30
  506. daddiu CO1,CO1, 4 * SIZE # CO1 past the 4 words stored above
  507. bgtz I, .L11 # more tiles along M
  508. daddiu CO2,CO2, 4 * SIZE # CO2 likewise; NOTE(review): presumably a branch delay slot, so it runs on both paths - confirm
  /* .L30: entered when the 2-wide tiles along M are done (or there were
     none).  Tests M & 1, advances C by 2 * LDC in two steps, and finishes
     PREB; branches to .L19 when there is no leftover row. */
  509. .align 5
  510. .L30:
  511. andi I, M, 1 # I = 1 if one row of M is left over
  512. daddu C, C, LDC # first of two C += LDC steps
  513. daddu PREB, PREB, B # PREB = B + (K << (1 + ZBASE_SHIFT))
  514. blez I, .L19 # M even: nothing left for this panel
  515. daddu C, C, LDC # second C += LDC step; NOTE(review): presumably a branch delay slot, so it runs on both paths - confirm
  516. #if defined(TRMMKERNEL)
  517. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  518. move BO, B
  519. #else
  520. dsll L, KK, ZBASE_SHIFT # MR=1
  521. dsll TEMP, KK, 1 + ZBASE_SHIFT # NR=2
  522. daddu AO, AO, L
  523. daddu BO, B, TEMP
  524. #endif
  525. LD a1, 0 * SIZE(AO)
  526. LD a2, 1 * SIZE(AO)
  527. MTC $0, c11 # Clear results regs
  528. MOV c12, c11
  529. LD b1, 0 * SIZE(BO)
  530. LD b2, 1 * SIZE(BO)
  531. MOV c13, c11
  532. MOV c14, c11
  533. LD b3, 2 * SIZE(BO)
  534. LD b4, 3 * SIZE(BO)
  535. MOV c31, c11
  536. MOV c32, c11
  537. FETCH $0, 0 * SIZE(PREB)
  538. MOV c33, c11
  539. MOV c34, c11
  540. FETCH $0, 0 * SIZE(CO1)
  541. FETCH $0, 0 * SIZE(CO2)
  542. FETCH $0, 4 * SIZE(CO1)
  543. FETCH $0, 4 * SIZE(CO2)
  544. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  545. dsubu TEMP, K, KK
  546. #elif defined(LEFT)
  547. daddiu TEMP, KK, 1 # MR=1
  548. #else
  549. daddiu TEMP, KK, 2 # NR=2
  550. #endif
  551. dsra L, TEMP, 2
  552. blez L, .L35
  553. NOP
  554. #else
  555. LD a1, 0 * SIZE(AO)
  556. LD a2, 1 * SIZE(AO)
  557. dsra L, K, 2 # Unroll K 4 times
  558. move BO, B
  559. LD b1, 0 * SIZE(BO)
  560. LD b2, 1 * SIZE(BO)
  561. MTC $0, c11 # Clear results regs
  562. MOV c12, c11
  563. LD b3, 2 * SIZE(BO)
  564. LD b4, 3 * SIZE(BO)
  565. MOV c13, c11
  566. MOV c14, c11
  567. FETCH $0, 0 * SIZE(PREB)
  568. MOV c31, c11
  569. MOV c32, c11
  570. FETCH $0, 0 * SIZE(CO1)
  571. FETCH $0, 0 * SIZE(CO2)
  572. FETCH $0, 4 * SIZE(CO1)
  573. FETCH $0, 4 * SIZE(CO2)
  574. MOV c33, c11
  575. blez L, .L35
  576. MOV c34, c11
  577. #endif
  578. .align 5
  579. .L32:
  580. LD a3, 2 * SIZE(AO)
  581. LD a4, 3 * SIZE(AO)
  582. MADD1 c11, c11, a1, b1 # axc A1xB1
  583. MADD3 c13, c13, a1, b2 # axd
  584. LD b5, 4 * SIZE(BO)
  585. LD b6, 5 * SIZE(BO)
  586. MADD2 c12, c12, a2, b1 # bxc
  587. MADD4 c14, c14, a2, b2 # bxd
  588. LD b7, 6 * SIZE(BO)
  589. LD b8, 7 * SIZE(BO)
  590. MADD1 c31, c31, a1, b3 # A1xB2
  591. MADD3 c33, c33, a1, b4
  592. FETCH $0, 4 * SIZE(PREB)
  593. MADD2 c32, c32, a2, b3
  594. MADD4 c34, c34, a2, b4
  595. NOP
  596. LD a5, 4 * SIZE(AO)
  597. LD a6, 5 * SIZE(AO)
  598. MADD1 c11, c11, a3, b5 # axc A1xB1
  599. MADD3 c13, c13, a3, b6 # axd
  600. LD b1, 8 * SIZE(BO)
  601. LD b2, 9 * SIZE(BO)
  602. MADD2 c12, c12, a4, b5 # bxc
  603. MADD4 c14, c14, a4, b6 # bxd
  604. LD b3, 10 * SIZE(BO)
  605. LD b4, 11 * SIZE(BO)
  606. MADD1 c31, c31, a3, b7 # A1xB2
  607. MADD3 c33, c33, a3, b8
  608. FETCH $0, 8 * SIZE(PREB)
  609. MADD2 c32, c32, a4, b7
  610. MADD4 c34, c34, a4, b8
  611. daddiu L, L, -1
  612. LD a7, 6 * SIZE(AO)
  613. LD a8, 7 * SIZE(AO)
  614. MADD1 c11, c11, a5, b1 # axc A1xB1
  615. MADD3 c13, c13, a5, b2 # axd
  616. LD b5, 12 * SIZE(BO)
  617. LD b6, 13 * SIZE(BO)
  618. MADD2 c12, c12, a6, b1 # bxc
  619. MADD4 c14, c14, a6, b2 # bxd
  620. LD b7, 14 * SIZE(BO)
  621. LD b8, 15 * SIZE(BO)
  622. MADD1 c31, c31, a5, b3 # A1xB2
  623. MADD3 c33, c33, a5, b4
  624. daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx
  625. daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx
  626. FETCH $0, 12 * SIZE(PREB)
  627. MADD2 c32, c32, a6, b3
  628. MADD4 c34, c34, a6, b4
  629. NOP
  630. LD a1, 0 * SIZE(AO)
  631. LD a2, 1 * SIZE(AO)
  632. MADD1 c11, c11, a7, b5 # axc A1xB1
  633. MADD3 c13, c13, a7, b6 # axd
  634. LD b1, 0 * SIZE(BO)
  635. LD b2, 1 * SIZE(BO)
  636. MADD2 c12, c12, a8, b5 # bxc
  637. MADD4 c14, c14, a8, b6 # bxd
  638. LD b3, 2 * SIZE(BO)
  639. LD b4, 3 * SIZE(BO)
  640. MADD1 c31, c31, a7, b7 # A1xB2
  641. NOP
  642. MADD3 c33, c33, a7, b8
  643. daddiu PREB, PREB, 16 * SIZE
  644. FETCH $0, 0 * SIZE(PREB)
  645. MADD2 c32, c32, a8, b7
  646. bgtz L, .L32
  647. MADD4 c34, c34, a8, b8
  648. .L35:
  649. #ifndef TRMMKERNEL
  650. andi L, K, 3
  651. LD ALPHA_R, 128($sp)
  652. #else
  653. andi L, TEMP, 3
  654. LD ALPHA_R, 128($sp)
  655. #endif
  656. blez L, .L38
  657. LD ALPHA_I, 136($sp)
  658. .align 5
  659. .L36:
  660. daddiu L, L, -1
  661. MADD1 c11, c11, a1, b1 # axc A1xB1
  662. MADD3 c13, c13, a1, b2 # axd
  663. daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx
  664. MADD2 c12, c12, a2, b1 # bxc
  665. MADD4 c14, c14, a2, b2 # bxd
  666. daddiu AO, AO, 2 * SIZE # 2mr*1kr*cmpx
  667. MADD1 c31, c31, a1, b3 # A1xB2
  668. MADD3 c33, c33, a1, b4
  669. daddiu PREB, PREB, 4 * SIZE
  670. MADD2 c32, c32, a2, b3
  671. MADD4 c34, c34, a2, b4
  672. LD a1, 0 * SIZE(AO)
  673. LD a2, 1 * SIZE(AO)
  674. LD b1, 0 * SIZE(BO)
  675. LD b2, 1 * SIZE(BO)
  676. LD b3, 2 * SIZE(BO)
  677. LD b4, 3 * SIZE(BO)
  678. bgtz L, .L36
  679. NOP
  680. .L38:
  681. #ifndef TRMMKERNEL
  682. ADD c11, c14, c11
  683. LD a1, 0 * SIZE(CO1)
  684. ADD c12, c13, c12
  685. LD a2, 1 * SIZE(CO1)
  686. ADD c31, c34, c31
  687. LD a3, 0 * SIZE(CO2)
  688. ADD c32, c33, c32
  689. LD a4, 1 * SIZE(CO2)
  690. MADD a1, a1, ALPHA_R, c11
  691. MADD a2, a2, ALPHA_R, c12
  692. MADD a3, a3, ALPHA_R, c31
  693. MADD a4, a4, ALPHA_R, c32
  694. NMSUB a1, a1, ALPHA_I, c12
  695. MADD a2, a2, ALPHA_I, c11
  696. NMSUB a3, a3, ALPHA_I, c32
  697. MADD a4, a4, ALPHA_I, c31
  698. ST a1, 0 * SIZE(CO1)
  699. ST a2, 1 * SIZE(CO1)
  700. ST a3, 0 * SIZE(CO2)
  701. ST a4, 1 * SIZE(CO2)
  702. daddiu CO1,CO1, 2 * SIZE
  703. daddiu CO2,CO2, 2 * SIZE
  704. #else
  705. ADD c11, c14, c11
  706. ADD c12, c13, c12
  707. ADD c31, c34, c31
  708. ADD c32, c33, c32
  709. MUL a1, ALPHA_R, c11
  710. MUL a2, ALPHA_R, c12
  711. MUL a3, ALPHA_R, c31
  712. MUL a4, ALPHA_R, c32
  713. NMSUB a1, a1, ALPHA_I, c12
  714. MADD a2, a2, ALPHA_I, c11
  715. NMSUB a3, a3, ALPHA_I, c32
  716. MADD a4, a4, ALPHA_I, c31
  717. ST a1, 0 * SIZE(CO1)
  718. ST a2, 1 * SIZE(CO1)
  719. ST a3, 0 * SIZE(CO2)
  720. ST a4, 1 * SIZE(CO2)
  721. daddiu CO1,CO1, 2 * SIZE
  722. daddiu CO2,CO2, 2 * SIZE
  723. #if ( defined(LEFT) && defined(TRANSA)) || \
  724. (!defined(LEFT) && !defined(TRANSA))
  725. dsubu TEMP, K, KK
  726. #ifdef LEFT
  727. daddiu TEMP, TEMP, -1
  728. #else
  729. daddiu TEMP, TEMP, -2
  730. #endif
  731. dsll L, TEMP, ZBASE_SHIFT
  732. dsll TEMP, TEMP, 1 + ZBASE_SHIFT
  733. daddu AO, AO, L
  734. daddu BO, BO, TEMP
  735. #endif
  736. #ifdef LEFT
  737. daddiu KK, KK, 1
  738. #endif
  739. #endif
  740. .align 5
  741. .L19:
  742. #if defined(TRMMKERNEL) && !defined(LEFT)
  743. daddiu KK, KK, 2
  744. #endif
  745. bgtz J, .L10
  746. move B, BO
  747. .align 5
  748. .L20:
  749. andi J, N, 1
  750. blez J, .L999
  751. dsll PREA, K, 1+ZBASE_SHIFT # PREA=K*2*2^4
  752. dsra I, M, 1 # I=M/2
  753. move CO1, C
  754. #if defined(TRMMKERNEL) && defined(LEFT)
  755. move KK, OFFSET
  756. #endif
  757. move AO, A # Reset AO
  758. blez I, .L29
  759. daddu PREA, PREA, A
  760. .L21:
  761. #if defined(TRMMKERNEL)
  762. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  763. move BO, B
  764. #else
  765. dsll L, KK, 1 + ZBASE_SHIFT
  766. dsll TEMP, KK, ZBASE_SHIFT
  767. daddu AO, AO, L
  768. daddu BO, B, TEMP
  769. #endif
  770. LD a1, 0 * SIZE(AO)
  771. LD a2, 1 * SIZE(AO)
  772. MTC $0, c11 # Clear results regs
  773. MOV c12, c11
  774. LD b1, 0 * SIZE(BO)
  775. LD b2, 1 * SIZE(BO)
  776. MOV c13, c11
  777. MOV c14, c11
  778. LD a3, 2 * SIZE(AO)
  779. LD a4, 3 * SIZE(AO)
  780. MOV c21, c11
  781. MOV c22, c11
  782. FETCH $0, 0 * SIZE(PREA)
  783. MOV c23, c11
  784. MOV c24, c11
  785. FETCH $0, 0 * SIZE(CO1)
  786. FETCH $0, 4 * SIZE(CO1)
  787. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  788. dsubu TEMP, K, KK
  789. #elif defined(LEFT)
  790. daddiu TEMP, KK, 2 # define Mr=2
  791. #else
  792. daddiu TEMP, KK, 1 # define NR=1
  793. #endif
  794. dsra L, TEMP, 2
  795. blez L, .L25
  796. NOP
  797. #else
  798. dsra L, K, 2 # Unroll K 4 times
  799. move BO, B
  800. LD a1, 0 * SIZE(AO)
  801. LD a2, 1 * SIZE(AO)
  802. MTC $0, c11 # Clear results regs
  803. MOV c12, c11
  804. LD b1, 0 * SIZE(BO)
  805. LD b2, 1 * SIZE(BO)
  806. MOV c13, c11
  807. MOV c14, c11
  808. LD a3, 2 * SIZE(AO)
  809. LD a4, 3 * SIZE(AO)
  810. MOV c21, c11
  811. MOV c22, c11
  812. FETCH $0, 0 * SIZE(PREA)
  813. MOV c23, c11
  814. MOV c24, c11
  815. FETCH $0, 0 * SIZE(CO1)
  816. FETCH $0, 4 * SIZE(CO1)
  817. blez L, .L25
  818. NOP
  819. #endif
  820. .align 5
  821. .L22:
  822. LD a5, 4 * SIZE(AO)
  823. LD a6, 5 * SIZE(AO)
  824. MADD1 c11, c11, a1, b1 # axc A1xB1
  825. MADD3 c13, c13, a1, b2 # axd
  826. LD b3, 2 * SIZE(BO)
  827. LD b4, 3 * SIZE(BO)
  828. MADD2 c12, c12, a2, b1 # bxc
  829. MADD4 c14, c14, a2, b2 # bxd
  830. LD a7, 6 * SIZE(AO)
  831. LD a8, 7 * SIZE(AO)
  832. MADD1 c21, c21, a3, b1 # A2xB1
  833. MADD3 c23, c23, a3, b2
  834. FETCH $0, 4 * SIZE(PREA)
  835. MADD2 c22, c22, a4, b1
  836. MADD4 c24, c24, a4, b2
  837. LD a1, 8 * SIZE(AO)
  838. LD a2, 9 * SIZE(AO)
  839. MADD1 c11, c11, a5, b3 # axc A1xB1
  840. MADD3 c13, c13, a5, b4 # axd
  841. LD b5, 4 * SIZE(BO)
  842. LD b6, 5 * SIZE(BO)
  843. MADD2 c12, c12, a6, b3 # bxc
  844. MADD4 c14, c14, a6, b4 # bxd
  845. LD a3, 10 * SIZE(AO)
  846. LD a4, 11 * SIZE(AO)
  847. MADD1 c21, c21, a7, b3 # A2xB1
  848. MADD3 c23, c23, a7, b4
  849. FETCH $0, 8 * SIZE(PREA)
  850. MADD2 c22, c22, a8, b3
  851. MADD4 c24, c24, a8, b4
  852. daddiu L, L, -1
  853. LD a5, 12 * SIZE(AO)
  854. LD a6, 13 * SIZE(AO)
  855. MADD1 c11, c11, a1, b5 # axc A1xB1
  856. MADD3 c13, c13, a1, b6 # axd
  857. LD b7, 6 * SIZE(BO)
  858. LD b8, 7 * SIZE(BO)
  859. MADD2 c12, c12, a2, b5 # bxc
  860. MADD4 c14, c14, a2, b6 # bxd
  861. LD a7, 14 * SIZE(AO)
  862. LD a8, 15 * SIZE(AO)
  863. MADD1 c21, c21, a3, b5 # A2xB1
  864. MADD3 c23, c23, a3, b6
  865. daddiu BO, BO, 8 * SIZE # 1nr*4kr*cmpx
  866. daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx
  867. FETCH $0, 12 * SIZE(PREA)
  868. MADD2 c22, c22, a4, b5
  869. MADD4 c24, c24, a4, b6
  870. daddiu PREA, PREA, 16 * SIZE
  871. LD a1, 0 * SIZE(AO)
  872. LD a2, 1 * SIZE(AO)
  873. MADD1 c11, c11, a5, b7 # axc A1xB1
  874. MADD3 c13, c13, a5, b8 # axd
  875. LD b1, 0 * SIZE(BO)
  876. LD b2, 1 * SIZE(BO)
  877. MADD2 c12, c12, a6, b7 # bxc
  878. MADD4 c14, c14, a6, b8 # bxd
  879. LD a3, 2 * SIZE(AO)
  880. LD a4, 3 * SIZE(AO)
  881. MADD1 c21, c21, a7, b7 # A2xB1
  882. MADD3 c23, c23, a7, b8
  883. FETCH $0, 0 * SIZE(PREA)
  884. MADD2 c22, c22, a8, b7
  885. bgtz L, .L22
  886. MADD4 c24, c24, a8, b8
  887. .L25:
  888. #ifndef TRMMKERNEL
  889. andi L, K, 3
  890. LD ALPHA_R, 128($sp)
  891. #else
  892. andi L, TEMP, 3
  893. LD ALPHA_R, 128($sp)
  894. #endif
  895. blez L, .L28
  896. LD ALPHA_I, 136($sp)
  897. .align 3
  898. .L26:
  899. daddiu L, L, -1
  900. MADD1 c11, c11, a1, b1 # axc A1xB1
  901. MADD3 c13, c13, a1, b2 # axd
  902. daddiu BO, BO, 2 * SIZE # 2nr*1kr*cmpx
  903. MADD2 c12, c12, a2, b1 # bxc
  904. MADD4 c14, c14, a2, b2 # bxd
  905. daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx
  906. MADD1 c21, c21, a3, b1 # A2xB1
  907. MADD3 c23, c23, a3, b2
  908. daddiu PREA, PREA, 4 * SIZE # 2mr*1kr*cmpx
  909. MADD2 c22, c22, a4, b1
  910. MADD4 c24, c24, a4, b2
  911. # gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4
  912. # gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
  913. # gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
  914. LD a1, 0 * SIZE(AO)
  915. LD a2, 1 * SIZE(AO)
  916. LD b1, 0 * SIZE(BO)
  917. LD b2, 1 * SIZE(BO)
  918. LD a3, 2 * SIZE(AO)
  919. LD a4, 3 * SIZE(AO)
  920. bgtz L, .L26
  921. FETCH $0, 0 * SIZE(PREA)
  # Write-back stage for one 2-row tile: reduce the eight accumulators
  # c11..c14 / c21..c24 to two complex results, scale them by alpha, store
  # four SIZE-wide values at CO1, then loop back to .L21 while I > 0.
  # NOTE(review): ADD/MADD/NMSUB/MUL/LD/ST are macros defined outside this
  # view; the trailing notes assume MADD d,r,s,t gives r + s*t and
  # NMSUB d,r,s,t gives r - s*t. Confirm against the macro definitions.
  922. .L28:
  923. #ifndef TRMMKERNEL
  924. ADD c11, c14, c11 # row 0 real: a1*b1 sums combined with a2*b2 sums
  925. LD a1, 0 * SIZE(CO1) # loads of the old C values are interleaved with the ADDs
  926. ADD c12, c13, c12 # row 0 imag: a2*b1 sums combined with a1*b2 sums
  927. LD a2, 1 * SIZE(CO1)
  928. ADD c21, c24, c21 # row 1 real: a3*b1 sums combined with a4*b2 sums
  929. LD b1, 2 * SIZE(CO1)
  930. ADD c22, c23, c22 # row 1 imag: a4*b1 sums combined with a3*b2 sums
  931. LD b2, 3 * SIZE(CO1)
  932. daddiu I, I, -1 # one tile finished; I is tested by the bgtz at the end
  933. MADD a1, a1, ALPHA_R, c11 # C0.re += ALPHA_R * c11
  934. MADD a2, a2, ALPHA_R, c12 # C0.im += ALPHA_R * c12
  935. MADD b1, b1, ALPHA_R, c21 # C1.re += ALPHA_R * c21
  936. MADD b2, b2, ALPHA_R, c22 # C1.im += ALPHA_R * c22
  937. NMSUB a1, a1, ALPHA_I, c12 # C0.re -= ALPHA_I * c12
  938. MADD a2, a2, ALPHA_I, c11 # C0.im += ALPHA_I * c11
  939. NMSUB b1, b1, ALPHA_I, c22 # C1.re -= ALPHA_I * c22
  940. MADD b2, b2, ALPHA_I, c21 # C1.im += ALPHA_I * c21
  941. ST a1, 0 * SIZE(CO1)
  942. ST a2, 1 * SIZE(CO1)
  943. ST b1, 2 * SIZE(CO1)
  944. ST b2, 3 * SIZE(CO1)
  945. #else
  # TRMM variant: C is not read; the result is alpha * accumulator only.
  946. ADD c11, c14, c11
  947. ADD c12, c13, c12
  948. ADD c21, c24, c21
  949. ADD c22, c23, c22
  950. daddiu I, I, -1
  951. MUL a1, ALPHA_R, c11 # start from ALPHA_R * acc instead of loading C
  952. MUL a2, ALPHA_R, c12
  953. MUL b1, ALPHA_R, c21
  954. MUL b2, ALPHA_R, c22
  955. NMSUB a1, a1, ALPHA_I, c12
  956. MADD a2, a2, ALPHA_I, c11
  957. NMSUB b1, b1, ALPHA_I, c22
  958. MADD b2, b2, ALPHA_I, c21
  959. ST a1, 0 * SIZE(CO1)
  960. ST a2, 1 * SIZE(CO1)
  961. ST b1, 2 * SIZE(CO1)
  962. ST b2, 3 * SIZE(CO1)
  963. #if ( defined(LEFT) && defined(TRANSA)) || \
  964. (!defined(LEFT) && !defined(TRANSA))
  # Pointer fix-up: TEMP = K - KK - 2 (LEFT) or K - KK - 1 (otherwise).
  # AO is advanced by TEMP << (1 + ZBASE_SHIFT) and BO by TEMP << ZBASE_SHIFT,
  # so AO moves twice as far as BO per counted k-step.
  965. dsubu TEMP, K, KK
  966. #ifdef LEFT
  967. daddiu TEMP, TEMP, -2
  968. #else
  969. daddiu TEMP, TEMP, -1
  970. #endif
  971. dsll L, TEMP, 1 + ZBASE_SHIFT # L is reused here as the AO byte offset
  972. dsll TEMP, TEMP, ZBASE_SHIFT # TEMP becomes the BO byte offset
  973. daddu AO, AO, L
  974. daddu BO, BO, TEMP
  975. #endif
  976. #ifdef LEFT
  977. daddiu KK, KK, 2 # KK grows by the two rows just written
  978. #endif
  979. #endif
  980. daddiu CO1,CO1, 4 * SIZE # step past the four values stored above
  981. bgtz I, .L21 # more 2-row tiles remain
  982. NOP # branch delay slot (assumes .set noreorder, set outside this view)
  # Odd-M tail setup: when M & 1 is zero, jump to the exit at .L999.
  # Otherwise position AO/BO, preload the first A and B pairs, zero the
  # accumulators c11..c14 and compute L = (k count) >> 2 for the loop at .L42.
  # NOTE(review): FETCH, MTC and MOV are macros defined outside this view;
  # FETCH is presumably a prefetch hint on PREA. Confirm.
  983. .L29:
  984. andi I, M, 1 # I = M & 1
  985. blez I, .L999 # no leftover row: go to the epilogue
  986. NOP # branch delay slot
  987. #if defined(TRMMKERNEL)
  988. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  989. move BO, B # B is read from its start in these variants
  990. #else
  991. dsll TEMP, KK, ZBASE_SHIFT # TEMP = KK << ZBASE_SHIFT (byte offset)
  992. daddu AO, AO, TEMP # skip KK k-steps of A
  993. daddu BO, B, TEMP # and the same distance into B
  994. #endif
  995. # gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
  996. LD a1, 0 * SIZE(AO)
  997. LD a2, 1 * SIZE(AO)
  998. MTC $0, c11 # Clear results regs
  999. MOV c12, c11 # c12..c14 are cleared by copying the zeroed c11
  1000. # gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
  1001. LD b1, 0 * SIZE(BO)
  1002. LD b2, 1 * SIZE(BO)
  1003. MOV c13, c11
  1004. MOV c14, c11
  1005. FETCH $0, 0 * SIZE(PREA)
  1006. FETCH $0, 4 * SIZE(PREA)
  # TRMM k count: K - KK in the first variant, KK + 1 in the others
  # (both remaining preprocessor branches emit the same instruction).
  1007. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1008. dsubu TEMP, K, KK
  1009. #elif defined(LEFT)
  1010. daddiu TEMP, KK, 1
  1011. #else
  1012. daddiu TEMP, KK, 1
  1013. #endif
  1014. dsra L, TEMP, 2 # L = TEMP / 4 iterations of the unrolled loop
  1015. blez L, .L45 # fewer than four k-steps: go straight to the remainder
  1016. NOP # branch delay slot
  1017. #else
  # Non-TRMM variant: the full K range is used and B is read from its start.
  1018. dsra L, K, 2 # Unroll K 4 times
  1019. move BO, B
  1020. # gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
  1021. LD a1, 0 * SIZE(AO)
  1022. LD a2, 1 * SIZE(AO)
  1023. MTC $0, c11 # Clear results regs
  1024. MOV c12, c11
  1025. # gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
  1026. LD b1, 0 * SIZE(BO)
  1027. LD b2, 1 * SIZE(BO)
  1028. MOV c13, c11
  1029. MOV c14, c11
  1030. FETCH $0, 0 * SIZE(PREA)
  1031. FETCH $0, 4 * SIZE(PREA)
  1032. blez L, .L45 # fewer than four k-steps: go straight to the remainder
  1033. NOP # branch delay slot
  1034. #endif
  # Main loop for the single-row tile, unrolled four k-steps per iteration.
  # Each k-step consumes one (real, imag) pair from AO and one from BO and
  # feeds MADD1..MADD4 into c11, c13, c12, c14. Loads for the following
  # k-step are interleaved with the MADDs of the current one.
  # NOTE(review): the sign applied by each MADDn macro is defined outside
  # this view.
  1035. .align 3
  1036. .L42:
  1037. # gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
  1038. LD a3, 2 * SIZE(AO)
  1039. LD a4, 3 * SIZE(AO)
  1040. MADD1 c11, c11, a1, b1 # axc A1xB1
  1041. MADD3 c13, c13, a1, b2 # axd
  1042. # gsLQC1(R13, F7, F6, 1) # R:b3 I:b4
  1043. LD b3, 2 * SIZE(BO)
  1044. LD b4, 3 * SIZE(BO)
  1045. MADD2 c12, c12, a2, b1 # bxc
  1046. MADD4 c14, c14, a2, b2 # bxd
  1047. # gsLQC1(R12, F9, F8, 2) # R:a5 I:a6 (third k-step operands)
  1048. LD a5, 4 * SIZE(AO)
  1049. LD a6, 5 * SIZE(AO)
  1050. MADD1 c11, c11, a3, b3 # axc A1xB1
  1051. MADD3 c13, c13, a3, b4 # axd
  1052. # gsLQC1(R13, F13, F12, 2) # R:b5 I:b6
  1053. LD b5, 4 * SIZE(BO)
  1054. LD b6, 5 * SIZE(BO)
  1055. MADD2 c12, c12, a4, b3 # bxc
  1056. MADD4 c14, c14, a4, b4 # bxd
  1057. # gsLQC1(R12, F11, F10, 3) # R:a7 I:a8 (fourth k-step operands)
  1058. LD a7, 6 * SIZE(AO)
  1059. LD a8, 7 * SIZE(AO)
  1060. MADD1 c11, c11, a5, b5 # axc A1xB1
  1061. MADD3 c13, c13, a5, b6 # axd
  1062. daddiu L, L, -1 # one unrolled iteration consumed
  1063. # gsLQC1(R13, F16, F15, 3) # R:b7 I:b8
  1064. LD b7, 6 * SIZE(BO)
  1065. LD b8, 7 * SIZE(BO)
  1066. MADD2 c12, c12, a6, b5 # bxc
  1067. MADD4 c14, c14, a6, b6 # bxd
  1068. daddiu AO, AO, 8 * SIZE # 1mr*4kr*cmpx
  1069. daddiu BO, BO, 8 * SIZE # 1nr*4kr*cmpx
  1070. # gsLQC1(R12, F1, F0, 0) # R:a1 I:a2, preloaded for the next iteration
  1071. LD a1, 0 * SIZE(AO)
  1072. LD a2, 1 * SIZE(AO)
  1073. MADD1 c11, c11, a7, b7 # axc A1xB1
  1074. MADD3 c13, c13, a7, b8 # axd
  1075. # gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
  1076. LD b1, 0 * SIZE(BO)
  1077. LD b2, 1 * SIZE(BO)
  1078. MADD2 c12, c12, a8, b7 # bxc
  1079. MADD4 c14, c14, a8, b8 # bxd
  1080. bgtz L, .L42 # repeat while unrolled iterations remain
  1081. NOP # branch delay slot
  # Remainder setup: L = (k count) & 3, where the k count is K for the plain
  # kernel and TEMP (computed in the .L29 setup) for the TRMM kernel.
  # ALPHA_R and ALPHA_I are reloaded from the stack slots at 128 and 136.
  # NOTE(review): the reload is presumably needed because the unrolled loop
  # reuses the registers behind ALPHA_R/ALPHA_I. Confirm against the
  # register defines, which are outside this view.
  1082. .align 5
  1083. .L45:
  1084. #ifndef TRMMKERNEL
  1085. andi L, K, 3
  1086. LD ALPHA_R, 128($sp)
  1087. #else
  1088. andi L, TEMP, 3
  1089. LD ALPHA_R, 128($sp)
  1090. #endif
  1091. blez L, .L48 # no leftover k-steps: go to the write-back
  1092. LD ALPHA_I, 136($sp) # sits in the branch delay slot, so it runs on both paths
  # K remainder loop for the single-row tile: one k-step per iteration.
  # The operands a1, a2, b1, b2 were loaded before entry (or by the previous
  # iteration); the pointers are advanced first and the next operands are
  # loaded after the four MADDs.
  1093. .L46:
  1094. daddiu L, L, -1 # one leftover k-step consumed
  1095. daddiu BO, BO, 1 * SIZE * COMPSIZE # 1nr*1kr*cmpx
  1096. daddiu AO, AO, 1 * SIZE * COMPSIZE # 1mr*1kr*cmpx
  1097. MADD1 c11, c11, a1, b1 # axc A1xB1
  1098. MADD3 c13, c13, a1, b2 # axd
  1099. MADD2 c12, c12, a2, b1 # bxc
  1100. MADD4 c14, c14, a2, b2 # bxd
  1101. # gsLQC1(R12, F1, F0, 0) # R:a1 I:a2, operands for the next k-step
  1102. # gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
  1103. LD a1, 0 * SIZE(AO)
  1104. LD a2, 1 * SIZE(AO)
  1105. LD b1, 0 * SIZE(BO)
  1106. LD b2, 1 * SIZE(BO)
  1107. bgtz L, .L46 # repeat while leftover k-steps remain
  1108. NOP # branch delay slot
  # Write-back stage for the single-row tile: reduce c11..c14 to one complex
  # result, scale it by alpha and store two SIZE-wide values at CO1.
  # Execution then falls through to the epilogue at .L999.
  # NOTE(review): the trailing notes assume MADD d,r,s,t gives r + s*t and
  # NMSUB d,r,s,t gives r - s*t; the macros are defined outside this view.
  1109. .L48:
  1110. #ifndef TRMMKERNEL
  1111. ADD c11, c14, c11 # real: a1*b1 sums combined with a2*b2 sums
  1112. ADD c12, c13, c12 # imag: a2*b1 sums combined with a1*b2 sums
  1113. LD a1, 0 * SIZE(CO1) # old C value, real
  1114. LD a2, 1 * SIZE(CO1) # old C value, imag
  1115. MADD a1, a1, ALPHA_R, c11 # C.re += ALPHA_R * c11
  1116. MADD a2, a2, ALPHA_R, c12 # C.im += ALPHA_R * c12
  1117. NMSUB a1, a1, ALPHA_I, c12 # C.re -= ALPHA_I * c12
  1118. MADD a2, a2, ALPHA_I, c11 # C.im += ALPHA_I * c11
  1119. ST a1, 0 * SIZE(CO1)
  1120. ST a2, 1 * SIZE(CO1)
  1121. #else
  # TRMM variant: C is not read; the result is alpha * accumulator only.
  1122. ADD c11, c14, c11
  1123. ADD c12, c13, c12
  1124. MUL a1, ALPHA_R, c11
  1125. MUL a2, ALPHA_R, c12
  1126. NMSUB a1, a1, ALPHA_I, c12
  1127. MADD a2, a2, ALPHA_I, c11
  1128. ST a1, 0 * SIZE(CO1)
  1129. ST a2, 1 * SIZE(CO1)
  1130. #if ( defined(LEFT) && defined(TRANSA)) || \
  1131. (!defined(LEFT) && !defined(TRANSA))
  # Pointer fix-up: TEMP = K - KK - 1 in both preprocessor branches; AO and
  # BO are each advanced by TEMP << ZBASE_SHIFT bytes.
  1132. dsubu TEMP, K, KK
  1133. #ifdef LEFT
  1134. daddiu TEMP, TEMP, -1
  1135. #else
  1136. daddiu TEMP, TEMP, -1
  1137. #endif
  1138. dsll TEMP, TEMP, ZBASE_SHIFT
  1139. daddu AO, AO, TEMP
  1140. daddu BO, BO, TEMP
  1141. #endif
  1142. #ifdef LEFT
  1143. daddiu KK, KK, 1 # KK grows by the single row just written
  1144. #endif
  1145. daddiu CO1,CO1, 2 * SIZE # only the TRMM variant advances CO1 here
  1146. #endif
  # Common exit: reload the saved integer and floating-point registers from
  # the stack frame, release the frame and return through $31.
  # LDARG, STACKSIZE and EPILOGUE are macros defined outside this view.
  1147. .align 5
  1148. .L999:
  1149. LDARG $16, 0($sp)
  1150. LDARG $17, 8($sp)
  1151. ldc1 $f24, 16($sp)
  1152. ldc1 $f25, 24($sp)
  1153. ldc1 $f26, 32($sp)
  1154. ldc1 $f27, 40($sp)
  1155. ldc1 $f28, 48($sp)
  1156. ldc1 $f29, 56($sp)
  1157. #if defined(TRMMKERNEL)
  # The TRMM build additionally reloads $18..$20.
  1158. LDARG $18, 64($sp)
  1159. LDARG $19, 72($sp)
  1160. LDARG $20, 80($sp)
  1161. #endif
  1162. #ifndef __64BIT__
  # Builds without __64BIT__ additionally reload $f20..$f23.
  1163. ldc1 $f20, 88($sp)
  1164. ldc1 $f21, 96($sp)
  1165. ldc1 $f22,104($sp)
  1166. ldc1 $f23,112($sp)
  1167. #endif
  1168. j $31 # return to the address held in $31
  1169. daddiu $sp, $sp, STACKSIZE # frame release sits in the jump delay slot
  1170. EPILOGUE