You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

zgemm_kernel_loongson3a_2x2.S 26 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355
  ## Build preamble. ASSEMBLER is defined ahead of the common.h include; the
  ## macros used below but not defined in this file (PROLOGUE, EPILOGUE, LDARG,
  ## SDARG, LD, ST, MADD, NMSUB, MUL, ADD, MOV, MTC, NOP, SIZE, COMPSIZE,
  ## ZBASE_SHIFT) are expected to come from that header.
  1. #define ASSEMBLER
  2. #include "common.h"
  ## FETCH is an ordinary doubleword load. Every use in this kernel targets $0,
  ## so the loaded value is discarded and the access only touches the cache.
  3. #define FETCH ld
  ## Hand-encoded instruction words for the Loongson paired FP load (0x32 major
  ## opcode) and paired FP store (0x3A major opcode). base is an integer register
  ## number, ft and fq are FP register numbers, offset is a small index.
  ## Judging by the call sites, one load fills ft and fq with two consecutive
  ## values and each offset step moves by two values. The store form is not
  ## used anywhere in the code visible here.
  4. #define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
  5. #define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
  ## Bytes reserved on the stack by the prologue (register saves at 0..112 and
  ## the two alpha values at 128 and 136).
  6. #define STACKSIZE 160
  ## Integer register roles. The first six are read as incoming values and are
  ## never loaded by this file; LDC is loaded from the caller stack in the
  ## prologue.
  7. #define M $4
  8. #define N $5
  9. #define K $6
  10. #define A $9
  11. #define B $10
  12. #define C $11
  13. #define LDC $8
  ## Running read pointers into the two packed input panels.
  14. #define AO $12
  15. #define BO $13
  ## Bare register numbers of AO and BO, needed because the paired-load macro
  ## builds its instruction word arithmetically.
  16. #define R12 12
  17. #define R13 13
  ## Down-counters: I over row tiles, J over column tiles, L over k steps.
  18. #define I $2
  19. #define J $3
  20. #define L $7
  ## Output pointers for the two columns of the current tile (CO2 is set to
  ## C + LDC by the kernel).
  21. #define CO1 $14
  22. #define CO2 $15
  ## Prefetch pointers. They live in $16 and $17, which the prologue saves.
  23. #define PREA $16
  24. #define PREB $17
  ## Extra state used only by the TRMM build; $18-$20 are saved in the prologue
  ## under the same condition.
  25. #if defined(TRMMKERNEL)
  26. #define OFFSET $18
  27. #define KK $19
  28. #define TEMP $20
  29. #endif
  ## FP operand registers. a1-a4 with b1-b4 and a5-a8 with b5-b8 form two
  ## operand sets that the unrolled loops alternate between, so one set can be
  ## loaded while the other is being consumed.
  30. #define a1 $f0
  31. #define a2 $f1
  32. #define a3 $f2
  33. #define a4 $f3
  34. #define b1 $f4
  35. #define b2 $f5
  36. #define b3 $f6
  37. #define b4 $f7
  38. #define a5 $f8
  39. #define a6 $f9
  40. #define a7 $f10
  41. #define a8 $f11
  42. #define b5 $f12
  43. #define b6 $f13
  ## b7 and b8 skip $f14 and land on $f15 and $f16, the same registers as
  ## ALPHA_R and ALPHA_I further down. The kernel therefore stores alpha to the
  ## stack in the prologue and reloads it after each K loop.
  44. #define b7 $f15
  45. #define b8 $f16
  ## Accumulators. Each group of four (c11-c14, c21-c24, c31-c34, c41-c44)
  ## holds the four partial products of one complex output element; c11 takes
  ## the $f14 slot skipped above.
  46. #define c11 $f14
  47. #define c12 $f17
  48. #define c13 $f18
  49. #define c14 $f19
  50. #define c21 $f20
  51. #define c22 $f21
  52. #define c23 $f22
  53. #define c24 $f23
  54. #define c31 $f24
  55. #define c32 $f25
  56. #define c33 $f26
  57. #define c34 $f27
  58. #define c41 $f28
  59. #define c42 $f29
  60. #define c43 $f30
  61. #define c44 $f31
  ## Bare FP register numbers for the hand-encoded paired load and store
  ## macros (F0 is the number of $f0, and so on).
  62. #define F0 0
  63. #define F1 1
  64. #define F2 2
  65. #define F3 3
  66. #define F4 4
  67. #define F5 5
  68. #define F6 6
  69. #define F7 7
  70. #define F8 8
  71. #define F9 9
  72. #define F10 10
  73. #define F11 11
  74. #define F12 12
  75. #define F13 13
  76. #define F14 14
  77. #define F15 15
  78. #define F16 16
  79. #define F17 17
  80. #define F18 18
  81. #define F19 19
  82. #define F20 20
  83. #define F21 21
  84. #define F22 22
  85. #define F23 23
  86. #define F24 24
  87. #define F25 25
  88. #define F26 26
  89. #define F27 27
  90. #define F28 28
  91. #define F29 29
  92. #define F30 30
  93. #define F31 31
  ## Incoming alpha (real and imaginary parts). These registers are shared
  ## with b7 and b8, see the note at those definitions.
  94. #define ALPHA_R $f15
  95. #define ALPHA_I $f16
  96. #################################
  97. ## MADD1 a*c
  98. ## MADD2 b*c
  99. ## MADD3 a*d
  100. ## MADD4 d*b
  101. ##################################
  ## In the loops below a and b are the real and imaginary parts of the value
  ## read through AO, c and d those of the value read through BO. Each of the
  ## four products is accumulated separately, and the groups that follow pick
  ## MADD or NMSUB for it depending on the variant macro given by the build.
  ## NOTE(review): MADD and NMSUB come from common.h; presumably MADD adds the
  ## product to the accumulator and NMSUB subtracts it. Confirm there, together
  ## with the exact meaning of the two-letter variant macros.
  ## Variants NN NT TN TT: only the b*d product is subtracted.
  102. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  103. #define MADD1 MADD
  104. #define MADD2 MADD
  105. #define MADD3 MADD
  106. #define MADD4 NMSUB
  107. #endif
  ## Variants NR NC TR TC: only the a*d product is subtracted.
  108. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  109. #define MADD1 MADD
  110. #define MADD2 MADD
  111. #define MADD3 NMSUB
  112. #define MADD4 MADD
  113. #endif
  ## Variants RN RT CN CT: only the b*c product is subtracted.
  114. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  115. #define MADD1 MADD
  116. #define MADD2 NMSUB
  117. #define MADD3 MADD
  118. #define MADD4 MADD
  119. #endif
  ## Variants RR RC CR CC: every product except a*c is subtracted.
  120. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  121. #define MADD1 MADD
  122. #define MADD2 NMSUB
  123. #define MADD3 NMSUB
  124. #define MADD4 NMSUB
  125. #endif
  ## Function entry: fetch the stack arguments, reserve STACKSIZE bytes, save
  ## the registers listed below, park alpha on the stack and branch to .L20
  ## when there is no pair of columns to process.
  ## NOTE(review): the kernel writes $f30 and $f31 (c43 and c44) but they are
  ## not saved here, and $f20-$f23 are saved only when __64BIT__ is undefined.
  ## Confirm against the FP callee-saved set of the target ABI.
  126. PROLOGUE
  127. LDARG LDC, 0($sp) # read before $sp moves: slot 0 of the caller frame
  128. daddiu $sp, $sp, -STACKSIZE
  129. SDARG $16, 0($sp) # $16 and $17 become PREA and PREB
  130. SDARG $17, 8($sp)
  131. sdc1 $f24, 16($sp)
  132. sdc1 $f25, 24($sp)
  133. sdc1 $f26, 32($sp)
  134. sdc1 $f27, 40($sp)
  135. sdc1 $f28, 48($sp)
  136. sdc1 $f29, 56($sp)
  137. #if defined(TRMMKERNEL)
  138. SDARG $18, 64($sp) # $18-$20 become OFFSET, KK and TEMP
  139. SDARG $19, 72($sp)
  140. SDARG $20, 80($sp)
  141. LDARG OFFSET, STACKSIZE + 8($sp) # slot 8 of the caller frame
  142. #endif
  143. #ifndef __64BIT__
  144. sdc1 $f20, 88($sp)
  145. sdc1 $f21, 96($sp)
  146. sdc1 $f22,104($sp)
  147. sdc1 $f23,112($sp)
  148. #endif
  149. dsra J, N, 1 # J=N/2
  150. ST ALPHA_R, 128($sp) # park alpha_r: its register is reused as b7 in the K loops
  151. #if defined(TRMMKERNEL) && !defined(LEFT)
  152. neg KK, OFFSET
  153. #endif
  154. dsll LDC, LDC, ZBASE_SHIFT # LDC*SIZE*COMPSIZE
  155. blez J, .L20
  156. ST ALPHA_I, 136($sp) # branch delay slot: alpha_i is parked on both paths
  ## .L10: start of one pass over a pair of output columns. Decrements J,
  ## resets the row-tile counter and the A, C and prefetch pointers, and skips
  ## to .L30 when M is smaller than 2.
  157. .align 5
  158. .L10:
  159. #if defined(TRMMKERNEL) && defined(LEFT)
  160. move KK, OFFSET
  161. #endif
  162. daddiu J, J, -1
  163. dsra I, M, 1 # I=M/2
  164. dsll PREB, K, 1 + ZBASE_SHIFT # PREB = K << (1 + ZBASE_SHIFT); B is added at .L11 or .L30
  165. dsll PREA, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4
  166. move CO1, C # Fix pointer Cx
  167. daddu CO2, C, LDC # second column starts LDC bytes after the first
  168. move AO, A # Reset AO
  169. blez I, .L30
  170. daddu PREA, PREA, A # PREA=A+panel size (branch delay slot, runs on both paths)
  ## .L11: set-up for one 2x2 tile. Positions AO and BO, zeroes the sixteen
  ## accumulators by copying c11, preloads the first two values pairs from each
  ## panel, touches the output lines, and computes L = number of 4-step
  ## iterations for .L12. Falls to .L15 when L is not positive.
  171. .L11:
  172. #if defined(TRMMKERNEL)
  173. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  174. move BO, B
  175. #else
  176. dsll L, KK, 1 + ZBASE_SHIFT # MR=NR=2
  177. dsll TEMP, KK, 1 + ZBASE_SHIFT
  178. daddu AO, AO, L # skip KK steps of the A panel
  179. daddu BO, B, TEMP # skip KK steps of the B panel
  180. #endif
  181. MTC $0, c11 # Clear results regs
  182. MOV c12, c11
  183. gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
  184. MOV c13, c11
  185. MOV c14, c11
  186. gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
  187. MOV c21, c11
  188. MOV c22, c11
  189. gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
  190. MOV c23, c11
  191. MOV c24, c11
  192. gsLQC1(R13, F7, F6, 1) # R:b3 I:b4
  193. FETCH $0, 0 * SIZE(CO2)
  194. MOV c31, c11
  195. MOV c32, c11
  196. FETCH $0, 0 * SIZE(CO1)
  197. MOV c33, c11
  198. MOV c34, c11
  199. FETCH $0, 4 * SIZE(CO2)
  200. MOV c41, c11
  201. MOV c42, c11
  202. FETCH $0, 4 * SIZE(CO1)
  203. MOV c43, c11
  204. MOV c44, c11
  ## TEMP = number of k steps this tile actually uses in the TRMM build.
  205. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  206. dsubu TEMP, K, KK
  207. #elif defined(LEFT)
  208. daddiu TEMP, KK, 2
  209. #else
  210. daddiu TEMP, KK, 2
  211. #endif
  212. dsra L, TEMP, 2 # four k steps per .L12 iteration
  213. daddu PREB, PREB, B # PREB = B + panel size
  214. blez L, .L15
  215. NOP # branch delay slot
  216. #else
  217. dsra L, K, 2 # Unroll K 4 times
  218. move BO, B
  219. MTC $0, c11 # Clear results regs
  220. MOV c12, c11
  221. gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
  222. MOV c13, c11
  223. MOV c14, c11
  224. gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
  225. MOV c21, c11
  226. MOV c22, c11
  227. gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
  228. MOV c23, c11
  229. MOV c24, c11
  230. gsLQC1(R13, F7, F6, 1) # R:b3 I:b4
  231. FETCH $0, 0 * SIZE(CO2)
  232. MOV c31, c11
  233. MOV c32, c11
  234. FETCH $0, 0 * SIZE(CO1)
  235. MOV c33, c11
  236. MOV c34, c11
  237. FETCH $0, 4 * SIZE(CO2)
  238. MOV c41, c11
  239. MOV c42, c11
  240. FETCH $0, 4 * SIZE(CO1)
  241. MOV c43, c11
  242. daddu PREB, PREB, B # PREB = B + panel size
  243. blez L, .L15
  244. MOV c44, c11 # branch delay slot: the last accumulator is cleared on both paths
  245. #endif
  ## .L12: main K loop of the 2x2 tile, four k steps per iteration. The two
  ## operand sets (a1-a4 with b1-b4, a5-a8 with b5-b8) alternate: while one set
  ## is being multiplied the paired loads refill the other. AO and BO advance
  ## in the middle of the third step, so the loads issued after that point use
  ## offsets 0 and 1 and already fetch the first step of the next iteration
  ## (or of the remainder loop at .L16).
  246. .align 5
  247. .L12:
  248. gsLQC1(R12, F9, F8, 2) # Unroll K=1
  249. gsLQC1(R13, F13, F12, 2)
  250. MADD1 c11, c11, a1, b1 # axc A1xB1
  251. MADD3 c13, c13, a1, b2 # axd
  252. gsLQC1(R12, F11, F10, 3)
  253. gsLQC1(R13, F16, F15, 3)
  254. MADD2 c12, c12, a2, b1 # bxc
  255. MADD4 c14, c14, a2, b2 # bxd
  256. MADD1 c21, c21, a3, b1 # A2xB1
  257. MADD3 c23, c23, a3, b2
  258. MADD2 c22, c22, a4, b1
  259. MADD4 c24, c24, a4, b2
  260. FETCH $0, 4 * SIZE(PREA)
  261. FETCH $0, 4 * SIZE(PREB)
  262. MADD1 c31, c31, a1, b3 # A1xB2
  263. MADD3 c33, c33, a1, b4
  264. MADD2 c32, c32, a2, b3
  265. MADD4 c34, c34, a2, b4
  266. MADD1 c41, c41, a3, b3 # A2xB2
  267. MADD3 c43, c43, a3, b4
  268. MADD2 c42, c42, a4, b3
  269. MADD4 c44, c44, a4, b4
  270. gsLQC1(R12, F1, F0, 4) # unroll k=2
  271. gsLQC1(R13, F5, F4, 4)
  272. MADD1 c11, c11, a5, b5 # axc A1xB1
  273. MADD3 c13, c13, a5, b6 # axd
  274. MADD2 c12, c12, a6, b5 # bxc
  275. MADD4 c14, c14, a6, b6 # bxd
  276. gsLQC1(R12, F3, F2, 5)
  277. gsLQC1(R13, F7, F6, 5)
  278. MADD1 c21, c21, a7, b5 # A2xB1
  279. MADD3 c23, c23, a7, b6
  280. MADD2 c22, c22, a8, b5
  281. MADD4 c24, c24, a8, b6
  282. FETCH $0, 8 * SIZE(PREA)
  283. FETCH $0, 8 * SIZE(PREB)
  284. MADD1 c31, c31, a5, b7 # A1xB2
  285. MADD3 c33, c33, a5, b8
  286. MADD2 c32, c32, a6, b7
  287. MADD4 c34, c34, a6, b8
  288. MADD1 c41, c41, a7, b7 # A2xB2
  289. MADD3 c43, c43, a7, b8
  290. MADD2 c42, c42, a8, b7
  291. MADD4 c44, c44, a8, b8
  292. gsLQC1(R12, F9, F8, 6) # Unroll K=3
  293. gsLQC1(R13, F13, F12, 6)
  294. MADD1 c11, c11, a1, b1 # axc A1xB1
  295. MADD3 c13, c13, a1, b2 # axd
  296. gsLQC1(R13, F16, F15, 7)
  297. gsLQC1(R12, F11, F10, 7)
  298. MADD2 c12, c12, a2, b1 # bxc
  299. MADD4 c14, c14, a2, b2 # bxd
  300. MADD1 c21, c21, a3, b1 # A2xB1
  301. MADD3 c23, c23, a3, b2
  302. daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx
  303. daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx
  304. MADD2 c22, c22, a4, b1
  305. MADD4 c24, c24, a4, b2
  306. FETCH $0, 12 * SIZE(PREA)
  307. MADD1 c31, c31, a1, b3 # A1xB2
  308. MADD3 c33, c33, a1, b4
  309. daddiu L, L, -1
  310. FETCH $0, 12 * SIZE(PREB)
  311. MADD2 c32, c32, a2, b3
  312. MADD4 c34, c34, a2, b4
  313. MADD1 c41, c41, a3, b3 # A2xB2
  314. MADD3 c43, c43, a3, b4
  315. daddu PREA, PREA, 16 * SIZE # prefetch pointers advance by the same 16 values
  316. daddu PREB, PREB, 16 * SIZE
  317. MADD2 c42, c42, a4, b3
  318. MADD4 c44, c44, a4, b4
  319. gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4
  320. gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
  321. MADD1 c11, c11, a5, b5 # axc A1xB1
  322. MADD3 c13, c13, a5, b6 # axd
  323. gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
  324. gsLQC1(R13, F7, F6, 1) # R:b3 I:b4
  325. MADD2 c12, c12, a6, b5 # bxc
  326. MADD4 c14, c14, a6, b6 # bxd
  327. MADD1 c21, c21, a7, b5 # A2xB1
  328. MADD3 c23, c23, a7, b6
  329. MADD2 c22, c22, a8, b5
  330. MADD4 c24, c24, a8, b6
  331. FETCH $0, 0 * SIZE(PREA)
  332. FETCH $0, 0 * SIZE(PREB)
  333. MADD1 c31, c31, a5, b7 # A1xB2
  334. MADD3 c33, c33, a5, b8
  335. MADD2 c32, c32, a6, b7
  336. MADD4 c34, c34, a6, b8
  337. MADD1 c41, c41, a7, b7 # A2xB2
  338. MADD3 c43, c43, a7, b8
  339. MADD2 c42, c42, a8, b7
  340. bgtz L, .L12
  341. MADD4 c44, c44, a8, b8 # branch delay slot: last product of the fourth step
  ## .L15: K remainder of the 2x2 tile. L = low two bits of the step count, and
  ## alpha is reloaded from the stack now that the b7 and b8 registers are free.
  ## .L16 then handles one k step per iteration using the operands that were
  ## preloaded by the previous step, and preloads the next ones at its end.
  342. .align 5
  343. .L15:
  344. #ifndef TRMMKERNEL
  345. andi L, K, 3
  346. LD ALPHA_R, 128($sp)
  347. #else
  348. andi L, TEMP, 3
  349. LD ALPHA_R, 128($sp)
  350. #endif
  351. blez L, .L18
  352. LD ALPHA_I, 136($sp) # branch delay slot: reloaded on both paths
  353. .align 5
  354. .L16:
  355. daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx
  356. daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx
  357. MADD1 c11, c11, a1, b1 # axc A1xB1
  358. MADD3 c13, c13, a1, b2 # axd
  359. daddiu PREA, PREA, 4 * SIZE
  360. daddiu PREB, PREB, 4 * SIZE
  361. MADD2 c12, c12, a2, b1 # bxc
  362. MADD4 c14, c14, a2, b2 # bxd
  363. MADD1 c21, c21, a3, b1 # A2xB1
  364. MADD3 c23, c23, a3, b2
  365. MADD2 c22, c22, a4, b1
  366. MADD4 c24, c24, a4, b2
  367. FETCH $0, 0 * SIZE(PREA)
  368. MADD1 c31, c31, a1, b3 # A1xB2
  369. MADD3 c33, c33, a1, b4
  370. daddiu L, L, -1
  371. MADD2 c32, c32, a2, b3
  372. MADD4 c34, c34, a2, b4
  373. FETCH $0, 0 * SIZE(PREB)
  374. MADD1 c41, c41, a3, b3 # A2xB2
  375. MADD3 c43, c43, a3, b4
  376. MADD2 c42, c42, a4, b3
  377. MADD4 c44, c44, a4, b4
  378. gsLQC1(R12, F1, F0, 0) # R:a1 I:a2, operands of the next k step
  379. gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
  380. gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
  381. gsLQC1(R13, F7, F6, 1) # R:b3 I:b4
  382. bgtz L, .L16
  383. NOP # branch delay slot
  ## .L18: write-back of the 2x2 tile. The four partial sums of each element
  ## are first folded into a real part (cx1 with cx4) and an imaginary part
  ## (cx2 with cx3); the result is then scaled by alpha and stored through CO1
  ## and CO2. The a and b registers are reused here as temporaries.
  ## NOTE(review): ADD, MADD, NMSUB and MUL come from common.h; the comments
  ## below assume ADD d,s,t = s + t, MADD d,r,s,t = r + s*t, NMSUB d,r,s,t =
  ## r - s*t and MUL d,s,t = s*t. Confirm there.
  384. .L18:
  ## GEMM build: the old output values are loaded and accumulated into.
  385. #ifndef TRMMKERNEL
  386. ADD c11, c14, c11
  387. LD a1, 0 * SIZE(CO1)
  388. ADD c12, c13, c12
  389. LD a2, 1 * SIZE(CO1)
  390. ADD c21, c24, c21
  391. LD b1, 2 * SIZE(CO1)
  392. ADD c22, c23, c22
  393. LD b2, 3 * SIZE(CO1)
  394. ADD c31, c34, c31
  395. LD a3, 0 * SIZE(CO2)
  396. ADD c32, c33, c32
  397. LD a4, 1 * SIZE(CO2)
  398. ADD c41, c44, c41
  399. LD b3, 2 * SIZE(CO2)
  400. ADD c42, c43, c42
  401. LD b4, 3 * SIZE(CO2)
  402. daddiu I, I, -1
  403. MADD a1, a1, ALPHA_R, c11
  404. MADD a2, a2, ALPHA_R, c12
  405. MADD b1, b1, ALPHA_R, c21
  406. MADD b2, b2, ALPHA_R, c22
  407. NMSUB a1, a1, ALPHA_I, c12
  408. MADD a2, a2, ALPHA_I, c11
  409. NMSUB b1, b1, ALPHA_I, c22
  410. MADD b2, b2, ALPHA_I, c21
  411. MADD a3, a3, ALPHA_R, c31
  412. MADD a4, a4, ALPHA_R, c32
  413. ST a1, 0 * SIZE(CO1)
  414. MADD b3, b3, ALPHA_R, c41
  415. MADD b4, b4, ALPHA_R, c42
  416. ST a2, 1 * SIZE(CO1)
  417. NMSUB a3, a3, ALPHA_I, c32
  418. MADD a4, a4, ALPHA_I, c31
  419. ST b1, 2 * SIZE(CO1)
  420. NMSUB b3, b3, ALPHA_I, c42
  421. MADD b4, b4, ALPHA_I, c41
  422. ST b2, 3 * SIZE(CO1)
  423. ST a3, 0 * SIZE(CO2)
  424. ST a4, 1 * SIZE(CO2)
  425. ST b3, 2 * SIZE(CO2)
  426. ST b4, 3 * SIZE(CO2)
  ## TRMM build: the output is overwritten, so nothing is loaded from it.
  427. #else
  428. ADD c11, c14, c11
  429. ADD c12, c13, c12
  430. ADD c21, c24, c21
  431. ADD c22, c23, c22
  432. ADD c31, c34, c31
  433. ADD c32, c33, c32
  434. ADD c41, c44, c41
  435. ADD c42, c43, c42
  436. daddiu I, I, -1
  437. MUL a1, ALPHA_R, c11
  438. MUL a2, ALPHA_R, c12
  439. MUL b1, ALPHA_R, c21
  440. MUL b2, ALPHA_R, c22
  441. NMSUB a1, a1, ALPHA_I, c12
  442. MADD a2, a2, ALPHA_I, c11
  443. NMSUB b1, b1, ALPHA_I, c22
  444. MADD b2, b2, ALPHA_I, c21
  445. MUL a3, ALPHA_R, c31
  446. MUL a4, ALPHA_R, c32
  447. MUL b3, ALPHA_R, c41
  448. MUL b4, ALPHA_R, c42
  449. NMSUB a3, a3, ALPHA_I, c32
  450. MADD a4, a4, ALPHA_I, c31
  451. NMSUB b3, b3, ALPHA_I, c42
  452. MADD b4, b4, ALPHA_I, c41
  453. ST a1, 0 * SIZE(CO1)
  454. ST a2, 1 * SIZE(CO1)
  455. ST b1, 2 * SIZE(CO1)
  456. ST b2, 3 * SIZE(CO1)
  457. ST a3, 0 * SIZE(CO2)
  458. ST a4, 1 * SIZE(CO2)
  459. ST b3, 2 * SIZE(CO2)
  460. ST b4, 3 * SIZE(CO2)
  ## Skip the k steps this tile did not consume so AO and BO end up past the
  ## full panel.
  461. #if ( defined(LEFT) && defined(TRANSA)) || \
  462. (!defined(LEFT) && !defined(TRANSA))
  463. dsubu TEMP, K, KK
  464. #ifdef LEFT
  465. daddiu TEMP, TEMP, -2
  466. #else
  467. daddiu TEMP, TEMP, -2
  468. #endif
  469. dsll L, TEMP, 1 + ZBASE_SHIFT
  470. dsll TEMP, TEMP, 1 + ZBASE_SHIFT
  471. daddu AO, AO, L
  472. daddu BO, BO, TEMP
  473. #endif
  474. #ifdef LEFT
  475. daddiu KK, KK, 2
  476. #endif
  477. #endif
  478. dsll PREB, K, 1 + ZBASE_SHIFT # reset PREB to the panel size; B is added again at .L11 or .L30
  479. daddiu CO1,CO1, 4 * SIZE
  480. bgtz I, .L11
  481. daddiu CO2,CO2, 4 * SIZE # branch delay slot: both column pointers move to the next tile
  ## .L30: after the 2-row tiles of the current column pair. Advances C by two
  ## columns, then, when M is odd, sets up the 1x2 tile: positions AO and BO,
  ## clears the eight accumulators used (c11-c14 and c31-c34) and preloads one
  ## pair from A and two pairs from B. Goes to .L19 when M is even.
  482. .align 5
  483. .L30:
  484. andi I, M, 1
  485. daddu C, C, LDC # Change C to next panel
  486. daddu PREB, PREB, B # PREB = B + panel size
  487. blez I, .L19
  488. daddu C, C, LDC # branch delay slot: second LDC step, so C moves two columns on both paths
  489. #if defined(TRMMKERNEL)
  490. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  491. move BO, B
  492. #else
  493. dsll L, KK, ZBASE_SHIFT # MR=1
  494. dsll TEMP, KK, 1 + ZBASE_SHIFT # NR=2
  495. daddu AO, AO, L
  496. daddu BO, B, TEMP
  497. #endif
  498. gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
  499. MTC $0, c11 # Clear results regs
  500. MOV c12, c11
  501. gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
  502. MOV c13, c11
  503. MOV c14, c11
  504. gsLQC1(R13, F7, F6, 1) # R:b3 I:b4
  505. MOV c31, c11
  506. MOV c32, c11
  507. FETCH $0, 0 * SIZE(PREB)
  508. MOV c33, c11
  509. MOV c34, c11
  510. FETCH $0, 0 * SIZE(CO1)
  511. FETCH $0, 0 * SIZE(CO2)
  512. FETCH $0, 4 * SIZE(CO1)
  513. FETCH $0, 4 * SIZE(CO2)
  514. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  515. dsubu TEMP, K, KK
  516. #elif defined(LEFT)
  517. daddiu TEMP, KK, 1 # MR=1
  518. #else
  519. daddiu TEMP, KK, 2 # NR=2
  520. #endif
  521. dsra L, TEMP, 2 # four k steps per .L32 iteration
  522. blez L, .L35
  523. NOP # branch delay slot
  524. #else
  525. gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
  526. dsra L, K, 2 # Unroll K 4 times
  527. move BO, B
  528. gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
  529. MTC $0, c11 # Clear results regs
  530. MOV c12, c11
  531. gsLQC1(R13, F7, F6, 1) # R:b3 I:b4
  532. MOV c13, c11
  533. MOV c14, c11
  534. FETCH $0, 0 * SIZE(PREB)
  535. MOV c31, c11
  536. MOV c32, c11
  537. FETCH $0, 0 * SIZE(CO1)
  538. FETCH $0, 0 * SIZE(CO2)
  539. FETCH $0, 4 * SIZE(CO1)
  540. FETCH $0, 4 * SIZE(CO2)
  541. MOV c33, c11
  542. blez L, .L35
  543. MOV c34, c11 # branch delay slot: the last accumulator is cleared on both paths
  544. #endif
  ## .L32: main K loop of the 1x2 tile, four k steps per iteration. One pair
  ## is read from the A panel per step (a1/a2, a3/a4, a5/a6, a7/a8 in turn) and
  ## two pairs from the B panel, alternating between b1-b4 and b5-b8. AO moves
  ## by 8 values and BO by 16 per iteration; the loads after those updates use
  ## offsets 0 and 1 and fetch the first step of the next iteration.
  545. .align 5
  546. .L32:
  547. gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
  548. gsLQC1(R13, F13, F12, 2)
  549. MADD1 c11, c11, a1, b1 # axc A1xB1
  550. MADD3 c13, c13, a1, b2 # axd
  551. gsLQC1(R13, F16, F15, 3)
  552. MADD2 c12, c12, a2, b1 # bxc
  553. MADD4 c14, c14, a2, b2 # bxd
  554. NOP
  555. MADD1 c31, c31, a1, b3 # A1xB2
  556. MADD3 c33, c33, a1, b4
  557. FETCH $0, 4 * SIZE(PREB)
  558. MADD2 c32, c32, a2, b3
  559. MADD4 c34, c34, a2, b4
  560. NOP
  561. gsLQC1(R12, F9, F8, 2) # Unroll K=1
  562. gsLQC1(R13, F5, F4, 4)
  563. MADD1 c11, c11, a3, b5 # axc A1xB1
  564. MADD3 c13, c13, a3, b6 # axd
  565. gsLQC1(R13, F7, F6, 5)
  566. MADD2 c12, c12, a4, b5 # bxc
  567. MADD4 c14, c14, a4, b6 # bxd
  568. NOP
  569. MADD1 c31, c31, a3, b7 # A1xB2
  570. MADD3 c33, c33, a3, b8
  571. FETCH $0, 8 * SIZE(PREB)
  572. MADD2 c32, c32, a4, b7
  573. MADD4 c34, c34, a4, b8
  574. daddiu L, L, -1
  575. gsLQC1(R12, F11, F10, 3)
  576. gsLQC1(R13, F13, F12, 6)
  577. MADD1 c11, c11, a5, b1 # axc A1xB1
  578. MADD3 c13, c13, a5, b2 # axd
  579. gsLQC1(R13, F16, F15, 7)
  580. MADD2 c12, c12, a6, b1 # bxc
  581. MADD4 c14, c14, a6, b2 # bxd
  582. daddiu AO, AO, 8 * SIZE # 1mr*4kr*cmpx
  583. MADD1 c31, c31, a5, b3 # A1xB2
  584. MADD3 c33, c33, a5, b4
  585. FETCH $0, 12 * SIZE(PREB)
  586. MADD2 c32, c32, a6, b3
  587. MADD4 c34, c34, a6, b4
  588. daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx
  589. gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4
  590. gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
  591. MADD1 c11, c11, a7, b5 # axc A1xB1
  592. MADD3 c13, c13, a7, b6 # axd
  593. gsLQC1(R13, F7, F6, 1) # R:b3 I:b4
  594. MADD2 c12, c12, a8, b5 # bxc
  595. MADD4 c14, c14, a8, b6 # bxd
  596. daddiu PREB, PREB, 16 * SIZE
  597. MADD1 c31, c31, a7, b7 # A1xB2
  598. MADD3 c33, c33, a7, b8
  599. FETCH $0, 0 * SIZE(PREB)
  600. MADD2 c32, c32, a8, b7
  601. bgtz L, .L32
  602. MADD4 c34, c34, a8, b8 # branch delay slot: last product of the fourth step
  ## .L35: K remainder of the 1x2 tile. L = low two bits of the step count and
  ## alpha is reloaded from the stack. .L36 handles one k step per iteration
  ## with the operands preloaded by the previous step.
  603. .L35:
  604. #ifndef TRMMKERNEL
  605. andi L, K, 3
  606. LD ALPHA_R, 128($sp)
  607. #else
  608. andi L, TEMP, 3
  609. LD ALPHA_R, 128($sp)
  610. #endif
  611. blez L, .L38
  612. LD ALPHA_I, 136($sp) # branch delay slot: reloaded on both paths
  613. .align 5
  614. .L36:
  615. daddiu L, L, -1
  616. MADD1 c11, c11, a1, b1 # axc A1xB1
  617. MADD3 c13, c13, a1, b2 # axd
  618. daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx
  619. MADD2 c12, c12, a2, b1 # bxc
  620. MADD4 c14, c14, a2, b2 # bxd
  621. daddiu AO, AO, 2 * SIZE # 1mr*1kr*cmpx
  622. MADD1 c31, c31, a1, b3 # A1xB2
  623. MADD3 c33, c33, a1, b4
  624. daddiu PREB, PREB, 4 * SIZE
  625. MADD2 c32, c32, a2, b3
  626. MADD4 c34, c34, a2, b4
  627. gsLQC1(R12, F1, F0, 0) # R:a1 I:a2, operands of the next k step
  628. gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
  629. NOP
  630. bgtz L, .L36
  631. gsLQC1(R13, F7, F6, 1) # R:b3 I:b4 (branch delay slot)
  ## .L38: write-back of the 1x2 tile: one complex value through CO1 and one
  ## through CO2. Partial sums are folded, scaled by alpha and stored; the a
  ## registers are reused as temporaries.
  ## NOTE(review): the arithmetic reading assumes ADD d,s,t = s + t,
  ## MADD d,r,s,t = r + s*t, NMSUB d,r,s,t = r - s*t and MUL d,s,t = s*t, all
  ## defined in common.h. Confirm there.
  632. .L38:
  ## GEMM build: accumulate into the values already stored at the output.
  633. #ifndef TRMMKERNEL
  634. ADD c11, c14, c11
  635. LD a1, 0 * SIZE(CO1)
  636. ADD c12, c13, c12
  637. LD a2, 1 * SIZE(CO1)
  638. ADD c31, c34, c31
  639. LD a3, 0 * SIZE(CO2)
  640. ADD c32, c33, c32
  641. LD a4, 1 * SIZE(CO2)
  642. MADD a1, a1, ALPHA_R, c11
  643. MADD a2, a2, ALPHA_R, c12
  644. MADD a3, a3, ALPHA_R, c31
  645. MADD a4, a4, ALPHA_R, c32
  646. NMSUB a1, a1, ALPHA_I, c12
  647. MADD a2, a2, ALPHA_I, c11
  648. NMSUB a3, a3, ALPHA_I, c32
  649. MADD a4, a4, ALPHA_I, c31
  650. ST a1, 0 * SIZE(CO1)
  651. ST a2, 1 * SIZE(CO1)
  652. ST a3, 0 * SIZE(CO2)
  653. ST a4, 1 * SIZE(CO2)
  654. daddiu CO1,CO1, 2 * SIZE
  655. daddiu CO2,CO2, 2 * SIZE
  ## TRMM build: the output is overwritten, nothing is loaded from it.
  656. #else
  657. ADD c11, c14, c11
  658. ADD c12, c13, c12
  659. ADD c31, c34, c31
  660. ADD c32, c33, c32
  661. MUL a1, ALPHA_R, c11
  662. MUL a2, ALPHA_R, c12
  663. MUL a3, ALPHA_R, c31
  664. MUL a4, ALPHA_R, c32
  665. NMSUB a1, a1, ALPHA_I, c12
  666. MADD a2, a2, ALPHA_I, c11
  667. NMSUB a3, a3, ALPHA_I, c32
  668. MADD a4, a4, ALPHA_I, c31
  669. ST a1, 0 * SIZE(CO1)
  670. ST a2, 1 * SIZE(CO1)
  671. ST a3, 0 * SIZE(CO2)
  672. ST a4, 1 * SIZE(CO2)
  673. daddiu CO1,CO1, 2 * SIZE
  674. daddiu CO2,CO2, 2 * SIZE
  ## Skip the k steps this tile did not consume: one value pair per step in the
  ## A panel, two in the B panel.
  675. #if ( defined(LEFT) && defined(TRANSA)) || \
  676. (!defined(LEFT) && !defined(TRANSA))
  677. dsubu TEMP, K, KK
  678. #ifdef LEFT
  679. daddiu TEMP, TEMP, -1
  680. #else
  681. daddiu TEMP, TEMP, -2
  682. #endif
  683. dsll L, TEMP, ZBASE_SHIFT
  684. dsll TEMP, TEMP, 1 + ZBASE_SHIFT
  685. daddu AO, AO, L
  686. daddu BO, BO, TEMP
  687. #endif
  688. #ifdef LEFT
  689. daddiu KK, KK, 1
  690. #endif
  691. #endif
  ## .L19: end of one column pair. In the TRMM build without LEFT, KK grows by
  ## the two columns just finished. Loops back to .L10 while J is positive.
  692. .align 5
  693. .L19:
  694. #if defined(TRMMKERNEL) && !defined(LEFT)
  695. daddiu KK, KK, 2
  696. #endif
  697. bgtz J, .L10
  698. move B, BO # branch delay slot: B takes the final BO value on both paths
  ## .L20: last single column, taken only when N is odd; otherwise the code
  ## goes straight to the exit at .L999. Resets the row-tile counter and the
  ## A, C and prefetch pointers, and skips to .L29 when M is smaller than 2.
  699. .align 5
  700. .L20:
  701. andi J, N, 1
  702. blez J, .L999
  703. dsll PREA, K, 1+ZBASE_SHIFT # PREA=K*2*2^4 (branch delay slot, harmless on the exit path)
  704. dsra I, M, 1 # I=M/2
  705. move CO1, C
  706. #if defined(TRMMKERNEL) && defined(LEFT)
  707. move KK, OFFSET
  708. #endif
  709. move AO, A # Reset AO
  710. blez I, .L29
  711. daddu PREA, PREA, A # branch delay slot: PREA = A + panel size on both paths
  ## .L21: set-up for one 2x1 tile. Positions AO and BO, clears the eight
  ## accumulators used (c11-c14 and c21-c24), preloads two pairs from the A
  ## panel and one from the B panel, and computes L = number of 4-step
  ## iterations for .L22. Falls to .L25 when L is not positive.
  712. .L21:
  713. #if defined(TRMMKERNEL)
  714. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  715. move BO, B
  716. #else
  717. dsll L, KK, 1 + ZBASE_SHIFT # two value pairs per k step in the A panel
  718. dsll TEMP, KK, ZBASE_SHIFT # one value pair per k step in the B panel
  719. daddu AO, AO, L
  720. daddu BO, B, TEMP
  721. #endif
  722. gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
  723. MTC $0, c11 # Clear results regs
  724. MOV c12, c11
  725. gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
  726. MOV c13, c11
  727. MOV c14, c11
  728. gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
  729. MOV c21, c11
  730. MOV c22, c11
  731. FETCH $0, 0 * SIZE(PREA)
  732. MOV c23, c11
  733. MOV c24, c11
  734. FETCH $0, 0 * SIZE(CO1)
  735. FETCH $0, 4 * SIZE(CO1)
  736. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  737. dsubu TEMP, K, KK
  738. #elif defined(LEFT)
  739. daddiu TEMP, KK, 2 # define Mr=2
  740. #else
  741. daddiu TEMP, KK, 1 # define NR=1
  742. #endif
  743. dsra L, TEMP, 2 # four k steps per .L22 iteration
  744. blez L, .L25
  745. NOP # branch delay slot
  746. #else
  747. dsra L, K, 2 # Unroll K 4 times
  748. move BO, B
  749. gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
  750. MTC $0, c11 # Clear results regs
  751. MOV c12, c11
  752. gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
  753. MOV c13, c11
  754. MOV c14, c11
  755. gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
  756. MOV c21, c11
  757. MOV c22, c11
  758. FETCH $0, 0 * SIZE(PREA)
  759. MOV c23, c11
  760. MOV c24, c11
  761. FETCH $0, 0 * SIZE(CO1)
  762. FETCH $0, 4 * SIZE(CO1)
  763. blez L, .L25
  764. NOP # branch delay slot
  765. #endif
  ## .L22: main K loop of the 2x1 tile, four k steps per iteration. Two pairs
  ## are read from the A panel per step, alternating between a1-a4 and a5-a8,
  ## and one pair from the B panel (b1/b2, b3/b4, b5/b6, b7/b8 in turn). BO
  ## moves by 8 values and AO by 16 per iteration; loads issued after those
  ## updates use offsets 0 and 1 and fetch the first step of the next iteration.
  766. .align 5
  767. .L22:
  768. gsLQC1(R12, F9, F8, 2) # Unroll K=1
  769. MADD1 c11, c11, a1, b1 # axc A1xB1
  770. MADD3 c13, c13, a1, b2 # axd
  771. gsLQC1(R13, F7, F6, 1) # R:b3 I:b4, B operand of the second step
  772. MADD2 c12, c12, a2, b1 # bxc
  773. MADD4 c14, c14, a2, b2 # bxd
  774. gsLQC1(R12, F11, F10, 3)
  775. MADD1 c21, c21, a3, b1 # A2xB1
  776. MADD3 c23, c23, a3, b2
  777. FETCH $0, 4 * SIZE(PREA)
  778. MADD2 c22, c22, a4, b1
  779. MADD4 c24, c24, a4, b2
  780. gsLQC1(R12, F1, F0, 4) # Unroll K=2
  781. MADD1 c11, c11, a5, b3 # axc A1xB1
  782. MADD3 c13, c13, a5, b4 # axd
  783. gsLQC1(R13, F13, F12, 2)
  784. MADD2 c12, c12, a6, b3 # bxc
  785. MADD4 c14, c14, a6, b4 # bxd
  786. gsLQC1(R12, F3, F2, 5)
  787. MADD1 c21, c21, a7, b3 # A2xB1
  788. MADD3 c23, c23, a7, b4
  789. FETCH $0, 8 * SIZE(PREA)
  790. MADD2 c22, c22, a8, b3
  791. MADD4 c24, c24, a8, b4
  792. daddiu L, L, -1
  793. gsLQC1(R12, F9, F8, 6) # Unroll K=3
  794. MADD1 c11, c11, a1, b5 # axc A1xB1
  795. MADD3 c13, c13, a1, b6 # axd
  796. gsLQC1(R13, F16, F15, 3)
  797. MADD2 c12, c12, a2, b5 # bxc
  798. MADD4 c14, c14, a2, b6 # bxd
  799. gsLQC1(R12, F11, F10, 7)
  800. MADD1 c21, c21, a3, b5 # A2xB1
  801. MADD3 c23, c23, a3, b6
  802. daddiu BO, BO, 8 * SIZE # 1nr*4kr*cmpx
  803. FETCH $0, 12 * SIZE(PREA)
  804. MADD2 c22, c22, a4, b5
  805. MADD4 c24, c24, a4, b6
  806. daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx
  807. gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4
  808. MADD1 c11, c11, a5, b7 # axc A1xB1
  809. MADD3 c13, c13, a5, b8 # axd
  810. daddiu PREA, PREA, 16 * SIZE
  811. gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
  812. MADD2 c12, c12, a6, b7 # bxc
  813. MADD4 c14, c14, a6, b8 # bxd
  814. gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
  815. MADD1 c21, c21, a7, b7 # A2xB1
  816. MADD3 c23, c23, a7, b8
  817. FETCH $0, 0 * SIZE(PREA)
  818. MADD2 c22, c22, a8, b7
  819. bgtz L, .L22
  820. MADD4 c24, c24, a8, b8 # branch delay slot: last product of the fourth step
  ## .L25: K remainder of the 2x1 tile. L = low two bits of the step count and
  ## alpha is reloaded from the stack. .L26 handles one k step per iteration
  ## with the operands preloaded by the previous step.
  821. .L25:
  822. #ifndef TRMMKERNEL
  823. andi L, K, 3
  824. LD ALPHA_R, 128($sp)
  825. #else
  826. andi L, TEMP, 3
  827. LD ALPHA_R, 128($sp)
  828. #endif
  829. blez L, .L28
  830. LD ALPHA_I, 136($sp) # branch delay slot: reloaded on both paths
  831. .align 3
  832. .L26:
  833. daddiu L, L, -1
  834. MADD1 c11, c11, a1, b1 # axc A1xB1
  835. MADD3 c13, c13, a1, b2 # axd
  836. daddiu BO, BO, 2 * SIZE # 1nr*1kr*cmpx
  837. MADD2 c12, c12, a2, b1 # bxc
  838. MADD4 c14, c14, a2, b2 # bxd
  839. daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx
  840. MADD1 c21, c21, a3, b1 # A2xB1
  841. MADD3 c23, c23, a3, b2
  842. daddiu PREA, PREA, 4 * SIZE # 2mr*1kr*cmpx
  843. MADD2 c22, c22, a4, b1
  844. MADD4 c24, c24, a4, b2
  845. gsLQC1(R12, F1, F0, 0) # R:a1 I:a2, operands of the next k step
  846. gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
  847. gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
  848. bgtz L, .L26
  849. FETCH $0, 0 * SIZE(PREA) # branch delay slot
  ## .L28: write-back of the 2x1 tile: two complex values through CO1.
  ## Partial sums are folded, scaled by alpha and stored; a1, a2, b1 and b2
  ## are reused as temporaries. Loops back to .L21 while I is positive.
  ## NOTE(review): the arithmetic reading assumes ADD d,s,t = s + t,
  ## MADD d,r,s,t = r + s*t, NMSUB d,r,s,t = r - s*t and MUL d,s,t = s*t, all
  ## defined in common.h. Confirm there.
  850. .L28:
  ## GEMM build: accumulate into the values already stored at the output.
  851. #ifndef TRMMKERNEL
  852. ADD c11, c14, c11
  853. LD a1, 0 * SIZE(CO1)
  854. ADD c12, c13, c12
  855. LD a2, 1 * SIZE(CO1)
  856. ADD c21, c24, c21
  857. LD b1, 2 * SIZE(CO1)
  858. ADD c22, c23, c22
  859. LD b2, 3 * SIZE(CO1)
  860. daddiu I, I, -1
  861. MADD a1, a1, ALPHA_R, c11
  862. MADD a2, a2, ALPHA_R, c12
  863. MADD b1, b1, ALPHA_R, c21
  864. MADD b2, b2, ALPHA_R, c22
  865. NMSUB a1, a1, ALPHA_I, c12
  866. MADD a2, a2, ALPHA_I, c11
  867. NMSUB b1, b1, ALPHA_I, c22
  868. MADD b2, b2, ALPHA_I, c21
  869. ST a1, 0 * SIZE(CO1)
  870. ST a2, 1 * SIZE(CO1)
  871. ST b1, 2 * SIZE(CO1)
  872. ST b2, 3 * SIZE(CO1)
  ## TRMM build: the output is overwritten, nothing is loaded from it.
  873. #else
  874. ADD c11, c14, c11
  875. ADD c12, c13, c12
  876. ADD c21, c24, c21
  877. ADD c22, c23, c22
  878. daddiu I, I, -1
  879. MUL a1, ALPHA_R, c11
  880. MUL a2, ALPHA_R, c12
  881. MUL b1, ALPHA_R, c21
  882. MUL b2, ALPHA_R, c22
  883. NMSUB a1, a1, ALPHA_I, c12
  884. MADD a2, a2, ALPHA_I, c11
  885. NMSUB b1, b1, ALPHA_I, c22
  886. MADD b2, b2, ALPHA_I, c21
  887. ST a1, 0 * SIZE(CO1)
  888. ST a2, 1 * SIZE(CO1)
  889. ST b1, 2 * SIZE(CO1)
  890. ST b2, 3 * SIZE(CO1)
  ## Skip the k steps this tile did not consume: two value pairs per step in
  ## the A panel, one in the B panel.
  891. #if ( defined(LEFT) && defined(TRANSA)) || \
  892. (!defined(LEFT) && !defined(TRANSA))
  893. dsubu TEMP, K, KK
  894. #ifdef LEFT
  895. daddiu TEMP, TEMP, -2
  896. #else
  897. daddiu TEMP, TEMP, -1
  898. #endif
  899. dsll L, TEMP, 1 + ZBASE_SHIFT
  900. dsll TEMP, TEMP, ZBASE_SHIFT
  901. daddu AO, AO, L
  902. daddu BO, BO, TEMP
  903. #endif
  904. #ifdef LEFT
  905. daddiu KK, KK, 2
  906. #endif
  907. #endif
  908. daddiu CO1,CO1, 4 * SIZE
  909. bgtz I, .L21
  910. NOP # branch delay slot
  ## .L29: last 1x1 tile of the single column, taken only when M is odd;
  ## otherwise the code exits through .L999. Positions AO and BO, clears
  ## c11-c14, preloads one pair from each panel and computes L = number of
  ## 4-step iterations for .L42. Falls to .L45 when L is not positive.
  911. .L29:
  912. andi I, M, 1
  913. blez I, .L999
  914. NOP # branch delay slot
  915. #if defined(TRMMKERNEL)
  916. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  917. move BO, B
  918. #else
  919. dsll TEMP, KK, ZBASE_SHIFT # one value pair per k step in both panels
  920. daddu AO, AO, TEMP
  921. daddu BO, B, TEMP
  922. #endif
  923. gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
  924. MTC $0, c11 # Clear results regs
  925. MOV c12, c11
  926. gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
  927. MOV c13, c11
  928. MOV c14, c11
  929. FETCH $0, 0 * SIZE(PREA)
  930. FETCH $0, 4 * SIZE(PREA)
  931. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  932. dsubu TEMP, K, KK
  933. #elif defined(LEFT)
  934. daddiu TEMP, KK, 1
  935. #else
  936. daddiu TEMP, KK, 1
  937. #endif
  938. dsra L, TEMP, 2 # four k steps per .L42 iteration
  939. blez L, .L45
  940. NOP # branch delay slot
  941. #else
  942. dsra L, K, 2 # Unroll K 4 times
  943. move BO, B
  944. gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
  945. MTC $0, c11 # Clear results regs
  946. MOV c12, c11
  947. gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
  948. MOV c13, c11
  949. MOV c14, c11
  950. FETCH $0, 0 * SIZE(PREA)
  951. FETCH $0, 4 * SIZE(PREA)
  952. blez L, .L45
  953. NOP # branch delay slot
  954. #endif
  ## .L42: main K loop of the 1x1 tile, four k steps per iteration. Each step
  ## uses one pair from each panel: a1/a2 with b1/b2, a3/a4 with b3/b4, a5/a6
  ## with b5/b6 and a7/a8 with b7/b8. AO and BO both move by 8 values per
  ## iteration; loads issued after those updates use offset 0 and fetch the
  ## first step of the next iteration.
  955. .align 3
  956. .L42:
  957. gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
  958. MADD1 c11, c11, a1, b1 # axc A1xB1
  959. MADD3 c13, c13, a1, b2 # axd
  960. gsLQC1(R13, F7, F6, 1) # R:b3 I:b4
  961. MADD2 c12, c12, a2, b1 # bxc
  962. MADD4 c14, c14, a2, b2 # bxd
  963. gsLQC1(R12, F9, F8, 2) # Unroll K=1
  964. MADD1 c11, c11, a3, b3 # axc A1xB1
  965. MADD3 c13, c13, a3, b4 # axd
  966. gsLQC1(R13, F13, F12, 2)
  967. MADD2 c12, c12, a4, b3 # bxc
  968. MADD4 c14, c14, a4, b4 # bxd
  969. daddiu L, L, -1
  970. gsLQC1(R12, F11, F10, 3)
  971. MADD1 c11, c11, a5, b5 # axc A1xB1
  972. MADD3 c13, c13, a5, b6 # axd
  973. daddiu AO, AO, 8 * SIZE # 1mr*4kr*cmpx
  974. gsLQC1(R13, F16, F15, 3)
  975. MADD2 c12, c12, a6, b5 # bxc
  976. MADD4 c14, c14, a6, b6 # bxd
  977. daddiu BO, BO, 8 * SIZE # 1nr*4kr*cmpx
  978. gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4
  979. MADD1 c11, c11, a7, b7 # axc A1xB1
  980. MADD3 c13, c13, a7, b8 # axd
  981. gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
  982. MADD2 c12, c12, a8, b7 # bxc
  983. bgtz L, .L42
  984. MADD4 c14, c14, a8, b8 # bxd (branch delay slot)
  ## .L45: K remainder of the 1x1 tile. L = low two bits of the step count and
  ## alpha is reloaded from the stack. .L46 handles one k step per iteration
  ## with the operands preloaded by the previous step.
  985. .align 5
  986. .L45:
  987. #ifndef TRMMKERNEL
  988. andi L, K, 3
  989. LD ALPHA_R, 128($sp)
  990. #else
  991. andi L, TEMP, 3
  992. LD ALPHA_R, 128($sp)
  993. #endif
  994. blez L, .L48
  995. LD ALPHA_I, 136($sp) # branch delay slot: reloaded on both paths
  996. .L46:
  997. daddiu L, L, -1
  998. daddiu BO, BO, 1 * SIZE * COMPSIZE # 1nr*1kr*cmpx
  999. daddiu AO, AO, 1 * SIZE * COMPSIZE # 1mr*1kr*cmpx
  1000. MADD1 c11, c11, a1, b1 # axc A1xB1
  1001. MADD3 c13, c13, a1, b2 # axd
  1002. MADD2 c12, c12, a2, b1 # bxc
  1003. MADD4 c14, c14, a2, b2 # bxd
  1004. gsLQC1(R12, F1, F0, 0) # R:a1 I:a2, operands of the next k step
  1005. gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
  1006. bgtz L, .L46
  1007. NOP # branch delay slot
  ## .L48: write-back of the 1x1 tile: one complex value through CO1, after
  ## which control falls into the exit sequence at .L999.
  ## NOTE(review): the arithmetic reading assumes ADD d,s,t = s + t,
  ## MADD d,r,s,t = r + s*t, NMSUB d,r,s,t = r - s*t and MUL d,s,t = s*t, all
  ## defined in common.h. Confirm there.
  1008. .L48:
  ## GEMM build: accumulate into the value already stored at the output.
  1009. #ifndef TRMMKERNEL
  1010. ADD c11, c14, c11
  1011. ADD c12, c13, c12
  1012. LD a1, 0 * SIZE(CO1)
  1013. LD a2, 1 * SIZE(CO1)
  1014. MADD a1, a1, ALPHA_R, c11
  1015. MADD a2, a2, ALPHA_R, c12
  1016. NMSUB a1, a1, ALPHA_I, c12
  1017. MADD a2, a2, ALPHA_I, c11
  1018. ST a1, 0 * SIZE(CO1)
  1019. ST a2, 1 * SIZE(CO1)
  ## TRMM build: the output is overwritten, nothing is loaded from it. The
  ## pointer and KK updates below are the last thing done before the exit.
  1020. #else
  1021. ADD c11, c14, c11
  1022. ADD c12, c13, c12
  1023. MUL a1, ALPHA_R, c11
  1024. MUL a2, ALPHA_R, c12
  1025. NMSUB a1, a1, ALPHA_I, c12
  1026. MADD a2, a2, ALPHA_I, c11
  1027. ST a1, 0 * SIZE(CO1)
  1028. ST a2, 1 * SIZE(CO1)
  1029. #if ( defined(LEFT) && defined(TRANSA)) || \
  1030. (!defined(LEFT) && !defined(TRANSA))
  1031. dsubu TEMP, K, KK
  1032. #ifdef LEFT
  1033. daddiu TEMP, TEMP, -1
  1034. #else
  1035. daddiu TEMP, TEMP, -1
  1036. #endif
  1037. dsll TEMP, TEMP, ZBASE_SHIFT
  1038. daddu AO, AO, TEMP
  1039. daddu BO, BO, TEMP
  1040. #endif
  1041. #ifdef LEFT
  1042. daddiu KK, KK, 1
  1043. #endif
  1044. daddiu CO1,CO1, 2 * SIZE
  1045. #endif
  ## .L999: common exit. Restores exactly the registers the prologue saved,
  ## from the same stack slots, then returns.
  ## NOTE(review): $f30 and $f31 are modified by the kernel but are neither
  ## saved nor restored; confirm against the FP callee-saved set of the target
  ## ABI.
  1046. .align 5
  1047. .L999:
  1048. LDARG $16, 0($sp)
  1049. LDARG $17, 8($sp)
  1050. ldc1 $f24, 16($sp)
  1051. ldc1 $f25, 24($sp)
  1052. ldc1 $f26, 32($sp)
  1053. ldc1 $f27, 40($sp)
  1054. ldc1 $f28, 48($sp)
  1055. ldc1 $f29, 56($sp)
  1056. #if defined(TRMMKERNEL)
  1057. LDARG $18, 64($sp)
  1058. LDARG $19, 72($sp)
  1059. LDARG $20, 80($sp)
  1060. #endif
  1061. #ifndef __64BIT__
  1062. ldc1 $f20, 88($sp)
  1063. ldc1 $f21, 96($sp)
  1064. ldc1 $f22,104($sp)
  1065. ldc1 $f23,112($sp)
  1066. #endif
  1067. j $31
  1068. daddiu $sp, $sp, STACKSIZE # jump delay slot: the stack frame is released as part of the return
  1069. EPILOGUE