You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trsm_kernel_LT_loongson3a.S 31 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783
  /*
   * Register aliases for this kernel.
   *
   * PROLOGUE, SDARG/LDARG, LD/ST, MADD/NMSUB/MUL/SUB, MOV/MTC, SIZE and
   * BASE_SHIFT are used below but not defined in this file; presumably they
   * come from "common.h" -- TODO confirm.
   */
  1. #define REALNAME ASMNAME
  2. #define ASSEMBLER
  3. #include "common.h"
  /*
   * Problem dimensions and matrix pointers.
   * M is split into blocks of 4/2/1 rows and N into blocks of 4/2 columns by
   * the loops below; K is only used to step past the unused tail of a panel
   * (K - KK).  A and B are the bases of the packed panels, C is the output
   * matrix and LDC its column stride (scaled by BASE_SHIFT on entry).
   * NOTE(review): $4-$11 look like incoming argument registers -- confirm
   * against the calling convention in use.
   */
  4. #define M $4
  5. #define N $5
  6. #define K $6
  7. #define A $8
  8. #define B $9
  9. #define C $10
  10. #define LDC $11
  /* Working pointers: AO walks the current A panel, BO the current B panel. */
  11. #define AO $12
  12. #define BO $13
  /* Loop counters: I over row blocks, J over column blocks, L over k steps. */
  13. #define I $2
  14. #define J $3
  15. #define L $7
  /*
   * CO1..CO4 point at up to four consecutive columns of C for the current
   * tile (CO2 = CO1 + LDC, and so on).
   */
  16. #define CO1 $14
  17. #define CO2 $15
  18. #define CO3 $16
  19. #define CO4 $17
  /*
   * OFFSET is loaded from the stack by the prologue; KK is reset to OFFSET
   * for each column block and grows by the row-block height after each tile
   * (length of the rectangular part).  TEMP is scratch for address
   * arithmetic.  AORIG is not referenced in the portion of the file
   * reviewed here.
   * $16, $17 and $22-$25 are stored to the stack frame by the prologue.
   * NOTE(review): the prologue stores $25 at 112($sp), the same offset it
   * uses for $f20 when __64BIT__ is undefined -- confirm the slots do not
   * collide.
   */
  20. #define OFFSET $22
  21. #define KK $23
  22. #define TEMP $24
  23. #define AORIG $25
  /* a1..a8: floating-point operands loaded from the A panel through AO. */
  24. #define a1 $f0
  25. #define a2 $f1
  26. #define a3 $f2
  27. #define a4 $f3
  28. #define a5 $f4
  29. #define a6 $f5
  30. #define a7 $f6
  31. #define a8 $f7
  /*
   * b1..b8: floating-point operands loaded mostly through B/BO; some of the
   * triangular-solve sections also reuse them for values read through AO.
   */
  32. #define b1 $f8
  33. #define b2 $f9
  34. #define b3 $f10
  35. #define b4 $f11
  36. #define b5 $f12
  37. #define b6 $f13
  38. #define b7 $f14
  39. #define b8 $f15
  /*
   * t<r><c>: accumulator for row r, column c of the current tile.  On
   * write-back t<r>1 goes to element r-1 of CO1, t<r>2 to CO2, and so on.
   * NOTE(review): the visible prologue saves $f24-$f28 (and $f20-$f23 only
   * when __64BIT__ is undefined) but not $f29-$f31, which t24/t34/t44
   * overwrite -- confirm these need no preservation.
   */
  40. #define t11 $f16
  41. #define t21 $f17
  42. #define t31 $f18
  43. #define t41 $f19
  44. #define t12 $f20
  45. #define t22 $f21
  46. #define t32 $f22
  47. #define t42 $f23
  48. #define t13 $f24
  49. #define t23 $f25
  50. #define t33 $f26
  51. #define t43 $f27
  52. #define t14 $f28
  53. #define t24 $f29
  54. #define t34 $f30
  55. #define t44 $f31
  /*
   * ALPHA shares $f15 with b8; it is not referenced in the portion of the
   * file reviewed here.
   */
  56. #define ALPHA $f15
  57. PROLOGUE
  58. daddiu $sp, $sp, -144
  59. SDARG $16, 0($sp)
  60. SDARG $17, 8($sp)
  61. SDARG $18, 16($sp)
  62. SDARG $19, 24($sp)
  63. SDARG $20, 32($sp)
  64. SDARG $21, 40($sp)
  65. sdc1 $f24, 48($sp)
  66. sdc1 $f25, 56($sp)
  67. sdc1 $f26, 64($sp)
  68. sdc1 $f27, 72($sp)
  69. sdc1 $f28, 80($sp)
  70. SDARG $22, 88($sp)
  71. SDARG $23, 96($sp)
  72. SDARG $24, 104($sp)
  73. SDARG $25, 112($sp)
  74. #ifndef __64BIT__
  75. sdc1 $f20,112($sp)
  76. sdc1 $f21,120($sp)
  77. sdc1 $f22,128($sp)
  78. sdc1 $f23,136($sp)
  79. #endif
  80. # LT compute from left to right, top to bottom
  81. LDARG OFFSET, 144($sp)
  82. dsll LDC, LDC, BASE_SHIFT # ldc
  83. dsra J, N, 2 # j = nc/4
  84. blez J, .L30
  85. nop
  86. .L10: # nr=4
  87. daddiu J, J, -1
  88. move CO1, C
  89. daddu CO2, C, LDC
  90. daddu CO3, CO2, LDC
  91. daddu CO4, CO3, LDC
  92. MTC $0, t11 # clear result registers
  93. MOV t21, t11
  94. MOV t31, t11
  95. MOV t41, t11
  96. MOV t12, t11
  97. MOV t22, t11
  98. MOV t32, t11
  99. MOV t42, t11
  100. dsra I, M, 2 # i = mc/4
  101. move KK, OFFSET # kk is the length of the rectangular data part of panel Ai
  102. move AO, A # reset A
  103. daddu C, CO4, LDC # fixed pointer C, the write back address
  104. blez I, .L20
  105. nop
  106. .L11: # mr=4
  107. LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai
  108. LD a2, 1 * SIZE(AO) # mr*KK with nr*KK
  109. LD a3, 2 * SIZE(AO)
  110. LD a4, 3 * SIZE(AO) # get 4a
  111. LD b1, 0 * SIZE(B) # get 4b
  112. LD b2, 1 * SIZE(B)
  113. LD b3, 2 * SIZE(B)
  114. LD b4, 3 * SIZE(B)
  115. MOV t13, t11 # clear result registers
  116. MOV t23, t11
  117. MOV t33, t11
  118. MOV t43, t11
  119. MOV t14, t11
  120. MOV t24, t11
  121. MOV t34, t11
  122. MOV t44, t11
  123. dsra L, KK, 2 # L = kk/4
  124. blez L, .L15
  125. move BO, B #
  126. .align 3
  127. .L12:
  128. LD a5, 4 * SIZE(AO)
  129. LD a6, 5 * SIZE(AO)
  130. LD a7, 6 * SIZE(AO)
  131. LD a8, 7 * SIZE(AO)
  132. LD b5, 4 * SIZE(BO)
  133. LD b6, 5 * SIZE(BO)
  134. LD b7, 6 * SIZE(BO)
  135. LD b8, 7 * SIZE(BO)
  136. MADD t11, t11, a1, b1 # 1st compute
  137. MADD t21, t21, a2, b1
  138. MADD t31, t31, a3, b1
  139. MADD t41, t41, a4, b1
  140. MADD t12, t12, a1, b2
  141. MADD t22, t22, a2, b2
  142. MADD t32, t32, a3, b2
  143. MADD t42, t42, a4, b2
  144. MADD t13, t13, a1, b3
  145. MADD t23, t23, a2, b3
  146. MADD t33, t33, a3, b3
  147. MADD t43, t43, a4, b3
  148. MADD t14, t14, a1, b4
  149. MADD t24, t24, a2, b4
  150. MADD t34, t34, a3, b4
  151. MADD t44, t44, a4, b4
  152. LD a1, 8 * SIZE(AO)
  153. LD a2, 9 * SIZE(AO)
  154. LD a3, 10 * SIZE(AO)
  155. LD a4, 11 * SIZE(AO)
  156. LD b1, 8 * SIZE(BO)
  157. LD b2, 9 * SIZE(BO)
  158. LD b3, 10 * SIZE(BO)
  159. LD b4, 11 * SIZE(BO)
  160. MADD t11, t11, a5, b5 # 2nd compute
  161. MADD t21, t21, a6, b5
  162. MADD t31, t31, a7, b5
  163. MADD t41, t41, a8, b5
  164. MADD t12, t12, a5, b6
  165. MADD t22, t22, a6, b6
  166. MADD t32, t32, a7, b6
  167. MADD t42, t42, a8, b6
  168. MADD t13, t13, a5, b7
  169. MADD t23, t23, a6, b7
  170. MADD t33, t33, a7, b7
  171. MADD t43, t43, a8, b7
  172. MADD t14, t14, a5, b8
  173. MADD t24, t24, a6, b8
  174. MADD t34, t34, a7, b8
  175. MADD t44, t44, a8, b8
  176. LD a5, 12 * SIZE(AO)
  177. LD a6, 13 * SIZE(AO)
  178. LD a7, 14 * SIZE(AO)
  179. LD a8, 15 * SIZE(AO)
  180. LD b5, 12 * SIZE(BO)
  181. LD b6, 13 * SIZE(BO)
  182. LD b7, 14 * SIZE(BO)
  183. LD b8, 15 * SIZE(BO)
  184. MADD t11, t11, a1, b1 # 3rd compute
  185. MADD t21, t21, a2, b1
  186. MADD t31, t31, a3, b1
  187. MADD t41, t41, a4, b1
  188. MADD t12, t12, a1, b2
  189. MADD t22, t22, a2, b2
  190. MADD t32, t32, a3, b2
  191. MADD t42, t42, a4, b2
  192. MADD t13, t13, a1, b3
  193. MADD t23, t23, a2, b3
  194. MADD t33, t33, a3, b3
  195. MADD t43, t43, a4, b3
  196. MADD t14, t14, a1, b4
  197. MADD t24, t24, a2, b4
  198. MADD t34, t34, a3, b4
  199. MADD t44, t44, a4, b4
  200. daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
  201. daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
  202. LD a1, 0 * SIZE(AO) # next
  203. LD a2, 1 * SIZE(AO)
  204. LD a3, 2 * SIZE(AO)
  205. LD a4, 3 * SIZE(AO)
  206. LD b1, 0 * SIZE(BO)
  207. LD b2, 1 * SIZE(BO)
  208. LD b3, 2 * SIZE(BO)
  209. LD b4, 3 * SIZE(BO)
  210. MADD t11, t11, a5, b5 # 4th compute
  211. MADD t21, t21, a6, b5
  212. MADD t31, t31, a7, b5
  213. MADD t41, t41, a8, b5
  214. MADD t12, t12, a5, b6
  215. MADD t22, t22, a6, b6
  216. MADD t32, t32, a7, b6
  217. MADD t42, t42, a8, b6
  218. MADD t13, t13, a5, b7
  219. MADD t23, t23, a6, b7
  220. MADD t33, t33, a7, b7
  221. MADD t43, t43, a8, b7
  222. MADD t14, t14, a5, b8
  223. MADD t24, t24, a6, b8
  224. MADD t34, t34, a7, b8
  225. MADD t44, t44, a8, b8
  226. daddiu L, L, -1
  227. bgtz L, .L12
  228. nop
  229. .align 3
  230. .L15:
  231. andi L, KK, 3 # the remainder part: KK % 4, i.e. KK - 4*(KK/4)
  232. blez L, .L18
  233. nop
  234. .align 3
  235. .L16:
  236. MADD t11, t11, a1, b1
  237. MADD t21, t21, a2, b1
  238. MADD t31, t31, a3, b1
  239. MADD t41, t41, a4, b1
  240. MADD t12, t12, a1, b2
  241. MADD t22, t22, a2, b2
  242. MADD t32, t32, a3, b2
  243. MADD t42, t42, a4, b2
  244. MADD t13, t13, a1, b3
  245. MADD t23, t23, a2, b3
  246. MADD t33, t33, a3, b3
  247. MADD t43, t43, a4, b3
  248. MADD t14, t14, a1, b4
  249. MADD t24, t24, a2, b4
  250. MADD t34, t34, a3, b4
  251. MADD t44, t44, a4, b4
  252. daddiu AO, AO, 4 * SIZE # AO += 4mr
  253. daddiu BO, BO, 4 * SIZE # BO += 4nr
  254. LD a1, 0 * SIZE(AO) # next
  255. LD a2, 1 * SIZE(AO)
  256. LD a3, 2 * SIZE(AO)
  257. LD a4, 3 * SIZE(AO)
  258. LD b1, 0 * SIZE(BO)
  259. LD b2, 1 * SIZE(BO)
  260. LD b3, 2 * SIZE(BO)
  261. LD b4, 3 * SIZE(BO)
  262. daddiu L, L, -1
  263. bgtz L, .L16
  264. nop
  265. .L18: # deal with the triangular data part of panel Ai
  266. LD b1, 0 * SIZE(BO) # triangular_part*X + rectangular_part = B
  267. LD b2, 1 * SIZE(BO) # triangular_part*X = B - rectangular_part
  268. LD b3, 2 * SIZE(BO)
  269. LD b4, 3 * SIZE(BO)
  270. SUB t11, b1, t11
  271. SUB t12, b2, t12
  272. SUB t13, b3, t13
  273. SUB t14, b4, t14
  274. LD b5, 4 * SIZE(BO) # sb store in row major
  275. LD b6, 5 * SIZE(BO)
  276. LD b7, 6 * SIZE(BO)
  277. LD b8, 7 * SIZE(BO)
  278. SUB t21, b5, t21
  279. SUB t22, b6, t22
  280. SUB t23, b7, t23
  281. SUB t24, b8, t24
  282. LD b1, 8 * SIZE(BO)
  283. LD b2, 9 * SIZE(BO)
  284. LD b3, 10 * SIZE(BO)
  285. LD b4, 11 * SIZE(BO)
  286. SUB t31, b1, t31
  287. SUB t32, b2, t32
  288. SUB t33, b3, t33
  289. SUB t34, b4, t34
  290. LD b5, 12 * SIZE(BO)
  291. LD b6, 13 * SIZE(BO)
  292. LD b7, 14 * SIZE(BO)
  293. LD b8, 15 * SIZE(BO)
  294. SUB t41, b5, t41
  295. SUB t42, b6, t42
  296. SUB t43, b7, t43
  297. SUB t44, b8, t44
  298. LD a1, 0 * SIZE(AO) # sa stores in col major
  299. LD a2, 1 * SIZE(AO)
  300. LD a3, 2 * SIZE(AO)
  301. LD a4, 3 * SIZE(AO)
  302. MUL t11, a1, t11
  303. MUL t12, a1, t12
  304. MUL t13, a1, t13
  305. MUL t14, a1, t14
  306. NMSUB t21, t21, a2, t11
  307. NMSUB t22, t22, a2, t12
  308. NMSUB t23, t23, a2, t13
  309. NMSUB t24, t24, a2, t14
  310. NMSUB t31, t31, a3, t11
  311. NMSUB t32, t32, a3, t12
  312. NMSUB t33, t33, a3, t13
  313. NMSUB t34, t34, a3, t14
  314. NMSUB t41, t41, a4, t11
  315. NMSUB t42, t42, a4, t12
  316. NMSUB t43, t43, a4, t13
  317. NMSUB t44, t44, a4, t14
  318. LD a5, 5 * SIZE(AO)
  319. LD a6, 6 * SIZE(AO)
  320. LD a7, 7 * SIZE(AO)
  321. MUL t21, a5, t21
  322. MUL t22, a5, t22
  323. MUL t23, a5, t23
  324. MUL t24, a5, t24
  325. NMSUB t31, t31, a6, t21
  326. NMSUB t32, t32, a6, t22
  327. NMSUB t33, t33, a6, t23
  328. NMSUB t34, t34, a6, t24
  329. NMSUB t41, t41, a7, t21
  330. NMSUB t42, t42, a7, t22
  331. NMSUB t43, t43, a7, t23
  332. NMSUB t44, t44, a7, t24
  333. LD a8, 10 * SIZE(AO)
  334. LD a1, 11 * SIZE(AO)
  335. MUL t31, a8, t31
  336. MUL t32, a8, t32
  337. MUL t33, a8, t33
  338. MUL t34, a8, t34
  339. NMSUB t41, t41, a1, t31
  340. NMSUB t42, t42, a1, t32
  341. NMSUB t43, t43, a1, t33
  342. NMSUB t44, t44, a1, t34
  343. LD a2, 15 * SIZE(AO)
  344. MUL t41, a2, t41
  345. MUL t42, a2, t42
  346. MUL t43, a2, t43
  347. MUL t44, a2, t44
  348. ST t11, 0 * SIZE(BO) # update packed B
  349. ST t12, 1 * SIZE(BO)
  350. ST t13, 2 * SIZE(BO)
  351. ST t14, 3 * SIZE(BO)
  352. ST t21, 4 * SIZE(BO)
  353. ST t22, 5 * SIZE(BO)
  354. ST t23, 6 * SIZE(BO)
  355. ST t24, 7 * SIZE(BO)
  356. ST t31, 8 * SIZE(BO)
  357. ST t32, 9 * SIZE(BO)
  358. ST t33, 10 * SIZE(BO)
  359. ST t34, 11 * SIZE(BO)
  360. ST t41, 12 * SIZE(BO)
  361. ST t42, 13 * SIZE(BO)
  362. ST t43, 14 * SIZE(BO)
  363. ST t44, 15 * SIZE(BO)
  364. ST t11, 0 * SIZE(CO1) # write back
  365. ST t21, 1 * SIZE(CO1)
  366. ST t31, 2 * SIZE(CO1)
  367. ST t41, 3 * SIZE(CO1)
  368. ST t12, 0 * SIZE(CO2)
  369. ST t22, 1 * SIZE(CO2)
  370. ST t32, 2 * SIZE(CO2)
  371. ST t42, 3 * SIZE(CO2)
  372. ST t13, 0 * SIZE(CO3)
  373. ST t23, 1 * SIZE(CO3)
  374. ST t33, 2 * SIZE(CO3)
  375. ST t43, 3 * SIZE(CO3)
  376. ST t14, 0 * SIZE(CO4)
  377. ST t24, 1 * SIZE(CO4)
  378. ST t34, 2 * SIZE(CO4)
  379. ST t44, 3 * SIZE(CO4)
  380. daddiu CO1, CO1, 4 * SIZE # fixed pointers
  381. daddiu CO2, CO2, 4 * SIZE
  382. daddiu CO3, CO3, 4 * SIZE
  383. daddiu CO4, CO4, 4 * SIZE
  384. dsubu TEMP, K, KK
  385. dsll L, TEMP, 2 + BASE_SHIFT
  386. dsll TEMP, TEMP, 2 + BASE_SHIFT
  387. daddu AO, AO, L # mov AO to the end of panel Ai
  388. daddu BO, BO, TEMP # mov BO to the end of panel Bj
  389. daddiu KK, KK, 4 # the length of rectangular data part increases by 4
  390. daddiu I, I, -1
  391. MTC $0, a1
  392. MOV t11, a1
  393. MOV t21, a1
  394. MOV t31, a1
  395. MOV t41, a1
  396. MOV t12, a1
  397. MOV t22, a1
  398. MOV t32, a1
  399. MOV t42, a1
  400. bgtz I, .L11
  401. nop
  402. .align 3
  403. .L20:
  404. andi I, M, 2 # mr=2,nr=4
  405. blez I, .L50
  406. nop
  407. MOV t13, t11
  408. MOV t23, t11
  409. MOV t33, t11
  410. MOV t43, t11
  411. MOV t14, t11
  412. MOV t24, t11
  413. MOV t34, t11
  414. MOV t44, t11
  415. LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai
  416. LD a2, 1 * SIZE(AO) # mr*KK with nr*KK
  417. LD b1, 0 * SIZE(B) # get 4b
  418. LD b2, 1 * SIZE(B)
  419. LD b3, 2 * SIZE(B)
  420. LD b4, 3 * SIZE(B)
  421. dsra L, KK, 2
  422. blez L, .L25
  423. move BO, B
  424. .align 3
  425. .L22:
  426. LD a5, 2 * SIZE(AO)
  427. LD a6, 3 * SIZE(AO)
  428. LD b5, 4 * SIZE(BO)
  429. LD b6, 5 * SIZE(BO)
  430. LD b7, 6 * SIZE(BO)
  431. LD b8, 7 * SIZE(BO)
  432. MADD t11, t11, a1, b1 # 1st compute
  433. MADD t21, t21, a2, b1
  434. MADD t12, t12, a1, b2
  435. MADD t22, t22, a2, b2
  436. MADD t13, t13, a1, b3
  437. MADD t23, t23, a2, b3
  438. MADD t14, t14, a1, b4
  439. MADD t24, t24, a2, b4
  440. LD a3, 4 * SIZE(AO)
  441. LD a4, 5 * SIZE(AO)
  442. LD b1, 8 * SIZE(BO)
  443. LD b2, 9 * SIZE(BO)
  444. LD b3, 10 * SIZE(BO)
  445. LD b4, 11 * SIZE(BO)
  446. MADD t11, t11, a5, b5 # 2nd compute
  447. MADD t21, t21, a6, b5
  448. MADD t12, t12, a5, b6
  449. MADD t22, t22, a6, b6
  450. MADD t13, t13, a5, b7
  451. MADD t23, t23, a6, b7
  452. MADD t14, t14, a5, b8
  453. MADD t24, t24, a6, b8
  454. LD a7, 6 * SIZE(AO)
  455. LD a8, 7 * SIZE(AO)
  456. LD b5, 12 * SIZE(BO)
  457. LD b6, 13 * SIZE(BO)
  458. LD b7, 14 * SIZE(BO)
  459. LD b8, 15 * SIZE(BO)
  460. MADD t11, t11, a3, b1 # 3rd compute
  461. MADD t21, t21, a4, b1
  462. MADD t12, t12, a3, b2
  463. MADD t22, t22, a4, b2
  464. MADD t13, t13, a3, b3
  465. MADD t23, t23, a4, b3
  466. MADD t14, t14, a3, b4
  467. MADD t24, t24, a4, b4
  468. daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
  469. daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
  470. LD a1, 0 * SIZE(AO) # next
  471. LD a2, 1 * SIZE(AO)
  472. LD b1, 0 * SIZE(BO)
  473. LD b2, 1 * SIZE(BO)
  474. LD b3, 2 * SIZE(BO)
  475. LD b4, 3 * SIZE(BO)
  476. MADD t11, t11, a7, b5 # 4th compute
  477. MADD t21, t21, a8, b5
  478. MADD t12, t12, a7, b6
  479. MADD t22, t22, a8, b6
  480. MADD t13, t13, a7, b7
  481. MADD t23, t23, a8, b7
  482. MADD t14, t14, a7, b8
  483. MADD t24, t24, a8, b8
  484. daddiu L, L, -1
  485. bgtz L, .L22
  486. nop
  487. .align 3
  488. .L25:
  489. andi L, KK, 3
  490. blez L, .L28
  491. nop
  492. .align 3
  493. .L26:
  494. MADD t11, t11, a1, b1 # remainder compute (one k step per iteration)
  495. MADD t21, t21, a2, b1
  496. MADD t12, t12, a1, b2
  497. MADD t22, t22, a2, b2
  498. MADD t13, t13, a1, b3
  499. MADD t23, t23, a2, b3
  500. MADD t14, t14, a1, b4
  501. MADD t24, t24, a2, b4
  502. daddiu AO, AO, 2 * SIZE # AO += 2mr
  503. daddiu BO, BO, 4 * SIZE # BO += 4nr
  504. LD a1, 0 * SIZE(AO) # next
  505. LD a2, 1 * SIZE(AO)
  506. LD b1, 0 * SIZE(BO)
  507. LD b2, 1 * SIZE(BO)
  508. LD b3, 2 * SIZE(BO)
  509. LD b4, 3 * SIZE(BO)
  510. daddiu L, L, -1
  511. bgtz L, .L26
  512. nop
  513. .L28: # deal with the triangular part
  514. LD b1, 0 * SIZE(BO)
  515. LD b2, 1 * SIZE(BO)
  516. LD b3, 2 * SIZE(BO)
  517. LD b4, 3 * SIZE(BO)
  518. LD b5, 4 * SIZE(BO)
  519. LD b6, 5 * SIZE(BO)
  520. LD b7, 6 * SIZE(BO)
  521. LD b8, 7 * SIZE(BO)
  522. SUB t11, b1, t11
  523. SUB t12, b2, t12
  524. SUB t13, b3, t13
  525. SUB t14, b4, t14
  526. SUB t21, b5, t21
  527. SUB t22, b6, t22
  528. SUB t23, b7, t23
  529. SUB t24, b8, t24
  530. LD b1, 0 * SIZE(AO) # computes the triangular_part
  531. LD b2, 1 * SIZE(AO)
  532. MUL t11, b1, t11
  533. MUL t12, b1, t12
  534. MUL t13, b1, t13
  535. MUL t14, b1, t14
  536. NMSUB t21, t21, b2, t11
  537. NMSUB t22, t22, b2, t12
  538. NMSUB t23, t23, b2, t13
  539. NMSUB t24, t24, b2, t14
  540. LD b3, 3 * SIZE(AO)
  541. MUL t21, b3, t21
  542. MUL t22, b3, t22
  543. MUL t23, b3, t23
  544. MUL t24, b3, t24
  545. ST t11, 0 * SIZE(BO)
  546. ST t12, 1 * SIZE(BO)
  547. ST t13, 2 * SIZE(BO)
  548. ST t14, 3 * SIZE(BO)
  549. ST t21, 4 * SIZE(BO)
  550. ST t22, 5 * SIZE(BO)
  551. ST t23, 6 * SIZE(BO)
  552. ST t24, 7 * SIZE(BO)
  553. ST t11, 0 * SIZE(CO1)
  554. ST t21, 1 * SIZE(CO1)
  555. ST t12, 0 * SIZE(CO2)
  556. ST t22, 1 * SIZE(CO2)
  557. ST t13, 0 * SIZE(CO3)
  558. ST t23, 1 * SIZE(CO3)
  559. ST t14, 0 * SIZE(CO4)
  560. ST t24, 1 * SIZE(CO4)
  561. daddiu CO1, CO1, 2 * SIZE
  562. daddiu CO2, CO2, 2 * SIZE
  563. daddiu CO3, CO3, 2 * SIZE
  564. daddiu CO4, CO4, 2 * SIZE
  565. dsubu TEMP, K, KK
  566. dsll L, TEMP, 1 + BASE_SHIFT
  567. dsll TEMP, TEMP, 2 + BASE_SHIFT
  568. daddu AO, AO, L # mov AO to the end of Ai
  569. daddu BO, BO, TEMP # mov BO to the end of Bj
  570. daddiu KK, KK, 2 # the length of rectangular data part increases by 2
  571. MTC $0, a1
  572. MOV t11, a1
  573. MOV t21, a1
  574. MOV t31, a1
  575. MOV t41, a1
  576. MOV t12, a1
  577. MOV t22, a1
  578. MOV t32, a1
  579. MOV t42, a1
  580. .align 3
  581. .L50:
  582. andi I, M, 1 # mr=1,nr=4
  583. blez I, .L29
  584. nop
  585. MOV t13, t11
  586. MOV t23, t11
  587. MOV t33, t11
  588. MOV t43, t11
  589. MOV t14, t11
  590. MOV t24, t11
  591. MOV t34, t11
  592. MOV t44, t11
  593. LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai
  594. LD b1, 0 * SIZE(B) # get 4b
  595. LD b2, 1 * SIZE(B)
  596. LD b3, 2 * SIZE(B)
  597. LD b4, 3 * SIZE(B)
  598. dsra L, KK, 2
  599. blez L, .L55
  600. move BO, B
  601. .align 3
  602. .L52:
  603. LD a5, 1 * SIZE(AO)
  604. LD b5, 4 * SIZE(BO)
  605. LD b6, 5 * SIZE(BO)
  606. LD b7, 6 * SIZE(BO)
  607. LD b8, 7 * SIZE(BO)
  608. MADD t11, t11, a1, b1 # 1st compute
  609. MADD t12, t12, a1, b2
  610. MADD t13, t13, a1, b3
  611. MADD t14, t14, a1, b4
  612. LD a3, 2 * SIZE(AO)
  613. LD b1, 8 * SIZE(BO)
  614. LD b2, 9 * SIZE(BO)
  615. LD b3, 10 * SIZE(BO)
  616. LD b4, 11 * SIZE(BO)
  617. MADD t11, t11, a5, b5 # 2nd compute
  618. MADD t12, t12, a5, b6
  619. MADD t13, t13, a5, b7
  620. MADD t14, t14, a5, b8
  621. LD a7, 3 * SIZE(AO)
  622. LD b5, 12 * SIZE(BO)
  623. LD b6, 13 * SIZE(BO)
  624. LD b7, 14 * SIZE(BO)
  625. LD b8, 15 * SIZE(BO)
  626. MADD t11, t11, a3, b1 # 3rd compute
  627. MADD t12, t12, a3, b2
  628. MADD t13, t13, a3, b3
  629. MADD t14, t14, a3, b4
  630. daddiu AO, AO, 4 * SIZE # AO += mr*4kr
  631. daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
  632. LD a1, 0 * SIZE(AO) # next
  633. LD b1, 0 * SIZE(BO)
  634. LD b2, 1 * SIZE(BO)
  635. LD b3, 2 * SIZE(BO)
  636. LD b4, 3 * SIZE(BO)
  637. MADD t11, t11, a7, b5 # 4th compute
  638. MADD t12, t12, a7, b6
  639. MADD t13, t13, a7, b7
  640. MADD t14, t14, a7, b8
  641. daddiu L, L, -1
  642. bgtz L, .L52
  643. nop
  644. .align 3
  645. .L55:
  646. andi L, KK, 3
  647. blez L, .L58
  648. nop
  649. .align 3
  650. .L56:
  651. MADD t11, t11, a1, b1 # remainder compute (one k step per iteration)
  652. MADD t12, t12, a1, b2
  653. MADD t13, t13, a1, b3
  654. MADD t14, t14, a1, b4
  655. daddiu AO, AO, 1 * SIZE # AO += 1mr
  656. daddiu BO, BO, 4 * SIZE # BO += 4nr
  657. LD a1, 0 * SIZE(AO) # next
  658. LD b1, 0 * SIZE(BO)
  659. LD b2, 1 * SIZE(BO)
  660. LD b3, 2 * SIZE(BO)
  661. LD b4, 3 * SIZE(BO)
  662. daddiu L, L, -1
  663. bgtz L, .L56
  664. nop
  665. .L58: # deal with the triangular part
  666. LD b1, 0 * SIZE(BO)
  667. LD b2, 1 * SIZE(BO)
  668. LD b3, 2 * SIZE(BO)
  669. LD b4, 3 * SIZE(BO)
  670. SUB t11, b1, t11
  671. SUB t12, b2, t12
  672. SUB t13, b3, t13
  673. SUB t14, b4, t14
  674. LD b1, 0 * SIZE(AO) # computes the triangular_part
  675. MUL t11, b1, t11
  676. MUL t12, b1, t12
  677. MUL t13, b1, t13
  678. MUL t14, b1, t14
  679. ST t11, 0 * SIZE(BO)
  680. ST t12, 1 * SIZE(BO)
  681. ST t13, 2 * SIZE(BO)
  682. ST t14, 3 * SIZE(BO)
  683. ST t11, 0 * SIZE(CO1)
  684. ST t12, 0 * SIZE(CO2)
  685. ST t13, 0 * SIZE(CO3)
  686. ST t14, 0 * SIZE(CO4)
  687. daddiu CO1, CO1, 1 * SIZE
  688. daddiu CO2, CO2, 1 * SIZE
  689. daddiu CO3, CO3, 1 * SIZE
  690. daddiu CO4, CO4, 1 * SIZE
  691. dsubu TEMP, K, KK
  692. dsll L, TEMP, BASE_SHIFT # mr=1
  693. dsll TEMP, TEMP, 2 + BASE_SHIFT
  694. daddu AO, AO, L # mov AO to the end of Ai
  695. daddu BO, BO, TEMP # mov BO to the end of Bj
  696. daddiu KK, KK, 1 # the length of rectangular data part increases by 1
  697. .align 3
  698. .L29:
  699. move B, BO # fixed panel Bj
  700. bgtz J, .L10
  701. nop
  702. .align 3
  703. .L30:
  704. andi J, N, 2 # nr=2
  705. blez J, .L70
  706. nop
  707. move CO1, C
  708. daddu CO2, C, LDC
  709. MTC $0, t11 # clear result registers
  710. MOV t21, t11
  711. MOV t31, t11
  712. MOV t41, t11
  713. move KK, OFFSET
  714. move AO, A # reset A
  715. daddu C, CO2, LDC # fixed
  716. dsra I, M, 2 # I = mc/4
  717. blez I, .L40
  718. nop
  719. .L31:
  720. MOV t12, t11
  721. MOV t22, t11
  722. MOV t32, t11
  723. MOV t42, t11
  724. LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai
  725. LD a2, 1 * SIZE(AO) # mr*KK with nr*KK
  726. LD a3, 2 * SIZE(AO)
  727. LD a4, 3 * SIZE(AO) # get 4a
  728. LD b1, 0 * SIZE(B) # get 4b
  729. LD b2, 1 * SIZE(B)
  730. dsra L, KK, 2 # L=kk/4
  731. blez L, .L35
  732. move BO, B # reset B
  733. .align 3
  734. .L32:
  735. LD a5, 4 * SIZE(AO)
  736. LD a6, 5 * SIZE(AO)
  737. LD a7, 6 * SIZE(AO)
  738. LD a8, 7 * SIZE(AO)
  739. LD b5, 2 * SIZE(BO)
  740. LD b6, 3 * SIZE(BO)
  741. MADD t11, t11, a1, b1 # 1st compute
  742. MADD t21, t21, a2, b1
  743. MADD t31, t31, a3, b1
  744. MADD t41, t41, a4, b1
  745. MADD t12, t12, a1, b2
  746. MADD t22, t22, a2, b2
  747. MADD t32, t32, a3, b2
  748. MADD t42, t42, a4, b2
  749. LD a1, 8 * SIZE(AO)
  750. LD a2, 9 * SIZE(AO)
  751. LD a3, 10 * SIZE(AO)
  752. LD a4, 11 * SIZE(AO)
  753. LD b3, 4 * SIZE(BO)
  754. LD b4, 5 * SIZE(BO)
  755. MADD t11, t11, a5, b5 # 2nd compute
  756. MADD t21, t21, a6, b5
  757. MADD t31, t31, a7, b5
  758. MADD t41, t41, a8, b5
  759. MADD t12, t12, a5, b6
  760. MADD t22, t22, a6, b6
  761. MADD t32, t32, a7, b6
  762. MADD t42, t42, a8, b6
  763. LD a5, 12 * SIZE(AO)
  764. LD a6, 13 * SIZE(AO)
  765. LD a7, 14 * SIZE(AO)
  766. LD a8, 15 * SIZE(AO)
  767. LD b7, 6 * SIZE(BO)
  768. LD b8, 7 * SIZE(BO)
  769. MADD t11, t11, a1, b3 # 3rd compute
  770. MADD t21, t21, a2, b3
  771. MADD t31, t31, a3, b3
  772. MADD t41, t41, a4, b3
  773. MADD t12, t12, a1, b4
  774. MADD t22, t22, a2, b4
  775. MADD t32, t32, a3, b4
  776. MADD t42, t42, a4, b4
  777. daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
  778. daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
  779. LD a1, 0 * SIZE(AO) # next
  780. LD a2, 1 * SIZE(AO)
  781. LD a3, 2 * SIZE(AO)
  782. LD a4, 3 * SIZE(AO)
  783. LD b1, 0 * SIZE(BO)
  784. LD b2, 1 * SIZE(BO)
  785. MADD t11, t11, a5, b7 # 4th compute
  786. MADD t21, t21, a6, b7
  787. MADD t31, t31, a7, b7
  788. MADD t41, t41, a8, b7
  789. MADD t12, t12, a5, b8
  790. MADD t22, t22, a6, b8
  791. MADD t32, t32, a7, b8
  792. MADD t42, t42, a8, b8
  793. daddiu L, L, -1
  794. bgtz L, .L32
  795. nop
  796. .align 3
  797. .L35:
  798. andi L, KK, 3
  799. blez L, .L38
  800. nop
  801. .align 3
  802. .L36:
  803. MADD t11, t11, a1, b1 # remainder compute (one k step per iteration)
  804. MADD t21, t21, a2, b1
  805. MADD t31, t31, a3, b1
  806. MADD t41, t41, a4, b1
  807. MADD t12, t12, a1, b2
  808. MADD t22, t22, a2, b2
  809. MADD t32, t32, a3, b2
  810. MADD t42, t42, a4, b2
  811. daddiu AO, AO, 4 * SIZE # AO += 4mr
  812. daddiu BO, BO, 2 * SIZE # BO += 2nr
  813. LD a1, 0 * SIZE(AO) # next
  814. LD a2, 1 * SIZE(AO)
  815. LD a3, 2 * SIZE(AO)
  816. LD a4, 3 * SIZE(AO)
  817. LD b1, 0 * SIZE(BO)
  818. LD b2, 1 * SIZE(BO)
  819. daddiu L, L, -1
  820. bgtz L, .L36
  821. nop
  822. .L38: #
  823. LD b1, 0 * SIZE(BO)
  824. LD b2, 1 * SIZE(BO)
  825. LD b3, 2 * SIZE(BO)
  826. LD b4, 3 * SIZE(BO)
  827. LD b5, 4 * SIZE(BO)
  828. LD b6, 5 * SIZE(BO)
  829. LD b7, 6 * SIZE(BO)
  830. LD b8, 7 * SIZE(BO)
  831. SUB t11, b1, t11
  832. SUB t12, b2, t12
  833. SUB t21, b3, t21
  834. SUB t22, b4, t22
  835. SUB t31, b5, t31
  836. SUB t32, b6, t32
  837. SUB t41, b7, t41
  838. SUB t42, b8, t42
  839. LD a1, 0 * SIZE(AO) # sa stores in col major
  840. LD a2, 1 * SIZE(AO)
  841. LD a3, 2 * SIZE(AO)
  842. LD a4, 3 * SIZE(AO)
  843. MUL t11, a1, t11
  844. MUL t12, a1, t12
  845. NMSUB t21, t21, a2, t11
  846. NMSUB t22, t22, a2, t12
  847. NMSUB t31, t31, a3, t11
  848. NMSUB t32, t32, a3, t12
  849. NMSUB t41, t41, a4, t11
  850. NMSUB t42, t42, a4, t12
  851. LD a5, 5 * SIZE(AO)
  852. LD a6, 6 * SIZE(AO)
  853. LD a7, 7 * SIZE(AO)
  854. MUL t21, a5, t21
  855. MUL t22, a5, t22
  856. NMSUB t31, t31, a6, t21
  857. NMSUB t32, t32, a6, t22
  858. NMSUB t41, t41, a7, t21
  859. NMSUB t42, t42, a7, t22
  860. LD a8, 10 * SIZE(AO)
  861. LD a1, 11 * SIZE(AO)
  862. MUL t31, a8, t31
  863. MUL t32, a8, t32
  864. NMSUB t41, t41, a1, t31
  865. NMSUB t42, t42, a1, t32
  866. LD a2, 15 * SIZE(AO)
  867. MUL t41, a2, t41
  868. MUL t42, a2, t42
  869. ST t11, 0 * SIZE(BO)
  870. ST t12, 1 * SIZE(BO)
  871. ST t21, 2 * SIZE(BO)
  872. ST t22, 3 * SIZE(BO)
  873. ST t31, 4 * SIZE(BO)
  874. ST t32, 5 * SIZE(BO)
  875. ST t41, 6 * SIZE(BO)
  876. ST t42, 7 * SIZE(BO)
  877. ST t11, 0 * SIZE(CO1)
  878. ST t21, 1 * SIZE(CO1)
  879. ST t31, 2 * SIZE(CO1)
  880. ST t41, 3 * SIZE(CO1)
  881. ST t12, 0 * SIZE(CO2)
  882. ST t22, 1 * SIZE(CO2)
  883. ST t32, 2 * SIZE(CO2)
  884. ST t42, 3 * SIZE(CO2)
  885. daddiu CO1, CO1, 4 * SIZE
  886. daddiu CO2, CO2, 4 * SIZE
  887. dsubu TEMP, K, KK
  888. dsll L, TEMP, 2 + BASE_SHIFT
  889. dsll TEMP, TEMP, 1 + BASE_SHIFT
  890. daddu AO, AO, L # move AO to the end of Ai
  891. daddu BO, BO, TEMP
  892. daddiu KK, KK, 4 #
  893. MTC $0, a1
  894. MOV t11, a1
  895. MOV t21, a1
  896. MOV t31, a1
  897. MOV t41, a1
  898. daddiu I, I, -1
  899. bgtz I, .L31
  900. nop
  901. .align 3
  902. .L40:
  903. andi I, M, 2
  904. blez I, .L60
  905. nop
  906. MOV t12, t11 # clear result registers
  907. MOV t22, t21
  908. MOV t32, t31
  909. MOV t42, t41
  910. LD a1, 0 * SIZE(AO)
  911. LD a2, 1 * SIZE(AO)
  912. LD b1, 0 * SIZE(B)
  913. LD b2, 1 * SIZE(B)
  914. dsra L, KK, 2
  915. blez L, .L45
  916. move BO, B # reset B
  917. .align 3
  918. .L42:
  919. LD a5, 2 * SIZE(AO)
  920. LD a6, 3 * SIZE(AO)
  921. LD b5, 2 * SIZE(BO)
  922. LD b6, 3 * SIZE(BO)
  923. MADD t11, t11, a1, b1 # 1st compute
  924. MADD t21, t21, a2, b1
  925. MADD t12, t12, a1, b2
  926. MADD t22, t22, a2, b2
  927. LD a3, 4 * SIZE(AO)
  928. LD a4, 5 * SIZE(AO)
  929. LD b3, 4 * SIZE(BO)
  930. LD b4, 5 * SIZE(BO)
  931. MADD t11, t11, a5, b5 # 2nd compute
  932. MADD t21, t21, a6, b5
  933. MADD t12, t12, a5, b6
  934. MADD t22, t22, a6, b6
  935. LD a7, 6 * SIZE(AO)
  936. LD a8, 7 * SIZE(AO)
  937. LD b7, 6 * SIZE(BO)
  938. LD b8, 7 * SIZE(BO)
  939. MADD t11, t11, a3, b3 # 3rd compute
  940. MADD t21, t21, a4, b3
  941. MADD t12, t12, a3, b4
  942. MADD t22, t22, a4, b4
  943. daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
  944. daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
  945. LD a1, 0 * SIZE(AO) # next
  946. LD a2, 1 * SIZE(AO)
  947. LD b1, 0 * SIZE(BO)
  948. LD b2, 1 * SIZE(BO)
  949. MADD t11, t11, a7, b7 # 4th compute
  950. MADD t21, t21, a8, b7
  951. MADD t12, t12, a7, b8
  952. MADD t22, t22, a8, b8
  953. daddiu L, L, -1
  954. bgtz L, .L42
  955. nop
  956. .align 3
  957. .L45:
  958. andi L, KK, 3
  959. blez L, .L48
  960. nop
  961. .align 3
  962. .L46:
  963. MADD t11, t11, a1, b1 # 3rd compute
  964. MADD t21, t21, a2, b1
  965. MADD t12, t12, a1, b2
  966. MADD t22, t22, a2, b2
  967. daddiu AO, AO, 2 * SIZE # AO += 2mr
  968. daddiu BO, BO, 2 * SIZE # BO += 2nr
  969. LD a1, 0 * SIZE(AO) # next
  970. LD a2, 1 * SIZE(AO)
  971. LD b1, 0 * SIZE(BO)
  972. LD b2, 1 * SIZE(BO)
  973. daddiu L, L, -1
  974. bgtz L, .L46
  975. nop
  976. .L48:
  977. LD b1, 0 * SIZE(BO)
  978. LD b2, 1 * SIZE(BO)
  979. LD b3, 2 * SIZE(BO)
  980. LD b4, 3 * SIZE(BO)
  981. SUB t11, b1, t11
  982. SUB t12, b2, t12
  983. SUB t21, b3, t21
  984. SUB t22, b4, t22
  985. LD b1, 0 * SIZE(AO) # computes the triangular_part
  986. LD b2, 1 * SIZE(AO)
  987. MUL t11, b1, t11
  988. MUL t12, b1, t12
  989. NMSUB t21, t21, b2, t11
  990. NMSUB t22, t22, b2, t12
  991. LD b3, 3 * SIZE(AO)
  992. MUL t21, b3, t21
  993. MUL t22, b3, t22
  994. ST t11, 0 * SIZE(BO)
  995. ST t12, 1 * SIZE(BO)
  996. ST t21, 2 * SIZE(BO)
  997. ST t22, 3 * SIZE(BO)
  998. ST t11, 0 * SIZE(CO1)
  999. ST t21, 1 * SIZE(CO1)
  1000. ST t12, 0 * SIZE(CO2)
  1001. ST t22, 1 * SIZE(CO2)
  1002. daddiu CO1, CO1, 2 * SIZE
  1003. daddiu CO2, CO2, 2 * SIZE
  1004. dsubu TEMP, K, KK
  1005. dsll L, TEMP, 1 + BASE_SHIFT
  1006. dsll TEMP, TEMP, 1 + BASE_SHIFT
  1007. daddu AO, AO, L
  1008. daddu BO, BO, TEMP
  1009. daddiu KK, KK, 2
  1010. MTC $0, a1
  1011. MOV t11, a1
  1012. MOV t21, a1
  1013. MOV t31, a1
  1014. MOV t41, a1
  1015. .align 3
  1016. .L60: # mr=1 x nr=2 tile, taken when bit 0 of M is set
  1017. andi I, M, 1 # mr=1
  1018. blez I, .L49 # bit clear: nothing left for this pair of columns
  1019. nop
  1020. MOV t12, t11 # clear result registers by copying t11/t21/t31/t41 (expected to hold zero here)
  1021. MOV t22, t21 # t22/t32/t42 are written but not read again in this tile
  1022. MOV t32, t31
  1023. MOV t42, t41
  1024. LD a1, 0 * SIZE(AO) # preload the first value of A
  1025. LD b1, 0 * SIZE(B) # preload the first 2 values of B
  1026. LD b2, 1 * SIZE(B)
  1027. dsra L, KK, 2 # L = KK / 4 = trip count of the 4x unrolled loop
  1028. blez L, .L65
  1029. move BO, B # reset B; placed right after the branch (presumably its delay slot, so BO is set on both paths)
  1030. .align 3
  1031. .L62: # 4x unrolled k loop: t11 += a * b(1), t12 += a * b(2)
  1032. LD a5, 1 * SIZE(AO)
  1033. LD b5, 2 * SIZE(BO)
  1034. LD b6, 3 * SIZE(BO)
  1035. MADD t11, t11, a1, b1 # 1st compute
  1036. MADD t12, t12, a1, b2
  1037. LD a3, 2 * SIZE(AO)
  1038. LD b3, 4 * SIZE(BO)
  1039. LD b4, 5 * SIZE(BO)
  1040. MADD t11, t11, a5, b5 # 2nd compute
  1041. MADD t12, t12, a5, b6
  1042. LD a7, 3 * SIZE(AO)
  1043. LD b7, 6 * SIZE(BO)
  1044. LD b8, 7 * SIZE(BO)
  1045. MADD t11, t11, a3, b3 # 3rd compute
  1046. MADD t12, t12, a3, b4
  1047. daddiu AO, AO, 4 * SIZE # AO += mr*4kr
  1048. daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
  1049. LD a1, 0 * SIZE(AO) # next: preload operands for the following trip
  1050. LD b1, 0 * SIZE(BO)
  1051. LD b2, 1 * SIZE(BO)
  1052. MADD t11, t11, a7, b7 # 4th compute
  1053. MADD t12, t12, a7, b8
  1054. daddiu L, L, -1
  1055. bgtz L, .L62
  1056. nop
  1057. .align 3
  1058. .L65: # k remainder: L = KK mod 4
  1059. andi L, KK, 3
  1060. blez L, .L68
  1061. nop
  1062. .align 3
  1063. .L66: # remainder loop: one k step per trip
  1064. MADD t11, t11, a1, b1 # single k step (uses the preloaded a1/b1/b2)
  1065. MADD t12, t12, a1, b2
  1066. daddiu AO, AO, 1 * SIZE # AO += 1mr
  1067. daddiu BO, BO, 2 * SIZE # BO += 2nr
  1068. LD a1, 0 * SIZE(AO) # next
  1069. LD b1, 0 * SIZE(BO)
  1070. LD b2, 1 * SIZE(BO)
  1071. daddiu L, L, -1
  1072. bgtz L, .L66
  1073. nop
  1074. .L68: # solve step (assuming SUB/MUL map to the MIPS FPU ops of the same name)
  1075. LD b1, 0 * SIZE(BO) # 2 values at BO, paired below with t11 and t12
  1076. LD b2, 1 * SIZE(BO)
  1077. SUB t11, b1, t11 # t = value from BO - accumulated product
  1078. SUB t12, b2, t12
  1079. LD b1, 0 * SIZE(AO) # computes the triangular part: a single factor at AO[0]
  1080. MUL t11, b1, t11 # scaled by AO[0] (a multiply, so the packed value is presumably a reciprocal - confirm in the packing code)
  1081. MUL t12, b1, t12
  1082. ST t11, 0 * SIZE(BO) # write the solved values back over the 2 values read from BO
  1083. ST t12, 1 * SIZE(BO)
  1084. ST t11, 0 * SIZE(CO1) # and to the output: t11 to CO1, t12 to CO2
  1085. ST t12, 0 * SIZE(CO2)
  1086. daddiu CO1, CO1, 1 * SIZE # advance both output pointers by mr=1 element
  1087. daddiu CO2, CO2, 1 * SIZE
  1088. dsubu TEMP, K, KK # TEMP = K - KK, the k steps not consumed by the loops above
  1089. dsll L, TEMP, BASE_SHIFT # mr=1: L = TEMP elements, shifted by BASE_SHIFT (presumably log2 of the element size)
  1090. dsll TEMP, TEMP, 1 + BASE_SHIFT # TEMP = TEMP * 2nr, same scaling
  1091. daddu AO, AO, L # AO += (K - KK) * 1mr
  1092. daddu BO, BO, TEMP # BO += (K - KK) * 2nr
  1093. daddiu KK, KK, 1 # KK += mr
  1094. .align 3
  1095. .L49: # end of the nr=2 column pair
  1096. move B, BO # B takes the advanced BO, so the nr=1 code at .L70 reads from there
  1097. .align 3
  1098. .L70: # nr=1 column, taken when bit 0 of N is set
  1099. andi J, N, 1 # nr=1
  1100. blez J, .L999 # END
  1101. nop
  1102. move CO1, C # output pointer starts at C
  1103. move KK, OFFSET # KK restarts from OFFSET for this column
  1104. move AO, A # A pointer restarts at A
  1105. dsra I, M, 2 # I = M / 4 = number of mr=4 tiles
  1106. blez I, .L80
  1107. nop
  1108. .L71: # mr=4 x nr=1 tile, repeated I times
  1109. MTC $0, t11 # clear result registers: t11 = 0 moved from $0, then copied to t21/t31/t41
  1110. MOV t21, t11
  1111. MOV t31, t11
  1112. MOV t41, t11
  1113. LD a1, 0 * SIZE(AO) # this part computes the rectangular data part of Ai
  1114. LD a2, 1 * SIZE(AO) # mr*KK with nr*KK
  1115. LD a3, 2 * SIZE(AO)
  1116. LD a4, 3 * SIZE(AO) # get 4a
  1117. LD b1, 0 * SIZE(B) # get the first b (nr=1, one value per k step)
  1118. dsra L, KK, 2 # L = KK / 4 = trip count of the 4x unrolled loop
  1119. blez L, .L75
  1120. move BO, B # reset B; placed right after the branch (presumably its delay slot, so BO is set on both paths)
  1121. .align 3
  1122. .L72: # 4x unrolled k loop: t(i)1 += a(i) * b, loads interleaved with the multiply-adds
  1123. LD a5, 4 * SIZE(AO)
  1124. LD a6, 5 * SIZE(AO)
  1125. LD a7, 6 * SIZE(AO)
  1126. LD a8, 7 * SIZE(AO)
  1127. LD b5, 1 * SIZE(BO)
  1128. MADD t11, t11, a1, b1 # 1st compute
  1129. MADD t21, t21, a2, b1
  1130. MADD t31, t31, a3, b1
  1131. MADD t41, t41, a4, b1
  1132. LD a1, 8 * SIZE(AO) # a1..a4 are reused for the 3rd k step
  1133. LD a2, 9 * SIZE(AO)
  1134. LD a3, 10 * SIZE(AO)
  1135. LD a4, 11 * SIZE(AO)
  1136. LD b3, 2 * SIZE(BO)
  1137. MADD t11, t11, a5, b5 # 2nd compute
  1138. MADD t21, t21, a6, b5
  1139. MADD t31, t31, a7, b5
  1140. MADD t41, t41, a8, b5
  1141. LD a5, 12 * SIZE(AO) # a5..a8 are reused for the 4th k step
  1142. LD a6, 13 * SIZE(AO)
  1143. LD a7, 14 * SIZE(AO)
  1144. LD a8, 15 * SIZE(AO)
  1145. LD b7, 3 * SIZE(BO)
  1146. MADD t11, t11, a1, b3 # 3rd compute
  1147. MADD t21, t21, a2, b3
  1148. MADD t31, t31, a3, b3
  1149. MADD t41, t41, a4, b3
  1150. daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
  1151. daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
  1152. LD a1, 0 * SIZE(AO) # next: preload operands for the following trip
  1153. LD a2, 1 * SIZE(AO)
  1154. LD a3, 2 * SIZE(AO)
  1155. LD a4, 3 * SIZE(AO)
  1156. LD b1, 0 * SIZE(BO)
  1157. MADD t11, t11, a5, b7 # 4th compute
  1158. MADD t21, t21, a6, b7
  1159. MADD t31, t31, a7, b7
  1160. MADD t41, t41, a8, b7
  1161. daddiu L, L, -1
  1162. bgtz L, .L72
  1163. nop
  1164. .align 3
  1165. .L75: # k remainder: L = KK mod 4
  1166. andi L, KK, 3
  1167. blez L, .L78
  1168. nop
  1169. .align 3
  1170. .L76: # remainder loop: one k step per trip
  1171. MADD t11, t11, a1, b1 # single k step (uses the preloaded a1..a4 and b1)
  1172. MADD t21, t21, a2, b1
  1173. MADD t31, t31, a3, b1
  1174. MADD t41, t41, a4, b1
  1175. daddiu AO, AO, 4 * SIZE # AO += 4mr
  1176. daddiu BO, BO, 1 * SIZE # BO += 1nr
  1177. LD a1, 0 * SIZE(AO) # next
  1178. LD a2, 1 * SIZE(AO)
  1179. LD a3, 2 * SIZE(AO)
  1180. LD a4, 3 * SIZE(AO)
  1181. LD b1, 0 * SIZE(BO)
  1182. daddiu L, L, -1
  1183. bgtz L, .L76
  1184. nop
  1185. .L78: # solve step (assuming SUB/MUL/NMSUB map to the MIPS FPU ops of the same name)
  1186. LD b1, 0 * SIZE(BO) # 4 values at BO, paired below with t11, t21, t31, t41
  1187. LD b2, 1 * SIZE(BO)
  1188. LD b3, 2 * SIZE(BO)
  1189. LD b4, 3 * SIZE(BO)
  1190. SUB t11, b1, t11 # t = value from BO - accumulated product
  1191. SUB t21, b2, t21
  1192. SUB t31, b3, t31
  1193. SUB t41, b4, t41
  1194. LD a1, 0 * SIZE(AO) # sa is stored in column-major order: first column of the 4x4 block is AO[0..3]
  1195. LD a2, 1 * SIZE(AO)
  1196. LD a3, 2 * SIZE(AO)
  1197. LD a4, 3 * SIZE(AO)
  1198. MUL t11, a1, t11 # row 1 scaled by AO[0] (a multiply, so the packed diagonal is presumably a reciprocal - confirm in the packing code)
  1199. NMSUB t21, t21, a2, t11 # rows 2..4 -= AO[1..3] * solved row 1
  1200. NMSUB t31, t31, a3, t11
  1201. NMSUB t41, t41, a4, t11
  1202. LD a5, 5 * SIZE(AO) # second column from its diagonal down: AO[5..7]
  1203. LD a6, 6 * SIZE(AO)
  1204. LD a7, 7 * SIZE(AO)
  1205. MUL t21, a5, t21 # row 2 scaled by AO[5]
  1206. NMSUB t31, t31, a6, t21 # rows 3..4 -= AO[6..7] * solved row 2
  1207. NMSUB t41, t41, a7, t21
  1208. LD a8, 10 * SIZE(AO) # third column from its diagonal down: AO[10..11]
  1209. LD a1, 11 * SIZE(AO)
  1210. MUL t31, a8, t31 # row 3 scaled by AO[10]
  1211. NMSUB t41, t41, a1, t31 # row 4 -= AO[11] * solved row 3
  1212. LD a2, 15 * SIZE(AO) # last diagonal entry AO[15]
  1213. MUL t41, a2, t41 # row 4 scaled by AO[15]
  1214. ST t11, 0 * SIZE(BO) # write the solved values back over the 4 values read from BO
  1215. ST t21, 1 * SIZE(BO)
  1216. ST t31, 2 * SIZE(BO)
  1217. ST t41, 3 * SIZE(BO)
  1218. ST t11, 0 * SIZE(CO1) # and to the output column at CO1
  1219. ST t21, 1 * SIZE(CO1)
  1220. ST t31, 2 * SIZE(CO1)
  1221. ST t41, 3 * SIZE(CO1)
  1222. daddiu CO1, CO1, 4 * SIZE # advance the output pointer by mr=4 elements
  1223. dsubu TEMP, K, KK # TEMP = K - KK, the k steps not consumed by the loops above
  1224. dsll L, TEMP, 2 + BASE_SHIFT # L = TEMP * 4mr, shifted by BASE_SHIFT (presumably log2 of the element size)
  1225. dsll TEMP, TEMP, 0 + BASE_SHIFT # TEMP = TEMP * 1nr, same scaling
  1226. daddu AO, AO, L # AO += (K - KK) * 4mr
  1227. daddu BO, BO, TEMP # BO += (K - KK) * 1nr
  1228. daddiu KK, KK, 4 # KK += mr
  1229. daddiu I, I, -1
  1230. bgtz I, .L71 # next mr=4 tile
  1231. nop
  1232. .align 3
  1233. .L80: # mr=2 x nr=1 tile, taken when bit 1 of M is set
  1234. andi I, M, 2
  1235. blez I, .L90 # bit clear: skip to the mr=1 case
  1236. NOP
  1237. MTC $0, t11 # t11 = 0, moved from integer register $0
  1238. MOV t21, t11 # clear result registers
  1239. LD a1, 0 * SIZE(AO) # preload the first 2 values of A
  1240. LD a2, 1 * SIZE(AO)
  1241. LD b1, 0 * SIZE(B) # preload the first value of B
  1242. dsra L, KK, 2 # L = KK / 4 = trip count of the 4x unrolled loop
  1243. blez L, .L85
  1244. move BO, B # reset B; placed right after the branch (presumably its delay slot, so BO is set on both paths)
  1245. .align 3
  1246. .L82: # 4x unrolled k loop: t11 += a(1) * b, t21 += a(2) * b
  1247. LD a5, 2 * SIZE(AO)
  1248. LD a6, 3 * SIZE(AO)
  1249. LD b5, 1 * SIZE(BO)
  1250. MADD t11, t11, a1, b1 # 1st compute
  1251. MADD t21, t21, a2, b1
  1252. LD a3, 4 * SIZE(AO)
  1253. LD a4, 5 * SIZE(AO)
  1254. LD b3, 2 * SIZE(BO)
  1255. MADD t11, t11, a5, b5 # 2nd compute
  1256. MADD t21, t21, a6, b5
  1257. LD a7, 6 * SIZE(AO)
  1258. LD a8, 7 * SIZE(AO)
  1259. LD b7, 3 * SIZE(BO)
  1260. MADD t11, t11, a3, b3 # 3rd compute
  1261. MADD t21, t21, a4, b3
  1262. daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
  1263. daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
  1264. LD a1, 0 * SIZE(AO) # next: preload operands for the following trip
  1265. LD a2, 1 * SIZE(AO)
  1266. LD b1, 0 * SIZE(BO)
  1267. MADD t11, t11, a7, b7 # 4th compute
  1268. MADD t21, t21, a8, b7
  1269. daddiu L, L, -1
  1270. bgtz L, .L82
  1271. nop
  1272. .align 3
  1273. .L85: # k remainder: L = KK mod 4
  1274. andi L, KK, 3
  1275. blez L, .L88
  1276. nop
  1277. .align 3
  1278. .L86: # remainder loop: one k step per trip
  1279. MADD t11, t11, a1, b1 # single k step (uses the preloaded a1/a2/b1)
  1280. MADD t21, t21, a2, b1
  1281. daddiu AO, AO, 2 * SIZE # AO += 2mr
  1282. daddiu BO, BO, 1 * SIZE # BO += 1nr
  1283. LD a1, 0 * SIZE(AO) # next
  1284. LD a2, 1 * SIZE(AO)
  1285. LD b1, 0 * SIZE(BO)
  1286. daddiu L, L, -1
  1287. bgtz L, .L86
  1288. nop
  1289. .L88: # solve step (assuming SUB/MUL/NMSUB map to the MIPS FPU ops of the same name)
  1290. LD b1, 0 * SIZE(BO) # 2 values at BO, paired below with t11 and t21
  1291. LD b2, 1 * SIZE(BO)
  1292. SUB t11, b1, t11 # t = value from BO - accumulated product
  1293. SUB t21, b2, t21
  1294. LD b1, 0 * SIZE(AO) # computes the triangular part: uses AO[0], AO[1], AO[3] of the 2x2 block at AO
  1295. LD b2, 1 * SIZE(AO)
  1296. MUL t11, b1, t11 # row 1 scaled by AO[0] (a multiply, so the packed diagonal is presumably a reciprocal - confirm in the packing code)
  1297. NMSUB t21, t21, b2, t11 # row 2 -= AO[1] * solved row 1
  1298. LD b3, 3 * SIZE(AO)
  1299. MUL t21, b3, t21 # row 2 scaled by AO[3]
  1300. ST t11, 0 * SIZE(BO) # write the solved values back over the 2 values read from BO
  1301. ST t21, 1 * SIZE(BO)
  1302. ST t11, 0 * SIZE(CO1) # and to the output column at CO1
  1303. ST t21, 1 * SIZE(CO1)
  1304. daddiu CO1, CO1, 2 * SIZE # advance the output pointer by mr=2 elements
  1305. dsubu TEMP, K, KK # TEMP = K - KK, the k steps not consumed by the loops above
  1306. dsll L, TEMP, 1 + BASE_SHIFT # L = TEMP * 2mr, shifted by BASE_SHIFT (presumably log2 of the element size)
  1307. dsll TEMP, TEMP, 0 + BASE_SHIFT # TEMP = TEMP * 1nr, same scaling
  1308. daddu AO, AO, L # AO += (K - KK) * 2mr
  1309. daddu BO, BO, TEMP # BO += (K - KK) * 1nr
  1310. daddiu KK, KK, 2 # KK += mr
  1311. .align 3
  1312. .L90: # mr=1 x nr=1 tile, taken when bit 0 of M is set
  1313. andi I, M, 1 # mr=1
  1314. blez I, .L89 # bit clear: nothing left for this column
  1315. NOP
  1316. MTC $0, t11 # clear the single result register: t11 = 0 moved from $0
  1317. LD a1, 0 * SIZE(AO) # preload the first value of A
  1318. LD b1, 0 * SIZE(B) # preload the first value of B
  1319. dsra L, KK, 2 # L = KK / 4 = trip count of the 4x unrolled loop
  1320. blez L, .L95
  1321. move BO, B # reset B; placed right after the branch (presumably its delay slot, so BO is set on both paths)
  1322. .align 3
  1323. .L92: # 4x unrolled k loop: t11 += a * b
  1324. LD a5, 1 * SIZE(AO)
  1325. LD b5, 1 * SIZE(BO)
  1326. MADD t11, t11, a1, b1 # 1st compute
  1327. LD a3, 2 * SIZE(AO)
  1328. LD b3, 2 * SIZE(BO)
  1329. MADD t11, t11, a5, b5 # 2nd compute
  1330. LD a7, 3 * SIZE(AO)
  1331. LD b7, 3 * SIZE(BO)
  1332. MADD t11, t11, a3, b3 # 3rd compute
  1333. daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
  1334. daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
  1335. LD a1, 0 * SIZE(AO) # next: preload operands for the following trip
  1336. LD b1, 0 * SIZE(BO)
  1337. MADD t11, t11, a7, b7 # 4th compute
  1338. daddiu L, L, -1
  1339. bgtz L, .L92
  1340. nop
  1341. .align 3
  1342. .L95: # k remainder: L = KK mod 4
  1343. andi L, KK, 3
  1344. blez L, .L98
  1345. nop
  1346. .align 3
  1347. .L96: # remainder loop: one k step per trip
  1348. MADD t11, t11, a1, b1 # single k step (uses the preloaded a1/b1)
  1349. daddiu AO, AO, 1 * SIZE # AO += 1mr
  1350. daddiu BO, BO, 1 * SIZE # BO += 1nr
  1351. LD a1, 0 * SIZE(AO) # next
  1352. LD b1, 0 * SIZE(BO)
  1353. daddiu L, L, -1
  1354. bgtz L, .L96
  1355. nop
  1356. .L98: # solve step (assuming SUB/MUL map to the MIPS FPU ops of the same name)
  1357. LD b1, 0 * SIZE(BO)
  1358. SUB t11, b1, t11 # t11 = value from BO - accumulated product
  1359. LD b1, 0 * SIZE(AO) # computes the triangular part: a single factor at AO[0]
  1360. MUL t11, b1, t11 # scaled by AO[0] (a multiply, so the packed value is presumably a reciprocal - confirm in the packing code)
  1361. ST t11, 0 * SIZE(BO) # write the solved value back over the value read from BO
  1362. ST t11, 0 * SIZE(CO1) # and to the output column at CO1
  1363. daddiu CO1, CO1, 1 * SIZE # advance the output pointer by mr=1 element
  1364. dsubu TEMP, K, KK # TEMP = K - KK, the k steps not consumed by the loops above
  1365. dsll L, TEMP, BASE_SHIFT # L = TEMP * 1mr, shifted by BASE_SHIFT (presumably log2 of the element size)
  1366. dsll TEMP, TEMP, BASE_SHIFT # TEMP = TEMP * 1nr, same scaling
  1367. daddu AO, AO, L # AO += (K - KK) * 1mr
  1368. daddu BO, BO, TEMP # BO += (K - KK) * 1nr
  1369. daddiu KK, KK, 1 # KK += mr
  1370. .align 3
  1371. .L89: # end of the nr=1 column
  1372. move B, BO # B takes the advanced BO
  1373. .align 3
  1374. .L999: # common exit: reload registers from the stack frame, release it and return
  1375. LDARG $16, 0($sp) # integer registers $16..$21 from offsets 0..40
  1376. LDARG $17, 8($sp)
  1377. LDARG $18, 16($sp)
  1378. LDARG $19, 24($sp)
  1379. LDARG $20, 32($sp)
  1380. LDARG $21, 40($sp)
  1381. ldc1 $f24, 48($sp) # FP registers $f24..$f28 from offsets 48..80
  1382. ldc1 $f25, 56($sp)
  1383. ldc1 $f26, 64($sp)
  1384. ldc1 $f27, 72($sp)
  1385. ldc1 $f28, 80($sp)
  1386. LDARG $22, 88($sp) # integer registers $22..$25 from offsets 88..112
  1387. LDARG $23, 96($sp)
  1388. LDARG $24, 104($sp)
  1389. LDARG $25, 112($sp) # NOTE(review): offset 112 is also read into $f20 below when __64BIT__ is undefined - confirm the store layout in the prologue
  1390. #ifndef __64BIT__
  1391. ldc1 $f20,112($sp) # only without __64BIT__: FP registers $f20..$f23 from offsets 112..136
  1392. ldc1 $f21,120($sp)
  1393. ldc1 $f22,128($sp)
  1394. ldc1 $f23,136($sp)
  1395. #endif
  1396. j $31 # return through $31
  1397. daddiu $sp, $sp, 144 # release the 144-byte frame; placed after the jump (presumably its delay slot)
  1398. EPILOGUE