You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemm_kernel.S 44 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859
  1. /***************************************************************************
  2. Copyright (c) 2021, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  /* Register aliases used by the kernel that follows.  Roles are described
     from how the kernel code uses each alias. */
  /* Problem description.  M is consumed two rows at a time (M >> 1, then
     M & 1), N in panels of eight columns (N >> 3, then N & 4), and K in
     unrolled groups of four (K >> 2, then K & 3).  A, B and C are the base
     pointers of the two input panels and the output; LDC is shifted left by
     BASE_SHIFT on entry and then used as the distance between consecutive
     C column pointers.
     NOTE(review): $r4..$r10 look like the incoming integer argument
     registers in order -- confirm against the LoongArch calling convention. */
  29. #define M $r4
  30. #define N $r5
  31. #define K $r6
  32. #define A $r7
  33. #define B $r8
  34. #define C $r9
  35. #define LDC $r10
  /* AO / BO: running read pointers into A and B, advanced inside the loops. */
  36. #define AO $r12
  37. #define BO $r13
  /* Loop counters: I walks rows of the tile (M direction), J walks column
     panels (N direction), L walks the K direction.  In the TRMM build L is
     also used as scratch for a byte offset. */
  38. #define I $r17
  39. #define J $r18
  40. #define L $r30
  /* NOTE(review): no use of PREFETCHSIZE appears in the portion of the file
     reviewed here -- check the remainder before removing it. */
  41. #define PREFETCHSIZE (4 * 10)
  /* CO1..CO8: write pointers for eight consecutive columns of C; each one is
     the previous pointer plus LDC. */
  42. #define CO1 $r14
  43. #define CO2 $r15
  44. #define CO3 $r23
  45. #define CO4 $r24
  46. #define CO5 $r25
  47. #define CO6 $r26
  48. #define CO7 $r27
  49. #define CO8 $r28
  /* BB: set to B + (K << (2 + BASE_SHIFT)) and used only as a preld address,
     stepped by 16 * SIZE after each 2-row tile. */
  50. #define BB $r29
  /* TRMM-only registers.  KK is seeded from OFFSET (or its negation when LEFT
     is not defined) and advanced as tiles complete; TEMP is scratch for the
     effective K trip count and for byte offsets into the packed panels. */
  51. #if defined(TRMMKERNEL)
  52. #define OFFSET $r11
  53. #define KK $r20
  54. #define TEMP $r16
  55. #endif
  /* a1..a4: operands loaded from the A panel (through AO). */
  56. #define a1 $f22
  57. #define a2 $f8
  58. #define a3 $f27
  59. #define a4 $f28
  /* b1..b7: operands loaded from the B panel (through BO, or B before BO is
     set).  The non-TRMM write-back also reuses $f22, $f8, $f23 and $f9..$f13
     as temporaries for values loaded from C. */
  60. #define b1 $f23
  61. #define b2 $f9
  62. #define b3 $f10
  63. #define b4 $f11
  64. #define b5 $f12
  65. #define b6 $f13
  66. #define b7 $f14
  67. #define b8 $f15
  /* a5 names the same register as b8, so the two cannot be live together. */
  68. #define a5 b8
  /* Accumulators cXY: X selects the C column (stored through CO<X>), Y the
     row inside the 2-row tile (Y = 1 pairs with a1, Y = 2 with a2 in the
     2-row loops). */
  69. #define c11 $f16
  70. #define c12 $f17
  71. #define c21 $f3
  72. #define c22 $f1
  73. #define c31 $f2
  74. #define c32 $f4
  75. #define c41 $f5
  76. #define c42 $f6
  77. #define c51 $f7
  78. #define c52 $f18
  79. #define c61 $f19
  80. #define c62 $f20
  81. #define c71 $f21
  82. #define c72 $f24
  83. #define c81 $f25
  84. #define c82 $f26
  /* ALPHA: scalar applied to every accumulator before it is stored to C. */
  85. #define ALPHA $f0
  86. PROLOGUE
  87. addi.d $sp, $sp, -160
  88. SDARG $r23, $sp, 0
  89. SDARG $r24, $sp, 8
  90. SDARG $r25, $sp, 16
  91. SDARG $r26, $sp, 24
  92. SDARG $r27, $sp, 32
  93. SDARG $r28, $sp, 40
  94. SDARG $r29, $sp, 48
  95. SDARG $r30, $sp, 96
  96. fst.d $f24, $sp, 56
  97. fst.d $f25, $sp, 64
  98. fst.d $f26, $sp, 72
  99. fst.d $f27, $sp, 80
  100. fst.d $f28, $sp, 88
  101. #if defined(TRMMKERNEL)
  102. SDARG $r20, $sp, 104
  103. SDARG $r16, $sp, 112
  104. #endif
  105. #ifndef __64BIT__
  106. fst.d $f18, $sp, 120
  107. fst.d $f19, $sp, 128
  108. fst.d $f20, $sp, 136
  109. fst.d $f21, $sp, 144
  110. #endif
  111. slli.d LDC, LDC, BASE_SHIFT
  112. #if defined(TRMMKERNEL) && !defined(LEFT)
  113. sub.d KK, $r0, OFFSET
  114. #endif
  115. srai.d J, N, 3
  116. nop
  117. bge $r0, J, .L30
  118. .L10:
  119. move CO1, C
  120. MTC c11, $r0
  121. add.d CO2, C, LDC
  122. move AO, A
  123. add.d CO3, CO2, LDC
  124. addi.d J, J, -1
  125. add.d CO4, CO3, LDC
  126. MOV c21, c11
  127. add.d CO5, CO4, LDC
  128. MOV c31, c11
  129. add.d CO6, CO5, LDC
  130. MOV c41, c11
  131. add.d CO7, CO6, LDC
  132. MOV c51, c11
  133. add.d CO8, CO7, LDC
  134. srai.d I, M, 1
  135. add.d C, CO8, LDC
  136. slli.d BB, K, 2 + BASE_SHIFT
  137. add.d BB, B, BB
  138. #if defined(TRMMKERNEL) && defined(LEFT)
  139. move KK, OFFSET
  140. #endif
  141. MOV c61, c11
  142. bge $r0, I, .L20
  143. .L11:
  144. #if defined(TRMMKERNEL)
  145. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  146. move BO, B
  147. #else
  148. slli.d L, KK, 1 + BASE_SHIFT
  149. slli.d TEMP, KK, 3 + BASE_SHIFT
  150. add.d AO, AO, L
  151. add.d BO, B, TEMP
  152. #endif
  153. LD a1, AO, 0 * SIZE
  154. MOV c71, c11
  155. LD b1, BO, 0 * SIZE
  156. MOV c81, c11
  157. LD a3, AO, 4 * SIZE
  158. MOV c12, c11
  159. LD b2, BO, 1 * SIZE
  160. MOV c22, c11
  161. MOV c32, c11
  162. LD b3, BO, 2 * SIZE
  163. MOV c42, c11
  164. LD b4, BO, 3 * SIZE
  165. MOV c52, c11
  166. LD b5, BO, 4 * SIZE
  167. MOV c62, c11
  168. LD b6, BO, 8 * SIZE
  169. MOV c72, c11
  170. LD b7, BO, 12 * SIZE
  171. MOV c82, c11
  172. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  173. sub.d TEMP, K, KK
  174. #elif defined(LEFT)
  175. addi.d TEMP, KK, 2
  176. #else
  177. addi.d TEMP, KK, 8
  178. #endif
  179. srai.d L, TEMP, 2
  180. bge $r0, L, .L15
  181. #else
  182. LD a1, AO, 0 * SIZE
  183. MOV c71, c11
  184. LD b1, B, 0 * SIZE
  185. MOV c81, c11
  186. preld 1, CO1, 3 * SIZE
  187. preld 1, CO2, 3 * SIZE
  188. LD a3, AO, 4 * SIZE
  189. MOV c12, c11
  190. LD b2, B, 1 * SIZE
  191. MOV c22, c11
  192. srai.d L, K, 2
  193. MOV c32, c11
  194. LD b3, B, 2 * SIZE
  195. MOV c42, c11
  196. LD b4, B, 3 * SIZE
  197. MOV c52, c11
  198. LD b5, B, 4 * SIZE
  199. MOV c62, c11
  200. LD b6, B, 8 * SIZE
  201. MOV c72, c11
  202. LD b7, B, 12 * SIZE
  203. MOV c82, c11
  204. move BO, B
  205. bge $r0, L, .L15
  206. #endif
  207. MADD c11, b1, a1, c11
  208. LD a2, AO, 1 * SIZE
  209. MADD c21, b2, a1, c21
  210. addi.d L, L, -1
  211. MADD c31, b3, a1, c31
  212. MADD c41, b4, a1, c41
  213. bge $r0, L, .L13
  214. preld 1, CO3, 2 * SIZE
  215. .align 3
  216. .L12:
  217. MADD c12, b1, a2, c12
  218. LD b1, BO, 16 * SIZE
  219. MADD c22, b2, a2, c22
  220. LD b2, BO, 5 * SIZE
  221. MADD c32, b3, a2, c32
  222. LD b3, BO, 6 * SIZE
  223. MADD c42, b4, a2, c42
  224. LD b4, BO, 7 * SIZE
  225. MADD c51, b5, a1, c51
  226. LD a4, AO, 2 * SIZE
  227. MADD c61, b2, a1, c61
  228. MADD c71, b3, a1, c71
  229. MADD c81, b4, a1, c81
  230. LD a1, AO, 8 * SIZE
  231. MADD c52, b5, a2, c52
  232. LD b5, BO, 20 * SIZE
  233. MADD c62, b2, a2, c62
  234. LD b2, BO, 9 * SIZE
  235. MADD c72, b3, a2, c72
  236. LD b3, BO, 10 * SIZE
  237. MADD c82, b4, a2, c82
  238. LD b4, BO, 11 * SIZE
  239. MADD c11, b6, a4, c11
  240. LD a2, AO, 3 * SIZE
  241. MADD c21, b2, a4, c21
  242. MADD c31, b3, a4, c31
  243. MADD c41, b4, a4, c41
  244. MADD c12, b6, a2, c12
  245. LD b6, BO, 24 * SIZE
  246. MADD c22, b2, a2, c22
  247. LD b2, BO, 13 * SIZE
  248. MADD c32, b3, a2, c32
  249. LD b3, BO, 14 * SIZE
  250. MADD c42, b4, a2, c42
  251. LD b4, BO, 15 * SIZE
  252. MADD c51, b7, a4, c51
  253. MADD c61, b2, a4, c61
  254. MADD c71, b3, a4, c71
  255. MADD c81, b4, a4, c81
  256. MADD c52, b7, a2, c52
  257. LD b7, BO, 28 * SIZE
  258. MADD c62, b2, a2, c62
  259. LD b2, BO, 17 * SIZE
  260. MADD c72, b3, a2, c72
  261. LD b3, BO, 18 * SIZE
  262. MADD c82, b4, a2, c82
  263. LD b4, BO, 19 * SIZE
  264. MADD c11, b1, a3, c11
  265. LD a2, AO, 5 * SIZE
  266. MADD c21, b2, a3, c21
  267. MADD c31, b3, a3, c31
  268. MADD c41, b4, a3, c41
  269. MADD c12, b1, a2, c12
  270. LD b1, BO, 32 * SIZE
  271. MADD c22, b2, a2, c22
  272. LD b2, BO, 21 * SIZE
  273. MADD c32, b3, a2, c32
  274. LD b3, BO, 22 * SIZE
  275. MADD c42, b4, a2, c42
  276. LD b4, BO, 23 * SIZE
  277. MADD c51, b5, a3, c51
  278. LD a4, AO, 6 * SIZE
  279. MADD c61, b2, a3, c61
  280. MADD c71, b3, a3, c71
  281. MADD c81, b4, a3, c81
  282. LD a3, AO, 12 * SIZE
  283. MADD c52, b5, a2, c52
  284. LD b5, BO, 36 * SIZE
  285. MADD c62, b2, a2, c62
  286. LD b2, BO, 25 * SIZE
  287. MADD c72, b3, a2, c72
  288. LD b3, BO, 26 * SIZE
  289. MADD c82, b4, a2, c82
  290. LD b4, BO, 27 * SIZE
  291. MADD c11, b6, a4, c11
  292. LD a2, AO, 7 * SIZE
  293. MADD c21, b2, a4, c21
  294. MADD c31, b3, a4, c31
  295. MADD c41, b4, a4, c41
  296. addi.d L, L, -1
  297. MADD c12, b6, a2, c12
  298. LD b6, BO, 40 * SIZE
  299. MADD c22, b2, a2, c22
  300. LD b2, BO, 29 * SIZE
  301. MADD c32, b3, a2, c32
  302. LD b3, BO, 30 * SIZE
  303. MADD c42, b4, a2, c42
  304. LD b4, BO, 31 * SIZE
  305. MADD c51, b7, a4, c51
  306. addi.d BO, BO, 32 * SIZE
  307. MADD c61, b2, a4, c61
  308. addi.d AO, AO, 8 * SIZE
  309. MADD c71, b3, a4, c71
  310. MADD c81, b4, a4, c81
  311. MADD c52, b7, a2, c52
  312. LD b7, BO, 12 * SIZE
  313. MADD c62, b2, a2, c62
  314. LD b2, BO, 1 * SIZE
  315. MADD c72, b3, a2, c72
  316. LD b3, BO, 2 * SIZE
  317. MADD c82, b4, a2, c82
  318. LD b4, BO, 3 * SIZE
  319. MADD c11, b1, a1, c11
  320. LD a2, AO, 1 * SIZE
  321. MADD c21, b2, a1, c21
  322. MADD c31, b3, a1, c31
  323. MADD c41, b4, a1, c41
  324. blt $r0, L, .L12
  325. .align 3
  326. .L13:
  327. MADD c12, b1, a2, c12
  328. LD b1, BO, 16 * SIZE
  329. MADD c22, b2, a2, c22
  330. LD b2, BO, 5 * SIZE
  331. MADD c32, b3, a2, c32
  332. LD b3, BO, 6 * SIZE
  333. MADD c42, b4, a2, c42
  334. LD b4, BO, 7 * SIZE
  335. MADD c51, b5, a1, c51
  336. MADD c61, b2, a1, c61
  337. LD a4, AO, 2 * SIZE
  338. MADD c71, b3, a1, c71
  339. MADD c81, b4, a1, c81
  340. LD a1, AO, 8 * SIZE
  341. MADD c52, b5, a2, c52
  342. LD b5, BO, 20 * SIZE
  343. MADD c62, b2, a2, c62
  344. LD b2, BO, 9 * SIZE
  345. MADD c72, b3, a2, c72
  346. LD b3, BO, 10 * SIZE
  347. MADD c82, b4, a2, c82
  348. LD b4, BO, 11 * SIZE
  349. MADD c11, b6, a4, c11
  350. LD a2, AO, 3 * SIZE
  351. MADD c21, b2, a4, c21
  352. MADD c31, b3, a4, c31
  353. preld 1, CO4, 3 * SIZE
  354. MADD c41, b4, a4, c41
  355. MADD c12, b6, a2, c12
  356. LD b6, BO, 24 * SIZE
  357. MADD c22, b2, a2, c22
  358. LD b2, BO, 13 * SIZE
  359. MADD c32, b3, a2, c32
  360. LD b3, BO, 14 * SIZE
  361. MADD c42, b4, a2, c42
  362. LD b4, BO, 15 * SIZE
  363. MADD c51, b7, a4, c51
  364. preld 1, CO5, 3 * SIZE
  365. MADD c61, b2, a4, c61
  366. MADD c71, b3, a4, c71
  367. preld 1, CO6, 3 * SIZE
  368. MADD c81, b4, a4, c81
  369. MADD c52, b7, a2, c52
  370. LD b7, BO, 28 * SIZE
  371. MADD c62, b2, a2, c62
  372. LD b2, BO, 17 * SIZE
  373. MADD c72, b3, a2, c72
  374. LD b3, BO, 18 * SIZE
  375. MADD c82, b4, a2, c82
  376. LD b4, BO, 19 * SIZE
  377. MADD c11, b1, a3, c11
  378. LD a2, AO, 5 * SIZE
  379. MADD c21, b2, a3, c21
  380. MADD c31, b3, a3, c31
  381. preld 1, CO7, 3 * SIZE
  382. MADD c41, b4, a3, c41
  383. MADD c12, b1, a2, c12
  384. LD b1, BO, 32 * SIZE
  385. MADD c22, b2, a2, c22
  386. LD b2, BO, 21 * SIZE
  387. MADD c32, b3, a2, c32
  388. LD b3, BO, 22 * SIZE
  389. MADD c42, b4, a2, c42
  390. LD b4, BO, 23 * SIZE
  391. MADD c51, b5, a3, c51
  392. MADD c61, b2, a3, c61
  393. LD a4, AO, 6 * SIZE
  394. MADD c71, b3, a3, c71
  395. MADD c81, b4, a3, c81
  396. MADD c52, b5, a2, c52
  397. LD b5, BO, 36 * SIZE
  398. MADD c62, b2, a2, c62
  399. LD b2, BO, 25 * SIZE
  400. MADD c72, b3, a2, c72
  401. LD b3, BO, 26 * SIZE
  402. MADD c82, b4, a2, c82
  403. LD b4, BO, 27 * SIZE
  404. MADD c11, b6, a4, c11
  405. LD a2, AO, 7 * SIZE
  406. MADD c21, b2, a4, c21
  407. MADD c31, b3, a4, c31
  408. MADD c41, b4, a4, c41
  409. MADD c12, b6, a2, c12
  410. LD b6, BO, 40 * SIZE
  411. MADD c22, b2, a2, c22
  412. LD b2, BO, 29 * SIZE
  413. MADD c32, b3, a2, c32
  414. LD b3, BO, 30 * SIZE
  415. MADD c42, b4, a2, c42
  416. LD b4, BO, 31 * SIZE
  417. MADD c51, b7, a4, c51
  418. addi.d BO, BO, 32 * SIZE
  419. MADD c61, b2, a4, c61
  420. addi.d AO, AO, 8 * SIZE
  421. MADD c71, b3, a4, c71
  422. MADD c81, b4, a4, c81
  423. MADD c52, b7, a2, c52
  424. LD b7, BO, 12 * SIZE
  425. MADD c62, b2, a2, c62
  426. LD b2, BO, 1 * SIZE
  427. MADD c72, b3, a2, c72
  428. LD b3, BO, 2 * SIZE
  429. MADD c82, b4, a2, c82
  430. LD b4, BO, 3 * SIZE
  431. .align 3
  432. .L15:
  433. #ifndef TRMMKERNEL
  434. andi L, K, 3
  435. #else
  436. andi L, TEMP, 3
  437. #endif
  438. preld 1, CO8, 3 * SIZE
  439. bge $r0, L, .L18
  440. .align 3
  441. .L16:
  442. MADD c11, b1, a1, c11
  443. LD a2, AO, 1 * SIZE
  444. MADD c21, b2, a1, c21
  445. MADD c31, b3, a1, c31
  446. MADD c41, b4, a1, c41
  447. MADD c12, b1, a2, c12
  448. LD b1, BO, 8 * SIZE
  449. MADD c22, b2, a2, c22
  450. LD b2, BO, 5 * SIZE
  451. MADD c32, b3, a2, c32
  452. LD b3, BO, 6 * SIZE
  453. MADD c42, b4, a2, c42
  454. LD b4, BO, 7 * SIZE
  455. MADD c51, b5, a1, c51
  456. addi.d L, L, -1
  457. MADD c61, b2, a1, c61
  458. addi.d AO, AO, 2 * SIZE
  459. MADD c71, b3, a1, c71
  460. addi.d BO, BO, 8 * SIZE
  461. MADD c81, b4, a1, c81
  462. LD a1, AO, 0 * SIZE
  463. MADD c52, b5, a2, c52
  464. LD b5, BO, 4 * SIZE
  465. MADD c62, b2, a2, c62
  466. LD b2, BO, 1 * SIZE
  467. MADD c72, b3, a2, c72
  468. LD b3, BO, 2 * SIZE
  469. MADD c82, b4, a2, c82
  470. LD b4, BO, 3 * SIZE
  471. blt $r0, L, .L16
  472. .L18:
  473. #ifndef TRMMKERNEL
  474. LD $f22, CO1, 0 * SIZE
  475. addi.d CO3,CO3, 2 * SIZE
  476. LD $f8, CO1, 1 * SIZE
  477. addi.d CO1,CO1, 2 * SIZE
  478. LD $f23, CO2, 0 * SIZE
  479. addi.d CO4,CO4, 2 * SIZE
  480. LD $f9, CO2, 1 * SIZE
  481. addi.d CO2,CO2, 2 * SIZE
  482. LD $f10, CO3, -2 * SIZE
  483. addi.d CO5,CO5, 2 * SIZE
  484. LD $f11, CO3, -1 * SIZE
  485. addi.d CO6,CO6, 2 * SIZE
  486. LD $f12, CO4, -2 * SIZE
  487. addi.d CO7,CO7, 2 * SIZE
  488. LD $f13, CO4, -1 * SIZE
  489. addi.d I, I, -1
  490. MADD c11, c11, ALPHA, $f22
  491. LD $f22, CO5, -2 * SIZE
  492. MADD c12, c12, ALPHA, $f8
  493. LD $f8, CO5, -1 * SIZE
  494. MADD c21, c21, ALPHA, $f23
  495. LD $f23, CO6, -2 * SIZE
  496. MADD c22, c22, ALPHA, $f9
  497. LD $f9, CO6, -1 * SIZE
  498. MADD c31, c31, ALPHA, $f10
  499. LD $f10, CO7, -2 * SIZE
  500. MADD c32, c32, ALPHA, $f11
  501. LD $f11, CO7, -1 * SIZE
  502. MADD c41, c41, ALPHA, $f12
  503. LD $f12, CO8, 0 * SIZE
  504. MADD c42, c42, ALPHA, $f13
  505. LD $f13, CO8, 1 * SIZE
  506. preld 0, BB, 0 * SIZE
  507. preld 0, BB, 8 * SIZE
  508. ST c11, CO1, -2 * SIZE
  509. MTC c11, $r0
  510. ST c12, CO1, -1 * SIZE
  511. addi.d CO8,CO8, 2 * SIZE
  512. ST c21, CO2, -2 * SIZE
  513. MOV c21, c11
  514. ST c22, CO2, -1 * SIZE
  515. addi.d BB, BB, 16 * SIZE
  516. MADD c51, c51, ALPHA, $f22
  517. ST c31, CO3, -2 * SIZE
  518. MADD c52, c52, ALPHA, $f8
  519. ST c32, CO3, -1 * SIZE
  520. MADD c61, c61, ALPHA, $f23
  521. ST c41, CO4, -2 * SIZE
  522. MADD c62, c62, ALPHA, $f9
  523. ST c42, CO4, -1 * SIZE
  524. MADD c71, c71, ALPHA, $f10
  525. ST c51, CO5, -2 * SIZE
  526. MADD c72, c72, ALPHA, $f11
  527. ST c52, CO5, -1 * SIZE
  528. MADD c81, c81, ALPHA, $f12
  529. ST c61, CO6, -2 * SIZE
  530. MADD c82, c82, ALPHA, $f13
  531. ST c62, CO6, -1 * SIZE
  532. ST c71, CO7, -2 * SIZE
  533. MOV c31, c11
  534. ST c72, CO7, -1 * SIZE
  535. MOV c41, c11
  536. ST c81, CO8, -2 * SIZE
  537. MOV c51, c11
  538. ST c82, CO8, -1 * SIZE
  539. MOV c61, c11
  540. blt $r0, I, .L11
  541. #else
  542. addi.d CO4,CO4, 2 * SIZE
  543. addi.d CO5,CO5, 2 * SIZE
  544. addi.d CO6,CO6, 2 * SIZE
  545. addi.d CO7,CO7, 2 * SIZE
  546. preld 0, BB, 0 * SIZE
  547. preld 0, BB, 8 * SIZE
  548. MUL c11, ALPHA, c11
  549. addi.d CO1,CO1, 2 * SIZE
  550. MUL c12, ALPHA, c12
  551. MTC a1, $r0
  552. MUL c21, ALPHA, c21
  553. addi.d CO2,CO2, 2 * SIZE
  554. MUL c22, ALPHA, c22
  555. addi.d CO3,CO3, 2 * SIZE
  556. ST c11, CO1, -2 * SIZE
  557. MUL c31, ALPHA, c31
  558. ST c12, CO1, -1 * SIZE
  559. MUL c32, ALPHA, c32
  560. ST c21, CO2, -2 * SIZE
  561. MUL c41, ALPHA, c41
  562. ST c22, CO2, -1 * SIZE
  563. MUL c42, ALPHA, c42
  564. ST c31, CO3, -2 * SIZE
  565. MUL c51, ALPHA, c51
  566. ST c32, CO3, -1 * SIZE
  567. MUL c52, ALPHA, c52
  568. ST c41, CO4, -2 * SIZE
  569. MUL c61, ALPHA, c61
  570. ST c42, CO4, -1 * SIZE
  571. MUL c62, ALPHA, c62
  572. ST c51, CO5, -2 * SIZE
  573. MUL c71, ALPHA, c71
  574. ST c52, CO5, -1 * SIZE
  575. MUL c72, ALPHA, c72
  576. ST c61, CO6, -2 * SIZE
  577. MUL c81, ALPHA, c81
  578. ST c62, CO6, -1 * SIZE
  579. MUL c82, ALPHA, c82
  580. ST c71, CO7, -2 * SIZE
  581. MOV c11, a1
  582. ST c72, CO7, -1 * SIZE
  583. MOV c21, a1
  584. addi.d CO8,CO8, 2 * SIZE
  585. addi.d BB, BB, 16 * SIZE
  586. ST c81, CO8, -2 * SIZE
  587. MOV c31, a1
  588. ST c82, CO8, -1 * SIZE
  589. MOV c41, a1
  590. addi.d I, I, -1
  591. MOV c51, a1
  592. #if ( defined(LEFT) && defined(TRANSA)) || \
  593. (!defined(LEFT) && !defined(TRANSA))
  594. sub.d TEMP, K, KK
  595. #ifdef LEFT
  596. addi.d TEMP, TEMP, -2
  597. #else
  598. addi.d TEMP, TEMP, -8
  599. #endif
  600. slli.d L, TEMP, 1 + BASE_SHIFT
  601. slli.d TEMP, TEMP, 3 + BASE_SHIFT
  602. add.d AO, AO, L
  603. add.d BO, BO, TEMP
  604. #endif
  605. #ifdef LEFT
  606. addi.d KK, KK, 2
  607. #endif
  608. MOV c61, a1
  609. blt $r0, I, .L11
  610. #endif
  611. .align 3
  612. .L20:
  613. andi I, M, 1
  614. MOV c61, c11
  615. MOV c71, c11
  616. bge $r0, I, .L29
  617. #if defined(TRMMKERNEL)
  618. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  619. move BO, B
  620. #else
  621. slli.d L, KK, 0 + BASE_SHIFT
  622. slli.d TEMP, KK, 3 + BASE_SHIFT
  623. add.d AO, AO, L
  624. add.d BO, B, TEMP
  625. #endif
  626. LD a1, AO, 0 * SIZE
  627. LD a2, AO, 1 * SIZE
  628. LD a3, AO, 2 * SIZE
  629. LD a4, AO, 3 * SIZE
  630. LD b1, BO, 0 * SIZE
  631. LD b2, BO, 1 * SIZE
  632. LD b3, BO, 2 * SIZE
  633. LD b4, BO, 3 * SIZE
  634. LD b5, BO, 4 * SIZE
  635. LD b6, BO, 8 * SIZE
  636. LD b7, BO, 12 * SIZE
  637. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  638. sub.d TEMP, K, KK
  639. #elif defined(LEFT)
  640. addi.d TEMP, KK, 1
  641. #else
  642. addi.d TEMP, KK, 8
  643. #endif
  644. srai.d L, TEMP, 2
  645. MOV c81, c11
  646. bge $r0, L, .L25
  647. #else
  648. LD a1, AO, 0 * SIZE
  649. LD a2, AO, 1 * SIZE
  650. LD a3, AO, 2 * SIZE
  651. LD a4, AO, 3 * SIZE
  652. LD b1, B, 0 * SIZE
  653. LD b2, B, 1 * SIZE
  654. LD b3, B, 2 * SIZE
  655. LD b4, B, 3 * SIZE
  656. LD b5, B, 4 * SIZE
  657. LD b6, B, 8 * SIZE
  658. LD b7, B, 12 * SIZE
  659. srai.d L, K, 2
  660. MOV c81, c11
  661. move BO, B
  662. bge $r0, L, .L25
  663. #endif
  664. .align 3
  665. .L22:
  666. MADD c11, b1, a1, c11
  667. LD b1, BO, 16 * SIZE
  668. MADD c21, b2, a1, c21
  669. LD b2, BO, 5 * SIZE
  670. MADD c31, b3, a1, c31
  671. LD b3, BO, 6 * SIZE
  672. MADD c41, b4, a1, c41
  673. LD b4, BO, 7 * SIZE
  674. MADD c51, b5, a1, c51
  675. LD b5, BO, 20 * SIZE
  676. MADD c61, b2, a1, c61
  677. LD b2, BO, 9 * SIZE
  678. MADD c71, b3, a1, c71
  679. LD b3, BO, 10 * SIZE
  680. MADD c81, b4, a1, c81
  681. LD b4, BO, 11 * SIZE
  682. LD a1, AO, 4 * SIZE
  683. addi.d L, L, -1
  684. MADD c11, b6, a2, c11
  685. LD b6, BO, 24 * SIZE
  686. MADD c21, b2, a2, c21
  687. LD b2, BO, 13 * SIZE
  688. MADD c31, b3, a2, c31
  689. LD b3, BO, 14 * SIZE
  690. MADD c41, b4, a2, c41
  691. LD b4, BO, 15 * SIZE
  692. MADD c51, b7, a2, c51
  693. LD b7, BO, 28 * SIZE
  694. MADD c61, b2, a2, c61
  695. LD b2, BO, 17 * SIZE
  696. MADD c71, b3, a2, c71
  697. LD b3, BO, 18 * SIZE
  698. MADD c81, b4, a2, c81
  699. LD b4, BO, 19 * SIZE
  700. LD a2, AO, 5 * SIZE
  701. addi.d AO, AO, 4 * SIZE
  702. MADD c11, b1, a3, c11
  703. LD b1, BO, 32 * SIZE
  704. MADD c21, b2, a3, c21
  705. LD b2, BO, 21 * SIZE
  706. MADD c31, b3, a3, c31
  707. LD b3, BO, 22 * SIZE
  708. MADD c41, b4, a3, c41
  709. LD b4, BO, 23 * SIZE
  710. MADD c51, b5, a3, c51
  711. LD b5, BO, 36 * SIZE
  712. MADD c61, b2, a3, c61
  713. LD b2, BO, 25 * SIZE
  714. MADD c71, b3, a3, c71
  715. LD b3, BO, 26 * SIZE
  716. MADD c81, b4, a3, c81
  717. LD b4, BO, 27 * SIZE
  718. LD a3, AO, 2 * SIZE
  719. addi.d BO, BO, 32 * SIZE
  720. MADD c11, b6, a4, c11
  721. LD b6, BO, 8 * SIZE
  722. MADD c21, b2, a4, c21
  723. LD b2, BO, -3 * SIZE
  724. MADD c31, b3, a4, c31
  725. LD b3, BO, -2 * SIZE
  726. MADD c41, b4, a4, c41
  727. LD b4, BO, -1 * SIZE
  728. MADD c51, b7, a4, c51
  729. LD b7, BO, 12 * SIZE
  730. MADD c61, b2, a4, c61
  731. LD b2, BO, 1 * SIZE
  732. MADD c71, b3, a4, c71
  733. LD b3, BO, 2 * SIZE
  734. MADD c81, b4, a4, c81
  735. LD b4, BO, 3 * SIZE
  736. LD a4, AO, 3 * SIZE
  737. blt $r0, L, .L22
  738. .align 3
  739. .L25:
  740. #ifndef TRMMKERNEL
  741. andi L, K, 3
  742. #else
  743. andi L, TEMP, 3
  744. #endif
  745. bge $r0, L, .L28
  746. .align 3
  747. .L26:
  748. MADD c11, b1, a1, c11
  749. LD b1, BO, 8 * SIZE
  750. MADD c21, b2, a1, c21
  751. LD b2, BO, 5 * SIZE
  752. MADD c31, b3, a1, c31
  753. LD b3, BO, 6 * SIZE
  754. MADD c41, b4, a1, c41
  755. LD b4, BO, 7 * SIZE
  756. addi.d L, L, -1
  757. MOV a2, a2
  758. addi.d AO, AO, 1 * SIZE
  759. addi.d BO, BO, 8 * SIZE
  760. MADD c51, b5, a1, c51
  761. LD b5, BO, 4 * SIZE
  762. MADD c61, b2, a1, c61
  763. LD b2, BO, 1 * SIZE
  764. MADD c71, b3, a1, c71
  765. LD b3, BO, 2 * SIZE
  766. MADD c81, b4, a1, c81
  767. LD a1, AO, 0 * SIZE
  768. LD b4, BO, 3 * SIZE
  769. blt $r0, L, .L26
  770. .L28:
  771. #ifndef TRMMKERNEL
  772. LD $f22, CO1, 0 * SIZE
  773. LD $f8, CO2, 0 * SIZE
  774. LD $f23, CO3, 0 * SIZE
  775. LD $f9, CO4, 0 * SIZE
  776. MADD c11, c11, ALPHA, $f22
  777. LD $f10, CO5, 0 * SIZE
  778. MADD c21, c21, ALPHA, $f8
  779. LD $f11, CO6, 0 * SIZE
  780. MADD c31, c31, ALPHA, $f23
  781. LD $f12, CO7, 0 * SIZE
  782. MADD c41, c41, ALPHA, $f9
  783. LD $f13, CO8, 0 * SIZE
  784. MADD c51, c51, ALPHA, $f10
  785. ST c11, CO1, 0 * SIZE
  786. MADD c61, c61, ALPHA, $f11
  787. ST c21, CO2, 0 * SIZE
  788. MADD c71, c71, ALPHA, $f12
  789. ST c31, CO3, 0 * SIZE
  790. MADD c81, c81, ALPHA, $f13
  791. ST c41, CO4, 0 * SIZE
  792. ST c51, CO5, 0 * SIZE
  793. ST c61, CO6, 0 * SIZE
  794. ST c71, CO7, 0 * SIZE
  795. ST c81, CO8, 0 * SIZE
  796. #else
  797. MUL c11, ALPHA, c11
  798. MUL c21, ALPHA, c21
  799. MUL c31, ALPHA, c31
  800. MUL c41, ALPHA, c41
  801. ST c11, CO1, 0 * SIZE
  802. MUL c51, ALPHA, c51
  803. ST c21, CO2, 0 * SIZE
  804. MUL c61, ALPHA, c61
  805. ST c31, CO3, 0 * SIZE
  806. MUL c71, ALPHA, c71
  807. ST c41, CO4, 0 * SIZE
  808. MUL c81, ALPHA, c81
  809. ST c51, CO5, 0 * SIZE
  810. ST c61, CO6, 0 * SIZE
  811. ST c71, CO7, 0 * SIZE
  812. ST c81, CO8, 0 * SIZE
  813. #if ( defined(LEFT) && defined(TRANSA)) || \
  814. (!defined(LEFT) && !defined(TRANSA))
  815. sub.d TEMP, K, KK
  816. #ifdef LEFT
  817. addi.d TEMP, TEMP, -1
  818. #else
  819. addi.d TEMP, TEMP, -8
  820. #endif
  821. slli.d L, TEMP, 0 + BASE_SHIFT
  822. slli.d TEMP, TEMP, 3 + BASE_SHIFT
  823. add.d AO, AO, L
  824. add.d BO, BO, TEMP
  825. #endif
  826. #ifdef LEFT
  827. addi.d KK, KK, 1
  828. #endif
  829. #endif
  830. .align 3
  831. .L29:
  832. #if defined(TRMMKERNEL) && !defined(LEFT)
  833. addi.d KK, KK, 8
  834. #endif
  835. move B, BO
  836. blt $r0, J, .L10
  837. .align 3
  /*
   * .L30: runs when bit 2 of N is set (J = N & 4); otherwise control goes to
   * .L50.  CO1..CO4 are set to C, C+LDC, C+2*LDC and C+3*LDC, and C itself is
   * moved one LDC past CO4.  c11 is loaded from $r0 via MTC (presumably a
   * zero) and copied into c21/c31/c41.  I = M >> 1 is the number of passes of
   * the .L31 loop; if I <= 0 the code jumps to .L40.
   * NOTE(review): LD/ST/MADD/MUL/MOV/MTC and the register names are macros
   * defined outside this chunk; MADD d, x, y, z presumably yields x*y + z.
   */
  838. .L30:
  839. andi J, N, 4
  840. move AO, A
  841. bge $r0, J, .L50
  842. move CO1, C
  843. MTC c11, $r0
  844. add.d CO2, C, LDC
  845. add.d CO3, CO2, LDC
  846. add.d CO4, CO3, LDC
  847. MOV c21, c11
  848. add.d C, CO4, LDC
  849. MOV c31, c11
  850. #if defined(TRMMKERNEL) && defined(LEFT)
  851. move KK, OFFSET
  852. #endif
  853. srai.d I, M, 1
  854. MOV c41, c11
  855. bge $r0, I, .L40
  /*
   * .L31: per-pass setup.  Preloads a1 = AO[0], a3 = AO[4], b1..b4 = BO[0..3],
   * b5 = BO[4], b6 = BO[8], b7 = BO[12] and copies c11 into c12/c22/c32/c42.
   * L = (trip count) >> 2 drives the unrolled loop .L32.
   * TRMM builds: BO is either B, or B offset by KK*4 elements with AO offset
   * by KK*2 elements; the trip count is TEMP (K-KK, KK+2 or KK+4) instead of K.
   */
  856. .L31:
  857. #if defined(TRMMKERNEL)
  858. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  859. move BO, B
  860. #else
  861. slli.d L, KK, 1 + BASE_SHIFT
  862. slli.d TEMP, KK, 2 + BASE_SHIFT
  863. add.d AO, AO, L
  864. add.d BO, B, TEMP
  865. #endif
  866. LD a1, AO, 0 * SIZE
  867. LD a3, AO, 4 * SIZE
  868. LD b1, BO, 0 * SIZE
  869. MOV c12, c11
  870. LD b2, BO, 1 * SIZE
  871. MOV c22, c11
  872. LD b3, BO, 2 * SIZE
  873. MOV c32, c11
  874. LD b4, BO, 3 * SIZE
  875. MOV c42, c11
  876. LD b5, BO, 4 * SIZE
  877. LD b6, BO, 8 * SIZE
  878. LD b7, BO, 12 * SIZE
  879. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  880. sub.d TEMP, K, KK
  881. #elif defined(LEFT)
  882. addi.d TEMP, KK, 2
  883. #else
  884. addi.d TEMP, KK, 4
  885. #endif
  886. srai.d L, TEMP, 2
  887. bge $r0, L, .L35
  888. #else
  889. LD a1, AO, 0 * SIZE
  890. LD a3, AO, 4 * SIZE
  891. LD b1, B, 0 * SIZE
  892. MOV c12, c11
  893. LD b2, B, 1 * SIZE
  894. MOV c22, c11
  895. LD b3, B, 2 * SIZE
  896. MOV c32, c11
  897. LD b4, B, 3 * SIZE
  898. MOV c42, c11
  899. LD b5, B, 4 * SIZE
  900. srai.d L, K, 2
  901. LD b6, B, 8 * SIZE
  902. LD b7, B, 12 * SIZE
  903. move BO, B
  904. bge $r0, L, .L35
  905. #endif
  906. .align 3
  /*
   * .L32: inner loop unrolled four times.  One pass issues 32 MADDs into the
   * eight accumulators c11..c42, advances AO by 8 elements and BO by 16, and
   * reloads a1/a2/a3 and b1..b7 between the MADDs so that the registers hold
   * the same relative elements again when the loop branches back.
   */
  907. .L32:
  908. MADD c11, b1, a1, c11
  909. LD a2, AO, 1 * SIZE
  910. MADD c21, b2, a1, c21
  911. addi.d L, L, -1
  912. MADD c31, b3, a1, c31
  913. MADD c41, b4, a1, c41
  914. LD a1, AO, 2 * SIZE
  915. MADD c12, b1, a2, c12
  916. LD b1, BO, 16 * SIZE
  917. MADD c22, b2, a2, c22
  918. LD b2, BO, 5 * SIZE
  919. MADD c32, b3, a2, c32
  920. LD b3, BO, 6 * SIZE
  921. MADD c42, b4, a2, c42
  922. LD b4, BO, 7 * SIZE
  923. MADD c11, b5, a1, c11
  924. LD a2, AO, 3 * SIZE
  925. MADD c21, b2, a1, c21
  926. MADD c31, b3, a1, c31
  927. MADD c41, b4, a1, c41
  928. LD a1, AO, 8 * SIZE
  929. MADD c12, b5, a2, c12
  930. LD b5, BO, 20 * SIZE
  931. MADD c22, b2, a2, c22
  932. LD b2, BO, 9 * SIZE
  933. MADD c32, b3, a2, c32
  934. LD b3, BO, 10 * SIZE
  935. MADD c42, b4, a2, c42
  936. LD b4, BO, 11 * SIZE
  937. MADD c11, b6, a3, c11
  938. LD a2, AO, 5 * SIZE
  939. MADD c21, b2, a3, c21
  940. MADD c31, b3, a3, c31
  941. MADD c41, b4, a3, c41
  942. LD a3, AO, 6 * SIZE
  943. MADD c12, b6, a2, c12
  944. LD b6, BO, 24 * SIZE
  945. MADD c22, b2, a2, c22
  946. LD b2, BO, 13 * SIZE
  947. MADD c32, b3, a2, c32
  948. LD b3, BO, 14 * SIZE
  949. MADD c42, b4, a2, c42
  950. LD b4, BO, 15 * SIZE
  951. MADD c11, b7, a3, c11
  952. LD a2, AO, 7 * SIZE
  953. MADD c21, b2, a3, c21
  954. addi.d AO, AO, 8 * SIZE
  955. MADD c31, b3, a3, c31
  956. addi.d BO, BO, 16 * SIZE
  957. MADD c41, b4, a3, c41
  958. LD a3, AO, 4 * SIZE
  959. MADD c12, b7, a2, c12
  960. LD b7, BO, 12 * SIZE
  961. MADD c22, b2, a2, c22
  962. LD b2, BO, 1 * SIZE
  963. MADD c32, b3, a2, c32
  964. LD b3, BO, 2 * SIZE
  965. MADD c42, b4, a2, c42
  966. LD b4, BO, 3 * SIZE
  967. blt $r0, L, .L32
  968. .align 3
  /* .L35: remainder count L = (K, or TEMP in TRMM builds) & 3; skip to .L38 if zero. */
  969. .L35:
  970. #ifndef TRMMKERNEL
  971. andi L, K, 3
  972. #else
  973. andi L, TEMP, 3
  974. #endif
  975. bge $r0, L, .L38
  976. .align 3
  /* .L36: one remainder step per pass: 8 MADDs, AO += 2 elements, BO += 4 elements. */
  977. .L36:
  978. MADD c11, b1, a1, c11
  979. LD a2, AO, 1 * SIZE
  980. MADD c21, b2, a1, c21
  981. addi.d L, L, -1
  982. MADD c31, b3, a1, c31
  983. addi.d AO, AO, 2 * SIZE
  984. MADD c41, b4, a1, c41
  985. LD a1, AO, 0 * SIZE
  986. MADD c12, b1, a2, c12
  987. LD b1, BO, 4 * SIZE
  988. MADD c22, b2, a2, c22
  989. LD b2, BO, 5 * SIZE
  990. MADD c32, b3, a2, c32
  991. LD b3, BO, 6 * SIZE
  992. MADD c42, b4, a2, c42
  993. LD b4, BO, 7 * SIZE
  994. addi.d BO, BO, 4 * SIZE
  995. blt $r0, L, .L36
  /*
   * .L38: write the eight results back through CO1..CO4.
   * Non-TRMM build: the old values are loaded and each result is combined as
   * MADD(acc, ALPHA, old).  TRMM build: each result is MUL(ALPHA, acc) and the
   * old contents are not read.
   * CO1..CO4 are bumped by 2 elements before the stores, which is why the
   * stores (and the CO3/CO4 loads) use offsets -2 and -1.
   * c11/c21/c31 (and c41 just before the branch) are reset for the next pass
   * and I is decremented.
   */
  996. .L38:
  997. #ifndef TRMMKERNEL
  998. LD $f22, CO1, 0 * SIZE
  999. addi.d CO3,CO3, 2 * SIZE
  1000. LD $f8, CO1, 1 * SIZE
  1001. addi.d CO1,CO1, 2 * SIZE
  1002. LD $f23, CO2, 0 * SIZE
  1003. addi.d CO4,CO4, 2 * SIZE
  1004. LD $f9, CO2, 1 * SIZE
  1005. addi.d CO2,CO2, 2 * SIZE
  1006. LD $f10, CO3, -2 * SIZE
  1007. MADD c11, c11, ALPHA, $f22
  1008. LD $f11, CO3, -1 * SIZE
  1009. MADD c12, c12, ALPHA, $f8
  1010. LD $f12, CO4, -2 * SIZE
  1011. MADD c21, c21, ALPHA, $f23
  1012. LD $f13, CO4, -1 * SIZE
  1013. MADD c22, c22, ALPHA, $f9
  1014. MADD c31, c31, ALPHA, $f10
  1015. ST c11, CO1, -2 * SIZE
  1016. MADD c32, c32, ALPHA, $f11
  1017. ST c12, CO1, -1 * SIZE
  1018. MADD c41, c41, ALPHA, $f12
  1019. ST c21, CO2, -2 * SIZE
  1020. MADD c42, c42, ALPHA, $f13
  1021. ST c22, CO2, -1 * SIZE
  1022. ST c31, CO3, -2 * SIZE
  1023. MTC c11, $r0
  1024. ST c32, CO3, -1 * SIZE
  1025. addi.d I, I, -1
  1026. ST c41, CO4, -2 * SIZE
  1027. MOV c21, c11
  1028. ST c42, CO4, -1 * SIZE
  1029. MOV c31, c11
  1030. #else
  1031. MUL c11, ALPHA, c11
  1032. addi.d CO3,CO3, 2 * SIZE
  1033. MUL c12, ALPHA, c12
  1034. addi.d CO1,CO1, 2 * SIZE
  1035. MUL c21, ALPHA, c21
  1036. addi.d CO4,CO4, 2 * SIZE
  1037. MUL c22, ALPHA, c22
  1038. addi.d CO2,CO2, 2 * SIZE
  1039. ST c11, CO1, -2 * SIZE
  1040. MUL c31, ALPHA, c31
  1041. ST c12, CO1, -1 * SIZE
  1042. MUL c32, ALPHA, c32
  1043. ST c21, CO2, -2 * SIZE
  1044. MUL c41, ALPHA, c41
  1045. ST c22, CO2, -1 * SIZE
  1046. MUL c42, ALPHA, c42
  1047. ST c31, CO3, -2 * SIZE
  1048. MTC c11, $r0
  1049. ST c32, CO3, -1 * SIZE
  1050. addi.d I, I, -1
  1051. ST c41, CO4, -2 * SIZE
  1052. MOV c21, c11
  1053. ST c42, CO4, -1 * SIZE
  1054. MOV c31, c11
  /*
   * TRMM only: in the configurations selected below, TEMP = K - KK - 2 (LEFT)
   * or K - KK - 4 (otherwise); AO is then advanced by 2*TEMP elements and BO
   * by 4*TEMP elements.  With LEFT, KK grows by 2 per pass.
   */
  1055. #if ( defined(LEFT) && defined(TRANSA)) || \
  1056. (!defined(LEFT) && !defined(TRANSA))
  1057. sub.d TEMP, K, KK
  1058. #ifdef LEFT
  1059. addi.d TEMP, TEMP, -2
  1060. #else
  1061. addi.d TEMP, TEMP, -4
  1062. #endif
  1063. slli.d L, TEMP, 1 + BASE_SHIFT
  1064. slli.d TEMP, TEMP, 2 + BASE_SHIFT
  1065. add.d AO, AO, L
  1066. add.d BO, BO, TEMP
  1067. #endif
  1068. #ifdef LEFT
  1069. addi.d KK, KK, 2
  1070. #endif
  1071. #endif
  1072. MOV c41, c11
  1073. blt $r0, I, .L31
  1074. .align 3
  /*
   * .L40: tail of the N & 4 group for odd M (I = M & 1); jumps to .L49 when
   * M is even.  Only c11/c21/c31/c41 are accumulated and stored here.  c61,
   * c71 and c81 are set to c11 but are not referenced again within this chunk.
   * Setup preloads a1 = AO[0], a2 = AO[1], b1..b4 = BO[0..3], b5 = BO[4],
   * b6 = BO[8], b7 = BO[12]; L = (trip count) >> 2.
   * TRMM builds: AO is offset by KK elements and BO by KK*4 elements (or BO is
   * simply B), and the trip count is TEMP (K-KK, KK+1 or KK+4).
   */
  1075. .L40:
  1076. andi I, M, 1
  1077. MOV c61, c11
  1078. bge $r0, I, .L49
  1079. #if defined(TRMMKERNEL)
  1080. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1081. move BO, B
  1082. #else
  1083. slli.d L, KK, 0 + BASE_SHIFT
  1084. slli.d TEMP, KK, 2 + BASE_SHIFT
  1085. add.d AO, AO, L
  1086. add.d BO, B, TEMP
  1087. #endif
  1088. LD a1, AO, 0 * SIZE
  1089. MOV c71, c11
  1090. LD a2, AO, 1 * SIZE
  1091. MOV c81, c11
  1092. LD b1, BO, 0 * SIZE
  1093. LD b2, BO, 1 * SIZE
  1094. LD b3, BO, 2 * SIZE
  1095. LD b4, BO, 3 * SIZE
  1096. LD b5, BO, 4 * SIZE
  1097. LD b6, BO, 8 * SIZE
  1098. LD b7, BO, 12 * SIZE
  1099. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1100. sub.d TEMP, K, KK
  1101. #elif defined(LEFT)
  1102. addi.d TEMP, KK, 1
  1103. #else
  1104. addi.d TEMP, KK, 4
  1105. #endif
  1106. srai.d L, TEMP, 2
  1107. bge $r0, L, .L45
  1108. #else
  1109. LD a1, AO, 0 * SIZE
  1110. MOV c71, c11
  1111. LD a2, AO, 1 * SIZE
  1112. MOV c81, c11
  1113. LD b1, B, 0 * SIZE
  1114. LD b2, B, 1 * SIZE
  1115. LD b3, B, 2 * SIZE
  1116. LD b4, B, 3 * SIZE
  1117. LD b5, B, 4 * SIZE
  1118. LD b6, B, 8 * SIZE
  1119. LD b7, B, 12 * SIZE
  1120. srai.d L, K, 2
  1121. move BO, B
  1122. bge $r0, L, .L45
  1123. #endif
  1124. .align 3
  /*
   * .L42: inner loop unrolled four times, one AO element per step.  One pass
   * issues 16 MADDs into c11/c21/c31/c41, advances AO by 4 elements and BO by
   * 16.  AO and BO are bumped mid-pass, so the later loads use offsets relative
   * to the already-advanced pointers (e.g. AO[-1] is the 4th element of the
   * pass).
   */
  1125. .L42:
  1126. MADD c11, b1, a1, c11
  1127. LD b1, BO, 16 * SIZE
  1128. MADD c21, b2, a1, c21
  1129. LD b2, BO, 5 * SIZE
  1130. MADD c31, b3, a1, c31
  1131. LD b3, BO, 6 * SIZE
  1132. MADD c41, b4, a1, c41
  1133. LD b4, BO, 7 * SIZE
  1134. LD a1, AO, 4 * SIZE
  1135. addi.d L, L, -1
  1136. MADD c11, b5, a2, c11
  1137. LD b5, BO, 20 * SIZE
  1138. MADD c21, b2, a2, c21
  1139. LD b2, BO, 9 * SIZE
  1140. MADD c31, b3, a2, c31
  1141. LD b3, BO, 10 * SIZE
  1142. MADD c41, b4, a2, c41
  1143. LD b4, BO, 11 * SIZE
  1144. LD a2, AO, 2 * SIZE
  1145. addi.d AO, AO, 4 * SIZE
  1146. MADD c11, b6, a2, c11
  1147. LD b6, BO, 24 * SIZE
  1148. MADD c21, b2, a2, c21
  1149. LD b2, BO, 13 * SIZE
  1150. MADD c31, b3, a2, c31
  1151. LD b3, BO, 14 * SIZE
  1152. MADD c41, b4, a2, c41
  1153. LD b4, BO, 15 * SIZE
  1154. LD a2, AO, -1 * SIZE
  1155. addi.d BO, BO, 16 * SIZE
  1156. MADD c11, b7, a2, c11
  1157. LD b7, BO, 12 * SIZE
  1158. MADD c21, b2, a2, c21
  1159. LD b2, BO, 1 * SIZE
  1160. MADD c31, b3, a2, c31
  1161. LD b3, BO, 2 * SIZE
  1162. MADD c41, b4, a2, c41
  1163. LD b4, BO, 3 * SIZE
  1164. LD a2, AO, 1 * SIZE
  1165. blt $r0, L, .L42
  1166. .align 3
  /* .L45: remainder count L = (K, or TEMP in TRMM builds) & 3; skip to .L48 if zero. */
  1167. .L45:
  1168. #ifndef TRMMKERNEL
  1169. andi L, K, 3
  1170. #else
  1171. andi L, TEMP, 3
  1172. #endif
  1173. bge $r0, L, .L48
  1174. .align 3
  /*
   * .L46: one remainder step per pass: 4 MADDs, AO += 1 element, BO += 4
   * elements.  MOV a2, a2 has the same source and destination register.
   */
  1175. .L46:
  1176. MADD c11, b1, a1, c11
  1177. LD b1, BO, 4 * SIZE
  1178. MADD c21, b2, a1, c21
  1179. LD b2, BO, 5 * SIZE
  1180. MADD c31, b3, a1, c31
  1181. LD b3, BO, 6 * SIZE
  1182. MADD c41, b4, a1, c41
  1183. LD a1, AO, 1 * SIZE
  1184. LD b4, BO, 7 * SIZE
  1185. addi.d L, L, -1
  1186. addi.d AO, AO, 1 * SIZE
  1187. MOV a2, a2
  1188. addi.d BO, BO, 4 * SIZE
  1189. blt $r0, L, .L46
  /*
   * .L48: write one result through each of CO1..CO4 (offset 0).
   * Non-TRMM: MADD(acc, ALPHA, old value); TRMM: MUL(ALPHA, acc).
   * TRMM tail: in the configurations selected below TEMP = K - KK - 1 (LEFT)
   * or K - KK - 4; AO advances by TEMP elements and BO by 4*TEMP elements.
   * With LEFT, KK grows by 1.
   */
  1190. .L48:
  1191. #ifndef TRMMKERNEL
  1192. LD $f22, CO1, 0 * SIZE
  1193. LD $f8, CO2, 0 * SIZE
  1194. LD $f23, CO3, 0 * SIZE
  1195. LD $f9, CO4, 0 * SIZE
  1196. MADD c11, c11, ALPHA, $f22
  1197. MADD c21, c21, ALPHA, $f8
  1198. MADD c31, c31, ALPHA, $f23
  1199. MADD c41, c41, ALPHA, $f9
  1200. ST c11, CO1, 0 * SIZE
  1201. ST c21, CO2, 0 * SIZE
  1202. ST c31, CO3, 0 * SIZE
  1203. ST c41, CO4, 0 * SIZE
  1204. #else
  1205. MUL c11, ALPHA, c11
  1206. MUL c21, ALPHA, c21
  1207. MUL c31, ALPHA, c31
  1208. MUL c41, ALPHA, c41
  1209. ST c11, CO1, 0 * SIZE
  1210. ST c21, CO2, 0 * SIZE
  1211. ST c31, CO3, 0 * SIZE
  1212. ST c41, CO4, 0 * SIZE
  1213. #if ( defined(LEFT) && defined(TRANSA)) || \
  1214. (!defined(LEFT) && !defined(TRANSA))
  1215. sub.d TEMP, K, KK
  1216. #ifdef LEFT
  1217. addi.d TEMP, TEMP, -1
  1218. #else
  1219. addi.d TEMP, TEMP, -4
  1220. #endif
  1221. slli.d L, TEMP, 0 + BASE_SHIFT
  1222. slli.d TEMP, TEMP, 2 + BASE_SHIFT
  1223. add.d AO, AO, L
  1224. add.d BO, BO, TEMP
  1225. #endif
  1226. #ifdef LEFT
  1227. addi.d KK, KK, 1
  1228. #endif
  1229. #endif
  1230. .align 3
  /*
   * .L49: end of the N & 4 group.  TRMM without LEFT adds 4 to KK.  B is set
   * to BO (presumably so the next group continues where this one stopped
   * reading - confirm against the packing layout).
   */
  1231. .L49:
  1232. #if defined(TRMMKERNEL) && !defined(LEFT)
  1233. addi.d KK, KK, 4
  1234. #endif
  1235. move B, BO
  1236. .align 3
  /*
   * .L50: runs when bit 1 of N is set (J = N & 2); otherwise control goes to
   * .L70.  CO1 = C, CO2 = C + LDC, and C is moved one LDC past CO2.
   * I = M >> 1 is the number of passes of the .L51 loop; if I <= 0 the code
   * jumps to .L60.
   */
  1237. .L50:
  1238. andi J, N, 2
  1239. move AO, A
  1240. bge $r0, J, .L70
  1241. move CO1, C
  1242. add.d CO2, C, LDC
  1243. #if defined(TRMMKERNEL) && defined(LEFT)
  1244. move KK, OFFSET
  1245. #endif
  1246. srai.d I, M, 1
  1247. add.d C, CO2, LDC
  1248. bge $r0, I, .L60
  /*
   * .L51: per-pass setup.  c11 is loaded from $r0 via MTC and copied into
   * c21/c12/c22.  Preloads a1 = AO[0], a2 = AO[1], a5 = AO[4], b1..b3 =
   * BO[0..2], b5 = BO[4]; b6 and b7 are also loaded but are not used by .L52
   * or .L56.  L = (trip count) >> 2.
   * TRMM builds: AO and BO are each offset by KK*2 elements (or BO is simply
   * B), and the trip count is TEMP (K-KK or KK+2).
   */
  1249. .L51:
  1250. #if defined(TRMMKERNEL)
  1251. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1252. move BO, B
  1253. #else
  1254. slli.d L, KK, 1 + BASE_SHIFT
  1255. slli.d TEMP, KK, 1 + BASE_SHIFT
  1256. add.d AO, AO, L
  1257. add.d BO, B, TEMP
  1258. #endif
  1259. LD a1, AO, 0 * SIZE
  1260. MTC c11, $r0
  1261. LD a2, AO, 1 * SIZE
  1262. MOV c21, c11
  1263. LD a5, AO, 4 * SIZE
  1264. LD b1, BO, 0 * SIZE
  1265. MOV c12, c11
  1266. LD b2, BO, 1 * SIZE
  1267. MOV c22, c11
  1268. LD b3, BO, 2 * SIZE
  1269. LD b5, BO, 4 * SIZE
  1270. LD b6, BO, 8 * SIZE
  1271. LD b7, BO, 12 * SIZE
  1272. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1273. sub.d TEMP, K, KK
  1274. #elif defined(LEFT)
  1275. addi.d TEMP, KK, 2
  1276. #else
  1277. addi.d TEMP, KK, 2
  1278. #endif
  1279. srai.d L, TEMP, 2
  1280. bge $r0, L, .L55
  1281. #else
  1282. LD a1, AO, 0 * SIZE
  1283. MTC c11, $r0
  1284. LD a2, AO, 1 * SIZE
  1285. MOV c21, c11
  1286. LD a5, AO, 4 * SIZE
  1287. LD b1, B, 0 * SIZE
  1288. MOV c12, c11
  1289. LD b2, B, 1 * SIZE
  1290. MOV c22, c11
  1291. LD b3, B, 2 * SIZE
  1292. LD b5, B, 4 * SIZE
  1293. srai.d L, K, 2
  1294. LD b6, B, 8 * SIZE
  1295. LD b7, B, 12 * SIZE
  1296. move BO, B
  1297. bge $r0, L, .L55
  1298. #endif
  1299. .align 3
  /*
   * .L52: inner loop unrolled four times.  One pass issues 16 MADDs into
   * c11/c21/c12/c22 and advances both AO and BO by 8 elements.  Loads at
   * offsets 8, 9, 10 and 12 fetch values for the following pass before the
   * pointers are bumped at the bottom of the loop.
   */
  1300. .L52:
  1301. MADD c11, b1, a1, c11
  1302. LD a3, AO, 2 * SIZE
  1303. MADD c21, b2, a1, c21
  1304. LD b4, BO, 3 * SIZE
  1305. MADD c12, b1, a2, c12
  1306. LD a4, AO, 3 * SIZE
  1307. MADD c22, b2, a2, c22
  1308. LD b1, BO, 8 * SIZE
  1309. MADD c11, b3, a3, c11
  1310. LD a1, AO, 8 * SIZE
  1311. MADD c21, b4, a3, c21
  1312. LD b2, BO, 5 * SIZE
  1313. MADD c12, b3, a4, c12
  1314. LD a2, AO, 5 * SIZE
  1315. MADD c22, b4, a4, c22
  1316. LD b3, BO, 6 * SIZE
  1317. MADD c11, b5, a5, c11
  1318. LD a3, AO, 6 * SIZE
  1319. MADD c21, b2, a5, c21
  1320. LD b4, BO, 7 * SIZE
  1321. MADD c12, b5, a2, c12
  1322. LD a4, AO, 7 * SIZE
  1323. MADD c22, b2, a2, c22
  1324. LD b5, BO, 12 * SIZE
  1325. MADD c11, b3, a3, c11
  1326. LD a5, AO, 12 * SIZE
  1327. MADD c21, b4, a3, c21
  1328. LD b2, BO, 9 * SIZE
  1329. MADD c12, b3, a4, c12
  1330. LD a2, AO, 9 * SIZE
  1331. MADD c22, b4, a4, c22
  1332. LD b3, BO, 10 * SIZE
  1333. addi.d AO, AO, 8 * SIZE
  1334. addi.d L, L, -1
  1335. addi.d BO, BO, 8 * SIZE
  1336. blt $r0, L, .L52
  1337. .align 3
  /* .L55: remainder count L = (K, or TEMP in TRMM builds) & 3; skip to .L58 if zero. */
  1338. .L55:
  1339. #ifndef TRMMKERNEL
  1340. andi L, K, 3
  1341. #else
  1342. andi L, TEMP, 3
  1343. #endif
  1344. bge $r0, L, .L58
  1345. .align 3
  /* .L56: one remainder step per pass: 4 MADDs, AO += 2 elements, BO += 2 elements. */
  1346. .L56:
  1347. MADD c11, b1, a1, c11
  1348. LD a2, AO, 1 * SIZE
  1349. MADD c21, b2, a1, c21
  1350. LD a1, AO, 2 * SIZE
  1351. MADD c12, b1, a2, c12
  1352. LD b1, BO, 2 * SIZE
  1353. MADD c22, b2, a2, c22
  1354. LD b2, BO, 3 * SIZE
  1355. addi.d L, L, -1
  1356. addi.d AO, AO, 2 * SIZE
  1357. addi.d BO, BO, 2 * SIZE
  1358. blt $r0, L, .L56
  /*
   * .L58: write two results through each of CO1 and CO2, then loop to .L51
   * while I > 0 (the branch appears once in each preprocessor arm).
   * Non-TRMM: MADD(acc, ALPHA, old value); TRMM: MUL(ALPHA, acc).
   * CO1/CO2 are bumped by 2 elements before the stores, hence offsets -2/-1.
   * TRMM tail: in the configurations selected below TEMP = K - KK - 2 and both
   * AO and BO advance by 2*TEMP elements; with LEFT, KK grows by 2.
   */
  1359. .L58:
  1360. #ifndef TRMMKERNEL
  1361. LD $f22, CO1, 0 * SIZE
  1362. addi.d I, I, -1
  1363. LD $f8, CO1, 1 * SIZE
  1364. addi.d CO1,CO1, 2 * SIZE
  1365. LD $f23, CO2, 0 * SIZE
  1366. LD $f9, CO2, 1 * SIZE
  1367. addi.d CO2,CO2, 2 * SIZE
  1368. MADD c11, c11, ALPHA, $f22
  1369. MADD c12, c12, ALPHA, $f8
  1370. MADD c21, c21, ALPHA, $f23
  1371. MADD c22, c22, ALPHA, $f9
  1372. ST c11, CO1, -2 * SIZE
  1373. ST c12, CO1, -1 * SIZE
  1374. ST c21, CO2, -2 * SIZE
  1375. ST c22, CO2, -1 * SIZE
  1376. blt $r0, I, .L51
  1377. #else
  1378. addi.d I, I, -1
  1379. addi.d CO1,CO1, 2 * SIZE
  1380. addi.d CO2,CO2, 2 * SIZE
  1381. MUL c11, ALPHA, c11
  1382. MUL c12, ALPHA, c12
  1383. MUL c21, ALPHA, c21
  1384. MUL c22, ALPHA, c22
  1385. ST c11, CO1, -2 * SIZE
  1386. ST c12, CO1, -1 * SIZE
  1387. ST c21, CO2, -2 * SIZE
  1388. ST c22, CO2, -1 * SIZE
  1389. #if ( defined(LEFT) && defined(TRANSA)) || \
  1390. (!defined(LEFT) && !defined(TRANSA))
  1391. sub.d TEMP, K, KK
  1392. #ifdef LEFT
  1393. addi.d TEMP, TEMP, -2
  1394. #else
  1395. addi.d TEMP, TEMP, -2
  1396. #endif
  1397. slli.d L, TEMP, 1 + BASE_SHIFT
  1398. slli.d TEMP, TEMP, 1 + BASE_SHIFT
  1399. add.d AO, AO, L
  1400. add.d BO, BO, TEMP
  1401. #endif
  1402. #ifdef LEFT
  1403. addi.d KK, KK, 2
  1404. #endif
  1405. blt $r0, I, .L51
  1406. #endif
  1407. .align 3
  /*
   * .L60: tail of the N & 2 group for odd M (I = M & 1); jumps to .L69 when
   * M is even.  Four accumulators c11/c21/c31/c41 are cleared; .L62 spreads
   * the products over all four and .L68 folds c31 into c11 and c41 into c21
   * before storing.  Preloads a1..a4 = AO[0..3] and b1..b4 = BO[0..3]; b5, b6
   * and b7 are also loaded but are not used by .L62 or .L66.
   * TRMM builds: AO is offset by KK elements and BO by KK*2 elements (or BO is
   * simply B), and the trip count is TEMP (K-KK, KK+1 or KK+2).
   */
  1408. .L60:
  1409. andi I, M, 1
  1410. bge $r0, I, .L69
  1411. #if defined(TRMMKERNEL)
  1412. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1413. move BO, B
  1414. #else
  1415. slli.d L, KK, 0 + BASE_SHIFT
  1416. slli.d TEMP, KK, 1 + BASE_SHIFT
  1417. add.d AO, AO, L
  1418. add.d BO, B, TEMP
  1419. #endif
  1420. LD a1, AO, 0 * SIZE
  1421. MTC c11, $r0
  1422. LD a2, AO, 1 * SIZE
  1423. MOV c21, c11
  1424. LD a3, AO, 2 * SIZE
  1425. MOV c31, c11
  1426. LD a4, AO, 3 * SIZE
  1427. MOV c41, c11
  1428. LD b1, BO, 0 * SIZE
  1429. LD b2, BO, 1 * SIZE
  1430. LD b3, BO, 2 * SIZE
  1431. LD b4, BO, 3 * SIZE
  1432. LD b5, BO, 4 * SIZE
  1433. LD b6, BO, 8 * SIZE
  1434. LD b7, BO, 12 * SIZE
  1435. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1436. sub.d TEMP, K, KK
  1437. #elif defined(LEFT)
  1438. addi.d TEMP, KK, 1
  1439. #else
  1440. addi.d TEMP, KK, 2
  1441. #endif
  1442. srai.d L, TEMP, 2
  1443. bge $r0, L, .L65
  1444. #else
  1445. srai.d L, K, 2
  1446. LD a1, AO, 0 * SIZE
  1447. MTC c11, $r0
  1448. LD a2, AO, 1 * SIZE
  1449. MOV c21, c11
  1450. LD a3, AO, 2 * SIZE
  1451. MOV c31, c11
  1452. LD a4, AO, 3 * SIZE
  1453. MOV c41, c11
  1454. LD b1, B, 0 * SIZE
  1455. LD b2, B, 1 * SIZE
  1456. LD b3, B, 2 * SIZE
  1457. LD b4, B, 3 * SIZE
  1458. LD b5, B, 4 * SIZE
  1459. LD b6, B, 8 * SIZE
  1460. LD b7, B, 12 * SIZE
  1461. move BO, B
  1462. bge $r0, L, .L65
  1463. #endif
  1464. .align 3
  /*
   * .L62: inner loop unrolled four times.  One pass issues 8 MADDs (steps
   * using a1/a3 go to c11/c21, steps using a2/a4 go to c31/c41), advances AO
   * by 4 elements and BO by 8, and reloads a1..a4 and b1..b4 for the next
   * pass.
   */
  1465. .L62:
  1466. MADD c11, b1, a1, c11
  1467. LD b1, BO, 4 * SIZE
  1468. MADD c21, b2, a1, c21
  1469. LD b2, BO, 5 * SIZE
  1470. MADD c31, b3, a2, c31
  1471. LD b3, BO, 6 * SIZE
  1472. MADD c41, b4, a2, c41
  1473. LD b4, BO, 7 * SIZE
  1474. LD a1, AO, 4 * SIZE
  1475. LD a2, AO, 5 * SIZE
  1476. MADD c11, b1, a3, c11
  1477. LD b1, BO, 8 * SIZE
  1478. MADD c21, b2, a3, c21
  1479. LD b2, BO, 9 * SIZE
  1480. MADD c31, b3, a4, c31
  1481. LD b3, BO, 10 * SIZE
  1482. MADD c41, b4, a4, c41
  1483. LD b4, BO, 11 * SIZE
  1484. LD a3, AO, 6 * SIZE
  1485. LD a4, AO, 7 * SIZE
  1486. addi.d L, L, -1
  1487. addi.d AO, AO, 4 * SIZE
  1488. addi.d BO, BO, 8 * SIZE
  1489. blt $r0, L, .L62
  1490. .align 3
  /* .L65: remainder count L = (K, or TEMP in TRMM builds) & 3; skip to .L68 if zero. */
  1491. .L65:
  1492. #ifndef TRMMKERNEL
  1493. andi L, K, 3
  1494. #else
  1495. andi L, TEMP, 3
  1496. #endif
  1497. bge $r0, L, .L68
  1498. .align 3
  /* .L66: one remainder step per pass: 2 MADDs into c11/c21, AO += 1 element, BO += 2 elements. */
  1499. .L66:
  1500. MADD c11, b1, a1, c11
  1501. LD b1, BO, 2 * SIZE
  1502. MADD c21, b2, a1, c21
  1503. LD b2, BO, 3 * SIZE
  1504. LD a1, AO, 1 * SIZE
  1505. addi.d L, L, -1
  1506. addi.d AO, AO, 1 * SIZE
  1507. addi.d BO, BO, 2 * SIZE
  1508. blt $r0, L, .L66
  /*
   * .L68: fold the partial sums (c11 += c31, c21 += c41) and write one result
   * through each of CO1 and CO2.  Non-TRMM: MADD(acc, ALPHA, old value);
   * TRMM: MUL(ALPHA, acc).
   * TRMM tail: in the configurations selected below TEMP = K - KK - 1 (LEFT)
   * or K - KK - 2; AO advances by TEMP elements and BO by 2*TEMP elements.
   * With LEFT, KK grows by 1.
   */
  1509. .L68:
  1510. #ifndef TRMMKERNEL
  1511. LD $f22, CO1, 0 * SIZE
  1512. LD $f8, CO2, 0 * SIZE
  1513. ADD c11, c11, c31
  1514. ADD c21, c21, c41
  1515. MADD c11, c11, ALPHA, $f22
  1516. MADD c21, c21, ALPHA, $f8
  1517. ST c11, CO1, 0 * SIZE
  1518. ST c21, CO2, 0 * SIZE
  1519. #else
  1520. ADD c11, c11, c31
  1521. ADD c21, c21, c41
  1522. MUL c11, ALPHA, c11
  1523. MUL c21, ALPHA, c21
  1524. ST c11, CO1, 0 * SIZE
  1525. ST c21, CO2, 0 * SIZE
  1526. #if ( defined(LEFT) && defined(TRANSA)) || \
  1527. (!defined(LEFT) && !defined(TRANSA))
  1528. sub.d TEMP, K, KK
  1529. #ifdef LEFT
  1530. addi.d TEMP, TEMP, -1
  1531. #else
  1532. addi.d TEMP, TEMP, -2
  1533. #endif
  1534. slli.d L, TEMP, 0 + BASE_SHIFT
  1535. slli.d TEMP, TEMP, 1 + BASE_SHIFT
  1536. add.d AO, AO, L
  1537. add.d BO, BO, TEMP
  1538. #endif
  1539. #ifdef LEFT
  1540. addi.d KK, KK, 1
  1541. #endif
  1542. #endif
  1543. .align 3
  /*
   * .L69: end of the N & 2 group.  TRMM without LEFT adds 2 to KK.  B is set
   * to BO (presumably so the next group continues where this one stopped
   * reading - confirm against the packing layout).
   */
  1544. .L69:
  1545. #if defined(TRMMKERNEL) && !defined(LEFT)
  1546. addi.d KK, KK, 2
  1547. #endif
  1548. move B, BO
  1549. .align 3
  /*
   * .L70: runs when bit 0 of N is set (J = N & 1); otherwise control goes to
   * .L999.  CO1 = C and C is advanced by LDC.  I = M >> 1 is the number of
   * passes of the .L71 loop; if I <= 0 the code jumps to .L80.
   */
  1550. .L70:
  1551. andi J, N, 1
  1552. move AO, A
  1553. bge $r0, J, .L999
  1554. move CO1, C
  1555. #if defined(TRMMKERNEL) && defined(LEFT)
  1556. move KK, OFFSET
  1557. #endif
  1558. srai.d I, M, 1
  1559. add.d C, CO1, LDC
  1560. bge $r0, I, .L80
  /*
   * .L71: per-pass setup.  c11 is loaded from $r0 via MTC and copied into
   * c21/c12/c22.  The loops below (.L72, .L76) reload a1, a2 and b1 themselves
   * on every step, so the a/b values preloaded here are overwritten before
   * use.  L = (trip count) >> 2.
   * TRMM builds: AO is offset by KK*2 elements and BO by KK elements (or BO is
   * simply B), and the trip count is TEMP (K-KK, KK+2 or KK+1).
   */
  1561. .L71:
  1562. #if defined(TRMMKERNEL)
  1563. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1564. move BO, B
  1565. #else
  1566. slli.d L, KK, 1 + BASE_SHIFT
  1567. slli.d TEMP, KK, 0 + BASE_SHIFT
  1568. add.d AO, AO, L
  1569. add.d BO, B, TEMP
  1570. #endif
  1571. LD a1, AO, 0 * SIZE
  1572. MTC c11, $r0
  1573. LD a2, AO, 1 * SIZE
  1574. MOV c21, c11
  1575. LD a5, AO, 4 * SIZE
  1576. LD b1, BO, 0 * SIZE
  1577. MOV c12, c11
  1578. LD b2, BO, 1 * SIZE
  1579. MOV c22, c11
  1580. LD b3, BO, 2 * SIZE
  1581. LD b5, BO, 4 * SIZE
  1582. LD b6, BO, 8 * SIZE
  1583. LD b7, BO, 12 * SIZE
  1584. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1585. sub.d TEMP, K, KK
  1586. #elif defined(LEFT)
  1587. addi.d TEMP, KK, 2
  1588. #else
  1589. addi.d TEMP, KK, 1
  1590. #endif
  1591. srai.d L, TEMP, 2
  1592. bge $r0, L, .L75
  1593. #else
  1594. LD a1, AO, 0 * SIZE
  1595. MTC c11, $r0
  1596. LD a2, AO, 1 * SIZE
  1597. MOV c21, c11
  1598. LD a5, AO, 4 * SIZE
  1599. LD b1, B, 0 * SIZE
  1600. MOV c12, c11
  1601. LD b2, B, 1 * SIZE
  1602. MOV c22, c11
  1603. LD b3, B, 2 * SIZE
  1604. LD b5, B, 4 * SIZE
  1605. srai.d L, K, 2
  1606. LD b6, B, 8 * SIZE
  1607. LD b7, B, 12 * SIZE
  1608. move BO, B
  1609. bge $r0, L, .L75
  1610. #endif
  1611. .align 3
  /*
   * .L72: inner loop unrolled four times.  Each step loads two AO elements and
   * one BO element, then issues 2 MADDs into c11/c12.  One pass advances AO by
   * 8 elements and BO by 4.
   */
  1612. .L72:
  1613. LD a1, AO, 0 * SIZE
  1614. LD a2, AO, 1 * SIZE
  1615. LD b1, BO, 0 * SIZE
  1616. MADD c11, b1, a1, c11
  1617. MADD c12, b1, a2, c12
  1618. LD a1, AO, 2 * SIZE
  1619. LD a2, AO, 3 * SIZE
  1620. LD b1, BO, 1 * SIZE
  1621. MADD c11, b1, a1, c11
  1622. MADD c12, b1, a2, c12
  1623. LD a1, AO, 4 * SIZE
  1624. LD a2, AO, 5 * SIZE
  1625. LD b1, BO, 2 * SIZE
  1626. MADD c11, b1, a1, c11
  1627. MADD c12, b1, a2, c12
  1628. LD a1, AO, 6 * SIZE
  1629. LD a2, AO, 7 * SIZE
  1630. LD b1, BO, 3 * SIZE
  1631. MADD c11, b1, a1, c11
  1632. MADD c12, b1, a2, c12
  1633. addi.d L, L, -1
  1634. addi.d AO, AO, 8 * SIZE
  1635. addi.d BO, BO, 4 * SIZE
  1636. blt $r0, L, .L72
  1637. .align 3
  /* .L75: remainder count L = (K, or TEMP in TRMM builds) & 3; skip to .L78 if zero. */
  1638. .L75:
  1639. #ifndef TRMMKERNEL
  1640. andi L, K, 3
  1641. #else
  1642. andi L, TEMP, 3
  1643. #endif
  1644. bge $r0, L, .L78
  1645. .align 3
  /* .L76: one remainder step per pass: 2 MADDs, AO += 2 elements, BO += 1 element. */
  1646. .L76:
  1647. LD a1, AO, 0 * SIZE
  1648. LD a2, AO, 1 * SIZE
  1649. LD b1, BO, 0 * SIZE
  1650. MADD c11, b1, a1, c11
  1651. MADD c12, b1, a2, c12
  1652. addi.d L, L, -1
  1653. addi.d AO, AO, 2 * SIZE
  1654. addi.d BO, BO, 1 * SIZE
  1655. blt $r0, L, .L76
  /*
   * .L78: add c21 into c11 and c22 into c12 (neither c21 nor c22 is written by
   * .L72/.L76, so they still hold the value copied from c11 at .L71), then
   * write two results through CO1 and loop to .L71 while I > 0.
   * Non-TRMM: MADD(acc, ALPHA, old value); TRMM: MUL(ALPHA, acc).
   * CO1 is bumped by 2 elements before the stores, hence offsets -2/-1.
   * TRMM tail: in the configurations selected below TEMP = K - KK - 2 (LEFT)
   * or K - KK - 1; AO advances by 2*TEMP elements and BO by TEMP elements.
   * With LEFT, KK grows by 2.
   */
  1656. .L78:
  1657. #ifndef TRMMKERNEL
  1658. LD $f22, CO1, 0 * SIZE
  1659. addi.d I, I, -1
  1660. LD $f8, CO1, 1 * SIZE
  1661. addi.d CO1,CO1, 2 * SIZE
  1662. ADD c11, c11, c21
  1663. ADD c12, c12, c22
  1664. MADD c11, c11, ALPHA, $f22
  1665. MADD c12, c12, ALPHA, $f8
  1666. ST c11, CO1, -2 * SIZE
  1667. ST c12, CO1, -1 * SIZE
  1668. blt $r0, I, .L71
  1669. #else
  1670. ADD c11, c11, c21
  1671. addi.d I, I, -1
  1672. ADD c12, c12, c22
  1673. addi.d CO1,CO1, 2 * SIZE
  1674. MUL c11, ALPHA, c11
  1675. MUL c12, ALPHA, c12
  1676. ST c11, CO1, -2 * SIZE
  1677. ST c12, CO1, -1 * SIZE
  1678. #if ( defined(LEFT) && defined(TRANSA)) || \
  1679. (!defined(LEFT) && !defined(TRANSA))
  1680. sub.d TEMP, K, KK
  1681. #ifdef LEFT
  1682. addi.d TEMP, TEMP, -2
  1683. #else
  1684. addi.d TEMP, TEMP, -1
  1685. #endif
  1686. slli.d L, TEMP, 1 + BASE_SHIFT
  1687. slli.d TEMP, TEMP, 0 + BASE_SHIFT
  1688. add.d AO, AO, L
  1689. add.d BO, BO, TEMP
  1690. #endif
  1691. #ifdef LEFT
  1692. addi.d KK, KK, 2
  1693. #endif
  1694. blt $r0, I, .L71
  1695. #endif
  1696. .align 3
  /*
   * .L80: tail of the N & 1 group for odd M (I = M & 1); jumps to .L89 when
   * M is even.  c11 is loaded from $r0 via MTC and copied into c21.  The loops
   * below (.L82, .L86) reload a1 and b1 themselves on every step, so the a/b
   * values preloaded here are overwritten before use.
   * TRMM builds: AO and BO are each offset by KK elements (or BO is simply B),
   * and the trip count is TEMP (K-KK or KK+1).
   * Unlike the other tails, no AO/BO/KK fix-up follows the store at .L88.
   */
  1697. .L80:
  1698. andi I, M, 1
  1699. bge $r0, I, .L89
  1700. #if defined(TRMMKERNEL)
  1701. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1702. move BO, B
  1703. #else
  1704. slli.d L, KK, 0 + BASE_SHIFT
  1705. slli.d TEMP, KK, 0 + BASE_SHIFT
  1706. add.d AO, AO, L
  1707. add.d BO, B, TEMP
  1708. #endif
  1709. LD a1, AO, 0 * SIZE
  1710. MTC c11, $r0
  1711. LD a2, AO, 1 * SIZE
  1712. MOV c21, c11
  1713. LD a3, AO, 2 * SIZE
  1714. LD a4, AO, 3 * SIZE
  1715. LD b1, BO, 0 * SIZE
  1716. LD b2, BO, 1 * SIZE
  1717. LD b3, BO, 2 * SIZE
  1718. LD b4, BO, 3 * SIZE
  1719. LD b5, BO, 4 * SIZE
  1720. LD b6, BO, 8 * SIZE
  1721. LD b7, BO, 12 * SIZE
  1722. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1723. sub.d TEMP, K, KK
  1724. #elif defined(LEFT)
  1725. addi.d TEMP, KK, 1
  1726. #else
  1727. addi.d TEMP, KK, 1
  1728. #endif
  1729. srai.d L, TEMP, 2
  1730. bge $r0, L, .L85
  1731. #else
  1732. LD a1, AO, 0 * SIZE
  1733. MTC c11, $r0
  1734. LD a2, AO, 1 * SIZE
  1735. MOV c21, c11
  1736. LD a3, AO, 2 * SIZE
  1737. LD a4, AO, 3 * SIZE
  1738. LD b1, B, 0 * SIZE
  1739. LD b2, B, 1 * SIZE
  1740. LD b3, B, 2 * SIZE
  1741. LD b4, B, 3 * SIZE
  1742. LD b5, B, 4 * SIZE
  1743. LD b6, B, 8 * SIZE
  1744. LD b7, B, 12 * SIZE
  1745. srai.d L, K, 2
  1746. move BO, B
  1747. bge $r0, L, .L85
  1748. #endif
  1749. .align 3
  /*
   * .L82: inner loop unrolled four times.  Each step loads one AO and one BO
   * element and issues one MADD, alternating between c11 and c21.  One pass
   * advances both AO and BO by 4 elements.
   */
  1750. .L82:
  1751. LD a1, AO, 0 * SIZE
  1752. LD b1, BO, 0 * SIZE
  1753. MADD c11, b1, a1, c11
  1754. LD a1, AO, 1 * SIZE
  1755. LD b1, BO, 1 * SIZE
  1756. MADD c21, b1, a1, c21
  1757. LD a1, AO, 2 * SIZE
  1758. LD b1, BO, 2 * SIZE
  1759. MADD c11, b1, a1, c11
  1760. LD a1, AO, 3 * SIZE
  1761. LD b1, BO, 3 * SIZE
  1762. MADD c21, b1, a1, c21
  1763. addi.d L, L, -1
  1764. addi.d AO, AO, 4 * SIZE
  1765. addi.d BO, BO, 4 * SIZE
  1766. blt $r0, L, .L82
  1767. .align 3
  /* .L85: remainder count L = (K, or TEMP in TRMM builds) & 3; skip to .L88 if zero. */
  1768. .L85:
  1769. #ifndef TRMMKERNEL
  1770. andi L, K, 3
  1771. #else
  1772. andi L, TEMP, 3
  1773. #endif
  1774. bge $r0, L, .L88
  1775. .align 3
  /* .L86: one remainder step per pass: 1 MADD into c11, AO += 1 element, BO += 1 element. */
  1776. .L86:
  1777. LD a1, AO, 0 * SIZE
  1778. LD b1, BO, 0 * SIZE
  1779. MADD c11, b1, a1, c11
  1780. addi.d L, L, -1
  1781. addi.d AO, AO, 1 * SIZE
  1782. addi.d BO, BO, 1 * SIZE
  1783. blt $r0, L, .L86
  /*
   * .L88: fold c21 into c11 and write the single result through CO1.
   * Non-TRMM: MADD(acc, ALPHA, old value); TRMM: MUL(ALPHA, acc).
   */
  1784. .L88:
  1785. #ifndef TRMMKERNEL
  1786. LD $f22, CO1, 0 * SIZE
  1787. ADD c11, c11, c21
  1788. MADD c11, c11, ALPHA, $f22
  1789. ST c11, CO1, 0 * SIZE
  1790. #else
  1791. ADD c11, c11, c21
  1792. MUL c11, ALPHA, c11
  1793. ST c11, CO1, 0 * SIZE
  1794. #endif
  1795. .align 3
  /* .L89: end of the N & 1 group.  TRMM without LEFT adds 1 to KK; B is set to BO. */
  1796. .L89:
  1797. #if defined(TRMMKERNEL) && !defined(LEFT)
  1798. addi.d KK, KK, 1
  1799. #endif
  1800. move B, BO
  1801. .align 3
  /*
   * .L999: common exit.  Reloads $r23..$r30 and $f24..$f28 from the 160-byte
   * stack frame (the matching stores are outside this chunk, presumably in the
   * prologue).  Note that $r30 lives at offset 96, after the $f24..$f28 slots
   * at 56..88.  TRMM builds also reload $r20 and $r16; builds without
   * __64BIT__ also reload $f18..$f21.
   */
  1802. .L999:
  1803. LDARG $r23, $sp, 0
  1804. LDARG $r24, $sp, 8
  1805. LDARG $r25, $sp, 16
  1806. LDARG $r26, $sp, 24
  1807. LDARG $r27, $sp, 32
  1808. LDARG $r28, $sp, 40
  1809. LDARG $r29, $sp, 48
  1810. LDARG $r30, $sp, 96
  1811. fld.d $f24, $sp, 56
  1812. fld.d $f25, $sp, 64
  1813. fld.d $f26, $sp, 72
  1814. fld.d $f27, $sp, 80
  1815. fld.d $f28, $sp, 88
  1816. #if defined(TRMMKERNEL)
  1817. LDARG $r20, $sp, 104
  1818. LDARG $r16, $sp, 112
  1819. #endif
  1820. #ifndef __64BIT__
  1821. fld.d $f18, $sp, 120
  1822. fld.d $f19, $sp, 128
  1823. fld.d $f20, $sp, 136
  1824. fld.d $f21, $sp, 144
  1825. #endif
  /*
   * Release the frame, copy $r17 into $r4 and $f22 into $f0 (presumably the
   * integer and floating-point return registers - confirm against the ABI),
   * then jump to the address in $r1 without linking, i.e. return.
   */
  1826. addi.d $sp, $sp, 160
  1827. move $r4, $r17
  1828. fmov.d $f0, $f22
  1829. jirl $r0, $r1, 0x0
  1830. EPILOGUE