You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemm_kernel_4x4.S 42 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
7277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER /* defined before common.h is included; NOTE(review): presumably selects the assembler-side definitions there - confirm */
  39. #include "common.h"
  40. #include "version.h"
  41. #if !defined(EV4) && !defined(EV5) && !defined(EV6) /* the build must select exactly which of EV4, EV5 or EV6 is targeted */
  42. #error "Architecture is not specified."
  43. #endif
  44. #ifdef EV6
  45. #define PREFETCHSIZE 56 /* offset, in units of SIZE, of the loads into $31 issued from AO and BO in the $L12 loop */
  46. #define UNOP unop /* on EV6 every UNOP slot in the $L12 loop emits a real unop */
  47. #endif
  48. #ifdef EV5
  49. #define PREFETCHSIZE 56 /* same offset as the EV6 setting */
  50. #define UNOP /* on EV5 the UNOP slots expand to nothing */
  51. #endif
  52. #ifdef EV4
  53. #define UNOP /* EV4 defines no PREFETCHSIZE: every use of it is guarded by #ifndef EV4 */
  54. #endif
  55. #define STACKSIZE 80 /* amount subtracted from $sp on entry; offsets 0..56 hold $f2-$f9 and offset 64 holds the $f19 argument */
  56. #define M $16 /* integer argument; tested against 0 on entry, and I is derived from it (M >> 2, M & 2, M & 1) */
  57. #define N $17 /* integer argument; tested against 0 on entry, J = N >> 2 */
  58. #define K $18 /* integer argument; tested against 0 on entry, seeds the inner counter L (K - 2) */
  59. #define A $20 /* copied into AO at the top of each $L01 pass */
  60. #define B $21 /* base used to load b1-b4 and to initialise BO */
  61. #define C $22 /* loaded from 0 + STACKSIZE($sp); advanced by 4 * LDC in each $L01 pass */
  62. #define LDC $23 /* loaded from 8 + STACKSIZE($sp), then rewritten by SXADDQ LDC, 0, LDC; NOTE(review): presumably scales it to bytes - confirm in common.h */
  63. #define C1 $19 /* C1 = C */
  64. #define C2 $24 /* C2 = C + LDC */
  65. #define C3 $25 /* C3 = C2 + LDC */
  66. #define C4 $27 /* C4 = C3 + LDC */
  67. #define AO $at /* running pointer derived from A; NOTE(review): $at is normally the assembler temporary, presumably PROLOGUE sets noat - confirm */
  68. #define BO $5 /* running pointer derived from B */
  69. #define I $6 /* counter derived from M; decremented in $L18 and tested by bgt I, $L11 */
  70. #define J $7 /* counter derived from N (N >> 2); tested by ble J, $L40 */
  71. #define L $8 /* inner-loop counter; decremented by 2 per pass of $L12 and $L22 */
  72. #define a1 $f16 /* a1-a4: operands loaded from AO */
  73. #define a2 $f17
  74. #define a3 $f18
  75. #define a4 $f19 /* $f19 is also the register stored to ALPHA in the prologue, before a4 is first loaded */
  76. #define b1 $f20 /* b1-b4: operands loaded from B or BO */
  77. #define b2 $f21
  78. #define b3 $f22
  79. #define b4 $f23
  80. #define t1 $f24 /* t1-t4: receive the MUL results that the following ADDs fold into c01-c16 */
  81. #define t2 $f25
  82. #define t3 $f26
  83. #define t4 $f27
  84. #define a5 $f28 /* extra operand register in $L12; reused after the loop for values loaded from C1/C2/C4 */
  85. #define a6 $f30 /* extra operand register in $L12; shares $f30 with alpha */
  86. #define b5 $f29 /* extra operand register in $L12 and $L22; reused after the loop for values loaded from C1/C2/C4 */
  87. #define alpha $f30 /* same register as a6; reloaded from ALPHA at $L15 and $L25, after the unrolled loops */
  88. #define c01 $f0 /* c01-c16: accumulators, cleared with fclr before each tile and scaled by alpha before the stores */
  89. #define c02 $f1
  90. #define c03 $f2 /* c03-c10 occupy $f2-$f9, the registers the prologue stores on the stack */
  91. #define c04 $f3
  92. #define c05 $f4
  93. #define c06 $f5
  94. #define c07 $f6
  95. #define c08 $f7
  96. #define c09 $f8
  97. #define c10 $f9
  98. #define c11 $f10
  99. #define c12 $f11
  100. #define c13 $f12
  101. #define c14 $f13
  102. #define c15 $f14
  103. #define c16 $f15
  104. #define TMP1 $0 /* scratch; also the raw $0 used for the argument check on entry */
  105. #define TMP2 $1 /* scratch; also the raw $1 used for the argument check on entry */
  106. #define KK $2 /* TRMMKERNEL only: initialised from OFFSET (negated when LEFT is not defined) */
  107. #define BB $3 /* set to B plus a multiple of K in $L01; base of the loads into $31 at $L11 on EV5/EV6 */
  108. #define OFFSET $4 /* TRMMKERNEL only: loaded from 16 + STACKSIZE($sp) */
  109. #define ALPHA 64($sp) /* stack slot written by stt $f19, ALPHA in the prologue and read back by ldt alpha, ALPHA */
  110. PROLOGUE
  111. PROFCODE
  112. .frame $sp, STACKSIZE, $26, 0
  113. lda $sp, -STACKSIZE($sp)
  114. ldq C, 0 + STACKSIZE($sp)
  115. ldq LDC, 8 + STACKSIZE($sp)
  116. #ifdef TRMMKERNEL
  117. ldq OFFSET, 16 + STACKSIZE($sp)
  118. #endif
  119. SXADDQ LDC, 0, LDC
  120. stt $f2, 0($sp)
  121. stt $f3, 8($sp)
  122. stt $f4, 16($sp)
  123. stt $f5, 24($sp)
  124. stt $f6, 32($sp)
  125. stt $f7, 40($sp)
  126. stt $f8, 48($sp)
  127. stt $f9, 56($sp)
  128. stt $f19, ALPHA
  129. cmple M, 0, $0
  130. cmple N, 0, $1
  131. cmple K, 0, $2
  132. or $0, $1, $0
  133. or $0, $2, $0
  134. bne $0, $L999
  135. #if defined(TRMMKERNEL) && !defined(LEFT)
  136. subq $31, OFFSET, KK
  137. #endif
  138. sra N, 2, J
  139. ble J, $L40
  140. .align 4
  141. $L01:
  142. mov C, C1
  143. addq C, LDC, C2
  144. mov A, AO
  145. s4addq K, 0, BB
  146. #if defined(TRMMKERNEL) && defined(LEFT)
  147. mov OFFSET, KK
  148. #endif
  149. addq C2, LDC, C3
  150. s4addq LDC, C, C
  151. SXADDQ BB, B, BB
  152. fclr t1
  153. addq C3, LDC, C4
  154. fclr t2
  155. sra M, 2, I
  156. fclr t3
  157. fclr t4
  158. ble I, $L20
  159. .align 4
  160. $L11:
  161. #if defined(EV5) || defined(EV6)
  162. ldl $31, 0 * SIZE(BB)
  163. ldl $31, 8 * SIZE(BB)
  164. unop
  165. lda BB, 16 * SIZE(BB)
  166. #endif
  167. #if !defined(TRMMKERNEL) || \
  168. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  169. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  170. #ifdef TRMMKERNEL
  171. #ifdef LEFT
  172. addq KK, 4, TMP1
  173. #else
  174. addq KK, 4, TMP1
  175. #endif
  176. #endif
  177. LD a1, 0 * SIZE(AO)
  178. fclr c11
  179. LD a2, 1 * SIZE(AO)
  180. fclr c12
  181. LD a3, 2 * SIZE(AO)
  182. fclr c16
  183. LD a4, 3 * SIZE(AO)
  184. fclr c15
  185. LD b1, 0 * SIZE(B)
  186. fclr c01
  187. LD b2, 1 * SIZE(B)
  188. fclr c02
  189. LD b3, 2 * SIZE(B)
  190. fclr c06
  191. LD b4, 3 * SIZE(B)
  192. fclr c05
  193. lds $f31, 4 * SIZE(C1)
  194. fclr c03
  195. #ifndef TRMMKERNEL
  196. lda L, -2(K)
  197. #else
  198. lda L, -2(TMP1)
  199. #endif
  200. fclr c04
  201. lds $f31, 7 * SIZE(C2)
  202. fclr c08
  203. lda BO, 4 * SIZE(B)
  204. fclr c13
  205. lds $f31, 4 * SIZE(C3)
  206. fclr c09
  207. lda AO, 4 * SIZE(AO)
  208. fclr c10
  209. #else
  210. sll KK, BASE_SHIFT + 2, TMP1
  211. addq AO, TMP1, AO
  212. addq B, TMP1, BO
  213. subq K, KK, TMP1
  214. LD a1, 0 * SIZE(AO)
  215. fclr c11
  216. LD a2, 1 * SIZE(AO)
  217. fclr c12
  218. LD a3, 2 * SIZE(AO)
  219. fclr c16
  220. LD a4, 3 * SIZE(AO)
  221. fclr c15
  222. LD b1, 0 * SIZE(BO)
  223. fclr c01
  224. LD b2, 1 * SIZE(BO)
  225. fclr c02
  226. LD b3, 2 * SIZE(BO)
  227. fclr c06
  228. LD b4, 3 * SIZE(BO)
  229. fclr c05
  230. lds $f31, 4 * SIZE(C1)
  231. fclr c03
  232. lda L, -2(TMP1)
  233. fclr c04
  234. lds $f31, 7 * SIZE(C2)
  235. fclr c08
  236. lda BO, 4 * SIZE(BO)
  237. fclr c13
  238. lds $f31, 4 * SIZE(C3)
  239. fclr c09
  240. lda AO, 4 * SIZE(AO)
  241. fclr c10
  242. #endif
  243. lds $f31, 7 * SIZE(C4)
  244. fclr c14
  245. fclr c07
  246. ble L, $L15
  247. .align 5
  248. $L12:
  249. /* 1 */
  250. ADD c11, t1, c11
  251. #ifndef EV4
  252. ldq $31, PREFETCHSIZE * SIZE(AO)
  253. #else
  254. unop
  255. #endif
  256. MUL b1, a1, t1
  257. #ifndef EV4
  258. ldl $31, PREFETCHSIZE * SIZE(BO)
  259. #else
  260. unop
  261. #endif
  262. ADD c12, t2, c12
  263. unop
  264. MUL b1, a2, t2
  265. unop
  266. ADD c16, t3, c16
  267. unop
  268. MUL b2, a2, t3
  269. LD a5, 0 * SIZE(AO)
  270. ADD c15, t4, c15
  271. unop
  272. MUL b2, a1, t4
  273. LD b5, 0 * SIZE(BO)
  274. /* 2 */
  275. ADD c01, t1, c01
  276. UNOP
  277. MUL b1, a3, t1
  278. UNOP
  279. ADD c02, t2, c02
  280. UNOP
  281. MUL b1, a4, t2
  282. UNOP
  283. ADD c06, t3, c06
  284. unop
  285. MUL b2, a4, t3
  286. unop
  287. ADD c05, t4, c05
  288. unop
  289. MUL b4, a1, t4
  290. unop
  291. /* 3 */
  292. ADD c03, t1, c03
  293. unop
  294. MUL b3, a1, t1
  295. unop
  296. ADD c04, t2, c04
  297. unop
  298. MUL b3, a2, t2
  299. unop
  300. ADD c08, t3, c08
  301. unop
  302. MUL b4, a2, t3
  303. LD a2, 1 * SIZE(AO)
  304. ADD c13, t4, c13
  305. unop
  306. MUL b2, a3, t4
  307. LD b2, 1 * SIZE(BO)
  308. /* 4 */
  309. ADD c09, t1, c09
  310. unop
  311. MUL b3, a3, t1
  312. LD a6, 2 * SIZE(AO)
  313. ADD c10, t2, c10
  314. unop
  315. MUL b3, a4, t2
  316. LD b3, 2 * SIZE(BO)
  317. ADD c14, t3, c14
  318. unop
  319. MUL b4, a4, t3
  320. LD a4, 3 * SIZE(AO)
  321. ADD c07, t4, c07
  322. unop
  323. MUL b4, a3, t4
  324. LD b4, 3 * SIZE(BO)
  325. /* 5 */
  326. ADD c11, t1, c11
  327. unop
  328. MUL b5, a5, t1
  329. LD a1, 4 * SIZE(AO)
  330. ADD c12, t2, c12
  331. lda L, -2(L)
  332. MUL b5, a2, t2
  333. LD b1, 4 * SIZE(BO)
  334. ADD c16, t3, c16
  335. unop
  336. MUL b2, a2, t3
  337. unop
  338. ADD c15, t4, c15
  339. unop
  340. MUL b2, a5, t4
  341. unop
  342. /* 6 */
  343. ADD c01, t1, c01
  344. unop
  345. MUL b5, a6, t1
  346. unop
  347. ADD c02, t2, c02
  348. unop
  349. MUL b5, a4, t2
  350. unop
  351. ADD c06, t3, c06
  352. unop
  353. MUL b2, a4, t3
  354. unop
  355. ADD c05, t4, c05
  356. unop
  357. MUL b4, a5, t4
  358. unop
  359. /* 7 */
  360. ADD c03, t1, c03
  361. lda AO, 8 * SIZE(AO)
  362. MUL b3, a5, t1
  363. unop
  364. ADD c04, t2, c04
  365. lda BO, 8 * SIZE(BO)
  366. MUL b3, a2, t2
  367. unop
  368. ADD c08, t3, c08
  369. unop
  370. MUL b4, a2, t3
  371. LD a2, -3 * SIZE(AO)
  372. ADD c13, t4, c13
  373. unop
  374. MUL b2, a6, t4
  375. LD b2, -3 * SIZE(BO)
  376. /* 8 */
  377. ADD c09, t1, c09
  378. unop
  379. MUL b3, a6, t1
  380. LD a3, -2 * SIZE(AO)
  381. ADD c10, t2, c10
  382. unop
  383. MUL b3, a4, t2
  384. LD b3, -2 * SIZE(BO)
  385. ADD c14, t3, c14
  386. unop
  387. MUL b4, a4, t3
  388. LD a4, -1 * SIZE(AO)
  389. ADD c07, t4, c07
  390. MUL b4, a6, t4
  391. LD b4, -1 * SIZE(BO)
  392. bgt L, $L12
  393. .align 4
  394. $L15:
  395. ADD c11, t1, c11
  396. ldt alpha, ALPHA
  397. MUL b1, a1, t1
  398. #ifndef TRMMKERNEL
  399. blbs K, $L18
  400. #else
  401. blbs TMP1, $L18
  402. #endif
  403. .align 4
  404. ADD c12, t2, c12
  405. MUL b1, a2, t2
  406. ADD c16, t3, c16
  407. MUL b2, a2, t3
  408. ADD c15, t4, c15
  409. MUL b2, a1, t4
  410. ADD c01, t1, c01
  411. MUL b1, a3, t1
  412. ADD c02, t2, c02
  413. unop
  414. MUL b1, a4, t2
  415. LD b1, 0 * SIZE(BO)
  416. ADD c06, t3, c06
  417. MUL b2, a4, t3
  418. ADD c05, t4, c05
  419. MUL b4, a1, t4
  420. ADD c03, t1, c03
  421. unop
  422. MUL b3, a1, t1
  423. LD a1, 0 * SIZE(AO)
  424. ADD c04, t2, c04
  425. unop
  426. MUL b3, a2, t2
  427. unop
  428. ADD c08, t3, c08
  429. unop
  430. MUL b4, a2, t3
  431. LD a2, 1 * SIZE(AO)
  432. ADD c13, t4, c13
  433. unop
  434. MUL b2, a3, t4
  435. LD b2, 1 * SIZE(BO)
  436. ADD c09, t1, c09
  437. unop
  438. MUL b3, a3, t1
  439. lda AO, 4 * SIZE(AO)
  440. ADD c10, t2, c10
  441. unop
  442. MUL b3, a4, t2
  443. LD b3, 2 * SIZE(BO)
  444. ADD c14, t3, c14
  445. unop
  446. MUL b4, a4, t3
  447. LD a4, -1 * SIZE(AO)
  448. ADD c07, t4, c07
  449. unop
  450. MUL b4, a3, t4
  451. LD a3, -2 * SIZE(AO)
  452. ADD c11, t1, c11
  453. LD b4, 3 * SIZE(BO)
  454. MUL b1, a1, t1
  455. lda BO, 4 * SIZE(BO)
  456. .align 4
  457. $L18:
  458. ADD c12, t2, c12
  459. unop
  460. MUL b1, a2, t2
  461. #ifndef TRMMKERNEL
  462. LD a5, 0 * SIZE(C1)
  463. #else
  464. unop
  465. #endif
  466. ADD c16, t3, c16
  467. unop
  468. MUL b2, a2, t3
  469. unop
  470. ADD c15, t4, c15
  471. unop
  472. MUL b2, a1, t4
  473. #ifndef TRMMKERNEL
  474. LD b5, 1 * SIZE(C1)
  475. #else
  476. unop
  477. #endif
  478. ADD c01, t1, c01
  479. unop
  480. MUL b1, a3, t1
  481. unop
  482. ADD c02, t2, c02
  483. unop
  484. MUL b1, a4, t2
  485. #ifndef TRMMKERNEL
  486. LD b1, 0 * SIZE(C2)
  487. #else
  488. unop
  489. #endif
  490. ADD c06, t3, c06
  491. unop
  492. MUL b2, a4, t3
  493. unop
  494. ADD c05, t4, c05
  495. unop
  496. MUL b4, a1, t4
  497. unop
  498. ADD c03, t1, c03
  499. unop
  500. MUL b3, a1, t1
  501. unop
  502. ADD c04, t2, c04
  503. unop
  504. MUL b3, a2, t2
  505. #ifndef TRMMKERNEL
  506. LD a1, 0 * SIZE(C3)
  507. #else
  508. unop
  509. #endif
  510. ADD c08, t3, c08
  511. unop
  512. MUL b4, a2, t3
  513. #ifndef TRMMKERNEL
  514. LD a2, 2 * SIZE(C1)
  515. #else
  516. unop
  517. #endif
  518. ADD c13, t4, c13
  519. unop
  520. MUL b2, a3, t4
  521. #ifndef TRMMKERNEL
  522. LD b2, 3 * SIZE(C1)
  523. #else
  524. unop
  525. #endif
  526. ADD c09, t1, c09
  527. lda I, -1(I)
  528. MUL b3, a3, t1
  529. unop
  530. ADD c10, t2, c10
  531. unop
  532. MUL b3, a4, t2
  533. #ifndef TRMMKERNEL
  534. LD b3, 0 * SIZE(C4)
  535. #else
  536. unop
  537. #endif
  538. ADD c14, t3, c14
  539. unop
  540. MUL b4, a4, t3
  541. #ifndef TRMMKERNEL
  542. LD a4, 1 * SIZE(C2)
  543. #else
  544. unop
  545. #endif
  546. ADD c07, t4, c07
  547. unop
  548. MUL b4, a3, t4
  549. #ifndef TRMMKERNEL
  550. LD a3, 2 * SIZE(C2)
  551. #else
  552. unop
  553. #endif
  554. ADD c11, t1, c11
  555. unop
  556. MUL alpha, c01, c01
  557. #ifndef TRMMKERNEL
  558. LD b4, 3 * SIZE(C2)
  559. #else
  560. unop
  561. #endif
  562. ADD c12, t2, c12
  563. unop
  564. MUL alpha, c02, c02
  565. #ifndef TRMMKERNEL
  566. LD t1, 1 * SIZE(C3)
  567. #else
  568. unop
  569. #endif
  570. ADD c16, t3, c16
  571. unop
  572. MUL alpha, c03, c03
  573. #ifndef TRMMKERNEL
  574. LD t2, 2 * SIZE(C3)
  575. #else
  576. unop
  577. #endif
  578. ADD c15, t4, c15
  579. unop
  580. MUL alpha, c04, c04
  581. #ifndef TRMMKERNEL
  582. LD t3, 3 * SIZE(C3)
  583. #else
  584. unop
  585. #endif
  586. MUL alpha, c05, c05
  587. unop
  588. #ifndef TRMMKERNEL
  589. ADD c01, a5, c01
  590. LD t4, 1 * SIZE(C4)
  591. #else
  592. unop
  593. unop
  594. #endif
  595. MUL alpha, c06, c06
  596. #ifndef TRMMKERNEL
  597. unop
  598. ADD c02, b5, c02
  599. LD a5, 2 * SIZE(C4)
  600. #endif
  601. MUL alpha, c07, c07
  602. #ifndef TRMMKERNEL
  603. unop
  604. ADD c03, a2, c03
  605. LD b5, 3 * SIZE(C4)
  606. #endif
  607. MUL alpha, c08, c08
  608. #ifndef TRMMKERNEL
  609. unop
  610. ADD c04, b2, c04
  611. unop
  612. #endif
  613. MUL alpha, c09, c09
  614. ST c01, 0 * SIZE(C1)
  615. #ifndef TRMMKERNEL
  616. ADD c05, b1, c05
  617. unop
  618. #endif
  619. MUL alpha, c10, c10
  620. ST c02, 1 * SIZE(C1)
  621. #ifndef TRMMKERNEL
  622. ADD c06, a4, c06
  623. unop
  624. #endif
  625. MUL alpha, c11, c11
  626. ST c03, 2 * SIZE(C1)
  627. #ifndef TRMMKERNEL
  628. ADD c07, a3, c07
  629. unop
  630. #endif
  631. MUL alpha, c12, c12
  632. ST c04, 3 * SIZE(C1)
  633. #ifndef TRMMKERNEL
  634. ADD c08, b4, c08
  635. #else
  636. unop
  637. #endif
  638. lda C1, 4 * SIZE(C1)
  639. MUL alpha, c13, c13
  640. ST c05, 0 * SIZE(C2)
  641. #ifndef TRMMKERNEL
  642. ADD c09, a1, c09
  643. unop
  644. #endif
  645. MUL alpha, c14, c14
  646. ST c06, 1 * SIZE(C2)
  647. #ifndef TRMMKERNEL
  648. ADD c10, t1, c10
  649. unop
  650. #endif
  651. MUL alpha, c15, c15
  652. ST c07, 2 * SIZE(C2)
  653. #ifndef TRMMKERNEL
  654. ADD c11, t2, c11
  655. unop
  656. #endif
  657. MUL alpha, c16, c16
  658. ST c08, 3 * SIZE(C2)
  659. #ifndef TRMMKERNEL
  660. ADD c12, t3, c12
  661. #else
  662. unop
  663. #endif
  664. lda C2, 4 * SIZE(C2)
  665. #ifndef TRMMKERNEL
  666. ADD c13, b3, c13
  667. #else
  668. unop
  669. #endif
  670. ST c09, 0 * SIZE(C3)
  671. fclr t1
  672. lda C4, 4 * SIZE(C4)
  673. #ifndef TRMMKERNEL
  674. ADD c14, t4, c14
  675. #else
  676. unop
  677. #endif
  678. ST c10, 1 * SIZE(C3)
  679. fclr t2
  680. unop
  681. #ifndef TRMMKERNEL
  682. ADD c15, a5, c15
  683. #else
  684. unop
  685. #endif
  686. ST c11, 2 * SIZE(C3)
  687. fclr t3
  688. unop
  689. #ifndef TRMMKERNEL
  690. ADD c16, b5, c16
  691. #else
  692. unop
  693. #endif
  694. ST c12, 3 * SIZE(C3)
  695. fclr t4
  696. lda C3, 4 * SIZE(C3)
  697. ST c13, -4 * SIZE(C4)
  698. ST c14, -3 * SIZE(C4)
  699. ST c15, -2 * SIZE(C4)
  700. ST c16, -1 * SIZE(C4)
  701. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  702. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  703. subq K, KK, TMP1
  704. #ifdef LEFT
  705. subq TMP1, 4, TMP1
  706. #else
  707. subq TMP1, 4, TMP1
  708. #endif
  709. sll TMP1, BASE_SHIFT + 2, TMP1
  710. addq AO, TMP1, AO
  711. addq BO, TMP1, BO
  712. #endif
  713. #if defined(TRMMKERNEL) && defined(LEFT)
  714. addq KK, 4, KK
  715. #endif
  716. bgt I, $L11
  717. .align 4
  718. $L20:
  719. and M, 2, I
  720. ble I, $L30
  721. #if !defined(TRMMKERNEL) || \
  722. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  723. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  724. #ifdef TRMMKERNEL
  725. #ifdef LEFT
  726. addq KK, 2, TMP1
  727. #else
  728. addq KK, 4, TMP1
  729. #endif
  730. #endif
  731. LD a1, 0 * SIZE(AO)
  732. fclr c09
  733. LD a2, 1 * SIZE(AO)
  734. fclr c13
  735. LD a3, 2 * SIZE(AO)
  736. fclr c10
  737. LD a4, 3 * SIZE(AO)
  738. fclr c14
  739. LD b1, 0 * SIZE(B)
  740. #ifndef TRMMKERNEL
  741. lda L, -2(K)
  742. #else
  743. lda L, -2(TMP1)
  744. #endif
  745. LD b2, 1 * SIZE(B)
  746. lda AO, 2 * SIZE(AO)
  747. LD b3, 2 * SIZE(B)
  748. fclr c01
  749. LD b4, 3 * SIZE(B)
  750. fclr c05
  751. lda BO, 4 * SIZE(B)
  752. fclr c02
  753. fclr c06
  754. ble L, $L25
  755. #else
  756. sll KK, BASE_SHIFT + 1, TMP1
  757. addq AO, TMP1, AO
  758. sll KK, BASE_SHIFT + 2, TMP2
  759. addq B, TMP2, BO
  760. subq K, KK, TMP1
  761. LD a1, 0 * SIZE(AO)
  762. fclr c09
  763. LD a2, 1 * SIZE(AO)
  764. fclr c13
  765. LD a3, 2 * SIZE(AO)
  766. fclr c10
  767. LD a4, 3 * SIZE(AO)
  768. fclr c14
  769. LD b1, 0 * SIZE(BO)
  770. lda L, -2(TMP1)
  771. LD b2, 1 * SIZE(BO)
  772. lda AO, 2 * SIZE(AO)
  773. LD b3, 2 * SIZE(BO)
  774. fclr c01
  775. LD b4, 3 * SIZE(BO)
  776. fclr c05
  777. lda BO, 4 * SIZE(BO)
  778. fclr c02
  779. fclr c06
  780. ble L, $L25
  781. #endif
  782. .align 4
  783. $L22:
  784. ADD c09, t1, c09
  785. unop
  786. MUL a1, b1, t1
  787. unop
  788. ADD c10, t2, c10
  789. unop
  790. MUL a2, b1, t2
  791. LD b1, 0 * SIZE(BO)
  792. ADD c13, t3, c13
  793. unop
  794. MUL a1, b2, t3
  795. lda BO, 8 * SIZE(BO)
  796. ADD c14, t4, c14
  797. unop
  798. MUL a2, b2, t4
  799. LD b2, -7 * SIZE(BO)
  800. ADD c01, t1, c01
  801. unop
  802. MUL a1, b3, t1
  803. unop
  804. ADD c02, t2, c02
  805. unop
  806. MUL a2, b3, t2
  807. LD b3, -6 * SIZE(BO)
  808. ADD c05, t3, c05
  809. unop
  810. MUL a1, b4, t3
  811. LD a1, 2 * SIZE(AO)
  812. ADD c06, t4, c06
  813. MUL a2, b4, t4
  814. LD b5, -5 * SIZE(BO)
  815. ADD c09, t1, c09
  816. unop
  817. MUL a3, b1, t1
  818. LD a2, 3 * SIZE(AO)
  819. ADD c10, t2, c10
  820. unop
  821. MUL a4, b1, t2
  822. LD b1, -4 * SIZE(BO)
  823. ADD c13, t3, c13
  824. unop
  825. MUL a3, b2, t3
  826. lda AO, 4 * SIZE(AO)
  827. ADD c14, t4, c14
  828. MUL a4, b2, t4
  829. LD b2, -3 * SIZE(BO)
  830. ADD c01, t1, c01
  831. lda L, -2(L)
  832. MUL a3, b3, t1
  833. LD b4, -1 * SIZE(BO)
  834. ADD c02, t2, c02
  835. unop
  836. MUL a4, b3, t2
  837. LD b3, -2 * SIZE(BO)
  838. ADD c05, t3, c05
  839. unop
  840. MUL a3, b5, t3
  841. LD a3, 0 * SIZE(AO)
  842. ADD c06, t4, c06
  843. MUL a4, b5, t4
  844. LD a4, 1 * SIZE(AO)
  845. bgt L, $L22
  846. .align 4
  847. $L25:
  848. ADD c09, t1, c09
  849. ldt alpha, ALPHA
  850. MUL a1, b1, t1
  851. #ifndef TRMMKERNEL
  852. blbs K, $L28
  853. #else
  854. blbs TMP1, $L28
  855. #endif
  856. ADD c10, t2, c10
  857. unop
  858. MUL a2, b1, t2
  859. LD b1, 0 * SIZE(BO)
  860. ADD c13, t3, c13
  861. unop
  862. MUL a1, b2, t3
  863. unop
  864. ADD c14, t4, c14
  865. unop
  866. MUL a2, b2, t4
  867. LD b2, 1 * SIZE(BO)
  868. ADD c01, t1, c01
  869. unop
  870. MUL a1, b3, t1
  871. lda AO, 2 * SIZE(AO)
  872. ADD c02, t2, c02
  873. unop
  874. MUL a2, b3, t2
  875. LD b3, 2 * SIZE(BO)
  876. ADD c05, t3, c05
  877. unop
  878. MUL a1, b4, t3
  879. LD a1, -2 * SIZE(AO)
  880. ADD c06, t4, c06
  881. unop
  882. MUL a2, b4, t4
  883. LD a2, -1 * SIZE(AO)
  884. ADD c09, t1, c09
  885. LD b4, 3 * SIZE(BO)
  886. MUL a1, b1, t1
  887. lda BO, 4 * SIZE(BO)
  888. .align 4
  889. $L28:
  /* Write-back for the tile accumulated above: fold the products still
     pending in t1..t4 into the accumulators, scale each accumulator by
     alpha and store two values to each of C1, C2, C3 and C4.
     Without TRMMKERNEL the previous C values are loaded (into a3, a4, a5,
     b5, b1..b4) in the slots that are plain unop otherwise, and are added
     after the alpha scaling. */
  890. ADD c10, t2, c10
  891. unop
  892. MUL a2, b1, t2
  893. #ifndef TRMMKERNEL
  894. LD a3, 0 * SIZE(C1)
  895. #else
  896. unop
  897. #endif
  898. ADD c13, t3, c13
  899. unop
  900. MUL a1, b2, t3
  901. #ifndef TRMMKERNEL
  902. LD a4, 1 * SIZE(C1)
  903. #else
  904. unop
  905. #endif
  906. ADD c14, t4, c14
  907. unop
  908. MUL a2, b2, t4
  909. #ifndef TRMMKERNEL
  910. LD a5, 0 * SIZE(C2)
  911. #else
  912. unop
  913. #endif
  914. ADD c01, t1, c01
  915. unop
  916. MUL a1, b3, t1
  917. #ifndef TRMMKERNEL
  918. LD b5, 1 * SIZE(C2)
  919. #else
  920. unop
  921. #endif
  922. ADD c02, t2, c02
  923. unop
  924. MUL a2, b3, t2
  925. #ifndef TRMMKERNEL
  926. LD b1, 0 * SIZE(C3)
  927. #else
  928. unop
  929. #endif
  930. ADD c05, t3, c05
  931. unop
  932. MUL a1, b4, t3
  933. #ifndef TRMMKERNEL
  934. LD b2, 1 * SIZE(C3)
  935. #else
  936. unop
  937. #endif
  938. ADD c06, t4, c06
  939. unop
  940. MUL a2, b4, t4
  941. #ifndef TRMMKERNEL
  942. LD b3, 0 * SIZE(C4)
  943. #else
  944. unop
  945. #endif
  946. ADD c09, t1, c09
  947. unop
  /* From here on the accumulators are complete one by one and are scaled
     by alpha as soon as their last partial product has been added. */
  948. MUL alpha, c01, c01
  949. #ifndef TRMMKERNEL
  950. LD b4, 1 * SIZE(C4)
  951. #else
  952. unop
  953. #endif
  954. ADD c10, t2, c10
  955. unop
  956. MUL alpha, c02, c02
  957. unop
  958. ADD c13, t3, c13
  959. MUL alpha, c05, c05
  960. ADD c14, t4, c14
  961. MUL alpha, c06, c06
  962. MUL alpha, c09, c09
  963. #ifndef TRMMKERNEL
  964. ADD c01, a3, c01
  965. #endif
  966. MUL alpha, c10, c10
  967. #ifndef TRMMKERNEL
  968. ADD c02, a4, c02
  969. #endif
  970. MUL alpha, c13, c13
  971. #ifndef TRMMKERNEL
  972. ADD c05, a5, c05
  973. #endif
  974. MUL alpha, c14, c14
  975. #ifndef TRMMKERNEL
  976. ADD c06, b5, c06
  977. #endif
  978. #ifndef TRMMKERNEL
  979. ADD c09, b1, c09
  980. unop
  981. #endif
  /* Stores are interleaved with clearing t1..t4 for the next tile. */
  982. ST c01, 0 * SIZE(C1)
  983. fclr t1
  984. #ifndef TRMMKERNEL
  985. ADD c10, b2, c10
  986. unop
  987. #endif
  988. ST c02, 1 * SIZE(C1)
  989. fclr t2
  990. #ifndef TRMMKERNEL
  991. ADD c13, b3, c13
  992. unop
  993. #endif
  994. ST c05, 0 * SIZE(C2)
  995. fclr t3
  996. #ifndef TRMMKERNEL
  997. ADD c14, b4, c14
  998. unop
  999. #endif
  1000. ST c06, 1 * SIZE(C2)
  1001. fclr t4
  /* Remaining stores, each paired with advancing one C pointer by two
     elements. */
  1002. ST c09, 0 * SIZE(C3)
  1003. lda C1, 2 * SIZE(C1)
  1004. ST c10, 1 * SIZE(C3)
  1005. lda C2, 2 * SIZE(C2)
  1006. ST c13, 0 * SIZE(C4)
  1007. lda C3, 2 * SIZE(C3)
  1008. ST c14, 1 * SIZE(C4)
  1009. lda C4, 2 * SIZE(C4)
  /* TRMM only, (LEFT && TRANSA) or (!LEFT && !TRANSA):
     TMP1 = K - KK - (2 if LEFT, else 4);
     AO += TMP1 << (BASE_SHIFT + 1); BO += TMP1 << (BASE_SHIFT + 2). */
  1010. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1011. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1012. subq K, KK, TMP1
  1013. #ifdef LEFT
  1014. subq TMP1, 2, TMP1
  1015. #else
  1016. subq TMP1, 4, TMP1
  1017. #endif
  1018. sll TMP1, BASE_SHIFT + 1, TMP2
  1019. addq AO, TMP2, AO
  1020. sll TMP1, BASE_SHIFT + 2, TMP2
  1021. addq BO, TMP2, BO
  1022. #endif
  /* TRMM with LEFT: KK advances by the two rows just written. */
  1023. #if defined(TRMMKERNEL) && defined(LEFT)
  1024. addq KK, 2, KK
  1025. #endif
  1026. .align 4
  1027. $L30:
  /* Set-up for the case where bit 0 of M is set (one value per C column,
     four columns); otherwise skip to $L39.
     Preloads a1, a2 from AO and b1..b4 from the B side, clears the four
     accumulators c01, c05, c09, c13 and sets L to the depth minus 2.
     If L <= 0 the main loop $L32 is skipped and control goes to $L35. */
  1028. and M, 1, I
  1029. ble I, $L39
  1030. #if !defined(TRMMKERNEL) || \
  1031. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1032. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  /* First variant: B is read from its start. For TRMM the depth is
     TMP1 = KK + 1 (LEFT) or KK + 4 (otherwise) instead of K. */
  1033. #ifdef TRMMKERNEL
  1034. #ifdef LEFT
  1035. addq KK, 1, TMP1
  1036. #else
  1037. addq KK, 4, TMP1
  1038. #endif
  1039. #endif
  1040. LD a1, 0 * SIZE(AO)
  1041. fclr c01
  1042. LD a2, 1 * SIZE(AO)
  1043. fclr c05
  1044. LD b1, 0 * SIZE(B)
  1045. #ifndef TRMMKERNEL
  1046. lda L, -2(K)
  1047. #else
  1048. lda L, -2(TMP1)
  1049. #endif
  1050. LD b2, 1 * SIZE(B)
  1051. lda AO, 1 * SIZE(AO)
  1052. LD b3, 2 * SIZE(B)
  1053. fclr c09
  1054. LD b4, 3 * SIZE(B)
  1055. fclr c13
  1056. lda BO, 4 * SIZE(B)
  1057. ble L, $L35
  1058. #else
  /* Second variant (remaining TRMM cases): start KK steps into both
     operands, AO += KK << BASE_SHIFT and BO = B + (KK << (BASE_SHIFT + 2)),
     and use TMP1 = K - KK as the depth. */
  1059. sll KK, BASE_SHIFT + 0, TMP1
  1060. addq AO, TMP1, AO
  1061. sll KK, BASE_SHIFT + 2, TMP2
  1062. addq B, TMP2, BO
  1063. subq K, KK, TMP1
  1064. LD a1, 0 * SIZE(AO)
  1065. fclr c01
  1066. LD a2, 1 * SIZE(AO)
  1067. fclr c05
  1068. LD b1, 0 * SIZE(BO)
  1069. lda L, -2(TMP1)
  1070. LD b2, 1 * SIZE(BO)
  1071. lda AO, 1 * SIZE(AO)
  1072. LD b3, 2 * SIZE(BO)
  1073. fclr c09
  1074. LD b4, 3 * SIZE(BO)
  1075. fclr c13
  1076. lda BO, 4 * SIZE(BO)
  1077. ble L, $L35
  1078. #endif
  1079. .align 4
  1080. $L32:
  /* Main loop, two depth steps per pass (L -= 2, AO += 2 elements,
     BO += 8 elements). The first step multiplies a1 by b1..b4, the second
     multiplies a2 by b1, b2, b3 and b5. Each ADD folds in the product
     computed one step earlier, so a product is always pending in t1..t4
     when the loop exits. */
  1081. ADD c01, t1, c01
  1082. lda L, -2(L)
  1083. MUL a1, b1, t1
  1084. LD b1, 0 * SIZE(BO)
  1085. ADD c05, t2, c05
  1086. lda AO, 2 * SIZE(AO)
  1087. MUL a1, b2, t2
  1088. LD b2, 1 * SIZE(BO)
  1089. ADD c09, t3, c09
  /* b5 carries the fourth B value of the second step; b4 is still needed
     by the first step's last multiply below. */
  1090. LD b5, 3 * SIZE(BO)
  1091. MUL a1, b3, t3
  1092. LD b3, 2 * SIZE(BO)
  1093. ADD c13, t4, c13
  1094. MUL a1, b4, t4
  1095. LD a1, -1 * SIZE(AO)
  1096. ADD c01, t1, c01
  1097. MUL a2, b1, t1
  1098. LD b1, 4 * SIZE(BO)
  /* BO moves here, so the remaining B loads use negative offsets. */
  1099. lda BO, 8 * SIZE(BO)
  1100. ADD c05, t2, c05
  1101. MUL a2, b2, t2
  1102. LD b2, -3 * SIZE(BO)
  1103. ADD c09, t3, c09
  1104. LD b4, -1 * SIZE(BO)
  1105. MUL a2, b3, t3
  1106. LD b3, -2 * SIZE(BO)
  1107. ADD c13, t4, c13
  1108. MUL a2, b5, t4
  1109. LD a2, 0 * SIZE(AO)
  1110. bgt L, $L32
  1111. .align 4
  1112. $L35:
  /* Loop tail: load alpha from ALPHA and start the next a1 * b1 product.
     If the low bit of the depth (K, or TMP1 for TRMM) is set, jump straight
     to the write-back at $L38; otherwise run one more depth step here
     (AO += 1 element, BO += 4 elements) and fall through to $L38. */
  1113. ADD c01, t1, c01
  1114. ldt alpha, ALPHA
  1115. MUL a1, b1, t1
  1116. #ifndef TRMMKERNEL
  1117. blbs K, $L38
  1118. #else
  1119. blbs TMP1, $L38
  1120. #endif
  1121. .align 4
  1122. ADD c05, t2, c05
  1123. LD b1, 0 * SIZE(BO)
  1124. MUL a1, b2, t2
  1125. LD b2, 1 * SIZE(BO)
  1126. ADD c09, t3, c09
  1127. MUL a1, b3, t3
  1128. LD b3, 2 * SIZE(BO)
  1129. ADD c13, t4, c13
  1130. MUL a1, b4, t4
  1131. LD a1, 0 * SIZE(AO)
  1132. lda AO, 1 * SIZE(AO)
  1133. ADD c01, t1, c01
  1134. LD b4, 3 * SIZE(BO)
  1135. MUL a1, b1, t1
  1136. lda BO, 4 * SIZE(BO)
  1137. .align 4
  1138. $L38:
  /* Write-back: finish the last depth step, scale c01, c05, c09, c13 by
     alpha and store one value to each of C1..C4.
     Without TRMMKERNEL the old C values are loaded into a5, b5, a2, a3
     and added after scaling. */
  1139. ADD c05, t2, c05
  1140. unop
  1141. MUL a1, b2, t2
  1142. #ifndef TRMMKERNEL
  1143. LD a5, 0 * SIZE(C1)
  1144. #else
  1145. unop
  1146. #endif
  1147. ADD c09, t3, c09
  1148. unop
  1149. MUL a1, b3, t3
  1150. #ifndef TRMMKERNEL
  1151. LD b5, 0 * SIZE(C2)
  1152. #else
  1153. unop
  1154. #endif
  1155. ADD c13, t4, c13
  1156. unop
  1157. MUL a1, b4, t4
  1158. #ifndef TRMMKERNEL
  1159. LD a2, 0 * SIZE(C3)
  1160. #else
  1161. unop
  1162. #endif
  1163. ADD c01, t1, c01
  1164. unop
  1165. MUL alpha, c01, c01
  1166. #ifndef TRMMKERNEL
  1167. LD a3, 0 * SIZE(C4)
  1168. #else
  1169. unop
  1170. #endif
  1171. ADD c05, t2, c05
  1172. unop
  1173. MUL alpha, c05, c05
  1174. unop
  1175. ADD c09, t3, c09
  1176. MUL alpha, c09, c09
  1177. ADD c13, t4, c13
  1178. MUL alpha, c13, c13
  1179. #ifndef TRMMKERNEL
  1180. ADD c01, a5, c01
  1181. ADD c05, b5, c05
  1182. ADD c09, a2, c09
  1183. ADD c13, a3, c13
  1184. #endif
  1185. ST c01, 0 * SIZE(C1)
  1186. ST c05, 0 * SIZE(C2)
  1187. ST c09, 0 * SIZE(C3)
  1188. ST c13, 0 * SIZE(C4)
  /* TRMM only, (LEFT && TRANSA) or (!LEFT && !TRANSA):
     TMP1 = K - KK - (1 if LEFT, else 4);
     AO += TMP1 << BASE_SHIFT; BO += TMP1 << (BASE_SHIFT + 2). */
  1189. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1190. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1191. subq K, KK, TMP1
  1192. #ifdef LEFT
  1193. subq TMP1, 1, TMP1
  1194. #else
  1195. subq TMP1, 4, TMP1
  1196. #endif
  1197. sll TMP1, BASE_SHIFT + 0, TMP2
  1198. addq AO, TMP2, AO
  1199. sll TMP1, BASE_SHIFT + 2, TMP2
  1200. addq BO, TMP2, BO
  1201. #endif
  1202. #if defined(TRMMKERNEL) && defined(LEFT)
  1203. addq KK, 1, KK
  1204. #endif
  1205. .align 4
  1206. $L39:
  /* End of one pass of the J loop: B takes over the current BO position,
     J is decremented, and for TRMM without LEFT KK advances by 4.
     Loops back to $L01 while J > 0. */
  1207. mov BO, B
  1208. lda J, -1(J)
  1209. #if defined(TRMMKERNEL) && !defined(LEFT)
  1210. addq KK, 4, KK
  1211. #else
  1212. unop
  1213. #endif
  1214. bgt J, $L01
  1215. .align 4
  1216. $L40:
  /* Two-column pass, taken when bit 1 of N is set; otherwise skip to $L80.
     C1 = C, C2 = C + LDC, and C itself is moved past both columns.
     AO restarts at A, t1..t4 are cleared, and I = M >> 2 counts the
     four-row tiles; with no such tile control goes to $L60. */
  1217. and N, 2, J
  1218. ble J, $L80
  1219. mov C, C1
  1220. addq C, LDC, C2
  1221. mov A, AO
  1222. fclr t1
  1223. addq C2, LDC, C
  1224. fclr t2
  /* TRMM with LEFT: KK restarts from OFFSET for this pass. */
  1225. #if defined(TRMMKERNEL) && defined(LEFT)
  1226. mov OFFSET, KK
  1227. #endif
  1228. sra M, 2, I
  1229. fclr t3
  1230. fclr t4
  1231. ble I, $L60
  1232. .align 4
  1233. $L51:
  /* Set-up for one tile of four rows by two columns: preload a1..a4 from
     AO and b1..b4 from the B side, clear the eight accumulators c01..c08,
     set L to the depth minus 2, then step BO by 2 and AO by 4 elements.
     If L <= 0 the main loop $L52 is skipped and control goes to $L55. */
  1234. #if !defined(TRMMKERNEL) || \
  1235. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1236. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  /* First variant: B is read from its start. For TRMM the depth is
     TMP1 = KK + 4 (LEFT) or KK + 2 (otherwise) instead of K. */
  1237. #ifdef TRMMKERNEL
  1238. #ifdef LEFT
  1239. addq KK, 4, TMP1
  1240. #else
  1241. addq KK, 2, TMP1
  1242. #endif
  1243. #endif
  1244. LD a1, 0 * SIZE(AO)
  1245. fclr c03
  1246. LD a2, 1 * SIZE(AO)
  1247. fclr c07
  1248. LD a3, 2 * SIZE(AO)
  1249. fclr c04
  1250. LD a4, 3 * SIZE(AO)
  1251. fclr c08
  1252. LD b1, 0 * SIZE(B)
  1253. fclr c01
  1254. LD b2, 1 * SIZE(B)
  1255. fclr c05
  1256. LD b3, 2 * SIZE(B)
  1257. fclr c02
  1258. LD b4, 3 * SIZE(B)
  1259. fclr c06
  1260. #ifndef TRMMKERNEL
  1261. lda L, -2(K)
  1262. #else
  1263. lda L, -2(TMP1)
  1264. #endif
  1265. lda BO, 2 * SIZE(B)
  1266. lda AO, 4 * SIZE(AO)
  1267. ble L, $L55
  1268. #else
  /* Second variant (remaining TRMM cases): start KK steps into both
     operands, AO += KK << (BASE_SHIFT + 2) and
     BO = B + (KK << (BASE_SHIFT + 1)), with depth TMP1 = K - KK. */
  1269. sll KK, BASE_SHIFT + 2, TMP1
  1270. addq AO, TMP1, AO
  1271. sll KK, BASE_SHIFT + 1, TMP2
  1272. addq B, TMP2, BO
  1273. subq K, KK, TMP1
  1274. LD a1, 0 * SIZE(AO)
  1275. fclr c03
  1276. LD a2, 1 * SIZE(AO)
  1277. fclr c07
  1278. LD a3, 2 * SIZE(AO)
  1279. fclr c04
  1280. LD a4, 3 * SIZE(AO)
  1281. fclr c08
  1282. LD b1, 0 * SIZE(BO)
  1283. fclr c01
  1284. LD b2, 1 * SIZE(BO)
  1285. fclr c05
  1286. LD b3, 2 * SIZE(BO)
  1287. fclr c02
  1288. LD b4, 3 * SIZE(BO)
  1289. fclr c06
  1290. lda L, -2(TMP1)
  1291. lda BO, 2 * SIZE(BO)
  1292. lda AO, 4 * SIZE(AO)
  1293. ble L, $L55
  1294. #endif
  1295. .align 4
  1296. $L52:
  /* Main loop, two depth steps per pass (L -= 2, BO += 4 elements,
     AO += 8 elements). Products with b1 and b3 accumulate into c01..c04,
     products with b2 and b4 into c05..c08. Each ADD folds in the product
     computed one group earlier, so products are still pending in t1..t4
     when the loop exits. */
  1297. ADD c05, t1, c05
  1298. unop
  1299. MUL a1, b1, t1
  1300. unop
  1301. ADD c06, t2, c06
  1302. lda L, -2(L)
  1303. MUL a2, b1, t2
  1304. unop
  1305. ADD c07, t3, c07
  1306. unop
  1307. MUL a3, b1, t3
  1308. unop
  1309. ADD c08, t4, c08
  1310. unop
  1311. MUL a4, b1, t4
  1312. LD b1, 2 * SIZE(BO)
  1313. ADD c01, t1, c01
  1314. unop
  1315. MUL a1, b2, t1
  1316. LD a1, 0 * SIZE(AO)
  1317. ADD c02, t2, c02
  1318. lda BO, 4 * SIZE(BO)
  1319. MUL a2, b2, t2
  1320. LD a2, 1 * SIZE(AO)
  1321. ADD c03, t3, c03
  1322. unop
  1323. MUL a3, b2, t3
  1324. LD a3, 2 * SIZE(AO)
  1325. ADD c04, t4, c04
  1326. unop
  1327. MUL a4, b2, t4
  /* a5 carries the fourth A value of the second step, while a4 is reloaded
     further down for the next pass. */
  1328. LD a5, 3 * SIZE(AO)
  1329. ADD c05, t1, c05
  1330. unop
  1331. MUL a1, b3, t1
  1332. LD b2, -1 * SIZE(BO)
  1333. ADD c06, t2, c06
  1334. unop
  1335. MUL a2, b3, t2
  1336. unop
  1337. ADD c07, t3, c07
  1338. unop
  1339. MUL a3, b3, t3
  /* AO moves here, so the following A loads use negative offsets. */
  1340. lda AO, 8 * SIZE(AO)
  1341. ADD c08, t4, c08
  1342. unop
  1343. MUL a5, b3, t4
  1344. LD b3, 0 * SIZE(BO)
  1345. ADD c01, t1, c01
  1346. unop
  1347. MUL a1, b4, t1
  1348. LD a1, -4 * SIZE(AO)
  1349. ADD c02, t2, c02
  1350. unop
  1351. MUL a2, b4, t2
  1352. LD a2, -3 * SIZE(AO)
  1353. ADD c03, t3, c03
  1354. LD a4, -1 * SIZE(AO)
  1355. MUL a3, b4, t3
  1356. LD a3, -2 * SIZE(AO)
  1357. ADD c04, t4, c04
  1358. MUL a5, b4, t4
  1359. LD b4, 1 * SIZE(BO)
  1360. bgt L, $L52
  1361. .align 4
  1362. $L55:
  /* Loop tail: load alpha from ALPHA and start the next a1 * b1 product.
     If the low bit of the depth (K, or TMP1 for TRMM) is set, jump straight
     to the write-back at $L58; otherwise run one more depth step here
     (AO += 4 elements, BO += 2 elements) and fall through to $L58. */
  1363. ADD c05, t1, c05
  1364. ldt alpha, ALPHA
  1365. MUL a1, b1, t1
  1366. #ifndef TRMMKERNEL
  1367. blbs K, $L58
  1368. #else
  1369. blbs TMP1, $L58
  1370. #endif
  1371. .align 4
  1372. ADD c06, t2, c06
  1373. MUL a2, b1, t2
  1374. ADD c07, t3, c07
  1375. MUL a3, b1, t3
  1376. ADD c08, t4, c08
  1377. unop
  1378. MUL a4, b1, t4
  1379. LD b1, 0 * SIZE(BO)
  1380. ADD c01, t1, c01
  1381. unop
  1382. MUL a1, b2, t1
  1383. LD a1, 0 * SIZE(AO)
  1384. ADD c02, t2, c02
  1385. unop
  1386. MUL a2, b2, t2
  1387. LD a2, 1 * SIZE(AO)
  1388. ADD c03, t3, c03
  1389. unop
  1390. MUL a3, b2, t3
  1391. LD a3, 2 * SIZE(AO)
  1392. ADD c04, t4, c04
  1393. MUL a4, b2, t4
  1394. LD a4, 3 * SIZE(AO)
  1395. lda AO, 4 * SIZE(AO)
  1396. ADD c05, t1, c05
  1397. LD b2, 1 * SIZE(BO)
  1398. MUL a1, b1, t1
  1399. lda BO, 2 * SIZE(BO)
  1400. .align 4
  1401. $L58:
  /* Write-back for the four-row, two-column tile: finish the last depth
     step, scale c01..c08 by alpha and store four values to each of C1 and
     C2. Without TRMMKERNEL the old C values are loaded into c09..c16 (free
     in this pass) and added after scaling.
     The leading instructions come in groups of four (ADD, unop, MUL, and a
     fourth slot holding the C load or, for TRMM, a unop filler). */
  1402. ADD c06, t2, c06
  1403. unop
  1404. MUL a2, b1, t2
  1405. #ifndef TRMMKERNEL
  1406. LD c09, 0 * SIZE(C1)
  1407. #else
  1408. unop
  1409. #endif
  1410. ADD c07, t3, c07
  1411. unop
  1412. MUL a3, b1, t3
  1413. #ifndef TRMMKERNEL
  1414. LD c10, 1 * SIZE(C1)
  1415. #else
  1416. unop
  1417. #endif
  1418. ADD c08, t4, c08
  1419. unop
  1420. MUL a4, b1, t4
  1421. #ifndef TRMMKERNEL
  1422. LD c11, 2 * SIZE(C1)
  1423. #else
  1424. unop
  1425. #endif
  1426. ADD c01, t1, c01
  1427. unop
  1428. MUL a1, b2, t1
  1429. #ifndef TRMMKERNEL
  1430. LD c12, 3 * SIZE(C1)
  1431. #else
  1432. unop
  1433. #endif
  1434. ADD c02, t2, c02
  1435. unop
  1436. MUL a2, b2, t2
  /* The filler unop belongs to the TRMM arm only, as in every other group
     of this block. It used to sit after the LD with no #else, which gave
     the plain GEMM build a fifth instruction in this group and the TRMM
     build only three. unop has no architectural effect, so results are
     unchanged; only the instruction grouping is restored. */
  1437. #ifndef TRMMKERNEL
  1438. LD c13, 0 * SIZE(C2)
  #else
  1439. unop
  1440. #endif
  1441. ADD c03, t3, c03
  1442. unop
  1443. MUL a3, b2, t3
  1444. #ifndef TRMMKERNEL
  1445. LD c14, 1 * SIZE(C2)
  1446. #else
  1447. unop
  1448. #endif
  1449. ADD c04, t4, c04
  1450. unop
  1451. MUL a4, b2, t4
  1452. #ifndef TRMMKERNEL
  1453. LD c15, 2 * SIZE(C2)
  1454. #else
  1455. unop
  1456. #endif
  1457. ADD c05, t1, c05
  1458. unop
  1459. MUL alpha, c01, c01
  1460. #ifndef TRMMKERNEL
  1461. LD c16, 3 * SIZE(C2)
  1462. #else
  1463. unop
  1464. #endif
  1465. ADD c06, t2, c06
  /* One four-row tile is done: count it down; tested by the bgt at the end. */
  1466. lda I, -1(I)
  1467. MUL alpha, c02, c02
  1468. unop
  1469. ADD c07, t3, c07
  1470. MUL alpha, c03, c03
  1471. ADD c08, t4, c08
  1472. MUL alpha, c04, c04
  1473. MUL alpha, c05, c05
  1474. #ifndef TRMMKERNEL
  1475. ADD c01, c09, c01
  1476. #endif
  1477. MUL alpha, c06, c06
  1478. #ifndef TRMMKERNEL
  1479. ADD c02, c10, c02
  1480. #endif
  1481. MUL alpha, c07, c07
  1482. #ifndef TRMMKERNEL
  1483. ADD c03, c11, c03
  1484. #endif
  1485. MUL alpha, c08, c08
  1486. #ifndef TRMMKERNEL
  1487. ADD c04, c12, c04
  1488. #endif
  1489. #ifndef TRMMKERNEL
  1490. ADD c05, c13, c05
  1491. #endif
  1492. ST c01, 0 * SIZE(C1)
  1493. #ifndef TRMMKERNEL
  1494. ADD c06, c14, c06
  1495. #endif
  1496. ST c02, 1 * SIZE(C1)
  1497. #ifndef TRMMKERNEL
  1498. ADD c07, c15, c07
  1499. #endif
  1500. ST c03, 2 * SIZE(C1)
  1501. #ifndef TRMMKERNEL
  1502. ADD c08, c16, c08
  1503. #endif
  1504. ST c04, 3 * SIZE(C1)
  /* Second-column stores are interleaved with clearing t1..t4 for the
     next tile. */
  1505. ST c05, 0 * SIZE(C2)
  1506. fclr t1
  1507. ST c06, 1 * SIZE(C2)
  1508. fclr t2
  1509. ST c07, 2 * SIZE(C2)
  1510. fclr t3
  1511. ST c08, 3 * SIZE(C2)
  1512. fclr t4
  1513. lda C1, 4 * SIZE(C1)
  1514. lda C2, 4 * SIZE(C2)
  /* TRMM only, (LEFT && TRANSA) or (!LEFT && !TRANSA):
     TMP1 = K - KK - (4 if LEFT, else 2);
     AO += TMP1 << (BASE_SHIFT + 2); BO += TMP1 << (BASE_SHIFT + 1). */
  1515. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1516. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1517. subq K, KK, TMP1
  1518. #ifdef LEFT
  1519. subq TMP1, 4, TMP1
  1520. #else
  1521. subq TMP1, 2, TMP1
  1522. #endif
  1523. sll TMP1, BASE_SHIFT + 2, TMP2
  1524. addq AO, TMP2, AO
  1525. sll TMP1, BASE_SHIFT + 1, TMP2
  1526. addq BO, TMP2, BO
  1527. #endif
  1528. #if defined(TRMMKERNEL) && defined(LEFT)
  1529. addq KK, 4, KK
  1530. #endif
  1531. bgt I, $L51
  1532. .align 4
  1533. $L60:
  /* Set-up for a two-row, two-column tile, taken when bit 1 of M is set;
     otherwise skip to $L70. Preloads a1..a4 from AO and b1..b4 from the
     B side, clears c01, c02, c05, c06 and sets L to the depth minus 2,
     then steps AO and BO by 2 elements each. If L <= 0 the main loop $L62
     is skipped and control goes to $L65. */
  1534. and M, 2, I
  1535. ble I, $L70
  1536. #if !defined(TRMMKERNEL) || \
  1537. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1538. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  /* First variant: B is read from its start. For TRMM the depth is
     TMP1 = KK + 2; both arms of the LEFT test are identical here. */
  1539. #ifdef TRMMKERNEL
  1540. #ifdef LEFT
  1541. addq KK, 2, TMP1
  1542. #else
  1543. addq KK, 2, TMP1
  1544. #endif
  1545. #endif
  1546. LD a1, 0 * SIZE(AO)
  1547. fclr c01
  1548. LD a2, 1 * SIZE(AO)
  1549. fclr c05
  1550. LD a3, 2 * SIZE(AO)
  1551. fclr c02
  1552. LD a4, 3 * SIZE(AO)
  1553. fclr c06
  1554. LD b1, 0 * SIZE(B)
  1555. #ifndef TRMMKERNEL
  1556. lda L, -2(K)
  1557. #else
  1558. lda L, -2(TMP1)
  1559. #endif
  1560. LD b2, 1 * SIZE(B)
  1561. lda AO, 2 * SIZE(AO)
  1562. LD b3, 2 * SIZE(B)
  1563. LD b4, 3 * SIZE(B)
  1564. lda BO, 2 * SIZE(B)
  1565. ble L, $L65
  1566. #else
  /* Second variant (remaining TRMM cases): start KK steps into both
     operands (both offsets are KK << (BASE_SHIFT + 1)), with depth
     TMP1 = K - KK. */
  1567. sll KK, BASE_SHIFT + 1, TMP1
  1568. addq AO, TMP1, AO
  1569. sll KK, BASE_SHIFT + 1, TMP2
  1570. addq B, TMP2, BO
  1571. subq K, KK, TMP1
  1572. LD a1, 0 * SIZE(AO)
  1573. fclr c01
  1574. LD a2, 1 * SIZE(AO)
  1575. fclr c05
  1576. LD a3, 2 * SIZE(AO)
  1577. fclr c02
  1578. LD a4, 3 * SIZE(AO)
  1579. fclr c06
  1580. LD b1, 0 * SIZE(BO)
  1581. lda L, -2(TMP1)
  1582. LD b2, 1 * SIZE(BO)
  1583. lda AO, 2 * SIZE(AO)
  1584. LD b3, 2 * SIZE(BO)
  1585. LD b4, 3 * SIZE(BO)
  1586. lda BO, 2 * SIZE(BO)
  1587. ble L, $L65
  1588. #endif
  1589. .align 4
  1590. $L62:
  /* Main loop, two depth steps per pass (L -= 2, AO += 4 elements,
     BO += 4 elements). The first step multiplies a1, a2 by b1, b2; the
     second multiplies a3, a4 by b3, b4. Both steps accumulate into c01,
     c02, c05, c06 through t1..t4, each ADD folding in the previously
     computed product. */
  1591. ADD c01, t1, c01
  1592. unop
  1593. MUL a1, b1, t1
  1594. unop
  1595. ADD c02, t2, c02
  1596. lda AO, 4 * SIZE(AO)
  1597. MUL a2, b1, t2
  1598. LD b1, 2 * SIZE(BO)
  1599. ADD c05, t3, c05
  1600. lda L, -2(L)
  1601. MUL a1, b2, t3
  1602. LD a1, -2 * SIZE(AO)
  1603. ADD c06, t4, c06
  1604. unop
  1605. MUL a2, b2, t4
  1606. LD a2, -1 * SIZE(AO)
  1607. ADD c01, t1, c01
  1608. LD b2, 3 * SIZE(BO)
  1609. MUL a3, b3, t1
  1610. lda BO, 4 * SIZE(BO)
  1611. ADD c02, t2, c02
  1612. unop
  1613. MUL a4, b3, t2
  1614. LD b3, 0 * SIZE(BO)
  1615. ADD c05, t3, c05
  1616. unop
  1617. MUL a3, b4, t3
  1618. LD a3, 0 * SIZE(AO)
  1619. ADD c06, t4, c06
  1620. MUL a4, b4, t4
  1621. LD b4, 1 * SIZE(BO)
  1622. unop
  1623. LD a4, 1 * SIZE(AO)
  1624. unop
  1625. unop
  1626. bgt L, $L62
  1627. .align 4
  1628. $L65:
  /* Loop tail: load alpha from ALPHA and start the next a1 * b1 product.
     If the low bit of the depth (K, or TMP1 for TRMM) is set, jump straight
     to the write-back at $L68; otherwise run one more depth step here
     (AO += 2 elements, BO += 2 elements) and fall through to $L68. */
  1629. ADD c01, t1, c01
  1630. ldt alpha, ALPHA
  1631. MUL a1, b1, t1
  1632. #ifndef TRMMKERNEL
  1633. blbs K, $L68
  1634. #else
  1635. blbs TMP1, $L68
  1636. #endif
  1637. .align 4
  1638. ADD c02, t2, c02
  1639. unop
  1640. MUL a2, b1, t2
  1641. LD b1, 0 * SIZE(BO)
  1642. ADD c05, t3, c05
  1643. lda BO, 2 * SIZE(BO)
  1644. MUL a1, b2, t3
  1645. LD a1, 0 * SIZE(AO)
  1646. ADD c06, t4, c06
  1647. unop
  1648. MUL a2, b2, t4
  1649. LD a2, 1 * SIZE(AO)
  1650. ADD c01, t1, c01
  /* BO was already advanced above, hence the negative offset. */
  1651. LD b2, -1 * SIZE(BO)
  1652. MUL a1, b1, t1
  1653. lda AO, 2 * SIZE(AO)
  1654. .align 4
  1655. $L68:
  /* Write-back for the two-row, two-column tile: finish the last depth
     step, scale c01, c02, c05, c06 by alpha and store two values to each
     of C1 and C2. Without TRMMKERNEL the old C values are loaded into
     c09..c12 and added after scaling. */
  1656. ADD c02, t2, c02
  1657. unop
  1658. MUL a2, b1, t2
  1659. #ifndef TRMMKERNEL
  1660. LD c09, 0 * SIZE(C1)
  1661. #else
  1662. unop
  1663. #endif
  1664. ADD c05, t3, c05
  1665. unop
  1666. MUL a1, b2, t3
  1667. #ifndef TRMMKERNEL
  1668. LD c10, 1 * SIZE(C1)
  1669. #else
  1670. unop
  1671. #endif
  1672. ADD c06, t4, c06
  1673. unop
  1674. MUL a2, b2, t4
  1675. #ifndef TRMMKERNEL
  1676. LD c11, 0 * SIZE(C2)
  1677. #else
  1678. unop
  1679. #endif
  1680. ADD c01, t1, c01
  1681. unop
  1682. MUL alpha, c01, c01
  1683. #ifndef TRMMKERNEL
  1684. LD c12, 1 * SIZE(C2)
  1685. #else
  1686. unop
  1687. #endif
  1688. ADD c02, t2, c02
  /* C1 and C2 are advanced before the stores, which is why the stores
     below use offsets -2 and -1. */
  1689. lda C1, 2 * SIZE(C1)
  1690. MUL alpha, c02, c02
  1691. lda C2, 2 * SIZE(C2)
  1692. ADD c05, t3, c05
  1693. MUL alpha, c05, c05
  1694. ADD c06, t4, c06
  1695. MUL alpha, c06, c06
  1696. #ifndef TRMMKERNEL
  1697. ADD c01, c09, c01
  1698. ADD c02, c10, c02
  1699. ADD c05, c11, c05
  1700. ADD c06, c12, c06
  1701. #endif
  1702. ST c01, -2 * SIZE(C1)
  1703. fclr t1
  1704. ST c02, -1 * SIZE(C1)
  1705. fclr t2
  1706. ST c05, -2 * SIZE(C2)
  1707. fclr t3
  1708. ST c06, -1 * SIZE(C2)
  1709. fclr t4
  /* TRMM only, (LEFT && TRANSA) or (!LEFT && !TRANSA):
     TMP1 = K - KK - 2 (same in both LEFT arms); AO and BO each advance by
     TMP1 << (BASE_SHIFT + 1). */
  1710. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1711. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1712. subq K, KK, TMP1
  1713. #ifdef LEFT
  1714. subq TMP1, 2, TMP1
  1715. #else
  1716. subq TMP1, 2, TMP1
  1717. #endif
  1718. sll TMP1, BASE_SHIFT + 1, TMP2
  1719. addq AO, TMP2, AO
  1720. sll TMP1, BASE_SHIFT + 1, TMP2
  1721. addq BO, TMP2, BO
  1722. #endif
  1723. #if defined(TRMMKERNEL) && defined(LEFT)
  1724. addq KK, 2, KK
  1725. #endif
  1726. .align 4
  1727. $L70:
  /* Set-up for the single remaining row of the two-column pass, taken when
     bit 0 of M is set; otherwise skip to $L79. Preloads a1, a2 from AO and
     b1..b4 from the B side, clears c01, c02, c05, c06 and sets L to the
     depth minus 2, then steps AO by 1 and BO by 2 elements. If L <= 0 the
     main loop $L72 is skipped and control goes to $L75. */
  1728. and M, 1, I
  1729. ble I, $L79
  1730. #if !defined(TRMMKERNEL) || \
  1731. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1732. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  /* First variant: B is read from its start. For TRMM the depth is
     TMP1 = KK + 1 (LEFT) or KK + 2 (otherwise) instead of K. */
  1733. #ifdef TRMMKERNEL
  1734. #ifdef LEFT
  1735. addq KK, 1, TMP1
  1736. #else
  1737. addq KK, 2, TMP1
  1738. #endif
  1739. #endif
  1740. LD a1, 0 * SIZE(AO)
  1741. fclr c01
  1742. LD a2, 1 * SIZE(AO)
  1743. fclr c05
  1744. LD b1, 0 * SIZE(B)
  1745. fclr c02
  1746. LD b2, 1 * SIZE(B)
  1747. fclr c06
  1748. #ifndef TRMMKERNEL
  1749. lda L, -2(K)
  1750. #else
  1751. lda L, -2(TMP1)
  1752. #endif
  1753. LD b3, 2 * SIZE(B)
  1754. lda AO, 1 * SIZE(AO)
  1755. LD b4, 3 * SIZE(B)
  1756. lda BO, 2 * SIZE(B)
  1757. ble L, $L75
  1758. #else
  /* Second variant (remaining TRMM cases): start KK steps into both
     operands, AO += KK << BASE_SHIFT and BO = B + (KK << (BASE_SHIFT + 1)),
     with depth TMP1 = K - KK. */
  1759. sll KK, BASE_SHIFT + 0, TMP1
  1760. addq AO, TMP1, AO
  1761. sll KK, BASE_SHIFT + 1, TMP2
  1762. addq B, TMP2, BO
  1763. subq K, KK, TMP1
  1764. LD a1, 0 * SIZE(AO)
  1765. fclr c01
  1766. LD a2, 1 * SIZE(AO)
  1767. fclr c05
  1768. LD b1, 0 * SIZE(BO)
  1769. fclr c02
  1770. LD b2, 1 * SIZE(BO)
  1771. fclr c06
  /* This arm is only compiled with TRMMKERNEL defined, so the depth is
     always taken from TMP1; the former "#ifndef TRMMKERNEL" alternative
     using K could never be selected here and has been dropped. */
  1775. lda L, -2(TMP1)
  1777. LD b3, 2 * SIZE(BO)
  1778. lda AO, 1 * SIZE(AO)
  1779. LD b4, 3 * SIZE(BO)
  1780. lda BO, 2 * SIZE(BO)
  1781. ble L, $L75
  1782. #endif
  1783. .align 4
  1784. $L72:
  /* Main loop, two depth steps per pass (L -= 2, AO += 2 elements,
     BO += 4 elements). The first step (a1 with b1, b2) accumulates into
     c01 and c05; the second step (a2 with b3, b4) accumulates into the
     separate partial sums c02 and c06, which are merged at $L78. */
  1785. ADD c01, t1, c01
  1786. lda L, -2(L)
  1787. MUL a1, b1, t1
  1788. LD b1, 2 * SIZE(BO)
  1789. ADD c05, t2, c05
  1790. MUL a1, b2, t2
  1791. LD a1, 1 * SIZE(AO)
  1792. LD b2, 3 * SIZE(BO)
  1793. ADD c02, t3, c02
  1794. lda AO, 2 * SIZE(AO)
  1795. MUL a2, b3, t3
  1796. LD b3, 4 * SIZE(BO)
  1797. ADD c06, t4, c06
  1798. MUL a2, b4, t4
  1799. LD a2, 0 * SIZE(AO)
  1800. LD b4, 5 * SIZE(BO)
  1801. lda BO, 4 * SIZE(BO)
  1802. unop
  1803. unop
  1804. bgt L, $L72
  1805. .align 4
  1806. $L75:
  /* Loop tail: load alpha from ALPHA and start the next a1 * b1 product.
     If the low bit of the depth (K, or TMP1 for TRMM) is set, jump straight
     to the write-back at $L78; otherwise run one more depth step here
     (AO += 1 element, BO += 2 elements) and fall through to $L78. */
  1807. ADD c01, t1, c01
  1808. ldt alpha, ALPHA
  1809. MUL a1, b1, t1
  1810. #ifndef TRMMKERNEL
  1811. blbs K, $L78
  1812. #else
  1813. blbs TMP1, $L78
  1814. #endif
  1815. .align 4
  1816. ADD c05, t2, c05
  1817. MUL a1, b2, t2
  1818. LD a1, 0 * SIZE(AO)
  1819. LD b1, 0 * SIZE(BO)
  1820. ADD c01, t1, c01
  1821. LD b2, 1 * SIZE(BO)
  1822. lda AO, 1 * SIZE(AO)
  1823. MUL a1, b1, t1
  1824. lda BO, 2 * SIZE(BO)
  1825. .align 4
  1826. $L78:
  /* Write-back for the single row of the two-column pass: finish the last
     depth step, merge the partial sums (c01 += c02, c05 += c06), scale by
     alpha and store one value to each of C1 and C2. Without TRMMKERNEL
     the old C values are loaded into a5 and b5 and added after scaling. */
  1827. ADD c05, t2, c05
  1828. MUL a1, b2, t2
  1829. #ifndef TRMMKERNEL
  1830. LD a5, 0 * SIZE(C1)
  1831. #else
  1832. unop
  1833. #endif
  /* Fold in the products still pending from the second unrolled step. */
  1834. ADD c02, t3, c02
  1835. ADD c06, t4, c06
  1836. #ifndef TRMMKERNEL
  1837. LD b5, 0 * SIZE(C2)
  1838. #else
  1839. unop
  1840. #endif
  1841. ADD c01, c02, c01
  1842. ADD c05, c06, c05
  1843. ADD c01, t1, c01
  1844. ADD c05, t2, c05
  1845. MUL alpha, c01, c01
  1846. MUL alpha, c05, c05
  1847. #ifndef TRMMKERNEL
  1848. ADD c01, a5, c01
  1849. ADD c05, b5, c05
  1850. #endif
  1851. ST c01, 0 * SIZE(C1)
  1852. ST c05, 0 * SIZE(C2)
  /* TRMM only, (LEFT && TRANSA) or (!LEFT && !TRANSA):
     TMP1 = K - KK - (1 if LEFT, else 2);
     AO += TMP1 << BASE_SHIFT; BO += TMP1 << (BASE_SHIFT + 1). */
  1853. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1854. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1855. subq K, KK, TMP1
  1856. #ifdef LEFT
  1857. subq TMP1, 1, TMP1
  1858. #else
  1859. subq TMP1, 2, TMP1
  1860. #endif
  1861. sll TMP1, BASE_SHIFT + 0, TMP2
  1862. addq AO, TMP2, AO
  1863. sll TMP1, BASE_SHIFT + 1, TMP2
  1864. addq BO, TMP2, BO
  1865. #endif
  1866. #if defined(TRMMKERNEL) && defined(LEFT)
  1867. addq KK, 1, KK
  1868. #endif
  1869. .align 4
  1870. $L79:
  /* End of the two-column pass: B takes over the current BO position and,
     for TRMM without LEFT, KK advances by 2. The unops only pad up to the
     next alignment boundary. */
  1871. mov BO, B
  1872. #if defined(TRMMKERNEL) && !defined(LEFT)
  1873. addq KK, 2, KK
  1874. #else
  1875. unop
  1876. #endif
  1877. unop
  1878. unop
  1879. .align 4
  1880. $L80:
  /* Single-column pass, taken when bit 0 of N is set; otherwise go to the
     exit at $L999. C1 = C, AO restarts at A, and I = M >> 2 counts the
     four-row tiles; with no such tile control goes to $L100. */
  1881. and N, 1, J
  1882. ble J, $L999
  1883. mov C, C1
  1884. mov A, AO
  /* TRMM with LEFT: KK restarts from OFFSET for this pass. */
  1885. #if defined(TRMMKERNEL) && defined(LEFT)
  1886. mov OFFSET, KK
  1887. #endif
  1888. sra M, 2, I
  1889. ble I, $L100
  1890. .align 4
  1891. $L91:
  /* Set-up for one tile of four rows by one column: preload a1..a4 from AO
     and b1..b4 from the B side, clear t1..t4 and the accumulators
     c01..c04, and set L to the depth divided by 4 (the main loop $L92 is
     unrolled four times). If L <= 0 control goes to the remainder
     handling at $L95. */
  1892. #if !defined(TRMMKERNEL) || \
  1893. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1894. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  /* First variant: B is read from its start (BO = B). For TRMM the depth
     is TMP1 = KK + 4 (LEFT) or KK + 1 (otherwise) instead of K. */
  1895. #ifdef TRMMKERNEL
  1896. #ifdef LEFT
  1897. addq KK, 4, TMP1
  1898. #else
  1899. addq KK, 1, TMP1
  1900. #endif
  1901. #endif
  1902. LD a1, 0 * SIZE(AO)
  1903. fclr t1
  1904. LD a2, 1 * SIZE(AO)
  1905. fclr t2
  1906. LD a3, 2 * SIZE(AO)
  1907. fclr t3
  1908. LD a4, 3 * SIZE(AO)
  1909. fclr t4
  1910. LD b1, 0 * SIZE(B)
  1911. fclr c01
  1912. LD b2, 1 * SIZE(B)
  1913. fclr c02
  1914. LD b3, 2 * SIZE(B)
  1915. fclr c03
  1916. LD b4, 3 * SIZE(B)
  1917. fclr c04
  1918. #ifndef TRMMKERNEL
  1919. sra K, 2, L
  1920. #else
  1921. sra TMP1, 2, L
  1922. #endif
  1923. mov B, BO
  1924. unop
  1925. ble L, $L95
  1926. #else
  /* Second variant (remaining TRMM cases): start KK steps into both
     operands, AO += KK << (BASE_SHIFT + 2) and BO = B + (KK << BASE_SHIFT),
     with depth TMP1 = K - KK. */
  1927. sll KK, BASE_SHIFT + 2, TMP1
  1928. addq AO, TMP1, AO
  1929. sll KK, BASE_SHIFT + 0, TMP2
  1930. addq B, TMP2, BO
  1931. subq K, KK, TMP1
  1932. LD a1, 0 * SIZE(AO)
  1933. fclr t1
  1934. LD a2, 1 * SIZE(AO)
  1935. fclr t2
  1936. LD a3, 2 * SIZE(AO)
  1937. fclr t3
  1938. LD a4, 3 * SIZE(AO)
  1939. fclr t4
  1940. LD b1, 0 * SIZE(BO)
  1941. fclr c01
  1942. LD b2, 1 * SIZE(BO)
  1943. fclr c02
  1944. LD b3, 2 * SIZE(BO)
  1945. fclr c03
  1946. LD b4, 3 * SIZE(BO)
  1947. fclr c04
  /* This arm is only compiled with TRMMKERNEL defined, so the trip count
     always comes from TMP1; the former "#ifndef TRMMKERNEL" alternative
     using K could never be selected here and has been dropped. */
  1951. sra TMP1, 2, L
  1953. unop
  1954. ble L, $L95
  1955. #endif
  1956. .align 5
  1957. $L92:
  /* Main loop, four depth steps per pass (L -= 1, AO += 16 elements,
     BO += 4 elements). Steps one to four multiply the current a1..a4 by
     b1, b2, b3 and b4 respectively, all accumulating into c01..c04
     through t1..t4. */
  1958. ADD c01, t1, c01
  1959. unop
  1960. MUL a1, b1, t1
  1961. LD a1, 4 * SIZE(AO)
  1962. ADD c02, t2, c02
  1963. lda L, -1(L)
  1964. MUL a2, b1, t2
  1965. LD a2, 5 * SIZE(AO)
  1966. ADD c03, t3, c03
  1967. unop
  1968. MUL a3, b1, t3
  1969. LD a3, 6 * SIZE(AO)
  1970. ADD c04, t4, c04
  1971. MUL a4, b1, t4
  1972. LD a4, 7 * SIZE(AO)
  1973. LD b1, 4 * SIZE(BO)
  1974. ADD c01, t1, c01
  1975. unop
  1976. MUL a1, b2, t1
  1977. LD a1, 8 * SIZE(AO)
  1978. ADD c02, t2, c02
  1979. unop
  1980. MUL a2, b2, t2
  1981. LD a2, 9 * SIZE(AO)
  1982. ADD c03, t3, c03
  1983. unop
  1984. MUL a3, b2, t3
  1985. LD a3, 10 * SIZE(AO)
  1986. ADD c04, t4, c04
  1987. MUL a4, b2, t4
  1988. LD a4, 11 * SIZE(AO)
  1989. LD b2, 5 * SIZE(BO)
  1990. ADD c01, t1, c01
  1991. unop
  1992. MUL a1, b3, t1
  1993. LD a1, 12 * SIZE(AO)
  1994. ADD c02, t2, c02
  1995. unop
  1996. MUL a2, b3, t2
  1997. LD a2, 13 * SIZE(AO)
  1998. ADD c03, t3, c03
  1999. unop
  2000. MUL a3, b3, t3
  2001. LD a3, 14 * SIZE(AO)
  2002. ADD c04, t4, c04
  2003. MUL a4, b3, t4
  /* a5 carries the fourth A value of the last step, so that a4 can be
     reloaded for the next pass before that step's last multiply. */
  2004. LD a5, 15 * SIZE(AO)
  2005. LD b3, 6 * SIZE(BO)
  2006. ADD c01, t1, c01
  2007. MUL a1, b4, t1
  2008. LD a1, 16 * SIZE(AO)
  /* AO and BO move here; the remaining loads are relative to the new
     positions. */
  2009. lda AO, 16 * SIZE(AO)
  2010. ADD c02, t2, c02
  2011. lda BO, 4 * SIZE(BO)
  2012. MUL a2, b4, t2
  2013. LD a2, 1 * SIZE(AO)
  2014. ADD c03, t3, c03
  2015. LD a4, 3 * SIZE(AO)
  2016. MUL a3, b4, t3
  2017. LD a3, 2 * SIZE(AO)
  2018. ADD c04, t4, c04
  2019. MUL a5, b4, t4
  2020. LD b4, 3 * SIZE(BO)
  2021. bgt L, $L92
  2022. .align 4
  2023. $L95:
  /* Remainder set-up: L = depth & 3 (K, or TMP1 for TRMM), i.e. the steps
     left over from the four-way unrolled loop; alpha is loaded from ALPHA.
     With nothing left over, go straight to the write-back at $L98. */
  2024. #ifndef TRMMKERNEL
  2025. and K, 3, L
  2026. #else
  2027. and TMP1, 3, L
  2028. #endif
  2029. ldt alpha, ALPHA
  2030. unop
  2031. ble L, $L98
  2032. .align 4
  2033. $L96:
  /* Remainder loop, one depth step per pass (L -= 1, AO += 4 elements,
     BO += 1 element): a1..a4 times b1 into c01..c04 through t1..t4. */
  2034. ADD c01, t1, c01
  2035. lda L, -1(L)
  2036. MUL a1, b1, t1
  2037. LD a1, 4 * SIZE(AO)
  2038. ADD c02, t2, c02
  2039. lda BO, 1 * SIZE(BO)
  2040. MUL a2, b1, t2
  2041. LD a2, 5 * SIZE(AO)
  2042. ADD c03, t3, c03
  2043. unop
  2044. MUL a3, b1, t3
  2045. LD a3, 6 * SIZE(AO)
  2046. ADD c04, t4, c04
  2047. MUL a4, b1, t4
  2048. LD a4, 7 * SIZE(AO)
  /* BO was already advanced above, so offset 0 is the next B value. */
  2049. LD b1, 0 * SIZE(BO)
  2050. lda AO, 4 * SIZE(AO)
  2051. bgt L, $L96
  2052. .align 4
  2053. $L98:
  /* Write-back for the four-row, one-column tile: fold in the pending
     products, scale c01..c04 by alpha and store four values to C1.
     Without TRMMKERNEL the old C values are loaded into c05..c08 and
     added after scaling. */
  2054. #ifndef TRMMKERNEL
  2055. ADD c01, t1, c01
  2056. LD c05, 0 * SIZE(C1)
  2057. ADD c02, t2, c02
  2058. LD c06, 1 * SIZE(C1)
  2059. ADD c03, t3, c03
  2060. LD c07, 2 * SIZE(C1)
  2061. ADD c04, t4, c04
  2062. LD c08, 3 * SIZE(C1)
  2063. #else
  2064. ADD c01, t1, c01
  2065. ADD c02, t2, c02
  2066. ADD c03, t3, c03
  2067. ADD c04, t4, c04
  2068. #endif
  2069. MUL alpha, c01, c01
  2070. MUL alpha, c02, c02
  2071. MUL alpha, c03, c03
  2072. MUL alpha, c04, c04
  2073. #ifndef TRMMKERNEL
  2074. ADD c01, c05, c01
  2075. ADD c02, c06, c02
  2076. ADD c03, c07, c03
  2077. ADD c04, c08, c04
  2078. #endif
  2079. ST c01, 0 * SIZE(C1)
  2080. ST c02, 1 * SIZE(C1)
  2081. ST c03, 2 * SIZE(C1)
  2082. ST c04, 3 * SIZE(C1)
  2083. lda C1, 4 * SIZE(C1)
  /* TRMM only, (LEFT && TRANSA) or (!LEFT && !TRANSA):
     TMP1 = K - KK - (4 if LEFT, else 1);
     AO += TMP1 << (BASE_SHIFT + 2); BO += TMP1 << BASE_SHIFT. */
  2084. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2085. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2086. subq K, KK, TMP1
  2087. #ifdef LEFT
  2088. subq TMP1, 4, TMP1
  2089. #else
  2090. subq TMP1, 1, TMP1
  2091. #endif
  2092. sll TMP1, BASE_SHIFT + 2, TMP2
  2093. addq AO, TMP2, AO
  2094. sll TMP1, BASE_SHIFT + 0, TMP2
  2095. addq BO, TMP2, BO
  2096. #endif
  2097. #if defined(TRMMKERNEL) && defined(LEFT)
  2098. addq KK, 4, KK
  2099. #endif
  /* Next four-row tile, if any. */
  2100. lda I, -1(I)
  2101. bgt I, $L91
  2102. .align 4
  2103. $L100:
  /* Two leftover rows are handled at $L101 when bit 1 of M is set;
     otherwise skip to $L110. */
  2104. and M, 2, I
  2105. unop
  2106. unop
  2107. ble I, $L110
  2108. .align 4
  2109. $L101:
  /* Set-up for a two-row, one-column tile: preload a1..a4 from AO and
     b1..b4 from the B side, clear t1..t4 and c01..c04, and set L to the
     depth divided by 4 (the main loop $L102 is unrolled four times).
     If L <= 0 control goes to the remainder handling at $L105. */
  2110. #if !defined(TRMMKERNEL) || \
  2111. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2112. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  /* First variant: B is read from its start (BO = B). For TRMM the depth
     is TMP1 = KK + 2 (LEFT) or KK + 1 (otherwise) instead of K. */
  2113. #ifdef TRMMKERNEL
  2114. #ifdef LEFT
  2115. addq KK, 2, TMP1
  2116. #else
  2117. addq KK, 1, TMP1
  2118. #endif
  2119. #endif
  2120. LD a1, 0 * SIZE(AO)
  2121. fclr t1
  2122. LD a2, 1 * SIZE(AO)
  2123. fclr t2
  2124. LD a3, 2 * SIZE(AO)
  2125. fclr t3
  2126. LD a4, 3 * SIZE(AO)
  2127. fclr t4
  2128. LD b1, 0 * SIZE(B)
  2129. fclr c01
  2130. LD b2, 1 * SIZE(B)
  2131. fclr c02
  2132. LD b3, 2 * SIZE(B)
  2133. fclr c03
  2134. LD b4, 3 * SIZE(B)
  2135. fclr c04
  2136. #ifndef TRMMKERNEL
  2137. sra K, 2, L
  2138. #else
  2139. sra TMP1, 2, L
  2140. #endif
  2141. mov B, BO
  2142. unop
  2143. ble L, $L105
  2144. #else
  /* Second variant (remaining TRMM cases): start KK steps into both
     operands, AO += KK << (BASE_SHIFT + 1) and BO = B + (KK << BASE_SHIFT),
     with depth TMP1 = K - KK. */
  2145. sll KK, BASE_SHIFT + 1, TMP1
  2146. addq AO, TMP1, AO
  2147. sll KK, BASE_SHIFT + 0, TMP2
  2148. addq B, TMP2, BO
  2149. subq K, KK, TMP1
  2150. LD a1, 0 * SIZE(AO)
  2151. fclr t1
  2152. LD a2, 1 * SIZE(AO)
  2153. fclr t2
  2154. LD a3, 2 * SIZE(AO)
  2155. fclr t3
  2156. LD a4, 3 * SIZE(AO)
  2157. fclr t4
  2158. LD b1, 0 * SIZE(BO)
  2159. fclr c01
  2160. LD b2, 1 * SIZE(BO)
  2161. fclr c02
  2162. LD b3, 2 * SIZE(BO)
  2163. fclr c03
  2164. LD b4, 3 * SIZE(BO)
  2165. fclr c04
  /* This arm is only compiled with TRMMKERNEL defined, so the trip count
     always comes from TMP1; the former "#ifndef TRMMKERNEL" alternative
     using K could never be selected here and has been dropped. */
  2169. sra TMP1, 2, L
  2171. unop
  2172. ble L, $L105
  2173. #endif
  2174. .align 5
  2175. $L102:
  /* Main loop, four depth steps per pass (L -= 1, BO += 4 elements,
     AO += 8 elements). The steps using b1 and b3 accumulate into c01, c02;
     the steps using b2 and b4 accumulate into the separate partial sums
     c03, c04, which are merged at $L108. */
  2176. ADD c01, t1, c01
  2177. lda L, -1(L)
  2178. MUL a1, b1, t1
  2179. LD a1, 4 * SIZE(AO)
  2180. ADD c02, t2, c02
  2181. MUL a2, b1, t2
  2182. LD a2, 5 * SIZE(AO)
  2183. LD b1, 4 * SIZE(BO)
  2184. ADD c03, t3, c03
  2185. lda BO, 4 * SIZE(BO)
  2186. MUL a3, b2, t3
  2187. LD a3, 6 * SIZE(AO)
  2188. ADD c04, t4, c04
  2189. MUL a4, b2, t4
  /* a5 carries the second A value of the last step, so that a4 can be
     reloaded for the next pass before that step's last multiply. */
  2190. LD a5, 7 * SIZE(AO)
  2191. LD b2, 1 * SIZE(BO)
  2192. ADD c01, t1, c01
  2193. MUL a1, b3, t1
  2194. LD a1, 8 * SIZE(AO)
  2195. lda AO, 8 * SIZE(AO)
  2196. ADD c02, t2, c02
  2197. MUL a2, b3, t2
  2198. LD b3, 2 * SIZE(BO)
  2199. LD a2, 1 * SIZE(AO)
  2200. ADD c03, t3, c03
  2201. LD a4, 3 * SIZE(AO)
  2202. MUL a3, b4, t3
  2203. LD a3, 2 * SIZE(AO)
  2204. ADD c04, t4, c04
  2205. MUL a5, b4, t4
  2206. LD b4, 3 * SIZE(BO)
  2207. bgt L, $L102
  2208. .align 4
  2209. $L105:
  /* Remainder set-up: L = depth & 3 (K, or TMP1 for TRMM); alpha is loaded
     from ALPHA. Without TRMMKERNEL the two old C values are fetched early
     into a3 and a4; the remainder loop at $L106 only uses a1, a2 and b1.
     With nothing left over, go straight to the write-back at $L108. */
  2210. #ifndef TRMMKERNEL
  2211. and K, 3, L
  2212. #else
  2213. and TMP1, 3, L
  2214. #endif
  2215. ldt alpha, ALPHA
  2216. #ifndef TRMMKERNEL
  2217. LD a3, 0 * SIZE(C1)
  2218. LD a4, 1 * SIZE(C1)
  2219. #endif
  2220. ble L, $L108
  2221. .align 4
  2222. $L106:
  /* Remainder loop, one depth step per pass (L -= 1, AO += 2 elements,
     BO += 1 element): a1 and a2 times b1 into c01 and c02. */
  2223. ADD c01, t1, c01
  2224. lda L, -1(L)
  2225. MUL a1, b1, t1
  2226. LD a1, 2 * SIZE(AO)
  2227. ADD c02, t2, c02
  2228. MUL a2, b1, t2
  2229. LD a2, 3 * SIZE(AO)
  2230. LD b1, 1 * SIZE(BO)
  2231. lda AO, 2 * SIZE(AO)
  2232. unop
  2233. lda BO, 1 * SIZE(BO)
  2234. bgt L, $L106
  2235. .align 4
  2236. $L108:
  /* Write-back for the two-row, one-column tile: fold in the pending
     products (clearing t1..t4 on the way), merge the partial sums
     (c01 += c03, c02 += c04), scale by alpha and store two values to C1.
     Without TRMMKERNEL the old C values held in a3 and a4 are added after
     scaling. */
  2237. ADD c01, t1, c01
  2238. fclr t1
  2239. ADD c02, t2, c02
  2240. fclr t2
  2241. ADD c03, t3, c03
  2242. fclr t3
  2243. ADD c04, t4, c04
  2244. fclr t4
  2245. ADD c01, c03, c01
  2246. ADD c02, c04, c02
  2247. MUL alpha, c01, c01
  2248. MUL alpha, c02, c02
  2249. #ifndef TRMMKERNEL
  2250. ADD c01, a3, c01
  2251. ADD c02, a4, c02
  2252. #endif
  2253. ST c01, 0 * SIZE(C1)
  2254. ST c02, 1 * SIZE(C1)
  2255. lda C1, 2 * SIZE(C1)
  /* TRMM only, (LEFT && TRANSA) or (!LEFT && !TRANSA):
     TMP1 = K - KK - (2 if LEFT, else 1);
     AO += TMP1 << (BASE_SHIFT + 1); BO += TMP1 << BASE_SHIFT. */
  2256. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2257. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2258. subq K, KK, TMP1
  2259. #ifdef LEFT
  2260. subq TMP1, 2, TMP1
  2261. #else
  2262. subq TMP1, 1, TMP1
  2263. #endif
  2264. sll TMP1, BASE_SHIFT + 1, TMP2
  2265. addq AO, TMP2, AO
  2266. sll TMP1, BASE_SHIFT + 0, TMP2
  2267. addq BO, TMP2, BO
  2268. #endif
  2269. #if defined(TRMMKERNEL) && defined(LEFT)
  2270. addq KK, 2, KK
  2271. #endif
  2272. .align 4
  2273. $L110:
  /* The last single row is handled at $L111 when bit 0 of M is set;
     otherwise go to the exit at $L999. */
  2274. and M, 1, I
  2275. ble I, $L999
  2276. .align 4
  2277. $L111:
  /* Set-up for the final single element (one row, one column): preload
     a1..a4 from AO and b1..b4 from the B side, clear t1..t4 and c01..c04,
     and set L to the depth divided by 4 (the main loop $L112 consumes four
     depth steps per pass). If L <= 0 control goes to the remainder
     handling at $L115. */
  2278. #if !defined(TRMMKERNEL) || \
  2279. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2280. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  /* First variant: B is read from its start (BO = B). For TRMM the depth
     is TMP1 = KK + 1; both arms of the LEFT test are identical here. */
  2281. #ifdef TRMMKERNEL
  2282. #ifdef LEFT
  2283. addq KK, 1, TMP1
  2284. #else
  2285. addq KK, 1, TMP1
  2286. #endif
  2287. #endif
  2288. LD a1, 0 * SIZE(AO)
  2289. fclr t1
  2290. LD a2, 1 * SIZE(AO)
  2291. fclr t2
  2292. LD a3, 2 * SIZE(AO)
  2293. fclr t3
  2294. LD a4, 3 * SIZE(AO)
  2295. fclr t4
  2296. LD b1, 0 * SIZE(B)
  2297. fclr c01
  2298. LD b2, 1 * SIZE(B)
  2299. fclr c02
  2300. LD b3, 2 * SIZE(B)
  2301. fclr c03
  2302. LD b4, 3 * SIZE(B)
  2303. fclr c04
  2304. #ifndef TRMMKERNEL
  2305. sra K, 2, L
  2306. #else
  2307. sra TMP1, 2, L
  2308. #endif
  2309. mov B, BO
  2310. unop
  2311. ble L, $L115
  2312. #else
  /* Second variant (remaining TRMM cases): start KK steps into both
     operands (both offsets are KK << BASE_SHIFT), with depth
     TMP1 = K - KK. */
  2313. sll KK, BASE_SHIFT + 0, TMP1
  2314. addq AO, TMP1, AO
  2315. sll KK, BASE_SHIFT + 0, TMP2
  2316. addq B, TMP2, BO
  2317. subq K, KK, TMP1
  2318. LD a1, 0 * SIZE(AO)
  2319. fclr t1
  2320. LD a2, 1 * SIZE(AO)
  2321. fclr t2
  2322. LD a3, 2 * SIZE(AO)
  2323. fclr t3
  2324. LD a4, 3 * SIZE(AO)
  2325. fclr t4
  2326. LD b1, 0 * SIZE(BO)
  2327. fclr c01
  2328. LD b2, 1 * SIZE(BO)
  2329. fclr c02
  2330. LD b3, 2 * SIZE(BO)
  2331. fclr c03
  2332. LD b4, 3 * SIZE(BO)
  2333. fclr c04
  /* This arm is only compiled with TRMMKERNEL defined, so the trip count
     always comes from TMP1; the former "#ifndef TRMMKERNEL" alternative
     using K could never be selected here and has been dropped. */
  2337. sra TMP1, 2, L
  2339. unop
  2340. ble L, $L115
  2341. #endif
  2342. .align 4
  2343. $L112:
  2344. ADD c01, t1, c01
  2345. MUL a1, b1, t1
  2346. LD a1, 4 * SIZE(AO)
  2347. LD b1, 4 * SIZE(BO)
  2348. ADD c02, t2, c02
  2349. MUL a2, b2, t2
  2350. LD a2, 5 * SIZE(AO)
  2351. LD b2, 5 * SIZE(BO)
  2352. ADD c03, t3, c03
  2353. MUL a3, b3, t3
  2354. LD a3, 6 * SIZE(AO)
  2355. LD b3, 6 * SIZE(BO)
  2356. ADD c04, t4, c04
  2357. MUL a4, b4, t4
  2358. LD a4, 7 * SIZE(AO)
  2359. LD b4, 7 * SIZE(BO)
  2360. lda L, -1(L)
  2361. lda AO, 4 * SIZE(AO)
  2362. lda BO, 4 * SIZE(BO)
  2363. bgt L, $L112
  2364. .align 4
  2365. $L115:
  2366. #ifndef TRMMKERNEL
  2367. and K, 3, L
  2368. #else
  2369. and TMP1, 3, L
  2370. #endif
  2371. ldt alpha, ALPHA
  2372. #ifndef TRMMKERNEL
  2373. LD a2, 0 * SIZE(C1)
  2374. #endif
  2375. ble L, $L118
  2376. .align 4
  2377. $L116:
  2378. ADD c01, t1, c01
  2379. MUL a1, b1, t1
  2380. LD a1, 1 * SIZE(AO)
  2381. LD b1, 1 * SIZE(BO)
  2382. lda L, -1(L)
  2383. lda AO, 1 * SIZE(AO)
  2384. lda BO, 1 * SIZE(BO)
  2385. bgt L, $L116
  2386. .align 4
  2387. $L118:
  2388. ADD c01, t1, c01
  2389. ADD c02, t2, c02
  2390. ADD c03, t3, c03
  2391. ADD c04, t4, c04
  2392. ADD c01, c02, c01
  2393. ADD c03, c04, c03
  2394. ADD c01, c03, c01
  2395. MUL alpha, c01, c01
  2396. #ifndef TRMMKERNEL
  2397. ADD c01, a2, c01
  2398. #endif
  2399. ST c01, 0 * SIZE(C1)
  /*
   * $L999 -- common exit path.
   *
   * Reloads floating-point registers $f2..$f9 from the eight 8-byte slots
   * at 0..56($sp) (presumably stored there by the prologue, which is
   * outside this excerpt), clears integer register $0, releases STACKSIZE
   * bytes of stack and returns.  In the Alpha calling standard $0 carries
   * the integer return value, so the routine presumably reports 0 to its
   * caller.
   */
        .align 4
$L999:
        ldt     $f2,  0($sp)
        ldt     $f3,  8($sp)
        ldt     $f4, 16($sp)
        ldt     $f5, 24($sp)
        ldt     $f6, 32($sp)
        ldt     $f7, 40($sp)
        ldt     $f8, 48($sp)
        ldt     $f9, 56($sp)

        clr     $0
        lda     $sp, STACKSIZE($sp)
        ret
        EPILOGUE