You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

zgemm_kernel_2x2.S 26 kB

(Line-number gutter from the source-code viewer, spilled into the text during extraction — not part of the file contents.)
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #include "version.h"
  41. #if !defined(EV4) && !defined(EV5) && !defined(EV6)
  42. #error "Architecture is not specified."
  43. #endif
  44. #ifdef EV6
  45. #define PREFETCHSIZE 56
  46. #define UNOP unop
  47. #endif
  48. #ifdef EV5
  49. #define PREFETCHSIZE 48
  50. #define UNOP
  51. #endif
  52. #ifdef EV4
  53. #define UNOP
  54. #endif
  55. .set noat
  56. .set noreorder
  57. .arch ev6
  58. .text
  59. .align 5
  60. .globl CNAME
  61. .ent CNAME
  62. #define STACKSIZE 80
  63. #define M $16
  64. #define N $17
  65. #define K $18
  66. #define A $21
  67. #define B $22
  68. #define C $20
  69. #define LDC $23
  70. #define C1 $19
  71. #define C2 $24
  72. #define AO $at
  73. #define BO $5
  74. #define I $6
  75. #define J $7
  76. #define L $8
  77. #define a1 $f16
  78. #define a2 $f17
  79. #define a3 $f18
  80. #define a4 $f19
  81. #define b1 $f20
  82. #define b2 $f21
  83. #define b3 $f22
  84. #define b4 $f23
  85. #define t1 $f24
  86. #define t2 $f25
  87. #define t3 $f26
  88. #define t4 $f27
  89. #define a5 $f28
  90. #define a6 $f30
  91. #define b5 $f29
  92. #define alpha_i $f29
  93. #define alpha_r $f30
  94. #define c01 $f0
  95. #define c02 $f1
  96. #define c03 $f2
  97. #define c04 $f3
  98. #define c05 $f4
  99. #define c06 $f5
  100. #define c07 $f6
  101. #define c08 $f7
  102. #define c09 $f8
  103. #define c10 $f9
  104. #define c11 $f10
  105. #define c12 $f11
  106. #define c13 $f12
  107. #define c14 $f13
  108. #define c15 $f14
  109. #define c16 $f15
  110. #define TMP1 $0
  111. #define TMP2 $1
  112. #define KK $2
  113. #define BB $3
  114. #define OFFSET $4
  115. #define ALPHA_R 64($sp)
  116. #define ALPHA_I 72($sp)
  117. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  118. #define ADD1 ADD
  119. #define ADD2 SUB
  120. #define ADD3 ADD
  121. #define ADD4 ADD
  122. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  123. #define ADD1 ADD
  124. #define ADD2 ADD
  125. #define ADD3 SUB
  126. #define ADD4 ADD
  127. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  128. #define ADD1 ADD
  129. #define ADD2 ADD
  130. #define ADD3 ADD
  131. #define ADD4 SUB
  132. #else
  133. #define ADD1 ADD
  134. #define ADD2 SUB
  135. #define ADD3 SUB
  136. #define ADD4 SUB
  137. #endif
  138. CNAME:
  139. .frame $sp, STACKSIZE, $26, 0
  140. #ifdef PROFILE
  141. ldgp $gp, 0($27)
  142. lda $at, _mcount
  143. jsr $at, ($at), _mcount
  144. #endif
  145. #ifndef PROFILE
  146. .prologue 0
  147. #else
  148. .prologue 1
  149. #endif
  150. lda $sp, -STACKSIZE($sp)
  151. ldq B, 0 + STACKSIZE($sp)
  152. ldq C, 8 + STACKSIZE($sp)
  153. ldq LDC, 16 + STACKSIZE($sp)
  154. #ifdef TRMMKERNEL
  155. ldq OFFSET, 24 + STACKSIZE($sp)
  156. #endif
  157. sll LDC, ZBASE_SHIFT, LDC
  158. stt $f2, 0($sp)
  159. stt $f3, 8($sp)
  160. stt $f4, 16($sp)
  161. stt $f5, 24($sp)
  162. stt $f6, 32($sp)
  163. stt $f7, 40($sp)
  164. stt $f8, 48($sp)
  165. stt $f9, 56($sp)
  166. stt $f19, ALPHA_R
  167. stt $f20, ALPHA_I
  168. cmple M, 0, $0
  169. cmple N, 0, $1
  170. cmple K, 0, $2
  171. or $0, $1, $0
  172. or $0, $2, $0
  173. bne $0, $L999
  174. #if defined(TRMMKERNEL) && !defined(LEFT)
  175. subq $31, OFFSET, KK
  176. #endif
  177. sra N, 1, J
  178. ble J, $L30
  179. .align 4
  180. $L01:
  181. mov C, C1
  182. addq C, LDC, C2
  183. mov A, AO
  184. s4addq K, 0, BB
  185. #if defined(TRMMKERNEL) && defined(LEFT)
  186. mov OFFSET, KK
  187. #endif
  188. SXADDQ BB, B, BB
  189. addq C2, LDC, C
  190. unop
  191. sra M, 1, I
  192. fclr t1
  193. fclr t2
  194. fclr t3
  195. fclr t4
  196. fclr c01
  197. fclr c05
  198. ble I, $L20
  199. .align 4
  200. $L11:
  201. #ifndef EV4
  202. ldl $31, 0 * SIZE(BB)
  203. ldl $31, 8 * SIZE(BB)
  204. unop
  205. lda BB, 16 * SIZE(BB)
  206. #endif
  207. #if !defined(TRMMKERNEL) || \
  208. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  209. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  210. #ifdef TRMMKERNEL
  211. #ifdef LEFT
  212. addq KK, 2, TMP1
  213. #else
  214. addq KK, 2, TMP1
  215. #endif
  216. #endif
  217. LD a1, 0 * SIZE(AO)
  218. fclr c09
  219. LD a2, 1 * SIZE(AO)
  220. fclr c13
  221. LD a3, 2 * SIZE(AO)
  222. fclr c02
  223. LD a4, 3 * SIZE(AO)
  224. fclr c06
  225. LD b1, 0 * SIZE(B)
  226. fclr c10
  227. LD b2, 1 * SIZE(B)
  228. fclr c14
  229. LD b3, 2 * SIZE(B)
  230. fclr c03
  231. LD b4, 3 * SIZE(B)
  232. fclr c07
  233. lda BO, 4 * SIZE(B)
  234. fclr c11
  235. lda AO, 4 * SIZE(AO)
  236. fclr c15
  237. lds $f31, 4 * SIZE(C1)
  238. fclr c04
  239. #ifndef TRMMKERNEL
  240. lda L, -2(K)
  241. #else
  242. lda L, -2(TMP1)
  243. #endif
  244. fclr c08
  245. lds $f31, 4 * SIZE(C2)
  246. fclr c12
  247. fclr c16
  248. ble L, $L15
  249. #else
  250. sll KK, ZBASE_SHIFT + 1, TMP1
  251. addq AO, TMP1, AO
  252. addq B, TMP1, BO
  253. subq K, KK, TMP1
  254. LD a1, 0 * SIZE(AO)
  255. fclr c09
  256. LD a2, 1 * SIZE(AO)
  257. fclr c13
  258. LD a3, 2 * SIZE(AO)
  259. fclr c02
  260. LD a4, 3 * SIZE(AO)
  261. fclr c06
  262. LD b1, 0 * SIZE(BO)
  263. fclr c10
  264. LD b2, 1 * SIZE(BO)
  265. fclr c14
  266. LD b3, 2 * SIZE(BO)
  267. fclr c03
  268. LD b4, 3 * SIZE(BO)
  269. fclr c07
  270. lda BO, 4 * SIZE(BO)
  271. fclr c11
  272. lda AO, 4 * SIZE(AO)
  273. fclr c15
  274. lds $f31, 4 * SIZE(C1)
  275. fclr c04
  276. lda L, -2(TMP1)
  277. fclr c08
  278. lds $f31, 4 * SIZE(C2)
  279. fclr c12
  280. fclr c16
  281. ble L, $L15
  282. #endif
  283. .align 5
  284. $L12:
  285. /* 1 */
  286. ADD1 c11, t1, c11
  287. #ifndef EV4
  288. ldq $31, PREFETCHSIZE * SIZE(AO)
  289. #else
  290. unop
  291. #endif
  292. MUL b1, a1, t1
  293. #ifndef EV4
  294. ldl $31, PREFETCHSIZE * SIZE(BO)
  295. #else
  296. unop
  297. #endif
  298. ADD3 c12, t2, c12
  299. unop
  300. MUL b1, a2, t2
  301. unop
  302. ADD2 c16, t3, c16
  303. unop
  304. MUL b2, a2, t3
  305. LD a5, 0 * SIZE(AO)
  306. ADD4 c15, t4, c15
  307. unop
  308. MUL b2, a1, t4
  309. LD b5, 0 * SIZE(BO)
  310. /* 2 */
  311. ADD1 c01, t1, c01
  312. UNOP
  313. MUL b1, a3, t1
  314. UNOP
  315. ADD3 c02, t2, c02
  316. UNOP
  317. MUL b1, a4, t2
  318. UNOP
  319. ADD2 c06, t3, c06
  320. unop
  321. MUL b2, a4, t3
  322. unop
  323. ADD4 c05, t4, c05
  324. unop
  325. MUL b4, a1, t4
  326. unop
  327. /* 3 */
  328. ADD1 c03, t1, c03
  329. unop
  330. MUL b3, a1, t1
  331. unop
  332. ADD3 c04, t2, c04
  333. unop
  334. MUL b3, a2, t2
  335. unop
  336. ADD2 c08, t3, c08
  337. unop
  338. MUL b4, a2, t3
  339. LD a2, 1 * SIZE(AO)
  340. ADD4 c13, t4, c13
  341. unop
  342. MUL b2, a3, t4
  343. LD b2, 1 * SIZE(BO)
  344. /* 4 */
  345. ADD1 c09, t1, c09
  346. unop
  347. MUL b3, a3, t1
  348. LD a6, 2 * SIZE(AO)
  349. ADD3 c10, t2, c10
  350. unop
  351. MUL b3, a4, t2
  352. LD b3, 2 * SIZE(BO)
  353. ADD2 c14, t3, c14
  354. unop
  355. MUL b4, a4, t3
  356. LD a4, 3 * SIZE(AO)
  357. ADD4 c07, t4, c07
  358. unop
  359. MUL b4, a3, t4
  360. LD b4, 3 * SIZE(BO)
  361. /* 5 */
  362. ADD1 c11, t1, c11
  363. unop
  364. MUL b5, a5, t1
  365. LD a1, 4 * SIZE(AO)
  366. ADD3 c12, t2, c12
  367. lda L, -2(L)
  368. MUL b5, a2, t2
  369. LD b1, 4 * SIZE(BO)
  370. ADD2 c16, t3, c16
  371. unop
  372. MUL b2, a2, t3
  373. unop
  374. ADD4 c15, t4, c15
  375. unop
  376. MUL b2, a5, t4
  377. unop
  378. /* 6 */
  379. ADD1 c01, t1, c01
  380. unop
  381. MUL b5, a6, t1
  382. unop
  383. ADD3 c02, t2, c02
  384. unop
  385. MUL b5, a4, t2
  386. unop
  387. ADD2 c06, t3, c06
  388. unop
  389. MUL b2, a4, t3
  390. unop
  391. ADD4 c05, t4, c05
  392. unop
  393. MUL b4, a5, t4
  394. unop
  395. /* 7 */
  396. ADD1 c03, t1, c03
  397. lda AO, 8 * SIZE(AO)
  398. MUL b3, a5, t1
  399. unop
  400. ADD3 c04, t2, c04
  401. lda BO, 8 * SIZE(BO)
  402. MUL b3, a2, t2
  403. unop
  404. ADD2 c08, t3, c08
  405. unop
  406. MUL b4, a2, t3
  407. LD a2, -3 * SIZE(AO)
  408. ADD4 c13, t4, c13
  409. unop
  410. MUL b2, a6, t4
  411. LD b2, -3 * SIZE(BO)
  412. /* 8 */
  413. ADD1 c09, t1, c09
  414. unop
  415. MUL b3, a6, t1
  416. LD a3, -2 * SIZE(AO)
  417. ADD3 c10, t2, c10
  418. unop
  419. MUL b3, a4, t2
  420. LD b3, -2 * SIZE(BO)
  421. ADD2 c14, t3, c14
  422. unop
  423. MUL b4, a4, t3
  424. LD a4, -1 * SIZE(AO)
  425. ADD4 c07, t4, c07
  426. MUL b4, a6, t4
  427. LD b4, -1 * SIZE(BO)
  428. bgt L, $L12
  429. .align 4
  430. $L15:
  431. ADD1 c11, t1, c11
  432. ldt alpha_r, ALPHA_R
  433. MUL b1, a1, t1
  434. #ifndef TRMMKERNEL
  435. blbs K, $L18
  436. #else
  437. blbs TMP1, $L18
  438. #endif
  439. .align 4
  440. ADD3 c12, t2, c12
  441. MUL b1, a2, t2
  442. ADD2 c16, t3, c16
  443. MUL b2, a2, t3
  444. ADD4 c15, t4, c15
  445. MUL b2, a1, t4
  446. ADD1 c01, t1, c01
  447. MUL b1, a3, t1
  448. ADD3 c02, t2, c02
  449. unop
  450. MUL b1, a4, t2
  451. LD b1, 0 * SIZE(BO)
  452. ADD2 c06, t3, c06
  453. MUL b2, a4, t3
  454. ADD4 c05, t4, c05
  455. MUL b4, a1, t4
  456. ADD1 c03, t1, c03
  457. unop
  458. MUL b3, a1, t1
  459. LD a1, 0 * SIZE(AO)
  460. ADD3 c04, t2, c04
  461. unop
  462. MUL b3, a2, t2
  463. unop
  464. ADD2 c08, t3, c08
  465. unop
  466. MUL b4, a2, t3
  467. LD a2, 1 * SIZE(AO)
  468. ADD4 c13, t4, c13
  469. unop
  470. MUL b2, a3, t4
  471. LD b2, 1 * SIZE(BO)
  472. ADD1 c09, t1, c09
  473. unop
  474. MUL b3, a3, t1
  475. lda AO, 4 * SIZE(AO)
  476. ADD3 c10, t2, c10
  477. unop
  478. MUL b3, a4, t2
  479. LD b3, 2 * SIZE(BO)
  480. ADD2 c14, t3, c14
  481. unop
  482. MUL b4, a4, t3
  483. LD a4, -1 * SIZE(AO)
  484. ADD4 c07, t4, c07
  485. unop
  486. MUL b4, a3, t4
  487. LD a3, -2 * SIZE(AO)
  488. ADD1 c11, t1, c11
  489. LD b4, 3 * SIZE(BO)
  490. MUL b1, a1, t1
  491. lda BO, 4 * SIZE(BO)
  492. .align 4
  493. $L18:
  494. ADD3 c12, t2, c12
  495. unop
  496. MUL b1, a2, t2
  497. ldt alpha_i, ALPHA_I
  498. ADD2 c16, t3, c16
  499. unop
  500. MUL b2, a2, t3
  501. #ifndef TRMMKERNEL
  502. LD a5, 0 * SIZE(C1)
  503. #else
  504. unop
  505. #endif
  506. ADD4 c15, t4, c15
  507. MUL b2, a1, t4
  508. ADD1 c01, t1, c01
  509. MUL b1, a3, t1
  510. ADD3 c02, t2, c02
  511. unop
  512. MUL b1, a4, t2
  513. #ifndef TRMMKERNEL
  514. LD b1, 1 * SIZE(C1)
  515. #else
  516. unop
  517. #endif
  518. ADD2 c06, t3, c06
  519. MUL b2, a4, t3
  520. ADD4 c05, t4, c05
  521. MUL b4, a1, t4
  522. ADD1 c03, t1, c03
  523. unop
  524. MUL b3, a1, t1
  525. #ifndef TRMMKERNEL
  526. LD a1, 2 * SIZE(C1)
  527. #else
  528. unop
  529. #endif
  530. ADD3 c04, t2, c04
  531. unop
  532. MUL b3, a2, t2
  533. unop
  534. ADD2 c08, t3, c08
  535. unop
  536. MUL b4, a2, t3
  537. #ifndef TRMMKERNEL
  538. LD a2, 3 * SIZE(C1)
  539. #else
  540. unop
  541. #endif
  542. ADD4 c13, t4, c13
  543. unop
  544. MUL b2, a3, t4
  545. #ifndef TRMMKERNEL
  546. LD b2, 0 * SIZE(C2)
  547. #else
  548. unop
  549. #endif
  550. ADD1 c09, t1, c09
  551. lda I, -1(I)
  552. MUL b3, a3, t1
  553. unop
  554. ADD3 c10, t2, c10
  555. unop
  556. MUL b3, a4, t2
  557. #ifndef TRMMKERNEL
  558. LD b3, 1 * SIZE(C2)
  559. #else
  560. unop
  561. #endif
  562. ADD2 c14, t3, c14
  563. unop
  564. MUL b4, a4, t3
  565. #ifndef TRMMKERNEL
  566. LD a4, 2 * SIZE(C2)
  567. #else
  568. unop
  569. #endif
  570. ADD4 c07, t4, c07
  571. unop
  572. MUL b4, a3, t4
  573. #ifndef TRMMKERNEL
  574. LD a3, 3 * SIZE(C2)
  575. #else
  576. unop
  577. #endif
  578. ADD1 c11, t1, c11
  579. ADD3 c12, t2, c12
  580. ADD2 c16, t3, c16
  581. ADD4 c15, t4, c15
  582. ADD c01, c06, c01
  583. ADD c02, c05, c02
  584. ADD c03, c08, c03
  585. ADD c04, c07, c04
  586. ADD c09, c14, c09
  587. MUL alpha_r, c01, t1
  588. ADD c10, c13, c10
  589. MUL alpha_r, c02, t2
  590. ADD c11, c16, c11
  591. MUL alpha_r, c03, t3
  592. ADD c12, c15, c12
  593. MUL alpha_r, c04, t4
  594. #ifndef TRMMKERNEL
  595. ADD a5, t1, a5
  596. MUL alpha_i, c02, t1
  597. ADD b1, t2, b1
  598. MUL alpha_i, c01, t2
  599. ADD a1, t3, a1
  600. MUL alpha_i, c04, t3
  601. ADD a2, t4, a2
  602. MUL alpha_i, c03, t4
  603. #else
  604. ADD $f31, t1, a5
  605. MUL alpha_i, c02, t1
  606. ADD $f31, t2, b1
  607. MUL alpha_i, c01, t2
  608. ADD $f31, t3, a1
  609. MUL alpha_i, c04, t3
  610. ADD $f31, t4, a2
  611. MUL alpha_i, c03, t4
  612. #endif
  613. SUB a5, t1, a5
  614. MUL alpha_r, c09, t1
  615. ADD b1, t2, b1
  616. MUL alpha_r, c10, t2
  617. SUB a1, t3, a1
  618. MUL alpha_r, c11, t3
  619. ADD a2, t4, a2
  620. MUL alpha_r, c12, t4
  621. #ifndef TRMMKERNEL
  622. ADD b2, t1, b2
  623. MUL alpha_i, c10, t1
  624. ADD b3, t2, b3
  625. MUL alpha_i, c09, t2
  626. ADD a4, t3, a4
  627. MUL alpha_i, c12, t3
  628. ADD a3, t4, a3
  629. MUL alpha_i, c11, t4
  630. #else
  631. ADD $f31, t1, b2
  632. MUL alpha_i, c10, t1
  633. ADD $f31, t2, b3
  634. MUL alpha_i, c09, t2
  635. ADD $f31, t3, a4
  636. MUL alpha_i, c12, t3
  637. ADD $f31, t4, a3
  638. MUL alpha_i, c11, t4
  639. #endif
  640. SUB b2, t1, b2
  641. ST a5, 0 * SIZE(C1)
  642. fclr t1
  643. unop
  644. ADD b3, t2, b3
  645. ST b1, 1 * SIZE(C1)
  646. fclr t2
  647. unop
  648. SUB a4, t3, a4
  649. ST a1, 2 * SIZE(C1)
  650. fclr t3
  651. unop
  652. ADD a3, t4, a3
  653. ST a2, 3 * SIZE(C1)
  654. fclr t4
  655. unop
  656. ST b2, 0 * SIZE(C2)
  657. fclr c01
  658. ST b3, 1 * SIZE(C2)
  659. fclr c05
  660. ST a4, 2 * SIZE(C2)
  661. lda C1, 4 * SIZE(C1)
  662. ST a3, 3 * SIZE(C2)
  663. lda C2, 4 * SIZE(C2)
  664. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  665. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  666. subq K, KK, TMP1
  667. #ifdef LEFT
  668. subq TMP1, 2, TMP1
  669. #else
  670. subq TMP1, 2, TMP1
  671. #endif
  672. sll TMP1, ZBASE_SHIFT + 1, TMP1
  673. addq AO, TMP1, AO
  674. addq BO, TMP1, BO
  675. #endif
  676. #if defined(TRMMKERNEL) && defined(LEFT)
  677. addq KK, 2, KK
  678. #endif
  679. bgt I, $L11
  680. .align 4
  681. $L20:
  682. and M, 1, I
  683. ble I, $L29
  684. #if !defined(TRMMKERNEL) || \
  685. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  686. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  687. #ifdef TRMMKERNEL
  688. #ifdef LEFT
  689. addq KK, 1, TMP1
  690. #else
  691. addq KK, 2, TMP1
  692. #endif
  693. #endif
  694. LD a1, 0 * SIZE(AO)
  695. fclr c09
  696. LD a2, 1 * SIZE(AO)
  697. fclr c13
  698. LD a3, 2 * SIZE(AO)
  699. fclr c02
  700. LD a4, 3 * SIZE(AO)
  701. fclr c06
  702. LD b1, 0 * SIZE(B)
  703. fclr c10
  704. LD b2, 1 * SIZE(B)
  705. fclr c14
  706. LD b3, 2 * SIZE(B)
  707. lda AO, 2 * SIZE(AO)
  708. LD b4, 3 * SIZE(B)
  709. lda BO, 4 * SIZE(B)
  710. #ifndef TRMMKERNEL
  711. lda L, -2(K)
  712. #else
  713. lda L, -2(TMP1)
  714. #endif
  715. ble L, $L25
  716. #else
  717. sll KK, ZBASE_SHIFT + 0, TMP1
  718. addq AO, TMP1, AO
  719. sll KK, ZBASE_SHIFT + 1, TMP1
  720. addq B, TMP1, BO
  721. subq K, KK, TMP1
  722. LD a1, 0 * SIZE(AO)
  723. fclr c09
  724. LD a2, 1 * SIZE(AO)
  725. fclr c13
  726. LD a3, 2 * SIZE(AO)
  727. fclr c02
  728. LD a4, 3 * SIZE(AO)
  729. fclr c06
  730. LD b1, 0 * SIZE(BO)
  731. fclr c10
  732. LD b2, 1 * SIZE(BO)
  733. fclr c14
  734. LD b3, 2 * SIZE(BO)
  735. lda AO, 2 * SIZE(AO)
  736. LD b4, 3 * SIZE(BO)
  737. lda BO, 4 * SIZE(BO)
  738. lda L, -2(TMP1)
  739. ble L, $L25
  740. #endif
  741. .align 5
  742. $L22:
  743. ADD1 c09, t1, c09
  744. unop
  745. MUL a1, b1, t1
  746. unop
  747. ADD3 c10, t2, c10
  748. unop
  749. MUL a2, b1, t2
  750. LD b1, 0 * SIZE(BO)
  751. ADD4 c13, t3, c13
  752. unop
  753. MUL a1, b2, t3
  754. lda BO, 8 * SIZE(BO)
  755. ADD2 c14, t4, c14
  756. unop
  757. MUL a2, b2, t4
  758. LD b2, -7 * SIZE(BO)
  759. ADD1 c01, t1, c01
  760. unop
  761. MUL a1, b3, t1
  762. unop
  763. ADD3 c02, t2, c02
  764. unop
  765. MUL a2, b3, t2
  766. LD b3, -6 * SIZE(BO)
  767. ADD4 c05, t3, c05
  768. unop
  769. MUL a1, b4, t3
  770. LD a1, 2 * SIZE(AO)
  771. ADD2 c06, t4, c06
  772. MUL a2, b4, t4
  773. LD b5, -5 * SIZE(BO)
  774. ADD1 c09, t1, c09
  775. unop
  776. MUL a3, b1, t1
  777. LD a2, 3 * SIZE(AO)
  778. ADD3 c10, t2, c10
  779. unop
  780. MUL a4, b1, t2
  781. LD b1, -4 * SIZE(BO)
  782. ADD4 c13, t3, c13
  783. unop
  784. MUL a3, b2, t3
  785. lda AO, 4 * SIZE(AO)
  786. ADD2 c14, t4, c14
  787. MUL a4, b2, t4
  788. LD b2, -3 * SIZE(BO)
  789. ADD1 c01, t1, c01
  790. lda L, -2(L)
  791. MUL a3, b3, t1
  792. LD b4, -1 * SIZE(BO)
  793. ADD3 c02, t2, c02
  794. unop
  795. MUL a4, b3, t2
  796. LD b3, -2 * SIZE(BO)
  797. ADD4 c05, t3, c05
  798. unop
  799. MUL a3, b5, t3
  800. LD a3, 0 * SIZE(AO)
  801. ADD2 c06, t4, c06
  802. MUL a4, b5, t4
  803. LD a4, 1 * SIZE(AO)
  804. bgt L, $L22
  805. .align 4
  806. $L25:
  807. ADD1 c09, t1, c09
  808. ldt alpha_r, ALPHA_R
  809. MUL a1, b1, t1
  810. #ifndef TRMMKERNEL
  811. blbs K, $L28
  812. #else
  813. blbs TMP1, $L28
  814. #endif
  815. .align 4
  816. ADD3 c10, t2, c10
  817. unop
  818. MUL a2, b1, t2
  819. LD b1, 0 * SIZE(BO)
  820. ADD4 c13, t3, c13
  821. unop
  822. MUL a1, b2, t3
  823. unop
  824. ADD2 c14, t4, c14
  825. unop
  826. MUL a2, b2, t4
  827. LD b2, 1 * SIZE(BO)
  828. ADD1 c01, t1, c01
  829. unop
  830. MUL a1, b3, t1
  831. lda AO, 2 * SIZE(AO)
  832. ADD3 c02, t2, c02
  833. unop
  834. MUL a2, b3, t2
  835. LD b3, 2 * SIZE(BO)
  836. ADD4 c05, t3, c05
  837. unop
  838. MUL a1, b4, t3
  839. LD a1, -2 * SIZE(AO)
  840. ADD2 c06, t4, c06
  841. unop
  842. MUL a2, b4, t4
  843. LD a2, -1 * SIZE(AO)
  844. ADD1 c09, t1, c09
  845. LD b4, 3 * SIZE(BO)
  846. MUL a1, b1, t1
  847. lda BO, 4 * SIZE(BO)
  848. .align 4
  849. $L28:
  850. ADD3 c10, t2, c10
  851. unop
  852. MUL a2, b1, t2
  853. ldt alpha_i, ALPHA_I
  854. ADD4 c13, t3, c13
  855. unop
  856. MUL a1, b2, t3
  857. #ifndef TRMMKERNEL
  858. LD c03, 0 * SIZE(C1)
  859. #else
  860. unop
  861. #endif
  862. ADD2 c14, t4, c14
  863. unop
  864. MUL a2, b2, t4
  865. #ifndef TRMMKERNEL
  866. LD c04, 1 * SIZE(C1)
  867. #else
  868. unop
  869. #endif
  870. ADD1 c01, t1, c01
  871. unop
  872. MUL a1, b3, t1
  873. #ifndef TRMMKERNEL
  874. LD c11, 0 * SIZE(C2)
  875. #else
  876. unop
  877. #endif
  878. ADD3 c02, t2, c02
  879. unop
  880. MUL a2, b3, t2
  881. #ifndef TRMMKERNEL
  882. LD c12, 1 * SIZE(C2)
  883. #else
  884. unop
  885. #endif
  886. ADD4 c05, t3, c05
  887. MUL a1, b4, t3
  888. ADD2 c06, t4, c06
  889. MUL a2, b4, t4
  890. ADD1 c09, t1, c09
  891. ADD3 c10, t2, c10
  892. ADD4 c13, t3, c13
  893. ADD2 c14, t4, c14
  894. ADD c01, c06, c01
  895. ADD c02, c05, c02
  896. ADD c09, c14, c09
  897. ADD c10, c13, c10
  898. MUL alpha_r, c01, t1
  899. MUL alpha_r, c02, t2
  900. MUL alpha_r, c09, t3
  901. MUL alpha_r, c10, t4
  902. #ifndef TRMMKERNEL
  903. ADD c03, t1, c03
  904. MUL alpha_i, c02, t1
  905. ADD c04, t2, c04
  906. MUL alpha_i, c01, t2
  907. ADD c11, t3, c11
  908. MUL alpha_i, c10, t3
  909. ADD c12, t4, c12
  910. MUL alpha_i, c09, t4
  911. #else
  912. ADD $f31, t1, c03
  913. MUL alpha_i, c02, t1
  914. ADD $f31, t2, c04
  915. MUL alpha_i, c01, t2
  916. ADD $f31, t3, c11
  917. MUL alpha_i, c10, t3
  918. ADD $f31, t4, c12
  919. MUL alpha_i, c09, t4
  920. #endif
  921. SUB c03, t1, c03
  922. ADD c04, t2, c04
  923. SUB c11, t3, c11
  924. ADD c12, t4, c12
  925. ST c03, 0 * SIZE(C1)
  926. ST c04, 1 * SIZE(C1)
  927. ST c11, 0 * SIZE(C2)
  928. ST c12, 1 * SIZE(C2)
  929. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  930. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  931. subq K, KK, TMP1
  932. #ifdef LEFT
  933. subq TMP1, 1, TMP1
  934. #else
  935. subq TMP1, 2, TMP1
  936. #endif
  937. sll TMP1, ZBASE_SHIFT + 0, TMP2
  938. addq AO, TMP2, AO
  939. sll TMP1, ZBASE_SHIFT + 1, TMP2
  940. addq BO, TMP2, BO
  941. #endif
  942. #if defined(TRMMKERNEL) && defined(LEFT)
  943. addq KK, 1, KK
  944. #endif
  945. .align 4
  946. $L29:
  947. mov BO, B
  948. lda J, -1(J)
  949. #if defined(TRMMKERNEL) && !defined(LEFT)
  950. addq KK, 2, KK
  951. #else
  952. unop
  953. #endif
  954. bgt J, $L01
  955. .align 4
  956. $L30:
  957. and N, 1, J
  958. ble J, $L999
  959. mov C, C1
  960. mov A, AO
  961. #if defined(TRMMKERNEL) && defined(LEFT)
  962. mov OFFSET, KK
  963. #endif
  964. sra M, 1, I
  965. ble I, $L50
  966. .align 4
  967. $L41:
  968. #if !defined(TRMMKERNEL) || \
  969. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  970. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  971. #ifdef TRMMKERNEL
  972. #ifdef LEFT
  973. addq KK, 2, TMP1
  974. #else
  975. addq KK, 1, TMP1
  976. #endif
  977. #endif
  978. LD a1, 0 * SIZE(AO)
  979. fclr t1
  980. LD a2, 1 * SIZE(AO)
  981. fclr t2
  982. LD a3, 2 * SIZE(AO)
  983. fclr t3
  984. LD a4, 3 * SIZE(AO)
  985. fclr t4
  986. LD b1, 0 * SIZE(B)
  987. fclr c01
  988. LD b2, 1 * SIZE(B)
  989. fclr c05
  990. LD b3, 2 * SIZE(B)
  991. fclr c02
  992. LD b4, 3 * SIZE(B)
  993. fclr c06
  994. lda BO, 2 * SIZE(B)
  995. fclr c03
  996. lda AO, 4 * SIZE(AO)
  997. fclr c07
  998. #ifndef TRMMKERNEL
  999. lda L, -2(K)
  1000. #else
  1001. lda L, -2(TMP1)
  1002. #endif
  1003. fclr c04
  1004. fclr c08
  1005. ble L, $L45
  1006. #else
  1007. sll KK, ZBASE_SHIFT + 1, TMP1
  1008. addq AO, TMP1, AO
  1009. sll KK, ZBASE_SHIFT + 0, TMP1
  1010. addq B, TMP1, BO
  1011. subq K, KK, TMP1
  1012. LD a1, 0 * SIZE(AO)
  1013. fclr t1
  1014. LD a2, 1 * SIZE(AO)
  1015. fclr t2
  1016. LD a3, 2 * SIZE(AO)
  1017. fclr t3
  1018. LD a4, 3 * SIZE(AO)
  1019. fclr t4
  1020. LD b1, 0 * SIZE(BO)
  1021. fclr c01
  1022. LD b2, 1 * SIZE(BO)
  1023. fclr c05
  1024. LD b3, 2 * SIZE(BO)
  1025. fclr c02
  1026. LD b4, 3 * SIZE(BO)
  1027. fclr c06
  1028. lda BO, 2 * SIZE(BO)
  1029. fclr c03
  1030. lda AO, 4 * SIZE(AO)
  1031. fclr c07
  1032. lda L, -2(TMP1)
  1033. fclr c04
  1034. fclr c08
  1035. ble L, $L45
  1036. #endif
  1037. .align 5
  1038. $L42:
  1039. ADD4 c05, t1, c05
  1040. unop
  1041. MUL a1, b1, t1
  1042. unop
  1043. ADD2 c06, t2, c06
  1044. lda L, -2(L)
  1045. MUL a2, b1, t2
  1046. unop
  1047. ADD4 c07, t3, c07
  1048. unop
  1049. MUL a3, b1, t3
  1050. unop
  1051. ADD2 c08, t4, c08
  1052. unop
  1053. MUL a4, b1, t4
  1054. LD b1, 2 * SIZE(BO)
  1055. ADD1 c01, t1, c01
  1056. unop
  1057. MUL a1, b2, t1
  1058. LD a1, 0 * SIZE(AO)
  1059. ADD3 c02, t2, c02
  1060. lda BO, 4 * SIZE(BO)
  1061. MUL a2, b2, t2
  1062. LD a2, 1 * SIZE(AO)
  1063. ADD1 c03, t3, c03
  1064. unop
  1065. MUL a3, b2, t3
  1066. LD a3, 2 * SIZE(AO)
  1067. ADD3 c04, t4, c04
  1068. unop
  1069. MUL a4, b2, t4
  1070. LD a5, 3 * SIZE(AO)
  1071. ADD4 c05, t1, c05
  1072. unop
  1073. MUL a1, b3, t1
  1074. LD b2, -1 * SIZE(BO)
  1075. ADD2 c06, t2, c06
  1076. unop
  1077. MUL a2, b3, t2
  1078. unop
  1079. ADD4 c07, t3, c07
  1080. unop
  1081. MUL a3, b3, t3
  1082. lda AO, 8 * SIZE(AO)
  1083. ADD2 c08, t4, c08
  1084. unop
  1085. MUL a5, b3, t4
  1086. LD b3, 0 * SIZE(BO)
  1087. ADD1 c01, t1, c01
  1088. unop
  1089. MUL a1, b4, t1
  1090. LD a1, -4 * SIZE(AO)
  1091. ADD3 c02, t2, c02
  1092. unop
  1093. MUL a2, b4, t2
  1094. LD a2, -3 * SIZE(AO)
  1095. ADD1 c03, t3, c03
  1096. LD a4, -1 * SIZE(AO)
  1097. MUL a3, b4, t3
  1098. LD a3, -2 * SIZE(AO)
  1099. ADD3 c04, t4, c04
  1100. MUL a5, b4, t4
  1101. LD b4, 1 * SIZE(BO)
  1102. bgt L, $L42
  1103. .align 4
  /* $L45: start draining the multiply pipeline and load alpha_r from
     the stack slot ALPHA_R.  blbs (branch if low bit set) jumps to the
     final drain at $L48 when the K count (K, or TMP1 for TRMM) is odd;
     otherwise fall through and consume one more K iteration so the
     total processed matches K. */
  1104. $L45:
  1105. ADD4 c05, t1, c05
  1106. ldt alpha_r, ALPHA_R
  1107. MUL b1, a1, t1
  1108. #ifndef TRMMKERNEL
  1109. blbs K, $L48
  1110. #else
  1111. blbs TMP1, $L48
  1112. #endif
  1113. .align 4
  /* Fall-through: one full extra K iteration (A(0..3) * b1/b2),
     same ADDn/MUL interleave as one half of the $L42 body. */
  1114. ADD2 c06, t2, c06
  1115. MUL a2, b1, t2
  1116. ADD4 c07, t3, c07
  1117. MUL a3, b1, t3
  1118. ADD2 c08, t4, c08
  1119. unop
  1120. MUL a4, b1, t4
  1121. LD b1, 0 * SIZE(BO)
  1122. ADD1 c01, t1, c01
  1123. unop
  1124. MUL a1, b2, t1
  1125. LD a1, 0 * SIZE(AO)
  1126. ADD3 c02, t2, c02
  1127. unop
  1128. MUL a2, b2, t2
  1129. LD a2, 1 * SIZE(AO)
  1130. ADD1 c03, t3, c03
  1131. unop
  1132. MUL a3, b2, t3
  1133. LD a3, 2 * SIZE(AO)
  1134. ADD3 c04, t4, c04
  1135. MUL a4, b2, t4
  1136. LD a4, 3 * SIZE(AO)
  1137. lda AO, 4 * SIZE(AO)
  1138. ADD4 c05, t1, c05
  1139. LD b2, 1 * SIZE(BO)
  1140. MUL a1, b1, t1
  1141. lda BO, 2 * SIZE(BO)
  1142. .align 4
  /* $L48: final pipeline drain, then combine, scale and store.
     - Drain the last t1..t4 products into c01..c08.
     - Non-TRMM: load the existing C values c09..c12 (interleaved with
       the drain arithmetic); TRMM: keep the slots as unop so the
       schedule shape is preserved.
     - Fold the partial accumulators pairwise (c01+=c06 etc.), then
       perform the complex alpha scaling:
         re' = alpha_r*re - alpha_i*im,  im' = alpha_r*im + alpha_i*re
       (for non-TRMM this is accumulated onto the loaded C values; for
       TRMM it overwrites, seeded from $f31 which reads as zero).
     - Store the 2 complex results, advance C1, do the TRMM AO/BO/KK
       bookkeeping, and loop back over the M strip counter I. */
  1143. $L48:
  1144. ADD2 c06, t2, c06
  1145. unop
  1146. MUL a2, b1, t2
  1147. ldt alpha_i, ALPHA_I
  1148. ADD4 c07, t3, c07
  1149. lda I, -1(I)
  1150. MUL a3, b1, t3
  1151. #ifndef TRMMKERNEL
  1152. LD c09, 0 * SIZE(C1)
  1153. #else
  1154. unop
  1155. #endif
  1156. ADD2 c08, t4, c08
  1157. unop
  1158. MUL a4, b1, t4
  1159. #ifndef TRMMKERNEL
  1160. LD c10, 1 * SIZE(C1)
  1161. #else
  1162. unop
  1163. #endif
  1164. ADD1 c01, t1, c01
  1165. unop
  1166. MUL a1, b2, t1
  1167. #ifndef TRMMKERNEL
  1168. LD c11, 2 * SIZE(C1)
  1169. #else
  1170. unop
  1171. #endif
  1172. ADD3 c02, t2, c02
  1173. unop
  1174. MUL a2, b2, t2
  1175. #ifndef TRMMKERNEL
  1176. LD c12, 3 * SIZE(C1)
  1177. #else
  1178. unop
  1179. #endif
  1180. ADD1 c03, t3, c03
  1181. MUL a3, b2, t3
  1182. ADD3 c04, t4, c04
  1183. MUL a4, b2, t4
  1184. ADD4 c05, t1, c05
  1185. ADD2 c06, t2, c06
  1186. ADD4 c07, t3, c07
  1187. ADD2 c08, t4, c08
  /* Fold the two accumulator banks into the final re/im sums. */
  1188. ADD c01, c06, c01
  1189. ADD c02, c05, c02
  1190. ADD c03, c08, c03
  1191. ADD c04, c07, c04
  1192. MUL alpha_r, c01, t1
  1193. MUL alpha_r, c02, t2
  1194. MUL alpha_r, c03, t3
  1195. MUL alpha_r, c04, t4
  1196. #ifndef TRMMKERNEL
  /* C += alpha * result (beta path handled elsewhere). */
  1197. ADD c09, t1, c09
  1198. MUL alpha_i, c02, t1
  1199. ADD c10, t2, c10
  1200. MUL alpha_i, c01, t2
  1201. ADD c11, t3, c11
  1202. MUL alpha_i, c04, t3
  1203. ADD c12, t4, c12
  1204. MUL alpha_i, c03, t4
  1205. #else
  /* TRMM: C = alpha * result; $f31 is the hard-wired zero register. */
  1206. ADD $f31, t1, c09
  1207. MUL alpha_i, c02, t1
  1208. ADD $f31, t2, c10
  1209. MUL alpha_i, c01, t2
  1210. ADD $f31, t3, c11
  1211. MUL alpha_i, c04, t3
  1212. ADD $f31, t4, c12
  1213. MUL alpha_i, c03, t4
  1214. #endif
  /* Cross terms of the complex alpha multiply: re -= alpha_i*im,
     im += alpha_i*re. */
  1215. SUB c09, t1, c09
  1216. ADD c10, t2, c10
  1217. SUB c11, t3, c11
  1218. ADD c12, t4, c12
  1219. ST c09, 0 * SIZE(C1)
  1220. ST c10, 1 * SIZE(C1)
  1221. ST c11, 2 * SIZE(C1)
  1222. ST c12, 3 * SIZE(C1)
  1223. lda C1, 4 * SIZE(C1)
  /* TRMM: step AO/BO over the K range this tile did not traverse
     (2 = M unroll, 1 = N unroll; ZBASE_SHIFT scales for complex). */
  1224. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1225. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1226. subq K, KK, TMP1
  1227. #ifdef LEFT
  1228. subq TMP1, 2, TMP1
  1229. #else
  1230. subq TMP1, 1, TMP1
  1231. #endif
  1232. sll TMP1, ZBASE_SHIFT + 1, TMP2
  1233. addq AO, TMP2, AO
  1234. sll TMP1, ZBASE_SHIFT + 0, TMP2
  1235. addq BO, TMP2, BO
  1236. #endif
  1237. #if defined(TRMMKERNEL) && defined(LEFT)
  1238. addq KK, 2, KK
  1239. #endif
  /* Next 2-wide M strip, if any. */
  1240. bgt I, $L41
  1241. .align 4
  /* $L50: handle the M & 1 remainder (a single complex column element).
     Two mirrored set-up paths:
       - non-TRMM (or the TRMM transposition cases): stream B from its
         base pointer, loop count from K (or TMP1);
       - remaining TRMM cases: first offset AO/BO by KK elements, loop
         count = K - KK.
     Both clear the t/c accumulators, pre-load 4 A and 4 B values, and
     skip the unrolled loop ($L52) when fewer than 2 K iterations
     remain. */
  1242. $L50:
  1243. and M, 1, I
  1244. ble I, $L999
  1245. #if !defined(TRMMKERNEL) || \
  1246. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1247. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1248. #ifdef TRMMKERNEL
  /* Both branches add 1: the M unroll and N unroll are both 1 here,
     so LEFT and !LEFT coincide; kept for symmetry with other tiles. */
  1249. #ifdef LEFT
  1250. addq KK, 1, TMP1
  1251. #else
  1252. addq KK, 1, TMP1
  1253. #endif
  1254. #endif
  1255. LD a1, 0 * SIZE(AO)
  1256. fclr t1
  1257. LD a2, 1 * SIZE(AO)
  1258. fclr t2
  1259. LD a3, 2 * SIZE(AO)
  1260. fclr t3
  1261. LD a4, 3 * SIZE(AO)
  1262. fclr t4
  1263. LD b1, 0 * SIZE(B)
  1264. fclr c01
  1265. LD b2, 1 * SIZE(B)
  1266. fclr c05
  1267. LD b3, 2 * SIZE(B)
  1268. fclr c02
  1269. LD b4, 3 * SIZE(B)
  1270. fclr c06
  1271. lda AO, 2 * SIZE(AO)
  1272. lda BO, 2 * SIZE(B)
  1273. #ifndef TRMMKERNEL
  1274. lda L, -2(K)
  1275. #else
  1276. lda L, -2(TMP1)
  1277. #endif
  1278. ble L, $L55
  1279. #else
  /* TRMM variant: skip the first KK complex elements of A and B. */
  1280. sll KK, ZBASE_SHIFT + 0, TMP1
  1281. addq AO, TMP1, AO
  1282. addq B, TMP1, BO
  1283. subq K, KK, TMP1
  1284. LD a1, 0 * SIZE(AO)
  1285. fclr t1
  1286. LD a2, 1 * SIZE(AO)
  1287. fclr t2
  1288. LD a3, 2 * SIZE(AO)
  1289. fclr t3
  1290. LD a4, 3 * SIZE(AO)
  1291. fclr t4
  1292. LD b1, 0 * SIZE(BO)
  1293. fclr c01
  1294. LD b2, 1 * SIZE(BO)
  1295. fclr c05
  1296. LD b3, 2 * SIZE(BO)
  1297. fclr c02
  1298. LD b4, 3 * SIZE(BO)
  1299. fclr c06
  1300. lda AO, 2 * SIZE(AO)
  1301. lda BO, 2 * SIZE(BO)
  1302. lda L, -2(TMP1)
  1303. ble L, $L55
  1304. #endif
  1305. .align 5
  1305. .align 5
  1306. $L52:
  1307. ADD1 c01, t1, c01
  1308. unop
  1309. MUL a1, b1, t1
  1310. unop
  1311. ADD3 c02, t2, c02
  1312. lda AO, 4 * SIZE(AO)
  1313. MUL a2, b1, t2
  1314. LD b1, 2 * SIZE(BO)
  1315. ADD4 c05, t3, c05
  1316. lda L, -2(L)
  1317. MUL a1, b2, t3
  1318. LD a1, -2 * SIZE(AO)
  1319. ADD2 c06, t4, c06
  1320. unop
  1321. MUL a2, b2, t4
  1322. LD a2, -1 * SIZE(AO)
  1323. ADD1 c01, t1, c01
  1324. LD b2, 3 * SIZE(BO)
  1325. MUL a3, b3, t1
  1326. lda BO, 4 * SIZE(BO)
  1327. ADD3 c02, t2, c02
  1328. unop
  1329. MUL a4, b3, t2
  1330. LD b3, 0 * SIZE(BO)
  1331. ADD4 c05, t3, c05
  1332. unop
  1333. MUL a3, b4, t3
  1334. LD a3, 0 * SIZE(AO)
  1335. ADD2 c06, t4, c06
  1336. MUL a4, b4, t4
  1337. LD b4, 1 * SIZE(BO)
  1338. unop
  1339. LD a4, 1 * SIZE(AO)
  1340. unop
  1341. unop
  1342. bgt L, $L52
  1343. .align 4
  /* $L55: drain for the M=1 loop; load alpha_r.  blbs (branch if low
     bit set) goes straight to $L58 when the K count is odd, otherwise
     the fall-through consumes one more K iteration. */
  1344. $L55:
  1345. ADD1 c01, t1, c01
  1346. ldt alpha_r, ALPHA_R
  1347. MUL a1, b1, t1
  1348. #ifndef TRMMKERNEL
  1349. blbs K, $L58
  1350. #else
  1351. blbs TMP1, $L58
  1352. #endif
  1353. .align 4
  /* Fall-through: one extra K iteration (a1/a2 * b1/b2). */
  1354. ADD3 c02, t2, c02
  1355. unop
  1356. MUL a2, b1, t2
  1357. LD b1, 0 * SIZE(BO)
  1358. ADD4 c05, t3, c05
  1359. lda BO, 2 * SIZE(BO)
  1360. MUL a1, b2, t3
  1361. LD a1, 0 * SIZE(AO)
  1362. ADD2 c06, t4, c06
  1363. unop
  1364. MUL a2, b2, t4
  1365. LD a2, 1 * SIZE(AO)
  1366. ADD1 c01, t1, c01
  1367. LD b2, -1 * SIZE(BO)
  1368. MUL a1, b1, t1
  1369. lda AO, 2 * SIZE(AO)
  1370. .align 4
  /* $L58: final drain for the M=1 tile, then complex alpha scaling and
     store of one complex element.
       re' = alpha_r*re - alpha_i*im,  im' = alpha_r*im + alpha_i*re
     Non-TRMM accumulates onto the loaded C values c03/c04; TRMM seeds
     from $f31 (hard-wired zero) and overwrites C. */
  1371. $L58:
  1372. ADD3 c02, t2, c02
  1373. unop
  1374. MUL a2, b1, t2
  1375. ldt alpha_i, ALPHA_I
  1376. ADD4 c05, t3, c05
  1377. unop
  1378. MUL a1, b2, t3
  1379. #ifndef TRMMKERNEL
  1380. LD c03, 0 * SIZE(C1)
  1381. #else
  1382. unop
  1383. #endif
  1384. ADD2 c06, t4, c06
  1385. unop
  1386. MUL a2, b2, t4
  1387. #ifndef TRMMKERNEL
  1388. LD c04, 1 * SIZE(C1)
  1389. #else
  1390. unop
  1391. #endif
  1392. ADD1 c01, t1, c01
  1393. ADD3 c02, t2, c02
  1394. ADD4 c05, t3, c05
  1395. ADD2 c06, t4, c06
  /* Fold the accumulator banks into the final re/im sums. */
  1396. ADD c01, c06, c01
  1397. ADD c02, c05, c02
  1398. MUL alpha_r, c01, t1
  1399. MUL alpha_r, c02, t2
  1400. MUL alpha_i, c02, t3
  1401. MUL alpha_i, c01, t4
  1402. #ifndef TRMMKERNEL
  1403. ADD c03, t1, c03
  1404. ADD c04, t2, c04
  1405. #else
  1406. ADD $f31, t1, c03
  1407. ADD $f31, t2, c04
  1408. #endif
  1409. SUB c03, t3, c03
  1410. ADD c04, t4, c04
  1411. ST c03, 0 * SIZE(C1)
  1412. ST c04, 1 * SIZE(C1)
  1413. .align 4
  /* $L999: function epilogue.  Restore the callee-saved floating-point
     registers $f2..$f9 that the prologue (outside this chunk) spilled,
     set the integer return value to 0 ($0), pop the stack frame, and
     return to the caller. */
  1414. $L999:
  1415. ldt $f2, 0($sp)
  1416. ldt $f3, 8($sp)
  1417. ldt $f4, 16($sp)
  1418. ldt $f5, 24($sp)
  1419. ldt $f6, 32($sp)
  1420. ldt $f7, 40($sp)
  1421. ldt $f8, 48($sp)
  1422. ldt $f9, 56($sp)
  /* clr $0: return status 0 (success). */
  1423. clr $0
  1424. lda $sp, STACKSIZE($sp)
  1425. ret
  1426. .ident VERSION
  1427. .end CNAME