You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemm3m_kernel.S 34 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359
  1. /***************************************************************************
  2. Copyright (c) 2020, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  /* Symbolic names for the integer and floating-point registers used by this kernel. */
  /* M, N, K: size arguments; the loop counters I, J and L below are derived from */
  /* M, N and K respectively with srai.d / andi. */
  29. #define M $r4
  30. #define N $r5
  31. #define K $r6
  /* A, B: input base pointers. AO is reloaded from A and BO from B before each */
  /* inner product; B is advanced to BO once a column panel is finished. */
  32. #define A $r7
  33. #define B $r8
  /* C: output pointer, moved past each group of columns once the CO pointers are set. */
  /* LDC: stride between the CO pointers; shifted left by ZBASE_SHIFT in the prologue. */
  34. #define C $r9
  35. #define LDC $r10
  /* AO, BO: running read pointers into A and B, bumped inside the K loops. */
  36. #define AO $r12
  37. #define BO $r13
  /* Loop counters: I counts row tiles (from M), J column panels (from N), L K-steps. */
  38. #define I $r17
  39. #define J $r18
  40. #define L $r11
  /* CO1..CO8: pointers into C, each one LDC further than the previous one. */
  /* CO3..CO8 live in $r23-$r28, which the prologue spills to the stack with SDARG. */
  41. #define CO1 $r14
  42. #define CO2 $r15
  43. #define CO3 $r23
  44. #define CO4 $r24
  45. #define CO5 $r25
  46. #define CO6 $r26
  47. #define CO7 $r27
  48. #define CO8 $r28
  /* a1..a4: operands loaded from AO. a3/a4 use $f28/$f29, which the prologue */
  /* stores to the stack with fst.d. */
  49. #define a1 $f22
  50. #define a2 $f8
  51. #define a3 $f28
  52. #define a4 $f29
  /* b1..b8: operands loaded from B/BO. */
  /* NOTE: the write-back sections (.L18, .L28, .L38, .L48) address $f22, $f8, $f23, */
  /* $f9 and $f10-$f13 by raw register number as temporaries for values of C, i.e. */
  /* they overwrite a1, a2 and b1..b6 once the K loops are done. */
  53. #define b1 $f23
  54. #define b2 $f9
  55. #define b3 $f10
  56. #define b4 $f11
  57. #define b5 $f12
  58. #define b6 $f13
  59. #define b7 $f14
  60. #define b8 $f15
  /* a5 shares b8's register ($f15); in .L51 it is loaded from AO. */
  61. #define a5 b8
  /* cXY: accumulators, cleared with MTC/MOV and updated with MADD. */
  /* In the write-back, X selects the CO pointer (c1Y -> CO1, c2Y -> CO2, ...) and */
  /* Y selects the first or second pair of elements stored through that pointer. */
  /* c71..c82 use $f24-$f27, which the prologue stores to the stack with fst.d. */
  62. #define c11 $f16
  63. #define c12 $f17
  64. #define c21 $f3
  65. #define c22 $f4
  66. #define c31 $f2
  67. #define c32 $f5
  68. #define c41 $f6
  69. #define c42 $f7
  70. #define c51 $f18
  71. #define c52 $f19
  72. #define c61 $f20
  73. #define c62 $f21
  74. #define c71 $f24
  75. #define c72 $f25
  76. #define c81 $f26
  77. #define c82 $f27
  /* ALPHA_R, ALPHA_I: scale factors. In the write-back each accumulator is combined */
  /* via MADD with ALPHA_R for one element of C and with ALPHA_I for the next one. */
  78. #define ALPHA_R $f0
  79. #define ALPHA_I $f1
  80. PROLOGUE
  81. addi.d $sp, $sp, -128
  82. SDARG $r23, $sp, 0
  83. SDARG $r24, $sp, 8
  84. SDARG $r25, $sp, 16
  85. SDARG $r26, $sp, 24
  86. SDARG $r27, $sp, 32
  87. SDARG $r28, $sp, 40
  88. fst.d $f24, $sp, 48
  89. fst.d $f25, $sp, 56
  90. fst.d $f26, $sp, 64
  91. fst.d $f27, $sp, 72
  92. fst.d $f28, $sp, 80
  93. fst.d $f29, $sp, 88
  94. slli.d LDC, LDC, ZBASE_SHIFT
  95. srai.d J, N, 3
  96. bge $r0, J, .L30
  97. .L10:
  98. move CO1, C
  99. MTC c11, $r0
  100. add.d CO2, C, LDC
  101. move AO, A
  102. add.d CO3, CO2, LDC
  103. addi.d J, J, -1
  104. add.d CO4, CO3, LDC
  105. MOV c21, c11
  106. add.d CO5, CO4, LDC
  107. MOV c31, c11
  108. add.d CO6, CO5, LDC
  109. MOV c41, c11
  110. add.d CO7, CO6, LDC
  111. MOV c51, c11
  112. add.d CO8, CO7, LDC
  113. srai.d I, M, 1
  114. add.d C, CO8, LDC
  115. MOV c61, c11
  116. bge $r0, I, .L20
  117. .L11:
  118. LD a1, AO, 0 * SIZE
  119. MOV c71, c11
  120. LD b1, B, 0 * SIZE
  121. MOV c81, c11
  122. LD a3, AO, 4 * SIZE
  123. MOV c12, c11
  124. LD b2, B, 1 * SIZE
  125. MOV c22, c11
  126. srai.d L, K, 2
  127. MOV c32, c11
  128. LD b3, B, 2 * SIZE
  129. MOV c42, c11
  130. LD b4, B, 3 * SIZE
  131. MOV c52, c11
  132. LD b5, B, 4 * SIZE
  133. MOV c62, c11
  134. LD b6, B, 8 * SIZE
  135. MOV c72, c11
  136. LD b7, B, 12 * SIZE
  137. MOV c82, c11
  138. move BO, B
  139. bge $r0, L, .L15
  140. MADD c11, b1, a1, c11
  141. LD a2, AO, 1 * SIZE
  142. MADD c21, b2, a1, c21
  143. addi.d L, L, -1
  144. MADD c31, b3, a1, c31
  145. MADD c41, b4, a1, c41
  146. bge $r0, L, .L13
  147. .align 3
  148. .L12:
  149. MADD c12, b1, a2, c12
  150. LD b1, BO, 16 * SIZE
  151. MADD c22, b2, a2, c22
  152. LD b2, BO, 5 * SIZE
  153. MADD c32, b3, a2, c32
  154. LD b3, BO, 6 * SIZE
  155. MADD c42, b4, a2, c42
  156. LD b4, BO, 7 * SIZE
  157. MADD c51, b5, a1, c51
  158. LD a4, AO, 2 * SIZE
  159. MADD c61, b2, a1, c61
  160. MADD c71, b3, a1, c71
  161. MADD c81, b4, a1, c81
  162. LD a1, AO, 8 * SIZE
  163. MADD c52, b5, a2, c52
  164. LD b5, BO, 20 * SIZE
  165. MADD c62, b2, a2, c62
  166. LD b2, BO, 9 * SIZE
  167. MADD c72, b3, a2, c72
  168. LD b3, BO, 10 * SIZE
  169. MADD c82, b4, a2, c82
  170. LD b4, BO, 11 * SIZE
  171. MADD c11, b6, a4, c11
  172. LD a2, AO, 3 * SIZE
  173. MADD c21, b2, a4, c21
  174. MADD c31, b3, a4, c31
  175. MADD c41, b4, a4, c41
  176. MADD c12, b6, a2, c12
  177. LD b6, BO, 24 * SIZE
  178. MADD c22, b2, a2, c22
  179. LD b2, BO, 13 * SIZE
  180. MADD c32, b3, a2, c32
  181. LD b3, BO, 14 * SIZE
  182. MADD c42, b4, a2, c42
  183. LD b4, BO, 15 * SIZE
  184. MADD c51, b7, a4, c51
  185. MADD c61, b2, a4, c61
  186. MADD c71, b3, a4, c71
  187. MADD c81, b4, a4, c81
  188. MADD c52, b7, a2, c52
  189. LD b7, BO, 28 * SIZE
  190. MADD c62, b2, a2, c62
  191. LD b2, BO, 17 * SIZE
  192. MADD c72, b3, a2, c72
  193. LD b3, BO, 18 * SIZE
  194. MADD c82, b4, a2, c82
  195. LD b4, BO, 19 * SIZE
  196. MADD c11, b1, a3, c11
  197. LD a2, AO, 5 * SIZE
  198. MADD c21, b2, a3, c21
  199. MADD c31, b3, a3, c31
  200. MADD c41, b4, a3, c41
  201. MADD c12, b1, a2, c12
  202. LD b1, BO, 32 * SIZE
  203. MADD c22, b2, a2, c22
  204. LD b2, BO, 21 * SIZE
  205. MADD c32, b3, a2, c32
  206. LD b3, BO, 22 * SIZE
  207. MADD c42, b4, a2, c42
  208. LD b4, BO, 23 * SIZE
  209. MADD c51, b5, a3, c51
  210. LD a4, AO, 6 * SIZE
  211. MADD c61, b2, a3, c61
  212. MADD c71, b3, a3, c71
  213. MADD c81, b4, a3, c81
  214. LD a3, AO, 12 * SIZE
  215. MADD c52, b5, a2, c52
  216. LD b5, BO, 36 * SIZE
  217. MADD c62, b2, a2, c62
  218. LD b2, BO, 25 * SIZE
  219. MADD c72, b3, a2, c72
  220. LD b3, BO, 26 * SIZE
  221. MADD c82, b4, a2, c82
  222. LD b4, BO, 27 * SIZE
  223. MADD c11, b6, a4, c11
  224. LD a2, AO, 7 * SIZE
  225. MADD c21, b2, a4, c21
  226. MADD c31, b3, a4, c31
  227. MADD c41, b4, a4, c41
  228. addi.d L, L, -1
  229. MADD c12, b6, a2, c12
  230. LD b6, BO, 40 * SIZE
  231. MADD c22, b2, a2, c22
  232. LD b2, BO, 29 * SIZE
  233. MADD c32, b3, a2, c32
  234. LD b3, BO, 30 * SIZE
  235. MADD c42, b4, a2, c42
  236. LD b4, BO, 31 * SIZE
  237. MADD c51, b7, a4, c51
  238. addi.d BO, BO, 32 * SIZE
  239. MADD c61, b2, a4, c61
  240. addi.d AO, AO, 8 * SIZE
  241. MADD c71, b3, a4, c71
  242. MADD c81, b4, a4, c81
  243. MADD c52, b7, a2, c52
  244. LD b7, BO, 12 * SIZE
  245. MADD c62, b2, a2, c62
  246. LD b2, BO, 1 * SIZE
  247. MADD c72, b3, a2, c72
  248. LD b3, BO, 2 * SIZE
  249. MADD c82, b4, a2, c82
  250. LD b4, BO, 3 * SIZE
  251. MADD c11, b1, a1, c11
  252. LD a2, AO, 1 * SIZE
  253. MADD c21, b2, a1, c21
  254. MADD c31, b3, a1, c31
  255. MADD c41, b4, a1, c41
  256. blt $r0, L, .L12
  257. .align 3
  258. .L13:
  259. MADD c12, b1, a2, c12
  260. LD b1, BO, 16 * SIZE
  261. MADD c22, b2, a2, c22
  262. LD b2, BO, 5 * SIZE
  263. MADD c32, b3, a2, c32
  264. LD b3, BO, 6 * SIZE
  265. MADD c42, b4, a2, c42
  266. LD b4, BO, 7 * SIZE
  267. MADD c51, b5, a1, c51
  268. MADD c61, b2, a1, c61
  269. LD a4, AO, 2 * SIZE
  270. MADD c71, b3, a1, c71
  271. MADD c81, b4, a1, c81
  272. LD a1, AO, 8 * SIZE
  273. MADD c52, b5, a2, c52
  274. LD b5, BO, 20 * SIZE
  275. MADD c62, b2, a2, c62
  276. LD b2, BO, 9 * SIZE
  277. MADD c72, b3, a2, c72
  278. LD b3, BO, 10 * SIZE
  279. MADD c82, b4, a2, c82
  280. LD b4, BO, 11 * SIZE
  281. MADD c11, b6, a4, c11
  282. LD a2, AO, 3 * SIZE
  283. MADD c21, b2, a4, c21
  284. MADD c31, b3, a4, c31
  285. MADD c41, b4, a4, c41
  286. MADD c12, b6, a2, c12
  287. LD b6, BO, 24 * SIZE
  288. MADD c22, b2, a2, c22
  289. LD b2, BO, 13 * SIZE
  290. MADD c32, b3, a2, c32
  291. LD b3, BO, 14 * SIZE
  292. MADD c42, b4, a2, c42
  293. LD b4, BO, 15 * SIZE
  294. MADD c51, b7, a4, c51
  295. MADD c61, b2, a4, c61
  296. MADD c71, b3, a4, c71
  297. MADD c81, b4, a4, c81
  298. MADD c52, b7, a2, c52
  299. LD b7, BO, 28 * SIZE
  300. MADD c62, b2, a2, c62
  301. LD b2, BO, 17 * SIZE
  302. MADD c72, b3, a2, c72
  303. LD b3, BO, 18 * SIZE
  304. MADD c82, b4, a2, c82
  305. LD b4, BO, 19 * SIZE
  306. MADD c11, b1, a3, c11
  307. LD a2, AO, 5 * SIZE
  308. MADD c21, b2, a3, c21
  309. MADD c31, b3, a3, c31
  310. MADD c41, b4, a3, c41
  311. MADD c12, b1, a2, c12
  312. LD b1, BO, 32 * SIZE
  313. MADD c22, b2, a2, c22
  314. LD b2, BO, 21 * SIZE
  315. MADD c32, b3, a2, c32
  316. LD b3, BO, 22 * SIZE
  317. MADD c42, b4, a2, c42
  318. LD b4, BO, 23 * SIZE
  319. MADD c51, b5, a3, c51
  320. MADD c61, b2, a3, c61
  321. LD a4, AO, 6 * SIZE
  322. MADD c71, b3, a3, c71
  323. MADD c81, b4, a3, c81
  324. LD a3, AO, 12 * SIZE
  325. MADD c52, b5, a2, c52
  326. LD b5, BO, 36 * SIZE
  327. MADD c62, b2, a2, c62
  328. LD b2, BO, 25 * SIZE
  329. MADD c72, b3, a2, c72
  330. LD b3, BO, 26 * SIZE
  331. MADD c82, b4, a2, c82
  332. LD b4, BO, 27 * SIZE
  333. MADD c11, b6, a4, c11
  334. LD a2, AO, 7 * SIZE
  335. MADD c21, b2, a4, c21
  336. MADD c31, b3, a4, c31
  337. MADD c41, b4, a4, c41
  338. MADD c12, b6, a2, c12
  339. LD b6, BO, 40 * SIZE
  340. MADD c22, b2, a2, c22
  341. LD b2, BO, 29 * SIZE
  342. MADD c32, b3, a2, c32
  343. LD b3, BO, 30 * SIZE
  344. MADD c42, b4, a2, c42
  345. LD b4, BO, 31 * SIZE
  346. MADD c51, b7, a4, c51
  347. addi.d BO, BO, 32 * SIZE
  348. MADD c61, b2, a4, c61
  349. addi.d AO, AO, 8 * SIZE
  350. MADD c71, b3, a4, c71
  351. MADD c81, b4, a4, c81
  352. MADD c52, b7, a2, c52
  353. LD b7, BO, 12 * SIZE
  354. MADD c62, b2, a2, c62
  355. LD b2, BO, 1 * SIZE
  356. MADD c72, b3, a2, c72
  357. LD b3, BO, 2 * SIZE
  358. MADD c82, b4, a2, c82
  359. LD b4, BO, 3 * SIZE
  360. .align 3
  361. .L15:
  362. andi L, K, 3
  363. bge $r0, L, .L18
  364. .align 3
  365. .L16:
  366. MADD c11, b1, a1, c11
  367. LD a2, AO, 1 * SIZE
  368. MADD c21, b2, a1, c21
  369. MADD c31, b3, a1, c31
  370. MADD c41, b4, a1, c41
  371. MADD c12, b1, a2, c12
  372. LD b1, BO, 8 * SIZE
  373. MADD c22, b2, a2, c22
  374. LD b2, BO, 5 * SIZE
  375. MADD c32, b3, a2, c32
  376. LD b3, BO, 6 * SIZE
  377. MADD c42, b4, a2, c42
  378. LD b4, BO, 7 * SIZE
  379. MADD c51, b5, a1, c51
  380. addi.d L, L, -1
  381. MADD c61, b2, a1, c61
  382. addi.d AO, AO, 2 * SIZE
  383. MADD c71, b3, a1, c71
  384. addi.d BO, BO, 8 * SIZE
  385. MADD c81, b4, a1, c81
  386. LD a1, AO, 0 * SIZE
  387. MADD c52, b5, a2, c52
  388. LD b5, BO, 4 * SIZE
  389. MADD c62, b2, a2, c62
  390. LD b2, BO, 1 * SIZE
  391. MADD c72, b3, a2, c72
  392. LD b3, BO, 2 * SIZE
  393. MADD c82, b4, a2, c82
  394. LD b4, BO, 3 * SIZE
  395. blt $r0, L, .L16
  396. .L18:
  397. LD $f22, CO1, 0 * SIZE
  398. LD $f8, CO1, 1 * SIZE
  399. LD $f23, CO1, 2 * SIZE
  400. LD $f9, CO1, 3 * SIZE
  401. LD $f10, CO2, 0 * SIZE
  402. MADD $f22, c11, ALPHA_R, $f22
  403. LD $f11, CO2, 1 * SIZE
  404. MADD $f8, c11, ALPHA_I, $f8
  405. LD $f12, CO2, 2 * SIZE
  406. MADD $f23, c12, ALPHA_R, $f23
  407. LD $f13, CO2, 3 * SIZE
  408. MADD $f9, c12, ALPHA_I, $f9
  409. MADD $f10, c21, ALPHA_R, $f10
  410. ST $f22, CO1, 0 * SIZE
  411. MADD $f11, c21, ALPHA_I, $f11
  412. ST $f8, CO1, 1 * SIZE
  413. MADD $f12, c22, ALPHA_R, $f12
  414. ST $f23, CO1, 2 * SIZE
  415. MADD $f13, c22, ALPHA_I, $f13
  416. ST $f9, CO1, 3 * SIZE
  417. LD $f22, CO3, 0 * SIZE
  418. LD $f8, CO3, 1 * SIZE
  419. LD $f23, CO3, 2 * SIZE
  420. LD $f9, CO3, 3 * SIZE
  421. ST $f10, CO2, 0 * SIZE
  422. ST $f11, CO2, 1 * SIZE
  423. ST $f12, CO2, 2 * SIZE
  424. ST $f13, CO2, 3 * SIZE
  425. LD $f10, CO4, 0 * SIZE
  426. LD $f11, CO4, 1 * SIZE
  427. LD $f12, CO4, 2 * SIZE
  428. LD $f13, CO4, 3 * SIZE
  429. MADD $f22, c31, ALPHA_R, $f22
  430. MADD $f8, c31, ALPHA_I, $f8
  431. MADD $f23, c32, ALPHA_R, $f23
  432. MADD $f9, c32, ALPHA_I, $f9
  433. MADD $f10, c41, ALPHA_R, $f10
  434. ST $f22, CO3, 0 * SIZE
  435. MADD $f11, c41, ALPHA_I, $f11
  436. ST $f8, CO3, 1 * SIZE
  437. MADD $f12, c42, ALPHA_R, $f12
  438. ST $f23, CO3, 2 * SIZE
  439. MADD $f13, c42, ALPHA_I, $f13
  440. ST $f9, CO3, 3 * SIZE
  441. LD $f22, CO5, 0 * SIZE
  442. LD $f8, CO5, 1 * SIZE
  443. LD $f23, CO5, 2 * SIZE
  444. LD $f9, CO5, 3 * SIZE
  445. ST $f10, CO4, 0 * SIZE
  446. ST $f11, CO4, 1 * SIZE
  447. ST $f12, CO4, 2 * SIZE
  448. ST $f13, CO4, 3 * SIZE
  449. LD $f10, CO6, 0 * SIZE
  450. LD $f11, CO6, 1 * SIZE
  451. LD $f12, CO6, 2 * SIZE
  452. LD $f13, CO6, 3 * SIZE
  453. MADD $f22, c51, ALPHA_R, $f22
  454. addi.d CO1,CO1, 4 * SIZE
  455. MADD $f8, c51, ALPHA_I, $f8
  456. addi.d CO2,CO2, 4 * SIZE
  457. MADD $f23, c52, ALPHA_R, $f23
  458. addi.d CO3,CO3, 4 * SIZE
  459. MADD $f9, c52, ALPHA_I, $f9
  460. addi.d CO4,CO4, 4 * SIZE
  461. MADD $f10, c61, ALPHA_R, $f10
  462. ST $f22, CO5, 0 * SIZE
  463. MADD $f11, c61, ALPHA_I, $f11
  464. ST $f8, CO5, 1 * SIZE
  465. MADD $f12, c62, ALPHA_R, $f12
  466. ST $f23, CO5, 2 * SIZE
  467. MADD $f13, c62, ALPHA_I, $f13
  468. ST $f9, CO5, 3 * SIZE
  469. LD $f22, CO7, 0 * SIZE
  470. LD $f8, CO7, 1 * SIZE
  471. LD $f23, CO7, 2 * SIZE
  472. LD $f9, CO7, 3 * SIZE
  473. ST $f10, CO6, 0 * SIZE
  474. ST $f11, CO6, 1 * SIZE
  475. ST $f12, CO6, 2 * SIZE
  476. ST $f13, CO6, 3 * SIZE
  477. LD $f10, CO8, 0 * SIZE
  478. addi.d I, I, -1
  479. LD $f11, CO8, 1 * SIZE
  480. MTC c11, $r0
  481. LD $f12, CO8, 2 * SIZE
  482. LD $f13, CO8, 3 * SIZE
  483. MADD $f22, c71, ALPHA_R, $f22
  484. addi.d CO5,CO5, 4 * SIZE
  485. MADD $f8, c71, ALPHA_I, $f8
  486. addi.d CO6,CO6, 4 * SIZE
  487. MADD $f23, c72, ALPHA_R, $f23
  488. addi.d CO7,CO7, 4 * SIZE
  489. MADD $f9, c72, ALPHA_I, $f9
  490. addi.d CO8,CO8, 4 * SIZE
  491. MADD $f10, c81, ALPHA_R, $f10
  492. ST $f22, CO7, -4 * SIZE
  493. MADD $f11, c81, ALPHA_I, $f11
  494. ST $f8, CO7, -3 * SIZE
  495. MADD $f12, c82, ALPHA_R, $f12
  496. ST $f23, CO7, -2 * SIZE
  497. MADD $f13, c82, ALPHA_I, $f13
  498. ST $f9, CO7, -1 * SIZE
  499. ST $f10, CO8, -4 * SIZE
  500. MOV c21, c11
  501. ST $f11, CO8, -3 * SIZE
  502. MOV c31, c11
  503. ST $f12, CO8, -2 * SIZE
  504. MOV c41, c11
  505. ST $f13, CO8, -1 * SIZE
  506. MOV c51, c11
  507. MOV c61, c11
  508. blt $r0, I, .L11
  509. .align 3
  510. .L20:
  511. andi I, M, 1
  512. MOV c61, c11
  513. MOV c71, c11
  514. bge $r0, I, .L29
  515. LD a1, AO, 0 * SIZE
  516. LD a2, AO, 1 * SIZE
  517. LD a3, AO, 2 * SIZE
  518. LD a4, AO, 3 * SIZE
  519. LD b1, B, 0 * SIZE
  520. LD b2, B, 1 * SIZE
  521. LD b3, B, 2 * SIZE
  522. LD b4, B, 3 * SIZE
  523. LD b5, B, 4 * SIZE
  524. LD b6, B, 8 * SIZE
  525. LD b7, B, 12 * SIZE
  526. srai.d L, K, 2
  527. MOV c81, c11
  528. move BO, B
  529. bge $r0, L, .L25
  530. .align 3
  531. .L22:
  532. MADD c11, b1, a1, c11
  533. LD b1, BO, 16 * SIZE
  534. MADD c21, b2, a1, c21
  535. LD b2, BO, 5 * SIZE
  536. MADD c31, b3, a1, c31
  537. LD b3, BO, 6 * SIZE
  538. MADD c41, b4, a1, c41
  539. LD b4, BO, 7 * SIZE
  540. MADD c51, b5, a1, c51
  541. LD b5, BO, 20 * SIZE
  542. MADD c61, b2, a1, c61
  543. LD b2, BO, 9 * SIZE
  544. MADD c71, b3, a1, c71
  545. LD b3, BO, 10 * SIZE
  546. MADD c81, b4, a1, c81
  547. LD b4, BO, 11 * SIZE
  548. LD a1, AO, 4 * SIZE
  549. addi.d L, L, -1
  550. MADD c11, b6, a2, c11
  551. LD b6, BO, 24 * SIZE
  552. MADD c21, b2, a2, c21
  553. LD b2, BO, 13 * SIZE
  554. MADD c31, b3, a2, c31
  555. LD b3, BO, 14 * SIZE
  556. MADD c41, b4, a2, c41
  557. LD b4, BO, 15 * SIZE
  558. MADD c51, b7, a2, c51
  559. LD b7, BO, 28 * SIZE
  560. MADD c61, b2, a2, c61
  561. LD b2, BO, 17 * SIZE
  562. MADD c71, b3, a2, c71
  563. LD b3, BO, 18 * SIZE
  564. MADD c81, b4, a2, c81
  565. LD b4, BO, 19 * SIZE
  566. LD a2, AO, 5 * SIZE
  567. addi.d AO, AO, 4 * SIZE
  568. MADD c11, b1, a3, c11
  569. LD b1, BO, 32 * SIZE
  570. MADD c21, b2, a3, c21
  571. LD b2, BO, 21 * SIZE
  572. MADD c31, b3, a3, c31
  573. LD b3, BO, 22 * SIZE
  574. MADD c41, b4, a3, c41
  575. LD b4, BO, 23 * SIZE
  576. MADD c51, b5, a3, c51
  577. LD b5, BO, 36 * SIZE
  578. MADD c61, b2, a3, c61
  579. LD b2, BO, 25 * SIZE
  580. MADD c71, b3, a3, c71
  581. LD b3, BO, 26 * SIZE
  582. MADD c81, b4, a3, c81
  583. LD b4, BO, 27 * SIZE
  584. LD a3, AO, 2 * SIZE
  585. addi.d BO, BO, 32 * SIZE
  586. MADD c11, b6, a4, c11
  587. LD b6, BO, 8 * SIZE
  588. MADD c21, b2, a4, c21
  589. LD b2, BO, -3 * SIZE
  590. MADD c31, b3, a4, c31
  591. LD b3, BO, -2 * SIZE
  592. MADD c41, b4, a4, c41
  593. LD b4, BO, -1 * SIZE
  594. MADD c51, b7, a4, c51
  595. LD b7, BO, 12 * SIZE
  596. MADD c61, b2, a4, c61
  597. LD b2, BO, 1 * SIZE
  598. MADD c71, b3, a4, c71
  599. LD b3, BO, 2 * SIZE
  600. MADD c81, b4, a4, c81
  601. LD b4, BO, 3 * SIZE
  602. LD a4, AO, 3 * SIZE
  603. blt $r0, L, .L22
  604. .align 3
  605. .L25:
  606. andi L, K, 3
  607. bge $r0, L, .L28
  608. .align 3
  609. .L26:
  610. MADD c11, b1, a1, c11
  611. LD b1, BO, 8 * SIZE
  612. MADD c21, b2, a1, c21
  613. LD b2, BO, 5 * SIZE
  614. MADD c31, b3, a1, c31
  615. LD b3, BO, 6 * SIZE
  616. MADD c41, b4, a1, c41
  617. LD b4, BO, 7 * SIZE
  618. addi.d L, L, -1
  619. MOV a2, a2
  620. addi.d AO, AO, 1 * SIZE
  621. addi.d BO, BO, 8 * SIZE
  622. MADD c51, b5, a1, c51
  623. LD b5, BO, 4 * SIZE
  624. MADD c61, b2, a1, c61
  625. LD b2, BO, 1 * SIZE
  626. MADD c71, b3, a1, c71
  627. LD b3, BO, 2 * SIZE
  628. MADD c81, b4, a1, c81
  629. LD a1, AO, 0 * SIZE
  630. LD b4, BO, 3 * SIZE
  631. blt $r0, L, .L26
  632. .L28:
  633. LD $f22, CO1, 0 * SIZE
  634. LD $f8, CO1, 1 * SIZE
  635. LD $f23, CO2, 0 * SIZE
  636. LD $f9, CO2, 1 * SIZE
  637. LD $f10, CO3, 0 * SIZE
  638. MADD $f22, c11, ALPHA_R, $f22
  639. LD $f11, CO3, 1 * SIZE
  640. MADD $f8, c11, ALPHA_I, $f8
  641. LD $f12, CO4, 0 * SIZE
  642. MADD $f23, c21, ALPHA_R, $f23
  643. LD $f13, CO4, 1 * SIZE
  644. MADD $f9, c21, ALPHA_I, $f9
  645. MADD $f10, c31, ALPHA_R, $f10
  646. ST $f22, CO1, 0 * SIZE
  647. MADD $f11, c31, ALPHA_I, $f11
  648. ST $f8, CO1, 1 * SIZE
  649. MADD $f12, c41, ALPHA_R, $f12
  650. ST $f23, CO2, 0 * SIZE
  651. MADD $f13, c41, ALPHA_I, $f13
  652. ST $f9, CO2, 1 * SIZE
  653. LD $f22, CO5, 0 * SIZE
  654. LD $f8, CO5, 1 * SIZE
  655. LD $f23, CO6, 0 * SIZE
  656. LD $f9, CO6, 1 * SIZE
  657. ST $f10, CO3, 0 * SIZE
  658. ST $f11, CO3, 1 * SIZE
  659. ST $f12, CO4, 0 * SIZE
  660. ST $f13, CO4, 1 * SIZE
  661. LD $f10, CO7, 0 * SIZE
  662. MADD $f22, c51, ALPHA_R, $f22
  663. LD $f11, CO7, 1 * SIZE
  664. MADD $f8, c51, ALPHA_I, $f8
  665. LD $f12, CO8, 0 * SIZE
  666. MADD $f23, c61, ALPHA_R, $f23
  667. LD $f13, CO8, 1 * SIZE
  668. MADD $f9, c61, ALPHA_I, $f9
  669. MADD $f10, c71, ALPHA_R, $f10
  670. ST $f22, CO5, 0 * SIZE
  671. MADD $f11, c71, ALPHA_I, $f11
  672. ST $f8, CO5, 1 * SIZE
  673. MADD $f12, c81, ALPHA_R, $f12
  674. ST $f23, CO6, 0 * SIZE
  675. MADD $f13, c81, ALPHA_I, $f13
  676. ST $f9, CO6, 1 * SIZE
  677. ST $f10, CO7, 0 * SIZE
  678. ST $f11, CO7, 1 * SIZE
  679. ST $f12, CO8, 0 * SIZE
  680. ST $f13, CO8, 1 * SIZE
  681. .align 3
  682. .L29:
  683. move B, BO
  684. blt $r0, J, .L10
  685. .align 3
  686. .L30:
  687. andi J, N, 4
  688. move AO, A
  689. bge $r0, J, .L50
  690. move CO1, C
  691. MTC c11, $r0
  692. add.d CO2, C, LDC
  693. add.d CO3, CO2, LDC
  694. add.d CO4, CO3, LDC
  695. MOV c21, c11
  696. add.d C, CO4, LDC
  697. MOV c31, c11
  698. srai.d I, M, 1
  699. MOV c41, c11
  700. bge $r0, I, .L40
  701. .L31:
  702. LD a1, AO, 0 * SIZE
  703. LD a3, AO, 4 * SIZE
  704. LD b1, B, 0 * SIZE
  705. MOV c12, c11
  706. LD b2, B, 1 * SIZE
  707. MOV c22, c11
  708. LD b3, B, 2 * SIZE
  709. MOV c32, c11
  710. LD b4, B, 3 * SIZE
  711. MOV c42, c11
  712. LD b5, B, 4 * SIZE
  713. srai.d L, K, 2
  714. LD b6, B, 8 * SIZE
  715. LD b7, B, 12 * SIZE
  716. move BO, B
  717. bge $r0, L, .L35
  718. .align 3
  719. .L32:
  720. MADD c11, b1, a1, c11
  721. LD a2, AO, 1 * SIZE
  722. MADD c21, b2, a1, c21
  723. addi.d L, L, -1
  724. MADD c31, b3, a1, c31
  725. MADD c41, b4, a1, c41
  726. LD a1, AO, 2 * SIZE
  727. MADD c12, b1, a2, c12
  728. LD b1, BO, 16 * SIZE
  729. MADD c22, b2, a2, c22
  730. LD b2, BO, 5 * SIZE
  731. MADD c32, b3, a2, c32
  732. LD b3, BO, 6 * SIZE
  733. MADD c42, b4, a2, c42
  734. LD b4, BO, 7 * SIZE
  735. MADD c11, b5, a1, c11
  736. LD a2, AO, 3 * SIZE
  737. MADD c21, b2, a1, c21
  738. MADD c31, b3, a1, c31
  739. MADD c41, b4, a1, c41
  740. LD a1, AO, 8 * SIZE
  741. MADD c12, b5, a2, c12
  742. LD b5, BO, 20 * SIZE
  743. MADD c22, b2, a2, c22
  744. LD b2, BO, 9 * SIZE
  745. MADD c32, b3, a2, c32
  746. LD b3, BO, 10 * SIZE
  747. MADD c42, b4, a2, c42
  748. LD b4, BO, 11 * SIZE
  749. MADD c11, b6, a3, c11
  750. LD a2, AO, 5 * SIZE
  751. MADD c21, b2, a3, c21
  752. MADD c31, b3, a3, c31
  753. MADD c41, b4, a3, c41
  754. LD a3, AO, 6 * SIZE
  755. MADD c12, b6, a2, c12
  756. LD b6, BO, 24 * SIZE
  757. MADD c22, b2, a2, c22
  758. LD b2, BO, 13 * SIZE
  759. MADD c32, b3, a2, c32
  760. LD b3, BO, 14 * SIZE
  761. MADD c42, b4, a2, c42
  762. LD b4, BO, 15 * SIZE
  763. MADD c11, b7, a3, c11
  764. LD a2, AO, 7 * SIZE
  765. MADD c21, b2, a3, c21
  766. addi.d AO, AO, 8 * SIZE
  767. MADD c31, b3, a3, c31
  768. addi.d BO, BO, 16 * SIZE
  769. MADD c41, b4, a3, c41
  770. LD a3, AO, 4 * SIZE
  771. MADD c12, b7, a2, c12
  772. LD b7, BO, 12 * SIZE
  773. MADD c22, b2, a2, c22
  774. LD b2, BO, 1 * SIZE
  775. MADD c32, b3, a2, c32
  776. LD b3, BO, 2 * SIZE
  777. MADD c42, b4, a2, c42
  778. LD b4, BO, 3 * SIZE
  779. blt $r0, L, .L32
  780. .align 3
  781. .L35:
  782. andi L, K, 3
  783. bge $r0, L, .L38
  784. .align 3
  785. .L36:
  786. MADD c11, b1, a1, c11
  787. LD a2, AO, 1 * SIZE
  788. MADD c21, b2, a1, c21
  789. addi.d L, L, -1
  790. MADD c31, b3, a1, c31
  791. addi.d AO, AO, 2 * SIZE
  792. MADD c41, b4, a1, c41
  793. LD a1, AO, 0 * SIZE
  794. MADD c12, b1, a2, c12
  795. LD b1, BO, 4 * SIZE
  796. MADD c22, b2, a2, c22
  797. LD b2, BO, 5 * SIZE
  798. MADD c32, b3, a2, c32
  799. LD b3, BO, 6 * SIZE
  800. MADD c42, b4, a2, c42
  801. LD b4, BO, 7 * SIZE
  802. addi.d BO, BO, 4 * SIZE
  803. blt $r0, L, .L36
  804. .L38:
  805. LD $f22, CO1, 0 * SIZE
  806. LD $f8, CO1, 1 * SIZE
  807. LD $f23, CO1, 2 * SIZE
  808. LD $f9, CO1, 3 * SIZE
  809. LD $f10, CO2, 0 * SIZE
  810. LD $f11, CO2, 1 * SIZE
  811. LD $f12, CO2, 2 * SIZE
  812. LD $f13, CO2, 3 * SIZE
  813. MADD $f22, c11, ALPHA_R, $f22
  814. MADD $f8, c11, ALPHA_I, $f8
  815. MADD $f23, c12, ALPHA_R, $f23
  816. MADD $f9, c12, ALPHA_I, $f9
  817. MADD $f10, c21, ALPHA_R, $f10
  818. ST $f22, CO1, 0 * SIZE
  819. MADD $f11, c21, ALPHA_I, $f11
  820. ST $f8, CO1, 1 * SIZE
  821. MADD $f12, c22, ALPHA_R, $f12
  822. ST $f23, CO1, 2 * SIZE
  823. MADD $f13, c22, ALPHA_I, $f13
  824. ST $f9, CO1, 3 * SIZE
  825. LD $f22, CO3, 0 * SIZE
  826. LD $f8, CO3, 1 * SIZE
  827. LD $f23, CO3, 2 * SIZE
  828. LD $f9, CO3, 3 * SIZE
  829. ST $f10, CO2, 0 * SIZE
  830. MADD $f22, c31, ALPHA_R, $f22
  831. ST $f11, CO2, 1 * SIZE
  832. MADD $f8, c31, ALPHA_I, $f8
  833. ST $f12, CO2, 2 * SIZE
  834. MADD $f23, c32, ALPHA_R, $f23
  835. ST $f13, CO2, 3 * SIZE
  836. MADD $f9, c32, ALPHA_I, $f9
  837. LD $f10, CO4, 0 * SIZE
  838. LD $f11, CO4, 1 * SIZE
  839. LD $f12, CO4, 2 * SIZE
  840. LD $f13, CO4, 3 * SIZE
  841. MADD $f10, c41, ALPHA_R, $f10
  842. addi.d CO1,CO1, 4 * SIZE
  843. MADD $f11, c41, ALPHA_I, $f11
  844. addi.d CO2,CO2, 4 * SIZE
  845. MADD $f12, c42, ALPHA_R, $f12
  846. addi.d CO3,CO3, 4 * SIZE
  847. MADD $f13, c42, ALPHA_I, $f13
  848. addi.d CO4,CO4, 4 * SIZE
  849. ST $f22, CO3, -4 * SIZE
  850. addi.d I, I, -1
  851. ST $f8, CO3, -3 * SIZE
  852. ST $f23, CO3, -2 * SIZE
  853. ST $f9, CO3, -1 * SIZE
  854. ST $f10, CO4, -4 * SIZE
  855. MTC c11, $r0
  856. ST $f11, CO4, -3 * SIZE
  857. MOV c21, c11
  858. ST $f12, CO4, -2 * SIZE
  859. MOV c31, c11
  860. ST $f13, CO4, -1 * SIZE
  861. MOV c41, c11
  862. blt $r0, I, .L31
  863. .align 3
  864. .L40:
  865. andi I, M, 1
  866. MOV c61, c11
  867. bge $r0, I, .L49
  868. LD a1, AO, 0 * SIZE
  869. MOV c71, c11
  870. LD a2, AO, 1 * SIZE
  871. MOV c81, c11
  872. LD b1, B, 0 * SIZE
  873. LD b2, B, 1 * SIZE
  874. LD b3, B, 2 * SIZE
  875. LD b4, B, 3 * SIZE
  876. LD b5, B, 4 * SIZE
  877. LD b6, B, 8 * SIZE
  878. LD b7, B, 12 * SIZE
  879. srai.d L, K, 2
  880. move BO, B
  881. bge $r0, L, .L45
  882. .align 3
  883. .L42:
  884. MADD c11, b1, a1, c11
  885. LD b1, BO, 16 * SIZE
  886. MADD c21, b2, a1, c21
  887. LD b2, BO, 5 * SIZE
  888. MADD c31, b3, a1, c31
  889. LD b3, BO, 6 * SIZE
  890. MADD c41, b4, a1, c41
  891. LD b4, BO, 7 * SIZE
  892. LD a1, AO, 4 * SIZE
  893. addi.d L, L, -1
  894. MADD c11, b5, a2, c11
  895. LD b5, BO, 20 * SIZE
  896. MADD c21, b2, a2, c21
  897. LD b2, BO, 9 * SIZE
  898. MADD c31, b3, a2, c31
  899. LD b3, BO, 10 * SIZE
  900. MADD c41, b4, a2, c41
  901. LD b4, BO, 11 * SIZE
  902. LD a2, AO, 2 * SIZE
  903. addi.d AO, AO, 4 * SIZE
  904. MADD c11, b6, a2, c11
  905. LD b6, BO, 24 * SIZE
  906. MADD c21, b2, a2, c21
  907. LD b2, BO, 13 * SIZE
  908. MADD c31, b3, a2, c31
  909. LD b3, BO, 14 * SIZE
  910. MADD c41, b4, a2, c41
  911. LD b4, BO, 15 * SIZE
  912. LD a2, AO, -1 * SIZE
  913. addi.d BO, BO, 16 * SIZE
  914. MADD c11, b7, a2, c11
  915. LD b7, BO, 12 * SIZE
  916. MADD c21, b2, a2, c21
  917. LD b2, BO, 1 * SIZE
  918. MADD c31, b3, a2, c31
  919. LD b3, BO, 2 * SIZE
  920. MADD c41, b4, a2, c41
  921. LD b4, BO, 3 * SIZE
  922. LD a2, AO, 1 * SIZE
  923. blt $r0, L, .L42
  924. .align 3
  /* K-remainder for the tile in progress: L = K & 3 single k-steps. */
  /* The accumulators c11, c21, c31, c41 and the operands a1, b1..b4 are */
  /* set up above this chunk. MADD is a macro defined elsewhere; presumably */
  /* "MADD d, x, y, z" computes d = x * y + z -- TODO confirm. */
  925. .L45:
  926. andi L, K, 3
  /* Nothing left over (L <= 0): go straight to the write-back at .L48. */
  927. bge $r0, L, .L48
  928. .align 3
  /* One k-step per pass: a single A value (a1) against four B values */
  /* (b1..b4). The operands for the next pass are reloaded from AO[1] and */
  /* BO[4..7] before AO advances by 1 element and BO by 4 elements. */
  929. .L46:
  930. MADD c11, b1, a1, c11
  931. LD b1, BO, 4 * SIZE
  932. MADD c21, b2, a1, c21
  933. LD b2, BO, 5 * SIZE
  934. MADD c31, b3, a1, c31
  935. LD b3, BO, 6 * SIZE
  936. MADD c41, b4, a1, c41
  937. LD a1, AO, 1 * SIZE
  938. LD b4, BO, 7 * SIZE
  939. addi.d L, L, -1
  940. addi.d AO, AO, 1 * SIZE
  /* NOTE(review): source and destination are the same alias, so this */
  /* looks like a no-op (MOV is a macro defined elsewhere) -- confirm. */
  941. MOV a2, a2
  942. addi.d BO, BO, 4 * SIZE
  /* Repeat while L > 0. */
  943. blt $r0, L, .L46
  /* Write-back. For each of CO1..CO4: load the two values at element */
  /* offsets 0 and 1, add accumulator * ALPHA_R to the first and */
  /* accumulator * ALPHA_I to the second (given the MADD reading above), */
  /* then store both back. Presumably the pair is the real and imaginary */
  /* part of one complex C entry -- confirm against the caller. */
  /* c11 goes to CO1, c21 to CO2, c31 to CO3, c41 to CO4. Loads, MADDs */
  /* and stores are interleaved. */
  944. .L48:
  945. LD $f22, CO1, 0 * SIZE
  946. LD $f8, CO1, 1 * SIZE
  947. LD $f23, CO2, 0 * SIZE
  948. LD $f9, CO2, 1 * SIZE
  949. LD $f10, CO3, 0 * SIZE
  950. MADD $f22, c11, ALPHA_R, $f22
  951. LD $f11, CO3, 1 * SIZE
  952. MADD $f8, c11, ALPHA_I, $f8
  953. LD $f12, CO4, 0 * SIZE
  954. MADD $f23, c21, ALPHA_R, $f23
  955. LD $f13, CO4, 1 * SIZE
  956. MADD $f9, c21, ALPHA_I, $f9
  957. MADD $f10, c31, ALPHA_R, $f10
  958. ST $f22, CO1, 0 * SIZE
  959. MADD $f11, c31, ALPHA_I, $f11
  960. ST $f8, CO1, 1 * SIZE
  961. MADD $f12, c41, ALPHA_R, $f12
  962. ST $f23, CO2, 0 * SIZE
  963. MADD $f13, c41, ALPHA_I, $f13
  964. ST $f9, CO2, 1 * SIZE
  965. ST $f10, CO3, 0 * SIZE
  966. ST $f11, CO3, 1 * SIZE
  967. ST $f12, CO4, 0 * SIZE
  968. ST $f13, CO4, 1 * SIZE
  969. .align 3
  /* Copy the current BO into B; presumably this moves B past the panel */
  /* that was just consumed -- confirm. */
  970. .L49:
  971. move B, BO
  /* Setup for the two-column panel, taken only when bit 1 of N is set. */
  972. .align 3
  973. .L50:
  /* J = N & 2; AO is reset to A before the test, so it is reset on both */
  /* paths. If J <= 0 skip to the single-column panel at .L70. */
  974. andi J, N, 2
  975. move AO, A
  976. bge $r0, J, .L70
  /* CO1 = C, CO2 = C + LDC, then C itself moves on by 2 * LDC. */
  /* LDC is prepared outside this chunk; presumably already a byte stride */
  /* -- confirm. */
  977. move CO1, C
  978. add.d CO2, C, LDC
  /* I = M >> 1 counts row pairs; with none (I <= 0) go to the */
  /* odd-row handling at .L60. */
  979. srai.d I, M, 1
  980. add.d C, CO2, LDC
  981. bge $r0, I, .L60
  /* 2x2 tile: accumulators c11, c21, c12, c22 (two A values by two B */
  /* values per k-step). Executed I = M >> 1 times via the branch at the */
  /* end of the block. MADD, LD, ST, MTC and MOV are macros defined outside */
  /* this chunk; presumably "MADD d, x, y, z" is d = x * y + z and */
  /* "MTC c11, $r0" writes zero into c11 -- TODO confirm. */
  982. .L51:
  /* Clear the accumulators (c21, c12, c22 are copies of c11) and preload */
  /* the first operands from AO and B. */
  983. LD a1, AO, 0 * SIZE
  984. MTC c11, $r0
  985. LD a2, AO, 1 * SIZE
  986. MOV c21, c11
  987. LD a5, AO, 4 * SIZE
  988. LD b1, B, 0 * SIZE
  989. MOV c12, c11
  990. LD b2, B, 1 * SIZE
  991. MOV c22, c11
  992. LD b3, B, 2 * SIZE
  993. LD b5, B, 4 * SIZE
  /* L = K >> 2: number of passes through the 4x-unrolled loop. */
  994. srai.d L, K, 2
  /* NOTE(review): b6 and b7 are loaded here but not read anywhere in */
  /* this tile. */
  995. LD b6, B, 8 * SIZE
  996. LD b7, B, 12 * SIZE
  /* BO restarts at the beginning of the current B panel. */
  997. move BO, B
  998. bge $r0, L, .L55
  999. .align 3
  /* Unrolled main loop: 16 MADDs per pass = four k-steps of the 2x2 */
  /* update. Loads are interleaved with the MADDs and fetch operands for */
  /* later k-steps of this pass and for the next pass (a1 from AO[8], */
  /* b1 from BO[8], a5 from AO[12], b5 from BO[12], a2 from AO[9], */
  /* b2 from BO[9], b3 from BO[10]) before AO and BO each advance by 8 */
  /* elements. Instruction order matters here. */
  1000. .L52:
  1001. MADD c11, b1, a1, c11
  1002. LD a3, AO, 2 * SIZE
  1003. MADD c21, b2, a1, c21
  1004. LD b4, BO, 3 * SIZE
  1005. MADD c12, b1, a2, c12
  1006. LD a4, AO, 3 * SIZE
  1007. MADD c22, b2, a2, c22
  1008. LD b1, BO, 8 * SIZE
  1009. MADD c11, b3, a3, c11
  1010. LD a1, AO, 8 * SIZE
  1011. MADD c21, b4, a3, c21
  1012. LD b2, BO, 5 * SIZE
  1013. MADD c12, b3, a4, c12
  1014. LD a2, AO, 5 * SIZE
  1015. MADD c22, b4, a4, c22
  1016. LD b3, BO, 6 * SIZE
  1017. MADD c11, b5, a5, c11
  1018. LD a3, AO, 6 * SIZE
  1019. MADD c21, b2, a5, c21
  1020. LD b4, BO, 7 * SIZE
  1021. MADD c12, b5, a2, c12
  1022. LD a4, AO, 7 * SIZE
  1023. MADD c22, b2, a2, c22
  1024. LD b5, BO, 12 * SIZE
  1025. MADD c11, b3, a3, c11
  1026. LD a5, AO, 12 * SIZE
  1027. MADD c21, b4, a3, c21
  1028. LD b2, BO, 9 * SIZE
  1029. MADD c12, b3, a4, c12
  1030. LD a2, AO, 9 * SIZE
  1031. MADD c22, b4, a4, c22
  1032. LD b3, BO, 10 * SIZE
  1033. addi.d AO, AO, 8 * SIZE
  1034. addi.d L, L, -1
  1035. addi.d BO, BO, 8 * SIZE
  1036. blt $r0, L, .L52
  1037. .align 3
  /* K-remainder: L = K & 3 single k-steps; skip to the write-back if */
  /* there are none. */
  1038. .L55:
  1039. andi L, K, 3
  1040. bge $r0, L, .L58
  1041. .align 3
  /* One k-step per pass. a2 is fetched from AO[1] inside the pass; a1, */
  /* b1, b2 for the next pass come from AO[2], BO[2], BO[3] before AO */
  /* and BO advance by 2 elements each. */
  1042. .L56:
  1043. MADD c11, b1, a1, c11
  1044. LD a2, AO, 1 * SIZE
  1045. MADD c21, b2, a1, c21
  1046. LD a1, AO, 2 * SIZE
  1047. MADD c12, b1, a2, c12
  1048. LD b1, BO, 2 * SIZE
  1049. MADD c22, b2, a2, c22
  1050. LD b2, BO, 3 * SIZE
  1051. addi.d L, L, -1
  1052. addi.d AO, AO, 2 * SIZE
  1053. addi.d BO, BO, 2 * SIZE
  1054. blt $r0, L, .L56
  /* Write-back: four values from CO1 and four from CO2 are loaded, each */
  /* accumulator is scaled by ALPHA_R into one slot and by ALPHA_I into the */
  /* next (c11, c12 into CO1; c21, c22 into CO2), and the results are */
  /* stored. Presumably each slot pair is the real and imaginary part of */
  /* one complex C entry -- confirm. */
  1055. .L58:
  1056. LD $f22, CO1, 0 * SIZE
  1057. LD $f8, CO1, 1 * SIZE
  1058. LD $f23, CO1, 2 * SIZE
  1059. LD $f9, CO1, 3 * SIZE
  1060. LD $f10, CO2, 0 * SIZE
  1061. LD $f11, CO2, 1 * SIZE
  1062. LD $f12, CO2, 2 * SIZE
  1063. LD $f13, CO2, 3 * SIZE
  1064. MADD $f22, c11, ALPHA_R, $f22
  /* I is decremented and CO1, CO2 advance by 4 elements before the */
  /* stores, which is why the stores below use offsets -4..-1. */
  1065. addi.d I, I, -1
  1066. MADD $f8, c11, ALPHA_I, $f8
  1067. addi.d CO1,CO1, 4 * SIZE
  1068. MADD $f23, c12, ALPHA_R, $f23
  1069. addi.d CO2,CO2, 4 * SIZE
  1070. MADD $f9, c12, ALPHA_I, $f9
  1071. MADD $f10, c21, ALPHA_R, $f10
  1072. MADD $f11, c21, ALPHA_I, $f11
  1073. MADD $f12, c22, ALPHA_R, $f12
  1074. MADD $f13, c22, ALPHA_I, $f13
  1075. ST $f22, CO1, -4 * SIZE
  1076. ST $f8, CO1, -3 * SIZE
  1077. ST $f23, CO1, -2 * SIZE
  1078. ST $f9, CO1, -1 * SIZE
  1079. ST $f10, CO2, -4 * SIZE
  1080. ST $f11, CO2, -3 * SIZE
  1081. ST $f12, CO2, -2 * SIZE
  1082. ST $f13, CO2, -1 * SIZE
  /* Next row pair while I > 0. */
  1083. blt $r0, I, .L51
  /* 1x2 tile for the odd row of the two-column panel: taken only when */
  /* M & 1 is non-zero, otherwise control goes to .L69. */
  /* MADD, LD, ST, MTC, MOV and ADD are macros defined outside this chunk; */
  /* presumably "MADD d, x, y, z" is d = x * y + z and "ADD d, x, y" is */
  /* d = x + y -- TODO confirm. */
  1084. .align 3
  1085. .L60:
  1086. andi I, M, 1
  1087. bge $r0, I, .L69
  /* L = K >> 2 passes of the unrolled loop. */
  1088. srai.d L, K, 2
  /* Clear c11 and copy it into c21, c31, c41; preload a1..a4 and b1..b4. */
  1089. LD a1, AO, 0 * SIZE
  1090. MTC c11, $r0
  1091. LD a2, AO, 1 * SIZE
  1092. MOV c21, c11
  1093. LD a3, AO, 2 * SIZE
  1094. MOV c31, c11
  1095. LD a4, AO, 3 * SIZE
  1096. MOV c41, c11
  1097. LD b1, B, 0 * SIZE
  1098. LD b2, B, 1 * SIZE
  1099. LD b3, B, 2 * SIZE
  1100. LD b4, B, 3 * SIZE
  /* NOTE(review): b5, b6 and b7 are loaded but not read anywhere in */
  /* this tile. */
  1101. LD b5, B, 4 * SIZE
  1102. LD b6, B, 8 * SIZE
  1103. LD b7, B, 12 * SIZE
  1104. move BO, B
  1105. bge $r0, L, .L65
  1106. .align 3
  /* Unrolled loop: 8 MADDs per pass = four k-steps (AO advances by 4 */
  /* elements, BO by 8). The k-steps using a1 and a3 accumulate into */
  /* c11 and c21; those using a2 and a4 accumulate into c31 and c41. The */
  /* two sets are summed at .L68. Operands for the next pass are loaded */
  /* from AO[4..7] and BO[8..11] before the pointers advance. */
  1107. .L62:
  1108. MADD c11, b1, a1, c11
  1109. LD b1, BO, 4 * SIZE
  1110. MADD c21, b2, a1, c21
  1111. LD b2, BO, 5 * SIZE
  1112. MADD c31, b3, a2, c31
  1113. LD b3, BO, 6 * SIZE
  1114. MADD c41, b4, a2, c41
  1115. LD b4, BO, 7 * SIZE
  1116. LD a1, AO, 4 * SIZE
  1117. LD a2, AO, 5 * SIZE
  1118. MADD c11, b1, a3, c11
  1119. LD b1, BO, 8 * SIZE
  1120. MADD c21, b2, a3, c21
  1121. LD b2, BO, 9 * SIZE
  1122. MADD c31, b3, a4, c31
  1123. LD b3, BO, 10 * SIZE
  1124. MADD c41, b4, a4, c41
  1125. LD b4, BO, 11 * SIZE
  1126. LD a3, AO, 6 * SIZE
  1127. LD a4, AO, 7 * SIZE
  1128. addi.d L, L, -1
  1129. addi.d AO, AO, 4 * SIZE
  1130. addi.d BO, BO, 8 * SIZE
  1131. blt $r0, L, .L62
  1132. .align 3
  /* K-remainder: L = K & 3 single k-steps. */
  1133. .L65:
  1134. andi L, K, 3
  1135. bge $r0, L, .L68
  1136. .align 3
  /* One k-step per pass into c11 and c21; AO advances by 1 element and */
  /* BO by 2. */
  1137. .L66:
  1138. MADD c11, b1, a1, c11
  1139. LD b1, BO, 2 * SIZE
  1140. MADD c21, b2, a1, c21
  1141. LD b2, BO, 3 * SIZE
  1142. LD a1, AO, 1 * SIZE
  1143. addi.d L, L, -1
  1144. addi.d AO, AO, 1 * SIZE
  1145. addi.d BO, BO, 2 * SIZE
  1146. blt $r0, L, .L66
  /* Write-back: fold the partial sums (c11 += c31, c21 += c41), then */
  /* update the two values at CO1[0..1] with c11 and at CO2[0..1] with */
  /* c21, scaled by ALPHA_R and ALPHA_I respectively. */
  1147. .L68:
  1148. LD $f22, CO1, 0 * SIZE
  1149. LD $f8, CO1, 1 * SIZE
  1150. LD $f23, CO2, 0 * SIZE
  1151. LD $f9, CO2, 1 * SIZE
  1152. ADD c11, c11, c31
  1153. ADD c21, c21, c41
  1154. MADD $f22, c11, ALPHA_R, $f22
  1155. MADD $f8, c11, ALPHA_I, $f8
  1156. MADD $f23, c21, ALPHA_R, $f23
  1157. MADD $f9, c21, ALPHA_I, $f9
  1158. ST $f22, CO1, 0 * SIZE
  1159. ST $f8, CO1, 1 * SIZE
  1160. ST $f23, CO2, 0 * SIZE
  1161. ST $f9, CO2, 1 * SIZE
  1162. .align 3
  /* Copy the current BO into B; presumably this moves B past the */
  /* two-column panel -- confirm. */
  1163. .L69:
  1164. move B, BO
  /* Setup for the single-column panel, taken only when bit 0 of N is set. */
  1165. .align 3
  1166. .L70:
  /* J = N & 1; AO is reset to A; if J <= 0 there is nothing left and */
  /* control goes to the exit path at .L999. */
  1167. andi J, N, 1
  1168. move AO, A
  1169. bge $r0, J, .L999
  /* CO1 = C, then C moves on by LDC. I = M >> 1 counts row pairs; with */
  /* none (I <= 0) go to the odd-row handling at .L80. */
  1170. move CO1, C
  1171. srai.d I, M, 1
  1172. add.d C, CO1, LDC
  1173. bge $r0, I, .L80
  /* 2x1 tile: accumulators c11 and c12 (two A values by one B value per */
  /* k-step). Executed I = M >> 1 times via the branch at the end. */
  /* MADD, LD, ST, MTC, MOV and ADD are macros defined outside this chunk; */
  /* presumably "MADD d, x, y, z" is d = x * y + z and "ADD d, x, y" is */
  /* d = x + y -- TODO confirm. */
  1174. .L71:
  /* NOTE(review): unlike the two-column tiles, both loops below reload */
  /* a1, a2 and b1 at the top of every k-step, so every operand preload */
  /* in this prologue (a1, a2, a5, b1, b2, b3, b5, b6, b7) is either */
  /* overwritten before use or never read. Only the accumulator setup */
  /* (MTC and MOV), the L computation and "move BO, B" are consumed later. */
  1175. LD a1, AO, 0 * SIZE
  1176. MTC c11, $r0
  1177. LD a2, AO, 1 * SIZE
  1178. MOV c21, c11
  1179. LD a5, AO, 4 * SIZE
  1180. LD b1, B, 0 * SIZE
  1181. MOV c12, c11
  1182. LD b2, B, 1 * SIZE
  1183. MOV c22, c11
  1184. LD b3, B, 2 * SIZE
  1185. LD b5, B, 4 * SIZE
  /* L = K >> 2 passes of the unrolled loop. */
  1186. srai.d L, K, 2
  1187. LD b6, B, 8 * SIZE
  1188. LD b7, B, 12 * SIZE
  1189. move BO, B
  1190. bge $r0, L, .L75
  1191. .align 3
  /* Four k-steps per pass. Step k loads a1, a2 from AO[2k], AO[2k+1] */
  /* and b1 from BO[k], then updates c11 with b1 * a1 and c12 with */
  /* b1 * a2. AO advances by 8 elements and BO by 4 per pass. */
  1192. .L72:
  1193. LD a1, AO, 0 * SIZE
  1194. LD a2, AO, 1 * SIZE
  1195. LD b1, BO, 0 * SIZE
  1196. MADD c11, b1, a1, c11
  1197. MADD c12, b1, a2, c12
  1198. LD a1, AO, 2 * SIZE
  1199. LD a2, AO, 3 * SIZE
  1200. LD b1, BO, 1 * SIZE
  1201. MADD c11, b1, a1, c11
  1202. MADD c12, b1, a2, c12
  1203. LD a1, AO, 4 * SIZE
  1204. LD a2, AO, 5 * SIZE
  1205. LD b1, BO, 2 * SIZE
  1206. MADD c11, b1, a1, c11
  1207. MADD c12, b1, a2, c12
  1208. LD a1, AO, 6 * SIZE
  1209. LD a2, AO, 7 * SIZE
  1210. LD b1, BO, 3 * SIZE
  1211. MADD c11, b1, a1, c11
  1212. MADD c12, b1, a2, c12
  1213. addi.d L, L, -1
  1214. addi.d AO, AO, 8 * SIZE
  1215. addi.d BO, BO, 4 * SIZE
  1216. blt $r0, L, .L72
  1217. .align 3
  /* K-remainder: L = K & 3 single k-steps. */
  1218. .L75:
  1219. andi L, K, 3
  1220. bge $r0, L, .L78
  1221. .align 3
  /* One k-step per pass; AO advances by 2 elements and BO by 1. */
  1222. .L76:
  1223. LD a1, AO, 0 * SIZE
  1224. LD a2, AO, 1 * SIZE
  1225. LD b1, BO, 0 * SIZE
  1226. MADD c11, b1, a1, c11
  1227. MADD c12, b1, a2, c12
  1228. addi.d L, L, -1
  1229. addi.d AO, AO, 2 * SIZE
  1230. addi.d BO, BO, 1 * SIZE
  1231. blt $r0, L, .L76
  /* Write-back into the four values at CO1[0..3]: c11 feeds the first */
  /* pair and c12 the second, scaled by ALPHA_R and ALPHA_I. */
  /* NOTE(review): c21 and c22 are never accumulated into in this tile, */
  /* so the two ADDs add back the initial copy of c11 made in the */
  /* prologue. */
  1232. .L78:
  1233. LD $f22, CO1, 0 * SIZE
  1234. LD $f8, CO1, 1 * SIZE
  1235. LD $f23, CO1, 2 * SIZE
  1236. LD $f9, CO1, 3 * SIZE
  1237. ADD c11, c11, c21
  /* I is decremented and CO1 advances by 4 elements before the stores, */
  /* which is why the stores below use offsets -4..-1. */
  1238. addi.d I, I, -1
  1239. ADD c12, c12, c22
  1240. addi.d CO1,CO1, 4 * SIZE
  1241. MADD $f22, c11, ALPHA_R, $f22
  1242. MADD $f8, c11, ALPHA_I, $f8
  1243. MADD $f23, c12, ALPHA_R, $f23
  1244. MADD $f9, c12, ALPHA_I, $f9
  1245. ST $f22, CO1, -4 * SIZE
  1246. ST $f8, CO1, -3 * SIZE
  1247. ST $f23, CO1, -2 * SIZE
  1248. ST $f9, CO1, -1 * SIZE
  /* Next row pair while I > 0. */
  1249. blt $r0, I, .L71
  /* 1x1 tile for the odd row of the single-column panel: taken only when */
  /* M & 1 is non-zero, otherwise control goes to .L89. */
  /* MADD, LD, ST, MTC, MOV and ADD are macros defined outside this chunk; */
  /* presumably "MADD d, x, y, z" is d = x * y + z and "ADD d, x, y" is */
  /* d = x + y -- TODO confirm. */
  1250. .align 3
  1251. .L80:
  1252. andi I, M, 1
  1253. bge $r0, I, .L89
  /* NOTE(review): both loops below reload a1 and b1 at the top of every */
  /* k-step, so every operand preload in this prologue (a1..a4, b1..b7) */
  /* is either overwritten before use or never read. Only the accumulator */
  /* setup (c11 cleared, c21 copied from it), L and BO are consumed later. */
  1254. LD a1, AO, 0 * SIZE
  1255. MTC c11, $r0
  1256. LD a2, AO, 1 * SIZE
  1257. MOV c21, c11
  1258. LD a3, AO, 2 * SIZE
  1259. LD a4, AO, 3 * SIZE
  1260. LD b1, B, 0 * SIZE
  1261. LD b2, B, 1 * SIZE
  1262. LD b3, B, 2 * SIZE
  1263. LD b4, B, 3 * SIZE
  1264. LD b5, B, 4 * SIZE
  1265. LD b6, B, 8 * SIZE
  1266. LD b7, B, 12 * SIZE
  /* L = K >> 2 passes of the unrolled loop. */
  1267. srai.d L, K, 2
  1268. move BO, B
  1269. bge $r0, L, .L85
  1270. .align 3
  /* Four k-steps per pass, each loading one A and one B value. Steps at */
  /* offsets 0 and 2 accumulate into c11, steps at offsets 1 and 3 into */
  /* c21. AO and BO advance by 4 elements per pass. */
  1271. .L82:
  1272. LD a1, AO, 0 * SIZE
  1273. LD b1, BO, 0 * SIZE
  1274. MADD c11, b1, a1, c11
  1275. LD a1, AO, 1 * SIZE
  1276. LD b1, BO, 1 * SIZE
  1277. MADD c21, b1, a1, c21
  1278. LD a1, AO, 2 * SIZE
  1279. LD b1, BO, 2 * SIZE
  1280. MADD c11, b1, a1, c11
  1281. LD a1, AO, 3 * SIZE
  1282. LD b1, BO, 3 * SIZE
  1283. MADD c21, b1, a1, c21
  1284. addi.d L, L, -1
  1285. addi.d AO, AO, 4 * SIZE
  1286. addi.d BO, BO, 4 * SIZE
  1287. blt $r0, L, .L82
  1288. .align 3
  /* K-remainder: L = K & 3 single k-steps. */
  1289. .L85:
  1290. andi L, K, 3
  1291. bge $r0, L, .L88
  1292. .align 3
  /* One k-step per pass into c11; AO and BO advance by 1 element. */
  1293. .L86:
  1294. LD a1, AO, 0 * SIZE
  1295. LD b1, BO, 0 * SIZE
  1296. MADD c11, b1, a1, c11
  1297. addi.d L, L, -1
  1298. addi.d AO, AO, 1 * SIZE
  1299. addi.d BO, BO, 1 * SIZE
  1300. blt $r0, L, .L86
  /* Write-back: fold the two partial sums (c11 += c21), then update the */
  /* two values at CO1[0..1] with c11 scaled by ALPHA_R and ALPHA_I. */
  1301. .L88:
  1302. LD $f22, CO1, 0 * SIZE
  1303. LD $f8, CO1, 1 * SIZE
  1304. ADD c11, c11, c21
  1305. MADD $f22, c11, ALPHA_R, $f22
  1306. MADD $f8, c11, ALPHA_I, $f8
  1307. ST $f22, CO1, 0 * SIZE
  1308. ST $f8, CO1, 1 * SIZE
  1309. .align 3
  /* Copy the current BO into B; presumably this moves B past the */
  /* single-column panel -- confirm. */
  1310. .L89:
  1311. move B, BO
  /* Common exit path: restore registers from the stack frame, release */
  /* the frame and return. The matching saves are presumably done in the */
  /* prologue, which is outside this chunk -- confirm that the offsets and */
  /* the 128-byte frame size agree with it. */
  1312. .align 3
  1313. .L999:
  /* Reload $r23..$r28 from sp+0..sp+40 (LDARG is a macro defined */
  /* elsewhere; presumably a 64-bit integer load). */
  1314. LDARG $r23, $sp, 0
  1315. LDARG $r24, $sp, 8
  1316. LDARG $r25, $sp, 16
  1317. LDARG $r26, $sp, 24
  1318. LDARG $r27, $sp, 32
  1319. LDARG $r28, $sp, 40
  /* Reload $f24..$f29 from sp+48..sp+88. */
  1320. fld.d $f24, $sp, 48
  1321. fld.d $f25, $sp, 56
  1322. fld.d $f26, $sp, 64
  1323. fld.d $f27, $sp, 72
  1324. fld.d $f28, $sp, 80
  1325. fld.d $f29, $sp, 88
  /* Pop the 128-byte frame. */
  1326. addi.d $sp, $sp, 128
  /* Copy $r17 into $r4 and $f22 into $f0; $r4 and $f0 are the first */
  /* integer and floating-point return registers in the LoongArch ABI. */
  /* NOTE(review): what $r17 and $f22 hold at this point is not visible */
  /* here -- confirm the intended return value. */
  1327. move $r4, $r17
  1328. fmov.d $f0, $f22
  /* Return: jump to the address in $r1 with the link discarded ($r0). */
  1329. jirl $r0, $r1, 0x0
  1330. EPILOGUE