You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemv_n_ppc440.S 29 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Build-time configuration and register aliases for the kernel that
     follows.  Everything in this span is preprocessor-only; no
     instructions are emitted here. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Integer-argument register aliases for Linux / FreeBSD.  The prologue
     later loads some of these from the stack at
     FRAMESLOT(n) + STACKSIZE(SP): INCY and BUFFER in the 32-bit build;
     INCX, Y, INCY and BUFFER in the 64-bit build. */
  40. #if defined(linux) || defined(__FreeBSD__)
  41. #ifndef __64BIT__
  42. #define M r3
  43. #define N r4
  44. #define A r6
  45. #define LDA r7
  46. #define X r8
  47. #define INCX r9
  48. #define Y r10
  49. #define INCY r5
  50. #else
  51. #define M r3
  52. #define N r4
  53. #define A r8
  54. #define LDA r9
  55. #define X r10
  56. #define INCX r5
  57. #define Y r6
  58. #define INCY r7
  59. #endif
  60. #endif
  /* Integer-argument register aliases for AIX / Darwin.  In the 32-bit
     DOUBLE build the prologue loads LDA, X, INCX, Y, INCY and BUFFER from
     the stack; in the other builds it loads INCX, Y, INCY and BUFFER. */
  61. #if defined(_AIX) || defined(__APPLE__)
  62. #if !defined(__64BIT__) && defined(DOUBLE)
  63. #define M r3
  64. #define N r4
  65. #define A r10
  66. #define LDA r5
  67. #define X r6
  68. #define INCX r7
  69. #define Y r8
  70. #define INCY r9
  71. #else
  72. #define M r3
  73. #define N r4
  74. #define A r8
  75. #define LDA r9
  76. #define X r10
  77. #define INCX r5
  78. #define Y r6
  79. #define INCY r7
  80. #endif
  81. #endif
  /* Working integer registers.
       J        counter of remaining column groups (set from N).
       I        NOTE(review): not referenced in the part of the file
                visible to this review; confirm whether it is used.
       AO1..AO4 cursors into up to four consecutive columns of A
                (AO2 = AO1 + LDA, and so on).
       Y1 / Y2  load cursor / store cursor over the accumulation vector;
                both start at YY for each column group.
       PREA     byte offset used by the dcbt / dcbtst prefetch hints.
       YY       base of the accumulation vector: Y itself when the
                scaled INCY equals SIZE, otherwise BUFFER - SIZE.
       BUFFER   scratch buffer pointer, loaded from the stack in the
                prologue.
     r14..r22 are saved on the stack by the prologue. */
  82. #define I r11
  83. #define J r12
  84. #define AO1 r14
  85. #define AO2 r15
  86. #define AO3 r16
  87. #define AO4 r17
  88. #define Y1 r18
  89. #define Y2 r19
  90. #define PREA r20
  91. #define YY r21
  92. #define BUFFER r22
  /* y01..y08 hold values loaded from the accumulation vector; y09..y16
     hold the running sums that are stored back in the unrolled loops. */
  93. #define y01 f0
  94. #define y02 f1
  95. #define y03 f2
  96. #define y04 f3
  97. #define y05 f4
  98. #define y06 f5
  99. #define y07 f6
  100. #define y08 f7
  101. #define y09 f8
  102. #define y10 f9
  103. #define y11 f10
  104. #define y12 f11
  105. #define y13 f12
  106. #define y14 f13
  107. #define y15 f14
  108. #define y16 f15
  /* alphaNr / alphaNi: real and imaginary parts of alpha times the N-th
     x element of the current column group (built with FMUL followed by
     FMSUBR / FMADDR). */
  109. #define alpha1r f16
  110. #define alpha1i f17
  111. #define alpha2r f18
  112. #define alpha2i f19
  113. #define alpha3r f20
  114. #define alpha3i f21
  115. #define alpha4r f22
  116. #define alpha4i f23
  /* a1..a8: staging registers, first for x elements and then for
     elements of A. */
  117. #define a1 f24
  118. #define a2 f25
  119. #define a3 f26
  120. #define a4 f27
  121. #define a5 f28
  122. #define a6 f29
  123. #define a7 f30
  124. #define a8 f31
  /* alpha_r / alpha_i deliberately share f14 / f15 with y15 / y16.  The
     inner loops overwrite them, so the kernel reloads both from ALPHA_R /
     ALPHA_I at the start of every column group. */
  125. #define alpha_r f14
  126. #define alpha_i f15
  /* Prefetch distance, in elements; multiplied by SIZE when loaded into
     PREA.  Only PPCG4 and POWER6 define it here, yet PREA is loaded
     unconditionally.
     NOTE(review): other targets presumably get PREFETCHSIZE_A from
     elsewhere (e.g. common.h); confirm, otherwise the build fails. */
  127. #if defined(PPCG4)
  128. #define PREFETCHSIZE_A (3 * 4)
  129. #endif
  130. #if defined(POWER6)
  131. #define PREFETCHSIZE_A (3 * 4)
  132. #endif
  /* FMADDR / FMSUBR are used when forming alpha times x.  Defining XCONJ
     swaps FMADD and FNMSUB for these two. */
  133. #ifndef XCONJ
  134. #define FMADDR FMADD
  135. #define FMSUBR FNMSUB
  136. #else
  137. #define FMADDR FNMSUB
  138. #define FMSUBR FMADD
  139. #endif
  /* FMADDX / FMSUBX are used for the terms that multiply the second
     (imaginary) component of each A element.  Defining CONJ swaps FMADD
     and FNMSUB for these two. */
  140. #ifndef CONJ
  141. #define FMADDX FMADD
  142. #define FMSUBX FNMSUB
  143. #else
  144. #define FMADDX FNMSUB
  145. #define FMSUBX FMADD
  146. #endif
  147. #ifndef NEEDPARAM
  /* Stack frame layout (byte offsets from SP after the prologue has
     subtracted STACKSIZE):
         0 .. 136   f14..f31, saved with stfd (8 bytes each)
       144 ..       r14..r22: stw at 144..176 in the 32-bit build,
                    std at 144..208 in the 64-bit build
       ALPHA_R      f1 as stored by the prologue (real part of alpha)
       ALPHA_I      f2 as stored by the prologue (imaginary part)
       FZERO        8 bytes of zero written from r0; read back with lfd
                    to obtain 0.0 for clearing the scratch buffer
     The three named slots are the last 24 bytes of the frame. */
  148. #ifndef __64BIT__
  149. #define STACKSIZE 232
  150. #define ALPHA_R 208(SP)
  151. #define ALPHA_I 216(SP)
  152. #define FZERO 224(SP)
  153. #else
  /* 64-bit build: larger frame because the integer saves are 8 bytes
     each. */
  154. #define STACKSIZE 280
  155. #define ALPHA_R 256(SP)
  156. #define ALPHA_I 264(SP)
  157. #define FZERO 272(SP)
  158. #endif
  159. PROLOGUE
  160. PROFCODE
  161. addi SP, SP, -STACKSIZE
  162. li r0, 0
  163. stfd f14, 0(SP)
  164. stfd f15, 8(SP)
  165. stfd f16, 16(SP)
  166. stfd f17, 24(SP)
  167. stfd f18, 32(SP)
  168. stfd f19, 40(SP)
  169. stfd f20, 48(SP)
  170. stfd f21, 56(SP)
  171. stfd f22, 64(SP)
  172. stfd f23, 72(SP)
  173. stfd f24, 80(SP)
  174. stfd f25, 88(SP)
  175. stfd f26, 96(SP)
  176. stfd f27, 104(SP)
  177. stfd f28, 112(SP)
  178. stfd f29, 120(SP)
  179. stfd f30, 128(SP)
  180. stfd f31, 136(SP)
  181. #ifdef __64BIT__
  182. std r0, FZERO
  183. std r14, 144(SP)
  184. std r15, 152(SP)
  185. std r16, 160(SP)
  186. std r17, 168(SP)
  187. std r18, 176(SP)
  188. std r19, 184(SP)
  189. std r20, 192(SP)
  190. std r21, 200(SP)
  191. std r22, 208(SP)
  192. #else
  193. stw r0, 0 + FZERO
  194. stw r0, 4 + FZERO
  195. stw r14, 144(SP)
  196. stw r15, 148(SP)
  197. stw r16, 152(SP)
  198. stw r17, 156(SP)
  199. stw r18, 160(SP)
  200. stw r19, 164(SP)
  201. stw r20, 168(SP)
  202. stw r21, 172(SP)
  203. stw r22, 176(SP)
  204. #endif
  205. #if defined(linux) || defined(__FreeBSD__)
  206. #ifndef __64BIT__
  207. lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
  208. lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
  209. #else
  210. ld INCX, FRAMESLOT(0) + STACKSIZE(SP)
  211. ld Y, FRAMESLOT(1) + STACKSIZE(SP)
  212. ld INCY, FRAMESLOT(2) + STACKSIZE(SP)
  213. ld BUFFER, FRAMESLOT(3) + STACKSIZE(SP)
  214. #endif
  215. #endif
  216. #if defined(_AIX) || defined(__APPLE__)
  217. #ifndef __64BIT__
  218. #ifdef DOUBLE
  219. lwz LDA, FRAMESLOT(0) + STACKSIZE(SP)
  220. lwz X, FRAMESLOT(1) + STACKSIZE(SP)
  221. lwz INCX, FRAMESLOT(2) + STACKSIZE(SP)
  222. lwz Y, FRAMESLOT(3) + STACKSIZE(SP)
  223. lwz INCY, FRAMESLOT(4) + STACKSIZE(SP)
  224. lwz BUFFER, FRAMESLOT(5) + STACKSIZE(SP)
  225. #else
  226. lwz INCX, FRAMESLOT(0) + STACKSIZE(SP)
  227. lwz Y, FRAMESLOT(1) + STACKSIZE(SP)
  228. lwz INCY, FRAMESLOT(2) + STACKSIZE(SP)
  229. lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP)
  230. #endif
  231. #else
  232. ld INCX, FRAMESLOT(0) + STACKSIZE(SP)
  233. ld Y, FRAMESLOT(1) + STACKSIZE(SP)
  234. ld INCY, FRAMESLOT(2) + STACKSIZE(SP)
  235. ld BUFFER, FRAMESLOT(3) + STACKSIZE(SP)
  236. #endif
  237. #endif
  238. stfd f1, ALPHA_R
  239. stfd f2, ALPHA_I
  240. slwi LDA, LDA, ZBASE_SHIFT
  241. slwi INCX, INCX, ZBASE_SHIFT
  242. slwi INCY, INCY, ZBASE_SHIFT
  243. addi INCX, INCX, -SIZE
  244. addi INCY, INCY, -SIZE
  245. addi A, A, -SIZE
  246. cmpwi cr0, M, 0
  247. ble- LL(999)
  248. sub X, X, INCX
  249. cmpwi cr0, N, 0
  250. sub Y, Y, INCY
  251. ble- LL(999)
  252. li PREA, PREFETCHSIZE_A * SIZE
  253. mr YY, Y
  254. lfd f0, FZERO
  255. cmpi cr0, 0, INCY, SIZE
  256. beq LL(10)
  257. addi YY, BUFFER, -SIZE
  258. addi Y1, BUFFER, -SIZE
  259. addi r0, M, 3
  260. srawi. r0, r0, 2
  261. mtspr CTR, r0
  262. .align 4
  263. LL(02):
  264. STFDU f0, 1 * SIZE(Y1)
  265. STFDU f0, 1 * SIZE(Y1)
  266. STFDU f0, 1 * SIZE(Y1)
  267. STFDU f0, 1 * SIZE(Y1)
  268. STFDU f0, 1 * SIZE(Y1)
  269. STFDU f0, 1 * SIZE(Y1)
  270. STFDU f0, 1 * SIZE(Y1)
  271. STFDU f0, 1 * SIZE(Y1)
  272. bdnz LL(02)
  273. .align 4
  274. LL(10):
  275. srawi. J, N, 2
  276. ble LL(20)
  277. .align 4
  278. LL(11):
  279. lfd alpha_r, ALPHA_R
  280. lfd alpha_i, ALPHA_I
  281. LFDUX a1, X, INCX
  282. LFDU a2, 1 * SIZE(X)
  283. LFDUX a3, X, INCX
  284. LFDU a4, 1 * SIZE(X)
  285. LFDUX a5, X, INCX
  286. LFDU a6, 1 * SIZE(X)
  287. LFDUX a7, X, INCX
  288. LFDU a8, 1 * SIZE(X)
  289. FMUL alpha1r, alpha_r, a1
  290. FMUL alpha1i, alpha_i, a1
  291. FMUL alpha2r, alpha_r, a3
  292. FMUL alpha2i, alpha_i, a3
  293. FMUL alpha3r, alpha_r, a5
  294. mr Y1, YY
  295. FMUL alpha3i, alpha_i, a5
  296. mr Y2, YY
  297. FMUL alpha4r, alpha_r, a7
  298. mr AO1, A
  299. FMUL alpha4i, alpha_i, a7
  300. add AO2, A, LDA
  301. FMSUBR alpha1r, alpha_i, a2, alpha1r
  302. add AO3, AO2, LDA
  303. FMADDR alpha1i, alpha_r, a2, alpha1i
  304. add AO4, AO3, LDA
  305. FMSUBR alpha2r, alpha_i, a4, alpha2r
  306. add A, AO4, LDA
  307. FMADDR alpha2i, alpha_r, a4, alpha2i
  308. FMSUBR alpha3r, alpha_i, a6, alpha3r
  309. srawi. r0, M, 2
  310. FMADDR alpha3i, alpha_r, a6, alpha3i
  311. FMSUBR alpha4r, alpha_i, a8, alpha4r
  312. mtspr CTR, r0
  313. FMADDR alpha4i, alpha_r, a8, alpha4i
  314. ble LL(15)
  315. .align 4
  316. LFDU a1, 1 * SIZE(AO1)
  317. LFDU y01, 1 * SIZE(Y1)
  318. LFDU a2, 1 * SIZE(AO1)
  319. LFDU y02, 1 * SIZE(Y1)
  320. LFDU a3, 1 * SIZE(AO1)
  321. LFDU y03, 1 * SIZE(Y1)
  322. LFDU a4, 1 * SIZE(AO1)
  323. LFDU y04, 1 * SIZE(Y1)
  324. LFDU a5, 1 * SIZE(AO1)
  325. LFDU y05, 1 * SIZE(Y1)
  326. LFDU a6, 1 * SIZE(AO1)
  327. LFDU y06, 1 * SIZE(Y1)
  328. LFDU a7, 1 * SIZE(AO1)
  329. LFDU y07, 1 * SIZE(Y1)
  330. LFDU a8, 1 * SIZE(AO1)
  331. LFDU y08, 1 * SIZE(Y1)
  332. FMADD y09, alpha1r, a1, y01
  333. FMADD y10, alpha1i, a1, y02
  334. FMADD y11, alpha1r, a3, y03
  335. FMADD y12, alpha1i, a3, y04
  336. FMADD y13, alpha1r, a5, y05
  337. FMADD y14, alpha1i, a5, y06
  338. FMADD y15, alpha1r, a7, y07
  339. FMADD y16, alpha1i, a7, y08
  340. bdz LL(13)
  341. .align 4
  342. LL(12):
  343. FMSUBX y09, alpha1i, a2, y09
  344. LFDU a1, 1 * SIZE(AO2)
  345. FMADDX y10, alpha1r, a2, y10
  346. LFDU a2, 1 * SIZE(AO2)
  347. FMSUBX y11, alpha1i, a4, y11
  348. LFDU a3, 1 * SIZE(AO2)
  349. FMADDX y12, alpha1r, a4, y12
  350. LFDU a4, 1 * SIZE(AO2)
  351. #ifdef PPCG4
  352. dcbt AO2, PREA
  353. #endif
  354. FMSUBX y13, alpha1i, a6, y13
  355. LFDU a5, 1 * SIZE(AO2)
  356. FMADDX y14, alpha1r, a6, y14
  357. LFDU a6, 1 * SIZE(AO2)
  358. FMSUBX y15, alpha1i, a8, y15
  359. LFDU a7, 1 * SIZE(AO2)
  360. FMADDX y16, alpha1r, a8, y16
  361. LFDU a8, 1 * SIZE(AO2)
  362. #if defined(PPCG4) && defined(DOUBLE)
  363. dcbt AO2, PREA
  364. #endif
  365. FMADD y09, alpha2r, a1, y09
  366. LFDU y01, 1 * SIZE(Y1)
  367. FMADD y10, alpha2i, a1, y10
  368. LFDU y02, 1 * SIZE(Y1)
  369. FMADD y11, alpha2r, a3, y11
  370. LFDU y03, 1 * SIZE(Y1)
  371. FMADD y12, alpha2i, a3, y12
  372. LFDU y04, 1 * SIZE(Y1)
  373. #ifdef PPCG4
  374. dcbtst Y1, PREA
  375. #endif
  376. FMADD y13, alpha2r, a5, y13
  377. FMADD y14, alpha2i, a5, y14
  378. FMADD y15, alpha2r, a7, y15
  379. FMADD y16, alpha2i, a7, y16
  380. FMSUBX y09, alpha2i, a2, y09
  381. LFDU a1, 1 * SIZE(AO3)
  382. FMADDX y10, alpha2r, a2, y10
  383. LFDU a2, 1 * SIZE(AO3)
  384. FMSUBX y11, alpha2i, a4, y11
  385. LFDU a3, 1 * SIZE(AO3)
  386. FMADDX y12, alpha2r, a4, y12
  387. LFDU a4, 1 * SIZE(AO3)
  388. #ifdef PPCG4
  389. dcbt AO3, PREA
  390. #endif
  391. FMSUBX y13, alpha2i, a6, y13
  392. LFDU a5, 1 * SIZE(AO3)
  393. FMADDX y14, alpha2r, a6, y14
  394. LFDU a6, 1 * SIZE(AO3)
  395. FMSUBX y15, alpha2i, a8, y15
  396. LFDU a7, 1 * SIZE(AO3)
  397. FMADDX y16, alpha2r, a8, y16
  398. LFDU a8, 1 * SIZE(AO3)
  399. #if defined(PPCG4) && defined(DOUBLE)
  400. dcbt AO3, PREA
  401. #endif
  402. FMADD y09, alpha3r, a1, y09
  403. LFDU y05, 1 * SIZE(Y1)
  404. FMADD y10, alpha3i, a1, y10
  405. LFDU y06, 1 * SIZE(Y1)
  406. FMADD y11, alpha3r, a3, y11
  407. LFDU y07, 1 * SIZE(Y1)
  408. FMADD y12, alpha3i, a3, y12
  409. LFDU y08, 1 * SIZE(Y1)
  410. #if defined(PPCG4) && defined(DOUBLE)
  411. dcbtst Y1, PREA
  412. #endif
  413. FMADD y13, alpha3r, a5, y13
  414. FMADD y14, alpha3i, a5, y14
  415. FMADD y15, alpha3r, a7, y15
  416. FMADD y16, alpha3i, a7, y16
  417. FMSUBX y09, alpha3i, a2, y09
  418. LFDU a1, 1 * SIZE(AO4)
  419. FMADDX y10, alpha3r, a2, y10
  420. LFDU a2, 1 * SIZE(AO4)
  421. FMSUBX y11, alpha3i, a4, y11
  422. LFDU a3, 1 * SIZE(AO4)
  423. FMADDX y12, alpha3r, a4, y12
  424. LFDU a4, 1 * SIZE(AO4)
  425. #ifdef PPCG4
  426. dcbt AO4, PREA
  427. #endif
  428. FMSUBX y13, alpha3i, a6, y13
  429. LFDU a5, 1 * SIZE(AO4)
  430. FMADDX y14, alpha3r, a6, y14
  431. LFDU a6, 1 * SIZE(AO4)
  432. FMSUBX y15, alpha3i, a8, y15
  433. LFDU a7, 1 * SIZE(AO4)
  434. FMADDX y16, alpha3r, a8, y16
  435. LFDU a8, 1 * SIZE(AO4)
  436. #if defined(PPCG4) && defined(DOUBLE)
  437. dcbt AO4, PREA
  438. #endif
  439. FMADD y09, alpha4r, a1, y09
  440. FMADD y10, alpha4i, a1, y10
  441. FMADD y11, alpha4r, a3, y11
  442. FMADD y12, alpha4i, a3, y12
  443. FMADD y13, alpha4r, a5, y13
  444. FMADD y14, alpha4i, a5, y14
  445. FMADD y15, alpha4r, a7, y15
  446. FMADD y16, alpha4i, a7, y16
  447. FMSUBX y09, alpha4i, a2, y09
  448. LFDU a1, 1 * SIZE(AO1)
  449. FMADDX y10, alpha4r, a2, y10
  450. LFDU a2, 1 * SIZE(AO1)
  451. FMSUBX y11, alpha4i, a4, y11
  452. LFDU a3, 1 * SIZE(AO1)
  453. FMADDX y12, alpha4r, a4, y12
  454. LFDU a4, 1 * SIZE(AO1)
  455. #ifdef PPCG4
  456. dcbt AO1, PREA
  457. #endif
  458. FMSUBX y13, alpha4i, a6, y13
  459. LFDU a5, 1 * SIZE(AO1)
  460. FMADDX y14, alpha4r, a6, y14
  461. LFDU a6, 1 * SIZE(AO1)
  462. FMSUBX y15, alpha4i, a8, y15
  463. LFDU a7, 1 * SIZE(AO1)
  464. FMADDX y16, alpha4r, a8, y16
  465. LFDU a8, 1 * SIZE(AO1)
  466. #if defined(PPCG4) && defined(DOUBLE)
  467. dcbt AO1, PREA
  468. #endif
  469. STFDU y09, 1 * SIZE(Y2)
  470. FMADD y09, alpha1r, a1, y01
  471. STFDU y10, 1 * SIZE(Y2)
  472. FMADD y10, alpha1i, a1, y02
  473. STFDU y11, 1 * SIZE(Y2)
  474. FMADD y11, alpha1r, a3, y03
  475. STFDU y12, 1 * SIZE(Y2)
  476. FMADD y12, alpha1i, a3, y04
  477. STFDU y13, 1 * SIZE(Y2)
  478. FMADD y13, alpha1r, a5, y05
  479. STFDU y14, 1 * SIZE(Y2)
  480. FMADD y14, alpha1i, a5, y06
  481. STFDU y15, 1 * SIZE(Y2)
  482. FMADD y15, alpha1r, a7, y07
  483. STFDU y16, 1 * SIZE(Y2)
  484. FMADD y16, alpha1i, a7, y08
  485. bdnz LL(12)
  486. .align 4
  487. LL(13):
  488. FMSUBX y09, alpha1i, a2, y09
  489. LFDU a1, 1 * SIZE(AO2)
  490. FMADDX y10, alpha1r, a2, y10
  491. LFDU a2, 1 * SIZE(AO2)
  492. FMSUBX y11, alpha1i, a4, y11
  493. LFDU a3, 1 * SIZE(AO2)
  494. FMADDX y12, alpha1r, a4, y12
  495. LFDU a4, 1 * SIZE(AO2)
  496. FMSUBX y13, alpha1i, a6, y13
  497. LFDU a5, 1 * SIZE(AO2)
  498. FMADDX y14, alpha1r, a6, y14
  499. LFDU a6, 1 * SIZE(AO2)
  500. FMSUBX y15, alpha1i, a8, y15
  501. LFDU a7, 1 * SIZE(AO2)
  502. FMADDX y16, alpha1r, a8, y16
  503. LFDU a8, 1 * SIZE(AO2)
  504. FMADD y09, alpha2r, a1, y09
  505. FMADD y10, alpha2i, a1, y10
  506. FMADD y11, alpha2r, a3, y11
  507. FMADD y12, alpha2i, a3, y12
  508. FMADD y13, alpha2r, a5, y13
  509. FMADD y14, alpha2i, a5, y14
  510. FMADD y15, alpha2r, a7, y15
  511. FMADD y16, alpha2i, a7, y16
  512. FMSUBX y09, alpha2i, a2, y09
  513. LFDU a1, 1 * SIZE(AO3)
  514. FMADDX y10, alpha2r, a2, y10
  515. LFDU a2, 1 * SIZE(AO3)
  516. FMSUBX y11, alpha2i, a4, y11
  517. LFDU a3, 1 * SIZE(AO3)
  518. FMADDX y12, alpha2r, a4, y12
  519. LFDU a4, 1 * SIZE(AO3)
  520. FMSUBX y13, alpha2i, a6, y13
  521. LFDU a5, 1 * SIZE(AO3)
  522. FMADDX y14, alpha2r, a6, y14
  523. LFDU a6, 1 * SIZE(AO3)
  524. FMSUBX y15, alpha2i, a8, y15
  525. LFDU a7, 1 * SIZE(AO3)
  526. FMADDX y16, alpha2r, a8, y16
  527. LFDU a8, 1 * SIZE(AO3)
  528. FMADD y09, alpha3r, a1, y09
  529. FMADD y10, alpha3i, a1, y10
  530. FMADD y11, alpha3r, a3, y11
  531. FMADD y12, alpha3i, a3, y12
  532. FMADD y13, alpha3r, a5, y13
  533. FMADD y14, alpha3i, a5, y14
  534. FMADD y15, alpha3r, a7, y15
  535. FMADD y16, alpha3i, a7, y16
  536. FMSUBX y09, alpha3i, a2, y09
  537. LFDU a1, 1 * SIZE(AO4)
  538. FMADDX y10, alpha3r, a2, y10
  539. LFDU a2, 1 * SIZE(AO4)
  540. FMSUBX y11, alpha3i, a4, y11
  541. LFDU a3, 1 * SIZE(AO4)
  542. FMADDX y12, alpha3r, a4, y12
  543. LFDU a4, 1 * SIZE(AO4)
  544. FMSUBX y13, alpha3i, a6, y13
  545. LFDU a5, 1 * SIZE(AO4)
  546. FMADDX y14, alpha3r, a6, y14
  547. LFDU a6, 1 * SIZE(AO4)
  548. FMSUBX y15, alpha3i, a8, y15
  549. LFDU a7, 1 * SIZE(AO4)
  550. FMADDX y16, alpha3r, a8, y16
  551. LFDU a8, 1 * SIZE(AO4)
  552. FMADD y09, alpha4r, a1, y09
  553. FMADD y10, alpha4i, a1, y10
  554. FMADD y11, alpha4r, a3, y11
  555. FMADD y12, alpha4i, a3, y12
  556. FMADD y13, alpha4r, a5, y13
  557. FMADD y14, alpha4i, a5, y14
  558. FMADD y15, alpha4r, a7, y15
  559. FMADD y16, alpha4i, a7, y16
  560. FMSUBX y09, alpha4i, a2, y09
  561. FMADDX y10, alpha4r, a2, y10
  562. FMSUBX y11, alpha4i, a4, y11
  563. FMADDX y12, alpha4r, a4, y12
  564. FMSUBX y13, alpha4i, a6, y13
  565. STFDU y09, 1 * SIZE(Y2)
  566. FMADDX y14, alpha4r, a6, y14
  567. STFDU y10, 1 * SIZE(Y2)
  568. FMSUBX y15, alpha4i, a8, y15
  569. STFDU y11, 1 * SIZE(Y2)
  570. FMADDX y16, alpha4r, a8, y16
  571. STFDU y12, 1 * SIZE(Y2)
  572. STFDU y13, 1 * SIZE(Y2)
  573. STFDU y14, 1 * SIZE(Y2)
  574. STFDU y15, 1 * SIZE(Y2)
  575. STFDU y16, 1 * SIZE(Y2)
  576. .align 4
  577. LL(15):
  578. andi. r0, M, 2
  579. ble LL(17)
  580. LFDU a1, 1 * SIZE(AO1)
  581. LFDU y01, 1 * SIZE(Y1)
  582. LFDU a2, 1 * SIZE(AO1)
  583. LFDU y02, 1 * SIZE(Y1)
  584. LFDU a3, 1 * SIZE(AO1)
  585. LFDU y03, 1 * SIZE(Y1)
  586. LFDU a4, 1 * SIZE(AO1)
  587. LFDU y04, 1 * SIZE(Y1)
  588. FMADD y01, alpha1r, a1, y01
  589. LFDU a5, 1 * SIZE(AO2)
  590. FMADD y02, alpha1i, a1, y02
  591. LFDU a6, 1 * SIZE(AO2)
  592. FMADD y03, alpha1r, a3, y03
  593. LFDU a7, 1 * SIZE(AO2)
  594. FMADD y04, alpha1i, a3, y04
  595. LFDU a8, 1 * SIZE(AO2)
  596. FMSUBX y01, alpha1i, a2, y01
  597. LFDU a1, 1 * SIZE(AO3)
  598. FMADDX y02, alpha1r, a2, y02
  599. LFDU a2, 1 * SIZE(AO3)
  600. FMSUBX y03, alpha1i, a4, y03
  601. LFDU a3, 1 * SIZE(AO3)
  602. FMADDX y04, alpha1r, a4, y04
  603. LFDU a4, 1 * SIZE(AO3)
  604. FMADD y01, alpha2r, a5, y01
  605. FMADD y02, alpha2i, a5, y02
  606. FMADD y03, alpha2r, a7, y03
  607. FMADD y04, alpha2i, a7, y04
  608. FMSUBX y01, alpha2i, a6, y01
  609. LFDU a5, 1 * SIZE(AO4)
  610. FMADDX y02, alpha2r, a6, y02
  611. LFDU a6, 1 * SIZE(AO4)
  612. FMSUBX y03, alpha2i, a8, y03
  613. LFDU a7, 1 * SIZE(AO4)
  614. FMADDX y04, alpha2r, a8, y04
  615. LFDU a8, 1 * SIZE(AO4)
  616. FMADD y01, alpha3r, a1, y01
  617. FMADD y02, alpha3i, a1, y02
  618. FMADD y03, alpha3r, a3, y03
  619. FMADD y04, alpha3i, a3, y04
  620. FMSUBX y01, alpha3i, a2, y01
  621. FMADDX y02, alpha3r, a2, y02
  622. FMSUBX y03, alpha3i, a4, y03
  623. FMADDX y04, alpha3r, a4, y04
  624. FMADD y01, alpha4r, a5, y01
  625. FMADD y02, alpha4i, a5, y02
  626. FMADD y03, alpha4r, a7, y03
  627. FMADD y04, alpha4i, a7, y04
  628. FMSUBX y01, alpha4i, a6, y01
  629. FMADDX y02, alpha4r, a6, y02
  630. FMSUBX y03, alpha4i, a8, y03
  631. FMADDX y04, alpha4r, a8, y04
  632. STFDU y01, 1 * SIZE(Y2)
  633. STFDU y02, 1 * SIZE(Y2)
  634. STFDU y03, 1 * SIZE(Y2)
  635. STFDU y04, 1 * SIZE(Y2)
  636. .align 4
  637. LL(17):
  638. andi. r0, M, 1
  639. ble LL(19)
  640. LFDU y01, 1 * SIZE(Y1)
  641. LFDU y02, 1 * SIZE(Y1)
  642. LFDU a1, 1 * SIZE(AO1)
  643. LFDU a2, 1 * SIZE(AO1)
  644. LFDU a3, 1 * SIZE(AO2)
  645. LFDU a4, 1 * SIZE(AO2)
  646. FMADD y01, alpha1r, a1, y01
  647. LFDU a5, 1 * SIZE(AO3)
  648. FMADD y02, alpha1i, a1, y02
  649. LFDU a6, 1 * SIZE(AO3)
  650. FMSUBX y01, alpha1i, a2, y01
  651. LFDU a7, 1 * SIZE(AO4)
  652. FMADDX y02, alpha1r, a2, y02
  653. LFDU a8, 1 * SIZE(AO4)
  654. FMADD y01, alpha2r, a3, y01
  655. FMADD y02, alpha2i, a3, y02
  656. FMSUBX y01, alpha2i, a4, y01
  657. FMADDX y02, alpha2r, a4, y02
  658. FMADD y01, alpha3r, a5, y01
  659. FMADD y02, alpha3i, a5, y02
  660. FMSUBX y01, alpha3i, a6, y01
  661. FMADDX y02, alpha3r, a6, y02
  662. FMADD y01, alpha4r, a7, y01
  663. FMADD y02, alpha4i, a7, y02
  664. FMSUBX y01, alpha4i, a8, y01
  665. FMADDX y02, alpha4r, a8, y02
  666. STFDU y01, 1 * SIZE(Y2)
  667. STFDU y02, 1 * SIZE(Y2)
  668. .align 4
  669. LL(19):
  670. addi J, J, -1
  671. cmpi cr0, 0, J, 0
  672. bgt LL(11)
  673. .align 4
  674. LL(20):
  675. andi. J, N, 2
  676. ble LL(30)
  677. lfd alpha_r, ALPHA_R
  678. lfd alpha_i, ALPHA_I
  679. LFDUX a1, X, INCX
  680. LFDU a2, 1 * SIZE(X)
  681. LFDUX a3, X, INCX
  682. LFDU a4, 1 * SIZE(X)
  683. FMUL alpha1r, alpha_r, a1
  684. mr Y1, YY
  685. FMUL alpha1i, alpha_i, a1
  686. mr Y2, YY
  687. FMUL alpha2r, alpha_r, a3
  688. mr AO1, A
  689. FMUL alpha2i, alpha_i, a3
  690. add AO2, A, LDA
  691. FMSUBR alpha1r, alpha_i, a2, alpha1r
  692. add A, AO2, LDA
  693. FMADDR alpha1i, alpha_r, a2, alpha1i
  694. srawi. r0, M, 2
  695. FMSUBR alpha2r, alpha_i, a4, alpha2r
  696. mtspr CTR, r0
  697. FMADDR alpha2i, alpha_r, a4, alpha2i
  698. ble LL(25)
  699. .align 4
  700. LFDU a1, 1 * SIZE(AO1)
  701. LFDU y01, 1 * SIZE(Y1)
  702. LFDU a2, 1 * SIZE(AO1)
  703. LFDU y02, 1 * SIZE(Y1)
  704. LFDU a3, 1 * SIZE(AO1)
  705. LFDU y03, 1 * SIZE(Y1)
  706. LFDU a4, 1 * SIZE(AO1)
  707. LFDU y04, 1 * SIZE(Y1)
  708. LFDU a5, 1 * SIZE(AO1)
  709. LFDU y05, 1 * SIZE(Y1)
  710. LFDU a6, 1 * SIZE(AO1)
  711. LFDU y06, 1 * SIZE(Y1)
  712. LFDU a7, 1 * SIZE(AO1)
  713. LFDU y07, 1 * SIZE(Y1)
  714. LFDU a8, 1 * SIZE(AO1)
  715. LFDU y08, 1 * SIZE(Y1)
  716. FMADD y09, alpha1r, a1, y01
  717. FMADD y10, alpha1i, a1, y02
  718. FMADD y11, alpha1r, a3, y03
  719. FMADD y12, alpha1i, a3, y04
  720. FMADD y13, alpha1r, a5, y05
  721. FMADD y14, alpha1i, a5, y06
  722. FMADD y15, alpha1r, a7, y07
  723. FMADD y16, alpha1i, a7, y08
  724. bdz LL(23)
  725. .align 4
  726. LL(22):
  727. FMSUBX y09, alpha1i, a2, y09
  728. LFDU a1, 1 * SIZE(AO2)
  729. FMADDX y10, alpha1r, a2, y10
  730. LFDU a2, 1 * SIZE(AO2)
  731. FMSUBX y11, alpha1i, a4, y11
  732. LFDU a3, 1 * SIZE(AO2)
  733. FMADDX y12, alpha1r, a4, y12
  734. LFDU a4, 1 * SIZE(AO2)
  735. #ifdef PPCG4
  736. dcbt AO2, PREA
  737. #endif
  738. FMSUBX y13, alpha1i, a6, y13
  739. LFDU a5, 1 * SIZE(AO2)
  740. FMADDX y14, alpha1r, a6, y14
  741. LFDU a6, 1 * SIZE(AO2)
  742. FMSUBX y15, alpha1i, a8, y15
  743. LFDU a7, 1 * SIZE(AO2)
  744. FMADDX y16, alpha1r, a8, y16
  745. LFDU a8, 1 * SIZE(AO2)
  746. #if defined(PPCG4) && defined(DOUBLE)
  747. dcbt AO2, PREA
  748. #endif
  749. FMADD y09, alpha2r, a1, y09
  750. LFDU y01, 1 * SIZE(Y1)
  751. FMADD y10, alpha2i, a1, y10
  752. LFDU y02, 1 * SIZE(Y1)
  753. FMADD y11, alpha2r, a3, y11
  754. LFDU y03, 1 * SIZE(Y1)
  755. FMADD y12, alpha2i, a3, y12
  756. LFDU y04, 1 * SIZE(Y1)
  757. #ifdef PPCG4
  758. dcbtst Y1, PREA
  759. #endif
  760. FMADD y13, alpha2r, a5, y13
  761. LFDU y05, 1 * SIZE(Y1)
  762. FMADD y14, alpha2i, a5, y14
  763. LFDU y06, 1 * SIZE(Y1)
  764. FMADD y15, alpha2r, a7, y15
  765. LFDU y07, 1 * SIZE(Y1)
  766. FMADD y16, alpha2i, a7, y16
  767. LFDU y08, 1 * SIZE(Y1)
  768. #if defined(PPCG4) && defined(DOUBLE)
  769. dcbtst Y1, PREA
  770. #endif
  771. FMSUBX y09, alpha2i, a2, y09
  772. LFDU a1, 1 * SIZE(AO1)
  773. FMADDX y10, alpha2r, a2, y10
  774. LFDU a2, 1 * SIZE(AO1)
  775. FMSUBX y11, alpha2i, a4, y11
  776. LFDU a3, 1 * SIZE(AO1)
  777. FMADDX y12, alpha2r, a4, y12
  778. LFDU a4, 1 * SIZE(AO1)
  779. #ifdef PPCG4
  780. dcbt AO1, PREA
  781. #endif
  782. FMSUBX y13, alpha2i, a6, y13
  783. LFDU a5, 1 * SIZE(AO1)
  784. FMADDX y14, alpha2r, a6, y14
  785. LFDU a6, 1 * SIZE(AO1)
  786. FMSUBX y15, alpha2i, a8, y15
  787. LFDU a7, 1 * SIZE(AO1)
  788. FMADDX y16, alpha2r, a8, y16
  789. LFDU a8, 1 * SIZE(AO1)
  790. #if defined(PPCG4) && defined(DOUBLE)
  791. dcbt AO1, PREA
  792. #endif
  793. STFDU y09, 1 * SIZE(Y2)
  794. FMADD y09, alpha1r, a1, y01
  795. STFDU y10, 1 * SIZE(Y2)
  796. FMADD y10, alpha1i, a1, y02
  797. STFDU y11, 1 * SIZE(Y2)
  798. FMADD y11, alpha1r, a3, y03
  799. STFDU y12, 1 * SIZE(Y2)
  800. FMADD y12, alpha1i, a3, y04
  801. STFDU y13, 1 * SIZE(Y2)
  802. FMADD y13, alpha1r, a5, y05
  803. STFDU y14, 1 * SIZE(Y2)
  804. FMADD y14, alpha1i, a5, y06
  805. STFDU y15, 1 * SIZE(Y2)
  806. FMADD y15, alpha1r, a7, y07
  807. STFDU y16, 1 * SIZE(Y2)
  808. FMADD y16, alpha1i, a7, y08
  809. bdnz LL(22)
  810. .align 4
  811. LL(23):
  812. FMSUBX y09, alpha1i, a2, y09
  813. LFDU a1, 1 * SIZE(AO2)
  814. FMADDX y10, alpha1r, a2, y10
  815. LFDU a2, 1 * SIZE(AO2)
  816. FMSUBX y11, alpha1i, a4, y11
  817. LFDU a3, 1 * SIZE(AO2)
  818. FMADDX y12, alpha1r, a4, y12
  819. LFDU a4, 1 * SIZE(AO2)
  820. FMSUBX y13, alpha1i, a6, y13
  821. LFDU a5, 1 * SIZE(AO2)
  822. FMADDX y14, alpha1r, a6, y14
  823. LFDU a6, 1 * SIZE(AO2)
  824. FMSUBX y15, alpha1i, a8, y15
  825. LFDU a7, 1 * SIZE(AO2)
  826. FMADDX y16, alpha1r, a8, y16
  827. LFDU a8, 1 * SIZE(AO2)
  828. FMADD y09, alpha2r, a1, y09
  829. FMADD y10, alpha2i, a1, y10
  830. FMADD y11, alpha2r, a3, y11
  831. FMADD y12, alpha2i, a3, y12
  832. FMADD y13, alpha2r, a5, y13
  833. FMADD y14, alpha2i, a5, y14
  834. FMADD y15, alpha2r, a7, y15
  835. FMADD y16, alpha2i, a7, y16
  836. FMSUBX y09, alpha2i, a2, y09
  837. FMADDX y10, alpha2r, a2, y10
  838. FMSUBX y11, alpha2i, a4, y11
  839. FMADDX y12, alpha2r, a4, y12
  840. FMSUBX y13, alpha2i, a6, y13
  841. STFDU y09, 1 * SIZE(Y2)
  842. FMADDX y14, alpha2r, a6, y14
  843. STFDU y10, 1 * SIZE(Y2)
  844. FMSUBX y15, alpha2i, a8, y15
  845. STFDU y11, 1 * SIZE(Y2)
  846. FMADDX y16, alpha2r, a8, y16
  847. STFDU y12, 1 * SIZE(Y2)
  848. STFDU y13, 1 * SIZE(Y2)
  849. STFDU y14, 1 * SIZE(Y2)
  850. STFDU y15, 1 * SIZE(Y2)
  851. STFDU y16, 1 * SIZE(Y2)
  852. .align 4
  853. LL(25):
  854. andi. r0, M, 2
  855. ble LL(27)
  856. LFDU a1, 1 * SIZE(AO1)
  857. LFDU y01, 1 * SIZE(Y1)
  858. LFDU a2, 1 * SIZE(AO1)
  859. LFDU y02, 1 * SIZE(Y1)
  860. LFDU a3, 1 * SIZE(AO1)
  861. LFDU y03, 1 * SIZE(Y1)
  862. LFDU a4, 1 * SIZE(AO1)
  863. LFDU y04, 1 * SIZE(Y1)
  864. FMADD y01, alpha1r, a1, y01
  865. LFDU a5, 1 * SIZE(AO2)
  866. FMADD y02, alpha1i, a1, y02
  867. LFDU a6, 1 * SIZE(AO2)
  868. FMADD y03, alpha1r, a3, y03
  869. LFDU a7, 1 * SIZE(AO2)
  870. FMADD y04, alpha1i, a3, y04
  871. LFDU a8, 1 * SIZE(AO2)
  872. FMSUBX y01, alpha1i, a2, y01
  873. FMADDX y02, alpha1r, a2, y02
  874. FMSUBX y03, alpha1i, a4, y03
  875. FMADDX y04, alpha1r, a4, y04
  876. FMADD y01, alpha2r, a5, y01
  877. FMADD y02, alpha2i, a5, y02
  878. FMADD y03, alpha2r, a7, y03
  879. FMADD y04, alpha2i, a7, y04
  880. FMSUBX y01, alpha2i, a6, y01
  881. FMADDX y02, alpha2r, a6, y02
  882. FMSUBX y03, alpha2i, a8, y03
  883. FMADDX y04, alpha2r, a8, y04
  884. STFDU y01, 1 * SIZE(Y2)
  885. STFDU y02, 1 * SIZE(Y2)
  886. STFDU y03, 1 * SIZE(Y2)
  887. STFDU y04, 1 * SIZE(Y2)
  888. .align 4
  /* LL(27): two-column remainder step, taken when bit 0 of M is set.
     Updates one pair of y values (presumably one complex element - TODO
     confirm) with one pair from each of the columns AO1 and AO2.
     The load/store/FMA mnemonics are macros defined outside this chunk. */
  889. LL(27):
  /* andi. result is never negative, so ble means "bit clear -> skip". */
  890. andi. r0, M, 1
  891. ble LL(30)
  /* y01/y02 come from Y1; a1/a2 from column AO1; a3/a4 from column AO2. */
  892. LFDU y01, 1 * SIZE(Y1)
  893. LFDU y02, 1 * SIZE(Y1)
  894. LFDU a1, 1 * SIZE(AO1)
  895. LFDU a2, 1 * SIZE(AO1)
  896. LFDU a3, 1 * SIZE(AO2)
  897. LFDU a4, 1 * SIZE(AO2)
  /* Column-1 contribution scaled by (alpha1r, alpha1i). */
  898. FMADD y01, alpha1r, a1, y01
  899. FMADD y02, alpha1i, a1, y02
  900. FMSUBX y01, alpha1i, a2, y01
  901. FMADDX y02, alpha1r, a2, y02
  /* Column-2 contribution scaled by (alpha2r, alpha2i). */
  902. FMADD y01, alpha2r, a3, y01
  903. FMADD y02, alpha2i, a3, y02
  904. FMSUBX y01, alpha2i, a4, y01
  905. FMADDX y02, alpha2r, a4, y02
  /* Store the updated pair through Y2. */
  906. STFDU y01, 1 * SIZE(Y2)
  907. STFDU y02, 1 * SIZE(Y2)
  908. .align 4
  /* LL(30): handles one leftover column when N is odd. Builds the
     per-column scale (alpha1r, alpha1i) from alpha and the next two values
     read from X, points AO1 at the column, and primes the software
     pipeline of the 8-values-per-iteration loop at LL(32).
     FMUL/FMSUBR/FMADDR/FMADD/LFDU/LFDUX are macros defined outside this
     chunk; the exact signs of the R variants are not visible here. */
  909. LL(30):
  /* Bit 0 of N clear -> no leftover column, go to the finishing code. */
  910. andi. J, N, 1
  911. ble LL(990)
  912. .align 4
  /* ALPHA_R / ALPHA_I are operands defined outside this chunk. */
  913. lfd alpha_r, ALPHA_R
  914. lfd alpha_i, ALPHA_I
  /* Fetch the next two values from X: the first access steps by INCX,
     the second by one SIZE. */
  915. LFDUX a1, X, INCX
  916. LFDU a2, 1 * SIZE(X)
  /* alpha1r/alpha1i start as alpha_r*a1 and alpha_i*a1; the FMSUBR/FMADDR
     below fold in the alpha_i*a2 and alpha_r*a2 terms (presumably the
     complex product alpha * x - TODO confirm macro signs). */
  917. FMUL alpha1r, alpha_r, a1
  /* Read pointer Y1 and write pointer Y2 both start at YY. */
  918. mr Y1, YY
  919. mr Y2, YY
  920. FMUL alpha1i, alpha_i, a1
  /* AO1 = current column; A is advanced by LDA past it. */
  921. mr AO1, A
  922. add A, A, LDA
  923. FMSUBR alpha1r, alpha_i, a2, alpha1r
  /* CTR = M >> 2 (arithmetic shift). The ble below consumes CR0 from
     srawi., so the mtspr and FMADDR in between must leave CR0 untouched. */
  924. srawi. r0, M, 2
  925. mtspr CTR, r0
  926. FMADDR alpha1i, alpha_r, a2, alpha1i
  927. ble LL(35)
  928. .align 4
  /* Pipeline prologue: load the first eight column values and eight y
     values. */
  929. LFDU a1, 1 * SIZE(AO1)
  930. LFDU y01, 1 * SIZE(Y1)
  931. LFDU a2, 1 * SIZE(AO1)
  932. LFDU y02, 1 * SIZE(Y1)
  933. LFDU a3, 1 * SIZE(AO1)
  934. LFDU y03, 1 * SIZE(Y1)
  935. LFDU a4, 1 * SIZE(AO1)
  936. LFDU y04, 1 * SIZE(Y1)
  937. LFDU a5, 1 * SIZE(AO1)
  938. LFDU y05, 1 * SIZE(Y1)
  939. LFDU a6, 1 * SIZE(AO1)
  940. LFDU y06, 1 * SIZE(Y1)
  941. LFDU a7, 1 * SIZE(AO1)
  942. LFDU y07, 1 * SIZE(Y1)
  943. LFDU a8, 1 * SIZE(AO1)
  944. LFDU y08, 1 * SIZE(Y1)
  /* First half of the update (terms using a1/a3/a5/a7) goes into
     y09..y16; the a2/a4/a6/a8 terms are applied at LL(32) or LL(33). */
  945. FMADD y09, alpha1r, a1, y01
  946. FMADD y10, alpha1i, a1, y02
  947. FMADD y11, alpha1r, a3, y03
  948. FMADD y12, alpha1i, a3, y04
  949. FMADD y13, alpha1r, a5, y05
  950. FMADD y14, alpha1i, a5, y06
  951. FMADD y15, alpha1r, a7, y07
  952. FMADD y16, alpha1i, a7, y08
  /* Decrement CTR; if this was the only group, skip straight to the
     drain code. */
  953. bdz LL(33)
  954. .align 4
  /* LL(32): steady-state loop for the single leftover column, eight
     values per iteration. Each pass finishes the previous group (the
     a2/a4/a6/a8 terms), loads the next group, stores the finished results
     through Y2 and starts the next group's a1/a3/a5/a7 terms.
     Instruction order matters: every aN is consumed before the LFDU that
     overwrites it. */
  955. LL(32):
  /* Finish results 09..12 while reloading a1..a4 for the next group. */
  956. FMSUBX y09, alpha1i, a2, y09
  957. LFDU a1, 1 * SIZE(AO1)
  958. FMADDX y10, alpha1r, a2, y10
  959. LFDU a2, 1 * SIZE(AO1)
  960. FMSUBX y11, alpha1i, a4, y11
  961. LFDU a3, 1 * SIZE(AO1)
  962. FMADDX y12, alpha1r, a4, y12
  963. LFDU a4, 1 * SIZE(AO1)
  /* Cache-touch hint for the column stream, PPCG4 builds only. PREA is
     defined outside this chunk. */
  964. #ifdef PPCG4
  965. dcbt AO1, PREA
  966. #endif
  /* Next group's y inputs, first four values. */
  967. LFDU y01, 1 * SIZE(Y1)
  968. LFDU y02, 1 * SIZE(Y1)
  969. LFDU y03, 1 * SIZE(Y1)
  970. LFDU y04, 1 * SIZE(Y1)
  /* Touch-for-store hint on the y stream, PPCG4 builds only. */
  971. #ifdef PPCG4
  972. dcbtst Y1, PREA
  973. #endif
  /* Finish results 13..16 while reloading a5..a8 for the next group. */
  974. FMSUBX y13, alpha1i, a6, y13
  975. LFDU a5, 1 * SIZE(AO1)
  976. FMADDX y14, alpha1r, a6, y14
  977. LFDU a6, 1 * SIZE(AO1)
  978. FMSUBX y15, alpha1i, a8, y15
  979. LFDU a7, 1 * SIZE(AO1)
  980. FMADDX y16, alpha1r, a8, y16
  981. LFDU a8, 1 * SIZE(AO1)
  /* A second pair of hints is issued only when both PPCG4 and DOUBLE are
     defined. */
  982. #if defined(PPCG4) && defined(DOUBLE)
  983. dcbt AO1, PREA
  984. #endif
  /* Next group's y inputs, last four values. */
  985. LFDU y05, 1 * SIZE(Y1)
  986. LFDU y06, 1 * SIZE(Y1)
  987. LFDU y07, 1 * SIZE(Y1)
  988. LFDU y08, 1 * SIZE(Y1)
  989. #if defined(PPCG4) && defined(DOUBLE)
  990. dcbtst Y1, PREA
  991. #endif
  /* Store each finished result, then immediately reuse its register for
     the first-half term of the next group. */
  992. STFDU y09, 1 * SIZE(Y2)
  993. FMADD y09, alpha1r, a1, y01
  994. STFDU y10, 1 * SIZE(Y2)
  995. FMADD y10, alpha1i, a1, y02
  996. STFDU y11, 1 * SIZE(Y2)
  997. FMADD y11, alpha1r, a3, y03
  998. STFDU y12, 1 * SIZE(Y2)
  999. FMADD y12, alpha1i, a3, y04
  1000. STFDU y13, 1 * SIZE(Y2)
  1001. FMADD y13, alpha1r, a5, y05
  1002. STFDU y14, 1 * SIZE(Y2)
  1003. FMADD y14, alpha1i, a5, y06
  1004. STFDU y15, 1 * SIZE(Y2)
  1005. FMADD y15, alpha1r, a7, y07
  1006. STFDU y16, 1 * SIZE(Y2)
  1007. FMADD y16, alpha1i, a7, y08
  /* Decrement CTR and loop while it is non-zero. */
  1008. bdnz LL(32)
  1009. .align 4
  /* LL(33): pipeline drain for the single-column loop. y09..y16 already
     hold the a1/a3/a5/a7 terms of the last group; apply the remaining
     a2/a4/a6/a8 terms and store all eight results through Y2. No further
     loads are issued here. */
  1010. LL(33):
  1011. FMSUBX y09, alpha1i, a2, y09
  1012. FMADDX y10, alpha1r, a2, y10
  1013. FMSUBX y11, alpha1i, a4, y11
  1014. FMADDX y12, alpha1r, a4, y12
  1015. FMSUBX y13, alpha1i, a6, y13
  /* Stores of already-finished results are interleaved with the last
     FMAs. */
  1016. STFDU y09, 1 * SIZE(Y2)
  1017. FMADDX y14, alpha1r, a6, y14
  1018. STFDU y10, 1 * SIZE(Y2)
  1019. FMSUBX y15, alpha1i, a8, y15
  1020. STFDU y11, 1 * SIZE(Y2)
  1021. FMADDX y16, alpha1r, a8, y16
  1022. STFDU y12, 1 * SIZE(Y2)
  1023. STFDU y13, 1 * SIZE(Y2)
  1024. STFDU y14, 1 * SIZE(Y2)
  1025. STFDU y15, 1 * SIZE(Y2)
  1026. STFDU y16, 1 * SIZE(Y2)
  1027. .align 4
  /* LL(35): single-column remainder step, taken when bit 1 of M is set.
     Updates four y values (presumably two complex elements - TODO
     confirm) using four values from column AO1, in place in y01..y04. */
  1028. LL(35):
  /* andi. result is never negative, so ble means "bit clear -> skip". */
  1029. andi. r0, M, 2
  1030. ble LL(37)
  /* Interleaved loads of column values and y values. */
  1031. LFDU a1, 1 * SIZE(AO1)
  1032. LFDU y01, 1 * SIZE(Y1)
  1033. LFDU a2, 1 * SIZE(AO1)
  1034. LFDU y02, 1 * SIZE(Y1)
  1035. LFDU a3, 1 * SIZE(AO1)
  1036. LFDU y03, 1 * SIZE(Y1)
  1037. LFDU a4, 1 * SIZE(AO1)
  1038. LFDU y04, 1 * SIZE(Y1)
  /* a1/a3 terms first, then a2/a4 terms (the X macros are defined outside
     this chunk, so their sign is not visible here). */
  1039. FMADD y01, alpha1r, a1, y01
  1040. FMADD y02, alpha1i, a1, y02
  1041. FMADD y03, alpha1r, a3, y03
  1042. FMADD y04, alpha1i, a3, y04
  1043. FMSUBX y01, alpha1i, a2, y01
  1044. FMADDX y02, alpha1r, a2, y02
  1045. FMSUBX y03, alpha1i, a4, y03
  1046. FMADDX y04, alpha1r, a4, y04
  /* Store the four results through Y2. */
  1047. STFDU y01, 1 * SIZE(Y2)
  1048. STFDU y02, 1 * SIZE(Y2)
  1049. STFDU y03, 1 * SIZE(Y2)
  1050. STFDU y04, 1 * SIZE(Y2)
  1051. .align 4
  /* LL(37): single-column remainder step, taken when bit 0 of M is set.
     Updates one pair of y values with one pair from column AO1, then
     falls through to the finishing code at LL(990). */
  1052. LL(37):
  /* Bit clear -> nothing left for this column. */
  1053. andi. r0, M, 1
  1054. ble LL(990)
  1055. LFDU y01, 1 * SIZE(Y1)
  1056. LFDU a1, 1 * SIZE(AO1)
  1057. LFDU y02, 1 * SIZE(Y1)
  1058. LFDU a2, 1 * SIZE(AO1)
  /* a1 terms, then a2 terms (X-macro sign defined outside this chunk). */
  1059. FMADD y01, alpha1r, a1, y01
  1060. FMADD y02, alpha1i, a1, y02
  1061. FMSUBX y01, alpha1i, a2, y01
  1062. FMADDX y02, alpha1r, a2, y02
  1063. STFDU y01, 1 * SIZE(Y2)
  1064. STFDU y02, 1 * SIZE(Y2)
  1065. .align 4
  /* LL(990): decide whether the values in BUFFER must be added back into
     the caller's strided Y vector.
     NOTE(review): INCY == SIZE presumably means Y is contiguous and was
     updated in place (INCY looks pre-biased by -SIZE earlier in the file) -
     confirm against the setup code outside this chunk. */
  1066. LL(990):
  /* INCY equal to SIZE -> skip the add-back and go to the epilogue. */
  1067. cmpi cr0, 0, INCY, SIZE
  1068. beq LL(999)
  /* YY = BUFFER - SIZE so that the "1 * SIZE(YY)" loads below start at
     BUFFER (assuming LFDU is an update-form load - TODO confirm). */
  1069. addi YY, BUFFER, -SIZE
  /* Y1 is the store pointer into Y; Y itself is used as the load pointer. */
  1070. mr Y1, Y
  /* CTR = M >> 2; skip the 4-element loop when that is not positive. */
  1071. srawi. r0, M, 2
  1072. mtspr CTR, r0
  1073. ble LL(995)
  1074. .align 4
  /* LL(991): add-back loop, four element pairs (eight values) per
     iteration. Reads the current Y values with stride, reads the matching
     values from the buffer via YY, adds them and writes the sums back
     to Y through Y1 using the same stride pattern. */
  1075. LL(991):
  /* Current Y values: each pair is one INCY-strided access followed by
     one SIZE-offset access. */
  1076. LFDUX f0, Y, INCY
  1077. LFDU f1, 1 * SIZE(Y)
  1078. LFDUX f2, Y, INCY
  1079. LFDU f3, 1 * SIZE(Y)
  1080. LFDUX f4, Y, INCY
  1081. LFDU f5, 1 * SIZE(Y)
  1082. LFDUX f6, Y, INCY
  1083. LFDU f7, 1 * SIZE(Y)
  /* Eight consecutive values from the buffer. */
  1084. LFDU f8, 1 * SIZE(YY)
  1085. LFDU f9, 1 * SIZE(YY)
  1086. LFDU f10, 1 * SIZE(YY)
  1087. LFDU f11, 1 * SIZE(YY)
  1088. LFDU f12, 1 * SIZE(YY)
  1089. LFDU f13, 1 * SIZE(YY)
  1090. LFDU f14, 1 * SIZE(YY)
  1091. LFDU f15, 1 * SIZE(YY)
  /* Sum buffer value and Y value, element by element. */
  1092. FADD f8, f8, f0
  1093. FADD f9, f9, f1
  1094. FADD f10, f10, f2
  1095. FADD f11, f11, f3
  1096. FADD f12, f12, f4
  1097. FADD f13, f13, f5
  1098. FADD f14, f14, f6
  1099. FADD f15, f15, f7
  /* Write the sums back to Y with the same stride pattern as the loads. */
  1100. STFDUX f8, Y1, INCY
  1101. STFDU f9, 1 * SIZE(Y1)
  1102. STFDUX f10, Y1, INCY
  1103. STFDU f11, 1 * SIZE(Y1)
  1104. STFDUX f12, Y1, INCY
  1105. STFDU f13, 1 * SIZE(Y1)
  1106. STFDUX f14, Y1, INCY
  1107. STFDU f15, 1 * SIZE(Y1)
  /* Decrement CTR and loop while it is non-zero. */
  1108. bdnz LL(991)
  1109. .align 4
  /* LL(995): add-back remainder, taken when bit 1 of M is set: two
     element pairs (four values) are summed from Y and the buffer and
     written back to Y. J only receives the andi. result here. */
  1110. LL(995):
  /* andi. result is never negative, so ble means "bit clear -> skip". */
  1111. andi. J, M, 2
  1112. ble LL(996)
  /* Strided loads of the current Y values. */
  1113. LFDUX f0, Y, INCY
  1114. LFDU f1, 1 * SIZE(Y)
  1115. LFDUX f2, Y, INCY
  1116. LFDU f3, 1 * SIZE(Y)
  /* Matching consecutive values from the buffer. */
  1117. LFDU f8, 1 * SIZE(YY)
  1118. LFDU f9, 1 * SIZE(YY)
  1119. LFDU f10, 1 * SIZE(YY)
  1120. LFDU f11, 1 * SIZE(YY)
  1121. FADD f8, f8, f0
  1122. FADD f9, f9, f1
  1123. FADD f10, f10, f2
  1124. FADD f11, f11, f3
  /* Strided stores of the sums through Y1. */
  1125. STFDUX f8, Y1, INCY
  1126. STFDU f9, 1 * SIZE(Y1)
  1127. STFDUX f10, Y1, INCY
  1128. STFDU f11, 1 * SIZE(Y1)
  1129. .align 4
  /* LL(996): add-back remainder, taken when bit 0 of M is set: one final
     pair of values is summed from Y and the buffer and written back. */
  1130. LL(996):
  /* Bit clear -> nothing left, go to the epilogue. */
  1131. andi. J, M, 1
  1132. ble LL(999)
  1133. LFDUX f0, Y, INCY
  1134. LFDU f1, 1 * SIZE(Y)
  1135. LFDU f8, 1 * SIZE(YY)
  1136. LFDU f9, 1 * SIZE(YY)
  1137. FADD f8, f8, f0
  1138. FADD f9, f9, f1
  1139. STFDUX f8, Y1, INCY
  1140. STFDU f9, 1 * SIZE(Y1)
  1141. .align 4
  /* LL(999): common exit. Sets r3 to 0, reloads f14-f31 and r14-r22 from
     the stack frame, releases the frame and returns.
     NOTE(review): the matching saves are presumably in the prologue,
     which is outside this chunk - confirm the offsets agree. */
  1142. LL(999):
  /* r3 = 0 (r3 is the integer return-value register in the PowerPC
     ABIs). */
  1143. li r3, 0
  /* Reload f14..f31 from 8-byte slots at SP+0 .. SP+136. */
  1144. lfd f14, 0(SP)
  1145. lfd f15, 8(SP)
  1146. lfd f16, 16(SP)
  1147. lfd f17, 24(SP)
  1148. lfd f18, 32(SP)
  1149. lfd f19, 40(SP)
  1150. lfd f20, 48(SP)
  1151. lfd f21, 56(SP)
  1152. lfd f22, 64(SP)
  1153. lfd f23, 72(SP)
  1154. lfd f24, 80(SP)
  1155. lfd f25, 88(SP)
  1156. lfd f26, 96(SP)
  1157. lfd f27, 104(SP)
  1158. lfd f28, 112(SP)
  1159. lfd f29, 120(SP)
  1160. lfd f30, 128(SP)
  1161. lfd f31, 136(SP)
  /* Reload r14..r22 starting at SP+144: 8-byte slots (ld) when __64BIT__
     is defined, 4-byte slots (lwz) otherwise. */
  1162. #ifdef __64BIT__
  1163. ld r14, 144(SP)
  1164. ld r15, 152(SP)
  1165. ld r16, 160(SP)
  1166. ld r17, 168(SP)
  1167. ld r18, 176(SP)
  1168. ld r19, 184(SP)
  1169. ld r20, 192(SP)
  1170. ld r21, 200(SP)
  1171. ld r22, 208(SP)
  1172. #else
  1173. lwz r14, 144(SP)
  1174. lwz r15, 148(SP)
  1175. lwz r16, 152(SP)
  1176. lwz r17, 156(SP)
  1177. lwz r18, 160(SP)
  1178. lwz r19, 164(SP)
  1179. lwz r20, 168(SP)
  1180. lwz r21, 172(SP)
  1181. lwz r22, 176(SP)
  1182. #endif
  /* Release the STACKSIZE-byte frame and return to the caller. */
  1183. addi SP, SP, STACKSIZE
  1184. blr
  /* EPILOGUE is a macro defined outside this chunk; the #endif closes a
     conditional opened before the first visible line. */
  1185. EPILOGUE
  1186. #endif