You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

gemv_n_ppc440.S 24 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Argument-to-register map. Each OS / word-size / precision combination */
  /* below binds M, N, A, LDA, X, INCX, Y and INCY to general-purpose registers. */
  /* The prologue later fills some of these from the caller's frame. */
  40. #if defined(linux) || defined(__FreeBSD__)
  41. #ifndef __64BIT__ /* 32-bit Linux/FreeBSD */
  42. #define M r3
  43. #define N r4
  44. #define A r6
  45. #define LDA r7
  46. #define X r8
  47. #define INCX r9
  48. #define Y r10
  49. #define INCY r5
  50. #else /* 64-bit Linux/FreeBSD */
  51. #define M r3
  52. #define N r4
  53. #define A r7
  54. #define LDA r8
  55. #define X r9
  56. #define INCX r10
  57. #define Y r5
  58. #define INCY r6
  59. #endif
  60. #endif
  61. #if defined(_AIX) || defined(__APPLE__)
  62. #if !defined(__64BIT__) && defined(DOUBLE) /* 32-bit build with DOUBLE defined */
  63. #define M r3
  64. #define N r4
  65. #define A r8
  66. #define LDA r9
  67. #define X r10
  68. #define INCX r5
  69. #define Y r6
  70. #define INCY r7
  71. #else /* every other AIX/Apple configuration */
  72. #define M r3
  73. #define N r4
  74. #define A r7
  75. #define LDA r8
  76. #define X r9
  77. #define INCX r10
  78. #define Y r5
  79. #define INCY r6
  80. #endif
  81. #endif
  /* Working-register aliases used by the kernel body. */
  82. #define I r11 /* not referenced anywhere in this listing */
  83. #define J r12 /* column-pass counter; also receives andi. results */
  84. #define AO1 r14 /* AO1..AO4: cursors into up to four consecutive columns of A */
  85. #define AO2 r15
  86. #define AO3 r16
  87. #define AO4 r17
  88. #define LDA8 r18 /* not referenced anywhere in this listing */
  89. #define Y1 r19 /* cursor the compute loops load y through */
  90. #define Y2 r20 /* cursor the compute loops store y through */
  91. #define PREA r21 /* byte offset used by the dcbt/dcbtst prefetch hints */
  92. #define YY r22 /* base of the working y vector (Y or BUFFER), biased by -SIZE */
  93. #define BUFFER r23 /* scratch buffer argument, used when INCY != SIZE */
  94. #define y01 f0 /* y01..y08: y values loaded through Y1 */
  95. #define y02 f1
  96. #define y03 f2
  97. #define y04 f3
  98. #define y05 f4
  99. #define y06 f5
  100. #define y07 f6
  101. #define y08 f7
  102. #define y09 f8 /* y09..y16: results produced by the steady-state loops */
  103. #define y10 f9
  104. #define y11 f10
  105. #define y12 f11
  106. #define y13 f12
  107. #define y14 f13
  108. #define y15 f14
  109. #define y16 f15
  110. #define alpha1 f16 /* alpha1..alpha4: alpha times the x entries of the current pass */
  111. #define alpha2 f17
  112. #define alpha3 f18
  113. #define alpha4 f19
  114. #define a1 f20 /* a1..a8: elements of A loaded through AO1..AO4 */
  115. #define a2 f21
  116. #define a3 f22
  117. #define a4 f23
  118. #define a5 f24
  119. #define a6 f25
  120. #define a7 f26
  121. #define a8 f27
  122. #define alpha f27 /* shares f27 with a8, so the code reloads alpha from the ALPHA stack slot after a8 is used */
  123. #if defined(PPCG4) || defined(POWER6) /* prefetch distance, in elements, for the two targets named here */
  124. #define PREFETCHSIZE_A (3 * 4)
  125. #endif
  126. #ifndef PREFETCHSIZE_A /* other targets: the setup code still assembles "li PREA, PREFETCHSIZE_A * SIZE" */
  127. #define PREFETCHSIZE_A (3 * 4) /* fallback, same distance; PREA is only consumed inside the PPCG4-guarded dcbt/dcbtst hints */
  128. #endif
  /* Stack frame layout: frame size plus two 8-byte scratch slots addressed off SP. */
  129. #ifndef NEEDPARAM /* the kernel body up to the final #endif is omitted when NEEDPARAM is defined */
  130. #ifndef __64BIT__
  131. #define STACKSIZE 224 /* bytes of frame reserved by the prologue */
  132. #define ALPHA 200(SP) /* slot that keeps a copy of alpha */
  133. #define FZERO 208(SP) /* slot zeroed by the prologue and read back with lfd as a floating-point zero */
  134. #else
  135. #define STACKSIZE 280 /* 64-bit frame: same two slots at higher offsets */
  136. #define ALPHA 256(SP)
  137. #define FZERO 264(SP)
  138. #endif
  /* Function entry: reserve the frame, save f14-f27 and r14-r23 (all restored */
  /* at LL(999)), and write zero into the FZERO slot. */
  /* PROLOGUE and PROFCODE are macros defined outside this file (presumably in common.h). */
  139. PROLOGUE
  140. PROFCODE
  141. addi SP, SP, -STACKSIZE /* reserve STACKSIZE bytes below the incoming SP */
  142. li r0, 0 /* r0 = 0, stored into FZERO below */
  143. stfd f14, 0(SP) /* save f14..f27 at offsets 0..104 */
  144. stfd f15, 8(SP)
  145. stfd f16, 16(SP)
  146. stfd f17, 24(SP)
  147. stfd f18, 32(SP)
  148. stfd f19, 40(SP)
  149. stfd f20, 48(SP)
  150. stfd f21, 56(SP)
  151. stfd f22, 64(SP)
  152. stfd f23, 72(SP)
  153. stfd f24, 80(SP)
  154. stfd f25, 88(SP)
  155. stfd f26, 96(SP)
  156. stfd f27, 104(SP)
  157. #ifdef __64BIT__
  158. std r0, FZERO /* eight zero bytes in one doubleword store */
  159. std r14, 144(SP) /* save r14..r23 as doublewords from offset 144 */
  160. std r15, 152(SP)
  161. std r16, 160(SP)
  162. std r17, 168(SP)
  163. std r18, 176(SP)
  164. std r19, 184(SP)
  165. std r20, 192(SP)
  166. std r21, 200(SP)
  167. std r22, 208(SP)
  168. std r23, 216(SP)
  169. #else
  170. stw r0, 0 + FZERO /* eight zero bytes as two word stores */
  171. stw r0, 4 + FZERO
  172. stw r14, 144(SP) /* save r14..r23 as words from offset 144 */
  173. stw r15, 148(SP)
  174. stw r16, 152(SP)
  175. stw r17, 156(SP)
  176. stw r18, 160(SP)
  177. stw r19, 164(SP)
  178. stw r20, 168(SP)
  179. stw r21, 172(SP)
  180. stw r22, 176(SP)
  181. stw r23, 180(SP)
  182. #endif
  /* Load the arguments that are read from the caller's frame. Offsets add */
  /* STACKSIZE because SP has already been lowered by the prologue. */
  /* FRAMESLOT is a macro defined outside this file; its slot layout is not visible here. */
  183. #if defined(linux) || defined(__FreeBSD__)
  184. #ifndef __64BIT__ /* 32-bit: INCY and BUFFER come from the frame */
  185. lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
  186. lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
  187. #else /* 64-bit: Y, INCY and BUFFER come from the frame */
  188. ld Y, FRAMESLOT(0) + STACKSIZE(SP)
  189. ld INCY, FRAMESLOT(1) + STACKSIZE(SP)
  190. ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
  191. #endif
  192. #endif
  193. #if defined(_AIX) || defined(__APPLE__)
  194. #ifndef __64BIT__
  195. #ifdef DOUBLE /* 32-bit with DOUBLE: INCX, Y, INCY and BUFFER come from the frame */
  196. lwz INCX, FRAMESLOT(0) + STACKSIZE(SP)
  197. lwz Y, FRAMESLOT(1) + STACKSIZE(SP)
  198. lwz INCY, FRAMESLOT(2) + STACKSIZE(SP)
  199. lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP)
  200. #else /* 32-bit without DOUBLE: Y, INCY and BUFFER come from the frame */
  201. lwz Y, FRAMESLOT(0) + STACKSIZE(SP)
  202. lwz INCY, FRAMESLOT(1) + STACKSIZE(SP)
  203. lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
  204. #endif
  205. #else /* 64-bit: Y, INCY and BUFFER come from the frame */
  206. ld Y, FRAMESLOT(0) + STACKSIZE(SP)
  207. ld INCY, FRAMESLOT(1) + STACKSIZE(SP)
  208. ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
  209. #endif
  210. #endif
  /* Setup: keep alpha, scale the strides, return early for empty problems, bias the */
  /* pointers for update-form addressing, and pick the working y vector (Y itself */
  /* when INCY == SIZE, otherwise a zero-filled BUFFER). */
  /* SIZE, BASE_SHIFT and STFDU are defined outside this file; BASE_SHIFT is presumably log2(SIZE) - confirm. */
  211. stfd f1, ALPHA /* keep a copy of alpha (arrives in f1) in its stack slot */
  212. fmr alpha, f1
  213. slwi LDA, LDA, BASE_SHIFT /* shift LDA, INCX and INCY left by BASE_SHIFT (element counts -> byte offsets, assuming the above) */
  214. slwi INCX, INCX, BASE_SHIFT
  215. slwi INCY, INCY, BASE_SHIFT
  216. li PREA, PREFETCHSIZE_A * SIZE /* prefetch offset in bytes */
  217. cmpwi cr0, M, 0
  218. ble- LL(999) /* M <= 0: nothing to do */
  219. cmpwi cr0, N, 0
  220. ble- LL(999) /* N <= 0: nothing to do */
  221. addi A, A, -SIZE /* back up one element: later loads address 1 * SIZE past the cursor */
  222. sub X, X, INCX /* back up one stride: X and Y are walked with the indexed X-forms (LFDUX / STFDUX) */
  223. sub Y, Y, INCY
  224. mr YY, Y /* default: accumulate directly into Y */
  225. lfd f0, FZERO /* f0 = floating-point zero */
  226. cmpi cr0, 0, INCY, SIZE
  227. beq LL(10) /* INCY == SIZE: skip the buffer */
  228. addi YY, BUFFER, -SIZE /* otherwise work in BUFFER (same one-element bias) */
  229. addi Y1, BUFFER, -SIZE
  230. addi r0, M, 7
  231. srawi. r0, r0, 3 /* CTR = (M + 7) >> 3: groups of eight, rounded up */
  232. mtspr CTR, r0
  233. .align 4
  234. LL(02): /* zero-fill BUFFER, eight elements per iteration */
  235. STFDU f0, 1 * SIZE(Y1)
  236. STFDU f0, 1 * SIZE(Y1)
  237. STFDU f0, 1 * SIZE(Y1)
  238. STFDU f0, 1 * SIZE(Y1)
  239. STFDU f0, 1 * SIZE(Y1)
  240. STFDU f0, 1 * SIZE(Y1)
  241. STFDU f0, 1 * SIZE(Y1)
  242. STFDU f0, 1 * SIZE(Y1)
  243. bdnz LL(02)
  244. .align 4
  /* Four-column pass, setup: bind AO1..AO4 to four consecutive columns of A, form */
  /* alpha1..alpha4 = alpha * (next four x entries), then preload eight y values */
  /* and eight elements of column 1 for the pipelined loop. */
  /* LFDU, LFDUX and FMUL are macros defined outside this file (presumably the */
  /* precision-specific load-with-update and multiply) - confirm against common.h. */
  245. LL(10):
  246. srawi. J, N, 2 /* J = N >> 2: number of four-column passes */
  247. ble LL(30) /* none: go handle the N & 2 and N & 1 leftovers */
  248. .align 4
  249. LL(21):
  250. mr AO1, A /* AO1..AO4 = four columns, LDA bytes apart */
  251. add AO2, A, LDA
  252. LFDUX alpha1, X, INCX /* next four x entries, stepping X by INCX each time */
  253. LFDUX alpha2, X, INCX
  254. LFDUX alpha3, X, INCX
  255. LFDUX alpha4, X, INCX
  256. FMUL alpha1, alpha, alpha1 /* pre-scale each x entry by alpha */
  257. add AO3, AO2, LDA
  258. FMUL alpha2, alpha, alpha2
  259. add AO4, AO3, LDA
  260. FMUL alpha3, alpha, alpha3
  261. add A, AO4, LDA /* A now points at the first column of the next pass */
  262. FMUL alpha4, alpha, alpha4
  263. mr Y1, YY /* restart both y cursors at the start of the working vector */
  264. mr Y2, YY
  265. srawi. r0, M, 3 /* CTR = M >> 3: groups of eight rows */
  266. mtspr CTR, r0
  267. ble LL(25) /* fewer than eight rows: only the tail code runs */
  268. LFDU y01, 1 * SIZE(Y1) /* preload y01..y08 and a1..a8 (column 1) */
  269. LFDU a1, 1 * SIZE(AO1)
  270. LFDU y02, 1 * SIZE(Y1)
  271. LFDU a2, 1 * SIZE(AO1)
  272. LFDU y03, 1 * SIZE(Y1)
  273. LFDU a3, 1 * SIZE(AO1)
  274. LFDU y04, 1 * SIZE(Y1)
  275. LFDU a4, 1 * SIZE(AO1)
  276. LFDU y05, 1 * SIZE(Y1)
  277. LFDU a5, 1 * SIZE(AO1)
  278. LFDU y06, 1 * SIZE(Y1)
  279. LFDU a6, 1 * SIZE(AO1)
  280. LFDU y07, 1 * SIZE(Y1)
  281. LFDU a7, 1 * SIZE(AO1)
  282. LFDU y08, 1 * SIZE(Y1)
  283. LFDU a8, 1 * SIZE(AO1)
  284. bdz LL(23) /* exactly one group of eight: go straight to the drain code */
  285. .align 4
  /* Four-column pass, steady-state loop. Each iteration finishes one group of eight */
  /* rows (results in y09..y16, stored through Y2) while reloading a1..a8 and y01..y08 */
  /* for the following group. Every aN register is reloaded from the next column right */
  /* after the FMADD that consumed it. */
  /* FMADD is a macro defined outside this file; presumably d = a * c + b as in PowerPC fmadd - confirm. */
  286. LL(22):
  287. #ifdef PPCG4
  288. dcbtst Y1, PREA
  289. #endif
  290. FMADD y09, alpha1, a1, y01 /* stage 1: start y09..y16 from y01..y08 plus alpha1 * column 1 */
  291. LFDU a1, 1 * SIZE(AO2) /* refill from column 2 */
  292. FMADD y10, alpha1, a2, y02
  293. LFDU a2, 1 * SIZE(AO2)
  294. FMADD y11, alpha1, a3, y03
  295. LFDU a3, 1 * SIZE(AO2)
  296. FMADD y12, alpha1, a4, y04
  297. LFDU a4, 1 * SIZE(AO2)
  298. LFDU y01, 1 * SIZE(Y1) /* y01 is free again: fetch it for the next group */
  299. #ifdef PPCG4
  300. dcbt AO2, PREA
  301. #endif
  302. FMADD y13, alpha1, a5, y05
  303. LFDU a5, 1 * SIZE(AO2)
  304. FMADD y14, alpha1, a6, y06
  305. LFDU a6, 1 * SIZE(AO2)
  306. FMADD y15, alpha1, a7, y07
  307. LFDU a7, 1 * SIZE(AO2)
  308. FMADD y16, alpha1, a8, y08
  309. LFDU a8, 1 * SIZE(AO2)
  310. LFDU y02, 1 * SIZE(Y1)
  311. #if defined(PPCG4) && defined(DOUBLE)
  312. dcbt AO2, PREA
  313. #endif
  314. FMADD y09, alpha2, a1, y09 /* stage 2: add alpha2 * column 2 */
  315. LFDU a1, 1 * SIZE(AO3) /* refill from column 3 */
  316. FMADD y10, alpha2, a2, y10
  317. LFDU a2, 1 * SIZE(AO3)
  318. FMADD y11, alpha2, a3, y11
  319. LFDU a3, 1 * SIZE(AO3)
  320. FMADD y12, alpha2, a4, y12
  321. LFDU a4, 1 * SIZE(AO3)
  322. LFDU y03, 1 * SIZE(Y1)
  323. #ifdef PPCG4
  324. dcbt AO3, PREA
  325. #endif
  326. FMADD y13, alpha2, a5, y13
  327. LFDU a5, 1 * SIZE(AO3)
  328. FMADD y14, alpha2, a6, y14
  329. LFDU a6, 1 * SIZE(AO3)
  330. FMADD y15, alpha2, a7, y15
  331. LFDU a7, 1 * SIZE(AO3)
  332. FMADD y16, alpha2, a8, y16
  333. LFDU a8, 1 * SIZE(AO3)
  334. LFDU y04, 1 * SIZE(Y1)
  335. #if defined(PPCG4) && defined(DOUBLE)
  336. dcbt AO3, PREA
  337. #endif
  338. FMADD y09, alpha3, a1, y09 /* stage 3: add alpha3 * column 3 */
  339. LFDU a1, 1 * SIZE(AO4) /* refill from column 4 */
  340. FMADD y10, alpha3, a2, y10
  341. LFDU a2, 1 * SIZE(AO4)
  342. FMADD y11, alpha3, a3, y11
  343. LFDU a3, 1 * SIZE(AO4)
  344. FMADD y12, alpha3, a4, y12
  345. LFDU a4, 1 * SIZE(AO4)
  346. #if defined(PPCG4) && defined(DOUBLE)
  347. dcbtst Y1, PREA
  348. #endif
  349. LFDU y05, 1 * SIZE(Y1)
  350. #ifdef PPCG4
  351. dcbt AO4, PREA
  352. #endif
  353. FMADD y13, alpha3, a5, y13
  354. LFDU a5, 1 * SIZE(AO4)
  355. FMADD y14, alpha3, a6, y14
  356. LFDU a6, 1 * SIZE(AO4)
  357. FMADD y15, alpha3, a7, y15
  358. LFDU a7, 1 * SIZE(AO4)
  359. FMADD y16, alpha3, a8, y16
  360. LFDU a8, 1 * SIZE(AO4)
  361. LFDU y06, 1 * SIZE(Y1)
  362. #if defined(PPCG4) && defined(DOUBLE)
  363. dcbt AO4, PREA
  364. #endif
  365. FMADD y09, alpha4, a1, y09 /* stage 4: add alpha4 * column 4 */
  366. LFDU a1, 1 * SIZE(AO1) /* refill from column 1 for the next group */
  367. FMADD y10, alpha4, a2, y10
  368. LFDU a2, 1 * SIZE(AO1)
  369. FMADD y11, alpha4, a3, y11
  370. LFDU a3, 1 * SIZE(AO1)
  371. FMADD y12, alpha4, a4, y12
  372. LFDU a4, 1 * SIZE(AO1)
  373. LFDU y07, 1 * SIZE(Y1)
  374. #ifdef PPCG4
  375. dcbt AO1, PREA
  376. #endif
  377. STFDU y09, 1 * SIZE(Y2) /* first four results are complete: store them */
  378. STFDU y10, 1 * SIZE(Y2)
  379. STFDU y11, 1 * SIZE(Y2)
  380. STFDU y12, 1 * SIZE(Y2)
  381. FMADD y13, alpha4, a5, y13
  382. LFDU a5, 1 * SIZE(AO1)
  383. FMADD y14, alpha4, a6, y14
  384. LFDU a6, 1 * SIZE(AO1)
  385. FMADD y15, alpha4, a7, y15
  386. LFDU a7, 1 * SIZE(AO1)
  387. FMADD y16, alpha4, a8, y16
  388. LFDU a8, 1 * SIZE(AO1)
  389. LFDU y08, 1 * SIZE(Y1)
  390. #if defined(PPCG4) && defined(DOUBLE)
  391. dcbt AO1, PREA
  392. #endif
  393. STFDU y13, 1 * SIZE(Y2) /* remaining four results */
  394. STFDU y14, 1 * SIZE(Y2)
  395. STFDU y15, 1 * SIZE(Y2)
  396. STFDU y16, 1 * SIZE(Y2)
  397. bdnz LL(22)
  398. .align 4
  /* Four-column pass, drain: finish the last preloaded group of eight rows. */
  /* Results accumulate in place in y01..y08, and nothing is preloaded for a */
  /* following group (column 1 and y are not read again here). */
  399. LL(23):
  400. FMADD y01, alpha1, a1, y01 /* add alpha1 * column 1; refill a1..a8 from column 2 */
  401. LFDU a1, 1 * SIZE(AO2)
  402. FMADD y02, alpha1, a2, y02
  403. LFDU a2, 1 * SIZE(AO2)
  404. FMADD y03, alpha1, a3, y03
  405. LFDU a3, 1 * SIZE(AO2)
  406. FMADD y04, alpha1, a4, y04
  407. LFDU a4, 1 * SIZE(AO2)
  408. FMADD y05, alpha1, a5, y05
  409. LFDU a5, 1 * SIZE(AO2)
  410. FMADD y06, alpha1, a6, y06
  411. LFDU a6, 1 * SIZE(AO2)
  412. FMADD y07, alpha1, a7, y07
  413. LFDU a7, 1 * SIZE(AO2)
  414. FMADD y08, alpha1, a8, y08
  415. LFDU a8, 1 * SIZE(AO2)
  416. FMADD y01, alpha2, a1, y01 /* add alpha2 * column 2; refill from column 3 */
  417. LFDU a1, 1 * SIZE(AO3)
  418. FMADD y02, alpha2, a2, y02
  419. LFDU a2, 1 * SIZE(AO3)
  420. FMADD y03, alpha2, a3, y03
  421. LFDU a3, 1 * SIZE(AO3)
  422. FMADD y04, alpha2, a4, y04
  423. LFDU a4, 1 * SIZE(AO3)
  424. FMADD y05, alpha2, a5, y05
  425. LFDU a5, 1 * SIZE(AO3)
  426. FMADD y06, alpha2, a6, y06
  427. LFDU a6, 1 * SIZE(AO3)
  428. FMADD y07, alpha2, a7, y07
  429. LFDU a7, 1 * SIZE(AO3)
  430. FMADD y08, alpha2, a8, y08
  431. LFDU a8, 1 * SIZE(AO3)
  432. FMADD y01, alpha3, a1, y01 /* add alpha3 * column 3; refill from column 4 */
  433. LFDU a1, 1 * SIZE(AO4)
  434. FMADD y02, alpha3, a2, y02
  435. LFDU a2, 1 * SIZE(AO4)
  436. FMADD y03, alpha3, a3, y03
  437. LFDU a3, 1 * SIZE(AO4)
  438. FMADD y04, alpha3, a4, y04
  439. LFDU a4, 1 * SIZE(AO4)
  440. FMADD y05, alpha3, a5, y05
  441. LFDU a5, 1 * SIZE(AO4)
  442. FMADD y06, alpha3, a6, y06
  443. LFDU a6, 1 * SIZE(AO4)
  444. FMADD y07, alpha3, a7, y07
  445. LFDU a7, 1 * SIZE(AO4)
  446. FMADD y08, alpha3, a8, y08
  447. LFDU a8, 1 * SIZE(AO4)
  448. FMADD y01, alpha4, a1, y01 /* add alpha4 * column 4, no further loads */
  449. FMADD y02, alpha4, a2, y02
  450. FMADD y03, alpha4, a3, y03
  451. FMADD y04, alpha4, a4, y04
  452. FMADD y05, alpha4, a5, y05
  453. STFDU y01, 1 * SIZE(Y2) /* stores interleaved with the last FMADDs */
  454. FMADD y06, alpha4, a6, y06
  455. STFDU y02, 1 * SIZE(Y2)
  456. FMADD y07, alpha4, a7, y07
  457. STFDU y03, 1 * SIZE(Y2)
  458. FMADD y08, alpha4, a8, y08
  459. STFDU y04, 1 * SIZE(Y2)
  460. STFDU y05, 1 * SIZE(Y2)
  461. STFDU y06, 1 * SIZE(Y2)
  462. STFDU y07, 1 * SIZE(Y2)
  463. STFDU y08, 1 * SIZE(Y2)
  464. .align 4
  /* Four-column pass, row remainder: handle the M & 7 leftover rows as blocks of */
  /* 4, 2 and 1, then close the pass loop at LL(29). */
  /* The ble after each andi. only fires when the masked value is zero (the mask result is never negative). */
  465. LL(25):
  466. andi. r0, M, 7
  467. ble LL(29) /* no leftover rows */
  468. andi. r0, M, 4
  469. ble LL(27)
  470. LFDU a1, 1 * SIZE(AO1) /* four rows: load column 1 and y */
  471. LFDU y01, 1 * SIZE(Y1)
  472. LFDU a2, 1 * SIZE(AO1)
  473. LFDU y02, 1 * SIZE(Y1)
  474. LFDU a3, 1 * SIZE(AO1)
  475. LFDU y03, 1 * SIZE(Y1)
  476. LFDU a4, 1 * SIZE(AO1)
  477. LFDU y04, 1 * SIZE(Y1)
  478. FMADD y01, alpha1, a1, y01 /* column 1 in a1..a4, column 2 loaded into a5..a8 */
  479. LFDU a5, 1 * SIZE(AO2)
  480. FMADD y02, alpha1, a2, y02
  481. LFDU a6, 1 * SIZE(AO2)
  482. FMADD y03, alpha1, a3, y03
  483. LFDU a7, 1 * SIZE(AO2)
  484. FMADD y04, alpha1, a4, y04
  485. LFDU a8, 1 * SIZE(AO2)
  486. FMADD y01, alpha2, a5, y01 /* column 2 in a5..a8, column 3 loaded into a1..a4 */
  487. LFDU a1, 1 * SIZE(AO3)
  488. FMADD y02, alpha2, a6, y02
  489. LFDU a2, 1 * SIZE(AO3)
  490. FMADD y03, alpha2, a7, y03
  491. LFDU a3, 1 * SIZE(AO3)
  492. FMADD y04, alpha2, a8, y04
  493. LFDU a4, 1 * SIZE(AO3)
  494. FMADD y01, alpha3, a1, y01 /* column 3 in a1..a4, column 4 loaded into a5..a8 */
  495. LFDU a5, 1 * SIZE(AO4)
  496. FMADD y02, alpha3, a2, y02
  497. LFDU a6, 1 * SIZE(AO4)
  498. FMADD y03, alpha3, a3, y03
  499. LFDU a7, 1 * SIZE(AO4)
  500. FMADD y04, alpha3, a4, y04
  501. LFDU a8, 1 * SIZE(AO4)
  502. FMADD y01, alpha4, a5, y01
  503. FMADD y02, alpha4, a6, y02
  504. FMADD y03, alpha4, a7, y03
  505. FMADD y04, alpha4, a8, y04
  506. STFDU y01, 1 * SIZE(Y2)
  507. STFDU y02, 1 * SIZE(Y2)
  508. STFDU y03, 1 * SIZE(Y2)
  509. STFDU y04, 1 * SIZE(Y2)
  510. .align 4
  511. LL(27):
  512. andi. r0, M, 2
  513. ble LL(28)
  514. LFDU a1, 1 * SIZE(AO1) /* two rows: a1,a2 / a3,a4 / a5,a6 / a7,a8 hold columns 1..4 */
  515. LFDU y01, 1 * SIZE(Y1)
  516. LFDU a2, 1 * SIZE(AO1)
  517. LFDU y02, 1 * SIZE(Y1)
  518. LFDU a3, 1 * SIZE(AO2)
  519. LFDU a4, 1 * SIZE(AO2)
  520. FMADD y01, alpha1, a1, y01
  521. LFDU a5, 1 * SIZE(AO3)
  522. FMADD y02, alpha1, a2, y02
  523. LFDU a6, 1 * SIZE(AO3)
  524. FMADD y01, alpha2, a3, y01
  525. LFDU a7, 1 * SIZE(AO4)
  526. FMADD y02, alpha2, a4, y02
  527. LFDU a8, 1 * SIZE(AO4)
  528. FMADD y01, alpha3, a5, y01
  529. FMADD y02, alpha3, a6, y02
  530. FMADD y01, alpha4, a7, y01
  531. FMADD y02, alpha4, a8, y02
  532. STFDU y01, 1 * SIZE(Y2)
  533. STFDU y02, 1 * SIZE(Y2)
  534. .align 4
  535. LL(28):
  536. andi. r0, M, 1
  537. ble LL(29)
  538. LFDU a1, 1 * SIZE(AO1) /* one row: a1..a4 hold one element from each column */
  539. LFDU y01, 1 * SIZE(Y1)
  540. LFDU a2, 1 * SIZE(AO2)
  541. LFDU a3, 1 * SIZE(AO3)
  542. LFDU a4, 1 * SIZE(AO4)
  543. FMADD y01, alpha1, a1, y01
  544. FMADD y01, alpha2, a2, y01
  545. FMADD y01, alpha3, a3, y01
  546. FMADD y01, alpha4, a4, y01
  547. STFDU y01, 1 * SIZE(Y2)
  548. .align 4
  549. LL(29):
  550. addi J, J, -1 /* one four-column pass done */
  551. lfd alpha, ALPHA /* reload alpha: f27 may have been overwritten as a8 */
  552. cmpi cr0, 0, J, 0
  553. bgt LL(21) /* more passes remain */
  554. .align 4
  /* Two-column pass (taken when N & 2 is set): same pipelined scheme as the */
  /* four-column pass, alternating between AO1 and AO2 only. */
  /* LL(32) is the steady-state loop and LL(33) finishes the last group of eight rows. */
  555. LL(30):
  556. andi. J, N, 2
  557. ble LL(40) /* bit clear: no two-column pass */
  558. LFDUX alpha1, X, INCX /* next two x entries */
  559. LFDUX alpha2, X, INCX
  560. mr AO1, A
  561. add AO2, A, LDA
  562. add A, AO2, LDA /* A advances by two columns */
  563. FMUL alpha1, alpha, alpha1 /* pre-scale by alpha */
  564. mr Y1, YY
  565. FMUL alpha2, alpha, alpha2
  566. mr Y2, YY
  567. srawi. r0, M, 3 /* CTR = M >> 3: groups of eight rows */
  568. mtspr CTR, r0
  569. ble LL(35) /* fewer than eight rows */
  570. LFDU y01, 1 * SIZE(Y1) /* preload y01..y08 and a1..a8 (column 1) */
  571. LFDU a1, 1 * SIZE(AO1)
  572. LFDU y02, 1 * SIZE(Y1)
  573. LFDU a2, 1 * SIZE(AO1)
  574. LFDU y03, 1 * SIZE(Y1)
  575. LFDU a3, 1 * SIZE(AO1)
  576. LFDU y04, 1 * SIZE(Y1)
  577. LFDU a4, 1 * SIZE(AO1)
  578. LFDU y05, 1 * SIZE(Y1)
  579. LFDU a5, 1 * SIZE(AO1)
  580. LFDU y06, 1 * SIZE(Y1)
  581. LFDU a6, 1 * SIZE(AO1)
  582. LFDU y07, 1 * SIZE(Y1)
  583. LFDU a7, 1 * SIZE(AO1)
  584. LFDU y08, 1 * SIZE(Y1)
  585. LFDU a8, 1 * SIZE(AO1)
  586. bdz LL(33) /* exactly one group of eight */
  587. .align 4
  588. LL(32):
  589. #ifdef PPCG4
  590. dcbtst Y1, PREA
  591. #endif
  592. FMADD y09, alpha1, a1, y01 /* stage 1: y09..y16 from y01..y08 plus alpha1 * column 1 */
  593. LFDU a1, 1 * SIZE(AO2) /* refill from column 2 */
  594. FMADD y10, alpha1, a2, y02
  595. LFDU a2, 1 * SIZE(AO2)
  596. FMADD y11, alpha1, a3, y03
  597. LFDU a3, 1 * SIZE(AO2)
  598. FMADD y12, alpha1, a4, y04
  599. LFDU a4, 1 * SIZE(AO2)
  600. LFDU y01, 1 * SIZE(Y1) /* y values for the next group, two at a time */
  601. LFDU y02, 1 * SIZE(Y1)
  602. #ifdef PPCG4
  603. dcbt AO2, PREA
  604. #endif
  605. FMADD y13, alpha1, a5, y05
  606. LFDU a5, 1 * SIZE(AO2)
  607. FMADD y14, alpha1, a6, y06
  608. LFDU a6, 1 * SIZE(AO2)
  609. FMADD y15, alpha1, a7, y07
  610. LFDU a7, 1 * SIZE(AO2)
  611. FMADD y16, alpha1, a8, y08
  612. LFDU a8, 1 * SIZE(AO2)
  613. LFDU y03, 1 * SIZE(Y1)
  614. LFDU y04, 1 * SIZE(Y1)
  615. #if defined(PPCG4) && defined(DOUBLE)
  616. dcbt AO2, PREA
  617. #endif
  618. FMADD y09, alpha2, a1, y09 /* stage 2: add alpha2 * column 2 */
  619. LFDU a1, 1 * SIZE(AO1) /* refill from column 1 for the next group */
  620. FMADD y10, alpha2, a2, y10
  621. LFDU a2, 1 * SIZE(AO1)
  622. FMADD y11, alpha2, a3, y11
  623. LFDU a3, 1 * SIZE(AO1)
  624. FMADD y12, alpha2, a4, y12
  625. LFDU a4, 1 * SIZE(AO1)
  626. #if defined(PPCG4) && defined(DOUBLE)
  627. dcbtst Y1, PREA
  628. #endif
  629. LFDU y05, 1 * SIZE(Y1)
  630. LFDU y06, 1 * SIZE(Y1)
  631. #ifdef PPCG4
  632. dcbt AO1, PREA
  633. #endif
  634. FMADD y13, alpha2, a5, y13
  635. LFDU a5, 1 * SIZE(AO1)
  636. FMADD y14, alpha2, a6, y14
  637. LFDU a6, 1 * SIZE(AO1)
  638. FMADD y15, alpha2, a7, y15
  639. LFDU a7, 1 * SIZE(AO1)
  640. FMADD y16, alpha2, a8, y16
  641. LFDU a8, 1 * SIZE(AO1)
  642. LFDU y07, 1 * SIZE(Y1)
  643. LFDU y08, 1 * SIZE(Y1)
  644. #if defined(PPCG4) && defined(DOUBLE)
  645. dcbt AO1, PREA
  646. #endif
  647. STFDU y09, 1 * SIZE(Y2) /* store the eight finished results */
  648. STFDU y10, 1 * SIZE(Y2)
  649. STFDU y11, 1 * SIZE(Y2)
  650. STFDU y12, 1 * SIZE(Y2)
  651. STFDU y13, 1 * SIZE(Y2)
  652. STFDU y14, 1 * SIZE(Y2)
  653. STFDU y15, 1 * SIZE(Y2)
  654. STFDU y16, 1 * SIZE(Y2)
  655. bdnz LL(32)
  656. .align 4
  657. LL(33): /* drain: last group, accumulated in place in y01..y08 */
  658. FMADD y01, alpha1, a1, y01
  659. LFDU a1, 1 * SIZE(AO2)
  660. FMADD y02, alpha1, a2, y02
  661. LFDU a2, 1 * SIZE(AO2)
  662. FMADD y03, alpha1, a3, y03
  663. LFDU a3, 1 * SIZE(AO2)
  664. FMADD y04, alpha1, a4, y04
  665. LFDU a4, 1 * SIZE(AO2)
  666. FMADD y05, alpha1, a5, y05
  667. LFDU a5, 1 * SIZE(AO2)
  668. FMADD y06, alpha1, a6, y06
  669. LFDU a6, 1 * SIZE(AO2)
  670. FMADD y07, alpha1, a7, y07
  671. LFDU a7, 1 * SIZE(AO2)
  672. FMADD y08, alpha1, a8, y08
  673. LFDU a8, 1 * SIZE(AO2)
  674. FMADD y01, alpha2, a1, y01
  675. FMADD y02, alpha2, a2, y02
  676. FMADD y03, alpha2, a3, y03
  677. FMADD y04, alpha2, a4, y04
  678. FMADD y05, alpha2, a5, y05
  679. STFDU y01, 1 * SIZE(Y2)
  680. FMADD y06, alpha2, a6, y06
  681. STFDU y02, 1 * SIZE(Y2)
  682. FMADD y07, alpha2, a7, y07
  683. STFDU y03, 1 * SIZE(Y2)
  684. FMADD y08, alpha2, a8, y08
  685. STFDU y04, 1 * SIZE(Y2)
  686. STFDU y05, 1 * SIZE(Y2)
  687. STFDU y06, 1 * SIZE(Y2)
  688. STFDU y07, 1 * SIZE(Y2)
  689. STFDU y08, 1 * SIZE(Y2)
  690. .align 4
  /* Two-column pass, row remainder: handle the M & 7 leftover rows as blocks of 4, 2 and 1. */
  691. LL(35):
  692. andi. r0, M, 7
  693. ble LL(40) /* no leftover rows */
  694. andi. r0, M, 4
  695. ble LL(37)
  696. LFDU a1, 1 * SIZE(AO1) /* four rows: column 1 into a1..a4, y into y01..y04 */
  697. LFDU y01, 1 * SIZE(Y1)
  698. LFDU a2, 1 * SIZE(AO1)
  699. LFDU y02, 1 * SIZE(Y1)
  700. LFDU a3, 1 * SIZE(AO1)
  701. LFDU y03, 1 * SIZE(Y1)
  702. LFDU a4, 1 * SIZE(AO1)
  703. LFDU y04, 1 * SIZE(Y1)
  704. FMADD y01, alpha1, a1, y01
  705. LFDU a5, 1 * SIZE(AO2) /* column 2 into a5..a8 */
  706. FMADD y02, alpha1, a2, y02
  707. LFDU a6, 1 * SIZE(AO2)
  708. FMADD y03, alpha1, a3, y03
  709. LFDU a7, 1 * SIZE(AO2)
  710. FMADD y04, alpha1, a4, y04
  711. LFDU a8, 1 * SIZE(AO2)
  712. FMADD y01, alpha2, a5, y01
  713. FMADD y02, alpha2, a6, y02
  714. FMADD y03, alpha2, a7, y03
  715. FMADD y04, alpha2, a8, y04
  716. STFDU y01, 1 * SIZE(Y2)
  717. STFDU y02, 1 * SIZE(Y2)
  718. STFDU y03, 1 * SIZE(Y2)
  719. STFDU y04, 1 * SIZE(Y2)
  720. .align 4
  721. LL(37):
  722. andi. r0, M, 2
  723. ble LL(38)
  724. LFDU a1, 1 * SIZE(AO1) /* two rows: a1,a2 from column 1, a3,a4 from column 2 */
  725. LFDU y01, 1 * SIZE(Y1)
  726. LFDU a2, 1 * SIZE(AO1)
  727. LFDU y02, 1 * SIZE(Y1)
  728. LFDU a3, 1 * SIZE(AO2)
  729. LFDU a4, 1 * SIZE(AO2)
  730. FMADD y01, alpha1, a1, y01
  731. FMADD y02, alpha1, a2, y02
  732. FMADD y01, alpha2, a3, y01
  733. FMADD y02, alpha2, a4, y02
  734. STFDU y01, 1 * SIZE(Y2)
  735. STFDU y02, 1 * SIZE(Y2)
  736. .align 4
  737. LL(38):
  738. andi. r0, M, 1
  739. ble LL(40)
  740. LFDU a1, 1 * SIZE(AO1) /* one row */
  741. LFDU y01, 1 * SIZE(Y1)
  742. LFDU a2, 1 * SIZE(AO2)
  743. FMADD y01, alpha1, a1, y01
  744. FMADD y01, alpha2, a2, y01
  745. STFDU y01, 1 * SIZE(Y2)
  746. .align 4
  /* Single-column pass (taken when N & 1 is set): y += alpha1 * one column of A. */
  /* LL(42) is the steady-state loop and LL(43) finishes the last group of eight rows. */
  747. LL(40):
  748. andi. J, N, 1
  749. lfd alpha, ALPHA /* reload alpha (f27 is also a8); lfd leaves cr0 from the andi. intact */
  750. ble LL(990) /* bit clear: no single-column pass */
  751. LFDUX alpha1, X, INCX /* last x entry */
  752. mr AO1, A
  753. add A, A, LDA
  754. FMUL alpha1, alpha, alpha1 /* pre-scale by alpha */
  755. mr Y1, YY
  756. mr Y2, YY
  757. srawi. r0, M, 3 /* CTR = M >> 3: groups of eight rows */
  758. mtspr CTR, r0
  759. ble LL(45) /* fewer than eight rows */
  760. LFDU y01, 1 * SIZE(Y1) /* preload y01..y08 and a1..a8 */
  761. LFDU a1, 1 * SIZE(AO1)
  762. LFDU y02, 1 * SIZE(Y1)
  763. LFDU a2, 1 * SIZE(AO1)
  764. LFDU y03, 1 * SIZE(Y1)
  765. LFDU a3, 1 * SIZE(AO1)
  766. LFDU y04, 1 * SIZE(Y1)
  767. LFDU a4, 1 * SIZE(AO1)
  768. LFDU y05, 1 * SIZE(Y1)
  769. LFDU a5, 1 * SIZE(AO1)
  770. LFDU y06, 1 * SIZE(Y1)
  771. LFDU a6, 1 * SIZE(AO1)
  772. LFDU y07, 1 * SIZE(Y1)
  773. LFDU a7, 1 * SIZE(AO1)
  774. LFDU y08, 1 * SIZE(Y1)
  775. LFDU a8, 1 * SIZE(AO1)
  776. bdz LL(43) /* exactly one group of eight */
  777. .align 4
  778. LL(42):
  779. #ifdef PPCG4
  780. dcbtst Y1, PREA
  781. #endif
  782. FMADD y09, alpha1, a1, y01 /* finish rows 1..4 of the group, refilling a1..a4 for the next one */
  783. LFDU a1, 1 * SIZE(AO1)
  784. FMADD y10, alpha1, a2, y02
  785. LFDU a2, 1 * SIZE(AO1)
  786. FMADD y11, alpha1, a3, y03
  787. LFDU a3, 1 * SIZE(AO1)
  788. FMADD y12, alpha1, a4, y04
  789. LFDU a4, 1 * SIZE(AO1)
  790. LFDU y01, 1 * SIZE(Y1) /* y01..y04 for the next group */
  791. LFDU y02, 1 * SIZE(Y1)
  792. LFDU y03, 1 * SIZE(Y1)
  793. LFDU y04, 1 * SIZE(Y1)
  794. #ifdef PPCG4
  795. dcbt AO1, PREA
  796. #endif
  797. FMADD y13, alpha1, a5, y05 /* rows 5..8 */
  798. LFDU a5, 1 * SIZE(AO1)
  799. FMADD y14, alpha1, a6, y06
  800. LFDU a6, 1 * SIZE(AO1)
  801. FMADD y15, alpha1, a7, y07
  802. LFDU a7, 1 * SIZE(AO1)
  803. FMADD y16, alpha1, a8, y08
  804. LFDU a8, 1 * SIZE(AO1)
  805. #if defined(PPCG4) && defined(DOUBLE)
  806. dcbtst Y1, PREA
  807. #endif
  808. LFDU y05, 1 * SIZE(Y1) /* y05..y08 for the next group */
  809. LFDU y06, 1 * SIZE(Y1)
  810. LFDU y07, 1 * SIZE(Y1)
  811. LFDU y08, 1 * SIZE(Y1)
  812. #if defined(PPCG4) && defined(DOUBLE)
  813. dcbt AO1, PREA
  814. #endif
  815. STFDU y09, 1 * SIZE(Y2) /* store the eight finished results */
  816. STFDU y10, 1 * SIZE(Y2)
  817. STFDU y11, 1 * SIZE(Y2)
  818. STFDU y12, 1 * SIZE(Y2)
  819. STFDU y13, 1 * SIZE(Y2)
  820. STFDU y14, 1 * SIZE(Y2)
  821. STFDU y15, 1 * SIZE(Y2)
  822. STFDU y16, 1 * SIZE(Y2)
  823. bdnz LL(42)
  824. .align 4
  825. LL(43): /* drain: last group, accumulated in place in y01..y08 */
  826. FMADD y01, alpha1, a1, y01
  827. FMADD y02, alpha1, a2, y02
  828. FMADD y03, alpha1, a3, y03
  829. FMADD y04, alpha1, a4, y04
  830. FMADD y05, alpha1, a5, y05
  831. STFDU y01, 1 * SIZE(Y2)
  832. FMADD y06, alpha1, a6, y06
  833. STFDU y02, 1 * SIZE(Y2)
  834. FMADD y07, alpha1, a7, y07
  835. STFDU y03, 1 * SIZE(Y2)
  836. FMADD y08, alpha1, a8, y08
  837. STFDU y04, 1 * SIZE(Y2)
  838. STFDU y05, 1 * SIZE(Y2)
  839. STFDU y06, 1 * SIZE(Y2)
  840. STFDU y07, 1 * SIZE(Y2)
  841. STFDU y08, 1 * SIZE(Y2)
  842. .align 4
  /* Single-column pass, row remainder: handle the M & 7 leftover rows as blocks of 4, 2 and 1. */
  843. LL(45):
  844. andi. r0, M, 7
  845. ble LL(990) /* no leftover rows */
  846. andi. r0, M, 4
  847. ble LL(47)
  848. LFDU a1, 1 * SIZE(AO1) /* four rows */
  849. LFDU y01, 1 * SIZE(Y1)
  850. LFDU a2, 1 * SIZE(AO1)
  851. LFDU y02, 1 * SIZE(Y1)
  852. LFDU a3, 1 * SIZE(AO1)
  853. LFDU y03, 1 * SIZE(Y1)
  854. LFDU a4, 1 * SIZE(AO1)
  855. LFDU y04, 1 * SIZE(Y1)
  856. FMADD y01, alpha1, a1, y01
  857. FMADD y02, alpha1, a2, y02
  858. FMADD y03, alpha1, a3, y03
  859. FMADD y04, alpha1, a4, y04
  860. STFDU y01, 1 * SIZE(Y2)
  861. STFDU y02, 1 * SIZE(Y2)
  862. STFDU y03, 1 * SIZE(Y2)
  863. STFDU y04, 1 * SIZE(Y2)
  864. .align 4
  865. LL(47):
  866. andi. r0, M, 2
  867. ble LL(48)
  868. LFDU a1, 1 * SIZE(AO1) /* two rows */
  869. LFDU y01, 1 * SIZE(Y1)
  870. LFDU a2, 1 * SIZE(AO1)
  871. LFDU y02, 1 * SIZE(Y1)
  872. FMADD y01, alpha1, a1, y01
  873. FMADD y02, alpha1, a2, y02
  874. STFDU y01, 1 * SIZE(Y2)
  875. STFDU y02, 1 * SIZE(Y2)
  876. .align 4
  877. LL(48):
  878. andi. r0, M, 1
  879. ble LL(990)
  880. LFDU a1, 1 * SIZE(AO1) /* one row */
  881. LFDU y01, 1 * SIZE(Y1)
  882. FMADD y01, alpha1, a1, y01
  883. STFDU y01, 1 * SIZE(Y2)
  884. .align 4
  /* Copy-back for strided y (INCY != SIZE): the products were accumulated in BUFFER, */
  /* so add them element by element onto the caller's y. Y is the strided read cursor, */
  /* Y1 the strided write cursor, YY walks BUFFER contiguously. Eight at a time, then 4, 2, 1. */
  /* FADD, LFDUX and STFDUX are macros defined outside this file. */
  885. LL(990):
  886. cmpi cr0, 0, INCY, SIZE
  887. beq LL(999) /* INCY == SIZE: results were written straight into y */
  888. addi YY, BUFFER, -SIZE /* rewind to the start of BUFFER (same one-element bias as before) */
  889. mr Y1, Y /* Y was left one stride before y by the setup code */
  890. srawi. r0, M, 3 /* CTR = M >> 3 */
  891. mtspr CTR, r0
  892. ble LL(995)
  893. .align 4
  894. LL(991):
  895. LFDUX f0, Y, INCY /* f0..f7 = eight current y values */
  896. LFDUX f1, Y, INCY
  897. LFDUX f2, Y, INCY
  898. LFDUX f3, Y, INCY
  899. LFDUX f4, Y, INCY
  900. LFDUX f5, Y, INCY
  901. LFDUX f6, Y, INCY
  902. LFDUX f7, Y, INCY
  903. LFDU f8, 1 * SIZE(YY) /* f8..f15 = eight accumulated values from BUFFER */
  904. LFDU f9, 1 * SIZE(YY)
  905. LFDU f10, 1 * SIZE(YY)
  906. LFDU f11, 1 * SIZE(YY)
  907. LFDU f12, 1 * SIZE(YY)
  908. LFDU f13, 1 * SIZE(YY)
  909. LFDU f14, 1 * SIZE(YY)
  910. LFDU f15, 1 * SIZE(YY)
  911. FADD f8, f8, f0
  912. FADD f9, f9, f1
  913. FADD f10, f10, f2
  914. FADD f11, f11, f3
  915. FADD f12, f12, f4
  916. FADD f13, f13, f5
  917. FADD f14, f14, f6
  918. FADD f15, f15, f7
  919. STFDUX f8, Y1, INCY /* write the sums back to y */
  920. STFDUX f9, Y1, INCY
  921. STFDUX f10, Y1, INCY
  922. STFDUX f11, Y1, INCY
  923. STFDUX f12, Y1, INCY
  924. STFDUX f13, Y1, INCY
  925. STFDUX f14, Y1, INCY
  926. STFDUX f15, Y1, INCY
  927. bdnz LL(991)
  928. .align 4
  929. LL(995):
  930. andi. J, M, 4
  931. ble LL(996)
  932. LFDUX f0, Y, INCY /* four leftover elements */
  933. LFDUX f1, Y, INCY
  934. LFDUX f2, Y, INCY
  935. LFDUX f3, Y, INCY
  936. LFDU f8, 1 * SIZE(YY)
  937. LFDU f9, 1 * SIZE(YY)
  938. LFDU f10, 1 * SIZE(YY)
  939. LFDU f11, 1 * SIZE(YY)
  940. FADD f8, f8, f0
  941. FADD f9, f9, f1
  942. FADD f10, f10, f2
  943. FADD f11, f11, f3
  944. STFDUX f8, Y1, INCY
  945. STFDUX f9, Y1, INCY
  946. STFDUX f10, Y1, INCY
  947. STFDUX f11, Y1, INCY
  948. .align 4
  949. LL(996):
  950. andi. J, M, 2
  951. ble LL(997)
  952. LFDUX f0, Y, INCY /* two leftover elements */
  953. LFDUX f1, Y, INCY
  954. LFDU f8, 1 * SIZE(YY)
  955. LFDU f9, 1 * SIZE(YY)
  956. FADD f8, f8, f0
  957. FADD f9, f9, f1
  958. STFDUX f8, Y1, INCY
  959. STFDUX f9, Y1, INCY
  960. .align 4
  961. LL(997):
  962. andi. J, M, 1
  963. ble LL(999)
  964. LFDUX f0, Y, INCY /* last element */
  965. LFDU f8, 1 * SIZE(YY)
  966. FADD f8, f8, f0
  967. STFDUX f8, Y1, INCY
  968. .align 4
  /* Common exit: set r3 = 0, restore every register saved by the prologue, */
  /* release the frame and return. Also reached directly when M <= 0 or N <= 0. */
  /* EPILOGUE is a macro defined outside this file. */
  969. LL(999):
  970. li r3, 0 /* r3 = 0 on return */
  971. lfd f14, 0(SP) /* restore f14..f27 from offsets 0..104 */
  972. lfd f15, 8(SP)
  973. lfd f16, 16(SP)
  974. lfd f17, 24(SP)
  975. lfd f18, 32(SP)
  976. lfd f19, 40(SP)
  977. lfd f20, 48(SP)
  978. lfd f21, 56(SP)
  979. lfd f22, 64(SP)
  980. lfd f23, 72(SP)
  981. lfd f24, 80(SP)
  982. lfd f25, 88(SP)
  983. lfd f26, 96(SP)
  984. lfd f27, 104(SP)
  985. #ifdef __64BIT__
  986. ld r14, 144(SP) /* restore r14..r23 (doubleword slots) */
  987. ld r15, 152(SP)
  988. ld r16, 160(SP)
  989. ld r17, 168(SP)
  990. ld r18, 176(SP)
  991. ld r19, 184(SP)
  992. ld r20, 192(SP)
  993. ld r21, 200(SP)
  994. ld r22, 208(SP)
  995. ld r23, 216(SP)
  996. #else
  997. lwz r14, 144(SP) /* restore r14..r23 (word slots) */
  998. lwz r15, 148(SP)
  999. lwz r16, 152(SP)
  1000. lwz r17, 156(SP)
  1001. lwz r18, 160(SP)
  1002. lwz r19, 164(SP)
  1003. lwz r20, 168(SP)
  1004. lwz r21, 172(SP)
  1005. lwz r22, 176(SP)
  1006. lwz r23, 180(SP)
  1007. #endif
  1008. addi SP, SP, STACKSIZE /* release the frame reserved by the prologue */
  1009. blr
  1010. EPILOGUE
  1011. #endif /* closes #ifndef NEEDPARAM */