You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

cgemm_macros_power10.S 52 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131
  1. /***************************************************************************
  2. Copyright (c) 2013-2020, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  /* Byte-displacement helpers used by the load and pointer-advance macros.
     DISPn(ind, disp) evaluates to ind*unit_size*n + disp.
     unit_size is 8; presumably the size in bytes of one single-precision
     complex element (TODO confirm).
     Both arguments are parenthesised so that an expression argument
     (for example 1+1) expands with the intended precedence. */
  27. #define unit_size 8
  28. #define DISP32(ind, disp) ((ind)*unit_size*32+(disp))
  29. #define DISP16(ind, disp) ((ind)*unit_size*16+(disp))
  30. #define DISP8(ind, disp) ((ind)*unit_size*8+(disp))
  31. #define DISP4(ind, disp) ((ind)*unit_size*4+(disp))
  32. #define DISP2(ind, disp) ((ind)*unit_size*2+(disp))
  33. #define DISP1(ind, disp) ((ind)*unit_size+(disp))
  34. #define DISPX(disp) (disp)
  /* AGGREGATE_REALS_IMAGES: combine two pairs of partial-sum vectors in place.
     VSINR_OUT1 (in/out) is combined with VSINR, and VSINI_OUT2 (in/out) with
     VSINI. The sign of each combination is chosen at build time by the
     variant macro (NN, CN, NC, ...) that is defined. */
  35. .macro AGGREGATE_REALS_IMAGES VSINR_OUT1, VSINR, VSINI_OUT2, VSINI
  36. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  /* OUT1 = OUT1 - VSINR ; OUT2 = OUT2 + VSINI */
  37. xvsubsp \VSINR_OUT1, \VSINR_OUT1, \VSINR
  38. xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI
  39. #elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
  /* OUT1 = OUT1 + VSINR ; OUT2 = OUT2 - VSINI */
  40. xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR
  41. xvsubsp \VSINI_OUT2, \VSINI_OUT2, \VSINI
  42. #elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
  /* OUT1 = OUT1 + VSINR ; OUT2 = VSINI - OUT2 */
  43. xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR
  44. xvsubsp \VSINI_OUT2, \VSINI, \VSINI_OUT2
  45. #else // CC || CR || RC || RR
  46. /* this case assumes the caller supplies {-alpha_r,-alpha_i} */
  47. /* computes i1i2-r1r2, so alpha real is negated instead to fix the sign */
  /* OUT1 = VSINR - OUT1 */
  48. xvsubsp \VSINR_OUT1, \VSINR, \VSINR_OUT1
  49. /* alpha imaginary is negated instead to fix the sign */
  /* OUT2 = OUT2 + VSINI */
  50. xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI
  51. #endif
  52. .endm
  /* AGGREGATE_REALS_IMAGES_A_PERMUTE: same in-place combination as
     AGGREGATE_REALS_IMAGES, except that the imaginary-side formulas of the
     CN/CT/RN/RT branch and the NC/TC/NR/TR branch are exchanged.
     VSINR_OUT1 and VSINI_OUT2 are in/out; VSINR and VSINI are inputs. */
  53. .macro AGGREGATE_REALS_IMAGES_A_PERMUTE VSINR_OUT1, VSINR, VSINI_OUT2, VSINI
  54. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  /* OUT1 = OUT1 - VSINR ; OUT2 = OUT2 + VSINI */
  55. xvsubsp \VSINR_OUT1, \VSINR_OUT1, \VSINR
  56. xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI
  57. #elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
  /* OUT1 = OUT1 + VSINR ; OUT2 = VSINI - OUT2 */
  58. xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR
  59. xvsubsp \VSINI_OUT2, \VSINI, \VSINI_OUT2
  60. #elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
  /* OUT1 = OUT1 + VSINR ; OUT2 = OUT2 - VSINI */
  61. xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR
  62. xvsubsp \VSINI_OUT2, \VSINI_OUT2, \VSINI
  63. #else // CC || CR || RC || RR
  64. /* this case assumes the caller supplies {-alpha_r,-alpha_i} */
  65. /* computes i1i2-r1r2, so alpha real is negated instead to fix the sign */
  /* OUT1 = VSINR - OUT1 */
  66. xvsubsp \VSINR_OUT1, \VSINR, \VSINR_OUT1
  67. /* alpha imaginary is negated instead to fix the sign */
  /* OUT2 = OUT2 + VSINI */
  68. xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI
  69. #endif
  70. .endm
  71. /* {i0,i1} * {alpha_i,alpha_i} [- VSOUT1] ;[VSOUT2 +] {r0,r1}*{alpha_i,alpha_i} */
  /* MULT_APLHA_PART1: first half of the multiplication by alpha.
     VSOUT1 = VSINII * alpha_i and VSOUT2 = VSINRR * alpha_i; both outputs are
     overwritten. The bracketed terms in the comment above are applied later
     by MULT_APLHA_PART2. alpha_i is defined outside this section. */
  72. .macro MULT_APLHA_PART1 VSINRR, VSINII, VSOUT1, VSOUT2
  73. xvmulsp \VSOUT1, \VSINII, alpha_i
  74. xvmulsp \VSOUT2, \VSINRR, alpha_i
  75. .endm
  76. /* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */
  /* MULT_APLHA_PART2: second half of the multiplication by alpha.
     VSOUT1 = VSINRR * alpha_r - VSOUT1 and VSOUT2 = VSINII * alpha_r + VSOUT2,
     so VSOUT1/VSOUT2 must already hold the MULT_APLHA_PART1 results.
     alpha_r is defined outside this section. */
  77. .macro MULT_APLHA_PART2 VSINRR, VSINII, VSOUT1, VSOUT2
  78. xvmsubasp \VSOUT1, \VSINRR, alpha_r
  79. xvmaddasp \VSOUT2, \VSINII, alpha_r
  80. .endm
  /* PERMUTE1: build OUT from four source vectors.
     vs62 = bitwise select of R1/R2 under mask vs57, OUT = select of R3/R4
     under vs57, then OUT = { OUT.dw0, vs62.dw1 } (ISA doubleword numbering).
     Clobbers vs62; vs57 is a select mask set up outside this section. */
  81. .macro PERMUTE1 OUT, R1, R2, R3, R4
  82. xxsel vs62, \R1, \R2, vs57
  83. xxsel \OUT, \R3, \R4, vs57
  84. xxpermdi \OUT, \OUT, vs62, 1
  85. .endm
  /* PERMUTE2: like PERMUTE1 but with the select operands swapped
     (R2/R1 and R4/R3). OUT = { vs62.dw0, OUT.dw1 }, and OUT is then passed
     through xxperm with permute_mask. Clobbers vs62. */
  86. .macro PERMUTE2 OUT, R1, R2, R3, R4
  87. xxsel vs62, \R2, \R1, vs57
  88. xxsel \OUT, \R4, \R3, vs57
  89. xxpermdi \OUT, vs62, \OUT, 1
  90. xxperm \OUT, \OUT, permute_mask
  91. .endm
  /* PERMUTE3: vs62 = select of R1/R2 under vs57, OUT = select of R3/R4 under
     vs57, then OUT = { vs62.dw1, OUT.dw0 } (ISA doubleword numbering).
     Clobbers vs62. */
  92. .macro PERMUTE3 OUT, R1, R2, R3, R4
  93. xxsel vs62, \R1, \R2, vs57
  94. xxsel \OUT, \R3, \R4, vs57
  95. xxpermdi \OUT, vs62, \OUT, 2
  96. .endm
  /* PERMUTE4: like PERMUTE3 but with the select operands swapped
     (R2/R1 and R4/R3). OUT = { OUT.dw1, vs62.dw0 }, and OUT is then passed
     through xxperm with permute_mask. Clobbers vs62. */
  97. .macro PERMUTE4 OUT, R1, R2, R3, R4
  98. xxsel vs62, \R2, \R1, vs57
  99. xxsel \OUT, \R4, \R3, vs57
  100. xxpermdi \OUT, \OUT, vs62, 2
  101. xxperm \OUT, \OUT, permute_mask
  102. .endm
  /* GROUP1: produce permuted copies of the first result group.
     Sources vs32, vs40, vs33, vs41, vs36, vs44, vs37, vs45 are permuted with
     permute_mask into vs0, vs4, vs1, vs5, vs8, vs12, vs9, vs13 respectively.
     NOTE(review): presumably permute_mask swaps the real/imaginary lanes;
     it is defined outside this section - confirm. */
  103. .macro GROUP1
  104. xxperm vs0, vs32, permute_mask
  105. xxperm vs4, vs40, permute_mask
  106. xxperm vs1, vs33, permute_mask
  107. xxperm vs5, vs41, permute_mask
  108. xxperm vs8, vs36, permute_mask
  109. xxperm vs12, vs44, permute_mask
  110. xxperm vs9, vs37, permute_mask
  111. xxperm vs13, vs45, permute_mask
  112. .endm
  /* AGG_GROUP1: apply AGGREGATE_REALS_IMAGES to the first result group,
     combining vs32/vs33/vs36/vs37 and vs40/vs41/vs44/vs45 in place with the
     permuted copies that GROUP1 leaves in vs0, vs1, vs8, vs9 and
     vs4, vs5, vs12, vs13. */
  113. .macro AGG_GROUP1
  114. AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4
  115. AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5
  116. AGGREGATE_REALS_IMAGES vs36, vs8, vs44, vs12
  117. AGGREGATE_REALS_IMAGES vs37, vs9, vs45, vs13
  118. .endm
  /* GROUP2: produce permuted copies of the second result group.
     Sources vs34, vs42, vs35, vs43, vs38, vs46, vs39, vs47 are permuted with
     permute_mask into vs0, vs4, vs1, vs5, vs8, vs12, vs9, vs13 respectively
     (the same scratch registers GROUP1 writes). */
  119. .macro GROUP2
  120. xxperm vs0, vs34, permute_mask
  121. xxperm vs4, vs42, permute_mask
  122. xxperm vs1, vs35, permute_mask
  123. xxperm vs5, vs43, permute_mask
  124. xxperm vs8, vs38, permute_mask
  125. xxperm vs12, vs46, permute_mask
  126. xxperm vs9, vs39, permute_mask
  127. xxperm vs13, vs47, permute_mask
  128. .endm
  /* AGG_GROUP2: apply AGGREGATE_REALS_IMAGES to the second result group,
     combining vs34/vs35/vs38/vs39 and vs42/vs43/vs46/vs47 in place with the
     permuted copies that GROUP2 leaves in vs0, vs1, vs8, vs9 and
     vs4, vs5, vs12, vs13. */
  129. .macro AGG_GROUP2
  130. AGGREGATE_REALS_IMAGES vs34, vs0, vs42, vs4
  131. AGGREGATE_REALS_IMAGES vs35, vs1, vs43, vs5
  132. AGGREGATE_REALS_IMAGES vs38, vs8, vs46, vs12
  133. AGGREGATE_REALS_IMAGES vs39, vs9, vs47, vs13
  134. .endm
  /* MULTIPLY_GROUP1: multiply the first result group by alpha.
     Inputs are the pairs (vs32,vs40), (vs33,vs41), (vs36,vs44), (vs37,vs45);
     outputs go to (vs0,vs1), (vs2,vs3), (vs8,vs9), (vs10,vs11).
     All PART1 steps are issued before the PART2 steps that consume them. */
  135. .macro MULTIPLY_GROUP1
  136. MULT_APLHA_PART1 vs32, vs40, vs0, vs1
  137. MULT_APLHA_PART1 vs33, vs41, vs2, vs3
  138. MULT_APLHA_PART1 vs36, vs44, vs8, vs9
  139. MULT_APLHA_PART1 vs37, vs45, vs10, vs11
  140. MULT_APLHA_PART2 vs32, vs40, vs0, vs1
  141. MULT_APLHA_PART2 vs33, vs41, vs2, vs3
  142. MULT_APLHA_PART2 vs36, vs44, vs8, vs9
  143. MULT_APLHA_PART2 vs37, vs45, vs10, vs11
  144. .endm
  /* MULTIPLY_GROUP2: multiply the second result group by alpha.
     Inputs are the pairs (vs34,vs42), (vs35,vs43), (vs38,vs46), (vs39,vs47);
     outputs go to (vs4,vs5), (vs6,vs7), (vs12,vs13), (vs14,vs15).
     All PART1 steps are issued before the PART2 steps that consume them. */
  145. .macro MULTIPLY_GROUP2
  146. MULT_APLHA_PART1 vs34, vs42, vs4, vs5
  147. MULT_APLHA_PART1 vs35, vs43, vs6, vs7
  148. MULT_APLHA_PART1 vs38, vs46, vs12, vs13
  149. MULT_APLHA_PART1 vs39, vs47, vs14, vs15
  150. MULT_APLHA_PART2 vs34, vs42, vs4, vs5
  151. MULT_APLHA_PART2 vs35, vs43, vs6, vs7
  152. MULT_APLHA_PART2 vs38, vs46, vs12, vs13
  153. MULT_APLHA_PART2 vs39, vs47, vs14, vs15
  154. .endm
  155. /* reconstruct r, i pairs */
  /* RECONSTRUCT_PAIR1: merge each (even, odd) register pair produced by
     MULTIPLY_GROUP1 into the even register: vs0<-vs1, vs2<-vs3, vs8<-vs9,
     vs10<-vs11, using byte permute control save_permute_1 (defined outside
     this section). xxperm also reads the previous destination contents. */
  156. .macro RECONSTRUCT_PAIR1
  157. xxperm vs0, vs1, save_permute_1
  158. xxperm vs2, vs3, save_permute_1
  159. xxperm vs8, vs9, save_permute_1
  160. xxperm vs10, vs11, save_permute_1
  161. .endm
  /* RECONSTRUCT_PAIR2: merge each (even, odd) register pair produced by
     MULTIPLY_GROUP2 into the even register: vs4<-vs5, vs6<-vs7, vs12<-vs13,
     vs14<-vs15, using byte permute control save_permute_1. */
  162. .macro RECONSTRUCT_PAIR2
  163. xxperm vs4, vs5, save_permute_1
  164. xxperm vs6, vs7, save_permute_1
  165. xxperm vs12, vs13, save_permute_1
  166. xxperm vs14, vs15, save_permute_1
  167. .endm
  /* SHUFFLE_ACC: unload one MMA accumulator and regroup its four rows.
     ACC      accumulator number; xxmfacc copies it into its VSRs.
     R0..R3   the four VSRs that receive that accumulator (callers in this
              file pass vs(4*ACC) .. vs(4*ACC+3)).
     O1..O4   destination registers for the PERMUTE1..PERMUTE4 results.
     Clobbers vs62 (scratch inside the PERMUTE macros). */
  168. .macro SHUFFLE_ACC ACC, R0, R1, R2, R3, O1, O2, O3, O4
  169. xxmfacc \ACC
  170. PERMUTE1 \O1, \R3, \R2, \R1, \R0
  171. PERMUTE2 \O2, \R1, \R0, \R3, \R2
  172. PERMUTE3 \O3, \R1, \R0, \R3, \R2
  173. PERMUTE4 \O4, \R3, \R2, \R1, \R0
  174. .endm
  175. /* macros for N=4 and M=8
  176. **********************************************************************************************/
  /* ZERO4x8: clear accumulators 0-7, all eight used by the 4x8 kernel. */
  177. .macro ZERO4x8
  178. xxsetaccz 0
  179. xxsetaccz 1
  180. xxsetaccz 2
  181. xxsetaccz 3
  182. xxsetaccz 4
  183. xxsetaccz 5
  184. xxsetaccz 6
  185. xxsetaccz 7
  186. .endm
  /* LOAD4x8: load the operands for one 4x8 step from AO and BO at offset 0. */
  187. .macro LOAD4x8
  188. LOAD4x8O 0, 0
  189. .endm
  /* LOAD4x8O: load the operands for one 4x8 step.
     Reads 32 bytes at OffsetB(BO) into the pair vs34/vs35 and 64 bytes at
     OffsetA(AO) into the pairs vs32/vs33 and vs36/vs37. Pointers are not
     advanced. */
  190. .macro LOAD4x8O OffsetA, OffsetB
  191. lxvp vs34, (\OffsetB+0)(BO)
  192. lxvp vs32, (\OffsetA+0)(AO)
  193. lxvp vs36, (\OffsetA+32)(AO)
  194. .endm
  /* END4x8_NORMAL: run END4x8, advancing AO by 64 bytes and BO by 32 bytes. */
  195. .macro END4x8_NORMAL
  196. END4x8 AO, BO, 64, 32
  197. .endm
  /* END4x8_WITHOUT_ADD: run END4x8 without advancing AO or BO. */
  198. .macro END4x8_WITHOUT_ADD
  199. END4x8 AO, BO, 0, 0
  200. .endm
  /* END4x8: accumulate one 4x8 step from registers that are already loaded.
     AREG/BREG  pointer registers to advance.
     OffsetA/B  byte increments; a value of 0 emits no addi.
     The pointer updates are emitted before the outer products, which only
     read vs32-vs37. Accumulators 0-3 use vs35 and accumulators 4-7 use vs34,
     each paired with vs33, vs32, vs37, vs36. */
  201. .macro END4x8 AREG, BREG, OffsetA, OffsetB
  202. .if \OffsetB != 0
  203. addi \BREG, \BREG, \OffsetB
  204. .endif
  205. .if \OffsetA != 0
  206. addi \AREG, \AREG, \OffsetA
  207. .endif
  208. xvf32gerpp 3, 36, 35
  209. xvf32gerpp 2, 37, 35
  210. xvf32gerpp 1, 32, 35
  211. xvf32gerpp 0, 33, 35
  212. xvf32gerpp 7, 36, 34
  213. xvf32gerpp 6, 37, 34
  214. xvf32gerpp 5, 32, 34
  215. xvf32gerpp 4, 33, 34
  216. .endm
  /* LOAD4x8_2: load the operands for two 4x8 steps from AO and BO at offset 0. */
  217. .macro LOAD4x8_2
  218. LOAD4x8_2O 0, 0
  219. .endm
  /* LOAD4x8_2O: load the operands for two 4x8 steps (the registers consumed
     by KERNEL4x8_2). Reads 64 bytes at OffsetB(BO) into vs34/vs35 and
     vs38/vs39, and 128 bytes at OffsetA(AO) into vs32/vs33, vs36/vs37,
     vs40/vs41 and vs42/vs43. Pointers are not advanced. */
  220. .macro LOAD4x8_2O OffsetA, OffsetB
  221. lxvp vs34, (\OffsetB)(BO)
  222. lxvp vs38, (32+\OffsetB)(BO)
  223. lxvp vs32, (0+\OffsetA)(AO)
  224. lxvp vs36, (32+\OffsetA)(AO)
  225. lxvp vs40, (64+\OffsetA)(AO)
  226. lxvp vs42, (64+32+\OffsetA)(AO)
  227. .endm
  /* END4x8_2: final two-step 4x8 block. Runs KERNEL4x8_2 with Complete=1
     (no reloads) and IsLast=1, so AO advances by 128 and BO by 64 bytes. */
  228. .macro END4x8_2
  229. /* for load2 the offsets are 128 (A) and 64 (B) */
  230. KERNEL4x8_2 AO, BO, 128, 64, 0, 1, 1
  231. .endm
  /* KERNEL4x8_E2: KERNEL4x8_2 on AO/BO with Complete=1, i.e. accumulate the
     two loaded steps without loading operands for a following block. */
  232. .macro KERNEL4x8_E2 OffsetA, OffsetB, Index, IsLast
  233. KERNEL4x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
  234. .endm
  /* KERNEL4x8_L2: KERNEL4x8_2 on AO/BO with Complete=0, i.e. accumulate the
     two loaded steps and load the operands for the next block. */
  235. .macro KERNEL4x8_L2 OffsetA, OffsetB, Index, IsLast
  236. KERNEL4x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
  237. .endm
  /* KERNEL4x8_2: two unrolled 4x8 accumulation steps.
     AREG/BREG  pointer registers for the A and B operands.
     OffsetA/B  byte offsets added to the DISP16/DISP8 displacement.
     Index      block index scaled by DISP16 (A) and DISP8 (B).
     IsLast     1 = advance AREG/BREG at the end of the macro.
     Complete   0 = reload operand registers for the next block,
                1 = do not load anything further.
     The first step consumes vs32-vs37, the second vs38-vs43. */
  238. .macro KERNEL4x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
  239. xvf32gerpp 3, 36, 35
  240. xvf32gerpp 2, 37, 35
  241. xvf32gerpp 1, 32, 35
  242. xvf32gerpp 0, 33, 35
  243. xvf32gerpp 7, 36, 34
  244. xvf32gerpp 6, 37, 34
  245. xvf32gerpp 5, 32, 34
  246. xvf32gerpp 4, 33, 34
  /* the first-step registers are free now; refill them for the next block */
  247. .if \Complete==0
  248. lxvp vs34, DISP8(\Index, \OffsetB)(\BREG)
  249. lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
  250. lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
  251. .endif
  252. xvf32gerpp 3, 42, 39
  253. xvf32gerpp 2, 43, 39
  254. xvf32gerpp 1, 40, 39
  255. xvf32gerpp 0, 41, 39
  256. xvf32gerpp 7, 42, 38
  257. xvf32gerpp 6, 43, 38
  258. xvf32gerpp 5, 40, 38
  259. xvf32gerpp 4, 41, 38
  /* the second-step registers are free now; refill them for the next block */
  260. .if \Complete==0
  261. lxvp vs40, DISP16(\Index, 64+\OffsetA)(\AREG)
  262. lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG)
  263. lxvp vs42, DISP16(\Index, 64+32+\OffsetA)(\AREG)
  264. .endif
  /* pointer advance: by the caller's offsets when Complete==1, otherwise by
     one full two-step block (64 bytes of B, 128 bytes of A) */
  265. .if \IsLast==1
  266. .if \Complete==1
  267. addi \BREG, \BREG, DISP8(\Index, \OffsetB)
  268. addi \AREG, \AREG, DISP16(\Index, \OffsetA)
  269. .else
  270. addi \BREG, \BREG, DISP8(\Index, 64)
  271. addi \AREG, \AREG, DISP16(\Index, 128)
  272. .endif
  273. .endif
  274. .endm
  /* KERNEL4x8: one complete 4x8 step. Loads the operands at offset 0, then
     accumulates them while advancing AO by 64 and BO by 32 bytes. */
  275. .macro KERNEL4x8
  276. LOAD4x8
  277. END4x8 AO, BO, 64, 32
  278. .endm
  /* SAVE4x8: write back the 4x8 result tile held in accumulators 0-7.
     Steps: unload and regroup the accumulators (SHUFFLE_ACC), combine the
     partial sums (AGGREGATE_REALS_IMAGES), multiply by alpha
     (MULT_APLHA_PART1/2), rebuild the output vectors (RECONSTRUCT_PAIR1/2 and
     xxpermdi), then store 64 bytes to each of the four rows CO, CO+LDC,
     CO+2*LDC and CO+3*LDC. Unless TRMMKERNEL is defined, the existing
     contents of C are loaded and added first.
     CO is advanced by 64 bytes at the end. Uses T1-T4 as row pointers and
     overwrites vs0-vs62 scratch registers.
     NOTE(review): presumably LDC is the byte stride between rows of the
     tile - confirm against the caller. */
  279. .macro SAVE4x8
  /* unload the accumulators; some outputs deliberately land in vs16-vs21,
     registers of accumulators 4 and 5 that have already been read */
  280. SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
  281. SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
  282. SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46
  283. SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47
  284. SHUFFLE_ACC 4, vs16, vs17, vs18, vs19, vs48, vs56, vs52, vs60
  285. SHUFFLE_ACC 5, vs20, vs21, vs22, vs23, vs49, vs16, vs53, vs61
  286. SHUFFLE_ACC 7, vs28, vs29, vs30, vs31, vs17, vs19, vs18, vs20
  287. SHUFFLE_ACC 6, vs24, vs25, vs26, vs27, vs50, vs58, vs54, vs21
  /* T4 = 2*LDC, T1 = CO + LDC (second row) */
  288. add T4, LDC, LDC
  289. add T1, CO, LDC
  /* vs24-vs31 are free after the unload above; reuse them for rows 0 and 1
     of the existing C tile */
  290. #ifndef TRMMKERNEL
  291. lxvp vs24, 0(CO)
  292. #endif
  293. #ifndef TRMMKERNEL
  294. lxvp vs26, 32(CO)
  295. #endif
  296. #ifndef TRMMKERNEL
  297. lxvp vs28, 0(T1)
  298. #endif
  299. xxperm vs2, vs34, permute_mask
  300. xxperm vs6, vs42, permute_mask
  301. #ifndef TRMMKERNEL
  302. lxvp vs30, 32(T1)
  303. #endif
  304. xxperm vs3, vs35, permute_mask
  305. xxperm vs7, vs43, permute_mask
  /* T2 = CO + 2*LDC (third row), T3 = CO + 3*LDC (fourth row) */
  306. add T2, CO, T4
  307. add T3, T1, T4
  308. GROUP1
  309. AGG_GROUP1
  /* the remaining permute/aggregate pairs are interleaved by hand */
  310. AGGREGATE_REALS_IMAGES vs34, vs2, vs42, vs6
  311. xxperm vs10, vs38, permute_mask
  312. xxperm vs14, vs46, permute_mask
  313. AGGREGATE_REALS_IMAGES vs35, vs3, vs43, vs7
  314. xxperm vs11, vs39, permute_mask
  315. xxperm vs15, vs47, permute_mask
  316. xxperm vs0, vs48, permute_mask
  317. xxperm vs4, vs56, permute_mask
  318. xxperm vs1, vs49, permute_mask
  319. xxperm vs5, vs16, permute_mask
  320. AGGREGATE_REALS_IMAGES vs38, vs10, vs46, vs14
  321. xxperm vs2, vs50, permute_mask
  322. xxperm vs6, vs58, permute_mask
  323. AGGREGATE_REALS_IMAGES vs39, vs11, vs47, vs15
  324. xxperm vs3, vs17, permute_mask
  325. xxperm vs7, vs19, permute_mask
  326. AGGREGATE_REALS_IMAGES vs48, vs0, vs56, vs4
  327. xxperm vs8, vs52, permute_mask
  328. xxperm vs12, vs60, permute_mask
  329. AGGREGATE_REALS_IMAGES vs49, vs1, vs16, vs5
  330. xxperm vs9, vs53, permute_mask
  331. xxperm vs13, vs61, permute_mask
  332. AGGREGATE_REALS_IMAGES vs50, vs2, vs58, vs6
  333. xxperm vs10, vs54, permute_mask
  334. xxperm vs14, vs21, permute_mask
  335. AGGREGATE_REALS_IMAGES vs17, vs3, vs19, vs7
  336. xxperm vs11, vs18, permute_mask
  337. xxperm vs15, vs20, permute_mask
  338. AGGREGATE_REALS_IMAGES vs52, vs8, vs60, vs12
  339. AGGREGATE_REALS_IMAGES vs53, vs9, vs61, vs13
  /* multiply the first half (results from accumulators 0-3) by alpha */
  340. /*VSINRR, VSINII, VSOUT1, VSOUT2*/
  341. MULT_APLHA_PART1 vs32, vs40, vs0, vs1
  342. AGGREGATE_REALS_IMAGES vs54, vs10, vs21, vs14
  343. MULT_APLHA_PART1 vs33, vs41, vs2, vs3
  344. AGGREGATE_REALS_IMAGES vs18, vs11, vs20, vs15
  345. MULT_APLHA_PART1 vs34, vs42, vs4, vs5
  346. MULT_APLHA_PART1 vs35, vs43, vs6, vs7
  347. MULT_APLHA_PART2 vs32, vs40, vs0, vs1
  348. MULT_APLHA_PART2 vs33, vs41, vs2, vs3
  349. MULT_APLHA_PART2 vs34, vs42, vs4, vs5
  350. MULT_APLHA_PART2 vs35, vs43, vs6, vs7
  /* each C load for rows 2 and 3 reuses an input pair only after the last
     MULT_APLHA_PART2 that reads it */
  351. #ifndef TRMMKERNEL
  352. lxvp vs32, 0(T2)
  353. #endif
  354. MULT_APLHA_PART1 vs36, vs44, vs8, vs9
  355. MULT_APLHA_PART1 vs37, vs45, vs10, vs11
  356. #ifndef TRMMKERNEL
  357. lxvp vs40, 32(T2)
  358. #endif
  359. MULT_APLHA_PART1 vs38, vs46, vs12, vs13
  360. MULT_APLHA_PART1 vs39, vs47, vs14, vs15
  361. #ifndef TRMMKERNEL
  362. lxvp vs34, 0(T3)
  363. #endif
  364. MULT_APLHA_PART2 vs36, vs44, vs8, vs9
  365. MULT_APLHA_PART2 vs37, vs45, vs10, vs11
  366. #ifndef TRMMKERNEL
  367. lxvp vs42, 32(T3)
  368. #endif
  369. MULT_APLHA_PART2 vs38, vs46, vs12, vs13
  370. MULT_APLHA_PART2 vs39, vs47, vs14, vs15
  371. RECONSTRUCT_PAIR1
  372. RECONSTRUCT_PAIR2
  /* rows 0 and 1: interleave doublewords of the result pairs, then either
     add them to the loaded C values or (TRMMKERNEL) write them directly
     into the store registers vs24-vs31 */
  373. #ifndef TRMMKERNEL
  374. /* add */
  375. xxpermdi vs1, vs8, vs0, 2
  376. xxpermdi vs3, vs10, vs2, 2
  377. xxpermdi vs5, vs12, vs4, 2
  378. xxpermdi vs7, vs14, vs6, 2
  379. xxpermdi vs9, vs0, vs8, 2
  380. xxpermdi vs11, vs2, vs10, 2
  381. xvaddsp vs24, vs24, vs3
  382. xvaddsp vs25, vs25, vs1
  383. xxpermdi vs13, vs4, vs12, 2
  384. xxpermdi vs15, vs6, vs14, 2
  385. xvaddsp vs26, vs26, vs7
  386. xvaddsp vs27, vs27, vs5
  387. xvaddsp vs28, vs28, vs11
  388. xvaddsp vs29, vs29, vs9
  389. xvaddsp vs30, vs30, vs15
  390. xvaddsp vs31, vs31, vs13
  391. #else
  392. xxpermdi vs25, vs8, vs0, 2
  393. xxpermdi vs24, vs10, vs2, 2
  394. xxpermdi vs27, vs12, vs4, 2
  395. xxpermdi vs26, vs14, vs6, 2
  396. xxpermdi vs29, vs0, vs8, 2
  397. xxpermdi vs28, vs2, vs10, 2
  398. xxpermdi vs31, vs4, vs12, 2
  399. xxpermdi vs30, vs6, vs14, 2
  400. #endif
  /* store rows 0 and 1, interleaved with the alpha multiplication of the
     second half (results from accumulators 4-7) */
  401. stxvp vs24, 0(CO)
  402. MULT_APLHA_PART1 vs48, vs56, vs0, vs1
  403. MULT_APLHA_PART1 vs49, vs16, vs2, vs3
  404. stxvp vs26, 32(CO)
  405. MULT_APLHA_PART1 vs50, vs58, vs4, vs5
  406. MULT_APLHA_PART1 vs17, vs19, vs6, vs7
  407. stxvp vs28, 0(T1)
  408. MULT_APLHA_PART2 vs48, vs56, vs0, vs1
  409. MULT_APLHA_PART2 vs49, vs16, vs2, vs3
  410. stxvp vs30, 32(T1)
  411. MULT_APLHA_PART2 vs50, vs58, vs4, vs5
  412. MULT_APLHA_PART2 vs17, vs19, vs6, vs7
  413. MULT_APLHA_PART1 vs52, vs60, vs8, vs9
  414. MULT_APLHA_PART1 vs53, vs61, vs10, vs11
  415. MULT_APLHA_PART1 vs54, vs21, vs12, vs13
  416. MULT_APLHA_PART1 vs18, vs20, vs14, vs15
  417. MULT_APLHA_PART2 vs52, vs60, vs8, vs9
  418. MULT_APLHA_PART2 vs53, vs61, vs10, vs11
  419. MULT_APLHA_PART2 vs54, vs21, vs12, vs13
  420. MULT_APLHA_PART2 vs18, vs20, vs14, vs15
  421. RECONSTRUCT_PAIR1
  422. RECONSTRUCT_PAIR2
  /* rows 2 and 3: same interleave-and-add pattern, targeting
     vs32/vs33, vs40/vs41, vs34/vs35 and vs42/vs43 */
  423. #ifndef TRMMKERNEL
  424. /* add */
  425. xxpermdi vs1, vs8, vs0, 2
  426. xxpermdi vs3, vs10, vs2, 2
  427. xxpermdi vs5, vs12, vs4, 2
  428. xxpermdi vs7, vs14, vs6, 2
  429. xxpermdi vs9, vs0, vs8, 2
  430. xxpermdi vs11, vs2, vs10, 2
  431. xvaddsp vs32, vs32, vs3
  432. xvaddsp vs33, vs33, vs1
  433. xxpermdi vs13, vs4, vs12, 2
  434. xxpermdi vs15, vs6, vs14, 2
  435. xvaddsp vs40, vs40, vs7
  436. xvaddsp vs41, vs41, vs5
  437. xvaddsp vs34, vs34, vs11
  438. xvaddsp vs35, vs35, vs9
  439. xvaddsp vs42, vs42, vs15
  440. xvaddsp vs43, vs43, vs13
  441. #else
  442. xxpermdi vs33, vs8, vs0, 2
  443. xxpermdi vs32, vs10, vs2, 2
  444. xxpermdi vs41, vs12, vs4, 2
  445. xxpermdi vs40, vs14, vs6, 2
  446. xxpermdi vs35, vs0, vs8, 2
  447. xxpermdi vs34, vs2, vs10, 2
  448. xxpermdi vs43, vs4, vs12, 2
  449. xxpermdi vs42, vs6, vs14, 2
  450. #endif
  451. stxvp vs32, 0(T2)
  452. stxvp vs40, 32(T2)
  453. stxvp vs34, 0(T3)
  454. stxvp vs42, 32(T3)
  /* step CO past the 64 bytes just written in each row */
  455. addi CO, CO, 64
  456. .endm
  457. /* macros for N=4 and M=4
  458. **********************************************************************************************/
  /* ZERO4x4: clear accumulators 0-3, the four used by the 4x4 kernel. */
  459. .macro ZERO4x4
  460. xxsetaccz 0
  461. xxsetaccz 1
  462. xxsetaccz 2
  463. xxsetaccz 3
  464. .endm
  /* LOAD4x4: load the operands for one 4x4 step from AO and BO at offset 0. */
  465. .macro LOAD4x4
  466. LOAD4x4O 0, 0
  467. .endm
  /* LOAD4x4O: load the operands for one 4x4 step.
     Reads 32 bytes at OffsetB(BO) into vs34/vs35 and 32 bytes at OffsetA(AO)
     into vs32/vs33. Pointers are not advanced. */
  468. .macro LOAD4x4O OffsetA, OffsetB
  469. lxvp vs34, (\OffsetB+0)(BO)
  470. lxvp vs32, (\OffsetA+0)(AO)
  471. .endm
  /* END4x4_NORMAL: run END4x4, advancing both AO and BO by 32 bytes. */
  472. .macro END4x4_NORMAL
  473. END4x4 AO, BO, 32, 32
  474. .endm
  /* END4x4_WITHOUT_ADD: run END4x4 without advancing AO or BO. */
  475. .macro END4x4_WITHOUT_ADD
  476. END4x4 AO, BO, 0, 0
  477. .endm
  /* END4x4: accumulate one 4x4 step from registers that are already loaded.
     AREG/BREG  pointer registers to advance.
     OffsetA/B  byte increments; a value of 0 emits no addi.
     Accumulators 3 and 2 combine vs32/vs33 with vs34; accumulators 1 and 0
     combine vs32/vs33 with vs35. */
  478. .macro END4x4 AREG, BREG, OffsetA, OffsetB
  479. .if \OffsetB != 0
  480. addi \BREG, \BREG, \OffsetB
  481. .endif
  482. .if \OffsetA != 0
  483. addi \AREG, \AREG, \OffsetA
  484. .endif
  485. xvf32gerpp 3, 32, 34
  486. xvf32gerpp 2, 33, 34
  487. xvf32gerpp 1, 32, 35
  488. xvf32gerpp 0, 33, 35
  489. .endm
  /* LOAD4x4_2: load the operands for two 4x4 steps from AO and BO at offset 0. */
  490. .macro LOAD4x4_2
  491. LOAD4x4_2O 0, 0
  492. .endm
  /* LOAD4x4_2O: load the operands for two 4x4 steps (the registers consumed
     by KERNEL4x4_2). Reads 64 bytes at OffsetB(BO) into vs34/vs35 and
     vs38/vs39, and 64 bytes at OffsetA(AO) into vs32/vs33 and vs36/vs37.
     Pointers are not advanced. */
  493. .macro LOAD4x4_2O OffsetA, OffsetB
  494. lxvp vs34, (\OffsetB)(BO)
  495. lxvp vs38, (32+\OffsetB)(BO)
  496. lxvp vs32, (0+\OffsetA)(AO)
  497. lxvp vs36, (32+\OffsetA)(AO)
  498. .endm
  /* END4x4_2: final two-step 4x4 block. Runs KERNEL4x4_2 with Complete=1
     (no reloads) and IsLast=1, so AO and BO each advance by 64 bytes. */
  499. .macro END4x4_2
  500. /* for load2 the offsets are 64 (A) and 64 (B) */
  501. KERNEL4x4_2 AO, BO, 64, 64, 0, 1, 1
  502. .endm
  /* KERNEL4x4_E2: KERNEL4x4_2 on AO/BO with Complete=1, i.e. accumulate the
     two loaded steps without loading operands for a following block. */
  503. .macro KERNEL4x4_E2 OffsetA, OffsetB, Index, IsLast
  504. KERNEL4x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
  505. .endm
  /* KERNEL4x4_L2: KERNEL4x4_2 on AO/BO with Complete=0, i.e. accumulate the
     two loaded steps and load the operands for the next block. */
  506. .macro KERNEL4x4_L2 OffsetA, OffsetB, Index, IsLast
  507. KERNEL4x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
  508. .endm
  /* KERNEL4x4_2: two unrolled 4x4 accumulation steps.
     AREG/BREG  pointer registers for the A and B operands.
     OffsetA/B  byte offsets added to the DISP8 displacement.
     Index      block index scaled by DISP8 for both operands.
     IsLast     1 = advance AREG/BREG at the end of the macro.
     Complete   0 = reload operand registers for the next block,
                1 = do not load anything further.
     The first step consumes vs32-vs35, the second vs36-vs39. */
  509. .macro KERNEL4x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
  510. xvf32gerpp 3, 32, 34
  511. xvf32gerpp 2, 33, 34
  512. xvf32gerpp 1, 32, 35
  513. xvf32gerpp 0, 33, 35
  /* the first-step registers are free now; refill them for the next block */
  514. .if \Complete==0
  515. lxvp vs34, DISP8(\Index, \OffsetB)(\BREG)
  516. lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG)
  517. .endif
  518. xvf32gerpp 3, 36, 38
  519. xvf32gerpp 2, 37, 38
  520. xvf32gerpp 1, 36, 39
  521. xvf32gerpp 0, 37, 39
  /* the second-step registers are free now; refill them for the next block */
  522. .if \Complete==0
  523. lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG)
  524. lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
  525. .endif
  /* pointer advance: by the caller's offsets when Complete==1, otherwise by
     one full two-step block (64 bytes for each operand) */
  526. .if \IsLast==1
  527. .if \Complete==1
  528. addi \BREG, \BREG, DISP8(\Index, \OffsetB)
  529. addi \AREG, \AREG, DISP8(\Index, \OffsetA)
  530. .else
  531. addi \BREG, \BREG, DISP8(\Index, 64)
  532. addi \AREG, \AREG, DISP8(\Index, 64)
  533. .endif
  534. .endif
  535. .endm
  /* KERNEL4x4: one complete 4x4 step. Loads the operands at offset 0, then
     accumulates them while advancing AO and BO by 32 bytes each. */
  536. .macro KERNEL4x4
  537. LOAD4x4
  538. END4x4 AO, BO, 32, 32
  539. .endm
  /* SAVE4x4: write back the 4x4 result tile held in accumulators 0-3.
     Unloads and regroups the accumulators, combines the partial sums,
     multiplies by alpha, rebuilds the output vectors, and stores 32 bytes to
     each of the rows CO, CO+LDC, CO+2*LDC and CO+3*LDC. Unless TRMMKERNEL is
     defined, the existing contents of C are loaded into vs24-vs31 and added.
     CO is advanced by 32 bytes at the end. Uses T1-T4 as row pointers.
     NOTE(review): presumably LDC is the byte stride between rows of the
     tile - confirm against the caller. */
  540. .macro SAVE4x4
  541. SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
  542. SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
  543. SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46
  544. SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47
  /* T4 = 2*LDC, T1 = CO + LDC */
  545. add T4, LDC, LDC
  546. add T1, CO, LDC
  547. #ifndef TRMMKERNEL
  548. lxvp vs24, 0(CO)
  549. #endif
  /* T2 = CO + 2*LDC, T3 = CO + 3*LDC */
  550. add T2, CO, T4
  551. add T3, T1, T4
  552. #ifndef TRMMKERNEL
  553. lxvp vs26, 0(T1)
  554. #endif
  555. #ifndef TRMMKERNEL
  556. lxvp vs28, 0(T2)
  557. #endif
  558. #ifndef TRMMKERNEL
  559. lxvp vs30, 0(T3)
  560. #endif
  /* GROUP2 reuses the scratch registers written by GROUP1, so each group is
     aggregated before the next one is permuted */
  561. GROUP1
  562. AGG_GROUP1
  563. GROUP2
  564. AGG_GROUP2
  565. /*VSINRR, VSINII, VSOUT1, VSOUT2*/
  566. MULTIPLY_GROUP1
  567. MULTIPLY_GROUP2
  568. /* reconstruct r, i pairs*/
  569. RECONSTRUCT_PAIR1
  570. RECONSTRUCT_PAIR2
  /* interleave doublewords of the result pairs, then either add them to the
     loaded C values or (TRMMKERNEL) write them directly into vs24-vs31 */
  571. #ifndef TRMMKERNEL
  572. /* add */
  573. xxpermdi vs1, vs8, vs0, 2
  574. xxpermdi vs3, vs10, vs2, 2
  575. xxpermdi vs9, vs0, vs8, 2
  576. xxpermdi vs11, vs2, vs10, 2
  577. xxpermdi vs5, vs12, vs4, 2
  578. xxpermdi vs7, vs14, vs6, 2
  579. xxpermdi vs13, vs4, vs12, 2
  580. xxpermdi vs15, vs6, vs14, 2
  581. xvaddsp vs24, vs24, vs3
  582. xvaddsp vs25, vs25, vs1
  583. xvaddsp vs26, vs26, vs11
  584. xvaddsp vs27, vs27, vs9
  585. xvaddsp vs28, vs28, vs7
  586. xvaddsp vs29, vs29, vs5
  587. xvaddsp vs30, vs30, vs15
  588. xvaddsp vs31, vs31, vs13
  589. #else
  590. xxpermdi vs25, vs8, vs0, 2
  591. xxpermdi vs24, vs10, vs2, 2
  592. xxpermdi vs27, vs0, vs8, 2
  593. xxpermdi vs26, vs2, vs10, 2
  594. xxpermdi vs29, vs12, vs4, 2
  595. xxpermdi vs28, vs14, vs6, 2
  596. xxpermdi vs31, vs4, vs12, 2
  597. xxpermdi vs30, vs6, vs14, 2
  598. #endif
  599. stxvp vs24, 0(CO)
  600. stxvp vs26, 0(T1)
  601. stxvp vs28, 0(T2)
  602. stxvp vs30, 0(T3)
  /* step CO past the 32 bytes just written in each row */
  603. addi CO, CO, 32
  604. .endm
  605. /* macros for N=4 and M=2
  606. **********************************************************************************************/
  /* ZERO4x2: clear accumulators 0 and 1, the two used by the 4x2 kernel. */
  607. .macro ZERO4x2
  608. xxsetaccz 0
  609. xxsetaccz 1
  610. .endm
  /* LOAD4x2: load the operands for one 4x2 step from AO and BO at offset 0. */
  611. .macro LOAD4x2
  612. LOAD4x2O 0, 0
  613. .endm
  /* LOAD4x2O: load the operands for one 4x2 step.
     Reads 16 bytes at OffsetA(AO) into vs32 and 32 bytes at OffsetB(BO) into
     vs34/vs35. Pointers are not advanced. */
  614. .macro LOAD4x2O OffsetA, OffsetB
  615. lxv vs32, (\OffsetA+0)(AO)
  616. lxvp vs34, (\OffsetB+0)(BO)
  617. .endm
  /* END4x2_NORMAL: run END4x2, advancing AO by 16 bytes and BO by 32 bytes. */
  618. .macro END4x2_NORMAL
  619. END4x2 AO, BO, 16, 32
  620. .endm
  /* END4x2_WITHOUT_ADD: run END4x2 without advancing AO or BO. */
  621. .macro END4x2_WITHOUT_ADD
  622. END4x2 AO, BO, 0, 0
  623. .endm
  /* END4x2: accumulate one 4x2 step from registers that are already loaded.
     AREG/BREG  pointer registers to advance.
     OffsetA/B  byte increments; a value of 0 emits no addi.
     Here the B registers (vs34, vs35) are the first outer-product operand
     and the A register (vs32) is the second; results go to accumulators
     1 and 0. */
  624. .macro END4x2 AREG, BREG, OffsetA, OffsetB
  625. .if \OffsetB != 0
  626. addi \BREG, \BREG, \OffsetB
  627. .endif
  628. .if \OffsetA != 0
  629. addi \AREG, \AREG, \OffsetA
  630. .endif
  631. xvf32gerpp 1, 34, 32
  632. xvf32gerpp 0, 35, 32
  633. .endm
  /* LOAD4x2_2: load the operands for two 4x2 steps from AO and BO at offset 0. */
  634. .macro LOAD4x2_2
  635. LOAD4x2_2O 0, 0
  636. .endm
  /* LOAD4x2_2O: load the operands for two 4x2 steps (the registers consumed
     by KERNEL4x2_2). Reads 32 bytes at OffsetA(AO) into vs32/vs33 and
     64 bytes at OffsetB(BO) into vs34/vs35 and vs36/vs37.
     Pointers are not advanced. */
  637. .macro LOAD4x2_2O OffsetA, OffsetB
  638. lxvp vs32, (\OffsetA)(AO)
  639. lxvp vs34, (0+\OffsetB)(BO)
  640. lxvp vs36, (32+\OffsetB)(BO)
  641. .endm
  /* END4x2_2: final two-step 4x2 block. Runs KERNEL4x2_2 with Complete=1
     (no reloads) and IsLast=1, so AO advances by 32 and BO by 64 bytes. */
  642. .macro END4x2_2
  643. /* for load2 the offsets are 32 (A) and 64 (B) */
  644. KERNEL4x2_2 AO, BO, 32, 64, 0, 1, 1
  645. .endm
  /* KERNEL4x2_E2: KERNEL4x2_2 on AO/BO with Complete=1, i.e. accumulate the
     two loaded steps without loading operands for a following block. */
  646. .macro KERNEL4x2_E2 OffsetA, OffsetB, Index, IsLast
  647. KERNEL4x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
  648. .endm
  /* KERNEL4x2_L2: KERNEL4x2_2 on AO/BO with Complete=0, i.e. accumulate the
     two loaded steps and load the operands for the next block. */
  649. .macro KERNEL4x2_L2 OffsetA, OffsetB, Index, IsLast
  650. KERNEL4x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
  651. .endm
  /* KERNEL4x2_2: two unrolled 4x2 accumulation steps.
     AREG/BREG  pointer registers for the A and B operands.
     OffsetA/B  byte offsets added to the DISP4 (A) / DISP8 (B) displacement.
     Index      block index scaled by DISP4 (A) and DISP8 (B).
     IsLast     1 = advance AREG/BREG at the end of the macro.
     Complete   0 = reload operand registers for the next block,
                1 = do not load anything further.
     The first step uses vs34/vs35 with vs33, the second vs36/vs37 with vs32. */
  652. .macro KERNEL4x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
  653. xvf32gerpp 1, 34, 33
  654. xvf32gerpp 0, 35, 33
  /* vs34/vs35 are free now; refill them for the next block */
  655. .if \Complete==0
  656. lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG)
  657. .endif
  658. xvf32gerpp 1, 36, 32
  659. xvf32gerpp 0, 37, 32
  /* the remaining operand registers are free now; refill them */
  660. .if \Complete==0
  661. lxvp vs32, DISP4(\Index, \OffsetA)(\AREG)
  662. lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG)
  663. .endif
  /* pointer advance: by the caller's offsets when Complete==1, otherwise by
     one full two-step block (32 bytes of A, 64 bytes of B) */
  664. .if \IsLast==1
  665. .if \Complete==1
  666. addi \AREG, \AREG, DISP4(\Index, \OffsetA)
  667. addi \BREG, \BREG, DISP8(\Index, \OffsetB)
  668. .else
  669. addi \AREG, \AREG, DISP4(\Index, 32)
  670. addi \BREG, \BREG, DISP8(\Index, 64)
  671. .endif
  672. .endif
  673. .endm
  /* KERNEL4x2: one complete 4x2 step. Loads the operands at offset 0, then
     accumulates them while advancing AO by 16 and BO by 32 bytes. */
  674. .macro KERNEL4x2
  675. LOAD4x2
  676. END4x2 AO, BO, 16, 32
  677. .endm
  /* SAVE4x2: write back the 4x2 result tile held in accumulators 0 and 1.
     Unloads and regroups the accumulators, combines the partial sums with
     AGGREGATE_REALS_IMAGES_A_PERMUTE, multiplies by alpha, rebuilds the
     output vectors, and stores 16 bytes to each of the rows CO, CO+LDC,
     CO+2*LDC and CO+3*LDC. Unless TRMMKERNEL is defined, the existing
     contents of C are loaded into vs24-vs27 and added.
     CO is advanced by 16 bytes at the end. Uses T1-T4 as row pointers.
     NOTE(review): presumably LDC is the byte stride between rows of the
     tile - confirm against the caller. */
  678. .macro SAVE4x2
  679. SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
  680. SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
  /* T4 = 2*LDC; T1, T2, T3 = CO + LDC, CO + 2*LDC, CO + 3*LDC */
  681. add T4, LDC, LDC
  682. add T1, CO, LDC
  683. add T2, CO, T4
  684. add T3, T1, T4
  685. #ifndef TRMMKERNEL
  686. lxv vs24, 0(CO)
  687. #endif
  688. #ifndef TRMMKERNEL
  689. lxv vs25, 0(T1)
  690. #endif
  691. #ifndef TRMMKERNEL
  692. lxv vs26, 0(T2)
  693. #endif
  694. #ifndef TRMMKERNEL
  695. lxv vs27, 0(T3)
  696. #endif
  697. GROUP1
  /* the 4x2 kernel feeds B as the first outer-product operand (see END4x2),
     hence the _A_PERMUTE variant of the aggregation here */
  698. AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4
  699. AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5
  700. AGGREGATE_REALS_IMAGES_A_PERMUTE vs36, vs8, vs44, vs12
  701. AGGREGATE_REALS_IMAGES_A_PERMUTE vs37, vs9, vs45, vs13
  702. /*VSINRR, VSINII, VSOUT1, VSOUT2*/
  703. MULTIPLY_GROUP1
  704. /* reconstruct r, i pairs*/
  705. RECONSTRUCT_PAIR1
  /* xxpermdi 0 pairs the first doublewords of both sources and xxpermdi 3
     pairs the second doublewords (ISA numbering); the results are added to
     the loaded C values or (TRMMKERNEL) written directly into vs24-vs27 */
  706. #ifndef TRMMKERNEL
  707. /* add */
  708. xxpermdi vs1, vs8, vs0, 0
  709. xxpermdi vs9, vs10, vs2, 0
  710. xxpermdi vs3, vs0, vs8, 3
  711. xxpermdi vs11, vs2, vs10, 3
  712. xvaddsp vs24, vs24, vs1
  713. xvaddsp vs26, vs26, vs9
  714. xvaddsp vs25, vs25, vs3
  715. xvaddsp vs27, vs27, vs11
  716. #else
  717. xxpermdi vs24, vs8, vs0, 0
  718. xxpermdi vs26, vs10, vs2, 0
  719. xxpermdi vs25, vs0, vs8, 3
  720. xxpermdi vs27, vs2, vs10, 3
  721. #endif
  722. stxv vs24, 0(CO)
  723. stxv vs25, 0(T1)
  724. stxv vs26, 0(T2)
  725. stxv vs27, 0(T3)
  /* step CO past the 16 bytes just written in each row */
  726. addi CO, CO, 16
  727. .endm
  /* macros for N=4 and M=1
  729. **********************************************************************************************/
  /* ZERO4x1: clear MMA accumulators 0 and 1. */
  .macro ZERO4x1
      xxsetaccz   0
      xxsetaccz   1
  .endm

  /* LOAD4x1: single-step operand load with zero displacement. */
  .macro LOAD4x1
      LOAD4x1O    0, 0
  .endm

  /* LOAD4x1O OffsetA, OffsetB
     Loads 8 bytes at AO+OffsetA into v0 (the same register as vs32) and
     32 bytes at BO+OffsetB into the pair vs34/vs35. */
  .macro LOAD4x1O OffsetA, OffsetB
      lxsd        v0, (\OffsetA+0)(AO)
      lxvp        vs34, (\OffsetB+0)(BO)
  .endm

  /* END4x1_NORMAL: accumulate and advance AO by 8, BO by 32 bytes. */
  .macro END4x1_NORMAL
      END4x1      AO, BO, 8, 32
  .endm

  /* END4x1_WITHOUT_ADD: accumulate only; pointers are left untouched. */
  .macro END4x1_WITHOUT_ADD
      END4x1      AO, BO, 0, 0
  .endm

  /* END4x1 AREG, BREG, OffsetA, OffsetB
     Advances BREG/AREG by the given non-zero byte offsets (a zero offset
     emits no addi), then issues the two rank-1 updates for one k step. */
  .macro END4x1 AREG, BREG, OffsetA, OffsetB
    .if \OffsetB != 0
      addi        \BREG, \BREG, \OffsetB
    .endif
    .if \OffsetA != 0
      addi        \AREG, \AREG, \OffsetA
    .endif
      xvf32gerpp  0, 35, 32
      xvf32gerpp  1, 34, 32
  .endm
  /* LOAD4x1_2: prime the registers for KERNEL4x1_2, zero displacement. */
  .macro LOAD4x1_2
      LOAD4x1_2O  0, 0
  .endm

  /* LOAD4x1_2O OffsetA, OffsetB
     Loads 16 bytes of A into vs32 and splits them: vs33 receives
     doubleword 0 and vs32 keeps doubleword 1, each paired with a zero
     doubleword taken from vs38 (v6, cleared by vspltisb).
     64 bytes of B go into the pairs vs34/vs35 and vs36/vs37.
     KERNEL4x1_2 relies on vs38 still being zero. */
  .macro LOAD4x1_2O OffsetA, OffsetB
      lxv         vs32, (\OffsetA)(AO)
      vspltisb    v6, 0
      xxpermdi    vs33, vs32, vs38, 0
      xxpermdi    vs32, vs32, vs38, 2
      lxvp        vs34, (0+\OffsetB)(BO)
      lxvp        vs36, (32+\OffsetB)(BO)
  .endm

  /* END4x1_2: final two-step update without reloads; advances AO/BO. */
  .macro END4x1_2
      /* for load2 offset will be 16 and 64 */
      KERNEL4x1_2 AO, BO, 16, 64, 0, 1, 1
  .endm

  /* KERNEL4x1_E2: two-step update on AO/BO with Complete=1. */
  .macro KERNEL4x1_E2 OffsetA, OffsetB, Index, IsLast
      KERNEL4x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
  .endm

  /* KERNEL4x1_L2: two-step update on AO/BO with Complete=0. */
  .macro KERNEL4x1_L2 OffsetA, OffsetB, Index, IsLast
      KERNEL4x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
  .endm

  /* KERNEL4x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
     Two k steps of rank-1 updates into accumulators 0 and 1.
       Complete   0: reload A (re-split through vs38) and B for the next
                  step; 1: no reloads
       IsLast     1: advance AREG/BREG (DISP2/DISP8 are defined outside
                  this chunk) */
  .macro KERNEL4x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
      xvf32gerpp  0, 35, 32
      xvf32gerpp  1, 34, 32
    .if \Complete==0
      lxvp        vs34, DISP8(\Index, 0+\OffsetB)(\BREG)
    .endif
      xvf32gerpp  0, 37, 33
      xvf32gerpp  1, 36, 33
    .if \Complete==0
      lxv         vs32, DISP2(\Index, \OffsetA)(\AREG)
      lxvp        vs36, DISP8(\Index, 32+\OffsetB)(\BREG)
      xxpermdi    vs33, vs32, vs38, 0
      xxpermdi    vs32, vs32, vs38, 2
    .endif
    .if \IsLast==1
      .if \Complete==1
      addi        \AREG, \AREG, DISP2(\Index, \OffsetA)
      addi        \BREG, \BREG, DISP8(\Index, \OffsetB)
      .else
      addi        \AREG, \AREG, DISP2(\Index, 16)
      addi        \BREG, \BREG, DISP8(\Index, 64)
      .endif
    .endif
  .endm

  /* KERNEL4x1: single-step update; advances AO by 8 and BO by 32 bytes. */
  .macro KERNEL4x1
      LOAD4x1
      END4x1      AO, BO, 8, 32
  .endm
  /* SAVE4x1: write back the 4x1 tile from accumulators 0 and 1.
     One 8-byte element is stored per row at CO, CO+LDC, CO+2*LDC and
     CO+3*LDC; CO is then advanced by 8 bytes. Without TRMMKERNEL the
     existing C values are loaded into v4..v7 and added before the store.
     SHUFFLE_ACC, AGGREGATE_REALS_IMAGES_A_PERMUTE, MULT_APLHA_PART1/2,
     permute_mask and save_permute_1 are defined outside this chunk. */
  .macro SAVE4x1
      SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
      SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
      xxpermdi    vs32, vs32, vs36, 1
      xxpermdi    vs40, vs40, vs44, 1
      xxpermdi    vs33, vs33, vs37, 1
      xxpermdi    vs41, vs41, vs45, 1
      /* row pointers: T1 = CO+LDC, T2 = CO+2*LDC, T3 = CO+3*LDC */
      add         T4, LDC, LDC
      add         T1, CO, LDC
      add         T2, CO, T4
      add         T3, T1, T4
#ifndef TRMMKERNEL
      /* v4..v7 alias vs36..vs39; they are free once the xxpermdi above ran */
      lxsd        v4, 0(CO)
      lxsd        v5, 0(T1)
      lxsd        v6, 0(T2)
      lxsd        v7, 0(T3)
#endif
      xxperm      vs0, vs32, permute_mask
      xxperm      vs4, vs40, permute_mask
      xxperm      vs1, vs33, permute_mask
      xxperm      vs5, vs41, permute_mask
      AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4
      AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5
      /* VSINRR, VSINII, VSOUT1, VSOUT2 */
      MULT_APLHA_PART1 vs32, vs40, vs0, vs1
      MULT_APLHA_PART1 vs33, vs41, vs2, vs3
      MULT_APLHA_PART2 vs32, vs40, vs0, vs1
      MULT_APLHA_PART2 vs33, vs41, vs2, vs3
      /* reconstruct r, i pairs */
      xxperm      vs0, vs1, save_permute_1
      xxperm      vs2, vs3, save_permute_1
#ifndef TRMMKERNEL
      /* splat each result doubleword, then add to the loaded C values */
      xxspltd     vs1, vs0, 0
      xxspltd     vs3, vs0, 1
      xxspltd     vs9, vs2, 0
      xxspltd     vs11, vs2, 1
      /*-- v4==vs36 v5==vs37 v6==vs38 v7==vs39 --*/
      xvaddsp     vs36, vs36, vs1
      xvaddsp     vs37, vs37, vs3
      xvaddsp     vs38, vs38, vs9
      xvaddsp     vs39, vs39, vs11
#else
      /*-- v4==vs36 v5==vs37 v6==vs38 v7==vs39 --*/
      xxspltd     vs36, vs0, 0
      xxspltd     vs37, vs0, 1
      xxspltd     vs38, vs2, 0
      xxspltd     vs39, vs2, 1
#endif
      stxsd       v4, 0(CO)
      stxsd       v5, 0(T1)
      stxsd       v6, 0(T2)
      stxsd       v7, 0(T3)
      addi        CO, CO, 8
  .endm
  867. /* macros for N=2 and M=8
  868. **********************************************************************************************/
  /* ZERO2x8: clear MMA accumulators 0..3. */
  .macro ZERO2x8
      xxsetaccz   0
      xxsetaccz   1
      xxsetaccz   2
      xxsetaccz   3
  .endm

  /* LOAD2x8: single-step operand load with zero displacement. */
  .macro LOAD2x8
      LOAD2x8O    0, 0
  .endm

  /* LOAD2x8O OffsetA, OffsetB
     Loads 16 bytes at BO+OffsetB into vs34 and 64 bytes at AO+OffsetA
     into the pairs vs32/vs33 and vs36/vs37. */
  .macro LOAD2x8O OffsetA, OffsetB
      lxv         vs34, (\OffsetB+0)(BO)
      lxvp        vs32, (\OffsetA+0)(AO)
      lxvp        vs36, (\OffsetA+32)(AO)
  .endm

  /* END2x8_NORMAL: accumulate and advance AO by 64, BO by 16 bytes. */
  .macro END2x8_NORMAL
      END2x8      AO, BO, 64, 16
  .endm

  /* END2x8_WITHOUT_ADD: accumulate only; pointers are left untouched. */
  .macro END2x8_WITHOUT_ADD
      END2x8      AO, BO, 0, 0
  .endm

  /* END2x8 AREG, BREG, OffsetA, OffsetB
     Advances BREG/AREG by the given non-zero byte offsets, then issues
     one rank-1 update per accumulator (0..3) against vs34. */
  .macro END2x8 AREG, BREG, OffsetA, OffsetB
    .if \OffsetB != 0
      addi        \BREG, \BREG, \OffsetB
    .endif
    .if \OffsetA != 0
      addi        \AREG, \AREG, \OffsetA
    .endif
      xvf32gerpp  2, 37, 34
      xvf32gerpp  3, 36, 34
      xvf32gerpp  0, 33, 34
      xvf32gerpp  1, 32, 34
  .endm
  /* LOAD2x8_2: prime the registers for KERNEL2x8_2, zero displacement. */
  .macro LOAD2x8_2
      LOAD2x8_2O  0, 0
  .endm

  /* LOAD2x8_2O OffsetA, OffsetB
     Loads 32 bytes of B into vs34/vs35 and 128 bytes of A into the pairs
     vs32/vs33, vs36/vs37, vs38/vs39 and vs40/vs41. */
  .macro LOAD2x8_2O OffsetA, OffsetB
      lxvp        vs34, (\OffsetB)(BO)
      lxvp        vs32, (0+\OffsetA)(AO)
      lxvp        vs36, (32+\OffsetA)(AO)
      lxvp        vs38, (64+\OffsetA)(AO)
      lxvp        vs40, (64+32+\OffsetA)(AO)
  .endm

  /* END2x8_2: final two-step update without reloads; advances AO/BO. */
  .macro END2x8_2
      /* for load2 offset will be 128 and 32 */
      KERNEL2x8_2 AO, BO, 128, 32, 0, 1, 1
  .endm

  /* KERNEL2x8_E2: two-step update on AO/BO with Complete=1. */
  .macro KERNEL2x8_E2 OffsetA, OffsetB, Index, IsLast
      KERNEL2x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
  .endm

  /* KERNEL2x8_L2: two-step update on AO/BO with Complete=0. */
  .macro KERNEL2x8_L2 OffsetA, OffsetB, Index, IsLast
      KERNEL2x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
  .endm

  /* KERNEL2x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
     Two k steps, four rank-1 updates each, into accumulators 0..3.
     The first step pairs vs32..vs37 with vs35, the second pairs
     vs38..vs41 with vs34.
       Complete   0: reload every operand for the next step; 1: no reloads
       IsLast     1: advance BREG/AREG (DISP4/DISP16 are defined outside
                  this chunk) */
  .macro KERNEL2x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
      xvf32gerpp  2, 37, 35
      xvf32gerpp  3, 36, 35
      xvf32gerpp  0, 33, 35
      xvf32gerpp  1, 32, 35
    .if \Complete==0
      lxvp        vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
      lxvp        vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
    .endif
      xvf32gerpp  2, 41, 34
      xvf32gerpp  3, 40, 34
      xvf32gerpp  0, 39, 34
      xvf32gerpp  1, 38, 34
    .if \Complete==0
      lxvp        vs34, DISP4(\Index, \OffsetB)(\BREG)
      lxvp        vs38, DISP16(\Index, 64+\OffsetA)(\AREG)
      lxvp        vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG)
    .endif
    .if \IsLast==1
      .if \Complete==1
      addi        \BREG, \BREG, DISP4(\Index, \OffsetB)
      addi        \AREG, \AREG, DISP16(\Index, \OffsetA)
      .else
      addi        \BREG, \BREG, DISP4(\Index, 32)
      addi        \AREG, \AREG, DISP16(\Index, 128)
      .endif
    .endif
  .endm

  /* KERNEL2x8: single-step update; advances AO by 64 and BO by 16 bytes. */
  .macro KERNEL2x8
      LOAD2x8
      END2x8      AO, BO, 64, 16
  .endm
  /* SAVE2x8: write back the 2x8 tile held in accumulators 0..3.
     Two 64-byte rows are written at CO and CO+LDC; CO is then advanced
     by 64 bytes. Without TRMMKERNEL the existing C values are loaded
     into vs24..vs31 and added before the store.
     SHUFFLE_ACC, GROUP1/2, AGG_GROUP1/2, MULTIPLY_GROUP1/2 and
     RECONSTRUCT_PAIR1/2 are defined outside this chunk. */
  .macro SAVE2x8
      SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
      SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
      SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46
      SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47
      add         T1, CO, LDC             /* T1 = second row of C */
#ifndef TRMMKERNEL
      lxvp        vs24, 0(CO)
      lxvp        vs26, 32(CO)
      lxvp        vs28, 0(T1)
      lxvp        vs30, 32(T1)
#endif
      /* NOTE(review): T4 is not set anywhere in this macro, and T2/T3 are
         not read by any instruction visible here; confirm whether the
         helper macros need them before removing these two adds. */
      add         T2, CO, T4
      add         T3, T1, T4
      GROUP1
      AGG_GROUP1
      GROUP2
      AGG_GROUP2
      /* VSINRR, VSINII, VSOUT1, VSOUT2 */
      MULTIPLY_GROUP1
      MULTIPLY_GROUP2
      /* reconstruct r, i pairs */
      RECONSTRUCT_PAIR1
      RECONSTRUCT_PAIR2
#ifndef TRMMKERNEL
      /* regroup doublewords per row, then add to the loaded C values */
      xxpermdi    vs1, vs8, vs0, 2
      xxpermdi    vs3, vs10, vs2, 2
      xxpermdi    vs5, vs12, vs4, 2
      xxpermdi    vs7, vs14, vs6, 2
      xxpermdi    vs9, vs0, vs8, 2
      xxpermdi    vs11, vs2, vs10, 2
      xvaddsp     vs24, vs24, vs3
      xvaddsp     vs25, vs25, vs1
      xxpermdi    vs13, vs4, vs12, 2
      xxpermdi    vs15, vs6, vs14, 2
      xvaddsp     vs26, vs26, vs7
      xvaddsp     vs27, vs27, vs5
      xvaddsp     vs28, vs28, vs11
      xvaddsp     vs29, vs29, vs9
      xvaddsp     vs30, vs30, vs15
      xvaddsp     vs31, vs31, vs13
#else
      /* TRMM: the regrouped result is stored as-is */
      xxpermdi    vs25, vs8, vs0, 2
      xxpermdi    vs24, vs10, vs2, 2
      xxpermdi    vs27, vs12, vs4, 2
      xxpermdi    vs26, vs14, vs6, 2
      xxpermdi    vs29, vs0, vs8, 2
      xxpermdi    vs28, vs2, vs10, 2
      xxpermdi    vs31, vs4, vs12, 2
      xxpermdi    vs30, vs6, vs14, 2
#endif
      stxvp       vs24, 0(CO)
      stxvp       vs26, 32(CO)
      stxvp       vs28, 0(T1)
      stxvp       vs30, 32(T1)
      addi        CO, CO, 64
  .endm
  1017. /* macros for N=2 and M=4
  1018. **********************************************************************************************/
  /* ZERO2x4: clear MMA accumulators 0 and 1. */
  .macro ZERO2x4
      xxsetaccz   0
      xxsetaccz   1
  .endm

  /* LOAD2x4: single-step operand load with zero displacement. */
  .macro LOAD2x4
      LOAD2x4O    0, 0
  .endm

  /* LOAD2x4O OffsetA, OffsetB
     Loads 16 bytes at BO+OffsetB into vs34 and 32 bytes at AO+OffsetA
     into the pair vs32/vs33. */
  .macro LOAD2x4O OffsetA, OffsetB
      lxv         vs34, (\OffsetB+0)(BO)
      lxvp        vs32, (\OffsetA+0)(AO)
  .endm

  /* END2x4_NORMAL: accumulate and advance AO by 32, BO by 16 bytes. */
  .macro END2x4_NORMAL
      END2x4      AO, BO, 32, 16
  .endm

  /* END2x4_WITHOUT_ADD: accumulate only; pointers are left untouched. */
  .macro END2x4_WITHOUT_ADD
      END2x4      AO, BO, 0, 0
  .endm

  /* END2x4 AREG, BREG, OffsetA, OffsetB
     Advances BREG/AREG by the given non-zero byte offsets, then issues
     the two rank-1 updates for one k step. */
  .macro END2x4 AREG, BREG, OffsetA, OffsetB
    .if \OffsetB != 0
      addi        \BREG, \BREG, \OffsetB
    .endif
    .if \OffsetA != 0
      addi        \AREG, \AREG, \OffsetA
    .endif
      xvf32gerpp  0, 33, 34
      xvf32gerpp  1, 32, 34
  .endm
  /* LOAD2x4_2: prime the registers for KERNEL2x4_2, zero displacement. */
  .macro LOAD2x4_2
      LOAD2x4_2O  0, 0
  .endm

  /* LOAD2x4_2O OffsetA, OffsetB
     Loads 32 bytes of B into vs34/vs35 and 64 bytes of A into the pairs
     vs32/vs33 and vs36/vs37. */
  .macro LOAD2x4_2O OffsetA, OffsetB
      lxvp        vs34, (\OffsetB)(BO)
      lxvp        vs32, (0+\OffsetA)(AO)
      lxvp        vs36, (32+\OffsetA)(AO)
  .endm

  /* END2x4_2: final two-step update without reloads; advances AO/BO. */
  .macro END2x4_2
      /* for load2 offset will be 64 and 32 */
      KERNEL2x4_2 AO, BO, 64, 32, 0, 1, 1
  .endm

  /* KERNEL2x4_E2: two-step update on AO/BO with Complete=1. */
  .macro KERNEL2x4_E2 OffsetA, OffsetB, Index, IsLast
      KERNEL2x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
  .endm

  /* KERNEL2x4_L2: two-step update on AO/BO with Complete=0. */
  .macro KERNEL2x4_L2 OffsetA, OffsetB, Index, IsLast
      KERNEL2x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
  .endm

  /* KERNEL2x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
     Two k steps of rank-1 updates into accumulators 0 and 1.
       Complete   0: reload vs32..vs37 for the next step; 1: no reloads
       IsLast     1: advance BREG/AREG (DISP4/DISP8 are defined outside
                  this chunk) */
  .macro KERNEL2x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
      xvf32gerpp  0, 33, 35
      xvf32gerpp  1, 32, 35
    .if \Complete==0
      lxvp        vs32, DISP8(\Index, 0+\OffsetA)(\AREG)
    .endif
      xvf32gerpp  0, 37, 34
      xvf32gerpp  1, 36, 34
    .if \Complete==0
      lxvp        vs34, DISP4(\Index, \OffsetB)(\BREG)
      lxvp        vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
    .endif
    .if \IsLast==1
      .if \Complete==1
      addi        \BREG, \BREG, DISP4(\Index, \OffsetB)
      addi        \AREG, \AREG, DISP8(\Index, \OffsetA)
      .else
      addi        \BREG, \BREG, DISP4(\Index, 32)
      addi        \AREG, \AREG, DISP8(\Index, 64)
      .endif
    .endif
  .endm

  /* KERNEL2x4: single-step update; advances AO by 32 and BO by 16 bytes. */
  .macro KERNEL2x4
      LOAD2x4
      END2x4      AO, BO, 32, 16
  .endm
  /* SAVE2x4: write back the 2x4 tile held in accumulators 0 and 1.
     Two 32-byte rows are written at CO and CO+LDC; CO is then advanced
     by 32 bytes. Without TRMMKERNEL the existing C values are loaded
     into vs24..vs27 and added before the store.
     SHUFFLE_ACC, GROUP1, AGG_GROUP1, MULTIPLY_GROUP1 and
     RECONSTRUCT_PAIR1 are defined outside this chunk. */
  .macro SAVE2x4
      SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
      SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
      add         T1, CO, LDC             /* T1 = second row of C */
#ifndef TRMMKERNEL
      lxvp        vs24, 0(CO)
      lxvp        vs26, 0(T1)
#endif
      GROUP1
      AGG_GROUP1
      /* VSINRR, VSINII, VSOUT1, VSOUT2 */
      MULTIPLY_GROUP1
      /* reconstruct r, i pairs */
      RECONSTRUCT_PAIR1
#ifndef TRMMKERNEL
      /* regroup doublewords per row, then add to the loaded C values */
      xxpermdi    vs1, vs8, vs0, 2
      xxpermdi    vs3, vs10, vs2, 2
      xxpermdi    vs9, vs0, vs8, 2
      xxpermdi    vs11, vs2, vs10, 2
      xvaddsp     vs24, vs24, vs3
      xvaddsp     vs25, vs25, vs1
      xvaddsp     vs26, vs26, vs11
      xvaddsp     vs27, vs27, vs9
#else
      /* TRMM: the regrouped result is stored as-is */
      xxpermdi    vs25, vs8, vs0, 2
      xxpermdi    vs24, vs10, vs2, 2
      xxpermdi    vs27, vs0, vs8, 2
      xxpermdi    vs26, vs2, vs10, 2
#endif
      stxvp       vs24, 0(CO)
      stxvp       vs26, 0(T1)
      addi        CO, CO, 32
  .endm
  1126. /* macros for N=2 and M=2
  1127. **********************************************************************************************/
  /* ZERO2x2: clear MMA accumulator 0. */
  .macro ZERO2x2
      xxsetaccz   0
  .endm

  /* LOAD2x2: single-step operand load with zero displacement. */
  .macro LOAD2x2
      LOAD2x2O    0, 0
  .endm

  /* LOAD2x2O OffsetA, OffsetB
     Loads 16 bytes at AO+OffsetA into vs32 and 16 bytes at BO+OffsetB
     into vs34. */
  .macro LOAD2x2O OffsetA, OffsetB
      lxv         vs32, (\OffsetA+0)(AO)
      lxv         vs34, (\OffsetB+0)(BO)
  .endm

  /* END2x2_NORMAL: accumulate and advance AO and BO by 16 bytes each. */
  .macro END2x2_NORMAL
      END2x2      AO, BO, 16, 16
  .endm

  /* END2x2_WITHOUT_ADD: accumulate only; pointers are left untouched. */
  .macro END2x2_WITHOUT_ADD
      END2x2      AO, BO, 0, 0
  .endm

  /* END2x2 AREG, BREG, OffsetA, OffsetB
     Advances BREG/AREG by the given non-zero byte offsets, then issues
     the single rank-1 update for one k step. */
  .macro END2x2 AREG, BREG, OffsetA, OffsetB
    .if \OffsetB != 0
      addi        \BREG, \BREG, \OffsetB
    .endif
    .if \OffsetA != 0
      addi        \AREG, \AREG, \OffsetA
    .endif
      xvf32gerpp  0, 34, 32
  .endm
  /* LOAD2x2_2: prime the registers for KERNEL2x2_2, zero displacement. */
  .macro LOAD2x2_2
      LOAD2x2_2O  0, 0
  .endm

  /* LOAD2x2_2O OffsetA, OffsetB
     Loads 32 bytes of A into vs32/vs33 and 32 bytes of B into vs34/vs35. */
  .macro LOAD2x2_2O OffsetA, OffsetB
      lxvp        vs32, (\OffsetA)(AO)
      lxvp        vs34, (0+\OffsetB)(BO)
  .endm

  /* END2x2_2: final two-step update without reloads; advances AO/BO. */
  .macro END2x2_2
      /* for load2 offset will be 32 and 32 */
      KERNEL2x2_2 AO, BO, 32, 32, 0, 1, 1
  .endm

  /* KERNEL2x2_E2: two-step update on AO/BO with Complete=1. */
  .macro KERNEL2x2_E2 OffsetA, OffsetB, Index, IsLast
      KERNEL2x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
  .endm

  /* KERNEL2x2_L2: two-step update on AO/BO with Complete=0. */
  .macro KERNEL2x2_L2 OffsetA, OffsetB, Index, IsLast
      KERNEL2x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
  .endm
  /* KERNEL2x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
     Two k steps of rank-1 updates into accumulator 0.
       AREG/BREG  registers holding the A and B panel pointers
       OffsetA/B  byte displacements handed to DISP4 (defined outside
                  this chunk)
       Index      unroll index handed to DISP4
       IsLast     1: bump AREG/BREG after the updates
       Complete   0: reload vs32..vs35 for the next step; 1: no reloads
     The B reload is addressed with OffsetB. It previously used OffsetA,
     which only worked because callers pass equal offsets for this tile
     (END2x2_2 passes 32, 32). */
  .macro KERNEL2x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
      xvf32gerpp  0, 34, 32
      xvf32gerpp  0, 35, 33
    .if \Complete==0
      lxvp        vs32, DISP4(\Index, \OffsetA)(\AREG)
      lxvp        vs34, DISP4(\Index, \OffsetB)(\BREG)
    .endif
    .if \IsLast==1
      .if \Complete==1
      addi        \AREG, \AREG, DISP4(\Index, \OffsetA)
      addi        \BREG, \BREG, DISP4(\Index, \OffsetB)
      .else
      addi        \AREG, \AREG, DISP4(\Index, 32)
      addi        \BREG, \BREG, DISP4(\Index, 32)
      .endif
    .endif
  .endm
  /* KERNEL2x2: single-step update; LOAD2x2 followed by END2x2, which
     advances AO and BO by 16 bytes each. */
  .macro KERNEL2x2
      LOAD2x2
      END2x2      AO, BO, 16, 16
  .endm
  /* SAVE2x2: write back the 2x2 tile held in accumulator 0.
     Two 16-byte rows are written at CO and CO+LDC; CO is then advanced
     by 16 bytes. Without TRMMKERNEL the existing C values are loaded
     into vs24/vs26 and added before the store.
     SHUFFLE_ACC, AGGREGATE_REALS_IMAGES_A_PERMUTE, MULT_APLHA_PART1/2,
     permute_mask and save_permute_1 are defined outside this chunk. */
  .macro SAVE2x2
      SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
      add         T1, CO, LDC             /* T1 = second row of C */
#ifndef TRMMKERNEL
      lxv         vs24, 0(CO)
      lxv         vs26, 0(T1)
#endif
      xxperm      vs0, vs32, permute_mask
      xxperm      vs4, vs40, permute_mask
      xxperm      vs8, vs36, permute_mask
      xxperm      vs12, vs44, permute_mask
      AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4
      AGGREGATE_REALS_IMAGES_A_PERMUTE vs36, vs8, vs44, vs12
      /* VSINRR, VSINII, VSOUT1, VSOUT2 */
      MULT_APLHA_PART1 vs32, vs40, vs0, vs1
      MULT_APLHA_PART1 vs36, vs44, vs8, vs9
      MULT_APLHA_PART2 vs32, vs40, vs0, vs1
      MULT_APLHA_PART2 vs36, vs44, vs8, vs9
      /* reconstruct r, i pairs */
      xxperm      vs0, vs1, save_permute_1
      xxperm      vs8, vs9, save_permute_1
#ifndef TRMMKERNEL
      /* regroup doublewords per row, then add to the loaded C values */
      xxpermdi    vs1, vs8, vs0, 0
      xxpermdi    vs9, vs0, vs8, 3
      xvaddsp     vs24, vs24, vs1
      xvaddsp     vs26, vs26, vs9
#else
      /* TRMM: the regrouped result is stored as-is */
      xxpermdi    vs24, vs8, vs0, 0
      xxpermdi    vs26, vs0, vs8, 3
#endif
      stxv        vs24, 0(CO)
      stxv        vs26, 0(T1)
      addi        CO, CO, 16
  .endm
  1228. /* macros for N=2 and M=1
  1229. **********************************************************************************************/
  /* ZERO2x1: clear the two VSX accumulators vs32 and vs40. This tile
     uses plain vector FMAs, not MMA accumulators. */
  .macro ZERO2x1
      xxlxor      vs32, vs32, vs32
      xxlxor      vs40, vs40, vs40
  .endm

  /* LOAD2x1: single-step operand load with zero displacement. */
  .macro LOAD2x1
      LOAD2x1O    0, 0
  .endm

  /* LOAD2x1O OffsetA, OffsetB
     Loads 8 bytes of A into v4 (the same register as vs36) and 16 bytes
     of B into vs0. vs24 gets doubleword 0 of vs36 splatted into both
     halves; vs26 is vs24 run through permute_mask (defined outside this
     chunk). */
  .macro LOAD2x1O OffsetA, OffsetB
      lxsd        v4, (\OffsetA+0)(AO)
      lxv         vs0, (\OffsetB+0)(BO)
      xxspltd     vs24, vs36, 0
      xxperm      vs26, vs24, permute_mask
  .endm

  /* END2x1_NORMAL: accumulate and advance AO by 8, BO by 16 bytes. */
  .macro END2x1_NORMAL
      END2x1      AO, BO, 8, 16
  .endm

  /* END2x1_WITHOUT_ADD: accumulate only; pointers are left untouched. */
  .macro END2x1_WITHOUT_ADD
      END2x1      AO, BO, 0, 0
  .endm

  /* END2x1 AREG, BREG, OffsetA, OffsetB
     Advances BREG/AREG by the given non-zero byte offsets, then does
     vs32 += vs0*vs24 and vs40 += vs0*vs26. */
  .macro END2x1 AREG, BREG, OffsetA, OffsetB
    .if \OffsetB != 0
      addi        \BREG, \BREG, \OffsetB
    .endif
    .if \OffsetA != 0
      addi        \AREG, \AREG, \OffsetA
    .endif
      xvmaddasp   vs32, vs0, vs24
      xvmaddasp   vs40, vs0, vs26
  .endm
  /* LOAD2x1_2: prime the registers for KERNEL2x1_2, zero displacement. */
  .macro LOAD2x1_2
      LOAD2x1_2O  0, 0
  .endm

  /* LOAD2x1_2O OffsetA, OffsetB
     Loads 16 bytes of A into vs27 and 32 bytes of B into vs4/vs5.
     vs8 and vs24 are doubleword 1 and doubleword 0 of vs27 splatted;
     vs10 and vs26 are those values run through permute_mask. */
  .macro LOAD2x1_2O OffsetA, OffsetB
      lxv         vs27, (\OffsetA)(AO)
      lxvp        vs4, (0+\OffsetB)(BO)
      xxspltd     vs8, vs27, 1
      xxspltd     vs24, vs27, 0
      xxperm      vs10, vs8, permute_mask
      xxperm      vs26, vs24, permute_mask
  .endm

  /* END2x1_2: final two-step update without reloads; advances AO/BO. */
  .macro END2x1_2
      /* for load2 offset will be 16 and 32 */
      KERNEL2x1_2 AO, BO, 16, 32, 0, 1, 1
  .endm

  /* KERNEL2x1_E2: two-step update on AO/BO with Complete=1. */
  .macro KERNEL2x1_E2 OffsetA, OffsetB, Index, IsLast
      KERNEL2x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
  .endm

  /* KERNEL2x1_L2: two-step update on AO/BO with Complete=0. */
  .macro KERNEL2x1_L2 OffsetA, OffsetB, Index, IsLast
      KERNEL2x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
  .endm

  /* KERNEL2x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
     Two k steps of vector FMAs into vs32 and vs40.
       Complete   0: reload A (vs27) and B (vs4/vs5) and rebuild the
                  splat/permuted operands; 1: no reloads
       IsLast     1: advance AREG/BREG (DISP2/DISP4 are defined outside
                  this chunk)
     Each operand is rebuilt only after the FMAs that read its previous
     value. */
  .macro KERNEL2x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
      xvmaddasp   vs32, vs5, vs8
      xvmaddasp   vs40, vs5, vs10
    .if \Complete==0
      lxv         vs27, DISP2(\Index, \OffsetA)(\AREG)
      xxspltd     vs8, vs27, 1
      xxperm      vs10, vs8, permute_mask
    .endif
      xvmaddasp   vs32, vs4, vs24
      xvmaddasp   vs40, vs4, vs26
    .if \Complete==0
      xxspltd     vs24, vs27, 0
      xxperm      vs26, vs24, permute_mask
      lxvp        vs4, DISP4(\Index, 0+\OffsetB)(\BREG)
    .endif
    .if \IsLast==1
      .if \Complete==1
      addi        \AREG, \AREG, DISP2(\Index, \OffsetA)
      addi        \BREG, \BREG, DISP4(\Index, \OffsetB)
      .else
      addi        \AREG, \AREG, DISP2(\Index, 16)
      addi        \BREG, \BREG, DISP4(\Index, 32)
      .endif
    .endif
  .endm

  /* KERNEL2x1: single-step update; advances AO by 8 and BO by 16 bytes. */
  .macro KERNEL2x1
      LOAD2x1
      END2x1      AO, BO, 8, 16
  .endm
  /* SAVE2x1: write back the 2x1 tile accumulated in vs32/vs40.
     One 8-byte element is stored at CO and one at CO+LDC; CO is then
     advanced by 8 bytes. Without TRMMKERNEL the existing C values are
     loaded into v4/v5 and added before the store.
     AGGREGATE_REALS_IMAGES_A_PERMUTE, MULT_APLHA_PART1/2, permute_mask
     and save_permute_1 are defined outside this chunk. */
  .macro SAVE2x1
      add         T1, CO, LDC             /* T1 = second row of C */
#ifndef TRMMKERNEL
      lxsd        v4, 0(CO)
      lxsd        v5, 0(T1)
#endif
      xxperm      vs0, vs32, permute_mask
      xxperm      vs4, vs40, permute_mask
      AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4
      /* NOTE(review): nothing visible in the 2x1 macros writes vs33, vs1,
         vs41 or vs5 for this tile, and nothing below reads them; this
         call looks like a leftover from a wider tile. Confirm before
         removing. */
      AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5
      /* VSINRR, VSINII, VSOUT1, VSOUT2 */
      MULT_APLHA_PART1 vs32, vs40, vs0, vs1
      MULT_APLHA_PART2 vs32, vs40, vs0, vs1
      /* reconstruct r, i pairs */
      xxperm      vs0, vs1, save_permute_1
#ifndef TRMMKERNEL
      /* splat each result doubleword, then add to the loaded C values */
      xxspltd     vs1, vs0, 0
      xxspltd     vs3, vs0, 1
      /*-- v4==vs36 v5==vs37 --*/
      xvaddsp     vs36, vs36, vs1
      xvaddsp     vs37, vs37, vs3
#else
      /*-- v4==vs36 v5==vs37 --*/
      xxspltd     vs36, vs0, 0
      xxspltd     vs37, vs0, 1
#endif
      stxsd       v4, 0(CO)
      stxsd       v5, 0(T1)
      addi        CO, CO, 8
  .endm
  1346. /* macros for N=1 and M=8
  1347. **********************************************************************************************/
  /* ZERO1x8: clear MMA accumulators 0..3. */
  .macro ZERO1x8
      xxsetaccz   0
      xxsetaccz   1
      xxsetaccz   2
      xxsetaccz   3
  .endm

  /* LOAD1x8: single-step operand load with zero displacement. */
  .macro LOAD1x8
      LOAD1x8O    0, 0
  .endm

  /* LOAD1x8O OffsetA, OffsetB
     Loads 8 bytes at BO+OffsetB into v2 (the same register as vs34) and
     64 bytes at AO+OffsetA into the pairs vs32/vs33 and vs36/vs37. */
  .macro LOAD1x8O OffsetA, OffsetB
      lxsd        v2, (\OffsetB+0)(BO)
      lxvp        vs32, (\OffsetA+0)(AO)
      lxvp        vs36, (\OffsetA+32)(AO)
  .endm

  /* END1x8_NORMAL: accumulate and advance AO by 64, BO by 8 bytes. */
  .macro END1x8_NORMAL
      END1x8      AO, BO, 64, 8
  .endm

  /* END1x8_WITHOUT_ADD: accumulate only; pointers are left untouched. */
  .macro END1x8_WITHOUT_ADD
      END1x8      AO, BO, 0, 0
  .endm

  /* END1x8 AREG, BREG, OffsetA, OffsetB
     Advances BREG/AREG by the given non-zero byte offsets, then issues
     one rank-1 update per accumulator (0..3) with vs34 as first operand. */
  .macro END1x8 AREG, BREG, OffsetA, OffsetB
    .if \OffsetB != 0
      addi        \BREG, \BREG, \OffsetB
    .endif
    .if \OffsetA != 0
      addi        \AREG, \AREG, \OffsetA
    .endif
      xvf32gerpp  0, 34, 33
      xvf32gerpp  1, 34, 32
      xvf32gerpp  2, 34, 37
      xvf32gerpp  3, 34, 36
  .endm
  /* LOAD1x8_2: prime the registers for KERNEL1x8_2, zero displacement. */
  .macro LOAD1x8_2
      LOAD1x8_2O  0, 0
  .endm

  /* LOAD1x8_2O OffsetA, OffsetB
     Loads 16 bytes of B into vs34 and splits them: vs35 receives
     doubleword 0 and vs34 keeps doubleword 1, each paired with a zero
     doubleword taken from vs42 (v10, cleared by vspltisb).
     128 bytes of A go into the pairs vs32/33, vs36/37, vs38/39, vs40/41.
     KERNEL1x8_2 relies on vs42 still being zero. */
  .macro LOAD1x8_2O OffsetA, OffsetB
      lxv         vs34, (\OffsetB)(BO)
      lxvp        vs32, (0+\OffsetA)(AO)
      lxvp        vs36, (32+\OffsetA)(AO)
      vspltisb    v10, 0
      xxpermdi    vs35, vs34, vs42, 0
      xxpermdi    vs34, vs34, vs42, 2
      lxvp        vs38, (64+\OffsetA)(AO)
      lxvp        vs40, (64+32+\OffsetA)(AO)
  .endm

  /* END1x8_2: final two-step update without reloads; advances AO/BO. */
  .macro END1x8_2
      /* for load2 offset will be 128 and 16 */
      KERNEL1x8_2 AO, BO, 128, 16, 0, 1, 1
  .endm

  /* KERNEL1x8_E2: two-step update on AO/BO with Complete=1. */
  .macro KERNEL1x8_E2 OffsetA, OffsetB, Index, IsLast
      KERNEL1x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
  .endm

  /* KERNEL1x8_L2: two-step update on AO/BO with Complete=0. */
  .macro KERNEL1x8_L2 OffsetA, OffsetB, Index, IsLast
      KERNEL1x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
  .endm

  /* KERNEL1x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
     Two k steps, four rank-1 updates each, into accumulators 0..3.
     The first step uses vs34 with vs32..vs37, the second uses vs35 with
     vs38..vs41.
       Complete   0: reload each operand right after its last use;
                  1: no reloads
       IsLast     1: advance BREG/AREG (DISP2/DISP16 are defined outside
                  this chunk) */
  .macro KERNEL1x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
      xvf32gerpp  0, 34, 33
      xvf32gerpp  1, 34, 32
    .if \Complete==0
      lxvp        vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
    .endif
      xvf32gerpp  2, 34, 37
      xvf32gerpp  3, 34, 36
    .if \Complete==0
      lxvp        vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
    .endif
      xvf32gerpp  0, 35, 39
      xvf32gerpp  1, 35, 38
    .if \Complete==0
      lxvp        vs38, DISP16(\Index, 64+\OffsetA)(\AREG)
    .endif
      xvf32gerpp  2, 35, 41
      xvf32gerpp  3, 35, 40
    .if \Complete==0
      lxv         vs34, DISP2(\Index, \OffsetB)(\BREG)
      xxpermdi    vs35, vs34, vs42, 0
      xxpermdi    vs34, vs34, vs42, 2
      lxvp        vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG)
    .endif
    .if \IsLast==1
      .if \Complete==1
      addi        \BREG, \BREG, DISP2(\Index, \OffsetB)
      addi        \AREG, \AREG, DISP16(\Index, \OffsetA)
      .else
      addi        \BREG, \BREG, DISP2(\Index, 16)
      addi        \AREG, \AREG, DISP16(\Index, 128)
      .endif
    .endif
  .endm

  /* KERNEL1x8: single-step update; advances AO by 64 and BO by 8 bytes. */
  .macro KERNEL1x8
      LOAD1x8
      END1x8      AO, BO, 64, 8
  .endm
  /* SAVE1x8: write back the 1x8 tile held in accumulators 0..3.
     64 bytes are written at CO, which is then advanced by 64 bytes.
     Without TRMMKERNEL the existing C values are loaded into vs24..vs27,
     added, and stored as pairs; with TRMMKERNEL the scaled result is
     stored directly.
     SHUFFLE_ACC, AGGREGATE_REALS_IMAGES, MULT_APLHA_PART1/2, permute_mask
     and save_permute_1 are defined outside this chunk. */
  .macro SAVE1x8
      SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
      SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
      SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46
      SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47
      /* merge doubleword 0 of each register pair into one vector */
      xxpermdi    vs32, vs32, vs36, 0
      xxpermdi    vs33, vs33, vs37, 0
      xxpermdi    vs34, vs34, vs38, 0
      xxpermdi    vs35, vs35, vs39, 0
      xxpermdi    vs40, vs40, vs44, 0
      xxperm      vs40, vs40, permute_mask
      xxpermdi    vs41, vs41, vs45, 0
      xxperm      vs41, vs41, permute_mask
      xxpermdi    vs42, vs42, vs46, 0
      xxperm      vs42, vs42, permute_mask
      xxpermdi    vs43, vs43, vs47, 0
      xxperm      vs43, vs43, permute_mask
#ifndef TRMMKERNEL
      lxvp        vs24, 0(CO)
#endif
      xxperm      vs0, vs32, permute_mask
      xxperm      vs4, vs40, permute_mask
#ifndef TRMMKERNEL
      lxvp        vs26, 32(CO)
#endif
      xxperm      vs1, vs33, permute_mask
      xxperm      vs5, vs41, permute_mask
      xxperm      vs2, vs34, permute_mask
      xxperm      vs6, vs42, permute_mask
      xxperm      vs3, vs35, permute_mask
      xxperm      vs7, vs43, permute_mask
      AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4
      AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5
      AGGREGATE_REALS_IMAGES vs34, vs2, vs42, vs6
      AGGREGATE_REALS_IMAGES vs35, vs3, vs43, vs7
      /* inner reverse save_permute and store vs28 */
      xxpermdi    vs28, save_permute_1, save_permute_1, 2
      /* VSINRR, VSINII, VSOUT1, VSOUT2 */
      MULT_APLHA_PART1 vs32, vs40, vs0, vs1
      MULT_APLHA_PART1 vs33, vs41, vs2, vs3
      MULT_APLHA_PART1 vs34, vs42, vs4, vs5
      MULT_APLHA_PART1 vs35, vs43, vs6, vs7
      MULT_APLHA_PART2 vs32, vs40, vs0, vs1
      MULT_APLHA_PART2 vs33, vs41, vs2, vs3
      MULT_APLHA_PART2 vs34, vs42, vs4, vs5
      MULT_APLHA_PART2 vs35, vs43, vs6, vs7
      /* reconstruct r, i pairs */
      xxperm      vs0, vs1, vs28
      xxperm      vs2, vs3, vs28
      xxperm      vs4, vs5, vs28
      xxperm      vs6, vs7, vs28
#ifndef TRMMKERNEL
      /* add to the loaded C values and store as register pairs */
      xvaddsp     vs24, vs24, vs2
      xvaddsp     vs25, vs25, vs0
      xvaddsp     vs26, vs26, vs6
      xvaddsp     vs27, vs27, vs4
      stxvp       vs24, 0(CO)
      stxvp       vs26, 32(CO)
#else
      /* TRMM: store the four result vectors directly */
      stxv        vs0, 0(CO)
      stxv        vs2, 16(CO)
      stxv        vs4, 32(CO)
      stxv        vs6, 48(CO)
#endif
      addi        CO, CO, 64
  .endm
  1509. /* macros for N=1 and M=4
  1510. **********************************************************************************************/
  /* ZERO1x4: clear MMA accumulators 0 and 1. */
  .macro ZERO1x4
      xxsetaccz   0
      xxsetaccz   1
  .endm

  /* LOAD1x4: single-step operand load with zero displacement. */
  .macro LOAD1x4
      LOAD1x4O    0, 0
  .endm

  /* LOAD1x4O OffsetA, OffsetB
     Loads 8 bytes at BO+OffsetB into v2 (the same register as vs34) and
     32 bytes at AO+OffsetA into the pair vs32/vs33. */
  .macro LOAD1x4O OffsetA, OffsetB
      lxsd        v2, (\OffsetB+0)(BO)
      lxvp        vs32, (\OffsetA+0)(AO)
  .endm

  /* END1x4_NORMAL: accumulate and advance AO by 32, BO by 8 bytes. */
  .macro END1x4_NORMAL
      END1x4      AO, BO, 32, 8
  .endm

  /* END1x4_WITHOUT_ADD: accumulate only; pointers are left untouched. */
  .macro END1x4_WITHOUT_ADD
      END1x4      AO, BO, 0, 0
  .endm

  /* END1x4 AREG, BREG, OffsetA, OffsetB
     Advances BREG/AREG by the given non-zero byte offsets, then issues
     the two rank-1 updates for one k step. */
  .macro END1x4 AREG, BREG, OffsetA, OffsetB
    .if \OffsetB != 0
      addi        \BREG, \BREG, \OffsetB
    .endif
    .if \OffsetA != 0
      addi        \AREG, \AREG, \OffsetA
    .endif
      xvf32gerpp  0, 34, 33
      xvf32gerpp  1, 34, 32
  .endm
  /* LOAD1x4_2: prime the registers for KERNEL1x4_2, zero displacement. */
  .macro LOAD1x4_2
      LOAD1x4_2O  0, 0
  .endm

  /* LOAD1x4_2O OffsetA, OffsetB
     Loads 16 bytes of B into vs34 and splits them: vs35 receives
     doubleword 0 and vs34 keeps doubleword 1, each paired with a zero
     doubleword taken from vs38 (v6, cleared by vspltisb).
     64 bytes of A go into the pairs vs32/vs33 and vs36/vs37.
     KERNEL1x4_2 relies on vs38 still being zero. */
  .macro LOAD1x4_2O OffsetA, OffsetB
      lxv         vs34, (\OffsetB)(BO)
      lxvp        vs32, (0+\OffsetA)(AO)
      vspltisb    v6, 0
      xxpermdi    vs35, vs34, vs38, 0
      xxpermdi    vs34, vs34, vs38, 2
      lxvp        vs36, (32+\OffsetA)(AO)
  .endm

  /* END1x4_2: final two-step update without reloads; advances AO/BO. */
  .macro END1x4_2
      /* for load2 offset will be 64 and 16 */
      KERNEL1x4_2 AO, BO, 64, 16, 0, 1, 1
  .endm

  /* KERNEL1x4_E2: two-step update on AO/BO with Complete=1. */
  .macro KERNEL1x4_E2 OffsetA, OffsetB, Index, IsLast
      KERNEL1x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
  .endm

  /* KERNEL1x4_L2: two-step update on AO/BO with Complete=0. */
  .macro KERNEL1x4_L2 OffsetA, OffsetB, Index, IsLast
      KERNEL1x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
  .endm

  /* KERNEL1x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
     Two k steps of rank-1 updates into accumulators 0 and 1.
       Complete   0: reload A and B (B re-split through vs38) for the
                  next step; 1: no reloads
       IsLast     1: advance BREG/AREG (DISP2/DISP8 are defined outside
                  this chunk) */
  .macro KERNEL1x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
      xvf32gerpp  0, 34, 33
      xvf32gerpp  1, 34, 32
    .if \Complete==0
      lxvp        vs32, DISP8(\Index, 0+\OffsetA)(\AREG)
    .endif
      xvf32gerpp  0, 35, 37
      xvf32gerpp  1, 35, 36
    .if \Complete==0
      lxv         vs34, DISP2(\Index, \OffsetB)(\BREG)
      xxpermdi    vs35, vs34, vs38, 0
      xxpermdi    vs34, vs34, vs38, 2
      lxvp        vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
    .endif
    .if \IsLast==1
      .if \Complete==1
      addi        \BREG, \BREG, DISP2(\Index, \OffsetB)
      addi        \AREG, \AREG, DISP8(\Index, \OffsetA)
      .else
      addi        \BREG, \BREG, DISP2(\Index, 16)
      addi        \AREG, \AREG, DISP8(\Index, 64)
      .endif
    .endif
  .endm

  /* KERNEL1x4: single-step update; advances AO by 32 and BO by 8 bytes. */
  .macro KERNEL1x4
      LOAD1x4
      END1x4      AO, BO, 32, 8
  .endm
  /* SAVE1x4: write back the 1x4 tile held in accumulators 0 and 1.
     32 bytes are written at CO, which is then advanced by 32 bytes.
     Without TRMMKERNEL the existing C values are loaded into vs24/vs25,
     added, and stored as a pair; with TRMMKERNEL the scaled result is
     stored directly.
     SHUFFLE_ACC, AGGREGATE_REALS_IMAGES, MULT_APLHA_PART1/2, permute_mask
     and save_permute_1 are defined outside this chunk. */
  .macro SAVE1x4
      SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
      SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
      /* merge doubleword 0 of each register pair into one vector */
      xxpermdi    vs32, vs32, vs36, 0
      xxpermdi    vs40, vs40, vs44, 0
      xxpermdi    vs33, vs33, vs37, 0
      xxpermdi    vs41, vs41, vs45, 0
      xxperm      vs40, vs40, permute_mask
      xxperm      vs41, vs41, permute_mask
#ifndef TRMMKERNEL
      lxvp        vs24, 0(CO)
#endif
      xxperm      vs0, vs32, permute_mask
      xxperm      vs4, vs40, permute_mask
      xxperm      vs1, vs33, permute_mask
      xxperm      vs5, vs41, permute_mask
      AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4
      AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5
      /* inner reverse save_permute and store vs28 */
      xxpermdi    vs28, save_permute_1, save_permute_1, 2
      /* VSINRR, VSINII, VSOUT1, VSOUT2 */
      MULT_APLHA_PART1 vs32, vs40, vs0, vs1
      MULT_APLHA_PART1 vs33, vs41, vs2, vs3
      MULT_APLHA_PART2 vs32, vs40, vs0, vs1
      MULT_APLHA_PART2 vs33, vs41, vs2, vs3
      /* reconstruct r, i pairs */
      xxperm      vs0, vs1, vs28
      xxperm      vs2, vs3, vs28
#ifndef TRMMKERNEL
      /* add to the loaded C values and store as a register pair */
      xvaddsp     vs24, vs24, vs2
      xvaddsp     vs25, vs25, vs0
      stxvp       vs24, 0(CO)
#else
      /* TRMM: store the two result vectors directly */
      stxv        vs0, 0(CO)
      stxv        vs2, 16(CO)
#endif
      addi        CO, CO, 32
  .endm
  1627. /* macros for N=1 and M=2
  1628. **********************************************************************************************/
  /* ZERO1x2: clear the two VSX accumulators vs32 and vs40. This tile
     uses plain vector FMAs, not MMA accumulators. */
  .macro ZERO1x2
      xxlxor      vs32, vs32, vs32
      xxlxor      vs40, vs40, vs40
  .endm

  /* LOAD1x2: single-step operand load with zero displacement. */
  .macro LOAD1x2
      LOAD1x2O    0, 0
  .endm
  /* LOAD1x2O OffsetA, OffsetB
     Loads 8 bytes of B into v4 (the same register as vs36) and 16 bytes
     of A into vs0. vs24 gets doubleword 0 of vs36 splatted into both
     halves; vs26 is vs24 run through permute_mask (defined outside this
     chunk).
     lxsd takes a VR operand (v0..v31, i.e. VSR 32..63), so the target
     is written as v4, matching the vs36 read on the next line and the
     sibling LOAD2x1O / LOAD1x1O macros. The former spelling `vs4` only
     worked because the register name reduces to the number 4. */
  .macro LOAD1x2O OffsetA, OffsetB
      lxsd        v4, (\OffsetB+0)(BO)
      lxv         vs0, (\OffsetA+0)(AO)
      xxspltd     vs24, vs36, 0
      xxperm      vs26, vs24, permute_mask
  .endm
  /* END1x2_NORMAL: accumulate and advance AO by 16, BO by 8 bytes. */
  .macro END1x2_NORMAL
      END1x2      AO, BO, 16, 8
  .endm

  /* END1x2_WITHOUT_ADD: accumulate only; pointers are left untouched. */
  .macro END1x2_WITHOUT_ADD
      END1x2      AO, BO, 0, 0
  .endm

  /* END1x2 AREG, BREG, OffsetA, OffsetB
     Advances BREG/AREG by the given non-zero byte offsets, then does
     vs32 += vs0*vs24 and vs40 += vs0*vs26. */
  .macro END1x2 AREG, BREG, OffsetA, OffsetB
    .if \OffsetB != 0
      addi        \BREG, \BREG, \OffsetB
    .endif
    .if \OffsetA != 0
      addi        \AREG, \AREG, \OffsetA
    .endif
      xvmaddasp   vs32, vs0, vs24
      xvmaddasp   vs40, vs0, vs26
  .endm
  /* LOAD1x2_2: prime the registers for KERNEL1x2_2, zero displacement. */
  .macro LOAD1x2_2
      LOAD1x2_2O  0, 0
  .endm

  /* LOAD1x2_2O OffsetA, OffsetB
     Loads 16 bytes of B into vs27 and 32 bytes of A into vs4/vs5.
     vs8 and vs24 are doubleword 1 and doubleword 0 of vs27 splatted;
     vs10 and vs26 are those values run through permute_mask. */
  .macro LOAD1x2_2O OffsetA, OffsetB
      lxv         vs27, (\OffsetB)(BO)
      lxvp        vs4, (0+\OffsetA)(AO)
      xxspltd     vs8, vs27, 1
      xxspltd     vs24, vs27, 0
      xxperm      vs10, vs8, permute_mask
      xxperm      vs26, vs24, permute_mask
  .endm

  /* END1x2_2: final two-step update without reloads; advances AO/BO. */
  .macro END1x2_2
      /* for load2 offset will be 32 and 16 */
      KERNEL1x2_2 AO, BO, 32, 16, 0, 1, 1
  .endm

  /* KERNEL1x2_E2: two-step update on AO/BO with Complete=1. */
  .macro KERNEL1x2_E2 OffsetA, OffsetB, Index, IsLast
      KERNEL1x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
  .endm

  /* KERNEL1x2_L2: two-step update on AO/BO with Complete=0. */
  .macro KERNEL1x2_L2 OffsetA, OffsetB, Index, IsLast
      KERNEL1x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
  .endm

  /* KERNEL1x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
     Two k steps of vector FMAs into vs32 and vs40.
       Complete   0: reload B (vs27) and A (vs4/vs5) and rebuild the
                  splat/permuted operands; 1: no reloads
       IsLast     1: advance BREG/AREG (DISP2/DISP4 are defined outside
                  this chunk)
     vs27 is reloaded first because the FMAs read only its derived
     copies; each derived operand is rebuilt after its last use. */
  .macro KERNEL1x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
    .if \Complete==0
      lxv         vs27, DISP2(\Index, \OffsetB)(\BREG)
    .endif
      xvmaddasp   vs32, vs5, vs8
      xvmaddasp   vs40, vs5, vs10
    .if \Complete==0
      xxspltd     vs8, vs27, 1
      xxperm      vs10, vs8, permute_mask
    .endif
      xvmaddasp   vs32, vs4, vs24
      xvmaddasp   vs40, vs4, vs26
    .if \Complete==0
      lxvp        vs4, DISP4(\Index, 0+\OffsetA)(\AREG)
      xxspltd     vs24, vs27, 0
      xxperm      vs26, vs24, permute_mask
    .endif
    .if \IsLast==1
      .if \Complete==1
      addi        \BREG, \BREG, DISP2(\Index, \OffsetB)
      addi        \AREG, \AREG, DISP4(\Index, \OffsetA)
      .else
      addi        \BREG, \BREG, DISP2(\Index, 16)
      addi        \AREG, \AREG, DISP4(\Index, 32)
      .endif
    .endif
  .endm

  /* KERNEL1x2: single-step update; advances AO by 16 and BO by 8 bytes. */
  .macro KERNEL1x2
      LOAD1x2
      END1x2      AO, BO, 16, 8
  .endm
  1712. .macro SAVE1x2
        /* Write back the 1x2 result held in accumulators vs32 and vs40.
           Without TRMMKERNEL the alpha-scaled result is added to the 16 bytes
           at CO; with TRMMKERNEL it is stored directly. CO advances by 16.
           Uses vs0, vs1, vs4, vs24, vs28 plus whatever the helper macros
           AGGREGATE_REALS_IMAGES and MULT_APLHA_PART1/2 (defined elsewhere)
           touch. */
  1713. #ifndef TRMMKERNEL
  1714. lxv vs24, 0(CO) /* current C values */
  1715. #endif
  1716. xxperm vs0, vs32, permute_mask
  1717. xxperm vs4, vs40, permute_mask
  1718. AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4
  1719. /*inner reverse save_permute and store vs28 */
  1720. xxpermdi vs28,save_permute_1,save_permute_1, 2
  1721. /*VSINRR, VSINII, VSOUT1, VSOUT2*/
  1722. MULT_APLHA_PART1 vs32, vs40, vs0, vs1
  1723. MULT_APLHA_PART2 vs32, vs40, vs0, vs1
  1724. /* reconstruct r, i pairs*/
  1725. xxperm vs0, vs1, vs28
  1726. #ifndef TRMMKERNEL
  1727. /* add */
  1728. xvaddsp vs24, vs24, vs0
  1729. stxv vs24, 0(CO)
  1730. #else
  1731. /* TRMM: store the reconstructed pairs without reading C */
  1732. stxv vs0, 0(CO)
  1733. #endif
  1734. addi CO, CO, 16
  1735. .endm
  1736. /* macros for N=1 and M=1
  1737. **********************************************************************************************/
  1738. .macro ZERO1x1
        /* Clear the two accumulators used by the 1x1 kernels (vs32, vs40). */
  1739. xxlxor vs32, vs32, vs32
  1740. xxlxor vs40, vs40, vs40
  1741. .endm
  1742. .macro LOAD1x1
        /* Convenience form of LOAD1x1O with both byte offsets set to zero. */
  1743. LOAD1x1O 0, 0
  1744. .endm
  1745. .macro LOAD1x1O OffsetA, OffsetB
        /* Load the operands for END1x1: 8 bytes of B into v4 and 8 bytes of A
           into v5, then build the permuted B copy in vs38.
           OffsetA, OffsetB : byte displacements applied to AO and BO.
           v4 and v5 are the vector-register names of vs36 and vs37. */
  1746. lxsd v4, (\OffsetB+0)(BO)
  1747. lxsd v5, (\OffsetA+0)(AO)
  1748. xxperm vs38, vs36, permute_mask
  1749. .endm
  1750. .macro END1x1_NORMAL
        /* END1x1 on AO/BO, advancing each pointer by 8 bytes. */
  1751. END1x1 AO, BO,8,8
  1752. .endm
  1753. .macro END1x1_WITHOUT_ADD
        /* END1x1 on AO/BO with zero offsets: accumulate only, the pointers
           are left unchanged. */
  1754. END1x1 AO, BO, 0, 0
  1755. .endm
  1756. .macro END1x1 AREG, BREG, OffsetA, OffsetB
        /* Finish one 1x1 step: optionally advance the B and A pointers, then
           accumulate vs37*vs36 into vs32 and vs37*vs38 into vs40.
           AREG, BREG : pointer registers to advance.
           OffsetA, OffsetB : byte increments; a zero offset emits no addi.
           Expects vs36, vs37, vs38 as set up by LOAD1x1O. */
  1757. .if \OffsetB != 0
  1758. addi \BREG, \BREG, \OffsetB
  1759. .endif
  1760. .if \OffsetA != 0
  1761. addi \AREG, \AREG, \OffsetA
  1762. .endif
  1763. xvmaddasp vs32, vs37, vs36 /* vs32 += A * B */
  1764. xvmaddasp vs40, vs37, vs38 /* vs40 += A * permuted B */
  1765. .endm
  1766. .macro LOAD1x1_2
        /* Convenience form of LOAD1x1_2O with both byte offsets set to zero. */
  1767. LOAD1x1_2O 0, 0
  1768. .endm
  1769. .macro LOAD1x1_2O OffsetA, OffsetB
        /* Preload the operands consumed by KERNEL1x1_2: 16 bytes of B into
           vs8, 16 bytes of A into vs4, and the permuted B copy into vs10.
           OffsetA, OffsetB : byte displacements applied to AO and BO. */
  1770. lxv vs8, (\OffsetB)(BO)
  1771. lxv vs4, (0+\OffsetA)(AO)
  1772. xxperm vs10, vs8, permute_mask
  1773. .endm
  1774. .macro END1x1_2
        /* Final step of the paired 1x1 kernel: KERNEL1x1_2 with Complete=1
           (no further loads) and IsLast=1 (advance AO and BO). */
  1775. /*for load2 offset will be 16 and 16*/
  1776. KERNEL1x1_2 AO, BO, 16, 16, 0, 1, 1
  1777. .endm
  1778. .macro KERNEL1x1_E2 OffsetA, OffsetB, Index, IsLast
        /* "End" variant: KERNEL1x1_2 on AO/BO with Complete=1, so only the
           multiply-adds run and no new operands are loaded. */
  1779. KERNEL1x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
  1780. .endm
  1781. .macro KERNEL1x1_L2 OffsetA, OffsetB, Index, IsLast
        /* "Loop" variant: KERNEL1x1_2 on AO/BO with Complete=0, so the
           operands for the following step are loaded after computing. */
  1782. KERNEL1x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
  1783. .endm
  1784. .macro KERNEL1x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
        /* Paired 1x1 compute step: two multiply-adds into vs32 and vs40 using
           the operands prepared by LOAD1x1_2O or by a previous invocation.
           AREG, BREG : A and B pointer registers.
           OffsetA, OffsetB : byte offsets fed to DISP2 (defined elsewhere).
           Index    : unroll index fed to DISP2.
           IsLast   : 1 = advance AREG and BREG at the end.
           Complete : 0 = reload vs4, vs8, vs10 for the next step;
                      1 = compute only. */
  1785. xvmaddasp vs32, vs4, vs8
  1786. xvmaddasp vs40, vs4, vs10
  1787. .if \Complete==0
  1788. lxv vs8, DISP2(\Index, \OffsetB)(\BREG)
        /* The A reload is displaced by OffsetA. It previously used OffsetB,
           which was only correct while callers passed equal offsets. */
  1789. lxv vs4, DISP2(\Index, \OffsetA)(\AREG)
  1790. xxperm vs10, vs8, permute_mask
  1791. .endif
  1792. .if \IsLast==1
  1793. .if \Complete==1
        /* no reload happened: step by the caller supplied offsets */
  1794. addi \BREG, \BREG, DISP2(\Index, \OffsetB)
  1795. addi \AREG, \AREG, DISP2(\Index, \OffsetA)
  1796. .else
        /* a reload happened: step both pointers by the fixed size 16 */
  1797. addi \BREG, \BREG, DISP2(\Index, 16)
  1798. addi \AREG, \AREG, DISP2(\Index, 16)
  1799. .endif
  1800. .endif
  1801. .endm
  1802. .macro KERNEL1x1
        /* Single 1x1 step: LOAD1x1 fetches the operands, then END1x1 advances
           AO and BO by 8 bytes each and accumulates into vs32 and vs40. */
  1803. LOAD1x1
  1804. END1x1 AO, BO, 8,8
  1805. .endm
  1806. .macro SAVE1x1
        /* Write back the 1x1 result held in accumulators vs32 and vs40.
           Without TRMMKERNEL the alpha-scaled result is added to the 8 bytes
           at CO; with TRMMKERNEL it is stored directly. CO advances by 8.
           Uses vs0, vs1, vs4, vs28, vs33, vs36 (v4), vs37 (v5), vs41 plus
           whatever the helper macros AGGREGATE_REALS_IMAGES and
           MULT_APLHA_PART1/2 (defined elsewhere) touch. */
  1807. #ifndef TRMMKERNEL
  1808. lxsd v4, 0(CO) /* current C value into v4 (vs36) */
  1809. #endif
  1810. /*aggregate x2*/
        /* Swap the doublewords of each accumulator and add, so both halves
           hold the sum of the two halves - presumably because the paired
           kernels accumulate two K steps side by side (confirm). */
  1811. xxpermdi vs33, vs32, vs32, 2
  1812. xxpermdi vs41, vs40, vs40, 2
  1813. xvaddsp vs32, vs32, vs33
  1814. xvaddsp vs40, vs40, vs41
  1815. xxperm vs0, vs32, permute_mask
  1816. xxperm vs4, vs40, permute_mask
  1817. AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4
  1818. /*inner reverse save_permute and store vs28 */
  1819. xxpermdi vs28,save_permute_1,save_permute_1, 2
  1820. /*VSINRR, VSINII, VSOUT1, VSOUT2*/
  1821. MULT_APLHA_PART1 vs32, vs40, vs37, vs1
  1822. MULT_APLHA_PART2 vs32, vs40, vs37, vs1
  1823. /* reconstruct r, i pairs*/
  1824. xxperm vs37, vs1, vs28
  1825. #ifndef TRMMKERNEL
  1826. /* add */
  1827. xvaddsp vs36, vs36, vs37
  1828. stxsd v4, 0(CO)
  1829. #else
  1830. /* vs37 is v5 */
  1831. stxsd v5, 0(CO)
  1832. #endif
  1833. addi CO, CO, 8
  1834. .endm
  1835. /****************************TRMM POINTER REFRESH MACROS*************************/
  1836. .macro SHIFT_REG REG1,REG2,SHIFT_VAL
        /* REG1 = REG2 * SHIFT_VAL * 8, done as a left shift.
           SHIFT_VAL must be one of 16, 8, 4, 2 or 1; any other value emits no
           instruction and leaves REG1 untouched.
           The factor 8 is presumably the byte size of one single-precision
           complex element (confirm).
           NOTE(review): slwi is a 32-bit shift, so the result is limited to
           the low 32 bits - confirm this is wide enough for all callers. */
  1837. .if \SHIFT_VAL==16
  1838. slwi \REG1, \REG2, 7
  1839. .elseif \SHIFT_VAL==8
  1840. slwi \REG1, \REG2, 6
  1841. .elseif \SHIFT_VAL==4
  1842. slwi \REG1, \REG2, 5
  1843. .elseif \SHIFT_VAL==2
  1844. slwi \REG1, \REG2, 4
  1845. .elseif \SHIFT_VAL==1
  1846. slwi \REG1, \REG2, 3
  1847. .endif
  1848. .endm
  1849. /*
  1850. //#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1851. // ptrbb = bb;
  1852. // #else
  1853. // ptrba += off*8;
  1854. // ptrbb = bb + off*4;
  1855. // #endif
  1856. */
  1857. .macro REFRESH_POINTERS PTR_A,PTR_B, OFF_VAL, B_VAL, C_A, C_B
        /* Position the A and B pointers at the start of a TRMM tile.
           PTR_A, PTR_B : pointer registers to update.
           OFF_VAL : register holding the current offset ("off").
           B_VAL   : register holding the base of B ("bb").
           C_A, C_B : per-tile element counts for A and B, passed on to
                      SHIFT_REG (so they must be 16, 8, 4, 2 or 1).
           In the first preprocessor branch only PTR_B is written; in the
           other branch T4 and T2 are used as scratch registers. */
  1858. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1859. /* ptrbb = bb;*/
  1860. mr \PTR_B, \B_VAL /* refresh BPOINT */
  1861. #else
  1862. /*
  1863. // ptrba =ptrba+ off*C_A;
  1864. // ptrbb = bb + off*C_B;
  1865. */
  1866. SHIFT_REG T4, \OFF_VAL, \C_B /* Number of values in B shifted */
  1867. SHIFT_REG T2, \OFF_VAL, \C_A /* Number of values in A shifted */
  1868. add \PTR_B, \B_VAL, T4 /* Add values to BO */
  1869. add \PTR_A, \PTR_A, T2 /* Add values to AO */
  1870. #endif
  1871. .endm
  1872. /*
  1873. // #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1874. // temp = bk-off;
  1875. // #elif defined(LEFT)
  1876. // temp = off+8; // number of values in A
  1877. // #else
  1878. // temp = off+4; // number of values in B
  1879. // #endif
  1880. */
  1881. .macro REFRESH_TEMP_BK TEMP_BK, BK_VAL, OFF_VAL, INCR_A, INCR_B
        /* Compute the inner-loop trip count for a TRMM tile into TEMP_BK.
           BK_VAL  : register holding "bk".
           OFF_VAL : register holding the current offset ("off").
           INCR_A, INCR_B : immediates added to off in the second and third
                            preprocessor branches respectively.
           Exactly one of the three instructions below is assembled. */
  1882. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1883. /* temp = bk-off;*/
  1884. sub \TEMP_BK, \BK_VAL, \OFF_VAL
  1885. #elif defined(LEFT)
  1886. /* temp = off+INCR_A; // number of values in A */
  1887. addi \TEMP_BK, \OFF_VAL, \INCR_A
  1888. #else
  1889. /* temp = off+INCR_B // number of values in B*/
  1890. addi \TEMP_BK, \OFF_VAL, \INCR_B
  1891. #endif
  1892. .endm
  1893. /*
  1894. // #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1895. // temp = bk - off;
  1896. // #ifdef LEFT
  1897. // temp -= 8; // number of values in A
  1898. // #else
  1899. // temp -= 4; // number of values in B
  1900. // #endif
  1901. // ptrba += temp*8;
  1902. // ptrbb += temp*4;
  1903. // #endif
  1904. // #ifdef LEFT
  1905. // off += 8; // number of values in A
  1906. // #endif
  1907. */
  1908. .macro REFRESH_AFTER_SAVE TEMP_BK, BK_VAL, OFF_VAL,PTR_B,PTR_A, C_A, C_B
        /* TRMM bookkeeping after a tile has been stored.
           TEMP_BK : scratch register that receives bk - off - C_A (LEFT) or
                     bk - off - C_B (otherwise) in the first branch.
           BK_VAL, OFF_VAL : registers holding "bk" and "off".
           PTR_B, PTR_A : pointers advanced past the part of the panel that
                          was not consumed (note the order: B before A).
           C_A, C_B : per-tile element counts; they are used both as addi
                      immediates and as SHIFT_REG factors, so they must be
                      16, 8, 4, 2 or 1.
           T4 and T2 are used as scratch registers in the first branch.
           With LEFT defined, OFF_VAL is additionally increased by C_A. */
  1909. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1910. /*temp = bk - off;*/
  1911. sub \TEMP_BK, \BK_VAL, \OFF_VAL
  1912. #ifdef LEFT
  1913. /*temp -= C_A; // number of values in A*/
  1914. addi \TEMP_BK, \TEMP_BK,-\C_A
  1915. #else
  1916. /*temp -= C_B; // number of values in B*/
  1917. addi \TEMP_BK, \TEMP_BK,-\C_B
  1918. #endif
  1919. /*ptrba += temp*C_A;
  1920. ptrbb += temp*C_B;*/
  1921. SHIFT_REG T4, \TEMP_BK, \C_A
  1922. SHIFT_REG T2, \TEMP_BK, \C_B
  1923. add \PTR_A, \PTR_A, T4/*ptrba+temp*C_A*/
  1924. add \PTR_B, \PTR_B, T2
  1925. #endif
  1926. #ifdef LEFT
  1927. /*off += C_A; // number of values in A*/
  1928. addi \OFF_VAL, \OFF_VAL, \C_A
  1929. #endif
  1930. .endm