You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

zgemm_kernel_4x4_sandy.S 81 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240
  1. /*****************************************************************************
  2. Copyright (c) 2011-2014, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written
  16. permission.
  17. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  18. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  21. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  23. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  24. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  25. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  26. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27. **********************************************************************************/
  28. #define ASSEMBLER
  29. #include "common.h"
  30. #define old_bm %rdi
  31. #define old_bn %rsi
  32. #define old_bk %rdx
  33. #define bm %r13
  34. #define bn %r14
  35. #define bk %r15
  36. #define ALPHA %xmm0
  37. #define ba %rcx
  38. #define bb %r8
  39. #define C %r9
  40. #define ldc %r10
  41. #define i %r11
  42. #define k %rax
  43. #define ptrba %rdi
  44. #define ptrbb %rsi
  45. #define C0 %rbx
  46. #define C1 %rbp
  47. #define prebb %r12
  48. #ifndef WINDOWS_ABI
  49. #define STACKSIZE 128
  50. #define old_ldc 8+STACKSIZE(%rsp)
  51. #define old_offset 16+STACKSIZE(%rsp)
  52. #define MEMALPHA_R 48(%rsp)
  53. #define MEMALPHA_I 56(%rsp)
  54. #define j 64(%rsp)
  55. #define OFFSET 72(%rsp)
  56. #define kk 80(%rsp)
  57. #define kkk 88(%rsp)
  58. #else
  59. #define STACKSIZE 512
  60. #define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
  61. #define OLD_A 48 + STACKSIZE(%rsp)
  62. #define OLD_B 56 + STACKSIZE(%rsp)
  63. #define OLD_C 64 + STACKSIZE(%rsp)
  64. #define old_ldc 72 + STACKSIZE(%rsp)
  65. #define old_offset 80 + STACKSIZE(%rsp)
  66. #define MEMALPHA_R 224(%rsp)
  67. #define MEMALPHA_I 232(%rsp)
  68. #define j 240(%rsp)
  69. #define OFFSET 248(%rsp)
  70. #define kk 256(%rsp)
  71. #define kkk 264(%rsp)
  72. #endif
  73. #define PREFETCH0 prefetcht0
  74. #define PREFETCH1 prefetcht0
  75. #define PREFETCH2 prefetcht0
  76. #define PRESIZE 64
  77. #define xvec0 %xmm0
  78. #define xvec1 %xmm1
  79. #define xvec2 %xmm2
  80. #define xvec3 %xmm3
  81. #define xvec4 %xmm4
  82. #define xvec5 %xmm5
  83. #define xvec6 %xmm6
  84. #define xvec7 %xmm7
  85. #define xvec8 %xmm8
  86. #define xvec9 %xmm9
  87. #define xvec10 %xmm10
  88. #define xvec11 %xmm11
  89. #define xvec12 %xmm12
  90. #define xvec13 %xmm13
  91. #define xvec14 %xmm14
  92. #define xvec15 %xmm15
  93. #define yvec0 %ymm0
  94. #define yvec1 %ymm1
  95. #define yvec2 %ymm2
  96. #define yvec3 %ymm3
  97. #define yvec4 %ymm4
  98. #define yvec5 %ymm5
  99. #define yvec6 %ymm6
  100. #define yvec7 %ymm7
  101. #define yvec8 %ymm8
  102. #define yvec9 %ymm9
  103. #define yvec10 %ymm10
  104. #define yvec11 %ymm11
  105. #define yvec12 %ymm12
  106. #define yvec13 %ymm13
  107. #define yvec14 %ymm14
  108. #define yvec15 %ymm15
  109. #define LEAQ leaq
  110. #define ADDQ addq
  111. #define MULQ imulq
  112. #define SARQ sarq
  113. #define SALQ salq
  114. #define ANDQ andq
  115. #define SUBQ subq
  116. #define DECQ decq
  117. #define JG jg
  118. #define JLE jle
  119. #define TEST testq
  120. #define OR orq
  121. #define JNE jne
  122. #define JMP jmp
  123. #define NOP
  124. #define XOR xorpd
  125. #undef MOVQ
  126. #define MOVQ movq
  127. #define XOR_DY vxorpd
  128. #define XOR_DX vxorpd
  129. #define LD_DY vmovapd
  130. #define LD_DX vmovapd
  131. #define LDL_DY vmovlpd
  132. #define LDL_DX vmovlpd
  133. #define LDH_DY vmovhpd
  134. #define LDH_DX vmovhpd
  135. #define ST_DY vmovapd
  136. #define ST_DX vmovapd
  137. #define STL_DY vmovlpd
  138. #define STL_DX vmovlpd
  139. #define STH_DY vmovhpd
  140. #define STH_DX vmovhpd
  141. #define EDUP_DY vmovddup
  142. #define ADD_DY vaddpd
  143. #define ADD_DX vaddpd
  144. #define SUB_DY vsubpd
  145. #define SUB_DX vsubpd
  146. #define ADDSUB_DY vaddsubpd
  147. #define ADDSUB_DX vaddsubpd
  148. #define MUL_DY vmulpd
  149. #define MUL_DX vmulpd
  150. #define SHUF_DY vperm2f128
  151. #define SHUF_DX vpshufd
  152. #define VPERMILP_DY vpermilpd
  153. #define BROAD_DY vbroadcastsd
  154. #define BROAD_DX vmovddup
  155. #define MOV_DY vmovapd
  156. #define MOV_DX vmovapd
  157. #define REVS_DY vshufpd
  158. #define REVS_DX vmovsd
  159. #define EXTRA_DY vextractf128
  160. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  161. #define ADD1_DX ADD_DX
  162. #define ADD1_DY ADD_DY
  163. #define ADD2_DY ADDSUB_DY
  164. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  165. #define ADD1_DX SUB_DX
  166. #define ADD1_DY SUB_DY
  167. #define ADD2_DY ADDSUB_DY
  168. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  169. #define ADD1_DX SUB_DX
  170. #define ADD1_DY SUB_DY
  171. #define ADD2_DY ADDSUB_DY
  172. #else
  173. #define ADD1_DX ADD_DX
  174. #define ADD1_DY ADD_DY
  175. #define ADD2_DY ADDSUB_DY
  176. #endif
  177. PROLOGUE
  178. subq $STACKSIZE, %rsp;
  179. movq %rbx, 0(%rsp);
  180. movq %rbp, 8(%rsp);
  181. movq %r12, 16(%rsp);
  182. movq %r13, 24(%rsp);
  183. movq %r14, 32(%rsp);
  184. movq %r15, 40(%rsp);
  185. #ifdef WINDOWS_ABI
  186. movq %rdi, 48(%rsp)
  187. movq %rsi, 56(%rsp)
  188. movups %xmm6, 64(%rsp)
  189. movups %xmm7, 80(%rsp)
  190. movups %xmm8, 96(%rsp)
  191. movups %xmm9, 112(%rsp)
  192. movups %xmm10, 128(%rsp)
  193. movups %xmm11, 144(%rsp)
  194. movups %xmm12, 160(%rsp)
  195. movups %xmm13, 176(%rsp)
  196. movups %xmm14, 192(%rsp)
  197. movups %xmm15, 208(%rsp)
  198. movq ARG1, old_bm
  199. movq ARG2, old_bn
  200. movq ARG3, old_bk
  201. movq OLD_A, ba
  202. movq OLD_B, bb
  203. movq OLD_C, C
  204. movq old_ldc, ldc
  205. #ifdef TRMMKERNEL
  206. movq old_offset, %r11
  207. #endif
  208. movaps %xmm3, %xmm0
  209. movsd OLD_ALPHA_I, %xmm1
  210. #else
  211. movq old_ldc, ldc
  212. #ifdef TRMMKERNEL
  213. movq old_offset, %r11;
  214. #endif
  215. #endif
  216. vzeroupper
  217. vmovlps %xmm0, MEMALPHA_R
  218. vmovlps %xmm1, MEMALPHA_I
  219. movq old_bm, bm
  220. movq old_bn, bn
  221. movq old_bk, bk
  222. salq $ZBASE_SHIFT, ldc
  223. #ifdef TRMMKERNEL
  224. movq %r11, OFFSET
  225. #ifndef LEFT
  226. negq %r11;
  227. #endif
  228. movq %r11, kk;
  229. #endif
  230. MOVQ bn,j;
  231. SARQ $2,j; # Rn = 4
  232. JLE .L0_loopE;
  233. ALIGN_5;
  234. .L0_bodyB:;
  235. #if defined(TRMMKERNEL) && defined(LEFT)
  236. MOVQ OFFSET, %rax;
  237. MOVQ %rax, kk;
  238. #endif
  239. MOVQ C,C0;
  240. LEAQ (C,ldc,2),C1;
  241. MOVQ bk, k;
  242. SALQ $6, k;
  243. LEAQ (bb, k, 1), prebb; # Rn=4 SIZE=8 COMPLEX=2
  244. MOVQ ba,ptrba;
  245. MOVQ bm,i;
  246. SARQ $2,i; # Rm = 4
  247. JLE .L1_loopE;
  248. ALIGN_5;
  249. .L1_bodyB:;
  250. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  251. MOVQ bb,ptrbb;
  252. #else
  253. MOVQ bb, ptrbb;
  254. MOVQ kk, %rax;
  255. SALQ $ZBASE_SHIFT, %rax;
  256. LEAQ (ptrba, %rax, 4), ptrba;
  257. LEAQ (ptrbb, %rax, 4), ptrbb;
  258. #endif
  259. PREFETCH0 0*SIZE(prebb);
  260. PREFETCH0 8*SIZE(prebb);
  261. PREFETCH0 16*SIZE(prebb)
  262. ADDQ $24*SIZE, prebb;
  263. # Initial Results Register
  264. XOR_DY yvec15, yvec15, yvec15;
  265. XOR_DY yvec14, yvec14, yvec14;
  266. EDUP_DY 0*SIZE(ptrbb), yvec2; # Br1, Br1, Br2, Br2
  267. XOR_DY yvec13, yvec13, yvec13;
  268. XOR_DY yvec12, yvec12, yvec12;
  269. EDUP_DY 4*SIZE(ptrbb), yvec3; # Br3, Br3, Br4, Br4
  270. PREFETCH2 3*SIZE(C0);
  271. PREFETCH2 3*SIZE(C1);
  272. XOR_DY yvec11, yvec11, yvec11;
  273. XOR_DY yvec10, yvec10, yvec10;
  274. LD_DY 0*SIZE(ptrba), yvec0; # Ar1, Ai1, Ar2, Ai2
  275. PREFETCH2 7*SIZE(C0, ldc, 1);
  276. PREFETCH2 7*SIZE(C1, ldc, 1);
  277. XOR_DY yvec9, yvec9, yvec9;
  278. XOR_DY yvec8, yvec8, yvec8;
  279. #ifndef TRMMKERNEL
  280. MOVQ bk,k;
  281. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  282. MOVQ bk, %rax;
  283. SUBQ kk, %rax;
  284. MOVQ %rax, kkk;
  285. #else
  286. MOVQ kk, %rax;
  287. #ifdef LEFT
  288. ADDQ $4, %rax;
  289. #else
  290. ADDQ $4, %rax;
  291. #endif
  292. MOVQ %rax, kkk;
  293. #endif
  294. SARQ $2,k; # Unroll 4 times
  295. JLE .L2_loopE;
  296. ALIGN_5;
  297. .L2_bodyB:;
  298. #### Computing kernel ####
  299. #### Unroll time 1 ####
  300. LD_DY 4*SIZE(ptrba), yvec1;
  301. MUL_DY yvec0, yvec2, yvec6;
  302. SHUF_DY $0x03, yvec2, yvec2, yvec4; # Br2, Br2, Br1, Br1
  303. MUL_DY yvec0, yvec3, yvec7;
  304. SHUF_DY $0x03, yvec3, yvec3, yvec5; # Br4, Br4, Br3, Br3
  305. ADD1_DY yvec6, yvec15, yvec15;
  306. ADD1_DY yvec7, yvec11, yvec11;
  307. PREFETCH0 PRESIZE*SIZE(ptrba);
  308. MUL_DY yvec1, yvec2, yvec6;
  309. EDUP_DY 1*SIZE(ptrbb), yvec2; # Bi1, Bi1, Bi2, Bi2
  310. MUL_DY yvec1, yvec3, yvec7;
  311. EDUP_DY 5*SIZE(ptrbb), yvec3; # Bi3, Bi3, Bi4, Bi4
  312. ADD1_DY yvec6, yvec14, yvec14;
  313. ADD1_DY yvec7, yvec10, yvec10;
  314. MUL_DY yvec0, yvec4, yvec6;
  315. MUL_DY yvec0, yvec5, yvec7;
  316. VPERMILP_DY $0x05, yvec0, yvec0; # Ai1, Ar1, Ai2, Ar2
  317. ADD1_DY yvec6, yvec13, yvec13;
  318. ADD1_DY yvec7, yvec9, yvec9;
  319. MUL_DY yvec1, yvec4, yvec6;
  320. SHUF_DY $0x03, yvec2, yvec2, yvec4; # Bi2, Bi2, Bi1, Bi1
  321. MUL_DY yvec1, yvec5, yvec7;
  322. SHUF_DY $0x03, yvec3, yvec3, yvec5; # Bi4, Bi4, Bi3, Bi3
  323. ADD1_DY yvec6, yvec12, yvec12;
  324. ADD1_DY yvec7, yvec8, yvec8;
  325. VPERMILP_DY $0x05, yvec1, yvec1; # Ai3, Ar3, Ai4, Ar4
  326. MUL_DY yvec0, yvec2, yvec6;
  327. MUL_DY yvec0, yvec3, yvec7;
  328. ADD2_DY yvec6, yvec15, yvec15;
  329. ADD2_DY yvec7, yvec11, yvec11;
  330. MUL_DY yvec1, yvec2, yvec6;
  331. EDUP_DY 8*SIZE(ptrbb), yvec2;
  332. MUL_DY yvec1, yvec3, yvec7;
  333. EDUP_DY 12*SIZE(ptrbb), yvec3;
  334. ADD2_DY yvec6, yvec14, yvec14;
  335. ADD2_DY yvec7, yvec10, yvec10;
  336. MUL_DY yvec0, yvec4, yvec6;
  337. MUL_DY yvec0, yvec5, yvec7;
  338. LD_DY 8*SIZE(ptrba), yvec0;
  339. ADD2_DY yvec6, yvec13, yvec13;
  340. ADD2_DY yvec7, yvec9, yvec9;
  341. MUL_DY yvec1, yvec4, yvec6;
  342. MUL_DY yvec1, yvec5, yvec7;
  343. ADD2_DY yvec6, yvec12, yvec12;
  344. ADD2_DY yvec7, yvec8, yvec8;
  345. #### Unroll time 2 ####
  346. LD_DY 12*SIZE(ptrba), yvec1;
  347. MUL_DY yvec0, yvec2, yvec6;
  348. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  349. MUL_DY yvec0, yvec3, yvec7;
  350. SHUF_DY $0x03, yvec3, yvec3, yvec5; # Br4, Br4, Br3, Br3
  351. ADD1_DY yvec6, yvec15, yvec15;
  352. ADD1_DY yvec7, yvec11, yvec11;
  353. PREFETCH0 (PRESIZE+8)*SIZE(ptrba);
  354. MUL_DY yvec1, yvec2, yvec6;
  355. EDUP_DY 9*SIZE(ptrbb), yvec2; # Bi1, Bi1, Bi2, Bi2
  356. MUL_DY yvec1, yvec3, yvec7;
  357. EDUP_DY 13*SIZE(ptrbb), yvec3; # Bi3, Bi3, Bi4, Bi4
  358. ADD1_DY yvec6, yvec14, yvec14;
  359. ADD1_DY yvec7, yvec10, yvec10;
  360. MUL_DY yvec0, yvec4, yvec6;
  361. MUL_DY yvec0, yvec5, yvec7;
  362. VPERMILP_DY $0x05, yvec0, yvec0; # Ai1, Ar1, Ai2, Ar2
  363. ADD1_DY yvec6, yvec13, yvec13;
  364. ADD1_DY yvec7, yvec9, yvec9;
  365. MUL_DY yvec1, yvec4, yvec6;
  366. SHUF_DY $0x03, yvec2, yvec2, yvec4; # Bi2, Bi2, Bi1, Bi1
  367. MUL_DY yvec1, yvec5, yvec7;
  368. SHUF_DY $0x03, yvec3, yvec3, yvec5; # Bi4, Bi4, Bi3, Bi3
  369. ADD1_DY yvec6, yvec12, yvec12;
  370. ADD1_DY yvec7, yvec8, yvec8;
  371. VPERMILP_DY $0x05, yvec1, yvec1; # Ai3, Ar3, Ai4, Ar4
  372. MUL_DY yvec0, yvec2, yvec6;
  373. MUL_DY yvec0, yvec3, yvec7;
  374. ADD2_DY yvec6, yvec15, yvec15;
  375. ADD2_DY yvec7, yvec11, yvec11;
  376. MUL_DY yvec1, yvec2, yvec6;
  377. EDUP_DY 16*SIZE(ptrbb), yvec2;
  378. MUL_DY yvec1, yvec3, yvec7;
  379. EDUP_DY 20*SIZE(ptrbb), yvec3;
  380. ADD2_DY yvec6, yvec14, yvec14;
  381. ADD2_DY yvec7, yvec10, yvec10;
  382. MUL_DY yvec0, yvec4, yvec6;
  383. MUL_DY yvec0, yvec5, yvec7;
  384. LD_DY 16*SIZE(ptrba), yvec0;
  385. ADD2_DY yvec6, yvec13, yvec13;
  386. ADD2_DY yvec7, yvec9, yvec9;
  387. MUL_DY yvec1, yvec4, yvec6;
  388. MUL_DY yvec1, yvec5, yvec7;
  389. ADD2_DY yvec6, yvec12, yvec12;
  390. ADD2_DY yvec7, yvec8, yvec8;
  391. #### Unroll time 3 ####
  392. LD_DY 20*SIZE(ptrba), yvec1;
  393. MUL_DY yvec0, yvec2, yvec6;
  394. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  395. MUL_DY yvec0, yvec3, yvec7;
  396. SHUF_DY $0x03, yvec3, yvec3, yvec5; # Br4, Br4, Br3, Br3
  397. ADD1_DY yvec6, yvec15, yvec15;
  398. ADD1_DY yvec7, yvec11, yvec11;
  399. PREFETCH0 (PRESIZE+16)*SIZE(ptrba);
  400. MUL_DY yvec1, yvec2, yvec6;
  401. EDUP_DY 17*SIZE(ptrbb), yvec2; # Bi1, Bi1, Bi2, Bi2
  402. MUL_DY yvec1, yvec3, yvec7;
  403. EDUP_DY 21*SIZE(ptrbb), yvec3; # Bi3, Bi3, Bi4, Bi4
  404. ADD1_DY yvec6, yvec14, yvec14;
  405. ADD1_DY yvec7, yvec10, yvec10;
  406. MUL_DY yvec0, yvec4, yvec6;
  407. MUL_DY yvec0, yvec5, yvec7;
  408. VPERMILP_DY $0x05, yvec0, yvec0; # Ai1, Ar1, Ai2, Ar2
  409. ADD1_DY yvec6, yvec13, yvec13;
  410. ADD1_DY yvec7, yvec9, yvec9;
  411. MUL_DY yvec1, yvec4, yvec6;
  412. SHUF_DY $0x03, yvec2, yvec2, yvec4; # Bi2, Bi2, Bi1, Bi1
  413. MUL_DY yvec1, yvec5, yvec7;
  414. SHUF_DY $0x03, yvec3, yvec3, yvec5; # Bi4, Bi4, Bi3, Bi3
  415. ADD1_DY yvec6, yvec12, yvec12;
  416. ADD1_DY yvec7, yvec8, yvec8;
  417. VPERMILP_DY $0x05, yvec1, yvec1; # Ai3, Ar3, Ai4, Ar4
  418. MUL_DY yvec0, yvec2, yvec6;
  419. MUL_DY yvec0, yvec3, yvec7;
  420. ADD2_DY yvec6, yvec15, yvec15;
  421. ADD2_DY yvec7, yvec11, yvec11;
  422. MUL_DY yvec1, yvec2, yvec6;
  423. EDUP_DY 24*SIZE(ptrbb), yvec2;
  424. MUL_DY yvec1, yvec3, yvec7;
  425. EDUP_DY 28*SIZE(ptrbb), yvec3;
  426. ADD2_DY yvec6, yvec14, yvec14;
  427. ADD2_DY yvec7, yvec10, yvec10;
  428. MUL_DY yvec0, yvec4, yvec6;
  429. MUL_DY yvec0, yvec5, yvec7;
  430. LD_DY 24*SIZE(ptrba), yvec0;
  431. ADD2_DY yvec6, yvec13, yvec13;
  432. ADD2_DY yvec7, yvec9, yvec9;
  433. MUL_DY yvec1, yvec4, yvec6;
  434. MUL_DY yvec1, yvec5, yvec7;
  435. ADD2_DY yvec6, yvec12, yvec12;
  436. ADD2_DY yvec7, yvec8, yvec8;
  437. #### Unroll time 4 ####
  438. LD_DY 28*SIZE(ptrba), yvec1;
  439. MUL_DY yvec0, yvec2, yvec6;
  440. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  441. MUL_DY yvec0, yvec3, yvec7;
  442. SHUF_DY $0x03, yvec3, yvec3, yvec5; # Br4, Br4, Br3, Br3
  443. ADDQ $32*SIZE, ptrba;
  444. ADD1_DY yvec6, yvec15, yvec15;
  445. ADD1_DY yvec7, yvec11, yvec11;
  446. PREFETCH0 (PRESIZE+24)*SIZE(ptrba);
  447. MUL_DY yvec1, yvec2, yvec6;
  448. EDUP_DY 25*SIZE(ptrbb), yvec2; # Bi1, Bi1, Bi2, Bi2
  449. MUL_DY yvec1, yvec3, yvec7;
  450. EDUP_DY 29*SIZE(ptrbb), yvec3; # Bi3, Bi3, Bi4, Bi4
  451. ADD1_DY yvec6, yvec14, yvec14;
  452. ADD1_DY yvec7, yvec10, yvec10;
  453. MUL_DY yvec0, yvec4, yvec6;
  454. MUL_DY yvec0, yvec5, yvec7;
  455. VPERMILP_DY $0x05, yvec0, yvec0; # Ai1, Ar1, Ai2, Ar2
  456. ADDQ $32*SIZE, ptrbb;
  457. ADD1_DY yvec6, yvec13, yvec13;
  458. ADD1_DY yvec7, yvec9, yvec9;
  459. MUL_DY yvec1, yvec4, yvec6;
  460. SHUF_DY $0x03, yvec2, yvec2, yvec4; # Bi2, Bi2, Bi1, Bi1
  461. MUL_DY yvec1, yvec5, yvec7;
  462. SHUF_DY $0x03, yvec3, yvec3, yvec5; # Bi4, Bi4, Bi3, Bi3
  463. ADD1_DY yvec6, yvec12, yvec12;
  464. ADD1_DY yvec7, yvec8, yvec8;
  465. VPERMILP_DY $0x05, yvec1, yvec1; # Ai3, Ar3, Ai4, Ar4
  466. MUL_DY yvec0, yvec2, yvec6;
  467. MUL_DY yvec0, yvec3, yvec7;
  468. ADD2_DY yvec6, yvec15, yvec15;
  469. ADD2_DY yvec7, yvec11, yvec11;
  470. MUL_DY yvec1, yvec2, yvec6;
  471. EDUP_DY 0*SIZE(ptrbb), yvec2;
  472. MUL_DY yvec1, yvec3, yvec7;
  473. EDUP_DY 4*SIZE(ptrbb), yvec3;
  474. ADD2_DY yvec6, yvec14, yvec14;
  475. ADD2_DY yvec7, yvec10, yvec10;
  476. MUL_DY yvec0, yvec4, yvec6;
  477. MUL_DY yvec0, yvec5, yvec7;
  478. LD_DY 0*SIZE(ptrba), yvec0;
  479. ADD2_DY yvec6, yvec13, yvec13;
  480. ADD2_DY yvec7, yvec9, yvec9;
  481. MUL_DY yvec1, yvec4, yvec6;
  482. MUL_DY yvec1, yvec5, yvec7;
  483. ADD2_DY yvec6, yvec12, yvec12;
  484. ADD2_DY yvec7, yvec8, yvec8;
  485. DECQ k;
  486. JG .L2_bodyB;
  487. ALIGN_5
  488. .L2_loopE:;
  489. #ifndef TRMMKERNEL
  490. TEST $2, bk;
  491. #else
  492. TEST $2, kkk;
  493. #endif
  494. JLE .L3_loopE;
  495. ALIGN_5
  496. .L3_bodyB:
  497. #### Unroll time 1 ####
  498. LD_DY 4*SIZE(ptrba), yvec1;
  499. MUL_DY yvec0, yvec2, yvec6;
  500. SHUF_DY $0x03, yvec2, yvec2, yvec4; # Br2, Br2, Br1, Br1
  501. MUL_DY yvec0, yvec3, yvec7;
  502. SHUF_DY $0x03, yvec3, yvec3, yvec5; # Br4, Br4, Br3, Br3
  503. ADD1_DY yvec6, yvec15, yvec15;
  504. ADD1_DY yvec7, yvec11, yvec11;
  505. PREFETCH0 PRESIZE*SIZE(ptrba);
  506. MUL_DY yvec1, yvec2, yvec6;
  507. EDUP_DY 1*SIZE(ptrbb), yvec2; # Bi1, Bi1, Bi2, Bi2
  508. MUL_DY yvec1, yvec3, yvec7;
  509. EDUP_DY 5*SIZE(ptrbb), yvec3; # Bi3, Bi3, Bi4, Bi4
  510. ADD1_DY yvec6, yvec14, yvec14;
  511. ADD1_DY yvec7, yvec10, yvec10;
  512. MUL_DY yvec0, yvec4, yvec6;
  513. MUL_DY yvec0, yvec5, yvec7;
  514. VPERMILP_DY $0x05, yvec0, yvec0; # Ai1, Ar1, Ai2, Ar2
  515. ADD1_DY yvec6, yvec13, yvec13;
  516. ADD1_DY yvec7, yvec9, yvec9;
  517. MUL_DY yvec1, yvec4, yvec6;
  518. SHUF_DY $0x03, yvec2, yvec2, yvec4; # Bi2, Bi2, Bi1, Bi1
  519. MUL_DY yvec1, yvec5, yvec7;
  520. SHUF_DY $0x03, yvec3, yvec3, yvec5; # Bi4, Bi4, Bi3, Bi3
  521. ADD1_DY yvec6, yvec12, yvec12;
  522. ADD1_DY yvec7, yvec8, yvec8;
  523. VPERMILP_DY $0x05, yvec1, yvec1; # Ai3, Ar3, Ai4, Ar4
  524. MUL_DY yvec0, yvec2, yvec6;
  525. MUL_DY yvec0, yvec3, yvec7;
  526. ADD2_DY yvec6, yvec15, yvec15;
  527. ADD2_DY yvec7, yvec11, yvec11;
  528. MUL_DY yvec1, yvec2, yvec6;
  529. EDUP_DY 8*SIZE(ptrbb), yvec2;
  530. MUL_DY yvec1, yvec3, yvec7;
  531. EDUP_DY 12*SIZE(ptrbb), yvec3;
  532. ADD2_DY yvec6, yvec14, yvec14;
  533. ADD2_DY yvec7, yvec10, yvec10;
  534. MUL_DY yvec0, yvec4, yvec6;
  535. MUL_DY yvec0, yvec5, yvec7;
  536. LD_DY 8*SIZE(ptrba), yvec0;
  537. ADD2_DY yvec6, yvec13, yvec13;
  538. ADD2_DY yvec7, yvec9, yvec9;
  539. MUL_DY yvec1, yvec4, yvec6;
  540. MUL_DY yvec1, yvec5, yvec7;
  541. ADD2_DY yvec6, yvec12, yvec12;
  542. ADD2_DY yvec7, yvec8, yvec8;
  543. #### Unroll time 2 ####
  544. LD_DY 12*SIZE(ptrba), yvec1;
  545. MUL_DY yvec0, yvec2, yvec6;
  546. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  547. MUL_DY yvec0, yvec3, yvec7;
  548. SHUF_DY $0x03, yvec3, yvec3, yvec5; # Br4, Br4, Br3, Br3
  549. ADDQ $16*SIZE, ptrba
  550. ADD1_DY yvec6, yvec15, yvec15;
  551. ADD1_DY yvec7, yvec11, yvec11;
  552. PREFETCH0 (PRESIZE+8)*SIZE(ptrba);
  553. MUL_DY yvec1, yvec2, yvec6;
  554. EDUP_DY 9*SIZE(ptrbb), yvec2; # Bi1, Bi1, Bi2, Bi2
  555. MUL_DY yvec1, yvec3, yvec7;
  556. EDUP_DY 13*SIZE(ptrbb), yvec3; # Bi3, Bi3, Bi4, Bi4
  557. ADD1_DY yvec6, yvec14, yvec14;
  558. ADD1_DY yvec7, yvec10, yvec10;
  559. MUL_DY yvec0, yvec4, yvec6;
  560. MUL_DY yvec0, yvec5, yvec7;
  561. VPERMILP_DY $0x05, yvec0, yvec0; # Ai1, Ar1, Ai2, Ar2
  562. ADDQ $16*SIZE, ptrbb
  563. ADD1_DY yvec6, yvec13, yvec13;
  564. ADD1_DY yvec7, yvec9, yvec9;
  565. MUL_DY yvec1, yvec4, yvec6;
  566. SHUF_DY $0x03, yvec2, yvec2, yvec4; # Bi2, Bi2, Bi1, Bi1
  567. MUL_DY yvec1, yvec5, yvec7;
  568. SHUF_DY $0x03, yvec3, yvec3, yvec5; # Bi4, Bi4, Bi3, Bi3
  569. ADD1_DY yvec6, yvec12, yvec12;
  570. ADD1_DY yvec7, yvec8, yvec8;
  571. VPERMILP_DY $0x05, yvec1, yvec1; # Ai3, Ar3, Ai4, Ar4
  572. MUL_DY yvec0, yvec2, yvec6;
  573. MUL_DY yvec0, yvec3, yvec7;
  574. ADD2_DY yvec6, yvec15, yvec15;
  575. ADD2_DY yvec7, yvec11, yvec11;
  576. MUL_DY yvec1, yvec2, yvec6;
  577. EDUP_DY 0*SIZE(ptrbb), yvec2;
  578. MUL_DY yvec1, yvec3, yvec7;
  579. EDUP_DY 4*SIZE(ptrbb), yvec3;
  580. ADD2_DY yvec6, yvec14, yvec14;
  581. ADD2_DY yvec7, yvec10, yvec10;
  582. MUL_DY yvec0, yvec4, yvec6;
  583. MUL_DY yvec0, yvec5, yvec7;
  584. LD_DY 0*SIZE(ptrba), yvec0;
  585. ADD2_DY yvec6, yvec13, yvec13;
  586. ADD2_DY yvec7, yvec9, yvec9;
  587. MUL_DY yvec1, yvec4, yvec6;
  588. MUL_DY yvec1, yvec5, yvec7;
  589. ADD2_DY yvec6, yvec12, yvec12;
  590. ADD2_DY yvec7, yvec8, yvec8;
  591. .L3_loopE:;
  592. #ifndef TRMMKERNEL
  593. TEST $1, bk;
  594. #else
  595. TEST $1, kkk;
  596. #endif
  597. JLE .L4_loopE;
  598. ALIGN_5
  599. .L4_loopB:;
  600. #### Unroll time 1 ####
  601. PREFETCH0 PRESIZE*SIZE(ptrba);
  602. LD_DY 4*SIZE(ptrba), yvec1;
  603. MUL_DY yvec0, yvec2, yvec6;
  604. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  605. MUL_DY yvec0, yvec3, yvec7;
  606. SHUF_DY $0x03, yvec3, yvec3, yvec5; # Br4, Br4, Br3, Br3
  607. ADDQ $8*SIZE, ptrba;
  608. ADD1_DY yvec6, yvec15, yvec15;
  609. ADD1_DY yvec7, yvec11, yvec11;
  610. MUL_DY yvec1, yvec2, yvec6;
  611. EDUP_DY 1*SIZE(ptrbb), yvec2; # Bi1, Bi1, Bi2, Bi2
  612. MUL_DY yvec1, yvec3, yvec7;
  613. EDUP_DY 5*SIZE(ptrbb), yvec3; # Bi3, Bi3, Bi4, Bi4
  614. ADD1_DY yvec6, yvec14, yvec14;
  615. ADD1_DY yvec7, yvec10, yvec10;
  616. MUL_DY yvec0, yvec4, yvec6;
  617. MUL_DY yvec0, yvec5, yvec7;
  618. VPERMILP_DY $0x05, yvec0, yvec0; # Ai1, Ar1, Ai2, Ar2
  619. ADDQ $8*SIZE, ptrbb;
  620. ADD1_DY yvec6, yvec13, yvec13;
  621. ADD1_DY yvec7, yvec9, yvec9;
  622. MUL_DY yvec1, yvec4, yvec6;
  623. SHUF_DY $0x03, yvec2, yvec2, yvec4; # Bi2, Bi2, Bi1, Bi1
  624. MUL_DY yvec1, yvec5, yvec7;
  625. SHUF_DY $0x03, yvec3, yvec3, yvec5; # Bi4, Bi4, Bi3, Bi3
  626. ADD1_DY yvec6, yvec12, yvec12;
  627. ADD1_DY yvec7, yvec8, yvec8;
  628. VPERMILP_DY $0x05, yvec1, yvec1; # Ai3, Ar3, Ai4, Ar4
  629. MUL_DY yvec0, yvec2, yvec6;
  630. MUL_DY yvec0, yvec3, yvec7;
  631. ADD2_DY yvec6, yvec15, yvec15;
  632. ADD2_DY yvec7, yvec11, yvec11;
  633. MUL_DY yvec1, yvec2, yvec6;
  634. MUL_DY yvec1, yvec3, yvec7;
  635. ADD2_DY yvec6, yvec14, yvec14;
  636. ADD2_DY yvec7, yvec10, yvec10;
  637. MUL_DY yvec0, yvec4, yvec6;
  638. MUL_DY yvec0, yvec5, yvec7;
  639. ADD2_DY yvec6, yvec13, yvec13;
  640. ADD2_DY yvec7, yvec9, yvec9;
  641. MUL_DY yvec1, yvec4, yvec6;
  642. MUL_DY yvec1, yvec5, yvec7;
  643. ADD2_DY yvec6, yvec12, yvec12;
  644. ADD2_DY yvec7, yvec8, yvec8;
  645. .L4_loopE:;
  646. #### Handle ####
  647. XOR_DY yvec7, yvec7, yvec7;
  648. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  649. ADDSUB_DY yvec15, yvec7, yvec15;
  650. ADDSUB_DY yvec14, yvec7, yvec14;
  651. ADDSUB_DY yvec13, yvec7, yvec13;
  652. ADDSUB_DY yvec12, yvec7, yvec12;
  653. ADDSUB_DY yvec11, yvec7, yvec11;
  654. ADDSUB_DY yvec10, yvec7, yvec10;
  655. ADDSUB_DY yvec9, yvec7, yvec9;
  656. ADDSUB_DY yvec8, yvec7, yvec8;
  657. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  658. SUB_DY yvec15, yvec7, yvec15;
  659. SUB_DY yvec14, yvec7, yvec14;
  660. SUB_DY yvec13, yvec7, yvec13;
  661. SUB_DY yvec12, yvec7, yvec12;
  662. SUB_DY yvec11, yvec7, yvec11;
  663. SUB_DY yvec10, yvec7, yvec10;
  664. SUB_DY yvec9, yvec7, yvec9;
  665. SUB_DY yvec8, yvec7, yvec8;
  666. #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  667. VPERMILP_DY $0x05, yvec15, yvec15;
  668. VPERMILP_DY $0x05, yvec14, yvec14;
  669. VPERMILP_DY $0x05, yvec13, yvec13;
  670. VPERMILP_DY $0x05, yvec12, yvec12;
  671. VPERMILP_DY $0x05, yvec11, yvec11;
  672. VPERMILP_DY $0x05, yvec10, yvec10;
  673. VPERMILP_DY $0x05, yvec9, yvec9;
  674. VPERMILP_DY $0x05, yvec8, yvec8;
  675. ADDSUB_DY yvec15, yvec7, yvec15;
  676. ADDSUB_DY yvec14, yvec7, yvec14;
  677. ADDSUB_DY yvec13, yvec7, yvec13;
  678. ADDSUB_DY yvec12, yvec7, yvec12;
  679. ADDSUB_DY yvec11, yvec7, yvec11;
  680. ADDSUB_DY yvec10, yvec7, yvec10;
  681. ADDSUB_DY yvec9, yvec7, yvec9;
  682. ADDSUB_DY yvec8, yvec7, yvec8;
  683. VPERMILP_DY $0x05, yvec15, yvec15;
  684. VPERMILP_DY $0x05, yvec14, yvec14;
  685. VPERMILP_DY $0x05, yvec13, yvec13;
  686. VPERMILP_DY $0x05, yvec12, yvec12;
  687. VPERMILP_DY $0x05, yvec11, yvec11;
  688. VPERMILP_DY $0x05, yvec10, yvec10;
  689. VPERMILP_DY $0x05, yvec9, yvec9;
  690. VPERMILP_DY $0x05, yvec8, yvec8;
  691. #endif
  692. #### Load Alpha ####
  693. BROAD_DY MEMALPHA_R,yvec7;
  694. BROAD_DY MEMALPHA_I,yvec6;
  695. #### Multiply Alpha ####
  696. VPERMILP_DY $0x05, yvec15, yvec5;
  697. MUL_DY yvec7, yvec15, yvec15;
  698. MUL_DY yvec6, yvec5, yvec5;
  699. ADDSUB_DY yvec5, yvec15, yvec15;
  700. VPERMILP_DY $0x05, yvec14, yvec4;
  701. MUL_DY yvec7, yvec14, yvec14;
  702. MUL_DY yvec6, yvec4, yvec4;
  703. ADDSUB_DY yvec4, yvec14, yvec14;
  704. VPERMILP_DY $0x05, yvec13, yvec3;
  705. MUL_DY yvec7, yvec13, yvec13;
  706. MUL_DY yvec6, yvec3, yvec3;
  707. ADDSUB_DY yvec3, yvec13, yvec13;
  708. VPERMILP_DY $0x05,yvec12, yvec2;
  709. MUL_DY yvec7, yvec12, yvec12;
  710. MUL_DY yvec6, yvec2, yvec2;
  711. ADDSUB_DY yvec2, yvec12, yvec12;
  712. VPERMILP_DY $0x05, yvec11, yvec1;
  713. MUL_DY yvec7, yvec11, yvec11;
  714. MUL_DY yvec6, yvec1, yvec1;
  715. ADDSUB_DY yvec1, yvec11, yvec11;
  716. VPERMILP_DY $0x05,yvec10, yvec0;
  717. MUL_DY yvec7, yvec10, yvec10;
  718. MUL_DY yvec6, yvec0, yvec0;
  719. ADDSUB_DY yvec0, yvec10, yvec10;
  720. VPERMILP_DY $0x05, yvec9, yvec5;
  721. MUL_DY yvec7, yvec9, yvec9;
  722. MUL_DY yvec6, yvec5, yvec5;
  723. ADDSUB_DY yvec5, yvec9, yvec9;
  724. VPERMILP_DY $0x05, yvec8, yvec4;
  725. MUL_DY yvec7, yvec8, yvec8;
  726. MUL_DY yvec6, yvec4, yvec4;
  727. ADDSUB_DY yvec4, yvec8, yvec8;
  728. #### Testing Alignment ####
  729. MOVQ C0, %rax;
  730. OR ldc, %rax;
  731. TEST $15, %rax;
  732. JNE .L4_loopEx;
  733. ALIGN_5
  734. #### Store Back ####
  735. EXTRA_DY $1,yvec15,xvec7;
  736. EXTRA_DY $1,yvec14,xvec6;
  737. EXTRA_DY $1,yvec13,xvec5;
  738. EXTRA_DY $1,yvec12,xvec4;
  739. EXTRA_DY $1,yvec11,xvec3;
  740. EXTRA_DY $1,yvec10,xvec2;
  741. EXTRA_DY $1,yvec9,xvec1;
  742. EXTRA_DY $1,yvec8,xvec0;
  743. #ifndef TRMMKERNEL
  744. ADD_DY 0*SIZE(C0),xvec15, xvec15;
  745. ADD_DY 2*SIZE(C0,ldc,1), xvec7, xvec7;
  746. ADD_DY 4*SIZE(C0),xvec14, xvec14;
  747. ADD_DY 6*SIZE(C0,ldc,1),xvec6, xvec6;
  748. ADD_DY 0*SIZE(C0,ldc,1),xvec13, xvec13;
  749. ADD_DY 2*SIZE(C0),xvec5, xvec5;
  750. ADD_DY 4*SIZE(C0,ldc,1),xvec12, xvec12;
  751. ADD_DY 6*SIZE(C0),xvec4, xvec4;
  752. ADD_DY 0*SIZE(C1),xvec11, xvec11;
  753. ADD_DY 2*SIZE(C1,ldc,1),xvec3, xvec3;
  754. ADD_DY 4*SIZE(C1),xvec10, xvec10;
  755. ADD_DY 6*SIZE(C1,ldc,1),xvec2, xvec2;
  756. ADD_DY 0*SIZE(C1,ldc,1),xvec9, xvec9;
  757. ADD_DY 2*SIZE(C1),xvec1, xvec1;
  758. ADD_DY 4*SIZE(C1,ldc,1),xvec8, xvec8;
  759. ADD_DY 6*SIZE(C1),xvec0, xvec0;
  760. #endif
  761. ST_DY xvec15,0*SIZE(C0);
  762. ST_DY xvec7,2*SIZE(C0,ldc,1);
  763. ST_DY xvec14,4*SIZE(C0);
  764. ST_DY xvec6,6*SIZE(C0,ldc,1);
  765. ST_DY xvec13,0*SIZE(C0,ldc,1);
  766. ST_DY xvec5,2*SIZE(C0);
  767. ST_DY xvec12,4*SIZE(C0,ldc,1);
  768. ST_DY xvec4,6*SIZE(C0);
  769. ST_DY xvec11,0*SIZE(C1);
  770. ST_DY xvec3,2*SIZE(C1,ldc,1);
  771. ST_DY xvec10,4*SIZE(C1);
  772. ST_DY xvec2,6*SIZE(C1,ldc,1);
  773. ST_DY xvec9,0*SIZE(C1,ldc,1);
  774. ST_DY xvec1,2*SIZE(C1);
  775. ST_DY xvec8,4*SIZE(C1,ldc,1);
  776. ST_DY xvec0,6*SIZE(C1);
  777. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  778. MOVQ bk, %rax;
  779. SUBQ kkk, %rax;
  780. SALQ $ZBASE_SHIFT, %rax;
  781. LEAQ (ptrba, %rax, 4), ptrba;
  782. LEAQ (ptrbb, %rax, 4), ptrbb;
  783. #endif
  784. #if defined(TRMMKERNEL) && defined(LEFT)
  785. ADDQ $4, kk;
  786. #endif
  787. ADDQ $8*SIZE,C0;
  788. ADDQ $8*SIZE,C1;
  789. .L1_bodyE:;
  790. DECQ i;
  791. JG .L1_bodyB;
  792. JMP .L1_loopE;
  793. ALIGN_5
  794. .L4_loopEx:
  795. EXTRA_DY $1, yvec15, xvec7;
  796. EXTRA_DY $1, yvec14, xvec6;
  797. #ifndef TRMMKERNEL
  798. LDL_DY 0*SIZE(C0), xvec0, xvec0;
  799. LDH_DY 1*SIZE(C0), xvec0, xvec0;
  800. LDL_DY 2*SIZE(C0, ldc, 1), xvec1, xvec1;
  801. LDH_DY 3*SIZE(C0, ldc, 1), xvec1, xvec1;
  802. LDL_DY 4*SIZE(C0), xvec2, xvec2;
  803. LDH_DY 5*SIZE(C0), xvec2, xvec2;
  804. LDL_DY 6*SIZE(C0, ldc, 1), xvec3, xvec3;
  805. LDH_DY 7*SIZE(C0, ldc, 1), xvec3, xvec3;
  806. ADD_DY xvec0, xvec15, xvec15;
  807. ADD_DY xvec1, xvec7, xvec7;
  808. ADD_DY xvec2, xvec14, xvec14;
  809. ADD_DY xvec3, xvec6, xvec6;
  810. #endif
  811. STL_DY xvec15, 0*SIZE(C0);
  812. STH_DY xvec15, 1*SIZE(C0);
  813. STL_DY xvec7, 2*SIZE(C0, ldc, 1);
  814. STH_DY xvec7, 3*SIZE(C0, ldc, 1);
  815. STL_DY xvec14, 4*SIZE(C0);
  816. STH_DY xvec14, 5*SIZE(C0);
  817. STL_DY xvec6, 6*SIZE(C0, ldc, 1);
  818. STH_DY xvec6, 7*SIZE(C0, ldc, 1);
  819. EXTRA_DY $1, yvec13, xvec5;
  820. EXTRA_DY $1, yvec12, xvec4;
  821. #ifndef TRMMKERNEL
  822. LDL_DY 0*SIZE(C0, ldc, 1), xvec3, xvec3;
  823. LDH_DY 1*SIZE(C0, ldc, 1), xvec3, xvec3;
  824. LDL_DY 2*SIZE(C0), xvec2, xvec2;
  825. LDH_DY 3*SIZE(C0), xvec2, xvec2;
  826. LDL_DY 4*SIZE(C0, ldc, 1), xvec1, xvec1;
  827. LDH_DY 5*SIZE(C0, ldc, 1), xvec1, xvec1;
  828. LDL_DY 6*SIZE(C0), xvec0, xvec0;
  829. LDH_DY 7*SIZE(C0), xvec0, xvec0;
  830. ADD_DY xvec3, xvec13, xvec13;
  831. ADD_DY xvec2, xvec5, xvec5;
  832. ADD_DY xvec1, xvec12, xvec12;
  833. ADD_DY xvec0, xvec4, xvec4;
  834. #endif
  835. STL_DY xvec13, 0*SIZE(C0, ldc, 1);
  836. STH_DY xvec13, 1*SIZE(C0, ldc, 1);
  837. STL_DY xvec5, 2*SIZE(C0);
  838. STH_DY xvec5, 3*SIZE(C0);
  839. STL_DY xvec12, 4*SIZE(C0, ldc, 1);
  840. STH_DY xvec12, 5*SIZE(C0, ldc, 1);
  841. STL_DY xvec4, 6*SIZE(C0);
  842. STH_DY xvec4, 7*SIZE(C0);
  843. EXTRA_DY $1, yvec11, xvec3;
  844. EXTRA_DY $1, yvec10, xvec2;
  845. #ifndef TRMMKERNEL
  846. LDL_DY 0*SIZE(C1), xvec7, xvec7;
  847. LDH_DY 1*SIZE(C1), xvec7, xvec7;
  848. LDL_DY 2*SIZE(C1, ldc, 1), xvec6, xvec6;
  849. LDH_DY 3*SIZE(C1, ldc, 1), xvec6, xvec6;
  850. LDL_DY 4*SIZE(C1), xvec5, xvec5;
  851. LDH_DY 5*SIZE(C1), xvec5, xvec5;
  852. LDL_DY 6*SIZE(C1, ldc, 1), xvec4, xvec4;
  853. LDH_DY 7*SIZE(C1, ldc, 1), xvec4, xvec4;
  854. ADD_DY xvec7, xvec11, xvec11;
  855. ADD_DY xvec6, xvec3, xvec3;
  856. ADD_DY xvec5, xvec10, xvec10;
  857. ADD_DY xvec4, xvec2, xvec2;
  858. #endif
  859. STL_DY xvec11, 0*SIZE(C1);
  860. STH_DY xvec11, 1*SIZE(C1);
  861. STL_DY xvec3, 2*SIZE(C1, ldc, 1);
  862. STH_DY xvec3, 3*SIZE(C1, ldc, 1);
  863. STL_DY xvec10, 4*SIZE(C1);
  864. STH_DY xvec10, 5*SIZE(C1);
  865. STL_DY xvec2, 6*SIZE(C1, ldc, 1);
  866. STH_DY xvec2, 7*SIZE(C1, ldc, 1);
  867. EXTRA_DY $1, yvec9, xvec1;
  868. EXTRA_DY $1, yvec8, xvec0;
  869. #ifndef TRMMKERNEL
  870. LDL_DY 0*SIZE(C1, ldc, 1), xvec5, xvec5;
  871. LDH_DY 1*SIZE(C1, ldc, 1), xvec5, xvec5;
  872. LDL_DY 2*SIZE(C1), xvec4, xvec4;
  873. LDH_DY 3*SIZE(C1), xvec4, xvec4;
  874. LDL_DY 4*SIZE(C1, ldc, 1), xvec3, xvec3;
  875. LDH_DY 5*SIZE(C1, ldc, 1), xvec3, xvec3;
  876. LDL_DY 6*SIZE(C1), xvec2, xvec2;
  877. LDH_DY 7*SIZE(C1), xvec2, xvec2;
  878. ADD_DY xvec5, xvec9, xvec9;
  879. ADD_DY xvec4, xvec1, xvec1;
  880. ADD_DY xvec3, xvec8, xvec8;
  881. ADD_DY xvec2, xvec0, xvec0;
  882. #endif
  883. STL_DY xvec9, 0*SIZE(C1, ldc, 1);
  884. STH_DY xvec9, 1*SIZE(C1, ldc, 1);
  885. STL_DY xvec1, 2*SIZE(C1);
  886. STH_DY xvec1, 3*SIZE(C1);
  887. STL_DY xvec8, 4*SIZE(C1, ldc, 1);
  888. STH_DY xvec8, 5*SIZE(C1, ldc, 1);
  889. STL_DY xvec0, 6*SIZE(C1);
  890. STH_DY xvec0, 7*SIZE(C1);
  891. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  892. MOVQ bk, %rax;
  893. SUBQ kkk, %rax;
  894. SALQ $ZBASE_SHIFT, %rax;
  895. LEAQ (ptrba, %rax, 4), ptrba;
  896. LEAQ (ptrbb, %rax, 4), ptrbb;
  897. #endif
  898. #if defined(TRMMKERNEL) && defined(LEFT)
  899. ADDQ $4, kk;
  900. #endif
  901. ADDQ $8*SIZE, C0;
  902. ADDQ $8*SIZE, C1;
  903. DECQ i;
  904. JG .L1_bodyB;
  905. ALIGN_5;
  906. .L1_loopE:;
  907. TEST $2, bm;
  908. JLE .L5_loopE;
  909. ALIGN_5
  910. .L5_bodyB:
  911. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  912. MOVQ bb,ptrbb;
  913. #else
  914. MOVQ bb, ptrbb;
  915. MOVQ kk, %rax;
  916. SALQ $ZBASE_SHIFT, %rax;
  917. LEAQ (ptrba, %rax, 2), ptrba;
  918. LEAQ (ptrbb, %rax, 4), ptrbb;
  919. #endif
  920. XOR_DY yvec15, yvec15, yvec15;
  921. XOR_DY yvec14, yvec14, yvec14;
  922. XOR_DY yvec13, yvec13, yvec13;
  923. XOR_DY yvec12, yvec12, yvec12;
  924. #ifndef TRMMKERNEL
  925. MOVQ bk,k;
  926. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  927. MOVQ bk, %rax;
  928. SUBQ kk, %rax;
  929. MOVQ %rax, kkk;
  930. #else
  931. MOVQ kk, %rax;
  932. #ifdef LEFT
  933. ADDQ $2, %rax;
  934. #else
  935. ADDQ $4, %rax;
  936. #endif
  937. MOVQ %rax, kkk;
  938. #endif
  939. SARQ $2, k;
  940. JLE .L7_loopE;
  941. ALIGN_5
  942. .L7_bodyB:
  943. #### Compute kernel ####
  944. #### Unroll times 1 ####
  945. LD_DY 0*SIZE(ptrba), yvec0;
  946. EDUP_DY 0*SIZE(ptrbb), yvec2;
  947. EDUP_DY 4*SIZE(ptrbb), yvec3;
  948. MUL_DY yvec0, yvec2, yvec6;
  949. ADD1_DY yvec6, yvec15, yvec15;
  950. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  951. MUL_DY yvec0, yvec3, yvec7;
  952. ADD1_DY yvec7, yvec14, yvec14;
  953. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  954. MUL_DY yvec0, yvec4, yvec6;
  955. ADD1_DY yvec6, yvec13, yvec13;
  956. EDUP_DY 1*SIZE(ptrbb), yvec2;
  957. MUL_DY yvec0, yvec5, yvec7;
  958. ADD1_DY yvec7 ,yvec12, yvec12;
  959. EDUP_DY 5*SIZE(ptrbb), yvec3
  960. VPERMILP_DY $0x05, yvec0, yvec0;
  961. MUL_DY yvec0, yvec2, yvec6;
  962. ADD2_DY yvec6, yvec15, yvec15;
  963. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  964. MUL_DY yvec0, yvec3, yvec7;
  965. ADD2_DY yvec7, yvec14, yvec14;
  966. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  967. MUL_DY yvec0, yvec4, yvec6;
  968. ADD2_DY yvec6, yvec13, yvec13;
  969. MUL_DY yvec0, yvec5, yvec7;
  970. ADD2_DY yvec7, yvec12, yvec12;
  971. #### Unroll time 2 ####
  972. LD_DY 4*SIZE(ptrba), yvec0;
  973. EDUP_DY 8*SIZE(ptrbb), yvec2;
  974. EDUP_DY 12*SIZE(ptrbb), yvec3;
  975. MUL_DY yvec0, yvec2, yvec6;
  976. ADD1_DY yvec6, yvec15, yvec15;
  977. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  978. MUL_DY yvec0, yvec3, yvec7;
  979. ADD1_DY yvec7, yvec14, yvec14;
  980. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  981. MUL_DY yvec0, yvec4, yvec6;
  982. ADD1_DY yvec6, yvec13, yvec13;
  983. EDUP_DY 9*SIZE(ptrbb), yvec2;
  984. MUL_DY yvec0, yvec5, yvec7;
  985. ADD1_DY yvec7 ,yvec12, yvec12;
  986. EDUP_DY 13*SIZE(ptrbb), yvec3
  987. VPERMILP_DY $0x05, yvec0, yvec0;
  988. MUL_DY yvec0, yvec2, yvec6;
  989. ADD2_DY yvec6, yvec15, yvec15;
  990. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  991. MUL_DY yvec0, yvec3, yvec7;
  992. ADD2_DY yvec7, yvec14, yvec14;
  993. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  994. MUL_DY yvec0, yvec4, yvec6;
  995. ADD2_DY yvec6, yvec13, yvec13;
  996. MUL_DY yvec0, yvec5, yvec7;
  997. ADD2_DY yvec7, yvec12, yvec12;
  998. #### Unroll time 3 ####
  999. LD_DY 8*SIZE(ptrba), yvec0;
  1000. EDUP_DY 16*SIZE(ptrbb), yvec2;
  1001. EDUP_DY 20*SIZE(ptrbb), yvec3;
  1002. MUL_DY yvec0, yvec2, yvec6;
  1003. ADD1_DY yvec6, yvec15, yvec15;
  1004. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  1005. MUL_DY yvec0, yvec3, yvec7;
  1006. ADD1_DY yvec7, yvec14, yvec14;
  1007. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  1008. MUL_DY yvec0, yvec4, yvec6;
  1009. ADD1_DY yvec6, yvec13, yvec13;
  1010. EDUP_DY 17*SIZE(ptrbb), yvec2;
  1011. MUL_DY yvec0, yvec5, yvec7;
  1012. ADD1_DY yvec7 ,yvec12, yvec12;
  1013. EDUP_DY 21*SIZE(ptrbb), yvec3
  1014. VPERMILP_DY $0x05, yvec0, yvec0;
  1015. MUL_DY yvec0, yvec2, yvec6;
  1016. ADD2_DY yvec6, yvec15, yvec15;
  1017. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  1018. MUL_DY yvec0, yvec3, yvec7;
  1019. ADD2_DY yvec7, yvec14, yvec14;
  1020. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  1021. MUL_DY yvec0, yvec4, yvec6;
  1022. ADD2_DY yvec6, yvec13, yvec13;
  1023. MUL_DY yvec0, yvec5, yvec7;
  1024. ADD2_DY yvec7, yvec12, yvec12;
  1025. #### Unroll time 4 ####
  1026. LD_DY 12*SIZE(ptrba), yvec0;
  1027. EDUP_DY 24*SIZE(ptrbb), yvec2;
  1028. EDUP_DY 28*SIZE(ptrbb), yvec3;
  1029. MUL_DY yvec0, yvec2, yvec6;
  1030. ADD1_DY yvec6, yvec15, yvec15;
  1031. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  1032. MUL_DY yvec0, yvec3, yvec7;
  1033. ADD1_DY yvec7, yvec14, yvec14;
  1034. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  1035. MUL_DY yvec0, yvec4, yvec6;
  1036. ADD1_DY yvec6, yvec13, yvec13;
  1037. EDUP_DY 25*SIZE(ptrbb), yvec2;
  1038. MUL_DY yvec0, yvec5, yvec7;
  1039. ADD1_DY yvec7 ,yvec12, yvec12;
  1040. EDUP_DY 29*SIZE(ptrbb), yvec3
  1041. VPERMILP_DY $0x05, yvec0, yvec0;
  1042. MUL_DY yvec0, yvec2, yvec6;
  1043. ADD2_DY yvec6, yvec15, yvec15;
  1044. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  1045. MUL_DY yvec0, yvec3, yvec7;
  1046. ADD2_DY yvec7, yvec14, yvec14;
  1047. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  1048. MUL_DY yvec0, yvec4, yvec6;
  1049. ADD2_DY yvec6, yvec13, yvec13;
  1050. ADDQ $16*SIZE, ptrba;
  1051. MUL_DY yvec0, yvec5, yvec7;
  1052. ADD2_DY yvec7, yvec12, yvec12;
  1053. ADDQ $32*SIZE, ptrbb;
  1054. DECQ k;
  1055. JG .L7_bodyB;
  1056. ALIGN_5
  1057. .L7_loopE:
  1058. #ifndef TRMMKERNEL
  1059. TEST $2, bk;
  1060. #else
  1061. TEST $2, kkk;
  1062. #endif
  1063. JLE .L8_loopE;
  1064. ALIGN_5
  1065. .L8_bodyB:
  1066. #### Unroll times 1 ####
  1067. LD_DY 0*SIZE(ptrba), yvec0;
  1068. EDUP_DY 0*SIZE(ptrbb), yvec2;
  1069. EDUP_DY 4*SIZE(ptrbb), yvec3;
  1070. MUL_DY yvec0, yvec2, yvec6;
  1071. ADD1_DY yvec6, yvec15, yvec15;
  1072. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  1073. MUL_DY yvec0, yvec3, yvec7;
  1074. ADD1_DY yvec7, yvec14, yvec14;
  1075. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  1076. MUL_DY yvec0, yvec4, yvec6;
  1077. ADD1_DY yvec6, yvec13, yvec13;
  1078. EDUP_DY 1*SIZE(ptrbb), yvec2;
  1079. MUL_DY yvec0, yvec5, yvec7;
  1080. ADD1_DY yvec7 ,yvec12, yvec12;
  1081. EDUP_DY 5*SIZE(ptrbb), yvec3
  1082. VPERMILP_DY $0x05, yvec0, yvec0;
  1083. MUL_DY yvec0, yvec2, yvec6;
  1084. ADD2_DY yvec6, yvec15, yvec15;
  1085. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  1086. MUL_DY yvec0, yvec3, yvec7;
  1087. ADD2_DY yvec7, yvec14, yvec14;
  1088. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  1089. MUL_DY yvec0, yvec4, yvec6;
  1090. ADD2_DY yvec6, yvec13, yvec13;
  1091. MUL_DY yvec0, yvec5, yvec7;
  1092. ADD2_DY yvec7, yvec12, yvec12;
  1093. #### Unroll time 2 ####
  1094. LD_DY 4*SIZE(ptrba), yvec0;
  1095. EDUP_DY 8*SIZE(ptrbb), yvec2;
  1096. EDUP_DY 12*SIZE(ptrbb), yvec3;
  1097. MUL_DY yvec0, yvec2, yvec6;
  1098. ADD1_DY yvec6, yvec15, yvec15;
  1099. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  1100. MUL_DY yvec0, yvec3, yvec7;
  1101. ADD1_DY yvec7, yvec14, yvec14;
  1102. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  1103. MUL_DY yvec0, yvec4, yvec6;
  1104. ADD1_DY yvec6, yvec13, yvec13;
  1105. EDUP_DY 9*SIZE(ptrbb), yvec2;
  1106. MUL_DY yvec0, yvec5, yvec7;
  1107. ADD1_DY yvec7 ,yvec12, yvec12;
  1108. EDUP_DY 13*SIZE(ptrbb), yvec3
  1109. VPERMILP_DY $0x05, yvec0, yvec0;
  1110. MUL_DY yvec0, yvec2, yvec6;
  1111. ADD2_DY yvec6, yvec15, yvec15;
  1112. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  1113. MUL_DY yvec0, yvec3, yvec7;
  1114. ADD2_DY yvec7, yvec14, yvec14;
  1115. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  1116. MUL_DY yvec0, yvec4, yvec6;
  1117. ADD2_DY yvec6, yvec13, yvec13;
  1118. ADDQ $8*SIZE, ptrba;
  1119. MUL_DY yvec0, yvec5, yvec7;
  1120. ADD2_DY yvec7, yvec12, yvec12;
  1121. ADDQ $16*SIZE, ptrbb;
  1122. .L8_loopE:
  1123. #ifndef TRMMKERNEL
  1124. TEST $1, bk;
  1125. #else
  1126. TEST $1, kkk;
  1127. #endif
  1128. JLE .L9_loopE;
  1129. ALIGN_5
  1130. .L9_bodyB:
  1131. #### Unroll times 1 ####
  1132. LD_DY 0*SIZE(ptrba), yvec0;
  1133. EDUP_DY 0*SIZE(ptrbb), yvec2;
  1134. EDUP_DY 4*SIZE(ptrbb), yvec3;
  1135. MUL_DY yvec0, yvec2, yvec6;
  1136. ADD1_DY yvec6, yvec15, yvec15;
  1137. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  1138. MUL_DY yvec0, yvec3, yvec7;
  1139. ADD1_DY yvec7, yvec14, yvec14;
  1140. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  1141. MUL_DY yvec0, yvec4, yvec6;
  1142. ADD1_DY yvec6, yvec13, yvec13;
  1143. EDUP_DY 1*SIZE(ptrbb), yvec2;
  1144. MUL_DY yvec0, yvec5, yvec7;
  1145. ADD1_DY yvec7 ,yvec12, yvec12;
  1146. EDUP_DY 5*SIZE(ptrbb), yvec3
  1147. VPERMILP_DY $0x05, yvec0, yvec0;
  1148. MUL_DY yvec0, yvec2, yvec6;
  1149. ADD2_DY yvec6, yvec15, yvec15;
  1150. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  1151. MUL_DY yvec0, yvec3, yvec7;
  1152. ADD2_DY yvec7, yvec14, yvec14;
  1153. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  1154. MUL_DY yvec0, yvec4, yvec6;
  1155. ADD2_DY yvec6, yvec13, yvec13;
  1156. MUL_DY yvec0, yvec5, yvec7;
  1157. ADD2_DY yvec7, yvec12, yvec12;
  1158. ADDQ $4*SIZE, ptrba;
  1159. ADDQ $8*SIZE, ptrbb;
  1160. .L9_loopE:
  1161. #### Handle ####
  1162. XOR_DY yvec7, yvec7, yvec7;
  1163. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1164. ADDSUB_DY yvec15, yvec7, yvec15;
  1165. ADDSUB_DY yvec14, yvec7, yvec14;
  1166. ADDSUB_DY yvec13, yvec7, yvec13;
  1167. ADDSUB_DY yvec12, yvec7, yvec12;
  1168. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1169. SUB_DY yvec15, yvec7, yvec15;
  1170. SUB_DY yvec14, yvec7, yvec14;
  1171. SUB_DY yvec13, yvec7, yvec13;
  1172. SUB_DY yvec12, yvec7, yvec12;
  1173. #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1174. VPERMILP_DY $0x05, yvec15, yvec15;
  1175. VPERMILP_DY $0x05, yvec14, yvec14;
  1176. VPERMILP_DY $0x05, yvec13, yvec13;
  1177. VPERMILP_DY $0x05, yvec12, yvec12;
  1178. ADDSUB_DY yvec15, yvec7, yvec15;
  1179. ADDSUB_DY yvec14, yvec7, yvec14;
  1180. ADDSUB_DY yvec13, yvec7, yvec13;
  1181. ADDSUB_DY yvec12, yvec7, yvec12;
  1182. VPERMILP_DY $0x05, yvec15, yvec15;
  1183. VPERMILP_DY $0x05, yvec14, yvec14;
  1184. VPERMILP_DY $0x05, yvec13, yvec13;
  1185. VPERMILP_DY $0x05, yvec12, yvec12;
  1186. #endif
  1187. #### Load Alpha ####
  1188. BROAD_DY MEMALPHA_R, yvec7;
  1189. BROAD_DY MEMALPHA_I, yvec6;
  1190. #### Multiply Alpha ####
  1191. VPERMILP_DY $0x05, yvec15, yvec5;
  1192. MUL_DY yvec7, yvec15, yvec15;
  1193. MUL_DY yvec6, yvec5, yvec5;
  1194. ADD2_DY yvec5, yvec15, yvec15;
  1195. VPERMILP_DY $0x05, yvec14, yvec4;
  1196. MUL_DY yvec7, yvec14, yvec14;
  1197. MUL_DY yvec6, yvec4, yvec4;
  1198. ADD2_DY yvec4, yvec14, yvec14;
  1199. VPERMILP_DY $0x05, yvec13, yvec3;
  1200. MUL_DY yvec7, yvec13, yvec13;
  1201. MUL_DY yvec6, yvec3, yvec3;
  1202. ADD2_DY yvec3, yvec13, yvec13;
  1203. VPERMILP_DY $0x05,yvec12, yvec2;
  1204. MUL_DY yvec7, yvec12, yvec12;
  1205. MUL_DY yvec6, yvec2, yvec2;
  1206. ADD2_DY yvec2, yvec12, yvec12;
  1207. #### Testing Alignment ####
  1208. MOVQ C0, %rax;
  1209. OR ldc, %rax;
  1210. TEST $15, %rax;
  1211. JNE .L9_loopEx;
  1212. ALIGN_5
  1213. #### Writing back ####
  1214. EXTRA_DY $1, yvec15, xvec7;
  1215. EXTRA_DY $1, yvec14, xvec6;
  1216. EXTRA_DY $1, yvec13, xvec5;
  1217. EXTRA_DY $1, yvec12, xvec4;
  1218. #ifndef TRMMKERNEL
  1219. ADD_DX 0*SIZE(C0), xvec15, xvec15;
  1220. ADD_DX 2*SIZE(C0, ldc, 1), xvec7, xvec7;
  1221. ADD_DX 0*SIZE(C0, ldc, 1), xvec13, xvec13;
  1222. ADD_DX 2*SIZE(C0), xvec5, xvec5;
  1223. ADD_DX 0*SIZE(C1), xvec14, xvec14;
  1224. ADD_DX 2*SIZE(C1, ldc, 1), xvec6, xvec6;
  1225. ADD_DX 0*SIZE(C1, ldc, 1), xvec12, xvec12;
  1226. ADD_DX 2*SIZE(C1), xvec4, xvec4;
  1227. #endif
  1228. ST_DX xvec15, 0*SIZE(C0);
  1229. ST_DX xvec7, 2*SIZE(C0, ldc, 1);
  1230. ST_DX xvec13, 0*SIZE(C0, ldc, 1);
  1231. ST_DX xvec5, 2*SIZE(C0);
  1232. ST_DX xvec14, 0*SIZE(C1);
  1233. ST_DX xvec6, 2*SIZE(C1, ldc, 1);
  1234. ST_DX xvec12, 0*SIZE(C1, ldc, 1);
  1235. ST_DX xvec4, 2*SIZE(C1);
  1236. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1237. MOVQ bk, %rax;
  1238. SUBQ kkk, %rax;
  1239. SALQ $ZBASE_SHIFT, %rax;
  1240. LEAQ (ptrba, %rax, 2), ptrba;
  1241. LEAQ (ptrbb, %rax, 4), ptrbb;
  1242. #endif
  1243. #if defined(TRMMKERNEL) && defined(LEFT)
  1244. ADDQ $2, kk;
  1245. #endif
  1246. ADDQ $4*SIZE, C0;
  1247. ADDQ $4*SIZE, C1;
  1248. JMP .L5_loopE;
  1249. ALIGN_5
  1250. .L9_loopEx:
  1251. EXTRA_DY $1, yvec15, xvec7;
  1252. EXTRA_DY $1, yvec14, xvec6;
  1253. EXTRA_DY $1, yvec13, xvec5;
  1254. EXTRA_DY $1, yvec12, xvec4;
  1255. #ifndef TRMMKERNEL
  1256. LDL_DX 0*SIZE(C0), xvec0, xvec0;
  1257. LDH_DX 1*SIZE(C0), xvec0, xvec0;
  1258. LDL_DX 2*SIZE(C0, ldc, 1), xvec1, xvec1;
  1259. LDH_DX 3*SIZE(C0, ldc, 1), xvec1, xvec1;
  1260. LDL_DX 0*SIZE(C0, ldc, 1), xvec2, xvec2;
  1261. LDH_DX 1*SIZE(C0, ldc, 1), xvec2, xvec2;
  1262. LDL_DX 2*SIZE(C0), xvec3, xvec3;
  1263. LDH_DX 3*SIZE(C0), xvec3, xvec3;
  1264. ADD_DX xvec0, xvec15, xvec15;
  1265. ADD_DX xvec1, xvec7, xvec7;
  1266. ADD_DX xvec2, xvec13, xvec13;
  1267. ADD_DX xvec3, xvec5, xvec5;
  1268. #endif
  1269. STL_DX xvec15, 0*SIZE(C0);
  1270. STH_DX xvec15, 1*SIZE(C0);
  1271. STL_DX xvec7, 2*SIZE(C0, ldc, 1);
  1272. STH_DX xvec7, 3*SIZE(C0, ldc, 1);
  1273. STL_DX xvec13, 0*SIZE(C0, ldc, 1);
  1274. STH_DX xvec13, 1*SIZE(C0, ldc, 1);
  1275. STL_DX xvec5, 2*SIZE(C0);
  1276. STH_DX xvec5, 3*SIZE(C0);
  1277. #ifndef TRMMKERNEL
  1278. LDL_DX 0*SIZE(C1), xvec0, xvec0;
  1279. LDH_DX 1*SIZE(C1), xvec0, xvec0;
  1280. LDL_DX 2*SIZE(C1, ldc, 1), xvec1, xvec1;
  1281. LDH_DX 3*SIZE(C1, ldc, 1), xvec1, xvec1;
  1282. LDL_DX 0*SIZE(C1, ldc, 1), xvec2, xvec2;
  1283. LDH_DX 1*SIZE(C1, ldc, 1), xvec2, xvec2;
  1284. LDL_DX 2*SIZE(C1), xvec3, xvec3;
  1285. LDH_DX 3*SIZE(C1), xvec3, xvec3;
  1286. ADD_DX xvec0, xvec14, xvec14;
  1287. ADD_DX xvec1, xvec6, xvec6;
  1288. ADD_DX xvec2, xvec12, xvec12;
  1289. ADD_DX xvec3, xvec4, xvec4;
  1290. #endif
  1291. STL_DX xvec14, 0*SIZE(C1);
  1292. STH_DX xvec14, 1*SIZE(C1);
  1293. STL_DX xvec6, 2*SIZE(C1, ldc, 1);
  1294. STH_DX xvec6, 3*SIZE(C1, ldc, 1);
  1295. STL_DX xvec12, 0*SIZE(C1, ldc, 1);
  1296. STH_DX xvec12, 1*SIZE(C1, ldc, 1);
  1297. STL_DX xvec4, 2*SIZE(C1);
  1298. STH_DX xvec4, 3*SIZE(C1);
  1299. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1300. MOVQ bk, %rax;
  1301. SUBQ kkk, %rax;
  1302. SALQ $ZBASE_SHIFT, %rax;
  1303. LEAQ (ptrba, %rax, 2), ptrba;
  1304. LEAQ (ptrbb, %rax, 4), ptrbb;
  1305. #endif
  1306. #if defined(TRMMKERNEL) && defined(LEFT)
  1307. ADDQ $2, kk;
  1308. #endif
  1309. ADDQ $4*SIZE, C0;
  1310. ADDQ $4*SIZE, C1;
  1311. .L5_loopE:
  1312. TEST $1, bm;
  1313. JLE .L6_loopE;
  1314. ALIGN_5
  1315. .L6_bodyB:
  1316. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  1317. MOVQ bb,ptrbb;
  1318. #else
  1319. MOVQ bb, ptrbb;
  1320. MOVQ kk, %rax;
  1321. SALQ $ZBASE_SHIFT, %rax;
  1322. ADDQ %rax, ptrba;
  1323. LEAQ (ptrbb, %rax, 4), ptrbb;
  1324. #endif
  1325. XOR_DY yvec15, yvec15, yvec15;
  1326. XOR_DY yvec14, yvec14, yvec14;
  1327. #ifndef TRMMKERNEL
  1328. MOVQ bk,k;
  1329. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  1330. MOVQ bk, %rax;
  1331. SUBQ kk, %rax;
  1332. MOVQ %rax, kkk;
  1333. #else
  1334. MOVQ kk, %rax;
  1335. #ifdef LEFT
  1336. ADDQ $1, %rax;
  1337. #else
  1338. ADDQ $4, %rax;
  1339. #endif
  1340. MOVQ %rax, kkk;
  1341. #endif
  1342. SARQ $2, k;
  1343. JLE .L10_loopE;
  1344. ALIGN_5
  1345. .L10_bodyB:
  1346. LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i
  1347. EDUP_DY 0*SIZE(ptrbb), yvec2;
  1348. EDUP_DY 4*SIZE(ptrbb), yvec3;
  1349. SHUF_DY $0x20, yvec0, yvec0, yvec1;
  1350. MUL_DY yvec1, yvec2, yvec6;
  1351. ADD1_DY yvec6, yvec15, yvec15;
  1352. MUL_DY yvec1, yvec3, yvec7;
  1353. ADD1_DY yvec7, yvec14, yvec14;
  1354. VPERMILP_DY $0x05, yvec1, yvec4;
  1355. EDUP_DY 1*SIZE(ptrbb), yvec2;
  1356. EDUP_DY 5*SIZE(ptrbb), yvec3;
  1357. MUL_DY yvec4, yvec2, yvec6;
  1358. ADD2_DY yvec6, yvec15, yvec15;
  1359. MUL_DY yvec4, yvec3, yvec7;
  1360. ADD2_DY yvec7, yvec14, yvec14;
  1361. SHUF_DY $0x31, yvec0, yvec0, yvec1;
  1362. EDUP_DY 8*SIZE(ptrbb), yvec2;
  1363. EDUP_DY 12*SIZE(ptrbb), yvec3;
  1364. MUL_DY yvec1, yvec2, yvec6;
  1365. ADD1_DY yvec6, yvec15, yvec15;
  1366. MUL_DY yvec1, yvec3, yvec7;
  1367. ADD1_DY yvec7, yvec14, yvec14;
  1368. VPERMILP_DY $0x05, yvec1, yvec4;
  1369. EDUP_DY 9*SIZE(ptrbb), yvec2;
  1370. EDUP_DY 13*SIZE(ptrbb), yvec3;
  1371. MUL_DY yvec4, yvec2, yvec6;
  1372. ADD2_DY yvec6, yvec15, yvec15;
  1373. MUL_DY yvec4, yvec3, yvec7;
  1374. ADD2_DY yvec7, yvec14, yvec14;
  1375. LD_DY 4*SIZE(ptrba), yvec0;
  1376. EDUP_DY 16*SIZE(ptrbb), yvec2;
  1377. EDUP_DY 20*SIZE(ptrbb), yvec3;
  1378. SHUF_DY $0x20, yvec0, yvec0, yvec1;
  1379. MUL_DY yvec1, yvec2, yvec6;
  1380. ADD1_DY yvec6, yvec15, yvec15;
  1381. MUL_DY yvec1, yvec3, yvec7;
  1382. ADD1_DY yvec7, yvec14, yvec14;
  1383. VPERMILP_DY $0x05, yvec1, yvec4;
  1384. EDUP_DY 17*SIZE(ptrbb), yvec2;
  1385. EDUP_DY 21*SIZE(ptrbb), yvec3;
  1386. MUL_DY yvec4, yvec2, yvec6;
  1387. ADD2_DY yvec6, yvec15, yvec15;
  1388. MUL_DY yvec4, yvec3, yvec7;
  1389. ADD2_DY yvec7, yvec14, yvec14;
  1390. SHUF_DY $0x31, yvec0, yvec0, yvec1;
  1391. EDUP_DY 24*SIZE(ptrbb), yvec2;
  1392. EDUP_DY 28*SIZE(ptrbb), yvec3;
  1393. MUL_DY yvec1, yvec2, yvec6;
  1394. ADD1_DY yvec6, yvec15, yvec15;
  1395. MUL_DY yvec1, yvec3, yvec7;
  1396. ADD1_DY yvec7, yvec14, yvec14;
  1397. VPERMILP_DY $0x05, yvec1, yvec4;
  1398. EDUP_DY 25*SIZE(ptrbb), yvec2;
  1399. EDUP_DY 29*SIZE(ptrbb), yvec3;
  1400. MUL_DY yvec4, yvec2, yvec6;
  1401. ADD2_DY yvec6, yvec15, yvec15;
  1402. MUL_DY yvec4, yvec3, yvec7;
  1403. ADD2_DY yvec7, yvec14, yvec14
  1404. ADDQ $8*SIZE, ptrba;
  1405. ADDQ $32*SIZE, ptrbb;
  1406. DECQ k;
  1407. JG .L10_bodyB;
  1408. ALIGN_5
  1409. .L10_loopE:
  1410. #ifndef TRMMKERNEL
  1411. TEST $2, bk;
  1412. #else
  1413. TEST $2, kkk;
  1414. #endif
  1415. JLE .L11_loopE;
  1416. ALIGN_5
  1417. .L11_bodyB:
  1418. LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i
  1419. EDUP_DY 0*SIZE(ptrbb), yvec2;
  1420. EDUP_DY 4*SIZE(ptrbb), yvec3;
  1421. SHUF_DY $0x20, yvec0, yvec0, yvec1;
  1422. MUL_DY yvec1, yvec2, yvec6;
  1423. ADD1_DY yvec6, yvec15, yvec15;
  1424. MUL_DY yvec1, yvec3, yvec7;
  1425. ADD1_DY yvec7, yvec14, yvec14;
  1426. VPERMILP_DY $0x05, yvec1, yvec4;
  1427. EDUP_DY 1*SIZE(ptrbb), yvec2;
  1428. EDUP_DY 5*SIZE(ptrbb), yvec3;
  1429. MUL_DY yvec4, yvec2, yvec6;
  1430. ADD2_DY yvec6, yvec15, yvec15;
  1431. MUL_DY yvec4, yvec3, yvec7;
  1432. ADD2_DY yvec7, yvec14, yvec14;
  1433. SHUF_DY $0x31, yvec0, yvec0, yvec1;
  1434. EDUP_DY 8*SIZE(ptrbb), yvec2;
  1435. EDUP_DY 12*SIZE(ptrbb), yvec3;
  1436. MUL_DY yvec1, yvec2, yvec6;
  1437. ADD1_DY yvec6, yvec15, yvec15;
  1438. MUL_DY yvec1, yvec3, yvec7;
  1439. ADD1_DY yvec7, yvec14, yvec14;
  1440. VPERMILP_DY $0x05, yvec1, yvec4;
  1441. EDUP_DY 9*SIZE(ptrbb), yvec2;
  1442. EDUP_DY 13*SIZE(ptrbb), yvec3;
  1443. MUL_DY yvec4, yvec2, yvec6;
  1444. ADD2_DY yvec6, yvec15, yvec15;
  1445. MUL_DY yvec4, yvec3, yvec7;
  1446. ADD2_DY yvec7, yvec14, yvec14;
  1447. ADDQ $4*SIZE, ptrba;
  1448. ADDQ $16*SIZE, ptrbb;
  1449. .L11_loopE:
  1450. #ifndef TRMMKERNEL
  1451. TEST $1, bk;
  1452. #else
  1453. TEST $1, kkk;
  1454. #endif
  1455. JLE .L12_loopE;
  1456. ALIGN_5
  1457. .L12_bodyB:
  1458. LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i
  1459. EDUP_DY 0*SIZE(ptrbb), yvec2;
  1460. EDUP_DY 4*SIZE(ptrbb), yvec3;
  1461. SHUF_DY $0x20, yvec0, yvec0, yvec1;
  1462. MUL_DY yvec1, yvec2, yvec6;
  1463. ADD1_DY yvec6, yvec15, yvec15;
  1464. MUL_DY yvec1, yvec3, yvec7;
  1465. ADD1_DY yvec7, yvec14, yvec14;
  1466. VPERMILP_DY $0x05, yvec1, yvec4;
  1467. EDUP_DY 1*SIZE(ptrbb), yvec2;
  1468. EDUP_DY 5*SIZE(ptrbb), yvec3;
  1469. MUL_DY yvec4, yvec2, yvec6;
  1470. ADD2_DY yvec6, yvec15, yvec15;
  1471. MUL_DY yvec4, yvec3, yvec7;
  1472. ADD2_DY yvec7, yvec14, yvec14;
  1473. ADDQ $2*SIZE, ptrba;
  1474. ADDQ $8*SIZE, ptrbb;
  1475. .L12_loopE:
  1476. #### Handle ####
  1477. XOR_DY yvec7, yvec7, yvec7;
  1478. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1479. ADDSUB_DY yvec15, yvec7, yvec15;
  1480. ADDSUB_DY yvec14, yvec7, yvec14;
  1481. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1482. SUB_DY yvec15, yvec7, yvec15;
  1483. SUB_DY yvec14, yvec7, yvec14;
  1484. #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1485. VPERMILP_DY $0x05, yvec15, yvec15;
  1486. VPERMILP_DY $0x05, yvec14, yvec14;
  1487. ADDSUB_DY yvec15, yvec7, yvec15;
  1488. ADDSUB_DY yvec14, yvec7, yvec14;
  1489. VPERMILP_DY $0x05, yvec15, yvec15;
  1490. VPERMILP_DY $0x05, yvec14, yvec14;
  1491. #endif
  1492. #### Multiply Alpha ####
  1493. BROAD_DY MEMALPHA_R, yvec7;
  1494. BROAD_DY MEMALPHA_I, yvec6;
  1495. VPERMILP_DY $0x05, yvec15, yvec5;
  1496. MUL_DY yvec7, yvec15, yvec15;
  1497. MUL_DY yvec6, yvec5, yvec5;
  1498. ADD2_DY yvec5, yvec15, yvec15;
  1499. VPERMILP_DY $0x05, yvec14, yvec4;
  1500. MUL_DY yvec7, yvec14, yvec14;
  1501. MUL_DY yvec6, yvec4, yvec4;
  1502. ADD2_DY yvec4, yvec14, yvec14;
  1503. #### Writing Back ####
  1504. EXTRA_DY $1, yvec15, xvec7;
  1505. EXTRA_DY $1, yvec14, xvec6;
  1506. #ifndef TRMMKERNEL
  1507. LDL_DX 0*SIZE(C0), xvec0, xvec0;
  1508. LDH_DX 1*SIZE(C0), xvec0, xvec0;
  1509. LDL_DX 0*SIZE(C0, ldc, 1), xvec1, xvec1;
  1510. LDH_DX 1*SIZE(C0, ldc, 1), xvec1, xvec1;
  1511. LDL_DX 0*SIZE(C1), xvec2, xvec2;
  1512. LDH_DX 1*SIZE(C1), xvec2, xvec2;
  1513. LDL_DX 0*SIZE(C1, ldc, 1), xvec3, xvec3;
  1514. LDH_DX 1*SIZE(C1, ldc, 1), xvec3, xvec3;
  1515. ADD_DX xvec0, xvec15, xvec15;
  1516. ADD_DX xvec1, xvec7, xvec7;
  1517. ADD_DX xvec2, xvec14, xvec14;
  1518. ADD_DX xvec3, xvec6, xvec6;
  1519. #endif
  1520. STL_DX xvec15, 0*SIZE(C0);
  1521. STH_DX xvec15, 1*SIZE(C0);
  1522. STL_DX xvec7, 0*SIZE(C0, ldc, 1);
  1523. STH_DX xvec7, 1*SIZE(C0, ldc, 1);
  1524. STL_DX xvec14, 0*SIZE(C1);
  1525. STH_DX xvec14, 1*SIZE(C1);
  1526. STL_DX xvec6, 0*SIZE(C1, ldc, 1);
  1527. STH_DX xvec6, 1*SIZE(C1, ldc, 1);
  1528. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1529. MOVQ bk, %rax;
  1530. SUBQ kkk, %rax;
  1531. SALQ $ZBASE_SHIFT, %rax;
  1532. ADDQ %rax, ptrba;
  1533. LEAQ (ptrbb, %rax, 4), ptrbb;
  1534. #endif
  1535. #if defined(TRMMKERNEL) && defined(LEFT)
  1536. ADDQ $1, kk;
  1537. #endif
  1538. ADDQ $2*SIZE, C0;
  1539. ADDQ $2*SIZE, C1;
  1540. .L6_loopE:
  1541. #if defined(TRMMKERNEL) && !defined(LEFT)
  1542. ADDQ $4, kk;
  1543. #endif
  1544. MOVQ bk,k;
  1545. SALQ $6,k;
  1546. ADDQ k,bb;
  1547. LEAQ (C,ldc,4),C;
  1548. .L0_bodyE:;
  1549. DECQ j;
  1550. JG .L0_bodyB;
  1551. ALIGN_5;
  1552. .L0_loopE:;
  1553. TEST $2, bn;
  1554. JLE .L20_loopE;
  1555. ALIGN_5
  1556. .L20_bodyB:
  1557. #if defined(TRMMKERNEL) && defined(LEFT)
  1558. MOVQ OFFSET, %rax;
  1559. MOVQ %rax, kk;
  1560. #endif
  1561. MOVQ C, C0;
  1562. LEAQ (C, ldc, 1), C1;
  1563. MOVQ ba, ptrba;
  1564. MOVQ bm, i;
  1565. SARQ $2, i;
  1566. JLE .L21_loopE;
  1567. ALIGN_5
  1568. .L21_bodyB:
  1569. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  1570. MOVQ bb,ptrbb;
  1571. #else
  1572. MOVQ bb, ptrbb;
  1573. MOVQ kk, %rax;
  1574. SALQ $ZBASE_SHIFT, %rax;
  1575. LEAQ (ptrba, %rax, 4), ptrba;
  1576. LEAQ (ptrbb, %rax, 2), ptrbb;
  1577. #endif
  1578. XOR_DY yvec15, yvec15, yvec15;
  1579. XOR_DY yvec14, yvec14, yvec14;
  1580. XOR_DY yvec13, yvec13, yvec13;
  1581. XOR_DY yvec12, yvec12, yvec12;
  1582. #ifndef TRMMKERNEL
  1583. MOVQ bk,k;
  1584. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  1585. MOVQ bk, %rax;
  1586. SUBQ kk, %rax;
  1587. MOVQ %rax, kkk;
  1588. #else
  1589. MOVQ kk, %rax;
  1590. #ifdef LEFT
  1591. ADDQ $4, %rax;
  1592. #else
  1593. ADDQ $2, %rax;
  1594. #endif
  1595. MOVQ %rax, kkk;
  1596. #endif
  1597. SARQ $2, k;
  1598. JLE .L211_loopE;
  1599. ALIGN_5
  1600. .L211_bodyB:
  1601. #### Unroll time 1 ####
  1602. EDUP_DY 0*SIZE(ptrbb), yvec2;
  1603. LD_DY 0*SIZE(ptrba), yvec0;
  1604. MUL_DY yvec0, yvec2, yvec6;
  1605. ADD1_DY yvec6, yvec15, yvec15;
  1606. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  1607. LD_DY 4*SIZE(ptrba), yvec1;
  1608. MUL_DY yvec1, yvec2, yvec7;
  1609. ADD1_DY yvec7, yvec14, yvec14;
  1610. EDUP_DY 1*SIZE(ptrbb), yvec3;
  1611. MUL_DY yvec0, yvec4, yvec6;
  1612. ADD1_DY yvec6, yvec13, yvec13;
  1613. VPERMILP_DY $0x05, yvec0, yvec0;
  1614. MUL_DY yvec1, yvec4, yvec7;
  1615. ADD1_DY yvec7, yvec12, yvec12;
  1616. VPERMILP_DY $0x05, yvec1, yvec1;
  1617. MUL_DY yvec0, yvec3, yvec6;
  1618. ADD2_DY yvec6, yvec15, yvec15;
  1619. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  1620. MUL_DY yvec1, yvec3, yvec7;
  1621. ADD2_DY yvec7, yvec14, yvec14;
  1622. MUL_DY yvec0, yvec5, yvec6;
  1623. ADD2_DY yvec6, yvec13, yvec13;
  1624. MUL_DY yvec1, yvec5, yvec7;
  1625. ADD2_DY yvec7, yvec12, yvec12;
  1626. #### Unroll time 2 ####
  1627. EDUP_DY 4*SIZE(ptrbb), yvec2;
  1628. LD_DY 8*SIZE(ptrba), yvec0;
  1629. MUL_DY yvec0, yvec2, yvec6;
  1630. ADD1_DY yvec6, yvec15, yvec15;
  1631. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  1632. LD_DY 12*SIZE(ptrba), yvec1;
  1633. MUL_DY yvec1, yvec2, yvec7;
  1634. ADD1_DY yvec7, yvec14, yvec14;
  1635. EDUP_DY 5*SIZE(ptrbb), yvec3;
  1636. MUL_DY yvec0, yvec4, yvec6;
  1637. ADD1_DY yvec6, yvec13, yvec13;
  1638. VPERMILP_DY $0x05, yvec0, yvec0;
  1639. MUL_DY yvec1, yvec4, yvec7;
  1640. ADD1_DY yvec7, yvec12, yvec12;
  1641. VPERMILP_DY $0x05, yvec1, yvec1;
  1642. MUL_DY yvec0, yvec3, yvec6;
  1643. ADD2_DY yvec6, yvec15, yvec15;
  1644. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  1645. MUL_DY yvec1, yvec3, yvec7;
  1646. ADD2_DY yvec7, yvec14, yvec14;
  1647. MUL_DY yvec0, yvec5, yvec6;
  1648. ADD2_DY yvec6, yvec13, yvec13;
  1649. MUL_DY yvec1, yvec5, yvec7;
  1650. ADD2_DY yvec7, yvec12, yvec12;
  1651. #### Unroll time 3 ####
  1652. EDUP_DY 8*SIZE(ptrbb), yvec2;
  1653. LD_DY 16*SIZE(ptrba), yvec0;
  1654. MUL_DY yvec0, yvec2, yvec6;
  1655. ADD1_DY yvec6, yvec15, yvec15;
  1656. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  1657. LD_DY 20*SIZE(ptrba), yvec1;
  1658. MUL_DY yvec1, yvec2, yvec7;
  1659. ADD1_DY yvec7, yvec14, yvec14;
  1660. EDUP_DY 9*SIZE(ptrbb), yvec3;
  1661. MUL_DY yvec0, yvec4, yvec6;
  1662. ADD1_DY yvec6, yvec13, yvec13;
  1663. VPERMILP_DY $0x05, yvec0, yvec0;
  1664. MUL_DY yvec1, yvec4, yvec7;
  1665. ADD1_DY yvec7, yvec12, yvec12;
  1666. VPERMILP_DY $0x05, yvec1, yvec1;
  1667. MUL_DY yvec0, yvec3, yvec6;
  1668. ADD2_DY yvec6, yvec15, yvec15;
  1669. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  1670. MUL_DY yvec1, yvec3, yvec7;
  1671. ADD2_DY yvec7, yvec14, yvec14;
  1672. MUL_DY yvec0, yvec5, yvec6;
  1673. ADD2_DY yvec6, yvec13, yvec13;
  1674. MUL_DY yvec1, yvec5, yvec7;
  1675. ADD2_DY yvec7, yvec12, yvec12;
  1676. #### Unroll time 4 ####
  1677. EDUP_DY 12*SIZE(ptrbb), yvec2;
  1678. LD_DY 24*SIZE(ptrba), yvec0;
  1679. MUL_DY yvec0, yvec2, yvec6;
  1680. ADD1_DY yvec6, yvec15, yvec15;
  1681. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  1682. LD_DY 28*SIZE(ptrba), yvec1;
  1683. MUL_DY yvec1, yvec2, yvec7;
  1684. ADD1_DY yvec7, yvec14, yvec14;
  1685. EDUP_DY 13*SIZE(ptrbb), yvec3;
  1686. MUL_DY yvec0, yvec4, yvec6;
  1687. ADD1_DY yvec6, yvec13, yvec13;
  1688. VPERMILP_DY $0x05, yvec0, yvec0;
  1689. MUL_DY yvec1, yvec4, yvec7;
  1690. ADD1_DY yvec7, yvec12, yvec12;
  1691. VPERMILP_DY $0x05, yvec1, yvec1;
  1692. MUL_DY yvec0, yvec3, yvec6;
  1693. ADD2_DY yvec6, yvec15, yvec15;
  1694. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  1695. MUL_DY yvec1, yvec3, yvec7;
  1696. ADD2_DY yvec7, yvec14, yvec14;
  1697. ADDQ $16*SIZE, ptrbb;
  1698. MUL_DY yvec0, yvec5, yvec6;
  1699. ADD2_DY yvec6, yvec13, yvec13;
  1700. MUL_DY yvec1, yvec5, yvec7;
  1701. ADD2_DY yvec7, yvec12, yvec12;
  1702. ADDQ $32*SIZE, ptrba;
  1703. DECQ k;
  1704. JG .L211_bodyB;
  1705. ALIGN_5
  1706. .L211_loopE:
  1707. #ifndef TRMMKERNEL
  1708. TEST $2, bk;
  1709. #else
  1710. TEST $2, kkk;
  1711. #endif
  1712. JLE .L212_loopE;
  1713. ALIGN_5
  1714. .L212_bodyB:
  1715. #### Unroll time 1 ####
  1716. EDUP_DY 0*SIZE(ptrbb), yvec2;
  1717. LD_DY 0*SIZE(ptrba), yvec0;
  1718. MUL_DY yvec0, yvec2, yvec6;
  1719. ADD1_DY yvec6, yvec15, yvec15;
  1720. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  1721. LD_DY 4*SIZE(ptrba), yvec1;
  1722. MUL_DY yvec1, yvec2, yvec7;
  1723. ADD1_DY yvec7, yvec14, yvec14;
  1724. EDUP_DY 1*SIZE(ptrbb), yvec3;
  1725. MUL_DY yvec0, yvec4, yvec6;
  1726. ADD1_DY yvec6, yvec13, yvec13;
  1727. VPERMILP_DY $0x05, yvec0, yvec0;
  1728. MUL_DY yvec1, yvec4, yvec7;
  1729. ADD1_DY yvec7, yvec12, yvec12;
  1730. VPERMILP_DY $0x05, yvec1, yvec1;
  1731. MUL_DY yvec0, yvec3, yvec6;
  1732. ADD2_DY yvec6, yvec15, yvec15;
  1733. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  1734. MUL_DY yvec1, yvec3, yvec7;
  1735. ADD2_DY yvec7, yvec14, yvec14;
  1736. MUL_DY yvec0, yvec5, yvec6;
  1737. ADD2_DY yvec6, yvec13, yvec13;
  1738. MUL_DY yvec1, yvec5, yvec7;
  1739. ADD2_DY yvec7, yvec12, yvec12;
  1740. #### Unroll time 2 ####
  1741. EDUP_DY 4*SIZE(ptrbb), yvec2;
  1742. LD_DY 8*SIZE(ptrba), yvec0;
  1743. MUL_DY yvec0, yvec2, yvec6;
  1744. ADD1_DY yvec6, yvec15, yvec15;
  1745. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  1746. LD_DY 12*SIZE(ptrba), yvec1;
  1747. MUL_DY yvec1, yvec2, yvec7;
  1748. ADD1_DY yvec7, yvec14, yvec14;
  1749. EDUP_DY 5*SIZE(ptrbb), yvec3;
  1750. MUL_DY yvec0, yvec4, yvec6;
  1751. ADD1_DY yvec6, yvec13, yvec13;
  1752. VPERMILP_DY $0x05, yvec0, yvec0;
  1753. MUL_DY yvec1, yvec4, yvec7;
  1754. ADD1_DY yvec7, yvec12, yvec12;
  1755. VPERMILP_DY $0x05, yvec1, yvec1;
  1756. MUL_DY yvec0, yvec3, yvec6;
  1757. ADD2_DY yvec6, yvec15, yvec15;
  1758. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  1759. MUL_DY yvec1, yvec3, yvec7;
  1760. ADD2_DY yvec7, yvec14, yvec14;
  1761. MUL_DY yvec0, yvec5, yvec6;
  1762. ADD2_DY yvec6, yvec13, yvec13;
  1763. MUL_DY yvec1, yvec5, yvec7;
  1764. ADD2_DY yvec7, yvec12, yvec12;
  1765. ADDQ $8*SIZE, ptrbb;
  1766. ADDQ $16*SIZE, ptrba;
  1767. .L212_loopE:
  1768. #ifndef TRMMKERNEL
  1769. TEST $1, bk;
  1770. #else
  1771. TEST $1, kkk;
  1772. #endif
  1773. JLE .L213_loopE;
  1774. ALIGN_5
  1775. .L213_bodyB:
  1776. #### Unroll time 1 ####
  1777. EDUP_DY 0*SIZE(ptrbb), yvec2;
  1778. LD_DY 0*SIZE(ptrba), yvec0;
  1779. MUL_DY yvec0, yvec2, yvec6;
  1780. ADD1_DY yvec6, yvec15, yvec15;
  1781. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  1782. LD_DY 4*SIZE(ptrba), yvec1;
  1783. MUL_DY yvec1, yvec2, yvec7;
  1784. ADD1_DY yvec7, yvec14, yvec14;
  1785. EDUP_DY 1*SIZE(ptrbb), yvec3;
  1786. MUL_DY yvec0, yvec4, yvec6;
  1787. ADD1_DY yvec6, yvec13, yvec13;
  1788. VPERMILP_DY $0x05, yvec0, yvec0;
  1789. MUL_DY yvec1, yvec4, yvec7;
  1790. ADD1_DY yvec7, yvec12, yvec12;
  1791. VPERMILP_DY $0x05, yvec1, yvec1;
  1792. MUL_DY yvec0, yvec3, yvec6;
  1793. ADD2_DY yvec6, yvec15, yvec15;
  1794. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  1795. MUL_DY yvec1, yvec3, yvec7;
  1796. ADD2_DY yvec7, yvec14, yvec14;
  1797. MUL_DY yvec0, yvec5, yvec6;
  1798. ADD2_DY yvec6, yvec13, yvec13;
  1799. MUL_DY yvec1, yvec5, yvec7;
  1800. ADD2_DY yvec7, yvec12, yvec12;
  1801. ADDQ $4*SIZE, ptrbb;
  1802. ADDQ $8*SIZE, ptrba;
  1803. .L213_loopE:
  1804. #### Handle ####
  1805. XOR_DY yvec7, yvec7, yvec7;
  1806. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1807. ADDSUB_DY yvec15, yvec7, yvec15;
  1808. ADDSUB_DY yvec14, yvec7, yvec14;
  1809. ADDSUB_DY yvec13, yvec7, yvec13;
  1810. ADDSUB_DY yvec12, yvec7, yvec12;
  1811. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1812. SUB_DY yvec15, yvec7, yvec15;
  1813. SUB_DY yvec14, yvec7, yvec14;
  1814. SUB_DY yvec13, yvec7, yvec13;
  1815. SUB_DY yvec12, yvec7, yvec12;
  1816. #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1817. VPERMILP_DY $0x05, yvec15, yvec15;
  1818. VPERMILP_DY $0x05, yvec14, yvec14;
  1819. VPERMILP_DY $0x05, yvec13, yvec13;
  1820. VPERMILP_DY $0x05, yvec12, yvec12;
  1821. ADDSUB_DY yvec15, yvec7, yvec15;
  1822. ADDSUB_DY yvec14, yvec7, yvec14;
  1823. ADDSUB_DY yvec13, yvec7, yvec13;
  1824. ADDSUB_DY yvec12, yvec7, yvec12;
  1825. VPERMILP_DY $0x05, yvec15, yvec15;
  1826. VPERMILP_DY $0x05, yvec14, yvec14;
  1827. VPERMILP_DY $0x05, yvec13, yvec13;
  1828. VPERMILP_DY $0x05, yvec12, yvec12;
  1829. #endif
  1830. #### Load Alpha ####
  1831. BROAD_DY MEMALPHA_R,yvec7;
  1832. BROAD_DY MEMALPHA_I,yvec6;
  1833. #### Multiply Alpha ####
  1834. VPERMILP_DY $0x05, yvec15, yvec5;
  1835. MUL_DY yvec7, yvec15, yvec15;
  1836. MUL_DY yvec6, yvec5, yvec5;
  1837. ADD2_DY yvec5, yvec15, yvec15;
  1838. VPERMILP_DY $0x05, yvec14, yvec4;
  1839. MUL_DY yvec7, yvec14, yvec14;
  1840. MUL_DY yvec6, yvec4, yvec4;
  1841. ADD2_DY yvec4, yvec14, yvec14;
  1842. VPERMILP_DY $0x05, yvec13, yvec3;
  1843. MUL_DY yvec7, yvec13, yvec13;
  1844. MUL_DY yvec6, yvec3, yvec3;
  1845. ADD2_DY yvec3, yvec13, yvec13;
  1846. VPERMILP_DY $0x05,yvec12, yvec2;
  1847. MUL_DY yvec7, yvec12, yvec12;
  1848. MUL_DY yvec6, yvec2, yvec2;
  1849. ADD2_DY yvec2, yvec12, yvec12;
  1850. EXTRA_DY $1, yvec15, xvec7;
  1851. EXTRA_DY $1, yvec14, xvec6;
  1852. EXTRA_DY $1, yvec13, xvec5;
  1853. EXTRA_DY $1, yvec12, xvec4;
  1854. #### Testing Alignment ####
  1855. MOVQ C0, %rax;
  1856. OR ldc, %rax;
  1857. TEST $15, %rax;
  1858. JNE .L213_loopEx;
  1859. ALIGN_5
  1860. #### Writing back ####
  1861. #ifndef TRMMKERNEL
  1862. ADD_DX 0*SIZE(C0), xvec15, xvec15;
  1863. ADD_DX 2*SIZE(C1), xvec7, xvec7;
  1864. ADD_DX 4*SIZE(C0), xvec14, xvec14;
  1865. ADD_DX 6*SIZE(C1), xvec6, xvec6;
  1866. ADD_DX 0*SIZE(C1), xvec13, xvec13;
  1867. ADD_DX 2*SIZE(C0), xvec5, xvec5;
  1868. ADD_DX 4*SIZE(C1), xvec12, xvec12;
  1869. ADD_DX 6*SIZE(C0), xvec4, xvec4;
  1870. #endif
  1871. ST_DX xvec15,0*SIZE(C0);
  1872. ST_DX xvec7,2*SIZE(C1);
  1873. ST_DX xvec14,4*SIZE(C0);
  1874. ST_DX xvec6,6*SIZE(C1);
  1875. ST_DX xvec13,0*SIZE(C1);
  1876. ST_DX xvec5,2*SIZE(C0);
  1877. ST_DX xvec12,4*SIZE(C1);
  1878. ST_DX xvec4,6*SIZE(C0);
  1879. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1880. MOVQ bk, %rax;
  1881. SUBQ kkk, %rax;
  1882. SALQ $ZBASE_SHIFT, %rax;
  1883. LEAQ (ptrba, %rax, 4), ptrba;
  1884. LEAQ (ptrbb, %rax, 2), ptrbb;
  1885. #endif
  1886. #if defined(TRMMKERNEL) && defined(LEFT)
  1887. ADDQ $4, kk;
  1888. #endif
  1889. ADDQ $8*SIZE, C0;
  1890. ADDQ $8*SIZE, C1;
  1891. DECQ i;
  1892. JG .L21_bodyB;
  1893. JMP .L21_loopE;
  1894. ALIGN_5
  1895. .L213_loopEx:
  1896. #ifndef TRMMKERNEL
  1897. LDL_DX 0*SIZE(C0), xvec0, xvec0;
  1898. LDH_DX 1*SIZE(C0), xvec0, xvec0;
  1899. LDL_DX 2*SIZE(C1), xvec1, xvec1;
  1900. LDH_DX 3*SIZE(C1), xvec1, xvec1;
  1901. LDL_DX 4*SIZE(C0), xvec2, xvec2;
  1902. LDH_DX 5*SIZE(C0), xvec2, xvec2;
  1903. LDL_DX 6*SIZE(C1), xvec3, xvec3;
  1904. LDH_DX 7*SIZE(C1), xvec3, xvec3;
  1905. ADD_DX xvec0, xvec15, xvec15;
  1906. ADD_DX xvec1, xvec7, xvec7;
  1907. ADD_DX xvec2, xvec14, xvec14;
  1908. ADD_DX xvec3, xvec6, xvec6;
  1909. #endif
  1910. STL_DX xvec15, 0*SIZE(C0);
  1911. STH_DX xvec15, 1*SIZE(C0);
  1912. STL_DX xvec7, 2*SIZE(C1);
  1913. STH_DX xvec7, 3*SIZE(C1);
  1914. STL_DX xvec14, 4*SIZE(C0);
  1915. STH_DX xvec14, 5*SIZE(C0);
  1916. STL_DX xvec6, 6*SIZE(C1);
  1917. STH_DX xvec6, 7*SIZE(C1);
  1918. #ifndef TRMMKERNEL
  1919. LDL_DX 0*SIZE(C1), xvec3, xvec3;
  1920. LDH_DX 1*SIZE(C1), xvec3, xvec3;
  1921. LDL_DX 2*SIZE(C0), xvec2, xvec2;
  1922. LDH_DX 3*SIZE(C0), xvec2, xvec2;
  1923. LDL_DX 4*SIZE(C1), xvec1, xvec1;
  1924. LDH_DX 5*SIZE(C1), xvec1, xvec1;
  1925. LDL_DX 6*SIZE(C0), xvec0, xvec0;
  1926. LDH_DX 7*SIZE(C0), xvec0, xvec0;
  1927. ADD_DX xvec3, xvec13, xvec13;
  1928. ADD_DX xvec2, xvec5, xvec5;
  1929. ADD_DX xvec1, xvec12, xvec12;
  1930. ADD_DX xvec0, xvec4, xvec4;
  1931. #endif
  1932. STL_DX xvec13, 0*SIZE(C1);
  1933. STH_DX xvec13, 1*SIZE(C1);
  1934. STL_DX xvec5, 2*SIZE(C0);
  1935. STH_DX xvec5, 3*SIZE(C0);
  1936. STL_DX xvec12, 4*SIZE(C1);
  1937. STH_DX xvec12, 5*SIZE(C1);
  1938. STL_DX xvec4, 6*SIZE(C0);
  1939. STH_DX xvec4, 7*SIZE(C0);
  1940. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1941. MOVQ bk, %rax;
  1942. SUBQ kkk, %rax;
  1943. SALQ $ZBASE_SHIFT, %rax;
  1944. LEAQ (ptrba, %rax, 4), ptrba;
  1945. LEAQ (ptrbb, %rax, 2), ptrbb;
  1946. #endif
  1947. #if defined(TRMMKERNEL) && defined(LEFT)
  1948. ADDQ $4, kk;
  1949. #endif
  1950. ADDQ $8*SIZE, C0;
  1951. ADDQ $8*SIZE, C1;
  1952. DECQ i;
  1953. JG .L21_bodyB;
  1954. ALIGN_5
  1955. .L21_loopE:
  1956. TEST $2, bm;
  1957. JLE .L22_loopE;
  1958. ALIGN_5
  1959. .L22_bodyB:
  1960. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  1961. MOVQ bb,ptrbb;
  1962. #else
  1963. MOVQ bb, ptrbb;
  1964. MOVQ kk, %rax;
  1965. SALQ $ZBASE_SHIFT, %rax;
  1966. LEAQ (ptrba, %rax, 2), ptrba;
  1967. LEAQ (ptrbb, %rax, 2), ptrbb;
  1968. #endif
  1969. XOR_DY yvec15, yvec15, yvec15;
  1970. XOR_DY yvec14, yvec14, yvec13;
  1971. #ifndef TRMMKERNEL
  1972. MOVQ bk,k;
  1973. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  1974. MOVQ bk, %rax;
  1975. SUBQ kk, %rax;
  1976. MOVQ %rax, kkk;
  1977. #else
  1978. MOVQ kk, %rax;
  1979. #ifdef LEFT
  1980. ADDQ $2, %rax;
  1981. #else
  1982. ADDQ $2, %rax;
  1983. #endif
  1984. MOVQ %rax, kkk;
  1985. #endif
  1986. SARQ $2, k;
  1987. JLE .L221_loopE;
  1988. ALIGN_5
  1989. .L221_bodyB:
  1990. #### Unroll time 1 ####
  1991. EDUP_DY 0*SIZE(ptrbb), yvec2;
  1992. LD_DY 0*SIZE(ptrba), yvec0;
  1993. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  1994. MUL_DY yvec0, yvec2, yvec6;
  1995. ADD1_DY yvec6, yvec15, yvec15;
  1996. EDUP_DY 1*SIZE(ptrbb), yvec3;
  1997. MUL_DY yvec0, yvec4, yvec6;
  1998. ADD1_DY yvec6, yvec13, yvec13;
  1999. VPERMILP_DY $0x05, yvec0, yvec0;
  2000. MUL_DY yvec0, yvec3, yvec6;
  2001. ADD2_DY yvec6, yvec15, yvec15;
  2002. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  2003. MUL_DY yvec0, yvec5, yvec6;
  2004. ADD2_DY yvec6, yvec13, yvec13;
  2005. #### Unroll time 2 ####
  2006. EDUP_DY 4*SIZE(ptrbb), yvec2;
  2007. LD_DY 4*SIZE(ptrba), yvec0;
  2008. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  2009. MUL_DY yvec0, yvec2, yvec6;
  2010. ADD1_DY yvec6, yvec15, yvec15;
  2011. EDUP_DY 5*SIZE(ptrbb), yvec3;
  2012. MUL_DY yvec0, yvec4, yvec6;
  2013. ADD1_DY yvec6, yvec13, yvec13;
  2014. VPERMILP_DY $0x05, yvec0, yvec0;
  2015. MUL_DY yvec0, yvec3, yvec6;
  2016. ADD2_DY yvec6, yvec15, yvec15;
  2017. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  2018. MUL_DY yvec0, yvec5, yvec6;
  2019. ADD2_DY yvec6, yvec13, yvec13;
  2020. #### Unroll time 3 ####
  2021. EDUP_DY 8*SIZE(ptrbb), yvec2;
  2022. LD_DY 8*SIZE(ptrba), yvec0;
  2023. MUL_DY yvec0, yvec2, yvec6;
  2024. ADD1_DY yvec6, yvec15, yvec15;
  2025. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  2026. EDUP_DY 9*SIZE(ptrbb), yvec3;
  2027. MUL_DY yvec0, yvec4, yvec6;
  2028. ADD1_DY yvec6, yvec13, yvec13;
  2029. VPERMILP_DY $0x05, yvec0, yvec0;
  2030. MUL_DY yvec0, yvec3, yvec6;
  2031. ADD2_DY yvec6, yvec15, yvec15;
  2032. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  2033. MUL_DY yvec0, yvec5, yvec6;
  2034. ADD2_DY yvec6, yvec13, yvec13;
  2035. #### Unroll time 4 ####
  2036. EDUP_DY 12*SIZE(ptrbb), yvec2;
  2037. LD_DY 12*SIZE(ptrba), yvec0;
  2038. MUL_DY yvec0, yvec2, yvec6;
  2039. ADD1_DY yvec6, yvec15, yvec15;
  2040. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  2041. EDUP_DY 13*SIZE(ptrbb), yvec3;
  2042. MUL_DY yvec0, yvec4, yvec6;
  2043. ADD1_DY yvec6, yvec13, yvec13;
  2044. VPERMILP_DY $0x05, yvec0, yvec0;
  2045. MUL_DY yvec0, yvec3, yvec6;
  2046. ADD2_DY yvec6, yvec15, yvec15;
  2047. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  2048. ADDQ $16*SIZE, ptrbb;
  2049. MUL_DY yvec0, yvec5, yvec6;
  2050. ADD2_DY yvec6, yvec13, yvec13;
  2051. ADDQ $16*SIZE, ptrba;
  2052. DECQ k;
  2053. JG .L221_bodyB;
  2054. ALIGN_5
  2055. .L221_loopE:
  2056. #ifndef TRMMKERNEL
  2057. TEST $2, bk;
  2058. #else
  2059. TEST $2, kkk;
  2060. #endif
  2061. JLE .L222_loopE;
  2062. ALIGN_5
  2063. .L222_bodyB:
  2064. #### Unroll time 1 ####
  2065. EDUP_DY 0*SIZE(ptrbb), yvec2;
  2066. LD_DY 0*SIZE(ptrba), yvec0;
  2067. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  2068. MUL_DY yvec0, yvec2, yvec6;
  2069. ADD1_DY yvec6, yvec15, yvec15;
  2070. EDUP_DY 1*SIZE(ptrbb), yvec3;
  2071. MUL_DY yvec0, yvec4, yvec6;
  2072. ADD1_DY yvec6, yvec13, yvec13;
  2073. VPERMILP_DY $0x05, yvec0, yvec0;
  2074. MUL_DY yvec0, yvec3, yvec6;
  2075. ADD2_DY yvec6, yvec15, yvec15;
  2076. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  2077. MUL_DY yvec0, yvec5, yvec6;
  2078. ADD2_DY yvec6, yvec13, yvec13;
  2079. #### Unroll time 2 ####
  2080. EDUP_DY 4*SIZE(ptrbb), yvec2;
  2081. LD_DY 4*SIZE(ptrba), yvec0;
  2082. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  2083. MUL_DY yvec0, yvec2, yvec6;
  2084. ADD1_DY yvec6, yvec15, yvec15;
  2085. EDUP_DY 5*SIZE(ptrbb), yvec3;
  2086. MUL_DY yvec0, yvec4, yvec6;
  2087. ADD1_DY yvec6, yvec13, yvec13;
  2088. VPERMILP_DY $0x05, yvec0, yvec0;
  2089. MUL_DY yvec0, yvec3, yvec6;
  2090. ADD2_DY yvec6, yvec15, yvec15;
  2091. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  2092. MUL_DY yvec0, yvec5, yvec6;
  2093. ADD2_DY yvec6, yvec13, yvec13;
  2094. ADDQ $8*SIZE, ptrba;
  2095. ADDQ $8*SIZE, ptrbb;
  2096. .L222_loopE:
  2097. #ifndef TRMMKERNEL
  2098. TEST $1, bk;
  2099. #else
  2100. TEST $1, kkk;
  2101. #endif
  2102. JLE .L223_loopE;
  2103. ALIGN_5
  2104. .L223_bodyB:
  2105. #### Unroll time 1 ####
  2106. EDUP_DY 0*SIZE(ptrbb), yvec2;
  2107. LD_DY 0*SIZE(ptrba), yvec0;
  2108. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  2109. MUL_DY yvec0, yvec2, yvec6;
  2110. ADD1_DY yvec6, yvec15, yvec15;
  2111. EDUP_DY 1*SIZE(ptrbb), yvec3;
  2112. MUL_DY yvec0, yvec4, yvec6;
  2113. ADD1_DY yvec6, yvec13, yvec13;
  2114. VPERMILP_DY $0x05, yvec0, yvec0;
  2115. MUL_DY yvec0, yvec3, yvec6;
  2116. ADD2_DY yvec6, yvec15, yvec15;
  2117. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  2118. MUL_DY yvec0, yvec5, yvec6;
  2119. ADD2_DY yvec6, yvec13, yvec13;
  2120. ADDQ $4*SIZE, ptrba;
  2121. ADDQ $4*SIZE, ptrbb;
  2122. .L223_loopE:
  2123. #### Handle ####
  2124. XOR_DY yvec7, yvec7, yvec7;
  2125. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  2126. ADDSUB_DY yvec15, yvec7, yvec15;
  2127. ADDSUB_DY yvec13, yvec7, yvec13;
  2128. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  2129. SUB_DY yvec15, yvec7, yvec15;
  2130. SUB_DY yvec13, yvec7, yvec13;
  2131. #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  2132. VPERMILP_DY $0x05, yvec15, yvec15;
  2133. VPERMILP_DY $0x05, yvec13, yvec13;
  2134. ADDSUB_DY yvec15, yvec7, yvec15;
  2135. ADDSUB_DY yvec13, yvec7, yvec13;
  2136. VPERMILP_DY $0x05, yvec15, yvec15;
  2137. VPERMILP_DY $0x05, yvec13, yvec13;
  2138. #endif
  2139. #### Load Alpha ####
  2140. BROAD_DY MEMALPHA_R,yvec7;
  2141. BROAD_DY MEMALPHA_I,yvec6;
  2142. #### Multiply Alpha ####
  2143. VPERMILP_DY $0x05, yvec15, yvec5;
  2144. MUL_DY yvec7, yvec15, yvec15;
  2145. MUL_DY yvec6, yvec5, yvec5;
  2146. ADD2_DY yvec5, yvec15, yvec15;
  2147. VPERMILP_DY $0x05, yvec13, yvec3;
  2148. MUL_DY yvec7, yvec13, yvec13;
  2149. MUL_DY yvec6, yvec3, yvec3;
  2150. ADD2_DY yvec3, yvec13, yvec13;
  2151. EXTRA_DY $1, yvec15, xvec7;
  2152. EXTRA_DY $1, yvec13, xvec5;
  2153. #### Write back ####
  2154. #ifndef TRMMKERNEL
  2155. LDL_DX 0*SIZE(C0), xvec0, xvec0;
  2156. LDH_DX 1*SIZE(C0), xvec0, xvec0;
  2157. LDL_DX 2*SIZE(C1), xvec1, xvec1;
  2158. LDH_DX 3*SIZE(C1), xvec1, xvec1;
  2159. LDL_DX 0*SIZE(C1), xvec2, xvec2;
  2160. LDH_DX 1*SIZE(C1), xvec2, xvec2;
  2161. LDL_DX 2*SIZE(C0), xvec3, xvec3;
  2162. LDH_DX 3*SIZE(C0), xvec3, xvec3;
  2163. ADD_DX xvec0, xvec15, xvec15;
  2164. ADD_DX xvec1, xvec7, xvec7;
  2165. ADD_DX xvec2, xvec13, xvec13;
  2166. ADD_DX xvec3, xvec5, xvec5;
  2167. #endif
  2168. STL_DX xvec15, 0*SIZE(C0);
  2169. STH_DX xvec15, 1*SIZE(C0);
  2170. STL_DX xvec7, 2*SIZE(C1);
  2171. STH_DX xvec7, 3*SIZE(C1);
  2172. STL_DX xvec13, 0*SIZE(C1);
  2173. STH_DX xvec13, 1*SIZE(C1);
  2174. STL_DX xvec5, 2*SIZE(C0);
  2175. STH_DX xvec5, 3*SIZE(C0);
  2176. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2177. MOVQ bk, %rax;
  2178. SUBQ kkk, %rax;
  2179. SALQ $ZBASE_SHIFT, %rax;
  2180. LEAQ (ptrba, %rax, 2), ptrba;
  2181. LEAQ (ptrbb, %rax, 2), ptrbb;
  2182. #endif
  2183. #if defined(TRMMKERNEL) && defined(LEFT)
  2184. ADDQ $2, kk;
  2185. #endif
  2186. ADDQ $4*SIZE, C0;
  2187. ADDQ $4*SIZE, C1;
  2188. .L22_loopE:
  2189. TEST $1, bm;
  2190. JLE .L23_loopE;
  2191. ALIGN_5
  2192. .L23_bodyB:
  2193. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2194. MOVQ bb,ptrbb;
  2195. #else
  2196. MOVQ bb, ptrbb;
  2197. MOVQ kk, %rax;
  2198. SALQ $ZBASE_SHIFT, %rax;
  2199. ADDQ %rax, ptrba;
  2200. LEAQ (ptrbb, %rax, 2), ptrbb;
  2201. #endif
  2202. XOR_DY yvec15, yvec15, yvec15;
  2203. #ifndef TRMMKERNEL
  2204. MOVQ bk,k;
  2205. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  2206. MOVQ bk, %rax;
  2207. SUBQ kk, %rax;
  2208. MOVQ %rax, kkk;
  2209. #else
  2210. MOVQ kk, %rax;
  2211. #ifdef LEFT
  2212. ADDQ $1, %rax;
  2213. #else
  2214. ADDQ $2, %rax;
  2215. #endif
  2216. MOVQ %rax, kkk;
  2217. #endif
  2218. SARQ $2, k;
  2219. JLE .L231_loopE;
  2220. ALIGN_5
  2221. .L231_bodyB:
  2222. LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i
  2223. EDUP_DY 0*SIZE(ptrbb), yvec2;
  2224. SHUF_DY $0x20, yvec0, yvec0, yvec1;
  2225. MUL_DY yvec1, yvec2, yvec6;
  2226. ADD1_DY yvec6, yvec15, yvec15;
  2227. VPERMILP_DY $0x05, yvec1, yvec4;
  2228. EDUP_DY 1*SIZE(ptrbb), yvec2;
  2229. MUL_DY yvec4, yvec2, yvec6;
  2230. ADD2_DY yvec6, yvec15, yvec15;
  2231. SHUF_DY $0x31, yvec0, yvec0, yvec1;
  2232. EDUP_DY 4*SIZE(ptrbb), yvec2;
  2233. MUL_DY yvec1, yvec2, yvec6;
  2234. ADD1_DY yvec6, yvec15, yvec15;
  2235. VPERMILP_DY $0x05, yvec1, yvec4;
  2236. EDUP_DY 5*SIZE(ptrbb), yvec2;
  2237. MUL_DY yvec4, yvec2, yvec6;
  2238. ADD2_DY yvec6, yvec15, yvec15;
  2239. LD_DY 4*SIZE(ptrba), yvec0;
  2240. EDUP_DY 8*SIZE(ptrbb), yvec2;
  2241. SHUF_DY $0x20, yvec0, yvec0, yvec1;
  2242. MUL_DY yvec1, yvec2, yvec6;
  2243. ADD1_DY yvec6, yvec15, yvec15;
  2244. VPERMILP_DY $0x05, yvec1, yvec4;
  2245. EDUP_DY 9*SIZE(ptrbb), yvec2;
  2246. MUL_DY yvec4, yvec2, yvec6;
  2247. ADD2_DY yvec6, yvec15, yvec15;
  2248. SHUF_DY $0x31, yvec0, yvec0, yvec1;
  2249. EDUP_DY 12*SIZE(ptrbb), yvec2;
  2250. MUL_DY yvec1, yvec2, yvec6;
  2251. ADD1_DY yvec6, yvec15, yvec15;
  2252. VPERMILP_DY $0x05, yvec1, yvec4;
  2253. EDUP_DY 13*SIZE(ptrbb), yvec2;
  2254. MUL_DY yvec4, yvec2, yvec6;
  2255. ADD2_DY yvec6, yvec15, yvec15;
  2256. ADDQ $8*SIZE, ptrba;
  2257. ADDQ $16*SIZE, ptrbb;
  2258. DECQ k;
  2259. JG .L231_bodyB;
  2260. ALIGN_5
  2261. .L231_loopE:
  2262. #ifndef TRMMKERNEL
  2263. TEST $2, bk;
  2264. #else
  2265. TEST $2, kkk;
  2266. #endif
  2267. JLE .L232_loopE;
  2268. ALIGN_5
  2269. .L232_bodyB:
  2270. LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i
  2271. EDUP_DY 0*SIZE(ptrbb), yvec2;
  2272. SHUF_DY $0x20, yvec0, yvec0, yvec1;
  2273. MUL_DY yvec1, yvec2, yvec6;
  2274. ADD1_DY yvec6, yvec15, yvec15;
  2275. VPERMILP_DY $0x05, yvec1, yvec4;
  2276. EDUP_DY 1*SIZE(ptrbb), yvec2;
  2277. MUL_DY yvec4, yvec2, yvec6;
  2278. ADD2_DY yvec6, yvec15, yvec15;
  2279. SHUF_DY $0x31, yvec0, yvec0, yvec1;
  2280. EDUP_DY 4*SIZE(ptrbb), yvec2;
  2281. MUL_DY yvec1, yvec2, yvec6;
  2282. ADD1_DY yvec6, yvec15, yvec15;
  2283. VPERMILP_DY $0x05, yvec1, yvec4;
  2284. EDUP_DY 5*SIZE(ptrbb), yvec2;
  2285. MUL_DY yvec4, yvec2, yvec6;
  2286. ADD2_DY yvec6, yvec15, yvec15;
  2287. ADDQ $4*SIZE, ptrba;
  2288. ADDQ $8*SIZE, ptrbb;
  2289. .L232_loopE:
  2290. #ifndef TRMMKERNEL
  2291. TEST $1, bk;
  2292. #else
  2293. TEST $1, kkk;
  2294. #endif
  2295. JLE .L233_loopE;
  2296. ALIGN_5
  2297. .L233_bodyB:
  2298. LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i
  2299. EDUP_DY 0*SIZE(ptrbb), yvec2;
  2300. SHUF_DY $0x20, yvec0, yvec0, yvec1;
  2301. MUL_DY yvec1, yvec2, yvec6;
  2302. ADD1_DY yvec6, yvec15, yvec15;
  2303. VPERMILP_DY $0x05, yvec1, yvec4;
  2304. EDUP_DY 1*SIZE(ptrbb), yvec2;
  2305. MUL_DY yvec4, yvec2, yvec6;
  2306. ADD2_DY yvec6, yvec15, yvec15;
  2307. ADDQ $2*SIZE, ptrba;
  2308. ADDQ $4*SIZE, ptrbb;
  2309. .L233_loopE:
  2310. #### Handle ####
  2311. XOR_DY yvec7, yvec7, yvec7;
  2312. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  2313. ADDSUB_DY yvec15, yvec7, yvec15;
  2314. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  2315. SUB_DY yvec15, yvec7, yvec15;
  2316. #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  2317. VPERMILP_DY $0x05, yvec15, yvec15;
  2318. ADDSUB_DY yvec15, yvec7, yvec15;
  2319. VPERMILP_DY $0x05, yvec15, yvec15;
  2320. #endif
  2321. #### Multiply Alpha ####
  2322. BROAD_DY MEMALPHA_R, yvec7;
  2323. BROAD_DY MEMALPHA_I, yvec6;
  2324. #### Writing Back ####
  2325. VPERMILP_DY $0x05, yvec15, yvec5;
  2326. MUL_DY yvec7, yvec15, yvec15;
  2327. MUL_DY yvec6, yvec5, yvec5;
  2328. ADD2_DY yvec5, yvec15, yvec15;
  2329. EXTRA_DY $1, yvec15, xvec7;
  2330. #### Writing Back ####
  2331. #ifndef TRMMKERNEL
  2332. LDL_DX 0*SIZE(C0), xvec0, xvec0;
  2333. LDH_DX 1*SIZE(C0), xvec0, xvec0;
  2334. LDL_DX 0*SIZE(C1), xvec1, xvec1;
  2335. LDH_DX 1*SIZE(C1), xvec1, xvec1;
  2336. ADD_DX xvec0, xvec15, xvec15;
  2337. ADD_DX xvec1, xvec7, xvec7;
  2338. #endif
  2339. STL_DX xvec15, 0*SIZE(C0);
  2340. STH_DX xvec15, 1*SIZE(C0);
  2341. STL_DX xvec7, 0*SIZE(C1);
  2342. STH_DX xvec7, 1*SIZE(C1);
  2343. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2344. MOVQ bk, %rax;
  2345. SUBQ kkk, %rax;
  2346. SALQ $ZBASE_SHIFT, %rax;
  2347. ADDQ %rax, ptrba;
  2348. LEAQ (ptrbb, %rax, 2), ptrbb;
  2349. #endif
  2350. #if defined(TRMMKERNEL) && defined(LEFT)
  2351. ADDQ $1, kk;
  2352. #endif
  2353. ADDQ $2*SIZE, C0;
  2354. ADDQ $2*SIZE, C0;
  2355. .L23_loopE:
  2356. #if defined(TRMMKERNEL) && !defined(LEFT)
  2357. ADDQ $2, kk;
  2358. #endif
  2359. MOVQ bk, k;
  2360. SALQ $5, k;
  2361. ADDQ k, bb;
  2362. LEAQ (C, ldc, 2), C;
  2363. .L20_loopE:
  2364. TEST $1, bn;
  2365. JLE .L30_loopE;
  2366. ALIGN_5
  2367. .L30_bodyB:
  2368. #if defined(TRMMKERNEL) && defined(LEFT)
  2369. MOVQ OFFSET, %rax;
  2370. MOVQ %rax, kk;
  2371. #endif
  2372. MOVQ ba, ptrba;
  2373. MOVQ C, C0;
  2374. MOVQ bm, i;
  2375. SARQ $2, i;
  2376. JLE .L31_loopE;
  2377. ALIGN_5
  2378. .L31_bodyB:
  2379. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2380. MOVQ bb,ptrbb;
  2381. #else
  2382. MOVQ bb, ptrbb;
  2383. MOVQ kk, %rax;
  2384. SALQ $ZBASE_SHIFT, %rax;
  2385. LEAQ (ptrba, %rax, 4), ptrba;
  2386. ADDQ %rax, ptrbb;
  2387. #endif
  2388. XOR_DY yvec15, yvec15, yvec15;
  2389. XOR_DY yvec14, yvec14, yvec14;
  2390. #ifndef TRMMKERNEL
  2391. MOVQ bk,k;
  2392. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  2393. MOVQ bk, %rax;
  2394. SUBQ kk, %rax;
  2395. MOVQ %rax, kkk;
  2396. #else
  2397. MOVQ kk, %rax;
  2398. #ifdef LEFT
  2399. ADDQ $4, %rax;
  2400. #else
  2401. ADDQ $1, %rax;
  2402. #endif
  2403. MOVQ %rax, kkk;
  2404. #endif
  2405. SARQ $2, k;
  2406. JLE .L311_loopE;
  2407. ALIGN_5
  2408. .L311_bodyB:
  2409. LD_DY 0*SIZE(ptrba), yvec0;
  2410. BROAD_DY 0*SIZE(ptrbb), yvec2;
  2411. MUL_DY yvec0, yvec2, yvec6;
  2412. ADD1_DY yvec6, yvec15, yvec15;
  2413. LD_DY 4*SIZE(ptrba), yvec1;
  2414. MUL_DY yvec1, yvec2, yvec7;
  2415. ADD1_DY yvec7, yvec14, yvec14;
  2416. VPERMILP_DY $0x05, yvec0, yvec4;
  2417. BROAD_DY 1*SIZE(ptrbb), yvec3;
  2418. MUL_DY yvec4, yvec3, yvec6;
  2419. ADD2_DY yvec6, yvec15, yvec15;
  2420. VPERMILP_DY $0x05, yvec1, yvec5;
  2421. MUL_DY yvec5, yvec3, yvec7;
  2422. ADD2_DY yvec7, yvec14, yvec14;
  2423. LD_DY 8*SIZE(ptrba), yvec0;
  2424. BROAD_DY 2*SIZE(ptrbb), yvec2;
  2425. MUL_DY yvec0, yvec2, yvec6;
  2426. ADD1_DY yvec6, yvec15, yvec15;
  2427. LD_DY 12*SIZE(ptrba), yvec1;
  2428. MUL_DY yvec1, yvec2, yvec7;
  2429. ADD1_DY yvec7, yvec14, yvec14;
  2430. VPERMILP_DY $0x05, yvec0, yvec4;
  2431. BROAD_DY 3*SIZE(ptrbb), yvec3;
  2432. MUL_DY yvec4, yvec3, yvec6;
  2433. ADD2_DY yvec6, yvec15, yvec15;
  2434. VPERMILP_DY $0x05, yvec1, yvec5;
  2435. MUL_DY yvec5, yvec3, yvec7;
  2436. ADD2_DY yvec7, yvec14, yvec14;
  2437. LD_DY 16*SIZE(ptrba), yvec0;
  2438. BROAD_DY 4*SIZE(ptrbb), yvec2;
  2439. MUL_DY yvec0, yvec2, yvec6;
  2440. ADD1_DY yvec6, yvec15, yvec15;
  2441. LD_DY 20*SIZE(ptrba), yvec1;
  2442. MUL_DY yvec1, yvec2, yvec7;
  2443. ADD1_DY yvec7, yvec14, yvec14;
  2444. VPERMILP_DY $0x05, yvec0, yvec4;
  2445. BROAD_DY 5*SIZE(ptrbb), yvec3;
  2446. MUL_DY yvec4, yvec3, yvec6;
  2447. ADD2_DY yvec6, yvec15, yvec15;
  2448. VPERMILP_DY $0x05, yvec1, yvec5;
  2449. MUL_DY yvec5, yvec3, yvec7;
  2450. ADD2_DY yvec7, yvec14, yvec14;
  2451. LD_DY 24*SIZE(ptrba), yvec0;
  2452. BROAD_DY 6*SIZE(ptrbb), yvec2;
  2453. MUL_DY yvec0, yvec2, yvec6;
  2454. ADD1_DY yvec6, yvec15, yvec15;
  2455. LD_DY 28*SIZE(ptrba), yvec1;
  2456. MUL_DY yvec1, yvec2, yvec7;
  2457. ADD1_DY yvec7, yvec14, yvec14;
  2458. VPERMILP_DY $0x05, yvec0, yvec4;
  2459. BROAD_DY 7*SIZE(ptrbb), yvec3;
  2460. MUL_DY yvec4, yvec3, yvec6;
  2461. ADD2_DY yvec6, yvec15, yvec15;
  2462. VPERMILP_DY $0x05, yvec1, yvec5;
  2463. MUL_DY yvec5, yvec3, yvec7;
  2464. ADD2_DY yvec7, yvec14, yvec14;
  2465. ADDQ $32*SIZE, ptrba;
  2466. ADDQ $8*SIZE, ptrbb;
  2467. DECQ k;
  2468. JG .L311_bodyB;
  2469. ALIGN_5
  2470. .L311_loopE:
  2471. #ifndef TRMMKERNEL
  2472. TEST $2, bk;
  2473. #else
  2474. TEST $2, kkk;
  2475. #endif
  2476. JLE .L312_loopE;
  2477. ALIGN_5
  2478. .L312_bodyB:
  2479. LD_DY 0*SIZE(ptrba), yvec0;
  2480. BROAD_DY 0*SIZE(ptrbb), yvec2;
  2481. MUL_DY yvec0, yvec2, yvec6;
  2482. ADD1_DY yvec6, yvec15, yvec15;
  2483. LD_DY 4*SIZE(ptrba), yvec1;
  2484. MUL_DY yvec1, yvec2, yvec7;
  2485. ADD1_DY yvec7, yvec14, yvec14;
  2486. VPERMILP_DY $0x05, yvec0, yvec4;
  2487. BROAD_DY 1*SIZE(ptrbb), yvec3;
  2488. MUL_DY yvec4, yvec3, yvec6;
  2489. ADD2_DY yvec6, yvec15, yvec15;
  2490. VPERMILP_DY $0x05, yvec1, yvec5;
  2491. MUL_DY yvec5, yvec3, yvec7;
  2492. ADD2_DY yvec7, yvec14, yvec14;
  2493. LD_DY 8*SIZE(ptrba), yvec0;
  2494. BROAD_DY 2*SIZE(ptrbb), yvec2;
  2495. MUL_DY yvec0, yvec2, yvec6;
  2496. ADD1_DY yvec6, yvec15, yvec15;
  2497. LD_DY 12*SIZE(ptrba), yvec1;
  2498. MUL_DY yvec1, yvec2, yvec7;
  2499. ADD1_DY yvec7, yvec14, yvec14;
  2500. VPERMILP_DY $0x05, yvec0, yvec4;
  2501. BROAD_DY 3*SIZE(ptrbb), yvec3;
  2502. MUL_DY yvec4, yvec3, yvec6;
  2503. ADD2_DY yvec6, yvec15, yvec15;
  2504. VPERMILP_DY $0x05, yvec1, yvec5;
  2505. MUL_DY yvec5, yvec3, yvec7;
  2506. ADD2_DY yvec7, yvec14, yvec14;
  2507. ADDQ $16*SIZE, ptrba;
  2508. ADDQ $4*SIZE, ptrbb;
  2509. .L312_loopE:
  2510. #ifndef TRMMKERNEL
  2511. TEST $1, bk;
  2512. #else
  2513. TEST $1, kkk;
  2514. #endif
  2515. JLE .L313_loopE;
  2516. ALIGN_5
  2517. .L313_bodyB:
  2518. LD_DY 0*SIZE(ptrba), yvec0;
  2519. BROAD_DY 0*SIZE(ptrbb), yvec2;
  2520. MUL_DY yvec0, yvec2, yvec6;
  2521. ADD1_DY yvec6, yvec15, yvec15;
  2522. LD_DY 4*SIZE(ptrba), yvec1;
  2523. MUL_DY yvec1, yvec2, yvec7;
  2524. ADD1_DY yvec7, yvec14, yvec14;
  2525. VPERMILP_DY $0x05, yvec0, yvec4;
  2526. BROAD_DY 1*SIZE(ptrbb), yvec3;
  2527. MUL_DY yvec4, yvec3, yvec6;
  2528. ADD2_DY yvec6, yvec15, yvec15;
  2529. VPERMILP_DY $0x05, yvec1, yvec5;
  2530. MUL_DY yvec5, yvec3, yvec7;
  2531. ADD2_DY yvec7, yvec14, yvec14;
  2532. ADDQ $8*SIZE, ptrba;
  2533. ADDQ $2*SIZE, ptrbb;
  2534. .L313_loopE:
  2535. #### Handle ####
  2536. XOR_DY yvec7, yvec7, yvec7;
  2537. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  2538. ADDSUB_DY yvec15, yvec7, yvec15;
  2539. ADDSUB_DY yvec14, yvec7, yvec14;
  2540. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  2541. SUB_DY yvec15, yvec7, yvec15;
  2542. SUB_DY yvec14, yvec7, yvec14;
  2543. #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  2544. VPERMILP_DY $0x05, yvec15, yvec15;
  2545. VPERMILP_DY $0x05, yvec14, yvec14;
  2546. ADDSUB_DY yvec15, yvec7, yvec15;
  2547. ADDSUB_DY yvec14, yvec7, yvec14;
  2548. VPERMILP_DY $0x05, yvec15, yvec15;
  2549. VPERMILP_DY $0x05, yvec14, yvec14;
  2550. #endif
  2551. #### Load Alpha ####
  2552. BROAD_DY MEMALPHA_R,yvec7;
  2553. BROAD_DY MEMALPHA_I,yvec6;
  2554. #### Multiply Alpha ####
  2555. VPERMILP_DY $0x05, yvec15, yvec5;
  2556. MUL_DY yvec7, yvec15, yvec15;
  2557. MUL_DY yvec6, yvec5, yvec5;
  2558. ADD2_DY yvec5, yvec15, yvec15;
  2559. VPERMILP_DY $0x05, yvec14, yvec4;
  2560. MUL_DY yvec7, yvec14, yvec14;
  2561. MUL_DY yvec6, yvec4, yvec4;
  2562. ADD2_DY yvec4, yvec14, yvec14;
  2563. EXTRA_DY $1, yvec15, xvec7;
  2564. EXTRA_DY $1, yvec14, xvec6;
  2565. #### Writing Back ####
  2566. #ifndef TRMMKERNEL
  2567. LDL_DX 0*SIZE(C0), xvec0, xvec0;
  2568. LDH_DX 1*SIZE(C0), xvec0, xvec0;
  2569. LDL_DX 2*SIZE(C0), xvec1, xvec1;
  2570. LDH_DX 3*SIZE(C0), xvec1, xvec1;
  2571. LDL_DX 4*SIZE(C0), xvec2, xvec2;
  2572. LDH_DX 5*SIZE(C0), xvec2, xvec2;
  2573. LDL_DX 6*SIZE(C0), xvec3, xvec3;
  2574. LDH_DX 7*SIZE(C0), xvec3, xvec3;
  2575. ADD_DX xvec0, xvec15, xvec15;
  2576. ADD_DX xvec1, xvec7, xvec7;
  2577. ADD_DX xvec2, xvec14, xvec14;
  2578. ADD_DX xvec3, xvec6, xvec6;
  2579. #endif
  2580. STL_DX xvec15, 0*SIZE(C0);
  2581. STH_DX xvec15, 1*SIZE(C0);
  2582. STL_DX xvec7, 2*SIZE(C0);
  2583. STH_DX xvec7, 3*SIZE(C0);
  2584. STL_DX xvec14, 4*SIZE(C0);
  2585. STH_DX xvec14, 5*SIZE(C0);
  2586. STL_DX xvec6, 6*SIZE(C0);
  2587. STH_DX xvec6, 7*SIZE(C0);
  2588. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2589. MOVQ bk, %rax;
  2590. SUBQ kkk, %rax;
  2591. SALQ $ZBASE_SHIFT, %rax;
  2592. LEAQ (ptrba, %rax, 4), ptrba;
  2593. ADDQ %rax, ptrbb;
  2594. #endif
  2595. #if defined(TRMMKERNEL) && defined(LEFT)
  2596. ADDQ $4, kk;
  2597. #endif
  2598. ADDQ $8*SIZE, C0;
  2599. DECQ i;
  2600. JG .L31_bodyB;
  2601. ALIGN_5
  2602. .L31_loopE:
  2603. TEST $2, bm;
  2604. JLE .L32_loopE;
  2605. ALIGN_5
  2606. .L32_bodyB:
  2607. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2608. MOVQ bb,ptrbb;
  2609. #else
  2610. MOVQ bb, ptrbb;
  2611. MOVQ kk, %rax;
  2612. SALQ $ZBASE_SHIFT, %rax;
  2613. LEAQ (ptrba, %rax, 2), ptrba;
  2614. ADDQ %rax, ptrbb;
  2615. #endif
  2616. XOR_DY yvec15, yvec15, yvec15;
  2617. #ifndef TRMMKERNEL
  2618. MOVQ bk,k;
  2619. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  2620. MOVQ bk, %rax;
  2621. SUBQ kk, %rax;
  2622. MOVQ %rax, kkk;
  2623. #else
  2624. MOVQ kk, %rax;
  2625. #ifdef LEFT
  2626. ADDQ $2, %rax;
  2627. #else
  2628. ADDQ $1, %rax;
  2629. #endif
  2630. MOVQ %rax, kkk;
  2631. #endif
  2632. SARQ $2, k;
  2633. JLE .L321_loopE;
  2634. ALIGN_5
  2635. .L321_bodyB:
  2636. LD_DY 0*SIZE(ptrba), yvec0;
  2637. BROAD_DY 0*SIZE(ptrbb), yvec2;
  2638. MUL_DY yvec0, yvec2, yvec6;
  2639. ADD1_DY yvec6, yvec15, yvec15;
  2640. VPERMILP_DY $0x05, yvec0, yvec1;
  2641. BROAD_DY 1*SIZE(ptrbb), yvec3;
  2642. MUL_DY yvec1, yvec3, yvec7;
  2643. ADD2_DY yvec7, yvec15, yvec15;
  2644. LD_DY 4*SIZE(ptrba), yvec0;
  2645. BROAD_DY 2*SIZE(ptrbb), yvec2;
  2646. MUL_DY yvec0, yvec2, yvec6;
  2647. ADD1_DY yvec6, yvec15, yvec15;
  2648. VPERMILP_DY $0x05, yvec0, yvec1;
  2649. BROAD_DY 3*SIZE(ptrbb), yvec3;
  2650. MUL_DY yvec1, yvec3, yvec7;
  2651. ADD2_DY yvec7, yvec15, yvec15;
  2652. LD_DY 8*SIZE(ptrba), yvec0;
  2653. BROAD_DY 4*SIZE(ptrbb), yvec2;
  2654. MUL_DY yvec0, yvec2, yvec6;
  2655. ADD1_DY yvec6, yvec15, yvec15;
  2656. VPERMILP_DY $0x05, yvec0, yvec1;
  2657. BROAD_DY 5*SIZE(ptrbb), yvec3;
  2658. MUL_DY yvec1, yvec3, yvec7;
  2659. ADD2_DY yvec7, yvec15, yvec15;
  2660. LD_DY 12*SIZE(ptrba), yvec0;
  2661. BROAD_DY 6*SIZE(ptrbb), yvec2;
  2662. MUL_DY yvec0, yvec2, yvec6;
  2663. ADD1_DY yvec6, yvec15, yvec15;
  2664. VPERMILP_DY $0x05, yvec0, yvec1;
  2665. BROAD_DY 7*SIZE(ptrbb), yvec3;
  2666. MUL_DY yvec1, yvec3, yvec7;
  2667. ADD2_DY yvec7, yvec15, yvec15;
  2668. ADDQ $16*SIZE, ptrba;
  2669. ADDQ $8*SIZE, ptrbb;
  2670. DECQ k;
  2671. JG .L321_bodyB;
  2672. ALIGN_5
  2673. .L321_loopE:
  2674. #ifndef TRMMKERNEL
  2675. TEST $2, bk;
  2676. #else
  2677. TEST $2, kkk;
  2678. #endif
  2679. JLE .L322_loopE;
  2680. ALIGN_5
  2681. .L322_bodyB:
  2682. LD_DY 0*SIZE(ptrba), yvec0;
  2683. BROAD_DY 0*SIZE(ptrbb), yvec2;
  2684. MUL_DY yvec0, yvec2, yvec6;
  2685. ADD1_DY yvec6, yvec15, yvec15;
  2686. VPERMILP_DY $0x05, yvec0, yvec1;
  2687. BROAD_DY 1*SIZE(ptrbb), yvec3;
  2688. MUL_DY yvec1, yvec3, yvec7;
  2689. ADD2_DY yvec7, yvec15, yvec15;
  2690. LD_DY 4*SIZE(ptrba), yvec0;
  2691. BROAD_DY 2*SIZE(ptrbb), yvec2;
  2692. MUL_DY yvec0, yvec2, yvec6;
  2693. ADD1_DY yvec6, yvec15, yvec15;
  2694. VPERMILP_DY $0x05, yvec0, yvec1;
  2695. BROAD_DY 3*SIZE(ptrbb), yvec3;
  2696. MUL_DY yvec1, yvec3, yvec7;
  2697. ADD2_DY yvec7, yvec15, yvec15;
  2698. ADDQ $8*SIZE, ptrba;
  2699. ADDQ $4*SIZE, ptrbb;
  2700. .L322_loopE:
  2701. #ifndef TRMMKERNEL
  2702. TEST $1, bk;
  2703. #else
  2704. TEST $1, kkk;
  2705. #endif
  2706. JLE .L323_loopE;
  2707. ALIGN_5
  2708. .L323_bodyB:
  2709. LD_DY 0*SIZE(ptrba), yvec0;
  2710. BROAD_DY 0*SIZE(ptrbb), yvec2;
  2711. MUL_DY yvec0, yvec2, yvec6;
  2712. ADD1_DY yvec6, yvec15, yvec15;
  2713. VPERMILP_DY $0x05, yvec0, yvec1;
  2714. BROAD_DY 1*SIZE(ptrbb), yvec3;
  2715. MUL_DY yvec1, yvec3, yvec7;
  2716. ADD2_DY yvec7, yvec15, yvec15;
  2717. ADDQ $4*SIZE, ptrba;
  2718. ADDQ $2*SIZE, ptrbb;
  2719. .L323_loopE:
  2720. #### Handle ####
  2721. XOR_DY yvec7, yvec7, yvec7;
  2722. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  2723. ADDSUB_DY yvec15, yvec7, yvec15;
  2724. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  2725. SUB_DY yvec15, yvec7, yvec15;
  2726. #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  2727. VPERMILP_DY $0x05, yvec15, yvec15;
  2728. ADDSUB_DY yvec15, yvec7, yvec15;
  2729. VPERMILP_DY $0x05, yvec15, yvec15;
  2730. #endif
  2731. #### Load Alpha ####
  2732. BROAD_DY MEMALPHA_R,yvec7;
  2733. BROAD_DY MEMALPHA_I,yvec6;
  2734. #### Multiply Alpha ####
  2735. VPERMILP_DY $0x05, yvec15, yvec5;
  2736. MUL_DY yvec7, yvec15, yvec15;
  2737. MUL_DY yvec6, yvec5, yvec5;
  2738. ADD2_DY yvec5, yvec15, yvec15;
  2739. EXTRA_DY $1, yvec15, xvec7;
  2740. #### Writing Back ####
  2741. #ifndef TRMMKERNEL
  2742. LDL_DX 0*SIZE(C0), xvec0, xvec0;
  2743. LDH_DX 1*SIZE(C0), xvec0, xvec0;
  2744. LDL_DX 2*SIZE(C0), xvec1, xvec1;
  2745. LDH_DX 3*SIZE(C0), xvec1, xvec1;
  2746. ADD_DX xvec0, xvec15, xvec15;
  2747. ADD_DX xvec1, xvec7, xvec7;
  2748. #endif
  2749. STL_DX xvec15, 0*SIZE(C0);
  2750. STH_DX xvec15, 1*SIZE(C0);
  2751. STL_DX xvec7, 2*SIZE(C0);
  2752. STH_DX xvec7, 3*SIZE(C0);
  2753. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2754. MOVQ bk, %rax;
  2755. SUBQ kkk, %rax;
  2756. SALQ $ZBASE_SHIFT, %rax;
  2757. LEAQ (ptrba, %rax, 2), ptrba;
  2758. ADDQ %rax, ptrbb;
  2759. #endif
  2760. #if defined(TRMMKERNEL) && defined(LEFT)
  2761. ADDQ $2, kk;
  2762. #endif
  2763. ADDQ $4*SIZE, C0;
  2764. .L32_loopE:
  2765. TEST $1, bm;
  2766. JLE .L33_loopE;
  2767. ALIGN_5
  2768. .L33_bodyB:
  2769. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2770. MOVQ bb,ptrbb;
  2771. #else
  2772. MOVQ bb, ptrbb;
  2773. MOVQ kk, %rax;
  2774. SALQ $ZBASE_SHIFT, %rax;
  2775. ADDQ %rax, ptrba;
  2776. ADDQ %rax, ptrbb;
  2777. #endif
  2778. XOR_DY yvec15, yvec15, yvec15;
  2779. #ifndef TRMMKERNEL
  2780. MOVQ bk,k;
  2781. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  2782. MOVQ bk, %rax;
  2783. SUBQ kk, %rax;
  2784. MOVQ %rax, kkk;
  2785. #else
  2786. MOVQ kk, %rax;
  2787. #ifdef LEFT
  2788. ADDQ $1, %rax;
  2789. #else
  2790. ADDQ $1, %rax;
  2791. #endif
  2792. MOVQ %rax, kkk;
  2793. #endif
  2794. SARQ $2, k;
  2795. JLE .L331_loopE;
  2796. ALIGN_5
  2797. .L331_bodyB:
  2798. LD_DX 0*SIZE(ptrba), xvec0;
  2799. BROAD_DX 0*SIZE(ptrbb), xvec2;
  2800. MUL_DX xvec0, xvec2, xvec2;
  2801. ADD1_DX xvec2, xvec15, xvec15;
  2802. SHUF_DX $0x4e, xvec0, xvec1;
  2803. BROAD_DX 1*SIZE(ptrbb), xvec3;
  2804. MUL_DX xvec1, xvec3, xvec3;
  2805. ADDSUB_DX xvec3, xvec15, xvec15;
  2806. LD_DX 2*SIZE(ptrba), xvec0;
  2807. BROAD_DX 2*SIZE(ptrbb), xvec2;
  2808. MUL_DX xvec0, xvec2, xvec2;
  2809. ADD1_DX xvec2, xvec15, xvec15;
  2810. SHUF_DX $0x4e, xvec0, xvec1;
  2811. BROAD_DX 3*SIZE(ptrbb), xvec3;
  2812. MUL_DX xvec1, xvec3, xvec3;
  2813. ADDSUB_DX xvec3, xvec15, xvec15;
  2814. LD_DX 4*SIZE(ptrba), xvec0;
  2815. BROAD_DX 4*SIZE(ptrbb), xvec2;
  2816. MUL_DX xvec0, xvec2, xvec2;
  2817. ADD1_DX xvec2, xvec15, xvec15;
  2818. SHUF_DX $0x4e, xvec0, xvec1;
  2819. BROAD_DX 5*SIZE(ptrbb), xvec3;
  2820. MUL_DX xvec1, xvec3, xvec3;
  2821. ADDSUB_DX xvec3, xvec15, xvec15;
  2822. LD_DX 6*SIZE(ptrba), xvec0;
  2823. BROAD_DX 6*SIZE(ptrbb), xvec2;
  2824. MUL_DX xvec0, xvec2, xvec2;
  2825. ADD1_DX xvec2, xvec15, xvec15;
  2826. SHUF_DX $0x4e, xvec0, xvec1;
  2827. BROAD_DX 7*SIZE(ptrbb), xvec3;
  2828. MUL_DX xvec1, xvec3, xvec3;
  2829. ADDSUB_DX xvec3, xvec15, xvec15;
  2830. ADDQ $8*SIZE, ptrba;
  2831. ADDQ $8*SIZE, ptrbb;
  2832. DECQ k;
  2833. JG .L331_bodyB;
  2834. ALIGN_5
  2835. .L331_loopE:
  2836. #ifndef TRMMKERNEL
  2837. TEST $2, bk;
  2838. #else
  2839. TEST $2, kkk;
  2840. #endif
  2841. JLE .L332_loopE;
  2842. ALIGN_5
  2843. .L332_bodyB:
  2844. LD_DX 0*SIZE(ptrba), xvec0;
  2845. BROAD_DX 0*SIZE(ptrbb), xvec2;
  2846. MUL_DX xvec0, xvec2, xvec2;
  2847. ADD1_DX xvec2, xvec15, xvec15;
  2848. SHUF_DX $0x4e, xvec0, xvec1;
  2849. BROAD_DX 1*SIZE(ptrbb), xvec3;
  2850. MUL_DX xvec1, xvec3, xvec3;
  2851. ADDSUB_DX xvec3, xvec15, xvec15;
  2852. LD_DX 2*SIZE(ptrba), xvec0;
  2853. BROAD_DX 2*SIZE(ptrbb), xvec2;
  2854. MUL_DX xvec0, xvec2, xvec2;
  2855. ADD1_DX xvec2, xvec15, xvec15;
  2856. SHUF_DX $0x4e, xvec0, xvec1;
  2857. BROAD_DX 3*SIZE(ptrbb), xvec3;
  2858. MUL_DX xvec1, xvec3, xvec3;
  2859. ADDSUB_DX xvec3, xvec15, xvec15;
  2860. ADDQ $4*SIZE, ptrba;
  2861. ADDQ $4*SIZE, ptrbb;
  2862. .L332_loopE:
  2863. #ifndef TRMMKERNEL
  2864. TEST $1, bk;
  2865. #else
  2866. TEST $1, kkk;
  2867. #endif
  2868. JLE .L333_loopE;
  2869. ALIGN_5
  2870. .L333_bodyB:
  2871. LD_DX 0*SIZE(ptrba), xvec0;
  2872. BROAD_DX 0*SIZE(ptrbb), xvec2;
  2873. MUL_DX xvec0, xvec2, xvec2;
  2874. ADD1_DX xvec2, xvec15, xvec15;
  2875. SHUF_DX $0x4e, xvec0, xvec1;
  2876. BROAD_DX 1*SIZE(ptrbb), xvec3;
  2877. MUL_DX xvec1, xvec3, xvec3;
  2878. ADDSUB_DX xvec3, xvec15, xvec15;
  2879. ADDQ $2*SIZE, ptrba;
  2880. ADDQ $2*SIZE, ptrbb;
  2881. .L333_loopE:
  2882. #### Handle ####
  2883. XOR_DY yvec7, yvec7, yvec7;
  2884. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  2885. ADDSUB_DX xvec15, xvec7, xvec7;
  2886. MOV_DX xvec7, xvec15;
  2887. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  2888. SUB_DX xvec15, xvec7, xvec7;
  2889. MOV_DX xvec7, xvec15;
  2890. #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  2891. SHUF_DX $0x4e, xvec15, xvec15;
  2892. ADDSUB_DX xvec15, xvec7, xvec7;
  2893. MOV_DX xvec7, xvec15;
  2894. SHUF_DX $0x4e, xvec15, xvec15;
  2895. #endif
  2896. #### Load Alpha ####
  2897. BROAD_DX MEMALPHA_R,xvec7;
  2898. BROAD_DX MEMALPHA_I,xvec6;
  2899. #### Multiply Alpha ####
  2900. SHUF_DX $0x4e, xvec15, xvec5;
  2901. MUL_DX xvec7, xvec15, xvec15;
  2902. MUL_DX xvec6, xvec5, xvec5;
  2903. ADDSUB_DX xvec5, xvec15, xvec15;
  2904. #### Writing back ####
  2905. #ifndef TRMMKERNEL
  2906. LDL_DX 0*SIZE(C0), xvec0, xvec0;
  2907. LDH_DX 1*SIZE(C0), xvec0, xvec0;
  2908. ADD_DX xvec0, xvec15, xvec15;
  2909. #endif
  2910. STL_DX xvec15, 0*SIZE(C0);
  2911. STH_DX xvec15, 1*SIZE(C0);
  2912. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2913. MOVQ bk, %rax;
  2914. SUBQ kkk, %rax;
  2915. SALQ $ZBASE_SHIFT, %rax;
  2916. ADDQ %rax, ptrba;
  2917. ADDQ %rax, ptrbb;
  2918. #endif
  2919. #if defined(TRMMKERNEL) && defined(LEFT)
  2920. ADDQ $1, kk;
  2921. #endif
  2922. ADDQ $2*SIZE, C0;
  2923. .L33_loopE:
  2924. #if defined(TRMMKERNEL) && !defined(LEFT)
  2925. ADDQ $1, kk;
  2926. #endif
  2927. MOVQ bk, k;
  2928. SALQ $4*SIZE, k;
  2929. ADDQ k, bb;
  2930. LEAQ (C, ldc, 1), C;
  2931. .L30_loopE:
  2932. movq 0(%rsp), %rbx;
  2933. movq 8(%rsp), %rbp;
  2934. movq 16(%rsp), %r12;
  2935. movq 24(%rsp), %r13;
  2936. movq 32(%rsp), %r14;
  2937. movq 40(%rsp), %r15;
  2938. vzeroupper
  2939. #ifdef WINDOWS_ABI
  2940. movq 48(%rsp), %rdi
  2941. movq 56(%rsp), %rsi
  2942. movups 64(%rsp), %xmm6
  2943. movups 80(%rsp), %xmm7
  2944. movups 96(%rsp), %xmm8
  2945. movups 112(%rsp), %xmm9
  2946. movups 128(%rsp), %xmm10
  2947. movups 144(%rsp), %xmm11
  2948. movups 160(%rsp), %xmm12
  2949. movups 176(%rsp), %xmm13
  2950. movups 192(%rsp), %xmm14
  2951. movups 208(%rsp), %xmm15
  2952. #endif
  2953. addq $STACKSIZE, %rsp;
  2954. ret
  2955. EPILOGUE