
dgemm_kernel_4x8_sandy.S 72 kB

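OpenBLAS DGEMM/DTRMM micro-kernel for Sandy Bridge (AVX): it accumulates alpha*A*B into C over packed A/B panels, using an 8-row by 4-column register tile in the main loops plus smaller tail cases for the remaining rows and columns.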
  1. /*****************************************************************************
  2. Copyright (c) 2011-2014, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written
  16. permission.
  17. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  18. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  21. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  23. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  24. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  25. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  26. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27. **********************************************************************************/
  28. #define ASSEMBLER
  29. #include "common.h"
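// Register roles: old_bm/old_bn/old_bk receive the M, N and K arguments and are
// copied into the callee-saved bm/bn/bk; ba and bb point at the packed A and B
// panels, C and ldc describe the output matrix, i/j/k are loop counters, and
// ptrba/ptrbb walk the packed panels inside the inner loops.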
  30. #define old_bm %rdi
  31. #define old_bn %rsi
  32. #define old_bk %rdx
  33. #define bm %r13
  34. #define bn %r14
  35. #define bk %r15
  36. #define ALPHA %xmm0
  37. #define ba %rcx
  38. #define bb %r8
  39. #define C %r9
  40. #define ldc %r10
  41. #define i %r11
  42. #define k %rax
  43. #define ptrba %rdi
  44. #define ptrbb %rsi
  45. #define C0 %rbx
  46. #define C1 %rbp
  47. #define prebb %r12
  48. #ifndef WINDOWS_ABI
  49. #define STACKSIZE 128
  50. #define old_ldc 8+STACKSIZE(%rsp)
  51. #define old_offset 16+STACKSIZE(%rsp)
  52. #define MEMALPHA 48(%rsp)
  53. #define j 56(%rsp)
  54. #define OFFSET 64(%rsp)
  55. #define kk 72(%rsp)
  56. #define kkk 80(%rsp)
  57. #else
  58. #define STACKSIZE 512
  59. #define OLD_A 40 + STACKSIZE(%rsp)
  60. #define OLD_B 48 + STACKSIZE(%rsp)
  61. #define OLD_C 56 + STACKSIZE(%rsp)
  62. #define old_ldc 64 + STACKSIZE(%rsp)
  63. #define old_offset 72 + STACKSIZE(%rsp)
  64. #define MEMALPHA 224(%rsp)
  65. #define j 232(%rsp)
  66. #define OFFSET 240(%rsp)
  67. #define kk 248(%rsp)
  68. #define kkk 256(%rsp)
  69. #endif
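// Stack frame: STACKSIZE bytes are reserved in the prologue; MEMALPHA keeps a
// copy of alpha for the BROAD_* reloads, and j/OFFSET/kk/kkk are scratch slots.
// The WINDOWS_ABI branch uses a larger frame and reads the extra arguments
// (OLD_A/OLD_B/OLD_C, old_ldc, old_offset) from the caller's stack.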
  70. #define PREFETCH0 prefetcht0
  71. #define PREFETCH1 prefetcht0
  72. #define PREFETCH2 prefetcht2
  73. #define xvec0 %xmm0
  74. #define xvec1 %xmm1
  75. #define xvec2 %xmm2
  76. #define xvec3 %xmm3
  77. #define xvec4 %xmm4
  78. #define xvec5 %xmm5
  79. #define xvec6 %xmm6
  80. #define xvec7 %xmm7
  81. #define xvec8 %xmm8
  82. #define xvec9 %xmm9
  83. #define xvec10 %xmm10
  84. #define xvec11 %xmm11
  85. #define xvec12 %xmm12
  86. #define xvec13 %xmm13
  87. #define xvec14 %xmm14
  88. #define xvec15 %xmm15
  89. #define yvec0 %ymm0
  90. #define yvec1 %ymm1
  91. #define yvec2 %ymm2
  92. #define yvec3 %ymm3
  93. #define yvec4 %ymm4
  94. #define yvec5 %ymm5
  95. #define yvec6 %ymm6
  96. #define yvec7 %ymm7
  97. #define yvec8 %ymm8
  98. #define yvec9 %ymm9
  99. #define yvec10 %ymm10
  100. #define yvec11 %ymm11
  101. #define yvec12 %ymm12
  102. #define yvec13 %ymm13
  103. #define yvec14 %ymm14
  104. #define yvec15 %ymm15
  105. #define LEAQ leaq
  106. #define ADDQ addq
  107. #define MULQ imulq
  108. #define SARQ sarq
  109. #define SALQ salq
  110. #define ANDQ andq
  111. #define SUBQ subq
  112. #define DECQ decq
  113. #define JG jg
  114. #define JLE jle
  115. #define TEST testq
  116. #define OR orq
  117. #define JNE jne
  118. #define NOP
  119. #define XOR xorpd
  120. #undef MOVQ
  121. #define MOVQ movq
  122. #define XOR_DY vxorpd
  123. #define XOR_DX vxorpd
  124. #define LD_DY vmovapd
  125. #define LD_DX vmovapd
  126. #define LDL_DX vmovlpd
  127. #define LDL_DY vmovlpd
  128. #define LDH_DX vmovhpd
  129. #define LDH_DY vmovhpd
  130. #define ST_DY vmovapd
  131. #define ST_DX vmovapd
  132. #define STL_DX vmovlpd
  133. #define STL_DY vmovlpd
  134. #define STH_DX vmovhpd
  135. #define STH_DY vmovhpd
  136. #define EDUP_DY vmovddup
  137. #define ADD_DY vaddpd
  138. #define ADD_DX vaddpd
  139. #define ADD1_DY vaddpd
  140. #define ADD2_DY vaddpd
  141. #define ADDSUB_DY vaddsubpd
  142. #define MUL_DY vmulpd
  143. #define MUL_DX vmulpd
  144. #define SHUF_DY vperm2f128
  145. #define SHUF_DX vpshufd
  146. #define VPERMILP_DY vpermilpd
  147. #define BROAD_DY vbroadcastsd
  148. #define BROAD_DX vmovddup
  149. #define MOV_DY vmovapd
  150. #define MOV_DX vmovapd
  151. #define REVS_DY vshufpd
  152. #define REVS_DX vmovsd
  153. #define EXTRA_DY vextractf128
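// Instruction aliases: the *_DX macros operate on 128-bit xmm registers and the
// *_DY macros on 256-bit ymm registers, all VEX-encoded AVX double-precision
// instructions available on Sandy Bridge (which has no FMA, hence the separate
// MUL_*/ADD_* pairs in the kernels below).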
  154. PROLOGUE
  155. subq $STACKSIZE, %rsp;
  156. movq %rbx, 0(%rsp);
  157. movq %rbp, 8(%rsp);
  158. movq %r12, 16(%rsp);
  159. movq %r13, 24(%rsp);
  160. movq %r14, 32(%rsp);
  161. movq %r15, 40(%rsp);
  162. #ifdef WINDOWS_ABI
  163. movq %rdi, 48(%rsp)
  164. movq %rsi, 56(%rsp)
  165. movups %xmm6, 64(%rsp)
  166. movups %xmm7, 80(%rsp)
  167. movups %xmm8, 96(%rsp)
  168. movups %xmm9, 112(%rsp)
  169. movups %xmm10, 128(%rsp)
  170. movups %xmm11, 144(%rsp)
  171. movups %xmm12, 160(%rsp)
  172. movups %xmm13, 176(%rsp)
  173. movups %xmm14, 192(%rsp)
  174. movups %xmm15, 208(%rsp)
  175. movq ARG1, old_bm
  176. movq ARG2, old_bn
  177. movq ARG3, old_bk
  178. movq OLD_A, ba
  179. movq OLD_B, bb
  180. movq OLD_C, C
  181. movq old_ldc, ldc
  182. #ifdef TRMMKERNEL
  183. movq old_offset, %r11
  184. #endif
  185. movaps %xmm3, %xmm0
  186. #else
  187. movq old_ldc, ldc
  188. #ifdef TRMMKERNEL
  189. movq old_offset, %r11
  190. #endif
  191. #endif
  192. vzeroupper
  193. vmovlps ALPHA, MEMALPHA
  194. movq old_bm, bm
  195. movq old_bn, bn
  196. movq old_bk, bk
  197. leaq (, ldc, SIZE), ldc
  198. #ifdef TRMMKERNEL
  199. movq %r11, OFFSET
  200. #ifndef LEFT
  201. negq %r11;
  202. #endif
  203. movq %r11, kk
  204. #endif
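// TRMM support: when built with TRMMKERNEL, OFFSET/kk/kkk track the per-tile
// offset into the triangular operand and the effective inner-loop trip count;
// a plain GEMM build ignores them and always runs the full bk loop.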
  205. MOVQ bn,j;
  206. SARQ $2,j; # Rn = 4
  207. JLE .L0_loopE;
  208. ALIGN_5;
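// Outer loop over N: each .L0 iteration produces a 4-column panel of C
// (j = bn >> 2 iterations); the bn % 4 tail is handled from .L20 onwards.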
  209. .L0_bodyB:;
  210. #if defined(TRMMKERNEL) && defined(LEFT)
  211. MOVQ OFFSET, %rax;
  212. MOVQ %rax, kk;
  213. #endif
  214. MOVQ C,C0;
  215. LEAQ (C,ldc,2),C1;
  216. MOVQ bk, k;
  217. SALQ $5, k;
  218. LEAQ (bb, k, 1), prebb;
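// prebb = bb + bk*4*SIZE, i.e. just past the packed B panel for these 4
// columns; the prefetcht2 hints below walk it forward to warm up the data the
// next panel iteration will read.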
  219. MOVQ ba,ptrba;
  220. MOVQ bm,i;
  221. SARQ $3,i; # Rm = 8
  222. JLE .L1_loopE;
  223. ALIGN_5;
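// Middle loop over M: each .L1 iteration computes an 8x4 tile of C, accumulated
// in yvec8..yvec15 (eight ymm accumulators, four doubles each).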
  224. .L1_bodyB:;
  225. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  226. MOVQ bb, ptrbb;
  227. #else
  228. MOVQ bb, ptrbb;
  229. MOVQ kk, %rax;
  230. LEAQ (, %rax, SIZE), %rax;
  231. LEAQ (ptrba, %rax, 8), ptrba;
  232. LEAQ (ptrbb, %rax, 4), ptrbb;
  233. #endif
  234. //#### Initial Results Register ####
  235. PREFETCH2 0*SIZE(prebb);
  236. XOR_DY yvec15, yvec15, yvec15;
  237. PREFETCH2 8*SIZE(prebb);
  238. XOR_DY yvec14, yvec14, yvec14;
  239. XOR_DY yvec13, yvec13, yvec13;
  240. ADDQ $16*SIZE, prebb
  241. XOR_DY yvec12, yvec12, yvec12;
  242. PREFETCH0 3*SIZE(C0)
  243. LD_DY 0*SIZE(ptrbb), yvec2;
  244. PREFETCH0 3*SIZE(C0, ldc, 1)
  245. XOR_DY yvec11, yvec11, yvec11;
  246. PREFETCH0 3*SIZE(C1)
  247. XOR_DY yvec10, yvec10, yvec10;
  248. PREFETCH0 3*SIZE(C1, ldc, 1)
  249. LD_DY 0*SIZE(ptrba), yvec0;
  250. XOR_DY yvec9, yvec9, yvec9;
  251. XOR_DY yvec8, yvec8, yvec8;
  252. VPERMILP_DY $0x05, yvec2, yvec3;
  253. #ifndef TRMMKERNEL
  254. MOVQ bk,k;
  255. #elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  256. MOVQ bk, %rax;
  257. SUBQ kk, %rax;
  258. MOVQ %rax, kkk;
  259. #else
  260. MOVQ kk, %rax;
  261. #ifdef LEFT
  262. ADDQ $8, %rax;
  263. #else
  264. ADDQ $4, %rax;
  265. #endif
  266. MOVQ %rax, kkk;
  267. #endif
  268. SARQ $2,k;
  269. JLE .L2_loopE;
  270. ALIGN_5;
  271. .L2_bodyB:;
  272. # Computing kernel
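// Each unrolled step handles one k iteration for the whole 8x4 tile: two ymm
// loads bring in 8 values of A, yvec2 holds 4 values of B with yvec3 as its
// pair-swapped copy (vpermilpd), the vperm2f128 shuffles produce the remaining
// two B orderings, and eight vmulpd/vaddpd pairs accumulate into yvec8..yvec15.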
  273. //#### Unroll times 1 ####
  274. LD_DY 4*SIZE(ptrba), yvec1;
  275. MUL_DY yvec0, yvec2, yvec6;
  276. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  277. MUL_DY yvec0, yvec3, yvec7;
  278. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  279. ADD_DY yvec15, yvec6, yvec15;
  280. ADD_DY yvec13, yvec7, yvec13;
  281. PREFETCH0 64*SIZE(ptrba)
  282. MUL_DY yvec1, yvec2, yvec6;
  283. LD_DY 4*SIZE(ptrbb), yvec2;
  284. MUL_DY yvec1, yvec3, yvec7;
  285. VPERMILP_DY $0x05, yvec2, yvec3;
  286. ADD_DY yvec14, yvec6, yvec14;
  287. ADD_DY yvec12, yvec7, yvec12;
  288. MUL_DY yvec0, yvec4, yvec6;
  289. MUL_DY yvec0, yvec5, yvec7;
  290. LD_DY 8*SIZE(ptrba), yvec0;
  291. ADD_DY yvec11, yvec6, yvec11;
  292. ADD_DY yvec9, yvec7, yvec9;
  293. MUL_DY yvec1, yvec4, yvec6;
  294. MUL_DY yvec1, yvec5, yvec7;
  295. ADD_DY yvec10, yvec6, yvec10;
  296. ADD_DY yvec8, yvec7, yvec8;
  297. //#### Unroll times 2 ####
  298. LD_DY 12*SIZE(ptrba), yvec1;
  299. MUL_DY yvec0, yvec2, yvec6;
  300. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  301. MUL_DY yvec0, yvec3, yvec7;
  302. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  303. ADD_DY yvec15, yvec6, yvec15;
  304. ADD_DY yvec13, yvec7, yvec13;
  305. PREFETCH0 72*SIZE(ptrba)
  306. MUL_DY yvec1, yvec2, yvec6;
  307. LD_DY 8*SIZE(ptrbb), yvec2;
  308. MUL_DY yvec1, yvec3, yvec7;
  309. VPERMILP_DY $0x05, yvec2, yvec3;
  310. ADD_DY yvec14, yvec6, yvec14;
  311. ADD_DY yvec12, yvec7, yvec12;
  312. MUL_DY yvec0, yvec4, yvec6;
  313. MUL_DY yvec0, yvec5, yvec7;
  314. LD_DY 16*SIZE(ptrba), yvec0;
  315. ADD_DY yvec11, yvec6, yvec11;
  316. ADD_DY yvec9, yvec7, yvec9;
  317. MUL_DY yvec1, yvec4, yvec6;
  318. MUL_DY yvec1, yvec5, yvec7;
  319. ADD_DY yvec10, yvec6, yvec10;
  320. ADD_DY yvec8, yvec7, yvec8;
  321. //#### Unroll times 3 ####
  322. LD_DY 20*SIZE(ptrba), yvec1;
  323. MUL_DY yvec0, yvec2, yvec6;
  324. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  325. MUL_DY yvec0, yvec3, yvec7;
  326. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  327. ADD_DY yvec15, yvec6, yvec15;
  328. ADD_DY yvec13, yvec7, yvec13;
  329. PREFETCH0 80*SIZE(ptrba)
  330. MUL_DY yvec1, yvec2, yvec6;
  331. LD_DY 12*SIZE(ptrbb), yvec2;
  332. ADDQ $16*SIZE, ptrbb;
  333. MUL_DY yvec1, yvec3, yvec7;
  334. VPERMILP_DY $0x05, yvec2, yvec3;
  335. ADD_DY yvec14, yvec6, yvec14;
  336. ADD_DY yvec12, yvec7, yvec12;
  337. MUL_DY yvec0, yvec4, yvec6;
  338. MUL_DY yvec0, yvec5, yvec7;
  339. LD_DY 24*SIZE(ptrba), yvec0;
  340. ADD_DY yvec11, yvec6, yvec11;
  341. ADD_DY yvec9, yvec7, yvec9;
  342. MUL_DY yvec1, yvec4, yvec6;
  343. MUL_DY yvec1, yvec5, yvec7;
  344. ADD_DY yvec10, yvec6, yvec10;
  345. ADD_DY yvec8, yvec7, yvec8;
  346. //#### Unroll times 4 ####
  347. LD_DY 28*SIZE(ptrba), yvec1;
  348. MUL_DY yvec0, yvec2, yvec6;
  349. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  350. MUL_DY yvec0, yvec3, yvec7;
  351. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  352. ADDQ $32*SIZE, ptrba;
  353. ADD_DY yvec15, yvec6, yvec15;
  354. ADD_DY yvec13, yvec7, yvec13;
  355. PREFETCH0 88*SIZE(ptrba)
  356. MUL_DY yvec1, yvec2, yvec6;
  357. LD_DY 0*SIZE(ptrbb), yvec2;
  358. MUL_DY yvec1, yvec3, yvec7;
  359. VPERMILP_DY $0x05, yvec2, yvec3;
  360. ADD_DY yvec14, yvec6, yvec14;
  361. ADD_DY yvec12, yvec7, yvec12;
  362. MUL_DY yvec0, yvec4, yvec6;
  363. MUL_DY yvec0, yvec5, yvec7;
  364. LD_DY 0*SIZE(ptrba), yvec0;
  365. ADD_DY yvec11, yvec6, yvec11;
  366. ADD_DY yvec9, yvec7, yvec9;
  367. MUL_DY yvec1, yvec4, yvec6;
  368. MUL_DY yvec1, yvec5, yvec7;
  369. ADD_DY yvec10, yvec6, yvec10;
  370. ADD_DY yvec8, yvec7, yvec8;
  371. .L2_bodyE:;
  372. DECQ k;
  373. JG .L2_bodyB;
  374. ALIGN_5
  375. .L2_loopE:;
  376. PREFETCH2 0*SIZE(prebb);
  377. ADDQ $8*SIZE, prebb;
  378. #ifndef TRMMKERNEL
  379. TEST $2, bk;
  380. #else
  381. MOVQ kkk, %rax;
  382. TEST $2, %rax;
  383. #endif
  384. JLE .L3_loopE;
  385. ALIGN_5
  386. .L3_bodyB:
  387. //#### Unroll times 1 ####
  388. PREFETCH0 64*SIZE(ptrba)
  389. LD_DY 4*SIZE(ptrba), yvec1;
  390. MUL_DY yvec0, yvec2, yvec6;
  391. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  392. MUL_DY yvec0, yvec3, yvec7;
  393. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  394. ADD_DY yvec15, yvec6, yvec15;
  395. ADD_DY yvec13, yvec7, yvec13;
  396. MUL_DY yvec1, yvec2, yvec6;
  397. LD_DY 4*SIZE(ptrbb), yvec2;
  398. ADDQ $8*SIZE, ptrbb;
  399. MUL_DY yvec1, yvec3, yvec7;
  400. VPERMILP_DY $0x05, yvec2, yvec3;
  401. ADD_DY yvec14, yvec6, yvec14;
  402. ADD_DY yvec12, yvec7, yvec12;
  403. MUL_DY yvec0, yvec4, yvec6;
  404. MUL_DY yvec0, yvec5, yvec7;
  405. LD_DY 8*SIZE(ptrba), yvec0;
  406. ADD_DY yvec11, yvec6, yvec11;
  407. ADD_DY yvec9, yvec7, yvec9;
  408. MUL_DY yvec1, yvec4, yvec6;
  409. MUL_DY yvec1, yvec5, yvec7;
  410. ADD_DY yvec10, yvec6, yvec10;
  411. ADD_DY yvec8, yvec7, yvec8;
  412. //#### Unroll times 2 ####
  413. PREFETCH0 72*SIZE(ptrba)
  414. LD_DY 12*SIZE(ptrba), yvec1;
  415. MUL_DY yvec0, yvec2, yvec6;
  416. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  417. MUL_DY yvec0, yvec3, yvec7;
  418. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  419. ADDQ $16*SIZE, ptrba;
  420. ADD_DY yvec15, yvec6, yvec15;
  421. ADD_DY yvec13, yvec7, yvec13;
  422. MUL_DY yvec1, yvec2, yvec6;
  423. LD_DY 0*SIZE(ptrbb), yvec2;
  424. MUL_DY yvec1, yvec3, yvec7;
  425. VPERMILP_DY $0x05, yvec2, yvec3;
  426. ADD_DY yvec14, yvec6, yvec14;
  427. ADD_DY yvec12, yvec7, yvec12;
  428. MUL_DY yvec0, yvec4, yvec6;
  429. MUL_DY yvec0, yvec5, yvec7;
  430. LD_DY 0*SIZE(ptrba), yvec0;
  431. ADD_DY yvec11, yvec6, yvec11;
  432. ADD_DY yvec9, yvec7, yvec9;
  433. MUL_DY yvec1, yvec4, yvec6;
  434. MUL_DY yvec1, yvec5, yvec7;
  435. ADD_DY yvec10, yvec6, yvec10;
  436. ADD_DY yvec8, yvec7, yvec8;
  437. .L3_loopE:
  438. PREFETCH2 0*SIZE(prebb);
  439. ADDQ $8*SIZE, prebb
  440. #ifndef TRMMKERNEL
  441. TEST $1, bk;
  442. #else
  443. MOVQ kkk, %rax;
  444. TEST $1, %rax;
  445. #endif
  446. JLE .L4_loopE;
  447. ALIGN_5
  448. .L4_bodyB:;
  449. //#### Unroll times 1 ####
  450. PREFETCH0 64*SIZE(ptrba)
  451. LD_DY 4*SIZE(ptrba), yvec1;
  452. MUL_DY yvec0, yvec2, yvec6;
  453. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  454. MUL_DY yvec0, yvec3, yvec7;
  455. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  456. ADDQ $8*SIZE, ptrba;
  457. ADD_DY yvec15, yvec6, yvec15;
  458. ADD_DY yvec13, yvec7, yvec13;
  459. MUL_DY yvec1, yvec2, yvec6;
  460. MUL_DY yvec1, yvec3, yvec7;
  461. ADDQ $4*SIZE, ptrbb;
  462. ADD_DY yvec14, yvec6, yvec14;
  463. ADD_DY yvec12, yvec7, yvec12;
  464. MUL_DY yvec0, yvec4, yvec6;
  465. MUL_DY yvec0, yvec5, yvec7;
  466. ADD_DY yvec11, yvec6, yvec11;
  467. ADD_DY yvec9, yvec7, yvec9;
  468. MUL_DY yvec1, yvec4, yvec6;
  469. MUL_DY yvec1, yvec5, yvec7;
  470. ADD_DY yvec10, yvec6, yvec10;
  471. ADD_DY yvec8, yvec7, yvec8;
  472. .L4_loopE:;
  473. //#### Load Alpha ####
  474. BROAD_DY MEMALPHA,yvec7;
  475. //#### Multiply Alpha ####
  476. MUL_DY yvec7,yvec15,yvec15;
  477. MUL_DY yvec7,yvec14,yvec14;
  478. MUL_DY yvec7,yvec13,yvec13;
  479. MUL_DY yvec7,yvec12,yvec12;
  480. MUL_DY yvec7,yvec11,yvec11;
  481. MUL_DY yvec7,yvec10,yvec10;
  482. MUL_DY yvec7,yvec9,yvec9;
  483. MUL_DY yvec7,yvec8,yvec8;
  484. //#### Reverse the Results ####
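// The swapped B orderings used during the multiply leave each accumulator
// holding C elements in a permuted order; these vshufpd exchanges between
// register pairs restore an order in which every 128-bit half written below
// covers two consecutive entries of a C column.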
  485. MOV_DY yvec15,yvec7;
  486. REVS_DY $0x0a,yvec13,yvec15,yvec15;
  487. REVS_DY $0x0a,yvec7,yvec13,yvec13;
  488. MOV_DY yvec14,yvec7;
  489. REVS_DY $0x0a,yvec12,yvec14,yvec14;
  490. REVS_DY $0x0a,yvec7,yvec12,yvec12;
  491. MOV_DY yvec11,yvec7;
  492. REVS_DY $0x0a,yvec9,yvec11,yvec11;
  493. REVS_DY $0x0a,yvec7,yvec9,yvec9;
  494. MOV_DY yvec10,yvec7;
  495. REVS_DY $0x0a,yvec8,yvec10,yvec10;
  496. REVS_DY $0x0a,yvec7,yvec8,yvec8;
  497. //#### Testing alignment ####
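// If both C0 and ldc are 16-byte aligned, the fast path below stores with
// aligned vmovapd; otherwise .L4_loopEx writes each pair of results through
// vmovlpd/vmovhpd.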
  498. MOVQ C0, %rax;
  499. OR ldc, %rax;
  500. TEST $15, %rax;
  501. JNE .L4_loopEx; # Unaligned write-back path
  502. ALIGN_5
  503. //#### Writing Back ####
  504. EXTRA_DY $1,yvec15,xvec7;
  505. EXTRA_DY $1,yvec14,xvec6;
  506. EXTRA_DY $1,yvec13,xvec5;
  507. EXTRA_DY $1,yvec12,xvec4;
  508. EXTRA_DY $1,yvec11,xvec3;
  509. EXTRA_DY $1,yvec10,xvec2;
  510. EXTRA_DY $1,yvec9,xvec1;
  511. EXTRA_DY $1,yvec8,xvec0;
  512. #ifndef TRMMKERNEL
  513. ADD_DY 0*SIZE(C0),xvec15,xvec15;
  514. ADD_DY 2*SIZE(C1),xvec7,xvec7;
  515. ADD_DY 4*SIZE(C0),xvec14,xvec14;
  516. ADD_DY 6*SIZE(C1),xvec6,xvec6;
  517. ADD_DY 0*SIZE(C0,ldc,1),xvec13,xvec13;
  518. ADD_DY 2*SIZE(C1,ldc,1),xvec5,xvec5;
  519. ADD_DY 4*SIZE(C0,ldc,1),xvec12,xvec12;
  520. ADD_DY 6*SIZE(C1,ldc,1),xvec4,xvec4;
  521. ADD_DY 0*SIZE(C1),xvec11,xvec11;
  522. ADD_DY 2*SIZE(C0),xvec3,xvec3;
  523. ADD_DY 4*SIZE(C1),xvec10,xvec10;
  524. ADD_DY 6*SIZE(C0),xvec2,xvec2;
  525. ADD_DY 0*SIZE(C1,ldc,1),xvec9,xvec9;
  526. ADD_DY 2*SIZE(C0,ldc,1),xvec1,xvec1;
  527. ADD_DY 4*SIZE(C1,ldc,1),xvec8,xvec8;
  528. ADD_DY 6*SIZE(C0,ldc,1),xvec0,xvec0;
  529. #endif
  530. ST_DY xvec15, 0*SIZE(C0);
  531. ST_DY xvec7, 2*SIZE(C1);
  532. ST_DY xvec14, 4*SIZE(C0);
  533. ST_DY xvec6, 6*SIZE(C1);
  534. ST_DY xvec13, 0*SIZE(C0,ldc,1);
  535. ST_DY xvec5, 2*SIZE(C1,ldc,1);
  536. ST_DY xvec12, 4*SIZE(C0,ldc,1);
  537. ST_DY xvec4, 6*SIZE(C1,ldc,1);
  538. ST_DY xvec11, 0*SIZE(C1);
  539. ST_DY xvec3, 2*SIZE(C0);
  540. ST_DY xvec10, 4*SIZE(C1);
  541. ST_DY xvec2, 6*SIZE(C0);
  542. ST_DY xvec9, 0*SIZE(C1,ldc,1);
  543. ST_DY xvec1, 2*SIZE(C0,ldc,1);
  544. ST_DY xvec8, 4*SIZE(C1,ldc,1);
  545. ST_DY xvec0, 6*SIZE(C0,ldc,1);
  546. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  547. MOVQ bk, %rax;
  548. SUBQ kkk, %rax;
  549. LEAQ (, %rax, SIZE), %rax;
  550. LEAQ (ptrba, %rax, 8), ptrba;
  551. LEAQ (ptrbb, %rax, 4), ptrbb;
  552. #endif
  553. #if defined(TRMMKERNEL) && defined(LEFT)
  554. ADDQ $8, kk
  555. #endif
  556. ADDQ $8*SIZE,C0;
  557. ADDQ $8*SIZE,C1;
  558. .L1_bodyE:;
  559. DECQ i;
  560. JG .L1_bodyB;
  561. JMP .L1_loopE;
  562. ALIGN_5;
  563. .L4_loopEx:;
  564. EXTRA_DY $1, yvec15, xvec7;
  565. #ifndef TRMMKERNEL
  566. LDL_DY 0*SIZE(C0), xvec6, xvec6;
  567. LDH_DY 1*SIZE(C0), xvec6, xvec6;
  568. ADD_DY xvec6, xvec15, xvec15;
  569. LDL_DY 2*SIZE(C1), xvec5, xvec5;
  570. LDH_DY 3*SIZE(C1), xvec5, xvec5;
  571. ADD_DY xvec5, xvec7, xvec7;
  572. #endif
  573. STL_DY xvec15, 0*SIZE(C0);
  574. STH_DY xvec15, 1*SIZE(C0);
  575. STL_DY xvec7, 2*SIZE(C1);
  576. STH_DY xvec7, 3*SIZE(C1);
  577. EXTRA_DY $1, yvec14, xvec4;
  578. #ifndef TRMMKERNEL
  579. LDL_DY 4*SIZE(C0), xvec3, xvec3;
  580. LDH_DY 5*SIZE(C0), xvec3, xvec3;
  581. ADD_DY xvec3, xvec14, xvec14;
  582. LDL_DY 6*SIZE(C1), xvec2, xvec2;
  583. LDH_DY 7*SIZE(C1), xvec2, xvec2;
  584. ADD_DY xvec2, xvec4, xvec4;
  585. #endif
  586. STL_DY xvec14, 4*SIZE(C0);
  587. STH_DY xvec14, 5*SIZE(C0);
  588. STL_DY xvec4, 6*SIZE(C1);
  589. STH_DY xvec4, 7*SIZE(C1);
  590. EXTRA_DY $1, yvec13, xvec7;
  591. #ifndef TRMMKERNEL
  592. LDL_DY 0*SIZE(C0, ldc, 1), xvec6, xvec6;
  593. LDH_DY 1*SIZE(C0, ldc, 1), xvec6, xvec6;
  594. ADD_DY xvec6, xvec13, xvec13;
  595. LDL_DY 2*SIZE(C1, ldc, 1), xvec5, xvec5;
  596. LDH_DY 3*SIZE(C1, ldc, 1), xvec5, xvec5;
  597. ADD_DY xvec5, xvec7, xvec7;
  598. #endif
  599. STL_DY xvec13, 0*SIZE(C0, ldc, 1);
  600. STH_DY xvec13, 1*SIZE(C0, ldc, 1);
  601. STL_DY xvec7, 2*SIZE(C1, ldc, 1);
  602. STH_DY xvec7, 3*SIZE(C1, ldc, 1);
  603. EXTRA_DY $1, yvec12, xvec4;
  604. #ifndef TRMMKERNEL
  605. LDL_DY 4*SIZE(C0, ldc, 1), xvec3, xvec3;
  606. LDH_DY 5*SIZE(C0, ldc, 1), xvec3, xvec3;
  607. ADD_DY xvec3, xvec12, xvec12;
  608. LDL_DY 6*SIZE(C1, ldc, 1), xvec2, xvec2;
  609. LDH_DY 7*SIZE(C1, ldc, 1), xvec2, xvec2;
  610. ADD_DY xvec2, xvec4, xvec4;
  611. #endif
  612. STL_DY xvec12, 4*SIZE(C0, ldc, 1);
  613. STH_DY xvec12, 5*SIZE(C0, ldc ,1);
  614. STL_DY xvec4, 6*SIZE(C1, ldc, 1);
  615. STH_DY xvec4, 7*SIZE(C1, ldc, 1);
  616. EXTRA_DY $1, yvec11, xvec7;
  617. #ifndef TRMMKERNEL
  618. LDL_DY 0*SIZE(C1), xvec6, xvec6;
  619. LDH_DY 1*SIZE(C1), xvec6, xvec6;
  620. ADD_DY xvec6, xvec11, xvec11;
  621. LDL_DY 2*SIZE(C0), xvec5, xvec5;
  622. LDH_DY 3*SIZE(C0), xvec5, xvec5;
  623. ADD_DY xvec5, xvec7, xvec7;
  624. #endif
  625. STL_DY xvec11, 0*SIZE(C1);
  626. STH_DY xvec11, 1*SIZE(C1);
  627. STL_DY xvec7, 2*SIZE(C0);
  628. STH_DY xvec7, 3*SIZE(C0);
  629. EXTRA_DY $1, yvec10, xvec4;
  630. #ifndef TRMMKERNEL
  631. LDL_DY 4*SIZE(C1), xvec3, xvec3;
  632. LDH_DY 5*SIZE(C1), xvec3, xvec3;
  633. ADD_DY xvec3, xvec10, xvec10;
  634. LDL_DY 6*SIZE(C0), xvec2, xvec2;
  635. LDH_DY 7*SIZE(C0), xvec2, xvec2;
  636. ADD_DY xvec2, xvec4, xvec4;
  637. #endif
  638. STL_DY xvec10, 4*SIZE(C1);
  639. STH_DY xvec10, 5*SIZE(C1);
  640. STL_DY xvec4, 6*SIZE(C0);
  641. STH_DY xvec4, 7*SIZE(C0);
  642. EXTRA_DY $1, yvec9, xvec7;
  643. #ifndef TRMMKERNEL
  644. LDL_DY 0*SIZE(C1, ldc, 1), xvec6, xvec6;
  645. LDH_DY 1*SIZE(C1, ldc, 1), xvec6, xvec6;
  646. ADD_DY xvec6, xvec9, xvec9;
  647. LDL_DY 2*SIZE(C0, ldc, 1), xvec5, xvec5;
  648. LDH_DY 3*SIZE(C0, ldc ,1), xvec5, xvec5;
  649. ADD_DY xvec5, xvec7, xvec7;
  650. #endif
  651. STL_DY xvec9, 0*SIZE(C1, ldc, 1);
  652. STH_DY xvec9, 1*SIZE(C1, ldc, 1);
  653. STL_DY xvec7, 2*SIZE(C0, ldc, 1);
  654. STH_DY xvec7, 3*SIZE(C0, ldc, 1);
  655. EXTRA_DY $1, yvec8, xvec4;
  656. #ifndef TRMMKERNEL
  657. LDL_DY 4*SIZE(C1, ldc, 1), xvec3, xvec3;
  658. LDH_DY 5*SIZE(C1, ldc, 1), xvec3, xvec3;
  659. ADD_DY xvec3, xvec8, xvec8;
  660. LDL_DY 6*SIZE(C0, ldc, 1), xvec2, xvec2;
  661. LDH_DY 7*SIZE(C0, ldc, 1), xvec2, xvec2;
  662. ADD_DY xvec2, xvec4, xvec4;
  663. #endif
  664. STL_DY xvec8, 4*SIZE(C1, ldc, 1);
  665. STH_DY xvec8, 5*SIZE(C1, ldc, 1);
  666. STL_DY xvec4, 6*SIZE(C0, ldc, 1);
  667. STH_DY xvec4, 7*SIZE(C0, ldc, 1);
  668. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  669. MOVQ bk, %rax;
  670. SUBQ kkk, %rax;
  671. LEAQ (, %rax, SIZE), %rax;
  672. LEAQ (ptrba, %rax, 8), ptrba;
  673. LEAQ (ptrbb, %rax, 4), ptrbb;
  674. #endif
  675. #if defined(TRMMKERNEL) && defined(LEFT)
  676. ADDQ $8, kk
  677. #endif
  678. ADDQ $8*SIZE, C0;
  679. ADDQ $8*SIZE, C1;
  680. DECQ i;
  681. JG .L1_bodyB;
  682. ALIGN_5
  683. .L1_loopE:;
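// M tail for this 4-column panel: a remaining block of 4 rows (.L5), then
// 2 rows (.L9), then a single row (.L13), all reusing the same packed B panel.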
  684. TEST $4, bm; # Rm = 4
  685. JLE .L5_loopE;
  686. ALIGN_5
  687. .L5_bodyB:;
  688. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  689. MOVQ bb, ptrbb;
  690. #else
  691. MOVQ bb, ptrbb;
  692. MOVQ kk, %rax;
  693. LEAQ (, %rax, SIZE), %rax;
  694. LEAQ (ptrba, %rax, 4), ptrba;
  695. LEAQ (ptrbb, %rax, 4), ptrbb;
  696. #endif
  697. //#### Initial Results Register ####
  698. XOR_DY yvec15, yvec15, yvec15;
  699. XOR_DY yvec13, yvec13, yvec13;
  700. LD_DY 0*SIZE(ptrbb), yvec2;
  701. XOR_DY yvec11, yvec11, yvec11;
  702. XOR_DY yvec9, yvec9, yvec9;
  703. LD_DY 0*SIZE(ptrba), yvec0;
  704. VPERMILP_DY $0x05, yvec2, yvec3;
  705. #ifndef TRMMKERNEL
  706. MOVQ bk, k;
  707. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  708. MOVQ bk, %rax;
  709. SUBQ kk, %rax;
  710. MOVQ %rax, kkk;
  711. #else
  712. MOVQ kk, %rax;
  713. #ifdef LEFT
  714. ADDQ $4, %rax;
  715. #else
  716. ADDQ $4, %rax;
  717. #endif
  718. MOVQ %rax, kkk;
  719. #endif
  720. SARQ $2, k;
  721. JLE .L6_loopE;
  722. ALIGN_5;
  723. .L6_bodyB:;
  724. # Computing kernel
  725. //#### Unroll time 1 ####
  726. LD_DY 4*SIZE(ptrba), yvec1;
  727. MUL_DY yvec0, yvec2, yvec6;
  728. ADD_DY yvec15, yvec6, yvec15;
  729. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  730. MUL_DY yvec0, yvec3, yvec7;
  731. ADD_DY yvec13, yvec7, yvec13;
  732. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  733. LD_DY 4*SIZE(ptrbb), yvec2;
  734. MUL_DY yvec0, yvec4, yvec6;
  735. ADD_DY yvec11, yvec6, yvec11;
  736. VPERMILP_DY $0x05, yvec2, yvec3;
  737. MUL_DY yvec0, yvec5, yvec7;
  738. ADD_DY yvec9, yvec7, yvec9;
  739. //#### Unroll time 2 ####
  740. LD_DY 8*SIZE(ptrba), yvec0;
  741. MUL_DY yvec1, yvec2, yvec6;
  742. ADD_DY yvec15, yvec6, yvec15;
  743. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  744. MUL_DY yvec1, yvec3, yvec7;
  745. ADD_DY yvec13, yvec7, yvec13;
  746. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  747. LD_DY 8*SIZE(ptrbb), yvec2;
  748. MUL_DY yvec1, yvec4, yvec6;
  749. ADD_DY yvec11, yvec6, yvec11;
  750. VPERMILP_DY $0x05, yvec2, yvec3;
  751. MUL_DY yvec1, yvec5, yvec7;
  752. ADD_DY yvec9, yvec7, yvec9;
  753. //#### Unroll time 3 ####
  754. LD_DY 12*SIZE(ptrba), yvec1;
  755. MUL_DY yvec0, yvec2, yvec6;
  756. ADD_DY yvec15, yvec6, yvec15;
  757. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  758. ADDQ $16*SIZE, ptrba;
  759. MUL_DY yvec0, yvec3, yvec7;
  760. ADD_DY yvec13, yvec7, yvec13;
  761. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  762. LD_DY 12*SIZE(ptrbb), yvec2;
  763. MUL_DY yvec0, yvec4, yvec6;
  764. ADD_DY yvec11, yvec6, yvec11;
  765. VPERMILP_DY $0x05, yvec2, yvec3;
  766. ADDQ $16*SIZE, ptrbb;
  767. MUL_DY yvec0, yvec5, yvec7;
  768. ADD_DY yvec9, yvec7, yvec9;
  769. //#### Unroll time 4 ####
  770. LD_DY 0*SIZE(ptrba), yvec0;
  771. MUL_DY yvec1, yvec2, yvec6;
  772. ADD_DY yvec15, yvec6, yvec15;
  773. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  774. MUL_DY yvec1, yvec3, yvec7;
  775. ADD_DY yvec13, yvec7, yvec13;
  776. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  777. LD_DY 0*SIZE(ptrbb), yvec2;
  778. MUL_DY yvec1, yvec4, yvec6;
  779. ADD_DY yvec11, yvec6, yvec11;
  780. VPERMILP_DY $0x05, yvec2, yvec3;
  781. MUL_DY yvec1, yvec5, yvec7;
  782. ADD_DY yvec9, yvec7, yvec9;
  783. DECQ k;
  784. JG .L6_bodyB;
  785. ALIGN_5
  786. .L6_loopE:;
  787. #ifndef TRMMKERNEL
  788. TEST $2, bk;
  789. #else
  790. MOVQ kkk, %rax;
  791. TEST $2, %rax;
  792. #endif
  793. JLE .L7_loopE;
  794. ALIGN_5
  795. .L7_bodyB:;
  796. //#### Unroll time 1 ####
  797. LD_DY 4*SIZE(ptrba), yvec1;
  798. MUL_DY yvec0, yvec2, yvec6;
  799. ADD_DY yvec15, yvec6, yvec15;
  800. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  801. ADDQ $8*SIZE, ptrba;
  802. MUL_DY yvec0, yvec3, yvec7;
  803. ADD_DY yvec13, yvec7, yvec13;
  804. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  805. LD_DY 4*SIZE(ptrbb), yvec2;
  806. MUL_DY yvec0, yvec4, yvec6;
  807. ADD_DY yvec11, yvec6, yvec11;
  808. VPERMILP_DY $0x05, yvec2, yvec3;
  809. ADDQ $8*SIZE, ptrbb;
  810. MUL_DY yvec0, yvec5, yvec7;
  811. ADD_DY yvec9, yvec7, yvec9;
  812. //#### Unroll time 2 ####
  813. LD_DY 0*SIZE(ptrba), yvec0;
  814. MUL_DY yvec1, yvec2, yvec6;
  815. ADD_DY yvec15, yvec6, yvec15;
  816. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  817. MUL_DY yvec1, yvec3, yvec7;
  818. ADD_DY yvec13, yvec7, yvec13;
  819. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  820. LD_DY 0*SIZE(ptrbb), yvec2;
  821. MUL_DY yvec1, yvec4, yvec6;
  822. ADD_DY yvec11, yvec6, yvec11;
  823. VPERMILP_DY $0x05, yvec2, yvec3;
  824. MUL_DY yvec1, yvec5, yvec7;
  825. ADD_DY yvec9, yvec7, yvec9;
  826. .L7_loopE:;
  827. #ifndef TRMMKERNEL
  828. TEST $1, bk
  829. #else
  830. MOVQ kkk, %rax;
  831. TEST $1, %rax;
  832. #endif
  833. JLE .L8_loopE;
  834. ALIGN_5
  835. .L8_bodyB:;
  836. //#### Unroll time 1 ####
  837. MUL_DY yvec0, yvec2, yvec6;
  838. ADD_DY yvec15, yvec6, yvec15;
  839. SHUF_DY $0x03, yvec2, yvec2, yvec4;
  840. ADDQ $4*SIZE, ptrba;
  841. MUL_DY yvec0, yvec3, yvec7;
  842. ADD_DY yvec13, yvec7, yvec13;
  843. SHUF_DY $0x03, yvec3, yvec3, yvec5;
  844. MUL_DY yvec0, yvec4, yvec6;
  845. ADD_DY yvec11, yvec6, yvec11;
  846. ADDQ $4*SIZE, ptrbb;
  847. MUL_DY yvec0, yvec5, yvec7;
  848. ADD_DY yvec9, yvec7, yvec9;
  849. .L8_loopE:;
  850. //#### Load Alpha ####
  851. BROAD_DY MEMALPHA, yvec7;
  852. //#### Multiply Alpha ####
  853. MUL_DY yvec7,yvec15,yvec15;
  854. MUL_DY yvec7,yvec13,yvec13;
  855. MUL_DY yvec7,yvec11,yvec11;
  856. MUL_DY yvec7,yvec9,yvec9;
  857. //#### Reverse the Results ####
  858. MOV_DY yvec15, yvec7;
  859. REVS_DY $0x0a,yvec13,yvec15,yvec15;
  860. REVS_DY $0x0a,yvec7,yvec13,yvec13;
  861. MOV_DY yvec11,yvec7;
  862. REVS_DY $0x0a,yvec9,yvec11,yvec11;
  863. REVS_DY $0x0a,yvec7,yvec9,yvec9;
  864. //#### Testing alignment ####
  865. MOVQ C0, %rax;
  866. OR ldc, %rax;
  867. TEST $15, %rax;
  868. JNE .L8_loopEx; # Unaligned write-back path
  869. ALIGN_5
  870. //#### Writing Back ####
  871. EXTRA_DY $1,yvec15,xvec7;
  872. EXTRA_DY $1,yvec13,xvec5;
  873. EXTRA_DY $1,yvec11,xvec3;
  874. EXTRA_DY $1,yvec9,xvec1;
  875. #ifndef TRMMKERNEL
  876. ADD_DX 0*SIZE(C0), xvec15, xvec15;
  877. ADD_DX 2*SIZE(C1), xvec7, xvec7;
  878. ADD_DX 0*SIZE(C0, ldc, 1), xvec13, xvec13;
  879. ADD_DX 2*SIZE(C1, ldc, 1), xvec5, xvec5;
  880. ADD_DX 0*SIZE(C1), xvec11, xvec11;
  881. ADD_DX 2*SIZE(C0), xvec3, xvec3;
  882. ADD_DX 0*SIZE(C1, ldc, 1), xvec9, xvec9;
  883. ADD_DX 2*SIZE(C0, ldc, 1), xvec1, xvec1;
  884. #endif
  885. ST_DX xvec15, 0*SIZE(C0);
  886. ST_DX xvec7, 2*SIZE(C1);
  887. ST_DX xvec13, 0*SIZE(C0,ldc,1);
  888. ST_DX xvec5, 2*SIZE(C1,ldc,1);
  889. ST_DX xvec11, 0*SIZE(C1);
  890. ST_DX xvec3, 2*SIZE(C0);
  891. ST_DX xvec9, 0*SIZE(C1,ldc,1);
  892. ST_DX xvec1, 2*SIZE(C0,ldc,1);
  893. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  894. MOVQ bk, %rax;
  895. SUBQ kkk, %rax;
  896. LEAQ (, %rax, SIZE), %rax;
  897. LEAQ (ptrba, %rax, 4), ptrba;
  898. LEAQ (ptrbb, %rax, 4), ptrbb;
  899. #endif
  900. #if defined(TRMMKERNEL)&&defined(LEFT)
  901. ADDQ $4, kk
  902. #endif
  903. ADDQ $4*SIZE, C0;
  904. ADDQ $4*SIZE, C1;
  905. JMP .L5_loopE;
  906. ALIGN_5
  907. .L8_loopEx:;
  908. EXTRA_DY $1,yvec15,xvec7;
  909. EXTRA_DY $1,yvec13,xvec5;
  910. EXTRA_DY $1,yvec11,xvec3;
  911. EXTRA_DY $1,yvec9,xvec1;
  912. #ifndef TRMMKERNEL
  913. LDL_DX 0*SIZE(C0), xvec14, xvec14;
  914. LDH_DX 1*SIZE(C0), xvec14, xvec14;
  915. LDL_DX 0*SIZE(C0, ldc, 1), xvec12, xvec12;
  916. LDH_DX 1*SIZE(C0, ldc, 1), xvec12, xvec12;
  917. LDL_DX 0*SIZE(C1), xvec10, xvec10;
  918. LDH_DX 1*SIZE(C1), xvec10, xvec10;
  919. LDL_DX 0*SIZE(C1, ldc, 1), xvec8, xvec8;
  920. LDH_DX 1*SIZE(C1, ldc, 1), xvec8, xvec8;
  921. ADD_DX xvec14, xvec15, xvec15;
  922. ADD_DX xvec12, xvec13, xvec13;
  923. ADD_DX xvec10, xvec11, xvec11;
  924. ADD_DX xvec8, xvec9, xvec9;
  925. #endif
  926. STL_DX xvec15, 0*SIZE(C0);
  927. STH_DX xvec15, 1*SIZE(C0);
  928. STL_DX xvec13, 0*SIZE(C0, ldc, 1);
  929. STH_DX xvec13, 1*SIZE(C0, ldc, 1);
  930. STL_DX xvec11, 0*SIZE(C1);
  931. STH_DX xvec11, 1*SIZE(C1);
  932. STL_DX xvec9, 0*SIZE(C1, ldc, 1);
  933. STH_DX xvec9, 1*SIZE(C1, ldc, 1);
  934. #ifndef TRMMKERNEL
  935. LDL_DX 2*SIZE(C0), xvec0, xvec0;
  936. LDH_DX 3*SIZE(C0), xvec0, xvec0;
  937. LDL_DX 2*SIZE(C0, ldc, 1), xvec2, xvec2;
  938. LDH_DX 3*SIZE(C0, ldc, 1), xvec2, xvec2;
  939. LDL_DX 2*SIZE(C1), xvec4, xvec4;
  940. LDH_DX 3*SIZE(C1), xvec4, xvec4;
  941. LDL_DX 2*SIZE(C1, ldc, 1), xvec6, xvec6;
  942. LDH_DX 3*SIZE(C1, ldc, 1), xvec6, xvec6;
  943. ADD_DX xvec0, xvec3, xvec3;
  944. ADD_DX xvec2, xvec1, xvec1;
  945. ADD_DX xvec4, xvec7, xvec7;
  946. ADD_DX xvec6, xvec5, xvec5;
  947. #endif
  948. STL_DX xvec3, 2*SIZE(C0);
  949. STH_DX xvec3, 3*SIZE(C0);
  950. STL_DX xvec1, 2*SIZE(C0, ldc, 1);
  951. STH_DX xvec1, 3*SIZE(C0, ldc, 1);
  952. STL_DX xvec7, 2*SIZE(C1);
  953. STH_DX xvec7, 3*SIZE(C1);
  954. STL_DX xvec5, 2*SIZE(C1, ldc, 1);
  955. STH_DX xvec5, 3*SIZE(C1, ldc, 1);
  956. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  957. MOVQ bk, %rax;
  958. SUBQ kkk, %rax;
  959. LEAQ (, %rax, SIZE), %rax;
  960. LEAQ (ptrba, %rax, 4), ptrba;
  961. LEAQ (ptrbb, %rax, 4), ptrbb;
  962. #endif
  963. #if defined(TRMMKERNEL)&&defined(LEFT)
  964. ADDQ $4, kk
  965. #endif
  966. ADDQ $4*SIZE, C0;
  967. ADDQ $4*SIZE, C1;
  968. .L5_loopE:;
  969. TEST $2, bm;
  970. JLE .L9_loopE;
  971. ALIGN_5
  972. .L9_bodyB:;
  973. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  974. MOVQ bb, ptrbb;
  975. #else
  976. MOVQ bb, ptrbb;
  977. MOVQ kk, %rax;
  978. LEAQ (, %rax, SIZE), %rax;
  979. LEAQ (ptrba, %rax, 2), ptrba;
  980. LEAQ (ptrbb, %rax, 4), ptrbb
  981. #endif
  982. //#### Initial Results Register ####
  983. LD_DX 0*SIZE(ptrbb), xvec2;
  984. XOR_DY yvec15, yvec15, yvec15;
  985. LD_DX 2*SIZE(ptrbb), xvec3;
  986. XOR_DY yvec13, yvec13, yvec13;
  987. LD_DX 0*SIZE(ptrba), xvec0;
  988. XOR_DY yvec11, yvec11, yvec11;
  989. SHUF_DX $0x4e, xvec2, xvec4;
  990. XOR_DY yvec9, yvec9, yvec9;
  991. #ifndef TRMMKERNEL
  992. MOVQ bk, k;
  993. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  994. MOVQ bk, %rax;
  995. SUBQ kk, %rax;
  996. MOVQ %rax, kkk;
  997. #else
  998. MOVQ kk, %rax;
  999. #ifdef LEFT
  1000. ADDQ $2, %rax;
  1001. #else
  1002. ADDQ $4, %rax;
  1003. #endif
  1004. MOVQ %rax, kkk;
  1005. #endif
  1006. SARQ $2, k;
  1007. JLE .L10_loopE;
  1008. ALIGN_5;
  1009. .L10_bodyB:;
  1010. # Computing kernel
  1011. //#### Unroll time 1 ####
  1012. LD_DX 4*SIZE(ptrbb), xvec6;
  1013. SHUF_DX $0x4e, xvec3, xvec5;
  1014. MUL_DX xvec0, xvec2, xvec2;
  1015. ADD_DX xvec2, xvec15, xvec15;
  1016. LD_DX 6*SIZE(ptrbb), xvec7;
  1017. MUL_DX xvec0, xvec3, xvec3;
  1018. ADD_DX xvec3, xvec11, xvec11;
  1019. LD_DX 2*SIZE(ptrba), xvec1;
  1020. MUL_DX xvec0, xvec4, xvec4;
  1021. ADD_DX xvec4, xvec13, xvec13;
  1022. SHUF_DX $0x4e, xvec6, xvec4;
  1023. MUL_DX xvec0, xvec5, xvec5;
  1024. ADD_DX xvec5, xvec9, xvec9;
  1025. //#### Unroll time 2 ####
  1026. LD_DX 8*SIZE(ptrbb), xvec2;
  1027. SHUF_DX $0x4e, xvec7, xvec5;
  1028. MUL_DX xvec1, xvec6, xvec6;
  1029. ADD_DX xvec6, xvec15, xvec15;
  1030. LD_DX 10*SIZE(ptrbb), xvec3;
  1031. MUL_DX xvec1, xvec7, xvec7;
  1032. ADD_DX xvec7, xvec11, xvec11;
  1033. LD_DX 4*SIZE(ptrba), xvec0;
  1034. MUL_DX xvec1, xvec4, xvec4;
  1035. ADD_DX xvec4, xvec13, xvec13;
  1036. SHUF_DX $0x4e, xvec2, xvec4;
  1037. MUL_DX xvec1, xvec5, xvec5;
  1038. ADD_DX xvec5, xvec9, xvec9;
  1039. //#### Unroll time 3 ####
  1040. LD_DX 12*SIZE(ptrbb), xvec6;
  1041. SHUF_DX $0x4e, xvec3, xvec5;
  1042. MUL_DX xvec0, xvec2, xvec2;
  1043. ADD_DX xvec2, xvec15, xvec15;
  1044. LD_DX 14*SIZE(ptrbb), xvec7;
  1045. MUL_DX xvec0, xvec3, xvec3;
  1046. ADD_DX xvec3, xvec11, xvec11;
  1047. ADDQ $16*SIZE, ptrbb;
  1048. LD_DX 6*SIZE(ptrba), xvec1;
  1049. MUL_DX xvec0, xvec4, xvec4;
  1050. ADD_DX xvec4, xvec13, xvec13;
  1051. SHUF_DX $0x4e, xvec6, xvec4;
  1052. ADDQ $8*SIZE, ptrba;
  1053. MUL_DX xvec0, xvec5, xvec5;
  1054. ADD_DX xvec5, xvec9, xvec9;
  1055. //#### Unroll time 4 ####
  1056. LD_DX 0*SIZE(ptrbb), xvec2;
  1057. SHUF_DX $0x4e, xvec7, xvec5;
  1058. MUL_DX xvec1, xvec6, xvec6;
  1059. ADD_DX xvec6, xvec15, xvec15;
  1060. LD_DX 2*SIZE(ptrbb), xvec3;
  1061. MUL_DX xvec1, xvec7, xvec7;
  1062. ADD_DX xvec7, xvec11, xvec11;
  1063. LD_DX 0*SIZE(ptrba), xvec0;
  1064. MUL_DX xvec1, xvec4, xvec4;
  1065. ADD_DX xvec4, xvec13, xvec13;
  1066. SHUF_DX $0x4e, xvec2, xvec4;
  1067. MUL_DX xvec1, xvec5, xvec5;
  1068. ADD_DX xvec5, xvec9, xvec9;
  1069. DECQ k;
  1070. JG .L10_bodyB;
  1071. ALIGN_5
  1072. .L10_loopE:;
  1073. #ifndef TRMMKERNEL
  1074. TEST $2, bk
  1075. #else
  1076. MOVQ kkk, %rax;
  1077. TEST $2, %rax;
  1078. #endif
  1079. JLE .L11_loopE;
  1080. ALIGN_5
  1081. .L11_bodyB:;
  1082. //#### Unroll time 1 ####
  1083. LD_DX 4*SIZE(ptrbb), xvec6;
  1084. SHUF_DX $0x4e, xvec3, xvec5;
  1085. MUL_DX xvec0, xvec2, xvec2;
  1086. ADD_DX xvec2, xvec15, xvec15;
  1087. LD_DX 6*SIZE(ptrbb), xvec7;
  1088. MUL_DX xvec0, xvec3, xvec3;
  1089. ADD_DX xvec3, xvec11, xvec11;
  1090. ADDQ $8*SIZE, ptrbb;
  1091. LD_DX 2*SIZE(ptrba), xvec1;
  1092. MUL_DX xvec0, xvec4, xvec4;
  1093. ADD_DX xvec4, xvec13, xvec13;
  1094. SHUF_DX $0x4e, xvec6, xvec4;
  1095. ADDQ $4*SIZE, ptrba;
  1096. MUL_DX xvec0, xvec5, xvec5;
  1097. ADD_DX xvec5, xvec9, xvec9;
  1098. //#### Unroll time 2 ####
  1099. LD_DX 0*SIZE(ptrbb), xvec2;
  1100. SHUF_DX $0x4e, xvec7, xvec5;
  1101. MUL_DX xvec1, xvec6, xvec6;
  1102. ADD_DX xvec6, xvec15, xvec15;
  1103. LD_DX 2*SIZE(ptrbb), xvec3;
  1104. MUL_DX xvec1, xvec7, xvec7;
  1105. ADD_DX xvec7, xvec11, xvec11;
  1106. LD_DX 0*SIZE(ptrba), xvec0;
  1107. MUL_DX xvec1, xvec4, xvec4;
  1108. ADD_DX xvec4, xvec13, xvec13;
  1109. SHUF_DX $0x4e, xvec2, xvec4;
  1110. MUL_DX xvec1, xvec5, xvec5;
  1111. ADD_DX xvec5, xvec9, xvec9;
  1112. .L11_loopE:;
  1113. #ifndef TRMMKERNEL
  1114. TEST $1, bk
  1115. #else
  1116. MOVQ kkk, %rax;
  1117. TEST $1, %rax;
  1118. #endif
  1119. JLE .L12_loopE;
  1120. ALIGN_5
  1121. .L12_bodyB:;
  1122. SHUF_DX $0x4e, xvec3, xvec5;
  1123. MUL_DX xvec0, xvec2, xvec2;
  1124. ADD_DX xvec2, xvec15, xvec15;
  1125. ADDQ $4*SIZE, ptrbb;
  1126. MUL_DX xvec0, xvec3, xvec3;
  1127. ADD_DX xvec3, xvec11, xvec11;
  1128. ADDQ $2*SIZE, ptrba;
  1129. MUL_DX xvec0, xvec4, xvec4;
  1130. ADD_DX xvec4, xvec13, xvec13;
  1131. MUL_DX xvec0, xvec5, xvec5;
  1132. ADD_DX xvec5, xvec9, xvec9;
  1133. .L12_loopE:;
  1134. //#### Load Alpha ####
  1135. BROAD_DX MEMALPHA, xvec7;
  1136. //#### Multiply Alpha ####
  1137. MUL_DX xvec7, xvec15, xvec15;
  1138. MUL_DX xvec7, xvec13, xvec13;
  1139. MUL_DX xvec7, xvec11, xvec11;
  1140. MUL_DX xvec7, xvec9, xvec9;
  1141. //#### Reverse the Results ####
  1142. MOV_DX xvec15, xvec6;
  1143. REVS_DX xvec13, xvec15, xvec15;
  1144. REVS_DX xvec6, xvec13, xvec13;
  1145. MOV_DX xvec11, xvec6;
  1146. REVS_DX xvec9, xvec11, xvec11;
  1147. REVS_DX xvec6, xvec9, xvec9;
  1148. //#### Testing Alignment ####
  1149. MOVQ C0, %rax;
  1150. OR ldc, %rax;
  1151. TEST $15, %rax;
  1152. JNE .L12_loopEx;
  1153. ALIGN_5
  1154. //#### Writing Back ####
  1155. #ifndef TRMMKERNEL
  1156. ADD_DX 0*SIZE(C0), xvec13, xvec13;
  1157. ADD_DX 0*SIZE(C0, ldc, 1), xvec15, xvec15;
  1158. ADD_DX 0*SIZE(C1), xvec9, xvec9;
  1159. ADD_DX 0*SIZE(C1, ldc, 1), xvec11, xvec11;
  1160. #endif
  1161. ST_DX xvec13, 0*SIZE(C0);
  1162. ST_DX xvec15, 0*SIZE(C0, ldc, 1);
  1163. ST_DX xvec9, 0*SIZE(C1);
  1164. ST_DX xvec11, 0*SIZE(C1, ldc, 1);
  1165. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  1166. MOVQ bk, %rax;
  1167. SUBQ kkk, %rax;
  1168. LEAQ (,%rax, SIZE), %rax;
  1169. LEAQ (ptrba, %rax, 2), ptrba;
  1170. LEAQ (ptrbb, %rax, 4), ptrbb;
  1171. #endif
  1172. #if defined(TRMMKERNEL) && defined(LEFT)
  1173. ADDQ $2, kk
  1174. #endif
  1175. ADDQ $2*SIZE, C0
  1176. ADDQ $2*SIZE, C1
  1177. JMP .L9_loopE;
  1178. ALIGN_5
  1179. .L12_loopEx:
  1180. #ifndef TRMMKERNEL
  1181. LDL_DX 0*SIZE(C0), xvec14, xvec14;
  1182. LDH_DX 1*SIZE(C0), xvec14, xvec14;
  1183. LDL_DX 0*SIZE(C0, ldc, 1), xvec12, xvec12;
  1184. LDH_DX 1*SIZE(C0, ldc, 1), xvec12, xvec12;
  1185. LDL_DX 0*SIZE(C1), xvec10, xvec10;
  1186. LDH_DX 1*SIZE(C1), xvec10, xvec10;
  1187. LDL_DX 0*SIZE(C1, ldc, 1), xvec8, xvec8;
  1188. LDH_DX 1*SIZE(C1, ldc, 1), xvec8, xvec8;
  1189. ADD_DX xvec14, xvec13, xvec13;
  1190. ADD_DX xvec12, xvec15, xvec15;
  1191. ADD_DX xvec10, xvec9, xvec9;
  1192. ADD_DX xvec8, xvec11, xvec11;
  1193. #endif
  1194. STL_DX xvec13, 0*SIZE(C0);
  1195. STH_DX xvec13, 1*SIZE(C0);
  1196. STL_DX xvec15, 0*SIZE(C0, ldc, 1);
  1197. STH_DX xvec15, 1*SIZE(C0, ldc, 1);
  1198. STL_DX xvec9, 0*SIZE(C1);
  1199. STH_DX xvec9, 1*SIZE(C1);
  1200. STL_DX xvec11, 0*SIZE(C1, ldc, 1);
  1201. STH_DX xvec11, 1*SIZE(C1, ldc, 1);
  1202. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  1203. MOVQ bk, %rax;
  1204. SUBQ kkk, %rax;
  1205. LEAQ (,%rax, SIZE), %rax;
  1206. LEAQ (ptrba, %rax, 2), ptrba;
  1207. LEAQ (ptrbb, %rax, 4), ptrbb;
  1208. #endif
  1209. #if defined(TRMMKERNEL) && defined(LEFT)
  1210. ADDQ $2, kk
  1211. #endif
  1212. ADDQ $2*SIZE, C0;
  1213. ADDQ $2*SIZE, C1;
  1214. .L9_loopE:;
  1215. TEST $1, bm
  1216. JLE .L13_loopE;
  1217. ALIGN_5
  1218. .L13_bodyB:;
  1219. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  1220. MOVQ bb, ptrbb;
  1221. #else
  1222. MOVQ bb, ptrbb;
  1223. MOVQ kk, %rax;
  1224. LEAQ (,%rax, SIZE), %rax;
  1225. ADDQ %rax, ptrba;
  1226. LEAQ (ptrbb, %rax, 4), ptrbb;
  1227. #endif
  1228. //#### Initial Results Register ####
  1229. XOR_DY yvec15, yvec15, yvec15;
  1230. #ifndef TRMMKERNEL
  1231. MOVQ bk, k;
  1232. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  1233. MOVQ bk, %rax;
  1234. SUBQ kk, %rax;
  1235. MOVQ %rax, kkk;
  1236. #else
  1237. MOVQ kk, %rax;
  1238. #ifdef LEFT
  1239. ADDQ $1, %rax;
  1240. #else
  1241. ADDQ $4, %rax;
  1242. #endif
  1243. MOVQ %rax, kkk;
  1244. #endif
  1245. SARQ $2, k;
  1246. JLE .L14_loopE;
  1247. ALIGN_5
  1248. .L14_bodyB:;
  1249. BROAD_DY 0*SIZE(ptrba), yvec0;
  1250. LD_DY 0*SIZE(ptrbb), yvec2;
  1251. MUL_DY yvec0, yvec2, yvec6;
  1252. ADD_DY yvec15, yvec6, yvec15;
  1253. BROAD_DY 1*SIZE(ptrba), yvec1;
  1254. LD_DY 4*SIZE(ptrbb), yvec3;
  1255. MUL_DY yvec1, yvec3, yvec7;
  1256. ADD_DY yvec15, yvec7, yvec15;
  1257. BROAD_DY 2*SIZE(ptrba), yvec0;
  1258. LD_DY 8*SIZE(ptrbb), yvec2;
  1259. MUL_DY yvec0, yvec2, yvec6;
  1260. ADD_DY yvec15, yvec6, yvec15;
  1261. BROAD_DY 3*SIZE(ptrba), yvec1;
  1262. LD_DY 12*SIZE(ptrbb), yvec3;
  1263. MUL_DY yvec1, yvec3, yvec7;
  1264. ADD_DY yvec15, yvec7, yvec15;
  1265. ADDQ $4*SIZE, ptrba;
  1266. ADDQ $16*SIZE, ptrbb;
  1267. DECQ k;
  1268. JG .L14_bodyB;
  1269. ALIGN_5
  1270. .L14_loopE:
  1271. #ifndef TRMMKERNEL
  1272. TEST $2, bk;
  1273. #else
  1274. MOVQ kkk, %rax;
  1275. TEST $2, %rax;
  1276. #endif
  1277. JLE .L15_loopE;
  1278. ALIGN_5
  1279. .L15_bodyB:
  1280. BROAD_DY 0*SIZE(ptrba), yvec0;
  1281. LD_DY 0*SIZE(ptrbb), yvec2;
  1282. MUL_DY yvec0, yvec2, yvec6;
  1283. ADD_DY yvec15, yvec6, yvec15;
  1284. BROAD_DY 1*SIZE(ptrba), yvec1;
  1285. LD_DY 4*SIZE(ptrbb), yvec3;
  1286. MUL_DY yvec1, yvec3, yvec7;
  1287. ADD_DY yvec15, yvec7, yvec15;
  1288. ADDQ $2*SIZE, ptrba;
  1289. ADDQ $8*SIZE, ptrbb;
  1290. .L15_loopE:;
  1291. #ifndef TRMMKERNEL
  1292. TEST $1, bk;
  1293. #else
  1294. MOVQ kkk, %rax;
  1295. TEST $1, %rax;
  1296. #endif
  1297. JLE .L16_loopE;
  1298. ALIGN_5
  1299. .L16_bodyB:;
  1300. BROAD_DY 0*SIZE(ptrba), yvec0;
  1301. LD_DY 0*SIZE(ptrbb), yvec2;
  1302. MUL_DY yvec0, yvec2, yvec6;
  1303. ADD_DY yvec15, yvec6, yvec15;
  1304. ADDQ $1*SIZE, ptrba;
  1305. ADDQ $4*SIZE, ptrbb;
  1306. .L16_loopE:
  1307. //#### Load Alpha ####
  1308. BROAD_DY MEMALPHA, yvec7;
  1309. //#### Multiply Alpha ####
  1310. MUL_DY yvec15, yvec7, yvec15;
  1311. //#### Writing Back ####
  1312. EXTRA_DY $1, yvec15, xvec7;
  1313. #ifndef TRMMKERNEL
  1314. LDL_DX 0*SIZE(C0), xvec0, xvec0;
  1315. LDH_DX 0*SIZE(C0, ldc, 1), xvec0, xvec0;
  1316. LDL_DX 0*SIZE(C1), xvec1, xvec1;
  1317. LDH_DX 0*SIZE(C1, ldc, 1), xvec1, xvec1;
  1318. ADD_DX xvec0, xvec15, xvec15;
  1319. ADD_DX xvec1, xvec7, xvec7;
  1320. #endif
  1321. STL_DX xvec15, 0*SIZE(C0);
  1322. STH_DX xvec15, 0*SIZE(C0, ldc, 1);
  1323. STL_DX xvec7, 0*SIZE(C1);
  1324. STH_DX xvec7, 0*SIZE(C1, ldc, 1);
  1325. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  1326. MOVQ bk, %rax;
  1327. SUBQ kkk, %rax;
  1328. LEAQ (,%rax, SIZE), %rax;
  1329. ADDQ %rax, ptrba;
  1330. LEAQ (ptrbb, %rax, 4), ptrbb;
  1331. #endif
  1332. #if defined(TRMMKERNEL)&&defined(LEFT)
  1333. ADDQ $1, kk
  1334. #endif
  1335. ADDQ $1*SIZE, C0
  1336. ADDQ $1*SIZE, C1
  1337. .L13_loopE:;
  1338. #if defined(TRMMKERNEL)&&!defined(LEFT)
  1339. ADDQ $4, kk
  1340. #endif
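// Advance to the next 4-column panel: bb moves past bk*4 packed doubles
// (bk << 5 bytes) and C moves 4 columns (ldc is already scaled to bytes).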
  1341. MOVQ bk,k;
  1342. SALQ $5,k;
  1343. ADDQ k,bb;
  1344. LEAQ (C,ldc,4),C;
  1345. .L0_bodyE:;
  1346. DECQ j;
  1347. JG .L0_bodyB;
  1348. ALIGN_5;
  1349. .L0_loopE:;
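// N tail: handle a remaining pair of columns starting at .L20 (bn & 2),
// again blocked over M starting with 8-row tiles (.L21).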
  1350. TEST $2, bn;
  1351. JLE .L20_loopE;
  1352. ALIGN_5;
  1353. .L20_loopB:;
  1354. #if defined(TRMMKERNEL) && defined(LEFT)
  1355. MOVQ OFFSET, %rax;
  1356. MOVQ %rax, kk
  1357. #endif
  1358. MOVQ C, C0;
  1359. LEAQ (C, ldc, 1), C1;
  1360. MOVQ ba, ptrba;
  1361. MOVQ bm, i;
  1362. SARQ $3, i; # Rm = 8
  1363. JLE .L21_loopE;
  1364. ALIGN_5;
  1365. .L21_bodyB:;
  1366. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  1367. MOVQ bb, ptrbb;
  1368. #else
  1369. MOVQ bb, ptrbb;
  1370. MOVQ kk, %rax;
  1371. LEAQ (, %rax, SIZE), %rax;
  1372. LEAQ (ptrba, %rax, 8), ptrba;
  1373. LEAQ (ptrbb, %rax, 2), ptrbb;
  1374. #endif
  1375. //#### Initial Results Register ####
  1376. XOR_DY yvec15, yvec15, yvec15;
  1377. XOR_DY yvec14, yvec14, yvec14;
  1378. XOR_DY yvec13, yvec13, yvec13;
  1379. XOR_DY yvec12, yvec12, yvec12;
  1380. XOR_DY yvec11, yvec11, yvec11;
  1381. XOR_DY yvec10, yvec10, yvec10;
  1382. XOR_DY yvec9, yvec9, yvec9;
  1383. XOR_DY yvec8, yvec8, yvec8;
  1384. #ifndef TRMMKERNEL
  1385. MOVQ bk, k;
  1386. #elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  1387. MOVQ bk, %rax;
  1388. SUBQ kk, %rax;
  1389. MOVQ %rax, kkk;
  1390. #else
  1391. MOVQ kk, %rax;
  1392. #ifdef LEFT
  1393. ADDQ $8, %rax;
  1394. #else
  1395. ADDQ $2, %rax;
  1396. #endif
  1397. MOVQ %rax, kkk;
  1398. #endif
  1399. SARQ $2, k;
  1400. JLE .L211_loopE;
  1401. ALIGN_5;
  1402. .L211_bodyB:
  1403. # Computing kernel
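// NOTE (added annotation): 8x2 microkernel, unrolled 4x over k. Each LD_DX from ptrbb
// brings in the two B values for one k; SHUF_DX $0x4e swaps that pair so both C
// columns also get their cross terms, and products accumulate in xvec8-xvec15.
// Rough scalar reference of what one k-step computes (sketch, not part of the source):
//   for (i = 0; i < 8; i++) for (j = 0; j < 2; j++)
//       c[i][j] += a[k*8 + i] * b[k*2 + j];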
  1404. //#### Unroll time 1 ####
  1405. LD_DX 0*SIZE(ptrba), xvec0;
  1406. LD_DX 0*SIZE(ptrbb), xvec4;
  1407. MOV_DX xvec4, xvec5;
  1408. MUL_DX xvec0, xvec4, xvec4;
  1409. ADD_DX xvec4, xvec15, xvec15;
  1410. LD_DX 2*SIZE(ptrba), xvec1;
  1411. MOV_DX xvec5, xvec6;
  1412. MUL_DX xvec1, xvec5, xvec5;
  1413. ADD_DX xvec5, xvec14, xvec14;
  1414. LD_DX 4*SIZE(ptrba), xvec2;
  1415. MOV_DX xvec6, xvec7;
  1416. MUL_DX xvec2, xvec6, xvec6;
  1417. ADD_DX xvec6, xvec13, xvec13;
  1418. LD_DX 6*SIZE(ptrba), xvec3;
  1419. SHUF_DX $0x4e, xvec7, xvec4;
  1420. MUL_DX xvec3, xvec7, xvec7;
  1421. ADD_DX xvec7, xvec12, xvec12;
  1422. MOV_DX xvec4, xvec5;
  1423. MUL_DX xvec0, xvec4, xvec4;
  1424. ADD_DX xvec4, xvec11, xvec11;
  1425. MOV_DX xvec5, xvec6;
  1426. MUL_DX xvec1, xvec5, xvec5;
  1427. ADD_DX xvec5, xvec10, xvec10;
  1428. MOV_DX xvec6, xvec7;
  1429. MUL_DX xvec2, xvec6, xvec6;
  1430. ADD_DX xvec6, xvec9, xvec9;
  1431. MUL_DX xvec3, xvec7, xvec7;
  1432. ADD_DX xvec7, xvec8, xvec8;
  1433. //#### Unroll time 2 ####
  1434. LD_DX 8*SIZE(ptrba), xvec0;
  1435. LD_DX 2*SIZE(ptrbb), xvec4;
  1436. MOV_DX xvec4, xvec5;
  1437. MUL_DX xvec0, xvec4, xvec4;
  1438. ADD_DX xvec4, xvec15, xvec15;
  1439. LD_DX 10*SIZE(ptrba), xvec1;
  1440. MOV_DX xvec5, xvec6;
  1441. MUL_DX xvec1, xvec5, xvec5;
  1442. ADD_DX xvec5, xvec14, xvec14;
  1443. LD_DX 12*SIZE(ptrba), xvec2;
  1444. MOV_DX xvec6, xvec7;
  1445. MUL_DX xvec2, xvec6, xvec6;
  1446. ADD_DX xvec6, xvec13, xvec13;
  1447. LD_DX 14*SIZE(ptrba), xvec3;
  1448. SHUF_DX $0x4e, xvec7, xvec4;
  1449. MUL_DX xvec3, xvec7, xvec7;
  1450. ADD_DX xvec7, xvec12, xvec12;
  1451. MOV_DX xvec4, xvec5;
  1452. MUL_DX xvec0, xvec4, xvec4;
  1453. ADD_DX xvec4, xvec11, xvec11;
  1454. MOV_DX xvec5, xvec6;
  1455. MUL_DX xvec1, xvec5, xvec5;
  1456. ADD_DX xvec5, xvec10, xvec10;
  1457. MOV_DX xvec6, xvec7;
  1458. MUL_DX xvec2, xvec6, xvec6;
  1459. ADD_DX xvec6, xvec9, xvec9;
  1460. MUL_DX xvec3, xvec7, xvec7;
  1461. ADD_DX xvec7, xvec8, xvec8;
  1462. //#### Unroll time 3 ####
  1463. LD_DX 16*SIZE(ptrba), xvec0;
  1464. LD_DX 4*SIZE(ptrbb), xvec4;
  1465. MOV_DX xvec4, xvec5;
  1466. MUL_DX xvec0, xvec4, xvec4;
  1467. ADD_DX xvec4, xvec15, xvec15;
  1468. LD_DX 18*SIZE(ptrba), xvec1;
  1469. MOV_DX xvec5, xvec6;
  1470. MUL_DX xvec1, xvec5, xvec5;
  1471. ADD_DX xvec5, xvec14, xvec14;
  1472. LD_DX 20*SIZE(ptrba), xvec2;
  1473. MOV_DX xvec6, xvec7;
  1474. MUL_DX xvec2, xvec6, xvec6;
  1475. ADD_DX xvec6, xvec13, xvec13;
  1476. LD_DX 22*SIZE(ptrba), xvec3;
  1477. SHUF_DX $0x4e, xvec7, xvec4;
  1478. MUL_DX xvec3, xvec7, xvec7;
  1479. ADD_DX xvec7, xvec12, xvec12;
  1480. MOV_DX xvec4, xvec5;
  1481. MUL_DX xvec0, xvec4, xvec4;
  1482. ADD_DX xvec4, xvec11, xvec11;
  1483. MOV_DX xvec5, xvec6;
  1484. MUL_DX xvec1, xvec5, xvec5;
  1485. ADD_DX xvec5, xvec10, xvec10;
  1486. MOV_DX xvec6, xvec7;
  1487. MUL_DX xvec2, xvec6, xvec6;
  1488. ADD_DX xvec6, xvec9, xvec9;
  1489. MUL_DX xvec3, xvec7, xvec7;
  1490. ADD_DX xvec7, xvec8, xvec8;
  1491. //#### Unroll time 4 ####
  1492. LD_DX 24*SIZE(ptrba), xvec0;
  1493. LD_DX 6*SIZE(ptrbb), xvec4;
  1494. MOV_DX xvec4, xvec5;
  1495. MUL_DX xvec0, xvec4, xvec4;
  1496. ADD_DX xvec4, xvec15, xvec15;
  1497. ADDQ $8*SIZE, ptrbb;
  1498. LD_DX 26*SIZE(ptrba), xvec1;
  1499. MOV_DX xvec5, xvec6;
  1500. MUL_DX xvec1, xvec5, xvec5;
  1501. ADD_DX xvec5, xvec14, xvec14;
  1502. LD_DX 28*SIZE(ptrba), xvec2;
  1503. MOV_DX xvec6, xvec7;
  1504. MUL_DX xvec2, xvec6, xvec6;
  1505. ADD_DX xvec6, xvec13, xvec13;
  1506. LD_DX 30*SIZE(ptrba), xvec3;
  1507. SHUF_DX $0x4e, xvec7, xvec4;
  1508. MUL_DX xvec3, xvec7, xvec7;
  1509. ADD_DX xvec7, xvec12, xvec12;
  1510. ADDQ $32*SIZE, ptrba;
  1511. MOV_DX xvec4, xvec5;
  1512. MUL_DX xvec0, xvec4, xvec4;
  1513. ADD_DX xvec4, xvec11, xvec11;
  1514. MOV_DX xvec5, xvec6;
  1515. MUL_DX xvec1, xvec5, xvec5;
  1516. ADD_DX xvec5, xvec10, xvec10;
  1517. MOV_DX xvec6, xvec7;
  1518. MUL_DX xvec2, xvec6, xvec6;
  1519. ADD_DX xvec6, xvec9, xvec9;
  1520. MUL_DX xvec3, xvec7, xvec7;
  1521. ADD_DX xvec7, xvec8, xvec8;
  1522. DECQ k;
  1523. JG .L211_bodyB;
  1524. ALIGN_5
  1525. .L211_loopE:
  1526. #ifndef TRMMKERNEL
  1527. TEST $2, bk;
  1528. #else
  1529. MOVQ kkk, %rax;
  1530. TEST $2, %rax;
  1531. #endif
  1532. JLE .L212_loopE;
  1533. ALIGN_5;
  1534. .L212_bodyB:
  1535. # Computing kernel
  1536. //#### Unroll time 1 ####
  1537. LD_DX 0*SIZE(ptrba), xvec0;
  1538. LD_DX 0*SIZE(ptrbb), xvec4;
  1539. MOV_DX xvec4, xvec5;
  1540. MUL_DX xvec0, xvec4, xvec4;
  1541. ADD_DX xvec4, xvec15, xvec15;
  1542. LD_DX 2*SIZE(ptrba), xvec1;
  1543. MOV_DX xvec5, xvec6;
  1544. MUL_DX xvec1, xvec5, xvec5;
  1545. ADD_DX xvec5, xvec14, xvec14;
  1546. LD_DX 4*SIZE(ptrba), xvec2;
  1547. MOV_DX xvec6, xvec7;
  1548. MUL_DX xvec2, xvec6, xvec6;
  1549. ADD_DX xvec6, xvec13, xvec13;
  1550. LD_DX 6*SIZE(ptrba), xvec3;
  1551. SHUF_DX $0x4e, xvec7, xvec4;
  1552. MUL_DX xvec3, xvec7, xvec7;
  1553. ADD_DX xvec7, xvec12, xvec12;
  1554. MOV_DX xvec4, xvec5;
  1555. MUL_DX xvec0, xvec4, xvec4;
  1556. ADD_DX xvec4, xvec11, xvec11;
  1557. MOV_DX xvec5, xvec6;
  1558. MUL_DX xvec1, xvec5, xvec5;
  1559. ADD_DX xvec5, xvec10, xvec10;
  1560. MOV_DX xvec6, xvec7;
  1561. MUL_DX xvec2, xvec6, xvec6;
  1562. ADD_DX xvec6, xvec9, xvec9;
  1563. MUL_DX xvec3, xvec7, xvec7;
  1564. ADD_DX xvec7, xvec8, xvec8;
  1565. //#### Unroll time 2 ####
  1566. LD_DX 8*SIZE(ptrba), xvec0;
  1567. LD_DX 2*SIZE(ptrbb), xvec4;
  1568. MOV_DX xvec4, xvec5;
  1569. MUL_DX xvec0, xvec4, xvec4;
  1570. ADD_DX xvec4, xvec15, xvec15;
  1571. ADDQ $4*SIZE, ptrbb;
  1572. LD_DX 10*SIZE(ptrba), xvec1;
  1573. MOV_DX xvec5, xvec6;
  1574. MUL_DX xvec1, xvec5, xvec5;
  1575. ADD_DX xvec5, xvec14, xvec14;
  1576. LD_DX 12*SIZE(ptrba), xvec2;
  1577. MOV_DX xvec6, xvec7;
  1578. MUL_DX xvec2, xvec6, xvec6;
  1579. ADD_DX xvec6, xvec13, xvec13;
  1580. LD_DX 14*SIZE(ptrba), xvec3;
  1581. SHUF_DX $0x4e, xvec7, xvec4;
  1582. MUL_DX xvec3, xvec7, xvec7;
  1583. ADD_DX xvec7, xvec12, xvec12;
  1584. ADDQ $16*SIZE, ptrba;
  1585. MOV_DX xvec4, xvec5;
  1586. MUL_DX xvec0, xvec4, xvec4;
  1587. ADD_DX xvec4, xvec11, xvec11;
  1588. MOV_DX xvec5, xvec6;
  1589. MUL_DX xvec1, xvec5, xvec5;
  1590. ADD_DX xvec5, xvec10, xvec10;
  1591. MOV_DX xvec6, xvec7;
  1592. MUL_DX xvec2, xvec6, xvec6;
  1593. ADD_DX xvec6, xvec9, xvec9;
  1594. MUL_DX xvec3, xvec7, xvec7;
  1595. ADD_DX xvec7, xvec8, xvec8;
  1596. .L212_loopE:
  1597. #ifndef TRMMKERNEL
  1598. TEST $1, bk;
  1599. #else
  1600. MOVQ kkk, %rax;
  1601. TEST $1, %rax;
  1602. #endif
  1603. JLE .L213_loopE;
  1604. ALIGN_5
  1605. .L213_bodyB:
  1606. //#### Unroll time 1 ####
  1607. LD_DX 0*SIZE(ptrba), xvec0;
  1608. LD_DX 0*SIZE(ptrbb), xvec4;
  1609. MOV_DX xvec4, xvec5;
  1610. MUL_DX xvec0, xvec4, xvec4;
  1611. ADD_DX xvec4, xvec15, xvec15;
  1612. ADDQ $2*SIZE, ptrbb;
  1613. LD_DX 2*SIZE(ptrba), xvec1;
  1614. MOV_DX xvec5, xvec6;
  1615. MUL_DX xvec1, xvec5, xvec5;
  1616. ADD_DX xvec5, xvec14, xvec14;
  1617. LD_DX 4*SIZE(ptrba), xvec2;
  1618. MOV_DX xvec6, xvec7;
  1619. MUL_DX xvec2, xvec6, xvec6;
  1620. ADD_DX xvec6, xvec13, xvec13;
  1621. LD_DX 6*SIZE(ptrba), xvec3;
  1622. SHUF_DX $0x4e, xvec7, xvec4;
  1623. MUL_DX xvec3, xvec7, xvec7;
  1624. ADD_DX xvec7, xvec12, xvec12;
  1625. ADDQ $8*SIZE, ptrba;
  1626. MOV_DX xvec4, xvec5;
  1627. MUL_DX xvec0, xvec4, xvec4;
  1628. ADD_DX xvec4, xvec11, xvec11;
  1629. MOV_DX xvec5, xvec6;
  1630. MUL_DX xvec1, xvec5, xvec5;
  1631. ADD_DX xvec5, xvec10, xvec10;
  1632. MOV_DX xvec6, xvec7;
  1633. MUL_DX xvec2, xvec6, xvec6;
  1634. ADD_DX xvec6, xvec9, xvec9;
  1635. MUL_DX xvec3, xvec7, xvec7;
  1636. ADD_DX xvec7, xvec8, xvec8;
  1637. .L213_loopE:
  1638. //#### Multiply Alpha ####
  1639. BROAD_DX MEMALPHA, xvec7;
  1640. MUL_DX xvec7, xvec15, xvec15;
  1641. MUL_DX xvec7, xvec14, xvec14;
  1642. MUL_DX xvec7, xvec13, xvec13;
  1643. MUL_DX xvec7, xvec12, xvec12;
  1644. MUL_DX xvec7, xvec11, xvec11;
  1645. MUL_DX xvec7, xvec10, xvec10;
  1646. MUL_DX xvec7, xvec9, xvec9;
  1647. MUL_DX xvec7, xvec8, xvec8;
  1648. //#### Reverse ####
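// NOTE (added annotation): because of the $0x4e shuffles above, each accumulator pair
// holds its two C columns half-swapped; the MOV_DX/REVS_DX sequence below merges the
// halves back so that xvec11/10/9/8 form the C0 row and xvec15/14/13/12 the C1 row.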
  1649. MOV_DX xvec15, xvec6;
  1650. REVS_DX xvec11, xvec15, xvec15;
  1651. REVS_DX xvec6, xvec11, xvec11;
  1652. MOV_DX xvec14, xvec6;
  1653. REVS_DX xvec10, xvec14, xvec14;
  1654. REVS_DX xvec6, xvec10, xvec10;
  1655. MOV_DX xvec13, xvec6;
  1656. REVS_DX xvec9, xvec13, xvec13;
  1657. REVS_DX xvec6, xvec9, xvec9;
  1658. MOV_DX xvec12, xvec6;
  1659. REVS_DX xvec8, xvec12, xvec12;
  1660. REVS_DX xvec6, xvec8, xvec8;
  1661. //#### Testing Alignment ####
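// NOTE (added annotation): C0 and ldc are OR'ed together and tested against 15; if
// either is not 16-byte aligned the code branches to .L213_loopEx, which uses the
// LDL_DX/LDH_DX/STL_DX/STH_DX half-register accesses instead of the aligned ST_DX path.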
  1662. MOVQ C0, %rax;
  1663. OR ldc, %rax;
  1664. TEST $15, %rax;
  1665. JNE .L213_loopEx;
  1666. ALIGN_5
  1667. //#### Writing Back ####
  1668. #ifndef TRMMKERNEL
  1669. ADD_DX 0*SIZE(C0), xvec11, xvec11;
  1670. ADD_DX 2*SIZE(C0), xvec10, xvec10;
  1671. ADD_DX 4*SIZE(C0), xvec9, xvec9;
  1672. ADD_DX 6*SIZE(C0), xvec8, xvec8;
  1673. ADD_DX 0*SIZE(C1), xvec15, xvec15;
  1674. ADD_DX 2*SIZE(C1), xvec14, xvec14;
  1675. ADD_DX 4*SIZE(C1), xvec13, xvec13;
  1676. ADD_DX 6*SIZE(C1), xvec12, xvec12;
  1677. #endif
  1678. ST_DX xvec11, 0*SIZE(C0);
  1679. ST_DX xvec10, 2*SIZE(C0);
  1680. ST_DX xvec9, 4*SIZE(C0);
  1681. ST_DX xvec8, 6*SIZE(C0);
  1682. ST_DX xvec15, 0*SIZE(C1);
  1683. ST_DX xvec14, 2*SIZE(C1);
  1684. ST_DX xvec13, 4*SIZE(C1);
  1685. ST_DX xvec12, 6*SIZE(C1);
  1686. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  1687. MOVQ bk, %rax;
  1688. SUBQ kkk, %rax;
  1689. LEAQ (,%rax, SIZE), %rax;
  1690. LEAQ (ptrba, %rax, 8), ptrba;
  1691. LEAQ (ptrbb, %rax, 2), ptrbb;
  1692. #endif
  1693. #if defined(TRMMKERNEL) && defined(LEFT)
  1694. ADDQ $8, kk
  1695. #endif
  1696. ADDQ $8*SIZE, C0;
  1697. ADDQ $8*SIZE, C1;
  1698. DECQ i;
  1699. JG .L21_bodyB;
  1700. JMP .L21_loopE;
  1701. ALIGN_5
  1702. .L213_loopEx:;
  1703. #ifndef TRMMKERNEL
  1704. LDL_DX 0*SIZE(C0), xvec0, xvec0;
  1705. LDH_DX 1*SIZE(C0), xvec0, xvec0;
  1706. LDL_DX 2*SIZE(C0), xvec1, xvec1;
  1707. LDH_DX 3*SIZE(C0), xvec1, xvec1;
  1708. LDL_DX 4*SIZE(C0), xvec2, xvec2;
  1709. LDH_DX 5*SIZE(C0), xvec2, xvec2;
  1710. LDL_DX 6*SIZE(C0), xvec3, xvec3;
  1711. LDH_DX 7*SIZE(C0), xvec3, xvec3;
  1712. ADD_DX xvec0, xvec11, xvec11;
  1713. ADD_DX xvec1, xvec10, xvec10;
  1714. ADD_DX xvec2, xvec9, xvec9;
  1715. ADD_DX xvec3, xvec8, xvec8;
  1716. #endif
  1717. STL_DX xvec11, 0*SIZE(C0);
  1718. STH_DX xvec11, 1*SIZE(C0);
  1719. STL_DX xvec10, 2*SIZE(C0);
  1720. STH_DX xvec10, 3*SIZE(C0);
  1721. STL_DX xvec9, 4*SIZE(C0);
  1722. STH_DX xvec9, 5*SIZE(C0);
  1723. STL_DX xvec8, 6*SIZE(C0);
  1724. STH_DX xvec8, 7*SIZE(C0);
  1725. #ifndef TRMMKERNEL
  1726. LDL_DX 0*SIZE(C1), xvec4, xvec4;
  1727. LDH_DX 1*SIZE(C1), xvec4, xvec4;
  1728. LDL_DX 2*SIZE(C1), xvec5, xvec5;
  1729. LDH_DX 3*SIZE(C1), xvec5, xvec5;
  1730. LDL_DX 4*SIZE(C1), xvec6, xvec6;
  1731. LDH_DX 5*SIZE(C1), xvec6, xvec6;
  1732. LDL_DX 6*SIZE(C1), xvec7, xvec7;
  1733. LDH_DX 7*SIZE(C1), xvec7, xvec7;
  1734. ADD_DX xvec4, xvec15, xvec15;
  1735. ADD_DX xvec5, xvec14, xvec14;
  1736. ADD_DX xvec6, xvec13, xvec13;
  1737. ADD_DX xvec7, xvec12, xvec12;
  1738. #endif
  1739. STL_DX xvec15, 0*SIZE(C1);
  1740. STH_DX xvec15, 1*SIZE(C1);
  1741. STL_DX xvec14, 2*SIZE(C1);
  1742. STH_DX xvec14, 3*SIZE(C1);
  1743. STL_DX xvec13, 4*SIZE(C1);
  1744. STH_DX xvec13, 5*SIZE(C1);
  1745. STL_DX xvec12, 6*SIZE(C1);
  1746. STH_DX xvec12, 7*SIZE(C1);
  1747. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  1748. MOVQ bk, %rax;
  1749. SUBQ kkk, %rax;
  1750. LEAQ (,%rax, SIZE), %rax;
  1751. LEAQ (ptrba, %rax, 8), ptrba;
  1752. LEAQ (ptrbb, %rax, 2), ptrbb;
  1753. #endif
  1754. #if defined(TRMMKERNEL) && defined(LEFT)
  1755. ADDQ $8, kk
  1756. #endif
  1757. ADDQ $8*SIZE, C0;
  1758. ADDQ $8*SIZE, C1;
  1759. DECQ i;
  1760. JG .L21_bodyB;
  1761. .L21_loopE:;
  1762. TEST $4, bm; # Rm = 4
  1763. JLE .L22_loopE;
  1764. ALIGN_5;
  1765. .L22_bodyB:;
  1766. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  1767. MOVQ bb, ptrbb;
  1768. #else
  1769. MOVQ bb, ptrbb;
  1770. MOVQ kk, %rax;
  1771. LEAQ (,%rax, SIZE), %rax;
  1772. LEAQ (ptrba, %rax, 4), ptrba;
  1773. LEAQ (ptrbb, %rax, 2), ptrbb;
  1774. #endif
  1775. //#### Initial Results Register ####
  1776. XOR_DY yvec15, yvec15, yvec15;
  1777. XOR_DY yvec14, yvec14, yvec14;
  1778. XOR_DY yvec11, yvec11, yvec11;
  1779. XOR_DY yvec10, yvec10, yvec10;
  1780. #ifndef TRMMKERNEL
  1781. MOVQ bk, k;
  1782. #elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  1783. MOVQ bk, %rax;
  1784. SUBQ kk, %rax;
  1785. MOVQ %rax, kkk;
  1786. #else
  1787. MOVQ kk, %rax;
  1788. #ifdef LEFT
  1789. ADDQ $4, %rax;
  1790. #else
  1791. ADDQ $2, %rax;
  1792. #endif
  1793. MOVQ %rax, kkk;
  1794. #endif
  1795. SARQ $2, k;
  1796. JLE .L221_loopE;
  1797. ALIGN_5
  1798. .L221_bodyB:;
  1799. # Computing kernel
  1800. //#### Unroll time 1 ####
  1801. LD_DX 0*SIZE(ptrba), xvec0;
  1802. LD_DX 0*SIZE(ptrbb), xvec4;
  1803. MOV_DX xvec4, xvec5;
  1804. MUL_DX xvec0, xvec4, xvec4;
  1805. ADD_DX xvec4, xvec15, xvec15;
  1806. LD_DX 2*SIZE(ptrba), xvec1;
  1807. SHUF_DX $0x4e, xvec5, xvec4;
  1808. MUL_DX xvec1, xvec5, xvec5;
  1809. ADD_DX xvec5, xvec14, xvec14;
  1810. MOV_DX xvec4, xvec5;
  1811. MUL_DX xvec0, xvec4, xvec4;
  1812. ADD_DX xvec4, xvec11, xvec11;
  1813. MUL_DX xvec1, xvec5, xvec5;
  1814. ADD_DX xvec5, xvec10, xvec10;
  1815. //#### Unroll time 2 ####
  1816. LD_DX 4*SIZE(ptrba), xvec0;
  1817. LD_DX 2*SIZE(ptrbb), xvec4;
  1818. MOV_DX xvec4, xvec5;
  1819. MUL_DX xvec0, xvec4, xvec4;
  1820. ADD_DX xvec4, xvec15, xvec15;
  1821. LD_DX 6*SIZE(ptrba), xvec1;
  1822. SHUF_DX $0x4e, xvec5, xvec4;
  1823. MUL_DX xvec1, xvec5, xvec5;
  1824. ADD_DX xvec5, xvec14, xvec14;
  1825. MOV_DX xvec4, xvec5;
  1826. MUL_DX xvec0, xvec4, xvec4;
  1827. ADD_DX xvec4, xvec11, xvec11;
  1828. MUL_DX xvec1, xvec5, xvec5;
  1829. ADD_DX xvec5, xvec10, xvec10;
  1830. //#### Unroll time 3 ####
  1831. LD_DX 8*SIZE(ptrba), xvec0;
  1832. LD_DX 4*SIZE(ptrbb), xvec4;
  1833. MOV_DX xvec4, xvec5;
  1834. MUL_DX xvec0, xvec4, xvec4;
  1835. ADD_DX xvec4, xvec15, xvec15;
  1836. LD_DX 10*SIZE(ptrba), xvec1;
  1837. SHUF_DX $0x4e, xvec5, xvec4;
  1838. MUL_DX xvec1, xvec5, xvec5;
  1839. ADD_DX xvec5, xvec14, xvec14;
  1840. MOV_DX xvec4, xvec5;
  1841. MUL_DX xvec0, xvec4, xvec4;
  1842. ADD_DX xvec4, xvec11, xvec11;
  1843. MUL_DX xvec1, xvec5, xvec5;
  1844. ADD_DX xvec5, xvec10, xvec10;
  1845. //#### Unroll time 4 ####
  1846. LD_DX 12*SIZE(ptrba), xvec0;
  1847. LD_DX 6*SIZE(ptrbb), xvec4;
  1848. MOV_DX xvec4, xvec5;
  1849. MUL_DX xvec0, xvec4, xvec4;
  1850. ADD_DX xvec4, xvec15, xvec15;
  1851. ADDQ $8*SIZE, ptrbb;
  1852. LD_DX 14*SIZE(ptrba), xvec1;
  1853. SHUF_DX $0x4e, xvec5, xvec4;
  1854. MUL_DX xvec1, xvec5, xvec5;
  1855. ADD_DX xvec5, xvec14, xvec14;
  1856. ADDQ $16*SIZE, ptrba;
  1857. MOV_DX xvec4, xvec5;
  1858. MUL_DX xvec0, xvec4, xvec4;
  1859. ADD_DX xvec4, xvec11, xvec11;
  1860. MUL_DX xvec1, xvec5, xvec5;
  1861. ADD_DX xvec5, xvec10, xvec10;
  1862. DECQ k;
  1863. JG .L221_bodyB;
  1864. ALIGN_5
  1865. .L221_loopE:;
  1866. #ifndef TRMMKERNEL
  1867. TEST $2, bk;
  1868. #else
  1869. MOVQ kkk, %rax;
  1870. TEST $2, %rax;
  1871. #endif
  1872. JLE .L222_loopE;
  1873. ALIGN_5
  1874. .L222_bodyB:
  1875. //#### Unroll time 1 ####
  1876. LD_DX 0*SIZE(ptrba), xvec0;
  1877. LD_DX 0*SIZE(ptrbb), xvec4;
  1878. MOV_DX xvec4, xvec5;
  1879. MUL_DX xvec0, xvec4, xvec4;
  1880. ADD_DX xvec4, xvec15, xvec15;
  1881. LD_DX 2*SIZE(ptrba), xvec1;
  1882. SHUF_DX $0x4e, xvec5, xvec4;
  1883. MUL_DX xvec1, xvec5, xvec5;
  1884. ADD_DX xvec5, xvec14, xvec14;
  1885. MOV_DX xvec4, xvec5;
  1886. MUL_DX xvec0, xvec4, xvec4;
  1887. ADD_DX xvec4, xvec11, xvec11;
  1888. MUL_DX xvec1, xvec5, xvec5;
  1889. ADD_DX xvec5, xvec10, xvec10;
  1890. //#### Unroll time 2 ####
  1891. LD_DX 4*SIZE(ptrba), xvec0;
  1892. LD_DX 2*SIZE(ptrbb), xvec4;
  1893. MOV_DX xvec4, xvec5;
  1894. MUL_DX xvec0, xvec4, xvec4;
  1895. ADD_DX xvec4, xvec15, xvec15;
  1896. ADDQ $4*SIZE, ptrbb;
  1897. LD_DX 6*SIZE(ptrba), xvec1;
  1898. SHUF_DX $0x4e, xvec5, xvec4;
  1899. MUL_DX xvec1, xvec5, xvec5;
  1900. ADD_DX xvec5, xvec14, xvec14;
  1901. ADDQ $8*SIZE, ptrba;
  1902. MOV_DX xvec4, xvec5;
  1903. MUL_DX xvec0, xvec4, xvec4;
  1904. ADD_DX xvec4, xvec11, xvec11;
  1905. MUL_DX xvec1, xvec5, xvec5;
  1906. ADD_DX xvec5, xvec10, xvec10;
  1907. .L222_loopE:
  1908. #ifndef TRMMKERNEL
  1909. TEST $1, bk
  1910. #else
  1911. MOVQ kkk, %rax;
  1912. TEST $1, %rax;
  1913. #endif
  1914. JLE .L223_loopE;
  1915. ALIGN_5
  1916. .L223_bodyB:
  1917. //#### Unroll time 1 ####
  1918. LD_DX 0*SIZE(ptrba), xvec0;
  1919. LD_DX 0*SIZE(ptrbb), xvec4;
  1920. MOV_DX xvec4, xvec5;
  1921. MUL_DX xvec0, xvec4, xvec4;
  1922. ADD_DX xvec4, xvec15, xvec15;
  1923. ADDQ $2*SIZE, ptrbb;
  1924. LD_DX 2*SIZE(ptrba), xvec1;
  1925. SHUF_DX $0x4e, xvec5, xvec4;
  1926. MUL_DX xvec1, xvec5, xvec5;
  1927. ADD_DX xvec5, xvec14, xvec14;
  1928. ADDQ $4*SIZE, ptrba;
  1929. MOV_DX xvec4, xvec5;
  1930. MUL_DX xvec0, xvec4, xvec4;
  1931. ADD_DX xvec4, xvec11, xvec11;
  1932. MUL_DX xvec1, xvec5, xvec5;
  1933. ADD_DX xvec5, xvec10, xvec10;
  1934. .L223_loopE:
  1935. //#### Multiply Alpha ####
  1936. BROAD_DX MEMALPHA, xvec7;
  1937. MUL_DX xvec7, xvec15, xvec15;
  1938. MUL_DX xvec7, xvec14, xvec14;
  1939. MUL_DX xvec7, xvec11, xvec11;
  1940. MUL_DX xvec7, xvec10, xvec10;
  1941. //#### Reverse ####
  1942. MOV_DX xvec15, xvec6;
  1943. REVS_DX xvec11, xvec15, xvec15;
  1944. REVS_DX xvec6, xvec11, xvec11;
  1945. MOV_DX xvec14, xvec6;
  1946. REVS_DX xvec10, xvec14, xvec14;
  1947. REVS_DX xvec6, xvec10, xvec10;
  1948. //#### Testing Alignment ####
  1949. MOVQ C0, %rax;
  1950. OR ldc, %rax;
  1951. TEST $15, %rax;
  1952. JNE .L223_loopEx;
  1953. ALIGN_5
  1954. //#### Writing Back ####
  1955. #ifndef TRMMKERNEL
  1956. ADD_DX 0*SIZE(C0), xvec11, xvec11;
  1957. ADD_DX 2*SIZE(C0), xvec10, xvec10;
  1958. ADD_DX 0*SIZE(C1), xvec15, xvec15;
  1959. ADD_DX 2*SIZE(C1), xvec14, xvec14;
  1960. #endif
  1961. ST_DX xvec11, 0*SIZE(C0);
  1962. ST_DX xvec10, 2*SIZE(C0);
  1963. ST_DX xvec15, 0*SIZE(C1);
  1964. ST_DX xvec14, 2*SIZE(C1);
  1965. #if (defined(TRMMKERNEL)&& defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&& !defined(TRANSA))
  1966. MOVQ bk, %rax;
  1967. SUBQ kkk, %rax;
  1968. LEAQ (,%rax, SIZE), %rax;
  1969. LEAQ (ptrba, %rax, 4), ptrba;
  1970. LEAQ (ptrbb, %rax, 2), ptrbb;
  1971. #endif
  1972. #if defined(TRMMKERNEL) && defined(LEFT)
  1973. ADDQ $4, kk
  1974. #endif
  1975. ADDQ $4*SIZE, C0;
  1976. ADDQ $4*SIZE, C1;
  1977. JMP .L22_loopE;
  1978. ALIGN_5
  1979. .L223_loopEx:;
  1980. #ifndef TRMMKERNEL
  1981. LDL_DX 0*SIZE(C0), xvec0, xvec0;
  1982. LDH_DX 1*SIZE(C0), xvec0, xvec0;
  1983. LDL_DX 2*SIZE(C0), xvec1, xvec1;
  1984. LDH_DX 3*SIZE(C0), xvec1, xvec1;
  1985. ADD_DX xvec0, xvec11, xvec11;
  1986. ADD_DX xvec1, xvec10, xvec10;
  1987. #endif
  1988. STL_DX xvec11, 0*SIZE(C0);
  1989. STH_DX xvec11, 1*SIZE(C0);
  1990. STL_DX xvec10, 2*SIZE(C0);
  1991. STH_DX xvec10, 3*SIZE(C0);
  1992. #ifndef TRMMKERNEL
  1993. LDL_DX 0*SIZE(C1), xvec4, xvec4;
  1994. LDH_DX 1*SIZE(C1), xvec4, xvec4;
  1995. LDL_DX 2*SIZE(C1), xvec5, xvec5;
  1996. LDH_DX 3*SIZE(C1), xvec5, xvec5;
  1997. ADD_DX xvec4, xvec15, xvec15;
  1998. ADD_DX xvec5, xvec14, xvec14;
  1999. #endif
  2000. STL_DX xvec15, 0*SIZE(C1);
  2001. STH_DX xvec15, 1*SIZE(C1);
  2002. STL_DX xvec14, 2*SIZE(C1);
  2003. STH_DX xvec14, 3*SIZE(C1);
  2004. #if (defined(TRMMKERNEL)&& defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&& !defined(TRANSA))
  2005. MOVQ bk, %rax;
  2006. SUBQ kkk, %rax;
  2007. LEAQ (,%rax, SIZE), %rax;
  2008. LEAQ (ptrba, %rax, 4), ptrba;
  2009. LEAQ (ptrbb, %rax, 2), ptrbb;
  2010. #endif
  2011. #if defined(TRMMKERNEL) && defined(LEFT)
  2012. ADDQ $4, kk
  2013. #endif
  2014. ADDQ $4*SIZE, C0;
  2015. ADDQ $4*SIZE, C1;
  2016. .L22_loopE:;
  2017. TEST $2, bm; // Rm = 2
  2018. JLE .L23_loopE;
  2019. ALIGN_5;
  2020. .L23_bodyB:
  2021. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2022. MOVQ bb, ptrbb;
  2023. #else
  2024. MOVQ bb, ptrbb;
  2025. MOVQ kk, %rax;
  2026. LEAQ (,%rax, SIZE), %rax;
  2027. LEAQ (ptrba, %rax, 2), ptrba;
  2028. LEAQ (ptrbb, %rax, 2), ptrbb;
  2029. #endif
  2030. XOR_DY yvec15, yvec15, yvec15;
  2031. XOR_DY yvec11, yvec11, yvec11;
  2032. #ifndef TRMMKERNEL
  2033. MOVQ bk, k;
  2034. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  2035. MOVQ bk, %rax;
  2036. SUBQ kk, %rax;
  2037. MOVQ %rax, kkk;
  2038. #else
  2039. MOVQ kk, %rax;
  2040. #ifdef LEFT
  2041. ADDQ $2, %rax;
  2042. #else
  2043. ADDQ $2, %rax;
  2044. #endif
  2045. MOVQ %rax, kkk;
  2046. #endif
  2047. SARQ $2, k;
  2048. JLE .L231_loopE;
  2049. ALIGN_5
  2050. .L231_bodyB:
  2051. # Computing kernel
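// NOTE (added annotation): 2x2 microkernel; the same SHUF_DX $0x4e trick as above
// produces the swapped B pair, so xvec11 accumulates the C0 column and xvec15 the C1
// column before the reverse/writeback step.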
  2052. //#### Unroll time 1 ####
  2053. LD_DX 0*SIZE(ptrba), xvec0;
  2054. LD_DX 0*SIZE(ptrbb), xvec4;
  2055. SHUF_DX $0x4e, xvec4, xvec5;
  2056. MUL_DX xvec0, xvec4, xvec4;
  2057. ADD_DX xvec4, xvec15, xvec15;
  2058. MUL_DX xvec0, xvec5, xvec5;
  2059. ADD_DX xvec5, xvec11, xvec11;
  2060. //#### Unroll time 2 ####
  2061. LD_DX 2*SIZE(ptrba), xvec0;
  2062. LD_DX 2*SIZE(ptrbb), xvec4;
  2063. SHUF_DX $0x4e, xvec4, xvec5;
  2064. MUL_DX xvec0, xvec4, xvec4;
  2065. ADD_DX xvec4, xvec15, xvec15;
  2066. MUL_DX xvec0, xvec5, xvec5;
  2067. ADD_DX xvec5, xvec11, xvec11;
  2068. //#### Unroll time 3 ####
  2069. LD_DX 4*SIZE(ptrba), xvec0;
  2070. LD_DX 4*SIZE(ptrbb), xvec4;
  2071. SHUF_DX $0x4e, xvec4, xvec5;
  2072. MUL_DX xvec0, xvec4, xvec4;
  2073. ADD_DX xvec4, xvec15, xvec15;
  2074. MUL_DX xvec0, xvec5, xvec5;
  2075. ADD_DX xvec5, xvec11, xvec11;
  2076. //#### Unroll time 4 ####
  2077. LD_DX 6*SIZE(ptrba), xvec0;
  2078. LD_DX 6*SIZE(ptrbb), xvec4;
  2079. SHUF_DX $0x4e, xvec4, xvec5;
  2080. MUL_DX xvec0, xvec4, xvec4;
  2081. ADD_DX xvec4, xvec15, xvec15;
  2082. ADDQ $8*SIZE, ptrba;
  2083. MUL_DX xvec0, xvec5, xvec5;
  2084. ADD_DX xvec5, xvec11, xvec11;
  2085. ADDQ $8*SIZE, ptrbb;
  2086. DECQ k;
  2087. JG .L231_bodyB;
  2088. ALIGN_5
  2089. .L231_loopE:
  2090. #ifndef TRMMKERNEL
  2091. TEST $2, bk;
  2092. #else
  2093. MOVQ kkk, %rax;
  2094. TEST $2, %rax;
  2095. #endif
  2096. JLE .L232_loopE;
  2097. ALIGN_5
  2098. .L232_bodyB:
  2099. //#### Unroll time 1 ####
  2100. LD_DX 0*SIZE(ptrba), xvec0;
  2101. LD_DX 0*SIZE(ptrbb), xvec4;
  2102. SHUF_DX $0x4e, xvec4, xvec5;
  2103. MUL_DX xvec0, xvec4, xvec4;
  2104. ADD_DX xvec4, xvec15, xvec15;
  2105. MUL_DX xvec0, xvec5, xvec5;
  2106. ADD_DX xvec5, xvec11, xvec11;
  2107. //#### Unroll time 2 ####
  2108. LD_DX 2*SIZE(ptrba), xvec0;
  2109. LD_DX 2*SIZE(ptrbb), xvec4;
  2110. SHUF_DX $0x4e, xvec4, xvec5;
  2111. MUL_DX xvec0, xvec4, xvec4;
  2112. ADD_DX xvec4, xvec15, xvec15;
  2113. ADDQ $4*SIZE, ptrba;
  2114. MUL_DX xvec0, xvec5, xvec5;
  2115. ADD_DX xvec5, xvec11, xvec11;
  2116. ADDQ $4*SIZE, ptrbb;
  2117. .L232_loopE:
  2118. #ifndef TRMMKERNEL
  2119. TEST $1, bk;
  2120. #else
  2121. MOVQ kkk, %rax;
  2122. TEST $1, %rax;
  2123. #endif
  2124. JLE .L233_loopE;
  2125. ALIGN_5
  2126. .L233_bodyB:
  2127. //#### Unroll time 1 ####
  2128. LD_DX 0*SIZE(ptrba), xvec0;
  2129. LD_DX 0*SIZE(ptrbb), xvec4;
  2130. SHUF_DX $0x4e, xvec4, xvec5;
  2131. MUL_DX xvec0, xvec4, xvec4;
  2132. ADD_DX xvec4, xvec15, xvec15;
  2133. ADDQ $2*SIZE, ptrba;
  2134. MUL_DX xvec0, xvec5, xvec5;
  2135. ADD_DX xvec5, xvec11, xvec11;
  2136. ADDQ $2*SIZE, ptrbb;
  2137. .L233_loopE:
  2138. //#### Multiply Alpha ####
  2139. BROAD_DX MEMALPHA, xvec7;
  2140. MUL_DX xvec7, xvec15, xvec15;
  2141. MUL_DX xvec7, xvec11, xvec11;
  2142. //#### Reverse ####
  2143. MOV_DX xvec15, xvec6;
  2144. REVS_DX xvec11, xvec15, xvec15;
  2145. REVS_DX xvec6, xvec11, xvec11;
  2146. //#### Testing Alignment ####
  2147. MOVQ C0, %rax;
  2148. OR ldc, %rax;
  2149. TEST $15, %rax;
  2150. JNE .L233_loopEx;
  2151. ALIGN_5
  2152. //#### Writing Back ####
  2153. #ifndef TRMMKERNEL
  2154. ADD_DX 0*SIZE(C0), xvec11, xvec11;
  2155. ADD_DX 0*SIZE(C1), xvec15, xvec15;
  2156. #endif
  2157. ST_DX xvec11, 0*SIZE(C0);
  2158. ST_DX xvec15, 0*SIZE(C1);
  2159. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2160. MOVQ bk, %rax;
  2161. SUBQ kkk, %rax;
  2162. LEAQ (,%rax, SIZE), %rax;
  2163. LEAQ (ptrba, %rax, 2), ptrba;
  2164. LEAQ (ptrbb, %rax, 2), ptrbb;
  2165. #endif
  2166. #if defined(TRMMKERNEL) && defined(LEFT)
  2167. ADDQ $2, kk;
  2168. #endif
  2169. ADDQ $2*SIZE, C0;
  2170. ADDQ $2*SIZE, C1;
  2171. JMP .L23_loopE;
  2172. ALIGN_5
  2173. .L233_loopEx:;
  2174. #ifndef TRMMKERNEL
  2175. LDL_DX 0*SIZE(C0), xvec0, xvec0;
  2176. LDH_DX 1*SIZE(C0), xvec0, xvec0;
  2177. ADD_DX xvec0, xvec11, xvec11;
  2178. #endif
  2179. STL_DX xvec11, 0*SIZE(C0);
  2180. STH_DX xvec11, 1*SIZE(C0);
  2181. #ifndef TRMMKERNEL
  2182. LDL_DX 0*SIZE(C1), xvec4, xvec4;
  2183. LDH_DX 1*SIZE(C1), xvec4, xvec4;
  2184. ADD_DX xvec4, xvec15, xvec15;
  2185. #endif
  2186. STL_DX xvec15, 0*SIZE(C1);
  2187. STH_DX xvec15, 1*SIZE(C1);
  2188. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2189. MOVQ bk, %rax;
  2190. SUBQ kkk, %rax;
  2191. LEAQ (,%rax, SIZE), %rax;
  2192. LEAQ (ptrba, %rax, 2), ptrba;
  2193. LEAQ (ptrbb, %rax, 2), ptrbb;
  2194. #endif
  2195. #if defined(TRMMKERNEL) && defined(LEFT)
  2196. ADDQ $2, kk;
  2197. #endif
  2198. ADDQ $2*SIZE, C0;
  2199. ADDQ $2*SIZE, C1;
  2200. .L23_loopE:
  2201. TEST $1, bm; // Rm = 1
  2202. JLE .L24_loopE;
  2203. ALIGN_5;
  2204. .L24_bodyB:
  2205. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2206. MOVQ bb, ptrbb;
  2207. #else
  2208. MOVQ bb, ptrbb;
  2209. MOVQ kk, %rax;
  2210. LEAQ (, %rax, SIZE), %rax;
  2211. ADDQ %rax, ptrba;
  2212. LEAQ (ptrbb, %rax, 2), ptrbb;
  2213. #endif
  2214. XOR_DY yvec15, yvec15, yvec15;
  2215. #ifndef TRMMKERNEL
  2216. MOVQ bk, k;
  2217. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  2218. MOVQ bk, %rax;
  2219. SUBQ kk, %rax;
  2220. MOVQ %rax, kkk;
  2221. #else
  2222. MOVQ kk, %rax;
  2223. #ifdef LEFT
  2224. ADDQ $1, %rax;
  2225. #else
  2226. ADDQ $2, %rax;
  2227. #endif
  2228. MOVQ %rax, kkk;
  2229. #endif
  2230. SARQ $2, k;
  2231. JLE .L241_loopE;
  2232. ALIGN_5
  2233. .L241_bodyB:
  2234. BROAD_DX 0*SIZE(ptrba), xvec0;
  2235. LD_DX 0*SIZE(ptrbb), xvec2;
  2236. MUL_DX xvec0, xvec2, xvec2;
  2237. ADD_DX xvec2, xvec15, xvec15;
  2238. BROAD_DX 1*SIZE(ptrba), xvec1;
  2239. LD_DX 2*SIZE(ptrbb), xvec3;
  2240. MUL_DX xvec1, xvec3, xvec3;
  2241. ADD_DX xvec3, xvec15, xvec15;
  2242. BROAD_DX 2*SIZE(ptrba), xvec0;
  2243. LD_DX 4*SIZE(ptrbb), xvec2;
  2244. MUL_DX xvec0, xvec2, xvec2;
  2245. ADD_DX xvec2, xvec15, xvec15;
  2246. BROAD_DX 3*SIZE(ptrba), xvec1;
  2247. LD_DX 6*SIZE(ptrbb), xvec3;
  2248. MUL_DX xvec1, xvec3, xvec3;
  2249. ADD_DX xvec3, xvec15, xvec15;
  2250. ADDQ $4*SIZE, ptrba;
  2251. ADDQ $8*SIZE, ptrbb;
  2252. DECQ k;
  2253. JG .L241_bodyB;
  2254. ALIGN_5
  2255. .L241_loopE:
  2256. #ifndef TRMMKERNEL
  2257. TEST $2, bk;
  2258. #else
  2259. MOVQ kkk, %rax;
  2260. TEST $2, %rax;
  2261. #endif
  2262. JLE .L242_loopE;
  2263. ALIGN_5
  2264. .L242_bodyB:
  2265. BROAD_DX 0*SIZE(ptrba), xvec0;
  2266. LD_DX 0*SIZE(ptrbb), xvec2;
  2267. MUL_DX xvec0, xvec2, xvec2;
  2268. ADD_DX xvec2, xvec15, xvec15;
  2269. BROAD_DX 1*SIZE(ptrba), xvec1;
  2270. LD_DX 2*SIZE(ptrbb), xvec3;
  2271. MUL_DX xvec1, xvec3, xvec3;
  2272. ADD_DX xvec3, xvec15, xvec15;
  2273. ADDQ $2*SIZE, ptrba;
  2274. ADDQ $4*SIZE, ptrbb;
  2275. .L242_loopE:
  2276. #ifndef TRMMKERNEL
  2277. TEST $1, bk;
  2278. #else
  2279. MOVQ kkk, %rax;
  2280. TEST $1, %rax;
  2281. #endif
  2282. JLE .L243_loopE;
  2283. ALIGN_5
  2284. .L243_bodyB:
  2285. BROAD_DX 0*SIZE(ptrba), xvec0;
  2286. LD_DX 0*SIZE(ptrbb), xvec2;
  2287. MUL_DX xvec0, xvec2, xvec2;
  2288. ADD_DX xvec2, xvec15, xvec15;
  2289. ADDQ $1*SIZE, ptrba;
  2290. ADDQ $2*SIZE, ptrbb;
  2291. .L243_loopE:
  2292. BROAD_DX MEMALPHA, xvec7;
  2293. MUL_DX xvec7, xvec15, xvec15;
  2294. #ifndef TRMMKERNEL
  2295. LDL_DX 0*SIZE(C0), xvec0, xvec0;
  2296. LDH_DX 0*SIZE(C1), xvec0, xvec0;
  2297. ADD_DX xvec0, xvec15, xvec15;
  2298. #endif
  2299. STL_DX xvec15, 0*SIZE(C0);
  2300. STH_DX xvec15, 0*SIZE(C1);
  2301. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2302. MOVQ bk, %rax;
  2303. SUBQ kkk, %rax;
  2304. LEAQ (,%rax, SIZE), %rax;
  2305. ADDQ %rax, ptrba;
  2306. LEAQ (ptrbb, %rax, 2), ptrbb;
  2307. #endif
  2308. #if defined(TRMMKERNEL) && defined(LEFT)
  2309. ADDQ $1, kk;
  2310. #endif
  2311. ADDQ $1*SIZE, C0;
  2312. ADDQ $1*SIZE, C1;
  2313. .L24_loopE:
  2314. #if defined(TRMMKERNEL) && !defined(LEFT)
  2315. ADDQ $2, kk;
  2316. #endif
  2317. MOVQ bk, k;
  2318. SALQ $4, k;
  2319. ADDQ k, bb;
  2320. LEAQ (C, ldc, 2), C;
  2321. .L20_loopE:;
  2322. TEST $1, bn; // Rn = 1
  2323. JLE .L30_loopE;
  2324. ALIGN_5
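// NOTE (annotation added for readability): the block below handles the last single
// column of B (bn & 1). Only C0 is written, and the microkernels shrink from 8x1
// (.L31*) through 4x1 and 2x1 down to the scalar 1x1 tail at .L34*.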
  2325. .L30_bodyB:
  2326. #if defined(TRMMKERNEL)&&defined(LEFT)
  2327. MOVQ OFFSET, %rax;
  2328. MOVQ %rax, kk;
  2329. #endif
  2330. MOVQ C, C0;
  2331. MOVQ ba, ptrba;
  2332. MOVQ bm, i;
  2333. SARQ $3, i;
  2334. JLE .L31_loopE;
  2335. ALIGN_5
  2336. .L31_bodyB:
  2337. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2338. MOVQ bb, ptrbb;
  2339. #else
  2340. MOVQ bb, ptrbb;
2341. MOVQ kk, %rax;
  2342. LEAQ (, %rax, SIZE), %rax;
  2343. LEAQ (ptrba, %rax, 8), ptrba;
  2344. ADDQ %rax, ptrbb;
  2345. #endif
  2346. //#### Initial Results Register ####
  2347. XOR_DY yvec15, yvec15, yvec15;
  2348. XOR_DY yvec14, yvec14, yvec14;
  2349. #ifndef TRMMKERNEL
  2350. MOVQ bk, k;
  2351. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  2352. MOVQ bk, %rax;
  2353. SUBQ kk, %rax;
  2354. MOVQ %rax, kkk;
  2355. #else
  2356. MOVQ kk, %rax;
  2357. #ifdef LEFT
  2358. ADDQ $8, %rax;
  2359. #else
  2360. ADDQ $1, %rax;
  2361. #endif
  2362. MOVQ %rax, kkk;
  2363. #endif
  2364. SARQ $2, k;
  2365. JLE .L311_loopE;
  2366. ALIGN_5
  2367. .L311_bodyB:
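// NOTE (added annotation): 8x1 microkernel, unrolled 4x over k; each step loads a
// full 8-wide slice of A as two ymm registers and broadcasts the single B value.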
  2368. //#### Unroll time 1 ####
  2369. LD_DY 0*SIZE(ptrba), yvec0;
  2370. LD_DY 4*SIZE(ptrba), yvec1;
  2371. BROAD_DY 0*SIZE(ptrbb), yvec2;
  2372. MUL_DY yvec2, yvec0, yvec0;
  2373. ADD_DY yvec0, yvec15, yvec15;
  2374. MUL_DY yvec2, yvec1, yvec1;
  2375. ADD_DY yvec1, yvec14, yvec14;
  2376. //#### Unroll time 2 ####
  2377. LD_DY 8*SIZE(ptrba), yvec3;
  2378. LD_DY 12*SIZE(ptrba), yvec4;
  2379. BROAD_DY 1*SIZE(ptrbb), yvec5;
  2380. MUL_DY yvec5, yvec3, yvec3;
  2381. ADD_DY yvec3, yvec15, yvec15;
2382. MUL_DY yvec5, yvec4, yvec4;
  2383. ADD_DY yvec4, yvec14, yvec14;
  2384. //#### Unroll time 3 ####
  2385. LD_DY 16*SIZE(ptrba), yvec0;
  2386. LD_DY 20*SIZE(ptrba), yvec1;
  2387. BROAD_DY 2*SIZE(ptrbb), yvec2;
  2388. MUL_DY yvec2, yvec0, yvec0;
  2389. ADD_DY yvec0, yvec15, yvec15;
  2390. MUL_DY yvec2, yvec1, yvec1;
  2391. ADD_DY yvec1, yvec14, yvec14;
2392. //#### Unroll time 4 ####
  2393. LD_DY 24*SIZE(ptrba), yvec3;
  2394. LD_DY 28*SIZE(ptrba), yvec4;
  2395. BROAD_DY 3*SIZE(ptrbb), yvec5;
  2396. MUL_DY yvec5, yvec3, yvec3;
  2397. ADD_DY yvec3, yvec15, yvec15;
  2398. ADDQ $32*SIZE, ptrba;
  2399. MUL_DY yvec5, yvec4, yvec4;
  2400. ADD_DY yvec4, yvec14, yvec14;
  2401. ADDQ $4*SIZE, ptrbb;
  2402. DECQ k;
  2403. JG .L311_bodyB;
  2404. ALIGN_5
  2405. .L311_loopE:
  2406. #ifndef TRMMKERNEL
  2407. TEST $2, bk;
  2408. #else
  2409. MOVQ kkk, %rax;
  2410. TEST $2, %rax;
  2411. #endif
  2412. JLE .L312_loopE;
  2413. ALIGN_5
  2414. .L312_bodyB:
  2415. //#### Unroll time 1 ####
  2416. LD_DY 0*SIZE(ptrba), yvec0;
  2417. LD_DY 4*SIZE(ptrba), yvec1;
  2418. BROAD_DY 0*SIZE(ptrbb), yvec2;
  2419. MUL_DY yvec2, yvec0, yvec0;
  2420. ADD_DY yvec0, yvec15, yvec15;
  2421. MUL_DY yvec2, yvec1, yvec1;
  2422. ADD_DY yvec1, yvec14, yvec14;
  2423. //#### Unroll time 2 ####
  2424. LD_DY 8*SIZE(ptrba), yvec3;
  2425. LD_DY 12*SIZE(ptrba), yvec4;
  2426. BROAD_DY 1*SIZE(ptrbb), yvec5;
  2427. MUL_DY yvec5, yvec3, yvec3;
  2428. ADD_DY yvec3, yvec15, yvec15;
  2429. ADDQ $16*SIZE, ptrba;
2430. MUL_DY yvec5, yvec4, yvec4;
  2431. ADD_DY yvec4, yvec14, yvec14;
  2432. ADDQ $2*SIZE, ptrbb;
  2433. .L312_loopE:
  2434. #ifndef TRMMKERNEL
  2435. TEST $1, bk;
  2436. #else
  2437. MOVQ kkk, %rax;
  2438. TEST $1, %rax;
  2439. #endif
  2440. JLE .L313_loopE;
  2441. ALIGN_5
  2442. .L313_bodyB:
  2443. //#### Unroll time 1 ####
  2444. LD_DY 0*SIZE(ptrba), yvec0;
  2445. LD_DY 4*SIZE(ptrba), yvec1;
  2446. BROAD_DY 0*SIZE(ptrbb), yvec2;
  2447. MUL_DY yvec2, yvec0, yvec0;
  2448. ADD_DY yvec0, yvec15, yvec15;
  2449. ADDQ $8*SIZE, ptrba;
  2450. MUL_DY yvec2, yvec1, yvec1;
  2451. ADD_DY yvec1, yvec14, yvec14;
  2452. ADDQ $1*SIZE, ptrbb;
  2453. .L313_loopE:
  2454. //#### Multiply Alpha ####
  2455. BROAD_DY MEMALPHA, yvec7;
  2456. MUL_DY yvec7, yvec15, yvec15;
  2457. MUL_DY yvec7, yvec14, yvec14;
  2458. //#### Testing Alignment ####
  2459. MOVQ C0, %rax;
  2460. OR ldc, %rax;
  2461. TEST $15, %rax;
  2462. JNE .L313_loopEx;
  2463. ALIGN_5
  2464. //#### Writing Back ####
  2465. EXTRA_DY $1, yvec15, xvec13;
  2466. EXTRA_DY $1, yvec14, xvec12;
  2467. #ifndef TRMMKERNEL
  2468. ADD_DX 0*SIZE(C0), xvec15, xvec15;
  2469. ADD_DX 2*SIZE(C0), xvec13, xvec13;
  2470. ADD_DX 4*SIZE(C0), xvec14, xvec14;
  2471. ADD_DX 6*SIZE(C0), xvec12, xvec12;
  2472. #endif
  2473. ST_DX xvec15, 0*SIZE(C0);
  2474. ST_DX xvec13, 2*SIZE(C0);
  2475. ST_DX xvec14, 4*SIZE(C0);
  2476. ST_DX xvec12, 6*SIZE(C0);
  2477. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2478. MOVQ bk, %rax;
  2479. SUBQ kkk, %rax;
  2480. LEAQ (,%rax, SIZE), %rax;
  2481. LEAQ (ptrba, %rax, 8), ptrba;
  2482. ADDQ %rax, ptrbb;
  2483. #endif
  2484. #if defined(TRMMKERNEL)&&defined(LEFT)
  2485. ADDQ $8, kk;
  2486. #endif
  2487. ADDQ $8*SIZE, C0;
  2488. DECQ i;
  2489. JG .L31_bodyB;
  2490. JMP .L31_loopE;
  2491. ALIGN_5
  2492. .L313_loopEx:
  2493. EXTRA_DY $1, yvec15, xvec13;
  2494. EXTRA_DY $1, yvec14, xvec12;
  2495. #ifndef TRMMKERNEL
  2496. LDL_DX 0*SIZE(C0), xvec11, xvec11;
  2497. LDH_DX 1*SIZE(C0), xvec11, xvec11;
  2498. LDL_DX 2*SIZE(C0), xvec10, xvec10;
  2499. LDH_DX 3*SIZE(C0), xvec10, xvec10;
  2500. LDL_DX 4*SIZE(C0), xvec9, xvec9;
  2501. LDH_DX 5*SIZE(C0), xvec9, xvec9;
  2502. LDL_DX 6*SIZE(C0), xvec8, xvec8;
  2503. LDH_DX 7*SIZE(C0), xvec8, xvec8;
  2504. ADD_DX xvec11, xvec15, xvec15;
  2505. ADD_DX xvec10, xvec13, xvec13;
  2506. ADD_DX xvec9, xvec14, xvec14;
  2507. ADD_DX xvec8, xvec12, xvec12;
  2508. #endif
  2509. STL_DX xvec15, 0*SIZE(C0);
  2510. STH_DX xvec15, 1*SIZE(C0);
  2511. STL_DX xvec13, 2*SIZE(C0);
  2512. STH_DX xvec13, 3*SIZE(C0);
  2513. STL_DX xvec14, 4*SIZE(C0);
  2514. STH_DX xvec14, 5*SIZE(C0);
  2515. STL_DX xvec12, 6*SIZE(C0);
  2516. STH_DX xvec12, 7*SIZE(C0);
  2517. #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2518. MOVQ bk, %rax;
  2519. SUBQ kkk, %rax;
  2520. LEAQ (,%rax, SIZE), %rax;
  2521. LEAQ (ptrba, %rax, 8), ptrba;
  2522. ADDQ %rax, ptrbb;
  2523. #endif
  2524. #if defined(TRMMKERNEL)&&defined(LEFT)
  2525. ADDQ $8, kk;
  2526. #endif
  2527. ADDQ $8*SIZE, C0;
  2528. DECQ i;
  2529. JG .L31_bodyB;
  2530. .L31_loopE:
  2531. TEST $4, bm
  2532. JLE .L32_loopE;
  2533. ALIGN_5
  2534. .L32_bodyB:
  2535. #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2536. MOVQ bb, ptrbb;
  2537. #else
  2538. MOVQ bb, ptrbb;
  2539. MOVQ kk, %rax;
  2540. LEAQ (,%rax, SIZE), %rax;
  2541. LEAQ (ptrba, %rax, 4), ptrba;
  2542. ADDQ %rax, ptrbb;
  2543. #endif
  2544. //#### Initial Results Register ####
  2545. XOR_DY yvec15, yvec15, yvec15;
  2546. #ifndef TRMMKERNEL
  2547. MOVQ bk, k;
  2548. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2549. MOVQ bk, %rax;
  2550. SUBQ kk, %rax;
  2551. MOVQ %rax, kkk;
  2552. #else
  2553. MOVQ kk, %rax;
  2554. #ifdef LEFT
  2555. ADDQ $4, %rax;
  2556. #else
  2557. ADDQ $1, %rax;
  2558. #endif
  2559. MOVQ %rax, kkk
  2560. #endif
  2561. SARQ $2, k;
  2562. JLE .L321_loopE;
  2563. ALIGN_5
  2564. .L321_bodyB:
  2565. LD_DY 0*SIZE(ptrba), yvec0;
  2566. BROAD_DY 0*SIZE(ptrbb), yvec1;
  2567. MUL_DY yvec0, yvec1, yvec1;
  2568. ADD_DY yvec1, yvec15, yvec15;
  2569. LD_DY 4*SIZE(ptrba), yvec2;
  2570. BROAD_DY 1*SIZE(ptrbb), yvec3;
  2571. MUL_DY yvec2, yvec3, yvec3;
  2572. ADD_DY yvec3, yvec15, yvec15;
  2573. LD_DY 8*SIZE(ptrba), yvec4;
  2574. BROAD_DY 2*SIZE(ptrbb), yvec5;
  2575. MUL_DY yvec4, yvec5, yvec5;
  2576. ADD_DY yvec5, yvec15, yvec15;
  2577. LD_DY 12*SIZE(ptrba), yvec6;
  2578. BROAD_DY 3*SIZE(ptrbb), yvec7;
  2579. MUL_DY yvec6, yvec7, yvec7;
  2580. ADD_DY yvec7, yvec15, yvec15;
  2581. ADDQ $16*SIZE, ptrba;
  2582. ADDQ $4*SIZE, ptrbb;
  2583. DECQ k;
  2584. JG .L321_bodyB;
  2585. ALIGN_5
  2586. .L321_loopE:
  2587. #ifndef TRMMKERNEL
  2588. TEST $2, bk;
  2589. #else
  2590. MOVQ kkk, %rax;
  2591. TEST $2, %rax;
  2592. #endif
  2593. JLE .L322_loopE;
  2594. ALIGN_5
  2595. .L322_bodyB:
  2596. LD_DY 0*SIZE(ptrba), yvec0;
  2597. BROAD_DY 0*SIZE(ptrbb), yvec1;
  2598. MUL_DY yvec0, yvec1, yvec1;
  2599. ADD_DY yvec1, yvec15, yvec15;
  2600. LD_DY 4*SIZE(ptrba), yvec2;
  2601. BROAD_DY 1*SIZE(ptrbb), yvec3;
  2602. MUL_DY yvec2, yvec3, yvec3;
  2603. ADD_DY yvec3, yvec15, yvec15;
  2604. ADDQ $8*SIZE, ptrba;
  2605. ADDQ $2*SIZE, ptrbb;
  2606. .L322_loopE:
  2607. #ifndef TRMMKERNEL
  2608. TEST $1, bk;
  2609. #else
  2610. MOVQ kkk, %rax;
  2611. TEST $1, %rax;
  2612. #endif
  2613. JLE .L323_loopE;
  2614. ALIGN_5
  2615. .L323_bodyB:
  2616. LD_DY 0*SIZE(ptrba), yvec0;
  2617. BROAD_DY 0*SIZE(ptrbb), yvec1;
  2618. MUL_DY yvec0, yvec1, yvec1;
  2619. ADD_DY yvec1, yvec15, yvec15;
  2620. ADDQ $4*SIZE, ptrba;
  2621. ADDQ $1*SIZE, ptrbb;
  2622. .L323_loopE:
  2623. //#### Multiply Alpha ####
  2624. BROAD_DY MEMALPHA, yvec7;
  2625. MUL_DY yvec7, yvec15, yvec15;
  2626. //#### Testing Alignment ####
  2627. MOVQ C0, %rax;
  2628. OR ldc, %rax;
  2629. TEST $15, %rax;
  2630. JNE .L323_loopEx;
  2631. ALIGN_5
  2632. //#### Writing Back ####
  2633. EXTRA_DY $1, yvec15, xvec14;
  2634. #ifndef TRMMKERNEL
  2635. ADD_DX 0*SIZE(C0), xvec15, xvec15;
  2636. ADD_DX 2*SIZE(C0), xvec14, xvec14;
  2637. #endif
  2638. ST_DX xvec15, 0*SIZE(C0);
  2639. ST_DX xvec14, 2*SIZE(C0);
  2640. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2641. MOVQ bk, %rax;
  2642. SUBQ kkk, %rax;
  2643. LEAQ (, %rax, SIZE), %rax;
  2644. LEAQ (ptrba, %rax, 4), ptrba;
  2645. ADDQ %rax, ptrbb;
  2646. #endif
  2647. #if defined(TRMMKERNEL) && defined(LEFT)
  2648. ADDQ $4, kk
  2649. #endif
  2650. ADDQ $4*SIZE, C0;
  2651. JMP .L32_loopE;
  2652. ALIGN_5
  2653. .L323_loopEx:
  2654. //#### Writing Back ####
  2655. EXTRA_DY $1, yvec15, xvec14;
  2656. #ifndef TRMMKERNEL
  2657. LDL_DX 0*SIZE(C0), xvec13, xvec13;
  2658. LDH_DX 1*SIZE(C0), xvec13, xvec13;
  2659. LDL_DX 2*SIZE(C0), xvec12, xvec12;
  2660. LDH_DX 3*SIZE(C0), xvec12, xvec12;
  2661. ADD_DX xvec13, xvec15, xvec15;
  2662. ADD_DX xvec12, xvec14, xvec14;
  2663. #endif
  2664. STL_DX xvec15, 0*SIZE(C0);
  2665. STH_DX xvec15, 1*SIZE(C0);
  2666. STL_DX xvec14, 2*SIZE(C0);
  2667. STH_DX xvec14, 3*SIZE(C0);
  2668. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
  2669. MOVQ bk, %rax;
  2670. SUBQ kkk, %rax;
  2671. LEAQ (, %rax, SIZE), %rax;
  2672. LEAQ (ptrba, %rax, 4), ptrba;
  2673. ADDQ %rax, ptrbb;
  2674. #endif
  2675. #if defined(TRMMKERNEL) && defined(LEFT)
  2676. ADDQ $4, kk
  2677. #endif
  2678. ADDQ $4*SIZE, C0;
  2679. .L32_loopE:
  2680. TEST $2, bm
  2681. JLE .L33_loopE;
  2682. ALIGN_5
  2683. .L33_bodyB:
  2684. #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2685. MOVQ bb, ptrbb;
  2686. #else
  2687. MOVQ bb, ptrbb;
2688. MOVQ kk, %rax;
2689. LEAQ (, %rax, SIZE), %rax;
2690. LEAQ (ptrba, %rax, 2), ptrba;
  2691. ADDQ %rax, ptrbb;
  2692. #endif
  2693. //#### Initial Result ####
  2694. XOR_DY yvec15, yvec15, yvec15;
  2695. #ifndef TRMMKERNEL
  2696. MOVQ bk, k;
  2697. #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  2698. MOVQ bk, %rax;
  2699. SUBQ kk, %rax;
  2700. MOVQ %rax, kkk;
  2701. #else
  2702. MOVQ kk, %rax;
  2703. #ifdef LEFT
  2704. ADDQ $2, %rax;
  2705. #else
  2706. ADDQ $1, %rax;
  2707. #endif
  2708. MOVQ %rax, kkk;
  2709. #endif
  2710. SARQ $2, k;
  2711. JLE .L331_loopE;
  2712. ALIGN_5
  2713. .L331_bodyB:
  2714. LD_DX 0*SIZE(ptrba), xvec0;
  2715. BROAD_DX 0*SIZE(ptrbb), xvec2;
  2716. MUL_DX xvec0, xvec2, xvec2;
  2717. ADD_DX xvec2, xvec15, xvec15;
  2718. LD_DX 2*SIZE(ptrba), xvec1;
  2719. BROAD_DX 1*SIZE(ptrbb), xvec3;
  2720. MUL_DX xvec1, xvec3, xvec3;
  2721. ADD_DX xvec3, xvec15, xvec15;
  2722. LD_DX 4*SIZE(ptrba), xvec4;
  2723. BROAD_DX 2*SIZE(ptrbb), xvec5;
  2724. MUL_DX xvec4, xvec5, xvec5;
  2725. ADD_DX xvec5, xvec15, xvec15;
  2726. LD_DX 6*SIZE(ptrba), xvec6;
  2727. BROAD_DX 3*SIZE(ptrbb), xvec7;
  2728. MUL_DX xvec6, xvec7, xvec7;
  2729. ADD_DX xvec7, xvec15, xvec15;
  2730. ADDQ $8*SIZE, ptrba;
  2731. ADDQ $4*SIZE, ptrbb;
  2732. DECQ k;
  2733. JG .L331_bodyB;
  2734. ALIGN_5
  2735. .L331_loopE:
  2736. #ifndef TRMMKERNEL
2737. TEST $2, bk;
  2738. #else
  2739. MOVQ kkk, %rax;
  2740. TEST $2, %rax
  2741. #endif
  2742. JLE .L332_loopE;
  2743. ALIGN_5
  2744. .L332_bodyB:
  2745. LD_DX 0*SIZE(ptrba), xvec0;
  2746. BROAD_DX 0*SIZE(ptrbb), xvec2;
  2747. MUL_DX xvec0, xvec2, xvec2;
  2748. ADD_DX xvec2, xvec15, xvec15;
  2749. LD_DX 2*SIZE(ptrba), xvec1;
  2750. BROAD_DX 1*SIZE(ptrbb), xvec3;
  2751. MUL_DX xvec1, xvec3, xvec3;
  2752. ADD_DX xvec3, xvec15, xvec15;
  2753. ADDQ $4*SIZE, ptrba;
  2754. ADDQ $2*SIZE, ptrbb;
  2755. .L332_loopE:
  2756. #ifndef TRMMKERNEL
  2757. TEST $1, bk;
  2758. #else
  2759. MOVQ kkk, %rax;
  2760. TEST $1, %rax;
  2761. #endif
  2762. JLE .L333_loopE;
  2763. ALIGN_5
  2764. .L333_bodyB:
  2765. LD_DX 0*SIZE(ptrba), xvec0;
  2766. BROAD_DX 0*SIZE(ptrbb), xvec2;
  2767. MUL_DX xvec0, xvec2, xvec2;
  2768. ADD_DX xvec2, xvec15, xvec15;
  2769. ADDQ $2*SIZE, ptrba;
  2770. ADDQ $1*SIZE, ptrbb;
  2771. .L333_loopE:
  2772. //#### Multiply Alpha ####
  2773. BROAD_DX MEMALPHA, xvec7;
  2774. MUL_DX xvec7, xvec15, xvec15;
  2775. #ifndef TRMMKERNEL
  2776. LDL_DX 0*SIZE(C0), xvec14, xvec14;
  2777. LDH_DX 1*SIZE(C0), xvec14, xvec14;
  2778. ADD_DX xvec14, xvec15, xvec15;
  2779. #endif
  2780. STL_DX xvec15, 0*SIZE(C0);
  2781. STH_DX xvec15, 1*SIZE(C0);
  2782. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2783. MOVQ bk, %rax;
  2784. SUBQ kkk, %rax;
  2785. LEAQ (,%rax, SIZE), %rax;
  2786. LEAQ (ptrba, %rax, 2), ptrba;
  2787. ADDQ %rax, ptrbb;
  2788. #endif
  2789. #if defined(TRMMKERNEL) && defined(LEFT)
  2790. addq $2, kk
  2791. #endif
  2792. ADDQ $2*SIZE, C0;
  2793. .L33_loopE:
  2794. TEST $1, bm
  2795. JLE .L34_loopE;
  2796. ALIGN_5
  2797. .L34_bodyB:
  2798. #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2799. MOVQ bb, ptrbb;
  2800. #else
  2801. MOVQ bb, ptrbb;
  2802. MOVQ kk, %rax;
  2803. LEAQ (, %rax, SIZE), %rax;
  2804. ADDQ %rax, ptrba;
  2805. ADDQ %rax, ptrbb;
  2806. #endif
  2807. XOR_DY yvec15, yvec15, yvec15;
  2808. #ifndef TRMMKERNEL
  2809. MOVQ bk, k;
  2810. #elif (defined(LEFT)&& !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
  2811. MOVQ bk, %rax;
  2812. SUBQ kk, %rax;
  2813. MOVQ %rax, kkk;
  2814. #else
  2815. MOVQ kk, %rax;
  2816. #ifdef LEFT
  2817. ADDQ $1, %rax;
  2818. #else
  2819. ADDQ $1, %rax;
  2820. #endif
  2821. MOVQ %rax, kkk;
  2822. #endif
  2823. SARQ $2, k;
  2824. JLE .L341_loopE;
  2825. ALIGN_5
  2826. .L341_bodyB:
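// NOTE (added annotation): scalar 1x1 remainder, unrolled 4x over k with plain
// vmovsd/vmulsd/vaddsd; the bk&2 and bk&1 tails below handle the leftover iterations.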
  2827. vmovsd 0*SIZE(ptrba), xvec0;
  2828. vmovsd 0*SIZE(ptrbb), xvec1;
  2829. vmulsd xvec0, xvec1, xvec1;
  2830. vaddsd xvec1, xvec15, xvec15;
  2831. vmovsd 1*SIZE(ptrba), xvec0;
  2832. vmovsd 1*SIZE(ptrbb), xvec1;
  2833. vmulsd xvec0, xvec1, xvec1;
  2834. vaddsd xvec1, xvec15, xvec15;
  2835. vmovsd 2*SIZE(ptrba), xvec0;
  2836. vmovsd 2*SIZE(ptrbb), xvec1;
  2837. vmulsd xvec0, xvec1, xvec1;
  2838. vaddsd xvec1, xvec15, xvec15;
  2839. vmovsd 3*SIZE(ptrba), xvec0;
  2840. vmovsd 3*SIZE(ptrbb), xvec1;
  2841. vmulsd xvec0, xvec1, xvec1;
  2842. vaddsd xvec1, xvec15, xvec15;
  2843. addq $4*SIZE, ptrba;
  2844. addq $4*SIZE, ptrbb;
  2845. decq k;
  2846. JG .L341_bodyB;
  2847. ALIGN_5
  2848. .L341_loopE:
  2849. #ifndef TRMMKERNEL
  2850. TEST $2, bk;
  2851. #else
  2852. MOVQ kkk, %rax;
  2853. TEST $2, %rax;
  2854. #endif
  2855. JLE .L342_loopE;
  2856. ALIGN_5
  2857. .L342_bodyB:
  2858. vmovsd 0*SIZE(ptrba), xvec0;
  2859. vmovsd 0*SIZE(ptrbb), xvec1;
  2860. vmulsd xvec0, xvec1, xvec1;
  2861. vaddsd xvec1, xvec15, xvec15;
  2862. vmovsd 1*SIZE(ptrba), xvec0;
  2863. vmovsd 1*SIZE(ptrbb), xvec1;
  2864. vmulsd xvec0, xvec1, xvec1;
  2865. vaddsd xvec1, xvec15, xvec15;
  2866. addq $2*SIZE, ptrba;
  2867. addq $2*SIZE, ptrbb;
  2868. .L342_loopE:
  2869. #ifndef TRMMKERNEL
  2870. TEST $1, bk
  2871. #else
  2872. MOVQ kkk, %rax;
  2873. TEST $1, %rax;
  2874. #endif
  2875. JLE .L343_loopE;
  2876. ALIGN_5
  2877. .L343_bodyB:
  2878. vmovsd 0*SIZE(ptrba), xvec0;
  2879. vmovsd 0*SIZE(ptrbb), xvec1;
  2880. vmulsd xvec0, xvec1, xvec1;
  2881. vaddsd xvec1, xvec15, xvec15;
  2882. addq $1*SIZE, ptrba;
  2883. addq $1*SIZE, ptrbb;
  2884. .L343_loopE:
  2885. //#### Writing Back ####
  2886. vmovsd MEMALPHA, xvec7;
  2887. vmulsd xvec7, xvec15, xvec15;
  2888. #ifndef TRMMKERNEL
  2889. vmovsd 0*SIZE(C0), xvec0;
  2890. vaddsd xvec0, xvec15, xvec15;
  2891. #endif
2892. vmovsd xvec15, 0*SIZE(C0);
  2893. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2894. MOVQ bk, %rax;
  2895. SUBQ kkk, %rax;
  2896. LEAQ (,%rax, SIZE), %rax;
  2897. ADDQ %rax, ptrba;
  2898. ADDQ %rax, ptrbb;
  2899. #endif
  2900. #if defined(TRMMKERNEL) && defined(LEFT)
  2901. addq $1, kk
  2902. #endif
  2903. addq $1*SIZE, C0;
  2904. .L34_loopE:
  2905. MOVQ bk, k
  2906. SALQ $3, k;
  2907. ADDQ k, bb;
  2908. LEAQ (C, ldc, 1), C;
  2909. .L30_loopE:
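// NOTE (added annotation): epilogue; restore the callee-saved registers spilled in the
// prologue, run vzeroupper to avoid AVX/SSE transition penalties, restore rdi/rsi and
// xmm6-xmm15 under WINDOWS_ABI, then release the stack frame and return.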
  2910. movq 0(%rsp), %rbx;
  2911. movq 8(%rsp), %rbp;
  2912. movq 16(%rsp), %r12;
  2913. movq 24(%rsp), %r13;
  2914. movq 32(%rsp), %r14;
  2915. movq 40(%rsp), %r15;
  2916. vzeroupper
  2917. #ifdef WINDOWS_ABI
  2918. movq 48(%rsp), %rdi
  2919. movq 56(%rsp), %rsi
  2920. movups 64(%rsp), %xmm6
  2921. movups 80(%rsp), %xmm7
  2922. movups 96(%rsp), %xmm8
  2923. movups 112(%rsp), %xmm9
  2924. movups 128(%rsp), %xmm10
  2925. movups 144(%rsp), %xmm11
  2926. movups 160(%rsp), %xmm12
  2927. movups 176(%rsp), %xmm13
  2928. movups 192(%rsp), %xmm14
  2929. movups 208(%rsp), %xmm15
  2930. #endif
  2931. addq $STACKSIZE, %rsp;
  2932. ret
  2933. EPILOGUE