
sgemm_kernel_16x4_sandy.S

  1. /*********************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. **********************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. #define OLD_M %rdi
  30. #define OLD_N %rsi
  31. #define M %r13
  32. #define J %r14
  33. #define OLD_K %rdx
  34. #define A %rcx
  35. #define B %r8
  36. #define C %r9
  37. #define LDC %r10
  38. #define I %r11
  39. #define AO %rdi
  40. #define BO %rsi
  41. #define CO1 %r15
  42. #define K %r12
  43. #define BI %rbp
  44. #define SP %rbx
  45. #define BO1 %rdi
  46. #define CO2 %rdx
  47. #ifndef WINDOWS_ABI
  48. #define STACKSIZE 96
  49. #else
  50. #define STACKSIZE 256
  51. #define OLD_A 40 + STACKSIZE(%rsp)
  52. #define OLD_B 48 + STACKSIZE(%rsp)
  53. #define OLD_C 56 + STACKSIZE(%rsp)
  54. #define OLD_LDC 64 + STACKSIZE(%rsp)
  55. #define OLD_OFFSET 72 + STACKSIZE(%rsp)
  56. #endif
  57. #define L_BUFFER_SIZE 8192
  58. #define Ndiv6 24(%rsp)
  59. #define Nmod6 32(%rsp)
  60. #define N 40(%rsp)
  61. #define ALPHA 48(%rsp)
  62. #define OFFSET 56(%rsp)
  63. #define KK 64(%rsp)
  64. #define KKK 72(%rsp)
  65. #define BUFFER1 128(%rsp)
  66. #if defined(OS_WINDOWS)
  67. #if L_BUFFER_SIZE > 16384
  68. #define STACK_TOUCH \
  69. movl $0, 4096 * 4(%rsp);\
  70. movl $0, 4096 * 3(%rsp);\
  71. movl $0, 4096 * 2(%rsp);\
  72. movl $0, 4096 * 1(%rsp);
  73. #elif L_BUFFER_SIZE > 12288
  74. #define STACK_TOUCH \
  75. movl $0, 4096 * 3(%rsp);\
  76. movl $0, 4096 * 2(%rsp);\
  77. movl $0, 4096 * 1(%rsp);
  78. #elif L_BUFFER_SIZE > 8192
  79. #define STACK_TOUCH \
  80. movl $0, 4096 * 2(%rsp);\
  81. movl $0, 4096 * 1(%rsp);
  82. #elif L_BUFFER_SIZE > 4096
  83. #define STACK_TOUCH \
  84. movl $0, 4096 * 1(%rsp);
  85. #else
  86. #define STACK_TOUCH
  87. #endif
  88. #else
  89. #define STACK_TOUCH
  90. #endif
  91. #define A_PR1 512
  92. #define B_PR1 512
  93. /*******************************************************************************************
  94. * 4 lines of N
  95. *******************************************************************************************/
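/* Note on the micro-kernels below: each KERNELMx4_SUB performs one k-step of a
 * rank-1 update on an M x 4 tile of C, multiplying M packed elements of A by four
 * broadcast elements of B and accumulating in registers (ymm4..ymm11 in the 16x4
 * case). Sandy Bridge AVX has no FMA, so every product is a separate vmulps
 * followed by vaddps. An illustrative C-style sketch of one invocation (the array
 * names are only for explanation and do not appear in this file):
 *
 *     for (int i = 0; i < M; i++)
 *         for (int j = 0; j < 4; j++)
 *             acc[i][j] += A_packed[i] * B_packed[j];  // then A_packed += M, B_packed += 4
 *
 * which matches the addq of M to %rax and of 4 to BI at the end of each macro.
 */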
  96. .macro KERNEL16x4_SUB
  97. vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
  98. vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1
  99. vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2
  100. vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3
  101. vmulps %ymm2 , %ymm0 , %ymm12
  102. vmulps %ymm2 , %ymm1 , %ymm13
  103. vmulps %ymm3 , %ymm0 , %ymm14
  104. vmulps %ymm3 , %ymm1 , %ymm15
  105. vaddps %ymm12, %ymm4 , %ymm4
  106. vaddps %ymm13, %ymm5 , %ymm5
  107. vaddps %ymm14, %ymm6 , %ymm6
  108. vaddps %ymm15, %ymm7 , %ymm7
  109. vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2
  110. vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3
  111. vmulps %ymm2 , %ymm0 , %ymm12
  112. vmulps %ymm2 , %ymm1 , %ymm13
  113. vmulps %ymm3 , %ymm0 , %ymm14
  114. vmulps %ymm3 , %ymm1 , %ymm15
  115. vaddps %ymm12, %ymm8 , %ymm8
  116. vaddps %ymm13, %ymm9 , %ymm9
  117. vaddps %ymm14, %ymm10, %ymm10
  118. vaddps %ymm15, %ymm11, %ymm11
  119. addq $ 4 , BI
  120. addq $ 16, %rax
  121. .endm
  122. .macro SAVE16x4
  123. vbroadcastss ALPHA, %ymm0
  124. vmulps %ymm0 , %ymm4 , %ymm4
  125. vmulps %ymm0 , %ymm5 , %ymm5
  126. vmulps %ymm0 , %ymm6 , %ymm6
  127. vmulps %ymm0 , %ymm7 , %ymm7
  128. vmulps %ymm0 , %ymm8 , %ymm8
  129. vmulps %ymm0 , %ymm9 , %ymm9
  130. vmulps %ymm0 , %ymm10, %ymm10
  131. vmulps %ymm0 , %ymm11, %ymm11
  132. #if !defined(TRMMKERNEL)
  133. vaddps (CO1), %ymm4,%ymm4
  134. vaddps 8 * SIZE(CO1), %ymm5,%ymm5
  135. vaddps (CO1, LDC), %ymm6,%ymm6
  136. vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7
  137. vaddps (CO2), %ymm8,%ymm8
  138. vaddps 8 * SIZE(CO2), %ymm9,%ymm9
  139. vaddps (CO2, LDC), %ymm10,%ymm10
  140. vaddps 8 * SIZE(CO2, LDC), %ymm11,%ymm11
  141. #endif
  142. vmovups %ymm4 , (CO1)
  143. vmovups %ymm5 , 8 * SIZE(CO1)
  144. vmovups %ymm6 , (CO1, LDC)
  145. vmovups %ymm7 , 8 * SIZE(CO1, LDC)
  146. vmovups %ymm8 , (CO2)
  147. vmovups %ymm9 , 8 * SIZE(CO2)
  148. vmovups %ymm10, (CO2, LDC)
  149. vmovups %ymm11, 8 * SIZE(CO2, LDC)
  150. prefetcht0 64(CO1)
  151. prefetcht0 64(CO1, LDC)
  152. prefetcht0 64(CO2)
  153. prefetcht0 64(CO2, LDC)
  154. .endm
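/* SAVE16x4 writes the finished 16 x 4 tile back to C: every accumulator is scaled
 * by ALPHA, the existing C values are added unless TRMMKERNEL is defined (the TRMM
 * variant overwrites C), and the four output columns are addressed as CO1,
 * CO1 + LDC, CO2 and CO2 + LDC (CO2 = CO1 + 2*LDC). The SAVE macros for the
 * smaller tiles below follow the same pattern. */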
  155. /*******************************************************************************************/
  156. .macro KERNEL8x4_SUB
  157. vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
  158. vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2
  159. vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3
  160. vmulps %ymm2 , %ymm0 , %ymm12
  161. vmulps %ymm3 , %ymm0 , %ymm14
  162. vaddps %ymm12, %ymm4 , %ymm4
  163. vaddps %ymm14, %ymm6 , %ymm6
  164. vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2
  165. vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3
  166. vmulps %ymm2 , %ymm0 , %ymm12
  167. vmulps %ymm3 , %ymm0 , %ymm14
  168. vaddps %ymm12, %ymm8 , %ymm8
  169. vaddps %ymm14, %ymm10, %ymm10
  170. addq $ 4 , BI
  171. addq $ 8 , %rax
  172. .endm
  173. .macro SAVE8x4
  174. vbroadcastss ALPHA, %ymm0
  175. vmulps %ymm0 , %ymm4 , %ymm4
  176. vmulps %ymm0 , %ymm6 , %ymm6
  177. vmulps %ymm0 , %ymm8 , %ymm8
  178. vmulps %ymm0 , %ymm10, %ymm10
  179. #if !defined(TRMMKERNEL)
  180. vaddps (CO1), %ymm4,%ymm4
  181. vaddps (CO1, LDC), %ymm6,%ymm6
  182. vaddps (CO2), %ymm8,%ymm8
  183. vaddps (CO2, LDC), %ymm10,%ymm10
  184. #endif
  185. vmovups %ymm4 , (CO1)
  186. vmovups %ymm6 , (CO1, LDC)
  187. vmovups %ymm8 , (CO2)
  188. vmovups %ymm10, (CO2, LDC)
  189. .endm
  190. /*******************************************************************************************/
  191. .macro KERNEL4x4_SUB
  192. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
  193. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2
  194. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3
  195. vmulps %xmm2 , %xmm0 , %xmm12
  196. vmulps %xmm3 , %xmm0 , %xmm14
  197. vaddps %xmm12, %xmm4 , %xmm4
  198. vaddps %xmm14, %xmm6 , %xmm6
  199. vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2
  200. vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3
  201. vmulps %xmm2 , %xmm0 , %xmm12
  202. vmulps %xmm3 , %xmm0 , %xmm14
  203. vaddps %xmm12, %xmm8 , %xmm8
  204. vaddps %xmm14, %xmm10, %xmm10
  205. addq $ 4 , BI
  206. addq $ 4 , %rax
  207. .endm
  208. .macro SAVE4x4
  209. vbroadcastss ALPHA, %xmm0
  210. vmulps %xmm0 , %xmm4 , %xmm4
  211. vmulps %xmm0 , %xmm6 , %xmm6
  212. vmulps %xmm0 , %xmm8 , %xmm8
  213. vmulps %xmm0 , %xmm10, %xmm10
  214. #if !defined(TRMMKERNEL)
  215. vaddps (CO1), %xmm4,%xmm4
  216. vaddps (CO1, LDC), %xmm6,%xmm6
  217. vaddps (CO2), %xmm8,%xmm8
  218. vaddps (CO2, LDC), %xmm10,%xmm10
  219. #endif
  220. vmovups %xmm4 , (CO1)
  221. vmovups %xmm6 , (CO1, LDC)
  222. vmovups %xmm8 , (CO2)
  223. vmovups %xmm10, (CO2, LDC)
  224. .endm
  225. /*******************************************************************************************/
  226. .macro KERNEL2x4_SUB
  227. vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0
  228. vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1
  229. vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
  230. vmovss -3 * SIZE(BO, BI, SIZE), %xmm3
  231. vmulss %xmm2 , %xmm0 , %xmm12
  232. vmulss %xmm2 , %xmm1 , %xmm13
  233. vmulss %xmm3 , %xmm0 , %xmm14
  234. vmulss %xmm3 , %xmm1 , %xmm15
  235. vaddss %xmm12, %xmm4 , %xmm4
  236. vaddss %xmm13, %xmm5 , %xmm5
  237. vaddss %xmm14, %xmm6 , %xmm6
  238. vaddss %xmm15, %xmm7 , %xmm7
  239. vmovss -2 * SIZE(BO, BI, SIZE), %xmm2
  240. vmovss -1 * SIZE(BO, BI, SIZE), %xmm3
  241. vmulss %xmm2 , %xmm0 , %xmm12
  242. vmulss %xmm2 , %xmm1 , %xmm13
  243. vmulss %xmm3 , %xmm0 , %xmm14
  244. vmulss %xmm3 , %xmm1 , %xmm15
  245. vaddss %xmm12, %xmm8 , %xmm8
  246. vaddss %xmm13, %xmm9 , %xmm9
  247. vaddss %xmm14, %xmm10, %xmm10
  248. vaddss %xmm15, %xmm11, %xmm11
  249. addq $ 4 , BI
  250. addq $ 2, %rax
  251. .endm
  252. .macro SAVE2x4
  253. vmovss ALPHA, %xmm0
  254. vmulss %xmm0 , %xmm4 , %xmm4
  255. vmulss %xmm0 , %xmm5 , %xmm5
  256. vmulss %xmm0 , %xmm6 , %xmm6
  257. vmulss %xmm0 , %xmm7 , %xmm7
  258. vmulss %xmm0 , %xmm8 , %xmm8
  259. vmulss %xmm0 , %xmm9 , %xmm9
  260. vmulss %xmm0 , %xmm10, %xmm10
  261. vmulss %xmm0 , %xmm11, %xmm11
  262. #if !defined(TRMMKERNEL)
  263. vaddss (CO1), %xmm4,%xmm4
  264. vaddss 1 * SIZE(CO1), %xmm5,%xmm5
  265. vaddss (CO1, LDC), %xmm6,%xmm6
  266. vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7
  267. vaddss (CO2), %xmm8,%xmm8
  268. vaddss 1 * SIZE(CO2), %xmm9,%xmm9
  269. vaddss (CO2, LDC), %xmm10,%xmm10
  270. vaddss 1 * SIZE(CO2, LDC), %xmm11,%xmm11
  271. #endif
  272. vmovss %xmm4 , (CO1)
  273. vmovss %xmm5 , 1 * SIZE(CO1)
  274. vmovss %xmm6 , (CO1, LDC)
  275. vmovss %xmm7 , 1 * SIZE(CO1, LDC)
  276. vmovss %xmm8 , (CO2)
  277. vmovss %xmm9 , 1 * SIZE(CO2)
  278. vmovss %xmm10, (CO2, LDC)
  279. vmovss %xmm11, 1 * SIZE(CO2, LDC)
  280. .endm
  281. /*******************************************************************************************/
  282. .macro KERNEL1x4_SUB
  283. vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0
  284. vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
  285. vmovss -3 * SIZE(BO, BI, SIZE), %xmm3
  286. vmulss %xmm2 , %xmm0 , %xmm12
  287. vmulss %xmm3 , %xmm0 , %xmm14
  288. vaddss %xmm12, %xmm4 , %xmm4
  289. vaddss %xmm14, %xmm6 , %xmm6
  290. vmovss -2 * SIZE(BO, BI, SIZE), %xmm2
  291. vmovss -1 * SIZE(BO, BI, SIZE), %xmm3
  292. vmulss %xmm2 , %xmm0 , %xmm12
  293. vmulss %xmm3 , %xmm0 , %xmm14
  294. vaddss %xmm12, %xmm8 , %xmm8
  295. vaddss %xmm14, %xmm10, %xmm10
  296. addq $ 4 , BI
  297. addq $ 1, %rax
  298. .endm
  299. .macro SAVE1x4
  300. vmovss ALPHA, %xmm0
  301. vmulss %xmm0 , %xmm4 , %xmm4
  302. vmulss %xmm0 , %xmm6 , %xmm6
  303. vmulss %xmm0 , %xmm8 , %xmm8
  304. vmulss %xmm0 , %xmm10, %xmm10
  305. #if !defined(TRMMKERNEL)
  306. vaddss (CO1), %xmm4,%xmm4
  307. vaddss (CO1, LDC), %xmm6,%xmm6
  308. vaddss (CO2), %xmm8,%xmm8
  309. vaddss (CO2, LDC), %xmm10,%xmm10
  310. #endif
  311. vmovss %xmm4 , (CO1)
  312. vmovss %xmm6 , (CO1, LDC)
  313. vmovss %xmm8 , (CO2)
  314. vmovss %xmm10, (CO2, LDC)
  315. .endm
  316. /*******************************************************************************************/
  317. /*******************************************************************************************
  318. * 2 lines of N
  319. *******************************************************************************************/
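/* The Mx2 kernels below are the same scheme restricted to two columns of B per
 * k-step: only accumulators ymm4..ymm7 (or their xmm counterparts) are used and
 * BI advances by 2 instead of 4. */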
  320. .macro KERNEL16x2_SUB
  321. vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
  322. vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1
  323. vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2
  324. vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3
  325. vmulps %ymm2 , %ymm0 , %ymm12
  326. vmulps %ymm2 , %ymm1 , %ymm13
  327. vmulps %ymm3 , %ymm0 , %ymm14
  328. vmulps %ymm3 , %ymm1 , %ymm15
  329. vaddps %ymm12, %ymm4 , %ymm4
  330. vaddps %ymm13, %ymm5 , %ymm5
  331. vaddps %ymm14, %ymm6 , %ymm6
  332. vaddps %ymm15, %ymm7 , %ymm7
  333. addq $ 2 , BI
  334. addq $ 16, %rax
  335. .endm
  336. .macro SAVE16x2
  337. vbroadcastss ALPHA, %ymm0
  338. vmulps %ymm0 , %ymm4 , %ymm4
  339. vmulps %ymm0 , %ymm5 , %ymm5
  340. vmulps %ymm0 , %ymm6 , %ymm6
  341. vmulps %ymm0 , %ymm7 , %ymm7
  342. #if !defined(TRMMKERNEL)
  343. vaddps (CO1), %ymm4,%ymm4
  344. vaddps 8 * SIZE(CO1), %ymm5,%ymm5
  345. vaddps (CO1, LDC), %ymm6,%ymm6
  346. vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7
  347. #endif
  348. vmovups %ymm4 , (CO1)
  349. vmovups %ymm5 , 8 * SIZE(CO1)
  350. vmovups %ymm6 , (CO1, LDC)
  351. vmovups %ymm7 , 8 * SIZE(CO1, LDC)
  352. .endm
  353. /*******************************************************************************************/
  354. .macro KERNEL8x2_SUB
  355. vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
  356. vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2
  357. vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3
  358. vmulps %ymm2 , %ymm0 , %ymm12
  359. vmulps %ymm3 , %ymm0 , %ymm14
  360. vaddps %ymm12, %ymm4 , %ymm4
  361. vaddps %ymm14, %ymm6 , %ymm6
  362. addq $ 2 , BI
  363. addq $ 8 , %rax
  364. .endm
  365. .macro SAVE8x2
  366. vbroadcastss ALPHA, %ymm0
  367. vmulps %ymm0 , %ymm4 , %ymm4
  368. vmulps %ymm0 , %ymm6 , %ymm6
  369. #if !defined(TRMMKERNEL)
  370. vaddps (CO1), %ymm4,%ymm4
  371. vaddps (CO1, LDC), %ymm6,%ymm6
  372. #endif
  373. vmovups %ymm4 , (CO1)
  374. vmovups %ymm6 , (CO1, LDC)
  375. .endm
  376. /*******************************************************************************************/
  377. .macro KERNEL4x2_SUB
  378. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
  379. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2
  380. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3
  381. vmulps %xmm2 , %xmm0 , %xmm12
  382. vmulps %xmm3 , %xmm0 , %xmm14
  383. vaddps %xmm12, %xmm4 , %xmm4
  384. vaddps %xmm14, %xmm6 , %xmm6
  385. addq $ 2 , BI
  386. addq $ 4 , %rax
  387. .endm
  388. .macro SAVE4x2
  389. vbroadcastss ALPHA, %xmm0
  390. vmulps %xmm0 , %xmm4 , %xmm4
  391. vmulps %xmm0 , %xmm6 , %xmm6
  392. #if !defined(TRMMKERNEL)
  393. vaddps (CO1), %xmm4,%xmm4
  394. vaddps (CO1, LDC), %xmm6,%xmm6
  395. #endif
  396. vmovups %xmm4 , (CO1)
  397. vmovups %xmm6 , (CO1, LDC)
  398. .endm
  399. /*******************************************************************************************/
  400. .macro KERNEL2x2_SUB
  401. vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0
  402. vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1
  403. vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
  404. vmovss -3 * SIZE(BO, BI, SIZE), %xmm3
  405. vmulss %xmm2 , %xmm0 , %xmm12
  406. vmulss %xmm2 , %xmm1 , %xmm13
  407. vmulss %xmm3 , %xmm0 , %xmm14
  408. vmulss %xmm3 , %xmm1 , %xmm15
  409. vaddss %xmm12, %xmm4 , %xmm4
  410. vaddss %xmm13, %xmm5 , %xmm5
  411. vaddss %xmm14, %xmm6 , %xmm6
  412. vaddss %xmm15, %xmm7 , %xmm7
  413. addq $ 2 , BI
  414. addq $ 2, %rax
  415. .endm
  416. .macro SAVE2x2
  417. vmovss ALPHA, %xmm0
  418. vmulss %xmm0 , %xmm4 , %xmm4
  419. vmulss %xmm0 , %xmm5 , %xmm5
  420. vmulss %xmm0 , %xmm6 , %xmm6
  421. vmulss %xmm0 , %xmm7 , %xmm7
  422. #if !defined(TRMMKERNEL)
  423. vaddss (CO1), %xmm4,%xmm4
  424. vaddss 1 * SIZE(CO1), %xmm5,%xmm5
  425. vaddss (CO1, LDC), %xmm6,%xmm6
  426. vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7
  427. #endif
  428. vmovss %xmm4 , (CO1)
  429. vmovss %xmm5 , 1 * SIZE(CO1)
  430. vmovss %xmm6 , (CO1, LDC)
  431. vmovss %xmm7 , 1 * SIZE(CO1, LDC)
  432. .endm
  433. /*******************************************************************************************/
  434. .macro KERNEL1x2_SUB
  435. vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0
  436. vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
  437. vmovss -3 * SIZE(BO, BI, SIZE), %xmm3
  438. vmulss %xmm2 , %xmm0 , %xmm12
  439. vmulss %xmm3 , %xmm0 , %xmm14
  440. vaddss %xmm12, %xmm4 , %xmm4
  441. vaddss %xmm14, %xmm6 , %xmm6
  442. addq $ 2 , BI
  443. addq $ 1, %rax
  444. .endm
  445. .macro SAVE1x2
  446. vmovss ALPHA, %xmm0
  447. vmulss %xmm0 , %xmm4 , %xmm4
  448. vmulss %xmm0 , %xmm6 , %xmm6
  449. #if !defined(TRMMKERNEL)
  450. vaddss (CO1), %xmm4,%xmm4
  451. vaddss (CO1, LDC), %xmm6,%xmm6
  452. #endif
  453. vmovss %xmm4 , (CO1)
  454. vmovss %xmm6 , (CO1, LDC)
  455. .endm
  456. /*******************************************************************************************/
  457. /*******************************************************************************************
  458. * 1 line of N
  459. *******************************************************************************************/
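/* The Mx1 kernels handle a single remaining column of B: one broadcast (or scalar
 * load) of B per k-step, and BI advances by 1. */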
  460. .macro KERNEL16x1_SUB
  461. vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
  462. vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1
  463. vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2
  464. vmulps %ymm2 , %ymm0 , %ymm12
  465. vmulps %ymm2 , %ymm1 , %ymm13
  466. vaddps %ymm12, %ymm4 , %ymm4
  467. vaddps %ymm13, %ymm5 , %ymm5
  468. addq $ 1 , BI
  469. addq $ 16, %rax
  470. .endm
  471. .macro SAVE16x1
  472. vbroadcastss ALPHA, %ymm0
  473. vmulps %ymm0 , %ymm4 , %ymm4
  474. vmulps %ymm0 , %ymm5 , %ymm5
  475. #if !defined(TRMMKERNEL)
  476. vaddps (CO1), %ymm4,%ymm4
  477. vaddps 8 * SIZE(CO1), %ymm5,%ymm5
  478. #endif
  479. vmovups %ymm4 , (CO1)
  480. vmovups %ymm5 , 8 * SIZE(CO1)
  481. .endm
  482. /*******************************************************************************************/
  483. .macro KERNEL8x1_SUB
  484. vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
  485. vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2
  486. vmulps %ymm2 , %ymm0 , %ymm12
  487. vaddps %ymm12, %ymm4 , %ymm4
  488. addq $ 1 , BI
  489. addq $ 8 , %rax
  490. .endm
  491. .macro SAVE8x1
  492. vbroadcastss ALPHA, %ymm0
  493. vmulps %ymm0 , %ymm4 , %ymm4
  494. #if !defined(TRMMKERNEL)
  495. vaddps (CO1), %ymm4,%ymm4
  496. #endif
  497. vmovups %ymm4 , (CO1)
  498. .endm
  499. /*******************************************************************************************/
  500. .macro KERNEL4x1_SUB
  501. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
  502. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2
  503. vmulps %xmm2 , %xmm0 , %xmm12
  504. vaddps %xmm12, %xmm4 , %xmm4
  505. addq $ 1 , BI
  506. addq $ 4 , %rax
  507. .endm
  508. .macro SAVE4x1
  509. vbroadcastss ALPHA, %xmm0
  510. vmulps %xmm0 , %xmm4 , %xmm4
  511. #if !defined(TRMMKERNEL)
  512. vaddps (CO1), %xmm4,%xmm4
  513. #endif
  514. vmovups %xmm4 , (CO1)
  515. .endm
  516. /*******************************************************************************************/
  517. .macro KERNEL2x1_SUB
  518. vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0
  519. vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1
  520. vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
  521. vmulss %xmm2 , %xmm0 , %xmm12
  522. vmulss %xmm2 , %xmm1 , %xmm13
  523. vaddss %xmm12, %xmm4 , %xmm4
  524. vaddss %xmm13, %xmm5 , %xmm5
  525. addq $ 1 , BI
  526. addq $ 2 , %rax
  527. .endm
  528. .macro SAVE2x1
  529. vmovss ALPHA, %xmm0
  530. vmulss %xmm0 , %xmm4 , %xmm4
  531. vmulss %xmm0 , %xmm5 , %xmm5
  532. #if !defined(TRMMKERNEL)
  533. vaddss (CO1), %xmm4,%xmm4
  534. vaddss 1 * SIZE(CO1), %xmm5,%xmm5
  535. #endif
  536. vmovss %xmm4 , (CO1)
  537. vmovss %xmm5 , 1 * SIZE(CO1)
  538. .endm
  539. /*******************************************************************************************/
  540. .macro KERNEL1x1_SUB
  541. vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0
  542. vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
  543. vmulss %xmm2 , %xmm0 , %xmm12
  544. vaddss %xmm12, %xmm4 , %xmm4
  545. addq $ 1 , BI
  546. addq $ 1 , %rax
  547. .endm
  548. .macro SAVE1x1
  549. vmovss ALPHA, %xmm0
  550. vmulss %xmm0 , %xmm4 , %xmm4
  551. #if !defined(TRMMKERNEL)
  552. vaddss (CO1), %xmm4,%xmm4
  553. #endif
  554. vmovss %xmm4 , (CO1)
  555. .endm
  556. /*******************************************************************************************/
  557. /*************************************************************************************
  558. * TRMM Kernel
  559. *************************************************************************************/
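/* Entry point. The PROLOGUE saves the callee-saved registers, picks up the
 * arguments (including the Windows ABI locations), and reserves an aligned
 * scratch area on the stack whose BUFFER1 region receives packed panels of B.
 * The same body serves plain SGEMM and, when TRMMKERNEL is defined, the
 * triangular update: in that case the loads/adds of C in the SAVE macros are
 * skipped and the OFFSET/KK/KKK bookkeeping restricts each tile to its part of K. */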
  560. PROLOGUE
  561. PROFCODE
  562. subq $STACKSIZE, %rsp
  563. movq %rbx, (%rsp)
  564. movq %rbp, 8(%rsp)
  565. movq %r12, 16(%rsp)
  566. movq %r13, 24(%rsp)
  567. movq %r14, 32(%rsp)
  568. movq %r15, 40(%rsp)
  569. vzeroupper
  570. #ifdef WINDOWS_ABI
  571. movq %rdi, 48(%rsp)
  572. movq %rsi, 56(%rsp)
  573. movups %xmm6, 64(%rsp)
  574. movups %xmm7, 80(%rsp)
  575. movups %xmm8, 96(%rsp)
  576. movups %xmm9, 112(%rsp)
  577. movups %xmm10, 128(%rsp)
  578. movups %xmm11, 144(%rsp)
  579. movups %xmm12, 160(%rsp)
  580. movups %xmm13, 176(%rsp)
  581. movups %xmm14, 192(%rsp)
  582. movups %xmm15, 208(%rsp)
  583. movq ARG1, OLD_M
  584. movq ARG2, OLD_N
  585. movq ARG3, OLD_K
  586. movq OLD_A, A
  587. movq OLD_B, B
  588. movq OLD_C, C
  589. movq OLD_LDC, LDC
  590. #ifdef TRMMKERNEL
  591. vmovsd OLD_OFFSET, %xmm12
  592. #endif
  593. vmovaps %xmm3, %xmm0
  594. #else
  595. movq STACKSIZE + 8(%rsp), LDC
  596. #ifdef TRMMKERNEL
  597. movsd STACKSIZE + 16(%rsp), %xmm12
  598. #endif
  599. #endif
  600. movq %rsp, SP # save old stack
  601. subq $128 + L_BUFFER_SIZE, %rsp
  602. andq $-4096, %rsp # align stack
  603. STACK_TOUCH
  604. cmpq $0, OLD_M
  605. je .L999
  606. cmpq $0, OLD_N
  607. je .L999
  608. cmpq $0, OLD_K
  609. je .L999
  610. movq OLD_M, M
  611. movq OLD_N, N
  612. movq OLD_K, K
  613. vmovss %xmm0, ALPHA // store alpha (passed in xmm0) for the SAVE macros
  614. salq $BASE_SHIFT, LDC
  615. movq N, %rax
  616. xorq %rdx, %rdx
  617. movq $4, %rdi
  618. divq %rdi // N / 4
  619. movq %rax, Ndiv6 // N / 4
  620. movq %rdx, Nmod6 // N % 4
  621. #ifdef TRMMKERNEL
  622. vmovsd %xmm12, OFFSET
  623. vmovsd %xmm12, KK
  624. #ifndef LEFT
  625. negq KK
  626. #endif
  627. #endif
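// Note: despite their names, Ndiv6 and Nmod6 hold N / 4 and N % 4 here (see the
// division above); J counts groups of four columns and the remainder is handled
// starting at .L2_0.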
  628. movq Ndiv6, J
  629. cmpq $0, J
  630. je .L2_0
  631. ALIGN_4
  632. /*******************************************************************************************/
  633. .L4_01:
  634. // copy to sub buffer
  635. movq B, BO1
  636. leaq BUFFER1, BO // first buffer to BO
  637. movq K, %rax
  638. sarq $2, %rax // K / 4
  639. jz .L4_01b
  640. ALIGN_4
  641. .L4_01a:
  642. prefetcht0 512(BO1)
  643. prefetchw 512(BO)
  644. vmovups (BO1), %xmm0
  645. vmovups 4*SIZE(BO1), %xmm1
  646. vmovups 8*SIZE(BO1), %xmm2
  647. vmovups 12*SIZE(BO1), %xmm3
  648. vmovups %xmm0, (BO)
  649. vmovups %xmm1, 4*SIZE(BO)
  650. vmovups %xmm2, 8*SIZE(BO)
  651. vmovups %xmm3,12*SIZE(BO)
  652. addq $ 16*SIZE,BO1
  653. addq $ 16*SIZE,BO
  654. decq %rax
  655. jnz .L4_01a
  656. .L4_01b:
  657. movq K, %rax
  658. andq $3, %rax // K % 4
  659. jz .L4_02d
  660. ALIGN_4
  661. .L4_02c:
  662. vmovups (BO1), %xmm0
  663. vmovups %xmm0, (BO)
  664. addq $ 4*SIZE,BO1
  665. addq $ 4*SIZE,BO
  666. decq %rax
  667. jnz .L4_02c
  668. .L4_02d:
  669. movq BO1, B // next offset of B
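// The copy loop above packs the next four columns of B (K * 4 floats) into the
// on-stack BUFFER1 so the compute loops read B contiguously; B now points at the
// following panel.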
  670. .L4_10:
  671. movq C, CO1
  672. leaq (C, LDC, 2), CO2
  673. leaq (C, LDC, 4), C // c += 4 * ldc
  674. #if defined(TRMMKERNEL) && defined(LEFT)
  675. movq OFFSET, %rax
  676. movq %rax, KK
  677. #endif
  678. movq A, AO // aoffset = a
  679. addq $ 16 * SIZE, AO
  680. movq M, I
  681. sarq $4, I // i = (m >> 4)
  682. je .L4_20
  683. ALIGN_4
  684. .L4_11:
  685. #if !defined(TRMMKERNEL) || \
  686. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  687. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  688. leaq BUFFER1, BO // first buffer to BO
  689. addq $4 * SIZE, BO
  690. #else
  691. movq KK, %rax
  692. leaq BUFFER1, BO // first buffer to BO
  693. addq $4 * SIZE, BO
  694. movq %rax, BI // Index for BO
  695. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
  696. leaq (BO, BI, SIZE), BO
  697. salq $4, %rax // rax = rax * 16 ; number of values
  698. leaq (AO, %rax, SIZE), AO
  699. #endif
  700. vzeroall
  701. #ifndef TRMMKERNEL
  702. movq K, %rax
  703. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  704. movq K, %rax
  705. subq KK, %rax
  706. movq %rax, KKK
  707. #else
  708. movq KK, %rax
  709. #ifdef LEFT
  710. addq $16, %rax // number of values in AO
  711. #else
  712. addq $4, %rax // number of values in BO
  713. #endif
  714. movq %rax, KKK
  715. #endif
  716. andq $-8, %rax // K = K - ( K % 8 )
  717. je .L4_16
  718. movq %rax, BI // Index for BO
  719. leaq (,BI,4) , BI // BI = BI * 4 ; number of values
  720. salq $4, %rax // rax = rax * 16 ; number of values
  721. leaq (AO, %rax, SIZE), AO
  722. leaq (BO, BI, SIZE), BO
  723. negq BI
  724. negq %rax
  725. ALIGN_4
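// Indexing trick used by all the unrolled loops: AO and BO were advanced past the
// data for this tile and %rax / BI were negated, so the addq at the end of each
// KERNEL*_SUB walks them up toward zero. The je inside the unrolled body and the
// jl in the tail loop test the flags of that addq, ending the loop exactly when
// the counters reach zero.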
  726. .L4_12:
  727. prefetcht0 A_PR1(AO, %rax, SIZE)
  728. prefetcht0 B_PR1(BO, BI , SIZE)
  729. KERNEL16x4_SUB
  730. prefetcht0 A_PR1(AO, %rax, SIZE)
  731. KERNEL16x4_SUB
  732. prefetcht0 A_PR1(AO, %rax, SIZE)
  733. KERNEL16x4_SUB
  734. prefetcht0 A_PR1(AO, %rax, SIZE)
  735. KERNEL16x4_SUB
  736. prefetcht0 A_PR1(AO, %rax, SIZE)
  737. prefetcht0 B_PR1(BO, BI , SIZE)
  738. KERNEL16x4_SUB
  739. prefetcht0 A_PR1(AO, %rax, SIZE)
  740. KERNEL16x4_SUB
  741. prefetcht0 A_PR1(AO, %rax, SIZE)
  742. KERNEL16x4_SUB
  743. prefetcht0 A_PR1(AO, %rax, SIZE)
  744. KERNEL16x4_SUB
  745. je .L4_16
  746. prefetcht0 A_PR1(AO, %rax, SIZE)
  747. prefetcht0 B_PR1(BO, BI , SIZE)
  748. KERNEL16x4_SUB
  749. prefetcht0 A_PR1(AO, %rax, SIZE)
  750. KERNEL16x4_SUB
  751. prefetcht0 A_PR1(AO, %rax, SIZE)
  752. KERNEL16x4_SUB
  753. prefetcht0 A_PR1(AO, %rax, SIZE)
  754. KERNEL16x4_SUB
  755. prefetcht0 A_PR1(AO, %rax, SIZE)
  756. prefetcht0 B_PR1(BO, BI , SIZE)
  757. KERNEL16x4_SUB
  758. prefetcht0 A_PR1(AO, %rax, SIZE)
  759. KERNEL16x4_SUB
  760. prefetcht0 A_PR1(AO, %rax, SIZE)
  761. KERNEL16x4_SUB
  762. prefetcht0 A_PR1(AO, %rax, SIZE)
  763. KERNEL16x4_SUB
  764. je .L4_16
  765. jmp .L4_12
  766. ALIGN_4
  767. .L4_16:
  768. #ifndef TRMMKERNEL
  769. movq K, %rax
  770. #else
  771. movq KKK, %rax
  772. #endif
  773. andq $7, %rax # if (k & 7)
  774. je .L4_19
  775. movq %rax, BI // Index for BO
  776. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  777. salq $4, %rax // rax = rax * 16 ; number of values
  778. leaq (AO, %rax, SIZE), AO
  779. leaq (BO, BI, SIZE), BO
  780. negq BI
  781. negq %rax
  782. ALIGN_4
  783. .L4_17:
  784. KERNEL16x4_SUB
  785. jl .L4_17
  786. ALIGN_4
  787. .L4_19:
  788. SAVE16x4
  789. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  790. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  791. movq K, %rax
  792. subq KKK, %rax
  793. movq %rax, BI // Index for BO
  794. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
  795. leaq (BO, BI, SIZE), BO
  796. salq $4, %rax // rax = rax * 16 ; number of values
  797. leaq (AO, %rax, SIZE), AO
  798. #endif
  799. #if defined(TRMMKERNEL) && defined(LEFT)
  800. addq $16, KK
  801. #endif
  802. addq $16 * SIZE, CO1 # coffset += 16
  803. addq $16 * SIZE, CO2 # coffset += 16
  804. decq I # i --
  805. jg .L4_11
  806. ALIGN_4
  807. /**************************************************************************
  808. * Rest of M
  809. ***************************************************************************/
  810. .L4_20:
  811. // Test rest of M
  812. testq $15, M
  813. jz .L4_60 // to next 4 lines of N
  814. testq $8, M
  815. jz .L4_21pre
  816. ALIGN_4
  817. /**************************************************************************/
  818. .L4_20_1:
  819. #if !defined(TRMMKERNEL) || \
  820. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  821. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  822. leaq BUFFER1, BO // first buffer to BO
  823. addq $4 * SIZE, BO
  824. #else
  825. movq KK, %rax
  826. leaq BUFFER1, BO // first buffer to BO
  827. addq $4 * SIZE, BO
  828. movq %rax, BI // Index for BO
  829. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
  830. leaq (BO, BI, SIZE), BO
  831. salq $3, %rax // rax = rax * 8 ; number of values
  832. leaq (AO, %rax, SIZE), AO
  833. #endif
  834. vzeroall
  835. #ifndef TRMMKERNEL
  836. movq K, %rax
  837. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  838. movq K, %rax
  839. subq KK, %rax
  840. movq %rax, KKK
  841. #else
  842. movq KK, %rax
  843. #ifdef LEFT
  844. addq $8, %rax // number of values in A
  845. #else
  846. addq $4, %rax // number of values in BO
  847. #endif
  848. movq %rax, KKK
  849. #endif
  850. andq $-8, %rax
  851. je .L4_20_6
  852. movq %rax, BI // Index for BO
  853. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  854. salq $3, %rax // rax = rax * 8 ; number of values
  855. leaq (AO, %rax, SIZE), AO
  856. leaq (BO, BI, SIZE), BO
  857. negq BI
  858. negq %rax
  859. ALIGN_4
  860. .L4_20_2:
  861. KERNEL8x4_SUB
  862. KERNEL8x4_SUB
  863. KERNEL8x4_SUB
  864. KERNEL8x4_SUB
  865. KERNEL8x4_SUB
  866. KERNEL8x4_SUB
  867. KERNEL8x4_SUB
  868. KERNEL8x4_SUB
  869. je .L4_20_6
  870. KERNEL8x4_SUB
  871. KERNEL8x4_SUB
  872. KERNEL8x4_SUB
  873. KERNEL8x4_SUB
  874. KERNEL8x4_SUB
  875. KERNEL8x4_SUB
  876. KERNEL8x4_SUB
  877. KERNEL8x4_SUB
  878. je .L4_20_6
  879. jmp .L4_20_2
  880. ALIGN_4
  881. .L4_20_6:
  882. #ifndef TRMMKERNEL
  883. movq K, %rax
  884. #else
  885. movq KKK, %rax
  886. #endif
  887. andq $7, %rax # if (k & 7)
  888. je .L4_20_9
  889. movq %rax, BI // Index for BO
  890. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  891. salq $3, %rax // rax = rax * 8 ; number of values
  892. leaq (AO, %rax, SIZE), AO
  893. leaq (BO, BI, SIZE), BO
  894. negq BI
  895. negq %rax
  896. ALIGN_4
  897. .L4_20_7:
  898. KERNEL8x4_SUB
  899. jl .L4_20_7
  900. ALIGN_4
  901. .L4_20_9:
  902. SAVE8x4
  903. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  904. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  905. movq K, %rax
  906. subq KKK, %rax
  907. movq %rax, BI // Index for BO
  908. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
  909. leaq (BO, BI, SIZE), BO
  910. salq $3, %rax // rax = rax * 8 ; number of values
  911. leaq (AO, %rax, SIZE), AO
  912. #endif
  913. #if defined(TRMMKERNEL) && defined(LEFT)
  914. addq $8, KK
  915. #endif
  916. addq $8 * SIZE, CO1 # coffset += 8
  917. addq $8 * SIZE, CO2 # coffset += 8
  918. ALIGN_4
  919. /**************************************************************************/
  920. .L4_21pre:
  921. testq $4, M
  922. jz .L4_30
  923. ALIGN_4
  924. .L4_21:
  925. #if !defined(TRMMKERNEL) || \
  926. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  927. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  928. leaq BUFFER1, BO // first buffer to BO
  929. addq $4 * SIZE, BO
  930. #else
  931. movq KK, %rax
  932. leaq BUFFER1, BO // first buffer to BO
  933. addq $4 * SIZE, BO
  934. movq %rax, BI // Index for BO
  935. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
  936. leaq (BO, BI, SIZE), BO
  937. salq $2, %rax // rax = rax * 4 ; number of values
  938. leaq (AO, %rax, SIZE), AO
  939. #endif
  940. vzeroall
  941. #ifndef TRMMKERNEL
  942. movq K, %rax
  943. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  944. movq K, %rax
  945. subq KK, %rax
  946. movq %rax, KKK
  947. #else
  948. movq KK, %rax
  949. #ifdef LEFT
  950. addq $4, %rax // number of values in A
  951. #else
  952. addq $4, %rax // number of values in BO
  953. #endif
  954. movq %rax, KKK
  955. #endif
  956. andq $-8, %rax
  957. je .L4_26
  958. movq %rax, BI // Index for BO
  959. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  960. salq $2, %rax // rax = rax * 4 ; number of values
  961. leaq (AO, %rax, SIZE), AO
  962. leaq (BO, BI, SIZE), BO
  963. negq BI
  964. negq %rax
  965. ALIGN_4
  966. .L4_22:
  967. KERNEL4x4_SUB
  968. KERNEL4x4_SUB
  969. KERNEL4x4_SUB
  970. KERNEL4x4_SUB
  971. KERNEL4x4_SUB
  972. KERNEL4x4_SUB
  973. KERNEL4x4_SUB
  974. KERNEL4x4_SUB
  975. je .L4_26
  976. KERNEL4x4_SUB
  977. KERNEL4x4_SUB
  978. KERNEL4x4_SUB
  979. KERNEL4x4_SUB
  980. KERNEL4x4_SUB
  981. KERNEL4x4_SUB
  982. KERNEL4x4_SUB
  983. KERNEL4x4_SUB
  984. je .L4_26
  985. jmp .L4_22
  986. ALIGN_4
  987. .L4_26:
  988. #ifndef TRMMKERNEL
  989. movq K, %rax
  990. #else
  991. movq KKK, %rax
  992. #endif
  993. andq $7, %rax # if (k & 7)
  994. je .L4_29
  995. movq %rax, BI // Index for BO
  996. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  997. salq $2, %rax // rax = rax * 4 ; number of values
  998. leaq (AO, %rax, SIZE), AO
  999. leaq (BO, BI, SIZE), BO
  1000. negq BI
  1001. negq %rax
  1002. ALIGN_4
  1003. .L4_27:
  1004. KERNEL4x4_SUB
  1005. jl .L4_27
  1006. ALIGN_4
  1007. .L4_29:
  1008. SAVE4x4
  1009. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1010. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1011. movq K, %rax
  1012. subq KKK, %rax
  1013. movq %rax, BI // Index for BO
  1014. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
  1015. leaq (BO, BI, SIZE), BO
  1016. salq $2, %rax // rax = rax * 4 ; number of values
  1017. leaq (AO, %rax, SIZE), AO
  1018. #endif
  1019. #if defined(TRMMKERNEL) && defined(LEFT)
  1020. addq $4, KK
  1021. #endif
  1022. addq $4 * SIZE, CO1 # coffset += 4
  1023. addq $4 * SIZE, CO2 # coffset += 4
  1024. ALIGN_4
  1025. .L4_30:
  1026. testq $2, M
  1027. jz .L4_40
  1028. ALIGN_4
  1029. .L4_31:
  1030. #if !defined(TRMMKERNEL) || \
  1031. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1032. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1033. leaq BUFFER1, BO // first buffer to BO
  1034. addq $4 * SIZE, BO
  1035. #else
  1036. movq KK, %rax
  1037. leaq BUFFER1, BO // first buffer to BO
  1038. addq $4 * SIZE, BO
  1039. movq %rax, BI // Index for BO
  1040. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
  1041. leaq (BO, BI, SIZE), BO
  1042. salq $1, %rax // rax = rax * 2 ; number of values
  1043. leaq (AO, %rax, SIZE), AO
  1044. #endif
  1045. vzeroall
  1046. #ifndef TRMMKERNEL
  1047. movq K, %rax
  1048. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1049. movq K, %rax
  1050. subq KK, %rax
  1051. movq %rax, KKK
  1052. #else
  1053. movq KK, %rax
  1054. #ifdef LEFT
  1055. addq $2, %rax // number of values in AO
  1056. #else
  1057. addq $4, %rax // number of values in BO
  1058. #endif
  1059. movq %rax, KKK
  1060. #endif
  1061. andq $-8, %rax
  1062. je .L4_36
  1063. movq %rax, BI // Index for BO
  1064. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  1065. salq $1, %rax // rax = rax *2 ; number of values
  1066. leaq (AO, %rax, SIZE), AO
  1067. leaq (BO, BI, SIZE), BO
  1068. negq BI
  1069. negq %rax
  1070. ALIGN_4
  1071. .L4_32:
  1072. KERNEL2x4_SUB
  1073. KERNEL2x4_SUB
  1074. KERNEL2x4_SUB
  1075. KERNEL2x4_SUB
  1076. KERNEL2x4_SUB
  1077. KERNEL2x4_SUB
  1078. KERNEL2x4_SUB
  1079. KERNEL2x4_SUB
  1080. je .L4_36
  1081. KERNEL2x4_SUB
  1082. KERNEL2x4_SUB
  1083. KERNEL2x4_SUB
  1084. KERNEL2x4_SUB
  1085. KERNEL2x4_SUB
  1086. KERNEL2x4_SUB
  1087. KERNEL2x4_SUB
  1088. KERNEL2x4_SUB
  1089. je .L4_36
  1090. jmp .L4_32
  1091. ALIGN_4
  1092. .L4_36:
  1093. #ifndef TRMMKERNEL
  1094. movq K, %rax
  1095. #else
  1096. movq KKK, %rax
  1097. #endif
  1098. andq $7, %rax # if (k & 7)
  1099. je .L4_39
  1100. movq %rax, BI // Index for BO
  1101. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
  1102. salq $1, %rax // rax = rax *2 ; number of values
  1103. leaq (AO, %rax, SIZE), AO
  1104. leaq (BO, BI, SIZE), BO
  1105. negq BI
  1106. negq %rax
  1107. ALIGN_4
  1108. .L4_37:
  1109. KERNEL2x4_SUB
  1110. jl .L4_37
  1111. ALIGN_4
  1112. .L4_39:
  1113. SAVE2x4
  1114. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1115. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1116. movq K, %rax
  1117. subq KKK, %rax
  1118. movq %rax, BI // Index for BO
  1119. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
  1120. leaq (BO, BI, SIZE), BO
  1121. salq $1, %rax // rax = rax * 2 ; number of values
  1122. leaq (AO, %rax, SIZE), AO
  1123. #endif
  1124. #if defined(TRMMKERNEL) && defined(LEFT)
  1125. addq $2, KK
  1126. #endif
  1127. addq $2 * SIZE, CO1 # coffset += 2
  1128. addq $2 * SIZE, CO2 # coffset += 2
  1129. ALIGN_4
  1130. .L4_40:
  1131. testq $1, M
  1132. jz .L4_60 // to next 4 lines of N
  1133. ALIGN_4
  1134. .L4_41:
  1135. #if !defined(TRMMKERNEL) || \
  1136. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1137. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1138. leaq BUFFER1, BO // first buffer to BO
  1139. addq $4 * SIZE, BO
  1140. #else
  1141. movq KK, %rax
  1142. leaq BUFFER1, BO // first buffer to BO
  1143. addq $4 * SIZE, BO
  1144. movq %rax, BI // Index for BO
  1145. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
  1146. leaq (BO, BI, SIZE), BO
  1147. leaq (AO, %rax, SIZE), AO
  1148. #endif
  1149. vzeroall
  1150. #ifndef TRMMKERNEL
  1151. movq K, %rax
  1152. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1153. movq K, %rax
  1154. subq KK, %rax
  1155. movq %rax, KKK
  1156. #else
  1157. movq KK, %rax
  1158. #ifdef LEFT
  1159. addq $1, %rax // number of values in AO
  1160. #else
  1161. addq $4, %rax // number of values in BO
  1162. #endif
  1163. movq %rax, KKK
  1164. #endif
  1165. andq $-8, %rax
  1166. je .L4_46
  1167. movq %rax, BI // Index for BO
  1168. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  1169. leaq (AO, %rax, SIZE), AO
  1170. leaq (BO, BI, SIZE), BO
  1171. negq BI
  1172. negq %rax
  1173. ALIGN_4
  1174. .L4_42:
  1175. KERNEL1x4_SUB
  1176. KERNEL1x4_SUB
  1177. KERNEL1x4_SUB
  1178. KERNEL1x4_SUB
  1179. KERNEL1x4_SUB
  1180. KERNEL1x4_SUB
  1181. KERNEL1x4_SUB
  1182. KERNEL1x4_SUB
  1183. je .L4_46
  1184. KERNEL1x4_SUB
  1185. KERNEL1x4_SUB
  1186. KERNEL1x4_SUB
  1187. KERNEL1x4_SUB
  1188. KERNEL1x4_SUB
  1189. KERNEL1x4_SUB
  1190. KERNEL1x4_SUB
  1191. KERNEL1x4_SUB
  1192. je .L4_46
  1193. jmp .L4_42
  1194. ALIGN_4
  1195. .L4_46:
  1196. #ifndef TRMMKERNEL
  1197. movq K, %rax
  1198. #else
  1199. movq KKK, %rax
  1200. #endif
  1201. andq $7, %rax # if (k & 7)
  1202. je .L4_49
  1203. movq %rax, BI // Index for BO
  1204. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  1205. leaq (AO, %rax, SIZE), AO
  1206. leaq (BO, BI, SIZE), BO
  1207. negq BI
  1208. negq %rax
  1209. ALIGN_4
  1210. .L4_47:
  1211. KERNEL1x4_SUB
  1212. jl .L4_47
  1213. ALIGN_4
  1214. .L4_49:
  1215. SAVE1x4
  1216. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1217. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1218. movq K, %rax
  1219. subq KKK, %rax
  1220. movq %rax, BI // Index for BO
  1221. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
  1222. leaq (BO, BI, SIZE), BO
  1223. leaq (AO, %rax, SIZE), AO
  1224. #endif
  1225. #if defined(TRMMKERNEL) && defined(LEFT)
  1226. addq $1, KK
  1227. #endif
  1228. addq $1 * SIZE, CO1 # coffset += 1
  1229. addq $1 * SIZE, CO2 # coffset += 1
  1230. ALIGN_4
  1231. .L4_60:
  1232. #if defined(TRMMKERNEL) && !defined(LEFT)
  1233. addq $4, KK
  1234. #endif
  1235. decq J // j --
  1236. jg .L4_01 // next 4 lines of N
  1237. /*******************************************************************************************/
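/* Remainder columns: .L2_0 inspects N % 4 (stored in Nmod6). If there is no
 * remainder the kernel is done (.L999); if bit 1 is set the two-column code below
 * runs; a single leftover column is handled via the .L1_0 path. */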
  1238. .L2_0:
  1239. movq Nmod6, J
  1240. andq $3, J // j % 4
  1241. je .L999
  1242. movq Nmod6, J
  1243. andq $2, J // (N % 4) & 2
  1244. je .L1_0
  1245. .L2_01:
  1246. // copy to sub buffer
  1247. movq B, BO1
  1248. leaq BUFFER1, BO // first buffer to BO
  1249. movq K, %rax
  1250. sarq $2, %rax // K / 4
  1251. jz .L2_01b
  1252. ALIGN_4
  1253. .L2_01a:
  1254. vmovsd (BO1), %xmm0
  1255. vmovsd 2*SIZE(BO1), %xmm1
  1256. vmovsd 4*SIZE(BO1), %xmm2
  1257. vmovsd 6*SIZE(BO1), %xmm3
  1258. vmovsd %xmm0, (BO)
  1259. vmovsd %xmm1, 2*SIZE(BO)
  1260. vmovsd %xmm2, 4*SIZE(BO)
  1261. vmovsd %xmm3, 6*SIZE(BO)
  1262. addq $8*SIZE,BO1
  1263. addq $8*SIZE,BO
  1264. decq %rax
  1265. jnz .L2_01a
  1266. .L2_01b:
  1267. movq K, %rax
  1268. andq $3, %rax // K % 4
  1269. jz .L2_02d
  1270. ALIGN_4
  1271. .L2_02c:
  1272. vmovsd (BO1), %xmm0
  1273. vmovsd %xmm0, (BO)
  1274. addq $2*SIZE,BO1
  1275. addq $2*SIZE,BO
  1276. decq %rax
  1277. jnz .L2_02c
  1278. .L2_02d:
  1279. movq BO1, B // next offset of B
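// Same packing as the 4-column case but two columns at a time: K * 2 floats of B
// are copied into BUFFER1 (each vmovsd moves a pair of floats).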
  1280. .L2_10:
  1281. movq C, CO1
  1282. leaq (C, LDC, 2), C // c += 2 * ldc
  1283. #if defined(TRMMKERNEL) && defined(LEFT)
  1284. movq OFFSET, %rax
  1285. movq %rax, KK
  1286. #endif
  1287. movq A, AO // aoffset = a
  1288. addq $16 * SIZE, AO
  1289. movq M, I
  1290. sarq $4, I // i = (m >> 4)
  1291. je .L2_20
  1292. ALIGN_4
  1293. .L2_11:
  1294. #if !defined(TRMMKERNEL) || \
  1295. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1296. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1297. leaq BUFFER1, BO // first buffer to BO
  1298. addq $4 * SIZE, BO
  1299. #else
  1300. movq KK, %rax
  1301. leaq BUFFER1, BO // first buffer to BO
  1302. addq $4 * SIZE, BO
  1303. movq %rax, BI // Index for BO
  1304. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  1305. leaq (BO, BI, SIZE), BO
  1306. salq $4, %rax // rax = rax * 16 ; number of values
  1307. leaq (AO, %rax, SIZE), AO
  1308. #endif
  1309. vzeroall
  1310. #ifndef TRMMKERNEL
  1311. movq K, %rax
  1312. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1313. movq K, %rax
  1314. subq KK, %rax
  1315. movq %rax, KKK
  1316. #else
  1317. movq KK, %rax
  1318. #ifdef LEFT
  1319. addq $16, %rax // number of values in AO
  1320. #else
  1321. addq $2, %rax // number of values in BO
  1322. #endif
  1323. movq %rax, KKK
  1324. #endif
  1325. andq $-8, %rax // K = K - ( K % 8 )
  1326. je .L2_16
  1327. movq %rax, BI // Index for BO
  1328. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  1329. salq $4, %rax // rax = rax * 16 ; number of values
  1330. leaq (AO, %rax, SIZE), AO
  1331. leaq (BO, BI, SIZE), BO
  1332. negq BI
  1333. negq %rax
  1334. ALIGN_4
  1335. .L2_12:
  1336. KERNEL16x2_SUB
  1337. KERNEL16x2_SUB
  1338. KERNEL16x2_SUB
  1339. KERNEL16x2_SUB
  1340. KERNEL16x2_SUB
  1341. KERNEL16x2_SUB
  1342. KERNEL16x2_SUB
  1343. KERNEL16x2_SUB
  1344. je .L2_16
  1345. KERNEL16x2_SUB
  1346. KERNEL16x2_SUB
  1347. KERNEL16x2_SUB
  1348. KERNEL16x2_SUB
  1349. KERNEL16x2_SUB
  1350. KERNEL16x2_SUB
  1351. KERNEL16x2_SUB
  1352. KERNEL16x2_SUB
  1353. je .L2_16
  1354. jmp .L2_12
  1355. ALIGN_4
  1356. .L2_16:
  1357. #ifndef TRMMKERNEL
  1358. movq K, %rax
  1359. #else
  1360. movq KKK, %rax
  1361. #endif
  1362. andq $7, %rax # if (k & 7)
  1363. je .L2_19
  1364. movq %rax, BI // Index for BO
  1365. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  1366. salq $4, %rax // rax = rax * 16 ; number of values
  1367. leaq (AO, %rax, SIZE), AO
  1368. leaq (BO, BI, SIZE), BO
  1369. negq BI
  1370. negq %rax
  1371. ALIGN_4
  1372. .L2_17:
  1373. KERNEL16x2_SUB
  1374. jl .L2_17
  1375. ALIGN_4
  1376. .L2_19:
  1377. SAVE16x2
  1378. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1379. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1380. movq K, %rax
  1381. subq KKK, %rax
  1382. movq %rax, BI // Index for BO
  1383. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  1384. leaq (BO, BI, SIZE), BO
  1385. salq $4, %rax // rax = rax * 16 ; number of values
  1386. leaq (AO, %rax, SIZE), AO
  1387. #endif
  1388. #if defined(TRMMKERNEL) && defined(LEFT)
  1389. addq $16, KK
  1390. #endif
  1391. addq $16 * SIZE, CO1 # coffset += 16
  1392. decq I # i --
  1393. jg .L2_11
  1394. ALIGN_4
  1395. /**************************************************************************
  1396. * Rest of M
  1397. ***************************************************************************/
  1398. .L2_20:
  1399. // Test rest of M
  1400. testq $15, M
  1401. jz .L2_60 // to next 2 lines of N
  1402. testq $8, M
  1403. jz .L2_21pre
  1404. ALIGN_4
  1405. /**************************************************************************/
  1406. .L2_20_1:
  1407. #if !defined(TRMMKERNEL) || \
  1408. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1409. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1410. leaq BUFFER1, BO // first buffer to BO
  1411. addq $4 * SIZE, BO
  1412. #else
  1413. movq KK, %rax
  1414. leaq BUFFER1, BO // first buffer to BO
  1415. addq $4 * SIZE, BO
  1416. movq %rax, BI // Index for BO
  1417. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  1418. leaq (BO, BI, SIZE), BO
  1419. salq $3, %rax // rax = rax * 8 ; number of values
  1420. leaq (AO, %rax, SIZE), AO
  1421. #endif
  1422. vzeroall
  1423. #ifndef TRMMKERNEL
  1424. movq K, %rax
  1425. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1426. movq K, %rax
  1427. subq KK, %rax
  1428. movq %rax, KKK
  1429. #else
  1430. movq KK, %rax
  1431. #ifdef LEFT
1432. addq $8, %rax // number of values in AO
  1433. #else
  1434. addq $2, %rax // number of values in BO
  1435. #endif
  1436. movq %rax, KKK
  1437. #endif
  1438. andq $-8, %rax
  1439. je .L2_20_6
  1440. movq %rax, BI // Index for BO
  1441. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  1442. salq $3, %rax // rax = rax * 8 ; number of values
  1443. leaq (AO, %rax, SIZE), AO
  1444. leaq (BO, BI, SIZE), BO
  1445. negq BI
  1446. negq %rax
  1447. ALIGN_4
  1448. .L2_20_2:
  1449. KERNEL8x2_SUB
  1450. KERNEL8x2_SUB
  1451. KERNEL8x2_SUB
  1452. KERNEL8x2_SUB
  1453. KERNEL8x2_SUB
  1454. KERNEL8x2_SUB
  1455. KERNEL8x2_SUB
  1456. KERNEL8x2_SUB
  1457. je .L2_20_6
  1458. KERNEL8x2_SUB
  1459. KERNEL8x2_SUB
  1460. KERNEL8x2_SUB
  1461. KERNEL8x2_SUB
  1462. KERNEL8x2_SUB
  1463. KERNEL8x2_SUB
  1464. KERNEL8x2_SUB
  1465. KERNEL8x2_SUB
  1466. je .L2_20_6
  1467. jmp .L2_20_2
  1468. ALIGN_4
  1469. .L2_20_6:
  1470. #ifndef TRMMKERNEL
  1471. movq K, %rax
  1472. #else
  1473. movq KKK, %rax
  1474. #endif
1475. andq $7, %rax # if (k & 7)
  1476. je .L2_20_9
  1477. movq %rax, BI // Index for BO
  1478. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  1479. salq $3, %rax // rax = rax * 8 ; number of values
  1480. leaq (AO, %rax, SIZE), AO
  1481. leaq (BO, BI, SIZE), BO
  1482. negq BI
  1483. negq %rax
  1484. ALIGN_4
  1485. .L2_20_7:
  1486. KERNEL8x2_SUB
  1487. jl .L2_20_7
  1488. ALIGN_4
  1489. .L2_20_9:
  1490. SAVE8x2
  1491. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1492. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1493. movq K, %rax
  1494. subq KKK, %rax
  1495. movq %rax, BI // Index for BO
  1496. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  1497. leaq (BO, BI, SIZE), BO
  1498. salq $3, %rax // rax = rax * 8 ; number of values
  1499. leaq (AO, %rax, SIZE), AO
  1500. #endif
  1501. #if defined(TRMMKERNEL) && defined(LEFT)
  1502. addq $8, KK
  1503. #endif
  1504. addq $8 * SIZE, CO1 # coffset += 8
  1505. ALIGN_4
  1506. /**************************************************************************/
  1507. .L2_21pre:
  1508. testq $4, M
  1509. jz .L2_30
  1510. ALIGN_4
  1511. .L2_21:
  1512. #if !defined(TRMMKERNEL) || \
  1513. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1514. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1515. leaq BUFFER1, BO // first buffer to BO
  1516. addq $4 * SIZE, BO
  1517. #else
  1518. movq KK, %rax
  1519. leaq BUFFER1, BO // first buffer to BO
  1520. addq $4 * SIZE, BO
  1521. movq %rax, BI // Index for BO
  1522. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  1523. leaq (BO, BI, SIZE), BO
  1524. salq $2, %rax // rax = rax * 4 ; number of values
  1525. leaq (AO, %rax, SIZE), AO
  1526. #endif
  1527. vzeroall
  1528. #ifndef TRMMKERNEL
  1529. movq K, %rax
  1530. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1531. movq K, %rax
  1532. subq KK, %rax
  1533. movq %rax, KKK
  1534. #else
  1535. movq KK, %rax
  1536. #ifdef LEFT
1537. addq $4, %rax // number of values in AO
  1538. #else
  1539. addq $2, %rax // number of values in BO
  1540. #endif
  1541. movq %rax, KKK
  1542. #endif
  1543. andq $-8, %rax
  1544. je .L2_26
  1545. movq %rax, BI // Index for BO
1546. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  1547. salq $2, %rax // rax = rax * 4 ; number of values
  1548. leaq (AO, %rax, SIZE), AO
  1549. leaq (BO, BI, SIZE), BO
  1550. negq BI
  1551. negq %rax
  1552. ALIGN_4
  1553. .L2_22:
  1554. KERNEL4x2_SUB
  1555. KERNEL4x2_SUB
  1556. KERNEL4x2_SUB
  1557. KERNEL4x2_SUB
  1558. KERNEL4x2_SUB
  1559. KERNEL4x2_SUB
  1560. KERNEL4x2_SUB
  1561. KERNEL4x2_SUB
  1562. je .L2_26
  1563. KERNEL4x2_SUB
  1564. KERNEL4x2_SUB
  1565. KERNEL4x2_SUB
  1566. KERNEL4x2_SUB
  1567. KERNEL4x2_SUB
  1568. KERNEL4x2_SUB
  1569. KERNEL4x2_SUB
  1570. KERNEL4x2_SUB
  1571. je .L2_26
  1572. jmp .L2_22
  1573. ALIGN_4
  1574. .L2_26:
  1575. #ifndef TRMMKERNEL
  1576. movq K, %rax
  1577. #else
  1578. movq KKK, %rax
  1579. #endif
1580. andq $7, %rax # if (k & 7)
  1581. je .L2_29
  1582. movq %rax, BI // Index for BO
  1583. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  1584. salq $2, %rax // rax = rax * 4 ; number of values
  1585. leaq (AO, %rax, SIZE), AO
  1586. leaq (BO, BI, SIZE), BO
  1587. negq BI
  1588. negq %rax
  1589. ALIGN_4
  1590. .L2_27:
  1591. KERNEL4x2_SUB
  1592. jl .L2_27
  1593. ALIGN_4
  1594. .L2_29:
  1595. SAVE4x2
  1596. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1597. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1598. movq K, %rax
  1599. subq KKK, %rax
  1600. movq %rax, BI // Index for BO
  1601. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  1602. leaq (BO, BI, SIZE), BO
  1603. salq $2, %rax // rax = rax * 4 ; number of values
  1604. leaq (AO, %rax, SIZE), AO
  1605. #endif
  1606. #if defined(TRMMKERNEL) && defined(LEFT)
  1607. addq $4, KK
  1608. #endif
  1609. addq $4 * SIZE, CO1 # coffset += 4
  1610. ALIGN_4
  1611. .L2_30:
  1612. testq $2, M
  1613. jz .L2_40
  1614. ALIGN_4
  1615. .L2_31:
  1616. #if !defined(TRMMKERNEL) || \
  1617. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1618. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1619. leaq BUFFER1, BO // first buffer to BO
  1620. addq $4 * SIZE, BO
  1621. #else
  1622. movq KK, %rax
  1623. leaq BUFFER1, BO // first buffer to BO
  1624. addq $4 * SIZE, BO
  1625. movq %rax, BI // Index for BO
  1626. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  1627. leaq (BO, BI, SIZE), BO
  1628. salq $1, %rax // rax = rax * 2 ; number of values
  1629. leaq (AO, %rax, SIZE), AO
  1630. #endif
  1631. vzeroall
  1632. #ifndef TRMMKERNEL
  1633. movq K, %rax
  1634. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1635. movq K, %rax
  1636. subq KK, %rax
  1637. movq %rax, KKK
  1638. #else
  1639. movq KK, %rax
  1640. #ifdef LEFT
  1641. addq $2, %rax // number of values in AO
  1642. #else
  1643. addq $2, %rax // number of values in BO
  1644. #endif
  1645. movq %rax, KKK
  1646. #endif
  1647. andq $-8, %rax
  1648. je .L2_36
  1649. movq %rax, BI // Index for BO
  1650. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
1651. salq $1, %rax // rax = rax * 2 ; number of values
  1652. leaq (AO, %rax, SIZE), AO
  1653. leaq (BO, BI, SIZE), BO
  1654. negq BI
  1655. negq %rax
  1656. ALIGN_4
  1657. .L2_32:
  1658. KERNEL2x2_SUB
  1659. KERNEL2x2_SUB
  1660. KERNEL2x2_SUB
  1661. KERNEL2x2_SUB
  1662. KERNEL2x2_SUB
  1663. KERNEL2x2_SUB
  1664. KERNEL2x2_SUB
  1665. KERNEL2x2_SUB
  1666. je .L2_36
  1667. KERNEL2x2_SUB
  1668. KERNEL2x2_SUB
  1669. KERNEL2x2_SUB
  1670. KERNEL2x2_SUB
  1671. KERNEL2x2_SUB
  1672. KERNEL2x2_SUB
  1673. KERNEL2x2_SUB
  1674. KERNEL2x2_SUB
  1675. je .L2_36
  1676. jmp .L2_32
  1677. ALIGN_4
  1678. .L2_36:
  1679. #ifndef TRMMKERNEL
  1680. movq K, %rax
  1681. #else
  1682. movq KKK, %rax
  1683. #endif
1684. andq $7, %rax # if (k & 7)
  1685. je .L2_39
  1686. movq %rax, BI // Index for BO
  1687. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
1688. salq $1, %rax // rax = rax * 2 ; number of values
  1689. leaq (AO, %rax, SIZE), AO
  1690. leaq (BO, BI, SIZE), BO
  1691. negq BI
  1692. negq %rax
  1693. ALIGN_4
  1694. .L2_37:
  1695. KERNEL2x2_SUB
  1696. jl .L2_37
  1697. ALIGN_4
  1698. .L2_39:
  1699. SAVE2x2
  1700. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1701. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1702. movq K, %rax
  1703. subq KKK, %rax
  1704. movq %rax, BI // Index for BO
  1705. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  1706. leaq (BO, BI, SIZE), BO
  1707. salq $1, %rax // rax = rax * 2 ; number of values
  1708. leaq (AO, %rax, SIZE), AO
  1709. #endif
  1710. #if defined(TRMMKERNEL) && defined(LEFT)
  1711. addq $2, KK
  1712. #endif
  1713. addq $2 * SIZE, CO1 # coffset += 2
  1714. ALIGN_4
  1715. .L2_40:
  1716. testq $1, M
  1717. jz .L2_60 // to next 2 lines of N
  1718. ALIGN_4
  1719. .L2_41:
  1720. #if !defined(TRMMKERNEL) || \
  1721. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1722. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1723. leaq BUFFER1, BO // first buffer to BO
  1724. addq $4 * SIZE, BO
  1725. #else
  1726. movq KK, %rax
  1727. leaq BUFFER1, BO // first buffer to BO
  1728. addq $4 * SIZE, BO
  1729. movq %rax, BI // Index for BO
  1730. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  1731. leaq (BO, BI, SIZE), BO
  1732. leaq (AO, %rax, SIZE), AO
  1733. #endif
  1734. vzeroall
  1735. #ifndef TRMMKERNEL
  1736. movq K, %rax
  1737. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1738. movq K, %rax
  1739. subq KK, %rax
  1740. movq %rax, KKK
  1741. #else
  1742. movq KK, %rax
  1743. #ifdef LEFT
  1744. addq $1, %rax // number of values in AO
  1745. #else
  1746. addq $2, %rax // number of values in BO
  1747. #endif
  1748. movq %rax, KKK
  1749. #endif
  1750. andq $-8, %rax
  1751. je .L2_46
  1752. movq %rax, BI // Index for BO
  1753. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  1754. leaq (AO, %rax, SIZE), AO
  1755. leaq (BO, BI, SIZE), BO
  1756. negq BI
  1757. negq %rax
  1758. ALIGN_4
  1759. .L2_42:
  1760. KERNEL1x2_SUB
  1761. KERNEL1x2_SUB
  1762. KERNEL1x2_SUB
  1763. KERNEL1x2_SUB
  1764. KERNEL1x2_SUB
  1765. KERNEL1x2_SUB
  1766. KERNEL1x2_SUB
  1767. KERNEL1x2_SUB
  1768. je .L2_46
  1769. KERNEL1x2_SUB
  1770. KERNEL1x2_SUB
  1771. KERNEL1x2_SUB
  1772. KERNEL1x2_SUB
  1773. KERNEL1x2_SUB
  1774. KERNEL1x2_SUB
  1775. KERNEL1x2_SUB
  1776. KERNEL1x2_SUB
  1777. je .L2_46
  1778. jmp .L2_42
  1779. ALIGN_4
  1780. .L2_46:
  1781. #ifndef TRMMKERNEL
  1782. movq K, %rax
  1783. #else
  1784. movq KKK, %rax
  1785. #endif
1786. andq $7, %rax # if (k & 7)
  1787. je .L2_49
  1788. movq %rax, BI // Index for BO
  1789. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  1790. leaq (AO, %rax, SIZE), AO
  1791. leaq (BO, BI, SIZE), BO
  1792. negq BI
  1793. negq %rax
  1794. ALIGN_4
  1795. .L2_47:
  1796. KERNEL1x2_SUB
  1797. jl .L2_47
  1798. ALIGN_4
  1799. .L2_49:
  1800. SAVE1x2
  1801. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1802. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1803. movq K, %rax
  1804. subq KKK, %rax
  1805. movq %rax, BI // Index for BO
  1806. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  1807. leaq (BO, BI, SIZE), BO
  1808. leaq (AO, %rax, SIZE), AO
  1809. #endif
  1810. #if defined(TRMMKERNEL) && defined(LEFT)
  1811. addq $1, KK
  1812. #endif
  1813. addq $1 * SIZE, CO1 # coffset += 1
  1814. ALIGN_4
  1815. .L2_60:
  1816. #if defined(TRMMKERNEL) && !defined(LEFT)
  1817. addq $2, KK
  1818. #endif
  1819. .L1_0:
  1820. /************************************************************************************************
  1821. * Loop for Nmod6 % 2 > 0
  1822. *************************************************************************************************/
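// Handles the final single column of B when N is odd (Nmod6 & 1). The column
// is packed into BUFFER1 below, then the M tiling (16/8/4/2/1 rows) is
// repeated with the Nx1 kernels and C advances by one column per pass.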
  1823. movq Nmod6, J
  1824. andq $1, J // j % 2
  1825. je .L999
  1826. ALIGN_4
  1827. .L1_01:
  1828. // copy to sub buffer
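// Copy one column of B (K single-precision values) into BUFFER1 element by
// element, so the Nx1 kernels below read B with a simple unit stride.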
  1829. movq B, BO1
  1830. leaq BUFFER1, BO // first buffer to BO
  1831. movq K, %rax
  1832. ALIGN_4
  1833. .L1_02b:
  1834. vmovss (BO1), %xmm0
  1835. vmovss %xmm0, (BO)
  1836. addq $1*SIZE,BO1
  1837. addq $1*SIZE,BO
  1838. decq %rax
  1839. jnz .L1_02b
  1840. .L1_02c:
  1841. movq BO1, B // next offset of B
  1842. .L1_10:
  1843. movq C, CO1
  1844. leaq (C, LDC, 1), C // c += 1 * ldc
  1845. #if defined(TRMMKERNEL) && defined(LEFT)
  1846. movq OFFSET, %rax
  1847. movq %rax, KK
  1848. #endif
  1849. movq A, AO // aoffset = a
  1850. addq $16 * SIZE, AO
  1851. movq M, I
  1852. sarq $4, I // i = (m >> 4)
  1853. je .L1_20
  1854. ALIGN_4
  1855. .L1_11:
  1856. #if !defined(TRMMKERNEL) || \
  1857. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1858. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1859. leaq BUFFER1, BO // first buffer to BO
  1860. addq $4 * SIZE, BO
  1861. #else
  1862. movq KK, %rax
  1863. leaq BUFFER1, BO // first buffer to BO
  1864. addq $4 * SIZE, BO
  1865. movq %rax, BI // Index for BO
  1866. leaq (BO, BI, SIZE), BO
  1867. salq $4, %rax // rax = rax * 16 ; number of values
  1868. leaq (AO, %rax, SIZE), AO
  1869. #endif
  1870. vzeroall
  1871. #ifndef TRMMKERNEL
  1872. movq K, %rax
  1873. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1874. movq K, %rax
  1875. subq KK, %rax
  1876. movq %rax, KKK
  1877. #else
  1878. movq KK, %rax
  1879. #ifdef LEFT
  1880. addq $16, %rax // number of values in AO
  1881. #else
  1882. addq $1, %rax // number of values in BO
  1883. #endif
  1884. movq %rax, KKK
  1885. #endif
  1886. andq $-8, %rax // K = K - ( K % 8 )
  1887. je .L1_16
  1888. movq %rax, BI // Index for BO
  1889. salq $4, %rax // rax = rax * 16 ; number of values
  1890. leaq (AO, %rax, SIZE), AO
  1891. leaq (BO, BI, SIZE), BO
  1892. negq BI
  1893. negq %rax
  1894. ALIGN_4
  1895. .L1_12:
  1896. KERNEL16x1_SUB
  1897. KERNEL16x1_SUB
  1898. KERNEL16x1_SUB
  1899. KERNEL16x1_SUB
  1900. KERNEL16x1_SUB
  1901. KERNEL16x1_SUB
  1902. KERNEL16x1_SUB
  1903. KERNEL16x1_SUB
  1904. je .L1_16
  1905. KERNEL16x1_SUB
  1906. KERNEL16x1_SUB
  1907. KERNEL16x1_SUB
  1908. KERNEL16x1_SUB
  1909. KERNEL16x1_SUB
  1910. KERNEL16x1_SUB
  1911. KERNEL16x1_SUB
  1912. KERNEL16x1_SUB
  1913. je .L1_16
  1914. jmp .L1_12
  1915. ALIGN_4
  1916. .L1_16:
  1917. #ifndef TRMMKERNEL
  1918. movq K, %rax
  1919. #else
  1920. movq KKK, %rax
  1921. #endif
1922. andq $7, %rax # if (k & 7)
  1923. je .L1_19
  1924. movq %rax, BI // Index for BO
  1925. salq $4, %rax // rax = rax * 16 ; number of values
  1926. leaq (AO, %rax, SIZE), AO
  1927. leaq (BO, BI, SIZE), BO
  1928. negq BI
  1929. negq %rax
  1930. ALIGN_4
  1931. .L1_17:
  1932. KERNEL16x1_SUB
  1933. jl .L1_17
  1934. ALIGN_4
  1935. .L1_19:
  1936. SAVE16x1
  1937. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1938. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1939. movq K, %rax
  1940. subq KKK, %rax
  1941. movq %rax, BI // Index for BO
  1942. leaq (BO, BI, SIZE), BO
  1943. salq $4, %rax // rax = rax * 16 ; number of values
  1944. leaq (AO, %rax, SIZE), AO
  1945. #endif
  1946. #if defined(TRMMKERNEL) && defined(LEFT)
  1947. addq $16, KK
  1948. #endif
  1949. addq $16 * SIZE, CO1 # coffset += 16
  1950. decq I # i --
  1951. jg .L1_11
  1952. ALIGN_4
  1953. /**************************************************************************
  1954. * Rest of M
  1955. ***************************************************************************/
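// Same remainder handling as in the two-column section: 8, 4, 2 and 1 rows,
// now using the single-column (Nx1) kernels.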
  1956. .L1_20:
  1957. // Test rest of M
  1958. testq $15, M
  1959. jz .L999
  1960. testq $8, M
  1961. jz .L1_21pre
  1962. ALIGN_4
  1963. /**************************************************************************/
  1964. .L1_20_1:
  1965. #if !defined(TRMMKERNEL) || \
  1966. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1967. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1968. leaq BUFFER1, BO // first buffer to BO
  1969. addq $4 * SIZE, BO
  1970. #else
  1971. movq KK, %rax
  1972. leaq BUFFER1, BO // first buffer to BO
  1973. addq $4 * SIZE, BO
  1974. movq %rax, BI // Index for BO
  1975. leaq (BO, BI, SIZE), BO
  1976. salq $3, %rax // rax = rax * 8 ; number of values
  1977. leaq (AO, %rax, SIZE), AO
  1978. #endif
  1979. vzeroall
  1980. #ifndef TRMMKERNEL
  1981. movq K, %rax
  1982. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1983. movq K, %rax
  1984. subq KK, %rax
  1985. movq %rax, KKK
  1986. #else
  1987. movq KK, %rax
  1988. #ifdef LEFT
1989. addq $8, %rax // number of values in AO
  1990. #else
  1991. addq $1, %rax // number of values in BO
  1992. #endif
  1993. movq %rax, KKK
  1994. #endif
  1995. andq $-8, %rax
  1996. je .L1_20_6
  1997. movq %rax, BI // Index for BO
  1998. salq $3, %rax // rax = rax * 8 ; number of values
  1999. leaq (AO, %rax, SIZE), AO
  2000. leaq (BO, BI, SIZE), BO
  2001. negq BI
  2002. negq %rax
  2003. ALIGN_4
  2004. .L1_20_2:
  2005. KERNEL8x1_SUB
  2006. KERNEL8x1_SUB
  2007. KERNEL8x1_SUB
  2008. KERNEL8x1_SUB
  2009. KERNEL8x1_SUB
  2010. KERNEL8x1_SUB
  2011. KERNEL8x1_SUB
  2012. KERNEL8x1_SUB
  2013. je .L1_20_6
  2014. KERNEL8x1_SUB
  2015. KERNEL8x1_SUB
  2016. KERNEL8x1_SUB
  2017. KERNEL8x1_SUB
  2018. KERNEL8x1_SUB
  2019. KERNEL8x1_SUB
  2020. KERNEL8x1_SUB
  2021. KERNEL8x1_SUB
  2022. je .L1_20_6
  2023. jmp .L1_20_2
  2024. ALIGN_4
  2025. .L1_20_6:
  2026. #ifndef TRMMKERNEL
  2027. movq K, %rax
  2028. #else
  2029. movq KKK, %rax
  2030. #endif
2031. andq $7, %rax # if (k & 7)
  2032. je .L1_20_9
  2033. movq %rax, BI // Index for BO
  2034. salq $3, %rax // rax = rax * 8 ; number of values
  2035. leaq (AO, %rax, SIZE), AO
  2036. leaq (BO, BI, SIZE), BO
  2037. negq BI
  2038. negq %rax
  2039. ALIGN_4
  2040. .L1_20_7:
  2041. KERNEL8x1_SUB
  2042. jl .L1_20_7
  2043. ALIGN_4
  2044. .L1_20_9:
  2045. SAVE8x1
  2046. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2047. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2048. movq K, %rax
  2049. subq KKK, %rax
  2050. movq %rax, BI // Index for BO
  2051. leaq (BO, BI, SIZE), BO
  2052. salq $3, %rax // rax = rax * 8 ; number of values
  2053. leaq (AO, %rax, SIZE), AO
  2054. #endif
  2055. #if defined(TRMMKERNEL) && defined(LEFT)
  2056. addq $8, KK
  2057. #endif
  2058. addq $8 * SIZE, CO1 # coffset += 8
  2059. ALIGN_4
  2060. /**************************************************************************/
  2061. .L1_21pre:
  2062. testq $4, M
  2063. jz .L1_30
  2064. ALIGN_4
  2065. .L1_21:
  2066. #if !defined(TRMMKERNEL) || \
  2067. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2068. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2069. leaq BUFFER1, BO // first buffer to BO
  2070. addq $4 * SIZE, BO
  2071. #else
  2072. movq KK, %rax
  2073. leaq BUFFER1, BO // first buffer to BO
  2074. addq $4 * SIZE, BO
  2075. movq %rax, BI // Index for BO
  2076. leaq (BO, BI, SIZE), BO
  2077. salq $2, %rax // rax = rax * 4 ; number of values
  2078. leaq (AO, %rax, SIZE), AO
  2079. #endif
  2080. vzeroall
  2081. #ifndef TRMMKERNEL
  2082. movq K, %rax
  2083. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2084. movq K, %rax
  2085. subq KK, %rax
  2086. movq %rax, KKK
  2087. #else
  2088. movq KK, %rax
  2089. #ifdef LEFT
2090. addq $4, %rax // number of values in AO
  2091. #else
  2092. addq $1, %rax // number of values in BO
  2093. #endif
  2094. movq %rax, KKK
  2095. #endif
  2096. andq $-8, %rax
  2097. je .L1_26
  2098. movq %rax, BI // Index for BO
  2099. salq $2, %rax // rax = rax * 4 ; number of values
  2100. leaq (AO, %rax, SIZE), AO
  2101. leaq (BO, BI, SIZE), BO
  2102. negq BI
  2103. negq %rax
  2104. ALIGN_4
  2105. .L1_22:
  2106. KERNEL4x1_SUB
  2107. KERNEL4x1_SUB
  2108. KERNEL4x1_SUB
  2109. KERNEL4x1_SUB
  2110. KERNEL4x1_SUB
  2111. KERNEL4x1_SUB
  2112. KERNEL4x1_SUB
  2113. KERNEL4x1_SUB
  2114. je .L1_26
  2115. KERNEL4x1_SUB
  2116. KERNEL4x1_SUB
  2117. KERNEL4x1_SUB
  2118. KERNEL4x1_SUB
  2119. KERNEL4x1_SUB
  2120. KERNEL4x1_SUB
  2121. KERNEL4x1_SUB
  2122. KERNEL4x1_SUB
  2123. je .L1_26
  2124. jmp .L1_22
  2125. ALIGN_4
  2126. .L1_26:
  2127. #ifndef TRMMKERNEL
  2128. movq K, %rax
  2129. #else
  2130. movq KKK, %rax
  2131. #endif
2132. andq $7, %rax # if (k & 7)
  2133. je .L1_29
  2134. movq %rax, BI // Index for BO
  2135. salq $2, %rax // rax = rax * 4 ; number of values
  2136. leaq (AO, %rax, SIZE), AO
  2137. leaq (BO, BI, SIZE), BO
  2138. negq BI
  2139. negq %rax
  2140. ALIGN_4
  2141. .L1_27:
  2142. KERNEL4x1_SUB
  2143. jl .L1_27
  2144. ALIGN_4
  2145. .L1_29:
  2146. SAVE4x1
  2147. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2148. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2149. movq K, %rax
  2150. subq KKK, %rax
  2151. movq %rax, BI // Index for BO
  2152. leaq (BO, BI, SIZE), BO
  2153. salq $2, %rax // rax = rax * 4 ; number of values
  2154. leaq (AO, %rax, SIZE), AO
  2155. #endif
  2156. #if defined(TRMMKERNEL) && defined(LEFT)
  2157. addq $4, KK
  2158. #endif
  2159. addq $4 * SIZE, CO1 # coffset += 4
  2160. ALIGN_4
  2161. .L1_30:
  2162. testq $2, M
  2163. jz .L1_40
  2164. ALIGN_4
  2165. .L1_31:
  2166. #if !defined(TRMMKERNEL) || \
  2167. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2168. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2169. leaq BUFFER1, BO // first buffer to BO
  2170. addq $4 * SIZE, BO
  2171. #else
  2172. movq KK, %rax
  2173. leaq BUFFER1, BO // first buffer to BO
  2174. addq $4 * SIZE, BO
  2175. movq %rax, BI // Index for BO
  2176. leaq (BO, BI, SIZE), BO
  2177. salq $1, %rax // rax = rax * 2 ; number of values
  2178. leaq (AO, %rax, SIZE), AO
  2179. #endif
  2180. vzeroall
  2181. #ifndef TRMMKERNEL
  2182. movq K, %rax
  2183. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2184. movq K, %rax
  2185. subq KK, %rax
  2186. movq %rax, KKK
  2187. #else
  2188. movq KK, %rax
  2189. #ifdef LEFT
  2190. addq $2, %rax // number of values in AO
  2191. #else
  2192. addq $1, %rax // number of values in BO
  2193. #endif
  2194. movq %rax, KKK
  2195. #endif
  2196. andq $-8, %rax
  2197. je .L1_36
  2198. movq %rax, BI // Index for BO
2199. salq $1, %rax // rax = rax * 2 ; number of values
  2200. leaq (AO, %rax, SIZE), AO
  2201. leaq (BO, BI, SIZE), BO
  2202. negq BI
  2203. negq %rax
  2204. ALIGN_4
  2205. .L1_32:
  2206. KERNEL2x1_SUB
  2207. KERNEL2x1_SUB
  2208. KERNEL2x1_SUB
  2209. KERNEL2x1_SUB
  2210. KERNEL2x1_SUB
  2211. KERNEL2x1_SUB
  2212. KERNEL2x1_SUB
  2213. KERNEL2x1_SUB
  2214. je .L1_36
  2215. KERNEL2x1_SUB
  2216. KERNEL2x1_SUB
  2217. KERNEL2x1_SUB
  2218. KERNEL2x1_SUB
  2219. KERNEL2x1_SUB
  2220. KERNEL2x1_SUB
  2221. KERNEL2x1_SUB
  2222. KERNEL2x1_SUB
  2223. je .L1_36
  2224. jmp .L1_32
  2225. ALIGN_4
  2226. .L1_36:
  2227. #ifndef TRMMKERNEL
  2228. movq K, %rax
  2229. #else
  2230. movq KKK, %rax
  2231. #endif
2232. andq $7, %rax # if (k & 7)
  2233. je .L1_39
  2234. movq %rax, BI // Index for BO
2235. salq $1, %rax // rax = rax * 2 ; number of values
  2236. leaq (AO, %rax, SIZE), AO
  2237. leaq (BO, BI, SIZE), BO
  2238. negq BI
  2239. negq %rax
  2240. ALIGN_4
  2241. .L1_37:
  2242. KERNEL2x1_SUB
  2243. jl .L1_37
  2244. ALIGN_4
  2245. .L1_39:
  2246. SAVE2x1
  2247. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2248. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2249. movq K, %rax
  2250. subq KKK, %rax
  2251. movq %rax, BI // Index for BO
  2252. leaq (BO, BI, SIZE), BO
  2253. salq $1, %rax // rax = rax * 2 ; number of values
  2254. leaq (AO, %rax, SIZE), AO
  2255. #endif
  2256. #if defined(TRMMKERNEL) && defined(LEFT)
  2257. addq $2, KK
  2258. #endif
  2259. addq $2 * SIZE, CO1 # coffset += 2
  2260. ALIGN_4
  2261. .L1_40:
  2262. testq $1, M
  2263. jz .L999
  2264. ALIGN_4
  2265. .L1_41:
  2266. #if !defined(TRMMKERNEL) || \
  2267. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2268. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2269. leaq BUFFER1, BO // first buffer to BO
  2270. addq $4 * SIZE, BO
  2271. #else
  2272. movq KK, %rax
  2273. leaq BUFFER1, BO // first buffer to BO
  2274. addq $4 * SIZE, BO
  2275. movq %rax, BI // Index for BO
  2276. leaq (BO, BI, SIZE), BO
  2277. leaq (AO, %rax, SIZE), AO
  2278. #endif
  2279. vzeroall
  2280. #ifndef TRMMKERNEL
  2281. movq K, %rax
  2282. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2283. movq K, %rax
  2284. subq KK, %rax
  2285. movq %rax, KKK
  2286. #else
  2287. movq KK, %rax
  2288. #ifdef LEFT
  2289. addq $1, %rax // number of values in AO
  2290. #else
  2291. addq $1, %rax // number of values in BO
  2292. #endif
  2293. movq %rax, KKK
  2294. #endif
  2295. andq $-8, %rax
  2296. je .L1_46
  2297. movq %rax, BI // Index for BO
  2298. leaq (AO, %rax, SIZE), AO
  2299. leaq (BO, BI, SIZE), BO
  2300. negq BI
  2301. negq %rax
  2302. ALIGN_4
  2303. .L1_42:
  2304. KERNEL1x1_SUB
  2305. KERNEL1x1_SUB
  2306. KERNEL1x1_SUB
  2307. KERNEL1x1_SUB
  2308. KERNEL1x1_SUB
  2309. KERNEL1x1_SUB
  2310. KERNEL1x1_SUB
  2311. KERNEL1x1_SUB
  2312. je .L1_46
  2313. KERNEL1x1_SUB
  2314. KERNEL1x1_SUB
  2315. KERNEL1x1_SUB
  2316. KERNEL1x1_SUB
  2317. KERNEL1x1_SUB
  2318. KERNEL1x1_SUB
  2319. KERNEL1x1_SUB
  2320. KERNEL1x1_SUB
  2321. je .L1_46
  2322. jmp .L1_42
  2323. ALIGN_4
  2324. .L1_46:
  2325. #ifndef TRMMKERNEL
  2326. movq K, %rax
  2327. #else
  2328. movq KKK, %rax
  2329. #endif
2330. andq $7, %rax # if (k & 7)
  2331. je .L1_49
  2332. movq %rax, BI // Index for BO
  2333. leaq (AO, %rax, SIZE), AO
  2334. leaq (BO, BI, SIZE), BO
  2335. negq BI
  2336. negq %rax
  2337. ALIGN_4
  2338. .L1_47:
  2339. KERNEL1x1_SUB
  2340. jl .L1_47
  2341. ALIGN_4
  2342. .L1_49:
  2343. SAVE1x1
  2344. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2345. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2346. movq K, %rax
  2347. subq KKK, %rax
  2348. movq %rax, BI // Index for BO
  2349. leaq (BO, BI, SIZE), BO
  2350. leaq (AO, %rax, SIZE), AO
  2351. #endif
  2352. #if defined(TRMMKERNEL) && defined(LEFT)
  2353. addq $1, KK
  2354. #endif
  2355. addq $1 * SIZE, CO1 # coffset += 1
  2356. ALIGN_4
  2357. .L999:
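// Epilogue: restore %rsp from the SP value saved in the prologue, reload the
// callee-saved general-purpose registers, and under WINDOWS_ABI also reload
// %rdi/%rsi and xmm6-xmm15 before releasing the stack frame.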
  2358. movq SP, %rsp
  2359. movq (%rsp), %rbx
  2360. movq 8(%rsp), %rbp
  2361. movq 16(%rsp), %r12
  2362. movq 24(%rsp), %r13
  2363. movq 32(%rsp), %r14
  2364. movq 40(%rsp), %r15
  2365. #ifdef WINDOWS_ABI
  2366. movq 48(%rsp), %rdi
  2367. movq 56(%rsp), %rsi
  2368. movups 64(%rsp), %xmm6
  2369. movups 80(%rsp), %xmm7
  2370. movups 96(%rsp), %xmm8
  2371. movups 112(%rsp), %xmm9
  2372. movups 128(%rsp), %xmm10
  2373. movups 144(%rsp), %xmm11
  2374. movups 160(%rsp), %xmm12
  2375. movups 176(%rsp), %xmm13
  2376. movups 192(%rsp), %xmm14
  2377. movups 208(%rsp), %xmm15
  2378. #endif
  2379. addq $STACKSIZE, %rsp
  2380. ret
  2381. EPILOGUE