
dgemm_kernel_8x4.S (67 kB)

  1. /*******************************************************************************
  2. Copyright (c) 2021, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. /* Function parameters */
  30. #define M $r4 // param 1: bm
  31. #define N $r5 // param 2: bn
  32. #define K $r6 // param 3: bk
  33. #define ALPHA $f0 // param 4: alpha
  34. #define A $r7 // param 5: ba
  35. #define B $r8 // param 6: bb
  36. #define C $r9 // param 7: bc
  37. #define LDC $r10 // param 8: ldc
  38. #ifdef TRMMKERNEL
  39. #define OFFSET $r11 // param 9: offset
  40. #endif
  41. #define OFF $r12
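/* A hedged sketch of the C entry point suggested by the parameter
 * comments above, assuming the usual OpenBLAS micro-kernel interface
 * (BLASLONG written as long; the name dgemm_kernel is illustrative):
 *
 *   int dgemm_kernel(long bm, long bn, long bk, double alpha,
 *                    double *ba, double *bb, double *bc, long ldc);
 *   // TRMMKERNEL builds take one extra trailing argument: long offset
 */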
  42. /* Cycle control parameters */
  43. #define I $r13
  44. #define J $r14
  45. #define L $r15
  46. #define TL $r16
  47. /* Matrix address */
  48. #define A0 $r17
  49. #define B0 $r18
  50. #define C0 $r19
  51. #define C1 $r20
  52. #define C2 $r23
  53. #define C3 $r24
  54. #define T0 $r25 /* !! DO NOT USE $r21 and $r22 !! */
  55. #define T1 $r26
  56. #define T2 $r27
  57. #define ZERO $r0
  58. /* LSX vectors */
  59. #define U0 $vr0
  60. #define U1 $vr1
  61. #define U2 $vr2
  62. #define U3 $vr3
  63. #define U4 $vr4
  64. #define U5 $vr5
  65. #define U6 $vr6
  66. #define U7 $vr7
  67. #define U8 $vr8
  68. #define U9 $vr9
  69. #define U10 $vr10
  70. #define U11 $vr11
  71. #define U12 $vr12
  72. #define U13 $vr13
  73. #define U14 $vr14
  74. #define U15 $vr15
  75. #define D0 $vr16
  76. #define D1 $vr17
  77. #define D2 $vr18
  78. #define D3 $vr19
  79. #define D4 $vr20
  80. #define D5 $vr21
  81. #define D6 $vr22
  82. #define D7 $vr23
  83. #define D8 $vr24
  84. #define D9 $vr25
  85. #define D10 $vr26
  86. #define D11 $vr27
  87. #define D12 $vr28
  88. #define D13 $vr29
  89. #define D14 $vr30
  90. #define D15 $vr31
  91. #define VALPHA $vr15
  92. /* Prefetch interval */
  93. #define A_PRE 0x200
  94. #define B_PRE 0x100
  95. .macro KERNEL2x8x4
  96. vld U0, A0, 0x00
  97. vfmadd.d D0, U8, U12, D0
  98. vfmadd.d D1, U9, U12, D1
  99. vld U1, A0, 0x10
  100. vfmadd.d D2, U10, U12, D2
  101. vfmadd.d D3, U11, U12, D3
  102. vld U2, A0, 0x20
  103. vfmadd.d D4, U8, U13, D4
  104. vfmadd.d D5, U9, U13, D5
  105. vld U3, A0, 0x30
  106. vfmadd.d D6, U10, U13, D6
  107. vfmadd.d D7, U11, U13, D7
  108. vldrepl.d U4, B0, 0x00
  109. vfmadd.d D8, U8, U14, D8
  110. vfmadd.d D9, U9, U14, D9
  111. preld 0, B0, B_PRE
  112. vldrepl.d U5, B0, 0x08
  113. vfmadd.d D10, U10, U14, D10
  114. vfmadd.d D11, U11, U14, D11
  115. preld 0, A0, A_PRE
  116. vldrepl.d U6, B0, 0x10
  117. vfmadd.d D12, U8, U15, D12
  118. vfmadd.d D13, U9, U15, D13
  119. preld 0, A0, A_PRE + 0x40
  120. vldrepl.d U7, B0, 0x18
  121. vfmadd.d D14, U10, U15, D14
  122. vfmadd.d D15, U11, U15, D15
  123. addi.d A0, A0, 0x40
  124. addi.d B0, B0, 0x20
  125. vld U8, A0, 0x00
  126. vfmadd.d D0, U0, U4, D0
  127. vfmadd.d D1, U1, U4, D1
  128. vld U9, A0, 0x10
  129. vfmadd.d D2, U2, U4, D2
  130. vfmadd.d D3, U3, U4, D3
  131. vld U10, A0, 0x20
  132. vfmadd.d D4, U0, U5, D4
  133. vfmadd.d D5, U1, U5, D5
  134. vld U11, A0, 0x30
  135. vfmadd.d D6, U2, U5, D6
  136. vfmadd.d D7, U3, U5, D7
  137. vldrepl.d U12, B0, 0x00
  138. vfmadd.d D8, U0, U6, D8
  139. vfmadd.d D9, U1, U6, D9
  140. preld 0, B0, B_PRE
  141. vldrepl.d U13, B0, 0x08
  142. vfmadd.d D10, U2, U6, D10
  143. vfmadd.d D11, U3, U6, D11
  144. preld 0, A0, A_PRE
  145. vldrepl.d U14, B0, 0x10
  146. vfmadd.d D12, U0, U7, D12
  147. vfmadd.d D13, U1, U7, D13
  148. preld 0, A0, A_PRE + 0x40
  149. vldrepl.d U15, B0, 0x18
  150. vfmadd.d D14, U2, U7, D14
  151. vfmadd.d D15, U3, U7, D15
  152. addi.d A0, A0, 0x40
  153. addi.d B0, B0, 0x20
  154. .endm
  155. .macro KERNEL2x8x4_END
  156. vld U0, A0, 0x00
  157. vfmadd.d D0, U8, U12, D0
  158. vfmadd.d D1, U9, U12, D1
  159. vld U1, A0, 0x10
  160. vfmadd.d D2, U10, U12, D2
  161. vfmadd.d D3, U11, U12, D3
  162. vld U2, A0, 0x20
  163. vfmadd.d D4, U8, U13, D4
  164. vfmadd.d D5, U9, U13, D5
  165. vld U3, A0, 0x30
  166. vfmadd.d D6, U10, U13, D6
  167. vfmadd.d D7, U11, U13, D7
  168. vldrepl.d U4, B0, 0x00
  169. vfmadd.d D8, U8, U14, D8
  170. vfmadd.d D9, U9, U14, D9
  171. preld 0, B0, B_PRE
  172. vldrepl.d U5, B0, 0x08
  173. vfmadd.d D10, U10, U14, D10
  174. vfmadd.d D11, U11, U14, D11
  175. preld 0, A0, A_PRE
  176. vldrepl.d U6, B0, 0x10
  177. vfmadd.d D12, U8, U15, D12
  178. vfmadd.d D13, U9, U15, D13
  179. preld 0, A0, A_PRE + 0x40
  180. vldrepl.d U7, B0, 0x18
  181. vfmadd.d D14, U10, U15, D14
  182. vfmadd.d D15, U11, U15, D15
  183. addi.d A0, A0, 0x40
  184. addi.d B0, B0, 0x20
  185. vfmadd.d D0, U0, U4, D0
  186. vfmadd.d D1, U1, U4, D1
  187. vfmadd.d D2, U2, U4, D2
  188. vfmadd.d D3, U3, U4, D3
  189. vfmadd.d D4, U0, U5, D4
  190. vfmadd.d D5, U1, U5, D5
  191. vfmadd.d D6, U2, U5, D6
  192. vfmadd.d D7, U3, U5, D7
  193. vfmadd.d D8, U0, U6, D8
  194. vfmadd.d D9, U1, U6, D9
  195. preld 0, B0, B_PRE
  196. vfmadd.d D10, U2, U6, D10
  197. vfmadd.d D11, U3, U6, D11
  198. preld 0, A0, A_PRE
  199. vfmadd.d D12, U0, U7, D12
  200. vfmadd.d D13, U1, U7, D13
  201. preld 0, A0, A_PRE + 0x40
  202. vfmadd.d D14, U2, U7, D14
  203. vfmadd.d D15, U3, U7, D15
  204. .endm
  205. .macro KERNEL8x8x4
  206. .rept 4
  207. KERNEL2x8x4
  208. .endr
  209. .endm
  210. .macro KERNEL8x8x4_END
  211. .rept 3
  212. KERNEL2x8x4
  213. .endr
  214. KERNEL2x8x4_END
  215. .endm
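/* KERNEL2x8x4 unrolls two k-steps of the 8x4 update and ping-pongs between
 * the U0-U7 and U8-U15 register sets, so the loads for one step overlap the
 * vfmadd.d chain of the other. A hedged scalar model of the arithmetic
 * (assumes the packed panels: A0 advances 8 doubles and B0 advances 4
 * doubles per k-step; acc[i][j] appears to live in element i%2 of
 * register D(4*j + i/2)):
 *
 *   static void kernel_2x8x4_model(const double *a, const double *b,
 *                                  double acc[8][4])
 *   {
 *       for (int k = 0; k < 2; k++)          // two unrolled k-steps
 *           for (int j = 0; j < 4; j++)      // 4 columns of B (U4..U7 / U12..U15)
 *               for (int i = 0; i < 8; i++)  // 8 rows of A (U0..U3 / U8..U11)
 *                   acc[i][j] += a[k * 8 + i] * b[k * 4 + j];
 *   }
 */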
  216. .macro KERNEL2x4x4
  217. vld U0, A0, 0x00
  218. vld U1, A0, 0x10
  219. vldrepl.d U4, B0, 0x00
  220. vfmadd.d D0, U8, U12, D0
  221. vfmadd.d D1, U9, U12, D1
  222. vldrepl.d U5, B0, 0x08
  223. vfmadd.d D4, U8, U13, D4
  224. vfmadd.d D5, U9, U13, D5
  225. vldrepl.d U6, B0, 0x10
  226. vfmadd.d D8, U8, U14, D8
  227. vfmadd.d D9, U9, U14, D9
  228. vldrepl.d U7, B0, 0x18
  229. vfmadd.d D12, U8, U15, D12
  230. vfmadd.d D13, U9, U15, D13
  231. addi.d A0, A0, 0x20
  232. addi.d B0, B0, 0x20
  233. vld U8, A0, 0x00
  234. vld U9, A0, 0x10
  235. vldrepl.d U12, B0, 0x00
  236. vfmadd.d D0, U0, U4, D0
  237. vfmadd.d D1, U1, U4, D1
  238. vldrepl.d U13, B0, 0x08
  239. vfmadd.d D4, U0, U5, D4
  240. vfmadd.d D5, U1, U5, D5
  241. vldrepl.d U14, B0, 0x10
  242. vfmadd.d D8, U0, U6, D8
  243. vfmadd.d D9, U1, U6, D9
  244. vldrepl.d U15, B0, 0x18
  245. vfmadd.d D12, U0, U7, D12
  246. vfmadd.d D13, U1, U7, D13
  247. addi.d A0, A0, 0x20
  248. addi.d B0, B0, 0x20
  249. .endm
  250. .macro KERNEL2x4x4_END
  251. vld U0, A0, 0x00
  252. vld U1, A0, 0x10
  253. vldrepl.d U4, B0, 0x00
  254. vfmadd.d D0, U8, U12, D0
  255. vfmadd.d D1, U9, U12, D1
  256. vldrepl.d U5, B0, 0x08
  257. vfmadd.d D4, U8, U13, D4
  258. vfmadd.d D5, U9, U13, D5
  259. vldrepl.d U6, B0, 0x10
  260. vfmadd.d D8, U8, U14, D8
  261. vfmadd.d D9, U9, U14, D9
  262. vldrepl.d U7, B0, 0x18
  263. vfmadd.d D12, U8, U15, D12
  264. vfmadd.d D13, U9, U15, D13
  265. addi.d A0, A0, 0x20
  266. addi.d B0, B0, 0x20
  267. vfmadd.d D0, U0, U4, D0
  268. vfmadd.d D1, U1, U4, D1
  269. vfmadd.d D4, U0, U5, D4
  270. vfmadd.d D5, U1, U5, D5
  271. vfmadd.d D8, U0, U6, D8
  272. vfmadd.d D9, U1, U6, D9
  273. vfmadd.d D12, U0, U7, D12
  274. vfmadd.d D13, U1, U7, D13
  275. .endm
  276. .macro KERNEL8x4x4
  277. .rept 4
  278. KERNEL2x4x4
  279. .endr
  280. .endm
  281. .macro KERNEL8x4x4_END
  282. .rept 3
  283. KERNEL2x4x4
  284. .endr
  285. KERNEL2x4x4_END
  286. .endm
  287. .macro KERNEL2x2x4
  288. vldrepl.d U0, A0, 0x00
  289. vldrepl.d U1, A0, 0x08
  290. vfmadd.d D0, U8, U12, D0
  291. vfmadd.d D1, U8, U13, D1
  292. vfmadd.d D2, U9, U12, D2
  293. vfmadd.d D3, U9, U13, D3
  294. vld U4, B0, 0x00
  295. vld U5, B0, 0x10
  296. addi.d A0, A0, 0x10
  297. addi.d B0, B0, 0x20
  298. vldrepl.d U8, A0, 0x00
  299. vldrepl.d U9, A0, 0x08
  300. vfmadd.d D0, U0, U4, D0
  301. vfmadd.d D1, U0, U5, D1
  302. vfmadd.d D2, U1, U4, D2
  303. vfmadd.d D3, U1, U5, D3
  304. vld U12, B0, 0x00
  305. vld U13, B0, 0x10
  306. addi.d A0, A0, 0x10
  307. addi.d B0, B0, 0x20
  308. .endm
  309. .macro KERNEL2x2x4_END
  310. vldrepl.d U0, A0, 0x00
  311. vldrepl.d U1, A0, 0x08
  312. vfmadd.d D0, U8, U12, D0
  313. vfmadd.d D1, U8, U13, D1
  314. vfmadd.d D2, U9, U12, D2
  315. vfmadd.d D3, U9, U13, D3
  316. vld U4, B0, 0x00
  317. vld U5, B0, 0x10
  318. addi.d A0, A0, 0x10
  319. addi.d B0, B0, 0x20
  320. vfmadd.d D0, U0, U4, D0
  321. vfmadd.d D1, U0, U5, D1
  322. vfmadd.d D2, U1, U4, D2
  323. vfmadd.d D3, U1, U5, D3
  324. .endm
  325. .macro KERNEL8x2x4
  326. .rept 4
  327. KERNEL2x2x4
  328. .endr
  329. .endm
  330. .macro KERNEL8x2x4_END
  331. .rept 3
  332. KERNEL2x2x4
  333. .endr
  334. KERNEL2x2x4_END
  335. .endm
  336. .macro KERNEL2x1x4
  337. vldrepl.d U0, A0, 0x00
  338. vfmadd.d D0, U8, U12, D0
  339. vfmadd.d D1, U8, U13, D1
  340. vld U4, B0, 0x00
  341. vld U5, B0, 0x10
  342. vldrepl.d U8, A0, 0x08
  343. vfmadd.d D0, U0, U4, D0
  344. vfmadd.d D1, U0, U5, D1
  345. vld U12, B0, 0x20
  346. vld U13, B0, 0x30
  347. addi.d A0, A0, 0x10
  348. addi.d B0, B0, 0x40
  349. .endm
  350. .macro KERNEL2x1x4_END
  351. vldrepl.d U0, A0, 0x00
  352. vfmadd.d D0, U8, U12, D0
  353. vfmadd.d D1, U8, U13, D1
  354. vld U4, B0, 0x00
  355. vld U5, B0, 0x10
  356. addi.d A0, A0, 0x08
  357. addi.d B0, B0, 0x20
  358. vfmadd.d D0, U0, U4, D0
  359. vfmadd.d D1, U0, U5, D1
  360. .endm
  361. .macro KERNEL8x1x4
  362. .rept 4
  363. KERNEL2x1x4
  364. .endr
  365. .endm
  366. .macro KERNEL8x1x4_END
  367. .rept 3
  368. KERNEL2x1x4
  369. .endr
  370. KERNEL2x1x4_END
  371. .endm
  372. .macro KERNEL2x8x2
  373. vld U0, A0, 0x00
  374. vfmadd.d D0, U8, U12, D0
  375. vfmadd.d D1, U9, U12, D1
  376. vld U1, A0, 0x10
  377. vfmadd.d D2, U10, U12, D2
  378. vfmadd.d D3, U11, U12, D3
  379. vld U2, A0, 0x20
  380. vfmadd.d D4, U8, U13, D4
  381. vfmadd.d D5, U9, U13, D5
  382. vld U3, A0, 0x30
  383. vfmadd.d D6, U10, U13, D6
  384. vfmadd.d D7, U11, U13, D7
  385. vldrepl.d U4, B0, 0x00
  386. vldrepl.d U5, B0, 0x08
  387. addi.d A0, A0, 0x40
  388. addi.d B0, B0, 0x10
  389. vld U8, A0, 0x00
  390. vfmadd.d D0, U0, U4, D0
  391. vfmadd.d D1, U1, U4, D1
  392. vld U9, A0, 0x10
  393. vfmadd.d D2, U2, U4, D2
  394. vfmadd.d D3, U3, U4, D3
  395. vld U10, A0, 0x20
  396. vfmadd.d D4, U0, U5, D4
  397. vfmadd.d D5, U1, U5, D5
  398. vld U11, A0, 0x30
  399. vfmadd.d D6, U2, U5, D6
  400. vfmadd.d D7, U3, U5, D7
  401. vldrepl.d U12, B0, 0x00
  402. vldrepl.d U13, B0, 0x08
  403. addi.d A0, A0, 0x40
  404. addi.d B0, B0, 0x10
  405. .endm
  406. .macro KERNEL2x8x2_END
  407. vld U0, A0, 0x00
  408. vfmadd.d D0, U8, U12, D0
  409. vfmadd.d D1, U9, U12, D1
  410. vld U1, A0, 0x10
  411. vfmadd.d D2, U10, U12, D2
  412. vfmadd.d D3, U11, U12, D3
  413. vld U2, A0, 0x20
  414. vfmadd.d D4, U8, U13, D4
  415. vfmadd.d D5, U9, U13, D5
  416. vld U3, A0, 0x30
  417. vfmadd.d D6, U10, U13, D6
  418. vfmadd.d D7, U11, U13, D7
  419. vldrepl.d U4, B0, 0x00
  420. vldrepl.d U5, B0, 0x08
  421. addi.d A0, A0, 0x40
  422. addi.d B0, B0, 0x10
  423. vfmadd.d D0, U0, U4, D0
  424. vfmadd.d D1, U1, U4, D1
  425. vfmadd.d D2, U2, U4, D2
  426. vfmadd.d D3, U3, U4, D3
  427. vfmadd.d D4, U0, U5, D4
  428. vfmadd.d D5, U1, U5, D5
  429. vfmadd.d D6, U2, U5, D6
  430. vfmadd.d D7, U3, U5, D7
  431. .endm
  432. .macro KERNEL8x8x2
  433. .rept 4
  434. KERNEL2x8x2
  435. .endr
  436. .endm
  437. .macro KERNEL8x8x2_END
  438. .rept 3
  439. KERNEL2x8x2
  440. .endr
  441. KERNEL2x8x2_END
  442. .endm
  443. .macro KERNEL2x4x2
  444. vld U0, A0, 0x00
  445. vld U1, A0, 0x10
  446. vfmadd.d D0, U8, U12, D0
  447. vfmadd.d D1, U9, U12, D1
  448. vfmadd.d D4, U8, U13, D4
  449. vfmadd.d D5, U9, U13, D5
  450. vldrepl.d U4, B0, 0x00
  451. vldrepl.d U5, B0, 0x08
  452. vld U8, A0, 0x20
  453. vld U9, A0, 0x30
  454. vfmadd.d D0, U0, U4, D0
  455. vfmadd.d D1, U1, U4, D1
  456. vfmadd.d D4, U0, U5, D4
  457. vfmadd.d D5, U1, U5, D5
  458. vldrepl.d U12, B0, 0x10
  459. vldrepl.d U13, B0, 0x18
  460. addi.d A0, A0, 0x40
  461. addi.d B0, B0, 0x20
  462. .endm
  463. .macro KERNEL2x4x2_END
  464. vld U0, A0, 0x00
  465. vld U1, A0, 0x10
  466. vfmadd.d D0, U8, U12, D0
  467. vfmadd.d D1, U9, U12, D1
  468. vfmadd.d D4, U8, U13, D4
  469. vfmadd.d D5, U9, U13, D5
  470. vldrepl.d U4, B0, 0x00
  471. vldrepl.d U5, B0, 0x08
  472. addi.d A0, A0, 0x20
  473. addi.d B0, B0, 0x10
  474. vfmadd.d D0, U0, U4, D0
  475. vfmadd.d D1, U1, U4, D1
  476. vfmadd.d D4, U0, U5, D4
  477. vfmadd.d D5, U1, U5, D5
  478. .endm
  479. .macro KERNEL8x4x2
  480. .rept 4
  481. KERNEL2x4x2
  482. .endr
  483. .endm
  484. .macro KERNEL8x4x2_END
  485. .rept 3
  486. KERNEL2x4x2
  487. .endr
  488. KERNEL2x4x2_END
  489. .endm
  490. .macro KERNEL2x2x2
  491. vld U0, A0, 0x00
  492. vfmadd.d D0, U8, U12, D0
  493. vfmadd.d D4, U8, U13, D4
  494. vldrepl.d U4, B0, 0x00
  495. vldrepl.d U5, B0, 0x08
  496. vld U8, A0, 0x10
  497. vldrepl.d U12, B0, 0x10
  498. vldrepl.d U13, B0, 0x18
  499. vfmadd.d D0, U0, U4, D0
  500. vfmadd.d D4, U0, U5, D4
  501. addi.d A0, A0, 0x20
  502. addi.d B0, B0, 0x20
  503. .endm
  504. .macro KERNEL2x2x2_END
  505. vld U0, A0, 0x00
  506. vfmadd.d D0, U8, U12, D0
  507. vfmadd.d D4, U8, U13, D4
  508. vldrepl.d U4, B0, 0x00
  509. vldrepl.d U5, B0, 0x08
  510. addi.d A0, A0, 0x10
  511. addi.d B0, B0, 0x10
  512. vfmadd.d D0, U0, U4, D0
  513. vfmadd.d D4, U0, U5, D4
  514. .endm
  515. .macro KERNEL8x2x2
  516. .rept 4
  517. KERNEL2x2x2
  518. .endr
  519. .endm
  520. .macro KERNEL8x2x2_END
  521. .rept 3
  522. KERNEL2x2x2
  523. .endr
  524. KERNEL2x2x2_END
  525. .endm
  526. .macro KERNEL2x1x2
  527. vldrepl.d U0, A0, 0x00
  528. vfmadd.d D0, U8, U12, D0
  529. vld U4, B0, 0x00
  530. vldrepl.d U8, A0, 0x08
  531. vld U12, B0, 0x10
  532. vfmadd.d D0, U0, U4, D0
  533. addi.d A0, A0, 0x10
  534. addi.d B0, B0, 0x20
  535. .endm
  536. .macro KERNEL2x1x2_END
  537. vldrepl.d U0, A0, 0x00
  538. vfmadd.d D0, U8, U12, D0
  539. vld U4, B0, 0x00
  540. addi.d A0, A0, 0x08
  541. addi.d B0, B0, 0x10
  542. vfmadd.d D0, U0, U4, D0
  543. .endm
  544. .macro KERNEL8x1x2
  545. .rept 4
  546. KERNEL2x1x2
  547. .endr
  548. .endm
  549. .macro KERNEL8x1x2_END
  550. .rept 3
  551. KERNEL2x1x2
  552. .endr
  553. KERNEL2x1x2_END
  554. .endm
  555. .macro KERNEL2x8x1
  556. vld U0, A0, 0x00
  557. vfmadd.d D0, U8, U12, D0
  558. vfmadd.d D1, U9, U12, D1
  559. vld U1, A0, 0x10
  560. vfmadd.d D2, U10, U12, D2
  561. vfmadd.d D3, U11, U12, D3
  562. vldrepl.d U4, B0, 0x00
  563. vld U2, A0, 0x20
  564. vld U3, A0, 0x30
  565. vld U8, A0, 0x40
  566. vfmadd.d D0, U0, U4, D0
  567. vfmadd.d D1, U1, U4, D1
  568. vld U9, A0, 0x50
  569. vfmadd.d D2, U2, U4, D2
  570. vfmadd.d D3, U3, U4, D3
  571. vld U10, A0, 0x60
  572. vld U11, A0, 0x70
  573. vldrepl.d U12, B0, 0x08
  574. addi.d A0, A0, 0x80
  575. addi.d B0, B0, 0x10
  576. .endm
  577. .macro KERNEL2x8x1_END
  578. vld U0, A0, 0x00
  579. vfmadd.d D0, U8, U12, D0
  580. vfmadd.d D1, U9, U12, D1
  581. vld U1, A0, 0x10
  582. vfmadd.d D2, U10, U12, D2
  583. vfmadd.d D3, U11, U12, D3
  584. vld U2, A0, 0x20
  585. vld U3, A0, 0x30
  586. vldrepl.d U4, B0, 0x00
  587. addi.d A0, A0, 0x40
  588. addi.d B0, B0, 0x08
  589. vfmadd.d D0, U0, U4, D0
  590. vfmadd.d D1, U1, U4, D1
  591. vfmadd.d D2, U2, U4, D2
  592. vfmadd.d D3, U3, U4, D3
  593. .endm
  594. .macro KERNEL8x8x1
  595. .rept 4
  596. KERNEL2x8x1
  597. .endr
  598. .endm
  599. .macro KERNEL8x8x1_END
  600. .rept 3
  601. KERNEL2x8x1
  602. .endr
  603. KERNEL2x8x1_END
  604. .endm
  605. .macro KERNEL2x4x1
  606. vld U0, A0, 0x00
  607. vld U1, A0, 0x10
  608. vfmadd.d D0, U8, U12, D0
  609. vfmadd.d D1, U9, U12, D1
  610. vldrepl.d U4, B0, 0x00
  611. vld U8, A0, 0x20
  612. vld U9, A0, 0x30
  613. vfmadd.d D0, U0, U4, D0
  614. vfmadd.d D1, U1, U4, D1
  615. vldrepl.d U12, B0, 0x08
  616. addi.d A0, A0, 0x40
  617. addi.d B0, B0, 0x10
  618. .endm
  619. .macro KERNEL2x4x1_END
  620. vld U0, A0, 0x00
  621. vld U1, A0, 0x10
  622. vfmadd.d D0, U8, U12, D0
  623. vfmadd.d D1, U9, U12, D1
  624. vldrepl.d U4, B0, 0x00
  625. addi.d A0, A0, 0x20
  626. addi.d B0, B0, 0x08
  627. vfmadd.d D0, U0, U4, D0
  628. vfmadd.d D1, U1, U4, D1
  629. .endm
  630. .macro KERNEL8x4x1
  631. .rept 4
  632. KERNEL2x4x1
  633. .endr
  634. .endm
  635. .macro KERNEL8x4x1_END
  636. .rept 3
  637. KERNEL2x4x1
  638. .endr
  639. KERNEL2x4x1_END
  640. .endm
  641. .macro KERNEL2x2x1
  642. vld U0, A0, 0x00
  643. vfmadd.d D0, U8, U12, D0
  644. vldrepl.d U4, B0, 0x00
  645. addi.d A0, A0, 0x10
  646. addi.d B0, B0, 0x08
  647. vld U8, A0, 0x00
  648. vfmadd.d D0, U0, U4, D0
  649. vldrepl.d U12, B0, 0x00
  650. addi.d A0, A0, 0x10
  651. addi.d B0, B0, 0x08
  652. .endm
  653. .macro KERNEL2x2x1_END
  654. vld U0, A0, 0x00
  655. vfmadd.d D0, U8, U12, D0
  656. vldrepl.d U4, B0, 0x00
  657. addi.d A0, A0, 0x10
  658. addi.d B0, B0, 0x08
  659. vfmadd.d D0, U0, U4, D0
  660. .endm
  661. .macro KERNEL8x2x1
  662. .rept 4
  663. KERNEL2x2x1
  664. .endr
  665. .endm
  666. .macro KERNEL8x2x1_END
  667. .rept 3
  668. KERNEL2x2x1
  669. .endr
  670. KERNEL2x2x1_END
  671. .endm
  672. .macro KERNEL2x1x1
  673. vldrepl.d U0, A0, 0x00
  674. vfmadd.d D0, U8, U12, D0
  675. vldrepl.d U4, B0, 0x00
  676. addi.d A0, A0, 0x08
  677. addi.d B0, B0, 0x08
  678. vldrepl.d U8, A0, 0x00
  679. vfmadd.d D0, U0, U4, D0
  680. vldrepl.d U12, B0, 0x00
  681. addi.d A0, A0, 0x08
  682. addi.d B0, B0, 0x08
  683. .endm
  684. .macro KERNEL2x1x1_END
  685. vldrepl.d U0, A0, 0x00
  686. vfmadd.d D0, U8, U12, D0
  687. vldrepl.d U4, B0, 0x00
  688. addi.d A0, A0, 0x08
  689. addi.d B0, B0, 0x08
  690. vfmadd.d D0, U0, U4, D0
  691. .endm
  692. .macro KERNEL8x1x1
  693. .rept 4
  694. KERNEL2x1x1
  695. .endr
  696. .endm
  697. .macro KERNEL8x1x1_END
  698. .rept 3
  699. KERNEL2x1x1
  700. .endr
  701. KERNEL2x1x1_END
  702. .endm
  703. PROLOGUE
  704. addi.d $sp, $sp, -112
  705. /* Store regs */
  706. SDARG $r23, $sp, 0
  707. SDARG $r24, $sp, 8
  708. SDARG $r25, $sp, 16
  709. SDARG $r26, $sp, 24
  710. SDARG $r27, $sp, 32
  711. ST $f24, $sp, 40
  712. ST $f25, $sp, 48
  713. ST $f26, $sp, 56
  714. ST $f27, $sp, 64
  715. ST $f28, $sp, 72
  716. ST $f29, $sp, 80
  717. ST $f30, $sp, 88
  718. ST $f31, $sp, 96
  719. ST ALPHA, $sp, 104
  720. #if defined (TRMMKERNEL) && !defined(LEFT)
  721. sub.d OFF, ZERO, OFFSET
  722. #else
  723. xor OFF, OFF, OFF
  724. #endif
  725. /* if (!(N >> 2)) goto L_N3 */
  726. srai.d J, N, 2 /* J = bn >> 2 */
  727. andi N, N, 0x03
  728. vldrepl.d VALPHA, $sp, 104 /* When N < 4, VALPHA will not be changed again */
  729. beq ZERO, J, .L_N3
  730. .L_J1: /* J-- && This loop includes Condition 1 */
  731. /************************* Condition 1 if((N >> 2) && (M >> 3)) START !!! *************************
  732. * dgemm_core_8x4 */
  733. move C0, C
  734. move A0, A
  735. slli.d T0, LDC, 3
  736. add.d C1, C0, T0
  737. addi.d J, J, -1 /* J-- */
  738. add.d C2, C1, T0
  739. add.d C3, C2, T0
  740. #if defined(TRMMKERNEL) && defined(LEFT)
  741. move OFF, OFFSET
  742. #endif
  743. /* if (!(M >> 3)) goto L_M8 */
  744. srai.d I, M, 3 /* I = bm >> 3 */
  745. beq ZERO, I, .L_M8
  746. .L_I1: /* I-- */
  747. #if defined(TRMMKERNEL)
  748. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  749. move B0, B
  750. #else
  751. slli.d T0, OFF, 0x06
  752. add.d A0, A0, T0
  753. slli.d T0, OFF, 0x05
  754. add.d B0, B, T0
  755. #endif
  756. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  757. sub.d L, K, OFF
  758. #elif defined(LEFT)
  759. /* number of values in A */
  760. addi.d L, OFF, 8
  761. #else
  762. /* number of values in B */
  763. addi.d L, OFF, 4
  764. #endif
  765. #else // #if !defined(TRMMKERNEL)
  766. move B0, B
  767. move L, K /* L = bk */
  768. #endif
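/* In the TRMM setup above the shift amounts are byte scales: OFF << 6 is
 * OFF * 8 doubles (one k-step of the 8-row A panel) and OFF << 5 is
 * OFF * 4 doubles (one k-step of the 4-column B panel). A hedged C
 * transliteration, with the compile-time #if conditions written as
 * runtime flags purely for illustration:
 *
 *   static void trmm_setup_8x4(const double *a, const double *b, long k,
 *                              long off, int left, int transa,
 *                              const double **a0, const double **b0, long *l)
 *   {
 *       *a0 = a;                       // A0 already points at the current tile
 *       if ((left && transa) || (!left && !transa)) {
 *           *b0 = b;                   // B panel used from its start
 *       } else {
 *           *a0 = a + off * 8;         // skip off k-steps of the A panel
 *           *b0 = b + off * 4;         // skip off k-steps of the B panel
 *       }
 *       if ((left && !transa) || (!left && transa))
 *           *l = k - off;              // remaining k iterations
 *       else if (left)
 *           *l = off + 8;              // number of values in A
 *       else
 *           *l = off + 4;              // number of values in B
 *   }
 */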
  769. /* Calculate the first set of D0~D15,
  770. * avoiding a separate zero-initialization
  771. * Load 8 * 64 from A0
  772. * U0 = {a1, a0}
  773. * U1 = {a3, a2}
  774. * U2 = {a5, a4}
  775. * U3 = {a7, a6}
  776. */
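/* The K loop below is scheduled as 1 + 8*TL + tail: the very first k-step
 * uses vfmul.d so D0~D15 never need to be zeroed, KERNEL8x8x4 then runs
 * TL = (K-1) >> 3 unrolled blocks, and .L_L71 finishes the remaining
 * (K-1) & 7 steps. A hedged outline in scalar C:
 *
 *   static void k_loop_outline(const double *a, const double *b, long k,
 *                              double acc[8][4])
 *   {
 *       for (int j = 0; j < 4; j++)             // k-step 0: vfmul, no zeroing
 *           for (int i = 0; i < 8; i++)
 *               acc[i][j] = a[i] * b[j];
 *       a += 8; b += 4;
 *       long tl = (k - 1) >> 3, tail = (k - 1) & 7;
 *       for (long s = 0; s < 8 * tl + tail; s++, a += 8, b += 4)
 *           for (int j = 0; j < 4; j++)         // KERNEL8x8x4 covers 8*tl of
 *               for (int i = 0; i < 8; i++)     // these, .L_L71 the last tail
 *                   acc[i][j] += a[i] * b[j];
 *   }
 */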
  777. vld U0, A0, 0x00
  778. vld U1, A0, 0x10
  779. vld U2, A0, 0x20
  780. vld U3, A0, 0x30
  781. vldrepl.d U4, B0, 0x00
  782. preld 0, C0, 0x00
  783. /* line 1 */
  784. vfmul.d D0, U0, U4
  785. vfmul.d D1, U1, U4
  786. preld 0, C0, 0x20
  787. vfmul.d D2, U2, U4
  788. vfmul.d D3, U3, U4
  789. vldrepl.d U5, B0, 0x08
  790. preld 0, C1, 0x00
  791. /* line 2 */
  792. vfmul.d D4, U0, U5
  793. vfmul.d D5, U1, U5
  794. preld 0, C1, 0x20
  795. vfmul.d D6, U2, U5
  796. vfmul.d D7, U3, U5
  797. vldrepl.d U6, B0, 0x10
  798. preld 0, C2, 0x00
  799. /* line 3 */
  800. vfmul.d D8, U0, U6
  801. vfmul.d D9, U1, U6
  802. preld 0, C2, 0x20
  803. vfmul.d D10, U2, U6
  804. vfmul.d D11, U3, U6
  805. vldrepl.d U7, B0, 0x18
  806. preld 0, C3, 0x00
  807. /* line 4 */
  808. vfmul.d D12, U0, U7
  809. vfmul.d D13, U1, U7
  810. preld 0, C3, 0x20
  811. vfmul.d D14, U2, U7
  812. vfmul.d D15, U3, U7
  813. /* Add stride for A0 and B0 */
  814. addi.d A0, A0, 0x40
  815. addi.d B0, B0, 0x20
  816. /* Reduce L */
  817. addi.d L, L, -1
  818. srai.d TL, L, 3 /* TL = (L-1) >> 3 */
  819. /* if (TL < 1) goto L_L7 */
  820. beq ZERO,TL, .L_L7
  821. vld U8, A0, 0x00
  822. vld U9, A0, 0x10
  823. vld U10, A0, 0x20
  824. vld U11, A0, 0x30
  825. addi.d TL, TL, -1
  826. vldrepl.d U12, B0, 0x00
  827. vldrepl.d U13, B0, 0x08
  828. vldrepl.d U14, B0, 0x10
  829. vldrepl.d U15, B0, 0x18
  830. addi.d A0, A0, 0x40
  831. addi.d B0, B0, 0x20
  832. beq ZERO, TL, .L_TL1_END
  833. .L_TL1: /* TL-- */
  834. KERNEL8x8x4
  835. addi.d TL, TL, -1 /* TL-- */
  836. blt ZERO,TL, .L_TL1
  837. .L_TL1_END:
  838. KERNEL8x8x4_END
  839. /* Handle any remaining (L & 7) k-steps,
  840. * still accumulating into D0~D15.
  841. */
  842. .L_L7:
  843. /* if (!(L & 7)) goto L_L0 */
  844. andi TL, L, 7
  845. beq TL, ZERO,.L_L0
  846. .L_L71:
  847. /* Load 8 * 64 from A0 */
  848. vld U0, A0, 0x00
  849. vld U1, A0, 0x10
  850. vld U2, A0, 0x20
  851. vld U3, A0, 0x30
  852. /* Cumulative D0~D15 */
  853. vldrepl.d U4, B0, 0x00
  854. vfmadd.d D0, U0, U4, D0
  855. vfmadd.d D1, U1, U4, D1
  856. vfmadd.d D2, U2, U4, D2
  857. vfmadd.d D3, U3, U4, D3
  858. vldrepl.d U5, B0, 0x08
  859. vfmadd.d D4, U0, U5, D4
  860. vfmadd.d D5, U1, U5, D5
  861. vfmadd.d D6, U2, U5, D6
  862. vfmadd.d D7, U3, U5, D7
  863. vldrepl.d U6, B0, 0x10
  864. vfmadd.d D8, U0, U6, D8
  865. vfmadd.d D9, U1, U6, D9
  866. vfmadd.d D10, U2, U6, D10
  867. vfmadd.d D11, U3, U6, D11
  868. vldrepl.d U7, B0, 0x18
  869. vfmadd.d D12, U0, U7, D12
  870. vfmadd.d D13, U1, U7, D13
  871. vfmadd.d D14, U2, U7, D14
  872. vfmadd.d D15, U3, U7, D15
  873. /* Add stride for A0, B0 */
  874. addi.d A0, A0, 0x40
  875. addi.d B0, B0, 0x20
  876. addi.d TL, TL, -1
  877. blt ZERO,TL, .L_L71
  878. .L_L0:
  879. vldrepl.d VALPHA, $sp, 104
  880. #if defined(TRMMKERNEL)
  881. vfmul.d D0, D0, VALPHA
  882. vfmul.d D1, D1, VALPHA
  883. vfmul.d D2, D2, VALPHA
  884. vfmul.d D3, D3, VALPHA
  885. vfmul.d D4, D4, VALPHA
  886. vfmul.d D5, D5, VALPHA
  887. vfmul.d D6, D6, VALPHA
  888. vfmul.d D7, D7, VALPHA
  889. vfmul.d D8, D8, VALPHA
  890. vfmul.d D9, D9, VALPHA
  891. vfmul.d D10, D10, VALPHA
  892. vfmul.d D11, D11, VALPHA
  893. vfmul.d D12, D12, VALPHA
  894. vfmul.d D13, D13, VALPHA
  895. vfmul.d D14, D14, VALPHA
  896. vfmul.d D15, D15, VALPHA
  897. #else
  898. /* Load C0 */
  899. vld U0, C0, 0x00
  900. vld U1, C0, 0x10
  901. vld U2, C0, 0x20
  902. vld U3, C0, 0x30
  903. vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */
  904. vfmadd.d D1, D1, VALPHA, U1
  905. vfmadd.d D2, D2, VALPHA, U2
  906. vfmadd.d D3, D3, VALPHA, U3
  907. /* Load C1 */
  908. vld U4, C1, 0x00
  909. vld U5, C1, 0x10
  910. vld U6, C1, 0x20
  911. vld U7, C1, 0x30
  912. vfmadd.d D4, D4, VALPHA, U4
  913. vfmadd.d D5, D5, VALPHA, U5
  914. vfmadd.d D6, D6, VALPHA, U6
  915. vfmadd.d D7, D7, VALPHA, U7
  916. /* Load C2 */
  917. vld U8, C2, 0x00
  918. vld U9, C2, 0x10
  919. vld U10, C2, 0x20
  920. vld U11, C2, 0x30
  921. vfmadd.d D8, D8, VALPHA, U8
  922. vfmadd.d D9, D9, VALPHA, U9
  923. vfmadd.d D10, D10, VALPHA, U10
  924. vfmadd.d D11, D11, VALPHA, U11
  925. /* Load C3 */
  926. vld U0, C3, 0x00
  927. vld U1, C3, 0x10
  928. vld U2, C3, 0x20
  929. vld U3, C3, 0x30
  930. vfmadd.d D12, D12, VALPHA, U0
  931. vfmadd.d D13, D13, VALPHA, U1
  932. vfmadd.d D14, D14, VALPHA, U2
  933. vfmadd.d D15, D15, VALPHA, U3
  934. #endif // #if defined(TRMMKERNEL)
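/* The two branches above differ only in whether the existing C tile is
 * folded in: the plain GEMM path computes C = C + alpha * acc via
 * vfmadd.d D, D, VALPHA, U, while the TRMM path computes C = alpha * acc
 * with vfmul.d. A hedged scalar equivalent for one 8x4 tile
 * (column-major C, ldc in elements):
 *
 *   static void update_tile(double *c, long ldc, double alpha,
 *                           const double acc[8][4], int trmm)
 *   {
 *       for (int j = 0; j < 4; j++)
 *           for (int i = 0; i < 8; i++)
 *               c[j * ldc + i] = (trmm ? 0.0 : c[j * ldc + i])
 *                              + alpha * acc[i][j];
 *   }
 */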
  935. /* Store C0 */
  936. vst D0, C0, 0x00
  937. vst D1, C0, 0x10
  938. vst D2, C0, 0x20
  939. vst D3, C0, 0x30
  940. /* Store C1 */
  941. vst D4, C1, 0x00
  942. vst D5, C1, 0x10
  943. vst D6, C1, 0x20
  944. vst D7, C1, 0x30
  945. /* Store C2 */
  946. vst D8, C2, 0x00
  947. vst D9, C2, 0x10
  948. vst D10, C2, 0x20
  949. vst D11, C2, 0x30
  950. /* Store C3 */
  951. vst D12, C3, 0x00
  952. vst D13, C3, 0x10
  953. vst D14, C3, 0x20
  954. vst D15, C3, 0x30
  955. /* Add stride for C */
  956. addi.d C0, C0, 0x40
  957. addi.d C1, C1, 0x40
  958. addi.d C2, C2, 0x40
  959. addi.d C3, C3, 0x40
  960. #if defined(TRMMKERNEL)
  961. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  962. sub.d L, K, OFF
  963. #ifdef LEFT
  964. /* number of values in A */
  965. addi.d L, L, -8
  966. #else
  967. /* number of values in B */
  968. addi.d L, L, -4
  969. #endif
  970. slli.d T0, L, 0x06
  971. add.d A0, A0, T0
  972. slli.d T0, L, 0x05
  973. add.d B0, B0, T0
  974. #endif
  975. #ifdef LEFT
  976. addi.d OFF, OFF, 0x08
  977. #endif
  978. #endif // #if defined(TRMMKERNEL)
  979. addi.d I, I, -1 /* I-- */
  980. blt ZERO,I, .L_I1
  981. .L_M8:
  982. /* We have done M & 8, considering M=4/2/1 */
  983. andi I, M, 7
  984. beq ZERO,I, .L_M0
  985. andi I, M, 4
  986. beq ZERO,I, .L_M2
  987. #if defined(TRMMKERNEL)
  988. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  989. move B0, B
  990. #else
  991. slli.d T0, OFF, 0x05
  992. add.d A0, A0, T0
  993. add.d B0, B, T0
  994. #endif
  995. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  996. sub.d L, K, OFF
  997. #elif defined(LEFT)
  998. /* number of values in A */
  999. addi.d L, OFF, 4
  1000. #else
  1001. /* number of values in B */
  1002. addi.d L, OFF, 4
  1003. #endif
  1004. #else // #if !defined(TRMMKERNEL)
  1005. move B0, B
  1006. move L, K /* L = bk */
  1007. #endif
  1008. /* Load 4 * 64 from A0 */
  1009. vld U0, A0, 0x00
  1010. vld U1, A0, 0x10
  1011. vldrepl.d U4, B0, 0x00
  1012. /* line 1 */
  1013. vfmul.d D0, U0, U4
  1014. vfmul.d D1, U1, U4
  1015. vldrepl.d U5, B0, 0x08
  1016. /* line 2 */
  1017. vfmul.d D4, U0, U5
  1018. vfmul.d D5, U1, U5
  1019. vldrepl.d U6, B0, 0x10
  1020. /* line 3 */
  1021. vfmul.d D8, U0, U6
  1022. vfmul.d D9, U1, U6
  1023. vldrepl.d U7, B0, 0x18
  1024. /* line 4 */
  1025. vfmul.d D12, U0, U7
  1026. vfmul.d D13, U1, U7
  1027. /* Add stride for A0 and B0 */
  1028. addi.d A0, A0, 0x20
  1029. addi.d B0, B0, 0x20
  1030. /* Reduce L */
  1031. addi.d L, L, -1
  1032. srai.d TL, L, 3 /* TL = (L-1) >> 3 */
  1033. /* if (TL < 1) goto L_M4_L7 */
  1034. beq ZERO,TL, .L_M4_L7
  1035. vld U8, A0, 0x00
  1036. vld U9, A0, 0x10
  1037. addi.d TL, TL, -1
  1038. vldrepl.d U12, B0, 0x00
  1039. vldrepl.d U13, B0, 0x08
  1040. vldrepl.d U14, B0, 0x10
  1041. vldrepl.d U15, B0, 0x18
  1042. addi.d A0, A0, 0x20
  1043. addi.d B0, B0, 0x20
  1044. beq ZERO, TL, .L_M4_TL1_END
  1045. .L_M4_TL1: /* TL-- */
  1046. KERNEL8x4x4
  1047. addi.d TL, TL, -1
  1048. blt ZERO,TL, .L_M4_TL1
  1049. .L_M4_TL1_END:
  1050. KERNEL8x4x4_END
  1051. .L_M4_L7:
  1052. /* if (!(L & 7)) goto L_M4_L0 */
  1053. andi TL, L, 7
  1054. beq TL, ZERO,.L_M4_L0
  1055. .L_M4_L71:
  1056. vld U0, A0, 0x00
  1057. vld U1, A0, 0x10
  1058. vldrepl.d U4, B0, 0x00
  1059. vfmadd.d D0, U0, U4, D0
  1060. vfmadd.d D1, U1, U4, D1
  1061. vldrepl.d U5, B0, 0x08
  1062. vfmadd.d D4, U0, U5, D4
  1063. vfmadd.d D5, U1, U5, D5
  1064. vldrepl.d U6, B0, 0x10
  1065. vfmadd.d D8, U0, U6, D8
  1066. vfmadd.d D9, U1, U6, D9
  1067. vldrepl.d U7, B0, 0x18
  1068. vfmadd.d D12, U0, U7, D12
  1069. vfmadd.d D13, U1, U7, D13
  1070. /* Add stride for A0, B0 */
  1071. addi.d A0, A0, 0x20
  1072. addi.d B0, B0, 0x20
  1073. addi.d TL, TL, -1
  1074. blt ZERO,TL, .L_M4_L71
  1075. .L_M4_L0:
  1076. vldrepl.d VALPHA, $sp, 104
  1077. #if defined(TRMMKERNEL)
  1078. vfmul.d D0, D0, VALPHA
  1079. vfmul.d D1, D1, VALPHA
  1080. vfmul.d D4, D4, VALPHA
  1081. vfmul.d D5, D5, VALPHA
  1082. vfmul.d D8, D8, VALPHA
  1083. vfmul.d D9, D9, VALPHA
  1084. vfmul.d D12, D12, VALPHA
  1085. vfmul.d D13, D13, VALPHA
  1086. #else
  1087. /* Load C0 */
  1088. vld U0, C0, 0x00
  1089. vld U1, C0, 0x10
  1090. vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */
  1091. vfmadd.d D1, D1, VALPHA, U1
  1092. /* Load C1 */
  1093. vld U2, C1, 0x00
  1094. vld U3, C1, 0x10
  1095. vfmadd.d D4, D4, VALPHA, U2
  1096. vfmadd.d D5, D5, VALPHA, U3
  1097. /* Load C2 */
  1098. vld U4, C2, 0x00
  1099. vld U5, C2, 0x10
  1100. vfmadd.d D8, D8, VALPHA, U4
  1101. vfmadd.d D9, D9, VALPHA, U5
  1102. /* Load C3 */
  1103. vld U6, C3, 0x00
  1104. vld U7, C3, 0x10
  1105. vfmadd.d D12, D12, VALPHA, U6
  1106. vfmadd.d D13, D13, VALPHA, U7
  1107. #endif // #if defined(TRMMKERNEL)
  1108. /* Store C0 */
  1109. vst D0, C0, 0x00
  1110. vst D1, C0, 0x10
  1111. /* Store C1 */
  1112. vst D4, C1, 0x00
  1113. vst D5, C1, 0x10
  1114. /* Store C2 */
  1115. vst D8, C2, 0x00
  1116. vst D9, C2, 0x10
  1117. /* Store C3 */
  1118. vst D12, C3, 0x00
  1119. vst D13, C3, 0x10
  1120. /* Add stride for C */
  1121. addi.d C0, C0, 0x20
  1122. addi.d C1, C1, 0x20
  1123. addi.d C2, C2, 0x20
  1124. addi.d C3, C3, 0x20
  1125. #if defined(TRMMKERNEL)
  1126. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1127. sub.d L, K, OFF
  1128. #ifdef LEFT
  1129. /* number of values in A */
  1130. addi.d L, L, -4
  1131. #else
  1132. /* number of values in B */
  1133. addi.d L, L, -4
  1134. #endif
  1135. slli.d T0, L, 0x05
  1136. add.d A0, A0, T0
  1137. add.d B0, B0, T0
  1138. #endif
  1139. #ifdef LEFT
  1140. /* number of values in A */
  1141. addi.d OFF, OFF, 0x04
  1142. #endif
  1143. #endif // #if defined(TRMMKERNEL)
  1144. /********LOOP (if(N >> 2 ) && (M & 4) ) End************/
  1145. .L_M2:
  1146. andi I, M, 2
  1147. beq ZERO,I, .L_M1
  1148. #if defined(TRMMKERNEL)
  1149. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1150. move B0, B
  1151. #else
  1152. slli.d T0, OFF, 0x04
  1153. add.d A0, A0, T0
  1154. slli.d T0, OFF, 0x05
  1155. add.d B0, B, T0
  1156. #endif
  1157. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1158. sub.d L, K, OFF
  1159. #elif defined(LEFT)
  1160. /* number of values in A */
  1161. addi.d L, OFF, 2
  1162. #else
  1163. /* number of values in B */
  1164. addi.d L, OFF, 4
  1165. #endif
  1166. #else // #if !defined(TRMMKERNEL)
  1167. move B0, B
  1168. move L, K /* L = bk */
  1169. #endif
  1170. /* Load 2 * 64 from A0 */
  1171. vldrepl.d U0, A0, 0x00
  1172. vldrepl.d U1, A0, 0x08
  1173. vld U4, B0, 0x00
  1174. vld U5, B0, 0x10
  1175. vfmul.d D0, U0, U4
  1176. vfmul.d D1, U0, U5
  1177. vfmul.d D2, U1, U4
  1178. vfmul.d D3, U1, U5
  1179. /* Add stride for A0 and B0 */
  1180. addi.d A0, A0, 0x10
  1181. addi.d B0, B0, 0x20
  1182. /* Reduce L */
  1183. addi.d L, L, -1
  1184. srai.d TL, L, 3 /* TL = (L-1) >> 3 */
  1185. /* if (TL < 1) goto L_M2_L7 */
  1186. beq ZERO,TL, .L_M2_L7
  1187. vldrepl.d U8, A0, 0x00
  1188. vldrepl.d U9, A0, 0x08
  1189. addi.d TL, TL, -1
  1190. vld U12, B0, 0x00
  1191. vld U13, B0, 0x10
  1192. addi.d A0, A0, 0x10
  1193. addi.d B0, B0, 0x20
  1194. beq ZERO, TL, .L_M2_TL1_END
  1195. .L_M2_TL1: /* TL-- */
  1196. KERNEL8x2x4
  1197. addi.d TL, TL, -1 /* TL-- */
  1198. blt ZERO,TL, .L_M2_TL1
  1199. .L_M2_TL1_END:
  1200. KERNEL8x2x4_END
  1201. .L_M2_L7:
  1202. /* if (!(L & 7)) goto L_M2_L0 */
  1203. andi TL, L, 7
  1204. beq TL, ZERO,.L_M2_L0
  1205. .L_M2_L71:
  1206. vldrepl.d U0, A0, 0x00
  1207. vldrepl.d U1, A0, 0x08
  1208. vld U4, B0, 0x00
  1209. vld U5, B0, 0x10
  1210. vfmadd.d D0, U0, U4, D0
  1211. vfmadd.d D1, U0, U5, D1
  1212. vfmadd.d D2, U1, U4, D2
  1213. vfmadd.d D3, U1, U5, D3
  1214. /* Add stride for A0, B0 */
  1215. addi.d A0, A0, 0x10
  1216. addi.d B0, B0, 0x20
  1217. addi.d TL, TL, -1
  1218. blt ZERO,TL, .L_M2_L71
  1219. .L_M2_L0:
  1220. vldrepl.d VALPHA, $sp, 104
  1221. #if defined(TRMMKERNEL)
  1222. vfmul.d D0, D0, VALPHA
  1223. vfmul.d D1, D1, VALPHA
  1224. vfmul.d D2, D2, VALPHA
  1225. vfmul.d D3, D3, VALPHA
  1226. vstelm.d D0, C0, 0x00, 0x00
  1227. vstelm.d D0, C1, 0x00, 0x01
  1228. vstelm.d D1, C2, 0x00, 0x00
  1229. vstelm.d D1, C3, 0x00, 0x01
  1230. vstelm.d D2, C0, 0x08, 0x00
  1231. vstelm.d D2, C1, 0x08, 0x01
  1232. vstelm.d D3, C2, 0x08, 0x00
  1233. vstelm.d D3, C3, 0x08, 0x01
  1234. #else
  1235. /* Load C0 */
  1236. vld U0, C0, 0x00
  1237. /* Load C1 */
  1238. vld U1, C1, 0x00
  1239. /* Load C2 */
  1240. vld U2, C2, 0x00
  1241. /* Load C3 */
  1242. vld U3, C3, 0x00
  1243. vilvl.d D4, D2, D0 //C0
  1244. vilvh.d D5, D2, D0 //C1
  1245. vilvl.d D6, D3, D1 //C2
  1246. vilvh.d D7, D3, D1 //C3
  1247. vfmadd.d D0, D4, VALPHA, U0
  1248. vfmadd.d D2, D5, VALPHA, U1
  1249. vfmadd.d D1, D6, VALPHA, U2
  1250. vfmadd.d D3, D7, VALPHA, U3
  1251. vst D0, C0, 0x00
  1252. vst D2, C1, 0x00
  1253. vst D1, C2, 0x00
  1254. vst D3, C3, 0x00
  1255. #endif // #if defined(TRMMKERNEL)
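/* The vilvl.d / vilvh.d pair above transposes the two row accumulators
 * (D0/D2 hold columns 0-1 of rows 0/1, D1/D3 hold columns 2-3) into
 * per-column pairs so each vst writes one column of C. A hedged model of
 * the interleave, assuming the usual LSX semantics for 64-bit lanes:
 *
 *   // vilvl.d vd, vj, vk  ->  vd = { vk[0], vj[0] }
 *   // vilvh.d vd, vj, vk  ->  vd = { vk[1], vj[1] }
 *   static void ilv_d(const double vj[2], const double vk[2],
 *                     double lo[2], double hi[2])
 *   {
 *       lo[0] = vk[0];  lo[1] = vj[0];   // vilvl.d
 *       hi[0] = vk[1];  hi[1] = vj[1];   // vilvh.d
 *   }
 */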
  1256. /* Add stride for C */
  1257. addi.d C0, C0, 0x10
  1258. addi.d C1, C1, 0x10
  1259. addi.d C2, C2, 0x10
  1260. addi.d C3, C3, 0x10
  1261. #if defined(TRMMKERNEL)
  1262. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1263. sub.d L, K, OFF
  1264. #ifdef LEFT
  1265. /* number of values in A */
  1266. addi.d L, L, -2
  1267. #else
  1268. /* number of values in B */
  1269. addi.d L, L, -4
  1270. #endif
  1271. slli.d T0, L, 0x04
  1272. add.d A0, A0, T0
  1273. slli.d T0, L, 0x05
  1274. add.d B0, B0, T0
  1275. #endif
  1276. #ifdef LEFT
  1277. /* number of values in A */
  1278. addi.d OFF, OFF, 0x02
  1279. #endif
  1280. #endif // #if defined(TRMMKERNEL)
  1281. /********LOOP (if(N >> 2 ) && (M & 2) ) End************/
  1282. .L_M1:
  1283. andi I, M, 1
  1284. beq ZERO,I, .L_M0
  1285. #if defined(TRMMKERNEL)
  1286. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1287. move B0, B
  1288. #else
  1289. slli.d T0, OFF, 0x03
  1290. add.d A0, A0, T0
  1291. slli.d T0, OFF, 0x05
  1292. add.d B0, B, T0
  1293. #endif
  1294. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1295. sub.d L, K, OFF
  1296. #elif defined(LEFT)
  1297. /* number of values in A */
  1298. addi.d L, OFF, 1
  1299. #else
  1300. /* number of values in B */
  1301. addi.d L, OFF, 4
  1302. #endif
  1303. #else // #if !defined(TRMMKERNEL)
  1304. move B0, B
  1305. move L, K /* L = bk */
  1306. #endif
  1307. vldrepl.d U0, A0, 0x00
  1308. vld U4, B0, 0x00
  1309. vld U5, B0, 0x10
  1310. vfmul.d D0, U0, U4
  1311. vfmul.d D1, U0, U5
  1312. /* Add stride for A0 and B0 */
  1313. addi.d A0, A0, 0x08
  1314. addi.d B0, B0, 0x20
  1315. /* Reduce L */
  1316. addi.d L, L, -1
  1317. srai.d TL, L, 3 /* TL = (L-1) >> 3 */
  1318. /* if (TL < 1) goto L_M1_L7 */
  1319. beq ZERO,TL, .L_M1_L7
  1320. vldrepl.d U8, A0, 0x00
  1321. addi.d TL, TL, -1
  1322. vld U12, B0, 0x00
  1323. vld U13, B0, 0x10
  1324. addi.d A0, A0, 0x08
  1325. addi.d B0, B0, 0x20
  1326. beq ZERO, TL, .L_M1_TL1_END
  1327. .L_M1_TL1: /* TL-- */
  1328. KERNEL8x1x4
  1329. addi.d TL, TL, -1 /* TL-- */
  1330. blt ZERO,TL, .L_M1_TL1
  1331. .L_M1_TL1_END:
  1332. KERNEL8x1x4_END
  1333. .L_M1_L7:
  1334. /* if (!(L & 7)) goto L_M1_L0 */
  1335. andi TL, L, 7
  1336. beq TL, ZERO,.L_M1_L0
  1337. .L_M1_L71:
  1338. vldrepl.d U0, A0, 0x00
  1339. vld U4, B0, 0x00
  1340. vld U5, B0, 0x10
  1341. vfmadd.d D0, U0, U4, D0
  1342. vfmadd.d D1, U0, U5, D1
  1343. /* Add stride for A0, B0 */
  1344. addi.d A0, A0, 0x08
  1345. addi.d B0, B0, 0x20
  1346. addi.d TL, TL, -1
  1347. blt ZERO,TL, .L_M1_L71
  1348. .L_M1_L0:
  1349. vldrepl.d VALPHA, $sp, 104
  1350. #if defined(TRMMKERNEL)
  1351. vfmul.d D0, D0, VALPHA
  1352. vfmul.d D1, D1, VALPHA
  1353. vstelm.d D0, C0, 0x00, 0x00
  1354. vstelm.d D0, C1, 0x00, 0x01
  1355. vstelm.d D1, C2, 0x00, 0x00
  1356. vstelm.d D1, C3, 0x00, 0x01
  1357. #else
  1358. /* Load C0 */
  1359. vldrepl.d U0, C0, 0x00
  1360. vldrepl.d U1, C1, 0x00
  1361. vilvl.d D4, U1, U0
  1362. vfmadd.d D6, D0, VALPHA, D4
  1363. vldrepl.d U2, C2, 0x00
  1364. vldrepl.d U3, C3, 0x00
  1365. vilvl.d D5, U3, U2
  1366. vfmadd.d D7, D1, VALPHA, D5
  1367. vstelm.d D6, C0, 0x00, 0x00
  1368. vstelm.d D6, C1, 0x00, 0x01
  1369. vstelm.d D7, C2, 0x00, 0x00
  1370. vstelm.d D7, C3, 0x00, 0x01
  1371. #endif // #if defined(TRMMKERNEL)
  1372. /* Add stride for C */
  1373. addi.d C0, C0, 0x08
  1374. addi.d C1, C1, 0x08
  1375. addi.d C2, C2, 0x08
  1376. addi.d C3, C3, 0x08
  1377. #if defined(TRMMKERNEL)
  1378. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1379. sub.d L, K, OFF
  1380. #ifdef LEFT
  1381. /* number of values in A */
  1382. addi.d L, L, -1
  1383. #else
  1384. /* number of values in B */
  1385. addi.d L, L, -4
  1386. #endif
  1387. slli.d T0, L, 0x03
  1388. add.d A0, A0, T0
  1389. slli.d T0, L, 0x05
  1390. add.d B0, B0, T0
  1391. #endif
  1392. #ifdef LEFT
  1393. /* number of values in A */
  1394. addi.d OFF, OFF, 0x01
  1395. #endif
  1396. #endif // #if defined(TRMMKERNEL)
  1397. /********LOOP (if(N >> 2 ) && (M & 1) ) End************/
  1398. .L_M0:
  1399. /* Add stride for B and C
  1400. * B += (K * 32)
  1401. * C += (LDC * 32)
  1402. */
  1403. /* Since the elements are doubles (8 bytes) and we advance
  1404. * by 4 columns, the byte stride is 4 * 8 = 32.
  1405. */
  1406. slli.d T0, K, 5
  1407. slli.d T1, LDC, 5
  1408. add.d B, B, T0
  1409. add.d C, C, T1
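/* In elements this is simply: B advances past the K x 4 panel it just
 * consumed and C advances 4 columns (doubles are 8 bytes, hence the
 * shift by 5 above). A hedged sketch of the equivalent pointer arithmetic:
 *
 *   static void advance_panels(const double **b, double **c, long k, long ldc)
 *   {
 *       *b += k * 4;       // B += K * 32 bytes
 *       *c += ldc * 4;     // C += LDC * 32 bytes
 *   }
 */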
  1410. #if defined(TRMMKERNEL) && !defined(LEFT)
  1411. addi.d OFF, OFF, 0x04
  1412. #endif
  1413. blt ZERO, J, .L_J1
  1414. //////////////// go back to L_J1 /////////////////
  1415. /////////////////////////////////////////////////
  1416. /************************ Condition 1 if((N >> 2) && (M >> 3)) END !!! ************************/
  1417. vldrepl.d VALPHA, $sp, 104
  1418. .L_N3:
  1419. andi J, N, 2
  1420. beq ZERO, J, .L_N1
  1421. /************************* Condition 2 if((N & 2) && (M >> 3)) START !!! *************************
  1422. * dgemm_core_8x2 */
  1423. move C0, C
  1424. move A0, A
  1425. slli.d T0, LDC, 3
  1426. add.d C1, C0, T0
  1427. #if defined(TRMMKERNEL) && defined(LEFT)
  1428. move OFF, OFFSET
  1429. #endif
  1430. /* if (!(M >> 3)) goto L_N3_M8 */
  1431. srai.d I, M, 3 /* I = bm >> 3 */
  1432. beq ZERO, I, .L_N3_M8
  1433. .L_N3_I1:
  1434. #if defined(TRMMKERNEL)
  1435. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1436. move B0, B
  1437. #else
  1438. slli.d T0, OFF, 0x06
  1439. add.d A0, A0, T0
  1440. slli.d T0, OFF, 0x04
  1441. add.d B0, B, T0
  1442. #endif
  1443. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1444. sub.d L, K, OFF
  1445. #elif defined(LEFT)
  1446. /* number of values in A */
  1447. addi.d L, OFF, 8
  1448. #else
  1449. /* number of values in B */
  1450. addi.d L, OFF, 2
  1451. #endif
  1452. #else // #if !defined(TRMMKERNEL)
  1453. move B0, B
  1454. move L, K /* L = bk */
  1455. #endif
  1456. /* Load 8 * 64 from A0
  1457. * U0 = {a1, a0}
  1458. * U1 = {a3, a2}
  1459. * U2 = {a5, a4}
  1460. * U3 = {a7, a6}
  1461. */
  1462. vld U0, A0, 0x00
  1463. vld U1, A0, 0x10
  1464. vld U2, A0, 0x20
  1465. vld U3, A0, 0x30
  1466. vldrepl.d U4, B0, 0x00
  1467. /* line 1 */
  1468. vfmul.d D0, U0, U4
  1469. vfmul.d D1, U1, U4
  1470. vfmul.d D2, U2, U4
  1471. vfmul.d D3, U3, U4
  1472. vldrepl.d U5, B0, 0x08
  1473. /* line 2 */
  1474. vfmul.d D4, U0, U5
  1475. vfmul.d D5, U1, U5
  1476. vfmul.d D6, U2, U5
  1477. vfmul.d D7, U3, U5
  1478. /* Add stride for A0 and B0 */
  1479. addi.d A0, A0, 0x40
  1480. addi.d B0, B0, 0x10
  1481. /* Reduce L */
  1482. addi.d L, L, -1
  1483. srai.d TL, L, 3 /* TL = (L-1) >> 3 */
  1484. /* if (TL < 1) goto L_N3_L7 */
  1485. beq ZERO,TL, .L_N3_L7
  1486. vld U8, A0, 0x00
  1487. vld U9, A0, 0x10
  1488. vld U10, A0, 0x20
  1489. vld U11, A0, 0x30
  1490. addi.d TL, TL, -1
  1491. vldrepl.d U12, B0, 0x00
  1492. vldrepl.d U13, B0, 0x08
  1493. addi.d A0, A0, 0x40
  1494. addi.d B0, B0, 0x10
  1495. beq ZERO, TL, .L_N3_TL1_END
  1496. .L_N3_TL1: /* TL-- */
  1497. KERNEL8x8x2
  1498. addi.d TL, TL, -1 /* TL-- */
  1499. blt ZERO,TL, .L_N3_TL1
  1500. .L_N3_TL1_END:
  1501. KERNEL8x8x2_END
  1502. .L_N3_L7:
  1503. /* if (!(L & 7)) goto L_N3_L0 */
  1504. andi TL, L, 7
  1505. beq TL, ZERO,.L_N3_L0
  1506. .L_N3_L71:
  1507. /* Load 8 * 64 from A0 */
  1508. vld U0, A0, 0x00
  1509. vld U1, A0, 0x10
  1510. vld U2, A0, 0x20
  1511. vld U3, A0, 0x30
  1512. vldrepl.d U4, B0, 0x00
  1513. vfmadd.d D0, U0, U4, D0
  1514. vfmadd.d D1, U1, U4, D1
  1515. vfmadd.d D2, U2, U4, D2
  1516. vfmadd.d D3, U3, U4, D3
  1517. vldrepl.d U5, B0, 0x08
  1518. vfmadd.d D4, U0, U5, D4
  1519. vfmadd.d D5, U1, U5, D5
  1520. vfmadd.d D6, U2, U5, D6
  1521. vfmadd.d D7, U3, U5, D7
  1522. /* Add stride for A0, B0 */
  1523. addi.d A0, A0, 0x40
  1524. addi.d B0, B0, 0x10
  1525. addi.d TL, TL, -1
  1526. blt ZERO,TL, .L_N3_L71
  1527. .L_N3_L0:
  1528. #if defined(TRMMKERNEL)
  1529. vfmul.d D0, D0, VALPHA
  1530. vfmul.d D1, D1, VALPHA
  1531. vfmul.d D2, D2, VALPHA
  1532. vfmul.d D3, D3, VALPHA
  1533. vfmul.d D4, D4, VALPHA
  1534. vfmul.d D5, D5, VALPHA
  1535. vfmul.d D6, D6, VALPHA
  1536. vfmul.d D7, D7, VALPHA
  1537. #else
  1538. /* Load C0 */
  1539. vld U0, C0, 0x00
  1540. vld U1, C0, 0x10
  1541. vld U2, C0, 0x20
  1542. vld U3, C0, 0x30
  1543. vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */
  1544. vfmadd.d D1, D1, VALPHA, U1
  1545. vfmadd.d D2, D2, VALPHA, U2
  1546. vfmadd.d D3, D3, VALPHA, U3
  1547. /* Load C1 */
  1548. vld U4, C1, 0x00
  1549. vld U5, C1, 0x10
  1550. vld U6, C1, 0x20
  1551. vld U7, C1, 0x30
  1552. vfmadd.d D4, D4, VALPHA, U4
  1553. vfmadd.d D5, D5, VALPHA, U5
  1554. vfmadd.d D6, D6, VALPHA, U6
  1555. vfmadd.d D7, D7, VALPHA, U7
  1556. #endif // #if defined(TRMMKERNEL)
  1557. /* Store C0 */
  1558. vst D0, C0, 0x00
  1559. vst D1, C0, 0x10
  1560. vst D2, C0, 0x20
  1561. vst D3, C0, 0x30
  1562. /* Store C1 */
  1563. vst D4, C1, 0x00
  1564. vst D5, C1, 0x10
  1565. vst D6, C1, 0x20
  1566. vst D7, C1, 0x30
  1567. /* Add stride for C */
  1568. addi.d C0, C0, 0x40
  1569. addi.d C1, C1, 0x40
  1570. #if defined(TRMMKERNEL)
  1571. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1572. sub.d L, K, OFF
  1573. #ifdef LEFT
  1574. addi.d L, L, -8
  1575. #else
  1576. addi.d L, L, -2
  1577. #endif
  1578. slli.d T0, L, 0x06
  1579. add.d A0, A0, T0
  1580. slli.d T0, L, 0x04
  1581. add.d B0, B0, T0
  1582. #endif
  1583. #ifdef LEFT
  1584. addi.d OFF, OFF, 0x8
  1585. #endif
  1586. #endif // #if defined(TRMMKERNEL)
  1587. addi.d I, I, -1 /* I-- */
  1588. blt ZERO,I, .L_N3_I1
  1589. .L_N3_M8:
  1590. /* We have done M & 8, considering M=4/2/1 */
  1591. andi I, M, 7
  1592. beq ZERO,I, .L_N3_M0
  1593. andi I, M, 4
  1594. beq ZERO,I, .L_N3_M2
  1595. #if defined(TRMMKERNEL)
  1596. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1597. move B0, B
  1598. #else
  1599. slli.d T0, OFF, 0x05
  1600. add.d A0, A0, T0
  1601. slli.d T0, OFF, 0x04
  1602. add.d B0, B, T0
  1603. #endif
  1604. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1605. sub.d L, K, OFF
  1606. #elif defined(LEFT)
  1607. /* number of values in A */
  1608. addi.d L, OFF, 4
  1609. #else
  1610. /* number of values in B */
  1611. addi.d L, OFF, 2
  1612. #endif
  1613. #else // #if !defined(TRMMKERNEL)
  1614. move B0, B
  1615. move L, K /* L = bk */
  1616. #endif
  1617. /* Load 4 * 64 from A0 */
  1618. vld U0, A0, 0x00
  1619. vld U1, A0, 0x10
  1620. vldrepl.d U4, B0, 0x00
  1621. /* line 1 */
  1622. vfmul.d D0, U0, U4
  1623. vfmul.d D1, U1, U4
  1624. vldrepl.d U5, B0, 0x08
  1625. /* line 2 */
  1626. vfmul.d D4, U0, U5
  1627. vfmul.d D5, U1, U5
  1628. /* Add stride for A0 and B0 */
  1629. addi.d A0, A0, 0x20
  1630. addi.d B0, B0, 0x10
  1631. /* Reduce L */
  1632. addi.d L, L, -1
  1633. srai.d TL, L, 3 /* TL = (L-1) >> 3 */
  1634. /* if (TL < 1) goto L_N3_M4_L7 */
  1635. beq ZERO,TL, .L_N3_M4_L7
  1636. vld U8, A0, 0x00
  1637. vld U9, A0, 0x10
  1638. addi.d TL, TL, -1
  1639. vldrepl.d U12, B0, 0x00
  1640. vldrepl.d U13, B0, 0x08
  1641. addi.d A0, A0, 0x20
  1642. addi.d B0, B0, 0x10
  1643. beq ZERO, TL, .L_N3_M4_TL1_END
  1644. .L_N3_M4_TL1: /* TL-- */
  1645. KERNEL8x4x2
  1646. addi.d TL, TL, -1 /* TL-- */
  1647. blt ZERO,TL, .L_N3_M4_TL1
  1648. .L_N3_M4_TL1_END:
  1649. KERNEL8x4x2_END
  1650. .L_N3_M4_L7:
  1651. /* if (!(L & 7)) goto L_N3_M4_L0 */
  1652. andi TL, L, 7
  1653. beq TL, ZERO,.L_N3_M4_L0
  1654. .L_N3_M4_L71:
  1655. vld U0, A0, 0x00
  1656. vld U1, A0, 0x10
  1657. vldrepl.d U4, B0, 0x00
  1658. vfmadd.d D0, U0, U4, D0
  1659. vfmadd.d D1, U1, U4, D1
  1660. vldrepl.d U5, B0, 0x08
  1661. vfmadd.d D4, U0, U5, D4
  1662. vfmadd.d D5, U1, U5, D5
  1663. /* Add stride for A0, B0 */
  1664. addi.d A0, A0, 0x20
  1665. addi.d B0, B0, 0x10
  1666. addi.d TL, TL, -1
  1667. blt ZERO,TL, .L_N3_M4_L71
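/*
 * Same remainder pattern as .L_N3_L71, but 4 rows wide (illustrative C):
 *   for (i = 0; i < 4; i++) { acc0[i] += a[i] * b[0]; acc1[i] += a[i] * b[1]; }
 *   a += 4; b += 2;
 */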
  1668. .L_N3_M4_L0:
  1669. #if defined(TRMMKERNEL)
  1670. vfmul.d D0, D0, VALPHA
  1671. vfmul.d D1, D1, VALPHA
  1672. vfmul.d D4, D4, VALPHA
  1673. vfmul.d D5, D5, VALPHA
  1674. #else
  1675. /* Load C0 */
  1676. vld U0, C0, 0x00
  1677. vld U1, C0, 0x10
  1678. vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */
  1679. vfmadd.d D1, D1, VALPHA, U1
  1680. /* Load C1 */
  1681. vld U2, C1, 0x00
  1682. vld U3, C1, 0x10
  1683. vfmadd.d D4, D4, VALPHA, U2
  1684. vfmadd.d D5, D5, VALPHA, U3
  1685. #endif // #if defined(TRMMKERNEL)
  1686. /* Store C0 */
  1687. vst D0, C0, 0x00
  1688. vst D1, C0, 0x10
  1689. /* Store C1 */
  1690. vst D4, C1, 0x00
  1691. vst D5, C1, 0x10
  1692. /* Add stride for C */
  1693. addi.d C0, C0, 0x20
  1694. addi.d C1, C1, 0x20
  1695. #if defined(TRMMKERNEL)
  1696. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1697. sub.d L, K, OFF
  1698. #ifdef LEFT
  1699. addi.d L, L, -4
  1700. #else
  1701. addi.d L, L, -2
  1702. #endif
  1703. slli.d T0, L, 0x05
  1704. add.d A0, A0, T0
  1705. slli.d T0, L, 0x04
  1706. add.d B0, B0, T0
  1707. #endif
  1708. #ifdef LEFT
  1709. addi.d OFF, OFF, 0x04
  1710. #endif
  1711. #endif // #if defined(TRMMKERNEL)
  1712. /********LOOP (if(N & 2 ) && (M & 4) ) End************/
  1713. .L_N3_M2:
  1714. andi I, M, 2
  1715. beq ZERO,I, .L_N3_M1
  1716. #if defined(TRMMKERNEL)
  1717. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1718. move B0, B
  1719. #else
  1720. slli.d T0, OFF, 0x04
  1721. add.d A0, A0, T0
  1722. add.d B0, B, T0
  1723. #endif
  1724. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1725. sub.d L, K, OFF
  1726. #elif defined(LEFT)
  1727. /* number of values in A */
  1728. addi.d L, OFF, 2
  1729. #else
  1730. /* number of values in B */
  1731. addi.d L, OFF, 2
  1732. #endif
  1733. #else // #if !defined(TRMMKERNEL)
  1734. move B0, B
  1735. move L, K /* L = bk */
  1736. #endif
  1737. /* Load 2 * 64 from A0 */
  1738. vld U0, A0, 0x00
  1739. vldrepl.d U4, B0, 0x00
  1740. /* line 1 */
  1741. vfmul.d D0, U0, U4
  1742. vldrepl.d U4, B0, 0x08
  1743. /* line 2 */
  1744. vfmul.d D4, U0, U4
  1745. /* Add stride for A0 and B0 */
  1746. addi.d A0, A0, 0x10
  1747. addi.d B0, B0, 0x10
  1748. /* Reduce L */
  1749. addi.d L, L, -1
  1750. srai.d TL, L, 3 /* TL = (L-1) >> 3 */
  1751. /* if (TL < 1) goto L_N3_M2_L7 */
  1752. beq ZERO,TL, .L_N3_M2_L7
  1753. vld U8, A0, 0x00
  1754. addi.d TL, TL, -1
  1755. vldrepl.d U12, B0, 0x00
  1756. vldrepl.d U13, B0, 0x08
  1757. addi.d A0, A0, 0x10
  1758. addi.d B0, B0, 0x10
  1759. beq ZERO, TL, .L_N3_M2_TL1_END
  1760. .L_N3_M2_TL1: /* TL-- */
  1761. KERNEL8x2x2
  1762. addi.d TL, TL, -1 /* TL-- */
  1763. blt ZERO,TL, .L_N3_M2_TL1
  1764. .L_N3_M2_TL1_END:
  1765. KERNEL8x2x2_END
  1766. .L_N3_M2_L7:
  1767. /* if (!(L & 7)) goto L_N3_M2_L0 */
  1768. andi TL, L, 7
  1769. beq TL, ZERO,.L_N3_M2_L0
  1770. .L_N3_M2_L71:
  1771. vld U0, A0, 0x00
  1772. vldrepl.d U4, B0, 0x00
  1773. vldrepl.d U5, B0, 0x08
  1774. vfmadd.d D0, U0, U4, D0
  1775. vfmadd.d D4, U0, U5, D4
  1776. /* Add stride for A0, B0 */
  1777. addi.d A0, A0, 0x10
  1778. addi.d B0, B0, 0x10
  1779. addi.d TL, TL, -1
  1780. blt ZERO,TL, .L_N3_M2_L71
  1781. .L_N3_M2_L0:
  1782. #if defined(TRMMKERNEL)
  1783. vfmul.d D0, D0, VALPHA
  1784. vfmul.d D4, D4, VALPHA
  1785. #else
  1786. /* Load C0 */
  1787. vld U0, C0, 0x00
  1788. vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */
  1789. /* Load C1 */
  1790. vld U1, C1, 0x00
  1791. vfmadd.d D4, D4, VALPHA, U1
  1792. #endif // #if defined(TRMMKERNEL)
  1793. vst D0, C0, 0x00
  1794. vst D4, C1, 0x00
  1795. /* Add stride for C */
  1796. addi.d C0, C0, 0x10
  1797. addi.d C1, C1, 0x10
  1798. #if defined(TRMMKERNEL)
  1799. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1800. sub.d L, K, OFF
  1801. #ifdef LEFT
  1802. addi.d L, L, -2
  1803. #else
  1804. addi.d L, L, -2
  1805. #endif
  1806. slli.d T0, L, 0x04
  1807. add.d A0, A0, T0
  1808. add.d B0, B0, T0
  1809. #endif
  1810. #ifdef LEFT
  1811. addi.d OFF, OFF, 0x02
  1812. #endif
  1813. #endif // #if defined(TRMMKERNEL)
  1814. /********LOOP (if(N & 2 ) && (M & 2) ) End************/
  1815. .L_N3_M1:
  1816. andi I, M, 1
  1817. beq ZERO,I, .L_N3_M0
  1818. #if defined(TRMMKERNEL)
  1819. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1820. move B0, B
  1821. #else
  1822. slli.d T0, OFF, 0x03
  1823. add.d A0, A0, T0
  1824. slli.d T0, OFF, 0x04
  1825. add.d B0, B, T0
  1826. #endif
  1827. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1828. sub.d L, K, OFF
  1829. #elif defined(LEFT)
  1830. /* number of values in A */
  1831. addi.d L, OFF, 1
  1832. #else
  1833. /* number of values in B */
  1834. addi.d L, OFF, 2
  1835. #endif
  1836. #else // #if !defined(TRMMKERNEL)
  1837. move B0, B
  1838. move L, K /* L = bk */
  1839. #endif
  1840. /* Load 1 * 64 from A0 */
  1841. vldrepl.d U0, A0, 0x00
  1842. vld U4, B0, 0x00
  1843. /* line 1 */
  1844. vfmul.d D0, U0, U4
  1845. /* Add stride for A0 and B0 */
  1846. addi.d A0, A0, 0x08
  1847. addi.d B0, B0, 0x10
  1848. /* Reduce L */
  1849. addi.d L, L, -1
  1850. srai.d TL, L, 3 /* TL = (L-1) >> 3 */
  1851. /* if (TL < 1) goto L_N3_M1_L7 */
  1852. beq ZERO,TL, .L_N3_M1_L7
  1853. vldrepl.d U8, A0, 0x00
  1854. addi.d TL, TL, -1
  1855. vld U12, B0, 0x00
  1856. addi.d A0, A0, 0x08
  1857. addi.d B0, B0, 0x10
  1858. beq ZERO, TL, .L_N3_M1_TL1_END
  1859. .L_N3_M1_TL1: /* TL-- */
  1860. KERNEL8x1x2
  1861. addi.d TL, TL, -1 /* TL-- */
  1862. blt ZERO,TL, .L_N3_M1_TL1
  1863. .L_N3_M1_TL1_END:
  1864. KERNEL8x1x2_END
  1865. .L_N3_M1_L7:
  1866. /* if (!(L & 7)) goto L_N3_M1_L0 */
  1867. andi TL, L, 7
  1868. beq TL, ZERO,.L_N3_M1_L0
  1869. .L_N3_M1_L71:
  1870. vldrepl.d U0, A0, 0x00
  1871. vld U4, B0, 0x00
  1872. vfmadd.d D0, U0, U4, D0
  1873. /* Add stride for A0, B0 */
  1874. addi.d A0, A0, 0x08
  1875. addi.d B0, B0, 0x10
  1876. addi.d TL, TL, -1
  1877. blt ZERO,TL, .L_N3_M1_L71
  1878. .L_N3_M1_L0:
  1879. #if defined(TRMMKERNEL)
  1880. vfmul.d D0, D0, VALPHA
  1881. #else
  1882. /* Load C0 */
  1883. vld U0, C0, 0x00
  1884. vld U1, C1, 0x00
  1885. vilvl.d U2, U1, U0
  1886. vfmadd.d D0, D0, VALPHA, U2
  1887. #endif // #if defined(TRMMKERNEL)
  1888. vstelm.d D0, C0, 0x00, 0x00
  1889. vstelm.d D0, C1, 0x00, 0x01
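/*
 * Illustrative view of this M == 1 tail: in the non-TRMM path, vilvl.d packs
 * C0[0] and C1[0] into lanes 0 and 1 of U2 so a single vfmadd.d applies alpha
 * and the accumulation to both columns at once; vstelm.d then scatters lane 0
 * back to C0 and lane 1 back to C1.
 */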
  1890. /* Add stride for C */
  1891. addi.d C0, C0, 0x08
  1892. addi.d C1, C1, 0x08
  1893. #if defined(TRMMKERNEL)
  1894. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1895. sub.d L, K, OFF
  1896. #ifdef LEFT
  1897. addi.d L, L, -1
  1898. #else
  1899. addi.d L, L, -2
  1900. #endif
  1901. slli.d T0, L, 0x03
  1902. add.d A0, A0, T0
  1903. slli.d T0, L, 0x04
  1904. add.d B0, B0, T0
  1905. #endif
  1906. #ifdef LEFT
  1907. addi.d OFF, OFF, 0x01
  1908. #endif
  1909. #endif // #if defined(TRMMKERNEL)
  1910. /********LOOP (if(N & 2 ) && (M & 1) ) End************/
  1911. .L_N3_M0:
1912. /* Add stride for B and C:
1913. * B += (K * 16)
1914. * C += (LDC * 16)
1915. */
1916. /* Two columns of doubles were consumed,
1917. * so the byte stride is 2 * sizeof(double) = 16 (shift by 4).
1918. */
  1919. slli.d T0, K, 4
  1920. slli.d T1, LDC, 4
  1921. add.d B, B, T0
  1922. add.d C, C, T1
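/*
 * Worked example (illustrative, hypothetical sizes): with K = 128 and
 * LDC = 256 doubles, B advances by 128 * 16 = 2048 bytes (two packed
 * columns) and C advances by 256 * 16 = 4096 bytes (two output columns).
 */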
  1923. #if defined(TRMMKERNEL) && !defined(LEFT)
  1924. addi.d OFF, OFF, 0x02
  1925. #endif
  1926. /* We must reinit I */
  1927. srai.d I, M, 4 /* I = bm >> 4 */
  1928. /************************* Condition 2 if((N & 2) && (M >> 3)) End !!! *************************
1929. * dgemm_core_8x2 */
  1930. .L_N1:
  1931. andi J, N, 1
  1932. beq ZERO, J, .L_N0
  1933. /************************* Condition 3 if((N & 1) && (M >> 3)) START !!! *************************
1934. * dgemm_core_8x1 */
  1935. move C0, C
  1936. move A0, A
  1937. #if defined(TRMMKERNEL) && defined(LEFT)
  1938. move OFF, OFFSET
  1939. #endif
  1940. /* if (!(M >> 3)) goto L_N1_M8 */
  1941. srai.d I, M, 3 /* I = bm >> 3 */
  1942. beq ZERO, I, .L_N1_M8
  1943. .L_N1_I1:
  1944. #if defined(TRMMKERNEL)
  1945. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1946. move B0, B
  1947. #else
  1948. slli.d T0, OFF, 0x06
  1949. add.d A0, A0, T0
  1950. slli.d T0, OFF, 0x03
  1951. add.d B0, B, T0
  1952. #endif
  1953. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1954. sub.d L, K, OFF
  1955. #elif defined(LEFT)
  1956. /* number of values in A */
  1957. addi.d L, OFF, 8
  1958. #else
  1959. /* number of values in B */
  1960. addi.d L, OFF, 1
  1961. #endif
  1962. #else // #if !defined(TRMMKERNEL)
  1963. move B0, B
  1964. move L, K /* L = bk */
  1965. #endif
  1966. /* Load 8 * 64 from A0
1967. * U0 = {a1, a0}
1968. * U1 = {a3, a2}
  1969. * U2 = {a5, a4}
  1970. * U3 = {a7, a6}
  1971. */
  1972. vld U0, A0, 0x00
  1973. vld U1, A0, 0x10
  1974. vld U2, A0, 0x20
  1975. vld U3, A0, 0x30
  1976. vldrepl.d U4, B0, 0x00
  1977. /* line 1 */
  1978. vfmul.d D0, U0, U4
  1979. vfmul.d D1, U1, U4
  1980. vfmul.d D2, U2, U4
  1981. vfmul.d D3, U3, U4
  1982. /* Add stride for A0 and B0 */
  1983. addi.d A0, A0, 0x40
  1984. addi.d B0, B0, 0x08
  1985. /* Reduce L */
  1986. addi.d L, L, -1
  1987. srai.d TL, L, 3 /* TL = (L-1) >> 3 */
  1988. /* if (TL < 1) goto L_N1_L7 */
  1989. beq ZERO,TL, .L_N1_L7
  1990. vld U8, A0, 0x00
  1991. vld U9, A0, 0x10
  1992. vld U10, A0, 0x20
  1993. vld U11, A0, 0x30
  1994. addi.d TL, TL, -1
  1995. vldrepl.d U12, B0, 0x00
  1996. addi.d A0, A0, 0x40
  1997. addi.d B0, B0, 0x08
  1998. beq ZERO, TL, .L_N1_TL1_END
  1999. .L_N1_TL1: /* TL-- */
  2000. KERNEL8x8x1
  2001. addi.d TL, TL, -1 /* TL-- */
  2002. blt ZERO,TL, .L_N1_TL1
  2003. .L_N1_TL1_END:
  2004. KERNEL8x8x1_END
  2005. .L_N1_L7:
  2006. /* if (!(L & 7)) goto L_N1_L0 */
  2007. andi TL, L, 7
  2008. beq TL, ZERO,.L_N1_L0
  2009. .L_N1_L71:
2010. /* Load 8 * 64 from A0 */
  2011. vld U0, A0, 0x00
  2012. vld U1, A0, 0x10
  2013. vld U2, A0, 0x20
  2014. vld U3, A0, 0x30
  2015. vldrepl.d U4, B0, 0x00
  2016. vfmadd.d D0, U0, U4, D0
  2017. vfmadd.d D1, U1, U4, D1
  2018. vfmadd.d D2, U2, U4, D2
  2019. vfmadd.d D3, U3, U4, D3
  2020. /* Add stride for A0, B0 */
  2021. addi.d A0, A0, 0x40
  2022. addi.d B0, B0, 0x08
  2023. addi.d TL, TL, -1
  2024. blt ZERO,TL, .L_N1_L71
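/*
 * Reference sketch of one .L_N1_L71 pass (illustrative C, placeholder names):
 *   for (i = 0; i < 8; i++) acc[i] += a[i] * b[0];   // single B column
 *   a += 8; b += 1;
 */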
  2025. .L_N1_L0:
  2026. #if defined(TRMMKERNEL)
  2027. vfmul.d D0, D0, VALPHA
  2028. vfmul.d D1, D1, VALPHA
  2029. vfmul.d D2, D2, VALPHA
  2030. vfmul.d D3, D3, VALPHA
  2031. #else
  2032. /* Load C0 */
  2033. vld U0, C0, 0x00
  2034. vld U1, C0, 0x10
  2035. vld U2, C0, 0x20
  2036. vld U3, C0, 0x30
  2037. vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */
  2038. vfmadd.d D1, D1, VALPHA, U1
  2039. vfmadd.d D2, D2, VALPHA, U2
  2040. vfmadd.d D3, D3, VALPHA, U3
  2041. #endif // #if defined(TRMMKERNEL)
  2042. /* Store C0 */
  2043. vst D0, C0, 0x00
  2044. vst D1, C0, 0x10
  2045. vst D2, C0, 0x20
  2046. vst D3, C0, 0x30
  2047. /* Add stride for C */
  2048. addi.d C0, C0, 0x40
  2049. #if defined(TRMMKERNEL)
  2050. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  2051. sub.d L, K, OFF
  2052. #ifdef LEFT
  2053. addi.d L, L, -8
  2054. #else
  2055. addi.d L, L, -1
  2056. #endif
  2057. slli.d T0, L, 0x06
  2058. add.d A0, A0, T0
  2059. slli.d T0, L, 0x03
  2060. add.d B0, B0, T0
  2061. #endif
  2062. #ifdef LEFT
  2063. addi.d OFF, OFF, 0x8
  2064. #endif
  2065. #endif // #if defined(TRMMKERNEL)
  2066. addi.d I, I, -1 /* I-- */
  2067. blt ZERO,I, .L_N1_I1
  2068. .L_N1_M8:
2069. /* The M >> 3 blocks are done; handle the remaining M & 7 (M = 4/2/1) */
  2070. andi I, M, 7
  2071. beq ZERO,I, .L_N1_M0
  2072. andi I, M, 4
  2073. beq ZERO,I, .L_N1_M2
  2074. #if defined(TRMMKERNEL)
  2075. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  2076. move B0, B
  2077. #else
  2078. slli.d T0, OFF, 0x05
  2079. add.d A0, A0, T0
  2080. slli.d T0, OFF, 0x03
  2081. add.d B0, B, T0
  2082. #endif
  2083. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2084. sub.d L, K, OFF
  2085. #elif defined(LEFT)
  2086. /* number of values in A */
  2087. addi.d L, OFF, 4
  2088. #else
  2089. /* number of values in B */
  2090. addi.d L, OFF, 1
  2091. #endif
  2092. #else // #if !defined(TRMMKERNEL)
  2093. move B0, B
  2094. move L, K /* L = bk */
  2095. #endif
  2096. /* Load 4 * 64 from A0 */
  2097. vld U0, A0, 0x00
  2098. vld U1, A0, 0x10
  2099. vldrepl.d U4, B0, 0x00
  2100. /* line 1 */
  2101. vfmul.d D0, U0, U4
  2102. vfmul.d D1, U1, U4
  2103. /* Add stride for A0 and B0 */
  2104. addi.d A0, A0, 0x20
  2105. addi.d B0, B0, 0x08
  2106. /* Reduce L */
  2107. addi.d L, L, -1
  2108. srai.d TL, L, 3 /* TL = (L-1) >> 3 */
  2109. /* if (TL < 1) goto L_N1_M4_L7 */
  2110. beq ZERO,TL, .L_N1_M4_L7
  2111. vld U8, A0, 0x00
  2112. vld U9, A0, 0x10
  2113. addi.d TL, TL, -1
  2114. vldrepl.d U12, B0, 0x00
  2115. addi.d A0, A0, 0x20
  2116. addi.d B0, B0, 0x08
  2117. beq ZERO, TL, .L_N1_M4_TL1_END
  2118. .L_N1_M4_TL1: /* TL-- */
  2119. KERNEL8x4x1
  2120. addi.d TL, TL, -1 /* TL-- */
  2121. blt ZERO,TL, .L_N1_M4_TL1
  2122. .L_N1_M4_TL1_END:
  2123. KERNEL8x4x1_END
  2124. .L_N1_M4_L7:
  2125. /* if (!(L & 7)) goto L_N1_M4_L0 */
  2126. andi TL, L, 7
  2127. beq TL, ZERO,.L_N1_M4_L0
  2128. .L_N1_M4_L71:
  2129. vld U0, A0, 0x00
  2130. vld U1, A0, 0x10
  2131. vldrepl.d U4, B0, 0x00
  2132. vfmadd.d D0, U0, U4, D0
  2133. vfmadd.d D1, U1, U4, D1
  2134. /* Add stride for A0, B0 */
  2135. addi.d A0, A0, 0x20
  2136. addi.d B0, B0, 0x08
  2137. addi.d TL, TL, -1
  2138. blt ZERO,TL, .L_N1_M4_L71
  2139. .L_N1_M4_L0:
  2140. #if defined(TRMMKERNEL)
  2141. vfmul.d D0, D0, VALPHA
  2142. vfmul.d D1, D1, VALPHA
  2143. #else
  2144. /* Load C0 */
  2145. vld U0, C0, 0x00
  2146. vld U1, C0, 0x10
  2147. vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */
  2148. vfmadd.d D1, D1, VALPHA, U1
  2149. #endif // #if defined(TRMMKERNEL)
  2150. /* Store C0 */
  2151. vst D0, C0, 0x00
  2152. vst D1, C0, 0x10
  2153. /* Add stride for C */
  2154. addi.d C0, C0, 0x20
  2155. #if defined(TRMMKERNEL)
  2156. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  2157. sub.d L, K, OFF
  2158. #ifdef LEFT
  2159. addi.d L, L, -4
  2160. #else
  2161. addi.d L, L, -1
  2162. #endif
  2163. slli.d T0, L, 0x05
  2164. add.d A0, A0, T0
  2165. slli.d T0, L, 0x03
  2166. add.d B0, B0, T0
  2167. #endif
  2168. #ifdef LEFT
  2169. addi.d OFF, OFF, 0x04
  2170. #endif
  2171. #endif // #if defined(TRMMKERNEL)
  2172. /********LOOP (if(N & 1) && (M & 4) ) End************/
  2173. .L_N1_M2:
  2174. andi I, M, 2
  2175. beq ZERO,I, .L_N1_M1
  2176. #if defined(TRMMKERNEL)
  2177. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  2178. move B0, B
  2179. #else
  2180. slli.d T0, OFF, 0x04
  2181. add.d A0, A0, T0
  2182. slli.d T0, OFF, 0x03
  2183. add.d B0, B, T0
  2184. #endif
  2185. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2186. sub.d L, K, OFF
  2187. #elif defined(LEFT)
  2188. /* number of values in A */
  2189. addi.d L, OFF, 2
  2190. #else
  2191. /* number of values in B */
  2192. addi.d L, OFF, 1
  2193. #endif
  2194. #else // #if !defined(TRMMKERNEL)
  2195. move B0, B
  2196. move L, K /* L = bk */
  2197. #endif
  2198. /* Load 2 * 64 from A0 */
  2199. vld U0, A0, 0x00
  2200. vldrepl.d U4, B0, 0x00
  2201. /* line 1 */
  2202. vfmul.d D0, U0, U4
  2203. /* Add stride for A0 and B0 */
  2204. addi.d A0, A0, 0x10
  2205. addi.d B0, B0, 0x08
  2206. /* Reduce L */
  2207. addi.d L, L, -1
  2208. srai.d TL, L, 3 /* TL = (L-1) >> 3 */
  2209. /* if (TL < 1) goto L_N1_M2_L7 */
  2210. beq ZERO,TL, .L_N1_M2_L7
  2211. vld U8, A0, 0x00
  2212. addi.d TL, TL, -1
  2213. vldrepl.d U12, B0, 0x00
  2214. addi.d A0, A0, 0x10
  2215. addi.d B0, B0, 0x08
  2216. beq ZERO, TL, .L_N1_M2_TL1_END
  2217. .L_N1_M2_TL1: /* TL-- */
  2218. KERNEL8x2x1
  2219. addi.d TL, TL, -1 /* TL-- */
  2220. blt ZERO,TL, .L_N1_M2_TL1
  2221. .L_N1_M2_TL1_END:
  2222. KERNEL8x2x1_END
  2223. .L_N1_M2_L7:
  2224. /* if (!(L & 7)) goto L_N1_M2_L0 */
  2225. andi TL, L, 7
  2226. beq TL, ZERO,.L_N1_M2_L0
  2227. .L_N1_M2_L71:
  2228. vld U0, A0, 0x00
  2229. vldrepl.d U4, B0, 0x00
  2230. vfmadd.d D0, U0, U4, D0
  2231. /* Add stride for A0, B0 */
  2232. addi.d A0, A0, 0x10
  2233. addi.d B0, B0, 0x08
  2234. addi.d TL, TL, -1
  2235. blt ZERO,TL, .L_N1_M2_L71
  2236. .L_N1_M2_L0:
  2237. #if defined(TRMMKERNEL)
  2238. vfmul.d D0, D0, VALPHA
  2239. #else
  2240. /* Load C0 */
  2241. vld U0, C0, 0x00
  2242. vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */
  2243. #endif // #if defined(TRMMKERNEL)
  2244. vstelm.d D0, C0, 0x00, 0x00
  2245. vstelm.d D0, C0, 0x08, 0x01
  2246. /* Add stride for C */
  2247. addi.d C0, C0, 0x10
  2248. #if defined(TRMMKERNEL)
  2249. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  2250. sub.d L, K, OFF
  2251. #ifdef LEFT
  2252. addi.d L, L, -2
  2253. #else
  2254. addi.d L, L, -1
  2255. #endif
  2256. slli.d T0, L, 0x04
  2257. add.d A0, A0, T0
  2258. slli.d T0, L, 0x03
  2259. add.d B0, B0, T0
  2260. #endif
  2261. #ifdef LEFT
  2262. addi.d OFF, OFF, 0x02
  2263. #endif
  2264. #endif // #if defined(TRMMKERNEL)
  2265. /********LOOP (if(N & 1 ) && (M & 2) ) End************/
  2266. .L_N1_M1:
  2267. andi I, M, 1
  2268. beq ZERO,I, .L_N1_M0
  2269. #if defined(TRMMKERNEL)
  2270. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  2271. move B0, B
  2272. #else
  2273. slli.d T0, OFF, 0x03
  2274. add.d A0, A0, T0
  2275. add.d B0, B, T0
  2276. #endif
  2277. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2278. sub.d L, K, OFF
  2279. #elif defined(LEFT)
  2280. /* number of values in A */
  2281. addi.d L, OFF, 1
  2282. #else
  2283. /* number of values in B */
  2284. addi.d L, OFF, 1
  2285. #endif
  2286. #else // #if !defined(TRMMKERNEL)
  2287. move B0, B
  2288. move L, K /* L = bk */
  2289. #endif
  2290. /* Load 1 * 64 from A0 */
  2291. vldrepl.d U0, A0, 0x00
  2292. vldrepl.d U4, B0, 0x00
  2293. /* line 1 */
  2294. vfmul.d D0, U0, U4
  2295. /* Add stride for A0 and B0 */
  2296. addi.d A0, A0, 0x08
  2297. addi.d B0, B0, 0x08
  2298. /* Reduce L */
  2299. addi.d L, L, -1
  2300. srai.d TL, L, 3 /* TL = (L-1) >> 3 */
  2301. /* if (TL < 1) goto L_N1_M1_L7 */
  2302. beq ZERO,TL, .L_N1_M1_L7
  2303. vldrepl.d U8, A0, 0x00
  2304. addi.d TL, TL, -1
  2305. vldrepl.d U12, B0, 0x00
  2306. addi.d A0, A0, 0x08
  2307. addi.d B0, B0, 0x08
  2308. beq ZERO, TL, .L_N1_M1_TL1_END
  2309. .L_N1_M1_TL1: /* TL-- */
  2310. KERNEL8x1x1
  2311. addi.d TL, TL, -1 /* TL-- */
  2312. blt ZERO,TL, .L_N1_M1_TL1
  2313. .L_N1_M1_TL1_END:
  2314. KERNEL8x1x1_END
  2315. .L_N1_M1_L7:
  2316. /* if (!(L & 7)) goto L_N1_M1_L0 */
  2317. andi TL, L, 7
  2318. beq TL, ZERO,.L_N1_M1_L0
  2319. .L_N1_M1_L71:
  2320. vldrepl.d U0, A0, 0x00
  2321. vldrepl.d U4, B0, 0x00
  2322. vfmadd.d D0, U0, U4, D0
  2323. /* Add stride for A0, B0 */
  2324. addi.d A0, A0, 0x08
  2325. addi.d B0, B0, 0x08
  2326. addi.d TL, TL, -1
  2327. blt ZERO,TL, .L_N1_M1_L71
  2328. .L_N1_M1_L0:
  2329. #if defined(TRMMKERNEL)
  2330. vfmul.d D0, D0, VALPHA
  2331. #else
  2332. /* Load C0 */
  2333. vldrepl.d U0, C0, 0x00
  2334. vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */
  2335. #endif // #if defined(TRMMKERNEL)
  2336. vstelm.d D0, C0, 0x00, 0x00
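/*
 * Illustrative: for this final 1x1 element every value was loaded with
 * vldrepl.d, so both lanes of D0 should hold the same result and only
 * lane 0 is written back to C0.
 */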
  2337. /* Add stride for C */
  2338. addi.d C0, C0, 0x08
  2339. #if defined(TRMMKERNEL)
  2340. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  2341. sub.d L, K, OFF
  2342. #ifdef LEFT
  2343. addi.d L, L, -1
  2344. #else
  2345. addi.d L, L, -1
  2346. #endif
  2347. slli.d T0, L, 0x03
  2348. add.d A0, A0, T0
  2349. add.d B0, B0, T0
  2350. #endif
  2351. #ifdef LEFT
  2352. addi.d OFF, OFF, 0x01
  2353. #endif
  2354. #endif // #if defined(TRMMKERNEL)
  2355. /********LOOP (if(N & 1 ) && (M & 1) ) End************/
  2356. .L_N1_M0:
  2357. /************************* Condition 3 if((N & 1) && (M >> 3)) End !!! *************************
2358. * dgemm_core_8x1 */
  2359. .L_N0:
  2360. /* Restore regs */
  2361. LDARG $r23, $sp, 0
  2362. LDARG $r24, $sp, 8
  2363. LDARG $r25, $sp, 16
  2364. LDARG $r26, $sp, 24
  2365. LDARG $r27, $sp, 32
  2366. LD $f24, $sp, 40
  2367. LD $f25, $sp, 48
  2368. LD $f26, $sp, 56
  2369. LD $f27, $sp, 64
  2370. LD $f28, $sp, 72
  2371. LD $f29, $sp, 80
  2372. LD $f30, $sp, 88
  2373. LD $f31, $sp, 96
  2374. addi.d $sp, $sp, 112
  2375. jirl $r0, $r1, 0x0
  2376. EPILOGUE