/* dtrmm_kernel_8x4.S (OpenBLAS, AArch64).
 * NOTE(review): a web-scrape banner (repository topic text, file size, and a
 * rendered run of line numbers) preceded this file; it has been removed.
 * The original source begins with the license header below. */
  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
/* x0=bm  x1=bn  x2=bk  d0=alpha0  x3=ba  x4=bb  x5=C  x6=ldc  x7=offset
   (double-precision kernel: alpha arrives in d0, see "fmov alpha, d0" below) */
  30. /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */
  31. #define origM x0
  32. #define origN x1
  33. #define origK x2
  34. #define origPA x3
  35. #define origPB x4
  36. #define pC x5
  37. #define LDC x6
  38. #define offset x7
  39. #define counterL x8
  40. #define counterI x9
  41. #define counterJ x10
  42. #define pB x11
  43. #define pCRow0 x12
  44. #define pCRow1 x13
  45. #define pCRow2 x14
  46. #define pCRow3 x15
  47. #define pA x16
  48. #define alpha x17
  49. #define temp x18
  50. #define tempOffset x19
  51. #define tempK x20
  52. #define alpha0 d10
  53. #define alphaV0 v10.d[0]
  54. #define A_PRE_SIZE 2560
  55. #define B_PRE_SIZE 448
  56. #define C_PRE_SIZE 128
  57. // 00 origM
  58. // 01 origN
  59. // 02 origK
  60. // 03 origPA
  61. // 04 origPB
  62. // 05 pC
  63. // 06 origLDC -> LDC
  64. // 07 offset
  65. // 08 counterL
  66. // 09 counterI
  67. // 10 counterJ
  68. // 11 pB
  69. // 12 pCRow0
  70. // 13 pCRow1
  71. // 14 pCRow2
// 15 pCRow3
// 16 pA
// 17 alpha
// 18 temp          (must save)
// 19 tempOffset    (must save)
// 20 tempK         (must save)
  78. // 21 must save
  79. // 22 must save
  80. // 23 must save
  81. // 24 must save
  82. // 25 must save
  83. // 26 must save
  84. // 27 must save
  85. // 28 must save
  86. // 29 frame
  87. // 30 link
  88. // 31 sp
  89. //v00 ALPHA -> pA0_0, pA0_1
  90. //v01 pA0_2, pA0_3
  91. //v02 pA0_4, pA0_5
  92. //v03 pA0_6, pA0_7
  93. //v04 pA1_0, pA1_1
  94. //v05 pA1_2, pA1_3
  95. //v06 pA1_4, pA1_5
  96. //v07 pA1_6, pA1_7
  97. //v08 must save pB0_0
  98. //v09 must save pB0_1
  99. //v10 must save pB0_2 --> ALPHA0
  100. //v11 must save pB0_3
  101. //v12 must save pB1_0
  102. //v13 must save pB1_1
  103. //v14 must save pB1_2
  104. //v15 must save pB1_3
  105. //v16 must save C00, C01
  106. //v17 must save C02, C03
  107. //v18 C04, C05
  108. //v19 C06, C07
  109. //v20 C10, C11
  110. //v21 C12, C13
  111. //v22 C14, C15
  112. //v23 C16, C17
  113. //v24 C20, C21
  114. //v25 C22, C23
  115. //v26 C24, C25
  116. //v27 C26, C27
  117. //v28 C30, C31
  118. //v29 C32, C33
  119. //v30 C34, C35
  120. //v31 C36, C37
  121. /*******************************************************************************
  122. * Macro definitions
  123. *******************************************************************************/
/* INIT8x4: zero the sixteen accumulators v16-v31 that hold an 8x4 tile of C.
 * Writing a D register with fmov also clears bits [127:64] of the vector
 * register, so each fmov zeroes the whole 128-bit accumulator.
 * The mix of "fmov dN, xzr" and copies of an already-zeroed register is a
 * deliberate instruction mix (presumably to spread the work across issue
 * ports — behaviorally all sixteen registers end up zero). */
.macro INIT8x4
    fmov d16, xzr
    fmov d17, xzr
    fmov d18, d16
    fmov d19, xzr
    fmov d20, xzr
    fmov d21, d16
    fmov d22, d17
    fmov d23, d18
    fmov d24, xzr
    fmov d25, d16
    fmov d26, d17
    fmov d27, d18
    fmov d28, xzr
    fmov d29, d16
    fmov d30, d17
    fmov d31, d18
.endm
/* KERNEL8x4_I: first stage of the software-pipelined 8x4 inner kernel.
 * Loads the current A column (8 doubles -> v0-v3) and B row (4 scalars ->
 * d8-d11), initializes all 16 accumulators with fmul (not fmla, so no prior
 * zeroing of v16-v31 is needed), and preloads the NEXT stage's operands
 * (A -> v4-v7, B -> d12-d15) for KERNEL8x4_M2 to consume.
 * The load/multiply interleave is deliberate latency hiding — do not reorder. */
.macro KERNEL8x4_I
    ldp q0, q1, [pA], #32              // A[0..3] of current stage
    ldp d8, d9, [pB], #16              // B[0], B[1]
    fmul v16.2d, v0.2d, v8.d[0]
    fmul v20.2d, v0.2d, v9.d[0]
    ldp d10, d11, [pB], #16            // B[2], B[3]
    fmul v17.2d, v1.2d, v8.d[0]
    fmul v21.2d, v1.2d, v9.d[0]
    ldp q2, q3, [pA], #32              // A[4..7] of current stage
    fmul v24.2d, v0.2d, v10.d[0]
    fmul v28.2d, v0.2d, v11.d[0]
    ldp q4, q5, [pA], #32              // next stage A[0..3]
    fmul v25.2d, v1.2d, v10.d[0]
    fmul v29.2d, v1.2d, v11.d[0]
    ldp d12, d13, [pB], #16            // next stage B[0], B[1]
    fmul v18.2d, v2.2d, v8.d[0]
    fmul v22.2d, v2.2d, v9.d[0]
    ldp d14, d15, [pB], #16            // next stage B[2], B[3]
    fmul v26.2d, v2.2d, v10.d[0]
    fmul v30.2d, v2.2d, v11.d[0]
    ldp q6, q7, [pA], #32              // next stage A[4..7]
    fmul v19.2d, v3.2d, v8.d[0]
    fmul v27.2d, v3.2d, v10.d[0]
    prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
    fmul v31.2d, v3.2d, v11.d[0]
    fmul v23.2d, v3.2d, v9.d[0]
    prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm
/* KERNEL8x4_M1: "even" stage of the pipelined kernel.
 * Accumulates with the operands loaded by the previous M2/I stage
 * (A in v0-v3, B in d8-d11) while loading the next stage's operands
 * into v4-v7 / d12-d15 for KERNEL8x4_M2.
 * Interleave of fmla / ldp / prfm is deliberate scheduling — do not reorder. */
.macro KERNEL8x4_M1
    fmla v16.2d, v0.2d, v8.d[0]
    fmla v20.2d, v0.2d, v9.d[0]
    ldp q4, q5, [pA], #32              // next stage A[0..3]
    fmla v24.2d, v0.2d, v10.d[0]
    fmla v28.2d, v0.2d, v11.d[0]
    ldp d12, d13, [pB], #16            // next stage B[0], B[1]
    fmla v17.2d, v1.2d, v8.d[0]
    fmla v25.2d, v1.2d, v10.d[0]
    prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
    fmla v21.2d, v1.2d, v9.d[0]
    fmla v29.2d, v1.2d, v11.d[0]
    ldp d14, d15, [pB], #16            // next stage B[2], B[3]
    fmla v18.2d, v2.2d, v8.d[0]
    fmla v22.2d, v2.2d, v9.d[0]
    prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
    fmla v26.2d, v2.2d, v10.d[0]
    fmla v30.2d, v2.2d, v11.d[0]
    fmla v19.2d, v3.2d, v8.d[0]
    fmla v23.2d, v3.2d, v9.d[0]
    ldp q6, q7, [pA], #32              // next stage A[4..7]
    fmla v27.2d, v3.2d, v10.d[0]
    fmla v31.2d, v3.2d, v11.d[0]
.endm
/* KERNEL8x4_M2: "odd" stage of the pipelined kernel.
 * Accumulates with the operands loaded by the previous M1/I stage
 * (A in v4-v7, B in d12-d15) while reloading the next even stage's
 * operands into v0-v3 / d8-d11 and prefetching ahead in B.
 * Interleave is deliberate scheduling — do not reorder. */
.macro KERNEL8x4_M2
    fmla v16.2d, v4.2d, v12.d[0]
    fmla v20.2d, v4.2d, v13.d[0]
    fmla v24.2d, v4.2d, v14.d[0]
    fmla v28.2d, v4.2d, v15.d[0]
    ldp q0, q1, [pA], #32              // next stage A[0..3]
    fmla v17.2d, v5.2d, v12.d[0]
    fmla v25.2d, v5.2d, v14.d[0]
    ldp d8, d9, [pB], #16              // next stage B[0], B[1]
    fmla v21.2d, v5.2d, v13.d[0]
    fmla v29.2d, v5.2d, v15.d[0]
    ldp d10, d11, [pB], #16            // next stage B[2], B[3]
    fmla v18.2d, v6.2d, v12.d[0]
    fmla v22.2d, v6.2d, v13.d[0]
    prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
    fmla v26.2d, v6.2d, v14.d[0]
    fmla v30.2d, v6.2d, v15.d[0]
    fmla v19.2d, v7.2d, v12.d[0]
    fmla v23.2d, v7.2d, v13.d[0]
    ldp q2, q3, [pA], #32              // next stage A[4..7]
    fmla v27.2d, v7.2d, v14.d[0]
    fmla v31.2d, v7.2d, v15.d[0]
.endm
/* KERNEL8x4_E: drain (epilogue) stage of the pipeline.
 * Consumes the final odd-stage operands (A in v4-v7, B in d12-d15)
 * without issuing any further loads; pA/pB are left past the consumed data. */
.macro KERNEL8x4_E
    fmla v16.2d, v4.2d, v12.d[0]
    fmla v20.2d, v4.2d, v13.d[0]
    fmla v24.2d, v4.2d, v14.d[0]
    fmla v28.2d, v4.2d, v15.d[0]
    fmla v17.2d, v5.2d, v12.d[0]
    fmla v25.2d, v5.2d, v14.d[0]
    fmla v21.2d, v5.2d, v13.d[0]
    fmla v29.2d, v5.2d, v15.d[0]
    prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
    fmla v18.2d, v6.2d, v12.d[0]
    fmla v22.2d, v6.2d, v13.d[0]
    fmla v26.2d, v6.2d, v14.d[0]
    fmla v30.2d, v6.2d, v15.d[0]
    fmla v19.2d, v7.2d, v12.d[0]
    fmla v23.2d, v7.2d, v13.d[0]
    fmla v27.2d, v7.2d, v14.d[0]
    fmla v31.2d, v7.2d, v15.d[0]
.endm
/* KERNEL8x4_SUB: standalone (non-pipelined) single K-iteration of the 8x4
 * kernel, used for the K%8 remainder loop. Loads one A column (v0-v3) and
 * one B row (d8-d11) and accumulates all 16 rank-1 products into v16-v31.
 * Load/FMA interleave and prefetches mirror the pipelined stages. */
.macro KERNEL8x4_SUB
    ldp q0, q1, [pA], #32              // A[0..3]
    ldp d8, d9, [pB], #16              // B[0], B[1]
    fmla v16.2d, v0.2d, v8.d[0]
    fmla v20.2d, v0.2d, v9.d[0]
    ldp d10, d11, [pB], #16            // B[2], B[3]
    fmla v17.2d, v1.2d, v8.d[0]
    fmla v21.2d, v1.2d, v9.d[0]
    ldp q2, q3, [pA], #32              // A[4..7]
    fmla v24.2d, v0.2d, v10.d[0]
    fmla v28.2d, v0.2d, v11.d[0]
    fmla v25.2d, v1.2d, v10.d[0]
    fmla v29.2d, v1.2d, v11.d[0]
    prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
    fmla v18.2d, v2.2d, v8.d[0]
    fmla v22.2d, v2.2d, v9.d[0]
    prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
    fmla v26.2d, v2.2d, v10.d[0]
    fmla v30.2d, v2.2d, v11.d[0]
    prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
    fmla v19.2d, v3.2d, v8.d[0]
    fmla v27.2d, v3.2d, v10.d[0]
    fmla v31.2d, v3.2d, v11.d[0]
    fmla v23.2d, v3.2d, v9.d[0]
.endm
/* SAVE8x4: write the 8x4 result tile: C[i][j] = alpha * acc.
 * TRMM semantics: C is overwritten, not read-accumulated (no loads from C).
 * alpha is kept in GP register x17 across the kernel (all 32 v-registers are
 * in use) and moved into d10 (alpha0/alphaV0) here.
 * Each row stores 8 doubles as two 32-byte stp pairs; pCRow0..pCRow3 each
 * advance by 64 bytes total. Prefetches pull upcoming C lines into L2. */
.macro SAVE8x4
    fmov alpha0, alpha                  // d10 = alpha (bit pattern from x17)
    prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
    fmul v0.2d, v16.2d, alphaV0
    fmul v1.2d, v17.2d, alphaV0
    stp q0, q1, [pCRow0]               // row 0, cols 0..3
    add pCRow0, pCRow0, #32
    prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
    fmul v2.2d, v18.2d, alphaV0
    fmul v3.2d, v19.2d, alphaV0
    stp q2, q3, [pCRow0]               // row 0, cols 4..7
    add pCRow0, pCRow0, #32
    prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
    fmul v4.2d, v20.2d, alphaV0
    fmul v5.2d, v21.2d, alphaV0
    stp q4, q5, [pCRow1]               // row 1, cols 0..3
    add pCRow1, pCRow1, #32
    prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
    fmul v6.2d, v22.2d, alphaV0
    fmul v7.2d, v23.2d, alphaV0
    stp q6, q7, [pCRow1]               // row 1, cols 4..7
    add pCRow1, pCRow1, #32
    prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
    fmul v0.2d, v24.2d, alphaV0
    fmul v1.2d, v25.2d, alphaV0
    stp q0, q1, [pCRow2]               // row 2, cols 0..3
    add pCRow2, pCRow2, #32
    prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
    fmul v2.2d, v26.2d, alphaV0
    fmul v3.2d, v27.2d, alphaV0
    stp q2, q3, [pCRow2]               // row 2, cols 4..7
    add pCRow2, pCRow2, #32
    prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
    fmul v4.2d, v28.2d, alphaV0
    fmul v5.2d, v29.2d, alphaV0
    stp q4, q5, [pCRow3]               // row 3, cols 0..3
    add pCRow3, pCRow3, #32
    prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
    fmul v6.2d, v30.2d, alphaV0
    fmul v7.2d, v31.2d, alphaV0
    stp q6, q7, [pCRow3]               // row 3, cols 4..7
    add pCRow3, pCRow3, #32
.endm
  304. /******************************************************************************/
/* INIT4x4: zero the eight accumulators (v16/v17, v20/v21, v24/v25, v28/v29)
 * holding a 4x4 tile of C. fmov to a D register zeroes the whole 128-bit
 * vector register; the copies chain from already-zeroed registers. */
.macro INIT4x4
    fmov d16, xzr
    fmov d17, d16
    fmov d20, d17
    fmov d21, d16
    fmov d24, d17
    fmov d25, d16
    fmov d28, d17
    fmov d29, d16
.endm
/* KERNEL4x4_SUB: one K-iteration of the 4x4 kernel.
 * Loads one B row (4 doubles -> v8,v9) and one A column (4 doubles -> v0,v1),
 * then forms the 4x4 rank-1 outer product with 8 fmla-by-element ops.
 * The scrambled fmla ordering is deliberate scheduling — do not reorder. */
.macro KERNEL4x4_SUB
    ld1 {v8.2d, v9.2d}, [pB]           // B[0..3]
    add pB, pB, #32
    ld1 {v0.2d, v1.2d}, [pA]           // A[0..3]
    add pA, pA, #32
    fmla v16.2d, v0.2d, v8.d[0]
    fmla v29.2d, v1.2d, v9.d[1]
    fmla v20.2d, v0.2d, v8.d[1]
    fmla v25.2d, v1.2d, v9.d[0]
    fmla v24.2d, v0.2d, v9.d[0]
    fmla v21.2d, v1.2d, v8.d[1]
    fmla v28.2d, v0.2d, v9.d[1]
    fmla v17.2d, v1.2d, v8.d[0]
.endm
/* SAVE4x4: write the 4x4 tile: C[i][j] = alpha * acc (C is overwritten;
 * TRMM does not read C). Rows are addressed by repeatedly adding LDC;
 * note pCRow1 is reused for both row 1 and row 3. pCRow0 advances by
 * 32 bytes (4 doubles) for the next column block. */
.macro SAVE4x4
    fmov alpha0, alpha                  // d10 = alpha
    fmul v8.2d, v16.2d, alphaV0
    fmul v9.2d, v17.2d, alphaV0
    st1 {v8.2d, v9.2d}, [pCRow0]       // row 0
    add pCRow1, pCRow0, LDC
    fmul v12.2d, v20.2d, alphaV0
    fmul v13.2d, v21.2d, alphaV0
    st1 {v12.2d, v13.2d}, [pCRow1]     // row 1
    add pCRow2, pCRow1, LDC
    fmul v8.2d, v24.2d, alphaV0
    fmul v9.2d, v25.2d, alphaV0
    st1 {v8.2d, v9.2d}, [pCRow2]       // row 2
    add pCRow1, pCRow2, LDC            // pCRow1 reused for row 3
    fmul v12.2d, v28.2d, alphaV0
    fmul v13.2d, v29.2d, alphaV0
    st1 {v12.2d, v13.2d}, [pCRow1]     // row 3
    add pCRow0, pCRow0, #32
.endm
  348. /******************************************************************************/
/* INIT2x4: zero the four accumulators (v16, v20, v24, v28) for a 2x4 tile. */
.macro INIT2x4
    fmov d16, xzr
    fmov d20, d16
    fmov d24, d20
    fmov d28, d16
.endm
/* KERNEL2x4_SUB: one K-iteration of the 2x4 kernel — one A pair (v0)
 * against a full B row (v8,v9), four fmla-by-element updates. */
.macro KERNEL2x4_SUB
    ld1 {v8.2d, v9.2d}, [pB]           // B[0..3]
    add pB, pB, #32
    ld1 {v0.2d}, [pA]                  // A[0..1]
    add pA, pA, #16
    fmla v16.2d, v0.2d, v8.d[0]
    fmla v20.2d, v0.2d, v8.d[1]
    fmla v24.2d, v0.2d, v9.d[0]
    fmla v28.2d, v0.2d, v9.d[1]
.endm
/* SAVE2x4: write the 2x4 tile: C = alpha * acc (overwrite, no C read).
 * Rows via successive +LDC; pCRow1 reused for rows 1 and 3;
 * pCRow0 advances 16 bytes (2 doubles). */
.macro SAVE2x4
    fmov alpha0, alpha                  // d10 = alpha
    fmul v8.2d, v16.2d, alphaV0
    st1 {v8.2d}, [pCRow0]              // row 0
    add pCRow1, pCRow0, LDC
    fmul v12.2d, v20.2d, alphaV0
    st1 {v12.2d}, [pCRow1]             // row 1
    add pCRow2, pCRow1, LDC
    fmul v8.2d, v24.2d, alphaV0
    st1 {v8.2d}, [pCRow2]              // row 2
    add pCRow1, pCRow2, LDC            // pCRow1 reused for row 3
    fmul v12.2d, v28.2d, alphaV0
    st1 {v12.2d}, [pCRow1]             // row 3
    add pCRow0, pCRow0, #16
.endm
  380. /******************************************************************************/
/* INIT1x4: zero the two accumulators (v16, v20); each 2d vector holds
 * results for two of the four C rows. */
.macro INIT1x4
    fmov d16, xzr
    fmov d20, d16
.endm
/* KERNEL1x4_SUB: one K-iteration of the 1x4 kernel — one A scalar (d0)
 * broadcast against a full B row (v8,v9). Note the operand roles are
 * swapped vs. the wider kernels: B is the vector operand here. */
.macro KERNEL1x4_SUB
    ldr d0, [pA]                       // A[0]
    add pA, pA, #8
    ld1 {v8.2d, v9.2d}, [pB]           // B[0..3]
    add pB, pB, #32
    fmla v16.2d, v8.2d, v0.d[0]
    fmla v20.2d, v9.2d, v0.d[0]
.endm
/* SAVE1x4: scale by alpha and scatter one double to each of the four C rows
 * via lane stores; rows addressed by chained +LDC (pCRow1 reused for rows 1
 * and 3); pCRow0 advances 8 bytes. C is overwritten (no read). */
.macro SAVE1x4
    fmov alpha0, alpha                  // d10 = alpha
    add pCRow1, pCRow0, LDC
    fmul v8.2d, v16.2d, alphaV0
    st1 {v8.d}[0], [pCRow0]            // row 0
    st1 {v8.d}[1], [pCRow1]            // row 1
    add pCRow2, pCRow1, LDC
    add pCRow1, pCRow2, LDC
    fmul v12.2d, v20.2d, alphaV0
    st1 {v12.d}[0], [pCRow2]           // row 2
    st1 {v12.d}[1], [pCRow1]           // row 3
    add pCRow0, pCRow0, #8
.endm
  406. /******************************************************************************/
/* INIT8x2: zero the eight accumulators v16-v23 for an 8x2 tile of C. */
.macro INIT8x2
    fmov d16, xzr
    fmov d17, xzr
    fmov d18, d16
    fmov d19, d17
    fmov d20, xzr
    fmov d21, d16
    fmov d22, d17
    fmov d23, d18
.endm
/* KERNEL8x2_SUB: one K-iteration of the 8x2 kernel — a full A column
 * (8 doubles -> v0-v3) against one B pair (v8). */
.macro KERNEL8x2_SUB
    ld1 {v0.2d, v1.2d}, [pA]           // A[0..3]
    add pA, pA, #32
    ld1 {v8.2d}, [pB]                  // B[0..1]
    add pB, pB, #16
    ld1 {v2.2d, v3.2d}, [pA]           // A[4..7]
    add pA, pA, #32
    fmla v16.2d, v0.2d, v8.d[0]
    fmla v17.2d, v1.2d, v8.d[0]
    fmla v18.2d, v2.2d, v8.d[0]
    fmla v19.2d, v3.2d, v8.d[0]
    fmla v20.2d, v0.2d, v8.d[1]
    fmla v21.2d, v1.2d, v8.d[1]
    fmla v22.2d, v2.2d, v8.d[1]
    fmla v23.2d, v3.2d, v8.d[1]
.endm
/* SAVE8x2: write two rows of 8 doubles each: C = alpha * acc (overwrite,
 * no C read); pCRow0 advances 64 bytes. */
.macro SAVE8x2
    fmov alpha0, alpha                  // d10 = alpha
    add pCRow1, pCRow0, LDC
    fmul v0.2d, v16.2d, alphaV0
    fmul v1.2d, v17.2d, alphaV0
    fmul v2.2d, v18.2d, alphaV0
    fmul v3.2d, v19.2d, alphaV0
    st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]   // row 0
    fmul v4.2d, v20.2d, alphaV0
    fmul v5.2d, v21.2d, alphaV0
    fmul v6.2d, v22.2d, alphaV0
    fmul v7.2d, v23.2d, alphaV0
    st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]   // row 1
    add pCRow0, pCRow0, #64
.endm
  448. /******************************************************************************/
/* INIT4x2: zero the four accumulators (v16, v17, v20, v21) for a 4x2 tile. */
.macro INIT4x2
    fmov d16, xzr
    fmov d17, d16
    fmov d20, d17
    fmov d21, d16
.endm
/* KERNEL4x2_SUB: one K-iteration — A column (v0,v1) against one B pair (v8). */
.macro KERNEL4x2_SUB
    ld1 {v8.2d}, [pB]                  // B[0..1]
    add pB, pB, #16
    ld1 {v0.2d, v1.2d}, [pA]           // A[0..3]
    add pA, pA, #32
    fmla v16.2d, v0.2d, v8.d[0]
    fmla v17.2d, v1.2d, v8.d[0]
    fmla v20.2d, v0.2d, v8.d[1]
    fmla v21.2d, v1.2d, v8.d[1]
.endm
/* SAVE4x2: write two rows of 4 doubles: C = alpha * acc (overwrite);
 * pCRow0 advances 32 bytes. */
.macro SAVE4x2
    fmov alpha0, alpha                  // d10 = alpha
    fmul v8.2d, v16.2d, alphaV0
    fmul v9.2d, v17.2d, alphaV0
    st1 {v8.2d, v9.2d}, [pCRow0]       // row 0
    add pCRow1, pCRow0, LDC
    fmul v12.2d, v20.2d, alphaV0
    fmul v13.2d, v21.2d, alphaV0
    st1 {v12.2d, v13.2d}, [pCRow1]     // row 1
    add pCRow0, pCRow0, #32
.endm
  476. /******************************************************************************/
/* INIT2x2: zero the two accumulators (v16, v20) for a 2x2 tile. */
.macro INIT2x2
    fmov d16, xzr
    fmov d20, d16
.endm
/* KERNEL2x2_SUB: one K-iteration — A pair (v0) against B pair (v8). */
.macro KERNEL2x2_SUB
    ld1 {v8.2d}, [pB]                  // B[0..1]
    add pB, pB, #16
    ld1 {v0.2d}, [pA]                  // A[0..1]
    add pA, pA, #16
    fmla v16.2d, v0.2d, v8.d[0]
    fmla v20.2d, v0.2d, v8.d[1]
.endm
/* SAVE2x2: write two rows of 2 doubles: C = alpha * acc (overwrite);
 * pCRow0 advances 16 bytes. */
.macro SAVE2x2
    fmov alpha0, alpha                  // d10 = alpha
    fmul v8.2d, v16.2d, alphaV0
    st1 {v8.2d}, [pCRow0]              // row 0
    add pCRow1 , pCRow0, LDC
    fmul v12.2d, v20.2d, alphaV0
    st1 {v12.2d}, [pCRow1]             // row 1
    add pCRow0, pCRow0, #16
.endm
  498. /******************************************************************************/
/* INIT1x2: zero the single accumulator v16 (its two lanes hold the two
 * C rows of the 1x2 tile). */
.macro INIT1x2
    fmov d16, xzr
.endm
/* KERNEL1x2_SUB: one K-iteration — one A scalar broadcast against a B pair.
 * B is the vector operand (roles swapped vs. the wider kernels). */
.macro KERNEL1x2_SUB
    ld1 {v8.2d} , [pB]                 // B[0..1]
    add pB , pB, #16
    ldr d0 , [pA]                      // A[0]
    add pA, pA, #8
    fmla v16.2d, v8.2d, v0.d[0]
.endm
/* SAVE1x2: scale by alpha and store one double to each of the two C rows
 * via lane stores; pCRow0 advances 8 bytes. C is overwritten (no read). */
.macro SAVE1x2
    fmov alpha0, alpha                  // d10 = alpha
    add pCRow1 , pCRow0, LDC
    fmul v8.2d, v16.2d, alphaV0
    st1 {v8.d}[0], [pCRow0]            // row 0
    st1 {v8.d}[1], [pCRow1]            // row 1
    add pCRow0, pCRow0, #8
.endm
  517. /******************************************************************************/
/* INIT8x1: zero the four accumulators v16-v19 for an 8x1 tile. */
.macro INIT8x1
    fmov d16, xzr
    fmov d17, xzr
    fmov d18, d16
    fmov d19, d17
.endm
/* KERNEL8x1_SUB: one K-iteration — full A column (v0-v3) against one
 * B scalar (d8) broadcast. */
.macro KERNEL8x1_SUB
    ld1 {v0.2d, v1.2d}, [pA]           // A[0..3]
    add pA , pA, #32
    ldr d8, [pB]                       // B[0]
    add pB , pB, #8
    ld1 {v2.2d, v3.2d}, [pA]           // A[4..7]
    add pA, pA, #32
    fmla v16.2d, v0.2d, v8.d[0]
    fmla v17.2d, v1.2d, v8.d[0]
    fmla v18.2d, v2.2d, v8.d[0]
    fmla v19.2d, v3.2d, v8.d[0]
.endm
/* SAVE8x1: write one row of 8 doubles: C = alpha * acc (overwrite);
 * pCRow0 advances 64 bytes. */
.macro SAVE8x1
    fmov alpha0, alpha                  // d10 = alpha
    fmul v0.2d, v16.2d, alphaV0
    fmul v1.2d, v17.2d, alphaV0
    fmul v2.2d, v18.2d, alphaV0
    fmul v3.2d, v19.2d, alphaV0
    st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
    add pCRow0, pCRow0, #64
.endm
  545. /******************************************************************************/
/* INIT4x1: zero the two accumulators (v16, v17) for a 4x1 tile. */
.macro INIT4x1
    fmov d16, xzr
    fmov d17, d16
.endm
/* KERNEL4x1_SUB: one K-iteration — A column (v0,v1) against one B scalar. */
.macro KERNEL4x1_SUB
    ldr d8, [pB]                       // B[0]
    add pB , pB, #8
    ld1 {v0.2d, v1.2d}, [pA]           // A[0..3]
    add pA , pA, #32
    fmla v16.2d, v0.2d, v8.d[0]
    fmla v17.2d, v1.2d, v8.d[0]
.endm
/* SAVE4x1: write one row of 4 doubles: C = alpha * acc (overwrite);
 * pCRow0 advances 32 bytes. */
.macro SAVE4x1
    fmov alpha0, alpha                  // d10 = alpha
    fmul v8.2d, v16.2d, alphaV0
    fmul v9.2d, v17.2d, alphaV0
    st1 {v8.2d, v9.2d}, [pCRow0]
    add pCRow0, pCRow0, #32
.endm
  565. /******************************************************************************/
/* INIT2x1: zero the single accumulator v16 for a 2x1 tile. */
.macro INIT2x1
    fmov d16, xzr
.endm
/* KERNEL2x1_SUB: one K-iteration — A pair (v0) against one B scalar. */
.macro KERNEL2x1_SUB
    ldr d8, [pB]                       // B[0]
    add pB , pB, #8
    ld1 {v0.2d}, [pA]                  // A[0..1]
    add pA , pA, #16
    fmla v16.2d, v0.2d, v8.d[0]
.endm
/* SAVE2x1: write one row of 2 doubles: C = alpha * acc (overwrite);
 * pCRow0 advances 16 bytes. */
.macro SAVE2x1
    fmov alpha0, alpha                  // d10 = alpha
    fmul v8.2d, v16.2d, alphaV0
    st1 {v8.2d}, [pCRow0]
    add pCRow0, pCRow0, #16
.endm
  582. /******************************************************************************/
/* INIT1x1: zero the scalar accumulator d16 for the 1x1 remainder. */
.macro INIT1x1
    fmov d16, xzr
.endm
/* KERNEL1x1_SUB: one K-iteration — scalar FMA: d16 += A[0] * B[0]. */
.macro KERNEL1x1_SUB
    ldr d8, [pB]                       // B[0]
    add pB , pB, #8
    ldr d0, [pA]                       // A[0]
    add pA , pA, #8
    fmadd d16, d0, d8, d16
.endm
/* SAVE1x1: write the single result: C[0] = alpha * acc (overwrite);
 * pCRow0 advances 8 bytes. */
.macro SAVE1x1
    fmov alpha0, alpha                  // d10 = alpha
    fmul d8, d16, alpha0
    str d8, [pCRow0]
    add pCRow0, pCRow0, #8
.endm
  599. /*******************************************************************************
  600. * End of macro definitions
  601. *******************************************************************************/
  602. PROLOGUE
  603. .align 5
  604. add sp, sp, #-(11 * 16)
  605. stp d8, d9, [sp, #(0 * 16)]
  606. stp d10, d11, [sp, #(1 * 16)]
  607. stp d12, d13, [sp, #(2 * 16)]
  608. stp d14, d15, [sp, #(3 * 16)]
  609. stp d16, d17, [sp, #(4 * 16)]
  610. stp x18, x19, [sp, #(5 * 16)]
  611. stp x20, x21, [sp, #(6 * 16)]
  612. stp x22, x23, [sp, #(7 * 16)]
  613. stp x24, x25, [sp, #(8 * 16)]
  614. stp x26, x27, [sp, #(9 * 16)]
  615. str x28, [sp, #(10 * 16)]
  616. prfm PLDL1KEEP, [origPB]
  617. prfm PLDL1KEEP, [origPA]
  618. fmov alpha, d0
  619. lsl LDC, LDC, #3 // ldc = ldc * 8
  620. #if !defined(LEFT)
  621. neg tempOffset, offset
  622. #endif
  623. mov pB, origPB
  624. mov counterJ, origN
  625. asr counterJ, counterJ, #2 // J = J / 4
  626. cmp counterJ, #0
  627. ble .Ldtrmm_kernel_L2_BEGIN
  628. /******************************************************************************/
  629. .Ldtrmm_kernel_L4_BEGIN:
  630. mov pCRow0, pC
  631. add pCRow1, pCRow0, LDC
  632. add pCRow2, pCRow1, LDC
  633. add pCRow3, pCRow2, LDC
  634. add pC, pCRow3, LDC
  635. #if defined(LEFT)
  636. mov tempOffset, offset
  637. #endif
  638. mov pA, origPA // pA = start of A array
  639. .Ldtrmm_kernel_L4_M8_BEGIN:
  640. mov counterI, origM
  641. asr counterI, counterI, #3 // counterI = counterI / 8
  642. cmp counterI, #0
  643. ble .Ldtrmm_kernel_L4_M4_BEGIN
  644. .align 5
  645. .Ldtrmm_kernel_L4_M8_20:
  646. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  647. mov pB, origPB
  648. #else
  649. mov pB, origPB
  650. lsl temp, tempOffset, #6
  651. add pA, pA, temp
  652. lsl temp, tempOffset, #5
  653. add pB, pB, temp
  654. #endif
  655. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  656. sub tempK, origK, tempOffset
  657. #elif defined(LEFT)
  658. add tempK, tempOffset, #8
  659. #else
  660. add tempK, tempOffset, #4
  661. #endif
  662. asr counterL , tempK, #3 // L = K / 8
  663. cmp counterL , #2 // is there at least 4 to do?
  664. blt .Ldtrmm_kernel_L4_M8_32
  665. KERNEL8x4_I // do one in the K
  666. KERNEL8x4_M2 // do another in the K
  667. KERNEL8x4_M1
  668. KERNEL8x4_M2
  669. KERNEL8x4_M1
  670. KERNEL8x4_M2
  671. KERNEL8x4_M1
  672. KERNEL8x4_M2
  673. subs counterL, counterL, #2 // subtract 2
  674. ble .Ldtrmm_kernel_L4_M8_22a
  675. .align 5
  676. .Ldtrmm_kernel_L4_M8_22:
  677. KERNEL8x4_M1
  678. KERNEL8x4_M2
  679. KERNEL8x4_M1
  680. KERNEL8x4_M2
  681. KERNEL8x4_M1
  682. KERNEL8x4_M2
  683. KERNEL8x4_M1
  684. KERNEL8x4_M2
  685. subs counterL, counterL, #1
  686. bgt .Ldtrmm_kernel_L4_M8_22
  687. .align 5
  688. .Ldtrmm_kernel_L4_M8_22a:
  689. KERNEL8x4_M1
  690. KERNEL8x4_M2
  691. KERNEL8x4_M1
  692. KERNEL8x4_M2
  693. KERNEL8x4_M1
  694. KERNEL8x4_M2
  695. KERNEL8x4_M1
  696. KERNEL8x4_E
  697. b .Ldtrmm_kernel_L4_M8_44
  698. .align 5
  699. .Ldtrmm_kernel_L4_M8_32:
  700. tst counterL, #1
  701. ble .Ldtrmm_kernel_L4_M8_40
  702. KERNEL8x4_I
  703. KERNEL8x4_M2
  704. KERNEL8x4_M1
  705. KERNEL8x4_M2
  706. KERNEL8x4_M1
  707. KERNEL8x4_M2
  708. KERNEL8x4_M1
  709. KERNEL8x4_E
  710. b .Ldtrmm_kernel_L4_M8_44
  711. .Ldtrmm_kernel_L4_M8_40:
  712. INIT8x4
  713. .Ldtrmm_kernel_L4_M8_44:
  714. ands counterL , tempK, #7
  715. ble .Ldtrmm_kernel_L4_M8_100
  716. .align 5
  717. .Ldtrmm_kernel_L4_M8_46:
  718. KERNEL8x4_SUB
  719. subs counterL, counterL, #1
  720. bne .Ldtrmm_kernel_L4_M8_46
  721. .Ldtrmm_kernel_L4_M8_100:
  722. SAVE8x4
  723. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  724. sub tempK, origK, tempOffset
  725. #if defined(LEFT)
  726. sub tempK, tempK, #8
  727. #else
  728. sub tempK, tempK, #4
  729. #endif
  730. lsl temp, tempK, #6
  731. add pA, pA, temp
  732. lsl temp, tempK, #5
  733. add pB, pB, temp
  734. #endif
  735. #if defined(LEFT)
  736. add tempOffset, tempOffset, #8
  737. #endif
  738. prfm PLDL1KEEP, [pA]
  739. prfm PLDL1KEEP, [pA, #64]
  740. prfm PLDL1KEEP, [origPB]
  741. .Ldtrmm_kernel_L4_M8_END:
  742. subs counterI, counterI, #1
  743. bne .Ldtrmm_kernel_L4_M8_20
  744. .Ldtrmm_kernel_L4_M4_BEGIN:
  745. mov counterI, origM
  746. tst counterI , #7
  747. ble .Ldtrmm_kernel_L4_END
  748. tst counterI, #4
  749. ble .Ldtrmm_kernel_L4_M2_BEGIN
  750. .Ldtrmm_kernel_L4_M4_20:
  751. INIT4x4
  752. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  753. mov pB, origPB
  754. #else
  755. mov pB, origPB
  756. lsl temp, tempOffset, #5
  757. add pB, pB, temp
  758. add pA, pA, temp
  759. #endif
  760. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  761. sub tempK, origK, tempOffset
  762. #elif defined(LEFT)
  763. add tempK, tempOffset, #4
  764. #else
  765. add tempK, tempOffset, #4
  766. #endif
  767. asr counterL , tempK, #3 // counterL = counterL / 8
  768. cmp counterL , #0
  769. ble .Ldtrmm_kernel_L4_M4_40
  770. .Ldtrmm_kernel_L4_M4_22:
  771. KERNEL4x4_SUB
  772. KERNEL4x4_SUB
  773. KERNEL4x4_SUB
  774. KERNEL4x4_SUB
  775. KERNEL4x4_SUB
  776. KERNEL4x4_SUB
  777. KERNEL4x4_SUB
  778. KERNEL4x4_SUB
  779. subs counterL, counterL, #1
  780. bgt .Ldtrmm_kernel_L4_M4_22
  781. .Ldtrmm_kernel_L4_M4_40:
  782. ands counterL , tempK, #7 // counterL = counterL % 8
  783. ble .Ldtrmm_kernel_L4_M4_100
  784. .Ldtrmm_kernel_L4_M4_42:
  785. KERNEL4x4_SUB
  786. subs counterL, counterL, #1
  787. bgt .Ldtrmm_kernel_L4_M4_42
  788. .Ldtrmm_kernel_L4_M4_100:
  789. SAVE4x4
  790. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  791. sub tempK, origK, tempOffset
  792. #if defined(LEFT)
  793. sub tempK, tempK, #4
  794. #else
  795. sub tempK, tempK, #4
  796. #endif
  797. lsl temp, tempK, #5
  798. add pA, pA, temp
  799. add pB, pB, temp
  800. #endif
  801. #if defined(LEFT)
  802. add tempOffset, tempOffset, #4
  803. #endif
  804. .Ldtrmm_kernel_L4_M4_END:
/* 4-column panel, M-remainder: 2-row tile (2x4). Same structure as the 4x4
   tile; A offsets use #4 (2 doubles per K-step), B offsets use #5 (4 doubles). */
.Ldtrmm_kernel_L4_M2_BEGIN:
	mov	counterI, origM
	tst	counterI , #3			// any rows left below the 4-row tiles (M % 4)?
	ble	.Ldtrmm_kernel_L4_END
	tst	counterI, #2			// does the remainder contain a 2-row tile?
	ble	.Ldtrmm_kernel_L4_M1_BEGIN

.Ldtrmm_kernel_L4_M2_20:
	INIT2x4					// zero the 2x4 accumulators

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB			// tile starts at the beginning of the packed B panel
#else
	mov	pB, origPB
	lsl	temp, tempOffset, #4		// tempOffset * 2 * sizeof(double)
	add	pA, pA, temp			// skip tempOffset K-steps in packed A (2 rows per step)
	lsl	temp, tempOffset, #5		// tempOffset * 4 * sizeof(double)
	add	pB, pB, temp			// skip them in packed B (4 values per step)
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset	// K-iterations for this tile
#elif defined(LEFT)
	add	tempK, tempOffset, #2		// offset + tile rows
#else
	add	tempK, tempOffset, #4		// offset + panel columns
#endif

	asr	counterL , tempK, #3		// counterL = tempK / 8 (8x unrolled loop)
	cmp	counterL , #0
	ble	.Ldtrmm_kernel_L4_M2_40

.Ldtrmm_kernel_L4_M2_22:			// main loop: 8 K-iterations per pass
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB

	subs	counterL, counterL, #1
	bgt	.Ldtrmm_kernel_L4_M2_22

.Ldtrmm_kernel_L4_M2_40:
	ands	counterL , tempK, #7		// counterL = tempK % 8
	ble	.Ldtrmm_kernel_L4_M2_100

.Ldtrmm_kernel_L4_M2_42:
	KERNEL2x4_SUB
	subs	counterL, counterL, #1
	bgt	.Ldtrmm_kernel_L4_M2_42

.Ldtrmm_kernel_L4_M2_100:
	SAVE2x4					// write the alpha-scaled 2x4 tile to C

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset	// K-steps not consumed by this tile
#if defined(LEFT)
	sub	tempK, tempK, #2		// tile rows
#else
	sub	tempK, tempK, #4		// panel columns
#endif
	lsl	temp, tempK, #4			// tempK * 2 * sizeof(double)
	add	pA, pA, temp			// advance pA past the unused tail of the A panel
	lsl	temp, tempK, #5			// tempK * 4 * sizeof(double)
	add	pB, pB, temp			// advance pB likewise
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #2	// next M tile starts 2 rows further down
#endif

.Ldtrmm_kernel_L4_M2_END:
/* 4-column panel, M-remainder: final single row (1x4).
   A offsets use #3 (1 double per K-step), B offsets use #5 (4 doubles). */
.Ldtrmm_kernel_L4_M1_BEGIN:
	tst	counterI, #1			// one odd row left (M % 2)?
	ble	.Ldtrmm_kernel_L4_END

.Ldtrmm_kernel_L4_M1_20:
	INIT1x4					// zero the 1x4 accumulators

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB			// tile starts at the beginning of the packed B panel
#else
	mov	pB, origPB
	lsl	temp, tempOffset, #5		// tempOffset * 4 * sizeof(double)
	add	pB, pB, temp			// skip tempOffset K-steps in packed B
	lsl	temp, tempOffset, #3		// tempOffset * 1 * sizeof(double)
	add	pA, pA, temp			// skip them in packed A (1 row per step)
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset	// K-iterations for this tile
#elif defined(LEFT)
	add	tempK, tempOffset, #1		// offset + tile rows
#else
	add	tempK, tempOffset, #4		// offset + panel columns
#endif

	asr	counterL , tempK, #3		// counterL = tempK / 8 (8x unrolled loop)
	cmp	counterL , #0
	ble	.Ldtrmm_kernel_L4_M1_40

.Ldtrmm_kernel_L4_M1_22:			// main loop: 8 K-iterations per pass
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB

	subs	counterL, counterL, #1
	bgt	.Ldtrmm_kernel_L4_M1_22

.Ldtrmm_kernel_L4_M1_40:
	ands	counterL , tempK, #7		// counterL = tempK % 8
	ble	.Ldtrmm_kernel_L4_M1_100

.Ldtrmm_kernel_L4_M1_42:
	KERNEL1x4_SUB
	subs	counterL, counterL, #1
	bgt	.Ldtrmm_kernel_L4_M1_42

.Ldtrmm_kernel_L4_M1_100:
	SAVE1x4					// write the alpha-scaled 1x4 tile to C

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset	// K-steps not consumed by this tile
#if defined(LEFT)
	sub	tempK, tempK, #1		// tile rows
#else
	sub	tempK, tempK, #4		// panel columns
#endif
	lsl	temp, tempK, #3			// tempK * 1 * sizeof(double)
	add	pA, pA, temp			// advance pA past the unused tail of the A panel
	lsl	temp, tempK, #5			// tempK * 4 * sizeof(double)
	add	pB, pB, temp			// advance pB likewise
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #1	// next M tile starts 1 row further down
#endif
/* End of one 4-column panel: advance B to the next panel and loop over N/4. */
.Ldtrmm_kernel_L4_END:
	lsl	temp, origK, #5
	add	origPB, origPB, temp		// B = B + K * 4 * 8 (next packed 4-column panel)
#if !defined(LEFT)
	add	tempOffset, tempOffset, #4	// RIGHT case: offset grows by panel width each N step
#endif
	subs	counterJ, counterJ , #1		// j--
	bgt	.Ldtrmm_kernel_L4_BEGIN
  935. /******************************************************************************/
/* N-remainder: 2-column panel. Full 8-row tiles first (8x2).
   A offsets use #6 (8 doubles per K-step), B offsets use #4 (2 doubles). */
.Ldtrmm_kernel_L2_BEGIN:			// fewer than 4 columns left in N direction
	mov	counterJ , origN
	tst	counterJ , #3			// any N remainder at all (N % 4)?
	ble	.Ldtrmm_kernel_L999		// none left: go to the common exit
	tst	counterJ , #2			// does the remainder contain a 2-column panel?
	ble	.Ldtrmm_kernel_L1_BEGIN

	mov	pCRow0, pC			// pCRow0 = pC (first of the two output columns)
	add	pC,pC,LDC, lsl #1		// pC += 2 * LDC (past this panel)
#if defined(LEFT)
	mov	tempOffset, offset		// LEFT case: restart the diagonal offset per N panel
#endif
	mov	pA, origPA			// pA = A (restart packed A for the new panel)

.Ldtrmm_kernel_L2_M8_BEGIN:
	mov	counterI, origM
	asr	counterI, counterI, #3		// counterI = M / 8
	cmp	counterI, #0
	ble	.Ldtrmm_kernel_L2_M4_BEGIN

.Ldtrmm_kernel_L2_M8_20:
	INIT8x2					// zero the 8x2 accumulators

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB			// tile starts at the beginning of the packed B panel
#else
	mov	pB, origPB
	lsl	temp, tempOffset, #6		// tempOffset * 8 * sizeof(double)
	add	pA, pA, temp			// skip tempOffset K-steps in packed A (8 rows per step)
	lsl	temp, tempOffset, #4		// tempOffset * 2 * sizeof(double)
	add	pB, pB, temp			// skip them in packed B (2 values per step)
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset	// K-iterations for this tile
#elif defined(LEFT)
	add	tempK, tempOffset, #8		// offset + tile rows
#else
	add	tempK, tempOffset, #2		// offset + panel columns
#endif

	asr	counterL , tempK, #3		// counterL = tempK / 8 (8x unrolled loop)
	cmp	counterL,#0
	ble	.Ldtrmm_kernel_L2_M8_40

	.align 5				// align the hot loop head to a 32-byte boundary

.Ldtrmm_kernel_L2_M8_22:			// main loop: 8 K-iterations per pass
	KERNEL8x2_SUB
	KERNEL8x2_SUB
	KERNEL8x2_SUB
	KERNEL8x2_SUB
	KERNEL8x2_SUB
	KERNEL8x2_SUB
	KERNEL8x2_SUB
	KERNEL8x2_SUB

	subs	counterL, counterL, #1
	bgt	.Ldtrmm_kernel_L2_M8_22

.Ldtrmm_kernel_L2_M8_40:
	ands	counterL , tempK, #7		// counterL = tempK % 8
	ble	.Ldtrmm_kernel_L2_M8_100

.Ldtrmm_kernel_L2_M8_42:
	KERNEL8x2_SUB
	subs	counterL, counterL, #1
	bgt	.Ldtrmm_kernel_L2_M8_42

.Ldtrmm_kernel_L2_M8_100:
	SAVE8x2					// write the alpha-scaled 8x2 tile to C

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset	// K-steps not consumed by this tile
#if defined(LEFT)
	sub	tempK, tempK, #8		// tile rows
#else
	sub	tempK, tempK, #2		// panel columns
#endif
	lsl	temp, tempK, #6			// tempK * 8 * sizeof(double)
	add	pA, pA, temp			// advance pA past the unused tail of the A panel
	lsl	temp, tempK, #4			// tempK * 2 * sizeof(double)
	add	pB, pB, temp			// advance pB likewise
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #8	// next M tile starts 8 rows further down
#endif

.Ldtrmm_kernel_L2_M8_END:
	subs	counterI, counterI, #1
	bgt	.Ldtrmm_kernel_L2_M8_20		// more full 8-row tiles in this panel
/* 2-column panel, M-remainder: 4-row tile (4x2).
   A offsets use #5 (4 doubles per K-step), B offsets use #4 (2 doubles). */
.Ldtrmm_kernel_L2_M4_BEGIN:
	mov	counterI, origM
	tst	counterI , #7			// any remainder rows (M % 8)?
	ble	.Ldtrmm_kernel_L2_END
	tst	counterI, #4			// does the remainder contain a 4-row tile?
	ble	.Ldtrmm_kernel_L2_M2_BEGIN

.Ldtrmm_kernel_L2_M4_20:
	INIT4x2					// zero the 4x2 accumulators

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB			// tile starts at the beginning of the packed B panel
#else
	mov	pB, origPB
	lsl	temp, tempOffset, #4		// tempOffset * 2 * sizeof(double)
	add	pB, pB, temp			// skip tempOffset K-steps in packed B
	lsl	temp, tempOffset, #5		// tempOffset * 4 * sizeof(double)
	add	pA, pA, temp			// skip them in packed A (4 rows per step)
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset	// K-iterations for this tile
#elif defined(LEFT)
	add	tempK, tempOffset, #4		// offset + tile rows
#else
	add	tempK, tempOffset, #2		// offset + panel columns
#endif

	asr	counterL , tempK, #3		// counterL = tempK / 8 (8x unrolled loop)
	cmp	counterL,#0
	ble	.Ldtrmm_kernel_L2_M4_40

	.align 5				// align the hot loop head to a 32-byte boundary

.Ldtrmm_kernel_L2_M4_22:			// main loop: 8 K-iterations per pass
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB

	subs	counterL, counterL, #1
	bgt	.Ldtrmm_kernel_L2_M4_22

.Ldtrmm_kernel_L2_M4_40:
	ands	counterL , tempK, #7		// counterL = tempK % 8
	ble	.Ldtrmm_kernel_L2_M4_100

.Ldtrmm_kernel_L2_M4_42:
	KERNEL4x2_SUB
	subs	counterL, counterL, #1
	bgt	.Ldtrmm_kernel_L2_M4_42

.Ldtrmm_kernel_L2_M4_100:
	SAVE4x2					// write the alpha-scaled 4x2 tile to C

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset	// K-steps not consumed by this tile
#if defined(LEFT)
	sub	tempK, tempK, #4		// tile rows
#else
	sub	tempK, tempK, #2		// panel columns
#endif
	lsl	temp, tempK, #5			// tempK * 4 * sizeof(double)
	add	pA, pA, temp			// advance pA past the unused tail of the A panel
	lsl	temp, tempK, #4			// tempK * 2 * sizeof(double)
	add	pB, pB, temp			// advance pB likewise
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #4	// next M tile starts 4 rows further down
#endif

.Ldtrmm_kernel_L2_M4_END:
/* 2-column panel, M-remainder: 2-row tile (2x2).
   Both A and B offsets use #4 (2 doubles per K-step). */
.Ldtrmm_kernel_L2_M2_BEGIN:
	mov	counterI, origM
	tst	counterI , #3			// any rows left below the 4-row tiles (M % 4)?
	ble	.Ldtrmm_kernel_L2_END
	tst	counterI, #2			// does the remainder contain a 2-row tile?
	ble	.Ldtrmm_kernel_L2_M1_BEGIN

.Ldtrmm_kernel_L2_M2_20:
	INIT2x2					// zero the 2x2 accumulators

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB			// tile starts at the beginning of the packed B panel
#else
	mov	pB, origPB
	lsl	temp, tempOffset, #4		// tempOffset * 2 * sizeof(double)
	add	pB, pB, temp			// skip tempOffset K-steps in packed B
	lsl	temp, tempOffset, #4		// same byte count for A (2 rows per step)
	add	pA, pA, temp
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset	// K-iterations for this tile
#elif defined(LEFT)
	add	tempK, tempOffset, #2		// offset + tile rows
#else
	add	tempK, tempOffset, #2		// offset + panel columns (both are 2 here)
#endif

	asr	counterL , tempK, #3		// counterL = tempK / 8 (8x unrolled loop)
	cmp	counterL,#0
	ble	.Ldtrmm_kernel_L2_M2_40

.Ldtrmm_kernel_L2_M2_22:			// main loop: 8 K-iterations per pass
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB

	subs	counterL, counterL, #1
	bgt	.Ldtrmm_kernel_L2_M2_22

.Ldtrmm_kernel_L2_M2_40:
	ands	counterL , tempK, #7		// counterL = tempK % 8
	ble	.Ldtrmm_kernel_L2_M2_100

.Ldtrmm_kernel_L2_M2_42:
	KERNEL2x2_SUB
	subs	counterL, counterL, #1
	bgt	.Ldtrmm_kernel_L2_M2_42

.Ldtrmm_kernel_L2_M2_100:
	SAVE2x2					// write the alpha-scaled 2x2 tile to C

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset	// K-steps not consumed by this tile
#if defined(LEFT)
	sub	tempK, tempK, #2		// tile rows
#else
	sub	tempK, tempK, #2		// panel columns (same value for 2x2)
#endif
	lsl	temp, tempK, #4			// tempK * 2 * sizeof(double)
	add	pA, pA, temp			// advance pA past the unused tail of the A panel
	lsl	temp, tempK, #4
	add	pB, pB, temp			// advance pB likewise
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #2	// next M tile starts 2 rows further down
#endif

.Ldtrmm_kernel_L2_M2_END:
/* 2-column panel, M-remainder: final single row (1x2).
   A offsets use #3 (1 double per K-step), B offsets use #4 (2 doubles). */
.Ldtrmm_kernel_L2_M1_BEGIN:
	tst	counterI, #1			// one odd row left (M % 2)?
	ble	.Ldtrmm_kernel_L2_END

.Ldtrmm_kernel_L2_M1_20:
	INIT1x2					// zero the 1x2 accumulators

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB			// tile starts at the beginning of the packed B panel
#else
	mov	pB, origPB
	lsl	temp, tempOffset, #4		// tempOffset * 2 * sizeof(double)
	add	pB, pB, temp			// skip tempOffset K-steps in packed B
	lsl	temp, tempOffset, #3		// tempOffset * 1 * sizeof(double)
	add	pA, pA, temp			// skip them in packed A (1 row per step)
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset	// K-iterations for this tile
#elif defined(LEFT)
	add	tempK, tempOffset, #1		// offset + tile rows
#else
	add	tempK, tempOffset, #2		// offset + panel columns
#endif

	asr	counterL , tempK, #3		// counterL = tempK / 8 (8x unrolled loop)
	cmp	counterL, #0
	ble	.Ldtrmm_kernel_L2_M1_40

.Ldtrmm_kernel_L2_M1_22:			// main loop: 8 K-iterations per pass
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB

	subs	counterL, counterL, #1
	bgt	.Ldtrmm_kernel_L2_M1_22

.Ldtrmm_kernel_L2_M1_40:
	ands	counterL , tempK, #7		// counterL = tempK % 8
	ble	.Ldtrmm_kernel_L2_M1_100

.Ldtrmm_kernel_L2_M1_42:
	KERNEL1x2_SUB
	subs	counterL, counterL, #1
	bgt	.Ldtrmm_kernel_L2_M1_42

.Ldtrmm_kernel_L2_M1_100:
	SAVE1x2					// write the alpha-scaled 1x2 tile to C

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset	// K-steps not consumed by this tile
#if defined(LEFT)
	sub	tempK, tempK, #1		// tile rows
#else
	sub	tempK, tempK, #2		// panel columns
#endif
	lsl	temp, tempK, #3			// tempK * 1 * sizeof(double)
	add	pA, pA, temp			// advance pA past the unused tail of the A panel
	lsl	temp, tempK, #4			// tempK * 2 * sizeof(double)
	add	pB, pB, temp			// advance pB likewise
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #1	// next M tile starts 1 row further down
#endif
/* End of the 2-column panel: advance B past it. */
.Ldtrmm_kernel_L2_END:
#if !defined(LEFT)
	add	tempOffset, tempOffset, #2	// RIGHT case: offset grows by panel width
#endif
	add	origPB, origPB, origK, lsl #4	// B = B + K * 2 * 8 (next packed panel)
  1204. /******************************************************************************/
/* N-remainder: final single column. Full 8-row tiles first (8x1).
   A offsets use #6 (8 doubles per K-step), B offsets use #3 (1 double). */
.Ldtrmm_kernel_L1_BEGIN:
	mov	counterJ , origN
	tst	counterJ , #1			// one odd column left (N % 2)?
	ble	.Ldtrmm_kernel_L999		// done: go to the common exit

	mov	pCRow0, pC			// pCRow0 = C (the single output column)
	add	pC , pC , LDC			// update pC to point to the next column
#if defined(LEFT)
	mov	tempOffset, offset		// LEFT case: restart the diagonal offset per N panel
#endif
	mov	pA, origPA			// pA = A (restart packed A for the new panel)

.Ldtrmm_kernel_L1_M8_BEGIN:
	mov	counterI, origM
	asr	counterI, counterI, #3		// counterI = M / 8
	cmp	counterI, #0
	ble	.Ldtrmm_kernel_L1_M4_BEGIN

.Ldtrmm_kernel_L1_M8_20:
	INIT8x1					// zero the 8x1 accumulators

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB			// tile starts at the beginning of the packed B panel
#else
	mov	pB, origPB
	lsl	temp, tempOffset, #6		// tempOffset * 8 * sizeof(double)
	add	pA, pA, temp			// skip tempOffset K-steps in packed A (8 rows per step)
	lsl	temp, tempOffset, #3		// tempOffset * 1 * sizeof(double)
	add	pB, pB, temp			// skip them in packed B
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset	// K-iterations for this tile
#elif defined(LEFT)
	add	tempK, tempOffset, #8		// offset + tile rows
#else
	add	tempK, tempOffset, #1		// offset + panel columns
#endif

	asr	counterL , tempK, #3		// counterL = tempK / 8 (8x unrolled loop)
	cmp	counterL , #0
	ble	.Ldtrmm_kernel_L1_M8_40

	.align 5				// align the hot loop head to a 32-byte boundary

.Ldtrmm_kernel_L1_M8_22:			// main loop: 8 K-iterations per pass
	KERNEL8x1_SUB
	KERNEL8x1_SUB
	KERNEL8x1_SUB
	KERNEL8x1_SUB
	KERNEL8x1_SUB
	KERNEL8x1_SUB
	KERNEL8x1_SUB
	KERNEL8x1_SUB

	subs	counterL, counterL, #1
	bgt	.Ldtrmm_kernel_L1_M8_22

.Ldtrmm_kernel_L1_M8_40:
	ands	counterL , tempK, #7		// counterL = tempK % 8
	ble	.Ldtrmm_kernel_L1_M8_100

.Ldtrmm_kernel_L1_M8_42:
	KERNEL8x1_SUB
	subs	counterL, counterL, #1
	bgt	.Ldtrmm_kernel_L1_M8_42

.Ldtrmm_kernel_L1_M8_100:
	SAVE8x1					// write the alpha-scaled 8x1 tile to C

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset	// K-steps not consumed by this tile
#if defined(LEFT)
	sub	tempK, tempK, #8		// tile rows
#else
	sub	tempK, tempK, #1		// panel columns
#endif
	lsl	temp, tempK, #6			// tempK * 8 * sizeof(double)
	add	pA, pA, temp			// advance pA past the unused tail of the A panel
	lsl	temp, tempK, #3			// tempK * 1 * sizeof(double)
	add	pB, pB, temp			// advance pB likewise
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #8	// next M tile starts 8 rows further down
#endif

.Ldtrmm_kernel_L1_M8_END:
	subs	counterI, counterI, #1
	bgt	.Ldtrmm_kernel_L1_M8_20		// more full 8-row tiles in this panel
/* 1-column panel, M-remainder: 4-row tile (4x1).
   A offsets use #5 (4 doubles per K-step), B offsets use #3 (1 double). */
.Ldtrmm_kernel_L1_M4_BEGIN:
	mov	counterI, origM
	tst	counterI , #7			// any remainder rows (M % 8)?
	ble	.Ldtrmm_kernel_L1_END
	tst	counterI, #4			// does the remainder contain a 4-row tile?
	ble	.Ldtrmm_kernel_L1_M2_BEGIN

.Ldtrmm_kernel_L1_M4_20:
	INIT4x1					// zero the 4x1 accumulators

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB			// tile starts at the beginning of the packed B panel
#else
	mov	pB, origPB
	lsl	temp, tempOffset, #3		// tempOffset * 1 * sizeof(double)
	add	pB, pB, temp			// skip tempOffset K-steps in packed B
	lsl	temp, tempOffset, #5		// tempOffset * 4 * sizeof(double)
	add	pA, pA, temp			// skip them in packed A (4 rows per step)
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset	// K-iterations for this tile
#elif defined(LEFT)
	add	tempK, tempOffset, #4		// offset + tile rows
#else
	add	tempK, tempOffset, #1		// offset + panel columns
#endif

	asr	counterL , tempK, #3		// counterL = tempK / 8 (8x unrolled loop)
	cmp	counterL , #0
	ble	.Ldtrmm_kernel_L1_M4_40

	.align 5				// align the hot loop head to a 32-byte boundary

.Ldtrmm_kernel_L1_M4_22:			// main loop: 8 K-iterations per pass
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB

	subs	counterL, counterL, #1
	bgt	.Ldtrmm_kernel_L1_M4_22

.Ldtrmm_kernel_L1_M4_40:
	ands	counterL , tempK, #7		// counterL = tempK % 8
	ble	.Ldtrmm_kernel_L1_M4_100

.Ldtrmm_kernel_L1_M4_42:
	KERNEL4x1_SUB
	subs	counterL, counterL, #1
	bgt	.Ldtrmm_kernel_L1_M4_42

.Ldtrmm_kernel_L1_M4_100:
	SAVE4x1					// write the alpha-scaled 4x1 tile to C

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset	// K-steps not consumed by this tile
#if defined(LEFT)
	sub	tempK, tempK, #4		// tile rows
#else
	sub	tempK, tempK, #1		// panel columns
#endif
	lsl	temp, tempK, #5			// tempK * 4 * sizeof(double)
	add	pA, pA, temp			// advance pA past the unused tail of the A panel
	lsl	temp, tempK, #3			// tempK * 1 * sizeof(double)
	add	pB, pB, temp			// advance pB likewise
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #4	// next M tile starts 4 rows further down
#endif

.Ldtrmm_kernel_L1_M4_END:
/* 1-column panel, M-remainder: 2-row tile (2x1).
   A offsets use #4 (2 doubles per K-step), B offsets use #3 (1 double). */
.Ldtrmm_kernel_L1_M2_BEGIN:
	mov	counterI, origM
	tst	counterI , #3			// any rows left below the 4-row tiles (M % 4)?
	ble	.Ldtrmm_kernel_L1_END
	tst	counterI, #2			// does the remainder contain a 2-row tile?
	ble	.Ldtrmm_kernel_L1_M1_BEGIN

.Ldtrmm_kernel_L1_M2_20:
	INIT2x1					// zero the 2x1 accumulators

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB			// tile starts at the beginning of the packed B panel
#else
	mov	pB, origPB
	lsl	temp, tempOffset, #3		// tempOffset * 1 * sizeof(double)
	add	pB, pB, temp			// skip tempOffset K-steps in packed B
	lsl	temp, tempOffset, #4		// tempOffset * 2 * sizeof(double)
	add	pA, pA, temp			// skip them in packed A (2 rows per step)
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset	// K-iterations for this tile
#elif defined(LEFT)
	add	tempK, tempOffset, #2		// offset + tile rows
#else
	add	tempK, tempOffset, #1		// offset + panel columns
#endif

	asr	counterL , tempK, #3		// counterL = tempK / 8 (8x unrolled loop)
	cmp	counterL , #0
	ble	.Ldtrmm_kernel_L1_M2_40

.Ldtrmm_kernel_L1_M2_22:			// main loop: 8 K-iterations per pass
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB

	subs	counterL, counterL, #1
	bgt	.Ldtrmm_kernel_L1_M2_22

.Ldtrmm_kernel_L1_M2_40:
	ands	counterL , tempK, #7		// counterL = tempK % 8
	ble	.Ldtrmm_kernel_L1_M2_100

.Ldtrmm_kernel_L1_M2_42:
	KERNEL2x1_SUB
	subs	counterL, counterL, #1
	bgt	.Ldtrmm_kernel_L1_M2_42

.Ldtrmm_kernel_L1_M2_100:
	SAVE2x1					// write the alpha-scaled 2x1 tile to C

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset	// K-steps not consumed by this tile
#if defined(LEFT)
	sub	tempK, tempK, #2		// tile rows
#else
	sub	tempK, tempK, #1		// panel columns
#endif
	lsl	temp, tempK, #4			// tempK * 2 * sizeof(double)
	add	pA, pA, temp			// advance pA past the unused tail of the A panel
	lsl	temp, tempK, #3			// tempK * 1 * sizeof(double)
	add	pB, pB, temp			// advance pB likewise
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #2	// next M tile starts 2 rows further down
#endif

.Ldtrmm_kernel_L1_M2_END:
/* 1-column panel, M-remainder: final single element tile (1x1).
   Both A and B offsets use #3 (1 double per K-step). This is the last tile,
   so no pA/pB/tempOffset bookkeeping is needed after SAVE1x1. */
.Ldtrmm_kernel_L1_M1_BEGIN:
	tst	counterI, #1			// one odd row left (M % 2)?
	ble	.Ldtrmm_kernel_L1_END

.Ldtrmm_kernel_L1_M1_20:
	INIT1x1					// zero the scalar accumulator

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB			// tile starts at the beginning of the packed B panel
#else
	mov	pB, origPB
	lsl	temp, tempOffset, #3		// tempOffset * 1 * sizeof(double)
	add	pB, pB, temp			// skip tempOffset K-steps in packed B
	lsl	temp, tempOffset, #3
	add	pA, pA, temp			// same skip in packed A
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset	// K-iterations for this tile
#elif defined(LEFT)
	add	tempK, tempOffset, #1		// offset + tile rows
#else
	add	tempK, tempOffset, #1		// offset + panel columns (both 1 here)
#endif

	asr	counterL , tempK, #3		// counterL = tempK / 8 (8x unrolled loop)
	cmp	counterL , #0
	ble	.Ldtrmm_kernel_L1_M1_40

.Ldtrmm_kernel_L1_M1_22:			// main loop: 8 K-iterations per pass
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB

	subs	counterL, counterL, #1
	bgt	.Ldtrmm_kernel_L1_M1_22

.Ldtrmm_kernel_L1_M1_40:
	ands	counterL , tempK, #7		// counterL = tempK % 8
	ble	.Ldtrmm_kernel_L1_M1_100

.Ldtrmm_kernel_L1_M1_42:
	KERNEL1x1_SUB
	subs	counterL, counterL, #1
	bgt	.Ldtrmm_kernel_L1_M1_42

.Ldtrmm_kernel_L1_M1_100:
	SAVE1x1					// write the alpha-scaled scalar result to C
/* Common exit: restore the registers spilled by the prologue (which lies
   before this view) and return 0.  The frame layout must mirror the
   prologue's stp sequence exactly: 11 pairs of 16 bytes = 176 bytes. */
.Ldtrmm_kernel_L1_END:

.Ldtrmm_kernel_L999:
	mov	x0, #0				// return value 0 (success)
	ldp	d8, d9, [sp, #(0 * 16)]		// restore callee-saved FP regs d8-d15 (AAPCS64)
	ldp	d10, d11, [sp, #(1 * 16)]
	ldp	d12, d13, [sp, #(2 * 16)]
	ldp	d14, d15, [sp, #(3 * 16)]
	ldp	d16, d17, [sp, #(4 * 16)]	// NOTE(review): d16/d17 are caller-saved under AAPCS64;
						// presumably restored only to mirror the prologue — confirm
	ldp	x18, x19, [sp, #(5 * 16)]	// NOTE(review): x18 is platform-reserved on some OSes;
						// restored to match whatever the prologue saved
	ldp	x20, x21, [sp, #(6 * 16)]
	ldp	x22, x23, [sp, #(7 * 16)]
	ldp	x24, x25, [sp, #(8 * 16)]
	ldp	x26, x27, [sp, #(9 * 16)]
	ldr	x28, [sp, #(10 * 16)]
	add	sp, sp, #(11*16)		// pop the 176-byte spill frame
	ret

	EPILOGUE