You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

strmm_kernel_4x4_vfpv3.S 32 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/23 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. **************************************************************************************/
  34. #define ASSEMBLER
  35. #include "common.h"
  /* Bytes subtracted from sp by the prologue to make room for the slots below. */
  #define STACKSIZE   256

  /* Registers the prologue reads on entry and spills to the stack slots. */
  #define OLD_M       r0
  #define OLD_N       r1
  #define OLD_K       r2
  #define OLD_A       r3
  #define OLD_ALPHA   s0

  /******************************************************
  * [fp, #-128] - [fp, #-32] is reserved
  * for store and restore of floating point
  * registers
  *******************************************************/

  /* Local slots at negative offsets from fp, listed lowest address first. */
  #define ALPHA       [fp, #-280]
  #define A           [fp, #-268]
  #define K           [fp, #-264]
  #define N           [fp, #-260]
  #define M           [fp, #-256]
  #define LDC         [fp, #-252]   /* prologue stores OLD_LDC << 2 here */
  #define KKK         [fp, #-248]
  #define KK          [fp, #-244]
  #define FP_ZERO     [fp, #-240]   /* prologue stores integer 0 here */
  #define FP_ZERO_0   [fp, #-240]   /* same slot as FP_ZERO */
  #define FP_ZERO_1   [fp, #-236]   /* prologue stores integer 0 here */

  /* Slots at positive offsets from fp (presumably the stack-passed arguments). */
  #define B           [fp, #4]
  #define C           [fp, #8]
  #define OLD_LDC     [fp, #12]
  #define OFFSET      [fp, #16]

  /* Loop counters; they reuse r0-r2 once OLD_M/OLD_N/OLD_K have been spilled. */
  #define I           r0
  #define J           r1
  #define L           r2

  /* Working pointers and scratch registers used by the macros and loops. */
  #define AO          r5            /* read pointer into the A data */
  #define BO          r6            /* read pointer into the B data */
  #define K1          r7
  #define CO1         r8            /* first output column pointer */
  #define CO2         r9            /* further output column pointer */
  #define BC          r12

  /* Byte offsets used with pld in the kernels. */
  #define A_PRE       96
  #define B_PRE       96
  #define C_PRE       64
  74. /**************************************************************************************
  75. * Macro definitions
  76. **************************************************************************************/
  /*
   * INIT4x4 - reset the sixteen accumulators s16..s31 of the 4x4 tile.
   * The word at FP_ZERO is loaded into s16 and then copied, in ascending
   * register order, into s17..s31.
   * NOTE(review): the load destination is spelled `S16` (upper case) in the
   * original source; the spelling is kept as it was.
   */
  .macro INIT4x4
          vldr      S16, FP_ZERO
          .irp      acc, s17, s18, s19, s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s30, s31
          vmov.f32  \acc, s16
          .endr
  .endm
  /*
   * KERNEL4x4_I - opening step of the unrolled 4x4 loop.
   * Writes (vmul, not vmla) the 16 products of A values s0-s3 and B values
   * s8-s11 into s16..s31, so no prior INIT4x4 is needed, and meanwhile loads
   * the operands of the following step into s4-s7 (from AO) and s12-s15
   * (from BO).  AO and BO each advance by 32 bytes.
   * The interleaving of loads and multiplies is kept exactly as scheduled.
   */
  .macro KERNEL4x4_I
          vldmia    AO!, { s0 - s1 }
          pld       [ AO, #A_PRE-8 ]
          vldmia    BO!, { s8 - s9 }
          pld       [ BO, #B_PRE-8 ]

          vmul.f32  s16, s0, s8
          vldmia    AO!, { s2 - s3 }
          vmul.f32  s17, s1, s8
          vmul.f32  s18, s2, s8
          vldmia    BO!, { s10 - s11 }
          vmul.f32  s19, s3, s8

          vmul.f32  s20, s0, s9
          vldmia    AO!, { s4 - s5 }
          vmul.f32  s21, s1, s9
          vmul.f32  s22, s2, s9
          vldmia    AO!, { s6 - s7 }
          vmul.f32  s23, s3, s9

          vmul.f32  s24, s0, s10
          vldmia    BO!, { s12 - s13 }
          vmul.f32  s25, s1, s10
          vmul.f32  s26, s2, s10
          vldmia    BO!, { s14 - s15 }
          vmul.f32  s27, s3, s10

          vmul.f32  s28, s0, s11
          vmul.f32  s29, s1, s11
          vmul.f32  s30, s2, s11
          vmul.f32  s31, s3, s11
  .endm
  /*
   * KERNEL4x4_M2 - middle step working on the second operand set.
   * Accumulates the products of A values s4-s7 and B values s12-s15 into
   * s16..s31 while reloading the first operand set: s0-s3 from AO and
   * s8-s11 from BO.  Issues one pld on each stream; AO and BO each advance
   * by 16 bytes.  Instruction order is unchanged.
   */
  .macro KERNEL4x4_M2
          pld       [ AO, #A_PRE ]
          vmla.f32  s16, s4, s12
          vmla.f32  s17, s5, s12
          vldmia    AO!, { s0 - s1 }
          vmla.f32  s18, s6, s12
          pld       [ BO, #B_PRE ]
          vmla.f32  s19, s7, s12

          vmla.f32  s20, s4, s13
          vldmia    AO!, { s2 - s3 }
          vmla.f32  s21, s5, s13
          vmla.f32  s22, s6, s13
          vldmia    BO!, { s8 - s9 }
          vmla.f32  s23, s7, s13

          vmla.f32  s24, s4, s14
          vldmia    BO!, { s10 - s11 }
          vmla.f32  s25, s5, s14
          vmla.f32  s26, s6, s14
          vmla.f32  s27, s7, s14

          vmla.f32  s28, s4, s15
          vmla.f32  s29, s5, s15
          vmla.f32  s30, s6, s15
          vmla.f32  s31, s7, s15
  .endm
  /*
   * KERNEL4x4_M1 - middle step working on the first operand set.
   * Accumulates the products of A values s0-s3 and B values s8-s11 into
   * s16..s31 while loading the second operand set: s4-s7 from AO and
   * s12-s15 from BO.  AO and BO each advance by 16 bytes.
   * Instruction order is unchanged.
   */
  .macro KERNEL4x4_M1
          vmla.f32  s16, s0, s8
          vldmia    AO!, { s4 - s5 }
          vmla.f32  s17, s1, s8
          vmla.f32  s18, s2, s8
          vldmia    AO!, { s6 - s7 }
          vmla.f32  s19, s3, s8

          vmla.f32  s20, s0, s9
          vldmia    BO!, { s12 - s13 }
          vmla.f32  s21, s1, s9
          vmla.f32  s22, s2, s9
          vldmia    BO!, { s14 - s15 }
          vmla.f32  s23, s3, s9

          vmla.f32  s24, s0, s10
          vmla.f32  s25, s1, s10
          vmla.f32  s26, s2, s10
          vmla.f32  s27, s3, s10

          vmla.f32  s28, s0, s11
          vmla.f32  s29, s1, s11
          vmla.f32  s30, s2, s11
          vmla.f32  s31, s3, s11
  .endm
  /*
   * KERNEL4x4_E - closing step of the unrolled 4x4 loop.
   * Accumulates the products of the already loaded A values s4-s7 and
   * B values s12-s15 into s16..s31.  Performs no loads and leaves AO and
   * BO untouched.
   */
  .macro KERNEL4x4_E
          // products with s12
          vmla.f32  s16, s4, s12
          vmla.f32  s17, s5, s12
          vmla.f32  s18, s6, s12
          vmla.f32  s19, s7, s12

          // products with s13
          vmla.f32  s20, s4, s13
          vmla.f32  s21, s5, s13
          vmla.f32  s22, s6, s13
          vmla.f32  s23, s7, s13

          // products with s14
          vmla.f32  s24, s4, s14
          vmla.f32  s25, s5, s14
          vmla.f32  s26, s6, s14
          vmla.f32  s27, s7, s14

          // products with s15
          vmla.f32  s28, s4, s15
          vmla.f32  s29, s5, s15
          vmla.f32  s30, s6, s15
          vmla.f32  s31, s7, s15
  .endm
  /*
   * KERNEL4x4_SUB - one self-contained step of the 4x4 tile.
   * Loads four floats from AO into s0-s3 and four from BO into s8-s11,
   * accumulates their 16 products into s16..s31, prefetches ahead on both
   * streams and advances AO and BO by 16 bytes each.
   * Loads, multiply-accumulates and pointer updates keep their original
   * interleaving.
   */
  .macro KERNEL4x4_SUB
          vldr      s8, [ BO ]
          pld       [ BO, #B_PRE ]
          vldr      s0, [ AO ]
          pld       [ AO, #A_PRE ]
          vldr      s1, [ AO, #4 ]

          vmla.f32  s16, s0, s8
          vldr      s2, [ AO, #8 ]
          vmla.f32  s17, s1, s8
          vldr      s3, [ AO, #12 ]
          vmla.f32  s18, s2, s8
          vldr      s9, [ BO, #4 ]
          vmla.f32  s19, s3, s8

          vldr      s10, [ BO, #8 ]
          vmla.f32  s20, s0, s9
          vldr      s11, [ BO, #12 ]
          vmla.f32  s21, s1, s9
          vmla.f32  s22, s2, s9
          vmla.f32  s23, s3, s9

          vmla.f32  s24, s0, s10
          vmla.f32  s25, s1, s10
          vmla.f32  s26, s2, s10
          vmla.f32  s27, s3, s10

          vmla.f32  s28, s0, s11
          vmla.f32  s29, s1, s11
          add       AO, AO, #16
          vmla.f32  s30, s2, s11
          add       BO, BO, #16
          vmla.f32  s31, s3, s11
  .endm
  /*
   * SAVE4x4 - scale the 4x4 tile by ALPHA and write it out.
   * Each accumulator s16..s31 is multiplied by the value loaded from ALPHA
   * and stored; the destination is overwritten, nothing is read back.
   *   s16-s19 -> [CO1 .. CO1+12]
   *   s20-s23 -> [CO1 + LDC ..]
   *   s24-s27 -> [CO1 + 2*LDC ..]
   *   s28-s31 -> [CO1 + 3*LDC ..]
   * CO1 is advanced by 16 bytes on exit.
   * Clobbers r3, r4, CO2, s0 and s8-s15.  Multiplies and stores stay in
   * their original interleaved order (s8-s15 are reused once stored).
   */
  .macro SAVE4x4
          ldr       r3, LDC
          add       CO2, CO1, r3              // CO2 = second column
          vldr      s0, ALPHA
          add       r4, CO2, r3               // r4  = third column

          vmul.f32  s8, s0, s16
          vmul.f32  s9, s0, s17
          vmul.f32  s10, s0, s18
          vmul.f32  s11, s0, s19

          vmul.f32  s12, s0, s20
          vstr      s8, [ CO1 ]
          vmul.f32  s13, s0, s21
          vstr      s9, [ CO1, #4 ]
          vmul.f32  s14, s0, s22
          vstr      s10, [ CO1, #8 ]
          vmul.f32  s15, s0, s23
          vstr      s11, [ CO1, #12 ]

          vmul.f32  s8, s0, s24
          vstr      s12, [ CO2 ]
          vmul.f32  s9, s0, s25
          vstr      s13, [ CO2, #4 ]
          vmul.f32  s10, s0, s26
          vstr      s14, [ CO2, #8 ]
          vmul.f32  s11, s0, s27
          vstr      s15, [ CO2, #12 ]

          add       CO2, r4, r3               // CO2 = fourth column
          vstr      s8, [ r4 ]
          vmul.f32  s12, s0, s28
          vstr      s9, [ r4, #4 ]
          vmul.f32  s13, s0, s29
          vstr      s10, [ r4, #8 ]
          vmul.f32  s14, s0, s30
          vstr      s11, [ r4, #12 ]
          vmul.f32  s15, s0, s31

          vstmia    CO2, { s12 - s15 }
          add       CO1, CO1, #16
  .endm
  254. /******************************************************************************/
  /*
   * INIT2x4 - reset the eight accumulators of the 2x4 tile.
   * Loads the word at FP_ZERO into s16 and copies it to s17, s20, s21,
   * s24, s25, s28 and s29 (in that order).
   */
  .macro INIT2x4
          vldr      S16, FP_ZERO
          .irp      acc, s17, s20, s21, s24, s25, s28, s29
          vmov.f32  \acc, s16
          .endr
  .endm
  /*
   * KERNEL2x4_SUB - one step of the 2x4 tile.
   * Loads four floats from BO (s8-s11) and two from AO (s0-s1), then
   * accumulates the eight products into s16/s17, s20/s21, s24/s25 and
   * s28/s29.  AO advances by 8 bytes, BO by 16 bytes.
   */
  .macro KERNEL2x4_SUB
          vldr      s8, [ BO ]
          vldr      s9, [ BO, #4 ]
          vldr      s10, [ BO, #8 ]
          vldr      s11, [ BO, #12 ]

          vldr      s0, [ AO ]
          vldr      s1, [ AO, #4 ]

          vmla.f32  s16, s0, s8
          vmla.f32  s17, s1, s8

          vmla.f32  s20, s0, s9
          vmla.f32  s21, s1, s9

          vmla.f32  s24, s0, s10
          vmla.f32  s25, s1, s10

          vmla.f32  s28, s0, s11
          vmla.f32  s29, s1, s11

          add       AO, AO, #8
          add       BO, BO, #16
  .endm
  /*
   * SAVE2x4 - scale the 2x4 tile by ALPHA and write it out.
   * Pairs s16/s17, s20/s21, s24/s25 and s28/s29 are multiplied by the
   * value loaded from ALPHA and stored to CO1, CO1+LDC, CO1+2*LDC and
   * CO1+3*LDC respectively (destination is overwritten, never read).
   * CO1 is advanced by 8 bytes.  Clobbers r3, r4, CO2, s0, s8, s9, s12, s13.
   */
  .macro SAVE2x4
          ldr       r3, LDC
          add       CO2, CO1, r3              // second column
          add       r4, CO2, r3               // third column
          vldr      s0, ALPHA

          vmul.f32  s8, s0, s16
          vmul.f32  s9, s0, s17
          vstr      s8, [ CO1 ]
          vstr      s9, [ CO1, #4 ]

          vmul.f32  s12, s0, s20
          vmul.f32  s13, s0, s21
          vstr      s12, [ CO2 ]
          vstr      s13, [ CO2, #4 ]

          vmul.f32  s8, s0, s24
          vmul.f32  s9, s0, s25
          vstr      s8, [ r4 ]
          vstr      s9, [ r4, #4 ]

          add       CO2, r4, r3               // fourth column
          vmul.f32  s12, s0, s28
          vmul.f32  s13, s0, s29
          vstr      s12, [ CO2 ]
          vstr      s13, [ CO2, #4 ]

          add       CO1, CO1, #8
  .endm
  307. /******************************************************************************/
  /*
   * INIT1x4 - reset the four accumulators of the 1x4 tile.
   * Loads the word at FP_ZERO into s16 and copies it to s20, s24 and s28.
   */
  .macro INIT1x4
          vldr      S16, FP_ZERO
          .irp      acc, s20, s24, s28
          vmov.f32  \acc, s16
          .endr
  .endm
  /*
   * KERNEL1x4_SUB - one step of the 1x4 tile.
   * Loads four floats from BO (s8-s11) and one from AO (s0), then
   * accumulates the four products into s16, s20, s24 and s28.
   * AO advances by 4 bytes, BO by 16 bytes.
   */
  .macro KERNEL1x4_SUB
          vldr      s8, [ BO ]
          vldr      s9, [ BO, #4 ]
          vldr      s10, [ BO, #8 ]
          vldr      s11, [ BO, #12 ]

          vldr      s0, [ AO ]

          vmla.f32  s16, s0, s8
          vmla.f32  s20, s0, s9
          vmla.f32  s24, s0, s10
          vmla.f32  s28, s0, s11

          add       AO, AO, #4
          add       BO, BO, #16
  .endm
  /*
   * SAVE1x4 - scale the 1x4 tile by ALPHA and write it out.
   * s16, s20, s24 and s28 are multiplied by the value loaded from ALPHA
   * and stored to CO1, CO1+LDC, CO1+2*LDC and CO1+3*LDC respectively
   * (destination is overwritten, never read).  CO1 is advanced by 4 bytes.
   * Clobbers r3, r4, CO2, s0, s8 and s12.
   */
  .macro SAVE1x4
          ldr       r3, LDC
          add       CO2, CO1, r3              // second column
          add       r4, CO2, r3               // third column
          vldr      s0, ALPHA

          vmul.f32  s8, s0, s16
          vstr      s8, [ CO1 ]

          vmul.f32  s12, s0, s20
          vstr      s12, [ CO2 ]

          vmul.f32  s8, s0, s24
          vstr      s8, [ r4 ]

          add       CO2, r4, r3               // fourth column
          vmul.f32  s12, s0, s28
          vstr      s12, [ CO2 ]

          add       CO1, CO1, #4
  .endm
  343. /******************************************************************************/
  344. /******************************************************************************/
  /*
   * INIT4x2 - reset the eight accumulators of the 4x2 tile.
   * Loads the word at FP_ZERO into s16 and copies it to s17..s23.
   */
  .macro INIT4x2
          vldr      S16, FP_ZERO
          .irp      acc, s17, s18, s19, s20, s21, s22, s23
          vmov.f32  \acc, s16
          .endr
  .endm
  /*
   * KERNEL4x2_SUB - one step of the 4x2 tile.
   * Loads two floats from BO (s8, s9) and four from AO (s0-s3), then
   * accumulates the eight products into s16-s19 (with s8) and s20-s23
   * (with s9).  AO advances by 16 bytes, BO by 8 bytes.
   */
  .macro KERNEL4x2_SUB
          vldr      s8, [ BO ]
          vldr      s9, [ BO, #4 ]

          vldr      s0, [ AO ]
          vldr      s1, [ AO, #4 ]
          vldr      s2, [ AO, #8 ]
          vldr      s3, [ AO, #12 ]

          vmla.f32  s16, s0, s8
          vmla.f32  s17, s1, s8
          vmla.f32  s18, s2, s8
          vmla.f32  s19, s3, s8

          vmla.f32  s20, s0, s9
          vmla.f32  s21, s1, s9
          vmla.f32  s22, s2, s9
          vmla.f32  s23, s3, s9

          add       AO, AO, #16
          add       BO, BO, #8
  .endm
  /*
   * SAVE4x2 - scale the 4x2 tile by ALPHA and write it out.
   * s16-s19 times ALPHA go to [CO1 .. CO1+12]; s20-s23 times ALPHA go to
   * [CO1+LDC .. CO1+LDC+12].  The destination is overwritten, never read.
   * CO1 is advanced by 16 bytes.  Clobbers r3, CO2, s0 and s8-s15.
   */
  .macro SAVE4x2
          ldr       r3, LDC
          add       CO2, CO1, r3              // second column
          vldr      s0, ALPHA

          vmul.f32  s8, s0, s16
          vmul.f32  s9, s0, s17
          vmul.f32  s10, s0, s18
          vmul.f32  s11, s0, s19
          vstr      s8, [ CO1 ]
          vstr      s9, [ CO1, #4 ]
          vstr      s10, [ CO1, #8 ]
          vstr      s11, [ CO1, #12 ]

          vmul.f32  s12, s0, s20
          vmul.f32  s13, s0, s21
          vmul.f32  s14, s0, s22
          vmul.f32  s15, s0, s23
          vstr      s12, [ CO2 ]
          vstr      s13, [ CO2, #4 ]
          vstr      s14, [ CO2, #8 ]
          vstr      s15, [ CO2, #12 ]

          add       CO1, CO1, #16
  .endm
  395. /******************************************************************************/
  /*
   * INIT2x2 - reset the four accumulators of the 2x2 tile.
   * Loads the word at FP_ZERO into s16 and copies it to s17, s20 and s21.
   */
  .macro INIT2x2
          vldr      S16, FP_ZERO
          .irp      acc, s17, s20, s21
          vmov.f32  \acc, s16
          .endr
  .endm
  /*
   * KERNEL2x2_SUB - one step of the 2x2 tile.
   * Loads two floats from BO (s8, s9) and two from AO (s0, s1), then
   * accumulates the four products into s16, s17, s20 and s21.
   * AO and BO each advance by 8 bytes.
   */
  .macro KERNEL2x2_SUB
          vldr      s8, [ BO ]
          vldr      s9, [ BO, #4 ]

          vldr      s0, [ AO ]
          vldr      s1, [ AO, #4 ]

          vmla.f32  s16, s0, s8
          vmla.f32  s17, s1, s8

          vmla.f32  s20, s0, s9
          vmla.f32  s21, s1, s9

          add       AO, AO, #8
          add       BO, BO, #8
  .endm
  /*
   * SAVE2x2 - scale the 2x2 tile by ALPHA and write it out.
   * s16/s17 times ALPHA go to [CO1], [CO1+4]; s20/s21 times ALPHA go to
   * [CO1+LDC], [CO1+LDC+4].  The destination is overwritten, never read.
   * CO1 is advanced by 8 bytes.  Clobbers r3, CO2, s0, s8, s9, s12, s13.
   */
  .macro SAVE2x2
          ldr       r3, LDC
          add       CO2, CO1, r3              // second column
          vldr      s0, ALPHA

          vmul.f32  s8, s0, s16
          vmul.f32  s9, s0, s17
          vstr      s8, [ CO1 ]
          vstr      s9, [ CO1, #4 ]

          vmul.f32  s12, s0, s20
          vmul.f32  s13, s0, s21
          vstr      s12, [ CO2 ]
          vstr      s13, [ CO2, #4 ]

          add       CO1, CO1, #8
  .endm
  428. /******************************************************************************/
  /*
   * INIT1x2 - reset the two accumulators of the 1x2 tile.
   * Loads the word at FP_ZERO into s16 and copies it to s20.
   */
  .macro INIT1x2
          vldr      S16, FP_ZERO
          vmov.f32  s20, s16
  .endm
  /*
   * KERNEL1x2_SUB - one step of the 1x2 tile.
   * Loads two floats from BO (s8, s9) and one from AO (s0), then
   * accumulates the two products into s16 and s20.
   * AO advances by 4 bytes, BO by 8 bytes.
   */
  .macro KERNEL1x2_SUB
          vldr      s8, [ BO ]
          vldr      s9, [ BO, #4 ]

          vldr      s0, [ AO ]

          vmla.f32  s16, s0, s8
          vmla.f32  s20, s0, s9

          add       AO, AO, #4
          add       BO, BO, #8
  .endm
  /*
   * SAVE1x2 - scale the 1x2 tile by ALPHA and write it out.
   * s16 times ALPHA goes to [CO1]; s20 times ALPHA goes to [CO1+LDC].
   * The destination is overwritten, never read.  CO1 is advanced by
   * 4 bytes.  Clobbers r3, CO2, s0, s8 and s12.
   */
  .macro SAVE1x2
          ldr       r3, LDC
          add       CO2, CO1, r3              // second column
          vldr      s0, ALPHA

          vmul.f32  s8, s0, s16
          vstr      s8, [ CO1 ]

          vmul.f32  s12, s0, s20
          vstr      s12, [ CO2 ]

          add       CO1, CO1, #4
  .endm
  452. /******************************************************************************/
  453. /******************************************************************************/
  /*
   * INIT4x1 - reset the four accumulators of the 4x1 tile.
   * Loads the word at FP_ZERO into s16 and copies it to s17, s18 and s19.
   */
  .macro INIT4x1
          vldr      S16, FP_ZERO
          .irp      acc, s17, s18, s19
          vmov.f32  \acc, s16
          .endr
  .endm
  /*
   * KERNEL4x1_SUB - one step of the 4x1 tile.
   * Loads one float from BO (s8) and four from AO (s0-s3), then
   * accumulates the four products into s16-s19.
   * AO advances by 16 bytes, BO by 4 bytes.
   */
  .macro KERNEL4x1_SUB
          vldr      s8, [ BO ]

          vldr      s0, [ AO ]
          vldr      s1, [ AO, #4 ]
          vldr      s2, [ AO, #8 ]
          vldr      s3, [ AO, #12 ]

          vmla.f32  s16, s0, s8
          vmla.f32  s17, s1, s8
          vmla.f32  s18, s2, s8
          vmla.f32  s19, s3, s8

          add       AO, AO, #16
          add       BO, BO, #4
  .endm
  /*
   * SAVE4x1 - scale the 4x1 tile by ALPHA and write it out.
   * s16-s19 times the value loaded from ALPHA are stored to
   * [CO1 .. CO1+12]; the destination is overwritten, never read.
   * CO1 is advanced by 16 bytes.  Clobbers s0 and s8-s11.
   */
  .macro SAVE4x1
          vldr      s0, ALPHA

          vmul.f32  s8, s0, s16
          vmul.f32  s9, s0, s17
          vmul.f32  s10, s0, s18
          vmul.f32  s11, s0, s19

          vstr      s8, [ CO1 ]
          vstr      s9, [ CO1, #4 ]
          vstr      s10, [ CO1, #8 ]
          vstr      s11, [ CO1, #12 ]

          add       CO1, CO1, #16
  .endm
  485. /******************************************************************************/
  /*
   * INIT2x1 - reset the two accumulators of the 2x1 tile.
   * Loads the word at FP_ZERO into s16 and copies it to s17.
   */
  .macro INIT2x1
          vldr      S16, FP_ZERO
          vmov.f32  s17, s16
  .endm
  /*
   * KERNEL2x1_SUB - one step of the 2x1 tile.
   * Loads one float from BO (s8) and two from AO (s0, s1), then
   * accumulates the two products into s16 and s17.
   * AO advances by 8 bytes, BO by 4 bytes.
   */
  .macro KERNEL2x1_SUB
          vldr      s8, [ BO ]

          vldr      s0, [ AO ]
          vldr      s1, [ AO, #4 ]

          vmla.f32  s16, s0, s8
          vmla.f32  s17, s1, s8

          add       AO, AO, #8
          add       BO, BO, #4
  .endm
  /*
   * SAVE2x1 - scale the 2x1 tile by ALPHA and write it out.
   * s16 and s17 times the value loaded from ALPHA are stored to [CO1] and
   * [CO1+4]; the destination is overwritten, never read.
   * CO1 is advanced by 8 bytes.  Clobbers s0, s8 and s9.
   */
  .macro SAVE2x1
          vldr      s0, ALPHA

          vmul.f32  s8, s0, s16
          vmul.f32  s9, s0, s17

          vstr      s8, [ CO1 ]
          vstr      s9, [ CO1, #4 ]

          add       CO1, CO1, #8
  .endm
  507. /******************************************************************************/
  /*
   * INIT1x1 - reset the single accumulator of the 1x1 tile by loading the
   * word at FP_ZERO into s16.
   */
  .macro INIT1x1
          vldr      S16, FP_ZERO
  .endm
  /*
   * KERNEL1x1_SUB - one step of the 1x1 tile.
   * Loads one float from BO (s8) and one from AO (s0) and accumulates
   * their product into s16.  AO and BO each advance by 4 bytes.
   */
  .macro KERNEL1x1_SUB
          vldr      s8, [ BO ]
          vldr      s0, [ AO ]

          vmla.f32  s16, s0, s8

          add       AO, AO, #4
          add       BO, BO, #4
  .endm
  /*
   * SAVE1x1 - scale the 1x1 tile by ALPHA and write it out.
   * s16 times the value loaded from ALPHA is stored to [CO1]; the
   * destination is overwritten, never read.  CO1 is advanced by 4 bytes.
   * Clobbers s0 and s8.
   */
  .macro SAVE1x1
          vldr      s0, ALPHA

          vmul.f32  s8, s0, s16
          vstr      s8, [ CO1 ]

          add       CO1, CO1, #4
  .endm
  524. /**************************************************************************************
  525. * End of macro definitions
  526. **************************************************************************************/
  527. PROLOGUE
  528. .align 5
  529. push {r4 - r9, fp}
  530. add fp, sp, #24
  531. sub sp, sp, #STACKSIZE // reserve stack
  532. str OLD_M, M
  533. str OLD_N, N
  534. str OLD_K, K
  535. str OLD_A, A
  536. vstr OLD_ALPHA, ALPHA
  537. sub r3, fp, #128
  538. vstm r3, { s8 - s31} // store floating point registers
  539. movs r4, #0
  540. str r4, FP_ZERO
  541. str r4, FP_ZERO_1
  542. ldr r3, OLD_LDC
  543. lsl r3, r3, #2 // ldc = ldc * 4
  544. str r3, LDC
  545. ldr r3, OFFSET
  546. #ifndef LEFT
  547. neg r3 , r3
  548. #endif
  549. str r3 , KK
  550. ldr BC, B
  551. ldr J, N
  552. asrs J, J, #2 // J = J / 4
  553. ble _L2_BEGIN
  554. _L4_BEGIN:
  555. ldr CO1, C // CO1 = C
  556. ldr r4 , LDC
  557. lsl r4 , r4 , #2 // LDC * 4
  558. add r3 , r4, CO1
  559. str r3 , C // store C
  560. #if defined(LEFT)
  561. ldr r3 , OFFSET
  562. str r3 , KK
  563. #endif
  564. ldr AO, A // AO = A
  565. pld [AO , #A_PRE-64]
  566. pld [AO , #A_PRE-32]
  567. _L4_M4_BEGIN:
  568. ldr I, M
  569. asrs I, I, #2 // I = I / 4
  570. ble _L4_M2_BEGIN
  571. _L4_M4_20:
  572. #if (defined(LEFT) && defined(TRANSA)) || \
  573. (!defined(LEFT) && !defined(TRANSA))
  574. mov BO, BC
  575. #else
  576. mov BO, BC
  577. ldr r3 , KK
  578. lsls r4 , r3 , #4 // 4 float values
  579. add BO , BO , r4
  580. lsls r4 , r3 , #4 // 4 float values
  581. add AO , AO , r4
  582. #endif
  583. #ifndef TRMMKERNEL
  584. ldr K1, K
  585. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  586. ldr K1, K
  587. ldr r3, KK
  588. sub K1, K1, r3
  589. str K1, KKK
  590. #else
  591. ldr K1, KK
  592. #ifdef LEFT
  593. add K1, K1, #4 // number of values in AO
  594. #else
  595. add K1, K1, #4 // number of values in BO
  596. #endif
  597. str K1, KKK
  598. #endif
  599. asrs L , K1, #3 // L = L / 8
  600. cmp L , #3
  601. blt _L4_M4_30
  602. .align 5
  603. KERNEL4x4_I
  604. KERNEL4x4_M2
  605. KERNEL4x4_M1
  606. KERNEL4x4_M2
  607. KERNEL4x4_M1
  608. KERNEL4x4_M2
  609. KERNEL4x4_M1
  610. KERNEL4x4_M2
  611. sub L, L, #2
  612. _L4_M4_22:
  613. KERNEL4x4_M1
  614. KERNEL4x4_M2
  615. KERNEL4x4_M1
  616. KERNEL4x4_M2
  617. KERNEL4x4_M1
  618. KERNEL4x4_M2
  619. KERNEL4x4_M1
  620. KERNEL4x4_M2
  621. subs L, L, #1
  622. bgt _L4_M4_22
  623. KERNEL4x4_M1
  624. KERNEL4x4_M2
  625. KERNEL4x4_M1
  626. KERNEL4x4_M2
  627. KERNEL4x4_M1
  628. KERNEL4x4_M2
  629. KERNEL4x4_M1
  630. KERNEL4x4_E
  631. b _L4_M4_44
  632. _L4_M4_30:
  633. tst L, #3
  634. ble _L4_M4_40
  635. tst L, #2
  636. ble _L4_M4_32
  637. KERNEL4x4_I
  638. KERNEL4x4_M2
  639. KERNEL4x4_M1
  640. KERNEL4x4_M2
  641. KERNEL4x4_M1
  642. KERNEL4x4_M2
  643. KERNEL4x4_M1
  644. KERNEL4x4_M2
  645. KERNEL4x4_M1
  646. KERNEL4x4_M2
  647. KERNEL4x4_M1
  648. KERNEL4x4_M2
  649. KERNEL4x4_M1
  650. KERNEL4x4_M2
  651. KERNEL4x4_M1
  652. KERNEL4x4_E
  653. b _L4_M4_44
  654. _L4_M4_32:
  655. tst L, #1
  656. ble _L4_M4_40
  657. KERNEL4x4_I
  658. KERNEL4x4_M2
  659. KERNEL4x4_M1
  660. KERNEL4x4_M2
  661. KERNEL4x4_M1
  662. KERNEL4x4_M2
  663. KERNEL4x4_M1
  664. KERNEL4x4_E
  665. b _L4_M4_44
  666. _L4_M4_40:
  667. INIT4x4
  668. _L4_M4_44:
  669. ands L , K1, #7 // L = L % 8
  670. ble _L4_M4_100
  671. _L4_M4_46:
  672. KERNEL4x4_SUB
  673. subs L, L, #1
  674. bne _L4_M4_46
  675. _L4_M4_100:
  676. SAVE4x4
  677. #if (defined(LEFT) && defined(TRANSA)) || \
  678. (!defined(LEFT) && !defined(TRANSA))
  679. ldr r3 , K
  680. ldr r4 , KKK
  681. sub r3 , r3 , r4
  682. lsls r4 , r3 , #4 // 4 float values
  683. add BO , BO , r4
  684. lsls r4 , r3 , #4 // 4 float values
  685. add AO , AO , r4
  686. #endif
  687. #if defined(LEFT)
  688. ldr r3 , KK
  689. add r3 , r3 , #4 // number of values in AO
  690. str r3 , KK
  691. #endif
  692. _L4_M4_END:
  693. subs I, I, #1
  694. bne _L4_M4_20
  695. _L4_M2_BEGIN:
  696. ldr I, M
  697. tst I , #3
  698. ble _L4_END
  699. tst I, #2 // I = I / 2
  700. ble _L4_M1_BEGIN
  701. _L4_M2_20:
  702. INIT2x4
  703. #if (defined(LEFT) && defined(TRANSA)) || \
  704. (!defined(LEFT) && !defined(TRANSA))
  705. mov BO, BC
  706. #else
  707. mov BO, BC
  708. ldr r3 , KK
  709. lsls r4 , r3 , #4 // 4 float values
  710. add BO , BO , r4
  711. lsls r4 , r3 , #3 // 2 float values
  712. add AO , AO , r4
  713. #endif
  714. #ifndef TRMMKERNEL
  715. ldr K1, K
  716. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  717. ldr K1, K
  718. ldr r3, KK
  719. sub K1, K1, r3
  720. str K1, KKK
  721. #else
  722. ldr K1, KK
  723. #ifdef LEFT
  724. add K1, K1, #2 // number of values in AO
  725. #else
  726. add K1, K1, #4 // number of values in BO
  727. #endif
  728. str K1, KKK
  729. #endif
  730. asrs L , K1, #3 // L = L / 8
  731. ble _L4_M2_40
  732. _L4_M2_22:
  733. KERNEL2x4_SUB
  734. KERNEL2x4_SUB
  735. KERNEL2x4_SUB
  736. KERNEL2x4_SUB
  737. KERNEL2x4_SUB
  738. KERNEL2x4_SUB
  739. KERNEL2x4_SUB
  740. KERNEL2x4_SUB
  741. subs L, L, #1
  742. bgt _L4_M2_22
  743. _L4_M2_40:
  744. ands L , K1, #7 // L = L % 8
  745. ble _L4_M2_100
  746. _L4_M2_42:
  747. KERNEL2x4_SUB
  748. subs L, L, #1
  749. bgt _L4_M2_42
  750. _L4_M2_100:
  751. SAVE2x4
  752. #if (defined(LEFT) && defined(TRANSA)) || \
  753. (!defined(LEFT) && !defined(TRANSA))
  754. ldr r3 , K
  755. ldr r4 , KKK
  756. sub r3 , r3 , r4
  757. lsls r4 , r3 , #4 // 4 float values
  758. add BO , BO , r4
  759. lsls r4 , r3 , #3 // 2 float values
  760. add AO , AO , r4
  761. #endif
  762. #if defined(LEFT)
  763. ldr r3 , KK
  764. add r3 , r3 , #2 // number of values in AO
  765. str r3 , KK
  766. #endif
  767. _L4_M2_END:
  768. _L4_M1_BEGIN:
  769. tst I, #1 // I = I % 2
  770. ble _L4_END
  771. _L4_M1_20:
  772. INIT1x4
  773. #if (defined(LEFT) && defined(TRANSA)) || \
  774. (!defined(LEFT) && !defined(TRANSA))
  775. mov BO, BC
  776. #else
  777. mov BO, BC
  778. ldr r3 , KK
  779. lsls r4 , r3 , #4 // 4 float values
  780. add BO , BO , r4
  781. lsls r4 , r3 , #2 // 1 float value
  782. add AO , AO , r4
  783. #endif
  784. #ifndef TRMMKERNEL
  785. ldr K1, K
  786. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  787. ldr K1, K
  788. ldr r3, KK
  789. sub K1, K1, r3
  790. str K1, KKK
  791. #else
  792. ldr K1, KK
  793. #ifdef LEFT
  794. add K1, K1, #1 // number of values in AO
  795. #else
  796. add K1, K1, #4 // number of values in BO
  797. #endif
  798. str K1, KKK
  799. #endif
  800. asrs L , K1, #3 // L = L / 8
  801. ble _L4_M1_40
  802. _L4_M1_22:
  803. KERNEL1x4_SUB
  804. KERNEL1x4_SUB
  805. KERNEL1x4_SUB
  806. KERNEL1x4_SUB
  807. KERNEL1x4_SUB
  808. KERNEL1x4_SUB
  809. KERNEL1x4_SUB
  810. KERNEL1x4_SUB
  811. subs L, L, #1
  812. bgt _L4_M1_22
  813. _L4_M1_40:
  814. ands L , K1, #7 // L = L % 8
  815. ble _L4_M1_100
  816. _L4_M1_42:
  817. KERNEL1x4_SUB
  818. subs L, L, #1
  819. bgt _L4_M1_42
  820. _L4_M1_100:
  821. SAVE1x4
  822. #if (defined(LEFT) && defined(TRANSA)) || \
  823. (!defined(LEFT) && !defined(TRANSA))
  824. ldr r3 , K
  825. ldr r4 , KKK
  826. sub r3 , r3 , r4
  827. lsls r4 , r3 , #4 // 4 float values
  828. add BO , BO , r4
  829. lsls r4 , r3 , #2 // 1 float value
  830. add AO , AO , r4
  831. #endif
  832. #if defined(LEFT)
  833. ldr r3 , KK
  834. add r3 , r3 , #1 // number of values in AO
  835. str r3 , KK
  836. #endif
  837. _L4_END:
  838. mov r3, BC
  839. ldr r4, K
  840. lsl r4, r4, #4 // k * 4 * 4
  841. add r3, r3, r4 // B = B + K * 4 * 4
  842. mov BC, r3
  843. #if !defined(LEFT)
  844. ldr r3 , KK
  845. add r3 , r3 , #4 // number of values in BO
  846. str r3 , KK
  847. #endif
  848. subs J , #1 // j--
  849. bgt _L4_BEGIN
  850. /*********************************************************************************************/
  851. _L2_BEGIN:
  852. ldr J , N
  853. tst J , #3
  854. ble _L999
  855. tst J , #2
  856. ble _L1_BEGIN
  857. ldr CO1, C // CO1 = C
  858. ldr r4 , LDC
  859. lsl r4 , r4 , #1 // LDC * 2
  860. add r3 , r4, CO1
  861. str r3 , C // store C
  862. #if defined(LEFT)
  863. ldr r3 , OFFSET
  864. str r3 , KK
  865. #endif
  866. ldr AO, A // AO = A
  867. //pld [AO , #A_PRE-96]
  868. //pld [AO , #A_PRE-64]
  869. //pld [AO , #A_PRE-32]
  870. _L2_M4_BEGIN: // 2-column panel: walk the M dimension in tiles of 4 rows
  871. ldr I, M
  872. asrs I, I, #2 // I = M / 4 (count of full 4-row tiles)
  873. ble _L2_M2_BEGIN // no full tile; NOTE(review): asrs does not write V, so ble assumes V is clear here -- confirm
  874. _L2_M4_20: // top of the per-tile loop (one 4x2 tile per pass)
  875. INIT4x2 // macro defined outside this chunk; presumably resets the 4x2 accumulators
  876. #if (defined(LEFT) && defined(TRANSA)) || \
  877. (!defined(LEFT) && !defined(TRANSA))
  878. mov BO, BC // BO = start of the current B panel
  879. #else
  880. mov BO, BC
  881. ldr r3 , KK
  882. lsls r4 , r3 , #3 // r4 = KK * 8 bytes (2 float values per k step)
  883. add BO , BO , r4 // skip the first KK k steps of B
  884. lsls r4 , r3 , #4 // r4 = KK * 16 bytes (4 float values per k step)
  885. add AO , AO , r4 // skip the first KK k steps of A
  886. #endif
  887. #ifndef TRMMKERNEL
  888. ldr K1, K // TRMMKERNEL not defined: iterate over the full K
  889. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  890. ldr K1, K
  891. ldr r3, KK
  892. sub K1, K1, r3 // K1 = K - KK
  893. str K1, KKK // remember this tile's trip count in KKK
  894. #else
  895. ldr K1, KK
  896. #ifdef LEFT
  897. add K1, K1, #4 // K1 = KK + 4 (number of values in AO)
  898. #else
  899. add K1, K1, #2 // K1 = KK + 2 (number of values in BO)
  900. #endif
  901. str K1, KKK // remember this tile's trip count in KKK
  902. #endif
  903. asrs L , K1, #3 // L = K1 / 8 (passes through the 8x-unrolled loop)
  904. ble _L2_M4_40 // fewer than 8 k steps: go straight to the remainder loop
  905. .align 5
  906. _L2_M4_22: // unrolled loop: 8 KERNEL4x2_SUB invocations per pass
  907. KERNEL4x2_SUB
  908. KERNEL4x2_SUB
  909. KERNEL4x2_SUB
  910. KERNEL4x2_SUB
  911. KERNEL4x2_SUB
  912. KERNEL4x2_SUB
  913. KERNEL4x2_SUB
  914. KERNEL4x2_SUB
  915. subs L, L, #1
  916. bgt _L2_M4_22
  917. _L2_M4_40:
  918. ands L , K1, #7 // L = K1 % 8 (leftover k steps)
  919. ble _L2_M4_100 // nothing left over
  920. _L2_M4_42: // remainder loop: one KERNEL4x2_SUB per pass
  921. KERNEL4x2_SUB
  922. subs L, L, #1
  923. bgt _L2_M4_42
  924. _L2_M4_100:
  925. SAVE4x2 // macro defined outside this chunk; presumably writes the finished tile to C
  926. #if (defined(LEFT) && defined(TRANSA)) || \
  927. (!defined(LEFT) && !defined(TRANSA))
  928. ldr r3 , K
  929. ldr r4 , KKK
  930. sub r3 , r3 , r4 // r3 = K - KKK: k steps this tile did not run
  931. lsls r4 , r3 , #3 // r4 = r3 * 8 bytes (2 float values per k step)
  932. add BO , BO , r4 // move BO past the unused steps
  933. lsls r4 , r3 , #4 // r4 = r3 * 16 bytes (4 float values per k step)
  934. add AO , AO , r4 // move AO past the unused steps
  935. #endif
  936. #if defined(LEFT)
  937. ldr r3 , KK
  938. add r3 , r3 , #4 // number of values in AO
  939. str r3 , KK // KK += 4 for the next tile
  940. #endif
  941. _L2_M4_END:
  942. subs I, I, #1 // one 4-row tile done
  943. bgt _L2_M4_20 // loop while tiles remain
  944. _L2_M2_BEGIN: // 2-column panel: handle the M % 4 leftover rows, 2-row tile first
  945. ldr I, M
  946. tst I , #3 // any of the low two bits of M set?
  947. beq _L2_END // M % 4 == 0: no leftover rows. beq, not ble: tst does not write V and ble would read a stale V
  948. tst I, #2 // is bit 1 of M set (one 2-row tile pending)?
  949. beq _L2_M1_BEGIN // bit clear: skip the 2x2 tile (beq for the same reason as above)
  950. _L2_M2_20:
  951. INIT2x2 // macro defined outside this chunk; presumably resets the 2x2 accumulators
  952. #if (defined(LEFT) && defined(TRANSA)) || \
  953. (!defined(LEFT) && !defined(TRANSA))
  954. mov BO, BC // BO = start of the current B panel
  955. #else
  956. mov BO, BC
  957. ldr r3 , KK
  958. lsls r4 , r3 , #3 // r4 = KK * 8 bytes (2 float values per k step of B)
  959. add BO , BO , r4 // skip the first KK k steps of B
  960. lsls r4 , r3 , #3 // r4 = KK * 8 bytes (2 float values per k step of A)
  961. add AO , AO , r4 // skip the first KK k steps of A
  962. #endif
  963. #ifndef TRMMKERNEL
  964. ldr K1, K // TRMMKERNEL not defined: iterate over the full K
  965. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  966. ldr K1, K
  967. ldr r3, KK
  968. sub K1, K1, r3 // K1 = K - KK
  969. str K1, KKK // remember this tile's trip count in KKK
  970. #else
  971. ldr K1, KK
  972. #ifdef LEFT
  973. add K1, K1, #2 // K1 = KK + 2 (number of values in AO)
  974. #else
  975. add K1, K1, #2 // K1 = KK + 2 (number of values in BO)
  976. #endif
  977. str K1, KKK // remember this tile's trip count in KKK
  978. #endif
  979. asrs L , K1, #3 // L = K1 / 8 (passes through the 8x-unrolled loop)
  980. ble _L2_M2_40 // NOTE(review): asrs/ands leave V untouched as well; these ble assume V is clear -- confirm
  981. _L2_M2_22: // unrolled loop: 8 KERNEL2x2_SUB invocations per pass
  982. KERNEL2x2_SUB
  983. KERNEL2x2_SUB
  984. KERNEL2x2_SUB
  985. KERNEL2x2_SUB
  986. KERNEL2x2_SUB
  987. KERNEL2x2_SUB
  988. KERNEL2x2_SUB
  989. KERNEL2x2_SUB
  990. subs L, L, #1
  991. bgt _L2_M2_22
  992. _L2_M2_40:
  993. ands L , K1, #7 // L = K1 % 8 (leftover k steps)
  994. ble _L2_M2_100 // nothing left over
  995. _L2_M2_42: // remainder loop: one KERNEL2x2_SUB per pass
  996. KERNEL2x2_SUB
  997. subs L, L, #1
  998. bgt _L2_M2_42
  999. _L2_M2_100:
  1000. SAVE2x2 // macro defined outside this chunk; presumably writes the finished tile to C
  1001. #if (defined(LEFT) && defined(TRANSA)) || \
  1002. (!defined(LEFT) && !defined(TRANSA))
  1003. ldr r3 , K
  1004. ldr r4 , KKK
  1005. sub r3 , r3 , r4 // r3 = K - KKK: k steps this tile did not run
  1006. lsls r4 , r3 , #3 // r4 = r3 * 8 bytes (2 float values per k step of B)
  1007. add BO , BO , r4 // move BO past the unused steps
  1008. lsls r4 , r3 , #3 // r4 = r3 * 8 bytes (2 float values per k step of A)
  1009. add AO , AO , r4 // move AO past the unused steps
  1010. #endif
  1011. #if defined(LEFT)
  1012. ldr r3 , KK
  1013. add r3 , r3 , #2 // number of values in AO
  1014. str r3 , KK // KK += 2 for the next tile
  1015. #endif
  1016. _L2_M2_END:
  1017. _L2_M1_BEGIN: // 2-column panel: final single-row tile (I is expected to still hold M)
  1018. tst I, #1 // is bit 0 of M set (one row pending)?
  1019. beq _L2_END // bit clear: nothing to do. beq, not ble: tst does not write V and ble would read a stale V
  1020. _L2_M1_20:
  1021. INIT1x2 // macro defined outside this chunk; presumably resets the 1x2 accumulators
  1022. #if (defined(LEFT) && defined(TRANSA)) || \
  1023. (!defined(LEFT) && !defined(TRANSA))
  1024. mov BO, BC // BO = start of the current B panel
  1025. #else
  1026. mov BO, BC
  1027. ldr r3 , KK
  1028. lsls r4 , r3 , #3 // r4 = KK * 8 bytes (2 float values per k step)
  1029. add BO , BO , r4 // skip the first KK k steps of B
  1030. lsls r4 , r3 , #2 // r4 = KK * 4 bytes (1 float value per k step)
  1031. add AO , AO , r4 // skip the first KK k steps of A
  1032. #endif
  1033. #ifndef TRMMKERNEL
  1034. ldr K1, K // TRMMKERNEL not defined: iterate over the full K
  1035. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1036. ldr K1, K
  1037. ldr r3, KK
  1038. sub K1, K1, r3 // K1 = K - KK
  1039. str K1, KKK // remember this tile's trip count in KKK
  1040. #else
  1041. ldr K1, KK
  1042. #ifdef LEFT
  1043. add K1, K1, #1 // K1 = KK + 1 (number of values in AO)
  1044. #else
  1045. add K1, K1, #2 // K1 = KK + 2 (number of values in BO)
  1046. #endif
  1047. str K1, KKK // remember this tile's trip count in KKK
  1048. #endif
  1049. asrs L , K1, #3 // L = K1 / 8 (passes through the 8x-unrolled loop)
  1050. ble _L2_M1_40 // NOTE(review): asrs/ands leave V untouched as well; these ble assume V is clear -- confirm
  1051. _L2_M1_22: // unrolled loop: 8 KERNEL1x2_SUB invocations per pass
  1052. KERNEL1x2_SUB
  1053. KERNEL1x2_SUB
  1054. KERNEL1x2_SUB
  1055. KERNEL1x2_SUB
  1056. KERNEL1x2_SUB
  1057. KERNEL1x2_SUB
  1058. KERNEL1x2_SUB
  1059. KERNEL1x2_SUB
  1060. subs L, L, #1
  1061. bgt _L2_M1_22
  1062. _L2_M1_40:
  1063. ands L , K1, #7 // L = K1 % 8 (leftover k steps)
  1064. ble _L2_M1_100 // nothing left over
  1065. _L2_M1_42: // remainder loop: one KERNEL1x2_SUB per pass
  1066. KERNEL1x2_SUB
  1067. subs L, L, #1
  1068. bgt _L2_M1_42
  1069. _L2_M1_100:
  1070. SAVE1x2 // macro defined outside this chunk; presumably writes the finished tile to C
  1071. #if (defined(LEFT) && defined(TRANSA)) || \
  1072. (!defined(LEFT) && !defined(TRANSA))
  1073. ldr r3 , K
  1074. ldr r4 , KKK
  1075. sub r3 , r3 , r4 // r3 = K - KKK: k steps this tile did not run
  1076. lsls r4 , r3 , #3 // r4 = r3 * 8 bytes (2 float values per k step)
  1077. add BO , BO , r4 // move BO past the unused steps
  1078. lsls r4 , r3 , #2 // r4 = r3 * 4 bytes (1 float value per k step)
  1079. add AO , AO , r4 // move AO past the unused steps
  1080. #endif
  1081. #if defined(LEFT)
  1082. ldr r3 , KK
  1083. add r3 , r3 , #1 // number of values in AO
  1084. str r3 , KK // KK += 1
  1085. #endif
  1086. _L2_END: // 2-column panel finished: step the B panel pointer and (right-side case) KK
  1087. mov r3, BC
  1088. ldr r4, K
  1089. lsl r4, r4, #3 // r4 = K * 2 * 4 bytes (K steps of 2 floats)
  1090. add r3, r3, r4 // B = B + K * 2 * 4
  1091. mov BC, r3 // BC now points just past the panel that was consumed
  1092. #if !defined(LEFT)
  1093. ldr r3 , KK
  1094. add r3 , r3 , #2 // number of values in BO
  1095. str r3 , KK // KK += 2
  1096. #endif
  1097. /*********************************************************************************************/
  1098. _L1_BEGIN: // single-column panel: only runs when bit 0 of N is set
  1099. ldr J , N
  1100. tst J , #1 // is bit 0 of N set (one column pending)?
  1101. beq _L999 // bit clear: done. beq, not ble: tst does not write V and ble would read a stale V
  1102. ldr CO1, C // CO1 = C
  1103. ldr r4 , LDC
  1104. add r3 , r4, CO1 // r3 = C + LDC
  1105. str r3 , C // store C (advanced by one LDC)
  1106. #if defined(LEFT)
  1107. ldr r3 , OFFSET
  1108. str r3 , KK // KK restarts from OFFSET for this panel
  1109. #endif
  1110. ldr AO, A // AO = A
  1111. //pld [AO , #A_PRE-96]
  1112. //pld [AO , #A_PRE-64]
  1113. //pld [AO , #A_PRE-32]
  1114. _L1_M4_BEGIN: // 1-column panel: walk the M dimension in tiles of 4 rows
  1115. ldr I, M
  1116. asrs I, I, #2 // I = M / 4 (count of full 4-row tiles)
  1117. ble _L1_M2_BEGIN // no full tile; NOTE(review): asrs does not write V, so ble assumes V is clear here -- confirm
  1118. _L1_M4_20: // top of the per-tile loop (one 4x1 tile per pass)
  1119. INIT4x1 // macro defined outside this chunk; presumably resets the 4x1 accumulators
  1120. #if (defined(LEFT) && defined(TRANSA)) || \
  1121. (!defined(LEFT) && !defined(TRANSA))
  1122. mov BO, BC // BO = start of the current B panel
  1123. #else
  1124. mov BO, BC
  1125. ldr r3 , KK
  1126. lsls r4 , r3 , #2 // r4 = KK * 4 bytes (1 float value per k step)
  1127. add BO , BO , r4 // skip the first KK k steps of B
  1128. lsls r4 , r3 , #4 // r4 = KK * 16 bytes (4 float values per k step)
  1129. add AO , AO , r4 // skip the first KK k steps of A
  1130. #endif
  1131. #ifndef TRMMKERNEL
  1132. ldr K1, K // TRMMKERNEL not defined: iterate over the full K
  1133. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1134. ldr K1, K
  1135. ldr r3, KK
  1136. sub K1, K1, r3 // K1 = K - KK
  1137. str K1, KKK // remember this tile's trip count in KKK
  1138. #else
  1139. ldr K1, KK
  1140. #ifdef LEFT
  1141. add K1, K1, #4 // K1 = KK + 4 (number of values in AO)
  1142. #else
  1143. add K1, K1, #1 // K1 = KK + 1 (number of values in BO)
  1144. #endif
  1145. str K1, KKK // remember this tile's trip count in KKK
  1146. #endif
  1147. asrs L , K1, #3 // L = K1 / 8 (passes through the 8x-unrolled loop)
  1148. ble _L1_M4_40 // fewer than 8 k steps: go straight to the remainder loop
  1149. .align 5
  1150. _L1_M4_22: // unrolled loop: 8 KERNEL4x1_SUB invocations per pass
  1151. KERNEL4x1_SUB
  1152. KERNEL4x1_SUB
  1153. KERNEL4x1_SUB
  1154. KERNEL4x1_SUB
  1155. KERNEL4x1_SUB
  1156. KERNEL4x1_SUB
  1157. KERNEL4x1_SUB
  1158. KERNEL4x1_SUB
  1159. subs L, L, #1
  1160. bgt _L1_M4_22
  1161. _L1_M4_40:
  1162. ands L , K1, #7 // L = K1 % 8 (leftover k steps)
  1163. ble _L1_M4_100 // nothing left over
  1164. _L1_M4_42: // remainder loop: one KERNEL4x1_SUB per pass
  1165. KERNEL4x1_SUB
  1166. subs L, L, #1
  1167. bgt _L1_M4_42
  1168. _L1_M4_100:
  1169. SAVE4x1 // macro defined outside this chunk; presumably writes the finished tile to C
  1170. #if (defined(LEFT) && defined(TRANSA)) || \
  1171. (!defined(LEFT) && !defined(TRANSA))
  1172. ldr r3 , K
  1173. ldr r4 , KKK
  1174. sub r3 , r3 , r4 // r3 = K - KKK: k steps this tile did not run
  1175. lsls r4 , r3 , #2 // r4 = r3 * 4 bytes (1 float value per k step)
  1176. add BO , BO , r4 // move BO past the unused steps
  1177. lsls r4 , r3 , #4 // r4 = r3 * 16 bytes (4 float values per k step)
  1178. add AO , AO , r4 // move AO past the unused steps
  1179. #endif
  1180. #if defined(LEFT)
  1181. ldr r3 , KK
  1182. add r3 , r3 , #4 // number of values in AO
  1183. str r3 , KK // KK += 4 for the next tile
  1184. #endif
  1185. _L1_M4_END:
  1186. subs I, I, #1 // one 4-row tile done
  1187. bgt _L1_M4_20 // loop while tiles remain
  1188. _L1_M2_BEGIN: // 1-column panel: handle the M % 4 leftover rows, 2-row tile first
  1189. ldr I, M
  1190. tst I , #3 // any of the low two bits of M set?
  1191. beq _L1_END // M % 4 == 0: no leftover rows. beq, not ble: tst does not write V and ble would read a stale V
  1192. tst I, #2 // is bit 1 of M set (one 2-row tile pending)?
  1193. beq _L1_M1_BEGIN // bit clear: skip the 2x1 tile (beq for the same reason as above)
  1194. _L1_M2_20:
  1195. INIT2x1 // macro defined outside this chunk; presumably resets the 2x1 accumulators
  1196. #if (defined(LEFT) && defined(TRANSA)) || \
  1197. (!defined(LEFT) && !defined(TRANSA))
  1198. mov BO, BC // BO = start of the current B panel
  1199. #else
  1200. mov BO, BC
  1201. ldr r3 , KK
  1202. lsls r4 , r3 , #2 // r4 = KK * 4 bytes (1 float value per k step)
  1203. add BO , BO , r4 // skip the first KK k steps of B
  1204. lsls r4 , r3 , #3 // r4 = KK * 8 bytes (2 float values per k step)
  1205. add AO , AO , r4 // skip the first KK k steps of A
  1206. #endif
  1207. #ifndef TRMMKERNEL
  1208. ldr K1, K // TRMMKERNEL not defined: iterate over the full K
  1209. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1210. ldr K1, K
  1211. ldr r3, KK
  1212. sub K1, K1, r3 // K1 = K - KK
  1213. str K1, KKK // remember this tile's trip count in KKK
  1214. #else
  1215. ldr K1, KK
  1216. #ifdef LEFT
  1217. add K1, K1, #2 // K1 = KK + 2 (number of values in AO)
  1218. #else
  1219. add K1, K1, #1 // K1 = KK + 1 (number of values in BO)
  1220. #endif
  1221. str K1, KKK // remember this tile's trip count in KKK
  1222. #endif
  1223. asrs L , K1, #3 // L = K1 / 8 (passes through the 8x-unrolled loop)
  1224. ble _L1_M2_40 // NOTE(review): asrs/ands leave V untouched as well; these ble assume V is clear -- confirm
  1225. _L1_M2_22: // unrolled loop: 8 KERNEL2x1_SUB invocations per pass
  1226. KERNEL2x1_SUB
  1227. KERNEL2x1_SUB
  1228. KERNEL2x1_SUB
  1229. KERNEL2x1_SUB
  1230. KERNEL2x1_SUB
  1231. KERNEL2x1_SUB
  1232. KERNEL2x1_SUB
  1233. KERNEL2x1_SUB
  1234. subs L, L, #1
  1235. bgt _L1_M2_22
  1236. _L1_M2_40:
  1237. ands L , K1, #7 // L = K1 % 8 (leftover k steps)
  1238. ble _L1_M2_100 // nothing left over
  1239. _L1_M2_42: // remainder loop: one KERNEL2x1_SUB per pass
  1240. KERNEL2x1_SUB
  1241. subs L, L, #1
  1242. bgt _L1_M2_42
  1243. _L1_M2_100:
  1244. SAVE2x1 // macro defined outside this chunk; presumably writes the finished tile to C
  1245. #if (defined(LEFT) && defined(TRANSA)) || \
  1246. (!defined(LEFT) && !defined(TRANSA))
  1247. ldr r3 , K
  1248. ldr r4 , KKK
  1249. sub r3 , r3 , r4 // r3 = K - KKK: k steps this tile did not run
  1250. lsls r4 , r3 , #2 // r4 = r3 * 4 bytes (1 float value per k step)
  1251. add BO , BO , r4 // move BO past the unused steps
  1252. lsls r4 , r3 , #3 // r4 = r3 * 8 bytes (2 float values per k step)
  1253. add AO , AO , r4 // move AO past the unused steps
  1254. #endif
  1255. #if defined(LEFT)
  1256. ldr r3 , KK
  1257. add r3 , r3 , #2 // number of values in AO
  1258. str r3 , KK // KK += 2 for the next tile
  1259. #endif
  1260. _L1_M2_END:
  1261. _L1_M1_BEGIN: // 1-column panel: final single-row tile (I is expected to still hold M)
  1262. tst I, #1 // is bit 0 of M set (one row pending)?
  1263. beq _L1_END // bit clear: nothing to do. beq, not ble: tst does not write V and ble would read a stale V
  1264. _L1_M1_20:
  1265. INIT1x1 // macro defined outside this chunk; presumably resets the 1x1 accumulator
  1266. #if (defined(LEFT) && defined(TRANSA)) || \
  1267. (!defined(LEFT) && !defined(TRANSA))
  1268. mov BO, BC // BO = start of the current B panel
  1269. #else
  1270. mov BO, BC
  1271. ldr r3 , KK
  1272. lsls r4 , r3 , #2 // r4 = KK * 4 bytes (1 float value per k step of B)
  1273. add BO , BO , r4 // skip the first KK k steps of B
  1274. lsls r4 , r3 , #2 // r4 = KK * 4 bytes (1 float value per k step of A)
  1275. add AO , AO , r4 // skip the first KK k steps of A
  1276. #endif
  1277. #ifndef TRMMKERNEL
  1278. ldr K1, K // TRMMKERNEL not defined: iterate over the full K
  1279. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1280. ldr K1, K
  1281. ldr r3, KK
  1282. sub K1, K1, r3 // K1 = K - KK
  1283. str K1, KKK // remember this tile's trip count in KKK
  1284. #else
  1285. ldr K1, KK
  1286. #ifdef LEFT
  1287. add K1, K1, #1 // K1 = KK + 1 (number of values in AO)
  1288. #else
  1289. add K1, K1, #1 // K1 = KK + 1 (number of values in BO)
  1290. #endif
  1291. str K1, KKK // remember this tile's trip count in KKK
  1292. #endif
  1293. asrs L , K1, #3 // L = K1 / 8 (passes through the 8x-unrolled loop)
  1294. ble _L1_M1_40 // NOTE(review): asrs/ands leave V untouched as well; these ble assume V is clear -- confirm
  1295. _L1_M1_22: // unrolled loop: 8 KERNEL1x1_SUB invocations per pass
  1296. KERNEL1x1_SUB
  1297. KERNEL1x1_SUB
  1298. KERNEL1x1_SUB
  1299. KERNEL1x1_SUB
  1300. KERNEL1x1_SUB
  1301. KERNEL1x1_SUB
  1302. KERNEL1x1_SUB
  1303. KERNEL1x1_SUB
  1304. subs L, L, #1
  1305. bgt _L1_M1_22
  1306. _L1_M1_40:
  1307. ands L , K1, #7 // L = K1 % 8 (leftover k steps)
  1308. ble _L1_M1_100 // nothing left over
  1309. _L1_M1_42: // remainder loop: one KERNEL1x1_SUB per pass
  1310. KERNEL1x1_SUB
  1311. subs L, L, #1
  1312. bgt _L1_M1_42
  1313. _L1_M1_100:
  1314. SAVE1x1 // macro defined outside this chunk; presumably writes the result to C. No AO/BO/KK update follows: this is the last tile
  1315. _L1_END:
  1316. _L999: // common exit: restore saved registers and return 0
  1317. sub r3, fp, #128 // r3 = fp - 128, the address the VFP registers are reloaded from
  1318. vldm r3, { s8 - s31} // restore floating point registers
  1319. movs r0, #0 // set return value
  1320. sub sp, fp, #24 // rewind sp to fp - 24; presumably where r4-r9/fp were pushed on entry -- confirm against the prologue
  1321. pop {r4 - r9, fp}
  1322. bx lr // return to caller
  1323. EPILOGUE