You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

strmm_kernel_4x4_vfpv3.S 32 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/23 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. **************************************************************************************/
  34. #define ASSEMBLER
  35. #include "common.h"
  /******************************************************
  * Frame layout and register aliases.
  *
  * The prologue pushes {r4 - r9, fp} (7 words), sets
  * fp = sp + 24 and then lowers sp by STACKSIZE, so
  * negative fp offsets are locals of this routine and
  * [fp, #4] upwards are words placed on the stack by
  * the caller.
  ******************************************************/
  #define STACKSIZE       256

  /* Incoming register arguments; the prologue spills them to the frame. */
  #define OLD_M           r0
  #define OLD_N           r1
  #define OLD_K           r2
  #define OLD_A           r3
  #define OLD_ALPHA       s0

  /******************************************************
  * [fp, #-128] - [fp, #-32] is reserved
  * for store and restore of floating point
  * registers
  *******************************************************/

  /* Local stack slots (below fp). */
  #define KK              [fp, #-244]
  #define KKK             [fp, #-248]
  #define LDC             [fp, #-252]     /* OLD_LDC * 4, i.e. in bytes */
  #define M               [fp, #-256]
  #define N               [fp, #-260]
  #define K               [fp, #-264]
  #define A               [fp, #-268]
  #define ALPHA           [fp, #-280]

  /* Caller-provided stack words (above the saved registers). */
  #define B               [fp, #4]
  #define C               [fp, #8]
  #define OLD_LDC         [fp, #12]
  #define OFFSET          [fp, #16]

  /* Loop counters; they reuse r0-r2 once the arguments have been spilled. */
  #define I               r0
  #define J               r1
  #define L               r2

  /* Working pointers and scratch. */
  #define AO              r5              /* running pointer into A */
  #define BO              r6              /* running pointer into B */
  #define CO1             r8              /* output pointers into C */
  #define CO2             r9
  #define K1              r7              /* inner-product length of the current tile */
  #define BC              r12             /* start of the current B panel */

  /* Byte offsets used by the pld prefetch hints. */
  #define A_PRE           96
  #define B_PRE           96
  #define C_PRE           64
  71. /**************************************************************************************
  72. * Macro definitions
  73. **************************************************************************************/
  /*
   * INIT4x4 - clear the sixteen accumulators s16-s31 of a 4x4 tile.
   *
   * s16 is first loaded with the constant 1.0 and then subtracted from
   * itself.  Subtracting the stale register content from itself is not a
   * reliable way to obtain zero: if s16 holds NaN or +/-Inf the result is
   * NaN and every accumulator copied from it starts out poisoned.
   * 1.0 - 1.0 is zero regardless of what s16 held before.
   *
   * Clobbers: s16-s31 only.  Requires the VFPv3 immediate form of vmov.
   */
  .macro INIT4x4
        vmov.f32        s16, #1.0               // known finite value
        vsub.f32        s16, s16, s16           // 1.0 - 1.0 = 0.0
        vmov.f32        s17, s16
        vmov.f32        s18, s16
        vmov.f32        s19, s16
        vmov.f32        s20, s16
        vmov.f32        s21, s16
        vmov.f32        s22, s16
        vmov.f32        s23, s16
        vmov.f32        s24, s16
        vmov.f32        s25, s16
        vmov.f32        s26, s16
        vmov.f32        s27, s16
        vmov.f32        s28, s16
        vmov.f32        s29, s16
        vmov.f32        s30, s16
        vmov.f32        s31, s16
  .endm
  /*
   * KERNEL4x4_I - first step of the software-pipelined 4x4 inner loop.
   *
   * Loads a[0..3] into s0-s3 and b[0..3] into s8-s11, *initialises* the
   * accumulators with plain products (s16+4*j+i = a[i] * b[j]), and
   * preloads the operands of the next step into s4-s7 / s12-s15.
   * AO and BO each advance by 32 bytes.  Because the accumulators are
   * written, not updated, no INIT4x4 is needed before this macro.
   * Loads are interleaved with the multiplies; keep the order.
   */
  .macro KERNEL4x4_I
        vldmia          AO!, { s0 - s1 }
        pld             [ AO, #A_PRE-8 ]
        vldmia          BO!, { s8 - s9 }
        pld             [ BO, #B_PRE-8 ]

        vmul.f32        s16, s0, s8
        vldmia          AO!, { s2 - s3 }
        vmul.f32        s17, s1, s8
        vmul.f32        s18, s2, s8
        vldmia          BO!, { s10 - s11 }
        vmul.f32        s19, s3, s8

        vmul.f32        s20, s0, s9
        vldmia          AO!, { s4 - s5 }        // next step: a[0..1]
        vmul.f32        s21, s1, s9
        vmul.f32        s22, s2, s9
        vldmia          AO!, { s6 - s7 }        // next step: a[2..3]
        vmul.f32        s23, s3, s9

        vmul.f32        s24, s0, s10
        vldmia          BO!, { s12 - s13 }      // next step: b[0..1]
        vmul.f32        s25, s1, s10
        vmul.f32        s26, s2, s10
        vldmia          BO!, { s14 - s15 }      // next step: b[2..3]
        vmul.f32        s27, s3, s10

        vmul.f32        s28, s0, s11
        vmul.f32        s29, s1, s11
        vmul.f32        s30, s2, s11
        vmul.f32        s31, s3, s11
  .endm
  /*
   * KERNEL4x4_M2 - pipelined step that consumes the "second" operand set.
   *
   * Accumulates a[i] (s4-s7) times b[j] (s12-s15) into s16-s31 while
   * reloading the "first" operand set (s0-s3 from AO, s8-s11 from BO)
   * for the following KERNEL4x4_M1.  AO and BO each advance by 16 bytes.
   */
  .macro KERNEL4x4_M2
        pld             [ AO, #A_PRE ]
        vmla.f32        s16, s4, s12
        vmla.f32        s17, s5, s12
        vldmia          AO!, { s0 - s1 }
        vmla.f32        s18, s6, s12
        pld             [ BO, #B_PRE ]
        vmla.f32        s19, s7, s12

        vmla.f32        s20, s4, s13
        vldmia          AO!, { s2 - s3 }
        vmla.f32        s21, s5, s13
        vmla.f32        s22, s6, s13
        vldmia          BO!, { s8 - s9 }
        vmla.f32        s23, s7, s13

        vmla.f32        s24, s4, s14
        vldmia          BO!, { s10 - s11 }
        vmla.f32        s25, s5, s14
        vmla.f32        s26, s6, s14
        vmla.f32        s27, s7, s14

        vmla.f32        s28, s4, s15
        vmla.f32        s29, s5, s15
        vmla.f32        s30, s6, s15
        vmla.f32        s31, s7, s15
  .endm
  /*
   * KERNEL4x4_M1 - pipelined step that consumes the "first" operand set.
   *
   * Accumulates a[i] (s0-s3) times b[j] (s8-s11) into s16-s31 while
   * reloading the "second" operand set (s4-s7 from AO, s12-s15 from BO)
   * for the following KERNEL4x4_M2 or KERNEL4x4_E.  AO and BO each
   * advance by 16 bytes.
   */
  .macro KERNEL4x4_M1
        vmla.f32        s16, s0, s8
        vldmia          AO!, { s4 - s5 }
        vmla.f32        s17, s1, s8
        vmla.f32        s18, s2, s8
        vldmia          AO!, { s6 - s7 }
        vmla.f32        s19, s3, s8

        vmla.f32        s20, s0, s9
        vldmia          BO!, { s12 - s13 }
        vmla.f32        s21, s1, s9
        vmla.f32        s22, s2, s9
        vldmia          BO!, { s14 - s15 }
        vmla.f32        s23, s3, s9

        vmla.f32        s24, s0, s10
        vmla.f32        s25, s1, s10
        vmla.f32        s26, s2, s10
        vmla.f32        s27, s3, s10

        vmla.f32        s28, s0, s11
        vmla.f32        s29, s1, s11
        vmla.f32        s30, s2, s11
        vmla.f32        s31, s3, s11
  .endm
  /*
   * KERNEL4x4_E - final step of the pipelined 4x4 loop.
   *
   * Same arithmetic as KERNEL4x4_M2 (s4-s7 times s12-s15 accumulated
   * into s16-s31) but without any loads, so AO and BO are left untouched.
   */
  .macro KERNEL4x4_E
        vmla.f32        s16, s4, s12
        vmla.f32        s17, s5, s12
        vmla.f32        s18, s6, s12
        vmla.f32        s19, s7, s12

        vmla.f32        s20, s4, s13
        vmla.f32        s21, s5, s13
        vmla.f32        s22, s6, s13
        vmla.f32        s23, s7, s13

        vmla.f32        s24, s4, s14
        vmla.f32        s25, s5, s14
        vmla.f32        s26, s6, s14
        vmla.f32        s27, s7, s14

        vmla.f32        s28, s4, s15
        vmla.f32        s29, s5, s15
        vmla.f32        s30, s6, s15
        vmla.f32        s31, s7, s15
  .endm
  /*
   * KERNEL4x4_SUB - one stand-alone (non-pipelined) 4x4 update.
   *
   * Reads four floats from AO (s0-s3) and four from BO (s8-s11),
   * accumulates s16+4*j+i += a[i] * b[j], then advances AO and BO by
   * 16 bytes each.  The accumulators must already hold valid values.
   */
  .macro KERNEL4x4_SUB
        vldr            s8,  [ BO ]
        pld             [ BO, #B_PRE ]
        vldr            s0,  [ AO ]
        pld             [ AO, #A_PRE ]
        vldr            s1,  [ AO, #4 ]

        vmla.f32        s16, s0, s8
        vldr            s2,  [ AO, #8 ]
        vmla.f32        s17, s1, s8
        vldr            s3,  [ AO, #12 ]
        vmla.f32        s18, s2, s8
        vldr            s9,  [ BO, #4 ]
        vmla.f32        s19, s3, s8

        vldr            s10, [ BO, #8 ]
        vmla.f32        s20, s0, s9
        vldr            s11, [ BO, #12 ]
        vmla.f32        s21, s1, s9
        vmla.f32        s22, s2, s9
        vmla.f32        s23, s3, s9

        vmla.f32        s24, s0, s10
        vmla.f32        s25, s1, s10
        vmla.f32        s26, s2, s10
        vmla.f32        s27, s3, s10

        vmla.f32        s28, s0, s11
        vmla.f32        s29, s1, s11
        add             AO, AO, #16             // step past the four A values
        vmla.f32        s30, s2, s11
        add             BO, BO, #16             // step past the four B values
        vmla.f32        s31, s3, s11
  .endm
  /*
   * SAVE4x4 - scale the 4x4 tile by alpha and write it to C.
   *
   * Four groups of four floats are stored at CO1, CO1+LDC, CO1+2*LDC
   * and CO1+3*LDC (LDC slot holds a byte stride).  C is only written,
   * never read.  The multiplies for one group are interleaved with the
   * stores of the previous group.
   *
   * Clobbers: r3, r4, CO2, s0, s8-s15.  CO1 advances by 16 bytes.
   */
  .macro SAVE4x4
        ldr             r3, LDC
        add             CO2, CO1, r3            // second group
        vldr            s0, ALPHA
        add             r4, CO2, r3             // third group

        vmul.f32        s8,  s0, s16
        vmul.f32        s9,  s0, s17
        vmul.f32        s10, s0, s18
        vmul.f32        s11, s0, s19

        vmul.f32        s12, s0, s20
        vstr            s8,  [ CO1 ]
        vmul.f32        s13, s0, s21
        vstr            s9,  [ CO1, #4 ]
        vmul.f32        s14, s0, s22
        vstr            s10, [ CO1, #8 ]
        vmul.f32        s15, s0, s23
        vstr            s11, [ CO1, #12 ]

        vmul.f32        s8,  s0, s24
        vstr            s12, [ CO2 ]
        vmul.f32        s9,  s0, s25
        vstr            s13, [ CO2, #4 ]
        vmul.f32        s10, s0, s26
        vstr            s14, [ CO2, #8 ]
        vmul.f32        s11, s0, s27
        vstr            s15, [ CO2, #12 ]

        add             CO2, r4, r3             // fourth group

        vstr            s8,  [ r4 ]
        vmul.f32        s12, s0, s28
        vstr            s9,  [ r4, #4 ]
        vmul.f32        s13, s0, s29
        vstr            s10, [ r4, #8 ]
        vmul.f32        s14, s0, s30
        vstr            s11, [ r4, #12 ]
        vmul.f32        s15, s0, s31

        vstmia          CO2, { s12 - s15 }

        add             CO1, CO1, #16
  .endm
  251. /******************************************************************************/
  /*
   * INIT2x4 - clear the eight accumulators used by the 2x4 tile
   * (s16, s17, s20, s21, s24, s25, s28, s29).
   *
   * s16 is loaded with 1.0 before being subtracted from itself: with the
   * stale register content alone, a NaN or +/-Inf left in s16 would turn
   * the "zero" into NaN and poison the whole tile.
   *
   * Clobbers only the listed accumulators.  Requires the VFPv3 immediate
   * form of vmov.
   */
  .macro INIT2x4
        vmov.f32        s16, #1.0               // known finite value
        vsub.f32        s16, s16, s16           // 1.0 - 1.0 = 0.0
        vmov.f32        s17, s16
        vmov.f32        s20, s16
        vmov.f32        s21, s16
        vmov.f32        s24, s16
        vmov.f32        s25, s16
        vmov.f32        s28, s16
        vmov.f32        s29, s16
  .endm
  /*
   * KERNEL2x4_SUB - one 2x4 update.
   *
   * Reads b[0..3] from BO (s8-s11) and a[0..1] from AO (s0-s1) and
   * accumulates a[i] * b[j] into s16+4*j+i.  AO advances by 8 bytes,
   * BO by 16 bytes.
   */
  .macro KERNEL2x4_SUB
        vldr            s8,  [ BO ]
        vldr            s9,  [ BO, #4 ]
        vldr            s10, [ BO, #8 ]
        vldr            s11, [ BO, #12 ]

        vldr            s0,  [ AO ]
        vldr            s1,  [ AO, #4 ]

        vmla.f32        s16, s0, s8
        vmla.f32        s17, s1, s8

        vmla.f32        s20, s0, s9
        vmla.f32        s21, s1, s9

        vmla.f32        s24, s0, s10
        vmla.f32        s25, s1, s10

        vmla.f32        s28, s0, s11
        vmla.f32        s29, s1, s11

        add             AO, AO, #8
        add             BO, BO, #16
  .endm
  /*
   * SAVE2x4 - scale the 2x4 tile by alpha and write it to C.
   *
   * Two floats each go to CO1, CO1+LDC, CO1+2*LDC and CO1+3*LDC
   * (byte stride from the LDC slot).  C is only written, never read.
   *
   * Clobbers: r3, r4, CO2, s0, s8, s9, s12, s13.  CO1 advances by 8 bytes.
   */
  .macro SAVE2x4
        ldr             r3, LDC
        add             CO2, CO1, r3
        add             r4, CO2, r3

        vldr            s0, ALPHA

        vmul.f32        s8,  s0, s16
        vmul.f32        s9,  s0, s17
        vstr            s8,  [ CO1 ]
        vstr            s9,  [ CO1, #4 ]

        vmul.f32        s12, s0, s20
        vmul.f32        s13, s0, s21
        vstr            s12, [ CO2 ]
        vstr            s13, [ CO2, #4 ]

        vmul.f32        s8,  s0, s24
        vmul.f32        s9,  s0, s25
        vstr            s8,  [ r4 ]
        vstr            s9,  [ r4, #4 ]

        add             CO2, r4, r3             // fourth group

        vmul.f32        s12, s0, s28
        vmul.f32        s13, s0, s29
        vstr            s12, [ CO2 ]
        vstr            s13, [ CO2, #4 ]

        add             CO1, CO1, #8
  .endm
  304. /******************************************************************************/
  /*
   * INIT1x4 - clear the four accumulators used by the 1x4 tile
   * (s16, s20, s24, s28).
   *
   * s16 is loaded with 1.0 before being subtracted from itself, because
   * subtracting a stale NaN or +/-Inf from itself yields NaN, not zero.
   *
   * Clobbers only the listed accumulators.  Requires the VFPv3 immediate
   * form of vmov.
   */
  .macro INIT1x4
        vmov.f32        s16, #1.0               // known finite value
        vsub.f32        s16, s16, s16           // 1.0 - 1.0 = 0.0
        vmov.f32        s20, s16
        vmov.f32        s24, s16
        vmov.f32        s28, s16
  .endm
  /*
   * KERNEL1x4_SUB - one 1x4 update.
   *
   * Reads b[0..3] from BO (s8-s11) and a single float from AO (s0) and
   * accumulates a * b[j] into s16, s20, s24, s28.  AO advances by
   * 4 bytes, BO by 16 bytes.
   */
  .macro KERNEL1x4_SUB
        vldr            s8,  [ BO ]
        vldr            s9,  [ BO, #4 ]
        vldr            s10, [ BO, #8 ]
        vldr            s11, [ BO, #12 ]

        vldr            s0,  [ AO ]

        vmla.f32        s16, s0, s8
        vmla.f32        s20, s0, s9
        vmla.f32        s24, s0, s10
        vmla.f32        s28, s0, s11

        add             AO, AO, #4
        add             BO, BO, #16
  .endm
  /*
   * SAVE1x4 - scale the 1x4 tile by alpha and write it to C.
   *
   * One float each goes to CO1, CO1+LDC, CO1+2*LDC and CO1+3*LDC
   * (byte stride from the LDC slot).  C is only written, never read.
   *
   * Clobbers: r3, r4, CO2, s0, s8, s12.  CO1 advances by 4 bytes.
   */
  .macro SAVE1x4
        ldr             r3, LDC
        add             CO2, CO1, r3
        add             r4, CO2, r3

        vldr            s0, ALPHA

        vmul.f32        s8,  s0, s16
        vstr            s8,  [ CO1 ]

        vmul.f32        s12, s0, s20
        vstr            s12, [ CO2 ]

        vmul.f32        s8,  s0, s24
        vstr            s8,  [ r4 ]

        add             CO2, r4, r3             // fourth group

        vmul.f32        s12, s0, s28
        vstr            s12, [ CO2 ]

        add             CO1, CO1, #4
  .endm
  340. /******************************************************************************/
  341. /******************************************************************************/
  /*
   * INIT4x2 - clear the eight accumulators s16-s23 of the 4x2 tile.
   *
   * s16 is loaded with 1.0 before being subtracted from itself, because
   * subtracting a stale NaN or +/-Inf from itself yields NaN, not zero,
   * and that NaN would then be copied into every accumulator.
   *
   * Clobbers: s16-s23 only.  Requires the VFPv3 immediate form of vmov.
   */
  .macro INIT4x2
        vmov.f32        s16, #1.0               // known finite value
        vsub.f32        s16, s16, s16           // 1.0 - 1.0 = 0.0
        vmov.f32        s17, s16
        vmov.f32        s18, s16
        vmov.f32        s19, s16
        vmov.f32        s20, s16
        vmov.f32        s21, s16
        vmov.f32        s22, s16
        vmov.f32        s23, s16
  .endm
  /*
   * KERNEL4x2_SUB - one 4x2 update.
   *
   * Reads b[0..1] from BO (s8-s9) and a[0..3] from AO (s0-s3) and
   * accumulates a[i] * b[j] into s16+4*j+i.  AO advances by 16 bytes,
   * BO by 8 bytes.
   */
  .macro KERNEL4x2_SUB
        vldr            s8,  [ BO ]
        vldr            s9,  [ BO, #4 ]

        vldr            s0,  [ AO ]
        vldr            s1,  [ AO, #4 ]
        vldr            s2,  [ AO, #8 ]
        vldr            s3,  [ AO, #12 ]

        vmla.f32        s16, s0, s8
        vmla.f32        s17, s1, s8
        vmla.f32        s18, s2, s8
        vmla.f32        s19, s3, s8

        vmla.f32        s20, s0, s9
        vmla.f32        s21, s1, s9
        vmla.f32        s22, s2, s9
        vmla.f32        s23, s3, s9

        add             AO, AO, #16
        add             BO, BO, #8
  .endm
  /*
   * SAVE4x2 - scale the 4x2 tile by alpha and write it to C.
   *
   * Four floats go to CO1 and four to CO1+LDC (byte stride from the
   * LDC slot).  C is only written, never read.
   *
   * Clobbers: r3, CO2, s0, s8-s15.  CO1 advances by 16 bytes.
   */
  .macro SAVE4x2
        ldr             r3, LDC
        add             CO2, CO1, r3

        vldr            s0, ALPHA

        vmul.f32        s8,  s0, s16
        vmul.f32        s9,  s0, s17
        vmul.f32        s10, s0, s18
        vmul.f32        s11, s0, s19

        vstr            s8,  [ CO1 ]
        vstr            s9,  [ CO1, #4 ]
        vstr            s10, [ CO1, #8 ]
        vstr            s11, [ CO1, #12 ]

        vmul.f32        s12, s0, s20
        vmul.f32        s13, s0, s21
        vmul.f32        s14, s0, s22
        vmul.f32        s15, s0, s23

        vstr            s12, [ CO2 ]
        vstr            s13, [ CO2, #4 ]
        vstr            s14, [ CO2, #8 ]
        vstr            s15, [ CO2, #12 ]

        add             CO1, CO1, #16
  .endm
  392. /******************************************************************************/
  /*
   * INIT2x2 - clear the four accumulators used by the 2x2 tile
   * (s16, s17, s20, s21).
   *
   * s16 is loaded with 1.0 before being subtracted from itself, because
   * subtracting a stale NaN or +/-Inf from itself yields NaN, not zero.
   *
   * Clobbers only the listed accumulators.  Requires the VFPv3 immediate
   * form of vmov.
   */
  .macro INIT2x2
        vmov.f32        s16, #1.0               // known finite value
        vsub.f32        s16, s16, s16           // 1.0 - 1.0 = 0.0
        vmov.f32        s17, s16
        vmov.f32        s20, s16
        vmov.f32        s21, s16
  .endm
  /*
   * KERNEL2x2_SUB - one 2x2 update.
   *
   * Reads b[0..1] from BO (s8-s9) and a[0..1] from AO (s0-s1) and
   * accumulates into s16, s17 (b[0]) and s20, s21 (b[1]).  AO and BO
   * each advance by 8 bytes.
   */
  .macro KERNEL2x2_SUB
        vldr            s8,  [ BO ]
        vldr            s9,  [ BO, #4 ]

        vldr            s0,  [ AO ]
        vldr            s1,  [ AO, #4 ]

        vmla.f32        s16, s0, s8
        vmla.f32        s17, s1, s8

        vmla.f32        s20, s0, s9
        vmla.f32        s21, s1, s9

        add             AO, AO, #8
        add             BO, BO, #8
  .endm
  /*
   * SAVE2x2 - scale the 2x2 tile by alpha and write it to C.
   *
   * Two floats go to CO1 and two to CO1+LDC (byte stride from the LDC
   * slot).  C is only written, never read.
   *
   * Clobbers: r3, CO2, s0, s8, s9, s12, s13.  CO1 advances by 8 bytes.
   */
  .macro SAVE2x2
        ldr             r3, LDC
        add             CO2, CO1, r3

        vldr            s0, ALPHA

        vmul.f32        s8,  s0, s16
        vmul.f32        s9,  s0, s17
        vstr            s8,  [ CO1 ]
        vstr            s9,  [ CO1, #4 ]

        vmul.f32        s12, s0, s20
        vmul.f32        s13, s0, s21
        vstr            s12, [ CO2 ]
        vstr            s13, [ CO2, #4 ]

        add             CO1, CO1, #8
  .endm
  425. /******************************************************************************/
  /*
   * INIT1x2 - clear the two accumulators used by the 1x2 tile (s16, s20).
   *
   * s16 is loaded with 1.0 before being subtracted from itself, because
   * subtracting a stale NaN or +/-Inf from itself yields NaN, not zero.
   *
   * Clobbers: s16, s20.  Requires the VFPv3 immediate form of vmov.
   */
  .macro INIT1x2
        vmov.f32        s16, #1.0               // known finite value
        vsub.f32        s16, s16, s16           // 1.0 - 1.0 = 0.0
        vmov.f32        s20, s16
  .endm
  /*
   * KERNEL1x2_SUB - one 1x2 update.
   *
   * Reads b[0..1] from BO (s8-s9) and one float from AO (s0) and
   * accumulates into s16 and s20.  AO advances by 4 bytes, BO by 8.
   */
  .macro KERNEL1x2_SUB
        vldr            s8,  [ BO ]
        vldr            s9,  [ BO, #4 ]

        vldr            s0,  [ AO ]

        vmla.f32        s16, s0, s8
        vmla.f32        s20, s0, s9

        add             AO, AO, #4
        add             BO, BO, #8
  .endm
  /*
   * SAVE1x2 - scale the 1x2 tile by alpha and write it to C.
   *
   * One float goes to CO1 and one to CO1+LDC (byte stride from the LDC
   * slot).  C is only written, never read.
   *
   * Clobbers: r3, CO2, s0, s8, s12.  CO1 advances by 4 bytes.
   */
  .macro SAVE1x2
        ldr             r3, LDC
        add             CO2, CO1, r3

        vldr            s0, ALPHA

        vmul.f32        s8,  s0, s16
        vstr            s8,  [ CO1 ]

        vmul.f32        s12, s0, s20
        vstr            s12, [ CO2 ]

        add             CO1, CO1, #4
  .endm
  449. /******************************************************************************/
  450. /******************************************************************************/
  /*
   * INIT4x1 - clear the four accumulators s16-s19 of the 4x1 tile.
   *
   * s16 is loaded with 1.0 before being subtracted from itself, because
   * subtracting a stale NaN or +/-Inf from itself yields NaN, not zero.
   *
   * Clobbers: s16-s19 only.  Requires the VFPv3 immediate form of vmov.
   */
  .macro INIT4x1
        vmov.f32        s16, #1.0               // known finite value
        vsub.f32        s16, s16, s16           // 1.0 - 1.0 = 0.0
        vmov.f32        s17, s16
        vmov.f32        s18, s16
        vmov.f32        s19, s16
  .endm
  /*
   * KERNEL4x1_SUB - one 4x1 update.
   *
   * Reads one float from BO (s8) and a[0..3] from AO (s0-s3) and
   * accumulates a[i] * b into s16-s19.  AO advances by 16 bytes,
   * BO by 4 bytes.
   */
  .macro KERNEL4x1_SUB
        vldr            s8,  [ BO ]

        vldr            s0,  [ AO ]
        vldr            s1,  [ AO, #4 ]
        vldr            s2,  [ AO, #8 ]
        vldr            s3,  [ AO, #12 ]

        vmla.f32        s16, s0, s8
        vmla.f32        s17, s1, s8
        vmla.f32        s18, s2, s8
        vmla.f32        s19, s3, s8

        add             AO, AO, #16
        add             BO, BO, #4
  .endm
  /*
   * SAVE4x1 - scale the 4x1 tile by alpha and write four floats at CO1.
   * C is only written, never read.
   *
   * Clobbers: s0, s8-s11.  CO1 advances by 16 bytes.
   */
  .macro SAVE4x1
        vldr            s0, ALPHA

        vmul.f32        s8,  s0, s16
        vmul.f32        s9,  s0, s17
        vmul.f32        s10, s0, s18
        vmul.f32        s11, s0, s19

        vstr            s8,  [ CO1 ]
        vstr            s9,  [ CO1, #4 ]
        vstr            s10, [ CO1, #8 ]
        vstr            s11, [ CO1, #12 ]

        add             CO1, CO1, #16
  .endm
  482. /******************************************************************************/
  /*
   * INIT2x1 - clear the two accumulators s16 and s17 of the 2x1 tile.
   *
   * s16 is loaded with 1.0 before being subtracted from itself, because
   * subtracting a stale NaN or +/-Inf from itself yields NaN, not zero.
   *
   * Clobbers: s16, s17.  Requires the VFPv3 immediate form of vmov.
   */
  .macro INIT2x1
        vmov.f32        s16, #1.0               // known finite value
        vsub.f32        s16, s16, s16           // 1.0 - 1.0 = 0.0
        vmov.f32        s17, s16
  .endm
  /*
   * KERNEL2x1_SUB - one 2x1 update.
   *
   * Reads one float from BO (s8) and a[0..1] from AO (s0-s1) and
   * accumulates into s16 and s17.  AO advances by 8 bytes, BO by 4.
   */
  .macro KERNEL2x1_SUB
        vldr            s8,  [ BO ]

        vldr            s0,  [ AO ]
        vldr            s1,  [ AO, #4 ]

        vmla.f32        s16, s0, s8
        vmla.f32        s17, s1, s8

        add             AO, AO, #8
        add             BO, BO, #4
  .endm
  /*
   * SAVE2x1 - scale the 2x1 tile by alpha and write two floats at CO1.
   * C is only written, never read.
   *
   * Clobbers: s0, s8, s9.  CO1 advances by 8 bytes.
   */
  .macro SAVE2x1
        vldr            s0, ALPHA

        vmul.f32        s8,  s0, s16
        vmul.f32        s9,  s0, s17

        vstr            s8,  [ CO1 ]
        vstr            s9,  [ CO1, #4 ]

        add             CO1, CO1, #8
  .endm
  504. /******************************************************************************/
  /*
   * INIT1x1 - clear the single accumulator s16 of the 1x1 tile.
   *
   * s16 is loaded with 1.0 before being subtracted from itself, because
   * subtracting a stale NaN or +/-Inf from itself yields NaN, not zero.
   *
   * Clobbers: s16.  Requires the VFPv3 immediate form of vmov.
   */
  .macro INIT1x1
        vmov.f32        s16, #1.0               // known finite value
        vsub.f32        s16, s16, s16           // 1.0 - 1.0 = 0.0
  .endm
  /*
   * KERNEL1x1_SUB - one scalar multiply-accumulate.
   *
   * s16 += [AO] * [BO]; AO and BO each advance by 4 bytes.
   */
  .macro KERNEL1x1_SUB
        vldr            s8,  [ BO ]
        vldr            s0,  [ AO ]

        vmla.f32        s16, s0, s8

        add             AO, AO, #4
        add             BO, BO, #4
  .endm
  /*
   * SAVE1x1 - scale the single accumulator by alpha and write it at CO1.
   * C is only written, never read.
   *
   * Clobbers: s0, s8.  CO1 advances by 4 bytes.
   */
  .macro SAVE1x1
        vldr            s0, ALPHA

        vmul.f32        s8,  s0, s16
        vstr            s8,  [ CO1 ]

        add             CO1, CO1, #4
  .endm
  521. /**************************************************************************************
  522. * End of macro definitions
  523. **************************************************************************************/
  524. PROLOGUE
  525. .align 5
  526. push {r4 - r9, fp}
  527. add fp, sp, #24
  528. sub sp, sp, #STACKSIZE // reserve stack
  529. str OLD_M, M
  530. str OLD_N, N
  531. str OLD_K, K
  532. str OLD_A, A
  533. vstr OLD_ALPHA, ALPHA
  534. sub r3, fp, #128
  535. vstm r3, { s8 - s31} // store floating point registers
  536. ldr r3, OLD_LDC
  537. lsl r3, r3, #2 // ldc = ldc * 4
  538. str r3, LDC
  539. ldr r3, OFFSET
  540. #ifndef LEFT
  541. neg r3 , r3
  542. #endif
  543. str r3 , KK
  544. ldr BC, B
  545. ldr J, N
  546. asrs J, J, #2 // J = J / 4
  547. ble _L2_BEGIN
  548. _L4_BEGIN:
  549. ldr CO1, C // CO1 = C
  550. ldr r4 , LDC
  551. lsl r4 , r4 , #2 // LDC * 4
  552. add r3 , r4, CO1
  553. str r3 , C // store C
  554. #if defined(LEFT)
  555. ldr r3 , OFFSET
  556. str r3 , KK
  557. #endif
  558. ldr AO, A // AO = A
  559. pld [AO , #A_PRE-64]
  560. pld [AO , #A_PRE-32]
  561. _L4_M4_BEGIN:
  562. ldr I, M
  563. asrs I, I, #2 // I = I / 4
  564. ble _L4_M2_BEGIN
  565. _L4_M4_20:
  566. #if (defined(LEFT) && defined(TRANSA)) || \
  567. (!defined(LEFT) && !defined(TRANSA))
  568. mov BO, BC
  569. #else
  570. mov BO, BC
  571. ldr r3 , KK
  572. lsls r4 , r3 , #4 // 4 float values
  573. add BO , BO , r4
  574. lsls r4 , r3 , #4 // 4 float values
  575. add AO , AO , r4
  576. #endif
  577. #ifndef TRMMKERNEL
  578. ldr K1, K
  579. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  580. ldr K1, K
  581. ldr r3, KK
  582. sub K1, K1, r3
  583. str K1, KKK
  584. #else
  585. ldr K1, KK
  586. #ifdef LEFT
  587. add K1, K1, #4 // number of values in AO
  588. #else
  589. add K1, K1, #4 // number of values in BO
  590. #endif
  591. str K1, KKK
  592. #endif
  593. asrs L , K1, #3 // L = L / 8
  594. cmp L , #3
  595. blt _L4_M4_30
  596. .align 5
  597. KERNEL4x4_I
  598. KERNEL4x4_M2
  599. KERNEL4x4_M1
  600. KERNEL4x4_M2
  601. KERNEL4x4_M1
  602. KERNEL4x4_M2
  603. KERNEL4x4_M1
  604. KERNEL4x4_M2
  605. sub L, L, #2
  606. _L4_M4_22:
  607. KERNEL4x4_M1
  608. KERNEL4x4_M2
  609. KERNEL4x4_M1
  610. KERNEL4x4_M2
  611. KERNEL4x4_M1
  612. KERNEL4x4_M2
  613. KERNEL4x4_M1
  614. KERNEL4x4_M2
  615. subs L, L, #1
  616. bgt _L4_M4_22
  617. KERNEL4x4_M1
  618. KERNEL4x4_M2
  619. KERNEL4x4_M1
  620. KERNEL4x4_M2
  621. KERNEL4x4_M1
  622. KERNEL4x4_M2
  623. KERNEL4x4_M1
  624. KERNEL4x4_E
  625. b _L4_M4_44
  626. _L4_M4_30:
  627. tst L, #3
  628. ble _L4_M4_40
  629. tst L, #2
  630. ble _L4_M4_32
  631. KERNEL4x4_I
  632. KERNEL4x4_M2
  633. KERNEL4x4_M1
  634. KERNEL4x4_M2
  635. KERNEL4x4_M1
  636. KERNEL4x4_M2
  637. KERNEL4x4_M1
  638. KERNEL4x4_M2
  639. KERNEL4x4_M1
  640. KERNEL4x4_M2
  641. KERNEL4x4_M1
  642. KERNEL4x4_M2
  643. KERNEL4x4_M1
  644. KERNEL4x4_M2
  645. KERNEL4x4_M1
  646. KERNEL4x4_E
  647. b _L4_M4_44
  648. _L4_M4_32:
  649. tst L, #1
  650. ble _L4_M4_40
  651. KERNEL4x4_I
  652. KERNEL4x4_M2
  653. KERNEL4x4_M1
  654. KERNEL4x4_M2
  655. KERNEL4x4_M1
  656. KERNEL4x4_M2
  657. KERNEL4x4_M1
  658. KERNEL4x4_E
  659. b _L4_M4_44
  660. _L4_M4_40:
  661. INIT4x4
  662. _L4_M4_44:
  663. ands L , K1, #7 // L = L % 8
  664. ble _L4_M4_100
  665. _L4_M4_46:
  666. KERNEL4x4_SUB
  667. subs L, L, #1
  668. bne _L4_M4_46
  669. _L4_M4_100:
  670. SAVE4x4
  671. #if (defined(LEFT) && defined(TRANSA)) || \
  672. (!defined(LEFT) && !defined(TRANSA))
  673. ldr r3 , K
  674. ldr r4 , KKK
  675. sub r3 , r3 , r4
  676. lsls r4 , r3 , #4 // 4 float values
  677. add BO , BO , r4
  678. lsls r4 , r3 , #4 // 4 float values
  679. add AO , AO , r4
  680. #endif
  681. #if defined(LEFT)
  682. ldr r3 , KK
  683. add r3 , r3 , #4 // number of values in AO
  684. str r3 , KK
  685. #endif
  686. _L4_M4_END:
  687. subs I, I, #1
  688. bne _L4_M4_20
  689. _L4_M2_BEGIN:
  690. ldr I, M
  691. tst I , #3
  692. ble _L4_END
  693. tst I, #2 // I = I / 2
  694. ble _L4_M1_BEGIN
  695. _L4_M2_20:
  696. INIT2x4
  697. #if (defined(LEFT) && defined(TRANSA)) || \
  698. (!defined(LEFT) && !defined(TRANSA))
  699. mov BO, BC
  700. #else
  701. mov BO, BC
  702. ldr r3 , KK
  703. lsls r4 , r3 , #4 // 4 float values
  704. add BO , BO , r4
  705. lsls r4 , r3 , #3 // 2 float values
  706. add AO , AO , r4
  707. #endif
  708. #ifndef TRMMKERNEL
  709. ldr K1, K
  710. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  711. ldr K1, K
  712. ldr r3, KK
  713. sub K1, K1, r3
  714. str K1, KKK
  715. #else
  716. ldr K1, KK
  717. #ifdef LEFT
  718. add K1, K1, #2 // number of values in AO
  719. #else
  720. add K1, K1, #4 // number of values in BO
  721. #endif
  722. str K1, KKK
  723. #endif
  724. asrs L , K1, #3 // L = L / 8
  725. ble _L4_M2_40
  726. _L4_M2_22:
  727. KERNEL2x4_SUB
  728. KERNEL2x4_SUB
  729. KERNEL2x4_SUB
  730. KERNEL2x4_SUB
  731. KERNEL2x4_SUB
  732. KERNEL2x4_SUB
  733. KERNEL2x4_SUB
  734. KERNEL2x4_SUB
  735. subs L, L, #1
  736. bgt _L4_M2_22
  737. _L4_M2_40:
  738. ands L , K1, #7 // L = L % 8
  739. ble _L4_M2_100
  740. _L4_M2_42:
  741. KERNEL2x4_SUB
  742. subs L, L, #1
  743. bgt _L4_M2_42
  744. _L4_M2_100:
  745. SAVE2x4
  746. #if (defined(LEFT) && defined(TRANSA)) || \
  747. (!defined(LEFT) && !defined(TRANSA))
  748. ldr r3 , K
  749. ldr r4 , KKK
  750. sub r3 , r3 , r4
  751. lsls r4 , r3 , #4 // 4 float values
  752. add BO , BO , r4
  753. lsls r4 , r3 , #3 // 2 float values
  754. add AO , AO , r4
  755. #endif
  756. #if defined(LEFT)
  757. ldr r3 , KK
  758. add r3 , r3 , #2 // number of values in AO
  759. str r3 , KK
  760. #endif
  761. _L4_M2_END:
  762. _L4_M1_BEGIN:
  763. tst I, #1 // I = I % 2
  764. ble _L4_END
  765. _L4_M1_20:
  766. INIT1x4
  767. #if (defined(LEFT) && defined(TRANSA)) || \
  768. (!defined(LEFT) && !defined(TRANSA))
  769. mov BO, BC
  770. #else
  771. mov BO, BC
  772. ldr r3 , KK
  773. lsls r4 , r3 , #4 // 4 float values
  774. add BO , BO , r4
  775. lsls r4 , r3 , #2 // 1 float value
  776. add AO , AO , r4
  777. #endif
  778. #ifndef TRMMKERNEL
  779. ldr K1, K
  780. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  781. ldr K1, K
  782. ldr r3, KK
  783. sub K1, K1, r3
  784. str K1, KKK
  785. #else
  786. ldr K1, KK
  787. #ifdef LEFT
  788. add K1, K1, #1 // number of values in AO
  789. #else
  790. add K1, K1, #4 // number of values in BO
  791. #endif
  792. str K1, KKK
  793. #endif
  794. asrs L , K1, #3 // L = L / 8
  795. ble _L4_M1_40
  796. _L4_M1_22:
  797. KERNEL1x4_SUB
  798. KERNEL1x4_SUB
  799. KERNEL1x4_SUB
  800. KERNEL1x4_SUB
  801. KERNEL1x4_SUB
  802. KERNEL1x4_SUB
  803. KERNEL1x4_SUB
  804. KERNEL1x4_SUB
  805. subs L, L, #1
  806. bgt _L4_M1_22
  807. _L4_M1_40:
  808. ands L , K1, #7 // L = L % 8
  809. ble _L4_M1_100
  810. _L4_M1_42:
  811. KERNEL1x4_SUB
  812. subs L, L, #1
  813. bgt _L4_M1_42
  814. _L4_M1_100:
  815. SAVE1x4
  816. #if (defined(LEFT) && defined(TRANSA)) || \
  817. (!defined(LEFT) && !defined(TRANSA))
  818. ldr r3 , K
  819. ldr r4 , KKK
  820. sub r3 , r3 , r4
  821. lsls r4 , r3 , #4 // 4 float values
  822. add BO , BO , r4
  823. lsls r4 , r3 , #2 // 1 float value
  824. add AO , AO , r4
  825. #endif
  826. #if defined(LEFT)
  827. ldr r3 , KK
  828. add r3 , r3 , #1 // number of values in AO
  829. str r3 , KK
  830. #endif
  831. _L4_END:
  832. mov r3, BC
  833. ldr r4, K
  834. lsl r4, r4, #4 // k * 4 * 4
  835. add r3, r3, r4 // B = B + K * 4 * 4
  836. mov BC, r3
  837. #if !defined(LEFT)
  838. ldr r3 , KK
  839. add r3 , r3 , #4 // number of values in BO
  840. str r3 , KK
  841. #endif
  842. subs J , #1 // j--
  843. bgt _L4_BEGIN
  844. /*********************************************************************************************/
  845. _L2_BEGIN:
  846. ldr J , N
  847. tst J , #3
  848. ble _L999
  849. tst J , #2
  850. ble _L1_BEGIN
  851. ldr CO1, C // CO1 = C
  852. ldr r4 , LDC
  853. lsl r4 , r4 , #1 // LDC * 2
  854. add r3 , r4, CO1
  855. str r3 , C // store C
  856. #if defined(LEFT)
  857. ldr r3 , OFFSET
  858. str r3 , KK
  859. #endif
  860. ldr AO, A // AO = A
  861. //pld [AO , #A_PRE-96]
  862. //pld [AO , #A_PRE-64]
  863. //pld [AO , #A_PRE-32]
  864. _L2_M4_BEGIN: // 4x2 tiles: 4 values of A against 2 values of B per K step
  865. ldr I, M
  866. asrs I, I, #2 // I = M / 4 = number of 4x2 tiles
  867. ble _L2_M2_BEGIN // no full group of 4: continue with the 2-wide remainder
  868. _L2_M4_20: // head of the per-tile loop (counted by I)
  869. INIT4x2 // macro defined outside this chunk; presumably resets the tile accumulators
  870. #if (defined(LEFT) && defined(TRANSA)) || \
  871. (!defined(LEFT) && !defined(TRANSA))
  872. mov BO, BC // BO = BC, no offset applied in this variant
  873. #else
  874. mov BO, BC // BO = BC, then skip the first KK steps of both operands
  875. ldr r3 , KK
  876. lsls r4 , r3 , #3 // r4 = KK * 8 bytes (2 float values per step)
  877. add BO , BO , r4
  878. lsls r4 , r3 , #4 // r4 = KK * 16 bytes (4 float values per step)
  879. add AO , AO , r4
  880. #endif
  881. #ifndef TRMMKERNEL
  882. ldr K1, K // non-TRMM build: trip count is the full K
  883. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  884. ldr K1, K
  885. ldr r3, KK
  886. sub K1, K1, r3 // K1 = K - KK
  887. str K1, KKK // record the trip count in KKK
  888. #else
  889. ldr K1, KK
  890. #ifdef LEFT
  891. add K1, K1, #4 // K1 = KK + 4 (number of values in AO)
  892. #else
  893. add K1, K1, #2 // K1 = KK + 2 (number of values in BO)
  894. #endif
  895. str K1, KKK // KKK is read back after SAVE4x2 to advance AO/BO
  896. #endif
  897. asrs L , K1, #3 // L = K1 / 8 (passes through the 8x unrolled loop)
  898. ble _L2_M4_40 // K1 < 8: only the tail loop can run
  899. .align 5
  900. _L2_M4_22: // unrolled loop: 8 KERNEL4x2_SUB invocations per pass
  901. KERNEL4x2_SUB
  902. KERNEL4x2_SUB
  903. KERNEL4x2_SUB
  904. KERNEL4x2_SUB
  905. KERNEL4x2_SUB
  906. KERNEL4x2_SUB
  907. KERNEL4x2_SUB
  908. KERNEL4x2_SUB
  909. subs L, L, #1
  910. bgt _L2_M4_22
  911. _L2_M4_40: // tail: the K1 & 7 steps the unrolled loop did not cover
  912. ands L , K1, #7 // L = K1 & 7 (K1 mod 8)
  913. beq _L2_M4_100 // beq, not ble: ands leaves V untouched, so ble would also test a stale V flag
  914. _L2_M4_42: // one KERNEL4x2_SUB invocation per pass
  915. KERNEL4x2_SUB
  916. subs L, L, #1
  917. bgt _L2_M4_42
  918. _L2_M4_100:
  919. SAVE4x2 // macro defined outside this chunk; presumably stores the finished tile to C
  920. #if (defined(LEFT) && defined(TRANSA)) || \
  921. (!defined(LEFT) && !defined(TRANSA))
  922. ldr r3 , K
  923. ldr r4 , KKK
  924. sub r3 , r3 , r4 // r3 = K - KKK = steps this tile did not consume
  925. lsls r4 , r3 , #3 // r4 = r3 * 8 bytes (2 float values per step)
  926. add BO , BO , r4
  927. lsls r4 , r3 , #4 // r4 = r3 * 16 bytes (4 float values per step)
  928. add AO , AO , r4 // AO skips the unconsumed steps
  929. #endif
  930. #if defined(LEFT)
  931. ldr r3 , KK
  932. add r3 , r3 , #4 // KK += 4 (number of values in AO)
  933. str r3 , KK
  934. #endif
  935. _L2_M4_END:
  936. subs I, I, #1
  937. bgt _L2_M4_20 // next 4x2 tile
  938. _L2_M2_BEGIN: // remainder of M after the 4-wide tiles: optional 2x2 tile
  939. ldr I, M
  940. tst I , #3 // any remainder (M & 3)?
  941. beq _L2_END // beq, not ble: tst leaves V untouched, so ble would also test a stale V flag
  942. tst I, #2 // is bit 1 of M set, i.e. is there one 2-wide block left?
  943. beq _L2_M1_BEGIN // no: only the 1-wide block can remain
  944. _L2_M2_20: // runs at most once, there is no loop back to this label
  945. INIT2x2 // macro defined outside this chunk; presumably resets the tile accumulators
  946. #if (defined(LEFT) && defined(TRANSA)) || \
  947. (!defined(LEFT) && !defined(TRANSA))
  948. mov BO, BC // BO = BC, no offset applied in this variant
  949. #else
  950. mov BO, BC // BO = BC, then skip the first KK steps of both operands
  951. ldr r3 , KK
  952. lsls r4 , r3 , #3 // r4 = KK * 8 bytes (2 float values per step)
  953. add BO , BO , r4
  954. lsls r4 , r3 , #3 // r4 = KK * 8 bytes (2 float values per step)
  955. add AO , AO , r4
  956. #endif
  957. #ifndef TRMMKERNEL
  958. ldr K1, K // non-TRMM build: trip count is the full K
  959. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  960. ldr K1, K
  961. ldr r3, KK
  962. sub K1, K1, r3 // K1 = K - KK
  963. str K1, KKK // record the trip count in KKK
  964. #else
  965. ldr K1, KK
  966. #ifdef LEFT
  967. add K1, K1, #2 // K1 = KK + 2 (number of values in AO)
  968. #else
  969. add K1, K1, #2 // K1 = KK + 2 (number of values in BO)
  970. #endif
  971. str K1, KKK // KKK is read back after SAVE2x2 to advance AO/BO
  972. #endif
  973. asrs L , K1, #3 // L = K1 / 8 (passes through the 8x unrolled loop)
  974. ble _L2_M2_40 // K1 < 8: only the tail loop can run
  975. _L2_M2_22: // unrolled loop: 8 KERNEL2x2_SUB invocations per pass
  976. KERNEL2x2_SUB
  977. KERNEL2x2_SUB
  978. KERNEL2x2_SUB
  979. KERNEL2x2_SUB
  980. KERNEL2x2_SUB
  981. KERNEL2x2_SUB
  982. KERNEL2x2_SUB
  983. KERNEL2x2_SUB
  984. subs L, L, #1
  985. bgt _L2_M2_22
  986. _L2_M2_40: // tail: the K1 & 7 steps the unrolled loop did not cover
  987. ands L , K1, #7 // L = K1 & 7 (K1 mod 8)
  988. beq _L2_M2_100 // beq, not ble: ands leaves V untouched as well
  989. _L2_M2_42: // one KERNEL2x2_SUB invocation per pass
  990. KERNEL2x2_SUB
  991. subs L, L, #1
  992. bgt _L2_M2_42
  993. _L2_M2_100:
  994. SAVE2x2 // macro defined outside this chunk; presumably stores the finished tile to C
  995. #if (defined(LEFT) && defined(TRANSA)) || \
  996. (!defined(LEFT) && !defined(TRANSA))
  997. ldr r3 , K
  998. ldr r4 , KKK
  999. sub r3 , r3 , r4 // r3 = K - KKK = steps this tile did not consume
  1000. lsls r4 , r3 , #3 // r4 = r3 * 8 bytes (2 float values per step)
  1001. add BO , BO , r4
  1002. lsls r4 , r3 , #3 // r4 = r3 * 8 bytes (2 float values per step)
  1003. add AO , AO , r4 // AO skips the unconsumed steps
  1004. #endif
  1005. #if defined(LEFT)
  1006. ldr r3 , KK
  1007. add r3 , r3 , #2 // KK += 2 (number of values in AO)
  1008. str r3 , KK
  1009. #endif
  1010. _L2_M2_END: // falls through to the 1-wide remainder check
  1011. _L2_M1_BEGIN: // last remainder of M: optional 1x2 tile; I still holds M from _L2_M2_BEGIN
  1012. tst I, #1 // is bit 0 of M set, i.e. is there one 1-wide block left?
  1013. beq _L2_END // beq, not ble: tst leaves V untouched, so ble would also test a stale V flag
  1014. _L2_M1_20: // runs at most once, there is no loop back to this label
  1015. INIT1x2 // macro defined outside this chunk; presumably resets the tile accumulators
  1016. #if (defined(LEFT) && defined(TRANSA)) || \
  1017. (!defined(LEFT) && !defined(TRANSA))
  1018. mov BO, BC // BO = BC, no offset applied in this variant
  1019. #else
  1020. mov BO, BC // BO = BC, then skip the first KK steps of both operands
  1021. ldr r3 , KK
  1022. lsls r4 , r3 , #3 // r4 = KK * 8 bytes (2 float values per step)
  1023. add BO , BO , r4
  1024. lsls r4 , r3 , #2 // r4 = KK * 4 bytes (1 float value per step)
  1025. add AO , AO , r4
  1026. #endif
  1027. #ifndef TRMMKERNEL
  1028. ldr K1, K // non-TRMM build: trip count is the full K
  1029. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1030. ldr K1, K
  1031. ldr r3, KK
  1032. sub K1, K1, r3 // K1 = K - KK
  1033. str K1, KKK // record the trip count in KKK
  1034. #else
  1035. ldr K1, KK
  1036. #ifdef LEFT
  1037. add K1, K1, #1 // K1 = KK + 1 (number of values in AO)
  1038. #else
  1039. add K1, K1, #2 // K1 = KK + 2 (number of values in BO)
  1040. #endif
  1041. str K1, KKK // KKK is read back after SAVE1x2 to advance AO/BO
  1042. #endif
  1043. asrs L , K1, #3 // L = K1 / 8 (passes through the 8x unrolled loop)
  1044. ble _L2_M1_40 // K1 < 8: only the tail loop can run
  1045. _L2_M1_22: // unrolled loop: 8 KERNEL1x2_SUB invocations per pass
  1046. KERNEL1x2_SUB
  1047. KERNEL1x2_SUB
  1048. KERNEL1x2_SUB
  1049. KERNEL1x2_SUB
  1050. KERNEL1x2_SUB
  1051. KERNEL1x2_SUB
  1052. KERNEL1x2_SUB
  1053. KERNEL1x2_SUB
  1054. subs L, L, #1
  1055. bgt _L2_M1_22
  1056. _L2_M1_40: // tail: the K1 & 7 steps the unrolled loop did not cover
  1057. ands L , K1, #7 // L = K1 & 7 (K1 mod 8)
  1058. beq _L2_M1_100 // beq, not ble: ands leaves V untouched as well
  1059. _L2_M1_42: // one KERNEL1x2_SUB invocation per pass
  1060. KERNEL1x2_SUB
  1061. subs L, L, #1
  1062. bgt _L2_M1_42
  1063. _L2_M1_100:
  1064. SAVE1x2 // macro defined outside this chunk; presumably stores the finished tile to C
  1065. #if (defined(LEFT) && defined(TRANSA)) || \
  1066. (!defined(LEFT) && !defined(TRANSA))
  1067. ldr r3 , K
  1068. ldr r4 , KKK
  1069. sub r3 , r3 , r4 // r3 = K - KKK = steps this tile did not consume
  1070. lsls r4 , r3 , #3 // r4 = r3 * 8 bytes (2 float values per step)
  1071. add BO , BO , r4
  1072. lsls r4 , r3 , #2 // r4 = r3 * 4 bytes (1 float value per step)
  1073. add AO , AO , r4 // AO skips the unconsumed steps
  1074. #endif
  1075. #if defined(LEFT)
  1076. ldr r3 , KK
  1077. add r3 , r3 , #1 // KK += 1 (number of values in AO)
  1078. str r3 , KK
  1079. #endif
  1080. _L2_END: // end of the 2-wide pass: step BC past the B data it used
  1081. mov r3, BC
  1082. ldr r4, K
  1083. lsl r4, r4, #3 // r4 = K * 2 * 4 bytes (K steps of 2 float values)
  1084. add r3, r3, r4 // B = B + K * 2 * 4
  1085. mov BC, r3 // BC now addresses the data that follows
  1086. #if !defined(LEFT)
  1087. ldr r3 , KK
  1088. add r3 , r3 , #2 // KK += 2 (number of values in BO)
  1089. str r3 , KK
  1090. #endif
  1091. /*********************************************************************************************/
  1092. _L1_BEGIN: // 1-wide pass over B: taken only when bit 0 of N is set
  1093. ldr J , N
  1094. tst J , #1 // is N odd?
  1095. beq _L999 // beq, not ble: tst leaves V untouched, so ble would also test a stale V flag
  1096. ldr CO1, C // CO1 = C
  1097. ldr r4 , LDC
  1098. add r3 , r4, CO1 // r3 = CO1 + LDC
  1099. str r3 , C // store C
  1100. #if defined(LEFT)
  1101. ldr r3 , OFFSET
  1102. str r3 , KK // KK restarts from OFFSET for this pass
  1103. #endif
  1104. ldr AO, A // AO = A
  1105. //pld [AO , #A_PRE-96]
  1106. //pld [AO , #A_PRE-64]
  1107. //pld [AO , #A_PRE-32]
  1108. _L1_M4_BEGIN: // 4x1 tiles: 4 values of A against 1 value of B per K step
  1109. ldr I, M
  1110. asrs I, I, #2 // I = M / 4 = number of 4x1 tiles
  1111. ble _L1_M2_BEGIN // no full group of 4: continue with the 2-wide remainder
  1112. _L1_M4_20: // head of the per-tile loop (counted by I)
  1113. INIT4x1 // macro defined outside this chunk; presumably resets the tile accumulators
  1114. #if (defined(LEFT) && defined(TRANSA)) || \
  1115. (!defined(LEFT) && !defined(TRANSA))
  1116. mov BO, BC // BO = BC, no offset applied in this variant
  1117. #else
  1118. mov BO, BC // BO = BC, then skip the first KK steps of both operands
  1119. ldr r3 , KK
  1120. lsls r4 , r3 , #2 // r4 = KK * 4 bytes (1 float value per step)
  1121. add BO , BO , r4
  1122. lsls r4 , r3 , #4 // r4 = KK * 16 bytes (4 float values per step)
  1123. add AO , AO , r4
  1124. #endif
  1125. #ifndef TRMMKERNEL
  1126. ldr K1, K // non-TRMM build: trip count is the full K
  1127. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1128. ldr K1, K
  1129. ldr r3, KK
  1130. sub K1, K1, r3 // K1 = K - KK
  1131. str K1, KKK // record the trip count in KKK
  1132. #else
  1133. ldr K1, KK
  1134. #ifdef LEFT
  1135. add K1, K1, #4 // K1 = KK + 4 (number of values in AO)
  1136. #else
  1137. add K1, K1, #1 // K1 = KK + 1 (number of values in BO)
  1138. #endif
  1139. str K1, KKK // KKK is read back after SAVE4x1 to advance AO/BO
  1140. #endif
  1141. asrs L , K1, #3 // L = K1 / 8 (passes through the 8x unrolled loop)
  1142. ble _L1_M4_40 // K1 < 8: only the tail loop can run
  1143. .align 5
  1144. _L1_M4_22: // unrolled loop: 8 KERNEL4x1_SUB invocations per pass
  1145. KERNEL4x1_SUB
  1146. KERNEL4x1_SUB
  1147. KERNEL4x1_SUB
  1148. KERNEL4x1_SUB
  1149. KERNEL4x1_SUB
  1150. KERNEL4x1_SUB
  1151. KERNEL4x1_SUB
  1152. KERNEL4x1_SUB
  1153. subs L, L, #1
  1154. bgt _L1_M4_22
  1155. _L1_M4_40: // tail: the K1 & 7 steps the unrolled loop did not cover
  1156. ands L , K1, #7 // L = K1 & 7 (K1 mod 8)
  1157. beq _L1_M4_100 // beq, not ble: ands leaves V untouched, so ble would also test a stale V flag
  1158. _L1_M4_42: // one KERNEL4x1_SUB invocation per pass
  1159. KERNEL4x1_SUB
  1160. subs L, L, #1
  1161. bgt _L1_M4_42
  1162. _L1_M4_100:
  1163. SAVE4x1 // macro defined outside this chunk; presumably stores the finished tile to C
  1164. #if (defined(LEFT) && defined(TRANSA)) || \
  1165. (!defined(LEFT) && !defined(TRANSA))
  1166. ldr r3 , K
  1167. ldr r4 , KKK
  1168. sub r3 , r3 , r4 // r3 = K - KKK = steps this tile did not consume
  1169. lsls r4 , r3 , #2 // r4 = r3 * 4 bytes (1 float value per step)
  1170. add BO , BO , r4
  1171. lsls r4 , r3 , #4 // r4 = r3 * 16 bytes (4 float values per step)
  1172. add AO , AO , r4 // AO skips the unconsumed steps
  1173. #endif
  1174. #if defined(LEFT)
  1175. ldr r3 , KK
  1176. add r3 , r3 , #4 // KK += 4 (number of values in AO)
  1177. str r3 , KK
  1178. #endif
  1179. _L1_M4_END:
  1180. subs I, I, #1
  1181. bgt _L1_M4_20 // next 4x1 tile
  1182. _L1_M2_BEGIN: // remainder of M after the 4-wide tiles: optional 2x1 tile
  1183. ldr I, M
  1184. tst I , #3 // any remainder (M & 3)?
  1185. beq _L1_END // beq, not ble: tst leaves V untouched, so ble would also test a stale V flag
  1186. tst I, #2 // is bit 1 of M set, i.e. is there one 2-wide block left?
  1187. beq _L1_M1_BEGIN // no: only the 1-wide block can remain
  1188. _L1_M2_20: // runs at most once, there is no loop back to this label
  1189. INIT2x1 // macro defined outside this chunk; presumably resets the tile accumulators
  1190. #if (defined(LEFT) && defined(TRANSA)) || \
  1191. (!defined(LEFT) && !defined(TRANSA))
  1192. mov BO, BC // BO = BC, no offset applied in this variant
  1193. #else
  1194. mov BO, BC // BO = BC, then skip the first KK steps of both operands
  1195. ldr r3 , KK
  1196. lsls r4 , r3 , #2 // r4 = KK * 4 bytes (1 float value per step)
  1197. add BO , BO , r4
  1198. lsls r4 , r3 , #3 // r4 = KK * 8 bytes (2 float values per step)
  1199. add AO , AO , r4
  1200. #endif
  1201. #ifndef TRMMKERNEL
  1202. ldr K1, K // non-TRMM build: trip count is the full K
  1203. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1204. ldr K1, K
  1205. ldr r3, KK
  1206. sub K1, K1, r3 // K1 = K - KK
  1207. str K1, KKK // record the trip count in KKK
  1208. #else
  1209. ldr K1, KK
  1210. #ifdef LEFT
  1211. add K1, K1, #2 // K1 = KK + 2 (number of values in AO)
  1212. #else
  1213. add K1, K1, #1 // K1 = KK + 1 (number of values in BO)
  1214. #endif
  1215. str K1, KKK // KKK is read back after SAVE2x1 to advance AO/BO
  1216. #endif
  1217. asrs L , K1, #3 // L = K1 / 8 (passes through the 8x unrolled loop)
  1218. ble _L1_M2_40 // K1 < 8: only the tail loop can run
  1219. _L1_M2_22: // unrolled loop: 8 KERNEL2x1_SUB invocations per pass
  1220. KERNEL2x1_SUB
  1221. KERNEL2x1_SUB
  1222. KERNEL2x1_SUB
  1223. KERNEL2x1_SUB
  1224. KERNEL2x1_SUB
  1225. KERNEL2x1_SUB
  1226. KERNEL2x1_SUB
  1227. KERNEL2x1_SUB
  1228. subs L, L, #1
  1229. bgt _L1_M2_22
  1230. _L1_M2_40: // tail: the K1 & 7 steps the unrolled loop did not cover
  1231. ands L , K1, #7 // L = K1 & 7 (K1 mod 8)
  1232. beq _L1_M2_100 // beq, not ble: ands leaves V untouched as well
  1233. _L1_M2_42: // one KERNEL2x1_SUB invocation per pass
  1234. KERNEL2x1_SUB
  1235. subs L, L, #1
  1236. bgt _L1_M2_42
  1237. _L1_M2_100:
  1238. SAVE2x1 // macro defined outside this chunk; presumably stores the finished tile to C
  1239. #if (defined(LEFT) && defined(TRANSA)) || \
  1240. (!defined(LEFT) && !defined(TRANSA))
  1241. ldr r3 , K
  1242. ldr r4 , KKK
  1243. sub r3 , r3 , r4 // r3 = K - KKK = steps this tile did not consume
  1244. lsls r4 , r3 , #2 // r4 = r3 * 4 bytes (1 float value per step)
  1245. add BO , BO , r4
  1246. lsls r4 , r3 , #3 // r4 = r3 * 8 bytes (2 float values per step)
  1247. add AO , AO , r4 // AO skips the unconsumed steps
  1248. #endif
  1249. #if defined(LEFT)
  1250. ldr r3 , KK
  1251. add r3 , r3 , #2 // KK += 2 (number of values in AO)
  1252. str r3 , KK
  1253. #endif
  1254. _L1_M2_END: // falls through to the 1-wide remainder check
  1255. _L1_M1_BEGIN: // last remainder of M: optional 1x1 tile; I still holds M from _L1_M2_BEGIN
  1256. tst I, #1 // is bit 0 of M set, i.e. is there one 1-wide block left?
  1257. beq _L1_END // beq, not ble: tst leaves V untouched, so ble would also test a stale V flag
  1258. _L1_M1_20: // runs at most once, there is no loop back to this label
  1259. INIT1x1 // macro defined outside this chunk; presumably resets the tile accumulator
  1260. #if (defined(LEFT) && defined(TRANSA)) || \
  1261. (!defined(LEFT) && !defined(TRANSA))
  1262. mov BO, BC // BO = BC, no offset applied in this variant
  1263. #else
  1264. mov BO, BC // BO = BC, then skip the first KK steps of both operands
  1265. ldr r3 , KK
  1266. lsls r4 , r3 , #2 // r4 = KK * 4 bytes (1 float value per step)
  1267. add BO , BO , r4
  1268. lsls r4 , r3 , #2 // r4 = KK * 4 bytes (1 float value per step)
  1269. add AO , AO , r4
  1270. #endif
  1271. #ifndef TRMMKERNEL
  1272. ldr K1, K // non-TRMM build: trip count is the full K
  1273. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1274. ldr K1, K
  1275. ldr r3, KK
  1276. sub K1, K1, r3 // K1 = K - KK
  1277. str K1, KKK // record the trip count in KKK
  1278. #else
  1279. ldr K1, KK
  1280. #ifdef LEFT
  1281. add K1, K1, #1 // K1 = KK + 1 (number of values in AO)
  1282. #else
  1283. add K1, K1, #1 // K1 = KK + 1 (number of values in BO)
  1284. #endif
  1285. str K1, KKK // record the trip count in KKK
  1286. #endif
  1287. asrs L , K1, #3 // L = K1 / 8 (passes through the 8x unrolled loop)
  1288. ble _L1_M1_40 // K1 < 8: only the tail loop can run
  1289. _L1_M1_22: // unrolled loop: 8 KERNEL1x1_SUB invocations per pass
  1290. KERNEL1x1_SUB
  1291. KERNEL1x1_SUB
  1292. KERNEL1x1_SUB
  1293. KERNEL1x1_SUB
  1294. KERNEL1x1_SUB
  1295. KERNEL1x1_SUB
  1296. KERNEL1x1_SUB
  1297. KERNEL1x1_SUB
  1298. subs L, L, #1
  1299. bgt _L1_M1_22
  1300. _L1_M1_40: // tail: the K1 & 7 steps the unrolled loop did not cover
  1301. ands L , K1, #7 // L = K1 & 7 (K1 mod 8)
  1302. beq _L1_M1_100 // beq, not ble: ands leaves V untouched as well
  1303. _L1_M1_42: // one KERNEL1x1_SUB invocation per pass
  1304. KERNEL1x1_SUB
  1305. subs L, L, #1
  1306. bgt _L1_M1_42
  1307. _L1_M1_100:
  1308. SAVE1x1 // last tile: no AO/BO/KK fix-up follows, the routine returns next
  1309. _L1_END: // end of the 1-wide pass; falls straight into the common exit
  1310. _L999: // common exit: restore saved registers and return 0
  1311. sub r3, fp, #128 // r3 = fp - 128, base address of the saved s8-s31 block
  1312. vldm r3, { s8 - s31} // restore floating point registers
  1313. movs r0, #0 // set return value
  1314. sub sp, fp, #24 // sp = fp - 24; presumably where the prologue pushed r4-r9/fp (prologue not in this chunk)
  1315. pop {r4 - r9, fp} // restore the integer registers and frame pointer
  1316. bx lr // return to the caller
  1317. EPILOGUE