You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

ctrmm_kernel_2x2_vfpv3.S 25 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476
  /***************************************************************************
  Copyright (c) 2013, The OpenBLAS Project
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are
  met:
  1. Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.
  2. Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in
  the documentation and/or other materials provided with the
  distribution.
  3. Neither the name of the OpenBLAS project nor the names of
  its contributors may be used to endorse or promote products
  derived from this software without specific prior written permission.
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *****************************************************************************/

  /**************************************************************************************
  * 2013/10/16 Saar
  *       BLASTEST        : OK
  *       CTEST           : OK
  *       TEST            : OK
  *
  **************************************************************************************/
  #define ASSEMBLER
  #include "common.h"

  /* Bytes of scratch stack the prologue reserves below the pushed core registers. */
  #define STACKSIZE       256

  /* Incoming arguments, by the register they arrive in (spilled by the prologue). */
  #define OLD_M           r0
  #define OLD_N           r1
  #define OLD_K           r2
  #define OLD_A           r3
  #define OLD_ALPHA_R     s0
  #define OLD_ALPHA_I     s1

  /******************************************************
  * The prologue stores s8 - s31 (24 single registers,
  * 96 bytes) upwards from [fp, #-128], so
  * [fp, #-128] .. [fp, #-33] is reserved for saving and
  * restoring floating point registers.
  * The slots below hold this kernel's own locals.
  *******************************************************/
  /* KKK: inner-loop length (K1) of the current tile, kept for the pointer fix-up. */
  #define KKK             [fp, #-240]
  /* KK: running offset, seeded from OFFSET. */
  #define KK              [fp, #-244]
  #define A               [fp, #-248]
  /* LDC: OLD_LDC scaled to bytes by the prologue. */
  #define LDC             [fp, #-252]
  #define M               [fp, #-256]
  #define N               [fp, #-260]
  #define K               [fp, #-264]
  #define ALPHA_I         [fp, #-272]
  #define ALPHA_R         [fp, #-280]

  /* Arguments read from (or, for C, updated in) the caller's stack area above fp. */
  #define B               [fp, #4]
  #define C               [fp, #8]
  #define OLD_LDC         [fp, #12]
  #define OFFSET          [fp, #16]

  /* Core-register roles inside the kernel. */
  #define I               r0
  #define J               r1
  #define L               r2
  #define AO              r5
  #define BO              r6
  #define CO1             r8
  #define CO2             r9
  #define K1              r7
  #define BC              r12

  /* Byte offsets used by the pld prefetch hints. */
  #define A_PRE           96
  #define B_PRE           96
  #define C_PRE           64
  /*
   * Pick the add/subtract and multiply(-accumulate) instructions that the
   * SAVE macros use, according to the build variant (NN, CN, NC, ... ; the
   * final #else covers every remaining variant).
   *
   * Written with UAL mnemonics; these assemble to the same opcodes as the
   * older spellings: fadds/fsubs = vadd.f32/vsub.f32, fmuls/fnmuls =
   * vmul.f32/vnmul.f32, fmacs/fnmacs = vmla.f32/vmls.f32.
   */
  #if defined(NN) || defined(NT) || defined(TN) || defined(TT)

  #define FADD_R          vsub.f32
  #define FADD_I          vadd.f32

  #define FMAC_R1         vnmul.f32
  #define FMAC_R2         vmls.f32
  #define FMAC_I1         vmul.f32
  #define FMAC_I2         vmls.f32

  #elif defined(CN) || defined(CT)

  #define FADD_R          vadd.f32
  #define FADD_I          vsub.f32

  #define FMAC_R1         vmul.f32
  #define FMAC_R2         vmla.f32
  #define FMAC_I1         vnmul.f32
  #define FMAC_I2         vmla.f32

  #elif defined(NC) || defined(TC)

  #define FADD_R          vadd.f32
  #define FADD_I          vsub.f32

  #define FMAC_R1         vmul.f32
  #define FMAC_R2         vmls.f32
  #define FMAC_I1         vmul.f32
  #define FMAC_I2         vmla.f32

  #else

  #define FADD_R          vsub.f32
  #define FADD_I          vadd.f32

  #define FMAC_R1         vnmul.f32
  #define FMAC_R2         vmla.f32
  #define FMAC_I1         vnmul.f32
  #define FMAC_I2         vmls.f32

  #endif
  /**************************************************************************************
  * Macro definitions
  **************************************************************************************/

  /*
   * INIT2x2 - set the sixteen accumulators s16 - s31 to +0.0.
   *
   * The zero is built from an integer register instead of
   * "vsub.f32 s16, s16, s16": x - x is NaN (not 0) whenever x is NaN or
   * +/-Inf, so a stale accumulator could poison the whole next tile.
   *
   * Clobbers: r3 (a scratch register at every place this file expands the
   * macro). Condition flags are left untouched.
   */
  .macro INIT2x2
          mov             r3 , #0
          vmov            s16, r3
          vmov.f32        s17, s16
          vmov.f32        s18, s16
          vmov.f32        s19, s16
          vmov.f32        s20, s16
          vmov.f32        s21, s16
          vmov.f32        s22, s16
          vmov.f32        s23, s16
          vmov.f32        s24, s16
          vmov.f32        s25, s16
          vmov.f32        s26, s16
          vmov.f32        s27, s16
          vmov.f32        s28, s16
          vmov.f32        s29, s16
          vmov.f32        s30, s16
          vmov.f32        s31, s16
  .endm
  /*
   * KERNEL2x2_I - first step of the software-pipelined 2x2 loop.
   * Loads s0-s3 from AO and s8-s11 from BO, *initialises* s16-s31 with their
   * products (vmul, so no INIT is needed beforehand) and preloads the next
   * operands into s4-s7 / s12-s15. AO and BO each advance by 32 bytes.
   */
  .macro KERNEL2x2_I
          pld             [AO, #A_PRE]
          pld             [BO, #B_PRE]
          vldmia          AO!, {s0-s1}
          vldmia          BO!, {s8-s9}

          vmul.f32        s16, s0, s8
          vmul.f32        s24, s1, s9
          vldmia          AO!, {s2-s3}
          vmul.f32        s17, s0, s9
          vmul.f32        s25, s1, s8

          vldmia          BO!, {s10-s11}
          vmul.f32        s18, s2, s8
          vmul.f32        s26, s3, s9
          vldmia          AO!, {s4-s5}
          vmul.f32        s19, s2, s9
          vmul.f32        s27, s3, s8

          vldmia          BO!, {s12-s13}
          vmul.f32        s20, s0, s10
          vmul.f32        s28, s1, s11
          vldmia          AO!, {s6-s7}
          vmul.f32        s21, s0, s11
          vmul.f32        s29, s1, s10

          vldmia          BO!, {s14-s15}
          vmul.f32        s22, s2, s10
          vmul.f32        s30, s3, s11
          vmul.f32        s23, s2, s11
          vmul.f32        s31, s3, s10
  .endm
  /*
   * KERNEL2x2_M1 - pipelined step on the "even" operand set.
   * Accumulates the products of s0-s3 (A) and s8-s11 (B) into s16-s31 while
   * loading the next operands into s4-s7 / s12-s15 (16 bytes from each of
   * AO and BO).
   */
  .macro KERNEL2x2_M1
          vmla.f32        s16, s0, s8
          vldmia          AO!, {s4-s5}
          vmla.f32        s24, s1, s9
          vmla.f32        s17, s0, s9
          vldmia          BO!, {s12-s13}
          vmla.f32        s25, s1, s8

          vmla.f32        s18, s2, s8
          vldmia          AO!, {s6-s7}
          vmla.f32        s26, s3, s9
          vmla.f32        s19, s2, s9
          vldmia          BO!, {s14-s15}
          vmla.f32        s27, s3, s8

          vmla.f32        s20, s0, s10
          vmla.f32        s28, s1, s11
          vmla.f32        s21, s0, s11
          vmla.f32        s29, s1, s10

          vmla.f32        s22, s2, s10
          vmla.f32        s30, s3, s11
          vmla.f32        s23, s2, s11
          vmla.f32        s31, s3, s10
  .endm
  /*
   * KERNEL2x2_M2 - pipelined step on the "odd" operand set.
   * Accumulates the products of s4-s7 (A) and s12-s15 (B) into s16-s31,
   * issues the prefetch hints and reloads s0-s3 / s8-s11 for the following
   * M1 step (16 bytes from each of AO and BO).
   */
  .macro KERNEL2x2_M2
          pld             [AO, #A_PRE]
          vmla.f32        s16, s4, s12
          pld             [BO, #B_PRE]
          vmla.f32        s24, s5, s13
          vmla.f32        s17, s4, s13
          vldmia          AO!, {s0-s1}
          vmla.f32        s25, s5, s12

          vmla.f32        s18, s6, s12
          vmla.f32        s26, s7, s13
          vldmia          BO!, {s8-s9}
          vmla.f32        s19, s6, s13
          vmla.f32        s27, s7, s12

          vldmia          AO!, {s2-s3}
          vmla.f32        s20, s4, s14
          vmla.f32        s28, s5, s15
          vldmia          BO!, {s10-s11}
          vmla.f32        s21, s4, s15
          vmla.f32        s29, s5, s14

          vmla.f32        s22, s6, s14
          vmla.f32        s30, s7, s15
          vmla.f32        s23, s6, s15
          vmla.f32        s31, s7, s14
  .endm
  /*
   * KERNEL2x2_E - last step of the pipelined 2x2 loop.
   * Same arithmetic as KERNEL2x2_M2 (operands s4-s7 / s12-s15) but with no
   * loads and no prefetch, so AO and BO are not advanced any further.
   */
  .macro KERNEL2x2_E
          vmla.f32        s16, s4, s12
          vmla.f32        s24, s5, s13
          vmla.f32        s17, s4, s13
          vmla.f32        s25, s5, s12

          vmla.f32        s18, s6, s12
          vmla.f32        s26, s7, s13
          vmla.f32        s19, s6, s13
          vmla.f32        s27, s7, s12

          vmla.f32        s20, s4, s14
          vmla.f32        s28, s5, s15
          vmla.f32        s21, s4, s15
          vmla.f32        s29, s5, s14

          vmla.f32        s22, s6, s14
          vmla.f32        s30, s7, s15
          vmla.f32        s23, s6, s15
          vmla.f32        s31, s7, s14
  .endm
  /*
   * KERNEL2x2_SUB - one stand-alone (non-pipelined) 2x2 step.
   * Loads s0-s3 from AO and s8-s11 from BO (16 bytes each, post-incremented)
   * and accumulates their products into s16-s31. The accumulators must
   * already hold valid values.
   */
  .macro KERNEL2x2_SUB
          vldmia          AO!, {s0-s1}
          vldmia          BO!, {s8-s9}

          vmla.f32        s16, s0, s8
          vmla.f32        s24, s1, s9
          vldmia          AO!, {s2-s3}
          vmla.f32        s17, s0, s9
          vmla.f32        s25, s1, s8

          vldmia          BO!, {s10-s11}
          vmla.f32        s18, s2, s8
          vmla.f32        s26, s3, s9
          vmla.f32        s19, s2, s9
          vmla.f32        s27, s3, s8

          vmla.f32        s20, s0, s10
          vmla.f32        s28, s1, s11
          vmla.f32        s21, s0, s11
          vmla.f32        s29, s1, s10

          vmla.f32        s22, s2, s10
          vmla.f32        s30, s3, s11
          vmla.f32        s23, s2, s11
          vmla.f32        s31, s3, s10
  .endm
  /*
   * SAVE2x2 - combine the partial sums, scale by alpha and store the tile.
   *
   * FADD_R/FADD_I fold s24-s31 into s16-s23; FMAC_* then form
   * s4-s11 from (ALPHA_R, ALPHA_I) and s16-s23. Four floats are written at
   * CO1 and four at CO2 = CO1 + LDC; the previous contents of C are not read.
   * CO1 is advanced by 16 bytes. Clobbers r3, CO2, s0, s1, s4-s11, s16-s23.
   */
  .macro SAVE2x2
          ldr             r3 , LDC
          add             CO2, CO1, r3            /* second column of the tile */

          vldr            s0 , ALPHA_R
          vldr            s1 , ALPHA_I

          FADD_R          s16, s24, s16
          FADD_I          s17, s25, s17
          FADD_R          s18, s26, s18
          FADD_I          s19, s27, s19
          FADD_R          s20, s28, s20
          FADD_I          s21, s29, s21
          FADD_R          s22, s30, s22
          FADD_I          s23, s31, s23

          FMAC_R1         s4 , s0 , s16
          FMAC_I1         s5 , s0 , s17
          FMAC_R2         s4 , s1 , s17
          FMAC_I2         s5 , s1 , s16

          FMAC_R1         s6 , s0 , s18
          FMAC_I1         s7 , s0 , s19
          FMAC_R2         s6 , s1 , s19
          FMAC_I2         s7 , s1 , s18

          FMAC_R1         s8 , s0 , s20
          FMAC_I1         s9 , s0 , s21
          FMAC_R2         s8 , s1 , s21
          FMAC_I2         s9 , s1 , s20

          FMAC_R1         s10, s0 , s22
          FMAC_I1         s11, s0 , s23
          FMAC_R2         s10, s1 , s23
          FMAC_I2         s11, s1 , s22

          vstmia          CO1, {s4-s7}
          vstmia          CO2, {s8-s11}

          add             CO1, CO1, #16
  .endm
  /******************************************************************************/

  /*
   * INIT1x2 - set the eight accumulators used by the 1x2 tile
   * (s16, s17, s20, s21, s24, s25, s28, s29) to +0.0.
   *
   * The zero comes from an integer register rather than
   * "vsub.f32 s16, s16, s16", because x - x is NaN when x is NaN or +/-Inf
   * and s16 still holds whatever the previous tile left behind.
   *
   * Clobbers: r3 (scratch where this file expands the macro). Flags untouched.
   */
  .macro INIT1x2
          mov             r3 , #0
          vmov            s16, r3
          vmov.f32        s17, s16
          vmov.f32        s20, s16
          vmov.f32        s21, s16
          vmov.f32        s24, s16
          vmov.f32        s25, s16
          vmov.f32        s28, s16
          vmov.f32        s29, s16
  .endm
  /*
   * KERNEL1x2_I - first step of a pipelined 1x2 loop.
   * Initialises the 1x2 accumulators with the products of s0-s1 (8 bytes at
   * AO) and s8-s11 (16 bytes at BO), then preloads the next operands into
   * s4-s5 / s12-s15. Overall AO advances 16 bytes and BO 32 bytes.
   */
  .macro KERNEL1x2_I
          pld             [AO, #A_PRE]
          pld             [BO, #B_PRE]
          vldr            s0 , [AO]
          vldr            s1 , [AO, #4]
          vldr            s8 , [BO]
          vldr            s9 , [BO, #4]
          vldr            s10, [BO, #8]
          vldr            s11, [BO, #12]

          vmul.f32        s16, s0, s8
          vmul.f32        s24, s1, s9
          vmul.f32        s17, s0, s9
          vmul.f32        s25, s1, s8

          vmul.f32        s20, s0, s10
          vmul.f32        s28, s1, s11
          vmul.f32        s21, s0, s11
          vmul.f32        s29, s1, s10

          add             BO , BO, #16
          add             AO , AO, #8

          pld             [BO, #B_PRE]
          vldr            s4 , [AO, #0]
          vldr            s5 , [AO, #4]
          vldr            s12, [BO]
          vldr            s13, [BO, #4]
          vldr            s14, [BO, #8]
          vldr            s15, [BO, #12]

          add             BO , BO, #16
          add             AO , AO, #8
  .endm
  /*
   * KERNEL1x2_M1 - pipelined 1x2 step on operands s0-s1 / s8-s11.
   * Accumulates into the 1x2 accumulators, then loads the next operands into
   * s4-s5 / s12-s15 and advances AO by 8 and BO by 16 bytes.
   */
  .macro KERNEL1x2_M1
          pld             [BO, #B_PRE]

          vmla.f32        s16, s0, s8
          vmla.f32        s24, s1, s9
          vmla.f32        s17, s0, s9
          vmla.f32        s25, s1, s8

          vmla.f32        s20, s0, s10
          vmla.f32        s28, s1, s11
          vmla.f32        s21, s0, s11
          vmla.f32        s29, s1, s10

          vldr            s4 , [AO, #0]
          vldr            s5 , [AO, #4]
          vldr            s12, [BO]
          vldr            s13, [BO, #4]
          vldr            s14, [BO, #8]
          vldr            s15, [BO, #12]

          add             BO , BO, #16
          add             AO , AO, #8
  .endm
  /*
   * KERNEL1x2_M2 - pipelined 1x2 step on operands s4-s5 / s12-s15.
   * Accumulates into the 1x2 accumulators, then reloads s0-s1 / s8-s11 and
   * advances AO by 8 and BO by 16 bytes.
   */
  .macro KERNEL1x2_M2
          pld             [AO, #A_PRE]
          pld             [BO, #B_PRE]

          vmla.f32        s16, s4, s12
          vmla.f32        s24, s5, s13
          vmla.f32        s17, s4, s13
          vmla.f32        s25, s5, s12

          vmla.f32        s20, s4, s14
          vmla.f32        s28, s5, s15
          vmla.f32        s21, s4, s15
          vmla.f32        s29, s5, s14

          vldr            s0 , [AO, #0]
          vldr            s1 , [AO, #4]
          vldr            s8 , [BO]
          vldr            s9 , [BO, #4]
          vldr            s10, [BO, #8]
          vldr            s11, [BO, #12]

          add             BO , BO, #16
          add             AO , AO, #8
  .endm
  /*
   * KERNEL1x2_E - closing step of a pipelined 1x2 loop: accumulate the
   * products of s4-s5 and s12-s15 without loading anything further.
   */
  .macro KERNEL1x2_E
          vmla.f32        s16, s4, s12
          vmla.f32        s24, s5, s13
          vmla.f32        s17, s4, s13
          vmla.f32        s25, s5, s12

          vmla.f32        s20, s4, s14
          vmla.f32        s28, s5, s15
          vmla.f32        s21, s4, s15
          vmla.f32        s29, s5, s14
  .endm
  /*
   * KERNEL1x2_SUB - one stand-alone 1x2 step.
   * Loads s0-s1 from AO and s8-s11 from BO, accumulates their products into
   * s16/s17/s20/s21/s24/s25/s28/s29, then advances AO by 8 and BO by 16 bytes.
   */
  .macro KERNEL1x2_SUB
          pld             [AO, #A_PRE]
          pld             [BO, #B_PRE]
          vldr            s0 , [AO]
          vldr            s1 , [AO, #4]
          vldr            s8 , [BO]
          vldr            s9 , [BO, #4]
          vldr            s10, [BO, #8]
          vldr            s11, [BO, #12]

          vmla.f32        s16, s0, s8
          vmla.f32        s24, s1, s9
          vmla.f32        s17, s0, s9
          vmla.f32        s25, s1, s8

          vmla.f32        s20, s0, s10
          vmla.f32        s28, s1, s11
          vmla.f32        s21, s0, s11
          vmla.f32        s29, s1, s10

          add             BO , BO, #16
          add             AO , AO, #8
  .endm
  /*
   * SAVE1x2 - combine the partial sums, scale by alpha and store a 1x2 tile.
   *
   * Two floats are written at CO1 and two at CO2 = CO1 + LDC; C is not read.
   * CO1 is advanced by 8 bytes. Clobbers r3, CO2, s0, s1, s4, s5, s8, s9,
   * s16, s17, s20, s21.
   */
  .macro SAVE1x2
          ldr             r3 , LDC
          add             CO2, CO1, r3            /* second column of the tile */

          vldr            s0 , ALPHA_R
          vldr            s1 , ALPHA_I

          FADD_R          s16, s24, s16
          FADD_I          s17, s25, s17
          FADD_R          s20, s28, s20
          FADD_I          s21, s29, s21

          FMAC_R1         s4 , s0 , s16
          FMAC_I1         s5 , s0 , s17
          FMAC_R2         s4 , s1 , s17
          FMAC_I2         s5 , s1 , s16

          FMAC_R1         s8 , s0 , s20
          FMAC_I1         s9 , s0 , s21
          FMAC_R2         s8 , s1 , s21
          FMAC_I2         s9 , s1 , s20

          vstmia          CO1, {s4-s5}
          vstmia          CO2, {s8-s9}

          add             CO1, CO1, #8
  .endm
  /******************************************************************************/

  /*
   * INIT2x1 - set the eight accumulators used by the 2x1 tile
   * (s16 - s19 and s24 - s27) to +0.0.
   *
   * The zero comes from an integer register rather than
   * "vsub.f32 s16, s16, s16", because x - x is NaN when x is NaN or +/-Inf
   * and s16 still holds whatever the previous tile left behind.
   *
   * Clobbers: r3 (scratch where this file expands the macro). Flags untouched.
   */
  .macro INIT2x1
          mov             r3 , #0
          vmov            s16, r3
          vmov.f32        s17, s16
          vmov.f32        s18, s16
          vmov.f32        s19, s16
          vmov.f32        s24, s16
          vmov.f32        s25, s16
          vmov.f32        s26, s16
          vmov.f32        s27, s16
  .endm
  /*
   * KERNEL2x1_I - first step of the pipelined 2x1 loop.
   * Initialises s16-s19 / s24-s27 with the products of s0-s3 (16 bytes at AO)
   * and s8-s9 (8 bytes at BO), then preloads the next operands into
   * s4-s7 / s12-s13. Overall AO advances 32 bytes and BO 16 bytes.
   */
  .macro KERNEL2x1_I
          pld             [AO, #A_PRE]
          pld             [BO, #B_PRE]
          vldr            s0 , [AO]
          vldr            s1 , [AO, #4]
          vldr            s2 , [AO, #8]
          vldr            s3 , [AO, #12]
          vldr            s8 , [BO]
          vldr            s9 , [BO, #4]

          vmul.f32        s16, s0, s8
          vmul.f32        s24, s1, s9
          vmul.f32        s17, s0, s9
          vmul.f32        s25, s1, s8

          vmul.f32        s18, s2, s8
          vmul.f32        s26, s3, s9
          vmul.f32        s19, s2, s9
          vmul.f32        s27, s3, s8

          add             BO , BO, #8
          add             AO , AO, #16

          pld             [BO, #B_PRE]
          pld             [AO, #A_PRE]

          vldr            s4 , [AO, #0]
          vldr            s5 , [AO, #4]
          vldr            s6 , [AO, #8]
          vldr            s7 , [AO, #12]
          vldr            s12, [BO]
          vldr            s13, [BO, #4]

          add             BO , BO, #8
          add             AO , AO, #16
  .endm
  /*
   * KERNEL2x1_M1 - pipelined 2x1 step on operands s0-s3 / s8-s9.
   * Accumulates into s16-s19 / s24-s27, then loads the next operands into
   * s4-s7 / s12-s13 and advances AO by 16 and BO by 8 bytes.
   */
  .macro KERNEL2x1_M1
          pld             [AO, #A_PRE]
          pld             [BO, #B_PRE]

          vmla.f32        s16, s0, s8
          vmla.f32        s24, s1, s9
          vmla.f32        s17, s0, s9
          vmla.f32        s25, s1, s8

          vmla.f32        s18, s2, s8
          vmla.f32        s26, s3, s9
          vmla.f32        s19, s2, s9
          vmla.f32        s27, s3, s8

          vldr            s4 , [AO, #0]
          vldr            s5 , [AO, #4]
          vldr            s6 , [AO, #8]
          vldr            s7 , [AO, #12]
          vldr            s12, [BO]
          vldr            s13, [BO, #4]

          add             BO , BO, #8
          add             AO , AO, #16
  .endm
  /*
   * KERNEL2x1_M2 - pipelined 2x1 step on operands s4-s7 / s12-s13.
   * Accumulates into s16-s19 / s24-s27, then reloads s0-s3 / s8-s9 and
   * advances AO by 16 and BO by 8 bytes.
   */
  .macro KERNEL2x1_M2
          pld             [AO, #A_PRE]
          pld             [BO, #B_PRE]

          vmla.f32        s16, s4, s12
          vmla.f32        s24, s5, s13
          vmla.f32        s17, s4, s13
          vmla.f32        s25, s5, s12

          vmla.f32        s18, s6, s12
          vmla.f32        s26, s7, s13
          vmla.f32        s19, s6, s13
          vmla.f32        s27, s7, s12

          vldr            s0 , [AO, #0]
          vldr            s1 , [AO, #4]
          vldr            s2 , [AO, #8]
          vldr            s3 , [AO, #12]
          vldr            s8 , [BO]
          vldr            s9 , [BO, #4]

          add             BO , BO, #8
          add             AO , AO, #16
  .endm
  /*
   * KERNEL2x1_E - closing step of the pipelined 2x1 loop: accumulate the
   * products of s4-s7 and s12-s13 without loading anything further.
   */
  .macro KERNEL2x1_E
          vmla.f32        s16, s4, s12
          vmla.f32        s24, s5, s13
          vmla.f32        s17, s4, s13
          vmla.f32        s25, s5, s12

          vmla.f32        s18, s6, s12
          vmla.f32        s26, s7, s13
          vmla.f32        s19, s6, s13
          vmla.f32        s27, s7, s12
  .endm
  /*
   * KERNEL2x1_SUB - one stand-alone 2x1 step.
   * Loads s0-s3 from AO and s8-s9 from BO, accumulates their products into
   * s16-s19 / s24-s27, then advances AO by 16 and BO by 8 bytes.
   */
  .macro KERNEL2x1_SUB
          pld             [AO, #A_PRE]
          pld             [BO, #B_PRE]
          vldr            s0 , [AO]
          vldr            s1 , [AO, #4]
          vldr            s2 , [AO, #8]
          vldr            s3 , [AO, #12]
          vldr            s8 , [BO]
          vldr            s9 , [BO, #4]

          vmla.f32        s16, s0, s8
          vmla.f32        s24, s1, s9
          vmla.f32        s17, s0, s9
          vmla.f32        s25, s1, s8

          vmla.f32        s18, s2, s8
          vmla.f32        s26, s3, s9
          vmla.f32        s19, s2, s9
          vmla.f32        s27, s3, s8

          add             BO , BO, #8
          add             AO , AO, #16
  .endm
  /*
   * SAVE2x1 - combine the partial sums, scale by alpha and store a 2x1 tile.
   *
   * Four floats (s4-s7) are written at CO1; C is not read. CO1 is advanced
   * by 16 bytes. Clobbers s0, s1, s4-s7, s16-s19.
   */
  .macro SAVE2x1
          vldr            s0 , ALPHA_R
          vldr            s1 , ALPHA_I

          FADD_R          s16, s24, s16
          FADD_I          s17, s25, s17
          FADD_R          s18, s26, s18
          FADD_I          s19, s27, s19

          FMAC_R1         s4 , s0 , s16
          FMAC_I1         s5 , s0 , s17
          FMAC_R2         s4 , s1 , s17
          FMAC_I2         s5 , s1 , s16

          FMAC_R1         s6 , s0 , s18
          FMAC_I1         s7 , s0 , s19
          FMAC_R2         s6 , s1 , s19
          FMAC_I2         s7 , s1 , s18

          vstmia          CO1, {s4-s7}

          add             CO1, CO1, #16
  .endm
  /******************************************************************************/

  /*
   * INIT1x1 - set the four accumulators used by the 1x1 tile
   * (s16, s17, s24, s25) to +0.0.
   *
   * The zero comes from an integer register rather than
   * "vsub.f32 s16, s16, s16", because x - x is NaN when x is NaN or +/-Inf
   * and s16 still holds whatever the previous tile left behind.
   *
   * Clobbers: r3 (scratch where this file expands the macro). Flags untouched.
   */
  .macro INIT1x1
          mov             r3 , #0
          vmov            s16, r3
          vmov.f32        s17, s16
          vmov.f32        s24, s16
          vmov.f32        s25, s16
  .endm
  /*
   * KERNEL1x1_I - first step of a pipelined 1x1 loop.
   * Initialises s16/s17/s24/s25 with the products of s0-s1 (AO) and s8-s9
   * (BO), then preloads the next operands into s4-s5 / s12-s13.
   * Overall AO and BO each advance 16 bytes.
   */
  .macro KERNEL1x1_I
          pld             [AO, #A_PRE]
          pld             [BO, #B_PRE]
          vldr            s0 , [AO]
          vldr            s1 , [AO, #4]
          vldr            s8 , [BO]
          vldr            s9 , [BO, #4]

          vmul.f32        s16, s0, s8
          vmul.f32        s24, s1, s9
          vmul.f32        s17, s0, s9
          vmul.f32        s25, s1, s8

          add             BO , BO, #8
          add             AO , AO, #8

          pld             [BO, #B_PRE]
          pld             [AO, #A_PRE]

          vldr            s4 , [AO, #0]
          vldr            s5 , [AO, #4]
          vldr            s12, [BO]
          vldr            s13, [BO, #4]

          add             BO , BO, #8
          add             AO , AO, #8
  .endm
  /*
   * KERNEL1x1_M1 - pipelined 1x1 step on operands s0-s1 / s8-s9; loads the
   * next operands into s4-s5 / s12-s13 and advances AO and BO by 8 bytes.
   */
  .macro KERNEL1x1_M1
          vmla.f32        s16, s0, s8
          vmla.f32        s24, s1, s9
          vmla.f32        s17, s0, s9
          vmla.f32        s25, s1, s8

          vldr            s4 , [AO, #0]
          vldr            s5 , [AO, #4]
          vldr            s12, [BO]
          vldr            s13, [BO, #4]

          add             BO , BO, #8
          add             AO , AO, #8
  .endm
  /*
   * KERNEL1x1_M2 - pipelined 1x1 step on operands s4-s5 / s12-s13; reloads
   * s0-s1 / s8-s9 and advances AO and BO by 8 bytes.
   */
  .macro KERNEL1x1_M2
          vmla.f32        s16, s4, s12
          vmla.f32        s24, s5, s13
          vmla.f32        s17, s4, s13
          vmla.f32        s25, s5, s12

          vldr            s0 , [AO, #0]
          vldr            s1 , [AO, #4]
          vldr            s8 , [BO]
          vldr            s9 , [BO, #4]

          add             BO , BO, #8
          add             AO , AO, #8
  .endm
  /*
   * KERNEL1x1_E - closing step of a pipelined 1x1 loop: accumulate the
   * products of s4-s5 and s12-s13 without loading anything further.
   */
  .macro KERNEL1x1_E
          vmla.f32        s16, s4, s12
          vmla.f32        s24, s5, s13
          vmla.f32        s17, s4, s13
          vmla.f32        s25, s5, s12
  .endm
  /*
   * KERNEL1x1_SUB - one stand-alone 1x1 step.
   * Loads s0-s1 from AO and s8-s9 from BO, accumulates their four products
   * into s16/s24/s17/s25, then advances AO and BO by 8 bytes each.
   */
  .macro KERNEL1x1_SUB
          vldr            s0 , [AO]
          vldr            s1 , [AO, #4]
          vldr            s8 , [BO]
          vldr            s9 , [BO, #4]

          vmla.f32        s16, s0, s8
          vmla.f32        s24, s1, s9
          vmla.f32        s17, s0, s9
          vmla.f32        s25, s1, s8

          add             BO , BO, #8
          add             AO , AO, #8
  .endm
  /*
   * SAVE1x1 - combine the partial sums, scale by alpha and store a 1x1 tile.
   *
   * Two floats (s4, s5) are written at CO1; C is not read. CO1 is advanced
   * by 8 bytes. Clobbers s0, s1, s4, s5, s16, s17.
   */
  .macro SAVE1x1
          vldr            s0 , ALPHA_R
          vldr            s1 , ALPHA_I

          FADD_R          s16, s24, s16
          FADD_I          s17, s25, s17

          FMAC_R1         s4 , s0 , s16
          FMAC_I1         s5 , s0 , s17
          FMAC_R2         s4 , s1 , s17
          FMAC_I2         s5 , s1 , s16

          vstmia          CO1, {s4-s5}

          add             CO1, CO1, #8
  .endm
  /******************************************************************************/

  /**************************************************************************************
  * End of macro definitions
  **************************************************************************************/

  /*
   * Kernel entry.
   *
   * In:  r0 = M, r1 = N, r2 = K, r3 = A, s0/s1 = alpha (real/imag);
   *      B, C, ldc and offset are read from the caller's stack ([fp,#4]..[fp,#16]).
   * Out: r0 = 0.
   *
   * The prologue builds the frame, spills the arguments to their stack slots,
   * saves s8 - s31, converts ldc to bytes and seeds KK from OFFSET.
   */
          PROLOGUE

          .align 5

          push    {r4 - r9, fp}
          add     fp, sp, #24                     // fp -> saved fp slot
          sub     sp, sp, #STACKSIZE              // reserve stack

          str     OLD_M, M
          str     OLD_N, N
          str     OLD_K, K
          str     OLD_A, A                        // r3 is free for scratch use from here on
          vstr    OLD_ALPHA_R, ALPHA_R
          vstr    OLD_ALPHA_I, ALPHA_I

          sub     r3, fp, #128
          vstm    r3, { s8 - s31 }                // store floating point registers

          ldr     r3, OLD_LDC
          lsl     r3, r3, #3                      // ldc = ldc * 4 * 2
          str     r3, LDC

          ldr     r3, OFFSET
  #ifndef LEFT
          neg     r3 , r3
  #endif
          str     r3 , KK

          ldr     BC, B

          ldr     J, N
          asr     J, J, #1                        // J = J / 2
          // Explicit compare: a flag-setting shift leaves V unchanged, and the
          // flags are undefined on entry, so "asrs + ble" could skip the whole
          // kernel depending on what the caller left in V.
          cmp     J, #0
          ble     _L1_BEGIN
  /*
   * Outer loop over pairs of columns (J = N / 2 iterations); this part
   * handles the 2x2 tiles of the current column pair.
   *
   * Conditional branches after tst / ands use "beq", and shift results are
   * tested with an explicit cmp: those instructions do not write the V flag,
   * so the former "ble" depended on whatever V happened to hold.
   */
  _L2_BEGIN:

          ldr     CO1, C                          // CO1 = C
          ldr     r4 , LDC
          lsl     r4 , r4 , #1                    // LDC * 2
          add     r3 , r4, CO1
          str     r3 , C                          // store C (start of the next column pair)

  #if defined(LEFT)
          ldr     r3 , OFFSET
          str     r3 , KK
  #endif

          ldr     AO, A                           // AO = A
          pld     [AO , #A_PRE-64]
          pld     [AO , #A_PRE-32]

  _L2_M2_BEGIN:

          ldr     I, M
          asr     I, I, #1                        // I = I / 2
          cmp     I, #0
          ble     _L2_M1_BEGIN

  _L2_M2_20:

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))

          mov     BO, BC
  #else
          mov     BO, BC
          ldr     r3 , KK
          lsls    r4 , r3 , #4                    // 2 * 4 * 2 float values
          add     BO , BO , r4
          lsls    r4 , r3 , #4                    // 2 * 4 * 2 float values
          add     AO , AO , r4
  #endif

  #ifndef TRMMKERNEL
          ldr     K1, K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
          ldr     K1, K
          ldr     r3, KK
          sub     K1, K1, r3
          str     K1, KKK
  #else
          ldr     K1, KK
  #ifdef LEFT
          add     K1, K1, #2                      // number of values in AO
  #else
          add     K1, K1, #2                      // number of values in BO
  #endif
          str     K1, KKK
  #endif

          asr     L , K1, #3                      // L = K1 / 8
          cmp     L , #3
          blt     _L2_M2_30

          // L >= 3: pipelined main path, 8 steps per unrolled group.
          .align 5

          KERNEL2x2_I
          KERNEL2x2_M2
          .rept 3
          KERNEL2x2_M1
          KERNEL2x2_M2
          .endr

          sub     L, L, #2                        // first and last group are outside the loop

  _L2_M2_22:

          .rept 4
          KERNEL2x2_M1
          KERNEL2x2_M2
          .endr

          subs    L, L, #1
          bgt     _L2_M2_22

          .rept 3
          KERNEL2x2_M1
          KERNEL2x2_M2
          .endr
          KERNEL2x2_M1
          KERNEL2x2_E

          b       _L2_M2_44

  _L2_M2_30:
          // L < 3 here.
          tst     L, #3
          beq     _L2_M2_40                       // no full group of 8

          tst     L, #2
          beq     _L2_M2_32

          // two groups: 16 steps
          KERNEL2x2_I
          KERNEL2x2_M2
          .rept 6
          KERNEL2x2_M1
          KERNEL2x2_M2
          .endr
          KERNEL2x2_M1
          KERNEL2x2_E

          b       _L2_M2_44

  _L2_M2_32:

          tst     L, #1
          beq     _L2_M2_40

          // one group: 8 steps
          KERNEL2x2_I
          KERNEL2x2_M2
          .rept 2
          KERNEL2x2_M1
          KERNEL2x2_M2
          .endr
          KERNEL2x2_M1
          KERNEL2x2_E

          b       _L2_M2_44

  _L2_M2_40:

          INIT2x2                                 // nothing accumulated yet: start from zero

  _L2_M2_44:

          ands    L , K1, #7                      // L = K1 % 8
          beq     _L2_M2_100

  _L2_M2_46:

          KERNEL2x2_SUB

          subs    L, L, #1
          bne     _L2_M2_46

  _L2_M2_100:

          SAVE2x2

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
          ldr     r3 , K
          ldr     r4 , KKK
          sub     r3 , r3 , r4
          lsls    r4 , r3 , #4                    // 2 * 4 * 2 float values
          add     BO , BO , r4
          lsls    r4 , r3 , #4                    // 2 * 4 * 2 float values
          add     AO , AO , r4
  #endif

  #if defined(LEFT)
          ldr     r3 , KK
          add     r3 , r3 , #2                    // number of values in AO
          str     r3 , KK
  #endif

  _L2_M2_END:

          subs    I, I, #1
          bne     _L2_M2_20
  /*
   * Remaining odd row of the current column pair (1x2 tile), followed by the
   * end-of-iteration bookkeeping of the outer J loop.
   *
   * tst / ands are followed by "beq" and the shift result is tested with an
   * explicit cmp, because none of those instructions writes the V flag that
   * "ble" would read.
   */
  _L2_M1_BEGIN:

          ldr     I, M
          tst     I, #1                           // I = I % 2
          beq     _L2_END

  _L2_M1_20:

          INIT1x2

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))

          mov     BO, BC
  #else
          mov     BO, BC
          ldr     r3 , KK
          lsls    r4 , r3 , #4                    // 2 * 4 * 2 float values
          add     BO , BO , r4
          lsls    r4 , r3 , #3                    // 1 * 4 * 2 float values
          add     AO , AO , r4
  #endif

  #ifndef TRMMKERNEL
          ldr     K1, K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
          ldr     K1, K
          ldr     r3, KK
          sub     K1, K1, r3
          str     K1, KKK
  #else
          ldr     K1, KK
  #ifdef LEFT
          add     K1, K1, #1                      // number of values in AO
  #else
          add     K1, K1, #2                      // number of values in BO
  #endif
          str     K1, KKK
  #endif

          asr     L , K1, #3                      // L = K1 / 8
          cmp     L , #0
          ble     _L2_M1_40

  _L2_M1_22:

          .rept 8
          KERNEL1x2_SUB
          .endr

          subs    L, L, #1
          bgt     _L2_M1_22

  _L2_M1_40:

          ands    L , K1, #7                      // L = K1 % 8
          beq     _L2_M1_100

  _L2_M1_42:

          KERNEL1x2_SUB

          subs    L, L, #1
          bgt     _L2_M1_42

  _L2_M1_100:

          SAVE1x2

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
          ldr     r3 , K
          ldr     r4 , KKK
          sub     r3 , r3 , r4
          lsls    r4 , r3 , #4                    // 2 * 4 * 2 float values
          add     BO , BO , r4
          lsls    r4 , r3 , #3                    // 1 * 4 * 2 float values
          add     AO , AO , r4
  #endif

  #if defined(LEFT)
          ldr     r3 , KK
          add     r3 , r3 , #1                    // number of values in AO
          str     r3 , KK
  #endif

  _L2_END:

          mov     r3, BC
          ldr     r4, K
          lsl     r4, r4, #4                      // k * 2 * 4 * 2
          add     r3, r3, r4                      // B = B + K * 2 * 8
          mov     BC, r3

  #if !defined(LEFT)
          ldr     r3 , KK
          add     r3 , r3 , #2                    // number of values in BO
          str     r3 , KK
  #endif

          subs    J , J , #1                      // j--
          bgt     _L2_BEGIN
  /*********************************************************************************************/

  /*
   * Last single column when N is odd; this part handles its 2x1 tiles.
   *
   * "tst J, #1" is followed by "beq": tst does not write V, and when N == 1
   * this point is reached with V still holding the caller's value, so the
   * former "ble" could skip the column entirely. The other tst / ands / shift
   * tests are made independent of V in the same way.
   */
  _L1_BEGIN:

          ldr     J , N
          tst     J , #1
          beq     _L999

          ldr     CO1, C                          // CO1 = C
          ldr     r4 , LDC
          add     r3 , r4, CO1
          str     r3 , C                          // store C

  #if defined(LEFT)
          ldr     r3 , OFFSET
          str     r3 , KK
  #endif

          ldr     AO, A                           // AO = A

  _L1_M2_BEGIN:

          ldr     I, M
          asr     I, I, #1                        // I = I / 2
          cmp     I, #0
          ble     _L1_M1_BEGIN

  _L1_M2_20:

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))

          mov     BO, BC
  #else
          mov     BO, BC
          ldr     r3 , KK
          lsls    r4 , r3 , #3                    // 1 * 4 * 2 float values
          add     BO , BO , r4
          lsls    r4 , r3 , #4                    // 2 * 4 * 2 float values
          add     AO , AO , r4
  #endif

  #ifndef TRMMKERNEL
          ldr     K1, K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
          ldr     K1, K
          ldr     r3, KK
          sub     K1, K1, r3
          str     K1, KKK
  #else
          ldr     K1, KK
  #ifdef LEFT
          add     K1, K1, #2                      // number of values in AO
  #else
          add     K1, K1, #1                      // number of values in BO
  #endif
          str     K1, KKK
  #endif

          asr     L , K1, #3                      // L = K1 / 8
          cmp     L , #3
          blt     _L1_M2_30

          // L >= 3: pipelined main path, 8 steps per unrolled group.
          .align 5

          KERNEL2x1_I
          KERNEL2x1_M2
          .rept 3
          KERNEL2x1_M1
          KERNEL2x1_M2
          .endr

          sub     L, L, #2                        // first and last group are outside the loop

  _L1_M2_22:

          .rept 4
          KERNEL2x1_M1
          KERNEL2x1_M2
          .endr

          subs    L, L, #1
          bgt     _L1_M2_22

          .rept 3
          KERNEL2x1_M1
          KERNEL2x1_M2
          .endr
          KERNEL2x1_M1
          KERNEL2x1_E

          b       _L1_M2_44

  _L1_M2_30:
          // L < 3 here.
          tst     L, #3
          beq     _L1_M2_40                       // no full group of 8

          tst     L, #2
          beq     _L1_M2_32

          // two groups: 16 steps
          KERNEL2x1_I
          KERNEL2x1_M2
          .rept 6
          KERNEL2x1_M1
          KERNEL2x1_M2
          .endr
          KERNEL2x1_M1
          KERNEL2x1_E

          b       _L1_M2_44

  _L1_M2_32:

          tst     L, #1
          beq     _L1_M2_40

          // one group: 8 steps
          KERNEL2x1_I
          KERNEL2x1_M2
          .rept 2
          KERNEL2x1_M1
          KERNEL2x1_M2
          .endr
          KERNEL2x1_M1
          KERNEL2x1_E

          b       _L1_M2_44

  _L1_M2_40:

          INIT2x1                                 // nothing accumulated yet: start from zero

  _L1_M2_44:

          ands    L , K1, #7                      // L = K1 % 8
          beq     _L1_M2_100

  _L1_M2_46:

          KERNEL2x1_SUB

          subs    L, L, #1
          bne     _L1_M2_46

  _L1_M2_100:

          SAVE2x1

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
          ldr     r3 , K
          ldr     r4 , KKK
          sub     r3 , r3 , r4
          lsls    r4 , r3 , #3                    // 1 * 4 * 2 float values
          add     BO , BO , r4
          lsls    r4 , r3 , #4                    // 2 * 4 * 2 float values
          add     AO , AO , r4
  #endif

  #if defined(LEFT)
          ldr     r3 , KK
          add     r3 , r3 , #2                    // number of values in AO
          str     r3 , KK
  #endif

  _L1_M2_END:

          subs    I, I, #1
          bne     _L1_M2_20
  /*
   * Remaining odd row of the last single column (1x1 tile).
   *
   * tst / ands are followed by "beq" and the shift result is tested with an
   * explicit cmp, because none of those instructions writes the V flag that
   * "ble" would read. No pointer or KK fix-up follows SAVE1x1: the code falls
   * straight through to the function exit.
   */
  _L1_M1_BEGIN:

          ldr     I, M
          tst     I, #1                           // I = I % 2
          beq     _L1_END

  _L1_M1_20:

          INIT1x1

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))

          mov     BO, BC
  #else
          mov     BO, BC
          ldr     r3 , KK
          lsls    r4 , r3 , #3                    // 1 * 4 * 2 float values
          add     BO , BO , r4
          lsls    r4 , r3 , #3                    // 1 * 4 * 2 float values
          add     AO , AO , r4
  #endif

  #ifndef TRMMKERNEL
          ldr     K1, K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
          ldr     K1, K
          ldr     r3, KK
          sub     K1, K1, r3
          str     K1, KKK
  #else
          ldr     K1, KK
  #ifdef LEFT
          add     K1, K1, #1                      // number of values in AO
  #else
          add     K1, K1, #1                      // number of values in BO
  #endif
          str     K1, KKK
  #endif

          asr     L , K1, #3                      // L = K1 / 8
          cmp     L , #0
          ble     _L1_M1_40

  _L1_M1_22:

          .rept 8
          KERNEL1x1_SUB
          .endr

          subs    L, L, #1
          bgt     _L1_M1_22

  _L1_M1_40:

          ands    L , K1, #7                      // L = K1 % 8
          beq     _L1_M1_100

  _L1_M1_42:

          KERNEL1x1_SUB

          subs    L, L, #1
          bgt     _L1_M1_42

  _L1_M1_100:

          SAVE1x1
  /*
   * Common exit: return 0 in r0, reload s8 - s31 from the save area at
   * [fp, #-128], unwind the frame and return to the caller.
   */
  _L1_END:
  _L999:

          movs    r0, #0                          // set return value

          sub     r3, fp, #128
          vldm    r3, { s8 - s31 }                // restore floating point registers

          sub     sp, fp, #24                     // back to the pushed core registers
          pop     {r4 - r9, fp}
          bx      lr

          EPILOGUE