You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

ctrmm_kernel_2x2_vfp.S 24 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/10/16 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. **************************************************************************************/
  34. #define ASSEMBLER
  35. #include "common.h"
  /* Bytes subtracted from sp by the prologue for the local frame. */
  36. #define STACKSIZE 256
  /* Registers the prologue reads the incoming arguments from before it
   * spills them to the frame slots below.
   * NOTE(review): alpha arriving in s0/s1 and A in r3 looks like the
   * hard-float calling convention - confirm for soft-float-ABI builds. */
  37. #define OLD_M r0
  38. #define OLD_N r1
  39. #define OLD_K r2
  40. #define OLD_A r3
  41. #define OLD_ALPHA_R s0
  42. #define OLD_ALPHA_I s1
  43. /******************************************************
  44. * [fp, #-128] - [fp, #-64] is reserved
  45. * for store and restore of floating point
  46. * registers
  47. *******************************************************/
  /* Frame slots at negative offsets from fp (inside the STACKSIZE area).
   * KK holds the running offset, KKK the inner-loop trip count that was
   * actually used for the current tile; A, LDC, M, N, K and ALPHA_* are
   * spilled copies of the arguments (LDC already scaled to bytes). */
  48. #define KKK [fp, #-240]
  49. #define KK [fp, #-244 ]
  50. #define A [fp, #-248 ]
  51. #define LDC [fp, #-252 ]
  52. #define M [fp, #-256 ]
  53. #define N [fp, #-260 ]
  54. #define K [fp, #-264 ]
  55. #define ALPHA_I [fp, #-272]
  56. #define ALPHA_R [fp, #-280]
  /* Slots at positive offsets from fp. The prologue pushes seven registers
   * and sets fp = sp + 24, so these lie just above the pushed registers
   * (presumably the stack-passed arguments - verify against the caller).
   * The C slot is also overwritten with the advanced column pointer. */
  57. #define B [fp, #4 ]
  58. #define C [fp, #8 ]
  59. #define OLD_LDC [fp, #12 ]
  60. #define OFFSET [fp, #16 ]
  /* Loop counters. They alias r0-r2, the same registers as OLD_M/OLD_N/
   * OLD_K, so they are only usable once the prologue has stored those. */
  61. #define I r0
  62. #define J r1
  63. #define L r2
  /* Working pointers: AO/BO walk the A and B panels, CO1/CO2 address the
   * two output columns, BC is the base of the current B panel. K1 keeps
   * the inner-loop trip count for the current tile. */
  64. #define AO r5
  65. #define BO r6
  66. #define CO1 r8
  67. #define CO2 r9
  68. #define K1 r7
  69. #define BC r12
  /* Byte offsets used by the pld prefetch hints. C_PRE is not referenced
   * anywhere in the code shown in this file. */
  70. #define A_PRE 96
  71. #define B_PRE 96
  72. #define C_PRE 64
  73. /**************************************************************************************
  74. * Macro definitions
  75. **************************************************************************************/
  /*
   * Sign selection for the multiply-accumulate steps.
   *
   * fmacs  Sd, Sn, Sm  computes Sd = Sd + Sn * Sm
   * fnmacs Sd, Sn, Sm  computes Sd = Sd - Sn * Sm
   *
   * KMAC_R / KMAC_I are used by the KERNEL* macros for the second product
   * that is folded into each accumulator pair (the first product is always
   * added with fmuls/fmacs). FMAC_R1/R2/I1/I2 are used by the SAVE* macros
   * when the accumulators are combined with ALPHA_R / ALPHA_I.
   *
   * Which set is chosen depends on the NN/NT/.../TC build macro; the final
   * #else branch covers every remaining variant.
   * NOTE(review): the variant macros presumably select which operand is
   * conjugated - confirm against the build rules that define them.
   */
  76. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  77. #define KMAC_R fnmacs
  78. #define KMAC_I fmacs
  79. #define FMAC_R1 fmacs
  80. #define FMAC_R2 fnmacs
  81. #define FMAC_I1 fmacs
  82. #define FMAC_I2 fmacs
  83. #elif defined(CN) || defined(CT)
  84. #define KMAC_R fmacs
  85. #define KMAC_I fnmacs
  86. #define FMAC_R1 fmacs
  87. #define FMAC_R2 fnmacs
  88. #define FMAC_I1 fmacs
  89. #define FMAC_I2 fmacs
  90. #elif defined(NC) || defined(TC)
  91. #define KMAC_R fmacs
  92. #define KMAC_I fnmacs
  93. #define FMAC_R1 fmacs
  94. #define FMAC_R2 fmacs
  95. #define FMAC_I1 fnmacs
  96. #define FMAC_I2 fmacs
  97. #else
  98. #define KMAC_R fnmacs
  99. #define KMAC_I fmacs
  100. #define FMAC_R1 fmacs
  101. #define FMAC_R2 fmacs
  102. #define FMAC_I1 fnmacs
  103. #define FMAC_I2 fmacs
  104. #endif
  .macro INIT2x2
        /*
         * Clear the eight accumulators s8-s15 used by the 2x2 kernels.
         *
         * The zero is built from an integer register instead of the former
         * "s8 = s8 - s8": that subtraction yields NaN, not zero, whenever
         * s8 still holds a NaN or an infinity from earlier work, and the
         * NaN would then be accumulated into every result of the tile.
         *
         * Clobbers: r3 (always reloaded before its next use in this file).
         */
        mov      r3 , #0
        vmov     s8 , r3                 // all-zero bit pattern is +0.0f
        vmov.f32 s9 , s8
        vmov.f32 s10, s8
        vmov.f32 s11, s8
        vmov.f32 s12, s8
        vmov.f32 s13, s8
        vmov.f32 s14, s8
        vmov.f32 s15, s8
  .endm
  .macro KERNEL2x2_I
        /*
         * First step of an unrolled 2x2 run: loads two A elements into
         * s0-s3 and two B elements into s4-s7 (post-incrementing AO and BO
         * by 16 bytes each) and INITIALISES s8-s15 with the first products
         * via multiply instead of multiply-accumulate. Prefetches ahead on
         * both panels.
         */
        pld      [ AO, #A_PRE ]
        vldmia   AO!, { s0 - s3 }
        pld      [ BO, #B_PRE ]
        vldmia   BO!, { s4 - s7 }

        vmul.f32 s8 , s0, s4
        vmul.f32 s9 , s0, s5
        vmul.f32 s10, s2, s4
        vmul.f32 s11, s2, s5

        KMAC_R   s8 , s1, s5
        KMAC_I   s9 , s1, s4
        KMAC_R   s10, s3, s5
        KMAC_I   s11, s3, s4

        vmul.f32 s12, s0, s6
        vmul.f32 s13, s0, s7
        vmul.f32 s14, s2, s6
        vmul.f32 s15, s2, s7

        KMAC_R   s12, s1, s7
        KMAC_I   s13, s1, s6
        KMAC_R   s14, s3, s7
        KMAC_I   s15, s3, s6
  .endm
  .macro KERNEL2x2_M1
        /*
         * Middle step (prefetching flavour) of the 2x2 run: loads the next
         * two A and two B elements and accumulates their products into
         * s8-s15. AO and BO each advance by 16 bytes.
         */
        pld      [ AO, #A_PRE ]
        vldmia   AO!, { s0 - s3 }
        pld      [ BO, #B_PRE ]
        vldmia   BO!, { s4 - s7 }

        vmla.f32 s8 , s0, s4
        vmla.f32 s9 , s0, s5
        vmla.f32 s10, s2, s4
        vmla.f32 s11, s2, s5

        KMAC_R   s8 , s1, s5
        KMAC_I   s9 , s1, s4
        KMAC_R   s10, s3, s5
        KMAC_I   s11, s3, s4

        vmla.f32 s12, s0, s6
        vmla.f32 s13, s0, s7
        vmla.f32 s14, s2, s6
        vmla.f32 s15, s2, s7

        KMAC_R   s12, s1, s7
        KMAC_I   s13, s1, s6
        KMAC_R   s14, s3, s7
        KMAC_I   s15, s3, s6
  .endm
  .macro KERNEL2x2_M2
        /*
         * Middle step (no prefetch) of the 2x2 run: same loads and
         * accumulation into s8-s15 as KERNEL2x2_M1, without the pld hints.
         */
        vldmia   AO!, { s0 - s3 }
        vldmia   BO!, { s4 - s7 }

        vmla.f32 s8 , s0, s4
        vmla.f32 s9 , s0, s5
        vmla.f32 s10, s2, s4
        vmla.f32 s11, s2, s5

        KMAC_R   s8 , s1, s5
        KMAC_I   s9 , s1, s4
        KMAC_R   s10, s3, s5
        KMAC_I   s11, s3, s4

        vmla.f32 s12, s0, s6
        vmla.f32 s13, s0, s7
        vmla.f32 s14, s2, s6
        vmla.f32 s15, s2, s7

        KMAC_R   s12, s1, s7
        KMAC_I   s13, s1, s6
        KMAC_R   s14, s3, s7
        KMAC_I   s15, s3, s6
  .endm
  .macro KERNEL2x2_E
        /*
         * Last step of an unrolled 2x2 run. The instruction sequence is the
         * same as KERNEL2x2_M2: load 16 bytes from each panel, accumulate
         * into s8-s15.
         */
        vldmia   AO!, { s0 - s3 }
        vldmia   BO!, { s4 - s7 }

        vmla.f32 s8 , s0, s4
        vmla.f32 s9 , s0, s5
        vmla.f32 s10, s2, s4
        vmla.f32 s11, s2, s5

        KMAC_R   s8 , s1, s5
        KMAC_I   s9 , s1, s4
        KMAC_R   s10, s3, s5
        KMAC_I   s11, s3, s4

        vmla.f32 s12, s0, s6
        vmla.f32 s13, s0, s7
        vmla.f32 s14, s2, s6
        vmla.f32 s15, s2, s7

        KMAC_R   s12, s1, s7
        KMAC_I   s13, s1, s6
        KMAC_R   s14, s3, s7
        KMAC_I   s15, s3, s6
  .endm
  .macro KERNEL2x2_SUB
        /*
         * Single accumulation step used by the remainder loop of the 2x2
         * tile: one 16-byte load from each panel (with write-back), then
         * the eight accumulators s8-s15 are updated.
         */
        vldmia   AO!, { s0 - s3 }
        vldmia   BO!, { s4 - s7 }

        vmla.f32 s8 , s0, s4
        vmla.f32 s9 , s0, s5
        vmla.f32 s10, s2, s4
        vmla.f32 s11, s2, s5

        KMAC_R   s8 , s1, s5
        KMAC_I   s9 , s1, s4
        KMAC_R   s10, s3, s5
        KMAC_I   s11, s3, s4

        vmla.f32 s12, s0, s6
        vmla.f32 s13, s0, s7
        vmla.f32 s14, s2, s6
        vmla.f32 s15, s2, s7

        KMAC_R   s12, s1, s7
        KMAC_I   s13, s1, s6
        KMAC_R   s14, s3, s7
        KMAC_I   s15, s3, s6
  .endm
  .macro SAVE2x2
        /*
         * Combine the 2x2 accumulators s8-s15 with ALPHA_R/ALPHA_I and
         * store the results: s8-s11 go to CO1, s12-s15 to CO2 = CO1 + LDC.
         * CO1 is then advanced by 16 bytes. Nothing is read from the
         * destination; the stored values come only from the accumulators.
         *
         * The temporaries s4-s7 are cleared from an integer zero. They used
         * to be cleared with "x = x - x", but at this point they still hold
         * the last operands loaded from the B panel, and x - x is NaN when
         * x is NaN or infinite - which would poison outputs that do not
         * depend on that operand at all.
         *
         * Clobbers: r3, s0, s1, s4-s7, CO2.
         */
        ldr      r3 , LDC
        add      CO2 , CO1, r3
        flds     s0, ALPHA_R
        flds     s1, ALPHA_I

        mov      r3 , #0                 // r3 is free again: reuse as zero
        vmov     s4 , r3
        vmov.f32 s5 , s4
        vmov.f32 s6 , s4
        vmov.f32 s7 , s4

        FMAC_R1  s4 , s0 , s8
        FMAC_I1  s5 , s0 , s9
        FMAC_R2  s4 , s1 , s9
        FMAC_I2  s5 , s1 , s8
        FMAC_R1  s6 , s0 , s10
        FMAC_I1  s7 , s0 , s11
        FMAC_R2  s6 , s1 , s11
        FMAC_I2  s7 , s1 , s10
        fstmias  CO1, { s4 - s7 }

        vmov     s4 , r3                 // fresh zeros for the second column
        vmov.f32 s5 , s4
        vmov.f32 s6 , s4
        vmov.f32 s7 , s4

        FMAC_R1  s4 , s0 , s12
        FMAC_I1  s5 , s0 , s13
        FMAC_R2  s4 , s1 , s13
        FMAC_I2  s5 , s1 , s12
        FMAC_R1  s6 , s0 , s14
        FMAC_I1  s7 , s0 , s15
        FMAC_R2  s6 , s1 , s15
        FMAC_I2  s7 , s1 , s14
        fstmias  CO2, { s4 - s7 }

        add      CO1, CO1, #16
  .endm
  252. /******************************************************************************/
  .macro INIT1x2
        /*
         * Clear the four accumulators s8, s9, s12, s13 used by the 1x2
         * kernels. Zero comes from an integer register: "s8 - s8" is NaN
         * rather than zero when s8 holds a stale NaN or infinity.
         *
         * Clobbers: r3 (always reloaded before its next use in this file).
         */
        mov      r3 , #0
        vmov     s8 , r3                 // all-zero bit pattern is +0.0f
        vmov.f32 s9 , s8
        vmov.f32 s12, s8
        vmov.f32 s13, s8
  .endm
  .macro KERNEL1x2_I
        /*
         * Initialising step for the 1x2 tile: one A element (s0,s1) against
         * two B elements (s4-s7). s8, s9, s12, s13 are overwritten with the
         * first products; AO advances 8 bytes, BO 16 bytes.
         */
        vldr     s0 , [ AO ]
        vldr     s1 , [ AO, #4 ]
        vldr     s4 , [ BO ]
        vldr     s5 , [ BO, #4 ]
        vldr     s6 , [ BO, #8 ]
        vldr     s7 , [ BO, #12 ]

        vmul.f32 s8 , s0, s4
        KMAC_R   s8 , s1, s5
        vmul.f32 s9 , s0, s5
        KMAC_I   s9 , s1, s4

        vmul.f32 s12, s0, s6
        KMAC_R   s12, s1, s7
        vmul.f32 s13, s0, s7
        KMAC_I   s13, s1, s6

        add      BO , BO, #16
        add      AO , AO, #8
  .endm
  .macro KERNEL1x2_M1
        /*
         * Accumulating step for the 1x2 tile: loads one A element and two
         * B elements and adds their products into s8, s9, s12, s13.
         * AO advances 8 bytes, BO 16 bytes.
         */
        vldr     s0 , [ AO ]
        vldr     s1 , [ AO, #4 ]
        vldr     s4 , [ BO ]
        vldr     s5 , [ BO, #4 ]
        vldr     s6 , [ BO, #8 ]
        vldr     s7 , [ BO, #12 ]

        vmla.f32 s8 , s0, s4
        KMAC_R   s8 , s1, s5
        vmla.f32 s9 , s0, s5
        KMAC_I   s9 , s1, s4

        vmla.f32 s12, s0, s6
        KMAC_R   s12, s1, s7
        vmla.f32 s13, s0, s7
        KMAC_I   s13, s1, s6

        add      BO , BO, #16
        add      AO , AO, #8
  .endm
  .macro KERNEL1x2_M2
        /*
         * Second accumulating flavour for the 1x2 tile; the instruction
         * sequence matches KERNEL1x2_M1 (six scalar loads, four accumulator
         * updates, pointer bumps of 16 and 8 bytes).
         */
        vldr     s0 , [ AO ]
        vldr     s1 , [ AO, #4 ]
        vldr     s4 , [ BO ]
        vldr     s5 , [ BO, #4 ]
        vldr     s6 , [ BO, #8 ]
        vldr     s7 , [ BO, #12 ]

        vmla.f32 s8 , s0, s4
        KMAC_R   s8 , s1, s5
        vmla.f32 s9 , s0, s5
        KMAC_I   s9 , s1, s4

        vmla.f32 s12, s0, s6
        KMAC_R   s12, s1, s7
        vmla.f32 s13, s0, s7
        KMAC_I   s13, s1, s6

        add      BO , BO, #16
        add      AO , AO, #8
  .endm
  .macro KERNEL1x2_E
        /*
         * Closing flavour for the 1x2 tile. Performs the same loads and
         * accumulation into s8, s9, s12, s13 as the M1/M2 steps and bumps
         * BO by 16 and AO by 8 bytes.
         */
        vldr     s0 , [ AO ]
        vldr     s1 , [ AO, #4 ]
        vldr     s4 , [ BO ]
        vldr     s5 , [ BO, #4 ]
        vldr     s6 , [ BO, #8 ]
        vldr     s7 , [ BO, #12 ]

        vmla.f32 s8 , s0, s4
        KMAC_R   s8 , s1, s5
        vmla.f32 s9 , s0, s5
        KMAC_I   s9 , s1, s4

        vmla.f32 s12, s0, s6
        KMAC_R   s12, s1, s7
        vmla.f32 s13, s0, s7
        KMAC_I   s13, s1, s6

        add      BO , BO, #16
        add      AO , AO, #8
  .endm
  .macro KERNEL1x2_SUB
        /*
         * One accumulation step of the 1x2 tile, used both in the unrolled
         * loop and in the remainder loop. Inputs: AO -> one A element,
         * BO -> two B elements. Updates s8, s9, s12, s13 and advances
         * BO by 16 bytes and AO by 8 bytes.
         */
        vldr     s0 , [ AO ]
        vldr     s1 , [ AO, #4 ]
        vldr     s4 , [ BO ]
        vldr     s5 , [ BO, #4 ]
        vldr     s6 , [ BO, #8 ]
        vldr     s7 , [ BO, #12 ]

        vmla.f32 s8 , s0, s4
        KMAC_R   s8 , s1, s5
        vmla.f32 s9 , s0, s5
        KMAC_I   s9 , s1, s4

        vmla.f32 s12, s0, s6
        KMAC_R   s12, s1, s7
        vmla.f32 s13, s0, s7
        KMAC_I   s13, s1, s6

        add      BO , BO, #16
        add      AO , AO, #8
  .endm
  .macro SAVE1x2
        /*
         * Combine the 1x2 accumulators with ALPHA_R/ALPHA_I and store them:
         * (s8,s9) to CO1 and (s12,s13) to CO2 = CO1 + LDC. CO1 is then
         * advanced by 8 bytes.
         *
         * s4/s5 are cleared from an integer zero. The former "x = x - x"
         * produced NaN whenever the stale content of s4/s5 (the last B
         * operand loaded by the kernel) was NaN or infinite.
         *
         * Clobbers: r3, s0, s1, s4, s5, CO2.
         */
        ldr      r3 , LDC
        add      CO2 , CO1, r3
        flds     s0, ALPHA_R
        flds     s1, ALPHA_I

        mov      r3 , #0                 // r3 is free again: reuse as zero
        vmov     s4 , r3
        vmov.f32 s5 , s4

        FMAC_R1  s4 , s0 , s8
        FMAC_I1  s5 , s0 , s9
        FMAC_R2  s4 , s1 , s9
        FMAC_I2  s5 , s1 , s8
        fstmias  CO1, { s4 - s5 }

        vmov     s4 , r3                 // fresh zeros for the second column
        vmov.f32 s5 , s4

        FMAC_R1  s4 , s0 , s12
        FMAC_I1  s5 , s0 , s13
        FMAC_R2  s4 , s1 , s13
        FMAC_I2  s5 , s1 , s12
        fstmias  CO2, { s4 - s5 }

        add      CO1, CO1, #8
  .endm
  370. /******************************************************************************/
  .macro INIT2x1
        /*
         * Clear the four accumulators s8-s11 used by the 2x1 kernels.
         * Zero comes from an integer register: "s8 - s8" is NaN rather
         * than zero when s8 holds a stale NaN or infinity.
         *
         * Clobbers: r3 (always reloaded before its next use in this file).
         */
        mov      r3 , #0
        vmov     s8 , r3                 // all-zero bit pattern is +0.0f
        vmov.f32 s9 , s8
        vmov.f32 s10, s8
        vmov.f32 s11, s8
  .endm
  .macro KERNEL2x1_I
        /*
         * Initialising step for the 2x1 tile: two A elements (s0-s3)
         * against one B element (s4,s5). s8-s11 are overwritten with the
         * first products; BO advances 8 bytes, AO 16 bytes.
         */
        vldr     s0 , [ AO ]
        vldr     s1 , [ AO, #4 ]
        vldr     s2 , [ AO, #8 ]
        vldr     s3 , [ AO, #12 ]
        vldr     s4 , [ BO ]
        vldr     s5 , [ BO, #4 ]

        vmul.f32 s8 , s0, s4
        KMAC_R   s8 , s1, s5
        vmul.f32 s9 , s0, s5
        KMAC_I   s9 , s1, s4

        vmul.f32 s10, s2, s4
        KMAC_R   s10, s3, s5
        vmul.f32 s11, s2, s5
        KMAC_I   s11, s3, s4

        add      BO , BO, #8
        add      AO , AO, #16
  .endm
  .macro KERNEL2x1_M1
        /*
         * Accumulating step for the 2x1 tile: loads two A elements and one
         * B element and adds their products into s8-s11. BO advances
         * 8 bytes, AO 16 bytes.
         */
        vldr     s0 , [ AO ]
        vldr     s1 , [ AO, #4 ]
        vldr     s2 , [ AO, #8 ]
        vldr     s3 , [ AO, #12 ]
        vldr     s4 , [ BO ]
        vldr     s5 , [ BO, #4 ]

        vmla.f32 s8 , s0, s4
        KMAC_R   s8 , s1, s5
        vmla.f32 s9 , s0, s5
        KMAC_I   s9 , s1, s4

        vmla.f32 s10, s2, s4
        KMAC_R   s10, s3, s5
        vmla.f32 s11, s2, s5
        KMAC_I   s11, s3, s4

        add      BO , BO, #8
        add      AO , AO, #16
  .endm
  .macro KERNEL2x1_M2
        /*
         * Second accumulating flavour for the 2x1 tile; the instruction
         * sequence matches KERNEL2x1_M1 (six scalar loads, four accumulator
         * updates, pointer bumps of 8 and 16 bytes).
         */
        vldr     s0 , [ AO ]
        vldr     s1 , [ AO, #4 ]
        vldr     s2 , [ AO, #8 ]
        vldr     s3 , [ AO, #12 ]
        vldr     s4 , [ BO ]
        vldr     s5 , [ BO, #4 ]

        vmla.f32 s8 , s0, s4
        KMAC_R   s8 , s1, s5
        vmla.f32 s9 , s0, s5
        KMAC_I   s9 , s1, s4

        vmla.f32 s10, s2, s4
        KMAC_R   s10, s3, s5
        vmla.f32 s11, s2, s5
        KMAC_I   s11, s3, s4

        add      BO , BO, #8
        add      AO , AO, #16
  .endm
  .macro KERNEL2x1_E
        /*
         * Closing flavour for the 2x1 tile. Performs the same loads and
         * accumulation into s8-s11 as the M1/M2 steps and bumps BO by 8
         * and AO by 16 bytes.
         */
        vldr     s0 , [ AO ]
        vldr     s1 , [ AO, #4 ]
        vldr     s2 , [ AO, #8 ]
        vldr     s3 , [ AO, #12 ]
        vldr     s4 , [ BO ]
        vldr     s5 , [ BO, #4 ]

        vmla.f32 s8 , s0, s4
        KMAC_R   s8 , s1, s5
        vmla.f32 s9 , s0, s5
        KMAC_I   s9 , s1, s4

        vmla.f32 s10, s2, s4
        KMAC_R   s10, s3, s5
        vmla.f32 s11, s2, s5
        KMAC_I   s11, s3, s4

        add      BO , BO, #8
        add      AO , AO, #16
  .endm
  .macro KERNEL2x1_SUB
        /*
         * One accumulation step of the 2x1 tile for the remainder loop.
         * Inputs: AO -> two A elements, BO -> one B element. Updates
         * s8-s11 and advances BO by 8 bytes and AO by 16 bytes.
         */
        vldr     s0 , [ AO ]
        vldr     s1 , [ AO, #4 ]
        vldr     s2 , [ AO, #8 ]
        vldr     s3 , [ AO, #12 ]
        vldr     s4 , [ BO ]
        vldr     s5 , [ BO, #4 ]

        vmla.f32 s8 , s0, s4
        KMAC_R   s8 , s1, s5
        vmla.f32 s9 , s0, s5
        KMAC_I   s9 , s1, s4

        vmla.f32 s10, s2, s4
        KMAC_R   s10, s3, s5
        vmla.f32 s11, s2, s5
        KMAC_I   s11, s3, s4

        add      BO , BO, #8
        add      AO , AO, #16
  .endm
  .macro SAVE2x1
        /*
         * Combine the 2x1 accumulators s8-s11 with ALPHA_R/ALPHA_I and
         * store the four resulting words at CO1, then advance CO1 by
         * 16 bytes.
         *
         * s4-s7 are cleared from an integer zero. The former "x = x - x"
         * produced NaN whenever the stale register content was NaN or
         * infinite (s4/s5 still hold the last B operand at this point).
         *
         * Clobbers: r3, s0, s1, s4-s7. r3 is newly clobbered by this
         * macro; every expansion site in this file reloads r3 before
         * reading it.
         */
        flds     s0, ALPHA_R
        flds     s1, ALPHA_I

        mov      r3 , #0
        vmov     s4 , r3                 // all-zero bit pattern is +0.0f
        vmov.f32 s5 , s4
        vmov.f32 s6 , s4
        vmov.f32 s7 , s4

        FMAC_R1  s4 , s0 , s8
        FMAC_I1  s5 , s0 , s9
        FMAC_R2  s4 , s1 , s9
        FMAC_I2  s5 , s1 , s8
        FMAC_R1  s6 , s0 , s10
        FMAC_I1  s7 , s0 , s11
        FMAC_R2  s6 , s1 , s11
        FMAC_I2  s7 , s1 , s10
        fstmias  CO1, { s4 - s7 }

        add      CO1, CO1, #16
  .endm
  485. /******************************************************************************/
  .macro INIT1x1
        /*
         * Clear the two accumulators s8, s9 used by the 1x1 kernels.
         * Zero comes from an integer register: "s8 - s8" is NaN rather
         * than zero when s8 holds a stale NaN or infinity.
         *
         * Clobbers: r3 (always reloaded before its next use in this file).
         */
        mov      r3 , #0
        vmov     s8 , r3                 // all-zero bit pattern is +0.0f
        vmov.f32 s9 , s8
  .endm
  .macro KERNEL1x1_I
        /*
         * Initialising step for the 1x1 tile: one A element (s0,s1) against
         * one B element (s4,s5). s8 and s9 are overwritten with the first
         * products; AO and BO each advance 8 bytes.
         */
        vldr     s0 , [ AO ]
        vldr     s1 , [ AO, #4 ]
        vldr     s4 , [ BO ]
        vldr     s5 , [ BO, #4 ]

        vmul.f32 s8 , s0, s4
        KMAC_R   s8 , s1, s5
        vmul.f32 s9 , s0, s5
        KMAC_I   s9 , s1, s4

        add      BO , BO, #8
        add      AO , AO, #8
  .endm
  .macro KERNEL1x1_M1
        /*
         * Accumulating step for the 1x1 tile: loads one A and one B element
         * and adds their products into s8 and s9. AO and BO each advance
         * 8 bytes.
         */
        vldr     s0 , [ AO ]
        vldr     s1 , [ AO, #4 ]
        vldr     s4 , [ BO ]
        vldr     s5 , [ BO, #4 ]

        vmla.f32 s8 , s0, s4
        KMAC_R   s8 , s1, s5
        vmla.f32 s9 , s0, s5
        KMAC_I   s9 , s1, s4

        add      BO , BO, #8
        add      AO , AO, #8
  .endm
  .macro KERNEL1x1_M2
        /*
         * Second accumulating flavour for the 1x1 tile; the instruction
         * sequence matches KERNEL1x1_M1 (four scalar loads, two accumulator
         * updates, both pointers bumped by 8 bytes).
         */
        vldr     s0 , [ AO ]
        vldr     s1 , [ AO, #4 ]
        vldr     s4 , [ BO ]
        vldr     s5 , [ BO, #4 ]

        vmla.f32 s8 , s0, s4
        KMAC_R   s8 , s1, s5
        vmla.f32 s9 , s0, s5
        KMAC_I   s9 , s1, s4

        add      BO , BO, #8
        add      AO , AO, #8
  .endm
  .macro KERNEL1x1_E
        /*
         * Closing flavour for the 1x1 tile. Performs the same loads and
         * accumulation into s8 and s9 as the M1/M2 steps and bumps both
         * pointers by 8 bytes.
         */
        vldr     s0 , [ AO ]
        vldr     s1 , [ AO, #4 ]
        vldr     s4 , [ BO ]
        vldr     s5 , [ BO, #4 ]

        vmla.f32 s8 , s0, s4
        KMAC_R   s8 , s1, s5
        vmla.f32 s9 , s0, s5
        KMAC_I   s9 , s1, s4

        add      BO , BO, #8
        add      AO , AO, #8
  .endm
  .macro KERNEL1x1_SUB
        /*
         * One accumulation step of the 1x1 tile, used both in the unrolled
         * loop and in the remainder loop. Inputs: AO -> one A element,
         * BO -> one B element. Updates s8 and s9 and advances both
         * pointers by 8 bytes.
         */
        vldr     s0 , [ AO ]
        vldr     s1 , [ AO, #4 ]
        vldr     s4 , [ BO ]
        vldr     s5 , [ BO, #4 ]

        vmla.f32 s8 , s0, s4
        KMAC_R   s8 , s1, s5
        vmla.f32 s9 , s0, s5
        KMAC_I   s9 , s1, s4

        add      BO , BO, #8
        add      AO , AO, #8
  .endm
  .macro SAVE1x1
        /*
         * Combine the 1x1 accumulators s8/s9 with ALPHA_R/ALPHA_I and store
         * the two resulting words at CO1, then advance CO1 by 8 bytes.
         *
         * s4/s5 are cleared from an integer zero. The former "x = x - x"
         * produced NaN whenever the stale register content (the last B
         * operand loaded by the kernel) was NaN or infinite.
         *
         * Clobbers: r3, s0, s1, s4, s5. r3 is newly clobbered by this
         * macro; the code following its only expansion reloads r3 first.
         */
        flds     s0, ALPHA_R
        flds     s1, ALPHA_I

        mov      r3 , #0
        vmov     s4 , r3                 // all-zero bit pattern is +0.0f
        vmov.f32 s5 , s4

        FMAC_R1  s4 , s0 , s8
        FMAC_I1  s5 , s0 , s9
        FMAC_R2  s4 , s1 , s9
        FMAC_I2  s5 , s1 , s8
        fstmias  CO1, { s4 - s5 }

        add      CO1, CO1, #8
  .endm
  562. /**************************************************************************************
  563. * End of macro definitions
  564. **************************************************************************************/
  /*
   * Function entry.
   *
   * Saves r4-r9 and fp, points fp at the saved-fp slot (sp + 24 after the
   * seven-register push), reserves STACKSIZE bytes and spills the register
   * arguments into their frame slots. s8-s15 are saved at [fp, #-128] and
   * restored by the epilogue. LDC is converted to a byte stride (two
   * 4-byte words per element) and KK is seeded from OFFSET, negated when
   * LEFT is not defined.
   *
   * The column-pair count test uses an explicit cmp. The previous
   * "asrs ; ble" pair evaluated the LE condition with whatever V flag the
   * caller left behind (asrs does not write V), so a stale V = 1 made the
   * routine skip its work.
   */
  PROLOGUE
  .align 5
        push    {r4 - r9, fp}
        add     fp, sp, #24
        sub     sp, sp, #STACKSIZE      // reserve stack

        str     OLD_M, M
        str     OLD_N, N
        str     OLD_K, K
        str     OLD_A, A
        vstr    OLD_ALPHA_R, ALPHA_R
        vstr    OLD_ALPHA_I, ALPHA_I

        sub     r3, fp, #128
        vstm    r3, { s8 - s15}         // store floating point registers

        ldr     r3, OLD_LDC
        lsl     r3, r3, #3              // ldc = ldc * 4 * 2
        str     r3, LDC

        ldr     r3, OFFSET
  #ifndef LEFT
        neg     r3 , r3
  #endif
        str     r3 , KK

        ldr     BC, B
        ldr     J, N
        asr     J, J, #1                // J = J / 2
        cmp     J, #0                   // defines N, Z and V for the test
        ble     _L1_BEGIN
  /*
   * Start of one pass over a pair of output columns: CO1 takes the current
   * column pointer, the stored pointer is moved on by two column strides,
   * KK is reset from OFFSET for LEFT builds and AO restarts at the A panel.
   */
  _L2_BEGIN:
        ldr     r4 , LDC
        ldr     CO1, C                  // CO1 = current output column
        lsl     r4 , r4 , #1            // two columns worth of bytes
        add     r3 , CO1, r4
        str     r3 , C                  // next pass starts two columns on
  #ifdef LEFT
        ldr     r3 , OFFSET
        str     r3 , KK
  #endif
        ldr     AO, A                   // AO = start of the A panel
        pld     [AO , #A_PRE-64]
        pld     [AO , #A_PRE-32]
  /*
   * 2x2 tiles of the current column pair.
   *
   * For every pair of rows: position BO/AO (skipping KK steps in the
   * variants that need it), work out the trip count K1, run the unrolled
   * kernel in groups of eight steps, finish the K1 % 8 remainder with
   * KERNEL2x2_SUB, store through SAVE2x2 and update the pointers and KK.
   *
   * Conditional branches after asr/tst/ands now test only flags those
   * instructions define (explicit cmp, or beq on the zero result). The
   * former "ble" also consulted the V flag, which shift and logical
   * instructions leave untouched.
   * The unrolled macro runs are written with .rept; the expansion is the
   * same instruction sequence as the hand-written list.
   */
  _L2_M2_BEGIN:
        ldr     I, M
        asr     I, I, #1                // I = I / 2
        cmp     I, #0
        ble     _L2_M1_BEGIN

  _L2_M2_20:
  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
        mov     BO, BC
  #else
        mov     BO, BC
        ldr     r3 , KK
        lsls    r4 , r3 , #4            // 2 * 4 * 2 float values
        add     BO , BO , r4
        lsls    r4 , r3 , #4            // 2 * 4 * 2 float values
        add     AO , AO , r4
  #endif

  #ifndef TRMMKERNEL
        ldr     K1, K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        ldr     K1, K
        ldr     r3, KK
        sub     K1, K1, r3
        str     K1, KKK
  #else
        ldr     K1, KK
  #ifdef LEFT
        add     K1, K1, #2              // number of values in AO
  #else
        add     K1, K1, #2              // number of values in BO
  #endif
        str     K1, KKK
  #endif

        asr     L , K1, #3              // L = K1 / 8
        cmp     L , #3
        blt     _L2_M2_30

  .align 5
        /* L >= 3: first group of eight steps initialises the accumulators */
        KERNEL2x2_I
        KERNEL2x2_M2
  .rept 3
        KERNEL2x2_M1
        KERNEL2x2_M2
  .endr
        sub     L, L, #2

  _L2_M2_22:
  .rept 4
        KERNEL2x2_M1
        KERNEL2x2_M2
  .endr
        subs    L, L, #1
        bgt     _L2_M2_22

        /* final group of eight steps */
  .rept 3
        KERNEL2x2_M1
        KERNEL2x2_M2
  .endr
        KERNEL2x2_M1
        KERNEL2x2_E
        b       _L2_M2_44

  _L2_M2_30:
        /* L is 0, 1 or 2 here */
        tst     L, #3
        beq     _L2_M2_40
        tst     L, #2
        beq     _L2_M2_32

        /* L == 2: sixteen steps */
        KERNEL2x2_I
        KERNEL2x2_M2
  .rept 6
        KERNEL2x2_M1
        KERNEL2x2_M2
  .endr
        KERNEL2x2_M1
        KERNEL2x2_E
        b       _L2_M2_44

  _L2_M2_32:
        tst     L, #1
        beq     _L2_M2_40

        /* L == 1: eight steps */
        KERNEL2x2_I
        KERNEL2x2_M2
  .rept 2
        KERNEL2x2_M1
        KERNEL2x2_M2
  .endr
        KERNEL2x2_M1
        KERNEL2x2_E
        b       _L2_M2_44

  _L2_M2_40:
        /* no full group ran, so the accumulators must be cleared */
        INIT2x2

  _L2_M2_44:
        ands    L , K1, #7              // L = K1 % 8
        beq     _L2_M2_100

  _L2_M2_46:
        KERNEL2x2_SUB
        subs    L, L, #1
        bne     _L2_M2_46

  _L2_M2_100:
        SAVE2x2

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
        ldr     r3 , K
        ldr     r4 , KKK
        sub     r3 , r3 , r4            // steps that were not executed
        lsls    r4 , r3 , #4            // 2 * 4 * 2 float values
        add     BO , BO , r4
        lsls    r4 , r3 , #4            // 2 * 4 * 2 float values
        add     AO , AO , r4
  #endif

  #if defined(LEFT)
        ldr     r3 , KK
        add     r3 , r3 , #2            // number of values in AO
        str     r3 , KK
  #endif

  _L2_M2_END:
        subs    I, I, #1
        bne     _L2_M2_20
  /*
   * Left-over single row of the current column pair (taken when the row
   * count is odd): 1x2 tile handled with KERNEL1x2_SUB, eight steps per
   * loop pass plus a K1 % 8 remainder loop, stored through SAVE1x2.
   *
   * Branches after tst/ands use beq and the shifted trip count is compared
   * explicitly: the former "ble" depended on the V flag, which tst, ands
   * and asrs do not write.
   */
  _L2_M1_BEGIN:
        ldr     I, M
        tst     I, #1                   // I = I % 2
        beq     _L2_END

  _L2_M1_20:
        INIT1x2

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
        mov     BO, BC
  #else
        mov     BO, BC
        ldr     r3 , KK
        lsls    r4 , r3 , #4            // 2 * 4 * 2 float values
        add     BO , BO , r4
        lsls    r4 , r3 , #3            // 1 * 4 * 2 float values
        add     AO , AO , r4
  #endif

  #ifndef TRMMKERNEL
        ldr     K1, K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        ldr     K1, K
        ldr     r3, KK
        sub     K1, K1, r3
        str     K1, KKK
  #else
        ldr     K1, KK
  #ifdef LEFT
        add     K1, K1, #1              // number of values in AO
  #else
        add     K1, K1, #2              // number of values in BO
  #endif
        str     K1, KKK
  #endif

        asr     L , K1, #3              // L = K1 / 8
        cmp     L , #0
        ble     _L2_M1_40

  _L2_M1_22:
  .rept 8
        KERNEL1x2_SUB
  .endr
        subs    L, L, #1
        bgt     _L2_M1_22

  _L2_M1_40:
        ands    L , K1, #7              // L = K1 % 8
        beq     _L2_M1_100

  _L2_M1_42:
        KERNEL1x2_SUB
        subs    L, L, #1
        bgt     _L2_M1_42

  _L2_M1_100:
        SAVE1x2

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
        ldr     r3 , K
        ldr     r4 , KKK
        sub     r3 , r3 , r4            // steps that were not executed
        lsls    r4 , r3 , #4            // 2 * 4 * 2 float values
        add     BO , BO , r4
        lsls    r4 , r3 , #3            // 1 * 4 * 2 float values
        add     AO , AO , r4
  #endif

  #if defined(LEFT)
        ldr     r3 , KK
        add     r3 , r3 , #1            // number of values in AO
        str     r3 , KK
  #endif
  /*
   * End of one column-pair pass: move the B panel base past the pair just
   * consumed (K steps of 16 bytes), bump KK by two for non-LEFT builds and
   * loop while column pairs remain.
   */
  _L2_END:
        ldr     r4, K
        lsl     r4, r4, #4              // k * 2 * 4 * 2
        add     r3, BC, r4              // B = B + K * 2 * 8
        mov     BC, r3
  #ifndef LEFT
        ldr     r3 , KK
        add     r3 , r3 , #2            // number of values in BO
        str     r3 , KK
  #endif
        subs    J , J , #1              // j--
        bgt     _L2_BEGIN
  814. /*********************************************************************************************/
  /*
   * Trailing single column (taken when the column count is odd): set up
   * CO1, advance the stored column pointer by one stride, reset KK for
   * LEFT builds and restart AO at the A panel.
   *
   * The parity test branches with beq. The former "ble" also read the V
   * flag, which tst does not write; when this label is reached straight
   * from the prologue, V still holds whatever the caller left in it.
   */
  _L1_BEGIN:
        ldr     J , N
        tst     J , #1
        beq     _L999

        ldr     CO1, C                  // CO1 = C
        ldr     r4 , LDC
        add     r3 , r4, CO1
        str     r3 , C                  // store C
  #if defined(LEFT)
        ldr     r3 , OFFSET
        str     r3 , KK
  #endif
        ldr     AO, A                   // AO = A
  /*
   * 2x1 tiles of the trailing column.
   *
   * Same structure as the 2x2 loop: position BO/AO, compute K1, run the
   * unrolled kernel in groups of eight steps, finish the K1 % 8 remainder
   * with KERNEL2x1_SUB, store through SAVE2x1 and update pointers and KK.
   *
   * Conditional branches after asr/tst/ands test only flags those
   * instructions define (explicit cmp, or beq on the zero result). The
   * former "ble" also consulted the V flag, which shift and logical
   * instructions leave untouched.
   * The unrolled macro runs are written with .rept; the expansion is the
   * same instruction sequence as the hand-written list.
   */
  _L1_M2_BEGIN:
        ldr     I, M
        asr     I, I, #1                // I = I / 2
        cmp     I, #0
        ble     _L1_M1_BEGIN

  _L1_M2_20:
  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
        mov     BO, BC
  #else
        mov     BO, BC
        ldr     r3 , KK
        lsls    r4 , r3 , #3            // 1 * 4 * 2 float values
        add     BO , BO , r4
        lsls    r4 , r3 , #4            // 2 * 4 * 2 float values
        add     AO , AO , r4
  #endif

  #ifndef TRMMKERNEL
        ldr     K1, K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        ldr     K1, K
        ldr     r3, KK
        sub     K1, K1, r3
        str     K1, KKK
  #else
        ldr     K1, KK
  #ifdef LEFT
        add     K1, K1, #2              // number of values in AO
  #else
        add     K1, K1, #1              // number of values in BO
  #endif
        str     K1, KKK
  #endif

        asr     L , K1, #3              // L = K1 / 8
        cmp     L , #3
        blt     _L1_M2_30

  .align 5
        /* L >= 3: first group of eight steps initialises the accumulators */
        KERNEL2x1_I
        KERNEL2x1_M2
  .rept 3
        KERNEL2x1_M1
        KERNEL2x1_M2
  .endr
        sub     L, L, #2

  _L1_M2_22:
  .rept 4
        KERNEL2x1_M1
        KERNEL2x1_M2
  .endr
        subs    L, L, #1
        bgt     _L1_M2_22

        /* final group of eight steps */
  .rept 3
        KERNEL2x1_M1
        KERNEL2x1_M2
  .endr
        KERNEL2x1_M1
        KERNEL2x1_E
        b       _L1_M2_44

  _L1_M2_30:
        /* L is 0, 1 or 2 here */
        tst     L, #3
        beq     _L1_M2_40
        tst     L, #2
        beq     _L1_M2_32

        /* L == 2: sixteen steps */
        KERNEL2x1_I
        KERNEL2x1_M2
  .rept 6
        KERNEL2x1_M1
        KERNEL2x1_M2
  .endr
        KERNEL2x1_M1
        KERNEL2x1_E
        b       _L1_M2_44

  _L1_M2_32:
        tst     L, #1
        beq     _L1_M2_40

        /* L == 1: eight steps */
        KERNEL2x1_I
        KERNEL2x1_M2
  .rept 2
        KERNEL2x1_M1
        KERNEL2x1_M2
  .endr
        KERNEL2x1_M1
        KERNEL2x1_E
        b       _L1_M2_44

  _L1_M2_40:
        /* no full group ran, so the accumulators must be cleared */
        INIT2x1

  _L1_M2_44:
        ands    L , K1, #7              // L = K1 % 8
        beq     _L1_M2_100

  _L1_M2_46:
        KERNEL2x1_SUB
        subs    L, L, #1
        bne     _L1_M2_46

  _L1_M2_100:
        SAVE2x1

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
        ldr     r3 , K
        ldr     r4 , KKK
        sub     r3 , r3 , r4            // steps that were not executed
        lsls    r4 , r3 , #3            // 1 * 4 * 2 float values
        add     BO , BO , r4
        lsls    r4 , r3 , #4            // 2 * 4 * 2 float values
        add     AO , AO , r4
  #endif

  #if defined(LEFT)
        ldr     r3 , KK
        add     r3 , r3 , #2            // number of values in AO
        str     r3 , KK
  #endif

  _L1_M2_END:
        subs    I, I, #1
        bne     _L1_M2_20
  /*
   * Final 1x1 tile (odd row count in the trailing column): KERNEL1x1_SUB
   * eight steps per loop pass plus a K1 % 8 remainder loop, stored through
   * SAVE1x1. No pointer or KK update follows because nothing is processed
   * after this tile.
   *
   * Branches after tst/ands use beq and the shifted trip count is compared
   * explicitly: the former "ble" depended on the V flag, which tst, ands
   * and asrs do not write.
   */
  _L1_M1_BEGIN:
        ldr     I, M
        tst     I, #1                   // I = I % 2
        beq     _L1_END

  _L1_M1_20:
        INIT1x1

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
        mov     BO, BC
  #else
        mov     BO, BC
        ldr     r3 , KK
        lsls    r4 , r3 , #3            // 1 * 4 * 2 float values
        add     BO , BO , r4
        lsls    r4 , r3 , #3            // 1 * 4 * 2 float values
        add     AO , AO , r4
  #endif

  #ifndef TRMMKERNEL
        ldr     K1, K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        ldr     K1, K
        ldr     r3, KK
        sub     K1, K1, r3
        str     K1, KKK
  #else
        ldr     K1, KK
  #ifdef LEFT
        add     K1, K1, #1              // number of values in AO
  #else
        add     K1, K1, #1              // number of values in BO
  #endif
        str     K1, KKK
  #endif

        asr     L , K1, #3              // L = K1 / 8
        cmp     L , #0
        ble     _L1_M1_40

  _L1_M1_22:
  .rept 8
        KERNEL1x1_SUB
  .endr
        subs    L, L, #1
        bgt     _L1_M1_22

  _L1_M1_40:
        ands    L , K1, #7              // L = K1 % 8
        beq     _L1_M1_100

  _L1_M1_42:
        KERNEL1x1_SUB
        subs    L, L, #1
        bgt     _L1_M1_42

  _L1_M1_100:
        SAVE1x1
  /*
   * Common exit: return 0 in r0, reload s8-s15 from the save area at
   * [fp, #-128], unwind the frame and pop the registers pushed on entry.
   */
  _L1_END:
  _L999:
        movs    r0, #0                  // set return value
        sub     r3, fp, #128
        vldm    r3, { s8 - s15 }        // restore floating point registers
        sub     sp, fp, #24             // back to the pushed registers
        pop     {r4 - r9, fp}
        bx      lr
  EPILOGUE