You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, may include dashes ('-'), and can be up to 35 characters long.

ctrmm_kernel_2x2_vfp.S 24 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/10/16 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. **************************************************************************************/
  34. #define ASSEMBLER
  35. #include "common.h"
  // Size in bytes of the local frame that the prologue subtracts from sp.
  36. #define STACKSIZE 256
  // Incoming arguments as they are read at entry. r0-r2 are reused below as the
  // loop counters I, J and L, so the prologue spills them to the frame first.
  37. #define OLD_M r0
  38. #define OLD_N r1
  39. #define OLD_K r2
  40. #define OLD_A r3
  41. #define OLD_ALPHA_R s0
  42. #define OLD_ALPHA_I s1
  43. /******************************************************
  44. * [fp, #-128] - [fp, #-64] is reserved
  45. * for store and restore of floating point
  46. * registers
  47. *******************************************************/
  // Frame slots addressed relative to fp (negative offsets lie inside the local frame).
  // KK and KKK hold the TRMM offset bookkeeping used by the driver loops.
  48. #define KKK [fp, #-240]
  49. #define KK [fp, #-244 ]
  50. #define A [fp, #-248 ]
  51. #define LDC [fp, #-252 ]
  52. #define M [fp, #-256 ]
  53. #define N [fp, #-260 ]
  54. #define K [fp, #-264 ]
  // Two words that the prologue fills with zero; FP_ZERO and FP_ZERO_0 name the same slot.
  55. #define FP_ZERO [fp, #-232]
  56. #define FP_ZERO_0 [fp, #-232]
  57. #define FP_ZERO_1 [fp, #-228]
  // Spilled copies of alpha (imaginary and real part).
  58. #define ALPHA_I [fp, #-272]
  59. #define ALPHA_R [fp, #-280]
  // Positive offsets: words located above the registers pushed by the prologue,
  // i.e. values the caller placed on the stack.
  60. #define B [fp, #4 ]
  61. #define C [fp, #8 ]
  62. #define OLD_LDC [fp, #12 ]
  63. #define OFFSET [fp, #16 ]
  // Loop counters (alias r0-r2, the same registers as OLD_M/OLD_N/OLD_K).
  64. #define I r0
  65. #define J r1
  66. #define L r2
  // Working pointers: AO/BO walk the A and B panels, CO1/CO2 address two output columns of C.
  67. #define AO r5
  68. #define BO r6
  69. #define CO1 r8
  70. #define CO2 r9
  // K1 = k-steps for the current tile; BC = base of the current B panel.
  71. #define K1 r7
  72. #define BC r12
  // Prefetch distances in bytes for pld. C_PRE is not referenced in the code visible in this file.
  73. #define A_PRE 96
  74. #define B_PRE 96
  75. #define C_PRE 64
  76. /**************************************************************************************
  77. * Macro definitions
  78. **************************************************************************************/
  // Select the multiply-accumulate flavour for each conjugation variant.
  // fmacs adds the product to the destination register, fnmacs subtracts it.
  //   KMAC_R / KMAC_I : applied in the KERNEL macros to the (imag(A) * B) terms of the
  //                     real / imaginary accumulators.
  //   FMAC_R1, FMAC_R2, FMAC_I1, FMAC_I2 : applied in the SAVE macros when scaling the
  //                     accumulators by alpha.
  // First branch: accumulate a*b, store alpha*acc.
  79. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  80. #define KMAC_R fnmacs
  81. #define KMAC_I fmacs
  82. #define FMAC_R1 fmacs
  83. #define FMAC_R2 fnmacs
  84. #define FMAC_I1 fmacs
  85. #define FMAC_I2 fmacs
  // Second branch: accumulate conj(a)*b (signs of both cross terms flipped), store alpha*acc.
  86. #elif defined(CN) || defined(CT)
  87. #define KMAC_R fmacs
  88. #define KMAC_I fnmacs
  89. #define FMAC_R1 fmacs
  90. #define FMAC_R2 fnmacs
  91. #define FMAC_I1 fmacs
  92. #define FMAC_I2 fmacs
  // Third branch: same accumulation as the second branch, but the SAVE step negates the
  // imaginary accumulator while scaling (alpha * conj(acc)).
  93. #elif defined(NC) || defined(TC)
  94. #define KMAC_R fmacs
  95. #define KMAC_I fnmacs
  96. #define FMAC_R1 fmacs
  97. #define FMAC_R2 fmacs
  98. #define FMAC_I1 fnmacs
  99. #define FMAC_I2 fmacs
  // Fallback (none of the names above defined): accumulate a*b, store alpha * conj(acc).
  100. #else
  101. #define KMAC_R fnmacs
  102. #define KMAC_I fmacs
  103. #define FMAC_R1 fmacs
  104. #define FMAC_R2 fmacs
  105. #define FMAC_I1 fnmacs
  106. #define FMAC_I2 fmacs
  107. #endif
  // INIT2x2: clear the eight accumulators s8-s15 used by the 2x2 kernels.
  // The zero comes from the FP_ZERO frame slot, which the prologue fills with 0.
  108. .macro INIT2x2
  109. flds s8 , FP_ZERO // s8 = all-zero bits (+0.0f)
  110. vmov.f32 s9 , s8
  111. vmov.f32 s10, s8
  112. vmov.f32 s11, s8
  113. vmov.f32 s12, s8
  114. vmov.f32 s13, s8
  115. vmov.f32 s14, s8
  116. vmov.f32 s15, s8
  117. .endm
  // KERNEL2x2_I: first k-step of an unrolled 2x2 run.
  // Loads four floats from AO into s0-s3 and four from BO into s4-s7 (both pointers are
  // post-incremented by 16 bytes) and starts the accumulators with fmuls, so s8-s15 need
  // no prior zeroing. Even registers are used as real parts, odd ones as imaginary parts.
  // Accumulator layout (re/im): s8/s9 = A0*B0, s10/s11 = A1*B0, s12/s13 = A0*B1, s14/s15 = A1*B1.
  // Clobbers s0-s7; prefetches ahead on both panels.
  118. .macro KERNEL2x2_I
  119. pld [ AO, #A_PRE ]
  120. fldmias AO!, { s0 - s3 }
  121. pld [ BO, #B_PRE ]
  122. fldmias BO!, { s4 - s7 }
  123. fmuls s8 , s0, s4
  124. fmuls s9 , s0, s5
  125. fmuls s10 , s2, s4
  126. fmuls s11 , s2, s5
  127. KMAC_R s8 , s1, s5 // imaginary*imaginary term of the real part
  128. KMAC_I s9 , s1, s4 // imaginary*real term of the imaginary part
  129. KMAC_R s10 , s3, s5
  130. KMAC_I s11 , s3, s4
  131. fmuls s12 , s0, s6
  132. fmuls s13 , s0, s7
  133. fmuls s14 , s2, s6
  134. fmuls s15 , s2, s7
  135. KMAC_R s12 , s1, s7
  136. KMAC_I s13 , s1, s6
  137. KMAC_R s14 , s3, s7
  138. KMAC_I s15 , s3, s6
  139. .endm
  // KERNEL2x2_M1: one accumulating k-step of the 2x2 tile, with prefetch.
  // Same loads and accumulator layout as KERNEL2x2_I, but every product is added to the
  // running sums in s8-s15 (fmacs / KMAC_*) instead of overwriting them.
  // Advances AO and BO by 16 bytes each; clobbers s0-s7.
  140. .macro KERNEL2x2_M1
  141. pld [ AO, #A_PRE ]
  142. fldmias AO!, { s0 - s3 }
  143. pld [ BO, #B_PRE ]
  144. fldmias BO!, { s4 - s7 }
  145. fmacs s8 , s0, s4
  146. fmacs s9 , s0, s5
  147. fmacs s10 , s2, s4
  148. fmacs s11 , s2, s5
  149. KMAC_R s8 , s1, s5
  150. KMAC_I s9 , s1, s4
  151. KMAC_R s10 , s3, s5
  152. KMAC_I s11 , s3, s4
  153. fmacs s12 , s0, s6
  154. fmacs s13 , s0, s7
  155. fmacs s14 , s2, s6
  156. fmacs s15 , s2, s7
  157. KMAC_R s12 , s1, s7
  158. KMAC_I s13 , s1, s6
  159. KMAC_R s14 , s3, s7
  160. KMAC_I s15 , s3, s6
  161. .endm
  // KERNEL2x2_M2: one accumulating k-step of the 2x2 tile, without prefetch.
  // Identical to KERNEL2x2_M1 except that no pld is issued; the driver alternates M1 and M2.
  // Advances AO and BO by 16 bytes each; clobbers s0-s7; accumulates into s8-s15.
  162. .macro KERNEL2x2_M2
  163. fldmias AO!, { s0 - s3 }
  164. fldmias BO!, { s4 - s7 }
  165. fmacs s8 , s0, s4
  166. fmacs s9 , s0, s5
  167. fmacs s10 , s2, s4
  168. fmacs s11 , s2, s5
  169. KMAC_R s8 , s1, s5
  170. KMAC_I s9 , s1, s4
  171. KMAC_R s10 , s3, s5
  172. KMAC_I s11 , s3, s4
  173. fmacs s12 , s0, s6
  174. fmacs s13 , s0, s7
  175. fmacs s14 , s2, s6
  176. fmacs s15 , s2, s7
  177. KMAC_R s12 , s1, s7
  178. KMAC_I s13 , s1, s6
  179. KMAC_R s14 , s3, s7
  180. KMAC_I s15 , s3, s6
  181. .endm
  // KERNEL2x2_E: closing k-step of an unrolled 2x2 run.
  // The instruction sequence is the same as KERNEL2x2_M2 (load, accumulate, no prefetch);
  // the separate name marks the end of a run in the driver.
  // Advances AO and BO by 16 bytes each; clobbers s0-s7; accumulates into s8-s15.
  182. .macro KERNEL2x2_E
  183. fldmias AO!, { s0 - s3 }
  184. fldmias BO!, { s4 - s7 }
  185. fmacs s8 , s0, s4
  186. fmacs s9 , s0, s5
  187. fmacs s10 , s2, s4
  188. fmacs s11 , s2, s5
  189. KMAC_R s8 , s1, s5
  190. KMAC_I s9 , s1, s4
  191. KMAC_R s10 , s3, s5
  192. KMAC_I s11 , s3, s4
  193. fmacs s12 , s0, s6
  194. fmacs s13 , s0, s7
  195. fmacs s14 , s2, s6
  196. fmacs s15 , s2, s7
  197. KMAC_R s12 , s1, s7
  198. KMAC_I s13 , s1, s6
  199. KMAC_R s14 , s3, s7
  200. KMAC_I s15 , s3, s6
  201. .endm
  // KERNEL2x2_SUB: single accumulating k-step of the 2x2 tile, used by the driver for the
  // K1 % 8 remainder loop. Requires s8-s15 to already hold valid sums (from INIT2x2 or a
  // preceding unrolled run). Same instruction sequence as KERNEL2x2_M2.
  // Advances AO and BO by 16 bytes each; clobbers s0-s7.
  202. .macro KERNEL2x2_SUB
  203. fldmias AO!, { s0 - s3 }
  204. fldmias BO!, { s4 - s7 }
  205. fmacs s8 , s0, s4
  206. fmacs s9 , s0, s5
  207. fmacs s10 , s2, s4
  208. fmacs s11 , s2, s5
  209. KMAC_R s8 , s1, s5
  210. KMAC_I s9 , s1, s4
  211. KMAC_R s10 , s3, s5
  212. KMAC_I s11 , s3, s4
  213. fmacs s12 , s0, s6
  214. fmacs s13 , s0, s7
  215. fmacs s14 , s2, s6
  216. fmacs s15 , s2, s7
  217. KMAC_R s12 , s1, s7
  218. KMAC_I s13 , s1, s6
  219. KMAC_R s14 , s3, s7
  220. KMAC_I s15 , s3, s6
  221. .endm
  // SAVE2x2: scale the 2x2 accumulators by alpha and write them to C.
  // s8-s11 go to the column at CO1, s12-s15 to the column at CO2 = CO1 + LDC (LDC is in bytes).
  // Each result is built from zero with the FMAC_* ops, so C is overwritten, not accumulated into.
  // Clobbers r3, CO2, s0, s1, s4-s7; advances CO1 by 16 bytes (two complex floats).
  222. .macro SAVE2x2
  223. ldr r3 , LDC
  224. add CO2 , CO1, r3
  225. flds s0, ALPHA_R
  226. flds s1, ALPHA_I
  227. flds s4, FP_ZERO
  228. vmov.f32 s5, s4
  229. vmov.f32 s6, s4
  230. vmov.f32 s7, s4
  231. FMAC_R1 s4 , s0 , s8 // alpha_r * acc_re
  232. FMAC_I1 s5 , s0 , s9 // alpha_r * acc_im
  233. FMAC_R2 s4 , s1 , s9 // alpha_i * acc_im
  234. FMAC_I2 s5 , s1 , s8 // alpha_i * acc_re
  235. FMAC_R1 s6 , s0 , s10
  236. FMAC_I1 s7 , s0 , s11
  237. FMAC_R2 s6 , s1 , s11
  238. FMAC_I2 s7 , s1 , s10
  239. fstmias CO1, { s4 - s7 } // no writeback: CO1 is advanced explicitly at the end
  240. flds s4, FP_ZERO
  241. vmov.f32 s5, s4
  242. vmov.f32 s6, s4
  243. vmov.f32 s7, s4
  244. FMAC_R1 s4 , s0 , s12
  245. FMAC_I1 s5 , s0 , s13
  246. FMAC_R2 s4 , s1 , s13
  247. FMAC_I2 s5 , s1 , s12
  248. FMAC_R1 s6 , s0 , s14
  249. FMAC_I1 s7 , s0 , s15
  250. FMAC_R2 s6 , s1 , s15
  251. FMAC_I2 s7 , s1 , s14
  252. fstmias CO2, { s4 - s7 }
  253. add CO1, CO1, #16
  254. .endm
  255. /******************************************************************************/
  // INIT1x2: clear the four accumulators of the 1x2 tile (s8/s9 and s12/s13)
  // using the zero stored in the FP_ZERO frame slot.
  256. .macro INIT1x2
  257. flds s8 , FP_ZERO
  258. vmov.f32 s9 , s8
  259. vmov.f32 s12, s8
  260. vmov.f32 s13, s8
  261. .endm
  // KERNEL1x2_I: initialising k-step of the 1x2 tile (one complex A value, two complex B values).
  // Starts s8/s9 = A0*B0 and s12/s13 = A0*B1 with fmuls, then advances BO by 16 and AO by 8 bytes.
  // Clobbers s0, s1, s4-s7. Not referenced by the driver loops visible in this file.
  262. .macro KERNEL1x2_I
  263. flds s0 , [ AO ]
  264. flds s1 , [ AO, #4 ]
  265. flds s4 , [ BO ]
  266. flds s5 , [ BO, #4 ]
  267. flds s6 , [ BO, #8 ]
  268. flds s7 , [ BO, #12 ]
  269. fmuls s8 , s0, s4
  270. KMAC_R s8 , s1, s5
  271. fmuls s9 , s0, s5
  272. KMAC_I s9 , s1, s4
  273. fmuls s12 , s0, s6
  274. KMAC_R s12 , s1, s7
  275. fmuls s13 , s0, s7
  276. KMAC_I s13 , s1, s6
  277. add BO , BO, #16
  278. add AO , AO, #8
  279. .endm
  // KERNEL1x2_M1: accumulating k-step of the 1x2 tile.
  // Adds A0*B0 into s8/s9 and A0*B1 into s12/s13, then advances BO by 16 and AO by 8 bytes.
  // Clobbers s0, s1, s4-s7. Not referenced by the driver loops visible in this file.
  280. .macro KERNEL1x2_M1
  281. flds s0 , [ AO ]
  282. flds s1 , [ AO, #4 ]
  283. flds s4 , [ BO ]
  284. flds s5 , [ BO, #4 ]
  285. flds s6 , [ BO, #8 ]
  286. flds s7 , [ BO, #12 ]
  287. fmacs s8 , s0, s4
  288. KMAC_R s8 , s1, s5
  289. fmacs s9 , s0, s5
  290. KMAC_I s9 , s1, s4
  291. fmacs s12 , s0, s6
  292. KMAC_R s12 , s1, s7
  293. fmacs s13 , s0, s7
  294. KMAC_I s13 , s1, s6
  295. add BO , BO, #16
  296. add AO , AO, #8
  297. .endm
  // KERNEL1x2_M2: accumulating k-step of the 1x2 tile; same instruction sequence as KERNEL1x2_M1.
  // Adds A0*B0 into s8/s9 and A0*B1 into s12/s13, then advances BO by 16 and AO by 8 bytes.
  // Clobbers s0, s1, s4-s7. Not referenced by the driver loops visible in this file.
  298. .macro KERNEL1x2_M2
  299. flds s0 , [ AO ]
  300. flds s1 , [ AO, #4 ]
  301. flds s4 , [ BO ]
  302. flds s5 , [ BO, #4 ]
  303. flds s6 , [ BO, #8 ]
  304. flds s7 , [ BO, #12 ]
  305. fmacs s8 , s0, s4
  306. KMAC_R s8 , s1, s5
  307. fmacs s9 , s0, s5
  308. KMAC_I s9 , s1, s4
  309. fmacs s12 , s0, s6
  310. KMAC_R s12 , s1, s7
  311. fmacs s13 , s0, s7
  312. KMAC_I s13 , s1, s6
  313. add BO , BO, #16
  314. add AO , AO, #8
  315. .endm
  // KERNEL1x2_E: closing k-step variant of the 1x2 tile; same instruction sequence as KERNEL1x2_M1.
  // Adds A0*B0 into s8/s9 and A0*B1 into s12/s13, then advances BO by 16 and AO by 8 bytes.
  // Clobbers s0, s1, s4-s7. Not referenced by the driver loops visible in this file.
  316. .macro KERNEL1x2_E
  317. flds s0 , [ AO ]
  318. flds s1 , [ AO, #4 ]
  319. flds s4 , [ BO ]
  320. flds s5 , [ BO, #4 ]
  321. flds s6 , [ BO, #8 ]
  322. flds s7 , [ BO, #12 ]
  323. fmacs s8 , s0, s4
  324. KMAC_R s8 , s1, s5
  325. fmacs s9 , s0, s5
  326. KMAC_I s9 , s1, s4
  327. fmacs s12 , s0, s6
  328. KMAC_R s12 , s1, s7
  329. fmacs s13 , s0, s7
  330. KMAC_I s13 , s1, s6
  331. add BO , BO, #16
  332. add AO , AO, #8
  333. .endm
  // KERNEL1x2_SUB: the k-step the driver actually uses for the 1x2 tile (both the
  // unrolled-by-8 loop and the remainder loop). Expects s8/s9/s12/s13 to be initialised
  // (INIT1x2). Adds A0*B0 into s8/s9 and A0*B1 into s12/s13.
  // Advances BO by 16 and AO by 8 bytes; clobbers s0, s1, s4-s7.
  334. .macro KERNEL1x2_SUB
  335. flds s0 , [ AO ]
  336. flds s1 , [ AO, #4 ]
  337. flds s4 , [ BO ]
  338. flds s5 , [ BO, #4 ]
  339. flds s6 , [ BO, #8 ]
  340. flds s7 , [ BO, #12 ]
  341. fmacs s8 , s0, s4
  342. KMAC_R s8 , s1, s5
  343. fmacs s9 , s0, s5
  344. KMAC_I s9 , s1, s4
  345. fmacs s12 , s0, s6
  346. KMAC_R s12 , s1, s7
  347. fmacs s13 , s0, s7
  348. KMAC_I s13 , s1, s6
  349. add BO , BO, #16
  350. add AO , AO, #8
  351. .endm
  // SAVE1x2: scale the 1x2 accumulators by alpha and write one complex value to each of
  // the two columns: s8/s9 to CO1 and s12/s13 to CO2 = CO1 + LDC (LDC is in bytes).
  // Results are built from zero, so C is overwritten rather than accumulated into.
  // Clobbers r3, CO2, s0, s1, s4, s5; advances CO1 by 8 bytes (one complex float).
  352. .macro SAVE1x2
  353. ldr r3 , LDC
  354. add CO2 , CO1, r3
  355. flds s0, ALPHA_R
  356. flds s1, ALPHA_I
  357. flds s4, FP_ZERO
  358. vmov.f32 s5, s4
  359. FMAC_R1 s4 , s0 , s8
  360. FMAC_I1 s5 , s0 , s9
  361. FMAC_R2 s4 , s1 , s9
  362. FMAC_I2 s5 , s1 , s8
  363. fstmias CO1, { s4 - s5 }
  364. flds s4, FP_ZERO
  365. vmov.f32 s5, s4
  366. FMAC_R1 s4 , s0 , s12
  367. FMAC_I1 s5 , s0 , s13
  368. FMAC_R2 s4 , s1 , s13
  369. FMAC_I2 s5 , s1 , s12
  370. fstmias CO2, { s4 - s5 }
  371. add CO1, CO1, #8
  372. .endm
  373. /******************************************************************************/
  // INIT2x1: clear the four accumulators of the 2x1 tile (s8-s11)
  // using the zero stored in the FP_ZERO frame slot.
  374. .macro INIT2x1
  375. flds s8 , FP_ZERO
  376. vmov.f32 s9 , s8
  377. vmov.f32 s10, s8
  378. vmov.f32 s11, s8
  379. .endm
  // KERNEL2x1_I: first k-step of an unrolled 2x1 run (two complex A values, one complex B value).
  // Starts s8/s9 = A0*B0 and s10/s11 = A1*B0 with fmuls, so no INIT2x1 is needed beforehand.
  // Advances BO by 8 and AO by 16 bytes; clobbers s0-s5.
  380. .macro KERNEL2x1_I
  381. flds s0 , [ AO ]
  382. flds s1 , [ AO, #4 ]
  383. flds s2 , [ AO, #8 ]
  384. flds s3 , [ AO, #12 ]
  385. flds s4 , [ BO ]
  386. flds s5 , [ BO, #4 ]
  387. fmuls s8 , s0, s4
  388. KMAC_R s8 , s1, s5
  389. fmuls s9 , s0, s5
  390. KMAC_I s9 , s1, s4
  391. fmuls s10 , s2, s4
  392. KMAC_R s10 , s3, s5
  393. fmuls s11 , s2, s5
  394. KMAC_I s11 , s3, s4
  395. add BO , BO, #8
  396. add AO , AO, #16
  397. .endm
  // KERNEL2x1_M1: accumulating k-step of the 2x1 tile.
  // Adds A0*B0 into s8/s9 and A1*B0 into s10/s11.
  // Advances BO by 8 and AO by 16 bytes; clobbers s0-s5.
  398. .macro KERNEL2x1_M1
  399. flds s0 , [ AO ]
  400. flds s1 , [ AO, #4 ]
  401. flds s2 , [ AO, #8 ]
  402. flds s3 , [ AO, #12 ]
  403. flds s4 , [ BO ]
  404. flds s5 , [ BO, #4 ]
  405. fmacs s8 , s0, s4
  406. KMAC_R s8 , s1, s5
  407. fmacs s9 , s0, s5
  408. KMAC_I s9 , s1, s4
  409. fmacs s10 , s2, s4
  410. KMAC_R s10 , s3, s5
  411. fmacs s11 , s2, s5
  412. KMAC_I s11 , s3, s4
  413. add BO , BO, #8
  414. add AO , AO, #16
  415. .endm
  // KERNEL2x1_M2: accumulating k-step of the 2x1 tile; same instruction sequence as
  // KERNEL2x1_M1 (the driver alternates the two names).
  // Adds A0*B0 into s8/s9 and A1*B0 into s10/s11.
  // Advances BO by 8 and AO by 16 bytes; clobbers s0-s5.
  416. .macro KERNEL2x1_M2
  417. flds s0 , [ AO ]
  418. flds s1 , [ AO, #4 ]
  419. flds s2 , [ AO, #8 ]
  420. flds s3 , [ AO, #12 ]
  421. flds s4 , [ BO ]
  422. flds s5 , [ BO, #4 ]
  423. fmacs s8 , s0, s4
  424. KMAC_R s8 , s1, s5
  425. fmacs s9 , s0, s5
  426. KMAC_I s9 , s1, s4
  427. fmacs s10 , s2, s4
  428. KMAC_R s10 , s3, s5
  429. fmacs s11 , s2, s5
  430. KMAC_I s11 , s3, s4
  431. add BO , BO, #8
  432. add AO , AO, #16
  433. .endm
  // KERNEL2x1_E: closing k-step of an unrolled 2x1 run; same instruction sequence as KERNEL2x1_M1.
  // Adds A0*B0 into s8/s9 and A1*B0 into s10/s11.
  // Advances BO by 8 and AO by 16 bytes; clobbers s0-s5.
  434. .macro KERNEL2x1_E
  435. flds s0 , [ AO ]
  436. flds s1 , [ AO, #4 ]
  437. flds s2 , [ AO, #8 ]
  438. flds s3 , [ AO, #12 ]
  439. flds s4 , [ BO ]
  440. flds s5 , [ BO, #4 ]
  441. fmacs s8 , s0, s4
  442. KMAC_R s8 , s1, s5
  443. fmacs s9 , s0, s5
  444. KMAC_I s9 , s1, s4
  445. fmacs s10 , s2, s4
  446. KMAC_R s10 , s3, s5
  447. fmacs s11 , s2, s5
  448. KMAC_I s11 , s3, s4
  449. add BO , BO, #8
  450. add AO , AO, #16
  451. .endm
  // KERNEL2x1_SUB: single accumulating k-step of the 2x1 tile, used by the driver for the
  // K1 % 8 remainder loop. Expects s8-s11 to hold valid sums (INIT2x1 or a preceding run).
  // Advances BO by 8 and AO by 16 bytes; clobbers s0-s5.
  452. .macro KERNEL2x1_SUB
  453. flds s0 , [ AO ]
  454. flds s1 , [ AO, #4 ]
  455. flds s2 , [ AO, #8 ]
  456. flds s3 , [ AO, #12 ]
  457. flds s4 , [ BO ]
  458. flds s5 , [ BO, #4 ]
  459. fmacs s8 , s0, s4
  460. KMAC_R s8 , s1, s5
  461. fmacs s9 , s0, s5
  462. KMAC_I s9 , s1, s4
  463. fmacs s10 , s2, s4
  464. KMAC_R s10 , s3, s5
  465. fmacs s11 , s2, s5
  466. KMAC_I s11 , s3, s4
  467. add BO , BO, #8
  468. add AO , AO, #16
  469. .endm
  // SAVE2x1: scale the 2x1 accumulators (s8-s11) by alpha and store two complex values
  // contiguously at CO1. Results are built from zero, so C is overwritten.
  // Clobbers s0, s1, s4-s7; advances CO1 by 16 bytes.
  470. .macro SAVE2x1
  471. flds s0, ALPHA_R
  472. flds s1, ALPHA_I
  473. flds s4, FP_ZERO
  474. vmov.f32 s5, s4
  475. vmov.f32 s6, s4
  476. vmov.f32 s7, s4
  477. FMAC_R1 s4 , s0 , s8
  478. FMAC_I1 s5 , s0 , s9
  479. FMAC_R2 s4 , s1 , s9
  480. FMAC_I2 s5 , s1 , s8
  481. FMAC_R1 s6 , s0 , s10
  482. FMAC_I1 s7 , s0 , s11
  483. FMAC_R2 s6 , s1 , s11
  484. FMAC_I2 s7 , s1 , s10
  485. fstmias CO1, { s4 - s7 }
  486. add CO1, CO1, #16
  487. .endm
  488. /******************************************************************************/
  // INIT1x1: clear the two accumulators of the 1x1 tile (s8 = real, s9 = imaginary sum)
  // using the zero stored in the FP_ZERO frame slot.
  489. .macro INIT1x1
  490. flds s8 , FP_ZERO
  491. vmov.f32 s9 , s8
  492. .endm
  // KERNEL1x1_I: initialising k-step of the 1x1 tile; starts s8/s9 = A0*B0 with fmuls.
  // Advances AO and BO by 8 bytes each; clobbers s0, s1, s4, s5.
  // Not referenced by the driver loops visible in this file.
  493. .macro KERNEL1x1_I
  494. flds s0 , [ AO ]
  495. flds s1 , [ AO, #4 ]
  496. flds s4 , [ BO ]
  497. flds s5 , [ BO, #4 ]
  498. fmuls s8 , s0, s4
  499. KMAC_R s8 , s1, s5
  500. fmuls s9 , s0, s5
  501. KMAC_I s9 , s1, s4
  502. add BO , BO, #8
  503. add AO , AO, #8
  504. .endm
  // KERNEL1x1_M1: accumulating k-step of the 1x1 tile; adds A0*B0 into s8/s9.
  // Advances AO and BO by 8 bytes each; clobbers s0, s1, s4, s5.
  // Not referenced by the driver loops visible in this file.
  505. .macro KERNEL1x1_M1
  506. flds s0 , [ AO ]
  507. flds s1 , [ AO, #4 ]
  508. flds s4 , [ BO ]
  509. flds s5 , [ BO, #4 ]
  510. fmacs s8 , s0, s4
  511. KMAC_R s8 , s1, s5
  512. fmacs s9 , s0, s5
  513. KMAC_I s9 , s1, s4
  514. add BO , BO, #8
  515. add AO , AO, #8
  516. .endm
  // KERNEL1x1_M2: accumulating k-step of the 1x1 tile; same instruction sequence as KERNEL1x1_M1.
  // Advances AO and BO by 8 bytes each; clobbers s0, s1, s4, s5.
  // Not referenced by the driver loops visible in this file.
  517. .macro KERNEL1x1_M2
  518. flds s0 , [ AO ]
  519. flds s1 , [ AO, #4 ]
  520. flds s4 , [ BO ]
  521. flds s5 , [ BO, #4 ]
  522. fmacs s8 , s0, s4
  523. KMAC_R s8 , s1, s5
  524. fmacs s9 , s0, s5
  525. KMAC_I s9 , s1, s4
  526. add BO , BO, #8
  527. add AO , AO, #8
  528. .endm
  // KERNEL1x1_E: closing k-step variant of the 1x1 tile; same instruction sequence as KERNEL1x1_M1.
  // Advances AO and BO by 8 bytes each; clobbers s0, s1, s4, s5.
  // Not referenced by the driver loops visible in this file.
  529. .macro KERNEL1x1_E
  530. flds s0 , [ AO ]
  531. flds s1 , [ AO, #4 ]
  532. flds s4 , [ BO ]
  533. flds s5 , [ BO, #4 ]
  534. fmacs s8 , s0, s4
  535. KMAC_R s8 , s1, s5
  536. fmacs s9 , s0, s5
  537. KMAC_I s9 , s1, s4
  538. add BO , BO, #8
  539. add AO , AO, #8
  540. .endm
  // KERNEL1x1_SUB: the k-step the driver uses for the 1x1 tile (unrolled-by-8 loop and
  // remainder loop). Expects s8/s9 to be initialised (INIT1x1); adds A0*B0 into s8/s9.
  // Advances AO and BO by 8 bytes each; clobbers s0, s1, s4, s5.
  541. .macro KERNEL1x1_SUB
  542. flds s0 , [ AO ]
  543. flds s1 , [ AO, #4 ]
  544. flds s4 , [ BO ]
  545. flds s5 , [ BO, #4 ]
  546. fmacs s8 , s0, s4
  547. KMAC_R s8 , s1, s5
  548. fmacs s9 , s0, s5
  549. KMAC_I s9 , s1, s4
  550. add BO , BO, #8
  551. add AO , AO, #8
  552. .endm
  // SAVE1x1: scale the single complex accumulator (s8/s9) by alpha and store it at CO1.
  // The result is built from zero, so C is overwritten.
  // Clobbers s0, s1, s4, s5; advances CO1 by 8 bytes.
  553. .macro SAVE1x1
  554. flds s0, ALPHA_R
  555. flds s1, ALPHA_I
  556. flds s4, FP_ZERO
  557. vmov.f32 s5, s4
  558. FMAC_R1 s4 , s0 , s8
  559. FMAC_I1 s5 , s0 , s9
  560. FMAC_R2 s4 , s1 , s9
  561. FMAC_I2 s5 , s1 , s8
  562. fstmias CO1, { s4 - s5 }
  563. add CO1, CO1, #8
  564. .endm
  565. /**************************************************************************************
  566. * End of macro definitions
  567. **************************************************************************************/
  // Function entry. PROLOGUE is a macro defined outside this file (presumably it emits the
  // symbol/label for the kernel - confirm in common.h).
  // Frame layout after this block:
  //   [fp, #4 ..]    caller-provided stack words (B, C, OLD_LDC, OFFSET)
  //   [fp-24 .. fp]  saved r4-r9 and fp
  //   below that     STACKSIZE bytes of locals (M, N, K, A, LDC, KK, KKK, alpha, zero words,
  //                  and the save area for s8-s15 at fp-128)
  // On exit from this block: BC = B, J = N / 2, and control falls into _L2_BEGIN when
  // J > 0 or jumps to _L1_BEGIN otherwise.
  568. PROLOGUE
  569. .align 5
  570. push {r4 - r9, fp}
  571. add fp, sp, #24 // fp -> saved fp slot; caller stack words start at fp+4
  572. sub sp, sp, #STACKSIZE // reserve stack
  // Spill the register arguments: r0-r2 are reused as I/J/L and r3 as scratch.
  573. str OLD_M, M
  574. str OLD_N, N
  575. str OLD_K, K
  576. str OLD_A, A
  577. vstr OLD_ALPHA_R, ALPHA_R
  578. vstr OLD_ALPHA_I, ALPHA_I
  579. sub r3, fp, #128
  580. vstm r3, { s8 - s15} // store floating point registers
  // Two zero words in the frame; FP_ZERO is the source of 0.0f for the INIT/SAVE macros.
  581. movs r4, #0
  582. str r4, FP_ZERO
  583. str r4, FP_ZERO_1
  584. ldr r3, OLD_LDC
  585. lsl r3, r3, #3 // ldc = ldc * 4 * 2 (bytes per complex float)
  586. str r3, LDC
  // KK starts as OFFSET, negated when LEFT is not defined.
  587. ldr r3, OFFSET
  588. #ifndef LEFT
  589. neg r3 , r3
  590. #endif
  591. str r3 , KK
  592. ldr BC, B
  593. ldr J, N
  594. asrs J, J, #1 // J = N / 2
  // ble tests Z==1 || N!=V, but asrs does not write V and nothing above sets it, so the
  // branch would otherwise depend on whatever V the caller left behind. Compare explicitly
  // so that N, Z and V all describe J (and V is clear for the flag tests that follow).
  cmp J, #0
  595. ble _L1_BEGIN
  // Start of one iteration over a pair of C columns (J counts the pairs).
  // CO1 takes the current C pointer, and the saved C is advanced by two columns (2 * LDC
  // bytes) for the next iteration. AO restarts at the beginning of the A panel.
  596. _L2_BEGIN:
  597. ldr CO1, C // CO1 = C
  598. ldr r4 , LDC
  599. lsl r4 , r4 , #1 // LDC * 2
  600. add r3 , r4, CO1
  601. str r3 , C // C += 2 columns, used by the next pair
  // With LEFT defined, KK is reset to OFFSET for every column pair.
  602. #if defined(LEFT)
  603. ldr r3 , OFFSET
  604. str r3 , KK
  605. #endif
  606. ldr AO, A // AO = A
  607. pld [AO , #A_PRE-64]
  608. pld [AO , #A_PRE-32]
  // 2x2 tiles of the current column pair: the loop at _L2_M2_20 runs I = M / 2 times.
  609. _L2_M2_BEGIN:
  610. ldr I, M
  611. asrs I, I, #1 // I = M / 2
  // NOTE(review): asrs updates N and Z but not V, so this ble also depends on the V flag
  // left by the most recent cmp/subs.
  612. ble _L2_M1_BEGIN
  613. _L2_M2_20:
  // Position BO at the B panel; the #else branch additionally skips the first KK k-steps
  // of both panels (16 bytes per k-step each).
  614. #if (defined(LEFT) && defined(TRANSA)) || \
  615. (!defined(LEFT) && !defined(TRANSA))
  616. mov BO, BC
  617. #else
  618. mov BO, BC
  619. ldr r3 , KK
  620. lsls r4 , r3 , #4 // KK * 16 bytes: two complex floats of B per k-step
  621. add BO , BO , r4
  622. lsls r4 , r3 , #4 // KK * 16 bytes: two complex floats of A per k-step
  623. add AO , AO , r4
  624. #endif
  // K1 = number of k-steps for this tile: K, K - KK, or KK + 2 depending on the build flags.
  // NOTE(review): KKK is written only in the TRMMKERNEL branches, yet it is read after
  // SAVE2x2 independently of TRMMKERNEL - presumably this file is always built with
  // TRMMKERNEL defined; confirm.
  625. #ifndef TRMMKERNEL
  626. ldr K1, K
  627. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  628. ldr K1, K
  629. ldr r3, KK
  630. sub K1, K1, r3
  631. str K1, KKK
  632. #else
  633. ldr K1, KK
  634. #ifdef LEFT
  635. add K1, K1, #2 // number of values in AO
  636. #else
  637. add K1, K1, #2 // number of values in BO
  638. #endif
  639. str K1, KKK
  640. #endif
  641. asrs L , K1, #3 // L = K1 / 8 (blocks of eight k-steps)
  642. cmp L , #3
  643. blt _L2_M2_30
  // L >= 3: software-pipelined run. 8 steps here, 8 * (L - 2) in the loop, 8 in the tail.
  644. .align 5
  645. KERNEL2x2_I
  646. KERNEL2x2_M2
  647. KERNEL2x2_M1
  648. KERNEL2x2_M2
  649. KERNEL2x2_M1
  650. KERNEL2x2_M2
  651. KERNEL2x2_M1
  652. KERNEL2x2_M2
  653. sub L, L, #2
  654. _L2_M2_22:
  655. KERNEL2x2_M1
  656. KERNEL2x2_M2
  657. KERNEL2x2_M1
  658. KERNEL2x2_M2
  659. KERNEL2x2_M1
  660. KERNEL2x2_M2
  661. KERNEL2x2_M1
  662. KERNEL2x2_M2
  663. subs L, L, #1
  664. bgt _L2_M2_22
  665. KERNEL2x2_M1
  666. KERNEL2x2_M2
  667. KERNEL2x2_M1
  668. KERNEL2x2_M2
  669. KERNEL2x2_M1
  670. KERNEL2x2_M2
  671. KERNEL2x2_M1
  672. KERNEL2x2_E
  673. b _L2_M2_44
  // Reached when L < 3. Bit 1 set -> sixteen straight-line k-steps; otherwise bit 0 set ->
  // eight k-steps; neither -> only zero the accumulators.
  674. _L2_M2_30:
  675. tst L, #3
  676. ble _L2_M2_40
  677. tst L, #2
  678. ble _L2_M2_32
  679. KERNEL2x2_I
  680. KERNEL2x2_M2
  681. KERNEL2x2_M1
  682. KERNEL2x2_M2
  683. KERNEL2x2_M1
  684. KERNEL2x2_M2
  685. KERNEL2x2_M1
  686. KERNEL2x2_M2
  687. KERNEL2x2_M1
  688. KERNEL2x2_M2
  689. KERNEL2x2_M1
  690. KERNEL2x2_M2
  691. KERNEL2x2_M1
  692. KERNEL2x2_M2
  693. KERNEL2x2_M1
  694. KERNEL2x2_E
  695. b _L2_M2_44
  696. _L2_M2_32:
  697. tst L, #1
  698. ble _L2_M2_40
  699. KERNEL2x2_I
  700. KERNEL2x2_M2
  701. KERNEL2x2_M1
  702. KERNEL2x2_M2
  703. KERNEL2x2_M1
  704. KERNEL2x2_M2
  705. KERNEL2x2_M1
  706. KERNEL2x2_E
  707. b _L2_M2_44
  // No unrolled block ran, so the accumulators must be cleared explicitly.
  708. _L2_M2_40:
  709. INIT2x2
  // Remainder: K1 % 8 single k-steps.
  710. _L2_M2_44:
  711. ands L , K1, #7 // L = K1 % 8
  712. ble _L2_M2_100
  713. _L2_M2_46:
  714. KERNEL2x2_SUB
  715. subs L, L, #1
  716. bne _L2_M2_46
  717. _L2_M2_100:
  718. SAVE2x2
  // Skip the K - KKK k-steps of both panels that this tile did not consume.
  719. #if (defined(LEFT) && defined(TRANSA)) || \
  720. (!defined(LEFT) && !defined(TRANSA))
  721. ldr r3 , K
  722. ldr r4 , KKK
  723. sub r3 , r3 , r4
  724. lsls r4 , r3 , #4 // (K - KKK) * 16 bytes of B
  725. add BO , BO , r4
  726. lsls r4 , r3 , #4 // (K - KKK) * 16 bytes of A
  727. add AO , AO , r4
  728. #endif
  729. #if defined(LEFT)
  730. ldr r3 , KK
  731. add r3 , r3 , #2 // number of values in AO
  732. str r3 , KK
  733. #endif
  734. _L2_M2_END:
  735. subs I, I, #1
  736. bne _L2_M2_20
  // Leftover row of the current column pair: one 1x2 tile, executed only when M is odd.
  737. _L2_M1_BEGIN:
  738. ldr I, M
  739. tst I, #1 // test bit 0 of M
  740. ble _L2_END
  741. _L2_M1_20:
  742. INIT1x2
  // Position BO at the B panel; the #else branch skips the first KK k-steps
  // (16 bytes per step in B, 8 bytes per step in A).
  743. #if (defined(LEFT) && defined(TRANSA)) || \
  744. (!defined(LEFT) && !defined(TRANSA))
  745. mov BO, BC
  746. #else
  747. mov BO, BC
  748. ldr r3 , KK
  749. lsls r4 , r3 , #4 // KK * 16 bytes: two complex floats of B per k-step
  750. add BO , BO , r4
  751. lsls r4 , r3 , #3 // KK * 8 bytes: one complex float of A per k-step
  752. add AO , AO , r4
  753. #endif
  // K1 = number of k-steps: K, K - KK, or KK + 1 (LEFT) / KK + 2 (not LEFT).
  754. #ifndef TRMMKERNEL
  755. ldr K1, K
  756. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  757. ldr K1, K
  758. ldr r3, KK
  759. sub K1, K1, r3
  760. str K1, KKK
  761. #else
  762. ldr K1, KK
  763. #ifdef LEFT
  764. add K1, K1, #1 // number of values in AO
  765. #else
  766. add K1, K1, #2 // number of values in BO
  767. #endif
  768. str K1, KKK
  769. #endif
  770. asrs L , K1, #3 // L = K1 / 8
  771. ble _L2_M1_40
  // Eight k-steps per iteration.
  772. _L2_M1_22:
  773. KERNEL1x2_SUB
  774. KERNEL1x2_SUB
  775. KERNEL1x2_SUB
  776. KERNEL1x2_SUB
  777. KERNEL1x2_SUB
  778. KERNEL1x2_SUB
  779. KERNEL1x2_SUB
  780. KERNEL1x2_SUB
  781. subs L, L, #1
  782. bgt _L2_M1_22
  // Remainder: K1 % 8 single k-steps.
  783. _L2_M1_40:
  784. ands L , K1, #7 // L = K1 % 8
  785. ble _L2_M1_100
  786. _L2_M1_42:
  787. KERNEL1x2_SUB
  788. subs L, L, #1
  789. bgt _L2_M1_42
  790. _L2_M1_100:
  791. SAVE1x2
  // Skip the K - KKK k-steps of both panels that this tile did not consume.
  792. #if (defined(LEFT) && defined(TRANSA)) || \
  793. (!defined(LEFT) && !defined(TRANSA))
  794. ldr r3 , K
  795. ldr r4 , KKK
  796. sub r3 , r3 , r4
  797. lsls r4 , r3 , #4 // (K - KKK) * 16 bytes of B
  798. add BO , BO , r4
  799. lsls r4 , r3 , #3 // (K - KKK) * 8 bytes of A
  800. add AO , AO , r4
  801. #endif
  802. #if defined(LEFT)
  803. ldr r3 , KK
  804. add r3 , r3 , #1 // number of values in AO
  805. str r3 , KK
  806. #endif
  // End of one column pair: advance the B panel base by K k-steps of two complex floats
  // (K * 16 bytes), bump KK by 2 when LEFT is not defined, and loop while pairs remain.
  807. _L2_END:
  808. mov r3, BC
  809. ldr r4, K
  810. lsl r4, r4, #4 // k * 2 * 4 * 2
  811. add r3, r3, r4 // B = B + K * 2 * 8
  812. mov BC, r3
  813. #if !defined(LEFT)
  814. ldr r3 , KK
  815. add r3 , r3 , #2 // number of values in BO
  816. str r3 , KK
  817. #endif
  818. subs J , #1 // j--
  819. bgt _L2_BEGIN
  820. /*********************************************************************************************/
  // Last single column of C, processed only when N is odd; otherwise jump to the exit.
  // CO1 takes the current C pointer and the saved C is advanced by one column (LDC bytes).
  821. _L1_BEGIN:
  822. ldr J , N
  823. tst J , #1 // test bit 0 of N
  824. ble _L999
  825. ldr CO1, C // CO1 = C
  826. ldr r4 , LDC
  827. add r3 , r4, CO1
  828. str r3 , C // store C
  // With LEFT defined, KK is reset to OFFSET for this column.
  829. #if defined(LEFT)
  830. ldr r3 , OFFSET
  831. str r3 , KK
  832. #endif
  833. ldr AO, A // AO = A
  // 2x1 tiles of the last column: the loop at _L1_M2_20 runs I = M / 2 times.
  834. _L1_M2_BEGIN:
  835. ldr I, M
  836. asrs I, I, #1 // I = M / 2
  // NOTE(review): asrs updates N and Z but not V, so this ble also depends on the V flag
  // left by the most recent cmp/subs.
  837. ble _L1_M1_BEGIN
  838. _L1_M2_20:
  // Position BO at the B panel; the #else branch skips the first KK k-steps
  // (8 bytes per step in B, 16 bytes per step in A).
  839. #if (defined(LEFT) && defined(TRANSA)) || \
  840. (!defined(LEFT) && !defined(TRANSA))
  841. mov BO, BC
  842. #else
  843. mov BO, BC
  844. ldr r3 , KK
  845. lsls r4 , r3 , #3 // KK * 8 bytes: one complex float of B per k-step
  846. add BO , BO , r4
  847. lsls r4 , r3 , #4 // KK * 16 bytes: two complex floats of A per k-step
  848. add AO , AO , r4
  849. #endif
  // K1 = number of k-steps: K, K - KK, or KK + 2 (LEFT) / KK + 1 (not LEFT).
  850. #ifndef TRMMKERNEL
  851. ldr K1, K
  852. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  853. ldr K1, K
  854. ldr r3, KK
  855. sub K1, K1, r3
  856. str K1, KKK
  857. #else
  858. ldr K1, KK
  859. #ifdef LEFT
  860. add K1, K1, #2 // number of values in AO
  861. #else
  862. add K1, K1, #1 // number of values in BO
  863. #endif
  864. str K1, KKK
  865. #endif
  866. asrs L , K1, #3 // L = K1 / 8 (blocks of eight k-steps)
  867. cmp L , #3
  868. blt _L1_M2_30
  // L >= 3: 8 steps here, 8 * (L - 2) in the loop, 8 in the tail.
  869. .align 5
  870. KERNEL2x1_I
  871. KERNEL2x1_M2
  872. KERNEL2x1_M1
  873. KERNEL2x1_M2
  874. KERNEL2x1_M1
  875. KERNEL2x1_M2
  876. KERNEL2x1_M1
  877. KERNEL2x1_M2
  878. sub L, L, #2
  879. _L1_M2_22:
  880. KERNEL2x1_M1
  881. KERNEL2x1_M2
  882. KERNEL2x1_M1
  883. KERNEL2x1_M2
  884. KERNEL2x1_M1
  885. KERNEL2x1_M2
  886. KERNEL2x1_M1
  887. KERNEL2x1_M2
  888. subs L, L, #1
  889. bgt _L1_M2_22
  890. KERNEL2x1_M1
  891. KERNEL2x1_M2
  892. KERNEL2x1_M1
  893. KERNEL2x1_M2
  894. KERNEL2x1_M1
  895. KERNEL2x1_M2
  896. KERNEL2x1_M1
  897. KERNEL2x1_E
  898. b _L1_M2_44
  // Reached when L < 3. Bit 1 set -> sixteen straight-line k-steps; otherwise bit 0 set ->
  // eight k-steps; neither -> only zero the accumulators.
  899. _L1_M2_30:
  900. tst L, #3
  901. ble _L1_M2_40
  902. tst L, #2
  903. ble _L1_M2_32
  904. KERNEL2x1_I
  905. KERNEL2x1_M2
  906. KERNEL2x1_M1
  907. KERNEL2x1_M2
  908. KERNEL2x1_M1
  909. KERNEL2x1_M2
  910. KERNEL2x1_M1
  911. KERNEL2x1_M2
  912. KERNEL2x1_M1
  913. KERNEL2x1_M2
  914. KERNEL2x1_M1
  915. KERNEL2x1_M2
  916. KERNEL2x1_M1
  917. KERNEL2x1_M2
  918. KERNEL2x1_M1
  919. KERNEL2x1_E
  920. b _L1_M2_44
  921. _L1_M2_32:
  922. tst L, #1
  923. ble _L1_M2_40
  924. KERNEL2x1_I
  925. KERNEL2x1_M2
  926. KERNEL2x1_M1
  927. KERNEL2x1_M2
  928. KERNEL2x1_M1
  929. KERNEL2x1_M2
  930. KERNEL2x1_M1
  931. KERNEL2x1_E
  932. b _L1_M2_44
  // No unrolled block ran, so the accumulators must be cleared explicitly.
  933. _L1_M2_40:
  934. INIT2x1
  // Remainder: K1 % 8 single k-steps.
  935. _L1_M2_44:
  936. ands L , K1, #7 // L = K1 % 8
  937. ble _L1_M2_100
  938. _L1_M2_46:
  939. KERNEL2x1_SUB
  940. subs L, L, #1
  941. bne _L1_M2_46
  942. _L1_M2_100:
  943. SAVE2x1
  // Skip the K - KKK k-steps of both panels that this tile did not consume.
  944. #if (defined(LEFT) && defined(TRANSA)) || \
  945. (!defined(LEFT) && !defined(TRANSA))
  946. ldr r3 , K
  947. ldr r4 , KKK
  948. sub r3 , r3 , r4
  949. lsls r4 , r3 , #3 // (K - KKK) * 8 bytes of B
  950. add BO , BO , r4
  951. lsls r4 , r3 , #4 // (K - KKK) * 16 bytes of A
  952. add AO , AO , r4
  953. #endif
  954. #if defined(LEFT)
  955. ldr r3 , KK
  956. add r3 , r3 , #2 // number of values in AO
  957. str r3 , KK
  958. #endif
  959. _L1_M2_END:
  960. subs I, I, #1
  961. bne _L1_M2_20
  // Final 1x1 tile, executed only when M is odd (reached only on the odd-N path).
  // No pointer or KK bookkeeping follows SAVE1x1 because this is the last tile processed.
  962. _L1_M1_BEGIN:
  963. ldr I, M
  964. tst I, #1 // test bit 0 of M
  965. ble _L1_END
  966. _L1_M1_20:
  967. INIT1x1
  // Position BO at the B panel; the #else branch skips the first KK k-steps
  // (8 bytes per step in each panel).
  968. #if (defined(LEFT) && defined(TRANSA)) || \
  969. (!defined(LEFT) && !defined(TRANSA))
  970. mov BO, BC
  971. #else
  972. mov BO, BC
  973. ldr r3 , KK
  974. lsls r4 , r3 , #3 // KK * 8 bytes: one complex float of B per k-step
  975. add BO , BO , r4
  976. lsls r4 , r3 , #3 // KK * 8 bytes: one complex float of A per k-step
  977. add AO , AO , r4
  978. #endif
  // K1 = number of k-steps: K, K - KK, or KK + 1.
  979. #ifndef TRMMKERNEL
  980. ldr K1, K
  981. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  982. ldr K1, K
  983. ldr r3, KK
  984. sub K1, K1, r3
  985. str K1, KKK
  986. #else
  987. ldr K1, KK
  988. #ifdef LEFT
  989. add K1, K1, #1 // number of values in AO
  990. #else
  991. add K1, K1, #1 // number of values in BO
  992. #endif
  993. str K1, KKK
  994. #endif
  995. asrs L , K1, #3 // L = K1 / 8
  996. ble _L1_M1_40
  // Eight k-steps per iteration.
  997. _L1_M1_22:
  998. KERNEL1x1_SUB
  999. KERNEL1x1_SUB
  1000. KERNEL1x1_SUB
  1001. KERNEL1x1_SUB
  1002. KERNEL1x1_SUB
  1003. KERNEL1x1_SUB
  1004. KERNEL1x1_SUB
  1005. KERNEL1x1_SUB
  1006. subs L, L, #1
  1007. bgt _L1_M1_22
  // Remainder: K1 % 8 single k-steps.
  1008. _L1_M1_40:
  1009. ands L , K1, #7 // L = K1 % 8
  1010. ble _L1_M1_100
  1011. _L1_M1_42:
  1012. KERNEL1x1_SUB
  1013. subs L, L, #1
  1014. bgt _L1_M1_42
  1015. _L1_M1_100:
  1016. SAVE1x1
  // Common exit: reload s8-s15 from the save area at fp-128, return 0 in r0, drop the
  // local frame by recomputing sp from fp, and restore r4-r9 and fp before returning.
  // EPILOGUE is a macro defined outside this file.
  1017. _L1_END:
  1018. _L999:
  1019. sub r3, fp, #128
  1020. vldm r3, { s8 - s15} // restore floating point registers
  1021. movs r0, #0 // set return value
  1022. sub sp, fp, #24 // sp -> the seven words pushed at entry
  1023. pop {r4 - r9, fp}
  1024. bx lr
  1025. EPILOGUE