/* ztrmm_kernel_2x2_vfp.S */
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/28 Saar
* 	 BLASTEST 		: OK
* 	 CTEST			: OK
* 	 TEST			: OK
*
**************************************************************************************/
  34. #define ASSEMBLER
  35. #include "common.h"
  36. #define STACKSIZE 256
  37. #define OLD_M r0
  38. #define OLD_N r1
  39. #define OLD_K r2
  40. #define OLD_A r3
  41. #define OLD_ALPHA_R d0
  42. #define OLD_ALPHA_I d1
  43. /******************************************************
  44. * [fp, #-128] - [fp, #-64] is reserved
  45. * for store and restore of floating point
  46. * registers
  47. *******************************************************/
  48. #define KKK [fp, #-240]
  49. #define KK [fp, #-244 ]
  50. #define A [fp, #-248 ]
  51. #define LDC [fp, #-252 ]
  52. #define M [fp, #-256 ]
  53. #define N [fp, #-260 ]
  54. #define K [fp, #-264 ]
  55. #define FP_ZERO [fp, #-232]
  56. #define FP_ZERO_0 [fp, #-232]
  57. #define FP_ZERO_1 [fp, #-228]
  58. #define ALPHA_I [fp, #-272]
  59. #define ALPHA_R [fp, #-280]
  60. #define B [fp, #4 ]
  61. #define C [fp, #8 ]
  62. #define OLD_LDC [fp, #12 ]
  63. #define OFFSET [fp, #16 ]
  64. #define I r0
  65. #define J r1
  66. #define L r2
  67. #define AO r5
  68. #define BO r6
  69. #define CO1 r8
  70. #define CO2 r9
  71. #define K1 r7
  72. #define BC r12
  73. #define A_PRE 96
  74. #define B_PRE 96
  75. #define C_PRE 64
  76. /**************************************************************************************
  77. * Macro definitions
  78. **************************************************************************************/
  79. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  80. #define KMAC_R fnmacd
  81. #define KMAC_I fmacd
  82. #define FMAC_R1 fmacd
  83. #define FMAC_R2 fnmacd
  84. #define FMAC_I1 fmacd
  85. #define FMAC_I2 fmacd
  86. #elif defined(CN) || defined(CT)
  87. #define KMAC_R fmacd
  88. #define KMAC_I fnmacd
  89. #define FMAC_R1 fmacd
  90. #define FMAC_R2 fnmacd
  91. #define FMAC_I1 fmacd
  92. #define FMAC_I2 fmacd
  93. #elif defined(NC) || defined(TC)
  94. #define KMAC_R fmacd
  95. #define KMAC_I fnmacd
  96. #define FMAC_R1 fmacd
  97. #define FMAC_R2 fmacd
  98. #define FMAC_I1 fnmacd
  99. #define FMAC_I2 fmacd
  100. #else
  101. #define KMAC_R fnmacd
  102. #define KMAC_I fmacd
  103. #define FMAC_R1 fmacd
  104. #define FMAC_R2 fmacd
  105. #define FMAC_I1 fnmacd
  106. #define FMAC_I2 fmacd
  107. #endif
  108. /**************************************************************************************
  109. * Macro definitions
  110. **************************************************************************************/
  111. .macro INIT2x2
  112. fldd d8 , FP_ZERO
  113. vmov.f64 d9 , d8
  114. vmov.f64 d10, d8
  115. vmov.f64 d11, d8
  116. vmov.f64 d12, d8
  117. vmov.f64 d13, d8
  118. vmov.f64 d14, d8
  119. vmov.f64 d15, d8
  120. .endm
  121. .macro KERNEL2x2_I
  122. pld [ AO, #A_PRE ]
  123. pld [ BO, #B_PRE ]
  124. fldd d0 , [ AO ]
  125. fldd d1 , [ AO, #8 ]
  126. fldd d2 , [ AO, #16 ]
  127. fldd d3 , [ AO, #24 ]
  128. fldd d4 , [ BO ]
  129. fldd d5 , [ BO, #8 ]
  130. fldd d6 , [ BO, #16 ]
  131. fldd d7 , [ BO, #24 ]
  132. fmuld d8 , d0, d4
  133. KMAC_R d8 , d1, d5
  134. fmuld d9 , d0, d5
  135. KMAC_I d9 , d1, d4
  136. fmuld d10 , d2, d4
  137. KMAC_R d10 , d3, d5
  138. fmuld d11 , d2, d5
  139. KMAC_I d11 , d3, d4
  140. fmuld d12 , d0, d6
  141. KMAC_R d12 , d1, d7
  142. fmuld d13 , d0, d7
  143. KMAC_I d13 , d1, d6
  144. fmuld d14 , d2, d6
  145. KMAC_R d14 , d3, d7
  146. fmuld d15 , d2, d7
  147. KMAC_I d15 , d3, d6
  148. add BO , BO, #32
  149. add AO , AO, #32
  150. .endm
  151. .macro KERNEL2x2_M1
  152. fldd d0 , [ AO ]
  153. fldd d4 , [ BO ]
  154. fldd d5 , [ BO, #8 ]
  155. fmacd d8 , d0, d4
  156. fldd d1 , [ AO, #8 ]
  157. fmacd d9 , d0, d5
  158. fldd d2 , [ AO, #16 ]
  159. KMAC_R d8 , d1, d5
  160. fldd d3 , [ AO, #24 ]
  161. KMAC_I d9 , d1, d4
  162. fldd d6 , [ BO, #16 ]
  163. fmacd d10 , d2, d4
  164. fldd d7 , [ BO, #24 ]
  165. fmacd d11 , d2, d5
  166. KMAC_R d10 , d3, d5
  167. pld [ AO, #A_PRE ]
  168. KMAC_I d11 , d3, d4
  169. pld [ BO, #B_PRE ]
  170. fmacd d12 , d0, d6
  171. fmacd d13 , d0, d7
  172. KMAC_R d12 , d1, d7
  173. KMAC_I d13 , d1, d6
  174. fmacd d14 , d2, d6
  175. fmacd d15 , d2, d7
  176. add BO , BO, #32
  177. KMAC_R d14 , d3, d7
  178. add AO , AO, #32
  179. KMAC_I d15 , d3, d6
  180. .endm
  181. .macro KERNEL2x2_M2
  182. fldd d0 , [ AO ]
  183. fldd d4 , [ BO ]
  184. fldd d5 , [ BO, #8 ]
  185. fmacd d8 , d0, d4
  186. fldd d1 , [ AO, #8 ]
  187. fmacd d9 , d0, d5
  188. fldd d2 , [ AO, #16 ]
  189. KMAC_R d8 , d1, d5
  190. fldd d3 , [ AO, #24 ]
  191. KMAC_I d9 , d1, d4
  192. fldd d6 , [ BO, #16 ]
  193. fmacd d10 , d2, d4
  194. fldd d7 , [ BO, #24 ]
  195. fmacd d11 , d2, d5
  196. KMAC_R d10 , d3, d5
  197. pld [ AO, #A_PRE ]
  198. KMAC_I d11 , d3, d4
  199. pld [ BO, #B_PRE ]
  200. fmacd d12 , d0, d6
  201. fmacd d13 , d0, d7
  202. KMAC_R d12 , d1, d7
  203. KMAC_I d13 , d1, d6
  204. fmacd d14 , d2, d6
  205. fmacd d15 , d2, d7
  206. add BO , BO, #32
  207. KMAC_R d14 , d3, d7
  208. add AO , AO, #32
  209. KMAC_I d15 , d3, d6
  210. .endm
  211. .macro KERNEL2x2_E
  212. fldd d0 , [ AO ]
  213. fldd d1 , [ AO, #8 ]
  214. fldd d2 , [ AO, #16 ]
  215. fldd d3 , [ AO, #24 ]
  216. fldd d4 , [ BO ]
  217. fldd d5 , [ BO, #8 ]
  218. fldd d6 , [ BO, #16 ]
  219. fldd d7 , [ BO, #24 ]
  220. fmacd d8 , d0, d4
  221. KMAC_R d8 , d1, d5
  222. fmacd d9 , d0, d5
  223. KMAC_I d9 , d1, d4
  224. fmacd d10 , d2, d4
  225. KMAC_R d10 , d3, d5
  226. fmacd d11 , d2, d5
  227. KMAC_I d11 , d3, d4
  228. fmacd d12 , d0, d6
  229. KMAC_R d12 , d1, d7
  230. fmacd d13 , d0, d7
  231. KMAC_I d13 , d1, d6
  232. fmacd d14 , d2, d6
  233. KMAC_R d14 , d3, d7
  234. fmacd d15 , d2, d7
  235. KMAC_I d15 , d3, d6
  236. add BO , BO, #32
  237. add AO , AO, #32
  238. .endm
  239. .macro KERNEL2x2_SUB
  240. fldd d0 , [ AO ]
  241. fldd d4 , [ BO ]
  242. fldd d5 , [ BO, #8 ]
  243. fmacd d8 , d0, d4
  244. fldd d1 , [ AO, #8 ]
  245. fmacd d9 , d0, d5
  246. fldd d2 , [ AO, #16 ]
  247. KMAC_R d8 , d1, d5
  248. fldd d3 , [ AO, #24 ]
  249. KMAC_I d9 , d1, d4
  250. fldd d6 , [ BO, #16 ]
  251. fmacd d10 , d2, d4
  252. fldd d7 , [ BO, #24 ]
  253. fmacd d11 , d2, d5
  254. KMAC_R d10 , d3, d5
  255. pld [ AO, #A_PRE ]
  256. KMAC_I d11 , d3, d4
  257. pld [ BO, #B_PRE ]
  258. fmacd d12 , d0, d6
  259. fmacd d13 , d0, d7
  260. KMAC_R d12 , d1, d7
  261. KMAC_I d13 , d1, d6
  262. fmacd d14 , d2, d6
  263. fmacd d15 , d2, d7
  264. add BO , BO, #32
  265. KMAC_R d14 , d3, d7
  266. add AO , AO, #32
  267. KMAC_I d15 , d3, d6
  268. .endm
  269. .macro SAVE2x2
  270. ldr r3 , LDC
  271. add CO2 , CO1, r3
  272. fldd d0, ALPHA_R
  273. fldd d1, ALPHA_I
  274. fldd d4 , FP_ZERO
  275. vmov.f64 d5 , d4
  276. vmov.f64 d6 , d4
  277. vmov.f64 d7 , d4
  278. FMAC_R1 d4 , d0 , d8
  279. FMAC_I1 d5 , d0 , d9
  280. FMAC_R2 d4 , d1 , d9
  281. FMAC_I2 d5 , d1 , d8
  282. FMAC_R1 d6 , d0 , d10
  283. FMAC_I1 d7 , d0 , d11
  284. FMAC_R2 d6 , d1 , d11
  285. FMAC_I2 d7 , d1 , d10
  286. fstmiad CO1, { d4 - d7 }
  287. fldd d4 , FP_ZERO
  288. vmov.f64 d5 , d4
  289. vmov.f64 d6 , d4
  290. vmov.f64 d7 , d4
  291. FMAC_R1 d4 , d0 , d12
  292. FMAC_I1 d5 , d0 , d13
  293. FMAC_R2 d4 , d1 , d13
  294. FMAC_I2 d5 , d1 , d12
  295. FMAC_R1 d6 , d0 , d14
  296. FMAC_I1 d7 , d0 , d15
  297. FMAC_R2 d6 , d1 , d15
  298. FMAC_I2 d7 , d1 , d14
  299. fstmiad CO2, { d4 - d7 }
  300. add CO1, CO1, #32
  301. .endm
  302. /******************************************************************************/
  303. .macro INIT1x2
  304. fldd d8 , FP_ZERO
  305. vmov.f64 d9 , d8
  306. vmov.f64 d12, d8
  307. vmov.f64 d13, d8
  308. .endm
  309. .macro KERNEL1x2_I
  310. fldd d0 , [ AO ]
  311. fldd d1 , [ AO, #8 ]
  312. fldd d4 , [ BO ]
  313. fldd d5 , [ BO, #8 ]
  314. fldd d6 , [ BO, #16 ]
  315. fldd d7 , [ BO, #24 ]
  316. fmuld d8 , d0, d4
  317. KMAC_R d8 , d1, d5
  318. fmuld d9 , d0, d5
  319. KMAC_I d9 , d1, d4
  320. fmuld d12 , d0, d6
  321. KMAC_R d12 , d1, d7
  322. fmuld d13 , d0, d7
  323. KMAC_I d13 , d1, d6
  324. add BO , BO, #32
  325. add AO , AO, #16
  326. .endm
  327. .macro KERNEL1x2_M1
  328. fldd d0 , [ AO ]
  329. fldd d1 , [ AO, #8 ]
  330. fldd d4 , [ BO ]
  331. fldd d5 , [ BO, #8 ]
  332. fldd d6 , [ BO, #16 ]
  333. fldd d7 , [ BO, #24 ]
  334. fmacd d8 , d0, d4
  335. KMAC_R d8 , d1, d5
  336. fmacd d9 , d0, d5
  337. KMAC_I d9 , d1, d4
  338. fmacd d12 , d0, d6
  339. KMAC_R d12 , d1, d7
  340. fmacd d13 , d0, d7
  341. KMAC_I d13 , d1, d6
  342. add BO , BO, #32
  343. add AO , AO, #16
  344. .endm
  345. .macro KERNEL1x2_M2
  346. fldd d0 , [ AO ]
  347. fldd d1 , [ AO, #8 ]
  348. fldd d4 , [ BO ]
  349. fldd d5 , [ BO, #8 ]
  350. fldd d6 , [ BO, #16 ]
  351. fldd d7 , [ BO, #24 ]
  352. fmacd d8 , d0, d4
  353. KMAC_R d8 , d1, d5
  354. fmacd d9 , d0, d5
  355. KMAC_I d9 , d1, d4
  356. fmacd d12 , d0, d6
  357. KMAC_R d12 , d1, d7
  358. fmacd d13 , d0, d7
  359. KMAC_I d13 , d1, d6
  360. add BO , BO, #32
  361. add AO , AO, #16
  362. .endm
  363. .macro KERNEL1x2_E
  364. fldd d0 , [ AO ]
  365. fldd d1 , [ AO, #8 ]
  366. fldd d4 , [ BO ]
  367. fldd d5 , [ BO, #8 ]
  368. fldd d6 , [ BO, #16 ]
  369. fldd d7 , [ BO, #24 ]
  370. fmacd d8 , d0, d4
  371. KMAC_R d8 , d1, d5
  372. fmacd d9 , d0, d5
  373. KMAC_I d9 , d1, d4
  374. fmacd d12 , d0, d6
  375. KMAC_R d12 , d1, d7
  376. fmacd d13 , d0, d7
  377. KMAC_I d13 , d1, d6
  378. add BO , BO, #32
  379. add AO , AO, #16
  380. .endm
  381. .macro KERNEL1x2_SUB
  382. fldd d0 , [ AO ]
  383. fldd d1 , [ AO, #8 ]
  384. fldd d4 , [ BO ]
  385. fldd d5 , [ BO, #8 ]
  386. fldd d6 , [ BO, #16 ]
  387. fldd d7 , [ BO, #24 ]
  388. fmacd d8 , d0, d4
  389. KMAC_R d8 , d1, d5
  390. fmacd d9 , d0, d5
  391. KMAC_I d9 , d1, d4
  392. fmacd d12 , d0, d6
  393. KMAC_R d12 , d1, d7
  394. fmacd d13 , d0, d7
  395. KMAC_I d13 , d1, d6
  396. add BO , BO, #32
  397. add AO , AO, #16
  398. .endm
  399. .macro SAVE1x2
  400. ldr r3 , LDC
  401. add CO2 , CO1, r3
  402. fldd d0, ALPHA_R
  403. fldd d1, ALPHA_I
  404. fldd d4 , FP_ZERO
  405. vmov.f64 d5 , d4
  406. FMAC_R1 d4 , d0 , d8
  407. FMAC_I1 d5 , d0 , d9
  408. FMAC_R2 d4 , d1 , d9
  409. FMAC_I2 d5 , d1 , d8
  410. fstmiad CO1, { d4 - d5 }
  411. fldd d4 , FP_ZERO
  412. vmov.f64 d5 , d4
  413. FMAC_R1 d4 , d0 , d12
  414. FMAC_I1 d5 , d0 , d13
  415. FMAC_R2 d4 , d1 , d13
  416. FMAC_I2 d5 , d1 , d12
  417. fstmiad CO2, { d4 - d5 }
  418. add CO1, CO1, #16
  419. .endm
  420. /******************************************************************************/
  421. .macro INIT2x1
  422. fldd d8 , FP_ZERO
  423. vmov.f64 d9 , d8
  424. vmov.f64 d10, d8
  425. vmov.f64 d11, d8
  426. .endm
  427. .macro KERNEL2x1_I
  428. fldd d0 , [ AO ]
  429. fldd d1 , [ AO, #8 ]
  430. fldd d2 , [ AO, #16 ]
  431. fldd d3 , [ AO, #24 ]
  432. fldd d4 , [ BO ]
  433. fldd d5 , [ BO, #8 ]
  434. fmuld d8 , d0, d4
  435. KMAC_R d8 , d1, d5
  436. fmuld d9 , d0, d5
  437. KMAC_I d9 , d1, d4
  438. fmuld d10 , d2, d4
  439. KMAC_R d10 , d3, d5
  440. fmuld d11 , d2, d5
  441. KMAC_I d11 , d3, d4
  442. add BO , BO, #16
  443. add AO , AO, #32
  444. .endm
  445. .macro KERNEL2x1_M1
  446. fldd d0 , [ AO ]
  447. fldd d1 , [ AO, #8 ]
  448. fldd d2 , [ AO, #16 ]
  449. fldd d3 , [ AO, #24 ]
  450. fldd d4 , [ BO ]
  451. fldd d5 , [ BO, #8 ]
  452. fmacd d8 , d0, d4
  453. KMAC_R d8 , d1, d5
  454. fmacd d9 , d0, d5
  455. KMAC_I d9 , d1, d4
  456. fmacd d10 , d2, d4
  457. KMAC_R d10 , d3, d5
  458. fmacd d11 , d2, d5
  459. KMAC_I d11 , d3, d4
  460. add BO , BO, #16
  461. add AO , AO, #32
  462. .endm
  463. .macro KERNEL2x1_M2
  464. fldd d0 , [ AO ]
  465. fldd d1 , [ AO, #8 ]
  466. fldd d2 , [ AO, #16 ]
  467. fldd d3 , [ AO, #24 ]
  468. fldd d4 , [ BO ]
  469. fldd d5 , [ BO, #8 ]
  470. fmacd d8 , d0, d4
  471. KMAC_R d8 , d1, d5
  472. fmacd d9 , d0, d5
  473. KMAC_I d9 , d1, d4
  474. fmacd d10 , d2, d4
  475. KMAC_R d10 , d3, d5
  476. fmacd d11 , d2, d5
  477. KMAC_I d11 , d3, d4
  478. add BO , BO, #16
  479. add AO , AO, #32
  480. .endm
  481. .macro KERNEL2x1_E
  482. fldd d0 , [ AO ]
  483. fldd d1 , [ AO, #8 ]
  484. fldd d2 , [ AO, #16 ]
  485. fldd d3 , [ AO, #24 ]
  486. fldd d4 , [ BO ]
  487. fldd d5 , [ BO, #8 ]
  488. fmacd d8 , d0, d4
  489. KMAC_R d8 , d1, d5
  490. fmacd d9 , d0, d5
  491. KMAC_I d9 , d1, d4
  492. fmacd d10 , d2, d4
  493. KMAC_R d10 , d3, d5
  494. fmacd d11 , d2, d5
  495. KMAC_I d11 , d3, d4
  496. add BO , BO, #16
  497. add AO , AO, #32
  498. .endm
  499. .macro KERNEL2x1_SUB
  500. fldd d0 , [ AO ]
  501. fldd d1 , [ AO, #8 ]
  502. fldd d2 , [ AO, #16 ]
  503. fldd d3 , [ AO, #24 ]
  504. fldd d4 , [ BO ]
  505. fldd d5 , [ BO, #8 ]
  506. fmacd d8 , d0, d4
  507. KMAC_R d8 , d1, d5
  508. fmacd d9 , d0, d5
  509. KMAC_I d9 , d1, d4
  510. fmacd d10 , d2, d4
  511. KMAC_R d10 , d3, d5
  512. fmacd d11 , d2, d5
  513. KMAC_I d11 , d3, d4
  514. add BO , BO, #16
  515. add AO , AO, #32
  516. .endm
  517. .macro SAVE2x1
  518. fldd d0, ALPHA_R
  519. fldd d1, ALPHA_I
  520. fldd d4 , FP_ZERO
  521. vmov.f64 d5 , d4
  522. vmov.f64 d6 , d4
  523. vmov.f64 d7 , d4
  524. FMAC_R1 d4 , d0 , d8
  525. FMAC_I1 d5 , d0 , d9
  526. FMAC_R2 d4 , d1 , d9
  527. FMAC_I2 d5 , d1 , d8
  528. FMAC_R1 d6 , d0 , d10
  529. FMAC_I1 d7 , d0 , d11
  530. FMAC_R2 d6 , d1 , d11
  531. FMAC_I2 d7 , d1 , d10
  532. fstmiad CO1, { d4 - d7 }
  533. add CO1, CO1, #32
  534. .endm
  535. /******************************************************************************/
  536. .macro INIT1x1
  537. fldd d8 , FP_ZERO
  538. vmov.f64 d9 , d8
  539. .endm
  540. .macro KERNEL1x1_I
  541. fldd d0 , [ AO ]
  542. fldd d1 , [ AO, #8 ]
  543. fldd d4 , [ BO ]
  544. fldd d5 , [ BO, #8 ]
  545. fmuld d8 , d0, d4
  546. KMAC_R d8 , d1, d5
  547. fmuld d9 , d0, d5
  548. KMAC_I d9 , d1, d4
  549. add BO , BO, #16
  550. add AO , AO, #16
  551. .endm
  552. .macro KERNEL1x1_M1
  553. fldd d0 , [ AO ]
  554. fldd d1 , [ AO, #8 ]
  555. fldd d4 , [ BO ]
  556. fldd d5 , [ BO, #8 ]
  557. fmacd d8 , d0, d4
  558. KMAC_R d8 , d1, d5
  559. fmacd d9 , d0, d5
  560. KMAC_I d9 , d1, d4
  561. add BO , BO, #16
  562. add AO , AO, #16
  563. .endm
  564. .macro KERNEL1x1_M2
  565. fldd d0 , [ AO ]
  566. fldd d1 , [ AO, #8 ]
  567. fldd d4 , [ BO ]
  568. fldd d5 , [ BO, #8 ]
  569. fmacd d8 , d0, d4
  570. KMAC_R d8 , d1, d5
  571. fmacd d9 , d0, d5
  572. KMAC_I d9 , d1, d4
  573. add BO , BO, #16
  574. add AO , AO, #16
  575. .endm
  576. .macro KERNEL1x1_E
  577. fldd d0 , [ AO ]
  578. fldd d1 , [ AO, #8 ]
  579. fldd d4 , [ BO ]
  580. fldd d5 , [ BO, #8 ]
  581. fmacd d8 , d0, d4
  582. KMAC_R d8 , d1, d5
  583. fmacd d9 , d0, d5
  584. KMAC_I d9 , d1, d4
  585. add BO , BO, #16
  586. add AO , AO, #16
  587. .endm
  588. .macro KERNEL1x1_SUB
  589. fldd d0 , [ AO ]
  590. fldd d1 , [ AO, #8 ]
  591. fldd d4 , [ BO ]
  592. fldd d5 , [ BO, #8 ]
  593. fmacd d8 , d0, d4
  594. KMAC_R d8 , d1, d5
  595. fmacd d9 , d0, d5
  596. KMAC_I d9 , d1, d4
  597. add BO , BO, #16
  598. add AO , AO, #16
  599. .endm
  600. .macro SAVE1x1
  601. fldd d0, ALPHA_R
  602. fldd d1, ALPHA_I
  603. fldd d4 , FP_ZERO
  604. vmov.f64 d5 , d4
  605. FMAC_R1 d4 , d0 , d8
  606. FMAC_I1 d5 , d0 , d9
  607. FMAC_R2 d4 , d1 , d9
  608. FMAC_I2 d5 , d1 , d8
  609. fstmiad CO1, { d4 - d5 }
  610. add CO1, CO1, #16
  611. .endm
  612. /**************************************************************************************
  613. * End of macro definitions
  614. **************************************************************************************/
  615. PROLOGUE
  616. .align 5
  617. push {r4 - r9, fp}
  618. add fp, sp, #24
  619. sub sp, sp, #STACKSIZE // reserve stack
  620. str OLD_M, M
  621. str OLD_N, N
  622. str OLD_K, K
  623. str OLD_A, A
  624. vstr OLD_ALPHA_R, ALPHA_R
  625. vstr OLD_ALPHA_I, ALPHA_I
  626. sub r3, fp, #128
  627. vstm r3, { d8 - d15} // store floating point registers
  628. movs r4, #0
  629. str r4, FP_ZERO
  630. str r4, FP_ZERO_1
  631. ldr r3, OLD_LDC
  632. lsl r3, r3, #4 // ldc = ldc * 8 * 2
  633. str r3, LDC
  634. ldr r3, OFFSET
  635. #ifndef LEFT
  636. neg r3 , r3
  637. #endif
  638. str r3 , KK
  639. ldr BC, B
  640. ldr J, N
  641. asrs J, J, #1 // J = J / 2
  642. ble _L1_BEGIN
  643. _L2_BEGIN:
  644. ldr CO1, C // CO1 = C
  645. ldr r4 , LDC
  646. lsl r4 , r4 , #1 // LDC * 2
  647. add r3 , r4, CO1
  648. str r3 , C // store C
  649. #if defined(LEFT)
  650. ldr r3 , OFFSET
  651. str r3 , KK
  652. #endif
  653. ldr AO, A // AO = A
  654. pld [AO , #A_PRE-64]
  655. pld [AO , #A_PRE-32]
  656. _L2_M2_BEGIN:
  657. ldr I, M
  658. asrs I, I, #1 // I = I / 2
  659. ble _L2_M1_BEGIN
  660. _L2_M2_20:
  661. #if (defined(LEFT) && defined(TRANSA)) || \
  662. (!defined(LEFT) && !defined(TRANSA))
  663. mov BO, BC
  664. #else
  665. mov BO, BC
  666. ldr r3 , KK
  667. lsls r4 , r3 , #5 // 2 * 8 * 2 double values
  668. add BO , BO , r4
  669. lsls r4 , r3 , #5 // 2 * 8 * 2 double values
  670. add AO , AO , r4
  671. #endif
  672. #ifndef TRMMKERNEL
  673. ldr K1, K
  674. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  675. ldr K1, K
  676. ldr r3, KK
  677. sub K1, K1, r3
  678. str K1, KKK
  679. #else
  680. ldr K1, KK
  681. #ifdef LEFT
  682. add K1, K1, #2 // number of values in AO
  683. #else
  684. add K1, K1, #2 // number of values in BO
  685. #endif
  686. str K1, KKK
  687. #endif
  688. asrs L , K1, #3 // L = L / 8
  689. cmp L , #3
  690. blt _L2_M2_30
  691. .align 5
  692. KERNEL2x2_I
  693. KERNEL2x2_M2
  694. KERNEL2x2_M1
  695. KERNEL2x2_M2
  696. KERNEL2x2_M1
  697. KERNEL2x2_M2
  698. KERNEL2x2_M1
  699. KERNEL2x2_M2
  700. sub L, L, #2
  701. _L2_M2_22:
  702. KERNEL2x2_M1
  703. KERNEL2x2_M2
  704. KERNEL2x2_M1
  705. KERNEL2x2_M2
  706. KERNEL2x2_M1
  707. KERNEL2x2_M2
  708. KERNEL2x2_M1
  709. KERNEL2x2_M2
  710. subs L, L, #1
  711. bgt _L2_M2_22
  712. KERNEL2x2_M1
  713. KERNEL2x2_M2
  714. KERNEL2x2_M1
  715. KERNEL2x2_M2
  716. KERNEL2x2_M1
  717. KERNEL2x2_M2
  718. KERNEL2x2_M1
  719. KERNEL2x2_E
  720. b _L2_M2_44
  721. _L2_M2_30:
  722. tst L, #3
  723. ble _L2_M2_40
  724. tst L, #2
  725. ble _L2_M2_32
  726. KERNEL2x2_I
  727. KERNEL2x2_M2
  728. KERNEL2x2_M1
  729. KERNEL2x2_M2
  730. KERNEL2x2_M1
  731. KERNEL2x2_M2
  732. KERNEL2x2_M1
  733. KERNEL2x2_M2
  734. KERNEL2x2_M1
  735. KERNEL2x2_M2
  736. KERNEL2x2_M1
  737. KERNEL2x2_M2
  738. KERNEL2x2_M1
  739. KERNEL2x2_M2
  740. KERNEL2x2_M1
  741. KERNEL2x2_E
  742. b _L2_M2_44
  743. _L2_M2_32:
  744. tst L, #1
  745. ble _L2_M2_40
  746. KERNEL2x2_I
  747. KERNEL2x2_M2
  748. KERNEL2x2_M1
  749. KERNEL2x2_M2
  750. KERNEL2x2_M1
  751. KERNEL2x2_M2
  752. KERNEL2x2_M1
  753. KERNEL2x2_E
  754. b _L2_M2_44
  755. _L2_M2_40:
  756. INIT2x2
  757. _L2_M2_44:
  758. ands L , K1, #7 // L = L % 8
  759. ble _L2_M2_100
  760. _L2_M2_46:
  761. KERNEL2x2_SUB
  762. subs L, L, #1
  763. bne _L2_M2_46
  764. _L2_M2_100:
  765. SAVE2x2
  766. #if (defined(LEFT) && defined(TRANSA)) || \
  767. (!defined(LEFT) && !defined(TRANSA))
  768. ldr r3 , K
  769. ldr r4 , KKK
  770. sub r3 , r3 , r4
  771. lsls r4 , r3 , #5 // 2 * 8 * 2 double values
  772. add BO , BO , r4
  773. lsls r4 , r3 , #5 // 2 * 8 * 2 double values
  774. add AO , AO , r4
  775. #endif
  776. #if defined(LEFT)
  777. ldr r3 , KK
  778. add r3 , r3 , #2 // number of values in AO
  779. str r3 , KK
  780. #endif
  781. _L2_M2_END:
  782. subs I, I, #1
  783. bne _L2_M2_20
  784. _L2_M1_BEGIN:
  785. ldr I, M
  786. tst I, #1 // I = I % 2
  787. ble _L2_END
  788. _L2_M1_20:
  789. INIT1x2
  790. #if (defined(LEFT) && defined(TRANSA)) || \
  791. (!defined(LEFT) && !defined(TRANSA))
  792. mov BO, BC
  793. #else
  794. mov BO, BC
  795. ldr r3 , KK
  796. lsls r4 , r3 , #5 // 2 * 8 * 2 double values
  797. add BO , BO , r4
  798. lsls r4 , r3 , #4 // 1 * 8 * 2 double values
  799. add AO , AO , r4
  800. #endif
  801. #ifndef TRMMKERNEL
  802. ldr K1, K
  803. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  804. ldr K1, K
  805. ldr r3, KK
  806. sub K1, K1, r3
  807. str K1, KKK
  808. #else
  809. ldr K1, KK
  810. #ifdef LEFT
  811. add K1, K1, #1 // number of values in AO
  812. #else
  813. add K1, K1, #2 // number of values in BO
  814. #endif
  815. str K1, KKK
  816. #endif
  817. asrs L , K1, #3 // L = L / 8
  818. ble _L2_M1_40
  819. _L2_M1_22:
  820. KERNEL1x2_SUB
  821. KERNEL1x2_SUB
  822. KERNEL1x2_SUB
  823. KERNEL1x2_SUB
  824. KERNEL1x2_SUB
  825. KERNEL1x2_SUB
  826. KERNEL1x2_SUB
  827. KERNEL1x2_SUB
  828. subs L, L, #1
  829. bgt _L2_M1_22
  830. _L2_M1_40:
  831. ands L , K1, #7 // L = L % 8
  832. ble _L2_M1_100
  833. _L2_M1_42:
  834. KERNEL1x2_SUB
  835. subs L, L, #1
  836. bgt _L2_M1_42
  837. _L2_M1_100:
  838. SAVE1x2
  839. #if (defined(LEFT) && defined(TRANSA)) || \
  840. (!defined(LEFT) && !defined(TRANSA))
  841. ldr r3 , K
  842. ldr r4 , KKK
  843. sub r3 , r3 , r4
  844. lsls r4 , r3 , #5 // 2 * 8 * 2 double values
  845. add BO , BO , r4
  846. lsls r4 , r3 , #4 // 1 * 8 * 2 double values
  847. add AO , AO , r4
  848. #endif
  849. #if defined(LEFT)
  850. ldr r3 , KK
  851. add r3 , r3 , #1 // number of values in AO
  852. str r3 , KK
  853. #endif
  854. _L2_END:
  855. mov r3, BC
  856. ldr r4, K
  857. lsl r4, r4, #5 // k * 2 * 8 * 2
  858. add r3, r3, r4 // B = B + K * 4 * 8
  859. mov BC, r3
  860. #if !defined(LEFT)
  861. ldr r3 , KK
  862. add r3 , r3 , #2 // number of values in BO
  863. str r3 , KK
  864. #endif
  865. subs J , #1 // j--
  866. bgt _L2_BEGIN
  867. /*********************************************************************************************/
  868. _L1_BEGIN:
  869. ldr J , N
  870. tst J , #1
  871. ble _L999
  872. ldr CO1, C // CO1 = C
  873. ldr r4 , LDC
  874. add r3 , r4, CO1
  875. str r3 , C // store C
  876. #if defined(LEFT)
  877. ldr r3 , OFFSET
  878. str r3 , KK
  879. #endif
  880. ldr AO, A // AO = A
  881. _L1_M2_BEGIN:
  882. ldr I, M
  883. asrs I, I, #1 // I = I / 2
  884. ble _L1_M1_BEGIN
  885. _L1_M2_20:
  886. #if (defined(LEFT) && defined(TRANSA)) || \
  887. (!defined(LEFT) && !defined(TRANSA))
  888. mov BO, BC
  889. #else
  890. mov BO, BC
  891. ldr r3 , KK
  892. lsls r4 , r3 , #4 // 1 * 8 * 2 double values
  893. add BO , BO , r4
  894. lsls r4 , r3 , #5 // 2 * 8 * 2 double values
  895. add AO , AO , r4
  896. #endif
  897. #ifndef TRMMKERNEL
  898. ldr K1, K
  899. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  900. ldr K1, K
  901. ldr r3, KK
  902. sub K1, K1, r3
  903. str K1, KKK
  904. #else
  905. ldr K1, KK
  906. #ifdef LEFT
  907. add K1, K1, #2 // number of values in AO
  908. #else
  909. add K1, K1, #1 // number of values in BO
  910. #endif
  911. str K1, KKK
  912. #endif
  913. asrs L , K1, #3 // L = L / 8
  914. cmp L , #3
  915. blt _L1_M2_30
  916. .align 5
  917. KERNEL2x1_I
  918. KERNEL2x1_M2
  919. KERNEL2x1_M1
  920. KERNEL2x1_M2
  921. KERNEL2x1_M1
  922. KERNEL2x1_M2
  923. KERNEL2x1_M1
  924. KERNEL2x1_M2
  925. sub L, L, #2
  926. _L1_M2_22:
  927. KERNEL2x1_M1
  928. KERNEL2x1_M2
  929. KERNEL2x1_M1
  930. KERNEL2x1_M2
  931. KERNEL2x1_M1
  932. KERNEL2x1_M2
  933. KERNEL2x1_M1
  934. KERNEL2x1_M2
  935. subs L, L, #1
  936. bgt _L1_M2_22
  937. KERNEL2x1_M1
  938. KERNEL2x1_M2
  939. KERNEL2x1_M1
  940. KERNEL2x1_M2
  941. KERNEL2x1_M1
  942. KERNEL2x1_M2
  943. KERNEL2x1_M1
  944. KERNEL2x1_E
  945. b _L1_M2_44
  946. _L1_M2_30:
  947. tst L, #3
  948. ble _L1_M2_40
  949. tst L, #2
  950. ble _L1_M2_32
  951. KERNEL2x1_I
  952. KERNEL2x1_M2
  953. KERNEL2x1_M1
  954. KERNEL2x1_M2
  955. KERNEL2x1_M1
  956. KERNEL2x1_M2
  957. KERNEL2x1_M1
  958. KERNEL2x1_M2
  959. KERNEL2x1_M1
  960. KERNEL2x1_M2
  961. KERNEL2x1_M1
  962. KERNEL2x1_M2
  963. KERNEL2x1_M1
  964. KERNEL2x1_M2
  965. KERNEL2x1_M1
  966. KERNEL2x1_E
  967. b _L1_M2_44
  968. _L1_M2_32:
  969. tst L, #1
  970. ble _L1_M2_40
  971. KERNEL2x1_I
  972. KERNEL2x1_M2
  973. KERNEL2x1_M1
  974. KERNEL2x1_M2
  975. KERNEL2x1_M1
  976. KERNEL2x1_M2
  977. KERNEL2x1_M1
  978. KERNEL2x1_E
  979. b _L1_M2_44
  980. _L1_M2_40:
  981. INIT2x1
  982. _L1_M2_44:
  983. ands L , K1, #7 // L = L % 8
  984. ble _L1_M2_100
  985. _L1_M2_46:
  986. KERNEL2x1_SUB
  987. subs L, L, #1
  988. bne _L1_M2_46
  989. _L1_M2_100:
  990. SAVE2x1
  991. #if (defined(LEFT) && defined(TRANSA)) || \
  992. (!defined(LEFT) && !defined(TRANSA))
  993. ldr r3 , K
  994. ldr r4 , KKK
  995. sub r3 , r3 , r4
  996. lsls r4 , r3 , #4 // 1 * 8 * 2 double values
  997. add BO , BO , r4
  998. lsls r4 , r3 , #5 // 2 * 8 * 2 double values
  999. add AO , AO , r4
  1000. #endif
  1001. #if defined(LEFT)
  1002. ldr r3 , KK
  1003. add r3 , r3 , #2 // number of values in AO
  1004. str r3 , KK
  1005. #endif
  1006. _L1_M2_END:
  1007. subs I, I, #1
  1008. bne _L1_M2_20
  1009. _L1_M1_BEGIN:
  1010. ldr I, M
  1011. tst I, #1 // I = I % 2
  1012. ble _L1_END
  1013. _L1_M1_20:
  1014. INIT1x1
  1015. #if (defined(LEFT) && defined(TRANSA)) || \
  1016. (!defined(LEFT) && !defined(TRANSA))
  1017. mov BO, BC
  1018. #else
  1019. mov BO, BC
  1020. ldr r3 , KK
  1021. lsls r4 , r3 , #4 // 1 * 8 * 2 double values
  1022. add BO , BO , r4
  1023. lsls r4 , r3 , #4 // 1 * 8 * 2 double values
  1024. add AO , AO , r4
  1025. #endif
  1026. #ifndef TRMMKERNEL
  1027. ldr K1, K
  1028. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1029. ldr K1, K
  1030. ldr r3, KK
  1031. sub K1, K1, r3
  1032. str K1, KKK
  1033. #else
  1034. ldr K1, KK
  1035. #ifdef LEFT
  1036. add K1, K1, #1 // number of values in AO
  1037. #else
  1038. add K1, K1, #1 // number of values in BO
  1039. #endif
  1040. str K1, KKK
  1041. #endif
  1042. asrs L , K1, #3 // L = L / 8
  1043. ble _L1_M1_40
  1044. _L1_M1_22:
  1045. KERNEL1x1_SUB
  1046. KERNEL1x1_SUB
  1047. KERNEL1x1_SUB
  1048. KERNEL1x1_SUB
  1049. KERNEL1x1_SUB
  1050. KERNEL1x1_SUB
  1051. KERNEL1x1_SUB
  1052. KERNEL1x1_SUB
  1053. subs L, L, #1
  1054. bgt _L1_M1_22
  1055. _L1_M1_40:
  1056. ands L , K1, #7 // L = L % 8
  1057. ble _L1_M1_100
  1058. _L1_M1_42:
  1059. KERNEL1x1_SUB
  1060. subs L, L, #1
  1061. bgt _L1_M1_42
  1062. _L1_M1_100:
  1063. SAVE1x1
  1064. #if (defined(LEFT) && defined(TRANSA)) || \
  1065. (!defined(LEFT) && !defined(TRANSA))
  1066. ldr r3 , K
  1067. ldr r4 , KKK
  1068. sub r3 , r3 , r4
  1069. lsls r4 , r3 , #4 // 1 * 8 * 2 double values
  1070. add BO , BO , r4
  1071. lsls r4 , r3 , #4 // 1 * 8 * 2 double values
  1072. add AO , AO , r4
  1073. #endif
  1074. #if defined(LEFT)
  1075. ldr r3 , KK
  1076. add r3 , r3 , #1 // number of values in AO
  1077. str r3 , KK
  1078. #endif
  1079. _L1_END:
  1080. _L999:
  1081. sub r3, fp, #128
  1082. vldm r3, { d8 - d15} // restore floating point registers
  1083. movs r0, #0 // set return value
  1084. sub sp, fp, #24
  1085. pop {r4 - r9, fp}
  1086. bx lr
  1087. EPILOGUE