You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

cgemm_kernel_2x2_vfpv3.S 21 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/05 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. * 2013/11/01 Saar
  34. * UNROLL_N 2
  35. * UNROLL_M 2
  36. * CGEMM_P 96
  37. * CGEMM_Q 120
  38. * CGEMM_R 4096
  39. * A_PRE 96
  40. * B_PRE 96
  41. * C_PRE 64
  42. *
  43. * Performance on Odroid U2:
  44. *
  45. * 1 Core: 2.59 GFLOPS ATLAS: 2.37 GFLOPS
  46. * 2 Cores: 5.17 GFLOPS ATLAS: 4.46 GFLOPS
  47. * 3 Cores: 7.69 GFLOPS ATLAS: 6.50 GFLOPS
  48. * 4 Cores: 10.22 GFLOPS ATLAS: 8.18 GFLOPS
  49. **************************************************************************************/
  50. #define ASSEMBLER
  51. #include "common.h"
  /******************************************************
  * Stack frame
  *
  * The prologue pushes r4-r9 and fp, points fp at the saved
  * fp slot and then lowers sp by STACKSIZE bytes.
  *******************************************************/
  #define STACKSIZE       256

  /* Incoming register arguments, spilled to the frame by the prologue. */
  #define OLD_M           r0
  #define OLD_N           r1
  #define OLD_K           r2
  #define OLD_A           r3
  #define OLD_ALPHA_R     s0
  #define OLD_ALPHA_I     s1

  /******************************************************
  * The prologue stores s8 - s31 (24 registers, 96 bytes)
  * starting at [fp, #-128], so [fp, #-128] up to [fp, #-32]
  * is reserved for saving and restoring floating point
  * registers.
  *******************************************************/

  /* Frame-local copies of the arguments. ALPHA_R is the lowest slot:  */
  /* fp - 280 is where sp ends up after the prologue.                  */
  #define A               [fp, #-248 ]
  #define LDC             [fp, #-252 ]
  #define M               [fp, #-256 ]
  #define N               [fp, #-260 ]
  #define K               [fp, #-264 ]
  #define ALPHA_I         [fp, #-272]
  #define ALPHA_R         [fp, #-280]

  /* Arguments found above the saved registers, on the caller side of fp. */
  #define B               [fp, #4 ]
  #define C               [fp, #8 ]
  #define OLD_LDC         [fp, #12 ]

  /* Loop counters: I is derived from M, J from N, L from K. */
  #define I               r0
  #define J               r1
  #define L               r2

  /* Working pointers and cached values. */
  #define AO              r5      /* running pointer into A              */
  #define BO              r6      /* running pointer into B              */
  #define CO1             r8      /* first C pointer of the current tile */
  #define CO2             r9      /* CO1 + LDC                           */
  #define K1              r7      /* K, loaded once in the prologue      */
  #define BC              r12     /* start of the current B panel        */

  /* Prefetch distances in bytes, used by the pld instructions. */
  #define A_PRE           96
  #define B_PRE           96
  #define C_PRE           64

  /******************************************************
  * Per-variant arithmetic used by the SAVE macros.
  *
  * FADD_R / FADD_I combine the two partial accumulators of
  * each result component; FMAC_xx fold the alpha-scaled
  * result into the values loaded from C.  vmla.f32 adds the
  * product to the destination, vmls.f32 subtracts it.
  *******************************************************/
  #if defined(NN) || defined(NT) || defined(TN) || defined(TT)

  #define FADD_R          vsub.f32
  #define FADD_I          vadd.f32

  #define FMAC_R1         vmls.f32
  #define FMAC_R2         vmls.f32
  #define FMAC_I1         vmla.f32
  #define FMAC_I2         vmls.f32

  #elif defined(CN) || defined(CT)

  #define FADD_R          vadd.f32
  #define FADD_I          vsub.f32

  #define FMAC_R1         vmla.f32
  #define FMAC_R2         vmla.f32
  #define FMAC_I1         vmls.f32
  #define FMAC_I2         vmla.f32

  #elif defined(NC) || defined(TC)

  #define FADD_R          vadd.f32
  #define FADD_I          vsub.f32

  #define FMAC_R1         vmla.f32
  #define FMAC_R2         vmls.f32
  #define FMAC_I1         vmla.f32
  #define FMAC_I2         vmla.f32

  #else

  #define FADD_R          vsub.f32
  #define FADD_I          vadd.f32

  #define FMAC_R1         vmls.f32
  #define FMAC_R2         vmla.f32
  #define FMAC_I1         vmls.f32
  #define FMAC_I2         vmls.f32

  #endif
  115. /**************************************************************************************
  116. * Macro definitions
  117. **************************************************************************************/
  /*
   * INIT2x2 - clear the sixteen accumulators s16 - s31 of the 2x2 tile.
   *
   * The zero is built in r3 and moved into s16 instead of computing
   * s16 - s16: a self-subtraction yields NaN whenever s16 still holds
   * NaN or infinity, and that NaN would then be copied into every
   * accumulator.
   *
   * Clobbers: r3 (only ever used as a short-lived scratch register in
   * this file), s16 - s31.
   */
  .macro INIT2x2
          mov             r3 , #0
          vmov            s16, r3                 // s16 = +0.0 regardless of its old value
          vmov.f32        s17, s16
          vmov.f32        s18, s16
          vmov.f32        s19, s16
          vmov.f32        s20, s16
          vmov.f32        s21, s16
          vmov.f32        s22, s16
          vmov.f32        s23, s16
          vmov.f32        s24, s16
          vmov.f32        s25, s16
          vmov.f32        s26, s16
          vmov.f32        s27, s16
          vmov.f32        s28, s16
          vmov.f32        s29, s16
          vmov.f32        s30, s16
          vmov.f32        s31, s16
  .endm
  /*
   * KERNEL2x2_I - first step of the software-pipelined 2x2 loop.
   *
   * Loads s0 - s3 from AO and s8 - s11 from BO, writes their products
   * straight into s16 - s31 (plain multiplies, so no prior clearing is
   * needed), and preloads s4 - s7 / s12 - s15 for the KERNEL2x2_M2 or
   * KERNEL2x2_E step that follows.  AO and BO each advance by 32 bytes.
   */
  .macro KERNEL2x2_I
          pld             [ AO, #A_PRE ]
          pld             [ BO, #B_PRE ]
          vldmia          AO!, { s0 - s1 }
          vldmia          BO!, { s8 - s9 }
          vmul.f32        s16, s0, s8
          vmul.f32        s24, s1, s9
          vldmia          AO!, { s2 - s3 }
          vmul.f32        s17, s0, s9
          vmul.f32        s25, s1, s8
          vldmia          BO!, { s10 - s11 }
          vmul.f32        s18, s2, s8
          vmul.f32        s26, s3, s9
          vldmia          AO!, { s4 - s5 }
          vmul.f32        s19, s2, s9
          vmul.f32        s27, s3, s8
          vldmia          BO!, { s12 - s13 }
          vmul.f32        s20, s0, s10
          vmul.f32        s28, s1, s11
          vldmia          AO!, { s6 - s7 }
          vmul.f32        s21, s0, s11
          vmul.f32        s29, s1, s10
          vldmia          BO!, { s14 - s15 }
          vmul.f32        s22, s2, s10
          vmul.f32        s30, s3, s11
          vmul.f32        s23, s2, s11
          vmul.f32        s31, s3, s10
  .endm
  /*
   * KERNEL2x2_M1 - pipelined 2x2 step on the first register bank.
   *
   * Accumulates the products of s0 - s3 (A) with s8 - s11 (B) into
   * s16 - s31 while loading the second bank, s4 - s7 from AO and
   * s12 - s15 from BO.  AO and BO each advance by 16 bytes.
   */
  .macro KERNEL2x2_M1
          vmla.f32        s16, s0, s8
          vldmia          AO!, { s4 - s5 }
          vmla.f32        s24, s1, s9
          vmla.f32        s17, s0, s9
          vldmia          BO!, { s12 - s13 }
          vmla.f32        s25, s1, s8
          vmla.f32        s18, s2, s8
          vldmia          AO!, { s6 - s7 }
          vmla.f32        s26, s3, s9
          vmla.f32        s19, s2, s9
          vldmia          BO!, { s14 - s15 }
          vmla.f32        s27, s3, s8
          vmla.f32        s20, s0, s10
          vmla.f32        s28, s1, s11
          vmla.f32        s21, s0, s11
          vmla.f32        s29, s1, s10
          vmla.f32        s22, s2, s10
          vmla.f32        s30, s3, s11
          vmla.f32        s23, s2, s11
          vmla.f32        s31, s3, s10
  .endm
  /*
   * KERNEL2x2_M2 - pipelined 2x2 step on the second register bank.
   *
   * Accumulates the products of s4 - s7 (A) with s12 - s15 (B) into
   * s16 - s31 while prefetching ahead of AO / BO and reloading the
   * first bank, s0 - s3 and s8 - s11.  AO and BO each advance by
   * 16 bytes.
   */
  .macro KERNEL2x2_M2
          pld             [ AO, #A_PRE ]
          vmla.f32        s16, s4, s12
          pld             [ BO, #B_PRE ]
          vmla.f32        s24, s5, s13
          vmla.f32        s17, s4, s13
          vldmia          AO!, { s0 - s1 }
          vmla.f32        s25, s5, s12
          vmla.f32        s18, s6, s12
          vmla.f32        s26, s7, s13
          vldmia          BO!, { s8 - s9 }
          vmla.f32        s19, s6, s13
          vmla.f32        s27, s7, s12
          vldmia          AO!, { s2 - s3 }
          vmla.f32        s20, s4, s14
          vmla.f32        s28, s5, s15
          vldmia          BO!, { s10 - s11 }
          vmla.f32        s21, s4, s15
          vmla.f32        s29, s5, s14
          vmla.f32        s22, s6, s14
          vmla.f32        s30, s7, s15
          vmla.f32        s23, s6, s15
          vmla.f32        s31, s7, s14
  .endm
  /*
   * KERNEL2x2_E - final step of the pipelined 2x2 loop.
   *
   * Same arithmetic as KERNEL2x2_M2 on s4 - s7 / s12 - s15, but with no
   * loads and no pointer updates, so nothing is read past the data that
   * has already been fetched.
   */
  .macro KERNEL2x2_E
          vmla.f32        s16, s4, s12
          vmla.f32        s24, s5, s13
          vmla.f32        s17, s4, s13
          vmla.f32        s25, s5, s12
          vmla.f32        s18, s6, s12
          vmla.f32        s26, s7, s13
          vmla.f32        s19, s6, s13
          vmla.f32        s27, s7, s12
          vmla.f32        s20, s4, s14
          vmla.f32        s28, s5, s15
          vmla.f32        s21, s4, s15
          vmla.f32        s29, s5, s14
          vmla.f32        s22, s6, s14
          vmla.f32        s30, s7, s15
          vmla.f32        s23, s6, s15
          vmla.f32        s31, s7, s14
  .endm
  /*
   * KERNEL2x2_SUB - one self-contained 2x2 step (no pipelining).
   *
   * Loads s0 - s3 from AO and s8 - s11 from BO and accumulates their
   * products into s16 - s31.  AO and BO each advance by 16 bytes.
   * The accumulators must already hold valid values.
   */
  .macro KERNEL2x2_SUB
          vldmia          AO!, { s0 - s1 }
          vldmia          BO!, { s8 - s9 }
          vmla.f32        s16, s0, s8
          vmla.f32        s24, s1, s9
          vldmia          AO!, { s2 - s3 }
          vmla.f32        s17, s0, s9
          vmla.f32        s25, s1, s8
          vldmia          BO!, { s10 - s11 }
          vmla.f32        s18, s2, s8
          vmla.f32        s26, s3, s9
          vmla.f32        s19, s2, s9
          vmla.f32        s27, s3, s8
          vmla.f32        s20, s0, s10
          vmla.f32        s28, s1, s11
          vmla.f32        s21, s0, s11
          vmla.f32        s29, s1, s10
          vmla.f32        s22, s2, s10
          vmla.f32        s30, s3, s11
          vmla.f32        s23, s2, s11
          vmla.f32        s31, s3, s10
  .endm
  /*
   * SAVE2x2 - fold the 2x2 accumulators into C.
   *
   * Sets CO2 = CO1 + LDC, loads four floats from each of CO1 and CO2,
   * merges the accumulator pairs (s16..s23 with s24..s31) using
   * FADD_R / FADD_I, applies ALPHA_R (s0) and ALPHA_I (s1) through the
   * FMAC_xx operations, stores the eight floats back and advances CO1
   * by 16 bytes.
   *
   * Clobbers: r3, CO2, s0, s1, s4 - s11, s16 - s23.
   */
  .macro SAVE2x2
          pld             [ CO1, #C_PRE ]

          ldr             r3, LDC
          add             CO2, CO1, r3
          vldr            s0, ALPHA_R
          vldr            s1, ALPHA_I

          vldmia          CO1, { s4 - s7 }
          vldmia          CO2, { s8 - s11 }

          FADD_R          s16, s24, s16
          FADD_I          s17, s25, s17
          FADD_R          s18, s26, s18
          FADD_I          s19, s27, s19
          FADD_R          s20, s28, s20
          FADD_I          s21, s29, s21
          FADD_R          s22, s30, s22
          FADD_I          s23, s31, s23

          FMAC_R1         s4 , s0, s16
          FMAC_I1         s5 , s0, s17
          FMAC_R2         s4 , s1, s17
          FMAC_I2         s5 , s1, s16

          FMAC_R1         s6 , s0, s18
          FMAC_I1         s7 , s0, s19
          FMAC_R2         s6 , s1, s19
          FMAC_I2         s7 , s1, s18

          FMAC_R1         s8 , s0, s20
          FMAC_I1         s9 , s0, s21
          FMAC_R2         s8 , s1, s21
          FMAC_I2         s9 , s1, s20

          FMAC_R1         s10, s0, s22
          FMAC_I1         s11, s0, s23
          FMAC_R2         s10, s1, s23
          FMAC_I2         s11, s1, s22

          vstmia          CO1, { s4 - s7 }
          vstmia          CO2, { s8 - s11 }

          add             CO1, CO1, #16
  .endm
  286. /******************************************************************************/
  /*
   * INIT1x2 - clear the eight accumulators used by the 1x2 tile
   * (s16, s17, s20, s21, s24, s25, s28, s29).
   *
   * The zero is built in r3 and moved into s16 instead of computing
   * s16 - s16: a self-subtraction yields NaN whenever s16 still holds
   * NaN or infinity, and that NaN would then be copied into every
   * accumulator.
   *
   * Clobbers: r3 (only ever used as a short-lived scratch register in
   * this file) and the accumulators listed above.
   */
  .macro INIT1x2
          mov             r3 , #0
          vmov            s16, r3                 // s16 = +0.0 regardless of its old value
          vmov.f32        s17, s16
          vmov.f32        s20, s16
          vmov.f32        s21, s16
          vmov.f32        s24, s16
          vmov.f32        s25, s16
          vmov.f32        s28, s16
          vmov.f32        s29, s16
  .endm
  /*
   * KERNEL1x2_I - first step of a pipelined 1x2 loop.
   *
   * Loads s0 - s1 from AO and s8 - s11 from BO, writes their products
   * into the 1x2 accumulators with plain multiplies, then preloads the
   * second bank (s4 - s5, s12 - s15).  In total AO advances by 16 bytes
   * and BO by 32 bytes.
   */
  .macro KERNEL1x2_I
          pld             [ AO, #A_PRE ]
          pld             [ BO, #B_PRE ]
          vldr            s0 , [ AO ]
          vldr            s1 , [ AO, #4 ]
          vldr            s8 , [ BO ]
          vldr            s9 , [ BO, #4 ]
          vldr            s10, [ BO, #8 ]
          vldr            s11, [ BO, #12 ]

          vmul.f32        s16, s0, s8
          vmul.f32        s24, s1, s9
          vmul.f32        s17, s0, s9
          vmul.f32        s25, s1, s8
          vmul.f32        s20, s0, s10
          vmul.f32        s28, s1, s11
          vmul.f32        s21, s0, s11
          vmul.f32        s29, s1, s10

          add             BO, BO, #16
          add             AO, AO, #8

          pld             [ BO, #B_PRE ]
          vldr            s4 , [ AO ]
          vldr            s5 , [ AO, #4 ]
          vldr            s12, [ BO ]
          vldr            s13, [ BO, #4 ]
          vldr            s14, [ BO, #8 ]
          vldr            s15, [ BO, #12 ]

          add             BO, BO, #16
          add             AO, AO, #8
  .endm
  /*
   * KERNEL1x2_M1 - pipelined 1x2 step on the first register bank.
   *
   * Accumulates the products of s0 - s1 with s8 - s11, then loads the
   * second bank (s4 - s5 from AO, s12 - s15 from BO).  AO advances by
   * 8 bytes and BO by 16 bytes.
   */
  .macro KERNEL1x2_M1
          pld             [ BO, #B_PRE ]

          vmla.f32        s16, s0, s8
          vmla.f32        s24, s1, s9
          vmla.f32        s17, s0, s9
          vmla.f32        s25, s1, s8
          vmla.f32        s20, s0, s10
          vmla.f32        s28, s1, s11
          vmla.f32        s21, s0, s11
          vmla.f32        s29, s1, s10

          vldr            s4 , [ AO ]
          vldr            s5 , [ AO, #4 ]
          vldr            s12, [ BO ]
          vldr            s13, [ BO, #4 ]
          vldr            s14, [ BO, #8 ]
          vldr            s15, [ BO, #12 ]

          add             BO, BO, #16
          add             AO, AO, #8
  .endm
  /*
   * KERNEL1x2_M2 - pipelined 1x2 step on the second register bank.
   *
   * Accumulates the products of s4 - s5 with s12 - s15, then reloads
   * the first bank (s0 - s1 from AO, s8 - s11 from BO).  AO advances by
   * 8 bytes and BO by 16 bytes.
   */
  .macro KERNEL1x2_M2
          pld             [ AO, #A_PRE ]
          pld             [ BO, #B_PRE ]

          vmla.f32        s16, s4, s12
          vmla.f32        s24, s5, s13
          vmla.f32        s17, s4, s13
          vmla.f32        s25, s5, s12
          vmla.f32        s20, s4, s14
          vmla.f32        s28, s5, s15
          vmla.f32        s21, s4, s15
          vmla.f32        s29, s5, s14

          vldr            s0 , [ AO ]
          vldr            s1 , [ AO, #4 ]
          vldr            s8 , [ BO ]
          vldr            s9 , [ BO, #4 ]
          vldr            s10, [ BO, #8 ]
          vldr            s11, [ BO, #12 ]

          add             BO, BO, #16
          add             AO, AO, #8
  .endm
  /*
   * KERNEL1x2_E - final step of a pipelined 1x2 loop: accumulates the
   * products of s4 - s5 with s12 - s15 without loading anything or
   * moving AO / BO.
   */
  .macro KERNEL1x2_E
          vmla.f32        s16, s4, s12
          vmla.f32        s24, s5, s13
          vmla.f32        s17, s4, s13
          vmla.f32        s25, s5, s12
          vmla.f32        s20, s4, s14
          vmla.f32        s28, s5, s15
          vmla.f32        s21, s4, s15
          vmla.f32        s29, s5, s14
  .endm
  /*
   * KERNEL1x2_SUB - one self-contained 1x2 step.
   *
   * Loads s0 - s1 from AO and s8 - s11 from BO, accumulates their
   * products into the 1x2 accumulators, then advances AO by 8 bytes
   * and BO by 16 bytes.  The accumulators must already hold valid
   * values.
   */
  .macro KERNEL1x2_SUB
          pld             [ AO, #A_PRE ]
          pld             [ BO, #B_PRE ]
          vldr            s0 , [ AO ]
          vldr            s1 , [ AO, #4 ]
          vldr            s8 , [ BO ]
          vldr            s9 , [ BO, #4 ]
          vldr            s10, [ BO, #8 ]
          vldr            s11, [ BO, #12 ]

          vmla.f32        s16, s0, s8
          vmla.f32        s24, s1, s9
          vmla.f32        s17, s0, s9
          vmla.f32        s25, s1, s8
          vmla.f32        s20, s0, s10
          vmla.f32        s28, s1, s11
          vmla.f32        s21, s0, s11
          vmla.f32        s29, s1, s10

          add             BO, BO, #16
          add             AO, AO, #8
  .endm
  /*
   * SAVE1x2 - fold the 1x2 accumulators into C.
   *
   * Sets CO2 = CO1 + LDC, loads two floats from each of CO1 and CO2,
   * merges the accumulator pairs with FADD_R / FADD_I, applies
   * ALPHA_R (s0) and ALPHA_I (s1) through the FMAC_xx operations,
   * stores the four floats back and advances CO1 by 8 bytes.
   *
   * Clobbers: r3, CO2, s0, s1, s4, s5, s8, s9, s16, s17, s20, s21.
   */
  .macro SAVE1x2
          pld             [ CO1, #C_PRE ]

          ldr             r3, LDC
          add             CO2, CO1, r3
          vldr            s0, ALPHA_R
          vldr            s1, ALPHA_I

          vldmia          CO1, { s4 - s5 }
          vldmia          CO2, { s8 - s9 }

          FADD_R          s16, s24, s16
          FADD_I          s17, s25, s17
          FADD_R          s20, s28, s20
          FADD_I          s21, s29, s21

          FMAC_R1         s4 , s0, s16
          FMAC_I1         s5 , s0, s17
          FMAC_R2         s4 , s1, s17
          FMAC_I2         s5 , s1, s16

          FMAC_R1         s8 , s0, s20
          FMAC_I1         s9 , s0, s21
          FMAC_R2         s8 , s1, s21
          FMAC_I2         s9 , s1, s20

          vstmia          CO1, { s4 - s5 }
          vstmia          CO2, { s8 - s9 }

          add             CO1, CO1, #8
  .endm
  419. /******************************************************************************/
  /*
   * INIT2x1 - clear the eight accumulators used by the 2x1 tile
   * (s16 - s19 and s24 - s27).
   *
   * The zero is built in r3 and moved into s16 instead of computing
   * s16 - s16: a self-subtraction yields NaN whenever s16 still holds
   * NaN or infinity, and that NaN would then be copied into every
   * accumulator.
   *
   * Clobbers: r3 (only ever used as a short-lived scratch register in
   * this file) and the accumulators listed above.
   */
  .macro INIT2x1
          mov             r3 , #0
          vmov            s16, r3                 // s16 = +0.0 regardless of its old value
          vmov.f32        s17, s16
          vmov.f32        s18, s16
          vmov.f32        s19, s16
          vmov.f32        s24, s16
          vmov.f32        s25, s16
          vmov.f32        s26, s16
          vmov.f32        s27, s16
  .endm
  /*
   * KERNEL2x1_I - first step of the pipelined 2x1 loop.
   *
   * Loads s0 - s3 from AO and s8 - s9 from BO, writes their products
   * into the 2x1 accumulators with plain multiplies, then preloads the
   * second bank (s4 - s7, s12 - s13).  In total AO advances by 32 bytes
   * and BO by 16 bytes.
   */
  .macro KERNEL2x1_I
          pld             [ AO, #A_PRE ]
          pld             [ BO, #B_PRE ]
          vldr            s0 , [ AO ]
          vldr            s1 , [ AO, #4 ]
          vldr            s2 , [ AO, #8 ]
          vldr            s3 , [ AO, #12 ]
          vldr            s8 , [ BO ]
          vldr            s9 , [ BO, #4 ]

          vmul.f32        s16, s0, s8
          vmul.f32        s24, s1, s9
          vmul.f32        s17, s0, s9
          vmul.f32        s25, s1, s8
          vmul.f32        s18, s2, s8
          vmul.f32        s26, s3, s9
          vmul.f32        s19, s2, s9
          vmul.f32        s27, s3, s8

          add             BO, BO, #8
          add             AO, AO, #16

          pld             [ BO, #B_PRE ]
          pld             [ AO, #A_PRE ]
          vldr            s4 , [ AO ]
          vldr            s5 , [ AO, #4 ]
          vldr            s6 , [ AO, #8 ]
          vldr            s7 , [ AO, #12 ]
          vldr            s12, [ BO ]
          vldr            s13, [ BO, #4 ]

          add             BO, BO, #8
          add             AO, AO, #16
  .endm
  /*
   * KERNEL2x1_M1 - pipelined 2x1 step on the first register bank.
   *
   * Accumulates the products of s0 - s3 with s8 - s9, then loads the
   * second bank (s4 - s7 from AO, s12 - s13 from BO).  AO advances by
   * 16 bytes and BO by 8 bytes.
   */
  .macro KERNEL2x1_M1
          pld             [ AO, #A_PRE ]
          pld             [ BO, #B_PRE ]

          vmla.f32        s16, s0, s8
          vmla.f32        s24, s1, s9
          vmla.f32        s17, s0, s9
          vmla.f32        s25, s1, s8
          vmla.f32        s18, s2, s8
          vmla.f32        s26, s3, s9
          vmla.f32        s19, s2, s9
          vmla.f32        s27, s3, s8

          vldr            s4 , [ AO ]
          vldr            s5 , [ AO, #4 ]
          vldr            s6 , [ AO, #8 ]
          vldr            s7 , [ AO, #12 ]
          vldr            s12, [ BO ]
          vldr            s13, [ BO, #4 ]

          add             BO, BO, #8
          add             AO, AO, #16
  .endm
  /*
   * KERNEL2x1_M2 - pipelined 2x1 step on the second register bank.
   *
   * Accumulates the products of s4 - s7 with s12 - s13, then reloads
   * the first bank (s0 - s3 from AO, s8 - s9 from BO).  AO advances by
   * 16 bytes and BO by 8 bytes.
   */
  .macro KERNEL2x1_M2
          pld             [ AO, #A_PRE ]
          pld             [ BO, #B_PRE ]

          vmla.f32        s16, s4, s12
          vmla.f32        s24, s5, s13
          vmla.f32        s17, s4, s13
          vmla.f32        s25, s5, s12
          vmla.f32        s18, s6, s12
          vmla.f32        s26, s7, s13
          vmla.f32        s19, s6, s13
          vmla.f32        s27, s7, s12

          vldr            s0 , [ AO ]
          vldr            s1 , [ AO, #4 ]
          vldr            s2 , [ AO, #8 ]
          vldr            s3 , [ AO, #12 ]
          vldr            s8 , [ BO ]
          vldr            s9 , [ BO, #4 ]

          add             BO, BO, #8
          add             AO, AO, #16
  .endm
  /*
   * KERNEL2x1_E - final step of the pipelined 2x1 loop: accumulates the
   * products of s4 - s7 with s12 - s13 without loading anything or
   * moving AO / BO.
   */
  .macro KERNEL2x1_E
          vmla.f32        s16, s4, s12
          vmla.f32        s24, s5, s13
          vmla.f32        s17, s4, s13
          vmla.f32        s25, s5, s12
          vmla.f32        s18, s6, s12
          vmla.f32        s26, s7, s13
          vmla.f32        s19, s6, s13
          vmla.f32        s27, s7, s12
  .endm
  /*
   * KERNEL2x1_SUB - one self-contained 2x1 step.
   *
   * Loads s0 - s3 from AO and s8 - s9 from BO, accumulates their
   * products into the 2x1 accumulators, then advances AO by 16 bytes
   * and BO by 8 bytes.  The accumulators must already hold valid
   * values.
   */
  .macro KERNEL2x1_SUB
          pld             [ AO, #A_PRE ]
          pld             [ BO, #B_PRE ]
          vldr            s0 , [ AO ]
          vldr            s1 , [ AO, #4 ]
          vldr            s2 , [ AO, #8 ]
          vldr            s3 , [ AO, #12 ]
          vldr            s8 , [ BO ]
          vldr            s9 , [ BO, #4 ]

          vmla.f32        s16, s0, s8
          vmla.f32        s24, s1, s9
          vmla.f32        s17, s0, s9
          vmla.f32        s25, s1, s8
          vmla.f32        s18, s2, s8
          vmla.f32        s26, s3, s9
          vmla.f32        s19, s2, s9
          vmla.f32        s27, s3, s8

          add             BO, BO, #8
          add             AO, AO, #16
  .endm
  /*
   * SAVE2x1 - fold the 2x1 accumulators into C.
   *
   * Loads four floats from CO1, merges the accumulator pairs with
   * FADD_R / FADD_I, applies ALPHA_R (s0) and ALPHA_I (s1) through the
   * FMAC_xx operations, stores the four floats back and advances CO1
   * by 16 bytes.
   *
   * Clobbers: s0, s1, s4 - s7, s16 - s19.
   */
  .macro SAVE2x1
          pld             [ CO1, #C_PRE ]

          vldr            s0, ALPHA_R
          vldr            s1, ALPHA_I

          vldmia          CO1, { s4 - s7 }

          FADD_R          s16, s24, s16
          FADD_I          s17, s25, s17
          FADD_R          s18, s26, s18
          FADD_I          s19, s27, s19

          FMAC_R1         s4 , s0, s16
          FMAC_I1         s5 , s0, s17
          FMAC_R2         s4 , s1, s17
          FMAC_I2         s5 , s1, s16

          FMAC_R1         s6 , s0, s18
          FMAC_I1         s7 , s0, s19
          FMAC_R2         s6 , s1, s19
          FMAC_I2         s7 , s1, s18

          vstmia          CO1, { s4 - s7 }

          add             CO1, CO1, #16
  .endm
  550. /******************************************************************************/
  /*
   * INIT1x1 - clear the four accumulators used by the 1x1 tile
   * (s16, s17, s24, s25).
   *
   * The zero is built in r3 and moved into s16 instead of computing
   * s16 - s16: a self-subtraction yields NaN whenever s16 still holds
   * NaN or infinity, and that NaN would then be copied into every
   * accumulator.
   *
   * Clobbers: r3 (only ever used as a short-lived scratch register in
   * this file) and the accumulators listed above.
   */
  .macro INIT1x1
          mov             r3 , #0
          vmov            s16, r3                 // s16 = +0.0 regardless of its old value
          vmov.f32        s17, s16
          vmov.f32        s24, s16
          vmov.f32        s25, s16
  .endm
  /*
   * KERNEL1x1_I - first step of a pipelined 1x1 loop.
   *
   * Loads s0 - s1 from AO and s8 - s9 from BO, writes their products
   * into s16, s17, s24, s25 with plain multiplies, then preloads the
   * second bank (s4 - s5, s12 - s13).  In total AO and BO each advance
   * by 16 bytes.
   */
  .macro KERNEL1x1_I
          pld             [ AO, #A_PRE ]
          pld             [ BO, #B_PRE ]
          vldr            s0 , [ AO ]
          vldr            s1 , [ AO, #4 ]
          vldr            s8 , [ BO ]
          vldr            s9 , [ BO, #4 ]

          vmul.f32        s16, s0, s8
          vmul.f32        s24, s1, s9
          vmul.f32        s17, s0, s9
          vmul.f32        s25, s1, s8

          add             BO, BO, #8
          add             AO, AO, #8

          pld             [ BO, #B_PRE ]
          pld             [ AO, #A_PRE ]
          vldr            s4 , [ AO ]
          vldr            s5 , [ AO, #4 ]
          vldr            s12, [ BO ]
          vldr            s13, [ BO, #4 ]

          add             BO, BO, #8
          add             AO, AO, #8
  .endm
  /*
   * KERNEL1x1_M1 - pipelined 1x1 step on the first register bank.
   *
   * Accumulates the products of s0 - s1 with s8 - s9, then loads the
   * second bank (s4 - s5 from AO, s12 - s13 from BO).  AO and BO each
   * advance by 8 bytes.
   */
  .macro KERNEL1x1_M1
          vmla.f32        s16, s0, s8
          vmla.f32        s24, s1, s9
          vmla.f32        s17, s0, s9
          vmla.f32        s25, s1, s8

          vldr            s4 , [ AO ]
          vldr            s5 , [ AO, #4 ]
          vldr            s12, [ BO ]
          vldr            s13, [ BO, #4 ]

          add             BO, BO, #8
          add             AO, AO, #8
  .endm
  /*
   * KERNEL1x1_M2 - pipelined 1x1 step on the second register bank.
   *
   * Accumulates the products of s4 - s5 with s12 - s13, then reloads
   * the first bank (s0 - s1 from AO, s8 - s9 from BO).  AO and BO each
   * advance by 8 bytes.
   */
  .macro KERNEL1x1_M2
          vmla.f32        s16, s4, s12
          vmla.f32        s24, s5, s13
          vmla.f32        s17, s4, s13
          vmla.f32        s25, s5, s12

          vldr            s0 , [ AO ]
          vldr            s1 , [ AO, #4 ]
          vldr            s8 , [ BO ]
          vldr            s9 , [ BO, #4 ]

          add             BO, BO, #8
          add             AO, AO, #8
  .endm
  /*
   * KERNEL1x1_E - final step of a pipelined 1x1 loop: accumulates the
   * products of s4 - s5 with s12 - s13 without loading anything or
   * moving AO / BO.
   */
  .macro KERNEL1x1_E
          vmla.f32        s16, s4, s12
          vmla.f32        s24, s5, s13
          vmla.f32        s17, s4, s13
          vmla.f32        s25, s5, s12
  .endm
  /*
   * KERNEL1x1_SUB - one self-contained 1x1 step.
   *
   * Loads s0 - s1 from AO and s8 - s9 from BO, accumulates their four
   * products into s16, s24, s17, s25, then advances AO and BO by
   * 8 bytes each.  The accumulators must already hold valid values.
   */
  .macro KERNEL1x1_SUB
          vldr            s0 , [ AO ]
          vldr            s1 , [ AO, #4 ]
          vldr            s8 , [ BO ]
          vldr            s9 , [ BO, #4 ]

          vmla.f32        s16, s0, s8
          vmla.f32        s24, s1, s9
          vmla.f32        s17, s0, s9
          vmla.f32        s25, s1, s8

          add             BO, BO, #8
          add             AO, AO, #8
  .endm
  /*
   * SAVE1x1 - fold the 1x1 accumulators into C.
   *
   * Loads two floats from CO1, merges the accumulator pairs with
   * FADD_R / FADD_I, applies ALPHA_R (s0) and ALPHA_I (s1) through the
   * FMAC_xx operations, stores the two floats back and advances CO1
   * by 8 bytes.
   *
   * Clobbers: s0, s1, s4, s5, s16, s17.
   */
  .macro SAVE1x1
          pld             [ CO1, #C_PRE ]

          vldr            s0, ALPHA_R
          vldr            s1, ALPHA_I

          vldmia          CO1, { s4 - s5 }

          FADD_R          s16, s24, s16
          FADD_I          s17, s25, s17

          FMAC_R1         s4 , s0, s16
          FMAC_I1         s5 , s0, s17
          FMAC_R2         s4 , s1, s17
          FMAC_I2         s5 , s1, s16

          vstmia          CO1, { s4 - s5 }

          add             CO1, CO1, #8
  .endm
  635. /******************************************************************************/
  636. /**************************************************************************************
  637. * End of macro definitions
  638. **************************************************************************************/
  /*
   * Kernel entry point.
   *
   * In:    r0 = M, r1 = N, r2 = K, r3 = A, s0 = alpha (real part),
   *        s1 = alpha (imaginary part); B, C and ldc are read from
   *        [fp, #4], [fp, #8] and [fp, #12] after the prologue.
   * Out:   r0 = 0.
   * Saves: r4 - r9 and fp (push / pop), s8 - s31 (frame save area).
   *
   * Structure: an outer loop over pairs of N (J), then a tail for an odd
   * N; inside each, a loop over pairs of M (I) followed by a tail for an
   * odd M.  The innermost K loop is unrolled eight times, with a
   * remainder loop for K % 8.
   *
   * Flag handling: asr / tst / ands do not write the V flag, while the
   * "le" condition reads it (Z set, or N different from V).  Every
   * zero-or-negative test on a shifted count therefore uses an explicit
   * "cmp #0", and every test of a masked, non-negative value uses "beq",
   * so no branch depends on a V flag left over from earlier code or from
   * the caller.
   */
  PROLOGUE

  .align 5

          push    {r4 - r9, fp}
          add     fp, sp, #24
          sub     sp, sp, #STACKSIZE              // reserve stack

          str     OLD_M, M
          str     OLD_N, N
          str     OLD_K, K
          str     OLD_A, A
          vstr    OLD_ALPHA_R, ALPHA_R
          vstr    OLD_ALPHA_I, ALPHA_I

          sub     r3, fp, #128
          vstm    r3, { s8 - s31 }                // store floating point registers

          ldr     r3, OLD_LDC
          lsl     r3, r3, #3                      // ldc = ldc * 4 * 2
          str     r3, LDC

          ldr     K1, K
          ldr     BC, B

          ldr     J, N
          asr     J, J, #1                        // J = J / 2
          cmp     J, #0                           // defines all flags, including V
          ble     cgemm_kernel_L1_BEGIN

  cgemm_kernel_L2_BEGIN:

          ldr     CO1, C                          // CO1 = C
          ldr     r4 , LDC
          lsl     r4 , r4 , #1                    // LDC * 2
          add     r3 , r4, CO1
          str     r3 , C                          // store C

          ldr     AO, A                           // AO = A
          pld     [AO , #A_PRE-64]
          pld     [AO , #A_PRE-32]

  cgemm_kernel_L2_M2_BEGIN:

          ldr     I, M
          asr     I, I, #1                        // I = I / 2
          cmp     I, #0
          ble     cgemm_kernel_L2_M1_BEGIN

  cgemm_kernel_L2_M2_20:

          mov     BO, BC
          asr     L , K1, #3                      // L = L / 8
          cmp     L , #3
          blt     cgemm_kernel_L2_M2_30
  .align 5

          KERNEL2x2_I
          KERNEL2x2_M2
          KERNEL2x2_M1
          KERNEL2x2_M2

          KERNEL2x2_M1
          KERNEL2x2_M2
          KERNEL2x2_M1
          KERNEL2x2_M2

          sub     L, L, #2                        // first and last block are peeled

  cgemm_kernel_L2_M2_22:

          KERNEL2x2_M1
          KERNEL2x2_M2
          KERNEL2x2_M1
          KERNEL2x2_M2

          KERNEL2x2_M1
          KERNEL2x2_M2
          KERNEL2x2_M1
          KERNEL2x2_M2

          subs    L, L, #1
          bgt     cgemm_kernel_L2_M2_22

          KERNEL2x2_M1
          KERNEL2x2_M2
          KERNEL2x2_M1
          KERNEL2x2_M2

          KERNEL2x2_M1
          KERNEL2x2_M2
          KERNEL2x2_M1
          KERNEL2x2_E

          b       cgemm_kernel_L2_M2_44

  cgemm_kernel_L2_M2_30:
          tst     L, #3
          beq     cgemm_kernel_L2_M2_40           // no full block of 8

          tst     L, #2
          beq     cgemm_kernel_L2_M2_32

          KERNEL2x2_I                             // two blocks of 8
          KERNEL2x2_M2
          KERNEL2x2_M1
          KERNEL2x2_M2

          KERNEL2x2_M1
          KERNEL2x2_M2
          KERNEL2x2_M1
          KERNEL2x2_M2

          KERNEL2x2_M1
          KERNEL2x2_M2
          KERNEL2x2_M1
          KERNEL2x2_M2

          KERNEL2x2_M1
          KERNEL2x2_M2
          KERNEL2x2_M1
          KERNEL2x2_E

          b       cgemm_kernel_L2_M2_44

  cgemm_kernel_L2_M2_32:

          tst     L, #1
          beq     cgemm_kernel_L2_M2_40

          KERNEL2x2_I                             // one block of 8
          KERNEL2x2_M2
          KERNEL2x2_M1
          KERNEL2x2_M2

          KERNEL2x2_M1
          KERNEL2x2_M2
          KERNEL2x2_M1
          KERNEL2x2_E

          b       cgemm_kernel_L2_M2_44

  cgemm_kernel_L2_M2_40:

          INIT2x2

  cgemm_kernel_L2_M2_44:

          ands    L , K1, #7                      // L = L % 8
          beq     cgemm_kernel_L2_M2_100

  cgemm_kernel_L2_M2_46:

          KERNEL2x2_SUB

          subs    L, L, #1
          bne     cgemm_kernel_L2_M2_46

  cgemm_kernel_L2_M2_100:

          SAVE2x2

  cgemm_kernel_L2_M2_END:

          subs    I, I, #1
          bne     cgemm_kernel_L2_M2_20

  cgemm_kernel_L2_M1_BEGIN:

          ldr     I, M
          tst     I, #1                           // I = I % 2
          beq     cgemm_kernel_L2_END

  cgemm_kernel_L2_M1_20:

          INIT1x2

          mov     BO, BC
          asr     L , K1, #3                      // L = L / 8
          cmp     L , #0
          ble     cgemm_kernel_L2_M1_40

  cgemm_kernel_L2_M1_22:

          KERNEL1x2_SUB
          KERNEL1x2_SUB
          KERNEL1x2_SUB
          KERNEL1x2_SUB

          KERNEL1x2_SUB
          KERNEL1x2_SUB
          KERNEL1x2_SUB
          KERNEL1x2_SUB

          subs    L, L, #1
          bgt     cgemm_kernel_L2_M1_22

  cgemm_kernel_L2_M1_40:

          ands    L , K1, #7                      // L = L % 8
          beq     cgemm_kernel_L2_M1_100

  cgemm_kernel_L2_M1_42:

          KERNEL1x2_SUB

          subs    L, L, #1
          bgt     cgemm_kernel_L2_M1_42

  cgemm_kernel_L2_M1_100:

          SAVE1x2

  cgemm_kernel_L2_END:

          mov     r3, BC
          mov     r4, K1
          lsl     r4, r4, #4                      // k * 2 * 4 * 2
          add     r3, r3, r4                      // B = B + K * 2 * 8
          mov     BC, r3

          subs    J , #1                          // j--
          bgt     cgemm_kernel_L2_BEGIN

  /*********************************************************************************************/

  cgemm_kernel_L1_BEGIN:

          ldr     J , N
          tst     J , #1
          beq     cgemm_kernel_L999

          ldr     CO1, C                          // CO1 = C
          ldr     r4 , LDC
          add     r3 , r4, CO1
          str     r3 , C                          // store C

          ldr     AO, A                           // AO = A

  cgemm_kernel_L1_M2_BEGIN:

          ldr     I, M
          asr     I, I, #1                        // I = I / 2
          cmp     I, #0
          ble     cgemm_kernel_L1_M1_BEGIN

  cgemm_kernel_L1_M2_20:

          mov     BO, BC
          asr     L , K1, #3                      // L = L / 8
          cmp     L , #3
          blt     cgemm_kernel_L1_M2_30
  .align 5

          KERNEL2x1_I
          KERNEL2x1_M2
          KERNEL2x1_M1
          KERNEL2x1_M2

          KERNEL2x1_M1
          KERNEL2x1_M2
          KERNEL2x1_M1
          KERNEL2x1_M2

          sub     L, L, #2                        // first and last block are peeled

  cgemm_kernel_L1_M2_22:

          KERNEL2x1_M1
          KERNEL2x1_M2
          KERNEL2x1_M1
          KERNEL2x1_M2

          KERNEL2x1_M1
          KERNEL2x1_M2
          KERNEL2x1_M1
          KERNEL2x1_M2

          subs    L, L, #1
          bgt     cgemm_kernel_L1_M2_22

          KERNEL2x1_M1
          KERNEL2x1_M2
          KERNEL2x1_M1
          KERNEL2x1_M2

          KERNEL2x1_M1
          KERNEL2x1_M2
          KERNEL2x1_M1
          KERNEL2x1_E

          b       cgemm_kernel_L1_M2_44

  cgemm_kernel_L1_M2_30:
          tst     L, #3
          beq     cgemm_kernel_L1_M2_40           // no full block of 8

          tst     L, #2
          beq     cgemm_kernel_L1_M2_32

          KERNEL2x1_I                             // two blocks of 8
          KERNEL2x1_M2
          KERNEL2x1_M1
          KERNEL2x1_M2

          KERNEL2x1_M1
          KERNEL2x1_M2
          KERNEL2x1_M1
          KERNEL2x1_M2

          KERNEL2x1_M1
          KERNEL2x1_M2
          KERNEL2x1_M1
          KERNEL2x1_M2

          KERNEL2x1_M1
          KERNEL2x1_M2
          KERNEL2x1_M1
          KERNEL2x1_E

          b       cgemm_kernel_L1_M2_44

  cgemm_kernel_L1_M2_32:

          tst     L, #1
          beq     cgemm_kernel_L1_M2_40

          KERNEL2x1_I                             // one block of 8
          KERNEL2x1_M2
          KERNEL2x1_M1
          KERNEL2x1_M2

          KERNEL2x1_M1
          KERNEL2x1_M2
          KERNEL2x1_M1
          KERNEL2x1_E

          b       cgemm_kernel_L1_M2_44

  cgemm_kernel_L1_M2_40:

          INIT2x1

  cgemm_kernel_L1_M2_44:

          ands    L , K1, #7                      // L = L % 8
          beq     cgemm_kernel_L1_M2_100

  cgemm_kernel_L1_M2_46:

          KERNEL2x1_SUB

          subs    L, L, #1
          bne     cgemm_kernel_L1_M2_46

  cgemm_kernel_L1_M2_100:

          SAVE2x1

  cgemm_kernel_L1_M2_END:

          subs    I, I, #1
          bne     cgemm_kernel_L1_M2_20

  cgemm_kernel_L1_M1_BEGIN:

          ldr     I, M
          tst     I, #1                           // I = I % 2
          beq     cgemm_kernel_L1_END

  cgemm_kernel_L1_M1_20:

          INIT1x1

          mov     BO, BC
          asr     L , K1, #3                      // L = L / 8
          cmp     L , #0
          ble     cgemm_kernel_L1_M1_40

  cgemm_kernel_L1_M1_22:

          KERNEL1x1_SUB
          KERNEL1x1_SUB
          KERNEL1x1_SUB
          KERNEL1x1_SUB

          KERNEL1x1_SUB
          KERNEL1x1_SUB
          KERNEL1x1_SUB
          KERNEL1x1_SUB

          subs    L, L, #1
          bgt     cgemm_kernel_L1_M1_22

  cgemm_kernel_L1_M1_40:

          ands    L , K1, #7                      // L = L % 8
          beq     cgemm_kernel_L1_M1_100

  cgemm_kernel_L1_M1_42:

          KERNEL1x1_SUB

          subs    L, L, #1
          bgt     cgemm_kernel_L1_M1_42

  cgemm_kernel_L1_M1_100:

          SAVE1x1

  cgemm_kernel_L1_END:

  cgemm_kernel_L999:

          sub     r3, fp, #128
          vldm    r3, { s8 - s31 }                // restore floating point registers

          movs    r0, #0                          // set return value
          sub     sp, fp, #24
          pop     {r4 - r9, fp}
          bx      lr

  EPILOGUE