You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

cgemm_kernel_2x2_vfpv3.S 22 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/05 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. * 2013/11/01 Saar
  34. * UNROLL_N 2
  35. * UNROLL_M 2
  36. * CGEMM_P 96
  37. * CGEMM_Q 120
  38. * CGEMM_R 4096
  39. * A_PRE 96
  40. * B_PRE 96
  41. * C_PRE 64
  42. *
  43. * Performance on Odroid U2:
  44. *
  45. * 1 Core: 2.59 GFLOPS ATLAS: 2.37 GFLOPS
  46. * 2 Cores: 5.17 GFLOPS ATLAS: 4.46 GFLOPS
  47. * 3 Cores: 7.69 GFLOPS ATLAS: 6.50 GFLOPS
  48. * 4 Cores: 10.22 GFLOPS ATLAS: 8.18 GFLOPS
  49. **************************************************************************************/
  50. #define ASSEMBLER
  51. #include "common.h"
  #define STACKSIZE       256

  /* Arguments as they arrive in registers (hard-float layout). */
  #define OLD_M           r0
  #define OLD_N           r1
  #define OLD_K           r2
  #define OLD_A           r3
  #define OLD_ALPHA_R     s0
  #define OLD_ALPHA_I     s1

  /******************************************************
  * Frame layout, relative to fp (fp = sp + 24 after the
  * push of r4-r9 and fp in the prologue):
  *
  *   [fp, #-24]  .. [fp, #0]    saved r4-r9, fp
  *   [fp, #-128] .. [fp, #-33]  save area for s8-s31
  *                              (24 registers, 96 bytes)
  *   [fp, #-240], [fp, #-236]   two zero words
  *   [fp, #-248] .. [fp, #-264] A, LDC, M, N, K
  *   [fp, #-272], [fp, #-280]   alpha (imaginary, real)
  *
  * [fp, #-280] is the lowest address of the STACKSIZE
  * bytes reserved by the prologue.
  *******************************************************/

  #define A               [fp, #-248]
  #define LDC             [fp, #-252]
  #define M               [fp, #-256]
  #define N               [fp, #-260]
  #define K               [fp, #-264]

  /* FP_ZERO and FP_ZERO_0 name the same word. */
  #define FP_ZERO         [fp, #-240]
  #define FP_ZERO_0       [fp, #-240]
  #define FP_ZERO_1       [fp, #-236]

  #define ALPHA_I         [fp, #-272]
  #define ALPHA_R         [fp, #-280]

  /*
   * Stack-passed arguments. Without the VFP calling convention alpha
   * arrives in r3 and on the stack, which moves A, B, C and ldc up.
   */
  #if !defined(__ARM_PCS_VFP)
  #define OLD_ALPHAR_SOFTFP       r3
  #define OLD_ALPHAI_SOFTFP       [fp, #4]
  #define OLD_A_SOFTFP            [fp, #8]
  #define B               [fp, #12]
  #define C               [fp, #16]
  #define OLD_LDC         [fp, #20]
  #else
  #define B               [fp, #4]
  #define C               [fp, #8]
  #define OLD_LDC         [fp, #12]
  #endif

  /* Working registers. I, J and L reuse r0-r2 once M, N, K are spilled. */
  #define I               r0
  #define J               r1
  #define L               r2

  #define AO              r5
  #define BO              r6

  #define CO1             r8
  #define CO2             r9

  #define K1              r7
  #define BC              r12

  /* Prefetch distances in bytes. */
  #define A_PRE           96
  #define B_PRE           96
  #define C_PRE           64
  /*
   * Sign selection for the SAVE macros.
   *
   * FADD_R / FADD_I combine the two partial sums of each real and
   * imaginary part (computed as "second operand OP first operand").
   * FMAC_R1/R2 and FMAC_I1/I2 then add alpha times that result to the
   * real and imaginary parts of C. vmla.f32 accumulates, vmls.f32
   * subtracts, so a negated intermediate is undone by the matching vmls.
   *
   * Worked through, the four groups evaluate to:
   *   NN NT TN TT : a * b
   *   CN CT       : conj(a) * b
   *   NC TC       : a * conj(b)
   *   otherwise   : conj(a) * conj(b)
   */
  #if defined(NN) || defined(NT) || defined(TN) || defined(TT)

  #define FADD_R          vsub.f32
  #define FADD_I          vadd.f32

  #define FMAC_R1         vmls.f32
  #define FMAC_R2         vmls.f32
  #define FMAC_I1         vmla.f32
  #define FMAC_I2         vmls.f32

  #elif defined(CN) || defined(CT)

  #define FADD_R          vadd.f32
  #define FADD_I          vsub.f32

  #define FMAC_R1         vmla.f32
  #define FMAC_R2         vmla.f32
  #define FMAC_I1         vmls.f32
  #define FMAC_I2         vmla.f32

  #elif defined(NC) || defined(TC)

  #define FADD_R          vadd.f32
  #define FADD_I          vsub.f32

  #define FMAC_R1         vmla.f32
  #define FMAC_R2         vmls.f32
  #define FMAC_I1         vmla.f32
  #define FMAC_I2         vmla.f32

  #else

  #define FADD_R          vsub.f32
  #define FADD_I          vadd.f32

  #define FMAC_R1         vmls.f32
  #define FMAC_R2         vmla.f32
  #define FMAC_I1         vmls.f32
  #define FMAC_I2         vmls.f32

  #endif
  127. /**************************************************************************************
  128. * Macro definitions
  129. **************************************************************************************/
  /* Zero the sixteen 2x2 accumulators s16-s31 from the zero word at FP_ZERO. */
  .macro INIT2x2
        vldr        s16, FP_ZERO
        vmov.f32    s17, s16
        vmov.f32    s18, s16
        vmov.f32    s19, s16
        vmov.f32    s20, s16
        vmov.f32    s21, s16
        vmov.f32    s22, s16
        vmov.f32    s23, s16
        vmov.f32    s24, s16
        vmov.f32    s25, s16
        vmov.f32    s26, s16
        vmov.f32    s27, s16
        vmov.f32    s28, s16
        vmov.f32    s29, s16
        vmov.f32    s30, s16
        vmov.f32    s31, s16
  .endm
  /*
   * First step of the pipelined 2x2 loop. Overwrites (vmul) s16-s31 with
   * the products of s0-s3 (from AO) and s8-s11 (from BO), so no INIT is
   * needed, and preloads the operands of the next step into s4-s7 and
   * s12-s15. AO and BO each advance by 32 bytes.
   */
  .macro KERNEL2x2_I
        pld         [AO, #A_PRE]
        pld         [BO, #B_PRE]
        vldmia.f32  AO!, {s0 - s1}
        vldmia.f32  BO!, {s8 - s9}

        vmul.f32    s16, s0, s8
        vmul.f32    s24, s1, s9
        vldmia.f32  AO!, {s2 - s3}
        vmul.f32    s17, s0, s9
        vmul.f32    s25, s1, s8

        vldmia.f32  BO!, {s10 - s11}
        vmul.f32    s18, s2, s8
        vmul.f32    s26, s3, s9
        vldmia.f32  AO!, {s4 - s5}
        vmul.f32    s19, s2, s9
        vmul.f32    s27, s3, s8

        vldmia.f32  BO!, {s12 - s13}
        vmul.f32    s20, s0, s10
        vmul.f32    s28, s1, s11
        vldmia.f32  AO!, {s6 - s7}
        vmul.f32    s21, s0, s11
        vmul.f32    s29, s1, s10

        vldmia.f32  BO!, {s14 - s15}
        vmul.f32    s22, s2, s10
        vmul.f32    s30, s3, s11
        vmul.f32    s23, s2, s11
        vmul.f32    s31, s3, s10
  .endm
  /*
   * Pipelined 2x2 step on register set 0: accumulates s0-s3 times s8-s11
   * into s16-s31 while loading the next operands into s4-s7 / s12-s15.
   * AO and BO each advance by 16 bytes.
   */
  .macro KERNEL2x2_M1
        vmla.f32    s16, s0, s8
        vldmia.f32  AO!, {s4 - s5}
        vmla.f32    s24, s1, s9
        vmla.f32    s17, s0, s9
        vldmia.f32  BO!, {s12 - s13}
        vmla.f32    s25, s1, s8

        vmla.f32    s18, s2, s8
        vldmia.f32  AO!, {s6 - s7}
        vmla.f32    s26, s3, s9
        vmla.f32    s19, s2, s9
        vldmia.f32  BO!, {s14 - s15}
        vmla.f32    s27, s3, s8

        vmla.f32    s20, s0, s10
        vmla.f32    s28, s1, s11
        vmla.f32    s21, s0, s11
        vmla.f32    s29, s1, s10

        vmla.f32    s22, s2, s10
        vmla.f32    s30, s3, s11
        vmla.f32    s23, s2, s11
        vmla.f32    s31, s3, s10
  .endm
  /*
   * Pipelined 2x2 step on register set 1: accumulates s4-s7 times s12-s15
   * into s16-s31 while reloading s0-s3 / s8-s11 for the following M1 step.
   * Issues the A and B prefetches; AO and BO each advance by 16 bytes.
   */
  .macro KERNEL2x2_M2
        pld         [AO, #A_PRE]
        vmla.f32    s16, s4, s12
        pld         [BO, #B_PRE]
        vmla.f32    s24, s5, s13
        vmla.f32    s17, s4, s13
        vldmia.f32  AO!, {s0 - s1}
        vmla.f32    s25, s5, s12

        vmla.f32    s18, s6, s12
        vmla.f32    s26, s7, s13
        vldmia.f32  BO!, {s8 - s9}
        vmla.f32    s19, s6, s13
        vmla.f32    s27, s7, s12

        vldmia.f32  AO!, {s2 - s3}
        vmla.f32    s20, s4, s14
        vmla.f32    s28, s5, s15
        vldmia.f32  BO!, {s10 - s11}
        vmla.f32    s21, s4, s15
        vmla.f32    s29, s5, s14

        vmla.f32    s22, s6, s14
        vmla.f32    s30, s7, s15
        vmla.f32    s23, s6, s15
        vmla.f32    s31, s7, s14
  .endm
  /*
   * Pipeline drain for the 2x2 loop: same arithmetic as KERNEL2x2_M2
   * (s4-s7 times s12-s15) but with no loads, prefetches or pointer updates.
   */
  .macro KERNEL2x2_E
        vmla.f32    s16, s4, s12
        vmla.f32    s24, s5, s13
        vmla.f32    s17, s4, s13
        vmla.f32    s25, s5, s12

        vmla.f32    s18, s6, s12
        vmla.f32    s26, s7, s13
        vmla.f32    s19, s6, s13
        vmla.f32    s27, s7, s12

        vmla.f32    s20, s4, s14
        vmla.f32    s28, s5, s15
        vmla.f32    s21, s4, s15
        vmla.f32    s29, s5, s14

        vmla.f32    s22, s6, s14
        vmla.f32    s30, s7, s15
        vmla.f32    s23, s6, s15
        vmla.f32    s31, s7, s14
  .endm
  /*
   * Stand-alone 2x2 step for the K remainder: loads s0-s3 from AO and
   * s8-s11 from BO, then accumulates into s16-s31. The accumulators must
   * already hold valid values. AO and BO each advance by 16 bytes.
   */
  .macro KERNEL2x2_SUB
        vldmia.f32  AO!, {s0 - s1}
        vldmia.f32  BO!, {s8 - s9}

        vmla.f32    s16, s0, s8
        vmla.f32    s24, s1, s9
        vldmia.f32  AO!, {s2 - s3}
        vmla.f32    s17, s0, s9
        vmla.f32    s25, s1, s8

        vldmia.f32  BO!, {s10 - s11}
        vmla.f32    s18, s2, s8
        vmla.f32    s26, s3, s9
        vmla.f32    s19, s2, s9
        vmla.f32    s27, s3, s8

        vmla.f32    s20, s0, s10
        vmla.f32    s28, s1, s11
        vmla.f32    s21, s0, s11
        vmla.f32    s29, s1, s10

        vmla.f32    s22, s2, s10
        vmla.f32    s30, s3, s11
        vmla.f32    s23, s2, s11
        vmla.f32    s31, s3, s10
  .endm
  /*
   * Write back a 2x2 tile. Reduces the partial sums in s16-s31, scales by
   * alpha (s0 = ALPHA_R, s1 = ALPHA_I) and accumulates into the four floats
   * at CO1 and the four floats at CO2 = CO1 + LDC. Clobbers r3, s0, s1,
   * s4-s11 and s16-s23. CO1 advances by 16 bytes.
   */
  .macro SAVE2x2
        pld         [CO1, #C_PRE]

        ldr         r3, LDC
        add         CO2, CO1, r3
        vldr        s0, ALPHA_R
        vldr        s1, ALPHA_I

        vldmia.f32  CO1, {s4 - s7}
        vldmia.f32  CO2, {s8 - s11}

        FADD_R      s16, s24, s16
        FADD_I      s17, s25, s17
        FADD_R      s18, s26, s18
        FADD_I      s19, s27, s19
        FADD_R      s20, s28, s20
        FADD_I      s21, s29, s21
        FADD_R      s22, s30, s22
        FADD_I      s23, s31, s23

        FMAC_R1     s4, s0, s16
        FMAC_I1     s5, s0, s17
        FMAC_R2     s4, s1, s17
        FMAC_I2     s5, s1, s16

        FMAC_R1     s6, s0, s18
        FMAC_I1     s7, s0, s19
        FMAC_R2     s6, s1, s19
        FMAC_I2     s7, s1, s18

        FMAC_R1     s8, s0, s20
        FMAC_I1     s9, s0, s21
        FMAC_R2     s8, s1, s21
        FMAC_I2     s9, s1, s20

        FMAC_R1     s10, s0, s22
        FMAC_I1     s11, s0, s23
        FMAC_R2     s10, s1, s23
        FMAC_I2     s11, s1, s22

        vstmia.f32  CO1, {s4 - s7}
        vstmia.f32  CO2, {s8 - s11}

        add         CO1, CO1, #16
  .endm
  298. /******************************************************************************/
  /* Zero the eight 1x2 accumulators (s16, s17, s20, s21, s24, s25, s28, s29). */
  .macro INIT1x2
        vldr        s16, FP_ZERO
        vmov.f32    s17, s16
        vmov.f32    s20, s16
        vmov.f32    s21, s16
        vmov.f32    s24, s16
        vmov.f32    s25, s16
        vmov.f32    s28, s16
        vmov.f32    s29, s16
  .endm
  /*
   * First step of a pipelined 1x2 loop: overwrites the accumulators with
   * the products of s0-s1 (AO) and s8-s11 (BO), then preloads the next
   * operands into s4-s5 / s12-s15. In total AO advances by 16 bytes and
   * BO by 32 bytes. Not invoked by the driver code visible in this file.
   */
  .macro KERNEL1x2_I
        pld         [AO, #A_PRE]
        pld         [BO, #B_PRE]
        vldr        s0, [AO]
        vldr        s1, [AO, #4]
        vldr        s8, [BO]
        vldr        s9, [BO, #4]
        vldr        s10, [BO, #8]
        vldr        s11, [BO, #12]

        vmul.f32    s16, s0, s8
        vmul.f32    s24, s1, s9
        vmul.f32    s17, s0, s9
        vmul.f32    s25, s1, s8

        vmul.f32    s20, s0, s10
        vmul.f32    s28, s1, s11
        vmul.f32    s21, s0, s11
        vmul.f32    s29, s1, s10

        add         BO, BO, #16
        add         AO, AO, #8

        pld         [BO, #B_PRE]

        vldr        s4, [AO]
        vldr        s5, [AO, #4]
        vldr        s12, [BO]
        vldr        s13, [BO, #4]
        vldr        s14, [BO, #8]
        vldr        s15, [BO, #12]

        add         BO, BO, #16
        add         AO, AO, #8
  .endm
  /*
   * Pipelined 1x2 step on register set 0: accumulates s0-s1 times s8-s11,
   * then loads s4-s5 / s12-s15 for the next step. AO advances by 8 bytes,
   * BO by 16 bytes.
   */
  .macro KERNEL1x2_M1
        pld         [BO, #B_PRE]

        vmla.f32    s16, s0, s8
        vmla.f32    s24, s1, s9
        vmla.f32    s17, s0, s9
        vmla.f32    s25, s1, s8

        vmla.f32    s20, s0, s10
        vmla.f32    s28, s1, s11
        vmla.f32    s21, s0, s11
        vmla.f32    s29, s1, s10

        vldr        s4, [AO]
        vldr        s5, [AO, #4]
        vldr        s12, [BO]
        vldr        s13, [BO, #4]
        vldr        s14, [BO, #8]
        vldr        s15, [BO, #12]

        add         BO, BO, #16
        add         AO, AO, #8
  .endm
  /*
   * Pipelined 1x2 step on register set 1: accumulates s4-s5 times s12-s15,
   * then reloads s0-s1 / s8-s11. AO advances by 8 bytes, BO by 16 bytes.
   */
  .macro KERNEL1x2_M2
        pld         [AO, #A_PRE]
        pld         [BO, #B_PRE]

        vmla.f32    s16, s4, s12
        vmla.f32    s24, s5, s13
        vmla.f32    s17, s4, s13
        vmla.f32    s25, s5, s12

        vmla.f32    s20, s4, s14
        vmla.f32    s28, s5, s15
        vmla.f32    s21, s4, s15
        vmla.f32    s29, s5, s14

        vldr        s0, [AO]
        vldr        s1, [AO, #4]
        vldr        s8, [BO]
        vldr        s9, [BO, #4]
        vldr        s10, [BO, #8]
        vldr        s11, [BO, #12]

        add         BO, BO, #16
        add         AO, AO, #8
  .endm
  /* Pipeline drain for the 1x2 loop: the KERNEL1x2_M2 arithmetic with no loads. */
  .macro KERNEL1x2_E
        vmla.f32    s16, s4, s12
        vmla.f32    s24, s5, s13
        vmla.f32    s17, s4, s13
        vmla.f32    s25, s5, s12

        vmla.f32    s20, s4, s14
        vmla.f32    s28, s5, s15
        vmla.f32    s21, s4, s15
        vmla.f32    s29, s5, s14
  .endm
  /*
   * Stand-alone 1x2 step: loads two floats from AO and four from BO and
   * accumulates their products. AO advances by 8 bytes, BO by 16 bytes.
   */
  .macro KERNEL1x2_SUB
        pld         [AO, #A_PRE]
        pld         [BO, #B_PRE]

        vldr        s0, [AO]
        vldr        s1, [AO, #4]
        vldr        s8, [BO]
        vldr        s9, [BO, #4]
        vldr        s10, [BO, #8]
        vldr        s11, [BO, #12]

        vmla.f32    s16, s0, s8
        vmla.f32    s24, s1, s9
        vmla.f32    s17, s0, s9
        vmla.f32    s25, s1, s8

        vmla.f32    s20, s0, s10
        vmla.f32    s28, s1, s11
        vmla.f32    s21, s0, s11
        vmla.f32    s29, s1, s10

        add         BO, BO, #16
        add         AO, AO, #8
  .endm
  /*
   * Write back a 1x2 tile: one complex value at CO1 and one at
   * CO2 = CO1 + LDC, each updated with alpha times the reduced sums.
   * Clobbers r3, s0, s1, s4, s5, s8, s9. CO1 advances by 8 bytes.
   */
  .macro SAVE1x2
        pld         [CO1, #C_PRE]

        ldr         r3, LDC
        add         CO2, CO1, r3
        vldr        s0, ALPHA_R
        vldr        s1, ALPHA_I

        vldmia.f32  CO1, {s4 - s5}
        vldmia.f32  CO2, {s8 - s9}

        FADD_R      s16, s24, s16
        FADD_I      s17, s25, s17
        FADD_R      s20, s28, s20
        FADD_I      s21, s29, s21

        FMAC_R1     s4, s0, s16
        FMAC_I1     s5, s0, s17
        FMAC_R2     s4, s1, s17
        FMAC_I2     s5, s1, s16

        FMAC_R1     s8, s0, s20
        FMAC_I1     s9, s0, s21
        FMAC_R2     s8, s1, s21
        FMAC_I2     s9, s1, s20

        vstmia.f32  CO1, {s4 - s5}
        vstmia.f32  CO2, {s8 - s9}

        add         CO1, CO1, #8
  .endm
  431. /******************************************************************************/
  /* Zero the eight 2x1 accumulators (s16-s19 and s24-s27). */
  .macro INIT2x1
        vldr        s16, FP_ZERO
        vmov.f32    s17, s16
        vmov.f32    s18, s16
        vmov.f32    s19, s16
        vmov.f32    s24, s16
        vmov.f32    s25, s16
        vmov.f32    s26, s16
        vmov.f32    s27, s16
  .endm
  /*
   * First step of the pipelined 2x1 loop: overwrites the accumulators with
   * the products of s0-s3 (AO) and s8-s9 (BO), then preloads the next
   * operands into s4-s7 / s12-s13. In total AO advances by 32 bytes and
   * BO by 16 bytes.
   */
  .macro KERNEL2x1_I
        pld         [AO, #A_PRE]
        pld         [BO, #B_PRE]
        vldr        s0, [AO]
        vldr        s1, [AO, #4]
        vldr        s2, [AO, #8]
        vldr        s3, [AO, #12]
        vldr        s8, [BO]
        vldr        s9, [BO, #4]

        vmul.f32    s16, s0, s8
        vmul.f32    s24, s1, s9
        vmul.f32    s17, s0, s9
        vmul.f32    s25, s1, s8

        vmul.f32    s18, s2, s8
        vmul.f32    s26, s3, s9
        vmul.f32    s19, s2, s9
        vmul.f32    s27, s3, s8

        add         BO, BO, #8
        add         AO, AO, #16

        pld         [BO, #B_PRE]
        pld         [AO, #A_PRE]

        vldr        s4, [AO]
        vldr        s5, [AO, #4]
        vldr        s6, [AO, #8]
        vldr        s7, [AO, #12]
        vldr        s12, [BO]
        vldr        s13, [BO, #4]

        add         BO, BO, #8
        add         AO, AO, #16
  .endm
  /*
   * Pipelined 2x1 step on register set 0: accumulates s0-s3 times s8-s9,
   * then loads s4-s7 / s12-s13. AO advances by 16 bytes, BO by 8 bytes.
   */
  .macro KERNEL2x1_M1
        pld         [AO, #A_PRE]
        pld         [BO, #B_PRE]

        vmla.f32    s16, s0, s8
        vmla.f32    s24, s1, s9
        vmla.f32    s17, s0, s9
        vmla.f32    s25, s1, s8

        vmla.f32    s18, s2, s8
        vmla.f32    s26, s3, s9
        vmla.f32    s19, s2, s9
        vmla.f32    s27, s3, s8

        vldr        s4, [AO]
        vldr        s5, [AO, #4]
        vldr        s6, [AO, #8]
        vldr        s7, [AO, #12]
        vldr        s12, [BO]
        vldr        s13, [BO, #4]

        add         BO, BO, #8
        add         AO, AO, #16
  .endm
  /*
   * Pipelined 2x1 step on register set 1: accumulates s4-s7 times s12-s13,
   * then reloads s0-s3 / s8-s9. AO advances by 16 bytes, BO by 8 bytes.
   */
  .macro KERNEL2x1_M2
        pld         [AO, #A_PRE]
        pld         [BO, #B_PRE]

        vmla.f32    s16, s4, s12
        vmla.f32    s24, s5, s13
        vmla.f32    s17, s4, s13
        vmla.f32    s25, s5, s12

        vmla.f32    s18, s6, s12
        vmla.f32    s26, s7, s13
        vmla.f32    s19, s6, s13
        vmla.f32    s27, s7, s12

        vldr        s0, [AO]
        vldr        s1, [AO, #4]
        vldr        s2, [AO, #8]
        vldr        s3, [AO, #12]
        vldr        s8, [BO]
        vldr        s9, [BO, #4]

        add         BO, BO, #8
        add         AO, AO, #16
  .endm
  /* Pipeline drain for the 2x1 loop: the KERNEL2x1_M2 arithmetic with no loads. */
  .macro KERNEL2x1_E
        vmla.f32    s16, s4, s12
        vmla.f32    s24, s5, s13
        vmla.f32    s17, s4, s13
        vmla.f32    s25, s5, s12

        vmla.f32    s18, s6, s12
        vmla.f32    s26, s7, s13
        vmla.f32    s19, s6, s13
        vmla.f32    s27, s7, s12
  .endm
  /*
   * Stand-alone 2x1 step for the K remainder: loads four floats from AO
   * and two from BO and accumulates their products. AO advances by
   * 16 bytes, BO by 8 bytes.
   */
  .macro KERNEL2x1_SUB
        pld         [AO, #A_PRE]
        pld         [BO, #B_PRE]

        vldr        s0, [AO]
        vldr        s1, [AO, #4]
        vldr        s2, [AO, #8]
        vldr        s3, [AO, #12]
        vldr        s8, [BO]
        vldr        s9, [BO, #4]

        vmla.f32    s16, s0, s8
        vmla.f32    s24, s1, s9
        vmla.f32    s17, s0, s9
        vmla.f32    s25, s1, s8

        vmla.f32    s18, s2, s8
        vmla.f32    s26, s3, s9
        vmla.f32    s19, s2, s9
        vmla.f32    s27, s3, s8

        add         BO, BO, #8
        add         AO, AO, #16
  .endm
  /*
   * Write back a 2x1 tile: the four floats at CO1 are updated with alpha
   * times the reduced sums. Clobbers s0, s1, s4-s7 and s16-s19.
   * CO1 advances by 16 bytes.
   */
  .macro SAVE2x1
        pld         [CO1, #C_PRE]

        vldr        s0, ALPHA_R
        vldr        s1, ALPHA_I

        vldmia.f32  CO1, {s4 - s7}

        FADD_R      s16, s24, s16
        FADD_I      s17, s25, s17
        FADD_R      s18, s26, s18
        FADD_I      s19, s27, s19

        FMAC_R1     s4, s0, s16
        FMAC_I1     s5, s0, s17
        FMAC_R2     s4, s1, s17
        FMAC_I2     s5, s1, s16

        FMAC_R1     s6, s0, s18
        FMAC_I1     s7, s0, s19
        FMAC_R2     s6, s1, s19
        FMAC_I2     s7, s1, s18

        vstmia.f32  CO1, {s4 - s7}

        add         CO1, CO1, #16
  .endm
  562. /******************************************************************************/
  /* Zero the four 1x1 accumulators (s16, s17, s24, s25). */
  .macro INIT1x1
        vldr        s16, FP_ZERO
        vmov.f32    s17, s16
        vmov.f32    s24, s16
        vmov.f32    s25, s16
  .endm
  /*
   * First step of a pipelined 1x1 loop: overwrites the accumulators with
   * the products of s0-s1 (AO) and s8-s9 (BO), then preloads s4-s5 /
   * s12-s13. In total AO and BO each advance by 16 bytes.
   * Not invoked by the driver code visible in this file.
   */
  .macro KERNEL1x1_I
        pld         [AO, #A_PRE]
        pld         [BO, #B_PRE]
        vldr        s0, [AO]
        vldr        s1, [AO, #4]
        vldr        s8, [BO]
        vldr        s9, [BO, #4]

        vmul.f32    s16, s0, s8
        vmul.f32    s24, s1, s9
        vmul.f32    s17, s0, s9
        vmul.f32    s25, s1, s8

        add         BO, BO, #8
        add         AO, AO, #8

        pld         [BO, #B_PRE]
        pld         [AO, #A_PRE]

        vldr        s4, [AO]
        vldr        s5, [AO, #4]
        vldr        s12, [BO]
        vldr        s13, [BO, #4]

        add         BO, BO, #8
        add         AO, AO, #8
  .endm
  /*
   * Pipelined 1x1 step on register set 0: accumulates s0-s1 times s8-s9,
   * then loads s4-s5 / s12-s13. AO and BO each advance by 8 bytes.
   */
  .macro KERNEL1x1_M1
        vmla.f32    s16, s0, s8
        vmla.f32    s24, s1, s9
        vmla.f32    s17, s0, s9
        vmla.f32    s25, s1, s8

        vldr        s4, [AO]
        vldr        s5, [AO, #4]
        vldr        s12, [BO]
        vldr        s13, [BO, #4]

        add         BO, BO, #8
        add         AO, AO, #8
  .endm
  /*
   * Pipelined 1x1 step on register set 1: accumulates s4-s5 times s12-s13,
   * then reloads s0-s1 / s8-s9. AO and BO each advance by 8 bytes.
   */
  .macro KERNEL1x1_M2
        vmla.f32    s16, s4, s12
        vmla.f32    s24, s5, s13
        vmla.f32    s17, s4, s13
        vmla.f32    s25, s5, s12

        vldr        s0, [AO]
        vldr        s1, [AO, #4]
        vldr        s8, [BO]
        vldr        s9, [BO, #4]

        add         BO, BO, #8
        add         AO, AO, #8
  .endm
  /* Pipeline drain for the 1x1 loop: the KERNEL1x1_M2 arithmetic with no loads. */
  .macro KERNEL1x1_E
        vmla.f32    s16, s4, s12
        vmla.f32    s24, s5, s13
        vmla.f32    s17, s4, s13
        vmla.f32    s25, s5, s12
  .endm
  /*
   * Stand-alone 1x1 step: loads two floats from AO and two from BO and
   * accumulates the four cross products. AO and BO each advance by 8 bytes.
   */
  .macro KERNEL1x1_SUB
        vldr        s0, [AO]
        vldr        s1, [AO, #4]
        vldr        s8, [BO]
        vldr        s9, [BO, #4]

        vmla.f32    s16, s0, s8
        vmla.f32    s24, s1, s9
        vmla.f32    s17, s0, s9
        vmla.f32    s25, s1, s8

        add         BO, BO, #8
        add         AO, AO, #8
  .endm
  /*
   * Write back a 1x1 tile: the two floats at CO1 are updated with alpha
   * times the reduced sums. Clobbers s0, s1, s4, s5, s16, s17.
   * CO1 advances by 8 bytes.
   */
  .macro SAVE1x1
        pld         [CO1, #C_PRE]

        vldr        s0, ALPHA_R
        vldr        s1, ALPHA_I

        vldmia.f32  CO1, {s4 - s5}

        FADD_R      s16, s24, s16
        FADD_I      s17, s25, s17

        FMAC_R1     s4, s0, s16
        FMAC_I1     s5, s0, s17
        FMAC_R2     s4, s1, s17
        FMAC_I2     s5, s1, s16

        vstmia.f32  CO1, {s4 - s5}

        add         CO1, CO1, #8
  .endm
  647. /******************************************************************************/
  648. /**************************************************************************************
  649. * End of macro definitions
  650. **************************************************************************************/
  /*
   * Kernel entry point (the symbol itself comes from PROLOGUE in common.h).
   *
   * In:   r0 = M, r1 = N, r2 = K, r3 = A, s0/s1 = alpha (real, imaginary);
   *       B, C and ldc on the stack. Without __ARM_PCS_VFP alpha arrives in
   *       r3 and on the stack and A is read from the stack instead.
   * Out:  r0 = 0.
   * Work: for every 2x2 / 1x2 / 2x1 / 1x1 tile of C, accumulates alpha times
   *       the K-long product of the A and B streams into C. A and B are read
   *       strictly sequentially (presumably pre-packed panels; confirm
   *       against the caller). ldc is scaled by 8 bytes per element.
   * Saves and restores r4-r9, fp and s8-s31; uses r12 as scratch.
   *
   * Flag discipline: tst, ands and asrs do not write the V flag, while the
   * LE condition reads it. A bit test is therefore followed by beq, and a
   * signed "count <= 0" test uses an explicit cmp, so that no branch
   * depends on a V value left over from earlier code or from the caller.
   */
  PROLOGUE

        .align 5

        push        {r4 - r9, fp}
        add         fp, sp, #24
        sub         sp, sp, #STACKSIZE          // reserve stack

  #if !defined(__ARM_PCS_VFP)
        vmov        OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
        vldr        OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
        ldr         OLD_A, OLD_A_SOFTFP
  #endif

        /* Spill the arguments so that r0-r3 can be reused. */
        str         OLD_M, M
        str         OLD_N, N
        str         OLD_K, K
        str         OLD_A, A
        vstr        OLD_ALPHA_R, ALPHA_R
        vstr        OLD_ALPHA_I, ALPHA_I

        sub         r3, fp, #128
        vstm        r3, {s8 - s31}              // store floating point registers

        movs        r4, #0
        str         r4, FP_ZERO
        str         r4, FP_ZERO_1

        ldr         r3, OLD_LDC
        lsl         r3, r3, #3                  // ldc = ldc * 4 * 2
        str         r3, LDC

        ldr         K1, K
        ldr         BC, B

        ldr         J, N
        asr         J, J, #1                    // J = N / 2
        cmp         J, #0                       // sets V, unlike asrs
        ble         cgemm_kernel_L1_BEGIN

  /************************ two columns of C per pass ************************/

  cgemm_kernel_L2_BEGIN:

        ldr         CO1, C                      // CO1 = C
        ldr         r4, LDC
        lsl         r4, r4, #1                  // LDC * 2
        add         r3, r4, CO1
        str         r3, C                       // store C

        ldr         AO, A                       // AO = A
        pld         [AO, #A_PRE-64]
        pld         [AO, #A_PRE-32]

  cgemm_kernel_L2_M2_BEGIN:

        ldr         I, M
        asr         I, I, #1                    // I = M / 2
        cmp         I, #0
        ble         cgemm_kernel_L2_M1_BEGIN

  cgemm_kernel_L2_M2_20:

        mov         BO, BC
        asrs        L, K1, #3                   // L = K / 8
        cmp         L, #3
        blt         cgemm_kernel_L2_M2_30
        .align 5

        /* L >= 3: pipelined prologue (8 steps), L-2 loop passes, epilogue. */
        KERNEL2x2_I
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_M2

        KERNEL2x2_M1
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_M2

        sub         L, L, #2

  cgemm_kernel_L2_M2_22:

        KERNEL2x2_M1
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_M2

        KERNEL2x2_M1
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_M2

        subs        L, L, #1
        bgt         cgemm_kernel_L2_M2_22

        KERNEL2x2_M1
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_M2

        KERNEL2x2_M1
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_E

        b           cgemm_kernel_L2_M2_44

  cgemm_kernel_L2_M2_30:

        /* L < 3: straight-line code for 16 steps (bit 1) or 8 steps (bit 0). */
        tst         L, #3
        beq         cgemm_kernel_L2_M2_40

        tst         L, #2
        beq         cgemm_kernel_L2_M2_32

        KERNEL2x2_I
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_M2

        KERNEL2x2_M1
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_M2

        KERNEL2x2_M1
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_M2

        KERNEL2x2_M1
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_E

        b           cgemm_kernel_L2_M2_44

  cgemm_kernel_L2_M2_32:

        tst         L, #1
        beq         cgemm_kernel_L2_M2_40

        KERNEL2x2_I
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_M2

        KERNEL2x2_M1
        KERNEL2x2_M2
        KERNEL2x2_M1
        KERNEL2x2_E

        b           cgemm_kernel_L2_M2_44

  cgemm_kernel_L2_M2_40:

        /* No unrolled block ran, so the accumulators must be cleared here. */
        INIT2x2

  cgemm_kernel_L2_M2_44:

        ands        L, K1, #7                   // L = K % 8
        beq         cgemm_kernel_L2_M2_100

  cgemm_kernel_L2_M2_46:

        KERNEL2x2_SUB

        subs        L, L, #1
        bne         cgemm_kernel_L2_M2_46

  cgemm_kernel_L2_M2_100:

        SAVE2x2

  cgemm_kernel_L2_M2_END:

        subs        I, I, #1
        bne         cgemm_kernel_L2_M2_20

  cgemm_kernel_L2_M1_BEGIN:

        ldr         I, M
        tst         I, #1                       // is M odd?
        beq         cgemm_kernel_L2_END

  cgemm_kernel_L2_M1_20:

        INIT1x2

        mov         BO, BC
        asr         L, K1, #3                   // L = K / 8
        cmp         L, #0
        ble         cgemm_kernel_L2_M1_40

  cgemm_kernel_L2_M1_22:

        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB

        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB

        subs        L, L, #1
        bgt         cgemm_kernel_L2_M1_22

  cgemm_kernel_L2_M1_40:

        ands        L, K1, #7                   // L = K % 8
        beq         cgemm_kernel_L2_M1_100

  cgemm_kernel_L2_M1_42:

        KERNEL1x2_SUB

        subs        L, L, #1
        bgt         cgemm_kernel_L2_M1_42

  cgemm_kernel_L2_M1_100:

        SAVE1x2

  cgemm_kernel_L2_END:

        mov         r3, BC
        mov         r4, K1
        lsl         r4, r4, #4                  // k * 2 * 4 * 2
        add         r3, r3, r4                  // B = B + K * 2 * 8
        mov         BC, r3

        subs        J, #1                       // j--
        bgt         cgemm_kernel_L2_BEGIN

  /********************* remaining single column of C ************************/

  cgemm_kernel_L1_BEGIN:

        ldr         J, N
        tst         J, #1                       // is N odd?
        beq         cgemm_kernel_L999

        ldr         CO1, C                      // CO1 = C
        ldr         r4, LDC
        add         r3, r4, CO1
        str         r3, C                       // store C

        ldr         AO, A                       // AO = A

  cgemm_kernel_L1_M2_BEGIN:

        ldr         I, M
        asr         I, I, #1                    // I = M / 2
        cmp         I, #0
        ble         cgemm_kernel_L1_M1_BEGIN

  cgemm_kernel_L1_M2_20:

        mov         BO, BC
        asrs        L, K1, #3                   // L = K / 8
        cmp         L, #3
        blt         cgemm_kernel_L1_M2_30
        .align 5

        KERNEL2x1_I
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_M2

        KERNEL2x1_M1
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_M2

        sub         L, L, #2

  cgemm_kernel_L1_M2_22:

        KERNEL2x1_M1
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_M2

        KERNEL2x1_M1
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_M2

        subs        L, L, #1
        bgt         cgemm_kernel_L1_M2_22

        KERNEL2x1_M1
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_M2

        KERNEL2x1_M1
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_E

        b           cgemm_kernel_L1_M2_44

  cgemm_kernel_L1_M2_30:

        tst         L, #3
        beq         cgemm_kernel_L1_M2_40

        tst         L, #2
        beq         cgemm_kernel_L1_M2_32

        KERNEL2x1_I
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_M2

        KERNEL2x1_M1
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_M2

        KERNEL2x1_M1
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_M2

        KERNEL2x1_M1
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_E

        b           cgemm_kernel_L1_M2_44

  cgemm_kernel_L1_M2_32:

        tst         L, #1
        beq         cgemm_kernel_L1_M2_40

        KERNEL2x1_I
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_M2

        KERNEL2x1_M1
        KERNEL2x1_M2
        KERNEL2x1_M1
        KERNEL2x1_E

        b           cgemm_kernel_L1_M2_44

  cgemm_kernel_L1_M2_40:

        INIT2x1

  cgemm_kernel_L1_M2_44:

        ands        L, K1, #7                   // L = K % 8
        beq         cgemm_kernel_L1_M2_100

  cgemm_kernel_L1_M2_46:

        KERNEL2x1_SUB

        subs        L, L, #1
        bne         cgemm_kernel_L1_M2_46

  cgemm_kernel_L1_M2_100:

        SAVE2x1

  cgemm_kernel_L1_M2_END:

        subs        I, I, #1
        bne         cgemm_kernel_L1_M2_20

  cgemm_kernel_L1_M1_BEGIN:

        ldr         I, M
        tst         I, #1                       // is M odd?
        beq         cgemm_kernel_L1_END

  cgemm_kernel_L1_M1_20:

        INIT1x1

        mov         BO, BC
        asr         L, K1, #3                   // L = K / 8
        cmp         L, #0
        ble         cgemm_kernel_L1_M1_40

  cgemm_kernel_L1_M1_22:

        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB

        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB

        subs        L, L, #1
        bgt         cgemm_kernel_L1_M1_22

  cgemm_kernel_L1_M1_40:

        ands        L, K1, #7                   // L = K % 8
        beq         cgemm_kernel_L1_M1_100

  cgemm_kernel_L1_M1_42:

        KERNEL1x1_SUB

        subs        L, L, #1
        bgt         cgemm_kernel_L1_M1_42

  cgemm_kernel_L1_M1_100:

        SAVE1x1

  cgemm_kernel_L1_END:

  cgemm_kernel_L999:

        sub         r3, fp, #128
        vldm        r3, {s8 - s31}              // restore floating point registers

        movs        r0, #0                      // set return value
        sub         sp, fp, #24
        pop         {r4 - r9, fp}
        bx          lr

  EPILOGUE