You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

sgemm_kernel_4x4_vfpv3.S 22 kB


  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/23 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. *
  34. * 2013/11/02 Saar
  35. * UNROLL_N 4
  36. * UNROLL_M 4
  37. * DGEMM_P 128
  38. * DGEMM_Q 240
  39. * DGEMM_R 12288
  40. * A_PRE 128
  41. * B_PRE 128
  42. * C_PRE 32
  43. *
  44. * Performance on Odroid U2:
  45. *
  46. * 3072x3072 1 Core: 2.62 GFLOPS ATLAS: 2.69 GFLOPS
  47. * 3072x3072 2 Cores: 5.23 GFLOPS ATLAS: 5.27 GFLOPS
  48. * 3072x3072 3 Cores: 7.78 GFLOPS ATLAS: 7.87 GFLOPS
  49. * 3072x3072 4 Cores: 10.10 GFLOPS ATLAS: 9.98 GFLOPS
  50. **************************************************************************************/
  /* Define ASSEMBLER before pulling in the shared project header. */
  #define ASSEMBLER
  #include "common.h"

  /* Bytes of scratch space reserved below the pushed core registers. */
  #define STACKSIZE   256

  /* Values taken from registers on entry. The prologue spills them to the frame. */
  #define OLD_M       r0
  #define OLD_N       r1
  #define OLD_K       r2
  #define OLD_A       r3
  #define OLD_ALPHA   s0

  /******************************************************
  * Frame layout relative to fp (the prologue sets fp = sp + 24
  * right after pushing r4 - r9 and fp):
  *
  *   [fp, #-128] .. [fp, #-36]   save area for s8 - s31
  *                               (24 words = 96 bytes, written by vstm)
  *   [fp, #-240], [fp, #-236]    two zero words used to clear accumulators
  *   [fp, #-252] .. [fp, #-268]  LDC (in bytes), M, N, K, A
  *   [fp, #-280]                 alpha
  *
  * Positive offsets address words above the pushed registers.
  *******************************************************/
  #define LDC         [fp, #-252]
  #define M           [fp, #-256]
  #define N           [fp, #-260]
  #define K           [fp, #-264]
  #define A           [fp, #-268]

  #define FP_ZERO     [fp, #-240]
  #define FP_ZERO_0   [fp, #-240]
  #define FP_ZERO_1   [fp, #-236]

  #define ALPHA       [fp, #-280]

  /* Words located above the pushed registers (presumably the stack-passed arguments). */
  #define B           [fp, #4]
  #define C           [fp, #8]
  #define OLD_LDC     [fp, #12]

  /* Loop counters. They reuse r0 - r2, which is why M, N and K are spilled first. */
  #define I           r0
  #define J           r1
  #define L           r2

  /* Working pointers: AO/BO walk the A and B data, CO1/CO2 address C columns. */
  #define AO          r5
  #define BO          r6
  #define CO1         r8
  #define CO2         r9

  /* K1 caches K, BC holds the start of the current B panel. */
  #define K1          r7
  #define BC          r12

  /* Prefetch distances in bytes used by the pld instructions. */
  #define A_PRE       128
  #define B_PRE       128
  #define C_PRE       32
  88. /**************************************************************************************
  89. * Macro definitions
  90. **************************************************************************************/
  /*
   * INIT4x4 - clear the 4x4 accumulator tile.
   * Loads the zero word kept in the FP_ZERO frame slot into s16 and copies
   * it into s17 - s31, so all sixteen accumulators start at zero.
   */
  .macro INIT4x4
          vldr        s16, FP_ZERO
          vmov.f32    s17, s16
          vmov.f32    s18, s16
          vmov.f32    s19, s16
          vmov.f32    s20, s16
          vmov.f32    s21, s16
          vmov.f32    s22, s16
          vmov.f32    s23, s16
          vmov.f32    s24, s16
          vmov.f32    s25, s16
          vmov.f32    s26, s16
          vmov.f32    s27, s16
          vmov.f32    s28, s16
          vmov.f32    s29, s16
          vmov.f32    s30, s16
          vmov.f32    s31, s16
  .endm
  /*
   * KERNEL4x4_I - first stage of the pipelined 4x4 loop.
   * Reads two k-steps from AO (s0 - s3, then s4 - s7) and from BO
   * (s8 - s11, then s12 - s15), advancing each pointer by 32 bytes.
   * The accumulators s16 - s31 are written with plain multiplies of the
   * first k-step, so no INIT4x4 is needed beforehand.
   * Accumulator s(16 + 4*j + i) holds a[i] * b[j].
   */
  .macro KERNEL4x4_I
          pld         [AO, #A_PRE]
          vldmia      AO!, {s0 - s1}
          pld         [BO, #B_PRE]
          vldmia      BO!, {s8 - s9}

          vmul.f32    s16, s0, s8
          vldmia      AO!, {s2 - s3}
          vmul.f32    s17, s1, s8
          vmul.f32    s18, s2, s8
          vldmia      BO!, {s10 - s11}
          vmul.f32    s19, s3, s8

          vmul.f32    s20, s0, s9
          vldmia      AO!, {s4 - s5}
          vmul.f32    s21, s1, s9
          vmul.f32    s22, s2, s9
          vldmia      AO!, {s6 - s7}
          vmul.f32    s23, s3, s9

          vmul.f32    s24, s0, s10
          vldmia      BO!, {s12 - s13}
          vmul.f32    s25, s1, s10
          vmul.f32    s26, s2, s10
          vldmia      BO!, {s14 - s15}
          vmul.f32    s27, s3, s10

          vmul.f32    s28, s0, s11
          vmul.f32    s29, s1, s11
          vmul.f32    s30, s2, s11
          vmul.f32    s31, s3, s11
  .endm
  /*
   * KERNEL4x4_M2 - middle stage working on the second register set.
   * Accumulates s4 - s7 times s12 - s15 into s16 - s31 while loading the
   * next k-step into s0 - s3 (from AO) and s8 - s11 (from BO).
   * Each pointer advances by 16 bytes.
   */
  .macro KERNEL4x4_M2
          pld         [AO, #A_PRE]
          vmla.f32    s16, s4, s12
          vmla.f32    s17, s5, s12
          vldmia      AO!, {s0 - s3}
          vmla.f32    s18, s6, s12
          pld         [BO, #B_PRE]
          vmla.f32    s19, s7, s12

          vmla.f32    s20, s4, s13
          vldmia      BO!, {s8 - s11}
          vmla.f32    s21, s5, s13
          vmla.f32    s22, s6, s13
          vmla.f32    s23, s7, s13

          vmla.f32    s24, s4, s14
          vmla.f32    s25, s5, s14
          vmla.f32    s26, s6, s14
          vmla.f32    s27, s7, s14

          vmla.f32    s28, s4, s15
          vmla.f32    s29, s5, s15
          vmla.f32    s30, s6, s15
          vmla.f32    s31, s7, s15
  .endm
  /*
   * KERNEL4x4_M1 - middle stage working on the first register set.
   * Accumulates s0 - s3 times s8 - s11 into s16 - s31 while loading the
   * next k-step into s4 - s7 (from AO) and s12 - s15 (from BO).
   * Each pointer advances by 16 bytes.
   */
  .macro KERNEL4x4_M1
          vmla.f32    s16, s0, s8
          vldmia      AO!, {s4 - s7}
          vmla.f32    s17, s1, s8
          vmla.f32    s18, s2, s8
          vldmia      BO!, {s12 - s15}
          vmla.f32    s19, s3, s8

          vmla.f32    s20, s0, s9
          vmla.f32    s21, s1, s9
          vmla.f32    s22, s2, s9
          vmla.f32    s23, s3, s9

          vmla.f32    s24, s0, s10
          vmla.f32    s25, s1, s10
          vmla.f32    s26, s2, s10
          vmla.f32    s27, s3, s10

          vmla.f32    s28, s0, s11
          vmla.f32    s29, s1, s11
          vmla.f32    s30, s2, s11
          vmla.f32    s31, s3, s11
  .endm
  /*
   * KERNEL4x4_E - final stage of the pipelined 4x4 loop.
   * Accumulates the already loaded s4 - s7 times s12 - s15 into s16 - s31.
   * Performs no loads and leaves AO and BO untouched.
   */
  .macro KERNEL4x4_E
          vmla.f32    s16, s4, s12
          vmla.f32    s17, s5, s12
          vmla.f32    s18, s6, s12
          vmla.f32    s19, s7, s12

          vmla.f32    s20, s4, s13
          vmla.f32    s21, s5, s13
          vmla.f32    s22, s6, s13
          vmla.f32    s23, s7, s13

          vmla.f32    s24, s4, s14
          vmla.f32    s25, s5, s14
          vmla.f32    s26, s6, s14
          vmla.f32    s27, s7, s14

          vmla.f32    s28, s4, s15
          vmla.f32    s29, s5, s15
          vmla.f32    s30, s6, s15
          vmla.f32    s31, s7, s15
  .endm
  /*
   * KERNEL4x4_SUB - one stand-alone k-step of the 4x4 tile.
   * Loads four floats from AO (s0 - s3) and four from BO (s8 - s11),
   * accumulates all sixteen products into s16 - s31, then advances both
   * pointers by 16 bytes. Loads are interleaved with the multiply-adds.
   */
  .macro KERNEL4x4_SUB
          vldr        s8, [BO]
          vldr        s0, [AO]
          vldr        s1, [AO, #4]
          vmla.f32    s16, s0, s8
          vldr        s2, [AO, #8]
          vmla.f32    s17, s1, s8
          vldr        s3, [AO, #12]
          vmla.f32    s18, s2, s8
          vldr        s9, [BO, #4]
          vmla.f32    s19, s3, s8

          vldr        s10, [BO, #8]
          vmla.f32    s20, s0, s9
          vldr        s11, [BO, #12]
          vmla.f32    s21, s1, s9
          vmla.f32    s22, s2, s9
          vmla.f32    s23, s3, s9

          vmla.f32    s24, s0, s10
          vmla.f32    s25, s1, s10
          vmla.f32    s26, s2, s10
          vmla.f32    s27, s3, s10

          vmla.f32    s28, s0, s11
          vmla.f32    s29, s1, s11
          add         AO, AO, #16
          vmla.f32    s30, s2, s11
          add         BO, BO, #16
          vmla.f32    s31, s3, s11
  .endm
  /*
   * SAVE4x4 - write back the 4x4 tile: C += alpha * accumulator.
   * Four columns are addressed LDC bytes apart starting at CO1
   * (CO1, CO2 = CO1 + LDC, r4 = CO2 + LDC, then CO2 = r4 + LDC).
   * Loads, multiply-adds and stores of neighbouring columns are
   * interleaved. CO1 is advanced by 16 bytes at the end.
   * Overwrites r3, r4, CO2, s0 and s8 - s15.
   */
  .macro SAVE4x4
          ldr         r3, LDC
          add         CO2, CO1, r3
          vldr        s0, ALPHA
          add         r4, CO2, r3

          /* column 0 (CO1), with the column 1 loads mixed in */
          vldmia      CO1, {s8 - s11}
          vmla.f32    s8, s0, s16
          vldr        s12, [CO2]
          vmla.f32    s9, s0, s17
          vldr        s13, [CO2, #4]
          vmla.f32    s10, s0, s18
          vldr        s14, [CO2, #8]
          vmla.f32    s11, s0, s19
          vldr        s15, [CO2, #12]

          /* column 1 (CO2) math, column 0 stores */
          vmla.f32    s12, s0, s20
          vstr        s8, [CO1]
          vmla.f32    s13, s0, s21
          vstr        s9, [CO1, #4]
          vmla.f32    s14, s0, s22
          vstr        s10, [CO1, #8]
          vmla.f32    s15, s0, s23
          vstr        s11, [CO1, #12]
          pld         [CO1, #C_PRE]

          /* column 2 (r4) math, column 1 stores */
          vldmia      r4, {s8 - s11}
          vmla.f32    s8, s0, s24
          vstr        s12, [CO2]
          vmla.f32    s9, s0, s25
          vstr        s13, [CO2, #4]
          vmla.f32    s10, s0, s26
          vstr        s14, [CO2, #8]
          vmla.f32    s11, s0, s27
          vstr        s15, [CO2, #12]
          pld         [CO2, #C_PRE]

          /* column 3 (CO2 = r4 + LDC) math, column 2 stores */
          add         CO2, r4, r3
          vldmia      CO2, {s12 - s15}
          vstr        s8, [r4]
          vmla.f32    s12, s0, s28
          vstr        s9, [r4, #4]
          vmla.f32    s13, s0, s29
          vstr        s10, [r4, #8]
          vmla.f32    s14, s0, s30
          vstr        s11, [r4, #12]
          vmla.f32    s15, s0, s31
          pld         [r4, #C_PRE]
          vstmia      CO2, {s12 - s15}
          pld         [CO2, #C_PRE]

          add         CO1, CO1, #16
  .endm
  277. /******************************************************************************/
  /*
   * INIT2x4 - clear the eight accumulators of the 2x4 tile
   * (s16, s17, s20, s21, s24, s25, s28, s29) from the FP_ZERO frame slot.
   */
  .macro INIT2x4
          vldr        s16, FP_ZERO
          vmov.f32    s17, s16
          vmov.f32    s20, s16
          vmov.f32    s21, s16
          vmov.f32    s24, s16
          vmov.f32    s25, s16
          vmov.f32    s28, s16
          vmov.f32    s29, s16
  .endm
  /*
   * KERNEL2x4_SUB - one k-step of the 2x4 tile.
   * Loads four floats from BO (s8 - s11) and two from AO (s0, s1),
   * accumulates the eight products, then advances AO by 8 and BO by
   * 16 bytes.
   */
  .macro KERNEL2x4_SUB
          vldr        s8, [BO]
          vldr        s9, [BO, #4]
          vldr        s10, [BO, #8]
          vldr        s11, [BO, #12]

          vldr        s0, [AO]
          vldr        s1, [AO, #4]

          vmla.f32    s16, s0, s8
          vmla.f32    s17, s1, s8
          vmla.f32    s20, s0, s9
          vmla.f32    s21, s1, s9
          vmla.f32    s24, s0, s10
          vmla.f32    s25, s1, s10
          vmla.f32    s28, s0, s11
          vmla.f32    s29, s1, s11

          add         AO, AO, #8
          add         BO, BO, #16
  .endm
  /*
   * SAVE2x4 - write back the 2x4 tile: C += alpha * accumulator.
   * Two floats are updated in each of four columns LDC bytes apart,
   * starting at CO1. CO1 is advanced by 8 bytes at the end.
   * Overwrites r3, r4, CO2, s0, s8, s9, s12 and s13.
   */
  .macro SAVE2x4
          ldr         r3, LDC
          add         CO2, CO1, r3
          add         r4, CO2, r3
          vldr        s0, ALPHA

          /* column 0 */
          vldr        s8, [CO1]
          vldr        s9, [CO1, #4]
          vmla.f32    s8, s0, s16
          vmla.f32    s9, s0, s17
          vstr        s8, [CO1]
          vstr        s9, [CO1, #4]

          /* column 1 */
          vldr        s12, [CO2]
          vldr        s13, [CO2, #4]
          vmla.f32    s12, s0, s20
          vmla.f32    s13, s0, s21
          vstr        s12, [CO2]
          vstr        s13, [CO2, #4]

          /* column 2 */
          vldr        s8, [r4]
          vldr        s9, [r4, #4]
          vmla.f32    s8, s0, s24
          vmla.f32    s9, s0, s25
          vstr        s8, [r4]
          vstr        s9, [r4, #4]

          /* column 3 */
          add         CO2, r4, r3
          vldr        s12, [CO2]
          vldr        s13, [CO2, #4]
          vmla.f32    s12, s0, s28
          vmla.f32    s13, s0, s29
          vstr        s12, [CO2]
          vstr        s13, [CO2, #4]

          add         CO1, CO1, #8
  .endm
  338. /******************************************************************************/
  /*
   * INIT1x4 - clear the four accumulators of the 1x4 tile
   * (s16, s20, s24, s28) from the FP_ZERO frame slot.
   */
  .macro INIT1x4
          vldr        s16, FP_ZERO
          vmov.f32    s20, s16
          vmov.f32    s24, s16
          vmov.f32    s28, s16
  .endm
  /*
   * KERNEL1x4_SUB - one k-step of the 1x4 tile.
   * Loads four floats from BO (s8 - s11) and one from AO (s0),
   * accumulates the four products, then advances AO by 4 and BO by
   * 16 bytes.
   */
  .macro KERNEL1x4_SUB
          vldr        s8, [BO]
          vldr        s9, [BO, #4]
          vldr        s10, [BO, #8]
          vldr        s11, [BO, #12]

          vldr        s0, [AO]

          vmla.f32    s16, s0, s8
          vmla.f32    s20, s0, s9
          vmla.f32    s24, s0, s10
          vmla.f32    s28, s0, s11

          add         AO, AO, #4
          add         BO, BO, #16
  .endm
  /*
   * SAVE1x4 - write back the 1x4 tile: C += alpha * accumulator.
   * One float is updated in each of four columns LDC bytes apart,
   * starting at CO1. CO1 is advanced by 4 bytes at the end.
   * Overwrites r3, r4, CO2, s0, s8 and s12.
   */
  .macro SAVE1x4
          ldr         r3, LDC
          add         CO2, CO1, r3
          add         r4, CO2, r3
          vldr        s0, ALPHA

          /* column 0 */
          vldr        s8, [CO1]
          vmla.f32    s8, s0, s16
          vstr        s8, [CO1]

          /* column 1 */
          vldr        s12, [CO2]
          vmla.f32    s12, s0, s20
          vstr        s12, [CO2]

          /* column 2 */
          vldr        s8, [r4]
          vmla.f32    s8, s0, s24
          vstr        s8, [r4]

          /* column 3 */
          add         CO2, r4, r3
          vldr        s12, [CO2]
          vmla.f32    s12, s0, s28
          vstr        s12, [CO2]

          add         CO1, CO1, #4
  .endm
  378. /******************************************************************************/
  379. /******************************************************************************/
  /*
   * INIT4x2 - clear the eight accumulators of the 4x2 tile (s16 - s23)
   * from the FP_ZERO frame slot.
   */
  .macro INIT4x2
          vldr        s16, FP_ZERO
          vmov.f32    s17, s16
          vmov.f32    s18, s16
          vmov.f32    s19, s16
          vmov.f32    s20, s16
          vmov.f32    s21, s16
          vmov.f32    s22, s16
          vmov.f32    s23, s16
  .endm
  /*
   * KERNEL4x2_SUB - one k-step of the 4x2 tile.
   * Loads two floats from BO (s8, s9) and four from AO (s0 - s3),
   * accumulates the eight products, then advances AO by 16 and BO by
   * 8 bytes.
   */
  .macro KERNEL4x2_SUB
          vldr        s8, [BO]
          vldr        s9, [BO, #4]

          vldr        s0, [AO]
          vldr        s1, [AO, #4]
          vldr        s2, [AO, #8]
          vldr        s3, [AO, #12]

          vmla.f32    s16, s0, s8
          vmla.f32    s17, s1, s8
          vmla.f32    s18, s2, s8
          vmla.f32    s19, s3, s8

          vmla.f32    s20, s0, s9
          vmla.f32    s21, s1, s9
          vmla.f32    s22, s2, s9
          vmla.f32    s23, s3, s9

          add         AO, AO, #16
          add         BO, BO, #8
  .endm
  /*
   * SAVE4x2 - write back the 4x2 tile: C += alpha * accumulator.
   * Four floats are updated at CO1 and four at CO2 = CO1 + LDC.
   * CO1 is advanced by 16 bytes at the end.
   * Overwrites r3, CO2, s0 and s8 - s15.
   */
  .macro SAVE4x2
          ldr         r3, LDC
          add         CO2, CO1, r3
          vldr        s0, ALPHA

          /* column 0 */
          vldr        s8, [CO1]
          vldr        s9, [CO1, #4]
          vldr        s10, [CO1, #8]
          vldr        s11, [CO1, #12]
          vmla.f32    s8, s0, s16
          vmla.f32    s9, s0, s17
          vmla.f32    s10, s0, s18
          vmla.f32    s11, s0, s19
          vstr        s8, [CO1]
          vstr        s9, [CO1, #4]
          vstr        s10, [CO1, #8]
          vstr        s11, [CO1, #12]

          /* column 1 */
          vldr        s12, [CO2]
          vldr        s13, [CO2, #4]
          vldr        s14, [CO2, #8]
          vldr        s15, [CO2, #12]
          vmla.f32    s12, s0, s20
          vmla.f32    s13, s0, s21
          vmla.f32    s14, s0, s22
          vmla.f32    s15, s0, s23
          vstr        s12, [CO2]
          vstr        s13, [CO2, #4]
          vstr        s14, [CO2, #8]
          vstr        s15, [CO2, #12]

          add         CO1, CO1, #16
  .endm
  438. /******************************************************************************/
  /*
   * INIT2x2 - clear the four accumulators of the 2x2 tile
   * (s16, s17, s20, s21) from the FP_ZERO frame slot.
   */
  .macro INIT2x2
          vldr        s16, FP_ZERO
          vmov.f32    s17, s16
          vmov.f32    s20, s16
          vmov.f32    s21, s16
  .endm
  /*
   * KERNEL2x2_SUB - one k-step of the 2x2 tile.
   * Loads two floats from BO (s8, s9) and two from AO (s0, s1),
   * accumulates the four products, then advances both pointers by
   * 8 bytes.
   */
  .macro KERNEL2x2_SUB
          vldr        s8, [BO]
          vldr        s9, [BO, #4]

          vldr        s0, [AO]
          vldr        s1, [AO, #4]

          vmla.f32    s16, s0, s8
          vmla.f32    s17, s1, s8
          vmla.f32    s20, s0, s9
          vmla.f32    s21, s1, s9

          add         AO, AO, #8
          add         BO, BO, #8
  .endm
  /*
   * SAVE2x2 - write back the 2x2 tile: C += alpha * accumulator.
   * Two floats are updated at CO1 and two at CO2 = CO1 + LDC.
   * CO1 is advanced by 8 bytes at the end.
   * Overwrites r3, CO2, s0, s8, s9, s12 and s13.
   */
  .macro SAVE2x2
          ldr         r3, LDC
          add         CO2, CO1, r3
          vldr        s0, ALPHA

          /* column 0 */
          vldr        s8, [CO1]
          vldr        s9, [CO1, #4]
          vmla.f32    s8, s0, s16
          vmla.f32    s9, s0, s17
          vstr        s8, [CO1]
          vstr        s9, [CO1, #4]

          /* column 1 */
          vldr        s12, [CO2]
          vldr        s13, [CO2, #4]
          vmla.f32    s12, s0, s20
          vmla.f32    s13, s0, s21
          vstr        s12, [CO2]
          vstr        s13, [CO2, #4]

          add         CO1, CO1, #8
  .endm
  475. /******************************************************************************/
  /*
   * INIT1x2 - clear the two accumulators of the 1x2 tile (s16, s20)
   * from the FP_ZERO frame slot.
   */
  .macro INIT1x2
          vldr        s16, FP_ZERO
          vmov.f32    s20, s16
  .endm
  /*
   * KERNEL1x2_SUB - one k-step of the 1x2 tile.
   * Loads two floats from BO (s8, s9) and one from AO (s0), accumulates
   * the two products, then advances AO by 4 and BO by 8 bytes.
   */
  .macro KERNEL1x2_SUB
          vldr        s8, [BO]
          vldr        s9, [BO, #4]

          vldr        s0, [AO]

          vmla.f32    s16, s0, s8
          vmla.f32    s20, s0, s9

          add         AO, AO, #4
          add         BO, BO, #8
  .endm
  /*
   * SAVE1x2 - write back the 1x2 tile: C += alpha * accumulator.
   * One float is updated at CO1 and one at CO2 = CO1 + LDC.
   * CO1 is advanced by 4 bytes at the end.
   * Overwrites r3, CO2, s0, s8 and s12.
   */
  .macro SAVE1x2
          ldr         r3, LDC
          add         CO2, CO1, r3
          vldr        s0, ALPHA

          /* column 0 */
          vldr        s8, [CO1]
          vmla.f32    s8, s0, s16
          vstr        s8, [CO1]

          /* column 1 */
          vldr        s12, [CO2]
          vmla.f32    s12, s0, s20
          vstr        s12, [CO2]

          add         CO1, CO1, #4
  .endm
  501. /******************************************************************************/
  502. /******************************************************************************/
  /*
   * INIT4x1 - clear the four accumulators of the 4x1 tile (s16 - s19)
   * from the FP_ZERO frame slot.
   */
  .macro INIT4x1
          vldr        s16, FP_ZERO
          vmov.f32    s17, s16
          vmov.f32    s18, s16
          vmov.f32    s19, s16
  .endm
  /*
   * KERNEL4x1_SUB - one k-step of the 4x1 tile.
   * Loads one float from BO (s8) and four from AO (s0 - s3), accumulates
   * the four products, then advances AO by 16 and BO by 4 bytes.
   */
  .macro KERNEL4x1_SUB
          vldr        s8, [BO]

          vldr        s0, [AO]
          vldr        s1, [AO, #4]
          vldr        s2, [AO, #8]
          vldr        s3, [AO, #12]

          vmla.f32    s16, s0, s8
          vmla.f32    s17, s1, s8
          vmla.f32    s18, s2, s8
          vmla.f32    s19, s3, s8

          add         AO, AO, #16
          add         BO, BO, #4
  .endm
  /*
   * SAVE4x1 - write back the 4x1 tile: C += alpha * accumulator.
   * Four floats are updated at CO1, which is then advanced by 16 bytes.
   * Overwrites s0 and s8 - s11.
   */
  .macro SAVE4x1
          vldr        s0, ALPHA

          vldr        s8, [CO1]
          vldr        s9, [CO1, #4]
          vldr        s10, [CO1, #8]
          vldr        s11, [CO1, #12]

          vmla.f32    s8, s0, s16
          vmla.f32    s9, s0, s17
          vmla.f32    s10, s0, s18
          vmla.f32    s11, s0, s19

          vstr        s8, [CO1]
          vstr        s9, [CO1, #4]
          vstr        s10, [CO1, #8]
          vstr        s11, [CO1, #12]

          add         CO1, CO1, #16
  .endm
  538. /******************************************************************************/
  /*
   * INIT2x1 - clear the two accumulators of the 2x1 tile (s16, s17)
   * from the FP_ZERO frame slot.
   */
  .macro INIT2x1
          vldr        s16, FP_ZERO
          vmov.f32    s17, s16
  .endm
  /*
   * KERNEL2x1_SUB - one k-step of the 2x1 tile.
   * Loads one float from BO (s8) and two from AO (s0, s1), accumulates
   * the two products, then advances AO by 8 and BO by 4 bytes.
   */
  .macro KERNEL2x1_SUB
          vldr        s8, [BO]

          vldr        s0, [AO]
          vldr        s1, [AO, #4]

          vmla.f32    s16, s0, s8
          vmla.f32    s17, s1, s8

          add         AO, AO, #8
          add         BO, BO, #4
  .endm
  /*
   * SAVE2x1 - write back the 2x1 tile: C += alpha * accumulator.
   * Two floats are updated at CO1, which is then advanced by 8 bytes.
   * Overwrites s0, s8 and s9.
   */
  .macro SAVE2x1
          vldr        s0, ALPHA

          vldr        s8, [CO1]
          vldr        s9, [CO1, #4]

          vmla.f32    s8, s0, s16
          vmla.f32    s9, s0, s17

          vstr        s8, [CO1]
          vstr        s9, [CO1, #4]

          add         CO1, CO1, #8
  .endm
  562. /******************************************************************************/
  /*
   * INIT1x1 - clear the single accumulator s16 from the FP_ZERO frame slot.
   */
  .macro INIT1x1
          vldr        s16, FP_ZERO
  .endm
  /*
   * KERNEL1x1_SUB - one k-step of the 1x1 tile.
   * Loads one float from BO (s8) and one from AO (s0), accumulates their
   * product into s16, then advances both pointers by 4 bytes.
   */
  .macro KERNEL1x1_SUB
          vldr        s8, [BO]
          vldr        s0, [AO]

          vmla.f32    s16, s0, s8

          add         AO, AO, #4
          add         BO, BO, #4
  .endm
  /*
   * SAVE1x1 - write back the 1x1 tile: C += alpha * accumulator.
   * One float is updated at CO1, which is then advanced by 4 bytes.
   * Overwrites s0 and s8.
   */
  .macro SAVE1x1
          vldr        s0, ALPHA

          vldr        s8, [CO1]
          vmla.f32    s8, s0, s16
          vstr        s8, [CO1]

          add         CO1, CO1, #4
  .endm
  580. /**************************************************************************************
  581. * End of macro definitions
  582. **************************************************************************************/
  /*
   * Function entry.
   *
   * Pushes r4 - r9 and fp, points fp at the saved fp word (sp + 24) and
   * reserves STACKSIZE bytes of frame. M, N, K, A and alpha are spilled
   * to their frame slots because r0 - r2 are reused as the loop counters
   * I, J and L. s8 - s31 are saved at [fp, #-128] and restored before
   * returning. Two zero words are written for the INIT macros, and the
   * leading dimension is scaled by 4 before being stored as LDC.
   *
   * On exit from this block K1 = K, BC = B and J = N / 4. Control falls
   * through to the four-column panel loop when J > 0.
   */
  PROLOGUE
  .align 5
          push        {r4 - r9, fp}
          add         fp, sp, #24                 // fp -> saved fp, the highest pushed word
          sub         sp, sp, #STACKSIZE          // reserve stack

          str         OLD_M, M
          str         OLD_N, N
          str         OLD_K, K
          str         OLD_A, A
          vstr        OLD_ALPHA, ALPHA

          sub         r3, fp, #128
          vstm        r3, {s8 - s31}              // store floating point registers

          movs        r4, #0
          str         r4, FP_ZERO
          str         r4, FP_ZERO_1

          ldr         r3, OLD_LDC
          lsl         r3, r3, #2                  // ldc = ldc * 4
          str         r3, LDC

          ldr         K1, K
          ldr         BC, B

          ldr         J, N
          asr         J, J, #2                    // J = N / 4
          /*
           * A flag-setting shift leaves V untouched, so a ble placed right
           * after it would test whatever V the caller left behind (the
           * procedure call standard does not define the flags on entry).
           * Compare explicitly so that N, Z and V all describe J.
           */
          cmp         J, #0
          ble         sgemm_kernel_L2_BEGIN
  /*
   * Four-column panel loop, executed J = N / 4 times.
   * Each pass handles the rows of the panel in tiles of 4 (pipelined
   * kernel), then an optional tile of 2 and an optional tile of 1, and
   * finally steps BC past the K * 4 floats of B used by this panel.
   *
   * NOTE(review): the ble instructions that follow tst, ands and asrs
   * depend on V, which those instructions do not write. They behave like
   * "branch if zero or negative" only while V is clear.
   */
  sgemm_kernel_L4_BEGIN:
          ldr         CO1, C                      // CO1 = start of this panel in C
          ldr         r4, LDC
          lsl         r4, r4, #2                  // r4 = LDC * 4, the size of four columns
          add         r3, r4, CO1
          str         r3, C                       // stored C now addresses the next panel
          ldr         AO, A                       // AO = A
          pld         [AO, #A_PRE-64]
          pld         [AO, #A_PRE-32]

  /* ---- tiles of 4 rows ---- */
  sgemm_kernel_L4_M4_BEGIN:
          ldr         I, M
          asrs        I, I, #2                    // I = M / 4
          ble         sgemm_kernel_L4_M2_BEGIN

  sgemm_kernel_L4_M4_20:
          mov         BO, BC
          asrs        L, K1, #1                   // L = K / 2, each stage pair covers two k-steps
          cmp         L, #2
          blt         sgemm_kernel_L4_M4_32       // fewer than two pairs: short path

          KERNEL4x4_I
          KERNEL4x4_M2
          subs        L, L, #2
          ble         sgemm_kernel_L4_M4_22a

  .align 5
  sgemm_kernel_L4_M4_22:
          KERNEL4x4_M1
          KERNEL4x4_M2
          subs        L, L, #1
          bgt         sgemm_kernel_L4_M4_22

  sgemm_kernel_L4_M4_22a:
          KERNEL4x4_M1
          KERNEL4x4_E
          b           sgemm_kernel_L4_M4_44

  sgemm_kernel_L4_M4_32:
          tst         L, #1
          ble         sgemm_kernel_L4_M4_40       // no pair at all: just clear the tile
          KERNEL4x4_I
          KERNEL4x4_E
          b           sgemm_kernel_L4_M4_44

  sgemm_kernel_L4_M4_40:
          INIT4x4

  sgemm_kernel_L4_M4_44:
          ands        L, K1, #1                   // L = K % 2, the leftover k-step
          ble         sgemm_kernel_L4_M4_100

  sgemm_kernel_L4_M4_46:
          KERNEL4x4_SUB
          subs        L, L, #1
          bne         sgemm_kernel_L4_M4_46

  sgemm_kernel_L4_M4_100:
          SAVE4x4

  sgemm_kernel_L4_M4_END:
          subs        I, I, #1
          bne         sgemm_kernel_L4_M4_20

  /* ---- optional tile of 2 rows ---- */
  sgemm_kernel_L4_M2_BEGIN:
          ldr         I, M
          tst         I, #3                       // any rows left over after the tiles of 4?
          ble         sgemm_kernel_L4_END
          tst         I, #2                       // is bit 1 of M set?
          ble         sgemm_kernel_L4_M1_BEGIN

  sgemm_kernel_L4_M2_20:
          INIT2x4
          mov         BO, BC
          asrs        L, K1, #3                   // L = K / 8
          ble         sgemm_kernel_L4_M2_40

  sgemm_kernel_L4_M2_22:
          .rept 8                                 // eight k-steps per trip
          KERNEL2x4_SUB
          .endr
          subs        L, L, #1
          bgt         sgemm_kernel_L4_M2_22

  sgemm_kernel_L4_M2_40:
          ands        L, K1, #7                   // L = K % 8
          ble         sgemm_kernel_L4_M2_100

  sgemm_kernel_L4_M2_42:
          KERNEL2x4_SUB
          subs        L, L, #1
          bgt         sgemm_kernel_L4_M2_42

  sgemm_kernel_L4_M2_100:
          SAVE2x4

  sgemm_kernel_L4_M2_END:

  /* ---- optional tile of 1 row ---- */
  sgemm_kernel_L4_M1_BEGIN:
          tst         I, #1                       // is bit 0 of M set?
          ble         sgemm_kernel_L4_END

  sgemm_kernel_L4_M1_20:
          INIT1x4
          mov         BO, BC
          asrs        L, K1, #3                   // L = K / 8
          ble         sgemm_kernel_L4_M1_40

  sgemm_kernel_L4_M1_22:
          .rept 8                                 // eight k-steps per trip
          KERNEL1x4_SUB
          .endr
          subs        L, L, #1
          bgt         sgemm_kernel_L4_M1_22

  sgemm_kernel_L4_M1_40:
          ands        L, K1, #7                   // L = K % 8
          ble         sgemm_kernel_L4_M1_100

  sgemm_kernel_L4_M1_42:
          KERNEL1x4_SUB
          subs        L, L, #1
          bgt         sgemm_kernel_L4_M1_42

  sgemm_kernel_L4_M1_100:
          SAVE1x4

  /* ---- step to the next panel ---- */
  sgemm_kernel_L4_END:
          mov         r3, BC
          mov         r4, K1
          lsl         r4, r4, #4                  // K * 4 columns * 4 bytes
          add         r3, r3, r4                  // B = B + K * 4 * 4
          mov         BC, r3

          subs        J, J, #1                    // j--
          bgt         sgemm_kernel_L4_BEGIN
  726. /*********************************************************************************************/
  /*
   * Two-column panel, executed at most once (when bit 1 of N is set).
   * Leaves straight for the exit when N has no remainder modulo 4.
   * Rows are handled in tiles of 4, then an optional tile of 2 and an
   * optional tile of 1. Every tile runs K / 8 unrolled trips of eight
   * k-steps followed by K % 8 single k-steps. BC is stepped past the
   * K * 2 floats of B used here.
   *
   * NOTE(review): the ble instructions that follow tst, ands and asrs
   * depend on V, which those instructions do not write. They behave like
   * "branch if zero or negative" only while V is clear.
   */
  sgemm_kernel_L2_BEGIN:
          ldr         J, N
          tst         J, #3                       // any columns left over?
          ble         sgemm_kernel_L999
          tst         J, #2                       // is bit 1 of N set?
          ble         sgemm_kernel_L1_BEGIN

          ldr         CO1, C                      // CO1 = start of this panel in C
          ldr         r4, LDC
          lsl         r4, r4, #1                  // r4 = LDC * 2, the size of two columns
          add         r3, r4, CO1
          str         r3, C                       // stored C now addresses the next panel
          ldr         AO, A                       // AO = A

  /* ---- tiles of 4 rows ---- */
  sgemm_kernel_L2_M4_BEGIN:
          ldr         I, M
          asrs        I, I, #2                    // I = M / 4
          ble         sgemm_kernel_L2_M2_BEGIN

  sgemm_kernel_L2_M4_20:
          INIT4x2
          mov         BO, BC
          asrs        L, K1, #3                   // L = K / 8
          ble         sgemm_kernel_L2_M4_40

  .align 5
  sgemm_kernel_L2_M4_22:
          .rept 8                                 // eight k-steps per trip
          KERNEL4x2_SUB
          .endr
          subs        L, L, #1
          bgt         sgemm_kernel_L2_M4_22

  sgemm_kernel_L2_M4_40:
          ands        L, K1, #7                   // L = K % 8
          ble         sgemm_kernel_L2_M4_100

  sgemm_kernel_L2_M4_42:
          KERNEL4x2_SUB
          subs        L, L, #1
          bgt         sgemm_kernel_L2_M4_42

  sgemm_kernel_L2_M4_100:
          SAVE4x2

  sgemm_kernel_L2_M4_END:
          subs        I, I, #1
          bgt         sgemm_kernel_L2_M4_20

  /* ---- optional tile of 2 rows ---- */
  sgemm_kernel_L2_M2_BEGIN:
          ldr         I, M
          tst         I, #3                       // any rows left over after the tiles of 4?
          ble         sgemm_kernel_L2_END
          tst         I, #2                       // is bit 1 of M set?
          ble         sgemm_kernel_L2_M1_BEGIN

  sgemm_kernel_L2_M2_20:
          INIT2x2
          mov         BO, BC
          asrs        L, K1, #3                   // L = K / 8
          ble         sgemm_kernel_L2_M2_40

  sgemm_kernel_L2_M2_22:
          .rept 8                                 // eight k-steps per trip
          KERNEL2x2_SUB
          .endr
          subs        L, L, #1
          bgt         sgemm_kernel_L2_M2_22

  sgemm_kernel_L2_M2_40:
          ands        L, K1, #7                   // L = K % 8
          ble         sgemm_kernel_L2_M2_100

  sgemm_kernel_L2_M2_42:
          KERNEL2x2_SUB
          subs        L, L, #1
          bgt         sgemm_kernel_L2_M2_42

  sgemm_kernel_L2_M2_100:
          SAVE2x2

  sgemm_kernel_L2_M2_END:

  /* ---- optional tile of 1 row ---- */
  sgemm_kernel_L2_M1_BEGIN:
          tst         I, #1                       // is bit 0 of M set?
          ble         sgemm_kernel_L2_END

  sgemm_kernel_L2_M1_20:
          INIT1x2
          mov         BO, BC
          asrs        L, K1, #3                   // L = K / 8
          ble         sgemm_kernel_L2_M1_40

  sgemm_kernel_L2_M1_22:
          .rept 8                                 // eight k-steps per trip
          KERNEL1x2_SUB
          .endr
          subs        L, L, #1
          bgt         sgemm_kernel_L2_M1_22

  sgemm_kernel_L2_M1_40:
          ands        L, K1, #7                   // L = K % 8
          ble         sgemm_kernel_L2_M1_100

  sgemm_kernel_L2_M1_42:
          KERNEL1x2_SUB
          subs        L, L, #1
          bgt         sgemm_kernel_L2_M1_42

  sgemm_kernel_L2_M1_100:
          SAVE1x2

  /* ---- step past this panel of B ---- */
  sgemm_kernel_L2_END:
          mov         r3, BC
          mov         r4, K1
          lsl         r4, r4, #3                  // K * 2 columns * 4 bytes
          add         r3, r3, r4                  // B = B + K * 2 * 4
          mov         BC, r3
  841. /*********************************************************************************************/
  /*
   * One-column panel, executed at most once (when bit 0 of N is set).
   * Rows are handled in tiles of 4, then an optional tile of 2 and an
   * optional tile of 1. Every tile runs K / 8 unrolled trips of eight
   * k-steps followed by K % 8 single k-steps.
   *
   * NOTE(review): the ble instructions that follow tst, ands and asrs
   * depend on V, which those instructions do not write. They behave like
   * "branch if zero or negative" only while V is clear.
   */
  sgemm_kernel_L1_BEGIN:
          ldr         J, N
          tst         J, #1                       // is bit 0 of N set?
          ble         sgemm_kernel_L999

          ldr         CO1, C                      // CO1 = start of this column in C
          ldr         r4, LDC
          add         r3, r4, CO1
          str         r3, C                       // stored C now addresses the next column
          ldr         AO, A                       // AO = A

  /* ---- tiles of 4 rows ---- */
  sgemm_kernel_L1_M4_BEGIN:
          ldr         I, M
          asrs        I, I, #2                    // I = M / 4
          ble         sgemm_kernel_L1_M2_BEGIN

  sgemm_kernel_L1_M4_20:
          INIT4x1
          mov         BO, BC
          asrs        L, K1, #3                   // L = K / 8
          ble         sgemm_kernel_L1_M4_40

  .align 5
  sgemm_kernel_L1_M4_22:
          .rept 8                                 // eight k-steps per trip
          KERNEL4x1_SUB
          .endr
          subs        L, L, #1
          bgt         sgemm_kernel_L1_M4_22

  sgemm_kernel_L1_M4_40:
          ands        L, K1, #7                   // L = K % 8
          ble         sgemm_kernel_L1_M4_100

  sgemm_kernel_L1_M4_42:
          KERNEL4x1_SUB
          subs        L, L, #1
          bgt         sgemm_kernel_L1_M4_42

  sgemm_kernel_L1_M4_100:
          SAVE4x1

  sgemm_kernel_L1_M4_END:
          subs        I, I, #1
          bgt         sgemm_kernel_L1_M4_20

  /* ---- optional tile of 2 rows ---- */
  sgemm_kernel_L1_M2_BEGIN:
          ldr         I, M
          tst         I, #3                       // any rows left over after the tiles of 4?
          ble         sgemm_kernel_L1_END
          tst         I, #2                       // is bit 1 of M set?
          ble         sgemm_kernel_L1_M1_BEGIN

  sgemm_kernel_L1_M2_20:
          INIT2x1
          mov         BO, BC
          asrs        L, K1, #3                   // L = K / 8
          ble         sgemm_kernel_L1_M2_40

  sgemm_kernel_L1_M2_22:
          .rept 8                                 // eight k-steps per trip
          KERNEL2x1_SUB
          .endr
          subs        L, L, #1
          bgt         sgemm_kernel_L1_M2_22

  sgemm_kernel_L1_M2_40:
          ands        L, K1, #7                   // L = K % 8
          ble         sgemm_kernel_L1_M2_100

  sgemm_kernel_L1_M2_42:
          KERNEL2x1_SUB
          subs        L, L, #1
          bgt         sgemm_kernel_L1_M2_42

  sgemm_kernel_L1_M2_100:
          SAVE2x1

  sgemm_kernel_L1_M2_END:

  /* ---- optional tile of 1 row ---- */
  sgemm_kernel_L1_M1_BEGIN:
          tst         I, #1                       // is bit 0 of M set?
          ble         sgemm_kernel_L1_END

  sgemm_kernel_L1_M1_20:
          INIT1x1
          mov         BO, BC
          asrs        L, K1, #3                   // L = K / 8
          ble         sgemm_kernel_L1_M1_40

  sgemm_kernel_L1_M1_22:
          .rept 8                                 // eight k-steps per trip
          KERNEL1x1_SUB
          .endr
          subs        L, L, #1
          bgt         sgemm_kernel_L1_M1_22

  sgemm_kernel_L1_M1_40:
          ands        L, K1, #7                   // L = K % 8
          ble         sgemm_kernel_L1_M1_100

  sgemm_kernel_L1_M1_42:
          KERNEL1x1_SUB
          subs        L, L, #1
          bgt         sgemm_kernel_L1_M1_42

  sgemm_kernel_L1_M1_100:
          SAVE1x1

  sgemm_kernel_L1_END:
  /*
   * Common exit: reload s8 - s31 from the save area at [fp, #-128],
   * return 0 in r0, release the frame and pop the core registers that
   * were pushed on entry.
   */
  sgemm_kernel_L999:
          sub         r3, fp, #128
          vldmia      r3, {s8 - s31}              // restore floating point registers

          movs        r0, #0                      // set return value

          sub         sp, fp, #24                 // sp -> lowest pushed register
          pop         {r4 - r9, fp}
          bx          lr

  EPILOGUE