You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemv_n_vfp.S 12 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/28 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. **************************************************************************************/
  34. #define ASSEMBLER
  35. #include "common.h"
  36. #define STACKSIZE 256 /* bytes subtracted from sp in the prologue for the local frame */
  37. #define OLD_LDA [fp, #0 ] /* stack argument slots; the prologue sets fp to the value sp had on entry */
  38. #define X [fp, #4 ] /* loaded into XO at the start of every output block */
  39. #define OLD_INC_X [fp, #8 ] /* loaded into INC_X */
  40. #define Y [fp, #12 ] /* loaded into YO once per path */
  41. #define OLD_INC_Y [fp, #16 ] /* loaded into INC_Y */
  42. #define OLD_A r3 /* incoming matrix pointer; spilled to the A slot, after which r3 is used as scratch */
  43. #define OLD_M r0 /* incoming M; spilled to the M slot because r0 is reused as AO1 */
  44. #define AO1 r0 /* address the kernels load matrix elements from */
  45. #define N r1 /* number of x elements consumed per output block (inner trip count) */
  46. #define J r2 /* inner loop counter */
  47. #define AO2 r4 /* address used only by pld prefetches; advanced in step with AO1 */
  48. #define XO r5 /* running pointer into x */
  49. #define YO r6 /* running pointer into y */
  50. #define LDA r7 /* leading dimension, shifted to a byte stride in the function body */
  51. #define INC_X r8 /* x increment; shifted to bytes on the strided path only */
  52. #define INC_Y r9 /* y increment; shifted to bytes on the strided path only */
  53. #define I r12 /* outer loop counter (r12 is also scratch in the prologue) */
  54. #define FP_ZERO [fp, #-228] /* word zeroed in the prologue; the INIT_* macros load 0.0 from here */
  55. #define FP_ZERO_0 [fp, #-228] /* same slot as FP_ZERO */
  56. #define FP_ZERO_1 [fp, #-224] /* second zeroed word, so a 64-bit load from FP_ZERO also reads 0.0 */
  57. #define M [fp, #-252 ] /* local slot holding M */
  58. #define A [fp, #-256 ] /* local slot holding the start address of the next output block in the matrix */
  59. #define X_PRE 64 /* byte offset applied to XO in pld */
  60. #define Y_PRE 0 /* byte offset applied to YO in pld */
  61. #define A_PRE 0 /* byte offset applied to AO2 in pld */
  62. /**************************************************************************************
  63. * Macro definitions
  64. **************************************************************************************/
  65. #if defined(DOUBLE)
  66. .macro INIT_F8 // double precision: clear the eight accumulators d8-d15
  67. pld [ YO , #Y_PRE ] // prefetch the y data that SAVE_F8 will read and write
  68. pld [ YO , #Y_PRE+32 ]
  69. fldd d8 , FP_ZERO // load 0.0 from the stack slot zeroed in the prologue
  70. vmov.f64 d9 , d8 // replicate the zero into the remaining accumulators
  71. vmov.f64 d10 , d8
  72. vmov.f64 d11 , d8
  73. vmov.f64 d12 , d8
  74. vmov.f64 d13 , d8
  75. vmov.f64 d14 , d8
  76. vmov.f64 d15 , d8
  77. .endm
  78. .macro KERNEL_F8X8 // double precision: eight KERNEL_F8X1 steps, i.e. eight x elements per expansion
  79. pld [ XO , #X_PRE ] // prefetch x ahead of the first group of four steps
  80. KERNEL_F8X1
  81. KERNEL_F8X1
  82. KERNEL_F8X1
  83. KERNEL_F8X1
  84. pld [ XO , #X_PRE ] // XO has advanced by four elements, so this touches the next 32 bytes
  85. KERNEL_F8X1
  86. KERNEL_F8X1
  87. KERNEL_F8X1
  88. KERNEL_F8X1
  89. .endm
  90. .macro KERNEL_F8X1 // double precision: d8-d15 += x[k] * (8 consecutive matrix elements at AO1); clobbers d2, d4-d7, r3
  91. pld [ AO2 , #A_PRE ] // AO2 is only a prefetch address
  92. fldmiad XO! , { d2 } // d2 = next x element, XO post-incremented by one element
  93. fldmiad AO1 , { d4 - d7 } // first four matrix elements (no writeback)
  94. vmla.f64 d8 , d2 , d4
  95. pld [ AO2 , #4*SIZE ] // SIZE comes from outside this file (presumably the element size in bytes)
  96. vmla.f64 d9 , d2 , d5
  97. add r3, AO1, #4*SIZE // r3 = address of the second group of four elements
  98. vmla.f64 d10 , d2 , d6
  99. vmla.f64 d11 , d2 , d7
  100. fldmiad r3 , { d4 - d7 } // d4-d7 reused for elements 4..7
  101. vmla.f64 d12 , d2 , d4
  102. vmla.f64 d13 , d2 , d5
  103. add AO1, AO1, LDA // step the matrix pointer by the byte stride LDA
  104. vmla.f64 d14 , d2 , d6
  105. add AO2, AO2, LDA // keep the prefetch address in step with AO1
  106. vmla.f64 d15 , d2 , d7
  107. .endm
  108. .macro SAVE_F8 // double precision: y[0..7] += d0 * (d8..d15), advancing YO by eight elements
  109. fldmiad YO, { d4 - d7 } // load y[0..3] without writeback
  110. vmla.f64 d4 , d0, d8 // d0 is never written in this file; presumably the caller-supplied scalar alpha - confirm
  111. vmla.f64 d5 , d0, d9
  112. vmla.f64 d6 , d0, d10
  113. vmla.f64 d7 , d0, d11
  114. fstmiad YO!, { d4 - d7 } // store y[0..3]; writeback moves YO to y[4]
  115. fldmiad YO, { d4 - d7 } // load y[4..7]
  116. vmla.f64 d4 , d0, d12
  117. vmla.f64 d5 , d0, d13
  118. vmla.f64 d6 , d0, d14
  119. vmla.f64 d7 , d0, d15
  120. fstmiad YO!, { d4 - d7 } // store y[4..7]; YO now points past the block
  121. .endm
  122. .macro INIT_F1 // double precision: clear the single accumulator d12
  123. fldd d12 , FP_ZERO // 0.0 from the stack slot zeroed in the prologue
  124. .endm
  125. .macro KERNEL_F1X1 // double precision: d12 += x[k] * (one matrix element at AO1); clobbers d2, d8
  126. fldmiad XO! , { d2 } // d2 = next x element, XO post-incremented
  127. fldmiad AO1 , { d8 } // matrix element, no writeback
  128. vmla.f64 d12 , d2 , d8
  129. add AO1, AO1, LDA // step the matrix pointer by the byte stride LDA
  130. .endm
  131. .macro SAVE_F1 // double precision: y[0] += d0 * d12, advancing YO by one element
  132. fldmiad YO, { d4 }
  133. vmla.f64 d4, d0, d12 // d0 is never written in this file; presumably the caller-supplied scalar alpha - confirm
  134. fstmiad YO!, { d4 } // writeback moves YO to the next y element
  135. .endm
  136. /*********************************************************************************************/
  137. .macro INIT_S4 // double precision, strided path: clear the four accumulators d12-d15
  138. fldd d12 , FP_ZERO // 0.0 from the stack slot zeroed in the prologue
  139. vmov.f64 d13 , d12
  140. vmov.f64 d14 , d12
  141. vmov.f64 d15 , d12
  142. .endm
  143. .macro KERNEL_S4X4 // double precision, strided path: four KERNEL_S4X1 steps (four x elements)
  144. KERNEL_S4X1
  145. KERNEL_S4X1
  146. KERNEL_S4X1
  147. KERNEL_S4X1
  148. .endm
  149. .macro KERNEL_S4X1 // double precision, strided path: d12-d15 += x[k] * (4 consecutive matrix elements at AO1); clobbers d2, d8-d11
  150. pld [ AO2 , #A_PRE ] // AO2 is only a prefetch address
  151. fldmiad XO , { d2 } // d2 = current x element; no writeback because x is strided
  152. fldmiad AO1 , { d8 - d11 }
  153. vmla.f64 d12 , d2 , d8
  154. add AO1, AO1, LDA // step the matrix pointer by the byte stride LDA
  155. vmla.f64 d13 , d2 , d9
  156. add AO2, AO2, LDA // keep the prefetch address in step with AO1
  157. vmla.f64 d14 , d2 , d10
  158. vmla.f64 d15 , d2 , d11
  159. add XO, XO , INC_X // INC_X has been scaled to bytes before this path runs
  160. .endm
  161. .macro SAVE_S4 // double precision, strided path: four y elements += d0 * (d12..d15), stepping YO by INC_Y bytes each
  162. fldmiad YO, { d4 }
  163. vmla.f64 d4 , d0, d12 // d0 is never written in this file; presumably the caller-supplied scalar alpha - confirm
  164. fstmiad YO, { d4 } // no writeback: the pointer is advanced explicitly below
  165. add YO, YO, INC_Y
  166. fldmiad YO, { d5 } // d4 and d5 alternate as the temporary
  167. vmla.f64 d5 , d0, d13
  168. fstmiad YO, { d5 }
  169. add YO, YO, INC_Y
  170. fldmiad YO, { d4 }
  171. vmla.f64 d4 , d0, d14
  172. fstmiad YO, { d4 }
  173. add YO, YO, INC_Y
  174. fldmiad YO, { d5 }
  175. vmla.f64 d5 , d0, d15
  176. fstmiad YO, { d5 }
  177. add YO, YO, INC_Y // YO now addresses the first y element of the next block
  178. .endm
  179. .macro INIT_S1 // double precision, strided path: clear the single accumulator d12
  180. fldd d12 , FP_ZERO // 0.0 from the stack slot zeroed in the prologue
  181. .endm
  182. .macro KERNEL_S1X1 // double precision, strided path: d12 += x[k] * (one matrix element at AO1); clobbers d2, d8
  183. fldmiad XO , { d2 } // no writeback because x is strided
  184. fldmiad AO1 , { d8 }
  185. vmla.f64 d12 , d2 , d8
  186. add AO1, AO1, LDA // step the matrix pointer by the byte stride LDA
  187. add XO, XO , INC_X // INC_X has been scaled to bytes before this path runs
  188. .endm
  189. .macro SAVE_S1 // double precision, strided path: one y element += d0 * d12, then YO += INC_Y bytes
  190. fldmiad YO, { d4 }
  191. vmla.f64 d4, d0, d12 // d0 is never written in this file; presumably the caller-supplied scalar alpha - confirm
  192. fstmiad YO , { d4 } // no writeback: the pointer is advanced explicitly below
  193. add YO, YO, INC_Y
  194. .endm
  195. #else /************************* SINGLE PRECISION *****************************************/
  196. .macro INIT_F8 // single precision: clear the eight accumulators s8-s15
  197. pld [ YO , #Y_PRE ] // prefetch the y data that SAVE_F8 will read and write
  198. flds s8 , FP_ZERO // load 0.0f from the stack slot zeroed in the prologue
  199. vmov.f32 s9 , s8 // replicate the zero into the remaining accumulators
  200. vmov.f32 s10 , s8
  201. vmov.f32 s11 , s8
  202. vmov.f32 s12 , s8
  203. vmov.f32 s13 , s8
  204. vmov.f32 s14 , s8
  205. vmov.f32 s15 , s8
  206. .endm
  207. .macro KERNEL_F8X8 // single precision: eight KERNEL_F8X1 steps, i.e. eight x elements per expansion
  208. pld [ XO , #X_PRE ] // one x prefetch per expansion (the double precision variant issues two)
  209. KERNEL_F8X1
  210. KERNEL_F8X1
  211. KERNEL_F8X1
  212. KERNEL_F8X1
  213. KERNEL_F8X1
  214. KERNEL_F8X1
  215. KERNEL_F8X1
  216. KERNEL_F8X1
  217. .endm
  218. .macro KERNEL_F8X1 // single precision: s8-s15 += x[k] * (8 consecutive matrix elements at AO1); clobbers s2, s4-s7, r3
  219. pld [ AO2, #A_PRE ] // AO2 is only a prefetch address
  220. fldmias XO! , { s2 } // s2 = next x element, XO post-incremented by one element
  221. fldmias AO1 , { s4 - s7 } // first four matrix elements (no writeback)
  222. vmla.f32 s8 , s2 , s4
  223. vmla.f32 s9 , s2 , s5
  224. vmla.f32 s10 , s2 , s6
  225. vmla.f32 s11 , s2 , s7
  226. add r3, AO1, #4*SIZE // r3 = address of the second group of four; SIZE is defined outside this file
  227. fldmias r3 , { s4 - s7 } // s4-s7 reused for elements 4..7
  228. vmla.f32 s12 , s2 , s4
  229. vmla.f32 s13 , s2 , s5
  230. vmla.f32 s14 , s2 , s6
  231. vmla.f32 s15 , s2 , s7
  232. add AO1, AO1, LDA // step the matrix pointer by the byte stride LDA
  233. add AO2, AO2, LDA // keep the prefetch address in step with AO1
  234. .endm
  235. .macro SAVE_F8 // single precision: y[0..7] += s0 * (s8..s15), advancing YO by eight elements
  236. fldmias YO, { s4 - s7 } // load y[0..3] without writeback
  237. vmla.f32 s4 , s0, s8 // s0 is never written in this file; presumably the caller-supplied scalar alpha - confirm
  238. vmla.f32 s5 , s0, s9
  239. vmla.f32 s6 , s0, s10
  240. vmla.f32 s7 , s0, s11
  241. fstmias YO!, { s4 - s7 } // store y[0..3]; writeback moves YO to y[4]
  242. fldmias YO, { s4 - s7 } // load y[4..7]
  243. vmla.f32 s4 , s0, s12
  244. vmla.f32 s5 , s0, s13
  245. vmla.f32 s6 , s0, s14
  246. vmla.f32 s7 , s0, s15
  247. fstmias YO!, { s4 - s7 } // store y[4..7]; YO now points past the block
  248. .endm
  249. .macro INIT_F1 // single precision: clear the single accumulator s12
  250. flds s12 , FP_ZERO // 0.0f from the stack slot zeroed in the prologue
  251. .endm
  252. .macro KERNEL_F1X1 // single precision: s12 += x[k] * (one matrix element at AO1); clobbers s2, s8
  253. fldmias XO! , { s2 } // s2 = next x element, XO post-incremented
  254. fldmias AO1 , { s8 } // matrix element, no writeback
  255. vmla.f32 s12 , s2 , s8
  256. add AO1, AO1, LDA // step the matrix pointer by the byte stride LDA
  257. .endm
  258. .macro SAVE_F1 // single precision: y[0] += s0 * s12, advancing YO by one element
  259. fldmias YO, { s4 }
  260. vmla.f32 s4, s0, s12 // s0 is never written in this file; presumably the caller-supplied scalar alpha - confirm
  261. fstmias YO!, { s4 } // writeback moves YO to the next y element
  262. .endm
  263. /*********************************************************************************************/
  264. .macro INIT_S4 // single precision, strided path: clear the four accumulators s12-s15
  265. flds s12 , FP_ZERO // 0.0f from the stack slot zeroed in the prologue
  266. vmov.f32 s13 , s12
  267. vmov.f32 s14 , s12
  268. vmov.f32 s15 , s12
  269. .endm
  270. .macro KERNEL_S4X4 // single precision, strided path: four KERNEL_S4X1 steps with a prefetch before each pair
  271. pld [ AO2 , #A_PRE ] // the prefetch lives here because the single precision KERNEL_S4X1 has none
  272. KERNEL_S4X1
  273. KERNEL_S4X1
  274. pld [ AO2 , #A_PRE ] // AO2 has advanced by 2*LDA at this point
  275. KERNEL_S4X1
  276. KERNEL_S4X1
  277. .endm
  278. .macro KERNEL_S4X1 // single precision, strided path: s12-s15 += x[k] * (4 consecutive matrix elements at AO1); clobbers s2, s8-s11
  279. fldmias XO , { s2 } // s2 = current x element; no writeback because x is strided
  280. fldmias AO1 , { s8 - s11 }
  281. vmla.f32 s12 , s2 , s8
  282. vmla.f32 s13 , s2 , s9
  283. vmla.f32 s14 , s2 , s10
  284. vmla.f32 s15 , s2 , s11
  285. add AO1, AO1, LDA // step the matrix pointer by the byte stride LDA
  286. add AO2, AO2, LDA // keep the prefetch address in step with AO1
  287. add XO, XO , INC_X // INC_X has been scaled to bytes before this path runs
  288. .endm
  289. .macro SAVE_S4 // single precision, strided path: four y elements += s0 * (s12..s15), stepping YO by INC_Y bytes each
  290. fldmias YO, { s4 }
  291. vmla.f32 s4 , s0, s12 // s0 is never written in this file; presumably the caller-supplied scalar alpha - confirm
  292. fstmias YO, { s4 } // no writeback: the pointer is advanced explicitly below
  293. add YO, YO, INC_Y
  294. fldmias YO, { s5 } // s4 and s5 alternate as the temporary
  295. vmla.f32 s5 , s0, s13
  296. fstmias YO, { s5 }
  297. add YO, YO, INC_Y
  298. fldmias YO, { s4 }
  299. vmla.f32 s4 , s0, s14
  300. fstmias YO, { s4 }
  301. add YO, YO, INC_Y
  302. fldmias YO, { s5 }
  303. vmla.f32 s5 , s0, s15
  304. fstmias YO, { s5 }
  305. add YO, YO, INC_Y // YO now addresses the first y element of the next block
  306. .endm
  307. .macro INIT_S1 // single precision, strided path: clear the single accumulator s12
  308. flds s12 , FP_ZERO // 0.0f from the stack slot zeroed in the prologue
  309. .endm
  310. .macro KERNEL_S1X1 // single precision, strided path: s12 += x[k] * (one matrix element at AO1); clobbers s2, s8
  311. fldmias XO , { s2 } // no writeback because x is strided
  312. fldmias AO1 , { s8 }
  313. vmla.f32 s12 , s2 , s8
  314. add AO1, AO1, LDA // step the matrix pointer by the byte stride LDA
  315. add XO, XO , INC_X // INC_X has been scaled to bytes before this path runs
  316. .endm
  317. .macro SAVE_S1 // single precision, strided path: one y element += s0 * s12, then YO += INC_Y bytes
  318. fldmias YO, { s4 }
  319. vmla.f32 s4, s0, s12 // s0 is never written in this file; presumably the caller-supplied scalar alpha - confirm
  320. fstmias YO , { s4 } // no writeback: the pointer is advanced explicitly below
  321. add YO, YO, INC_Y
  322. .endm
  323. #endif
  324. /**************************************************************************************
  325. * End of macro definitions
  326. **************************************************************************************/
  327. PROLOGUE // kernel entry: y += (d0 or s0) * A * x, as built from the macros above. PROLOGUE and EPILOGUE are defined outside this file
  328. .align 5
  329. push {r4 - r9 , fp} // seven registers, 28 bytes
  330. add fp, sp, #28 // fp = value of sp on entry, so [fp, #0] is the first stack argument
  331. sub sp, sp, #STACKSIZE // reserve stack
  332. sub r12, fp, #192 // r12 = base of the floating point save area
  333. #if defined(DOUBLE)
  334. vstm r12, { d8 - d15 } // store floating point registers
  335. #else
  336. vstm r12, { s8 - s15 } // store floating point registers. NOTE(review): AAPCS lists s16-s31 as the callee-saved VFP registers, so this save looks redundant - confirm
  337. #endif
  338. movs r12, #0
  339. str r12, FP_ZERO // two zero words on the stack form the 0.0 constant the INIT_* macros load
  340. str r12, FP_ZERO_1
  341. cmp OLD_M, #0 // nothing to do when M <= 0
  342. ble gemvn_kernel_L999
  343. cmp N, #0 // nothing to do when N <= 0
  344. ble gemvn_kernel_L999
  345. str OLD_A, A // spill the matrix pointer; r3 becomes scratch
  346. str OLD_M, M // spill M; r0 becomes AO1
  347. ldr INC_X , OLD_INC_X
  348. ldr INC_Y , OLD_INC_Y
  349. cmp INC_X, #0 // a zero increment on either vector returns without touching y
  350. beq gemvn_kernel_L999
  351. cmp INC_Y, #0
  352. beq gemvn_kernel_L999
  353. ldr LDA, OLD_LDA
  354. #if defined(DOUBLE)
  355. lsl LDA, LDA, #3 // LDA * SIZE
  356. #else
  357. lsl LDA, LDA, #2 // LDA * SIZE
  358. #endif
  359. cmp INC_X, #1 // the unit-stride path needs INC_X == 1 and INC_Y == 1
  360. bne gemvn_kernel_S4_BEGIN
  361. cmp INC_Y, #1
  362. bne gemvn_kernel_S4_BEGIN
  363. gemvn_kernel_F4_BEGIN: // unit-stride path; the labels say F4 but each block covers 8 y elements
  364. ldr YO , Y
  365. ldr I, M
  366. asrs I, I, #3 // I = M / 8
  367. ble gemvn_kernel_F1_BEGIN // NOTE(review): ASRS and ANDS leave V unchanged, so every shift/and + BLE pair in this function relies on V being clear from earlier code; BEQ would state the zero test directly
  368. gemvn_kernel_F4X4: // outer loop: one block of 8 y elements per iteration
  369. ldr AO1, A
  370. add AO2, AO1, LDA
  371. add r3 , AO1, #8*SIZE
  372. str r3 , A // the A slot now addresses the next block of 8
  373. add AO2, AO2, LDA
  374. add AO2, AO2, LDA // AO2 = AO1 + 3*LDA, used only for prefetching
  375. ldr XO , X // restart at the beginning of x for every block
  376. INIT_F8
  377. asrs J, N, #3 // J = N / 8
  378. ble gemvn_kernel_F4X1
  379. gemvn_kernel_F4X4_10: // inner loop, unrolled: 8 x elements per pass
  380. KERNEL_F8X8
  381. subs J, J, #1
  382. bne gemvn_kernel_F4X4_10
  383. gemvn_kernel_F4X1:
  384. ands J, N , #7 // J = N % 8 leftover x elements
  385. ble gemvn_kernel_F4_END
  386. gemvn_kernel_F4X1_10: // inner loop, remainder: one x element per pass
  387. KERNEL_F8X1
  388. subs J, J, #1
  389. bne gemvn_kernel_F4X1_10
  390. gemvn_kernel_F4_END:
  391. SAVE_F8 // update 8 y elements and advance YO
  392. subs I , I , #1
  393. bne gemvn_kernel_F4X4
  394. gemvn_kernel_F1_BEGIN: // unit-stride tail: the remaining M % 8 y elements, one at a time
  395. ldr I, M
  396. ands I, I , #7
  397. ble gemvn_kernel_L999
  398. gemvn_kernel_F1X1:
  399. ldr AO1, A
  400. add r3, AO1, #SIZE
  401. str r3, A // the A slot now addresses the next single element
  402. ldr XO , X
  403. INIT_F1
  404. mov J, N // N > 0 was checked above, so the loop below runs at least once
  405. gemvn_kernel_F1X1_10:
  406. KERNEL_F1X1
  407. subs J, J, #1
  408. bne gemvn_kernel_F1X1_10
  409. gemvn_kernel_F1_END:
  410. SAVE_F1
  411. subs I , I , #1
  412. bne gemvn_kernel_F1X1
  413. b gemvn_kernel_L999 // skip the strided path
  414. /*************************************************************************************************************/
  415. gemvn_kernel_S4_BEGIN: // strided path: taken when INC_X != 1 or INC_Y != 1
  416. #if defined(DOUBLE)
  417. lsl INC_X, INC_X, #3 // INC_X * SIZE
  418. lsl INC_Y, INC_Y, #3 // INC_Y * SIZE
  419. #else
  420. lsl INC_X, INC_X, #2 // INC_X * SIZE
  421. lsl INC_Y, INC_Y, #2 // INC_Y * SIZE
  422. #endif
  423. ldr YO , Y
  424. ldr I, M
  425. asrs I, I, #2 // I = M / 4
  426. ble gemvn_kernel_S1_BEGIN
  427. gemvn_kernel_S4X4: // outer loop: one block of 4 y elements per iteration
  428. ldr AO1, A
  429. add AO2, AO1, LDA // AO2 = AO1 + LDA, used only for prefetching
  430. add r3 , AO1, #4*SIZE
  431. str r3 , A // the A slot now addresses the next block of 4
  432. ldr XO , X // restart at the beginning of x for every block
  433. INIT_S4
  434. asrs J, N, #2 // J = N / 4
  435. ble gemvn_kernel_S4X1
  436. gemvn_kernel_S4X4_10: // inner loop, unrolled: 4 x elements per pass
  437. KERNEL_S4X4
  438. subs J, J, #1
  439. bne gemvn_kernel_S4X4_10
  440. gemvn_kernel_S4X1:
  441. ands J, N , #3 // J = N % 4 leftover x elements
  442. ble gemvn_kernel_S4_END
  443. gemvn_kernel_S4X1_10: // inner loop, remainder: one x element per pass
  444. KERNEL_S4X1
  445. subs J, J, #1
  446. bne gemvn_kernel_S4X1_10
  447. gemvn_kernel_S4_END:
  448. SAVE_S4 // update 4 y elements, stepping YO by INC_Y each
  449. subs I , I , #1
  450. bne gemvn_kernel_S4X4
  451. gemvn_kernel_S1_BEGIN: // strided tail: the remaining M % 4 y elements, one at a time
  452. ldr I, M
  453. ands I, I , #3
  454. ble gemvn_kernel_L999
  455. gemvn_kernel_S1X1:
  456. ldr AO1, A
  457. add r3, AO1, #SIZE
  458. str r3, A // the A slot now addresses the next single element
  459. ldr XO , X
  460. INIT_S1
  461. mov J, N // N > 0 was checked above, so the loop below runs at least once
  462. gemvn_kernel_S1X1_10:
  463. KERNEL_S1X1
  464. subs J, J, #1
  465. bne gemvn_kernel_S1X1_10
  466. gemvn_kernel_S1_END:
  467. SAVE_S1
  468. subs I , I , #1
  469. bne gemvn_kernel_S1X1
  470. /*************************************************************************************************************/
  471. gemvn_kernel_L999: // common exit, also reached by every early-out above
  472. sub r3, fp, #192 // same save-area address the prologue used
  473. #if defined(DOUBLE)
  474. vldm r3, { d8 - d15 } // restore floating point registers
  475. #else
  476. vldm r3, { s8 - s15 } // restore floating point registers
  477. #endif
  478. mov r0, #0 // set return value
  479. sub sp, fp, #28 // drop the local frame; sp points at the seven pushed registers again
  480. pop {r4 -r9 ,fp}
  481. bx lr
  482. EPILOGUE