You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemv_t_vfp.S 11 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/29 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. **************************************************************************************/
  34. #define ASSEMBLER
  35. #include "common.h"
  /* Size in bytes of the scratch frame reserved below the saved core registers. */
  #define STACKSIZE       256

  /* Values that are read from core registers on entry. */
  #define M               r0
  #define OLD_N           r1
  #define OLD_A           r3

  /*
   * Values read from the caller stack.  The prologue pushes seven core
   * registers (28 bytes) and then sets fp = sp + 28, so fp holds the value
   * sp had on entry and [fp, #0] is the first stack word.
   */
  #define OLD_LDA         [fp, #0 ]
  #define X               [fp, #4 ]
  #define OLD_INC_X       [fp, #8 ]
  #define Y               [fp, #12 ]
  #define OLD_INC_Y       [fp, #16 ]

  /*
   * Working registers.  AO1 shares r1 with OLD_N, which is why the prologue
   * spills the incoming value to the N slot before AO1 is first written.
   * I shares r12 with the scratch register used by the prologue.
   */
  #define AO1             r1
  #define J               r2
  #define AO2             r4
  #define XO              r5
  #define YO              r6
  #define LDA             r7
  #define INC_X           r8
  #define INC_Y           r9
  #define I               r12

  /*
   * Slots inside the local frame.  FP_ZERO and FP_ZERO_0 name the same word;
   * together with FP_ZERO_1 they form the 8-byte zero loaded by the INIT_*
   * macros.  N and A hold the spilled column count and current column pointer.
   */
  #define FP_ZERO         [fp, #-228]
  #define FP_ZERO_0       [fp, #-228]
  #define FP_ZERO_1       [fp, #-224]
  #define N               [fp, #-252 ]
  #define A               [fp, #-256 ]

  /*
   * Not referenced by any code visible in this file; presumably prefetch
   * distances in bytes -- confirm before removing.
   */
  #define X_PRE           512
  #define A_PRE           512
  #define Y_PRE           32
  62. /**************************************************************************************
  63. * Macro definitions
  64. **************************************************************************************/
  /*
   * Sign selection for the complex multiply-accumulate steps.
   *
   *   KMAC_R / KMAC_I   used by the KERNEL_* macros for the products that
   *                     involve the second double of each matrix element
   *                     (presumably its imaginary part).
   *   FMAC_R1 / FMAC_R2 used by the SAVE_* macros to update the first double
   *                     of a y element from the accumulators and d0 / d1.
   *   FMAC_I1 / FMAC_I2 same, for the second double of a y element.
   *
   * vmla.f64 computes Dd += Dn * Dm and vmls.f64 computes Dd -= Dn * Dm.
   * They are the unified-syntax spellings of the legacy fmacd / fnmacd
   * mnemonics and assemble to the same encodings; the unified names are
   * used because not every assembler accepts the legacy ones.
   *
   * CONJ and XCONJ are supplied by the build; this file does not show where.
   */
  #if !defined(CONJ) && !defined(XCONJ)

  #define KMAC_R          vmls.f64
  #define KMAC_I          vmla.f64

  #define FMAC_R1         vmla.f64
  #define FMAC_R2         vmls.f64
  #define FMAC_I1         vmla.f64
  #define FMAC_I2         vmla.f64

  #elif defined(CONJ) && !defined(XCONJ)

  #define KMAC_R          vmla.f64
  #define KMAC_I          vmls.f64

  #define FMAC_R1         vmla.f64
  #define FMAC_R2         vmls.f64
  #define FMAC_I1         vmla.f64
  #define FMAC_I2         vmla.f64

  #elif !defined(CONJ) && defined(XCONJ)

  #define KMAC_R          vmla.f64
  #define KMAC_I          vmls.f64

  #define FMAC_R1         vmla.f64
  #define FMAC_R2         vmla.f64
  #define FMAC_I1         vmls.f64
  #define FMAC_I2         vmla.f64

  #else

  #define KMAC_R          vmls.f64
  #define KMAC_I          vmla.f64

  #define FMAC_R1         vmla.f64
  #define FMAC_R2         vmla.f64
  #define FMAC_I1         vmls.f64
  #define FMAC_I2         vmla.f64

  #endif
  /*
   * INIT_F2: clear the four accumulators d12-d15 used by the two-column
   * kernels.  The zero comes from the 8-byte FP_ZERO stack slot that the
   * function prologue fills with two zero words.
   * Uses the unified-syntax vldr (same encoding as the legacy fldd) to match
   * the vmov.f64 instructions already used here.
   * Clobbers: d12-d15.
   */
  .macro INIT_F2

          vldr            d12, FP_ZERO
          vmov.f64        d13, d12
          vmov.f64        d14, d12
          vmov.f64        d15, d12

  .endm
  /*
   * KERNEL_F2X4: four back-to-back expansions of KERNEL_F2X1, i.e. four
   * consecutive rows of the two-column, unit-stride inner loop.
   */
  .macro KERNEL_F2X4

          .rept   4
          KERNEL_F2X1
          .endr

  .endm
  /*
   * KERNEL_F2X1: one row of the two-column, unit-stride inner loop.
   *
   * Loads one 16-byte element from XO (d2, d3), one from AO1 (d4, d5) and one
   * from AO2 (d8, d9); all three pointers are post-incremented by the load.
   * Column 1 accumulates into d12 / d13 and column 2 into d14 / d15:
   *     d12 += d4*d2  then  KMAC_R d12, d5, d3
   *     d13 += d4*d3  then  KMAC_I d13, d5, d2
   * and likewise for d14 / d15 with d8 / d9.  KMAC_R / KMAC_I pick add or
   * subtract depending on CONJ / XCONJ.
   *
   * The legacy fldmiad / fmacd mnemonics are written in unified syntax
   * (vldmia / vmla.f64); the encodings are unchanged.
   * Clobbers: d2-d5, d8, d9, d12-d15, XO, AO1, AO2.
   */
  .macro KERNEL_F2X1

          vldmia          XO!,  { d2 - d3 }
          vldmia          AO1!, { d4 - d5 }

          vmla.f64        d12, d4, d2
          vmla.f64        d13, d4, d3
          vldmia          AO2!, { d8 - d9 }   // second column is loaded while the first is in flight
          KMAC_R          d12, d5, d3
          KMAC_I          d13, d5, d2

          vmla.f64        d14, d8, d2
          vmla.f64        d15, d8, d3
          KMAC_R          d14, d9, d3
          KMAC_I          d15, d9, d2

  .endm
  /*
   * SAVE_F2: fold the two column accumulators into two consecutive y elements.
   *
   * Loads 32 bytes from YO into d4-d7, updates them from d12-d15 scaled by
   * d0 and d1, and stores them back with YO post-incremented by 32 bytes.
   * d0 / d1 are never written in this file; presumably they hold the real and
   * imaginary parts of alpha on entry -- confirm against the caller.
   * FMAC_* pick add or subtract depending on CONJ / XCONJ.
   *
   * The legacy fldmiad / fstmiad mnemonics are written in unified syntax
   * (vldmia / vstmia); the encodings are unchanged.
   * Clobbers: d4-d7, YO.
   */
  .macro SAVE_F2

          vldmia          YO, { d4 - d7 }

          FMAC_R1         d4, d0, d12
          FMAC_I1         d5, d0, d13
          FMAC_R2         d4, d1, d13
          FMAC_I2         d5, d1, d12

          FMAC_R1         d6, d0, d14
          FMAC_I1         d7, d0, d15
          FMAC_R2         d6, d1, d15
          FMAC_I2         d7, d1, d14

          vstmia          YO!, { d4 - d7 }

  .endm
  131. /************************************************************************************************/
  /*
   * INIT_F1: clear the two accumulators d12 / d13 used by the single-column
   * kernels, using the 8-byte zero kept in the FP_ZERO stack slot.
   * Uses the unified-syntax vldr (same encoding as the legacy fldd).
   * Clobbers: d12, d13.
   */
  .macro INIT_F1

          vldr            d12, FP_ZERO
          vmov.f64        d13, d12

  .endm
  /*
   * KERNEL_F1X4: four back-to-back expansions of KERNEL_F1X1, i.e. four
   * consecutive rows of the single-column, unit-stride inner loop.
   */
  .macro KERNEL_F1X4

          .rept   4
          KERNEL_F1X1
          .endr

  .endm
  /*
   * KERNEL_F1X1: one row of the single-column, unit-stride inner loop.
   *
   * Loads one 16-byte element from XO (d2, d3) and one from AO1 (d4, d5),
   * post-incrementing both pointers, and accumulates into d12 / d13:
   *     d12 += d4*d2  then  KMAC_R d12, d5, d3
   *     d13 += d4*d3  then  KMAC_I d13, d5, d2
   *
   * The legacy fldmiad / fmacd mnemonics are written in unified syntax
   * (vldmia / vmla.f64); the encodings are unchanged.
   * Clobbers: d2-d5, d12, d13, XO, AO1.
   */
  .macro KERNEL_F1X1

          vldmia          XO!,  { d2 - d3 }
          vldmia          AO1!, { d4 - d5 }

          vmla.f64        d12, d4, d2
          vmla.f64        d13, d4, d3
          KMAC_R          d12, d5, d3
          KMAC_I          d13, d5, d2

  .endm
  /*
   * SAVE_F1: fold the single column accumulator d12 / d13 into one y element.
   *
   * Loads 16 bytes from YO into d4 / d5, updates them from d12 / d13 scaled
   * by d0 and d1, and stores them back with YO post-incremented by 16 bytes.
   * d0 / d1 are never written in this file; presumably they hold the real and
   * imaginary parts of alpha on entry -- confirm against the caller.
   *
   * The legacy fldmiad / fstmiad mnemonics are written in unified syntax
   * (vldmia / vstmia); the encodings are unchanged.
   * Clobbers: d4, d5, YO.
   */
  .macro SAVE_F1

          vldmia          YO, { d4 - d5 }

          FMAC_R1         d4, d0, d12
          FMAC_I1         d5, d0, d13
          FMAC_R2         d4, d1, d13
          FMAC_I2         d5, d1, d12

          vstmia          YO!, { d4 - d5 }

  .endm
  158. /************************************************************************************************/
  /*
   * INIT_S2: clear the four accumulators d12-d15 used by the two-column
   * strided kernels, using the 8-byte zero kept in the FP_ZERO stack slot.
   * Uses the unified-syntax vldr (same encoding as the legacy fldd).
   * Clobbers: d12-d15.
   */
  .macro INIT_S2

          vldr            d12, FP_ZERO
          vmov.f64        d13, d12
          vmov.f64        d14, d12
          vmov.f64        d15, d12

  .endm
  /*
   * KERNEL_S2X4: four back-to-back expansions of KERNEL_S2X1, i.e. four
   * consecutive rows of the two-column, strided inner loop.
   */
  .macro KERNEL_S2X4

          .rept   4
          KERNEL_S2X1
          .endr

  .endm
  /*
   * KERNEL_S2X1: one row of the two-column inner loop with a strided x.
   *
   * Loads one 16-byte element from XO (d2, d3) without writeback, one from
   * AO1 (d4, d5) and one from AO2 (d8, d9) with post-increment, accumulates
   * column 1 into d12 / d13 and column 2 into d14 / d15, then advances XO by
   * INC_X (already scaled to bytes by the caller of this macro).
   *
   * The legacy fldmiad / fmacd mnemonics are written in unified syntax
   * (vldmia / vmla.f64); the encodings are unchanged.
   * Clobbers: d2-d5, d8, d9, d12-d15, XO, AO1, AO2.
   */
  .macro KERNEL_S2X1

          vldmia          XO,   { d2 - d3 }
          vldmia          AO1!, { d4 - d5 }
          vldmia          AO2!, { d8 - d9 }

          vmla.f64        d12, d4, d2
          vmla.f64        d13, d4, d3
          KMAC_R          d12, d5, d3
          KMAC_I          d13, d5, d2

          vmla.f64        d14, d8, d2
          vmla.f64        d15, d8, d3
          KMAC_R          d14, d9, d3
          KMAC_I          d15, d9, d2

          add             XO, XO, INC_X

  .endm
  /*
   * SAVE_S2: fold the two column accumulators into two y elements that are
   * INC_Y bytes apart.
   *
   * For each of the two elements: load 16 bytes from YO, update them from the
   * matching accumulator pair (d12 / d13, then d14 / d15) scaled by d0 and d1,
   * store them back to the same address, and advance YO by INC_Y.
   * d0 / d1 are never written in this file; presumably they hold the real and
   * imaginary parts of alpha on entry -- confirm against the caller.
   *
   * The legacy fldmiad / fstmiad mnemonics are written in unified syntax
   * (vldmia / vstmia); the encodings are unchanged.
   * Clobbers: d4-d7, YO.
   */
  .macro SAVE_S2

          vldmia          YO, { d4 - d5 }

          FMAC_R1         d4, d0, d12
          FMAC_I1         d5, d0, d13
          FMAC_R2         d4, d1, d13
          FMAC_I2         d5, d1, d12

          vstmia          YO, { d4 - d5 }
          add             YO, YO, INC_Y

          vldmia          YO, { d6 - d7 }

          FMAC_R1         d6, d0, d14
          FMAC_I1         d7, d0, d15
          FMAC_R2         d6, d1, d15
          FMAC_I2         d7, d1, d14

          vstmia          YO, { d6 - d7 }
          add             YO, YO, INC_Y

  .endm
  201. /************************************************************************************************/
  /*
   * INIT_S1: clear the two accumulators d12 / d13 used by the single-column
   * strided kernels, using the 8-byte zero kept in the FP_ZERO stack slot.
   * Uses the unified-syntax vldr (same encoding as the legacy fldd).
   * Clobbers: d12, d13.
   */
  .macro INIT_S1

          vldr            d12, FP_ZERO
          vmov.f64        d13, d12

  .endm
  /*
   * KERNEL_S1X4: four back-to-back expansions of KERNEL_S1X1, i.e. four
   * consecutive rows of the single-column, strided inner loop.
   */
  .macro KERNEL_S1X4

          .rept   4
          KERNEL_S1X1
          .endr

  .endm
  /*
   * KERNEL_S1X1: one row of the single-column inner loop with a strided x.
   *
   * Loads one 16-byte element from XO (d2, d3) without writeback and one from
   * AO1 (d4, d5) with post-increment, accumulates into d12 / d13, then
   * advances XO by INC_X (already scaled to bytes by the caller of this
   * macro).
   *
   * The legacy fldmiad / fmacd mnemonics are written in unified syntax
   * (vldmia / vmla.f64); the encodings are unchanged.
   * Clobbers: d2-d5, d12, d13, XO, AO1.
   */
  .macro KERNEL_S1X1

          vldmia          XO,   { d2 - d3 }
          vldmia          AO1!, { d4 - d5 }

          vmla.f64        d12, d4, d2
          vmla.f64        d13, d4, d3
          KMAC_R          d12, d5, d3
          KMAC_I          d13, d5, d2

          add             XO, XO, INC_X

  .endm
  /*
   * SAVE_S1: fold the single column accumulator d12 / d13 into one y element
   * and step YO by INC_Y bytes.
   *
   * Loads 16 bytes from YO into d4 / d5, updates them from d12 / d13 scaled
   * by d0 and d1, stores them back to the same address, then advances YO.
   * d0 / d1 are never written in this file; presumably they hold the real and
   * imaginary parts of alpha on entry -- confirm against the caller.
   *
   * The legacy fldmiad / fstmiad mnemonics are written in unified syntax
   * (vldmia / vstmia); the encodings are unchanged.
   * Clobbers: d4, d5, YO.
   */
  .macro SAVE_S1

          vldmia          YO, { d4 - d5 }

          FMAC_R1         d4, d0, d12
          FMAC_I1         d5, d0, d13
          FMAC_R2         d4, d1, d13
          FMAC_I2         d5, d1, d12

          vstmia          YO, { d4 - d5 }
          add             YO, YO, INC_Y

  .endm
  230. /**************************************************************************************
  231. * End of macro definitions
  232. **************************************************************************************/
  /*
   * Kernel entry point.
   *
   * Inputs as read by this code:
   *   r0            M, the number of rows walked by the inner loops
   *   r1            N, the number of columns (one y element per column)
   *   r3            pointer to the matrix
   *   [sp, #0]      LDA in elements (scaled to bytes below)
   *   [sp, #4]      pointer to x
   *   [sp, #8]      INC_X in elements
   *   [sp, #12]     pointer to y
   *   [sp, #16]     INC_Y in elements
   *   d0, d1        scale factors used by the SAVE_* macros; never written
   *                 here, presumably alpha real / imaginary -- confirm
   * Returns r0 = 0.
   *
   * For every column the code accumulates the complex products of the column
   * with x over M rows and then updates one y element through SAVE_*.
   * Columns are handled two at a time, with a single-column tail when N is
   * odd.  Rows are handled four at a time, with a one-row tail loop.  The F
   * path is taken when both increments are 1; otherwise the S path scales
   * the increments to bytes and steps the pointers explicitly.
   *
   * Nothing is computed when M <= 0, N <= 0, INC_X == 0 or INC_Y == 0.
   *
   * Frame: r4-r9 and fp are pushed, fp is set to the entry sp, STACKSIZE
   * bytes are reserved, and d8-d15 are saved at fp-192.  The kernel macros
   * use d8, d9 and d12-d15 regardless of DOUBLE, so the d registers are
   * always the ones saved and restored; the previous non-DOUBLE branch saved
   * s8-s15 instead, which left d8-d15 unprotected.
   *
   * NOTE(review): the conditional branches after asrs / ands use ble.  Those
   * instructions do not update V, so ble relies on V being clear from the
   * preceding cmp / subs -- confirm this is intended rather than beq.
   */
  PROLOGUE

  .align 5

          push    {r4 - r9 , fp}
          add     fp, sp, #28                     // fp = sp at entry (7 registers pushed)
          sub     sp, sp, #STACKSIZE              // reserve stack

          sub     r12, fp, #192
          vstm    r12, { d8 - d15 }               // store floating point registers

          movs    r12, #0
          str     r12, FP_ZERO                    // build the 8-byte zero used by INIT_*
          str     r12, FP_ZERO_1

          cmp     M, #0
          ble     zgemvt_kernel_L999

          cmp     OLD_N, #0
          ble     zgemvt_kernel_L999

          str     OLD_A, A                        // r3 and r1 are reused below, keep copies
          str     OLD_N, N

          ldr     INC_X , OLD_INC_X
          ldr     INC_Y , OLD_INC_Y

          cmp     INC_X, #0
          beq     zgemvt_kernel_L999

          cmp     INC_Y, #0
          beq     zgemvt_kernel_L999

          ldr     LDA, OLD_LDA

  /*
   * NOTE(review): the macros always move 16 bytes per element, so only the
   * DOUBLE shift looks consistent with them -- confirm whether the
   * alternative is ever built.
   */
  #if defined(DOUBLE)
          lsl     LDA, LDA, #4                    // LDA * SIZE
  #else
          lsl     LDA, LDA, #3                    // LDA * SIZE
  #endif

          cmp     INC_X, #1
          bne     zgemvt_kernel_S2_BEGIN

          cmp     INC_Y, #1
          bne     zgemvt_kernel_S2_BEGIN


  /* Unit-stride path: two columns per outer iteration. */
  zgemvt_kernel_F2_BEGIN:

          ldr     YO , Y

          ldr     J, N
          asrs    J, J, #1                        // J = N / 2
          ble     zgemvt_kernel_F1_BEGIN

  zgemvt_kernel_F2X4:

          ldr     AO1, A
          add     AO2, AO1, LDA
          add     r3 , AO2, LDA
          str     r3 , A                          // next pair of columns

          ldr     XO , X

          INIT_F2

          asrs    I, M, #2                        // I = M / 4
          ble     zgemvt_kernel_F2X1

  zgemvt_kernel_F2X4_10:

          KERNEL_F2X4

          subs    I, I, #1
          bne     zgemvt_kernel_F2X4_10

  zgemvt_kernel_F2X1:

          ands    I, M , #3                       // remaining rows
          ble     zgemvt_kernel_F2_END

  zgemvt_kernel_F2X1_10:

          KERNEL_F2X1

          subs    I, I, #1
          bne     zgemvt_kernel_F2X1_10

  zgemvt_kernel_F2_END:

          SAVE_F2

          subs    J , J , #1
          bne     zgemvt_kernel_F2X4


  /* Unit-stride path: last column when N is odd. */
  zgemvt_kernel_F1_BEGIN:

          ldr     J, N
          ands    J, J, #1
          ble     zgemvt_kernel_L999

  zgemvt_kernel_F1X4:

          ldr     AO1, A

          ldr     XO , X

          INIT_F1

          asrs    I, M, #2                        // I = M / 4
          ble     zgemvt_kernel_F1X1

  zgemvt_kernel_F1X4_10:

          KERNEL_F1X4

          subs    I, I, #1
          bne     zgemvt_kernel_F1X4_10

  zgemvt_kernel_F1X1:

          ands    I, M , #3                       // remaining rows
          ble     zgemvt_kernel_F1_END

  zgemvt_kernel_F1X1_10:

          KERNEL_F1X1

          subs    I, I, #1
          bne     zgemvt_kernel_F1X1_10

  zgemvt_kernel_F1_END:

          SAVE_F1

          b       zgemvt_kernel_L999


  /*************************************************************************************************************/

  /* Strided path: increments are converted to byte steps first. */
  zgemvt_kernel_S2_BEGIN:

  #if defined(DOUBLE)
          lsl     INC_X, INC_X, #4                // INC_X * SIZE
          lsl     INC_Y, INC_Y, #4                // INC_Y * SIZE
  #else
          lsl     INC_X, INC_X, #3                // INC_X * SIZE
          lsl     INC_Y, INC_Y, #3                // INC_Y * SIZE
  #endif

          ldr     YO , Y

          ldr     J, N
          asrs    J, J, #1                        // J = N / 2
          ble     zgemvt_kernel_S1_BEGIN

  zgemvt_kernel_S2X4:

          ldr     AO1, A
          add     AO2, AO1, LDA
          add     r3 , AO2, LDA
          str     r3 , A                          // next pair of columns

          ldr     XO , X

          INIT_S2

          asrs    I, M, #2                        // I = M / 4
          ble     zgemvt_kernel_S2X1

  zgemvt_kernel_S2X4_10:

          KERNEL_S2X4

          subs    I, I, #1
          bne     zgemvt_kernel_S2X4_10

  zgemvt_kernel_S2X1:

          ands    I, M , #3                       // remaining rows
          ble     zgemvt_kernel_S2_END

  zgemvt_kernel_S2X1_10:

          KERNEL_S2X1

          subs    I, I, #1
          bne     zgemvt_kernel_S2X1_10

  zgemvt_kernel_S2_END:

          SAVE_S2

          subs    J , J , #1
          bne     zgemvt_kernel_S2X4


  /* Strided path: last column when N is odd. */
  zgemvt_kernel_S1_BEGIN:

          ldr     J, N
          ands    J, J, #1
          ble     zgemvt_kernel_L999

  zgemvt_kernel_S1X4:

          ldr     AO1, A

          ldr     XO , X

          INIT_S1

          asrs    I, M, #2                        // I = M / 4
          ble     zgemvt_kernel_S1X1

  zgemvt_kernel_S1X4_10:

          KERNEL_S1X4

          subs    I, I, #1
          bne     zgemvt_kernel_S1X4_10

  zgemvt_kernel_S1X1:

          ands    I, M , #3                       // remaining rows
          ble     zgemvt_kernel_S1_END

  zgemvt_kernel_S1X1_10:

          KERNEL_S1X1

          subs    I, I, #1
          bne     zgemvt_kernel_S1X1_10

  zgemvt_kernel_S1_END:

          SAVE_S1


  /*************************************************************************************************************/

  /* Common exit: restore registers, release the frame, return 0. */
  zgemvt_kernel_L999:

          sub     r3, fp, #192
          vldm    r3, { d8 - d15 }                // restore floating point registers

          mov     r0, #0                          // set return value

          sub     sp, fp, #28
          pop     {r4 -r9 ,fp}
          bx      lr

  EPILOGUE