You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

dtrmm_kernel_4x2_vfp.S 19 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/28 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. **************************************************************************************/
  34. #define ASSEMBLER
  35. #include "common.h"
  /* Scratch bytes reserved below the seven registers pushed in the prologue. */
  #define STACKSIZE 252

  /* Arguments as they arrive: M, N, K and A in r0-r3, alpha in d0. */
  #define OLD_M r0
  #define OLD_N r1
  #define OLD_K r2
  #define OLD_A r3
  #define OLD_ALPHA d0

  /******************************************************
  * Frame layout, relative to fp. The prologue sets fp to
  * sp + 24 right after pushing seven registers, which is
  * 4 bytes below the sp seen on entry.
  *
  * [fp, #-128] - [fp, #-64] is reserved
  * for store and restore of floating point
  * registers (d8 - d15)
  *
  * [fp, #-276] - [fp, #-237] holds the spilled arguments
  * and the loop state named below. fp - 276 is also the
  * lowest reserved address (24 + STACKSIZE below fp).
  *******************************************************/
  #define KK [fp, #-240 ]       /* running offset: seeded from OFFSET, advanced per tile or panel */
  #define KKK [fp, #-244]       /* k trip count chosen for the current tile */
  #define C [fp, #-248 ]        /* C pointer for the next column panel */
  #define LDC [fp, #-252 ]      /* ldc scaled to bytes (ldc * 8) */
  #define M [fp, #-256 ]        /* saved M */
  #define N [fp, #-260 ]        /* saved N */
  #define K [fp, #-264 ]        /* saved K */
  #define A [fp, #-268 ]        /* saved A pointer */
  #define ALPHA [fp, #-276 ]    /* saved alpha, 8 bytes */

  /* Arguments passed on the stack; [fp, #4] is the first word above the entry sp. */
  #define B [fp, #4 ]
  #define OLD_C [fp, #8 ]
  #define OLD_LDC [fp, #12 ]
  #define OFFSET [fp, #16 ]

  /* Core register roles inside the kernel. */
  #define I r0                  /* row tile counter */
  #define J r1                  /* column panel counter */
  #define L r2                  /* k loop counter */
  #define AO r5                 /* read pointer into A */
  #define BO r6                 /* read pointer into B */
  #define CO1 r8                /* write pointer, first column of the tile */
  #define CO2 r9                /* write pointer, second column of the tile */
  #define K1 r7                 /* copy of the k trip count */
  #define BC r12                /* start of the current B panel */

  /* Prefetch distances in bytes. C_PRE is not referenced by the code visible here. */
  #define A_PRE 64
  #define B_PRE 64
  #define C_PRE 64
  72. /**************************************************************************************
  73. * Macro definitions
  74. **************************************************************************************/
  .macro INIT4x2

        // Zero the eight accumulators of a 4x2 tile
        // (d8-d11: column written through CO1, d12-d15: column written through CO2).
        //
        // The zero is built from a core register rather than by subtracting d8
        // from itself: x - x is NaN when x is NaN or an infinity, and d8 still
        // holds whatever the caller or the previous tile left in it.
        //
        // Clobbers r3. Every call site in this file reloads r3 before reading it.
        mov             r3 , #0
        vmov            d8 , r3 , r3            // d8 = +0.0 (all 64 bits clear)
        vmov.f64        d9 , d8
        vmov.f64        d10, d8
        vmov.f64        d11, d8
        vmov.f64        d12, d8
        vmov.f64        d13, d8
        vmov.f64        d14, d8
        vmov.f64        d15, d8

  .endm
  .macro KERNEL4x2_SUB

        // One k step of the 4x2 tile: four values from AO times two values from BO
        // are accumulated into d8-d15. AO advances by 32 bytes, BO by 16 bytes.
        // Loads and pointer updates are interleaved with the multiply-accumulates
        // (presumably for scheduling), so the instruction order is kept as is.
        vldr            d4 , [ BO ]             // b0
        vldr            d0 , [ AO ]             // a0
        vldr            d1 , [ AO, #8 ]         // a1
        pld             [ AO , #A_PRE ]

        vmla.f64        d8 , d0, d4             // acc(0,0) += a0 * b0
        vldr            d2 , [ AO, #16 ]        // a2
        vmla.f64        d9 , d1, d4             // acc(1,0) += a1 * b0
        vldr            d3 , [ AO, #24 ]        // a3
        vmla.f64        d10 , d2, d4            // acc(2,0) += a2 * b0
        vldr            d5 , [ BO, #8 ]         // b1
        vmla.f64        d11 , d3, d4            // acc(3,0) += a3 * b0

        vmla.f64        d12 , d0, d5            // acc(0,1) += a0 * b1
        vmla.f64        d13 , d1, d5            // acc(1,1) += a1 * b1
        add             AO , AO, #32
        vmla.f64        d14 , d2, d5            // acc(2,1) += a2 * b1
        add             BO , BO, #16
        vmla.f64        d15 , d3, d5            // acc(3,1) += a3 * b1

  .endm
  .macro SAVE4x2

        // Store the finished 4x2 tile as alpha * accumulator. C is written only,
        // never read. CO2 is set to CO1 plus one column stride, and CO1 is left
        // advanced by four doubles. Clobbers r3, d0 and d4-d7.
        ldr             r3 , LDC
        add             CO2 , CO1, r3           // second column of the tile

        vldr            d0, ALPHA

        // first column: d8-d11
        vmul.f64        d4 , d0 , d8
        vmul.f64        d5 , d0 , d9
        vmul.f64        d6 , d0 , d10
        vmul.f64        d7 , d0 , d11

        vstr            d4 , [CO1]
        vstr            d5 , [CO1, #8 ]
        vstr            d6 , [CO1, #16 ]
        vstr            d7 , [CO1, #24 ]

        // second column: d12-d15
        vmul.f64        d4 , d0 , d12
        vmul.f64        d5 , d0 , d13
        vmul.f64        d6 , d0 , d14
        vmul.f64        d7 , d0 , d15

        vstr            d4 , [CO2]
        vstr            d5 , [CO2, #8 ]
        vstr            d6 , [CO2, #16 ]
        vstr            d7 , [CO2, #24 ]

        add             CO1, CO1, #32

  .endm
  126. /******************************************************************************/
  .macro INIT2x2

        // Zero the four accumulators of a 2x2 tile
        // (d8-d9: column written through CO1, d12-d13: column written through CO2).
        //
        // The zero is built from a core register rather than by subtracting d8
        // from itself: x - x is NaN when x is NaN or an infinity, and d8 still
        // holds whatever the caller or the previous tile left in it.
        //
        // Clobbers r3. Every call site in this file reloads r3 before reading it.
        mov             r3 , #0
        vmov            d8 , r3 , r3            // d8 = +0.0 (all 64 bits clear)
        vmov.f64        d9 , d8
        vmov.f64        d12, d8
        vmov.f64        d13, d8

  .endm
  .macro KERNEL2x2_SUB

        // One k step of the 2x2 tile: two values from AO times two values from BO
        // are accumulated into d8, d9, d12 and d13. Both pointers advance 16 bytes.
        vldr            d0 , [ AO ]             // a0
        vldr            d1 , [ AO, #8 ]         // a1
        vldr            d4 , [ BO ]             // b0
        vldr            d5 , [ BO, #8 ]         // b1

        vmla.f64        d8 , d0, d4             // acc(0,0) += a0 * b0
        vmla.f64        d9 , d1, d4             // acc(1,0) += a1 * b0
        vmla.f64        d12 , d0, d5            // acc(0,1) += a0 * b1
        vmla.f64        d13 , d1, d5            // acc(1,1) += a1 * b1

        add             AO , AO, #16
        add             BO , BO, #16

  .endm
  .macro SAVE2x2

        // Store the finished 2x2 tile as alpha * accumulator. C is written only,
        // never read. CO2 is set to CO1 plus one column stride, and CO1 is left
        // advanced by two doubles. Clobbers r3, d0, d4 and d5.
        ldr             r3 , LDC
        add             CO2 , CO1, r3           // second column of the tile

        vldr            d0, ALPHA

        // first column: d8-d9
        vmul.f64        d4 , d0 , d8
        vmul.f64        d5 , d0 , d9

        vstr            d4 , [CO1]
        vstr            d5 , [CO1, #8 ]

        // second column: d12-d13
        vmul.f64        d4 , d0 , d12
        vmul.f64        d5 , d0 , d13

        vstr            d4 , [CO2]
        vstr            d5 , [CO2, #8 ]

        add             CO1, CO1, #16

  .endm
  159. /******************************************************************************/
  .macro INIT1x2

        // Zero the two accumulators of a 1x2 tile
        // (d8: column written through CO1, d12: column written through CO2).
        //
        // The zero is built from a core register rather than by subtracting d8
        // from itself: x - x is NaN when x is NaN or an infinity, and d8 still
        // holds whatever the caller or the previous tile left in it.
        //
        // Clobbers r3. Every call site in this file reloads r3 before reading it.
        mov             r3 , #0
        vmov            d8 , r3 , r3            // d8 = +0.0 (all 64 bits clear)
        vmov.f64        d12, d8

  .endm
  .macro KERNEL1x2_SUB

        // One k step of the 1x2 tile: one value from AO times two values from BO
        // is accumulated into d8 and d12. AO advances 8 bytes, BO 16 bytes.
        vldr            d0 , [ AO ]             // a0
        vldr            d4 , [ BO ]             // b0
        vldr            d5 , [ BO, #8 ]         // b1

        vmla.f64        d8 , d0, d4             // acc(0,0) += a0 * b0
        vmla.f64        d12 , d0, d5            // acc(0,1) += a0 * b1

        add             AO , AO, #8
        add             BO , BO, #16

  .endm
  .macro SAVE1x2

        // Store the finished 1x2 tile as alpha * accumulator. C is written only,
        // never read. CO2 is set to CO1 plus one column stride, and CO1 is left
        // advanced by one double. Clobbers r3, d0 and d4.
        ldr             r3 , LDC
        add             CO2 , CO1, r3           // second column of the tile

        vldr            d0, ALPHA

        vmul.f64        d4 , d0 , d8
        vstr            d4 , [CO1]

        vmul.f64        d4 , d0 , d12
        vstr            d4 , [CO2]

        add             CO1, CO1, #8

  .endm
  183. /******************************************************************************/
  .macro INIT4x1

        // Zero the four accumulators d8-d11 of a 4x1 tile.
        //
        // The zero is built from a core register rather than by subtracting d8
        // from itself: x - x is NaN when x is NaN or an infinity, and d8 still
        // holds whatever the caller or the previous tile left in it.
        //
        // Clobbers r3. Every call site in this file reloads r3 before reading it.
        mov             r3 , #0
        vmov            d8 , r3 , r3            // d8 = +0.0 (all 64 bits clear)
        vmov.f64        d9 , d8
        vmov.f64        d10, d8
        vmov.f64        d11, d8

  .endm
  .macro KERNEL4x1_SUB

        // One k step of the 4x1 tile: four values from AO times one value from BO
        // are accumulated into d8-d11. AO advances 32 bytes, BO 8 bytes.
        vldr            d0 , [ AO ]             // a0
        vldr            d1 , [ AO, #8 ]         // a1
        vldr            d2 , [ AO, #16 ]        // a2
        vldr            d3 , [ AO, #24 ]        // a3
        vldr            d4 , [ BO ]             // b0

        vmla.f64        d8 , d0, d4             // acc0 += a0 * b0
        vmla.f64        d9 , d1, d4             // acc1 += a1 * b0
        vmla.f64        d10 , d2, d4            // acc2 += a2 * b0
        vmla.f64        d11 , d3, d4            // acc3 += a3 * b0

        add             AO , AO, #32
        add             BO , BO, #8

  .endm
  .macro SAVE4x1

        // Store the finished 4x1 tile as alpha * accumulator. C is written only,
        // never read. CO1 is left advanced by four doubles.
        // Clobbers d0 and d4-d7.
        vldr            d0, ALPHA

        vmul.f64        d4 , d0 , d8
        vmul.f64        d5 , d0 , d9
        vmul.f64        d6 , d0 , d10
        vmul.f64        d7 , d0 , d11

        vstr            d4 , [CO1]
        vstr            d5 , [CO1, #8 ]
        vstr            d6 , [CO1, #16 ]
        vstr            d7 , [CO1, #24 ]

        add             CO1, CO1, #32

  .endm
  215. /******************************************************************************/
  .macro INIT2x1

        // Zero the two accumulators d8 and d9 of a 2x1 tile.
        //
        // The zero is built from a core register rather than by subtracting d8
        // from itself: x - x is NaN when x is NaN or an infinity, and d8 still
        // holds whatever the caller or the previous tile left in it.
        //
        // Clobbers r3. Every call site in this file reloads r3 before reading it.
        mov             r3 , #0
        vmov            d8 , r3 , r3            // d8 = +0.0 (all 64 bits clear)
        vmov.f64        d9 , d8

  .endm
  .macro KERNEL2x1_SUB

        // One k step of the 2x1 tile: two values from AO times one value from BO
        // are accumulated into d8 and d9. AO advances 16 bytes, BO 8 bytes.
        vldr            d0 , [ AO ]             // a0
        vldr            d1 , [ AO, #8 ]         // a1
        vldr            d4 , [ BO ]             // b0

        vmla.f64        d8 , d0, d4             // acc0 += a0 * b0
        vmla.f64        d9 , d1, d4             // acc1 += a1 * b0

        add             AO , AO, #16
        add             BO , BO, #8

  .endm
  .macro SAVE2x1

        // Store the finished 2x1 tile as alpha * accumulator. C is written only,
        // never read. CO1 is left advanced by two doubles.
        // Clobbers d0, d4 and d5.
        vldr            d0, ALPHA

        vmul.f64        d4 , d0 , d8
        vmul.f64        d5 , d0 , d9

        vstr            d4 , [CO1]
        vstr            d5 , [CO1, #8 ]

        add             CO1, CO1, #16

  .endm
  237. /******************************************************************************/
  .macro INIT1x1

        // Zero the single accumulator d8 of a 1x1 tile.
        //
        // The zero is built from a core register rather than by subtracting d8
        // from itself: x - x is NaN when x is NaN or an infinity, and d8 still
        // holds whatever the caller or the previous tile left in it.
        //
        // Clobbers r3. Every call site in this file reloads r3 before reading it.
        mov             r3 , #0
        vmov            d8 , r3 , r3            // d8 = +0.0 (all 64 bits clear)

  .endm
  .macro KERNEL1x1_SUB

        // One k step of the 1x1 tile: d8 += [AO] * [BO]. Both pointers advance 8 bytes.
        vldr            d0 , [ AO ]             // a0
        vldr            d4 , [ BO ]             // b0

        vmla.f64        d8 , d0, d4

        add             AO , AO, #8
        add             BO , BO, #8

  .endm
  .macro SAVE1x1

        // Store the finished 1x1 tile as alpha * accumulator. C is written only,
        // never read. CO1 is left advanced by one double. Clobbers d0 and d4.
        vldr            d0, ALPHA

        vmul.f64        d4 , d0 , d8
        vstr            d4 , [CO1]

        add             CO1, CO1, #8

  .endm
  254. /**************************************************************************************
  255. * End of macro definitions
  256. **************************************************************************************/
  PROLOGUE

  .align 5

        // Build the frame: save r4-r9 and fp, point fp 24 bytes above the new sp
        // (4 bytes below the entry sp), then reserve STACKSIZE bytes of scratch.
        push    {r4 - r9, fp}
        add     fp, sp, #24
        sub     sp, sp, #STACKSIZE              // reserve stack

        // Spill the register arguments so that r0-r3 and d0 can be reused.
        // Alpha is taken from d0 while A is taken from r3.
        str     OLD_M, M
        str     OLD_N, N
        str     OLD_K, K
        str     OLD_A, A
        vstr    OLD_ALPHA, ALPHA

        // d8-d15 serve as accumulators; they are saved here and reloaded
        // from the same slot before returning.
        sub     r3, fp, #128
        vstm    r3, { d8 - d15}                 // store floating point registers

        ldr     r3, OLD_LDC
        lsl     r3, r3, #3                      // ldc = ldc * 8
        str     r3, LDC

        ldr     r3, OLD_C
        str     r3, C

        ldr     BC, B

        // KK starts at OFFSET, or at -OFFSET when LEFT is not defined.
        ldr     r3, OFFSET
  #ifndef LEFT
        neg     r3 , r3
  #endif
        str     r3 , KK

        ldr     J, N
        asr     J, J, #1                        // J = N / 2
        // LE reads the V flag, and a flag-setting shift does not write V, so the
        // branch used to depend on whatever V the caller left behind. Comparing
        // with zero makes the test depend on J alone and leaves V clear.
        cmp     J, #0
        ble     _L1_BEGIN
  _L2_BEGIN:

        // Start of one pass over a pair of columns. CO1 takes the current C
        // pointer, the saved pointer moves on by two columns, and AO restarts at A.
        ldr     AO, A                           // AO = A

        ldr     CO1, C                          // CO1 = C
        ldr     r4 , LDC
        lsl     r4 , r4 , #1                    // LDC * 2
        add     r3 , CO1, r4
        str     r3 , C                          // store C

  #ifdef LEFT
        // With LEFT the running offset restarts from OFFSET for every panel.
        ldr     r3 , OFFSET
        str     r3 , KK
  #endif
  _L2_M4_BEGIN:

        // Four-row tiles of the two-column panel.
        ldr     I, M
        asrs    I, I, #2                        // I = M / 4
        ble     _L2_M2_BEGIN

  _L2_M4_20:

        INIT4x2

        // BO always restarts at the top of the current B panel.
        mov     BO, BC
  #if (defined(LEFT) && !defined(TRANSA)) || \
      (!defined(LEFT) && defined(TRANSA))
        // Skip the first KK k steps of both operands.
        ldr     r3 , KK
        lsls    r4 , r3 , #4                    // 2 double values
        add     BO , BO , r4
        lsls    r4 , r3 , #5                    // 4 double values
        add     AO , AO , r4
  #endif

        // Pick the k trip count for this tile and remember it in KKK.
  #ifndef TRMMKERNEL
        ldr     L , K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        ldr     L , K
        ldr     r3, KK
        sub     L , L, r3                       // K - KK
        str     L , KKK
  #else
        ldr     L , KK
  #ifdef LEFT
        add     L , L , #4                      // number of values in AO
  #else
        add     L , L , #2                      // number of values in BO
  #endif
        str     L , KKK
  #endif

        mov     K1, L
        asrs    L , K1, #3                      // L = trip count / 8
        ble     _L2_M4_40

  .align 5

  _L2_M4_22:

        // Eight k steps per pass, with a B prefetch ahead of every second step.
  .rept 4
        pld     [ BO , #B_PRE ]
        KERNEL4x2_SUB
        KERNEL4x2_SUB
  .endr

        subs    L, L, #1
        bgt     _L2_M4_22

  _L2_M4_40:

        ands    L , K1, #7                      // L = trip count % 8
        ble     _L2_M4_100

  _L2_M4_42:

        KERNEL4x2_SUB

        subs    L, L, #1
        bgt     _L2_M4_42

  _L2_M4_100:

        SAVE4x2

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
        // Step both pointers over the K - KKK k steps this tile did not consume.
        ldr     r3 , K
        ldr     r4 , KKK
        sub     r3 , r3 , r4
        lsls    r4 , r3 , #4                    // 2 double values
        add     BO , BO , r4
        lsls    r4 , r3 , #5                    // 4 double values
        add     AO , AO , r4
  #endif

  #ifdef LEFT
        ldr     r3 , KK
        add     r3 , r3 , #4                    // number of values in AO
        str     r3 , KK
  #endif

  _L2_M4_END:

        subs    I, I, #1
        bgt     _L2_M4_20
  _L2_M2_BEGIN:

        // Leftover rows of the two-column panel. LE after TST is taken when the
        // masked bits are all zero (N is clear for these masks; relies on V
        // being clear).
        ldr     I, M
        tst     I , #3                          // nothing left when M is a multiple of 4
        ble     _L2_END

        tst     I, #2                           // two-row tile only when bit 1 of M is set
        ble     _L2_M1_BEGIN

  _L2_M2_20:

        INIT2x2

        // BO always restarts at the top of the current B panel.
        mov     BO, BC
  #if (defined(LEFT) && !defined(TRANSA)) || \
      (!defined(LEFT) && defined(TRANSA))
        // Skip the first KK k steps of both operands.
        ldr     r3 , KK
        lsls    r4 , r3 , #4                    // 2 double values
        add     BO , BO , r4
        lsls    r4 , r3 , #4                    // 2 double values
        add     AO , AO , r4
  #endif

        // Pick the k trip count for this tile and remember it in KKK.
  #ifndef TRMMKERNEL
        ldr     L , K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        ldr     L , K
        ldr     r3, KK
        sub     L , L, r3                       // K - KK
        str     L , KKK
  #else
        ldr     L , KK
        // Two values per k step in AO and in BO alike, so the increment is the
        // same whether or not LEFT is defined.
        add     L , L , #2
        str     L , KKK
  #endif

        mov     K1, L
        asrs    L , K1, #3                      // L = trip count / 8
        ble     _L2_M2_40

  _L2_M2_22:

        // Eight k steps per pass.
  .rept 8
        KERNEL2x2_SUB
  .endr

        subs    L, L, #1
        bgt     _L2_M2_22

  _L2_M2_40:

        ands    L , K1, #7                      // L = trip count % 8
        ble     _L2_M2_100

  _L2_M2_42:

        KERNEL2x2_SUB

        subs    L, L, #1
        bgt     _L2_M2_42

  _L2_M2_100:

        SAVE2x2

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
        // Step both pointers over the K - KKK k steps this tile did not consume.
        ldr     r3 , K
        ldr     r4 , KKK
        sub     r3 , r3 , r4
        lsls    r4 , r3 , #4                    // 2 double values
        add     BO , BO , r4
        lsls    r4 , r3 , #4                    // 2 double values
        add     AO , AO , r4
  #endif

  #ifdef LEFT
        ldr     r3 , KK
        add     r3 , r3 , #2                    // number of values in AO
        str     r3 , KK
  #endif

  _L2_M2_END:
  _L2_M1_BEGIN:

        // Last single row of the two-column panel, present when M is odd.
        // I still holds M, loaded at _L2_M2_BEGIN.
        tst     I, #1                           // bit 0 of M
        ble     _L2_END

  _L2_M1_20:

        INIT1x2

        // BO always restarts at the top of the current B panel.
        mov     BO, BC
  #if (defined(LEFT) && !defined(TRANSA)) || \
      (!defined(LEFT) && defined(TRANSA))
        // Skip the first KK k steps of both operands.
        ldr     r3 , KK
        lsls    r4 , r3 , #4                    // 2 double values
        add     BO , BO , r4
        lsls    r4 , r3 , #3                    // 1 double value
        add     AO , AO , r4
  #endif

        // Pick the k trip count for this tile and remember it in KKK.
  #ifndef TRMMKERNEL
        ldr     L , K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        ldr     L , K
        ldr     r3, KK
        sub     L , L, r3                       // K - KK
        str     L , KKK
  #else
        ldr     L , KK
  #ifdef LEFT
        add     L , L , #1                      // number of values in AO
  #else
        add     L , L , #2                      // number of values in BO
  #endif
        str     L , KKK
  #endif

        mov     K1, L
        asrs    L , K1, #3                      // L = trip count / 8
        ble     _L2_M1_40

  _L2_M1_22:

        // Eight k steps per pass.
  .rept 8
        KERNEL1x2_SUB
  .endr

        subs    L, L, #1
        bgt     _L2_M1_22

  _L2_M1_40:

        ands    L , K1, #7                      // L = trip count % 8
        ble     _L2_M1_100

  _L2_M1_42:

        KERNEL1x2_SUB

        subs    L, L, #1
        bgt     _L2_M1_42

  _L2_M1_100:

        SAVE1x2

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
        // Step both pointers over the K - KKK k steps this tile did not consume.
        ldr     r3 , K
        ldr     r4 , KKK
        sub     r3 , r3 , r4
        lsls    r4 , r3 , #4                    // 2 double values
        add     BO , BO , r4
        lsls    r4 , r3 , #3                    // 1 double value
        add     AO , AO , r4
  #endif

  #ifdef LEFT
        ldr     r3 , KK
        add     r3 , r3 , #1                    // number of values in AO
        str     r3 , KK
  #endif
  _L2_END:

        // Move the B panel base past the two columns just consumed.
        ldr     r4, K
        lsl     r4, r4, #4                      // k * 2 * 8
        add     BC, BC, r4                      // B = B + K * 2 * 8

  #ifndef LEFT
        ldr     r3 , KK
        add     r3 , r3 , #2                    // number of values in BO
        str     r3 , KK
  #endif

        subs    J , J , #1                      // j--
        bgt     _L2_BEGIN
  530. /*********************************************************************************************/
  _L1_BEGIN:

        // Remaining single column, present when N is odd. LE after TST is taken
        // when bit 0 is clear (relies on V being clear).
        ldr     J , N
        tst     J , #1
        ble     _L999

        ldr     AO, A                           // AO = A

        ldr     CO1, C                          // CO1 = C
        ldr     r4 , LDC
        add     r3 , CO1, r4
        str     r3 , C                          // store C

  #ifdef LEFT
        // With LEFT the running offset restarts from OFFSET for every panel.
        ldr     r3 , OFFSET
        str     r3 , KK
  #endif

        //pld [AO , #A_PRE-96]
        //pld [AO , #A_PRE-64]
        //pld [AO , #A_PRE-32]
  _L1_M4_BEGIN:

        // Four-row tiles of the single-column panel.
        ldr     I, M
        asrs    I, I, #2                        // I = M / 4
        ble     _L1_M2_BEGIN

  _L1_M4_20:

        INIT4x1

        // BO always restarts at the top of the current B panel.
        mov     BO, BC
  #if (defined(LEFT) && !defined(TRANSA)) || \
      (!defined(LEFT) && defined(TRANSA))
        // Skip the first KK k steps of both operands.
        ldr     r3 , KK
        lsls    r4 , r3 , #3                    // 1 double value
        add     BO , BO , r4
        lsls    r4 , r3 , #5                    // 4 double values
        add     AO , AO , r4
  #endif

        // Pick the k trip count for this tile and remember it in KKK.
  #ifndef TRMMKERNEL
        ldr     L , K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        ldr     L , K
        ldr     r3, KK
        sub     L , L, r3                       // K - KK
        str     L , KKK
  #else
        ldr     L , KK
  #ifdef LEFT
        add     L , L , #4                      // number of values in AO
  #else
        add     L , L , #1                      // number of values in BO
  #endif
        str     L , KKK
  #endif

        mov     K1, L
        asrs    L , K1, #3                      // L = trip count / 8
        ble     _L1_M4_40

  .align 5

  _L1_M4_22:

        // Eight k steps per pass.
  .rept 8
        KERNEL4x1_SUB
  .endr

        subs    L, L, #1
        bgt     _L1_M4_22

  _L1_M4_40:

        ands    L , K1, #7                      // L = trip count % 8
        ble     _L1_M4_100

  _L1_M4_42:

        KERNEL4x1_SUB

        subs    L, L, #1
        bgt     _L1_M4_42

  _L1_M4_100:

        SAVE4x1

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
        // Step both pointers over the K - KKK k steps this tile did not consume.
        ldr     r3 , K
        ldr     r4 , KKK
        sub     r3 , r3 , r4
        lsls    r4 , r3 , #3                    // 1 double value
        add     BO , BO , r4
        lsls    r4 , r3 , #5                    // 4 double values
        add     AO , AO , r4
  #endif

  #ifdef LEFT
        ldr     r3 , KK
        add     r3 , r3 , #4                    // number of values in AO
        str     r3 , KK
  #endif

  _L1_M4_END:

        subs    I, I, #1
        bgt     _L1_M4_20
  _L1_M2_BEGIN:

        // Leftover rows of the single-column panel. LE after TST is taken when
        // the masked bits are all zero (N is clear for these masks; relies on V
        // being clear).
        ldr     I, M
        tst     I , #3                          // nothing left when M is a multiple of 4
        ble     _L1_END

        tst     I, #2                           // two-row tile only when bit 1 of M is set
        ble     _L1_M1_BEGIN

  _L1_M2_20:

        INIT2x1

        // BO always restarts at the top of the current B panel.
        mov     BO, BC
  #if (defined(LEFT) && !defined(TRANSA)) || \
      (!defined(LEFT) && defined(TRANSA))
        // Skip the first KK k steps of both operands.
        ldr     r3 , KK
        lsls    r4 , r3 , #3                    // 1 double value
        add     BO , BO , r4
        lsls    r4 , r3 , #4                    // 2 double values
        add     AO , AO , r4
  #endif

        // Pick the k trip count for this tile and remember it in KKK.
  #ifndef TRMMKERNEL
        ldr     L , K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        ldr     L , K
        ldr     r3, KK
        sub     L , L, r3                       // K - KK
        str     L , KKK
  #else
        ldr     L , KK
  #ifdef LEFT
        add     L , L , #2                      // number of values in AO
  #else
        add     L , L , #1                      // number of values in BO
  #endif
        str     L , KKK
  #endif

        mov     K1, L
        asrs    L , K1, #3                      // L = trip count / 8
        ble     _L1_M2_40

  _L1_M2_22:

        // Eight k steps per pass.
  .rept 8
        KERNEL2x1_SUB
  .endr

        subs    L, L, #1
        bgt     _L1_M2_22

  _L1_M2_40:

        ands    L , K1, #7                      // L = trip count % 8
        ble     _L1_M2_100

  _L1_M2_42:

        KERNEL2x1_SUB

        subs    L, L, #1
        bgt     _L1_M2_42

  _L1_M2_100:

        SAVE2x1

  #if (defined(LEFT) && defined(TRANSA)) || \
      (!defined(LEFT) && !defined(TRANSA))
        // Step both pointers over the K - KKK k steps this tile did not consume.
        ldr     r3 , K
        ldr     r4 , KKK
        sub     r3 , r3 , r4
        lsls    r4 , r3 , #3                    // 1 double value
        add     BO , BO , r4
        lsls    r4 , r3 , #4                    // 2 double values
        add     AO , AO , r4
  #endif

  #ifdef LEFT
        ldr     r3 , KK
        add     r3 , r3 , #2                    // number of values in AO
        str     r3 , KK
  #endif

  _L1_M2_END:
  _L1_M1_BEGIN:

        // Last single element of the single-column panel, present when M is odd.
        // I still holds M, loaded at _L1_M2_BEGIN.
        tst     I, #1                           // bit 0 of M
        ble     _L1_END

  _L1_M1_20:

        INIT1x1

        // BO always restarts at the top of the current B panel.
        mov     BO, BC
  #if (defined(LEFT) && !defined(TRANSA)) || \
      (!defined(LEFT) && defined(TRANSA))
        // Skip the first KK k steps of both operands.
        ldr     r3 , KK
        lsls    r4 , r3 , #3                    // 1 double value
        add     BO , BO , r4
        lsls    r4 , r3 , #3                    // 1 double value
        add     AO , AO , r4
  #endif

        // Pick the k trip count for this tile and remember it in KKK.
  #ifndef TRMMKERNEL
        ldr     L , K
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        ldr     L , K
        ldr     r3, KK
        sub     L , L, r3                       // K - KK
        str     L , KKK
  #else
        ldr     L , KK
        // One value per k step in AO and in BO alike, so the increment is the
        // same whether or not LEFT is defined.
        add     L , L , #1
        str     L , KKK
  #endif

        mov     K1, L
        asrs    L , K1, #3                      // L = trip count / 8
        ble     _L1_M1_40

  _L1_M1_22:

        // Eight k steps per pass.
  .rept 8
        KERNEL1x1_SUB
  .endr

        subs    L, L, #1
        bgt     _L1_M1_22

  _L1_M1_40:

        ands    L , K1, #7                      // L = trip count % 8
        ble     _L1_M1_100

  _L1_M1_42:

        KERNEL1x1_SUB

        subs    L, L, #1
        bgt     _L1_M1_42

  _L1_M1_100:

        // Final tile: no pointer or KK bookkeeping follows the store.
        SAVE1x1
  _L1_END:

  _L999:

        // Common exit: reload d8-d15 from the slot filled in the prologue,
        // unwind the frame and return 0.
        sub     r3, fp, #128
        vldm    r3, { d8 - d15}                 // restore floating point registers

        movs    r0, #0                          // set return value
        sub     sp, fp, #24                     // back to the sp left by the push
        pop     {r4 - r9, fp}
        bx      lr

  EPILOGUE