You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

dtrmm_kernel_4x2_vfp.S 19 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/28 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. **************************************************************************************/
  34. #define ASSEMBLER
  35. #include "common.h"
  /* Bytes reserved below the pushed registers. The prologue leaves sp at fp - 24 - STACKSIZE = fp - 276, which is the address of the lowest slot below (ALPHA). */
  36. #define STACKSIZE 252
  /* Incoming register arguments; the prologue spills them to the frame before r0-r3 are reused as I, J, L and scratch. */
  37. #define OLD_M r0
  38. #define OLD_N r1
  39. #define OLD_K r2
  40. #define OLD_A r3
  41. #define OLD_ALPHA d0
  42. /******************************************************
  43. * [fp, #-128] - [fp, #-64] is reserved
  44. * for store and restore of floating point
  45. * registers
  46. *******************************************************/
  /* Frame slots addressed relative to fp. KK is seeded from OFFSET and advanced by the tile loops; KKK holds the K-loop trip count computed for the current tile. */
  47. #define KK [fp, #-240 ]
  48. #define KKK [fp, #-244]
  49. #define C [fp, #-248 ]
  50. #define LDC [fp, #-252 ]
  51. #define M [fp, #-256 ]
  52. #define N [fp, #-260 ]
  53. #define K [fp, #-264 ]
  54. #define A [fp, #-268 ]
  /* Two adjacent words that the prologue fills with 0; read back with fldd as the double 0.0. FP_ZERO and FP_ZERO_0 name the same word. */
  55. #define FP_ZERO [fp, #-232]
  56. #define FP_ZERO_0 [fp, #-232]
  57. #define FP_ZERO_1 [fp, #-228]
  58. #define ALPHA [fp, #-276 ]
  /* Stack-passed arguments; they sit just above the seven words pushed by the prologue. */
  59. #define B [fp, #4 ]
  60. #define OLD_C [fp, #8 ]
  61. #define OLD_LDC [fp, #12 ]
  62. #define OFFSET [fp, #16 ]
  /* Register roles in the kernel body. I/J/L are loop counters (row tiles, column pairs, K steps); AO/BO walk the A and B panels; CO1/CO2 point at the output column(s); K1 keeps the trip count; BC is the base of the current B panel. */
  63. #define I r0
  64. #define J r1
  65. #define L r2
  66. #define AO r5
  67. #define BO r6
  68. #define CO1 r8
  69. #define CO2 r9
  70. #define K1 r7
  71. #define BC r12
  /* Prefetch distances in bytes for pld. C_PRE is not referenced in the code visible in this listing. */
  72. #define A_PRE 64
  73. #define B_PRE 64
  74. #define C_PRE 64
  75. /**************************************************************************************
  76. * Macro definitions
  77. **************************************************************************************/
  /* INIT4x2: clear the eight accumulators d8-d15 of the 4x2 tile. */
  78. .macro INIT4x2
  79. fldd d8 , FP_ZERO // d8 = contents of the FP_ZERO frame slot (zeroed by the prologue)
  80. vmov.f64 d9, d8 // copy the zero into the remaining accumulators
  81. vmov.f64 d10, d8
  82. vmov.f64 d11, d8
  83. vmov.f64 d12, d8
  84. vmov.f64 d13, d8
  85. vmov.f64 d14, d8
  86. vmov.f64 d15, d8
  87. .endm
  /*
   * KERNEL4x2_SUB: one K step of the 4x2 tile.
   * Reads 4 doubles at AO (d0-d3) and 2 doubles at BO (d4, d5), then
   * d8..d11 += d0..d3 * d4 and d12..d15 += d0..d3 * d5.
   * Advances AO by 32 bytes and BO by 16 bytes. Clobbers d0-d5.
   * Loads are interleaved with the multiply-accumulates; keep the order.
   */
  88. .macro KERNEL4x2_SUB
  89. fldd d4 , [ BO ]
  90. fldd d0 , [ AO ]
  91. fldd d1 , [ AO, #8 ]
  92. pld [ AO , #A_PRE ] // prefetch A_PRE bytes ahead in A
  93. fmacd d8 , d0, d4
  94. fldd d2 , [ AO, #16 ]
  95. fmacd d9 , d1, d4
  96. fldd d3 , [ AO, #24 ]
  97. fmacd d10 , d2, d4
  98. fldd d5 , [ BO, #8 ]
  99. fmacd d11 , d3, d4
  100. fmacd d12 , d0, d5
  101. fmacd d13 , d1, d5
  102. add AO , AO, #32 // 4 doubles consumed from A
  103. fmacd d14 , d2, d5
  104. add BO , BO, #16 // 2 doubles consumed from B
  105. fmacd d15 , d3, d5
  106. .endm
  /*
   * SAVE4x2: scale the 4x2 accumulators by ALPHA and store them.
   * d8-d11 go to [CO1 .. CO1+24], d12-d15 to [CO2 .. CO2+24] with CO2 = CO1 + LDC.
   * The destination is overwritten (it is not read first). CO1 advances by 32 bytes.
   * Clobbers r3, CO2, d0, d4-d7.
   */
  107. .macro SAVE4x2
  108. ldr r3 , LDC // LDC slot holds the byte stride written by the prologue
  109. add CO2 , CO1, r3 // CO2 = second output column
  110. fldd d0, ALPHA
  111. fmuld d4 , d0 , d8
  112. fmuld d5 , d0 , d9
  113. fmuld d6 , d0 , d10
  114. fmuld d7 , d0 , d11
  115. fstd d4 , [CO1]
  116. fstd d5 , [CO1, #8 ]
  117. fstd d6 , [CO1, #16 ]
  118. fstd d7 , [CO1, #24 ]
  119. fmuld d4 , d0 , d12
  120. fmuld d5 , d0 , d13
  121. fmuld d6 , d0 , d14
  122. fmuld d7 , d0 , d15
  123. fstd d4 , [CO2]
  124. fstd d5 , [CO2, #8 ]
  125. fstd d6 , [CO2, #16 ]
  126. fstd d7 , [CO2, #24 ]
  127. add CO1, CO1, #32 // next tile starts 4 doubles further down the column
  128. .endm
  129. /******************************************************************************/
  /* INIT2x2: clear the four accumulators d8, d9, d12, d13 of the 2x2 tile. */
  130. .macro INIT2x2
  131. fldd d8 , FP_ZERO // d8 = contents of the zeroed FP_ZERO frame slot
  132. vmov.f64 d9, d8
  133. vmov.f64 d12, d8
  134. vmov.f64 d13, d8
  135. .endm
  /*
   * KERNEL2x2_SUB: one K step of the 2x2 tile.
   * Reads 2 doubles at BO (d4, d5) and 2 doubles at AO (d0, d1), then
   * d8 += d0*d4, d9 += d1*d4, d12 += d0*d5, d13 += d1*d5.
   * Advances AO and BO by 16 bytes each. Clobbers d0, d1, d4, d5.
   */
  136. .macro KERNEL2x2_SUB
  137. fldd d4 , [ BO ]
  138. fldd d5 , [ BO, #8 ]
  139. fldd d0 , [ AO ]
  140. fldd d1 , [ AO, #8 ]
  141. fmacd d8 , d0, d4
  142. fmacd d9 , d1, d4
  143. fmacd d12 , d0, d5
  144. fmacd d13 , d1, d5
  145. add AO , AO, #16
  146. add BO , BO, #16
  147. .endm
  /*
   * SAVE2x2: scale the 2x2 accumulators by ALPHA and store them.
   * d8, d9 go to [CO1], [CO1+8]; d12, d13 go to [CO2], [CO2+8] with CO2 = CO1 + LDC.
   * The destination is overwritten. CO1 advances by 16 bytes.
   * Clobbers r3, CO2, d0, d4, d5.
   */
  148. .macro SAVE2x2
  149. ldr r3 , LDC
  150. add CO2 , CO1, r3 // CO2 = second output column
  151. fldd d0, ALPHA
  152. fmuld d4 , d0 , d8
  153. fmuld d5 , d0 , d9
  154. fstd d4 , [CO1]
  155. fstd d5 , [CO1, #8 ]
  156. fmuld d4 , d0 , d12
  157. fmuld d5 , d0 , d13
  158. fstd d4 , [CO2]
  159. fstd d5 , [CO2, #8 ]
  160. add CO1, CO1, #16
  161. .endm
  162. /******************************************************************************/
  /* INIT1x2: clear the two accumulators d8 and d12 of the 1x2 tile. */
  163. .macro INIT1x2
  164. fldd d8 , FP_ZERO // d8 = contents of the zeroed FP_ZERO frame slot
  165. vmov.f64 d12, d8
  166. .endm
  /*
   * KERNEL1x2_SUB: one K step of the 1x2 tile.
   * Reads 2 doubles at BO (d4, d5) and 1 double at AO (d0), then
   * d8 += d0*d4 and d12 += d0*d5.
   * Advances AO by 8 bytes and BO by 16 bytes. Clobbers d0, d4, d5.
   */
  167. .macro KERNEL1x2_SUB
  168. fldd d4 , [ BO ]
  169. fldd d5 , [ BO, #8 ]
  170. fldd d0 , [ AO ]
  171. fmacd d8 , d0, d4
  172. fmacd d12 , d0, d5
  173. add AO , AO, #8
  174. add BO , BO, #16
  175. .endm
  /*
   * SAVE1x2: scale the 1x2 accumulators by ALPHA and store them.
   * d8 goes to [CO1], d12 to [CO2] with CO2 = CO1 + LDC.
   * The destination is overwritten. CO1 advances by 8 bytes.
   * Clobbers r3, CO2, d0, d4.
   */
  176. .macro SAVE1x2
  177. ldr r3 , LDC
  178. add CO2 , CO1, r3 // CO2 = second output column
  179. fldd d0, ALPHA
  180. fmuld d4 , d0 , d8
  181. fstd d4 , [CO1]
  182. fmuld d4 , d0 , d12
  183. fstd d4 , [CO2]
  184. add CO1, CO1, #8
  185. .endm
  186. /******************************************************************************/
  /* INIT4x1: clear the four accumulators d8-d11 of the 4x1 tile. */
  187. .macro INIT4x1
  188. fldd d8 , FP_ZERO // d8 = contents of the zeroed FP_ZERO frame slot
  189. vmov.f64 d9, d8
  190. vmov.f64 d10, d8
  191. vmov.f64 d11, d8
  192. .endm
  /*
   * KERNEL4x1_SUB: one K step of the 4x1 tile.
   * Reads 1 double at BO (d4) and 4 doubles at AO (d0-d3), then
   * d8..d11 += d0..d3 * d4.
   * Advances AO by 32 bytes and BO by 8 bytes. Clobbers d0-d4.
   */
  193. .macro KERNEL4x1_SUB
  194. fldd d4 , [ BO ]
  195. fldd d0 , [ AO ]
  196. fldd d1 , [ AO, #8 ]
  197. fldd d2 , [ AO, #16 ]
  198. fldd d3 , [ AO, #24 ]
  199. fmacd d8 , d0, d4
  200. fmacd d9 , d1, d4
  201. fmacd d10 , d2, d4
  202. fmacd d11 , d3, d4
  203. add AO , AO, #32
  204. add BO , BO, #8
  205. .endm
  /*
   * SAVE4x1: scale the 4x1 accumulators d8-d11 by ALPHA and store them
   * to [CO1 .. CO1+24]. The destination is overwritten. CO1 advances by 32 bytes.
   * Clobbers d0, d4-d7.
   */
  206. .macro SAVE4x1
  207. fldd d0, ALPHA
  208. fmuld d4 , d0 , d8
  209. fmuld d5 , d0 , d9
  210. fmuld d6 , d0 , d10
  211. fmuld d7 , d0 , d11
  212. fstd d4 , [CO1]
  213. fstd d5 , [CO1, #8 ]
  214. fstd d6 , [CO1, #16 ]
  215. fstd d7 , [CO1, #24 ]
  216. add CO1, CO1, #32
  217. .endm
  218. /******************************************************************************/
  /* INIT2x1: clear the two accumulators d8 and d9 of the 2x1 tile. */
  219. .macro INIT2x1
  220. fldd d8 , FP_ZERO // d8 = contents of the zeroed FP_ZERO frame slot
  221. vmov.f64 d9 , d8
  222. .endm
  /*
   * KERNEL2x1_SUB: one K step of the 2x1 tile.
   * Reads 1 double at BO (d4) and 2 doubles at AO (d0, d1), then
   * d8 += d0*d4 and d9 += d1*d4.
   * Advances AO by 16 bytes and BO by 8 bytes. Clobbers d0, d1, d4.
   */
  223. .macro KERNEL2x1_SUB
  224. fldd d4 , [ BO ]
  225. fldd d0 , [ AO ]
  226. fldd d1 , [ AO, #8 ]
  227. fmacd d8 , d0, d4
  228. fmacd d9 , d1, d4
  229. add AO , AO, #16
  230. add BO , BO, #8
  231. .endm
  /*
   * SAVE2x1: scale the 2x1 accumulators d8, d9 by ALPHA and store them
   * to [CO1], [CO1+8]. The destination is overwritten. CO1 advances by 16 bytes.
   * Clobbers d0, d4, d5.
   */
  232. .macro SAVE2x1
  233. fldd d0, ALPHA
  234. fmuld d4 , d0 , d8
  235. fmuld d5 , d0 , d9
  236. fstd d4 , [CO1]
  237. fstd d5 , [CO1, #8 ]
  238. add CO1, CO1, #16
  239. .endm
  240. /******************************************************************************/
  /* INIT1x1: clear the single accumulator d8 of the 1x1 tile. */
  241. .macro INIT1x1
  242. fldd d8 , FP_ZERO // d8 = contents of the zeroed FP_ZERO frame slot
  243. .endm
  /*
   * KERNEL1x1_SUB: one K step of the 1x1 tile.
   * Reads 1 double at BO (d4) and 1 double at AO (d0), then d8 += d0*d4.
   * Advances AO and BO by 8 bytes each. Clobbers d0, d4.
   */
  244. .macro KERNEL1x1_SUB
  245. fldd d4 , [ BO ]
  246. fldd d0 , [ AO ]
  247. fmacd d8 , d0, d4
  248. add AO , AO, #8
  249. add BO , BO, #8
  250. .endm
  /*
   * SAVE1x1: scale the accumulator d8 by ALPHA and store it to [CO1].
   * The destination is overwritten. CO1 advances by 8 bytes. Clobbers d0, d4.
   */
  251. .macro SAVE1x1
  252. fldd d0, ALPHA
  253. fmuld d4 , d0 , d8
  254. fstd d4 , [CO1]
  255. add CO1, CO1, #8
  256. .endm
  257. /**************************************************************************************
  258. * End of macro definitions
  259. **************************************************************************************/
  /*
   * Kernel entry: build the frame, spill the arguments and set up loop state.
   * In:  r0 = M, r1 = N, r2 = K, r3 = A, d0 = alpha,
   *      stack: B, C, ldc (in elements), offset at [fp, #4] .. [fp, #16].
   * Saves r4-r9, fp and d8-d15; all of them are restored at _L999.
   * Falls through to _L2_BEGIN when N / 2 > 0, otherwise branches to _L1_BEGIN.
   */
  260. PROLOGUE
  261. .align 5
  262. push {r4 - r9, fp} // 7 words
  263. add fp, sp, #24 // fp = address of the saved fp word, so the first stack argument is [fp, #4]
  264. sub sp, sp, #STACKSIZE // reserve stack
  265. str OLD_M, M
  266. str OLD_N, N
  267. str OLD_K, K
  268. str OLD_A, A
  269. vstr OLD_ALPHA, ALPHA
  270. sub r3, fp, #128
  271. vstm r3, { d8 - d15} // store floating point registers
  272. movs r4, #0
  273. str r4, FP_ZERO // two zero words = the double 0.0 loaded by the INIT macros
  274. str r4, FP_ZERO_1
  275. ldr r3, OLD_LDC
  276. lsl r3, r3, #3 // ldc = ldc * 8
  277. str r3, LDC
  278. ldr r3, OLD_C
  279. str r3, C
  280. ldr BC, B
  281. ldr r3, OFFSET
  // KK starts at offset when LEFT is defined, at -offset otherwise.
  282. #ifndef LEFT
  283. neg r3 , r3
  284. #endif
  285. str r3 , KK
  286. ldr J, N
  287. asrs J, J, #1 // J = J / 2
  // asrs updates N, Z and C but not V, and ble tests Z | (N != V). The flags are
  // undefined on entry, so a stale V=1 would make the branch below always taken.
  // cmp against 0 defines N, Z and V (V = 0) before the signed test.
  cmp J, #0
  288. ble _L1_BEGIN
  // Start of one pass over a pair of output columns (repeated J times).
  289. _L2_BEGIN:
  290. ldr CO1, C // CO1 = C
  291. ldr r4 , LDC
  292. lsl r4 , r4 , #1 // LDC * 2
  293. add r3 , r4, CO1
  294. str r3 , C // C slot now points two columns further, ready for the next pass
  // With LEFT, KK restarts from OFFSET for every column pair.
  295. #if defined(LEFT)
  296. ldr r3 , OFFSET
  297. str r3 , KK
  298. #endif
  299. ldr AO, A // AO = A
  // 4x2 tiles: runs M / 4 times for the current column pair.
  300. _L2_M4_BEGIN:
  301. ldr I, M
  302. asrs I, I, #2 // I = M / 4
  303. ble _L2_M2_BEGIN
  304. _L2_M4_20:
  305. INIT4x2
  // Position BO (and AO) for this tile: either start at the panel base,
  // or skip KK steps in both panels (16 bytes per step in B, 32 in A).
  306. #if (defined(LEFT) && defined(TRANSA)) || \
  307. (!defined(LEFT) && !defined(TRANSA))
  308. mov BO, BC
  309. #else
  310. mov BO, BC
  311. ldr r3 , KK
  312. lsls r4 , r3 , #4 // 2 double values
  313. add BO , BO , r4
  314. lsls r4 , r3 , #5 // 4 double values
  315. add AO , AO , r4
  316. #endif
  // Trip count L for this tile: K, K - KK, or KK + tile width, saved in KKK.
  // NOTE(review): when TRMMKERNEL is undefined KKK is not written here but is
  // still read after SAVE4x2 under the first #if condition; presumably this
  // file is only built with TRMMKERNEL defined. Confirm.
  317. #ifndef TRMMKERNEL
  318. ldr L , K
  319. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  320. ldr L , K
  321. ldr r3, KK
  322. sub L , L, r3
  323. str L , KKK
  324. #else
  325. ldr L , KK
  326. #ifdef LEFT
  327. add L , L , #4 // number of values in AO
  328. #else
  329. add L , L , #2 // number of values in BO
  330. #endif
  331. str L , KKK
  332. #endif
  333. mov K1, L // keep the full trip count for the remainder loop
  334. asrs L , K1, #3 // L = K1 / 8 (8x unrolled iterations)
  335. ble _L2_M4_40
  336. .align 5
  337. _L2_M4_22:
  338. pld [ BO , #B_PRE ]
  339. KERNEL4x2_SUB
  340. KERNEL4x2_SUB
  341. pld [ BO , #B_PRE ]
  342. KERNEL4x2_SUB
  343. KERNEL4x2_SUB
  344. pld [ BO , #B_PRE ]
  345. KERNEL4x2_SUB
  346. KERNEL4x2_SUB
  347. pld [ BO , #B_PRE ]
  348. KERNEL4x2_SUB
  349. KERNEL4x2_SUB
  350. subs L, L, #1
  351. bgt _L2_M4_22
  352. _L2_M4_40:
  353. ands L , K1, #7 // L = K1 % 8 (leftover steps)
  354. ble _L2_M4_100
  355. _L2_M4_42:
  356. KERNEL4x2_SUB
  357. subs L, L, #1
  358. bgt _L2_M4_42
  359. _L2_M4_100:
  360. SAVE4x2
  // Skip the K - KKK steps that were not executed so AO/BO end up past this tile's data.
  361. #if (defined(LEFT) && defined(TRANSA)) || \
  362. (!defined(LEFT) && !defined(TRANSA))
  363. ldr r3 , K
  364. ldr r4 , KKK
  365. sub r3 , r3 , r4
  366. lsls r4 , r3 , #4 // 2 double values
  367. add BO , BO , r4
  368. lsls r4 , r3 , #5 // 4 double values
  369. add AO , AO , r4
  370. #endif
  371. #if defined(LEFT)
  372. ldr r3 , KK
  373. add r3 , r3 , #4 // number of values in AO
  374. str r3 , KK
  375. #endif
  376. _L2_M4_END:
  377. subs I, I, #1
  378. bgt _L2_M4_20
  // 2x2 tile: executed once when bit 1 of M is set (no loop back to _L2_M2_20).
  // I holds M itself from here on, not a counter; the 1x2 code tests it again.
  // NOTE(review): tst does not update V and ble also reads V; beq would express
  // "no bits set" without depending on an earlier instruction's V.
  379. _L2_M2_BEGIN:
  380. ldr I, M
  381. tst I , #3 // any rows left after the 4-row tiles?
  382. ble _L2_END
  383. tst I, #2 // is bit 1 of M set (a 2-row tile)?
  384. ble _L2_M1_BEGIN
  385. _L2_M2_20:
  386. INIT2x2
  // Position BO/AO: panel base, or skip KK steps (16 bytes per step in both panels).
  387. #if (defined(LEFT) && defined(TRANSA)) || \
  388. (!defined(LEFT) && !defined(TRANSA))
  389. mov BO, BC
  390. #else
  391. mov BO, BC
  392. ldr r3 , KK
  393. lsls r4 , r3 , #4 // 2 double values
  394. add BO , BO , r4
  395. lsls r4 , r3 , #4 // 2 double values
  396. add AO , AO , r4
  397. #endif
  // Trip count L for this tile: K, K - KK, or KK + 2, saved in KKK.
  398. #ifndef TRMMKERNEL
  399. ldr L , K
  400. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  401. ldr L , K
  402. ldr r3, KK
  403. sub L , L, r3
  404. str L , KKK
  405. #else
  406. ldr L , KK
  407. #ifdef LEFT
  408. add L , L , #2 // number of values in AO
  409. #else
  410. add L , L , #2 // number of values in BO
  411. #endif
  412. str L , KKK
  413. #endif
  414. mov K1, L
  415. asrs L , K1, #3 // L = K1 / 8 (8x unrolled iterations)
  416. ble _L2_M2_40
  417. _L2_M2_22:
  418. KERNEL2x2_SUB
  419. KERNEL2x2_SUB
  420. KERNEL2x2_SUB
  421. KERNEL2x2_SUB
  422. KERNEL2x2_SUB
  423. KERNEL2x2_SUB
  424. KERNEL2x2_SUB
  425. KERNEL2x2_SUB
  426. subs L, L, #1
  427. bgt _L2_M2_22
  428. _L2_M2_40:
  429. ands L , K1, #7 // L = K1 % 8 (leftover steps)
  430. ble _L2_M2_100
  431. _L2_M2_42:
  432. KERNEL2x2_SUB
  433. subs L, L, #1
  434. bgt _L2_M2_42
  435. _L2_M2_100:
  436. SAVE2x2
  // Skip the K - KKK steps that were not executed.
  437. #if (defined(LEFT) && defined(TRANSA)) || \
  438. (!defined(LEFT) && !defined(TRANSA))
  439. ldr r3 , K
  440. ldr r4 , KKK
  441. sub r3 , r3 , r4
  442. lsls r4 , r3 , #4 // 2 double values
  443. add BO , BO , r4
  444. lsls r4 , r3 , #4 // 2 double values
  445. add AO , AO , r4
  446. #endif
  447. #if defined(LEFT)
  448. ldr r3 , KK
  449. add r3 , r3 , #2 // number of values in AO
  450. str r3 , KK
  451. #endif
  452. _L2_M2_END:
  // 1x2 tile: executed once when bit 0 of M is set.
  // I still holds M as loaded at _L2_M2_BEGIN; r0 is not written in between.
  453. _L2_M1_BEGIN:
  454. tst I, #1 // is bit 0 of M set (one leftover row)?
  455. ble _L2_END
  456. _L2_M1_20:
  457. INIT1x2
  // Position BO/AO: panel base, or skip KK steps (16 bytes per step in B, 8 in A).
  458. #if (defined(LEFT) && defined(TRANSA)) || \
  459. (!defined(LEFT) && !defined(TRANSA))
  460. mov BO, BC
  461. #else
  462. mov BO, BC
  463. ldr r3 , KK
  464. lsls r4 , r3 , #4 // 2 double values
  465. add BO , BO , r4
  466. lsls r4 , r3 , #3 // 1 double value
  467. add AO , AO , r4
  468. #endif
  // Trip count L for this tile: K, K - KK, KK + 1 (LEFT) or KK + 2, saved in KKK.
  469. #ifndef TRMMKERNEL
  470. ldr L , K
  471. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  472. ldr L , K
  473. ldr r3, KK
  474. sub L , L, r3
  475. str L , KKK
  476. #else
  477. ldr L , KK
  478. #ifdef LEFT
  479. add L , L , #1 // number of values in AO
  480. #else
  481. add L , L , #2 // number of values in BO
  482. #endif
  483. str L , KKK
  484. #endif
  485. mov K1, L
  486. asrs L , K1, #3 // L = K1 / 8 (8x unrolled iterations)
  487. ble _L2_M1_40
  488. _L2_M1_22:
  489. KERNEL1x2_SUB
  490. KERNEL1x2_SUB
  491. KERNEL1x2_SUB
  492. KERNEL1x2_SUB
  493. KERNEL1x2_SUB
  494. KERNEL1x2_SUB
  495. KERNEL1x2_SUB
  496. KERNEL1x2_SUB
  497. subs L, L, #1
  498. bgt _L2_M1_22
  499. _L2_M1_40:
  500. ands L , K1, #7 // L = K1 % 8 (leftover steps)
  501. ble _L2_M1_100
  502. _L2_M1_42:
  503. KERNEL1x2_SUB
  504. subs L, L, #1
  505. bgt _L2_M1_42
  506. _L2_M1_100:
  507. SAVE1x2
  // Skip the K - KKK steps that were not executed.
  508. #if (defined(LEFT) && defined(TRANSA)) || \
  509. (!defined(LEFT) && !defined(TRANSA))
  510. ldr r3 , K
  511. ldr r4 , KKK
  512. sub r3 , r3 , r4
  513. lsls r4 , r3 , #4 // 2 double values
  514. add BO , BO , r4
  515. lsls r4 , r3 , #3 // 1 double value
  516. add AO , AO , r4
  517. #endif
  518. #if defined(LEFT)
  519. ldr r3 , KK
  520. add r3 , r3 , #1 // number of values in AO
  521. str r3 , KK
  522. #endif
  // End of one column pair: advance the B panel base by K steps of 2 doubles and loop.
  523. _L2_END:
  524. mov r3, BC
  525. ldr r4, K
  526. lsl r4, r4, #4 // k * 2 * 8
  527. add r3, r3, r4 // B = B + K * 2 * 8
  528. mov BC, r3
  // Without LEFT, KK advances by the two columns just processed.
  529. #if !defined(LEFT)
  530. ldr r3 , KK
  531. add r3 , r3 , #2 // number of values in BO
  532. str r3 , KK
  533. #endif
  534. subs J , #1 // j--
  535. bgt _L2_BEGIN
  536. /*********************************************************************************************/
  // Last single column: processed only when bit 0 of N is set.
  // NOTE(review): tst does not update V and ble also reads V; beq would express
  // "bit clear" without depending on an earlier instruction's V.
  537. _L1_BEGIN:
  538. ldr J , N
  539. tst J , #1 // is N odd?
  540. ble _L999
  541. ldr CO1, C // CO1 = C
  542. ldr r4 , LDC
  543. add r3 , r4, CO1
  544. str r3 , C // C slot advanced by one column
  // With LEFT, KK restarts from OFFSET for this column.
  545. #if defined(LEFT)
  546. ldr r3 , OFFSET
  547. str r3 , KK
  548. #endif
  549. ldr AO, A // AO = A
  550. //pld [AO , #A_PRE-96]
  551. //pld [AO , #A_PRE-64]
  552. //pld [AO , #A_PRE-32]
  // 4x1 tiles: runs M / 4 times for the single remaining column.
  553. _L1_M4_BEGIN:
  554. ldr I, M
  555. asrs I, I, #2 // I = M / 4
  556. ble _L1_M2_BEGIN
  557. _L1_M4_20:
  558. INIT4x1
  // Position BO/AO: panel base, or skip KK steps (8 bytes per step in B, 32 in A).
  559. #if (defined(LEFT) && defined(TRANSA)) || \
  560. (!defined(LEFT) && !defined(TRANSA))
  561. mov BO, BC
  562. #else
  563. mov BO, BC
  564. ldr r3 , KK
  565. lsls r4 , r3 , #3 // 1 double value
  566. add BO , BO , r4
  567. lsls r4 , r3 , #5 // 4 double values
  568. add AO , AO , r4
  569. #endif
  // Trip count L for this tile: K, K - KK, KK + 4 (LEFT) or KK + 1, saved in KKK.
  570. #ifndef TRMMKERNEL
  571. ldr L , K
  572. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  573. ldr L , K
  574. ldr r3, KK
  575. sub L , L, r3
  576. str L , KKK
  577. #else
  578. ldr L , KK
  579. #ifdef LEFT
  580. add L , L , #4 // number of values in AO
  581. #else
  582. add L , L , #1 // number of values in BO
  583. #endif
  584. str L , KKK
  585. #endif
  586. mov K1, L
  587. asrs L , K1, #3 // L = K1 / 8 (8x unrolled iterations)
  588. ble _L1_M4_40
  589. .align 5
  590. _L1_M4_22:
  591. KERNEL4x1_SUB
  592. KERNEL4x1_SUB
  593. KERNEL4x1_SUB
  594. KERNEL4x1_SUB
  595. KERNEL4x1_SUB
  596. KERNEL4x1_SUB
  597. KERNEL4x1_SUB
  598. KERNEL4x1_SUB
  599. subs L, L, #1
  600. bgt _L1_M4_22
  601. _L1_M4_40:
  602. ands L , K1, #7 // L = K1 % 8 (leftover steps)
  603. ble _L1_M4_100
  604. _L1_M4_42:
  605. KERNEL4x1_SUB
  606. subs L, L, #1
  607. bgt _L1_M4_42
  608. _L1_M4_100:
  609. SAVE4x1
  // Skip the K - KKK steps that were not executed.
  610. #if (defined(LEFT) && defined(TRANSA)) || \
  611. (!defined(LEFT) && !defined(TRANSA))
  612. ldr r3 , K
  613. ldr r4 , KKK
  614. sub r3 , r3 , r4
  615. lsls r4 , r3 , #3 // 1 double value
  616. add BO , BO , r4
  617. lsls r4 , r3 , #5 // 4 double values
  618. add AO , AO , r4
  619. #endif
  620. #if defined(LEFT)
  621. ldr r3 , KK
  622. add r3 , r3 , #4 // number of values in AO
  623. str r3 , KK
  624. #endif
  625. _L1_M4_END:
  626. subs I, I, #1
  627. bgt _L1_M4_20
  // 2x1 tile: executed once when bit 1 of M is set (no loop back to _L1_M2_20).
  // I holds M itself from here on, not a counter; the 1x1 code tests it again.
  628. _L1_M2_BEGIN:
  629. ldr I, M
  630. tst I , #3 // any rows left after the 4-row tiles?
  631. ble _L1_END
  632. tst I, #2 // is bit 1 of M set (a 2-row tile)?
  633. ble _L1_M1_BEGIN
  634. _L1_M2_20:
  635. INIT2x1
  // Position BO/AO: panel base, or skip KK steps (8 bytes per step in B, 16 in A).
  636. #if (defined(LEFT) && defined(TRANSA)) || \
  637. (!defined(LEFT) && !defined(TRANSA))
  638. mov BO, BC
  639. #else
  640. mov BO, BC
  641. ldr r3 , KK
  642. lsls r4 , r3 , #3 // 1 double value
  643. add BO , BO , r4
  644. lsls r4 , r3 , #4 // 2 double values
  645. add AO , AO , r4
  646. #endif
  // Trip count L for this tile: K, K - KK, KK + 2 (LEFT) or KK + 1, saved in KKK.
  647. #ifndef TRMMKERNEL
  648. ldr L , K
  649. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  650. ldr L , K
  651. ldr r3, KK
  652. sub L , L, r3
  653. str L , KKK
  654. #else
  655. ldr L , KK
  656. #ifdef LEFT
  657. add L , L , #2 // number of values in AO
  658. #else
  659. add L , L , #1 // number of values in BO
  660. #endif
  661. str L , KKK
  662. #endif
  663. mov K1, L
  664. asrs L , K1, #3 // L = K1 / 8 (8x unrolled iterations)
  665. ble _L1_M2_40
  666. _L1_M2_22:
  667. KERNEL2x1_SUB
  668. KERNEL2x1_SUB
  669. KERNEL2x1_SUB
  670. KERNEL2x1_SUB
  671. KERNEL2x1_SUB
  672. KERNEL2x1_SUB
  673. KERNEL2x1_SUB
  674. KERNEL2x1_SUB
  675. subs L, L, #1
  676. bgt _L1_M2_22
  677. _L1_M2_40:
  678. ands L , K1, #7 // L = K1 % 8 (leftover steps)
  679. ble _L1_M2_100
  680. _L1_M2_42:
  681. KERNEL2x1_SUB
  682. subs L, L, #1
  683. bgt _L1_M2_42
  684. _L1_M2_100:
  685. SAVE2x1
  // Skip the K - KKK steps that were not executed.
  686. #if (defined(LEFT) && defined(TRANSA)) || \
  687. (!defined(LEFT) && !defined(TRANSA))
  688. ldr r3 , K
  689. ldr r4 , KKK
  690. sub r3 , r3 , r4
  691. lsls r4 , r3 , #3 // 1 double value
  692. add BO , BO , r4
  693. lsls r4 , r3 , #4 // 2 double values
  694. add AO , AO , r4
  695. #endif
  696. #if defined(LEFT)
  697. ldr r3 , KK
  698. add r3 , r3 , #2 // number of values in AO
  699. str r3 , KK
  700. #endif
  701. _L1_M2_END:
  // 1x1 tile: executed once when bit 0 of M is set. This is the last tile, so
  // unlike the other tiles there is no AO/BO/KK adjustment after the SAVE.
  // I still holds M as loaded at _L1_M2_BEGIN; r0 is not written in between.
  702. _L1_M1_BEGIN:
  703. tst I, #1 // is bit 0 of M set (one leftover row)?
  704. ble _L1_END
  705. _L1_M1_20:
  706. INIT1x1
  // Position BO/AO: panel base, or skip KK steps (8 bytes per step in both panels).
  707. #if (defined(LEFT) && defined(TRANSA)) || \
  708. (!defined(LEFT) && !defined(TRANSA))
  709. mov BO, BC
  710. #else
  711. mov BO, BC
  712. ldr r3 , KK
  713. lsls r4 , r3 , #3 // 1 double value
  714. add BO , BO , r4
  715. lsls r4 , r3 , #3 // 1 double value
  716. add AO , AO , r4
  717. #endif
  // Trip count L for this tile: K, K - KK, or KK + 1, saved in KKK.
  718. #ifndef TRMMKERNEL
  719. ldr L , K
  720. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  721. ldr L , K
  722. ldr r3, KK
  723. sub L , L, r3
  724. str L , KKK
  725. #else
  726. ldr L , KK
  727. #ifdef LEFT
  728. add L , L , #1 // number of values in AO
  729. #else
  730. add L , L , #1 // number of values in BO
  731. #endif
  732. str L , KKK
  733. #endif
  734. mov K1, L
  735. asrs L , K1, #3 // L = K1 / 8 (8x unrolled iterations)
  736. ble _L1_M1_40
  737. _L1_M1_22:
  738. KERNEL1x1_SUB
  739. KERNEL1x1_SUB
  740. KERNEL1x1_SUB
  741. KERNEL1x1_SUB
  742. KERNEL1x1_SUB
  743. KERNEL1x1_SUB
  744. KERNEL1x1_SUB
  745. KERNEL1x1_SUB
  746. subs L, L, #1
  747. bgt _L1_M1_22
  748. _L1_M1_40:
  749. ands L , K1, #7 // L = K1 % 8 (leftover steps)
  750. ble _L1_M1_100
  751. _L1_M1_42:
  752. KERNEL1x1_SUB
  753. subs L, L, #1
  754. bgt _L1_M1_42
  755. _L1_M1_100:
  756. SAVE1x1
  // Common exit: restore d8-d15 from the frame, return 0 in r0, unwind the stack.
  757. _L1_END:
  758. _L999:
  759. sub r3, fp, #128 // same address the prologue used for vstm
  760. vldm r3, { d8 - d15} // restore floating point registers
  761. movs r0, #0 // set return value
  762. sub sp, fp, #24 // sp back to where it was right after the prologue's push
  763. pop {r4 - r9, fp}
  764. bx lr
  765. EPILOGUE