You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

trmm8x4V.S 22 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874
  1. /***************************************************************************
  2. Copyright (c) 2013-2017, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2017/01/01 AbdelRauf (quickwritereader@gmail.com)
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. **************************************************************************************/
  33. /*********************************************************************/
  34. /* Copyright 2009, 2010 The University of Texas at Austin. */
  35. /* All rights reserved. */
  36. /* */
  37. /* Redistribution and use in source and binary forms, with or */
  38. /* without modification, are permitted provided that the following */
  39. /* conditions are met: */
  40. /* */
  41. /* 1. Redistributions of source code must retain the above */
  42. /* copyright notice, this list of conditions and the following */
  43. /* disclaimer. */
  44. /* */
  45. /* 2. Redistributions in binary form must reproduce the above */
  46. /* copyright notice, this list of conditions and the following */
  47. /* disclaimer in the documentation and/or other materials */
  48. /* provided with the distribution. */
  49. /* */
  50. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  51. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  52. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  53. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  54. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  55. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  56. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  57. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  58. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  59. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  60. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  61. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  62. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  63. /* POSSIBILITY OF SUCH DAMAGE. */
  64. /* */
  65. /* The views and conclusions contained in the software and */
  66. /* documentation are those of the authors and should not be */
  67. /* interpreted as representing official policies, either expressed */
  68. /* or implied, of The University of Texas at Austin. */
  69. /*********************************************************************/
  70. #define ASSEMBLER
  71. #include "common.h"
  72. /*
  73. #BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
  74. ##bm=r2,bn=r3, bk=r4, alpha=f0,ba=r5,bb=r6,C=stack[160] ,ldc=stack[168]
  75. offset=stack[176]
  76. **********************************************************************************************/
  77. /* Note: %r0 cannot be used as a base or index register in an address (it is read as zero there), so it is only used as a counter. */
  78. #define BM %r2 /* bm argument */
  79. #define BM_CUR %r0 /* countdown of 8-row tiles (BM >> 3) */
  80. #define BN %r3 /* bn argument */
  81. #define BN_CUR %r10 /* countdown of 4-column panels (BN >> 2) */
  82. #define BK %r4 /* bk argument */
  83. #define LDC_BYTE %r8 /* ldc scaled to bytes (ldc << 3) */
  84. #define ALPHA %f0 /* alpha as passed in */
  85. #define ALPHA_VECT %v0 /* alpha replicated with vrepg in the prologue */
  86. #define LOCAL_VAR1 %r9 /* scratch: k-loop trip counts and byte strides */
  87. #define LOCAL_VAR2 %r1 /* scratch: B pointer handed to the CALC macros */
  88. #define LOCAL_VAR3 %r11 /* scratch: A pointer handed to the CALC macros */
  89. #define A %r5 /* ba argument */
  90. #define B %r6 /* bb argument; advanced after each column panel */
  91. #define CIJ %r7 /* C pointer (loaded from 160(%r15)); advanced after each column panel */
  92. #define CIJ_LOCAL %r12 /* C pointer handed to the STORE macros */
  93. #define OFF %r13 /* TRMM only: working offset */
  94. #define OFFSET %f8 /* TRMM only: FPR that keeps the original offset argument */
  95. #define ALIGN_4 .align 16 /* 16-byte alignment */
  96. #define ALIGN_2 .align 8 /* 8-byte alignment */
  97. #define PREFETCH_INS 1 /* when defined, the pfd prefetch instructions are assembled in */
  98. /**************************Include kernel helper macros**********************************/
  99. #include "kernelMacros.S"
  100. #if defined (TRMMKERNEL) /* TRMM build: map the generic STORE_MxN names onto the STORE_TRMM_MxN macros */
  101. #define STORE_8x4 STORE_TRMM_8x4
  102. #define STORE_4x4 STORE_TRMM_4x4
  103. #define STORE_2x4 STORE_TRMM_2x4
  104. #define STORE_1x4 STORE_TRMM_1x4
  105. #define STORE_8x2 STORE_TRMM_8x2
  106. #define STORE_4x2 STORE_TRMM_4x2
  107. #define STORE_2x2 STORE_TRMM_2x2
  108. #define STORE_1x2 STORE_TRMM_1x2
  109. #define STORE_8x1 STORE_TRMM_8x1
  110. #define STORE_4x1 STORE_TRMM_4x1
  111. #define STORE_2x1 STORE_TRMM_2x1
  112. #define STORE_1x1 STORE_TRMM_1x1
  113. #endif /* TRMMKERNEL */
  114. /***********************************DGEMM / DTRMM (TRMMKERNEL) 8x4 kernel***********************************************************/
  115. PROLOGUE
  116. #if defined(TRMMKERNEL)
  117. std OFFSET,40(%r15) /* save %f8 in the stack frame before it is reused for the offset */
  118. stmg %r6,%r13,48(%r15) /* save %r6-%r13 (the TRMM build also uses %r13 as OFF) */
  119. #else
  120. stmg %r6,%r12,48(%r15) /* save %r6-%r12 */
  121. #endif
  122. lg CIJ, 160(%r15) /* CIJ = C pointer, passed on the stack */
  123. lg LOCAL_VAR1, 168(%r15) /* LOCAL_VAR1 = ldc, passed on the stack */
  124. #if defined(TRMMKERNEL)
  125. lg OFF,176(%r15) /* OFF = offset, passed on the stack */
  126. ldgr OFFSET ,OFF /* keep a copy of offset in %f8 */
  127. #endif
  128. srlg BN_CUR,BN,2 /* BN_CUR = BN/4: number of 4-column panels */
  129. vrepg ALPHA_VECT,ALPHA_VECT,0 /* replicate alpha (held in f0) into both doublewords of v0 */
  130. sllg LDC_BYTE, LOCAL_VAR1,3 /* LDC_BYTE = ldc * sizeof(double) = ldc << 3 */
  131. #if defined(TRMMKERNEL) && !defined(LEFT)
  132. /*off = -offset;*/
  133. lgdr LOCAL_VAR1,OFFSET /* LOCAL_VAR1 = offset copy from %f8 */
  134. lcgr OFF,LOCAL_VAR1 /* OFF = -LOCAL_VAR1 */
  135. #endif
  136. cijle BN_CUR,0,.LX2 /* no 4-column panel: continue with the 2-column section */
  137. ALIGN_4
  138. .LX4_BN: /* loop over 4-column panels; closed by brctg BN_CUR at .Lx4_INNER_END */
  139. #if defined(PREFETCH_INS)
  140. pfd 1, 0(A) /* prefetch the first bytes of A and B */
  141. pfd 1, 256(A)
  142. pfd 1, 0(B)
  143. pfd 1, 256(B)
  144. #endif
  145. #if defined(TRMMKERNEL) && defined(LEFT)
  146. /*off = offset;*/
  147. lgdr OFF,OFFSET
  148. #endif
  149. srlg BM_CUR,BM,3 /* BM_CUR = BM/8: number of 8-row tiles */
  150. lgr LOCAL_VAR3,A /* LOCAL_VAR3 = A: restart from the beginning of A for this panel */
  151. lgr CIJ_LOCAL,CIJ /* CIJ_LOCAL = C position of this panel */
  152. cijle BM_CUR,0,.L4x4 /* fewer than 8 rows: skip the 8x4 loop */
  153. ALIGN_4
  154. .L8x4_BM: /* loop over the 8-row tiles of the current 4-column panel (BM_CUR iterations) */
  155. #if defined(TRMMKERNEL)
  156. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  157. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,8,4
  158. RefreshTempBk LOCAL_VAR1,BK,OFF,8,4 /* NOTE(review): macro from kernelMacros.S; presumably leaves the k count for this tile in LOCAL_VAR1 - confirm */
  159. srl LOCAL_VAR1,2 /* divide the k count by 4 (32-bit shift) */
  160. #else
  161. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK/4: trip count of the first k loop */
  162. lgr LOCAL_VAR2,B /* LOCAL_VAR2 = start of the current B panel */
  163. #endif
  164. ZERO_CVEC_8x4
  165. cijle LOCAL_VAR1,0,.L8x4_mod /* trip count 0: go to the remainder handling */
  166. ALIGN_4
  167. .L8x4_4_BK: /* first k loop: one CALC_8x4_4 per iteration */
  168. #if defined(PREFETCH_INS)
  169. pfd 1, 512(LOCAL_VAR3) /* prefetch 512 bytes past the current A pointer */
  170. #endif
  171. CALC_8x4_4 LOCAL_VAR3,LOCAL_VAR2
  172. #if defined(PREFETCH_INS)
  173. pfd 1, 512(LOCAL_VAR2) /* prefetch 512 bytes past the current B pointer */
  174. #endif
  175. brctg LOCAL_VAR1,.L8x4_4_BK /* count down and repeat */
  176. ALIGN_4
  177. .L8x4_mod: /* remainder of the k count modulo 4 */
  178. #if defined(TRMMKERNEL)
  179. RefreshTempBk LOCAL_VAR1,BK,OFF,8,4
  180. nill LOCAL_VAR1,3 /* keep k count & 3; NOTE(review): nill ANDs only the low 16 bits, the rest of the register is unchanged - confirm the k count is < 65536 */
  181. #else
  182. la LOCAL_VAR1,3(0,0) /* LOCAL_VAR1 = 3 */
  183. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3 (also sets the condition code for jz) */
  184. #endif
  185. jz .L8x4_BK_Store /* nothing left over */
  186. ALIGN_4
  187. .L8x4_BK: /* remainder k loop: one CALC_8x4 per iteration */
  188. CALC_8x4 LOCAL_VAR3,LOCAL_VAR2
  189. brctg LOCAL_VAR1,.L8x4_BK /* count down and repeat */
  190. ALIGN_4
  191. .L8x4_BK_Store:
  192. /*store C; CIJ_LOCAL and LDC_BYTE are passed for addressing*/
  193. STORE_8x4 ALPHA_VECT,CIJ_LOCAL, LDC_BYTE
  194. #if defined(TRMMKERNEL)
  195. /*RefreshPointersAndOFF TEMP_VAL,BK_VAL,OFF_VAL,L_VAR,PTR_A,C_A*/
  196. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,8,4
  197. #endif
  198. brctg BM_CUR,.L8x4_BM /* next 8-row tile */
  199. ALIGN_4
  200. .L4x4: /* 4-row tile of the 4-column panel, run when bit value 4 of BM is set */
  201. tmll BM,4 /* test BM & 4 */
  202. jz .L2x4 /* bit clear: no 4-row tile */
  203. ALIGN_4
  204. .L4x4_BM: /* single 4x4 tile (not a loop) */
  205. #if defined(TRMMKERNEL)
  206. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  207. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,4,4
  208. RefreshTempBk LOCAL_VAR1,BK,OFF,4,4
  209. srl LOCAL_VAR1,2 /* divide the k count by 4 (32-bit shift) */
  210. #else
  211. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK/4: trip count of the first k loop */
  212. lgr LOCAL_VAR2,B /* LOCAL_VAR2 = start of the current B panel */
  213. #endif
  214. ZERO_CVEC_4x4
  215. cijle LOCAL_VAR1,0,.L4x4_mod /* trip count 0: go to the remainder handling */
  216. ALIGN_4
  217. .L4x4_4_BK: /* first k loop: one CALC_4x4_4 per iteration */
  218. CALC_4x4_4 LOCAL_VAR3,LOCAL_VAR2
  219. brctg LOCAL_VAR1,.L4x4_4_BK /* count down and repeat */
  220. ALIGN_4
  221. .L4x4_mod: /* remainder of the k count modulo 4 */
  222. #if defined(TRMMKERNEL)
  223. RefreshTempBk LOCAL_VAR1,BK,OFF,4,4
  224. nill LOCAL_VAR1,3 /* keep k count & 3; NOTE(review): nill ANDs only the low 16 bits, the rest of the register is unchanged - confirm the k count is < 65536 */
  225. #else
  226. la LOCAL_VAR1,3(0,0) /* LOCAL_VAR1 = 3 */
  227. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3 (also sets the condition code for jz) */
  228. #endif
  229. jz .L4x4_BK_Store /* nothing left over */
  230. ALIGN_4
  231. .L4x4_BK: /* remainder k loop: one CALC_4x4 per iteration */
  232. CALC_4x4 LOCAL_VAR3,LOCAL_VAR2
  233. brctg LOCAL_VAR1,.L4x4_BK /* count down and repeat */
  234. ALIGN_4
  235. .L4x4_BK_Store:
  236. /*store C; CIJ_LOCAL and LDC_BYTE are passed for addressing*/
  237. STORE_4x4 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE
  238. #if defined(TRMMKERNEL)
  239. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,4,4
  240. #endif
  241. ALIGN_2
  242. .L2x4: /* 2-row tile of the 4-column panel, run when bit value 2 of BM is set */
  243. tmll BM,2 /* test BM & 2 */
  244. jz .L1x4 /* bit clear: no 2-row tile */
  245. ALIGN_4
  246. .L2x4_BM: /* single 2x4 tile (not a loop) */
  247. #if defined(TRMMKERNEL)
  248. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  249. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,2,4
  250. RefreshTempBk LOCAL_VAR1,BK,OFF,2,4
  251. srl LOCAL_VAR1,2 /* divide the k count by 4 (32-bit shift) */
  252. #else
  253. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK/4: trip count of the first k loop */
  254. lgr LOCAL_VAR2,B /* LOCAL_VAR2 = start of the current B panel */
  255. #endif
  256. ZERO_CVEC_2x4
  257. cijle LOCAL_VAR1,0,.L2x4_mod /* trip count 0: go to the remainder handling */
  258. ALIGN_4
  259. .L2x4_4_BK: /* first k loop: one CALC_2x4_4 per iteration */
  260. CALC_2x4_4 LOCAL_VAR3,LOCAL_VAR2
  261. brctg LOCAL_VAR1,.L2x4_4_BK /* count down and repeat */
  262. ALIGN_4
  263. .L2x4_mod: /* remainder of the k count modulo 4 */
  264. #if defined(TRMMKERNEL)
  265. RefreshTempBk LOCAL_VAR1,BK,OFF,2,4
  266. nill LOCAL_VAR1,3 /* keep k count & 3; NOTE(review): nill ANDs only the low 16 bits, the rest of the register is unchanged - confirm the k count is < 65536 */
  267. #else
  268. la LOCAL_VAR1,3(0,0) /* LOCAL_VAR1 = 3 */
  269. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3 (also sets the condition code for jz) */
  270. #endif
  271. jz .L2x4_BK_Store /* nothing left over */
  272. ALIGN_4
  273. .L2x4_BK: /* remainder k loop: one CALC_2x4 per iteration */
  274. CALC_2x4 LOCAL_VAR3,LOCAL_VAR2
  275. brctg LOCAL_VAR1,.L2x4_BK /* count down and repeat */
  276. ALIGN_4
  277. .L2x4_BK_Store:
  278. /*store C; CIJ_LOCAL and LDC_BYTE are passed for addressing*/
  279. STORE_2x4 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE
  280. #if defined(TRMMKERNEL)
  281. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,2,4
  282. #endif
  283. ALIGN_4
  284. .L1x4: /* 1-row tile of the 4-column panel, run when bit value 1 of BM is set */
  285. tmll BM,1 /* test BM & 1 */
  286. jz .Lx4_INNER_END /* bit clear: no 1-row tile */
  287. ALIGN_4
  288. .L1x4_BM: /* single 1x4 tile (not a loop) */
  289. #if defined(TRMMKERNEL)
  290. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  291. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,1,4
  292. RefreshTempBk LOCAL_VAR1,BK,OFF,1,4
  293. srl LOCAL_VAR1,2 /* divide the k count by 4 (32-bit shift) */
  294. #else
  295. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK/4: trip count of the first k loop */
  296. lgr LOCAL_VAR2,B /* LOCAL_VAR2 = start of the current B panel */
  297. #endif
  298. ZERO_CVEC_1x4
  299. cijle LOCAL_VAR1,0,.L1x4_mod /* trip count 0: go to the remainder handling */
  300. ALIGN_4
  301. .L1x4_4_BK: /* first k loop: one CALC_1x4_4 per iteration */
  302. CALC_1x4_4 LOCAL_VAR3,LOCAL_VAR2
  303. brctg LOCAL_VAR1,.L1x4_4_BK /* count down and repeat */
  304. ALIGN_4
  305. .L1x4_mod: /* remainder of the k count modulo 4 */
  306. #if defined(TRMMKERNEL)
  307. RefreshTempBk LOCAL_VAR1,BK,OFF,1,4
  308. nill LOCAL_VAR1,3 /* keep k count & 3; NOTE(review): nill ANDs only the low 16 bits, the rest of the register is unchanged - confirm the k count is < 65536 */
  309. #else
  310. la LOCAL_VAR1,3(0,0) /* LOCAL_VAR1 = 3 */
  311. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3 (also sets the condition code for jz) */
  312. #endif
  313. jz .L1x4_BK_Store /* nothing left over */
  314. ALIGN_4
  315. .L1x4_BK: /* remainder k loop: one CALC_1x4 per iteration */
  316. CALC_1x4 LOCAL_VAR3,LOCAL_VAR2
  317. brctg LOCAL_VAR1,.L1x4_BK /* count down and repeat */
  318. ALIGN_4
  319. .L1x4_BK_Store:
  320. /*store C; CIJ_LOCAL and LDC_BYTE are passed for addressing*/
  321. STORE_1x4 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE
  322. #if defined(TRMMKERNEL)
  323. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,1,4
  324. #endif
  325. ALIGN_2
  326. .Lx4_INNER_END:
  327. /* advance C and B past the 4-column panel just processed, then loop */
  328. sllg LOCAL_VAR1,LDC_BYTE,2 /* LOCAL_VAR1 = LDC_BYTE*4 */
  329. #if defined(TRMMKERNEL) && !defined(LEFT)
  330. aghi OFF,4 /* OFF += 4 */
  331. #endif
  332. sllg LOCAL_VAR2,BK,5 /* LOCAL_VAR2 = BK*4*sizeof(double) = BK*32 = BK << 5 */
  333. la CIJ,0(CIJ,LOCAL_VAR1) /* CIJ = CIJ + LDC_BYTE*4 */
  334. la B,0(B,LOCAL_VAR2) /* B = B + BK*4*sizeof(double) */
  335. brctg BN_CUR,.LX4_BN /* next 4-column panel */
  336. /*********************************X2 SECTION************************************************/
  337. ALIGN_4
  338. .LX2: /* 2-column panel, run when bit value 2 of BN is set */
  339. tmll BN,2 /* test BN & 2 */
  340. jz .Lx1 /* bit clear: no 2-column panel */
  341. ALIGN_4
  342. .Lx2_BN: /* single 2-column panel (not a loop) */
  343. #if defined(TRMMKERNEL) && defined(LEFT)
  344. /*off = offset;*/
  345. lgdr OFF,OFFSET
  346. #endif
  347. srlg BM_CUR,BM,3 /* BM_CUR = BM/8: number of 8-row tiles */
  348. lgr LOCAL_VAR3,A /* LOCAL_VAR3 = A: restart from the beginning of A for this panel */
  349. lgr CIJ_LOCAL,CIJ /* CIJ_LOCAL = C position of this panel */
  350. cijle BM_CUR,0,.L4x2 /* fewer than 8 rows: skip the 8x2 loop */
  351. ALIGN_4
  352. .L8x2_BM: /* loop over the 8-row tiles of the 2-column panel (BM_CUR iterations) */
  353. #if defined(TRMMKERNEL)
  354. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  355. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,8,2
  356. RefreshTempBk LOCAL_VAR1,BK,OFF,8,2
  357. srl LOCAL_VAR1,2 /* divide the k count by 4 (32-bit shift) */
  358. #else
  359. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK/4: trip count of the first k loop */
  360. lgr LOCAL_VAR2,B /* LOCAL_VAR2 = start of the current B panel */
  361. #endif
  362. ZERO_CVEC_8x2
  363. cijle LOCAL_VAR1,0,.L8x2_mod /* trip count 0: go to the remainder handling */
  364. ALIGN_4
  365. .L8x2_4_BK: /* first k loop: one CALC_8x2_4 per iteration */
  366. #if defined(PREFETCH_INS)
  367. pfd 1, 256(LOCAL_VAR3) /* prefetch 256 bytes past the current A pointer */
  368. pfd 1,64(LOCAL_VAR2) /* prefetch 64 bytes past the current B pointer */
  369. #endif
  370. CALC_8x2_4 LOCAL_VAR3,LOCAL_VAR2
  371. brctg LOCAL_VAR1,.L8x2_4_BK /* count down and repeat */
  372. ALIGN_4
  373. .L8x2_mod: /* remainder of the k count modulo 4 */
  374. #if defined(TRMMKERNEL)
  375. RefreshTempBk LOCAL_VAR1,BK,OFF,8,2
  376. nill LOCAL_VAR1,3 /* keep k count & 3; NOTE(review): nill ANDs only the low 16 bits, the rest of the register is unchanged - confirm the k count is < 65536 */
  377. #else
  378. la LOCAL_VAR1,3(0,0) /* LOCAL_VAR1 = 3 */
  379. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3 (also sets the condition code for jz) */
  380. #endif
  381. jz .L8x2_BK_Store /* nothing left over */
  382. ALIGN_4
  383. .L8x2_BK: /* remainder k loop: one CALC_8x2 per iteration */
  384. CALC_8x2 LOCAL_VAR3,LOCAL_VAR2
  385. brctg LOCAL_VAR1,.L8x2_BK /* count down and repeat */
  386. ALIGN_4
  387. .L8x2_BK_Store:
  388. /*store C; CIJ_LOCAL and LDC_BYTE are passed for addressing*/
  389. STORE_8x2 ALPHA_VECT,CIJ_LOCAL, LDC_BYTE
  390. #if defined(TRMMKERNEL)
  391. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,8,2
  392. #endif
  393. ALIGN_4
  394. brctg BM_CUR,.L8x2_BM /* next 8-row tile */
  395. ALIGN_2
  396. .L4x2: /* 4-row tile of the 2-column panel, run when bit value 4 of BM is set */
  397. tmll BM,4 /* test BM & 4 */
  398. jz .L2x2 /* bit clear: no 4-row tile */
  399. ALIGN_4
  400. .L4x2_BM: /* single 4x2 tile (not a loop) */
  401. #if defined(TRMMKERNEL)
  402. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  403. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,4,2
  404. RefreshTempBk LOCAL_VAR1,BK,OFF,4,2
  405. srl LOCAL_VAR1,2 /* divide the k count by 4 (32-bit shift) */
  406. #else
  407. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK/4: trip count of the first k loop */
  408. lgr LOCAL_VAR2,B /* LOCAL_VAR2 = start of the current B panel */
  409. #endif
  410. ZERO_CVEC_4x2
  411. cijle LOCAL_VAR1,0,.L4x2_mod /* trip count 0: go to the remainder handling */
  412. ALIGN_4
  413. .L4x2_4_BK: /* first k loop: one CALC_4x2_4 per iteration */
  414. CALC_4x2_4 LOCAL_VAR3,LOCAL_VAR2
  415. brctg LOCAL_VAR1,.L4x2_4_BK /* count down and repeat */
  416. ALIGN_4
  417. .L4x2_mod: /* remainder of the k count modulo 4 */
  418. #if defined(TRMMKERNEL)
  419. RefreshTempBk LOCAL_VAR1,BK,OFF,4,2
  420. nill LOCAL_VAR1,3 /* keep k count & 3; NOTE(review): nill ANDs only the low 16 bits, the rest of the register is unchanged - confirm the k count is < 65536 */
  421. #else
  422. la LOCAL_VAR1,3(0,0) /* LOCAL_VAR1 = 3 */
  423. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3 (also sets the condition code for jz) */
  424. #endif
  425. jz .L4x2_BK_Store /* nothing left over */
  426. ALIGN_4
  427. .L4x2_BK: /* remainder k loop: one CALC_4x2 per iteration */
  428. CALC_4x2 LOCAL_VAR3,LOCAL_VAR2
  429. brctg LOCAL_VAR1,.L4x2_BK /* count down and repeat */
  430. ALIGN_4
  431. .L4x2_BK_Store:
  432. /*store C; CIJ_LOCAL and LDC_BYTE are passed for addressing*/
  433. STORE_4x2 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE
  434. #if defined(TRMMKERNEL)
  435. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,4,2
  436. #endif
  437. ALIGN_2
  438. .L2x2: /* 2-row tile of the 2-column panel, run when bit value 2 of BM is set */
  439. tmll BM,2 /* test BM & 2 */
  440. jz .L1x2 /* bit clear: no 2-row tile */
  441. ALIGN_4
  442. .L2x2_BM: /* single 2x2 tile (not a loop) */
  443. #if defined(TRMMKERNEL)
  444. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  445. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,2,2
  446. RefreshTempBk LOCAL_VAR1,BK,OFF,2,2
  447. srl LOCAL_VAR1,2 /* divide the k count by 4 (32-bit shift) */
  448. #else
  449. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK/4: trip count of the first k loop */
  450. lgr LOCAL_VAR2,B /* LOCAL_VAR2 = start of the current B panel */
  451. #endif
  452. ZERO_CVEC_2x2
  453. cijle LOCAL_VAR1,0,.L2x2_mod /* trip count 0: go to the remainder handling */
  454. ALIGN_4
  455. .L2x2_4_BK: /* first k loop: one CALC_2x2_4 per iteration */
  456. CALC_2x2_4 LOCAL_VAR3,LOCAL_VAR2
  457. brctg LOCAL_VAR1,.L2x2_4_BK /* count down and repeat */
  458. ALIGN_4
  459. .L2x2_mod: /* remainder of the k count modulo 4 */
  460. #if defined(TRMMKERNEL)
  461. RefreshTempBk LOCAL_VAR1,BK,OFF,2,2
  462. nill LOCAL_VAR1,3 /* keep k count & 3; NOTE(review): nill ANDs only the low 16 bits, the rest of the register is unchanged - confirm the k count is < 65536 */
  463. #else
  464. la LOCAL_VAR1,3(0,0) /* LOCAL_VAR1 = 3 */
  465. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3 (also sets the condition code for jz) */
  466. #endif
  467. jz .L2x2_BK_Store /* nothing left over */
  468. ALIGN_4
  469. .L2x2_BK: /* remainder k loop: one CALC_2x2 per iteration */
  470. CALC_2x2 LOCAL_VAR3,LOCAL_VAR2
  471. brctg LOCAL_VAR1,.L2x2_BK /* count down and repeat */
  472. ALIGN_4
  473. .L2x2_BK_Store:
  474. /*store C; CIJ_LOCAL and LDC_BYTE are passed for addressing*/
  475. STORE_2x2 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE
  476. #if defined(TRMMKERNEL)
  477. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,2,2
  478. #endif
  479. ALIGN_2
  480. .L1x2: /* 1-row tile of the 2-column panel, run when bit value 1 of BM is set */
  481. tmll BM,1 /* test BM & 1 */
  482. jz .Lx2_INNER_END /* bit clear: no 1-row tile */
  483. ALIGN_4
  484. .L1x2_BM: /* single 1x2 tile (not a loop) */
  485. #if defined(TRMMKERNEL)
  486. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  487. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,1,2
  488. RefreshTempBk LOCAL_VAR1,BK,OFF,1,2
  489. srl LOCAL_VAR1,2 /* divide the k count by 4 (32-bit shift) */
  490. #else
  491. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK/4: trip count of the first k loop */
  492. lgr LOCAL_VAR2,B /* LOCAL_VAR2 = start of the current B panel */
  493. #endif
  494. ZERO_CVEC_1x2
  495. cijle LOCAL_VAR1,0,.L1x2_mod /* trip count 0: go to the remainder handling */
  496. ALIGN_4
  497. .L1x2_4_BK: /* first k loop: one CALC_1x2_4 per iteration */
  498. CALC_1x2_4 LOCAL_VAR3,LOCAL_VAR2
  499. brctg LOCAL_VAR1,.L1x2_4_BK /* count down and repeat */
  500. ALIGN_4
  501. .L1x2_mod: /* remainder of the k count modulo 4 */
  502. #if defined(TRMMKERNEL)
  503. RefreshTempBk LOCAL_VAR1,BK,OFF,1,2
  504. nill LOCAL_VAR1,3 /* keep k count & 3; NOTE(review): nill ANDs only the low 16 bits, the rest of the register is unchanged - confirm the k count is < 65536 */
  505. #else
  506. la LOCAL_VAR1,3(0,0) /* LOCAL_VAR1 = 3 */
  507. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3 (also sets the condition code for jz) */
  508. #endif
  509. jz .L1x2_BK_Store /* nothing left over */
  510. ALIGN_4
  511. .L1x2_BK: /* remainder k loop: one CALC_1x2 per iteration */
  512. CALC_1x2 LOCAL_VAR3,LOCAL_VAR2
  513. brctg LOCAL_VAR1,.L1x2_BK /* count down and repeat */
  514. ALIGN_4
  515. .L1x2_BK_Store:
  516. /*store C; CIJ_LOCAL and LDC_BYTE are passed for addressing*/
  517. STORE_1x2 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE
  518. #if defined(TRMMKERNEL)
  519. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,1,2
  520. #endif
  521. ALIGN_2
  522. .Lx2_INNER_END:
  523. /* advance C and B past the 2-column panel just processed */
  524. la LOCAL_VAR1,0(LDC_BYTE,LDC_BYTE) /* LOCAL_VAR1 = LDC_BYTE*2 */
  525. sllg LOCAL_VAR2,BK,4 /* LOCAL_VAR2 = BK*2*sizeof(double) = BK*16 = BK << 4 */
  526. la CIJ,0(CIJ,LOCAL_VAR1) /* CIJ = CIJ + LDC_BYTE*2 */
  527. #if defined(TRMMKERNEL) && !defined(LEFT)
  528. aghi OFF,2 /* OFF += 2 */
  529. #endif
  530. la B,0(B,LOCAL_VAR2) /* B = B + BK*2*sizeof(double) */
  531. /*********************************X1 SECTION************************************************/
  532. ALIGN_2
  533. .Lx1: /* 1-column panel, run when bit value 1 of BN is set */
  534. tmll BN,1 /* test BN & 1 */
  535. jz .L_FUNC_END /* bit clear: nothing left to do */
  536. ALIGN_4
  537. .Lx1_BN: /* single 1-column panel (not a loop) */
  538. #if defined(TRMMKERNEL) && defined(LEFT)
  539. /*off = offset;*/
  540. lgdr OFF,OFFSET
  541. #endif
  542. srlg BM_CUR,BM,3 /* BM_CUR = BM/8: number of 8-row tiles */
  543. lgr LOCAL_VAR3,A /* LOCAL_VAR3 = A: restart from the beginning of A for this panel */
  544. lgr CIJ_LOCAL,CIJ /* CIJ_LOCAL = C position of this panel */
  545. cijle BM_CUR,0,.L4x1 /* fewer than 8 rows: skip the 8x1 loop */
  546. ALIGN_4
  547. .L8x1_BM: /* loop over the 8-row tiles of the 1-column panel (BM_CUR iterations) */
  548. #if defined(TRMMKERNEL)
  549. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  550. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,8,1
  551. RefreshTempBk LOCAL_VAR1,BK,OFF,8,1
  552. srl LOCAL_VAR1,2 /* divide the k count by 4 (32-bit shift) */
  553. #else
  554. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK/4: trip count of the first k loop */
  555. lgr LOCAL_VAR2,B /* LOCAL_VAR2 = start of the current B panel */
  556. #endif
  557. ZERO_CVEC_8x1
  558. cijle LOCAL_VAR1,0,.L8x1_mod /* trip count 0: go to the remainder handling */
  559. ALIGN_4
  560. .L8x1_4_BK: /* first k loop: one CALC_8x1_4 per iteration */
  561. #if defined(PREFETCH_INS)
  562. pfd 1, 256(LOCAL_VAR3) /* prefetch 256 bytes past the current A pointer */
  563. #endif
  564. CALC_8x1_4 LOCAL_VAR3,LOCAL_VAR2
  565. brctg LOCAL_VAR1,.L8x1_4_BK /* count down and repeat */
  566. ALIGN_4
  567. .L8x1_mod: /* remainder of the k count modulo 4 */
  568. #if defined(TRMMKERNEL)
  569. RefreshTempBk LOCAL_VAR1,BK,OFF,8,1
  570. nill LOCAL_VAR1,3 /* keep k count & 3; NOTE(review): nill ANDs only the low 16 bits, the rest of the register is unchanged - confirm the k count is < 65536 */
  571. #else
  572. la LOCAL_VAR1,3(0,0) /* LOCAL_VAR1 = 3 */
  573. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3 (also sets the condition code for jz) */
  574. #endif
  575. jz .L8x1_BK_Store /* nothing left over */
  576. ALIGN_4
  577. .L8x1_BK: /* remainder k loop: one CALC_8x1 per iteration */
  578. CALC_8x1 LOCAL_VAR3,LOCAL_VAR2
  579. brctg LOCAL_VAR1,.L8x1_BK /* count down and repeat */
  580. ALIGN_4
  581. .L8x1_BK_Store:
  582. /*store C; CIJ_LOCAL and LDC_BYTE are passed for addressing*/
  583. STORE_8x1 ALPHA_VECT,CIJ_LOCAL, LDC_BYTE
  584. #if defined(TRMMKERNEL)
  585. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,8,1
  586. #endif
  587. ALIGN_4
  588. brctg BM_CUR,.L8x1_BM /* next 8-row tile */
  589. ALIGN_2
  590. .L4x1: /* 4-row tile of the 1-column panel, run when bit value 4 of BM is set */
  591. tmll BM,4 /* test BM & 4 */
  592. jz .L2x1 /* bit clear: no 4-row tile */
  593. ALIGN_4
  594. .L4x1_BM: /* single 4x1 tile (not a loop) */
  595. #if defined(TRMMKERNEL)
  596. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  597. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,4,1
  598. RefreshTempBk LOCAL_VAR1,BK,OFF,4,1
  599. srl LOCAL_VAR1,2 /* divide the k count by 4 (32-bit shift) */
  600. #else
  601. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK/4: trip count of the first k loop */
  602. lgr LOCAL_VAR2,B /* LOCAL_VAR2 = start of the current B panel */
  603. #endif
  604. ZERO_CVEC_4x1
  605. cijle LOCAL_VAR1,0,.L4x1_mod /* trip count 0: go to the remainder handling */
  606. ALIGN_4
  607. .L4x1_4_BK: /* first k loop: one CALC_4x1_4 per iteration */
  608. CALC_4x1_4 LOCAL_VAR3,LOCAL_VAR2
  609. brctg LOCAL_VAR1,.L4x1_4_BK /* count down and repeat */
  610. ALIGN_4
  611. .L4x1_mod: /* remainder of the k count modulo 4 */
  612. #if defined(TRMMKERNEL)
  613. RefreshTempBk LOCAL_VAR1,BK,OFF,4,1
  614. nill LOCAL_VAR1,3 /* keep k count & 3; NOTE(review): nill ANDs only the low 16 bits, the rest of the register is unchanged - confirm the k count is < 65536 */
  615. #else
  616. la LOCAL_VAR1,3(0,0) /* LOCAL_VAR1 = 3 */
  617. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3 (also sets the condition code for jz) */
  618. #endif
  619. jz .L4x1_BK_Store /* nothing left over */
  620. ALIGN_4
  621. .L4x1_BK: /* remainder k loop: one CALC_4x1 per iteration */
  622. CALC_4x1 LOCAL_VAR3,LOCAL_VAR2
  623. brctg LOCAL_VAR1,.L4x1_BK /* count down and repeat */
  624. ALIGN_4
  625. .L4x1_BK_Store:
  626. /*store C; CIJ_LOCAL and LDC_BYTE are passed for addressing*/
  627. STORE_4x1 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE
  628. #if defined(TRMMKERNEL)
  629. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,4,1
  630. #endif
  631. ALIGN_2
  632. .L2x1: /* 2-row tile of the 1-column panel, run when bit value 2 of BM is set */
  633. tmll BM,2 /* test BM & 2 */
  634. jz .L1x1 /* bit clear: no 2-row tile */
  635. ALIGN_4
  636. .L2x1_BM: /* single 2x1 tile (not a loop) */
  637. #if defined(TRMMKERNEL)
  638. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  639. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,2,1
  640. RefreshTempBk LOCAL_VAR1,BK,OFF,2,1
  641. srl LOCAL_VAR1,2 /* divide the k count by 4 (32-bit shift) */
  642. #else
  643. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK/4: trip count of the first k loop */
  644. lgr LOCAL_VAR2,B /* LOCAL_VAR2 = start of the current B panel */
  645. #endif
  646. ZERO_CVEC_2x1
  647. cijle LOCAL_VAR1,0,.L2x1_mod /* trip count 0: go to the remainder handling */
  648. ALIGN_4
  649. .L2x1_4_BK: /* first k loop: one CALC_2x1_4 per iteration */
  650. CALC_2x1_4 LOCAL_VAR3,LOCAL_VAR2
  651. brctg LOCAL_VAR1,.L2x1_4_BK /* count down and repeat */
  652. ALIGN_4
  653. .L2x1_mod: /* remainder of the k count modulo 4 */
  654. #if defined(TRMMKERNEL)
  655. RefreshTempBk LOCAL_VAR1,BK,OFF,2,1
  656. nill LOCAL_VAR1,3 /* keep k count & 3; NOTE(review): nill ANDs only the low 16 bits, the rest of the register is unchanged - confirm the k count is < 65536 */
  657. #else
  658. la LOCAL_VAR1,3(0,0) /* LOCAL_VAR1 = 3 */
  659. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3 (also sets the condition code for jz) */
  660. #endif
  661. jz .L2x1_BK_Store /* nothing left over */
  662. ALIGN_4
  663. .L2x1_BK: /* remainder k loop: one CALC_2x1 per iteration */
  664. CALC_2x1 LOCAL_VAR3,LOCAL_VAR2
  665. brctg LOCAL_VAR1,.L2x1_BK /* count down and repeat */
  666. ALIGN_4
  667. .L2x1_BK_Store:
  668. /*store C; CIJ_LOCAL and LDC_BYTE are passed for addressing*/
  669. STORE_2x1 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE
  670. #if defined(TRMMKERNEL)
  671. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,2,1
  672. #endif
  673. ALIGN_2
  674. .L1x1: /* 1-row tile of the 1-column panel, run when bit value 1 of BM is set */
  675. tmll BM, 1 /* test BM & 1 */
  676. jz .Lx1_INNER_END /* bit clear: no 1-row tile */
  677. ALIGN_4
  678. .L1x1_BM: /* single 1x1 tile (not a loop) */
  679. #if defined(TRMMKERNEL)
  680. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  681. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,1,1
  682. RefreshTempBk LOCAL_VAR1,BK,OFF,1,1
  683. srl LOCAL_VAR1,2 /* divide the k count by 4 (32-bit shift) */
  684. #else
  685. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK/4: trip count of the first k loop */
  686. lgr LOCAL_VAR2,B /* LOCAL_VAR2 = start of the current B panel */
  687. #endif
  688. ZERO_CVEC_1x1
  689. cijle LOCAL_VAR1,0,.L1x1_mod /* trip count 0: go to the remainder handling */
  690. ALIGN_4
  691. .L1x1_4_BK: /* first k loop: one CALC_1x1_4 per iteration */
  692. CALC_1x1_4 LOCAL_VAR3,LOCAL_VAR2
  693. brctg LOCAL_VAR1,.L1x1_4_BK /* count down and repeat */
  694. ALIGN_4
  695. .L1x1_mod: /* remainder of the k count modulo 4 */
  696. #if defined(TRMMKERNEL)
  697. RefreshTempBk LOCAL_VAR1,BK,OFF,1,1
  698. nill LOCAL_VAR1,3 /* keep k count & 3; NOTE(review): nill ANDs only the low 16 bits, the rest of the register is unchanged - confirm the k count is < 65536 */
  699. #else
  700. la LOCAL_VAR1,3(0,0) /* LOCAL_VAR1 = 3 */
  701. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3 (also sets the condition code for jz) */
  702. #endif
  703. jz .L1x1_BK_Store /* nothing left over */
  704. ALIGN_4
  705. .L1x1_BK: /* remainder k loop: one CALC_1x1 per iteration */
  706. CALC_1x1 LOCAL_VAR3,LOCAL_VAR2
  707. brctg LOCAL_VAR1,.L1x1_BK /* count down and repeat */
  708. ALIGN_4
  709. .L1x1_BK_Store:
  710. /*store C; CIJ_LOCAL and LDC_BYTE are passed for addressing*/
  711. STORE_1x1 ALPHA ,CIJ_LOCAL, LDC_BYTE /* passes the scalar ALPHA (%f0) here, unlike the other STORE calls which pass ALPHA_VECT */
  712. #if defined(TRMMKERNEL)
  713. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,1,1
  714. #endif
  715. ALIGN_2
  716. .Lx1_INNER_END:
  717. /* advance C and B past the 1-column panel; execution then falls through to .L_FUNC_END, which restores %r6-%r12 (%r13 in TRMM builds) */
  718. sllg LOCAL_VAR2,BK,3 /* LOCAL_VAR2 = BK*1*sizeof(double) = BK*8 = BK << 3 */
  719. la CIJ,0(CIJ,LDC_BYTE) /* CIJ = CIJ + LDC_BYTE */
  720. #if defined(TRMMKERNEL) && !defined(LEFT)
  721. aghi OFF,1 /* OFF += 1 */
  722. #endif
  723. la B,0(B,LOCAL_VAR2) /* B = B + BK*1*sizeof(double) */
  724. ALIGN_2
  725. .L_FUNC_END:
  726. /* restore the registers saved in the prologue and return */
  727. #if defined(TRMMKERNEL)
  728. ld OFFSET,40(%r15) /* restore %f8 from the slot written in the prologue */
  729. lmg %r6,%r13,48(%r15) /* restore %r6-%r13 */
  730. #else
  731. lmg %r6,%r12,48(%r15) /* restore %r6-%r12 */
  732. #endif
  733. br %r14 /* return to the caller */
  734. .end