You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

strmm8x4V.S 22 kB

9 years ago
9 years ago
9 years ago
9 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855
  1. /***************************************************************************
  2. Copyright (c) 2013-2017, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2017/03/01 AbdelRauf (quickwritereader@gmail.com)
  29. * BLASTEST : passed
  30. * CTEST : passed
  31. * TEST : passed
  32. **************************************************************************************/
  33. /*********************************************************************/
  34. /* Copyright 2009, 2010 The University of Texas at Austin. */
  35. /* All rights reserved. */
  36. /* */
  37. /* Redistribution and use in source and binary forms, with or */
  38. /* without modification, are permitted provided that the following */
  39. /* conditions are met: */
  40. /* */
  41. /* 1. Redistributions of source code must retain the above */
  42. /* copyright notice, this list of conditions and the following */
  43. /* disclaimer. */
  44. /* */
  45. /* 2. Redistributions in binary form must reproduce the above */
  46. /* copyright notice, this list of conditions and the following */
  47. /* disclaimer in the documentation and/or other materials */
  48. /* provided with the distribution. */
  49. /* */
  50. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  51. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  52. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  53. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  54. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  55. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  56. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  57. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  58. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  59. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  60. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  61. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  62. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  63. /* POSSIBILITY OF SUCH DAMAGE. */
  64. /* */
  65. /* The views and conclusions contained in the software and */
  66. /* documentation are those of the authors and should not be */
  67. /* interpreted as representing official policies, either expressed */
  68. /* or implied, of The University of Texas at Austin. */
  69. /*********************************************************************/
  70. #define ASSEMBLER
  71. #include "common.h"
  72. /*
  73. #BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
  74. ##bm=r2,bn=r3, bk=r4, alpha=f0,ba=r5,bb=r6,C=stack[160] ,ldc=stack[168]
  75. offset=stack[176]
  76. **********************************************************************************************/
  77. /*Note: r0 can not be used as address disp register */
  /* Register aliases used by the kernel below.                                  */
  /* BM, BN, BK   : row, column and inner dimensions (bm=r2, bn=r3, bk=r4).       */
  /* BM_CUR/BN_CUR: loop counters loaded from BM >> 3 and BN >> 2.                */
  /* LDC_BYTE     : leading dimension of C in bytes (ldc << 2).                   */
  /* ALPHA        : alpha as it arrives in %f0; ALPHA_VECT is the %v0 vector      */
  /*                register that the kernel fills from it.                       */
  /* LOCAL_VAR1   : scratch, mostly the k-loop counter.                           */
  /* LOCAL_VAR2   : running B pointer; LOCAL_VAR3 : running A pointer.            */
  /* A, B         : packed input panels (ba=r5, bb=r6).                           */
  /* CIJ          : C pointer loaded from 160(%r15); CIJ_LOCAL is the per-panel   */
  /*                working copy.                                                 */
  /* OFF          : TRMM offset loaded from 176(%r15); OFFSET (%f8) keeps a copy  */
  /*                of it in a floating-point register.                           */
  78. #define BM %r2
  79. #define BM_CUR %r0
  80. #define BN %r3
  81. #define BN_CUR %r10
  82. #define BK %r4
  83. #define LDC_BYTE %r8
  84. #define ALPHA %f0
  85. #define ALPHA_VECT %v0
  86. #define LOCAL_VAR1 %r9
  87. #define LOCAL_VAR2 %r1
  88. #define LOCAL_VAR3 %r11
  89. #define A %r5
  90. #define B %r6
  91. #define CIJ %r7
  92. #define CIJ_LOCAL %r12
  93. #define OFF %r13
  94. #define OFFSET %f8
  /* Alignment shorthands placed in front of labels. */
  95. #define ALIGN_4 .align 16
  96. #define ALIGN_2 .align 8
  /* When defined, the pfd prefetch instructions guarded by                       */
  /* "#if defined(PREFETCH_INS)" are assembled.                                   */
  97. #define PREFETCH_INS 1
  98. /**************************Include kernel helper macros**********************************/
  99. #include "skernelMacros.S"
  100. /***********************************SGEMM / STRMM kernel***********************************************************/
  /* Kernel entry: save the registers this routine overwrites, fetch the stack  */
  /* arguments (C, ldc and, for TRMM, offset), and derive the loop constants.   */
  101. PROLOGUE
  102. #if defined(TRMMKERNEL)
  /* TRMM also uses OFF (%r13) and OFFSET (%f8), so both are saved here and     */
  /* restored at .L_FUNC_END.                                                   */
  103. std OFFSET,40(%r15)
  104. stmg %r6,%r13,48(%r15)
  105. #else
  106. stmg %r6,%r12,48(%r15)
  107. #endif
  108. lg CIJ, 160(%r15) /* C pointer (stack argument) */
  109. lg LOCAL_VAR1, 168(%r15) /* ldc in elements (stack argument) */
  110. #if defined(TRMMKERNEL)
  111. lg OFF,176(%r15) /* offset (stack argument) */
  112. ldgr OFFSET ,OFF /* keep an untouched copy of offset in the FP register */
  113. #endif
  114. srlg BN_CUR,BN,2 /* BN_CUR = BN / 4 = number of 4-column panels */
  115. vrepf ALPHA_VECT,ALPHA_VECT,0 /* replicate alpha (word element 0, i.e. the value in f0) across the vector */
  116. vldeb ALPHA_VECT,ALPHA_VECT /* lengthen alpha from short to long floating-point format */
  117. sllg LDC_BYTE, LOCAL_VAR1,2 /* ldc stride in bytes: ldc * sizeof(float) = ldc << 2 */
  118. #if defined(TRMMKERNEL) && !defined(LEFT)
  119. /* off = -offset; */
  120. lgdr LOCAL_VAR1,OFFSET
  121. lcgr OFF,LOCAL_VAR1
  122. #endif
  123. cijle BN_CUR,0,.LX2 /* fewer than 4 columns: skip the 4-column section */
  124. ALIGN_4
  /* ---- 4-column section: one pass of .LX4_BN per panel of 4 columns -------- */
  /* Rows are covered by a loop of 8-row tiles, then single 4-, 2- and 1-row    */
  /* tiles selected by the low bits of BM.                                      */
  /* ZERO_CVEC_*, CALC_*, STORE_* and Refresh* are macros from skernelMacros.S; */
  /* their bodies are not visible here. Presumably ZERO_CVEC clears the         */
  /* accumulators, CALC_RxC does one k step and CALC_RxC_4 four - TODO confirm. */
  125. .LX4_BN:
  126. #if defined(PREFETCH_INS)
  127. pfd 1, 0(A)
  128. pfd 1, 0(B)
  129. #endif
  130. #if defined(TRMMKERNEL) && defined(LEFT)
  131. /* off = offset; reloaded from the FP register copy for every panel */
  132. lgdr OFF,OFFSET
  133. #endif
  134. srlg BM_CUR,BM,3 /* BM_CUR = BM / 8 = number of 8-row tiles */
  135. lgr LOCAL_VAR3,A /* running A pointer restarts at A */
  136. lgr CIJ_LOCAL,CIJ /* running C pointer restarts at the panel's C */
  137. cijle BM_CUR,0,.L4x4 /* no 8-row tile: go to the row tails */
  138. ALIGN_4
  139. .L8x4_BM: /* 8x4 tile loop, counted down in BM_CUR */
  140. #if defined(TRMMKERNEL)
  141. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  142. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,8,4
  143. RefreshTempBk LOCAL_VAR1,BK,OFF,8,4
  144. srl LOCAL_VAR1,2 /* temp BK / 4 (32-bit shift) */
  145. #else
  146. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK >> 2 */
  147. lgr LOCAL_VAR2,B /* LOCAL_VAR2 = start of the B panel */
  148. #endif
  149. ZERO_CVEC_8x4
  150. cijle LOCAL_VAR1,0,.L8x4_mod /* count is 0: only leftover k steps */
  151. ALIGN_4
  152. .L8x4_4_BK: /* main k loop, counted down in LOCAL_VAR1 */
  153. #if defined(PREFETCH_INS)
  154. pfd 1, 256(LOCAL_VAR3)
  155. #endif
  156. CALC_8x4_4 LOCAL_VAR3,LOCAL_VAR2
  157. #if defined(PREFETCH_INS)
  158. pfd 1, 128(LOCAL_VAR2)
  159. #endif
  160. brctg LOCAL_VAR1,.L8x4_4_BK
  161. ALIGN_4
  162. .L8x4_mod:
  163. #if defined(TRMMKERNEL)
  /* NOTE(review): nill masks only the low halfword of LOCAL_VAR1, while the    */
  /* brctg below counts on the full 64-bit register. This looks like it assumes */
  /* the temp BK is below 65536 - confirm. The same pattern recurs for every    */
  /* tile size.                                                                 */
  164. RefreshTempBk LOCAL_VAR1,BK,OFF,8,4
  165. nill LOCAL_VAR1,3
  166. #else
  167. lghi LOCAL_VAR1,3
  168. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3 */
  169. #endif
  170. jz .L8x4_BK_Store /* no leftover k steps */
  171. ALIGN_4
  172. .L8x4_BK: /* leftover k loop */
  173. CALC_8x4 LOCAL_VAR3,LOCAL_VAR2
  174. brctg LOCAL_VAR1,.L8x4_BK
  175. ALIGN_4
  176. .L8x4_BK_Store:
  177. /* store the tile: alpha, CIJ_LOCAL and the LDC_BYTE stride go to the store macro */
  178. STORE_8x4 ALPHA_VECT,CIJ_LOCAL, LDC_BYTE , LOCAL_VAR1 ,LOCAL_VAR2
  179. #if defined(TRMMKERNEL)
  180. /*RefreshPointersAndOFF TEMP_VAL,BK_VAL,OFF_VAL,L_VAR,PTR_A,C_A*/
  181. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,8,4
  182. #endif
  183. brctg BM_CUR,.L8x4_BM
  184. ALIGN_4
  185. .L4x4:
  186. tmll BM,4
  187. jz .L2x4 /* bit 2 of BM clear: no 4-row tile */
  188. ALIGN_4
  189. .L4x4_BM: /* single 4x4 tile */
  190. #if defined(TRMMKERNEL)
  191. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  192. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,4,4
  193. RefreshTempBk LOCAL_VAR1,BK,OFF,4,4
  194. srl LOCAL_VAR1,2
  195. #else
  196. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK >> 2 */
  197. lgr LOCAL_VAR2,B /* LOCAL_VAR2 = start of the B panel */
  198. #endif
  199. ZERO_CVEC_4x4
  200. cijle LOCAL_VAR1,0,.L4x4_mod
  201. ALIGN_4
  202. .L4x4_4_BK: /* main k loop */
  203. CALC_4x4_4 LOCAL_VAR3,LOCAL_VAR2
  204. brctg LOCAL_VAR1,.L4x4_4_BK
  205. ALIGN_4
  206. .L4x4_mod:
  207. #if defined(TRMMKERNEL)
  208. RefreshTempBk LOCAL_VAR1,BK,OFF,4,4
  209. nill LOCAL_VAR1,3
  210. #else
  211. lghi LOCAL_VAR1,3
  212. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3 */
  213. #endif
  214. jz .L4x4_BK_Store
  215. ALIGN_4
  216. .L4x4_BK: /* leftover k loop */
  217. CALC_4x4 LOCAL_VAR3,LOCAL_VAR2
  218. brctg LOCAL_VAR1,.L4x4_BK
  219. ALIGN_4
  220. .L4x4_BK_Store:
  221. /* store the tile: alpha, CIJ_LOCAL and the LDC_BYTE stride go to the store macro */
  222. STORE_4x4 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE , LOCAL_VAR1 ,LOCAL_VAR2
  223. #if defined(TRMMKERNEL)
  224. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,4,4
  225. #endif
  226. ALIGN_2
  227. .L2x4:
  228. tmll BM,2
  229. jz .L1x4 /* bit 1 of BM clear: no 2-row tile */
  230. ALIGN_4
  231. .L2x4_BM: /* single 2x4 tile */
  232. #if defined(TRMMKERNEL)
  233. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  234. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,2,4
  235. RefreshTempBk LOCAL_VAR1,BK,OFF,2,4
  236. srl LOCAL_VAR1,2
  237. #else
  238. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK >> 2 */
  239. lgr LOCAL_VAR2,B /* LOCAL_VAR2 = start of the B panel */
  240. #endif
  241. ZERO_CVEC_2x4
  242. cijle LOCAL_VAR1,0,.L2x4_mod
  243. ALIGN_4
  244. .L2x4_4_BK: /* main k loop */
  245. CALC_2x4_4 LOCAL_VAR3,LOCAL_VAR2
  246. brctg LOCAL_VAR1,.L2x4_4_BK
  247. ALIGN_4
  248. .L2x4_mod:
  249. #if defined(TRMMKERNEL)
  250. RefreshTempBk LOCAL_VAR1,BK,OFF,2,4
  251. nill LOCAL_VAR1,3
  252. #else
  253. lghi LOCAL_VAR1,3
  254. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3 */
  255. #endif
  256. jz .L2x4_BK_Store
  257. ALIGN_4
  258. .L2x4_BK: /* leftover k loop */
  259. CALC_2x4 LOCAL_VAR3,LOCAL_VAR2
  260. brctg LOCAL_VAR1,.L2x4_BK
  261. ALIGN_4
  262. .L2x4_BK_Store:
  263. /* store the tile: alpha, CIJ_LOCAL and the LDC_BYTE stride go to the store macro */
  264. STORE_2x4 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE , LOCAL_VAR1 ,LOCAL_VAR2
  265. #if defined(TRMMKERNEL)
  266. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,2,4
  267. #endif
  268. ALIGN_4
  269. .L1x4:
  270. tmll BM,1
  271. jz .Lx4_INNER_END /* bit 0 of BM clear: no 1-row tile */
  272. ALIGN_4
  273. .L1x4_BM: /* single 1x4 tile */
  274. #if defined(TRMMKERNEL)
  275. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  276. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,1,4
  277. RefreshTempBk LOCAL_VAR1,BK,OFF,1,4
  278. srl LOCAL_VAR1,2
  279. #else
  280. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK >> 2 */
  281. lgr LOCAL_VAR2,B /* LOCAL_VAR2 = start of the B panel */
  282. #endif
  283. ZERO_CVEC_1x4
  284. cijle LOCAL_VAR1,0,.L1x4_mod
  285. ALIGN_4
  286. .L1x4_4_BK: /* main k loop */
  287. CALC_1x4_4 LOCAL_VAR3,LOCAL_VAR2
  288. brctg LOCAL_VAR1,.L1x4_4_BK
  289. ALIGN_4
  290. .L1x4_mod:
  291. #if defined(TRMMKERNEL)
  292. RefreshTempBk LOCAL_VAR1,BK,OFF,1,4
  293. nill LOCAL_VAR1,3
  294. #else
  295. lghi LOCAL_VAR1,3
  296. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3 */
  297. #endif
  298. jz .L1x4_BK_Store
  299. ALIGN_4
  300. .L1x4_BK: /* leftover k loop */
  301. CALC_1x4 LOCAL_VAR3,LOCAL_VAR2
  302. brctg LOCAL_VAR1,.L1x4_BK
  303. ALIGN_4
  304. .L1x4_BK_Store:
  305. /* store the tile: alpha, CIJ_LOCAL and the LDC_BYTE stride go to the store macro */
  306. STORE_1x4 ALPHA ,CIJ_LOCAL, LDC_BYTE , LOCAL_VAR1 ,LOCAL_VAR2
  307. #if defined(TRMMKERNEL)
  308. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,1,4
  309. #endif
  310. ALIGN_2
  311. .Lx4_INNER_END:
  312. /* advance C and B past the 4 columns just processed */
  313. sllg LOCAL_VAR1,LDC_BYTE,2 /* LOCAL_VAR1 = LDC_BYTE * 4 */
  314. #if defined(TRMMKERNEL) && !defined(LEFT)
  315. aghi OFF,4
  316. #endif
  317. sllg LOCAL_VAR2,BK,4 /* LOCAL_VAR2 = BK * 4 * sizeof(float) = BK << 4 */
  318. la CIJ,0(CIJ,LOCAL_VAR1) /* CIJ = CIJ + LDC_BYTE * 4 */
  319. la B,0(B,LOCAL_VAR2) /* B = B + BK * 4 * sizeof(float) */
  320. brctg BN_CUR,.LX4_BN
  321. /*********************************X2 SECTION************************************************/
  /* Runs once when bit 1 of BN is set: a panel of 2 columns.                   */
  /* Rows are covered by a loop of 8-row tiles, then single 4-, 2- and 1-row    */
  /* tiles selected by the low bits of BM.                                      */
  /* ZERO_CVEC_*, CALC_*, STORE_* and Refresh* are macros from skernelMacros.S; */
  /* their bodies are not visible here. Presumably ZERO_CVEC clears the         */
  /* accumulators, CALC_RxC does one k step and CALC_RxC_4 four - TODO confirm. */
  322. ALIGN_4
  323. .LX2:
  324. tmll BN,2
  325. jz .Lx1 /* bit 1 of BN clear: no 2-column panel */
  326. ALIGN_4
  327. .Lx2_BN:
  328. #if defined(TRMMKERNEL) && defined(LEFT)
  329. /* off = offset; reloaded from the FP register copy */
  330. lgdr OFF,OFFSET
  331. #endif
  332. srlg BM_CUR,BM,3 /* BM_CUR = BM / 8 = number of 8-row tiles */
  333. lgr LOCAL_VAR3,A /* running A pointer restarts at A */
  334. lgr CIJ_LOCAL,CIJ /* running C pointer restarts at the panel's C */
  335. cijle BM_CUR,0,.L4x2 /* no 8-row tile: go to the row tails */
  336. ALIGN_4
  337. .L8x2_BM: /* 8x2 tile loop, counted down in BM_CUR */
  338. #if defined(TRMMKERNEL)
  339. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  340. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,8,2
  341. RefreshTempBk LOCAL_VAR1,BK,OFF,8,2
  342. srl LOCAL_VAR1,2 /* temp BK / 4 (32-bit shift) */
  343. #else
  344. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK >> 2 */
  345. lgr LOCAL_VAR2,B /* LOCAL_VAR2 = start of the B panel */
  346. #endif
  347. ZERO_CVEC_8x2
  348. cijle LOCAL_VAR1,0,.L8x2_mod /* count is 0: only leftover k steps */
  349. ALIGN_4
  350. .L8x2_4_BK: /* main k loop, counted down in LOCAL_VAR1 */
  351. #if defined(PREFETCH_INS)
  352. pfd 1, 256(LOCAL_VAR3)
  353. #endif
  354. CALC_8x2_4 LOCAL_VAR3,LOCAL_VAR2
  355. brctg LOCAL_VAR1,.L8x2_4_BK
  356. ALIGN_4
  357. .L8x2_mod:
  358. #if defined(TRMMKERNEL)
  /* NOTE(review): nill masks only the low halfword of LOCAL_VAR1, while the    */
  /* brctg below counts on the full 64-bit register. This looks like it assumes */
  /* the temp BK is below 65536 - confirm. The same pattern recurs for every    */
  /* tile size.                                                                 */
  359. RefreshTempBk LOCAL_VAR1,BK,OFF,8,2
  360. nill LOCAL_VAR1,3
  361. #else
  362. lghi LOCAL_VAR1,3
  363. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3 */
  364. #endif
  365. jz .L8x2_BK_Store /* no leftover k steps */
  366. ALIGN_4
  367. .L8x2_BK: /* leftover k loop */
  368. CALC_8x2 LOCAL_VAR3,LOCAL_VAR2
  369. brctg LOCAL_VAR1,.L8x2_BK
  370. ALIGN_4
  371. .L8x2_BK_Store:
  372. /* store the tile: alpha, CIJ_LOCAL and the LDC_BYTE stride go to the store macro */
  373. STORE_8x2 ALPHA_VECT,CIJ_LOCAL, LDC_BYTE
  374. #if defined(TRMMKERNEL)
  375. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,8,2
  376. #endif
  377. ALIGN_4
  378. brctg BM_CUR,.L8x2_BM
  379. ALIGN_2
  380. .L4x2:
  381. tmll BM,4
  382. jz .L2x2 /* bit 2 of BM clear: no 4-row tile */
  383. ALIGN_4
  384. .L4x2_BM: /* single 4x2 tile */
  385. #if defined(TRMMKERNEL)
  386. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  387. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,4,2
  388. RefreshTempBk LOCAL_VAR1,BK,OFF,4,2
  389. srl LOCAL_VAR1,2
  390. #else
  391. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK >> 2 */
  392. lgr LOCAL_VAR2,B /* LOCAL_VAR2 = start of the B panel */
  393. #endif
  394. ZERO_CVEC_4x2
  395. cijle LOCAL_VAR1,0,.L4x2_mod
  396. ALIGN_4
  397. .L4x2_4_BK: /* main k loop */
  398. CALC_4x2_4 LOCAL_VAR3,LOCAL_VAR2
  399. brctg LOCAL_VAR1,.L4x2_4_BK
  400. ALIGN_4
  401. .L4x2_mod:
  402. #if defined(TRMMKERNEL)
  403. RefreshTempBk LOCAL_VAR1,BK,OFF,4,2
  404. nill LOCAL_VAR1,3
  405. #else
  406. lghi LOCAL_VAR1,3
  407. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3 */
  408. #endif
  409. jz .L4x2_BK_Store
  410. ALIGN_4
  411. .L4x2_BK: /* leftover k loop */
  412. CALC_4x2 LOCAL_VAR3,LOCAL_VAR2
  413. brctg LOCAL_VAR1,.L4x2_BK
  414. ALIGN_4
  415. .L4x2_BK_Store:
  416. /* store the tile: alpha, CIJ_LOCAL and the LDC_BYTE stride go to the store macro */
  417. STORE_4x2 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE
  418. #if defined(TRMMKERNEL)
  419. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,4,2
  420. #endif
  421. ALIGN_2
  422. .L2x2:
  423. tmll BM,2
  424. jz .L1x2 /* bit 1 of BM clear: no 2-row tile */
  425. ALIGN_4
  426. .L2x2_BM: /* single 2x2 tile */
  427. #if defined(TRMMKERNEL)
  428. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  429. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,2,2
  430. RefreshTempBk LOCAL_VAR1,BK,OFF,2,2
  431. srl LOCAL_VAR1,2
  432. #else
  433. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK >> 2 */
  434. lgr LOCAL_VAR2,B /* LOCAL_VAR2 = start of the B panel */
  435. #endif
  436. ZERO_CVEC_2x2
  437. cijle LOCAL_VAR1,0,.L2x2_mod
  438. ALIGN_4
  439. .L2x2_4_BK: /* main k loop */
  440. CALC_2x2_4 LOCAL_VAR3,LOCAL_VAR2
  441. brctg LOCAL_VAR1,.L2x2_4_BK
  442. ALIGN_4
  443. .L2x2_mod:
  444. #if defined(TRMMKERNEL)
  445. RefreshTempBk LOCAL_VAR1,BK,OFF,2,2
  446. nill LOCAL_VAR1,3
  447. #else
  448. lghi LOCAL_VAR1,3
  449. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3 */
  450. #endif
  451. jz .L2x2_BK_Store
  452. ALIGN_4
  453. .L2x2_BK: /* leftover k loop */
  454. CALC_2x2 LOCAL_VAR3,LOCAL_VAR2
  455. brctg LOCAL_VAR1,.L2x2_BK
  456. ALIGN_4
  457. .L2x2_BK_Store:
  458. /* store the tile: alpha, CIJ_LOCAL and the LDC_BYTE stride go to the store macro */
  459. STORE_2x2 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE
  460. #if defined(TRMMKERNEL)
  461. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,2,2
  462. #endif
  463. ALIGN_2
  464. .L1x2:
  465. tmll BM,1
  466. jz .Lx2_INNER_END /* bit 0 of BM clear: no 1-row tile */
  467. ALIGN_4
  468. .L1x2_BM: /* single 1x2 tile */
  469. #if defined(TRMMKERNEL)
  470. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  471. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,1,2
  472. RefreshTempBk LOCAL_VAR1,BK,OFF,1,2
  473. srl LOCAL_VAR1,2
  474. #else
  475. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK >> 2 */
  476. lgr LOCAL_VAR2,B /* LOCAL_VAR2 = start of the B panel */
  477. #endif
  478. ZERO_CVEC_1x2
  479. cijle LOCAL_VAR1,0,.L1x2_mod
  480. ALIGN_4
  481. .L1x2_4_BK: /* main k loop */
  482. CALC_1x2_4 LOCAL_VAR3,LOCAL_VAR2
  483. brctg LOCAL_VAR1,.L1x2_4_BK
  484. ALIGN_4
  485. .L1x2_mod:
  486. #if defined(TRMMKERNEL)
  487. RefreshTempBk LOCAL_VAR1,BK,OFF,1,2
  488. nill LOCAL_VAR1,3
  489. #else
  490. lghi LOCAL_VAR1,3
  491. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3 */
  492. #endif
  493. jz .L1x2_BK_Store
  494. ALIGN_4
  495. .L1x2_BK: /* leftover k loop */
  496. CALC_1x2 LOCAL_VAR3,LOCAL_VAR2
  497. brctg LOCAL_VAR1,.L1x2_BK
  498. ALIGN_4
  499. .L1x2_BK_Store:
  500. /* store the tile: alpha, CIJ_LOCAL and the LDC_BYTE stride go to the store macro */
  501. STORE_1x2 ALPHA ,CIJ_LOCAL, LDC_BYTE
  502. #if defined(TRMMKERNEL)
  503. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,1,2
  504. #endif
  505. ALIGN_2
  506. .Lx2_INNER_END:
  507. /* advance C and B past the 2 columns just processed */
  508. la LOCAL_VAR1,0(LDC_BYTE,LDC_BYTE) /* LOCAL_VAR1 = LDC_BYTE * 2 */
  509. sllg LOCAL_VAR2,BK,3 /* LOCAL_VAR2 = BK * 2 * sizeof(float) = BK << 3 */
  510. la CIJ,0(CIJ,LOCAL_VAR1) /* CIJ = CIJ + LDC_BYTE * 2 */
  511. #if defined(TRMMKERNEL) && !defined(LEFT)
  512. aghi OFF,2
  513. #endif
  514. la B,0(B,LOCAL_VAR2) /* B = B + BK * 2 * sizeof(float) */
  515. /*********************************X1 SECTION************************************************/
  /* Runs once when bit 0 of BN is set: a single remaining column.              */
  /* Rows are covered by a loop of 8-row tiles, then single 4-, 2- and 1-row    */
  /* tiles selected by the low bits of BM.                                      */
  /* ZERO_CVEC_*, CALC_*, STORE_* and Refresh* are macros from skernelMacros.S; */
  /* their bodies are not visible here. Presumably ZERO_CVEC clears the         */
  /* accumulators, CALC_RxC does one k step and CALC_RxC_4 four - TODO confirm. */
  516. ALIGN_2
  517. .Lx1:
  518. tmll BN,1
  519. jz .L_FUNC_END /* bit 0 of BN clear: nothing left to do */
  520. ALIGN_4
  521. .Lx1_BN:
  522. #if defined(TRMMKERNEL) && defined(LEFT)
  523. /* off = offset; reloaded from the FP register copy */
  524. lgdr OFF,OFFSET
  525. #endif
  526. srlg BM_CUR,BM,3 /* BM_CUR = BM / 8 = number of 8-row tiles */
  527. lgr LOCAL_VAR3,A /* running A pointer restarts at A */
  528. lgr CIJ_LOCAL,CIJ /* running C pointer restarts at the panel's C */
  529. cijle BM_CUR,0,.L4x1 /* no 8-row tile: go to the row tails */
  530. ALIGN_4
  531. .L8x1_BM: /* 8x1 tile loop, counted down in BM_CUR */
  532. #if defined(TRMMKERNEL)
  533. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  534. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,8,1
  535. RefreshTempBk LOCAL_VAR1,BK,OFF,8,1
  536. srl LOCAL_VAR1,2 /* temp BK / 4 (32-bit shift) */
  537. #else
  538. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK >> 2 */
  539. lgr LOCAL_VAR2,B /* LOCAL_VAR2 = start of the B panel */
  540. #endif
  541. ZERO_CVEC_8x1
  542. cijle LOCAL_VAR1,0,.L8x1_mod /* count is 0: only leftover k steps */
  543. ALIGN_4
  544. .L8x1_4_BK: /* main k loop, counted down in LOCAL_VAR1 */
  545. #if defined(PREFETCH_INS)
  546. pfd 1, 256(LOCAL_VAR3)
  547. #endif
  548. CALC_8x1_4 LOCAL_VAR3,LOCAL_VAR2
  549. brctg LOCAL_VAR1,.L8x1_4_BK
  550. ALIGN_4
  551. .L8x1_mod:
  552. #if defined(TRMMKERNEL)
  /* NOTE(review): nill masks only the low halfword of LOCAL_VAR1, while the    */
  /* brctg below counts on the full 64-bit register. This looks like it assumes */
  /* the temp BK is below 65536 - confirm. The same pattern recurs for every    */
  /* tile size.                                                                 */
  553. RefreshTempBk LOCAL_VAR1,BK,OFF,8,1
  554. nill LOCAL_VAR1,3
  555. #else
  556. lghi LOCAL_VAR1,3
  557. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3 */
  558. #endif
  559. jz .L8x1_BK_Store /* no leftover k steps */
  560. ALIGN_4
  561. .L8x1_BK: /* leftover k loop */
  562. CALC_8x1 LOCAL_VAR3,LOCAL_VAR2
  563. brctg LOCAL_VAR1,.L8x1_BK
  564. ALIGN_4
  565. .L8x1_BK_Store:
  566. /* store the tile: alpha, CIJ_LOCAL and the LDC_BYTE stride go to the store macro */
  567. STORE_8x1 ALPHA_VECT,CIJ_LOCAL, LDC_BYTE
  568. #if defined(TRMMKERNEL)
  569. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,8,1
  570. #endif
  571. ALIGN_4
  572. brctg BM_CUR,.L8x1_BM
  573. ALIGN_2
  574. .L4x1:
  575. tmll BM,4
  576. jz .L2x1 /* bit 2 of BM clear: no 4-row tile */
  577. ALIGN_4
  578. .L4x1_BM: /* single 4x1 tile */
  579. #if defined(TRMMKERNEL)
  580. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  581. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,4,1
  582. RefreshTempBk LOCAL_VAR1,BK,OFF,4,1
  583. srl LOCAL_VAR1,2
  584. #else
  585. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK >> 2 */
  586. lgr LOCAL_VAR2,B /* LOCAL_VAR2 = start of the B panel */
  587. #endif
  588. ZERO_CVEC_4x1
  589. cijle LOCAL_VAR1,0,.L4x1_mod
  590. ALIGN_4
  591. .L4x1_4_BK: /* main k loop */
  592. CALC_4x1_4 LOCAL_VAR3,LOCAL_VAR2
  593. brctg LOCAL_VAR1,.L4x1_4_BK
  594. ALIGN_4
  595. .L4x1_mod:
  596. #if defined(TRMMKERNEL)
  597. RefreshTempBk LOCAL_VAR1,BK,OFF,4,1
  598. nill LOCAL_VAR1,3
  599. #else
  600. lghi LOCAL_VAR1,3
  601. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3 */
  602. #endif
  603. jz .L4x1_BK_Store
  604. ALIGN_4
  605. .L4x1_BK: /* leftover k loop */
  606. CALC_4x1 LOCAL_VAR3,LOCAL_VAR2
  607. brctg LOCAL_VAR1,.L4x1_BK
  608. ALIGN_4
  609. .L4x1_BK_Store:
  610. /* store the tile: alpha, CIJ_LOCAL and the LDC_BYTE stride go to the store macro */
  611. STORE_4x1 ALPHA ,CIJ_LOCAL, LDC_BYTE
  612. #if defined(TRMMKERNEL)
  613. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,4,1
  614. #endif
  615. ALIGN_2
  616. .L2x1:
  617. tmll BM,2
  618. jz .L1x1 /* bit 1 of BM clear: no 2-row tile */
  619. ALIGN_4
  620. .L2x1_BM: /* single 2x1 tile */
  621. #if defined(TRMMKERNEL)
  622. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  623. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,2,1
  624. RefreshTempBk LOCAL_VAR1,BK,OFF,2,1
  625. srl LOCAL_VAR1,2
  626. #else
  627. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK >> 2 */
  628. lgr LOCAL_VAR2,B /* LOCAL_VAR2 = start of the B panel */
  629. #endif
  630. ZERO_CVEC_2x1
  631. cijle LOCAL_VAR1,0,.L2x1_mod
  632. ALIGN_4
  633. .L2x1_4_BK: /* main k loop */
  634. CALC_2x1_4 LOCAL_VAR3,LOCAL_VAR2
  635. brctg LOCAL_VAR1,.L2x1_4_BK
  636. ALIGN_4
  637. .L2x1_mod:
  638. #if defined(TRMMKERNEL)
  639. RefreshTempBk LOCAL_VAR1,BK,OFF,2,1
  640. nill LOCAL_VAR1,3
  641. #else
  642. lghi LOCAL_VAR1,3
  643. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3 */
  644. #endif
  645. jz .L2x1_BK_Store
  646. ALIGN_4
  647. .L2x1_BK: /* leftover k loop */
  648. CALC_2x1 LOCAL_VAR3,LOCAL_VAR2
  649. brctg LOCAL_VAR1,.L2x1_BK
  650. ALIGN_4
  651. .L2x1_BK_Store:
  652. /* store the tile: alpha, CIJ_LOCAL and the LDC_BYTE stride go to the store macro */
  653. STORE_2x1 ALPHA ,CIJ_LOCAL, LDC_BYTE
  654. #if defined(TRMMKERNEL)
  655. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,2,1
  656. #endif
  657. ALIGN_2
  658. .L1x1:
  659. tmll BM, 1
  660. jz .Lx1_INNER_END /* bit 0 of BM clear: no 1-row tile */
  661. ALIGN_4
  662. .L1x1_BM: /* single 1x1 tile */
  663. #if defined(TRMMKERNEL)
  664. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  665. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,1,1
  666. RefreshTempBk LOCAL_VAR1,BK,OFF,1,1
  667. srl LOCAL_VAR1,2
  668. #else
  669. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK >> 2 */
  670. lgr LOCAL_VAR2,B /* LOCAL_VAR2 = start of the B panel */
  671. #endif
  672. ZERO_CVEC_1x1
  673. cijle LOCAL_VAR1,0,.L1x1_mod
  674. ALIGN_4
  675. .L1x1_4_BK: /* main k loop */
  676. CALC_1x1_4 LOCAL_VAR3,LOCAL_VAR2
  677. brctg LOCAL_VAR1,.L1x1_4_BK
  678. ALIGN_4
  679. .L1x1_mod:
  680. #if defined(TRMMKERNEL)
  681. RefreshTempBk LOCAL_VAR1,BK,OFF,1,1
  682. nill LOCAL_VAR1,3
  683. #else
  684. lghi LOCAL_VAR1,3
  685. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3 */
  686. #endif
  687. jz .L1x1_BK_Store
  688. ALIGN_4
  689. .L1x1_BK: /* leftover k loop */
  690. CALC_1x1 LOCAL_VAR3,LOCAL_VAR2
  691. brctg LOCAL_VAR1,.L1x1_BK
  692. ALIGN_4
  693. .L1x1_BK_Store:
  694. /* store the tile: alpha, CIJ_LOCAL and the LDC_BYTE stride go to the store macro */
  695. STORE_1x1 ALPHA ,CIJ_LOCAL, LDC_BYTE
  696. #if defined(TRMMKERNEL)
  697. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,1,1
  698. #endif
  699. ALIGN_2
  700. .Lx1_INNER_END:
  701. /* advance C and B past the single column just processed */
  702. sllg LOCAL_VAR2,BK,2 /* LOCAL_VAR2 = BK * 1 * sizeof(float) = BK << 2 */
  703. la CIJ,0(CIJ,LDC_BYTE) /* CIJ = CIJ + LDC_BYTE */
  704. #if defined(TRMMKERNEL) && !defined(LEFT)
  705. aghi OFF,1
  706. #endif
  707. la B,0(B,LOCAL_VAR2) /* B = B + BK * 1 * sizeof(float) */
  708. ALIGN_2
  709. .L_FUNC_END:
  710. /* Epilogue: reload the registers stored at entry and return through %r14.   */
  /* The TRMM build reloads OFFSET (%f8) and %r6-%r13; the plain build reloads  */
  /* %r6-%r12, mirroring the stores at entry.                                   */
  711. #if defined(TRMMKERNEL)
  712. ld OFFSET,40(%r15)
  713. lmg %r6,%r13,48(%r15)
  714. #else
  715. lmg %r6,%r12,48(%r15)
  716. #endif
  717. br %r14
  718. .end