You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

gemm8x4V.S 15 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613
  1. /***************************************************************************
  2. Copyright (c) 2013-2017, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2017/01/01 AbdelRauf (quickwritereader@gmail.com)
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. **************************************************************************************/
  33. /*********************************************************************/
  34. /* Copyright 2009, 2010 The University of Texas at Austin. */
  35. /* All rights reserved. */
  36. /* */
  37. /* Redistribution and use in source and binary forms, with or */
  38. /* without modification, are permitted provided that the following */
  39. /* conditions are met: */
  40. /* */
  41. /* 1. Redistributions of source code must retain the above */
  42. /* copyright notice, this list of conditions and the following */
  43. /* disclaimer. */
  44. /* */
  45. /* 2. Redistributions in binary form must reproduce the above */
  46. /* copyright notice, this list of conditions and the following */
  47. /* disclaimer in the documentation and/or other materials */
  48. /* provided with the distribution. */
  49. /* */
  50. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  51. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  52. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  53. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  54. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  55. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  56. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  57. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  58. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  59. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  60. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  61. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  62. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  63. /* POSSIBILITY OF SUCH DAMAGE. */
  64. /* */
  65. /* The views and conclusions contained in the software and */
  66. /* documentation are those of the authors and should not be */
  67. /* interpreted as representing official policies, either expressed */
  68. /* or implied, of The University of Texas at Austin. */
  69. /*********************************************************************/
  70. #define ASSEMBLER
  71. #include "common.h"
  72. /*
  73. #BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
  74. ##bm=r2,bn=r3, bk=r4, alpha=f0,ba=r5,bb=r6,C=stack[160] ,ldc=stack[168]
  75. **********************************************************************************************/
  76. /*Note: r0 cannot be used as a base or index register in an address operand, so BM_CUR (r0) is only used as a counter */
  77. #define BM %r2 /* bm argument: row count, consumed in tiles of 8/4/2/1 */
  78. #define BM_CUR %r0 /* countdown of 8-row tiles (BM >> 3) */
  79. #define BN %r3 /* bn argument: column count, consumed in panels of 4/2/1 */
  80. #define BN_CUR %r10 /* countdown of 4-column panels (BN >> 2) */
  81. #define BK %r4 /* bk argument: inner (K) dimension */
  82. #define LDC_BYTE %r8 /* ldc converted to a byte stride (ldc << 3) */
  83. #define ALPHA %f0 /* alpha as passed in by the caller */
  84. #define ALPHA_VECT %v0 /* alpha replicated across the vector register by vrepg */
  85. #define LOCAL_VAR1 %r9 /* scratch: K-loop counter, also temporary for ldc and C advance */
  86. #define LOCAL_VAR2 %r1 /* scratch: B pointer for the current tile, also temporary for B advance */
  87. #define LOCAL_VAR3 %r11 /* running A pointer handed to the CALC_* macros */
  88. #define A %r5 /* ba argument: start of the A panel */
  89. #define B %r6 /* bb argument: start of the current B column panel */
  90. #define CIJ %r7 /* C pointer for the current column panel (loaded from 160(%r15)) */
  91. #define CIJ_LOCAL %r12 /* C pointer for the current tile, handed to the STORE_* macros */
  92. #define ALIGN_4 .align 16 /* note: emits a 16-byte alignment despite the name */
  93. #define ALIGN_2 .align 8 /* note: emits an 8-byte alignment despite the name */
  94. #define PREFETCH_INS 1 /* when defined, the pfd prefetch instructions below are assembled */
  95. #include "kernelMacros.S"
  96. /***********************************DGEMM***********************************************************
   * Double-precision GEMM kernel entry point; arguments arrive as listed in the register map above.
   *
   * Structure (three nested levels):
   *   - columns: BN is consumed as BN>>2 panels of 4 columns (.LX4_BN loop), then one optional
   *     2-column panel (.LX2, taken when bit 1 of BN is set) and one optional 1-column panel
   *     (.Lx1, taken when bit 0 of BN is set);
   *   - rows: inside each panel BM is consumed as BM>>3 tiles of 8 rows, then optional tiles of
   *     4, 2 and 1 rows selected by bits 2, 1 and 0 of BM;
   *   - K: each tile runs BK>>2 passes of CALC_<tile>_4 followed by BK&3 passes of CALC_<tile>.
   * For every tile the sequence is ZERO_CVEC_<tile>, the two K loops, then STORE_<tile>.
   *
   * ZERO_CVEC_*, CALC_* and STORE_* are macros that are not defined in this file (presumably they
   * come from kernelMacros.S). LOCAL_VAR3 (A) and CIJ_LOCAL (C) are never advanced explicitly
   * here, so those macros presumably advance the pointers they are given - confirm there.
   *
   * Registers r6-r12 are saved on entry and restored before returning.
   *
   * NOTE(review): cijle is a 32-bit compare, while the counters it tests are 64-bit results of
   * srlg. cgijle would be the 64-bit form; this only matters if a dimension can exceed 32-bit
   * range - confirm against the callers.
   ***************************************************************************************************/
  97. PROLOGUE
  98. stmg %r6,%r12,48(%r15) /* save r6-r12 at 48(%r15); reloaded by lmg before returning */
  99. lg CIJ, 160(%r15) /* C pointer, passed on the stack at offset 160 */
  100. lg LOCAL_VAR1, 168(%r15) /* ldc, passed on the stack at offset 168 */
  101. srlg BN_CUR,BN,2 /* BN_CUR = BN/4 = number of full 4-column panels */
  102. vrepg ALPHA_VECT,ALPHA_VECT,0 /* replicate element 0 of v0 (alpha, passed in f0) across the vector */
  103. sllg LDC_BYTE, LOCAL_VAR1,3 /* ldc stride in bytes: ldc*sizeof(double) = ldc<<3 */
  104. cijle BN_CUR,0,.LX2 /* no full 4-column panel: continue with the 2-column section */
  105. ALIGN_4
  106. .LX4_BN: /* loop over 4-column panels, BN_CUR iterations */
  107. #if defined(PREFETCH_INS)
  108. pfd 1, 0(A) /* prefetch the start of the A panel */
  109. pfd 1, 256(A)
  110. pfd 1, 0(B) /* prefetch the start of the current B panel */
  111. pfd 1, 256(B)
  112. #endif
  113. srlg BM_CUR,BM,3 /* BM_CUR = BM/8 = number of 8-row tiles */
  114. lgr LOCAL_VAR3,A /* restart at the beginning of A for this panel */
  115. lgr CIJ_LOCAL,CIJ /* C tile pointer starts at the top of this panel */
  116. cijle BM_CUR,0,.L4x4 /* fewer than 8 rows: go straight to the remainder tiles */
  117. ALIGN_4
  118. .L8x4_BM: /* loop over 8-row tiles, BM_CUR iterations */
  119. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK/4: trip count of the CALC_8x4_4 loop */
  120. lgr LOCAL_VAR2,B /* rewind the B pointer to the start of the current panel */
  121. ZERO_CVEC_8x4
  122. cijle LOCAL_VAR1,0,.L8x4_mod /* BK < 4: only the remainder loop runs */
  123. ALIGN_4
  124. .L8x4_4_BK: /* main K loop, BK/4 iterations */
  125. #if defined(PREFETCH_INS)
  126. pfd 1, 512(LOCAL_VAR3) /* prefetch A 512 bytes ahead of the current position */
  127. #endif
  128. CALC_8x4_4 LOCAL_VAR3,LOCAL_VAR2
  129. #if defined(PREFETCH_INS)
  130. pfd 1, 512(LOCAL_VAR2) /* prefetch B 512 bytes ahead of the current position */
  131. #endif
  132. brctg LOCAL_VAR1,.L8x4_4_BK
  133. ALIGN_4
  134. .L8x4_mod:
  135. lghi LOCAL_VAR1,3
  136. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3: leftover k-steps */
  137. jz .L8x4_BK_Store /* none left: store the tile */
  138. ALIGN_4
  139. .L8x4_BK: /* remainder K loop, BK & 3 iterations */
  140. CALC_8x4 LOCAL_VAR3,LOCAL_VAR2
  141. brctg LOCAL_VAR1,.L8x4_BK
  142. ALIGN_4
  143. .L8x4_BK_Store:
  144. /* store the tile: the macro receives alpha, the C tile pointer CIJ_LOCAL and the byte stride LDC_BYTE */
  145. STORE_8x4 ALPHA_VECT,CIJ_LOCAL, LDC_BYTE
  146. brctg BM_CUR,.L8x4_BM
  147. ALIGN_4
  148. .L4x4:
  149. tmll BM,4 /* bit 2 of BM clear: no 4-row tile */
  150. jz .L2x4
  151. ALIGN_4
  152. .L4x4_BM: /* single 4-row tile */
  153. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK/4: trip count of the CALC_4x4_4 loop */
  154. lgr LOCAL_VAR2,B /* rewind the B pointer to the start of the current panel */
  155. ZERO_CVEC_4x4
  156. cijle LOCAL_VAR1,0,.L4x4_mod /* BK < 4: only the remainder loop runs */
  157. ALIGN_4
  158. .L4x4_4_BK: /* main K loop, BK/4 iterations */
  159. CALC_4x4_4 LOCAL_VAR3,LOCAL_VAR2
  160. brctg LOCAL_VAR1,.L4x4_4_BK
  161. ALIGN_4
  162. .L4x4_mod:
  163. lghi LOCAL_VAR1,3
  164. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3: leftover k-steps */
  165. jz .L4x4_BK_Store
  166. ALIGN_4
  167. .L4x4_BK: /* remainder K loop, BK & 3 iterations */
  168. CALC_4x4 LOCAL_VAR3,LOCAL_VAR2
  169. brctg LOCAL_VAR1,.L4x4_BK
  170. ALIGN_4
  171. .L4x4_BK_Store:
  172. /* store the tile: alpha, C tile pointer CIJ_LOCAL, byte stride LDC_BYTE */
  173. STORE_4x4 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE
  174. ALIGN_2
  175. .L2x4:
  176. tmll BM,2 /* bit 1 of BM clear: no 2-row tile */
  177. jz .L1x4
  178. ALIGN_4
  179. .L2x4_BM: /* single 2-row tile */
  180. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK/4: trip count of the CALC_2x4_4 loop */
  181. lgr LOCAL_VAR2,B /* rewind the B pointer to the start of the current panel */
  182. ZERO_CVEC_2x4
  183. cijle LOCAL_VAR1,0,.L2x4_mod /* BK < 4: only the remainder loop runs */
  184. ALIGN_4
  185. .L2x4_4_BK: /* main K loop, BK/4 iterations */
  186. CALC_2x4_4 LOCAL_VAR3,LOCAL_VAR2
  187. brctg LOCAL_VAR1,.L2x4_4_BK
  188. ALIGN_4
  189. .L2x4_mod:
  190. lghi LOCAL_VAR1,3
  191. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3: leftover k-steps */
  192. jz .L2x4_BK_Store
  193. ALIGN_4
  194. .L2x4_BK: /* remainder K loop, BK & 3 iterations */
  195. CALC_2x4 LOCAL_VAR3,LOCAL_VAR2
  196. brctg LOCAL_VAR1,.L2x4_BK
  197. ALIGN_4
  198. .L2x4_BK_Store:
  199. /* store the tile: alpha, C tile pointer CIJ_LOCAL, byte stride LDC_BYTE */
  200. STORE_2x4 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE
  201. ALIGN_4
  202. .L1x4:
  203. tmll BM,1 /* bit 0 of BM clear: no 1-row tile */
  204. jz .Lx4_INNER_END
  205. ALIGN_4
  206. .L1x4_BM: /* single 1-row tile */
  207. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK/4: trip count of the CALC_1x4_4 loop */
  208. lgr LOCAL_VAR2,B /* rewind the B pointer to the start of the current panel */
  209. ZERO_CVEC_1x4
  210. cijle LOCAL_VAR1,0,.L1x4_mod /* BK < 4: only the remainder loop runs */
  211. ALIGN_4
  212. .L1x4_4_BK: /* main K loop, BK/4 iterations */
  213. CALC_1x4_4 LOCAL_VAR3,LOCAL_VAR2
  214. brctg LOCAL_VAR1,.L1x4_4_BK
  215. ALIGN_4
  216. .L1x4_mod:
  217. lghi LOCAL_VAR1,3
  218. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3: leftover k-steps */
  219. jz .L1x4_BK_Store
  220. ALIGN_4
  221. .L1x4_BK: /* remainder K loop, BK & 3 iterations */
  222. CALC_1x4 LOCAL_VAR3,LOCAL_VAR2
  223. brctg LOCAL_VAR1,.L1x4_BK
  224. ALIGN_4
  225. .L1x4_BK_Store:
  226. /* store the tile: alpha, C tile pointer CIJ_LOCAL, byte stride LDC_BYTE */
  227. STORE_1x4 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE
  228. ALIGN_2
  229. .Lx4_INNER_END:
  230. /* advance C and B to the next 4-column panel */
  231. sllg LOCAL_VAR1,LDC_BYTE,2 /* LOCAL_VAR1 = LDC_BYTE*4 (four columns of C) */
  232. sllg LOCAL_VAR2,BK,5 /* LOCAL_VAR2 = BK*4*sizeof(double) = BK*32 = BK<<5 */
  233. la CIJ,0(CIJ,LOCAL_VAR1) /* CIJ = CIJ + LDC_BYTE*4 */
  234. la B,0(B,LOCAL_VAR2) /* B = B + BK*4*sizeof(double) */
  235. brctg BN_CUR,.LX4_BN
  236. /*********************************X2 SECTION************************************************/
  237. ALIGN_4
  238. .LX2:
  239. tmll BN,2 /* bit 1 of BN clear: no 2-column panel */
  240. jz .Lx1
  241. ALIGN_4
  242. .Lx2_BN: /* single 2-column panel */
  243. srlg BM_CUR,BM,3 /* BM_CUR = BM/8 = number of 8-row tiles */
  244. lgr LOCAL_VAR3,A /* restart at the beginning of A for this panel */
  245. lgr CIJ_LOCAL,CIJ /* C tile pointer starts at the top of this panel */
  246. cijle BM_CUR,0,.L4x2 /* fewer than 8 rows: go straight to the remainder tiles */
  247. ALIGN_4
  248. .L8x2_BM: /* loop over 8-row tiles, BM_CUR iterations */
  249. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK/4: trip count of the CALC_8x2_4 loop */
  250. lgr LOCAL_VAR2,B /* rewind the B pointer to the start of the current panel */
  251. ZERO_CVEC_8x2
  252. cijle LOCAL_VAR1,0,.L8x2_mod /* BK < 4: only the remainder loop runs */
  253. ALIGN_4
  254. .L8x2_4_BK: /* main K loop, BK/4 iterations */
  255. #if defined(PREFETCH_INS)
  256. pfd 1, 256(LOCAL_VAR3) /* prefetch A 256 bytes ahead of the current position */
  257. pfd 1,64(LOCAL_VAR2) /* prefetch B 64 bytes ahead of the current position */
  258. #endif
  259. CALC_8x2_4 LOCAL_VAR3,LOCAL_VAR2
  260. brctg LOCAL_VAR1,.L8x2_4_BK
  261. ALIGN_4
  262. .L8x2_mod:
  263. lghi LOCAL_VAR1,3
  264. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3: leftover k-steps */
  265. jz .L8x2_BK_Store
  266. ALIGN_4
  267. .L8x2_BK: /* remainder K loop, BK & 3 iterations */
  268. CALC_8x2 LOCAL_VAR3,LOCAL_VAR2
  269. brctg LOCAL_VAR1,.L8x2_BK
  270. ALIGN_4
  271. .L8x2_BK_Store:
  272. /* store the tile: alpha, C tile pointer CIJ_LOCAL, byte stride LDC_BYTE */
  273. STORE_8x2 ALPHA_VECT,CIJ_LOCAL, LDC_BYTE
  274. ALIGN_4
  275. brctg BM_CUR,.L8x2_BM
  276. ALIGN_2
  277. .L4x2:
  278. tmll BM,4 /* bit 2 of BM clear: no 4-row tile */
  279. jz .L2x2
  280. ALIGN_4
  281. .L4x2_BM: /* single 4-row tile */
  282. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK/4: trip count of the CALC_4x2_4 loop */
  283. lgr LOCAL_VAR2,B /* rewind the B pointer to the start of the current panel */
  284. ZERO_CVEC_4x2
  285. cijle LOCAL_VAR1,0,.L4x2_mod /* BK < 4: only the remainder loop runs */
  286. ALIGN_4
  287. .L4x2_4_BK: /* main K loop, BK/4 iterations */
  288. CALC_4x2_4 LOCAL_VAR3,LOCAL_VAR2
  289. brctg LOCAL_VAR1,.L4x2_4_BK
  290. ALIGN_4
  291. .L4x2_mod:
  292. lghi LOCAL_VAR1,3
  293. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3: leftover k-steps */
  294. jz .L4x2_BK_Store
  295. ALIGN_4
  296. .L4x2_BK: /* remainder K loop, BK & 3 iterations */
  297. CALC_4x2 LOCAL_VAR3,LOCAL_VAR2
  298. brctg LOCAL_VAR1,.L4x2_BK
  299. ALIGN_4
  300. .L4x2_BK_Store:
  301. /* store the tile: alpha, C tile pointer CIJ_LOCAL, byte stride LDC_BYTE */
  302. STORE_4x2 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE
  303. ALIGN_2
  304. .L2x2:
  305. tmll BM,2 /* bit 1 of BM clear: no 2-row tile */
  306. jz .L1x2
  307. ALIGN_4
  308. .L2x2_BM: /* single 2-row tile */
  309. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK/4: trip count of the CALC_2x2_4 loop */
  310. lgr LOCAL_VAR2,B /* rewind the B pointer to the start of the current panel */
  311. ZERO_CVEC_2x2
  312. cijle LOCAL_VAR1,0,.L2x2_mod /* BK < 4: only the remainder loop runs */
  313. ALIGN_4
  314. .L2x2_4_BK: /* main K loop, BK/4 iterations */
  315. CALC_2x2_4 LOCAL_VAR3,LOCAL_VAR2
  316. brctg LOCAL_VAR1,.L2x2_4_BK
  317. ALIGN_4
  318. .L2x2_mod:
  319. lghi LOCAL_VAR1,3
  320. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3: leftover k-steps */
  321. jz .L2x2_BK_Store
  322. ALIGN_4
  323. .L2x2_BK: /* remainder K loop, BK & 3 iterations */
  324. CALC_2x2 LOCAL_VAR3,LOCAL_VAR2
  325. brctg LOCAL_VAR1,.L2x2_BK
  326. ALIGN_4
  327. .L2x2_BK_Store:
  328. /* store the tile: alpha, C tile pointer CIJ_LOCAL, byte stride LDC_BYTE */
  329. STORE_2x2 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE
  330. ALIGN_2
  331. .L1x2:
  332. tmll BM,1 /* bit 0 of BM clear: no 1-row tile */
  333. jz .Lx2_INNER_END
  334. ALIGN_4
  335. .L1x2_BM: /* single 1-row tile */
  336. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK/4: trip count of the CALC_1x2_4 loop */
  337. lgr LOCAL_VAR2,B /* rewind the B pointer to the start of the current panel */
  338. ZERO_CVEC_1x2
  339. cijle LOCAL_VAR1,0,.L1x2_mod /* BK < 4: only the remainder loop runs */
  340. ALIGN_4
  341. .L1x2_4_BK: /* main K loop, BK/4 iterations */
  342. CALC_1x2_4 LOCAL_VAR3,LOCAL_VAR2
  343. brctg LOCAL_VAR1,.L1x2_4_BK
  344. ALIGN_4
  345. .L1x2_mod:
  346. lghi LOCAL_VAR1,3
  347. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3: leftover k-steps */
  348. jz .L1x2_BK_Store
  349. ALIGN_4
  350. .L1x2_BK: /* remainder K loop, BK & 3 iterations */
  351. CALC_1x2 LOCAL_VAR3,LOCAL_VAR2
  352. brctg LOCAL_VAR1,.L1x2_BK
  353. ALIGN_4
  354. .L1x2_BK_Store:
  355. /* store the tile: alpha, C tile pointer CIJ_LOCAL, byte stride LDC_BYTE */
  356. STORE_1x2 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE
  357. ALIGN_2
  358. .Lx2_INNER_END:
  359. /* advance C and B past the 2-column panel */
  360. la LOCAL_VAR1,0(LDC_BYTE,LDC_BYTE) /* LOCAL_VAR1 = LDC_BYTE*2 (two columns of C) */
  361. sllg LOCAL_VAR2,BK,4 /* LOCAL_VAR2 = BK*2*sizeof(double) = BK*16 = BK<<4 */
  362. la CIJ,0(CIJ,LOCAL_VAR1) /* CIJ = CIJ + LDC_BYTE*2 */
  363. la B,0(B,LOCAL_VAR2) /* B = B + BK*2*sizeof(double) */
  364. /*********************************X1 SECTION************************************************/
  365. ALIGN_2
  366. .Lx1:
  367. tmll BN,1 /* bit 0 of BN clear: no 1-column panel, finish */
  368. jz .L_FUNC_END
  369. ALIGN_4
  370. .Lx1_BN: /* single 1-column panel */
  371. srlg BM_CUR,BM,3 /* BM_CUR = BM/8 = number of 8-row tiles */
  372. lgr LOCAL_VAR3,A /* restart at the beginning of A for this panel */
  373. lgr CIJ_LOCAL,CIJ /* C tile pointer starts at the top of this panel */
  374. cijle BM_CUR,0,.L4x1 /* fewer than 8 rows: go straight to the remainder tiles */
  375. ALIGN_4
  376. .L8x1_BM: /* loop over 8-row tiles, BM_CUR iterations */
  377. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK/4: trip count of the CALC_8x1_4 loop */
  378. lgr LOCAL_VAR2,B /* rewind the B pointer to the start of the current panel */
  379. ZERO_CVEC_8x1
  380. cijle LOCAL_VAR1,0,.L8x1_mod /* BK < 4: only the remainder loop runs */
  381. ALIGN_4
  382. .L8x1_4_BK: /* main K loop, BK/4 iterations */
  383. #if defined(PREFETCH_INS)
  384. pfd 1, 256(LOCAL_VAR3) /* prefetch A 256 bytes ahead of the current position */
  385. #endif
  386. CALC_8x1_4 LOCAL_VAR3,LOCAL_VAR2
  387. brctg LOCAL_VAR1,.L8x1_4_BK
  388. ALIGN_4
  389. .L8x1_mod:
  390. lghi LOCAL_VAR1,3
  391. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3: leftover k-steps */
  392. jz .L8x1_BK_Store
  393. ALIGN_4
  394. .L8x1_BK: /* remainder K loop, BK & 3 iterations */
  395. CALC_8x1 LOCAL_VAR3,LOCAL_VAR2
  396. brctg LOCAL_VAR1,.L8x1_BK
  397. ALIGN_4
  398. .L8x1_BK_Store:
  399. /* store the tile: alpha, C tile pointer CIJ_LOCAL, byte stride LDC_BYTE */
  400. STORE_8x1 ALPHA_VECT,CIJ_LOCAL, LDC_BYTE
  401. ALIGN_4
  402. brctg BM_CUR,.L8x1_BM
  403. ALIGN_2
  404. .L4x1:
  405. tmll BM,4 /* bit 2 of BM clear: no 4-row tile */
  406. jz .L2x1
  407. ALIGN_4
  408. .L4x1_BM: /* single 4-row tile */
  409. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK/4: trip count of the CALC_4x1_4 loop */
  410. lgr LOCAL_VAR2,B /* rewind the B pointer to the start of the current panel */
  411. ZERO_CVEC_4x1
  412. cijle LOCAL_VAR1,0,.L4x1_mod /* BK < 4: only the remainder loop runs */
  413. ALIGN_4
  414. .L4x1_4_BK: /* main K loop, BK/4 iterations */
  415. CALC_4x1_4 LOCAL_VAR3,LOCAL_VAR2
  416. brctg LOCAL_VAR1,.L4x1_4_BK
  417. ALIGN_4
  418. .L4x1_mod:
  419. lghi LOCAL_VAR1,3
  420. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3: leftover k-steps */
  421. jz .L4x1_BK_Store
  422. ALIGN_4
  423. .L4x1_BK: /* remainder K loop, BK & 3 iterations */
  424. CALC_4x1 LOCAL_VAR3,LOCAL_VAR2
  425. brctg LOCAL_VAR1,.L4x1_BK
  426. ALIGN_4
  427. .L4x1_BK_Store:
  428. /* store the tile: alpha, C tile pointer CIJ_LOCAL, byte stride LDC_BYTE */
  429. STORE_4x1 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE
  430. ALIGN_2
  431. .L2x1:
  432. tmll BM,2 /* bit 1 of BM clear: no 2-row tile */
  433. jz .L1x1
  434. ALIGN_4
  435. .L2x1_BM: /* single 2-row tile */
  436. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK/4: trip count of the CALC_2x1_4 loop */
  437. lgr LOCAL_VAR2,B /* rewind the B pointer to the start of the current panel */
  438. ZERO_CVEC_2x1
  439. cijle LOCAL_VAR1,0,.L2x1_mod /* BK < 4: only the remainder loop runs */
  440. ALIGN_4
  441. .L2x1_4_BK: /* main K loop, BK/4 iterations */
  442. CALC_2x1_4 LOCAL_VAR3,LOCAL_VAR2
  443. brctg LOCAL_VAR1,.L2x1_4_BK
  444. ALIGN_4
  445. .L2x1_mod:
  446. lghi LOCAL_VAR1,3
  447. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3: leftover k-steps */
  448. jz .L2x1_BK_Store
  449. ALIGN_4
  450. .L2x1_BK: /* remainder K loop, BK & 3 iterations */
  451. CALC_2x1 LOCAL_VAR3,LOCAL_VAR2
  452. brctg LOCAL_VAR1,.L2x1_BK
  453. ALIGN_4
  454. .L2x1_BK_Store:
  455. /* store the tile: alpha, C tile pointer CIJ_LOCAL, byte stride LDC_BYTE */
  456. STORE_2x1 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE
  457. ALIGN_2
  458. .L1x1:
  459. tmll BM, 1 /* bit 0 of BM clear: no 1-row tile */
  460. jz .Lx1_INNER_END
  461. ALIGN_4
  462. .L1x1_BM: /* single 1-row tile */
  463. srlg LOCAL_VAR1,BK,2 /* LOCAL_VAR1 = BK/4: trip count of the CALC_1x1_4 loop */
  464. lgr LOCAL_VAR2,B /* rewind the B pointer to the start of the current panel */
  465. ZERO_CVEC_1x1
  466. cijle LOCAL_VAR1,0,.L1x1_mod /* BK < 4: only the remainder loop runs */
  467. ALIGN_4
  468. .L1x1_4_BK: /* main K loop, BK/4 iterations */
  469. CALC_1x1_4 LOCAL_VAR3,LOCAL_VAR2
  470. brctg LOCAL_VAR1,.L1x1_4_BK
  471. ALIGN_4
  472. .L1x1_mod:
  473. lghi LOCAL_VAR1,3
  474. NGR LOCAL_VAR1,BK /* LOCAL_VAR1 = BK & 3: leftover k-steps */
  475. jz .L1x1_BK_Store
  476. ALIGN_4
  477. .L1x1_BK: /* remainder K loop, BK & 3 iterations */
  478. CALC_1x1 LOCAL_VAR3,LOCAL_VAR2
  479. brctg LOCAL_VAR1,.L1x1_BK
  480. ALIGN_4
  481. .L1x1_BK_Store:
  482. /* store the tile; unlike the other STORE_* calls this one is given ALPHA (f0), not ALPHA_VECT */
  483. STORE_1x1 ALPHA ,CIJ_LOCAL, LDC_BYTE
  484. ALIGN_2
  485. .Lx1_INNER_END:
  486. /* advance C and B past the 1-column panel; neither is read again before r6-r12 are restored */
  487. sllg LOCAL_VAR2,BK,3 /* LOCAL_VAR2 = BK*1*sizeof(double) = BK*8 = BK<<3 */
  488. la CIJ,0(CIJ,LDC_BYTE) /* CIJ = CIJ + LDC_BYTE */
  489. la B,0(B,LOCAL_VAR2) /* B = B + BK*1*sizeof(double) */
  490. ALIGN_2
  491. .L_FUNC_END:
  492. /* restore the registers saved on entry and return to the caller */
  493. lmg %r6,%r12,48(%r15)
  494. br %r14
  495. EPILOGUE
  496. .end