You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

dtrsm_kernel_LN_bulldozer.c 22 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #include "common.h"
  39. static FLOAT dm1 = -1.;
  40. #ifdef CONJ
  41. #define GEMM_KERNEL GEMM_KERNEL_L
  42. #else
  43. #define GEMM_KERNEL GEMM_KERNEL_N
  44. #endif
  45. #if GEMM_DEFAULT_UNROLL_M == 1
  46. #define GEMM_UNROLL_M_SHIFT 0
  47. #endif
  48. #if GEMM_DEFAULT_UNROLL_M == 2
  49. #define GEMM_UNROLL_M_SHIFT 1
  50. #endif
  51. #if GEMM_DEFAULT_UNROLL_M == 4
  52. #define GEMM_UNROLL_M_SHIFT 2
  53. #endif
  54. #if GEMM_DEFAULT_UNROLL_M == 6
  55. #define GEMM_UNROLL_M_SHIFT 2
  56. #endif
  57. #if GEMM_DEFAULT_UNROLL_M == 8
  58. #define GEMM_UNROLL_M_SHIFT 3
  59. #endif
  60. #if GEMM_DEFAULT_UNROLL_M == 16
  61. #define GEMM_UNROLL_M_SHIFT 4
  62. #endif
  63. #if GEMM_DEFAULT_UNROLL_N == 1
  64. #define GEMM_UNROLL_N_SHIFT 0
  65. #endif
  66. #if GEMM_DEFAULT_UNROLL_N == 2
  67. #define GEMM_UNROLL_N_SHIFT 1
  68. #endif
  69. #if GEMM_DEFAULT_UNROLL_N == 4
  70. #define GEMM_UNROLL_N_SHIFT 2
  71. #endif
  72. #if GEMM_DEFAULT_UNROLL_N == 8
  73. #define GEMM_UNROLL_N_SHIFT 3
  74. #endif
  75. #if GEMM_DEFAULT_UNROLL_N == 16
  76. #define GEMM_UNROLL_N_SHIFT 4
  77. #endif
  78. static void dtrsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) __attribute__ ((noinline));
  79. static void dtrsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs)
  80. {
  81. FLOAT *c1 = c + ldc ;
  82. BLASLONG n1 = n * 8;
  83. BLASLONG i=0;
  84. bs += 14;
  85. __asm__ __volatile__
  86. (
  87. " vzeroupper \n\t"
  88. " prefetcht0 (%4) \n\t"
  89. " prefetcht0 (%5) \n\t"
  90. " vxorpd %%xmm8 , %%xmm8 , %%xmm8 \n\t"
  91. " vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t"
  92. " vxorpd %%xmm10, %%xmm10, %%xmm10 \n\t"
  93. " vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t"
  94. " vxorpd %%xmm12, %%xmm12, %%xmm12 \n\t"
  95. " vxorpd %%xmm13, %%xmm13, %%xmm13 \n\t"
  96. " vxorpd %%xmm14, %%xmm14, %%xmm14 \n\t"
  97. " vxorpd %%xmm15, %%xmm15, %%xmm15 \n\t"
  98. " cmpq $0, %0 \n\t"
  99. " je 2f \n\t"
  100. " .align 16 \n\t"
  101. "1: \n\t"
  102. " prefetcht0 384(%2,%1,8) \n\t"
  103. " prefetcht0 384(%3,%1,8) \n\t"
  104. " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
  105. " vmovups (%2,%1,8), %%xmm4 \n\t"
  106. " vmovddup 8(%3,%1,2), %%xmm1 \n\t"
  107. " vmovups 16(%2,%1,8), %%xmm5 \n\t"
  108. " vmovups 32(%2,%1,8), %%xmm6 \n\t"
  109. " vmovups 48(%2,%1,8), %%xmm7 \n\t"
  110. " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
  111. " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
  112. " vfmaddpd %%xmm9 , %%xmm0 , %%xmm5 , %%xmm9 \n\t"
  113. " vfmaddpd %%xmm13, %%xmm1 , %%xmm5 , %%xmm13 \n\t"
  114. " vfmaddpd %%xmm10, %%xmm0 , %%xmm6 , %%xmm10 \n\t"
  115. " vfmaddpd %%xmm14, %%xmm1 , %%xmm6 , %%xmm14 \n\t"
  116. " addq $8, %1 \n\t"
  117. " vfmaddpd %%xmm11, %%xmm0 , %%xmm7 , %%xmm11 \n\t"
  118. " vfmaddpd %%xmm15, %%xmm1 , %%xmm7 , %%xmm15 \n\t"
  119. " cmpq %1, %0 \n\t"
  120. " jz 2f \n\t"
  121. " prefetcht0 384(%2,%1,8) \n\t"
  122. " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
  123. " vmovups (%2,%1,8), %%xmm4 \n\t"
  124. " vmovddup 8(%3,%1,2), %%xmm1 \n\t"
  125. " vmovups 16(%2,%1,8), %%xmm5 \n\t"
  126. " vmovups 32(%2,%1,8), %%xmm6 \n\t"
  127. " vmovups 48(%2,%1,8), %%xmm7 \n\t"
  128. " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
  129. " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
  130. " vfmaddpd %%xmm9 , %%xmm0 , %%xmm5 , %%xmm9 \n\t"
  131. " vfmaddpd %%xmm13, %%xmm1 , %%xmm5 , %%xmm13 \n\t"
  132. " vfmaddpd %%xmm10, %%xmm0 , %%xmm6 , %%xmm10 \n\t"
  133. " vfmaddpd %%xmm14, %%xmm1 , %%xmm6 , %%xmm14 \n\t"
  134. " addq $8, %1 \n\t"
  135. " vfmaddpd %%xmm11, %%xmm0 , %%xmm7 , %%xmm11 \n\t"
  136. " vfmaddpd %%xmm15, %%xmm1 , %%xmm7 , %%xmm15 \n\t"
  137. " cmpq %1, %0 \n\t"
  138. " jz 2f \n\t"
  139. " prefetcht0 384(%2,%1,8) \n\t"
  140. " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
  141. " vmovups (%2,%1,8), %%xmm4 \n\t"
  142. " vmovddup 8(%3,%1,2), %%xmm1 \n\t"
  143. " vmovups 16(%2,%1,8), %%xmm5 \n\t"
  144. " vmovups 32(%2,%1,8), %%xmm6 \n\t"
  145. " vmovups 48(%2,%1,8), %%xmm7 \n\t"
  146. " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
  147. " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
  148. " vfmaddpd %%xmm9 , %%xmm0 , %%xmm5 , %%xmm9 \n\t"
  149. " vfmaddpd %%xmm13, %%xmm1 , %%xmm5 , %%xmm13 \n\t"
  150. " vfmaddpd %%xmm10, %%xmm0 , %%xmm6 , %%xmm10 \n\t"
  151. " vfmaddpd %%xmm14, %%xmm1 , %%xmm6 , %%xmm14 \n\t"
  152. " addq $8, %1 \n\t"
  153. " vfmaddpd %%xmm11, %%xmm0 , %%xmm7 , %%xmm11 \n\t"
  154. " vfmaddpd %%xmm15, %%xmm1 , %%xmm7 , %%xmm15 \n\t"
  155. " cmpq %1, %0 \n\t"
  156. " jz 2f \n\t"
  157. " prefetcht0 384(%2,%1,8) \n\t"
  158. " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
  159. " vmovddup 8(%3,%1,2), %%xmm1 \n\t"
  160. " vmovups (%2,%1,8), %%xmm4 \n\t"
  161. " vmovups 16(%2,%1,8), %%xmm5 \n\t"
  162. " vmovups 32(%2,%1,8), %%xmm6 \n\t"
  163. " vmovups 48(%2,%1,8), %%xmm7 \n\t"
  164. " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
  165. " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
  166. " vfmaddpd %%xmm9 , %%xmm0 , %%xmm5 , %%xmm9 \n\t"
  167. " vfmaddpd %%xmm13, %%xmm1 , %%xmm5 , %%xmm13 \n\t"
  168. " vfmaddpd %%xmm10, %%xmm0 , %%xmm6 , %%xmm10 \n\t"
  169. " vfmaddpd %%xmm14, %%xmm1 , %%xmm6 , %%xmm14 \n\t"
  170. " addq $8, %1 \n\t"
  171. " vfmaddpd %%xmm11, %%xmm0 , %%xmm7 , %%xmm11 \n\t"
  172. " vfmaddpd %%xmm15, %%xmm1 , %%xmm7 , %%xmm15 \n\t"
  173. " cmpq %1, %0 \n\t"
  174. " jnz 1b \n\t"
  175. "2: \n\t"
  176. " vmovups (%4) , %%xmm0 \n\t"
  177. " vmovups 16(%4) , %%xmm1 \n\t"
  178. " vmovups 32(%4) , %%xmm2 \n\t"
  179. " vmovups 48(%4) , %%xmm3 \n\t"
  180. " vmovups (%5) , %%xmm4 \n\t"
  181. " vmovups 16(%5) , %%xmm5 \n\t"
  182. " vmovups 32(%5) , %%xmm6 \n\t"
  183. " vmovups 48(%5) , %%xmm7 \n\t"
  184. " vsubpd %%xmm8 , %%xmm0 , %%xmm8 \n\t"
  185. " vsubpd %%xmm9 , %%xmm1 , %%xmm9 \n\t"
  186. " vsubpd %%xmm10, %%xmm2 , %%xmm10 \n\t"
  187. " vsubpd %%xmm11, %%xmm3 , %%xmm11 \n\t"
  188. " vsubpd %%xmm12, %%xmm4 , %%xmm12 \n\t"
  189. " vsubpd %%xmm13, %%xmm5 , %%xmm13 \n\t"
  190. " vsubpd %%xmm14, %%xmm6 , %%xmm14 \n\t"
  191. " vsubpd %%xmm15, %%xmm7 , %%xmm15 \n\t"
  192. "3: \n\t"
  193. " movq $56, %1 \n\t" // i = 7
  194. " xorq %0, %0 \n\t" // pointer for bs
  195. " vmovddup 56(%6, %1, 8) , %%xmm0 \n\t" // read aa[i]
  196. " vunpckhpd %%xmm11 , %%xmm11, %%xmm1 \n\t" // extract bb0
  197. " vunpckhpd %%xmm15 , %%xmm15, %%xmm2 \n\t" // extract bb1
  198. " vmulpd %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
  199. " vmulpd %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb0 * aa
  200. " vmovsd %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa
  201. " vmovsd %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa
  202. " vmovsd %%xmm1 , (%7 , %0, 8) \n\t" // b[0] = bb0 * aa
  203. " vmovsd %%xmm2 , 8(%7 , %0, 8) \n\t" // b[1] = bb1 * aa
  204. " vmovups 0(%6, %1, 8) , %%xmm4 \n\t" // read a[k]
  205. " vmovups 16(%6, %1, 8) , %%xmm5 \n\t" // read a[k]
  206. " vmovups 32(%6, %1, 8) , %%xmm6 \n\t" // read a[k]
  207. " vmovsd 48(%6, %1, 8) , %%xmm7 \n\t" // read a[k]
  208. " vfnmaddpd %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
  209. " vfnmaddpd %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
  210. " vfnmaddpd %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
  211. " vfnmaddpd %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
  212. " vfnmaddpd %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
  213. " vfnmaddpd %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"
  214. " vfnmaddsd %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
  215. " vfnmaddsd %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
  216. " subq $8, %1 \n\t" // i = 6
  217. " subq $2, %0 \n\t" // b-= n
  218. " vmovddup 48(%6, %1, 8) , %%xmm0 \n\t" // read aa[i]
  219. " vunpcklpd %%xmm11 , %%xmm11, %%xmm1 \n\t" // extract bb0
  220. " vunpcklpd %%xmm15 , %%xmm15, %%xmm2 \n\t" // extract bb1
  221. " vmulpd %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
  222. " vmulpd %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
  223. " vmovsd %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa
  224. " vmovsd %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa
  225. " vmovsd %%xmm1 , (%7 , %0, 8) \n\t" // b[0] = bb0 * aa
  226. " vmovsd %%xmm2 , 8(%7 , %0, 8) \n\t" // b[1] = bb1 * aa
  227. " vmovups 0(%6, %1, 8) , %%xmm4 \n\t" // read a[k]
  228. " vmovups 16(%6, %1, 8) , %%xmm5 \n\t" // read a[k]
  229. " vmovups 32(%6, %1, 8) , %%xmm6 \n\t" // read a[k]
  230. " vfnmaddpd %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
  231. " vfnmaddpd %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
  232. " vfnmaddpd %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
  233. " vfnmaddpd %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
  234. " vfnmaddpd %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
  235. " vfnmaddpd %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"
  236. " subq $8, %1 \n\t" // i = 5
  237. " subq $2, %0 \n\t" // b-= n
  238. " vmovddup 40(%6, %1, 8) , %%xmm0 \n\t" // read aa[i]
  239. " vunpckhpd %%xmm10 , %%xmm10, %%xmm1 \n\t" // extract bb0
  240. " vunpckhpd %%xmm14 , %%xmm14, %%xmm2 \n\t" // extract bb1
  241. " vmulpd %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
  242. " vmulpd %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb0 * aa
  243. " vmovsd %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa
  244. " vmovsd %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa
  245. " vmovsd %%xmm1 , (%7 , %0, 8) \n\t" // b[0] = bb0 * aa
  246. " vmovsd %%xmm2 , 8(%7 , %0, 8) \n\t" // b[1] = bb1 * aa
  247. " vmovups 0(%6, %1, 8) , %%xmm4 \n\t" // read a[k]
  248. " vmovups 16(%6, %1, 8) , %%xmm5 \n\t" // read a[k]
  249. " vmovsd 32(%6, %1, 8) , %%xmm6 \n\t" // read a[k]
  250. " vfnmaddpd %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
  251. " vfnmaddpd %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
  252. " vfnmaddpd %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
  253. " vfnmaddpd %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
  254. " vfnmaddsd %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
  255. " vfnmaddsd %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"
  256. " subq $8, %1 \n\t" // i = 4
  257. " subq $2, %0 \n\t" // b-= n
  258. " vmovddup 32(%6, %1, 8) , %%xmm0 \n\t" // read aa[i]
  259. " vunpcklpd %%xmm10 , %%xmm10, %%xmm1 \n\t" // extract bb0
  260. " vunpcklpd %%xmm14 , %%xmm14, %%xmm2 \n\t" // extract bb1
  261. " vmulpd %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
  262. " vmulpd %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb0 * aa
  263. " vmovsd %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa
  264. " vmovsd %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa
  265. " vmovsd %%xmm1 , (%7 , %0, 8) \n\t" // b[0] = bb0 * aa
  266. " vmovsd %%xmm2 , 8(%7 , %0, 8) \n\t" // b[1] = bb1 * aa
  267. " vmovups 0(%6, %1, 8) , %%xmm4 \n\t" // read a[k]
  268. " vmovups 16(%6, %1, 8) , %%xmm5 \n\t" // read a[k]
  269. " vfnmaddpd %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
  270. " vfnmaddpd %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
  271. " vfnmaddpd %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
  272. " vfnmaddpd %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
  273. " subq $8, %1 \n\t" // i = 3
  274. " subq $2, %0 \n\t" // b-= n
  275. " vmovddup 24(%6, %1, 8) , %%xmm0 \n\t" // read aa[i]
  276. " vunpckhpd %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
  277. " vunpckhpd %%xmm13 , %%xmm13, %%xmm2 \n\t" // extract bb1
  278. " vmulpd %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
  279. " vmulpd %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb0 * aa
  280. " vmovsd %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa
  281. " vmovsd %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa
  282. " vmovsd %%xmm1 , (%7 , %0, 8) \n\t" // b[0] = bb0 * aa
  283. " vmovsd %%xmm2 , 8(%7 , %0, 8) \n\t" // b[1] = bb1 * aa
  284. " vmovups 0(%6, %1, 8) , %%xmm4 \n\t" // read a[k]
  285. " vmovsd 16(%6, %1, 8) , %%xmm5 \n\t" // read a[k]
  286. " vfnmaddpd %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
  287. " vfnmaddpd %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
  288. " vfnmaddsd %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
  289. " vfnmaddsd %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
  290. " subq $8, %1 \n\t" // i = 2
  291. " subq $2, %0 \n\t" // b-= n
  292. " vmovddup 16(%6, %1, 8) , %%xmm0 \n\t" // read aa[i]
  293. " vunpcklpd %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
  294. " vunpcklpd %%xmm13 , %%xmm13, %%xmm2 \n\t" // extract bb1
  295. " vmulpd %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
  296. " vmulpd %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb0 * aa
  297. " vmovsd %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa
  298. " vmovsd %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa
  299. " vmovsd %%xmm1 , (%7 , %0, 8) \n\t" // b[0] = bb0 * aa
  300. " vmovsd %%xmm2 , 8(%7 , %0, 8) \n\t" // b[1] = bb1 * aa
  301. " vmovups 0(%6, %1, 8) , %%xmm4 \n\t" // read a[k]
  302. " vfnmaddpd %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
  303. " vfnmaddpd %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
  304. " subq $8, %1 \n\t" // i = 1
  305. " subq $2, %0 \n\t" // b-= n
  306. " vmovddup 8(%6, %1, 8) , %%xmm0 \n\t" // read aa[i]
  307. " vunpckhpd %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
  308. " vunpckhpd %%xmm12 , %%xmm12, %%xmm2 \n\t" // extract bb1
  309. " vmulpd %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
  310. " vmulpd %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb0 * aa
  311. " vmovsd %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa
  312. " vmovsd %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa
  313. " vmovsd %%xmm1 , (%7 , %0, 8) \n\t" // b[0] = bb0 * aa
  314. " vmovsd %%xmm2 , 8(%7 , %0, 8) \n\t" // b[1] = bb1 * aa
  315. " vmovsd 0(%6, %1, 8) , %%xmm4 \n\t" // read a[k]
  316. " vfnmaddsd %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
  317. " vfnmaddsd %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
  318. " subq $8, %1 \n\t" // i = 0
  319. " subq $2, %0 \n\t" // b-= n
  320. " vmovddup 0(%6, %1, 8) , %%xmm0 \n\t" // read aa[i]
  321. " vunpcklpd %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
  322. " vunpcklpd %%xmm12 , %%xmm12, %%xmm2 \n\t" // extract bb1
  323. " vmulpd %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
  324. " vmulpd %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb0 * aa
  325. " vmovsd %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa
  326. " vmovsd %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa
  327. " vmovsd %%xmm1 , (%7 , %0, 8) \n\t" // b[0] = bb0 * aa
  328. " vmovsd %%xmm2 , 8(%7 , %0, 8) \n\t" // b[1] = bb1 * aa
  329. " vzeroupper \n\t"
  330. :
  331. :
  332. "r" (n1), // 0
  333. "a" (i), // 1
  334. "r" (a), // 2
  335. "r" (b), // 3
  336. "r" (c), // 4
  337. "r" (c1), // 5
  338. "r" (as), // 6
  339. "r" (bs) // 7
  340. : "cc",
  341. "%xmm0", "%xmm1", "%xmm2", "%xmm3",
  342. "%xmm4", "%xmm5", "%xmm6", "%xmm7",
  343. "%xmm8", "%xmm9", "%xmm10", "%xmm11",
  344. "%xmm12", "%xmm13", "%xmm14", "%xmm15",
  345. "memory"
  346. );
  347. }
  348. #ifndef COMPLEX
  349. static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
  350. FLOAT aa, bb;
  351. FLOAT *cj;
  352. BLASLONG i, j, k;
  353. a += (m - 1) * m;
  354. b += (m - 1) * n;
  355. for (i = m - 1; i >= 0; i--) {
  356. aa = *(a + i);
  357. for (j = 0; j < n; j ++) {
  358. cj = c + j * ldc;
  359. bb = *(cj + i);
  360. bb *= aa;
  361. *b = bb;
  362. *(cj + i) = bb;
  363. b ++;
  364. /*
  365. BLASLONG i1 = i & -4 ;
  366. */
  367. FLOAT t0,t1,t2,t3;
  368. k=0;
  369. if ( i & 4 )
  370. {
  371. t0 = cj[k];
  372. t1 = cj[k+1];
  373. t2 = cj[k+2];
  374. t3 = cj[k+3];
  375. t0 -= bb * a[k+0];
  376. t1 -= bb * a[k+1];
  377. t2 -= bb * a[k+2];
  378. t3 -= bb * a[k+3];
  379. cj[k+0] = t0;
  380. cj[k+1] = t1;
  381. cj[k+2] = t2;
  382. cj[k+3] = t3;
  383. k+=4;
  384. }
  385. if ( i & 2 )
  386. {
  387. t0 = a[k];
  388. t1 = a[k+1];
  389. t0 *= bb;
  390. t1 *= bb;
  391. cj[k+0] -= t0;
  392. cj[k+1] -= t1;
  393. k+=2;
  394. }
  395. if ( i & 1 )
  396. {
  397. t0 = bb * a[k];
  398. cj[k+0] -= t0;
  399. }
  400. }
  401. a -= m;
  402. b -= 2 * n;
  403. }
  404. }
  405. #else
  406. static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
  407. FLOAT aa1, aa2;
  408. FLOAT bb1, bb2;
  409. FLOAT cc1, cc2;
  410. int i, j, k;
  411. ldc *= 2;
  412. a += (m - 1) * m * 2;
  413. b += (m - 1) * n * 2;
  414. for (i = m - 1; i >= 0; i--) {
  415. aa1 = *(a + i * 2 + 0);
  416. aa2 = *(a + i * 2 + 1);
  417. for (j = 0; j < n; j ++) {
  418. bb1 = *(c + i * 2 + 0 + j * ldc);
  419. bb2 = *(c + i * 2 + 1 + j * ldc);
  420. #ifndef CONJ
  421. cc1 = aa1 * bb1 - aa2 * bb2;
  422. cc2 = aa1 * bb2 + aa2 * bb1;
  423. #else
  424. cc1 = aa1 * bb1 + aa2 * bb2;
  425. cc2 = aa1 * bb2 - aa2 * bb1;
  426. #endif
  427. *(b + 0) = cc1;
  428. *(b + 1) = cc2;
  429. *(c + i * 2 + 0 + j * ldc) = cc1;
  430. *(c + i * 2 + 1 + j * ldc) = cc2;
  431. b += 2;
  432. for (k = 0; k < i; k ++){
  433. #ifndef CONJ
  434. *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1);
  435. *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
  436. #else
  437. *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1);
  438. *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
  439. #endif
  440. }
  441. }
  442. a -= m * 2;
  443. b -= 4 * n;
  444. }
  445. }
  446. #endif
  447. int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
  448. #ifdef COMPLEX
  449. FLOAT dummy2,
  450. #endif
  451. FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){
  452. BLASLONG i, j;
  453. FLOAT *aa, *cc;
  454. BLASLONG kk;
  455. #if 0
  456. fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n",
  457. m, n, k, offset);
  458. #endif
  459. j = (n >> GEMM_UNROLL_N_SHIFT);
  460. while (j > 0) {
  461. kk = m + offset;
  462. if (m & (GEMM_UNROLL_M - 1)) {
  463. for (i = 1; i < GEMM_UNROLL_M; i *= 2){
  464. if (m & i) {
  465. aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE;
  466. cc = c + ((m & ~(i - 1)) - i) * COMPSIZE;
  467. if (k - kk > 0) {
  468. GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1,
  469. #ifdef COMPLEX
  470. ZERO,
  471. #endif
  472. aa + i * kk * COMPSIZE,
  473. b + GEMM_UNROLL_N * kk * COMPSIZE,
  474. cc,
  475. ldc);
  476. }
  477. solve(i, GEMM_UNROLL_N,
  478. aa + (kk - i) * i * COMPSIZE,
  479. b + (kk - i) * GEMM_UNROLL_N * COMPSIZE,
  480. cc, ldc);
  481. kk -= i;
  482. }
  483. }
  484. }
  485. i = (m >> GEMM_UNROLL_M_SHIFT);
  486. if (i > 0) {
  487. aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE;
  488. cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE;
  489. do {
  490. dtrsm_LN_solve_opt(k-kk, aa + GEMM_UNROLL_M * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc
  491. , aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE,b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE);
  492. aa -= GEMM_UNROLL_M * k * COMPSIZE;
  493. cc -= GEMM_UNROLL_M * COMPSIZE;
  494. kk -= GEMM_UNROLL_M;
  495. i --;
  496. } while (i > 0);
  497. }
  498. b += GEMM_UNROLL_N * k * COMPSIZE;
  499. c += GEMM_UNROLL_N * ldc * COMPSIZE;
  500. j --;
  501. }
  502. if (n & (GEMM_UNROLL_N - 1)) {
  503. j = (GEMM_UNROLL_N >> 1);
  504. while (j > 0) {
  505. if (n & j) {
  506. kk = m + offset;
  507. if (m & (GEMM_UNROLL_M - 1)) {
  508. for (i = 1; i < GEMM_UNROLL_M; i *= 2){
  509. if (m & i) {
  510. aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE;
  511. cc = c + ((m & ~(i - 1)) - i) * COMPSIZE;
  512. if (k - kk > 0) {
  513. GEMM_KERNEL(i, j, k - kk, dm1,
  514. #ifdef COMPLEX
  515. ZERO,
  516. #endif
  517. aa + i * kk * COMPSIZE,
  518. b + j * kk * COMPSIZE,
  519. cc, ldc);
  520. }
  521. solve(i, j,
  522. aa + (kk - i) * i * COMPSIZE,
  523. b + (kk - i) * j * COMPSIZE,
  524. cc, ldc);
  525. kk -= i;
  526. }
  527. }
  528. }
  529. i = (m >> GEMM_UNROLL_M_SHIFT);
  530. if (i > 0) {
  531. aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE;
  532. cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE;
  533. do {
  534. if (k - kk > 0) {
  535. GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1,
  536. #ifdef COMPLEX
  537. ZERO,
  538. #endif
  539. aa + GEMM_UNROLL_M * kk * COMPSIZE,
  540. b + j * kk * COMPSIZE,
  541. cc,
  542. ldc);
  543. }
  544. solve(GEMM_UNROLL_M, j,
  545. aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE,
  546. b + (kk - GEMM_UNROLL_M) * j * COMPSIZE,
  547. cc, ldc);
  548. aa -= GEMM_UNROLL_M * k * COMPSIZE;
  549. cc -= GEMM_UNROLL_M * COMPSIZE;
  550. kk -= GEMM_UNROLL_M;
  551. i --;
  552. } while (i > 0);
  553. }
  554. b += j * k * COMPSIZE;
  555. c += j * ldc * COMPSIZE;
  556. }
  557. j >>= 1;
  558. }
  559. }
  560. return 0;
  561. }