You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

strsm_kernel_RT_8x8_msa.c 42 kB


  1. /*******************************************************************************
  2. Copyright (c) 2016, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #include "common.h"
  28. #include "macros_msa.h"
  29. static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  30. {
  31. BLASLONG k;
  32. FLOAT *aa = a, *bb = b;
  33. v4f32 src_a0, src_a1, src_b1, src_b2, src_b3;
  34. v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
  35. v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
  36. v4f32 src_b, src_b0, src_b8, src_b9, src_b16, src_b17, src_b18, src_b24;
  37. v4f32 src_b25, src_b26, src_b27, src_b32, src_b33, src_b34, src_b35;
  38. v4f32 src_b36, src_b40, src_b41, src_b42, src_b43, src_b44, src_b45;
  39. v4f32 src_b48, src_b49, src_b50, src_b51, src_b52, src_b53, src_b54;
  40. v4f32 src_b56, src_b57, src_b58, src_b59, src_b60, src_b61, src_b62, src_b63;
  41. FLOAT *c_nxt1line = c + ldc;
  42. FLOAT *c_nxt2line = c + 2 * ldc;
  43. FLOAT *c_nxt3line = c + 3 * ldc;
  44. FLOAT *c_nxt4line = c + 4 * ldc;
  45. FLOAT *c_nxt5line = c + 5 * ldc;
  46. FLOAT *c_nxt6line = c + 6 * ldc;
  47. FLOAT *c_nxt7line = c + 7 * ldc;
  48. LD_SP2(c, 4, src_c0, src_c1);
  49. LD_SP2(c_nxt1line, 4, src_c2, src_c3);
  50. LD_SP2(c_nxt2line, 4, src_c4, src_c5);
  51. LD_SP2(c_nxt3line, 4, src_c6, src_c7);
  52. LD_SP2(c_nxt4line, 4, src_c8, src_c9);
  53. LD_SP2(c_nxt5line, 4, src_c10, src_c11);
  54. LD_SP2(c_nxt6line, 4, src_c12, src_c13);
  55. LD_SP2(c_nxt7line, 4, src_c14, src_c15);
  56. for (k = 0; k < bk; k++)
  57. {
  58. LD_SP2(aa, 4, src_a0, src_a1);
  59. src_b = LD_SP(bb + 0);
  60. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  61. src_c0 -= src_a0 * src_b0;
  62. src_c1 -= src_a1 * src_b0;
  63. src_c2 -= src_a0 * src_b1;
  64. src_c3 -= src_a1 * src_b1;
  65. src_c4 -= src_a0 * src_b2;
  66. src_c5 -= src_a1 * src_b2;
  67. src_c6 -= src_a0 * src_b3;
  68. src_c7 -= src_a1 * src_b3;
  69. src_b = LD_SP(bb + 4);
  70. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  71. src_c8 -= src_a0 * src_b0;
  72. src_c9 -= src_a1 * src_b0;
  73. src_c10 -= src_a0 * src_b1;
  74. src_c11 -= src_a1 * src_b1;
  75. src_c12 -= src_a0 * src_b2;
  76. src_c13 -= src_a1 * src_b2;
  77. src_c14 -= src_a0 * src_b3;
  78. src_c15 -= src_a1 * src_b3;
  79. aa += 8;
  80. bb += 8;
  81. }
  82. b -= 64;
  83. src_b = LD_SP(b + 60);
  84. SPLATI_W4_SP(src_b, src_b60, src_b61, src_b62, src_b63);
  85. src_b = LD_SP(b + 56);
  86. SPLATI_W4_SP(src_b, src_b56, src_b57, src_b58, src_b59);
  87. src_c15 *= src_b63;
  88. src_c14 *= src_b63;
  89. src_c13 -= src_c15 * src_b62;
  90. src_c12 -= src_c14 * src_b62;
  91. src_c11 -= src_c15 * src_b61;
  92. src_c10 -= src_c14 * src_b61;
  93. src_c9 -= src_c15 * src_b60;
  94. src_c8 -= src_c14 * src_b60;
  95. src_c7 -= src_c15 * src_b59;
  96. src_c6 -= src_c14 * src_b59;
  97. src_c5 -= src_c15 * src_b58;
  98. src_c4 -= src_c14 * src_b58;
  99. src_c3 -= src_c15 * src_b57;
  100. src_c2 -= src_c14 * src_b57;
  101. src_c1 -= src_c15 * src_b56;
  102. src_c0 -= src_c14 * src_b56;
  103. src_b = LD_SP(b + 48);
  104. SPLATI_W4_SP(src_b, src_b48, src_b49, src_b50, src_b51);
  105. src_b52 = LD_SP(b + 52);
  106. src_b54 = (v4f32) __msa_splati_w((v4i32) src_b52, 2);
  107. src_b53 = (v4f32) __msa_splati_w((v4i32) src_b52, 1);
  108. src_b52 = (v4f32) __msa_splati_w((v4i32) src_b52, 0);
  109. src_c12 *= src_b54;
  110. src_c13 *= src_b54;
  111. src_c10 -= src_c12 * src_b53;
  112. src_c11 -= src_c13 * src_b53;
  113. src_c8 -= src_c12 * src_b52;
  114. src_c9 -= src_c13 * src_b52;
  115. src_c6 -= src_c12 * src_b51;
  116. src_c7 -= src_c13 * src_b51;
  117. src_c4 -= src_c12 * src_b50;
  118. src_c5 -= src_c13 * src_b50;
  119. src_c2 -= src_c12 * src_b49;
  120. src_c3 -= src_c13 * src_b49;
  121. src_c0 -= src_c12 * src_b48;
  122. src_c1 -= src_c13 * src_b48;
  123. ST_SP4(src_c12, src_c13, src_c14, src_c15, a - 16, 4);
  124. ST_SP2(src_c12, src_c13, c_nxt6line, 4);
  125. ST_SP2(src_c14, src_c15, c_nxt7line, 4);
  126. src_b = LD_SP(b + 40);
  127. SPLATI_W4_SP(src_b, src_b40, src_b41, src_b42, src_b43);
  128. src_b44 = LD_SP(b + 44);
  129. src_b45 = (v4f32) __msa_splati_w((v4i32) src_b44, 1);
  130. src_b44 = (v4f32) __msa_splati_w((v4i32) src_b44, 0);
  131. src_c10 *= src_b45;
  132. src_c11 *= src_b45;
  133. src_c8 -= src_c10 * src_b44;
  134. src_c9 -= src_c11 * src_b44;
  135. src_c6 -= src_c10 * src_b43;
  136. src_c7 -= src_c11 * src_b43;
  137. src_c4 -= src_c10 * src_b42;
  138. src_c5 -= src_c11 * src_b42;
  139. src_c2 -= src_c10 * src_b41;
  140. src_c3 -= src_c11 * src_b41;
  141. src_c0 -= src_c10 * src_b40;
  142. src_c1 -= src_c11 * src_b40;
  143. src_b = LD_SP(b + 32);
  144. SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35);
  145. COPY_FLOAT_TO_VECTOR(*(b + 36), src_b36);
  146. src_c8 *= src_b36;
  147. src_c9 *= src_b36;
  148. src_c6 -= src_c8 * src_b35;
  149. src_c7 -= src_c9 * src_b35;
  150. src_c4 -= src_c8 * src_b34;
  151. src_c5 -= src_c9 * src_b34;
  152. src_c2 -= src_c8 * src_b33;
  153. src_c3 -= src_c9 * src_b33;
  154. src_c0 -= src_c8 * src_b32;
  155. src_c1 -= src_c9 * src_b32;
  156. ST_SP4(src_c8, src_c9, src_c10, src_c11, a - 32, 4);
  157. ST_SP2(src_c8, src_c9, c_nxt4line, 4);
  158. ST_SP2(src_c10, src_c11, c_nxt5line, 4);
  159. src_b = LD_SP(b + 24);
  160. SPLATI_W4_SP(src_b, src_b24, src_b25, src_b26, src_b27);
  161. src_c6 *= src_b27;
  162. src_c7 *= src_b27;
  163. src_c4 -= src_c6 * src_b26;
  164. src_c5 -= src_c7 * src_b26;
  165. src_c2 -= src_c6 * src_b25;
  166. src_c3 -= src_c7 * src_b25;
  167. src_c0 -= src_c6 * src_b24;
  168. src_c1 -= src_c7 * src_b24;
  169. src_b16 = LD_SP(b + 16);
  170. src_b18 = (v4f32) __msa_splati_w((v4i32) src_b16, 2);
  171. src_b17 = (v4f32) __msa_splati_w((v4i32) src_b16, 1);
  172. src_b16 = (v4f32) __msa_splati_w((v4i32) src_b16, 0);
  173. src_c4 *= src_b18;
  174. src_c5 *= src_b18;
  175. src_c2 -= src_c4 * src_b17;
  176. src_c3 -= src_c5 * src_b17;
  177. src_c0 -= src_c4 * src_b16;
  178. src_c1 -= src_c5 * src_b16;
  179. ST_SP4(src_c4, src_c5, src_c6, src_c7, a - 48, 4);
  180. ST_SP2(src_c4, src_c5, c_nxt2line, 4);
  181. ST_SP2(src_c6, src_c7, c_nxt3line, 4);
  182. COPY_FLOAT_TO_VECTOR(*(b + 9), src_b9);
  183. COPY_FLOAT_TO_VECTOR(*(b + 8), src_b8);
  184. COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
  185. src_c2 *= src_b9;
  186. src_c3 *= src_b9;
  187. src_c0 -= src_c2 * src_b8;
  188. src_c1 -= src_c3 * src_b8;
  189. src_c0 *= src_b0;
  190. src_c1 *= src_b0;
  191. ST_SP4(src_c0, src_c1, src_c2, src_c3, a - 64, 4);
  192. ST_SP2(src_c0, src_c1, c, 4);
  193. ST_SP2(src_c2, src_c3, c_nxt1line, 4);
  194. }
  195. static void ssolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  196. {
  197. BLASLONG k;
  198. FLOAT *aa = a, *bb = b;
  199. v4f32 src_a0, src_a1, src_b1, src_b2, src_b3;
  200. v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
  201. v4f32 src_b, src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12;
  202. v4f32 src_b13, src_b14, src_b15;
  203. FLOAT *c_nxt1line = c + ldc;
  204. FLOAT *c_nxt2line = c + 2 * ldc;
  205. FLOAT *c_nxt3line = c + 3 * ldc;
  206. LD_SP2(c, 4, src_c0, src_c1);
  207. LD_SP2(c_nxt1line, 4, src_c2, src_c3);
  208. LD_SP2(c_nxt2line, 4, src_c4, src_c5);
  209. LD_SP2(c_nxt3line, 4, src_c6, src_c7);
  210. for (k = 0; k < (bk >> 1); k++)
  211. {
  212. LD_SP2(aa, 4, src_a0, src_a1);
  213. src_b = LD_SP(bb + 0);
  214. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  215. src_c0 -= src_a0 * src_b0;
  216. src_c1 -= src_a1 * src_b0;
  217. src_c2 -= src_a0 * src_b1;
  218. src_c3 -= src_a1 * src_b1;
  219. src_c4 -= src_a0 * src_b2;
  220. src_c5 -= src_a1 * src_b2;
  221. src_c6 -= src_a0 * src_b3;
  222. src_c7 -= src_a1 * src_b3;
  223. aa += 8;
  224. bb += 4;
  225. LD_SP2(aa, 4, src_a0, src_a1);
  226. src_b = LD_SP(bb + 0);
  227. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  228. src_c0 -= src_a0 * src_b0;
  229. src_c1 -= src_a1 * src_b0;
  230. src_c2 -= src_a0 * src_b1;
  231. src_c3 -= src_a1 * src_b1;
  232. src_c4 -= src_a0 * src_b2;
  233. src_c5 -= src_a1 * src_b2;
  234. src_c6 -= src_a0 * src_b3;
  235. src_c7 -= src_a1 * src_b3;
  236. aa += 8;
  237. bb += 4;
  238. }
  239. if (bk & 1)
  240. {
  241. LD_SP2(aa, 4, src_a0, src_a1);
  242. src_b = LD_SP(bb + 0);
  243. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  244. src_c0 -= src_a0 * src_b0;
  245. src_c1 -= src_a1 * src_b0;
  246. src_c2 -= src_a0 * src_b1;
  247. src_c3 -= src_a1 * src_b1;
  248. src_c4 -= src_a0 * src_b2;
  249. src_c5 -= src_a1 * src_b2;
  250. src_c6 -= src_a0 * src_b3;
  251. src_c7 -= src_a1 * src_b3;
  252. }
  253. a -= 32;
  254. b -= 16;
  255. src_b = LD_SP(b + 12);
  256. SPLATI_W4_SP(src_b, src_b12, src_b13, src_b14, src_b15);
  257. src_b8 = LD_SP(b + 8);
  258. src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2);
  259. src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1);
  260. src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0);
  261. COPY_FLOAT_TO_VECTOR(*(b + 5), src_b5);
  262. COPY_FLOAT_TO_VECTOR(*(b + 4), src_b4);
  263. COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
  264. src_c7 *= src_b15;
  265. src_c6 *= src_b15;
  266. src_c5 -= src_c7 * src_b14;
  267. src_c4 -= src_c6 * src_b14;
  268. src_c3 -= src_c7 * src_b13;
  269. src_c2 -= src_c6 * src_b13;
  270. src_c1 -= src_c7 * src_b12;
  271. src_c0 -= src_c6 * src_b12;
  272. src_c5 *= src_b10;
  273. src_c4 *= src_b10;
  274. src_c3 -= src_c5 * src_b9;
  275. src_c2 -= src_c4 * src_b9;
  276. src_c1 -= src_c5 * src_b8;
  277. src_c0 -= src_c4 * src_b8;
  278. src_c3 *= src_b5;
  279. src_c2 *= src_b5;
  280. src_c1 -= src_c3 * src_b4;
  281. src_c0 -= src_c2 * src_b4;
  282. src_c1 *= src_b0;
  283. src_c0 *= src_b0;
  284. ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
  285. ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4);
  286. ST_SP2(src_c0, src_c1, c, 4);
  287. ST_SP2(src_c2, src_c3, c_nxt1line, 4);
  288. ST_SP2(src_c4, src_c5, c_nxt2line, 4);
  289. ST_SP2(src_c6, src_c7, c_nxt3line, 4);
  290. }
  291. static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  292. {
  293. BLASLONG k;
  294. FLOAT *aa = a, *bb = b;
  295. v4f32 src_a0, src_a1, src_b1;
  296. v4f32 src_c0, src_c1, src_c2, src_c3, src_b0, src_b2, src_b3;
  297. FLOAT *c_nxt1line = c + ldc;
  298. LD_SP2(c, 4, src_c0, src_c1);
  299. LD_SP2(c_nxt1line, 4, src_c2, src_c3);
  300. for (k = 0; k < (bk >> 1); k++)
  301. {
  302. LD_SP2(aa, 4, src_a0, src_a1);
  303. COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
  304. COPY_FLOAT_TO_VECTOR(*(bb + 1), src_b1);
  305. src_c0 -= src_a0 * src_b0;
  306. src_c1 -= src_a1 * src_b0;
  307. src_c2 -= src_a0 * src_b1;
  308. src_c3 -= src_a1 * src_b1;
  309. aa += 8;
  310. bb += 2;
  311. LD_SP2(aa, 4, src_a0, src_a1);
  312. COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
  313. COPY_FLOAT_TO_VECTOR(*(bb + 1), src_b1);
  314. src_c0 -= src_a0 * src_b0;
  315. src_c1 -= src_a1 * src_b0;
  316. src_c2 -= src_a0 * src_b1;
  317. src_c3 -= src_a1 * src_b1;
  318. aa += 8;
  319. bb += 2;
  320. }
  321. if (bk & 1)
  322. {
  323. LD_SP2(aa, 4, src_a0, src_a1);
  324. COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
  325. COPY_FLOAT_TO_VECTOR(*(bb + 1), src_b1);
  326. src_c0 -= src_a0 * src_b0;
  327. src_c1 -= src_a1 * src_b0;
  328. src_c2 -= src_a0 * src_b1;
  329. src_c3 -= src_a1 * src_b1;
  330. }
  331. a -= 16;
  332. b -= 4;
  333. COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
  334. COPY_FLOAT_TO_VECTOR(*(b + 2), src_b2);
  335. COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3);
  336. src_c2 *= src_b3;
  337. src_c3 *= src_b3;
  338. src_c0 -= src_c2 * src_b2;
  339. src_c1 -= src_c3 * src_b2;
  340. src_c0 *= src_b0;
  341. src_c1 *= src_b0;
  342. ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
  343. ST_SP2(src_c0, src_c1, c, 4);
  344. ST_SP2(src_c2, src_c3, c_nxt1line, 4);
  345. }
  346. static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
  347. {
  348. BLASLONG k;
  349. FLOAT *aa = a, *bb = b;
  350. v4f32 src_a0, src_a1, src_c0, src_c1, src_b0;
  351. LD_SP2(c, 4, src_c0, src_c1);
  352. for (k = 0; k < (bk >> 2); k++)
  353. {
  354. LD_SP2(aa, 4, src_a0, src_a1);
  355. COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
  356. src_c0 -= src_a0 * src_b0;
  357. src_c1 -= src_a1 * src_b0;
  358. aa += 8;
  359. bb += 1;
  360. LD_SP2(aa, 4, src_a0, src_a1);
  361. COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
  362. src_c0 -= src_a0 * src_b0;
  363. src_c1 -= src_a1 * src_b0;
  364. aa += 8;
  365. bb += 1;
  366. LD_SP2(aa, 4, src_a0, src_a1);
  367. COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
  368. src_c0 -= src_a0 * src_b0;
  369. src_c1 -= src_a1 * src_b0;
  370. aa += 8;
  371. bb += 1;
  372. LD_SP2(aa, 4, src_a0, src_a1);
  373. COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
  374. src_c0 -= src_a0 * src_b0;
  375. src_c1 -= src_a1 * src_b0;
  376. aa += 8;
  377. bb += 1;
  378. }
  379. if (bk & 3)
  380. {
  381. if (bk & 2)
  382. {
  383. LD_SP2(aa, 4, src_a0, src_a1);
  384. COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
  385. src_c0 -= src_a0 * src_b0;
  386. src_c1 -= src_a1 * src_b0;
  387. aa += 8;
  388. bb += 1;
  389. LD_SP2(aa, 4, src_a0, src_a1);
  390. COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
  391. src_c0 -= src_a0 * src_b0;
  392. src_c1 -= src_a1 * src_b0;
  393. aa += 8;
  394. bb += 1;
  395. }
  396. if (bk & 1)
  397. {
  398. LD_SP2(aa, 4, src_a0, src_a1);
  399. COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
  400. src_c0 -= src_a0 * src_b0;
  401. src_c1 -= src_a1 * src_b0;
  402. }
  403. }
  404. a -= 8;
  405. b -= 1;
  406. COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
  407. src_c0 *= src_b0;
  408. src_c1 *= src_b0;
  409. ST_SP2(src_c0, src_c1, a, 4);
  410. ST_SP2(src_c0, src_c1, c, 4);
  411. }
  412. static void ssolve_4x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  413. {
  414. BLASLONG k;
  415. FLOAT *aa = a, *bb = b;
  416. v4f32 src_a0, src_b1, src_b2, src_b3;
  417. v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
  418. v4f32 src_b, src_b0, src_b8, src_b9, src_b16, src_b17, src_b18, src_b24;
  419. v4f32 src_b25, src_b26, src_b27, src_b32, src_b33, src_b34, src_b35;
  420. v4f32 src_b36, src_b40, src_b41, src_b42, src_b43, src_b44, src_b45;
  421. v4f32 src_b48, src_b49, src_b50, src_b51, src_b52, src_b53, src_b54;
  422. v4f32 src_b56, src_b57, src_b58, src_b59, src_b60, src_b61, src_b62, src_b63;
  423. FLOAT *c_nxt1line = c + ldc;
  424. FLOAT *c_nxt2line = c + 2 * ldc;
  425. FLOAT *c_nxt3line = c + 3 * ldc;
  426. FLOAT *c_nxt4line = c + 4 * ldc;
  427. FLOAT *c_nxt5line = c + 5 * ldc;
  428. FLOAT *c_nxt6line = c + 6 * ldc;
  429. FLOAT *c_nxt7line = c + 7 * ldc;
  430. src_c0 = LD_SP(c);
  431. src_c1 = LD_SP(c_nxt1line);
  432. src_c2 = LD_SP(c_nxt2line);
  433. src_c3 = LD_SP(c_nxt3line);
  434. src_c4 = LD_SP(c_nxt4line);
  435. src_c5 = LD_SP(c_nxt5line);
  436. src_c6 = LD_SP(c_nxt6line);
  437. src_c7 = LD_SP(c_nxt7line);
  438. for (k = 0; k < bk; k++)
  439. {
  440. src_a0 = LD_SP(aa);
  441. src_b = LD_SP(bb + 0);
  442. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  443. src_c0 -= src_a0 * src_b0;
  444. src_c1 -= src_a0 * src_b1;
  445. src_c2 -= src_a0 * src_b2;
  446. src_c3 -= src_a0 * src_b3;
  447. src_b = LD_SP(bb + 4);
  448. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  449. src_c4 -= src_a0 * src_b0;
  450. src_c5 -= src_a0 * src_b1;
  451. src_c6 -= src_a0 * src_b2;
  452. src_c7 -= src_a0 * src_b3;
  453. aa += 4;
  454. bb += 8;
  455. }
  456. a -= 32;
  457. b -= 64;
  458. src_b = LD_SP(b + 60);
  459. SPLATI_W4_SP(src_b, src_b60, src_b61, src_b62, src_b63);
  460. src_b = LD_SP(b + 56);
  461. SPLATI_W4_SP(src_b, src_b56, src_b57, src_b58, src_b59);
  462. src_b = LD_SP(b + 48);
  463. SPLATI_W4_SP(src_b, src_b48, src_b49, src_b50, src_b51);
  464. src_b52 = LD_SP(b + 52);
  465. src_b54 = (v4f32) __msa_splati_w((v4i32) src_b52, 2);
  466. src_b53 = (v4f32) __msa_splati_w((v4i32) src_b52, 1);
  467. src_b52 = (v4f32) __msa_splati_w((v4i32) src_b52, 0);
  468. src_b = LD_SP(b + 40);
  469. SPLATI_W4_SP(src_b, src_b40, src_b41, src_b42, src_b43);
  470. src_b44 = LD_SP(b + 44);
  471. src_b45 = (v4f32) __msa_splati_w((v4i32) src_b44, 1);
  472. src_b44 = (v4f32) __msa_splati_w((v4i32) src_b44, 0);
  473. src_b = LD_SP(b + 32);
  474. SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35);
  475. COPY_FLOAT_TO_VECTOR(*(b + 36), src_b36);
  476. src_b = LD_SP(b + 24);
  477. SPLATI_W4_SP(src_b, src_b24, src_b25, src_b26, src_b27);
  478. src_b16 = LD_SP(b + 16);
  479. src_b18 = (v4f32) __msa_splati_w((v4i32) src_b16, 2);
  480. src_b17 = (v4f32) __msa_splati_w((v4i32) src_b16, 1);
  481. src_b16 = (v4f32) __msa_splati_w((v4i32) src_b16, 0);
  482. COPY_FLOAT_TO_VECTOR(*(b + 9), src_b9);
  483. COPY_FLOAT_TO_VECTOR(*(b + 8), src_b8);
  484. COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
  485. src_c7 *= src_b63;
  486. src_c6 -= src_c7 * src_b62;
  487. src_c5 -= src_c7 * src_b61;
  488. src_c4 -= src_c7 * src_b60;
  489. src_c3 -= src_c7 * src_b59;
  490. src_c2 -= src_c7 * src_b58;
  491. src_c1 -= src_c7 * src_b57;
  492. src_c0 -= src_c7 * src_b56;
  493. src_c6 *= src_b54;
  494. src_c5 -= src_c6 * src_b53;
  495. src_c4 -= src_c6 * src_b52;
  496. src_c3 -= src_c6 * src_b51;
  497. src_c2 -= src_c6 * src_b50;
  498. src_c1 -= src_c6 * src_b49;
  499. src_c0 -= src_c6 * src_b48;
  500. src_c5 *= src_b45;
  501. src_c4 -= src_c5 * src_b44;
  502. src_c3 -= src_c5 * src_b43;
  503. src_c2 -= src_c5 * src_b42;
  504. src_c1 -= src_c5 * src_b41;
  505. src_c0 -= src_c5 * src_b40;
  506. src_c4 *= src_b36;
  507. src_c3 -= src_c4 * src_b35;
  508. src_c2 -= src_c4 * src_b34;
  509. src_c1 -= src_c4 * src_b33;
  510. src_c0 -= src_c4 * src_b32;
  511. src_c3 *= src_b27;
  512. src_c2 -= src_c3 * src_b26;
  513. src_c1 -= src_c3 * src_b25;
  514. src_c0 -= src_c3 * src_b24;
  515. src_c2 *= src_b18;
  516. src_c1 -= src_c2 * src_b17;
  517. src_c0 -= src_c2 * src_b16;
  518. src_c1 *= src_b9;
  519. src_c0 -= src_c1 * src_b8;
  520. src_c0 *= src_b0;
  521. ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
  522. ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4);
  523. ST_SP(src_c0, c);
  524. ST_SP(src_c1, c_nxt1line);
  525. ST_SP(src_c2, c_nxt2line);
  526. ST_SP(src_c3, c_nxt3line);
  527. ST_SP(src_c4, c_nxt4line);
  528. ST_SP(src_c5, c_nxt5line);
  529. ST_SP(src_c6, c_nxt6line);
  530. ST_SP(src_c7, c_nxt7line);
  531. }
  532. static void ssolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  533. {
  534. BLASLONG k;
  535. FLOAT *aa = a, *bb = b;
  536. v4f32 src_c0, src_c1, src_c2, src_c3, src_b;
  537. v4f32 src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12, src_b13;
  538. v4f32 src_b14, src_b15, src_a, src_b1, src_b2, src_b3;
  539. FLOAT *c_nxt1line = c + ldc;
  540. FLOAT *c_nxt2line = c + 2 * ldc;
  541. FLOAT *c_nxt3line = c + 3 * ldc;
  542. src_c0 = LD_SP(c);
  543. src_c1 = LD_SP(c_nxt1line);
  544. src_c2 = LD_SP(c_nxt2line);
  545. src_c3 = LD_SP(c_nxt3line);
  546. for (k = 0; k < (bk >> 1); k++)
  547. {
  548. src_a = LD_SP(aa);
  549. src_b = LD_SP(bb);
  550. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  551. src_c0 -= src_a * src_b0;
  552. src_c1 -= src_a * src_b1;
  553. src_c2 -= src_a * src_b2;
  554. src_c3 -= src_a * src_b3;
  555. aa += 4;
  556. bb += 4;
  557. src_a = LD_SP(aa);
  558. src_b = LD_SP(bb);
  559. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  560. src_c0 -= src_a * src_b0;
  561. src_c1 -= src_a * src_b1;
  562. src_c2 -= src_a * src_b2;
  563. src_c3 -= src_a * src_b3;
  564. aa += 4;
  565. bb += 4;
  566. }
  567. if (bk & 1)
  568. {
  569. src_a = LD_SP(aa);
  570. src_b = LD_SP(bb);
  571. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  572. src_c0 -= src_a * src_b0;
  573. src_c1 -= src_a * src_b1;
  574. src_c2 -= src_a * src_b2;
  575. src_c3 -= src_a * src_b3;
  576. }
  577. a -= 16;
  578. b -= 16;
  579. src_b = LD_SP(b + 12);
  580. SPLATI_W4_SP(src_b, src_b12, src_b13, src_b14, src_b15);
  581. src_b8 = LD_SP(b + 8);
  582. src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2);
  583. src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1);
  584. src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0);
  585. COPY_FLOAT_TO_VECTOR(*(b + 5), src_b5);
  586. COPY_FLOAT_TO_VECTOR(*(b + 4), src_b4);
  587. COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
  588. src_c3 *= src_b15;
  589. src_c2 -= src_c3 * src_b14;
  590. src_c1 -= src_c3 * src_b13;
  591. src_c0 -= src_c3 * src_b12;
  592. src_c2 *= src_b10;
  593. src_c1 -= src_c2 * src_b9;
  594. src_c0 -= src_c2 * src_b8;
  595. src_c1 *= src_b5;
  596. src_c0 -= src_c1 * src_b4;
  597. src_c0 *= src_b0;
  598. ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
  599. ST_SP(src_c0, c);
  600. ST_SP(src_c1, c_nxt1line);
  601. ST_SP(src_c2, c_nxt2line);
  602. ST_SP(src_c3, c_nxt3line);
  603. }
  604. static void ssolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  605. {
  606. BLASLONG k;
  607. FLOAT *aa = a, *bb = b;
  608. v4f32 src_a, src_b1, src_c0, src_c1, src_b0, src_b2, src_b3;
  609. FLOAT *c_nxt1line = c + ldc;
  610. src_c0 = LD_SP(c);
  611. src_c1 = LD_SP(c_nxt1line);
  612. for (k = 0; k < (bk >> 2); k++)
  613. {
  614. src_a = LD_SP(aa);
  615. src_b0 = LD_SP(bb);
  616. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  617. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  618. src_c0 -= src_a * src_b0;
  619. src_c1 -= src_a * src_b1;
  620. aa += 4;
  621. bb += 2;
  622. src_a = LD_SP(aa);
  623. src_b0 = LD_SP(bb);
  624. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  625. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  626. src_c0 -= src_a * src_b0;
  627. src_c1 -= src_a * src_b1;
  628. aa += 4;
  629. bb += 2;
  630. src_a = LD_SP(aa);
  631. src_b0 = LD_SP(bb);
  632. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  633. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  634. src_c0 -= src_a * src_b0;
  635. src_c1 -= src_a * src_b1;
  636. aa += 4;
  637. bb += 2;
  638. src_a = LD_SP(aa);
  639. src_b0 = LD_SP(bb);
  640. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  641. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  642. src_c0 -= src_a * src_b0;
  643. src_c1 -= src_a * src_b1;
  644. aa += 4;
  645. bb += 2;
  646. }
  647. if (bk & 3)
  648. {
  649. if (bk & 2)
  650. {
  651. src_a = LD_SP(aa);
  652. src_b0 = LD_SP(bb);
  653. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  654. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  655. src_c0 -= src_a * src_b0;
  656. src_c1 -= src_a * src_b1;
  657. aa += 4;
  658. bb += 2;
  659. src_a = LD_SP(aa);
  660. src_b0 = LD_SP(bb);
  661. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  662. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  663. src_c0 -= src_a * src_b0;
  664. src_c1 -= src_a * src_b1;
  665. aa += 4;
  666. bb += 2;
  667. }
  668. if (bk & 1)
  669. {
  670. src_a = LD_SP(aa);
  671. src_b0 = LD_SP(bb);
  672. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  673. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  674. src_c0 -= src_a * src_b0;
  675. src_c1 -= src_a * src_b1;
  676. }
  677. }
  678. a -= 8;
  679. b -= 4;
  680. COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3);
  681. COPY_FLOAT_TO_VECTOR(*(b + 2), src_b2);
  682. COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
  683. src_c1 *= src_b3;
  684. src_c0 -= src_c1 * src_b2;
  685. src_c0 *= src_b0;
  686. ST_SP2(src_c0, src_c1, a, 4);
  687. ST_SP(src_c0, c);
  688. ST_SP(src_c1, c_nxt1line);
  689. }
  690. static void ssolve_4x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
  691. {
  692. BLASLONG k;
  693. FLOAT *aa = a, *bb = b;
  694. FLOAT b0, c0, c1, c2, c3;
  695. c0 = *(c + 0);
  696. c1 = *(c + 1);
  697. c2 = *(c + 2);
  698. c3 = *(c + 3);
  699. for (k = 0; k < bk; k++)
  700. {
  701. c0 -= aa[0] * bb[0];
  702. c1 -= aa[1] * bb[0];
  703. c2 -= aa[2] * bb[0];
  704. c3 -= aa[3] * bb[0];
  705. aa += 4;
  706. bb += 1;
  707. }
  708. a -= 4;
  709. b -= 1;
  710. b0 = *b;
  711. c0 *= b0;
  712. c1 *= b0;
  713. c2 *= b0;
  714. c3 *= b0;
  715. *(a + 0) = c0;
  716. *(a + 1) = c1;
  717. *(a + 2) = c2;
  718. *(a + 3) = c3;
  719. *(c + 0) = c0;
  720. *(c + 1) = c1;
  721. *(c + 2) = c2;
  722. *(c + 3) = c3;
  723. }
  724. static void ssolve_2x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  725. {
  726. BLASLONG k;
  727. FLOAT *aa = a, *bb = b;
  728. FLOAT b0, b8, b9, b16, b17, b18, b24, b25, b26, b27, b32, b33, b34, b35;
  729. FLOAT b36, b40, b41, b42, b43, b44, b45, b48, b49, b50, b51, b52, b53, b54;
  730. FLOAT b56, b57, b58, b59, b60, b61, b62, b63, c0_nxt7, c1_nxt7;
  731. FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3;
  732. FLOAT c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5, c0_nxt6, c1_nxt6;
  733. c0 = *(c + 0);
  734. c1 = *(c + 1);
  735. c0_nxt1 = *(c + 0 + 1 * ldc);
  736. c1_nxt1 = *(c + 1 + 1 * ldc);
  737. c0_nxt2 = *(c + 0 + 2 * ldc);
  738. c1_nxt2 = *(c + 1 + 2 * ldc);
  739. c0_nxt3 = *(c + 0 + 3 * ldc);
  740. c1_nxt3 = *(c + 1 + 3 * ldc);
  741. c0_nxt4 = *(c + 0 + 4 * ldc);
  742. c1_nxt4 = *(c + 1 + 4 * ldc);
  743. c0_nxt5 = *(c + 0 + 5 * ldc);
  744. c1_nxt5 = *(c + 1 + 5 * ldc);
  745. c0_nxt6 = *(c + 0 + 6 * ldc);
  746. c1_nxt6 = *(c + 1 + 6 * ldc);
  747. c0_nxt7 = *(c + 0 + 7 * ldc);
  748. c1_nxt7 = *(c + 1 + 7 * ldc);
  749. for (k = 0; k < bk; k++)
  750. {
  751. c0 -= aa[0] * bb[0];
  752. c1 -= aa[1] * bb[0];
  753. c0_nxt1 -= aa[0] * bb[1];
  754. c1_nxt1 -= aa[1] * bb[1];
  755. c0_nxt2 -= aa[0] * bb[2];
  756. c1_nxt2 -= aa[1] * bb[2];
  757. c0_nxt3 -= aa[0] * bb[3];
  758. c1_nxt3 -= aa[1] * bb[3];
  759. c0_nxt4 -= aa[0] * bb[4];
  760. c1_nxt4 -= aa[1] * bb[4];
  761. c0_nxt5 -= aa[0] * bb[5];
  762. c1_nxt5 -= aa[1] * bb[5];
  763. c0_nxt6 -= aa[0] * bb[6];
  764. c1_nxt6 -= aa[1] * bb[6];
  765. c0_nxt7 -= aa[0] * bb[7];
  766. c1_nxt7 -= aa[1] * bb[7];
  767. aa += 2;
  768. bb += 8;
  769. }
  770. a -= 16;
  771. b -= 64;
  772. b0 = *(b + 0);
  773. b8 = *(b + 8);
  774. b9 = *(b + 9);
  775. b16 = *(b + 16);
  776. b17 = *(b + 17);
  777. b18 = *(b + 18);
  778. b24 = *(b + 24);
  779. b25 = *(b + 25);
  780. b26 = *(b + 26);
  781. b27 = *(b + 27);
  782. b32 = *(b + 32);
  783. b33 = *(b + 33);
  784. b34 = *(b + 34);
  785. b35 = *(b + 35);
  786. b36 = *(b + 36);
  787. b40 = *(b + 40);
  788. b41 = *(b + 41);
  789. b42 = *(b + 42);
  790. b43 = *(b + 43);
  791. b44 = *(b + 44);
  792. b45 = *(b + 45);
  793. b48 = *(b + 48);
  794. b49 = *(b + 49);
  795. b50 = *(b + 50);
  796. b51 = *(b + 51);
  797. b52 = *(b + 52);
  798. b53 = *(b + 53);
  799. b54 = *(b + 54);
  800. b56 = *(b + 56);
  801. b57 = *(b + 57);
  802. b58 = *(b + 58);
  803. b59 = *(b + 59);
  804. b60 = *(b + 60);
  805. b61 = *(b + 61);
  806. b62 = *(b + 62);
  807. b63 = *(b + 63);
  808. c0_nxt7 *= b63;
  809. c1_nxt7 *= b63;
  810. c0_nxt6 -= c0_nxt7 * b62;
  811. c1_nxt6 -= c1_nxt7 * b62;
  812. c0_nxt6 *= b54;
  813. c1_nxt6 *= b54;
  814. c0_nxt5 -= c0_nxt7 * b61;
  815. c1_nxt5 -= c1_nxt7 * b61;
  816. c0_nxt5 -= c0_nxt6 * b53;
  817. c1_nxt5 -= c1_nxt6 * b53;
  818. c0_nxt5 *= b45;
  819. c1_nxt5 *= b45;
  820. c0_nxt4 -= c0_nxt7 * b60;
  821. c1_nxt4 -= c1_nxt7 * b60;
  822. c0_nxt4 -= c0_nxt6 * b52;
  823. c1_nxt4 -= c1_nxt6 * b52;
  824. c0_nxt4 -= c0_nxt5 * b44;
  825. c1_nxt4 -= c1_nxt5 * b44;
  826. c0_nxt4 *= b36;
  827. c1_nxt4 *= b36;
  828. c0_nxt3 -= c0_nxt7 * b59;
  829. c1_nxt3 -= c1_nxt7 * b59;
  830. c0_nxt3 -= c0_nxt6 * b51;
  831. c1_nxt3 -= c1_nxt6 * b51;
  832. c0_nxt3 -= c0_nxt5 * b43;
  833. c1_nxt3 -= c1_nxt5 * b43;
  834. c0_nxt3 -= c0_nxt4 * b35;
  835. c1_nxt3 -= c1_nxt4 * b35;
  836. c0_nxt3 *= b27;
  837. c1_nxt3 *= b27;
  838. c0_nxt2 -= c0_nxt7 * b58;
  839. c1_nxt2 -= c1_nxt7 * b58;
  840. c0_nxt2 -= c0_nxt6 * b50;
  841. c1_nxt2 -= c1_nxt6 * b50;
  842. c0_nxt2 -= c0_nxt5 * b42;
  843. c1_nxt2 -= c1_nxt5 * b42;
  844. c0_nxt2 -= c0_nxt4 * b34;
  845. c1_nxt2 -= c1_nxt4 * b34;
  846. c0_nxt2 -= c0_nxt3 * b26;
  847. c1_nxt2 -= c1_nxt3 * b26;
  848. c0_nxt2 *= b18;
  849. c1_nxt2 *= b18;
  850. c0_nxt1 -= c0_nxt7 * b57;
  851. c1_nxt1 -= c1_nxt7 * b57;
  852. c0_nxt1 -= c0_nxt6 * b49;
  853. c1_nxt1 -= c1_nxt6 * b49;
  854. c0_nxt1 -= c0_nxt5 * b41;
  855. c1_nxt1 -= c1_nxt5 * b41;
  856. c0_nxt1 -= c0_nxt4 * b33;
  857. c1_nxt1 -= c1_nxt4 * b33;
  858. c0_nxt1 -= c0_nxt3 * b25;
  859. c1_nxt1 -= c1_nxt3 * b25;
  860. c0_nxt1 -= c0_nxt2 * b17;
  861. c1_nxt1 -= c1_nxt2 * b17;
  862. c0_nxt1 *= b9;
  863. c1_nxt1 *= b9;
  864. c0 -= c0_nxt7 * b56;
  865. c1 -= c1_nxt7 * b56;
  866. c0 -= c0_nxt6 * b48;
  867. c1 -= c1_nxt6 * b48;
  868. c0 -= c0_nxt5 * b40;
  869. c1 -= c1_nxt5 * b40;
  870. c0 -= c0_nxt4 * b32;
  871. c1 -= c1_nxt4 * b32;
  872. c0 -= c0_nxt3 * b24;
  873. c1 -= c1_nxt3 * b24;
  874. c0 -= c0_nxt2 * b16;
  875. c1 -= c1_nxt2 * b16;
  876. c0 -= c0_nxt1 * b8;
  877. c1 -= c1_nxt1 * b8;
  878. c0 *= b0;
  879. c1 *= b0;
  880. *(a + 0) = c0;
  881. *(a + 1) = c1;
  882. *(a + 2) = c0_nxt1;
  883. *(a + 3) = c1_nxt1;
  884. *(a + 4) = c0_nxt2;
  885. *(a + 5) = c1_nxt2;
  886. *(a + 6) = c0_nxt3;
  887. *(a + 7) = c1_nxt3;
  888. *(a + 8) = c0_nxt4;
  889. *(a + 9) = c1_nxt4;
  890. *(a + 10) = c0_nxt5;
  891. *(a + 11) = c1_nxt5;
  892. *(a + 12) = c0_nxt6;
  893. *(a + 13) = c1_nxt6;
  894. *(a + 14) = c0_nxt7;
  895. *(a + 15) = c1_nxt7;
  896. *(c + 0) = c0;
  897. *(c + 1) = c1;
  898. *(c + 0 + 1 * ldc) = c0_nxt1;
  899. *(c + 1 + 1 * ldc) = c1_nxt1;
  900. *(c + 0 + 2 * ldc) = c0_nxt2;
  901. *(c + 1 + 2 * ldc) = c1_nxt2;
  902. *(c + 0 + 3 * ldc) = c0_nxt3;
  903. *(c + 1 + 3 * ldc) = c1_nxt3;
  904. *(c + 0 + 4 * ldc) = c0_nxt4;
  905. *(c + 1 + 4 * ldc) = c1_nxt4;
  906. *(c + 0 + 5 * ldc) = c0_nxt5;
  907. *(c + 1 + 5 * ldc) = c1_nxt5;
  908. *(c + 0 + 6 * ldc) = c0_nxt6;
  909. *(c + 1 + 6 * ldc) = c1_nxt6;
  910. *(c + 0 + 7 * ldc) = c0_nxt7;
  911. *(c + 1 + 7 * ldc) = c1_nxt7;
  912. }
  913. static void ssolve_2x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  914. {
  915. BLASLONG k;
  916. FLOAT *aa = a, *bb = b;
  917. FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15;
  918. FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3;
  919. c0 = *(c + 0);
  920. c1 = *(c + 1);
  921. c0_nxt1 = *(c + 0 + 1 * ldc);
  922. c1_nxt1 = *(c + 1 + 1 * ldc);
  923. c0_nxt2 = *(c + 0 + 2 * ldc);
  924. c1_nxt2 = *(c + 1 + 2 * ldc);
  925. c0_nxt3 = *(c + 0 + 3 * ldc);
  926. c1_nxt3 = *(c + 1 + 3 * ldc);
  927. for (k = 0; k < bk; k++)
  928. {
  929. c0 -= aa[0] * bb[0];
  930. c1 -= aa[1] * bb[0];
  931. c0_nxt1 -= aa[0] * bb[1];
  932. c1_nxt1 -= aa[1] * bb[1];
  933. c0_nxt2 -= aa[0] * bb[2];
  934. c1_nxt2 -= aa[1] * bb[2];
  935. c0_nxt3 -= aa[0] * bb[3];
  936. c1_nxt3 -= aa[1] * bb[3];
  937. aa += 2;
  938. bb += 4;
  939. }
  940. a -= 8;
  941. b -= 16;
  942. b0 = *b;
  943. b4 = *(b + 4);
  944. b5 = *(b + 5);
  945. b8 = *(b + 8);
  946. b9 = *(b + 9);
  947. b10 = *(b + 10);
  948. b12 = *(b + 12);
  949. b13 = *(b + 13);
  950. b14 = *(b + 14);
  951. b15 = *(b + 15);
  952. c0_nxt3 *= b15;
  953. c1_nxt3 *= b15;
  954. c0_nxt2 = (c0_nxt2 - c0_nxt3 * b14) * b10;
  955. c1_nxt2 = (c1_nxt2 - c1_nxt3 * b14) * b10;
  956. c0_nxt1 = ((c0_nxt1 - c0_nxt3 * b13) - c0_nxt2 * b9) * b5;
  957. c1_nxt1 = ((c1_nxt1 - c1_nxt3 * b13) - c1_nxt2 * b9) * b5;
  958. c0 = (((c0 - c0_nxt3 * b12) - c0_nxt2 * b8) - c0_nxt1 * b4) * b0;
  959. c1 = (((c1 - c1_nxt3 * b12) - c1_nxt2 * b8) - c1_nxt1 * b4) * b0;
  960. *(a + 0) = c0;
  961. *(a + 1) = c1;
  962. *(a + 2) = c0_nxt1;
  963. *(a + 3) = c1_nxt1;
  964. *(a + 4) = c0_nxt2;
  965. *(a + 5) = c1_nxt2;
  966. *(a + 6) = c0_nxt3;
  967. *(a + 7) = c1_nxt3;
  968. *(c + 0) = c0;
  969. *(c + 1) = c1;
  970. *(c + 0 + 1 * ldc) = c0_nxt1;
  971. *(c + 1 + 1 * ldc) = c1_nxt1;
  972. *(c + 0 + 2 * ldc) = c0_nxt2;
  973. *(c + 1 + 2 * ldc) = c1_nxt2;
  974. *(c + 0 + 3 * ldc) = c0_nxt3;
  975. *(c + 1 + 3 * ldc) = c1_nxt3;
  976. }
  977. static void ssolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  978. {
  979. BLASLONG k;
  980. FLOAT *aa = a, *bb = b;
  981. FLOAT b0, b2, b3, c0, c1, c0_nxt, c1_nxt;
  982. c0 = *(c + 0);
  983. c1 = *(c + 1);
  984. c0_nxt = *(c + 0 + ldc);
  985. c1_nxt = *(c + 1 + ldc);
  986. for (k = 0; k < bk; k++)
  987. {
  988. c0 -= aa[0] * bb[0];
  989. c1 -= aa[1] * bb[0];
  990. c0_nxt -= aa[0] * bb[1];
  991. c1_nxt -= aa[1] * bb[1];
  992. aa += 2;
  993. bb += 2;
  994. }
  995. a -= 4;
  996. b -= 4;
  997. b3 = *(b + 3);
  998. b2 = *(b + 2);
  999. b0 = *b;
  1000. c0_nxt *= b3;
  1001. c1_nxt *= b3;
  1002. c0 -= c0_nxt * b2;
  1003. c1 -= c1_nxt * b2;
  1004. c0 *= b0;
  1005. c1 *= b0;
  1006. *(a + 0) = c0;
  1007. *(a + 1) = c1;
  1008. *(a + 2) = c0_nxt;
  1009. *(a + 3) = c1_nxt;
  1010. *(c + 0) = c0;
  1011. *(c + 1) = c1;
  1012. *(c + 0 + ldc) = c0_nxt;
  1013. *(c + 1 + ldc) = c1_nxt;
  1014. }
  1015. static void ssolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
  1016. {
  1017. BLASLONG k;
  1018. FLOAT *aa = a, *bb = b;
  1019. FLOAT b0, c0, c1;
  1020. c0 = *(c + 0);
  1021. c1 = *(c + 1);
  1022. for (k = 0; k < bk; k++)
  1023. {
  1024. c0 -= aa[0] * bb[0];
  1025. c1 -= aa[1] * bb[0];
  1026. aa += 2;
  1027. bb += 1;
  1028. }
  1029. a -= 2;
  1030. b -= 1;
  1031. b0 = *b;
  1032. c0 *= b0;
  1033. c1 *= b0;
  1034. *(a + 0) = c0;
  1035. *(a + 1) = c1;
  1036. *(c + 0) = c0;
  1037. *(c + 1) = c1;
  1038. }
  1039. static void ssolve_1x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  1040. {
  1041. BLASLONG k;
  1042. FLOAT *aa = a, *bb = b;
  1043. FLOAT b0, b8, b9, b16, b17, b18, b24, b25, b26, b27, b32, b33, b34, b35;
  1044. FLOAT b36, b40, b41, b42, b43, b44, b45, b48, b49, b50, b51, b52, b53, b54;
  1045. FLOAT b56, b57, b58, b59, b60, b61, b62, b63;
  1046. FLOAT c0, c1, c2, c3, c4, c5, c6, c7;
  1047. c0 = *(c + 0);
  1048. c1 = *(c + 1 * ldc);
  1049. c2 = *(c + 2 * ldc);
  1050. c3 = *(c + 3 * ldc);
  1051. c4 = *(c + 4 * ldc);
  1052. c5 = *(c + 5 * ldc);
  1053. c6 = *(c + 6 * ldc);
  1054. c7 = *(c + 7 * ldc);
  1055. for (k = 0; k < bk; k++)
  1056. {
  1057. c0 -= aa[0] * bb[0];
  1058. c1 -= aa[0] * bb[1];
  1059. c2 -= aa[0] * bb[2];
  1060. c3 -= aa[0] * bb[3];
  1061. c4 -= aa[0] * bb[4];
  1062. c5 -= aa[0] * bb[5];
  1063. c6 -= aa[0] * bb[6];
  1064. c7 -= aa[0] * bb[7];
  1065. aa += 1;
  1066. bb += 8;
  1067. }
  1068. a -= 8;
  1069. b -= 64;
  1070. b0 = *(b + 0);
  1071. b8 = *(b + 8);
  1072. b9 = *(b + 9);
  1073. b16 = *(b + 16);
  1074. b17 = *(b + 17);
  1075. b18 = *(b + 18);
  1076. b24 = *(b + 24);
  1077. b25 = *(b + 25);
  1078. b26 = *(b + 26);
  1079. b27 = *(b + 27);
  1080. b32 = *(b + 32);
  1081. b33 = *(b + 33);
  1082. b34 = *(b + 34);
  1083. b35 = *(b + 35);
  1084. b36 = *(b + 36);
  1085. b40 = *(b + 40);
  1086. b41 = *(b + 41);
  1087. b42 = *(b + 42);
  1088. b43 = *(b + 43);
  1089. b44 = *(b + 44);
  1090. b45 = *(b + 45);
  1091. b48 = *(b + 48);
  1092. b49 = *(b + 49);
  1093. b50 = *(b + 50);
  1094. b51 = *(b + 51);
  1095. b52 = *(b + 52);
  1096. b53 = *(b + 53);
  1097. b54 = *(b + 54);
  1098. b56 = *(b + 56);
  1099. b57 = *(b + 57);
  1100. b58 = *(b + 58);
  1101. b59 = *(b + 59);
  1102. b60 = *(b + 60);
  1103. b61 = *(b + 61);
  1104. b62 = *(b + 62);
  1105. b63 = *(b + 63);
  1106. c7 *= b63;
  1107. c6 -= c7 * b62;
  1108. c6 *= b54;
  1109. c5 -= c7 * b61;
  1110. c5 -= c6 * b53;
  1111. c5 *= b45;
  1112. c4 -= c7 * b60;
  1113. c4 -= c6 * b52;
  1114. c4 -= c5 * b44;
  1115. c4 *= b36;
  1116. c3 -= c7 * b59;
  1117. c3 -= c6 * b51;
  1118. c3 -= c5 * b43;
  1119. c3 -= c4 * b35;
  1120. c3 *= b27;
  1121. c2 -= c7 * b58;
  1122. c2 -= c6 * b50;
  1123. c2 -= c5 * b42;
  1124. c2 -= c4 * b34;
  1125. c2 -= c3 * b26;
  1126. c2 *= b18;
  1127. c1 -= c7 * b57;
  1128. c1 -= c6 * b49;
  1129. c1 -= c5 * b41;
  1130. c1 -= c4 * b33;
  1131. c1 -= c3 * b25;
  1132. c1 -= c2 * b17;
  1133. c1 *= b9;
  1134. c0 -= c7 * b56;
  1135. c0 -= c6 * b48;
  1136. c0 -= c5 * b40;
  1137. c0 -= c4 * b32;
  1138. c0 -= c3 * b24;
  1139. c0 -= c2 * b16;
  1140. c0 -= c1 * b8;
  1141. c0 *= b0;
  1142. *(a + 0) = c0;
  1143. *(a + 1) = c1;
  1144. *(a + 2) = c2;
  1145. *(a + 3) = c3;
  1146. *(a + 4) = c4;
  1147. *(a + 5) = c5;
  1148. *(a + 6) = c6;
  1149. *(a + 7) = c7;
  1150. *(c + 0) = c0;
  1151. *(c + 1 * ldc) = c1;
  1152. *(c + 2 * ldc) = c2;
  1153. *(c + 3 * ldc) = c3;
  1154. *(c + 4 * ldc) = c4;
  1155. *(c + 5 * ldc) = c5;
  1156. *(c + 6 * ldc) = c6;
  1157. *(c + 7 * ldc) = c7;
  1158. }
  1159. static void ssolve_1x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  1160. {
  1161. BLASLONG k;
  1162. FLOAT *aa = a, *bb = b;
  1163. FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15;
  1164. FLOAT c0, c1, c2, c3;
  1165. c0 = *(c + 0);
  1166. c1 = *(c + 1 * ldc);
  1167. c2 = *(c + 2 * ldc);
  1168. c3 = *(c + 3 * ldc);
  1169. for (k = 0; k < bk; k++)
  1170. {
  1171. c0 -= aa[0] * bb[0];
  1172. c1 -= aa[0] * bb[1];
  1173. c2 -= aa[0] * bb[2];
  1174. c3 -= aa[0] * bb[3];
  1175. aa += 1;
  1176. bb += 4;
  1177. }
  1178. a -= 4;
  1179. b -= 16;
  1180. b0 = *b;
  1181. b4 = *(b + 4);
  1182. b5 = *(b + 5);
  1183. b8 = *(b + 8);
  1184. b9 = *(b + 9);
  1185. b10 = *(b + 10);
  1186. b12 = *(b + 12);
  1187. b13 = *(b + 13);
  1188. b14 = *(b + 14);
  1189. b15 = *(b + 15);
  1190. c3 *= b15;
  1191. c2 = (c2 - c3 * b14) * b10;
  1192. c1 = ((c1 - c3 * b13) - c2 * b9) * b5;
  1193. c0 = (((c0 - c3 * b12) - c2 * b8) - c1 * b4) * b0;
  1194. *(a + 0) = c0;
  1195. *(a + 1) = c1;
  1196. *(a + 2) = c2;
  1197. *(a + 3) = c3;
  1198. *(c) = c0;
  1199. *(c + 1 * ldc) = c1;
  1200. *(c + 2 * ldc) = c2;
  1201. *(c + 3 * ldc) = c3;
  1202. }
  1203. static void ssolve_1x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  1204. {
  1205. BLASLONG k;
  1206. FLOAT *aa = a, *bb = b;
  1207. FLOAT b0, b2, b3, c0, c1;
  1208. c0 = *(c + 0);
  1209. c1 = *(c + ldc);
  1210. for (k = 0; k < bk; k++)
  1211. {
  1212. c0 -= aa[0] * bb[0];
  1213. c1 -= aa[0] * bb[1];
  1214. aa += 1;
  1215. bb += 2;
  1216. }
  1217. a -= 2;
  1218. b -= 4;
  1219. b3 = *(b + 3);
  1220. b2 = *(b + 2);
  1221. b0 = *b;
  1222. c1 *= b3;
  1223. c0 -= c1 * b2;
  1224. c0 *= b0;
  1225. *(a + 0) = c0;
  1226. *(a + 1) = c1;
  1227. *(c + 0) = c0;
  1228. *(c + ldc) = c1;
  1229. }
  1230. static void ssolve_1x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
  1231. {
  1232. BLASLONG k;
  1233. for (k = 0; k < bk; k++)
  1234. {
  1235. *c -= a[k] * b[k];
  1236. }
  1237. *c *= *(a - 1);
  1238. *(b - 1) = *c;
  1239. }
  1240. int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
  1241. FLOAT *c, BLASLONG ldc, BLASLONG offset)
  1242. {
  1243. FLOAT *aa, *cc;
  1244. BLASLONG i, j, kk;
  1245. kk = n - offset;
  1246. c += n * ldc;
  1247. b += n * k;
  1248. if (n & 7)
  1249. {
  1250. if (n & 1)
  1251. {
  1252. aa = a;
  1253. b -= k;
  1254. c -= ldc;
  1255. cc = c;
  1256. for (i = (m >> 3); i--;)
  1257. {
  1258. ssolve_8x1_rt_msa(aa + 8 * kk, b + kk, cc, (k - kk));
  1259. aa += 8 * k;
  1260. cc += 8;
  1261. }
  1262. if (m & 7)
  1263. {
  1264. if (m & 4)
  1265. {
  1266. ssolve_4x1_rt_msa(aa + 4 * kk, b + kk, cc, (k - kk));
  1267. aa += 4 * k;
  1268. cc += 4;
  1269. }
  1270. if (m & 2)
  1271. {
  1272. ssolve_2x1_rt_msa(aa + 2 * kk, b + kk, cc, (k - kk));
  1273. aa += 2 * k;
  1274. cc += 2;
  1275. }
  1276. if (m & 1)
  1277. {
  1278. ssolve_1x1_rt_msa(b + kk, aa + kk, cc, (k - kk));
  1279. aa += k;
  1280. cc += 1;
  1281. }
  1282. }
  1283. kk -= 1;
  1284. }
  1285. if (n & 2)
  1286. {
  1287. aa = a;
  1288. b -= 2 * k;
  1289. c -= 2 * ldc;
  1290. cc = c;
  1291. for (i = (m >> 3); i--;)
  1292. {
  1293. ssolve_8x2_rt_msa(aa + 8 * kk, b + 2 * kk, cc, ldc, (k - kk));
  1294. aa += 8 * k;
  1295. cc += 8;
  1296. }
  1297. if (m & 7)
  1298. {
  1299. if (m & 4)
  1300. {
  1301. ssolve_4x2_rt_msa(aa + 4 * kk, b + 2 * kk, cc, ldc, (k - kk));
  1302. aa += 4 * k;
  1303. cc += 4;
  1304. }
  1305. if (m & 2)
  1306. {
  1307. ssolve_2x2_rt_msa(aa + 2 * kk, b + 2 * kk, cc, ldc, (k - kk));
  1308. aa += 2 * k;
  1309. cc += 2;
  1310. }
  1311. if (m & 1)
  1312. {
  1313. ssolve_1x2_rt_msa(aa + kk, b + 2 * kk, cc, ldc, (k - kk));
  1314. aa += k;
  1315. cc += 1;
  1316. }
  1317. }
  1318. kk -= 2;
  1319. }
  1320. if (n & 4)
  1321. {
  1322. aa = a;
  1323. b -= 4 * k;
  1324. c -= 4 * ldc;
  1325. cc = c;
  1326. for (i = (m >> 3); i--;)
  1327. {
  1328. ssolve_8x4_rt_msa(aa + 8 * kk, b + 4 * kk, cc, ldc, (k - kk));
  1329. aa += 8 * k;
  1330. cc += 8;
  1331. }
  1332. if (m & 7)
  1333. {
  1334. if (m & 4)
  1335. {
  1336. ssolve_4x4_rt_msa(aa + 4 * kk, b + 4 * kk, cc, ldc, (k - kk));
  1337. aa += 4 * k;
  1338. cc += 4;
  1339. }
  1340. if (m & 2)
  1341. {
  1342. ssolve_2x4_rt_msa(aa + 2 * kk, b + 4 * kk, cc, ldc, (k - kk));
  1343. aa += 2 * k;
  1344. cc += 2;
  1345. }
  1346. if (m & 1)
  1347. {
  1348. ssolve_1x4_rt_msa(aa + kk, b + 4 * kk, cc, ldc, (k - kk));
  1349. aa += k;
  1350. cc += 1;
  1351. }
  1352. }
  1353. kk -= 4;
  1354. }
  1355. }
  1356. for (j = (n >> 3); j--;)
  1357. {
  1358. aa = a;
  1359. b -= 8 * k;
  1360. c -= 8 * ldc;
  1361. cc = c;
  1362. for (i = (m >> 3); i--;)
  1363. {
  1364. ssolve_8x8_rt_msa(aa + 8 * kk, b + 8 * kk, cc, ldc, (k - kk));
  1365. aa += 8 * k;
  1366. cc += 8;
  1367. }
  1368. if (m & 7)
  1369. {
  1370. if (m & 4)
  1371. {
  1372. ssolve_4x8_rt_msa(aa + 4 * kk, b + 8 * kk, cc, ldc, (k - kk));
  1373. aa += 4 * k;
  1374. cc += 4;
  1375. }
  1376. if (m & 2)
  1377. {
  1378. ssolve_2x8_rt_msa(aa + 2 * kk, b + 8 * kk, cc, ldc, (k - kk));
  1379. aa += 2 * k;
  1380. cc += 2;
  1381. }
  1382. if (m & 1)
  1383. {
  1384. ssolve_1x8_rt_msa(aa + kk, b + 8 * kk, cc, ldc, (k - kk));
  1385. aa += k;
  1386. cc += 1;
  1387. }
  1388. }
  1389. kk -= 8;
  1390. }
  1391. return 0;
  1392. }