You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trmm_utcopy_6.c 16 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #include <stdio.h>
  39. #include "common.h"
  40. int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
  41. BLASLONG i, js, ii;
  42. BLASLONG X;
  43. FLOAT data01, data02, data05, data06;
  44. FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6;
  45. js = (n / 6);
  46. if (js > 0){
  47. do {
  48. X = posX;
  49. if (posX <= posY) {
  50. ao1 = a + posX + (posY + 0) * lda;
  51. ao2 = a + posX + (posY + 1) * lda;
  52. ao3 = a + posX + (posY + 2) * lda;
  53. ao4 = a + posX + (posY + 3) * lda;
  54. ao5 = a + posX + (posY + 4) * lda;
  55. ao6 = a + posX + (posY + 5) * lda;
  56. } else {
  57. ao1 = a + posY + (posX + 0) * lda;
  58. ao2 = a + posY + (posX + 1) * lda;
  59. ao3 = a + posY + (posX + 2) * lda;
  60. ao4 = a + posY + (posX + 3) * lda;
  61. ao5 = a + posY + (posX + 4) * lda;
  62. ao6 = a + posY + (posX + 5) * lda;
  63. }
  64. i = (m / 6);
  65. if (i > 0) {
  66. do {
  67. if (X < posY) {
  68. ao1 += 6;
  69. ao2 += 6;
  70. ao3 += 6;
  71. ao4 += 6;
  72. ao5 += 6;
  73. ao6 += 6;
  74. b += 36;
  75. } else if (X > posY) {
  76. for (ii = 0; ii < 6; ii++){
  77. b[ 0] = *(ao1 + 0);
  78. b[ 1] = *(ao1 + 1);
  79. b[ 2] = *(ao1 + 2);
  80. b[ 3] = *(ao1 + 3);
  81. b[ 4] = *(ao1 + 4);
  82. b[ 5] = *(ao1 + 5);
  83. ao1 += lda;
  84. b += 6;
  85. }
  86. ao2 += 6 * lda;
  87. ao3 += 6 * lda;
  88. ao4 += 6 * lda;
  89. ao5 += 6 * lda;
  90. ao6 += 6 * lda;
  91. } else {
  92. #ifdef UNIT
  93. b[ 0] = ONE;
  94. #else
  95. b[ 0] = *(ao1 + 0);
  96. #endif
  97. b[ 1] = ZERO;
  98. b[ 2] = ZERO;
  99. b[ 3] = ZERO;
  100. b[ 4] = ZERO;
  101. b[ 5] = ZERO;
  102. b[ 6] = *(ao2 + 0);
  103. #ifdef UNIT
  104. b[ 7] = ONE;
  105. #else
  106. b[ 7] = *(ao2 + 1);
  107. #endif
  108. b[ 8] = ZERO;
  109. b[ 9] = ZERO;
  110. b[10] = ZERO;
  111. b[11] = ZERO;
  112. b[12] = *(ao3 + 0);
  113. b[13] = *(ao3 + 1);
  114. #ifdef UNIT
  115. b[14] = ONE;
  116. #else
  117. b[14] = *(ao3 + 2);
  118. #endif
  119. b[15] = ZERO;
  120. b[16] = ZERO;
  121. b[17] = ZERO;
  122. b[18] = *(ao4 + 0);
  123. b[19] = *(ao4 + 1);
  124. b[20] = *(ao4 + 2);
  125. #ifdef UNIT
  126. b[21] = ONE;
  127. #else
  128. b[21] = *(ao4 + 3);
  129. #endif
  130. b[22] = ZERO;
  131. b[23] = ZERO;
  132. b[24] = *(ao5 + 0);
  133. b[25] = *(ao5 + 1);
  134. b[26] = *(ao5 + 2);
  135. b[27] = *(ao5 + 3);
  136. #ifdef UNIT
  137. b[28] = ONE;
  138. #else
  139. b[28] = *(ao5 + 4);
  140. #endif
  141. b[29] = ZERO;
  142. b[30] = *(ao6 + 0);
  143. b[31] = *(ao6 + 1);
  144. b[32] = *(ao6 + 2);
  145. b[33] = *(ao6 + 3);
  146. b[34] = *(ao6 + 4);
  147. #ifdef UNIT
  148. b[35] = ONE;
  149. #else
  150. b[35] = *(ao6 + 5);
  151. #endif
  152. ao1 += 6 * lda;
  153. ao2 += 6 * lda;
  154. ao3 += 6 * lda;
  155. ao4 += 6 * lda;
  156. ao5 += 6 * lda;
  157. ao6 += 6 * lda;
  158. b += 36;
  159. }
  160. X += 6;
  161. i --;
  162. } while (i > 0);
  163. }
  164. i = m % 6;
  165. if (i > 0) {
  166. if (X < posY) {
  167. ao1 += i;
  168. ao2 += i;
  169. ao3 += i;
  170. ao4 += i;
  171. ao5 += i;
  172. ao6 += i;
  173. b += 6 * i;
  174. } else if (X > posY) {
  175. for (ii = 0; ii < i; ii++){
  176. b[ 0] = *(ao1 + 0);
  177. b[ 1] = *(ao1 + 1);
  178. b[ 2] = *(ao1 + 2);
  179. b[ 3] = *(ao1 + 3);
  180. b[ 4] = *(ao1 + 4);
  181. b[ 5] = *(ao1 + 5);
  182. ao1 += lda;
  183. ao2 += lda;
  184. ao3 += lda;
  185. ao4 += lda;
  186. ao5 += lda;
  187. ao6 += lda;
  188. b += 6;
  189. }
  190. } else {
  191. #ifdef UNIT
  192. b[ 0] = ONE;
  193. #else
  194. b[ 0] = *(ao1 + 0);
  195. #endif
  196. b[ 1] = ZERO;
  197. b[ 2] = ZERO;
  198. b[ 3] = ZERO;
  199. b[ 4] = ZERO;
  200. b[ 5] = ZERO;
  201. if (i >= 2) {
  202. b[ 0] = *(ao2 + 0);
  203. #ifdef UNIT
  204. b[ 1] = ONE;
  205. #else
  206. b[ 1] = *(ao2 + 1);
  207. #endif
  208. b[ 2] = ZERO;
  209. b[ 3] = ZERO;
  210. b[ 4] = ZERO;
  211. b[ 5] = ZERO;
  212. b += 6;
  213. }
  214. if (i >= 3) {
  215. b[ 0] = *(ao3 + 0);
  216. b[ 1] = *(ao3 + 1);
  217. #ifdef UNIT
  218. b[ 2] = ONE;
  219. #else
  220. b[ 2] = *(ao3 + 2);
  221. #endif
  222. b[ 3] = ZERO;
  223. b[ 4] = ZERO;
  224. b[ 5] = ZERO;
  225. b += 6;
  226. }
  227. if (i >= 4) {
  228. b[ 0] = *(ao4 + 0);
  229. b[ 1] = *(ao4 + 1);
  230. b[ 2] = *(ao4 + 2);
  231. #ifdef UNIT
  232. b[ 3] = ONE;
  233. #else
  234. b[ 3] = *(ao4 + 3);
  235. #endif
  236. b[ 4] = ZERO;
  237. b[ 5] = ZERO;
  238. b += 6;
  239. }
  240. if (i >= 5) {
  241. b[ 0] = *(ao5 + 0);
  242. b[ 1] = *(ao5 + 1);
  243. b[ 2] = *(ao5 + 2);
  244. b[ 3] = *(ao5 + 3);
  245. #ifdef UNIT
  246. b[ 4] = ONE;
  247. #else
  248. b[ 4] = *(ao5 + 4);
  249. #endif
  250. b[ 5] = ZERO;
  251. b += 6;
  252. }
  253. }
  254. }
  255. posY += 6;
  256. js --;
  257. } while (js > 0);
  258. } /* End of main loop */
  259. if ((n % 6) & 4){
  260. X = posX;
  261. if (posX <= posY) {
  262. ao1 = a + posX + (posY + 0) * lda;
  263. ao2 = a + posX + (posY + 1) * lda;
  264. ao3 = a + posX + (posY + 2) * lda;
  265. ao4 = a + posX + (posY + 3) * lda;
  266. } else {
  267. ao1 = a + posY + (posX + 0) * lda;
  268. ao2 = a + posY + (posX + 1) * lda;
  269. ao3 = a + posY + (posX + 2) * lda;
  270. ao4 = a + posY + (posX + 3) * lda;
  271. }
  272. i = (m >> 1);
  273. if (i > 0) {
  274. do {
  275. if (X < posY) {
  276. ao1 += 2;
  277. ao2 += 2;
  278. ao3 += 2;
  279. ao4 += 2;
  280. b += 8;
  281. } else if (X > posY) {
  282. for (ii = 0; ii < 2; ii++){
  283. b[ 0] = *(ao1 + 0);
  284. b[ 1] = *(ao1 + 1);
  285. b[ 2] = *(ao1 + 2);
  286. b[ 3] = *(ao1 + 3);
  287. ao1 += lda;
  288. b += 4;
  289. }
  290. ao2 += 2 * lda;
  291. ao3 += 2 * lda;
  292. ao4 += 2 * lda;
  293. } else {
  294. #ifdef UNIT
  295. b[ 0] = ONE;
  296. #else
  297. b[ 0] = *(ao1 + 0);
  298. #endif
  299. b[ 1] = ZERO;
  300. b[ 2] = ZERO;
  301. b[ 3] = ZERO;
  302. b[ 4] = *(ao2 + 0);
  303. #ifdef UNIT
  304. b[ 5] = ONE;
  305. #else
  306. b[ 5] = *(ao2 + 1);
  307. #endif
  308. b[ 6] = ZERO;
  309. b[ 7] = ZERO;
  310. b[ 8] = *(ao3 + 0);
  311. b[ 9] = *(ao3 + 1);
  312. #ifdef UNIT
  313. b[ 10] = ONE;
  314. #else
  315. b[ 10] = *(ao3 + 2);
  316. #endif
  317. b[ 11] = ZERO;
  318. b[ 12] = *(ao4 + 0);
  319. b[ 13] = *(ao4 + 1);
  320. b[ 14] = *(ao4 + 2);
  321. #ifdef UNIT
  322. b[ 15] = ONE;
  323. #else
  324. b[ 15] = *(ao4 + 3);
  325. #endif
  326. ao1 += 4 * lda;
  327. ao2 += 4 * lda;
  328. ao3 += 4 * lda;
  329. ao4 += 4 * lda;
  330. b += 16;
  331. X += 4;
  332. i -= 2;
  333. continue;
  334. }
  335. X += 2;
  336. i --;
  337. } while (i > 0);
  338. }
  339. i = (m & 1);
  340. if (i > 0) {
  341. if (X < posY) {
  342. ao1 += i;
  343. ao2 += i;
  344. ao3 += i;
  345. ao4 += i;
  346. b += 4 * i;
  347. } else if (X > posY) {
  348. for (ii = 0; ii < i; ii++){
  349. b[ 0] = *(ao1 + 0);
  350. b[ 1] = *(ao1 + 1);
  351. b[ 2] = *(ao1 + 2);
  352. b[ 3] = *(ao1 + 3);
  353. ao1 += lda;
  354. b += 4;
  355. }
  356. ao2 += lda;
  357. ao3 += lda;
  358. ao4 += lda;
  359. } else {
  360. #ifdef UNIT
  361. b[ 0] = ONE;
  362. #else
  363. b[ 0] = *(ao1 + 0);
  364. #endif
  365. b[ 1] = ZERO;
  366. b[ 2] = ZERO;
  367. b[ 3] = ZERO;
  368. b += 4;
  369. }
  370. }
  371. posY += 4;
  372. }
  373. if ((n % 6) & 2){
  374. X = posX;
  375. if (posX <= posY) {
  376. ao1 = a + posX + (posY + 0) * lda;
  377. ao2 = a + posX + (posY + 1) * lda;
  378. } else {
  379. ao1 = a + posY + (posX + 0) * lda;
  380. ao2 = a + posY + (posX + 1) * lda;
  381. }
  382. i = (m >> 1);
  383. if (i > 0) {
  384. do {
  385. if (X < posY) {
  386. ao1 += 2;
  387. ao2 += 2;
  388. b += 4;
  389. } else if (X > posY) {
  390. data01 = *(ao1 + 0);
  391. data02 = *(ao1 + 1);
  392. data05 = *(ao2 + 0);
  393. data06 = *(ao2 + 1);
  394. b[ 0] = data01;
  395. b[ 1] = data02;
  396. b[ 2] = data05;
  397. b[ 3] = data06;
  398. ao1 += 2 * lda;
  399. ao2 += 2 * lda;
  400. b += 4;
  401. } else {
  402. #ifdef UNIT
  403. data05 = *(ao2 + 0);
  404. b[ 0] = ONE;
  405. b[ 1] = ZERO;
  406. b[ 2] = data05;
  407. b[ 3] = ONE;
  408. #else
  409. data01 = *(ao1 + 0);
  410. data05 = *(ao2 + 0);
  411. data06 = *(ao2 + 1);
  412. b[ 0] = data01;
  413. b[ 1] = ZERO;
  414. b[ 2] = data05;
  415. b[ 3] = data06;
  416. #endif
  417. ao1 += 2 * lda;
  418. ao2 += 2 * lda;
  419. b += 4;
  420. }
  421. X += 2;
  422. i --;
  423. } while (i > 0);
  424. }
  425. i = (m & 1);
  426. if (i) {
  427. if (X < posY) {
  428. ao1 += 2;
  429. b += 2;
  430. } else if (X > posY) {
  431. data01 = *(ao1 + 0);
  432. data02 = *(ao1 + 1);
  433. b[ 0] = data01;
  434. b[ 1] = data02;
  435. ao1 += lda;
  436. b += 2;
  437. } else {
  438. #ifdef UNIT
  439. b[ 0] = ONE;
  440. b[ 1] = ZERO;
  441. #else
  442. data01 = *(ao1 + 0);
  443. b[ 0] = data01;
  444. b[ 1] = ZERO;
  445. #endif
  446. b += 2;
  447. }
  448. }
  449. posY += 2;
  450. }
  451. if ((n % 6) & 1){
  452. X = posX;
  453. if (posX <= posY) {
  454. ao1 = a + posX + (posY + 0) * lda;
  455. } else {
  456. ao1 = a + posY + (posX + 0) * lda;
  457. }
  458. i = m;
  459. if (m > 0) {
  460. do {
  461. if (X < posY) {
  462. b += 1;
  463. ao1 += 1;
  464. } else if (X > posY) {
  465. data01 = *(ao1 + 0);
  466. b[ 0] = data01;
  467. ao1 += lda;
  468. b += 1;
  469. } else {
  470. #ifdef UNIT
  471. b[ 0] = ONE;
  472. #else
  473. data01 = *(ao1 + 0);
  474. b[ 0] = data01;
  475. #endif
  476. ao1 += lda;
  477. b += 1;
  478. }
  479. X += 1;
  480. i --;
  481. } while (i > 0);
  482. }
  483. }
  484. return 0;
  485. }