You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

zherk_kernel.c 5.7 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #include <stdio.h>
  39. #include "common.h"
  40. #ifndef CONJ
  41. #define GEMM_KERNEL GEMM_KERNEL_R
  42. #define GEMM_KERNEL_B0 GEMM_KERNEL_R_B0
  43. #else
  44. #define GEMM_KERNEL GEMM_KERNEL_L
  45. #define GEMM_KERNEL_B0 GEMM_KERNEL_L_B0
  46. #endif
  47. int
  48. #ifndef C_MSVC
  49. __attribute__((visibility("hidden")))
  50. #endif
  51. CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
  52. FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){
  53. BLASLONG i, j;
  54. BLASLONG loop;
  55. FLOAT *cc, *ss;
  56. FLOAT subbuffer[GEMM_UNROLL_MN * (GEMM_UNROLL_MN + 1) * COMPSIZE];
  57. if (m + offset < 0) {
  58. #ifndef LOWER
  59. GEMM_KERNEL(m, n, k,
  60. alpha_r, ZERO,
  61. a, b, c, ldc);
  62. #endif
  63. return 0;
  64. }
  65. if (n < offset) {
  66. #ifdef LOWER
  67. GEMM_KERNEL(m, n, k,
  68. alpha_r, ZERO,
  69. a, b, c, ldc);
  70. #endif
  71. return 0;
  72. }
  73. if (offset > 0) {
  74. #ifdef LOWER
  75. GEMM_KERNEL(m, offset, k,
  76. alpha_r, ZERO,
  77. a, b, c, ldc);
  78. #endif
  79. b += offset * k * COMPSIZE;
  80. c += offset * ldc * COMPSIZE;
  81. n -= offset;
  82. offset = 0;
  83. if (n <= 0) return 0;
  84. }
  85. if (n > m + offset) {
  86. #ifndef LOWER
  87. GEMM_KERNEL(m, n - m - offset, k,
  88. alpha_r, ZERO,
  89. a,
  90. b + (m + offset) * k * COMPSIZE,
  91. c + (m + offset) * ldc * COMPSIZE, ldc);
  92. #endif
  93. n = m + offset;
  94. if (n <= 0) return 0;
  95. }
  96. if (offset < 0) {
  97. #ifndef LOWER
  98. GEMM_KERNEL(-offset, n, k,
  99. alpha_r, ZERO,
  100. a, b, c, ldc);
  101. #endif
  102. a -= offset * k * COMPSIZE;
  103. c -= offset * COMPSIZE;
  104. m += offset;
  105. offset = 0;
  106. if (m <= 0) return 0;
  107. }
  108. if (m > n - offset) {
  109. #ifdef LOWER
  110. GEMM_KERNEL(m - n + offset, n, k,
  111. alpha_r, ZERO,
  112. a + (n - offset) * k * COMPSIZE,
  113. b,
  114. c + (n - offset) * COMPSIZE, ldc);
  115. #endif
  116. m = n + offset;
  117. if (m <= 0) return 0;
  118. }
  119. for (loop = 0; loop < n; loop += GEMM_UNROLL_MN) {
  120. int mm, nn;
  121. mm = (loop/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
  122. nn = MIN(GEMM_UNROLL_MN, n - loop);
  123. #ifndef LOWER
  124. GEMM_KERNEL(mm, nn, k,
  125. alpha_r, ZERO,
  126. a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc);
  127. #endif
  128. GEMM_BETA(nn, nn, 0, ZERO, ZERO,
  129. NULL, 0, NULL, 0, subbuffer, nn);
  130. GEMM_KERNEL(nn, nn, k,
  131. alpha_r, ZERO,
  132. a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn);
  133. cc = c + (loop + loop * ldc) * COMPSIZE;
  134. ss = subbuffer;
  135. #ifndef LOWER
  136. for (j = 0; j < nn; j ++) {
  137. for (i = 0; i <j; i ++) {
  138. cc[i * 2 + 0] += ss[i * 2 + 0];
  139. cc[i * 2 + 1] += ss[i * 2 + 1];
  140. }
  141. cc[j * 2 + 0] += ss[i * 2 + 0];
  142. cc[j * 2 + 1] = ZERO;
  143. ss += nn * COMPSIZE;
  144. cc += ldc * COMPSIZE;
  145. }
  146. #else
  147. for (j = 0; j < nn; j ++) {
  148. cc[j * 2 + 0] += ss[j * 2 + 0];
  149. cc[j * 2 + 1] = ZERO;
  150. for (i = j + 1; i < nn; i ++) {
  151. cc[i * 2 + 0] += ss[i * 2 + 0];
  152. cc[i * 2 + 1] += ss[i * 2 + 1];
  153. }
  154. ss += nn * COMPSIZE;
  155. cc += ldc * COMPSIZE;
  156. }
  157. #endif
  158. #ifdef LOWER
  159. GEMM_KERNEL(m - mm - nn, nn, k,
  160. alpha_r, ZERO,
  161. a + (mm + nn) * k * COMPSIZE, b + loop * k * COMPSIZE,
  162. c + (mm + nn + loop * ldc) * COMPSIZE, ldc);
  163. #endif
  164. }
  165. return 0;
  166. }