You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

casum.c 3.2 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144
  1. #include "common.h"
  /*
   * ABS_K(a): magnitude of a scalar, unless the build already supplied its
   * own ABS_K.  Any value that is not strictly greater than zero is negated,
   * so -0.0 becomes +0.0 and +0.0 becomes -0.0.
   * The argument is expanded more than once: do not pass an expression with
   * side effects.
   */
  #if !defined(ABS_K)
  #define ABS_K(a) (!((a) > 0) ? -(a) : (a))
  #endif
  5. #if defined(SKYLAKEX)
  6. #include "casum_microk_skylakex-2.c"
  7. #endif
  8. #ifndef HAVE_CASUM_KERNEL
  9. static FLOAT casum_kernel(BLASLONG n, FLOAT *x1)
  10. {
  11. BLASLONG i=0;
  12. BLASLONG n_8 = n & -8;
  13. FLOAT *x = x1;
  14. FLOAT temp0, temp1, temp2, temp3;
  15. FLOAT temp4, temp5, temp6, temp7;
  16. FLOAT sum0 = 0.0;
  17. FLOAT sum1 = 0.0;
  18. FLOAT sum2 = 0.0;
  19. FLOAT sum3 = 0.0;
  20. FLOAT sum4 = 0.0;
  21. while (i < n_8) {
  22. temp0 = ABS_K(x[0]);
  23. temp1 = ABS_K(x[1]);
  24. temp2 = ABS_K(x[2]);
  25. temp3 = ABS_K(x[3]);
  26. temp4 = ABS_K(x[4]);
  27. temp5 = ABS_K(x[5]);
  28. temp6 = ABS_K(x[6]);
  29. temp7 = ABS_K(x[7]);
  30. sum0 += temp0;
  31. sum1 += temp1;
  32. sum2 += temp2;
  33. sum3 += temp3;
  34. sum0 += temp4;
  35. sum1 += temp5;
  36. sum2 += temp6;
  37. sum3 += temp7;
  38. x+=8;
  39. i+=4;
  40. }
  41. while (i < n) {
  42. sum4 += (ABS_K(x1[0]) + ABS_K(x1[1]));
  43. x1 += 2;
  44. i++;
  45. }
  46. return sum0+sum1+sum2+sum3+sum4;
  47. }
  48. #endif
  49. static FLOAT asum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
  50. {
  51. BLASLONG i = 0;
  52. BLASLONG ip = 0;
  53. BLASLONG inc_x2;
  54. FLOAT sumf = 0.0;
  55. if (n <= 0 || inc_x <= 0) return(sumf);
  56. if (inc_x == 1) {
  57. sumf = casum_kernel(n, x);
  58. }
  59. else {
  60. inc_x2 = 2 * inc_x;
  61. while (i < n) {
  62. sumf += ABS_K(x[ip]) + ABS_K(x[ip + 1]);
  63. ip += inc_x2;
  64. i++;
  65. }
  66. }
  67. return(sumf);
  68. }
  #if defined(SMP)
  /*
   * Worker passed to blas_level1_thread_with_return_value().
   * Only n, x, inc_x and result are used; every dummy* parameter is ignored.
   * Writes asum_compute(n, x, inc_x) to the first FLOAT at result and
   * always returns 0.
   */
  static int asum_thread_function(BLASLONG n,
                                  BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2,
                                  FLOAT *x, BLASLONG inc_x,
                                  FLOAT *dummy3, BLASLONG dummy4,
                                  FLOAT *result, BLASLONG dummy5)
  {
      result[0] = asum_compute(n, x, inc_x);
      return 0;
  }

  /*
   * Threading helper defined outside this file.
   * NOTE(review): presumably splits the n elements across nthread workers and
   * gives each worker its own slot in c -- confirm against its definition.
   */
  extern int blas_level1_thread_with_return_value(int mode,
                                                  BLASLONG m, BLASLONG n, BLASLONG k,
                                                  void *alpha,
                                                  void *a, BLASLONG lda,
                                                  void *b, BLASLONG ldb,
                                                  void *c, BLASLONG ldc,
                                                  int (*function)(),
                                                  int nthread);
  #endif
  87. FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
  88. {
  89. #if defined(SMP)
  90. int nthreads;
  91. FLOAT dummy_alpha[2];
  92. #endif
  93. FLOAT sumf = 0.0;
  94. #if defined(SMP)
  95. int num_cpu = num_cpu_avail(1);
  96. if (n <= 10000 || inc_x <= 0)
  97. nthreads = 1;
  98. else
  99. nthreads = num_cpu < n/10000 ? num_cpu : n/10000;
  100. if (nthreads == 1) {
  101. sumf = asum_compute(n, x, inc_x);
  102. }
  103. else {
  104. int mode, i;
  105. char result[MAX_CPU_NUMBER * sizeof(double) *2];
  106. FLOAT *ptr;
  107. #if !defined(DOUBLE)
  108. mode = BLAS_SINGLE | BLAS_COMPLEX;
  109. #else
  110. mode = BLAS_DOUBLE | BLAS_COMPLEX;
  111. #endif
  112. blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x,
  113. NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads);
  114. ptr = (FLOAT *)result;
  115. for (i = 0; i < nthreads; i++) {
  116. sumf += (*ptr);
  117. ptr = (FLOAT *)(((char *)ptr) + sizeof(double) *2);
  118. }
  119. }
  120. #else
  121. sumf = asum_compute(n, x, inc_x);
  122. #endif
  123. return(sumf);
  124. }