You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

drot_microk_haswell-2.c 2.4 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485
  1. #if defined(HAVE_FMA3) && defined(HAVE_AVX2)
  2. #define HAVE_DROT_KERNEL 1
  3. #include <immintrin.h>
  4. #include <stdint.h>
  5. static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
  6. {
  7. BLASLONG i = 0;
  8. BLASLONG tail_index_4 = n&(~3);
  9. BLASLONG tail_index_16 = n&(~15);
  10. __m256d c_256, s_256;
  11. if (n >= 4) {
  12. c_256 = _mm256_set1_pd(c);
  13. s_256 = _mm256_set1_pd(s);
  14. }
  15. __m256d x0, x1, x2, x3;
  16. __m256d y0, y1, y2, y3;
  17. __m256d t0, t1, t2, t3;
  18. for (i = 0; i < tail_index_16; i += 16) {
  19. x0 = _mm256_loadu_pd(&x[i + 0]);
  20. x1 = _mm256_loadu_pd(&x[i + 4]);
  21. x2 = _mm256_loadu_pd(&x[i + 8]);
  22. x3 = _mm256_loadu_pd(&x[i +12]);
  23. y0 = _mm256_loadu_pd(&y[i + 0]);
  24. y1 = _mm256_loadu_pd(&y[i + 4]);
  25. y2 = _mm256_loadu_pd(&y[i + 8]);
  26. y3 = _mm256_loadu_pd(&y[i +12]);
  27. t0 = _mm256_mul_pd(s_256, y0);
  28. t1 = _mm256_mul_pd(s_256, y1);
  29. t2 = _mm256_mul_pd(s_256, y2);
  30. t3 = _mm256_mul_pd(s_256, y3);
  31. t0 = _mm256_fmadd_pd(c_256, x0, t0);
  32. t1 = _mm256_fmadd_pd(c_256, x1, t1);
  33. t2 = _mm256_fmadd_pd(c_256, x2, t2);
  34. t3 = _mm256_fmadd_pd(c_256, x3, t3);
  35. _mm256_storeu_pd(&x[i + 0], t0);
  36. _mm256_storeu_pd(&x[i + 4], t1);
  37. _mm256_storeu_pd(&x[i + 8], t2);
  38. _mm256_storeu_pd(&x[i +12], t3);
  39. t0 = _mm256_mul_pd(s_256, x0);
  40. t1 = _mm256_mul_pd(s_256, x1);
  41. t2 = _mm256_mul_pd(s_256, x2);
  42. t3 = _mm256_mul_pd(s_256, x3);
  43. t0 = _mm256_fmsub_pd(c_256, y0, t0);
  44. t1 = _mm256_fmsub_pd(c_256, y1, t1);
  45. t2 = _mm256_fmsub_pd(c_256, y2, t2);
  46. t3 = _mm256_fmsub_pd(c_256, y3, t3);
  47. _mm256_storeu_pd(&y[i + 0], t0);
  48. _mm256_storeu_pd(&y[i + 4], t1);
  49. _mm256_storeu_pd(&y[i + 8], t2);
  50. _mm256_storeu_pd(&y[i +12], t3);
  51. }
  52. for (i = tail_index_16; i < tail_index_4; i += 4) {
  53. x0 = _mm256_loadu_pd(&x[i]);
  54. y0 = _mm256_loadu_pd(&y[i]);
  55. t0 = _mm256_mul_pd(s_256, y0);
  56. t0 = _mm256_fmadd_pd(c_256, x0, t0);
  57. _mm256_storeu_pd(&x[i], t0);
  58. t0 = _mm256_mul_pd(s_256, x0);
  59. t0 = _mm256_fmsub_pd(c_256, y0, t0);
  60. _mm256_storeu_pd(&y[i], t0);
  61. }
  62. for (i = tail_index_4; i < n; ++i) {
  63. FLOAT temp = c * x[i] + s * y[i];
  64. y[i] = c * y[i] - s * x[i];
  65. x[i] = temp;
  66. }
  67. }
  68. #endif