You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

isamax.c 9.6 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291
  1. #if ( defined(__GNUC__) && (__GNUC__ < 5 || (__GNUC__ == 7 && __GNUC_MINOR__ < 3)) )
  2. #pragma GCC optimize "O0"
  3. #endif
  4. /***************************************************************************
  5. Copyright (c) 2013-2019, The OpenBLAS Project
  6. All rights reserved.
  7. Redistribution and use in source and binary forms, with or without
  8. modification, are permitted provided that the following conditions are
  9. met:
  10. 1. Redistributions of source code must retain the above copyright
  11. notice, this list of conditions and the following disclaimer.
  12. 2. Redistributions in binary form must reproduce the above copyright
  13. notice, this list of conditions and the following disclaimer in
  14. the documentation and/or other materials provided with the
  15. distribution.
  16. 3. Neither the name of the OpenBLAS project nor the names of
  17. its contributors may be used to endorse or promote products
  18. derived from this software without specific prior written permission.
  19. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  20. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  21. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  22. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  23. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  24. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  25. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  26. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  27. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  28. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29. *****************************************************************************/
  30. #include "common.h"
  31. #include <math.h>
  32. #include <altivec.h>
  33. #if defined(DOUBLE)
  34. #define ABS fabs
  35. #else
  36. #define ABS fabsf
  37. #endif
  38. /**
  39. * Find maximum index
  40. * Warning: requirements n>0 and n % 64 == 0
  41. * @param n
  42. * @param x pointer to the vector
  43. * @param maxf (out) maximum absolute value .( only for output )
  44. * @return index
  45. */
  46. static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) {
  47. BLASLONG index;
  48. BLASLONG i=0;
  49. register __vector unsigned int static_index0 = {0,1,2,3};
  50. register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register
  51. register __vector unsigned int temp1= temp0<<1; //{8,8,8,8}
  52. register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7};
  53. register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11};
  54. register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15};
  55. temp0=vec_xor(temp0,temp0);
  56. temp1=temp1 <<1 ; //{16,16,16,16}
  57. register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0}
  58. register __vector float quadruple_values={0,0,0,0};
  59. register __vector float * v_ptrx=(__vector float *)x;
  60. for(; i<n; i+=64){
  61. //absolute temporary vectors
  62. register __vector float v0=vec_abs(v_ptrx[0]);
  63. register __vector float v1=vec_abs(v_ptrx[1]);
  64. register __vector float v2=vec_abs(v_ptrx[2]);
  65. register __vector float v3=vec_abs(v_ptrx[3]);
  66. register __vector float v4=vec_abs(v_ptrx[4]);
  67. register __vector float v5=vec_abs(v_ptrx[5]);
  68. register __vector float v6=vec_abs(v_ptrx[6]);
  69. register __vector float v7=vec_abs(v_ptrx[7]);
  70. //cmp quadruple pairs
  71. register __vector bool int r1=vec_cmpgt(v1,v0);
  72. register __vector bool int r2=vec_cmpgt(v3,v2);
  73. register __vector bool int r3=vec_cmpgt(v5,v4);
  74. register __vector bool int r4=vec_cmpgt(v7,v6);
  75. //select
  76. register __vector unsigned int ind0_first= vec_sel(static_index0,static_index1,r1);
  77. register __vector float vf0= vec_sel(v0,v1,r1);
  78. register __vector unsigned int ind1= vec_sel(static_index2,static_index3,r2);
  79. register __vector float vf1= vec_sel(v2,v3,r2);
  80. register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r3);
  81. v0=vec_sel(v4,v5,r3);
  82. register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r4);
  83. v1=vec_sel(v6,v7,r4);
  84. // cmp selected
  85. r1=vec_cmpgt(vf1,vf0);
  86. r2=vec_cmpgt(v1,v0);
  87. v_ptrx+=8;
  88. //select from above
  89. ind0_first= vec_sel(ind0_first,ind1,r1);
  90. vf0= vec_sel(vf0,vf1,r1) ;
  91. ind2= vec_sel(ind2,ind3,r2);
  92. vf1= vec_sel(v0,v1,r2);
  93. //second indices actually should be within [16,31] so ind2+16
  94. ind2 +=temp1;
  95. //final cmp and select index and value for the first 32 values
  96. r1=vec_cmpgt(vf1,vf0);
  97. ind0_first = vec_sel(ind0_first,ind2,r1);
  98. vf0= vec_sel(vf0,vf1,r1);
  99. ind0_first+=temp0; //get absolute index
  100. temp0+=temp1;
  101. temp0+=temp1; //temp0+32
  102. //second part of 32
  103. // absolute temporary vectors
  104. v0=vec_abs(v_ptrx[0]);
  105. v1=vec_abs(v_ptrx[1]);
  106. v2=vec_abs(v_ptrx[2]);
  107. v3=vec_abs(v_ptrx[3]);
  108. v4=vec_abs(v_ptrx[4]);
  109. v5=vec_abs(v_ptrx[5]);
  110. v6=vec_abs(v_ptrx[6]);
  111. v7=vec_abs(v_ptrx[7]);
  112. //cmp quadruple pairs
  113. r1=vec_cmpgt(v1,v0);
  114. r2=vec_cmpgt(v3,v2);
  115. r3=vec_cmpgt(v5,v4);
  116. r4=vec_cmpgt(v7,v6);
  117. //select
  118. register __vector unsigned int ind0_second= vec_sel(static_index0,static_index1,r1);
  119. register __vector float vv0= vec_sel(v0,v1,r1);
  120. ind1= vec_sel(static_index2,static_index3,r2);
  121. register __vector float vv1= vec_sel(v2,v3,r2);
  122. ind2= vec_sel(static_index0,static_index1,r3);
  123. v0=vec_sel(v4,v5,r3);
  124. ind3= vec_sel(static_index2,static_index3,r4);
  125. v1=vec_sel(v6,v7,r4);
  126. // cmp selected
  127. r1=vec_cmpgt(vv1,vv0);
  128. r2=vec_cmpgt(v1,v0);
  129. v_ptrx+=8;
  130. //select from above
  131. ind0_second= vec_sel(ind0_second,ind1,r1);
  132. vv0= vec_sel(vv0,vv1,r1) ;
  133. ind2= vec_sel(ind2,ind3,r2);
  134. vv1= vec_sel(v0,v1,r2) ;
  135. //second indices actually should be within [16,31] so ind2+16
  136. ind2 +=temp1;
  137. //final cmp and select index and value for the second 32 values
  138. r1=vec_cmpgt(vv1,vv0);
  139. ind0_second = vec_sel(ind0_second,ind2,r1);
  140. vv0= vec_sel(vv0,vv1,r1);
  141. ind0_second+=temp0; //get absolute index
  142. //find final quadruple from 64 elements
  143. r2=vec_cmpgt(vv0,vf0);
  144. ind2 = vec_sel( ind0_first,ind0_second,r2);
  145. vv0= vec_sel(vf0,vv0,r2);
  146. //compare with old quadruple and update
  147. r3=vec_cmpgt(vv0,quadruple_values);
  148. quadruple_indices = vec_sel( quadruple_indices,ind2,r3);
  149. quadruple_values= vec_sel(quadruple_values,vv0,r3);
  150. temp0+=temp1;
  151. temp0+=temp1; //temp0+32
  152. }
  153. //now we have to chose from 4 values and 4 different indices
  154. // we will compare pairwise if pairs are exactly the same we will choose minimum between index
  155. // otherwise we will assign index of the maximum value
  156. float a1,a2,a3,a4;
  157. unsigned int i1,i2,i3,i4;
  158. a1=vec_extract(quadruple_values,0);
  159. a2=vec_extract(quadruple_values,1);
  160. a3=vec_extract(quadruple_values,2);
  161. a4=vec_extract(quadruple_values,3);
  162. i1=vec_extract(quadruple_indices,0);
  163. i2=vec_extract(quadruple_indices,1);
  164. i3=vec_extract(quadruple_indices,2);
  165. i4=vec_extract(quadruple_indices,3);
  166. if(a1==a2){
  167. index=i1>i2?i2:i1;
  168. }else if(a2>a1){
  169. index=i2;
  170. a1=a2;
  171. }else{
  172. index= i1;
  173. }
  174. if(a4==a3){
  175. i1=i3>i4?i4:i3;
  176. }else if(a4>a3){
  177. i1=i4;
  178. a3=a4;
  179. }else{
  180. i1= i3;
  181. }
  182. if(a1==a3){
  183. index=i1>index?index:i1;
  184. *maxf=a1;
  185. }else if(a3>a1){
  186. index=i1;
  187. *maxf=a3;
  188. }else{
  189. *maxf=a1;
  190. }
  191. return index;
  192. }
  193. BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  194. BLASLONG i = 0;
  195. BLASLONG j = 0;
  196. FLOAT maxf = 0.0;
  197. BLASLONG max = 0;
  198. if (n <= 0 || inc_x <= 0) return (max);
  199. if (inc_x == 1) {
  200. BLASLONG n1 = n & -64;
  201. if (n1 > 0) {
  202. max = siamax_kernel_64(n1, x, &maxf);
  203. i = n1;
  204. }
  205. while (i < n) {
  206. if (ABS(x[i]) > maxf) {
  207. max = i;
  208. maxf = ABS(x[i]);
  209. }
  210. i++;
  211. }
  212. return (max + 1);
  213. } else {
  214. BLASLONG n1 = n & -4;
  215. while (j < n1) {
  216. if (ABS(x[i]) > maxf) {
  217. max = j;
  218. maxf = ABS(x[i]);
  219. }
  220. if (ABS(x[i + inc_x]) > maxf) {
  221. max = j + 1;
  222. maxf = ABS(x[i + inc_x]);
  223. }
  224. if (ABS(x[i + 2 * inc_x]) > maxf) {
  225. max = j + 2;
  226. maxf = ABS(x[i + 2 * inc_x]);
  227. }
  228. if (ABS(x[i + 3 * inc_x]) > maxf) {
  229. max = j + 3;
  230. maxf = ABS(x[i + 3 * inc_x]);
  231. }
  232. i += inc_x * 4;
  233. j += 4;
  234. }
  235. while (j < n) {
  236. if (ABS(x[i]) > maxf) {
  237. max = j;
  238. maxf = ABS(x[i]);
  239. }
  240. i += inc_x;
  241. j++;
  242. }
  243. return (max + 1);
  244. }
  245. }