| @@ -0,0 +1,230 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ | |||
| BLASLONG i, j; | |||
| FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; | |||
| FLOAT *b_offset; | |||
| FLOAT ctemp1, ctemp2, ctemp3, ctemp4; | |||
| FLOAT ctemp5, ctemp6, ctemp7, ctemp8; | |||
| FLOAT ctemp9, ctemp10, ctemp11, ctemp12; | |||
| FLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| a_offset = a; | |||
| b_offset = b; | |||
| j = (n >> 2); | |||
| if (j > 0){ | |||
| do{ | |||
| a_offset1 = a_offset; | |||
| a_offset2 = a_offset1 + lda; | |||
| a_offset3 = a_offset2 + lda; | |||
| a_offset4 = a_offset3 + lda; | |||
| a_offset += 4 * lda; | |||
| i = (m >> 2); | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp1 = *(a_offset1 + 0); | |||
| ctemp2 = *(a_offset1 + 1); | |||
| ctemp3 = *(a_offset1 + 2); | |||
| ctemp4 = *(a_offset1 + 3); | |||
| ctemp5 = *(a_offset2 + 0); | |||
| ctemp6 = *(a_offset2 + 1); | |||
| ctemp7 = *(a_offset2 + 2); | |||
| ctemp8 = *(a_offset2 + 3); | |||
| ctemp9 = *(a_offset3 + 0); | |||
| ctemp10 = *(a_offset3 + 1); | |||
| ctemp11 = *(a_offset3 + 2); | |||
| ctemp12 = *(a_offset3 + 3); | |||
| ctemp13 = *(a_offset4 + 0); | |||
| ctemp14 = *(a_offset4 + 1); | |||
| ctemp15 = *(a_offset4 + 2); | |||
| ctemp16 = *(a_offset4 + 3); | |||
| *(b_offset + 0) = ctemp1; | |||
| *(b_offset + 1) = ctemp5; | |||
| *(b_offset + 2) = ctemp9; | |||
| *(b_offset + 3) = ctemp13; | |||
| *(b_offset + 4) = ctemp2; | |||
| *(b_offset + 5) = ctemp6; | |||
| *(b_offset + 6) = ctemp10; | |||
| *(b_offset + 7) = ctemp14; | |||
| *(b_offset + 8) = ctemp3; | |||
| *(b_offset + 9) = ctemp7; | |||
| *(b_offset + 10) = ctemp11; | |||
| *(b_offset + 11) = ctemp15; | |||
| *(b_offset + 12) = ctemp4; | |||
| *(b_offset + 13) = ctemp8; | |||
| *(b_offset + 14) = ctemp12; | |||
| *(b_offset + 15) = ctemp16; | |||
| a_offset1 += 4; | |||
| a_offset2 += 4; | |||
| a_offset3 += 4; | |||
| a_offset4 += 4; | |||
| b_offset += 16; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| i = (m & 3); | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp1 = *(a_offset1 + 0); | |||
| ctemp5 = *(a_offset2 + 0); | |||
| ctemp9 = *(a_offset3 + 0); | |||
| ctemp13 = *(a_offset4 + 0); | |||
| *(b_offset + 0) = ctemp1; | |||
| *(b_offset + 1) = ctemp5; | |||
| *(b_offset + 2) = ctemp9; | |||
| *(b_offset + 3) = ctemp13; | |||
| a_offset1 ++; | |||
| a_offset2 ++; | |||
| a_offset3 ++; | |||
| a_offset4 ++; | |||
| b_offset += 4; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| j--; | |||
| }while(j > 0); | |||
| } /* end of if(j > 0) */ | |||
| if (n & 2){ | |||
| a_offset1 = a_offset; | |||
| a_offset2 = a_offset1 + lda; | |||
| a_offset += 2 * lda; | |||
| i = (m >> 2); | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp1 = *(a_offset1 + 0); | |||
| ctemp2 = *(a_offset1 + 1); | |||
| ctemp3 = *(a_offset1 + 2); | |||
| ctemp4 = *(a_offset1 + 3); | |||
| ctemp5 = *(a_offset2 + 0); | |||
| ctemp6 = *(a_offset2 + 1); | |||
| ctemp7 = *(a_offset2 + 2); | |||
| ctemp8 = *(a_offset2 + 3); | |||
| *(b_offset + 0) = ctemp1; | |||
| *(b_offset + 1) = ctemp5; | |||
| *(b_offset + 2) = ctemp2; | |||
| *(b_offset + 3) = ctemp6; | |||
| *(b_offset + 4) = ctemp3; | |||
| *(b_offset + 5) = ctemp7; | |||
| *(b_offset + 6) = ctemp4; | |||
| *(b_offset + 7) = ctemp8; | |||
| a_offset1 += 4; | |||
| a_offset2 += 4; | |||
| b_offset += 8; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| i = (m & 3); | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp1 = *(a_offset1 + 0); | |||
| ctemp5 = *(a_offset2 + 0); | |||
| *(b_offset + 0) = ctemp1; | |||
| *(b_offset + 1) = ctemp5; | |||
| a_offset1 ++; | |||
| a_offset2 ++; | |||
| b_offset += 2; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| } /* end of if(j > 0) */ | |||
| if (n & 1){ | |||
| a_offset1 = a_offset; | |||
| i = (m >> 2); | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp1 = *(a_offset1 + 0); | |||
| ctemp2 = *(a_offset1 + 1); | |||
| ctemp3 = *(a_offset1 + 2); | |||
| ctemp4 = *(a_offset1 + 3); | |||
| *(b_offset + 0) = ctemp1; | |||
| *(b_offset + 1) = ctemp2; | |||
| *(b_offset + 2) = ctemp3; | |||
| *(b_offset + 3) = ctemp4; | |||
| a_offset1 += 4; | |||
| b_offset += 4; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| i = (m & 3); | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp1 = *(a_offset1 + 0); | |||
| *(b_offset + 0) = ctemp1; | |||
| a_offset1 ++; | |||
| b_offset += 1; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| } /* end of if(j > 0) */ | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,281 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ | |||
| BLASLONG i, j; | |||
| FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; | |||
| FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3; | |||
| FLOAT ctemp1, ctemp2, ctemp3, ctemp4; | |||
| FLOAT ctemp5, ctemp6, ctemp7, ctemp8; | |||
| FLOAT ctemp9, ctemp10, ctemp11, ctemp12; | |||
| FLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| a_offset = a; | |||
| b_offset = b; | |||
| b_offset2 = b + m * (n & ~3); | |||
| b_offset3 = b + m * (n & ~1); | |||
| j = (m >> 2); | |||
| if (j > 0){ | |||
| do{ | |||
| a_offset1 = a_offset; | |||
| a_offset2 = a_offset1 + lda; | |||
| a_offset3 = a_offset2 + lda; | |||
| a_offset4 = a_offset3 + lda; | |||
| a_offset += 4 * lda; | |||
| b_offset1 = b_offset; | |||
| b_offset += 16; | |||
| i = (n >> 2); | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp1 = *(a_offset1 + 0); | |||
| ctemp2 = *(a_offset1 + 1); | |||
| ctemp3 = *(a_offset1 + 2); | |||
| ctemp4 = *(a_offset1 + 3); | |||
| ctemp5 = *(a_offset2 + 0); | |||
| ctemp6 = *(a_offset2 + 1); | |||
| ctemp7 = *(a_offset2 + 2); | |||
| ctemp8 = *(a_offset2 + 3); | |||
| ctemp9 = *(a_offset3 + 0); | |||
| ctemp10 = *(a_offset3 + 1); | |||
| ctemp11 = *(a_offset3 + 2); | |||
| ctemp12 = *(a_offset3 + 3); | |||
| ctemp13 = *(a_offset4 + 0); | |||
| ctemp14 = *(a_offset4 + 1); | |||
| ctemp15 = *(a_offset4 + 2); | |||
| ctemp16 = *(a_offset4 + 3); | |||
| a_offset1 += 4; | |||
| a_offset2 += 4; | |||
| a_offset3 += 4; | |||
| a_offset4 += 4; | |||
| *(b_offset1 + 0) = ctemp1; | |||
| *(b_offset1 + 1) = ctemp2; | |||
| *(b_offset1 + 2) = ctemp3; | |||
| *(b_offset1 + 3) = ctemp4; | |||
| *(b_offset1 + 4) = ctemp5; | |||
| *(b_offset1 + 5) = ctemp6; | |||
| *(b_offset1 + 6) = ctemp7; | |||
| *(b_offset1 + 7) = ctemp8; | |||
| *(b_offset1 + 8) = ctemp9; | |||
| *(b_offset1 + 9) = ctemp10; | |||
| *(b_offset1 + 10) = ctemp11; | |||
| *(b_offset1 + 11) = ctemp12; | |||
| *(b_offset1 + 12) = ctemp13; | |||
| *(b_offset1 + 13) = ctemp14; | |||
| *(b_offset1 + 14) = ctemp15; | |||
| *(b_offset1 + 15) = ctemp16; | |||
| b_offset1 += m * 4; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| if (n & 2) { | |||
| ctemp1 = *(a_offset1 + 0); | |||
| ctemp2 = *(a_offset1 + 1); | |||
| ctemp3 = *(a_offset2 + 0); | |||
| ctemp4 = *(a_offset2 + 1); | |||
| ctemp5 = *(a_offset3 + 0); | |||
| ctemp6 = *(a_offset3 + 1); | |||
| ctemp7 = *(a_offset4 + 0); | |||
| ctemp8 = *(a_offset4 + 1); | |||
| a_offset1 += 2; | |||
| a_offset2 += 2; | |||
| a_offset3 += 2; | |||
| a_offset4 += 2; | |||
| *(b_offset2 + 0) = ctemp1; | |||
| *(b_offset2 + 1) = ctemp2; | |||
| *(b_offset2 + 2) = ctemp3; | |||
| *(b_offset2 + 3) = ctemp4; | |||
| *(b_offset2 + 4) = ctemp5; | |||
| *(b_offset2 + 5) = ctemp6; | |||
| *(b_offset2 + 6) = ctemp7; | |||
| *(b_offset2 + 7) = ctemp8; | |||
| b_offset2 += 8; | |||
| } | |||
| if (n & 1) { | |||
| ctemp1 = *(a_offset1 + 0); | |||
| ctemp2 = *(a_offset2 + 0); | |||
| ctemp3 = *(a_offset3 + 0); | |||
| ctemp4 = *(a_offset4 + 0); | |||
| *(b_offset3 + 0) = ctemp1; | |||
| *(b_offset3 + 1) = ctemp2; | |||
| *(b_offset3 + 2) = ctemp3; | |||
| *(b_offset3 + 3) = ctemp4; | |||
| b_offset3 += 4; | |||
| } | |||
| j--; | |||
| }while(j > 0); | |||
| } | |||
| if (m & 2){ | |||
| a_offset1 = a_offset; | |||
| a_offset2 = a_offset1 + lda; | |||
| a_offset += 2 * lda; | |||
| b_offset1 = b_offset; | |||
| b_offset += 8; | |||
| i = (n >> 2); | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp1 = *(a_offset1 + 0); | |||
| ctemp2 = *(a_offset1 + 1); | |||
| ctemp3 = *(a_offset1 + 2); | |||
| ctemp4 = *(a_offset1 + 3); | |||
| ctemp5 = *(a_offset2 + 0); | |||
| ctemp6 = *(a_offset2 + 1); | |||
| ctemp7 = *(a_offset2 + 2); | |||
| ctemp8 = *(a_offset2 + 3); | |||
| a_offset1 += 4; | |||
| a_offset2 += 4; | |||
| *(b_offset1 + 0) = ctemp1; | |||
| *(b_offset1 + 1) = ctemp2; | |||
| *(b_offset1 + 2) = ctemp3; | |||
| *(b_offset1 + 3) = ctemp4; | |||
| *(b_offset1 + 4) = ctemp5; | |||
| *(b_offset1 + 5) = ctemp6; | |||
| *(b_offset1 + 6) = ctemp7; | |||
| *(b_offset1 + 7) = ctemp8; | |||
| b_offset1 += m * 4; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| if (n & 2) { | |||
| ctemp1 = *(a_offset1 + 0); | |||
| ctemp2 = *(a_offset1 + 1); | |||
| ctemp3 = *(a_offset2 + 0); | |||
| ctemp4 = *(a_offset2 + 1); | |||
| a_offset1 += 2; | |||
| a_offset2 += 2; | |||
| *(b_offset2 + 0) = ctemp1; | |||
| *(b_offset2 + 1) = ctemp2; | |||
| *(b_offset2 + 2) = ctemp3; | |||
| *(b_offset2 + 3) = ctemp4; | |||
| b_offset2 += 4; | |||
| } | |||
| if (n & 1) { | |||
| ctemp1 = *(a_offset1 + 0); | |||
| ctemp2 = *(a_offset2 + 0); | |||
| *(b_offset3 + 0) = ctemp1; | |||
| *(b_offset3 + 1) = ctemp2; | |||
| b_offset3 += 2; | |||
| } | |||
| } | |||
| if (m & 1){ | |||
| a_offset1 = a_offset; | |||
| b_offset1 = b_offset; | |||
| i = (n >> 2); | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp1 = *(a_offset1 + 0); | |||
| ctemp2 = *(a_offset1 + 1); | |||
| ctemp3 = *(a_offset1 + 2); | |||
| ctemp4 = *(a_offset1 + 3); | |||
| a_offset1 += 4; | |||
| *(b_offset1 + 0) = ctemp1; | |||
| *(b_offset1 + 1) = ctemp2; | |||
| *(b_offset1 + 2) = ctemp3; | |||
| *(b_offset1 + 3) = ctemp4; | |||
| b_offset1 += 4 * m; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| if (n & 2) { | |||
| ctemp1 = *(a_offset1 + 0); | |||
| ctemp2 = *(a_offset1 + 1); | |||
| a_offset1 += 2; | |||
| *(b_offset2 + 0) = ctemp1; | |||
| *(b_offset2 + 1) = ctemp2; | |||
| } | |||
| if (n & 1) { | |||
| ctemp1 = *(a_offset1 + 0); | |||
| *(b_offset3 + 0) = ctemp1; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,138 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
| BLASLONG i, js, offset; | |||
| FLOAT data01, data02, data03, data04; | |||
| FLOAT *ao1, *ao2, *ao3, *ao4; | |||
| js = (n >> 2); | |||
| while (js > 0){ | |||
| offset = posX - posY; | |||
| if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; | |||
| if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; | |||
| if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; | |||
| if (offset > -3) ao4 = a + posX + 3 + posY * lda; else ao4 = a + posY + (posX + 3) * lda; | |||
| i = m; | |||
| while (i > 0) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao2 + 0); | |||
| data03 = *(ao3 + 0); | |||
| data04 = *(ao4 + 0); | |||
| if (offset > 0) ao1 += lda; else ao1 ++; | |||
| if (offset > -1) ao2 += lda; else ao2 ++; | |||
| if (offset > -2) ao3 += lda; else ao3 ++; | |||
| if (offset > -3) ao4 += lda; else ao4 ++; | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| b[ 2] = data03; | |||
| b[ 3] = data04; | |||
| b += 4; | |||
| offset --; | |||
| i --; | |||
| } | |||
| posX += 4; | |||
| js --; | |||
| } | |||
| if (n & 2) { | |||
| offset = posX - posY; | |||
| if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; | |||
| if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; | |||
| i = m; | |||
| while (i > 0) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao2 + 0); | |||
| if (offset > 0) ao1 += lda; else ao1 ++; | |||
| if (offset > -1) ao2 += lda; else ao2 ++; | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| b += 2; | |||
| offset --; | |||
| i --; | |||
| } | |||
| posX += 2; | |||
| } | |||
| if (n & 1) { | |||
| offset = posX - posY; | |||
| if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; | |||
| i = m; | |||
| while (i > 0) { | |||
| data01 = *(ao1 + 0); | |||
| if (offset > 0) ao1 += lda; else ao1 ++; | |||
| b[ 0] = data01; | |||
| b ++; | |||
| offset --; | |||
| i --; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,136 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
| BLASLONG i, js, offset; | |||
| FLOAT data01, data02, data03, data04; | |||
| FLOAT *ao1, *ao2, *ao3, *ao4; | |||
| js = (n >> 2); | |||
| while (js > 0){ | |||
| offset = posX - posY; | |||
| if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; | |||
| if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; | |||
| if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; | |||
| if (offset > -3) ao4 = a + posY + (posX + 3) * lda; else ao4 = a + posX + 3 + posY * lda; | |||
| i = m; | |||
| while (i > 0) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao2 + 0); | |||
| data03 = *(ao3 + 0); | |||
| data04 = *(ao4 + 0); | |||
| if (offset > 0) ao1 ++; else ao1 += lda; | |||
| if (offset > -1) ao2 ++; else ao2 += lda; | |||
| if (offset > -2) ao3 ++; else ao3 += lda; | |||
| if (offset > -3) ao4 ++; else ao4 += lda; | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| b[ 2] = data03; | |||
| b[ 3] = data04; | |||
| b += 4; | |||
| offset --; | |||
| i --; | |||
| } | |||
| posX += 4; | |||
| js --; | |||
| } | |||
| if (n & 2) { | |||
| offset = posX - posY; | |||
| if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; | |||
| if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; | |||
| i = m; | |||
| while (i > 0) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao2 + 0); | |||
| if (offset > 0) ao1 ++; else ao1 += lda; | |||
| if (offset > -1) ao2 ++; else ao2 += lda; | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| b += 2; | |||
| offset --; | |||
| i --; | |||
| } | |||
| posX += 2; | |||
| } | |||
| if (n & 1) { | |||
| offset = posX - posY; | |||
| if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; | |||
| i = m; | |||
| while (i > 0) { | |||
| data01 = *(ao1 + 0); | |||
| if (offset > 0) ao1 ++; else ao1 += lda; | |||
| b[ 0] = data01; | |||
| b ++; | |||
| offset --; | |||
| i --; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,484 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
| BLASLONG i, js; | |||
| BLASLONG X; | |||
| FLOAT data01, data02, data03, data04, data05, data06, data07, data08; | |||
| FLOAT data09, data10, data11, data12, data13, data14, data15, data16; | |||
| FLOAT *ao1, *ao2, *ao3, *ao4; | |||
| js = (n >> 2); | |||
| if (js > 0){ | |||
| do { | |||
| X = posX; | |||
| if (posX <= posY) { | |||
| ao1 = a + posY + (posX + 0) * lda; | |||
| ao2 = a + posY + (posX + 1) * lda; | |||
| ao3 = a + posY + (posX + 2) * lda; | |||
| ao4 = a + posY + (posX + 3) * lda; | |||
| } else { | |||
| ao1 = a + posX + (posY + 0) * lda; | |||
| ao2 = a + posX + (posY + 1) * lda; | |||
| ao3 = a + posX + (posY + 2) * lda; | |||
| ao4 = a + posX + (posY + 3) * lda; | |||
| } | |||
| i = (m >> 2); | |||
| if (i > 0) { | |||
| do { | |||
| if (X > posY) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| data03 = *(ao1 + 2); | |||
| data04 = *(ao1 + 3); | |||
| data05 = *(ao2 + 0); | |||
| data06 = *(ao2 + 1); | |||
| data07 = *(ao2 + 2); | |||
| data08 = *(ao2 + 3); | |||
| data09 = *(ao3 + 0); | |||
| data10 = *(ao3 + 1); | |||
| data11 = *(ao3 + 2); | |||
| data12 = *(ao3 + 3); | |||
| data13 = *(ao4 + 0); | |||
| data14 = *(ao4 + 1); | |||
| data15 = *(ao4 + 2); | |||
| data16 = *(ao4 + 3); | |||
| b[ 0] = data01; | |||
| b[ 1] = data05; | |||
| b[ 2] = data09; | |||
| b[ 3] = data13; | |||
| b[ 4] = data02; | |||
| b[ 5] = data06; | |||
| b[ 6] = data10; | |||
| b[ 7] = data14; | |||
| b[ 8] = data03; | |||
| b[ 9] = data07; | |||
| b[10] = data11; | |||
| b[11] = data15; | |||
| b[12] = data04; | |||
| b[13] = data08; | |||
| b[14] = data12; | |||
| b[15] = data16; | |||
| ao1 += 4; | |||
| ao2 += 4; | |||
| ao3 += 4; | |||
| ao4 += 4; | |||
| b += 16; | |||
| } else | |||
| if (X < posY) { | |||
| ao1 += 4 * lda; | |||
| ao2 += 4 * lda; | |||
| ao3 += 4 * lda; | |||
| ao4 += 4 * lda; | |||
| b += 16; | |||
| } else { | |||
| #ifdef UNIT | |||
| data02 = *(ao1 + 1); | |||
| data03 = *(ao1 + 2); | |||
| data04 = *(ao1 + 3); | |||
| data07 = *(ao2 + 2); | |||
| data08 = *(ao2 + 3); | |||
| data12 = *(ao3 + 3); | |||
| b[ 0] = ONE; | |||
| b[ 1] = ZERO; | |||
| b[ 2] = ZERO; | |||
| b[ 3] = ZERO; | |||
| b[ 4] = data02; | |||
| b[ 5] = ONE; | |||
| b[ 6] = ZERO; | |||
| b[ 7] = ZERO; | |||
| b[ 8] = data03; | |||
| b[ 9] = data07; | |||
| b[10] = ONE; | |||
| b[11] = ZERO; | |||
| b[12] = data04; | |||
| b[13] = data08; | |||
| b[14] = data12; | |||
| b[15] = ONE; | |||
| #else | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| data03 = *(ao1 + 2); | |||
| data04 = *(ao1 + 3); | |||
| data06 = *(ao2 + 1); | |||
| data07 = *(ao2 + 2); | |||
| data08 = *(ao2 + 3); | |||
| data11 = *(ao3 + 2); | |||
| data12 = *(ao3 + 3); | |||
| data16 = *(ao4 + 3); | |||
| b[ 0] = data01; | |||
| b[ 1] = ZERO; | |||
| b[ 2] = ZERO; | |||
| b[ 3] = ZERO; | |||
| b[ 4] = data02; | |||
| b[ 5] = data06; | |||
| b[ 6] = ZERO; | |||
| b[ 7] = ZERO; | |||
| b[ 8] = data03; | |||
| b[ 9] = data07; | |||
| b[10] = data11; | |||
| b[11] = ZERO; | |||
| b[12] = data04; | |||
| b[13] = data08; | |||
| b[14] = data12; | |||
| b[15] = data16; | |||
| #endif | |||
| ao1 += 4; | |||
| ao2 += 4; | |||
| ao3 += 4; | |||
| ao4 += 4; | |||
| b += 16; | |||
| } | |||
| X += 4; | |||
| i --; | |||
| } while (i > 0); | |||
| } | |||
| i = (m & 3); | |||
| if (i) { | |||
| if (X > posY) { | |||
| if (m & 2) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| data03 = *(ao2 + 0); | |||
| data04 = *(ao2 + 1); | |||
| data05 = *(ao3 + 0); | |||
| data06 = *(ao3 + 1); | |||
| data07 = *(ao4 + 0); | |||
| data08 = *(ao4 + 1); | |||
| b[ 0] = data01; | |||
| b[ 1] = data03; | |||
| b[ 2] = data05; | |||
| b[ 3] = data07; | |||
| b[ 4] = data02; | |||
| b[ 5] = data04; | |||
| b[ 6] = data06; | |||
| b[ 7] = data08; | |||
| ao1 += 2; | |||
| ao2 += 2; | |||
| ao3 += 2; | |||
| ao4 += 2; | |||
| b += 8; | |||
| } | |||
| if (m & 1) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao2 + 0); | |||
| data03 = *(ao3 + 0); | |||
| data04 = *(ao4 + 0); | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| b[ 2] = data03; | |||
| b[ 3] = data04; | |||
| ao1 += 1; | |||
| ao2 += 1; | |||
| ao3 += 1; | |||
| ao4 += 1; | |||
| b += 4; | |||
| } | |||
| } else | |||
| if (X < posY) { | |||
| if (m & 2) { | |||
| ao1 += 2 * lda; | |||
| ao2 += 2 * lda; | |||
| b += 8; | |||
| } | |||
| if (m & 1) { | |||
| ao1 += lda; | |||
| b += 4; | |||
| } | |||
| } else { | |||
| #ifdef UNIT | |||
| data05 = *(ao2 + 0); | |||
| data09 = *(ao3 + 0); | |||
| data13 = *(ao4 + 0); | |||
| if (i >= 2) { | |||
| data10 = *(ao3 + 1); | |||
| data14 = *(ao4 + 1); | |||
| } | |||
| if (i >= 3) { | |||
| data15 = *(ao4 + 2); | |||
| } | |||
| b[ 0] = ONE; | |||
| b[ 1] = data05; | |||
| b[ 2] = data09; | |||
| b[ 3] = data13; | |||
| b += 4; | |||
| if(i >= 2) { | |||
| b[ 0] = ZERO; | |||
| b[ 1] = ONE; | |||
| b[ 2] = data10; | |||
| b[ 3] = data14; | |||
| b += 4; | |||
| } | |||
| if (i >= 3) { | |||
| b[ 0] = ZERO; | |||
| b[ 1] = ZERO; | |||
| b[ 2] = ONE; | |||
| b[ 3] = data15; | |||
| b += 4; | |||
| } | |||
| #else | |||
| data01 = *(ao1 + 0); | |||
| data05 = *(ao2 + 0); | |||
| data09 = *(ao3 + 0); | |||
| data13 = *(ao4 + 0); | |||
| if (i >= 2) { | |||
| data06 = *(ao2 + 1); | |||
| data10 = *(ao3 + 1); | |||
| data14 = *(ao4 + 1); | |||
| } | |||
| if (i >= 3) { | |||
| data11 = *(ao3 + 2); | |||
| data15 = *(ao4 + 2); | |||
| } | |||
| b[ 0] = data01; | |||
| b[ 1] = data05; | |||
| b[ 2] = data09; | |||
| b[ 3] = data13; | |||
| b += 4; | |||
| if(i >= 2) { | |||
| b[ 0] = ZERO; | |||
| b[ 1] = data06; | |||
| b[ 2] = data10; | |||
| b[ 3] = data14; | |||
| b += 4; | |||
| } | |||
| if (i >= 3) { | |||
| b[ 0] = ZERO; | |||
| b[ 1] = ZERO; | |||
| b[ 2] = data11; | |||
| b[ 3] = data15; | |||
| b += 4; | |||
| } | |||
| #endif | |||
| } | |||
| } | |||
| posY += 4; | |||
| js --; | |||
| } while (js > 0); | |||
| } /* End of main loop */ | |||
| if (n & 2){ | |||
| X = posX; | |||
| if (posX <= posY) { | |||
| ao1 = a + posY + (posX + 0) * lda; | |||
| ao2 = a + posY + (posX + 1) * lda; | |||
| } else { | |||
| ao1 = a + posX + (posY + 0) * lda; | |||
| ao2 = a + posX + (posY + 1) * lda; | |||
| } | |||
| i = (m >> 1); | |||
| if (i > 0) { | |||
| do { | |||
| if (X > posY) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| data05 = *(ao2 + 0); | |||
| data06 = *(ao2 + 1); | |||
| b[ 0] = data01; | |||
| b[ 1] = data05; | |||
| b[ 2] = data02; | |||
| b[ 3] = data06; | |||
| ao1 += 2; | |||
| ao2 += 2; | |||
| b += 4; | |||
| } else | |||
| if (X < posY) { | |||
| ao1 += 2 * lda; | |||
| ao2 += 2 * lda; | |||
| b += 4; | |||
| } else { | |||
| #ifdef UNIT | |||
| data02 = *(ao1 + 1); | |||
| b[ 0] = ONE; | |||
| b[ 1] = ZERO; | |||
| b[ 2] = data02; | |||
| b[ 3] = ONE; | |||
| #else | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| data06 = *(ao2 + 1); | |||
| b[ 0] = data01; | |||
| b[ 1] = ZERO; | |||
| b[ 2] = data02; | |||
| b[ 3] = data06; | |||
| #endif | |||
| ao1 += 2; | |||
| ao2 += 2; | |||
| b += 4; | |||
| } | |||
| X += 2; | |||
| i --; | |||
| } while (i > 0); | |||
| } | |||
| i = (m & 1); | |||
| if (i) { | |||
| if (X > posY) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao2 + 0); | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| ao1 += 1; | |||
| ao2 += 1; | |||
| b += 2; | |||
| } else | |||
| if (X < posY) { | |||
| ao1 += lda; | |||
| b += 2; | |||
| } else { | |||
| #ifdef UNIT | |||
| data05 = *(ao2 + 0); | |||
| b[ 0] = ONE; | |||
| b[ 1] = data05; | |||
| #else | |||
| data01 = *(ao1 + 0); | |||
| data05 = *(ao2 + 0); | |||
| b[ 0] = data01; | |||
| b[ 1] = data05; | |||
| #endif | |||
| b += 2; | |||
| } | |||
| } | |||
| posY += 2; | |||
| } | |||
| if (n & 1){ | |||
| X = posX; | |||
| if (posX <= posY) { | |||
| ao1 = a + posY + (posX + 0) * lda; | |||
| } else { | |||
| ao1 = a + posX + (posY + 0) * lda; | |||
| } | |||
| i = m; | |||
| if (i > 0) { | |||
| do { | |||
| if (X > posY) { | |||
| data01 = *(ao1 + 0); | |||
| b[ 0] = data01; | |||
| b += 1; | |||
| ao1 += 1; | |||
| } else | |||
| if (X < posY) { | |||
| b += 1; | |||
| ao1 += lda; | |||
| } else { | |||
| #ifdef UNIT | |||
| b[ 0] = ONE; | |||
| #else | |||
| data01 = *(ao1 + 0); | |||
| b[ 0] = data01; | |||
| #endif | |||
| b += 1; | |||
| ao1 += 1; | |||
| } | |||
| X ++; | |||
| i --; | |||
| } while (i > 0); | |||
| } | |||
| posY += 1; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,488 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| /* Packs one panel that is `width` values wide into b and returns the advanced b. */ | |||
| /* `blocks` groups of `width` source rows are processed first, then one group of `tail` rows. */ | |||
| /* The comparison of the running row position X with posY is made once per group: */ | |||
| /*   X <  posY : every value of the group is copied from a,                        */ | |||
| /*   X == posY : ZERO left of the diagonal, ONE (UNIT) or the stored value on it,  */ | |||
| /*               values from a right of it,                                        */ | |||
| /*   X >  posY : the output slots are skipped without being read or written.       */ | |||
| static FLOAT *trmm_ltcopy_panel(BLASLONG width, BLASLONG blocks, BLASLONG tail, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
|   BLASLONG X = posX; | |||
|   BLASLONG rows, r, c; | |||
|   FLOAT *src; | |||
|   /* Start address of the first source row; consecutive source rows are lda apart. */ | |||
|   if (posX <= posY) { | |||
|     src = a + posY + (posX + 0) * lda; | |||
|   } else { | |||
|     src = a + posX + (posY + 0) * lda; | |||
|   } | |||
|   while (blocks > 0 || tail != 0) { | |||
|     /* Full groups come first; the short trailing group is handled exactly once. */ | |||
|     if (blocks > 0) { | |||
|       rows = width; | |||
|       blocks --; | |||
|     } else { | |||
|       rows = tail; | |||
|       tail = 0; | |||
|     } | |||
|     if (X < posY) { | |||
|       for (r = 0; r < rows; r ++) { | |||
|         for (c = 0; c < width; c ++) { | |||
|           b[r * width + c] = src[r * lda + c]; | |||
|         } | |||
|       } | |||
|       src += rows * lda; | |||
|     } else if (X == posY) { | |||
|       for (r = 0; r < rows; r ++) { | |||
|         for (c = 0; c < width; c ++) { | |||
|           if (c < r) { | |||
|             b[r * width + c] = ZERO; | |||
|           } else if (c > r) { | |||
|             b[r * width + c] = src[r * lda + c]; | |||
|           } else { | |||
| #ifdef UNIT | |||
|             b[r * width + c] = ONE; | |||
| #else | |||
|             b[r * width + c] = src[r * lda + c]; | |||
| #endif | |||
|           } | |||
|         } | |||
|       } | |||
|     } | |||
|     /* In every case the group occupies rows * width slots of b. */ | |||
|     b += rows * width; | |||
|     X += width; | |||
|   } | |||
|   return b; | |||
| } | |||
| /* Packs n columns' worth of panels: as many 4-wide panels as fit, then a 2-wide */ | |||
| /* panel if (n & 2) and a 1-wide panel if (n & 1). posY moves on by the panel    */ | |||
| /* width after each panel. Always returns 0.                                     */ | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
|   BLASLONG js; | |||
|   for (js = (n >> 2); js > 0; js --) { | |||
|     b = trmm_ltcopy_panel(4, (m >> 2), (m & 3), a, lda, posX, posY, b); | |||
|     posY += 4; | |||
|   } /* End of main loop */ | |||
|   if (n & 2) { | |||
|     b = trmm_ltcopy_panel(2, (m >> 1), (m & 1), a, lda, posX, posY, b); | |||
|     posY += 2; | |||
|   } | |||
|   if (n & 1) { | |||
|     /* Width 1: every row is its own group, so there is no trailing group. */ | |||
|     b = trmm_ltcopy_panel(1, m, 0, a, lda, posX, posY, b); | |||
|     posY += 1; | |||
|   } | |||
|   return 0; | |||
| } | |||
| @@ -0,0 +1,785 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| /* Packs an upper-triangular block of the column-major matrix a into b. */ | |||
| /*   m, n       : number of rows / columns of the block to pack          */ | |||
| /*   a, lda     : source matrix; element (row, col) is a[row + col*lda]  */ | |||
| /*   posX, posY : row / column index in a of the block's first element   */ | |||
| /*   b          : output buffer, receives m * n values                   */ | |||
| /* The columns are emitted as panels: 6-wide panels while at least six   */ | |||
| /* columns remain, then 4-, 2- and 1-wide panels for the remainder.      */ | |||
| /* Inside a panel the m rows follow each other, `width` values per row.  */ | |||
| /* For every element the global row X is compared with the global column */ | |||
| /* Y: X < Y copies the value from a, X == Y stores ONE when UNIT is      */ | |||
| /* defined and the stored diagonal value otherwise, X > Y stores ZERO    */ | |||
| /* without reading a. Always returns 0.                                  */ | |||
| /* NOTE(review): the 4/2/1 split of the column remainder mirrors the     */ | |||
| /* power-of-two tails used elsewhere in this code; confirm it matches    */ | |||
| /* the layout the consuming kernel expects.                              */ | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
|   BLASLONG left, width; | |||
|   BLASLONG r, c; | |||
|   BLASLONG X, Y; | |||
|   left = n; | |||
|   while (left > 0) { | |||
|     /* Widest panel that still fits into the columns not yet packed. */ | |||
|     if (left >= 6) { | |||
|       width = 6; | |||
|     } else if (left >= 4) { | |||
|       width = 4; | |||
|     } else if (left >= 2) { | |||
|       width = 2; | |||
|     } else { | |||
|       width = 1; | |||
|     } | |||
|     for (r = 0; r < m; r ++) { | |||
|       X = posX + r; | |||
|       for (c = 0; c < width; c ++) { | |||
|         Y = posY + c; | |||
|         if (X < Y) { | |||
|           /* Strictly above the diagonal: plain copy. */ | |||
|           b[c] = a[X + Y * lda]; | |||
|         } else if (X > Y) { | |||
|           /* Below the diagonal: never read from a. */ | |||
|           b[c] = ZERO; | |||
|         } else { | |||
| #ifdef UNIT | |||
|           b[c] = ONE; | |||
| #else | |||
|           b[c] = a[X + Y * lda]; | |||
| #endif | |||
|         } | |||
|       } | |||
|       b += width; | |||
|     } | |||
|     /* Advance by exactly the number of columns just packed. */ | |||
|     posY += width; | |||
|     left -= width; | |||
|   } | |||
|   return 0; | |||
| } | |||
| @@ -0,0 +1,472 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
| BLASLONG i, js; | |||
| BLASLONG X; | |||
| FLOAT data01, data02, data03, data04, data05, data06, data07, data08; | |||
| FLOAT data09, data10, data11, data12, data13, data14, data15, data16; | |||
| FLOAT *ao1, *ao2, *ao3, *ao4; | |||
| js = (n >> 2); | |||
| if (js > 0){ | |||
| do { | |||
| X = posX; | |||
| if (posX <= posY) { | |||
| ao1 = a + posX + (posY + 0) * lda; | |||
| ao2 = a + posX + (posY + 1) * lda; | |||
| ao3 = a + posX + (posY + 2) * lda; | |||
| ao4 = a + posX + (posY + 3) * lda; | |||
| } else { | |||
| ao1 = a + posY + (posX + 0) * lda; | |||
| ao2 = a + posY + (posX + 1) * lda; | |||
| ao3 = a + posY + (posX + 2) * lda; | |||
| ao4 = a + posY + (posX + 3) * lda; | |||
| } | |||
| i = (m >> 2); | |||
| if (i > 0) { | |||
| do { | |||
| if (X < posY) { | |||
| ao1 += 4; | |||
| ao2 += 4; | |||
| ao3 += 4; | |||
| ao4 += 4; | |||
| b += 16; | |||
| } else | |||
| if (X > posY) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| data03 = *(ao1 + 2); | |||
| data04 = *(ao1 + 3); | |||
| data05 = *(ao2 + 0); | |||
| data06 = *(ao2 + 1); | |||
| data07 = *(ao2 + 2); | |||
| data08 = *(ao2 + 3); | |||
| data09 = *(ao3 + 0); | |||
| data10 = *(ao3 + 1); | |||
| data11 = *(ao3 + 2); | |||
| data12 = *(ao3 + 3); | |||
| data13 = *(ao4 + 0); | |||
| data14 = *(ao4 + 1); | |||
| data15 = *(ao4 + 2); | |||
| data16 = *(ao4 + 3); | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| b[ 2] = data03; | |||
| b[ 3] = data04; | |||
| b[ 4] = data05; | |||
| b[ 5] = data06; | |||
| b[ 6] = data07; | |||
| b[ 7] = data08; | |||
| b[ 8] = data09; | |||
| b[ 9] = data10; | |||
| b[10] = data11; | |||
| b[11] = data12; | |||
| b[12] = data13; | |||
| b[13] = data14; | |||
| b[14] = data15; | |||
| b[15] = data16; | |||
| ao1 += 4 * lda; | |||
| ao2 += 4 * lda; | |||
| ao3 += 4 * lda; | |||
| ao4 += 4 * lda; | |||
| b += 16; | |||
| } else { | |||
| #ifdef UNIT | |||
| data05 = *(ao2 + 0); | |||
| data09 = *(ao3 + 0); | |||
| data10 = *(ao3 + 1); | |||
| data13 = *(ao4 + 0); | |||
| data14 = *(ao4 + 1); | |||
| data15 = *(ao4 + 2); | |||
| b[ 0] = ONE; | |||
| b[ 1] = ZERO; | |||
| b[ 2] = ZERO; | |||
| b[ 3] = ZERO; | |||
| b[ 4] = data05; | |||
| b[ 5] = ONE; | |||
| b[ 6] = ZERO; | |||
| b[ 7] = ZERO; | |||
| b[ 8] = data09; | |||
| b[ 9] = data10; | |||
| b[10] = ONE; | |||
| b[11] = ZERO; | |||
| b[12] = data13; | |||
| b[13] = data14; | |||
| b[14] = data15; | |||
| b[15] = ONE; | |||
| #else | |||
| data01 = *(ao1 + 0); | |||
| data05 = *(ao2 + 0); | |||
| data06 = *(ao2 + 1); | |||
| data09 = *(ao3 + 0); | |||
| data10 = *(ao3 + 1); | |||
| data11 = *(ao3 + 2); | |||
| data13 = *(ao4 + 0); | |||
| data14 = *(ao4 + 1); | |||
| data15 = *(ao4 + 2); | |||
| data16 = *(ao4 + 3); | |||
| b[ 0] = data01; | |||
| b[ 1] = ZERO; | |||
| b[ 2] = ZERO; | |||
| b[ 3] = ZERO; | |||
| b[ 4] = data05; | |||
| b[ 5] = data06; | |||
| b[ 6] = ZERO; | |||
| b[ 7] = ZERO; | |||
| b[ 8] = data09; | |||
| b[ 9] = data10; | |||
| b[10] = data11; | |||
| b[11] = ZERO; | |||
| b[12] = data13; | |||
| b[13] = data14; | |||
| b[14] = data15; | |||
| b[15] = data16; | |||
| #endif | |||
| ao1 += 4 * lda; | |||
| ao2 += 4 * lda; | |||
| ao3 += 4 * lda; | |||
| ao4 += 4 * lda; | |||
| b += 16; | |||
| } | |||
| X += 4; | |||
| i --; | |||
| } while (i > 0); | |||
| } | |||
| i = (m & 3); | |||
| if (i) { | |||
| if (X < posY) { | |||
| if (m & 2) { | |||
| ao1 += 2; | |||
| ao2 += 2; | |||
| ao3 += 2; | |||
| ao4 += 2; | |||
| b += 8; | |||
| } | |||
| if (m & 1) { | |||
| ao1 += 1; | |||
| ao2 += 1; | |||
| ao3 += 1; | |||
| ao4 += 1; | |||
| b += 4; | |||
| } | |||
| } else | |||
| if (X > posY) { | |||
| if (m & 2) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| data03 = *(ao1 + 2); | |||
| data04 = *(ao1 + 3); | |||
| data05 = *(ao2 + 0); | |||
| data06 = *(ao2 + 1); | |||
| data07 = *(ao2 + 2); | |||
| data08 = *(ao2 + 3); | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| b[ 2] = data03; | |||
| b[ 3] = data04; | |||
| b[ 4] = data05; | |||
| b[ 5] = data06; | |||
| b[ 6] = data07; | |||
| b[ 7] = data08; | |||
| ao1 += 2 * lda; | |||
| ao2 += 2 * lda; | |||
| b += 8; | |||
| } | |||
| if (m & 1) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| data03 = *(ao1 + 2); | |||
| data04 = *(ao1 + 3); | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| b[ 2] = data03; | |||
| b[ 3] = data04; | |||
| ao1 += lda; | |||
| b += 4; | |||
| } | |||
| } else { | |||
| #ifdef UNIT | |||
| if (i >= 2) { | |||
| data05 = *(ao2 + 0); | |||
| } | |||
| if (i >= 3) { | |||
| data09 = *(ao3 + 0); | |||
| data10 = *(ao3 + 1); | |||
| } | |||
| b[ 0] = ONE; | |||
| b[ 1] = ZERO; | |||
| b[ 2] = ZERO; | |||
| b[ 3] = ZERO; | |||
| b += 4; | |||
| if(i >= 2) { | |||
| b[ 0] = data05; | |||
| b[ 1] = ONE; | |||
| b[ 2] = ZERO; | |||
| b[ 3] = ZERO; | |||
| b += 4; | |||
| } | |||
| if (i >= 3) { | |||
| b[ 0] = data09; | |||
| b[ 1] = data10; | |||
| b[ 2] = ONE; | |||
| b[ 3] = ZERO; | |||
| b += 4; | |||
| } | |||
| #else | |||
| data01 = *(ao1 + 0); | |||
| if (i >= 2) { | |||
| data05 = *(ao2 + 0); | |||
| data06 = *(ao2 + 1); | |||
| } | |||
| if (i >= 3) { | |||
| data09 = *(ao3 + 0); | |||
| data10 = *(ao3 + 1); | |||
| data11 = *(ao3 + 2); | |||
| } | |||
| b[ 0] = data01; | |||
| b[ 1] = ZERO; | |||
| b[ 2] = ZERO; | |||
| b[ 3] = ZERO; | |||
| b += 4; | |||
| if(i >= 2) { | |||
| b[ 0] = data05; | |||
| b[ 1] = data06; | |||
| b[ 2] = ZERO; | |||
| b[ 3] = ZERO; | |||
| b += 4; | |||
| } | |||
| if (i >= 3) { | |||
| b[ 0] = data09; | |||
| b[ 1] = data10; | |||
| b[ 2] = data11; | |||
| b[ 3] = ZERO; | |||
| b += 4; | |||
| } | |||
| #endif | |||
| } | |||
| } | |||
| posY += 4; | |||
| js --; | |||
| } while (js > 0); | |||
| } /* End of main loop */ | |||
| if (n & 2){ | |||
| X = posX; | |||
| if (posX <= posY) { | |||
| ao1 = a + posX + (posY + 0) * lda; | |||
| ao2 = a + posX + (posY + 1) * lda; | |||
| } else { | |||
| ao1 = a + posY + (posX + 0) * lda; | |||
| ao2 = a + posY + (posX + 1) * lda; | |||
| } | |||
| i = (m >> 1); | |||
| if (i > 0) { | |||
| do { | |||
| if (X < posY) { | |||
| ao1 += 2; | |||
| ao2 += 2; | |||
| b += 4; | |||
| } else | |||
| if (X > posY) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| data05 = *(ao2 + 0); | |||
| data06 = *(ao2 + 1); | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| b[ 2] = data05; | |||
| b[ 3] = data06; | |||
| ao1 += 2 * lda; | |||
| ao2 += 2 * lda; | |||
| b += 4; | |||
| } else { | |||
| #ifdef UNIT | |||
| data05 = *(ao2 + 0); | |||
| b[ 0] = ONE; | |||
| b[ 1] = ZERO; | |||
| b[ 2] = data05; | |||
| b[ 3] = ONE; | |||
| #else | |||
| data01 = *(ao1 + 0); | |||
| data05 = *(ao2 + 0); | |||
| data06 = *(ao2 + 1); | |||
| b[ 0] = data01; | |||
| b[ 1] = ZERO; | |||
| b[ 2] = data05; | |||
| b[ 3] = data06; | |||
| #endif | |||
| ao1 += 2 * lda; | |||
| ao2 += 2 * lda; | |||
| b += 4; | |||
| } | |||
| X += 2; | |||
| i --; | |||
| } while (i > 0); | |||
| } | |||
| i = (m & 1); | |||
| if (i) { | |||
| if (X < posY) { | |||
| ao1 += 2; | |||
| b += 2; | |||
| } else | |||
| if (X > posY) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| ao1 += lda; | |||
| b += 2; | |||
| } else { | |||
| #ifdef UNIT | |||
| b[ 0] = ONE; | |||
| b[ 1] = ZERO; | |||
| #else | |||
| data01 = *(ao1 + 0); | |||
| b[ 0] = data01; | |||
| b[ 1] = ZERO; | |||
| #endif | |||
| b += 2; | |||
| } | |||
| } | |||
| posY += 2; | |||
| } | |||
| if (n & 1){ | |||
| X = posX; | |||
| if (posX <= posY) { | |||
| ao1 = a + posX + (posY + 0) * lda; | |||
| } else { | |||
| ao1 = a + posY + (posX + 0) * lda; | |||
| } | |||
| i = m; | |||
| if (m > 0) { | |||
| do { | |||
| if (X < posY) { | |||
| b += 1; | |||
| ao1 += 1; | |||
| } else | |||
| if (X > posY) { | |||
| data01 = *(ao1 + 0); | |||
| b[ 0] = data01; | |||
| ao1 += lda; | |||
| b += 1; | |||
| } else { | |||
| #ifdef UNIT | |||
| b[ 0] = ONE; | |||
| #else | |||
| data01 = *(ao1 + 0); | |||
| b[ 0] = data01; | |||
| #endif | |||
| ao1 += lda; | |||
| b += 1; | |||
| } | |||
| X += 1; | |||
| i --; | |||
| } while (i > 0); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -58,6 +58,10 @@ static FLOAT dm1 = -1.; | |||
| #define GEMM_UNROLL_M_SHIFT 2 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_M == 6 | |||
| #define GEMM_UNROLL_M_SHIFT 2 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_M == 8 | |||
| #define GEMM_UNROLL_M_SHIFT 3 | |||
| #endif | |||
| @@ -58,6 +58,10 @@ static FLOAT dm1 = -1.; | |||
| #define GEMM_UNROLL_M_SHIFT 2 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_M == 6 | |||
| #define GEMM_UNROLL_M_SHIFT 2 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_M == 8 | |||
| #define GEMM_UNROLL_M_SHIFT 3 | |||
| #endif | |||
| @@ -58,6 +58,10 @@ static FLOAT dm1 = -1.; | |||
| #define GEMM_UNROLL_M_SHIFT 2 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_M == 6 | |||
| #define GEMM_UNROLL_M_SHIFT 2 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_M == 8 | |||
| #define GEMM_UNROLL_M_SHIFT 3 | |||
| #endif | |||
| @@ -58,6 +58,11 @@ static FLOAT dm1 = -1.; | |||
| #define GEMM_UNROLL_M_SHIFT 2 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_M == 6 | |||
| #define GEMM_UNROLL_M_SHIFT 2 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_M == 8 | |||
| #define GEMM_UNROLL_M_SHIFT 3 | |||
| #endif | |||
| @@ -0,0 +1,326 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #ifndef UNIT | |||
| #define INV(a) (ONE / (a)) | |||
| #else | |||
| #define INV(a) (ONE) | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
| BLASLONG i, ii, j, jj; | |||
| FLOAT data01, data02, data03, data04, data05, data06, data07, data08; | |||
| FLOAT data09, data10, data11, data12, data13, data14, data15, data16; | |||
| FLOAT *a1, *a2, *a3, *a4; | |||
| jj = offset; | |||
| j = (n >> 2); | |||
| while (j > 0){ | |||
| a1 = a + 0 * lda; | |||
| a2 = a + 1 * lda; | |||
| a3 = a + 2 * lda; | |||
| a4 = a + 3 * lda; | |||
| i = (m >> 2); | |||
| ii = 0; | |||
| while (i > 0) { | |||
| if (ii == jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| data02 = *(a1 + 1); | |||
| data03 = *(a1 + 2); | |||
| data04 = *(a1 + 3); | |||
| #ifndef UNIT | |||
| data06 = *(a2 + 1); | |||
| #endif | |||
| data07 = *(a2 + 2); | |||
| data08 = *(a2 + 3); | |||
| #ifndef UNIT | |||
| data11 = *(a3 + 2); | |||
| #endif | |||
| data12 = *(a3 + 3); | |||
| #ifndef UNIT | |||
| data16 = *(a4 + 3); | |||
| #endif | |||
| *(b + 0) = INV(data01); | |||
| *(b + 4) = data02; | |||
| *(b + 5) = INV(data06); | |||
| *(b + 8) = data03; | |||
| *(b + 9) = data07; | |||
| *(b + 10) = INV(data11); | |||
| *(b + 12) = data04; | |||
| *(b + 13) = data08; | |||
| *(b + 14) = data12; | |||
| *(b + 15) = INV(data16); | |||
| } | |||
| if (ii > jj) { | |||
| data01 = *(a1 + 0); | |||
| data02 = *(a1 + 1); | |||
| data03 = *(a1 + 2); | |||
| data04 = *(a1 + 3); | |||
| data05 = *(a2 + 0); | |||
| data06 = *(a2 + 1); | |||
| data07 = *(a2 + 2); | |||
| data08 = *(a2 + 3); | |||
| data09 = *(a3 + 0); | |||
| data10 = *(a3 + 1); | |||
| data11 = *(a3 + 2); | |||
| data12 = *(a3 + 3); | |||
| data13 = *(a4 + 0); | |||
| data14 = *(a4 + 1); | |||
| data15 = *(a4 + 2); | |||
| data16 = *(a4 + 3); | |||
| *(b + 0) = data01; | |||
| *(b + 1) = data05; | |||
| *(b + 2) = data09; | |||
| *(b + 3) = data13; | |||
| *(b + 4) = data02; | |||
| *(b + 5) = data06; | |||
| *(b + 6) = data10; | |||
| *(b + 7) = data14; | |||
| *(b + 8) = data03; | |||
| *(b + 9) = data07; | |||
| *(b + 10) = data11; | |||
| *(b + 11) = data15; | |||
| *(b + 12) = data04; | |||
| *(b + 13) = data08; | |||
| *(b + 14) = data12; | |||
| *(b + 15) = data16; | |||
| } | |||
| a1 += 4; | |||
| a2 += 4; | |||
| a3 += 4; | |||
| a4 += 4; | |||
| b += 16; | |||
| i --; | |||
| ii += 4; | |||
| } | |||
| if ((m & 2) != 0) { | |||
| if (ii== jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| data02 = *(a1 + 1); | |||
| #ifndef UNIT | |||
| data06 = *(a2 + 1); | |||
| #endif | |||
| *(b + 0) = INV(data01); | |||
| *(b + 4) = data02; | |||
| *(b + 5) = INV(data06); | |||
| } | |||
| if (ii > jj) { | |||
| data01 = *(a1 + 0); | |||
| data02 = *(a1 + 1); | |||
| data03 = *(a2 + 0); | |||
| data04 = *(a2 + 1); | |||
| data05 = *(a3 + 0); | |||
| data06 = *(a3 + 1); | |||
| data07 = *(a4 + 0); | |||
| data08 = *(a4 + 1); | |||
| *(b + 0) = data01; | |||
| *(b + 1) = data03; | |||
| *(b + 2) = data05; | |||
| *(b + 3) = data07; | |||
| *(b + 4) = data02; | |||
| *(b + 5) = data04; | |||
| *(b + 6) = data06; | |||
| *(b + 7) = data08; | |||
| } | |||
| a1 += 2; | |||
| a2 += 2; | |||
| a3 += 2; | |||
| a4 += 2; | |||
| b += 8; | |||
| ii += 2; | |||
| } | |||
| if ((m & 1) != 0) { | |||
| if (ii== jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| *(b + 0) = INV(data01); | |||
| } | |||
| if (ii > jj) { | |||
| data01 = *(a1 + 0); | |||
| data02 = *(a2 + 0); | |||
| data03 = *(a3 + 0); | |||
| data04 = *(a4 + 0); | |||
| *(b + 0) = data01; | |||
| *(b + 1) = data02; | |||
| *(b + 2) = data03; | |||
| *(b + 3) = data04; | |||
| } | |||
| b += 4; | |||
| } | |||
| a += 4 * lda; | |||
| jj += 4; | |||
| j --; | |||
| } | |||
| if (n & 2) { | |||
| a1 = a + 0 * lda; | |||
| a2 = a + 1 * lda; | |||
| i = (m >> 1); | |||
| ii = 0; | |||
| while (i > 0) { | |||
| if (ii == jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| data02 = *(a1 + 1); | |||
| #ifndef UNIT | |||
| data04 = *(a2 + 1); | |||
| #endif | |||
| *(b + 0) = INV(data01); | |||
| *(b + 2) = data02; | |||
| *(b + 3) = INV(data04); | |||
| } | |||
| if (ii > jj) { | |||
| data01 = *(a1 + 0); | |||
| data02 = *(a1 + 1); | |||
| data03 = *(a2 + 0); | |||
| data04 = *(a2 + 1); | |||
| *(b + 0) = data01; | |||
| *(b + 1) = data03; | |||
| *(b + 2) = data02; | |||
| *(b + 3) = data04; | |||
| } | |||
| a1 += 2; | |||
| a2 += 2; | |||
| b += 4; | |||
| i --; | |||
| ii += 2; | |||
| } | |||
| if ((m & 1) != 0) { | |||
| if (ii== jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| *(b + 0) = INV(data01); | |||
| } | |||
| if (ii > jj) { | |||
| data01 = *(a1 + 0); | |||
| data02 = *(a2 + 0); | |||
| *(b + 0) = data01; | |||
| *(b + 1) = data02; | |||
| } | |||
| b += 2; | |||
| } | |||
| a += 2 * lda; | |||
| jj += 2; | |||
| } | |||
| if (n & 1) { | |||
| a1 = a + 0 * lda; | |||
| i = m; | |||
| ii = 0; | |||
| while (i > 0) { | |||
| if (ii == jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| *(b + 0) = INV(data01); | |||
| } | |||
| if (ii > jj) { | |||
| data01 = *(a1 + 0); | |||
| *(b + 0) = data01; | |||
| } | |||
| a1+= 1; | |||
| b += 1; | |||
| i --; | |||
| ii += 1; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,346 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #ifndef UNIT | |||
| #define INV(a) (ONE / (a)) | |||
| #else | |||
| #define INV(a) (ONE) | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
| BLASLONG i, ii, j, jj; | |||
| FLOAT data01, data02, data03, data04, data05, data06, data07, data08; | |||
| FLOAT data09, data10, data11, data12, data13, data14, data15, data16; | |||
| FLOAT *a1, *a2, *a3, *a4; | |||
| jj = offset; | |||
| j = (n >> 2); | |||
| while (j > 0){ | |||
| a1 = a + 0 * lda; | |||
| a2 = a + 1 * lda; | |||
| a3 = a + 2 * lda; | |||
| a4 = a + 3 * lda; | |||
| i = (m >> 2); | |||
| ii = 0; | |||
| while (i > 0) { | |||
| if (ii == jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| data02 = *(a1 + 1); | |||
| data03 = *(a1 + 2); | |||
| data04 = *(a1 + 3); | |||
| #ifndef UNIT | |||
| data06 = *(a2 + 1); | |||
| #endif | |||
| data07 = *(a2 + 2); | |||
| data08 = *(a2 + 3); | |||
| #ifndef UNIT | |||
| data11 = *(a3 + 2); | |||
| #endif | |||
| data12 = *(a3 + 3); | |||
| #ifndef UNIT | |||
| data16 = *(a4 + 3); | |||
| #endif | |||
| *(b + 0) = INV(data01); | |||
| *(b + 1) = data02; | |||
| *(b + 2) = data03; | |||
| *(b + 3) = data04; | |||
| *(b + 5) = INV(data06); | |||
| *(b + 6) = data07; | |||
| *(b + 7) = data08; | |||
| *(b + 10) = INV(data11); | |||
| *(b + 11) = data12; | |||
| *(b + 15) = INV(data16); | |||
| } | |||
| if (ii < jj) { | |||
| data01 = *(a1 + 0); | |||
| data02 = *(a1 + 1); | |||
| data03 = *(a1 + 2); | |||
| data04 = *(a1 + 3); | |||
| data05 = *(a2 + 0); | |||
| data06 = *(a2 + 1); | |||
| data07 = *(a2 + 2); | |||
| data08 = *(a2 + 3); | |||
| data09 = *(a3 + 0); | |||
| data10 = *(a3 + 1); | |||
| data11 = *(a3 + 2); | |||
| data12 = *(a3 + 3); | |||
| data13 = *(a4 + 0); | |||
| data14 = *(a4 + 1); | |||
| data15 = *(a4 + 2); | |||
| data16 = *(a4 + 3); | |||
| *(b + 0) = data01; | |||
| *(b + 1) = data02; | |||
| *(b + 2) = data03; | |||
| *(b + 3) = data04; | |||
| *(b + 4) = data05; | |||
| *(b + 5) = data06; | |||
| *(b + 6) = data07; | |||
| *(b + 7) = data08; | |||
| *(b + 8) = data09; | |||
| *(b + 9) = data10; | |||
| *(b + 10) = data11; | |||
| *(b + 11) = data12; | |||
| *(b + 12) = data13; | |||
| *(b + 13) = data14; | |||
| *(b + 14) = data15; | |||
| *(b + 15) = data16; | |||
| } | |||
| a1 += 4 * lda; | |||
| a2 += 4 * lda; | |||
| a3 += 4 * lda; | |||
| a4 += 4 * lda; | |||
| b += 16; | |||
| i --; | |||
| ii += 4; | |||
| } | |||
| if ((m & 2) != 0) { | |||
| if (ii== jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| data02 = *(a1 + 1); | |||
| data03 = *(a1 + 2); | |||
| data04 = *(a1 + 3); | |||
| #ifndef UNIT | |||
| data06 = *(a2 + 1); | |||
| #endif | |||
| data07 = *(a2 + 2); | |||
| data08 = *(a2 + 3); | |||
| *(b + 0) = INV(data01); | |||
| *(b + 1) = data02; | |||
| *(b + 2) = data03; | |||
| *(b + 3) = data04; | |||
| *(b + 5) = INV(data06); | |||
| *(b + 6) = data07; | |||
| *(b + 7) = data08; | |||
| } | |||
| if (ii < jj) { | |||
| data01 = *(a1 + 0); | |||
| data02 = *(a1 + 1); | |||
| data03 = *(a1 + 2); | |||
| data04 = *(a1 + 3); | |||
| data05 = *(a2 + 0); | |||
| data06 = *(a2 + 1); | |||
| data07 = *(a2 + 2); | |||
| data08 = *(a2 + 3); | |||
| *(b + 0) = data01; | |||
| *(b + 1) = data02; | |||
| *(b + 2) = data03; | |||
| *(b + 3) = data04; | |||
| *(b + 4) = data05; | |||
| *(b + 5) = data06; | |||
| *(b + 6) = data07; | |||
| *(b + 7) = data08; | |||
| } | |||
| a1 += 2 * lda; | |||
| a2 += 2 * lda; | |||
| b += 8; | |||
| ii += 2; | |||
| } | |||
| if ((m & 1) != 0) { | |||
| if (ii== jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| data02 = *(a1 + 1); | |||
| data03 = *(a1 + 2); | |||
| data04 = *(a1 + 3); | |||
| *(b + 0) = INV(data01); | |||
| *(b + 1) = data02; | |||
| *(b + 2) = data03; | |||
| *(b + 3) = data04; | |||
| } | |||
| if (ii < jj) { | |||
| data01 = *(a1 + 0); | |||
| data02 = *(a1 + 1); | |||
| data03 = *(a1 + 2); | |||
| data04 = *(a1 + 3); | |||
| *(b + 0) = data01; | |||
| *(b + 1) = data02; | |||
| *(b + 2) = data03; | |||
| *(b + 3) = data04; | |||
| } | |||
| b += 4; | |||
| } | |||
| a += 4; | |||
| jj += 4; | |||
| j --; | |||
| } | |||
| if (n & 2) { | |||
| a1 = a + 0 * lda; | |||
| a2 = a + 1 * lda; | |||
| i = (m >> 1); | |||
| ii = 0; | |||
| while (i > 0) { | |||
| if (ii == jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| data02 = *(a1 + 1); | |||
| #ifndef UNIT | |||
| data04 = *(a2 + 1); | |||
| #endif | |||
| *(b + 0) = INV(data01); | |||
| *(b + 1) = data02; | |||
| *(b + 3) = INV(data04); | |||
| } | |||
| if (ii < jj) { | |||
| data01 = *(a1 + 0); | |||
| data02 = *(a1 + 1); | |||
| data03 = *(a2 + 0); | |||
| data04 = *(a2 + 1); | |||
| *(b + 0) = data01; | |||
| *(b + 1) = data02; | |||
| *(b + 2) = data03; | |||
| *(b + 3) = data04; | |||
| } | |||
| a1 += 2 * lda; | |||
| a2 += 2 * lda; | |||
| b += 4; | |||
| i --; | |||
| ii += 2; | |||
| } | |||
| if ((m & 1) != 0) { | |||
| if (ii== jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| *(b + 0) = INV(data01); | |||
| } | |||
| if (ii < jj) { | |||
| data01 = *(a1 + 0); | |||
| data02 = *(a1 + 1); | |||
| *(b + 0) = data01; | |||
| *(b + 1) = data02; | |||
| } | |||
| b += 2; | |||
| } | |||
| a += 2; | |||
| jj += 2; | |||
| } | |||
| if (n & 1) { | |||
| a1 = a + 0 * lda; | |||
| i = m; | |||
| ii = 0; | |||
| while (i > 0) { | |||
| if (ii == jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| *(b + 0) = INV(data01); | |||
| } | |||
| if (ii < jj) { | |||
| data01 = *(a1 + 0); | |||
| *(b + 0) = data01; | |||
| } | |||
| a1 += 1 * lda; | |||
| b += 1; | |||
| i --; | |||
| ii += 1; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,350 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #ifndef UNIT | |||
| #define INV(a) (ONE / (a)) | |||
| #else | |||
| #define INV(a) (ONE) | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
| BLASLONG i, ii, j, jj; | |||
| FLOAT data01, data02, data03, data04, data05, data06, data07, data08; | |||
| FLOAT data09, data10, data11, data12, data13, data14, data15, data16; | |||
| FLOAT *a1, *a2, *a3, *a4; | |||
| jj = offset; | |||
| j = (n >> 2); | |||
| while (j > 0){ | |||
| a1 = a + 0 * lda; | |||
| a2 = a + 1 * lda; | |||
| a3 = a + 2 * lda; | |||
| a4 = a + 3 * lda; | |||
| i = (m >> 2); | |||
| ii = 0; | |||
| while (i > 0) { | |||
| if (ii == jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| data05 = *(a2 + 0); | |||
| #ifndef UNIT | |||
| data06 = *(a2 + 1); | |||
| #endif | |||
| data09 = *(a3 + 0); | |||
| data10 = *(a3 + 1); | |||
| #ifndef UNIT | |||
| data11 = *(a3 + 2); | |||
| #endif | |||
| data13 = *(a4 + 0); | |||
| data14 = *(a4 + 1); | |||
| data15 = *(a4 + 2); | |||
| #ifndef UNIT | |||
| data16 = *(a4 + 3); | |||
| #endif | |||
| *(b + 0) = INV(data01); | |||
| *(b + 1) = data05; | |||
| *(b + 2) = data09; | |||
| *(b + 3) = data13; | |||
| *(b + 5) = INV(data06); | |||
| *(b + 6) = data10; | |||
| *(b + 7) = data14; | |||
| *(b + 10) = INV(data11); | |||
| *(b + 11) = data15; | |||
| *(b + 15) = INV(data16); | |||
| } | |||
| if (ii < jj) { | |||
| data01 = *(a1 + 0); | |||
| data02 = *(a1 + 1); | |||
| data03 = *(a1 + 2); | |||
| data04 = *(a1 + 3); | |||
| data05 = *(a2 + 0); | |||
| data06 = *(a2 + 1); | |||
| data07 = *(a2 + 2); | |||
| data08 = *(a2 + 3); | |||
| data09 = *(a3 + 0); | |||
| data10 = *(a3 + 1); | |||
| data11 = *(a3 + 2); | |||
| data12 = *(a3 + 3); | |||
| data13 = *(a4 + 0); | |||
| data14 = *(a4 + 1); | |||
| data15 = *(a4 + 2); | |||
| data16 = *(a4 + 3); | |||
| *(b + 0) = data01; | |||
| *(b + 1) = data05; | |||
| *(b + 2) = data09; | |||
| *(b + 3) = data13; | |||
| *(b + 4) = data02; | |||
| *(b + 5) = data06; | |||
| *(b + 6) = data10; | |||
| *(b + 7) = data14; | |||
| *(b + 8) = data03; | |||
| *(b + 9) = data07; | |||
| *(b + 10) = data11; | |||
| *(b + 11) = data15; | |||
| *(b + 12) = data04; | |||
| *(b + 13) = data08; | |||
| *(b + 14) = data12; | |||
| *(b + 15) = data16; | |||
| } | |||
| a1 += 4; | |||
| a2 += 4; | |||
| a3 += 4; | |||
| a4 += 4; | |||
| b += 16; | |||
| i --; | |||
| ii += 4; | |||
| } | |||
| if ((m & 2) != 0) { | |||
| if (ii== jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| data05 = *(a2 + 0); | |||
| #ifndef UNIT | |||
| data06 = *(a2 + 1); | |||
| #endif | |||
| data09 = *(a3 + 0); | |||
| data10 = *(a3 + 1); | |||
| data13 = *(a4 + 0); | |||
| data14 = *(a4 + 1); | |||
| *(b + 0) = INV(data01); | |||
| *(b + 1) = data05; | |||
| *(b + 2) = data09; | |||
| *(b + 3) = data13; | |||
| *(b + 5) = INV(data06); | |||
| *(b + 6) = data10; | |||
| *(b + 7) = data14; | |||
| } | |||
| if (ii < jj) { | |||
| data01 = *(a1 + 0); | |||
| data02 = *(a1 + 1); | |||
| data03 = *(a2 + 0); | |||
| data04 = *(a2 + 1); | |||
| data05 = *(a3 + 0); | |||
| data06 = *(a3 + 1); | |||
| data07 = *(a4 + 0); | |||
| data08 = *(a4 + 1); | |||
| *(b + 0) = data01; | |||
| *(b + 1) = data02; | |||
| *(b + 2) = data03; | |||
| *(b + 3) = data04; | |||
| *(b + 4) = data05; | |||
| *(b + 5) = data06; | |||
| *(b + 6) = data07; | |||
| *(b + 7) = data08; | |||
| } | |||
| a1 += 2; | |||
| a2 += 2; | |||
| b += 8; | |||
| ii += 2; | |||
| } | |||
| if ((m & 1) != 0) { | |||
| if (ii== jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| data05 = *(a2 + 0); | |||
| data09 = *(a3 + 0); | |||
| data13 = *(a4 + 0); | |||
| *(b + 0) = INV(data01); | |||
| *(b + 1) = data05; | |||
| *(b + 2) = data09; | |||
| *(b + 3) = data13; | |||
| } | |||
| if (ii < jj) { | |||
| data01 = *(a1 + 0); | |||
| data02 = *(a2 + 0); | |||
| data03 = *(a3 + 0); | |||
| data04 = *(a4 + 0); | |||
| *(b + 0) = data01; | |||
| *(b + 1) = data02; | |||
| *(b + 2) = data03; | |||
| *(b + 3) = data04; | |||
| } | |||
| b += 4; | |||
| } | |||
| a += 4 * lda; | |||
| jj += 4; | |||
| j --; | |||
| } | |||
| if (n & 2) { | |||
| a1 = a + 0 * lda; | |||
| a2 = a + 1 * lda; | |||
| i = (m >> 1); | |||
| ii = 0; | |||
| while (i > 0) { | |||
| if (ii == jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| data03 = *(a2 + 0); | |||
| #ifndef UNIT | |||
| data04 = *(a2 + 1); | |||
| #endif | |||
| *(b + 0) = INV(data01); | |||
| *(b + 1) = data03; | |||
| *(b + 3) = INV(data04); | |||
| } | |||
| if (ii < jj) { | |||
| data01 = *(a1 + 0); | |||
| data02 = *(a1 + 1); | |||
| data03 = *(a2 + 0); | |||
| data04 = *(a2 + 1); | |||
| *(b + 0) = data01; | |||
| *(b + 1) = data03; | |||
| *(b + 2) = data02; | |||
| *(b + 3) = data04; | |||
| } | |||
| a1 += 2; | |||
| a2 += 2; | |||
| b += 4; | |||
| i --; | |||
| ii += 2; | |||
| } | |||
| if ((m & 1) != 0) { | |||
| if (ii== jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| data03 = *(a2 + 0); | |||
| *(b + 0) = INV(data01); | |||
| *(b + 1) = data03; | |||
| } | |||
| if (ii < jj) { | |||
| data01 = *(a1 + 0); | |||
| data02 = *(a2 + 0); | |||
| *(b + 0) = data01; | |||
| *(b + 1) = data02; | |||
| } | |||
| b += 2; | |||
| } | |||
| a += 2 * lda; | |||
| jj += 2; | |||
| } | |||
| if (n & 1) { | |||
| a1 = a + 0 * lda; | |||
| i = m; | |||
| ii = 0; | |||
| while (i > 0) { | |||
| if (ii == jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| *(b + 0) = INV(data01); | |||
| } | |||
| if (ii < jj) { | |||
| data01 = *(a1 + 0); | |||
| *(b + 0) = data01; | |||
| } | |||
| a1+= 1; | |||
| b += 1; | |||
| i --; | |||
| ii += 1; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,322 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #ifndef UNIT /* INV(a) is the value the copy routine below stores for a diagonal element */ | |||
| #define INV(a) (ONE / (a)) /* UNIT not defined: ONE divided by the argument */ | |||
| #else | |||
| #define INV(a) (ONE) /* UNIT defined: constant ONE; the argument is never evaluated */ | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
| BLASLONG i, ii, j, jj; | |||
| FLOAT data01, data02, data03, data04, data05, data06, data07, data08; | |||
| FLOAT data09, data10, data11, data12, data13, data14, data15, data16; | |||
| FLOAT *a1, *a2, *a3, *a4; | |||
| jj = offset; | |||
| j = (n >> 2); | |||
| while (j > 0){ | |||
| a1 = a + 0 * lda; | |||
| a2 = a + 1 * lda; | |||
| a3 = a + 2 * lda; | |||
| a4 = a + 3 * lda; | |||
| i = (m >> 2); | |||
| ii = 0; | |||
| while (i > 0) { | |||
| if (ii == jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| data05 = *(a2 + 0); | |||
| #ifndef UNIT | |||
| data06 = *(a2 + 1); | |||
| #endif | |||
| data09 = *(a3 + 0); | |||
| data10 = *(a3 + 1); | |||
| #ifndef UNIT | |||
| data11 = *(a3 + 2); | |||
| #endif | |||
| data13 = *(a4 + 0); | |||
| data14 = *(a4 + 1); | |||
| data15 = *(a4 + 2); | |||
| #ifndef UNIT | |||
| data16 = *(a4 + 3); | |||
| #endif | |||
| *(b + 0) = INV(data01); | |||
| *(b + 4) = data05; | |||
| *(b + 5) = INV(data06); | |||
| *(b + 8) = data09; | |||
| *(b + 9) = data10; | |||
| *(b + 10) = INV(data11); | |||
| *(b + 12) = data13; | |||
| *(b + 13) = data14; | |||
| *(b + 14) = data15; | |||
| *(b + 15) = INV(data16); | |||
| } | |||
| if (ii > jj) { | |||
| data01 = *(a1 + 0); | |||
| data02 = *(a1 + 1); | |||
| data03 = *(a1 + 2); | |||
| data04 = *(a1 + 3); | |||
| data05 = *(a2 + 0); | |||
| data06 = *(a2 + 1); | |||
| data07 = *(a2 + 2); | |||
| data08 = *(a2 + 3); | |||
| data09 = *(a3 + 0); | |||
| data10 = *(a3 + 1); | |||
| data11 = *(a3 + 2); | |||
| data12 = *(a3 + 3); | |||
| data13 = *(a4 + 0); | |||
| data14 = *(a4 + 1); | |||
| data15 = *(a4 + 2); | |||
| data16 = *(a4 + 3); | |||
| *(b + 0) = data01; | |||
| *(b + 1) = data02; | |||
| *(b + 2) = data03; | |||
| *(b + 3) = data04; | |||
| *(b + 4) = data05; | |||
| *(b + 5) = data06; | |||
| *(b + 6) = data07; | |||
| *(b + 7) = data08; | |||
| *(b + 8) = data09; | |||
| *(b + 9) = data10; | |||
| *(b + 10) = data11; | |||
| *(b + 11) = data12; | |||
| *(b + 12) = data13; | |||
| *(b + 13) = data14; | |||
| *(b + 14) = data15; | |||
| *(b + 15) = data16; | |||
| } | |||
| a1 += 4 * lda; | |||
| a2 += 4 * lda; | |||
| a3 += 4 * lda; | |||
| a4 += 4 * lda; | |||
| b += 16; | |||
| i --; | |||
| ii += 4; | |||
| } | |||
| if ((m & 2) != 0) { | |||
| if (ii== jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| data05 = *(a2 + 0); | |||
| #ifndef UNIT | |||
| data06 = *(a2 + 1); | |||
| #endif | |||
| *(b + 0) = INV(data01); | |||
| *(b + 4) = data05; | |||
| *(b + 5) = INV(data06); | |||
| } | |||
| if (ii > jj) { | |||
| data01 = *(a1 + 0); | |||
| data02 = *(a1 + 1); | |||
| data03 = *(a1 + 2); | |||
| data04 = *(a1 + 3); | |||
| data05 = *(a2 + 0); | |||
| data06 = *(a2 + 1); | |||
| data07 = *(a2 + 2); | |||
| data08 = *(a2 + 3); | |||
| *(b + 0) = data01; | |||
| *(b + 1) = data02; | |||
| *(b + 2) = data03; | |||
| *(b + 3) = data04; | |||
| *(b + 4) = data05; | |||
| *(b + 5) = data06; | |||
| *(b + 6) = data07; | |||
| *(b + 7) = data08; | |||
| } | |||
| a1 += 2 * lda; | |||
| a2 += 2 * lda; | |||
| b += 8; | |||
| ii += 2; | |||
| } | |||
| if ((m & 1) != 0) { | |||
| if (ii== jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| *(b + 0) = INV(data01); | |||
| } | |||
| if (ii > jj) { | |||
| data01 = *(a1 + 0); | |||
| data02 = *(a1 + 1); | |||
| data03 = *(a1 + 2); | |||
| data04 = *(a1 + 3); | |||
| *(b + 0) = data01; | |||
| *(b + 1) = data02; | |||
| *(b + 2) = data03; | |||
| *(b + 3) = data04; | |||
| } | |||
| b += 4; | |||
| } | |||
| a += 4; | |||
| jj += 4; | |||
| j --; | |||
| } | |||
| if (n & 2) { | |||
| a1 = a + 0 * lda; | |||
| a2 = a + 1 * lda; | |||
| i = (m >> 1); | |||
| ii = 0; | |||
| while (i > 0) { | |||
| if (ii == jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| data03 = *(a2 + 0); | |||
| #ifndef UNIT | |||
| data04 = *(a2 + 1); | |||
| #endif | |||
| *(b + 0) = INV(data01); | |||
| *(b + 2) = data03; | |||
| *(b + 3) = INV(data04); | |||
| } | |||
| if (ii > jj) { | |||
| data01 = *(a1 + 0); | |||
| data02 = *(a1 + 1); | |||
| data03 = *(a2 + 0); | |||
| data04 = *(a2 + 1); | |||
| *(b + 0) = data01; | |||
| *(b + 1) = data02; | |||
| *(b + 2) = data03; | |||
| *(b + 3) = data04; | |||
| } | |||
| a1 += 2 * lda; | |||
| a2 += 2 * lda; | |||
| b += 4; | |||
| i --; | |||
| ii += 2; | |||
| } | |||
| if ((m & 1) != 0) { | |||
| if (ii== jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| *(b + 0) = INV(data01); | |||
| } | |||
| if (ii > jj) { | |||
| data01 = *(a1 + 0); | |||
| data02 = *(a1 + 1); | |||
| *(b + 0) = data01; | |||
| *(b + 1) = data02; | |||
| } | |||
| b += 2; | |||
| } | |||
| a += 2; | |||
| jj += 2; | |||
| } | |||
| if (n & 1) { | |||
| a1 = a + 0 * lda; | |||
| i = m; | |||
| ii = 0; | |||
| while (i > 0) { | |||
| if (ii == jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| *(b + 0) = INV(data01); | |||
| } | |||
| if (ii > jj) { | |||
| data01 = *(a1 + 0); | |||
| *(b + 0) = data01; | |||
| } | |||
| a1 += 1 * lda; | |||
| b += 1; | |||
| i --; | |||
| ii += 1; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -7,7 +7,7 @@ DAXPYKERNEL = daxpy_bulldozer.S | |||
| DDOTKERNEL = ddot_bulldozer.S | |||
| DCOPYKERNEL = dcopy_bulldozer.S | |||
| SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S | |||
| SGEMMKERNEL = sgemm_kernel_16x2_piledriver.S | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_16.c | |||
| SGEMMONCOPY = gemm_ncopy_2_bulldozer.S | |||
| @@ -16,7 +16,8 @@ SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S | |||
| DGEMMKERNEL = dgemm_kernel_8x2_piledriver.S | |||
| DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S | |||
| DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S | |||
| DGEMMONCOPY = gemm_ncopy_2_bulldozer.S | |||
| @@ -25,7 +26,8 @@ DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S | |||
| CGEMMKERNEL = cgemm_kernel_4x2_piledriver.S | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| @@ -34,7 +36,7 @@ CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S | |||
| ZGEMMKERNEL = zgemm_kernel_2x2_piledriver.S | |||
| ZGEMMINCOPY = | |||
| ZGEMMITCOPY = | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| @@ -52,9 +54,10 @@ STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S | |||
| DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||