| @@ -34,8 +34,10 @@ Please read GotoBLAS_01Readme.txt | |||||
| Additional support CPU: | Additional support CPU: | ||||
| x86_64: | x86_64: | ||||
| Intel Xeon 56xx (Westmere) //Used GotoBLAS2 Nehalem codes. | Intel Xeon 56xx (Westmere) //Used GotoBLAS2 Nehalem codes. | ||||
| Intel Sandy Bridge | |||||
| MIPS64: | MIPS64: | ||||
| ICT Loongson 3A //Level 3 BLAS subroutines are optimized. | |||||
| ICT Loongson 3A | |||||
| ICT Loongson 3B (Experimental) | |||||
| 4.Usages | 4.Usages | ||||
| Link with libopenblas.a or -lopenblas for shared library. | Link with libopenblas.a or -lopenblas for shared library. | ||||
| @@ -70,10 +72,10 @@ OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas | |||||
| 8.ChangeLog | 8.ChangeLog | ||||
| Please see Changelog.txt for the differences from the GotoBLAS2 1.13 BSD version. | Please see Changelog.txt for the differences from the GotoBLAS2 1.13 BSD version. | ||||
| 9.Known Issues | |||||
| * The number of CPUs/Cores should be less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit | |||||
| is 64. On 32 bits, it is 32. | |||||
| * On Loongson 3A, "make test" may fail because pthread_create returns the error code EAGAIN. However, the same test case passes when it is run directly from the shell, so I don't think this is a bug in OpenBLAS. | |||||
| 9.Troubleshooting | |||||
| * Please use Clang version 3.1 or later to compile the library on the Sandy Bridge microarchitecture. Clang 3.0 generates incorrect AVX binary code. | |||||
| * The number of CPUs/Cores should be less than or equal to 256. | |||||
| * On Loongson 3A, "make test" may fail because pthread_create returns the error code EAGAIN. However, the same test case passes when it is run directly from the shell. | |||||
| 10. Specification of Git Branches | 10. Specification of Git Branches | ||||
| We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/). | We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/). | ||||
| @@ -425,6 +425,7 @@ REALNAME: | |||||
| #define ALIGN_2 .align 2 | #define ALIGN_2 .align 2 | ||||
| #define ALIGN_3 .align 3 | #define ALIGN_3 .align 3 | ||||
| #define ALIGN_4 .align 4 | #define ALIGN_4 .align 4 | ||||
| #define ALIGN_5 .align 5 | |||||
| #define ffreep fstp | #define ffreep fstp | ||||
| #endif | #endif | ||||
| @@ -0,0 +1,235 @@ | |||||
| /***************************************************************************** | |||||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the ISCAS nor the names of its contributors may | |||||
| be used to endorse or promote products derived from this software | |||||
| without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| **********************************************************************************/ | |||||
| #include <stdio.h> | |||||
| #include "common.h" | |||||
| int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) | |||||
| { | |||||
| BLASLONG i,j; | |||||
| BLASLONG idx=0; | |||||
| BLASLONG ii; | |||||
| FLOAT *src0,*src1,*src2,*src3,*dest0; | |||||
| for (j=0; j<col/4; j+=1) | |||||
| { | |||||
| src0 = src; | |||||
| src1 = src0+2*srcdim; | |||||
| src2 = src1+2*srcdim; | |||||
| src3 = src2+2*srcdim; | |||||
| src = src3+2*srcdim; | |||||
| dest0 = dest; | |||||
| ii = (row<<3); | |||||
| dest = dest+ii; | |||||
| for (i=0; i<row/4; i+=1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src1[0]; | |||||
| dest0[3] = src1[1]; | |||||
| dest0[4] = src2[0]; | |||||
| dest0[5] = src2[1]; | |||||
| dest0[6] = src3[0]; | |||||
| dest0[7] = src3[1]; | |||||
| dest0[8] = src0[2]; | |||||
| dest0[9] = src0[3]; | |||||
| dest0[10] = src1[2]; | |||||
| dest0[11] = src1[3]; | |||||
| dest0[12] = src2[2]; | |||||
| dest0[13] = src2[3]; | |||||
| dest0[14] = src3[2]; | |||||
| dest0[15] = src3[3]; | |||||
| dest0[16] = src0[4]; | |||||
| dest0[17] = src0[5]; | |||||
| dest0[18] = src1[4]; | |||||
| dest0[19] = src1[5]; | |||||
| dest0[20] = src2[4]; | |||||
| dest0[21] = src2[5]; | |||||
| dest0[22] = src3[4]; | |||||
| dest0[23] = src3[5]; | |||||
| dest0[24] = src0[6]; | |||||
| dest0[25] = src0[7]; | |||||
| dest0[26] = src1[6]; | |||||
| dest0[27] = src1[7]; | |||||
| dest0[28] = src2[6]; | |||||
| dest0[29] = src2[7]; | |||||
| dest0[30] = src3[6]; | |||||
| dest0[31] = src3[7]; | |||||
| src0 = src0+8; | |||||
| src1 = src1+8; | |||||
| src2 = src2+8; | |||||
| src3 = src3+8; | |||||
| ii = (4<<3); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (row&2) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src1[0]; | |||||
| dest0[3] = src1[1]; | |||||
| dest0[4] = src2[0]; | |||||
| dest0[5] = src2[1]; | |||||
| dest0[6] = src3[0]; | |||||
| dest0[7] = src3[1]; | |||||
| dest0[8] = src0[2]; | |||||
| dest0[9] = src0[3]; | |||||
| dest0[10] = src1[2]; | |||||
| dest0[11] = src1[3]; | |||||
| dest0[12] = src2[2]; | |||||
| dest0[13] = src2[3]; | |||||
| dest0[14] = src3[2]; | |||||
| dest0[15] = src3[3]; | |||||
| src0 = src0+4; | |||||
| src1 = src1+4; | |||||
| src2 = src2+4; | |||||
| src3 = src3+4; | |||||
| ii = (2<<3); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (row&1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src1[0]; | |||||
| dest0[3] = src1[1]; | |||||
| dest0[4] = src2[0]; | |||||
| dest0[5] = src2[1]; | |||||
| dest0[6] = src3[0]; | |||||
| dest0[7] = src3[1]; | |||||
| src0 = src0+2; | |||||
| src1 = src1+2; | |||||
| src2 = src2+2; | |||||
| src3 = src3+2; | |||||
| ii = (1<<3); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| } | |||||
| if (col&2) | |||||
| { | |||||
| src0 = src; | |||||
| src1 = src0+2*srcdim; | |||||
| src = src1+2*srcdim; | |||||
| dest0 = dest; | |||||
| ii = (row<<2); | |||||
| dest = dest+ii; | |||||
| for (i=0; i<row/4; i+=1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src1[0]; | |||||
| dest0[3] = src1[1]; | |||||
| dest0[4] = src0[2]; | |||||
| dest0[5] = src0[3]; | |||||
| dest0[6] = src1[2]; | |||||
| dest0[7] = src1[3]; | |||||
| dest0[8] = src0[4]; | |||||
| dest0[9] = src0[5]; | |||||
| dest0[10] = src1[4]; | |||||
| dest0[11] = src1[5]; | |||||
| dest0[12] = src0[6]; | |||||
| dest0[13] = src0[7]; | |||||
| dest0[14] = src1[6]; | |||||
| dest0[15] = src1[7]; | |||||
| src0 = src0+8; | |||||
| src1 = src1+8; | |||||
| ii = (4<<2); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (row&2) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src1[0]; | |||||
| dest0[3] = src1[1]; | |||||
| dest0[4] = src0[2]; | |||||
| dest0[5] = src0[3]; | |||||
| dest0[6] = src1[2]; | |||||
| dest0[7] = src1[3]; | |||||
| src0 = src0+4; | |||||
| src1 = src1+4; | |||||
| ii = (2<<2); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (row&1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src1[0]; | |||||
| dest0[3] = src1[1]; | |||||
| src0 = src0+2; | |||||
| src1 = src1+2; | |||||
| ii = (1<<2); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| } | |||||
| if (col&1) | |||||
| { | |||||
| src0 = src; | |||||
| src = src0+2*srcdim; | |||||
| dest0 = dest; | |||||
| ii = (row<<1); | |||||
| dest = dest+ii; | |||||
| for (i=0; i<row/4; i+=1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src0[2]; | |||||
| dest0[3] = src0[3]; | |||||
| dest0[4] = src0[4]; | |||||
| dest0[5] = src0[5]; | |||||
| dest0[6] = src0[6]; | |||||
| dest0[7] = src0[7]; | |||||
| src0 = src0+8; | |||||
| ii = (4<<1); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (row&2) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src0[2]; | |||||
| dest0[3] = src0[3]; | |||||
| src0 = src0+4; | |||||
| ii = (2<<1); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (row&1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| src0 = src0+2; | |||||
| ii = (1<<1); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,401 @@ | |||||
| /***************************************************************************** | |||||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the ISCAS nor the names of its contributors may | |||||
| be used to endorse or promote products derived from this software | |||||
| without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| **********************************************************************************/ | |||||
| #include <stdio.h> | |||||
| #include "common.h" | |||||
| int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) | |||||
| { | |||||
| BLASLONG i,j; | |||||
| BLASLONG idx=0; | |||||
| BLASLONG ii; | |||||
| FLOAT *src0,*src1,*src2,*src3,*src4,*src5,*src6,*src7,*dest0; | |||||
| for (j=0; j<col/8; j+=1) | |||||
| { | |||||
| src0 = src; | |||||
| src1 = src0+2*srcdim; | |||||
| src2 = src1+2*srcdim; | |||||
| src3 = src2+2*srcdim; | |||||
| src4 = src3+2*srcdim; | |||||
| src5 = src4+2*srcdim; | |||||
| src6 = src5+2*srcdim; | |||||
| src7 = src6+2*srcdim; | |||||
| src = src7+2*srcdim; | |||||
| dest0 = dest; | |||||
| ii = (row<<4); | |||||
| dest = dest+ii; | |||||
| for (i=0; i<row/4; i+=1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src1[0]; | |||||
| dest0[3] = src1[1]; | |||||
| dest0[4] = src2[0]; | |||||
| dest0[5] = src2[1]; | |||||
| dest0[6] = src3[0]; | |||||
| dest0[7] = src3[1]; | |||||
| dest0[8] = src4[0]; | |||||
| dest0[9] = src4[1]; | |||||
| dest0[10] = src5[0]; | |||||
| dest0[11] = src5[1]; | |||||
| dest0[12] = src6[0]; | |||||
| dest0[13] = src6[1]; | |||||
| dest0[14] = src7[0]; | |||||
| dest0[15] = src7[1]; | |||||
| dest0[16] = src0[2]; | |||||
| dest0[17] = src0[3]; | |||||
| dest0[18] = src1[2]; | |||||
| dest0[19] = src1[3]; | |||||
| dest0[20] = src2[2]; | |||||
| dest0[21] = src2[3]; | |||||
| dest0[22] = src3[2]; | |||||
| dest0[23] = src3[3]; | |||||
| dest0[24] = src4[2]; | |||||
| dest0[25] = src4[3]; | |||||
| dest0[26] = src5[2]; | |||||
| dest0[27] = src5[3]; | |||||
| dest0[28] = src6[2]; | |||||
| dest0[29] = src6[3]; | |||||
| dest0[30] = src7[2]; | |||||
| dest0[31] = src7[3]; | |||||
| dest0[32] = src0[4]; | |||||
| dest0[33] = src0[5]; | |||||
| dest0[34] = src1[4]; | |||||
| dest0[35] = src1[5]; | |||||
| dest0[36] = src2[4]; | |||||
| dest0[37] = src2[5]; | |||||
| dest0[38] = src3[4]; | |||||
| dest0[39] = src3[5]; | |||||
| dest0[40] = src4[4]; | |||||
| dest0[41] = src4[5]; | |||||
| dest0[42] = src5[4]; | |||||
| dest0[43] = src5[5]; | |||||
| dest0[44] = src6[4]; | |||||
| dest0[45] = src6[5]; | |||||
| dest0[46] = src7[4]; | |||||
| dest0[47] = src7[5]; | |||||
| dest0[48] = src0[6]; | |||||
| dest0[49] = src0[7]; | |||||
| dest0[50] = src1[6]; | |||||
| dest0[51] = src1[7]; | |||||
| dest0[52] = src2[6]; | |||||
| dest0[53] = src2[7]; | |||||
| dest0[54] = src3[6]; | |||||
| dest0[55] = src3[7]; | |||||
| dest0[56] = src4[6]; | |||||
| dest0[57] = src4[7]; | |||||
| dest0[58] = src5[6]; | |||||
| dest0[59] = src5[7]; | |||||
| dest0[60] = src6[6]; | |||||
| dest0[61] = src6[7]; | |||||
| dest0[62] = src7[6]; | |||||
| dest0[63] = src7[7]; | |||||
| src0 = src0+8; | |||||
| src1 = src1+8; | |||||
| src2 = src2+8; | |||||
| src3 = src3+8; | |||||
| src4 = src4+8; | |||||
| src5 = src5+8; | |||||
| src6 = src6+8; | |||||
| src7 = src7+8; | |||||
| ii = (4<<4); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (row&2) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src1[0]; | |||||
| dest0[3] = src1[1]; | |||||
| dest0[4] = src2[0]; | |||||
| dest0[5] = src2[1]; | |||||
| dest0[6] = src3[0]; | |||||
| dest0[7] = src3[1]; | |||||
| dest0[8] = src4[0]; | |||||
| dest0[9] = src4[1]; | |||||
| dest0[10] = src5[0]; | |||||
| dest0[11] = src5[1]; | |||||
| dest0[12] = src6[0]; | |||||
| dest0[13] = src6[1]; | |||||
| dest0[14] = src7[0]; | |||||
| dest0[15] = src7[1]; | |||||
| dest0[16] = src0[2]; | |||||
| dest0[17] = src0[3]; | |||||
| dest0[18] = src1[2]; | |||||
| dest0[19] = src1[3]; | |||||
| dest0[20] = src2[2]; | |||||
| dest0[21] = src2[3]; | |||||
| dest0[22] = src3[2]; | |||||
| dest0[23] = src3[3]; | |||||
| dest0[24] = src4[2]; | |||||
| dest0[25] = src4[3]; | |||||
| dest0[26] = src5[2]; | |||||
| dest0[27] = src5[3]; | |||||
| dest0[28] = src6[2]; | |||||
| dest0[29] = src6[3]; | |||||
| dest0[30] = src7[2]; | |||||
| dest0[31] = src7[3]; | |||||
| src0 = src0+4; | |||||
| src1 = src1+4; | |||||
| src2 = src2+4; | |||||
| src3 = src3+4; | |||||
| src4 = src4+4; | |||||
| src5 = src5+4; | |||||
| src6 = src6+4; | |||||
| src7 = src7+4; | |||||
| ii = (2<<4); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (row&1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src1[0]; | |||||
| dest0[3] = src1[1]; | |||||
| dest0[4] = src2[0]; | |||||
| dest0[5] = src2[1]; | |||||
| dest0[6] = src3[0]; | |||||
| dest0[7] = src3[1]; | |||||
| dest0[8] = src4[0]; | |||||
| dest0[9] = src4[1]; | |||||
| dest0[10] = src5[0]; | |||||
| dest0[11] = src5[1]; | |||||
| dest0[12] = src6[0]; | |||||
| dest0[13] = src6[1]; | |||||
| dest0[14] = src7[0]; | |||||
| dest0[15] = src7[1]; | |||||
| src0 = src0+2; | |||||
| src1 = src1+2; | |||||
| src2 = src2+2; | |||||
| src3 = src3+2; | |||||
| src4 = src4+2; | |||||
| src5 = src5+2; | |||||
| src6 = src6+2; | |||||
| src7 = src7+2; | |||||
| ii = (1<<4); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| } | |||||
| if (col&4) | |||||
| { | |||||
| src0 = src; | |||||
| src1 = src0+2*srcdim; | |||||
| src2 = src1+2*srcdim; | |||||
| src3 = src2+2*srcdim; | |||||
| src = src3+2*srcdim; | |||||
| dest0 = dest; | |||||
| ii = (row<<3); | |||||
| dest = dest+ii; | |||||
| for (i=0; i<row/4; i+=1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src1[0]; | |||||
| dest0[3] = src1[1]; | |||||
| dest0[4] = src2[0]; | |||||
| dest0[5] = src2[1]; | |||||
| dest0[6] = src3[0]; | |||||
| dest0[7] = src3[1]; | |||||
| dest0[8] = src0[2]; | |||||
| dest0[9] = src0[3]; | |||||
| dest0[10] = src1[2]; | |||||
| dest0[11] = src1[3]; | |||||
| dest0[12] = src2[2]; | |||||
| dest0[13] = src2[3]; | |||||
| dest0[14] = src3[2]; | |||||
| dest0[15] = src3[3]; | |||||
| dest0[16] = src0[4]; | |||||
| dest0[17] = src0[5]; | |||||
| dest0[18] = src1[4]; | |||||
| dest0[19] = src1[5]; | |||||
| dest0[20] = src2[4]; | |||||
| dest0[21] = src2[5]; | |||||
| dest0[22] = src3[4]; | |||||
| dest0[23] = src3[5]; | |||||
| dest0[24] = src0[6]; | |||||
| dest0[25] = src0[7]; | |||||
| dest0[26] = src1[6]; | |||||
| dest0[27] = src1[7]; | |||||
| dest0[28] = src2[6]; | |||||
| dest0[29] = src2[7]; | |||||
| dest0[30] = src3[6]; | |||||
| dest0[31] = src3[7]; | |||||
| src0 = src0+8; | |||||
| src1 = src1+8; | |||||
| src2 = src2+8; | |||||
| src3 = src3+8; | |||||
| ii = (4<<3); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (row&2) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src1[0]; | |||||
| dest0[3] = src1[1]; | |||||
| dest0[4] = src2[0]; | |||||
| dest0[5] = src2[1]; | |||||
| dest0[6] = src3[0]; | |||||
| dest0[7] = src3[1]; | |||||
| dest0[8] = src0[2]; | |||||
| dest0[9] = src0[3]; | |||||
| dest0[10] = src1[2]; | |||||
| dest0[11] = src1[3]; | |||||
| dest0[12] = src2[2]; | |||||
| dest0[13] = src2[3]; | |||||
| dest0[14] = src3[2]; | |||||
| dest0[15] = src3[3]; | |||||
| src0 = src0+4; | |||||
| src1 = src1+4; | |||||
| src2 = src2+4; | |||||
| src3 = src3+4; | |||||
| ii = (2<<3); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (row&1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src1[0]; | |||||
| dest0[3] = src1[1]; | |||||
| dest0[4] = src2[0]; | |||||
| dest0[5] = src2[1]; | |||||
| dest0[6] = src3[0]; | |||||
| dest0[7] = src3[1]; | |||||
| src0 = src0+2; | |||||
| src1 = src1+2; | |||||
| src2 = src2+2; | |||||
| src3 = src3+2; | |||||
| ii = (1<<3); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| } | |||||
| if (col&2) | |||||
| { | |||||
| src0 = src; | |||||
| src1 = src0+2*srcdim; | |||||
| src = src1+2*srcdim; | |||||
| dest0 = dest; | |||||
| ii = (row<<2); | |||||
| dest = dest+ii; | |||||
| for (i=0; i<row/4; i+=1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src1[0]; | |||||
| dest0[3] = src1[1]; | |||||
| dest0[4] = src0[2]; | |||||
| dest0[5] = src0[3]; | |||||
| dest0[6] = src1[2]; | |||||
| dest0[7] = src1[3]; | |||||
| dest0[8] = src0[4]; | |||||
| dest0[9] = src0[5]; | |||||
| dest0[10] = src1[4]; | |||||
| dest0[11] = src1[5]; | |||||
| dest0[12] = src0[6]; | |||||
| dest0[13] = src0[7]; | |||||
| dest0[14] = src1[6]; | |||||
| dest0[15] = src1[7]; | |||||
| src0 = src0+8; | |||||
| src1 = src1+8; | |||||
| ii = (4<<2); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (row&2) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src1[0]; | |||||
| dest0[3] = src1[1]; | |||||
| dest0[4] = src0[2]; | |||||
| dest0[5] = src0[3]; | |||||
| dest0[6] = src1[2]; | |||||
| dest0[7] = src1[3]; | |||||
| src0 = src0+4; | |||||
| src1 = src1+4; | |||||
| ii = (2<<2); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (row&1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src1[0]; | |||||
| dest0[3] = src1[1]; | |||||
| src0 = src0+2; | |||||
| src1 = src1+2; | |||||
| ii = (1<<2); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| } | |||||
| if (col&1) | |||||
| { | |||||
| src0 = src; | |||||
| src = src0+2*srcdim; | |||||
| dest0 = dest; | |||||
| ii = (row<<1); | |||||
| dest = dest+ii; | |||||
| for (i=0; i<row/4; i+=1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src0[2]; | |||||
| dest0[3] = src0[3]; | |||||
| dest0[4] = src0[4]; | |||||
| dest0[5] = src0[5]; | |||||
| dest0[6] = src0[6]; | |||||
| dest0[7] = src0[7]; | |||||
| src0 = src0+8; | |||||
| ii = (4<<1); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (row&2) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src0[2]; | |||||
| dest0[3] = src0[3]; | |||||
| src0 = src0+4; | |||||
| ii = (2<<1); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (row&1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| src0 = src0+2; | |||||
| ii = (1<<1); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,237 @@ | |||||
| /***************************************************************************** | |||||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the ISCAS nor the names of its contributors may | |||||
| be used to endorse or promote products derived from this software | |||||
| without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| **********************************************************************************/ | |||||
| #include <stdio.h> | |||||
| #include "common.h" | |||||
| int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) | |||||
| { | |||||
| BLASLONG i,j; | |||||
| BLASLONG idx=0; | |||||
| BLASLONG ii; | |||||
| FLOAT *src0,*src1,*src2,*src3,*dest0; | |||||
| FLOAT *dest1,*dest2; | |||||
| ii = col&-4; | |||||
| ii = ii*(2*row); | |||||
| dest2 = dest+ii; | |||||
| ii = col&-2; | |||||
| ii = ii*(2*row); | |||||
| dest1 = dest+ii; | |||||
| for (j=0; j<row/4; j+=1) | |||||
| { | |||||
| src0 = src; | |||||
| src1 = src0+2*srcdim; | |||||
| src2 = src1+2*srcdim; | |||||
| src3 = src2+2*srcdim; | |||||
| src = src3+2*srcdim; | |||||
| dest0 = dest; | |||||
| ii = (4<<3); | |||||
| dest = dest+ii; | |||||
| for (i=0; i<col/4; i+=1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src0[2]; | |||||
| dest0[3] = src0[3]; | |||||
| dest0[4] = src0[4]; | |||||
| dest0[5] = src0[5]; | |||||
| dest0[6] = src0[6]; | |||||
| dest0[7] = src0[7]; | |||||
| dest0[8] = src1[0]; | |||||
| dest0[9] = src1[1]; | |||||
| dest0[10] = src1[2]; | |||||
| dest0[11] = src1[3]; | |||||
| dest0[12] = src1[4]; | |||||
| dest0[13] = src1[5]; | |||||
| dest0[14] = src1[6]; | |||||
| dest0[15] = src1[7]; | |||||
| dest0[16] = src2[0]; | |||||
| dest0[17] = src2[1]; | |||||
| dest0[18] = src2[2]; | |||||
| dest0[19] = src2[3]; | |||||
| dest0[20] = src2[4]; | |||||
| dest0[21] = src2[5]; | |||||
| dest0[22] = src2[6]; | |||||
| dest0[23] = src2[7]; | |||||
| dest0[24] = src3[0]; | |||||
| dest0[25] = src3[1]; | |||||
| dest0[26] = src3[2]; | |||||
| dest0[27] = src3[3]; | |||||
| dest0[28] = src3[4]; | |||||
| dest0[29] = src3[5]; | |||||
| dest0[30] = src3[6]; | |||||
| dest0[31] = src3[7]; | |||||
| src0 = src0+8; | |||||
| src1 = src1+8; | |||||
| src2 = src2+8; | |||||
| src3 = src3+8; | |||||
| ii = (row<<3); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (col&2) | |||||
| { | |||||
| dest2[0] = src0[0]; | |||||
| dest2[1] = src0[1]; | |||||
| dest2[2] = src0[2]; | |||||
| dest2[3] = src0[3]; | |||||
| dest2[4] = src1[0]; | |||||
| dest2[5] = src1[1]; | |||||
| dest2[6] = src1[2]; | |||||
| dest2[7] = src1[3]; | |||||
| dest2[8] = src2[0]; | |||||
| dest2[9] = src2[1]; | |||||
| dest2[10] = src2[2]; | |||||
| dest2[11] = src2[3]; | |||||
| dest2[12] = src3[0]; | |||||
| dest2[13] = src3[1]; | |||||
| dest2[14] = src3[2]; | |||||
| dest2[15] = src3[3]; | |||||
| src0 = src0+4; | |||||
| src1 = src1+4; | |||||
| src2 = src2+4; | |||||
| src3 = src3+4; | |||||
| dest2 = dest2+16; | |||||
| } | |||||
| if (col&1) | |||||
| { | |||||
| dest1[0] = src0[0]; | |||||
| dest1[1] = src0[1]; | |||||
| dest1[2] = src1[0]; | |||||
| dest1[3] = src1[1]; | |||||
| dest1[4] = src2[0]; | |||||
| dest1[5] = src2[1]; | |||||
| dest1[6] = src3[0]; | |||||
| dest1[7] = src3[1]; | |||||
| src0 = src0+2; | |||||
| src1 = src1+2; | |||||
| src2 = src2+2; | |||||
| src3 = src3+2; | |||||
| dest1 = dest1+8; | |||||
| } | |||||
| } | |||||
| if (row&2) | |||||
| { | |||||
| src0 = src; | |||||
| src1 = src0+2*srcdim; | |||||
| src = src1+2*srcdim; | |||||
| dest0 = dest; | |||||
| ii = (2<<3); | |||||
| dest = dest+ii; | |||||
| for (i=0; i<col/4; i+=1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src0[2]; | |||||
| dest0[3] = src0[3]; | |||||
| dest0[4] = src0[4]; | |||||
| dest0[5] = src0[5]; | |||||
| dest0[6] = src0[6]; | |||||
| dest0[7] = src0[7]; | |||||
| dest0[8] = src1[0]; | |||||
| dest0[9] = src1[1]; | |||||
| dest0[10] = src1[2]; | |||||
| dest0[11] = src1[3]; | |||||
| dest0[12] = src1[4]; | |||||
| dest0[13] = src1[5]; | |||||
| dest0[14] = src1[6]; | |||||
| dest0[15] = src1[7]; | |||||
| src0 = src0+8; | |||||
| src1 = src1+8; | |||||
| ii = (row<<3); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (col&2) | |||||
| { | |||||
| dest2[0] = src0[0]; | |||||
| dest2[1] = src0[1]; | |||||
| dest2[2] = src0[2]; | |||||
| dest2[3] = src0[3]; | |||||
| dest2[4] = src1[0]; | |||||
| dest2[5] = src1[1]; | |||||
| dest2[6] = src1[2]; | |||||
| dest2[7] = src1[3]; | |||||
| src0 = src0+4; | |||||
| src1 = src1+4; | |||||
| dest2 = dest2+8; | |||||
| } | |||||
| if (col&1) | |||||
| { | |||||
| dest1[0] = src0[0]; | |||||
| dest1[1] = src0[1]; | |||||
| dest1[2] = src1[0]; | |||||
| dest1[3] = src1[1]; | |||||
| src0 = src0+2; | |||||
| src1 = src1+2; | |||||
| dest1 = dest1+4; | |||||
| } | |||||
| } | |||||
| if (row&1) | |||||
| { | |||||
| src0 = src; | |||||
| src = src0+2*srcdim; | |||||
| dest0 = dest; | |||||
| ii = (1<<3); | |||||
| dest = dest+ii; | |||||
| for (i=0; i<col/4; i+=1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src0[2]; | |||||
| dest0[3] = src0[3]; | |||||
| dest0[4] = src0[4]; | |||||
| dest0[5] = src0[5]; | |||||
| dest0[6] = src0[6]; | |||||
| dest0[7] = src0[7]; | |||||
| src0 = src0+8; | |||||
| ii = (row<<3); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (col&2) | |||||
| { | |||||
| dest2[0] = src0[0]; | |||||
| dest2[1] = src0[1]; | |||||
| dest2[2] = src0[2]; | |||||
| dest2[3] = src0[3]; | |||||
| src0 = src0+4; | |||||
| dest2 = dest2+4; | |||||
| } | |||||
| if (col&1) | |||||
| { | |||||
| dest1[0] = src0[0]; | |||||
| dest1[1] = src0[1]; | |||||
| src0 = src0+2; | |||||
| dest1 = dest1+2; | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,370 @@ | |||||
| /***************************************************************************** | |||||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the ISCAS nor the names of its contributors may | |||||
| be used to endorse or promote products derived from this software | |||||
| without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| **********************************************************************************/ | |||||
| #include <stdio.h> | |||||
| #include "common.h" | |||||
| int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) | |||||
| { | |||||
| BLASLONG i,j; | |||||
| BLASLONG idx=0; | |||||
| BLASLONG ii; | |||||
| FLOAT *src0,*src1,*src2,*src3,*dest0; | |||||
| FLOAT *dest1,*dest2,*dest4; | |||||
| ii = col&-8; | |||||
| ii = ii*(2*row); | |||||
| dest4 = dest+ii; | |||||
| ii = col&-4; | |||||
| ii = ii*(2*row); | |||||
| dest2 = dest+ii; | |||||
| ii = col&-2; | |||||
| ii = ii*(2*row); | |||||
| dest1 = dest+ii; | |||||
| for (j=0; j<row/4; j+=1) | |||||
| { | |||||
| src0 = src; | |||||
| src1 = src0+2*srcdim; | |||||
| src2 = src1+2*srcdim; | |||||
| src3 = src2+2*srcdim; | |||||
| src = src3+2*srcdim; | |||||
| dest0 = dest; | |||||
| ii = (4<<4); | |||||
| dest = dest+ii; | |||||
| for (i=0; i<col/8; i+=1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src0[2]; | |||||
| dest0[3] = src0[3]; | |||||
| dest0[4] = src0[4]; | |||||
| dest0[5] = src0[5]; | |||||
| dest0[6] = src0[6]; | |||||
| dest0[7] = src0[7]; | |||||
| dest0[8] = src0[8]; | |||||
| dest0[9] = src0[9]; | |||||
| dest0[10] = src0[10]; | |||||
| dest0[11] = src0[11]; | |||||
| dest0[12] = src0[12]; | |||||
| dest0[13] = src0[13]; | |||||
| dest0[14] = src0[14]; | |||||
| dest0[15] = src0[15]; | |||||
| dest0[16] = src1[0]; | |||||
| dest0[17] = src1[1]; | |||||
| dest0[18] = src1[2]; | |||||
| dest0[19] = src1[3]; | |||||
| dest0[20] = src1[4]; | |||||
| dest0[21] = src1[5]; | |||||
| dest0[22] = src1[6]; | |||||
| dest0[23] = src1[7]; | |||||
| dest0[24] = src1[8]; | |||||
| dest0[25] = src1[9]; | |||||
| dest0[26] = src1[10]; | |||||
| dest0[27] = src1[11]; | |||||
| dest0[28] = src1[12]; | |||||
| dest0[29] = src1[13]; | |||||
| dest0[30] = src1[14]; | |||||
| dest0[31] = src1[15]; | |||||
| dest0[32] = src2[0]; | |||||
| dest0[33] = src2[1]; | |||||
| dest0[34] = src2[2]; | |||||
| dest0[35] = src2[3]; | |||||
| dest0[36] = src2[4]; | |||||
| dest0[37] = src2[5]; | |||||
| dest0[38] = src2[6]; | |||||
| dest0[39] = src2[7]; | |||||
| dest0[40] = src2[8]; | |||||
| dest0[41] = src2[9]; | |||||
| dest0[42] = src2[10]; | |||||
| dest0[43] = src2[11]; | |||||
| dest0[44] = src2[12]; | |||||
| dest0[45] = src2[13]; | |||||
| dest0[46] = src2[14]; | |||||
| dest0[47] = src2[15]; | |||||
| dest0[48] = src3[0]; | |||||
| dest0[49] = src3[1]; | |||||
| dest0[50] = src3[2]; | |||||
| dest0[51] = src3[3]; | |||||
| dest0[52] = src3[4]; | |||||
| dest0[53] = src3[5]; | |||||
| dest0[54] = src3[6]; | |||||
| dest0[55] = src3[7]; | |||||
| dest0[56] = src3[8]; | |||||
| dest0[57] = src3[9]; | |||||
| dest0[58] = src3[10]; | |||||
| dest0[59] = src3[11]; | |||||
| dest0[60] = src3[12]; | |||||
| dest0[61] = src3[13]; | |||||
| dest0[62] = src3[14]; | |||||
| dest0[63] = src3[15]; | |||||
| src0 = src0+16; | |||||
| src1 = src1+16; | |||||
| src2 = src2+16; | |||||
| src3 = src3+16; | |||||
| ii = (row<<4); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (col&4) | |||||
| { | |||||
| dest4[0] = src0[0]; | |||||
| dest4[1] = src0[1]; | |||||
| dest4[2] = src0[2]; | |||||
| dest4[3] = src0[3]; | |||||
| dest4[4] = src0[4]; | |||||
| dest4[5] = src0[5]; | |||||
| dest4[6] = src0[6]; | |||||
| dest4[7] = src0[7]; | |||||
| dest4[8] = src1[0]; | |||||
| dest4[9] = src1[1]; | |||||
| dest4[10] = src1[2]; | |||||
| dest4[11] = src1[3]; | |||||
| dest4[12] = src1[4]; | |||||
| dest4[13] = src1[5]; | |||||
| dest4[14] = src1[6]; | |||||
| dest4[15] = src1[7]; | |||||
| dest4[16] = src2[0]; | |||||
| dest4[17] = src2[1]; | |||||
| dest4[18] = src2[2]; | |||||
| dest4[19] = src2[3]; | |||||
| dest4[20] = src2[4]; | |||||
| dest4[21] = src2[5]; | |||||
| dest4[22] = src2[6]; | |||||
| dest4[23] = src2[7]; | |||||
| dest4[24] = src3[0]; | |||||
| dest4[25] = src3[1]; | |||||
| dest4[26] = src3[2]; | |||||
| dest4[27] = src3[3]; | |||||
| dest4[28] = src3[4]; | |||||
| dest4[29] = src3[5]; | |||||
| dest4[30] = src3[6]; | |||||
| dest4[31] = src3[7]; | |||||
| src0 = src0+8; | |||||
| src1 = src1+8; | |||||
| src2 = src2+8; | |||||
| src3 = src3+8; | |||||
| dest4 = dest4+32; | |||||
| } | |||||
| if (col&2) | |||||
| { | |||||
| dest2[0] = src0[0]; | |||||
| dest2[1] = src0[1]; | |||||
| dest2[2] = src0[2]; | |||||
| dest2[3] = src0[3]; | |||||
| dest2[4] = src1[0]; | |||||
| dest2[5] = src1[1]; | |||||
| dest2[6] = src1[2]; | |||||
| dest2[7] = src1[3]; | |||||
| dest2[8] = src2[0]; | |||||
| dest2[9] = src2[1]; | |||||
| dest2[10] = src2[2]; | |||||
| dest2[11] = src2[3]; | |||||
| dest2[12] = src3[0]; | |||||
| dest2[13] = src3[1]; | |||||
| dest2[14] = src3[2]; | |||||
| dest2[15] = src3[3]; | |||||
| src0 = src0+4; | |||||
| src1 = src1+4; | |||||
| src2 = src2+4; | |||||
| src3 = src3+4; | |||||
| dest2 = dest2+16; | |||||
| } | |||||
| if (col&1) | |||||
| { | |||||
| dest1[0] = src0[0]; | |||||
| dest1[1] = src0[1]; | |||||
| dest1[2] = src1[0]; | |||||
| dest1[3] = src1[1]; | |||||
| dest1[4] = src2[0]; | |||||
| dest1[5] = src2[1]; | |||||
| dest1[6] = src3[0]; | |||||
| dest1[7] = src3[1]; | |||||
| src0 = src0+2; | |||||
| src1 = src1+2; | |||||
| src2 = src2+2; | |||||
| src3 = src3+2; | |||||
| dest1 = dest1+8; | |||||
| } | |||||
| } | |||||
| if (row&2) | |||||
| { | |||||
| src0 = src; | |||||
| src1 = src0+2*srcdim; | |||||
| src = src1+2*srcdim; | |||||
| dest0 = dest; | |||||
| ii = (2<<4); | |||||
| dest = dest+ii; | |||||
| for (i=0; i<col/8; i+=1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src0[2]; | |||||
| dest0[3] = src0[3]; | |||||
| dest0[4] = src0[4]; | |||||
| dest0[5] = src0[5]; | |||||
| dest0[6] = src0[6]; | |||||
| dest0[7] = src0[7]; | |||||
| dest0[8] = src0[8]; | |||||
| dest0[9] = src0[9]; | |||||
| dest0[10] = src0[10]; | |||||
| dest0[11] = src0[11]; | |||||
| dest0[12] = src0[12]; | |||||
| dest0[13] = src0[13]; | |||||
| dest0[14] = src0[14]; | |||||
| dest0[15] = src0[15]; | |||||
| dest0[16] = src1[0]; | |||||
| dest0[17] = src1[1]; | |||||
| dest0[18] = src1[2]; | |||||
| dest0[19] = src1[3]; | |||||
| dest0[20] = src1[4]; | |||||
| dest0[21] = src1[5]; | |||||
| dest0[22] = src1[6]; | |||||
| dest0[23] = src1[7]; | |||||
| dest0[24] = src1[8]; | |||||
| dest0[25] = src1[9]; | |||||
| dest0[26] = src1[10]; | |||||
| dest0[27] = src1[11]; | |||||
| dest0[28] = src1[12]; | |||||
| dest0[29] = src1[13]; | |||||
| dest0[30] = src1[14]; | |||||
| dest0[31] = src1[15]; | |||||
| src0 = src0+16; | |||||
| src1 = src1+16; | |||||
| ii = (row<<4); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (col&4) | |||||
| { | |||||
| dest4[0] = src0[0]; | |||||
| dest4[1] = src0[1]; | |||||
| dest4[2] = src0[2]; | |||||
| dest4[3] = src0[3]; | |||||
| dest4[4] = src0[4]; | |||||
| dest4[5] = src0[5]; | |||||
| dest4[6] = src0[6]; | |||||
| dest4[7] = src0[7]; | |||||
| dest4[8] = src1[0]; | |||||
| dest4[9] = src1[1]; | |||||
| dest4[10] = src1[2]; | |||||
| dest4[11] = src1[3]; | |||||
| dest4[12] = src1[4]; | |||||
| dest4[13] = src1[5]; | |||||
| dest4[14] = src1[6]; | |||||
| dest4[15] = src1[7]; | |||||
| src0 = src0+8; | |||||
| src1 = src1+8; | |||||
| dest4 = dest4+16; | |||||
| } | |||||
| if (col&2) | |||||
| { | |||||
| dest2[0] = src0[0]; | |||||
| dest2[1] = src0[1]; | |||||
| dest2[2] = src0[2]; | |||||
| dest2[3] = src0[3]; | |||||
| dest2[4] = src1[0]; | |||||
| dest2[5] = src1[1]; | |||||
| dest2[6] = src1[2]; | |||||
| dest2[7] = src1[3]; | |||||
| src0 = src0+4; | |||||
| src1 = src1+4; | |||||
| dest2 = dest2+8; | |||||
| } | |||||
| if (col&1) | |||||
| { | |||||
| dest1[0] = src0[0]; | |||||
| dest1[1] = src0[1]; | |||||
| dest1[2] = src1[0]; | |||||
| dest1[3] = src1[1]; | |||||
| src0 = src0+2; | |||||
| src1 = src1+2; | |||||
| dest1 = dest1+4; | |||||
| } | |||||
| } | |||||
| if (row&1) | |||||
| { | |||||
| src0 = src; | |||||
| src = src0+2*srcdim; | |||||
| dest0 = dest; | |||||
| ii = (1<<4); | |||||
| dest = dest+ii; | |||||
| for (i=0; i<col/8; i+=1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src0[2]; | |||||
| dest0[3] = src0[3]; | |||||
| dest0[4] = src0[4]; | |||||
| dest0[5] = src0[5]; | |||||
| dest0[6] = src0[6]; | |||||
| dest0[7] = src0[7]; | |||||
| dest0[8] = src0[8]; | |||||
| dest0[9] = src0[9]; | |||||
| dest0[10] = src0[10]; | |||||
| dest0[11] = src0[11]; | |||||
| dest0[12] = src0[12]; | |||||
| dest0[13] = src0[13]; | |||||
| dest0[14] = src0[14]; | |||||
| dest0[15] = src0[15]; | |||||
| src0 = src0+16; | |||||
| ii = (row<<4); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (col&4) | |||||
| { | |||||
| dest4[0] = src0[0]; | |||||
| dest4[1] = src0[1]; | |||||
| dest4[2] = src0[2]; | |||||
| dest4[3] = src0[3]; | |||||
| dest4[4] = src0[4]; | |||||
| dest4[5] = src0[5]; | |||||
| dest4[6] = src0[6]; | |||||
| dest4[7] = src0[7]; | |||||
| src0 = src0+8; | |||||
| dest4 = dest4+8; | |||||
| } | |||||
| if (col&2) | |||||
| { | |||||
| dest2[0] = src0[0]; | |||||
| dest2[1] = src0[1]; | |||||
| dest2[2] = src0[2]; | |||||
| dest2[3] = src0[3]; | |||||
| src0 = src0+4; | |||||
| dest2 = dest2+4; | |||||
| } | |||||
| if (col&1) | |||||
| { | |||||
| dest1[0] = src0[0]; | |||||
| dest1[1] = src0[1]; | |||||
| src0 = src0+2; | |||||
| dest1 = dest1+2; | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -1,59 +1,84 @@ | |||||
| SGEMMKERNEL = gemm_kernel_4x8_nehalem.S | |||||
| SGEMMINCOPY = gemm_ncopy_4.S | |||||
| SGEMMITCOPY = gemm_tcopy_4.S | |||||
| SGEMMKERNEL = sgemm_kernel_8x8_sandy.S | |||||
| SGEMMINCOPY = | |||||
| SGEMMITCOPY = | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_8.c | SGEMMONCOPY = ../generic/gemm_ncopy_8.c | ||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_8.c | SGEMMOTCOPY = ../generic/gemm_tcopy_8.c | ||||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMINCOPYOBJ = | |||||
| SGEMMITCOPYOBJ = | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| DGEMMKERNEL = gemm_kernel_2x8_nehalem.S | |||||
| DGEMMINCOPY = dgemm_ncopy_2.S | |||||
| DGEMMITCOPY = dgemm_tcopy_2.S | |||||
| DGEMMONCOPY = dgemm_ncopy_8.S | |||||
| DGEMMOTCOPY = dgemm_tcopy_8.S | |||||
| DGEMMKERNEL = dgemm_kernel_4x8_sandy.S | |||||
| DGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||||
| DGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||||
| #DGEMMONCOPY = gemm_ncopy_4.S | |||||
| DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
| #DGEMMOTCOPY = gemm_tcopy_4.S | |||||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | ||||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S | |||||
| CGEMMINCOPY = zgemm_ncopy_2.S | |||||
| CGEMMITCOPY = zgemm_tcopy_2.S | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||||
| #CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S | |||||
| CGEMMKERNEL = cgemm_kernel_4x8_sandy.S | |||||
| CGEMMINCOPY = ../generic/zgemm_ncopy_8_sandy.c | |||||
| CGEMMITCOPY = ../generic/zgemm_tcopy_8_sandy.c | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_4_sandy.c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_4_sandy.c | |||||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | ||||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S | |||||
| ZGEMMINCOPY = zgemm_ncopy_1.S | |||||
| ZGEMMITCOPY = zgemm_tcopy_1.S | |||||
| #ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S | |||||
| ZGEMMKERNEL = zgemm_kernel_4x4_sandy.S | |||||
| ZGEMMINCOPY = | |||||
| ZGEMMITCOPY = | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c | ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c | ||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | ||||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMINCOPYOBJ = | |||||
| ZGEMMITCOPYOBJ = | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S | |||||
| STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S | |||||
| STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S | |||||
| STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S | |||||
| DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S | |||||
| DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S | |||||
| DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S | |||||
| DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S | |||||
| CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S | |||||
| CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S | |||||
| CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S | |||||
| CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S | |||||
| ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S | |||||
| ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S | |||||
| ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S | |||||
| ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S | |||||
| #STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S | |||||
| #STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S | |||||
| #STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S | |||||
| #STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S | |||||
| #DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S | |||||
| #DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S | |||||
| #DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S | |||||
| #DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S | |||||
| #CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S | |||||
| #CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S | |||||
| #CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S | |||||
| #CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S | |||||
| #ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S | |||||
| #ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S | |||||
| #ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S | |||||
| #ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S | |||||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S | CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S | ||||
| ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S | ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S | ||||
| @@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) | |||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||||
| #define SNUMOPT 8 | #define SNUMOPT 8 | ||||
| #define DNUMOPT 4 | #define DNUMOPT 4 | ||||
| @@ -208,68 +208,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #define SGEMM_DEFAULT_R sgemm_r | |||||
| #define QGEMM_DEFAULT_R qgemm_r | |||||
| #define DGEMM_DEFAULT_R dgemm_r | |||||
| #define CGEMM_DEFAULT_R cgemm_r | |||||
| #define ZGEMM_DEFAULT_R zgemm_r | |||||
| #define XGEMM_DEFAULT_R xgemm_r | |||||
| #define SYMV_P 16 | |||||
| #define HAVE_EXCLUSIVE_CACHE | |||||
| #define GEMM_THREAD gemm_thread_mn | |||||
| #endif | |||||
| #if defined(BOBCAT) | |||||
| #define SNUMOPT 8 | |||||
| #define DNUMOPT 4 | |||||
| #define GEMM_DEFAULT_OFFSET_A 64 | |||||
| #define GEMM_DEFAULT_OFFSET_B 832 | |||||
| #define GEMM_DEFAULT_ALIGN 0x0fffUL | |||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define QGEMM_DEFAULT_UNROLL_N 2 | |||||
| #define CGEMM_DEFAULT_UNROLL_N 2 | |||||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | |||||
| #define XGEMM_DEFAULT_UNROLL_N 1 | |||||
| #ifdef ARCH_X86 | |||||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define DGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define QGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define CGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define ZGEMM_DEFAULT_UNROLL_M 1 | |||||
| #define XGEMM_DEFAULT_UNROLL_M 1 | |||||
| #else | |||||
| #define SGEMM_DEFAULT_UNROLL_M 8 | |||||
| #define DGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define QGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define CGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define ZGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define XGEMM_DEFAULT_UNROLL_M 1 | |||||
| #endif | |||||
| #define SGEMM_DEFAULT_P 448 | |||||
| #define DGEMM_DEFAULT_P 224 | |||||
| #define QGEMM_DEFAULT_P 112 | |||||
| #define CGEMM_DEFAULT_P 224 | |||||
| #define ZGEMM_DEFAULT_P 112 | |||||
| #define XGEMM_DEFAULT_P 56 | |||||
| #define SGEMM_DEFAULT_Q 224 | |||||
| #define DGEMM_DEFAULT_Q 224 | |||||
| #define QGEMM_DEFAULT_Q 224 | |||||
| #define CGEMM_DEFAULT_Q 224 | |||||
| #define ZGEMM_DEFAULT_Q 224 | |||||
| #define XGEMM_DEFAULT_Q 224 | |||||
| #define SGEMM_DEFAULT_R sgemm_r | #define SGEMM_DEFAULT_R sgemm_r | ||||
| #define QGEMM_DEFAULT_R qgemm_r | #define QGEMM_DEFAULT_R qgemm_r | ||||
| #define DGEMM_DEFAULT_R dgemm_r | #define DGEMM_DEFAULT_R dgemm_r | ||||
| @@ -980,7 +918,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define SNUMOPT 8 | #define SNUMOPT 8 | ||||
| #define DNUMOPT 4 | #define DNUMOPT 4 | ||||
| #define GEMM_DEFAULT_OFFSET_A 32 | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | #define GEMM_DEFAULT_ALIGN 0x03fffUL | ||||
| @@ -990,57 +928,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #ifdef ARCH_X86 | #ifdef ARCH_X86 | ||||
| #define SGEMM_DEFAULT_UNROLL_M 4 | #define SGEMM_DEFAULT_UNROLL_M 4 | ||||
| #define DGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define DGEMM_DEFAULT_UNROLL_M 8 | |||||
| #define QGEMM_DEFAULT_UNROLL_M 2 | #define QGEMM_DEFAULT_UNROLL_M 2 | ||||
| #define CGEMM_DEFAULT_UNROLL_M 2 | #define CGEMM_DEFAULT_UNROLL_M 2 | ||||
| #define ZGEMM_DEFAULT_UNROLL_M 1 | #define ZGEMM_DEFAULT_UNROLL_M 1 | ||||
| #define XGEMM_DEFAULT_UNROLL_M 1 | #define XGEMM_DEFAULT_UNROLL_M 1 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | #define SGEMM_DEFAULT_UNROLL_N 4 | ||||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define DGEMM_DEFAULT_UNROLL_N 8 | |||||
| #define QGEMM_DEFAULT_UNROLL_N 2 | #define QGEMM_DEFAULT_UNROLL_N 2 | ||||
| #define CGEMM_DEFAULT_UNROLL_N 2 | #define CGEMM_DEFAULT_UNROLL_N 2 | ||||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | #define ZGEMM_DEFAULT_UNROLL_N 2 | ||||
| #define XGEMM_DEFAULT_UNROLL_N 1 | #define XGEMM_DEFAULT_UNROLL_N 1 | ||||
| #else | #else | ||||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define DGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define SGEMM_DEFAULT_UNROLL_M 8 | |||||
| #define DGEMM_DEFAULT_UNROLL_M 8 | |||||
| #define QGEMM_DEFAULT_UNROLL_M 2 | #define QGEMM_DEFAULT_UNROLL_M 2 | ||||
| #define CGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define ZGEMM_DEFAULT_UNROLL_M 1 | |||||
| #define CGEMM_DEFAULT_UNROLL_M 8 | |||||
| #define ZGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define XGEMM_DEFAULT_UNROLL_M 1 | #define XGEMM_DEFAULT_UNROLL_M 1 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 8 | #define SGEMM_DEFAULT_UNROLL_N 8 | ||||
| #define DGEMM_DEFAULT_UNROLL_N 8 | |||||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define QGEMM_DEFAULT_UNROLL_N 2 | #define QGEMM_DEFAULT_UNROLL_N 2 | ||||
| #define CGEMM_DEFAULT_UNROLL_N 4 | #define CGEMM_DEFAULT_UNROLL_N 4 | ||||
| #define ZGEMM_DEFAULT_UNROLL_N 4 | #define ZGEMM_DEFAULT_UNROLL_N 4 | ||||
| #define XGEMM_DEFAULT_UNROLL_N 1 | #define XGEMM_DEFAULT_UNROLL_N 1 | ||||
| #endif | #endif | ||||
| #define SGEMM_DEFAULT_P 504 | |||||
| #define SGEMM_DEFAULT_P 512 | |||||
| #define SGEMM_DEFAULT_R sgemm_r | #define SGEMM_DEFAULT_R sgemm_r | ||||
| //#define SGEMM_DEFAULT_R 1024 | |||||
| #define DGEMM_DEFAULT_P 504 | |||||
| #define DGEMM_DEFAULT_P 512 | |||||
| #define DGEMM_DEFAULT_R dgemm_r | #define DGEMM_DEFAULT_R dgemm_r | ||||
| //#define DGEMM_DEFAULT_R 1024 | |||||
| #define QGEMM_DEFAULT_P 504 | #define QGEMM_DEFAULT_P 504 | ||||
| #define QGEMM_DEFAULT_R qgemm_r | #define QGEMM_DEFAULT_R qgemm_r | ||||
| #define CGEMM_DEFAULT_P 252 | |||||
| #define CGEMM_DEFAULT_R cgemm_r | |||||
| #define CGEMM_DEFAULT_P 128 | |||||
| //#define CGEMM_DEFAULT_R cgemm_r | |||||
| #define CGEMM_DEFAULT_R 1024 | |||||
| #define ZGEMM_DEFAULT_P 252 | |||||
| #define ZGEMM_DEFAULT_P 512 | |||||
| #define ZGEMM_DEFAULT_R zgemm_r | #define ZGEMM_DEFAULT_R zgemm_r | ||||
| //#define ZGEMM_DEFAULT_R 1024 | |||||
| #define XGEMM_DEFAULT_P 252 | #define XGEMM_DEFAULT_P 252 | ||||
| #define XGEMM_DEFAULT_R xgemm_r | #define XGEMM_DEFAULT_R xgemm_r | ||||
| #define SGEMM_DEFAULT_Q 512 | |||||
| #define SGEMM_DEFAULT_Q 256 | |||||
| #define DGEMM_DEFAULT_Q 256 | #define DGEMM_DEFAULT_Q 256 | ||||
| #define QGEMM_DEFAULT_Q 128 | #define QGEMM_DEFAULT_Q 128 | ||||
| #define CGEMM_DEFAULT_Q 512 | |||||
| #define ZGEMM_DEFAULT_Q 256 | |||||
| #define CGEMM_DEFAULT_Q 256 | |||||
| #define ZGEMM_DEFAULT_Q 192 | |||||
| #define XGEMM_DEFAULT_Q 128 | #define XGEMM_DEFAULT_Q 128 | ||||
| #define GETRF_FACTOR 0.72 | #define GETRF_FACTOR 0.72 | ||||