| @@ -0,0 +1,235 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the ISCAS nor the names of its contributors may | |||
| be used to endorse or promote products derived from this software | |||
| without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) | |||
| { | |||
| BLASLONG i,j; | |||
| BLASLONG idx=0; | |||
| BLASLONG ii; | |||
| FLOAT *src0,*src1,*src2,*src3,*dest0; | |||
| for (j=0; j<col/4; j+=1) | |||
| { | |||
| src0 = src; | |||
| src1 = src0+2*srcdim; | |||
| src2 = src1+2*srcdim; | |||
| src3 = src2+2*srcdim; | |||
| src = src3+2*srcdim; | |||
| dest0 = dest; | |||
| ii = (row<<3); | |||
| dest = dest+ii; | |||
| for (i=0; i<row/4; i+=1) | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src1[0]; | |||
| dest0[3] = src1[1]; | |||
| dest0[4] = src2[0]; | |||
| dest0[5] = src2[1]; | |||
| dest0[6] = src3[0]; | |||
| dest0[7] = src3[1]; | |||
| dest0[8] = src0[2]; | |||
| dest0[9] = src0[3]; | |||
| dest0[10] = src1[2]; | |||
| dest0[11] = src1[3]; | |||
| dest0[12] = src2[2]; | |||
| dest0[13] = src2[3]; | |||
| dest0[14] = src3[2]; | |||
| dest0[15] = src3[3]; | |||
| dest0[16] = src0[4]; | |||
| dest0[17] = src0[5]; | |||
| dest0[18] = src1[4]; | |||
| dest0[19] = src1[5]; | |||
| dest0[20] = src2[4]; | |||
| dest0[21] = src2[5]; | |||
| dest0[22] = src3[4]; | |||
| dest0[23] = src3[5]; | |||
| dest0[24] = src0[6]; | |||
| dest0[25] = src0[7]; | |||
| dest0[26] = src1[6]; | |||
| dest0[27] = src1[7]; | |||
| dest0[28] = src2[6]; | |||
| dest0[29] = src2[7]; | |||
| dest0[30] = src3[6]; | |||
| dest0[31] = src3[7]; | |||
| src0 = src0+8; | |||
| src1 = src1+8; | |||
| src2 = src2+8; | |||
| src3 = src3+8; | |||
| ii = (4<<3); | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (row&2) | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src1[0]; | |||
| dest0[3] = src1[1]; | |||
| dest0[4] = src2[0]; | |||
| dest0[5] = src2[1]; | |||
| dest0[6] = src3[0]; | |||
| dest0[7] = src3[1]; | |||
| dest0[8] = src0[2]; | |||
| dest0[9] = src0[3]; | |||
| dest0[10] = src1[2]; | |||
| dest0[11] = src1[3]; | |||
| dest0[12] = src2[2]; | |||
| dest0[13] = src2[3]; | |||
| dest0[14] = src3[2]; | |||
| dest0[15] = src3[3]; | |||
| src0 = src0+4; | |||
| src1 = src1+4; | |||
| src2 = src2+4; | |||
| src3 = src3+4; | |||
| ii = (2<<3); | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (row&1) | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src1[0]; | |||
| dest0[3] = src1[1]; | |||
| dest0[4] = src2[0]; | |||
| dest0[5] = src2[1]; | |||
| dest0[6] = src3[0]; | |||
| dest0[7] = src3[1]; | |||
| src0 = src0+2; | |||
| src1 = src1+2; | |||
| src2 = src2+2; | |||
| src3 = src3+2; | |||
| ii = (1<<3); | |||
| dest0 = dest0+ii; | |||
| } | |||
| } | |||
| if (col&2) | |||
| { | |||
| src0 = src; | |||
| src1 = src0+2*srcdim; | |||
| src = src1+2*srcdim; | |||
| dest0 = dest; | |||
| ii = (row<<2); | |||
| dest = dest+ii; | |||
| for (i=0; i<row/4; i+=1) | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src1[0]; | |||
| dest0[3] = src1[1]; | |||
| dest0[4] = src0[2]; | |||
| dest0[5] = src0[3]; | |||
| dest0[6] = src1[2]; | |||
| dest0[7] = src1[3]; | |||
| dest0[8] = src0[4]; | |||
| dest0[9] = src0[5]; | |||
| dest0[10] = src1[4]; | |||
| dest0[11] = src1[5]; | |||
| dest0[12] = src0[6]; | |||
| dest0[13] = src0[7]; | |||
| dest0[14] = src1[6]; | |||
| dest0[15] = src1[7]; | |||
| src0 = src0+8; | |||
| src1 = src1+8; | |||
| ii = (4<<2); | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (row&2) | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src1[0]; | |||
| dest0[3] = src1[1]; | |||
| dest0[4] = src0[2]; | |||
| dest0[5] = src0[3]; | |||
| dest0[6] = src1[2]; | |||
| dest0[7] = src1[3]; | |||
| src0 = src0+4; | |||
| src1 = src1+4; | |||
| ii = (2<<2); | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (row&1) | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src1[0]; | |||
| dest0[3] = src1[1]; | |||
| src0 = src0+2; | |||
| src1 = src1+2; | |||
| ii = (1<<2); | |||
| dest0 = dest0+ii; | |||
| } | |||
| } | |||
| if (col&1) | |||
| { | |||
| src0 = src; | |||
| src = src0+2*srcdim; | |||
| dest0 = dest; | |||
| ii = (row<<1); | |||
| dest = dest+ii; | |||
| for (i=0; i<row/4; i+=1) | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src0[2]; | |||
| dest0[3] = src0[3]; | |||
| dest0[4] = src0[4]; | |||
| dest0[5] = src0[5]; | |||
| dest0[6] = src0[6]; | |||
| dest0[7] = src0[7]; | |||
| src0 = src0+8; | |||
| ii = (4<<1); | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (row&2) | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src0[2]; | |||
| dest0[3] = src0[3]; | |||
| src0 = src0+4; | |||
| ii = (2<<1); | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (row&1) | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| src0 = src0+2; | |||
| ii = (1<<1); | |||
| dest0 = dest0+ii; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,401 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the ISCAS nor the names of its contributors may | |||
| be used to endorse or promote products derived from this software | |||
| without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) | |||
| { | |||
| BLASLONG i,j; | |||
| BLASLONG idx=0; | |||
| BLASLONG ii; | |||
| FLOAT *src0,*src1,*src2,*src3,*src4,*src5,*src6,*src7,*dest0; | |||
| for (j=0; j<col/8; j+=1) | |||
| { | |||
| src0 = src; | |||
| src1 = src0+2*srcdim; | |||
| src2 = src1+2*srcdim; | |||
| src3 = src2+2*srcdim; | |||
| src4 = src3+2*srcdim; | |||
| src5 = src4+2*srcdim; | |||
| src6 = src5+2*srcdim; | |||
| src7 = src6+2*srcdim; | |||
| src = src7+2*srcdim; | |||
| dest0 = dest; | |||
| ii = (row<<4); | |||
| dest = dest+ii; | |||
| for (i=0; i<row/4; i+=1) | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src1[0]; | |||
| dest0[3] = src1[1]; | |||
| dest0[4] = src2[0]; | |||
| dest0[5] = src2[1]; | |||
| dest0[6] = src3[0]; | |||
| dest0[7] = src3[1]; | |||
| dest0[8] = src4[0]; | |||
| dest0[9] = src4[1]; | |||
| dest0[10] = src5[0]; | |||
| dest0[11] = src5[1]; | |||
| dest0[12] = src6[0]; | |||
| dest0[13] = src6[1]; | |||
| dest0[14] = src7[0]; | |||
| dest0[15] = src7[1]; | |||
| dest0[16] = src0[2]; | |||
| dest0[17] = src0[3]; | |||
| dest0[18] = src1[2]; | |||
| dest0[19] = src1[3]; | |||
| dest0[20] = src2[2]; | |||
| dest0[21] = src2[3]; | |||
| dest0[22] = src3[2]; | |||
| dest0[23] = src3[3]; | |||
| dest0[24] = src4[2]; | |||
| dest0[25] = src4[3]; | |||
| dest0[26] = src5[2]; | |||
| dest0[27] = src5[3]; | |||
| dest0[28] = src6[2]; | |||
| dest0[29] = src6[3]; | |||
| dest0[30] = src7[2]; | |||
| dest0[31] = src7[3]; | |||
| dest0[32] = src0[4]; | |||
| dest0[33] = src0[5]; | |||
| dest0[34] = src1[4]; | |||
| dest0[35] = src1[5]; | |||
| dest0[36] = src2[4]; | |||
| dest0[37] = src2[5]; | |||
| dest0[38] = src3[4]; | |||
| dest0[39] = src3[5]; | |||
| dest0[40] = src4[4]; | |||
| dest0[41] = src4[5]; | |||
| dest0[42] = src5[4]; | |||
| dest0[43] = src5[5]; | |||
| dest0[44] = src6[4]; | |||
| dest0[45] = src6[5]; | |||
| dest0[46] = src7[4]; | |||
| dest0[47] = src7[5]; | |||
| dest0[48] = src0[6]; | |||
| dest0[49] = src0[7]; | |||
| dest0[50] = src1[6]; | |||
| dest0[51] = src1[7]; | |||
| dest0[52] = src2[6]; | |||
| dest0[53] = src2[7]; | |||
| dest0[54] = src3[6]; | |||
| dest0[55] = src3[7]; | |||
| dest0[56] = src4[6]; | |||
| dest0[57] = src4[7]; | |||
| dest0[58] = src5[6]; | |||
| dest0[59] = src5[7]; | |||
| dest0[60] = src6[6]; | |||
| dest0[61] = src6[7]; | |||
| dest0[62] = src7[6]; | |||
| dest0[63] = src7[7]; | |||
| src0 = src0+8; | |||
| src1 = src1+8; | |||
| src2 = src2+8; | |||
| src3 = src3+8; | |||
| src4 = src4+8; | |||
| src5 = src5+8; | |||
| src6 = src6+8; | |||
| src7 = src7+8; | |||
| ii = (4<<4); | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (row&2) | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src1[0]; | |||
| dest0[3] = src1[1]; | |||
| dest0[4] = src2[0]; | |||
| dest0[5] = src2[1]; | |||
| dest0[6] = src3[0]; | |||
| dest0[7] = src3[1]; | |||
| dest0[8] = src4[0]; | |||
| dest0[9] = src4[1]; | |||
| dest0[10] = src5[0]; | |||
| dest0[11] = src5[1]; | |||
| dest0[12] = src6[0]; | |||
| dest0[13] = src6[1]; | |||
| dest0[14] = src7[0]; | |||
| dest0[15] = src7[1]; | |||
| dest0[16] = src0[2]; | |||
| dest0[17] = src0[3]; | |||
| dest0[18] = src1[2]; | |||
| dest0[19] = src1[3]; | |||
| dest0[20] = src2[2]; | |||
| dest0[21] = src2[3]; | |||
| dest0[22] = src3[2]; | |||
| dest0[23] = src3[3]; | |||
| dest0[24] = src4[2]; | |||
| dest0[25] = src4[3]; | |||
| dest0[26] = src5[2]; | |||
| dest0[27] = src5[3]; | |||
| dest0[28] = src6[2]; | |||
| dest0[29] = src6[3]; | |||
| dest0[30] = src7[2]; | |||
| dest0[31] = src7[3]; | |||
| src0 = src0+4; | |||
| src1 = src1+4; | |||
| src2 = src2+4; | |||
| src3 = src3+4; | |||
| src4 = src4+4; | |||
| src5 = src5+4; | |||
| src6 = src6+4; | |||
| src7 = src7+4; | |||
| ii = (2<<4); | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (row&1) | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src1[0]; | |||
| dest0[3] = src1[1]; | |||
| dest0[4] = src2[0]; | |||
| dest0[5] = src2[1]; | |||
| dest0[6] = src3[0]; | |||
| dest0[7] = src3[1]; | |||
| dest0[8] = src4[0]; | |||
| dest0[9] = src4[1]; | |||
| dest0[10] = src5[0]; | |||
| dest0[11] = src5[1]; | |||
| dest0[12] = src6[0]; | |||
| dest0[13] = src6[1]; | |||
| dest0[14] = src7[0]; | |||
| dest0[15] = src7[1]; | |||
| src0 = src0+2; | |||
| src1 = src1+2; | |||
| src2 = src2+2; | |||
| src3 = src3+2; | |||
| src4 = src4+2; | |||
| src5 = src5+2; | |||
| src6 = src6+2; | |||
| src7 = src7+2; | |||
| ii = (1<<4); | |||
| dest0 = dest0+ii; | |||
| } | |||
| } | |||
| if (col&4) | |||
| { | |||
| src0 = src; | |||
| src1 = src0+2*srcdim; | |||
| src2 = src1+2*srcdim; | |||
| src3 = src2+2*srcdim; | |||
| src = src3+2*srcdim; | |||
| dest0 = dest; | |||
| ii = (row<<3); | |||
| dest = dest+ii; | |||
| for (i=0; i<row/4; i+=1) | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src1[0]; | |||
| dest0[3] = src1[1]; | |||
| dest0[4] = src2[0]; | |||
| dest0[5] = src2[1]; | |||
| dest0[6] = src3[0]; | |||
| dest0[7] = src3[1]; | |||
| dest0[8] = src0[2]; | |||
| dest0[9] = src0[3]; | |||
| dest0[10] = src1[2]; | |||
| dest0[11] = src1[3]; | |||
| dest0[12] = src2[2]; | |||
| dest0[13] = src2[3]; | |||
| dest0[14] = src3[2]; | |||
| dest0[15] = src3[3]; | |||
| dest0[16] = src0[4]; | |||
| dest0[17] = src0[5]; | |||
| dest0[18] = src1[4]; | |||
| dest0[19] = src1[5]; | |||
| dest0[20] = src2[4]; | |||
| dest0[21] = src2[5]; | |||
| dest0[22] = src3[4]; | |||
| dest0[23] = src3[5]; | |||
| dest0[24] = src0[6]; | |||
| dest0[25] = src0[7]; | |||
| dest0[26] = src1[6]; | |||
| dest0[27] = src1[7]; | |||
| dest0[28] = src2[6]; | |||
| dest0[29] = src2[7]; | |||
| dest0[30] = src3[6]; | |||
| dest0[31] = src3[7]; | |||
| src0 = src0+8; | |||
| src1 = src1+8; | |||
| src2 = src2+8; | |||
| src3 = src3+8; | |||
| ii = (4<<3); | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (row&2) | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src1[0]; | |||
| dest0[3] = src1[1]; | |||
| dest0[4] = src2[0]; | |||
| dest0[5] = src2[1]; | |||
| dest0[6] = src3[0]; | |||
| dest0[7] = src3[1]; | |||
| dest0[8] = src0[2]; | |||
| dest0[9] = src0[3]; | |||
| dest0[10] = src1[2]; | |||
| dest0[11] = src1[3]; | |||
| dest0[12] = src2[2]; | |||
| dest0[13] = src2[3]; | |||
| dest0[14] = src3[2]; | |||
| dest0[15] = src3[3]; | |||
| src0 = src0+4; | |||
| src1 = src1+4; | |||
| src2 = src2+4; | |||
| src3 = src3+4; | |||
| ii = (2<<3); | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (row&1) | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src1[0]; | |||
| dest0[3] = src1[1]; | |||
| dest0[4] = src2[0]; | |||
| dest0[5] = src2[1]; | |||
| dest0[6] = src3[0]; | |||
| dest0[7] = src3[1]; | |||
| src0 = src0+2; | |||
| src1 = src1+2; | |||
| src2 = src2+2; | |||
| src3 = src3+2; | |||
| ii = (1<<3); | |||
| dest0 = dest0+ii; | |||
| } | |||
| } | |||
| if (col&2) | |||
| { | |||
| src0 = src; | |||
| src1 = src0+2*srcdim; | |||
| src = src1+2*srcdim; | |||
| dest0 = dest; | |||
| ii = (row<<2); | |||
| dest = dest+ii; | |||
| for (i=0; i<row/4; i+=1) | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src1[0]; | |||
| dest0[3] = src1[1]; | |||
| dest0[4] = src0[2]; | |||
| dest0[5] = src0[3]; | |||
| dest0[6] = src1[2]; | |||
| dest0[7] = src1[3]; | |||
| dest0[8] = src0[4]; | |||
| dest0[9] = src0[5]; | |||
| dest0[10] = src1[4]; | |||
| dest0[11] = src1[5]; | |||
| dest0[12] = src0[6]; | |||
| dest0[13] = src0[7]; | |||
| dest0[14] = src1[6]; | |||
| dest0[15] = src1[7]; | |||
| src0 = src0+8; | |||
| src1 = src1+8; | |||
| ii = (4<<2); | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (row&2) | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src1[0]; | |||
| dest0[3] = src1[1]; | |||
| dest0[4] = src0[2]; | |||
| dest0[5] = src0[3]; | |||
| dest0[6] = src1[2]; | |||
| dest0[7] = src1[3]; | |||
| src0 = src0+4; | |||
| src1 = src1+4; | |||
| ii = (2<<2); | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (row&1) | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src1[0]; | |||
| dest0[3] = src1[1]; | |||
| src0 = src0+2; | |||
| src1 = src1+2; | |||
| ii = (1<<2); | |||
| dest0 = dest0+ii; | |||
| } | |||
| } | |||
| if (col&1) | |||
| { | |||
| src0 = src; | |||
| src = src0+2*srcdim; | |||
| dest0 = dest; | |||
| ii = (row<<1); | |||
| dest = dest+ii; | |||
| for (i=0; i<row/4; i+=1) | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src0[2]; | |||
| dest0[3] = src0[3]; | |||
| dest0[4] = src0[4]; | |||
| dest0[5] = src0[5]; | |||
| dest0[6] = src0[6]; | |||
| dest0[7] = src0[7]; | |||
| src0 = src0+8; | |||
| ii = (4<<1); | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (row&2) | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src0[2]; | |||
| dest0[3] = src0[3]; | |||
| src0 = src0+4; | |||
| ii = (2<<1); | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (row&1) | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| src0 = src0+2; | |||
| ii = (1<<1); | |||
| dest0 = dest0+ii; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,237 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the ISCAS nor the names of its contributors may | |||
| be used to endorse or promote products derived from this software | |||
| without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) | |||
| { | |||
| BLASLONG i,j; | |||
| BLASLONG idx=0; | |||
| BLASLONG ii; | |||
| FLOAT *src0,*src1,*src2,*src3,*dest0; | |||
| FLOAT *dest1,*dest2; | |||
| ii = col&-4; | |||
| ii = ii*(2*row); | |||
| dest2 = dest+ii; | |||
| ii = col&-2; | |||
| ii = ii*(2*row); | |||
| dest1 = dest+ii; | |||
| for (j=0; j<row/4; j+=1) | |||
| { | |||
| src0 = src; | |||
| src1 = src0+2*srcdim; | |||
| src2 = src1+2*srcdim; | |||
| src3 = src2+2*srcdim; | |||
| src = src3+2*srcdim; | |||
| dest0 = dest; | |||
| ii = (4<<3); | |||
| dest = dest+ii; | |||
| for (i=0; i<col/4; i+=1) | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src0[2]; | |||
| dest0[3] = src0[3]; | |||
| dest0[4] = src0[4]; | |||
| dest0[5] = src0[5]; | |||
| dest0[6] = src0[6]; | |||
| dest0[7] = src0[7]; | |||
| dest0[8] = src1[0]; | |||
| dest0[9] = src1[1]; | |||
| dest0[10] = src1[2]; | |||
| dest0[11] = src1[3]; | |||
| dest0[12] = src1[4]; | |||
| dest0[13] = src1[5]; | |||
| dest0[14] = src1[6]; | |||
| dest0[15] = src1[7]; | |||
| dest0[16] = src2[0]; | |||
| dest0[17] = src2[1]; | |||
| dest0[18] = src2[2]; | |||
| dest0[19] = src2[3]; | |||
| dest0[20] = src2[4]; | |||
| dest0[21] = src2[5]; | |||
| dest0[22] = src2[6]; | |||
| dest0[23] = src2[7]; | |||
| dest0[24] = src3[0]; | |||
| dest0[25] = src3[1]; | |||
| dest0[26] = src3[2]; | |||
| dest0[27] = src3[3]; | |||
| dest0[28] = src3[4]; | |||
| dest0[29] = src3[5]; | |||
| dest0[30] = src3[6]; | |||
| dest0[31] = src3[7]; | |||
| src0 = src0+8; | |||
| src1 = src1+8; | |||
| src2 = src2+8; | |||
| src3 = src3+8; | |||
| ii = (row<<3); | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (col&2) | |||
| { | |||
| dest2[0] = src0[0]; | |||
| dest2[1] = src0[1]; | |||
| dest2[2] = src0[2]; | |||
| dest2[3] = src0[3]; | |||
| dest2[4] = src1[0]; | |||
| dest2[5] = src1[1]; | |||
| dest2[6] = src1[2]; | |||
| dest2[7] = src1[3]; | |||
| dest2[8] = src2[0]; | |||
| dest2[9] = src2[1]; | |||
| dest2[10] = src2[2]; | |||
| dest2[11] = src2[3]; | |||
| dest2[12] = src3[0]; | |||
| dest2[13] = src3[1]; | |||
| dest2[14] = src3[2]; | |||
| dest2[15] = src3[3]; | |||
| src0 = src0+4; | |||
| src1 = src1+4; | |||
| src2 = src2+4; | |||
| src3 = src3+4; | |||
| dest2 = dest2+16; | |||
| } | |||
| if (col&1) | |||
| { | |||
| dest1[0] = src0[0]; | |||
| dest1[1] = src0[1]; | |||
| dest1[2] = src1[0]; | |||
| dest1[3] = src1[1]; | |||
| dest1[4] = src2[0]; | |||
| dest1[5] = src2[1]; | |||
| dest1[6] = src3[0]; | |||
| dest1[7] = src3[1]; | |||
| src0 = src0+2; | |||
| src1 = src1+2; | |||
| src2 = src2+2; | |||
| src3 = src3+2; | |||
| dest1 = dest1+8; | |||
| } | |||
| } | |||
| if (row&2) | |||
| { | |||
| src0 = src; | |||
| src1 = src0+2*srcdim; | |||
| src = src1+2*srcdim; | |||
| dest0 = dest; | |||
| ii = (2<<3); | |||
| dest = dest+ii; | |||
| for (i=0; i<col/4; i+=1) | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src0[2]; | |||
| dest0[3] = src0[3]; | |||
| dest0[4] = src0[4]; | |||
| dest0[5] = src0[5]; | |||
| dest0[6] = src0[6]; | |||
| dest0[7] = src0[7]; | |||
| dest0[8] = src1[0]; | |||
| dest0[9] = src1[1]; | |||
| dest0[10] = src1[2]; | |||
| dest0[11] = src1[3]; | |||
| dest0[12] = src1[4]; | |||
| dest0[13] = src1[5]; | |||
| dest0[14] = src1[6]; | |||
| dest0[15] = src1[7]; | |||
| src0 = src0+8; | |||
| src1 = src1+8; | |||
| ii = (row<<3); | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (col&2) | |||
| { | |||
| dest2[0] = src0[0]; | |||
| dest2[1] = src0[1]; | |||
| dest2[2] = src0[2]; | |||
| dest2[3] = src0[3]; | |||
| dest2[4] = src1[0]; | |||
| dest2[5] = src1[1]; | |||
| dest2[6] = src1[2]; | |||
| dest2[7] = src1[3]; | |||
| src0 = src0+4; | |||
| src1 = src1+4; | |||
| dest2 = dest2+8; | |||
| } | |||
| if (col&1) | |||
| { | |||
| dest1[0] = src0[0]; | |||
| dest1[1] = src0[1]; | |||
| dest1[2] = src1[0]; | |||
| dest1[3] = src1[1]; | |||
| src0 = src0+2; | |||
| src1 = src1+2; | |||
| dest1 = dest1+4; | |||
| } | |||
| } | |||
| if (row&1) | |||
| { | |||
| src0 = src; | |||
| src = src0+2*srcdim; | |||
| dest0 = dest; | |||
| ii = (1<<3); | |||
| dest = dest+ii; | |||
| for (i=0; i<col/4; i+=1) | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src0[2]; | |||
| dest0[3] = src0[3]; | |||
| dest0[4] = src0[4]; | |||
| dest0[5] = src0[5]; | |||
| dest0[6] = src0[6]; | |||
| dest0[7] = src0[7]; | |||
| src0 = src0+8; | |||
| ii = (row<<3); | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (col&2) | |||
| { | |||
| dest2[0] = src0[0]; | |||
| dest2[1] = src0[1]; | |||
| dest2[2] = src0[2]; | |||
| dest2[3] = src0[3]; | |||
| src0 = src0+4; | |||
| dest2 = dest2+4; | |||
| } | |||
| if (col&1) | |||
| { | |||
| dest1[0] = src0[0]; | |||
| dest1[1] = src0[1]; | |||
| src0 = src0+2; | |||
| dest1 = dest1+2; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,370 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the ISCAS nor the names of its contributors may | |||
| be used to endorse or promote products derived from this software | |||
| without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) | |||
| { | |||
| BLASLONG i,j; | |||
| BLASLONG idx=0; | |||
| BLASLONG ii; | |||
| FLOAT *src0,*src1,*src2,*src3,*dest0; | |||
| FLOAT *dest1,*dest2,*dest4; | |||
| ii = col&-8; | |||
| ii = ii*(2*row); | |||
| dest4 = dest+ii; | |||
| ii = col&-4; | |||
| ii = ii*(2*row); | |||
| dest2 = dest+ii; | |||
| ii = col&-2; | |||
| ii = ii*(2*row); | |||
| dest1 = dest+ii; | |||
| for (j=0; j<row/4; j+=1) | |||
| { | |||
| src0 = src; | |||
| src1 = src0+2*srcdim; | |||
| src2 = src1+2*srcdim; | |||
| src3 = src2+2*srcdim; | |||
| src = src3+2*srcdim; | |||
| dest0 = dest; | |||
| ii = (4<<4); | |||
| dest = dest+ii; | |||
| for (i=0; i<col/8; i+=1) | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src0[2]; | |||
| dest0[3] = src0[3]; | |||
| dest0[4] = src0[4]; | |||
| dest0[5] = src0[5]; | |||
| dest0[6] = src0[6]; | |||
| dest0[7] = src0[7]; | |||
| dest0[8] = src0[8]; | |||
| dest0[9] = src0[9]; | |||
| dest0[10] = src0[10]; | |||
| dest0[11] = src0[11]; | |||
| dest0[12] = src0[12]; | |||
| dest0[13] = src0[13]; | |||
| dest0[14] = src0[14]; | |||
| dest0[15] = src0[15]; | |||
| dest0[16] = src1[0]; | |||
| dest0[17] = src1[1]; | |||
| dest0[18] = src1[2]; | |||
| dest0[19] = src1[3]; | |||
| dest0[20] = src1[4]; | |||
| dest0[21] = src1[5]; | |||
| dest0[22] = src1[6]; | |||
| dest0[23] = src1[7]; | |||
| dest0[24] = src1[8]; | |||
| dest0[25] = src1[9]; | |||
| dest0[26] = src1[10]; | |||
| dest0[27] = src1[11]; | |||
| dest0[28] = src1[12]; | |||
| dest0[29] = src1[13]; | |||
| dest0[30] = src1[14]; | |||
| dest0[31] = src1[15]; | |||
| dest0[32] = src2[0]; | |||
| dest0[33] = src2[1]; | |||
| dest0[34] = src2[2]; | |||
| dest0[35] = src2[3]; | |||
| dest0[36] = src2[4]; | |||
| dest0[37] = src2[5]; | |||
| dest0[38] = src2[6]; | |||
| dest0[39] = src2[7]; | |||
| dest0[40] = src2[8]; | |||
| dest0[41] = src2[9]; | |||
| dest0[42] = src2[10]; | |||
| dest0[43] = src2[11]; | |||
| dest0[44] = src2[12]; | |||
| dest0[45] = src2[13]; | |||
| dest0[46] = src2[14]; | |||
| dest0[47] = src2[15]; | |||
| dest0[48] = src3[0]; | |||
| dest0[49] = src3[1]; | |||
| dest0[50] = src3[2]; | |||
| dest0[51] = src3[3]; | |||
| dest0[52] = src3[4]; | |||
| dest0[53] = src3[5]; | |||
| dest0[54] = src3[6]; | |||
| dest0[55] = src3[7]; | |||
| dest0[56] = src3[8]; | |||
| dest0[57] = src3[9]; | |||
| dest0[58] = src3[10]; | |||
| dest0[59] = src3[11]; | |||
| dest0[60] = src3[12]; | |||
| dest0[61] = src3[13]; | |||
| dest0[62] = src3[14]; | |||
| dest0[63] = src3[15]; | |||
| src0 = src0+16; | |||
| src1 = src1+16; | |||
| src2 = src2+16; | |||
| src3 = src3+16; | |||
| ii = (row<<4); | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (col&4) | |||
| { | |||
| dest4[0] = src0[0]; | |||
| dest4[1] = src0[1]; | |||
| dest4[2] = src0[2]; | |||
| dest4[3] = src0[3]; | |||
| dest4[4] = src0[4]; | |||
| dest4[5] = src0[5]; | |||
| dest4[6] = src0[6]; | |||
| dest4[7] = src0[7]; | |||
| dest4[8] = src1[0]; | |||
| dest4[9] = src1[1]; | |||
| dest4[10] = src1[2]; | |||
| dest4[11] = src1[3]; | |||
| dest4[12] = src1[4]; | |||
| dest4[13] = src1[5]; | |||
| dest4[14] = src1[6]; | |||
| dest4[15] = src1[7]; | |||
| dest4[16] = src2[0]; | |||
| dest4[17] = src2[1]; | |||
| dest4[18] = src2[2]; | |||
| dest4[19] = src2[3]; | |||
| dest4[20] = src2[4]; | |||
| dest4[21] = src2[5]; | |||
| dest4[22] = src2[6]; | |||
| dest4[23] = src2[7]; | |||
| dest4[24] = src3[0]; | |||
| dest4[25] = src3[1]; | |||
| dest4[26] = src3[2]; | |||
| dest4[27] = src3[3]; | |||
| dest4[28] = src3[4]; | |||
| dest4[29] = src3[5]; | |||
| dest4[30] = src3[6]; | |||
| dest4[31] = src3[7]; | |||
| src0 = src0+8; | |||
| src1 = src1+8; | |||
| src2 = src2+8; | |||
| src3 = src3+8; | |||
| dest4 = dest4+32; | |||
| } | |||
| if (col&2) | |||
| { | |||
| dest2[0] = src0[0]; | |||
| dest2[1] = src0[1]; | |||
| dest2[2] = src0[2]; | |||
| dest2[3] = src0[3]; | |||
| dest2[4] = src1[0]; | |||
| dest2[5] = src1[1]; | |||
| dest2[6] = src1[2]; | |||
| dest2[7] = src1[3]; | |||
| dest2[8] = src2[0]; | |||
| dest2[9] = src2[1]; | |||
| dest2[10] = src2[2]; | |||
| dest2[11] = src2[3]; | |||
| dest2[12] = src3[0]; | |||
| dest2[13] = src3[1]; | |||
| dest2[14] = src3[2]; | |||
| dest2[15] = src3[3]; | |||
| src0 = src0+4; | |||
| src1 = src1+4; | |||
| src2 = src2+4; | |||
| src3 = src3+4; | |||
| dest2 = dest2+16; | |||
| } | |||
| if (col&1) | |||
| { | |||
| dest1[0] = src0[0]; | |||
| dest1[1] = src0[1]; | |||
| dest1[2] = src1[0]; | |||
| dest1[3] = src1[1]; | |||
| dest1[4] = src2[0]; | |||
| dest1[5] = src2[1]; | |||
| dest1[6] = src3[0]; | |||
| dest1[7] = src3[1]; | |||
| src0 = src0+2; | |||
| src1 = src1+2; | |||
| src2 = src2+2; | |||
| src3 = src3+2; | |||
| dest1 = dest1+8; | |||
| } | |||
| } | |||
| if (row&2) | |||
| { | |||
| src0 = src; | |||
| src1 = src0+2*srcdim; | |||
| src = src1+2*srcdim; | |||
| dest0 = dest; | |||
| ii = (2<<4); | |||
| dest = dest+ii; | |||
| for (i=0; i<col/8; i+=1) | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src0[2]; | |||
| dest0[3] = src0[3]; | |||
| dest0[4] = src0[4]; | |||
| dest0[5] = src0[5]; | |||
| dest0[6] = src0[6]; | |||
| dest0[7] = src0[7]; | |||
| dest0[8] = src0[8]; | |||
| dest0[9] = src0[9]; | |||
| dest0[10] = src0[10]; | |||
| dest0[11] = src0[11]; | |||
| dest0[12] = src0[12]; | |||
| dest0[13] = src0[13]; | |||
| dest0[14] = src0[14]; | |||
| dest0[15] = src0[15]; | |||
| dest0[16] = src1[0]; | |||
| dest0[17] = src1[1]; | |||
| dest0[18] = src1[2]; | |||
| dest0[19] = src1[3]; | |||
| dest0[20] = src1[4]; | |||
| dest0[21] = src1[5]; | |||
| dest0[22] = src1[6]; | |||
| dest0[23] = src1[7]; | |||
| dest0[24] = src1[8]; | |||
| dest0[25] = src1[9]; | |||
| dest0[26] = src1[10]; | |||
| dest0[27] = src1[11]; | |||
| dest0[28] = src1[12]; | |||
| dest0[29] = src1[13]; | |||
| dest0[30] = src1[14]; | |||
| dest0[31] = src1[15]; | |||
| src0 = src0+16; | |||
| src1 = src1+16; | |||
| ii = (row<<4); | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (col&4) | |||
| { | |||
| dest4[0] = src0[0]; | |||
| dest4[1] = src0[1]; | |||
| dest4[2] = src0[2]; | |||
| dest4[3] = src0[3]; | |||
| dest4[4] = src0[4]; | |||
| dest4[5] = src0[5]; | |||
| dest4[6] = src0[6]; | |||
| dest4[7] = src0[7]; | |||
| dest4[8] = src1[0]; | |||
| dest4[9] = src1[1]; | |||
| dest4[10] = src1[2]; | |||
| dest4[11] = src1[3]; | |||
| dest4[12] = src1[4]; | |||
| dest4[13] = src1[5]; | |||
| dest4[14] = src1[6]; | |||
| dest4[15] = src1[7]; | |||
| src0 = src0+8; | |||
| src1 = src1+8; | |||
| dest4 = dest4+16; | |||
| } | |||
| if (col&2) | |||
| { | |||
| dest2[0] = src0[0]; | |||
| dest2[1] = src0[1]; | |||
| dest2[2] = src0[2]; | |||
| dest2[3] = src0[3]; | |||
| dest2[4] = src1[0]; | |||
| dest2[5] = src1[1]; | |||
| dest2[6] = src1[2]; | |||
| dest2[7] = src1[3]; | |||
| src0 = src0+4; | |||
| src1 = src1+4; | |||
| dest2 = dest2+8; | |||
| } | |||
| if (col&1) | |||
| { | |||
| dest1[0] = src0[0]; | |||
| dest1[1] = src0[1]; | |||
| dest1[2] = src1[0]; | |||
| dest1[3] = src1[1]; | |||
| src0 = src0+2; | |||
| src1 = src1+2; | |||
| dest1 = dest1+4; | |||
| } | |||
| } | |||
| if (row&1) | |||
| { | |||
| src0 = src; | |||
| src = src0+2*srcdim; | |||
| dest0 = dest; | |||
| ii = (1<<4); | |||
| dest = dest+ii; | |||
| for (i=0; i<col/8; i+=1) | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src0[2]; | |||
| dest0[3] = src0[3]; | |||
| dest0[4] = src0[4]; | |||
| dest0[5] = src0[5]; | |||
| dest0[6] = src0[6]; | |||
| dest0[7] = src0[7]; | |||
| dest0[8] = src0[8]; | |||
| dest0[9] = src0[9]; | |||
| dest0[10] = src0[10]; | |||
| dest0[11] = src0[11]; | |||
| dest0[12] = src0[12]; | |||
| dest0[13] = src0[13]; | |||
| dest0[14] = src0[14]; | |||
| dest0[15] = src0[15]; | |||
| src0 = src0+16; | |||
| ii = (row<<4); | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (col&4) | |||
| { | |||
| dest4[0] = src0[0]; | |||
| dest4[1] = src0[1]; | |||
| dest4[2] = src0[2]; | |||
| dest4[3] = src0[3]; | |||
| dest4[4] = src0[4]; | |||
| dest4[5] = src0[5]; | |||
| dest4[6] = src0[6]; | |||
| dest4[7] = src0[7]; | |||
| src0 = src0+8; | |||
| dest4 = dest4+8; | |||
| } | |||
| if (col&2) | |||
| { | |||
| dest2[0] = src0[0]; | |||
| dest2[1] = src0[1]; | |||
| dest2[2] = src0[2]; | |||
| dest2[3] = src0[3]; | |||
| src0 = src0+4; | |||
| dest2 = dest2+4; | |||
| } | |||
| if (col&1) | |||
| { | |||
| dest1[0] = src0[0]; | |||
| dest1[1] = src0[1]; | |||
| src0 = src0+2; | |||
| dest1 = dest1+2; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -1,59 +1,84 @@ | |||
# SGEMM kernel and copy-routine source selection.
# NOTE(review): this span is rendered diff output; removed and added lines are
# interleaved without +/- markers, so SGEMMKERNEL, SGEMMINCOPY, SGEMMITCOPY,
# SGEMMINCOPYOBJ and SGEMMITCOPYOBJ are each assigned twice. In a plain
# Makefile the later assignment to a variable replaces the earlier one.
# The second assignments leave the IN/IT copy sources and object names empty
# while SGEMMONCOPY/SGEMMOTCOPY keep the generic 8-wide C sources; presumably
# the 8x8 kernel reuses the ON/OT copies for both operands -- TODO confirm.
| SGEMMKERNEL = gemm_kernel_4x8_nehalem.S | |||
| SGEMMINCOPY = gemm_ncopy_4.S | |||
| SGEMMITCOPY = gemm_tcopy_4.S | |||
| SGEMMKERNEL = sgemm_kernel_8x8_sandy.S | |||
| SGEMMINCOPY = | |||
| SGEMMITCOPY = | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_8.c | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMINCOPYOBJ = | |||
| SGEMMITCOPYOBJ = | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# DGEMM kernel and copy-routine source selection.
# NOTE(review): interleaved diff lines -- DGEMMKERNEL and all four copy
# variables (IN/IT/ON/OT) are assigned twice: first to nehalem-era assembly
# sources, then to the sandy kernel with generic C copy routines
# (gemm_*copy_8.c for IN/IT, gemm_*copy_4.c for ON/OT).
# The lines beginning with '#' are commented-out assembly alternatives for
# DGEMMONCOPY/DGEMMOTCOPY. The four *OBJ names are assigned once.
| DGEMMKERNEL = gemm_kernel_2x8_nehalem.S | |||
| DGEMMINCOPY = dgemm_ncopy_2.S | |||
| DGEMMITCOPY = dgemm_tcopy_2.S | |||
| DGEMMONCOPY = dgemm_ncopy_8.S | |||
| DGEMMOTCOPY = dgemm_tcopy_8.S | |||
| DGEMMKERNEL = dgemm_kernel_4x8_sandy.S | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||
| DGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||
| #DGEMMONCOPY = gemm_ncopy_4.S | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| #DGEMMOTCOPY = gemm_tcopy_4.S | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# CGEMM kernel and copy-routine source selection.
# NOTE(review): interleaved diff lines -- the first five assignments name the
# nehalem kernel with zgemm_*copy_2.S / generic zgemm_*copy_4.c; they are
# followed by a commented-out copy of the old kernel line and a second set of
# assignments naming cgemm_kernel_4x8_sandy.S with the
# ../generic/zgemm_*copy_{8,4}_sandy.c sources. The *OBJ names are assigned once.
| CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S | |||
| CGEMMINCOPY = zgemm_ncopy_2.S | |||
| CGEMMITCOPY = zgemm_tcopy_2.S | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||
| #CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S | |||
| CGEMMKERNEL = cgemm_kernel_4x8_sandy.S | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_8_sandy.c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_8_sandy.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_4_sandy.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_4_sandy.c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# ZGEMM kernel and copy-routine source selection.
# NOTE(review): interleaved diff lines -- ZGEMMKERNEL, ZGEMMINCOPY,
# ZGEMMITCOPY, ZGEMMINCOPYOBJ and ZGEMMITCOPYOBJ are each assigned twice.
# The second assignments select zgemm_kernel_4x4_sandy.S and leave the IN/IT
# copy sources and object names empty; ZGEMMONCOPY/ZGEMMOTCOPY (generic
# zgemm_*copy_4.c) and their *OBJ names are assigned once.
| ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S | |||
| ZGEMMINCOPY = zgemm_ncopy_1.S | |||
| ZGEMMITCOPY = zgemm_tcopy_1.S | |||
| #ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S | |||
| ZGEMMKERNEL = zgemm_kernel_4x4_sandy.S | |||
| ZGEMMINCOPY = | |||
| ZGEMMITCOPY = | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMINCOPYOBJ = | |||
| ZGEMMITCOPYOBJ = | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# TRSM kernel source selection for the S/D/C/Z prefixes, four variants each
# (LN, LT, RN, RT).
# NOTE(review): interleaved diff lines -- the same sixteen variables appear
# three times below:
#   1. assigned to nehalem assembly sources,
#   2. the same sixteen lines again, commented out with '#',
#   3. assigned to the generic C sources ../generic/trsm_kernel_{LN,LT,RN,RT}.c,
#      which are shared by all four prefixes.
# In a plain Makefile the later (generic) assignments would take effect.
# In the nehalem set the *_RN variables, and ZTRSMKERNEL_LN, name LT source
# files; that mapping is reproduced here unchanged.
| STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S | |||
| STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S | |||
| STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S | |||
| STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S | |||
| DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S | |||
| DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S | |||
| DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S | |||
| DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S | |||
| CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S | |||
| CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S | |||
| CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S | |||
| CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S | |||
| ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S | |||
| ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S | |||
| ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S | |||
| ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S | |||
| #STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S | |||
| #STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S | |||
| #STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S | |||
| #STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S | |||
| #DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S | |||
| #DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S | |||
| #DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S | |||
| #DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S | |||
| #CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S | |||
| #CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S | |||
| #CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S | |||
| #CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S | |||
| #ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S | |||
| #ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S | |||
| #ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S | |||
| #ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
# GEMM3M kernels: each variable is assigned once in this span, and both still
# name nehalem assembly sources.
| CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S | |||
| ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S | |||
| @@ -1,5 +1,5 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||
| Copyright (c) 2011, Lab of Parallel Software and Computational Science,ISCAS | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| @@ -208,68 +208,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
/* Tail of a configuration section whose opening #if lies outside this hunk;
 * the two #endif lines below close conditionals that are not visible here.
 * Each *GEMM_DEFAULT_R macro expands to an identifier (sgemm_r, qgemm_r, ...)
 * rather than to a literal; those identifiers are not defined in this span.
 * HAVE_EXCLUSIVE_CACHE is defined with an empty replacement list. */
| #endif | |||
| #define SGEMM_DEFAULT_R sgemm_r | |||
| #define QGEMM_DEFAULT_R qgemm_r | |||
| #define DGEMM_DEFAULT_R dgemm_r | |||
| #define CGEMM_DEFAULT_R cgemm_r | |||
| #define ZGEMM_DEFAULT_R zgemm_r | |||
| #define XGEMM_DEFAULT_R xgemm_r | |||
| #define SYMV_P 16 | |||
| #define HAVE_EXCLUSIVE_CACHE | |||
| #define GEMM_THREAD gemm_thread_mn | |||
| #endif | |||
/* Tuning macros that take effect when BOBCAT is defined.
 * The N unroll factors are unconditional; the M unroll factors depend on
 * whether ARCH_X86 is defined (#ifdef branch) or not (#else branch).
 * The *GEMM_DEFAULT_R macros expand to identifiers, not literals.
 * The section is not closed inside this hunk: there is no #endif matching the
 * "#if defined(BOBCAT)" line before the next hunk header.
 * NOTE(review): the preceding hunk header (68 old lines -> 6 new lines)
 * suggests most of this span is removed text -- confirm before relying on it. */
| #if defined(BOBCAT) | |||
| #define SNUMOPT 8 | |||
| #define DNUMOPT 4 | |||
| #define GEMM_DEFAULT_OFFSET_A 64 | |||
| #define GEMM_DEFAULT_OFFSET_B 832 | |||
| #define GEMM_DEFAULT_ALIGN 0x0fffUL | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||
| #define QGEMM_DEFAULT_UNROLL_N 2 | |||
| #define CGEMM_DEFAULT_UNROLL_N 2 | |||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | |||
| #define XGEMM_DEFAULT_UNROLL_N 1 | |||
| #ifdef ARCH_X86 | |||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||
| #define DGEMM_DEFAULT_UNROLL_M 2 | |||
| #define QGEMM_DEFAULT_UNROLL_M 2 | |||
| #define CGEMM_DEFAULT_UNROLL_M 2 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 1 | |||
| #define XGEMM_DEFAULT_UNROLL_M 1 | |||
| #else | |||
| #define SGEMM_DEFAULT_UNROLL_M 8 | |||
| #define DGEMM_DEFAULT_UNROLL_M 4 | |||
| #define QGEMM_DEFAULT_UNROLL_M 2 | |||
| #define CGEMM_DEFAULT_UNROLL_M 4 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 2 | |||
| #define XGEMM_DEFAULT_UNROLL_M 1 | |||
| #endif | |||
| #define SGEMM_DEFAULT_P 448 | |||
| #define DGEMM_DEFAULT_P 224 | |||
| #define QGEMM_DEFAULT_P 112 | |||
| #define CGEMM_DEFAULT_P 224 | |||
| #define ZGEMM_DEFAULT_P 112 | |||
| #define XGEMM_DEFAULT_P 56 | |||
| #define SGEMM_DEFAULT_Q 224 | |||
| #define DGEMM_DEFAULT_Q 224 | |||
| #define QGEMM_DEFAULT_Q 224 | |||
| #define CGEMM_DEFAULT_Q 224 | |||
| #define ZGEMM_DEFAULT_Q 224 | |||
| #define XGEMM_DEFAULT_Q 224 | |||
| #define SGEMM_DEFAULT_R sgemm_r | |||
| #define QGEMM_DEFAULT_R qgemm_r | |||
| #define DGEMM_DEFAULT_R dgemm_r | |||
| @@ -980,7 +918,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
/* Offset/alignment macros from a configuration section whose enclosing #if is
 * outside this hunk.
 * NOTE(review): interleaved diff lines -- GEMM_DEFAULT_OFFSET_A is defined
 * twice (32, then 0). If both lines reached a C preprocessor, the second
 * would be an incompatible redefinition; only one is meant to exist. */
| #define SNUMOPT 8 | |||
| #define DNUMOPT 4 | |||
| #define GEMM_DEFAULT_OFFSET_A 32 | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| @@ -990,57 +928,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
/* GEMM unroll factors (M and N) per prefix, chosen by whether ARCH_X86 is
 * defined. The enclosing CPU-selection #if is outside this hunk.
 * NOTE(review): interleaved diff lines -- several macros are defined twice
 * with different values, and only one of each pair is meant to exist:
 *   ARCH_X86 branch: DGEMM_DEFAULT_UNROLL_M (2, 8), DGEMM_DEFAULT_UNROLL_N (4, 8)
 *   #else branch:    SGEMM_DEFAULT_UNROLL_M (4, 8), DGEMM_DEFAULT_UNROLL_M (2, 8),
 *                    CGEMM_DEFAULT_UNROLL_M (2, 8), ZGEMM_DEFAULT_UNROLL_M (1, 4),
 *                    DGEMM_DEFAULT_UNROLL_N (8, 4)
 * Which value of each pair is the current one cannot be determined from this
 * span alone -- check against the kernel sources actually built. */
| #ifdef ARCH_X86 | |||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||
| #define DGEMM_DEFAULT_UNROLL_M 2 | |||
| #define DGEMM_DEFAULT_UNROLL_M 8 | |||
| #define QGEMM_DEFAULT_UNROLL_M 2 | |||
| #define CGEMM_DEFAULT_UNROLL_M 2 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 1 | |||
| #define XGEMM_DEFAULT_UNROLL_M 1 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||
| #define DGEMM_DEFAULT_UNROLL_N 8 | |||
| #define QGEMM_DEFAULT_UNROLL_N 2 | |||
| #define CGEMM_DEFAULT_UNROLL_N 2 | |||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | |||
| #define XGEMM_DEFAULT_UNROLL_N 1 | |||
| #else | |||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||
| #define DGEMM_DEFAULT_UNROLL_M 2 | |||
| #define SGEMM_DEFAULT_UNROLL_M 8 | |||
| #define DGEMM_DEFAULT_UNROLL_M 8 | |||
| #define QGEMM_DEFAULT_UNROLL_M 2 | |||
| #define CGEMM_DEFAULT_UNROLL_M 2 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 1 | |||
| #define CGEMM_DEFAULT_UNROLL_M 8 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 4 | |||
| #define XGEMM_DEFAULT_UNROLL_M 1 | |||
| #define SGEMM_DEFAULT_UNROLL_N 8 | |||
| #define DGEMM_DEFAULT_UNROLL_N 8 | |||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||
| #define QGEMM_DEFAULT_UNROLL_N 2 | |||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||
| #define ZGEMM_DEFAULT_UNROLL_N 4 | |||
| #define XGEMM_DEFAULT_UNROLL_N 1 | |||
| #endif | |||
/* GEMM blocking macros (P, Q, R) per prefix, plus GETRF_FACTOR.
 * R macros expand either to an identifier (sgemm_r, dgemm_r, ...) or, for
 * CGEMM in the second definition, to the literal 1024. Lines starting with
 * "//" are commented-out alternatives.
 * NOTE(review): interleaved diff lines -- these macros are defined twice
 * with different values, and only one of each pair is meant to exist:
 *   SGEMM_DEFAULT_P (504, 512)   DGEMM_DEFAULT_P (504, 512)
 *   CGEMM_DEFAULT_P (252, 128)   ZGEMM_DEFAULT_P (252, 512)
 *   CGEMM_DEFAULT_R (cgemm_r, 1024)
 *   SGEMM_DEFAULT_Q (512, 256)   CGEMM_DEFAULT_Q (512, 256)
 *   ZGEMM_DEFAULT_Q (256, 192)
 * QGEMM/XGEMM P, Q, R and DGEMM_DEFAULT_Q are defined once. */
| #define SGEMM_DEFAULT_P 504 | |||
| #define SGEMM_DEFAULT_P 512 | |||
| #define SGEMM_DEFAULT_R sgemm_r | |||
| //#define SGEMM_DEFAULT_R 1024 | |||
| #define DGEMM_DEFAULT_P 504 | |||
| #define DGEMM_DEFAULT_P 512 | |||
| #define DGEMM_DEFAULT_R dgemm_r | |||
| //#define DGEMM_DEFAULT_R 1024 | |||
| #define QGEMM_DEFAULT_P 504 | |||
| #define QGEMM_DEFAULT_R qgemm_r | |||
| #define CGEMM_DEFAULT_P 252 | |||
| #define CGEMM_DEFAULT_R cgemm_r | |||
| #define CGEMM_DEFAULT_P 128 | |||
| //#define CGEMM_DEFAULT_R cgemm_r | |||
| #define CGEMM_DEFAULT_R 1024 | |||
| #define ZGEMM_DEFAULT_P 252 | |||
| #define ZGEMM_DEFAULT_P 512 | |||
| #define ZGEMM_DEFAULT_R zgemm_r | |||
| //#define ZGEMM_DEFAULT_R 1024 | |||
| #define XGEMM_DEFAULT_P 252 | |||
| #define XGEMM_DEFAULT_R xgemm_r | |||
| #define SGEMM_DEFAULT_Q 512 | |||
| #define SGEMM_DEFAULT_Q 256 | |||
| #define DGEMM_DEFAULT_Q 256 | |||
| #define QGEMM_DEFAULT_Q 128 | |||
| #define CGEMM_DEFAULT_Q 512 | |||
| #define ZGEMM_DEFAULT_Q 256 | |||
| #define CGEMM_DEFAULT_Q 256 | |||
| #define ZGEMM_DEFAULT_Q 192 | |||
| #define XGEMM_DEFAULT_Q 128 | |||
| #define GETRF_FACTOR 0.72 | |||