| @@ -336,14 +336,14 @@ ifeq ($(ARCH), x86) | |||
| DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ | |||
| CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | |||
| ifneq ($(NO_AVX), 1) | |||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER | |||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL | |||
| endif | |||
| endif | |||
| ifeq ($(ARCH), x86_64) | |||
| DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | |||
| ifneq ($(NO_AVX), 1) | |||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER | |||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL | |||
| endif | |||
| endif | |||
| @@ -107,7 +107,7 @@ | |||
| #define CORE_BOBCAT 21 | |||
| #define CORE_BULLDOZER 22 | |||
| #define CORE_PILEDRIVER 23 | |||
| #define CORE_HASWELL CORE_SANDYBRIDGE | |||
| #define CORE_HASWELL 24 | |||
| #define HAVE_SSE (1 << 0) | |||
| #define HAVE_SSE2 (1 << 1) | |||
| @@ -200,7 +200,6 @@ typedef struct { | |||
| #define CPUTYPE_BOBCAT 45 | |||
| #define CPUTYPE_BULLDOZER 46 | |||
| #define CPUTYPE_PILEDRIVER 47 | |||
| // this define is because BLAS doesn't have haswell specific optimizations yet | |||
| #define CPUTYPE_HASWELL CPUTYPE_SANDYBRIDGE | |||
| #define CPUTYPE_HASWELL 48 | |||
| #endif | |||
| @@ -1243,6 +1243,7 @@ static char *cpuname[] = { | |||
| "BOBCAT", | |||
| "BULLDOZER", | |||
| "PILEDRIVER", | |||
| "HASWELL", | |||
| }; | |||
| static char *lowercpuname[] = { | |||
| @@ -1293,6 +1294,7 @@ static char *lowercpuname[] = { | |||
| "bobcat", | |||
| "bulldozer", | |||
| "piledriver", | |||
| "haswell", | |||
| }; | |||
| static char *corename[] = { | |||
| @@ -1320,6 +1322,7 @@ static char *corename[] = { | |||
| "BOBCAT", | |||
| "BULLDOZER", | |||
| "PILEDRIVER", | |||
| "HASWELL", | |||
| }; | |||
| static char *corename_lower[] = { | |||
| @@ -1347,6 +1350,7 @@ static char *corename_lower[] = { | |||
| "bobcat", | |||
| "bulldozer", | |||
| "piledriver", | |||
| "haswell", | |||
| }; | |||
| @@ -65,14 +65,15 @@ extern gotoblas_t gotoblas_BOBCAT; | |||
| extern gotoblas_t gotoblas_SANDYBRIDGE; | |||
| extern gotoblas_t gotoblas_BULLDOZER; | |||
| extern gotoblas_t gotoblas_PILEDRIVER; | |||
| extern gotoblas_t gotoblas_HASWELL; | |||
| #else | |||
| //Use NEHALEM kernels for sandy bridge | |||
| #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM | |||
| #define gotoblas_HASWELL gotoblas_NEHALEM | |||
| #define gotoblas_BULLDOZER gotoblas_BARCELONA | |||
| #define gotoblas_PILEDRIVER gotoblas_BARCELONA | |||
| #endif | |||
| //Use sandy bridge kernels for haswell. | |||
| #define gotoblas_HASWELL gotoblas_SANDYBRIDGE | |||
| #define VENDOR_INTEL 1 | |||
| #define VENDOR_AMD 2 | |||
| @@ -297,6 +298,7 @@ static char *corename[] = { | |||
| "Bobcat", | |||
| "Bulldozer", | |||
| "Piledriver", | |||
| "Haswell", | |||
| }; | |||
| char *gotoblas_corename(void) { | |||
| @@ -319,7 +321,8 @@ char *gotoblas_corename(void) { | |||
| if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; | |||
| if (gotoblas == &gotoblas_BOBCAT) return corename[17]; | |||
| if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; | |||
| if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; | |||
| if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; | |||
| if (gotoblas == &gotoblas_HASWELL) return corename[20]; | |||
| return corename[0]; | |||
| } | |||
| @@ -298,6 +298,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CORENAME "SANDYBRIDGE" | |||
| #endif | |||
/*
 * Forced-target configuration used when FORCE_HASWELL is defined.
 * It defines FORCE and FORCE_INTEL, names the architecture "X86" with
 * sub-architecture "HASWELL", and builds ARCHCONFIG as one string of -D
 * options: -DHASWELL, a 32768-byte L1 data cache and a 262144-byte L2
 * cache (both with 64-byte lines), the DTB settings, the CMOV/MMX/SSE..AVX
 * feature macros and -DFMA3.  LIBNAME and CORENAME carry the lower- and
 * upper-case spellings of the target name.
 */
| #ifdef FORCE_HASWELL | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| #define ARCHITECTURE "X86" | |||
| #define SUBARCHITECTURE "HASWELL" | |||
| #define ARCHCONFIG "-DHASWELL " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ | |||
| "-DFMA3" | |||
| #define LIBNAME "haswell" | |||
| #define CORENAME "HASWELL" | |||
| #endif | |||
| #ifdef FORCE_ATOM | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| @@ -725,20 +740,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #ifndef FORCE | |||
| #if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \ | |||
| defined(__PPC__) || defined(PPC) || defined(_POWER) || defined(__POWERPC__) | |||
| defined(__PPC__) || defined(PPC) || defined(_POWER) || defined(__POWERPC__) | |||
| #ifndef POWER | |||
| #define POWER | |||
| #endif | |||
| #define OPENBLAS_SUPPORTED | |||
| #endif | |||
| #if defined(__i386__) || (__x86_64__) | |||
| #include "cpuid_x86.c" | |||
| #define OPENBLAS_SUPPORTED | |||
| @@ -779,6 +790,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #error "This arch/CPU is not supported by OpenBLAS." | |||
| #endif | |||
| #else | |||
| #endif | |||
| static int get_num_cores(void) { | |||
| @@ -843,11 +856,10 @@ int main(int argc, char *argv[]){ | |||
| printf("NUM_CORES=%d\n", get_num_cores()); | |||
| #if defined(__arm__) && !defined(FORCE) | |||
| get_features(); | |||
| get_features(); | |||
| #endif | |||
| #if defined(__i386__) || defined(__x86_64__) | |||
| #ifndef FORCE | |||
| get_sse(); | |||
| @@ -0,0 +1,230 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ | |||
| BLASLONG i, j; | |||
| FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; | |||
| FLOAT *b_offset; | |||
| FLOAT ctemp1, ctemp2, ctemp3, ctemp4; | |||
| FLOAT ctemp5, ctemp6, ctemp7, ctemp8; | |||
| FLOAT ctemp9, ctemp10, ctemp11, ctemp12; | |||
| FLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| a_offset = a; | |||
| b_offset = b; | |||
| j = (n >> 2); | |||
| if (j > 0){ | |||
| do{ | |||
| a_offset1 = a_offset; | |||
| a_offset2 = a_offset1 + lda; | |||
| a_offset3 = a_offset2 + lda; | |||
| a_offset4 = a_offset3 + lda; | |||
| a_offset += 4 * lda; | |||
| i = (m >> 2); | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp1 = *(a_offset1 + 0); | |||
| ctemp2 = *(a_offset1 + 1); | |||
| ctemp3 = *(a_offset1 + 2); | |||
| ctemp4 = *(a_offset1 + 3); | |||
| ctemp5 = *(a_offset2 + 0); | |||
| ctemp6 = *(a_offset2 + 1); | |||
| ctemp7 = *(a_offset2 + 2); | |||
| ctemp8 = *(a_offset2 + 3); | |||
| ctemp9 = *(a_offset3 + 0); | |||
| ctemp10 = *(a_offset3 + 1); | |||
| ctemp11 = *(a_offset3 + 2); | |||
| ctemp12 = *(a_offset3 + 3); | |||
| ctemp13 = *(a_offset4 + 0); | |||
| ctemp14 = *(a_offset4 + 1); | |||
| ctemp15 = *(a_offset4 + 2); | |||
| ctemp16 = *(a_offset4 + 3); | |||
| *(b_offset + 0) = ctemp1; | |||
| *(b_offset + 1) = ctemp5; | |||
| *(b_offset + 2) = ctemp9; | |||
| *(b_offset + 3) = ctemp13; | |||
| *(b_offset + 4) = ctemp2; | |||
| *(b_offset + 5) = ctemp6; | |||
| *(b_offset + 6) = ctemp10; | |||
| *(b_offset + 7) = ctemp14; | |||
| *(b_offset + 8) = ctemp3; | |||
| *(b_offset + 9) = ctemp7; | |||
| *(b_offset + 10) = ctemp11; | |||
| *(b_offset + 11) = ctemp15; | |||
| *(b_offset + 12) = ctemp4; | |||
| *(b_offset + 13) = ctemp8; | |||
| *(b_offset + 14) = ctemp12; | |||
| *(b_offset + 15) = ctemp16; | |||
| a_offset1 += 4; | |||
| a_offset2 += 4; | |||
| a_offset3 += 4; | |||
| a_offset4 += 4; | |||
| b_offset += 16; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| i = (m & 3); | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp1 = *(a_offset1 + 0); | |||
| ctemp5 = *(a_offset2 + 0); | |||
| ctemp9 = *(a_offset3 + 0); | |||
| ctemp13 = *(a_offset4 + 0); | |||
| *(b_offset + 0) = ctemp1; | |||
| *(b_offset + 1) = ctemp5; | |||
| *(b_offset + 2) = ctemp9; | |||
| *(b_offset + 3) = ctemp13; | |||
| a_offset1 ++; | |||
| a_offset2 ++; | |||
| a_offset3 ++; | |||
| a_offset4 ++; | |||
| b_offset += 4; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| j--; | |||
| }while(j > 0); | |||
| } /* end of if(j > 0) */ | |||
| if (n & 2){ | |||
| a_offset1 = a_offset; | |||
| a_offset2 = a_offset1 + lda; | |||
| a_offset += 2 * lda; | |||
| i = (m >> 2); | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp1 = *(a_offset1 + 0); | |||
| ctemp2 = *(a_offset1 + 1); | |||
| ctemp3 = *(a_offset1 + 2); | |||
| ctemp4 = *(a_offset1 + 3); | |||
| ctemp5 = *(a_offset2 + 0); | |||
| ctemp6 = *(a_offset2 + 1); | |||
| ctemp7 = *(a_offset2 + 2); | |||
| ctemp8 = *(a_offset2 + 3); | |||
| *(b_offset + 0) = ctemp1; | |||
| *(b_offset + 1) = ctemp5; | |||
| *(b_offset + 2) = ctemp2; | |||
| *(b_offset + 3) = ctemp6; | |||
| *(b_offset + 4) = ctemp3; | |||
| *(b_offset + 5) = ctemp7; | |||
| *(b_offset + 6) = ctemp4; | |||
| *(b_offset + 7) = ctemp8; | |||
| a_offset1 += 4; | |||
| a_offset2 += 4; | |||
| b_offset += 8; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| i = (m & 3); | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp1 = *(a_offset1 + 0); | |||
| ctemp5 = *(a_offset2 + 0); | |||
| *(b_offset + 0) = ctemp1; | |||
| *(b_offset + 1) = ctemp5; | |||
| a_offset1 ++; | |||
| a_offset2 ++; | |||
| b_offset += 2; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| } /* end of if(j > 0) */ | |||
| if (n & 1){ | |||
| a_offset1 = a_offset; | |||
| i = (m >> 2); | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp1 = *(a_offset1 + 0); | |||
| ctemp2 = *(a_offset1 + 1); | |||
| ctemp3 = *(a_offset1 + 2); | |||
| ctemp4 = *(a_offset1 + 3); | |||
| *(b_offset + 0) = ctemp1; | |||
| *(b_offset + 1) = ctemp2; | |||
| *(b_offset + 2) = ctemp3; | |||
| *(b_offset + 3) = ctemp4; | |||
| a_offset1 += 4; | |||
| b_offset += 4; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| i = (m & 3); | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp1 = *(a_offset1 + 0); | |||
| *(b_offset + 0) = ctemp1; | |||
| a_offset1 ++; | |||
| b_offset += 1; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| } /* end of if(j > 0) */ | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,281 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ | |||
| BLASLONG i, j; | |||
| FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; | |||
| FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3; | |||
| FLOAT ctemp1, ctemp2, ctemp3, ctemp4; | |||
| FLOAT ctemp5, ctemp6, ctemp7, ctemp8; | |||
| FLOAT ctemp9, ctemp10, ctemp11, ctemp12; | |||
| FLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| a_offset = a; | |||
| b_offset = b; | |||
| b_offset2 = b + m * (n & ~3); | |||
| b_offset3 = b + m * (n & ~1); | |||
| j = (m >> 2); | |||
| if (j > 0){ | |||
| do{ | |||
| a_offset1 = a_offset; | |||
| a_offset2 = a_offset1 + lda; | |||
| a_offset3 = a_offset2 + lda; | |||
| a_offset4 = a_offset3 + lda; | |||
| a_offset += 4 * lda; | |||
| b_offset1 = b_offset; | |||
| b_offset += 16; | |||
| i = (n >> 2); | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp1 = *(a_offset1 + 0); | |||
| ctemp2 = *(a_offset1 + 1); | |||
| ctemp3 = *(a_offset1 + 2); | |||
| ctemp4 = *(a_offset1 + 3); | |||
| ctemp5 = *(a_offset2 + 0); | |||
| ctemp6 = *(a_offset2 + 1); | |||
| ctemp7 = *(a_offset2 + 2); | |||
| ctemp8 = *(a_offset2 + 3); | |||
| ctemp9 = *(a_offset3 + 0); | |||
| ctemp10 = *(a_offset3 + 1); | |||
| ctemp11 = *(a_offset3 + 2); | |||
| ctemp12 = *(a_offset3 + 3); | |||
| ctemp13 = *(a_offset4 + 0); | |||
| ctemp14 = *(a_offset4 + 1); | |||
| ctemp15 = *(a_offset4 + 2); | |||
| ctemp16 = *(a_offset4 + 3); | |||
| a_offset1 += 4; | |||
| a_offset2 += 4; | |||
| a_offset3 += 4; | |||
| a_offset4 += 4; | |||
| *(b_offset1 + 0) = ctemp1; | |||
| *(b_offset1 + 1) = ctemp2; | |||
| *(b_offset1 + 2) = ctemp3; | |||
| *(b_offset1 + 3) = ctemp4; | |||
| *(b_offset1 + 4) = ctemp5; | |||
| *(b_offset1 + 5) = ctemp6; | |||
| *(b_offset1 + 6) = ctemp7; | |||
| *(b_offset1 + 7) = ctemp8; | |||
| *(b_offset1 + 8) = ctemp9; | |||
| *(b_offset1 + 9) = ctemp10; | |||
| *(b_offset1 + 10) = ctemp11; | |||
| *(b_offset1 + 11) = ctemp12; | |||
| *(b_offset1 + 12) = ctemp13; | |||
| *(b_offset1 + 13) = ctemp14; | |||
| *(b_offset1 + 14) = ctemp15; | |||
| *(b_offset1 + 15) = ctemp16; | |||
| b_offset1 += m * 4; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| if (n & 2) { | |||
| ctemp1 = *(a_offset1 + 0); | |||
| ctemp2 = *(a_offset1 + 1); | |||
| ctemp3 = *(a_offset2 + 0); | |||
| ctemp4 = *(a_offset2 + 1); | |||
| ctemp5 = *(a_offset3 + 0); | |||
| ctemp6 = *(a_offset3 + 1); | |||
| ctemp7 = *(a_offset4 + 0); | |||
| ctemp8 = *(a_offset4 + 1); | |||
| a_offset1 += 2; | |||
| a_offset2 += 2; | |||
| a_offset3 += 2; | |||
| a_offset4 += 2; | |||
| *(b_offset2 + 0) = ctemp1; | |||
| *(b_offset2 + 1) = ctemp2; | |||
| *(b_offset2 + 2) = ctemp3; | |||
| *(b_offset2 + 3) = ctemp4; | |||
| *(b_offset2 + 4) = ctemp5; | |||
| *(b_offset2 + 5) = ctemp6; | |||
| *(b_offset2 + 6) = ctemp7; | |||
| *(b_offset2 + 7) = ctemp8; | |||
| b_offset2 += 8; | |||
| } | |||
| if (n & 1) { | |||
| ctemp1 = *(a_offset1 + 0); | |||
| ctemp2 = *(a_offset2 + 0); | |||
| ctemp3 = *(a_offset3 + 0); | |||
| ctemp4 = *(a_offset4 + 0); | |||
| *(b_offset3 + 0) = ctemp1; | |||
| *(b_offset3 + 1) = ctemp2; | |||
| *(b_offset3 + 2) = ctemp3; | |||
| *(b_offset3 + 3) = ctemp4; | |||
| b_offset3 += 4; | |||
| } | |||
| j--; | |||
| }while(j > 0); | |||
| } | |||
| if (m & 2){ | |||
| a_offset1 = a_offset; | |||
| a_offset2 = a_offset1 + lda; | |||
| a_offset += 2 * lda; | |||
| b_offset1 = b_offset; | |||
| b_offset += 8; | |||
| i = (n >> 2); | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp1 = *(a_offset1 + 0); | |||
| ctemp2 = *(a_offset1 + 1); | |||
| ctemp3 = *(a_offset1 + 2); | |||
| ctemp4 = *(a_offset1 + 3); | |||
| ctemp5 = *(a_offset2 + 0); | |||
| ctemp6 = *(a_offset2 + 1); | |||
| ctemp7 = *(a_offset2 + 2); | |||
| ctemp8 = *(a_offset2 + 3); | |||
| a_offset1 += 4; | |||
| a_offset2 += 4; | |||
| *(b_offset1 + 0) = ctemp1; | |||
| *(b_offset1 + 1) = ctemp2; | |||
| *(b_offset1 + 2) = ctemp3; | |||
| *(b_offset1 + 3) = ctemp4; | |||
| *(b_offset1 + 4) = ctemp5; | |||
| *(b_offset1 + 5) = ctemp6; | |||
| *(b_offset1 + 6) = ctemp7; | |||
| *(b_offset1 + 7) = ctemp8; | |||
| b_offset1 += m * 4; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| if (n & 2) { | |||
| ctemp1 = *(a_offset1 + 0); | |||
| ctemp2 = *(a_offset1 + 1); | |||
| ctemp3 = *(a_offset2 + 0); | |||
| ctemp4 = *(a_offset2 + 1); | |||
| a_offset1 += 2; | |||
| a_offset2 += 2; | |||
| *(b_offset2 + 0) = ctemp1; | |||
| *(b_offset2 + 1) = ctemp2; | |||
| *(b_offset2 + 2) = ctemp3; | |||
| *(b_offset2 + 3) = ctemp4; | |||
| b_offset2 += 4; | |||
| } | |||
| if (n & 1) { | |||
| ctemp1 = *(a_offset1 + 0); | |||
| ctemp2 = *(a_offset2 + 0); | |||
| *(b_offset3 + 0) = ctemp1; | |||
| *(b_offset3 + 1) = ctemp2; | |||
| b_offset3 += 2; | |||
| } | |||
| } | |||
| if (m & 1){ | |||
| a_offset1 = a_offset; | |||
| b_offset1 = b_offset; | |||
| i = (n >> 2); | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp1 = *(a_offset1 + 0); | |||
| ctemp2 = *(a_offset1 + 1); | |||
| ctemp3 = *(a_offset1 + 2); | |||
| ctemp4 = *(a_offset1 + 3); | |||
| a_offset1 += 4; | |||
| *(b_offset1 + 0) = ctemp1; | |||
| *(b_offset1 + 1) = ctemp2; | |||
| *(b_offset1 + 2) = ctemp3; | |||
| *(b_offset1 + 3) = ctemp4; | |||
| b_offset1 += 4 * m; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| if (n & 2) { | |||
| ctemp1 = *(a_offset1 + 0); | |||
| ctemp2 = *(a_offset1 + 1); | |||
| a_offset1 += 2; | |||
| *(b_offset2 + 0) = ctemp1; | |||
| *(b_offset2 + 1) = ctemp2; | |||
| } | |||
| if (n & 1) { | |||
| ctemp1 = *(a_offset1 + 0); | |||
| *(b_offset3 + 0) = ctemp1; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,138 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
| BLASLONG i, js, offset; | |||
| FLOAT data01, data02, data03, data04; | |||
| FLOAT *ao1, *ao2, *ao3, *ao4; | |||
| js = (n >> 2); | |||
| while (js > 0){ | |||
| offset = posX - posY; | |||
| if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; | |||
| if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; | |||
| if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; | |||
| if (offset > -3) ao4 = a + posX + 3 + posY * lda; else ao4 = a + posY + (posX + 3) * lda; | |||
| i = m; | |||
| while (i > 0) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao2 + 0); | |||
| data03 = *(ao3 + 0); | |||
| data04 = *(ao4 + 0); | |||
| if (offset > 0) ao1 += lda; else ao1 ++; | |||
| if (offset > -1) ao2 += lda; else ao2 ++; | |||
| if (offset > -2) ao3 += lda; else ao3 ++; | |||
| if (offset > -3) ao4 += lda; else ao4 ++; | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| b[ 2] = data03; | |||
| b[ 3] = data04; | |||
| b += 4; | |||
| offset --; | |||
| i --; | |||
| } | |||
| posX += 4; | |||
| js --; | |||
| } | |||
| if (n & 2) { | |||
| offset = posX - posY; | |||
| if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; | |||
| if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; | |||
| i = m; | |||
| while (i > 0) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao2 + 0); | |||
| if (offset > 0) ao1 += lda; else ao1 ++; | |||
| if (offset > -1) ao2 += lda; else ao2 ++; | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| b += 2; | |||
| offset --; | |||
| i --; | |||
| } | |||
| posX += 2; | |||
| } | |||
| if (n & 1) { | |||
| offset = posX - posY; | |||
| if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; | |||
| i = m; | |||
| while (i > 0) { | |||
| data01 = *(ao1 + 0); | |||
| if (offset > 0) ao1 += lda; else ao1 ++; | |||
| b[ 0] = data01; | |||
| b ++; | |||
| offset --; | |||
| i --; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,136 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
| BLASLONG i, js, offset; | |||
| FLOAT data01, data02, data03, data04; | |||
| FLOAT *ao1, *ao2, *ao3, *ao4; | |||
| js = (n >> 2); | |||
| while (js > 0){ | |||
| offset = posX - posY; | |||
| if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; | |||
| if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; | |||
| if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; | |||
| if (offset > -3) ao4 = a + posY + (posX + 3) * lda; else ao4 = a + posX + 3 + posY * lda; | |||
| i = m; | |||
| while (i > 0) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao2 + 0); | |||
| data03 = *(ao3 + 0); | |||
| data04 = *(ao4 + 0); | |||
| if (offset > 0) ao1 ++; else ao1 += lda; | |||
| if (offset > -1) ao2 ++; else ao2 += lda; | |||
| if (offset > -2) ao3 ++; else ao3 += lda; | |||
| if (offset > -3) ao4 ++; else ao4 += lda; | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| b[ 2] = data03; | |||
| b[ 3] = data04; | |||
| b += 4; | |||
| offset --; | |||
| i --; | |||
| } | |||
| posX += 4; | |||
| js --; | |||
| } | |||
| if (n & 2) { | |||
| offset = posX - posY; | |||
| if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; | |||
| if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; | |||
| i = m; | |||
| while (i > 0) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao2 + 0); | |||
| if (offset > 0) ao1 ++; else ao1 += lda; | |||
| if (offset > -1) ao2 ++; else ao2 += lda; | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| b += 2; | |||
| offset --; | |||
| i --; | |||
| } | |||
| posX += 2; | |||
| } | |||
| if (n & 1) { | |||
| offset = posX - posY; | |||
| if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; | |||
| i = m; | |||
| while (i > 0) { | |||
| data01 = *(ao1 + 0); | |||
| if (offset > 0) ao1 ++; else ao1 += lda; | |||
| b[ 0] = data01; | |||
| b ++; | |||
| offset --; | |||
| i --; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,484 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ /* Packs data read from a (stride lda) into b: n is consumed in groups of 4, then 2, then 1, and each group is walked for m steps; a tile is stored whole when X > posY, skipped when X < posY, and stored as a triangle when X == posY. Always returns 0. */ | |||
| BLASLONG i, js; /* i: tiles or steps still to do along m; js: 4-wide groups still to do along n */ | |||
| BLASLONG X; /* running position along m; restarts at posX for every group */ | |||
| FLOAT data01, data02, data03, data04, data05, data06, data07, data08; /* staging values: every load of a tile is done before its stores */ | |||
| FLOAT data09, data10, data11, data12, data13, data14, data15, data16; | |||
| FLOAT *ao1, *ao2, *ao3, *ao4; /* read cursors, one per member of the current group, spaced lda apart */ | |||
| js = (n >> 2); /* number of complete 4-wide groups */ | |||
| if (js > 0){ | |||
| do { | |||
| X = posX; | |||
| if (posX <= posY) { /* the start address swaps the roles of posX and posY depending on their order */ | |||
| ao1 = a + posY + (posX + 0) * lda; | |||
| ao2 = a + posY + (posX + 1) * lda; | |||
| ao3 = a + posY + (posX + 2) * lda; | |||
| ao4 = a + posY + (posX + 3) * lda; | |||
| } else { | |||
| ao1 = a + posX + (posY + 0) * lda; | |||
| ao2 = a + posX + (posY + 1) * lda; | |||
| ao3 = a + posX + (posY + 2) * lda; | |||
| ao4 = a + posX + (posY + 3) * lda; | |||
| } | |||
| i = (m >> 2); /* complete tiles of 4 steps along m */ | |||
| if (i > 0) { | |||
| do { | |||
| if (X > posY) { /* whole tile is copied */ | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| data03 = *(ao1 + 2); | |||
| data04 = *(ao1 + 3); | |||
| data05 = *(ao2 + 0); | |||
| data06 = *(ao2 + 1); | |||
| data07 = *(ao2 + 2); | |||
| data08 = *(ao2 + 3); | |||
| data09 = *(ao3 + 0); | |||
| data10 = *(ao3 + 1); | |||
| data11 = *(ao3 + 2); | |||
| data12 = *(ao3 + 3); | |||
| data13 = *(ao4 + 0); | |||
| data14 = *(ao4 + 1); | |||
| data15 = *(ao4 + 2); | |||
| data16 = *(ao4 + 3); | |||
| b[ 0] = data01; /* interleaved store: b[4*r + c] receives element r of cursor c (ao1 is c = 0) */ | |||
| b[ 1] = data05; | |||
| b[ 2] = data09; | |||
| b[ 3] = data13; | |||
| b[ 4] = data02; | |||
| b[ 5] = data06; | |||
| b[ 6] = data10; | |||
| b[ 7] = data14; | |||
| b[ 8] = data03; | |||
| b[ 9] = data07; | |||
| b[10] = data11; | |||
| b[11] = data15; | |||
| b[12] = data04; | |||
| b[13] = data08; | |||
| b[14] = data12; | |||
| b[15] = data16; | |||
| ao1 += 4; | |||
| ao2 += 4; | |||
| ao3 += 4; | |||
| ao4 += 4; | |||
| b += 16; | |||
| } else | |||
| if (X < posY) { /* nothing is stored: the 16 slots of b are stepped over unwritten */ | |||
| ao1 += 4 * lda; /* in this branch the cursors move by 4 * lda rather than by 4 */ | |||
| ao2 += 4 * lda; | |||
| ao3 += 4 * lda; | |||
| ao4 += 4 * lda; | |||
| b += 16; | |||
| } else { /* X == posY: tile sits on the diagonal */ | |||
| #ifdef UNIT | |||
| data02 = *(ao1 + 1); /* UNIT: the diagonal elements of a are not read */ | |||
| data03 = *(ao1 + 2); | |||
| data04 = *(ao1 + 3); | |||
| data07 = *(ao2 + 2); | |||
| data08 = *(ao2 + 3); | |||
| data12 = *(ao3 + 3); | |||
| b[ 0] = ONE; /* b[4*r + c]: ONE where r == c, ZERO where c > r, data from a where r > c */ | |||
| b[ 1] = ZERO; | |||
| b[ 2] = ZERO; | |||
| b[ 3] = ZERO; | |||
| b[ 4] = data02; | |||
| b[ 5] = ONE; | |||
| b[ 6] = ZERO; | |||
| b[ 7] = ZERO; | |||
| b[ 8] = data03; | |||
| b[ 9] = data07; | |||
| b[10] = ONE; | |||
| b[11] = ZERO; | |||
| b[12] = data04; | |||
| b[13] = data08; | |||
| b[14] = data12; | |||
| b[15] = ONE; | |||
| #else | |||
| data01 = *(ao1 + 0); /* non-UNIT: same triangle, diagonal values taken from a */ | |||
| data02 = *(ao1 + 1); | |||
| data03 = *(ao1 + 2); | |||
| data04 = *(ao1 + 3); | |||
| data06 = *(ao2 + 1); | |||
| data07 = *(ao2 + 2); | |||
| data08 = *(ao2 + 3); | |||
| data11 = *(ao3 + 2); | |||
| data12 = *(ao3 + 3); | |||
| data16 = *(ao4 + 3); | |||
| b[ 0] = data01; | |||
| b[ 1] = ZERO; | |||
| b[ 2] = ZERO; | |||
| b[ 3] = ZERO; | |||
| b[ 4] = data02; | |||
| b[ 5] = data06; | |||
| b[ 6] = ZERO; | |||
| b[ 7] = ZERO; | |||
| b[ 8] = data03; | |||
| b[ 9] = data07; | |||
| b[10] = data11; | |||
| b[11] = ZERO; | |||
| b[12] = data04; | |||
| b[13] = data08; | |||
| b[14] = data12; | |||
| b[15] = data16; | |||
| #endif | |||
| ao1 += 4; | |||
| ao2 += 4; | |||
| ao3 += 4; | |||
| ao4 += 4; | |||
| b += 16; | |||
| } | |||
| X += 4; | |||
| i --; | |||
| } while (i > 0); | |||
| } | |||
| i = (m & 3); /* 1 to 3 leftover steps along m; X is not advanced any further for this group */ | |||
| if (i) { | |||
| if (X > posY) { | |||
| if (m & 2) { /* two leftover steps: 8 values, same interleaving as the full tile */ | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| data03 = *(ao2 + 0); | |||
| data04 = *(ao2 + 1); | |||
| data05 = *(ao3 + 0); | |||
| data06 = *(ao3 + 1); | |||
| data07 = *(ao4 + 0); | |||
| data08 = *(ao4 + 1); | |||
| b[ 0] = data01; | |||
| b[ 1] = data03; | |||
| b[ 2] = data05; | |||
| b[ 3] = data07; | |||
| b[ 4] = data02; | |||
| b[ 5] = data04; | |||
| b[ 6] = data06; | |||
| b[ 7] = data08; | |||
| ao1 += 2; | |||
| ao2 += 2; | |||
| ao3 += 2; | |||
| ao4 += 2; | |||
| b += 8; | |||
| } | |||
| if (m & 1) { /* one leftover step: 4 values */ | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao2 + 0); | |||
| data03 = *(ao3 + 0); | |||
| data04 = *(ao4 + 0); | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| b[ 2] = data03; | |||
| b[ 3] = data04; | |||
| ao1 += 1; | |||
| ao2 += 1; | |||
| ao3 += 1; | |||
| ao4 += 1; | |||
| b += 4; | |||
| } | |||
| } else | |||
| if (X < posY) { /* skip only: b moves on by 8 and/or 4 slots without being written */ | |||
| if (m & 2) { | |||
| ao1 += 2 * lda; /* only ao1 and ao2 are moved; all cursors are reassigned before they are read again */ | |||
| ao2 += 2 * lda; | |||
| b += 8; | |||
| } | |||
| if (m & 1) { | |||
| ao1 += lda; | |||
| b += 4; | |||
| } | |||
| } else { /* X == posY with fewer than 4 steps left: i rows of 4 values are written */ | |||
| #ifdef UNIT | |||
| data05 = *(ao2 + 0); | |||
| data09 = *(ao3 + 0); | |||
| data13 = *(ao4 + 0); | |||
| if (i >= 2) { | |||
| data10 = *(ao3 + 1); | |||
| data14 = *(ao4 + 1); | |||
| } | |||
| if (i >= 3) { | |||
| data15 = *(ao4 + 2); | |||
| } | |||
| b[ 0] = ONE; /* NOTE(review): this row stores ao2[0], ao3[0], ao4[0] in b[1..3], whereas the full diagonal tile above stores ZERO there - confirm the difference is intended */ | |||
| b[ 1] = data05; | |||
| b[ 2] = data09; | |||
| b[ 3] = data13; | |||
| b += 4; | |||
| if(i >= 2) { | |||
| b[ 0] = ZERO; | |||
| b[ 1] = ONE; | |||
| b[ 2] = data10; | |||
| b[ 3] = data14; | |||
| b += 4; | |||
| } | |||
| if (i >= 3) { | |||
| b[ 0] = ZERO; | |||
| b[ 1] = ZERO; | |||
| b[ 2] = ONE; | |||
| b[ 3] = data15; | |||
| b += 4; | |||
| } | |||
| #else | |||
| data01 = *(ao1 + 0); | |||
| data05 = *(ao2 + 0); | |||
| data09 = *(ao3 + 0); | |||
| data13 = *(ao4 + 0); | |||
| if (i >= 2) { | |||
| data06 = *(ao2 + 1); | |||
| data10 = *(ao3 + 1); | |||
| data14 = *(ao4 + 1); | |||
| } | |||
| if (i >= 3) { | |||
| data11 = *(ao3 + 2); | |||
| data15 = *(ao4 + 2); | |||
| } | |||
| b[ 0] = data01; /* NOTE(review): same layout as the UNIT leftover case, which differs from the full diagonal tile - confirm */ | |||
| b[ 1] = data05; | |||
| b[ 2] = data09; | |||
| b[ 3] = data13; | |||
| b += 4; | |||
| if(i >= 2) { | |||
| b[ 0] = ZERO; | |||
| b[ 1] = data06; | |||
| b[ 2] = data10; | |||
| b[ 3] = data14; | |||
| b += 4; | |||
| } | |||
| if (i >= 3) { | |||
| b[ 0] = ZERO; | |||
| b[ 1] = ZERO; | |||
| b[ 2] = data11; | |||
| b[ 3] = data15; | |||
| b += 4; | |||
| } | |||
| #endif | |||
| } | |||
| } | |||
| posY += 4; /* the next group is compared against a posY that is 4 larger */ | |||
| js --; | |||
| } while (js > 0); | |||
| } /* End of main loop */ | |||
| if (n & 2){ /* leftover group of 2: same scheme with two cursors and 2-step tiles */ | |||
| X = posX; | |||
| if (posX <= posY) { | |||
| ao1 = a + posY + (posX + 0) * lda; | |||
| ao2 = a + posY + (posX + 1) * lda; | |||
| } else { | |||
| ao1 = a + posX + (posY + 0) * lda; | |||
| ao2 = a + posX + (posY + 1) * lda; | |||
| } | |||
| i = (m >> 1); /* complete tiles of 2 steps */ | |||
| if (i > 0) { | |||
| do { | |||
| if (X > posY) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| data05 = *(ao2 + 0); | |||
| data06 = *(ao2 + 1); | |||
| b[ 0] = data01; /* b[2*r + c] receives element r of cursor c */ | |||
| b[ 1] = data05; | |||
| b[ 2] = data02; | |||
| b[ 3] = data06; | |||
| ao1 += 2; | |||
| ao2 += 2; | |||
| b += 4; | |||
| } else | |||
| if (X < posY) { /* skipped: 4 slots of b left unwritten */ | |||
| ao1 += 2 * lda; | |||
| ao2 += 2 * lda; | |||
| b += 4; | |||
| } else { /* 2x2 diagonal tile */ | |||
| #ifdef UNIT | |||
| data02 = *(ao1 + 1); | |||
| b[ 0] = ONE; | |||
| b[ 1] = ZERO; | |||
| b[ 2] = data02; | |||
| b[ 3] = ONE; | |||
| #else | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| data06 = *(ao2 + 1); | |||
| b[ 0] = data01; | |||
| b[ 1] = ZERO; | |||
| b[ 2] = data02; | |||
| b[ 3] = data06; | |||
| #endif | |||
| ao1 += 2; | |||
| ao2 += 2; | |||
| b += 4; | |||
| } | |||
| X += 2; | |||
| i --; | |||
| } while (i > 0); | |||
| } | |||
| i = (m & 1); /* single leftover step */ | |||
| if (i) { | |||
| if (X > posY) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao2 + 0); | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| ao1 += 1; | |||
| ao2 += 1; | |||
| b += 2; | |||
| } else | |||
| if (X < posY) { | |||
| ao1 += lda; | |||
| b += 2; | |||
| } else { /* NOTE(review): b[1] receives ao2[0] here, whereas the 2x2 diagonal tile above stores ZERO in b[1] - confirm */ | |||
| #ifdef UNIT | |||
| data05 = *(ao2 + 0); | |||
| b[ 0] = ONE; | |||
| b[ 1] = data05; | |||
| #else | |||
| data01 = *(ao1 + 0); | |||
| data05 = *(ao2 + 0); | |||
| b[ 0] = data01; | |||
| b[ 1] = data05; | |||
| #endif | |||
| b += 2; | |||
| } | |||
| } | |||
| posY += 2; | |||
| } | |||
| if (n & 1){ /* last single member: one cursor, one value per step */ | |||
| X = posX; | |||
| if (posX <= posY) { | |||
| ao1 = a + posY + (posX + 0) * lda; | |||
| } else { | |||
| ao1 = a + posX + (posY + 0) * lda; | |||
| } | |||
| i = m; | |||
| if (i > 0) { | |||
| do { | |||
| if (X > posY) { | |||
| data01 = *(ao1 + 0); | |||
| b[ 0] = data01; | |||
| b += 1; | |||
| ao1 += 1; | |||
| } else | |||
| if (X < posY) { /* slot skipped unwritten */ | |||
| b += 1; | |||
| ao1 += lda; | |||
| } else { /* diagonal element: ONE under UNIT, otherwise read from a */ | |||
| #ifdef UNIT | |||
| b[ 0] = ONE; | |||
| #else | |||
| data01 = *(ao1 + 0); | |||
| b[ 0] = data01; | |||
| #endif | |||
| b += 1; | |||
| ao1 += 1; | |||
| } | |||
| X ++; | |||
| i --; | |||
| } while (i > 0); | |||
| } | |||
| posY += 1; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,488 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ /* Packs data read from a (stride lda) into b: n is consumed in groups of 4, then 2, then 1, and each group is walked for m steps; a tile is stored whole when X < posY, skipped when X > posY, and stored as a triangle when X == posY. Always returns 0. */ | |||
| BLASLONG i, js; /* i: tiles or steps still to do along m; js: 4-wide groups still to do along n */ | |||
| BLASLONG X; /* running position along m; restarts at posX for every group */ | |||
| FLOAT data01, data02, data03, data04, data05, data06, data07, data08; /* staging values: every load of a tile is done before its stores */ | |||
| FLOAT data09, data10, data11, data12, data13, data14, data15, data16; | |||
| FLOAT *ao1, *ao2, *ao3, *ao4; /* read cursors, spaced lda apart */ | |||
| js = (n >> 2); /* number of complete 4-wide groups */ | |||
| if (js > 0){ | |||
| do { | |||
| X = posX; | |||
| if (posX <= posY) { /* the start address swaps the roles of posX and posY depending on their order */ | |||
| ao1 = a + posY + (posX + 0) * lda; | |||
| ao2 = a + posY + (posX + 1) * lda; | |||
| ao3 = a + posY + (posX + 2) * lda; | |||
| ao4 = a + posY + (posX + 3) * lda; | |||
| } else { | |||
| ao1 = a + posX + (posY + 0) * lda; | |||
| ao2 = a + posX + (posY + 1) * lda; | |||
| ao3 = a + posX + (posY + 2) * lda; | |||
| ao4 = a + posX + (posY + 3) * lda; | |||
| } | |||
| i = (m >> 2); /* complete tiles of 4 steps along m */ | |||
| if (i > 0) { | |||
| do { | |||
| if (X > posY) { /* nothing is stored: the 16 slots of b are stepped over unwritten */ | |||
| ao1 += 4; | |||
| ao2 += 4; | |||
| ao3 += 4; | |||
| ao4 += 4; | |||
| b += 16; | |||
| } else | |||
| if (X < posY) { /* whole tile is copied */ | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| data03 = *(ao1 + 2); | |||
| data04 = *(ao1 + 3); | |||
| data05 = *(ao2 + 0); | |||
| data06 = *(ao2 + 1); | |||
| data07 = *(ao2 + 2); | |||
| data08 = *(ao2 + 3); | |||
| data09 = *(ao3 + 0); | |||
| data10 = *(ao3 + 1); | |||
| data11 = *(ao3 + 2); | |||
| data12 = *(ao3 + 3); | |||
| data13 = *(ao4 + 0); | |||
| data14 = *(ao4 + 1); | |||
| data15 = *(ao4 + 2); | |||
| data16 = *(ao4 + 3); | |||
| b[ 0] = data01; /* contiguous store: b[4*c + r] receives element r of cursor c (ao1 is c = 0) */ | |||
| b[ 1] = data02; | |||
| b[ 2] = data03; | |||
| b[ 3] = data04; | |||
| b[ 4] = data05; | |||
| b[ 5] = data06; | |||
| b[ 6] = data07; | |||
| b[ 7] = data08; | |||
| b[ 8] = data09; | |||
| b[ 9] = data10; | |||
| b[10] = data11; | |||
| b[11] = data12; | |||
| b[12] = data13; | |||
| b[13] = data14; | |||
| b[14] = data15; | |||
| b[15] = data16; | |||
| ao1 += 4 * lda; /* in this branch the cursors move by 4 * lda rather than by 4 */ | |||
| ao2 += 4 * lda; | |||
| ao3 += 4 * lda; | |||
| ao4 += 4 * lda; | |||
| b += 16; | |||
| } else { /* X == posY: tile sits on the diagonal */ | |||
| #ifdef UNIT | |||
| data02 = *(ao1 + 1); /* UNIT: the diagonal elements of a are not read */ | |||
| data03 = *(ao1 + 2); | |||
| data04 = *(ao1 + 3); | |||
| data07 = *(ao2 + 2); | |||
| data08 = *(ao2 + 3); | |||
| data12 = *(ao3 + 3); | |||
| b[ 0] = ONE; /* b[4*c + r]: ONE where r == c, ZERO where r < c, data from a where r > c */ | |||
| b[ 1] = data02; | |||
| b[ 2] = data03; | |||
| b[ 3] = data04; | |||
| b[ 4] = ZERO; | |||
| b[ 5] = ONE; | |||
| b[ 6] = data07; | |||
| b[ 7] = data08; | |||
| b[ 8] = ZERO; | |||
| b[ 9] = ZERO; | |||
| b[10] = ONE; | |||
| b[11] = data12; | |||
| b[12] = ZERO; | |||
| b[13] = ZERO; | |||
| b[14] = ZERO; | |||
| b[15] = ONE; | |||
| #else | |||
| data01 = *(ao1 + 0); /* non-UNIT: same triangle, diagonal values taken from a */ | |||
| data02 = *(ao1 + 1); | |||
| data03 = *(ao1 + 2); | |||
| data04 = *(ao1 + 3); | |||
| data06 = *(ao2 + 1); | |||
| data07 = *(ao2 + 2); | |||
| data08 = *(ao2 + 3); | |||
| data11 = *(ao3 + 2); | |||
| data12 = *(ao3 + 3); | |||
| data16 = *(ao4 + 3); | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| b[ 2] = data03; | |||
| b[ 3] = data04; | |||
| b[ 4] = ZERO; | |||
| b[ 5] = data06; | |||
| b[ 6] = data07; | |||
| b[ 7] = data08; | |||
| b[ 8] = ZERO; | |||
| b[ 9] = ZERO; | |||
| b[10] = data11; | |||
| b[11] = data12; | |||
| b[12] = ZERO; | |||
| b[13] = ZERO; | |||
| b[14] = ZERO; | |||
| b[15] = data16; | |||
| #endif | |||
| ao1 += 4; | |||
| ao2 += 4; | |||
| ao3 += 4; | |||
| ao4 += 4; | |||
| b += 16; | |||
| } | |||
| X += 4; | |||
| i --; | |||
| } while (i > 0); | |||
| } | |||
| i = (m & 3); /* 1 to 3 leftover steps along m; X is not advanced any further for this group */ | |||
| if (i) { | |||
| if (X > posY) { /* skip only: b moves on by 8 and/or 4 slots without being written */ | |||
| if (m & 2) { | |||
| ao1 += 2; | |||
| ao2 += 2; | |||
| ao3 += 2; | |||
| ao4 += 2; | |||
| b += 8; | |||
| } | |||
| if (m & 1) { | |||
| ao1 += 1; | |||
| ao2 += 1; | |||
| ao3 += 1; | |||
| ao4 += 1; | |||
| b += 4; | |||
| } | |||
| } else | |||
| if (X < posY) { | |||
| if (m & 2) { /* two leftover steps: 4 values from ao1, then 4 values from ao2 */ | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| data03 = *(ao1 + 2); | |||
| data04 = *(ao1 + 3); | |||
| data05 = *(ao2 + 0); | |||
| data06 = *(ao2 + 1); | |||
| data07 = *(ao2 + 2); | |||
| data08 = *(ao2 + 3); | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| b[ 2] = data03; | |||
| b[ 3] = data04; | |||
| b[ 4] = data05; | |||
| b[ 5] = data06; | |||
| b[ 6] = data07; | |||
| b[ 7] = data08; | |||
| ao1 += 2 * lda; /* only ao1 and ao2 are used and moved in this branch */ | |||
| ao2 += 2 * lda; | |||
| b += 8; | |||
| } | |||
| if (m & 1) { /* one leftover step: 4 values from ao1 */ | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| data03 = *(ao1 + 2); | |||
| data04 = *(ao1 + 3); | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| b[ 2] = data03; | |||
| b[ 3] = data04; | |||
| ao1 += lda; | |||
| b += 4; | |||
| } | |||
| } else { /* X == posY with fewer than 4 steps left: the first i rows of the diagonal tile layout are written */ | |||
| #ifdef UNIT | |||
| data02 = *(ao1 + 1); | |||
| data03 = *(ao1 + 2); | |||
| data04 = *(ao1 + 3); | |||
| if (i >= 2) { | |||
| data07 = *(ao2 + 2); | |||
| data08 = *(ao2 + 3); | |||
| } | |||
| if (i >= 3) { | |||
| data12 = *(ao3 + 3); | |||
| } | |||
| b[ 0] = ONE; | |||
| b[ 1] = data02; | |||
| b[ 2] = data03; | |||
| b[ 3] = data04; | |||
| b += 4; | |||
| if(i >= 2) { | |||
| b[ 0] = ZERO; | |||
| b[ 1] = ONE; | |||
| b[ 2] = data07; | |||
| b[ 3] = data08; | |||
| b += 4; | |||
| } | |||
| if (i >= 3) { | |||
| b[ 0] = ZERO; | |||
| b[ 1] = ZERO; | |||
| b[ 2] = ONE; | |||
| b[ 3] = data12; | |||
| b += 4; | |||
| } | |||
| #else | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| data03 = *(ao1 + 2); | |||
| data04 = *(ao1 + 3); | |||
| if (i >= 2) { | |||
| data06 = *(ao2 + 1); | |||
| data07 = *(ao2 + 2); | |||
| data08 = *(ao2 + 3); | |||
| } | |||
| if (i >= 3) { | |||
| data11 = *(ao3 + 2); | |||
| data12 = *(ao3 + 3); | |||
| } | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| b[ 2] = data03; | |||
| b[ 3] = data04; | |||
| b += 4; | |||
| if(i >= 2) { | |||
| b[ 0] = ZERO; | |||
| b[ 1] = data06; | |||
| b[ 2] = data07; | |||
| b[ 3] = data08; | |||
| b += 4; | |||
| } | |||
| if (i >= 3) { | |||
| b[ 0] = ZERO; | |||
| b[ 1] = ZERO; | |||
| b[ 2] = data11; | |||
| b[ 3] = data12; | |||
| b += 4; | |||
| } | |||
| #endif | |||
| } | |||
| } | |||
| posY += 4; /* the next group is compared against a posY that is 4 larger */ | |||
| js --; | |||
| } while (js > 0); | |||
| } /* End of main loop */ | |||
| if (n & 2){ /* leftover group of 2: same scheme with two cursors and 2-step tiles */ | |||
| X = posX; | |||
| if (posX <= posY) { | |||
| ao1 = a + posY + (posX + 0) * lda; | |||
| ao2 = a + posY + (posX + 1) * lda; | |||
| } else { | |||
| ao1 = a + posX + (posY + 0) * lda; | |||
| ao2 = a + posX + (posY + 1) * lda; | |||
| } | |||
| i = (m >> 1); /* complete tiles of 2 steps */ | |||
| if (i > 0) { | |||
| do { | |||
| if (X > posY) { /* skipped: 4 slots of b left unwritten */ | |||
| ao1 += 2; | |||
| ao2 += 2; | |||
| b += 4; | |||
| } else | |||
| if (X < posY) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| data05 = *(ao2 + 0); | |||
| data06 = *(ao2 + 1); | |||
| b[ 0] = data01; /* b[2*c + r] receives element r of cursor c */ | |||
| b[ 1] = data02; | |||
| b[ 2] = data05; | |||
| b[ 3] = data06; | |||
| ao1 += 2 * lda; | |||
| ao2 += 2 * lda; | |||
| b += 4; | |||
| } else { /* 2x2 diagonal tile */ | |||
| #ifdef UNIT | |||
| data02 = *(ao1 + 1); | |||
| b[ 0] = ONE; | |||
| b[ 1] = data02; | |||
| b[ 2] = ZERO; | |||
| b[ 3] = ONE; | |||
| #else | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| data06 = *(ao2 + 1); | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| b[ 2] = ZERO; | |||
| b[ 3] = data06; | |||
| #endif | |||
| ao1 += 2; | |||
| ao2 += 2; | |||
| b += 4; | |||
| } | |||
| X += 2; | |||
| i --; | |||
| } while (i > 0); | |||
| } | |||
| i = (m & 1); /* single leftover step */ | |||
| if (i) { | |||
| if (X > posY) { | |||
| ao1 += 1; | |||
| ao2 += 1; | |||
| b += 2; | |||
| } else | |||
| if (X < posY) { /* both values come from ao1 */ | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| ao1 += lda; | |||
| b += 2; | |||
| } else { /* first row of the 2x2 diagonal layout */ | |||
| #ifdef UNIT | |||
| data02 = *(ao1 + 1); | |||
| b[ 0] = ONE; | |||
| b[ 1] = data02; | |||
| #else | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| #endif | |||
| b += 2; | |||
| } | |||
| } | |||
| posY += 2; | |||
| } | |||
| if (n & 1){ /* last single member: one cursor, one value per step */ | |||
| X = posX; | |||
| if (posX <= posY) { | |||
| ao1 = a + posY + (posX + 0) * lda; | |||
| } else { | |||
| ao1 = a + posX + (posY + 0) * lda; | |||
| } | |||
| i = m; | |||
| if (i > 0) { | |||
| do { | |||
| if (X > posY) { /* slot skipped unwritten */ | |||
| b += 1; | |||
| ao1 += 1; | |||
| } else | |||
| if (X < posY) { | |||
| data01 = *(ao1 + 0); | |||
| b[ 0] = data01; | |||
| ao1 += lda; | |||
| b += 1; | |||
| } else { /* diagonal element: ONE under UNIT, otherwise read from a */ | |||
| #ifdef UNIT | |||
| b[ 0] = ONE; | |||
| #else | |||
| data01 = *(ao1 + 0); | |||
| b[ 0] = data01; | |||
| #endif | |||
| ao1 += 1; | |||
| b += 1; | |||
| } | |||
| X ++; | |||
| i --; | |||
| } while (i > 0); | |||
| } | |||
| posY += 1; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,785 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
| BLASLONG i, js; | |||
| BLASLONG X, mm; | |||
| FLOAT data01, data02, data03, data04, data05, data06; | |||
| FLOAT data07, data08, data09, data10, data11, data12; | |||
| FLOAT data13, data14, data15, data16, data17, data18; | |||
| FLOAT data19, data20, data21, data22, data23, data24; | |||
| FLOAT data25, data26, data27, data28, data29, data30; | |||
| FLOAT data31, data32, data33, data34, data35, data36; | |||
| FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6; | |||
| //js = (n >> 2); | |||
| js = n/6; | |||
| if (js > 0){ | |||
| do { | |||
| X = posX; | |||
| if (posX <= posY) { | |||
| ao1 = a + posX + (posY + 0) * lda; | |||
| ao2 = a + posX + (posY + 1) * lda; | |||
| ao3 = a + posX + (posY + 2) * lda; | |||
| ao4 = a + posX + (posY + 3) * lda; | |||
| ao5 = a + posX + (posY + 4) * lda; | |||
| ao6 = a + posX + (posY + 5) * lda; | |||
| } else { | |||
| ao1 = a + posY + (posX + 0) * lda; | |||
| ao2 = a + posY + (posX + 1) * lda; | |||
| ao3 = a + posY + (posX + 2) * lda; | |||
| ao4 = a + posY + (posX + 3) * lda; | |||
| ao5 = a + posY + (posX + 4) * lda; | |||
| ao6 = a + posY + (posX + 5) * lda; | |||
| } | |||
| i = m/6; | |||
| if (i > 0) { | |||
| do { | |||
| if (X < posY) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| data03 = *(ao1 + 2); | |||
| data04 = *(ao1 + 3); | |||
| data05 = *(ao1 + 4); | |||
| data06 = *(ao1 + 5); | |||
| data07 = *(ao2 + 0); | |||
| data08 = *(ao2 + 1); | |||
| data09 = *(ao2 + 2); | |||
| data10 = *(ao2 + 3); | |||
| data11 = *(ao2 + 4); | |||
| data12 = *(ao2 + 5); | |||
| data13 = *(ao3 + 0); | |||
| data14 = *(ao3 + 1); | |||
| data15 = *(ao3 + 2); | |||
| data16 = *(ao3 + 3); | |||
| data17 = *(ao3 + 4); | |||
| data18 = *(ao3 + 5); | |||
| data19 = *(ao4 + 0); | |||
| data20 = *(ao4 + 1); | |||
| data21 = *(ao4 + 2); | |||
| data22 = *(ao4 + 3); | |||
| data23 = *(ao4 + 4); | |||
| data24 = *(ao4 + 5); | |||
| data25 = *(ao5 + 0); | |||
| data26 = *(ao5 + 1); | |||
| data27 = *(ao5 + 2); | |||
| data28 = *(ao5 + 3); | |||
| data29 = *(ao5 + 4); | |||
| data30 = *(ao5 + 5); | |||
| data31 = *(ao6 + 0); | |||
| data32 = *(ao6 + 1); | |||
| data33 = *(ao6 + 2); | |||
| data34 = *(ao6 + 3); | |||
| data35 = *(ao6 + 4); | |||
| data36 = *(ao6 + 5); | |||
| b[ 0] = data01; | |||
| b[ 1] = data07; | |||
| b[ 2] = data13; | |||
| b[ 3] = data19; | |||
| b[ 4] = data25; | |||
| b[ 5] = data31; | |||
| b[ 6] = data02; | |||
| b[ 7] = data08; | |||
| b[ 8] = data14; | |||
| b[ 9] = data20; | |||
| b[10] = data26; | |||
| b[11] = data32; | |||
| b[12] = data03; | |||
| b[13] = data09; | |||
| b[14] = data15; | |||
| b[15] = data21; | |||
| b[16] = data27; | |||
| b[17] = data33; | |||
| b[18] = data04; | |||
| b[19] = data10; | |||
| b[20] = data16; | |||
| b[21] = data22; | |||
| b[22] = data28; | |||
| b[23] = data34; | |||
| b[24] = data05; | |||
| b[25] = data11; | |||
| b[26] = data17; | |||
| b[27] = data23; | |||
| b[28] = data29; | |||
| b[29] = data35; | |||
| b[30] = data06; | |||
| b[31] = data12; | |||
| b[32] = data18; | |||
| b[33] = data24; | |||
| b[34] = data30; | |||
| b[35] = data36; | |||
| ao1 += 6; | |||
| ao2 += 6; | |||
| ao3 += 6; | |||
| ao4 += 6; | |||
| ao5 += 6; | |||
| ao6 += 6; | |||
| b += 36; | |||
| } else | |||
| if (X > posY) { | |||
| b[ 0] = ZERO; | |||
| b[ 1] = ZERO; | |||
| b[ 2] = ZERO; | |||
| b[ 3] = ZERO; | |||
| b[ 4] = ZERO; | |||
| b[ 5] = ZERO; | |||
| b[ 6] = ZERO; | |||
| b[ 7] = ZERO; | |||
| b[ 8] = ZERO; | |||
| b[ 9] = ZERO; | |||
| b[10] = ZERO; | |||
| b[11] = ZERO; | |||
| b[12] = ZERO; | |||
| b[13] = ZERO; | |||
| b[14] = ZERO; | |||
| b[15] = ZERO; | |||
| b[16] = ZERO; | |||
| b[17] = ZERO; | |||
| b[18] = ZERO; | |||
| b[19] = ZERO; | |||
| b[20] = ZERO; | |||
| b[21] = ZERO; | |||
| b[22] = ZERO; | |||
| b[23] = ZERO; | |||
| b[24] = ZERO; | |||
| b[25] = ZERO; | |||
| b[26] = ZERO; | |||
| b[27] = ZERO; | |||
| b[28] = ZERO; | |||
| b[29] = ZERO; | |||
| b[30] = ZERO; | |||
| b[31] = ZERO; | |||
| b[32] = ZERO; | |||
| b[33] = ZERO; | |||
| b[34] = ZERO; | |||
| b[35] = ZERO; | |||
| ao1 += 6 * lda; | |||
| ao2 += 6 * lda; | |||
| ao3 += 6 * lda; | |||
| ao4 += 6 * lda; | |||
| ao5 += 6 * lda; | |||
| ao6 += 6 * lda; | |||
| b += 36; | |||
| } else { | |||
| data01 = *(ao1 + 0); | |||
| data07 = *(ao2 + 0); | |||
| data13 = *(ao3 + 0); | |||
| data19 = *(ao4 + 0); | |||
| data25 = *(ao5 + 0); | |||
| data31 = *(ao6 + 0); | |||
| data08 = *(ao2 + 1); | |||
| data14 = *(ao3 + 1); | |||
| data20 = *(ao4 + 1); | |||
| data26 = *(ao5 + 1); | |||
| data32 = *(ao6 + 1); | |||
| data15 = *(ao3 + 2); | |||
| data21 = *(ao4 + 2); | |||
| data27 = *(ao5 + 2); | |||
| data33 = *(ao6 + 2); | |||
| data22 = *(ao4 + 3); | |||
| data28 = *(ao5 + 3); | |||
| data34 = *(ao6 + 3); | |||
| data29 = *(ao5 + 4); | |||
| data35 = *(ao6 + 4); | |||
| data36 = *(ao6 + 5); | |||
| #ifdef UNIT | |||
| b[ 0] = ONE; | |||
| b[ 1] = data07; | |||
| b[ 2] = data13; | |||
| b[ 3] = data19; | |||
| b[ 4] = data25; | |||
| b[ 5] = data31; | |||
| b[ 6] = ZERO; | |||
| b[ 7] = ONE; | |||
| b[ 8] = data14; | |||
| b[ 9] = data20; | |||
| b[10] = data26; | |||
| b[11] = data32; | |||
| b[12] = ZERO; | |||
| b[13] = ZERO; | |||
| b[14] = ONE; | |||
| b[15] = data21; | |||
| b[16] = data27; | |||
| b[17] = data33; | |||
| b[18] = ZERO; | |||
| b[19] = ZERO; | |||
| b[20] = ZERO; | |||
| b[21] = ONE; | |||
| b[22] = data28; | |||
| b[23] = data34; | |||
| b[24] = ZERO; | |||
| b[25] = ZERO; | |||
| b[26] = ZERO; | |||
| b[27] = ZERO; | |||
| b[28] = ONE; | |||
| b[29] = data35; | |||
| b[30] = ZERO; | |||
| b[31] = ZERO; | |||
| b[32] = ZERO; | |||
| b[33] = ZERO; | |||
| b[34] = ZERO; | |||
| b[35] = ONE; | |||
| #else | |||
| b[ 0] = data01; | |||
| b[ 1] = data07; | |||
| b[ 2] = data13; | |||
| b[ 3] = data19; | |||
| b[ 4] = data25; | |||
| b[ 5] = data31; | |||
| b[ 6] = ZERO; | |||
| b[ 7] = data08; | |||
| b[ 8] = data14; | |||
| b[ 9] = data20; | |||
| b[10] = data26; | |||
| b[11] = data32; | |||
| b[12] = ZERO; | |||
| b[13] = ZERO; | |||
| b[14] = data15; | |||
| b[15] = data21; | |||
| b[16] = data27; | |||
| b[17] = data33; | |||
| b[18] = ZERO; | |||
| b[19] = ZERO; | |||
| b[20] = ZERO; | |||
| b[21] = data22; | |||
| b[22] = data28; | |||
| b[23] = data34; | |||
| b[24] = ZERO; | |||
| b[25] = ZERO; | |||
| b[26] = ZERO; | |||
| b[27] = ZERO; | |||
| b[28] = data29; | |||
| b[29] = data35; | |||
| b[30] = ZERO; | |||
| b[31] = ZERO; | |||
| b[32] = ZERO; | |||
| b[33] = ZERO; | |||
| b[34] = ZERO; | |||
| b[35] = data36; | |||
| #endif | |||
| ao1 += 6; | |||
| ao2 += 6; | |||
| ao3 += 6; | |||
| ao4 += 6; | |||
| ao5 += 6; | |||
| ao6 += 7; | |||
| b += 36; | |||
| } | |||
| X += 6; | |||
| i --; | |||
| } while (i > 0); | |||
| } | |||
| mm = m - m/6; | |||
| if (mm & 4) { | |||
| if (X < posY) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| data03 = *(ao1 + 2); | |||
| data04 = *(ao1 + 3); | |||
| data05 = *(ao2 + 0); | |||
| data06 = *(ao2 + 1); | |||
| data07 = *(ao2 + 2); | |||
| data08 = *(ao2 + 3); | |||
| data09 = *(ao3 + 0); | |||
| data10 = *(ao3 + 1); | |||
| data11 = *(ao3 + 2); | |||
| data12 = *(ao3 + 3); | |||
| data13 = *(ao4 + 0); | |||
| data14 = *(ao4 + 1); | |||
| data15 = *(ao4 + 2); | |||
| data16 = *(ao4 + 3); | |||
| b[ 0] = data01; | |||
| b[ 1] = data05; | |||
| b[ 2] = data09; | |||
| b[ 3] = data13; | |||
| b[ 4] = data02; | |||
| b[ 5] = data06; | |||
| b[ 6] = data10; | |||
| b[ 7] = data14; | |||
| b[ 8] = data03; | |||
| b[ 9] = data07; | |||
| b[10] = data11; | |||
| b[11] = data15; | |||
| b[12] = data04; | |||
| b[13] = data08; | |||
| b[14] = data12; | |||
| b[15] = data16; | |||
| ao1 += 4; | |||
| ao2 += 4; | |||
| ao3 += 4; | |||
| ao4 += 4; | |||
| b += 16; | |||
| } else | |||
| if (X > posY) { | |||
| b[ 0] = ZERO; | |||
| b[ 1] = ZERO; | |||
| b[ 2] = ZERO; | |||
| b[ 3] = ZERO; | |||
| b[ 4] = ZERO; | |||
| b[ 5] = ZERO; | |||
| b[ 6] = ZERO; | |||
| b[ 7] = ZERO; | |||
| b[ 8] = ZERO; | |||
| b[ 9] = ZERO; | |||
| b[10] = ZERO; | |||
| b[11] = ZERO; | |||
| b[12] = ZERO; | |||
| b[13] = ZERO; | |||
| b[14] = ZERO; | |||
| b[15] = ZERO; | |||
| b[16] = ZERO; | |||
| b[17] = ZERO; | |||
| b[18] = ZERO; | |||
| b[19] = ZERO; | |||
| b[20] = ZERO; | |||
| b[21] = ZERO; | |||
| b[22] = ZERO; | |||
| b[23] = ZERO; | |||
| ao1 += 4 * lda; | |||
| ao2 += 4 * lda; | |||
| ao3 += 4 * lda; | |||
| ao4 += 4 * lda; | |||
| b += 16; | |||
| } else { | |||
| #ifdef UNIT | |||
| data05 = *(ao2 + 0); | |||
| data09 = *(ao3 + 0); | |||
| data10 = *(ao3 + 1); | |||
| data13 = *(ao4 + 0); | |||
| data14 = *(ao4 + 1); | |||
| data15 = *(ao4 + 2); | |||
| b[ 0] = ONE; | |||
| b[ 1] = data05; | |||
| b[ 2] = data09; | |||
| b[ 3] = data13; | |||
| b[ 4] = ZERO; | |||
| b[ 5] = ONE; | |||
| b[ 6] = data10; | |||
| b[ 7] = data14; | |||
| b[ 8] = ZERO; | |||
| b[ 9] = ZERO; | |||
| b[10] = ONE; | |||
| b[11] = data15; | |||
| b[12] = ZERO; | |||
| b[13] = ZERO; | |||
| b[14] = ZERO; | |||
| b[15] = ONE; | |||
| #else | |||
| data01 = *(ao1 + 0); | |||
| data05 = *(ao2 + 0); | |||
| data06 = *(ao2 + 1); | |||
| data09 = *(ao3 + 0); | |||
| data10 = *(ao3 + 1); | |||
| data11 = *(ao3 + 2); | |||
| data13 = *(ao4 + 0); | |||
| data14 = *(ao4 + 1); | |||
| data15 = *(ao4 + 2); | |||
| data16 = *(ao4 + 3); | |||
| b[ 0] = data01; | |||
| b[ 1] = data05; | |||
| b[ 2] = data09; | |||
| b[ 3] = data13; | |||
| b[ 4] = ZERO; | |||
| b[ 5] = data06; | |||
| b[ 6] = data10; | |||
| b[ 7] = data14; | |||
| b[ 8] = ZERO; | |||
| b[ 9] = ZERO; | |||
| b[10] = data11; | |||
| b[11] = data15; | |||
| b[12] = ZERO; | |||
| b[13] = ZERO; | |||
| b[14] = ZERO; | |||
| b[15] = data16; | |||
| #endif | |||
| ao1 += 4; | |||
| ao2 += 4; | |||
| ao3 += 4; | |||
| ao4 += 4; | |||
| b += 16; | |||
| } | |||
| X += 4; | |||
| } | |||
| if (mm & 3) { | |||
| if (X < posY) { | |||
| if (mm & 2) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| data03 = *(ao2 + 0); | |||
| data04 = *(ao2 + 1); | |||
| data05 = *(ao3 + 0); | |||
| data06 = *(ao3 + 1); | |||
| data07 = *(ao4 + 0); | |||
| data08 = *(ao4 + 1); | |||
| b[ 0] = data01; | |||
| b[ 1] = data03; | |||
| b[ 2] = data05; | |||
| b[ 3] = data07; | |||
| b[ 4] = data02; | |||
| b[ 5] = data04; | |||
| b[ 6] = data06; | |||
| b[ 7] = data08; | |||
| ao1 += 2; | |||
| ao2 += 2; | |||
| ao3 += 2; | |||
| ao4 += 2; | |||
| b += 8; | |||
| } | |||
| if (mm & 1) { | |||
| data01 = *(ao1 + 0); | |||
| data03 = *(ao2 + 0); | |||
| data05 = *(ao3 + 0); | |||
| data07 = *(ao4 + 0); | |||
| b[ 0] = data01; | |||
| b[ 1] = data03; | |||
| b[ 2] = data05; | |||
| b[ 3] = data07; | |||
| ao1 += 1; | |||
| ao2 += 1; | |||
| ao3 += 1; | |||
| ao4 += 1; | |||
| b += 4; | |||
| } | |||
| } else | |||
| if (X > posY) { | |||
| if (m & 2) { | |||
| ao1 += 2 * lda; | |||
| ao2 += 2 * lda; | |||
| b += 8; | |||
| } | |||
| if (m & 1) { | |||
| ao1 += lda; | |||
| b += 4; | |||
| } | |||
| } else { | |||
| #ifdef UNIT | |||
| data05 = *(ao2 + 0); | |||
| data09 = *(ao3 + 0); | |||
| data13 = *(ao4 + 0); | |||
| if (i >= 2) { | |||
| data10 = *(ao3 + 1); | |||
| data14 = *(ao4 + 1); | |||
| } | |||
| if (i >= 3) { | |||
| data15 = *(ao4 + 2); | |||
| } | |||
| b[ 0] = ONE; | |||
| b[ 1] = data05; | |||
| b[ 2] = data09; | |||
| b[ 3] = data13; | |||
| b += 4; | |||
| if(i >= 2) { | |||
| b[ 0] = ZERO; | |||
| b[ 1] = ONE; | |||
| b[ 2] = data10; | |||
| b[ 3] = data14; | |||
| b += 4; | |||
| } | |||
| if (i >= 3) { | |||
| b[ 0] = ZERO; | |||
| b[ 1] = ZERO; | |||
| b[ 2] = ONE; | |||
| b[ 3] = data15; | |||
| b += 4; | |||
| } | |||
| #else | |||
| data01 = *(ao1 + 0); | |||
| data05 = *(ao2 + 0); | |||
| data09 = *(ao3 + 0); | |||
| data13 = *(ao4 + 0); | |||
| if (i >= 2) { | |||
| data06 = *(ao2 + 1); | |||
| data10 = *(ao3 + 1); | |||
| data14 = *(ao4 + 1); | |||
| } | |||
| if (i >= 3) { | |||
| data11 = *(ao3 + 2); | |||
| data15 = *(ao4 + 2); | |||
| } | |||
| b[ 0] = data01; | |||
| b[ 1] = data05; | |||
| b[ 2] = data09; | |||
| b[ 3] = data13; | |||
| b += 4; | |||
| if(i >= 2) { | |||
| b[ 0] = ZERO; | |||
| b[ 1] = data06; | |||
| b[ 2] = data10; | |||
| b[ 3] = data14; | |||
| b += 4; | |||
| } | |||
| if (i >= 3) { | |||
| b[ 0] = ZERO; | |||
| b[ 1] = ZERO; | |||
| b[ 2] = data11; | |||
| b[ 3] = data15; | |||
| b += 4; | |||
| } | |||
| #endif | |||
| } | |||
| } | |||
| posY += 4; | |||
| js --; | |||
| } while (js > 0); | |||
| } /* End of main loop */ | |||
| if (n & 2){ | |||
| X = posX; | |||
| if (posX <= posY) { | |||
| ao1 = a + posX + (posY + 0) * lda; | |||
| ao2 = a + posX + (posY + 1) * lda; | |||
| } else { | |||
| ao1 = a + posY + (posX + 0) * lda; | |||
| ao2 = a + posY + (posX + 1) * lda; | |||
| } | |||
| i = (m >> 1); | |||
| if (i > 0) { | |||
| do { | |||
| if (X < posY) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| data05 = *(ao2 + 0); | |||
| data06 = *(ao2 + 1); | |||
| b[ 0] = data01; | |||
| b[ 1] = data05; | |||
| b[ 2] = data02; | |||
| b[ 3] = data06; | |||
| ao1 += 2; | |||
| ao2 += 2; | |||
| b += 4; | |||
| } else | |||
| if (X > posY) { | |||
| ao1 += 2 * lda; | |||
| ao2 += 2 * lda; | |||
| b += 4; | |||
| } else { | |||
| #ifdef UNIT | |||
| data05 = *(ao2 + 0); | |||
| b[ 0] = ONE; | |||
| b[ 1] = data05; | |||
| b[ 2] = ZERO; | |||
| b[ 3] = ONE; | |||
| #else | |||
| data01 = *(ao1 + 0); | |||
| data05 = *(ao2 + 0); | |||
| data06 = *(ao2 + 1); | |||
| b[ 0] = data01; | |||
| b[ 1] = data05; | |||
| b[ 2] = ZERO; | |||
| b[ 3] = data06; | |||
| #endif | |||
| ao1 += 2 * lda; | |||
| ao2 += 2 * lda; | |||
| b += 4; | |||
| } | |||
| X += 2; | |||
| i --; | |||
| } while (i > 0); | |||
| } | |||
| i = (m & 1); | |||
| if (i) { | |||
| if (X < posY) { | |||
| data01 = *(ao1 + 0); | |||
| data05 = *(ao2 + 0); | |||
| b[ 0] = data01; | |||
| b[ 1] = data05; | |||
| ao1 += 1; | |||
| ao2 += 1; | |||
| b += 2; | |||
| } else | |||
| if (X > posY) { | |||
| ao1 += lda; | |||
| ao2 += lda; | |||
| b += 2; | |||
| } else { | |||
| #ifdef UNIT | |||
| data05 = *(ao2 + 0); | |||
| b[ 0] = ONE; | |||
| b[ 1] = data05; | |||
| #else | |||
| data01 = *(ao1 + 0); | |||
| data05 = *(ao2 + 0); | |||
| b[ 0] = data01; | |||
| b[ 1] = data05; | |||
| #endif | |||
| ao1 += lda; | |||
| ao2 += lda; | |||
| b += 2; | |||
| } | |||
| } | |||
| posY += 2; | |||
| } | |||
| if (n & 1){ | |||
| X = posX; | |||
| if (posX <= posY) { | |||
| ao1 = a + posX + (posY + 0) * lda; | |||
| } else { | |||
| ao1 = a + posY + (posX + 0) * lda; | |||
| } | |||
| i = m; | |||
| if (m > 0) { | |||
| do { | |||
| if (X < posY) { | |||
| data01 = *(ao1 + 0); | |||
| b[ 0] = data01; | |||
| ao1 += 1; | |||
| b += 1; | |||
| } else | |||
| if (X > posY) { | |||
| ao1 += lda; | |||
| b += 1; | |||
| } else { | |||
| #ifdef UNIT | |||
| b[ 0] = ONE; | |||
| #else | |||
| data01 = *(ao1 + 0); | |||
| b[ 0] = data01; | |||
| #endif | |||
| ao1 += lda; | |||
| b += 1; | |||
| } | |||
| X += 1; | |||
| i --; | |||
| } while (i > 0); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,472 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
| BLASLONG i, js; | |||
| BLASLONG X; | |||
| FLOAT data01, data02, data03, data04, data05, data06, data07, data08; | |||
| FLOAT data09, data10, data11, data12, data13, data14, data15, data16; | |||
| FLOAT *ao1, *ao2, *ao3, *ao4; | |||
| js = (n >> 2); | |||
| if (js > 0){ | |||
| do { | |||
| X = posX; | |||
| if (posX <= posY) { | |||
| ao1 = a + posX + (posY + 0) * lda; | |||
| ao2 = a + posX + (posY + 1) * lda; | |||
| ao3 = a + posX + (posY + 2) * lda; | |||
| ao4 = a + posX + (posY + 3) * lda; | |||
| } else { | |||
| ao1 = a + posY + (posX + 0) * lda; | |||
| ao2 = a + posY + (posX + 1) * lda; | |||
| ao3 = a + posY + (posX + 2) * lda; | |||
| ao4 = a + posY + (posX + 3) * lda; | |||
| } | |||
| i = (m >> 2); | |||
| if (i > 0) { | |||
| do { | |||
| if (X < posY) { | |||
| ao1 += 4; | |||
| ao2 += 4; | |||
| ao3 += 4; | |||
| ao4 += 4; | |||
| b += 16; | |||
| } else | |||
| if (X > posY) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| data03 = *(ao1 + 2); | |||
| data04 = *(ao1 + 3); | |||
| data05 = *(ao2 + 0); | |||
| data06 = *(ao2 + 1); | |||
| data07 = *(ao2 + 2); | |||
| data08 = *(ao2 + 3); | |||
| data09 = *(ao3 + 0); | |||
| data10 = *(ao3 + 1); | |||
| data11 = *(ao3 + 2); | |||
| data12 = *(ao3 + 3); | |||
| data13 = *(ao4 + 0); | |||
| data14 = *(ao4 + 1); | |||
| data15 = *(ao4 + 2); | |||
| data16 = *(ao4 + 3); | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| b[ 2] = data03; | |||
| b[ 3] = data04; | |||
| b[ 4] = data05; | |||
| b[ 5] = data06; | |||
| b[ 6] = data07; | |||
| b[ 7] = data08; | |||
| b[ 8] = data09; | |||
| b[ 9] = data10; | |||
| b[10] = data11; | |||
| b[11] = data12; | |||
| b[12] = data13; | |||
| b[13] = data14; | |||
| b[14] = data15; | |||
| b[15] = data16; | |||
| ao1 += 4 * lda; | |||
| ao2 += 4 * lda; | |||
| ao3 += 4 * lda; | |||
| ao4 += 4 * lda; | |||
| b += 16; | |||
| } else { | |||
| #ifdef UNIT | |||
| data05 = *(ao2 + 0); | |||
| data09 = *(ao3 + 0); | |||
| data10 = *(ao3 + 1); | |||
| data13 = *(ao4 + 0); | |||
| data14 = *(ao4 + 1); | |||
| data15 = *(ao4 + 2); | |||
| b[ 0] = ONE; | |||
| b[ 1] = ZERO; | |||
| b[ 2] = ZERO; | |||
| b[ 3] = ZERO; | |||
| b[ 4] = data05; | |||
| b[ 5] = ONE; | |||
| b[ 6] = ZERO; | |||
| b[ 7] = ZERO; | |||
| b[ 8] = data09; | |||
| b[ 9] = data10; | |||
| b[10] = ONE; | |||
| b[11] = ZERO; | |||
| b[12] = data13; | |||
| b[13] = data14; | |||
| b[14] = data15; | |||
| b[15] = ONE; | |||
| #else | |||
| data01 = *(ao1 + 0); | |||
| data05 = *(ao2 + 0); | |||
| data06 = *(ao2 + 1); | |||
| data09 = *(ao3 + 0); | |||
| data10 = *(ao3 + 1); | |||
| data11 = *(ao3 + 2); | |||
| data13 = *(ao4 + 0); | |||
| data14 = *(ao4 + 1); | |||
| data15 = *(ao4 + 2); | |||
| data16 = *(ao4 + 3); | |||
| b[ 0] = data01; | |||
| b[ 1] = ZERO; | |||
| b[ 2] = ZERO; | |||
| b[ 3] = ZERO; | |||
| b[ 4] = data05; | |||
| b[ 5] = data06; | |||
| b[ 6] = ZERO; | |||
| b[ 7] = ZERO; | |||
| b[ 8] = data09; | |||
| b[ 9] = data10; | |||
| b[10] = data11; | |||
| b[11] = ZERO; | |||
| b[12] = data13; | |||
| b[13] = data14; | |||
| b[14] = data15; | |||
| b[15] = data16; | |||
| #endif | |||
| ao1 += 4 * lda; | |||
| ao2 += 4 * lda; | |||
| ao3 += 4 * lda; | |||
| ao4 += 4 * lda; | |||
| b += 16; | |||
| } | |||
| X += 4; | |||
| i --; | |||
| } while (i > 0); | |||
| } | |||
| i = (m & 3); | |||
| if (i) { | |||
| if (X < posY) { | |||
| if (m & 2) { | |||
| ao1 += 2; | |||
| ao2 += 2; | |||
| ao3 += 2; | |||
| ao4 += 2; | |||
| b += 8; | |||
| } | |||
| if (m & 1) { | |||
| ao1 += 1; | |||
| ao2 += 1; | |||
| ao3 += 1; | |||
| ao4 += 1; | |||
| b += 4; | |||
| } | |||
| } else | |||
| if (X > posY) { | |||
| if (m & 2) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| data03 = *(ao1 + 2); | |||
| data04 = *(ao1 + 3); | |||
| data05 = *(ao2 + 0); | |||
| data06 = *(ao2 + 1); | |||
| data07 = *(ao2 + 2); | |||
| data08 = *(ao2 + 3); | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| b[ 2] = data03; | |||
| b[ 3] = data04; | |||
| b[ 4] = data05; | |||
| b[ 5] = data06; | |||
| b[ 6] = data07; | |||
| b[ 7] = data08; | |||
| ao1 += 2 * lda; | |||
| ao2 += 2 * lda; | |||
| b += 8; | |||
| } | |||
| if (m & 1) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| data03 = *(ao1 + 2); | |||
| data04 = *(ao1 + 3); | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| b[ 2] = data03; | |||
| b[ 3] = data04; | |||
| ao1 += lda; | |||
| b += 4; | |||
| } | |||
| } else { | |||
| #ifdef UNIT | |||
| if (i >= 2) { | |||
| data05 = *(ao2 + 0); | |||
| } | |||
| if (i >= 3) { | |||
| data09 = *(ao3 + 0); | |||
| data10 = *(ao3 + 1); | |||
| } | |||
| b[ 0] = ONE; | |||
| b[ 1] = ZERO; | |||
| b[ 2] = ZERO; | |||
| b[ 3] = ZERO; | |||
| b += 4; | |||
| if(i >= 2) { | |||
| b[ 0] = data05; | |||
| b[ 1] = ONE; | |||
| b[ 2] = ZERO; | |||
| b[ 3] = ZERO; | |||
| b += 4; | |||
| } | |||
| if (i >= 3) { | |||
| b[ 0] = data09; | |||
| b[ 1] = data10; | |||
| b[ 2] = ONE; | |||
| b[ 3] = ZERO; | |||
| b += 4; | |||
| } | |||
| #else | |||
| data01 = *(ao1 + 0); | |||
| if (i >= 2) { | |||
| data05 = *(ao2 + 0); | |||
| data06 = *(ao2 + 1); | |||
| } | |||
| if (i >= 3) { | |||
| data09 = *(ao3 + 0); | |||
| data10 = *(ao3 + 1); | |||
| data11 = *(ao3 + 2); | |||
| } | |||
| b[ 0] = data01; | |||
| b[ 1] = ZERO; | |||
| b[ 2] = ZERO; | |||
| b[ 3] = ZERO; | |||
| b += 4; | |||
| if(i >= 2) { | |||
| b[ 0] = data05; | |||
| b[ 1] = data06; | |||
| b[ 2] = ZERO; | |||
| b[ 3] = ZERO; | |||
| b += 4; | |||
| } | |||
| if (i >= 3) { | |||
| b[ 0] = data09; | |||
| b[ 1] = data10; | |||
| b[ 2] = data11; | |||
| b[ 3] = ZERO; | |||
| b += 4; | |||
| } | |||
| #endif | |||
| } | |||
| } | |||
| posY += 4; | |||
| js --; | |||
| } while (js > 0); | |||
| } /* End of main loop */ | |||
| if (n & 2){ | |||
| X = posX; | |||
| if (posX <= posY) { | |||
| ao1 = a + posX + (posY + 0) * lda; | |||
| ao2 = a + posX + (posY + 1) * lda; | |||
| } else { | |||
| ao1 = a + posY + (posX + 0) * lda; | |||
| ao2 = a + posY + (posX + 1) * lda; | |||
| } | |||
| i = (m >> 1); | |||
| if (i > 0) { | |||
| do { | |||
| if (X < posY) { | |||
| ao1 += 2; | |||
| ao2 += 2; | |||
| b += 4; | |||
| } else | |||
| if (X > posY) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| data05 = *(ao2 + 0); | |||
| data06 = *(ao2 + 1); | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| b[ 2] = data05; | |||
| b[ 3] = data06; | |||
| ao1 += 2 * lda; | |||
| ao2 += 2 * lda; | |||
| b += 4; | |||
| } else { | |||
| #ifdef UNIT | |||
| data05 = *(ao2 + 0); | |||
| b[ 0] = ONE; | |||
| b[ 1] = ZERO; | |||
| b[ 2] = data05; | |||
| b[ 3] = ONE; | |||
| #else | |||
| data01 = *(ao1 + 0); | |||
| data05 = *(ao2 + 0); | |||
| data06 = *(ao2 + 1); | |||
| b[ 0] = data01; | |||
| b[ 1] = ZERO; | |||
| b[ 2] = data05; | |||
| b[ 3] = data06; | |||
| #endif | |||
| ao1 += 2 * lda; | |||
| ao2 += 2 * lda; | |||
| b += 4; | |||
| } | |||
| X += 2; | |||
| i --; | |||
| } while (i > 0); | |||
| } | |||
| i = (m & 1); | |||
| if (i) { | |||
| if (X < posY) { | |||
| ao1 += 2; | |||
| b += 2; | |||
| } else | |||
| if (X > posY) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| ao1 += lda; | |||
| b += 2; | |||
| } else { | |||
| #ifdef UNIT | |||
| b[ 0] = ONE; | |||
| b[ 1] = ZERO; | |||
| #else | |||
| data01 = *(ao1 + 0); | |||
| b[ 0] = data01; | |||
| b[ 1] = ZERO; | |||
| #endif | |||
| b += 2; | |||
| } | |||
| } | |||
| posY += 2; | |||
| } | |||
| if (n & 1){ | |||
| X = posX; | |||
| if (posX <= posY) { | |||
| ao1 = a + posX + (posY + 0) * lda; | |||
| } else { | |||
| ao1 = a + posY + (posX + 0) * lda; | |||
| } | |||
| i = m; | |||
| if (m > 0) { | |||
| do { | |||
| if (X < posY) { | |||
| b += 1; | |||
| ao1 += 1; | |||
| } else | |||
| if (X > posY) { | |||
| data01 = *(ao1 + 0); | |||
| b[ 0] = data01; | |||
| ao1 += lda; | |||
| b += 1; | |||
| } else { | |||
| #ifdef UNIT | |||
| b[ 0] = ONE; | |||
| #else | |||
| data01 = *(ao1 + 0); | |||
| b[ 0] = data01; | |||
| #endif | |||
| ao1 += lda; | |||
| b += 1; | |||
| } | |||
| X += 1; | |||
| i --; | |||
| } while (i > 0); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -58,6 +58,10 @@ static FLOAT dm1 = -1.; | |||
| #define GEMM_UNROLL_M_SHIFT 2 | |||
| #endif | |||
/* M-unroll of 6: the shift is defined as 2, the same value as the #define on
   the context line just above.  6 is not a power of two, so
   (1 << GEMM_UNROLL_M_SHIFT) is 4 here, not 6.
   NOTE(review): confirm that every user of this shift tolerates that. */
| #if GEMM_DEFAULT_UNROLL_M == 6 | |||
| #define GEMM_UNROLL_M_SHIFT 2 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_M == 8 | |||
| #define GEMM_UNROLL_M_SHIFT 3 | |||
| #endif | |||
| @@ -58,6 +58,10 @@ static FLOAT dm1 = -1.; | |||
| #define GEMM_UNROLL_M_SHIFT 2 | |||
| #endif | |||
/* M-unroll of 6: the shift is defined as 2, the same value as the #define on
   the context line just above.  6 is not a power of two, so
   (1 << GEMM_UNROLL_M_SHIFT) is 4 here, not 6.
   NOTE(review): confirm that every user of this shift tolerates that. */
| #if GEMM_DEFAULT_UNROLL_M == 6 | |||
| #define GEMM_UNROLL_M_SHIFT 2 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_M == 8 | |||
| #define GEMM_UNROLL_M_SHIFT 3 | |||
| #endif | |||
| @@ -58,6 +58,10 @@ static FLOAT dm1 = -1.; | |||
| #define GEMM_UNROLL_M_SHIFT 2 | |||
| #endif | |||
/* M-unroll of 6: the shift is defined as 2, the same value as the #define on
   the context line just above.  6 is not a power of two, so
   (1 << GEMM_UNROLL_M_SHIFT) is 4 here, not 6.
   NOTE(review): confirm that every user of this shift tolerates that. */
| #if GEMM_DEFAULT_UNROLL_M == 6 | |||
| #define GEMM_UNROLL_M_SHIFT 2 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_M == 8 | |||
| #define GEMM_UNROLL_M_SHIFT 3 | |||
| #endif | |||
| @@ -58,6 +58,11 @@ static FLOAT dm1 = -1.; | |||
| #define GEMM_UNROLL_M_SHIFT 2 | |||
| #endif | |||
/* M-unroll of 6: the shift is defined as 2, the same value as the #define on
   the context line just above.  6 is not a power of two, so
   (1 << GEMM_UNROLL_M_SHIFT) is 4 here, not 6.
   NOTE(review): confirm that every user of this shift tolerates that. */
| #if GEMM_DEFAULT_UNROLL_M == 6 | |||
| #define GEMM_UNROLL_M_SHIFT 2 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_M == 8 | |||
| #define GEMM_UNROLL_M_SHIFT 3 | |||
| #endif | |||
/* NOTE(review): this hunk's header reads "+58,11" while the three otherwise
   identical hunks before it read "+58,10", yet only the same three added
   lines are visible here -- presumably an added blank line was dropped from
   this view; confirm against the original patch. */
| @@ -0,0 +1,326 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #ifndef UNIT | |||
| #define INV(a) (ONE / (a)) | |||
| #else | |||
| #define INV(a) (ONE) | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
| BLASLONG i, ii, j, jj; | |||
| FLOAT data01, data02, data03, data04, data05, data06, data07, data08; | |||
| FLOAT data09, data10, data11, data12, data13, data14, data15, data16; | |||
| FLOAT *a1, *a2, *a3, *a4; | |||
| jj = offset; | |||
| j = (n >> 2); | |||
| while (j > 0){ | |||
| a1 = a + 0 * lda; | |||
| a2 = a + 1 * lda; | |||
| a3 = a + 2 * lda; | |||
| a4 = a + 3 * lda; | |||
| i = (m >> 2); | |||
| ii = 0; | |||
| while (i > 0) { | |||
| if (ii == jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| data02 = *(a1 + 1); | |||
| data03 = *(a1 + 2); | |||
| data04 = *(a1 + 3); | |||
| #ifndef UNIT | |||
| data06 = *(a2 + 1); | |||
| #endif | |||
| data07 = *(a2 + 2); | |||
| data08 = *(a2 + 3); | |||
| #ifndef UNIT | |||
| data11 = *(a3 + 2); | |||
| #endif | |||
| data12 = *(a3 + 3); | |||
| #ifndef UNIT | |||
| data16 = *(a4 + 3); | |||
| #endif | |||
| *(b + 0) = INV(data01); | |||
| *(b + 4) = data02; | |||
| *(b + 5) = INV(data06); | |||
| *(b + 8) = data03; | |||
| *(b + 9) = data07; | |||
| *(b + 10) = INV(data11); | |||
| *(b + 12) = data04; | |||
| *(b + 13) = data08; | |||
| *(b + 14) = data12; | |||
| *(b + 15) = INV(data16); | |||
| } | |||
| if (ii > jj) { | |||
| data01 = *(a1 + 0); | |||
| data02 = *(a1 + 1); | |||
| data03 = *(a1 + 2); | |||
| data04 = *(a1 + 3); | |||
| data05 = *(a2 + 0); | |||
| data06 = *(a2 + 1); | |||
| data07 = *(a2 + 2); | |||
| data08 = *(a2 + 3); | |||
| data09 = *(a3 + 0); | |||
| data10 = *(a3 + 1); | |||
| data11 = *(a3 + 2); | |||
| data12 = *(a3 + 3); | |||
| data13 = *(a4 + 0); | |||
| data14 = *(a4 + 1); | |||
| data15 = *(a4 + 2); | |||
| data16 = *(a4 + 3); | |||
| *(b + 0) = data01; | |||
| *(b + 1) = data05; | |||
| *(b + 2) = data09; | |||
| *(b + 3) = data13; | |||
| *(b + 4) = data02; | |||
| *(b + 5) = data06; | |||
| *(b + 6) = data10; | |||
| *(b + 7) = data14; | |||
| *(b + 8) = data03; | |||
| *(b + 9) = data07; | |||
| *(b + 10) = data11; | |||
| *(b + 11) = data15; | |||
| *(b + 12) = data04; | |||
| *(b + 13) = data08; | |||
| *(b + 14) = data12; | |||
| *(b + 15) = data16; | |||
| } | |||
| a1 += 4; | |||
| a2 += 4; | |||
| a3 += 4; | |||
| a4 += 4; | |||
| b += 16; | |||
| i --; | |||
| ii += 4; | |||
| } | |||
| if ((m & 2) != 0) { | |||
| if (ii== jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| data02 = *(a1 + 1); | |||
| #ifndef UNIT | |||
| data06 = *(a2 + 1); | |||
| #endif | |||
| *(b + 0) = INV(data01); | |||
| *(b + 4) = data02; | |||
| *(b + 5) = INV(data06); | |||
| } | |||
| if (ii > jj) { | |||
| data01 = *(a1 + 0); | |||
| data02 = *(a1 + 1); | |||
| data03 = *(a2 + 0); | |||
| data04 = *(a2 + 1); | |||
| data05 = *(a3 + 0); | |||
| data06 = *(a3 + 1); | |||
| data07 = *(a4 + 0); | |||
| data08 = *(a4 + 1); | |||
| *(b + 0) = data01; | |||
| *(b + 1) = data03; | |||
| *(b + 2) = data05; | |||
| *(b + 3) = data07; | |||
| *(b + 4) = data02; | |||
| *(b + 5) = data04; | |||
| *(b + 6) = data06; | |||
| *(b + 7) = data08; | |||
| } | |||
| a1 += 2; | |||
| a2 += 2; | |||
| a3 += 2; | |||
| a4 += 2; | |||
| b += 8; | |||
| ii += 2; | |||
| } | |||
| if ((m & 1) != 0) { | |||
| if (ii== jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| *(b + 0) = INV(data01); | |||
| } | |||
| if (ii > jj) { | |||
| data01 = *(a1 + 0); | |||
| data02 = *(a2 + 0); | |||
| data03 = *(a3 + 0); | |||
| data04 = *(a4 + 0); | |||
| *(b + 0) = data01; | |||
| *(b + 1) = data02; | |||
| *(b + 2) = data03; | |||
| *(b + 3) = data04; | |||
| } | |||
| b += 4; | |||
| } | |||
| a += 4 * lda; | |||
| jj += 4; | |||
| j --; | |||
| } | |||
| if (n & 2) { | |||
| a1 = a + 0 * lda; | |||
| a2 = a + 1 * lda; | |||
| i = (m >> 1); | |||
| ii = 0; | |||
| while (i > 0) { | |||
| if (ii == jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| data02 = *(a1 + 1); | |||
| #ifndef UNIT | |||
| data04 = *(a2 + 1); | |||
| #endif | |||
| *(b + 0) = INV(data01); | |||
| *(b + 2) = data02; | |||
| *(b + 3) = INV(data04); | |||
| } | |||
| if (ii > jj) { | |||
| data01 = *(a1 + 0); | |||
| data02 = *(a1 + 1); | |||
| data03 = *(a2 + 0); | |||
| data04 = *(a2 + 1); | |||
| *(b + 0) = data01; | |||
| *(b + 1) = data03; | |||
| *(b + 2) = data02; | |||
| *(b + 3) = data04; | |||
| } | |||
| a1 += 2; | |||
| a2 += 2; | |||
| b += 4; | |||
| i --; | |||
| ii += 2; | |||
| } | |||
| if ((m & 1) != 0) { | |||
| if (ii== jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| *(b + 0) = INV(data01); | |||
| } | |||
| if (ii > jj) { | |||
| data01 = *(a1 + 0); | |||
| data02 = *(a2 + 0); | |||
| *(b + 0) = data01; | |||
| *(b + 1) = data02; | |||
| } | |||
| b += 2; | |||
| } | |||
| a += 2 * lda; | |||
| jj += 2; | |||
| } | |||
| if (n & 1) { | |||
| a1 = a + 0 * lda; | |||
| i = m; | |||
| ii = 0; | |||
| while (i > 0) { | |||
| if (ii == jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| *(b + 0) = INV(data01); | |||
| } | |||
| if (ii > jj) { | |||
| data01 = *(a1 + 0); | |||
| *(b + 0) = data01; | |||
| } | |||
| a1+= 1; | |||
| b += 1; | |||
| i --; | |||
| ii += 1; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,346 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #ifndef UNIT | |||
| #define INV(a) (ONE / (a)) | |||
| #else | |||
| #define INV(a) (ONE) | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
| BLASLONG i, ii, j, jj; | |||
| FLOAT data01, data02, data03, data04, data05, data06, data07, data08; | |||
| FLOAT data09, data10, data11, data12, data13, data14, data15, data16; | |||
| FLOAT *a1, *a2, *a3, *a4; | |||
| jj = offset; | |||
| j = (n >> 2); | |||
| while (j > 0){ | |||
| a1 = a + 0 * lda; | |||
| a2 = a + 1 * lda; | |||
| a3 = a + 2 * lda; | |||
| a4 = a + 3 * lda; | |||
| i = (m >> 2); | |||
| ii = 0; | |||
| while (i > 0) { | |||
| if (ii == jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| data02 = *(a1 + 1); | |||
| data03 = *(a1 + 2); | |||
| data04 = *(a1 + 3); | |||
| #ifndef UNIT | |||
| data06 = *(a2 + 1); | |||
| #endif | |||
| data07 = *(a2 + 2); | |||
| data08 = *(a2 + 3); | |||
| #ifndef UNIT | |||
| data11 = *(a3 + 2); | |||
| #endif | |||
| data12 = *(a3 + 3); | |||
| #ifndef UNIT | |||
| data16 = *(a4 + 3); | |||
| #endif | |||
| *(b + 0) = INV(data01); | |||
| *(b + 1) = data02; | |||
| *(b + 2) = data03; | |||
| *(b + 3) = data04; | |||
| *(b + 5) = INV(data06); | |||
| *(b + 6) = data07; | |||
| *(b + 7) = data08; | |||
| *(b + 10) = INV(data11); | |||
| *(b + 11) = data12; | |||
| *(b + 15) = INV(data16); | |||
| } | |||
| if (ii < jj) { | |||
| data01 = *(a1 + 0); | |||
| data02 = *(a1 + 1); | |||
| data03 = *(a1 + 2); | |||
| data04 = *(a1 + 3); | |||
| data05 = *(a2 + 0); | |||
| data06 = *(a2 + 1); | |||
| data07 = *(a2 + 2); | |||
| data08 = *(a2 + 3); | |||
| data09 = *(a3 + 0); | |||
| data10 = *(a3 + 1); | |||
| data11 = *(a3 + 2); | |||
| data12 = *(a3 + 3); | |||
| data13 = *(a4 + 0); | |||
| data14 = *(a4 + 1); | |||
| data15 = *(a4 + 2); | |||
| data16 = *(a4 + 3); | |||
| *(b + 0) = data01; | |||
| *(b + 1) = data02; | |||
| *(b + 2) = data03; | |||
| *(b + 3) = data04; | |||
| *(b + 4) = data05; | |||
| *(b + 5) = data06; | |||
| *(b + 6) = data07; | |||
| *(b + 7) = data08; | |||
| *(b + 8) = data09; | |||
| *(b + 9) = data10; | |||
| *(b + 10) = data11; | |||
| *(b + 11) = data12; | |||
| *(b + 12) = data13; | |||
| *(b + 13) = data14; | |||
| *(b + 14) = data15; | |||
| *(b + 15) = data16; | |||
| } | |||
| a1 += 4 * lda; | |||
| a2 += 4 * lda; | |||
| a3 += 4 * lda; | |||
| a4 += 4 * lda; | |||
| b += 16; | |||
| i --; | |||
| ii += 4; | |||
| } | |||
| if ((m & 2) != 0) { | |||
| if (ii== jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| data02 = *(a1 + 1); | |||
| data03 = *(a1 + 2); | |||
| data04 = *(a1 + 3); | |||
| #ifndef UNIT | |||
| data06 = *(a2 + 1); | |||
| #endif | |||
| data07 = *(a2 + 2); | |||
| data08 = *(a2 + 3); | |||
| *(b + 0) = INV(data01); | |||
| *(b + 1) = data02; | |||
| *(b + 2) = data03; | |||
| *(b + 3) = data04; | |||
| *(b + 5) = INV(data06); | |||
| *(b + 6) = data07; | |||
| *(b + 7) = data08; | |||
| } | |||
| if (ii < jj) { | |||
| data01 = *(a1 + 0); | |||
| data02 = *(a1 + 1); | |||
| data03 = *(a1 + 2); | |||
| data04 = *(a1 + 3); | |||
| data05 = *(a2 + 0); | |||
| data06 = *(a2 + 1); | |||
| data07 = *(a2 + 2); | |||
| data08 = *(a2 + 3); | |||
| *(b + 0) = data01; | |||
| *(b + 1) = data02; | |||
| *(b + 2) = data03; | |||
| *(b + 3) = data04; | |||
| *(b + 4) = data05; | |||
| *(b + 5) = data06; | |||
| *(b + 6) = data07; | |||
| *(b + 7) = data08; | |||
| } | |||
| a1 += 2 * lda; | |||
| a2 += 2 * lda; | |||
| b += 8; | |||
| ii += 2; | |||
| } | |||
| if ((m & 1) != 0) { | |||
| if (ii== jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| data02 = *(a1 + 1); | |||
| data03 = *(a1 + 2); | |||
| data04 = *(a1 + 3); | |||
| *(b + 0) = INV(data01); | |||
| *(b + 1) = data02; | |||
| *(b + 2) = data03; | |||
| *(b + 3) = data04; | |||
| } | |||
| if (ii < jj) { | |||
| data01 = *(a1 + 0); | |||
| data02 = *(a1 + 1); | |||
| data03 = *(a1 + 2); | |||
| data04 = *(a1 + 3); | |||
| *(b + 0) = data01; | |||
| *(b + 1) = data02; | |||
| *(b + 2) = data03; | |||
| *(b + 3) = data04; | |||
| } | |||
| b += 4; | |||
| } | |||
| a += 4; | |||
| jj += 4; | |||
| j --; | |||
| } | |||
| if (n & 2) { | |||
| a1 = a + 0 * lda; | |||
| a2 = a + 1 * lda; | |||
| i = (m >> 1); | |||
| ii = 0; | |||
| while (i > 0) { | |||
| if (ii == jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| data02 = *(a1 + 1); | |||
| #ifndef UNIT | |||
| data04 = *(a2 + 1); | |||
| #endif | |||
| *(b + 0) = INV(data01); | |||
| *(b + 1) = data02; | |||
| *(b + 3) = INV(data04); | |||
| } | |||
| if (ii < jj) { | |||
| data01 = *(a1 + 0); | |||
| data02 = *(a1 + 1); | |||
| data03 = *(a2 + 0); | |||
| data04 = *(a2 + 1); | |||
| *(b + 0) = data01; | |||
| *(b + 1) = data02; | |||
| *(b + 2) = data03; | |||
| *(b + 3) = data04; | |||
| } | |||
| a1 += 2 * lda; | |||
| a2 += 2 * lda; | |||
| b += 4; | |||
| i --; | |||
| ii += 2; | |||
| } | |||
| if ((m & 1) != 0) { | |||
| if (ii== jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| *(b + 0) = INV(data01); | |||
| } | |||
| if (ii < jj) { | |||
| data01 = *(a1 + 0); | |||
| data02 = *(a1 + 1); | |||
| *(b + 0) = data01; | |||
| *(b + 1) = data02; | |||
| } | |||
| b += 2; | |||
| } | |||
| a += 2; | |||
| jj += 2; | |||
| } | |||
| if (n & 1) { | |||
| a1 = a + 0 * lda; | |||
| i = m; | |||
| ii = 0; | |||
| while (i > 0) { | |||
| if (ii == jj) { | |||
| #ifndef UNIT | |||
| data01 = *(a1 + 0); | |||
| #endif | |||
| *(b + 0) = INV(data01); | |||
| } | |||
| if (ii < jj) { | |||
| data01 = *(a1 + 0); | |||
| *(b + 0) = data01; | |||
| } | |||
| a1 += 1 * lda; | |||
| b += 1; | |||
| i --; | |||
| ii += 1; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,350 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| /* INV(a): reciprocal of a; when UNIT is defined it is the constant ONE and a is never evaluated. */ | |||
| #ifndef UNIT | |||
| #define INV(a) (ONE / (a)) | |||
| #else | |||
| #define INV(a) (ONE) | |||
| #endif | |||
| /* | |||
|  * Helpers for CNAME below.  A "tile" is a rows x cols sub-block of the | |||
|  * column-major matrix a: element (r, c) is read from tile[r + c * lda] and | |||
|  * stored row by row, i.e. at b[cols * r + c]. | |||
|  */ | |||
| /* Copies every element of the tile (used when the tile starts above row jj). */ | |||
| static void trsm_pack_plain(const FLOAT *tile, BLASLONG lda, BLASLONG rows, BLASLONG cols, FLOAT *b){ | |||
|   BLASLONG r, c; | |||
|   for (r = 0; r < rows; r++) { | |||
|     for (c = 0; c < cols; c++) { | |||
|       b[cols * r + c] = tile[r + c * lda]; | |||
|     } | |||
|   } | |||
| } | |||
| /* | |||
|  * Packs the tile that starts exactly at row jj.  In each row the diagonal | |||
|  * element is stored as INV() of itself (ONE when UNIT is defined, in which | |||
|  * case the source diagonal is never read), the elements to its right are | |||
|  * copied, and the slots to its left are left untouched in b. | |||
|  */ | |||
| static void trsm_pack_diag(const FLOAT *tile, BLASLONG lda, BLASLONG rows, BLASLONG cols, FLOAT *b){ | |||
|   BLASLONG r, c; | |||
|   for (r = 0; r < rows; r++) { | |||
|     b[cols * r + r] = INV(tile[r + r * lda]); | |||
|     for (c = r + 1; c < cols; c++) { | |||
|       b[cols * r + c] = tile[r + c * lda]; | |||
|     } | |||
|   } | |||
| } | |||
| /* | |||
|  * Packs one panel of `width` columns starting at a.  Rows are consumed in | |||
|  * groups of `width`, then in one group per remaining power-of-two bit of m. | |||
|  * Every group occupies rows * width slots of b whether or not it is stored. | |||
|  * Returns the position in b just past the panel. | |||
|  */ | |||
| static FLOAT *trsm_pack_panel(BLASLONG m, BLASLONG width, const FLOAT *a, BLASLONG lda, BLASLONG jj, FLOAT *b){ | |||
|   BLASLONG rows, groups, ii = 0; | |||
|   for (rows = width; rows > 0; rows >>= 1) { | |||
|     /* Full-height groups first, afterwards at most one group per remainder bit. */ | |||
|     groups = (rows == width) ? (m / width) : ((m & rows) ? 1 : 0); | |||
|     for (; groups > 0; groups--) { | |||
|       if (ii == jj) { | |||
|         trsm_pack_diag(a + ii, lda, rows, width, b); | |||
|       } else if (ii < jj) { | |||
|         trsm_pack_plain(a + ii, lda, rows, width, b); | |||
|       } | |||
|       /* ii > jj: nothing is stored, the slots are only skipped. */ | |||
|       b += rows * width; | |||
|       ii += rows; | |||
|     } | |||
|   } | |||
|   return b; | |||
| } | |||
| /* | |||
|  * Walks a in panels of 4 columns, followed by a 2-column and a 1-column | |||
|  * remainder selected by the low bits of n, and packs each panel into b. | |||
|  * For a row group starting at row ii of a panel whose first column is at | |||
|  * distance jj - offset from the start of a: | |||
|  *   ii <  jj : the tile is copied in full, | |||
|  *   ii == jj : the tile is packed with INV() applied to its diagonal, | |||
|  *   ii >  jj : nothing is stored. | |||
|  * NOTE(review): presumably the packing step for an upper-triangular, | |||
|  * non-transposed triangular-solve operand -- confirm against the build rule | |||
|  * that defines CNAME. | |||
|  * | |||
|  * Behaviour changes relative to the previous hand-unrolled version, both in | |||
|  * the 4-column panel: | |||
|  *   - the 2-row remainder (m & 2) with ii < jj now stores row by row like | |||
|  *     every other branch; it used to store a1[0], a1[1], a2[0], a2[1], ... | |||
|  *   - the 2-row remainder advanced only the first two column pointers, so | |||
|  *     the 1-row remainder that follows read columns 3 and 4 two rows too | |||
|  *     early; all columns are now addressed from the same row. | |||
|  * | |||
|  * Always returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
|   BLASLONG j, jj; | |||
|   jj = offset; | |||
|   for (j = (n >> 2); j > 0; j--) { | |||
|     b = trsm_pack_panel(m, 4, a, lda, jj, b); | |||
|     a += 4 * lda; | |||
|     jj += 4; | |||
|   } | |||
|   if (n & 2) { | |||
|     b = trsm_pack_panel(m, 2, a, lda, jj, b); | |||
|     a += 2 * lda; | |||
|     jj += 2; | |||
|   } | |||
|   if (n & 1) { | |||
|     trsm_pack_panel(m, 1, a, lda, jj, b); | |||
|   } | |||
|   return 0; | |||
| } | |||
| @@ -0,0 +1,322 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| /* INV(a): reciprocal of a; when UNIT is defined it is the constant ONE and a is never evaluated. */ | |||
| #ifndef UNIT | |||
| #define INV(a) (ONE / (a)) | |||
| #else | |||
| #define INV(a) (ONE) | |||
| #endif | |||
| /* | |||
|  * Packs a into b panel by panel.  A panel is `width` consecutive elements | |||
|  * wide (4 while n allows, then a 2-wide and a 1-wide remainder chosen by the | |||
|  * low bits of n); successive rows of a panel are lda elements apart.  Rows | |||
|  * are taken in groups of `width`, then one group per remaining power-of-two | |||
|  * bit of m.  Element (r, c) of a group is read from src[r * lda + c] and | |||
|  * belongs at b[width * r + c].  With ii the first row of the group and jj the | |||
|  * running diagonal position (starting at offset, advanced by width per panel): | |||
|  *   ii == jj : elements left of the diagonal are copied, the diagonal is | |||
|  *              stored as INV() of itself (ONE when UNIT is defined, without | |||
|  *              reading the source), slots right of it are left untouched; | |||
|  *   ii >  jj : the whole group is copied; | |||
|  *   ii <  jj : nothing is stored. | |||
|  * b advances by rows * width for every group regardless.  Always returns 0. | |||
|  * NOTE(review): presumably the packing step for an upper-triangular, | |||
|  * transposed triangular-solve operand -- confirm against the build rule that | |||
|  * defines CNAME. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
|   BLASLONG width, panels, rows, groups, r, c, ii, jj; | |||
|   FLOAT *src; | |||
|   jj = offset; | |||
|   for (width = 4; width > 0; width >>= 1) { | |||
|     /* As many 4-wide panels as fit, then at most one panel per low bit of n. */ | |||
|     panels = (width == 4) ? (n >> 2) : ((n & width) ? 1 : 0); | |||
|     for (; panels > 0; panels--) { | |||
|       src = a; | |||
|       ii = 0; | |||
|       for (rows = width; rows > 0; rows >>= 1) { | |||
|         /* Full-height groups first, then at most one group per remainder bit of m. */ | |||
|         groups = (rows == width) ? (m / width) : ((m & rows) ? 1 : 0); | |||
|         for (; groups > 0; groups--) { | |||
|           if (ii == jj) { | |||
|             for (r = 0; r < rows; r++) { | |||
|               for (c = 0; c < r; c++) { | |||
|                 b[width * r + c] = src[r * lda + c]; | |||
|               } | |||
|               b[width * r + r] = INV(src[r * lda + r]); | |||
|             } | |||
|           } else if (ii > jj) { | |||
|             for (r = 0; r < rows; r++) { | |||
|               for (c = 0; c < width; c++) { | |||
|                 b[width * r + c] = src[r * lda + c]; | |||
|               } | |||
|             } | |||
|           } | |||
|           src += rows * lda; | |||
|           b += rows * width; | |||
|           ii += rows; | |||
|         } | |||
|       } | |||
|       a += width; | |||
|       jj += width; | |||
|     } | |||
|   } | |||
|   return 0; | |||
| } | |||
| @@ -0,0 +1 @@ | |||
| include $(KERNELDIR)/KERNEL.PENRYN | |||
| @@ -62,7 +62,7 @@ | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| @@ -62,7 +62,7 @@ | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| @@ -62,7 +62,7 @@ | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| @@ -62,7 +62,7 @@ | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| @@ -62,7 +62,7 @@ | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| @@ -62,7 +62,7 @@ | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| @@ -61,7 +61,7 @@ | |||
| #define PREFETCHSIZE 84 | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) | |||
| #define PREFETCH prefetcht1 | |||
| #define PREFETCHSIZE 84 | |||
| #endif | |||
| @@ -63,7 +63,7 @@ | |||
| #define PREFETCHSIZE 84 | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) | |||
| #define PREFETCH prefetcht1 | |||
| #define PREFETCHSIZE 84 | |||
| #endif | |||
| @@ -61,7 +61,7 @@ | |||
| #define PREFETCHSIZE 84 | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) | |||
| #define PREFETCH prefetcht1 | |||
| #define PREFETCHSIZE 84 | |||
| #endif | |||
| @@ -63,7 +63,7 @@ | |||
| #define PREFETCHSIZE 84 | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) | |||
| #define PREFETCH prefetcht1 | |||
| #define PREFETCHSIZE 84 | |||
| #endif | |||
| @@ -61,7 +61,7 @@ | |||
| #define PREFETCHSIZE 84 | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) | |||
| #define PREFETCH prefetcht1 | |||
| #define PREFETCHSIZE 84 | |||
| #endif | |||
| @@ -0,0 +1,63 @@ | |||
| # Kernel selection for the HASWELL target: one micro-kernel per precision plus | |||
| # the copy sources and the object names they are built into. | |||
| # | |||
| # Single-precision real GEMM: the kernel file is named 16x4, and accordingly | |||
| # the I*COPY slots use the 16-wide generic copies, the O*COPY slots the 4-wide ones. | |||
| SGEMMKERNEL = sgemm_kernel_16x4_haswell.S | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_16.c | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # Double-precision real GEMM: 4x4 kernel file.  The I*COPY slots are left empty; | |||
| # NOTE(review): presumably the square tile lets the O*COPY routines serve both | |||
| # operands -- confirm against the kernel Makefile that consumes these variables. | |||
| DGEMMKERNEL = dgemm_kernel_4x4_haswell.S | |||
| DGEMMINCOPY = | |||
| DGEMMITCOPY = | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| DGEMMINCOPYOBJ = | |||
| DGEMMITCOPYOBJ = | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # Single-precision complex GEMM: 8x2 kernel file, 8-wide I*COPY and 2-wide | |||
| # O*COPY generic copies (the zgemm_* copy sources are used for both complex types). | |||
| CGEMMKERNEL = cgemm_kernel_8x2_haswell.S | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_8.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # Double-precision complex GEMM: 4x2 kernel file, 4-wide I*COPY and 2-wide O*COPY. | |||
| ZGEMMKERNEL = zgemm_kernel_4x2_haswell.S | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_4.c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_4.c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # TRSM: all four variants (LN, LT, RN, RT) of every precision use the generic C kernels. | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| # GEMM3M: no Haswell-specific kernel is named here; the Nehalem assembly kernels are reused. | |||
| CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S | |||
| ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S | |||
| @@ -7,7 +7,7 @@ DAXPYKERNEL = daxpy_bulldozer.S | |||
| DDOTKERNEL = ddot_bulldozer.S | |||
| DCOPYKERNEL = dcopy_bulldozer.S | |||
| SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S | |||
| SGEMMKERNEL = sgemm_kernel_16x2_piledriver.S | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_16.c | |||
| SGEMMONCOPY = gemm_ncopy_2_bulldozer.S | |||
| @@ -16,7 +16,8 @@ SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S | |||
| DGEMMKERNEL = dgemm_kernel_8x2_piledriver.S | |||
| DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S | |||
| DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S | |||
| DGEMMONCOPY = gemm_ncopy_2_bulldozer.S | |||
| @@ -25,7 +26,8 @@ DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S | |||
| CGEMMKERNEL = cgemm_kernel_4x2_piledriver.S | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| @@ -34,7 +36,7 @@ CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S | |||
| ZGEMMKERNEL = zgemm_kernel_2x2_piledriver.S | |||
| ZGEMMINCOPY = | |||
| ZGEMMITCOPY = | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| @@ -52,9 +54,10 @@ STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S | |||
| DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| @@ -57,7 +57,7 @@ | |||
| #define PREFETCHSIZE (16 * 12) | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHW prefetcht0 | |||
| #define PREFETCHSIZE (16 * 12) | |||
| @@ -57,7 +57,7 @@ | |||
| #define PREFETCHSIZE (16 * 12) | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHW prefetcht0 | |||
| #define PREFETCHSIZE (16 * 12) | |||
| @@ -57,7 +57,7 @@ | |||
| #define PREFETCHSIZE (16 * 12) | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHW prefetcht0 | |||
| #define PREFETCHSIZE (16 * 12) | |||
| @@ -57,7 +57,7 @@ | |||
| #define PREFETCHSIZE (16 * 12) | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHW prefetcht0 | |||
| #define PREFETCHSIZE (16 * 24) | |||
| @@ -57,7 +57,7 @@ | |||
| #define PREFETCHSIZE (16 * 24) | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHW prefetcht0 | |||
| #define PREFETCHSIZE (16 * 24) | |||
| @@ -57,7 +57,7 @@ | |||
| #define PREFETCHSIZE (16 * 24) | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHW prefetcht0 | |||
| #define PREFETCHSIZE (16 * 24) | |||
| @@ -57,7 +57,7 @@ | |||
| #define PREFETCHSIZE (16 * 24) | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHW prefetcht0 | |||
| #define PREFETCHSIZE (16 * 24) | |||
| @@ -57,7 +57,7 @@ | |||
| #define PREFETCHSIZE (16 * 24) | |||
| #endif | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHW prefetcht0 | |||
| #define PREFETCHSIZE (16 * 24) | |||
| @@ -1154,6 +1154,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #ifdef HASWELL | |||
| <<<<<<< HEAD | |||
| #define SNUMOPT 8 | |||
| #define DNUMOPT 4 | |||
| @@ -1164,6 +1165,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define SYMV_P 8 | |||
| #define SWITCH_RATIO 4 | |||
| ======= | |||
| #define SNUMOPT 8 | |||
| #define DNUMOPT 4 | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define SYMV_P 8 | |||
| #define SWITCH_RATIO 4 | |||
| >>>>>>> origin/haswell | |||
| #ifdef ARCH_X86 | |||
| @@ -1233,6 +1246,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ZGEMM_DEFAULT_Q 128 | |||
| #define SGEMM_DEFAULT_R sgemm_r | |||
| /* DGEMM_DEFAULT_R is a fixed constant here; the dgemm_r alternative is kept below, commented out. */ | |||
| //#define DGEMM_DEFAULT_R dgemm_r | |||
| #define DGEMM_DEFAULT_R 13824 | |||
| #define CGEMM_DEFAULT_R cgemm_r | |||
| #define ZGEMM_DEFAULT_R zgemm_r | |||