Browse Source

Merge remote branch 'origin/develop' into armv7

tags/v0.2.9.rc1
wernsaar 12 years ago
parent
commit
402d6e91db
54 changed files with 35329 additions and 40 deletions
  1. +2
    -2
      Makefile.system
  2. +2
    -3
      cpuid.h
  3. +4
    -0
      cpuid_x86.c
  4. +6
    -3
      driver/others/dynamic.c
  5. +19
    -7
      getarch.c
  6. +230
    -0
      kernel/generic/gemm_ncopy_6.c
  7. +281
    -0
      kernel/generic/gemm_tcopy_6.c
  8. +138
    -0
      kernel/generic/symm_lcopy_6.c
  9. +136
    -0
      kernel/generic/symm_ucopy_6.c
  10. +484
    -0
      kernel/generic/trmm_lncopy_6.c
  11. +488
    -0
      kernel/generic/trmm_ltcopy_6.c
  12. +785
    -0
      kernel/generic/trmm_uncopy_6.c
  13. +472
    -0
      kernel/generic/trmm_utcopy_6.c
  14. +4
    -0
      kernel/generic/trsm_kernel_LN.c
  15. +4
    -0
      kernel/generic/trsm_kernel_LT.c
  16. +4
    -0
      kernel/generic/trsm_kernel_RN.c
  17. +5
    -0
      kernel/generic/trsm_kernel_RT.c
  18. +326
    -0
      kernel/generic/trsm_lncopy_6.c
  19. +346
    -0
      kernel/generic/trsm_ltcopy_6.c
  20. +350
    -0
      kernel/generic/trsm_uncopy_6.c
  21. +322
    -0
      kernel/generic/trsm_utcopy_6.c
  22. +1
    -0
      kernel/x86/KERNEL.HASWELL
  23. +1
    -1
      kernel/x86/trsm_kernel_LN_2x4_penryn.S
  24. +1
    -1
      kernel/x86/trsm_kernel_LN_4x4_penryn.S
  25. +1
    -1
      kernel/x86/trsm_kernel_LT_2x4_penryn.S
  26. +1
    -1
      kernel/x86/trsm_kernel_LT_4x4_penryn.S
  27. +1
    -1
      kernel/x86/trsm_kernel_RT_2x4_penryn.S
  28. +1
    -1
      kernel/x86/trsm_kernel_RT_4x4_penryn.S
  29. +1
    -1
      kernel/x86/ztrsm_kernel_LN_2x2_penryn.S
  30. +1
    -1
      kernel/x86/ztrsm_kernel_LT_1x2_penryn.S
  31. +1
    -1
      kernel/x86/ztrsm_kernel_LT_2x2_penryn.S
  32. +1
    -1
      kernel/x86/ztrsm_kernel_RT_1x2_penryn.S
  33. +1
    -1
      kernel/x86/ztrsm_kernel_RT_2x2_penryn.S
  34. +63
    -0
      kernel/x86_64/KERNEL.HASWELL
  35. +9
    -6
      kernel/x86_64/KERNEL.PILEDRIVER
  36. +1920
    -0
      kernel/x86_64/cgemm_kernel_4x2_piledriver.S
  37. +2284
    -0
      kernel/x86_64/cgemm_kernel_8x2_haswell.S
  38. +5215
    -0
      kernel/x86_64/dgemm_kernel_16x2_haswell.S
  39. +3479
    -0
      kernel/x86_64/dgemm_kernel_4x4_haswell.S
  40. +1734
    -0
      kernel/x86_64/dgemm_kernel_6x4_piledriver.S
  41. +4523
    -0
      kernel/x86_64/dgemm_kernel_8x2_piledriver.S
  42. +5258
    -0
      kernel/x86_64/sgemm_kernel_16x2_piledriver.S
  43. +3159
    -0
      kernel/x86_64/sgemm_kernel_16x4_haswell.S
  44. +1
    -1
      kernel/x86_64/symv_L_sse.S
  45. +1
    -1
      kernel/x86_64/symv_L_sse2.S
  46. +1
    -1
      kernel/x86_64/symv_U_sse.S
  47. +1
    -1
      kernel/x86_64/symv_U_sse2.S
  48. +1428
    -0
      kernel/x86_64/zgemm_kernel_2x2_piledriver.S
  49. +1812
    -0
      kernel/x86_64/zgemm_kernel_4x2_haswell.S
  50. +1
    -1
      kernel/x86_64/zsymv_L_sse.S
  51. +1
    -1
      kernel/x86_64/zsymv_L_sse2.S
  52. +1
    -1
      kernel/x86_64/zsymv_U_sse.S
  53. +1
    -1
      kernel/x86_64/zsymv_U_sse2.S
  54. +17
    -0
      param.h

+ 2
- 2
Makefile.system View File

@@ -336,14 +336,14 @@ ifeq ($(ARCH), x86)
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL
endif
endif

ifeq ($(ARCH), x86_64)
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL
endif
endif



+ 2
- 3
cpuid.h View File

@@ -107,7 +107,7 @@
#define CORE_BOBCAT 21
#define CORE_BULLDOZER 22
#define CORE_PILEDRIVER 23
#define CORE_HASWELL CORE_SANDYBRIDGE
#define CORE_HASWELL 24

#define HAVE_SSE (1 << 0)
#define HAVE_SSE2 (1 << 1)
@@ -200,7 +200,6 @@ typedef struct {
#define CPUTYPE_BOBCAT 45
#define CPUTYPE_BULLDOZER 46
#define CPUTYPE_PILEDRIVER 47
// this define is because BLAS doesn't have haswell specific optimizations yet
#define CPUTYPE_HASWELL CPUTYPE_SANDYBRIDGE
#define CPUTYPE_HASWELL 48

#endif

+ 4
- 0
cpuid_x86.c View File

@@ -1243,6 +1243,7 @@ static char *cpuname[] = {
"BOBCAT",
"BULLDOZER",
"PILEDRIVER",
"HASWELL",
};

static char *lowercpuname[] = {
@@ -1293,6 +1294,7 @@ static char *lowercpuname[] = {
"bobcat",
"bulldozer",
"piledriver",
"haswell",
};

static char *corename[] = {
@@ -1320,6 +1322,7 @@ static char *corename[] = {
"BOBCAT",
"BULLDOZER",
"PILEDRIVER",
"HASWELL",
};

static char *corename_lower[] = {
@@ -1347,6 +1350,7 @@ static char *corename_lower[] = {
"bobcat",
"bulldozer",
"piledriver",
"haswell",
};




+ 6
- 3
driver/others/dynamic.c View File

@@ -65,14 +65,15 @@ extern gotoblas_t gotoblas_BOBCAT;
extern gotoblas_t gotoblas_SANDYBRIDGE;
extern gotoblas_t gotoblas_BULLDOZER;
extern gotoblas_t gotoblas_PILEDRIVER;
extern gotoblas_t gotoblas_HASWELL;
#else
//Use NEHALEM kernels for sandy bridge
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
#define gotoblas_HASWELL gotoblas_NEHALEM
#define gotoblas_BULLDOZER gotoblas_BARCELONA
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
#endif
//Use sandy bridge kernels for haswell.
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE


#define VENDOR_INTEL 1
#define VENDOR_AMD 2
@@ -297,6 +298,7 @@ static char *corename[] = {
"Bobcat",
"Bulldozer",
"Piledriver",
"Haswell",
};

char *gotoblas_corename(void) {
@@ -319,7 +321,8 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
if (gotoblas == &gotoblas_BOBCAT) return corename[17];
if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
if (gotoblas == &gotoblas_PILEDRIVER) return corename[19];
if (gotoblas == &gotoblas_PILEDRIVER) return corename[19];
if (gotoblas == &gotoblas_HASWELL) return corename[20];

return corename[0];
}


+ 19
- 7
getarch.c View File

@@ -298,6 +298,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "SANDYBRIDGE"
#endif

#ifdef FORCE_HASWELL
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "HASWELL"
#define ARCHCONFIG "-DHASWELL " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
"-DFMA3"
#define LIBNAME "haswell"
#define CORENAME "HASWELL"
#endif

#ifdef FORCE_ATOM
#define FORCE
#define FORCE_INTEL
@@ -725,20 +740,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif




#ifndef FORCE

#if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \
defined(__PPC__) || defined(PPC) || defined(_POWER) || defined(__POWERPC__)

defined(__PPC__) || defined(PPC) || defined(_POWER) || defined(__POWERPC__)
#ifndef POWER
#define POWER
#endif
#define OPENBLAS_SUPPORTED
#endif


#if defined(__i386__) || (__x86_64__)
#include "cpuid_x86.c"
#define OPENBLAS_SUPPORTED
@@ -779,6 +790,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#error "This arch/CPU is not supported by OpenBLAS."
#endif

#else

#endif

static int get_num_cores(void) {
@@ -843,11 +856,10 @@ int main(int argc, char *argv[]){
printf("NUM_CORES=%d\n", get_num_cores());

#if defined(__arm__) && !defined(FORCE)
get_features();
get_features();
#endif



#if defined(__i386__) || defined(__x86_64__)
#ifndef FORCE
get_sse();


+ 230
- 0
kernel/generic/gemm_ncopy_6.c View File

@@ -0,0 +1,230 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#include <stdio.h>
#include "common.h"

int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
  /*
   * Copy an m x n block of a into b, interleaving neighbouring columns.
   *
   *   m, n : number of elements per column / number of columns to copy
   *   a    : source; column j starts at a + j * lda and is read with
   *          unit stride
   *   lda  : distance (in elements) between consecutive source columns
   *   b    : destination, written strictly sequentially
   *
   * Columns are consumed in groups of 4, then (if n & 2) one group of 2,
   * then (if n & 1) a single column.  Within a group of width w, the
   * output holds, for every row in turn, the w elements of that row.
   * Always returns 0.
   *
   * NOTE(review): the file is named gemm_ncopy_6.c, but the group width
   * implemented here is 4, not 6 -- confirm this matches the unroll
   * factor of the kernel that consumes b.
   */
  FLOAT *src = a;            /* first column of the next group   */
  FLOAT *dst = b;            /* next free position in the output */
  FLOAT *c0, *c1, *c2, *c3;  /* read cursors, one per column     */
  FLOAT t[16];               /* staging area: load all, then store */
  BLASLONG grp, cnt, k;

  /* ---- groups of four columns ---- */
  for (grp = n >> 2; grp > 0; grp--) {
    c0 = src;
    c1 = c0 + lda;
    c2 = c1 + lda;
    c3 = c2 + lda;
    src += 4 * lda;

    /* four rows per iteration: a 4x4 tile written row by row */
    for (cnt = m >> 2; cnt > 0; cnt--) {
      for (k = 0; k < 4; k++) {
        t[4 * k + 0] = c0[k];
        t[4 * k + 1] = c1[k];
        t[4 * k + 2] = c2[k];
        t[4 * k + 3] = c3[k];
      }
      for (k = 0; k < 16; k++) {
        dst[k] = t[k];
      }
      c0 += 4;
      c1 += 4;
      c2 += 4;
      c3 += 4;
      dst += 16;
    }

    /* remaining (m & 3) rows, one at a time */
    for (cnt = m & 3; cnt > 0; cnt--) {
      t[0] = *c0++;
      t[1] = *c1++;
      t[2] = *c2++;
      t[3] = *c3++;
      dst[0] = t[0];
      dst[1] = t[1];
      dst[2] = t[2];
      dst[3] = t[3];
      dst += 4;
    }
  }

  /* ---- one group of two columns ---- */
  if (n & 2) {
    c0 = src;
    c1 = c0 + lda;
    src += 2 * lda;

    for (cnt = m >> 2; cnt > 0; cnt--) {
      for (k = 0; k < 4; k++) {
        t[2 * k + 0] = c0[k];
        t[2 * k + 1] = c1[k];
      }
      for (k = 0; k < 8; k++) {
        dst[k] = t[k];
      }
      c0 += 4;
      c1 += 4;
      dst += 8;
    }

    for (cnt = m & 3; cnt > 0; cnt--) {
      t[0] = *c0++;
      t[1] = *c1++;
      dst[0] = t[0];
      dst[1] = t[1];
      dst += 2;
    }
  }

  /* ---- last single column: a straight copy ---- */
  if (n & 1) {
    c0 = src;

    for (cnt = m >> 2; cnt > 0; cnt--) {
      for (k = 0; k < 4; k++) {
        t[k] = c0[k];
      }
      for (k = 0; k < 4; k++) {
        dst[k] = t[k];
      }
      c0 += 4;
      dst += 4;
    }

    for (cnt = m & 3; cnt > 0; cnt--) {
      t[0] = *c0++;
      *dst++ = t[0];
    }
  }

  return 0;
}

+ 281
- 0
kernel/generic/gemm_tcopy_6.c View File

@@ -0,0 +1,281 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#include <stdio.h>
#include "common.h"

int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
  /*
   * Copy m source lines of n elements each from a into b.
   *
   *   m, n : number of source lines / elements per line
   *   a    : source; line i starts at a + i * lda and is read with
   *          unit stride
   *   lda  : distance (in elements) between consecutive source lines
   *   b    : destination
   *
   * The destination is filled through three independent cursors:
   *   - dst_quad   starts at b and receives the 4-element chunks taken
   *                from positions 0 .. (n & ~3) - 1 of every line;
   *                successive chunks of the same line are m * 4
   *                elements apart.
   *   - dst_pair   starts at b + m * (n & ~3) and receives the
   *                2-element chunk present when (n & 2).
   *   - dst_single starts at b + m * (n & ~1) and receives the final
   *                element present when (n & 1).
   * Lines are consumed four at a time, then two (m & 2), then one
   * (m & 1).  Always returns 0.
   */
  FLOAT *src        = a;
  FLOAT *dst_quad   = b;
  FLOAT *dst_pair   = b + m * (n & ~3);
  FLOAT *dst_single = b + m * (n & ~1);
  FLOAT *dst;                /* write cursor inside the quad region */
  FLOAT *r0, *r1, *r2, *r3;  /* read cursors, one per source line   */
  FLOAT t[16];               /* staging area: load all, then store  */
  BLASLONG grp, blk, k;

  /* ---- four source lines at a time ---- */
  for (grp = m >> 2; grp > 0; grp--) {
    r0 = src;
    r1 = r0 + lda;
    r2 = r1 + lda;
    r3 = r2 + lda;
    src += 4 * lda;

    dst = dst_quad;
    dst_quad += 16;

    for (blk = n >> 2; blk > 0; blk--) {
      for (k = 0; k < 4; k++) {
        t[k]      = r0[k];
        t[4 + k]  = r1[k];
        t[8 + k]  = r2[k];
        t[12 + k] = r3[k];
      }
      r0 += 4;
      r1 += 4;
      r2 += 4;
      r3 += 4;

      for (k = 0; k < 16; k++) {
        dst[k] = t[k];
      }
      dst += m * 4;
    }

    if (n & 2) {
      for (k = 0; k < 2; k++) {
        t[k]     = r0[k];
        t[2 + k] = r1[k];
        t[4 + k] = r2[k];
        t[6 + k] = r3[k];
      }
      r0 += 2;
      r1 += 2;
      r2 += 2;
      r3 += 2;

      for (k = 0; k < 8; k++) {
        dst_pair[k] = t[k];
      }
      dst_pair += 8;
    }

    if (n & 1) {
      t[0] = r0[0];
      t[1] = r1[0];
      t[2] = r2[0];
      t[3] = r3[0];

      for (k = 0; k < 4; k++) {
        dst_single[k] = t[k];
      }
      dst_single += 4;
    }
  }

  /* ---- two source lines ---- */
  if (m & 2) {
    r0 = src;
    r1 = r0 + lda;
    src += 2 * lda;

    dst = dst_quad;
    dst_quad += 8;

    for (blk = n >> 2; blk > 0; blk--) {
      for (k = 0; k < 4; k++) {
        t[k]     = r0[k];
        t[4 + k] = r1[k];
      }
      r0 += 4;
      r1 += 4;

      for (k = 0; k < 8; k++) {
        dst[k] = t[k];
      }
      dst += m * 4;
    }

    if (n & 2) {
      t[0] = r0[0];
      t[1] = r0[1];
      t[2] = r1[0];
      t[3] = r1[1];
      r0 += 2;
      r1 += 2;

      for (k = 0; k < 4; k++) {
        dst_pair[k] = t[k];
      }
      dst_pair += 4;
    }

    if (n & 1) {
      t[0] = r0[0];
      t[1] = r1[0];
      dst_single[0] = t[0];
      dst_single[1] = t[1];
      dst_single += 2;
    }
  }

  /* ---- last single source line ---- */
  if (m & 1) {
    r0 = src;
    dst = dst_quad;

    for (blk = n >> 2; blk > 0; blk--) {
      for (k = 0; k < 4; k++) {
        t[k] = r0[k];
      }
      r0 += 4;

      for (k = 0; k < 4; k++) {
        dst[k] = t[k];
      }
      dst += 4 * m;
    }

    if (n & 2) {
      t[0] = r0[0];
      t[1] = r0[1];
      r0 += 2;
      dst_pair[0] = t[0];
      dst_pair[1] = t[1];
    }

    if (n & 1) {
      t[0] = r0[0];
      dst_single[0] = t[0];
    }
  }

  return 0;
}

+ 138
- 0
kernel/generic/symm_lcopy_6.c View File

@@ -0,0 +1,138 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#include <stdio.h>
#include "common.h"

int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
  /*
   * Copy an m x n block into b, reading every element from the half of
   * a whose unit-stride index is >= its lda-stride index.
   *
   * For output row r (0 .. m-1) and output column c (0 .. n-1), with
   * X = posX + c and Y = posY + r, the value stored is
   *     a[X + Y * lda]   when X >  Y
   *     a[Y + X * lda]   when X <= Y
   * Output columns are processed in groups of 4, then 2 (n & 2), then
   * 1 (n & 1); within a group of width w, b receives w values per row,
   * row after row.  Always returns 0.
   *
   * The read cursors are not recomputed per element: each one walks
   * with stride lda while it is on the X > Y side and with stride 1
   * afterwards.  `diag` tracks posX - (posY + r) to decide which.
   */
  FLOAT *src[4];   /* one read cursor per column of the group */
  FLOAT val[4];    /* values of the current row               */
  BLASLONG blk, row, k, diag;

  /* ---- groups of four columns ---- */
  for (blk = n >> 2; blk > 0; blk--) {
    diag = posX - posY;

    for (k = 0; k < 4; k++) {
      if (diag > -k) {
        src[k] = a + posX + k + posY * lda;
      } else {
        src[k] = a + posY + (posX + k) * lda;
      }
    }

    for (row = m; row > 0; row--) {
      for (k = 0; k < 4; k++) {
        val[k] = *src[k];
        if (diag > -k) {
          src[k] += lda;
        } else {
          src[k]++;
        }
      }

      for (k = 0; k < 4; k++) {
        b[k] = val[k];
      }
      b += 4;

      diag--;
    }

    posX += 4;
  }

  /* ---- one group of two columns ---- */
  if (n & 2) {
    diag = posX - posY;

    for (k = 0; k < 2; k++) {
      if (diag > -k) {
        src[k] = a + posX + k + posY * lda;
      } else {
        src[k] = a + posY + (posX + k) * lda;
      }
    }

    for (row = m; row > 0; row--) {
      for (k = 0; k < 2; k++) {
        val[k] = *src[k];
        if (diag > -k) {
          src[k] += lda;
        } else {
          src[k]++;
        }
      }

      b[0] = val[0];
      b[1] = val[1];
      b += 2;

      diag--;
    }

    posX += 2;
  }

  /* ---- last single column ---- */
  if (n & 1) {
    diag = posX - posY;

    if (diag > 0) {
      src[0] = a + posX + 0 + posY * lda;
    } else {
      src[0] = a + posY + (posX + 0) * lda;
    }

    for (row = m; row > 0; row--) {
      val[0] = *src[0];
      if (diag > 0) {
        src[0] += lda;
      } else {
        src[0]++;
      }

      *b++ = val[0];

      diag--;
    }
  }

  return 0;
}

+ 136
- 0
kernel/generic/symm_ucopy_6.c View File

@@ -0,0 +1,136 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#include <stdio.h>
#include "common.h"

int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
  /*
   * Copy an m x n block into b, reading every element from the half of
   * a whose unit-stride index is <= its lda-stride index.
   *
   * For output row r (0 .. m-1) and output column c (0 .. n-1), with
   * X = posX + c and Y = posY + r, the value stored is
   *     a[Y + X * lda]   when X >  Y
   *     a[X + Y * lda]   when X <= Y
   * Output columns are processed in groups of 4, then 2 (n & 2), then
   * 1 (n & 1); within a group of width w, b receives w values per row,
   * row after row.  Always returns 0.
   *
   * The read cursors are not recomputed per element: each one walks
   * with stride 1 while it is on the X > Y side and with stride lda
   * afterwards.  `diag` tracks posX - (posY + r) to decide which.
   */
  FLOAT *src[4];   /* one read cursor per column of the group */
  FLOAT val[4];    /* values of the current row               */
  BLASLONG blk, row, k, diag;

  /* ---- groups of four columns ---- */
  for (blk = n >> 2; blk > 0; blk--) {
    diag = posX - posY;

    for (k = 0; k < 4; k++) {
      if (diag > -k) {
        src[k] = a + posY + (posX + k) * lda;
      } else {
        src[k] = a + posX + k + posY * lda;
      }
    }

    for (row = m; row > 0; row--) {
      for (k = 0; k < 4; k++) {
        val[k] = *src[k];
        if (diag > -k) {
          src[k]++;
        } else {
          src[k] += lda;
        }
      }

      for (k = 0; k < 4; k++) {
        b[k] = val[k];
      }
      b += 4;

      diag--;
    }

    posX += 4;
  }

  /* ---- one group of two columns ---- */
  if (n & 2) {
    diag = posX - posY;

    for (k = 0; k < 2; k++) {
      if (diag > -k) {
        src[k] = a + posY + (posX + k) * lda;
      } else {
        src[k] = a + posX + k + posY * lda;
      }
    }

    for (row = m; row > 0; row--) {
      for (k = 0; k < 2; k++) {
        val[k] = *src[k];
        if (diag > -k) {
          src[k]++;
        } else {
          src[k] += lda;
        }
      }

      b[0] = val[0];
      b[1] = val[1];
      b += 2;

      diag--;
    }

    posX += 2;
  }

  /* ---- last single column ---- */
  if (n & 1) {
    diag = posX - posY;

    if (diag > 0) {
      src[0] = a + posY + (posX + 0) * lda;
    } else {
      src[0] = a + posX + 0 + posY * lda;
    }

    for (row = m; row > 0; row--) {
      val[0] = *src[0];
      if (diag > 0) {
        src[0]++;
      } else {
        src[0] += lda;
      }

      *b++ = val[0];

      diag--;
    }
  }

  return 0;
}

+ 484
- 0
kernel/generic/trmm_lncopy_6.c View File

@@ -0,0 +1,484 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#include <stdio.h>
#include "common.h"

int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){

BLASLONG i, js;
BLASLONG X;

FLOAT data01, data02, data03, data04, data05, data06, data07, data08;
FLOAT data09, data10, data11, data12, data13, data14, data15, data16;
FLOAT *ao1, *ao2, *ao3, *ao4;

js = (n >> 2);

if (js > 0){
do {
X = posX;

if (posX <= posY) {
ao1 = a + posY + (posX + 0) * lda;
ao2 = a + posY + (posX + 1) * lda;
ao3 = a + posY + (posX + 2) * lda;
ao4 = a + posY + (posX + 3) * lda;
} else {
ao1 = a + posX + (posY + 0) * lda;
ao2 = a + posX + (posY + 1) * lda;
ao3 = a + posX + (posY + 2) * lda;
ao4 = a + posX + (posY + 3) * lda;
}

i = (m >> 2);
if (i > 0) {
do {
if (X > posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
data05 = *(ao2 + 0);
data06 = *(ao2 + 1);
data07 = *(ao2 + 2);
data08 = *(ao2 + 3);
data09 = *(ao3 + 0);
data10 = *(ao3 + 1);
data11 = *(ao3 + 2);
data12 = *(ao3 + 3);
data13 = *(ao4 + 0);
data14 = *(ao4 + 1);
data15 = *(ao4 + 2);
data16 = *(ao4 + 3);
b[ 0] = data01;
b[ 1] = data05;
b[ 2] = data09;
b[ 3] = data13;
b[ 4] = data02;
b[ 5] = data06;
b[ 6] = data10;
b[ 7] = data14;
b[ 8] = data03;
b[ 9] = data07;
b[10] = data11;
b[11] = data15;
b[12] = data04;
b[13] = data08;
b[14] = data12;
b[15] = data16;
ao1 += 4;
ao2 += 4;
ao3 += 4;
ao4 += 4;
b += 16;

} else
if (X < posY) {
ao1 += 4 * lda;
ao2 += 4 * lda;
ao3 += 4 * lda;
ao4 += 4 * lda;
b += 16;

} else {
#ifdef UNIT
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
data07 = *(ao2 + 2);
data08 = *(ao2 + 3);
data12 = *(ao3 + 3);

b[ 0] = ONE;
b[ 1] = ZERO;
b[ 2] = ZERO;
b[ 3] = ZERO;
b[ 4] = data02;
b[ 5] = ONE;
b[ 6] = ZERO;
b[ 7] = ZERO;
b[ 8] = data03;
b[ 9] = data07;
b[10] = ONE;
b[11] = ZERO;
b[12] = data04;
b[13] = data08;
b[14] = data12;
b[15] = ONE;
#else
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
data06 = *(ao2 + 1);
data07 = *(ao2 + 2);
data08 = *(ao2 + 3);
data11 = *(ao3 + 2);
data12 = *(ao3 + 3);
data16 = *(ao4 + 3);
b[ 0] = data01;
b[ 1] = ZERO;
b[ 2] = ZERO;
b[ 3] = ZERO;
b[ 4] = data02;
b[ 5] = data06;
b[ 6] = ZERO;
b[ 7] = ZERO;
b[ 8] = data03;
b[ 9] = data07;
b[10] = data11;
b[11] = ZERO;
b[12] = data04;
b[13] = data08;
b[14] = data12;
b[15] = data16;
#endif
ao1 += 4;
ao2 += 4;
ao3 += 4;
ao4 += 4;
b += 16;
}

X += 4;
i --;
} while (i > 0);
}

i = (m & 3);
if (i) {
if (X > posY) {

if (m & 2) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
data03 = *(ao2 + 0);
data04 = *(ao2 + 1);
data05 = *(ao3 + 0);
data06 = *(ao3 + 1);
data07 = *(ao4 + 0);
data08 = *(ao4 + 1);

b[ 0] = data01;
b[ 1] = data03;
b[ 2] = data05;
b[ 3] = data07;
b[ 4] = data02;
b[ 5] = data04;
b[ 6] = data06;
b[ 7] = data08;

ao1 += 2;
ao2 += 2;
ao3 += 2;
ao4 += 2;
b += 8;
}
if (m & 1) {
data01 = *(ao1 + 0);
data02 = *(ao2 + 0);
data03 = *(ao3 + 0);
data04 = *(ao4 + 0);

b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
b[ 3] = data04;
ao1 += 1;
ao2 += 1;
ao3 += 1;
ao4 += 1;
b += 4;
}
} else
if (X < posY) {
if (m & 2) {
ao1 += 2 * lda;
ao2 += 2 * lda;
b += 8;
}
if (m & 1) {
ao1 += lda;
b += 4;
}
} else {
#ifdef UNIT
data05 = *(ao2 + 0);
data09 = *(ao3 + 0);
data13 = *(ao4 + 0);

if (i >= 2) {
data10 = *(ao3 + 1);
data14 = *(ao4 + 1);
}

if (i >= 3) {
data15 = *(ao4 + 2);
}
b[ 0] = ONE;
b[ 1] = data05;
b[ 2] = data09;
b[ 3] = data13;
b += 4;
if(i >= 2) {
b[ 0] = ZERO;
b[ 1] = ONE;
b[ 2] = data10;
b[ 3] = data14;
b += 4;
}
if (i >= 3) {
b[ 0] = ZERO;
b[ 1] = ZERO;
b[ 2] = ONE;
b[ 3] = data15;
b += 4;
}
#else
data01 = *(ao1 + 0);
data05 = *(ao2 + 0);
data09 = *(ao3 + 0);
data13 = *(ao4 + 0);

if (i >= 2) {
data06 = *(ao2 + 1);
data10 = *(ao3 + 1);
data14 = *(ao4 + 1);
}

if (i >= 3) {
data11 = *(ao3 + 2);
data15 = *(ao4 + 2);
}
b[ 0] = data01;
b[ 1] = data05;
b[ 2] = data09;
b[ 3] = data13;
b += 4;
if(i >= 2) {
b[ 0] = ZERO;
b[ 1] = data06;
b[ 2] = data10;
b[ 3] = data14;
b += 4;
}
if (i >= 3) {
b[ 0] = ZERO;
b[ 1] = ZERO;
b[ 2] = data11;
b[ 3] = data15;
b += 4;
}
#endif
}
}

posY += 4;
js --;
} while (js > 0);
} /* End of main loop */


if (n & 2){
X = posX;

if (posX <= posY) {
ao1 = a + posY + (posX + 0) * lda;
ao2 = a + posY + (posX + 1) * lda;
} else {
ao1 = a + posX + (posY + 0) * lda;
ao2 = a + posX + (posY + 1) * lda;
}

i = (m >> 1);
if (i > 0) {
do {
if (X > posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
data05 = *(ao2 + 0);
data06 = *(ao2 + 1);
b[ 0] = data01;
b[ 1] = data05;
b[ 2] = data02;
b[ 3] = data06;

ao1 += 2;
ao2 += 2;
b += 4;

} else
if (X < posY) {
ao1 += 2 * lda;
ao2 += 2 * lda;
b += 4;
} else {
#ifdef UNIT
data02 = *(ao1 + 1);

b[ 0] = ONE;
b[ 1] = ZERO;
b[ 2] = data02;
b[ 3] = ONE;
#else
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
data06 = *(ao2 + 1);

b[ 0] = data01;
b[ 1] = ZERO;
b[ 2] = data02;
b[ 3] = data06;
#endif
ao1 += 2;
ao2 += 2;
b += 4;
}

X += 2;
i --;
} while (i > 0);
}

i = (m & 1);
if (i) {
if (X > posY) {
data01 = *(ao1 + 0);
data02 = *(ao2 + 0);
b[ 0] = data01;
b[ 1] = data02;

ao1 += 1;
ao2 += 1;
b += 2;
} else
if (X < posY) {
ao1 += lda;
b += 2;
} else {
#ifdef UNIT
data05 = *(ao2 + 0);

b[ 0] = ONE;
b[ 1] = data05;
#else
data01 = *(ao1 + 0);
data05 = *(ao2 + 0);

b[ 0] = data01;
b[ 1] = data05;
#endif
b += 2;
}
}
posY += 2;
}

if (n & 1){
X = posX;

if (posX <= posY) {
ao1 = a + posY + (posX + 0) * lda;
} else {
ao1 = a + posX + (posY + 0) * lda;
}

i = m;
if (i > 0) {
do {
if (X > posY) {
data01 = *(ao1 + 0);
b[ 0] = data01;
b += 1;
ao1 += 1;
} else
if (X < posY) {
b += 1;
ao1 += lda;
} else {
#ifdef UNIT
b[ 0] = ONE;
#else
data01 = *(ao1 + 0);
b[ 0] = data01;
#endif
b += 1;
ao1 += 1;
}

X ++;
i --;
} while (i > 0);
}

posY += 1;
}

return 0;
}

+ 488
- 0
kernel/generic/trmm_ltcopy_6.c View File

@@ -0,0 +1,488 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#include <stdio.h>
#include "common.h"

/*
 * TRMM operand packing routine (presumably the "lower, transposed" variant,
 * going by the file name trmm_ltcopy -- confirm against the build rules).
 *
 * m counts positions along X starting at posX, n counts positions along Y
 * starting at posY.  The Y range is split into panels: (n >> 2) panels of
 * width 4, then one panel of width 2 if (n & 2), then one of width 1 if
 * (n & 1).  For every X position inside a panel, `width` consecutive values
 * are appended to b.
 *
 * Inside a panel the X positions are visited in blocks of `width` positions,
 * followed by one short block of (m & (width - 1)) positions.  Each block is
 * classified once, by comparing the X index of its first position with posY:
 *
 *   X <  posY : every value of the block is copied from a.
 *   X == posY : diagonal block; entries left of the diagonal become ZERO, the
 *               diagonal is ONE when UNIT is defined (otherwise it is read
 *               from a), entries right of the diagonal are copied from a.
 *   X >  posY : nothing is written; b is only advanced past the block.
 *
 * NOTE(review): the file name carries a _6 suffix, but the widest panel this
 * routine produces is 4 -- confirm that this matches the unroll factor of the
 * compute kernel it is paired with.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){

  BLASLONG pass, width, panels, blocks, tail, rows;
  BLASLONG r, c;
  BLASLONG X;
  FLOAT *base, *src, *dst;

  /* pass 0 -> width 4, pass 1 -> width 2, pass 2 -> width 1 */
  for (pass = 0; pass < 3; pass++) {

    width  = 4 >> pass;
    panels = (pass == 0) ? (n >> 2) : ((n & width) ? 1 : 0);

    while (panels > 0) {

      X = posX;

      /* base addresses the first X position of this panel; the source for
         the r-th position of a block is base + r * lda */
      if (posX <= posY) {
        base = a + posY + posX * lda;
      } else {
        base = a + posX + posY * lda;
      }

      blocks = m >> (2 - pass);     /* number of full blocks           */
      tail   = m & (width - 1);     /* size of the trailing short block */

      while (blocks > 0 || tail != 0) {

        if (blocks > 0) {
          rows = width;
          blocks --;
        } else {
          rows = tail;
          tail = 0;
        }

        if (X < posY) {

          /* plain copy of the whole block */
          for (r = 0; r < rows; r++) {
            src = base + r * lda;
            dst = b    + r * width;
            for (c = 0; c < width; c++) dst[c] = src[c];
          }
          base += rows * lda;

        } else if (X == posY) {

          /* diagonal block: zero on the left, diagonal, copy on the right */
          for (r = 0; r < rows; r++) {
            src = base + r * lda;
            dst = b    + r * width;

            for (c = 0; c < r; c++) dst[c] = ZERO;
#ifdef UNIT
            dst[r] = ONE;
#else
            dst[r] = src[r];
#endif
            for (c = r + 1; c < width; c++) dst[c] = src[c];
          }
          base += rows;

        } else {

          /* X > posY: this part of b is left unwritten */
          base += rows;
        }

        b += rows * width;
        X += rows;
      }

      posY += width;
      panels --;
    }
  }

  return 0;
}

+ 785
- 0
kernel/generic/trmm_uncopy_6.c View File

@@ -0,0 +1,785 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#include <stdio.h>
#include "common.h"

/*
 * TRMM operand packing routine with 6-wide panels (presumably the "upper,
 * non-transposed" variant, going by the file name trmm_uncopy -- confirm
 * against the build rules).
 *
 * Parameters
 *   m, n       extent of the region to pack: m rows starting at row posX and
 *              n columns starting at column posY.
 *   a, lda     source matrix; element (row, col) is read as a[row + col * lda].
 *   posX, posY position of the region inside the triangular matrix; they are
 *              what decides where the diagonal falls.
 *   b          destination buffer; exactly m * n values are produced.
 *
 * Layout of b
 *   The n columns are split into panels: n / 6 panels of width 6, then the
 *   remaining n % 6 columns as at most one panel each of width 4, 2 and 1.
 *   Inside a panel every row contributes `width` consecutive values (one per
 *   panel column), rows following each other in increasing order.
 *
 * Triangular masking
 *   Rows of a panel are visited in blocks of `width` rows plus one shorter
 *   trailing block.  A block is classified once, by comparing the index X of
 *   its first row with posY:
 *     X <  posY : all values are copied from a.
 *     X == posY : diagonal block; entries left of the diagonal are ZERO, the
 *                 diagonal is ONE when UNIT is defined (otherwise read from
 *                 a), entries right of the diagonal are copied from a.
 *     X >  posY : the block is filled with ZERO.
 *
 * Every row, including those of a trailing short block, is emitted at the
 * full panel width, the row and column remainders are taken modulo the panel
 * width, and posY advances by the width of the panel just packed, so that
 * exactly m * n values are written and no store lands outside that range.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){

  static const BLASLONG panel_widths[4] = {6, 4, 2, 1};

  BLASLONG w, width, panels;
  BLASLONG left, rows, r, c;
  BLASLONG X;
  FLOAT *src, *dst;

  for (w = 0; w < 4; w++) {

    width   = panel_widths[w];
    panels  = n / width;
    n      -= panels * width;      /* columns still to be packed */

    for (; panels > 0; panels--) {

      X = posX;

      for (left = m; left > 0; left -= rows) {

        /* full block, or whatever is left for the trailing short block */
        rows = (left < width) ? left : width;

        if (X < posY) {

          /* block entirely on the stored side of the diagonal: plain copy.
             src addresses element (X, posY). */
          src = a + X + posY * lda;

          for (r = 0; r < rows; r++) {
            dst = b + r * width;
            for (c = 0; c < width; c++) dst[c] = src[r + c * lda];
          }

        } else if (X > posY) {

          /* block entirely on the empty side of the diagonal */
          for (c = 0; c < rows * width; c++) b[c] = ZERO;

        } else {

          /* diagonal block; rows <= width, so index r never leaves the row */
          src = a + X + posY * lda;

          for (r = 0; r < rows; r++) {
            dst = b + r * width;

            for (c = 0; c < r; c++) dst[c] = ZERO;
#ifdef UNIT
            dst[r] = ONE;
#else
            dst[r] = src[r + r * lda];
#endif
            for (c = r + 1; c < width; c++) dst[c] = src[r + c * lda];
          }
        }

        b += rows * width;
        X += rows;
      }

      /* the next panel starts right after the columns just packed */
      posY += width;
    }
  }

  return 0;
}

+ 472
- 0
kernel/generic/trmm_utcopy_6.c View File

@@ -0,0 +1,472 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#include <stdio.h>
#include "common.h"

/*
 * TRMM operand packing routine (presumably the "upper, transposed" variant,
 * going by the file name trmm_utcopy -- confirm against the build rules).
 *
 * m counts positions along X starting at posX, n counts positions along Y
 * starting at posY.  The Y range is split into panels: (n >> 2) panels of
 * width 4, then one panel of width 2 if (n & 2), then one of width 1 if
 * (n & 1).  For every X position inside a panel, `width` consecutive values
 * are appended to b.
 *
 * Inside a panel the X positions are visited in blocks of `width` positions,
 * followed by one short block of (m & (width - 1)) positions.  Each block is
 * classified once, by comparing the X index of its first position with posY:
 *
 *   X <  posY : nothing is written; b is only advanced past the block.
 *   X == posY : diagonal block; entries left of the diagonal are copied from
 *               a, the diagonal is ONE when UNIT is defined (otherwise it is
 *               read from a), entries right of the diagonal become ZERO.
 *   X >  posY : every value of the block is copied from a.
 *
 * NOTE(review): the file name carries a _6 suffix, but the widest panel this
 * routine produces is 4 -- confirm that this matches the unroll factor of the
 * compute kernel it is paired with.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){

  BLASLONG pass, width, panels, blocks, tail, rows;
  BLASLONG r, c;
  BLASLONG X;
  FLOAT *base, *src, *dst;

  /* pass 0 -> width 4, pass 1 -> width 2, pass 2 -> width 1 */
  for (pass = 0; pass < 3; pass++) {

    width  = 4 >> pass;
    panels = (pass == 0) ? (n >> 2) : ((n & width) ? 1 : 0);

    while (panels > 0) {

      X = posX;

      /* base addresses the first X position of this panel; the source for
         the r-th position of a block is base + r * lda */
      if (posX <= posY) {
        base = a + posX + posY * lda;
      } else {
        base = a + posY + posX * lda;
      }

      blocks = m >> (2 - pass);     /* number of full blocks           */
      tail   = m & (width - 1);     /* size of the trailing short block */

      while (blocks > 0 || tail != 0) {

        if (blocks > 0) {
          rows = width;
          blocks --;
        } else {
          rows = tail;
          tail = 0;
        }

        if (X > posY) {

          /* plain copy of the whole block */
          for (r = 0; r < rows; r++) {
            src = base + r * lda;
            dst = b    + r * width;
            for (c = 0; c < width; c++) dst[c] = src[c];
          }
          base += rows * lda;

        } else if (X == posY) {

          /* diagonal block: copy on the left, diagonal, zero on the right */
          for (r = 0; r < rows; r++) {
            src = base + r * lda;
            dst = b    + r * width;

            for (c = 0; c < r; c++) dst[c] = src[c];
#ifdef UNIT
            dst[r] = ONE;
#else
            dst[r] = src[r];
#endif
            for (c = r + 1; c < width; c++) dst[c] = ZERO;
          }
          base += rows * lda;

        } else {

          /* X < posY: this part of b is left unwritten */
          base += rows;
        }

        b += rows * width;
        X += rows;
      }

      posY += width;
      panels --;
    }
  }

  return 0;
}

+ 4
- 0
kernel/generic/trsm_kernel_LN.c View File

@@ -58,6 +58,10 @@ static FLOAT dm1 = -1.;
#define GEMM_UNROLL_M_SHIFT 2
#endif

/* NOTE(review): 6 is not a power of two, so no shift count is equivalent to
   dividing by 6; the value 2 used here corresponds to a block of 4.  Confirm
   that every use of GEMM_UNROLL_M_SHIFT is still correct when
   GEMM_DEFAULT_UNROLL_M is 6. */
#if GEMM_DEFAULT_UNROLL_M == 6
#define GEMM_UNROLL_M_SHIFT 2
#endif

#if GEMM_DEFAULT_UNROLL_M == 8
#define GEMM_UNROLL_M_SHIFT 3
#endif


+ 4
- 0
kernel/generic/trsm_kernel_LT.c View File

@@ -58,6 +58,10 @@ static FLOAT dm1 = -1.;
#define GEMM_UNROLL_M_SHIFT 2
#endif

/* NOTE(review): 6 is not a power of two, so no shift count is equivalent to
   dividing by 6; the value 2 used here corresponds to a block of 4.  Confirm
   that every use of GEMM_UNROLL_M_SHIFT is still correct when
   GEMM_DEFAULT_UNROLL_M is 6. */
#if GEMM_DEFAULT_UNROLL_M == 6
#define GEMM_UNROLL_M_SHIFT 2
#endif

#if GEMM_DEFAULT_UNROLL_M == 8
#define GEMM_UNROLL_M_SHIFT 3
#endif


+ 4
- 0
kernel/generic/trsm_kernel_RN.c View File

@@ -58,6 +58,10 @@ static FLOAT dm1 = -1.;
#define GEMM_UNROLL_M_SHIFT 2
#endif

/* NOTE(review): 6 is not a power of two, so no shift count is equivalent to
   dividing by 6; the value 2 used here corresponds to a block of 4.  Confirm
   that every use of GEMM_UNROLL_M_SHIFT is still correct when
   GEMM_DEFAULT_UNROLL_M is 6. */
#if GEMM_DEFAULT_UNROLL_M == 6
#define GEMM_UNROLL_M_SHIFT 2
#endif

#if GEMM_DEFAULT_UNROLL_M == 8
#define GEMM_UNROLL_M_SHIFT 3
#endif


+ 5
- 0
kernel/generic/trsm_kernel_RT.c View File

@@ -58,6 +58,11 @@ static FLOAT dm1 = -1.;
#define GEMM_UNROLL_M_SHIFT 2
#endif

/* NOTE(review): 6 is not a power of two, so no shift count is equivalent to
   dividing by 6; the value 2 used here corresponds to a block of 4.  Confirm
   that every use of GEMM_UNROLL_M_SHIFT is still correct when
   GEMM_DEFAULT_UNROLL_M is 6. */
#if GEMM_DEFAULT_UNROLL_M == 6
#define GEMM_UNROLL_M_SHIFT 2
#endif


#if GEMM_DEFAULT_UNROLL_M == 8
#define GEMM_UNROLL_M_SHIFT 3
#endif


+ 326
- 0
kernel/generic/trsm_lncopy_6.c View File

@@ -0,0 +1,326 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#include <stdio.h>
#include "common.h"

/* INV(a) is the value stored for a diagonal element of the packed output.  */
/* Without UNIT it is the reciprocal ONE / (a); with UNIT it is the         */
/* constant ONE and the argument is never evaluated.                        */
#ifndef UNIT
#define INV(a) (ONE / (a))
#else
#define INV(a) (ONE)
#endif

/*
 * Copy the on/below-diagonal part of an m x n panel of a into b.
 *
 * Columns of a are lda elements apart; rows are contiguous.  The columns are
 * handled four at a time, then a group of two (n & 2), then a single one
 * (n & 1).  Inside a group the rows are walked in blocks of 4, 2 and 1 (2 and
 * 1 for the two-column group, 1 for the single column):
 *
 *   - block starting below the running diagonal index (row > diag): every
 *     element is copied, row by row, the group's columns side by side;
 *   - block starting exactly at the diagonal index (row == diag): only the
 *     lower triangle is written and diagonal entries go through INV();
 *   - any other block: nothing is written.
 *
 * b always advances by the full block size, so unwritten slots keep their
 * previous content.  diag starts at offset and grows by the group width.
 * a is only read and b only written (presumably distinct buffers -- confirm
 * against callers).  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){

  BLASLONG row, diag, cnt, panel, r;
  FLOAT *c0, *c1, *c2, *c3;

  diag = offset;

  /* ---- groups of four columns ---- */
  for (panel = (n >> 2); panel > 0; panel--) {

    c0 = a + 0 * lda;
    c1 = a + 1 * lda;
    c2 = a + 2 * lda;
    c3 = a + 3 * lda;
    row = 0;

    /* 4 x 4 blocks */
    for (cnt = (m >> 2); cnt > 0; cnt--) {

      if (row == diag) {
        /* lower triangle of the diagonal block */
        b[ 0] = INV(c0[0]);

        b[ 4] = c0[1];
        b[ 5] = INV(c1[1]);

        b[ 8] = c0[2];
        b[ 9] = c1[2];
        b[10] = INV(c2[2]);

        b[12] = c0[3];
        b[13] = c1[3];
        b[14] = c2[3];
        b[15] = INV(c3[3]);
      } else if (row > diag) {
        /* full block, one output row per source row */
        for (r = 0; r < 4; r++) {
          b[4 * r + 0] = c0[r];
          b[4 * r + 1] = c1[r];
          b[4 * r + 2] = c2[r];
          b[4 * r + 3] = c3[r];
        }
      }

      c0 += 4;
      c1 += 4;
      c2 += 4;
      c3 += 4;
      b  += 16;
      row += 4;
    }

    /* two leftover rows */
    if (m & 2) {

      if (row == diag) {
        b[0] = INV(c0[0]);

        b[4] = c0[1];
        b[5] = INV(c1[1]);
      } else if (row > diag) {
        for (r = 0; r < 2; r++) {
          b[4 * r + 0] = c0[r];
          b[4 * r + 1] = c1[r];
          b[4 * r + 2] = c2[r];
          b[4 * r + 3] = c3[r];
        }
      }

      c0 += 2;
      c1 += 2;
      c2 += 2;
      c3 += 2;
      b  += 8;
      row += 2;
    }

    /* one leftover row */
    if (m & 1) {

      if (row == diag) {
        b[0] = INV(c0[0]);
      } else if (row > diag) {
        b[0] = c0[0];
        b[1] = c1[0];
        b[2] = c2[0];
        b[3] = c3[0];
      }

      b += 4;
    }

    a    += 4 * lda;
    diag += 4;
  }

  /* ---- group of two columns ---- */
  if (n & 2) {

    c0 = a + 0 * lda;
    c1 = a + 1 * lda;
    row = 0;

    for (cnt = (m >> 1); cnt > 0; cnt--) {

      if (row == diag) {
        b[0] = INV(c0[0]);
        b[2] = c0[1];
        b[3] = INV(c1[1]);
      } else if (row > diag) {
        b[0] = c0[0];
        b[1] = c1[0];
        b[2] = c0[1];
        b[3] = c1[1];
      }

      c0 += 2;
      c1 += 2;
      b  += 4;
      row += 2;
    }

    if (m & 1) {

      if (row == diag) {
        b[0] = INV(c0[0]);
      } else if (row > diag) {
        b[0] = c0[0];
        b[1] = c1[0];
      }

      b += 2;
    }

    a    += 2 * lda;
    diag += 2;
  }

  /* ---- last single column ---- */
  if (n & 1) {

    c0 = a + 0 * lda;

    for (row = 0; row < m; row++) {
      if (row == diag) {
        b[row] = INV(c0[row]);
      } else if (row > diag) {
        b[row] = c0[row];
      }
    }
  }

  return 0;
}

+ 346
- 0
kernel/generic/trsm_ltcopy_6.c View File

@@ -0,0 +1,346 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#include <stdio.h>
#include "common.h"

/* INV(a) is the value stored for a diagonal element of the packed output.  */
/* Without UNIT it is the reciprocal ONE / (a); with UNIT it is the         */
/* constant ONE and the argument is never evaluated.                        */
#ifndef UNIT
#define INV(a) (ONE / (a))
#else
#define INV(a) (ONE)
#endif

/*
 * Copy the on/right-of-diagonal part of a panel of a into b, reading a one
 * lda-strided line at a time.
 *
 * r0..r3 point at four consecutive lines of a (lda elements apart); the
 * elements of a line are contiguous.  The n direction is handled in groups of
 * 4, then 2 (n & 2), then 1 (n & 1); each group moves a forward by its width.
 * Inside a group the m lines are walked in blocks of 4, 2 and 1:
 *
 *   - block starting before the running diagonal index (row < diag): every
 *     element is copied, line after line;
 *   - block starting exactly at the diagonal index (row == diag): each line
 *     contributes INV(diagonal element) followed by the elements to its right;
 *   - any other block: nothing is written.
 *
 * b always advances by the full block size, so unwritten slots keep their
 * previous content.  diag starts at offset and grows by the group width.
 * a is only read and b only written (presumably distinct buffers -- confirm
 * against callers).  Always returns 0.
 *
 * Fix relative to the previous revision: in the two-wide group, the single
 * leftover line sitting on the diagonal now also stores the element right of
 * the diagonal (b[1]), like the four-wide one-line tail and the two-line
 * diagonal branch do.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){

  BLASLONG row, diag, cnt, panel, c;
  FLOAT *r0, *r1, *r2, *r3;

  diag = offset;

  /* ---- groups of four ---- */
  for (panel = (n >> 2); panel > 0; panel--) {

    r0 = a + 0 * lda;
    r1 = a + 1 * lda;
    r2 = a + 2 * lda;
    r3 = a + 3 * lda;
    row = 0;

    /* four lines at a time */
    for (cnt = (m >> 2); cnt > 0; cnt--) {

      if (row == diag) {
        /* diagonal block: diagonal through INV(), plus everything right of it */
        b[ 0] = INV(r0[0]);
        b[ 1] = r0[1];
        b[ 2] = r0[2];
        b[ 3] = r0[3];

        b[ 5] = INV(r1[1]);
        b[ 6] = r1[2];
        b[ 7] = r1[3];

        b[10] = INV(r2[2]);
        b[11] = r2[3];

        b[15] = INV(r3[3]);
      } else if (row < diag) {
        for (c = 0; c < 4; c++) {
          b[ 0 + c] = r0[c];
          b[ 4 + c] = r1[c];
          b[ 8 + c] = r2[c];
          b[12 + c] = r3[c];
        }
      }

      r0 += 4 * lda;
      r1 += 4 * lda;
      r2 += 4 * lda;
      r3 += 4 * lda;
      b  += 16;
      row += 4;
    }

    /* two leftover lines */
    if (m & 2) {

      if (row == diag) {
        b[0] = INV(r0[0]);
        b[1] = r0[1];
        b[2] = r0[2];
        b[3] = r0[3];

        b[5] = INV(r1[1]);
        b[6] = r1[2];
        b[7] = r1[3];
      } else if (row < diag) {
        for (c = 0; c < 4; c++) {
          b[0 + c] = r0[c];
          b[4 + c] = r1[c];
        }
      }

      r0 += 2 * lda;
      r1 += 2 * lda;
      b  += 8;
      row += 2;
    }

    /* one leftover line */
    if (m & 1) {

      if (row == diag) {
        b[0] = INV(r0[0]);
        b[1] = r0[1];
        b[2] = r0[2];
        b[3] = r0[3];
      } else if (row < diag) {
        for (c = 0; c < 4; c++) {
          b[c] = r0[c];
        }
      }

      b += 4;
    }

    a    += 4;
    diag += 4;
  }

  /* ---- group of two ---- */
  if (n & 2) {

    r0 = a + 0 * lda;
    r1 = a + 1 * lda;
    row = 0;

    for (cnt = (m >> 1); cnt > 0; cnt--) {

      if (row == diag) {
        b[0] = INV(r0[0]);
        b[1] = r0[1];

        b[3] = INV(r1[1]);
      } else if (row < diag) {
        b[0] = r0[0];
        b[1] = r0[1];
        b[2] = r1[0];
        b[3] = r1[1];
      }

      r0 += 2 * lda;
      r1 += 2 * lda;
      b  += 4;
      row += 2;
    }

    if (m & 1) {

      if (row == diag) {
        b[0] = INV(r0[0]);
        /* the element right of the diagonal belongs to the packed line too */
        b[1] = r0[1];
      } else if (row < diag) {
        b[0] = r0[0];
        b[1] = r0[1];
      }

      b += 2;
    }

    a    += 2;
    diag += 2;
  }

  /* ---- last single one ---- */
  if (n & 1) {

    r0 = a + 0 * lda;

    for (row = 0; row < m; row++) {
      if (row == diag) {
        b[row] = INV(r0[0]);
      } else if (row < diag) {
        b[row] = r0[0];
      }
      r0 += lda;
    }
  }

  return 0;
}

+ 350
- 0
kernel/generic/trsm_uncopy_6.c View File

@@ -0,0 +1,350 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#include <stdio.h>
#include "common.h"

/* INV(a) is the value stored for a diagonal element of the packed output.  */
/* Without UNIT it is the reciprocal ONE / (a); with UNIT it is the         */
/* constant ONE and the argument is never evaluated.                        */
#ifndef UNIT
#define INV(a) (ONE / (a))
#else
#define INV(a) (ONE)
#endif

/*
 * Copy the on/above-diagonal part of an m x n panel of a into b.
 *
 * Columns of a are lda elements apart; rows are contiguous.  The columns are
 * handled four at a time, then a group of two (n & 2), then a single one
 * (n & 1).  Inside a group the rows are walked in blocks of 4, 2 and 1:
 *
 *   - block starting above the running diagonal index (row < diag): every
 *     element is copied, row by row, the group's columns side by side;
 *   - block starting exactly at the diagonal index (row == diag): each row
 *     contributes INV(diagonal element) followed by the entries of the
 *     columns to its right;
 *   - any other block: nothing is written.
 *
 * b always advances by the full block size, so unwritten slots keep their
 * previous content.  diag starts at offset and grows by the group width.
 * a is only read and b only written (presumably distinct buffers -- confirm
 * against callers).  Always returns 0.
 *
 * Fixes relative to the previous revision, both in the four-column group:
 *   - the two-row tail now advances all four column pointers, so the one-row
 *     tail that follows reads the current row for columns 2 and 3 as well
 *     (it used to read two rows too early when m % 4 == 3);
 *   - the row < diag branch of the two-row tail now stores the values row by
 *     row like every other branch, instead of column by column.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){

  BLASLONG row, diag, cnt, panel, r;
  FLOAT *c0, *c1, *c2, *c3;

  diag = offset;

  /* ---- groups of four columns ---- */
  for (panel = (n >> 2); panel > 0; panel--) {

    c0 = a + 0 * lda;
    c1 = a + 1 * lda;
    c2 = a + 2 * lda;
    c3 = a + 3 * lda;
    row = 0;

    /* 4 x 4 blocks */
    for (cnt = (m >> 2); cnt > 0; cnt--) {

      if (row == diag) {
        /* upper triangle of the diagonal block */
        b[ 0] = INV(c0[0]);
        b[ 1] = c1[0];
        b[ 2] = c2[0];
        b[ 3] = c3[0];

        b[ 5] = INV(c1[1]);
        b[ 6] = c2[1];
        b[ 7] = c3[1];

        b[10] = INV(c2[2]);
        b[11] = c3[2];

        b[15] = INV(c3[3]);
      } else if (row < diag) {
        for (r = 0; r < 4; r++) {
          b[4 * r + 0] = c0[r];
          b[4 * r + 1] = c1[r];
          b[4 * r + 2] = c2[r];
          b[4 * r + 3] = c3[r];
        }
      }

      c0 += 4;
      c1 += 4;
      c2 += 4;
      c3 += 4;
      b  += 16;
      row += 4;
    }

    /* two leftover rows */
    if (m & 2) {

      if (row == diag) {
        b[0] = INV(c0[0]);
        b[1] = c1[0];
        b[2] = c2[0];
        b[3] = c3[0];

        b[5] = INV(c1[1]);
        b[6] = c2[1];
        b[7] = c3[1];
      } else if (row < diag) {
        /* row-interleaved, same layout as the 4 x 4 case */
        for (r = 0; r < 2; r++) {
          b[4 * r + 0] = c0[r];
          b[4 * r + 1] = c1[r];
          b[4 * r + 2] = c2[r];
          b[4 * r + 3] = c3[r];
        }
      }

      /* all four columns move on: the one-row tail below reads c2 and c3 */
      c0 += 2;
      c1 += 2;
      c2 += 2;
      c3 += 2;
      b  += 8;
      row += 2;
    }

    /* one leftover row */
    if (m & 1) {

      if (row == diag) {
        b[0] = INV(c0[0]);
        b[1] = c1[0];
        b[2] = c2[0];
        b[3] = c3[0];
      } else if (row < diag) {
        b[0] = c0[0];
        b[1] = c1[0];
        b[2] = c2[0];
        b[3] = c3[0];
      }

      b += 4;
    }

    a    += 4 * lda;
    diag += 4;
  }

  /* ---- group of two columns ---- */
  if (n & 2) {

    c0 = a + 0 * lda;
    c1 = a + 1 * lda;
    row = 0;

    for (cnt = (m >> 1); cnt > 0; cnt--) {

      if (row == diag) {
        b[0] = INV(c0[0]);
        b[1] = c1[0];
        b[3] = INV(c1[1]);
      } else if (row < diag) {
        b[0] = c0[0];
        b[1] = c1[0];
        b[2] = c0[1];
        b[3] = c1[1];
      }

      c0 += 2;
      c1 += 2;
      b  += 4;
      row += 2;
    }

    if (m & 1) {

      if (row == diag) {
        b[0] = INV(c0[0]);
        b[1] = c1[0];
      } else if (row < diag) {
        b[0] = c0[0];
        b[1] = c1[0];
      }

      b += 2;
    }

    a    += 2 * lda;
    diag += 2;
  }

  /* ---- last single column ---- */
  if (n & 1) {

    c0 = a + 0 * lda;

    for (row = 0; row < m; row++) {
      if (row == diag) {
        b[row] = INV(c0[row]);
      } else if (row < diag) {
        b[row] = c0[row];
      }
    }
  }

  return 0;
}

+ 322
- 0
kernel/generic/trsm_utcopy_6.c View File

@@ -0,0 +1,322 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#include <stdio.h>
#include "common.h"

/* INV(a) is the value stored for a diagonal element of the packed output.  */
/* Without UNIT it is the reciprocal ONE / (a); with UNIT it is the         */
/* constant ONE and the argument is never evaluated.                        */
#ifndef UNIT
#define INV(a) (ONE / (a))
#else
#define INV(a) (ONE)
#endif

/*
 * Copy the on/left-of-diagonal part of a panel of a into b, reading a one
 * lda-strided line at a time.
 *
 * r0..r3 point at four consecutive lines of a (lda elements apart); the
 * elements of a line are contiguous.  The n direction is handled in groups of
 * 4, then 2 (n & 2), then 1 (n & 1); each group moves a forward by its width.
 * Inside a group the m lines are walked in blocks of 4, 2 and 1:
 *
 *   - block starting past the running diagonal index (row > diag): every
 *     element is copied, line after line;
 *   - block starting exactly at the diagonal index (row == diag): each line
 *     contributes the elements left of its diagonal followed by
 *     INV(diagonal element);
 *   - any other block: nothing is written.
 *
 * b always advances by the full block size, so unwritten slots keep their
 * previous content.  diag starts at offset and grows by the group width.
 * a is only read and b only written (presumably distinct buffers -- confirm
 * against callers).  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){

  BLASLONG row, diag, cnt, panel, c;
  FLOAT *r0, *r1, *r2, *r3;

  diag = offset;

  /* ---- groups of four ---- */
  for (panel = (n >> 2); panel > 0; panel--) {

    r0 = a + 0 * lda;
    r1 = a + 1 * lda;
    r2 = a + 2 * lda;
    r3 = a + 3 * lda;
    row = 0;

    /* four lines at a time */
    for (cnt = (m >> 2); cnt > 0; cnt--) {

      if (row == diag) {
        /* diagonal block: everything left of the diagonal, then INV(diagonal) */
        b[ 0] = INV(r0[0]);

        b[ 4] = r1[0];
        b[ 5] = INV(r1[1]);

        b[ 8] = r2[0];
        b[ 9] = r2[1];
        b[10] = INV(r2[2]);

        b[12] = r3[0];
        b[13] = r3[1];
        b[14] = r3[2];
        b[15] = INV(r3[3]);
      } else if (row > diag) {
        for (c = 0; c < 4; c++) {
          b[ 0 + c] = r0[c];
          b[ 4 + c] = r1[c];
          b[ 8 + c] = r2[c];
          b[12 + c] = r3[c];
        }
      }

      r0 += 4 * lda;
      r1 += 4 * lda;
      r2 += 4 * lda;
      r3 += 4 * lda;
      b  += 16;
      row += 4;
    }

    /* two leftover lines */
    if (m & 2) {

      if (row == diag) {
        b[0] = INV(r0[0]);

        b[4] = r1[0];
        b[5] = INV(r1[1]);
      } else if (row > diag) {
        for (c = 0; c < 4; c++) {
          b[0 + c] = r0[c];
          b[4 + c] = r1[c];
        }
      }

      r0 += 2 * lda;
      r1 += 2 * lda;
      b  += 8;
      row += 2;
    }

    /* one leftover line */
    if (m & 1) {

      if (row == diag) {
        b[0] = INV(r0[0]);
      } else if (row > diag) {
        for (c = 0; c < 4; c++) {
          b[c] = r0[c];
        }
      }

      b += 4;
    }

    a    += 4;
    diag += 4;
  }

  /* ---- group of two ---- */
  if (n & 2) {

    r0 = a + 0 * lda;
    r1 = a + 1 * lda;
    row = 0;

    for (cnt = (m >> 1); cnt > 0; cnt--) {

      if (row == diag) {
        b[0] = INV(r0[0]);
        b[2] = r1[0];
        b[3] = INV(r1[1]);
      } else if (row > diag) {
        b[0] = r0[0];
        b[1] = r0[1];
        b[2] = r1[0];
        b[3] = r1[1];
      }

      r0 += 2 * lda;
      r1 += 2 * lda;
      b  += 4;
      row += 2;
    }

    if (m & 1) {

      if (row == diag) {
        b[0] = INV(r0[0]);
      } else if (row > diag) {
        b[0] = r0[0];
        b[1] = r0[1];
      }

      b += 2;
    }

    a    += 2;
    diag += 2;
  }

  /* ---- last single one ---- */
  if (n & 1) {

    r0 = a + 0 * lda;

    for (row = 0; row < m; row++) {
      if (row == diag) {
        b[row] = INV(r0[0]);
      } else if (row > diag) {
        b[row] = r0[0];
      }
      r0 += lda;
    }
  }

  return 0;
}

+ 1
- 0
kernel/x86/KERNEL.HASWELL View File

@@ -0,0 +1 @@
# The x86 HASWELL target has no kernel list of its own: it pulls in the
# PENRYN one from the same kernel directory.
include $(KERNELDIR)/KERNEL.PENRYN

+ 1
- 1
kernel/x86/trsm_kernel_LN_2x4_penryn.S View File

@@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif


+ 1
- 1
kernel/x86/trsm_kernel_LN_4x4_penryn.S View File

@@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif


+ 1
- 1
kernel/x86/trsm_kernel_LT_2x4_penryn.S View File

@@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif


+ 1
- 1
kernel/x86/trsm_kernel_LT_4x4_penryn.S View File

@@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif


+ 1
- 1
kernel/x86/trsm_kernel_RT_2x4_penryn.S View File

@@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif


+ 1
- 1
kernel/x86/trsm_kernel_RT_4x4_penryn.S View File

@@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif


+ 1
- 1
kernel/x86/ztrsm_kernel_LN_2x2_penryn.S View File

@@ -61,7 +61,7 @@
#define PREFETCHSIZE 84
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
#endif


+ 1
- 1
kernel/x86/ztrsm_kernel_LT_1x2_penryn.S View File

@@ -63,7 +63,7 @@
#define PREFETCHSIZE 84
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
#endif


+ 1
- 1
kernel/x86/ztrsm_kernel_LT_2x2_penryn.S View File

@@ -61,7 +61,7 @@
#define PREFETCHSIZE 84
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
#endif


+ 1
- 1
kernel/x86/ztrsm_kernel_RT_1x2_penryn.S View File

@@ -63,7 +63,7 @@
#define PREFETCHSIZE 84
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
#endif


+ 1
- 1
kernel/x86/ztrsm_kernel_RT_2x2_penryn.S View File

@@ -61,7 +61,7 @@
#define PREFETCHSIZE 84
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
#endif


+ 63
- 0
kernel/x86_64/KERNEL.HASWELL View File

@@ -0,0 +1,63 @@
# Kernel source selection for the x86_64 HASWELL target.
# Each *KERNEL / *COPY variable names a source file; paths starting with
# ../generic/ point into the kernel/generic directory, the others are
# presumably resolved next to this file (confirm against the kernel Makefile).
# The *COPYOBJ variables name the objects built from the copy sources.

# S-prefixed GEMM: Haswell 16x4 kernel with the generic 16-wide and 4-wide copies.
SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)

# D-prefixed GEMM: Haswell 4x4 kernel.  The IN/IT copy entries are left empty.
# NOTE(review): presumably because both unroll factors are 4, so the ON/OT
# copies serve both operands -- confirm against the kernel Makefile.
DGEMMKERNEL = dgemm_kernel_4x4_haswell.S
DGEMMINCOPY =
DGEMMITCOPY =
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMINCOPYOBJ =
DGEMMITCOPYOBJ =
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)

# C-prefixed GEMM: Haswell 8x2 kernel with the generic zgemm 8-wide and 2-wide copies.
CGEMMKERNEL = cgemm_kernel_8x2_haswell.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)

# Z-prefixed GEMM: Haswell 4x2 kernel with the generic zgemm 4-wide and 2-wide copies.
ZGEMMKERNEL = zgemm_kernel_4x2_haswell.S
ZGEMMINCOPY = ../generic/zgemm_ncopy_4.c
ZGEMMITCOPY = ../generic/zgemm_tcopy_4.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

# TRSM kernels: all four variants (LN, LT, RN, RT) of every prefix use the
# generic C implementations.
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

# GEMM3M kernels: the nehalem sources are used for this target.
CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S
ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S


+ 9
- 6
kernel/x86_64/KERNEL.PILEDRIVER View File

@@ -7,7 +7,7 @@ DAXPYKERNEL = daxpy_bulldozer.S
DDOTKERNEL = ddot_bulldozer.S
DCOPYKERNEL = dcopy_bulldozer.S

SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S
SGEMMKERNEL = sgemm_kernel_16x2_piledriver.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
SGEMMONCOPY = gemm_ncopy_2_bulldozer.S
@@ -16,7 +16,8 @@ SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S

DGEMMKERNEL = dgemm_kernel_8x2_piledriver.S
DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S
DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S
DGEMMONCOPY = gemm_ncopy_2_bulldozer.S
@@ -25,7 +26,8 @@ DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S

CGEMMKERNEL = cgemm_kernel_4x2_piledriver.S
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
@@ -34,7 +36,7 @@ CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S
ZGEMMKERNEL = zgemm_kernel_2x2_piledriver.S
ZGEMMINCOPY =
ZGEMMITCOPY =
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
@@ -52,9 +54,10 @@ STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c


DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S
DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c


+ 1920
- 0
kernel/x86_64/cgemm_kernel_4x2_piledriver.S
File diff suppressed because it is too large
View File


+ 2284
- 0
kernel/x86_64/cgemm_kernel_8x2_haswell.S
File diff suppressed because it is too large
View File


+ 5215
- 0
kernel/x86_64/dgemm_kernel_16x2_haswell.S
File diff suppressed because it is too large
View File


+ 3479
- 0
kernel/x86_64/dgemm_kernel_4x4_haswell.S
File diff suppressed because it is too large
View File


+ 1734
- 0
kernel/x86_64/dgemm_kernel_6x4_piledriver.S
File diff suppressed because it is too large
View File


+ 4523
- 0
kernel/x86_64/dgemm_kernel_8x2_piledriver.S
File diff suppressed because it is too large
View File


+ 5258
- 0
kernel/x86_64/sgemm_kernel_16x2_piledriver.S
File diff suppressed because it is too large
View File


+ 3159
- 0
kernel/x86_64/sgemm_kernel_16x4_haswell.S
File diff suppressed because it is too large
View File


+ 1
- 1
kernel/x86_64/symv_L_sse.S View File

@@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 12)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 12)


+ 1
- 1
kernel/x86_64/symv_L_sse2.S View File

@@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 12)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 12)


+ 1
- 1
kernel/x86_64/symv_U_sse.S View File

@@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 12)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 12)


+ 1
- 1
kernel/x86_64/symv_U_sse2.S View File

@@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 12)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24)


+ 1428
- 0
kernel/x86_64/zgemm_kernel_2x2_piledriver.S
File diff suppressed because it is too large
View File


+ 1812
- 0
kernel/x86_64/zgemm_kernel_4x2_haswell.S
File diff suppressed because it is too large
View File


+ 1
- 1
kernel/x86_64/zsymv_L_sse.S View File

@@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 24)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24)


+ 1
- 1
kernel/x86_64/zsymv_L_sse2.S View File

@@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 24)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24)


+ 1
- 1
kernel/x86_64/zsymv_U_sse.S View File

@@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 24)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24)


+ 1
- 1
kernel/x86_64/zsymv_U_sse2.S View File

@@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 24)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24)


+ 17
- 0
param.h View File

@@ -1154,6 +1154,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#ifdef HASWELL

/* HASWELL defaults, kept once: the committed merge-conflict markers and the
   duplicated side of the conflict are removed. */
#define SNUMOPT 8
#define DNUMOPT 4

#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL

#define SYMV_P 8

#define SWITCH_RATIO 4

#ifdef ARCH_X86

@@ -1233,6 +1246,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_Q 128

/* R defaults.  DGEMM uses the fixed value 13824; the dgemm_r alternative is
   kept commented out.  The committed merge-conflict markers are removed. */
#define SGEMM_DEFAULT_R sgemm_r
//#define DGEMM_DEFAULT_R dgemm_r
#define DGEMM_DEFAULT_R 13824
#define CGEMM_DEFAULT_R cgemm_r
#define ZGEMM_DEFAULT_R zgemm_r


Loading…
Cancel
Save