Browse Source

Merge remote branch 'origin/develop' into haswell

tags/v0.2.9.rc1
wernsaar 12 years ago
parent
commit
4a1575e748
22 changed files with 19247 additions and 6 deletions
  1. +230
    -0
      kernel/generic/gemm_ncopy_6.c
  2. +281
    -0
      kernel/generic/gemm_tcopy_6.c
  3. +138
    -0
      kernel/generic/symm_lcopy_6.c
  4. +136
    -0
      kernel/generic/symm_ucopy_6.c
  5. +484
    -0
      kernel/generic/trmm_lncopy_6.c
  6. +488
    -0
      kernel/generic/trmm_ltcopy_6.c
  7. +785
    -0
      kernel/generic/trmm_uncopy_6.c
  8. +472
    -0
      kernel/generic/trmm_utcopy_6.c
  9. +4
    -0
      kernel/generic/trsm_kernel_LN.c
  10. +4
    -0
      kernel/generic/trsm_kernel_LT.c
  11. +4
    -0
      kernel/generic/trsm_kernel_RN.c
  12. +5
    -0
      kernel/generic/trsm_kernel_RT.c
  13. +326
    -0
      kernel/generic/trsm_lncopy_6.c
  14. +346
    -0
      kernel/generic/trsm_ltcopy_6.c
  15. +350
    -0
      kernel/generic/trsm_uncopy_6.c
  16. +322
    -0
      kernel/generic/trsm_utcopy_6.c
  17. +9
    -6
      kernel/x86_64/KERNEL.PILEDRIVER
  18. +1920
    -0
      kernel/x86_64/cgemm_kernel_4x2_piledriver.S
  19. +1734
    -0
      kernel/x86_64/dgemm_kernel_6x4_piledriver.S
  20. +4523
    -0
      kernel/x86_64/dgemm_kernel_8x2_piledriver.S
  21. +5258
    -0
      kernel/x86_64/sgemm_kernel_16x2_piledriver.S
  22. +1428
    -0
      kernel/x86_64/zgemm_kernel_2x2_piledriver.S

+ 230
- 0
kernel/generic/gemm_ncopy_6.c View File

@@ -0,0 +1,230 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#include <stdio.h>
#include "common.h"

/*
 * Packs an m x n region of `a` into the contiguous buffer `b`.
 *
 * `a` is read as n vectors of m consecutive elements each, with successive
 * vectors starting `lda` elements apart (the columns of a column-major
 * matrix).  The vectors are consumed in groups of four, followed by an
 * optional group of two (n & 2) and an optional single vector (n & 1).
 * Inside a group the output is interleaved: for every element index, one
 * value from each vector of the group is stored back to back in `b`.
 *
 * Within each step all loads are performed before any store.
 *
 * NOTE(review): the file name carries the suffix _6, but the widest group
 * handled here is 4 -- confirm this is what the callers expect.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG groups, steps, k;
  FLOAT *src = a;
  FLOAT *dst = b;
  FLOAT *col0, *col1, *col2, *col3;
  FLOAT tile[16];

  /* ---- groups of four vectors ---- */
  for (groups = (n >> 2); groups > 0; groups--) {
    col0 = src;
    col1 = col0 + lda;
    col2 = col1 + lda;
    col3 = col2 + lda;
    src += 4 * lda;

    /* four elements of each vector per step */
    for (steps = (m >> 2); steps > 0; steps--) {
      for (k = 0; k < 4; k++) {
        tile[4 * k + 0] = col0[k];
        tile[4 * k + 1] = col1[k];
        tile[4 * k + 2] = col2[k];
        tile[4 * k + 3] = col3[k];
      }
      for (k = 0; k < 16; k++) {
        dst[k] = tile[k];
      }
      col0 += 4;
      col1 += 4;
      col2 += 4;
      col3 += 4;
      dst += 16;
    }

    /* leftover elements, one per step */
    for (steps = (m & 3); steps > 0; steps--) {
      tile[0] = *col0++;
      tile[1] = *col1++;
      tile[2] = *col2++;
      tile[3] = *col3++;
      for (k = 0; k < 4; k++) {
        dst[k] = tile[k];
      }
      dst += 4;
    }
  }

  /* ---- one group of two vectors ---- */
  if (n & 2) {
    col0 = src;
    col1 = col0 + lda;
    src += 2 * lda;

    for (steps = (m >> 2); steps > 0; steps--) {
      for (k = 0; k < 4; k++) {
        tile[2 * k + 0] = col0[k];
        tile[2 * k + 1] = col1[k];
      }
      for (k = 0; k < 8; k++) {
        dst[k] = tile[k];
      }
      col0 += 4;
      col1 += 4;
      dst += 8;
    }

    for (steps = (m & 3); steps > 0; steps--) {
      tile[0] = *col0++;
      tile[1] = *col1++;
      dst[0] = tile[0];
      dst[1] = tile[1];
      dst += 2;
    }
  }

  /* ---- one single vector: a straight copy ---- */
  if (n & 1) {
    col0 = src;

    for (steps = (m >> 2); steps > 0; steps--) {
      for (k = 0; k < 4; k++) {
        tile[k] = col0[k];
      }
      for (k = 0; k < 4; k++) {
        dst[k] = tile[k];
      }
      col0 += 4;
      dst += 4;
    }

    for (steps = (m & 3); steps > 0; steps--) {
      tile[0] = *col0++;
      *dst++ = tile[0];
    }
  }

  return 0;
}

+ 281
- 0
kernel/generic/gemm_tcopy_6.c View File

@@ -0,0 +1,281 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#include <stdio.h>
#include "common.h"

/*
 * Packs an m x n region of `a` into `b` as 4-wide tiles.
 *
 * `a` is read as m vectors of n consecutive elements each, with successive
 * vectors starting `lda` elements apart.  The vectors are processed four at
 * a time, then an optional pair (m & 2), then an optional single one (m & 1).
 *
 * Output layout in `b`:
 *   - elements 4t .. 4t+3 of every vector go to the region that starts at
 *     b + t * 4 * m; inside that region each vector contributes four
 *     consecutive values, in vector order;
 *   - when n & 2, the next two elements of every vector are stored from
 *     b + m * (n & ~3) onwards, two values per vector;
 *   - when n & 1, the last element of every vector is stored from
 *     b + m * (n & ~1) onwards, one value per vector.
 *
 * Within each step all loads are performed before any store.
 *
 * NOTE(review): the file name carries the suffix _6, but the tile width
 * used here is 4 -- confirm this is what the callers expect.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){

  BLASLONG strips, tiles, k;
  FLOAT *src = a;
  FLOAT *quad_base = b;                    /* start of the 4-wide tiles   */
  FLOAT *pair_out = b + m * (n & ~3);      /* destination for the n&2 part */
  FLOAT *single_out = b + m * (n & ~1);    /* destination for the n&1 part */
  FLOAT *row0, *row1, *row2, *row3;
  FLOAT *out;
  FLOAT tile[16];

  /* ---- four vectors at a time ---- */
  for (strips = (m >> 2); strips > 0; strips--) {
    row0 = src;
    row1 = row0 + lda;
    row2 = row1 + lda;
    row3 = row2 + lda;
    src += 4 * lda;

    out = quad_base;
    quad_base += 16;

    for (tiles = (n >> 2); tiles > 0; tiles--) {
      for (k = 0; k < 4; k++) {
        tile[ 0 + k] = row0[k];
        tile[ 4 + k] = row1[k];
        tile[ 8 + k] = row2[k];
        tile[12 + k] = row3[k];
      }
      row0 += 4;
      row1 += 4;
      row2 += 4;
      row3 += 4;

      for (k = 0; k < 16; k++) {
        out[k] = tile[k];
      }
      out += m * 4;
    }

    if (n & 2) {
      tile[0] = row0[0];
      tile[1] = row0[1];
      tile[2] = row1[0];
      tile[3] = row1[1];
      tile[4] = row2[0];
      tile[5] = row2[1];
      tile[6] = row3[0];
      tile[7] = row3[1];
      row0 += 2;
      row1 += 2;
      row2 += 2;
      row3 += 2;

      for (k = 0; k < 8; k++) {
        pair_out[k] = tile[k];
      }
      pair_out += 8;
    }

    if (n & 1) {
      tile[0] = row0[0];
      tile[1] = row1[0];
      tile[2] = row2[0];
      tile[3] = row3[0];

      for (k = 0; k < 4; k++) {
        single_out[k] = tile[k];
      }
      single_out += 4;
    }
  }

  /* ---- one pair of vectors ---- */
  if (m & 2) {
    row0 = src;
    row1 = row0 + lda;
    src += 2 * lda;

    out = quad_base;
    quad_base += 8;

    for (tiles = (n >> 2); tiles > 0; tiles--) {
      for (k = 0; k < 4; k++) {
        tile[0 + k] = row0[k];
        tile[4 + k] = row1[k];
      }
      row0 += 4;
      row1 += 4;

      for (k = 0; k < 8; k++) {
        out[k] = tile[k];
      }
      out += m * 4;
    }

    if (n & 2) {
      tile[0] = row0[0];
      tile[1] = row0[1];
      tile[2] = row1[0];
      tile[3] = row1[1];
      row0 += 2;
      row1 += 2;

      for (k = 0; k < 4; k++) {
        pair_out[k] = tile[k];
      }
      pair_out += 4;
    }

    if (n & 1) {
      tile[0] = row0[0];
      tile[1] = row1[0];

      single_out[0] = tile[0];
      single_out[1] = tile[1];
      single_out += 2;
    }
  }

  /* ---- one single vector ---- */
  if (m & 1) {
    row0 = src;
    out = quad_base;

    for (tiles = (n >> 2); tiles > 0; tiles--) {
      for (k = 0; k < 4; k++) {
        tile[k] = row0[k];
      }
      row0 += 4;

      for (k = 0; k < 4; k++) {
        out[k] = tile[k];
      }
      out += 4 * m;
    }

    if (n & 2) {
      tile[0] = row0[0];
      tile[1] = row0[1];
      row0 += 2;

      pair_out[0] = tile[0];
      pair_out[1] = tile[1];
    }

    if (n & 1) {
      tile[0] = row0[0];
      single_out[0] = tile[0];
    }
  }

  return 0;
}

+ 138
- 0
kernel/generic/symm_lcopy_6.c View File

@@ -0,0 +1,138 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#include <stdio.h>
#include "common.h"

/*
 * Packs n output lanes of m values each from `a` into `b`, interleaved in
 * groups of four lanes, then an optional group of two (n & 2) and an
 * optional single lane (n & 1).  For every step, one value per lane of the
 * current group is stored back to back in `b`.
 *
 * Lane k of a group tracks the running value d = posX - posY, which is
 * decremented after every step:
 *   - while d > -k the lane starts at a + posX + k + posY * lda and
 *     advances by lda;
 *   - once d <= -k the lane advances by 1 (a lane for which this already
 *     holds at the start begins at a + posY + (posX + k) * lda).
 * Presumably this makes every access land in a single triangle of a
 * symmetric matrix -- confirm against the callers.
 *
 * posX is advanced by the group width after each group of four or two.
 * Within each step all loads are performed before any store.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){

  BLASLONG steps, groups, dist, k;
  FLOAT *lane[4];
  FLOAT val[4];

  /* ---- groups of four lanes ---- */
  for (groups = (n >> 2); groups > 0; groups--) {

    dist = posX - posY;
    for (k = 0; k < 4; k++) {
      if (dist > -k) {
        lane[k] = a + posX + k + posY * lda;
      } else {
        lane[k] = a + posY + (posX + k) * lda;
      }
    }

    for (steps = m; steps > 0; steps--) {
      for (k = 0; k < 4; k++) {
        val[k] = *lane[k];
        if (dist > -k) {
          lane[k] += lda;
        } else {
          lane[k] ++;
        }
      }

      for (k = 0; k < 4; k++) {
        b[k] = val[k];
      }
      b += 4;

      dist --;
    }

    posX += 4;
  }

  /* ---- one group of two lanes ---- */
  if (n & 2) {

    dist = posX - posY;
    for (k = 0; k < 2; k++) {
      if (dist > -k) {
        lane[k] = a + posX + k + posY * lda;
      } else {
        lane[k] = a + posY + (posX + k) * lda;
      }
    }

    for (steps = m; steps > 0; steps--) {
      for (k = 0; k < 2; k++) {
        val[k] = *lane[k];
        if (dist > -k) {
          lane[k] += lda;
        } else {
          lane[k] ++;
        }
      }

      b[0] = val[0];
      b[1] = val[1];
      b += 2;

      dist --;
    }

    posX += 2;
  }

  /* ---- one single lane ---- */
  if (n & 1) {

    dist = posX - posY;
    if (dist > 0) {
      lane[0] = a + posX + 0 + posY * lda;
    } else {
      lane[0] = a + posY + (posX + 0) * lda;
    }

    for (steps = m; steps > 0; steps--) {
      val[0] = *lane[0];
      if (dist > 0) {
        lane[0] += lda;
      } else {
        lane[0] ++;
      }

      *b++ = val[0];

      dist --;
    }
  }

  return 0;
}

+ 136
- 0
kernel/generic/symm_ucopy_6.c View File

@@ -0,0 +1,136 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#include <stdio.h>
#include "common.h"

/*
 * Packs n output lanes of m values each from `a` into `b`, interleaved in
 * groups of four lanes, then an optional group of two (n & 2) and an
 * optional single lane (n & 1).  For every step, one value per lane of the
 * current group is stored back to back in `b`.
 *
 * Lane k of a group tracks the running value d = posX - posY, which is
 * decremented after every step:
 *   - while d > -k the lane starts at a + posY + (posX + k) * lda and
 *     advances by 1;
 *   - once d <= -k the lane advances by lda (a lane for which this already
 *     holds at the start begins at a + posX + k + posY * lda).
 * Presumably this makes every access land in a single triangle of a
 * symmetric matrix -- confirm against the callers.
 *
 * posX is advanced by the group width after each group of four or two.
 * Within each step all loads are performed before any store.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){

  BLASLONG steps, groups, dist, k;
  FLOAT *lane[4];
  FLOAT val[4];

  /* ---- groups of four lanes ---- */
  for (groups = (n >> 2); groups > 0; groups--) {

    dist = posX - posY;
    for (k = 0; k < 4; k++) {
      if (dist > -k) {
        lane[k] = a + posY + (posX + k) * lda;
      } else {
        lane[k] = a + posX + k + posY * lda;
      }
    }

    for (steps = m; steps > 0; steps--) {
      for (k = 0; k < 4; k++) {
        val[k] = *lane[k];
        if (dist > -k) {
          lane[k] ++;
        } else {
          lane[k] += lda;
        }
      }

      for (k = 0; k < 4; k++) {
        b[k] = val[k];
      }
      b += 4;

      dist --;
    }

    posX += 4;
  }

  /* ---- one group of two lanes ---- */
  if (n & 2) {

    dist = posX - posY;
    for (k = 0; k < 2; k++) {
      if (dist > -k) {
        lane[k] = a + posY + (posX + k) * lda;
      } else {
        lane[k] = a + posX + k + posY * lda;
      }
    }

    for (steps = m; steps > 0; steps--) {
      for (k = 0; k < 2; k++) {
        val[k] = *lane[k];
        if (dist > -k) {
          lane[k] ++;
        } else {
          lane[k] += lda;
        }
      }

      b[0] = val[0];
      b[1] = val[1];
      b += 2;

      dist --;
    }

    posX += 2;
  }

  /* ---- one single lane ---- */
  if (n & 1) {

    dist = posX - posY;
    if (dist > 0) {
      lane[0] = a + posY + (posX + 0) * lda;
    } else {
      lane[0] = a + posX + 0 + posY * lda;
    }

    for (steps = m; steps > 0; steps--) {
      val[0] = *lane[0];
      if (dist > 0) {
        lane[0] ++;
      } else {
        lane[0] += lda;
      }

      *b++ = val[0];

      dist --;
    }
  }

  return 0;
}

+ 484
- 0
kernel/generic/trmm_lncopy_6.c View File

@@ -0,0 +1,484 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#include <stdio.h>
#include "common.h"

/*
 * Packs a region of `a` that may cross the diagonal into the buffer `b`.
 *
 * The work is split into groups of four source vectors (n >> 2 times),
 * then an optional group of two (n & 2) and an optional single vector
 * (n & 1).  posY is advanced by the group width after each group.  Inside
 * a group, X starts at posX and is advanced together with the element
 * index; each step compares X with posY and takes one of three paths:
 *
 *   X >  posY : the values are copied, interleaved so that one value from
 *               each vector of the group is stored per element index;
 *   X <  posY : nothing is stored; `b` is only advanced, so that part of
 *               the buffer keeps whatever it held before the call;
 *   X == posY : a tile with explicit ZERO entries is stored; when UNIT is
 *               defined the diagonal entries are written as ONE and the
 *               corresponding elements of `a` are not read.
 *
 * The source pointers start at a + posY + (posX + k) * lda when
 * posX <= posY and at a + posX + (posY + k) * lda otherwise.
 *
 * NOTE(review): the file name carries the suffix _6, but the widest group
 * handled here is 4 -- confirm this is what the callers expect.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){

BLASLONG i, js;
BLASLONG X;

FLOAT data01, data02, data03, data04, data05, data06, data07, data08;
FLOAT data09, data10, data11, data12, data13, data14, data15, data16;
FLOAT *ao1, *ao2, *ao3, *ao4;

/* ---- groups of four vectors ---- */
js = (n >> 2);

if (js > 0){
do {
X = posX;

if (posX <= posY) {
ao1 = a + posY + (posX + 0) * lda;
ao2 = a + posY + (posX + 1) * lda;
ao3 = a + posY + (posX + 2) * lda;
ao4 = a + posY + (posX + 3) * lda;
} else {
ao1 = a + posX + (posY + 0) * lda;
ao2 = a + posX + (posY + 1) * lda;
ao3 = a + posX + (posY + 2) * lda;
ao4 = a + posX + (posY + 3) * lda;
}

/* Full steps: four element indices per iteration, 16 slots of b each. */
i = (m >> 2);
if (i > 0) {
do {
if (X > posY) {
/* Plain copy: b[4 * r + c] receives element r of vector c. */
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
data05 = *(ao2 + 0);
data06 = *(ao2 + 1);
data07 = *(ao2 + 2);
data08 = *(ao2 + 3);
data09 = *(ao3 + 0);
data10 = *(ao3 + 1);
data11 = *(ao3 + 2);
data12 = *(ao3 + 3);
data13 = *(ao4 + 0);
data14 = *(ao4 + 1);
data15 = *(ao4 + 2);
data16 = *(ao4 + 3);
b[ 0] = data01;
b[ 1] = data05;
b[ 2] = data09;
b[ 3] = data13;
b[ 4] = data02;
b[ 5] = data06;
b[ 6] = data10;
b[ 7] = data14;
b[ 8] = data03;
b[ 9] = data07;
b[10] = data11;
b[11] = data15;
b[12] = data04;
b[13] = data08;
b[14] = data12;
b[15] = data16;
ao1 += 4;
ao2 += 4;
ao3 += 4;
ao4 += 4;
b += 16;

} else
if (X < posY) {
/* Nothing is read or stored here; the 16 slots of b are skipped. */
ao1 += 4 * lda;
ao2 += 4 * lda;
ao3 += 4 * lda;
ao4 += 4 * lda;
b += 16;

} else {
/* X == posY: b[4 * r + c] is ZERO for c > r; entries with c == r are
   ONE under UNIT and are read from a otherwise. */
#ifdef UNIT
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
data07 = *(ao2 + 2);
data08 = *(ao2 + 3);
data12 = *(ao3 + 3);

b[ 0] = ONE;
b[ 1] = ZERO;
b[ 2] = ZERO;
b[ 3] = ZERO;
b[ 4] = data02;
b[ 5] = ONE;
b[ 6] = ZERO;
b[ 7] = ZERO;
b[ 8] = data03;
b[ 9] = data07;
b[10] = ONE;
b[11] = ZERO;
b[12] = data04;
b[13] = data08;
b[14] = data12;
b[15] = ONE;
#else
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
data06 = *(ao2 + 1);
data07 = *(ao2 + 2);
data08 = *(ao2 + 3);
data11 = *(ao3 + 2);
data12 = *(ao3 + 3);
data16 = *(ao4 + 3);
b[ 0] = data01;
b[ 1] = ZERO;
b[ 2] = ZERO;
b[ 3] = ZERO;
b[ 4] = data02;
b[ 5] = data06;
b[ 6] = ZERO;
b[ 7] = ZERO;
b[ 8] = data03;
b[ 9] = data07;
b[10] = data11;
b[11] = ZERO;
b[12] = data04;
b[13] = data08;
b[14] = data12;
b[15] = data16;
#endif
ao1 += 4;
ao2 += 4;
ao3 += 4;
ao4 += 4;
b += 16;
}

X += 4;
i --;
} while (i > 0);
}

/* Leftover element indices (1 to 3) of this group. */
i = (m & 3);
if (i) {
if (X > posY) {

if (m & 2) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
data03 = *(ao2 + 0);
data04 = *(ao2 + 1);
data05 = *(ao3 + 0);
data06 = *(ao3 + 1);
data07 = *(ao4 + 0);
data08 = *(ao4 + 1);

b[ 0] = data01;
b[ 1] = data03;
b[ 2] = data05;
b[ 3] = data07;
b[ 4] = data02;
b[ 5] = data04;
b[ 6] = data06;
b[ 7] = data08;

ao1 += 2;
ao2 += 2;
ao3 += 2;
ao4 += 2;
b += 8;
}
if (m & 1) {
data01 = *(ao1 + 0);
data02 = *(ao2 + 0);
data03 = *(ao3 + 0);
data04 = *(ao4 + 0);

b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
b[ 3] = data04;
ao1 += 1;
ao2 += 1;
ao3 += 1;
ao4 += 1;
b += 4;
}
} else
if (X < posY) {
/* Skipped region: b is advanced without being written.  Only ao1/ao2
   are moved; the pointers are re-derived at the top of the next group. */
if (m & 2) {
ao1 += 2 * lda;
ao2 += 2 * lda;
b += 8;
}
if (m & 1) {
ao1 += lda;
b += 4;
}
} else {
/* X == posY with fewer than four element indices left.  Loads beyond the
   first are guarded by i so that only elements 0 .. i-1 of each vector
   are touched.
   NOTE(review): here the ZERO entries sit at positions c < r of each
   stored group of four, whereas the full-step diagonal tile above zeroes
   c > r -- confirm this asymmetry is intended. */
#ifdef UNIT
data05 = *(ao2 + 0);
data09 = *(ao3 + 0);
data13 = *(ao4 + 0);

if (i >= 2) {
data10 = *(ao3 + 1);
data14 = *(ao4 + 1);
}

if (i >= 3) {
data15 = *(ao4 + 2);
}
b[ 0] = ONE;
b[ 1] = data05;
b[ 2] = data09;
b[ 3] = data13;
b += 4;
if(i >= 2) {
b[ 0] = ZERO;
b[ 1] = ONE;
b[ 2] = data10;
b[ 3] = data14;
b += 4;
}
if (i >= 3) {
b[ 0] = ZERO;
b[ 1] = ZERO;
b[ 2] = ONE;
b[ 3] = data15;
b += 4;
}
#else
data01 = *(ao1 + 0);
data05 = *(ao2 + 0);
data09 = *(ao3 + 0);
data13 = *(ao4 + 0);

if (i >= 2) {
data06 = *(ao2 + 1);
data10 = *(ao3 + 1);
data14 = *(ao4 + 1);
}

if (i >= 3) {
data11 = *(ao3 + 2);
data15 = *(ao4 + 2);
}
b[ 0] = data01;
b[ 1] = data05;
b[ 2] = data09;
b[ 3] = data13;
b += 4;
if(i >= 2) {
b[ 0] = ZERO;
b[ 1] = data06;
b[ 2] = data10;
b[ 3] = data14;
b += 4;
}
if (i >= 3) {
b[ 0] = ZERO;
b[ 1] = ZERO;
b[ 2] = data11;
b[ 3] = data15;
b += 4;
}
#endif
}
}

posY += 4;
js --;
} while (js > 0);
} /* End of main loop */


/* ---- one group of two vectors: same three paths, two element indices
   per full step ---- */
if (n & 2){
X = posX;

if (posX <= posY) {
ao1 = a + posY + (posX + 0) * lda;
ao2 = a + posY + (posX + 1) * lda;
} else {
ao1 = a + posX + (posY + 0) * lda;
ao2 = a + posX + (posY + 1) * lda;
}

i = (m >> 1);
if (i > 0) {
do {
if (X > posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
data05 = *(ao2 + 0);
data06 = *(ao2 + 1);
b[ 0] = data01;
b[ 1] = data05;
b[ 2] = data02;
b[ 3] = data06;

ao1 += 2;
ao2 += 2;
b += 4;

} else
if (X < posY) {
/* Skipped: four slots of b are left unwritten. */
ao1 += 2 * lda;
ao2 += 2 * lda;
b += 4;
} else {
/* X == posY: b[1] is ZERO; b[0] and b[3] are ONE under UNIT. */
#ifdef UNIT
data02 = *(ao1 + 1);

b[ 0] = ONE;
b[ 1] = ZERO;
b[ 2] = data02;
b[ 3] = ONE;
#else
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
data06 = *(ao2 + 1);

b[ 0] = data01;
b[ 1] = ZERO;
b[ 2] = data02;
b[ 3] = data06;
#endif
ao1 += 2;
ao2 += 2;
b += 4;
}

X += 2;
i --;
} while (i > 0);
}

/* Single leftover element index of the two-vector group. */
i = (m & 1);
if (i) {
if (X > posY) {
data01 = *(ao1 + 0);
data02 = *(ao2 + 0);
b[ 0] = data01;
b[ 1] = data02;

ao1 += 1;
ao2 += 1;
b += 2;
} else
if (X < posY) {
ao1 += lda;
b += 2;
} else {
#ifdef UNIT
data05 = *(ao2 + 0);

b[ 0] = ONE;
b[ 1] = data05;
#else
data01 = *(ao1 + 0);
data05 = *(ao2 + 0);

b[ 0] = data01;
b[ 1] = data05;
#endif
b += 2;
}
}
posY += 2;
}

/* ---- one single vector: one element index per step ---- */
if (n & 1){
X = posX;

if (posX <= posY) {
ao1 = a + posY + (posX + 0) * lda;
} else {
ao1 = a + posX + (posY + 0) * lda;
}

i = m;
if (i > 0) {
do {
if (X > posY) {
data01 = *(ao1 + 0);
b[ 0] = data01;
b += 1;
ao1 += 1;
} else
if (X < posY) {
/* Skipped: the slot of b is left unwritten. */
b += 1;
ao1 += lda;
} else {
/* X == posY: ONE under UNIT, otherwise the element read from a. */
#ifdef UNIT
b[ 0] = ONE;
#else
data01 = *(ao1 + 0);
b[ 0] = data01;
#endif
b += 1;
ao1 += 1;
}

X ++;
i --;
} while (i > 0);
}

posY += 1;
}

return 0;
}

+ 488
- 0
kernel/generic/trmm_ltcopy_6.c View File

@@ -0,0 +1,488 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#include <stdio.h>
#include "common.h"

/*
 * Writes one block of a packed panel.
 *
 * src   : first source line of the block; line r starts at src + r * lda
 * X     : running position of the block, compared against posY
 * width : number of contiguous elements taken from every line
 * rows  : number of lines in this block (1 .. width)
 * b     : destination; line r is stored at b[r * width .. r * width + width - 1]
 *
 * X >  posY : nothing is stored (the caller still advances b).
 * X <  posY : every line is copied unchanged.
 * X == posY : element c of line r becomes ZERO for c < r, the source value
 *             for c > r, and for c == r either ONE (UNIT defined) or the
 *             source value.
 */
static void trmm_lt_pack_block(FLOAT *src, BLASLONG lda, BLASLONG X, BLASLONG posY,
                               BLASLONG width, BLASLONG rows, FLOAT *b)
{
  BLASLONG r, c;

  if (X > posY) return;

  for (r = 0; r < rows; r++) {
    FLOAT *line = src + r * lda;
    FLOAT *out  = b + r * width;

    for (c = 0; c < width; c++) {
      if (X < posY || c > r) {
        out[c] = line[c];
      } else if (c < r) {
        out[c] = ZERO;
      } else {
#ifdef UNIT
        out[c] = ONE;
#else
        out[c] = line[c];
#endif
      }
    }
  }
}

/*
 * Packs one panel that is (1 << shift) elements wide and returns the
 * destination pointer advanced past everything belonging to the panel.
 *
 * The panel is walked in (m >> shift) full blocks followed by one partial
 * block of m & (width - 1) lines.  The source pointer moves by `width`
 * lines after a block with X < posY and by `width` elements otherwise,
 * exactly like the four (or two, or one) pointers of the unrolled form.
 */
static FLOAT *trmm_lt_pack_panel(BLASLONG m, FLOAT *a, BLASLONG lda,
                                 BLASLONG posX, BLASLONG posY,
                                 int shift, FLOAT *b)
{
  const BLASLONG width = (BLASLONG)1 << shift;
  const BLASLONG tail  = m & (width - 1);
  BLASLONG X = posX;
  BLASLONG blocks;
  FLOAT *src;

  if (posX <= posY) {
    src = a + posY + posX * lda;
  } else {
    src = a + posX + posY * lda;
  }

  for (blocks = (m >> shift); blocks > 0; blocks--) {
    trmm_lt_pack_block(src, lda, X, posY, width, width, b);

    src += (X < posY) ? width * lda : width;
    b   += width * width;
    X   += width;
  }

  if (tail) {
    /* The leftover lines share a single X / posY comparison. */
    trmm_lt_pack_block(src, lda, X, posY, width, tail, b);
    b += tail * width;
  }

  return b;
}

/*
 * Packs a triangular region of a into b: n is consumed in panels of four,
 * then two, then one, and within each panel m lines are emitted.  posX and
 * posY locate the region relative to the diagonal; posY grows by the panel
 * width after every panel.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){

  BLASLONG panels;

  for (panels = (n >> 2); panels > 0; panels--) {
    b = trmm_lt_pack_panel(m, a, lda, posX, posY, 2, b);
    posY += 4;
  }

  if (n & 2) {
    b = trmm_lt_pack_panel(m, a, lda, posX, posY, 1, b);
    posY += 2;
  }

  if (n & 1) {
    (void)trmm_lt_pack_panel(m, a, lda, posX, posY, 0, b);
  }

  return 0;
}

+ 785
- 0
kernel/generic/trmm_uncopy_6.c View File

@@ -0,0 +1,785 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#include <stdio.h>
#include "common.h"

/*
 * Packs one panel of `width` columns and returns the destination pointer
 * advanced past the m * width elements that belong to the panel.
 *
 * The m rows are walked in blocks of `width` rows (the last block may be
 * shorter).  Row r of a block is stored at b[r * width .. r * width + width - 1]
 * and column c of that row is taken from a[X + r + (posY + c) * lda].
 *
 * X <  posY : the block is copied unchanged.
 * X == posY : element c of row r becomes ZERO for c < r, the source value
 *             for c > r, and for c == r either ONE (UNIT defined) or the
 *             source value.
 * X >  posY : the source is not read; the block is filled with ZERO when
 *             zero_skipped is non-zero and left untouched otherwise.
 */
static FLOAT *trmm_un_pack_panel(BLASLONG m, FLOAT *a, BLASLONG lda,
                                 BLASLONG posX, BLASLONG posY,
                                 BLASLONG width, int zero_skipped, FLOAT *b)
{
  BLASLONG X = posX;
  BLASLONG left = m;

  while (left > 0) {
    BLASLONG rows = (left < width) ? left : width;
    BLASLONG r, c;

    if (X < posY) {
      FLOAT *src = a + X + posY * lda;

      for (r = 0; r < rows; r++) {
        for (c = 0; c < width; c++) {
          b[r * width + c] = src[r + c * lda];
        }
      }
    } else if (X > posY) {
      if (zero_skipped) {
        for (r = 0; r < rows * width; r++) {
          b[r] = ZERO;
        }
      }
    } else {
      FLOAT *src = a + X + posY * lda;

      for (r = 0; r < rows; r++) {
        for (c = 0; c < width; c++) {
          if (c < r) {
            b[r * width + c] = ZERO;
          } else if (c > r) {
            b[r * width + c] = src[r + c * lda];
          } else {
#ifdef UNIT
            b[r * width + c] = ONE;
#else
            b[r * width + c] = src[r + c * lda];
#endif
          }
        }
      }
    }

    b    += rows * width;
    X    += width;
    left -= rows;
  }

  return b;
}

/*
 * Packs a triangular region of a into b using 6-wide column panels.
 *
 * m, n       : number of rows emitted per panel / number of columns packed
 * a, lda     : source matrix and the stride between its columns
 * posX, posY : position of the region relative to the diagonal; posY grows
 *              by the panel width after every panel
 * b          : destination buffer, receives m * n elements
 *
 * n is consumed as n / 6 panels of six columns; the remaining n % 6 columns
 * are packed as panels of four, two and one.
 * NOTE(review): the 4/2/1 split of the remainder assumes the consuming
 * kernel walks the leftover columns the same way -- confirm against it.
 *
 * Skipped blocks (X > posY) are zero-filled in the 6-wide panels, as the
 * full 6x6 blocks always were; narrower panels leave them untouched.
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){

  BLASLONG js, rest;

  if (m <= 0 || n <= 0) return 0;

  for (js = n / 6; js > 0; js--) {
    b = trmm_un_pack_panel(m, a, lda, posX, posY, 6, 1, b);
    posY += 6;
  }

  rest = n % 6;

  if (rest & 4) {
    b = trmm_un_pack_panel(m, a, lda, posX, posY, 4, 0, b);
    posY += 4;
  }

  if (rest & 2) {
    b = trmm_un_pack_panel(m, a, lda, posX, posY, 2, 0, b);
    posY += 2;
  }

  if (rest & 1) {
    (void)trmm_un_pack_panel(m, a, lda, posX, posY, 1, 0, b);
  }

  return 0;
}

+ 472
- 0
kernel/generic/trmm_utcopy_6.c View File

@@ -0,0 +1,472 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#include <stdio.h>
#include "common.h"

/*
 * Writes one block of a packed panel.
 *
 * src   : first source line of the block; line r starts at src + r * lda
 * X     : running position of the block, compared against posY
 * width : number of contiguous elements taken from every line
 * rows  : number of lines in this block (1 .. width)
 * b     : destination; line r is stored at b[r * width .. r * width + width - 1]
 *
 * X <  posY : nothing is stored (the caller still advances b).
 * X >  posY : every line is copied unchanged.
 * X == posY : element c of line r becomes the source value for c < r, ZERO
 *             for c > r, and for c == r either ONE (UNIT defined) or the
 *             source value.
 */
static void trmm_ut_pack_block(FLOAT *src, BLASLONG lda, BLASLONG X, BLASLONG posY,
                               BLASLONG width, BLASLONG rows, FLOAT *b)
{
  BLASLONG r, c;

  if (X < posY) return;

  for (r = 0; r < rows; r++) {
    FLOAT *line = src + r * lda;
    FLOAT *out  = b + r * width;

    for (c = 0; c < width; c++) {
      if (X > posY || c < r) {
        out[c] = line[c];
      } else if (c > r) {
        out[c] = ZERO;
      } else {
#ifdef UNIT
        out[c] = ONE;
#else
        out[c] = line[c];
#endif
      }
    }
  }
}

/*
 * Packs one panel that is (1 << shift) elements wide and returns the
 * destination pointer advanced past everything belonging to the panel.
 *
 * The panel is walked in (m >> shift) full blocks followed by one partial
 * block of m & (width - 1) lines.  The source pointer moves by `width`
 * elements after a block with X < posY and by `width` lines otherwise,
 * exactly like the pointers of the unrolled form.
 */
static FLOAT *trmm_ut_pack_panel(BLASLONG m, FLOAT *a, BLASLONG lda,
                                 BLASLONG posX, BLASLONG posY,
                                 int shift, FLOAT *b)
{
  const BLASLONG width = (BLASLONG)1 << shift;
  const BLASLONG tail  = m & (width - 1);
  BLASLONG X = posX;
  BLASLONG blocks;
  FLOAT *src;

  if (posX <= posY) {
    src = a + posX + posY * lda;
  } else {
    src = a + posY + posX * lda;
  }

  for (blocks = (m >> shift); blocks > 0; blocks--) {
    trmm_ut_pack_block(src, lda, X, posY, width, width, b);

    src += (X < posY) ? width : width * lda;
    b   += width * width;
    X   += width;
  }

  if (tail) {
    /* The leftover lines share a single X / posY comparison. */
    trmm_ut_pack_block(src, lda, X, posY, width, tail, b);
    b += tail * width;
  }

  return b;
}

/*
 * Packs a triangular region of a into b: n is consumed in panels of four,
 * then two, then one, and within each panel m lines are emitted.  posX and
 * posY locate the region relative to the diagonal; posY grows by the panel
 * width after every panel.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){

  BLASLONG panels;

  for (panels = (n >> 2); panels > 0; panels--) {
    b = trmm_ut_pack_panel(m, a, lda, posX, posY, 2, b);
    posY += 4;
  }

  if (n & 2) {
    b = trmm_ut_pack_panel(m, a, lda, posX, posY, 1, b);
    posY += 2;
  }

  if (n & 1) {
    (void)trmm_ut_pack_panel(m, a, lda, posX, posY, 0, b);
  }

  return 0;
}

+ 4
- 0
kernel/generic/trsm_kernel_LN.c View File

@@ -58,6 +58,10 @@ static FLOAT dm1 = -1.;
#define GEMM_UNROLL_M_SHIFT 2
#endif

/* Selects shift 2 when the M unroll factor is 6.
 * NOTE(review): 6 is not a power of two, so (1 << GEMM_UNROLL_M_SHIFT) is 4
 * here, not GEMM_DEFAULT_UNROLL_M -- confirm the users of the shift allow that. */
#if GEMM_DEFAULT_UNROLL_M == 6
#define GEMM_UNROLL_M_SHIFT 2
#endif

#if GEMM_DEFAULT_UNROLL_M == 8
#define GEMM_UNROLL_M_SHIFT 3
#endif


+ 4
- 0
kernel/generic/trsm_kernel_LT.c View File

@@ -58,6 +58,10 @@ static FLOAT dm1 = -1.;
#define GEMM_UNROLL_M_SHIFT 2
#endif

/* Selects shift 2 when the M unroll factor is 6.
 * NOTE(review): 6 is not a power of two, so (1 << GEMM_UNROLL_M_SHIFT) is 4
 * here, not GEMM_DEFAULT_UNROLL_M -- confirm the users of the shift allow that. */
#if GEMM_DEFAULT_UNROLL_M == 6
#define GEMM_UNROLL_M_SHIFT 2
#endif

#if GEMM_DEFAULT_UNROLL_M == 8
#define GEMM_UNROLL_M_SHIFT 3
#endif


+ 4
- 0
kernel/generic/trsm_kernel_RN.c View File

@@ -58,6 +58,10 @@ static FLOAT dm1 = -1.;
#define GEMM_UNROLL_M_SHIFT 2
#endif

/* Selects shift 2 when the M unroll factor is 6.
 * NOTE(review): 6 is not a power of two, so (1 << GEMM_UNROLL_M_SHIFT) is 4
 * here, not GEMM_DEFAULT_UNROLL_M -- confirm the users of the shift allow that. */
#if GEMM_DEFAULT_UNROLL_M == 6
#define GEMM_UNROLL_M_SHIFT 2
#endif

#if GEMM_DEFAULT_UNROLL_M == 8
#define GEMM_UNROLL_M_SHIFT 3
#endif


+ 5
- 0
kernel/generic/trsm_kernel_RT.c View File

@@ -58,6 +58,11 @@ static FLOAT dm1 = -1.;
#define GEMM_UNROLL_M_SHIFT 2
#endif

/* Selects shift 2 when the M unroll factor is 6.
 * NOTE(review): 6 is not a power of two, so (1 << GEMM_UNROLL_M_SHIFT) is 4
 * here, not GEMM_DEFAULT_UNROLL_M -- confirm the users of the shift allow that. */
#if GEMM_DEFAULT_UNROLL_M == 6
#define GEMM_UNROLL_M_SHIFT 2
#endif


#if GEMM_DEFAULT_UNROLL_M == 8
#define GEMM_UNROLL_M_SHIFT 3
#endif


+ 326
- 0
kernel/generic/trsm_lncopy_6.c View File

@@ -0,0 +1,326 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#include <stdio.h>
#include "common.h"

/* INV(a) is the reciprocal ONE / a of a diagonal entry.  When UNIT is
 * defined it is the constant ONE and the argument is not evaluated. */
#ifndef UNIT
#define INV(a) (ONE / (a))
#else
#define INV(a) (ONE)
#endif

/*
 * Writes one block of `rows` rows of a `width`-column panel.
 *
 * col : top of the block in the first panel column; column c starts at
 *       col + c * lda
 * ii  : row position of the block, compared against jj
 * b   : destination; row r is stored at b[r * width .. r * width + width - 1]
 *
 * ii == jj : for every row r only the entries c <= r are stored -- the
 *            source value for c < r and INV(source) for c == r; entries
 *            with c > r are left untouched.
 * ii >  jj : every row is stored in full.
 * ii <  jj : nothing is stored.
 */
static void trsm_ln_pack_block(FLOAT *col, BLASLONG lda, BLASLONG ii, BLASLONG jj,
                               BLASLONG width, BLASLONG rows, FLOAT *b)
{
  BLASLONG r, c;

  if (ii < jj) return;

  for (r = 0; r < rows; r++) {
    FLOAT *out = b + r * width;

    if (ii == jj) {
      for (c = 0; c < r; c++) {
        out[c] = col[c * lda + r];
      }
      out[r] = INV(col[r * lda + r]);
    } else {
      for (c = 0; c < width; c++) {
        out[c] = col[c * lda + r];
      }
    }
  }
}

/*
 * Packs one panel of (1 << shift) columns starting at a and returns the
 * destination pointer advanced past the panel.
 *
 * The m rows are walked as (m >> shift) full blocks, followed by one block
 * for every set bit of m below the panel width (width / 2 rows, ..., one
 * row).  Each block carries out its own ii / jj comparison.
 */
static FLOAT *trsm_ln_pack_panel(BLASLONG m, FLOAT *a, BLASLONG lda, BLASLONG jj,
                                 int shift, FLOAT *b)
{
  const BLASLONG width = (BLASLONG)1 << shift;
  BLASLONG ii = 0;
  BLASLONG blocks, rows;
  FLOAT *col = a;

  for (blocks = (m >> shift); blocks > 0; blocks--) {
    trsm_ln_pack_block(col, lda, ii, jj, width, width, b);

    col += width;
    b   += width * width;
    ii  += width;
  }

  for (rows = (width >> 1); rows > 0; rows >>= 1) {
    if ((m & rows) != 0) {
      trsm_ln_pack_block(col, lda, ii, jj, width, rows, b);

      col += rows;
      b   += rows * width;
      ii  += rows;
    }
  }

  return b;
}

/*
 * Packs the part of a on and below the position given by offset into b.
 * The n columns are consumed in panels of four, then two, then one; a moves
 * on by the panel width in columns and the diagonal position by the same
 * amount after every panel.  Diagonal entries are stored through INV().
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){

  BLASLONG j;
  BLASLONG jj = offset;

  for (j = (n >> 2); j > 0; j--) {
    b = trsm_ln_pack_panel(m, a, lda, jj, 2, b);
    a  += 4 * lda;
    jj += 4;
  }

  if (n & 2) {
    b = trsm_ln_pack_panel(m, a, lda, jj, 1, b);
    a  += 2 * lda;
    jj += 2;
  }

  if (n & 1) {
    (void)trsm_ln_pack_panel(m, a, lda, jj, 0, b);
  }

  return 0;
}

+ 346
- 0
kernel/generic/trsm_ltcopy_6.c View File

@@ -0,0 +1,346 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#include <stdio.h>
#include "common.h"

#ifndef UNIT
#define INV(a) (ONE / (a))
#else
#define INV(a) (ONE)
#endif

/*
 * Copies part of the matrix at `a` into the contiguous buffer `b`.
 *
 * Source addressing: element (row, column) is read from a[row * lda + column],
 * i.e. consecutive rows are `lda` elements apart and the columns of one row
 * are adjacent in memory.
 *
 * The n columns are consumed in panels of 4, then one panel of 2 (if n & 2),
 * then one panel of 1 (if n & 1).  Inside a panel of width W the m rows are
 * consumed in chunks of W rows, followed by progressively halved leftover
 * chunks selected by the low bits of m.  `col` is the position of the panel's
 * first column (it starts at `offset`), `row` is the first row of the chunk:
 *
 *   row == col : row r of the chunk stores INV(diagonal) at b[r * W + r] and
 *                the elements to the right of the diagonal after it; the
 *                slots to the left of the diagonal are not written.
 *   row <  col : the chunk is copied in full, one source row per W outputs.
 *   row >  col : nothing is written.
 *
 * In every case b advances by (chunk rows * W).  INV() is the reciprocal
 * unless UNIT is defined, in which case it yields ONE without reading the
 * diagonal element.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){

  BLASLONG shift, width, height;
  BLASLONG panels, chunks;
  BLASLONG row, col, r, c, right_end;
  FLOAT *src;

  col = offset;

  /* Panel widths 4, 2, 1 correspond to shift 2, 1, 0. */
  for (shift = 2; shift >= 0; shift --) {

    width  = (BLASLONG)1 << shift;
    panels = (shift == 2) ? (n >> 2) : ((n & width) != 0);

    while (panels > 0) {

      row = 0;

      /* Full-height chunks first, then one leftover chunk per set low bit of m. */
      for (height = width; height > 0; height >>= 1) {

        chunks = (height == width) ? (m >> shift) : ((m & height) != 0);

        while (chunks > 0) {

          src = a + row * lda;

          if (row == col) {

            /* A single leftover row of a 2-wide panel stores only its
               diagonal entry; every other shape also stores what lies to
               the right of the diagonal. */
            right_end = (width == 2 && height == 1) ? 0 : width;

            for (r = 0; r < height; r ++) {
              b[r * width + r] = INV(src[r * lda + r]);
              for (c = r + 1; c < right_end; c ++) {
                b[r * width + c] = src[r * lda + c];
              }
            }

          } else if (row < col) {

            for (r = 0; r < height; r ++) {
              for (c = 0; c < width; c ++) {
                b[r * width + c] = src[r * lda + c];
              }
            }
          }

          /* The output cursor moves even when the chunk was skipped. */
          b   += height * width;
          row += height;
          chunks --;
        }
      }

      a   += width;
      col += width;
      panels --;
    }
  }

  return 0;
}

+ 350
- 0
kernel/generic/trsm_uncopy_6.c View File

@@ -0,0 +1,350 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#include <stdio.h>
#include "common.h"

#ifndef UNIT
#define INV(a) (ONE / (a))
#else
#define INV(a) (ONE)
#endif

/*
 * Copies part of the matrix at `a` into the contiguous buffer `b`.
 *
 * Source addressing: element (row, column) is read from a[row + column * lda],
 * i.e. consecutive columns are `lda` elements apart and the rows of one
 * column are adjacent in memory.
 *
 * The n columns are consumed in panels of 4, then one panel of 2 (if n & 2),
 * then one panel of 1 (if n & 1).  Inside a panel of width W the m rows are
 * consumed in chunks of W rows, followed by progressively halved leftover
 * chunks selected by the low bits of m.  `col` is the position of the panel's
 * first column (it starts at `offset`), `row` is the first row of the chunk:
 *
 *   row == col : row r of the chunk stores INV(diagonal) at b[r * W + r] and
 *                the elements to the right of the diagonal after it; the
 *                slots to the left of the diagonal are not written.
 *   row <  col : the chunk is copied in full, one source row per W outputs
 *                (b[r * W + c] = element (row + r, c) of the panel).
 *   row >  col : nothing is written.
 *
 * In every case b advances by (chunk rows * W).  INV() is the reciprocal
 * unless UNIT is defined, in which case it yields ONE without reading the
 * diagonal element.  Always returns 0.
 *
 * All source addresses are derived from the single `row` cursor, so every
 * column of the panel is always read at the current row, and every chunk
 * height (including the 2-row and 1-row leftovers of a 4-wide panel) uses
 * the same row-by-row output layout.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){

  BLASLONG shift, width, height;
  BLASLONG panels, chunks;
  BLASLONG row, col, r, c;
  FLOAT *src;

  col = offset;

  /* Panel widths 4, 2, 1 correspond to shift 2, 1, 0. */
  for (shift = 2; shift >= 0; shift --) {

    width  = (BLASLONG)1 << shift;
    panels = (shift == 2) ? (n >> 2) : ((n & width) != 0);

    while (panels > 0) {

      row = 0;

      /* Full-height chunks first, then one leftover chunk per set low bit of m. */
      for (height = width; height > 0; height >>= 1) {

        chunks = (height == width) ? (m >> shift) : ((m & height) != 0);

        while (chunks > 0) {

          /* First column of the panel, at the chunk's first row. */
          src = a + row;

          if (row == col) {

            for (r = 0; r < height; r ++) {
              b[r * width + r] = INV(src[r + r * lda]);
              for (c = r + 1; c < width; c ++) {
                b[r * width + c] = src[r + c * lda];
              }
            }

          } else if (row < col) {

            for (r = 0; r < height; r ++) {
              for (c = 0; c < width; c ++) {
                b[r * width + c] = src[r + c * lda];
              }
            }
          }

          /* The output cursor moves even when the chunk was skipped. */
          b   += height * width;
          row += height;
          chunks --;
        }
      }

      a   += width * lda;
      col += width;
      panels --;
    }
  }

  return 0;
}

+ 322
- 0
kernel/generic/trsm_utcopy_6.c View File

@@ -0,0 +1,322 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#include <stdio.h>
#include "common.h"

#ifndef UNIT
#define INV(a) (ONE / (a))
#else
#define INV(a) (ONE)
#endif

/*
 * Copies part of the matrix at `a` into the contiguous buffer `b`.
 *
 * Source addressing: element (row, column) is read from a[row * lda + column],
 * i.e. consecutive rows are `lda` elements apart and the columns of one row
 * are adjacent in memory.
 *
 * The n columns are consumed in panels of 4, then one panel of 2 (if n & 2),
 * then one panel of 1 (if n & 1).  Inside a panel of width W the m rows are
 * consumed in chunks of W rows, followed by progressively halved leftover
 * chunks selected by the low bits of m.  `col` is the position of the panel's
 * first column (it starts at `offset`), `row` is the first row of the chunk:
 *
 *   row == col : row r of the chunk stores the elements to the left of the
 *                diagonal followed by INV(diagonal) at b[r * W + r]; the
 *                slots to the right of the diagonal are not written.
 *   row >  col : the chunk is copied in full, one source row per W outputs.
 *   row <  col : nothing is written.
 *
 * In every case b advances by (chunk rows * W).  INV() is the reciprocal
 * unless UNIT is defined, in which case it yields ONE without reading the
 * diagonal element.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){

  BLASLONG shift, width, height;
  BLASLONG panels, chunks;
  BLASLONG row, col, r, c;
  FLOAT *src;

  col = offset;

  /* Panel widths 4, 2, 1 correspond to shift 2, 1, 0. */
  for (shift = 2; shift >= 0; shift --) {

    width  = (BLASLONG)1 << shift;
    panels = (shift == 2) ? (n >> 2) : ((n & width) != 0);

    while (panels > 0) {

      row = 0;

      /* Full-height chunks first, then one leftover chunk per set low bit of m. */
      for (height = width; height > 0; height >>= 1) {

        chunks = (height == width) ? (m >> shift) : ((m & height) != 0);

        while (chunks > 0) {

          src = a + row * lda;

          if (row == col) {

            for (r = 0; r < height; r ++) {
              for (c = 0; c < r; c ++) {
                b[r * width + c] = src[r * lda + c];
              }
              b[r * width + r] = INV(src[r * lda + r]);
            }

          } else if (row > col) {

            for (r = 0; r < height; r ++) {
              for (c = 0; c < width; c ++) {
                b[r * width + c] = src[r * lda + c];
              }
            }
          }

          /* The output cursor moves even when the chunk was skipped. */
          b   += height * width;
          row += height;
          chunks --;
        }
      }

      a   += width;
      col += width;
      panels --;
    }
  }

  return 0;
}

+ 9
- 6
kernel/x86_64/KERNEL.PILEDRIVER View File

@@ -7,7 +7,7 @@ DAXPYKERNEL = daxpy_bulldozer.S
DDOTKERNEL = ddot_bulldozer.S
DCOPYKERNEL = dcopy_bulldozer.S

SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S
SGEMMKERNEL = sgemm_kernel_16x2_piledriver.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
SGEMMONCOPY = gemm_ncopy_2_bulldozer.S
@@ -16,7 +16,8 @@ SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S

DGEMMKERNEL = dgemm_kernel_8x2_piledriver.S
DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S
DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S
DGEMMONCOPY = gemm_ncopy_2_bulldozer.S
@@ -25,7 +26,8 @@ DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S

CGEMMKERNEL = cgemm_kernel_4x2_piledriver.S
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
@@ -34,7 +36,7 @@ CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S
ZGEMMKERNEL = zgemm_kernel_2x2_piledriver.S
ZGEMMINCOPY =
ZGEMMITCOPY =
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
@@ -52,9 +54,10 @@ STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c


DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S
DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c


+ 1920
- 0
kernel/x86_64/cgemm_kernel_4x2_piledriver.S
File diff suppressed because it is too large
View File


+ 1734
- 0
kernel/x86_64/dgemm_kernel_6x4_piledriver.S
File diff suppressed because it is too large
View File


+ 4523
- 0
kernel/x86_64/dgemm_kernel_8x2_piledriver.S
File diff suppressed because it is too large
View File


+ 5258
- 0
kernel/x86_64/sgemm_kernel_16x2_piledriver.S
File diff suppressed because it is too large
View File


+ 1428
- 0
kernel/x86_64/zgemm_kernel_2x2_piledriver.S
File diff suppressed because it is too large
View File


Loading…
Cancel
Save