| @@ -262,6 +262,7 @@ endif | |||||
| lapack-test : | lapack-test : | ||||
| (cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out) | |||||
| make -j 1 -C $(NETLIB_LAPACK_DIR) tmglib | make -j 1 -C $(NETLIB_LAPACK_DIR) tmglib | ||||
| make -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc | make -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc | ||||
| (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r ) | (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r ) | ||||
| @@ -291,4 +292,6 @@ endif | |||||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) clean | @$(MAKE) -C $(NETLIB_LAPACK_DIR) clean | ||||
| @rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h | @rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h | ||||
| @rm -f *.grd Makefile.conf_last config_last.h | @rm -f *.grd Makefile.conf_last config_last.h | ||||
| @(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt) | |||||
| @rm -f $(NETLIB_LAPACK_DIR)/tmglib.a | |||||
| @echo Done. | @echo Done. | ||||
| @@ -350,7 +350,7 @@ XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS) | |||||
| SLAPACKOBJS = \ | SLAPACKOBJS = \ | ||||
| sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \ | sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \ | ||||
| spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) slauu2.$(SUFFIX) \ | spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) slauu2.$(SUFFIX) \ | ||||
| slauum.$(SUFFIX) strti2.$(SUFFIX) | |||||
| slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) | |||||
| #DLAPACKOBJS = \ | #DLAPACKOBJS = \ | ||||
| @@ -361,7 +361,7 @@ SLAPACKOBJS = \ | |||||
| DLAPACKOBJS = \ | DLAPACKOBJS = \ | ||||
| dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \ | dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \ | ||||
| dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) dlauu2.$(SUFFIX) \ | dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) dlauu2.$(SUFFIX) \ | ||||
| dlauum.$(SUFFIX) dtrti2.$(SUFFIX) | |||||
| dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) | |||||
| QLAPACKOBJS = \ | QLAPACKOBJS = \ | ||||
| @@ -377,7 +377,7 @@ QLAPACKOBJS = \ | |||||
| CLAPACKOBJS = \ | CLAPACKOBJS = \ | ||||
| cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \ | cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \ | ||||
| cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \ | cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \ | ||||
| clauum.$(SUFFIX) ctrti2.$(SUFFIX) | |||||
| clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) | |||||
| #ZLAPACKOBJS = \ | #ZLAPACKOBJS = \ | ||||
| @@ -388,7 +388,7 @@ CLAPACKOBJS = \ | |||||
| ZLAPACKOBJS = \ | ZLAPACKOBJS = \ | ||||
| zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ | zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ | ||||
| zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \ | zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \ | ||||
| zlauum.$(SUFFIX) ztrti2.$(SUFFIX) | |||||
| zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) | |||||
| @@ -1883,19 +1883,19 @@ ztrti2.$(SUFFIX) ztrti2.$(PSUFFIX) : lapack/ztrti2.c | |||||
| xtrti2.$(SUFFIX) xtrti2.$(PSUFFIX) : ztrti2.c | xtrti2.$(SUFFIX) xtrti2.$(PSUFFIX) : ztrti2.c | ||||
| $(CC) -c $(CFLAGS) $< -o $(@F) | $(CC) -c $(CFLAGS) $< -o $(@F) | ||||
| strtri.$(SUFFIX) strtri.$(PSUFFIX) : trtri.c | |||||
| strtri.$(SUFFIX) strtri.$(PSUFFIX) : lapack/trtri.c | |||||
| $(CC) -c $(CFLAGS) $< -o $(@F) | $(CC) -c $(CFLAGS) $< -o $(@F) | ||||
| dtrtri.$(SUFFIX) dtrtri.$(PSUFFIX) : trtri.c | |||||
| dtrtri.$(SUFFIX) dtrtri.$(PSUFFIX) : lapack/trtri.c | |||||
| $(CC) -c $(CFLAGS) $< -o $(@F) | $(CC) -c $(CFLAGS) $< -o $(@F) | ||||
| qtrtri.$(SUFFIX) qtrtri.$(PSUFFIX) : trtri.c | qtrtri.$(SUFFIX) qtrtri.$(PSUFFIX) : trtri.c | ||||
| $(CC) -c $(CFLAGS) $< -o $(@F) | $(CC) -c $(CFLAGS) $< -o $(@F) | ||||
| ctrtri.$(SUFFIX) ctrtri.$(PSUFFIX) : ztrtri.c | |||||
| ctrtri.$(SUFFIX) ctrtri.$(PSUFFIX) : lapack/ztrtri.c | |||||
| $(CC) -c $(CFLAGS) $< -o $(@F) | $(CC) -c $(CFLAGS) $< -o $(@F) | ||||
| ztrtri.$(SUFFIX) ztrtri.$(PSUFFIX) : ztrtri.c | |||||
| ztrtri.$(SUFFIX) ztrtri.$(PSUFFIX) : lapack/ztrtri.c | |||||
| $(CC) -c $(CFLAGS) $< -o $(@F) | $(CC) -c $(CFLAGS) $< -o $(@F) | ||||
| xtrtri.$(SUFFIX) xtrtri.$(PSUFFIX) : ztrtri.c | xtrtri.$(SUFFIX) xtrtri.$(PSUFFIX) : ztrtri.c | ||||
| @@ -147,7 +147,7 @@ SLASRC = \ | |||||
| stgsja.o stgsna.o stgsy2.o stgsyl.o stpcon.o stprfs.o stptri.o \ | stgsja.o stgsna.o stgsy2.o stgsyl.o stpcon.o stprfs.o stptri.o \ | ||||
| stptrs.o \ | stptrs.o \ | ||||
| strcon.o strevc.o strexc.o strrfs.o strsen.o strsna.o strsyl.o \ | strcon.o strevc.o strexc.o strrfs.o strsen.o strsna.o strsyl.o \ | ||||
| strtri.o strtrs.o stzrqf.o stzrzf.o sstemr.o \ | |||||
| strtrs.o stzrqf.o stzrzf.o sstemr.o \ | |||||
| slansf.o spftrf.o spftri.o spftrs.o ssfrk.o stfsm.o stftri.o stfttp.o \ | slansf.o spftrf.o spftri.o spftrs.o ssfrk.o stfsm.o stftri.o stfttp.o \ | ||||
| stfttr.o stpttf.o stpttr.o strttf.o strttp.o \ | stfttr.o stpttf.o stpttr.o strttf.o strttp.o \ | ||||
| sgejsv.o sgesvj.o sgsvj0.o sgsvj1.o \ | sgejsv.o sgesvj.o sgsvj0.o sgsvj1.o \ | ||||
| @@ -225,7 +225,7 @@ CLASRC = \ | |||||
| ctgexc.o ctgsen.o ctgsja.o ctgsna.o ctgsy2.o ctgsyl.o ctpcon.o \ | ctgexc.o ctgsen.o ctgsja.o ctgsna.o ctgsy2.o ctgsyl.o ctpcon.o \ | ||||
| ctprfs.o ctptri.o \ | ctprfs.o ctptri.o \ | ||||
| ctptrs.o ctrcon.o ctrevc.o ctrexc.o ctrrfs.o ctrsen.o ctrsna.o \ | ctptrs.o ctrcon.o ctrevc.o ctrexc.o ctrrfs.o ctrsen.o ctrsna.o \ | ||||
| ctrsyl.o ctrtri.o ctrtrs.o ctzrqf.o ctzrzf.o cung2l.o cung2r.o \ | |||||
| ctrsyl.o ctrtrs.o ctzrqf.o ctzrzf.o cung2l.o cung2r.o \ | |||||
| cungbr.o cunghr.o cungl2.o cunglq.o cungql.o cungqr.o cungr2.o \ | cungbr.o cunghr.o cungl2.o cunglq.o cungql.o cungqr.o cungr2.o \ | ||||
| cungrq.o cungtr.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o \ | cungrq.o cungtr.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o \ | ||||
| cunmlq.o cunmql.o cunmqr.o cunmr2.o cunmr3.o cunmrq.o cunmrz.o \ | cunmlq.o cunmql.o cunmqr.o cunmr2.o cunmr3.o cunmrq.o cunmrz.o \ | ||||
| @@ -307,7 +307,7 @@ DLASRC = \ | |||||
| dtgsja.o dtgsna.o dtgsy2.o dtgsyl.o dtpcon.o dtprfs.o dtptri.o \ | dtgsja.o dtgsna.o dtgsy2.o dtgsyl.o dtpcon.o dtprfs.o dtptri.o \ | ||||
| dtptrs.o \ | dtptrs.o \ | ||||
| dtrcon.o dtrevc.o dtrexc.o dtrrfs.o dtrsen.o dtrsna.o dtrsyl.o \ | dtrcon.o dtrevc.o dtrexc.o dtrrfs.o dtrsen.o dtrsna.o dtrsyl.o \ | ||||
| dtrtri.o dtrtrs.o dtzrqf.o dtzrzf.o dstemr.o \ | |||||
| dtrtrs.o dtzrqf.o dtzrzf.o dstemr.o \ | |||||
| dsgesv.o dsposv.o dlag2s.o slag2d.o dlat2s.o \ | dsgesv.o dsposv.o dlag2s.o slag2d.o dlat2s.o \ | ||||
| dlansf.o dpftrf.o dpftri.o dpftrs.o dsfrk.o dtfsm.o dtftri.o dtfttp.o \ | dlansf.o dpftrf.o dpftri.o dpftrs.o dsfrk.o dtfsm.o dtftri.o dtfttp.o \ | ||||
| dtfttr.o dtpttf.o dtpttr.o dtrttf.o dtrttp.o \ | dtfttr.o dtpttf.o dtpttr.o dtrttf.o dtrttp.o \ | ||||
| @@ -387,7 +387,7 @@ ZLASRC = \ | |||||
| ztgexc.o ztgsen.o ztgsja.o ztgsna.o ztgsy2.o ztgsyl.o ztpcon.o \ | ztgexc.o ztgsen.o ztgsja.o ztgsna.o ztgsy2.o ztgsyl.o ztpcon.o \ | ||||
| ztprfs.o ztptri.o \ | ztprfs.o ztptri.o \ | ||||
| ztptrs.o ztrcon.o ztrevc.o ztrexc.o ztrrfs.o ztrsen.o ztrsna.o \ | ztptrs.o ztrcon.o ztrevc.o ztrexc.o ztrrfs.o ztrsen.o ztrsna.o \ | ||||
| ztrsyl.o ztrtri.o ztrtrs.o ztzrqf.o ztzrzf.o zung2l.o \ | |||||
| ztrsyl.o ztrtrs.o ztzrqf.o ztzrzf.o zung2l.o \ | |||||
| zung2r.o zungbr.o zunghr.o zungl2.o zunglq.o zungql.o zungqr.o zungr2.o \ | zung2r.o zungbr.o zunghr.o zungl2.o zunglq.o zungql.o zungqr.o zungr2.o \ | ||||
| zungrq.o zungtr.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o \ | zungrq.o zungtr.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o \ | ||||
| zunmlq.o zunmql.o zunmqr.o zunmr2.o zunmr3.o zunmrq.o zunmrz.o \ | zunmlq.o zunmql.o zunmqr.o zunmr2.o zunmr3.o zunmrq.o zunmrz.o \ | ||||
| @@ -2,7 +2,7 @@ TOPDIR = .. | |||||
| include ../Makefile.system | include ../Makefile.system | ||||
| #SUBDIRS = laswp getf2 getrf potf2 potrf lauu2 lauum trti2 trtri getrs | #SUBDIRS = laswp getf2 getrf potf2 potrf lauu2 lauum trti2 trtri getrs | ||||
| SUBDIRS = getrf getf2 laswp getrs potrf potf2 lauu2 lauum trti2 | |||||
| SUBDIRS = getrf getf2 laswp getrs potrf potf2 lauu2 lauum trti2 trtri | |||||
| FLAMEDIRS = laswp getf2 potf2 lauu2 trti2 | FLAMEDIRS = laswp getf2 potf2 lauu2 trti2 | ||||
| @@ -1,190 +1,113 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| /*************************************************************************** | |||||
| * Copyright (c) 2013, The OpenBLAS Project | |||||
| * All rights reserved. | |||||
| * Redistribution and use in source and binary forms, with or without | |||||
| * modification, are permitted provided that the following conditions are | |||||
| * met: | |||||
| * 1. Redistributions of source code must retain the above copyright | |||||
| * notice, this list of conditions and the following disclaimer. | |||||
| * 2. Redistributions in binary form must reproduce the above copyright | |||||
| * notice, this list of conditions and the following disclaimer in | |||||
| * the documentation and/or other materials provided with the | |||||
| * distribution. | |||||
| * 3. Neither the name of the OpenBLAS project nor the names of | |||||
| * its contributors may be used to endorse or promote products | |||||
| * derived from this software without specific prior written permission. | |||||
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| * *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2014/05/22 Saar | |||||
| * TEST double precision unblocked : OK | |||||
| * 2014/05/23 Saar | |||||
| * TEST double precision blocked: OK | |||||
| * TEST single precision blocked: OK | |||||
| **************************************************************************************/ | |||||
| #include <stdio.h> | #include <stdio.h> | ||||
| #include "common.h" | #include "common.h" | ||||
| static FLOAT dp1 = 1.; | |||||
| static FLOAT dm1 = -1.; | |||||
| // static FLOAT dp1 = 1.; | |||||
| // static FLOAT dm1 = -1.; | |||||
| #ifdef UNIT | #ifdef UNIT | ||||
| #define TRTI2 TRTI2_LU | |||||
| #define TRTI2 TRTI2_LU | |||||
| #define TRMM TRMM_LNLU | |||||
| #define TRSM TRSM_RNLU | |||||
| #else | #else | ||||
| #define TRTI2 TRTI2_LN | |||||
| #endif | |||||
| #if 0 | |||||
| #undef GEMM_P | |||||
| #undef GEMM_Q | |||||
| #undef GEMM_R | |||||
| #define GEMM_P 8 | |||||
| #define GEMM_Q 20 | |||||
| #define GEMM_R 64 | |||||
| #define TRTI2 TRTI2_LN | |||||
| #define TRMM TRMM_LNLN | |||||
| #define TRSM TRSM_RNLN | |||||
| #endif | #endif | ||||
| #define GEMM_PQ MAX(GEMM_P, GEMM_Q) | |||||
| #define REAL_GEMM_R (GEMM_R - 2 * GEMM_PQ) | |||||
| blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { | blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { | ||||
| BLASLONG n, lda; | |||||
| BLASLONG j, n, lda; | |||||
| FLOAT *a; | FLOAT *a; | ||||
| BLASLONG i, is, min_i, start_i; | |||||
| BLASLONG ls, min_l; | |||||
| BLASLONG bk; | |||||
| BLASLONG blocking; | |||||
| BLASLONG range_N[2]; | |||||
| // BLASLONG info=0; | |||||
| BLASLONG jb; | |||||
| BLASLONG NB; | |||||
| BLASLONG start_j; | |||||
| FLOAT *sa_trsm = (FLOAT *)((BLASLONG)sb); | |||||
| FLOAT *sa_trmm = (FLOAT *)((((BLASLONG)sb | |||||
| + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) | |||||
| + GEMM_OFFSET_A); | |||||
| FLOAT *sb_gemm = (FLOAT *)((((BLASLONG)sa_trmm | |||||
| + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) | |||||
| + GEMM_OFFSET_B); | |||||
| FLOAT beta_plus[2] = { ONE, ZERO}; | |||||
| FLOAT beta_minus[2] = {-ONE, ZERO}; | |||||
| n = args -> n; | n = args -> n; | ||||
| a = (FLOAT *)args -> a; | |||||
| lda = args -> lda; | |||||
| if (range_n) { | |||||
| n = range_n[1] - range_n[0]; | |||||
| a += range_n[0] * (lda + 1) * COMPSIZE; | |||||
| } | |||||
| NB = GEMM_Q; | |||||
| if (n <= DTB_ENTRIES) { | |||||
| if (n < NB) { | |||||
| TRTI2(args, NULL, range_n, sa, sb, 0); | TRTI2(args, NULL, range_n, sa, sb, 0); | ||||
| return 0; | return 0; | ||||
| } | } | ||||
| blocking = GEMM_Q; | |||||
| if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4; | |||||
| start_i = 0; | |||||
| while (start_i < n) start_i += blocking; | |||||
| start_i -= blocking; | |||||
| for (i = start_i; i >= 0; i -= blocking) { | |||||
| bk = MIN(blocking, n - i); | |||||
| if (n - bk - i > 0) TRSM_OLNCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, sa_trsm); | |||||
| if (!range_n) { | |||||
| range_N[0] = i; | |||||
| range_N[1] = i + bk; | |||||
| } else { | |||||
| range_N[0] = range_n[0] + i; | |||||
| range_N[1] = range_n[0] + i + bk; | |||||
| } | |||||
| CNAME(args, NULL, range_N, sa, sa_trmm, 0); | |||||
| if (i > 0) { | |||||
| TRMM_ILTCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, 0, sa_trmm); | |||||
| for (ls = 0; ls < i; ls += REAL_GEMM_R) { | |||||
| min_l = i - ls; | |||||
| if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R; | |||||
| GEMM_ONCOPY (bk, min_l, a + (i + ls * lda) * COMPSIZE, lda, sb_gemm); | |||||
| if (n - bk - i > 0) { | |||||
| for (is = i + bk; is < n; is += GEMM_P) { | |||||
| min_i = n - is; | |||||
| if (min_i > GEMM_P) min_i = GEMM_P; | |||||
| if (ls == 0) { | |||||
| NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); | |||||
| TRSM_KERNEL_RT(min_i, bk, bk, dm1, | |||||
| #ifdef COMPLEX | |||||
| ZERO, | |||||
| #endif | |||||
| sa, sa_trsm, | |||||
| a + (is + i * lda) * COMPSIZE, lda, 0); | |||||
| } else { | |||||
| GEMM_ITCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); | |||||
| } | |||||
| GEMM_KERNEL_N(min_i, min_l, bk, dp1, | |||||
| #ifdef COMPLEX | |||||
| ZERO, | |||||
| #endif | |||||
| sa, sb_gemm, | |||||
| a + (is + ls * lda) * COMPSIZE, lda); | |||||
| } | |||||
| } | |||||
| for (is = 0; is < bk; is += GEMM_P) { | |||||
| min_i = bk - is; | |||||
| if (min_i > GEMM_P) min_i = GEMM_P; | |||||
| TRMM_KERNEL_LT(min_i, min_l, bk, dp1, | |||||
| #ifdef COMPLEX | |||||
| ZERO, | |||||
| #endif | |||||
| sa_trmm + is * bk * COMPSIZE, sb_gemm, | |||||
| a + (i + is + ls * lda) * COMPSIZE, lda, is); | |||||
| } | |||||
| } | |||||
| } else { | |||||
| if (n - bk - i > 0) { | |||||
| for (is = 0; is < n - bk - i; is += GEMM_P) { | |||||
| min_i = n - bk - i - is; | |||||
| if (min_i > GEMM_P) min_i = GEMM_P; | |||||
| NEG_TCOPY (bk, min_i, a + (i + bk + is + i * lda) * COMPSIZE, lda, sa); | |||||
| TRSM_KERNEL_RT(min_i, bk, bk, dm1, | |||||
| #ifdef COMPLEX | |||||
| ZERO, | |||||
| #endif | |||||
| sa, sa_trsm, | |||||
| a + (i + bk + is + i * lda) * COMPSIZE, lda, 0); | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| lda = args -> lda; | |||||
| a = (FLOAT *) args -> a; | |||||
| args -> ldb = lda; | |||||
| args -> ldc = lda; | |||||
| args -> alpha = NULL; | |||||
| start_j = 0; | |||||
| while (start_j < n) start_j += NB; | |||||
| start_j -= NB; | |||||
| for (j = start_j ; j >=0 ; j-= NB) | |||||
| { | |||||
| jb = n - j; | |||||
| if ( jb > NB ) jb = NB; | |||||
| args -> n = jb; | |||||
| args -> m = n-j-jb; | |||||
| args -> a = &a[(j+jb+(j+jb)*lda) * COMPSIZE]; | |||||
| args -> b = &a[(j+jb+j*lda) * COMPSIZE]; | |||||
| args -> beta = beta_plus; | |||||
| TRMM(args, NULL, NULL, sa, sb, 0); | |||||
| args -> a = &a[(j+j*lda) * COMPSIZE]; | |||||
| args -> beta = beta_minus; | |||||
| TRSM(args, NULL, NULL, sa, sb, 0); | |||||
| args -> a = &a[(j+j*lda) * COMPSIZE]; | |||||
| TRTI2(args, NULL, range_n, sa, sb, 0); | |||||
| } | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| @@ -1,46 +1,44 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| /*************************************************************************** | |||||
| * Copyright (c) 2013, The OpenBLAS Project | |||||
| * All rights reserved. | |||||
| * Redistribution and use in source and binary forms, with or without | |||||
| * modification, are permitted provided that the following conditions are | |||||
| * met: | |||||
| * 1. Redistributions of source code must retain the above copyright | |||||
| * notice, this list of conditions and the following disclaimer. | |||||
| * 2. Redistributions in binary form must reproduce the above copyright | |||||
| * notice, this list of conditions and the following disclaimer in | |||||
| * the documentation and/or other materials provided with the | |||||
| * distribution. | |||||
| * 3. Neither the name of the OpenBLAS project nor the names of | |||||
| * its contributors may be used to endorse or promote products | |||||
| * derived from this software without specific prior written permission. | |||||
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| * *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2014/05/22 Saar | |||||
| * TEST double precision unblocked : OK | |||||
| * TEST double precision blocked : OK | |||||
| * 2014/05/23 | |||||
| * TEST single precision blocked : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #include <stdio.h> | #include <stdio.h> | ||||
| #include "common.h" | #include "common.h" | ||||
| static FLOAT dp1 = 1.; | |||||
| static FLOAT dm1 = -1.; | |||||
| // static FLOAT dp1 = 1.; | |||||
| // static FLOAT dm1 = -1.; | |||||
| #ifdef UNIT | #ifdef UNIT | ||||
| #define TRTI2 TRTI2_UU | #define TRTI2 TRTI2_UU | ||||
| @@ -48,152 +46,66 @@ static FLOAT dm1 = -1.; | |||||
| #define TRTI2 TRTI2_UN | #define TRTI2 TRTI2_UN | ||||
| #endif | #endif | ||||
| #if 0 | |||||
| #undef GEMM_P | |||||
| #undef GEMM_Q | |||||
| #undef GEMM_R | |||||
| #define GEMM_P 8 | |||||
| #define GEMM_Q 20 | |||||
| #define GEMM_R 64 | |||||
| #ifdef UNIT | |||||
| #define TRMM TRMM_LNUU | |||||
| #define TRSM TRSM_RNUU | |||||
| #else | |||||
| #define TRMM TRMM_LNUN | |||||
| #define TRSM TRSM_RNUN | |||||
| #endif | #endif | ||||
| #define GEMM_PQ MAX(GEMM_P, GEMM_Q) | |||||
| #define REAL_GEMM_R (GEMM_R - 2 * GEMM_PQ) | |||||
| blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { | blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { | ||||
| BLASLONG n, lda; | |||||
| BLASLONG j, n, lda; | |||||
| FLOAT *a; | FLOAT *a; | ||||
| BLASLONG i, is, min_i, start_is; | |||||
| BLASLONG ls, min_l; | |||||
| BLASLONG bk; | |||||
| BLASLONG blocking; | |||||
| BLASLONG range_N[2]; | |||||
| // BLASLONG info=0; | |||||
| BLASLONG jb; | |||||
| BLASLONG NB; | |||||
| FLOAT *sa_trsm = (FLOAT *)((BLASLONG)sb); | |||||
| FLOAT *sa_trmm = (FLOAT *)((((BLASLONG)sb | |||||
| + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) | |||||
| + GEMM_OFFSET_A); | |||||
| FLOAT *sb_gemm = (FLOAT *)((((BLASLONG)sa_trmm | |||||
| + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) | |||||
| + GEMM_OFFSET_B); | |||||
| FLOAT beta_plus[2] = { ONE, ZERO}; | |||||
| FLOAT beta_minus[2] = {-ONE, ZERO}; | |||||
| n = args -> n; | n = args -> n; | ||||
| a = (FLOAT *)args -> a; | |||||
| lda = args -> lda; | |||||
| if (range_n) { | |||||
| n = range_n[1] - range_n[0]; | |||||
| a += range_n[0] * (lda + 1) * COMPSIZE; | |||||
| } | |||||
| NB = GEMM_Q; | |||||
| if (n <= DTB_ENTRIES) { | |||||
| if (n <= NB) { | |||||
| TRTI2(args, NULL, range_n, sa, sb, 0); | TRTI2(args, NULL, range_n, sa, sb, 0); | ||||
| return 0; | return 0; | ||||
| } | } | ||||
| blocking = GEMM_Q; | |||||
| if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4; | |||||
| for (i = 0; i < n; i += blocking) { | |||||
| bk = MIN(blocking, n - i); | |||||
| if (i > 0) TRSM_OUNCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, sa_trsm); | |||||
| if (!range_n) { | |||||
| range_N[0] = i; | |||||
| range_N[1] = i + bk; | |||||
| } else { | |||||
| range_N[0] = range_n[0] + i; | |||||
| range_N[1] = range_n[0] + i + bk; | |||||
| } | |||||
| CNAME(args, NULL, range_N, sa, sa_trmm, 0); | |||||
| if (n -bk - i > 0) { | |||||
| TRMM_IUTCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, 0, sa_trmm); | |||||
| for (ls = i + bk; ls < n; ls += REAL_GEMM_R) { | |||||
| min_l = n - ls; | |||||
| if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R; | |||||
| GEMM_ONCOPY (bk, min_l, a + (i + ls * lda) * COMPSIZE, lda, sb_gemm); | |||||
| if (i > 0) { | |||||
| for (is = 0; is < i; is += GEMM_P) { | |||||
| min_i = i - is; | |||||
| if (min_i > GEMM_P) min_i = GEMM_P; | |||||
| if (ls == i + bk) { | |||||
| //NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); | |||||
| GEMM_BETA(min_i, bk, 0, dm1, | |||||
| #ifdef COMPLEX | |||||
| ZERO, | |||||
| #endif | |||||
| NULL, 0, NULL, 0, a + (is + i * lda) * COMPSIZE, lda); | |||||
| TRSM_KERNEL_RN(min_i, bk, bk, dm1, | |||||
| #ifdef COMPLEX | |||||
| ZERO, | |||||
| #endif | |||||
| sa, sa_trsm, | |||||
| a + (is + i * lda) * COMPSIZE, lda, 0); | |||||
| } else { | |||||
| GEMM_ITCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); | |||||
| } | |||||
| GEMM_KERNEL_N(min_i, min_l, bk, dp1, | |||||
| #ifdef COMPLEX | |||||
| ZERO, | |||||
| #endif | |||||
| sa, sb_gemm, | |||||
| a + (is + ls * lda) * COMPSIZE, lda); | |||||
| } | |||||
| } | |||||
| start_is = 0; | |||||
| while (start_is < bk) start_is += GEMM_P; | |||||
| start_is -= GEMM_P; | |||||
| for (is = 0; is < bk; is += GEMM_P) { | |||||
| min_i = bk - is; | |||||
| if (min_i > GEMM_P) min_i = GEMM_P; | |||||
| TRMM_KERNEL_LN(min_i, min_l, bk, dp1, | |||||
| #ifdef COMPLEX | |||||
| ZERO, | |||||
| #endif | |||||
| sa_trmm + is * bk * COMPSIZE, sb_gemm, | |||||
| a + (i + is + ls * lda) * COMPSIZE, lda, is); | |||||
| } | |||||
| } | |||||
| } else { | |||||
| if (i > 0) { | |||||
| for (is = 0; is < i; is += GEMM_P) { | |||||
| min_i = i - is; | |||||
| if (min_i > GEMM_P) min_i = GEMM_P; | |||||
| //NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); | |||||
| GEMM_BETA(min_i, bk, 0, dm1, | |||||
| #ifdef COMPLEX | |||||
| ZERO, | |||||
| #endif | |||||
| NULL, 0, NULL, 0, a + (is + i * lda) * COMPSIZE, lda); | |||||
| lda = args -> lda; | |||||
| a = (FLOAT *) args -> a; | |||||
| args -> ldb = lda; | |||||
| args -> ldc = lda; | |||||
| args -> alpha = NULL; | |||||
| TRSM_KERNEL_RN(min_i, bk, bk, dm1, | |||||
| #ifdef COMPLEX | |||||
| ZERO, | |||||
| #endif | |||||
| sa, sa_trsm, | |||||
| a + (is + i * lda) * COMPSIZE, lda, 0); | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| for (j = 0; j < n; j += NB) | |||||
| { | |||||
| jb = n - j; | |||||
| if ( jb > NB ) jb = NB; | |||||
| args -> n = jb; | |||||
| args -> m = j; | |||||
| args -> a = &a[0]; | |||||
| args -> b = &a[(j*lda) * COMPSIZE]; | |||||
| args -> beta = beta_plus; | |||||
| TRMM(args, NULL, NULL, sa, sb, 0); | |||||
| args -> a = &a[(j+j*lda) * COMPSIZE]; | |||||
| args -> beta = beta_minus; | |||||
| TRSM(args, NULL, NULL, sa, sb, 0); | |||||
| args -> a = &a[(j+j*lda) * COMPSIZE]; | |||||
| TRTI2(args, NULL, range_n, sa, sb, 0); | |||||
| } | |||||
| return 0; | return 0; | ||||
| } | } | ||||