Merged some Lapack optimized functions https://github.com/xianyi/OpenBLAS/wiki/Fixed-optimized-kernels-To-do-Listtags/v0.2.9^2
| @@ -207,6 +207,7 @@ else | |||
| netlib : lapack_prebuild | |||
| ifndef NOFORTRAN | |||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib | |||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib | |||
| endif | |||
| ifndef NO_LAPACKE | |||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib | |||
| @@ -230,11 +231,18 @@ ifndef NOFORTRAN | |||
| -@echo "ARCHFLAGS = -ru" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "TMGLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "BLASLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "LAPACKELIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "CEXTRALIB = $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| ifeq ($(F_COMPILER), GFORTRAN) | |||
| -@echo "TIMER = INT_ETIME" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| else | |||
| -@echo "TIMER = NONE" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| endif | |||
| -@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| endif | |||
| @@ -256,13 +264,12 @@ lapack-timing : large.tgz timing.tgz | |||
| ifndef NOFORTRAN | |||
| (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) | |||
| (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) | |||
| make -C $(NETLIB_LAPACK_DIR) tmglib | |||
| make -C $(NETLIB_LAPACK_DIR)/TIMING | |||
| endif | |||
| lapack-test : | |||
| make -j 1 -C $(NETLIB_LAPACK_DIR) tmglib | |||
| (cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out) | |||
| make -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc | |||
| (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r ) | |||
| @@ -291,4 +298,5 @@ endif | |||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) clean | |||
| @rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h | |||
| @rm -f *.grd Makefile.conf_last config_last.h | |||
| @(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt) | |||
| @echo Done. | |||
| @@ -2667,34 +2667,34 @@ | |||
| ## @(MATGEN_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` | |||
| ## Not exported: requires LAPACKE_TESTING to be set and depends on libtmg | |||
| ## (see `lapack-3.4.1/TESTING/MATGEN`). | |||
| #LAPACKE_clatms, | |||
| #LAPACKE_clatms_work, | |||
| #LAPACKE_dlatms, | |||
| #LAPACKE_dlatms_work, | |||
| #LAPACKE_slatms, | |||
| #LAPACKE_slatms_work, | |||
| #LAPACKE_zlatms, | |||
| #LAPACKE_zlatms_work, | |||
| #LAPACKE_clagge, | |||
| #LAPACKE_clagge_work, | |||
| #LAPACKE_dlagge, | |||
| #LAPACKE_dlagge_work, | |||
| #LAPACKE_slagge, | |||
| #LAPACKE_slagge_work, | |||
| #LAPACKE_zlagge, | |||
| #LAPACKE_zlagge_work, | |||
| #LAPACKE_claghe, | |||
| #LAPACKE_claghe_work, | |||
| #LAPACKE_zlaghe, | |||
| #LAPACKE_zlaghe_work, | |||
| #LAPACKE_clagsy, | |||
| #LAPACKE_clagsy_work, | |||
| #LAPACKE_dlagsy, | |||
| #LAPACKE_dlagsy_work, | |||
| #LAPACKE_slagsy, | |||
| #LAPACKE_slagsy_work, | |||
| #LAPACKE_zlagsy, | |||
| #LAPACKE_zlagsy_work, | |||
| LAPACKE_clatms, | |||
| LAPACKE_clatms_work, | |||
| LAPACKE_dlatms, | |||
| LAPACKE_dlatms_work, | |||
| LAPACKE_slatms, | |||
| LAPACKE_slatms_work, | |||
| LAPACKE_zlatms, | |||
| LAPACKE_zlatms_work, | |||
| LAPACKE_clagge, | |||
| LAPACKE_clagge_work, | |||
| LAPACKE_dlagge, | |||
| LAPACKE_dlagge_work, | |||
| LAPACKE_slagge, | |||
| LAPACKE_slagge_work, | |||
| LAPACKE_zlagge, | |||
| LAPACKE_zlagge_work, | |||
| LAPACKE_claghe, | |||
| LAPACKE_claghe_work, | |||
| LAPACKE_zlaghe, | |||
| LAPACKE_zlaghe_work, | |||
| LAPACKE_clagsy, | |||
| LAPACKE_clagsy_work, | |||
| LAPACKE_dlagsy, | |||
| LAPACKE_dlagsy_work, | |||
| LAPACKE_slagsy, | |||
| LAPACKE_slagsy_work, | |||
| LAPACKE_zlagsy, | |||
| LAPACKE_zlagsy_work, | |||
| ); | |||
| #These function may need 2 underscores. | |||
| @@ -349,7 +349,8 @@ XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS) | |||
| SLAPACKOBJS = \ | |||
| sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \ | |||
| spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) | |||
| spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) slauu2.$(SUFFIX) \ | |||
| slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) spotri.$(SUFFIX) | |||
| #DLAPACKOBJS = \ | |||
| @@ -359,7 +360,8 @@ SLAPACKOBJS = \ | |||
| DLAPACKOBJS = \ | |||
| dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \ | |||
| dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) | |||
| dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) dlauu2.$(SUFFIX) \ | |||
| dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) dpotri.$(SUFFIX) | |||
| QLAPACKOBJS = \ | |||
| @@ -374,7 +376,8 @@ QLAPACKOBJS = \ | |||
| CLAPACKOBJS = \ | |||
| cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \ | |||
| cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) | |||
| cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \ | |||
| clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) cpotri.$(SUFFIX) | |||
| #ZLAPACKOBJS = \ | |||
| @@ -384,7 +387,9 @@ CLAPACKOBJS = \ | |||
| ZLAPACKOBJS = \ | |||
| zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ | |||
| zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) | |||
| zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \ | |||
| zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) zpotri.$(SUFFIX) | |||
| @@ -1788,37 +1793,37 @@ zgetrf.$(SUFFIX) zgetrf.$(PSUFFIX) : lapack/zgetrf.c | |||
| xgetrf.$(SUFFIX) xgetrf.$(PSUFFIX) : zgetrf.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| slauu2.$(SUFFIX) slauu2.$(PSUFFIX) : lauu2.c | |||
| slauu2.$(SUFFIX) slauu2.$(PSUFFIX) : lapack/lauu2.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| dlauu2.$(SUFFIX) dlauu2.$(PSUFFIX) : lauu2.c | |||
| dlauu2.$(SUFFIX) dlauu2.$(PSUFFIX) : lapack/lauu2.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| qlauu2.$(SUFFIX) qlauu2.$(PSUFFIX) : lauu2.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| clauu2.$(SUFFIX) clauu2.$(PSUFFIX) : zlauu2.c | |||
| clauu2.$(SUFFIX) clauu2.$(PSUFFIX) : lapack/zlauu2.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| zlauu2.$(SUFFIX) zlauu2.$(PSUFFIX) : zlauu2.c | |||
| zlauu2.$(SUFFIX) zlauu2.$(PSUFFIX) : lapack/zlauu2.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| xlauu2.$(SUFFIX) xlauu2.$(PSUFFIX) : zlauu2.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| slauum.$(SUFFIX) slauum.$(PSUFFIX) : lauum.c | |||
| slauum.$(SUFFIX) slauum.$(PSUFFIX) : lapack/lauum.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| dlauum.$(SUFFIX) dlauum.$(PSUFFIX) : lauum.c | |||
| dlauum.$(SUFFIX) dlauum.$(PSUFFIX) : lapack/lauum.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| qlauum.$(SUFFIX) qlauum.$(PSUFFIX) : lauum.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| clauum.$(SUFFIX) clauum.$(PSUFFIX) : zlauum.c | |||
| clauum.$(SUFFIX) clauum.$(PSUFFIX) : lapack/zlauum.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| zlauum.$(SUFFIX) zlauum.$(PSUFFIX) : zlauum.c | |||
| zlauum.$(SUFFIX) zlauum.$(PSUFFIX) : lapack/zlauum.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| xlauum.$(SUFFIX) xlauum.$(PSUFFIX) : zlauum.c | |||
| @@ -1860,37 +1865,37 @@ zpotrf.$(SUFFIX) zpotrf.$(PSUFFIX) : lapack/zpotrf.c | |||
| xpotrf.$(SUFFIX) xpotrf.$(PSUFFIX) : zpotrf.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| strti2.$(SUFFIX) strti2.$(PSUFFIX) : trti2.c | |||
| strti2.$(SUFFIX) strti2.$(PSUFFIX) : lapack/trti2.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| dtrti2.$(SUFFIX) dtrti2.$(PSUFFIX) : trti2.c | |||
| dtrti2.$(SUFFIX) dtrti2.$(PSUFFIX) : lapack/trti2.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| qtrti2.$(SUFFIX) qtrti2.$(PSUFFIX) : trti2.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| ctrti2.$(SUFFIX) ctrti2.$(PSUFFIX) : ztrti2.c | |||
| ctrti2.$(SUFFIX) ctrti2.$(PSUFFIX) : lapack/ztrti2.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| ztrti2.$(SUFFIX) ztrti2.$(PSUFFIX) : ztrti2.c | |||
| ztrti2.$(SUFFIX) ztrti2.$(PSUFFIX) : lapack/ztrti2.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| xtrti2.$(SUFFIX) xtrti2.$(PSUFFIX) : ztrti2.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| strtri.$(SUFFIX) strtri.$(PSUFFIX) : trtri.c | |||
| strtri.$(SUFFIX) strtri.$(PSUFFIX) : lapack/trtri.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| dtrtri.$(SUFFIX) dtrtri.$(PSUFFIX) : trtri.c | |||
| dtrtri.$(SUFFIX) dtrtri.$(PSUFFIX) : lapack/trtri.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| qtrtri.$(SUFFIX) qtrtri.$(PSUFFIX) : trtri.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| ctrtri.$(SUFFIX) ctrtri.$(PSUFFIX) : ztrtri.c | |||
| ctrtri.$(SUFFIX) ctrtri.$(PSUFFIX) : lapack/ztrtri.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| ztrtri.$(SUFFIX) ztrtri.$(PSUFFIX) : ztrtri.c | |||
| ztrtri.$(SUFFIX) ztrtri.$(PSUFFIX) : lapack/ztrtri.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| xtrtri.$(SUFFIX) xtrtri.$(PSUFFIX) : ztrtri.c | |||
| @@ -1950,19 +1955,19 @@ zgesv.$(SUFFIX) zgesv.$(PSUFFIX) : lapack/gesv.c | |||
| xgesv.$(SUFFIX) xgesv.$(PSUFFIX) : gesv.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| spotri.$(SUFFIX) spotri.$(PSUFFIX) : potri.c | |||
| spotri.$(SUFFIX) spotri.$(PSUFFIX) : lapack/potri.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| dpotri.$(SUFFIX) dpotri.$(PSUFFIX) : potri.c | |||
| dpotri.$(SUFFIX) dpotri.$(PSUFFIX) : lapack/potri.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| qpotri.$(SUFFIX) qpotri.$(PSUFFIX) : potri.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| cpotri.$(SUFFIX) cpotri.$(PSUFFIX) : zpotri.c | |||
| cpotri.$(SUFFIX) cpotri.$(PSUFFIX) : lapack/zpotri.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| zpotri.$(SUFFIX) zpotri.$(PSUFFIX) : zpotri.c | |||
| zpotri.$(SUFFIX) zpotri.$(PSUFFIX) : lapack/zpotri.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| xpotri.$(SUFFIX) xpotri.$(PSUFFIX) : zpotri.c | |||
| @@ -149,7 +149,10 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ | |||
| blas_memory_free(buffer); | |||
| #endif | |||
| FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2. / 3. * args.m * args.n * args.n); | |||
| FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, .5 * args.n * args.n, | |||
| args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) | |||
| + args.n * (1./3. + args.n * (-1./2. + args.n * 1./6.))); | |||
| IDEBUG_END; | |||
| @@ -120,14 +120,14 @@ SLASRC = \ | |||
| slarrv.o slartv.o \ | |||
| slarz.o slarzb.o slarzt.o slasy2.o slasyf.o slasyf_rook.o \ | |||
| slatbs.o slatdf.o slatps.o slatrd.o slatrs.o slatrz.o slatzm.o \ | |||
| slauu2.o slauum.o sopgtr.o sopmtr.o sorg2l.o sorg2r.o \ | |||
| sopgtr.o sopmtr.o sorg2l.o sorg2r.o \ | |||
| sorgbr.o sorghr.o sorgl2.o sorglq.o sorgql.o sorgqr.o sorgr2.o \ | |||
| sorgrq.o sorgtr.o sorm2l.o sorm2r.o \ | |||
| sormbr.o sormhr.o sorml2.o sormlq.o sormql.o sormqr.o sormr2.o \ | |||
| sormr3.o sormrq.o sormrz.o sormtr.o spbcon.o spbequ.o spbrfs.o \ | |||
| spbstf.o spbsv.o spbsvx.o \ | |||
| spbtf2.o spbtrf.o spbtrs.o spocon.o spoequ.o sporfs.o sposv.o \ | |||
| sposvx.o spotri.o spstrf.o spstf2.o \ | |||
| sposvx.o spstrf.o spstf2.o \ | |||
| sppcon.o sppequ.o \ | |||
| spprfs.o sppsv.o sppsvx.o spptrf.o spptri.o spptrs.o sptcon.o \ | |||
| spteqr.o sptrfs.o sptsv.o sptsvx.o spttrs.o sptts2.o srscl.o \ | |||
| @@ -147,7 +147,7 @@ SLASRC = \ | |||
| stgsja.o stgsna.o stgsy2.o stgsyl.o stpcon.o stprfs.o stptri.o \ | |||
| stptrs.o \ | |||
| strcon.o strevc.o strexc.o strrfs.o strsen.o strsna.o strsyl.o \ | |||
| strti2.o strtri.o strtrs.o stzrqf.o stzrzf.o sstemr.o \ | |||
| strtrs.o stzrqf.o stzrzf.o sstemr.o \ | |||
| slansf.o spftrf.o spftri.o spftrs.o ssfrk.o stfsm.o stftri.o stfttp.o \ | |||
| stfttr.o stpttf.o stpttr.o strttf.o strttp.o \ | |||
| sgejsv.o sgesvj.o sgsvj0.o sgsvj1.o \ | |||
| @@ -208,9 +208,9 @@ CLASRC = \ | |||
| clarfx.o clargv.o clarnv.o clarrv.o clartg.o clartv.o \ | |||
| clarz.o clarzb.o clarzt.o clascl.o claset.o clasr.o classq.o \ | |||
| clasyf.o clasyf_rook.o clatbs.o clatdf.o clatps.o clatrd.o clatrs.o clatrz.o \ | |||
| clatzm.o clauu2.o clauum.o cpbcon.o cpbequ.o cpbrfs.o cpbstf.o cpbsv.o \ | |||
| clatzm.o cpbcon.o cpbequ.o cpbrfs.o cpbstf.o cpbsv.o \ | |||
| cpbsvx.o cpbtf2.o cpbtrf.o cpbtrs.o cpocon.o cpoequ.o cporfs.o \ | |||
| cposv.o cposvx.o cpotri.o cpstrf.o cpstf2.o \ | |||
| cposv.o cposvx.o cpstrf.o cpstf2.o \ | |||
| cppcon.o cppequ.o cpprfs.o cppsv.o cppsvx.o cpptrf.o cpptri.o cpptrs.o \ | |||
| cptcon.o cpteqr.o cptrfs.o cptsv.o cptsvx.o cpttrf.o cpttrs.o cptts2.o \ | |||
| crot.o cspcon.o cspmv.o cspr.o csprfs.o cspsv.o \ | |||
| @@ -225,7 +225,7 @@ CLASRC = \ | |||
| ctgexc.o ctgsen.o ctgsja.o ctgsna.o ctgsy2.o ctgsyl.o ctpcon.o \ | |||
| ctprfs.o ctptri.o \ | |||
| ctptrs.o ctrcon.o ctrevc.o ctrexc.o ctrrfs.o ctrsen.o ctrsna.o \ | |||
| ctrsyl.o ctrti2.o ctrtri.o ctrtrs.o ctzrqf.o ctzrzf.o cung2l.o cung2r.o \ | |||
| ctrsyl.o ctrtrs.o ctzrqf.o ctzrzf.o cung2l.o cung2r.o \ | |||
| cungbr.o cunghr.o cungl2.o cunglq.o cungql.o cungqr.o cungr2.o \ | |||
| cungrq.o cungtr.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o \ | |||
| cunmlq.o cunmql.o cunmqr.o cunmr2.o cunmr3.o cunmrq.o cunmrz.o \ | |||
| @@ -279,15 +279,15 @@ DLASRC = \ | |||
| dlarf.o dlarfb.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o \ | |||
| dlargv.o dlarrv.o dlartv.o \ | |||
| dlarz.o dlarzb.o dlarzt.o dlasy2.o dlasyf.o dlasyf_rook.o \ | |||
| dlatbs.o dlatdf.o dlatps.o dlatrd.o dlatrs.o dlatrz.o dlatzm.o dlauu2.o \ | |||
| dlauum.o dopgtr.o dopmtr.o dorg2l.o dorg2r.o \ | |||
| dlatbs.o dlatdf.o dlatps.o dlatrd.o dlatrs.o dlatrz.o dlatzm.o \ | |||
| dopgtr.o dopmtr.o dorg2l.o dorg2r.o \ | |||
| dorgbr.o dorghr.o dorgl2.o dorglq.o dorgql.o dorgqr.o dorgr2.o \ | |||
| dorgrq.o dorgtr.o dorm2l.o dorm2r.o \ | |||
| dormbr.o dormhr.o dorml2.o dormlq.o dormql.o dormqr.o dormr2.o \ | |||
| dormr3.o dormrq.o dormrz.o dormtr.o dpbcon.o dpbequ.o dpbrfs.o \ | |||
| dpbstf.o dpbsv.o dpbsvx.o \ | |||
| dpbtf2.o dpbtrf.o dpbtrs.o dpocon.o dpoequ.o dporfs.o dposv.o \ | |||
| dposvx.o dpotri.o dpotrs.o dpstrf.o dpstf2.o \ | |||
| dposvx.o dpotrs.o dpstrf.o dpstf2.o \ | |||
| dppcon.o dppequ.o \ | |||
| dpprfs.o dppsv.o dppsvx.o dpptrf.o dpptri.o dpptrs.o dptcon.o \ | |||
| dpteqr.o dptrfs.o dptsv.o dptsvx.o dpttrs.o dptts2.o drscl.o \ | |||
| @@ -307,7 +307,7 @@ DLASRC = \ | |||
| dtgsja.o dtgsna.o dtgsy2.o dtgsyl.o dtpcon.o dtprfs.o dtptri.o \ | |||
| dtptrs.o \ | |||
| dtrcon.o dtrevc.o dtrexc.o dtrrfs.o dtrsen.o dtrsna.o dtrsyl.o \ | |||
| dtrti2.o dtrtri.o dtrtrs.o dtzrqf.o dtzrzf.o dstemr.o \ | |||
| dtrtrs.o dtzrqf.o dtzrzf.o dstemr.o \ | |||
| dsgesv.o dsposv.o dlag2s.o slag2d.o dlat2s.o \ | |||
| dlansf.o dpftrf.o dpftri.o dpftrs.o dsfrk.o dtfsm.o dtftri.o dtfttp.o \ | |||
| dtfttr.o dtpttf.o dtpttr.o dtrttf.o dtrttp.o \ | |||
| @@ -369,10 +369,10 @@ ZLASRC = \ | |||
| zlarfx.o zlargv.o zlarnv.o zlarrv.o zlartg.o zlartv.o \ | |||
| zlarz.o zlarzb.o zlarzt.o zlascl.o zlaset.o zlasr.o \ | |||
| zlassq.o zlasyf.o zlasyf_rook.o \ | |||
| zlatbs.o zlatdf.o zlatps.o zlatrd.o zlatrs.o zlatrz.o zlatzm.o zlauu2.o \ | |||
| zlauum.o zpbcon.o zpbequ.o zpbrfs.o zpbstf.o zpbsv.o \ | |||
| zlatbs.o zlatdf.o zlatps.o zlatrd.o zlatrs.o zlatrz.o zlatzm.o \ | |||
| zpbcon.o zpbequ.o zpbrfs.o zpbstf.o zpbsv.o \ | |||
| zpbsvx.o zpbtf2.o zpbtrf.o zpbtrs.o zpocon.o zpoequ.o zporfs.o \ | |||
| zposv.o zposvx.o zpotri.o zpotrs.o zpstrf.o zpstf2.o \ | |||
| zposv.o zposvx.o zpotrs.o zpstrf.o zpstf2.o \ | |||
| zppcon.o zppequ.o zpprfs.o zppsv.o zppsvx.o zpptrf.o zpptri.o zpptrs.o \ | |||
| zptcon.o zpteqr.o zptrfs.o zptsv.o zptsvx.o zpttrf.o zpttrs.o zptts2.o \ | |||
| zrot.o zspcon.o zspmv.o zspr.o zsprfs.o zspsv.o \ | |||
| @@ -387,7 +387,7 @@ ZLASRC = \ | |||
| ztgexc.o ztgsen.o ztgsja.o ztgsna.o ztgsy2.o ztgsyl.o ztpcon.o \ | |||
| ztprfs.o ztptri.o \ | |||
| ztptrs.o ztrcon.o ztrevc.o ztrexc.o ztrrfs.o ztrsen.o ztrsna.o \ | |||
| ztrsyl.o ztrti2.o ztrtri.o ztrtrs.o ztzrqf.o ztzrzf.o zung2l.o \ | |||
| ztrsyl.o ztrtrs.o ztzrqf.o ztzrzf.o zung2l.o \ | |||
| zung2r.o zungbr.o zunghr.o zungl2.o zunglq.o zungql.o zungqr.o zungr2.o \ | |||
| zungrq.o zungtr.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o \ | |||
| zunmlq.o zunmql.o zunmqr.o zunmr2.o zunmr3.o zunmrq.o zunmrz.o \ | |||
| @@ -5,7 +5,7 @@ SEP: Data file for testing Symmetric Eigenvalue Problem routines | |||
| 1 3 3 3 10 Values of NB (blocksize) | |||
| 2 2 2 2 2 Values of NBMIN (minimum blocksize) | |||
| 1 0 5 9 1 Values of NX (crossover point) | |||
| 50.0 Threshold value | |||
| 60.0 Threshold value | |||
| T Put T to test the LAPACK routines | |||
| T Put T to test the driver routines | |||
| T Put T to test the error exits | |||
| @@ -2072,9 +2072,9 @@ SOBJ_FILES := $(SSRC_OBJ) | |||
| DOBJ_FILES := $(DSRC_OBJ) | |||
| ZOBJ_FILES := $(ZSRC_OBJ) | |||
| ifdef LAPACKE_TESTING | |||
| # ifdef LAPACKE_TESTING | |||
| ZOBJ_FILES += $(MATGEN_OBJ) | |||
| endif | |||
| #endif | |||
| ALLOBJ = $(COBJ_FILES) $(DOBJ_FILES) $(SOBJ_FILES) $(ZOBJ_FILES) $(OBJ_FILES) | |||
| @@ -2,7 +2,7 @@ TOPDIR = .. | |||
| include ../Makefile.system | |||
| #SUBDIRS = laswp getf2 getrf potf2 potrf lauu2 lauum trti2 trtri getrs | |||
| SUBDIRS = getrf getf2 laswp getrs potrf potf2 | |||
| SUBDIRS = getrf getf2 laswp getrs potrf potf2 lauu2 lauum trti2 trtri | |||
| FLAMEDIRS = laswp getf2 potf2 lauu2 trti2 | |||
| @@ -1,194 +0,0 @@ | |||
| SUBROUTINE CGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO ) | |||
| * | |||
| * -- LAPACK routine (version 3.0) -- | |||
| * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., | |||
| * Courant Institute, Argonne National Lab, and Rice University | |||
| * June 30, 1999 | |||
| * | |||
| * .. Scalar Arguments .. | |||
| INTEGER INFO, LDA, LWORK, N | |||
| * .. | |||
| * .. Array Arguments .. | |||
| INTEGER IPIV( * ) | |||
| COMPLEX A( LDA, * ), WORK( * ) | |||
| * .. | |||
| * | |||
| * Purpose | |||
| * ======= | |||
| * | |||
| * CGETRI computes the inverse of a matrix using the LU factorization | |||
| * computed by CGETRF. | |||
| * | |||
| * This method inverts U and then computes inv(A) by solving the system | |||
| * inv(A)*L = inv(U) for inv(A). | |||
| * | |||
| * Arguments | |||
| * ========= | |||
| * | |||
| * N (input) INTEGER | |||
| * The order of the matrix A. N >= 0. | |||
| * | |||
| * A (input/output) COMPLEX array, dimension (LDA,N) | |||
| * On entry, the factors L and U from the factorization | |||
| * A = P*L*U as computed by CGETRF. | |||
| * On exit, if INFO = 0, the inverse of the original matrix A. | |||
| * | |||
| * LDA (input) INTEGER | |||
| * The leading dimension of the array A. LDA >= max(1,N). | |||
| * | |||
| * IPIV (input) INTEGER array, dimension (N) | |||
| * The pivot indices from CGETRF; for 1<=i<=N, row i of the | |||
| * matrix was interchanged with row IPIV(i). | |||
| * | |||
| * WORK (workspace/output) COMPLEX array, dimension (LWORK) | |||
| * On exit, if INFO=0, then WORK(1) returns the optimal LWORK. | |||
| * | |||
| * LWORK (input) INTEGER | |||
| * The dimension of the array WORK. LWORK >= max(1,N). | |||
| * For optimal performance LWORK >= N*NB, where NB is | |||
| * the optimal blocksize returned by ILAENV. | |||
| * | |||
| * If LWORK = -1, then a workspace query is assumed; the routine | |||
| * only calculates the optimal size of the WORK array, returns | |||
| * this value as the first entry of the WORK array, and no error | |||
| * message related to LWORK is issued by XERBLA. | |||
| * | |||
| * INFO (output) INTEGER | |||
| * = 0: successful exit | |||
| * < 0: if INFO = -i, the i-th argument had an illegal value | |||
| * > 0: if INFO = i, U(i,i) is exactly zero; the matrix is | |||
| * singular and its inverse could not be computed. | |||
| * | |||
| * ===================================================================== | |||
| * | |||
| * .. Parameters .. | |||
| COMPLEX ZERO, ONE | |||
| PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ), | |||
| $ ONE = ( 1.0E+0, 0.0E+0 ) ) | |||
| * .. | |||
| * .. Local Scalars .. | |||
| LOGICAL LQUERY | |||
| INTEGER I, IWS, J, JB, JJ, JP, LDWORK, LWKOPT, NB, | |||
| $ NBMIN, NN | |||
| * .. | |||
| * .. External Functions .. | |||
| INTEGER ILAENV | |||
| EXTERNAL ILAENV | |||
| * .. | |||
| * .. External Subroutines .. | |||
| EXTERNAL CGEMM, CGEMV, CSWAP, CTRSM, CTRTRI, XERBLA | |||
| * .. | |||
| * .. Intrinsic Functions .. | |||
| INTRINSIC MAX, MIN | |||
| * .. | |||
| * .. Executable Statements .. | |||
| * | |||
| * Test the input parameters. | |||
| * | |||
| INFO = 0 | |||
| NB = ILAENV( 1, 'CGETRI', ' ', N, -1, -1, -1 ) | |||
| LWKOPT = N*NB | |||
| WORK( 1 ) = LWKOPT | |||
| LQUERY = ( LWORK.EQ.-1 ) | |||
| IF( N.LT.0 ) THEN | |||
| INFO = -1 | |||
| ELSE IF( LDA.LT.MAX( 1, N ) ) THEN | |||
| INFO = -3 | |||
| ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN | |||
| INFO = -6 | |||
| END IF | |||
| IF( INFO.NE.0 ) THEN | |||
| CALL XERBLA( 'CGETRI', -INFO ) | |||
| RETURN | |||
| ELSE IF( LQUERY ) THEN | |||
| RETURN | |||
| END IF | |||
| * | |||
| * Quick return if possible | |||
| * | |||
| IF( N.EQ.0 ) | |||
| $ RETURN | |||
| * | |||
| * Form inv(U). If INFO > 0 from CTRTRI, then U is singular, | |||
| * and the inverse is not computed. | |||
| * | |||
| CALL CTRTRI( 'Upper', 'Non-unit', N, A, LDA, INFO ) | |||
| IF( INFO.GT.0 ) | |||
| $ RETURN | |||
| * | |||
| NBMIN = 2 | |||
| LDWORK = N | |||
| IF( NB.GT.1 .AND. NB.LT.N ) THEN | |||
| IWS = MAX( LDWORK*NB, 1 ) | |||
| IF( LWORK.LT.IWS ) THEN | |||
| NB = LWORK / LDWORK | |||
| NBMIN = MAX( 2, ILAENV( 2, 'CGETRI', ' ', N, -1, -1, -1 ) ) | |||
| END IF | |||
| ELSE | |||
| IWS = N | |||
| END IF | |||
| * | |||
| * Solve the equation inv(A)*L = inv(U) for inv(A). | |||
| * | |||
| IF( NB.LT.NBMIN .OR. NB.GE.N ) THEN | |||
| * | |||
| * Use unblocked code. | |||
| * | |||
| DO 20 J = N, 1, -1 | |||
| * | |||
| * Copy current column of L to WORK and replace with zeros. | |||
| * | |||
| DO 10 I = J + 1, N | |||
| WORK( I ) = A( I, J ) | |||
| A( I, J ) = ZERO | |||
| 10 CONTINUE | |||
| * | |||
| * Compute current column of inv(A). | |||
| * | |||
| IF( J.LT.N ) | |||
| $ CALL CGEMV( 'No transpose', N, N-J, -ONE, A( 1, J+1 ), | |||
| $ LDA, WORK( J+1 ), 1, ONE, A( 1, J ), 1 ) | |||
| 20 CONTINUE | |||
| ELSE | |||
| * | |||
| * Use blocked code. | |||
| * | |||
| NN = ( ( N-1 ) / NB )*NB + 1 | |||
| DO 50 J = NN, 1, -NB | |||
| JB = MIN( NB, N-J+1 ) | |||
| * | |||
| * Copy current block column of L to WORK and replace with | |||
| * zeros. | |||
| * | |||
| DO 40 JJ = J, J + JB - 1 | |||
| DO 30 I = JJ + 1, N | |||
| WORK( I+( JJ-J )*LDWORK ) = A( I, JJ ) | |||
| A( I, JJ ) = ZERO | |||
| 30 CONTINUE | |||
| 40 CONTINUE | |||
| * | |||
| * Compute current block column of inv(A). | |||
| * | |||
| IF( J+JB.LE.N ) | |||
| $ CALL CGEMM( 'No transpose', 'No transpose', N, JB, | |||
| $ N-J-JB+1, -ONE, A( 1, J+JB ), LDA, | |||
| $ WORK( J+JB ), LDWORK, ONE, A( 1, J ), LDA ) | |||
| CALL CTRSM( 'Right', 'Lower', 'No transpose', 'Unit', N, JB, | |||
| $ ONE, WORK( J ), LDWORK, A( 1, J ), LDA ) | |||
| 50 CONTINUE | |||
| END IF | |||
| * | |||
| * Apply column interchanges. | |||
| * | |||
| DO 60 J = N - 1, 1, -1 | |||
| JP = IPIV( J ) | |||
| IF( JP.NE.J ) | |||
| $ CALL CSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 ) | |||
| 60 CONTINUE | |||
| * | |||
| WORK( 1 ) = IWS | |||
| RETURN | |||
| * | |||
| * End of CGETRI | |||
| * | |||
| END | |||
| @@ -1,193 +0,0 @@ | |||
| SUBROUTINE DGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO ) | |||
| * | |||
| * -- LAPACK routine (version 3.0) -- | |||
| * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., | |||
| * Courant Institute, Argonne National Lab, and Rice University | |||
| * June 30, 1999 | |||
| * | |||
| * .. Scalar Arguments .. | |||
| INTEGER INFO, LDA, LWORK, N | |||
| * .. | |||
| * .. Array Arguments .. | |||
| INTEGER IPIV( * ) | |||
| DOUBLE PRECISION A( LDA, * ), WORK( * ) | |||
| * .. | |||
| * | |||
| * Purpose | |||
| * ======= | |||
| * | |||
| * DGETRI computes the inverse of a matrix using the LU factorization | |||
| * computed by DGETRF. | |||
| * | |||
| * This method inverts U and then computes inv(A) by solving the system | |||
| * inv(A)*L = inv(U) for inv(A). | |||
| * | |||
| * Arguments | |||
| * ========= | |||
| * | |||
| * N (input) INTEGER | |||
| * The order of the matrix A. N >= 0. | |||
| * | |||
| * A (input/output) DOUBLE PRECISION array, dimension (LDA,N) | |||
| * On entry, the factors L and U from the factorization | |||
| * A = P*L*U as computed by DGETRF. | |||
| * On exit, if INFO = 0, the inverse of the original matrix A. | |||
| * | |||
| * LDA (input) INTEGER | |||
| * The leading dimension of the array A. LDA >= max(1,N). | |||
| * | |||
| * IPIV (input) INTEGER array, dimension (N) | |||
| * The pivot indices from DGETRF; for 1<=i<=N, row i of the | |||
| * matrix was interchanged with row IPIV(i). | |||
| * | |||
| * WORK (workspace/output) DOUBLE PRECISION array, dimension (LWORK) | |||
| * On exit, if INFO=0, then WORK(1) returns the optimal LWORK. | |||
| * | |||
| * LWORK (input) INTEGER | |||
| * The dimension of the array WORK. LWORK >= max(1,N). | |||
| * For optimal performance LWORK >= N*NB, where NB is | |||
| * the optimal blocksize returned by ILAENV. | |||
| * | |||
| * If LWORK = -1, then a workspace query is assumed; the routine | |||
| * only calculates the optimal size of the WORK array, returns | |||
| * this value as the first entry of the WORK array, and no error | |||
| * message related to LWORK is issued by XERBLA. | |||
| * | |||
| * INFO (output) INTEGER | |||
| * = 0: successful exit | |||
| * < 0: if INFO = -i, the i-th argument had an illegal value | |||
| * > 0: if INFO = i, U(i,i) is exactly zero; the matrix is | |||
| * singular and its inverse could not be computed. | |||
| * | |||
| * ===================================================================== | |||
| * | |||
| * .. Parameters .. | |||
| DOUBLE PRECISION ZERO, ONE | |||
| PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) | |||
| * .. | |||
| * .. Local Scalars .. | |||
| LOGICAL LQUERY | |||
| INTEGER I, IWS, J, JB, JJ, JP, LDWORK, LWKOPT, NB, | |||
| $ NBMIN, NN | |||
| * .. | |||
| * .. External Functions .. | |||
| INTEGER ILAENV | |||
| EXTERNAL ILAENV | |||
| * .. | |||
| * .. External Subroutines .. | |||
| EXTERNAL DGEMM, DGEMV, DSWAP, DTRSM, DTRTRI, XERBLA | |||
| * .. | |||
| * .. Intrinsic Functions .. | |||
| INTRINSIC MAX, MIN | |||
| * .. | |||
| * .. Executable Statements .. | |||
| * | |||
| * Test the input parameters. | |||
| * | |||
| INFO = 0 | |||
| NB = ILAENV( 1, 'DGETRI', ' ', N, -1, -1, -1 ) | |||
| LWKOPT = N*NB | |||
| WORK( 1 ) = LWKOPT | |||
| LQUERY = ( LWORK.EQ.-1 ) | |||
| IF( N.LT.0 ) THEN | |||
| INFO = -1 | |||
| ELSE IF( LDA.LT.MAX( 1, N ) ) THEN | |||
| INFO = -3 | |||
| ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN | |||
| INFO = -6 | |||
| END IF | |||
| IF( INFO.NE.0 ) THEN | |||
| CALL XERBLA( 'DGETRI', -INFO ) | |||
| RETURN | |||
| ELSE IF( LQUERY ) THEN | |||
| RETURN | |||
| END IF | |||
| * | |||
| * Quick return if possible | |||
| * | |||
| IF( N.EQ.0 ) | |||
| $ RETURN | |||
| * | |||
| * Form inv(U). If INFO > 0 from DTRTRI, then U is singular, | |||
| * and the inverse is not computed. | |||
| * | |||
| CALL DTRTRI( 'Upper', 'Non-unit', N, A, LDA, INFO ) | |||
| IF( INFO.GT.0 ) | |||
| $ RETURN | |||
| * | |||
| NBMIN = 2 | |||
| LDWORK = N | |||
| IF( NB.GT.1 .AND. NB.LT.N ) THEN | |||
| IWS = MAX( LDWORK*NB, 1 ) | |||
| IF( LWORK.LT.IWS ) THEN | |||
| NB = LWORK / LDWORK | |||
| NBMIN = MAX( 2, ILAENV( 2, 'DGETRI', ' ', N, -1, -1, -1 ) ) | |||
| END IF | |||
| ELSE | |||
| IWS = N | |||
| END IF | |||
| * | |||
| * Solve the equation inv(A)*L = inv(U) for inv(A). | |||
| * | |||
| IF( NB.LT.NBMIN .OR. NB.GE.N ) THEN | |||
| * | |||
| * Use unblocked code. | |||
| * | |||
| DO 20 J = N, 1, -1 | |||
| * | |||
| * Copy current column of L to WORK and replace with zeros. | |||
| * | |||
| DO 10 I = J + 1, N | |||
| WORK( I ) = A( I, J ) | |||
| A( I, J ) = ZERO | |||
| 10 CONTINUE | |||
| * | |||
| * Compute current column of inv(A). | |||
| * | |||
| IF( J.LT.N ) | |||
| $ CALL DGEMV( 'No transpose', N, N-J, -ONE, A( 1, J+1 ), | |||
| $ LDA, WORK( J+1 ), 1, ONE, A( 1, J ), 1 ) | |||
| 20 CONTINUE | |||
| ELSE | |||
| * | |||
| * Use blocked code. | |||
| * | |||
| NN = ( ( N-1 ) / NB )*NB + 1 | |||
| DO 50 J = NN, 1, -NB | |||
| JB = MIN( NB, N-J+1 ) | |||
| * | |||
| * Copy current block column of L to WORK and replace with | |||
| * zeros. | |||
| * | |||
| DO 40 JJ = J, J + JB - 1 | |||
| DO 30 I = JJ + 1, N | |||
| WORK( I+( JJ-J )*LDWORK ) = A( I, JJ ) | |||
| A( I, JJ ) = ZERO | |||
| 30 CONTINUE | |||
| 40 CONTINUE | |||
| * | |||
| * Compute current block column of inv(A). | |||
| * | |||
| IF( J+JB.LE.N ) | |||
| $ CALL DGEMM( 'No transpose', 'No transpose', N, JB, | |||
| $ N-J-JB+1, -ONE, A( 1, J+JB ), LDA, | |||
| $ WORK( J+JB ), LDWORK, ONE, A( 1, J ), LDA ) | |||
| CALL DTRSM( 'Right', 'Lower', 'No transpose', 'Unit', N, JB, | |||
| $ ONE, WORK( J ), LDWORK, A( 1, J ), LDA ) | |||
| 50 CONTINUE | |||
| END IF | |||
| * | |||
| * Apply column interchanges. | |||
| * | |||
| DO 60 J = N - 1, 1, -1 | |||
| JP = IPIV( J ) | |||
| IF( JP.NE.J ) | |||
| $ CALL DSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 ) | |||
| 60 CONTINUE | |||
| * | |||
| WORK( 1 ) = IWS | |||
| RETURN | |||
| * | |||
| * End of DGETRI | |||
| * | |||
| END | |||
| @@ -1,193 +0,0 @@ | |||
| SUBROUTINE SGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO ) | |||
| * | |||
| * -- LAPACK routine (version 3.0) -- | |||
| * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., | |||
| * Courant Institute, Argonne National Lab, and Rice University | |||
| * June 30, 1999 | |||
| * | |||
| * .. Scalar Arguments .. | |||
| INTEGER INFO, LDA, LWORK, N | |||
| * .. | |||
| * .. Array Arguments .. | |||
| INTEGER IPIV( * ) | |||
| REAL A( LDA, * ), WORK( * ) | |||
| * .. | |||
| * | |||
| * Purpose | |||
| * ======= | |||
| * | |||
| * SGETRI computes the inverse of a matrix using the LU factorization | |||
| * computed by SGETRF. | |||
| * | |||
| * This method inverts U and then computes inv(A) by solving the system | |||
| * inv(A)*L = inv(U) for inv(A). | |||
| * | |||
| * Arguments | |||
| * ========= | |||
| * | |||
| * N (input) INTEGER | |||
| * The order of the matrix A. N >= 0. | |||
| * | |||
| * A (input/output) REAL array, dimension (LDA,N) | |||
| * On entry, the factors L and U from the factorization | |||
| * A = P*L*U as computed by SGETRF. | |||
| * On exit, if INFO = 0, the inverse of the original matrix A. | |||
| * | |||
| * LDA (input) INTEGER | |||
| * The leading dimension of the array A. LDA >= max(1,N). | |||
| * | |||
| * IPIV (input) INTEGER array, dimension (N) | |||
| * The pivot indices from SGETRF; for 1<=i<=N, row i of the | |||
| * matrix was interchanged with row IPIV(i). | |||
| * | |||
| * WORK (workspace/output) REAL array, dimension (LWORK) | |||
| * On exit, if INFO=0, then WORK(1) returns the optimal LWORK. | |||
| * | |||
| * LWORK (input) INTEGER | |||
| * The dimension of the array WORK. LWORK >= max(1,N). | |||
| * For optimal performance LWORK >= N*NB, where NB is | |||
| * the optimal blocksize returned by ILAENV. | |||
| * | |||
| * If LWORK = -1, then a workspace query is assumed; the routine | |||
| * only calculates the optimal size of the WORK array, returns | |||
| * this value as the first entry of the WORK array, and no error | |||
| * message related to LWORK is issued by XERBLA. | |||
| * | |||
| * INFO (output) INTEGER | |||
| * = 0: successful exit | |||
| * < 0: if INFO = -i, the i-th argument had an illegal value | |||
| * > 0: if INFO = i, U(i,i) is exactly zero; the matrix is | |||
| * singular and its inverse could not be computed. | |||
| * | |||
| * ===================================================================== | |||
| * | |||
| * .. Parameters .. | |||
| REAL ZERO, ONE | |||
| PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) | |||
| * .. | |||
| * .. Local Scalars .. | |||
| LOGICAL LQUERY | |||
| INTEGER I, IWS, J, JB, JJ, JP, LDWORK, LWKOPT, NB, | |||
| $ NBMIN, NN | |||
| * .. | |||
| * .. External Functions .. | |||
| INTEGER ILAENV | |||
| EXTERNAL ILAENV | |||
| * .. | |||
| * .. External Subroutines .. | |||
| EXTERNAL SGEMM, SGEMV, SSWAP, STRSM, STRTRI, XERBLA | |||
| * .. | |||
| * .. Intrinsic Functions .. | |||
| INTRINSIC MAX, MIN | |||
| * .. | |||
| * .. Executable Statements .. | |||
| * | |||
| * Test the input parameters. | |||
| * | |||
| INFO = 0 | |||
| NB = ILAENV( 1, 'SGETRI', ' ', N, -1, -1, -1 ) | |||
| LWKOPT = N*NB | |||
| WORK( 1 ) = LWKOPT | |||
| LQUERY = ( LWORK.EQ.-1 ) | |||
| IF( N.LT.0 ) THEN | |||
| INFO = -1 | |||
| ELSE IF( LDA.LT.MAX( 1, N ) ) THEN | |||
| INFO = -3 | |||
| ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN | |||
| INFO = -6 | |||
| END IF | |||
| IF( INFO.NE.0 ) THEN | |||
| CALL XERBLA( 'SGETRI', -INFO ) | |||
| RETURN | |||
| ELSE IF( LQUERY ) THEN | |||
| RETURN | |||
| END IF | |||
| * | |||
| * Quick return if possible | |||
| * | |||
| IF( N.EQ.0 ) | |||
| $ RETURN | |||
| * | |||
| * Form inv(U). If INFO > 0 from STRTRI, then U is singular, | |||
| * and the inverse is not computed. | |||
| * | |||
| CALL STRTRI( 'Upper', 'Non-unit', N, A, LDA, INFO ) | |||
| IF( INFO.GT.0 ) | |||
| $ RETURN | |||
| * | |||
| NBMIN = 2 | |||
| LDWORK = N | |||
| IF( NB.GT.1 .AND. NB.LT.N ) THEN | |||
| IWS = MAX( LDWORK*NB, 1 ) | |||
| IF( LWORK.LT.IWS ) THEN | |||
| NB = LWORK / LDWORK | |||
| NBMIN = MAX( 2, ILAENV( 2, 'SGETRI', ' ', N, -1, -1, -1 ) ) | |||
| END IF | |||
| ELSE | |||
| IWS = N | |||
| END IF | |||
| * | |||
| * Solve the equation inv(A)*L = inv(U) for inv(A). | |||
| * | |||
| IF( NB.LT.NBMIN .OR. NB.GE.N ) THEN | |||
| * | |||
| * Use unblocked code. | |||
| * | |||
| DO 20 J = N, 1, -1 | |||
| * | |||
| * Copy current column of L to WORK and replace with zeros. | |||
| * | |||
| DO 10 I = J + 1, N | |||
| WORK( I ) = A( I, J ) | |||
| A( I, J ) = ZERO | |||
| 10 CONTINUE | |||
| * | |||
| * Compute current column of inv(A). | |||
| * | |||
| IF( J.LT.N ) | |||
| $ CALL SGEMV( 'No transpose', N, N-J, -ONE, A( 1, J+1 ), | |||
| $ LDA, WORK( J+1 ), 1, ONE, A( 1, J ), 1 ) | |||
| 20 CONTINUE | |||
| ELSE | |||
| * | |||
| * Use blocked code. | |||
| * | |||
| NN = ( ( N-1 ) / NB )*NB + 1 | |||
| DO 50 J = NN, 1, -NB | |||
| JB = MIN( NB, N-J+1 ) | |||
| * | |||
| * Copy current block column of L to WORK and replace with | |||
| * zeros. | |||
| * | |||
| DO 40 JJ = J, J + JB - 1 | |||
| DO 30 I = JJ + 1, N | |||
| WORK( I+( JJ-J )*LDWORK ) = A( I, JJ ) | |||
| A( I, JJ ) = ZERO | |||
| 30 CONTINUE | |||
| 40 CONTINUE | |||
| * | |||
| * Compute current block column of inv(A). | |||
| * | |||
| IF( J+JB.LE.N ) | |||
| $ CALL SGEMM( 'No transpose', 'No transpose', N, JB, | |||
| $ N-J-JB+1, -ONE, A( 1, J+JB ), LDA, | |||
| $ WORK( J+JB ), LDWORK, ONE, A( 1, J ), LDA ) | |||
| CALL STRSM( 'Right', 'Lower', 'No transpose', 'Unit', N, JB, | |||
| $ ONE, WORK( J ), LDWORK, A( 1, J ), LDA ) | |||
| 50 CONTINUE | |||
| END IF | |||
| * | |||
| * Apply column interchanges. | |||
| * | |||
| DO 60 J = N - 1, 1, -1 | |||
| JP = IPIV( J ) | |||
| IF( JP.NE.J ) | |||
| $ CALL SSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 ) | |||
| 60 CONTINUE | |||
| * | |||
| WORK( 1 ) = IWS | |||
| RETURN | |||
| * | |||
| * End of SGETRI | |||
| * | |||
| END | |||
| @@ -1,194 +0,0 @@ | |||
| SUBROUTINE ZGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO ) | |||
| * | |||
| * -- LAPACK routine (version 3.0) -- | |||
| * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., | |||
| * Courant Institute, Argonne National Lab, and Rice University | |||
| * June 30, 1999 | |||
| * | |||
| * .. Scalar Arguments .. | |||
| INTEGER INFO, LDA, LWORK, N | |||
| * .. | |||
| * .. Array Arguments .. | |||
| INTEGER IPIV( * ) | |||
| COMPLEX*16 A( LDA, * ), WORK( * ) | |||
| * .. | |||
| * | |||
| * Purpose | |||
| * ======= | |||
| * | |||
| * ZGETRI computes the inverse of a matrix using the LU factorization | |||
| * computed by ZGETRF. | |||
| * | |||
| * This method inverts U and then computes inv(A) by solving the system | |||
| * inv(A)*L = inv(U) for inv(A). | |||
| * | |||
| * Arguments | |||
| * ========= | |||
| * | |||
| * N (input) INTEGER | |||
| * The order of the matrix A. N >= 0. | |||
| * | |||
| * A (input/output) COMPLEX*16 array, dimension (LDA,N) | |||
| * On entry, the factors L and U from the factorization | |||
| * A = P*L*U as computed by ZGETRF. | |||
| * On exit, if INFO = 0, the inverse of the original matrix A. | |||
| * | |||
| * LDA (input) INTEGER | |||
| * The leading dimension of the array A. LDA >= max(1,N). | |||
| * | |||
| * IPIV (input) INTEGER array, dimension (N) | |||
| * The pivot indices from ZGETRF; for 1<=i<=N, row i of the | |||
| * matrix was interchanged with row IPIV(i). | |||
| * | |||
| * WORK (workspace/output) COMPLEX*16 array, dimension (LWORK) | |||
| * On exit, if INFO=0, then WORK(1) returns the optimal LWORK. | |||
| * | |||
| * LWORK (input) INTEGER | |||
| * The dimension of the array WORK. LWORK >= max(1,N). | |||
| * For optimal performance LWORK >= N*NB, where NB is | |||
| * the optimal blocksize returned by ILAENV. | |||
| * | |||
| * If LWORK = -1, then a workspace query is assumed; the routine | |||
| * only calculates the optimal size of the WORK array, returns | |||
| * this value as the first entry of the WORK array, and no error | |||
| * message related to LWORK is issued by XERBLA. | |||
| * | |||
| * INFO (output) INTEGER | |||
| * = 0: successful exit | |||
| * < 0: if INFO = -i, the i-th argument had an illegal value | |||
| * > 0: if INFO = i, U(i,i) is exactly zero; the matrix is | |||
| * singular and its inverse could not be computed. | |||
| * | |||
| * ===================================================================== | |||
| * | |||
| * .. Parameters .. | |||
| COMPLEX*16 ZERO, ONE | |||
| PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ), | |||
| $ ONE = ( 1.0D+0, 0.0D+0 ) ) | |||
| * .. | |||
| * .. Local Scalars .. | |||
| LOGICAL LQUERY | |||
| INTEGER I, IWS, J, JB, JJ, JP, LDWORK, LWKOPT, NB, | |||
| $ NBMIN, NN | |||
| * .. | |||
| * .. External Functions .. | |||
| INTEGER ILAENV | |||
| EXTERNAL ILAENV | |||
| * .. | |||
| * .. External Subroutines .. | |||
| EXTERNAL XERBLA, ZGEMM, ZGEMV, ZSWAP, ZTRSM, ZTRTRI | |||
| * .. | |||
| * .. Intrinsic Functions .. | |||
| INTRINSIC MAX, MIN | |||
| * .. | |||
| * .. Executable Statements .. | |||
| * | |||
| * Test the input parameters. | |||
| * | |||
| INFO = 0 | |||
| NB = ILAENV( 1, 'ZGETRI', ' ', N, -1, -1, -1 ) | |||
| LWKOPT = N*NB | |||
| WORK( 1 ) = LWKOPT | |||
| LQUERY = ( LWORK.EQ.-1 ) | |||
| IF( N.LT.0 ) THEN | |||
| INFO = -1 | |||
| ELSE IF( LDA.LT.MAX( 1, N ) ) THEN | |||
| INFO = -3 | |||
| ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN | |||
| INFO = -6 | |||
| END IF | |||
| IF( INFO.NE.0 ) THEN | |||
| CALL XERBLA( 'ZGETRI', -INFO ) | |||
| RETURN | |||
| ELSE IF( LQUERY ) THEN | |||
| RETURN | |||
| END IF | |||
| * | |||
| * Quick return if possible | |||
| * | |||
| IF( N.EQ.0 ) | |||
| $ RETURN | |||
| * | |||
| * Form inv(U). If INFO > 0 from ZTRTRI, then U is singular, | |||
| * and the inverse is not computed. | |||
| * | |||
| CALL ZTRTRI( 'Upper', 'Non-unit', N, A, LDA, INFO ) | |||
| IF( INFO.GT.0 ) | |||
| $ RETURN | |||
| * | |||
| NBMIN = 2 | |||
| LDWORK = N | |||
| IF( NB.GT.1 .AND. NB.LT.N ) THEN | |||
| IWS = MAX( LDWORK*NB, 1 ) | |||
| IF( LWORK.LT.IWS ) THEN | |||
| NB = LWORK / LDWORK | |||
| NBMIN = MAX( 2, ILAENV( 2, 'ZGETRI', ' ', N, -1, -1, -1 ) ) | |||
| END IF | |||
| ELSE | |||
| IWS = N | |||
| END IF | |||
| * | |||
| * Solve the equation inv(A)*L = inv(U) for inv(A). | |||
| * | |||
| IF( NB.LT.NBMIN .OR. NB.GE.N ) THEN | |||
| * | |||
| * Use unblocked code. | |||
| * | |||
| DO 20 J = N, 1, -1 | |||
| * | |||
| * Copy current column of L to WORK and replace with zeros. | |||
| * | |||
| DO 10 I = J + 1, N | |||
| WORK( I ) = A( I, J ) | |||
| A( I, J ) = ZERO | |||
| 10 CONTINUE | |||
| * | |||
| * Compute current column of inv(A). | |||
| * | |||
| IF( J.LT.N ) | |||
| $ CALL ZGEMV( 'No transpose', N, N-J, -ONE, A( 1, J+1 ), | |||
| $ LDA, WORK( J+1 ), 1, ONE, A( 1, J ), 1 ) | |||
| 20 CONTINUE | |||
| ELSE | |||
| * | |||
| * Use blocked code. | |||
| * | |||
| NN = ( ( N-1 ) / NB )*NB + 1 | |||
| DO 50 J = NN, 1, -NB | |||
| JB = MIN( NB, N-J+1 ) | |||
| * | |||
| * Copy current block column of L to WORK and replace with | |||
| * zeros. | |||
| * | |||
| DO 40 JJ = J, J + JB - 1 | |||
| DO 30 I = JJ + 1, N | |||
| WORK( I+( JJ-J )*LDWORK ) = A( I, JJ ) | |||
| A( I, JJ ) = ZERO | |||
| 30 CONTINUE | |||
| 40 CONTINUE | |||
| * | |||
| * Compute current block column of inv(A). | |||
| * | |||
| IF( J+JB.LE.N ) | |||
| $ CALL ZGEMM( 'No transpose', 'No transpose', N, JB, | |||
| $ N-J-JB+1, -ONE, A( 1, J+JB ), LDA, | |||
| $ WORK( J+JB ), LDWORK, ONE, A( 1, J ), LDA ) | |||
| CALL ZTRSM( 'Right', 'Lower', 'No transpose', 'Unit', N, JB, | |||
| $ ONE, WORK( J ), LDWORK, A( 1, J ), LDA ) | |||
| 50 CONTINUE | |||
| END IF | |||
| * | |||
| * Apply column interchanges. | |||
| * | |||
| DO 60 J = N - 1, 1, -1 | |||
| JP = IPIV( J ) | |||
| IF( JP.NE.J ) | |||
| $ CALL ZSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 ) | |||
| 60 CONTINUE | |||
| * | |||
| WORK( 1 ) = IWS | |||
| RETURN | |||
| * | |||
| * End of ZGETRI | |||
| * | |||
| END | |||
| @@ -1,190 +1,113 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| /*************************************************************************** | |||
| * Copyright (c) 2013, The OpenBLAS Project | |||
| * All rights reserved. | |||
| * Redistribution and use in source and binary forms, with or without | |||
| * modification, are permitted provided that the following conditions are | |||
| * met: | |||
| * 1. Redistributions of source code must retain the above copyright | |||
| * notice, this list of conditions and the following disclaimer. | |||
| * 2. Redistributions in binary form must reproduce the above copyright | |||
| * notice, this list of conditions and the following disclaimer in | |||
| * the documentation and/or other materials provided with the | |||
| * distribution. | |||
| * 3. Neither the name of the OpenBLAS project nor the names of | |||
| * its contributors may be used to endorse or promote products | |||
| * derived from this software without specific prior written permission. | |||
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| * *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2014/05/22 Saar | |||
| * TEST double precision unblocked : OK | |||
| * 2014/05/23 Saar | |||
| * TEST double precision blocked: OK | |||
| * TEST single precision blocked: OK | |||
| **************************************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| static FLOAT dp1 = 1.; | |||
| static FLOAT dm1 = -1.; | |||
| // static FLOAT dp1 = 1.; | |||
| // static FLOAT dm1 = -1.; | |||
| #ifdef UNIT | |||
| #define TRTI2 TRTI2_LU | |||
| #define TRTI2 TRTI2_LU | |||
| #define TRMM TRMM_LNLU | |||
| #define TRSM TRSM_RNLU | |||
| #else | |||
| #define TRTI2 TRTI2_LN | |||
| #endif | |||
| #if 0 | |||
| #undef GEMM_P | |||
| #undef GEMM_Q | |||
| #undef GEMM_R | |||
| #define GEMM_P 8 | |||
| #define GEMM_Q 20 | |||
| #define GEMM_R 64 | |||
| #define TRTI2 TRTI2_LN | |||
| #define TRMM TRMM_LNLN | |||
| #define TRSM TRSM_RNLN | |||
| #endif | |||
| #define GEMM_PQ MAX(GEMM_P, GEMM_Q) | |||
| #define REAL_GEMM_R (GEMM_R - 2 * GEMM_PQ) | |||
| blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { | |||
| BLASLONG n, lda; | |||
| BLASLONG j, n, lda; | |||
| FLOAT *a; | |||
| BLASLONG i, is, min_i, start_i; | |||
| BLASLONG ls, min_l; | |||
| BLASLONG bk; | |||
| BLASLONG blocking; | |||
| BLASLONG range_N[2]; | |||
| // BLASLONG info=0; | |||
| BLASLONG jb; | |||
| BLASLONG NB; | |||
| BLASLONG start_j; | |||
| FLOAT *sa_trsm = (FLOAT *)((BLASLONG)sb); | |||
| FLOAT *sa_trmm = (FLOAT *)((((BLASLONG)sb | |||
| + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) | |||
| + GEMM_OFFSET_A); | |||
| FLOAT *sb_gemm = (FLOAT *)((((BLASLONG)sa_trmm | |||
| + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) | |||
| + GEMM_OFFSET_B); | |||
| FLOAT beta_plus[2] = { ONE, ZERO}; | |||
| FLOAT beta_minus[2] = {-ONE, ZERO}; | |||
| n = args -> n; | |||
| a = (FLOAT *)args -> a; | |||
| lda = args -> lda; | |||
| if (range_n) { | |||
| n = range_n[1] - range_n[0]; | |||
| a += range_n[0] * (lda + 1) * COMPSIZE; | |||
| } | |||
| NB = GEMM_Q; | |||
| if (n <= DTB_ENTRIES) { | |||
| if (n < NB) { | |||
| TRTI2(args, NULL, range_n, sa, sb, 0); | |||
| return 0; | |||
| } | |||
| blocking = GEMM_Q; | |||
| if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4; | |||
| start_i = 0; | |||
| while (start_i < n) start_i += blocking; | |||
| start_i -= blocking; | |||
| for (i = start_i; i >= 0; i -= blocking) { | |||
| bk = MIN(blocking, n - i); | |||
| if (n - bk - i > 0) TRSM_OLNCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, sa_trsm); | |||
| if (!range_n) { | |||
| range_N[0] = i; | |||
| range_N[1] = i + bk; | |||
| } else { | |||
| range_N[0] = range_n[0] + i; | |||
| range_N[1] = range_n[0] + i + bk; | |||
| } | |||
| CNAME(args, NULL, range_N, sa, sa_trmm, 0); | |||
| if (i > 0) { | |||
| TRMM_ILTCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, 0, sa_trmm); | |||
| for (ls = 0; ls < i; ls += REAL_GEMM_R) { | |||
| min_l = i - ls; | |||
| if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R; | |||
| GEMM_ONCOPY (bk, min_l, a + (i + ls * lda) * COMPSIZE, lda, sb_gemm); | |||
| if (n - bk - i > 0) { | |||
| for (is = i + bk; is < n; is += GEMM_P) { | |||
| min_i = n - is; | |||
| if (min_i > GEMM_P) min_i = GEMM_P; | |||
| if (ls == 0) { | |||
| NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); | |||
| TRSM_KERNEL_RT(min_i, bk, bk, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| sa, sa_trsm, | |||
| a + (is + i * lda) * COMPSIZE, lda, 0); | |||
| } else { | |||
| GEMM_ITCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); | |||
| } | |||
| GEMM_KERNEL_N(min_i, min_l, bk, dp1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| sa, sb_gemm, | |||
| a + (is + ls * lda) * COMPSIZE, lda); | |||
| } | |||
| } | |||
| for (is = 0; is < bk; is += GEMM_P) { | |||
| min_i = bk - is; | |||
| if (min_i > GEMM_P) min_i = GEMM_P; | |||
| TRMM_KERNEL_LT(min_i, min_l, bk, dp1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| sa_trmm + is * bk * COMPSIZE, sb_gemm, | |||
| a + (i + is + ls * lda) * COMPSIZE, lda, is); | |||
| } | |||
| } | |||
| } else { | |||
| if (n - bk - i > 0) { | |||
| for (is = 0; is < n - bk - i; is += GEMM_P) { | |||
| min_i = n - bk - i - is; | |||
| if (min_i > GEMM_P) min_i = GEMM_P; | |||
| NEG_TCOPY (bk, min_i, a + (i + bk + is + i * lda) * COMPSIZE, lda, sa); | |||
| TRSM_KERNEL_RT(min_i, bk, bk, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| sa, sa_trsm, | |||
| a + (i + bk + is + i * lda) * COMPSIZE, lda, 0); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| lda = args -> lda; | |||
| a = (FLOAT *) args -> a; | |||
| args -> ldb = lda; | |||
| args -> ldc = lda; | |||
| args -> alpha = NULL; | |||
| start_j = 0; | |||
| while (start_j < n) start_j += NB; | |||
| start_j -= NB; | |||
| for (j = start_j ; j >=0 ; j-= NB) | |||
| { | |||
| jb = n - j; | |||
| if ( jb > NB ) jb = NB; | |||
| args -> n = jb; | |||
| args -> m = n-j-jb; | |||
| args -> a = &a[(j+jb+(j+jb)*lda) * COMPSIZE]; | |||
| args -> b = &a[(j+jb+j*lda) * COMPSIZE]; | |||
| args -> beta = beta_plus; | |||
| TRMM(args, NULL, NULL, sa, sb, 0); | |||
| args -> a = &a[(j+j*lda) * COMPSIZE]; | |||
| args -> beta = beta_minus; | |||
| TRSM(args, NULL, NULL, sa, sb, 0); | |||
| args -> a = &a[(j+j*lda) * COMPSIZE]; | |||
| TRTI2(args, NULL, range_n, sa, sb, 0); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -1,46 +1,44 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| /*************************************************************************** | |||
| * Copyright (c) 2013, The OpenBLAS Project | |||
| * All rights reserved. | |||
| * Redistribution and use in source and binary forms, with or without | |||
| * modification, are permitted provided that the following conditions are | |||
| * met: | |||
| * 1. Redistributions of source code must retain the above copyright | |||
| * notice, this list of conditions and the following disclaimer. | |||
| * 2. Redistributions in binary form must reproduce the above copyright | |||
| * notice, this list of conditions and the following disclaimer in | |||
| * the documentation and/or other materials provided with the | |||
| * distribution. | |||
| * 3. Neither the name of the OpenBLAS project nor the names of | |||
| * its contributors may be used to endorse or promote products | |||
| * derived from this software without specific prior written permission. | |||
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| * *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2014/05/22 Saar | |||
| * TEST double precision unblocked : OK | |||
| * TEST double precision blocked : OK | |||
| * 2014/05/23 | |||
| * TEST single precision blocked : OK | |||
| * | |||
| **************************************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| static FLOAT dp1 = 1.; | |||
| static FLOAT dm1 = -1.; | |||
| // static FLOAT dp1 = 1.; | |||
| // static FLOAT dm1 = -1.; | |||
| #ifdef UNIT | |||
| #define TRTI2 TRTI2_UU | |||
| @@ -48,152 +46,66 @@ static FLOAT dm1 = -1.; | |||
| #define TRTI2 TRTI2_UN | |||
| #endif | |||
| #if 0 | |||
| #undef GEMM_P | |||
| #undef GEMM_Q | |||
| #undef GEMM_R | |||
| #define GEMM_P 8 | |||
| #define GEMM_Q 20 | |||
| #define GEMM_R 64 | |||
| #ifdef UNIT | |||
| #define TRMM TRMM_LNUU | |||
| #define TRSM TRSM_RNUU | |||
| #else | |||
| #define TRMM TRMM_LNUN | |||
| #define TRSM TRSM_RNUN | |||
| #endif | |||
| #define GEMM_PQ MAX(GEMM_P, GEMM_Q) | |||
| #define REAL_GEMM_R (GEMM_R - 2 * GEMM_PQ) | |||
| blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { | |||
| BLASLONG n, lda; | |||
| BLASLONG j, n, lda; | |||
| FLOAT *a; | |||
| BLASLONG i, is, min_i, start_is; | |||
| BLASLONG ls, min_l; | |||
| BLASLONG bk; | |||
| BLASLONG blocking; | |||
| BLASLONG range_N[2]; | |||
| // BLASLONG info=0; | |||
| BLASLONG jb; | |||
| BLASLONG NB; | |||
| FLOAT *sa_trsm = (FLOAT *)((BLASLONG)sb); | |||
| FLOAT *sa_trmm = (FLOAT *)((((BLASLONG)sb | |||
| + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) | |||
| + GEMM_OFFSET_A); | |||
| FLOAT *sb_gemm = (FLOAT *)((((BLASLONG)sa_trmm | |||
| + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) | |||
| + GEMM_OFFSET_B); | |||
| FLOAT beta_plus[2] = { ONE, ZERO}; | |||
| FLOAT beta_minus[2] = {-ONE, ZERO}; | |||
| n = args -> n; | |||
| a = (FLOAT *)args -> a; | |||
| lda = args -> lda; | |||
| if (range_n) { | |||
| n = range_n[1] - range_n[0]; | |||
| a += range_n[0] * (lda + 1) * COMPSIZE; | |||
| } | |||
| NB = GEMM_Q; | |||
| if (n <= DTB_ENTRIES) { | |||
| if (n <= NB) { | |||
| TRTI2(args, NULL, range_n, sa, sb, 0); | |||
| return 0; | |||
| } | |||
| blocking = GEMM_Q; | |||
| if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4; | |||
| for (i = 0; i < n; i += blocking) { | |||
| bk = MIN(blocking, n - i); | |||
| if (i > 0) TRSM_OUNCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, sa_trsm); | |||
| if (!range_n) { | |||
| range_N[0] = i; | |||
| range_N[1] = i + bk; | |||
| } else { | |||
| range_N[0] = range_n[0] + i; | |||
| range_N[1] = range_n[0] + i + bk; | |||
| } | |||
| CNAME(args, NULL, range_N, sa, sa_trmm, 0); | |||
| if (n -bk - i > 0) { | |||
| TRMM_IUTCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, 0, sa_trmm); | |||
| for (ls = i + bk; ls < n; ls += REAL_GEMM_R) { | |||
| min_l = n - ls; | |||
| if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R; | |||
| GEMM_ONCOPY (bk, min_l, a + (i + ls * lda) * COMPSIZE, lda, sb_gemm); | |||
| if (i > 0) { | |||
| for (is = 0; is < i; is += GEMM_P) { | |||
| min_i = i - is; | |||
| if (min_i > GEMM_P) min_i = GEMM_P; | |||
| if (ls == i + bk) { | |||
| //NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); | |||
| GEMM_BETA(min_i, bk, 0, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| NULL, 0, NULL, 0, a + (is + i * lda) * COMPSIZE, lda); | |||
| TRSM_KERNEL_RN(min_i, bk, bk, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| sa, sa_trsm, | |||
| a + (is + i * lda) * COMPSIZE, lda, 0); | |||
| } else { | |||
| GEMM_ITCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); | |||
| } | |||
| GEMM_KERNEL_N(min_i, min_l, bk, dp1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| sa, sb_gemm, | |||
| a + (is + ls * lda) * COMPSIZE, lda); | |||
| } | |||
| } | |||
| start_is = 0; | |||
| while (start_is < bk) start_is += GEMM_P; | |||
| start_is -= GEMM_P; | |||
| for (is = 0; is < bk; is += GEMM_P) { | |||
| min_i = bk - is; | |||
| if (min_i > GEMM_P) min_i = GEMM_P; | |||
| TRMM_KERNEL_LN(min_i, min_l, bk, dp1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| sa_trmm + is * bk * COMPSIZE, sb_gemm, | |||
| a + (i + is + ls * lda) * COMPSIZE, lda, is); | |||
| } | |||
| } | |||
| } else { | |||
| if (i > 0) { | |||
| for (is = 0; is < i; is += GEMM_P) { | |||
| min_i = i - is; | |||
| if (min_i > GEMM_P) min_i = GEMM_P; | |||
| //NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); | |||
| GEMM_BETA(min_i, bk, 0, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| NULL, 0, NULL, 0, a + (is + i * lda) * COMPSIZE, lda); | |||
| lda = args -> lda; | |||
| a = (FLOAT *) args -> a; | |||
| args -> ldb = lda; | |||
| args -> ldc = lda; | |||
| args -> alpha = NULL; | |||
| TRSM_KERNEL_RN(min_i, bk, bk, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| sa, sa_trsm, | |||
| a + (is + i * lda) * COMPSIZE, lda, 0); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| for (j = 0; j < n; j += NB) | |||
| { | |||
| jb = n - j; | |||
| if ( jb > NB ) jb = NB; | |||
| args -> n = jb; | |||
| args -> m = j; | |||
| args -> a = &a[0]; | |||
| args -> b = &a[(j*lda) * COMPSIZE]; | |||
| args -> beta = beta_plus; | |||
| TRMM(args, NULL, NULL, sa, sb, 0); | |||
| args -> a = &a[(j+j*lda) * COMPSIZE]; | |||
| args -> beta = beta_minus; | |||
| TRSM(args, NULL, NULL, sa, sb, 0); | |||
| args -> a = &a[(j+j*lda) * COMPSIZE]; | |||
| TRTI2(args, NULL, range_n, sa, sb, 0); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -1,11 +1,7 @@ | |||
| SHELL = /bin/sh | |||
| PLAT = _LINUX | |||
| DRVOPTS = $(OPTS) | |||
| LOADER = $(FORTRAN) | |||
| TIMER = NONE | |||
| LOADER = $(FORTRAN) -pthread | |||
| ARCHFLAGS= -ru | |||
| #RANLIB = ranlib | |||
| BLASLIB = ../../../libopenblas.a | |||
| TMGLIB = tmglib.a | |||
| #EIGSRCLIB = eigsrc.a | |||
| #LINSRCLIB = linsrc.a | |||