Browse Source

Merge branch 'release-v0.1alpha2'

tags/v0.1alpha2^0
traits 14 years ago
parent
commit
4a73f5c5ea
41 changed files with 12958 additions and 50 deletions
  1. +5
    -0
      .gitignore
  2. +18
    -3
      Changelog.txt
  3. +18
    -5
      Makefile
  4. +65
    -0
      Makefile.install
  5. +3
    -0
      Makefile.rule
  6. +4
    -0
      Makefile.system
  7. +7
    -1
      README
  8. +1
    -1
      c_check
  9. +5
    -0
      common_mips64.h
  10. +4
    -0
      common_reference.h
  11. +4
    -1
      driver/others/Makefile
  12. +1
    -1
      driver/others/blas_server_omp.c
  13. +45
    -0
      driver/others/openblas_set_num_threads.c
  14. +5
    -4
      driver/others/profile.c
  15. +6
    -5
      exports/Makefile
  16. +5
    -1
      interface/axpy.c
  17. +0
    -0
      interface/create
  18. +7
    -4
      interface/dsdot.c
  19. +9
    -3
      interface/rotmg.c
  20. +9
    -3
      kernel/Makefile
  21. +1
    -1
      kernel/Makefile.L1
  22. +22
    -0
      kernel/mips64/KERNEL
  23. +22
    -0
      kernel/mips64/KERNEL.LOONGSON3A
  24. +6
    -2
      kernel/mips64/dot.S
  25. +2390
    -0
      kernel/mips64/gemm_kernel_loongson3a.S
  26. +2579
    -0
      kernel/mips64/sgemm_kernel_loongson3a.S
  27. +1938
    -0
      kernel/mips64/trsm_kernel_LN_loongson3a.S
  28. +1783
    -0
      kernel/mips64/trsm_kernel_LT_loongson3a.S
  29. +1852
    -0
      kernel/mips64/trsm_kernel_RN_loongson3a.S
  30. +1958
    -0
      kernel/mips64/trsm_kernel_RT_loongson3a.S
  31. +24
    -0
      kernel/setparam-ref.c
  32. +3
    -0
      kernel/x86/zdot_sse2.S
  33. +4
    -0
      kernel/x86_64/dot_sse.S
  34. +2
    -2
      kernel/x86_64/zgemm_kernel_1x4_nehalem.S
  35. +21
    -0
      openblas_config_template.h
  36. +12
    -10
      param.h
  37. +2
    -2
      utest/Makefile
  38. +4
    -0
      utest/common_utest.h
  39. +4
    -1
      utest/main.c
  40. +50
    -0
      utest/test_dsdot.c
  41. +60
    -0
      utest/test_rotmg.c

+ 5
- 0
.gitignore View File

@@ -1,8 +1,13 @@
*.obj
*.lib
*.dll
*.def
*.o
lapack-3.1.1
lapack-3.1.1.tgz
*.so
*.a
.svn
*~
config.h
Makefile.conf


+ 18
- 3
Changelog.txt View File

@@ -1,6 +1,7 @@
OpenBLAS ChangeLog
====================================================================
Version 0.1 alpha2(in development)
Version 0.1 alpha2
23-Jun-2011

common:
* Fixed blasint undefined bug in <cblas.h> file. Other software
@@ -15,11 +16,25 @@ common:
* Provided an error message when the arch is not supported.(Refs
issue #19 on github)
* Fixed issue #23. Fixed a bug of f_check script about generating link flags.
* Added openblas_set_num_threads for Fortran.
* Fixed #25 a wrong result of rotmg.
* Fixed a bug about detecting underscore prefix in c_check.
* Print the wall time (cycles) when FUNCTION_PROFILE is enabled.
* Fixed #35 a build bug with NO_LAPACK=1 & DYNAMIC_ARCH=1
* Added install target. You can use "make install". (Refs #20)


x86/x86_64:
*
* Fixed #28 a wrong result of dsdot on x86_64.
* Fixed #32 a SEGFAULT bug of zdotc with gcc-4.6.
* Fixed #33 ztrmm bug on Nehalem.
* Worked around #27, the low performance axpy issue with small input size & multithreads.

MIPS64:
*
* Fixed #28 a wrong result of dsdot on Loongson3A/MIPS64.
* Optimized single/double precision BLAS Level3 on Loongson3A/MIPS64. (Refs #2)
* Optimized single/double precision axpy function on Loongson3A/MIPS64. (Refs #3)

====================================================================
Version 0.1 alpha1
20-Mar-2011


+ 18
- 5
Makefile View File

@@ -15,6 +15,10 @@ ifdef SANITY_CHECK
BLASDIRS += reference
endif

ifndef PREFIX
PREFIX = /opt/OpenBLAS
endif

SUBDIRS = $(BLASDIRS)
ifneq ($(NO_LAPACK), 1)
SUBDIRS += lapack
@@ -22,8 +26,8 @@ endif

SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench

.PHONY : all libs netlib test ctest shared
.NOTPARALLEL : all libs prof lapack-test
.PHONY : all libs netlib test ctest shared install
.NOTPARALLEL : all libs prof lapack-test install

all :: libs netlib tests shared
@echo
@@ -70,7 +74,7 @@ ifeq ($(OSNAME), Darwin)
endif
ifeq ($(OSNAME), WINNT)
$(MAKE) -C exports dll
# -ln -fs $(LIBDLLNAME) libopenblas.dll
-ln -fs $(LIBDLLNAME) libopenblas.dll
endif
ifeq ($(OSNAME), CYGWIN_NT)
$(MAKE) -C exports dll
@@ -105,12 +109,17 @@ endif
$(MAKE) -C $$d $(@F) || exit 1 ; \
fi; \
done
#Save the config files for installation
cp Makefile.conf Makefile.conf_last
cp config.h config_last.h
ifdef DYNAMIC_ARCH
$(MAKE) -C kernel commonlibs || exit 1
for d in $(DYNAMIC_CORE) ; \
do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
done
echo DYNAMIC_ARCH=1 >> Makefile.conf_last
endif
touch lib.grd

prof : prof_blas prof_lapack

@@ -230,19 +239,23 @@ lapack-test :

dummy :

install :
$(MAKE) -f Makefile.install install

clean ::
@for d in $(SUBDIRS_ALL) ; \
do if test -d $$d; then \
$(MAKE) -C $$d $(@F) || exit 1 ; \
fi; \
done
ifdef DYNAMIC_ARCH
#ifdef DYNAMIC_ARCH
@$(MAKE) -C kernel clean
endif
#endif
@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf libopenblas.$(LIBSUFFIX) libopenblas_p.$(LIBSUFFIX) *.lnk myconfig.h
@rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib
@if test -d lapack-3.1.1; then \
echo deleting lapack-3.1.1; \
rm -rf lapack-3.1.1 ;\
fi
@rm -f *.grd Makefile.conf_last config_last.h
@echo Done.

+ 65
- 0
Makefile.install View File

@@ -0,0 +1,65 @@
# Makefile.install -- copies the generated headers and the built libraries
# into $(PREFIX). Expects the library to have been built beforehand.
TOPDIR = .
export GOTOBLAS_MAKEFILE = 1

# The leading "-" tolerates a missing Makefile.conf_last.
-include $(TOPDIR)/Makefile.conf_last
include ./Makefile.system

.PHONY : install
.NOTPARALLEL : install

# Locations of the headers generated by the install recipe.
config_header = $(PREFIX)/openblas_config.h
f77_header = $(PREFIX)/f77blas.h
cblas_header = $(PREFIX)/cblas.h

# This recipe only runs when lib.grd does not exist; it then aborts make
# with a hint to build the library first.
lib.grd :
	$(error OpenBLAS: Please run "make" firstly)

install : lib.grd
	@-mkdir -p $(PREFIX)
# openblas_config.h: include guard + config_last.h + version string + template
	@echo Generating openblas_config.h in $(PREFIX)
	@echo \#ifndef OPENBLAS_CONFIG_H > $(config_header)
	@echo \#define OPENBLAS_CONFIG_H >> $(config_header)
	@cat config_last.h >> $(config_header)
	@echo \#define VERSION \" OpenBLAS $(VERSION) \" >> $(config_header)
	@cat openblas_config_template.h >> $(config_header)
	@echo \#endif >> $(config_header)

# f77blas.h: include guard around common_interface.h, pulling in the config header
	@echo Generating f77blas.h in $(PREFIX)
	@echo \#ifndef OPENBLAS_F77BLAS_H > $(f77_header)
	@echo \#define OPENBLAS_F77BLAS_H >> $(f77_header)
	@echo \#include \"openblas_config.h\" >> $(f77_header)
	@cat common_interface.h >> $(f77_header)
	@echo \#endif >> $(f77_header)

# cblas.h: every occurrence of "common" is rewritten to "openblas_config"
	@echo Generating cblas.h in $(PREFIX)
	@sed 's/common/openblas_config/g' cblas.h > $(cblas_header)

# Static library plus an unversioned libopenblas.$(LIBSUFFIX) link to it
	@echo Copy the static library to $(PREFIX)
	@cp $(LIBNAME) $(PREFIX)
	@-ln -fs $(PREFIX)/$(LIBNAME) $(PREFIX)/libopenblas.$(LIBSUFFIX)

# Shared library: copy and link failures are ignored (leading "-")
	@echo Copy the shared library to $(PREFIX)
ifneq ($(filter Linux FreeBSD NetBSD,$(OSNAME)),)
	-cp $(LIBSONAME) $(PREFIX)
	-ln -fs $(PREFIX)/$(LIBSONAME) $(PREFIX)/libopenblas.so
endif
ifeq ($(OSNAME), Darwin)
	-cp $(LIBDYNNAME) $(PREFIX)
	-ln -fs $(PREFIX)/$(LIBDYNNAME) $(PREFIX)/libopenblas.dylib
endif
ifneq ($(filter WINNT CYGWIN_NT,$(OSNAME)),)
	-cp $(LIBDLLNAME) $(PREFIX)
	-ln -fs $(PREFIX)/$(LIBDLLNAME) $(PREFIX)/libopenblas.dll
endif

	@echo Install OK!


+ 3
- 0
Makefile.rule View File

@@ -91,6 +91,9 @@ VERSION = 0.1alpha2
# SANITY_CHECK to compare the result with reference BLAS.
# UTEST_CHECK = 1

# The installation directory.
# PREFIX = /opt/OpenBLAS

# Common Optimization Flag; -O2 is enough.
# DEBUG = 1



+ 4
- 0
Makefile.system View File

@@ -515,6 +515,10 @@ ifeq ($(DYNAMIC_ARCH), 1)
CCOMMON_OPT += -DDYNAMIC_ARCH
endif

ifeq ($(NO_LAPACK), 1)
CCOMMON_OPT += -DNO_LAPACK
endif

ifdef SMP
CCOMMON_OPT += -DSMP_SERVER



+ 7
- 1
README View File

@@ -22,6 +22,11 @@ make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-g
3)Debug version
make DEBUG=1

4)Install to the directory (Optional)
e.g.
make install PREFIX=your_installation_directory
The default directory is /opt/OpenBLAS

3.Support CPU & OS
Please read GotoBLAS_01Readme.txt

@@ -67,6 +72,7 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve
9.Known Issues
* The number of CPUs/Cores should be less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit
is 64. On 32 bits, it is 32.
* This library is not compatible with EKOPath Compiler Suite 4.0.10 (http://www.pathscale.com/ekopath-compiler-suite). However, Path64 (https://github.com/path64/compiler) can compile the code successfully.

10. Specification of Git Branches
We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/).
@@ -74,4 +80,4 @@ Now, there are 4 branches in github.com.
* The master branch. This is a main branch that reflects a production-ready state.
* The develop branch. This is a main branch that reflects a state with the latest delivered development changes for the next release.
* The loongson3a branch. This is a feature branch. We develop Loongson3A codes on this branch. We will merge this feature to develop branch in future.
* The gh-pages branch. This is for web pages
* The gh-pages branch. This is for web pages

+ 1
- 1
c_check View File

@@ -149,7 +149,7 @@ $binformat = bin64 if ($data =~ /BINARY_64/);

$data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;

$data =~ /globl\ ([_\.]*)(.*)/;
$data =~ /globl\s([_\.]*)(.*)/;

$need_fu = $1;



+ 5
- 0
common_mips64.h View File

@@ -220,6 +220,11 @@ REALNAME: ;\

#define BUFFER_SIZE ( 8 << 20)

#if defined(LOONGSON3A)
#define PAGESIZE (16UL << 10)
#define FIXED_PAGESIZE (16UL << 10)
#endif

#ifndef PAGESIZE
#define PAGESIZE (64UL << 10)
#endif


+ 4
- 0
common_reference.h View File

@@ -60,4 +60,8 @@ float _Complex BLASFUNC_REF(cdotc) (blasint *, float *, blasint *, float *,
double _Complex BLASFUNC_REF(zdotu) (blasint *, double *, blasint *, double *, blasint *);
double _Complex BLASFUNC_REF(zdotc) (blasint *, double *, blasint *, double *, blasint *);

void BLASFUNC_REF(drotmg)(double *, double *, double *, double *, double *);

double BLASFUNC_REF(dsdot)(blasint *, float *, blasint *, float *, blasint*);
#endif

+ 4
- 1
driver/others/Makefile View File

@@ -6,7 +6,7 @@ COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX)
COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)

ifdef SMP
COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX)
COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) openblas_set_num_threads.$(SUFFIX)
ifndef NO_AFFINITY
COMMONOBJS += init.$(SUFFIX)
endif
@@ -100,6 +100,9 @@ memory.$(SUFFIX) : $(MEMORY) ../../common.h ../../param.h
blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../../param.h
$(CC) $(CFLAGS) -c $< -o $(@F)

openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c
$(CC) $(CFLAGS) -c $< -o $(@F)

blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h
$(CC) $(CFLAGS) -c $< -o $(@F)



+ 1
- 1
driver/others/blas_server_omp.c View File

@@ -38,7 +38,7 @@

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
//#include <sys/mman.h>
#include "common.h"

#ifndef USE_OPENMP


+ 45
- 0
driver/others/openblas_set_num_threads.c View File

@@ -0,0 +1,45 @@
/*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

**********************************************************************************/

#include "common.h"

#ifdef SMP_SERVER
#ifdef OS_LINUX

/* Defined elsewhere in the library; receives the thread count by value. */
extern void openblas_set_num_threads(int num_threads) ;

/*
 * By-reference wrapper around openblas_set_num_threads(): reads the count
 * through the pointer and forwards it unchanged.
 * NOTE(review): presumably intended for Fortran callers, which pass
 * arguments by reference, with NAME supplied by the build as the exported
 * symbol name -- confirm against the build flags.
 */
void NAME(int* num_threads){
  int requested = *num_threads;

  openblas_set_num_threads(requested);
}

#endif
#endif

+ 5
- 4
driver/others/profile.c View File

@@ -74,20 +74,21 @@ void gotoblas_profile_quit(void) {
if (cycles > 0) {

fprintf(stderr, "\n\t====== BLAS Profiling Result =======\n\n");
fprintf(stderr, " Function No. of Calls Time Consumption Efficiency Bytes/cycle\n");
fprintf(stderr, " Function No. of Calls Time Consumption Efficiency Bytes/cycle Wall Time(Cycles)\n");
for (i = 0; i < MAX_PROF_TABLE; i ++) {
if (function_profile_table[i].calls) {
#ifndef OS_WINDOWS
fprintf(stderr, "%-12s : %10Ld %8.2f%% %10.3f%% %8.2f\n",
fprintf(stderr, "%-12s : %10Ld %8.2f%% %10.3f%% %8.2f %Ld\n",
#else
fprintf(stderr, "%-12s : %10lld %8.2f%% %10.3f%% %8.2f\n",
fprintf(stderr, "%-12s : %10lld %8.2f%% %10.3f%% %8.2f %lld\n",
#endif
func_table[i],
function_profile_table[i].calls,
(double)function_profile_table[i].cycles / (double)cycles * 100.,
(double)function_profile_table[i].fops / (double)function_profile_table[i].tcycles * 100.,
(double)function_profile_table[i].area / (double)function_profile_table[i].cycles
(double)function_profile_table[i].area / (double)function_profile_table[i].cycles,
function_profile_table[i].cycles
);
}
}


+ 6
- 5
exports/Makefile View File

@@ -53,18 +53,19 @@ dyn : $(LIBDYNNAME)
zip : dll
zip $(LIBZIPNAME) $(LIBDLLNAME) $(LIBNAME)

dll : libgoto2.dll
dll : ../$(LIBDLLNAME)
#libgoto2.dll

dll2 : libgoto2_shared.dll

libgoto2.dll : ../$(LIBNAME) libgoto2.def dllinit.$(SUFFIX)
../$(LIBDLLNAME) : ../$(LIBNAME) libgoto2.def dllinit.$(SUFFIX)
$(RANLIB) ../$(LIBNAME)
ifeq ($(BINARY32), 1)
$(DLLWRAP) -o $(@F) --def libgoto2.def \
$(DLLWRAP) -o ../$(LIBDLLNAME) --def libgoto2.def \
--entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB)
-lib /machine:i386 /def:libgoto2.def
else
$(DLLWRAP) -o $(@F) --def libgoto2.def \
$(DLLWRAP) -o ../$(LIBDLLNAME) --def libgoto2.def \
--entry _dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB)
-lib /machine:X64 /def:libgoto2.def
endif
@@ -84,7 +85,7 @@ libgoto_hpl.def : gensymbol
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F)

$(LIBDYNNAME) : ../$(LIBNAME) osx.def
$(PREFIX)gcc $(CFLAGS) -all_load -dynamiclib -o $(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
$(PREFIX)gcc $(CFLAGS) -all_load -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)

symbol.$(SUFFIX) : symbol.S
$(CC) $(CFLAGS) -c -o $(@F) $^


+ 5
- 1
interface/axpy.c View File

@@ -85,7 +85,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
//In that case, the threads would be dependent.
if (incx == 0 || incy == 0)
nthreads = 1;

//Temporarily walk around the low performance issue with small imput size & multithreads.
if (n <= 10000)
nthreads = 1;
if (nthreads == 1) {
#endif



+ 0
- 0
interface/create View File


+ 7
- 4
interface/dsdot.c View File

@@ -49,6 +49,7 @@ double NAME(blasint *N, float *x, blasint *INCX, float *y, blasint *INCY){
BLASLONG n = *N;
BLASLONG incx = *INCX;
BLASLONG incy = *INCY;
double ret = 0.0;

PRINT_DEBUG_NAME;

@@ -61,19 +62,21 @@ double NAME(blasint *N, float *x, blasint *INCX, float *y, blasint *INCY){
if (incx < 0) x -= (n - 1) * incx;
if (incy < 0) y -= (n - 1) * incy;

return DSDOT_K(n, x, incx, y, incy);
ret=DSDOT_K(n, x, incx, y, incy);

FUNCTION_PROFILE_END(1, n, n);

IDEBUG_END;

return 0;
return ret;
}

#else

double CNAME(blasint n, float *x, blasint incx, float *y, blasint incy){

double ret = 0.0;
PRINT_DEBUG_CNAME;

@@ -86,13 +89,13 @@ double CNAME(blasint n, float *x, blasint incx, float *y, blasint incy){
if (incx < 0) x -= (n - 1) * incx;
if (incy < 0) y -= (n - 1) * incy;

return DSDOT_K(n, x, incx, y, incy);
ret=DSDOT_K(n, x, incx, y, incy);

FUNCTION_PROFILE_END(1, n, n);

IDEBUG_END;

return 0;
return ret;
}



+ 9
- 3
interface/rotmg.c View File

@@ -7,6 +7,12 @@
#define GAMSQ 16777216.e0
#define RGAMSQ 5.9604645e-8

#ifdef DOUBLE
#define ABS(x) fabs(x)
#else
#define ABS(x) fabsf(x)
#endif

#ifndef CBLAS

void NAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT *DY1, FLOAT *dparam){
@@ -47,7 +53,7 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
dq2 = dp2 * dy1;
dq1 = dp1 * *dx1;

if (! (abs(dq1) > abs(dq2))) goto L40;
if (! (ABS(dq1) > ABS(dq2))) goto L40;

dh21 = -(dy1) / *dx1;
dh12 = dp2 / dp1;
@@ -140,7 +146,7 @@ L150:
goto L130;

L160:
if (! (abs(*dd2) <= RGAMSQ)) {
if (! (ABS(*dd2) <= RGAMSQ)) {
goto L190;
}
if (*dd2 == ZERO) {
@@ -157,7 +163,7 @@ L180:
goto L160;

L190:
if (! (abs(*dd2) >= GAMSQ)) {
if (! (ABS(*dd2) >= GAMSQ)) {
goto L220;
}
igo = 3;


+ 9
- 3
kernel/Makefile View File

@@ -53,6 +53,11 @@ SBLASOBJS += setparam$(TSUFFIX).$(SUFFIX)
CCOMMON_OPT += -DTS=$(TSUFFIX)
endif

KERNEL_INTERFACE = ../common_level1.h ../common_level2.h ../common_level3.h
ifneq ($(NO_LAPACK), 1)
KERNEL_INTERFACE += ../common_lapack.h
endif

ifeq ($(ARCH), x86)
COMMONOBJS += cpuid.$(SUFFIX)
endif
@@ -88,9 +93,10 @@ setparam$(TSUFFIX).$(SUFFIX): setparam$(TSUFFIX).c kernel$(TSUFFIX).h
setparam$(TSUFFIX).c : setparam-ref.c
sed 's/TS/$(TSUFFIX)/g' $< > $(@F)

kernel$(TSUFFIX).h : ../common_level1.h ../common_level2.h ../common_level3.h ../common_lapack.h
kernel$(TSUFFIX).h : $(KERNEL_INTERFACE)
sed 's/\ *(/$(TSUFFIX)(/g' $^ > $(@F)


cpuid.$(SUFFIX): $(KERNELDIR)/cpuid.S
$(CC) -c $(CFLAGS) $< -o $(@F)

@@ -112,10 +118,10 @@ lsame.$(PSUFFIX): $(KERNELDIR)/$(LSAME_KERNEL)
cpuid.$(PSUFFIX): $(KERNELDIR)/cpuid.S
$(CC) -c $(PFLAGS) $< -o $(@F)

ifdef DYNAMIC_ARCH
#ifdef DYNAMIC_ARCH
clean ::
@rm -f setparam_*.c kernel_*.h setparam.h kernel.h

endif
#endif

include $(TOPDIR)/Makefile.tail

+ 1
- 1
kernel/Makefile.L1 View File

@@ -668,7 +668,7 @@ $(KDIR)qdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNEL
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@

$(KDIR)dsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@

$(KDIR)sdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@


+ 22
- 0
kernel/mips64/KERNEL View File

@@ -91,15 +91,37 @@ ifndef ZGEMM_BETA
ZGEMM_BETA = ../generic/zgemm_beta.c
endif

ifndef STRSMKERNEL_LN
STRSMKERNEL_LN = trsm_kernel_LN.S
endif

ifndef STRSMKERNEL_LT
STRSMKERNEL_LT = trsm_kernel_LT.S
endif

ifndef STRSMKERNEL_RN
STRSMKERNEL_RN = trsm_kernel_LT.S
endif

ifndef STRSMKERNEL_RT
STRSMKERNEL_RT = trsm_kernel_RT.S
endif

ifndef DTRSMKERNEL_LN
DTRSMKERNEL_LN = trsm_kernel_LN.S
endif

ifndef DTRSMKERNEL_LT
DTRSMKERNEL_LT = trsm_kernel_LT.S
endif

ifndef DTRSMKERNEL_RN
DTRSMKERNEL_RN = trsm_kernel_LT.S
endif

ifndef DTRSMKERNEL_RT
DTRSMKERNEL_RT = trsm_kernel_RT.S
endif

CTRSMKERNEL_LN = ztrsm_kernel_LT.S
CTRSMKERNEL_LT = ztrsm_kernel_LT.S


+ 22
- 0
kernel/mips64/KERNEL.LOONGSON3A View File

@@ -1,2 +1,24 @@
SAXPYKERNEL=axpy_loongson3a.S
DAXPYKERNEL=daxpy_loongson3a_simd.S

SGEMMKERNEL = sgemm_kernel_loongson3a.S
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o

DGEMMKERNEL = gemm_kernel_loongson3a.S
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o

STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

+ 6
- 2
kernel/mips64/dot.S View File

@@ -300,7 +300,11 @@
.align 3

.L999:
j $31
ADD s1, s1, s2

#ifdef DSDOT
cvt.d.s s1, s1
#endif
j $31
NOP
EPILOGUE

+ 2390
- 0
kernel/mips64/gemm_kernel_loongson3a.S
File diff suppressed because it is too large
View File


+ 2579
- 0
kernel/mips64/sgemm_kernel_loongson3a.S
File diff suppressed because it is too large
View File


+ 1938
- 0
kernel/mips64/trsm_kernel_LN_loongson3a.S
File diff suppressed because it is too large
View File


+ 1783
- 0
kernel/mips64/trsm_kernel_LT_loongson3a.S
File diff suppressed because it is too large
View File


+ 1852
- 0
kernel/mips64/trsm_kernel_RN_loongson3a.S
File diff suppressed because it is too large
View File


+ 1958
- 0
kernel/mips64/trsm_kernel_RT_loongson3a.S
File diff suppressed because it is too large
View File


+ 24
- 0
kernel/setparam-ref.c View File

@@ -101,7 +101,11 @@ gotoblas_t TABLE_NAME = {
#endif
ssymm_outcopyTS, ssymm_oltcopyTS,

#ifndef NO_LAPACK
sneg_tcopyTS, slaswp_ncopyTS,
#else
NULL,NULL,
#endif

0, 0, 0,
DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N, MAX(DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N),
@@ -147,7 +151,11 @@ gotoblas_t TABLE_NAME = {
#endif
dsymm_outcopyTS, dsymm_oltcopyTS,

#ifndef NO_LAPACK
dneg_tcopyTS, dlaswp_ncopyTS,
#else
NULL, NULL,
#endif

#ifdef EXPRECISION

@@ -195,7 +203,11 @@ gotoblas_t TABLE_NAME = {
#endif
qsymm_outcopyTS, qsymm_oltcopyTS,

#ifndef NO_LAPACK
qneg_tcopyTS, qlaswp_ncopyTS,
#else
NULL, NULL,
#endif

#endif

@@ -286,7 +298,11 @@ gotoblas_t TABLE_NAME = {
chemm3m_oucopyrTS, chemm3m_olcopyrTS,
chemm3m_oucopyiTS, chemm3m_olcopyiTS,

#ifndef NO_LAPACK
cneg_tcopyTS, claswp_ncopyTS,
#else
NULL, NULL,
#endif

0, 0, 0,
ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N, MAX(ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N),
@@ -375,7 +391,11 @@ gotoblas_t TABLE_NAME = {
zhemm3m_oucopyrTS, zhemm3m_olcopyrTS,
zhemm3m_oucopyiTS, zhemm3m_olcopyiTS,

#ifndef NO_LAPACK
zneg_tcopyTS, zlaswp_ncopyTS,
#else
NULL, NULL,
#endif

#ifdef EXPRECISION

@@ -466,7 +486,11 @@ gotoblas_t TABLE_NAME = {
xhemm3m_oucopyrTS, xhemm3m_olcopyrTS,
xhemm3m_oucopyiTS, xhemm3m_olcopyiTS,

#ifndef NO_LAPACK
xneg_tcopyTS, xlaswp_ncopyTS,
#else
NULL, NULL,
#endif

#endif



+ 3
- 0
kernel/x86/zdot_sse2.S View File

@@ -1541,5 +1541,8 @@
popl %ebx
popl %esi
popl %edi
/*remove the hidden return value address from the stack.*/
popl %ecx
xchgl %ecx, 0(%esp)
ret
EPILOGUE

+ 4
- 0
kernel/x86_64/dot_sse.S View File

@@ -1286,6 +1286,10 @@
haddps %xmm0, %xmm0
#endif

#ifdef DSDOT
cvtss2sd %xmm0, %xmm0
#endif

RESTOREREGISTERS

ret


+ 2
- 2
kernel/x86_64/zgemm_kernel_1x4_nehalem.S View File

@@ -544,7 +544,7 @@
jg .L11

#if defined(TRMMKERNEL) && !defined(LEFT)
addq $1, KK
addq $4, KK
#endif

leaq (C, LDC, 4), C
@@ -594,7 +594,7 @@
jg .L11

#if defined(TRMMKERNEL) && !defined(LEFT)
addq $1, KK
addq $4, KK
#endif

leaq (C, LDC, 4), C


+ 21
- 0
openblas_config_template.h View File

@@ -0,0 +1,21 @@
/*This is only for "make install" target.*/

#ifdef NEEDBUNDERSCORE
#define BLASFUNC(FUNC) FUNC##_
#else
#define BLASFUNC(FUNC) FUNC
#endif

#if defined(OS_WINDOWS) && defined(__64BIT__)
typedef long long BLASLONG;
typedef unsigned long long BLASULONG;
#else
typedef long BLASLONG;
typedef unsigned long BLASULONG;
#endif

#ifdef USE64BITINT
typedef BLASLONG blasint;
#else
typedef int blasint;
#endif

+ 12
- 10
param.h View File

@@ -1480,27 +1480,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL

#define SGEMM_DEFAULT_UNROLL_M 2
#define SGEMM_DEFAULT_UNROLL_N 8
#define DGEMM_DEFAULT_UNROLL_M 2
#define DGEMM_DEFAULT_UNROLL_N 8
#define SGEMM_DEFAULT_UNROLL_M 4
#define SGEMM_DEFAULT_UNROLL_N 4

#define DGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_N 4

#define CGEMM_DEFAULT_UNROLL_M 1
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 1
#define ZGEMM_DEFAULT_UNROLL_N 4

#define SGEMM_DEFAULT_P 108
#define DGEMM_DEFAULT_P 112
#define SGEMM_DEFAULT_P 32
#define DGEMM_DEFAULT_P 32
#define CGEMM_DEFAULT_P 108
#define ZGEMM_DEFAULT_P 112

#define SGEMM_DEFAULT_Q 288
#define DGEMM_DEFAULT_Q 144
#define SGEMM_DEFAULT_Q 116
#define DGEMM_DEFAULT_Q 116
#define CGEMM_DEFAULT_Q 144
#define ZGEMM_DEFAULT_Q 72

#define SGEMM_DEFAULT_R 2000
#define DGEMM_DEFAULT_R 2000
#define SGEMM_DEFAULT_R 1000
#define DGEMM_DEFAULT_R 1000
#define CGEMM_DEFAULT_R 2000
#define ZGEMM_DEFAULT_R 2000



+ 2
- 2
utest/Makefile View File

@@ -5,12 +5,12 @@ include $(TOPDIR)/Makefile.system
TARGET=openblas_utest
CUNIT_LIB=/usr/local/lib/libcunit.a

OBJS=main.o test_rot.o test_swap.o test_axpy.o test_dotu.o
OBJS=main.o test_rot.o test_swap.o test_axpy.o test_dotu.o test_rotmg.o test_dsdot.o

all : run_test

$(TARGET): $(OBJS)
$(CC) -o $@ $^ ../$(LIBNAME) $(CUNIT_LIB) $(EXTRALIB)
$(FC) -o $@ $^ ../$(LIBNAME) $(CUNIT_LIB) $(EXTRALIB)

run_test: $(TARGET)
./$(TARGET)


+ 4
- 0
utest/common_utest.h View File

@@ -57,4 +57,8 @@ void test_caxpy_inc_0(void);
void test_zdotu_n_1(void);
void test_zdotu_offset_1(void);

void test_drotmg(void);

void test_dsdot_n_1(void);

#endif

+ 4
- 1
utest/main.c View File

@@ -54,7 +54,10 @@ CU_TestInfo test_level1[]={

{"Testing zdotu with n == 1",test_zdotu_n_1},
{"Testing zdotu with input x & y offset == 1",test_zdotu_offset_1},

{"Testing drotmg",test_drotmg},

{"Testing dsdot with n == 1",test_dsdot_n_1},
CU_TEST_INFO_NULL,
};



+ 50
- 0
utest/test_dsdot.c View File

@@ -0,0 +1,50 @@
/*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

**********************************************************************************/

#include "common_utest.h"

/*
 * dsdot with n == 1: the library routine and the reference routine are
 * called on the same single-element vectors and must agree within CHECK_EPS.
 */
void test_dsdot_n_1()
{
	int n = 1;
	int incx = 1;
	int incy = 1;
	float x = 0.172555164;
	float y = -0.0138700781;
	double res1;	/* result of the routine under test */
	double res2;	/* result of the reference routine  */

	res1 = BLASFUNC(dsdot)(&n, &x, &incx, &y, &incy);
	res2 = BLASFUNC_REF(dsdot)(&n, &x, &incx, &y, &incy);

	/* Argument spelling kept as-is: CUnit reports the asserted expression text. */
	CU_ASSERT_DOUBLE_EQUAL(res1, res2, CHECK_EPS);
}

+ 60
- 0
utest/test_rotmg.c View File

@@ -0,0 +1,60 @@
/*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

**********************************************************************************/

#include "common_utest.h"

/*
 * drotmg: runs the library routine (te_* variables) and the reference
 * routine (tr_* variables) on identical inputs, then compares the four
 * updated scalars and all five param entries within CHECK_EPS.
 */
void test_drotmg()
{
	double te_d1 = 0.21149573940783739;
	double te_d2 = 0.046892057172954082;
	double te_x1 = -0.42272687517106533;
	double te_y1 = 0.42211309121921659;
	double tr_d1 = te_d1;
	double tr_d2 = te_d2;
	double tr_x1 = te_x1;
	double tr_y1 = te_y1;
	double te_param[5];
	double tr_param[5];
	int i;

	/* routine under test */
	BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param);
	/* reference routine */
	BLASFUNC_REF(drotmg)(&tr_d1, &tr_d2, &tr_x1, &tr_y1, tr_param);

	CU_ASSERT_DOUBLE_EQUAL(te_d1, tr_d1, CHECK_EPS);
	CU_ASSERT_DOUBLE_EQUAL(te_d2, tr_d2, CHECK_EPS);
	CU_ASSERT_DOUBLE_EQUAL(te_x1, tr_x1, CHECK_EPS);
	CU_ASSERT_DOUBLE_EQUAL(te_y1, tr_y1, CHECK_EPS);

	i = 0;
	while (i < 5) {
		CU_ASSERT_DOUBLE_EQUAL(te_param[i], tr_param[i], CHECK_EPS);
		i++;
	}
}

Loading…
Cancel
Save