1. Verify correctness using BLAS-Tester.
2. Use the built-in benchmark to verify performance: float and double performance improved by about 60% and about 40%, respectively. The test commands are:
export OMP_NUM_THREADS=1;numactl -C 10 -l ./sgemv.goto 3000 4000 100
export OMP_NUM_THREADS=1;numactl -C 10 -l ./dgemv.goto 3000 4000 100
On some x86 configurations, the build fails (see the error log below).
It seems that OpenBLAS supports many x86 configurations, each one
specific to a particular CPU family name.
However, if an x86 build does not match any of them, some parameters
are left undefined.
Link:
https://github.com/openwrt/packages/pull/27179#issuecomment-3163947279
```
In file included from ../../common.h:586,
from gemm3m.c:40:
gemm3m_level3.c: In function 'cgemm3m_nn':
../../common_param.h:1435:33: error: 'CGEMM3M_DEFAULT_R' undeclared (first use in this function); did you mean 'CGEMM_DEFAULT_R'?
1435 | #define CGEMM3M_R CGEMM3M_DEFAULT_R
| ^~~~~~~~~~~~~~~~~
../../common_param.h:1671:25: note: in expansion of macro 'CGEMM3M_R'
1671 | #define GEMM3M_R CGEMM3M_R
| ^~~~~~~~~
gemm3m_level3.c:306:37: note: in expansion of macro 'GEMM3M_R'
306 | for(js = n_from; js < n_to; js += GEMM3M_R){
| ^~~~~~~~
../../common_param.h:1435:33: note: each undeclared identifier is reported only once for each function it appears in
1435 | #define CGEMM3M_R CGEMM3M_DEFAULT_R
| ^~~~~~~~~~~~~~~~~
../../common_param.h:1671:25: note: in expansion of macro 'CGEMM3M_R'
1671 | #define GEMM3M_R CGEMM3M_R
| ^~~~~~~~~
gemm3m_level3.c:306:37: note: in expansion of macro 'GEMM3M_R'
306 | for(js = n_from; js < n_to; js += GEMM3M_R){
| ^~~~~~~~
../../common_param.h:1434:33: error: 'CGEMM3M_DEFAULT_Q' undeclared (first use in this function); did you mean 'CGEMM_DEFAULT_Q'?
1434 | #define CGEMM3M_Q CGEMM3M_DEFAULT_Q
| ^~~~~~~~~~~~~~~~~
../../common_param.h:1661:25: note: in expansion of macro 'CGEMM3M_Q'
1661 | #define GEMM3M_Q CGEMM3M_Q
| ^~~~~~~~~
gemm3m_level3.c:313:20: note: in expansion of macro 'GEMM3M_Q'
313 | if (min_l >= GEMM3M_Q * 2) {
| ^~~~~~~~
i486-openwrt-linux-musl-gcc -Os -pipe -march=pentium-mmx -fno-caller-saves -fno-plt -fhonour-copts -ffile-prefix-map=/builder/build_dir/target-i386_pentium-mmx_musl/OpenBLAS-0.3.30=OpenBLAS-0.3.30 -Wformat -Werror=format-security -fstack-protector -D_FORTIFY_SOURCE=1 -Wl,-z,now -Wl,-z,relro -I/builder/staging_dir/toolchain-i386_pentium-mmx_gcc-14.3.0_musl/usr/include -I/builder/staging_dir/toolchain-i386_pentium-mmx_gcc-14.3.0_musl/include -I/builder/staging_dir/toolchain-i386_pentium-mmx_gcc-14.3.0_musl/include/fortify -DMAX_STACK_ALLOC=2048 -DEXPRECISION -m128bit-long-double -Wall -m32 -DF_INTERFACE_GFORT -fPIC -DC_LAPACK -DNO_LAPACK -DNO_LAPACKE -DNO_AVX -DNO_AVX512 -DSMP_SERVER -DNO_WARMUP -DMAX_CPU_NUMBER=2 -DMAX_PARALLEL_NUMBER=1 -DBUILD_SINGLE=1 -DBUILD_DOUBLE=1 -DBUILD_COMPLEX=1 -DBUILD_COMPLEX16=1 -DVERSION=\"0.3.30\" -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME -DASMNAME= -DASMFNAME=_ -DNAME=_ -DCNAME= -DCHAR_NAME=\"_\" -DCHAR_CNAME=\"\" -DNO_AFFINITY -I. -DMAX_STACK_ALLOC=2048 -DEXPRECISION -m128bit-long-double -Wall -m32 -DF_INTERFACE_GFORT -fPIC -DC_LAPACK -DNO_LAPACK -DNO_LAPACKE -DNO_AVX -DNO_AVX512 -DSMP_SERVER -DNO_WARMUP -DMAX_CPU_NUMBER=2 -DMAX_PARALLEL_NUMBER=1 -DBUILD_SINGLE=1 -DBUILD_DOUBLE=1 -DBUILD_COMPLEX=1 -DBUILD_COMPLEX16=1 -DVERSION=\"0.3.30\" -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME -DASMNAME=cgemm3m_cn -DASMFNAME=cgemm3m_cn_ -DNAME=cgemm3m_cn_ -DCNAME=cgemm3m_cn -DCHAR_NAME=\"cgemm3m_cn_\" -DCHAR_CNAME=\"cgemm3m_cn\" -DNO_AFFINITY -I../.. -UDOUBLE -DCOMPLEX -c -UDOUBLE -DCOMPLEX -DCN gemm3m.c -o cgemm3m_cn.o
../../common_param.h:1433:33: error: 'CGEMM3M_DEFAULT_P' undeclared (first use in this function); did you mean 'CGEMM_DEFAULT_P'?
1433 | #define CGEMM3M_P CGEMM3M_DEFAULT_P
| ^~~~~~~~~~~~~~~~~
../../common_param.h:1651:25: note: in expansion of macro 'CGEMM3M_P'
1651 | #define GEMM3M_P CGEMM3M_P
| ^~~~~~~~~
gemm3m_level3.c:325:20: note: in expansion of macro 'GEMM3M_P'
325 | if (min_i >= GEMM3M_P * 2) {
| ^~~~~~~~
../../common_param.h:1436:33: error: 'CGEMM3M_DEFAULT_UNROLL_M' undeclared (first use in this function); did you mean 'CGEMM3M_DEFAULT_UNROLL_N'?
1436 | #define CGEMM3M_UNROLL_M CGEMM3M_DEFAULT_UNROLL_M
| ^~~~~~~~~~~~~~~~~~~~~~~~
../../common_param.h:1580:25: note: in expansion of macro 'CGEMM3M_UNROLL_M'
1580 | #define GEMM3M_UNROLL_M CGEMM3M_UNROLL_M
| ^~~~~~~~~~~~~~~~
gemm3m_level3.c:329:33: note: in expansion of macro 'GEMM3M_UNROLL_M'
329 | min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
| ^~~~~~~~~~~~~~~
make[4]: *** [Makefile:1865: cgemm3m_nn.o] Error 1
make[4]: *** Waiting for unfinished jobs....
In file included from ../../common.h:586,
from gemm3m.c:40:
gemm3m_level3.c: In function 'cgemm3m_cn':
../../common_param.h:1435:33: error: 'CGEMM3M_DEFAULT_R' undeclared (first use in this function); did you mean 'CGEMM_DEFAULT_R'?
1435 | #define CGEMM3M_R CGEMM3M_DEFAULT_R
| ^~~~~~~~~~~~~~~~~
../../common_param.h:1671:25: note: in expansion of macro 'CGEMM3M_R'
1671 | #define GEMM3M_R CGEMM3M_R
| ^~~~~~~~~
gemm3m_level3.c:306:37: note: in expansion of macro 'GEMM3M_R'
306 | for(js = n_from; js < n_to; js += GEMM3M_R){
| ^~~~~~~~
../../common_param.h:1435:33: note: each undeclared identifier is reported only once for each function it appears in
1435 | #define CGEMM3M_R CGEMM3M_DEFAULT_R
| ^~~~~~~~~~~~~~~~~
../../common_param.h:1671:25: note: in expansion of macro 'CGEMM3M_R'
1671 | #define GEMM3M_R CGEMM3M_R
| ^~~~~~~~~
gemm3m_level3.c:306:37: note: in expansion of macro 'GEMM3M_R'
306 | for(js = n_from; js < n_to; js += GEMM3M_R){
| ^~~~~~~~
../../common_param.h:1434:33: error: 'CGEMM3M_DEFAULT_Q' undeclared (first use in this function); did you mean 'CGEMM_DEFAULT_Q'?
1434 | #define CGEMM3M_Q CGEMM3M_DEFAULT_Q
| ^~~~~~~~~~~~~~~~~
../../common_param.h:1661:25: note: in expansion of macro 'CGEMM3M_Q'
1661 | #define GEMM3M_Q CGEMM3M_Q
| ^~~~~~~~~
gemm3m_level3.c:313:20: note: in expansion of macro 'GEMM3M_Q'
313 | if (min_l >= GEMM3M_Q * 2) {
| ^~~~~~~~
```